├── .dockerignore ├── MANIFEST.in ├── .github ├── titleLint.yml ├── CODEOWNERS ├── workflows │ ├── license.yml │ ├── pypipublish.yml │ └── pull_request.yml ├── stale.yml └── PULL_REQUEST_TEMPLATE.md ├── example ├── sample_data │ ├── sample_tags.csv │ ├── sample_schema_description.csv │ ├── sample_source.csv │ ├── sample_table_last_updated.csv │ ├── sample_table_owner.csv │ ├── sample_dashboard_owner.csv │ ├── sample_dashboard_last_modified.csv │ ├── sample_dashboard_usage.csv │ ├── sample_application.csv │ ├── sample_dashboard_table.csv │ ├── sample_dashboard_query.csv │ ├── sample_watermark.csv │ ├── sample_dashboard_last_execution.csv │ ├── sample_table.csv │ ├── sample_user.csv │ ├── sample_table_programmatic_source.csv │ ├── sample_col.csv │ ├── sample_table_column_stats.csv │ ├── sample_column_usage.csv │ └── sample_dashboard_base.csv └── __init__.py ├── tests ├── __init__.py └── unit │ ├── __init__.py │ ├── task │ └── __init__.py │ ├── callback │ ├── __init__.py │ └── test_call_back.py │ ├── extractor │ ├── __init__.py │ ├── user │ │ ├── __init__.py │ │ └── bamboohr │ │ │ ├── __init__.py │ │ │ └── test_bamboohr_user_extractor.py │ ├── dashboard │ │ ├── __init__.py │ │ ├── redash │ │ │ └── __init__.py │ │ ├── tableau │ │ │ ├── __init__.py │ │ │ ├── test_tableau_dashboard_query_extractor.py │ │ │ ├── test_tableau_dashboard_last_modified_extractor.py │ │ │ └── test_tableau_dashboard_extractor.py │ │ └── mode_analytics │ │ │ ├── __init__.py │ │ │ └── batch │ │ │ ├── __init__.py │ │ │ └── test_mode_dashboard_charts_batch_extractor.py │ ├── restapi │ │ ├── __init__.py │ │ └── test_rest_api_extractor.py │ ├── test_neo4j_es_last_updated_extractor.py │ ├── test_csv_extractor.py │ ├── test_generic_extractor.py │ └── test_kafka_source_extractor.py │ ├── loader │ ├── __init__.py │ └── test_generic_loader.py │ ├── models │ ├── __init__.py │ ├── schema │ │ └── __init__.py │ ├── dashboard │ │ ├── __init__.py │ │ ├── test_dashboard_owner.py │ │ ├── test_dashboard_query.py │ │ ├── test_dashboard_last_modified.py │ │ └── test_dashboard_usage.py │ ├── test_neo4j_es_last_updated.py │ ├── test_metric_elasticsearch_document.py │ ├── test_table_column_usage.py │ ├── test_user_elasticsearch_document.py │ ├── test_dashboard_elasticsearch_document.py │ ├── test_table_elasticsearch_document.py │ ├── test_table_lineage.py │ ├── test_table_stats.py │ ├── test_table_source.py │ └── test_table_last_updated.py │ ├── publisher │ ├── __init__.py │ └── test_publisher.py │ ├── rest_api │ ├── __init__.py │ ├── mode_analytics │ │ └── __init__.py │ └── test_rest_api_failure_handlers.py │ ├── usage │ ├── __init__.py │ └── presto │ │ └── __init__.py │ ├── filesystem │ ├── __init__.py │ └── test_filesystem.py │ ├── resources │ ├── fs_neo4j_csv_loader │ │ ├── movies │ │ │ ├── nodes │ │ │ │ ├── Movie_0.csv │ │ │ │ ├── City_0.csv │ │ │ │ └── Actor_0.csv │ │ │ └── relationships │ │ │ │ ├── test_Movie_Actor_ACTOR.csv │ │ │ │ └── test_Movie_City_FILMED_AT.csv │ │ └── people │ │ │ └── nodes │ │ │ ├── Person_0.csv │ │ │ └── Person_1.csv │ ├── csv_publisher │ │ ├── nodes │ │ │ ├── test_table.csv │ │ │ └── test_column.csv │ │ └── relations │ │ │ └── test_edge_short.csv │ └── extractor │ │ └── user │ │ └── bamboohr │ │ └── testdata.xml │ └── transformer │ ├── __init__.py │ ├── test_template_variable_substitution_transformer.py │ ├── test_dict_to_model_transformer.py │ ├── test_remove_field_transformer.py │ ├── test_timestamp_string_to_epoch_transformer.py │ ├── test_chained_transformer.py │ ├── test_regex_str_replace_transformer.py │ 
├── test_bigquery_usage_transformer.py │ └── test_table_tag_transformer.py ├── databuilder ├── job │ ├── __init__.py │ └── base_job.py ├── callback │ ├── __init__.py │ └── call_back.py ├── loader │ ├── __init__.py │ ├── base_loader.py │ ├── generic_loader.py │ ├── file_system_csv_loader.py │ └── file_system_elasticsearch_json_loader.py ├── models │ ├── __init__.py │ ├── schema │ │ ├── __init__.py │ │ ├── schema_constant.py │ │ └── schema.py │ ├── usage │ │ ├── __init__.py │ │ └── usage_constants.py │ ├── cluster │ │ ├── __init__.py │ │ └── cluster_constants.py │ ├── dashboard │ │ ├── __init__.py │ │ └── dashboard_owner.py │ ├── timestamp │ │ ├── __init__.py │ │ └── timestamp_constants.py │ ├── owner_constants.py │ ├── graph_node.py │ ├── graph_relationship.py │ ├── presto_query_logs.py │ ├── elasticsearch_document.py │ ├── metric_elasticsearch_document.py │ ├── user_elasticsearch_document.py │ ├── dashboard_elasticsearch_document.py │ ├── neo4j_es_last_updated.py │ └── table_elasticsearch_document.py ├── rest_api │ ├── __init__.py │ ├── mode_analytics │ │ ├── __init__.py │ │ └── mode_paginated_rest_api_query.py │ ├── rest_api_failure_handlers.py │ └── base_rest_api_query.py ├── task │ ├── __init__.py │ ├── base_task.py │ └── task.py ├── utils │ ├── __init__.py │ └── closer.py ├── extractor │ ├── __init__.py │ ├── restapi │ │ ├── __init__.py │ │ └── rest_api_extractor.py │ ├── user │ │ ├── __init__.py │ │ └── bamboohr │ │ │ ├── __init__.py │ │ │ └── bamboohr_user_extractor.py │ ├── dashboard │ │ ├── __init__.py │ │ ├── redash │ │ │ └── __init__.py │ │ ├── tableau │ │ │ ├── __init__.py │ │ │ └── tableau_dashboard_constants.py │ │ └── mode_analytics │ │ │ ├── __init__.py │ │ │ ├── batch │ │ │ └── __init__.py │ │ │ ├── mode_dashboard_constants.py │ │ │ ├── mode_dashboard_usage_extractor.py │ │ │ ├── mode_dashboard_last_successful_executions_extractor.py │ │ │ └── mode_dashboard_last_modified_timestamp_extractor.py │ ├── table_metadata_constants.py │ ├── base_extractor.py │ ├── generic_extractor.py │ ├── neo4j_es_last_updated_extractor.py │ ├── postgres_metadata_extractor.py │ ├── sql_alchemy_extractor.py │ ├── db_api_extractor.py │ ├── redshift_metadata_extractor.py │ └── glue_extractor.py ├── filesystem │ ├── __init__.py │ └── metadata.py ├── publisher │ ├── __init__.py │ └── base_publisher.py ├── serializers │ ├── __init__.py │ └── neo4_serializer.py ├── transformer │ ├── __init__.py │ ├── remove_field_transformer.py │ ├── dict_to_model.py │ ├── generic_transformer.py │ ├── table_tag_transformer.py │ ├── template_variable_substitution_transformer.py │ ├── timestamp_string_to_epoch.py │ ├── regex_str_replace_transformer.py │ ├── bigquery_usage_transformer.py │ └── base_transformer.py └── __init__.py ├── NOTICE ├── docs └── assets │ ├── AmundsenDataBuilder.png │ └── dashboard_graph_modeling.png ├── .dependabot └── config.yml ├── CODE_OF_CONDUCT.md ├── .gitignore ├── Makefile ├── setup.cfg ├── requirements.txt └── setup.py /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | -------------------------------------------------------------------------------- /.github/titleLint.yml: -------------------------------------------------------------------------------- 1 | regex: (build|ci|docs|feat|fix|perf|refactor|style|test|chore|other): .* 
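(A minimal sketch, not part of the repository: the titleLint.yml rule above gates pull-request titles on a conventional-commit-style prefix. It assumes Python's re module, and the sample titles are made up.)

import re

# Pattern copied from .github/titleLint.yml above.
TITLE_PATTERN = re.compile(r'(build|ci|docs|feat|fix|perf|refactor|style|test|chore|other): .*')

assert TITLE_PATTERN.match('feat: add a Redshift metadata extractor')  # accepted
assert TITLE_PATTERN.match('docs: expand dashboard ingestion guide')   # accepted
assert TITLE_PATTERN.match('added some stuff') is None                 # rejected: no type prefix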
2 | -------------------------------------------------------------------------------- /example/sample_data/sample_tags.csv: -------------------------------------------------------------------------------- 1 | name,tag_type 2 | pii,default 3 | high_quality,default 4 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /example/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/job/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/task/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | amundsendatabuilder 2 | Copyright 2018-2019 Lyft Inc. 3 | 4 | This product includes software developed at Lyft Inc. 5 | -------------------------------------------------------------------------------- /databuilder/callback/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/loader/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/rest_api/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/task/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/callback/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/extractor/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/loader/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/publisher/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/rest_api/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/usage/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/extractor/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/filesystem/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/models/schema/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/models/usage/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/publisher/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/serializers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/extractor/user/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/filesystem/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/models/schema/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/resources/fs_neo4j_csv_loader/movies/nodes/Movie_0.csv: -------------------------------------------------------------------------------- 1 | "name","KEY","LABEL" 2 | "Top Gun","movie://Top Gun","Movie" 3 | -------------------------------------------------------------------------------- /tests/unit/transformer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/usage/presto/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/extractor/restapi/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/extractor/user/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/models/cluster/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/models/dashboard/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/models/timestamp/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /docs/assets/AmundsenDataBuilder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dropbox/amundsendatabuilder/master/docs/assets/AmundsenDataBuilder.png -------------------------------------------------------------------------------- /tests/unit/extractor/dashboard/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/extractor/restapi/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/models/dashboard/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/extractor/dashboard/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/extractor/user/bamboohr/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/rest_api/mode_analytics/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/extractor/user/bamboohr/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/rest_api/mode_analytics/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/extractor/dashboard/redash/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/extractor/dashboard/tableau/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /docs/assets/dashboard_graph_modeling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dropbox/amundsendatabuilder/master/docs/assets/dashboard_graph_modeling.png -------------------------------------------------------------------------------- /example/sample_data/sample_schema_description.csv: -------------------------------------------------------------------------------- 1 | schema_key,schema,description 2 | hive://gold.test_schema,test_schema,"test schema description" -------------------------------------------------------------------------------- /tests/unit/extractor/dashboard/redash/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/extractor/dashboard/tableau/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/resources/fs_neo4j_csv_loader/people/nodes/Person_0.csv: -------------------------------------------------------------------------------- 1 | "name","job","KEY","LABEL" 2 | "Taylor","Engineer","person://Taylor","Person" 3 | -------------------------------------------------------------------------------- /tests/unit/resources/fs_neo4j_csv_loader/people/nodes/Person_1.csv: -------------------------------------------------------------------------------- 1 | "name","pet","KEY","LABEL" 2 | "Griffin","Lion","person://Griffin","Person" 3 | -------------------------------------------------------------------------------- /.dependabot/config.yml: -------------------------------------------------------------------------------- 1 | version: 1 2 | update_configs: 3 | - package_manager: "python" 4 | directory: "/" 5 | update_schedule: "monthly" 6 | -------------------------------------------------------------------------------- /tests/unit/extractor/dashboard/mode_analytics/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/extractor/dashboard/mode_analytics/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /databuilder/extractor/dashboard/mode_analytics/batch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/extractor/dashboard/mode_analytics/batch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | -------------------------------------------------------------------------------- /tests/unit/resources/fs_neo4j_csv_loader/movies/nodes/City_0.csv: -------------------------------------------------------------------------------- 1 | "name","KEY","LABEL" 2 | "Top Gun","city://San Diego","City" 3 | "Top Gun","city://Oakland","City" 4 | -------------------------------------------------------------------------------- /tests/unit/resources/fs_neo4j_csv_loader/movies/nodes/Actor_0.csv: -------------------------------------------------------------------------------- 1 | "name","KEY","LABEL" 2 | "Top Gun","actor://Tom Cruise","Actor" 3 | "Top Gun","actor://Meg Ryan","Actor" 4 | -------------------------------------------------------------------------------- /example/sample_data/sample_source.csv: -------------------------------------------------------------------------------- 1 | db_name,cluster,schema,table_name,source,source_type 2 | hive,gold,test_schema,test_table1,"https://github.com/amundsen-io/amundsen/",github 3 | -------------------------------------------------------------------------------- /example/sample_data/sample_table_last_updated.csv: -------------------------------------------------------------------------------- 1 | cluster,db,schema,table_name,last_updated_time_epoch 2 | gold,hive,test_schema,test_table1,1570230473 3 | gold,dynamo,test_schema,test_table2,1070230473 4 | -------------------------------------------------------------------------------- /example/sample_data/sample_table_owner.csv: -------------------------------------------------------------------------------- 1 | db_name,schema,cluster,table_name,owners 2 | hive,test_schema,gold,test_table1,"roald.amundsen@example.org,chrisc@example.org" 3 | dynamo,test_schema,gold,test_table2, 4 | -------------------------------------------------------------------------------- /tests/unit/resources/csv_publisher/nodes/test_table.csv: -------------------------------------------------------------------------------- 1 | "KEY","name","LABEL" 2 | "presto://gold.test_schema1/test_table1","test_table1","Table" 3 | "presto://gold.test_schema1/test_table2","test_table2","Table" 4 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | This project is governed by [Amundsen's code of conduct](https://github.com/amundsen-io/amundsen/blob/master/CODE_OF_CONDUCT.md). 2 | All contributors and participants agree to abide by its terms. 3 | -------------------------------------------------------------------------------- /databuilder/models/owner_constants.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | 5 | OWNER_RELATION_TYPE = 'OWNER' 6 | OWNER_OF_OBJECT_RELATION_TYPE = 'OWNER_OF' 7 | -------------------------------------------------------------------------------- /databuilder/extractor/table_metadata_constants.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | # String for partition column badge 5 | PARTITION_BADGE = 'partition column' 6 | -------------------------------------------------------------------------------- /example/sample_data/sample_dashboard_owner.csv: -------------------------------------------------------------------------------- 1 | product,cluster,dashboard_group_id,dashboard_id,email 2 | mode,gold,test_group_id_1,test_dashboard_id_1,roald.amundsen@example.org 3 | mode,gold,test_group_id_2,test_dashboard_id_2,buzz@example.org -------------------------------------------------------------------------------- /example/sample_data/sample_dashboard_last_modified.csv: -------------------------------------------------------------------------------- 1 | product,cluster,dashboard_group_id,dashboard_id,last_modified_timestamp 2 | mode,gold,test_group_id_1,test_dashboard_id_1,1592351454 3 | mode,gold,test_group_id_2,test_dashboard_id_2,1592311423 -------------------------------------------------------------------------------- /example/sample_data/sample_dashboard_usage.csv: -------------------------------------------------------------------------------- 1 | product,cluster,dashboard_group_id,dashboard_id,view_count,email 2 | mode,gold,test_group_id_1,test_dashboard_id_1,100,roald.amundsen@example.org 3 | mode,gold,test_group_id_2,test_dashboard_id_2,2000,chrisc@example.org -------------------------------------------------------------------------------- /databuilder/models/usage/usage_constants.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | READ_RELATION_TYPE = 'READ' 5 | READ_REVERSE_RELATION_TYPE = 'READ_BY' 6 | 7 | READ_RELATION_COUNT_PROPERTY = 'read_count' 8 | -------------------------------------------------------------------------------- /tests/unit/resources/csv_publisher/nodes/test_column.csv: -------------------------------------------------------------------------------- 1 | "KEY","name","order_pos:UNQUOTED","type","LABEL" 2 | "presto://gold.test_schema1/test_table1/test_id1","test_id1",1,"bigint","Column" 3 | "presto://gold.test_schema1/test_table1/test_id2","test_id2",2,"bigint","Column" 4 | -------------------------------------------------------------------------------- /example/sample_data/sample_application.csv: -------------------------------------------------------------------------------- 1 | task_id,dag_id,exec_date,application_url_template,db_name,schema,table_name,cluster 2 | hive.test_schema.test_table1,event_test,"2018-05-31T00:00:00","https://airflow_host.net/admin/airflow/tree?dag_id={dag_id}",hive,test_schema,test_table1,gold 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.pyc 3 | *.pyo 4 | *.pyt 5 | *.pytc 6 | *.egg-info 7 | .*.swp 8 | .DS_Store 9 | build/ 10 | dist/ 11 | venv/ 12 | venv3/ 13 | .python-version 14 | .cache/ 15 | .env 16 | .idea/ 17 | .vscode/ 18 | .coverage 19 | .mypy_cache 20 | .pytest_cache 21 | **/coverage.xml 22 | **/htmlcov/** 23 | -------------------------------------------------------------------------------- /example/sample_data/sample_dashboard_table.csv: -------------------------------------------------------------------------------- 1 | product,cluster,dashboard_group_id,dashboard_id,table_ids 2 | 
mode,gold,test_group_id_1,test_dashboard_id_1,"hive://gold.test_schema/test_table1" 3 | mode,gold,test_group_id_2,test_dashboard_id_2,"hive://gold.test_schema/test_view1,hive://gold.test_schema/test_table3" 4 | -------------------------------------------------------------------------------- /tests/unit/resources/fs_neo4j_csv_loader/movies/relationships/test_Movie_Actor_ACTOR.csv: -------------------------------------------------------------------------------- 1 | "END_KEY","START_LABEL","END_LABEL","START_KEY","TYPE","REVERSE_TYPE" 2 | "actor://Tom Cruise","Movie","Actor","movie://Top Gun","ACTOR","ACTED_IN" 3 | "actor://Meg Ryan","Movie","Actor","movie://Top Gun","ACTOR","ACTED_IN" 4 | -------------------------------------------------------------------------------- /databuilder/models/cluster/cluster_constants.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | CLUSTER_NODE_LABEL = 'Cluster' 5 | 6 | CLUSTER_RELATION_TYPE = 'CLUSTER' 7 | CLUSTER_REVERSE_RELATION_TYPE = 'CLUSTER_OF' 8 | 9 | CLUSTER_NAME_PROP_KEY = 'name' 10 | -------------------------------------------------------------------------------- /tests/unit/resources/fs_neo4j_csv_loader/movies/relationships/test_Movie_City_FILMED_AT.csv: -------------------------------------------------------------------------------- 1 | "END_KEY","START_LABEL","END_LABEL","START_KEY","TYPE","REVERSE_TYPE" 2 | "city://San Diego","Movie","City","city://Top Gun","FILMED_AT","APPEARS_IN" 3 | "city://Oakland","Movie","City","city://Top Gun","FILMED_AT","APPEARS_IN" 4 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | clean: 2 | find . -name \*.pyc -delete 3 | find . -name __pycache__ -delete 4 | rm -rf dist/ 5 | 6 | .PHONY: test_unit 7 | test_unit: 8 | python3 -bb -m pytest tests 9 | 10 | lint: 11 | flake8 . 12 | 13 | .PHONY: mypy 14 | mypy: 15 | mypy . 16 | 17 | .PHONY: test 18 | test: test_unit lint mypy 19 | 20 | -------------------------------------------------------------------------------- /databuilder/models/graph_node.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from collections import namedtuple 5 | 6 | GraphNode = namedtuple( 7 | 'GraphNode', 8 | [ 9 | 'key', 10 | 'label', 11 | 'attributes' 12 | ] 13 | ) 14 | -------------------------------------------------------------------------------- /databuilder/models/schema/schema_constant.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | SCHEMA_NODE_LABEL = 'Schema' 5 | 6 | SCHEMA_NAME_ATTR = 'name' 7 | 8 | SCHEMA_RELATION_TYPE = 'SCHEMA' 9 | SCHEMA_REVERSE_RELATION_TYPE = 'SCHEMA_OF' 10 | 11 | DATABASE_SCHEMA_KEY_FORMAT = '{db}://{cluster}.{schema}' 12 | -------------------------------------------------------------------------------- /tests/unit/resources/csv_publisher/relations/test_edge_short.csv: -------------------------------------------------------------------------------- 1 | "START_LABEL","START_KEY","END_LABEL","END_KEY","TYPE","REVERSE_TYPE" 2 | "Table","presto://gold.test_schema1/test_table1","Column","presto://gold.test_schema1/test_table1/test_id1","COLUMN","BELONG_TO_TABLE" 3 | "Table","presto://gold.test_schema1/test_table1","Column","presto://gold.test_schema1/test_table1/test_id2","COLUMN","BELONG_TO_TABLE" 4 | -------------------------------------------------------------------------------- /databuilder/models/graph_relationship.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from collections import namedtuple 5 | 6 | GraphRelationship = namedtuple( 7 | 'GraphRelationship', 8 | [ 9 | 'start_label', 10 | 'end_label', 11 | 'start_key', 12 | 'end_key', 13 | 'type', 14 | 'reverse_type', 15 | 'attributes' 16 | ] 17 | ) 18 | -------------------------------------------------------------------------------- /example/sample_data/sample_dashboard_query.csv: -------------------------------------------------------------------------------- 1 | product,cluster,dashboard_group_id,dashboard_id,query_name,query_id,url,query_text 2 | mode,gold,test_group_id_1,test_dashboard_id_1,first query,query_1,http://mode.test_group_id_1.com/test_dashboard_id_1/query/query_1,SELECT * FROM foo.bar 3 | mode,gold,test_group_id_2,test_dashboard_id_2,second query,query_2,http://mode.test_group_id_2.com/test_dashboard_id_2/query/query_2,SELECT * FROM bar.foo JOIN foo.bar USING (baz) -------------------------------------------------------------------------------- /databuilder/extractor/dashboard/mode_analytics/mode_dashboard_constants.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | ORGANIZATION = 'organization' 5 | MODE_ACCESS_TOKEN = 'mode_user_token' 6 | MODE_PASSWORD_TOKEN = 'mode_password_token' 7 | 8 | # this token is needed to access the batch discovery endpoint 9 | # e.g. https://mode.com/developer/discovery-api/introduction/ 10 | MODE_BEARER_TOKEN = 'mode_bearer_token' 11 | -------------------------------------------------------------------------------- /example/sample_data/sample_watermark.csv: -------------------------------------------------------------------------------- 1 | create_time,database,schema,table_name,part_name,part_type,cluster 2 | 2019-10-01T12:13:14,hive,test_schema,test_table1,col3=2017-04-22/col4=0,low_watermark,gold 3 | 2019-10-01T12:13:14,hive,test_schema,test_table1,col3=2019-09-30/col4=11,high_watermark,gold 4 | 2019-10-01T12:13:14,dynamo,test_schema,test_table2,col3=2018-01-01,low_watermark,gold 5 | 2019-10-01T12:13:14,dynamo,test_schema,test_table2,col3=2019-10-01,high_watermark,gold 6 | -------------------------------------------------------------------------------- /example/sample_data/sample_dashboard_last_execution.csv: -------------------------------------------------------------------------------- 1 | product,cluster,dashboard_group_id,dashboard_id,execution_id,execution_timestamp,execution_state 2 | mode,gold,test_group_id_1,test_dashboard_id_1,_last_successful_execution,1592351193,success 3 | mode,gold,test_group_id_2,test_dashboard_id_2,_last_successful_execution,1592351210,success 4 | mode,gold,test_group_id_1,test_dashboard_id_1,_last_execution,1593351193,fail 5 | mode,gold,test_group_id_2,test_dashboard_id_2,_last_execution,1594351210,success -------------------------------------------------------------------------------- /example/sample_data/sample_table.csv: -------------------------------------------------------------------------------- 1 | database,cluster,schema,name,description,tags,is_view,description_source 2 | hive,gold,test_schema,test_table1,"1st test table","tag1,tag2,pii,high_quality",false, 3 | dynamo,gold,test_schema,test_table2,"2nd test table","high_quality,recommended",false, 4 | hive,gold,test_schema,test_view1,"1st test view","tag1",true, 5 | hive,gold,test_schema,test_table3,"3rd test","needs_documentation",false, 6 | hive,gold,test_schema,"test's_table4","4th test","needs_documentation",false, 7 | -------------------------------------------------------------------------------- /example/sample_data/sample_user.csv: -------------------------------------------------------------------------------- 1 | email,first_name,last_name,full_name,github_username,team_name,employee_type,manager_email,slack_id,role_name 2 | roald.amundsen@example.org,Roald,Amundsen,"Roald Amundsen",amundsen-io,"Team Amundsen",sailor,"phboss@example.org",ramundzn,swe 3 | chrisc@example.org,Christopher,Columbus,"Christopher Columbus",ChristopherColumbusFAKE,"Team Amundsen",sailor,"phboss@example.org",chrisc,swe 4 | buzz@example.org,Buzz,Aldrin,"Buzz Aldrin",BuzzAldrinFAKE,"Team Amundsen",astronaut,"phboss@example.org",buzz,swe 5 | -------------------------------------------------------------------------------- /databuilder/models/presto_query_logs.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | 5 | class PrestoQueryLogs: 6 | """ 7 | Presto query logs model. 8 | SQL result has one row per Presto query.
9 | """ 10 | 11 | def __init__(self, 12 | user: str, 13 | query_text: str, 14 | occurred_at: str 15 | ) -> None: 16 | self.user = user 17 | self.query_text = query_text 18 | self.occurred_at = occurred_at 19 | -------------------------------------------------------------------------------- /databuilder/models/timestamp/timestamp_constants.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from enum import Enum 5 | 6 | NODE_LABEL = 'Timestamp' 7 | 8 | TIMESTAMP_PROPERTY = 'timestamp' 9 | TIMESTAMP_NAME_PROPERTY = 'name' 10 | # This is deprecated property as it's not generic for the Timestamp 11 | DEPRECATED_TIMESTAMP_PROPERTY = 'last_updated_timestamp' 12 | 13 | 14 | LASTUPDATED_RELATION_TYPE = 'LAST_UPDATED_AT' 15 | LASTUPDATED_REVERSE_RELATION_TYPE = 'LAST_UPDATED_TIME_OF' 16 | 17 | 18 | class TimestampName(Enum): 19 | last_updated_timestamp = 1 20 | -------------------------------------------------------------------------------- /example/sample_data/sample_table_programmatic_source.csv: -------------------------------------------------------------------------------- 1 | database,cluster,schema,name,description,tags,description_source 2 | hive,gold,test_schema,test_table1,"**Size**: 50T 3 | 4 | **Monthly Cost**: $5000","expensive","s3_crawler" 5 | dynamo,gold,test_schema,test_table2,"**Size**: 1T 6 | 7 | **Monthly Cost**: $50","cheap","s3_crawler" 8 | hive,gold,test_schema,test_table1,"### Quality Report: 9 | --- 10 | Ipsus enom. Ipsus enom ipsus lorenum. 11 | --- 12 | [![Build Status](https://api.travis-ci.com/amundsen-io/amundsendatabuilder.svg?branch=master)](https://travis-ci.com/amundsen-io/amundsendatabuilder)","low_quality","quality_service" 13 | -------------------------------------------------------------------------------- /databuilder/loader/base_loader.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import abc 5 | 6 | from pyhocon import ConfigTree 7 | 8 | from databuilder import Scoped 9 | from typing import Any 10 | 11 | 12 | class Loader(Scoped): 13 | """ 14 | A loader loads to the destination or to the staging area 15 | """ 16 | @abc.abstractmethod 17 | def init(self, conf: ConfigTree) -> None: 18 | pass 19 | 20 | @abc.abstractmethod 21 | def load(self, record: Any) -> None: 22 | pass 23 | 24 | def get_scope(self) -> str: 25 | return 'loader' 26 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Codeowners file by GitHub 2 | # Reference: https://docs.github.com/en/github/creating-cloning-and-archiving-repositories/about-code-owners 3 | # Each line is a file pattern followed by one or more owners. 4 | # Order is important; the last matching pattern takes the most 5 | # precedence. 6 | 7 | # These owners will be the default owners for everything in 8 | # the repo. Unless a later match takes precedence, 9 | # @amundsen-io/amundsen-committerswill be requested for 10 | # review when someone opens a pull request. 
11 | * @amundsen-io/amundsen-committers 12 | 13 | *.py @feng-tao @jinhyukchang @allisonsuarez @dikshathakur3119 -------------------------------------------------------------------------------- /databuilder/models/elasticsearch_document.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import json 5 | from abc import ABCMeta 6 | 7 | 8 | class ElasticsearchDocument: 9 | """ 10 | Base class for ElasticsearchDocument. 11 | Each resource-specific ES document will be a subclass 12 | """ 13 | __metaclass__ = ABCMeta 14 | 15 | def to_json(self) -> str: 16 | """ 17 | Convert object to JSON 18 | :return: 19 | """ 20 | obj_dict = {k: v for k, v in sorted(self.__dict__.items())} 21 | data = json.dumps(obj_dict) + "\n" 22 | return data 23 | -------------------------------------------------------------------------------- /databuilder/task/base_task.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import abc 5 | 6 | from pyhocon import ConfigTree 7 | 8 | from databuilder import Scoped 9 | 10 | 11 | class Task(Scoped): 12 | """ 13 | An abstract task that runs a unit of work 14 | """ 15 | @abc.abstractmethod 16 | def init(self, conf: ConfigTree) -> None: 17 | pass 18 | 19 | @abc.abstractmethod 20 | def run(self) -> None: 21 | """ 22 | Runs a task 23 | :return: 24 | """ 25 | pass 26 | 27 | def get_scope(self) -> str: 28 | return 'task' 29 | -------------------------------------------------------------------------------- /databuilder/filesystem/metadata.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project.
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from datetime import datetime 5 | 6 | 7 | class FileMetadata(object): 8 | 9 | def __init__(self, 10 | path: str, 11 | last_updated: datetime, 12 | size: int 13 | ) -> None: 14 | self.path = path 15 | self.last_updated = last_updated 16 | self.size = size 17 | 18 | def __repr__(self) -> str: 19 | return """FileMetadata(path={!r}, last_updated={!r}, size={!r})""" \ 20 | .format(self.path, self.last_updated, self.size) 21 | -------------------------------------------------------------------------------- /.github/workflows/license.yml: -------------------------------------------------------------------------------- 1 | name: license 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - master 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Golang 18 | uses: actions/setup-go@v2 19 | - name: Install addlicense 20 | run: | 21 | export PATH=${PATH}:`go env GOPATH`/bin 22 | go get -v -u github.com/google/addlicense 23 | - name: Check license 24 | run: | 25 | export PATH=${PATH}:`go env GOPATH`/bin 26 | addlicense -check -l mit -c "Amundsen" $(find $PWD -type f -name '*.py') -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | format = pylint 3 | exclude = .svc,CVS,.bzr,.hg,.git,__pycache__,venv,build,databuilder/sql_parser/usage/presto/antlr_generated 4 | max-complexity = 10 5 | max-line-length = 120 6 | ignore = NONE 7 | 8 | [pep8] 9 | max-line-length = 120 10 | 11 | [tool:pytest] 12 | addopts = -rs --cov=databuilder --cov-fail-under=70 --cov-report=term-missing:skip-covered --cov-report=xml --cov-report=html -vvv 13 | 14 | [coverage:run] 15 | branch = True 16 | omit = */antlr_generated/* 17 | 18 | [coverage:xml] 19 | output = build/coverage.xml 20 | 21 | [coverage:html] 22 | directory = build/coverage_html 23 | 24 | [mypy] 25 | python_version = 3.6 26 | disallow_untyped_defs = True 27 | ignore_missing_imports = True 28 | -------------------------------------------------------------------------------- /databuilder/extractor/dashboard/tableau/tableau_dashboard_constants.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | API_VERSION = 'api_version' 5 | API_BASE_URL = 'api_base_url' 6 | TABLEAU_BASE_URL = 'tableau_base_url' 7 | SITE_NAME = 'site_name' 8 | TABLEAU_ACCESS_TOKEN_NAME = 'tableau_personal_access_token_name' 9 | TABLEAU_ACCESS_TOKEN_SECRET = 'tableau_personal_access_token_secret' 10 | EXCLUDED_PROJECTS = 'excluded_projects' 11 | EXTERNAL_CLUSTER_NAME = 'external_cluster_name' 12 | EXTERNAL_SCHEMA_NAME = 'external_schema_name' 13 | EXTERNAL_TABLE_TYPES = 'external_table_types' 14 | CLUSTER = 'cluster' 15 | DATABASE = 'database' 16 | VERIFY_REQUEST = 'verify_request' 17 | -------------------------------------------------------------------------------- /databuilder/extractor/base_extractor.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import abc 5 | 6 | from pyhocon import ConfigTree 7 | from typing import Any 8 | 9 | from databuilder import Scoped 10 | 11 | 12 | class Extractor(Scoped): 13 | """ 14 | An extractor extracts records 15 | """ 16 | 17 | @abc.abstractmethod 18 | def init(self, conf: ConfigTree) -> None: 19 | pass 20 | 21 | @abc.abstractmethod 22 | def extract(self) -> Any: 23 | """ 24 | :return: Provides a record or None if no more to extract 25 | """ 26 | return None 27 | 28 | def get_scope(self) -> str: 29 | return 'extractor' 30 | -------------------------------------------------------------------------------- /databuilder/job/base_job.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import abc 5 | 6 | from pyhocon import ConfigTree 7 | 8 | from databuilder import Scoped 9 | from databuilder.utils.closer import Closer 10 | 11 | 12 | class Job(Scoped): 13 | closer = Closer() 14 | 15 | """ 16 | A Databuilder job that represents a single unit of work. 17 | """ 18 | @abc.abstractmethod 19 | def init(self, conf: ConfigTree) -> None: 20 | pass 21 | 22 | @abc.abstractmethod 23 | def launch(self) -> None: 24 | """ 25 | Launch a job 26 | :return: None 27 | """ 28 | pass 29 | 30 | def get_scope(self) -> str: 31 | return 'job' 32 | -------------------------------------------------------------------------------- /databuilder/models/metric_elasticsearch_document.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from typing import List 5 | 6 | from databuilder.models.elasticsearch_document import ElasticsearchDocument 7 | 8 | 9 | class MetricESDocument(ElasticsearchDocument): 10 | """ 11 | Schema for the Search index document 12 | """ 13 | 14 | def __init__(self, 15 | name: str, 16 | description: str, 17 | type: str, 18 | dashboards: List, 19 | tags: List, 20 | ) -> None: 21 | self.name = name 22 | self.description = description 23 | self.type = type 24 | self.dashboards = dashboards 25 | self.tags = tags 26 | -------------------------------------------------------------------------------- /.github/stale.yml: -------------------------------------------------------------------------------- 1 | # Number of days of inactivity before an issue becomes stale 2 | daysUntilStale: 14 3 | # Number of days of inactivity before a stale issue is closed 4 | daysUntilClose: 21 5 | # Issues with these labels will never be considered stale 6 | exemptLabels: 7 | - keep fresh 8 | # Label to use when marking an issue as stale 9 | staleLabel: stale 10 | # Comment to post when marking an issue as stale. Set to `false` to disable 11 | markComment: > 12 | This issue has been automatically marked as stale because it has not had 13 | recent activity. It will be closed if no further activity occurs. 14 | # Comment to post when closing a stale issue. Set to `false` to disable 15 | closeComment: > 16 | This issue has been automatically closed for inactivity. If you still wish to 17 | make these changes, please open a new pull request or reopen this one.
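(A minimal sketch, not part of the repository, showing a concrete implementation of the Extractor interface from databuilder/extractor/base_extractor.py above; the class name and records are invented.)

from typing import Any, Iterator

from pyhocon import ConfigTree

from databuilder.extractor.base_extractor import Extractor


class InMemoryExtractor(Extractor):
    """Hypothetical extractor that replays a fixed list of records."""

    def init(self, conf: ConfigTree) -> None:
        # A real extractor would read connection settings from conf here.
        self._records: Iterator[Any] = iter(['record_1', 'record_2'])

    def extract(self) -> Any:
        # Returning None signals that there is nothing more to extract.
        return next(self._records, None)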
18 | -------------------------------------------------------------------------------- /.github/workflows/pypipublish.yml: -------------------------------------------------------------------------------- 1 | 2 | name: Build and Deploy 3 | on: 4 | push: 5 | branches: 6 | - master 7 | tags: 8 | - '*' 9 | jobs: 10 | build-and-publish-python-module: 11 | name: Build and publish python module to pypi 12 | runs-on: ubuntu-18.04 13 | steps: 14 | - name: Checkout 15 | uses: actions/checkout@v1 16 | - name: Setup python 3.6 17 | uses: actions/setup-python@v1 18 | with: 19 | python-version: 3.6 20 | - name: Add wheel dependency 21 | run: pip install wheel 22 | - name: Generate dist 23 | run: python setup.py sdist bdist_wheel 24 | - name: Publish to PyPI 25 | if: startsWith(github.event.ref, 'refs/tags') 26 | uses: pypa/gh-action-pypi-publish@master 27 | with: 28 | user: __token__ 29 | password: ${{ secrets.pypi_password }} 30 | -------------------------------------------------------------------------------- /example/sample_data/sample_col.csv: -------------------------------------------------------------------------------- 1 | name,description,col_type,sort_order,database,cluster,schema,table_name 2 | col1,"col1 description","string",1,hive,gold,test_schema,test_table1 3 | col2,"col2 description","string",2,hive,gold,test_schema,test_table1 4 | col3,"col3 description","string",3,hive,gold,test_schema,test_table1 5 | col4,"col4 description","string",4,hive,gold,test_schema,test_table1 6 | col5,"col5 description","float",5,hive,gold,test_schema,test_table1 7 | col1,"col1 description","string",1,dynamo,gold,test_schema,test_table2 8 | col2,"col2 description","string",2,dynamo,gold,test_schema,test_table2 9 | col3,"col3 description","string",3,dynamo,gold,test_schema,test_table2 10 | col4,"col4 description","int",4,dynamo,gold,test_schema,test_table2 11 | col1,"view col description","int",1,hive,gold,test_schema,test_view1 12 | col1,"col1 description","int",1,hive,gold,test_schema,test_table3 13 | -------------------------------------------------------------------------------- /tests/unit/rest_api/test_rest_api_failure_handlers.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | 6 | from databuilder.rest_api.rest_api_failure_handlers import HttpFailureSkipOnStatus 7 | from mock import MagicMock 8 | 9 | 10 | class TestHttpFailureSkipOnStatus(unittest.TestCase): 11 | 12 | def testSkip(self) -> None: 13 | failure_handler = HttpFailureSkipOnStatus([404, 400]) 14 | 15 | exception = MagicMock() 16 | exception.response.status_code = 404 17 | self.assertTrue(failure_handler.can_skip_failure(exception=exception)) 18 | 19 | exception.response.status_code = 400 20 | self.assertTrue(failure_handler.can_skip_failure(exception=exception)) 21 | 22 | exception.response.status_code = 500 23 | self.assertFalse(failure_handler.can_skip_failure(exception=exception)) 24 | -------------------------------------------------------------------------------- /example/sample_data/sample_table_column_stats.csv: -------------------------------------------------------------------------------- 1 | cluster,db,schema,table_name,col_name,stat_name,stat_val,start_epoch,end_epoch 2 | gold,hive,test_schema,test_table1,col1,"distinct values","8",1432300762,1562300762 3 | gold,hive,test_schema,test_table1,col1,"min","""aardvark""",1432300762,1562300762 4 | gold,hive,test_schema,test_table1,col1,"max","""zebra""",1432300762,1562300762 5 | gold,hive,test_schema,test_table1,col1,"num nulls","""500320""",1432300762,1562300762 6 | gold,hive,test_schema,test_table1,col1,"verified","""230430""",1432300762,1562300762 7 | gold,hive,test_schema,test_table1,col5,"average","""5.0""",1532300762,1572300762 8 | gold,hive,test_schema,test_table1,col5,"max","""500.0""",1534300762,1572300762 9 | gold,hive,test_schema,test_table1,col5,"min","""-500.0""",1534300762,1572300762 10 | gold,dynamo,test_schema,test_table2,col4,"median","""250""",1534300762,1572300762 11 | gold,dynamo,test_schema,test_table2,col4,"average","""400""",1534300762,1572300762 -------------------------------------------------------------------------------- /databuilder/transformer/remove_field_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | from typing import Any, Dict 6 | 7 | from pyhocon import ConfigTree 8 | 9 | from databuilder.transformer.base_transformer import Transformer 10 | 11 | FIELD_NAMES = 'field_names' # field name to be removed 12 | 13 | LOGGER = logging.getLogger(__name__) 14 | 15 | 16 | class RemoveFieldTransformer(Transformer): 17 | """ 18 | Remove field in Dict by specifying list of fields (keys). 19 | 20 | """ 21 | 22 | def init(self, conf: ConfigTree) -> None: 23 | self._field_names = conf.get_list(FIELD_NAMES) 24 | 25 | def transform(self, record: Dict[str, Any]) -> Dict[str, Any]: 26 | 27 | for k in self._field_names: 28 | if k in record: 29 | del record[k] 30 | 31 | return record 32 | 33 | def get_scope(self) -> str: 34 | return 'transformer.remove_field' 35 | -------------------------------------------------------------------------------- /databuilder/transformer/dict_to_model.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
20 | """ 21 | 22 | def init(self, conf: ConfigTree) -> None: 23 | model_class = conf.get_string(MODEL_CLASS) 24 | module_name, class_name = model_class.rsplit(".", 1) 25 | mod = importlib.import_module(module_name) 26 | self._model_class = getattr(mod, class_name) 27 | 28 | def transform(self, record: Dict[str, Any]) -> Any: 29 | # Returns a model instance, not a Dict, hence the Any return type. 30 | return self._model_class(**record) 31 | 32 | def get_scope(self) -> str: 33 | return 'transformer.dict_to_model' 34 | -------------------------------------------------------------------------------- /tests/unit/transformer/test_template_variable_substitution_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | 6 | from pyhocon import ConfigFactory 7 | 8 | from databuilder.transformer.template_variable_substitution_transformer import \ 9 | TemplateVariableSubstitutionTransformer, FIELD_NAME, TEMPLATE 10 | 11 | 12 | class TestTemplateVariableSubstitutionTransformer(unittest.TestCase): 13 | 14 | def test_conversion(self) -> None: 15 | 16 | transformer = TemplateVariableSubstitutionTransformer() 17 | config = ConfigFactory.from_dict({ 18 | FIELD_NAME: 'baz', 19 | TEMPLATE: 'Hello {foo}' 20 | }) 21 | transformer.init(conf=config) 22 | 23 | actual = transformer.transform({'foo': 'bar'}) 24 | expected = { 25 | 'foo': 'bar', 26 | 'baz': 'Hello bar' 27 | } 28 | self.assertDictEqual(expected, actual) 29 | 30 | 31 | if __name__ == '__main__': 32 | unittest.main() 33 | -------------------------------------------------------------------------------- /example/sample_data/sample_column_usage.csv: -------------------------------------------------------------------------------- 1 | database,cluster,schema,table_name,column_name,user_email,read_count 2 | hive,gold,test_schema,test_table1,col1,roald.amundsen@example.org,500 3 | hive,gold,test_schema,test_table1,col1,aoald0@example.org,100 4 | hive,gold,test_schema,test_table1,col1,boald1@example.org,100 5 | hive,gold,test_schema,test_table1,col1,coald2@example.org,100 6 | hive,gold,test_schema,test_table1,col1,doald3@example.org,100 7 | hive,gold,test_schema,test_table1,col1,eoald4@example.org,100 8 | hive,gold,test_schema,test_table1,col1,foald5@example.org,100 9 | hive,gold,test_schema,test_table1,col1,goald6@example.org,100 10 | hive,gold,test_schema,test_table1,col1,hoald7@example.org,100 11 | hive,gold,test_schema,test_table1,col1,ioald8@example.org,10 12 | hive,gold,test_schema,test_table1,col1,joald9@example.org,10 13 | hive,gold,test_schema,test_table1,col1,koald9@example.org,10 14 | hive,gold,test_schema,test_table2,col1,soald9@example.org,10 15 | hive,gold,test_schema,test_table2,col1,toald9@example.org,10 16 | dynamo,gold,test_schema,test_table2,col1,chrisc@example.org,500 17 | -------------------------------------------------------------------------------- /example/sample_data/sample_dashboard_base.csv: -------------------------------------------------------------------------------- 1 |
product,cluster,dashboard_group,dashboard_group_id,dashboard_group_description,dashboard_group_url,dashboard_name,dashboard_id,description,created_timestamp,dashboard_url 2 | mode,gold,test group1,test_group_id_1,test group description 1,http://mode.test_group_id_1.com,test dashboard,test_dashboard_id_1,test dashboard description,1592333799,http://mode.test_group_id_1.com/test_dashboard_id_1 3 | mode,gold,test group1,test_group_id_1,test group description 1_2,http://mode.test_group_id_1.com,test dashboard,test_dashboard_id_1_2,test dashboard description 1_2,1592332799,http://mode.test_group_id_1.com/test_dashboard_id_1_2 4 | mode,gold,test group2,test_group_id_2,test group description 2,http://mode.test_group_id_2.com,test dashboard,test_dashboard_id_2,test dashboard description,1592133799,http://mode.test_group_id_2.com/test_dashboard_id_2 5 | superset,gold,test group3,test_group_id_3,test group description 1,http://mode.test_group_id_3.com,test dashboard,test_dashboard_id_3,test dashboard description,1591333799,http://mode.test_group_id_3.com/test_dashboard_id_3 6 | -------------------------------------------------------------------------------- /databuilder/rest_api/rest_api_failure_handlers.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import abc 5 | 6 | from typing import Iterable 7 | 8 | 9 | class BaseFailureHandler(object, metaclass=abc.ABCMeta): 10 | 11 | @abc.abstractmethod 12 | def can_skip_failure(self, 13 | exception: Exception, 14 | ) -> bool: 15 | pass 16 | 17 | 18 | class HttpFailureSkipOnStatus(BaseFailureHandler): 19 | 20 | def __init__(self, 21 | status_codes_to_skip: Iterable[int], 22 | ) -> None: 23 | self._status_codes_to_skip = {v for v in status_codes_to_skip} 24 | 25 | def can_skip_failure(self, 26 | exception: Exception, 27 | ) -> bool: 28 | try: 29 | status_code: int = getattr(getattr(exception, 'response'), 'status_code') 30 | return status_code in self._status_codes_to_skip 31 | except AttributeError: 32 | pass 33 | 34 | return False 35 | -------------------------------------------------------------------------------- /tests/unit/loader/test_generic_loader.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | 6 | from mock import MagicMock 7 | from pyhocon import ConfigFactory 8 | 9 | from databuilder.loader.generic_loader import GenericLoader, CALLBACK_FUNCTION 10 | 11 | 12 | class TestGenericLoader(unittest.TestCase): 13 | 14 | def test_loading(self) -> None: 15 | 16 | loader = GenericLoader() 17 | callback_func = MagicMock() 18 | loader.init(conf=ConfigFactory.from_dict({ 19 | CALLBACK_FUNCTION: callback_func 20 | })) 21 | 22 | loader.load({'foo': 'bar'}) 23 | loader.close() 24 | 25 | callback_func.assert_called_once() 26 | 27 | def test_none_loading(self) -> None: 28 | 29 | loader = GenericLoader() 30 | callback_func = MagicMock() 31 | loader.init(conf=ConfigFactory.from_dict({ 32 | CALLBACK_FUNCTION: callback_func 33 | })) 34 | 35 | loader.load(None) 36 | loader.close() 37 | 38 | callback_func.assert_not_called() 39 | -------------------------------------------------------------------------------- /.github/workflows/pull_request.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | on: pull_request 5 | jobs: 6 | pre-commit: 7 | runs-on: ubuntu-18.04 8 | steps: 9 | - name: Checkout 10 | uses: actions/checkout@v1 11 | - name: Setup python 3.6 12 | uses: actions/setup-python@v1 13 | with: 14 | python-version: 3.6 15 | # NOTE: assumed step; as originally written, this job checked out code and set up Python but ran no checks. 16 | - name: Run pre-commit 17 | run: pip install pre-commit && pre-commit run --all-files 18 | test-unit: 19 | runs-on: ubuntu-18.04 20 | strategy: 21 | matrix: 22 | python-version: ['3.6.x', '3.7.x'] 23 | steps: 24 | - name: Checkout 25 | uses: actions/checkout@v1 26 | - name: Setup python ${{ matrix.python-version }} 27 | uses: actions/setup-python@v1 28 | with: 29 | python-version: ${{ matrix.python-version }} 30 | - name: Install dependencies 31 | run: pip3 install -r requirements.txt && pip3 install .[all] && pip3 install codecov 32 | - name: Run python unit tests 33 | run: make test 34 | -------------------------------------------------------------------------------- /databuilder/transformer/generic_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | 6 | from pyhocon import ConfigTree 7 | from typing import Any, Dict 8 | 9 | from databuilder.transformer.base_transformer import Transformer 10 | 11 | CALLBACK_FUNCTION = 'callback_function' 12 | FIELD_NAME = 'field_name' 13 | 14 | LOGGER = logging.getLogger(__name__) 15 | 16 | 17 | class GenericTransformer(Transformer): 18 | """ 19 | A generic transformer that applies a user-supplied callback function to the specified field of each record.
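Example (an illustrative sketch; the uppercasing callback and the 'name' field are hypothetical):

    from pyhocon import ConfigFactory
    transformer = GenericTransformer()
    transformer.init(conf=ConfigFactory.from_dict({
        'callback_function': lambda v: v.upper(),
        'field_name': 'name',
    }))
    transformer.transform({'name': 'foo'})  # -> {'name': 'FOO'}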
20 | """ 21 | 22 | def init(self, conf: ConfigTree) -> None: 23 | self._callback_function = conf.get(CALLBACK_FUNCTION) 24 | self._field_name = conf.get_string(FIELD_NAME) 25 | 26 | def transform(self, record: Dict[str, Any]) -> Dict[str, Any]: 27 | 28 | for k, v in record.items(): 29 | if k == self._field_name: 30 | new_val = self._callback_function(v) 31 | record[k] = new_val 32 | return record 33 | 34 | def get_scope(self) -> str: 35 | return 'transformer.generic' 36 | -------------------------------------------------------------------------------- /databuilder/transformer/table_tag_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from pyhocon import ConfigFactory, ConfigTree 5 | from typing import Any 6 | 7 | from databuilder.transformer.base_transformer import Transformer 8 | from databuilder.models.table_metadata import TableMetadata 9 | 10 | 11 | class TableTagTransformer(Transformer): 12 | """Simple transformer that adds tags to all table nodes produced as part of a job.""" 13 | # Config 14 | TAGS = 'tags' 15 | DEFAULT_CONFIG = ConfigFactory.from_dict({TAGS: None}) 16 | 17 | def init(self, conf: ConfigTree) -> None: 18 | conf = conf.with_fallback(TableTagTransformer.DEFAULT_CONFIG) 19 | tags = conf.get_string(TableTagTransformer.TAGS) 20 | 21 | self.tags = TableMetadata.format_tags(tags) 22 | 23 | def transform(self, record: Any) -> Any: 24 | if isinstance(record, TableMetadata): 25 | if record.tags: 26 | record.tags += self.tags 27 | else: 28 | record.tags = self.tags 29 | return record 30 | 31 | def get_scope(self) -> str: 32 | return 'transformer.table_tag' 33 | -------------------------------------------------------------------------------- /databuilder/transformer/template_variable_substitution_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | 6 | from pyhocon import ConfigTree 7 | from typing import Any, Dict 8 | 9 | from databuilder.transformer.base_transformer import Transformer 10 | 11 | TEMPLATE = 'template' 12 | FIELD_NAME = 'field_name' # field name to UPSERT 13 | 14 | LOGGER = logging.getLogger(__name__) 15 | 16 | 17 | class TemplateVariableSubstitutionTransformer(Transformer): 18 | """ 19 | Add/Replace field in Dict by string.format based on given template and provide record Dict as a template parameter 20 | https://docs.python.org/3.4/library/string.html#string.Formatter.format 21 | 22 | """ 23 | 24 | def init(self, conf: ConfigTree) -> None: 25 | 26 | self._template = conf.get_string(TEMPLATE) 27 | self._field_name = conf.get_string(FIELD_NAME) 28 | 29 | def transform(self, record: Dict[str, Any]) -> Dict[str, Any]: 30 | 31 | val = self._template.format(**record) 32 | record[self._field_name] = val 33 | return record 34 | 35 | def get_scope(self) -> str: 36 | return 'transformer.template_variable_substitution' 37 | -------------------------------------------------------------------------------- /tests/unit/extractor/test_neo4j_es_last_updated_extractor.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from mock import patch 5 | from typing import Any 6 | import unittest 7 | 8 | from pyhocon import ConfigFactory 9 | 10 | from databuilder import Scoped 11 | from databuilder.extractor.neo4j_es_last_updated_extractor import Neo4jEsLastUpdatedExtractor 12 | 13 | 14 | class TestNeo4jEsLastUpdatedExtractor(unittest.TestCase): 15 | 16 | def setUp(self) -> None: 17 | config_dict = { 18 | 'extractor.neo4j_es_last_updated.model_class': 19 | 'databuilder.models.neo4j_es_last_updated.Neo4jESLastUpdated', 20 | } 21 | self.conf = ConfigFactory.from_dict(config_dict) 22 | 23 | @patch('time.time') 24 | def test_extraction_with_model_class(self, mock_time: Any) -> None: 25 | """ 26 | Test Extraction using model class 27 | """ 28 | mock_time.return_value = 10000000 29 | extractor = Neo4jEsLastUpdatedExtractor() 30 | extractor.init(Scoped.get_scoped_conf(conf=self.conf, 31 | scope=extractor.get_scope())) 32 | 33 | result = extractor.extract() 34 | self.assertEqual(result.timestamp, 10000000) 35 | -------------------------------------------------------------------------------- /tests/unit/publisher/test_publisher.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | 6 | from mock import MagicMock 7 | from pyhocon import ConfigTree 8 | 9 | from databuilder.publisher.base_publisher import Publisher, NoopPublisher 10 | 11 | 12 | class TestPublisher(unittest.TestCase): 13 | 14 | def testCallback(self) -> None: 15 | publisher = NoopPublisher() 16 | callback = MagicMock() 17 | publisher.register_call_back(callback) 18 | publisher.publish() 19 | 20 | self.assertTrue(callback.on_success.called) 21 | 22 | def testFailureCallback(self) -> None: 23 | publisher = FailedPublisher() 24 | callback = MagicMock() 25 | publisher.register_call_back(callback) 26 | 27 | try: 28 | publisher.publish() 29 | except Exception: 30 | pass 31 | 32 | self.assertTrue(callback.on_failure.called) 33 | 34 | 35 | class FailedPublisher(Publisher): 36 | def __init__(self) -> None: 37 | super(FailedPublisher, self).__init__() 38 | 39 | def init(self, conf: ConfigTree) -> None: 40 | pass 41 | 42 | def publish_impl(self) -> None: 43 | raise Exception('Bomb') 44 | 45 | 46 | if __name__ == '__main__': 47 | unittest.main() 48 | -------------------------------------------------------------------------------- /tests/unit/extractor/test_csv_extractor.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | 6 | from pyhocon import ConfigFactory 7 | 8 | from databuilder import Scoped 9 | from databuilder.extractor.csv_extractor import CsvExtractor 10 | 11 | 12 | class TestCsvExtractor(unittest.TestCase): 13 | 14 | def setUp(self) -> None: 15 | config_dict = { 16 | 'extractor.csv.{}'.format(CsvExtractor.FILE_LOCATION): 'example/sample_data/sample_table.csv', 17 | 'extractor.csv.model_class': 'databuilder.models.table_metadata.TableMetadata', 18 | } 19 | self.conf = ConfigFactory.from_dict(config_dict) 20 | 21 | def test_extraction_with_model_class(self) -> None: 22 | """ 23 | Test Extraction using model class 24 | """ 25 | extractor = CsvExtractor() 26 | extractor.init(Scoped.get_scoped_conf(conf=self.conf, 27 | scope=extractor.get_scope())) 28 | 29 | result = extractor.extract() 30 | self.assertEqual(result.name, 'test_table1') 31 | self.assertEqual(result.description._text, '1st test table') 32 | self.assertEqual(result.database, 'hive') 33 | self.assertEqual(result.cluster, 'gold') 34 | self.assertEqual(result.schema, 'test_schema') 35 | -------------------------------------------------------------------------------- /databuilder/loader/generic_loader.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | 6 | from pyhocon import ConfigTree 7 | from typing import Optional, Any 8 | 9 | from databuilder.loader.base_loader import Loader 10 | 11 | LOGGER = logging.getLogger(__name__) 12 | 13 | CALLBACK_FUNCTION = 'callback_function' 14 | 15 | 16 | def log_call_back(record: Optional[Any]) -> None: 17 | """ 18 | A sample callback function. Implement any function that follows this signature to fit your needs. 19 | :param record: 20 | :return: 21 | """ 22 | LOGGER.info('record: {}'.format(record)) 23 | 24 | 25 | class GenericLoader(Loader): 26 | """ 27 | Loader class that calls back a user-provided function with each record
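Example (an illustrative sketch; my_record is hypothetical):

    from pyhocon import ConfigFactory
    loader = GenericLoader()
    loader.init(conf=ConfigFactory.from_dict({'callback_function': log_call_back}))
    loader.load(my_record)  # invokes log_call_back(my_record); None records are skipped
    loader.close()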
28 | """ 29 | 30 | def init(self, conf: ConfigTree) -> None: 31 | """ 32 | Initialize the callback function from conf 33 | :param conf: 34 | """ 35 | self.conf = conf 36 | self._callback_func = self.conf.get(CALLBACK_FUNCTION, log_call_back) 37 | 38 | def load(self, record: Optional[Any]) -> None: 39 | """ 40 | Pass the record to the callback function 41 | :param record: 42 | :return: 43 | """ 44 | if not record: 45 | return 46 | 47 | self._callback_func(record) 48 | 49 | def close(self) -> None: 50 | pass 51 | 52 | def get_scope(self) -> str: 53 | return "loader.generic" 54 | -------------------------------------------------------------------------------- /tests/unit/models/test_neo4j_es_last_updated.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | from databuilder.models.neo4j_es_last_updated import Neo4jESLastUpdated 6 | 7 | from databuilder.models.graph_serializable import NODE_KEY, \ 8 | NODE_LABEL 9 | from databuilder.serializers import neo4_serializer 10 | 11 | 12 | class TestNeo4jESLastUpdated(unittest.TestCase): 13 | 14 | def setUp(self) -> None: 15 | super(TestNeo4jESLastUpdated, self).setUp() 16 | self.neo4j_es_last_updated = Neo4jESLastUpdated(timestamp=100) 17 | 18 | self.expected_node_result = { 19 | NODE_KEY: 'amundsen_updated_timestamp', 20 | NODE_LABEL: 'Updatedtimestamp', 21 | 'latest_timestmap:UNQUOTED': 100, 22 | } 23 | 24 | def test_create_nodes(self) -> None: 25 | nodes = self.neo4j_es_last_updated.create_nodes() 26 | self.assertEqual(len(nodes), 1) 27 | serialized_node = neo4_serializer.serialize_node(nodes[0]) 28 | self.assertEqual(serialized_node, self.expected_node_result) 29 | 30 | def test_create_next_node(self) -> None: 31 | next_node = self.neo4j_es_last_updated.create_next_node() 32 | self.assertEqual(neo4_serializer.serialize_node(next_node), self.expected_node_result) 33 | 34 | def test_create_next_relation(self) -> None: 35 | self.assertIs(self.neo4j_es_last_updated.create_next_relation(), None) -------------------------------------------------------------------------------- /tests/unit/transformer/test_dict_to_model_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | 6 | from pyhocon import ConfigFactory 7 | 8 | from databuilder.transformer.dict_to_model import DictToModel, MODEL_CLASS 9 | from databuilder.models.dashboard.dashboard_execution import DashboardExecution 10 | 11 | 12 | class TestDictToModel(unittest.TestCase): 13 | 14 | def test_conversion(self) -> None: 15 | 16 | transformer = DictToModel() 17 | config = ConfigFactory.from_dict({ 18 | MODEL_CLASS: 'databuilder.models.dashboard.dashboard_execution.DashboardExecution', 19 | }) 20 | transformer.init(conf=config) 21 | 22 | actual = transformer.transform( 23 | { 24 | 'dashboard_group_id': 'foo', 25 | 'dashboard_id': 'bar', 26 | 'execution_timestamp': 123456789, 27 | 'execution_state': 'succeed', 28 | 'product': 'mode', 29 | 'cluster': 'gold' 30 | } 31 | ) 32 | 33 | self.assertTrue(isinstance(actual, DashboardExecution)) 34 | self.assertEqual(actual.__repr__(), DashboardExecution( 35 | dashboard_group_id='foo', 36 | dashboard_id='bar', 37 | execution_timestamp=123456789, 38 | execution_state='succeed', 39 | product='mode', 40 | cluster='gold' 41 | ).__repr__()) 42 | -------------------------------------------------------------------------------- /databuilder/rest_api/base_rest_api_query.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import abc 5 | import logging 6 | 7 | from typing import Iterable, Any, Dict, Iterator 8 | 9 | LOGGER = logging.getLogger(__name__) 10 | 11 | 12 | class BaseRestApiQuery(object, metaclass=abc.ABCMeta): 13 | 14 | @abc.abstractmethod 15 | def execute(self) -> Iterator[Dict[str, Any]]: 16 | """ 17 | Provides an iterator of records, so that results can be streamed. 18 | :return: 19 | """ 20 | 21 | return iter([dict()]) 22 | 23 | 24 | class RestApiQuerySeed(BaseRestApiQuery): 25 | """ 26 | A seed RestApiQuery.
27 | 28 | RestApiQuery is using decorator pattern where it needs to have a seed to begin with. RestApiQuerySeed is for 29 | RestApiQuery to start with. 30 | 31 | Example: see ModeDashboardExtractor._build_restapi_query 32 | """ 33 | 34 | def __init__(self, 35 | seed_record: Iterable[Dict[str, Any]] 36 | ) -> None: 37 | self._seed_record = seed_record 38 | 39 | def execute(self) -> Iterator[Dict[str, Any]]: 40 | return iter(self._seed_record) 41 | 42 | 43 | class EmptyRestApiQuerySeed(RestApiQuerySeed): 44 | """ 45 | Sometimes there simply isn't a record to seed with. 46 | """ 47 | 48 | def __init__(self) -> None: 49 | super(EmptyRestApiQuerySeed, self).__init__([{'empty_rest_api_query_seed': 1}]) 50 | -------------------------------------------------------------------------------- /databuilder/models/user_elasticsearch_document.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from databuilder.models.elasticsearch_document import ElasticsearchDocument 5 | 6 | 7 | class UserESDocument(ElasticsearchDocument): 8 | """ 9 | Schema for the Search index document for user 10 | """ 11 | 12 | def __init__(self, 13 | email: str, 14 | first_name: str, 15 | last_name: str, 16 | full_name: str, 17 | github_username: str, 18 | team_name: str, 19 | employee_type: str, 20 | manager_email: str, 21 | slack_id: str, 22 | role_name: str, 23 | is_active: bool, 24 | total_read: int, 25 | total_own: int, 26 | total_follow: int, 27 | ) -> None: 28 | self.email = email 29 | self.first_name = first_name 30 | self.last_name = last_name 31 | self.full_name = full_name 32 | self.github_username = github_username 33 | self.team_name = team_name 34 | self.employee_type = employee_type 35 | self.manager_email = manager_email 36 | self.slack_id = slack_id 37 | self.role_name = role_name 38 | self.is_active = is_active 39 | self.total_read = total_read 40 | self.total_own = total_own 41 | self.total_follow = total_follow 42 | -------------------------------------------------------------------------------- /tests/unit/transformer/test_remove_field_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | 6 | from pyhocon import ConfigFactory 7 | 8 | from databuilder.transformer.remove_field_transformer import RemoveFieldTransformer, FIELD_NAMES 9 | 10 | 11 | class TestRemoveFieldTransformer(unittest.TestCase): 12 | 13 | def test_conversion(self) -> None: 14 | 15 | transformer = RemoveFieldTransformer() 16 | config = ConfigFactory.from_dict({ 17 | FIELD_NAMES: ['foo', 'bar'], 18 | }) 19 | transformer.init(conf=config) 20 | 21 | actual = transformer.transform({ 22 | 'foo': 'foo_val', 23 | 'bar': 'bar_val', 24 | 'baz': 'baz_val', 25 | }) 26 | expected = { 27 | 'baz': 'baz_val' 28 | } 29 | self.assertDictEqual(expected, actual) 30 | 31 | def test_conversion_missing_field(self) -> None: 32 | 33 | transformer = RemoveFieldTransformer() 34 | config = ConfigFactory.from_dict({ 35 | FIELD_NAMES: ['foo', 'bar'], 36 | }) 37 | transformer.init(conf=config) 38 | 39 | actual = transformer.transform({ 40 | 'foo': 'foo_val', 41 | 'baz': 'baz_val', 42 | 'john': 'doe', 43 | }) 44 | expected = { 45 | 'baz': 'baz_val', 46 | 'john': 'doe' 47 | } 48 | self.assertDictEqual(expected, actual) 49 | 50 | 51 | if __name__ == '__main__': 52 | unittest.main() 53 | -------------------------------------------------------------------------------- /tests/unit/models/test_metric_elasticsearch_document.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import json 5 | import unittest 6 | 7 | from databuilder.models.metric_elasticsearch_document import MetricESDocument 8 | 9 | 10 | class TestMetricElasticsearchDocument(unittest.TestCase): 11 | 12 | def test_to_json(self) -> None: 13 | """ 14 | Test string generated from to_json method 15 | """ 16 | 17 | test_obj = MetricESDocument(name='test_metric_name', 18 | description='test_metric_description', 19 | type='test_metric_type', 20 | dashboards=['test_dashboard_1', 'test_dashboard_2'], 21 | tags=['test_metric_group']) 22 | 23 | expected_document_dict = {"name": "test_metric_name", 24 | "description": "test_metric_description", 25 | "type": "test_metric_type", 26 | "dashboards": ['test_dashboard_1', 'test_dashboard_2'], 27 | "tags": ['test_metric_group'] 28 | } 29 | 30 | result = test_obj.to_json() 31 | results = result.split("\n") 32 | 33 | # verify two new line characters in result 34 | self.assertEqual(len(results), 2, "Result from to_json() function doesn't have a newline!") 35 | self.assertDictEqual(json.loads(results[0]), expected_document_dict) 36 | -------------------------------------------------------------------------------- /tests/unit/extractor/user/bamboohr/test_bamboohr_user_extractor.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import io 5 | import unittest 6 | 7 | import os 8 | 9 | import responses 10 | from pyhocon import ConfigFactory 11 | 12 | from databuilder.models.user import User 13 | from databuilder.extractor.user.bamboohr.bamboohr_user_extractor import BamboohrUserExtractor 14 | 15 | 16 | class TestBamboohrUserExtractor(unittest.TestCase): 17 | @responses.activate 18 | def test_parse_testdata(self) -> None: 19 | bhr = BamboohrUserExtractor() 20 | bhr.init(ConfigFactory.from_dict({'api_key': 'api_key', 'subdomain': 'amundsen'})) 21 | 22 | testdata_xml = os.path.join( 23 | os.path.dirname(os.path.realpath(__file__)), 24 | '../../../resources/extractor/user/bamboohr/testdata.xml' 25 | ) 26 | 27 | with io.open(testdata_xml) as testdata: 28 | responses.add(responses.GET, bhr._employee_directory_uri(), body=testdata.read()) 29 | 30 | expected = User( 31 | email='roald@amundsen.io', 32 | first_name='Roald', 33 | last_name='Amundsen', 34 | name='Roald Amundsen', 35 | team_name='508 Corporate Marketing', 36 | role_name='Antarctic Explorer', 37 | ) 38 | 39 | actual_users = list(bhr._get_extract_iter()) 40 | 41 | self.assertEqual(1, len(actual_users)) 42 | self.assertEqual(repr(expected), repr(actual_users[0])) 43 | 44 | 45 | if __name__ == '__main__': 46 | unittest.main() 47 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # The modular source code checker: pep8, pyflakes and co 2 | # License: MIT 3 | # Upstream url: http://bitbucket.org/tarek/flake8 4 | flake8==3.5.0 5 | 6 | # A flake8 plugin that helps you write tidier imports. 7 | # License: ISCL 8 | # Upstream url: https://pypi.python.org/pypi/flake8-tidy-imports 9 | flake8-tidy-imports>=1.1.0,<2.0 10 | 11 | # A mature full-featured Python testing tool. 12 | # License: MIT 13 | # Upstream url: http://pytest.org/ 14 | pytest>=3.6.0,<4.0 15 | 16 | # Pytest plugin for measuring coverage. 17 | # License: MIT 18 | # Upstream url: https://github.com/pytest-dev/pytest-cov 19 | pytest-cov>=2.5.1,<=2.9 20 | 21 | # Rolling backport of unittest.mock for all Pythons 22 | # License: BSD 23 | # Upstream url: https://mock.readthedocs.io/en/latest/ 24 | mock>=2.0.0,<3.0 25 | 26 | # Thin-wrapper around the mock package for easier use with py.test. 27 | # License: MIT 28 | # Upstream url: https://pypi.python.org/pypi/pytest-mock 29 | pytest-mock>=1.1,<2.0 30 | 31 | # Python client for ElasticSearch 32 | # License: Apache Software License 33 | # Upstream url: https://pypi.org/project/elasticsearch/ 34 | elasticsearch>=6.2.0,<7.0 35 | 36 | atomicwrites==1.1.5 37 | more-itertools==4.2.0 38 | pluggy>=0.6.0 39 | py==1.5.3 40 | pyhocon==0.3.42 41 | pyparsing==2.2.0 42 | six>=1.11.0,<2.0.0 43 | sqlalchemy>=1.3.0,<2.0 44 | wheel==0.31.1 45 | neo4j-driver==1.7.2 46 | neotime==1.7.1 47 | mypy==0.782 48 | pytz==2018.4 49 | statsd==3.2.1 50 | retrying==1.3.3 51 | unicodecsv==0.14.1,<1.0 52 | 53 | httplib2>=0.18.0 54 | unidecode 55 | Jinja2>=2.10.0,<2.12 56 | pandas>=0.21.0,<1.2.0 57 | 58 | requests==2.23.0,<3.0 59 | responses==0.10.6 60 | -------------------------------------------------------------------------------- /tests/unit/transformer/test_timestamp_string_to_epoch_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | 6 | from pyhocon import ConfigFactory 7 | 8 | from databuilder.transformer.timestamp_string_to_epoch import TimestampStringToEpoch, FIELD_NAME, TIMESTAMP_FORMAT 9 | 10 | 11 | class TestTimestampStrToEpoch(unittest.TestCase): 12 | 13 | def test_conversion(self) -> None: 14 | 15 | transformer = TimestampStringToEpoch() 16 | config = ConfigFactory.from_dict({ 17 | FIELD_NAME: 'foo', 18 | }) 19 | transformer.init(conf=config) 20 | 21 | actual = transformer.transform({'foo': '2020-02-19T19:52:33.1Z'}) 22 | self.assertDictEqual({'foo': 1582141953}, actual) 23 | 24 | def test_conversion_with_format(self) -> None: 25 | 26 | transformer = TimestampStringToEpoch() 27 | config = ConfigFactory.from_dict({ 28 | FIELD_NAME: 'foo', 29 | TIMESTAMP_FORMAT: '%Y-%m-%dT%H:%M:%SZ' 30 | }) 31 | transformer.init(conf=config) 32 | 33 | actual = transformer.transform({'foo': '2020-02-19T19:52:33Z'}) 34 | self.assertDictEqual({'foo': 1582141953}, actual) 35 | 36 | def test_invalid_timestamp(self) -> None: 37 | transformer = TimestampStringToEpoch() 38 | config = ConfigFactory.from_dict({ 39 | FIELD_NAME: 'foo', 40 | }) 41 | transformer.init(conf=config) 42 | actual = transformer.transform({'foo': '165de33266d4'}) 43 | self.assertEqual(actual['foo'], 0) 44 | 45 | 46 | if __name__ == '__main__': 47 | unittest.main() 48 | -------------------------------------------------------------------------------- /databuilder/transformer/timestamp_string_to_epoch.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | from datetime import datetime 6 | 7 | from pyhocon import ConfigFactory 8 | from pyhocon import ConfigTree 9 | from typing import Any, Dict 10 | 11 | from databuilder.transformer.base_transformer import Transformer 12 | 13 | TIMESTAMP_FORMAT = 'timestamp_format' 14 | FIELD_NAME = 'field_name' 15 | 16 | LOGGER = logging.getLogger(__name__) 17 | 18 | DEFAULT_CONFIG = ConfigFactory.from_dict({TIMESTAMP_FORMAT: '%Y-%m-%dT%H:%M:%S.%fZ'}) 19 | 20 | 21 | class TimestampStringToEpoch(Transformer): 22 | """ 23 | Transforms string timestamp into epoch 24 | """ 25 | 26 | def init(self, conf: ConfigTree) -> None: 27 | self._conf = conf.with_fallback(DEFAULT_CONFIG) 28 | self._timestamp_format = self._conf.get_string(TIMESTAMP_FORMAT) 29 | self._field_name = self._conf.get_string(FIELD_NAME) 30 | 31 | def transform(self, record: Dict[str, Any]) -> Dict[str, Any]: 32 | timestamp_str = record.get(self._field_name, '') 33 | 34 | if not timestamp_str: 35 | return record 36 | 37 | try: 38 | utc_dt = datetime.strptime(timestamp_str, self._timestamp_format) 39 | except ValueError: 40 | # if the timestamp_str doesn't match format, no conversion, return initial result 41 | record[self._field_name] = 0 42 | return record 43 | 44 | record[self._field_name] = int((utc_dt - datetime(1970, 1, 1)).total_seconds()) 45 | return record 46 | 47 | def get_scope(self) -> str: 48 | return 'transformer.timestamp_str_to_epoch' 49 | -------------------------------------------------------------------------------- /databuilder/extractor/generic_extractor.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import importlib 5 | from typing import Iterable, Any 6 | 7 | from pyhocon import ConfigTree 8 | 9 | from databuilder.extractor.base_extractor import Extractor 10 | 11 | 12 | class GenericExtractor(Extractor): 13 | """ 14 | Extractor that emits arbitrary user-provided values.
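Example (an illustrative sketch, mirroring the unit test for this extractor further below):

    from pyhocon import ConfigFactory
    extractor = GenericExtractor()
    extractor.init(conf=ConfigFactory.from_dict({
        'extraction_items': [{'foo': 1}, {'bar': 2}],
    }))
    extractor.extract()  # -> {'foo': 1}
    extractor.extract()  # -> {'bar': 2}
    extractor.extract()  # -> None once exhausted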
15 | """ 16 | EXTRACTION_ITEMS = 'extraction_items' 17 | 18 | def init(self, conf: ConfigTree) -> None: 19 | """ 20 | Receives a list of dictionaries which is used for extraction 21 | :param conf: 22 | :return: 23 | """ 24 | self.conf = conf 25 | self.values: Iterable[Any] = conf.get(GenericExtractor.EXTRACTION_ITEMS) 26 | 27 | model_class = conf.get('model_class', None) 28 | if model_class: 29 | module_name, class_name = model_class.rsplit(".", 1) 30 | mod = importlib.import_module(module_name) 31 | self.model_class = getattr(mod, class_name) 32 | results = [self.model_class(**result) 33 | for result in self.values] 34 | 35 | self._iter = iter(results) 36 | else: 37 | self._iter = iter(self.values) 38 | 39 | def extract(self) -> Any: 40 | """ 41 | Fetch one result, converting it to {model_class} if specified, before 42 | returning. 43 | :return: 44 | """ 45 | try: 46 | result = next(self._iter) 47 | return result 48 | except StopIteration: 49 | return None 50 | 51 | def get_scope(self) -> str: 52 | return 'extractor.generic' 53 | -------------------------------------------------------------------------------- /databuilder/transformer/regex_str_replace_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | from pyhocon import ConfigTree 6 | from typing import Any 7 | 8 | from databuilder.transformer.base_transformer import Transformer 9 | 10 | 11 | LOGGER = logging.getLogger(__name__) 12 | 13 | 14 | # Config keys 15 | REGEX_REPLACE_TUPLE_LIST = 'regex_replace_tuple_list' 16 | ATTRIBUTE_NAME = 'attribute_name' 17 | 18 | 19 | class RegexStrReplaceTransformer(Transformer): 20 | """ 21 | Generic string replacement transformer. Despite the REGEX in the name, search 22 | strings are matched literally via str.replace, not as regular expressions. 23 | User can pass a list of (search string, replacement) tuples. 24 | Any non-string values will be ignored. 25 | """ 26 | 27 | def init(self, conf: ConfigTree) -> None: 28 | self._regex_replace_tuples = conf.get_list(REGEX_REPLACE_TUPLE_LIST) 29 | self._attribute_name = conf.get_string(ATTRIBUTE_NAME) 30 | 31 | def transform(self, record: Any) -> Any: 32 | 33 | if isinstance(record, dict): 34 | val = record.get(self._attribute_name) 35 | else: 36 | val = getattr(record, self._attribute_name) 37 | 38 | if val is None or not isinstance(val, str): 39 | return record 40 | 41 | for regex_replace_tuple in self._regex_replace_tuples: 42 | val = val.replace(regex_replace_tuple[0], regex_replace_tuple[1]) 43 | 44 | if isinstance(record, dict): 45 | record[self._attribute_name] = val 46 | else: 47 | setattr(record, self._attribute_name, val) 48 | 49 | return record 50 | 51 | def get_scope(self) -> str: 52 | return 'transformer.regex_str_replace' 53 | -------------------------------------------------------------------------------- /tests/unit/resources/extractor/user/bamboohr/testdata.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <directory> <!-- field id attributes assumed from the standard BambooHR employee directory schema --> 3 | <fieldset> 4 | <field id="displayName">Display name</field> 5 | <field id="firstName">First name</field> 6 | <field id="lastName">Last name</field> 7 | <field id="preferredName">Preferred name</field> 8 | <field id="gender">Gender</field> 9 | <field id="jobTitle">Job title</field> 10 | <field id="workPhone">Work Phone</field> 11 | <field id="mobilePhone">Mobile Phone</field> 12 | <field id="workEmail">Work Email</field> 13 | <field id="department">Department</field> 14 | <field id="location">Location</field> 15 | <field id="workPhoneExtension">Work Ext.</field> 16 | <field id="photoUploaded">Employee photo</field> 17 | <field id="photoUrl">Photo URL</field> 18 | <field id="canUploadPhoto">Can Upload Photo</field> 19 | </fieldset> 20 | <employees> 21 | <employee> 22 | <field id="displayName">Roald Amundsen</field> 23 | <field id="firstName">Roald</field> 24 | <field id="lastName">Amundsen</field> 25 | <field id="preferredName"></field> 26 | <field id="gender">Male</field> 27 | <field id="jobTitle">Antarctic Explorer</field> 28 | <field id="workPhone"></field> 29 | <field id="mobilePhone"></field> 30 | <field id="workEmail">roald@amundsen.io</field> 31 | <field id="department">508 Corporate Marketing</field> 32 | <field id="location">Norway</field> 33 | <field id="workPhoneExtension"></field> 34 | <field id="photoUploaded">true</field> 35 | <field id="photoUrl">https://upload.wikimedia.org/wikipedia/commons/thumb/6/6f/Amundsen_in_fur_skins.jpg/440px-Amundsen_in_fur_skins.jpg</field> 36 | <field id="canUploadPhoto">no</field> 37 | </employee> 38 | </employees> 39 | </directory>
40 | -------------------------------------------------------------------------------- /databuilder/extractor/neo4j_es_last_updated_extractor.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import importlib 5 | import time 6 | from typing import Any 7 | 8 | from pyhocon import ConfigTree 9 | 10 | from databuilder.extractor.generic_extractor import GenericExtractor 11 | 12 | 13 | class Neo4jEsLastUpdatedExtractor(GenericExtractor): 14 | """ 15 | Extractor to extract last updated timestamp for neo4j and Es 16 | """ 17 | 18 | def init(self, conf: ConfigTree) -> None: 19 | """ 20 | Receives a list of dictionaries which is used for extraction 21 | :param conf: 22 | :return: 23 | """ 24 | self.conf = conf 25 | 26 | model_class = conf.get('model_class', None) 27 | if model_class: 28 | module_name, class_name = model_class.rsplit(".", 1) 29 | mod = importlib.import_module(module_name) 30 | self.model_class = getattr(mod, class_name) 31 | last_updated_timestamp = int(time.time()) 32 | result = {'timestamp': last_updated_timestamp} 33 | results = [self.model_class(**result)] 34 | self._iter = iter(results) 35 | else: 36 | raise RuntimeError('model class needs to be provided!') 37 | 38 | def extract(self) -> Any: 39 | """ 40 | Fetch one sql result row, convert to {model_class} if specified before 41 | returning. 42 | :return: 43 | """ 44 | try: 45 | result = next(self._iter) 46 | return result 47 | except StopIteration: 48 | return None 49 | 50 | def get_scope(self) -> str: 51 | return 'extractor.neo4j_es_last_updated' 52 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 16 | 17 | ### Summary of Changes 18 | 19 | _Include a summary of changes then remove this line_ 20 | 21 | ### Tests 22 | 23 | _What tests did you add or modify and why? If no tests were added or modified, explain why. Remove this line_ 24 | 25 | ### Documentation 26 | 27 | _What documentation did you add or modify and why? Add any relevant links then remove this line_ 28 | 29 | ### CheckList 30 | 31 | Make sure you have checked **all** steps below to ensure a timely review. 32 | 33 | - [ ] PR title addresses the issue accurately and concisely. Example: "Updates the version of Flask to v1.0.2" 34 | - In case you are adding a dependency, check if the license complies with the [ASF 3rd Party License Policy](https://www.apache.org/legal/resolved.html#category-x). 35 | - [ ] PR includes a summary of changes. 36 | - [ ] PR adds unit tests, updates existing unit tests, **OR** documents why no test additions or modifications are needed. 37 | - [ ] In case of new functionality, my PR adds documentation that describes how to use it. 38 | - All the public functions and the classes in the PR contain docstrings that explain what it does 39 | - [ ] PR passes `make test` 40 | -------------------------------------------------------------------------------- /databuilder/transformer/bigquery_usage_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from pyhocon import ConfigTree 5 | from typing import Optional, Tuple 6 | 7 | from databuilder.transformer.base_transformer import Transformer 8 | from databuilder.models.table_column_usage import ColumnReader, TableColumnUsage 9 | from databuilder.extractor.bigquery_usage_extractor import TableColumnUsageTuple 10 | 11 | 12 | class BigqueryUsageTransformer(Transformer): 13 | 14 | def init(self, conf: ConfigTree) -> None: 15 | """ 16 | Transformer to convert TableColumnUsageTuple data to bigquery usage data 17 | which can be uploaded to Neo4j 18 | """ 19 | self.conf = conf 20 | 21 | def transform(self, record: Tuple[TableColumnUsageTuple, int]) -> Optional[TableColumnUsage]: 22 | if not record: 23 | return None 24 | 25 | (key, count) = record 26 | 27 | if not isinstance(key, TableColumnUsageTuple): 28 | raise Exception("BigqueryUsageTransformer expects record of type TableColumnUsageTuple") 29 | 30 | col_readers = [] 31 | col_readers.append(ColumnReader(database=key.database, 32 | cluster=key.cluster, 33 | schema=key.schema, 34 | table=key.table, 35 | column=key.column, 36 | user_email=key.email, 37 | read_count=count)) 38 | 39 | return TableColumnUsage(col_readers=col_readers) 40 | 41 | def get_scope(self) -> str: 42 | return 'transformer.bigquery_usage' 43 | -------------------------------------------------------------------------------- /tests/unit/models/dashboard/test_dashboard_owner.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | 6 | from databuilder.models.dashboard.dashboard_owner import DashboardOwner 7 | from databuilder.models.graph_serializable import RELATION_START_KEY, RELATION_START_LABEL, RELATION_END_KEY, \ 8 | RELATION_END_LABEL, RELATION_TYPE, RELATION_REVERSE_TYPE 9 | from databuilder.serializers import neo4_serializer 10 | 11 | 12 | class TestDashboardOwner(unittest.TestCase): 13 | 14 | def test_dashboard_owner_nodes(self) -> None: 15 | dashboard_owner = DashboardOwner(email='foo@bar.com', cluster='cluster_id', product='product_id', 16 | dashboard_id='dashboard_id', dashboard_group_id='dashboard_group_id') 17 | 18 | actual = dashboard_owner.create_next_node() 19 | self.assertIsNone(actual) 20 | 21 | def test_dashboard_owner_relations(self) -> None: 22 | dashboard_owner = DashboardOwner(email='foo@bar.com', cluster='cluster_id', product='product_id', 23 | dashboard_id='dashboard_id', dashboard_group_id='dashboard_group_id') 24 | 25 | actual = dashboard_owner.create_next_relation() 26 | actual_serialized = neo4_serializer.serialize_relationship(actual) 27 | expected = {RELATION_END_KEY: 'foo@bar.com', RELATION_START_LABEL: 'Dashboard', RELATION_END_LABEL: 'User', 28 | RELATION_START_KEY: 'product_id_dashboard://cluster_id.dashboard_group_id/dashboard_id', 29 | RELATION_TYPE: 'OWNER', 30 | RELATION_REVERSE_TYPE: 'OWNER_OF'} 31 | assert actual is not None 32 | self.assertDictEqual(actual_serialized, expected) 33 | -------------------------------------------------------------------------------- /tests/unit/filesystem/test_filesystem.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | from datetime import datetime 6 | 7 | from mock import MagicMock 8 | from pyhocon import ConfigFactory 9 | from pytz import UTC 10 | 11 | from databuilder.filesystem.filesystem import FileSystem 12 | from databuilder.filesystem.metadata import FileMetadata 13 | 14 | 15 | class TestFileSystem(unittest.TestCase): 16 | 17 | def test_is_file(self) -> None: 18 | dask_fs = MagicMock() 19 | dask_fs.ls = MagicMock(return_value=['/foo/bar']) 20 | 21 | fs = FileSystem() 22 | conf = ConfigFactory.from_dict({FileSystem.DASK_FILE_SYSTEM: dask_fs}) 23 | fs.init(conf=conf) 24 | 25 | self.assertTrue(fs.is_file('/foo/bar')) 26 | 27 | dask_fs.ls = MagicMock(return_value=['bar', 'baz']) 28 | 29 | fs = FileSystem() 30 | conf = ConfigFactory.from_dict({FileSystem.DASK_FILE_SYSTEM: dask_fs}) 31 | fs.init(conf=conf) 32 | 33 | self.assertFalse(fs.is_file('foo')) 34 | 35 | def test_info(self) -> None: 36 | dask_fs = MagicMock() 37 | dask_fs.info = MagicMock(return_value={'LastModified': datetime(2018, 8, 14, 4, 12, 3, tzinfo=UTC), 38 | 'Size': 15093}) 39 | fs = FileSystem() 40 | conf = ConfigFactory.from_dict({FileSystem.DASK_FILE_SYSTEM: dask_fs}) 41 | fs.init(conf=conf) 42 | metadata = fs.info('/foo/bar') 43 | 44 | expected = FileMetadata(path='/foo/bar', last_updated=datetime(2018, 8, 14, 4, 12, 3, tzinfo=UTC), size=15093) 45 | 46 | self.assertEqual(metadata.__repr__(), expected.__repr__()) 47 | 48 | 49 | if __name__ == '__main__': 50 | unittest.main() 51 | -------------------------------------------------------------------------------- /databuilder/loader/file_system_csv_loader.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import csv 5 | import logging 6 | 7 | from pyhocon import ConfigTree 8 | from typing import Any 9 | 10 | from databuilder.loader.base_loader import Loader 11 | 12 | 13 | class FileSystemCSVLoader(Loader): 14 | """ 15 | Loader class to write csv files to Local FileSystem 16 | """ 17 | 18 | def init(self, conf: ConfigTree) -> None: 19 | """ 20 | Initialize file handlers from conf 21 | :param conf: 22 | """ 23 | self.conf = conf 24 | self.file_path = self.conf.get_string('file_path') 25 | self.file_mode = self.conf.get_string('mode', 'w') 26 | 27 | self.file_handler = open(self.file_path, self.file_mode) 28 | 29 | def load(self, record: Any) -> None: 30 | """ 31 | Write record object as csv to file 32 | :param record: 33 | :return: 34 | """ 35 | if not record: 36 | return 37 | 38 | if not hasattr(self, 'writer'): 39 | self.writer = csv.DictWriter(self.file_handler, 40 | fieldnames=vars(record).keys()) 41 | self.writer.writeheader() 42 | 43 | self.writer.writerow(vars(record)) 44 | self.file_handler.flush() 45 | 46 | def close(self) -> None: 47 | """ 48 | Close file handlers 49 | :return: 50 | """ 51 | try: 52 | if self.file_handler: 53 | self.file_handler.close() 54 | except Exception as e: 55 | logging.warning("Failed trying to close a file handler! %s", 56 | str(e)) 57 | 58 | def get_scope(self) -> str: 59 | return "loader.filesystem.csv" 60 | -------------------------------------------------------------------------------- /databuilder/utils/closer.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import atexit 5 | 6 | from typing import Callable, List 7 | 8 | 9 | class Closer(object): 10 | """ 11 | A Closer class responsible for collecting closeable callables and 12 | closing them as a group. Every registered callable is guaranteed to be 13 | called, and only the last failure that occurs is propagated back. 14 | 15 | Registered callables are closed in LIFO order, as closeable 16 | instances can depend on each other.
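Example (an illustrative sketch; extractor and loader are hypothetical closeables):

    closer = Closer()
    closer.register(extractor.close)  # registered first, closed last
    closer.register(loader.close)     # registered last, closed first
    closer.close()                    # closes loader, then extractor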
17 | """ 18 | 19 | def __init__(self) -> None: 20 | self._stack: List[Callable] = [] 21 | atexit.register(self.close) 22 | 23 | def register(self, close_callable: Callable) -> None: 24 | """ 25 | Register a closeable callable. 26 | :param close_callable: 27 | :return: None 28 | """ 29 | if not callable(close_callable): 30 | raise RuntimeError('Only a callable can be registered: {}'.format( 31 | close_callable)) 32 | 33 | self._stack.append(close_callable) 34 | 35 | def close(self) -> None: 36 | """ 37 | Execute all closeable callables in LIFO order. 38 | All registered callables are guaranteed to be executed. If there 39 | are multiple failures, only the last one will be propagated. 40 | :return: 41 | """ 42 | if not self._stack: 43 | return 44 | 45 | last_exception = None 46 | while len(self._stack): 47 | try: 48 | close_callable = self._stack.pop() 49 | close_callable() 50 | except Exception as e: 51 | last_exception = e 52 | 53 | if last_exception: 54 | raise last_exception 55 | -------------------------------------------------------------------------------- /databuilder/models/dashboard_elasticsearch_document.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from typing import List, Optional, Union 5 | 6 | from databuilder.models.elasticsearch_document import ElasticsearchDocument 7 | 8 | 9 | class DashboardESDocument(ElasticsearchDocument): 10 | """ 11 | Schema for the dashboard ES document 12 | """ 13 | 14 | def __init__(self, 15 | group_name: str, 16 | name: str, 17 | description: Union[str, None], 18 | total_usage: int, 19 | product: Optional[str] = '', 20 | cluster: Optional[str] = '', 21 | group_description: Optional[str] = None, 22 | query_names: Union[List[str], None] = None, 23 | chart_names: Optional[List[str]] = None, 24 | group_url: Optional[str] = None, 25 | url: Optional[str] = None, 26 | uri: Optional[str] = None, 27 | last_successful_run_timestamp: Optional[int] = None, 28 | tags: Optional[List[str]] = None, 29 | badges: Optional[List[str]] = None, 30 | ) -> None: 31 | self.group_name = group_name 32 | self.name = name 33 | self.description = description 34 | self.cluster = cluster 35 | self.product = product 36 | self.group_url = group_url 37 | self.url = url 38 | self.uri = uri 39 | self.last_successful_run_timestamp = last_successful_run_timestamp 40 | self.total_usage = total_usage 41 | self.group_description = group_description 42 | self.query_names = query_names 43 | self.chart_names = chart_names 44 | self.tags = tags 45 | self.badges = badges 46 | -------------------------------------------------------------------------------- /tests/unit/extractor/test_generic_extractor.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | 6 | from pyhocon import ConfigFactory 7 | 8 | from databuilder import Scoped 9 | from databuilder.extractor.generic_extractor import GenericExtractor 10 | 11 | 12 | class TestGenericExtractor(unittest.TestCase): 13 | 14 | def test_extraction_with_model_class(self) -> None: 15 | """ 16 | Test Extraction using model class 17 | """ 18 | config_dict = { 19 | 'extractor.generic.extraction_items': [{'timestamp': 10000000}], 20 | 'extractor.generic.model_class': 21 | 'databuilder.models.neo4j_es_last_updated.Neo4jESLastUpdated', 22 | } 23 | conf = ConfigFactory.from_dict(config_dict) 24 | 25 | extractor = GenericExtractor() 26 | self.conf = ConfigFactory.from_dict(config_dict) 27 | extractor.init(Scoped.get_scoped_conf(conf=conf, 28 | scope=extractor.get_scope())) 29 | 30 | result = extractor.extract() 31 | self.assertEqual(result.timestamp, 10000000) 32 | 33 | def test_extraction_without_model_class(self) -> None: 34 | """ 35 | Test extraction without a model class 36 | """ 37 | config_dict = { 38 | 'extractor.generic.extraction_items': [{'foo': 1}, {'bar': 2}], 39 | } 40 | conf = ConfigFactory.from_dict(config_dict) 41 | 42 | extractor = GenericExtractor() 43 | self.conf = ConfigFactory.from_dict(config_dict) 44 | extractor.init(Scoped.get_scoped_conf(conf=conf, 45 | scope=extractor.get_scope())) 46 | 47 | self.assertEqual(extractor.extract(), {'foo': 1}) 48 | self.assertEqual(extractor.extract(), {'bar': 2}) 49 | -------------------------------------------------------------------------------- /databuilder/callback/call_back.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import abc 5 | import logging 6 | 7 | from typing import List, Optional 8 | 9 | LOGGER = logging.getLogger(__name__) 10 | 11 | 12 | class Callback(object, metaclass=abc.ABCMeta): 13 | """ 14 | A callback interface that is expected to fire "on_success" if the operation succeeds, else "on_failure" if 15 | the operation fails. 16 | """ 17 | 18 | @abc.abstractmethod 19 | def on_success(self) -> None: 20 | """ 21 | A call back method that will be called when operation is successful 22 | :return: None 23 | """ 24 | pass 25 | 26 | @abc.abstractmethod 27 | def on_failure(self) -> None: 28 | """ 29 | A call back method that will be called when operation failed 30 | :return: None 31 | """ 32 | pass 33 | 34 | 35 | def notify_callbacks(callbacks: List[Callback], is_success: bool) -> None: 36 | """ 37 | A utility method that notifies callbacks. If any callback fails, it still goes through all the callbacks 38 | and raises the last exception it experienced.
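Example (an illustrative sketch; the callback instances are hypothetical):

    notify_callbacks([slack_callback, metrics_callback], is_success=True)
    # calls on_success() on each callback; with is_success=False it calls on_failure()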
39 | 40 | :param callbacks: 41 | :param is_success: 42 | :return: 43 | """ 44 | 45 | if not callbacks: 46 | LOGGER.info('No callbacks to notify') 47 | return 48 | 49 | LOGGER.info('Notifying callbacks') 50 | 51 | last_exception: Optional[Exception] = None 52 | for callback in callbacks: 53 | try: 54 | if is_success: 55 | callback.on_success() 56 | else: 57 | callback.on_failure() 58 | except Exception as e: 59 | LOGGER.exception('Failed while notifying callback') 60 | last_exception = e 61 | 62 | if last_exception: 63 | raise last_exception 64 | -------------------------------------------------------------------------------- /databuilder/extractor/postgres_metadata_extractor.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from pyhocon import ConfigFactory, ConfigTree # noqa: F401 5 | from typing import Iterator, Union, Dict, Any # noqa: F401 6 | 7 | from databuilder.extractor.base_postgres_metadata_extractor import BasePostgresMetadataExtractor 8 | 9 | 10 | class PostgresMetadataExtractor(BasePostgresMetadataExtractor): 11 | """ 12 | Extracts Postgres table and column metadata from underlying meta store database using SQLAlchemyExtractor 13 | """ 14 | 15 | def get_sql_statement(self, use_catalog_as_cluster_name, where_clause_suffix): 16 | # type: (bool, str) -> str 17 | if use_catalog_as_cluster_name: 18 | cluster_source = "c.table_catalog" 19 | else: 20 | cluster_source = "'{}'".format(self._cluster) 21 | 22 | return """ 23 | SELECT 24 | {cluster_source} as cluster, c.table_schema as schema, c.table_name as name, pgtd.description as description 25 | ,c.column_name as col_name, c.data_type as col_type 26 | , pgcd.description as col_description, ordinal_position as col_sort_order 27 | FROM INFORMATION_SCHEMA.COLUMNS c 28 | INNER JOIN 29 | pg_catalog.pg_statio_all_tables as st on c.table_schema=st.schemaname and c.table_name=st.relname 30 | LEFT JOIN 31 | pg_catalog.pg_description pgcd on pgcd.objoid=st.relid and pgcd.objsubid=c.ordinal_position 32 | LEFT JOIN 33 | pg_catalog.pg_description pgtd on pgtd.objoid=st.relid and pgtd.objsubid=0 34 | {where_clause_suffix} 35 | ORDER by cluster, schema, name, col_sort_order ; 36 | """.format( 37 | cluster_source=cluster_source, 38 | where_clause_suffix=where_clause_suffix, 39 | ) 40 | 41 | def get_scope(self): 42 | # type: () -> str 43 | return 'extractor.postgres_metadata' 44 | -------------------------------------------------------------------------------- /tests/unit/transformer/test_chained_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | 6 | from mock import MagicMock 7 | from pyhocon import ConfigFactory 8 | 9 | from databuilder.transformer.base_transformer import ChainedTransformer 10 | 11 | 12 | class TestChainedTransformer(unittest.TestCase): 13 | 14 | def test_init_not_called(self) -> None: 15 | 16 | mock_transformer1 = MagicMock() 17 | mock_transformer2 = MagicMock() 18 | chained_transformer = ChainedTransformer(transformers=[mock_transformer1, mock_transformer2]) 19 | 20 | config = ConfigFactory.from_dict({}) 21 | chained_transformer.init(conf=config) 22 | 23 | chained_transformer.transform( 24 | { 25 | 'foo': 'bar' 26 | } 27 | ) 28 | 29 | mock_transformer1.init.assert_not_called() 30 | mock_transformer1.transform.assert_called_once() 31 | mock_transformer2.init.assert_not_called() 32 | mock_transformer2.transform.assert_called_once() 33 | 34 | def test_init_called(self) -> None: 35 | 36 | mock_transformer1 = MagicMock() 37 | mock_transformer1.get_scope.return_value = 'foo' 38 | mock_transformer2 = MagicMock() 39 | mock_transformer2.get_scope.return_value = 'bar' 40 | chained_transformer = ChainedTransformer(transformers=[mock_transformer1, mock_transformer2], 41 | is_init_transformers=True) 42 | 43 | config = ConfigFactory.from_dict({}) 44 | chained_transformer.init(conf=config) 45 | 46 | chained_transformer.transform( 47 | { 48 | 'foo': 'bar' 49 | } 50 | ) 51 | 52 | mock_transformer1.init.assert_called_once() 53 | mock_transformer1.transform.assert_called_once() 54 | mock_transformer2.init.assert_called_once() 55 | mock_transformer2.transform.assert_called_once() 56 | -------------------------------------------------------------------------------- /databuilder/transformer/base_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import abc 5 | 6 | from pyhocon import ConfigTree 7 | from typing import Any, Iterable, Optional 8 | 9 | from databuilder import Scoped 10 | 11 | 12 | class Transformer(Scoped): 13 | """ 14 | A transformer transforms a record 15 | """ 16 | @abc.abstractmethod 17 | def init(self, conf: ConfigTree) -> None: 18 | pass 19 | 20 | @abc.abstractmethod 21 | def transform(self, record: Any) -> Any: 22 | pass 23 | 24 | 25 | class NoopTransformer(Transformer): 26 | """ 27 | A no-op transformer 28 | """ 29 | 30 | def init(self, conf: ConfigTree) -> None: 31 | pass 32 | 33 | def transform(self, record: Any) -> Any: 34 | return record 35 | 36 | def get_scope(self) -> str: 37 | return '' 38 | 39 | 40 | class ChainedTransformer(Transformer): 41 | """ 42 | A chained transformer that iterates transformers and transforms a record 43 | """ 44 | 45 | def __init__(self, 46 | transformers: Iterable[Transformer], 47 | is_init_transformers: Optional[bool] = False) -> None: 48 | self.transformers = transformers 49 | self.is_init_transformers = is_init_transformers 50 | 51 | def init(self, conf: ConfigTree) -> None: 52 | if self.is_init_transformers: 53 | for transformer in self.transformers: 54 | transformer.init(Scoped.get_scoped_conf(conf, transformer.get_scope())) 55 | 56 | def transform(self, record: Any) -> Any: 57 | for t in self.transformers: 58 | record = t.transform(record) 59 | # Check filtered record 60 | if not record: 61 | return None 62 | 63 | return record 64 | 65 | def get_scope(self) -> str: 66 | return 'transformer.chained' 67 | 68 | def close(self) -> None: 69 | for t in self.transformers: 70 | t.close() 71 | -------------------------------------------------------------------------------- /databuilder/models/neo4j_es_last_updated.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from typing import List, Union 5 | 6 | from databuilder.models.graph_serializable import GraphSerializable 7 | from databuilder.models.graph_relationship import GraphRelationship 8 | from databuilder.models.graph_node import GraphNode 9 | 10 | 11 | class Neo4jESLastUpdated(GraphSerializable): 12 | """ 13 | Data model to keep track of the last updated timestamp for 14 | neo4j and es. 15 | """ 16 | 17 | LABEL = 'Updatedtimestamp' 18 | KEY = 'amundsen_updated_timestamp' 19 | LATEST_TIMESTAMP = 'latest_timestmap' 20 | 21 | def __init__(self, 22 | timestamp: int, 23 | ) -> None: 24 | """ 25 | :param timestamp: epoch for the latest updated timestamp for neo4j and es 26 | """ 27 | self.timestamp = timestamp 28 | self._node_iter = iter(self.create_nodes()) 29 | self._rel_iter = iter(self.create_relation()) 30 | 31 | def create_next_node(self) -> Union[GraphNode, None]: 32 | """ 33 | Will create an orphan node for the last updated timestamp. 34 | """ 35 | try: 36 | return next(self._node_iter) 37 | except StopIteration: 38 | return None 39 | 40 | def create_nodes(self) -> List[GraphNode]: 41 | """ 42 | Create a list of Neo4j node records.
43 | """ 44 | node = GraphNode( 45 | key=Neo4jESLastUpdated.KEY, 46 | label=Neo4jESLastUpdated.LABEL, 47 | attributes={ 48 | Neo4jESLastUpdated.LATEST_TIMESTAMP: self.timestamp 49 | } 50 | ) 51 | return [node] 52 | 53 | def create_next_relation(self) -> Union[GraphRelationship, None]: 54 | try: 55 | return next(self._rel_iter) 56 | except StopIteration: 57 | return None 58 | 59 | def create_relation(self) -> List[GraphRelationship]: 60 | return [] 61 | -------------------------------------------------------------------------------- /tests/unit/extractor/restapi/test_rest_api_extractor.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | 6 | from pyhocon import ConfigFactory 7 | 8 | from databuilder.extractor.restapi.rest_api_extractor import RestAPIExtractor, REST_API_QUERY, MODEL_CLASS, \ 9 | STATIC_RECORD_DICT 10 | from databuilder.models.dashboard.dashboard_metadata import DashboardMetadata 11 | from databuilder.rest_api.base_rest_api_query import RestApiQuerySeed 12 | 13 | 14 | class TestRestAPIExtractor(unittest.TestCase): 15 | 16 | def test_static_data(self) -> None: 17 | 18 | conf = ConfigFactory.from_dict( 19 | { 20 | REST_API_QUERY: RestApiQuerySeed(seed_record=[{'foo': 'bar'}]), 21 | STATIC_RECORD_DICT: {'john': 'doe'} 22 | } 23 | ) 24 | extractor = RestAPIExtractor() 25 | extractor.init(conf=conf) 26 | 27 | record = extractor.extract() 28 | expected = {'foo': 'bar', 'john': 'doe'} 29 | 30 | self.assertDictEqual(expected, record) 31 | 32 | def test_model_construction(self) -> None: 33 | conf = ConfigFactory.from_dict( 34 | { 35 | REST_API_QUERY: RestApiQuerySeed( 36 | seed_record=[{'dashboard_group': 'foo', 37 | 'dashboard_name': 'bar', 38 | 'description': 'john', 39 | 'dashboard_group_description': 'doe'}]), 40 | MODEL_CLASS: 'databuilder.models.dashboard.dashboard_metadata.DashboardMetadata', 41 | } 42 | ) 43 | extractor = RestAPIExtractor() 44 | extractor.init(conf=conf) 45 | 46 | record = extractor.extract() 47 | expected = DashboardMetadata(dashboard_group='foo', dashboard_name='bar', description='john', 48 | dashboard_group_description='doe') 49 | 50 | self.assertEqual(expected.__repr__(), record.__repr__()) 51 | -------------------------------------------------------------------------------- /tests/unit/callback/test_call_back.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | 6 | from mock import MagicMock 7 | from typing import List 8 | 9 | from databuilder.callback.call_back import Callback, notify_callbacks 10 | 11 | 12 | class TestCallBack(unittest.TestCase): 13 | 14 | def test_success_notify(self) -> None: 15 | callback1 = MagicMock() 16 | callback2 = MagicMock() 17 | callbacks: List[Callback] = [callback1, callback2] 18 | 19 | notify_callbacks(callbacks, is_success=True) 20 | 21 | self.assertTrue(callback1.on_success.called) 22 | self.assertTrue(not callback1.on_failure.called) 23 | self.assertTrue(callback2.on_success.called) 24 | self.assertTrue(not callback2.on_failure.called) 25 | 26 | def test_failure_notify(self) -> None: 27 | callback1 = MagicMock() 28 | callback2 = MagicMock() 29 | callbacks: List[Callback] = [callback1, callback2] 30 | 31 | notify_callbacks(callbacks, is_success=False) 32 | 33 | self.assertTrue(not callback1.on_success.called) 34 | self.assertTrue(callback1.on_failure.called) 35 | self.assertTrue(not callback2.on_success.called) 36 | self.assertTrue(callback2.on_failure.called) 37 | 38 | def test_notify_failure(self) -> None: 39 | callback1 = MagicMock() 40 | callback2 = MagicMock() 41 | callback2.on_success.side_effect = Exception('Boom') 42 | callback3 = MagicMock() 43 | callbacks: List[Callback] = [callback1, callback2, callback3] 44 | 45 | with self.assertRaises(Exception): 46 | notify_callbacks(callbacks, is_success=True) 47 | 48 | self.assertTrue(callback1.on_success.called) 49 | self.assertTrue(callback2.on_success.called) 50 | self.assertTrue(callback3.on_success.called) 51 | 52 | 53 | if __name__ == '__main__': 54 | unittest.main() 55 | -------------------------------------------------------------------------------- /databuilder/models/table_elasticsearch_document.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from typing import List, Optional 5 | 6 | from databuilder.models.elasticsearch_document import ElasticsearchDocument 7 | 8 | 9 | class TableESDocument(ElasticsearchDocument): 10 | """ 11 | Schema for the Search index document 12 | """ 13 | 14 | def __init__(self, 15 | database: str, 16 | cluster: str, 17 | schema: str, 18 | name: str, 19 | key: str, 20 | description: str, 21 | last_updated_timestamp: Optional[int], 22 | column_names: List[str], 23 | column_descriptions: List[str], 24 | total_usage: int, 25 | unique_usage: int, 26 | tags: List[str], 27 | badges: Optional[List[str]] = None, 28 | display_name: Optional[str] = None, 29 | schema_description: Optional[str] = None, 30 | programmatic_descriptions: List[str] = [], 31 | ) -> None: 32 | self.database = database 33 | self.cluster = cluster 34 | self.schema = schema 35 | self.name = name 36 | self.display_name = display_name if display_name else '{schema}.{table}'.format(schema=schema, table=name) 37 | self.key = key 38 | self.description = description 39 | # todo: use last_updated_timestamp to match the record in metadata 40 | self.last_updated_timestamp = int(last_updated_timestamp) if last_updated_timestamp else None 41 | self.column_names = column_names 42 | self.column_descriptions = column_descriptions 43 | self.total_usage = total_usage 44 | self.unique_usage = unique_usage 45 | # todo: will include tag_type once we have better understanding from UI flow.
46 | self.tags = tags 47 | self.badges = badges 48 | self.schema_description = schema_description 49 | self.programmatic_descriptions = programmatic_descriptions 50 | -------------------------------------------------------------------------------- /tests/unit/transformer/test_regex_str_replace_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | 6 | from pyhocon import ConfigFactory 7 | from typing import Any 8 | 9 | from databuilder.transformer.regex_str_replace_transformer import RegexStrReplaceTransformer, \ 10 | REGEX_REPLACE_TUPLE_LIST, ATTRIBUTE_NAME 11 | 12 | 13 | class TestRegexReplacement(unittest.TestCase): 14 | 15 | def test(self) -> None: 16 | transformer = self._default_test_transformer() 17 | 18 | foo = Foo('abc') 19 | actual = transformer.transform(foo) 20 | 21 | self.assertEqual('bba', actual.val) 22 | 23 | def test_numeric_val(self) -> None: 24 | transformer = self._default_test_transformer() 25 | 26 | foo = Foo(6) 27 | actual = transformer.transform(foo) 28 | 29 | self.assertEqual(6, actual.val) 30 | 31 | def test_none_val(self) -> None: 32 | transformer = self._default_test_transformer() 33 | 34 | foo = Foo(None) 35 | actual = transformer.transform(foo) 36 | 37 | self.assertEqual(None, actual.val) 38 | 39 | def _default_test_transformer(self) -> RegexStrReplaceTransformer: 40 | config = ConfigFactory.from_dict({ 41 | REGEX_REPLACE_TUPLE_LIST: [('a', 'b'), ('c', 'a')], 42 | ATTRIBUTE_NAME: 'val' 43 | }) 44 | 45 | transformer = RegexStrReplaceTransformer() 46 | transformer.init(config) 47 | 48 | return transformer 49 | 50 | def test_dict_replace(self) -> None: 51 | config = ConfigFactory.from_dict({ 52 | REGEX_REPLACE_TUPLE_LIST: [('\\', '\\\\')], 53 | ATTRIBUTE_NAME: 'val' 54 | }) 55 | 56 | transformer = RegexStrReplaceTransformer() 57 | transformer.init(config) 58 | 59 | d = {'val': '\\'} 60 | 61 | actual = transformer.transform(d) 62 | 63 | self.assertEqual({'val': '\\\\'}, actual) 64 | 65 | 66 | class Foo(object): 67 | def __init__(self, val: Any) -> None: 68 | self.val = val 69 | 70 | 71 | if __name__ == '__main__': 72 | unittest.main() 73 | -------------------------------------------------------------------------------- /tests/unit/models/test_table_column_usage.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | 6 | from databuilder.models.table_column_usage import ColumnReader, TableColumnUsage 7 | from typing import no_type_check 8 | from databuilder.serializers import neo4_serializer 9 | 10 | 11 | class TestTableColumnUsage(unittest.TestCase): 12 | 13 | @no_type_check # mypy is somehow complaining on assignment on expected dict. 
14 | def test_serialize(self) -> None: 15 | 16 | col_readers = [ColumnReader(database='db', cluster='gold', schema='scm', table='foo', column='*', 17 | user_email='john@example.com')] 18 | table_col_usage = TableColumnUsage(col_readers=col_readers) 19 | 20 | node_row = table_col_usage.next_node() 21 | actual = [] 22 | while node_row: 23 | 24 | actual.append(neo4_serializer.serialize_node(node_row)) 25 | node_row = table_col_usage.next_node() 26 | 27 | expected = [{'first_name': '', 28 | 'last_name': '', 29 | 'full_name': '', 30 | 'employee_type': '', 31 | 'is_active:UNQUOTED': True, 32 | 'updated_at:UNQUOTED': 0, 33 | 'LABEL': 'User', 34 | 'slack_id': '', 35 | 'KEY': 'john@example.com', 36 | 'github_username': '', 37 | 'team_name': '', 38 | 'email': 'john@example.com', 39 | 'role_name': ''}] 40 | self.assertEqual(expected, actual) 41 | 42 | rel_row = table_col_usage.next_relation() 43 | actual = [] 44 | while rel_row: 45 | actual.append(neo4_serializer.serialize_relationship(rel_row)) 46 | rel_row = table_col_usage.next_relation() 47 | 48 | expected = [{'read_count:UNQUOTED': 1, 'END_KEY': 'john@example.com', 'START_LABEL': 'Table', 49 | 'END_LABEL': 'User', 'START_KEY': 'db://gold.scm/foo', 'TYPE': 'READ_BY', 'REVERSE_TYPE': 'READ'}] 50 | self.assertEqual(expected, actual) 51 | 52 | 53 | if __name__ == '__main__': 54 | unittest.main() 55 | -------------------------------------------------------------------------------- /databuilder/loader/file_system_elasticsearch_json_loader.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import os 5 | 6 | from pyhocon import ConfigTree 7 | 8 | from databuilder.loader.base_loader import Loader 9 | from databuilder.models.elasticsearch_document import ElasticsearchDocument 10 | 11 | 12 | class FSElasticsearchJSONLoader(Loader): 13 | """ 14 | Loader class to produce Elasticsearch bulk load file to Local FileSystem 15 | """ 16 | FILE_PATH_CONFIG_KEY = 'file_path' 17 | FILE_MODE_CONFIG_KEY = 'mode' 18 | 19 | def init(self, conf: ConfigTree) -> None: 20 | """ 21 | 22 | :param conf: 23 | :return: 24 | """ 25 | self.conf = conf 26 | self.file_path = self.conf.get_string(FSElasticsearchJSONLoader.FILE_PATH_CONFIG_KEY) 27 | self.file_mode = self.conf.get_string(FSElasticsearchJSONLoader.FILE_MODE_CONFIG_KEY, 'w') 28 | 29 | file_dir = self.file_path.rsplit('/', 1)[0] 30 | self._ensure_directory_exists(file_dir) 31 | self.file_handler = open(self.file_path, self.file_mode) 32 | 33 | def _ensure_directory_exists(self, path: str) -> None: 34 | """ 35 | Check to ensure file directory exists; create the directories otherwise 36 | :param path: 37 | :return: None 38 | """ 39 | if os.path.exists(path): 40 | return # nothing to do here 41 | 42 | os.makedirs(path) 43 | 44 | def load(self, record: ElasticsearchDocument) -> None: 45 | """ 46 | Write a record in json format to file 47 | :param record: 48 | :return: 49 | """ 50 | if not record: 51 | return 52 | 53 | if not isinstance(record, ElasticsearchDocument): 54 | raise Exception("Record not of type 'ElasticsearchDocument'!") 55 | 56 | self.file_handler.write(record.to_json()) 57 | self.file_handler.flush() 58 | 59 | def close(self) -> None: 60 | """ 61 | close the file handler 62 | :return: 63 | """ 64 | if self.file_handler: 65 | self.file_handler.close() 66 | 67 | def get_scope(self) -> str: 68 | return 'loader.filesystem.elasticsearch' 69 | 
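For reference, a minimal usage sketch of FSElasticsearchJSONLoader wired by hand (not part of the repo; the output path and document values below are hypothetical, and TableESDocument is the document model shown earlier in this dump):

from pyhocon import ConfigFactory

from databuilder.loader.file_system_elasticsearch_json_loader import FSElasticsearchJSONLoader
from databuilder.models.table_elasticsearch_document import TableESDocument

loader = FSElasticsearchJSONLoader()
# 'file_path' and 'mode' map to FILE_PATH_CONFIG_KEY and FILE_MODE_CONFIG_KEY above
loader.init(ConfigFactory.from_dict({
    'file_path': '/tmp/amundsen/search_data.json',  # hypothetical output location
    'mode': 'w',
}))

# Each load() call appends one newline-terminated JSON document to the file.
doc = TableESDocument(database='hive', cluster='gold', schema='core', name='orders',
                      key='hive://gold.core/orders', description='Orders fact table',
                      last_updated_timestamp=None, column_names=['id'],
                      column_descriptions=['primary key'], total_usage=0, unique_usage=0,
                      tags=[])
loader.load(doc)
loader.close()

When the loader is driven by DefaultTask instead of called directly, the same keys are supplied under its 'loader.filesystem.elasticsearch' scope.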
-------------------------------------------------------------------------------- /databuilder/serializers/neo4_serializer.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from typing import Dict, Any, Optional 5 | 6 | from databuilder.models.graph_relationship import GraphRelationship 7 | from databuilder.models.graph_node import GraphNode 8 | from databuilder.models.graph_serializable import ( 9 | NODE_LABEL, 10 | NODE_KEY, 11 | RELATION_END_KEY, 12 | RELATION_END_LABEL, 13 | RELATION_REVERSE_TYPE, 14 | RELATION_START_KEY, 15 | RELATION_START_LABEL, 16 | RELATION_TYPE 17 | ) 18 | from databuilder.publisher.neo4j_csv_publisher import UNQUOTED_SUFFIX 19 | 20 | 21 | def serialize_node(node: Optional[GraphNode]) -> Dict[str, Any]: 22 | if node is None: 23 | return {} 24 | 25 | node_dict = { 26 | NODE_LABEL: node.label, 27 | NODE_KEY: node.key 28 | } 29 | for key, value in node.attributes.items(): 30 | key_suffix = _get_neo4j_suffix_value(value) 31 | formatted_key = "{key}{suffix}".format( 32 | key=key, 33 | suffix=key_suffix 34 | ) 35 | node_dict[formatted_key] = value 36 | return node_dict 37 | 38 | 39 | def serialize_relationship(relationship: Optional[GraphRelationship]) -> Dict[str, Any]: 40 | if relationship is None: 41 | return {} 42 | 43 | relationship_dict = { 44 | RELATION_START_KEY: relationship.start_key, 45 | RELATION_START_LABEL: relationship.start_label, 46 | RELATION_END_KEY: relationship.end_key, 47 | RELATION_END_LABEL: relationship.end_label, 48 | RELATION_TYPE: relationship.type, 49 | RELATION_REVERSE_TYPE: relationship.reverse_type, 50 | } 51 | for key, value in relationship.attributes.items(): 52 | key_suffix = _get_neo4j_suffix_value(value) 53 | formatted_key = "{key}{suffix}".format( 54 | key=key, 55 | suffix=key_suffix 56 | ) 57 | relationship_dict[formatted_key] = value 58 | 59 | return relationship_dict 60 | 61 | 62 | def _get_neo4j_suffix_value(value: Any) -> str: 63 | if isinstance(value, int): 64 | return UNQUOTED_SUFFIX 65 | 66 | if isinstance(value, bool): 67 | return UNQUOTED_SUFFIX 68 | 69 | return '' 70 | -------------------------------------------------------------------------------- /tests/unit/models/dashboard/test_dashboard_query.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | 6 | from databuilder.models.dashboard.dashboard_query import DashboardQuery 7 | from databuilder.models.graph_serializable import NODE_KEY, \ 8 | NODE_LABEL, RELATION_START_KEY, RELATION_START_LABEL, RELATION_END_KEY, \ 9 | RELATION_END_LABEL, RELATION_TYPE, RELATION_REVERSE_TYPE 10 | from databuilder.serializers import neo4_serializer 11 | 12 | 13 | class TestDashboardQuery(unittest.TestCase): 14 | 15 | def test_create_nodes(self) -> None: 16 | 17 | dashboard_query = DashboardQuery(dashboard_group_id='dg_id', 18 | dashboard_id='d_id', 19 | query_id='q_id', 20 | query_name='q_name', 21 | url='http://foo.bar/query/baz', 22 | query_text='SELECT * FROM foo.bar') 23 | 24 | actual = dashboard_query.create_next_node() 25 | actual_serialized = neo4_serializer.serialize_node(actual) 26 | expected = {'url': 'http://foo.bar/query/baz', 'name': 'q_name', 'id': 'q_id', 27 | 'query_text': 'SELECT * FROM foo.bar', 28 | NODE_KEY: '_dashboard://gold.dg_id/d_id/query/q_id', 29 | NODE_LABEL: DashboardQuery.DASHBOARD_QUERY_LABEL} 30 | 31 | self.assertEqual(expected, actual_serialized) 32 | 33 | def test_create_relation(self) -> None: 34 | dashboard_query = DashboardQuery(dashboard_group_id='dg_id', 35 | dashboard_id='d_id', 36 | query_id='q_id', 37 | query_name='q_name') 38 | 39 | actual = dashboard_query.create_next_relation() 40 | actual_serialized = neo4_serializer.serialize_relationship(actual) 41 | expected = {RELATION_END_KEY: '_dashboard://gold.dg_id/d_id/query/q_id', RELATION_START_LABEL: 'Dashboard', 42 | RELATION_END_LABEL: DashboardQuery.DASHBOARD_QUERY_LABEL, 43 | RELATION_START_KEY: '_dashboard://gold.dg_id/d_id', RELATION_TYPE: 'HAS_QUERY', 44 | RELATION_REVERSE_TYPE: 'QUERY_OF'} 45 | 46 | self.assertEqual(expected, actual_serialized) 47 | -------------------------------------------------------------------------------- /databuilder/extractor/restapi/rest_api_extractor.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | import importlib 6 | from typing import Any, Iterator, Dict, Optional 7 | 8 | from pyhocon import ConfigTree 9 | 10 | from databuilder.extractor.base_extractor import Extractor 11 | from databuilder.rest_api.base_rest_api_query import BaseRestApiQuery 12 | 13 | 14 | REST_API_QUERY = 'restapi_query' 15 | MODEL_CLASS = 'model_class' 16 | 17 | # Static record that will be added into extracted record 18 | # For example, DashboardMetadata requires product name (static name) of Dashboard and REST api does not provide 19 | # it. and you can add {'product': 'mode'} so that it will be included in the record. 20 | STATIC_RECORD_DICT = 'static_record_dict' 21 | 22 | LOGGER = logging.getLogger(__name__) 23 | 24 | 25 | class RestAPIExtractor(Extractor): 26 | """ 27 | An Extractor that calls one or more REST API to extract the data. 28 | This extractor almost entirely depends on RestApiQuery. 
29 | """ 30 | 31 | def init(self, conf: ConfigTree) -> None: 32 | 33 | self._restapi_query: BaseRestApiQuery = conf.get(REST_API_QUERY) 34 | self._iterator: Optional[Iterator[Dict[str, Any]]] = None 35 | self._static_dict = conf.get(STATIC_RECORD_DICT, dict()) 36 | LOGGER.info('static record: {}'.format(self._static_dict)) 37 | 38 | model_class = conf.get(MODEL_CLASS, None) 39 | if model_class: 40 | module_name, class_name = model_class.rsplit(".", 1) 41 | mod = importlib.import_module(module_name) 42 | self.model_class = getattr(mod, class_name) 43 | 44 | def extract(self) -> Any: 45 | """ 46 | Fetch one result row from RestApiQuery, convert to {model_class} if specified before 47 | returning. 48 | :return: 49 | """ 50 | 51 | if not self._iterator: 52 | self._iterator = self._restapi_query.execute() 53 | 54 | try: 55 | record = next(self._iterator) 56 | except StopIteration: 57 | return None 58 | 59 | if self._static_dict: 60 | record.update(self._static_dict) 61 | 62 | if hasattr(self, 'model_class'): 63 | return self.model_class(**record) 64 | 65 | return record 66 | 67 | def get_scope(self) -> str: 68 | 69 | return 'extractor.restapi' 70 | -------------------------------------------------------------------------------- /databuilder/extractor/user/bamboohr/bamboohr_user_extractor.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | 5 | from pyhocon import ConfigTree 6 | import requests 7 | from requests.auth import HTTPBasicAuth 8 | from typing import Iterator, Optional 9 | from xml.etree import ElementTree 10 | 11 | from databuilder.extractor.base_extractor import Extractor 12 | from databuilder.models.user import User 13 | 14 | 15 | class BamboohrUserExtractor(Extractor): 16 | API_KEY = 'api_key' 17 | SUBDOMAIN = 'subdomain' 18 | 19 | def init(self, conf: ConfigTree) -> None: 20 | self._extract_iter: Optional[Iterator] = None 21 | self._extract_iter = None 22 | 23 | self._api_key = conf.get_string(BamboohrUserExtractor.API_KEY) 24 | self._subdomain = conf.get_string(BamboohrUserExtractor.SUBDOMAIN) 25 | 26 | def extract(self) -> Optional[User]: 27 | if not self._extract_iter: 28 | self._extract_iter = self._get_extract_iter() 29 | try: 30 | return next(self._extract_iter) 31 | except StopIteration: 32 | return None 33 | 34 | def _employee_directory_uri(self) -> str: 35 | return 'https://api.bamboohr.com/api/gateway.php/{subdomain}/v1/employees/directory'.format( 36 | subdomain=self._subdomain 37 | ) 38 | 39 | def _get_extract_iter(self) -> Iterator[User]: 40 | response = requests.get( 41 | self._employee_directory_uri(), auth=HTTPBasicAuth(self._api_key, 'x') 42 | ) 43 | 44 | root = ElementTree.fromstring(response.content) 45 | 46 | for user in root.findall('./employees/employee'): 47 | 48 | def get_field(name: str) -> str: 49 | field = user.find('./field[@id=\'{name}\']'.format(name=name)) 50 | if field is not None and field.text is not None: 51 | return field.text 52 | else: 53 | return '' 54 | 55 | yield User( 56 | email=get_field('workEmail'), 57 | first_name=get_field('firstName'), 58 | last_name=get_field('lastName'), 59 | name=get_field('displayName'), 60 | team_name=get_field('department'), 61 | role_name=get_field('jobTitle'), 62 | ) 63 | 64 | def get_scope(self) -> str: 65 | return 'extractor.bamboohr_user' 66 | -------------------------------------------------------------------------------- 
/tests/unit/extractor/test_kafka_source_extractor.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | from mock import patch, MagicMock 6 | import unittest 7 | 8 | from pyhocon import ConfigFactory 9 | 10 | from databuilder import Scoped 11 | from databuilder.extractor.kafka_source_extractor import KafkaSourceExtractor 12 | 13 | 14 | class TestKafkaSourceExtractor(unittest.TestCase): 15 | def setUp(self) -> None: 16 | logging.basicConfig(level=logging.INFO) 17 | config_dict = { 18 | 'extractor.kafka_source.consumer_config': {'"group.id"': 'consumer-group', 19 | '"enable.auto.commit"': False}, 20 | 'extractor.kafka_source.{}'.format(KafkaSourceExtractor.RAW_VALUE_TRANSFORMER): 21 | 'databuilder.transformer.base_transformer.NoopTransformer', 22 | 'extractor.kafka_source.{}'.format(KafkaSourceExtractor.TOPIC_NAME_LIST): ['test-topic'], 23 | 'extractor.kafka_source.{}'.format(KafkaSourceExtractor.CONSUMER_TOTAL_TIMEOUT_SEC): 1, 24 | 25 | } 26 | self.conf = ConfigFactory.from_dict(config_dict) 27 | 28 | def test_consume_success(self) -> None: 29 | kafka_extractor = KafkaSourceExtractor() 30 | kafka_extractor.init(Scoped.get_scoped_conf(conf=self.conf, 31 | scope=kafka_extractor.get_scope())) 32 | 33 | with patch.object(kafka_extractor, 'consumer') as mock_consumer: 34 | 35 | mock_poll = MagicMock() 36 | mock_poll.error.return_value = False 37 | # only return once 38 | mock_poll.value.side_effect = ['msg'] 39 | mock_consumer.poll.return_value = mock_poll 40 | 41 | records = kafka_extractor.consume() 42 | self.assertEqual(len(records), 1) 43 | 44 | def test_consume_fail(self) -> None: 45 | kafka_extractor = KafkaSourceExtractor() 46 | kafka_extractor.init(Scoped.get_scoped_conf(conf=self.conf, 47 | scope=kafka_extractor.get_scope())) 48 | 49 | with patch.object(kafka_extractor, 'consumer') as mock_consumer: 50 | mock_poll = MagicMock() 51 | mock_poll.error.return_value = True 52 | mock_consumer.poll.return_value = mock_poll 53 | 54 | records = kafka_extractor.consume() 55 | self.assertEqual(len(records), 0) 56 | -------------------------------------------------------------------------------- /tests/unit/models/test_user_elasticsearch_document.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import json 5 | import unittest 6 | 7 | from databuilder.models.user_elasticsearch_document import UserESDocument 8 | 9 | 10 | class TestUserElasticsearchDocument(unittest.TestCase): 11 | 12 | def test_to_json(self) -> None: 13 | """ 14 | Test string generated from to_json method 15 | """ 16 | test_obj = UserESDocument(email='test@email.com', 17 | first_name='test_firstname', 18 | last_name='test_lastname', 19 | full_name='full_name', 20 | github_username='github_user', 21 | team_name='team', 22 | employee_type='fte', 23 | manager_email='test_manager', 24 | slack_id='test_slack', 25 | role_name='role_name', 26 | is_active=True, 27 | total_read=2, 28 | total_own=3, 29 | total_follow=1) 30 | 31 | expected_document_dict = {"first_name": "test_firstname", 32 | "last_name": "test_lastname", 33 | "full_name": "full_name", 34 | "team_name": "team", 35 | "total_follow": 1, 36 | "total_read": 2, 37 | "is_active": True, 38 | "total_own": 3, 39 | "slack_id": 'test_slack', 40 | "role_name": 'role_name', 41 | "manager_email": "test_manager", 42 | 'github_username': "github_user", 43 | "employee_type": 'fte', 44 | "email": "test@email.com", 45 | } 46 | 47 | result = test_obj.to_json() 48 | results = result.split("\n") 49 | 50 | # verify two new line characters in result 51 | self.assertEqual(len(results), 2, "Result from to_json() function doesn't have a newline!") 52 | 53 | self.assertDictEqual(json.loads(results[0]), expected_document_dict) 54 | -------------------------------------------------------------------------------- /databuilder/publisher/base_publisher.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import abc 5 | 6 | from pyhocon import ConfigTree 7 | from typing import List 8 | 9 | from databuilder import Scoped 10 | from databuilder.callback import call_back 11 | from databuilder.callback.call_back import Callback 12 | 13 | 14 | class Publisher(Scoped): 15 | """ 16 | A Publisher writes a dataset (not individual records) in an atomic 17 | manner, if possible 18 | (either success or failure, with no partial state). 19 | Use case: if you want to use the Neo4j import utility or the LOAD CSV 20 | utility, which take a CSV file to load the database, you first need to 21 | create that CSV file. A CSV file holds a number of records, and a 22 | loader can write multiple records to it. Once the loader finishes 23 | writing, you have a complete CSV file, ready to publish to Neo4j. 24 | The Publisher can take the location of the CSV file and push it to Neo4j. 25 | 26 | """ 27 | 28 | def __init__(self) -> None: 29 | self.call_backs: List[Callback] = [] 30 | 31 | @abc.abstractmethod 32 | def init(self, conf: ConfigTree) -> None: 33 | pass 34 | 35 | def publish(self) -> None: 36 | try: 37 | self.publish_impl() 38 | except Exception: 39 | call_back.notify_callbacks(self.call_backs, is_success=False) 40 | raise 41 | call_back.notify_callbacks(self.call_backs, is_success=True) 42 | 43 | @abc.abstractmethod 44 | def publish_impl(self) -> None: 45 | """ 46 | An implementation of the publish method.
A subclass of Publisher is expected to implement the publish logic by overriding 47 | this method 48 | :return: None 49 | """ 50 | pass 51 | 52 | def register_call_back(self, callback: Callback) -> None: 53 | """ 54 | Register any callback that needs to be notified when the publisher either successfully publishes 55 | or fails to publish 56 | :param callback: 57 | :return: None 58 | """ 59 | self.call_backs.append(callback) 60 | 61 | def get_scope(self) -> str: 62 | return 'publisher' 63 | 64 | 65 | class NoopPublisher(Publisher): 66 | def __init__(self) -> None: 67 | super(NoopPublisher, self).__init__() 68 | 69 | def init(self, conf: ConfigTree) -> None: 70 | pass 71 | 72 | def publish_impl(self) -> None: 73 | pass 74 | 75 | def get_scope(self) -> str: 76 | return 'publisher.noop' 77 | -------------------------------------------------------------------------------- /databuilder/extractor/sql_alchemy_extractor.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import importlib 5 | from sqlalchemy import create_engine 6 | 7 | from pyhocon import ConfigTree 8 | from typing import Any 9 | 10 | from databuilder.extractor.base_extractor import Extractor 11 | 12 | 13 | class SQLAlchemyExtractor(Extractor): 14 | # Config keys 15 | CONN_STRING = 'conn_string' 16 | EXTRACT_SQL = 'extract_sql' 17 | """ 18 | An Extractor that extracts records via SQLAlchemy. Any database that supports SQLAlchemy can use this extractor 19 | """ 20 | 21 | def init(self, conf: ConfigTree) -> None: 22 | """ 23 | Establish connections and import data model class if provided 24 | :param conf: 25 | """ 26 | self.conf = conf 27 | self.conn_string = conf.get_string(SQLAlchemyExtractor.CONN_STRING) 28 | self.connection = self._get_connection() 29 | 30 | self.extract_sql = conf.get_string(SQLAlchemyExtractor.EXTRACT_SQL) 31 | 32 | model_class = conf.get('model_class', None) 33 | if model_class: 34 | module_name, class_name = model_class.rsplit(".", 1) 35 | mod = importlib.import_module(module_name) 36 | self.model_class = getattr(mod, class_name) 37 | self._execute_query() 38 | 39 | def _get_connection(self) -> Any: 40 | """ 41 | Create a SQLAlchemy connection to Database 42 | """ 43 | engine = create_engine(self.conn_string) 44 | conn = engine.connect() 45 | return conn 46 | 47 | def _execute_query(self) -> None: 48 | """ 49 | Create an iterator to execute sql. 50 | """ 51 | if not hasattr(self, 'results'): 52 | self.results = self.connection.execute(self.extract_sql) 53 | 54 | if hasattr(self, 'model_class'): 55 | results = [self.model_class(**result) 56 | for result in self.results] 57 | else: 58 | results = self.results 59 | self.iter = iter(results) 60 | 61 | def extract(self) -> Any: 62 | """ 63 | Yield the sql result one at a time. 64 | Convert the result to a model if a model_class is provided 65 | """ 66 | try: 67 | return next(self.iter) 68 | except StopIteration: 69 | return None 70 | 71 | def get_scope(self) -> str: 72 | return 'extractor.sqlalchemy' 73 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project.
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from setuptools import setup, find_packages 5 | 6 | 7 | __version__ = '4.0.3' 8 | 9 | 10 | requirements = [ 11 | "neo4j-driver>=1.7.2,<4.0", 12 | "pytz>=2018.4", 13 | "statsd>=3.2.1", 14 | "retrying>=1.3.3", 15 | "requests>=2.23.0,<3.0", 16 | "elasticsearch>=6.2.0,<7.0", 17 | "pyhocon>=0.3.42", 18 | "unidecode", 19 | "Jinja2>=2.10.0,<2.12", 20 | "pandas>=0.21.0,<1.2.0" 21 | ] 22 | 23 | kafka = ['confluent-kafka==1.0.0'] 24 | 25 | cassandra = ['cassandra-driver==3.20.1'] 26 | 27 | glue = ['boto3==1.10.1'] 28 | 29 | snowflake = [ 30 | 'snowflake-connector-python', 31 | 'snowflake-sqlalchemy' 32 | ] 33 | 34 | athena = ['PyAthena[SQLAlchemy]>=1.0.0'] 35 | 36 | # Python API client for google 37 | # License: Apache Software License 38 | # Upstream url: https://github.com/googleapis/google-api-python-client 39 | bigquery = [ 40 | 'google-api-python-client>=1.6.0, <2.0.0dev', 41 | 'google-auth-httplib2>=0.0.1', 42 | 'google-auth>=1.0.0, <2.0.0dev' 43 | ] 44 | 45 | jsonpath = ['jsonpath_rw==1.4.0'] 46 | 47 | db2 = [ 48 | 'ibm_db==3.0.1', 49 | 'ibm-db-sa-py3==0.3.1-1' 50 | ] 51 | 52 | dremio = [ 53 | 'pyodbc==4.0.30' 54 | ] 55 | 56 | druid = [ 57 | 'pydruid' 58 | ] 59 | 60 | spark = [ 61 | 'pyspark == 3.0.1' 62 | ] 63 | 64 | all_deps = requirements + kafka + cassandra + glue + snowflake + athena + \ 65 | bigquery + jsonpath + db2 + dremio + druid + spark 66 | 67 | setup( 68 | name='amundsen-databuilder', 69 | version=__version__, 70 | description='Amundsen Data builder', 71 | url='https://www.github.com/amundsen-io/amundsendatabuilder', 72 | maintainer='Amundsen TSC', 73 | maintainer_email='amundsen-tsc@lists.lfai.foundation', 74 | packages=find_packages(exclude=['tests*']), 75 | dependency_links=[], 76 | install_requires=requirements, 77 | python_requires='>=3.6', 78 | extras_require={ 79 | 'all': all_deps, 80 | 'kafka': kafka, # To use with Kafka source extractor 81 | 'cassandra': cassandra, 82 | 'glue': glue, 83 | 'snowflake': snowflake, 84 | 'athena': athena, 85 | 'bigquery': bigquery, 86 | 'jsonpath': jsonpath, 87 | 'db2': db2, 88 | 'dremio': dremio, 89 | 'druid': druid, 90 | 'delta-lake': spark 91 | }, 92 | classifiers=[ 93 | 'Programming Language :: Python :: 3.6', 94 | 'Programming Language :: Python :: 3.7', 95 | ], 96 | ) 97 | -------------------------------------------------------------------------------- /databuilder/extractor/dashboard/mode_analytics/mode_dashboard_usage_extractor.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project.
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | 6 | from pyhocon import ConfigTree 7 | from typing import Any 8 | 9 | from databuilder.extractor.base_extractor import Extractor 10 | from databuilder.extractor.dashboard.mode_analytics.mode_dashboard_utils import ModeDashboardUtils 11 | from databuilder.rest_api.mode_analytics.mode_paginated_rest_api_query import ModePaginatedRestApiQuery 12 | from databuilder.rest_api.rest_api_query import RestApiQuery 13 | 14 | LOGGER = logging.getLogger(__name__) 15 | 16 | 17 | class ModeDashboardUsageExtractor(Extractor): 18 | """ 19 | An Extractor that extracts a Mode dashboard's accumulated view count 20 | """ 21 | 22 | def init(self, conf: ConfigTree) -> None: 23 | self._conf = conf 24 | 25 | restapi_query = self._build_restapi_query() 26 | self._extractor = ModeDashboardUtils.create_mode_rest_api_extractor(restapi_query=restapi_query, 27 | conf=self._conf) 28 | 29 | def extract(self) -> Any: 30 | return self._extractor.extract() 31 | 32 | def get_scope(self) -> str: 33 | return 'extractor.mode_dashboard_usage' 34 | 35 | def _build_restapi_query(self) -> RestApiQuery: 36 | """ 37 | Build the REST API query. To get Mode Dashboard usage, it needs to call two APIs (the spaces API and the 38 | reports API) and join the results. 39 | :return: A RestApiQuery that provides Mode Dashboard metadata 40 | """ 41 | 42 | # https://mode.com/developer/api-reference/analytics/reports/#listReportsInSpace 43 | reports_url_template = 'https://app.mode.com/api/{organization}/spaces/{dashboard_group_id}/reports' 44 | 45 | spaces_query = ModeDashboardUtils.get_spaces_query_api(conf=self._conf) 46 | params = ModeDashboardUtils.get_auth_params(conf=self._conf) 47 | 48 | # Reports 49 | # JSONPath expression: it goes into the array located at _embedded.reports and then extracts token 50 | # and view_count 51 | json_path = '_embedded.reports[*].[token,view_count]' 52 | field_names = ['dashboard_id', 'accumulated_view_count'] 53 | reports_query = ModePaginatedRestApiQuery(query_to_join=spaces_query, url=reports_url_template, params=params, 54 | json_path=json_path, field_names=field_names, skip_no_result=True) 55 | return reports_query 56 | -------------------------------------------------------------------------------- /tests/unit/transformer/test_bigquery_usage_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project.
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | 6 | from pyhocon import ConfigFactory 7 | 8 | from databuilder.transformer.bigquery_usage_transformer import BigqueryUsageTransformer 9 | from databuilder.extractor.bigquery_usage_extractor import TableColumnUsageTuple 10 | from databuilder.models.table_column_usage import TableColumnUsage 11 | 12 | 13 | class TestBigQueryUsageTransform(unittest.TestCase): 14 | 15 | DATABASE = 'bigquery' 16 | CLUSTER = 'your-project-here' 17 | DATASET = 'dataset' 18 | TABLE = 'table' 19 | COLUMN = '*' 20 | EMAIL = 'your-user-here@test.com' 21 | READ_COUNT = 305 22 | 23 | def test_transform_function(self) -> None: 24 | config = ConfigFactory.from_dict({}) 25 | 26 | transformer = BigqueryUsageTransformer() 27 | transformer.init(config) 28 | 29 | key = TableColumnUsageTuple(database=TestBigQueryUsageTransform.DATABASE, 30 | cluster=TestBigQueryUsageTransform.CLUSTER, 31 | schema=TestBigQueryUsageTransform.DATASET, 32 | table=TestBigQueryUsageTransform.TABLE, 33 | column=TestBigQueryUsageTransform.COLUMN, 34 | email=TestBigQueryUsageTransform.EMAIL) 35 | 36 | t1 = (key, TestBigQueryUsageTransform.READ_COUNT) 37 | xformed = transformer.transform(t1) 38 | 39 | assert xformed is not None 40 | self.assertIsInstance(xformed, TableColumnUsage) 41 | col_readers = list(xformed.col_readers) 42 | self.assertEqual(len(col_readers), 1) 43 | col_reader = col_readers[0] 44 | self.assertEqual(col_reader.cluster, TestBigQueryUsageTransform.CLUSTER) 45 | self.assertEqual(col_reader.database, TestBigQueryUsageTransform.DATABASE) 46 | self.assertEqual(col_reader.schema, TestBigQueryUsageTransform.DATASET) 47 | self.assertEqual(col_reader.table, TestBigQueryUsageTransform.TABLE) 48 | self.assertEqual(col_reader.column, TestBigQueryUsageTransform.COLUMN) 49 | self.assertEqual(col_reader.user_email, TestBigQueryUsageTransform.EMAIL) 50 | self.assertEqual(col_reader.read_count, TestBigQueryUsageTransform.READ_COUNT) 51 | 52 | def test_scope(self) -> None: 53 | config = ConfigFactory.from_dict({}) 54 | 55 | transformer = BigqueryUsageTransformer() 56 | transformer.init(config) 57 | 58 | self.assertEqual(transformer.get_scope(), 'transformer.bigquery_usage') 59 | -------------------------------------------------------------------------------- /tests/unit/extractor/dashboard/mode_analytics/batch/test_mode_dashboard_charts_batch_extractor.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | from mock import patch 6 | from pyhocon import ConfigFactory 7 | 8 | from databuilder import Scoped 9 | from databuilder.extractor.dashboard.mode_analytics.batch.\ 10 | mode_dashboard_charts_batch_extractor import ModeDashboardChartsBatchExtractor 11 | 12 | 13 | class TestModeDashboardChartsBatchExtractor(unittest.TestCase): 14 | def setUp(self) -> None: 15 | config = ConfigFactory.from_dict({ 16 | 'extractor.mode_dashboard_chart_batch.organization': 'amundsen', 17 | 'extractor.mode_dashboard_chart_batch.mode_user_token': 'amundsen_user_token', 18 | 'extractor.mode_dashboard_chart_batch.mode_password_token': 'amundsen_password_token', 19 | 'extractor.mode_dashboard_chart_batch.mode_bearer_token': 'amundsen_bearer_token', 20 | }) 21 | self.config = config 22 | 23 | def test_dashboard_chart_extractor_empty_record(self) -> None: 24 | extractor = ModeDashboardChartsBatchExtractor() 25 | extractor.init(Scoped.get_scoped_conf(conf=self.config, scope=extractor.get_scope())) 26 | 27 | with patch('databuilder.rest_api.rest_api_query.requests.get'): 28 | record = extractor.extract() 29 | self.assertIsNone(record) 30 | 31 | def test_dashboard_chart_extractor_actual_record(self) -> None: 32 | extractor = ModeDashboardChartsBatchExtractor() 33 | extractor.init(Scoped.get_scoped_conf(conf=self.config, scope=extractor.get_scope())) 34 | 35 | with patch('databuilder.extractor.restapi.rest_api_extractor.RestAPIExtractor.extract') as mock_get: 36 | mock_get.return_value = { 37 | 'organization': 'amundsen', 38 | 'is_active': None, 39 | 'updated_at': None, 40 | 'do_not_update_empty_attribute': True, 41 | 'dashboard_group_id': 'ggg', 42 | 'dashboard_id': 'ddd', 43 | 'query_id': 'yyy', 44 | 'chart_id': 'xxx', 45 | 'chart_name': 'some chart', 46 | 'chart_type': 'bigNumber', 47 | 'product': 'mode' 48 | } 49 | 50 | record = extractor.extract() 51 | self.assertEqual(record._dashboard_group_id, 'ggg') 52 | self.assertEqual(record._dashboard_id, 'ddd') 53 | self.assertEqual(record._chart_name, 'some chart') 54 | self.assertEqual(record._product, 'mode') 55 | 56 | 57 | if __name__ == '__main__': 58 | unittest.main() 59 | -------------------------------------------------------------------------------- /databuilder/task/task.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | 6 | from pyhocon import ConfigTree 7 | 8 | from databuilder import Scoped 9 | from databuilder.extractor.base_extractor import Extractor 10 | from databuilder.loader.base_loader import Loader 11 | from databuilder.task.base_task import Task 12 | from databuilder.transformer.base_transformer import Transformer 13 | from databuilder.transformer.base_transformer \ 14 | import NoopTransformer 15 | from databuilder.utils.closer import Closer 16 | 17 | 18 | LOGGER = logging.getLogger(__name__) 19 | 20 | 21 | class DefaultTask(Task): 22 | """ 23 | A default task expecting to extract, transform and load. 
24 | 25 | """ 26 | 27 | # Determines the frequency of the log on task progress 28 | PROGRESS_REPORT_FREQUENCY = 'progress_report_frequency' 29 | 30 | def __init__(self, 31 | extractor: Extractor, 32 | loader: Loader, 33 | transformer: Transformer = NoopTransformer()) -> None: 34 | self.extractor = extractor 35 | self.transformer = transformer 36 | self.loader = loader 37 | 38 | self._closer = Closer() 39 | self._closer.register(self.extractor.close) 40 | self._closer.register(self.transformer.close) 41 | self._closer.register(self.loader.close) 42 | 43 | def init(self, conf: ConfigTree) -> None: 44 | self._progress_report_frequency = \ 45 | conf.get_int('{}.{}'.format(self.get_scope(), DefaultTask.PROGRESS_REPORT_FREQUENCY), 500) 46 | 47 | self.extractor.init(Scoped.get_scoped_conf(conf, self.extractor.get_scope())) 48 | self.transformer.init(Scoped.get_scoped_conf(conf, self.transformer.get_scope())) 49 | self.loader.init(Scoped.get_scoped_conf(conf, self.loader.get_scope())) 50 | 51 | def run(self) -> None: 52 | """ 53 | Runs a task 54 | :return: 55 | """ 56 | LOGGER.info('Running a task') 57 | try: 58 | record = self.extractor.extract() 59 | count = 1 60 | while record: 61 | record = self.transformer.transform(record) 62 | if not record: 63 | record = self.extractor.extract() 64 | continue 65 | self.loader.load(record) 66 | record = self.extractor.extract() 67 | count += 1 68 | if count > 0 and count % self._progress_report_frequency == 0: 69 | LOGGER.info('Extracted {} records so far'.format(count)) 70 | 71 | finally: 72 | self._closer.close() 73 | -------------------------------------------------------------------------------- /databuilder/models/schema/schema.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | from typing import Any, Optional, Union, Iterator 5 | 6 | from databuilder.models.graph_serializable import GraphSerializable 7 | from databuilder.models.schema.schema_constant import SCHEMA_NODE_LABEL, SCHEMA_NAME_ATTR 8 | from databuilder.models.table_metadata import DescriptionMetadata 9 | from databuilder.models.graph_node import GraphNode 10 | from databuilder.models.graph_relationship import GraphRelationship 11 | 12 | 13 | class SchemaModel(GraphSerializable): 14 | 15 | def __init__(self, 16 | schema_key: str, 17 | schema: str, 18 | description: Optional[str] = None, 19 | description_source: Optional[str] = None, 20 | **kwargs: Any 21 | ) -> None: 22 | self._schema_key = schema_key 23 | self._schema = schema 24 | self._description = DescriptionMetadata.create_description_metadata(text=description, 25 | source=description_source) \ 26 | if description else None 27 | self._node_iterator = self._create_node_iterator() 28 | self._relation_iterator = self._create_relation_iterator() 29 | 30 | def create_next_node(self) -> Union[GraphNode, None]: 31 | try: 32 | return next(self._node_iterator) 33 | except StopIteration: 34 | return None 35 | 36 | def _create_node_iterator(self) -> Iterator[GraphNode]: 37 | node = GraphNode( 38 | key=self._schema_key, 39 | label=SCHEMA_NODE_LABEL, 40 | attributes={ 41 | SCHEMA_NAME_ATTR: self._schema, 42 | } 43 | ) 44 | yield node 45 | 46 | if self._description: 47 | yield self._description.get_node(self._get_description_node_key()) 48 | 49 | def create_next_relation(self) -> Union[GraphRelationship, None]: 50 | try: 51 | return next(self._relation_iterator) 52 | except StopIteration: 53 | return None 54 | 55 | def _get_description_node_key(self) -> str: 56 | desc = self._description.get_description_id() if self._description is not None else '' 57 | return '{}/{}'.format(self._schema_key, desc) 58 | 59 | def _create_relation_iterator(self) -> Iterator[GraphRelationship]: 60 | if self._description: 61 | yield self._description.get_relation(start_node=SCHEMA_NODE_LABEL, 62 | start_key=self._schema_key, 63 | end_key=self._get_description_node_key()) 64 | -------------------------------------------------------------------------------- /tests/unit/transformer/test_table_tag_transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project.
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import unittest 5 | 6 | from pyhocon import ConfigFactory 7 | 8 | from databuilder.transformer.table_tag_transformer import TableTagTransformer 9 | from databuilder.models.table_metadata import TableMetadata 10 | 11 | 12 | class TestTableTagTransformer(unittest.TestCase): 13 | def test_single_tag(self) -> None: 14 | transformer = TableTagTransformer() 15 | config = ConfigFactory.from_dict({ 16 | TableTagTransformer.TAGS: 'foo', 17 | }) 18 | transformer.init(conf=config) 19 | 20 | result = transformer.transform(TableMetadata( 21 | database='test_db', 22 | cluster='test_cluster', 23 | schema='test_schema', 24 | name='test_table', 25 | description='', 26 | )) 27 | 28 | self.assertEqual(result.tags, ['foo']) 29 | 30 | def test_multiple_tags_comma_delimited(self) -> None: 31 | transformer = TableTagTransformer() 32 | config = ConfigFactory.from_dict({ 33 | TableTagTransformer.TAGS: 'foo,bar', 34 | }) 35 | transformer.init(conf=config) 36 | 37 | result = transformer.transform(TableMetadata( 38 | database='test_db', 39 | cluster='test_cluster', 40 | schema='test_schema', 41 | name='test_table', 42 | description='', 43 | )) 44 | 45 | self.assertEqual(result.tags, ['foo', 'bar']) 46 | 47 | def test_add_tag_to_existing_tags(self) -> None: 48 | transformer = TableTagTransformer() 49 | config = ConfigFactory.from_dict({ 50 | TableTagTransformer.TAGS: 'baz', 51 | }) 52 | transformer.init(conf=config) 53 | 54 | result = transformer.transform(TableMetadata( 55 | database='test_db', 56 | cluster='test_cluster', 57 | schema='test_schema', 58 | name='test_table', 59 | description='', 60 | tags='foo,bar', 61 | )) 62 | self.assertEqual(result.tags, ['foo', 'bar', 'baz']) 63 | 64 | def test_tags_not_added_to_other_objects(self) -> None: 65 | transformer = TableTagTransformer() 66 | config = ConfigFactory.from_dict({ 67 | TableTagTransformer.TAGS: 'new_tag', 68 | }) 69 | transformer.init(conf=config) 70 | 71 | class NotATable(): 72 | tags = 'existing_tag' 73 | 74 | result = transformer.transform(NotATable()) 75 | 76 | self.assertEqual(result.tags, 'existing_tag') 77 | 78 | 79 | if __name__ == '__main__': 80 | unittest.main() 81 | -------------------------------------------------------------------------------- /databuilder/extractor/db_api_extractor.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import importlib 5 | import logging 6 | from typing import Iterable, Any 7 | 8 | from pyhocon import ConfigTree 9 | 10 | from databuilder.extractor.base_extractor import Extractor 11 | 12 | 13 | LOGGER = logging.getLogger(__name__) 14 | 15 | 16 | class DBAPIExtractor(Extractor): 17 | """ 18 | Generic DB API extractor. 19 | """ 20 | CONNECTION_CONFIG_KEY = 'connection' 21 | SQL_CONFIG_KEY = 'sql' 22 | 23 | def init(self, conf: ConfigTree) -> None: 24 | """ 25 | Receives a {Connection} object and {sql} to execute. 
26 | An optional model class can be passed, in which case each sql result row 27 | is converted to a class instance and returned to the calling 28 | function 29 | :param conf: 30 | :return: 31 | """ 32 | self.conf = conf 33 | self.connection: Any = conf.get(DBAPIExtractor.CONNECTION_CONFIG_KEY) 34 | self.cursor = self.connection.cursor() 35 | self.sql = conf.get(DBAPIExtractor.SQL_CONFIG_KEY) 36 | 37 | model_class = conf.get('model_class', None) 38 | if model_class: 39 | module_name, class_name = model_class.rsplit(".", 1) 40 | mod = importlib.import_module(module_name) 41 | self.model_class = getattr(mod, class_name) 42 | 43 | self._iter = iter(self._execute_query()) 44 | 45 | def _execute_query(self) -> Iterable[Any]: 46 | """ 47 | Use cursor to execute the {sql} 48 | :return: 49 | """ 50 | LOGGER.info('Executing query: \n{}'.format(self.sql)) 51 | self.cursor.execute(self.sql) 52 | return self.cursor.fetchall() 53 | 54 | def extract(self) -> Any: 55 | """ 56 | Fetch one sql result row, convert to {model_class} if specified before 57 | returning. 58 | :return: 59 | """ 60 | 61 | try: 62 | result = next(self._iter) 63 | except StopIteration: 64 | return None 65 | 66 | if hasattr(self, 'model_class'): 67 | obj = self.model_class(*result) 68 | return obj 69 | else: 70 | return result 71 | 72 | def close(self) -> None: 73 | """ 74 | close cursor and connection handlers 75 | :return: 76 | """ 77 | try: 78 | self.cursor.close() 79 | self.connection.close() 80 | except Exception as e: 81 | LOGGER.warning("Exception encountered while closing up connection handler: %s", e) 82 | 83 | def get_scope(self) -> str: 84 | return 'extractor.dbapi' 85 | -------------------------------------------------------------------------------- /tests/unit/models/test_dashboard_elasticsearch_document.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project.
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import json 5 | import unittest 6 | 7 | from databuilder.models.dashboard_elasticsearch_document import DashboardESDocument 8 | 9 | 10 | class TestDashboardElasticsearchDocument(unittest.TestCase): 11 | 12 | def test_to_json(self) -> None: 13 | """ 14 | Test string generated from to_json method 15 | """ 16 | test_obj = DashboardESDocument(group_name='test_dashboard_group', 17 | name='test_dashboard_name', 18 | description='test_description', 19 | product='mode', 20 | cluster='gold', 21 | group_description='work space group', 22 | query_names=['query1'], 23 | chart_names=['chart1'], 24 | group_url='mode_group_url', 25 | url='mode_report_url', 26 | uri='mode_dashboard://gold.cluster/dashboard_group/dashboard', 27 | last_successful_run_timestamp=10, 28 | total_usage=10, 29 | tags=['test'], 30 | badges=['test_badge']) 31 | 32 | expected_document_dict = {"group_name": "test_dashboard_group", 33 | "name": "test_dashboard_name", 34 | "description": "test_description", 35 | "product": "mode", 36 | "cluster": "gold", 37 | "group_url": "mode_group_url", 38 | "url": "mode_report_url", 39 | "uri": "mode_dashboard://gold.cluster/dashboard_group/dashboard", 40 | "query_names": ['query1'], 41 | "chart_names": ['chart1'], 42 | "last_successful_run_timestamp": 10, 43 | "group_description": "work space group", 44 | "total_usage": 10, 45 | "tags": ["test"], 46 | "badges": ["test_badge"], 47 | 48 | } 49 | 50 | result = test_obj.to_json() 51 | results = result.split("\n") 52 | 53 | # verify two new line characters in result 54 | self.assertEqual(len(results), 2, "Result from to_json() function doesn't have a newline!") 55 | self.assertDictEqual(json.loads(results[0]), expected_document_dict) 56 | -------------------------------------------------------------------------------- /tests/unit/models/test_table_elasticsearch_document.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import json 5 | import unittest 6 | 7 | from databuilder.models.table_elasticsearch_document import TableESDocument 8 | 9 | 10 | class TestTableElasticsearchDocument(unittest.TestCase): 11 | 12 | def test_to_json(self) -> None: 13 | """ 14 | Test string generated from to_json method 15 | """ 16 | test_obj = TableESDocument(database='test_database', 17 | cluster='test_cluster', 18 | schema='test_schema', 19 | name='test_table', 20 | key='test_table_key', 21 | last_updated_timestamp=123456789, 22 | description='test_table_description', 23 | column_names=['test_col1', 'test_col2'], 24 | column_descriptions=['test_description1', 'test_description2'], 25 | total_usage=100, 26 | unique_usage=10, 27 | tags=['test'], 28 | programmatic_descriptions=['test'], 29 | badges=['badge1'], 30 | schema_description='schema description') 31 | 32 | expected_document_dict = {"database": "test_database", 33 | "cluster": "test_cluster", 34 | "schema": "test_schema", 35 | "name": "test_table", 36 | "display_name": "test_schema.test_table", 37 | "key": "test_table_key", 38 | "last_updated_timestamp": 123456789, 39 | "description": "test_table_description", 40 | "column_names": ["test_col1", "test_col2"], 41 | "column_descriptions": ["test_description1", "test_description2"], 42 | "total_usage": 100, 43 | "unique_usage": 10, 44 | "tags": ["test"], 45 | "programmatic_descriptions": ['test'], 46 | "badges": ["badge1"], 47 | 'schema_description': 'schema description' 48 | } 49 | 50 | result = test_obj.to_json() 51 | results = result.split("\n") 52 | 53 | # verify two new line characters in result 54 | self.assertEqual(len(results), 2, "Result from to_json() function doesn't have a newline!") 55 | self.assertDictEqual(json.loads(results[0]), expected_document_dict) 56 | -------------------------------------------------------------------------------- /databuilder/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import abc 5 | 6 | from pyhocon import ConfigTree, ConfigFactory 7 | 8 | 9 | class Scoped(object, metaclass=abc.ABCMeta): 10 | _EMPTY_CONFIG = ConfigFactory.from_dict({}) 11 | """ 12 | An interface for classes that work with scoped (nested) config. 13 | https://github.com/chimpler/pyhocon 14 | A scoped instance will use config within its scope. This is a way to 15 | distribute configuration to its implementation instead of having it in 16 | one central place. 17 | This is very useful for DataBuilder as it has different components 18 | (extractor, transformer, loader, publisher) and each component 19 | can have multiple implementations. 20 | For example, these could be configurations for two different extractors: 21 | "extractor.mysql.url" for MySQLExtractor 22 | "extractor.filesystem.source_path" for FileSystemExtractor 23 | 24 | For MySQLExtractor, if you define the scope as "extractor.mysql", the scoped 25 | config is reduced to the config that is only for MySQL. 26 | config.get("extractor.mysql") provides all the config within 27 | 'extractor.mysql'. By removing the outer context from the config, 28 | MySQLExtractor is highly reusable. 29 | """ 30 | 31 | @abc.abstractmethod 32 | def init(self, conf: ConfigTree) -> None: 33 | """ 34 | All scoped instances are expected to be lazily initialized, meaning that 35 | __init__ should not perform any heavy operation such as a service call.
36 |         The reason behind this design is that Databuilder code doubles as its
37 |         configuration: you can instantiate a scoped instance with all the
38 |         parameters already set, ready to run, while the actual execution
39 |         only happens once init() is called and the component is
40 |         executed.
41 | 
42 |         :param conf: Typesafe config instance
43 |         :return: None
44 |         """
45 |         pass
46 | 
47 |     @abc.abstractmethod
48 |     def get_scope(self) -> str:
49 |         """
50 |         A scope for the config. Typesafe config supports nested config;
51 |         the scope string is used to peel off the nested config.
52 |         :return: the scope string
53 |         """
54 |         return ''
55 | 
56 |     def close(self) -> None:
57 |         """
58 |         Anything that needs to be cleaned up after the use of the instance.
59 |         :return: None
60 |         """
61 |         pass
62 | 
63 |     @classmethod
64 |     def get_scoped_conf(cls, conf: ConfigTree, scope: str) -> ConfigTree:
65 |         """
66 |         Convenience method to provide a scoped config.
67 | 
68 |         :param conf: Typesafe config instance
69 |         :param scope: scope string
70 |         :return: Typesafe config instance, reduced to the given scope
71 |         """
72 |         if not scope:
73 |             return Scoped._EMPTY_CONFIG
74 | 
75 |         return conf.get(scope, Scoped._EMPTY_CONFIG)
76 | 
-------------------------------------------------------------------------------- /databuilder/extractor/redshift_metadata_extractor.py: --------------------------------------------------------------------------------
1 | # Copyright Contributors to the Amundsen project.
2 | # SPDX-License-Identifier: Apache-2.0
3 | 
4 | from pyhocon import ConfigFactory, ConfigTree  # noqa: F401
5 | from typing import Iterator, Union, Dict, Any  # noqa: F401
6 | 
7 | from databuilder.extractor.base_postgres_metadata_extractor import BasePostgresMetadataExtractor
8 | 
9 | 
10 | class RedshiftMetadataExtractor(BasePostgresMetadataExtractor):
11 |     """
12 |     Extracts Redshift table and column metadata from the underlying meta store database using SQLAlchemyExtractor
13 | 
14 |     This differs from the PostgresMetadataExtractor because, in order to support Redshift's late binding views,
15 |     we need to join the INFORMATION_SCHEMA data against the function PG_GET_LATE_BINDING_VIEW_COLS().
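    Example (an illustrative sketch; the exact config keys are an assumption,
    following the BasePostgresMetadataExtractor conventions):

        conf = ConfigFactory.from_dict({
            'extractor.redshift_metadata.where_clause_suffix': "where c.table_schema = 'public'",
            'extractor.redshift_metadata.use_catalog_as_cluster_name': True,
        })
        extractor = RedshiftMetadataExtractor()
        extractor.init(Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))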
16 | """ 17 | 18 | def get_sql_statement(self, use_catalog_as_cluster_name, where_clause_suffix): 19 | # type: (bool, str) -> str 20 | if use_catalog_as_cluster_name: 21 | cluster_source = "CURRENT_DATABASE()" 22 | else: 23 | cluster_source = "'{}'".format(self._cluster) 24 | 25 | return """ 26 | SELECT 27 | * 28 | FROM ( 29 | SELECT 30 | {cluster_source} as cluster, 31 | c.table_schema as schema, 32 | c.table_name as name, 33 | pgtd.description as description, 34 | c.column_name as col_name, 35 | c.data_type as col_type, 36 | pgcd.description as col_description, 37 | ordinal_position as col_sort_order 38 | FROM INFORMATION_SCHEMA.COLUMNS c 39 | INNER JOIN 40 | pg_catalog.pg_statio_all_tables as st on c.table_schema=st.schemaname and c.table_name=st.relname 41 | LEFT JOIN 42 | pg_catalog.pg_description pgcd on pgcd.objoid=st.relid and pgcd.objsubid=c.ordinal_position 43 | LEFT JOIN 44 | pg_catalog.pg_description pgtd on pgtd.objoid=st.relid and pgtd.objsubid=0 45 | 46 | UNION 47 | 48 | SELECT 49 | {cluster_source} as cluster, 50 | view_schema as schema, 51 | view_name as name, 52 | NULL as description, 53 | column_name as col_name, 54 | data_type as col_type, 55 | NULL as col_description, 56 | ordinal_position as col_sort_order 57 | FROM 58 | PG_GET_LATE_BINDING_VIEW_COLS() 59 | COLS(view_schema NAME, view_name NAME, column_name NAME, data_type VARCHAR, ordinal_position INT) 60 | ) 61 | 62 | {where_clause_suffix} 63 | ORDER by cluster, schema, name, col_sort_order ; 64 | """.format( 65 | cluster_source=cluster_source, 66 | where_clause_suffix=where_clause_suffix, 67 | ) 68 | 69 | def get_scope(self): 70 | # type: () -> str 71 | return 'extractor.redshift_metadata' 72 | -------------------------------------------------------------------------------- /databuilder/rest_api/mode_analytics/mode_paginated_rest_api_query.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | 6 | import requests 7 | from jsonpath_rw import parse 8 | from typing import Any, Dict 9 | 10 | from databuilder.rest_api.rest_api_query import RestApiQuery 11 | 12 | # How many records considers as full and indicating there might be next page? In list reports on space API, it's 30. 13 | DEFAULT_MAX_RECORD_SIZE = 30 14 | PAGE_SUFFIX_TEMPLATE = '?page={}' 15 | LIST_REPORTS_PAGINATION_JSON_PATH = '_embedded.reports[*]' # So far this is the only paginated API that we need. 16 | 17 | LOGGER = logging.getLogger(__name__) 18 | 19 | 20 | class ModePaginatedRestApiQuery(RestApiQuery): 21 | """ 22 | Certain API such as get list of reports on a space is paginated with query term page. 23 | https://mode.com/developer/api-cookbook/management/get-all-reports/ 24 | 25 | This subclass makes sure to detect if there's more page and update URL to get next page. 26 | """ 27 | 28 | def __init__(self, 29 | pagination_json_path: str = LIST_REPORTS_PAGINATION_JSON_PATH, 30 | max_record_size: int = DEFAULT_MAX_RECORD_SIZE, 31 | **kwargs: Any 32 | ): 33 | # type (...) 
34 |         super(ModePaginatedRestApiQuery, self).__init__(**kwargs)
35 | 
36 |         self._original_url = self._url
37 |         self._max_record_size = max_record_size
38 |         self._current_page = 1
39 |         self._pagination_jsonpath_expr = parse(pagination_json_path)
40 | 
41 |     def _preprocess_url(self,
42 |                         record: Dict[str, Any],
43 |                         ) -> str:
44 |         """
45 |         Updates the URL with page information
46 |         :param record: the record whose fields are substituted into the URL template
47 |         :return: a URL that is ready to be called.
48 |         """
49 |         page_suffix = PAGE_SUFFIX_TEMPLATE.format(self._current_page)  # example: ?page=2
50 | 
51 |         # example: http://foo.bar/resources?page=2
52 |         self._url = self._original_url + page_suffix
53 |         return self._url.format(**record)
54 | 
55 |     def _post_process(self,
56 |                       response: requests.Response,
57 |                       ) -> None:
58 |         """
59 |         Updates the pagination trigger (self._more_pages) as well as the current page (self._current_page).
60 |         Mode has no explicit "more pages" indicator; a full page of records (at least self._max_record_size)
61 |         implies that there may be more records on the next page.
62 |         :return: None
63 |         """
64 | 
65 |         result_list = [match.value for match in self._pagination_jsonpath_expr.find(response.json())]
66 | 
67 |         if result_list and len(result_list) >= self._max_record_size:
68 |             self._more_pages = True
69 |             self._current_page = self._current_page + 1
70 |             return
71 | 
72 |         self._more_pages = False
73 |         self._current_page = 1
74 | 
-------------------------------------------------------------------------------- /databuilder/extractor/dashboard/mode_analytics/mode_dashboard_last_successful_executions_extractor.py: --------------------------------------------------------------------------------
1 | # Copyright Contributors to the Amundsen project.
2 | # SPDX-License-Identifier: Apache-2.0
3 | 
4 | import logging
5 | 
6 | from pyhocon import ConfigTree, ConfigFactory
7 | 
8 | from databuilder.extractor.dashboard.mode_analytics.mode_dashboard_executions_extractor import \
9 |     ModeDashboardExecutionsExtractor
10 | from databuilder.extractor.dashboard.mode_analytics.mode_dashboard_utils import ModeDashboardUtils
11 | from databuilder.extractor.restapi.rest_api_extractor import STATIC_RECORD_DICT
12 | from databuilder.models.dashboard.dashboard_execution import DashboardExecution
13 | from databuilder.rest_api.mode_analytics.mode_paginated_rest_api_query import ModePaginatedRestApiQuery
14 | from databuilder.rest_api.rest_api_query import RestApiQuery
15 | 
16 | LOGGER = logging.getLogger(__name__)
17 | 
18 | 
19 | class ModeDashboardLastSuccessfulExecutionExtractor(ModeDashboardExecutionsExtractor):
20 |     """
21 |     An Extractor that extracts a Mode dashboard's last successful run (execution) timestamp.
22 | 
23 |     """
24 | 
25 |     def __init__(self) -> None:
26 |         super(ModeDashboardLastSuccessfulExecutionExtractor, self).__init__()
27 | 
28 |     def init(self, conf: ConfigTree) -> None:
29 |         conf = conf.with_fallback(
30 |             ConfigFactory.from_dict({
31 |                 STATIC_RECORD_DICT: {'product': 'mode',
32 |                                      'execution_state': 'succeeded',
33 |                                      'execution_id': DashboardExecution.LAST_SUCCESSFUL_EXECUTION_ID}
34 |             })
35 |         )
36 |         super(ModeDashboardLastSuccessfulExecutionExtractor, self).init(conf)
37 | 
38 |     def get_scope(self) -> str:
39 |         return 'extractor.mode_dashboard_last_successful_execution'
40 | 
41 |     def _build_restapi_query(self) -> RestApiQuery:
42 |         """
43 |         Build the REST API query. To get a Mode dashboard's last successful execution, it needs to call two APIs
44 |         (the spaces API and the reports API) and join their results.
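        Illustrative flow (the values are hypothetical): the spaces query yields
        records such as {'organization': 'acme', 'dashboard_group_id': 'space_1'},
        and each record is formatted into the reports URL below, with the page
        suffix appended by ModePaginatedRestApiQuery, e.g.:

            https://app.mode.com/api/acme/spaces/space_1/reports?page=1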
45 |         :return: A RestApiQuery that provides a Mode dashboard's last successful execution (run)
46 |         """
47 | 
48 |         spaces_query = ModeDashboardUtils.get_spaces_query_api(conf=self._conf)
49 |         params = ModeDashboardUtils.get_auth_params(conf=self._conf)
50 | 
51 |         # Reports
52 |         # https://mode.com/developer/api-reference/analytics/reports/#listReportsInSpace
53 |         url = 'https://app.mode.com/api/{organization}/spaces/{dashboard_group_id}/reports'
54 |         json_path = '_embedded.reports[*].[token,last_successfully_run_at]'
55 |         field_names = ['dashboard_id', 'execution_timestamp']
56 |         last_successful_run_query = ModePaginatedRestApiQuery(query_to_join=spaces_query, url=url, params=params,
57 |                                                               json_path=json_path, field_names=field_names,
58 |                                                               skip_no_result=True)
59 | 
60 |         return last_successful_run_query
61 | 
-------------------------------------------------------------------------------- /databuilder/models/dashboard/dashboard_owner.py: --------------------------------------------------------------------------------
1 | # Copyright Contributors to the Amundsen project.
2 | # SPDX-License-Identifier: Apache-2.0
3 | 
4 | import logging
5 | 
6 | from typing import Optional, Any, Union, Iterator
7 | 
8 | from databuilder.models.dashboard.dashboard_metadata import DashboardMetadata
9 | from databuilder.models.graph_serializable import (
10 |     GraphSerializable)
11 | from databuilder.models.owner_constants import OWNER_OF_OBJECT_RELATION_TYPE, OWNER_RELATION_TYPE
12 | from databuilder.models.user import User
13 | 
14 | from databuilder.models.graph_node import GraphNode
15 | from databuilder.models.graph_relationship import GraphRelationship
16 | 
17 | LOGGER = logging.getLogger(__name__)
18 | 
19 | 
20 | class DashboardOwner(GraphSerializable):
21 |     """
22 |     A model that encapsulates a Dashboard's owner.
23 |     Note that it does not create a new User node, as it has insufficient information about the user; it only
24 |     builds the relation between the User and the Dashboard.
25 |     """
26 | 
27 |     DASHBOARD_EXECUTION_RELATION_TYPE = 'LAST_EXECUTED'
28 |     EXECUTION_DASHBOARD_RELATION_TYPE = 'LAST_EXECUTION_OF'  # note: these two execution constants are not referenced in this model
29 | 
30 |     def __init__(self,
31 |                  dashboard_group_id: str,
32 |                  dashboard_id: str,
33 |                  email: str,
34 |                  product: Optional[str] = '',
35 |                  cluster: str = 'gold',
36 |                  **kwargs: Any
37 |                  ) -> None:
38 |         self._dashboard_group_id = dashboard_group_id
39 |         self._dashboard_id = dashboard_id
40 |         self._email = email
41 |         self._product = product
42 |         self._cluster = cluster
43 | 
44 |         self._relation_iterator = self._create_relation_iterator()
45 | 
46 |     def create_next_node(self) -> Union[GraphNode, None]:
47 |         return None
48 | 
49 |     def create_next_relation(self) -> Union[GraphRelationship, None]:
50 |         try:
51 |             return next(self._relation_iterator)
52 |         except StopIteration:
53 |             return None
54 | 
55 |     def _create_relation_iterator(self) -> Iterator[GraphRelationship]:
56 |         relationship = GraphRelationship(
57 |             start_label=DashboardMetadata.DASHBOARD_NODE_LABEL,
58 |             end_label=User.USER_NODE_LABEL,
59 |             start_key=DashboardMetadata.DASHBOARD_KEY_FORMAT.format(
60 |                 product=self._product,
61 |                 cluster=self._cluster,
62 |                 dashboard_group=self._dashboard_group_id,
63 |                 dashboard_name=self._dashboard_id
64 |             ),
65 |             end_key=User.get_user_model_key(email=self._email),
66 |             type=OWNER_RELATION_TYPE,
67 |             reverse_type=OWNER_OF_OBJECT_RELATION_TYPE,
68 |             attributes={}
69 |         )
70 |         yield relationship
71 | 
72 |     def __repr__(self) -> str:
73 |         return 'DashboardOwner({!r}, {!r}, {!r}, {!r}, {!r})'.format(
74 |             self._dashboard_group_id,
75 |             self._dashboard_id,
76 |             self._email,
77 |             self._product,
78 |             self._cluster
79 |         )
80 | 
-------------------------------------------------------------------------------- /tests/unit/extractor/dashboard/tableau/test_tableau_dashboard_query_extractor.py: --------------------------------------------------------------------------------
1 | # Copyright Contributors to the Amundsen project.
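# (Illustrative sketch of the DashboardOwner model above, not part of the original
# files; the argument values are hypothetical. It emits no nodes and a single
# OWNER/OWNER_OF relationship:
#
#     owner = DashboardOwner(dashboard_group_id='dg', dashboard_id='d1',
#                            email='foo@bar.com', product='mode')
#     owner.create_next_node()       # -> None: no node is created for the owner
#     rel = owner.create_next_relation()
#     # rel.start_key == 'mode_dashboard://gold.dg/d1'  (DASHBOARD_KEY_FORMAT, default cluster 'gold')
#     # rel.end_key is derived via User.get_user_model_key(email='foo@bar.com'))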
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | import unittest 6 | from typing import Any, Dict 7 | 8 | from mock import patch 9 | from pyhocon import ConfigFactory 10 | 11 | from databuilder import Scoped 12 | from databuilder.extractor.dashboard.tableau.tableau_dashboard_query_extractor import TableauDashboardQueryExtractor 13 | from databuilder.extractor.dashboard.tableau.tableau_dashboard_utils \ 14 | import TableauDashboardAuth, TableauGraphQLApiExtractor 15 | 16 | 17 | logging.basicConfig(level=logging.INFO) 18 | 19 | 20 | def mock_query(*_args: Any, **_kwargs: Any) -> Dict[str, Any]: 21 | return { 22 | 'customSQLTables': [ 23 | { 24 | 'id': 'fake-query-id', 25 | 'name': 'Test Query', 26 | 'query': 'SELECT * FROM foo', 27 | 'downstreamWorkbooks': [ 28 | { 29 | 'name': 'Test Workbook', 30 | 'projectName': 'Test Project' 31 | } 32 | ] 33 | } 34 | ] 35 | } 36 | 37 | 38 | def mock_token(*_args: Any, **_kwargs: Any) -> str: 39 | return '123-abc' 40 | 41 | 42 | class TestTableauDashboardQuery(unittest.TestCase): 43 | 44 | @patch.object(TableauDashboardAuth, '_authenticate', mock_token) 45 | @patch.object(TableauGraphQLApiExtractor, 'execute_query', mock_query) 46 | def test_dashboard_query_extractor(self) -> None: 47 | 48 | config = ConfigFactory.from_dict({ 49 | 'extractor.tableau_dashboard_query.api_base_url': 'api_base_url', 50 | 'extractor.tableau_dashboard_query.api_version': 'tableau_api_version', 51 | 'extractor.tableau_dashboard_query.site_name': 'tableau_site_name', 52 | 'extractor.tableau_dashboard_query.tableau_personal_access_token_name': 53 | 'tableau_personal_access_token_name', 54 | 'extractor.tableau_dashboard_query.tableau_personal_access_token_secret': 55 | 'tableau_personal_access_token_secret', 56 | 'extractor.tableau_dashboard_query.excluded_projects': [], 57 | 'extractor.tableau_dashboard_query.cluster': 'tableau_dashboard_cluster', 58 | 'extractor.tableau_dashboard_query.database': 'tableau_dashboard_database', 59 | 'extractor.tableau_dashboard_query.transformer.timestamp_str_to_epoch.timestamp_format': 60 | '%Y-%m-%dT%H:%M:%SZ', 61 | 62 | }) 63 | 64 | extractor = TableauDashboardQueryExtractor() 65 | extractor.init(Scoped.get_scoped_conf(conf=config, scope=extractor.get_scope())) 66 | record = extractor.extract() 67 | 68 | self.assertEqual(record._query_name, 'Test Query') 69 | self.assertEqual(record._query_text, 'SELECT * FROM foo') 70 | self.assertEqual(record._dashboard_id, 'Test Workbook') 71 | self.assertEqual(record._dashboard_group_id, 'Test Project') 72 | 73 | 74 | if __name__ == '__main__': 75 | unittest.main() 76 | -------------------------------------------------------------------------------- /tests/unit/models/test_table_lineage.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0
3 | 
4 | import unittest
5 | 
6 | from databuilder.models.table_lineage import TableLineage
7 | from databuilder.models.graph_serializable import RELATION_START_KEY, RELATION_START_LABEL, RELATION_END_KEY, \
8 |     RELATION_END_LABEL, RELATION_TYPE, RELATION_REVERSE_TYPE
9 | from databuilder.serializers import neo4_serializer
10 | 
11 | 
12 | DB = 'hive'
13 | SCHEMA = 'base'
14 | TABLE = 'test'
15 | CLUSTER = 'default'
16 | 
17 | 
18 | class TestTableLineage(unittest.TestCase):
19 | 
20 |     def setUp(self) -> None:
21 |         super(TestTableLineage, self).setUp()
22 |         self.table_lineage = TableLineage(db_name=DB,
23 |                                           schema=SCHEMA,
24 |                                           table_name=TABLE,
25 |                                           cluster=CLUSTER,
26 |                                           downstream_deps=['hive://default.test_schema/test_table1',
27 |                                                            'hive://default.test_schema/test_table2'])
28 | 
29 |     def test_get_table_model_key(self) -> None:
30 |         metadata = self.table_lineage.get_table_model_key(db=DB,
31 |                                                           cluster=CLUSTER,
32 |                                                           schema=SCHEMA,
33 |                                                           table=TABLE)
34 |         self.assertEqual(metadata, 'hive://default.base/test')
35 | 
36 |     def test_create_nodes(self) -> None:
37 |         nodes = self.table_lineage.create_nodes()
38 |         self.assertEqual(len(nodes), 0)
39 | 
40 |     def test_create_relation(self) -> None:
41 |         relations = self.table_lineage.create_relation()
42 |         self.assertEqual(len(relations), 2)
43 | 
44 |         start_key = '{db}://{cluster}.{schema}/{tbl}'.format(db=DB,
45 |                                                              schema=SCHEMA,
46 |                                                              tbl=TABLE,
47 |                                                              cluster=CLUSTER)
48 |         end_key1 = '{db}://{cluster}.{schema}/{tbl}'.format(db=DB,
49 |                                                             schema='test_schema',
50 |                                                             tbl='test_table1',
51 |                                                             cluster=CLUSTER)
52 | 
53 |         relation = {
54 |             RELATION_START_KEY: start_key,
55 |             RELATION_START_LABEL: 'Table',
56 |             RELATION_END_KEY: end_key1,
57 |             RELATION_END_LABEL: 'Table',
58 |             RELATION_TYPE: TableLineage.ORIGIN_DEPENDENCY_RELATION_TYPE,
59 |             RELATION_REVERSE_TYPE: TableLineage.DEPENDENCY_ORIGIN_RELATION_TYPE
60 |         }
61 |         actual_relations = [
62 |             neo4_serializer.serialize_relationship(relation)
63 |             for relation in relations
64 |         ]
65 |         self.assertEqual(len(actual_relations), 2)
66 |         self.assertIn(relation, actual_relations)
67 | 
-------------------------------------------------------------------------------- /tests/unit/extractor/dashboard/tableau/test_tableau_dashboard_last_modified_extractor.py: --------------------------------------------------------------------------------
1 | # Copyright Contributors to the Amundsen project.
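# (Illustrative sketch mirroring the test above, not part of the original files:
# TableLineage emits no nodes, only origin -> downstream relations.
#
#     lineage = TableLineage(db_name='hive', schema='base', table_name='test',
#                            cluster='default',
#                            downstream_deps=['hive://default.test_schema/test_table1'])
#     rel = lineage.create_relation()[0]
#     serialized = neo4_serializer.serialize_relationship(rel)
#     # serialized[RELATION_START_KEY] == 'hive://default.base/test'
#     # serialized[RELATION_END_KEY] == 'hive://default.test_schema/test_table1')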
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | import unittest 6 | from typing import Any, Dict 7 | 8 | from mock import patch 9 | from pyhocon import ConfigFactory 10 | 11 | from databuilder import Scoped 12 | from databuilder.extractor.dashboard.tableau.tableau_dashboard_last_modified_extractor \ 13 | import TableauDashboardLastModifiedExtractor 14 | from databuilder.extractor.dashboard.tableau.tableau_dashboard_utils \ 15 | import TableauDashboardAuth, TableauGraphQLApiExtractor 16 | 17 | 18 | logging.basicConfig(level=logging.INFO) 19 | 20 | 21 | def mock_query(*_args: Any, **_kwargs: Any) -> Dict[str, Any]: 22 | return { 23 | 'workbooks': [ 24 | { 25 | 'id': 'fake-workbook-id', 26 | 'name': 'Test Workbook', 27 | 'projectName': 'Test Project', 28 | 'updatedAt': '2020-08-04T20:16:05Z' 29 | } 30 | ] 31 | } 32 | 33 | 34 | def mock_token(*_args: Any, **_kwargs: Any) -> str: 35 | return '123-abc' 36 | 37 | 38 | class TestTableauDashboardLastModified(unittest.TestCase): 39 | 40 | @patch.object(TableauDashboardAuth, '_authenticate', mock_token) 41 | @patch.object(TableauGraphQLApiExtractor, 'execute_query', mock_query) 42 | def test_dashboard_last_modified_extractor(self) -> None: 43 | 44 | config = ConfigFactory.from_dict({ 45 | 'extractor.tableau_dashboard_last_modified.api_base_url': 'api_base_url', 46 | 'extractor.tableau_dashboard_last_modified.api_version': 'tableau_api_version', 47 | 'extractor.tableau_dashboard_last_modified.site_name': 'tableau_site_name', 48 | 'extractor.tableau_dashboard_last_modified.tableau_personal_access_token_name': 49 | 'tableau_personal_access_token_name', 50 | 'extractor.tableau_dashboard_last_modified.tableau_personal_access_token_secret': 51 | 'tableau_personal_access_token_secret', 52 | 'extractor.tableau_dashboard_last_modified.excluded_projects': [], 53 | 'extractor.tableau_dashboard_last_modified.cluster': 'tableau_dashboard_cluster', 54 | 'extractor.tableau_dashboard_last_modified.database': 'tableau_dashboard_database', 55 | 'extractor.tableau_dashboard_last_modified.transformer.timestamp_str_to_epoch.timestamp_format': 56 | '%Y-%m-%dT%H:%M:%SZ', 57 | 58 | }) 59 | 60 | extractor = TableauDashboardLastModifiedExtractor() 61 | extractor.init(Scoped.get_scoped_conf(conf=config, scope=extractor.get_scope())) 62 | record = extractor.extract() 63 | 64 | self.assertEqual(record._dashboard_id, 'Test Workbook') 65 | self.assertEqual(record._dashboard_group_id, 'Test Project') 66 | self.assertEqual(record._product, 'tableau') 67 | self.assertEqual(record._cluster, 'tableau_dashboard_cluster') 68 | self.assertEqual(record._last_modified_timestamp, 1596572165) 69 | 70 | 71 | if __name__ == '__main__': 72 | unittest.main() 73 | -------------------------------------------------------------------------------- /tests/unit/models/dashboard/test_dashboard_last_modified.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
2 | # SPDX-License-Identifier: Apache-2.0
3 | 
4 | import unittest
5 | 
6 | from typing import Any, Dict
7 | 
8 | from databuilder.models.dashboard.dashboard_last_modified import DashboardLastModifiedTimestamp
9 | from databuilder.models.graph_serializable import RELATION_START_KEY, RELATION_START_LABEL, RELATION_END_KEY, \
10 |     RELATION_END_LABEL, RELATION_TYPE, RELATION_REVERSE_TYPE
11 | from databuilder.serializers import neo4_serializer
12 | 
13 | 
14 | class TestDashboardLastModifiedTimestamp(unittest.TestCase):
15 | 
16 |     def test_dashboard_timestamp_nodes(self) -> None:
17 |         dashboard_last_modified = DashboardLastModifiedTimestamp(last_modified_timestamp=123456789,
18 |                                                                  cluster='cluster_id',
19 |                                                                  product='product_id',
20 |                                                                  dashboard_id='dashboard_id',
21 |                                                                  dashboard_group_id='dashboard_group_id')
22 | 
23 |         actual = dashboard_last_modified.create_next_node()
24 |         actual_serialized = neo4_serializer.serialize_node(actual)
25 |         expected: Dict[str, Any] = {
26 |             'timestamp:UNQUOTED': 123456789,
27 |             'name': 'last_updated_timestamp',
28 |             'KEY': 'product_id_dashboard://cluster_id.dashboard_group_id/dashboard_id/_last_modified_timestamp',
29 |             'LABEL': 'Timestamp'
30 |         }
31 | 
32 |         assert actual is not None
33 |         self.assertDictEqual(actual_serialized, expected)
34 |         self.assertIsNone(dashboard_last_modified.create_next_node())
35 | 
36 |     def test_dashboard_last_modified_relations(self) -> None:
37 |         dashboard_last_modified = DashboardLastModifiedTimestamp(last_modified_timestamp=123456789,
38 |                                                                  cluster='cluster_id',
39 |                                                                  product='product_id',
40 |                                                                  dashboard_id='dashboard_id',
41 |                                                                  dashboard_group_id='dashboard_group_id')
42 | 
43 |         actual = dashboard_last_modified.create_next_relation()
44 |         actual_serialized = neo4_serializer.serialize_relationship(actual)
45 |         expected: Dict[str, Any] = {
46 |             RELATION_END_KEY: 'product_id_dashboard://cluster_id.dashboard_group_id/dashboard_id'
47 |                               '/_last_modified_timestamp',
48 |             RELATION_START_LABEL: 'Dashboard',
49 |             RELATION_END_LABEL: 'Timestamp',
50 |             RELATION_START_KEY: 'product_id_dashboard://cluster_id.dashboard_group_id/dashboard_id',
51 |             RELATION_TYPE: 'LAST_UPDATED_AT',
52 |             RELATION_REVERSE_TYPE: 'LAST_UPDATED_TIME_OF'
53 |         }
54 | 
55 |         assert actual is not None
56 |         self.assertDictEqual(actual_serialized, expected)
57 |         self.assertIsNone(dashboard_last_modified.create_next_relation())
58 | 
-------------------------------------------------------------------------------- /tests/unit/models/test_table_stats.py: --------------------------------------------------------------------------------
1 | # Copyright Contributors to the Amundsen project.
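# (Illustrative note, not part of the original file: the Timestamp node key asserted
# above is the dashboard key plus a '/_last_modified_timestamp' suffix, i.e.
#
#     '{product}_dashboard://{cluster}.{dashboard_group}/{dashboard}' + '/_last_modified_timestamp'
#
# which is why the node KEY and the relation END_KEY are both
# 'product_id_dashboard://cluster_id.dashboard_group_id/dashboard_id/_last_modified_timestamp'.)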
2 | # SPDX-License-Identifier: Apache-2.0
3 | 
4 | import unittest
5 | from databuilder.models.table_stats import TableColumnStats
6 | 
7 | from databuilder.models.graph_serializable import NODE_KEY, \
8 |     NODE_LABEL, RELATION_START_KEY, RELATION_START_LABEL, RELATION_END_KEY, \
9 |     RELATION_END_LABEL, RELATION_TYPE, RELATION_REVERSE_TYPE
10 | from databuilder.serializers import neo4_serializer
11 | 
12 | 
13 | class TestTableStats(unittest.TestCase):
14 | 
15 |     def setUp(self) -> None:
16 |         super(TestTableStats, self).setUp()
17 |         self.table_stats = TableColumnStats(table_name='base.test',
18 |                                             col_name='col',
19 |                                             stat_name='avg',
20 |                                             stat_val='1',
21 |                                             start_epoch='1',
22 |                                             end_epoch='2',)
23 | 
24 |         self.expected_node_result = {
25 |             NODE_KEY: 'hive://gold.base/test/col/avg/',
26 |             NODE_LABEL: 'Stat',
27 |             'stat_val': '1',
28 |             'stat_name': 'avg',
29 |             'start_epoch': '1',
30 |             'end_epoch': '2',
31 |         }
32 | 
33 |         self.expected_relation_result = {
34 |             RELATION_START_KEY: 'hive://gold.base/test/col/avg/',
35 |             RELATION_START_LABEL: 'Stat',
36 |             RELATION_END_KEY: 'hive://gold.base/test/col',
37 |             RELATION_END_LABEL: 'Column',
38 |             RELATION_TYPE: 'STAT_OF',
39 |             RELATION_REVERSE_TYPE: 'STAT'
40 |         }
41 | 
42 |     def test_get_table_stat_model_key(self) -> None:
43 |         table_stats = self.table_stats.get_table_stat_model_key()
44 |         self.assertEqual(table_stats, 'hive://gold.base/test/col/avg/')
45 | 
46 |     def test_get_col_key(self) -> None:
47 |         metadata = self.table_stats.get_col_key()
48 |         self.assertEqual(metadata, 'hive://gold.base/test/col')
49 | 
50 |     def test_create_nodes(self) -> None:
51 |         nodes = self.table_stats.create_nodes()
52 |         self.assertEqual(len(nodes), 1)
53 |         serialized_node = neo4_serializer.serialize_node(nodes[0])
54 |         self.assertEqual(serialized_node, self.expected_node_result)
55 | 
56 |     def test_create_relation(self) -> None:
57 |         relation = self.table_stats.create_relation()
58 | 
59 |         self.assertEqual(len(relation), 1)
60 |         serialized_relation = neo4_serializer.serialize_relationship(relation[0])
61 |         self.assertEqual(serialized_relation, self.expected_relation_result)
62 | 
63 |     def test_create_next_node(self) -> None:
64 |         next_node = self.table_stats.create_next_node()
65 |         serialized_node = neo4_serializer.serialize_node(next_node)
66 |         self.assertEqual(serialized_node, self.expected_node_result)
67 | 
68 |     def test_create_next_relation(self) -> None:
69 |         next_relation = self.table_stats.create_next_relation()
70 |         serialized_relation = neo4_serializer.serialize_relationship(next_relation)
71 |         self.assertEqual(serialized_relation, self.expected_relation_result)
72 | 
-------------------------------------------------------------------------------- /databuilder/extractor/dashboard/mode_analytics/mode_dashboard_last_modified_timestamp_extractor.py: --------------------------------------------------------------------------------
1 | # Copyright Contributors to the Amundsen project.
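# (Illustrative note, not part of the original file: TableColumnStats defaults the
# database to 'hive' and the cluster to 'gold', so table_name='base.test',
# col_name='col' and stat_name='avg' produce the Stat key
# 'hive://gold.base/test/col/avg/' asserted in the test above.)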
2 | # SPDX-License-Identifier: Apache-2.0
3 | 
4 | import logging
5 | 
6 | from pyhocon import ConfigTree, ConfigFactory
7 | 
8 | from databuilder.extractor.dashboard.mode_analytics.mode_dashboard_executions_extractor import \
9 |     ModeDashboardExecutionsExtractor
10 | from databuilder.extractor.dashboard.mode_analytics.mode_dashboard_utils import ModeDashboardUtils
11 | from databuilder.extractor.restapi.rest_api_extractor import STATIC_RECORD_DICT
12 | from databuilder.rest_api.mode_analytics.mode_paginated_rest_api_query import ModePaginatedRestApiQuery
13 | from databuilder.rest_api.rest_api_query import RestApiQuery
14 | from databuilder.transformer.dict_to_model import DictToModel, MODEL_CLASS
15 | from databuilder.transformer.timestamp_string_to_epoch import TimestampStringToEpoch, FIELD_NAME
16 | 
17 | LOGGER = logging.getLogger(__name__)
18 | 
19 | 
20 | class ModeDashboardLastModifiedTimestampExtractor(ModeDashboardExecutionsExtractor):
21 |     """
22 |     An Extractor that extracts a Mode dashboard's last modified timestamp.
23 | 
24 |     """
25 | 
26 |     def __init__(self) -> None:
27 |         super(ModeDashboardLastModifiedTimestampExtractor, self).__init__()
28 | 
29 |     def init(self, conf: ConfigTree) -> None:
30 |         conf = conf.with_fallback(
31 |             ConfigFactory.from_dict({
32 |                 STATIC_RECORD_DICT: {'product': 'mode'},
33 |                 '{}.{}'.format(DictToModel().get_scope(), MODEL_CLASS):
34 |                     'databuilder.models.dashboard.dashboard_last_modified.DashboardLastModifiedTimestamp',
35 |                 '{}.{}'.format(TimestampStringToEpoch().get_scope(), FIELD_NAME):
36 |                     'last_modified_timestamp'
37 |             })
38 |         )
39 |         super(ModeDashboardLastModifiedTimestampExtractor, self).init(conf)
40 | 
41 |     def get_scope(self) -> str:
42 |         return 'extractor.mode_dashboard_last_modified_timestamp_execution'
43 | 
44 |     def _build_restapi_query(self) -> RestApiQuery:
45 |         """
46 |         Build the REST API query. To get a Mode dashboard's last modified timestamp, it needs to call two APIs
47 |         (the spaces API and the reports API) and join their results.
48 |         :return: A RestApiQuery that provides a Mode dashboard's last modified timestamp
49 |         """
50 | 
51 |         spaces_query = ModeDashboardUtils.get_spaces_query_api(conf=self._conf)
52 |         params = ModeDashboardUtils.get_auth_params(conf=self._conf)
53 | 
54 |         # Reports
55 |         # https://mode.com/developer/api-reference/analytics/reports/#listReportsInSpace
56 |         url = 'https://app.mode.com/api/{organization}/spaces/{dashboard_group_id}/reports'
57 |         json_path = '_embedded.reports[*].[token,edited_at]'
58 |         field_names = ['dashboard_id', 'last_modified_timestamp']
59 |         last_modified_query = ModePaginatedRestApiQuery(query_to_join=spaces_query, url=url, params=params,
60 |                                                         json_path=json_path, field_names=field_names,
61 |                                                         skip_no_result=True)
62 | 
63 |         return last_modified_query
64 | 
-------------------------------------------------------------------------------- /tests/unit/models/test_table_source.py: --------------------------------------------------------------------------------
1 | # Copyright Contributors to the Amundsen project.
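# (Illustrative data flow for the extractor above; the record values are
# hypothetical: the REST API query emits records such as
# {'dashboard_id': 'abc123', 'last_modified_timestamp': '2020-08-04T20:16:05Z',
# 'product': 'mode', ...}; TimestampStringToEpoch converts the timestamp string
# into epoch seconds, and DictToModel then builds a DashboardLastModifiedTimestamp
# model from the dict, exactly as wired up in init().)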
2 | # SPDX-License-Identifier: Apache-2.0
3 | 
4 | import unittest
5 | 
6 | from databuilder.models.table_source import TableSource
7 | from databuilder.models.graph_serializable import RELATION_START_KEY, RELATION_START_LABEL, RELATION_END_KEY, \
8 |     RELATION_END_LABEL, RELATION_TYPE, RELATION_REVERSE_TYPE
9 | from databuilder.serializers import neo4_serializer
10 | 
11 | 
12 | DB = 'hive'
13 | SCHEMA = 'base'
14 | TABLE = 'test'
15 | CLUSTER = 'default'
16 | SOURCE = '/etl/sql/file.py'
17 | 
18 | 
19 | class TestTableSource(unittest.TestCase):
20 | 
21 |     def setUp(self) -> None:
22 |         super(TestTableSource, self).setUp()
23 |         self.table_source = TableSource(db_name=DB,
24 |                                         schema=SCHEMA,
25 |                                         table_name=TABLE,
26 |                                         cluster=CLUSTER,
27 |                                         source=SOURCE)
28 | 
29 |     def test_get_source_model_key(self) -> None:
30 |         source = self.table_source.get_source_model_key()
31 |         self.assertEqual(source, '{db}://{cluster}.{schema}/{tbl}/_source'.format(db=DB,
32 |                                                                                   schema=SCHEMA,
33 |                                                                                   tbl=TABLE,
34 |                                                                                   cluster=CLUSTER,
35 |                                                                                   ))
36 | 
37 |     def test_get_metadata_model_key(self) -> None:
38 |         metadata = self.table_source.get_metadata_model_key()
39 |         self.assertEqual(metadata, 'hive://default.base/test')
40 | 
41 |     def test_create_nodes(self) -> None:
42 |         nodes = self.table_source.create_nodes()
43 |         self.assertEqual(len(nodes), 1)
44 | 
45 |     def test_create_relation(self) -> None:
46 |         relations = self.table_source.create_relation()
47 |         self.assertEqual(len(relations), 1)
48 |         serialized_relation = neo4_serializer.serialize_relationship(relations[0])
49 | 
50 |         start_key = '{db}://{cluster}.{schema}/{tbl}/_source'.format(db=DB,
51 |                                                                      schema=SCHEMA,
52 |                                                                      tbl=TABLE,
53 |                                                                      cluster=CLUSTER)
54 |         end_key = '{db}://{cluster}.{schema}/{tbl}'.format(db=DB,
55 |                                                            schema=SCHEMA,
56 |                                                            tbl=TABLE,
57 |                                                            cluster=CLUSTER)
58 | 
59 |         expected_relation = {
60 |             RELATION_START_KEY: start_key,
61 |             RELATION_START_LABEL: TableSource.LABEL,
62 |             RELATION_END_KEY: end_key,
63 |             RELATION_END_LABEL: 'Table',
64 |             RELATION_TYPE: TableSource.SOURCE_TABLE_RELATION_TYPE,
65 |             RELATION_REVERSE_TYPE: TableSource.TABLE_SOURCE_RELATION_TYPE
66 |         }
67 | 
68 |         self.assertDictEqual(expected_relation, serialized_relation)
69 | 
-------------------------------------------------------------------------------- /databuilder/extractor/glue_extractor.py: --------------------------------------------------------------------------------
1 | # Copyright Contributors to the Amundsen project.
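# (Illustrative note, not part of the original file: TableSource appends a
# '/_source' suffix to the table key, so the Source node key is
# 'hive://default.base/test/_source' while the related Table key stays
# 'hive://default.base/test', matching the assertions in the test above.)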
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import boto3 5 | 6 | from pyhocon import ConfigFactory, ConfigTree 7 | from typing import Iterator, Union, Dict, Any, List 8 | 9 | from databuilder.extractor.base_extractor import Extractor 10 | from databuilder.models.table_metadata import TableMetadata, ColumnMetadata 11 | 12 | 13 | class GlueExtractor(Extractor): 14 | """ 15 | Extracts tables and columns metadata from AWS Glue metastore 16 | """ 17 | 18 | CLUSTER_KEY = 'cluster' 19 | FILTER_KEY = 'filters' 20 | DEFAULT_CONFIG = ConfigFactory.from_dict({CLUSTER_KEY: 'gold', FILTER_KEY: None}) 21 | 22 | def init(self, conf: ConfigTree) -> None: 23 | conf = conf.with_fallback(GlueExtractor.DEFAULT_CONFIG) 24 | self._cluster = '{}'.format(conf.get_string(GlueExtractor.CLUSTER_KEY)) 25 | self._filters = conf.get(GlueExtractor.FILTER_KEY) 26 | self._glue = boto3.client('glue') 27 | self._extract_iter: Union[None, Iterator] = None 28 | 29 | def extract(self) -> Union[TableMetadata, None]: 30 | if not self._extract_iter: 31 | self._extract_iter = self._get_extract_iter() 32 | try: 33 | return next(self._extract_iter) 34 | except StopIteration: 35 | return None 36 | 37 | def get_scope(self) -> str: 38 | return 'extractor.glue' 39 | 40 | def _get_extract_iter(self) -> Iterator[TableMetadata]: 41 | """ 42 | It gets all tables and yields TableMetadata 43 | :return: 44 | """ 45 | for row in self._get_raw_extract_iter(): 46 | columns, i = [], 0 47 | 48 | for column in row['StorageDescriptor']['Columns'] \ 49 | + row.get('PartitionKeys', []): 50 | columns.append(ColumnMetadata( 51 | column['Name'], 52 | column['Comment'] if 'Comment' in column else None, 53 | column['Type'], 54 | i 55 | )) 56 | i += 1 57 | 58 | yield TableMetadata( 59 | 'glue', 60 | self._cluster, 61 | row['DatabaseName'], 62 | row['Name'], 63 | row.get('Description') or row.get('Parameters', {}).get('comment'), 64 | columns, 65 | row.get('TableType') == 'VIRTUAL_VIEW', 66 | ) 67 | 68 | def _get_raw_extract_iter(self) -> Iterator[Dict[str, Any]]: 69 | """ 70 | Provides iterator of results row from glue client 71 | :return: 72 | """ 73 | tables = self._search_tables() 74 | return iter(tables) 75 | 76 | def _search_tables(self) -> List[Dict[str, Any]]: 77 | tables = [] 78 | kwargs = {} 79 | if self._filters is not None: 80 | kwargs['Filters'] = self._filters 81 | data = self._glue.search_tables(**kwargs) 82 | tables += data['TableList'] 83 | while 'NextToken' in data: 84 | token = data['NextToken'] 85 | kwargs['NextToken'] = token 86 | data = self._glue.search_tables(**kwargs) 87 | tables += data['TableList'] 88 | return tables 89 | -------------------------------------------------------------------------------- /tests/unit/models/test_table_last_updated.py: -------------------------------------------------------------------------------- 1 | # Copyright Contributors to the Amundsen project. 
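# (Illustrative usage sketch for the GlueExtractor above, not part of the original
# files; the database name and filter shape are hypothetical, following boto3's
# Glue search_tables Filters parameter:
#
#     from pyhocon import ConfigFactory
#     from databuilder import Scoped
#     from databuilder.extractor.glue_extractor import GlueExtractor
#
#     conf = ConfigFactory.from_dict({
#         'extractor.glue.filters': [{'Key': 'DatabaseName', 'Value': 'analytics'}],
#     })
#     extractor = GlueExtractor()
#     extractor.init(Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))
#     table = extractor.extract()  # a TableMetadata record, or None once exhausted)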
2 | # SPDX-License-Identifier: Apache-2.0
3 | 
4 | import unittest
5 | 
6 | from databuilder.models.graph_serializable import NODE_KEY, \
7 |     NODE_LABEL, RELATION_START_KEY, RELATION_START_LABEL, RELATION_END_KEY, \
8 |     RELATION_END_LABEL, RELATION_TYPE, RELATION_REVERSE_TYPE
9 | from databuilder.models.table_last_updated import TableLastUpdated
10 | from databuilder.models.timestamp import timestamp_constants
11 | from databuilder.serializers import neo4_serializer
12 | 
13 | 
14 | class TestTableLastUpdated(unittest.TestCase):
15 | 
16 |     def setUp(self) -> None:
17 |         super(TestTableLastUpdated, self).setUp()
18 | 
19 |         self.tableLastUpdated = TableLastUpdated(table_name='test_table',
20 |                                                  last_updated_time_epoch=25195665,
21 |                                                  schema='default')
22 | 
23 |         self.expected_node_result = {
24 |             NODE_KEY: 'hive://gold.default/test_table/timestamp',
25 |             NODE_LABEL: 'Timestamp',
26 |             'last_updated_timestamp:UNQUOTED': 25195665,
27 |             timestamp_constants.TIMESTAMP_PROPERTY + ":UNQUOTED": 25195665,
28 |             'name': 'last_updated_timestamp'
29 |         }
30 | 
31 |         self.expected_relation_result = {
32 |             RELATION_START_KEY: 'hive://gold.default/test_table',
33 |             RELATION_START_LABEL: 'Table',
34 |             RELATION_END_KEY: 'hive://gold.default/test_table/timestamp',
35 |             RELATION_END_LABEL: 'Timestamp',
36 |             RELATION_TYPE: 'LAST_UPDATED_AT',
37 |             RELATION_REVERSE_TYPE: 'LAST_UPDATED_TIME_OF'
38 |         }
39 | 
40 |     def test_create_next_node(self) -> None:
41 |         next_node = self.tableLastUpdated.create_next_node()
42 |         next_node_serialized = neo4_serializer.serialize_node(next_node)
43 |         self.assertEqual(next_node_serialized, self.expected_node_result)
44 | 
45 |     def test_create_next_relation(self) -> None:
46 |         next_relation = self.tableLastUpdated.create_next_relation()
47 |         next_relation_serialized = neo4_serializer.serialize_relationship(next_relation)
48 |         self.assertEqual(next_relation_serialized, self.expected_relation_result)
49 | 
50 |     def test_get_table_model_key(self) -> None:
51 |         table = self.tableLastUpdated.get_table_model_key()
52 |         self.assertEqual(table, 'hive://gold.default/test_table')
53 | 
54 |     def test_get_last_updated_model_key(self) -> None:
55 |         last_updated = self.tableLastUpdated.get_last_updated_model_key()
56 |         self.assertEqual(last_updated, 'hive://gold.default/test_table/timestamp')
57 | 
58 |     def test_create_nodes(self) -> None:
59 |         nodes = self.tableLastUpdated.create_nodes()
60 |         self.assertEqual(len(nodes), 1)
61 |         serialized_node = neo4_serializer.serialize_node(nodes[0])
62 |         self.assertEqual(serialized_node, self.expected_node_result)
63 | 
64 |     def test_create_relation(self) -> None:
65 |         relation = self.tableLastUpdated.create_relation()
66 |         self.assertEqual(len(relation), 1)
67 |         serialized_relation = neo4_serializer.serialize_relationship(relation[0])
68 |         self.assertEqual(serialized_relation, self.expected_relation_result)
69 | 
-------------------------------------------------------------------------------- /tests/unit/models/dashboard/test_dashboard_usage.py: --------------------------------------------------------------------------------
1 | # Copyright Contributors to the Amundsen project.
2 | # SPDX-License-Identifier: Apache-2.0
3 | 
4 | import unittest
5 | 
6 | from typing import Any, Dict
7 | 
8 | from databuilder.models.dashboard.dashboard_usage import DashboardUsage
9 | from databuilder.models.graph_serializable import RELATION_START_KEY, RELATION_START_LABEL, RELATION_END_KEY, \
10 |     RELATION_END_LABEL, RELATION_TYPE, RELATION_REVERSE_TYPE
11 | from databuilder.serializers import neo4_serializer
12 | 
13 | 
14 | class TestDashboardUsage(unittest.TestCase):
15 | 
16 |     def test_dashboard_usage_user_nodes(self) -> None:
17 |         dashboard_usage = DashboardUsage(dashboard_group_id='dashboard_group_id', dashboard_id='dashboard_id',
18 |                                          email='foo@bar.com', view_count=123, cluster='cluster_id',
19 |                                          product='product_id', should_create_user_node=True)
20 | 
21 |         actual = dashboard_usage.create_next_node()
22 |         actual_serialized = neo4_serializer.serialize_node(actual)
23 |         expected: Dict[str, Any] = {
24 |             'is_active:UNQUOTED': True,
25 |             'last_name': '',
26 |             'full_name': '',
27 |             'employee_type': '',
28 |             'first_name': '',
29 |             'updated_at:UNQUOTED': 0,
30 |             'LABEL': 'User',
31 |             'slack_id': '',
32 |             'KEY': 'foo@bar.com',
33 |             'github_username': '',
34 |             'team_name': '',
35 |             'email': 'foo@bar.com',
36 |             'role_name': ''
37 |         }
38 | 
39 |         assert actual is not None
40 |         self.assertDictEqual(expected, actual_serialized)
41 |         self.assertIsNone(dashboard_usage.create_next_node())
42 | 
43 |     def test_dashboard_usage_no_user_nodes(self) -> None:
44 |         dashboard_usage = DashboardUsage(dashboard_group_id='dashboard_group_id', dashboard_id='dashboard_id',
45 |                                          email='foo@bar.com', view_count=123,
46 |                                          should_create_user_node=False, cluster='cluster_id',
47 |                                          product='product_id')
48 | 
49 |         self.assertIsNone(dashboard_usage.create_next_node())
50 | 
51 |     def test_dashboard_usage_relations(self) -> None:
52 |         dashboard_usage = DashboardUsage(dashboard_group_id='dashboard_group_id', dashboard_id='dashboard_id',
53 |                                          email='foo@bar.com', view_count=123, cluster='cluster_id',
54 |                                          product='product_id')
55 | 
56 |         actual = dashboard_usage.create_next_relation()
57 |         actual_serialized = neo4_serializer.serialize_relationship(actual)
58 |         expected: Dict[str, Any] = {
59 |             'read_count:UNQUOTED': 123,
60 |             RELATION_END_KEY: 'foo@bar.com',
61 |             RELATION_START_LABEL: 'Dashboard',
62 |             RELATION_END_LABEL: 'User',
63 |             RELATION_START_KEY: 'product_id_dashboard://cluster_id.dashboard_group_id/dashboard_id',
64 |             RELATION_TYPE: 'READ_BY',
65 |             RELATION_REVERSE_TYPE: 'READ'
66 |         }
67 | 
68 |         assert actual is not None
69 |         self.assertDictEqual(expected, actual_serialized)
70 |         self.assertIsNone(dashboard_usage.create_next_relation())
71 | 
-------------------------------------------------------------------------------- /tests/unit/extractor/dashboard/tableau/test_tableau_dashboard_extractor.py: --------------------------------------------------------------------------------
1 | # Copyright Contributors to the Amundsen project.
2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | import logging 5 | import unittest 6 | from typing import Any, Dict 7 | 8 | from mock import patch 9 | from pyhocon import ConfigFactory 10 | 11 | from databuilder import Scoped 12 | from databuilder.extractor.dashboard.tableau.tableau_dashboard_extractor import TableauDashboardExtractor 13 | from databuilder.extractor.dashboard.tableau.tableau_dashboard_utils \ 14 | import TableauDashboardAuth, TableauGraphQLApiExtractor 15 | 16 | 17 | logging.basicConfig(level=logging.INFO) 18 | 19 | 20 | def mock_query(*_args: Any, **_kwargs: Any) -> Dict[str, Any]: 21 | return { 22 | 'workbooks': [ 23 | { 24 | 'id': 'fake-id', 25 | 'name': 'Test Workbook', 26 | 'createdAt': '2020-04-08T05:32:01Z', 27 | 'description': '', 28 | 'projectName': 'Test Project', 29 | 'projectVizportalUrlId': 123, 30 | 'vizportalUrlId': 456 31 | } 32 | ] 33 | } 34 | 35 | 36 | def mock_token(*_args: Any, **_kwargs: Any) -> str: 37 | return '123-abc' 38 | 39 | 40 | class TestTableauDashboardExtractor(unittest.TestCase): 41 | 42 | @patch.object(TableauDashboardAuth, '_authenticate', mock_token) 43 | @patch.object(TableauGraphQLApiExtractor, 'execute_query', mock_query) 44 | def test_dashboard_metadata_extractor(self) -> None: 45 | 46 | config = ConfigFactory.from_dict({ 47 | 'extractor.tableau_dashboard_metadata.api_base_url': 'api_base_url', 48 | 'extractor.tableau_dashboard_metadata.tableau_base_url': 'tableau_base_url', 49 | 'extractor.tableau_dashboard_metadata.api_version': 'tableau_api_version', 50 | 'extractor.tableau_dashboard_metadata.site_name': 'tableau_site_name', 51 | 'extractor.tableau_dashboard_metadata.tableau_personal_access_token_name': 52 | 'tableau_personal_access_token_name', 53 | 'extractor.tableau_dashboard_metadata.tableau_personal_access_token_secret': 54 | 'tableau_personal_access_token_secret', 55 | 'extractor.tableau_dashboard_metadata.excluded_projects': [], 56 | 'extractor.tableau_dashboard_metadata.cluster': 'tableau_dashboard_cluster', 57 | 'extractor.tableau_dashboard_metadata.database': 'tableau_dashboard_database', 58 | 'extractor.tableau_dashboard_metadata.transformer.timestamp_str_to_epoch.timestamp_format': 59 | '%Y-%m-%dT%H:%M:%SZ', 60 | 61 | }) 62 | 63 | extractor = TableauDashboardExtractor() 64 | extractor.init(Scoped.get_scoped_conf(conf=config, scope=extractor.get_scope())) 65 | record = extractor.extract() 66 | 67 | self.assertEqual(record.dashboard_id, 'Test Workbook') 68 | self.assertEqual(record.dashboard_name, 'Test Workbook') 69 | self.assertEqual(record.dashboard_group_id, 'Test Project') 70 | self.assertEqual(record.dashboard_group, 'Test Project') 71 | self.assertEqual(record.product, 'tableau') 72 | self.assertEqual(record.cluster, 'tableau_dashboard_cluster') 73 | self.assertEqual(record.created_timestamp, 1586323921) 74 | 75 | 76 | if __name__ == '__main__': 77 | unittest.main() 78 | --------------------------------------------------------------------------------