├── src ├── ploosh │ ├── version.py │ ├── __main__.py │ ├── engines │ │ ├── compare_engine.py │ │ ├── load_engine.py │ │ ├── load_engine_spark.py │ │ └── load_engine_native.py │ ├── exporters │ │ ├── exporter.py │ │ ├── __init__.py │ │ ├── exporter_csv.py │ │ ├── exporter_json.py │ │ └── exporter_trx.py │ ├── connectors │ │ ├── connector.py │ │ ├── connector_empty.py │ │ ├── connector_sql_spark.py │ │ ├── connector_delta_spark.py │ │ ├── connector_parquet_spark.py │ │ ├── connector_excel.py │ │ ├── connector_empty_spark.py │ │ ├── connector_json.py │ │ ├── connector_json_spark.py │ │ ├── __init__.py │ │ ├── connector_databricks.py │ │ ├── connector_parquet.py │ │ ├── connector_bigquery.py │ │ ├── connector_csv_spark.py │ │ ├── connector_snowflake.py │ │ ├── connector_postgresql.py │ │ ├── connector_mysql.py │ │ ├── connector_mssql.py │ │ ├── connector_odbc.py │ │ ├── connector_csv.py │ │ ├── connector_semantic_model_xmla.py │ │ └── connector_analysis_services.py │ ├── __init__.py │ ├── parameters.py │ ├── execute.py │ ├── logs.py │ └── case.py ├── setup-core.py ├── requirements.txt ├── setup-full.py └── setup.py ├── tests ├── .env │ ├── excel │ │ └── sales.xlsx │ ├── parquet │ │ └── sales.parquet │ ├── delta │ │ └── sales │ │ │ ├── part-00001-4cd9f0b3-de7c-470d-83e2-ede8afcbf39a-c000.snappy.parquet │ │ │ └── _delta_log │ │ │ └── 00000000000000000000.json │ ├── spark │ │ └── setup.sql │ └── csv │ │ ├── sales_with_cr.csv │ │ ├── sales_with_tab.csv │ │ ├── sales_with_comma.csv │ │ └── sales_with_iso_8859_1.csv ├── .data │ ├── sales_by_seller.csv │ └── sales.csv ├── connectors │ ├── test_excel.py │ ├── test_csv.py │ ├── test_parquet_spark.py │ ├── test_delta_spark.py │ ├── test_sql_spark.py │ ├── test_mysql.py │ ├── test_postgresql.py │ ├── test_mssql.py │ ├── test_json.py │ ├── test_parquet.py │ └── test_csv_spark.py └── load_engine │ ├── test_native.py │ └── test_spark.py ├── pyproject.toml ├── .vscode ├── settings.json └── launch.json ├── .gitignore ├── docs ├── connectors │ ├── native │ │ ├── empty.md │ │ ├── excel.md │ │ ├── parquet.md │ │ ├── big_query.md │ │ ├── odbc.md │ │ ├── databricks.md │ │ ├── snowflake.md │ │ ├── csv.md │ │ ├── postgresql.md │ │ ├── mysql.md │ │ └── sqlserver.md │ └── spark │ │ ├── empty.md │ │ ├── sql.md │ │ ├── delta.md │ │ └── csv.md ├── configuration │ ├── custom_parameters.md │ ├── command_line.md │ ├── spark.md │ └── options.md ├── exporters │ ├── trx.md │ ├── csv.md │ └── json.md ├── pipelines │ └── azure_devops.md └── home.md ├── .github └── workflows │ ├── linter.yml │ ├── release.yml │ └── unit_tests.yml ├── readme.md └── debug └── setup.sh /src/ploosh/version.py: -------------------------------------------------------------------------------- 1 | """Current version of ploosh""" 2 | 3 | PLOOSH_VERSION = "0.3.8" -------------------------------------------------------------------------------- /tests/.env/excel/sales.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CSharplie/ploosh/HEAD/tests/.env/excel/sales.xlsx -------------------------------------------------------------------------------- /tests/.env/parquet/sales.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CSharplie/ploosh/HEAD/tests/.env/parquet/sales.parquet -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | 
[tool.pytest.ini_options] 2 | addopts = [ 3 | "--import-mode=importlib", 4 | ] 5 | pythonpath = [ 6 | "src", "tests" 7 | ] -------------------------------------------------------------------------------- /src/setup-core.py: -------------------------------------------------------------------------------- 1 | """Setup script for ploosh light package""" 2 | 3 | from setup import setup_ploosh 4 | 5 | setup_ploosh("ploosh-core", []) 6 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.testing.pytestArgs": [ 3 | "tests" 4 | ], 5 | "python.testing.unittestEnabled": false, 6 | "python.testing.pytestEnabled": true 7 | } -------------------------------------------------------------------------------- /tests/.env/delta/sales/part-00001-4cd9f0b3-de7c-470d-83e2-ede8afcbf39a-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CSharplie/ploosh/HEAD/tests/.env/delta/sales/part-00001-4cd9f0b3-de7c-470d-83e2-ede8afcbf39a-c000.snappy.parquet -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .venv 2 | .pytest_cache 3 | __pycache__ 4 | /src/build 5 | /src/dist 6 | /src/ploosh.egg-info 7 | /src/ploosh_core.egg-info 8 | /src/ploosh/output 9 | /src/ploosh/logs 10 | /tests/connectors/connections/ 11 | /data 12 | /logs 13 | /test_cases 14 | /output 15 | /metastore_db 16 | /connections.yml 17 | spark_setup_file_tmp 18 | *.log 19 | -------------------------------------------------------------------------------- /src/ploosh/__main__.py: -------------------------------------------------------------------------------- 1 | """Automatized Testing Framework""" 2 | 3 | from execute import execute 4 | 5 | 6 | def main(): 7 | """Entry point for conda execution""" 8 | # Call the main execution function 9 | execute() 10 | 11 | 12 | # Check if the script is being run directly 13 | if __name__ == "__main__": 14 | # Call the main execution function 15 | main() 16 | -------------------------------------------------------------------------------- /tests/.data/sales_by_seller.csv: -------------------------------------------------------------------------------- 1 | seller_name,total_sales 2 | Jane Smith,615.00 3 | Emma Green,500.00 4 | Lucas Harris,500.00 5 | Henry King,492.00 6 | Harper Lewis,440.00 7 | Ethan White,415.00 8 | Alex Johnson,390.00 9 | Chris Brown,360.00 10 | John Doe,333.00 11 | Amelia Clark,330.00 12 | Sophia Wilson,320.00 13 | Mason Martinez,310.00 14 | Liam Brown,245.00 15 | Sarah White,222.00 16 | Isabella Moore,220.00 17 | Olivia Taylor,165.00 18 | Evelyn Walker,160.00 -------------------------------------------------------------------------------- /src/requirements.txt: -------------------------------------------------------------------------------- 1 | colorama==0.4.6 2 | PyYAML==6.0.1 3 | Pyjeb==0.2.1 4 | pytest==8.3.3 5 | numpy==1.26.3 6 | pandas==2.1.4 7 | openpyxl==3.1.2 8 | sqlalchemy==1.4.51 9 | pyspark==3.5.4 10 | pyodbc==5.0.1 11 | pymysql==1.1.0 12 | pg8000==1.30.3 13 | snowflake-sqlalchemy==1.5.1 14 | databricks-sql-connector==2.9.3 15 | sqlalchemy-bigquery==1.9.0 16 | google-cloud-bigquery-storage==2.24.0 17 | pandas-gbq==0.23.0 18 | pydata-google-auth==1.8.2 19 | azure-identity==1.19.0 20 | delta-spark==3.3.0 21 | deltalake==0.23.2 22 | 
pyadomd==0.1.1 -------------------------------------------------------------------------------- /tests/.env/spark/setup.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS sales; 2 | 3 | CREATE EXTERNAL TABLE IF NOT EXISTS sales ( 4 | sale_id INT, 5 | seller_name STRING, 6 | card_name STRING, 7 | card_rarity STRING, 8 | card_condition STRING, 9 | price DOUBLE, 10 | quantity INT, 11 | sale_date DATE, 12 | card_set STRING, 13 | buyer_name STRING, 14 | transaction_status STRING 15 | ) 16 | USING csv 17 | OPTIONS ( 18 | path '{{pwd}}/tests/.data/sales.csv', 19 | header 'true', 20 | inferSchema 'true' 21 | ); -------------------------------------------------------------------------------- /src/ploosh/engines/compare_engine.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=R0903,W0613 2 | """ Base class for the comparison engines """ 3 | 4 | class CompareEngine: 5 | """Base class for the comparison engines""" 6 | 7 | success_rate = 1 8 | error_type = None 9 | error_message = None 10 | df_compare_gap = None 11 | df_source = None 12 | df_expected = None 13 | options = None 14 | mode = None 15 | 16 | def compare(self) -> bool: 17 | """Compare the source and expected datasets""" 18 | return False 19 | -------------------------------------------------------------------------------- /src/setup-full.py: -------------------------------------------------------------------------------- 1 | """Setup script for ploosh package""" 2 | 3 | from setup import setup_ploosh 4 | 5 | install_requires = [ 6 | "pyodbc==5.0.1", 7 | "pymysql==1.1.0", 8 | "pg8000==1.30.3", 9 | "snowflake-sqlalchemy==1.5.1", 10 | "databricks-sql-connector==2.9.3", 11 | "sqlalchemy-bigquery==1.9.0", 12 | "google-cloud-bigquery-storage==2.24.0", 13 | "pandas-gbq==0.23.0", 14 | "pydata-google-auth==1.8.2", 15 | "azure-identity==1.19.0", 16 | "pyadomd==0.1.1" 17 | ] 18 | 19 | setup_ploosh("ploosh", install_requires) 20 | -------------------------------------------------------------------------------- /docs/connectors/native/empty.md: -------------------------------------------------------------------------------- 1 | This connector is used to return an empty dataframe with 0 rows and 0 columns 2 | 3 | # Connection configuration 4 | No connection is required by this connector 5 | 6 | # Test case configuration 7 | ## Test case configuration 8 | Test case configuration parameter is required by this connector 9 | 10 | ## Example 11 | ``` yaml 12 | Example Empty: 13 | source: 14 | connection: mysql_example 15 | type: mysql 16 | query: | 17 | select * 18 | from employees 19 | where hire_date < "2000-01-01" 20 | expected: 21 | type: empty 22 | ``` -------------------------------------------------------------------------------- /docs/configuration/custom_parameters.md: -------------------------------------------------------------------------------- 1 | # Custom parameters usefulness 2 | It is possible to add a custom parameter in the connection configuration file to avoid hardcoding sensitive information like passwords or credentials. 3 | 4 | To use a custom parameter, you need to add a parameter in the connection configuration file and use it in the connection configuration. The parameter value can be passed as an environment variable or as a command line argument. 5 | 6 | # Syntax and usage 7 | The syntax of for the parameter is `$var.` in the connection configuration file. 
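A minimal sketch of the pattern (the connection name, parameter name and values are illustrative; the `$var.` prefix follows the convention used in the connector examples elsewhere in these docs):

``` yaml
# connections.yml
mysql_example:
  type: mysql
  hostname: localhost
  username: ploosh
  password: $var.mysql_password   # resolved at runtime instead of being stored in the file
  database: ploosh
```

The value would then be supplied when running ploosh, for example through the `--p_mysql_password` command line argument or through an environment variable.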
8 | 9 | The parameter value can be passed as a command line argument using the `--p_` option. -------------------------------------------------------------------------------- /src/ploosh/engines/load_engine.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=R0903,W0613 2 | """Base class for all load engines""" 3 | 4 | class LoadEngine: 5 | """Base class for all load engines""" 6 | 7 | count = None 8 | configuration = None 9 | options = None 10 | connection = None 11 | df_data = None 12 | 13 | def get_insensitive_item(self, name: str, items: list) -> str: 14 | """Get item from list case-insensitively""" 15 | for item in items: 16 | if name.upper().strip() == item.upper().strip(): 17 | return item 18 | return name 19 | 20 | def execute(self): 21 | """Execute the load engine""" 22 | return None 23 | -------------------------------------------------------------------------------- /src/ploosh/exporters/exporter.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=R0903,W0613 2 | """Test result exporter""" 3 | from datetime import datetime 4 | 5 | 6 | class Exporter: 7 | """Test result exporter""" 8 | name = None # Name of the exporter 9 | output_path = None # Output path for the exported results 10 | 11 | @staticmethod 12 | def date_to_string(data): 13 | """Convert datetime to string in ISO 8601 format""" 14 | if not isinstance(data, datetime): 15 | return None 16 | 17 | return data.strftime("%Y-%m-%dT%H:%M:%SZ") 18 | 19 | def export(self, cases: dict): 20 | """Export test case results to the destination""" 21 | return None 22 | -------------------------------------------------------------------------------- /src/ploosh/connectors/connector.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=R0903,W0613 2 | """Connector to access to remote data""" 3 | 4 | 5 | class Connector: 6 | """Connector to access to remote data""" 7 | name = None # Name of the connector 8 | connection_definition = None # Definition of the connection parameters 9 | configuration_definition = None # Definition of the configuration parameters 10 | is_spark = False # Flag to indicate if the connector uses Spark 11 | spark = None # Spark session object 12 | 13 | def get_data(self, configuration: dict, connection: dict): 14 | """Get data from connector""" 15 | return None # This method should be overridden by subclasses to fetch data 16 | -------------------------------------------------------------------------------- /docs/connectors/native/excel.md: -------------------------------------------------------------------------------- 1 | This connector is used to read Excel files from local file system. 
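As a quick orientation, here is a minimal test case sketch. The parameter names (`path`, `sheet_name` and the optional `skiprows`) are the ones declared by the connector implementation in `connector_excel.py`; the file paths are placeholders.

``` yaml
Example Excel:
  source:
    type: excel
    path: data/employees.xlsx
    sheet_name: employees
    skiprows: 0
  expected:
    type: csv
    path: data/employees.csv
```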
2 | 3 | # Connection configuration 4 | No connection is required by this connector 5 | 6 | # Test case configuration 7 | # Definition 8 | 9 | | Name | Mandatory | Default | Description | 10 | |-----------|:---------:|:-------:|-------------| 11 | | path | yes | | The path to the Excel file to read | 12 | | sheet | yes | | The sheet name or index to read from the Excel file 13 | 14 | # Example 15 | 16 | ``` yaml 17 | Example Excel: 18 | source: 19 | type: mysql 20 | 21 | expected: 22 | type: excel 23 | path: data/employees.xlsx 24 | sheet: employees 25 | 26 | -------------------------------------------------------------------------------- /docs/configuration/command_line.md: -------------------------------------------------------------------------------- 1 | # Mandatory arguments 2 | - `--connection `: The connection file name to use 3 | - `--cases `: The test case folder where the test cases yaml files are stored 4 | 5 | # Optional arguments 6 | - `--export `: The export format to use. Can be `JSON`, `CSV` or `TRX`. Default is `JSON` 7 | - `--filter `: A wildcard to filter the test cases to execute. The default value is `*.yml` which means all the test cases will be executed 8 | - `--p_ `: The parameter value to use. The parameter name is the name of the parameter in the connection file. The parameter value can be passed as an environment variable or as a command line argument -------------------------------------------------------------------------------- /.github/workflows/linter.yml: -------------------------------------------------------------------------------- 1 | name: 'Linter' 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - main 7 | - develop 8 | paths: 9 | - 'src/**' 10 | - 'tests/**' 11 | jobs: 12 | lint: 13 | name: 'Lint code' 14 | runs-on: ubuntu-22.04 15 | defaults: 16 | run: 17 | shell: bash 18 | steps: 19 | - name: Checkout 20 | uses: actions/checkout@v3 21 | - name: Set up Python 22 | uses: actions/setup-python@v5 23 | with: 24 | python-version: "3.12.8" 25 | - name: Install requirements 26 | run: | 27 | pip install -r src/requirements.txt 28 | pip install pylint==3.3.3 29 | - name: Lint code 30 | run: | 31 | pylint --fail-under=9.3 src/ 32 | -------------------------------------------------------------------------------- /src/ploosh/connectors/connector_empty.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=R0903 2 | """Connector to return empty""" 3 | 4 | import pandas as pd 5 | from connectors.connector import Connector 6 | 7 | 8 | class ConnectorEmpty(Connector): 9 | """Connector to return empty""" 10 | 11 | def __init__(self): 12 | # Initialize the connector with its name and empty definitions 13 | self.name = "EMPTY" 14 | self.connection_definition = [] # No specific connection parameters required 15 | self.configuration_definition = [] # No specific configuration parameters required 16 | 17 | def get_data(self, configuration: dict, connection: dict): 18 | """Return empty value""" 19 | # Create an empty pandas DataFrame 20 | df = pd.DataFrame() 21 | return df 22 | -------------------------------------------------------------------------------- /docs/exporters/trx.md: -------------------------------------------------------------------------------- 1 | # Structure 2 | ``` 3 | output/ 4 | ├─ trx/ 5 | │ ├─ test_results.xml 6 | │ ├─ test_results/ 7 | │ │ ├─ execution ID (guid)/ 8 | │ │ │ ├─ test case 1.xlsx 9 | │ │ ├─ execution ID (guid)/ 10 | │ │ │ ├─ test case 2.xlsx 11 | │ │ └─ ... 
12 | ``` 13 | 14 | # test_results.xml 15 | The `test_results.xml` file use the TRX format (Visual Studio Test Results File). It will contain the details of the test cases results in XML format. 16 | 17 | This file can be opened with Visual Studio or any other tool that support the TRX format. 18 | 19 | It can be used with Azure DevOps to publish the test results. 20 | 21 | # test_results folder 22 | The `test_results` folder will contain one xlsx file per test case. Each file will contain a sheet with the gap between the source and the expected dataset -------------------------------------------------------------------------------- /docs/connectors/spark/empty.md: -------------------------------------------------------------------------------- 1 | This connector is used to return an empty dataframe with 0 rows and 0 columns 2 | 3 | ⚠️ A spark connector can be use only with another spark connector. It is not possible to use a spark connector with a non spark connector. 4 | 5 | See [Spark documentation](/docs/configuration-spark-mode/) for more information. 6 | 7 | # Connection configuration 8 | No connection is required by this connector 9 | 10 | # Test case configuration 11 | ## Test case configuration 12 | Test case configuration parameter is required by this connector 13 | 14 | ## Example 15 | ``` yaml 16 | Example Empty Spark: 17 | source: 18 | type: sql_spark 19 | query: | 20 | select * 21 | from employees 22 | where hire_date < "2000-01-01" 23 | expected: 24 | type: empty_spark 25 | ``` -------------------------------------------------------------------------------- /tests/connectors/test_excel.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import pytest 4 | from pyjeb import control_and_setup 5 | from ploosh.connectors.connector_excel import ConnectorExcel 6 | 7 | @pytest.fixture 8 | def connector(): 9 | return ConnectorExcel() 10 | 11 | @pytest.fixture 12 | def df_sales(): 13 | return pd.read_csv("./tests/.data/sales.csv", delimiter=",", date_format = "%Y-%m-%d", parse_dates=["sale_date"]) 14 | 15 | def test_get_data(connector, df_sales): 16 | configuration = { 17 | "path": "./tests/.env/excel/sales.xlsx", 18 | "sheet_name": "sales" 19 | } 20 | 21 | configuration = control_and_setup(configuration, connector.configuration_definition) 22 | 23 | df_test = connector.get_data(configuration, None) 24 | 25 | assert len(df_test.compare(df_sales)) == 0 -------------------------------------------------------------------------------- /src/ploosh/connectors/connector_sql_spark.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=R0903 2 | """Connector to read SQL file""" 3 | 4 | from connectors.connector import Connector 5 | 6 | 7 | class ConnectorSQLSpark(Connector): 8 | """Connector to execute SQL query over Spark""" 9 | 10 | def __init__(self): 11 | # Initialize the connector with its name and indicate it uses Spark 12 | self.name = "SQL_SPARK" 13 | self.is_spark = True 14 | self.connection_definition = [] # No specific connection parameters required 15 | self.configuration_definition = [ 16 | {"name": "query"} # SQL query to execute 17 | ] 18 | 19 | def get_data(self, configuration: dict, connection: dict): 20 | """Get data from source""" 21 | 22 | # Execute the SQL query using Spark and return the resulting DataFrame 23 | df = self.spark.sql(configuration["query"]) 24 | 25 | return df 26 | 
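A minimal usage sketch for this connector, modelled on `tests/connectors/test_sql_spark.py` (the import path assumes `src` is on the Python path as configured in `pyproject.toml`; the local Spark master and the `sales` table are assumptions, and in normal runs the framework injects the Spark session itself rather than the caller):

``` python
from pyspark.sql import SparkSession
from ploosh.connectors.connector_sql_spark import ConnectorSQLSpark

# Build a Spark session; ploosh normally does this and assigns it to every
# Spark-based connector through its `spark` attribute.
spark = SparkSession.builder \
    .appName("ploosh") \
    .master("local[*]") \
    .getOrCreate()

connector = ConnectorSQLSpark()
connector.spark = spark

# "query" is the only configuration parameter declared by this connector;
# no connection parameters are required, so an empty dict is passed.
df = connector.get_data({"query": "select * from sales"}, {})
df.show()
```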
-------------------------------------------------------------------------------- /docs/connectors/spark/sql.md: -------------------------------------------------------------------------------- 1 | This connector is used to execute spark SQL. 2 | 3 | ⚠️ A spark connector can be use only with another spark connector. It is not possible to use a spark connector with a non spark connector. 4 | 5 | See [Spark documentation](/docs/configuration-spark-mode/) for more information. 6 | 7 | # Connection configuration 8 | No connection is required by this connector 9 | 10 | # Configuration 11 | ## Test case configuration 12 | | Name | Mandatory | Default | Description | 13 | |-------------------|:---------:|:-----------------------------:|-------------| 14 | | query | yes | | The query to execute to the database 15 | 16 | ## Example 17 | ``` yaml 18 | Example Empty Spark: 19 | source: 20 | type: sql_spark 21 | query: | 22 | select * 23 | from employees 24 | where hire_date < "2000-01-01" 25 | expected: 26 | type: empty_spark 27 | ``` -------------------------------------------------------------------------------- /src/ploosh/connectors/connector_delta_spark.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=R0903 2 | """Connector to read Delta table""" 3 | 4 | from connectors.connector import Connector 5 | 6 | 7 | class ConnectorDeltaSpark(Connector): 8 | """Connector to read Delta table with Spark""" 9 | 10 | def __init__(self): 11 | # Initialize the connector with its name and configuration definitions 12 | self.name = "DELTA_SPARK" 13 | self.is_spark = True # Indicates that this connector uses Spark 14 | self.connection_definition = [] # No specific connection parameters required 15 | self.configuration_definition = [ 16 | {"name": "path"}, # Path to the Delta table 17 | ] 18 | 19 | def get_data(self, configuration: dict, connection: dict): 20 | """Get data from source""" 21 | 22 | # Read the Delta table using Spark with the specified path 23 | df = self.spark.read.format("delta").load(configuration["path"]) 24 | 25 | return df 26 | -------------------------------------------------------------------------------- /src/ploosh/connectors/connector_parquet_spark.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=R0903 2 | """Connector to read Parquet file""" 3 | 4 | from connectors.connector import Connector 5 | 6 | 7 | class ConnectorParquetSpark(Connector): 8 | """Connector to read Parquet file""" 9 | 10 | def __init__(self): 11 | # Initialize the connector with its name and configuration definitions 12 | self.name = "PARQUET_SPARK" 13 | self.is_spark = True 14 | self.connection_definition = [] # No specific connection parameters required 15 | self.configuration_definition = [ 16 | {"name": "path"} # Path to the Parquet file 17 | ] 18 | 19 | def get_data(self, configuration: dict, connection: dict): 20 | """Get data from source""" 21 | 22 | # Extract the path and configuration parameters 23 | path = configuration["path"] 24 | 25 | # Read the Parquet file using pandas 26 | df = self.spark.read.parquet(path) 27 | 28 | return df 29 | -------------------------------------------------------------------------------- /src/ploosh/engines/load_engine_spark.py: -------------------------------------------------------------------------------- 1 | from engines.load_engine import LoadEngine 2 | 3 | class LoadEngineSpark(LoadEngine): 4 | """Load engine for Spark""" 5 | 6 | def __init__(self, 
configuration, options, connection): 7 | """Initialize the LoadEngineSpark class""" 8 | 9 | self.configuration = configuration 10 | self.options = options 11 | self.connection = connection 12 | 13 | def execute(self, df_data): 14 | """Execute the load engine""" 15 | self.count = df_data.count() 16 | 17 | # Cast columns to specified types 18 | for column in self.options["cast"]: 19 | column_name = self.get_insensitive_item(column["name"], df_data.columns) 20 | column_type = column["type"] 21 | if column_type == "datetime": 22 | column_type = "timestamp" 23 | 24 | df_data = df_data.withColumn(column_name, df_data[column_name].cast(column_type)) 25 | 26 | return df_data 27 | -------------------------------------------------------------------------------- /docs/connectors/spark/delta.md: -------------------------------------------------------------------------------- 1 | This connector is used to read Detla table files using Spark. 2 | 3 | ⚠️ A spark connector can be use only with another spark connector. It is not possible to use a spark connector with a non spark connector. 4 | 5 | See [Spark documentation](/docs/configuration-spark-mode/) for more information. 6 | 7 | # Connection configuration 8 | No connection is required by this connector 9 | 10 | # Configuration 11 | ## Test case configuration 12 | | Name | Mandatory | Default | Description | 13 | |-------------------|:---------:|:-----------------------------:|-------------| 14 | | path | yes | | Path to the Delta table 15 | 16 | ## Example 17 | ``` yaml 18 | Example Delta Spark: 19 | source: 20 | type: delta_spark 21 | path: data/employees 22 | expected: 23 | type: sql_spark 24 | query: | 25 | select * 26 | from employees 27 | where hire_date < "2000-01-01" 28 | ``` 29 | -------------------------------------------------------------------------------- /tests/connectors/test_csv.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import pytest 4 | from pyjeb import control_and_setup 5 | from ploosh.connectors.connector_csv import ConnectorCSV 6 | 7 | @pytest.fixture 8 | def connector(): 9 | return ConnectorCSV() 10 | 11 | @pytest.fixture 12 | def df_sales(): 13 | return pd.read_csv("./tests/.data/sales.csv", delimiter=",") 14 | 15 | def test_connection_with_tabulation(connector, df_sales): 16 | configuration = { 17 | "path": "./tests/.env/csv/sales_with_tab.csv", 18 | "delimiter": "\t" 19 | } 20 | 21 | configuration = control_and_setup(configuration, connector.configuration_definition) 22 | 23 | df_test = connector.get_data(configuration, None) 24 | 25 | assert len(df_test.compare(df_sales)) == 0 26 | 27 | def test_connection_with_default(connector, df_sales): 28 | configuration = { 29 | "path": "./tests/.env/csv/sales_with_comma.csv" 30 | } 31 | 32 | configuration = control_and_setup(configuration, connector.configuration_definition) 33 | 34 | df_test = connector.get_data(configuration, None) 35 | 36 | assert len(df_test.compare(df_sales)) == 0 -------------------------------------------------------------------------------- /src/ploosh/connectors/connector_excel.py: -------------------------------------------------------------------------------- 1 | """Connector to read Excel file""" 2 | 3 | import pandas as pd 4 | from connectors.connector import Connector 5 | 6 | 7 | class ConnectorExcel(Connector): 8 | """Connector to read Excel file""" 9 | 10 | def __init__(self): 11 | # Initialize the connector with its name and configuration definitions 12 | self.name = 
"EXCEL" 13 | self.connection_definition = [] # No specific connection parameters required 14 | self.configuration_definition = [ 15 | {"name": "path"}, # Path to the Excel file 16 | {"name": "sheet_name"}, # Sheet name 17 | {"name": "skiprows", "type": "integer", "default": 0}, # Number of rows to skip 18 | ] 19 | 20 | def get_data(self, configuration: dict, connection: dict): 21 | """Get data from source""" 22 | # Read the Excel file using pandas with the specified configuration options 23 | df = pd.read_excel( 24 | configuration["path"], 25 | sheet_name=configuration["sheet_name"], 26 | skiprows=configuration["skiprows"], 27 | ) 28 | return df 29 | -------------------------------------------------------------------------------- /src/ploosh/connectors/connector_empty_spark.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=R0903 2 | """Connector to return empty""" 3 | 4 | from pyspark.sql.types import StructType 5 | from connectors.connector import Connector 6 | 7 | 8 | class ConnectorEmptySpark(Connector): 9 | """Connector to return empty""" 10 | 11 | def __init__(self): 12 | # Initialize the connector with its name and indicate it uses Spark 13 | self.name = "EMPTY_SPARK" 14 | self.is_spark = True 15 | self.connection_definition = [] # No specific connection parameters required 16 | self.configuration_definition = [] # No specific configuration parameters required 17 | 18 | def get_data(self, configuration: dict, connection: dict): 19 | """Return empty value""" 20 | 21 | # Create an empty RDD (Resilient Distributed Dataset) 22 | empty_rdd = self.spark.sparkContext.emptyRDD() 23 | 24 | # Define an empty schema (no columns) 25 | columns = StructType([]) 26 | 27 | # Create an empty DataFrame using the empty RDD and schema 28 | df = self.spark.createDataFrame(data = empty_rdd, schema = columns) 29 | 30 | return df 31 | -------------------------------------------------------------------------------- /src/ploosh/exporters/__init__.py: -------------------------------------------------------------------------------- 1 | """Result exporter""" 2 | from importlib import import_module 3 | import os 4 | import inspect 5 | 6 | 7 | def get_exporters(): 8 | """Get all existing exporters""" 9 | connectors = {} 10 | 11 | # List all Python files in the current directory that start with "exporter_" 12 | files = [ 13 | name 14 | for name in os.listdir(os.path.dirname(__file__)) 15 | if name.endswith(".py") and name.startswith("exporter_") 16 | ] 17 | 18 | for file in files: 19 | module_name = file[:-3] # Remove the ".py" extension to get the module name 20 | 21 | # Import the module dynamically 22 | module = import_module(f"exporters.{module_name}") 23 | 24 | # Inspect the module to find classes that start with "Exporter" 25 | for name, obj in inspect.getmembers(module): 26 | if inspect.isclass(obj) and name.startswith("Exporter"): 27 | current_connector = obj() # Instantiate the exporter class 28 | connectors[ 29 | current_connector.name 30 | ] = current_connector # Add the exporter to the connectors dictionary 31 | 32 | return connectors 33 | -------------------------------------------------------------------------------- /tests/connectors/test_parquet_spark.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | from pyspark.sql import SparkSession 4 | import pytest 5 | from pyjeb import control_and_setup 6 | from ploosh.engines.load_engine_spark import LoadEngineSpark 7 | from 
ploosh.configuration import Configuration 8 | from ploosh.connectors.connector_parquet_spark import ConnectorParquetSpark 9 | 10 | @pytest.fixture 11 | def connector(): 12 | spark = SparkSession.builder \ 13 | .appName("ploosh") \ 14 | .master("spark://localhost:7077") \ 15 | .config("spark.executor.memory", "1g") \ 16 | .config("spark.driver.memory", "1g") \ 17 | .getOrCreate() 18 | 19 | connector = ConnectorParquetSpark() 20 | connector.spark = spark 21 | 22 | return connector 23 | 24 | @pytest.fixture 25 | def df_sales(): 26 | return pd.read_csv("./tests/.data/sales.csv", delimiter=",") 27 | 28 | def test_default(connector, df_sales): 29 | configuration = { 30 | "path": "./tests/.env/parquet/sales.parquet", 31 | } 32 | 33 | configuration = control_and_setup(configuration, connector.configuration_definition) 34 | df_test = connector.get_data(configuration, {}).toPandas() 35 | 36 | assert len(df_test.compare(df_sales)) == 0 37 | -------------------------------------------------------------------------------- /docs/connectors/native/parquet.md: -------------------------------------------------------------------------------- 1 | This connector is used to read Parquet files from local file system. 2 | 3 | # Connection configuration 4 | No connection is required by this connector 5 | 6 | # Test case configuration 7 | ## Test case configuration 8 | | Name | Mandatory | Default | Description | 9 | |-------------------|:---------:|:-----------------------------:|-------------| 10 | | path | yes | | Path to the Parquet file 11 | | columns | no | None | Subset of columns to load 12 | | engine | no | "auto" | Parquet engine to use ('auto', 'pyarrow', 'fastparquet') 13 | | filters | no | None | Row group filters to apply (for 'pyarrow') 14 | 15 | 16 | ## Example 17 | ``` yaml 18 | Example PARQUET: 19 | source: 20 | type: parquet 21 | path: ../data/parquet/source/example.parquet 22 | columns: ["id", "name"] 23 | filters: 24 | - column: "id" 25 | operator: "!=" 26 | value: 2 27 | 28 | 29 | expected: 30 | type: csv 31 | infer: True 32 | delimiter: ";" 33 | encoding: "utf-8" 34 | engine: "python" 35 | path: data/example.csv 36 | ``` -------------------------------------------------------------------------------- /tests/connectors/test_delta_spark.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | from pyspark.sql import SparkSession 4 | import pytest 5 | from pyjeb import control_and_setup 6 | from delta import configure_spark_with_delta_pip 7 | from ploosh.connectors.connector_delta_spark import ConnectorDeltaSpark 8 | 9 | @pytest.fixture 10 | def connector(): 11 | spark = SparkSession.builder \ 12 | .appName("ploosh") \ 13 | .master("spark://localhost:7077") \ 14 | .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \ 15 | .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \ 16 | .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.2.0") \ 17 | .getOrCreate() 18 | 19 | connector = ConnectorDeltaSpark() 20 | connector.spark = spark 21 | 22 | return connector 23 | 24 | @pytest.fixture 25 | def df_sales(): 26 | return pd.read_csv("./tests/.data/sales.csv", delimiter=",") 27 | 28 | #def test_load_data(connector, df_sales): 29 | # configuration = { 30 | # "path": "./tests/.env/delta/sales" 31 | # } 32 | # 33 | # configuration = control_and_setup(configuration, connector.configuration_definition) 34 | # 35 | # df_test = connector.get_data(configuration, 
None).toPandas() 36 | # 37 | # assert len(df_test.compare(df_sales)) == 0 -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "0.2.0", 3 | "configurations": [ 4 | { 5 | "name": "Debug Native", 6 | "type": "python", 7 | "request": "launch", 8 | "program": "__main__.py", 9 | "cwd": "${workspaceFolder}/src/ploosh", 10 | "console": "integratedTerminal", 11 | "args": ["--cases", "../../test_cases", "--filter", "*.yaml"], 12 | "justMyCode": true 13 | }, 14 | { 15 | "name": "Debug Native with connection file", 16 | "type": "python", 17 | "request": "launch", 18 | "program": "__main__.py", 19 | "cwd": "${workspaceFolder}/src/ploosh", 20 | "console": "integratedTerminal", 21 | "args": ["--cases", "../../test_cases", "--filter", "*.yaml", "--connections", "connections.yml"], 22 | "justMyCode": true 23 | }, 24 | { 25 | "name": "Debug Spark", 26 | "type": "python", 27 | "request": "launch", 28 | "program": "__main__.py", 29 | "cwd": "${workspaceFolder}/src/ploosh", 30 | "console": "integratedTerminal", 31 | "args": ["--cases", "../../test_cases", "--filter", "*.yaml", "--spark", "True"], 32 | "justMyCode": true 33 | } 34 | ] 35 | } -------------------------------------------------------------------------------- /tests/connectors/test_sql_spark.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | from pyspark.sql import SparkSession 4 | import pytest 5 | from pyjeb import control_and_setup 6 | from ploosh.engines.load_engine_spark import LoadEngineSpark 7 | from ploosh.configuration import Configuration 8 | from ploosh.connectors.connector_sql_spark import ConnectorSQLSpark 9 | 10 | @pytest.fixture 11 | def connector(): 12 | # connection with hive metastore 13 | spark = SparkSession.builder \ 14 | .appName("ploosh") \ 15 | .master("spark://localhost:7077") \ 16 | .config("spark.executor.memory", "1g") \ 17 | .config("spark.driver.memory", "1g") \ 18 | .config("spark.sql.warehouse.dir", f"{os.getcwd()}/spark-warehouse") \ 19 | .enableHiveSupport() \ 20 | .getOrCreate() 21 | 22 | connector = ConnectorSQLSpark() 23 | connector.spark = spark 24 | 25 | return connector 26 | 27 | @pytest.fixture 28 | def df_sales(): 29 | return pd.read_csv("./tests/.data/sales.csv", delimiter=",", date_format = "%Y-%m-%d", parse_dates=["sale_date"]) 30 | 31 | #def test_get_data(connector, df_sales): 32 | # configuration = { 33 | # "query": "select * from sales;" 34 | # } 35 | # 36 | # connection = {} 37 | # 38 | # df_test = connector.get_data(configuration, connection).toPandas() 39 | # 40 | # assert len(df_test.compare(df_sales)) == 0 -------------------------------------------------------------------------------- /src/ploosh/__init__.py: -------------------------------------------------------------------------------- 1 | """Initialization for command line""" 2 | 3 | import os 4 | import sys 5 | 6 | # Add the current directory to the system path 7 | sys.path.append(os.path.dirname(__file__)) 8 | 9 | from execute import execute 10 | 11 | 12 | def execute_cases( 13 | cases=None, 14 | connections=None, 15 | spark=None, 16 | spark_session=None, 17 | filter=None, 18 | path_output=None, 19 | ): 20 | """Execute test cases with the given parameters""" 21 | args = ["ploosh"] 22 | 23 | # Add cases parameter to arguments if provided 24 | if cases is not None: 25 | args.append("--cases") 26 | args.append(cases) 
27 | 28 | # Add connections parameter to arguments if provided 29 | if connections is not None: 30 | args.append("--connections") 31 | args.append(connections) 32 | 33 | # Add spark parameter to arguments if provided 34 | if spark is not None: 35 | args.append("--spark") 36 | args.append(spark) 37 | 38 | # Add filter parameter to arguments if provided 39 | if filter is not None: 40 | args.append("--filter") 41 | args.append(filter) 42 | 43 | # Add output path parameter to arguments if provided 44 | if path_output is not None: 45 | args.append("--output") 46 | args.append(path_output) 47 | 48 | # Execute the test cases with the constructed arguments 49 | execute(args, spark_session) 50 | -------------------------------------------------------------------------------- /src/ploosh/connectors/connector_json.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=R0903 2 | """Connector to read JSON file""" 3 | 4 | import pandas as pd 5 | from connectors.connector import Connector 6 | 7 | 8 | class ConnectorJSON(Connector): 9 | """Connector to read JSON file""" 10 | 11 | def __init__(self): 12 | # Initialize the connector with its name and configuration definitions 13 | self.name = "JSON" 14 | self.connection_definition = [] # No specific connection parameters required 15 | self.configuration_definition = [ 16 | {"name": "path"}, # Path to the JSON file 17 | {"name": "encoding", "type": "string", "default": "utf-8"}, # Encoding to use when reading the JSON file. 18 | {"name": "lines", "type": "boolean", "default": False}, # Whether to treat the file as line-delimited JSON (one JSON object per line). 19 | {"name": "nrows", "type": "integer", "default": None} # Number of lines to read from a line-delimited JSON file. 20 | ] 21 | 22 | def get_data(self, configuration: dict, connection: dict): 23 | """Get data from source""" 24 | 25 | # Read the JSON file using pandas with the specified delimiter 26 | df = pd.read_json(configuration["path"], 27 | encoding = configuration["encoding"], 28 | lines = configuration["lines"], 29 | nrows = configuration["nrows"] 30 | ) 31 | return df 32 | -------------------------------------------------------------------------------- /docs/connectors/native/big_query.md: -------------------------------------------------------------------------------- 1 | This connector allows you to connect to a bigquery instance and execute SQL queries. 
2 | 3 | # Connection configuration 4 | ## Definition 5 | | Name | Mandatory | Default | Description | 6 | |--------------------------|:---------:|:----------:|-------------| 7 | | credentials | yes | | The authentication use a [google keyfile](https://googleapis.dev/python/google-api-core/latest/auth.html) encoded in base 64 8 | 9 | ⚠️ it's highly recommended to use a [parameter](/docs/configuration-custom-parameters/) to pass the credentials value 10 | 11 | ## Example 12 | ``` yaml 13 | bigquery_example: 14 | type: bigquery 15 | credentials: $var.gbq_sample_token 16 | ``` 17 | 18 | # Test case configuration 19 | ## Definition 20 | | Name | Mandatory | Default | Description | 21 | |-------------------|:---------:|:-----------------------------:|-------------| 22 | | connection | yes | | The connection to use 23 | | query | yes | | The query to execute to the database 24 | 25 | ## Example 26 | ``` yaml 27 | Example BigQuery: 28 | source: 29 | connection: bigquery_example 30 | type: bigquery 31 | query: | 32 | select * 33 | from `rh.employees` 34 | where hire_date < "2000-01-01" 35 | expected: 36 | type: csv 37 | path: data/employees_before_2000.csv 38 | ``` -------------------------------------------------------------------------------- /src/ploosh/connectors/connector_json_spark.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=R0903 2 | """Connector to read json file""" 3 | 4 | from connectors.connector import Connector 5 | 6 | 7 | class ConnectorJSONSpark(Connector): 8 | """Connector to read json file with Spark""" 9 | 10 | def __init__(self): 11 | # Initialize the connector with its name and configuration definitions 12 | self.name = "JSON_SPARK" 13 | self.is_spark = True 14 | self.connection_definition = [] 15 | self.configuration_definition = [ 16 | {"name": "path", "type": "string"}, # Path to the JSON file 17 | {"name": "multiline", "type": "boolean", "default": True}, # Handles multi-line JSON files 18 | {"name": "encoding", "type": "string", "default": "UTF-8"}, # Character encoding format used in the JSON file 19 | {"name": "lineSep", "type": "string", "default": "\n"} # Character used to denote a line break 20 | ] 21 | 22 | def get_data(self, configuration: dict, connection: dict): 23 | """Get data from source""" 24 | 25 | # Read the JSON file using Spark with the specified configuration options 26 | df = self.spark.read.option("multiline", configuration["multiline"]) \ 27 | .option("encoding", configuration["encoding"]) \ 28 | .option("lineSep", configuration["lineSep"]) \ 29 | .json(configuration["path"]) 30 | 31 | return df 32 | -------------------------------------------------------------------------------- /src/ploosh/connectors/__init__.py: -------------------------------------------------------------------------------- 1 | """Data connectors""" 2 | from importlib import import_module 3 | from logs import Log 4 | import inspect 5 | import os 6 | 7 | 8 | def get_connectors(spark_session): 9 | """Get all existing connectors""" 10 | 11 | connectors = {} 12 | 13 | # List all Python files in the current directory that start with "connector_" 14 | files = [ 15 | name 16 | for name in os.listdir(os.path.dirname(__file__)) 17 | if name.endswith(".py") and name.startswith("connector_") 18 | ] 19 | 20 | for file in files: 21 | module_name = file[:-3] # Remove the ".py" extension to get the module name 22 | 23 | try: 24 | # Import the module dynamically 25 | for name, obj in 
inspect.getmembers(import_module(f"connectors.{module_name}")): 26 | if inspect.isclass(obj) and name.startswith("Connector"): 27 | current_connector = obj() # Instantiate the connector class 28 | 29 | # If a Spark session is provided and the connector is Spark-based, set the Spark session 30 | if spark_session is not None and current_connector.is_spark: 31 | current_connector.spark = spark_session 32 | 33 | # Add the connector to the connectors dictionary 34 | connectors[current_connector.name] = current_connector 35 | except Exception as e: 36 | Log.print_warning(f"Could not load connector {module_name}") 37 | Log.print_warning(str(e)) 38 | 39 | 40 | return connectors 41 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: 'Release on PyPi' 2 | 3 | on: 4 | push: 5 | branches: 6 | - develop 7 | tags: 8 | - '*' 9 | paths: 10 | - 'src/**' 11 | - 'tests/**' 12 | workflow_dispatch: 13 | jobs: 14 | publish: 15 | name: 'Publish on PyPi' 16 | runs-on: ubuntu-latest 17 | defaults: 18 | run: 19 | shell: bash 20 | working-directory: src/ 21 | steps: 22 | - name: Checkout 23 | uses: actions/checkout@v3 24 | - name: Set up Python 25 | uses: actions/setup-python@v5 26 | with: 27 | python-version: "3.12.8" 28 | - name: Install requirements 29 | run: | 30 | pip install -r requirements.txt 31 | pip install wheel==0.44.0 32 | pip install twine==6.0.1 33 | pip install setuptools==75.1.0 34 | - name: Build package (full) 35 | run: python setup-full.py sdist bdist_wheel 36 | - name: Build package (core) 37 | run: python setup-core.py sdist bdist_wheel 38 | - name: Check package 39 | run: twine check dist/* 40 | - name: Publish 41 | run: | 42 | if [[ $GITHUB_REF == refs/tags/* ]]; then 43 | echo "Deploying to production environment" 44 | twine upload --repository-url https://upload.pypi.org/legacy/ dist/* -u ${{ secrets.PYPI_USER }} -p '${{ secrets.PYPI_PASSWORD }}' --verbose 45 | 46 | else 47 | echo "Deploying to test environment" 48 | twine upload --repository-url https://test.pypi.org/legacy/ dist/* -u ${{ secrets.PYPI_TEST_USER }} -p '${{ secrets.PYPI_TEST_PASSWORD }}' --verbose 49 | fi -------------------------------------------------------------------------------- /src/ploosh/engines/load_engine_native.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from engines.load_engine import LoadEngine 5 | 6 | class LoadEngineNative(LoadEngine): 7 | """Load engine for native Pandas""" 8 | def __init__(self, configuration, options, connection): 9 | """Initialize the LoadEngineNative class""" 10 | 11 | self.configuration = configuration 12 | self.options = options 13 | self.connection = connection 14 | 15 | def execute(self, df_data): 16 | """Execute the load engine""" 17 | 18 | self.count = len(df_data) 19 | 20 | # Cast columns to specified types 21 | for column in self.options["cast"]: 22 | column_name = self.get_insensitive_item(column["name"], df_data.columns) 23 | column_type = column["type"] 24 | if column_type == "datetime": 25 | column_type = "datetime64[ns]" 26 | df_data[column_name] = df_data[column_name].astype(column_type, errors="ignore") 27 | 28 | # Remap bad columns type 29 | for column in df_data.select_dtypes(include=["object"]).columns: 30 | if len(df_data) == 0: 31 | continue 32 | 33 | if type(df_data[column][0]).__name__ == "Decimal": 34 | df_data[column] 
= df_data[column].astype(float, errors="ignore") 35 | 36 | # Remove time zones 37 | date_columns = df_data.select_dtypes(include=["datetime64[ns, UTC]"]).columns 38 | for date_column in date_columns: 39 | df_data[date_column] = df_data[date_column].dt.tz_localize(None) 40 | self.count = len(df_data) 41 | 42 | return df_data 43 | -------------------------------------------------------------------------------- /tests/connectors/test_mysql.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import pytest 4 | from pyjeb import control_and_setup 5 | import urllib 6 | from ploosh.connectors.connector_mysql import ConnectorMYSQL 7 | 8 | @pytest.fixture 9 | def connector(): 10 | return ConnectorMYSQL() 11 | 12 | @pytest.fixture 13 | def df_sales(): 14 | return pd.read_csv("./tests/.data/sales.csv", delimiter=",", date_format = "%Y-%m-%d", parse_dates=["sale_date"]) 15 | 16 | def test_connection_with_password(connector, df_sales): 17 | configuration = { 18 | "query": "select * from sales;", 19 | "connection": "debug" 20 | } 21 | 22 | connection = { 23 | "hostname": "localhost", 24 | "username": "ploosh", 25 | "password": os.environ.get("TEST_DB_PASSWORD"), 26 | "database": "ploosh" 27 | } 28 | connection = control_and_setup(connection, connector.connection_definition) 29 | 30 | df_test = connector.get_data(configuration, connection) 31 | 32 | assert len(df_test.compare(df_sales)) == 0 33 | 34 | def test_connection_with_connection_string(connector, df_sales): 35 | configuration = { 36 | "query": "select * from sales;", 37 | "connection": "debug" 38 | } 39 | 40 | password = urllib.parse.quote_plus(os.environ.get('TEST_DB_PASSWORD')) 41 | connection = { 42 | "mode": "connection_string", 43 | "connection_string": f"mysql+pymysql://ploosh:{password}@localhost/ploosh" 44 | } 45 | 46 | connection = control_and_setup(connection, connector.connection_definition) 47 | 48 | df_test = connector.get_data(configuration, connection) 49 | 50 | assert len(df_test.compare(df_sales)) == 0 -------------------------------------------------------------------------------- /tests/connectors/test_postgresql.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import pytest 4 | from pyjeb import control_and_setup 5 | import urllib 6 | from ploosh.connectors.connector_postgresql import ConnectorPostgreSQL 7 | 8 | @pytest.fixture 9 | def connector(): 10 | return ConnectorPostgreSQL() 11 | 12 | @pytest.fixture 13 | def df_sales(): 14 | return pd.read_csv("./tests/.data/sales.csv", delimiter=",", date_format = "%Y-%m-%d", parse_dates=["sale_date"]) 15 | 16 | 17 | def test_connection_with_password(connector, df_sales): 18 | configuration = { 19 | "query": "select * from sales;", 20 | "connection": "debug" 21 | } 22 | 23 | connection = { 24 | "hostname": "localhost", 25 | "username": "ploosh", 26 | "password": os.environ.get("TEST_DB_PASSWORD"), 27 | "database": "ploosh" 28 | } 29 | connection = control_and_setup(connection, connector.connection_definition) 30 | 31 | df_test = connector.get_data(configuration, connection) 32 | 33 | assert len(df_test.compare(df_sales)) == 0 34 | 35 | def test_connection_with_connection_string(connector, df_sales): 36 | configuration = { 37 | "query": "select * from sales;", 38 | "connection": "debug" 39 | } 40 | 41 | password = urllib.parse.quote_plus(os.environ.get('TEST_DB_PASSWORD')) 42 | connection = { 43 | "mode": "connection_string", 44 | 
"connection_string": f"postgresql+pg8000://ploosh:{password}@localhost/ploosh" 45 | } 46 | 47 | connection = control_and_setup(connection, connector.connection_definition) 48 | 49 | df_test = connector.get_data(configuration, connection) 50 | 51 | assert len(df_test.compare(df_sales)) == 0 52 | 53 | 54 | -------------------------------------------------------------------------------- /docs/connectors/native/odbc.md: -------------------------------------------------------------------------------- 1 | This connector allows to connect to a ODBC datasource and execute SQL queries. 2 | 3 | # Connection configuration 4 | ## Definition 5 | | Name | Mandatory | Default | Description | 6 | |--------------------------|:---------:|:----------:|-------------| 7 | | dsn | yes | | Data Source Name 8 | | auto_commit | no | true | Autocommit mode 9 | | username | no | null | User name 10 | | password | no | null | User password 11 | | driver | no | null | ODBC driver name 12 | | encoding | no | UTF-8 | Encoding to use for the connection 13 | 14 | ⚠️ it's highly recommended to use a [parameter](/docs/configuration-custom-parameters/) to pass the password value 15 | 16 | ## Example 17 | ``` yaml 18 | odbc_example: 19 | type: odbc 20 | dsn: my_dsn 21 | username: pixel 22 | password: $var.odbc_password 23 | ``` 24 | 25 | # Test case configuration 26 | ## Definition 27 | | Name | Mandatory | Default | Description | 28 | |-------------------|:---------:|:-----------------------------:|-------------| 29 | | connection | yes | | The connection to use 30 | | query | yes | | The query to execute to the database 31 | 32 | ## Example 33 | ``` yaml 34 | Example ODBC: 35 | source: 36 | connection: odbc_example 37 | type: odbc 38 | query: | 39 | select * 40 | from employees 41 | where hire_date < '2000-01-01' 42 | type: csv 43 | path: data/employees_before_2000.csv 44 | ``` -------------------------------------------------------------------------------- /docs/connectors/spark/csv.md: -------------------------------------------------------------------------------- 1 | This connector is used to read CSV files using Spark. 2 | 3 | ⚠️ A spark connector can be use only with another spark connector. It is not possible to use a spark connector with a non spark connector. 4 | 5 | See [Spark documentation](/docs/configuration-spark-mode/) for more information. 
6 | 7 | # Connection configuration 8 | No connection is required by this connector 9 | 10 | # Configuration 11 | ## Test case configuration 12 | | Name | Mandatory | Default | Description | 13 | |-------------------|:---------:|:-----------------------------:|-------------| 14 | | path | yes | | Path to the CSV 15 | | delimiter | no | , | Column delimiter 16 | | header | no | true | Use the first row as header 17 | | inferSchema | no | False | Infers the input schema automatically from data 18 | | multiline | no | False | Parse one record, which may span multiple lines, per file 19 | | quote | no | '"' | Character used to denote the start and end of a quoted item 20 | | encoding | no | "UTF-8" | Column delimiter 21 | | lineSep | no | "\n" | Column delimiter 22 | 23 | 24 | ## Example 25 | ``` yaml 26 | Example CSV Spark: 27 | source: 28 | type: csv_spark 29 | path: data/employees/*.csv 30 | multiline: False 31 | inferSchema: False 32 | encoding: "UTF-8" 33 | expected: 34 | type: sql_spark 35 | query: | 36 | select * 37 | from employees 38 | where hire_date < "2000-01-01" 39 | ``` -------------------------------------------------------------------------------- /tests/connectors/test_mssql.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import pytest 4 | import urllib 5 | from pyjeb import control_and_setup 6 | from ploosh.connectors.connector_mssql import ConnectorMSSQL 7 | 8 | @pytest.fixture 9 | def connector(): 10 | return ConnectorMSSQL() 11 | 12 | @pytest.fixture 13 | def df_sales(): 14 | return pd.read_csv("./tests/.data/sales.csv", delimiter=",", date_format = "%Y-%m-%d", parse_dates=["sale_date"]) 15 | 16 | def test_connection_with_password(connector, df_sales): 17 | configuration = { 18 | "query": "select * from sales;", 19 | "connection": "debug" 20 | } 21 | 22 | connection = { 23 | "hostname": "localhost", 24 | "username": "sa", 25 | "password": os.environ.get("TEST_DB_PASSWORD"), 26 | "database": "ploosh", 27 | "driver": "ODBC Driver 17 for SQL Server", 28 | "encrypt": False, 29 | } 30 | connection = control_and_setup(connection, connector.connection_definition) 31 | 32 | df_test = connector.get_data(configuration, connection) 33 | 34 | assert len(df_test.compare(df_sales)) == 0 35 | 36 | 37 | def test_connection_with_connection_string(connector, df_sales): 38 | configuration = { 39 | "query": "select * from sales;", 40 | "connection": "debug" 41 | } 42 | 43 | password = urllib.parse.quote_plus(os.environ.get('TEST_DB_PASSWORD')) 44 | connection = { 45 | "mode": "connection_string", 46 | "connection_string": f"mssql+pyodbc://sa:{password}@localhost/ploosh?driver=ODBC+Driver+17+for+SQL+Server" 47 | } 48 | 49 | connection = control_and_setup(connection, connector.connection_definition) 50 | 51 | df_test = connector.get_data(configuration, connection) 52 | 53 | assert len(df_test.compare(df_sales)) == 0 -------------------------------------------------------------------------------- /docs/connectors/native/databricks.md: -------------------------------------------------------------------------------- 1 | This connector allows you to connect to a Databricks instance and execute SQL queries. 2 | 3 | # Connection configuration 4 | ## Definition 5 | | Name | Mandatory | Default | Description | 6 | |--------------------------|:---------:|:----------:|-------------| 7 | | token | yes | | a token generated from databricks. 
See the [documentation](https://docs.databricks.com/en/dev-tools/auth/pat.html) 8 | | hostname | yes | | url to databricks 9 | | database | yes | | name of the database 10 | | http_path | yes | | the value is available on [JDBC/ODBC](https://docs.databricks.com/en/integrations/compute-details.html) settings 11 | 12 | ## Example 13 | ``` yaml 14 | databricks_example: 15 | type: databricks 16 | hostname: adb-myproject.8.azuredatabricks.net 17 | database: default 18 | token: $var.databricks_token 19 | http_path: /sql/1.0/warehouses/da000000000000000 20 | ``` 21 | 22 | # Test case configuration 23 | ## Definition 24 | | Name | Mandatory | Default | Description | 25 | |-------------------|:---------:|:-----------------------------:|-------------| 26 | | connection | yes | | The connection to use 27 | | query | yes | | The query to execute to the database 28 | 29 | 30 | ## Example 31 | ``` yaml 32 | Example Databricks: 33 | source: 34 | connection: databricks_example 35 | type: databricks 36 | query: | 37 | select * 38 | from `rh.employees` 39 | where hire_date < "2000-01-01" 40 | expected: 41 | type: csv 42 | path: data/employees_before_2000.csv 43 | ``` -------------------------------------------------------------------------------- /docs/connectors/native/snowflake.md: -------------------------------------------------------------------------------- 1 | This connector allows to connect to a Snowflake instance and execute SQL queries. 2 | 3 | # Connection configuration 4 | ## Definition 5 | | Name | Mandatory | Default | Description | 6 | |--------------------------|:---------:|:----------:|-------------| 7 | | account_identifier | yes | | Account identifier of snowflake instance 8 | | username | yes | | User name 9 | | password | yes | | User password 10 | | database | no | null | Target database name 11 | | schema | no | null | Target schema name 12 | | warehouse | no | null | Target warehouse name 13 | | role | no | null | Target role name 14 | 15 | ⚠️ it's highly recommended to use a [parameter](/docs/configuration-custom-parameters/) to pass the password value 16 | 17 | ## Example 18 | ``` yaml 19 | snowflake_example: 20 | type: snowflake 21 | account_identifier: bjpwtqg-kt67582 22 | schema: PUBLIC 23 | warehouse: SF_TUTS_WH 24 | database: SF_TUTS 25 | username: pixel 26 | password: $var.snowflake_password_db 27 | ``` 28 | 29 | # Test case configuration 30 | ## Definition 31 | | Name | Mandatory | Default | Description | 32 | |-------------------|:---------:|:-----------------------------:|-------------| 33 | | connection | yes | | The connection to use 34 | | query | yes | | The query to execute to the database 35 | 36 | ## Example 37 | ``` yaml 38 | Example Snowflake: 39 | source: 40 | connection: snowflake_example 41 | type: snowflake 42 | query: | 43 | select * 44 | from RH.employees 45 | where hire_date < '2000-01-01' 46 | expected: 47 | type: csv 48 | path: data/employees_before_2000.csv 49 | ``` -------------------------------------------------------------------------------- /src/setup.py: -------------------------------------------------------------------------------- 1 | """Setup PyPi module""" 2 | # pylint: disable=C0103 3 | 4 | from setuptools import setup 5 | from ploosh.version import PLOOSH_VERSION 6 | 7 | def setup_ploosh(name, install_requires): 8 | """Setup Ploosh module""" 9 | 10 | with open("../readme.md", encoding="UTF-8") as f: 11 | long_description = "".join(f.readlines()) 12 | 13 | # replace relative link by absolute github link 14 | long_description = 
long_description.replace("(/", "(https://github.com/CSharplie/ploosh/blob/main/") 15 | 16 | install_requires = install_requires + [ 17 | "colorama==0.4.6", 18 | "PyYAML==6.0.1", 19 | "Pyjeb==0.2.1", 20 | "numpy==1.26.3", 21 | "pandas==2.1.4", 22 | "openpyxl==3.1.2", 23 | "sqlalchemy==1.4.51", 24 | "pyspark==3.5.4", 25 | "deltalake==0.23.2", 26 | "delta-spark==3.3.0", 27 | ] 28 | 29 | setup ( 30 | name = name, 31 | version = PLOOSH_VERSION, 32 | description="A framework to automatize your tests for data projects", 33 | long_description=long_description, 34 | long_description_content_type="text/markdown", 35 | url="https://github.com/CSharplie/ploosh/", 36 | project_urls={ 37 | "Say Thanks!": "https://ploosh.io", 38 | "Bug Tracker": "https://github.com/CSharplie/ploosh/issues", 39 | "CI": "https://github.com/CSharplie/ploosh/actions", 40 | "Documentation": "https://ploosh.io/docs/ploosh/", 41 | "Source Code": "https://github.com/CSharplie/ploosh", 42 | }, 43 | download_url="https://pypi.org/project/ploosh/", 44 | platforms="Any", 45 | python_requires=">=3.6", 46 | license= "Apache License 2.0", 47 | entry_points = { 48 | "console_scripts": [ 49 | "ploosh = ploosh.__main__:main" 50 | ] 51 | }, 52 | install_requires=install_requires, 53 | ) 54 | -------------------------------------------------------------------------------- /docs/exporters/csv.md: -------------------------------------------------------------------------------- 1 | # Structure 2 | ``` 3 | output/ 4 | ├─ csv/ 5 | │ ├─ test_results.csv 6 | │ ├─ test_results/ 7 | │ │ ├─ test case 1.xlsx 8 | │ │ ├─ test case 2.xlsx 9 | │ │ └─ ... 10 | ``` 11 | 12 | The csv extractor will generate a `test_results.csv` file and a `test_results` folder containing the details of the test cases results in xlsx format. 13 | 14 | # test_results.csv 15 | The `test_results.csv` file will contain the following columns: 16 | - `test_case`: the name of the test case 17 | - `status`: the status of the test case. Can be `success`, `failure` or `error` 18 | - `source_start`: the start time of the source extraction 19 | - `source_end`: the end time of the source extraction 20 | - `source_duration`: the duration of the source extraction 21 | - `source_count`: the count of the source dataset 22 | - `expected_start`: the start time of the expected extraction 23 | - `expected_end`: the end time of the expected extraction 24 | - `expected_duration`: the duration of the expected extraction 25 | - `expected_count`: the count of the expected dataset 26 | - `success_rate`: the success rate of the test case 27 | - `error_type`: the type of the error if the test case failed or raised an error 28 | - `error_message`: the error message if the test case failed or raised an error 29 | - 30 | # test_results folder 31 | The `test_results` folder will contain one xlsx file per test case. 
Each file will contain a sheet with the gap between the source and the expected dataset 32 | 33 | # Example 34 | ``` csv 35 | test_case,status,source_start,source_end,source_duration,source_count,expected_start,expected_end,expected_duration,expected_count,success_rate,error_type,error_message 36 | test 1,passed,2024-02-05T17:08:36Z,2024-02-05T17:08:36Z,0.0032982,100,2024-02-05T17:08:36Z,2024-02-05T17:08:36Z,6.0933333333333335e-05,100,1.0,,, 37 | test 2,failed,2024-02-05T17:08:36Z,2024-02-05T17:08:36Z,0.0032982,100,2024-02-05T17:08:36Z,2024-02-05T17:08:36Z,6.0933333333333335e-05,100,0.95,Data,Some rows are not equals between source dataset and expected dataset 38 | ``` -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Ploosh 2 | 3 | Ploosh is yaml based framework used to automatized the testing process in data projects. 4 | 5 | # Get started 6 | Go to the [ploosh documentation](https://ploosh.io/docs/ploosh/) to find the get started tutorial. 7 | 8 | ## Steps 9 | 1. Install ploosh package 10 | 2. Run tests 11 | 3. Analyse results 12 | 13 | ## Install Ploosh 14 | 15 | Install from [PyPi](https://pypi.org/project/ploosh/) package manager: 16 | ``` shell 17 | pip install ploosh 18 | ``` 19 | 20 | ## Run tests 21 | ``` shell 22 | ploosh --connections "connections.yml" --cases "test_cases" --export "JSON" --p_my_sql_server_password "mypassword" 23 | ``` 24 | 25 | ![Execution result](http://ploosh.io/wp-content/uploads/2024/09/image.png) 26 | 27 | ## Test results 28 | ``` json 29 | [ 30 | { 31 | "name": "Test aggregated data", 32 | "state": "passed", 33 | "source": { 34 | "start": "2024-02-05T17:08:36Z", 35 | "end": "2024-02-05T17:08:36Z", 36 | "duration": 0.0032982 37 | }, 38 | "expected": { 39 | "start": "2024-02-05T17:08:36Z", 40 | "end": "2024-02-05T17:08:36Z", 41 | "duration": 6.0933333333333335e-05 42 | }, 43 | "compare": { 44 | "start": "2024-02-05T17:08:36Z", 45 | "end": "2024-02-05T17:08:36Z", 46 | "duration": 0.00046468333333333334 47 | } 48 | }, 49 | { 50 | "name": "Test unvalid data", 51 | "state": "failed", 52 | "source": { 53 | "start": "2024-02-05T17:08:36Z", 54 | "end": "2024-02-05T17:08:36Z", 55 | "duration": 0.00178865 56 | }, 57 | "expected": { 58 | "start": "2024-02-05T17:08:36Z", 59 | "end": "2024-02-05T17:08:36Z", 60 | "duration": 1.49e-05 61 | }, 62 | "compare": { 63 | "start": "2024-02-05T17:08:36Z", 64 | "end": "2024-02-05T17:08:36Z", 65 | "duration": 1.8333333333333333e-07 66 | }, 67 | "error": { 68 | "type": "count", 69 | "message": "The count in source dataset (55) is differant than the count the in expected dataset (0)" 70 | } 71 | } 72 | ] 73 | ``` 74 | -------------------------------------------------------------------------------- /docs/connectors/native/csv.md: -------------------------------------------------------------------------------- 1 | This connector is used to read CSV files from local file system. 
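The options of this connector mirror the arguments of `pandas.read_csv`. As an illustration only (the path and option values are placeholders), the example shown further below roughly corresponds to this pandas call:

``` python
# Illustrative sketch: how the CSV connector options map onto pandas.read_csv.
# The path and option values are placeholders.
import pandas as pd

df = pd.read_csv(
    "data/employees_before_2000.csv",
    delimiter=";",      # column delimiter
    encoding="utf-8",   # file encoding
    engine="python",    # parser engine
)
print(df.head())
```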
2 | 3 | # Connection configuration 4 | No connection is required by this connector 5 | 6 | # Test case configuration 7 | ## Definition 8 | | Name | Mandatory | Default | Description | 9 | |-------------------|:---------:|:-----------------------------:|-------------| 10 | | path | yes | | Path to the CSV file 11 | | delimiter | no | , | Column delimiter 12 | | infer | no | True | Infer the column names 13 | | names | no | None | Sequence of column labels to apply 14 | | usecols | no | None | Subset of columns to select 15 | | skiprows | no | None | Line numbers to skip or number of lines to skip (int) at the start of the file 16 | | skipfooter | no | 0 | Number of lines at bottom of file to skip (Unsupported with engine='c') 17 | | nrows | no | None | Number of rows of file to read. Useful for reading pieces of large files 18 | | lineterminator | no | None | Character used to denote a line break 19 | | quotechar | no | '"' | Character used to denote the start and end of a quoted item 20 | | encoding | no | "utf-8" | Encoding to use when reading the file 21 | | engine | no | None | Parser engine to use 22 | 23 | ## Example 24 | ``` yaml 25 | Example CSV: 26 | source: 27 | connection: mysql_example 28 | type: mysql 29 | query: | 30 | select * 31 | from employees 32 | where hire_date < "2000-01-01" 33 | expected: 34 | type: csv 35 | infer: True 36 | delimiter: ";" 37 | encoding: "utf-8" 38 | engine: "python" 39 | path: data/employees_before_2000.csv 40 | ``` -------------------------------------------------------------------------------- /docs/configuration/spark.md: -------------------------------------------------------------------------------- 1 | Ploosh can be executed on Spark (in Databricks, Microsoft Fabric, or locally) by using the Spark connectors and calling it from Python code. The common pattern is sketched below; full examples for each environment follow.
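Whatever the environment, the pattern is the same: obtain a `SparkSession` and pass it to `execute_cases`. A minimal sketch, assuming a cases folder, a connections file, and an output folder at the placeholder paths below:

``` python
# Minimal sketch of the common pattern; the paths are placeholders.
from pyspark.sql import SparkSession
from ploosh import execute_cases

spark = SparkSession.builder.appName("Ploosh").getOrCreate()

execute_cases(
    cases="test_cases",             # folder containing the YAML test cases
    connections="connections.yml",  # connection definitions
    path_output="output",           # where exporters write their results
    spark_session=spark,            # existing Spark session to reuse
)
```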
2 | 3 | # Examples 4 | 5 | ### Microsoft Fabric 6 | 7 | __Cell 1__ : Install Ploosh package from PyPi package manager 8 | ``` shell 9 | pip install ploosh 10 | ``` 11 | 12 | __Cell 2__ : Mount the lakehouse to acces the case and connection files 13 | ``` python 14 | mount_point = "/ploosh_config" 15 | workspace_name = "ploosh" 16 | lakehouse_name = "data" 17 | 18 | if(mssparkutils.fs.mount(f"abfss://{workspace_name}@onelake.dfs.fabric.microsoft.com/{lakehouse_name}.Lakehouse/", mount_point)): 19 | ploosh_config_path = mssparkutils.fs.getMountPath(mountPoint = mount_point) 20 | ``` 21 | 22 | __Cell 3__ : Execute ploosh framework 23 | ``` python 24 | from ploosh import execute_cases 25 | 26 | connections_file_path = f"{ploosh_config_path}/Files/connections.yaml" 27 | cases_folder_path = f"{ploosh_config_path}/Files/cases" 28 | 29 | execute_cases(cases = cases_folder_path, connections = connections_file_path, spark_session = spark) 30 | ``` 31 | 32 | ## Databricks 33 | 34 | __Cell 1__ : Install Ploosh package from PyPi package manager 35 | ``` shell 36 | %pip install ploosh 37 | ``` 38 | 39 | __Cell 2__ : Restart python to make the package available 40 | ``` python 41 | dbutils.library.restartPython() 42 | ``` 43 | 44 | __Cell 3__ : Execute ploosh framework 45 | ``` python 46 | from ploosh import execute_cases 47 | 48 | root_folder = "/Workspace/Shared" 49 | 50 | execute_cases(cases=f"{root_folder}/cases", path_output=f"{root_folder}/output", spark_session=spark) 51 | ``` 52 | 53 | ## Local 54 | 55 | __Step 1__ : Install Ploosh package from PyPi package manager 56 | ``` shell 57 | pip install ploosh 58 | ``` 59 | 60 | __Step 2__ : Initialize the spark session 61 | ``` python 62 | from pyspark.sql import SparkSession 63 | 64 | spark = SparkSession.builder.appName("Ploosh").getOrCreate() 65 | ``` 66 | 67 | __Step 3__ : Execute ploosh framework 68 | ``` python 69 | from ploosh import execute_cases 70 | 71 | execute_cases(cases = "test_cases", connections = "connections.yml", spark_session = spark) 72 | ``` 73 | 74 | -------------------------------------------------------------------------------- /src/ploosh/connectors/connector_databricks.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=R0903 2 | """Connector to read Databricks database""" 3 | 4 | import pandas as pd 5 | from sqlalchemy import create_engine 6 | from connectors.connector import Connector 7 | 8 | 9 | class ConnectorDatabricks(Connector): 10 | """Connector to read Databricks database""" 11 | 12 | def __init__(self): 13 | # Initialize the connector with its name and connection definitions 14 | self.name = "DATABRICKS" 15 | self.connection_definition = [ 16 | { 17 | "name": "token", # Token for authentication 18 | }, 19 | { 20 | "name": "hostname", # Hostname of the Databricks instance 21 | }, 22 | { 23 | "name": "database", # Database name 24 | }, 25 | { 26 | "name": "http_path", # HTTP path for the Databricks cluster 27 | }, 28 | { 29 | "name": "port", # Port number (default is 443) 30 | "default": 443, 31 | "type": "integer", 32 | }, 33 | ] 34 | self.configuration_definition = [ 35 | {"name": "query"}, # SQL query to execute 36 | {"name": "connection"}, # Connection name 37 | ] 38 | 39 | def get_data(self, configuration: dict, connection: dict): 40 | """Get data from source""" 41 | 42 | # Extract connection parameters 43 | token = connection["token"] 44 | hostname = connection["hostname"] 45 | database = connection["database"] 46 | port = connection["port"] 47 | http_path = 
connection["http_path"] 48 | 49 | # Create the connection string for Databricks 50 | connection_string = ( 51 | f"databricks://token:{token}@{hostname}:{port}/{database}?http_path={http_path}" 52 | ) 53 | 54 | # Create a SQLAlchemy engine using the connection string 55 | sql_connection = create_engine(connection_string, echo=False) 56 | 57 | # Execute the SQL query and read the data into a pandas DataFrame 58 | df = pd.read_sql(configuration["query"], sql_connection) 59 | 60 | return df 61 | -------------------------------------------------------------------------------- /src/ploosh/connectors/connector_parquet.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=R0903 2 | """Connector to read Parquet file""" 3 | 4 | import pandas as pd 5 | from connectors.connector import Connector 6 | 7 | 8 | class ConnectorParquet(Connector): 9 | """Connector to read Parquet file""" 10 | 11 | def __init__(self): 12 | # Initialize the connector with its name and configuration definitions 13 | self.name = "PARQUET" 14 | self.connection_definition = [] # No specific connection parameters required 15 | self.configuration_definition = [ 16 | {"name": "path"}, # Path to the Parquet file 17 | {"name": "columns", "type": "list", "default": None}, # Subset of columns to load 18 | {"name": "engine", "type": "string", "validset": ["auto", "pyarrow", "fastparquet"], "default": "auto"}, # Parquet engine to use ('auto', 'pyarrow', 'fastparquet') 19 | {"name": "filters", "type": "list", "default": None}, # Row group filters to apply (for 'pyarrow') 20 | {"name": "filters.column", "type": "string"}, # The name of the column to filter 21 | {"name": "filters.operator", "type": "string", "validset": ["==", "=", ">", ">=", "<", "<=", "!="]}, # The operator to be used 22 | {"name": "filters.value", "type": "integer"}, # The value to be used to filter the column 23 | ] 24 | 25 | def get_data(self, configuration: dict, connection: dict): 26 | """Get data from source""" 27 | 28 | # Extract the path and configuration parameters 29 | path = configuration["path"] 30 | columns = configuration["columns"] 31 | engine = configuration["engine"] 32 | filters = configuration["filters"] 33 | list_filters = None 34 | if filters is not None: 35 | list_filters = ( 36 | [(filter_spec["column"], filter_spec["operator"], filter_spec["value"]) for filter_spec in filters] 37 | if filters else None 38 | ) 39 | 40 | # Read the Parquet file using pandas 41 | df = pd.read_parquet(path, 42 | columns=columns, 43 | engine=engine, 44 | filters=list_filters) 45 | return df 46 | -------------------------------------------------------------------------------- /tests/connectors/test_json.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import pytest 4 | from pyjeb import control_and_setup 5 | from ploosh.connectors.connector_json import ConnectorJSON 6 | 7 | @pytest.fixture 8 | def connector(): 9 | return ConnectorJSON() 10 | 11 | @pytest.fixture 12 | def df_sales(): 13 | return pd.read_csv(f"{os.getcwd()}/tests/.data/sales.csv", delimiter=",") 14 | 15 | 16 | @pytest.fixture 17 | def df_sales_with_two_rows(): 18 | df = pd.read_csv(f"{os.getcwd()}/tests/.data/sales.csv", delimiter=",") 19 | df_first_2_rows = df.head(2) 20 | return df_first_2_rows 21 | 22 | 23 | def test_json_default(connector, df_sales): 24 | configuration = { 25 | "path": f"{os.getcwd()}/tests/.env/json/sales.json" 26 | } 27 | 28 | configuration = 
control_and_setup(configuration, connector.configuration_definition) 29 | 30 | df_test = connector.get_data(configuration, None) 31 | 32 | assert len(df_test.compare(df_sales)) == 0 33 | 34 | 35 | def test_json_with_lines_true(connector, df_sales): 36 | configuration = { 37 | "path": f"{os.getcwd()}/tests/.env/json/sales_lines_true.json", 38 | "lines": True 39 | } 40 | 41 | configuration = control_and_setup(configuration, connector.configuration_definition) 42 | 43 | df_test = connector.get_data(configuration, None) 44 | 45 | assert len(df_test.compare(df_sales)) == 0 46 | 47 | 48 | def test_json_with_two_rows(connector, df_sales_with_two_rows): 49 | configuration = { 50 | "path": f"{os.getcwd()}/tests/.env/json/sales_lines_true.json", 51 | "lines": True, 52 | "nrows": 2 53 | } 54 | 55 | configuration = control_and_setup(configuration, connector.configuration_definition) 56 | 57 | df_test = connector.get_data(configuration, None) 58 | 59 | assert len(df_test.compare(df_sales_with_two_rows)) == 0 60 | 61 | 62 | def test_json_with_specific_encoding(connector, df_sales): 63 | configuration = { 64 | "path": f"{os.getcwd()}/tests/.env/json/sales-ISO-8859-1.json" 65 | } 66 | 67 | configuration = control_and_setup(configuration, connector.configuration_definition) 68 | 69 | df_test = connector.get_data(configuration, None) 70 | 71 | assert len(df_test.compare(df_sales)) == 0 72 | -------------------------------------------------------------------------------- /src/ploosh/connectors/connector_bigquery.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=R0903 2 | """Connector to read BigQuery database""" 3 | 4 | import pandas as pd 5 | import pandas_gbq 6 | from sqlalchemy import create_engine 7 | from connectors.connector import Connector 8 | 9 | 10 | class ConnectorBigQuery(Connector): 11 | """Connector to read BigQuery database""" 12 | 13 | def __init__(self): 14 | # Initialize the connector with its name and connection definitions 15 | self.name = "BIGQUERY" 16 | self.connection_definition = [ 17 | { 18 | "name": "credentials", # Credentials for authentication 19 | "default": None, 20 | }, 21 | { 22 | "name": "credentials_type", # Type of credentials (service account or current user) 23 | "validset": ["service_account", "current_user"], 24 | "default": "service_account", 25 | }, 26 | { 27 | "name": "project_id", # Project ID for BigQuery 28 | "default": None, 29 | }, 30 | ] 31 | self.configuration_definition = [ 32 | {"name": "query"}, # SQL query to execute 33 | {"name": "connection"}, # Connection name 34 | ] 35 | 36 | def get_data(self, configuration: dict, connection: dict): 37 | """Get data from source""" 38 | # Extract credentials and credentials type from the connection 39 | credentials = connection["credentials"] 40 | credentials_type = connection["credentials_type"] 41 | 42 | # If using service account credentials, create a connection string and use SQLAlchemy 43 | if credentials_type == "service_account": 44 | connection_string = f"bigquery://?credentials_base64={credentials}" 45 | sql_connection = create_engine(connection_string, echo=False) 46 | df = pd.read_sql(configuration["query"], sql_connection) 47 | # If using current user credentials, use pandas_gbq to read the data 48 | elif credentials_type == "current_user": 49 | df = pandas_gbq.read_gbq( 50 | configuration["query"], connection["project_id"], progress_bar_type=None 51 | ) 52 | 53 | return df 54 | 
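A hypothetical usage sketch, written in the same style as the connector tests in this repository; the environment variable name, credentials, and query are placeholders, and base64-encoded service-account credentials with access to the target project are assumed:

``` python
# Hypothetical sketch (not an actual test in this repository).
# TEST_BQ_CREDENTIALS_BASE64 and the query are placeholders.
import os
from pyjeb import control_and_setup
from ploosh.connectors.connector_bigquery import ConnectorBigQuery

connector = ConnectorBigQuery()

configuration = {
    "query": "select * from sales",
    "connection": "debug",
}

connection = {
    "credentials": os.environ.get("TEST_BQ_CREDENTIALS_BASE64"),  # base64-encoded service account JSON
    "credentials_type": "service_account",
}
connection = control_and_setup(connection, connector.connection_definition)

df = connector.get_data(configuration, connection)
print(len(df))
```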
-------------------------------------------------------------------------------- /docs/pipelines/azure_devops.md: -------------------------------------------------------------------------------- 1 | Ploosh is easy to use and can be integrated with any CI/CD pipeline. 2 | The following steps are required to run Ploosh tests in Azure DevOps and publish the results into Azure DevOps Test Plans. 3 | 4 | # Exemple of pipeline 5 | 6 | 1. Install ODBC driver for SQL Server if SQL Server connector is used 7 | 2. Install Ploosh package from PyPi 8 | 3. Execute Ploosh 9 | 1. Provide the connections file 10 | 2. Provide the test cases folder 11 | 3. Provide the export format (TRX for Azure DevOps Test Plans) 12 | 4. Disable the failure flag to avoid the pipeline to fail if a test fails 13 | 5. Provide the passwords as parameters from the variables group 14 | 4. Publish test results 15 | 16 | ```yaml 17 | trigger: 18 | - main 19 | 20 | variables: 21 | - group: demo 22 | stages: 23 | - stage: 24 | displayName: Build 25 | jobs: 26 | - job: 27 | steps: 28 | - checkout: self 29 | - task: CmdLine@2 30 | displayName: Install ODBC driver for SQL Server 31 | inputs: 32 | script: | 33 | curl https://packages.microsoft.com/keys/microsoft.asc | sudo tee /etc/apt/trusted.gpg.d/microsoft.asc 34 | curl https://packages.microsoft.com/config/ubuntu/$(lsb_release -rs)/prod.list | sudo tee /etc/apt/sources.list.d/mssql-release.list 35 | sudo apt-get update 36 | sudo ACCEPT_EULA=Y apt-get install -y msodbcsql18 37 | - task: CmdLine@2 38 | displayName: Install ploosh 39 | inputs: 40 | script: | 41 | pip install ploosh 42 | - task: CmdLine@2 43 | displayName: Execute ploosh 44 | inputs: 45 | script: ploosh --connections "connections.yml" --cases "test_cases" --export "TRX" --failure False --p_mysql_password_db "$(mysql_password)" --p_mssql_password_db "$(mssql_password)" --p_postgresql_password_db "$(postgresql_password)" 46 | - task: PublishTestResults@2 47 | inputs: 48 | testResultsFormat: 'VSTest' 49 | testResultsFiles: '*.xml' 50 | searchFolder: 'output/trx/' 51 | mergeTestResults: true 52 | testRunTitle: '$(Build.DefinitionName)' 53 | ``` -------------------------------------------------------------------------------- /src/ploosh/connectors/connector_csv_spark.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=R0903 2 | """Connector to read CSV file""" 3 | 4 | from connectors.connector import Connector 5 | 6 | 7 | class ConnectorCSVSpark(Connector): 8 | """Connector to read CSV file with Spark""" 9 | 10 | def __init__(self): 11 | # Initialize the connector with its name and configuration definitions 12 | self.name = "CSV_SPARK" 13 | self.is_spark = True 14 | self.connection_definition = [] 15 | self.configuration_definition = [ 16 | {"name": "path", "type": "string"}, # Path to the CSV file 17 | {"name": "delimiter", "type": "string", "default": ","}, # Delimiter used in the CSV file 18 | {"name": "header", "type": "boolean", "default": True}, # Whether the CSV file has a header row 19 | {"name": "inferSchema", "type": "boolean", "default": False}, # Infers the input schema automatically from data 20 | {"name": "multiline", "type": "boolean", "default": False}, # Parse one record, which may span multiple lines, per file 21 | {"name": "quote", "type": "string", "default": '"'}, # Character used to denote the start and end of a quoted item 22 | {"name": "encoding", "type": "string", "default": 'UTF-8'}, # Encoding to use for UTF when reading/writing 23 | {"name": 
"lineSep", "type": "string", "default": "\n"}, # Character used to denote a line break 24 | ] 25 | 26 | def get_data(self, configuration: dict, connection: dict): 27 | """Get data from source""" 28 | 29 | # Read the CSV file using Spark with the specified configuration options 30 | df = self.spark.read.option("delimiter", configuration["delimiter"]) \ 31 | .option("header", configuration["header"]) \ 32 | .option("inferSchema", configuration["inferSchema"])\ 33 | .option("multiline", configuration["multiline"]) \ 34 | .option("quote", configuration["quote"]) \ 35 | .option("encoding", configuration["encoding"]) \ 36 | .option("lineSep", configuration["lineSep"]) \ 37 | .csv(configuration["path"]) 38 | 39 | return df 40 | -------------------------------------------------------------------------------- /tests/connectors/test_parquet.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import pytest 4 | from pyjeb import control_and_setup 5 | from ploosh.connectors.connector_parquet import ConnectorParquet 6 | 7 | @pytest.fixture 8 | def connector(): 9 | return ConnectorParquet() 10 | 11 | 12 | @pytest.fixture 13 | def df_sales(): 14 | return pd.read_csv(f"{os.getcwd()}/tests/.data/sales.csv", delimiter=",") 15 | 16 | @pytest.fixture 17 | def df_sales_with_specific_columns(): 18 | df = pd.read_csv(f"{os.getcwd()}/tests/.data/sales.csv", delimiter=",") 19 | df_selected_columns = df[["sale_id", "seller_name", "card_name", "quantity"]] 20 | return df_selected_columns 21 | 22 | @pytest.fixture 23 | def df_sales_with_filters(): 24 | df = pd.read_csv(f"{os.getcwd()}/tests/.data/sales.csv", delimiter=",") 25 | df_filtered = df[(df["sale_id"] > 10) & (df['quantity'] == 1)] 26 | df_filtered = df_filtered.reset_index(drop=True) 27 | return df_filtered 28 | 29 | 30 | def test_default(connector, df_sales): 31 | configuration = { 32 | "path": "./tests/.env/parquet/sales.parquet", 33 | } 34 | 35 | configuration = control_and_setup(configuration, connector.configuration_definition) 36 | df_test = connector.get_data(configuration, {}) 37 | 38 | assert len(df_test.compare(df_sales)) == 0 39 | 40 | 41 | def test_with_specific_columns(connector, df_sales_with_specific_columns): 42 | configuration = { 43 | "path": "./tests/.env/parquet/sales.parquet", 44 | "columns" : ["sale_id", "seller_name", "card_name", "quantity"] 45 | } 46 | 47 | configuration = control_and_setup(configuration, connector.configuration_definition) 48 | df_test = connector.get_data(configuration, {}) 49 | 50 | assert len(df_test.compare(df_sales_with_specific_columns)) == 0 51 | 52 | 53 | def test_with_filters(connector, df_sales_with_filters): 54 | configuration = { 55 | "path": "./tests/.env/parquet/sales.parquet", 56 | "filters" : [{"column": "sale_id", "operator": ">", "value": 10}, 57 | {"column": "quantity", "operator": "==", "value": 1}] 58 | } 59 | 60 | configuration = control_and_setup(configuration, connector.configuration_definition) 61 | df_test = connector.get_data(configuration, {}) 62 | 63 | assert len(df_test.compare(df_sales_with_filters)) == 0 64 | -------------------------------------------------------------------------------- /debug/setup.sh: -------------------------------------------------------------------------------- 1 | # Configuration 2 | db_password=ThePasswordIs9293709B13? 
3 | 4 | # Setup dev envrionnement 5 | conda create -n ".ploosh" python=3.12.8 ipython 6 | conda activate .ploosh 7 | 8 | pip install -r ./src/requirements.txt 9 | 10 | # install connectors clients 11 | sudo apt-get update 12 | 13 | sudo apt-get install -y postgresql-client 14 | sudo apt-get install -y mysql-client 15 | sudo ACCEPT_EULA=Y apt-get install -y mssql-tools unixodbc-dev 16 | 17 | # install connectors servers 18 | docker run --name ploosh-mysql \ 19 | -e MYSQL_ROOT_PASSWORD=$db_password \ 20 | -e MYSQL_PASSWORD=$db_password \ 21 | -e MYSQL_DATABASE=ploosh \ 22 | -e MYSQL_USER=ploosh \ 23 | -p 3306:3306 \ 24 | -d mysql 25 | 26 | docker run --name ploosh-postgresql \ 27 | -e POSTGRES_USER=ploosh \ 28 | -e POSTGRES_PASSWORD=$db_password \ 29 | -e POSTGRES_DB=ploosh \ 30 | -p 5432:5432 \ 31 | -d postgres 32 | 33 | docker run --name ploosh-mssql \ 34 | -e "ACCEPT_EULA=Y" \ 35 | -e "MSSQL_SA_PASSWORD=$db_password" \ 36 | --hostname ploosh \ 37 | -p 1433:1433 \ 38 | -d \ 39 | mcr.microsoft.com/mssql/server:2022-latest 40 | 41 | docker run -d --name ploosh-spark-master \ 42 | -e SPARK_MODE=master \ 43 | -e SPARK_MASTER_HOST=ploosh-spark-master \ 44 | -p 7077:7077 -p 8081:8080 \ 45 | -v $(pwd)/tests/.data:$(pwd)/tests/.data \ 46 | -v $(pwd)/tests/.env:$(pwd)/tests/.env \ 47 | --hostname ploosh-spark-master \ 48 | bitnami/spark 49 | 50 | docker run -d --name ploosh-spark-worker \ 51 | -e SPARK_MODE=worker \ 52 | -e SPARK_MASTER_URL=spark://ploosh-spark-master:7077 \ 53 | -v $(pwd)/tests/.data:$(pwd)/tests/.data \ 54 | -v $(pwd)/tests/.env:$(pwd)/tests/.env \ 55 | --link ploosh-spark-master:ploosh-spark-master \ 56 | bitnami/spark 57 | 58 | 59 | docker exec -it ploosh-spark-master pip install delta-spark==3.3.0 60 | docker exec -it ploosh-spark-worker pip install delta-spark==3.3.0 61 | 62 | mysql -h 127.0.0.1 -u ploosh -p$db_password < tests/.env/mysql/setup.sql 63 | 64 | export PGPASSWORD=$db_password; 65 | psql -h 127.0.0.1 -U ploosh -d ploosh -f tests/.env/postgresql/setup.sql 66 | 67 | /opt/mssql-tools/bin/sqlcmd -S localhost -U sa -P $db_password -i tests/.env/mssql/setup.sql 68 | 69 | spark_setup_file=$(pwd)/tests/.env/spark/setup.sql 70 | spark_setup_file_tmp=$(pwd)/tests/.env/spark/setup_tmp.sql 71 | sed "s|{{pwd}}|$(pwd)|g" $spark_setup_file > $spark_setup_file_tmp 72 | spark-sql -f$spark_setup_file_tmp -------------------------------------------------------------------------------- /tests/.env/delta/sales/_delta_log/00000000000000000000.json: -------------------------------------------------------------------------------- 1 | {"protocol":{"minReaderVersion":1,"minWriterVersion":2}} 2 | 
{"metaData":{"id":"1f95c651-89a8-471e-b4cf-5350af0b517a","name":null,"description":null,"format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"sale_id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"seller_name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"card_name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"card_rarity\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"card_condition\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"price\",\"type\":\"double\",\"nullable\":true,\"metadata\":{}},{\"name\":\"quantity\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"sale_date\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"card_set\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"buyer_name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"transaction_status\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":[],"createdTime":1736434780759,"configuration":{}}} 3 | {"add":{"path":"part-00001-4cd9f0b3-de7c-470d-83e2-ede8afcbf39a-c000.snappy.parquet","partitionValues":{},"size":6198,"modificationTime":1736434780761,"dataChange":true,"stats":"{\"numRecords\":74,\"minValues\":{\"card_rarity\":\"Common\",\"card_condition\":\"Excellent\",\"price\":5.0,\"seller_name\":\"Alex Johnson\",\"card_name\":\"Aerodactyl\",\"transaction_status\":\"Cancelled\",\"card_set\":\"Base Set\",\"buyer_name\":\"Alex Johnson\",\"sale_date\":\"2024-11-01\",\"quantity\":1,\"sale_id\":1},\"maxValues\":{\"buyer_name\":\"Sophia Wilson\",\"price\":320.0,\"sale_date\":\"2025-01-14\",\"transaction_status\":\"Pending\",\"sale_id\":74,\"card_name\":\"Zapdos\",\"card_rarity\":\"Ultra Rare\",\"card_condition\":\"Near Mint\",\"seller_name\":\"Sophia Wilson\",\"card_set\":\"Jungle\",\"quantity\":20},\"nullCount\":{\"card_name\":0,\"card_rarity\":0,\"sale_id\":0,\"sale_date\":0,\"price\":0,\"seller_name\":0,\"transaction_status\":0,\"card_set\":0,\"quantity\":0,\"buyer_name\":0,\"card_condition\":0}}","tags":null,"deletionVector":null,"baseRowId":null,"defaultRowCommitVersion":null,"clusteringProvider":null}} 4 | {"commitInfo":{"timestamp":1736434780761,"operation":"WRITE","operationParameters":{"mode":"Overwrite"},"operationMetrics":{"execution_time_ms":2,"num_added_files":1,"num_added_rows":74,"num_partitions":0,"num_removed_files":0},"clientVersion":"delta-rs.0.23.0"}} -------------------------------------------------------------------------------- /tests/load_engine/test_native.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pytest 3 | from pyjeb import control_and_setup 4 | from ploosh.engines.load_engine_native import LoadEngineNative 5 | from ploosh.configuration import Configuration 6 | 7 | @pytest.fixture 8 | def controls(): 9 | controls = Configuration.case_definition 10 | controls = [control for control in controls if control["name"].startswith("options")] 11 | return controls 12 | 13 | def test_count(controls): 14 | df_data = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}) 15 | parameters = {} 16 | options = control_and_setup(parameters, controls)["options"] 17 | 18 | load_engine = LoadEngineNative(None, options, None) 19 | df_data = load_engine.execute(df_data) 20 | assert load_engine.count == 3 21 | 22 | def test_cast_datetime(controls): 23 | df_data = pd.DataFrame({"A": [1, 2, 3], "B": 
["2021-01-01", "2021-01-02", "2021-01-03"]}) 24 | parameters = { 25 | "options": { 26 | "cast": [ 27 | { 28 | "name": "B", 29 | "type": "datetime" 30 | } 31 | ] 32 | } 33 | } 34 | options = control_and_setup(parameters, controls)["options"] 35 | 36 | load_engine = LoadEngineNative(None, options, None) 37 | df_data = load_engine.execute(df_data) 38 | assert df_data["B"].dtype == "datetime64[ns]" 39 | assert df_data["A"].dtype == "int64" 40 | 41 | def test_cast_int(controls): 42 | df_data = pd.DataFrame({"A": [1, 2, 3], "B": ["4", "5", "6"]}) 43 | parameters = { 44 | "options": { 45 | "cast": [ 46 | { 47 | "name": "B", 48 | "type": "int" 49 | } 50 | ] 51 | } 52 | } 53 | options = control_and_setup(parameters, controls)["options"] 54 | 55 | load_engine = LoadEngineNative(None, options, None) 56 | df_data = load_engine.execute(df_data) 57 | assert df_data["B"].dtype == "int64" 58 | assert df_data["A"].dtype == "int64" 59 | 60 | def test_cast_float(controls): 61 | df_data = pd.DataFrame({"A": [1, 2, 3], "B": ["4.0", "5.0", "6.0"]}) 62 | parameters = { 63 | "options": { 64 | "cast": [ 65 | { 66 | "name": "B", 67 | "type": "float" 68 | } 69 | ] 70 | } 71 | } 72 | options = control_and_setup(parameters, controls)["options"] 73 | 74 | load_engine = LoadEngineNative(None, options, None) 75 | df_data = load_engine.execute(df_data) 76 | assert df_data["B"].dtype == "float64" 77 | assert df_data["A"].dtype == "int64" -------------------------------------------------------------------------------- /src/ploosh/parameters.py: -------------------------------------------------------------------------------- 1 | """Module for parsing input parameters""" 2 | 3 | 4 | class Parameters: 5 | """Parse input parameters""" 6 | # Initialize class variables 7 | args = {} 8 | path_connection = None 9 | path_cases = None 10 | path_cases_filter = None 11 | path_output = None 12 | export = None 13 | failure_on_error = None 14 | variables = {} 15 | 16 | def __init__(self, argv: list): 17 | """Initialize Parameters with command-line arguments""" 18 | # Set arguments and variables from the command-line input 19 | self.set_args(argv[1:]) 20 | self.set_variables() 21 | 22 | # Set paths and other parameters from the arguments 23 | self.path_connection = self.get_value("connections", None) 24 | self.path_cases = self.get_value("cases", "./cases") 25 | self.path_cases_filter = self.get_value("filter", "*.yml") 26 | self.path_output = self.get_value("output", "./output") 27 | self.export = self.get_value("export", "JSON").upper() 28 | self.failure_on_error = self.get_value("failure", True) 29 | self.spark_mode = self.get_value("spark", False) 30 | 31 | def set_args(self, args): 32 | """Set dictionary of args with cleaned name""" 33 | for i, name in enumerate(args): 34 | if not name.startswith("-"): 35 | continue 36 | 37 | # Determine the value associated with the argument 38 | value = False 39 | if i != len(args) - 1: 40 | value = args[i + 1] 41 | if value.startswith("-"): 42 | value = True 43 | else: 44 | value = value.replace("'", "").replace("\"", "") 45 | 46 | # Clean the argument name and store it in the dictionary 47 | name = name.replace("-", "") 48 | self.args[name] = value 49 | 50 | def get_value(self, long_name: str, default): 51 | """Get value or default value from args""" 52 | if long_name in self.args: 53 | value = self.args[long_name] 54 | if str(value).upper() == "TRUE": 55 | return True 56 | if str(value).upper() == "FALSE": 57 | return False 58 | return value 59 | 60 | return default 61 | 62 | def 
set_variables(self): 63 | """Set variable list from args""" 64 | for name, value in self.args.items(): 65 | if not name.startswith("p_"): 66 | continue 67 | 68 | # Clean the variable name and store it in the dictionary 69 | name = name.replace("p_", "") 70 | self.variables[name] = value 71 | -------------------------------------------------------------------------------- /docs/connectors/native/postgresql.md: -------------------------------------------------------------------------------- 1 | This connector allows you to connect to a PostgreSQL database and execute SQL queries. 2 | 3 | # Connection configuration 4 | ## Password mode 5 | ### Definition 6 | | Name | Mandatory | Default | Description | 7 | |---------------|:---------:|:----------:|-------------| 8 | | mode | no | password | Change the connection mode. Can be "password" or "connection_string". The "connection_string" mode allows the use of a custom connection string. 9 | | hostname | yes | | Target host name 10 | | database | yes | | Target database name 11 | | username | yes | | User name 12 | | password | yes | | User password 13 | | port | no | 5432 | Port used by the connection 14 | | ssl_context | No | False | Set to True if the server requires a secure transport 15 | 16 | ⚠️ It is highly recommended to use a [parameter](/docs/configuration-custom-parameters/) to pass the password value 17 | 18 | ### Example 19 | ``` yaml 20 | postgresql_example: 21 | type: postgresql 22 | hostname: ploosh.postgresql.database.azure.com 23 | database: SampleDB 24 | username: sa_ploosh 25 | password: $var.sa_ploosh_password 26 | ssl_context: true 27 | ``` 28 | 29 | ## Connection string mode 30 | ### Definition 31 | | Name | Mandatory | Default | Description | 32 | |-------------------|:---------:|:-----------------------------:|-------------| 33 | | mode | no | password | Set to "connection_string" to use a custom connection string 34 | | connection_string | yes | | Connection string used to access the database. Refer to the [SQLAlchemy documentation](https://docs.sqlalchemy.org/en/20/dialects/postgresql.html) for the accepted format 35 | 36 | ### Example 37 | ``` yaml 38 | postgresql_example: 39 | type: postgresql 40 | mode: connection_string 41 | connection_string: "postgresql+pg8000://sa_ploosh:$var.sa_ploosh_password@ploosh.postgresql.database.azure.com/SampleDB" 42 | ``` 43 | 44 | # Test case configuration 45 | ## Definition 46 | | Name | Mandatory | Default | Description | 47 | |-------------------|:---------:|:-----------------------------:|-------------| 48 | | connection | yes | | The connection to use 49 | | query | yes | | The query to execute against the database 50 | 51 | ## Example 52 | ``` yaml 53 | Example PostgreSQL: 54 | source: 55 | connection: postgresql_example 56 | type: postgresql 57 | query: | 58 | select * 59 | from employees 60 | where hire_date < '2000-01-01' 61 | expected: 62 | type: csv 63 | path: data/employees_before_2000.csv 64 | ``` -------------------------------------------------------------------------------- /docs/connectors/native/mysql.md: -------------------------------------------------------------------------------- 1 | This connector allows you to connect to a MySQL database and execute SQL queries. 2 | 3 | # Connection configuration 4 | ## Password mode 5 | ### Definition 6 | | Name | Mandatory | Default | Description | 7 | |--------------------------|:---------:|:----------:|-------------| 8 | | mode | no | password | Change the connection mode. Can be "password" or "connection_string". 
"connection_string" mode allow to use a custom connection string. 9 | | hostname | yes | | Target host name 10 | | database | yes | | Target database name 11 | | username | yes | | User name 12 | | password | yes | | User password 13 | | port | no | 3306 | Port to use by the connection 14 | | require_secure_transport | No | False | Set True if the server require a secure transport 15 | 16 | ⚠️ it's highly recommended to use a [parameter](/docs/configuration-custom-parameters/) to pass the password value 17 | 18 | ### Example 19 | ``` yaml 20 | mysql_example: 21 | type: mysql 22 | hostname: ploosh.mysql.database.azure.com 23 | database: SampleDB 24 | username: sa_ploosh 25 | password: $var.sa_ploosh_password 26 | require_secure_transport: true 27 | ``` 28 | 29 | ### Definition 30 | ## Connection string mode 31 | | Name | Mandatory | Default | Description | 32 | |-------------------|:---------:|:-----------------------------:|-------------| 33 | | mode | no | password | Use "connection_string" value to use custom connection_string 34 | | connection_string | yes | | Connection string use to access in the database. Refer to [SQLAlchemy documentation](https://docs.sqlalchemy.org/en/20/dialects/mysql.html) to get the accepted format 35 | 36 | ### Example 37 | ``` yaml 38 | mysql_example: 39 | type: mysql 40 | mode: connection_string 41 | connection_string: "mysql+mysqldb://sa_ploosh:$var.sa_ploosh_password@ploosh.mysql.database.azure.com/SampleDB" 42 | ``` 43 | 44 | # Test case configuration 45 | ## Definition 46 | | Name | Mandatory | Default | Description | 47 | |-------------------|:---------:|:-----------------------------:|-------------| 48 | | connection | yes | | The connection to use 49 | | query | yes | | The query to execute to the database 50 | ## Example 51 | 52 | ``` yaml 53 | Example MySQL: 54 | source: 55 | connection: mysql_example 56 | type: mysql 57 | query: | 58 | select * 59 | from employees 60 | where hire_date < "2000-01-01" 61 | expected: 62 | type: csv 63 | path: data/employees_before_2000.csv 64 | ``` -------------------------------------------------------------------------------- /src/ploosh/connectors/connector_snowflake.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=R0903 2 | """Connector to read Snowflake database""" 3 | 4 | import pandas as pd 5 | from sqlalchemy import create_engine 6 | from connectors.connector import Connector 7 | 8 | 9 | class ConnectorSnowflake(Connector): 10 | """Connector to read Snowflake database""" 11 | 12 | def __init__(self): 13 | # Initialize the connector with its name and connection definitions 14 | self.name = "SNOWFLAKE" 15 | self.connection_definition = [ 16 | { 17 | "name": "account_identifier", # Snowflake account identifier 18 | }, 19 | { 20 | "name": "username", # Username for authentication 21 | }, 22 | { 23 | "name": "password", # Password for authentication 24 | }, 25 | { 26 | "name": "database", # Database name (optional) 27 | "default": None, 28 | }, 29 | { 30 | "name": "schema", # Schema name (optional) 31 | "default": None, 32 | }, 33 | { 34 | "name": "warehouse", # Warehouse name (optional) 35 | "default": None, 36 | }, 37 | { 38 | "name": "role", # Role name (optional) 39 | "default": None, 40 | }, 41 | ] 42 | self.configuration_definition = [{"name": "query"}, {"name": "connection"}] 43 | 44 | def get_data(self, configuration: dict, connection: dict): 45 | """Get data from source""" 46 | 47 | # Extract connection parameters 48 | account_identifier = 
connection["account_identifier"] 49 | username = connection["username"] 50 | password = connection["password"] 51 | 52 | # Create the base connection string for Snowflake 53 | connection_string = f"snowflake://{username}:{password}@{account_identifier}/" 54 | 55 | # Append database and schema to the connection string if provided 56 | if connection["database"] is not None: 57 | connection_string += f"{connection['database']}/" 58 | if connection["schema"] is not None: 59 | connection_string += f"{connection['schema']}" 60 | 61 | # Add query parameters to the connection string 62 | connection_string += "?1=1" 63 | if connection["warehouse"] is not None: 64 | connection_string += f"&warehouse={connection['warehouse']}" 65 | if connection["role"] is not None: 66 | connection_string += f"&role={connection['role']}" 67 | 68 | # Create a SQLAlchemy engine using the connection string 69 | sql_connection = create_engine(connection_string, echo=False) 70 | 71 | # Execute the SQL query and read the data into a pandas DataFrame 72 | df = pd.read_sql(configuration["query"], sql_connection) 73 | 74 | return df 75 | -------------------------------------------------------------------------------- /docs/exporters/json.md: -------------------------------------------------------------------------------- 1 | # Structure 2 | ``` 3 | output/ 4 | ├─ json/ 5 | │ ├─ test_results.json 6 | │ ├─ test_results/ 7 | │ │ ├─ test case 1.xlsx 8 | │ │ ├─ test case 2.xlsx 9 | │ │ └─ ... 10 | ``` 11 | 12 | The json extractor will generate a `test_results.json` file and a `test_results` folder containing the details of the test cases results in xlsx format. 13 | 14 | # test_results.json 15 | The `test_results.json` file will contain the following properties: 16 | - `test_case`: the name of the test case 17 | - `status`: the status of the test case. Can be `success`, `failure` or `error` 18 | - `error.type`: the type of the error if the test case failed or raised an error 19 | - `error.message`: the error message if the test case failed or raised an error 20 | - `source.start`: the start time of the source extraction 21 | - `source.end`: the end time of the source extraction 22 | - `source.duration`: the duration of the source extraction 23 | - `source.count`: the count of the source dataset 24 | - `expected.start`: the start time of the expected extraction 25 | - `expected.end`: the end time of the expected extraction 26 | - `expected.duration`: the duration of the expected extraction 27 | - `expected.count`: the count of the expected dataset 28 | - `compare.start`: the start time of the comparison 29 | - `compare.end`: the end time of the comparison 30 | - `compare.duration`: the duration of the comparison 31 | - `compare.success_rate`: the success rate of the test case 32 | 33 | # test_results folder 34 | The `test_results` folder will contain one xlsx file per test case. 
Each file will contain a sheet with the gap between the source and the expected dataset 35 | 36 | # Example 37 | ``` json 38 | { 39 | "test_case": "test 1", 40 | "status": "passed", 41 | "source": { 42 | "start": "2024-02-05T17:08:36Z", 43 | "end": "2024-02-05T17:08:36Z", 44 | "duration": 0.0032982, 45 | "count": 100 46 | }, 47 | "expected": { 48 | "start": "2024-02-05T17:08:36Z", 49 | "end": "2024-02-05T17:08:36Z", 50 | "duration": 6.0933333333333335e-05, 51 | "count": 100 52 | }, 53 | "compare": { 54 | "start": "2024-02-05T17:08:36Z", 55 | "end": "2024-02-05T17:08:36Z", 56 | "duration": 0.0032982, 57 | "success_rate": 1.0 58 | } 59 | }, 60 | { 61 | "test_case": "test 2", 62 | "status": "failed", 63 | "source": { 64 | "start": "2024-02-05T17:08:36Z", 65 | "end": "2024-02-05T17:08:36Z", 66 | "duration": 0.0032982, 67 | "count": 100 68 | }, 69 | "expected": { 70 | "start": "2024-02-05T17:08:36Z", 71 | "end": "2024-02-05T17:08:36Z", 72 | "duration": 6.0933333333333335e-05, 73 | "count": 100 74 | }, 75 | "compare": { 76 | "start": "2024-02-05T17:08:36Z", 77 | "end": "2024-02-05T17:08:36Z", 78 | "duration": 0.0032982, 79 | "success_rate": 0.95 80 | }, 81 | "error": { 82 | "type": "Data", 83 | "message": "Some rows are not equals between source dataset and expected dataset" 84 | } 85 | } 86 | ``` -------------------------------------------------------------------------------- /docs/configuration/options.md: -------------------------------------------------------------------------------- 1 | Test case allow to define options for the test case execution. The options are defined in the `options` section of the test case configuration. 2 | 3 | # Ignore 4 | The `ignore` option allow to ignore specifics columns in the comparison. The `ignore` option is a list of columns to ignore in the comparison. The columns are defined by their name. 5 | 6 | ## Example 7 | ``` yaml 8 | Example: 9 | options: 10 | ignore: 11 | - column_to_ignore_1 12 | - column_to_ignore_2 13 | source: 14 | connection: my_connection 15 | query: select * from my_table 16 | expected: 17 | connection: my_connection 18 | query: select * from my_table 19 | ``` 20 | 21 | # Sort 22 | The `sort` option allow to sort the dataset before the comparison. The `sort` option is a list of columns to sort the dataset. The columns are defined by their name. 23 | 24 | ## Example 25 | ``` yaml 26 | Example: 27 | options: 28 | sort: 29 | - column_to_sort_1 30 | - column_to_sort_2 31 | source: 32 | connection: my_connection 33 | query: select * from my_table 34 | expected: 35 | connection: my_connection 36 | query: select * from my_table 37 | ``` 38 | 39 | ⚠️ The best practice is to sort the dataset in the source and the expected query to ensure the comparison is done on the same order and provide a better performance. 40 | 41 | # Cast 42 | The `cast` option allow to cast the column type before the comparison. The `cast` option is a list of name and type to cast the column. The column name is defined by their name and the type. 43 | 44 | The allowed types are: 45 | - `int` 46 | - `float` 47 | - `string` 48 | - `datetime` 49 | 50 | ## Example 51 | ``` yaml 52 | Example: 53 | options: 54 | cast: 55 | - name: column_to_cast_1 56 | type: int 57 | - name: column_to_cast_2 58 | type: float 59 | source: 60 | connection: my_connection 61 | query: select * from my_table 62 | expected: 63 | connection: my_connection 64 | query: select * from my_table 65 | ``` 66 | 67 | # Pass rate 68 | The `pass_rate` option allow to define the pass rate of the test case. 
The pass rate is a float between 0 and 1. The pass rate is the percentage of the rows that need to be the same to pass the test case. 69 | 70 | ## Example 71 | ``` yaml 72 | Example: 73 | options: 74 | pass_rate: 0.95 75 | source: 76 | connection: my_connection 77 | query: select * from my_table 78 | expected: 79 | connection: my_connection 80 | query: select * from my_table 81 | ``` 82 | 83 | # Trim 84 | The `trim` option allow to trim the string columns before the comparison. The `trim` option is a list of columns to trim. The columns are defined by their name. 85 | 86 | ## Example 87 | ``` yaml 88 | Example: 89 | options: 90 | trim: 91 | - column_to_trim_1 92 | - column_to_trim_2 93 | source: 94 | connection: my_connection 95 | query: select * from my_table 96 | expected: 97 | connection: my_connection 98 | query: select * from my_table 99 | ``` 100 | -------------------------------------------------------------------------------- /src/ploosh/connectors/connector_postgresql.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=R0903 2 | """Connector to read PostgreSQL database""" 3 | 4 | import pandas as pd 5 | from sqlalchemy import create_engine 6 | import urllib 7 | from connectors.connector import Connector 8 | 9 | 10 | class ConnectorPostgreSQL(Connector): 11 | """Connector to read PostgreSQL database""" 12 | 13 | def __init__(self): 14 | # Initialize the connector with its name and connection definitions 15 | self.name = "POSTGRESQL" 16 | self.connection_definition = [ 17 | { 18 | "name": "mode", 19 | "default": "password", 20 | "validset": ["password", "connection_string"], 21 | }, 22 | { 23 | "name": "hostname", 24 | "default": None, 25 | }, 26 | { 27 | "name": "database", 28 | "default": None, 29 | }, 30 | { 31 | "name": "username", 32 | "default": None, 33 | }, 34 | { 35 | "name": "password", 36 | "default": None, 37 | }, 38 | { 39 | "name": "port", 40 | "default": 5432, 41 | "type": "integer", 42 | }, 43 | { 44 | "name": "ssl_context", 45 | "default": False, 46 | "type": "boolean", 47 | }, 48 | { 49 | "name": "connection_string", 50 | "default": None, 51 | }, 52 | ] 53 | self.configuration_definition = [{"name": "query"}, {"name": "connection"}] 54 | 55 | def get_data(self, configuration: dict, connection: dict): 56 | """Get data from source""" 57 | 58 | # Use the provided connection string if mode is "connection_string" 59 | connection_string = connection["connection_string"] 60 | if connection["mode"] == "password": 61 | # Extract connection parameters 62 | port = connection["port"] 63 | hostname = connection["hostname"] 64 | username = connection["username"] 65 | password = connection["password"] 66 | database = connection["database"] 67 | # Create the connection string for PostgreSQL 68 | password = urllib.parse.quote_plus(password) 69 | connection_string = ( 70 | f"postgresql+pg8000://{username}:{password}@{hostname}:{port}/{database}" 71 | ) 72 | 73 | # Additional connection arguments 74 | connect_args = {} 75 | if connection["ssl_context"]: 76 | connect_args = {"ssl_context": True} 77 | 78 | # Create a SQLAlchemy engine using the connection string and additional arguments 79 | sql_connection = create_engine( 80 | connection_string, echo=False, connect_args=connect_args 81 | ) 82 | 83 | # Execute the SQL query and read the data into a pandas DataFrame 84 | df = pd.read_sql(configuration["query"], sql_connection) 85 | 86 | return df 87 | -------------------------------------------------------------------------------- 
/src/ploosh/exporters/exporter_csv.py: -------------------------------------------------------------------------------- 1 | """Export test case result to CSV format""" 2 | import csv 3 | import os 4 | from exporters.exporter import Exporter 5 | 6 | class ExporterCSV(Exporter): 7 | """Export test case result to CSV format""" 8 | 9 | def __init__(self): 10 | # Set the name of the exporter 11 | self.name = "CSV" 12 | 13 | def export(self, cases: dict): 14 | """Export test case results to a CSV file""" 15 | 16 | # Define the output file path 17 | output_file = f"{self.output_path}/csv/test_results.csv" 18 | 19 | # Initialize the data list with headers 20 | data = [[ 21 | "name", 22 | "state", 23 | "source_start", 24 | "source_end", 25 | "source_duration", 26 | "source_count", 27 | "expected_start", 28 | "expected_end", 29 | "expected_duration", 30 | "expected_count", 31 | "compare_start", 32 | "compare_end", 33 | "compare_duration", 34 | "success_rate", 35 | "error_type", 36 | "error_message", 37 | ]] 38 | 39 | # Iterate over each test case and collect data 40 | for name in cases: 41 | case = cases[name] 42 | 43 | # Collect data for the current test case 44 | case_data = [ 45 | name, 46 | case.state, 47 | Exporter.date_to_string(case.source.duration.start), 48 | Exporter.date_to_string(case.source.duration.end), 49 | case.source.duration.duration, 50 | case.source.count, 51 | Exporter.date_to_string(case.expected.duration.start), 52 | Exporter.date_to_string(case.expected.duration.end), 53 | case.expected.duration.duration, 54 | case.expected.count, 55 | Exporter.date_to_string(case.compare_duration.start), 56 | Exporter.date_to_string(case.compare_duration.end), 57 | case.compare_duration.duration, 58 | case.success_rate, 59 | case.error_type, 60 | case.error_message, 61 | ] 62 | 63 | # Append the collected data to the data list 64 | data.append(case_data) 65 | 66 | # If there is a comparison gap, export it to an Excel file 67 | if case.df_compare_gap is not None: 68 | detail_file_path = f"{self.output_path}/json/test_results/{name}.xlsx" 69 | 70 | # Create directories if they do not exist 71 | os.makedirs(os.path.dirname(detail_file_path), exist_ok=True) 72 | case.df_compare_gap.to_excel(detail_file_path) 73 | 74 | # Create directories if they do not exist 75 | os.makedirs(os.path.dirname(output_file), exist_ok=True) 76 | 77 | # Write the collected data to the CSV file 78 | with open(output_file, "w", encoding="UTF-8") as f: 79 | writer = csv.writer(f, lineterminator="\n") 80 | writer.writerows(data) 81 | f.close() 82 | -------------------------------------------------------------------------------- /src/ploosh/connectors/connector_mysql.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=R0903 2 | """Connector to read MYSQL database""" 3 | 4 | import pandas as pd 5 | from sqlalchemy import create_engine 6 | import urllib 7 | from connectors.connector import Connector 8 | 9 | 10 | class ConnectorMYSQL(Connector): 11 | """Connector to read MYSQL database""" 12 | 13 | def __init__(self): 14 | # Initialize the connector with its name and connection definitions 15 | self.name = "MYSQL" 16 | self.connection_definition = [ 17 | { 18 | "name": "mode", 19 | "default": "password", 20 | "validset": ["password", "connection_string"], 21 | }, 22 | { 23 | "name": "hostname", 24 | "default": None, 25 | }, 26 | { 27 | "name": "database", 28 | "default": None, 29 | }, 30 | { 31 | "name": "username", 32 | "default": None, 33 | }, 34 | { 35 | "name": 
"password", 36 | "default": None, 37 | }, 38 | { 39 | "name": "port", 40 | "default": 3306, 41 | "type": "integer", 42 | }, 43 | { 44 | "name": "require_secure_transport", 45 | "default": False, 46 | "type": "boolean", 47 | }, 48 | { 49 | "name": "connection_string", 50 | "default": None, 51 | }, 52 | ] 53 | self.configuration_definition = [{"name": "query"}, {"name": "connection"}] 54 | 55 | def get_data(self, configuration: dict, connection: dict): 56 | """Get data from source""" 57 | 58 | # Use the provided connection string if mode is "connection_string" 59 | connection_string = connection["connection_string"] 60 | if connection["mode"] == "password": 61 | # Extract connection parameters 62 | port = connection["port"] 63 | hostname = connection["hostname"] 64 | username = connection["username"] 65 | password = connection["password"] 66 | database = connection["database"] 67 | # Create the connection string for MySQL 68 | 69 | password = urllib.parse.quote_plus(password) 70 | connection_string = ( 71 | f"mysql+pymysql://{username}:{password}@{hostname}:{port}/{database}" 72 | ) 73 | 74 | # Additional connection arguments 75 | connect_args = {} 76 | if connection["require_secure_transport"]: 77 | connect_args = {"ssl": {"require_secure_transport": True}} 78 | 79 | # Create a SQLAlchemy engine using the connection string and additional arguments 80 | sql_connection = create_engine( 81 | connection_string, echo = False, connect_args = connect_args 82 | ) 83 | 84 | # Execute the SQL query and read the data into a pandas DataFrame 85 | df = pd.read_sql(configuration["query"], sql_connection) 86 | 87 | return df 88 | -------------------------------------------------------------------------------- /src/ploosh/exporters/exporter_json.py: -------------------------------------------------------------------------------- 1 | """Export test case result to JSON format""" 2 | 3 | import json 4 | import os 5 | from exporters.exporter import Exporter 6 | 7 | 8 | class ExporterJSON(Exporter): 9 | """Export test case result to JSON format""" 10 | 11 | def __init__(self): 12 | # Set the name of the exporter 13 | self.name = "JSON" 14 | 15 | def export(self, cases: dict): 16 | """Export test case results to a JSON file""" 17 | 18 | # Define the output file path 19 | output_file = f"{self.output_path}/json/test_results.json" 20 | 21 | data = [] 22 | # Iterate over each test case and collect data 23 | for name in cases: 24 | case = cases[name] 25 | 26 | # Collect basic data for the current test case 27 | case_data = { 28 | "name": name, 29 | "state": case.state, 30 | } 31 | 32 | # Collect source data if available 33 | if case.source.duration.start is not None: 34 | case_data["source"] = { 35 | "start": Exporter.date_to_string(case.source.duration.start), 36 | "end": Exporter.date_to_string(case.source.duration.end), 37 | "duration": case.source.duration.duration, 38 | "count": case.source.count, 39 | } 40 | 41 | # Collect expected data if available 42 | if case.expected.duration.start is not None: 43 | case_data["expected"] = { 44 | "start": Exporter.date_to_string(case.expected.duration.start), 45 | "end": Exporter.date_to_string(case.expected.duration.end), 46 | "duration": case.expected.duration.duration, 47 | "count": case.expected.count, 48 | } 49 | 50 | # Collect comparison data if available 51 | if case.compare_duration.start is not None: 52 | case_data["compare"] = { 53 | "start": Exporter.date_to_string(case.compare_duration.start), 54 | "end": Exporter.date_to_string(case.compare_duration.end), 55 | 
"duration": case.compare_duration.duration, 56 | "success_rate": case.success_rate, 57 | } 58 | 59 | # Collect error data if the test case failed or encountered an error 60 | if case.state in ["error", "failed"]: 61 | case_data["error"] = { 62 | "type": case.error_type, 63 | "message": case.error_message, 64 | } 65 | 66 | # Append the collected data to the data list 67 | data.append(case_data) 68 | 69 | # If there is a comparison gap, export it to an Excel file 70 | if case.df_compare_gap is not None: 71 | detail_file_path = f"{self.output_path}/json/test_results/{name}.xlsx" 72 | 73 | # Create directories if they do not exist 74 | os.makedirs(os.path.dirname(detail_file_path), exist_ok=True) 75 | case.df_compare_gap.to_excel(detail_file_path) 76 | 77 | # Create directories if they do not exist 78 | os.makedirs(os.path.dirname(output_file), exist_ok=True) 79 | 80 | # Write the collected data to the JSON file 81 | with open(output_file, "w", encoding="UTF-8") as f: 82 | f.write(json.dumps(data, indent=2)) 83 | -------------------------------------------------------------------------------- /src/ploosh/connectors/connector_mssql.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=R0903 2 | """Connector to read MSSQL database""" 3 | 4 | import pandas as pd 5 | from sqlalchemy import create_engine 6 | import urllib 7 | from connectors.connector import Connector 8 | 9 | 10 | class ConnectorMSSQL(Connector): 11 | """Connector to read MSSQL database""" 12 | 13 | def __init__(self): 14 | # Initialize the connector with its name and connection definitions 15 | self.name = "MSSQL" 16 | self.connection_definition = [ 17 | { 18 | "name": "mode", 19 | "default": "password", 20 | "validset": ["password", "connection_string"], 21 | }, 22 | { 23 | "name": "hostname", 24 | "default": None, 25 | }, 26 | { 27 | "name": "database", 28 | "default": None, 29 | }, 30 | { 31 | "name": "username", 32 | "default": None, 33 | }, 34 | { 35 | "name": "password", 36 | "default": None, 37 | }, 38 | { 39 | "name": "port", 40 | "default": 1433, 41 | "type": "integer", 42 | }, 43 | { 44 | "name": "encrypt", 45 | "default": True, 46 | "type": "boolean", 47 | }, 48 | { 49 | "name": "trust_server_certificate", 50 | "default": False, 51 | "type": "boolean", 52 | }, 53 | { 54 | "name": "driver", 55 | "default": "ODBC Driver 18 for SQL Server", 56 | }, 57 | { 58 | "name": "connection_string", 59 | "default": None, 60 | }, 61 | ] 62 | self.configuration_definition = [{"name": "query"}, {"name": "connection"}] 63 | 64 | def get_data(self, configuration: dict, connection: dict): 65 | """Get data from source""" 66 | 67 | # Use the provided connection string if mode is "connection_string" 68 | connection_string = connection["connection_string"] 69 | if connection["mode"] == "password": 70 | # Extract connection parameters 71 | driver = connection["driver"] 72 | port = connection["port"] 73 | hostname = connection["hostname"] 74 | username = connection["username"] 75 | password = connection["password"] 76 | database = connection["database"] 77 | trust_server_certificate = ( 78 | "yes" if connection["trust_server_certificate"] else "no" 79 | ) 80 | encrypt = "yes" if connection["encrypt"] else "no" 81 | 82 | # Create the ODBC connection string 83 | password = urllib.parse.quote_plus(password) 84 | odbc_connect = f"Driver={driver};Server={hostname};Database={database};Uid={username};Pwd={password};Encrypt={encrypt};TrustServerCertificate={trust_server_certificate};" 85 | 
connection_string = f"mssql+pyodbc:///?odbc_connect={odbc_connect}" 86 | 87 | # Create a SQLAlchemy engine using the connection string 88 | sql_connection = create_engine(connection_string, echo=False) 89 | 90 | # Execute the SQL query and read the data into a pandas DataFrame 91 | df = pd.read_sql(configuration["query"], sql_connection) 92 | 93 | return df 94 | -------------------------------------------------------------------------------- /src/ploosh/connectors/connector_odbc.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=R0903 2 | """Connector to read ODBC connection""" 3 | 4 | import warnings 5 | import pandas as pd 6 | import pyodbc 7 | from connectors.connector import Connector 8 | 9 | 10 | class ConnectorODCB(Connector): 11 | """Connector to read ODBC connection""" 12 | 13 | def __init__(self): 14 | # Initialize the connector with its name and connection definitions 15 | self.name = "ODBC" 16 | self.connection_definition = [ 17 | { 18 | "name": "mode", 19 | "default": "DSN", 20 | "validset": ["DSN", "connection_string"], 21 | }, 22 | { 23 | "name": "DSN", # Data Source Name for the ODBC connection 24 | "default": None 25 | }, 26 | { 27 | "name": "connection_string", 28 | "default": None 29 | }, 30 | { 31 | "name": "auto_commit", 32 | "type": "boolean", 33 | "default": True, # Whether to enable auto-commit 34 | }, 35 | { 36 | "name": "use_credentials", 37 | "type": "boolean", 38 | "default": False, # Whether to use credentials for the connection 39 | }, 40 | { 41 | "name": "user", 42 | "default": None, # Username for the connection 43 | }, 44 | { 45 | "name": "password", 46 | "default": None, # Password for the connection 47 | }, 48 | { 49 | "name": "encoding", 50 | "default": "UTF-8", # Encoding to use for the connection 51 | }, 52 | ] 53 | self.configuration_definition = [{"name": "query"}, {"name": "connection"}] 54 | 55 | def get_data(self, configuration: dict, connection: dict): 56 | """Get data from source""" 57 | 58 | if connection["mode"] == "DSN": 59 | # Establish the ODBC connection using the provided DSN and optional credentials 60 | if connection["use_credentials"]: 61 | odbc_connection = pyodbc.connect( 62 | f"DSN={connection['DSN']}", 63 | user=connection["user"], 64 | password=connection["password"], 65 | autocommit=connection["auto_commit"], 66 | ) 67 | else: 68 | odbc_connection = pyodbc.connect( 69 | f"DSN={connection['DSN']};", autocommit=connection["auto_commit"] 70 | ) 71 | else: 72 | odbc_connection = pyodbc.connect( 73 | connection["connection_string"], autocommit=connection["auto_commit"] 74 | ) 75 | 76 | # Suppress warnings related to encoding settings 77 | with warnings.catch_warnings(): 78 | warnings.simplefilter("ignore", UserWarning) 79 | 80 | # Set the encoding for the ODBC connection 81 | odbc_connection.setdecoding( 82 | pyodbc.SQL_CHAR, encoding=connection["encoding"] 83 | ) 84 | odbc_connection.setdecoding( 85 | pyodbc.SQL_WCHAR, encoding=connection["encoding"] 86 | ) 87 | odbc_connection.setencoding(encoding=connection["encoding"]) 88 | 89 | # Execute the SQL query and read the data into a pandas DataFrame 90 | df = pd.read_sql(configuration["query"], odbc_connection) 91 | 92 | return df 93 | -------------------------------------------------------------------------------- /tests/connectors/test_csv_spark.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | from pyspark.sql import SparkSession 4 | import pytest 5 | 
from pyjeb import control_and_setup 6 | from ploosh.engines.load_engine_spark import LoadEngineSpark 7 | from ploosh.configuration import Configuration 8 | from ploosh.connectors.connector_csv_spark import ConnectorCSVSpark 9 | 10 | @pytest.fixture 11 | def connector(): 12 | spark = SparkSession.builder \ 13 | .appName("ploosh") \ 14 | .master("spark://localhost:7077") \ 15 | .config("spark.executor.memory", "1g") \ 16 | .config("spark.driver.memory", "1g") \ 17 | .getOrCreate() 18 | 19 | connector = ConnectorCSVSpark() 20 | connector.spark = spark 21 | 22 | return connector 23 | 24 | @pytest.fixture 25 | def df_sales(): 26 | return pd.read_csv("./tests/.data/sales.csv", delimiter=",", dtype=object, date_format = "%Y-%m-%d", parse_dates=["sale_date"]) 27 | 28 | @pytest.fixture 29 | def df_sales_with_types(): 30 | return pd.read_csv("./tests/.data/sales.csv", delimiter=",", dtype={"sale_id": "int64", "product_id": "int64", "sale_amount": "float64"}, date_format = "%Y-%m-%d", parse_dates=["sale_date"]) 31 | 32 | def test_default(connector, df_sales): 33 | configuration = { 34 | "path": f"{os.getcwd()}/tests/.data/sales.csv", 35 | } 36 | 37 | configuration = control_and_setup(configuration, connector.configuration_definition) 38 | 39 | df_test = connector.get_data(configuration, {}).toPandas() 40 | 41 | assert len(df_test.compare(df_sales)) == 0 42 | 43 | def test_delimiter(connector, df_sales): 44 | configuration = { 45 | "path": f"{os.getcwd()}/tests/.env/csv/sales_with_tab.csv", 46 | "delimiter": "\t" 47 | } 48 | 49 | configuration = control_and_setup(configuration, connector.configuration_definition) 50 | 51 | df_test = connector.get_data(configuration, {}).toPandas() 52 | 53 | assert len(df_test.compare(df_sales)) == 0 54 | 55 | def test_infer_schema(connector, df_sales_with_types): 56 | configuration = { 57 | "path": f"{os.getcwd()}/tests/.data/sales.csv", 58 | "inferSchema": True 59 | } 60 | 61 | configuration = control_and_setup(configuration, connector.configuration_definition) 62 | 63 | df_test = connector.get_data(configuration, {}).toPandas() 64 | 65 | assert len(df_test.compare(df_sales_with_types)) == 0 66 | 67 | def test_quote(connector, df_sales): 68 | configuration = { 69 | "path": f"{os.getcwd()}/tests/.env/csv/sales_with_single_quote.csv", 70 | "quote": "'" 71 | } 72 | 73 | configuration = control_and_setup(configuration, connector.configuration_definition) 74 | 75 | df_test = connector.get_data(configuration, {}).toPandas() 76 | 77 | assert len(df_test.compare(df_sales)) == 0 78 | 79 | def test_encoding(connector, df_sales): 80 | configuration = { 81 | "path": f"{os.getcwd()}/tests/.env/csv/sales_with_iso_8859_1.csv", 82 | "encoding": "ISO-8859-1" 83 | } 84 | 85 | configuration = control_and_setup(configuration, connector.configuration_definition) 86 | 87 | df_test = connector.get_data(configuration, {}).toPandas() 88 | 89 | assert len(df_test.compare(df_sales)) == 0 90 | 91 | def test_line_sep(connector, df_sales): 92 | configuration = { 93 | "path": f"{os.getcwd()}/tests/.env/csv/sales_with_cr.csv", 94 | "lineSep": "\r" 95 | } 96 | 97 | configuration = control_and_setup(configuration, connector.configuration_definition) 98 | 99 | df_test = connector.get_data(configuration, {}).toPandas() 100 | 101 | assert len(df_test.compare(df_sales)) == 0 102 | -------------------------------------------------------------------------------- /src/ploosh/connectors/connector_csv.py: -------------------------------------------------------------------------------- 1 | # pylint: 
disable=R0903 2 | """Connector to read CSV file""" 3 | 4 | import json 5 | import pandas as pd 6 | from connectors.connector import Connector 7 | 8 | 9 | class ConnectorCSV(Connector): 10 | """Connector to read CSV file""" 11 | 12 | def __init__(self): 13 | # Initialize the connector with its name and configuration definitions 14 | self.name = "CSV" 15 | self.connection_definition = [] # No specific connection parameters required 16 | self.configuration_definition = [ 17 | {"name": "path"}, # Path to the CSV file 18 | {"name": "delimiter", "default": ","}, # Delimiter used in the CSV file 19 | {"name": "infer", "type": "boolean", "default": True}, # Infer the column names 20 | {"name": "names", "type": "list", "default": None}, # Sequence of column labels to apply 21 | {"name": "usecols", "type": "list", "default": None}, # Subset of columns to select 22 | {"name": "skiprows", "type": "string", "default": None}, # Line numbers to skip (0-indexed) or number of lines to skip (int) at the start of the file 23 | {"name": "skipfooter", "type": "integer", "default": 0}, # Number of lines at bottom of file to skip (Unsupported with engine='c') 24 | {"name": "nrows", "type": "integer", "default": None}, # Number of rows of file to read. Useful for reading pieces of large files. 25 | {"name": "lineterminator", "type": "string", "default": None}, # Character used to denote a line break. 26 | {"name": "quotechar", "type": "string", "default": '"'}, # Character used to denote the start and end of a quoted item. 27 | {"name": "encoding", "type": "string", "default": "utf-8"}, # Encoding to use for UTF when reading/writing. 28 | {"name": "engine", "type": "string", "default": None}, # Parser engine to use. 29 | ] 30 | 31 | def get_data(self, configuration: dict, connection: dict): 32 | """Get data from source""" 33 | 34 | # Extract the path and delimiter from the configuration 35 | path = configuration["path"] 36 | delimiter = configuration["delimiter"] 37 | header = None if configuration["infer"] is False else "infer" 38 | names = configuration["names"] 39 | usecols = configuration["usecols"] 40 | skiprows = None 41 | skipfooter = configuration["skipfooter"] 42 | nrows = configuration["nrows"] 43 | lineterminator = configuration["lineterminator"] 44 | quotechar = configuration["quotechar"] 45 | encoding = configuration["encoding"] 46 | engine = configuration["engine"] 47 | 48 | if configuration["skiprows"] is not None: 49 | try: 50 | skiprows = json.loads(configuration["skiprows"]) 51 | except json.JSONDecodeError: 52 | raise ValueError("The variable is neither a list nor an integer.") 53 | 54 | if skiprows is not None and not isinstance(skiprows, (list, int)): 55 | raise ValueError("The variable is neither a list nor an integer.") 56 | 57 | # Read the CSV file using pandas with the specified delimiter 58 | df = pd.read_csv(path, 59 | delimiter=delimiter, 60 | header=header, 61 | names=names, 62 | usecols=usecols, 63 | skiprows=skiprows, 64 | skipfooter=skipfooter, 65 | nrows=nrows, 66 | lineterminator=lineterminator, 67 | quotechar=quotechar, 68 | engine=engine, 69 | encoding=encoding) 70 | return df -------------------------------------------------------------------------------- /tests/load_engine/test_spark.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.sql.types import StructType, StructField, IntegerType, StringType, TimestampType, FloatType 3 | import pytest 4 | from pyjeb import control_and_setup 5 | 
from ploosh.engines.load_engine_spark import LoadEngineSpark 6 | from ploosh.configuration import Configuration 7 | 8 | @pytest.fixture 9 | def controls(): 10 | controls = Configuration.case_definition 11 | controls = [control for control in controls if control["name"].startswith("options")] 12 | return controls 13 | 14 | 15 | @pytest.fixture 16 | def spark(): 17 | return SparkSession.builder \ 18 | .appName("ploosh") \ 19 | .master("spark://localhost:7077") \ 20 | .config("spark.executor.memory", "1g") \ 21 | .config("spark.driver.memory", "1g") \ 22 | .getOrCreate() 23 | 24 | def test_count(spark, controls): 25 | schema = StructType([ 26 | StructField("A", IntegerType(), True), 27 | StructField("B", IntegerType(), True) 28 | ]) 29 | df_data = spark.createDataFrame([(1, 4), (2, 5), (3, 6)], schema) 30 | parameters = {} 31 | options = control_and_setup(parameters, controls)["options"] 32 | 33 | load_engine = LoadEngineSpark(None, options, None) 34 | df_data = load_engine.execute(df_data) 35 | assert load_engine.count == 3 36 | 37 | def test_cast_datetime(spark, controls): 38 | schema = StructType([ 39 | StructField("A", IntegerType(), True), 40 | StructField("B", StringType(), True) 41 | ]) 42 | df_data = spark.createDataFrame([(1, "2021-01-01"), (2, "2021-01-01"), (3, "2021-01-01")], schema) 43 | parameters = { 44 | "options": { 45 | "cast": [ 46 | { 47 | "name": "B", 48 | "type": "datetime" 49 | } 50 | ] 51 | } 52 | } 53 | options = control_and_setup(parameters, controls)["options"] 54 | 55 | load_engine = LoadEngineSpark(None, options, None) 56 | df_data = load_engine.execute(df_data) 57 | 58 | assert df_data.schema["B"].dataType == TimestampType() 59 | assert df_data.schema["A"].dataType == IntegerType() 60 | 61 | def test_cast_int(spark, controls): 62 | schema = StructType([ 63 | StructField("A", IntegerType(), True), 64 | StructField("B", StringType(), True) 65 | ]) 66 | df_data = spark.createDataFrame([(1, "4"), (2, "5"), (3, "6")], schema) 67 | parameters = { 68 | "options": { 69 | "cast": [ 70 | { 71 | "name": "B", 72 | "type": "int" 73 | } 74 | ] 75 | } 76 | } 77 | options = control_and_setup(parameters, controls)["options"] 78 | 79 | load_engine = LoadEngineSpark(None, options, None) 80 | df_data = load_engine.execute(df_data) 81 | 82 | assert df_data.schema["B"].dataType == IntegerType() 83 | assert df_data.schema["A"].dataType == IntegerType() 84 | 85 | def test_cast_float(spark, controls): 86 | schema = StructType([ 87 | StructField("A", IntegerType(), True), 88 | StructField("B", StringType(), True) 89 | ]) 90 | df_data = spark.createDataFrame([(1, "4.0"), (2, "5.0"), (3, "6.0")], schema) 91 | parameters = { 92 | "options": { 93 | "cast": [ 94 | { 95 | "name": "B", 96 | "type": "float" 97 | } 98 | ] 99 | } 100 | } 101 | options = control_and_setup(parameters, controls)["options"] 102 | 103 | load_engine = LoadEngineSpark(None, options, None) 104 | df_data = load_engine.execute(df_data) 105 | 106 | assert df_data.schema["B"].dataType == FloatType() 107 | assert df_data.schema["A"].dataType == IntegerType() -------------------------------------------------------------------------------- /docs/connectors/native/sqlserver.md: -------------------------------------------------------------------------------- 1 | This connector allows to connect to a SQL Server database and execute SQL queries. 2 | 3 | # Requirements 4 | ODBC Driver 18 must be installed on the executing computer. 
5 | 6 | * For Linux, follow the instructions from [Microsoft documentation](https://learn.microsoft.com/en-us/sql/connect/odbc/linux-mac/installing-the-microsoft-odbc-driver-for-sql-server?view=sql-server-ver15&tabs=ubuntu18-install%2Calpine17-install%2Cdebian8-install%2Credhat7-13-install%2Crhel7-offline#18) 7 | * For Windows, follow the instructions from [Microsoft documentation](https://learn.microsoft.com/en-us/sql/connect/odbc/download-odbc-driver-for-sql-server?view=sql-server-ver15) 8 | * For macOS, follow the instructions from [Microsoft documentation](https://learn.microsoft.com/en-us/sql/connect/odbc/linux-mac/install-microsoft-odbc-driver-sql-server-macos?view=sql-server-ver15) 9 | 10 | # Connection configuration 11 | ## Password mode 12 | ### Definition 13 | | Name | Mandatory | Default | Description | 14 | |----------------------------|:---------:|:-----------------------------:|-------------| 15 | | mode | no | password | Change the connection mode. Can be "password" or "connection_string". The "connection_string" mode allows using a custom connection string. 16 | | hostname | yes | | Target host name 17 | | database | yes | | Target database name 18 | | username | yes | | SQL user name 19 | | password | yes | | SQL user password 20 | | port | no | 1433 | Port used by the connection 21 | | trust_server_certificate | no | false | Trust the server SSL certificate 22 | | encrypt | no | yes | Encrypt the connection 23 | | driver | no | ODBC Driver 18 for SQL Server | Driver used by the connection 24 | 25 | ⚠️ It's highly recommended to use a [parameter](/docs/configuration-custom-parameters/) to pass the password value 26 | 27 | ### Example 28 | ``` yaml 29 | mssql_example: 30 | type: mssql 31 | hostname: ploosh.database.windows.net 32 | database: SampleDB 33 | username: sa_ploosh 34 | password: $var.sa_ploosh_password 35 | ``` 36 | 37 | ## Connection string mode 38 | ### Definition 39 | | Name | Mandatory | Default | Description | 40 | |-------------------|:---------:|:-----------------------------:|-------------| 41 | | mode | no | password | Set to "connection_string" to use a custom connection string 42 | | connection_string | yes | | Connection string used to access the database.
Refer to [SQLAlchemy documentation](https://docs.sqlalchemy.org/en/20/dialects/mssql.html) to get the accepted format 43 | 44 | ### Example 45 | ``` yaml 46 | mssql_example: 47 | type: mssql 48 | mode: connection_string 49 | connection_string: "mssql+pyodbc://ploosh01:1433/SampleDB?driver=ODBC+Driver+18+for+SQL+Server&TrustServerCertificate=yes&authentication=ActiveDirectoryIntegrated" 50 | ``` 51 | 52 | # Test case configuration 53 | ## Definition 54 | | Name | Mandatory | Default | Description | 55 | |-------------------|:---------:|:-----------------------------:|-------------| 56 | | connection | yes | | The connection to use 57 | | query | yes | | The query to execute to the database 58 | 59 | ## Example 60 | ``` yaml 61 | Example SQL Server: 62 | source: 63 | connection: mssql_example 64 | type: mssql 65 | query: | 66 | select * 67 | from [rh].[employees] 68 | where [hire_date] < '2000-01-01' 69 | expected: 70 | type: csv 71 | path: data/employees_before_2000.csv 72 | ``` -------------------------------------------------------------------------------- /src/ploosh/connectors/connector_semantic_model_xmla.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=R0903 2 | """Connector to read Semantic Model from Fabric XMLA endpoint""" 3 | 4 | import pandas as pd 5 | import requests 6 | from azure.identity import ClientSecretCredential, InteractiveBrowserCredential, UsernamePasswordCredential 7 | from connectors.connector import Connector 8 | import json 9 | 10 | class ConnectorSemanticModel(Connector): 11 | """Connector to read Semantic Model using Fabric XMLA endpoint""" 12 | 13 | def __init__(self): 14 | self.name = "SEMANTIC_MODEL" 15 | self.connection_definition = [ 16 | { 17 | "name": "mode", 18 | "default": "oauth", 19 | "validset": ["oauth"] # , "token", "spn"] To add once tested 20 | }, 21 | { 22 | "name": "token", 23 | "default": None 24 | }, 25 | { 26 | "name": "tenant_id", 27 | "default": None 28 | }, 29 | { 30 | "name": "client_id", 31 | "default": None 32 | }, 33 | { 34 | "name": "client_secret", 35 | "default": None 36 | }, 37 | { 38 | "name": "dataset_id" 39 | } 40 | ] 41 | self.configuration_definition = [ 42 | { 43 | "name": "query" 44 | }, 45 | { 46 | "name": "body", 47 | "default": None 48 | } 49 | ] 50 | 51 | def get_data(self, configuration: dict, connection: dict): 52 | """Get data from source""" 53 | 54 | mode = connection["mode"] 55 | dataset_id = connection["dataset_id"] 56 | query = configuration["query"] 57 | 58 | if mode == "oauth": 59 | try: 60 | interactive_browser_credential_class = InteractiveBrowserCredential() 61 | scope = 'https://analysis.windows.net/powerbi/api/.default' 62 | access_token_class = interactive_browser_credential_class.get_token(scope) 63 | token_string = access_token_class.token 64 | except Exception as connection_error: 65 | raise ValueError(connection_error) 66 | 67 | # uses the token provided in the connection_definition 68 | elif mode == "token": 69 | token_string = connection["token"] 70 | 71 | # get a token from a registered azure app 72 | elif mode == "spn": 73 | scope = 'https://analysis.windows.net/powerbi/api/.default' 74 | tenant_id = connection["tenant_id"] 75 | client_id = connection["client_id"] 76 | client_secret = connection["client_secret"] 77 | authority = f'https://login.microsoftonline.com/' 78 | credential = ClientSecretCredential(tenant_id, client_id, client_secret, authority=authority) 79 | token = credential.get_token(scope) 80 | token_string = token.token # 
need to define header 81 | 82 | # Initialize query 83 | post_query = f'https://api.powerbi.com/v1.0/myorg/datasets/{dataset_id}/executeQueries' 84 | header = {'Authorization': f'Bearer {token_string}', 'Content-Type': 'application/json'} 85 | body = '''{ 86 | "queries": [ 87 | { 88 | "query": "%s" 89 | } 90 | ], 91 | "serializerSettings": { 92 | "includeNulls": "true" 93 | } 94 | }''' % (query) 95 | 96 | post_r = requests.post(url=post_query, data=body, headers=header) 97 | 98 | if post_r.status_code == 200: 99 | output = post_r.json() 100 | df_results = pd.DataFrame(output) 101 | df_tables = pd.DataFrame(df_results["results"][0]) 102 | df_rows = pd.DataFrame(df_tables["tables"][0]) 103 | flatten_data = df_rows.values.flatten() 104 | df = pd.json_normalize(flatten_data) # type: ignore 105 | 106 | return df 107 | 108 | elif post_r.status_code == 400: 109 | response = json.loads(post_r.text) 110 | error_code = response['error']['code'] 111 | error_message = response['error']['pbi.error']['details'][0]['detail']['value'] 112 | raise ValueError(f"DAX Execution Error : {error_code}\n{error_message}") 113 | 114 | elif post_r.status_code == 404: 115 | raise ValueError("Connection issue: PowerBIEntityNotFound") 116 | 117 | else: 118 | raise ValueError("Execution Error") 119 | -------------------------------------------------------------------------------- /src/ploosh/execute.py: -------------------------------------------------------------------------------- 1 | """Automatized Testing Framework""" 2 | 3 | import sys 4 | from colorama import Fore 5 | from pyspark.sql import SparkSession 6 | from case import StateStatistics 7 | from connectors import get_connectors 8 | from exporters import get_exporters 9 | from parameters import Parameters 10 | from configuration import Configuration 11 | from logs import Log, print_compare_state, print_summary 12 | 13 | 14 | def load_data(current_case, process_type, statistics): 15 | """Load data from source or expected""" 16 | try: 17 | # Attempt to load data for the current case 18 | current_case.load_data(process_type) 19 | return True 20 | except Exception as e: 21 | # Handle any errors that occur during data loading 22 | current_case.load_data_error(process_type, str(e)) 23 | current_case.calculate_durations() 24 | statistics.add_state(current_case.state) 25 | Log.print_error(str(e)) 26 | return False 27 | 28 | 29 | def compare_data(current_case, statistics, spark_session): 30 | """Compare data between source and expected""" 31 | try: 32 | # Compare data using Spark if both connectors are Spark-based 33 | if current_case.source.connector.is_spark and current_case.expected.connector.is_spark: 34 | current_case.compare_dataframes_with_spark(spark_session) 35 | else: 36 | # Otherwise, use a standard comparison 37 | current_case.compare_dataframes() 38 | statistics.add_state(current_case.state) 39 | return True 40 | except Exception as e: 41 | # Handle any errors that occur during data comparison 42 | current_case.compare_dataframes_error(str(e)) 43 | current_case.calculate_durations() 44 | statistics.add_state(current_case.state) 45 | Log.print_error(str(e)) 46 | return False 47 | 48 | 49 | def execute(args=None, spark_session=None): 50 | """Main function to execute test cases""" 51 | Log.init() 52 | Log.print_logo() 53 | 54 | statistics = StateStatistics() 55 | 56 | Log.print(f"{Fore.CYAN}Initialization[...]") 57 | try: 58 | # Parse command-line arguments 59 | if args is None: 60 | parameters = Parameters(sys.argv) 61 | else: 62 | parameters = Parameters(args) 
63 | 64 | # Initialize Spark session if needed 65 | if parameters.spark_mode is True and spark_session is None: 66 | Log.print("Start spark session") 67 | spark_session = SparkSession.builder \ 68 | .master("local") \ 69 | .appName("ploosh") \ 70 | .getOrCreate() 71 | 72 | # Load connectors and exporters 73 | Log.print("Load connectors") 74 | connectors = get_connectors(spark_session) 75 | Log.print("Load exporters") 76 | exporters = get_exporters() 77 | 78 | # Load configuration and test cases 79 | Log.print("Load configuration") 80 | configuration = Configuration(parameters, connectors, exporters) 81 | cases = configuration.get_cases() 82 | except Exception as e: 83 | # Handle any errors that occur during initialization 84 | Log.print_error(str(e)) 85 | sys.exit(1) 86 | 87 | Log.print(f"{Fore.CYAN}Start processing tests cases[...]") 88 | for i, case_name in enumerate(cases): 89 | current_case = cases[case_name] 90 | 91 | # Skip disabled test cases 92 | if current_case.disabled: 93 | Log.print(f"{Fore.MAGENTA}{case_name} [...] ({i + 1}/{len(cases)}) - Skipped") 94 | statistics.add_state(current_case.state) 95 | continue 96 | 97 | Log.print(f"{Fore.MAGENTA}{case_name} [...] ({i + 1}/{len(cases)}) - Started") 98 | 99 | # Load source data 100 | Log.print("Load source data") 101 | if not load_data(current_case, "source", statistics): 102 | continue 103 | 104 | # Load expected data 105 | Log.print("Load expected data") 106 | if not load_data(current_case, "expected", statistics): 107 | continue 108 | 109 | # Compare source and expected data 110 | Log.print("Compare source and expected data") 111 | if not compare_data(current_case, statistics, spark_session): 112 | continue 113 | 114 | # Print comparison state and calculate durations 115 | print_compare_state(current_case) 116 | current_case.calculate_durations() 117 | 118 | Log.print(f"{Fore.CYAN}Export results[...]") 119 | # Export test results 120 | configuration.exporter.export(cases) 121 | Log.print(f"{Fore.CYAN}Summary[...]") 122 | # Print summary of test results 123 | print_summary(cases, statistics) 124 | 125 | # Exit with error code if there were errors and failure_on_error is set 126 | if statistics.error > 0 and parameters.failure_on_error: 127 | sys.exit(1) 128 | -------------------------------------------------------------------------------- /.github/workflows/unit_tests.yml: -------------------------------------------------------------------------------- 1 | name: 'Unit tests' 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - main 7 | - develop 8 | paths: 9 | - 'src/**' 10 | - 'tests/**' 11 | jobs: 12 | tests: 13 | name: 'Execute unit tests' 14 | runs-on: ubuntu-22.04 15 | defaults: 16 | run: 17 | shell: bash 18 | steps: 19 | - name: Checkout 20 | uses: actions/checkout@v3 21 | - name: Set up Python 22 | uses: actions/setup-python@v5 23 | with: 24 | python-version: "3.12.8" 25 | - name: Setup Java 26 | uses: actions/setup-java@v2 27 | with: 28 | distribution: 'microsoft' 29 | java-version: '17.0.1' 30 | - name: Install requirements 31 | run: | 32 | pip install -r src/requirements.txt 33 | pip install pytest==8.3.3 34 | pip install pytest-timeout==2.3.1 35 | 36 | # Fix host file to avoid issues with Spark 37 | echo "127.0.0.1 localhost" | sudo tee /etc/hosts 38 | echo "::1 localhost ip6-localhost ip6-loopback" | sudo tee -a /etc/hosts 39 | echo "fe00::0 ip6-localnet" | sudo tee -a /etc/hosts 40 | echo "ff00::0 ip6-mcastprefix" | sudo tee -a /etc/hosts 41 | echo "ff02::1 ip6-allnodes" | sudo tee -a /etc/hosts 42 | echo "ff02::2 
ip6-allrouters" | sudo tee -a /etc/hosts 43 | echo "ff02::3 ip6-allhosts" | sudo tee -a /etc/hosts 44 | echo "127.0.0.1 $(hostname)" | sudo tee -a /etc/hosts 45 | - name: Run MySQL container 46 | run: | 47 | docker run --name ploosh_mysql \ 48 | -e MYSQL_ROOT_PASSWORD="${{ secrets.TEST_LOCAL_DB_PASSWORD }}" \ 49 | -e MYSQL_PASSWORD="${{ secrets.TEST_LOCAL_DB_PASSWORD }}" \ 50 | -e MYSQL_DATABASE=ploosh \ 51 | -e MYSQL_USER=ploosh \ 52 | -p 3306:3306 \ 53 | -d mysql 54 | - name: Run PostgreSQL container 55 | run: | 56 | docker run --name ploosh_postgresql \ 57 | -e POSTGRES_USER=ploosh \ 58 | -e POSTGRES_PASSWORD="${{ secrets.TEST_LOCAL_DB_PASSWORD }}" \ 59 | -e POSTGRES_DB=ploosh \ 60 | -p 5432:5432 \ 61 | -d postgres 62 | - name: Run SQL Server container 63 | run: | 64 | docker run --name ploosh_mssql \ 65 | -e ACCEPT_EULA="Y" \ 66 | -e MSSQL_SA_PASSWORD="${{ secrets.TEST_LOCAL_DB_PASSWORD }}" \ 67 | --hostname ploosh \ 68 | -p 1433:1433 \ 69 | -d \ 70 | mcr.microsoft.com/mssql/server:2022-latest 71 | 72 | - name: Run Spark master container 73 | run: | 74 | docker run -d --name ploosh-spark-master \ 75 | -e SPARK_MODE=master \ 76 | -e SPARK_MASTER_HOST=ploosh-spark-master \ 77 | -p 7077:7077 -p 8081:8080 \ 78 | -v $(pwd)/tests/.data:$(pwd)/tests/.data \ 79 | -v $(pwd)/tests/.env:$(pwd)/tests/.env \ 80 | --hostname ploosh-spark-master \ 81 | bitnami/spark:3.5.4 82 | 83 | docker exec ploosh-spark-master pip install delta-spark==3.3.0 84 | - name: Run Spark worker container 85 | run: | 86 | docker run -d --name ploosh-spark-worker \ 87 | -e SPARK_MODE=worker \ 88 | -e SPARK_MASTER_URL=spark://ploosh-spark-master:7077 \ 89 | -e SPARK_WORKER_MEMORY=2g \ 90 | -e SPARK_WORKER_CORES=1 \ 91 | -v $(pwd)/tests/.data:$(pwd)/tests/.data \ 92 | -v $(pwd)/tests/.env:$(pwd)/tests/.env \ 93 | --link ploosh-spark-master:ploosh-spark-master \ 94 | bitnami/spark:3.5.4 95 | 96 | docker exec ploosh-spark-worker pip install delta-spark==3.3.0 97 | - name: Feed databases 98 | run: | 99 | sleep 30 # wait until all services are up 100 | 101 | mysql -h 127.0.0.1 -u ploosh -p'${{ secrets.TEST_LOCAL_DB_PASSWORD }}' < tests/.env/mysql/setup.sql 102 | 103 | export PGPASSWORD='${{ secrets.TEST_LOCAL_DB_PASSWORD }}'; 104 | psql -h 127.0.0.1 -U ploosh -d ploosh -f tests/.env/postgresql/setup.sql 105 | 106 | /opt/mssql-tools/bin/sqlcmd -S localhost -U sa -P "${{ secrets.TEST_LOCAL_DB_PASSWORD }}" -i tests/.env/mssql/setup.sql 107 | 108 | spark_setup_file=$(pwd)/tests/.env/spark/setup.sql 109 | spark_setup_file_tmp=$(pwd)/tests/.env/spark/setup_tmp.sql 110 | sed "s|{{pwd}}|$(pwd)|g" $spark_setup_file > $spark_setup_file_tmp 111 | spark-sql -f$spark_setup_file_tmp 112 | - name: Execute tests 113 | run: | 114 | export TEST_DB_PASSWORD="${{ secrets.TEST_LOCAL_DB_PASSWORD }}" 115 | pytest -rA ./tests 116 | 117 | -------------------------------------------------------------------------------- /docs/home.md: -------------------------------------------------------------------------------- 1 | # What is Ploosh? 2 | 3 | Ploosh is yaml based framework used to automatize the testing process in data projects. It is designed to be simple to use and to be easily integrated in any CI/CD pipelines and it is also designed to be easily extended to support new data connectors. 
4 | 5 | ## Connectors 6 | | Type | Native connectors | Spark connectors 7 | |-----------|:----------|:----------| 8 | | Databases | [![Big Query](https://ploosh.io/wp-content/uploads/2025/01/bigquery.png)](/docs/docs/connectors-native-big-query/) [![Databricks](https://ploosh.io/wp-content/uploads/2025/01/databricks.png)](/docs/connectors-native-databricks) [![Snowflake](https://ploosh.io/wp-content/uploads/2025/01/snowflake.png)](/docs/connectors-native-snowflake) [![Sql Server](http://ploosh.io/wp-content/uploads/2025/01/mssql.png)](SQL-Server) [![PostgreSQL](https://ploosh.io/wp-content/uploads/2025/01/postgresql.png)](/docs/connectors-native-postgreSQL) [![MySQL](https://ploosh.io/wp-content/uploads/2025/01/mysql.png)](/docs/connectors-native-mysql) | [![SQL](https://ploosh.io/wp-content/uploads/2025/01/sql.png)](/docs/connectors-spark-sql) 9 | | Files | [![CSV](http://ploosh.io/wp-content/uploads/2025/01/csv.png)](/docs/connectors-native-csv) [![Excel](http://ploosh.io/wp-content/uploads/2025/01/excel.png)](/docs/connectors-native-excel) [![Parquet](http://ploosh.io/wp-content/uploads/2025/01/parquet.png)](/docs/connectors-native-parquet) | [![Delta](http://ploosh.io/wp-content/uploads/2025/01/delta.png)](/docs/connectors-spark-delta) [![CSV](http://ploosh.io/wp-content/uploads/2025/01/csv.png)](/docs/connectors-spark-csv) 10 | | Others | [![CSV](http://ploosh.io/wp-content/uploads/2025/01/empty.png)](/docs/connectors-native-empty) | [![Empty](http://ploosh.io/wp-content/uploads/2025/01/empty.png)](/docs/connectors-spark-empty) 11 | | Not yet but soon | ![JSON](http://ploosh.io/wp-content/uploads/2025/01/json.png) ![Oracle](http://ploosh.io/wp-content/uploads/2025/01/oracle.png) | ![Parquet](http://ploosh.io/wp-content/uploads/2025/01/parquet.png) 12 | 13 | # Get started 14 | 15 | ## Steps 16 | 1. Install Ploosh package 17 | 2. Setup connection file 18 | 3. Setup test cases 19 | 4. Run tests 20 | 4. Get results 21 | 22 | ## Install Ploosh package 23 | Install from [PyPi](https://pypi.org/project/ploosh/) package manager: 24 | ``` shell 25 | pip install ploosh 26 | ``` 27 | 28 | ## Setup connection file 29 | Add a yaml file with name "connections.yml" and following content: 30 | ``` yaml 31 | mssql_getstarted: 32 | type: mysql 33 | hostname: my_server_name.database.windows.net 34 | database: my_database_name 35 | username: my_user_name 36 | // using a parameter is highly recommended 37 | password: $var.my_sql_server_password 38 | ``` 39 | 40 | ## Setup test cases 41 | Add a folder "test_cases" with a yaml file with any name. In this example "example.yaml". 
Add the following content: 42 | 43 | ``` yaml 44 | Test aggregated data: 45 | options: 46 | sort: 47 | - gender 48 | - domain 49 | source: 50 | connection: mysql_demo 51 | type: mysql 52 | query: | 53 | select gender, right(email, length(email) - position("@" in email)) as domain, count(*) as count 54 | from users 55 | group by gender, domain 56 | expected: 57 | type: csv 58 | path: ./data/test_target_agg.csv 59 | 60 | Test invalid data: 61 | source: 62 | connection: mysql_demo 63 | type: mysql 64 | query: | 65 | select id, first_name, last_name, email, gender, ip_address 66 | from users 67 | where email like "%%.gov" 68 | expected: 69 | type: empty 70 | ``` 71 | 72 | ## Run tests 73 | ``` shell 74 | ploosh --connections "connections.yml" --cases "test_cases" --export "JSON" --p_my_sql_server_password "mypassword" 75 | ``` 76 | 77 | ![Execution result](http://ploosh.io/wp-content/uploads/2024/09/image.png) 78 | 79 | ## Test results 80 | 81 | ``` json 82 | [ 83 | { 84 | "name": "Test aggregated data", 85 | "state": "passed", 86 | "source": { 87 | "start": "2024-02-05T17:08:36Z", 88 | "end": "2024-02-05T17:08:36Z", 89 | "duration": 0.0032982 90 | }, 91 | "expected": { 92 | "start": "2024-02-05T17:08:36Z", 93 | "end": "2024-02-05T17:08:36Z", 94 | "duration": 6.0933333333333335e-05 95 | }, 96 | "compare": { 97 | "start": "2024-02-05T17:08:36Z", 98 | "end": "2024-02-05T17:08:36Z", 99 | "duration": 0.00046468333333333334 100 | } 101 | }, 102 | { 103 | "name": "Test invalid data", 104 | "state": "failed", 105 | "source": { 106 | "start": "2024-02-05T17:08:36Z", 107 | "end": "2024-02-05T17:08:36Z", 108 | "duration": 0.00178865 109 | }, 110 | "expected": { 111 | "start": "2024-02-05T17:08:36Z", 112 | "end": "2024-02-05T17:08:36Z", 113 | "duration": 1.49e-05 114 | }, 115 | "compare": { 116 | "start": "2024-02-05T17:08:36Z", 117 | "end": "2024-02-05T17:08:36Z", 118 | "duration": 1.8333333333333333e-07 119 | }, 120 | "error": { 121 | "type": "count", 122 | "message": "The count in source dataset (55) is different than the count in the expected dataset (0)" 123 | } 124 | } 125 | ] 126 | ``` 127 | 128 | # Run with spark 129 | It's possible to run the tests with spark. To do that, you need to install the spark package or use a platform that already has it installed like Databricks or Microsoft Fabric. 130 | 131 | See the [Spark connector](/docs/configuration-spark-mode/) for more information. -------------------------------------------------------------------------------- /src/ploosh/connectors/connector_analysis_services.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from connectors.connector import Connector 3 | from azure.identity import ClientSecretCredential 4 | from sys import path 5 | from pathlib import Path 6 | 7 | class ConnectorAnalysisServices(Connector): 8 | """Connector to read Analysis Services Model using ADOMD""" 9 | 10 | def __init__(self): 11 | ## ADOMD.dll ## 12 | # Using dll file put into src\ploosh\connectors\modules 13 | # The file need to be packaged to work here 14 | # need to use the absPath (!!! check if it works on linux !!!) 15 | root = Path(r"\Program Files\Microsoft.NET\ADOMD.NET\\") 16 | adomd_path = str(max((root).iterdir())) 17 | path.append(adomd_path) 18 | #absPath = os.path.dirname(__file__) 19 | #path.append(absPath + '\\modules') 20 | 21 | # NEED to pip install pythonnet to make pyadomd work !! 
22 | global Pyadomd 23 | from pyadomd import Pyadomd 24 | ## ADOMD.dll -- END ## 25 | 26 | self.name = "ANALYSIS_SERVICES" 27 | self.connection_definition = [ 28 | { 29 | "name": "mode", 30 | "default": "oauth", 31 | "validset": ["oauth", "pbix"] # , "token", "credentials", "spn"] 32 | }, 33 | { 34 | "name": "token", 35 | "default": None 36 | }, 37 | { 38 | "name": "username", 39 | "default": None 40 | }, 41 | { 42 | "name": "password", 43 | "default": None 44 | }, 45 | { 46 | "name": "tenant_id", 47 | "default": None 48 | }, 49 | { 50 | "name": "client_id", 51 | "default": None 52 | }, 53 | { 54 | "name": "client_secret", 55 | "default": None 56 | }, 57 | { 58 | "name": "scope", 59 | "default": 'https://analysis.windows.net/powerbi/api/.default' 60 | }, 61 | { 62 | "name": "dataset_id" 63 | }, 64 | { 65 | "name": "server" 66 | } 67 | ] 68 | self.configuration_definition = [ 69 | { 70 | "name": "query" 71 | } 72 | ] 73 | 74 | def get_data(self, configuration: dict, connection: dict): 75 | """Get data from source""" 76 | 77 | mode = connection["mode"] 78 | 79 | server = connection["server"] 80 | dataset_id = connection["dataset_id"] # For local .pbix --> Dataset ID: in DAX Studio, right click to model name and choose "copy Database ID" 81 | 82 | query = configuration["query"] # DAX Query 83 | 84 | # will open a login page in browser (if local AS instance, will connect automatically) 85 | if mode == "oauth": 86 | connection_string = f'Provider=MSOLAP;Data Source={server};Catalog={dataset_id};' 87 | 88 | # will open a login page in browser (if local AS instance, will connect automatically) 89 | elif mode == "pbix": 90 | connection_string = f'Provider=MSOLAP;Data Source={server};Catalog={dataset_id};' 91 | 92 | # uses the token provided in the connection_definition 93 | elif mode == "token": 94 | token = connection["token"] 95 | connection_string = f'Provider=MSOLAP;Data Source={server};Catalog={dataset_id};User Id=;Password={token};Impersonation Level=Impersonate;' 96 | 97 | # get a token from a registered azure app 98 | elif mode == "spn": 99 | scope = connection["scope"] 100 | tenant_id = connection["tenant_id"] 101 | client_id = connection["client_id"] 102 | client_secret = connection["client_secret"] 103 | authority = f'https://login.microsoftonline.com/' 104 | credential = ClientSecretCredential(tenant_id, client_id, client_secret) # , authority=authority) 105 | token = credential.get_token(scope) 106 | token_string = token.token 107 | connection_string = f'Provider=MSOLAP;Data Source={server};Catalog={dataset_id};User Id=;Password={token_string};Impersonation Level=Impersonate;' 108 | 109 | # uses username and password 110 | elif mode == "credentials": 111 | username = connection["username"] 112 | password = connection["password"] 113 | connection_string = f'Provider=MSOLAP;Data Source={server};Catalog={dataset_id};User Id={username};Password={password};' 114 | 115 | # Create and open connection to AS instance 116 | con = Pyadomd(connection_string) 117 | try: 118 | con.open() # Open the connection 119 | except: 120 | raise ValueError("Can't connect to the AS Instance") 121 | 122 | # execute DAX query 123 | with con.cursor() as cur: 124 | try: 125 | cur.execute(query) 126 | result = cur.fetchone() 127 | column_name = [i.name for i in cur.description] 128 | df = pd.DataFrame(result, columns=column_name) 129 | 130 | # Proactively close connection to AS instance 131 | con.close() 132 | 133 | return df 134 | except Exception as query_error: 135 | error_message = str(query_error) 136 | # Keep 
only error message without Technical Details 137 | error_summary = error_message.split("Technical Details")[0].strip().split("\r\n at")[0].strip() 138 | raise Exception(f"Erreur lors de l'exécution de la requête :\n{str(error_summary)}") 139 | -------------------------------------------------------------------------------- /tests/.env/csv/sales_with_cr.csv: -------------------------------------------------------------------------------- 1 | sale_id,seller_name,card_name,card_rarity,card_condition,price,quantity,sale_date,card_set,buyer_name,transaction_status 1,John Doe,Charizard,Rare,Mint,250.00,1,2024-11-01,Base Set,Jane Smith,Completed 2,Jane Smith,Blastoise,Holo Rare,Excellent,180.00,1,2024-11-02,Base Set,Alex Johnson,Completed 3,Alex Johnson,Pikachu,Common,Near Mint,15.00,4,2024-11-03,Jungle,Chris Brown,Completed 4,Chris Brown,Dragonite,Ultra Rare,Good,300.00,1,2024-11-04,Fossil,Emma Green,Pending 5,Emma Green,Zapdos,Holo Rare,Mint,150.00,1,2024-11-05,Fossil,Sarah White,Completed 6,Sarah White,Venusaur,Rare,Good,120.00,2,2024-11-06,Base Set,John Doe,Completed 7,Liam Brown,Moltres,Rare,Near Mint,140.00,1,2024-11-07,Fossil,Jane Smith,Completed 8,Olivia Taylor,Articuno,Rare,Excellent,100.00,1,2024-11-08,Fossil,Alex Johnson,Cancelled 9,Sophia Wilson,Eevee,Common,Mint,20.00,3,2024-11-09,Jungle,Chris Brown,Completed 10,Mason Martinez,Jolteon,Rare,Near Mint,80.00,1,2024-11-10,Jungle,Emma Green,Completed 11,Ethan White,Flareon,Rare,Excellent,85.00,1,2024-11-11,Jungle,Sarah White,Completed 12,Lucas Harris,Vaporeon,Rare,Mint,100.00,1,2024-11-12,Jungle,Liam Brown,Completed 13,Amelia Clark,Machamp,Holo Rare,Mint,75.00,1,2024-11-13,Base Set,Olivia Taylor,Completed 14,Harper Lewis,Gengar,Rare,Mint,80.00,1,2024-11-14,Fossil,Sophia Wilson,Completed 15,Evelyn Walker,Snorlax,Rare,Mint,90.00,1,2024-11-15,Jungle,Mason Martinez,Completed 16,Henry King,Charizard,Rare,Excellent,250.00,1,2024-11-16,Base Set,Ethan White,Completed 17,Isabella Moore,Mewtwo,Holo Rare,Near Mint,220.00,1,2024-11-17,Fossil,Lucas Harris,Completed 18,Sophia Wilson,Articuno,Rare,Mint,120.00,1,2024-11-18,Fossil,Amelia Clark,Completed 19,Liam Brown,Pikachu,Common,Good,10.00,5,2024-11-19,Jungle,Harper Lewis,Pending 20,Emma Green,Moltres,Rare,Excellent,140.00,1,2024-11-20,Fossil,Evelyn Walker,Completed 21,Chris Brown,Blastoise,Holo Rare,Mint,180.00,1,2024-11-21,Base Set,Henry King,Completed 22,Alex Johnson,Eevee,Common,Mint,25.00,2,2024-11-22,Jungle,Isabella Moore,Completed 23,John Doe,Dragonite,Ultra Rare,Near Mint,320.00,1,2024-11-23,Base Set,Jane Smith,Pending 24,Jane Smith,Machamp,Holo Rare,Good,70.00,1,2024-11-24,Base Set,Alex Johnson,Completed 25,Sarah White,Vaporeon,Rare,Excellent,100.00,1,2024-11-25,Jungle,Chris Brown,Cancelled 26,Olivia Taylor,Jolteon,Rare,Mint,85.00,1,2024-11-26,Jungle,Emma Green,Completed 27,Henry King,Zapdos,Holo Rare,Good,140.00,1,2024-11-27,Fossil,Sophia Wilson,Completed 28,Ethan White,Gengar,Rare,Excellent,75.00,1,2024-11-28,Fossil,Mason Martinez,Completed 29,Amelia Clark,Mewtwo,Holo Rare,Mint,230.00,1,2024-11-29,Fossil,Ethan White,Completed 30,Lucas Harris,Charizard,Rare,Near Mint,250.00,1,2024-11-30,Base Set,Lucas Harris,Completed 31,Harper Lewis,Snorlax,Rare,Excellent,90.00,1,2024-12-01,Jungle,Liam Brown,Completed 32,Sophia Wilson,Flareon,Rare,Good,85.00,1,2024-12-02,Jungle,Isabella Moore,Pending 33,Mason Martinez,Articuno,Rare,Mint,120.00,1,2024-12-03,Fossil,Harper Lewis,Completed 34,Emma Green,Moltres,Holo Rare,Mint,140.00,1,2024-12-04,Fossil,Henry King,Completed 35,John 
Doe,Pikachu,Common,Mint,15.00,3,2024-12-05,Jungle,Chris Brown,Completed 36,Jane Smith,Dragonite,Ultra Rare,Excellent,300.00,1,2024-12-06,Base Set,Sophia Wilson,Completed 37,Alex Johnson,Machamp,Holo Rare,Mint,75.00,1,2024-12-07,Base Set,Emma Green,Completed 38,Chris Brown,Vaporeon,Rare,Good,90.00,1,2024-12-08,Jungle,John Doe,Completed 39,Olivia Taylor,Jolteon,Rare,Near Mint,80.00,1,2024-12-09,Jungle,Jane Smith,Pending 40,Ethan White,Gengar,Rare,Mint,85.00,1,2024-12-10,Fossil,Liam Brown,Completed 41,Amelia Clark,Eevee,Common,Excellent,25.00,3,2024-12-11,Jungle,Olivia Taylor,Completed 42,Sophia Wilson,Charizard,Rare,Good,220.00,1,2024-12-12,Base Set,Alex Johnson,Cancelled 43,Lucas Harris,Zapdos,Holo Rare,Mint,150.00,1,2024-12-13,Fossil,Emma Green,Completed 44,Harper Lewis,Mewtwo,Ultra Rare,Near Mint,200.00,1,2024-12-14,Fossil,Sarah White,Completed 45,Henry King,Lapras,Rare,Mint,95.00,1,2024-12-16,Fossil,Sophia Wilson,Completed 46,Ethan White,Ditto,Rare,Excellent,85.00,1,2024-12-17,Fossil,Amelia Clark,Completed 47,Sarah White,Bulbasaur,Common,Near Mint,12.00,5,2024-12-18,Base Set,Lucas Harris,Completed 48,Emma Green,Charmander,Common,Mint,15.00,4,2024-12-19,Base Set,Chris Brown,Pending 49,Jane Smith,Squirtle,Common,Good,10.00,6,2024-12-20,Base Set,Mason Martinez,Completed 50,John Doe,Jigglypuff,Common,Excellent,8.00,10,2024-12-21,Jungle,Liam Brown,Completed 51,Olivia Taylor,Clefairy,Rare,Mint,50.00,1,2024-12-22,Base Set,Ethan White,Completed 52,Lucas Harris,Nidoking,Holo Rare,Good,125.00,1,2024-12-23,Base Set,John Doe,Cancelled 53,Alex Johnson,Hitmonchan,Holo Rare,Near Mint,100.00,1,2024-12-24,Base Set,Jane Smith,Completed 54,Sophia Wilson,Kangaskhan,Rare,Excellent,80.00,1,2024-12-25,Jungle,Henry King,Completed 55,Chris Brown,Scyther,Rare,Mint,85.00,1,2024-12-26,Jungle,Emma Green,Completed 56,Harper Lewis,Pinsir,Rare,Near Mint,70.00,1,2024-12-27,Jungle,Olivia Taylor,Completed 57,Mason Martinez,Aerodactyl,Rare,Good,100.00,1,2024-12-28,Fossil,Sarah White,Completed 58,Liam Brown,Kabutops,Rare,Mint,105.00,1,2024-12-29,Fossil,Alex Johnson,Completed 59,Evelyn Walker,Magikarp,Common,Excellent,5.00,20,2024-12-30,Base Set,Lucas Harris,Completed 60,Amelia Clark,Gyarados,Holo Rare,Near Mint,150.00,1,2024-12-31,Base Set,Sophia Wilson,Pending 61,Sarah White,Ditto,Rare,Mint,90.00,1,2025-01-01,Fossil,Henry King,Completed 62,Emma Green,Pidgeot,Rare,Good,70.00,1,2025-01-02,Jungle,Chris Brown,Completed 63,John Doe,Electabuzz,Rare,Excellent,60.00,2,2025-01-03,Base Set,Liam Brown,Completed 64,Jane Smith,Magmar,Rare,Mint,55.00,1,2025-01-04,Fossil,Mason Martinez,Completed 65,Olivia Taylor,Jynx,Common,Excellent,30.00,3,2025-01-05,Base Set,Ethan White,Completed 66,Alex Johnson,Alakazam,Holo Rare,Mint,175.00,1,2025-01-06,Base Set,Jane Smith,Completed 67,Sophia Wilson,Chansey,Holo Rare,Good,100.00,1,2025-01-07,Base Set,Olivia Taylor,Completed 68,Chris Brown,Geodude,Common,Near Mint,5.00,12,2025-01-08,Base Set,John Doe,Completed 69,Henry King,Grimer,Common,Excellent,7.00,8,2025-01-09,Fossil,Emma Green,Completed 70,Ethan White,Muk,Rare,Mint,85.00,1,2025-01-10,Fossil,Sophia Wilson,Completed 71,Harper Lewis,Rhydon,Rare,Good,75.00,1,2025-01-11,Jungle,Chris Brown,Cancelled 72,Mason Martinez,Tauros,Common,Near Mint,10.00,10,2025-01-12,Jungle,Alex Johnson,Completed 73,Evelyn Walker,Exeggutor,Rare,Mint,65.00,1,2025-01-13,Jungle,Sarah White,Completed 74,Lucas Harris,Venonat,Common,Excellent,5.00,15,2025-01-14,Jungle,Harper Lewis,Pending -------------------------------------------------------------------------------- 
/src/ploosh/logs.py: -------------------------------------------------------------------------------- 1 | """Module for log functions""" 2 | 3 | import math 4 | import os 5 | import re 6 | import shutil 7 | from datetime import datetime 8 | from colorama import Fore, Style 9 | from version import PLOOSH_VERSION 10 | 11 | class Log: 12 | """Log class contain all functions to log""" 13 | 14 | @staticmethod 15 | def init(): 16 | """Initialize log settings and create log directory""" 17 | Log.LEVELS_PRINT = { 18 | "INFO": Fore.GREEN, 19 | "WARN": Fore.YELLOW, 20 | "ERRO": Fore.RED, 21 | } 22 | 23 | # Get terminal size and set console log space 24 | Log.CONSOLE_WIDTH = shutil.get_terminal_size(fallback=(120, 50)).columns 25 | Log.CONSOLE_WIDTH_GAP = 29 26 | Log.CONSOLE_LOG_SPACE = Log.CONSOLE_WIDTH - Log.CONSOLE_WIDTH_GAP 27 | 28 | # Set log folder and log file path 29 | Log.LOGS_FOLDER = "./logs" 30 | Log.LOGS_PATH = f"{Log.LOGS_FOLDER}/ploosh_{datetime.now().strftime('%Y%m%d%H%M%S')}.log" 31 | 32 | # Create log folder if it doesn't exist 33 | os.makedirs(Log.LOGS_FOLDER, exist_ok=True) 34 | 35 | @staticmethod 36 | def print(message: str, level: str = "INFO", filler: str = "."): 37 | """Print a message with all metadata informations""" 38 | date_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") 39 | 40 | # Determine the number of filler characters needed 41 | count_filler = 1 if message.count("[...]") == 0 else message.count("[...]") 42 | 43 | # Remove ANSI escape sequences from the message 44 | raw_message = re.sub(r"[^\w ]*[\d]+m", "", message) 45 | print_length = len(raw_message) 46 | feed_characters = filler * math.trunc( 47 | (Log.CONSOLE_LOG_SPACE - print_length + (5 * count_filler)) / count_filler 48 | ) 49 | message = message.replace("[...]", feed_characters) 50 | 51 | rows_to_print = [message] 52 | # Disable coloration for multi-line messages 53 | if print_length > Log.CONSOLE_LOG_SPACE or "\n" in message: 54 | rows_to_print = [] 55 | message_rows = raw_message.split("\n") 56 | for row in message_rows: 57 | rows_count = math.ceil(len(row) / Log.CONSOLE_LOG_SPACE) 58 | for i in range(0, rows_count): 59 | start = i * Log.CONSOLE_LOG_SPACE 60 | end = (i + 1) * Log.CONSOLE_LOG_SPACE 61 | rows_to_print.append(row[start:end]) 62 | 63 | # Format each row with date, time, and log level 64 | rows_to_print = [ 65 | f"{Fore.CYAN}[{date_time}] {Log.LEVELS_PRINT[level]}[{level}]{Style.RESET_ALL} {row}{Style.RESET_ALL}" 66 | for row in rows_to_print 67 | ] 68 | 69 | # Print each row to the console 70 | for row in rows_to_print: 71 | print(row) 72 | 73 | # Write the log to the log file 74 | with open(Log.LOGS_PATH, "a", encoding="UTF-8") as f: 75 | log_text = "\r\n".join(rows_to_print) + "\r\n" 76 | 77 | # Remove color codes from the log text 78 | for key in Fore.__dict__: 79 | log_text = log_text.replace(Fore.__dict__[key], "") 80 | 81 | for key in Style.__dict__: 82 | log_text = log_text.replace(Style.__dict__[key], "") 83 | 84 | f.write(log_text) 85 | 86 | @staticmethod 87 | def print_error(message: str): 88 | """Print an error message with all metadata informations""" 89 | Log.print(message, "ERRO") 90 | 91 | @staticmethod 92 | def print_warning(message: str): 93 | """Print a warning message with all metadata informations""" 94 | Log.print(message, "WARN") 95 | 96 | @staticmethod 97 | def print_logo(): 98 | """Print the ATF logo""" 99 | Log.print(r"[...]", filler="~") 100 | Log.print(r"[...] 
.__ .__ [...]", filler=" ") 101 | Log.print(r"[...]______ | | ____ ____ _____| |__ [...]", filler=" ") 102 | Log.print(r"[...]\____ \| | / _ \ / _ \/ ___| | \ [...]", filler=" ") 103 | Log.print(r"[...]| |_> | |_( <_> ( <_> \___ \| Y \[...]", filler=" ") 104 | Log.print(r"[...]| __/|____/\____/ \____/____ |___| /[...]", filler=" ") 105 | Log.print(r"[...]|__| \/ \/ [...]", filler=" ") 106 | Log.print(f"[...]Automatized Testing Framework (v {PLOOSH_VERSION})[...]", filler=" ") 107 | Log.print(r"[...]", filler=" ") 108 | Log.print(r"[...]https://github.com/CSharplie/ploosh", filler=" ") 109 | Log.print(r"[...]", filler="~") 110 | 111 | 112 | def print_compare_state(current_case): 113 | """Print the comparison state of a test case""" 114 | 115 | state = current_case.state.upper() 116 | state_matrix = { 117 | "FAILED": {"color": Fore.YELLOW, "function": Log.print_warning}, 118 | "ERROR": {"color": Fore.RED, "function": Log.print_error}, 119 | "PASSED": {"color": Fore.GREEN, "function": Log.print}, 120 | } 121 | state_item = state_matrix[state] 122 | state_item["function"](f"Compare state: {state_item['color']}{state}") 123 | 124 | if state != "PASSED": 125 | state_item["function"](f"Error type : {state_item['color']}{current_case.error_type.upper()}") 126 | state_item["function"](f"Error message: {state_item['color']}{current_case.error_message}") 127 | 128 | 129 | def print_summary(cases, statistics): 130 | """Print a summary of test case results""" 131 | for case_name in cases: 132 | state = cases[case_name].state 133 | color = Fore.CYAN 134 | 135 | if state == "error": 136 | color = Fore.RED 137 | if state == "passed": 138 | color = Fore.GREEN 139 | if state == "failed": 140 | color = Fore.YELLOW 141 | 142 | if state == "notExecuted": 143 | state = "skipped" 144 | 145 | Log.print(f"{case_name} [...] 
{color}{state.upper()}") 146 | 147 | # Print overall statistics 148 | message = f"passed: {Fore.GREEN}{statistics.passed}{Style.RESET_ALL}, " 149 | message += f"failed: {Fore.YELLOW}{statistics.failed}{Style.RESET_ALL}, " 150 | message += f"error: {Fore.RED}{statistics.error}{Style.RESET_ALL}, " 151 | message += f"skipped: {Fore.CYAN}{statistics.not_executed}{Style.RESET_ALL}" 152 | 153 | Log.print(message) 154 | -------------------------------------------------------------------------------- /tests/.data/sales.csv: -------------------------------------------------------------------------------- 1 | sale_id,seller_name,card_name,card_rarity,card_condition,price,quantity,sale_date,card_set,buyer_name,transaction_status 2 | 1,John Doe,Charizard,Rare,Mint,250.00,1,2024-11-01,Base Set,Jane Smith,Completed 3 | 2,Jane Smith,Blastoise,Holo Rare,Excellent,180.00,1,2024-11-02,Base Set,Alex Johnson,Completed 4 | 3,Alex Johnson,Pikachu,Common,Near Mint,15.00,4,2024-11-03,Jungle,Chris Brown,Completed 5 | 4,Chris Brown,Dragonite,Ultra Rare,Good,300.00,1,2024-11-04,Fossil,Emma Green,Pending 6 | 5,Emma Green,Zapdos,Holo Rare,Mint,150.00,1,2024-11-05,Fossil,Sarah White,Completed 7 | 6,Sarah White,Venusaur,Rare,Good,120.00,2,2024-11-06,Base Set,John Doe,Completed 8 | 7,Liam Brown,Moltres,Rare,Near Mint,140.00,1,2024-11-07,Fossil,Jane Smith,Completed 9 | 8,Olivia Taylor,Articuno,Rare,Excellent,100.00,1,2024-11-08,Fossil,Alex Johnson,Cancelled 10 | 9,Sophia Wilson,Eevee,Common,Mint,20.00,3,2024-11-09,Jungle,Chris Brown,Completed 11 | 10,Mason Martinez,Jolteon,Rare,Near Mint,80.00,1,2024-11-10,Jungle,Emma Green,Completed 12 | 11,Ethan White,Flareon,Rare,Excellent,85.00,1,2024-11-11,Jungle,Sarah White,Completed 13 | 12,Lucas Harris,Vaporeon,Rare,Mint,100.00,1,2024-11-12,Jungle,Liam Brown,Completed 14 | 13,Amelia Clark,Machamp,Holo Rare,Mint,75.00,1,2024-11-13,Base Set,Olivia Taylor,Completed 15 | 14,Harper Lewis,Gengar,Rare,Mint,80.00,1,2024-11-14,Fossil,Sophia Wilson,Completed 16 | 15,Evelyn Walker,Snorlax,Rare,Mint,90.00,1,2024-11-15,Jungle,Mason Martinez,Completed 17 | 16,Henry King,Charizard,Rare,Excellent,250.00,1,2024-11-16,Base Set,Ethan White,Completed 18 | 17,Isabella Moore,Mewtwo,Holo Rare,Near Mint,220.00,1,2024-11-17,Fossil,Lucas Harris,Completed 19 | 18,Sophia Wilson,Articuno,Rare,Mint,120.00,1,2024-11-18,Fossil,Amelia Clark,Completed 20 | 19,Liam Brown,Pikachu,Common,Good,10.00,5,2024-11-19,Jungle,Harper Lewis,Pending 21 | 20,Emma Green,Moltres,Rare,Excellent,140.00,1,2024-11-20,Fossil,Evelyn Walker,Completed 22 | 21,Chris Brown,Blastoise,Holo Rare,Mint,180.00,1,2024-11-21,Base Set,Henry King,Completed 23 | 22,Alex Johnson,Eevee,Common,Mint,25.00,2,2024-11-22,Jungle,Isabella Moore,Completed 24 | 23,John Doe,Dragonite,Ultra Rare,Near Mint,320.00,1,2024-11-23,Base Set,Jane Smith,Pending 25 | 24,Jane Smith,Machamp,Holo Rare,Good,70.00,1,2024-11-24,Base Set,Alex Johnson,Completed 26 | 25,Sarah White,Vaporeon,Rare,Excellent,100.00,1,2024-11-25,Jungle,Chris Brown,Cancelled 27 | 26,Olivia Taylor,Jolteon,Rare,Mint,85.00,1,2024-11-26,Jungle,Emma Green,Completed 28 | 27,Henry King,Zapdos,Holo Rare,Good,140.00,1,2024-11-27,Fossil,Sophia Wilson,Completed 29 | 28,Ethan White,Gengar,Rare,Excellent,75.00,1,2024-11-28,Fossil,Mason Martinez,Completed 30 | 29,Amelia Clark,Mewtwo,Holo Rare,Mint,230.00,1,2024-11-29,Fossil,Ethan White,Completed 31 | 30,Lucas Harris,Charizard,Rare,Near Mint,250.00,1,2024-11-30,Base Set,Lucas Harris,Completed 32 | 31,Harper Lewis,Snorlax,Rare,Excellent,90.00,1,2024-12-01,Jungle,Liam 
Brown,Completed 33 | 32,Sophia Wilson,Flareon,Rare,Good,85.00,1,2024-12-02,Jungle,Isabella Moore,Pending 34 | 33,Mason Martinez,Articuno,Rare,Mint,120.00,1,2024-12-03,Fossil,Harper Lewis,Completed 35 | 34,Emma Green,Moltres,Holo Rare,Mint,140.00,1,2024-12-04,Fossil,Henry King,Completed 36 | 35,John Doe,Pikachu,Common,Mint,15.00,3,2024-12-05,Jungle,Chris Brown,Completed 37 | 36,Jane Smith,Dragonite,Ultra Rare,Excellent,300.00,1,2024-12-06,Base Set,Sophia Wilson,Completed 38 | 37,Alex Johnson,Machamp,Holo Rare,Mint,75.00,1,2024-12-07,Base Set,Emma Green,Completed 39 | 38,Chris Brown,Vaporeon,Rare,Good,90.00,1,2024-12-08,Jungle,John Doe,Completed 40 | 39,Olivia Taylor,Jolteon,Rare,Near Mint,80.00,1,2024-12-09,Jungle,Jane Smith,Pending 41 | 40,Ethan White,Gengar,Rare,Mint,85.00,1,2024-12-10,Fossil,Liam Brown,Completed 42 | 41,Amelia Clark,Eevee,Common,Excellent,25.00,3,2024-12-11,Jungle,Olivia Taylor,Completed 43 | 42,Sophia Wilson,Charizard,Rare,Good,220.00,1,2024-12-12,Base Set,Alex Johnson,Cancelled 44 | 43,Lucas Harris,Zapdos,Holo Rare,Mint,150.00,1,2024-12-13,Fossil,Emma Green,Completed 45 | 44,Harper Lewis,Mewtwo,Ultra Rare,Near Mint,200.00,1,2024-12-14,Fossil,Sarah White,Completed 46 | 45,Henry King,Lapras,Rare,Mint,95.00,1,2024-12-16,Fossil,Sophia Wilson,Completed 47 | 46,Ethan White,Ditto,Rare,Excellent,85.00,1,2024-12-17,Fossil,Amelia Clark,Completed 48 | 47,Sarah White,Bulbasaur,Common,Near Mint,12.00,5,2024-12-18,Base Set,Lucas Harris,Completed 49 | 48,Emma Green,Charmander,Common,Mint,15.00,4,2024-12-19,Base Set,Chris Brown,Pending 50 | 49,Jane Smith,Squirtle,Common,Good,10.00,6,2024-12-20,Base Set,Mason Martinez,Completed 51 | 50,John Doe,Jigglypuff,Common,Excellent,8.00,10,2024-12-21,Jungle,Liam Brown,Completed 52 | 51,Olivia Taylor,Clefairy,Rare,Mint,50.00,1,2024-12-22,Base Set,Ethan White,Completed 53 | 52,Lucas Harris,Nidoking,Holo Rare,Good,125.00,1,2024-12-23,Base Set,John Doe,Cancelled 54 | 53,Alex Johnson,Hitmonchan,Holo Rare,Near Mint,100.00,1,2024-12-24,Base Set,Jane Smith,Completed 55 | 54,Sophia Wilson,Kangaskhan,Rare,Excellent,80.00,1,2024-12-25,Jungle,Henry King,Completed 56 | 55,Chris Brown,Scyther,Rare,Mint,85.00,1,2024-12-26,Jungle,Emma Green,Completed 57 | 56,Harper Lewis,Pinsir,Rare,Near Mint,70.00,1,2024-12-27,Jungle,Olivia Taylor,Completed 58 | 57,Mason Martinez,Aerodactyl,Rare,Good,100.00,1,2024-12-28,Fossil,Sarah White,Completed 59 | 58,Liam Brown,Kabutops,Rare,Mint,105.00,1,2024-12-29,Fossil,Alex Johnson,Completed 60 | 59,Evelyn Walker,Magikarp,Common,Excellent,5.00,20,2024-12-30,Base Set,Lucas Harris,Completed 61 | 60,Amelia Clark,Gyarados,Holo Rare,Near Mint,150.00,1,2024-12-31,Base Set,Sophia Wilson,Pending 62 | 61,Sarah White,Ditto,Rare,Mint,90.00,1,2025-01-01,Fossil,Henry King,Completed 63 | 62,Emma Green,Pidgeot,Rare,Good,70.00,1,2025-01-02,Jungle,Chris Brown,Completed 64 | 63,John Doe,Electabuzz,Rare,Excellent,60.00,2,2025-01-03,Base Set,Liam Brown,Completed 65 | 64,Jane Smith,Magmar,Rare,Mint,55.00,1,2025-01-04,Fossil,Mason Martinez,Completed 66 | 65,Olivia Taylor,Jynx,Common,Excellent,30.00,3,2025-01-05,Base Set,Ethan White,Completed 67 | 66,Alex Johnson,Alakazam,Holo Rare,Mint,175.00,1,2025-01-06,Base Set,Jane Smith,Completed 68 | 67,Sophia Wilson,Chansey,Holo Rare,Good,100.00,1,2025-01-07,Base Set,Olivia Taylor,Completed 69 | 68,Chris Brown,Geodude,Common,Near Mint,5.00,12,2025-01-08,Base Set,John Doe,Completed 70 | 69,Henry King,Grimer,Common,Excellent,7.00,8,2025-01-09,Fossil,Emma Green,Completed 71 | 70,Ethan 
White,Muk,Rare,Mint,85.00,1,2025-01-10,Fossil,Sophia Wilson,Completed 72 | 71,Harper Lewis,Rhydon,Rare,Good,75.00,1,2025-01-11,Jungle,Chris Brown,Cancelled 73 | 72,Mason Martinez,Tauros,Common,Near Mint,10.00,10,2025-01-12,Jungle,Alex Johnson,Completed 74 | 73,Evelyn Walker,Exeggutor,Rare,Mint,65.00,1,2025-01-13,Jungle,Sarah White,Completed 75 | 74,Lucas Harris,Venonat,Common,Excellent,5.00,15,2025-01-14,Jungle,Harper Lewis,Pending -------------------------------------------------------------------------------- /tests/.env/csv/sales_with_tab.csv: -------------------------------------------------------------------------------- 1 | sale_id seller_name card_name card_rarity card_condition price quantity sale_date card_set buyer_name transaction_status 2 | 1 John Doe Charizard Rare Mint 250.00 1 2024-11-01 Base Set Jane Smith Completed 3 | 2 Jane Smith Blastoise Holo Rare Excellent 180.00 1 2024-11-02 Base Set Alex Johnson Completed 4 | 3 Alex Johnson Pikachu Common Near Mint 15.00 4 2024-11-03 Jungle Chris Brown Completed 5 | 4 Chris Brown Dragonite Ultra Rare Good 300.00 1 2024-11-04 Fossil Emma Green Pending 6 | 5 Emma Green Zapdos Holo Rare Mint 150.00 1 2024-11-05 Fossil Sarah White Completed 7 | 6 Sarah White Venusaur Rare Good 120.00 2 2024-11-06 Base Set John Doe Completed 8 | 7 Liam Brown Moltres Rare Near Mint 140.00 1 2024-11-07 Fossil Jane Smith Completed 9 | 8 Olivia Taylor Articuno Rare Excellent 100.00 1 2024-11-08 Fossil Alex Johnson Cancelled 10 | 9 Sophia Wilson Eevee Common Mint 20.00 3 2024-11-09 Jungle Chris Brown Completed 11 | 10 Mason Martinez Jolteon Rare Near Mint 80.00 1 2024-11-10 Jungle Emma Green Completed 12 | 11 Ethan White Flareon Rare Excellent 85.00 1 2024-11-11 Jungle Sarah White Completed 13 | 12 Lucas Harris Vaporeon Rare Mint 100.00 1 2024-11-12 Jungle Liam Brown Completed 14 | 13 Amelia Clark Machamp Holo Rare Mint 75.00 1 2024-11-13 Base Set Olivia Taylor Completed 15 | 14 Harper Lewis Gengar Rare Mint 80.00 1 2024-11-14 Fossil Sophia Wilson Completed 16 | 15 Evelyn Walker Snorlax Rare Mint 90.00 1 2024-11-15 Jungle Mason Martinez Completed 17 | 16 Henry King Charizard Rare Excellent 250.00 1 2024-11-16 Base Set Ethan White Completed 18 | 17 Isabella Moore Mewtwo Holo Rare Near Mint 220.00 1 2024-11-17 Fossil Lucas Harris Completed 19 | 18 Sophia Wilson Articuno Rare Mint 120.00 1 2024-11-18 Fossil Amelia Clark Completed 20 | 19 Liam Brown Pikachu Common Good 10.00 5 2024-11-19 Jungle Harper Lewis Pending 21 | 20 Emma Green Moltres Rare Excellent 140.00 1 2024-11-20 Fossil Evelyn Walker Completed 22 | 21 Chris Brown Blastoise Holo Rare Mint 180.00 1 2024-11-21 Base Set Henry King Completed 23 | 22 Alex Johnson Eevee Common Mint 25.00 2 2024-11-22 Jungle Isabella Moore Completed 24 | 23 John Doe Dragonite Ultra Rare Near Mint 320.00 1 2024-11-23 Base Set Jane Smith Pending 25 | 24 Jane Smith Machamp Holo Rare Good 70.00 1 2024-11-24 Base Set Alex Johnson Completed 26 | 25 Sarah White Vaporeon Rare Excellent 100.00 1 2024-11-25 Jungle Chris Brown Cancelled 27 | 26 Olivia Taylor Jolteon Rare Mint 85.00 1 2024-11-26 Jungle Emma Green Completed 28 | 27 Henry King Zapdos Holo Rare Good 140.00 1 2024-11-27 Fossil Sophia Wilson Completed 29 | 28 Ethan White Gengar Rare Excellent 75.00 1 2024-11-28 Fossil Mason Martinez Completed 30 | 29 Amelia Clark Mewtwo Holo Rare Mint 230.00 1 2024-11-29 Fossil Ethan White Completed 31 | 30 Lucas Harris Charizard Rare Near Mint 250.00 1 2024-11-30 Base Set Lucas Harris Completed 32 | 31 Harper Lewis Snorlax Rare Excellent 
90.00 1 2024-12-01 Jungle Liam Brown Completed 33 | 32 Sophia Wilson Flareon Rare Good 85.00 1 2024-12-02 Jungle Isabella Moore Pending 34 | 33 Mason Martinez Articuno Rare Mint 120.00 1 2024-12-03 Fossil Harper Lewis Completed 35 | 34 Emma Green Moltres Holo Rare Mint 140.00 1 2024-12-04 Fossil Henry King Completed 36 | 35 John Doe Pikachu Common Mint 15.00 3 2024-12-05 Jungle Chris Brown Completed 37 | 36 Jane Smith Dragonite Ultra Rare Excellent 300.00 1 2024-12-06 Base Set Sophia Wilson Completed 38 | 37 Alex Johnson Machamp Holo Rare Mint 75.00 1 2024-12-07 Base Set Emma Green Completed 39 | 38 Chris Brown Vaporeon Rare Good 90.00 1 2024-12-08 Jungle John Doe Completed 40 | 39 Olivia Taylor Jolteon Rare Near Mint 80.00 1 2024-12-09 Jungle Jane Smith Pending 41 | 40 Ethan White Gengar Rare Mint 85.00 1 2024-12-10 Fossil Liam Brown Completed 42 | 41 Amelia Clark Eevee Common Excellent 25.00 3 2024-12-11 Jungle Olivia Taylor Completed 43 | 42 Sophia Wilson Charizard Rare Good 220.00 1 2024-12-12 Base Set Alex Johnson Cancelled 44 | 43 Lucas Harris Zapdos Holo Rare Mint 150.00 1 2024-12-13 Fossil Emma Green Completed 45 | 44 Harper Lewis Mewtwo Ultra Rare Near Mint 200.00 1 2024-12-14 Fossil Sarah White Completed 46 | 45 Henry King Lapras Rare Mint 95.00 1 2024-12-16 Fossil Sophia Wilson Completed 47 | 46 Ethan White Ditto Rare Excellent 85.00 1 2024-12-17 Fossil Amelia Clark Completed 48 | 47 Sarah White Bulbasaur Common Near Mint 12.00 5 2024-12-18 Base Set Lucas Harris Completed 49 | 48 Emma Green Charmander Common Mint 15.00 4 2024-12-19 Base Set Chris Brown Pending 50 | 49 Jane Smith Squirtle Common Good 10.00 6 2024-12-20 Base Set Mason Martinez Completed 51 | 50 John Doe Jigglypuff Common Excellent 8.00 10 2024-12-21 Jungle Liam Brown Completed 52 | 51 Olivia Taylor Clefairy Rare Mint 50.00 1 2024-12-22 Base Set Ethan White Completed 53 | 52 Lucas Harris Nidoking Holo Rare Good 125.00 1 2024-12-23 Base Set John Doe Cancelled 54 | 53 Alex Johnson Hitmonchan Holo Rare Near Mint 100.00 1 2024-12-24 Base Set Jane Smith Completed 55 | 54 Sophia Wilson Kangaskhan Rare Excellent 80.00 1 2024-12-25 Jungle Henry King Completed 56 | 55 Chris Brown Scyther Rare Mint 85.00 1 2024-12-26 Jungle Emma Green Completed 57 | 56 Harper Lewis Pinsir Rare Near Mint 70.00 1 2024-12-27 Jungle Olivia Taylor Completed 58 | 57 Mason Martinez Aerodactyl Rare Good 100.00 1 2024-12-28 Fossil Sarah White Completed 59 | 58 Liam Brown Kabutops Rare Mint 105.00 1 2024-12-29 Fossil Alex Johnson Completed 60 | 59 Evelyn Walker Magikarp Common Excellent 5.00 20 2024-12-30 Base Set Lucas Harris Completed 61 | 60 Amelia Clark Gyarados Holo Rare Near Mint 150.00 1 2024-12-31 Base Set Sophia Wilson Pending 62 | 61 Sarah White Ditto Rare Mint 90.00 1 2025-01-01 Fossil Henry King Completed 63 | 62 Emma Green Pidgeot Rare Good 70.00 1 2025-01-02 Jungle Chris Brown Completed 64 | 63 John Doe Electabuzz Rare Excellent 60.00 2 2025-01-03 Base Set Liam Brown Completed 65 | 64 Jane Smith Magmar Rare Mint 55.00 1 2025-01-04 Fossil Mason Martinez Completed 66 | 65 Olivia Taylor Jynx Common Excellent 30.00 3 2025-01-05 Base Set Ethan White Completed 67 | 66 Alex Johnson Alakazam Holo Rare Mint 175.00 1 2025-01-06 Base Set Jane Smith Completed 68 | 67 Sophia Wilson Chansey Holo Rare Good 100.00 1 2025-01-07 Base Set Olivia Taylor Completed 69 | 68 Chris Brown Geodude Common Near Mint 5.00 12 2025-01-08 Base Set John Doe Completed 70 | 69 Henry King Grimer Common Excellent 7.00 8 2025-01-09 Fossil Emma Green Completed 71 | 70 Ethan 
White Muk Rare Mint 85.00 1 2025-01-10 Fossil Sophia Wilson Completed 72 | 71 Harper Lewis Rhydon Rare Good 75.00 1 2025-01-11 Jungle Chris Brown Cancelled 73 | 72 Mason Martinez Tauros Common Near Mint 10.00 10 2025-01-12 Jungle Alex Johnson Completed 74 | 73 Evelyn Walker Exeggutor Rare Mint 65.00 1 2025-01-13 Jungle Sarah White Completed 75 | 74 Lucas Harris Venonat Common Excellent 5.00 15 2025-01-14 Jungle Harper Lewis Pending -------------------------------------------------------------------------------- /tests/.env/csv/sales_with_comma.csv: -------------------------------------------------------------------------------- 1 | sale_id,seller_name,card_name,card_rarity,card_condition,price,quantity,sale_date,card_set,buyer_name,transaction_status 2 | 1,John Doe,Charizard,Rare,Mint,250.00,1,2024-11-01,Base Set,Jane Smith,Completed 3 | 2,Jane Smith,Blastoise,Holo Rare,Excellent,180.00,1,2024-11-02,Base Set,Alex Johnson,Completed 4 | 3,Alex Johnson,Pikachu,Common,Near Mint,15.00,4,2024-11-03,Jungle,Chris Brown,Completed 5 | 4,Chris Brown,Dragonite,Ultra Rare,Good,300.00,1,2024-11-04,Fossil,Emma Green,Pending 6 | 5,Emma Green,Zapdos,Holo Rare,Mint,150.00,1,2024-11-05,Fossil,Sarah White,Completed 7 | 6,Sarah White,Venusaur,Rare,Good,120.00,2,2024-11-06,Base Set,John Doe,Completed 8 | 7,Liam Brown,Moltres,Rare,Near Mint,140.00,1,2024-11-07,Fossil,Jane Smith,Completed 9 | 8,Olivia Taylor,Articuno,Rare,Excellent,100.00,1,2024-11-08,Fossil,Alex Johnson,Cancelled 10 | 9,Sophia Wilson,Eevee,Common,Mint,20.00,3,2024-11-09,Jungle,Chris Brown,Completed 11 | 10,Mason Martinez,Jolteon,Rare,Near Mint,80.00,1,2024-11-10,Jungle,Emma Green,Completed 12 | 11,Ethan White,Flareon,Rare,Excellent,85.00,1,2024-11-11,Jungle,Sarah White,Completed 13 | 12,Lucas Harris,Vaporeon,Rare,Mint,100.00,1,2024-11-12,Jungle,Liam Brown,Completed 14 | 13,Amelia Clark,Machamp,Holo Rare,Mint,75.00,1,2024-11-13,Base Set,Olivia Taylor,Completed 15 | 14,Harper Lewis,Gengar,Rare,Mint,80.00,1,2024-11-14,Fossil,Sophia Wilson,Completed 16 | 15,Evelyn Walker,Snorlax,Rare,Mint,90.00,1,2024-11-15,Jungle,Mason Martinez,Completed 17 | 16,Henry King,Charizard,Rare,Excellent,250.00,1,2024-11-16,Base Set,Ethan White,Completed 18 | 17,Isabella Moore,Mewtwo,Holo Rare,Near Mint,220.00,1,2024-11-17,Fossil,Lucas Harris,Completed 19 | 18,Sophia Wilson,Articuno,Rare,Mint,120.00,1,2024-11-18,Fossil,Amelia Clark,Completed 20 | 19,Liam Brown,Pikachu,Common,Good,10.00,5,2024-11-19,Jungle,Harper Lewis,Pending 21 | 20,Emma Green,Moltres,Rare,Excellent,140.00,1,2024-11-20,Fossil,Evelyn Walker,Completed 22 | 21,Chris Brown,Blastoise,Holo Rare,Mint,180.00,1,2024-11-21,Base Set,Henry King,Completed 23 | 22,Alex Johnson,Eevee,Common,Mint,25.00,2,2024-11-22,Jungle,Isabella Moore,Completed 24 | 23,John Doe,Dragonite,Ultra Rare,Near Mint,320.00,1,2024-11-23,Base Set,Jane Smith,Pending 25 | 24,Jane Smith,Machamp,Holo Rare,Good,70.00,1,2024-11-24,Base Set,Alex Johnson,Completed 26 | 25,Sarah White,Vaporeon,Rare,Excellent,100.00,1,2024-11-25,Jungle,Chris Brown,Cancelled 27 | 26,Olivia Taylor,Jolteon,Rare,Mint,85.00,1,2024-11-26,Jungle,Emma Green,Completed 28 | 27,Henry King,Zapdos,Holo Rare,Good,140.00,1,2024-11-27,Fossil,Sophia Wilson,Completed 29 | 28,Ethan White,Gengar,Rare,Excellent,75.00,1,2024-11-28,Fossil,Mason Martinez,Completed 30 | 29,Amelia Clark,Mewtwo,Holo Rare,Mint,230.00,1,2024-11-29,Fossil,Ethan White,Completed 31 | 30,Lucas Harris,Charizard,Rare,Near Mint,250.00,1,2024-11-30,Base Set,Lucas Harris,Completed 32 | 31,Harper 
Lewis,Snorlax,Rare,Excellent,90.00,1,2024-12-01,Jungle,Liam Brown,Completed 33 | 32,Sophia Wilson,Flareon,Rare,Good,85.00,1,2024-12-02,Jungle,Isabella Moore,Pending 34 | 33,Mason Martinez,Articuno,Rare,Mint,120.00,1,2024-12-03,Fossil,Harper Lewis,Completed 35 | 34,Emma Green,Moltres,Holo Rare,Mint,140.00,1,2024-12-04,Fossil,Henry King,Completed 36 | 35,John Doe,Pikachu,Common,Mint,15.00,3,2024-12-05,Jungle,Chris Brown,Completed 37 | 36,Jane Smith,Dragonite,Ultra Rare,Excellent,300.00,1,2024-12-06,Base Set,Sophia Wilson,Completed 38 | 37,Alex Johnson,Machamp,Holo Rare,Mint,75.00,1,2024-12-07,Base Set,Emma Green,Completed 39 | 38,Chris Brown,Vaporeon,Rare,Good,90.00,1,2024-12-08,Jungle,John Doe,Completed 40 | 39,Olivia Taylor,Jolteon,Rare,Near Mint,80.00,1,2024-12-09,Jungle,Jane Smith,Pending 41 | 40,Ethan White,Gengar,Rare,Mint,85.00,1,2024-12-10,Fossil,Liam Brown,Completed 42 | 41,Amelia Clark,Eevee,Common,Excellent,25.00,3,2024-12-11,Jungle,Olivia Taylor,Completed 43 | 42,Sophia Wilson,Charizard,Rare,Good,220.00,1,2024-12-12,Base Set,Alex Johnson,Cancelled 44 | 43,Lucas Harris,Zapdos,Holo Rare,Mint,150.00,1,2024-12-13,Fossil,Emma Green,Completed 45 | 44,Harper Lewis,Mewtwo,Ultra Rare,Near Mint,200.00,1,2024-12-14,Fossil,Sarah White,Completed 46 | 45,Henry King,Lapras,Rare,Mint,95.00,1,2024-12-16,Fossil,Sophia Wilson,Completed 47 | 46,Ethan White,Ditto,Rare,Excellent,85.00,1,2024-12-17,Fossil,Amelia Clark,Completed 48 | 47,Sarah White,Bulbasaur,Common,Near Mint,12.00,5,2024-12-18,Base Set,Lucas Harris,Completed 49 | 48,Emma Green,Charmander,Common,Mint,15.00,4,2024-12-19,Base Set,Chris Brown,Pending 50 | 49,Jane Smith,Squirtle,Common,Good,10.00,6,2024-12-20,Base Set,Mason Martinez,Completed 51 | 50,John Doe,Jigglypuff,Common,Excellent,8.00,10,2024-12-21,Jungle,Liam Brown,Completed 52 | 51,Olivia Taylor,Clefairy,Rare,Mint,50.00,1,2024-12-22,Base Set,Ethan White,Completed 53 | 52,Lucas Harris,Nidoking,Holo Rare,Good,125.00,1,2024-12-23,Base Set,John Doe,Cancelled 54 | 53,Alex Johnson,Hitmonchan,Holo Rare,Near Mint,100.00,1,2024-12-24,Base Set,Jane Smith,Completed 55 | 54,Sophia Wilson,Kangaskhan,Rare,Excellent,80.00,1,2024-12-25,Jungle,Henry King,Completed 56 | 55,Chris Brown,Scyther,Rare,Mint,85.00,1,2024-12-26,Jungle,Emma Green,Completed 57 | 56,Harper Lewis,Pinsir,Rare,Near Mint,70.00,1,2024-12-27,Jungle,Olivia Taylor,Completed 58 | 57,Mason Martinez,Aerodactyl,Rare,Good,100.00,1,2024-12-28,Fossil,Sarah White,Completed 59 | 58,Liam Brown,Kabutops,Rare,Mint,105.00,1,2024-12-29,Fossil,Alex Johnson,Completed 60 | 59,Evelyn Walker,Magikarp,Common,Excellent,5.00,20,2024-12-30,Base Set,Lucas Harris,Completed 61 | 60,Amelia Clark,Gyarados,Holo Rare,Near Mint,150.00,1,2024-12-31,Base Set,Sophia Wilson,Pending 62 | 61,Sarah White,Ditto,Rare,Mint,90.00,1,2025-01-01,Fossil,Henry King,Completed 63 | 62,Emma Green,Pidgeot,Rare,Good,70.00,1,2025-01-02,Jungle,Chris Brown,Completed 64 | 63,John Doe,Electabuzz,Rare,Excellent,60.00,2,2025-01-03,Base Set,Liam Brown,Completed 65 | 64,Jane Smith,Magmar,Rare,Mint,55.00,1,2025-01-04,Fossil,Mason Martinez,Completed 66 | 65,Olivia Taylor,Jynx,Common,Excellent,30.00,3,2025-01-05,Base Set,Ethan White,Completed 67 | 66,Alex Johnson,Alakazam,Holo Rare,Mint,175.00,1,2025-01-06,Base Set,Jane Smith,Completed 68 | 67,Sophia Wilson,Chansey,Holo Rare,Good,100.00,1,2025-01-07,Base Set,Olivia Taylor,Completed 69 | 68,Chris Brown,Geodude,Common,Near Mint,5.00,12,2025-01-08,Base Set,John Doe,Completed 70 | 69,Henry King,Grimer,Common,Excellent,7.00,8,2025-01-09,Fossil,Emma 
Green,Completed 71 | 70,Ethan White,Muk,Rare,Mint,85.00,1,2025-01-10,Fossil,Sophia Wilson,Completed 72 | 71,Harper Lewis,Rhydon,Rare,Good,75.00,1,2025-01-11,Jungle,Chris Brown,Cancelled 73 | 72,Mason Martinez,Tauros,Common,Near Mint,10.00,10,2025-01-12,Jungle,Alex Johnson,Completed 74 | 73,Evelyn Walker,Exeggutor,Rare,Mint,65.00,1,2025-01-13,Jungle,Sarah White,Completed 75 | 74,Lucas Harris,Venonat,Common,Excellent,5.00,15,2025-01-14,Jungle,Harper Lewis,Pending -------------------------------------------------------------------------------- /tests/.env/csv/sales_with_iso_8859_1.csv: -------------------------------------------------------------------------------- 1 | sale_id,seller_name,card_name,card_rarity,card_condition,price,quantity,sale_date,card_set,buyer_name,transaction_status 2 | 1,John Doe,Charizard,Rare,Mint,250.00,1,2024-11-01,Base Set,Jane Smith,Completed 3 | 2,Jane Smith,Blastoise,Holo Rare,Excellent,180.00,1,2024-11-02,Base Set,Alex Johnson,Completed 4 | 3,Alex Johnson,Pikachu,Common,Near Mint,15.00,4,2024-11-03,Jungle,Chris Brown,Completed 5 | 4,Chris Brown,Dragonite,Ultra Rare,Good,300.00,1,2024-11-04,Fossil,Emma Green,Pending 6 | 5,Emma Green,Zapdos,Holo Rare,Mint,150.00,1,2024-11-05,Fossil,Sarah White,Completed 7 | 6,Sarah White,Venusaur,Rare,Good,120.00,2,2024-11-06,Base Set,John Doe,Completed 8 | 7,Liam Brown,Moltres,Rare,Near Mint,140.00,1,2024-11-07,Fossil,Jane Smith,Completed 9 | 8,Olivia Taylor,Articuno,Rare,Excellent,100.00,1,2024-11-08,Fossil,Alex Johnson,Cancelled 10 | 9,Sophia Wilson,Eevee,Common,Mint,20.00,3,2024-11-09,Jungle,Chris Brown,Completed 11 | 10,Mason Martinez,Jolteon,Rare,Near Mint,80.00,1,2024-11-10,Jungle,Emma Green,Completed 12 | 11,Ethan White,Flareon,Rare,Excellent,85.00,1,2024-11-11,Jungle,Sarah White,Completed 13 | 12,Lucas Harris,Vaporeon,Rare,Mint,100.00,1,2024-11-12,Jungle,Liam Brown,Completed 14 | 13,Amelia Clark,Machamp,Holo Rare,Mint,75.00,1,2024-11-13,Base Set,Olivia Taylor,Completed 15 | 14,Harper Lewis,Gengar,Rare,Mint,80.00,1,2024-11-14,Fossil,Sophia Wilson,Completed 16 | 15,Evelyn Walker,Snorlax,Rare,Mint,90.00,1,2024-11-15,Jungle,Mason Martinez,Completed 17 | 16,Henry King,Charizard,Rare,Excellent,250.00,1,2024-11-16,Base Set,Ethan White,Completed 18 | 17,Isabella Moore,Mewtwo,Holo Rare,Near Mint,220.00,1,2024-11-17,Fossil,Lucas Harris,Completed 19 | 18,Sophia Wilson,Articuno,Rare,Mint,120.00,1,2024-11-18,Fossil,Amelia Clark,Completed 20 | 19,Liam Brown,Pikachu,Common,Good,10.00,5,2024-11-19,Jungle,Harper Lewis,Pending 21 | 20,Emma Green,Moltres,Rare,Excellent,140.00,1,2024-11-20,Fossil,Evelyn Walker,Completed 22 | 21,Chris Brown,Blastoise,Holo Rare,Mint,180.00,1,2024-11-21,Base Set,Henry King,Completed 23 | 22,Alex Johnson,Eevee,Common,Mint,25.00,2,2024-11-22,Jungle,Isabella Moore,Completed 24 | 23,John Doe,Dragonite,Ultra Rare,Near Mint,320.00,1,2024-11-23,Base Set,Jane Smith,Pending 25 | 24,Jane Smith,Machamp,Holo Rare,Good,70.00,1,2024-11-24,Base Set,Alex Johnson,Completed 26 | 25,Sarah White,Vaporeon,Rare,Excellent,100.00,1,2024-11-25,Jungle,Chris Brown,Cancelled 27 | 26,Olivia Taylor,Jolteon,Rare,Mint,85.00,1,2024-11-26,Jungle,Emma Green,Completed 28 | 27,Henry King,Zapdos,Holo Rare,Good,140.00,1,2024-11-27,Fossil,Sophia Wilson,Completed 29 | 28,Ethan White,Gengar,Rare,Excellent,75.00,1,2024-11-28,Fossil,Mason Martinez,Completed 30 | 29,Amelia Clark,Mewtwo,Holo Rare,Mint,230.00,1,2024-11-29,Fossil,Ethan White,Completed 31 | 30,Lucas Harris,Charizard,Rare,Near Mint,250.00,1,2024-11-30,Base Set,Lucas Harris,Completed 32 | 
31,Harper Lewis,Snorlax,Rare,Excellent,90.00,1,2024-12-01,Jungle,Liam Brown,Completed 33 | 32,Sophia Wilson,Flareon,Rare,Good,85.00,1,2024-12-02,Jungle,Isabella Moore,Pending 34 | 33,Mason Martinez,Articuno,Rare,Mint,120.00,1,2024-12-03,Fossil,Harper Lewis,Completed 35 | 34,Emma Green,Moltres,Holo Rare,Mint,140.00,1,2024-12-04,Fossil,Henry King,Completed 36 | 35,John Doe,Pikachu,Common,Mint,15.00,3,2024-12-05,Jungle,Chris Brown,Completed 37 | 36,Jane Smith,Dragonite,Ultra Rare,Excellent,300.00,1,2024-12-06,Base Set,Sophia Wilson,Completed 38 | 37,Alex Johnson,Machamp,Holo Rare,Mint,75.00,1,2024-12-07,Base Set,Emma Green,Completed 39 | 38,Chris Brown,Vaporeon,Rare,Good,90.00,1,2024-12-08,Jungle,John Doe,Completed 40 | 39,Olivia Taylor,Jolteon,Rare,Near Mint,80.00,1,2024-12-09,Jungle,Jane Smith,Pending 41 | 40,Ethan White,Gengar,Rare,Mint,85.00,1,2024-12-10,Fossil,Liam Brown,Completed 42 | 41,Amelia Clark,Eevee,Common,Excellent,25.00,3,2024-12-11,Jungle,Olivia Taylor,Completed 43 | 42,Sophia Wilson,Charizard,Rare,Good,220.00,1,2024-12-12,Base Set,Alex Johnson,Cancelled 44 | 43,Lucas Harris,Zapdos,Holo Rare,Mint,150.00,1,2024-12-13,Fossil,Emma Green,Completed 45 | 44,Harper Lewis,Mewtwo,Ultra Rare,Near Mint,200.00,1,2024-12-14,Fossil,Sarah White,Completed 46 | 45,Henry King,Lapras,Rare,Mint,95.00,1,2024-12-16,Fossil,Sophia Wilson,Completed 47 | 46,Ethan White,Ditto,Rare,Excellent,85.00,1,2024-12-17,Fossil,Amelia Clark,Completed 48 | 47,Sarah White,Bulbasaur,Common,Near Mint,12.00,5,2024-12-18,Base Set,Lucas Harris,Completed 49 | 48,Emma Green,Charmander,Common,Mint,15.00,4,2024-12-19,Base Set,Chris Brown,Pending 50 | 49,Jane Smith,Squirtle,Common,Good,10.00,6,2024-12-20,Base Set,Mason Martinez,Completed 51 | 50,John Doe,Jigglypuff,Common,Excellent,8.00,10,2024-12-21,Jungle,Liam Brown,Completed 52 | 51,Olivia Taylor,Clefairy,Rare,Mint,50.00,1,2024-12-22,Base Set,Ethan White,Completed 53 | 52,Lucas Harris,Nidoking,Holo Rare,Good,125.00,1,2024-12-23,Base Set,John Doe,Cancelled 54 | 53,Alex Johnson,Hitmonchan,Holo Rare,Near Mint,100.00,1,2024-12-24,Base Set,Jane Smith,Completed 55 | 54,Sophia Wilson,Kangaskhan,Rare,Excellent,80.00,1,2024-12-25,Jungle,Henry King,Completed 56 | 55,Chris Brown,Scyther,Rare,Mint,85.00,1,2024-12-26,Jungle,Emma Green,Completed 57 | 56,Harper Lewis,Pinsir,Rare,Near Mint,70.00,1,2024-12-27,Jungle,Olivia Taylor,Completed 58 | 57,Mason Martinez,Aerodactyl,Rare,Good,100.00,1,2024-12-28,Fossil,Sarah White,Completed 59 | 58,Liam Brown,Kabutops,Rare,Mint,105.00,1,2024-12-29,Fossil,Alex Johnson,Completed 60 | 59,Evelyn Walker,Magikarp,Common,Excellent,5.00,20,2024-12-30,Base Set,Lucas Harris,Completed 61 | 60,Amelia Clark,Gyarados,Holo Rare,Near Mint,150.00,1,2024-12-31,Base Set,Sophia Wilson,Pending 62 | 61,Sarah White,Ditto,Rare,Mint,90.00,1,2025-01-01,Fossil,Henry King,Completed 63 | 62,Emma Green,Pidgeot,Rare,Good,70.00,1,2025-01-02,Jungle,Chris Brown,Completed 64 | 63,John Doe,Electabuzz,Rare,Excellent,60.00,2,2025-01-03,Base Set,Liam Brown,Completed 65 | 64,Jane Smith,Magmar,Rare,Mint,55.00,1,2025-01-04,Fossil,Mason Martinez,Completed 66 | 65,Olivia Taylor,Jynx,Common,Excellent,30.00,3,2025-01-05,Base Set,Ethan White,Completed 67 | 66,Alex Johnson,Alakazam,Holo Rare,Mint,175.00,1,2025-01-06,Base Set,Jane Smith,Completed 68 | 67,Sophia Wilson,Chansey,Holo Rare,Good,100.00,1,2025-01-07,Base Set,Olivia Taylor,Completed 69 | 68,Chris Brown,Geodude,Common,Near Mint,5.00,12,2025-01-08,Base Set,John Doe,Completed 70 | 69,Henry 
King,Grimer,Common,Excellent,7.00,8,2025-01-09,Fossil,Emma Green,Completed 71 | 70,Ethan White,Muk,Rare,Mint,85.00,1,2025-01-10,Fossil,Sophia Wilson,Completed 72 | 71,Harper Lewis,Rhydon,Rare,Good,75.00,1,2025-01-11,Jungle,Chris Brown,Cancelled 73 | 72,Mason Martinez,Tauros,Common,Near Mint,10.00,10,2025-01-12,Jungle,Alex Johnson,Completed 74 | 73,Evelyn Walker,Exeggutor,Rare,Mint,65.00,1,2025-01-13,Jungle,Sarah White,Completed 75 | 74,Lucas Harris,Venonat,Common,Excellent,5.00,15,2025-01-14,Jungle,Harper Lewis,Pending -------------------------------------------------------------------------------- /src/ploosh/exporters/exporter_trx.py: -------------------------------------------------------------------------------- 1 | """Export test case result to TRX format""" 2 | 3 | import html 4 | import os 5 | import uuid 6 | import xml.dom.minidom 7 | import numpy as np 8 | from exporters.exporter import Exporter 9 | from case import StateStatistics 10 | 11 | 12 | class ExporterTRX(Exporter): 13 | """Export test case result to TRX format""" 14 | 15 | def __init__(self): 16 | # Set the name of the exporter 17 | self.name = "TRX" 18 | 19 | def get_failed_blocks(self, case_name, current_case, execution_id, output_folder): 20 | """Get XML code for failed cases""" 21 | # Escape the error message to be XML-safe 22 | error_message = html.escape(current_case.error_message, quote=False) 23 | 24 | # Create the XML block for the error message 25 | output_message_xml = f"{error_message}" 26 | result_files_xml = "" 27 | 28 | # If there is a comparison gap, export it to an Excel file 29 | if current_case.df_compare_gap is not None: 30 | detail_file_path = f"{output_folder}/test_results/In/{execution_id}/{case_name}.xlsx" 31 | result_files_xml = f"" 32 | 33 | # Create directories if they do not exist 34 | os.makedirs(os.path.dirname(detail_file_path), exist_ok=True) 35 | current_case.df_compare_gap.to_excel(detail_file_path) 36 | 37 | return output_message_xml, result_files_xml 38 | 39 | def export(self, cases: dict): 40 | """Export test case results to a TRX file""" 41 | 42 | # Generate a unique ID for the TRX file 43 | trx_id = str(uuid.uuid4()) 44 | 45 | # Define the output folder and file path 46 | output_folder = f"{self.output_path}/trx" 47 | output_file = f"{output_folder}/test_results.xml" 48 | 49 | # Generate a unique ID for the test list 50 | test_list_id = str(uuid.uuid4()) 51 | 52 | # Initialize lists to store execution and test IDs 53 | execution_id_list = [] 54 | test_id_list = [] 55 | 56 | # Generate unique IDs for each test case 57 | for _ in list(range(0, len(cases))): 58 | execution_id_list.append(str(uuid.uuid4())) 59 | test_id_list.append(str(uuid.uuid4())) 60 | 61 | # Initialize XML blocks for unit test results, test definitions, and test entries 62 | xml_unit_test_result = "" 63 | xml_test_definitions = "" 64 | xml_test_entry = "" 65 | 66 | # Initialize state statistics 67 | state_statistics = StateStatistics() 68 | 69 | # Initialize lists to store start and end times 70 | start_times = [] 71 | end_times = [] 72 | 73 | # Iterate over each test case and collect data 74 | for i, case_name in enumerate(cases): 75 | current_case = cases[case_name] 76 | 77 | # Collect start and end times for the test case 78 | if current_case.global_duration.start is not None: 79 | start_times.append(current_case.global_duration.start) 80 | end_times.append(current_case.global_duration.end) 81 | 82 | execution_id = execution_id_list[i] 83 | test_id = test_id_list[i] 84 | 85 | # Update state statistics 86 | 
state_statistics.add_state(current_case.state) 87 | 88 | output_message_xml = "" 89 | result_files_xml = "" 90 | 91 | # If the test case failed, get the XML blocks for the error message and result files 92 | if current_case.state != "passed" and current_case.error_message is not None: 93 | output_message_xml, result_files_xml = self.get_failed_blocks( 94 | case_name, current_case, execution_id_list[i], output_folder 95 | ) 96 | 97 | outcome = current_case.state 98 | if outcome == "error": 99 | outcome = "failed" 100 | 101 | # Create the XML block for the unit test result 102 | xml_unit_test_result += f"""{output_message_xml}{result_files_xml}""" 111 | 112 | # Create the XML block for the test definition 113 | xml_test_definitions += f"" 114 | 115 | # Create the XML block for the test entry 116 | xml_test_entry += f"" 117 | 118 | # Get the global start and end times 119 | global_start_date = Exporter.date_to_string(np.min(np.array(start_times))) 120 | global_end_date = Exporter.date_to_string(np.max(np.array(end_times))) 121 | 122 | # Create the final XML string for the TRX file 123 | xml_string = f""" 124 | 125 | 126 | 127 | {xml_unit_test_result} 128 | {xml_test_definitions} 129 | {xml_test_entry} 130 | 131 | 132 | 139 | 140 | 141 | """ 142 | 143 | # Create directories if they do not exist 144 | os.makedirs(os.path.dirname(output_file), exist_ok=True) 145 | 146 | # Write the XML string to the TRX file 147 | with open(output_file, "w", encoding="UTF-8") as file: 148 | dom_string = xml.dom.minidom.parseString(xml_string).toprettyxml() 149 | dom_string = os.linesep.join([s for s in dom_string.splitlines() if s.strip()]) 150 | file.write(dom_string) 151 | -------------------------------------------------------------------------------- /src/ploosh/case.py: -------------------------------------------------------------------------------- 1 | """Module to manage test case""" 2 | from dataclasses import dataclass 3 | from datetime import datetime 4 | import numpy as np 5 | from engines.compare_engine_native import CompareEngineNative 6 | from engines.compare_engine_spark import CompareEngineSpark 7 | from engines.load_engine_native import LoadEngineNative 8 | from engines.load_engine_spark import LoadEngineSpark 9 | 10 | @dataclass 11 | class StateStatistics: 12 | """Statistics of test case executions""" 13 | not_executed = 0 14 | executed = 0 15 | passed = 0 16 | failed = 0 17 | error = 0 18 | total = 0 19 | 20 | def add_state(self, state): 21 | """Add new state to statistics""" 22 | if state == "passed": 23 | self.passed += 1 24 | if state == "failed": 25 | self.failed += 1 26 | if state == "error": 27 | self.error += 1 28 | if state == "notExecuted": 29 | self.not_executed += 1 30 | 31 | if state != "notExecuted": 32 | self.executed += 1 33 | 34 | self.total += 1 35 | 36 | 37 | @dataclass 38 | class ConnectionDescription: 39 | """Tuple of connection and connector""" 40 | connector = None 41 | connection = None 42 | 43 | def __init__(self, connector, connection): 44 | self.connector = connector 45 | self.connection = connection 46 | 47 | 48 | @dataclass 49 | class Duration: 50 | """Structure of duration""" 51 | start = None 52 | end = None 53 | duration = None 54 | 55 | def calculate_duration(self): 56 | """Calculate the duration between start and end date""" 57 | if self.end is not None: 58 | duration = self.end - self.start 59 | self.duration = duration.seconds + (duration.microseconds / 1000000) 60 | 61 | 62 | @dataclass 63 | class CaseItem: 64 | """Structure of case item (source or 
expected)""" 65 | connector = None 66 | connection = None 67 | configuration = None 68 | duration = None 69 | df_data = None 70 | count = 0 71 | 72 | def __init__(self, configuration, connector, connection): 73 | self.duration = Duration() 74 | self.connector = connector 75 | self.connection = connection 76 | self.configuration = configuration 77 | 78 | 79 | class Case: 80 | """Test case item""" 81 | options = None 82 | source = None 83 | expected = None 84 | global_duration = None 85 | compare_duration = None 86 | state = "notExecuted" 87 | error_type = None 88 | error_message = None 89 | df_compare_gap = None 90 | disabled = None 91 | success_rate = 1 92 | 93 | def __init__(self, configuration, source, expected, options, disabled): 94 | self.source = CaseItem(configuration["source"], source.connector, source.connection) 95 | self.expected = CaseItem(configuration["expected"], expected.connector, expected.connection) 96 | self.options = options 97 | self.disabled = disabled 98 | self.global_duration = Duration() 99 | self.compare_duration = Duration() 100 | 101 | def get_insensitive_item(self, name: str, items: list) -> str: 102 | """Get item from list case-insensitively""" 103 | for item in items: 104 | if name.upper().strip() == item.upper().strip(): 105 | return item 106 | return name 107 | 108 | def load_data(self, obj_type: str): 109 | """Load data from connector""" 110 | if obj_type == "source": 111 | obj = self.source 112 | else: 113 | obj = self.expected 114 | 115 | obj.duration.start = datetime.now() 116 | 117 | if not self.source.connector.is_spark: 118 | load_engine = LoadEngineNative(obj.configuration, self.options, obj.connection) 119 | else: 120 | load_engine = LoadEngineSpark(obj.configuration, self.options, obj.connection) 121 | 122 | # Load data from connector 123 | obj.df_data = obj.connector.get_data(obj.configuration, obj.connection) 124 | 125 | # Execute load engine 126 | obj.df_data = load_engine.execute(obj.df_data) 127 | obj.count = load_engine.count 128 | 129 | obj.duration.end = datetime.now() 130 | 131 | def load_data_error(self, obj_type: str, message: str): 132 | """Setup error message for data loading""" 133 | if obj_type == "source": 134 | obj = self.source 135 | else: 136 | obj = self.expected 137 | 138 | self.state = "error" 139 | self.error_type = "data" 140 | self.error_message = message 141 | obj.duration.end = datetime.now() 142 | 143 | def compare_dataframes(self): 144 | """Compare source and expected dataframe""" 145 | self.compare_duration.start = datetime.now() 146 | 147 | compare_engine = CompareEngineNative(self.source.df_data, self.expected.df_data, self.options) 148 | compare_state = compare_engine.compare() 149 | 150 | self.error_message = compare_engine.error_message 151 | self.error_type = compare_engine.error_type 152 | self.df_compare_gap = compare_engine.df_compare_gap 153 | self.success_rate = compare_engine.success_rate 154 | 155 | self.compare_duration.end = datetime.now() 156 | 157 | if compare_state: 158 | self.state = "passed" 159 | else: 160 | self.state = "failed" 161 | 162 | def compare_dataframes_with_spark(self, spark_session): 163 | """Compare source and expected dataframe using Spark""" 164 | self.compare_duration.start = datetime.now() 165 | 166 | compare_engine = CompareEngineSpark(self.source.df_data, self.expected.df_data, self.options) 167 | compare_state = compare_engine.compare() 168 | 169 | self.error_message = compare_engine.error_message 170 | self.error_type = compare_engine.error_type 171 | self.df_compare_gap = 
compare_engine.df_compare_gap 172 | self.success_rate = compare_engine.success_rate 173 | 174 | self.compare_duration.end = datetime.now() 175 | 176 | if compare_state: 177 | self.state = "passed" 178 | else: 179 | self.state = "failed" 180 | 181 | def compare_dataframes_error(self, message): 182 | """Setup error message for compare engine""" 183 | self.state = "error" 184 | self.error_type = "compare" 185 | self.error_message = message 186 | self.compare_duration.end = datetime.now() 187 | 188 | def calculate_durations(self): 189 | """Calculate durations""" 190 | self.source.duration.calculate_duration() 191 | self.expected.duration.calculate_duration() 192 | self.compare_duration.calculate_duration() 193 | 194 | ends = [] 195 | if self.source.duration.end is not None: 196 | ends.append(self.source.duration.end) 197 | if self.expected.duration.end is not None: 198 | ends.append(self.expected.duration.end) 199 | if self.compare_duration.end is not None: 200 | ends.append(self.compare_duration.end) 201 | 202 | if len(ends) == 0: 203 | self.global_duration.duration = 0 204 | else: 205 | self.global_duration.start = self.source.duration.start 206 | self.global_duration.end = np.max(np.array(ends)) 207 | self.global_duration.calculate_duration() 208 | --------------------------------------------------------------------------------
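As background for the ExporterTRX exporter above: a TRX report uses the Visual Studio TestRun XML format, with a Results block of UnitTestResult elements, a TestDefinitions block of UnitTest elements, a TestEntries block, and a ResultSummary with Counters, which matches the pieces the exporter accumulates (xml_unit_test_result, xml_test_definitions, xml_test_entry, StateStatistics). The sketch below builds a minimal skeleton of that shape with the standard library and pretty-prints it the same way the exporter does; the element and attribute names follow the public TRX schema and are not necessarily the exact strings ploosh emits.

import uuid
import xml.dom.minidom

# Illustrative IDs; in the exporter these come from execution_id_list and test_id_list.
test_id = str(uuid.uuid4())
execution_id = str(uuid.uuid4())
test_list_id = str(uuid.uuid4())

# A minimal sketch of the general TRX (Visual Studio TestRun) structure.
# Attribute choices here are illustrative, not ploosh's exact output.
xml_string = f"""<TestRun id="{uuid.uuid4()}" name="ploosh" xmlns="http://microsoft.com/schemas/VisualStudio/TeamTest/2010">
  <Results>
    <UnitTestResult executionId="{execution_id}" testId="{test_id}" testName="my_case"
                    outcome="Passed" testListId="{test_list_id}" />
  </Results>
  <TestDefinitions>
    <UnitTest id="{test_id}" name="my_case">
      <Execution id="{execution_id}" />
    </UnitTest>
  </TestDefinitions>
  <TestEntries>
    <TestEntry testId="{test_id}" executionId="{execution_id}" testListId="{test_list_id}" />
  </TestEntries>
  <TestLists>
    <TestList id="{test_list_id}" name="Results" />
  </TestLists>
  <ResultSummary outcome="Completed">
    <Counters total="1" executed="1" passed="1" failed="0" error="0" notExecuted="0" />
  </ResultSummary>
</TestRun>
"""

# Pretty-print the skeleton, mirroring the exporter's use of xml.dom.minidom.
print(xml.dom.minidom.parseString(xml_string).toprettyxml())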