├── .gitignore ├── Dockerfile ├── README.md ├── config └── airflow.cfg ├── dags ├── example.py └── execute_entities.py ├── docker-compose.yml ├── docs └── CHANGELOG.md ├── gh_2.67.0_windows_amd64.msi ├── logs └── scheduler │ └── latest ├── main.py ├── per_page.py ├── requirements.txt └── src ├── __init__.py ├── api ├── __init__.py └── api_instance.py ├── config └── __init__.py ├── controllers └── paginations │ ├── __init__.py │ └── paginations.py ├── db ├── __init__.py └── database.py ├── endpoints ├── __init__.py ├── data │ └── data.json └── endpoints.py └── utils ├── constants.py └── tools.py /.gitignore: -------------------------------------------------------------------------------- 1 | todo 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # poetry 99 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 100 | # This is especially recommended for binary packages to ensure reproducibility, and is more 101 | # commonly ignored for libraries. 102 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 103 | #poetry.lock 104 | 105 | # pdm 106 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 107 | #pdm.lock 108 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 109 | # in version control. 110 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 111 | .pdm.toml 112 | .pdm-python 113 | .pdm-build/ 114 | 115 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 116 | __pypackages__/ 117 | 118 | # Celery stuff 119 | celerybeat-schedule 120 | celerybeat.pid 121 | 122 | # SageMath parsed files 123 | *.sage.py 124 | 125 | # Environments 126 | .env 127 | .venv 128 | env/ 129 | venv/ 130 | ENV/ 131 | env.bak/ 132 | venv.bak/ 133 | 134 | # Spyder project settings 135 | .spyderproject 136 | .spyproject 137 | 138 | # Rope project settings 139 | .ropeproject 140 | 141 | # mkdocs documentation 142 | /site 143 | 144 | # mypy 145 | .mypy_cache/ 146 | .dmypy.json 147 | dmypy.json 148 | 149 | # Pyre type checker 150 | .pyre/ 151 | 152 | # pytype static type analyzer 153 | .pytype/ 154 | 155 | # Cython debug symbols 156 | cython_debug/ 157 | 158 | # PyCharm 159 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 160 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 161 | # and can be added to the global gitignore or merged into this file. For a more nuclear 162 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 163 | #.idea/ 164 | link.pem 165 | .gitignore 166 | .pre-commit-config.yaml 167 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM apache/airflow:2.10.4 2 | ADD requirements.txt . 3 | RUN pip install --upgrade pip && pip install -r requirements.txt 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Omie API Integration 2 | 3 | This repository provides a Python-based integration with the Omie API. The project fetches data from various Omie API endpoints, cleans the data by removing unwanted fields, and stores the processed results into a PostgreSQL database. 4 | 5 | ## Table of Contents 6 | 7 | - [Features](#features) 8 | - [Prerequisites](#prerequisites) 9 | - [Installation](#installation) 10 | - [Configuration](#configuration) 11 | - [How It Works](#how-it-works) 12 | - [Usage](#usage) 13 | - [Contributing](#contributing) 14 | - [License](#license) 15 | 16 | ## Features 17 | 18 | - **API Integration**: Retrieves data from Omie API endpoints. 19 | - **Pagination Handling**: Automatically iterates through multiple pages based on the API's response. 20 | - **Data Cleaning**: Removes unnecessary fields (e.g., `tags`, `recomendacoes`, `homepage`, `fax_ddd`, `bloquear_exclusao`, `produtor_rural`) from the API response. 21 | - **Database Storage**: Uses Pandas and SQLAlchemy to store data into a PostgreSQL database. 22 | - **Optional File Saving**: Provides an option to save the API response as JSON files. 23 | 24 | ## Prerequisites 25 | 26 | - **Python 3.7+** 27 | - **PostgreSQL**: A running PostgreSQL instance. 
28 | - Required Python libraries:
29 |   - `pandas`
30 |   - `sqlalchemy`
31 |   - `requests` (used by the custom API client in `src/api/`)
32 | 
33 | Install the Python dependencies via pip:
34 | ```bash
35 | pip install pandas sqlalchemy requests
36 | ```
37 | 
38 | ## Installation
39 | Clone the repository:
40 | ```
41 | git clone https://github.com/rphpacheco/omie_api_integration.git
42 | ```
43 | Navigate to the project directory:
44 | 
45 | ```
46 | cd omie_api_integration
47 | ```
48 | 
49 | (Optional) Create and activate a virtual environment:
50 | 
51 | ```
52 | python -m venv venv
53 | source venv/bin/activate # On Windows: venv\Scripts\activate
54 | ```
55 | 
56 | Install dependencies from the provided `requirements.txt`:
57 | 
58 | ```
59 | pip install -r requirements.txt
60 | ```
61 | 
62 | Alternatively, install the dependencies manually as shown above.
63 | 
64 | ## Configuration
65 | **Environment Variables:**
66 | 
67 | Create a `.env` file in the root directory using the provided `.env-pattern` as a template:
68 | 
69 | ```
70 | cp .env-pattern .env
71 | ```
72 | 
73 | Edit the `.env` file with your credentials:
74 | 
75 | ```
76 | APP_KEY=your_app_key_here
77 | APP_SECRET=your_app_secret_here
78 | BASE_URL=https://api.omie.com.br/api/v1/
79 | DB_HOST=your_db_host
80 | DB_PORT=your_db_port
81 | DB_USERNANE=your_db_username # Note: The variable name is 'DB_USERNANE' in this project.
82 | DB_PASSWORD=your_db_password
83 | DB_NAME=your_db_name
84 | ```
85 | 
86 | ## How It Works
87 | 
88 | **Configuration & Setup:**
89 | The project loads settings from the environment and uses a custom configuration class (`src/config/`) to manage API and database credentials.
90 | 
91 | **Fetching Endpoints:**
92 | Endpoints are defined and retrieved via the `Endpoints` class (`src/endpoints/endpoints.py`), which provides the list of API endpoints to be queried.
93 | 
94 | **Data Retrieval & Pagination:**
95 | For each endpoint, the script:
96 | 
97 | - Determines the total number of pages available by making an initial API request.
98 | - Iterates through each page, updating the page parameter in the request.
99 | - Sends a POST request to the Omie API using the custom `Api` class (`src/api/`).
100 | 
101 | **Data Processing:**
102 | After receiving the data, the script:
103 | - Removes unwanted fields using a predefined blacklist.
104 | - Normalizes the JSON data using Pandas.
105 | 
106 | **Data Storage:**
107 | The processed data is stored in a PostgreSQL database:
108 | - For the first page of data, the corresponding table is created (or replaced).
109 | - For subsequent pages, the data is appended to the table.
110 | 
111 | **File Saving Option:** There is also functionality to save the raw JSON response to a file. An illustrative sketch of this end-to-end flow is included in the appendix at the end of this README.
112 | 
113 | ## Usage
114 | 
115 | Run the main integration script with:
116 | 
117 | ```
118 | python main.py
119 | ```
120 | 
121 | As the script runs, it will:
122 | - Connect to the Omie API using your credentials.
123 | - Retrieve and process data from each endpoint.
124 | - Store the data in your PostgreSQL database.
125 | - Output progress messages to the console, including the total pages and records fetched.
126 | 
127 | ## Contributing
128 | Contributions are welcome! If you have suggestions or improvements, feel free to fork the repository and submit a pull request.
129 | 
130 | ## License
131 | This project does not specify a license. Please contact the repository owner for more details.
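
## Appendix: Illustrative Flow Sketch

The sketch below is **not** taken from this repository; it is a minimal, hedged illustration of the flow described in [How It Works](#how-it-works), calling `requests`, `pandas`, and `sqlalchemy` directly instead of the project's own `Api`, `Endpoints`, and configuration classes. The request/response shape (`pagina`, `registros_por_pagina`, `total_de_paginas`), the `ListarClientes` call, and the `clientes_cadastro` response key are assumptions about typical Omie list endpoints; the field blacklist comes from the Features section above. Consult `main.py` and `src/` for the actual implementation.

```python
"""Illustrative only: condensed fetch -> clean -> store flow (not the project's code)."""
import os

import pandas as pd
import requests
from sqlalchemy import create_engine

BASE_URL = os.getenv("BASE_URL", "https://api.omie.com.br/api/v1/")

# Fields dropped from every record before storage (blacklist from the README).
BLACKLIST = {"tags", "recomendacoes", "homepage", "fax_ddd",
             "bloquear_exclusao", "produtor_rural"}


def fetch_page(resource, call, page, per_page=50):
    """POST one page of an Omie list endpoint (payload shape is an assumption)."""
    payload = {
        "call": call,
        "app_key": os.getenv("APP_KEY"),
        "app_secret": os.getenv("APP_SECRET"),
        "param": [{"pagina": page, "registros_por_pagina": per_page}],
    }
    response = requests.post(f"{BASE_URL}{resource}", json=payload, timeout=30)
    response.raise_for_status()
    return response.json()


def clean(records):
    """Drop blacklisted fields from each record."""
    return [{k: v for k, v in row.items() if k not in BLACKLIST} for row in records]


def sync_endpoint(resource, call, records_key, table):
    """Paginate through one endpoint and load the cleaned records into PostgreSQL."""
    engine = create_engine(
        f"postgresql://{os.getenv('DB_USERNANE')}:{os.getenv('DB_PASSWORD')}"
        f"@{os.getenv('DB_HOST')}:{os.getenv('DB_PORT')}/{os.getenv('DB_NAME')}"
    )
    first = fetch_page(resource, call, page=1)
    total_pages = first.get("total_de_paginas", 1)  # assumed response field

    for page in range(1, total_pages + 1):
        data = first if page == 1 else fetch_page(resource, call, page)
        df = pd.json_normalize(clean(data.get(records_key, [])))
        # First page replaces the table; subsequent pages append to it.
        df.to_sql(table, engine, if_exists="replace" if page == 1 else "append", index=False)
        print(f"{table}: stored page {page}/{total_pages} ({len(df)} records)")


if __name__ == "__main__":
    # Hypothetical example: the customer listing endpoint.
    sync_endpoint("geral/clientes/", "ListarClientes", "clientes_cadastro", "clientes")
```

Using `if_exists="replace"` for the first page and `"append"` afterwards mirrors the behaviour described under Data Storage; the real table names, endpoint paths, and response keys should be taken from `main.py` and `src/endpoints/`.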
132 | -------------------------------------------------------------------------------- /config/airflow.cfg: -------------------------------------------------------------------------------- 1 | [core] 2 | # The folder where your airflow pipelines live, most likely a 3 | # subfolder in a code repository. This path must be absolute. 4 | # 5 | # Variable: AIRFLOW__CORE__DAGS_FOLDER 6 | # 7 | dags_folder = /opt/airflow/dags 8 | 9 | # Hostname by providing a path to a callable, which will resolve the hostname. 10 | # The format is "package.function". 11 | # 12 | # For example, default value ``airflow.utils.net.getfqdn`` means that result from patched 13 | # version of `socket.getfqdn() `__, 14 | # see related `CPython Issue `__. 15 | # 16 | # No argument should be required in the function specified. 17 | # If using IP address as hostname is preferred, use value ``airflow.utils.net.get_host_ip_address`` 18 | # 19 | # Variable: AIRFLOW__CORE__HOSTNAME_CALLABLE 20 | # 21 | hostname_callable = airflow.utils.net.getfqdn 22 | 23 | # A callable to check if a python file has airflow dags defined or not and should 24 | # return ``True`` if it has dags otherwise ``False``. 25 | # If this is not provided, Airflow uses its own heuristic rules. 26 | # 27 | # The function should have the following signature 28 | # 29 | # .. code-block:: python 30 | # 31 | # def func_name(file_path: str, zip_file: zipfile.ZipFile | None = None) -> bool: ... 32 | # 33 | # Variable: AIRFLOW__CORE__MIGHT_CONTAIN_DAG_CALLABLE 34 | # 35 | might_contain_dag_callable = airflow.utils.file.might_contain_dag_via_default_heuristic 36 | 37 | # Default timezone in case supplied date times are naive 38 | # can be `UTC` (default), `system`, or any `IANA ` 39 | # timezone string (e.g. Europe/Amsterdam) 40 | # 41 | # Variable: AIRFLOW__CORE__DEFAULT_TIMEZONE 42 | # 43 | default_timezone = utc 44 | 45 | # The executor class that airflow should use. Choices include 46 | # ``SequentialExecutor``, ``LocalExecutor``, ``CeleryExecutor``, 47 | # ``KubernetesExecutor``, ``CeleryKubernetesExecutor``, ``LocalKubernetesExecutor`` or the 48 | # full import path to the class when using a custom executor. 49 | # 50 | # Variable: AIRFLOW__CORE__EXECUTOR 51 | # 52 | executor = SequentialExecutor 53 | 54 | # The auth manager class that airflow should use. Full import path to the auth manager class. 55 | # 56 | # Variable: AIRFLOW__CORE__AUTH_MANAGER 57 | # 58 | auth_manager = airflow.providers.fab.auth_manager.fab_auth_manager.FabAuthManager 59 | 60 | # This defines the maximum number of task instances that can run concurrently per scheduler in 61 | # Airflow, regardless of the worker count. Generally this value, multiplied by the number of 62 | # schedulers in your cluster, is the maximum number of task instances with the running 63 | # state in the metadata database. Setting this value to zero allows unlimited parallelism. 64 | # 65 | # Variable: AIRFLOW__CORE__PARALLELISM 66 | # 67 | parallelism = 32 68 | 69 | # The maximum number of task instances allowed to run concurrently in each DAG. To calculate 70 | # the number of tasks that is running concurrently for a DAG, add up the number of running 71 | # tasks for all DAG runs of the DAG. This is configurable at the DAG level with ``max_active_tasks``, 72 | # which is defaulted as ``[core] max_active_tasks_per_dag``. 73 | # 74 | # An example scenario when this would be useful is when you want to stop a new dag with an early 75 | # start date from stealing all the executor slots in a cluster. 
76 | # 77 | # Variable: AIRFLOW__CORE__MAX_ACTIVE_TASKS_PER_DAG 78 | # 79 | max_active_tasks_per_dag = 16 80 | 81 | # Are DAGs paused by default at creation 82 | # 83 | # Variable: AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION 84 | # 85 | dags_are_paused_at_creation = True 86 | 87 | # The maximum number of active DAG runs per DAG. The scheduler will not create more DAG runs 88 | # if it reaches the limit. This is configurable at the DAG level with ``max_active_runs``, 89 | # which is defaulted as ``[core] max_active_runs_per_dag``. 90 | # 91 | # Variable: AIRFLOW__CORE__MAX_ACTIVE_RUNS_PER_DAG 92 | # 93 | max_active_runs_per_dag = 16 94 | 95 | # (experimental) The maximum number of consecutive DAG failures before DAG is automatically paused. 96 | # This is also configurable per DAG level with ``max_consecutive_failed_dag_runs``, 97 | # which is defaulted as ``[core] max_consecutive_failed_dag_runs_per_dag``. 98 | # If not specified, then the value is considered as 0, 99 | # meaning that the dags are never paused out by default. 100 | # 101 | # Variable: AIRFLOW__CORE__MAX_CONSECUTIVE_FAILED_DAG_RUNS_PER_DAG 102 | # 103 | max_consecutive_failed_dag_runs_per_dag = 0 104 | 105 | # The name of the method used in order to start Python processes via the multiprocessing module. 106 | # This corresponds directly with the options available in the Python docs: 107 | # `multiprocessing.set_start_method 108 | # `__ 109 | # must be one of the values returned by `multiprocessing.get_all_start_methods() 110 | # `__. 111 | # 112 | # Example: mp_start_method = fork 113 | # 114 | # Variable: AIRFLOW__CORE__MP_START_METHOD 115 | # 116 | # mp_start_method = 117 | 118 | # Whether to load the DAG examples that ship with Airflow. It's good to 119 | # get started, but you probably want to set this to ``False`` in a production 120 | # environment 121 | # 122 | # Variable: AIRFLOW__CORE__LOAD_EXAMPLES 123 | # 124 | load_examples = True 125 | 126 | # Path to the folder containing Airflow plugins 127 | # 128 | # Variable: AIRFLOW__CORE__PLUGINS_FOLDER 129 | # 130 | plugins_folder = /opt/airflow/plugins 131 | 132 | # Should tasks be executed via forking of the parent process 133 | # 134 | # * ``False``: Execute via forking of the parent process 135 | # * ``True``: Spawning a new python process, slower than fork, but means plugin changes picked 136 | # up by tasks straight away 137 | # 138 | # Variable: AIRFLOW__CORE__EXECUTE_TASKS_NEW_PYTHON_INTERPRETER 139 | # 140 | execute_tasks_new_python_interpreter = False 141 | 142 | # Secret key to save connection passwords in the db 143 | # 144 | # Variable: AIRFLOW__CORE__FERNET_KEY 145 | # 146 | fernet_key = 147 | 148 | # Whether to disable pickling dags 149 | # 150 | # Variable: AIRFLOW__CORE__DONOT_PICKLE 151 | # 152 | donot_pickle = True 153 | 154 | # How long before timing out a python file import 155 | # 156 | # Variable: AIRFLOW__CORE__DAGBAG_IMPORT_TIMEOUT 157 | # 158 | dagbag_import_timeout = 30.0 159 | 160 | # Should a traceback be shown in the UI for dagbag import errors, 161 | # instead of just the exception message 162 | # 163 | # Variable: AIRFLOW__CORE__DAGBAG_IMPORT_ERROR_TRACEBACKS 164 | # 165 | dagbag_import_error_tracebacks = True 166 | 167 | # If tracebacks are shown, how many entries from the traceback should be shown 168 | # 169 | # Variable: AIRFLOW__CORE__DAGBAG_IMPORT_ERROR_TRACEBACK_DEPTH 170 | # 171 | dagbag_import_error_traceback_depth = 2 172 | 173 | # How long before timing out a DagFileProcessor, which processes a dag file 174 | # 175 | # Variable: 
AIRFLOW__CORE__DAG_FILE_PROCESSOR_TIMEOUT 176 | # 177 | dag_file_processor_timeout = 50 178 | 179 | # The class to use for running task instances in a subprocess. 180 | # Choices include StandardTaskRunner, CgroupTaskRunner or the full import path to the class 181 | # when using a custom task runner. 182 | # 183 | # Variable: AIRFLOW__CORE__TASK_RUNNER 184 | # 185 | task_runner = StandardTaskRunner 186 | 187 | # If set, tasks without a ``run_as_user`` argument will be run with this user 188 | # Can be used to de-elevate a sudo user running Airflow when executing tasks 189 | # 190 | # Variable: AIRFLOW__CORE__DEFAULT_IMPERSONATION 191 | # 192 | default_impersonation = 193 | 194 | # What security module to use (for example kerberos) 195 | # 196 | # Variable: AIRFLOW__CORE__SECURITY 197 | # 198 | security = 199 | 200 | # Turn unit test mode on (overwrites many configuration options with test 201 | # values at runtime) 202 | # 203 | # Variable: AIRFLOW__CORE__UNIT_TEST_MODE 204 | # 205 | unit_test_mode = False 206 | 207 | # Whether to enable pickling for xcom (note that this is insecure and allows for 208 | # RCE exploits). 209 | # 210 | # Variable: AIRFLOW__CORE__ENABLE_XCOM_PICKLING 211 | # 212 | enable_xcom_pickling = False 213 | 214 | # What classes can be imported during deserialization. This is a multi line value. 215 | # The individual items will be parsed as a pattern to a glob function. 216 | # Python built-in classes (like dict) are always allowed. 217 | # 218 | # Variable: AIRFLOW__CORE__ALLOWED_DESERIALIZATION_CLASSES 219 | # 220 | allowed_deserialization_classes = airflow.* 221 | 222 | # What classes can be imported during deserialization. This is a multi line value. 223 | # The individual items will be parsed as regexp patterns. 224 | # This is a secondary option to ``[core] allowed_deserialization_classes``. 225 | # 226 | # Variable: AIRFLOW__CORE__ALLOWED_DESERIALIZATION_CLASSES_REGEXP 227 | # 228 | allowed_deserialization_classes_regexp = 229 | 230 | # When a task is killed forcefully, this is the amount of time in seconds that 231 | # it has to cleanup after it is sent a SIGTERM, before it is SIGKILLED 232 | # 233 | # Variable: AIRFLOW__CORE__KILLED_TASK_CLEANUP_TIME 234 | # 235 | killed_task_cleanup_time = 60 236 | 237 | # Whether to override params with dag_run.conf. If you pass some key-value pairs 238 | # through ``airflow dags backfill -c`` or 239 | # ``airflow dags trigger -c``, the key-value pairs will override the existing ones in params. 240 | # 241 | # Variable: AIRFLOW__CORE__DAG_RUN_CONF_OVERRIDES_PARAMS 242 | # 243 | dag_run_conf_overrides_params = True 244 | 245 | # If enabled, Airflow will only scan files containing both ``DAG`` and ``airflow`` (case-insensitive). 246 | # 247 | # Variable: AIRFLOW__CORE__DAG_DISCOVERY_SAFE_MODE 248 | # 249 | dag_discovery_safe_mode = True 250 | 251 | # The pattern syntax used in the 252 | # `.airflowignore 253 | # `__ 254 | # files in the DAG directories. Valid values are ``regexp`` or ``glob``. 255 | # 256 | # Variable: AIRFLOW__CORE__DAG_IGNORE_FILE_SYNTAX 257 | # 258 | dag_ignore_file_syntax = regexp 259 | 260 | # The number of retries each task is going to have by default. Can be overridden at dag or task level. 261 | # 262 | # Variable: AIRFLOW__CORE__DEFAULT_TASK_RETRIES 263 | # 264 | default_task_retries = 0 265 | 266 | # The number of seconds each task is going to wait by default between retries. Can be overridden at 267 | # dag or task level. 
268 | # 269 | # Variable: AIRFLOW__CORE__DEFAULT_TASK_RETRY_DELAY 270 | # 271 | default_task_retry_delay = 300 272 | 273 | # The maximum delay (in seconds) each task is going to wait by default between retries. 274 | # This is a global setting and cannot be overridden at task or DAG level. 275 | # 276 | # Variable: AIRFLOW__CORE__MAX_TASK_RETRY_DELAY 277 | # 278 | max_task_retry_delay = 86400 279 | 280 | # The weighting method used for the effective total priority weight of the task 281 | # 282 | # Variable: AIRFLOW__CORE__DEFAULT_TASK_WEIGHT_RULE 283 | # 284 | default_task_weight_rule = downstream 285 | 286 | # Maximum possible time (in seconds) that task will have for execution of auxiliary processes 287 | # (like listeners, mini scheduler...) after task is marked as success.. 288 | # 289 | # Variable: AIRFLOW__CORE__TASK_SUCCESS_OVERTIME 290 | # 291 | task_success_overtime = 20 292 | 293 | # The default task execution_timeout value for the operators. Expected an integer value to 294 | # be passed into timedelta as seconds. If not specified, then the value is considered as None, 295 | # meaning that the operators are never timed out by default. 296 | # 297 | # Variable: AIRFLOW__CORE__DEFAULT_TASK_EXECUTION_TIMEOUT 298 | # 299 | default_task_execution_timeout = 300 | 301 | # Updating serialized DAG can not be faster than a minimum interval to reduce database write rate. 302 | # 303 | # Variable: AIRFLOW__CORE__MIN_SERIALIZED_DAG_UPDATE_INTERVAL 304 | # 305 | min_serialized_dag_update_interval = 30 306 | 307 | # If ``True``, serialized DAGs are compressed before writing to DB. 308 | # 309 | # .. note:: 310 | # 311 | # This will disable the DAG dependencies view 312 | # 313 | # Variable: AIRFLOW__CORE__COMPRESS_SERIALIZED_DAGS 314 | # 315 | compress_serialized_dags = False 316 | 317 | # Fetching serialized DAG can not be faster than a minimum interval to reduce database 318 | # read rate. This config controls when your DAGs are updated in the Webserver 319 | # 320 | # Variable: AIRFLOW__CORE__MIN_SERIALIZED_DAG_FETCH_INTERVAL 321 | # 322 | min_serialized_dag_fetch_interval = 10 323 | 324 | # Maximum number of Rendered Task Instance Fields (Template Fields) per task to store 325 | # in the Database. 326 | # All the template_fields for each of Task Instance are stored in the Database. 327 | # Keeping this number small may cause an error when you try to view ``Rendered`` tab in 328 | # TaskInstance view for older tasks. 329 | # 330 | # Variable: AIRFLOW__CORE__MAX_NUM_RENDERED_TI_FIELDS_PER_TASK 331 | # 332 | max_num_rendered_ti_fields_per_task = 30 333 | 334 | # On each dagrun check against defined SLAs 335 | # 336 | # Variable: AIRFLOW__CORE__CHECK_SLAS 337 | # 338 | check_slas = True 339 | 340 | # Path to custom XCom class that will be used to store and resolve operators results 341 | # 342 | # Example: xcom_backend = path.to.CustomXCom 343 | # 344 | # Variable: AIRFLOW__CORE__XCOM_BACKEND 345 | # 346 | xcom_backend = airflow.models.xcom.BaseXCom 347 | 348 | # By default Airflow plugins are lazily-loaded (only loaded when required). Set it to ``False``, 349 | # if you want to load plugins whenever 'airflow' is invoked via cli or loaded from module. 350 | # 351 | # Variable: AIRFLOW__CORE__LAZY_LOAD_PLUGINS 352 | # 353 | lazy_load_plugins = True 354 | 355 | # By default Airflow providers are lazily-discovered (discovery and imports happen only when required). 356 | # Set it to ``False``, if you want to discover providers whenever 'airflow' is invoked via cli or 357 | # loaded from module. 
358 | # 359 | # Variable: AIRFLOW__CORE__LAZY_DISCOVER_PROVIDERS 360 | # 361 | lazy_discover_providers = True 362 | 363 | # Hide sensitive **Variables** or **Connection extra json keys** from UI 364 | # and task logs when set to ``True`` 365 | # 366 | # .. note:: 367 | # 368 | # Connection passwords are always hidden in logs 369 | # 370 | # Variable: AIRFLOW__CORE__HIDE_SENSITIVE_VAR_CONN_FIELDS 371 | # 372 | hide_sensitive_var_conn_fields = True 373 | 374 | # A comma-separated list of extra sensitive keywords to look for in variables names or connection's 375 | # extra JSON. 376 | # 377 | # Variable: AIRFLOW__CORE__SENSITIVE_VAR_CONN_NAMES 378 | # 379 | sensitive_var_conn_names = 380 | 381 | # Task Slot counts for ``default_pool``. This setting would not have any effect in an existing 382 | # deployment where the ``default_pool`` is already created. For existing deployments, users can 383 | # change the number of slots using Webserver, API or the CLI 384 | # 385 | # Variable: AIRFLOW__CORE__DEFAULT_POOL_TASK_SLOT_COUNT 386 | # 387 | default_pool_task_slot_count = 128 388 | 389 | # The maximum list/dict length an XCom can push to trigger task mapping. If the pushed list/dict has a 390 | # length exceeding this value, the task pushing the XCom will be failed automatically to prevent the 391 | # mapped tasks from clogging the scheduler. 392 | # 393 | # Variable: AIRFLOW__CORE__MAX_MAP_LENGTH 394 | # 395 | max_map_length = 1024 396 | 397 | # The default umask to use for process when run in daemon mode (scheduler, worker, etc.) 398 | # 399 | # This controls the file-creation mode mask which determines the initial value of file permission bits 400 | # for newly created files. 401 | # 402 | # This value is treated as an octal-integer. 403 | # 404 | # Variable: AIRFLOW__CORE__DAEMON_UMASK 405 | # 406 | daemon_umask = 0o077 407 | 408 | # Class to use as dataset manager. 409 | # 410 | # Example: dataset_manager_class = airflow.datasets.manager.DatasetManager 411 | # 412 | # Variable: AIRFLOW__CORE__DATASET_MANAGER_CLASS 413 | # 414 | # dataset_manager_class = 415 | 416 | # Kwargs to supply to dataset manager. 417 | # 418 | # Example: dataset_manager_kwargs = {"some_param": "some_value"} 419 | # 420 | # Variable: AIRFLOW__CORE__DATASET_MANAGER_KWARGS 421 | # 422 | # dataset_manager_kwargs = 423 | 424 | # Dataset URI validation should raise an exception if it is not compliant with AIP-60. 425 | # By default this configuration is false, meaning that Airflow 2.x only warns the user. 426 | # In Airflow 3, this configuration will be removed, unconditionally enabling strict validation. 427 | # 428 | # Variable: AIRFLOW__CORE__STRICT_DATASET_URI_VALIDATION 429 | # 430 | strict_dataset_uri_validation = False 431 | 432 | # (experimental) Whether components should use Airflow Internal API for DB connectivity. 433 | # 434 | # Variable: AIRFLOW__CORE__DATABASE_ACCESS_ISOLATION 435 | # 436 | database_access_isolation = False 437 | 438 | # (experimental) Airflow Internal API url. 439 | # Only used if ``[core] database_access_isolation`` is ``True``. 440 | # 441 | # Example: internal_api_url = http://localhost:8080 442 | # 443 | # Variable: AIRFLOW__CORE__INTERNAL_API_URL 444 | # 445 | # internal_api_url = 446 | 447 | # Secret key used to authenticate internal API clients to core. It should be as random as possible. 448 | # However, when running more than 1 instances of webserver / internal API services, make sure all 449 | # of them use the same ``secret_key`` otherwise calls will fail on authentication. 
450 | # The authentication token generated using the secret key has a short expiry time though - make 451 | # sure that time on ALL the machines that you run airflow components on is synchronized 452 | # (for example using ntpd) otherwise you might get "forbidden" errors when the logs are accessed. 453 | # 454 | # Variable: AIRFLOW__CORE__INTERNAL_API_SECRET_KEY 455 | # 456 | internal_api_secret_key = bIAkfIszaatwB9ni0WMINw== 457 | 458 | # The ability to allow testing connections across Airflow UI, API and CLI. 459 | # Supported options: ``Disabled``, ``Enabled``, ``Hidden``. Default: Disabled 460 | # Disabled - Disables the test connection functionality and disables the Test Connection button in UI. 461 | # Enabled - Enables the test connection functionality and shows the Test Connection button in UI. 462 | # Hidden - Disables the test connection functionality and hides the Test Connection button in UI. 463 | # Before setting this to Enabled, make sure that you review the users who are able to add/edit 464 | # connections and ensure they are trusted. Connection testing can be done maliciously leading to 465 | # undesired and insecure outcomes. 466 | # See `Airflow Security Model: Capabilities of authenticated UI users 467 | # `__ 468 | # for more details. 469 | # 470 | # Variable: AIRFLOW__CORE__TEST_CONNECTION 471 | # 472 | test_connection = Disabled 473 | 474 | # The maximum length of the rendered template field. If the value to be stored in the 475 | # rendered template field exceeds this size, it's redacted. 476 | # 477 | # Variable: AIRFLOW__CORE__MAX_TEMPLATED_FIELD_LENGTH 478 | # 479 | max_templated_field_length = 4096 480 | 481 | [database] 482 | # Path to the ``alembic.ini`` file. You can either provide the file path relative 483 | # to the Airflow home directory or the absolute path if it is located elsewhere. 484 | # 485 | # Variable: AIRFLOW__DATABASE__ALEMBIC_INI_FILE_PATH 486 | # 487 | alembic_ini_file_path = alembic.ini 488 | 489 | # The SQLAlchemy connection string to the metadata database. 490 | # SQLAlchemy supports many different database engines. 491 | # See: `Set up a Database Backend: Database URI 492 | # `__ 493 | # for more details. 494 | # 495 | # Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_CONN 496 | # 497 | sql_alchemy_conn = sqlite:////opt/airflow/airflow.db 498 | 499 | # Extra engine specific keyword args passed to SQLAlchemy's create_engine, as a JSON-encoded value 500 | # 501 | # Example: sql_alchemy_engine_args = {"arg1": true} 502 | # 503 | # Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_ENGINE_ARGS 504 | # 505 | # sql_alchemy_engine_args = 506 | 507 | # The encoding for the databases 508 | # 509 | # Variable: AIRFLOW__DATABASE__SQL_ENGINE_ENCODING 510 | # 511 | sql_engine_encoding = utf-8 512 | 513 | # Collation for ``dag_id``, ``task_id``, ``key``, ``external_executor_id`` columns 514 | # in case they have different encoding. 515 | # By default this collation is the same as the database collation, however for ``mysql`` and ``mariadb`` 516 | # the default is ``utf8mb3_bin`` so that the index sizes of our index keys will not exceed 517 | # the maximum size of allowed index when collation is set to ``utf8mb4`` variant, see 518 | # `GitHub Issue Comment `__ 519 | # for more details. 520 | # 521 | # Variable: AIRFLOW__DATABASE__SQL_ENGINE_COLLATION_FOR_IDS 522 | # 523 | # sql_engine_collation_for_ids = 524 | 525 | # If SQLAlchemy should pool database connections. 
526 | # 527 | # Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_POOL_ENABLED 528 | # 529 | sql_alchemy_pool_enabled = True 530 | 531 | # The SQLAlchemy pool size is the maximum number of database connections 532 | # in the pool. 0 indicates no limit. 533 | # 534 | # Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_POOL_SIZE 535 | # 536 | sql_alchemy_pool_size = 5 537 | 538 | # The maximum overflow size of the pool. 539 | # When the number of checked-out connections reaches the size set in pool_size, 540 | # additional connections will be returned up to this limit. 541 | # When those additional connections are returned to the pool, they are disconnected and discarded. 542 | # It follows then that the total number of simultaneous connections the pool will allow 543 | # is **pool_size** + **max_overflow**, 544 | # and the total number of "sleeping" connections the pool will allow is pool_size. 545 | # max_overflow can be set to ``-1`` to indicate no overflow limit; 546 | # no limit will be placed on the total number of concurrent connections. Defaults to ``10``. 547 | # 548 | # Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_MAX_OVERFLOW 549 | # 550 | sql_alchemy_max_overflow = 10 551 | 552 | # The SQLAlchemy pool recycle is the number of seconds a connection 553 | # can be idle in the pool before it is invalidated. This config does 554 | # not apply to sqlite. If the number of DB connections is ever exceeded, 555 | # a lower config value will allow the system to recover faster. 556 | # 557 | # Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_POOL_RECYCLE 558 | # 559 | sql_alchemy_pool_recycle = 1800 560 | 561 | # Check connection at the start of each connection pool checkout. 562 | # Typically, this is a simple statement like "SELECT 1". 563 | # See `SQLAlchemy Pooling: Disconnect Handling - Pessimistic 564 | # `__ 565 | # for more details. 566 | # 567 | # Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_POOL_PRE_PING 568 | # 569 | sql_alchemy_pool_pre_ping = True 570 | 571 | # The schema to use for the metadata database. 572 | # SQLAlchemy supports databases with the concept of multiple schemas. 573 | # 574 | # Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_SCHEMA 575 | # 576 | sql_alchemy_schema = 577 | 578 | # Import path for connect args in SQLAlchemy. Defaults to an empty dict. 579 | # This is useful when you want to configure db engine args that SQLAlchemy won't parse 580 | # in connection string. This can be set by passing a dictionary containing the create engine parameters. 581 | # For more details about passing create engine parameters (keepalives variables, timeout etc) 582 | # in Postgres DB Backend see `Setting up a PostgreSQL Database 583 | # `__ 584 | # e.g ``connect_args={"timeout":30}`` can be defined in ``airflow_local_settings.py`` and 585 | # can be imported as shown below 586 | # 587 | # Example: sql_alchemy_connect_args = airflow_local_settings.connect_args 588 | # 589 | # Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_CONNECT_ARGS 590 | # 591 | # sql_alchemy_connect_args = 592 | 593 | # Important Warning: Use of sql_alchemy_session_maker Highly Discouraged 594 | # Import path for function which returns 'sqlalchemy.orm.sessionmaker'. 595 | # Improper configuration of sql_alchemy_session_maker can lead to serious issues, 596 | # including data corruption, unrecoverable application crashes. Please review the SQLAlchemy 597 | # documentation for detailed guidance on proper configuration and best practices. 
598 | # 599 | # Example: sql_alchemy_session_maker = airflow_local_settings._sessionmaker 600 | # 601 | # Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_SESSION_MAKER 602 | # 603 | # sql_alchemy_session_maker = 604 | 605 | # Whether to load the default connections that ship with Airflow when ``airflow db init`` is called. 606 | # It's good to get started, but you probably want to set this to ``False`` in a production environment. 607 | # 608 | # Variable: AIRFLOW__DATABASE__LOAD_DEFAULT_CONNECTIONS 609 | # 610 | load_default_connections = True 611 | 612 | # Number of times the code should be retried in case of DB Operational Errors. 613 | # Not all transactions will be retried as it can cause undesired state. 614 | # Currently it is only used in ``DagFileProcessor.process_file`` to retry ``dagbag.sync_to_db``. 615 | # 616 | # Variable: AIRFLOW__DATABASE__MAX_DB_RETRIES 617 | # 618 | max_db_retries = 3 619 | 620 | # Whether to run alembic migrations during Airflow start up. Sometimes this operation can be expensive, 621 | # and the users can assert the correct version through other means (e.g. through a Helm chart). 622 | # Accepts ``True`` or ``False``. 623 | # 624 | # Variable: AIRFLOW__DATABASE__CHECK_MIGRATIONS 625 | # 626 | check_migrations = True 627 | 628 | [logging] 629 | # The folder where airflow should store its log files. 630 | # This path must be absolute. 631 | # There are a few existing configurations that assume this is set to the default. 632 | # If you choose to override this you may need to update the 633 | # ``[logging] dag_processor_manager_log_location`` and 634 | # ``[logging] child_process_log_directory settings`` as well. 635 | # 636 | # Variable: AIRFLOW__LOGGING__BASE_LOG_FOLDER 637 | # 638 | base_log_folder = /opt/airflow/logs 639 | 640 | # Airflow can store logs remotely in AWS S3, Google Cloud Storage or Elastic Search. 641 | # Set this to ``True`` if you want to enable remote logging. 642 | # 643 | # Variable: AIRFLOW__LOGGING__REMOTE_LOGGING 644 | # 645 | remote_logging = False 646 | 647 | # Users must supply an Airflow connection id that provides access to the storage 648 | # location. Depending on your remote logging service, this may only be used for 649 | # reading logs, not writing them. 650 | # 651 | # Variable: AIRFLOW__LOGGING__REMOTE_LOG_CONN_ID 652 | # 653 | remote_log_conn_id = 654 | 655 | # Whether the local log files for GCS, S3, WASB and OSS remote logging should be deleted after 656 | # they are uploaded to the remote location. 657 | # 658 | # Variable: AIRFLOW__LOGGING__DELETE_LOCAL_LOGS 659 | # 660 | delete_local_logs = False 661 | 662 | # Path to Google Credential JSON file. If omitted, authorization based on `the Application Default 663 | # Credentials 664 | # `__ will 665 | # be used. 666 | # 667 | # Variable: AIRFLOW__LOGGING__GOOGLE_KEY_PATH 668 | # 669 | google_key_path = 670 | 671 | # Storage bucket URL for remote logging 672 | # S3 buckets should start with **s3://** 673 | # Cloudwatch log groups should start with **cloudwatch://** 674 | # GCS buckets should start with **gs://** 675 | # WASB buckets should start with **wasb** just to help Airflow select correct handler 676 | # Stackdriver logs should start with **stackdriver://** 677 | # 678 | # Variable: AIRFLOW__LOGGING__REMOTE_BASE_LOG_FOLDER 679 | # 680 | remote_base_log_folder = 681 | 682 | # The remote_task_handler_kwargs param is loaded into a dictionary and passed to the ``__init__`` 683 | # of remote task handler and it overrides the values provided by Airflow config. 
For example if you set 684 | # ``delete_local_logs=False`` and you provide ``{"delete_local_copy": true}``, then the local 685 | # log files will be deleted after they are uploaded to remote location. 686 | # 687 | # Example: remote_task_handler_kwargs = {"delete_local_copy": true} 688 | # 689 | # Variable: AIRFLOW__LOGGING__REMOTE_TASK_HANDLER_KWARGS 690 | # 691 | remote_task_handler_kwargs = 692 | 693 | # Use server-side encryption for logs stored in S3 694 | # 695 | # Variable: AIRFLOW__LOGGING__ENCRYPT_S3_LOGS 696 | # 697 | encrypt_s3_logs = False 698 | 699 | # Logging level. 700 | # 701 | # Supported values: ``CRITICAL``, ``ERROR``, ``WARNING``, ``INFO``, ``DEBUG``. 702 | # 703 | # Variable: AIRFLOW__LOGGING__LOGGING_LEVEL 704 | # 705 | logging_level = INFO 706 | 707 | # Logging level for celery. If not set, it uses the value of logging_level 708 | # 709 | # Supported values: ``CRITICAL``, ``ERROR``, ``WARNING``, ``INFO``, ``DEBUG``. 710 | # 711 | # Variable: AIRFLOW__LOGGING__CELERY_LOGGING_LEVEL 712 | # 713 | celery_logging_level = 714 | 715 | # Logging level for Flask-appbuilder UI. 716 | # 717 | # Supported values: ``CRITICAL``, ``ERROR``, ``WARNING``, ``INFO``, ``DEBUG``. 718 | # 719 | # Variable: AIRFLOW__LOGGING__FAB_LOGGING_LEVEL 720 | # 721 | fab_logging_level = WARNING 722 | 723 | # Logging class 724 | # Specify the class that will specify the logging configuration 725 | # This class has to be on the python classpath 726 | # 727 | # Example: logging_config_class = my.path.default_local_settings.LOGGING_CONFIG 728 | # 729 | # Variable: AIRFLOW__LOGGING__LOGGING_CONFIG_CLASS 730 | # 731 | logging_config_class = 732 | 733 | # Flag to enable/disable Colored logs in Console 734 | # Colour the logs when the controlling terminal is a TTY. 735 | # 736 | # Variable: AIRFLOW__LOGGING__COLORED_CONSOLE_LOG 737 | # 738 | colored_console_log = True 739 | 740 | # Log format for when Colored logs is enabled 741 | # 742 | # Variable: AIRFLOW__LOGGING__COLORED_LOG_FORMAT 743 | # 744 | colored_log_format = [%%(blue)s%%(asctime)s%%(reset)s] {%%(blue)s%%(filename)s:%%(reset)s%%(lineno)d} %%(log_color)s%%(levelname)s%%(reset)s - %%(log_color)s%%(message)s%%(reset)s 745 | 746 | # Specifies the class utilized by Airflow to implement colored logging 747 | # 748 | # Variable: AIRFLOW__LOGGING__COLORED_FORMATTER_CLASS 749 | # 750 | colored_formatter_class = airflow.utils.log.colored_log.CustomTTYColoredFormatter 751 | 752 | # Format of Log line 753 | # 754 | # Variable: AIRFLOW__LOGGING__LOG_FORMAT 755 | # 756 | log_format = [%%(asctime)s] {%%(filename)s:%%(lineno)d} %%(levelname)s - %%(message)s 757 | 758 | # Defines the format of log messages for simple logging configuration 759 | # 760 | # Variable: AIRFLOW__LOGGING__SIMPLE_LOG_FORMAT 761 | # 762 | simple_log_format = %%(asctime)s %%(levelname)s - %%(message)s 763 | 764 | # Where to send dag parser logs. If "file", logs are sent to log files defined by child_process_log_directory. 
765 | # 766 | # Variable: AIRFLOW__LOGGING__DAG_PROCESSOR_LOG_TARGET 767 | # 768 | dag_processor_log_target = file 769 | 770 | # Format of Dag Processor Log line 771 | # 772 | # Variable: AIRFLOW__LOGGING__DAG_PROCESSOR_LOG_FORMAT 773 | # 774 | dag_processor_log_format = [%%(asctime)s] [SOURCE:DAG_PROCESSOR] {%%(filename)s:%%(lineno)d} %%(levelname)s - %%(message)s 775 | 776 | # Determines the formatter class used by Airflow for structuring its log messages 777 | # The default formatter class is timezone-aware, which means that timestamps attached to log entries 778 | # will be adjusted to reflect the local timezone of the Airflow instance 779 | # 780 | # Variable: AIRFLOW__LOGGING__LOG_FORMATTER_CLASS 781 | # 782 | log_formatter_class = airflow.utils.log.timezone_aware.TimezoneAware 783 | 784 | # An import path to a function to add adaptations of each secret added with 785 | # ``airflow.utils.log.secrets_masker.mask_secret`` to be masked in log messages. The given function 786 | # is expected to require a single parameter: the secret to be adapted. It may return a 787 | # single adaptation of the secret or an iterable of adaptations to each be masked as secrets. 788 | # The original secret will be masked as well as any adaptations returned. 789 | # 790 | # Example: secret_mask_adapter = urllib.parse.quote 791 | # 792 | # Variable: AIRFLOW__LOGGING__SECRET_MASK_ADAPTER 793 | # 794 | secret_mask_adapter = 795 | 796 | # Specify prefix pattern like mentioned below with stream handler ``TaskHandlerWithCustomFormatter`` 797 | # 798 | # Example: task_log_prefix_template = {{ti.dag_id}}-{{ti.task_id}}-{{execution_date}}-{{ti.try_number}} 799 | # 800 | # Variable: AIRFLOW__LOGGING__TASK_LOG_PREFIX_TEMPLATE 801 | # 802 | task_log_prefix_template = 803 | 804 | # Formatting for how airflow generates file names/paths for each task run. 805 | # 806 | # Variable: AIRFLOW__LOGGING__LOG_FILENAME_TEMPLATE 807 | # 808 | log_filename_template = dag_id={{ ti.dag_id }}/run_id={{ ti.run_id }}/task_id={{ ti.task_id }}/{%% if ti.map_index >= 0 %%}map_index={{ ti.map_index }}/{%% endif %%}attempt={{ try_number }}.log 809 | 810 | # Formatting for how airflow generates file names for log 811 | # 812 | # Variable: AIRFLOW__LOGGING__LOG_PROCESSOR_FILENAME_TEMPLATE 813 | # 814 | log_processor_filename_template = {{ filename }}.log 815 | 816 | # Full path of dag_processor_manager logfile. 817 | # 818 | # Variable: AIRFLOW__LOGGING__DAG_PROCESSOR_MANAGER_LOG_LOCATION 819 | # 820 | dag_processor_manager_log_location = /opt/airflow/logs/dag_processor_manager/dag_processor_manager.log 821 | 822 | # Whether DAG processor manager will write logs to stdout 823 | # 824 | # Variable: AIRFLOW__LOGGING__DAG_PROCESSOR_MANAGER_LOG_STDOUT 825 | # 826 | dag_processor_manager_log_stdout = False 827 | 828 | # Name of handler to read task instance logs. 829 | # Defaults to use ``task`` handler. 830 | # 831 | # Variable: AIRFLOW__LOGGING__TASK_LOG_READER 832 | # 833 | task_log_reader = task 834 | 835 | # A comma\-separated list of third-party logger names that will be configured to print messages to 836 | # consoles\. 837 | # 838 | # Example: extra_logger_names = connexion,sqlalchemy 839 | # 840 | # Variable: AIRFLOW__LOGGING__EXTRA_LOGGER_NAMES 841 | # 842 | extra_logger_names = 843 | 844 | # When you start an Airflow worker, Airflow starts a tiny web server 845 | # subprocess to serve the workers local log files to the airflow main 846 | # web server, who then builds pages and sends them to users. 
This defines 847 | # the port on which the logs are served. It needs to be unused, and open 848 | # visible from the main web server to connect into the workers. 849 | # 850 | # Variable: AIRFLOW__LOGGING__WORKER_LOG_SERVER_PORT 851 | # 852 | worker_log_server_port = 8793 853 | 854 | # Port to serve logs from for triggerer. 855 | # See ``[logging] worker_log_server_port`` description for more info. 856 | # 857 | # Variable: AIRFLOW__LOGGING__TRIGGER_LOG_SERVER_PORT 858 | # 859 | trigger_log_server_port = 8794 860 | 861 | # We must parse timestamps to interleave logs between trigger and task. To do so, 862 | # we need to parse timestamps in log files. In case your log format is non-standard, 863 | # you may provide import path to callable which takes a string log line and returns 864 | # the timestamp (datetime.datetime compatible). 865 | # 866 | # Example: interleave_timestamp_parser = path.to.my_func 867 | # 868 | # Variable: AIRFLOW__LOGGING__INTERLEAVE_TIMESTAMP_PARSER 869 | # 870 | # interleave_timestamp_parser = 871 | 872 | # Permissions in the form or of octal string as understood by chmod. The permissions are important 873 | # when you use impersonation, when logs are written by a different user than airflow. The most secure 874 | # way of configuring it in this case is to add both users to the same group and make it the default 875 | # group of both users. Group-writeable logs are default in airflow, but you might decide that you are 876 | # OK with having the logs other-writeable, in which case you should set it to ``0o777``. You might 877 | # decide to add more security if you do not use impersonation and change it to ``0o755`` to make it 878 | # only owner-writeable. You can also make it just readable only for owner by changing it to ``0o700`` 879 | # if all the access (read/write) for your logs happens from the same user. 880 | # 881 | # Example: file_task_handler_new_folder_permissions = 0o775 882 | # 883 | # Variable: AIRFLOW__LOGGING__FILE_TASK_HANDLER_NEW_FOLDER_PERMISSIONS 884 | # 885 | file_task_handler_new_folder_permissions = 0o775 886 | 887 | # Permissions in the form or of octal string as understood by chmod. The permissions are important 888 | # when you use impersonation, when logs are written by a different user than airflow. The most secure 889 | # way of configuring it in this case is to add both users to the same group and make it the default 890 | # group of both users. Group-writeable logs are default in airflow, but you might decide that you are 891 | # OK with having the logs other-writeable, in which case you should set it to ``0o666``. You might 892 | # decide to add more security if you do not use impersonation and change it to ``0o644`` to make it 893 | # only owner-writeable. You can also make it just readable only for owner by changing it to ``0o600`` 894 | # if all the access (read/write) for your logs happens from the same user. 895 | # 896 | # Example: file_task_handler_new_file_permissions = 0o664 897 | # 898 | # Variable: AIRFLOW__LOGGING__FILE_TASK_HANDLER_NEW_FILE_PERMISSIONS 899 | # 900 | file_task_handler_new_file_permissions = 0o664 901 | 902 | # By default Celery sends all logs into stderr. 903 | # If enabled any previous logging handlers will get *removed*. 904 | # With this option AirFlow will create new handlers 905 | # and send low level logs like INFO and WARNING to stdout, 906 | # while sending higher severity logs to stderr. 
907 | # 908 | # Variable: AIRFLOW__LOGGING__CELERY_STDOUT_STDERR_SEPARATION 909 | # 910 | celery_stdout_stderr_separation = False 911 | 912 | # If enabled, Airflow may ship messages to task logs from outside the task run context, e.g. from 913 | # the scheduler, executor, or callback execution context. This can help in circumstances such as 914 | # when there's something blocking the execution of the task and ordinarily there may be no task 915 | # logs at all. 916 | # This is set to ``True`` by default. If you encounter issues with this feature 917 | # (e.g. scheduler performance issues) it can be disabled. 918 | # 919 | # Variable: AIRFLOW__LOGGING__ENABLE_TASK_CONTEXT_LOGGER 920 | # 921 | enable_task_context_logger = True 922 | 923 | # A comma separated list of keywords related to errors whose presence should display the line in red 924 | # color in UI 925 | # 926 | # Variable: AIRFLOW__LOGGING__COLOR_LOG_ERROR_KEYWORDS 927 | # 928 | color_log_error_keywords = error,exception 929 | 930 | # A comma separated list of keywords related to warning whose presence should display the line in yellow 931 | # color in UI 932 | # 933 | # Variable: AIRFLOW__LOGGING__COLOR_LOG_WARNING_KEYWORDS 934 | # 935 | color_log_warning_keywords = warn 936 | 937 | [metrics] 938 | # `StatsD `__ integration settings. 939 | 940 | # If true, ``[metrics] metrics_allow_list`` and ``[metrics] metrics_block_list`` will use 941 | # regex pattern matching anywhere within the metric name instead of only prefix matching 942 | # at the start of the name. 943 | # 944 | # Variable: AIRFLOW__METRICS__METRICS_USE_PATTERN_MATCH 945 | # 946 | metrics_use_pattern_match = False 947 | 948 | # Configure an allow list (comma separated string) to send only certain metrics. 949 | # If ``[metrics] metrics_use_pattern_match`` is ``false``, match only the exact metric name prefix. 950 | # If ``[metrics] metrics_use_pattern_match`` is ``true``, provide regex patterns to match. 951 | # 952 | # Example: metrics_allow_list = "scheduler,executor,dagrun,pool,triggerer,celery" or "^scheduler,^executor,heartbeat|timeout" 953 | # 954 | # Variable: AIRFLOW__METRICS__METRICS_ALLOW_LIST 955 | # 956 | metrics_allow_list = 957 | 958 | # Configure a block list (comma separated string) to block certain metrics from being emitted. 959 | # If ``[metrics] metrics_allow_list`` and ``[metrics] metrics_block_list`` are both configured, 960 | # ``[metrics] metrics_block_list`` is ignored. 961 | # 962 | # If ``[metrics] metrics_use_pattern_match`` is ``false``, match only the exact metric name prefix. 963 | # 964 | # If ``[metrics] metrics_use_pattern_match`` is ``true``, provide regex patterns to match. 965 | # 966 | # Example: metrics_block_list = "scheduler,executor,dagrun,pool,triggerer,celery" or "^scheduler,^executor,heartbeat|timeout" 967 | # 968 | # Variable: AIRFLOW__METRICS__METRICS_BLOCK_LIST 969 | # 970 | metrics_block_list = 971 | 972 | # Enables sending metrics to StatsD. 
973 | # 974 | # Variable: AIRFLOW__METRICS__STATSD_ON 975 | # 976 | statsd_on = False 977 | 978 | # Specifies the host address where the StatsD daemon (or server) is running 979 | # 980 | # Variable: AIRFLOW__METRICS__STATSD_HOST 981 | # 982 | statsd_host = localhost 983 | 984 | # Specifies the port on which the StatsD daemon (or server) is listening to 985 | # 986 | # Variable: AIRFLOW__METRICS__STATSD_PORT 987 | # 988 | statsd_port = 8125 989 | 990 | # Defines the namespace for all metrics sent from Airflow to StatsD 991 | # 992 | # Variable: AIRFLOW__METRICS__STATSD_PREFIX 993 | # 994 | statsd_prefix = airflow 995 | 996 | # A function that validate the StatsD stat name, apply changes to the stat name if necessary and return 997 | # the transformed stat name. 998 | # 999 | # The function should have the following signature 1000 | # 1001 | # .. code-block:: python 1002 | # 1003 | # def func_name(stat_name: str) -> str: ... 1004 | # 1005 | # Variable: AIRFLOW__METRICS__STAT_NAME_HANDLER 1006 | # 1007 | stat_name_handler = 1008 | 1009 | # To enable datadog integration to send airflow metrics. 1010 | # 1011 | # Variable: AIRFLOW__METRICS__STATSD_DATADOG_ENABLED 1012 | # 1013 | statsd_datadog_enabled = False 1014 | 1015 | # List of datadog tags attached to all metrics(e.g: ``key1:value1,key2:value2``) 1016 | # 1017 | # Variable: AIRFLOW__METRICS__STATSD_DATADOG_TAGS 1018 | # 1019 | statsd_datadog_tags = 1020 | 1021 | # Set to ``False`` to disable metadata tags for some of the emitted metrics 1022 | # 1023 | # Variable: AIRFLOW__METRICS__STATSD_DATADOG_METRICS_TAGS 1024 | # 1025 | statsd_datadog_metrics_tags = True 1026 | 1027 | # If you want to utilise your own custom StatsD client set the relevant 1028 | # module path below. 1029 | # Note: The module path must exist on your 1030 | # `PYTHONPATH ` 1031 | # for Airflow to pick it up 1032 | # 1033 | # Variable: AIRFLOW__METRICS__STATSD_CUSTOM_CLIENT_PATH 1034 | # 1035 | # statsd_custom_client_path = 1036 | 1037 | # If you want to avoid sending all the available metrics tags to StatsD, 1038 | # you can configure a block list of prefixes (comma separated) to filter out metric tags 1039 | # that start with the elements of the list (e.g: ``job_id,run_id``) 1040 | # 1041 | # Example: statsd_disabled_tags = job_id,run_id,dag_id,task_id 1042 | # 1043 | # Variable: AIRFLOW__METRICS__STATSD_DISABLED_TAGS 1044 | # 1045 | statsd_disabled_tags = job_id,run_id 1046 | 1047 | # To enable sending Airflow metrics with StatsD-Influxdb tagging convention. 1048 | # 1049 | # Variable: AIRFLOW__METRICS__STATSD_INFLUXDB_ENABLED 1050 | # 1051 | statsd_influxdb_enabled = False 1052 | 1053 | # Enables sending metrics to OpenTelemetry. 1054 | # 1055 | # Variable: AIRFLOW__METRICS__OTEL_ON 1056 | # 1057 | otel_on = False 1058 | 1059 | # Specifies the hostname or IP address of the OpenTelemetry Collector to which Airflow sends 1060 | # metrics and traces. 1061 | # 1062 | # Variable: AIRFLOW__METRICS__OTEL_HOST 1063 | # 1064 | otel_host = localhost 1065 | 1066 | # Specifies the port of the OpenTelemetry Collector that is listening to. 1067 | # 1068 | # Variable: AIRFLOW__METRICS__OTEL_PORT 1069 | # 1070 | otel_port = 8889 1071 | 1072 | # The prefix for the Airflow metrics. 1073 | # 1074 | # Variable: AIRFLOW__METRICS__OTEL_PREFIX 1075 | # 1076 | otel_prefix = airflow 1077 | 1078 | # Defines the interval, in milliseconds, at which Airflow sends batches of metrics and traces 1079 | # to the configured OpenTelemetry Collector. 
1080 | # 1081 | # Variable: AIRFLOW__METRICS__OTEL_INTERVAL_MILLISECONDS 1082 | # 1083 | otel_interval_milliseconds = 60000 1084 | 1085 | # If ``True``, all metrics are also emitted to the console. Defaults to ``False``. 1086 | # 1087 | # Variable: AIRFLOW__METRICS__OTEL_DEBUGGING_ON 1088 | # 1089 | otel_debugging_on = False 1090 | 1091 | # The default service name of traces. 1092 | # 1093 | # Variable: AIRFLOW__METRICS__OTEL_SERVICE 1094 | # 1095 | otel_service = Airflow 1096 | 1097 | # If ``True``, SSL will be enabled. Defaults to ``False``. 1098 | # To establish an HTTPS connection to the OpenTelemetry collector, 1099 | # you need to configure the SSL certificate and key within the OpenTelemetry collector's 1100 | # ``config.yml`` file. 1101 | # 1102 | # Variable: AIRFLOW__METRICS__OTEL_SSL_ACTIVE 1103 | # 1104 | otel_ssl_active = False 1105 | 1106 | [traces] 1107 | # Distributed traces integration settings. 1108 | 1109 | # Enables sending traces to OpenTelemetry. 1110 | # 1111 | # Variable: AIRFLOW__TRACES__OTEL_ON 1112 | # 1113 | otel_on = False 1114 | 1115 | # Specifies the hostname or IP address of the OpenTelemetry Collector to which Airflow sends 1116 | # traces. 1117 | # 1118 | # Variable: AIRFLOW__TRACES__OTEL_HOST 1119 | # 1120 | otel_host = localhost 1121 | 1122 | # Specifies the port of the OpenTelemetry Collector that is listening to. 1123 | # 1124 | # Variable: AIRFLOW__TRACES__OTEL_PORT 1125 | # 1126 | otel_port = 8889 1127 | 1128 | # The default service name of traces. 1129 | # 1130 | # Variable: AIRFLOW__TRACES__OTEL_SERVICE 1131 | # 1132 | otel_service = Airflow 1133 | 1134 | # If True, all traces are also emitted to the console. Defaults to False. 1135 | # 1136 | # Variable: AIRFLOW__TRACES__OTEL_DEBUGGING_ON 1137 | # 1138 | otel_debugging_on = False 1139 | 1140 | # If True, SSL will be enabled. Defaults to False. 1141 | # To establish an HTTPS connection to the OpenTelemetry collector, 1142 | # you need to configure the SSL certificate and key within the OpenTelemetry collector's 1143 | # config.yml file. 1144 | # 1145 | # Variable: AIRFLOW__TRACES__OTEL_SSL_ACTIVE 1146 | # 1147 | otel_ssl_active = False 1148 | 1149 | # If True, after the task is complete, the full task log messages will be added as the 1150 | # span events, chunked by 64k size. defaults to False. 1151 | # 1152 | # Variable: AIRFLOW__TRACES__OTEL_TASK_LOG_EVENT 1153 | # 1154 | otel_task_log_event = False 1155 | 1156 | [secrets] 1157 | # Full class name of secrets backend to enable (will precede env vars and metastore in search path) 1158 | # 1159 | # Example: backend = airflow.providers.amazon.aws.secrets.systems_manager.SystemsManagerParameterStoreBackend 1160 | # 1161 | # Variable: AIRFLOW__SECRETS__BACKEND 1162 | # 1163 | backend = 1164 | 1165 | # The backend_kwargs param is loaded into a dictionary and passed to ``__init__`` 1166 | # of secrets backend class. See documentation for the secrets backend you are using. 1167 | # JSON is expected. 1168 | # 1169 | # Example for AWS Systems Manager ParameterStore: 1170 | # ``{"connections_prefix": "/airflow/connections", "profile_name": "default"}`` 1171 | # 1172 | # Variable: AIRFLOW__SECRETS__BACKEND_KWARGS 1173 | # 1174 | backend_kwargs = 1175 | 1176 | # .. note:: |experimental| 1177 | # 1178 | # Enables local caching of Variables, when parsing DAGs only. 1179 | # Using this option can make dag parsing faster if Variables are used in top level code, at the expense 1180 | # of longer propagation time for changes. 
1181 | # Please note that this cache concerns only the DAG parsing step. There is no caching in place when DAG 1182 | # tasks are run. 1183 | # 1184 | # Variable: AIRFLOW__SECRETS__USE_CACHE 1185 | # 1186 | use_cache = False 1187 | 1188 | # .. note:: |experimental| 1189 | # 1190 | # When the cache is enabled, this is the duration for which we consider an entry in the cache to be 1191 | # valid. Entries are refreshed if they are older than this many seconds. 1192 | # It means that when the cache is enabled, this is the maximum amount of time you need to wait to see a 1193 | # Variable change take effect. 1194 | # 1195 | # Variable: AIRFLOW__SECRETS__CACHE_TTL_SECONDS 1196 | # 1197 | cache_ttl_seconds = 900 1198 | 1199 | [cli] 1200 | # In what way should the cli access the API. The LocalClient will use the 1201 | # database directly, while the json_client will use the api running on the 1202 | # webserver 1203 | # 1204 | # Variable: AIRFLOW__CLI__API_CLIENT 1205 | # 1206 | api_client = airflow.api.client.local_client 1207 | 1208 | # If you set web_server_url_prefix, do NOT forget to append it here, ex: 1209 | # ``endpoint_url = http://localhost:8080/myroot`` 1210 | # So api will look like: ``http://localhost:8080/myroot/api/experimental/...`` 1211 | # 1212 | # Variable: AIRFLOW__CLI__ENDPOINT_URL 1213 | # 1214 | endpoint_url = http://localhost:8080 1215 | 1216 | [debug] 1217 | # Used only with ``DebugExecutor``. If set to ``True`` DAG will fail with first 1218 | # failed task. Helpful for debugging purposes. 1219 | # 1220 | # Variable: AIRFLOW__DEBUG__FAIL_FAST 1221 | # 1222 | fail_fast = False 1223 | 1224 | [api] 1225 | # Enables the deprecated experimental API. Please note that these API endpoints do not have 1226 | # access control. An authenticated user has full access. 1227 | # 1228 | # .. warning:: 1229 | # 1230 | # This `Experimental REST API 1231 | # `__ is 1232 | # deprecated since version 2.0. Please consider using 1233 | # `the Stable REST API 1234 | # `__. 1235 | # For more information on migration, see 1236 | # `RELEASE_NOTES.rst `_ 1237 | # 1238 | # Variable: AIRFLOW__API__ENABLE_EXPERIMENTAL_API 1239 | # 1240 | enable_experimental_api = False 1241 | 1242 | # Comma separated list of auth backends to authenticate users of the API. See 1243 | # `Security: API 1244 | # `__ for possible values. 1245 | # ("airflow.api.auth.backend.default" allows all requests for historic reasons) 1246 | # 1247 | # Variable: AIRFLOW__API__AUTH_BACKENDS 1248 | # 1249 | auth_backends = airflow.api.auth.backend.session 1250 | 1251 | # Used to set the maximum page limit for API requests. If limit passed as param 1252 | # is greater than maximum page limit, it will be ignored and maximum page limit value 1253 | # will be set as the limit 1254 | # 1255 | # Variable: AIRFLOW__API__MAXIMUM_PAGE_LIMIT 1256 | # 1257 | maximum_page_limit = 100 1258 | 1259 | # Used to set the default page limit when limit param is zero or not provided in API 1260 | # requests. Otherwise if positive integer is passed in the API requests as limit, the 1261 | # smallest number of user given limit or maximum page limit is taken as limit. 1262 | # 1263 | # Variable: AIRFLOW__API__FALLBACK_PAGE_LIMIT 1264 | # 1265 | fallback_page_limit = 100 1266 | 1267 | # The intended audience for JWT token credentials used for authorization. This value must match on the client and server sides. If empty, audience will not be tested. 
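# Worked example for the two page-limit options above (editorial note, not part of the stock
# configuration): with ``maximum_page_limit = 100`` and ``fallback_page_limit = 100``, a request
# such as ``GET /api/v1/dags?limit=500`` is capped at 100 results, a request with ``limit=25``
# returns at most 25, and a request that omits ``limit`` (or passes 0) falls back to 100.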
1268 | # 1269 | # Example: google_oauth2_audience = project-id-random-value.apps.googleusercontent.com 1270 | # 1271 | # Variable: AIRFLOW__API__GOOGLE_OAUTH2_AUDIENCE 1272 | # 1273 | google_oauth2_audience = 1274 | 1275 | # Path to Google Cloud Service Account key file (JSON). If omitted, authorization based on 1276 | # `the Application Default Credentials 1277 | # `__ will 1278 | # be used. 1279 | # 1280 | # Example: google_key_path = /files/service-account-json 1281 | # 1282 | # Variable: AIRFLOW__API__GOOGLE_KEY_PATH 1283 | # 1284 | google_key_path = 1285 | 1286 | # Used in response to a preflight request to indicate which HTTP 1287 | # headers can be used when making the actual request. This header is 1288 | # the server side response to the browser's 1289 | # Access-Control-Request-Headers header. 1290 | # 1291 | # Variable: AIRFLOW__API__ACCESS_CONTROL_ALLOW_HEADERS 1292 | # 1293 | access_control_allow_headers = 1294 | 1295 | # Specifies the method or methods allowed when accessing the resource. 1296 | # 1297 | # Variable: AIRFLOW__API__ACCESS_CONTROL_ALLOW_METHODS 1298 | # 1299 | access_control_allow_methods = 1300 | 1301 | # Indicates whether the response can be shared with requesting code from the given origins. 1302 | # Separate URLs with space. 1303 | # 1304 | # Variable: AIRFLOW__API__ACCESS_CONTROL_ALLOW_ORIGINS 1305 | # 1306 | access_control_allow_origins = 1307 | 1308 | # Indicates whether the **xcomEntries** endpoint supports the **deserialize** 1309 | # flag. If set to ``False``, setting this flag in a request would result in a 1310 | # 400 Bad Request error. 1311 | # 1312 | # Variable: AIRFLOW__API__ENABLE_XCOM_DESERIALIZE_SUPPORT 1313 | # 1314 | enable_xcom_deserialize_support = False 1315 | 1316 | [lineage] 1317 | # what lineage backend to use 1318 | # 1319 | # Variable: AIRFLOW__LINEAGE__BACKEND 1320 | # 1321 | backend = 1322 | 1323 | [operators] 1324 | # The default owner assigned to each new operator, unless 1325 | # provided explicitly or passed via ``default_args`` 1326 | # 1327 | # Variable: AIRFLOW__OPERATORS__DEFAULT_OWNER 1328 | # 1329 | default_owner = airflow 1330 | 1331 | # The default value of attribute "deferrable" in operators and sensors. 1332 | # 1333 | # Variable: AIRFLOW__OPERATORS__DEFAULT_DEFERRABLE 1334 | # 1335 | default_deferrable = false 1336 | 1337 | # Indicates the default number of CPU units allocated to each operator when no specific CPU request 1338 | # is specified in the operator's configuration 1339 | # 1340 | # Variable: AIRFLOW__OPERATORS__DEFAULT_CPUS 1341 | # 1342 | default_cpus = 1 1343 | 1344 | # Indicates the default number of RAM allocated to each operator when no specific RAM request 1345 | # is specified in the operator's configuration 1346 | # 1347 | # Variable: AIRFLOW__OPERATORS__DEFAULT_RAM 1348 | # 1349 | default_ram = 512 1350 | 1351 | # Indicates the default number of disk storage allocated to each operator when no specific disk request 1352 | # is specified in the operator's configuration 1353 | # 1354 | # Variable: AIRFLOW__OPERATORS__DEFAULT_DISK 1355 | # 1356 | default_disk = 512 1357 | 1358 | # Indicates the default number of GPUs allocated to each operator when no specific GPUs request 1359 | # is specified in the operator's configuration 1360 | # 1361 | # Variable: AIRFLOW__OPERATORS__DEFAULT_GPUS 1362 | # 1363 | default_gpus = 0 1364 | 1365 | # Default queue that tasks get assigned to and that worker listen on. 
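# Illustrative values for the three ``access_control_allow_*`` CORS options above (editorial note;
# they are only needed when the REST API is called from a browser on a different origin, and the
# values below are placeholders to adapt, not defaults):
#
# Example: access_control_allow_headers = origin, content-type, accept
# Example: access_control_allow_methods = POST, GET, OPTIONS, DELETE
# Example: access_control_allow_origins = http://localhost:8080 http://localhost:3000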
1366 | # 1367 | # Variable: AIRFLOW__OPERATORS__DEFAULT_QUEUE 1368 | # 1369 | default_queue = default 1370 | 1371 | # Is allowed to pass additional/unused arguments (args, kwargs) to the BaseOperator operator. 1372 | # If set to ``False``, an exception will be thrown, 1373 | # otherwise only the console message will be displayed. 1374 | # 1375 | # Variable: AIRFLOW__OPERATORS__ALLOW_ILLEGAL_ARGUMENTS 1376 | # 1377 | allow_illegal_arguments = False 1378 | 1379 | [webserver] 1380 | # The message displayed when a user attempts to execute actions beyond their authorised privileges. 1381 | # 1382 | # Variable: AIRFLOW__WEBSERVER__ACCESS_DENIED_MESSAGE 1383 | # 1384 | access_denied_message = Access is Denied 1385 | 1386 | # Path of webserver config file used for configuring the webserver parameters 1387 | # 1388 | # Variable: AIRFLOW__WEBSERVER__CONFIG_FILE 1389 | # 1390 | config_file = /opt/airflow/webserver_config.py 1391 | 1392 | # The base url of your website: Airflow cannot guess what domain or CNAME you are using. 1393 | # This is used to create links in the Log Url column in the Browse - Task Instances menu, 1394 | # as well as in any automated emails sent by Airflow that contain links to your webserver. 1395 | # 1396 | # Variable: AIRFLOW__WEBSERVER__BASE_URL 1397 | # 1398 | base_url = http://localhost:8080 1399 | 1400 | # Default timezone to display all dates in the UI, can be UTC, system, or 1401 | # any IANA timezone string (e.g. **Europe/Amsterdam**). If left empty the 1402 | # default value of core/default_timezone will be used 1403 | # 1404 | # Example: default_ui_timezone = America/New_York 1405 | # 1406 | # Variable: AIRFLOW__WEBSERVER__DEFAULT_UI_TIMEZONE 1407 | # 1408 | default_ui_timezone = UTC 1409 | 1410 | # The ip specified when starting the web server 1411 | # 1412 | # Variable: AIRFLOW__WEBSERVER__WEB_SERVER_HOST 1413 | # 1414 | web_server_host = 0.0.0.0 1415 | 1416 | # The port on which to run the web server 1417 | # 1418 | # Variable: AIRFLOW__WEBSERVER__WEB_SERVER_PORT 1419 | # 1420 | web_server_port = 8080 1421 | 1422 | # Paths to the SSL certificate and key for the web server. When both are 1423 | # provided SSL will be enabled. This does not change the web server port. 1424 | # 1425 | # Variable: AIRFLOW__WEBSERVER__WEB_SERVER_SSL_CERT 1426 | # 1427 | web_server_ssl_cert = 1428 | 1429 | # Paths to the SSL certificate and key for the web server. When both are 1430 | # provided SSL will be enabled. This does not change the web server port. 1431 | # 1432 | # Variable: AIRFLOW__WEBSERVER__WEB_SERVER_SSL_KEY 1433 | # 1434 | web_server_ssl_key = 1435 | 1436 | # The type of backend used to store web session data, can be ``database`` or ``securecookie``. For the 1437 | # ``database`` backend, sessions are store in the database and they can be 1438 | # managed there (for example when you reset password of the user, all sessions for that user are 1439 | # deleted). For the ``securecookie`` backend, sessions are stored in encrypted cookies on the client 1440 | # side. The ``securecookie`` mechanism is 'lighter' than database backend, but sessions are not deleted 1441 | # when you reset password of the user, which means that other than waiting for expiry time, the only 1442 | # way to invalidate all sessions for a user is to change secret_key and restart webserver (which 1443 | # also invalidates and logs out all other user's sessions). 
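# Illustrative example for the two ``web_server_ssl_*`` options above (editorial note): pointing
# both at files mounted into the container enables HTTPS on the same port; the paths below are
# hypothetical.
#
# Example: web_server_ssl_cert = /opt/airflow/certs/airflow.crt
# Example: web_server_ssl_key = /opt/airflow/certs/airflow.key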
1444 | # 1445 | # When you are using ``database`` backend, make sure to keep your database session table small 1446 | # by periodically running ``airflow db clean --table session`` command, especially if you have 1447 | # automated API calls that will create a new session for each call rather than reuse the sessions 1448 | # stored in browser cookies. 1449 | # 1450 | # Example: session_backend = securecookie 1451 | # 1452 | # Variable: AIRFLOW__WEBSERVER__SESSION_BACKEND 1453 | # 1454 | session_backend = database 1455 | 1456 | # Number of seconds the webserver waits before killing gunicorn master that doesn't respond 1457 | # 1458 | # Variable: AIRFLOW__WEBSERVER__WEB_SERVER_MASTER_TIMEOUT 1459 | # 1460 | web_server_master_timeout = 120 1461 | 1462 | # Number of seconds the gunicorn webserver waits before timing out on a worker 1463 | # 1464 | # Variable: AIRFLOW__WEBSERVER__WEB_SERVER_WORKER_TIMEOUT 1465 | # 1466 | web_server_worker_timeout = 120 1467 | 1468 | # Number of workers to refresh at a time. When set to 0, worker refresh is 1469 | # disabled. When nonzero, airflow periodically refreshes webserver workers by 1470 | # bringing up new ones and killing old ones. 1471 | # 1472 | # Variable: AIRFLOW__WEBSERVER__WORKER_REFRESH_BATCH_SIZE 1473 | # 1474 | worker_refresh_batch_size = 1 1475 | 1476 | # Number of seconds to wait before refreshing a batch of workers. 1477 | # 1478 | # Variable: AIRFLOW__WEBSERVER__WORKER_REFRESH_INTERVAL 1479 | # 1480 | worker_refresh_interval = 6000 1481 | 1482 | # If set to ``True``, Airflow will track files in plugins_folder directory. When it detects changes, 1483 | # then reload the gunicorn. If set to ``True``, gunicorn starts without preloading, which is slower, 1484 | # uses more memory, and may cause race conditions. Avoid setting this to ``True`` in production. 1485 | # 1486 | # Variable: AIRFLOW__WEBSERVER__RELOAD_ON_PLUGIN_CHANGE 1487 | # 1488 | reload_on_plugin_change = False 1489 | 1490 | # Secret key used to run your flask app. It should be as random as possible. However, when running 1491 | # more than 1 instances of webserver, make sure all of them use the same ``secret_key`` otherwise 1492 | # one of them will error with "CSRF session token is missing". 1493 | # The webserver key is also used to authorize requests to Celery workers when logs are retrieved. 1494 | # The token generated using the secret key has a short expiry time though - make sure that time on 1495 | # ALL the machines that you run airflow components on is synchronized (for example using ntpd) 1496 | # otherwise you might get "forbidden" errors when the logs are accessed. 1497 | # 1498 | # Variable: AIRFLOW__WEBSERVER__SECRET_KEY 1499 | # 1500 | secret_key = L6oQmxmjwK0yQH+Ltg0JhQ== 1501 | 1502 | # Number of workers to run the Gunicorn web server 1503 | # 1504 | # Variable: AIRFLOW__WEBSERVER__WORKERS 1505 | # 1506 | workers = 4 1507 | 1508 | # The worker class gunicorn should use. Choices include 1509 | # ``sync`` (default), ``eventlet``, ``gevent``. 1510 | # 1511 | # .. warning:: 1512 | # 1513 | # When using ``gevent`` you might also want to set the ``_AIRFLOW_PATCH_GEVENT`` 1514 | # environment variable to ``"1"`` to make sure gevent patching is done as early as possible. 1515 | # 1516 | # Be careful to set ``_AIRFLOW_PATCH_GEVENT`` only on the web server as gevent patching may 1517 | # affect the scheduler behavior via the ``multiprocessing`` sockets module and cause crash. 
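# Editorial note on ``secret_key`` above: a value committed to version control (as here) should be
# treated as compromised and rotated; the description only requires that the key be random and
# identical across all webserver instances. One way to generate a fresh value, shown as a sketch:
#
# .. code-block:: python
#
#     import secrets
#
#     # paste the output into secret_key, or export it as AIRFLOW__WEBSERVER__SECRET_KEY
#     print(secrets.token_urlsafe(32))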
1518 | # 1519 | # See related Issues / PRs for more details: 1520 | # 1521 | # * https://github.com/benoitc/gunicorn/issues/2796 1522 | # * https://github.com/apache/airflow/issues/8212 1523 | # * https://github.com/apache/airflow/pull/28283 1524 | # 1525 | # Variable: AIRFLOW__WEBSERVER__WORKER_CLASS 1526 | # 1527 | worker_class = sync 1528 | 1529 | # Log files for the gunicorn webserver. '-' means log to stderr. 1530 | # 1531 | # Variable: AIRFLOW__WEBSERVER__ACCESS_LOGFILE 1532 | # 1533 | access_logfile = - 1534 | 1535 | # Log files for the gunicorn webserver. '-' means log to stderr. 1536 | # 1537 | # Variable: AIRFLOW__WEBSERVER__ERROR_LOGFILE 1538 | # 1539 | error_logfile = - 1540 | 1541 | # Access log format for gunicorn webserver. 1542 | # default format is ``%%(h)s %%(l)s %%(u)s %%(t)s "%%(r)s" %%(s)s %%(b)s "%%(f)s" "%%(a)s"`` 1543 | # See `Gunicorn Settings: 'access_log_format' Reference 1544 | # `__ for more details 1545 | # 1546 | # Variable: AIRFLOW__WEBSERVER__ACCESS_LOGFORMAT 1547 | # 1548 | access_logformat = 1549 | 1550 | # Expose the configuration file in the web server. Set to ``non-sensitive-only`` to show all values 1551 | # except those that have security implications. ``True`` shows all values. ``False`` hides the 1552 | # configuration completely. 1553 | # 1554 | # Variable: AIRFLOW__WEBSERVER__EXPOSE_CONFIG 1555 | # 1556 | expose_config = False 1557 | 1558 | # Expose hostname in the web server 1559 | # 1560 | # Variable: AIRFLOW__WEBSERVER__EXPOSE_HOSTNAME 1561 | # 1562 | expose_hostname = False 1563 | 1564 | # Expose stacktrace in the web server 1565 | # 1566 | # Variable: AIRFLOW__WEBSERVER__EXPOSE_STACKTRACE 1567 | # 1568 | expose_stacktrace = False 1569 | 1570 | # Default DAG view. Valid values are: ``grid``, ``graph``, ``duration``, ``gantt``, ``landing_times`` 1571 | # 1572 | # Variable: AIRFLOW__WEBSERVER__DAG_DEFAULT_VIEW 1573 | # 1574 | dag_default_view = grid 1575 | 1576 | # Default DAG orientation. Valid values are: 1577 | # ``LR`` (Left->Right), ``TB`` (Top->Bottom), ``RL`` (Right->Left), ``BT`` (Bottom->Top) 1578 | # 1579 | # Variable: AIRFLOW__WEBSERVER__DAG_ORIENTATION 1580 | # 1581 | dag_orientation = LR 1582 | 1583 | # Sorting order in grid view. Valid values are: ``topological``, ``hierarchical_alphabetical`` 1584 | # 1585 | # Variable: AIRFLOW__WEBSERVER__GRID_VIEW_SORTING_ORDER 1586 | # 1587 | grid_view_sorting_order = topological 1588 | 1589 | # The amount of time (in secs) webserver will wait for initial handshake 1590 | # while fetching logs from other worker machine 1591 | # 1592 | # Variable: AIRFLOW__WEBSERVER__LOG_FETCH_TIMEOUT_SEC 1593 | # 1594 | log_fetch_timeout_sec = 5 1595 | 1596 | # Time interval (in secs) to wait before next log fetching. 1597 | # 1598 | # Variable: AIRFLOW__WEBSERVER__LOG_FETCH_DELAY_SEC 1599 | # 1600 | log_fetch_delay_sec = 2 1601 | 1602 | # Distance away from page bottom to enable auto tailing. 1603 | # 1604 | # Variable: AIRFLOW__WEBSERVER__LOG_AUTO_TAILING_OFFSET 1605 | # 1606 | log_auto_tailing_offset = 30 1607 | 1608 | # Animation speed for auto tailing log display. 1609 | # 1610 | # Variable: AIRFLOW__WEBSERVER__LOG_ANIMATION_SPEED 1611 | # 1612 | log_animation_speed = 1000 1613 | 1614 | # By default, the webserver shows paused DAGs. 
Flip this to hide paused 1615 | # DAGs by default 1616 | # 1617 | # Variable: AIRFLOW__WEBSERVER__HIDE_PAUSED_DAGS_BY_DEFAULT 1618 | # 1619 | hide_paused_dags_by_default = False 1620 | 1621 | # Consistent page size across all listing views in the UI 1622 | # 1623 | # Variable: AIRFLOW__WEBSERVER__PAGE_SIZE 1624 | # 1625 | page_size = 100 1626 | 1627 | # Define the color of navigation bar 1628 | # 1629 | # Variable: AIRFLOW__WEBSERVER__NAVBAR_COLOR 1630 | # 1631 | navbar_color = #fff 1632 | 1633 | # Define the color of text in the navigation bar 1634 | # 1635 | # Variable: AIRFLOW__WEBSERVER__NAVBAR_TEXT_COLOR 1636 | # 1637 | navbar_text_color = #51504f 1638 | 1639 | # Define the color of navigation bar links when hovered 1640 | # 1641 | # Variable: AIRFLOW__WEBSERVER__NAVBAR_HOVER_COLOR 1642 | # 1643 | navbar_hover_color = #eee 1644 | 1645 | # Define the color of text in the navigation bar when hovered 1646 | # 1647 | # Variable: AIRFLOW__WEBSERVER__NAVBAR_TEXT_HOVER_COLOR 1648 | # 1649 | navbar_text_hover_color = #51504f 1650 | 1651 | # Define the color of the logo text 1652 | # 1653 | # Variable: AIRFLOW__WEBSERVER__NAVBAR_LOGO_TEXT_COLOR 1654 | # 1655 | navbar_logo_text_color = #51504f 1656 | 1657 | # Default dagrun to show in UI 1658 | # 1659 | # Variable: AIRFLOW__WEBSERVER__DEFAULT_DAG_RUN_DISPLAY_NUMBER 1660 | # 1661 | default_dag_run_display_number = 25 1662 | 1663 | # Enable werkzeug ``ProxyFix`` middleware for reverse proxy 1664 | # 1665 | # Variable: AIRFLOW__WEBSERVER__ENABLE_PROXY_FIX 1666 | # 1667 | enable_proxy_fix = False 1668 | 1669 | # Number of values to trust for ``X-Forwarded-For``. 1670 | # See `Werkzeug: X-Forwarded-For Proxy Fix 1671 | # `__ for more details. 1672 | # 1673 | # Variable: AIRFLOW__WEBSERVER__PROXY_FIX_X_FOR 1674 | # 1675 | proxy_fix_x_for = 1 1676 | 1677 | # Number of values to trust for ``X-Forwarded-Proto``. 1678 | # See `Werkzeug: X-Forwarded-For Proxy Fix 1679 | # `__ for more details. 1680 | # 1681 | # Variable: AIRFLOW__WEBSERVER__PROXY_FIX_X_PROTO 1682 | # 1683 | proxy_fix_x_proto = 1 1684 | 1685 | # Number of values to trust for ``X-Forwarded-Host``. 1686 | # See `Werkzeug: X-Forwarded-For Proxy Fix 1687 | # `__ for more details. 1688 | # 1689 | # Variable: AIRFLOW__WEBSERVER__PROXY_FIX_X_HOST 1690 | # 1691 | proxy_fix_x_host = 1 1692 | 1693 | # Number of values to trust for ``X-Forwarded-Port``. 1694 | # See `Werkzeug: X-Forwarded-For Proxy Fix 1695 | # `__ for more details. 1696 | # 1697 | # Variable: AIRFLOW__WEBSERVER__PROXY_FIX_X_PORT 1698 | # 1699 | proxy_fix_x_port = 1 1700 | 1701 | # Number of values to trust for ``X-Forwarded-Prefix``. 1702 | # See `Werkzeug: X-Forwarded-For Proxy Fix 1703 | # `__ for more details. 1704 | # 1705 | # Variable: AIRFLOW__WEBSERVER__PROXY_FIX_X_PREFIX 1706 | # 1707 | proxy_fix_x_prefix = 1 1708 | 1709 | # Set secure flag on session cookie 1710 | # 1711 | # Variable: AIRFLOW__WEBSERVER__COOKIE_SECURE 1712 | # 1713 | cookie_secure = False 1714 | 1715 | # Set samesite policy on session cookie 1716 | # 1717 | # Variable: AIRFLOW__WEBSERVER__COOKIE_SAMESITE 1718 | # 1719 | cookie_samesite = Lax 1720 | 1721 | # Default setting for wrap toggle on DAG code and TI log views. 
1722 | # 1723 | # Variable: AIRFLOW__WEBSERVER__DEFAULT_WRAP 1724 | # 1725 | default_wrap = False 1726 | 1727 | # Allow the UI to be rendered in a frame 1728 | # 1729 | # Variable: AIRFLOW__WEBSERVER__X_FRAME_ENABLED 1730 | # 1731 | x_frame_enabled = True 1732 | 1733 | # Send anonymous user activity to your analytics tool 1734 | # choose from ``google_analytics``, ``segment``, ``metarouter``, or ``matomo`` 1735 | # 1736 | # Variable: AIRFLOW__WEBSERVER__ANALYTICS_TOOL 1737 | # 1738 | # analytics_tool = 1739 | 1740 | # Unique ID of your account in the analytics tool 1741 | # 1742 | # Variable: AIRFLOW__WEBSERVER__ANALYTICS_ID 1743 | # 1744 | # analytics_id = 1745 | 1746 | # Your instances url, only applicable to Matomo. 1747 | # 1748 | # Example: analytics_url = https://your.matomo.instance.com/ 1749 | # 1750 | # Variable: AIRFLOW__WEBSERVER__ANALYTICS_URL 1751 | # 1752 | # analytics_url = 1753 | 1754 | # 'Recent Tasks' stats will show for old DagRuns if set 1755 | # 1756 | # Variable: AIRFLOW__WEBSERVER__SHOW_RECENT_STATS_FOR_COMPLETED_RUNS 1757 | # 1758 | show_recent_stats_for_completed_runs = True 1759 | 1760 | # The UI cookie lifetime in minutes. User will be logged out from UI after 1761 | # ``[webserver] session_lifetime_minutes`` of non-activity 1762 | # 1763 | # Variable: AIRFLOW__WEBSERVER__SESSION_LIFETIME_MINUTES 1764 | # 1765 | session_lifetime_minutes = 43200 1766 | 1767 | # Sets a custom page title for the DAGs overview page and site title for all pages 1768 | # 1769 | # Variable: AIRFLOW__WEBSERVER__INSTANCE_NAME 1770 | # 1771 | # instance_name = 1772 | 1773 | # Whether the custom page title for the DAGs overview page contains any Markup language 1774 | # 1775 | # Variable: AIRFLOW__WEBSERVER__INSTANCE_NAME_HAS_MARKUP 1776 | # 1777 | instance_name_has_markup = False 1778 | 1779 | # How frequently, in seconds, the DAG data will auto-refresh in graph or grid view 1780 | # when auto-refresh is turned on 1781 | # 1782 | # Variable: AIRFLOW__WEBSERVER__AUTO_REFRESH_INTERVAL 1783 | # 1784 | auto_refresh_interval = 3 1785 | 1786 | # Boolean for displaying warning for publicly viewable deployment 1787 | # 1788 | # Variable: AIRFLOW__WEBSERVER__WARN_DEPLOYMENT_EXPOSURE 1789 | # 1790 | warn_deployment_exposure = True 1791 | 1792 | # Comma separated string of view events to exclude from dag audit view. 1793 | # All other events will be added minus the ones passed here. 1794 | # The audit logs in the db will not be affected by this parameter. 1795 | # 1796 | # Example: audit_view_excluded_events = cli_task_run,running,success 1797 | # 1798 | # Variable: AIRFLOW__WEBSERVER__AUDIT_VIEW_EXCLUDED_EVENTS 1799 | # 1800 | # audit_view_excluded_events = 1801 | 1802 | # Comma separated string of view events to include in dag audit view. 1803 | # If passed, only these events will populate the dag audit view. 1804 | # The audit logs in the db will not be affected by this parameter. 1805 | # 1806 | # Example: audit_view_included_events = dagrun_cleared,failed 1807 | # 1808 | # Variable: AIRFLOW__WEBSERVER__AUDIT_VIEW_INCLUDED_EVENTS 1809 | # 1810 | # audit_view_included_events = 1811 | 1812 | # Boolean for running SwaggerUI in the webserver. 1813 | # 1814 | # Variable: AIRFLOW__WEBSERVER__ENABLE_SWAGGER_UI 1815 | # 1816 | enable_swagger_ui = True 1817 | 1818 | # Boolean for running Internal API in the webserver. 1819 | # 1820 | # Variable: AIRFLOW__WEBSERVER__RUN_INTERNAL_API 1821 | # 1822 | run_internal_api = False 1823 | 1824 | # The caching algorithm used by the webserver. 
Must be a valid hashlib function name. 1825 | # 1826 | # Example: caching_hash_method = sha256 1827 | # 1828 | # Variable: AIRFLOW__WEBSERVER__CACHING_HASH_METHOD 1829 | # 1830 | caching_hash_method = md5 1831 | 1832 | # Behavior of the trigger DAG run button for DAGs without params. ``False`` to skip and trigger 1833 | # without displaying a form to add a **dag_run.conf**, ``True`` to always display the form. 1834 | # The form is displayed always if parameters are defined. 1835 | # 1836 | # Variable: AIRFLOW__WEBSERVER__SHOW_TRIGGER_FORM_IF_NO_PARAMS 1837 | # 1838 | show_trigger_form_if_no_params = False 1839 | 1840 | # Number of recent DAG run configurations in the selector on the trigger web form. 1841 | # 1842 | # Example: num_recent_configurations_for_trigger = 10 1843 | # 1844 | # Variable: AIRFLOW__WEBSERVER__NUM_RECENT_CONFIGURATIONS_FOR_TRIGGER 1845 | # 1846 | num_recent_configurations_for_trigger = 5 1847 | 1848 | # A DAG author is able to provide any raw HTML into ``doc_md`` or params description in 1849 | # ``description_md`` for text formatting. This is including potentially unsafe javascript. 1850 | # Displaying the DAG or trigger form in web UI provides the DAG author the potential to 1851 | # inject malicious code into clients browsers. To ensure the web UI is safe by default, 1852 | # raw HTML is disabled by default. If you trust your DAG authors, you can enable HTML 1853 | # support in markdown by setting this option to ``True``. 1854 | # 1855 | # This parameter also enables the deprecated fields ``description_html`` and 1856 | # ``custom_html_form`` in DAG params until the feature is removed in a future version. 1857 | # 1858 | # Example: allow_raw_html_descriptions = False 1859 | # 1860 | # Variable: AIRFLOW__WEBSERVER__ALLOW_RAW_HTML_DESCRIPTIONS 1861 | # 1862 | allow_raw_html_descriptions = False 1863 | 1864 | # The maximum size of the request payload (in MB) that can be sent. 1865 | # 1866 | # Variable: AIRFLOW__WEBSERVER__ALLOWED_PAYLOAD_SIZE 1867 | # 1868 | allowed_payload_size = 1.0 1869 | 1870 | # Require confirmation when changing a DAG in the web UI. This is to prevent accidental changes 1871 | # to a DAG that may be running on sensitive environments like production. 1872 | # When set to ``True``, confirmation dialog will be shown when a user tries to Pause/Unpause, 1873 | # Trigger a DAG 1874 | # 1875 | # Variable: AIRFLOW__WEBSERVER__REQUIRE_CONFIRMATION_DAG_CHANGE 1876 | # 1877 | require_confirmation_dag_change = False 1878 | 1879 | [email] 1880 | # Configuration email backend and whether to 1881 | # send email alerts on retry or failure 1882 | 1883 | # Email backend to use 1884 | # 1885 | # Variable: AIRFLOW__EMAIL__EMAIL_BACKEND 1886 | # 1887 | email_backend = airflow.utils.email.send_email_smtp 1888 | 1889 | # Email connection to use 1890 | # 1891 | # Variable: AIRFLOW__EMAIL__EMAIL_CONN_ID 1892 | # 1893 | email_conn_id = smtp_default 1894 | 1895 | # Whether email alerts should be sent when a task is retried 1896 | # 1897 | # Variable: AIRFLOW__EMAIL__DEFAULT_EMAIL_ON_RETRY 1898 | # 1899 | default_email_on_retry = True 1900 | 1901 | # Whether email alerts should be sent when a task failed 1902 | # 1903 | # Variable: AIRFLOW__EMAIL__DEFAULT_EMAIL_ON_FAILURE 1904 | # 1905 | default_email_on_failure = True 1906 | 1907 | # File that will be used as the template for Email subject (which will be rendered using Jinja2). 1908 | # If not set, Airflow uses a base template. 
1909 | # 1910 | # Example: subject_template = /path/to/my_subject_template_file 1911 | # 1912 | # Variable: AIRFLOW__EMAIL__SUBJECT_TEMPLATE 1913 | # 1914 | # subject_template = 1915 | 1916 | # File that will be used as the template for Email content (which will be rendered using Jinja2). 1917 | # If not set, Airflow uses a base template. 1918 | # 1919 | # Example: html_content_template = /path/to/my_html_content_template_file 1920 | # 1921 | # Variable: AIRFLOW__EMAIL__HTML_CONTENT_TEMPLATE 1922 | # 1923 | # html_content_template = 1924 | 1925 | # Email address that will be used as sender address. 1926 | # It can either be raw email or the complete address in a format ``Sender Name `` 1927 | # 1928 | # Example: from_email = Airflow 1929 | # 1930 | # Variable: AIRFLOW__EMAIL__FROM_EMAIL 1931 | # 1932 | # from_email = 1933 | 1934 | # ssl context to use when using SMTP and IMAP SSL connections. By default, the context is "default" 1935 | # which sets it to ``ssl.create_default_context()`` which provides the right balance between 1936 | # compatibility and security, it however requires that certificates in your operating system are 1937 | # updated and that SMTP/IMAP servers of yours have valid certificates that have corresponding public 1938 | # keys installed on your machines. You can switch it to "none" if you want to disable checking 1939 | # of the certificates, but it is not recommended as it allows MITM (man-in-the-middle) attacks 1940 | # if your infrastructure is not sufficiently secured. It should only be set temporarily while you 1941 | # are fixing your certificate configuration. This can be typically done by upgrading to newer 1942 | # version of the operating system you run Airflow components on,by upgrading/refreshing proper 1943 | # certificates in the OS or by updating certificates for your mail servers. 1944 | # 1945 | # Example: ssl_context = default 1946 | # 1947 | # Variable: AIRFLOW__EMAIL__SSL_CONTEXT 1948 | # 1949 | ssl_context = default 1950 | 1951 | [smtp] 1952 | # If you want airflow to send emails on retries, failure, and you want to use 1953 | # the airflow.utils.email.send_email_smtp function, you have to configure an 1954 | # smtp server here 1955 | 1956 | # Specifies the host server address used by Airflow when sending out email notifications via SMTP. 1957 | # 1958 | # Variable: AIRFLOW__SMTP__SMTP_HOST 1959 | # 1960 | smtp_host = localhost 1961 | 1962 | # Determines whether to use the STARTTLS command when connecting to the SMTP server. 1963 | # 1964 | # Variable: AIRFLOW__SMTP__SMTP_STARTTLS 1965 | # 1966 | smtp_starttls = True 1967 | 1968 | # Determines whether to use an SSL connection when talking to the SMTP server. 1969 | # 1970 | # Variable: AIRFLOW__SMTP__SMTP_SSL 1971 | # 1972 | smtp_ssl = False 1973 | 1974 | # Username to authenticate when connecting to smtp server. 1975 | # 1976 | # Example: smtp_user = airflow 1977 | # 1978 | # Variable: AIRFLOW__SMTP__SMTP_USER 1979 | # 1980 | # smtp_user = 1981 | 1982 | # Password to authenticate when connecting to smtp server. 1983 | # 1984 | # Example: smtp_password = airflow 1985 | # 1986 | # Variable: AIRFLOW__SMTP__SMTP_PASSWORD 1987 | # 1988 | # smtp_password = 1989 | 1990 | # Defines the port number on which Airflow connects to the SMTP server to send email notifications. 1991 | # 1992 | # Variable: AIRFLOW__SMTP__SMTP_PORT 1993 | # 1994 | smtp_port = 25 1995 | 1996 | # Specifies the default **from** email address used when Airflow sends email notifications. 
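# Illustrative SMTP values (editorial note, not part of the stock configuration): a typical
# STARTTLS setup against an external relay might look like the following; the host, user and
# password are placeholders and should come from a secrets backend rather than this file.
#
# Example: smtp_host = smtp.example.com
# Example: smtp_starttls = True
# Example: smtp_ssl = False
# Example: smtp_port = 587
# Example: smtp_user = alerts@example.com
# Example: smtp_password = <app-password-from-secrets-backend>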
1997 | # 1998 | # Variable: AIRFLOW__SMTP__SMTP_MAIL_FROM 1999 | # 2000 | smtp_mail_from = airflow@example.com 2001 | 2002 | # Determines the maximum time (in seconds) the Apache Airflow system will wait for a 2003 | # connection to the SMTP server to be established. 2004 | # 2005 | # Variable: AIRFLOW__SMTP__SMTP_TIMEOUT 2006 | # 2007 | smtp_timeout = 30 2008 | 2009 | # Defines the maximum number of times Airflow will attempt to connect to the SMTP server. 2010 | # 2011 | # Variable: AIRFLOW__SMTP__SMTP_RETRY_LIMIT 2012 | # 2013 | smtp_retry_limit = 5 2014 | 2015 | [sentry] 2016 | # `Sentry `__ integration. Here you can supply 2017 | # additional configuration options based on the Python platform. 2018 | # See `Python / Configuration / Basic Options 2019 | # `__ for more details. 2020 | # Unsupported options: ``integrations``, ``in_app_include``, ``in_app_exclude``, 2021 | # ``ignore_errors``, ``before_breadcrumb``, ``transport``. 2022 | 2023 | # Enable error reporting to Sentry 2024 | # 2025 | # Variable: AIRFLOW__SENTRY__SENTRY_ON 2026 | # 2027 | sentry_on = false 2028 | 2029 | # 2030 | # Variable: AIRFLOW__SENTRY__SENTRY_DSN 2031 | # 2032 | sentry_dsn = 2033 | 2034 | # Dotted path to a before_send function that the sentry SDK should be configured to use. 2035 | # 2036 | # Variable: AIRFLOW__SENTRY__BEFORE_SEND 2037 | # 2038 | # before_send = 2039 | 2040 | [scheduler] 2041 | # Task instances listen for external kill signal (when you clear tasks 2042 | # from the CLI or the UI), this defines the frequency at which they should 2043 | # listen (in seconds). 2044 | # 2045 | # Variable: AIRFLOW__SCHEDULER__JOB_HEARTBEAT_SEC 2046 | # 2047 | job_heartbeat_sec = 5 2048 | 2049 | # The scheduler constantly tries to trigger new tasks (look at the 2050 | # scheduler section in the docs for more information). This defines 2051 | # how often the scheduler should run (in seconds). 2052 | # 2053 | # Variable: AIRFLOW__SCHEDULER__SCHEDULER_HEARTBEAT_SEC 2054 | # 2055 | scheduler_heartbeat_sec = 5 2056 | 2057 | # The frequency (in seconds) at which the LocalTaskJob should send heartbeat signals to the 2058 | # scheduler to notify it's still alive. If this value is set to 0, the heartbeat interval will default 2059 | # to the value of ``[scheduler] scheduler_zombie_task_threshold``. 2060 | # 2061 | # Variable: AIRFLOW__SCHEDULER__LOCAL_TASK_JOB_HEARTBEAT_SEC 2062 | # 2063 | local_task_job_heartbeat_sec = 0 2064 | 2065 | # The number of times to try to schedule each DAG file 2066 | # -1 indicates unlimited number 2067 | # 2068 | # Variable: AIRFLOW__SCHEDULER__NUM_RUNS 2069 | # 2070 | num_runs = -1 2071 | 2072 | # Controls how long the scheduler will sleep between loops, but if there was nothing to do 2073 | # in the loop. i.e. if it scheduled something then it will start the next loop 2074 | # iteration straight away. 2075 | # 2076 | # Variable: AIRFLOW__SCHEDULER__SCHEDULER_IDLE_SLEEP_TIME 2077 | # 2078 | scheduler_idle_sleep_time = 1 2079 | 2080 | # Number of seconds after which a DAG file is parsed. The DAG file is parsed every 2081 | # ``[scheduler] min_file_process_interval`` number of seconds. Updates to DAGs are reflected after 2082 | # this interval. Keeping this number low will increase CPU usage. 
2083 | # 2084 | # Variable: AIRFLOW__SCHEDULER__MIN_FILE_PROCESS_INTERVAL 2085 | # 2086 | min_file_process_interval = 30 2087 | 2088 | # How often (in seconds) to check for stale DAGs (DAGs which are no longer present in 2089 | # the expected files) which should be deactivated, as well as datasets that are no longer 2090 | # referenced and should be marked as orphaned. 2091 | # 2092 | # Variable: AIRFLOW__SCHEDULER__PARSING_CLEANUP_INTERVAL 2093 | # 2094 | parsing_cleanup_interval = 60 2095 | 2096 | # How long (in seconds) to wait after we have re-parsed a DAG file before deactivating stale 2097 | # DAGs (DAGs which are no longer present in the expected files). The reason why we need 2098 | # this threshold is to account for the time between when the file is parsed and when the 2099 | # DAG is loaded. The absolute maximum that this could take is ``[core] dag_file_processor_timeout``, 2100 | # but when you have a long timeout configured, it results in a significant delay in the 2101 | # deactivation of stale dags. 2102 | # 2103 | # Variable: AIRFLOW__SCHEDULER__STALE_DAG_THRESHOLD 2104 | # 2105 | stale_dag_threshold = 50 2106 | 2107 | # How often (in seconds) to scan the DAGs directory for new files. Default to 5 minutes. 2108 | # 2109 | # Variable: AIRFLOW__SCHEDULER__DAG_DIR_LIST_INTERVAL 2110 | # 2111 | dag_dir_list_interval = 300 2112 | 2113 | # How often should stats be printed to the logs. Setting to 0 will disable printing stats 2114 | # 2115 | # Variable: AIRFLOW__SCHEDULER__PRINT_STATS_INTERVAL 2116 | # 2117 | print_stats_interval = 30 2118 | 2119 | # How often (in seconds) should pool usage stats be sent to StatsD (if statsd_on is enabled) 2120 | # 2121 | # Variable: AIRFLOW__SCHEDULER__POOL_METRICS_INTERVAL 2122 | # 2123 | pool_metrics_interval = 5.0 2124 | 2125 | # If the last scheduler heartbeat happened more than ``[scheduler] scheduler_health_check_threshold`` 2126 | # ago (in seconds), scheduler is considered unhealthy. 2127 | # This is used by the health check in the **/health** endpoint and in ``airflow jobs check`` CLI 2128 | # for SchedulerJob. 
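# Editorial note tying together the two parsing intervals above: with ``dag_dir_list_interval = 300``
# a brand-new DAG file is typically discovered within about five minutes, while with
# ``min_file_process_interval = 30`` edits to an already-known file are re-parsed roughly every
# 30 seconds; lowering either value trades faster feedback for higher scheduler CPU usage.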
2129 | # 2130 | # Variable: AIRFLOW__SCHEDULER__SCHEDULER_HEALTH_CHECK_THRESHOLD 2131 | # 2132 | scheduler_health_check_threshold = 30 2133 | 2134 | # When you start a scheduler, airflow starts a tiny web server 2135 | # subprocess to serve a health check if this is set to ``True`` 2136 | # 2137 | # Variable: AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK 2138 | # 2139 | enable_health_check = False 2140 | 2141 | # When you start a scheduler, airflow starts a tiny web server 2142 | # subprocess to serve a health check on this host 2143 | # 2144 | # Variable: AIRFLOW__SCHEDULER__SCHEDULER_HEALTH_CHECK_SERVER_HOST 2145 | # 2146 | scheduler_health_check_server_host = 0.0.0.0 2147 | 2148 | # When you start a scheduler, airflow starts a tiny web server 2149 | # subprocess to serve a health check on this port 2150 | # 2151 | # Variable: AIRFLOW__SCHEDULER__SCHEDULER_HEALTH_CHECK_SERVER_PORT 2152 | # 2153 | scheduler_health_check_server_port = 8974 2154 | 2155 | # How often (in seconds) should the scheduler check for orphaned tasks and SchedulerJobs 2156 | # 2157 | # Variable: AIRFLOW__SCHEDULER__ORPHANED_TASKS_CHECK_INTERVAL 2158 | # 2159 | orphaned_tasks_check_interval = 300.0 2160 | 2161 | # Determines the directory where logs for the child processes of the scheduler will be stored 2162 | # 2163 | # Variable: AIRFLOW__SCHEDULER__CHILD_PROCESS_LOG_DIRECTORY 2164 | # 2165 | child_process_log_directory = /opt/airflow/logs/scheduler 2166 | 2167 | # Local task jobs periodically heartbeat to the DB. If the job has 2168 | # not heartbeat in this many seconds, the scheduler will mark the 2169 | # associated task instance as failed and will re-schedule the task. 2170 | # 2171 | # Variable: AIRFLOW__SCHEDULER__SCHEDULER_ZOMBIE_TASK_THRESHOLD 2172 | # 2173 | scheduler_zombie_task_threshold = 300 2174 | 2175 | # How often (in seconds) should the scheduler check for zombie tasks. 2176 | # 2177 | # Variable: AIRFLOW__SCHEDULER__ZOMBIE_DETECTION_INTERVAL 2178 | # 2179 | zombie_detection_interval = 10.0 2180 | 2181 | # Turn off scheduler catchup by setting this to ``False``. 2182 | # Default behavior is unchanged and 2183 | # Command Line Backfills still work, but the scheduler 2184 | # will not do scheduler catchup if this is ``False``, 2185 | # however it can be set on a per DAG basis in the 2186 | # DAG definition (catchup) 2187 | # 2188 | # Variable: AIRFLOW__SCHEDULER__CATCHUP_BY_DEFAULT 2189 | # 2190 | catchup_by_default = True 2191 | 2192 | # Setting this to ``True`` will make first task instance of a task 2193 | # ignore depends_on_past setting. A task instance will be considered 2194 | # as the first task instance of a task when there is no task instance 2195 | # in the DB with an execution_date earlier than it., i.e. no manual marking 2196 | # success will be needed for a newly added task to be scheduled. 2197 | # 2198 | # Variable: AIRFLOW__SCHEDULER__IGNORE_FIRST_DEPENDS_ON_PAST_BY_DEFAULT 2199 | # 2200 | ignore_first_depends_on_past_by_default = True 2201 | 2202 | # This changes the batch size of queries in the scheduling main loop. 2203 | # This should not be greater than ``[core] parallelism``. 2204 | # If this is too high, SQL query performance may be impacted by 2205 | # complexity of query predicate, and/or excessive locking. 2206 | # Additionally, you may hit the maximum allowable query length for your db. 
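# Editorial note on the health-check options above: when ``enable_health_check = True`` the
# scheduler starts a small HTTP server on ``scheduler_health_check_server_host``:
# ``scheduler_health_check_server_port`` (0.0.0.0:8974 with the values above), so a liveness probe
# or external monitor can poll it over HTTP, e.g. ``curl http://localhost:8974/health`` from inside
# the scheduler container (check the Airflow docs for the exact path in the deployed version).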
2207 | # Set this to 0 to use the value of ``[core] parallelism`` 2208 | # 2209 | # Variable: AIRFLOW__SCHEDULER__MAX_TIS_PER_QUERY 2210 | # 2211 | max_tis_per_query = 16 2212 | 2213 | # Should the scheduler issue ``SELECT ... FOR UPDATE`` in relevant queries. 2214 | # If this is set to ``False`` then you should not run more than a single 2215 | # scheduler at once 2216 | # 2217 | # Variable: AIRFLOW__SCHEDULER__USE_ROW_LEVEL_LOCKING 2218 | # 2219 | use_row_level_locking = True 2220 | 2221 | # Max number of DAGs to create DagRuns for per scheduler loop. 2222 | # 2223 | # Variable: AIRFLOW__SCHEDULER__MAX_DAGRUNS_TO_CREATE_PER_LOOP 2224 | # 2225 | max_dagruns_to_create_per_loop = 10 2226 | 2227 | # How many DagRuns should a scheduler examine (and lock) when scheduling 2228 | # and queuing tasks. 2229 | # 2230 | # Variable: AIRFLOW__SCHEDULER__MAX_DAGRUNS_PER_LOOP_TO_SCHEDULE 2231 | # 2232 | max_dagruns_per_loop_to_schedule = 20 2233 | 2234 | # Should the Task supervisor process perform a "mini scheduler" to attempt to schedule more tasks of the 2235 | # same DAG. Leaving this on will mean tasks in the same DAG execute quicker, but might starve out other 2236 | # dags in some circumstances 2237 | # 2238 | # Variable: AIRFLOW__SCHEDULER__SCHEDULE_AFTER_TASK_EXECUTION 2239 | # 2240 | schedule_after_task_execution = True 2241 | 2242 | # The scheduler reads dag files to extract the airflow modules that are going to be used, 2243 | # and imports them ahead of time to avoid having to re-do it for each parsing process. 2244 | # This flag can be set to ``False`` to disable this behavior in case an airflow module needs 2245 | # to be freshly imported each time (at the cost of increased DAG parsing time). 2246 | # 2247 | # Variable: AIRFLOW__SCHEDULER__PARSING_PRE_IMPORT_MODULES 2248 | # 2249 | parsing_pre_import_modules = True 2250 | 2251 | # The scheduler can run multiple processes in parallel to parse dags. 2252 | # This defines how many processes will run. 2253 | # 2254 | # Variable: AIRFLOW__SCHEDULER__PARSING_PROCESSES 2255 | # 2256 | parsing_processes = 2 2257 | 2258 | # One of ``modified_time``, ``random_seeded_by_host`` and ``alphabetical``. 2259 | # The scheduler will list and sort the dag files to decide the parsing order. 2260 | # 2261 | # * ``modified_time``: Sort by modified time of the files. This is useful on large scale to parse the 2262 | # recently modified DAGs first. 2263 | # * ``random_seeded_by_host``: Sort randomly across multiple Schedulers but with same order on the 2264 | # same host. This is useful when running with Scheduler in HA mode where each scheduler can 2265 | # parse different DAG files. 2266 | # * ``alphabetical``: Sort by filename 2267 | # 2268 | # Variable: AIRFLOW__SCHEDULER__FILE_PARSING_SORT_MODE 2269 | # 2270 | file_parsing_sort_mode = modified_time 2271 | 2272 | # Whether the dag processor is running as a standalone process or it is a subprocess of a scheduler 2273 | # job. 2274 | # 2275 | # Variable: AIRFLOW__SCHEDULER__STANDALONE_DAG_PROCESSOR 2276 | # 2277 | standalone_dag_processor = False 2278 | 2279 | # Only applicable if ``[scheduler] standalone_dag_processor`` is true and callbacks are stored 2280 | # in database. Contains maximum number of callbacks that are fetched during a single loop. 2281 | # 2282 | # Variable: AIRFLOW__SCHEDULER__MAX_CALLBACKS_PER_LOOP 2283 | # 2284 | max_callbacks_per_loop = 20 2285 | 2286 | # Only applicable if ``[scheduler] standalone_dag_processor`` is true. 
2287 | # Time in seconds after which dags, which were not updated by Dag Processor are deactivated. 2288 | # 2289 | # Variable: AIRFLOW__SCHEDULER__DAG_STALE_NOT_SEEN_DURATION 2290 | # 2291 | dag_stale_not_seen_duration = 600 2292 | 2293 | # Turn off scheduler use of cron intervals by setting this to ``False``. 2294 | # DAGs submitted manually in the web UI or with trigger_dag will still run. 2295 | # 2296 | # Variable: AIRFLOW__SCHEDULER__USE_JOB_SCHEDULE 2297 | # 2298 | use_job_schedule = True 2299 | 2300 | # Allow externally triggered DagRuns for Execution Dates in the future 2301 | # Only has effect if schedule_interval is set to None in DAG 2302 | # 2303 | # Variable: AIRFLOW__SCHEDULER__ALLOW_TRIGGER_IN_FUTURE 2304 | # 2305 | allow_trigger_in_future = False 2306 | 2307 | # How often to check for expired trigger requests that have not run yet. 2308 | # 2309 | # Variable: AIRFLOW__SCHEDULER__TRIGGER_TIMEOUT_CHECK_INTERVAL 2310 | # 2311 | trigger_timeout_check_interval = 15 2312 | 2313 | # Amount of time a task can be in the queued state before being retried or set to failed. 2314 | # 2315 | # Variable: AIRFLOW__SCHEDULER__TASK_QUEUED_TIMEOUT 2316 | # 2317 | task_queued_timeout = 600.0 2318 | 2319 | # How often to check for tasks that have been in the queued state for 2320 | # longer than ``[scheduler] task_queued_timeout``. 2321 | # 2322 | # Variable: AIRFLOW__SCHEDULER__TASK_QUEUED_TIMEOUT_CHECK_INTERVAL 2323 | # 2324 | task_queued_timeout_check_interval = 120.0 2325 | 2326 | # The run_id pattern used to verify the validity of user input to the run_id parameter when 2327 | # triggering a DAG. This pattern cannot change the pattern used by scheduler to generate run_id 2328 | # for scheduled DAG runs or DAG runs triggered without changing the run_id parameter. 2329 | # 2330 | # Variable: AIRFLOW__SCHEDULER__ALLOWED_RUN_ID_PATTERN 2331 | # 2332 | allowed_run_id_pattern = ^[A-Za-z0-9_.~:+-]+$ 2333 | 2334 | # Whether to create DAG runs that span an interval or one single point in time for cron schedules, when 2335 | # a cron string is provided to ``schedule`` argument of a DAG. 2336 | # 2337 | # * ``True``: **CronDataIntervalTimetable** is used, which is suitable 2338 | # for DAGs with well-defined data interval. You get contiguous intervals from the end of the previous 2339 | # interval up to the scheduled datetime. 2340 | # * ``False``: **CronTriggerTimetable** is used, which is closer to the behavior of cron itself. 2341 | # 2342 | # Notably, for **CronTriggerTimetable**, the logical date is the same as the time the DAG Run will 2343 | # try to schedule, while for **CronDataIntervalTimetable**, the logical date is the beginning of 2344 | # the data interval, but the DAG Run will try to schedule at the end of the data interval. 2345 | # 2346 | # Variable: AIRFLOW__SCHEDULER__CREATE_CRON_DATA_INTERVALS 2347 | # 2348 | create_cron_data_intervals = True 2349 | 2350 | [triggerer] 2351 | # How many triggers a single Triggerer will run at once, by default. 2352 | # 2353 | # Variable: AIRFLOW__TRIGGERER__DEFAULT_CAPACITY 2354 | # 2355 | default_capacity = 1000 2356 | 2357 | # How often to heartbeat the Triggerer job to ensure it hasn't been killed. 2358 | # 2359 | # Variable: AIRFLOW__TRIGGERER__JOB_HEARTBEAT_SEC 2360 | # 2361 | job_heartbeat_sec = 5 2362 | 2363 | # If the last triggerer heartbeat happened more than ``[triggerer] triggerer_health_check_threshold`` 2364 | # ago (in seconds), triggerer is considered unhealthy. 
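# Illustrative sketch (editorial note): the effect of ``allowed_run_id_pattern`` above can be
# checked locally with plain ``re`` before triggering a DAG with a custom run_id:
#
# .. code-block:: python
#
#     import re
#
#     ALLOWED = re.compile(r"^[A-Za-z0-9_.~:+-]+$")
#     print(bool(ALLOWED.match("manual__2024-01-01T00:00:00+00:00")))  # True
#     print(bool(ALLOWED.match("bad run id with spaces")))             # False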
2365 | # This is used by the health check in the **/health** endpoint and in ``airflow jobs check`` CLI 2366 | # for TriggererJob. 2367 | # 2368 | # Variable: AIRFLOW__TRIGGERER__TRIGGERER_HEALTH_CHECK_THRESHOLD 2369 | # 2370 | triggerer_health_check_threshold = 30 2371 | 2372 | [kerberos] 2373 | # Location of your ccache file once kinit has been performed. 2374 | # 2375 | # Variable: AIRFLOW__KERBEROS__CCACHE 2376 | # 2377 | ccache = /tmp/airflow_krb5_ccache 2378 | 2379 | # gets augmented with fqdn 2380 | # 2381 | # Variable: AIRFLOW__KERBEROS__PRINCIPAL 2382 | # 2383 | principal = airflow 2384 | 2385 | # Determines the frequency at which initialization or re-initialization processes occur. 2386 | # 2387 | # Variable: AIRFLOW__KERBEROS__REINIT_FREQUENCY 2388 | # 2389 | reinit_frequency = 3600 2390 | 2391 | # Path to the kinit executable 2392 | # 2393 | # Variable: AIRFLOW__KERBEROS__KINIT_PATH 2394 | # 2395 | kinit_path = kinit 2396 | 2397 | # Designates the path to the Kerberos keytab file for the Airflow user 2398 | # 2399 | # Variable: AIRFLOW__KERBEROS__KEYTAB 2400 | # 2401 | keytab = airflow.keytab 2402 | 2403 | # Allow to disable ticket forwardability. 2404 | # 2405 | # Variable: AIRFLOW__KERBEROS__FORWARDABLE 2406 | # 2407 | forwardable = True 2408 | 2409 | # Allow to remove source IP from token, useful when using token behind NATted Docker host. 2410 | # 2411 | # Variable: AIRFLOW__KERBEROS__INCLUDE_IP 2412 | # 2413 | include_ip = True 2414 | 2415 | [sensors] 2416 | # Sensor default timeout, 7 days by default (7 * 24 * 60 * 60). 2417 | # 2418 | # Variable: AIRFLOW__SENSORS__DEFAULT_TIMEOUT 2419 | # 2420 | default_timeout = 604800 2421 | 2422 | [usage_data_collection] 2423 | # Airflow integrates `Scarf `__ to collect basic platform and usage data 2424 | # during operation. This data assists Airflow maintainers in better understanding how Airflow is used. 2425 | # Insights gained from this telemetry are critical for prioritizing patches, minor releases, and 2426 | # security fixes. Additionally, this information supports key decisions related to the development road map. 2427 | # Check the FAQ doc for more information on what data is collected. 2428 | # 2429 | # Deployments can opt-out of analytics by setting the ``enabled`` option 2430 | # to ``False``, or the ``SCARF_ANALYTICS=false`` environment variable. 2431 | # Individual users can easily opt-out of analytics in various ways documented in the 2432 | # `Scarf Do Not Track docs `__. 2433 | 2434 | # Enable or disable usage data collection and sending. 2435 | # 2436 | # Variable: AIRFLOW__USAGE_DATA_COLLECTION__ENABLED 2437 | # 2438 | enabled = True 2439 | 2440 | [aws] 2441 | # This section contains settings for Amazon Web Services (AWS) integration. 2442 | 2443 | # session_factory = 2444 | cloudwatch_task_handler_json_serializer = airflow.providers.amazon.aws.log.cloudwatch_task_handler.json_serialize_legacy 2445 | 2446 | [aws_batch_executor] 2447 | # This section only applies if you are using the AwsBatchExecutor in 2448 | # Airflow's ``[core]`` configuration. 
2449 | # For more information on any of these execution parameters, see the link below: 2450 | # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/batch.html#Batch.Client.submit_job 2451 | # For boto3 credential management, see 2452 | # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html 2453 | 2454 | conn_id = aws_default 2455 | # region_name = 2456 | max_submit_job_attempts = 3 2457 | check_health_on_startup = True 2458 | # job_name = 2459 | # job_queue = 2460 | # job_definition = 2461 | # submit_job_kwargs = 2462 | 2463 | [aws_ecs_executor] 2464 | # This section only applies if you are using the AwsEcsExecutor in 2465 | # Airflow's ``[core]`` configuration. 2466 | # For more information on any of these execution parameters, see the link below: 2467 | # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ecs/client/run_task.html 2468 | # For boto3 credential management, see 2469 | # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html 2470 | 2471 | conn_id = aws_default 2472 | # region_name = 2473 | assign_public_ip = False 2474 | # cluster = 2475 | # capacity_provider_strategy = 2476 | # container_name = 2477 | # launch_type = 2478 | platform_version = LATEST 2479 | # security_groups = 2480 | # subnets = 2481 | # task_definition = 2482 | max_run_task_attempts = 3 2483 | # run_task_kwargs = 2484 | check_health_on_startup = True 2485 | 2486 | [aws_auth_manager] 2487 | # This section only applies if you are using the AwsAuthManager. In other words, if you set 2488 | # ``[core] auth_manager = airflow.providers.amazon.aws.auth_manager.aws_auth_manager.AwsAuthManager`` in 2489 | # Airflow's configuration. 2490 | 2491 | enable = False 2492 | conn_id = aws_default 2493 | # region_name = 2494 | # saml_metadata_url = 2495 | # avp_policy_store_id = 2496 | 2497 | [celery_kubernetes_executor] 2498 | # This section only applies if you are using the ``CeleryKubernetesExecutor`` in 2499 | # ``[core]`` section above 2500 | 2501 | # Define when to send a task to ``KubernetesExecutor`` when using ``CeleryKubernetesExecutor``. 2502 | # When the queue of a task is the value of ``kubernetes_queue`` (default ``kubernetes``), 2503 | # the task is executed via ``KubernetesExecutor``, 2504 | # otherwise via ``CeleryExecutor`` 2505 | # 2506 | # Variable: AIRFLOW__CELERY_KUBERNETES_EXECUTOR__KUBERNETES_QUEUE 2507 | # 2508 | kubernetes_queue = kubernetes 2509 | 2510 | [celery] 2511 | # This section only applies if you are using the CeleryExecutor in 2512 | # ``[core]`` section above 2513 | 2514 | # The app name that will be used by celery 2515 | # 2516 | # Variable: AIRFLOW__CELERY__CELERY_APP_NAME 2517 | # 2518 | celery_app_name = airflow.providers.celery.executors.celery_executor 2519 | 2520 | # The concurrency that will be used when starting workers with the 2521 | # ``airflow celery worker`` command. This defines the number of task instances that 2522 | # a worker will take, so size up your workers based on the resources on 2523 | # your worker box and the nature of your tasks 2524 | # 2525 | # Variable: AIRFLOW__CELERY__WORKER_CONCURRENCY 2526 | # 2527 | worker_concurrency = 16 2528 | 2529 | # The maximum and minimum number of pool processes that will be used to dynamically resize 2530 | # the pool based on load.Enable autoscaling by providing max_concurrency,min_concurrency 2531 | # with the ``airflow celery worker`` command (always keep minimum processes, 2532 | # but grow to maximum if necessary). 
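# Illustrative sketch (editorial note): with ``CeleryKubernetesExecutor``, routing a single task to
# ``KubernetesExecutor`` is done in DAG code by setting the task's ``queue`` to the
# ``kubernetes_queue`` value above; task and callable names below are hypothetical.
#
# .. code-block:: python
#
#     from airflow.operators.python import PythonOperator
#
#     heavy_task = PythonOperator(
#         task_id="heavy_task",
#         python_callable=run_heavy_job,   # assumed to be defined elsewhere in the DAG file
#         queue="kubernetes",              # any other queue falls back to CeleryExecutor
#     )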
2533 | # Pick these numbers based on resources on worker box and the nature of the task. 2534 | # If autoscale option is available, worker_concurrency will be ignored. 2535 | # https://docs.celeryq.dev/en/latest/reference/celery.bin.worker.html#cmdoption-celery-worker-autoscale 2536 | # 2537 | # Example: worker_autoscale = 16,12 2538 | # 2539 | # Variable: AIRFLOW__CELERY__WORKER_AUTOSCALE 2540 | # 2541 | # worker_autoscale = 2542 | 2543 | # Used to increase the number of tasks that a worker prefetches which can improve performance. 2544 | # The number of processes multiplied by worker_prefetch_multiplier is the number of tasks 2545 | # that are prefetched by a worker. A value greater than 1 can result in tasks being unnecessarily 2546 | # blocked if there are multiple workers and one worker prefetches tasks that sit behind long 2547 | # running tasks while another worker has unutilized processes that are unable to process the already 2548 | # claimed blocked tasks. 2549 | # https://docs.celeryq.dev/en/stable/userguide/optimizing.html#prefetch-limits 2550 | # 2551 | # Variable: AIRFLOW__CELERY__WORKER_PREFETCH_MULTIPLIER 2552 | # 2553 | worker_prefetch_multiplier = 1 2554 | 2555 | # Specify if remote control of the workers is enabled. 2556 | # In some cases when the broker does not support remote control, Celery creates lots of 2557 | # ``.*reply-celery-pidbox`` queues. You can prevent this by setting this to false. 2558 | # However, with this disabled Flower won't work. 2559 | # https://docs.celeryq.dev/en/stable/getting-started/backends-and-brokers/index.html#broker-overview 2560 | # 2561 | # Variable: AIRFLOW__CELERY__WORKER_ENABLE_REMOTE_CONTROL 2562 | # 2563 | worker_enable_remote_control = true 2564 | 2565 | # The Celery broker URL. Celery supports RabbitMQ, Redis and experimentally 2566 | # a sqlalchemy database. Refer to the Celery documentation for more information. 2567 | # 2568 | # Variable: AIRFLOW__CELERY__BROKER_URL 2569 | # 2570 | broker_url = redis://redis:6379/0 2571 | 2572 | # The Celery result_backend. When a job finishes, it needs to update the 2573 | # metadata of the job. Therefore it will post a message on a message bus, 2574 | # or insert it into a database (depending of the backend) 2575 | # This status is used by the scheduler to update the state of the task 2576 | # The use of a database is highly recommended 2577 | # When not specified, sql_alchemy_conn with a db+ scheme prefix will be used 2578 | # https://docs.celeryq.dev/en/latest/userguide/configuration.html#task-result-backend-settings 2579 | # 2580 | # Example: result_backend = db+postgresql://postgres:airflow@postgres/airflow 2581 | # 2582 | # Variable: AIRFLOW__CELERY__RESULT_BACKEND 2583 | # 2584 | # result_backend = 2585 | 2586 | # Optional configuration dictionary to pass to the Celery result backend SQLAlchemy engine. 2587 | # 2588 | # Example: result_backend_sqlalchemy_engine_options = {"pool_recycle": 1800} 2589 | # 2590 | # Variable: AIRFLOW__CELERY__RESULT_BACKEND_SQLALCHEMY_ENGINE_OPTIONS 2591 | # 2592 | result_backend_sqlalchemy_engine_options = 2593 | 2594 | # Celery Flower is a sweet UI for Celery. Airflow has a shortcut to start 2595 | # it ``airflow celery flower``. 
This defines the IP that Celery Flower runs on 2596 | # 2597 | # Variable: AIRFLOW__CELERY__FLOWER_HOST 2598 | # 2599 | flower_host = 0.0.0.0 2600 | 2601 | # The root URL for Flower 2602 | # 2603 | # Example: flower_url_prefix = /flower 2604 | # 2605 | # Variable: AIRFLOW__CELERY__FLOWER_URL_PREFIX 2606 | # 2607 | flower_url_prefix = 2608 | 2609 | # This defines the port that Celery Flower runs on 2610 | # 2611 | # Variable: AIRFLOW__CELERY__FLOWER_PORT 2612 | # 2613 | flower_port = 5555 2614 | 2615 | # Securing Flower with Basic Authentication 2616 | # Accepts user:password pairs separated by a comma 2617 | # 2618 | # Example: flower_basic_auth = user1:password1,user2:password2 2619 | # 2620 | # Variable: AIRFLOW__CELERY__FLOWER_BASIC_AUTH 2621 | # 2622 | flower_basic_auth = 2623 | 2624 | # How many processes CeleryExecutor uses to sync task state. 2625 | # 0 means to use max(1, number of cores - 1) processes. 2626 | # 2627 | # Variable: AIRFLOW__CELERY__SYNC_PARALLELISM 2628 | # 2629 | sync_parallelism = 0 2630 | 2631 | # Import path for celery configuration options 2632 | # 2633 | # Variable: AIRFLOW__CELERY__CELERY_CONFIG_OPTIONS 2634 | # 2635 | celery_config_options = airflow.providers.celery.executors.default_celery.DEFAULT_CELERY_CONFIG 2636 | 2637 | # 2638 | # Variable: AIRFLOW__CELERY__SSL_ACTIVE 2639 | # 2640 | ssl_active = False 2641 | 2642 | # Path to the client key. 2643 | # 2644 | # Variable: AIRFLOW__CELERY__SSL_KEY 2645 | # 2646 | ssl_key = 2647 | 2648 | # Path to the client certificate. 2649 | # 2650 | # Variable: AIRFLOW__CELERY__SSL_CERT 2651 | # 2652 | ssl_cert = 2653 | 2654 | # Path to the CA certificate. 2655 | # 2656 | # Variable: AIRFLOW__CELERY__SSL_CACERT 2657 | # 2658 | ssl_cacert = 2659 | 2660 | # Celery Pool implementation. 2661 | # Choices include: ``prefork`` (default), ``eventlet``, ``gevent`` or ``solo``. 2662 | # See: 2663 | # https://docs.celeryq.dev/en/latest/userguide/workers.html#concurrency 2664 | # https://docs.celeryq.dev/en/latest/userguide/concurrency/eventlet.html 2665 | # 2666 | # Variable: AIRFLOW__CELERY__POOL 2667 | # 2668 | pool = prefork 2669 | 2670 | # The number of seconds to wait before timing out ``send_task_to_executor`` or 2671 | # ``fetch_celery_task_state`` operations. 2672 | # 2673 | # Variable: AIRFLOW__CELERY__OPERATION_TIMEOUT 2674 | # 2675 | operation_timeout = 1.0 2676 | 2677 | task_acks_late = True 2678 | # Celery task will report its status as 'started' when the task is executed by a worker. 2679 | # This is used in Airflow to keep track of the running tasks and if a Scheduler is restarted 2680 | # or run in HA mode, it can adopt the orphan tasks launched by previous SchedulerJob. 2681 | # 2682 | # Variable: AIRFLOW__CELERY__TASK_TRACK_STARTED 2683 | # 2684 | task_track_started = True 2685 | 2686 | # The Maximum number of retries for publishing task messages to the broker when failing 2687 | # due to ``AirflowTaskTimeout`` error before giving up and marking Task as failed. 2688 | # 2689 | # Variable: AIRFLOW__CELERY__TASK_PUBLISH_MAX_RETRIES 2690 | # 2691 | task_publish_max_retries = 3 2692 | 2693 | # Worker initialisation check to validate Metadata Database connection 2694 | # 2695 | # Variable: AIRFLOW__CELERY__WORKER_PRECHECK 2696 | # 2697 | worker_precheck = False 2698 | 2699 | [celery_broker_transport_options] 2700 | # This section is for specifying options which can be passed to the 2701 | # underlying celery broker transport. 
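# Editorial note on the bare ``task_acks_late = True`` entry above, which carries no description in
# this file: in Celery, late acknowledgement means a task message is acked only after the worker
# finishes executing it, so tasks running on a worker that dies can be redelivered instead of being
# lost; the trade-off is that such tasks may run more than once.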
See: 2702 | # https://docs.celeryq.dev/en/latest/userguide/configuration.html#std:setting-broker_transport_options 2703 | 2704 | # The visibility timeout defines the number of seconds to wait for the worker 2705 | # to acknowledge the task before the message is redelivered to another worker. 2706 | # Make sure to increase the visibility timeout to match the time of the longest 2707 | # ETA you're planning to use. 2708 | # visibility_timeout is only supported for Redis and SQS celery brokers. 2709 | # See: 2710 | # https://docs.celeryq.dev/en/stable/getting-started/backends-and-brokers/redis.html#visibility-timeout 2711 | # 2712 | # Example: visibility_timeout = 21600 2713 | # 2714 | # Variable: AIRFLOW__CELERY_BROKER_TRANSPORT_OPTIONS__VISIBILITY_TIMEOUT 2715 | # 2716 | # visibility_timeout = 2717 | 2718 | # The sentinel_kwargs parameter allows passing additional options to the Sentinel client. 2719 | # In a typical scenario where Redis Sentinel is used as the broker and Redis servers are 2720 | # password-protected, the password needs to be passed through this parameter. Although its 2721 | # type is string, it is required to pass a string that conforms to the dictionary format. 2722 | # See: 2723 | # https://docs.celeryq.dev/en/stable/getting-started/backends-and-brokers/redis.html#configuration 2724 | # 2725 | # Example: sentinel_kwargs = {"password": "password_for_redis_server"} 2726 | # 2727 | # Variable: AIRFLOW__CELERY_BROKER_TRANSPORT_OPTIONS__SENTINEL_KWARGS 2728 | # 2729 | # sentinel_kwargs = 2730 | 2731 | [local_kubernetes_executor] 2732 | # This section only applies if you are using the ``LocalKubernetesExecutor`` in 2733 | # ``[core]`` section above 2734 | 2735 | # Define when to send a task to ``KubernetesExecutor`` when using ``LocalKubernetesExecutor``. 2736 | # When the queue of a task is the value of ``kubernetes_queue`` (default ``kubernetes``), 2737 | # the task is executed via ``KubernetesExecutor``, 2738 | # otherwise via ``LocalExecutor`` 2739 | # 2740 | # Variable: AIRFLOW__LOCAL_KUBERNETES_EXECUTOR__KUBERNETES_QUEUE 2741 | # 2742 | kubernetes_queue = kubernetes 2743 | 2744 | [kubernetes_executor] 2745 | # Kwargs to override the default urllib3 Retry used in the kubernetes API client 2746 | # 2747 | # Example: api_client_retry_configuration = { "total": 3, "backoff_factor": 0.5 } 2748 | # 2749 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__API_CLIENT_RETRY_CONFIGURATION 2750 | # 2751 | api_client_retry_configuration = 2752 | 2753 | # Flag to control the information added to kubernetes executor logs for better traceability 2754 | # 2755 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__LOGS_TASK_METADATA 2756 | # 2757 | logs_task_metadata = False 2758 | 2759 | # Path to the YAML pod file that forms the basis for KubernetesExecutor workers. 2760 | # 2761 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__POD_TEMPLATE_FILE 2762 | # 2763 | pod_template_file = 2764 | 2765 | # The repository of the Kubernetes Image for the Worker to Run 2766 | # 2767 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__WORKER_CONTAINER_REPOSITORY 2768 | # 2769 | worker_container_repository = 2770 | 2771 | # The tag of the Kubernetes Image for the Worker to Run 2772 | # 2773 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__WORKER_CONTAINER_TAG 2774 | # 2775 | worker_container_tag = 2776 | 2777 | # The Kubernetes namespace where airflow workers should be created. 
Defaults to ``default`` 2778 | # 2779 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__NAMESPACE 2780 | # 2781 | namespace = default 2782 | 2783 | # If True, all worker pods will be deleted upon termination 2784 | # 2785 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__DELETE_WORKER_PODS 2786 | # 2787 | delete_worker_pods = True 2788 | 2789 | # If False (and delete_worker_pods is True), 2790 | # failed worker pods will not be deleted so users can investigate them. 2791 | # This only prevents removal of worker pods where the worker itself failed, 2792 | # not when the task it ran failed. 2793 | # 2794 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__DELETE_WORKER_PODS_ON_FAILURE 2795 | # 2796 | delete_worker_pods_on_failure = False 2797 | 2798 | worker_pod_pending_fatal_container_state_reasons = CreateContainerConfigError,ErrImagePull,CreateContainerError,ImageInspectError, InvalidImageName 2799 | # Number of Kubernetes Worker Pod creation calls per scheduler loop. 2800 | # Note that the current default of "1" will only launch a single pod 2801 | # per-heartbeat. It is HIGHLY recommended that users increase this 2802 | # number to match the tolerance of their kubernetes cluster for 2803 | # better performance. 2804 | # 2805 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__WORKER_PODS_CREATION_BATCH_SIZE 2806 | # 2807 | worker_pods_creation_batch_size = 1 2808 | 2809 | # Allows users to launch pods in multiple namespaces. 2810 | # Will require creating a cluster-role for the scheduler, 2811 | # or use multi_namespace_mode_namespace_list configuration. 2812 | # 2813 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__MULTI_NAMESPACE_MODE 2814 | # 2815 | multi_namespace_mode = False 2816 | 2817 | # If multi_namespace_mode is True while scheduler does not have a cluster-role, 2818 | # give the list of namespaces where the scheduler will schedule jobs 2819 | # Scheduler needs to have the necessary permissions in these namespaces. 2820 | # 2821 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__MULTI_NAMESPACE_MODE_NAMESPACE_LIST 2822 | # 2823 | multi_namespace_mode_namespace_list = 2824 | 2825 | # Use the service account kubernetes gives to pods to connect to kubernetes cluster. 2826 | # It's intended for clients that expect to be running inside a pod running on kubernetes. 2827 | # It will raise an exception if called from a process not running in a kubernetes environment. 2828 | # 2829 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__IN_CLUSTER 2830 | # 2831 | in_cluster = True 2832 | 2833 | # When running with in_cluster=False change the default cluster_context or config_file 2834 | # options to Kubernetes client. Leave blank these to use default behaviour like ``kubectl`` has. 2835 | # 2836 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__CLUSTER_CONTEXT 2837 | # 2838 | # cluster_context = 2839 | 2840 | # Path to the kubernetes configfile to be used when ``in_cluster`` is set to False 2841 | # 2842 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__CONFIG_FILE 2843 | # 2844 | # config_file = 2845 | 2846 | # Keyword parameters to pass while calling a kubernetes client core_v1_api methods 2847 | # from Kubernetes Executor provided as a single line formatted JSON dictionary string. 2848 | # List of supported params are similar for all core_v1_apis, hence a single config 2849 | # variable for all apis. 
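# Illustrative example (assuming the kubernetes Python client's
# ``_request_timeout`` keyword argument is what you want to tune):
# kube_client_request_args = {"_request_timeout": [60, 60]}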
See: 2850 | # https://raw.githubusercontent.com/kubernetes-client/python/41f11a09995efcd0142e25946adc7591431bfb2f/kubernetes/client/api/core_v1_api.py 2851 | # 2852 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__KUBE_CLIENT_REQUEST_ARGS 2853 | # 2854 | kube_client_request_args = 2855 | 2856 | # Optional keyword arguments to pass to the ``delete_namespaced_pod`` kubernetes client 2857 | # ``core_v1_api`` method when using the Kubernetes Executor. 2858 | # This should be an object and can contain any of the options listed in the ``v1DeleteOptions`` 2859 | # class defined here: 2860 | # https://github.com/kubernetes-client/python/blob/41f11a09995efcd0142e25946adc7591431bfb2f/kubernetes/client/models/v1_delete_options.py#L19 2861 | # 2862 | # Example: delete_option_kwargs = {"grace_period_seconds": 10} 2863 | # 2864 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__DELETE_OPTION_KWARGS 2865 | # 2866 | delete_option_kwargs = 2867 | 2868 | # Enables TCP keepalive mechanism. This prevents Kubernetes API requests to hang indefinitely 2869 | # when idle connection is time-outed on services like cloud load balancers or firewalls. 2870 | # 2871 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__ENABLE_TCP_KEEPALIVE 2872 | # 2873 | enable_tcp_keepalive = True 2874 | 2875 | # When the `enable_tcp_keepalive` option is enabled, TCP probes a connection that has 2876 | # been idle for `tcp_keep_idle` seconds. 2877 | # 2878 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__TCP_KEEP_IDLE 2879 | # 2880 | tcp_keep_idle = 120 2881 | 2882 | # When the `enable_tcp_keepalive` option is enabled, if Kubernetes API does not respond 2883 | # to a keepalive probe, TCP retransmits the probe after `tcp_keep_intvl` seconds. 2884 | # 2885 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__TCP_KEEP_INTVL 2886 | # 2887 | tcp_keep_intvl = 30 2888 | 2889 | # When the `enable_tcp_keepalive` option is enabled, if Kubernetes API does not respond 2890 | # to a keepalive probe, TCP retransmits the probe `tcp_keep_cnt number` of times before 2891 | # a connection is considered to be broken. 2892 | # 2893 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__TCP_KEEP_CNT 2894 | # 2895 | tcp_keep_cnt = 6 2896 | 2897 | # Set this to false to skip verifying SSL certificate of Kubernetes python client. 2898 | # 2899 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__VERIFY_SSL 2900 | # 2901 | verify_ssl = True 2902 | 2903 | # How often in seconds to check for task instances stuck in "queued" status without a pod 2904 | # 2905 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__WORKER_PODS_QUEUED_CHECK_INTERVAL 2906 | # 2907 | worker_pods_queued_check_interval = 60 2908 | 2909 | # Path to a CA certificate to be used by the Kubernetes client to verify the server's SSL certificate. 2910 | # 2911 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__SSL_CA_CERT 2912 | # 2913 | ssl_ca_cert = 2914 | 2915 | # The Maximum number of retries for queuing the task to the kubernetes scheduler when 2916 | # failing due to Kube API exceeded quota errors before giving up and marking task as failed. 2917 | # -1 for unlimited times. 2918 | # 2919 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__TASK_PUBLISH_MAX_RETRIES 2920 | # 2921 | task_publish_max_retries = 0 2922 | 2923 | [common.io] 2924 | # Common IO configuration section 2925 | 2926 | # Path to a location on object storage where XComs can be stored in url format. 
2927 | # 2928 | # Example: xcom_objectstorage_path = s3://conn_id@bucket/path 2929 | # 2930 | # Variable: AIRFLOW__COMMON.IO__XCOM_OBJECTSTORAGE_PATH 2931 | # 2932 | xcom_objectstorage_path = 2933 | 2934 | # Threshold in bytes for storing XComs in object storage. -1 means always store in the 2935 | # database. 0 means always store in object storage. Any positive number means 2936 | # it will be stored in object storage if the size of the value is greater than the threshold. 2937 | # 2938 | # Example: xcom_objectstorage_threshold = 1000000 2939 | # 2940 | # Variable: AIRFLOW__COMMON.IO__XCOM_OBJECTSTORAGE_THRESHOLD 2941 | # 2942 | xcom_objectstorage_threshold = -1 2943 | 2944 | # Compression algorithm to use when storing XComs in object storage. Supported algorithms 2945 | # are a.o.: snappy, zip, gzip, bz2, and lzma. If not specified, no compression will be used. 2946 | # Note that the compression algorithm must be available in the Python installation (e.g. 2947 | # python-snappy for snappy). Zip, gz, bz2 are available by default. 2948 | # 2949 | # Example: xcom_objectstorage_compression = gz 2950 | # 2951 | # Variable: AIRFLOW__COMMON.IO__XCOM_OBJECTSTORAGE_COMPRESSION 2952 | # 2953 | xcom_objectstorage_compression = 2954 | 2955 | [elasticsearch] 2956 | # Elasticsearch host 2957 | # 2958 | # Variable: AIRFLOW__ELASTICSEARCH__HOST 2959 | # 2960 | host = 2961 | 2962 | # Format of the log_id, which is used to query for a given tasks logs 2963 | # 2964 | # Variable: AIRFLOW__ELASTICSEARCH__LOG_ID_TEMPLATE 2965 | # 2966 | log_id_template = {dag_id}-{task_id}-{run_id}-{map_index}-{try_number} 2967 | 2968 | # Used to mark the end of a log stream for a task 2969 | # 2970 | # Variable: AIRFLOW__ELASTICSEARCH__END_OF_LOG_MARK 2971 | # 2972 | end_of_log_mark = end_of_log 2973 | 2974 | # Qualified URL for an elasticsearch frontend (like Kibana) with a template argument for log_id 2975 | # Code will construct log_id using the log_id template from the argument above. 2976 | # NOTE: scheme will default to https if one is not provided 2977 | # 2978 | # Example: frontend = http://localhost:5601/app/kibana#/discover?_a=(columns:!(message),query:(language:kuery,query:'log_id: "{log_id}"'),sort:!(log.offset,asc)) 2979 | # 2980 | # Variable: AIRFLOW__ELASTICSEARCH__FRONTEND 2981 | # 2982 | frontend = 2983 | 2984 | # Write the task logs to the stdout of the worker, rather than the default files 2985 | # 2986 | # Variable: AIRFLOW__ELASTICSEARCH__WRITE_STDOUT 2987 | # 2988 | write_stdout = False 2989 | 2990 | # Instead of the default log formatter, write the log lines as JSON 2991 | # 2992 | # Variable: AIRFLOW__ELASTICSEARCH__JSON_FORMAT 2993 | # 2994 | json_format = False 2995 | 2996 | # Log fields to also attach to the json output, if enabled 2997 | # 2998 | # Variable: AIRFLOW__ELASTICSEARCH__JSON_FIELDS 2999 | # 3000 | json_fields = asctime, filename, lineno, levelname, message 3001 | 3002 | # The field where host name is stored (normally either `host` or `host.name`) 3003 | # 3004 | # Variable: AIRFLOW__ELASTICSEARCH__HOST_FIELD 3005 | # 3006 | host_field = host 3007 | 3008 | # The field where offset is stored (normally either `offset` or `log.offset`) 3009 | # 3010 | # Variable: AIRFLOW__ELASTICSEARCH__OFFSET_FIELD 3011 | # 3012 | offset_field = offset 3013 | 3014 | # Comma separated list of index patterns to use when searching for logs (default: `_all`). 3015 | # The index_patterns_callable takes precedence over this. 
3016 | # 3017 | # Example: index_patterns = something-* 3018 | # 3019 | # Variable: AIRFLOW__ELASTICSEARCH__INDEX_PATTERNS 3020 | # 3021 | index_patterns = _all 3022 | 3023 | index_patterns_callable = 3024 | 3025 | [elasticsearch_configs] 3026 | # 3027 | # Variable: AIRFLOW__ELASTICSEARCH_CONFIGS__HTTP_COMPRESS 3028 | # 3029 | http_compress = False 3030 | 3031 | # 3032 | # Variable: AIRFLOW__ELASTICSEARCH_CONFIGS__VERIFY_CERTS 3033 | # 3034 | verify_certs = True 3035 | 3036 | [fab] 3037 | # This section contains configs specific to FAB provider. 3038 | 3039 | # Boolean for enabling rate limiting on authentication endpoints. 3040 | # 3041 | # Variable: AIRFLOW__FAB__AUTH_RATE_LIMITED 3042 | # 3043 | auth_rate_limited = True 3044 | 3045 | # Rate limit for authentication endpoints. 3046 | # 3047 | # Variable: AIRFLOW__FAB__AUTH_RATE_LIMIT 3048 | # 3049 | auth_rate_limit = 5 per 40 second 3050 | 3051 | # Update FAB permissions and sync security manager roles 3052 | # on webserver startup 3053 | # 3054 | # Variable: AIRFLOW__FAB__UPDATE_FAB_PERMS 3055 | # 3056 | update_fab_perms = True 3057 | 3058 | [imap] 3059 | # Options for IMAP provider. 3060 | 3061 | # ssl_context = 3062 | 3063 | [azure_remote_logging] 3064 | # Configuration that needs to be set for enable remote logging in Azure Blob Storage 3065 | 3066 | remote_wasb_log_container = airflow-logs 3067 | 3068 | [openlineage] 3069 | # This section applies settings for OpenLineage integration. 3070 | # More about configuration and it's precedence can be found at 3071 | # https://airflow.apache.org/docs/apache-airflow-providers-openlineage/stable/guides/user.html#transport-setup 3072 | 3073 | # Disable sending events without uninstalling the OpenLineage Provider by setting this to true. 3074 | # 3075 | # Variable: AIRFLOW__OPENLINEAGE__DISABLED 3076 | # 3077 | disabled = False 3078 | 3079 | # Exclude some Operators from emitting OpenLineage events by passing a string of semicolon separated 3080 | # full import paths of Operators to disable. 3081 | # 3082 | # Example: disabled_for_operators = airflow.providers.standard.operators.bash.BashOperator; airflow.providers.standard.operators.python.PythonOperator 3083 | # 3084 | # Variable: AIRFLOW__OPENLINEAGE__DISABLED_FOR_OPERATORS 3085 | # 3086 | disabled_for_operators = 3087 | 3088 | # If this setting is enabled, OpenLineage integration won't collect and emit metadata, 3089 | # unless you explicitly enable it per `DAG` or `Task` using `enable_lineage` method. 3090 | # 3091 | # Variable: AIRFLOW__OPENLINEAGE__SELECTIVE_ENABLE 3092 | # 3093 | selective_enable = False 3094 | 3095 | # Set namespace that the lineage data belongs to, so that if you use multiple OpenLineage producers, 3096 | # events coming from them will be logically separated. 3097 | # 3098 | # Example: namespace = my_airflow_instance_1 3099 | # 3100 | # Variable: AIRFLOW__OPENLINEAGE__NAMESPACE 3101 | # 3102 | # namespace = 3103 | 3104 | # Register custom OpenLineage Extractors by passing a string of semicolon separated full import paths. 3105 | # 3106 | # Example: extractors = full.path.to.ExtractorClass;full.path.to.AnotherExtractorClass 3107 | # 3108 | # Variable: AIRFLOW__OPENLINEAGE__EXTRACTORS 3109 | # 3110 | # extractors = 3111 | 3112 | # Register custom run facet functions by passing a string of semicolon separated full import paths. 
3113 | # 3114 | # Example: custom_run_facets = full.path.to.custom_facet_function;full.path.to.another_custom_facet_function 3115 | # 3116 | # Variable: AIRFLOW__OPENLINEAGE__CUSTOM_RUN_FACETS 3117 | # 3118 | custom_run_facets = 3119 | 3120 | # Specify the path to the YAML configuration file. 3121 | # This ensures backwards compatibility with passing config through the `openlineage.yml` file. 3122 | # 3123 | # Example: config_path = full/path/to/openlineage.yml 3124 | # 3125 | # Variable: AIRFLOW__OPENLINEAGE__CONFIG_PATH 3126 | # 3127 | config_path = 3128 | 3129 | # Pass OpenLineage Client transport configuration as JSON string. It should contain type of the 3130 | # transport and additional options (different for each transport type). For more details see: 3131 | # https://openlineage.io/docs/client/python/#built-in-transport-types 3132 | # 3133 | # Currently supported types are: 3134 | # 3135 | # * HTTP 3136 | # * Kafka 3137 | # * Console 3138 | # * File 3139 | # 3140 | # Example: transport = {"type": "http", "url": "http://localhost:5000", "endpoint": "api/v1/lineage"} 3141 | # 3142 | # Variable: AIRFLOW__OPENLINEAGE__TRANSPORT 3143 | # 3144 | transport = 3145 | 3146 | # Disable the inclusion of source code in OpenLineage events by setting this to `true`. 3147 | # By default, several Operators (e.g. Python, Bash) will include their source code in the events 3148 | # unless disabled. 3149 | # 3150 | # Variable: AIRFLOW__OPENLINEAGE__DISABLE_SOURCE_CODE 3151 | # 3152 | disable_source_code = False 3153 | 3154 | # Number of processes to utilize for processing DAG state changes 3155 | # in an asynchronous manner within the scheduler process. 3156 | # 3157 | # Variable: AIRFLOW__OPENLINEAGE__DAG_STATE_CHANGE_PROCESS_POOL_SIZE 3158 | # 3159 | dag_state_change_process_pool_size = 1 3160 | 3161 | # Maximum amount of time (in seconds) that OpenLineage can spend executing metadata extraction. 3162 | # 3163 | # Variable: AIRFLOW__OPENLINEAGE__EXECUTION_TIMEOUT 3164 | # 3165 | execution_timeout = 10 3166 | 3167 | # If true, OpenLineage event will include full task info - potentially containing large fields. 3168 | # 3169 | # Variable: AIRFLOW__OPENLINEAGE__INCLUDE_FULL_TASK_INFO 3170 | # 3171 | include_full_task_info = False 3172 | 3173 | # If true, OpenLineage events will include information useful for debugging - potentially 3174 | # containing large fields e.g. all installed packages and their versions. 3175 | # 3176 | # Variable: AIRFLOW__OPENLINEAGE__DEBUG_MODE 3177 | # 3178 | debug_mode = False 3179 | 3180 | [smtp_provider] 3181 | # Options for SMTP provider. 3182 | 3183 | # ssl context to use when using SMTP and IMAP SSL connections. By default, the context is "default" 3184 | # which sets it to ``ssl.create_default_context()`` which provides the right balance between 3185 | # compatibility and security, it however requires that certificates in your operating system are 3186 | # updated and that SMTP/IMAP servers of yours have valid certificates that have corresponding public 3187 | # keys installed on your machines. You can switch it to "none" if you want to disable checking 3188 | # of the certificates, but it is not recommended as it allows MITM (man-in-the-middle) attacks 3189 | # if your infrastructure is not sufficiently secured. It should only be set temporarily while you 3190 | # are fixing your certificate configuration. 
This can be typically done by upgrading to newer 3191 | # version of the operating system you run Airflow components on,by upgrading/refreshing proper 3192 | # certificates in the OS or by updating certificates for your mail servers. 3193 | # 3194 | # If you do not set this option explicitly, it will use Airflow "email.ssl_context" configuration, 3195 | # but if this configuration is not present, it will use "default" value. 3196 | # 3197 | # Example: ssl_context = default 3198 | # 3199 | # Variable: AIRFLOW__SMTP_PROVIDER__SSL_CONTEXT 3200 | # 3201 | # ssl_context = 3202 | 3203 | # Allows overriding of the standard templated email subject line when the SmtpNotifier is used. 3204 | # Must provide a path to the template. 3205 | # 3206 | # Example: templated_email_subject_path = path/to/override/email_subject.html 3207 | # 3208 | # Variable: AIRFLOW__SMTP_PROVIDER__TEMPLATED_EMAIL_SUBJECT_PATH 3209 | # 3210 | # templated_email_subject_path = 3211 | 3212 | # Allows overriding of the standard templated email path when the SmtpNotifier is used. Must provide 3213 | # a path to the template. 3214 | # 3215 | # Example: templated_html_content_path = path/to/override/email.html 3216 | # 3217 | # Variable: AIRFLOW__SMTP_PROVIDER__TEMPLATED_HTML_CONTENT_PATH 3218 | # 3219 | # templated_html_content_path = 3220 | -------------------------------------------------------------------------------- /dags/example.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.bash_operator import BashOperator 5 | from airflow.operators.python_operator import PythonOperator 6 | 7 | default_args = { 8 | "owner": "airflow", 9 | "depends_on_past": False, 10 | "start_date": datetime(2025, 1, 1), 11 | "retries": 1, 12 | "retry_delay": timedelta(minutes=5), 13 | } 14 | 15 | dag = DAG( 16 | dag_id="simple_example_dag", 17 | default_args=default_args, 18 | schedule_interval=timedelta(days=1), 19 | catchup=False, 20 | ) 21 | 22 | 23 | def print_hello(): 24 | print("Olá Mundo!!! 
\n Esta é a minha primeira tarefa") 25 | 26 | 27 | task1 = PythonOperator(task_id="print_hello", python_callable=print_hello, dag=dag) 28 | 29 | task2 = BashOperator( 30 | task_id="print_date", bash_command="date && sleep 5 & date", dag=dag 31 | ) 32 | 33 | task1 >> task2 34 | -------------------------------------------------------------------------------- /dags/execute_entities.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.dummy import DummyOperator 5 | from airflow.operators.python import PythonOperator 6 | from airflow.utils.task_group import TaskGroup 7 | from loguru import logger 8 | 9 | default_args = { 10 | "owner": "airflow", 11 | "depends_on_past": False, 12 | "retries": 3, 13 | "retry_delay": timedelta(minutes=1), 14 | } 15 | 16 | 17 | def get_endpoints(): 18 | from src.endpoints import Endpoints 19 | 20 | return Endpoints().get_all() 21 | 22 | 23 | def get_cutomers(endpoint: dict): 24 | from src.controllers.paginations import PaginationController 25 | 26 | resource = endpoint.get("resources", None) 27 | action = endpoint.get("action", None) 28 | params = endpoint.get("params", None) 29 | data_source = endpoint.get("data_source", None) 30 | pagination_type = endpoint.get("pagination_type", "per_page") 31 | page_label = endpoint.get("page_label", None) 32 | total_of_pages_label = endpoint.get("total_of_pages_label", None) 33 | records_label = endpoint.get("records_label", "registros") 34 | 35 | pagination = PaginationController() 36 | 37 | if pagination_type == "date_range": 38 | depends_on = endpoint.get("depends_on", None) 39 | 40 | if depends_on: 41 | from src.db.database import Database 42 | 43 | db = Database() 44 | 45 | try: 46 | accounts = db.select_from_table( 47 | table_name=depends_on, distinct_column="nCodCC" 48 | ) 49 | except Exception as e: 50 | logger.error( 51 | f"An error occurred while selecting from the table '{depends_on}': {e}" 52 | ) 53 | 54 | for account in accounts: 55 | params["nCodCC"] = account 56 | 57 | try: 58 | pagination.pagination( 59 | type=pagination_type, 60 | resource=resource, 61 | action=action, 62 | params=params, 63 | data_source=data_source, 64 | ) 65 | except Exception as e: 66 | logger.error(f"An error occurred while pagination: {e}") 67 | else: 68 | try: 69 | pagination.pagination( 70 | type=pagination_type, 71 | resource=resource, 72 | action=action, 73 | params=params, 74 | data_source=data_source, 75 | page_label=page_label, 76 | total_of_pages_label=total_of_pages_label, 77 | records_label=records_label, 78 | ) 79 | except Exception as e: 80 | logger.error(f"An error occurred while pagination: {e}") 81 | 82 | 83 | with DAG( 84 | "execute_entities", 85 | default_args=default_args, 86 | description="Execute entities", 87 | start_date=datetime(2025, 1, 1), 88 | schedule_interval="0 3 * * *", 89 | catchup=False, 90 | ) as dag: 91 | start = DummyOperator(task_id="start") 92 | end = DummyOperator(task_id="end") 93 | 94 | endpoints = get_endpoints() 95 | 96 | extract_endpoints = [e for e in endpoints if e.get("action") != "ListarExtrato"] 97 | excluded_extract_endpoints = [ 98 | e for e in endpoints if e.get("action") == "ListarExtrato" 99 | ] 100 | 101 | with TaskGroup("extract_and_load_omie_entities") as extract_group: 102 | for endpoint in extract_endpoints: 103 | tasks = PythonOperator( 104 | task_id=f"extract_and_load_{endpoint.get('action', None)}", 105 | python_callable=get_cutomers, 106 | 
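                # Descriptive note: one PythonOperator is generated per endpoint dict
                # returned by Endpoints().get_all(); op_kwargs hands that dict to the
                # callable at run time, so each task extracts and loads a single
                # Omie resource.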
op_kwargs={"endpoint": endpoint}, 107 | dag=dag, 108 | ) 109 | 110 | with TaskGroup("extract_and_load_omie_second_flow") as extract_second_group: 111 | for second_endpoint in excluded_extract_endpoints: 112 | second_tasks = PythonOperator( 113 | task_id=f"extract_and_load_{second_endpoint.get('action', None)}", 114 | python_callable=get_cutomers, 115 | op_kwargs={"endpoint": second_endpoint}, 116 | dag=dag, 117 | ) 118 | 119 | start >> extract_group >> extract_second_group >> end 120 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | x-airflow-common: 2 | &airflow-common 3 | build: . 4 | environment: 5 | &airflow-common-env 6 | AIRFLOW__CORE__EXECUTOR: CeleryExecutor 7 | AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 8 | AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow 9 | AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0 10 | AIRFLOW__CORE__FERNET_KEY: '' 11 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' 12 | AIRFLOW__CORE__LOAD_EXAMPLES: 'false' 13 | AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth,airflow.api.auth.backend.session' 14 | AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK: 'true' 15 | AIRFLOW__CORE__PARALLELISM: '4' 16 | AIRFLOW__CORE__DAG_CONCURRENCY: '2' 17 | AIRFLOW__CELERY__WORKER_CONCURRENCY: '2' 18 | # WARNING: Use _PIP_ADDITIONAL_REQUIREMENTS option ONLY for a quick checks 19 | # for other purpose (development, test and especially production usage) build/extend Airflow image. 20 | _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-} 21 | # The following line can be used to set a custom config file, stored in the local config folder 22 | # If you want to use it, outcomment it and replace airflow.cfg with the name of your config file 23 | AIRFLOW_CONFIG: '/opt/airflow/config/airflow.cfg' 24 | PYTHONPATH: /sources 25 | volumes: 26 | - ./dags:/opt/airflow/dags 27 | - ./logs:/opt/airflow/logs 28 | - ./config:/opt/airflow/config 29 | - ./plugins:/opt/airflow/plugins 30 | - ./src:/opt/airflow/src 31 | - ./requirements.txt:/opt/airflow/requirements.txt 32 | - ./.env:/opt/airflow/.env 33 | - ${AIRFLOW_PROJ_DIR:-.}:/sources 34 | user: "${AIRFLOW_UID:-50000}:0" 35 | depends_on: 36 | &airflow-common-depends-on 37 | redis: 38 | condition: service_healthy 39 | postgres: 40 | condition: service_healthy 41 | 42 | services: 43 | postgres: 44 | image: postgres:13 45 | environment: 46 | POSTGRES_USER: airflow 47 | POSTGRES_PASSWORD: airflow 48 | POSTGRES_DB: airflow 49 | volumes: 50 | - postgres-db-volume:/var/lib/postgresql/data 51 | healthcheck: 52 | test: ["CMD", "pg_isready", "-U", "airflow"] 53 | interval: 10s 54 | retries: 5 55 | start_period: 5s 56 | ports: 57 | - 5432:5432 58 | restart: always 59 | 60 | redis: 61 | image: redis:7.2-bookworm 62 | expose: 63 | - 6379 64 | healthcheck: 65 | test: ["CMD", "redis-cli", "ping"] 66 | interval: 10s 67 | timeout: 30s 68 | retries: 50 69 | start_period: 30s 70 | restart: always 71 | 72 | airflow-webserver: 73 | <<: *airflow-common 74 | command: webserver 75 | ports: 76 | - "8080:8080" 77 | healthcheck: 78 | test: ["CMD", "curl", "--fail", "http://localhost:8080/health"] 79 | interval: 30s 80 | timeout: 10s 81 | retries: 5 82 | start_period: 30s 83 | restart: always 84 | depends_on: 85 | <<: *airflow-common-depends-on 86 | airflow-init: 87 | condition: service_completed_successfully 88 | 89 | 
airflow-scheduler: 90 | <<: *airflow-common 91 | command: scheduler 92 | healthcheck: 93 | test: ["CMD", "curl", "--fail", "http://localhost:8974/health"] 94 | interval: 30s 95 | timeout: 10s 96 | retries: 5 97 | start_period: 30s 98 | restart: always 99 | depends_on: 100 | <<: *airflow-common-depends-on 101 | airflow-init: 102 | condition: service_completed_successfully 103 | 104 | airflow-worker: 105 | <<: *airflow-common 106 | command: celery worker 107 | healthcheck: 108 | test: 109 | - "CMD-SHELL" 110 | - 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}" || celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"' 111 | interval: 30s 112 | timeout: 10s 113 | retries: 5 114 | start_period: 30s 115 | environment: 116 | <<: *airflow-common-env 117 | DUMB_INIT_SETSID: "0" 118 | restart: always 119 | depends_on: 120 | <<: *airflow-common-depends-on 121 | airflow-init: 122 | condition: service_completed_successfully 123 | 124 | # airflow-triggerer: 125 | # <<: *airflow-common 126 | # command: triggerer 127 | # healthcheck: 128 | # test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"'] 129 | # interval: 30s 130 | # timeout: 10s 131 | # retries: 5 132 | # start_period: 30s 133 | # restart: always 134 | # depends_on: 135 | # <<: *airflow-common-depends-on 136 | # airflow-init: 137 | # condition: service_completed_successfully 138 | 139 | airflow-init: 140 | <<: *airflow-common 141 | entrypoint: /bin/bash 142 | command: 143 | - -c 144 | - | 145 | if [[ -z "${AIRFLOW_UID}" ]]; then 146 | echo 147 | echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m" 148 | echo "If you are on Linux, you SHOULD follow the instructions below to set " 149 | echo "AIRFLOW_UID environment variable, otherwise files will be owned by root." 150 | echo "For other operating systems you can get rid of the warning with manually created .env file:" 151 | echo " See: https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#setting-the-right-airflow-user" 152 | echo 153 | fi 154 | one_meg=1048576 155 | mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg)) 156 | cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat) 157 | disk_available=$$(df / | tail -1 | awk '{print $$4}') 158 | warning_resources="false" 159 | if (( mem_available < 4000 )) ; then 160 | echo 161 | echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m" 162 | echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))" 163 | echo 164 | warning_resources="true" 165 | fi 166 | if (( cpus_available < 2 )); then 167 | echo 168 | echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m" 169 | echo "At least 2 CPUs recommended. You have $${cpus_available}" 170 | echo 171 | warning_resources="true" 172 | fi 173 | if (( disk_available < one_meg * 10 )); then 174 | echo 175 | echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m" 176 | echo "At least 10 GBs recommended. 
You have $$(numfmt --to iec $$((disk_available * 1024 )))" 177 | echo 178 | warning_resources="true" 179 | fi 180 | if [[ $${warning_resources} == "true" ]]; then 181 | echo 182 | echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m" 183 | echo "Please follow the instructions to increase amount of resources available:" 184 | echo " https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#before-you-begin" 185 | echo 186 | fi 187 | 188 | airflow db init 189 | 190 | # Cria o usuário Admin 191 | airflow users create \ 192 | --username airflow \ 193 | --password airflow \ 194 | --firstname Airflow \ 195 | --lastname Admin \ 196 | --role Admin \ 197 | --email airflow@example.com 198 | 199 | mkdir -p /sources/logs /sources/dags /sources/plugins /sources/src 200 | chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins,src} 201 | # yamllint enable rule:line-length 202 | environment: 203 | <<: *airflow-common-env 204 | _AIRFLOW_DB_MIGRATE: 'true' 205 | _AIRFLOW_WWW_USER_CREATE: 'true' 206 | _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} 207 | _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} 208 | _PIP_ADDITIONAL_REQUIREMENTS: '' 209 | user: "0:0" 210 | volumes: 211 | - ${AIRFLOW_PROJ_DIR:-.}:/sources 212 | 213 | # airflow-cli: 214 | # <<: *airflow-common 215 | # profiles: 216 | # - debug 217 | # environment: 218 | # <<: *airflow-common-env 219 | # CONNECTION_CHECK_MAX_COUNT: "0" 220 | # command: 221 | # - bash 222 | # - -c 223 | # - airflow 224 | 225 | # flower: 226 | # <<: *airflow-common 227 | # command: celery flower 228 | # profiles: 229 | # - flower 230 | # ports: 231 | # - "5555:5555" 232 | # healthcheck: 233 | # test: ["CMD", "curl", "--fail", "http://localhost:5555/"] 234 | # interval: 30s 235 | # timeout: 10s 236 | # retries: 5 237 | # start_period: 30s 238 | # restart: always 239 | # depends_on: 240 | # <<: *airflow-common-depends-on 241 | # airflow-init: 242 | # condition: service_completed_successfully 243 | 244 | volumes: 245 | postgres-db-volume: 246 | -------------------------------------------------------------------------------- /docs/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## [0.1.0] - 2025-03-03 4 | 5 | ### Performance Optimizations 6 | 7 | #### 1. Pagination System Improvements 8 | - **Concurrent Processing** 9 | - Added `ThreadPoolExecutor` for parallel page fetching 10 | - Configurable number of workers (default: 5) 11 | - Each page fetch runs in a separate thread 12 | - Improved overall data fetching speed 13 | 14 | - **Batch Processing** 15 | - Implemented batch processing with configurable size (default: 10 pages) 16 | - Reduced number of database operations 17 | - Better memory management 18 | - More efficient data handling 19 | 20 | #### 2. 
Database Optimizations 21 | - **Connection Pooling** 22 | - Added SQLAlchemy connection pooling with `QueuePool` 23 | - Configurable pool settings: 24 | ```python 25 | pool_size=5 26 | max_overflow=10 27 | pool_timeout=30 28 | pool_pre_ping=True 29 | ``` 30 | - Better connection management and reuse 31 | - Improved performance under concurrent loads 32 | 33 | - **Transaction Management** 34 | - Added `execute_with_transaction` method for proper transaction handling 35 | - Using `with self.engine.begin()` for automatic transaction management 36 | - Better error handling and rollback support 37 | - Proper cleanup of resources 38 | 39 | - **Data Type Handling** 40 | - Improved numeric type handling: 41 | ```python 42 | numeric_columns = [ 43 | 'nSaldo', 'nValorDocumento', 'nSaldoAnterior', 'nSaldoAtual', 44 | 'nSaldoConciliado', 'nSaldoProvisorio', 'nLimiteCreditoTotal', 45 | 'nSaldoDisponivel' 46 | ] 47 | ``` 48 | - Using proper SQLAlchemy types (Numeric(15,2) for decimals) 49 | - Better handling of NULL and empty values 50 | - Proper type conversion and validation 51 | 52 | 53 | ### Code Structure Improvements 54 | 55 | #### 1. Database Class Enhancements 56 | - **New Methods** 57 | - Added `table_exists` method 58 | - Added `execute_with_transaction` method 59 | - Improved `save_into_db` method 60 | - Better transaction management 61 | 62 | ### Why These Changes? 63 | 64 | 1. **Performance** 65 | - The concurrent processing significantly reduces data fetching time 66 | - Batch processing reduces database load 67 | - Connection pooling improves resource utilization 68 | - Better memory management prevents memory leaks 69 | 70 | 2. **Reliability** 71 | - Better transaction management prevents data corruption 72 | 73 | ### Configuration Examples 74 | 75 | 76 | #### Pagination Settings 77 | ```python 78 | self.batch_size = 10 # Number of pages per batch 79 | self.max_workers = 5 # Number of concurrent workers 80 | ``` 81 | 82 | ### Future Improvements 83 | 1. Add monitoring and metrics collection 84 | 2. Implement caching for frequently accessed data 85 | 3. Add more comprehensive error reporting 86 | 4. Implement data validation before saving 87 | 5. Add support for bulk operations 88 | 6. 
Improve logging and debugging capabilities 89 | 90 | ### Breaking Changes 91 | - Changed database column types for numeric fields 92 | - Modified transaction handling 93 | - Updated API retry mechanism 94 | - Changed batch processing behavior 95 | 96 | ### Dependencies 97 | - Added SQLAlchemy connection pooling 98 | - Updated pandas data type handling 99 | - Added concurrent.futures for parallel processing 100 | - Enhanced logging with loguru 101 | -------------------------------------------------------------------------------- /gh_2.67.0_windows_amd64.msi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rphpacheco/omie_api_integration/24036cf0a4d0a3290c7ac3c45d2f967c4483ddbe/gh_2.67.0_windows_amd64.msi -------------------------------------------------------------------------------- /logs/scheduler/latest: -------------------------------------------------------------------------------- 1 | /opt/airflow/logs/scheduler/2025-02-06 2 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from src.controllers.paginations import PaginationController 2 | from src.endpoints import Endpoints 3 | 4 | endpoints = Endpoints() 5 | endpoints = endpoints.get_all() 6 | 7 | for endpoint in endpoints: 8 | resource = endpoint.get("resources", None) 9 | action = endpoint.get("action", None) 10 | params = endpoint.get("params", None) 11 | data_source = endpoint.get("data_source", None) 12 | pagination_type = endpoint.get("pagination_type", "per_page") 13 | page_label = endpoint.get("page_label", None) 14 | total_of_pages_label = endpoint.get("total_of_pages_label", None) 15 | records_label = endpoint.get("records_label", "registros") 16 | 17 | pagination = PaginationController() 18 | pagination = pagination.pagination( 19 | type=pagination_type, 20 | resource=resource, 21 | action=action, 22 | params=params, 23 | data_source=data_source, 24 | page_label=page_label, 25 | total_of_pages_label=total_of_pages_label, 26 | records_label=records_label, 27 | ) 28 | -------------------------------------------------------------------------------- /per_page.py: -------------------------------------------------------------------------------- 1 | from src.controllers.paginations import PaginationController 2 | from src.endpoints import Endpoints 3 | 4 | endpoints = Endpoints() 5 | endpoints = endpoints.get_endpoint(action="ListarExtrato") 6 | 7 | for endpoint in endpoints: 8 | resource = endpoint.get("resources", None) 9 | action = endpoint.get("action", None) 10 | params = endpoint.get("params", None) 11 | data_source = endpoint.get("data_source", None) 12 | pagination_type = endpoint.get("pagination_type", "per_page") 13 | 14 | pagination = PaginationController() 15 | 16 | if pagination_type == "date_range": 17 | depends_on = endpoint.get("depends_on", None) 18 | 19 | if depends_on: 20 | from src.db.database import Database 21 | 22 | db = Database() 23 | accounts = db.select_from_table( 24 | table_name=depends_on, distinct_column="nCodCC" 25 | ) 26 | 27 | for account in accounts: 28 | params["nCodCC"] = account 29 | 30 | pagination_execute = pagination.pagination( 31 | type=pagination_type, 32 | resource=resource, 33 | action=action, 34 | params=params, 35 | data_source=data_source, 36 | ) 37 | -------------------------------------------------------------------------------- /requirements.txt: 
-------------------------------------------------------------------------------- 1 | annotated-types==0.7.0 2 | certifi==2024.8.30 3 | charset-normalizer==3.4.0 4 | idna==3.10 5 | loguru==0.7.2 6 | numpy 7 | pandas==2.1.2 8 | psycopg2-binary==2.9.10 9 | pydantic==2.9.2 10 | pydantic-settings==2.6.0 11 | pydantic_core==2.23.4 12 | python-dateutil==2.9.0.post0 13 | python-dotenv==1.0.1 14 | pytz==2024.2 15 | requests==2.32.3 16 | setuptools==75.8.0 17 | six==1.16.0 18 | SQLAlchemy==1.4.51 19 | typing_extensions==4.12.2 20 | tzdata==2024.2 21 | urllib3==2.2.3 22 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rphpacheco/omie_api_integration/24036cf0a4d0a3290c7ac3c45d2f967c4483ddbe/src/__init__.py -------------------------------------------------------------------------------- /src/api/__init__.py: -------------------------------------------------------------------------------- 1 | from .api_instance import Api 2 | -------------------------------------------------------------------------------- /src/api/api_instance.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Union 2 | 3 | import requests 4 | from loguru import logger 5 | from requests.adapters import HTTPAdapter 6 | from requests.exceptions import RequestException 7 | from urllib3.util.retry import Retry 8 | 9 | 10 | class Session: 11 | """Manages HTTP session with retry mechanism.""" 12 | 13 | def __init__(self) -> None: 14 | self._session = requests.Session() 15 | self.retry = Retry( 16 | connect=1, 17 | read=1, 18 | total=5, 19 | backoff_factor=1, 20 | status_forcelist=[429, 500, 502, 503, 504], 21 | allowed_methods=["GET", "POST", "PUT", "DELETE"], 22 | respect_retry_after_header=True, 23 | ) 24 | self.adapter = HTTPAdapter(max_retries=self.retry) 25 | self._session.mount("http://", self.adapter) 26 | self._session.mount("https://", self.adapter) 27 | 28 | def get(self) -> Union[requests.Session, None]: 29 | return self._session 30 | 31 | 32 | class Api: 33 | def __init__( 34 | self, 35 | url: str, 36 | headers: dict = None, 37 | params: dict = None, 38 | json: dict = None, 39 | proxies: dict = None, 40 | ) -> None: 41 | self.url = url 42 | self.headers = headers 43 | self.params = params 44 | self.json = json 45 | self.verify = True 46 | self.proxies = proxies 47 | self.session = Session().get() 48 | self.timeout = 30 49 | 50 | def get(self) -> Union[requests.Response, None]: 51 | response = self.session.get( 52 | url=self.url, 53 | headers=self.headers, 54 | params=self.params, 55 | verify=self.verify, 56 | proxies=self.proxies, 57 | timeout=self.timeout, 58 | ) 59 | return response 60 | 61 | def post(self) -> Union[requests.Response, None]: 62 | response = self.session.post( 63 | url=self.url, 64 | headers=self.headers, 65 | params=self.params, 66 | json=self.json, 67 | verify=self.verify, 68 | proxies=self.proxies, 69 | timeout=self.timeout, 70 | ) 71 | return response 72 | 73 | def put(self) -> Union[requests.Response, None]: 74 | response = self.session.put( 75 | url=self.url, 76 | headers=self.headers, 77 | params=self.params, 78 | json=self.json, 79 | verify=self.verify, 80 | proxies=self.proxies, 81 | timeout=self.timeout, 82 | ) 83 | return response 84 | 85 | def delete(self) -> Union[requests.Response, None]: 86 | response = self.session.delete( 87 | url=self.url, 88 | headers=self.headers, 
89 | params=self.params, 90 | verify=self.verify, 91 | proxies=self.proxies, 92 | timeout=self.timeout, 93 | ) 94 | 95 | def request(self, method: Callable) -> Union[dict, str, None]: 96 | try: 97 | response = method() 98 | if 200 <= response.status_code < 300: 99 | try: 100 | return response.json() 101 | except ValueError: 102 | logger.warning( 103 | f"Status Code: {response.status_code}\n Success: Response content is not a JSON: {response.text}" 104 | ) 105 | return response.text 106 | else: 107 | logger.error( 108 | f"Status Code: {response.status_code}\n Error: {response.text}" 109 | ) 110 | return response.text 111 | except RequestException as error: 112 | return logger.error(f"Request failed: {error}") 113 | -------------------------------------------------------------------------------- /src/config/__init__.py: -------------------------------------------------------------------------------- 1 | from pydantic_settings import BaseSettings 2 | 3 | 4 | class Settings(BaseSettings): 5 | APP_KEY: str 6 | APP_SECRET: str 7 | BASE_URL: str 8 | DB_HOST: str 9 | DB_PORT: int 10 | DB_USERNAME: str 11 | DB_PASSWORD: str 12 | DB_NAME: str 13 | DATE_INIT: str = "01/01/2025" 14 | 15 | class Config: 16 | env_file = ".env" 17 | env_file_encoding = "utf-8" 18 | extra = "ignore" 19 | -------------------------------------------------------------------------------- /src/controllers/paginations/__init__.py: -------------------------------------------------------------------------------- 1 | from .paginations import PaginationController 2 | -------------------------------------------------------------------------------- /src/controllers/paginations/paginations.py: -------------------------------------------------------------------------------- 1 | import calendar 2 | from concurrent.futures import ThreadPoolExecutor, as_completed 3 | from datetime import datetime 4 | from typing import Literal 5 | 6 | from loguru import logger 7 | 8 | from src.api import Api 9 | from src.config import Settings 10 | from src.db import Database 11 | from src.utils.constants import HEADERS 12 | from src.utils.tools import ( 13 | generate_date_range, 14 | get_body_params_pagination, 15 | get_total_of_pages, 16 | ) 17 | 18 | settings = Settings() 19 | 20 | 21 | class PaginationController: 22 | def __init__(self) -> None: 23 | self.page = 1 24 | self.batch_size = 10 # Number of pages to process in each batch 25 | self.max_workers = 5 # Number of concurrent workers 26 | 27 | def fetch_page( 28 | self, 29 | page: int, 30 | resource: str, 31 | action: str, 32 | params: dict, 33 | page_label: str, 34 | data_source: str, 35 | records_label: str, 36 | ) -> tuple: 37 | """Fetch a single page of data from the API""" 38 | try: 39 | params[page_label] = page 40 | body = get_body_params_pagination( 41 | action=action, params=params, page=page, field_pagination=page_label 42 | ) 43 | 44 | api = Api( 45 | url=f"{settings.BASE_URL}{resource}", 46 | headers=HEADERS, 47 | json=body, 48 | params=params, 49 | ) 50 | response = api.request(api.post) 51 | 52 | records_fetched = response.get(records_label, 0) 53 | contents = response.get(data_source, []) 54 | 55 | # Remove blacklisted fields 56 | black_list = [ 57 | "tags", 58 | "recomendacoes", 59 | "homepage", 60 | "fax_ddd", 61 | "bloquear_exclusao", 62 | "produtor_rural", 63 | ] 64 | for content in contents: 65 | for item in black_list: 66 | if item in content: 67 | del content[item] 68 | 69 | logger.info(f"Page {page} has been fetched with {records_fetched} records.") 70 | return page, 
contents 71 | 72 | except Exception as e: 73 | logger.error(f"Error fetching page {page}: {e}") 74 | return page, None 75 | 76 | def process_batch(self, batch_pages: list, resource: str, db: Database) -> None: 77 | """Process a batch of pages and save to database""" 78 | try: 79 | all_contents = [] 80 | for page, contents in batch_pages: 81 | if contents: 82 | all_contents.extend(contents) 83 | 84 | if all_contents: 85 | if batch_pages[0][0] == 1: # First batch 86 | db.save_into_db(1, resource, all_contents, replace=True) 87 | else: 88 | db.save_into_db( 89 | batch_pages[0][0], resource, all_contents, replace=False 90 | ) 91 | 92 | except Exception as e: 93 | logger.error( 94 | f"Error processing batch starting with page {batch_pages[0][0]}: {e}" 95 | ) 96 | 97 | def per_page( 98 | self, 99 | resource: str, 100 | action: str, 101 | params: dict, 102 | data_source: str, 103 | page_label: str = "pagina", 104 | total_of_pages_label: str = "total_de_paginas", 105 | records_label: str = "registros", 106 | ): 107 | total_of_pages = get_total_of_pages( 108 | resource, action, params, page_label, total_of_pages_label, records_label 109 | ) 110 | 111 | db = Database() 112 | current_batch = [] 113 | 114 | with ThreadPoolExecutor(max_workers=self.max_workers) as executor: 115 | # Submit all pages for processing 116 | future_to_page = { 117 | executor.submit( 118 | self.fetch_page, 119 | page, 120 | resource, 121 | action, 122 | params.copy(), # Create a copy of params to avoid race conditions 123 | page_label, 124 | data_source, 125 | records_label, 126 | ): page 127 | for page in range(1, total_of_pages + 1) 128 | } 129 | 130 | for future in as_completed(future_to_page): 131 | page, contents = future.result() 132 | current_batch.append((page, contents)) 133 | 134 | # Process batch when it reaches batch_size or is the last batch 135 | if len(current_batch) >= self.batch_size or page == total_of_pages: 136 | self.process_batch(current_batch, resource, db) 137 | current_batch = [] 138 | 139 | def pagination( 140 | self, 141 | type: Literal["per_page", "date_range"], 142 | resource: str, 143 | action: str, 144 | params: dict, 145 | data_source: str, 146 | page_label: str = "pagina", 147 | total_of_pages_label: str = "total_de_paginas", 148 | records_label: str = "registros", 149 | ): 150 | match type: 151 | case "per_page": 152 | return self.per_page( 153 | resource=resource, 154 | action=action, 155 | params=params, 156 | data_source=data_source, 157 | page_label=page_label, 158 | total_of_pages_label=total_of_pages_label, 159 | records_label=records_label, 160 | ) 161 | case "date_range": 162 | return self.date_range( 163 | resource=resource, 164 | action=action, 165 | params=params, 166 | data_source=data_source, 167 | date_init=settings.DATE_INIT, 168 | ) 169 | 170 | def date_range( 171 | self, resource: str, action: str, params: dict, data_source: str, date_init: str 172 | ): 173 | dates = generate_date_range(date_init) 174 | 175 | for date in dates: 176 | date_obj = datetime.strptime(date, "%d/%m/%Y") 177 | last_day = calendar.monthrange(date_obj.year, date_obj.month)[1] 178 | end_of_month_date = date_obj.replace(day=last_day) 179 | end_of_month_date = end_of_month_date.strftime("%d/%m/%Y") 180 | 181 | params["dPeriodoInicial"] = date 182 | params["dPeriodoFinal"] = end_of_month_date 183 | 184 | body = get_body_params_pagination( 185 | action=action, 186 | params=params, 187 | ) 188 | 189 | api = Api(url=f"{settings.BASE_URL}{resource}", headers=HEADERS, json=body) 190 | response = 
api.request(api.post) 191 | 192 | records_fetched = len(response.get(f"{data_source}", 0)) 193 | 194 | logger.info( 195 | f"nCodCC: {params['nCodCC']} - Date {date} at {end_of_month_date} has been fetched with {records_fetched} records." 196 | ) 197 | 198 | db = Database() 199 | # Verificar este lance do parâmetro page em save_into_db 200 | db.save_into_db(self.page, resource, response) 201 | 202 | self.page += 1 203 | print(f"PAGE: {self.page}") 204 | -------------------------------------------------------------------------------- /src/db/__init__.py: -------------------------------------------------------------------------------- 1 | from .database import Database 2 | -------------------------------------------------------------------------------- /src/db/database.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pandas as pd 4 | from loguru import logger 5 | from sqlalchemy import create_engine, event, text, types 6 | from sqlalchemy.pool import QueuePool 7 | 8 | from src.config import Settings 9 | 10 | settings = Settings() 11 | 12 | 13 | class Database: 14 | """ 15 | A class used to manage interactions with a PostgreSQL database, including creating connections, 16 | retrieving table columns, updating table structures, and saving data. 17 | """ 18 | 19 | def __init__(self): 20 | """ 21 | Initializes the Database instance, establishing a connection to the database. 22 | 23 | Attributes: 24 | engine (sqlalchemy.engine.base.Engine): The SQLAlchemy engine used to connect to the database. 25 | connection (sqlalchemy.engine.base.Connection): The active connection to the database. 26 | """ 27 | self.engine = self.get_engine() 28 | self.connection = self.engine.connect() 29 | 30 | def get_engine(self): 31 | """Creates a SQLAlchemy engine with connection pooling""" 32 | connection_string = f"postgresql://{settings.DB_USERNAME}:{settings.DB_PASSWORD}@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}" 33 | engine = create_engine( 34 | connection_string, 35 | poolclass=QueuePool, 36 | pool_size=5, 37 | max_overflow=10, 38 | pool_timeout=30, 39 | pool_pre_ping=True, 40 | ) 41 | 42 | # Add event listeners for connection pooling 43 | @event.listens_for(engine, "connect") 44 | def connect(dbapi_connection, connection_record): 45 | connection_record.info["pid"] = os.getpid() 46 | 47 | @event.listens_for(engine, "checkout") 48 | def checkout(dbapi_connection, connection_record, connection_proxy): 49 | pid = os.getpid() 50 | if connection_record.info["pid"] != pid: 51 | connection_record.connection = connection_proxy.connection = None 52 | from sqlalchemy import exc 53 | 54 | raise exc.DisconnectionError( 55 | "Connection record belongs to pid %s, " 56 | "attempting to check out in pid %s" 57 | % (connection_record.info["pid"], pid) 58 | ) 59 | 60 | return engine 61 | 62 | def execute_with_transaction(self, query, params=None): 63 | """Execute a query within a transaction""" 64 | with self.engine.begin() as connection: 65 | if params: 66 | return connection.execute(query, params) 67 | return connection.execute(query) 68 | 69 | def get_columns_of_db(self, table_name: str): 70 | """ 71 | Retrieves the column names of a specified table from the database. 72 | 73 | Args: 74 | table_name (str): The name of the table for which column names are retrieved. 75 | 76 | Returns: 77 | list: A list of column names in the specified table. 
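        Example (illustrative only; assumes a table named "clientes" has already
        been created by save_into_db):

            db = Database()
            columns = db.get_columns_of_db("clientes")
            # -> something like ["codigo_cliente_omie", "razao_social", ...]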
78 | """ 79 | query = text( 80 | f""" 81 | SELECT column_name 82 | FROM information_schema.columns 83 | WHERE table_name = '{table_name}'; 84 | """ 85 | ) 86 | result = self.execute_with_transaction(query) 87 | return [row[0] for row in result] 88 | 89 | def update_table_structure(self, table_name: str, new_columns): 90 | """Updates table structure to match new columns""" 91 | try: 92 | existing_columns = self.get_columns_of_db(table_name) 93 | with self.engine.begin() as connection: 94 | for column in new_columns: 95 | if column not in existing_columns: 96 | alter_query = text( 97 | f'ALTER TABLE {table_name} ADD COLUMN "{column}" TEXT' 98 | ) 99 | connection.execute(alter_query) 100 | except Exception as e: 101 | logger.error(f"Error updating table structure for {table_name}: {e}") 102 | raise 103 | 104 | def save_into_db( 105 | self, page: int, resource: str, content: dict, replace: bool = False 106 | ): 107 | """ 108 | Enhanced version of save_into_db that handles batch processing 109 | 110 | Args: 111 | page (int): The page number or batch start page 112 | resource (str): The resource identifier 113 | content (dict): The data to save 114 | replace (bool): Whether to replace the existing table (True for first batch) 115 | """ 116 | table_name = resource.split("/")[-2] 117 | 118 | try: 119 | # Convert content to DataFrame 120 | if isinstance(content, dict): 121 | for key, value in content.items(): 122 | if isinstance(value, list) and value and isinstance(value[0], dict): 123 | parent_keys = [k for k in content.keys() if k != key] 124 | df = pd.json_normalize( 125 | content, record_path=key, meta=parent_keys 126 | ) 127 | else: 128 | df = pd.json_normalize(content) 129 | 130 | # Convert numeric columns to appropriate types 131 | # TODO: Add more numeric columns to the list or make it dynamic 132 | numeric_columns = [ 133 | "nSaldo", 134 | "nValorDocumento", 135 | "nSaldoAnterior", 136 | "nSaldoAtual", 137 | "nSaldoConciliado", 138 | "nSaldoProvisorio", 139 | "nLimiteCreditoTotal", 140 | "nSaldoDisponivel", 141 | ] 142 | 143 | for col in df.columns: 144 | if col in numeric_columns and col in df.columns: 145 | df[col] = pd.to_numeric( 146 | df[col].replace(["", None], "0"), errors="coerce" 147 | ) 148 | elif df[col].dtype == "object": 149 | df[col] = df[col].astype(str) 150 | 151 | # Create table with correct column types if it doesn't exist 152 | if replace or not self.table_exists(table_name): 153 | with self.engine.begin() as connection: 154 | # Drop table if replacing 155 | if replace and self.table_exists(table_name): 156 | connection.execute(text(f"DROP TABLE IF EXISTS {table_name}")) 157 | 158 | # Create column definitions 159 | columns = [] 160 | for col in df.columns: 161 | if col in numeric_columns: 162 | columns.append(f'"{col}" NUMERIC(15,2)') 163 | else: 164 | columns.append(f'"{col}" TEXT') 165 | 166 | create_table_sql = f"CREATE TABLE IF NOT EXISTS {table_name} ({', '.join(columns)})" 167 | connection.execute(text(create_table_sql)) 168 | 169 | # Define SQLAlchemy types for columns 170 | dtype = {} 171 | for col in df.columns: 172 | if col in numeric_columns: 173 | dtype[col] = types.Numeric(15, 2) 174 | else: 175 | dtype[col] = types.Text() 176 | 177 | # Use SQLAlchemy engine directly for better performance 178 | df.to_sql( 179 | table_name, 180 | self.engine, 181 | if_exists="append", 182 | index=False, 183 | method="multi", 184 | chunksize=1000, 185 | dtype=dtype, 186 | ) 187 | 188 | logger.success( 189 | f"{'Replaced' if replace else 'Appended'} data into table 
{table_name} starting from page {page}" 190 | ) 191 | 192 | except Exception as e: 193 | logger.error(f"Error saving data into table {table_name}: {e}") 194 | raise 195 | 196 | def table_exists(self, table_name: str) -> bool: 197 | """Check if a table exists in the database""" 198 | query = text( 199 | """ 200 | SELECT EXISTS ( 201 | SELECT FROM information_schema.tables 202 | WHERE table_name = :table_name 203 | ) 204 | """ 205 | ) 206 | result = self.execute_with_transaction( 207 | query, {"table_name": table_name} 208 | ).scalar() 209 | return bool(result) 210 | 211 | def select_from_table(self, table_name: str, distinct_column: str = None): 212 | try: 213 | if distinct_column: 214 | query = text(f'SELECT DISTINCT "{distinct_column}" FROM {table_name}') 215 | result = self.execute_with_transaction(query) 216 | return [row[0] for row in result] 217 | else: 218 | query = text(f"SELECT * FROM {table_name}") 219 | result = self.execute_with_transaction(query) 220 | return [dict(row._mapping) for row in result] 221 | except Exception as e: 222 | logger.error(f"Error selecting data from table {table_name}: {e}") 223 | return None 224 | 225 | def __del__(self): 226 | """Ensure proper cleanup of database connections""" 227 | try: 228 | if hasattr(self, "connection"): 229 | self.connection.close() 230 | if hasattr(self, "engine"): 231 | self.engine.dispose() 232 | except Exception: 233 | pass 234 | -------------------------------------------------------------------------------- /src/endpoints/__init__.py: -------------------------------------------------------------------------------- 1 | from .endpoints import Endpoints 2 | -------------------------------------------------------------------------------- /src/endpoints/data/data.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "resources": "geral/clientes/", 4 | "action": "ListarClientes", 5 | "params": { 6 | "pagina": 1, 7 | "registros_por_pagina": 100, 8 | "apenas_importado_api": "N" 9 | }, 10 | "data_source": "clientes_cadastro", 11 | "page_label": "pagina" 12 | }, 13 | { 14 | "resources": "geral/categorias/", 15 | "action": "ListarCategorias", 16 | "params": { 17 | "pagina": 1, 18 | "registros_por_pagina": 100, 19 | "apenas_importado_api": "N" 20 | }, 21 | "data_source": "categoria_cadastro", 22 | "page_label": "pagina" 23 | }, 24 | { 25 | "resources": "geral/empresas/", 26 | "action": "ListarEmpresas", 27 | "params": { 28 | "pagina": 1, 29 | "registros_por_pagina": 100, 30 | "apenas_importado_api": "N" 31 | }, 32 | "data_source": "empresas_cadastro", 33 | "page_label": "pagina" 34 | }, 35 | { 36 | "resources": "geral/departamentos/", 37 | "action": "ListarDepartamentos", 38 | "params": { 39 | "pagina": 1, 40 | "registros_por_pagina": 100 41 | }, 42 | "data_source": "departamentos", 43 | "page_label": "pagina" 44 | }, 45 | { 46 | "resources": "financas/mf/", 47 | "action": "ListarMovimentos", 48 | "params": { 49 | "nPagina": 1, 50 | "nRegPorPagina": 100 51 | }, 52 | "data_source": "movimentos", 53 | "page_label": "nPagina", 54 | "total_of_pages_label": "nTotPaginas", 55 | "records_label": "nRegistros" 56 | }, 57 | { 58 | "resources": "geral/contacorrente/", 59 | "action": "ListarContasCorrentes", 60 | "params": { 61 | "pagina": 1, 62 | "registros_por_pagina": 100, 63 | "apenas_importado_api": "N" 64 | }, 65 | "data_source": "ListarContasCorrentes", 66 | "page_label": "pagina" 67 | }, 68 | { 69 | "resources": "financas/extrato/", 70 | "action": "ListarExtrato", 71 | "params": { 72 | "nCodCC": 0, 73 | 
"cCodIntCC": "", 74 | "dPeriodoInicial": "", 75 | "dPeriodoFinal": "" 76 | }, 77 | "data_source": "listaMovimentos", 78 | "pagination_type": "date_range", 79 | "depends_on": "contacorrente" 80 | }, 81 | { 82 | "resources": "geral/produtos/", 83 | "action": "ListarProdutos", 84 | "params": { 85 | "pagina": 1, 86 | "registros_por_pagina": 100, 87 | "apenas_importado_api": "N", 88 | "filtrar_apenas_omiepdv": "N" 89 | }, 90 | "data_source": "produto_servico_cadastro", 91 | "page_label": "pagina" 92 | }, 93 | { 94 | "resources": "financas/contapagar/", 95 | "action": "ListarContasPagar", 96 | "params": { 97 | "pagina": 1, 98 | "registros_por_pagina": 100, 99 | "apenas_importado_api": "N" 100 | }, 101 | "data_source": "conta_pagar_cadastro", 102 | "page_label": "pagina" 103 | }, 104 | { 105 | "resources": "financas/contareceber/", 106 | "action": "ListarContasReceber", 107 | "params": { 108 | "pagina": 1, 109 | "registros_por_pagina": 100, 110 | "apenas_importado_api": "N" 111 | }, 112 | "data_source": "conta_receber_cadastro", 113 | "page_label": "pagina" 114 | }, 115 | { 116 | "resources": "financas/pesquisartitulos/", 117 | "action": "PesquisarLancamentos", 118 | "params": { 119 | "nPagina": 1, 120 | "nRegPorPagina": 100 121 | }, 122 | "data_source": "titulosEncontrados", 123 | "page_label": "nPagina", 124 | "total_of_pages_label": "nTotPaginas", 125 | "records_label": "nRegistros" 126 | } 127 | 128 | 129 | ] 130 | -------------------------------------------------------------------------------- /src/endpoints/endpoints.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Optional 3 | 4 | 5 | def read_json(path: str) -> dict: 6 | with open(path, "r") as file: 7 | return json.load(file) 8 | 9 | 10 | class Endpoints: 11 | def __init__(self) -> None: 12 | self.path = "src/endpoints/data/data.json" 13 | self.endpoints = read_json(self.path) 14 | 15 | def get_endpoint( 16 | self, resource: Optional[str] = None, action: Optional[str] = None 17 | ) -> dict: 18 | if action: 19 | for endpoint in self.endpoints: 20 | if endpoint.get("action") == action: 21 | return [endpoint] 22 | elif resource: 23 | for endpoint in self.endpoints: 24 | if endpoint.get("resources") == resource: 25 | return [endpoint] 26 | else: 27 | raise Exception("Resource or action not found") 28 | 29 | def get_all(self) -> list: 30 | return self.endpoints 31 | -------------------------------------------------------------------------------- /src/utils/constants.py: -------------------------------------------------------------------------------- 1 | HEADERS = {"Content-Type": "application/json"} 2 | -------------------------------------------------------------------------------- /src/utils/tools.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import Optional 3 | 4 | from src.api import Api 5 | from src.config import Settings 6 | from src.utils.constants import HEADERS 7 | 8 | settings = Settings() 9 | 10 | 11 | def get_body_params_pagination( 12 | action: str, 13 | params: dict, 14 | page: Optional[int] = None, 15 | field_pagination: Optional[str] = None, 16 | ) -> dict: 17 | if field_pagination: 18 | params[field_pagination] = page 19 | 20 | return { 21 | "call": action, 22 | "app_key": settings.APP_KEY, 23 | "app_secret": settings.APP_SECRET, 24 | "param": [params], 25 | } 26 | 27 | 28 | def get_total_of_pages( 29 | resource: str, 30 | action: str, 31 | params: dict, 32 | 
page_label: Optional[str] = None, 33 | total_of_pages_label: Optional[str] = None, 34 | records_label: Optional[str] = None, 35 | ) -> int: 36 | page_label = "pagina" if page_label is None else page_label 37 | total_of_pages_label = ( 38 | "total_de_paginas" if total_of_pages_label is None else total_of_pages_label 39 | ) 40 | records_label = "registros" if records_label is None else records_label  # currently unused 41 | 42 | payload = get_body_params_pagination(action, params, 1, page_label) 43 | 44 | api = Api( 45 | url=f"{settings.BASE_URL}{resource}", 46 | headers=HEADERS, 47 | json=payload, 48 | params=params, 49 | ) 50 | response = api.request(api.post) 51 | total_of_pages = response.get(total_of_pages_label, 0) 52 | 53 | return total_of_pages 54 | 55 | 56 | def generate_date_range(start_date_str: str) -> list: 57 | def add_month(data): 58 | new_month = data.month + 1 59 | new_year = data.year 60 | if new_month > 12: 61 | new_month = 1 62 | new_year += 1 63 | 64 | return data.replace(month=new_month, year=new_year) 65 | 66 | start_date = datetime.strptime(start_date_str, "%d/%m/%Y") 67 | start_date = start_date.replace(day=1) 68 | # e.g. "25/01/2025" is snapped to "01/01/2025" (first day of the month) 69 | # so the result steps month by month: ["01/01/2025", "01/02/2025", ...] 70 | 71 | today = datetime.today() 72 | 73 | date_list = [] 74 | 75 | current_date = start_date 76 | while current_date <= today: 77 | date_list.append(current_date.strftime("%d/%m/%Y")) 78 | current_date = add_month(current_date) 79 | 80 | return date_list 81 | --------------------------------------------------------------------------------
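How these pieces fit together is not shown in this excerpt (the repository's main.py and the Airflow DAGs are not reproduced above), so the following is only a rough sketch of a driver that loads every endpoint definition, pages through it with the helpers from src/utils/tools.py, and hands each response to Database.save_into_db. Only the imported classes and functions come from the files above; the loop structure, the skipping of date_range endpoints, and the replace-on-first-page choice are illustrative assumptions, not the project's actual flow.

```python
# Hypothetical driver wiring the components shown above; the real main.py may differ.
from src.api import Api
from src.config import Settings
from src.db import Database
from src.endpoints import Endpoints
from src.utils.constants import HEADERS
from src.utils.tools import get_body_params_pagination, get_total_of_pages

settings = Settings()
db = Database()

for endpoint in Endpoints().get_all():
    # Date-range endpoints (e.g. financas/extrato/) depend on data fetched by
    # another endpoint first, so this simplified sketch skips them.
    if endpoint.get("pagination_type") == "date_range":
        continue

    resource = endpoint["resources"]
    action = endpoint["action"]
    params = dict(endpoint["params"])
    page_label = endpoint.get("page_label", "pagina")

    # Ask the API how many pages exist for this resource.
    total_pages = get_total_of_pages(
        resource,
        action,
        params,
        page_label=page_label,
        total_of_pages_label=endpoint.get("total_of_pages_label"),
        records_label=endpoint.get("records_label"),
    )

    for page in range(1, total_pages + 1):
        payload = get_body_params_pagination(action, params, page, page_label)
        api = Api(
            url=f"{settings.BASE_URL}{resource}",
            headers=HEADERS,
            json=payload,
            params=params,
        )
        response = api.request(api.post)
        # Replace the table on the first page, append on later ones.
        db.save_into_db(page, resource, response, replace=(page == 1))
```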