├── .gitignore ├── Dockerfile ├── README.md ├── config └── airflow.cfg ├── dags ├── example.py └── execute_entities.py ├── docker-compose.yml ├── docs └── CHANGELOG.md ├── gh_2.67.0_windows_amd64.msi ├── logs └── scheduler │ └── latest ├── main.py ├── per_page.py ├── requirements.txt └── src ├── __init__.py ├── api ├── __init__.py └── api_instance.py ├── config └── __init__.py ├── controllers └── paginations │ ├── __init__.py │ └── paginations.py ├── db ├── __init__.py └── database.py ├── endpoints ├── __init__.py ├── data │ └── data.json └── endpoints.py └── utils ├── constants.py └── tools.py /.gitignore: -------------------------------------------------------------------------------- 1 | todo 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # poetry 99 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 100 | # This is especially recommended for binary packages to ensure reproducibility, and is more 101 | # commonly ignored for libraries. 102 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 103 | #poetry.lock 104 | 105 | # pdm 106 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 107 | #pdm.lock 108 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 109 | # in version control. 110 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 111 | .pdm.toml 112 | .pdm-python 113 | .pdm-build/ 114 | 115 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 116 | __pypackages__/ 117 | 118 | # Celery stuff 119 | celerybeat-schedule 120 | celerybeat.pid 121 | 122 | # SageMath parsed files 123 | *.sage.py 124 | 125 | # Environments 126 | .env 127 | .venv 128 | env/ 129 | venv/ 130 | ENV/ 131 | env.bak/ 132 | venv.bak/ 133 | 134 | # Spyder project settings 135 | .spyderproject 136 | .spyproject 137 | 138 | # Rope project settings 139 | .ropeproject 140 | 141 | # mkdocs documentation 142 | /site 143 | 144 | # mypy 145 | .mypy_cache/ 146 | .dmypy.json 147 | dmypy.json 148 | 149 | # Pyre type checker 150 | .pyre/ 151 | 152 | # pytype static type analyzer 153 | .pytype/ 154 | 155 | # Cython debug symbols 156 | cython_debug/ 157 | 158 | # PyCharm 159 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 160 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 161 | # and can be added to the global gitignore or merged into this file. For a more nuclear 162 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 163 | #.idea/ 164 | link.pem 165 | .gitignore 166 | .pre-commit-config.yaml 167 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM apache/airflow:2.10.4 2 | ADD requirements.txt . 3 | RUN pip install --upgrade pip && pip install -r requirements.txt 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Omie API Integration 2 | 3 | This repository provides a Python-based integration with the Omie API. The project fetches data from various Omie API endpoints, cleans the data by removing unwanted fields, and stores the processed results into a PostgreSQL database. 4 | 5 | ## Table of Contents 6 | 7 | - [Features](#features) 8 | - [Prerequisites](#prerequisites) 9 | - [Installation](#installation) 10 | - [Configuration](#configuration) 11 | - [How It Works](#how-it-works) 12 | - [Usage](#usage) 13 | - [Contributing](#contributing) 14 | - [License](#license) 15 | 16 | ## Features 17 | 18 | - **API Integration**: Retrieves data from Omie API endpoints. 19 | - **Pagination Handling**: Automatically iterates through multiple pages based on the API's response. 20 | - **Data Cleaning**: Removes unnecessary fields (e.g., `tags`, `recomendacoes`, `homepage`, `fax_ddd`, `bloquear_exclusao`, `produtor_rural`) from the API response. 21 | - **Database Storage**: Uses Pandas and SQLAlchemy to store data into a PostgreSQL database. 22 | - **Optional File Saving**: Provides an option to save the API response as JSON files. 23 | 24 | ## Prerequisites 25 | 26 | - **Python 3.7+** 27 | - **PostgreSQL**: A running PostgreSQL instance. 
28 | - Required Python libraries:
29 |   - `pandas`
30 |   - `sqlalchemy`
31 |   - `requests` (used by the custom API client in `src/api/`)
32 | 
33 | Install the Python dependencies via pip:
34 | ```bash
35 | pip install pandas sqlalchemy requests
36 | ```
37 | 
38 | ## Installation
39 | Clone the repository:
40 | ```
41 | git clone https://github.com/rphpacheco/omie_api_integration.git
42 | ```
43 | Navigate to the project directory:
44 | 
45 | ```
46 | cd omie_api_integration
47 | ```
48 | 
49 | (Optional) Create and activate a virtual environment:
50 | 
51 | ```
52 | python -m venv venv
53 | source venv/bin/activate # On Windows: venv\Scripts\activate
54 | ```
55 | 
56 | Install dependencies from the provided `requirements.txt`:
57 | 
58 | ```
59 | pip install -r requirements.txt
60 | ```
61 | 
62 | Alternatively, install the dependencies manually as shown above.
63 | 
64 | ## Configuration
65 | **Environment Variables:**
66 | 
67 | Create a `.env` file in the root directory using the provided `.env-pattern` as a template:
68 | 
69 | ```
70 | cp .env-pattern .env
71 | ```
72 | 
73 | Edit the `.env` file with your credentials:
74 | 
75 | ```
76 | APP_KEY=your_app_key_here
77 | APP_SECRET=your_app_secret_here
78 | BASE_URL=https://api.omie.com.br/api/v1/
79 | DB_HOST=your_db_host
80 | DB_PORT=your_db_port
81 | DB_USERNANE=your_db_username # Note: The variable name is 'DB_USERNANE' in this project.
82 | DB_PASSWORD=your_db_password
83 | DB_NAME=your_db_name
84 | ```
85 | 
86 | ## How It Works
87 | 
88 | **Configuration & Setup:**
89 | The project loads settings from the environment and uses a custom configuration class (`src/config/`) to manage API and database credentials.
90 | 
91 | **Fetching Endpoints:**
92 | Endpoints are defined and retrieved via the `Endpoints` class (`src/endpoints/endpoints.py`), which provides the list of API endpoints to be queried.
93 | 
94 | **Data Retrieval & Pagination:**
95 | For each endpoint, the script:
96 | 
97 | - Determines the total number of pages available by making an initial API request.
98 | - Iterates through each page, updating the page parameter in the request.
99 | - Sends a POST request to the Omie API using the custom `Api` class (`src/api/`).
100 | 
101 | **Data Processing:**
102 | After receiving the data, the script:
103 | - Removes unwanted fields using a predefined blacklist.
104 | - Normalizes the JSON data using Pandas.
105 | 
106 | **Data Storage:**
107 | The processed data is stored in a PostgreSQL database:
108 | - For the first page of data, the corresponding table is created (or replaced).
109 | - For subsequent pages, the data is appended to the table.
110 | 
111 | **File Saving Option:** There is also functionality to save the raw JSON response to a file. An illustrative sketch of this end-to-end flow is included in the appendix at the end of this README.
112 | 
113 | ## Usage
114 | 
115 | Run the main integration script with:
116 | 
117 | ```
118 | python main.py
119 | ```
120 | 
121 | As the script runs, it will:
122 | - Connect to the Omie API using your credentials.
123 | - Retrieve and process data from each endpoint.
124 | - Store the data in your PostgreSQL database.
125 | - Output progress messages to the console, including the total pages and records fetched.
126 | 
127 | ## Contributing
128 | Contributions are welcome! If you have suggestions or improvements, feel free to fork the repository and submit a pull request.
129 | 
130 | ## License
131 | This project does not specify a license. Please contact the repository owner for more details.
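
## Appendix: Illustrative Flow Sketch

The sketch below is **not** taken from this repository; it is a minimal, hedged illustration of the flow described in [How It Works](#how-it-works), calling `requests`, `pandas`, and `sqlalchemy` directly instead of the project's own `Api`, `Endpoints`, and configuration classes. The request/response shape (`pagina`, `registros_por_pagina`, `total_de_paginas`), the `ListarClientes` call, and the `clientes_cadastro` response key are assumptions about typical Omie list endpoints; the field blacklist comes from the Features section above. Consult `main.py` and `src/` for the actual implementation.

```python
"""Illustrative only: condensed fetch -> clean -> store flow (not the project's code)."""
import os

import pandas as pd
import requests
from sqlalchemy import create_engine

BASE_URL = os.getenv("BASE_URL", "https://api.omie.com.br/api/v1/")

# Fields dropped from every record before storage (blacklist from the README).
BLACKLIST = {"tags", "recomendacoes", "homepage", "fax_ddd",
             "bloquear_exclusao", "produtor_rural"}


def fetch_page(resource, call, page, per_page=50):
    """POST one page of an Omie list endpoint (payload shape is an assumption)."""
    payload = {
        "call": call,
        "app_key": os.getenv("APP_KEY"),
        "app_secret": os.getenv("APP_SECRET"),
        "param": [{"pagina": page, "registros_por_pagina": per_page}],
    }
    response = requests.post(f"{BASE_URL}{resource}", json=payload, timeout=30)
    response.raise_for_status()
    return response.json()


def clean(records):
    """Drop blacklisted fields from each record."""
    return [{k: v for k, v in row.items() if k not in BLACKLIST} for row in records]


def sync_endpoint(resource, call, records_key, table):
    """Paginate through one endpoint and load the cleaned records into PostgreSQL."""
    engine = create_engine(
        f"postgresql://{os.getenv('DB_USERNANE')}:{os.getenv('DB_PASSWORD')}"
        f"@{os.getenv('DB_HOST')}:{os.getenv('DB_PORT')}/{os.getenv('DB_NAME')}"
    )
    first = fetch_page(resource, call, page=1)
    total_pages = first.get("total_de_paginas", 1)  # assumed response field

    for page in range(1, total_pages + 1):
        data = first if page == 1 else fetch_page(resource, call, page)
        df = pd.json_normalize(clean(data.get(records_key, [])))
        # First page replaces the table; subsequent pages append to it.
        df.to_sql(table, engine, if_exists="replace" if page == 1 else "append", index=False)
        print(f"{table}: stored page {page}/{total_pages} ({len(df)} records)")


if __name__ == "__main__":
    # Hypothetical example: the customer listing endpoint.
    sync_endpoint("geral/clientes/", "ListarClientes", "clientes_cadastro", "clientes")
```

Using `if_exists="replace"` for the first page and `"append"` afterwards mirrors the behaviour described under Data Storage; the real table names, endpoint paths, and response keys should be taken from `main.py` and `src/endpoints/`.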
132 | -------------------------------------------------------------------------------- /config/airflow.cfg: -------------------------------------------------------------------------------- 1 | [core] 2 | # The folder where your airflow pipelines live, most likely a 3 | # subfolder in a code repository. This path must be absolute. 4 | # 5 | # Variable: AIRFLOW__CORE__DAGS_FOLDER 6 | # 7 | dags_folder = /opt/airflow/dags 8 | 9 | # Hostname by providing a path to a callable, which will resolve the hostname. 10 | # The format is "package.function". 11 | # 12 | # For example, default value ``airflow.utils.net.getfqdn`` means that result from patched 13 | # version of `socket.getfqdn() `__, 14 | # see related `CPython Issue `__. 15 | # 16 | # No argument should be required in the function specified. 17 | # If using IP address as hostname is preferred, use value ``airflow.utils.net.get_host_ip_address`` 18 | # 19 | # Variable: AIRFLOW__CORE__HOSTNAME_CALLABLE 20 | # 21 | hostname_callable = airflow.utils.net.getfqdn 22 | 23 | # A callable to check if a python file has airflow dags defined or not and should 24 | # return ``True`` if it has dags otherwise ``False``. 25 | # If this is not provided, Airflow uses its own heuristic rules. 26 | # 27 | # The function should have the following signature 28 | # 29 | # .. code-block:: python 30 | # 31 | # def func_name(file_path: str, zip_file: zipfile.ZipFile | None = None) -> bool: ... 32 | # 33 | # Variable: AIRFLOW__CORE__MIGHT_CONTAIN_DAG_CALLABLE 34 | # 35 | might_contain_dag_callable = airflow.utils.file.might_contain_dag_via_default_heuristic 36 | 37 | # Default timezone in case supplied date times are naive 38 | # can be `UTC` (default), `system`, or any `IANA ` 39 | # timezone string (e.g. Europe/Amsterdam) 40 | # 41 | # Variable: AIRFLOW__CORE__DEFAULT_TIMEZONE 42 | # 43 | default_timezone = utc 44 | 45 | # The executor class that airflow should use. Choices include 46 | # ``SequentialExecutor``, ``LocalExecutor``, ``CeleryExecutor``, 47 | # ``KubernetesExecutor``, ``CeleryKubernetesExecutor``, ``LocalKubernetesExecutor`` or the 48 | # full import path to the class when using a custom executor. 49 | # 50 | # Variable: AIRFLOW__CORE__EXECUTOR 51 | # 52 | executor = SequentialExecutor 53 | 54 | # The auth manager class that airflow should use. Full import path to the auth manager class. 55 | # 56 | # Variable: AIRFLOW__CORE__AUTH_MANAGER 57 | # 58 | auth_manager = airflow.providers.fab.auth_manager.fab_auth_manager.FabAuthManager 59 | 60 | # This defines the maximum number of task instances that can run concurrently per scheduler in 61 | # Airflow, regardless of the worker count. Generally this value, multiplied by the number of 62 | # schedulers in your cluster, is the maximum number of task instances with the running 63 | # state in the metadata database. Setting this value to zero allows unlimited parallelism. 64 | # 65 | # Variable: AIRFLOW__CORE__PARALLELISM 66 | # 67 | parallelism = 32 68 | 69 | # The maximum number of task instances allowed to run concurrently in each DAG. To calculate 70 | # the number of tasks that is running concurrently for a DAG, add up the number of running 71 | # tasks for all DAG runs of the DAG. This is configurable at the DAG level with ``max_active_tasks``, 72 | # which is defaulted as ``[core] max_active_tasks_per_dag``. 73 | # 74 | # An example scenario when this would be useful is when you want to stop a new dag with an early 75 | # start date from stealing all the executor slots in a cluster. 
76 | # 77 | # Variable: AIRFLOW__CORE__MAX_ACTIVE_TASKS_PER_DAG 78 | # 79 | max_active_tasks_per_dag = 16 80 | 81 | # Are DAGs paused by default at creation 82 | # 83 | # Variable: AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION 84 | # 85 | dags_are_paused_at_creation = True 86 | 87 | # The maximum number of active DAG runs per DAG. The scheduler will not create more DAG runs 88 | # if it reaches the limit. This is configurable at the DAG level with ``max_active_runs``, 89 | # which is defaulted as ``[core] max_active_runs_per_dag``. 90 | # 91 | # Variable: AIRFLOW__CORE__MAX_ACTIVE_RUNS_PER_DAG 92 | # 93 | max_active_runs_per_dag = 16 94 | 95 | # (experimental) The maximum number of consecutive DAG failures before DAG is automatically paused. 96 | # This is also configurable per DAG level with ``max_consecutive_failed_dag_runs``, 97 | # which is defaulted as ``[core] max_consecutive_failed_dag_runs_per_dag``. 98 | # If not specified, then the value is considered as 0, 99 | # meaning that the dags are never paused out by default. 100 | # 101 | # Variable: AIRFLOW__CORE__MAX_CONSECUTIVE_FAILED_DAG_RUNS_PER_DAG 102 | # 103 | max_consecutive_failed_dag_runs_per_dag = 0 104 | 105 | # The name of the method used in order to start Python processes via the multiprocessing module. 106 | # This corresponds directly with the options available in the Python docs: 107 | # `multiprocessing.set_start_method 108 | # `__ 109 | # must be one of the values returned by `multiprocessing.get_all_start_methods() 110 | # `__. 111 | # 112 | # Example: mp_start_method = fork 113 | # 114 | # Variable: AIRFLOW__CORE__MP_START_METHOD 115 | # 116 | # mp_start_method = 117 | 118 | # Whether to load the DAG examples that ship with Airflow. It's good to 119 | # get started, but you probably want to set this to ``False`` in a production 120 | # environment 121 | # 122 | # Variable: AIRFLOW__CORE__LOAD_EXAMPLES 123 | # 124 | load_examples = True 125 | 126 | # Path to the folder containing Airflow plugins 127 | # 128 | # Variable: AIRFLOW__CORE__PLUGINS_FOLDER 129 | # 130 | plugins_folder = /opt/airflow/plugins 131 | 132 | # Should tasks be executed via forking of the parent process 133 | # 134 | # * ``False``: Execute via forking of the parent process 135 | # * ``True``: Spawning a new python process, slower than fork, but means plugin changes picked 136 | # up by tasks straight away 137 | # 138 | # Variable: AIRFLOW__CORE__EXECUTE_TASKS_NEW_PYTHON_INTERPRETER 139 | # 140 | execute_tasks_new_python_interpreter = False 141 | 142 | # Secret key to save connection passwords in the db 143 | # 144 | # Variable: AIRFLOW__CORE__FERNET_KEY 145 | # 146 | fernet_key = 147 | 148 | # Whether to disable pickling dags 149 | # 150 | # Variable: AIRFLOW__CORE__DONOT_PICKLE 151 | # 152 | donot_pickle = True 153 | 154 | # How long before timing out a python file import 155 | # 156 | # Variable: AIRFLOW__CORE__DAGBAG_IMPORT_TIMEOUT 157 | # 158 | dagbag_import_timeout = 30.0 159 | 160 | # Should a traceback be shown in the UI for dagbag import errors, 161 | # instead of just the exception message 162 | # 163 | # Variable: AIRFLOW__CORE__DAGBAG_IMPORT_ERROR_TRACEBACKS 164 | # 165 | dagbag_import_error_tracebacks = True 166 | 167 | # If tracebacks are shown, how many entries from the traceback should be shown 168 | # 169 | # Variable: AIRFLOW__CORE__DAGBAG_IMPORT_ERROR_TRACEBACK_DEPTH 170 | # 171 | dagbag_import_error_traceback_depth = 2 172 | 173 | # How long before timing out a DagFileProcessor, which processes a dag file 174 | # 175 | # Variable: 
AIRFLOW__CORE__DAG_FILE_PROCESSOR_TIMEOUT 176 | # 177 | dag_file_processor_timeout = 50 178 | 179 | # The class to use for running task instances in a subprocess. 180 | # Choices include StandardTaskRunner, CgroupTaskRunner or the full import path to the class 181 | # when using a custom task runner. 182 | # 183 | # Variable: AIRFLOW__CORE__TASK_RUNNER 184 | # 185 | task_runner = StandardTaskRunner 186 | 187 | # If set, tasks without a ``run_as_user`` argument will be run with this user 188 | # Can be used to de-elevate a sudo user running Airflow when executing tasks 189 | # 190 | # Variable: AIRFLOW__CORE__DEFAULT_IMPERSONATION 191 | # 192 | default_impersonation = 193 | 194 | # What security module to use (for example kerberos) 195 | # 196 | # Variable: AIRFLOW__CORE__SECURITY 197 | # 198 | security = 199 | 200 | # Turn unit test mode on (overwrites many configuration options with test 201 | # values at runtime) 202 | # 203 | # Variable: AIRFLOW__CORE__UNIT_TEST_MODE 204 | # 205 | unit_test_mode = False 206 | 207 | # Whether to enable pickling for xcom (note that this is insecure and allows for 208 | # RCE exploits). 209 | # 210 | # Variable: AIRFLOW__CORE__ENABLE_XCOM_PICKLING 211 | # 212 | enable_xcom_pickling = False 213 | 214 | # What classes can be imported during deserialization. This is a multi line value. 215 | # The individual items will be parsed as a pattern to a glob function. 216 | # Python built-in classes (like dict) are always allowed. 217 | # 218 | # Variable: AIRFLOW__CORE__ALLOWED_DESERIALIZATION_CLASSES 219 | # 220 | allowed_deserialization_classes = airflow.* 221 | 222 | # What classes can be imported during deserialization. This is a multi line value. 223 | # The individual items will be parsed as regexp patterns. 224 | # This is a secondary option to ``[core] allowed_deserialization_classes``. 225 | # 226 | # Variable: AIRFLOW__CORE__ALLOWED_DESERIALIZATION_CLASSES_REGEXP 227 | # 228 | allowed_deserialization_classes_regexp = 229 | 230 | # When a task is killed forcefully, this is the amount of time in seconds that 231 | # it has to cleanup after it is sent a SIGTERM, before it is SIGKILLED 232 | # 233 | # Variable: AIRFLOW__CORE__KILLED_TASK_CLEANUP_TIME 234 | # 235 | killed_task_cleanup_time = 60 236 | 237 | # Whether to override params with dag_run.conf. If you pass some key-value pairs 238 | # through ``airflow dags backfill -c`` or 239 | # ``airflow dags trigger -c``, the key-value pairs will override the existing ones in params. 240 | # 241 | # Variable: AIRFLOW__CORE__DAG_RUN_CONF_OVERRIDES_PARAMS 242 | # 243 | dag_run_conf_overrides_params = True 244 | 245 | # If enabled, Airflow will only scan files containing both ``DAG`` and ``airflow`` (case-insensitive). 246 | # 247 | # Variable: AIRFLOW__CORE__DAG_DISCOVERY_SAFE_MODE 248 | # 249 | dag_discovery_safe_mode = True 250 | 251 | # The pattern syntax used in the 252 | # `.airflowignore 253 | # `__ 254 | # files in the DAG directories. Valid values are ``regexp`` or ``glob``. 255 | # 256 | # Variable: AIRFLOW__CORE__DAG_IGNORE_FILE_SYNTAX 257 | # 258 | dag_ignore_file_syntax = regexp 259 | 260 | # The number of retries each task is going to have by default. Can be overridden at dag or task level. 261 | # 262 | # Variable: AIRFLOW__CORE__DEFAULT_TASK_RETRIES 263 | # 264 | default_task_retries = 0 265 | 266 | # The number of seconds each task is going to wait by default between retries. Can be overridden at 267 | # dag or task level. 
268 | # 269 | # Variable: AIRFLOW__CORE__DEFAULT_TASK_RETRY_DELAY 270 | # 271 | default_task_retry_delay = 300 272 | 273 | # The maximum delay (in seconds) each task is going to wait by default between retries. 274 | # This is a global setting and cannot be overridden at task or DAG level. 275 | # 276 | # Variable: AIRFLOW__CORE__MAX_TASK_RETRY_DELAY 277 | # 278 | max_task_retry_delay = 86400 279 | 280 | # The weighting method used for the effective total priority weight of the task 281 | # 282 | # Variable: AIRFLOW__CORE__DEFAULT_TASK_WEIGHT_RULE 283 | # 284 | default_task_weight_rule = downstream 285 | 286 | # Maximum possible time (in seconds) that task will have for execution of auxiliary processes 287 | # (like listeners, mini scheduler...) after task is marked as success.. 288 | # 289 | # Variable: AIRFLOW__CORE__TASK_SUCCESS_OVERTIME 290 | # 291 | task_success_overtime = 20 292 | 293 | # The default task execution_timeout value for the operators. Expected an integer value to 294 | # be passed into timedelta as seconds. If not specified, then the value is considered as None, 295 | # meaning that the operators are never timed out by default. 296 | # 297 | # Variable: AIRFLOW__CORE__DEFAULT_TASK_EXECUTION_TIMEOUT 298 | # 299 | default_task_execution_timeout = 300 | 301 | # Updating serialized DAG can not be faster than a minimum interval to reduce database write rate. 302 | # 303 | # Variable: AIRFLOW__CORE__MIN_SERIALIZED_DAG_UPDATE_INTERVAL 304 | # 305 | min_serialized_dag_update_interval = 30 306 | 307 | # If ``True``, serialized DAGs are compressed before writing to DB. 308 | # 309 | # .. note:: 310 | # 311 | # This will disable the DAG dependencies view 312 | # 313 | # Variable: AIRFLOW__CORE__COMPRESS_SERIALIZED_DAGS 314 | # 315 | compress_serialized_dags = False 316 | 317 | # Fetching serialized DAG can not be faster than a minimum interval to reduce database 318 | # read rate. This config controls when your DAGs are updated in the Webserver 319 | # 320 | # Variable: AIRFLOW__CORE__MIN_SERIALIZED_DAG_FETCH_INTERVAL 321 | # 322 | min_serialized_dag_fetch_interval = 10 323 | 324 | # Maximum number of Rendered Task Instance Fields (Template Fields) per task to store 325 | # in the Database. 326 | # All the template_fields for each of Task Instance are stored in the Database. 327 | # Keeping this number small may cause an error when you try to view ``Rendered`` tab in 328 | # TaskInstance view for older tasks. 329 | # 330 | # Variable: AIRFLOW__CORE__MAX_NUM_RENDERED_TI_FIELDS_PER_TASK 331 | # 332 | max_num_rendered_ti_fields_per_task = 30 333 | 334 | # On each dagrun check against defined SLAs 335 | # 336 | # Variable: AIRFLOW__CORE__CHECK_SLAS 337 | # 338 | check_slas = True 339 | 340 | # Path to custom XCom class that will be used to store and resolve operators results 341 | # 342 | # Example: xcom_backend = path.to.CustomXCom 343 | # 344 | # Variable: AIRFLOW__CORE__XCOM_BACKEND 345 | # 346 | xcom_backend = airflow.models.xcom.BaseXCom 347 | 348 | # By default Airflow plugins are lazily-loaded (only loaded when required). Set it to ``False``, 349 | # if you want to load plugins whenever 'airflow' is invoked via cli or loaded from module. 350 | # 351 | # Variable: AIRFLOW__CORE__LAZY_LOAD_PLUGINS 352 | # 353 | lazy_load_plugins = True 354 | 355 | # By default Airflow providers are lazily-discovered (discovery and imports happen only when required). 356 | # Set it to ``False``, if you want to discover providers whenever 'airflow' is invoked via cli or 357 | # loaded from module. 
358 | # 359 | # Variable: AIRFLOW__CORE__LAZY_DISCOVER_PROVIDERS 360 | # 361 | lazy_discover_providers = True 362 | 363 | # Hide sensitive **Variables** or **Connection extra json keys** from UI 364 | # and task logs when set to ``True`` 365 | # 366 | # .. note:: 367 | # 368 | # Connection passwords are always hidden in logs 369 | # 370 | # Variable: AIRFLOW__CORE__HIDE_SENSITIVE_VAR_CONN_FIELDS 371 | # 372 | hide_sensitive_var_conn_fields = True 373 | 374 | # A comma-separated list of extra sensitive keywords to look for in variables names or connection's 375 | # extra JSON. 376 | # 377 | # Variable: AIRFLOW__CORE__SENSITIVE_VAR_CONN_NAMES 378 | # 379 | sensitive_var_conn_names = 380 | 381 | # Task Slot counts for ``default_pool``. This setting would not have any effect in an existing 382 | # deployment where the ``default_pool`` is already created. For existing deployments, users can 383 | # change the number of slots using Webserver, API or the CLI 384 | # 385 | # Variable: AIRFLOW__CORE__DEFAULT_POOL_TASK_SLOT_COUNT 386 | # 387 | default_pool_task_slot_count = 128 388 | 389 | # The maximum list/dict length an XCom can push to trigger task mapping. If the pushed list/dict has a 390 | # length exceeding this value, the task pushing the XCom will be failed automatically to prevent the 391 | # mapped tasks from clogging the scheduler. 392 | # 393 | # Variable: AIRFLOW__CORE__MAX_MAP_LENGTH 394 | # 395 | max_map_length = 1024 396 | 397 | # The default umask to use for process when run in daemon mode (scheduler, worker, etc.) 398 | # 399 | # This controls the file-creation mode mask which determines the initial value of file permission bits 400 | # for newly created files. 401 | # 402 | # This value is treated as an octal-integer. 403 | # 404 | # Variable: AIRFLOW__CORE__DAEMON_UMASK 405 | # 406 | daemon_umask = 0o077 407 | 408 | # Class to use as dataset manager. 409 | # 410 | # Example: dataset_manager_class = airflow.datasets.manager.DatasetManager 411 | # 412 | # Variable: AIRFLOW__CORE__DATASET_MANAGER_CLASS 413 | # 414 | # dataset_manager_class = 415 | 416 | # Kwargs to supply to dataset manager. 417 | # 418 | # Example: dataset_manager_kwargs = {"some_param": "some_value"} 419 | # 420 | # Variable: AIRFLOW__CORE__DATASET_MANAGER_KWARGS 421 | # 422 | # dataset_manager_kwargs = 423 | 424 | # Dataset URI validation should raise an exception if it is not compliant with AIP-60. 425 | # By default this configuration is false, meaning that Airflow 2.x only warns the user. 426 | # In Airflow 3, this configuration will be removed, unconditionally enabling strict validation. 427 | # 428 | # Variable: AIRFLOW__CORE__STRICT_DATASET_URI_VALIDATION 429 | # 430 | strict_dataset_uri_validation = False 431 | 432 | # (experimental) Whether components should use Airflow Internal API for DB connectivity. 433 | # 434 | # Variable: AIRFLOW__CORE__DATABASE_ACCESS_ISOLATION 435 | # 436 | database_access_isolation = False 437 | 438 | # (experimental) Airflow Internal API url. 439 | # Only used if ``[core] database_access_isolation`` is ``True``. 440 | # 441 | # Example: internal_api_url = http://localhost:8080 442 | # 443 | # Variable: AIRFLOW__CORE__INTERNAL_API_URL 444 | # 445 | # internal_api_url = 446 | 447 | # Secret key used to authenticate internal API clients to core. It should be as random as possible. 448 | # However, when running more than 1 instances of webserver / internal API services, make sure all 449 | # of them use the same ``secret_key`` otherwise calls will fail on authentication. 
450 | # The authentication token generated using the secret key has a short expiry time though - make 451 | # sure that time on ALL the machines that you run airflow components on is synchronized 452 | # (for example using ntpd) otherwise you might get "forbidden" errors when the logs are accessed. 453 | # 454 | # Variable: AIRFLOW__CORE__INTERNAL_API_SECRET_KEY 455 | # 456 | internal_api_secret_key = bIAkfIszaatwB9ni0WMINw== 457 | 458 | # The ability to allow testing connections across Airflow UI, API and CLI. 459 | # Supported options: ``Disabled``, ``Enabled``, ``Hidden``. Default: Disabled 460 | # Disabled - Disables the test connection functionality and disables the Test Connection button in UI. 461 | # Enabled - Enables the test connection functionality and shows the Test Connection button in UI. 462 | # Hidden - Disables the test connection functionality and hides the Test Connection button in UI. 463 | # Before setting this to Enabled, make sure that you review the users who are able to add/edit 464 | # connections and ensure they are trusted. Connection testing can be done maliciously leading to 465 | # undesired and insecure outcomes. 466 | # See `Airflow Security Model: Capabilities of authenticated UI users 467 | # `__ 468 | # for more details. 469 | # 470 | # Variable: AIRFLOW__CORE__TEST_CONNECTION 471 | # 472 | test_connection = Disabled 473 | 474 | # The maximum length of the rendered template field. If the value to be stored in the 475 | # rendered template field exceeds this size, it's redacted. 476 | # 477 | # Variable: AIRFLOW__CORE__MAX_TEMPLATED_FIELD_LENGTH 478 | # 479 | max_templated_field_length = 4096 480 | 481 | [database] 482 | # Path to the ``alembic.ini`` file. You can either provide the file path relative 483 | # to the Airflow home directory or the absolute path if it is located elsewhere. 484 | # 485 | # Variable: AIRFLOW__DATABASE__ALEMBIC_INI_FILE_PATH 486 | # 487 | alembic_ini_file_path = alembic.ini 488 | 489 | # The SQLAlchemy connection string to the metadata database. 490 | # SQLAlchemy supports many different database engines. 491 | # See: `Set up a Database Backend: Database URI 492 | # `__ 493 | # for more details. 494 | # 495 | # Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_CONN 496 | # 497 | sql_alchemy_conn = sqlite:////opt/airflow/airflow.db 498 | 499 | # Extra engine specific keyword args passed to SQLAlchemy's create_engine, as a JSON-encoded value 500 | # 501 | # Example: sql_alchemy_engine_args = {"arg1": true} 502 | # 503 | # Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_ENGINE_ARGS 504 | # 505 | # sql_alchemy_engine_args = 506 | 507 | # The encoding for the databases 508 | # 509 | # Variable: AIRFLOW__DATABASE__SQL_ENGINE_ENCODING 510 | # 511 | sql_engine_encoding = utf-8 512 | 513 | # Collation for ``dag_id``, ``task_id``, ``key``, ``external_executor_id`` columns 514 | # in case they have different encoding. 515 | # By default this collation is the same as the database collation, however for ``mysql`` and ``mariadb`` 516 | # the default is ``utf8mb3_bin`` so that the index sizes of our index keys will not exceed 517 | # the maximum size of allowed index when collation is set to ``utf8mb4`` variant, see 518 | # `GitHub Issue Comment `__ 519 | # for more details. 520 | # 521 | # Variable: AIRFLOW__DATABASE__SQL_ENGINE_COLLATION_FOR_IDS 522 | # 523 | # sql_engine_collation_for_ids = 524 | 525 | # If SQLAlchemy should pool database connections. 
526 | # 527 | # Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_POOL_ENABLED 528 | # 529 | sql_alchemy_pool_enabled = True 530 | 531 | # The SQLAlchemy pool size is the maximum number of database connections 532 | # in the pool. 0 indicates no limit. 533 | # 534 | # Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_POOL_SIZE 535 | # 536 | sql_alchemy_pool_size = 5 537 | 538 | # The maximum overflow size of the pool. 539 | # When the number of checked-out connections reaches the size set in pool_size, 540 | # additional connections will be returned up to this limit. 541 | # When those additional connections are returned to the pool, they are disconnected and discarded. 542 | # It follows then that the total number of simultaneous connections the pool will allow 543 | # is **pool_size** + **max_overflow**, 544 | # and the total number of "sleeping" connections the pool will allow is pool_size. 545 | # max_overflow can be set to ``-1`` to indicate no overflow limit; 546 | # no limit will be placed on the total number of concurrent connections. Defaults to ``10``. 547 | # 548 | # Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_MAX_OVERFLOW 549 | # 550 | sql_alchemy_max_overflow = 10 551 | 552 | # The SQLAlchemy pool recycle is the number of seconds a connection 553 | # can be idle in the pool before it is invalidated. This config does 554 | # not apply to sqlite. If the number of DB connections is ever exceeded, 555 | # a lower config value will allow the system to recover faster. 556 | # 557 | # Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_POOL_RECYCLE 558 | # 559 | sql_alchemy_pool_recycle = 1800 560 | 561 | # Check connection at the start of each connection pool checkout. 562 | # Typically, this is a simple statement like "SELECT 1". 563 | # See `SQLAlchemy Pooling: Disconnect Handling - Pessimistic 564 | # `__ 565 | # for more details. 566 | # 567 | # Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_POOL_PRE_PING 568 | # 569 | sql_alchemy_pool_pre_ping = True 570 | 571 | # The schema to use for the metadata database. 572 | # SQLAlchemy supports databases with the concept of multiple schemas. 573 | # 574 | # Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_SCHEMA 575 | # 576 | sql_alchemy_schema = 577 | 578 | # Import path for connect args in SQLAlchemy. Defaults to an empty dict. 579 | # This is useful when you want to configure db engine args that SQLAlchemy won't parse 580 | # in connection string. This can be set by passing a dictionary containing the create engine parameters. 581 | # For more details about passing create engine parameters (keepalives variables, timeout etc) 582 | # in Postgres DB Backend see `Setting up a PostgreSQL Database 583 | # `__ 584 | # e.g ``connect_args={"timeout":30}`` can be defined in ``airflow_local_settings.py`` and 585 | # can be imported as shown below 586 | # 587 | # Example: sql_alchemy_connect_args = airflow_local_settings.connect_args 588 | # 589 | # Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_CONNECT_ARGS 590 | # 591 | # sql_alchemy_connect_args = 592 | 593 | # Important Warning: Use of sql_alchemy_session_maker Highly Discouraged 594 | # Import path for function which returns 'sqlalchemy.orm.sessionmaker'. 595 | # Improper configuration of sql_alchemy_session_maker can lead to serious issues, 596 | # including data corruption, unrecoverable application crashes. Please review the SQLAlchemy 597 | # documentation for detailed guidance on proper configuration and best practices. 
598 | # 599 | # Example: sql_alchemy_session_maker = airflow_local_settings._sessionmaker 600 | # 601 | # Variable: AIRFLOW__DATABASE__SQL_ALCHEMY_SESSION_MAKER 602 | # 603 | # sql_alchemy_session_maker = 604 | 605 | # Whether to load the default connections that ship with Airflow when ``airflow db init`` is called. 606 | # It's good to get started, but you probably want to set this to ``False`` in a production environment. 607 | # 608 | # Variable: AIRFLOW__DATABASE__LOAD_DEFAULT_CONNECTIONS 609 | # 610 | load_default_connections = True 611 | 612 | # Number of times the code should be retried in case of DB Operational Errors. 613 | # Not all transactions will be retried as it can cause undesired state. 614 | # Currently it is only used in ``DagFileProcessor.process_file`` to retry ``dagbag.sync_to_db``. 615 | # 616 | # Variable: AIRFLOW__DATABASE__MAX_DB_RETRIES 617 | # 618 | max_db_retries = 3 619 | 620 | # Whether to run alembic migrations during Airflow start up. Sometimes this operation can be expensive, 621 | # and the users can assert the correct version through other means (e.g. through a Helm chart). 622 | # Accepts ``True`` or ``False``. 623 | # 624 | # Variable: AIRFLOW__DATABASE__CHECK_MIGRATIONS 625 | # 626 | check_migrations = True 627 | 628 | [logging] 629 | # The folder where airflow should store its log files. 630 | # This path must be absolute. 631 | # There are a few existing configurations that assume this is set to the default. 632 | # If you choose to override this you may need to update the 633 | # ``[logging] dag_processor_manager_log_location`` and 634 | # ``[logging] child_process_log_directory settings`` as well. 635 | # 636 | # Variable: AIRFLOW__LOGGING__BASE_LOG_FOLDER 637 | # 638 | base_log_folder = /opt/airflow/logs 639 | 640 | # Airflow can store logs remotely in AWS S3, Google Cloud Storage or Elastic Search. 641 | # Set this to ``True`` if you want to enable remote logging. 642 | # 643 | # Variable: AIRFLOW__LOGGING__REMOTE_LOGGING 644 | # 645 | remote_logging = False 646 | 647 | # Users must supply an Airflow connection id that provides access to the storage 648 | # location. Depending on your remote logging service, this may only be used for 649 | # reading logs, not writing them. 650 | # 651 | # Variable: AIRFLOW__LOGGING__REMOTE_LOG_CONN_ID 652 | # 653 | remote_log_conn_id = 654 | 655 | # Whether the local log files for GCS, S3, WASB and OSS remote logging should be deleted after 656 | # they are uploaded to the remote location. 657 | # 658 | # Variable: AIRFLOW__LOGGING__DELETE_LOCAL_LOGS 659 | # 660 | delete_local_logs = False 661 | 662 | # Path to Google Credential JSON file. If omitted, authorization based on `the Application Default 663 | # Credentials 664 | # `__ will 665 | # be used. 666 | # 667 | # Variable: AIRFLOW__LOGGING__GOOGLE_KEY_PATH 668 | # 669 | google_key_path = 670 | 671 | # Storage bucket URL for remote logging 672 | # S3 buckets should start with **s3://** 673 | # Cloudwatch log groups should start with **cloudwatch://** 674 | # GCS buckets should start with **gs://** 675 | # WASB buckets should start with **wasb** just to help Airflow select correct handler 676 | # Stackdriver logs should start with **stackdriver://** 677 | # 678 | # Variable: AIRFLOW__LOGGING__REMOTE_BASE_LOG_FOLDER 679 | # 680 | remote_base_log_folder = 681 | 682 | # The remote_task_handler_kwargs param is loaded into a dictionary and passed to the ``__init__`` 683 | # of remote task handler and it overrides the values provided by Airflow config. 
For example if you set 684 | # ``delete_local_logs=False`` and you provide ``{"delete_local_copy": true}``, then the local 685 | # log files will be deleted after they are uploaded to remote location. 686 | # 687 | # Example: remote_task_handler_kwargs = {"delete_local_copy": true} 688 | # 689 | # Variable: AIRFLOW__LOGGING__REMOTE_TASK_HANDLER_KWARGS 690 | # 691 | remote_task_handler_kwargs = 692 | 693 | # Use server-side encryption for logs stored in S3 694 | # 695 | # Variable: AIRFLOW__LOGGING__ENCRYPT_S3_LOGS 696 | # 697 | encrypt_s3_logs = False 698 | 699 | # Logging level. 700 | # 701 | # Supported values: ``CRITICAL``, ``ERROR``, ``WARNING``, ``INFO``, ``DEBUG``. 702 | # 703 | # Variable: AIRFLOW__LOGGING__LOGGING_LEVEL 704 | # 705 | logging_level = INFO 706 | 707 | # Logging level for celery. If not set, it uses the value of logging_level 708 | # 709 | # Supported values: ``CRITICAL``, ``ERROR``, ``WARNING``, ``INFO``, ``DEBUG``. 710 | # 711 | # Variable: AIRFLOW__LOGGING__CELERY_LOGGING_LEVEL 712 | # 713 | celery_logging_level = 714 | 715 | # Logging level for Flask-appbuilder UI. 716 | # 717 | # Supported values: ``CRITICAL``, ``ERROR``, ``WARNING``, ``INFO``, ``DEBUG``. 718 | # 719 | # Variable: AIRFLOW__LOGGING__FAB_LOGGING_LEVEL 720 | # 721 | fab_logging_level = WARNING 722 | 723 | # Logging class 724 | # Specify the class that will specify the logging configuration 725 | # This class has to be on the python classpath 726 | # 727 | # Example: logging_config_class = my.path.default_local_settings.LOGGING_CONFIG 728 | # 729 | # Variable: AIRFLOW__LOGGING__LOGGING_CONFIG_CLASS 730 | # 731 | logging_config_class = 732 | 733 | # Flag to enable/disable Colored logs in Console 734 | # Colour the logs when the controlling terminal is a TTY. 735 | # 736 | # Variable: AIRFLOW__LOGGING__COLORED_CONSOLE_LOG 737 | # 738 | colored_console_log = True 739 | 740 | # Log format for when Colored logs is enabled 741 | # 742 | # Variable: AIRFLOW__LOGGING__COLORED_LOG_FORMAT 743 | # 744 | colored_log_format = [%%(blue)s%%(asctime)s%%(reset)s] {%%(blue)s%%(filename)s:%%(reset)s%%(lineno)d} %%(log_color)s%%(levelname)s%%(reset)s - %%(log_color)s%%(message)s%%(reset)s 745 | 746 | # Specifies the class utilized by Airflow to implement colored logging 747 | # 748 | # Variable: AIRFLOW__LOGGING__COLORED_FORMATTER_CLASS 749 | # 750 | colored_formatter_class = airflow.utils.log.colored_log.CustomTTYColoredFormatter 751 | 752 | # Format of Log line 753 | # 754 | # Variable: AIRFLOW__LOGGING__LOG_FORMAT 755 | # 756 | log_format = [%%(asctime)s] {%%(filename)s:%%(lineno)d} %%(levelname)s - %%(message)s 757 | 758 | # Defines the format of log messages for simple logging configuration 759 | # 760 | # Variable: AIRFLOW__LOGGING__SIMPLE_LOG_FORMAT 761 | # 762 | simple_log_format = %%(asctime)s %%(levelname)s - %%(message)s 763 | 764 | # Where to send dag parser logs. If "file", logs are sent to log files defined by child_process_log_directory. 
765 | # 766 | # Variable: AIRFLOW__LOGGING__DAG_PROCESSOR_LOG_TARGET 767 | # 768 | dag_processor_log_target = file 769 | 770 | # Format of Dag Processor Log line 771 | # 772 | # Variable: AIRFLOW__LOGGING__DAG_PROCESSOR_LOG_FORMAT 773 | # 774 | dag_processor_log_format = [%%(asctime)s] [SOURCE:DAG_PROCESSOR] {%%(filename)s:%%(lineno)d} %%(levelname)s - %%(message)s 775 | 776 | # Determines the formatter class used by Airflow for structuring its log messages 777 | # The default formatter class is timezone-aware, which means that timestamps attached to log entries 778 | # will be adjusted to reflect the local timezone of the Airflow instance 779 | # 780 | # Variable: AIRFLOW__LOGGING__LOG_FORMATTER_CLASS 781 | # 782 | log_formatter_class = airflow.utils.log.timezone_aware.TimezoneAware 783 | 784 | # An import path to a function to add adaptations of each secret added with 785 | # ``airflow.utils.log.secrets_masker.mask_secret`` to be masked in log messages. The given function 786 | # is expected to require a single parameter: the secret to be adapted. It may return a 787 | # single adaptation of the secret or an iterable of adaptations to each be masked as secrets. 788 | # The original secret will be masked as well as any adaptations returned. 789 | # 790 | # Example: secret_mask_adapter = urllib.parse.quote 791 | # 792 | # Variable: AIRFLOW__LOGGING__SECRET_MASK_ADAPTER 793 | # 794 | secret_mask_adapter = 795 | 796 | # Specify prefix pattern like mentioned below with stream handler ``TaskHandlerWithCustomFormatter`` 797 | # 798 | # Example: task_log_prefix_template = {{ti.dag_id}}-{{ti.task_id}}-{{execution_date}}-{{ti.try_number}} 799 | # 800 | # Variable: AIRFLOW__LOGGING__TASK_LOG_PREFIX_TEMPLATE 801 | # 802 | task_log_prefix_template = 803 | 804 | # Formatting for how airflow generates file names/paths for each task run. 805 | # 806 | # Variable: AIRFLOW__LOGGING__LOG_FILENAME_TEMPLATE 807 | # 808 | log_filename_template = dag_id={{ ti.dag_id }}/run_id={{ ti.run_id }}/task_id={{ ti.task_id }}/{%% if ti.map_index >= 0 %%}map_index={{ ti.map_index }}/{%% endif %%}attempt={{ try_number }}.log 809 | 810 | # Formatting for how airflow generates file names for log 811 | # 812 | # Variable: AIRFLOW__LOGGING__LOG_PROCESSOR_FILENAME_TEMPLATE 813 | # 814 | log_processor_filename_template = {{ filename }}.log 815 | 816 | # Full path of dag_processor_manager logfile. 817 | # 818 | # Variable: AIRFLOW__LOGGING__DAG_PROCESSOR_MANAGER_LOG_LOCATION 819 | # 820 | dag_processor_manager_log_location = /opt/airflow/logs/dag_processor_manager/dag_processor_manager.log 821 | 822 | # Whether DAG processor manager will write logs to stdout 823 | # 824 | # Variable: AIRFLOW__LOGGING__DAG_PROCESSOR_MANAGER_LOG_STDOUT 825 | # 826 | dag_processor_manager_log_stdout = False 827 | 828 | # Name of handler to read task instance logs. 829 | # Defaults to use ``task`` handler. 830 | # 831 | # Variable: AIRFLOW__LOGGING__TASK_LOG_READER 832 | # 833 | task_log_reader = task 834 | 835 | # A comma\-separated list of third-party logger names that will be configured to print messages to 836 | # consoles\. 837 | # 838 | # Example: extra_logger_names = connexion,sqlalchemy 839 | # 840 | # Variable: AIRFLOW__LOGGING__EXTRA_LOGGER_NAMES 841 | # 842 | extra_logger_names = 843 | 844 | # When you start an Airflow worker, Airflow starts a tiny web server 845 | # subprocess to serve the workers local log files to the airflow main 846 | # web server, who then builds pages and sends them to users. 
This defines 847 | # the port on which the logs are served. It needs to be unused, and open 848 | # visible from the main web server to connect into the workers. 849 | # 850 | # Variable: AIRFLOW__LOGGING__WORKER_LOG_SERVER_PORT 851 | # 852 | worker_log_server_port = 8793 853 | 854 | # Port to serve logs from for triggerer. 855 | # See ``[logging] worker_log_server_port`` description for more info. 856 | # 857 | # Variable: AIRFLOW__LOGGING__TRIGGER_LOG_SERVER_PORT 858 | # 859 | trigger_log_server_port = 8794 860 | 861 | # We must parse timestamps to interleave logs between trigger and task. To do so, 862 | # we need to parse timestamps in log files. In case your log format is non-standard, 863 | # you may provide import path to callable which takes a string log line and returns 864 | # the timestamp (datetime.datetime compatible). 865 | # 866 | # Example: interleave_timestamp_parser = path.to.my_func 867 | # 868 | # Variable: AIRFLOW__LOGGING__INTERLEAVE_TIMESTAMP_PARSER 869 | # 870 | # interleave_timestamp_parser = 871 | 872 | # Permissions in the form or of octal string as understood by chmod. The permissions are important 873 | # when you use impersonation, when logs are written by a different user than airflow. The most secure 874 | # way of configuring it in this case is to add both users to the same group and make it the default 875 | # group of both users. Group-writeable logs are default in airflow, but you might decide that you are 876 | # OK with having the logs other-writeable, in which case you should set it to ``0o777``. You might 877 | # decide to add more security if you do not use impersonation and change it to ``0o755`` to make it 878 | # only owner-writeable. You can also make it just readable only for owner by changing it to ``0o700`` 879 | # if all the access (read/write) for your logs happens from the same user. 880 | # 881 | # Example: file_task_handler_new_folder_permissions = 0o775 882 | # 883 | # Variable: AIRFLOW__LOGGING__FILE_TASK_HANDLER_NEW_FOLDER_PERMISSIONS 884 | # 885 | file_task_handler_new_folder_permissions = 0o775 886 | 887 | # Permissions in the form or of octal string as understood by chmod. The permissions are important 888 | # when you use impersonation, when logs are written by a different user than airflow. The most secure 889 | # way of configuring it in this case is to add both users to the same group and make it the default 890 | # group of both users. Group-writeable logs are default in airflow, but you might decide that you are 891 | # OK with having the logs other-writeable, in which case you should set it to ``0o666``. You might 892 | # decide to add more security if you do not use impersonation and change it to ``0o644`` to make it 893 | # only owner-writeable. You can also make it just readable only for owner by changing it to ``0o600`` 894 | # if all the access (read/write) for your logs happens from the same user. 895 | # 896 | # Example: file_task_handler_new_file_permissions = 0o664 897 | # 898 | # Variable: AIRFLOW__LOGGING__FILE_TASK_HANDLER_NEW_FILE_PERMISSIONS 899 | # 900 | file_task_handler_new_file_permissions = 0o664 901 | 902 | # By default Celery sends all logs into stderr. 903 | # If enabled any previous logging handlers will get *removed*. 904 | # With this option AirFlow will create new handlers 905 | # and send low level logs like INFO and WARNING to stdout, 906 | # while sending higher severity logs to stderr. 
907 | # 908 | # Variable: AIRFLOW__LOGGING__CELERY_STDOUT_STDERR_SEPARATION 909 | # 910 | celery_stdout_stderr_separation = False 911 | 912 | # If enabled, Airflow may ship messages to task logs from outside the task run context, e.g. from 913 | # the scheduler, executor, or callback execution context. This can help in circumstances such as 914 | # when there's something blocking the execution of the task and ordinarily there may be no task 915 | # logs at all. 916 | # This is set to ``True`` by default. If you encounter issues with this feature 917 | # (e.g. scheduler performance issues) it can be disabled. 918 | # 919 | # Variable: AIRFLOW__LOGGING__ENABLE_TASK_CONTEXT_LOGGER 920 | # 921 | enable_task_context_logger = True 922 | 923 | # A comma separated list of keywords related to errors whose presence should display the line in red 924 | # color in UI 925 | # 926 | # Variable: AIRFLOW__LOGGING__COLOR_LOG_ERROR_KEYWORDS 927 | # 928 | color_log_error_keywords = error,exception 929 | 930 | # A comma separated list of keywords related to warning whose presence should display the line in yellow 931 | # color in UI 932 | # 933 | # Variable: AIRFLOW__LOGGING__COLOR_LOG_WARNING_KEYWORDS 934 | # 935 | color_log_warning_keywords = warn 936 | 937 | [metrics] 938 | # `StatsD `__ integration settings. 939 | 940 | # If true, ``[metrics] metrics_allow_list`` and ``[metrics] metrics_block_list`` will use 941 | # regex pattern matching anywhere within the metric name instead of only prefix matching 942 | # at the start of the name. 943 | # 944 | # Variable: AIRFLOW__METRICS__METRICS_USE_PATTERN_MATCH 945 | # 946 | metrics_use_pattern_match = False 947 | 948 | # Configure an allow list (comma separated string) to send only certain metrics. 949 | # If ``[metrics] metrics_use_pattern_match`` is ``false``, match only the exact metric name prefix. 950 | # If ``[metrics] metrics_use_pattern_match`` is ``true``, provide regex patterns to match. 951 | # 952 | # Example: metrics_allow_list = "scheduler,executor,dagrun,pool,triggerer,celery" or "^scheduler,^executor,heartbeat|timeout" 953 | # 954 | # Variable: AIRFLOW__METRICS__METRICS_ALLOW_LIST 955 | # 956 | metrics_allow_list = 957 | 958 | # Configure a block list (comma separated string) to block certain metrics from being emitted. 959 | # If ``[metrics] metrics_allow_list`` and ``[metrics] metrics_block_list`` are both configured, 960 | # ``[metrics] metrics_block_list`` is ignored. 961 | # 962 | # If ``[metrics] metrics_use_pattern_match`` is ``false``, match only the exact metric name prefix. 963 | # 964 | # If ``[metrics] metrics_use_pattern_match`` is ``true``, provide regex patterns to match. 965 | # 966 | # Example: metrics_block_list = "scheduler,executor,dagrun,pool,triggerer,celery" or "^scheduler,^executor,heartbeat|timeout" 967 | # 968 | # Variable: AIRFLOW__METRICS__METRICS_BLOCK_LIST 969 | # 970 | metrics_block_list = 971 | 972 | # Enables sending metrics to StatsD. 
973 | # 974 | # Variable: AIRFLOW__METRICS__STATSD_ON 975 | # 976 | statsd_on = False 977 | 978 | # Specifies the host address where the StatsD daemon (or server) is running 979 | # 980 | # Variable: AIRFLOW__METRICS__STATSD_HOST 981 | # 982 | statsd_host = localhost 983 | 984 | # Specifies the port on which the StatsD daemon (or server) is listening to 985 | # 986 | # Variable: AIRFLOW__METRICS__STATSD_PORT 987 | # 988 | statsd_port = 8125 989 | 990 | # Defines the namespace for all metrics sent from Airflow to StatsD 991 | # 992 | # Variable: AIRFLOW__METRICS__STATSD_PREFIX 993 | # 994 | statsd_prefix = airflow 995 | 996 | # A function that validate the StatsD stat name, apply changes to the stat name if necessary and return 997 | # the transformed stat name. 998 | # 999 | # The function should have the following signature 1000 | # 1001 | # .. code-block:: python 1002 | # 1003 | # def func_name(stat_name: str) -> str: ... 1004 | # 1005 | # Variable: AIRFLOW__METRICS__STAT_NAME_HANDLER 1006 | # 1007 | stat_name_handler = 1008 | 1009 | # To enable datadog integration to send airflow metrics. 1010 | # 1011 | # Variable: AIRFLOW__METRICS__STATSD_DATADOG_ENABLED 1012 | # 1013 | statsd_datadog_enabled = False 1014 | 1015 | # List of datadog tags attached to all metrics(e.g: ``key1:value1,key2:value2``) 1016 | # 1017 | # Variable: AIRFLOW__METRICS__STATSD_DATADOG_TAGS 1018 | # 1019 | statsd_datadog_tags = 1020 | 1021 | # Set to ``False`` to disable metadata tags for some of the emitted metrics 1022 | # 1023 | # Variable: AIRFLOW__METRICS__STATSD_DATADOG_METRICS_TAGS 1024 | # 1025 | statsd_datadog_metrics_tags = True 1026 | 1027 | # If you want to utilise your own custom StatsD client set the relevant 1028 | # module path below. 1029 | # Note: The module path must exist on your 1030 | # `PYTHONPATH ` 1031 | # for Airflow to pick it up 1032 | # 1033 | # Variable: AIRFLOW__METRICS__STATSD_CUSTOM_CLIENT_PATH 1034 | # 1035 | # statsd_custom_client_path = 1036 | 1037 | # If you want to avoid sending all the available metrics tags to StatsD, 1038 | # you can configure a block list of prefixes (comma separated) to filter out metric tags 1039 | # that start with the elements of the list (e.g: ``job_id,run_id``) 1040 | # 1041 | # Example: statsd_disabled_tags = job_id,run_id,dag_id,task_id 1042 | # 1043 | # Variable: AIRFLOW__METRICS__STATSD_DISABLED_TAGS 1044 | # 1045 | statsd_disabled_tags = job_id,run_id 1046 | 1047 | # To enable sending Airflow metrics with StatsD-Influxdb tagging convention. 1048 | # 1049 | # Variable: AIRFLOW__METRICS__STATSD_INFLUXDB_ENABLED 1050 | # 1051 | statsd_influxdb_enabled = False 1052 | 1053 | # Enables sending metrics to OpenTelemetry. 1054 | # 1055 | # Variable: AIRFLOW__METRICS__OTEL_ON 1056 | # 1057 | otel_on = False 1058 | 1059 | # Specifies the hostname or IP address of the OpenTelemetry Collector to which Airflow sends 1060 | # metrics and traces. 1061 | # 1062 | # Variable: AIRFLOW__METRICS__OTEL_HOST 1063 | # 1064 | otel_host = localhost 1065 | 1066 | # Specifies the port of the OpenTelemetry Collector that is listening to. 1067 | # 1068 | # Variable: AIRFLOW__METRICS__OTEL_PORT 1069 | # 1070 | otel_port = 8889 1071 | 1072 | # The prefix for the Airflow metrics. 1073 | # 1074 | # Variable: AIRFLOW__METRICS__OTEL_PREFIX 1075 | # 1076 | otel_prefix = airflow 1077 | 1078 | # Defines the interval, in milliseconds, at which Airflow sends batches of metrics and traces 1079 | # to the configured OpenTelemetry Collector. 
1080 | # 1081 | # Variable: AIRFLOW__METRICS__OTEL_INTERVAL_MILLISECONDS 1082 | # 1083 | otel_interval_milliseconds = 60000 1084 | 1085 | # If ``True``, all metrics are also emitted to the console. Defaults to ``False``. 1086 | # 1087 | # Variable: AIRFLOW__METRICS__OTEL_DEBUGGING_ON 1088 | # 1089 | otel_debugging_on = False 1090 | 1091 | # The default service name of traces. 1092 | # 1093 | # Variable: AIRFLOW__METRICS__OTEL_SERVICE 1094 | # 1095 | otel_service = Airflow 1096 | 1097 | # If ``True``, SSL will be enabled. Defaults to ``False``. 1098 | # To establish an HTTPS connection to the OpenTelemetry collector, 1099 | # you need to configure the SSL certificate and key within the OpenTelemetry collector's 1100 | # ``config.yml`` file. 1101 | # 1102 | # Variable: AIRFLOW__METRICS__OTEL_SSL_ACTIVE 1103 | # 1104 | otel_ssl_active = False 1105 | 1106 | [traces] 1107 | # Distributed traces integration settings. 1108 | 1109 | # Enables sending traces to OpenTelemetry. 1110 | # 1111 | # Variable: AIRFLOW__TRACES__OTEL_ON 1112 | # 1113 | otel_on = False 1114 | 1115 | # Specifies the hostname or IP address of the OpenTelemetry Collector to which Airflow sends 1116 | # traces. 1117 | # 1118 | # Variable: AIRFLOW__TRACES__OTEL_HOST 1119 | # 1120 | otel_host = localhost 1121 | 1122 | # Specifies the port of the OpenTelemetry Collector that is listening to. 1123 | # 1124 | # Variable: AIRFLOW__TRACES__OTEL_PORT 1125 | # 1126 | otel_port = 8889 1127 | 1128 | # The default service name of traces. 1129 | # 1130 | # Variable: AIRFLOW__TRACES__OTEL_SERVICE 1131 | # 1132 | otel_service = Airflow 1133 | 1134 | # If True, all traces are also emitted to the console. Defaults to False. 1135 | # 1136 | # Variable: AIRFLOW__TRACES__OTEL_DEBUGGING_ON 1137 | # 1138 | otel_debugging_on = False 1139 | 1140 | # If True, SSL will be enabled. Defaults to False. 1141 | # To establish an HTTPS connection to the OpenTelemetry collector, 1142 | # you need to configure the SSL certificate and key within the OpenTelemetry collector's 1143 | # config.yml file. 1144 | # 1145 | # Variable: AIRFLOW__TRACES__OTEL_SSL_ACTIVE 1146 | # 1147 | otel_ssl_active = False 1148 | 1149 | # If True, after the task is complete, the full task log messages will be added as the 1150 | # span events, chunked by 64k size. defaults to False. 1151 | # 1152 | # Variable: AIRFLOW__TRACES__OTEL_TASK_LOG_EVENT 1153 | # 1154 | otel_task_log_event = False 1155 | 1156 | [secrets] 1157 | # Full class name of secrets backend to enable (will precede env vars and metastore in search path) 1158 | # 1159 | # Example: backend = airflow.providers.amazon.aws.secrets.systems_manager.SystemsManagerParameterStoreBackend 1160 | # 1161 | # Variable: AIRFLOW__SECRETS__BACKEND 1162 | # 1163 | backend = 1164 | 1165 | # The backend_kwargs param is loaded into a dictionary and passed to ``__init__`` 1166 | # of secrets backend class. See documentation for the secrets backend you are using. 1167 | # JSON is expected. 1168 | # 1169 | # Example for AWS Systems Manager ParameterStore: 1170 | # ``{"connections_prefix": "/airflow/connections", "profile_name": "default"}`` 1171 | # 1172 | # Variable: AIRFLOW__SECRETS__BACKEND_KWARGS 1173 | # 1174 | backend_kwargs = 1175 | 1176 | # .. note:: |experimental| 1177 | # 1178 | # Enables local caching of Variables, when parsing DAGs only. 1179 | # Using this option can make dag parsing faster if Variables are used in top level code, at the expense 1180 | # of longer propagation time for changes. 
1181 | # Please note that this cache concerns only the DAG parsing step. There is no caching in place when DAG 1182 | # tasks are run. 1183 | # 1184 | # Variable: AIRFLOW__SECRETS__USE_CACHE 1185 | # 1186 | use_cache = False 1187 | 1188 | # .. note:: |experimental| 1189 | # 1190 | # When the cache is enabled, this is the duration for which we consider an entry in the cache to be 1191 | # valid. Entries are refreshed if they are older than this many seconds. 1192 | # It means that when the cache is enabled, this is the maximum amount of time you need to wait to see a 1193 | # Variable change take effect. 1194 | # 1195 | # Variable: AIRFLOW__SECRETS__CACHE_TTL_SECONDS 1196 | # 1197 | cache_ttl_seconds = 900 1198 | 1199 | [cli] 1200 | # In what way should the cli access the API. The LocalClient will use the 1201 | # database directly, while the json_client will use the api running on the 1202 | # webserver 1203 | # 1204 | # Variable: AIRFLOW__CLI__API_CLIENT 1205 | # 1206 | api_client = airflow.api.client.local_client 1207 | 1208 | # If you set web_server_url_prefix, do NOT forget to append it here, ex: 1209 | # ``endpoint_url = http://localhost:8080/myroot`` 1210 | # So api will look like: ``http://localhost:8080/myroot/api/experimental/...`` 1211 | # 1212 | # Variable: AIRFLOW__CLI__ENDPOINT_URL 1213 | # 1214 | endpoint_url = http://localhost:8080 1215 | 1216 | [debug] 1217 | # Used only with ``DebugExecutor``. If set to ``True`` DAG will fail with first 1218 | # failed task. Helpful for debugging purposes. 1219 | # 1220 | # Variable: AIRFLOW__DEBUG__FAIL_FAST 1221 | # 1222 | fail_fast = False 1223 | 1224 | [api] 1225 | # Enables the deprecated experimental API. Please note that these API endpoints do not have 1226 | # access control. An authenticated user has full access. 1227 | # 1228 | # .. warning:: 1229 | # 1230 | # This `Experimental REST API 1231 | # `__ is 1232 | # deprecated since version 2.0. Please consider using 1233 | # `the Stable REST API 1234 | # `__. 1235 | # For more information on migration, see 1236 | # `RELEASE_NOTES.rst `_ 1237 | # 1238 | # Variable: AIRFLOW__API__ENABLE_EXPERIMENTAL_API 1239 | # 1240 | enable_experimental_api = False 1241 | 1242 | # Comma separated list of auth backends to authenticate users of the API. See 1243 | # `Security: API 1244 | # `__ for possible values. 1245 | # ("airflow.api.auth.backend.default" allows all requests for historic reasons) 1246 | # 1247 | # Variable: AIRFLOW__API__AUTH_BACKENDS 1248 | # 1249 | auth_backends = airflow.api.auth.backend.session 1250 | 1251 | # Used to set the maximum page limit for API requests. If limit passed as param 1252 | # is greater than maximum page limit, it will be ignored and maximum page limit value 1253 | # will be set as the limit 1254 | # 1255 | # Variable: AIRFLOW__API__MAXIMUM_PAGE_LIMIT 1256 | # 1257 | maximum_page_limit = 100 1258 | 1259 | # Used to set the default page limit when limit param is zero or not provided in API 1260 | # requests. Otherwise if positive integer is passed in the API requests as limit, the 1261 | # smallest number of user given limit or maximum page limit is taken as limit. 1262 | # 1263 | # Variable: AIRFLOW__API__FALLBACK_PAGE_LIMIT 1264 | # 1265 | fallback_page_limit = 100 1266 | 1267 | # The intended audience for JWT token credentials used for authorization. This value must match on the client and server sides. If empty, audience will not be tested. 
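# Worked example for the two page-limit options above (editorial note, not part of the stock
# configuration): with ``maximum_page_limit = 100`` and ``fallback_page_limit = 100``, a request
# such as ``GET /api/v1/dags?limit=500`` is capped at 100 results, a request with ``limit=25``
# returns at most 25, and a request that omits ``limit`` (or passes 0) falls back to 100.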
1268 | # 1269 | # Example: google_oauth2_audience = project-id-random-value.apps.googleusercontent.com 1270 | # 1271 | # Variable: AIRFLOW__API__GOOGLE_OAUTH2_AUDIENCE 1272 | # 1273 | google_oauth2_audience = 1274 | 1275 | # Path to Google Cloud Service Account key file (JSON). If omitted, authorization based on 1276 | # `the Application Default Credentials 1277 | # `__ will 1278 | # be used. 1279 | # 1280 | # Example: google_key_path = /files/service-account-json 1281 | # 1282 | # Variable: AIRFLOW__API__GOOGLE_KEY_PATH 1283 | # 1284 | google_key_path = 1285 | 1286 | # Used in response to a preflight request to indicate which HTTP 1287 | # headers can be used when making the actual request. This header is 1288 | # the server side response to the browser's 1289 | # Access-Control-Request-Headers header. 1290 | # 1291 | # Variable: AIRFLOW__API__ACCESS_CONTROL_ALLOW_HEADERS 1292 | # 1293 | access_control_allow_headers = 1294 | 1295 | # Specifies the method or methods allowed when accessing the resource. 1296 | # 1297 | # Variable: AIRFLOW__API__ACCESS_CONTROL_ALLOW_METHODS 1298 | # 1299 | access_control_allow_methods = 1300 | 1301 | # Indicates whether the response can be shared with requesting code from the given origins. 1302 | # Separate URLs with space. 1303 | # 1304 | # Variable: AIRFLOW__API__ACCESS_CONTROL_ALLOW_ORIGINS 1305 | # 1306 | access_control_allow_origins = 1307 | 1308 | # Indicates whether the **xcomEntries** endpoint supports the **deserialize** 1309 | # flag. If set to ``False``, setting this flag in a request would result in a 1310 | # 400 Bad Request error. 1311 | # 1312 | # Variable: AIRFLOW__API__ENABLE_XCOM_DESERIALIZE_SUPPORT 1313 | # 1314 | enable_xcom_deserialize_support = False 1315 | 1316 | [lineage] 1317 | # what lineage backend to use 1318 | # 1319 | # Variable: AIRFLOW__LINEAGE__BACKEND 1320 | # 1321 | backend = 1322 | 1323 | [operators] 1324 | # The default owner assigned to each new operator, unless 1325 | # provided explicitly or passed via ``default_args`` 1326 | # 1327 | # Variable: AIRFLOW__OPERATORS__DEFAULT_OWNER 1328 | # 1329 | default_owner = airflow 1330 | 1331 | # The default value of attribute "deferrable" in operators and sensors. 1332 | # 1333 | # Variable: AIRFLOW__OPERATORS__DEFAULT_DEFERRABLE 1334 | # 1335 | default_deferrable = false 1336 | 1337 | # Indicates the default number of CPU units allocated to each operator when no specific CPU request 1338 | # is specified in the operator's configuration 1339 | # 1340 | # Variable: AIRFLOW__OPERATORS__DEFAULT_CPUS 1341 | # 1342 | default_cpus = 1 1343 | 1344 | # Indicates the default number of RAM allocated to each operator when no specific RAM request 1345 | # is specified in the operator's configuration 1346 | # 1347 | # Variable: AIRFLOW__OPERATORS__DEFAULT_RAM 1348 | # 1349 | default_ram = 512 1350 | 1351 | # Indicates the default number of disk storage allocated to each operator when no specific disk request 1352 | # is specified in the operator's configuration 1353 | # 1354 | # Variable: AIRFLOW__OPERATORS__DEFAULT_DISK 1355 | # 1356 | default_disk = 512 1357 | 1358 | # Indicates the default number of GPUs allocated to each operator when no specific GPUs request 1359 | # is specified in the operator's configuration 1360 | # 1361 | # Variable: AIRFLOW__OPERATORS__DEFAULT_GPUS 1362 | # 1363 | default_gpus = 0 1364 | 1365 | # Default queue that tasks get assigned to and that worker listen on. 
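# Illustrative values for the three ``access_control_allow_*`` CORS options above (editorial note;
# they are only needed when the REST API is called from a browser on a different origin, and the
# values below are placeholders to adapt, not defaults):
#
# Example: access_control_allow_headers = origin, content-type, accept
# Example: access_control_allow_methods = POST, GET, OPTIONS, DELETE
# Example: access_control_allow_origins = http://localhost:8080 http://localhost:3000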
1366 | # 1367 | # Variable: AIRFLOW__OPERATORS__DEFAULT_QUEUE 1368 | # 1369 | default_queue = default 1370 | 1371 | # Is allowed to pass additional/unused arguments (args, kwargs) to the BaseOperator operator. 1372 | # If set to ``False``, an exception will be thrown, 1373 | # otherwise only the console message will be displayed. 1374 | # 1375 | # Variable: AIRFLOW__OPERATORS__ALLOW_ILLEGAL_ARGUMENTS 1376 | # 1377 | allow_illegal_arguments = False 1378 | 1379 | [webserver] 1380 | # The message displayed when a user attempts to execute actions beyond their authorised privileges. 1381 | # 1382 | # Variable: AIRFLOW__WEBSERVER__ACCESS_DENIED_MESSAGE 1383 | # 1384 | access_denied_message = Access is Denied 1385 | 1386 | # Path of webserver config file used for configuring the webserver parameters 1387 | # 1388 | # Variable: AIRFLOW__WEBSERVER__CONFIG_FILE 1389 | # 1390 | config_file = /opt/airflow/webserver_config.py 1391 | 1392 | # The base url of your website: Airflow cannot guess what domain or CNAME you are using. 1393 | # This is used to create links in the Log Url column in the Browse - Task Instances menu, 1394 | # as well as in any automated emails sent by Airflow that contain links to your webserver. 1395 | # 1396 | # Variable: AIRFLOW__WEBSERVER__BASE_URL 1397 | # 1398 | base_url = http://localhost:8080 1399 | 1400 | # Default timezone to display all dates in the UI, can be UTC, system, or 1401 | # any IANA timezone string (e.g. **Europe/Amsterdam**). If left empty the 1402 | # default value of core/default_timezone will be used 1403 | # 1404 | # Example: default_ui_timezone = America/New_York 1405 | # 1406 | # Variable: AIRFLOW__WEBSERVER__DEFAULT_UI_TIMEZONE 1407 | # 1408 | default_ui_timezone = UTC 1409 | 1410 | # The ip specified when starting the web server 1411 | # 1412 | # Variable: AIRFLOW__WEBSERVER__WEB_SERVER_HOST 1413 | # 1414 | web_server_host = 0.0.0.0 1415 | 1416 | # The port on which to run the web server 1417 | # 1418 | # Variable: AIRFLOW__WEBSERVER__WEB_SERVER_PORT 1419 | # 1420 | web_server_port = 8080 1421 | 1422 | # Paths to the SSL certificate and key for the web server. When both are 1423 | # provided SSL will be enabled. This does not change the web server port. 1424 | # 1425 | # Variable: AIRFLOW__WEBSERVER__WEB_SERVER_SSL_CERT 1426 | # 1427 | web_server_ssl_cert = 1428 | 1429 | # Paths to the SSL certificate and key for the web server. When both are 1430 | # provided SSL will be enabled. This does not change the web server port. 1431 | # 1432 | # Variable: AIRFLOW__WEBSERVER__WEB_SERVER_SSL_KEY 1433 | # 1434 | web_server_ssl_key = 1435 | 1436 | # The type of backend used to store web session data, can be ``database`` or ``securecookie``. For the 1437 | # ``database`` backend, sessions are store in the database and they can be 1438 | # managed there (for example when you reset password of the user, all sessions for that user are 1439 | # deleted). For the ``securecookie`` backend, sessions are stored in encrypted cookies on the client 1440 | # side. The ``securecookie`` mechanism is 'lighter' than database backend, but sessions are not deleted 1441 | # when you reset password of the user, which means that other than waiting for expiry time, the only 1442 | # way to invalidate all sessions for a user is to change secret_key and restart webserver (which 1443 | # also invalidates and logs out all other user's sessions). 
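# Illustrative example for the two ``web_server_ssl_*`` options above (editorial note): pointing
# both at files mounted into the container enables HTTPS on the same port; the paths below are
# hypothetical.
#
# Example: web_server_ssl_cert = /opt/airflow/certs/airflow.crt
# Example: web_server_ssl_key = /opt/airflow/certs/airflow.key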
1444 | # 1445 | # When you are using ``database`` backend, make sure to keep your database session table small 1446 | # by periodically running ``airflow db clean --table session`` command, especially if you have 1447 | # automated API calls that will create a new session for each call rather than reuse the sessions 1448 | # stored in browser cookies. 1449 | # 1450 | # Example: session_backend = securecookie 1451 | # 1452 | # Variable: AIRFLOW__WEBSERVER__SESSION_BACKEND 1453 | # 1454 | session_backend = database 1455 | 1456 | # Number of seconds the webserver waits before killing gunicorn master that doesn't respond 1457 | # 1458 | # Variable: AIRFLOW__WEBSERVER__WEB_SERVER_MASTER_TIMEOUT 1459 | # 1460 | web_server_master_timeout = 120 1461 | 1462 | # Number of seconds the gunicorn webserver waits before timing out on a worker 1463 | # 1464 | # Variable: AIRFLOW__WEBSERVER__WEB_SERVER_WORKER_TIMEOUT 1465 | # 1466 | web_server_worker_timeout = 120 1467 | 1468 | # Number of workers to refresh at a time. When set to 0, worker refresh is 1469 | # disabled. When nonzero, airflow periodically refreshes webserver workers by 1470 | # bringing up new ones and killing old ones. 1471 | # 1472 | # Variable: AIRFLOW__WEBSERVER__WORKER_REFRESH_BATCH_SIZE 1473 | # 1474 | worker_refresh_batch_size = 1 1475 | 1476 | # Number of seconds to wait before refreshing a batch of workers. 1477 | # 1478 | # Variable: AIRFLOW__WEBSERVER__WORKER_REFRESH_INTERVAL 1479 | # 1480 | worker_refresh_interval = 6000 1481 | 1482 | # If set to ``True``, Airflow will track files in plugins_folder directory. When it detects changes, 1483 | # then reload the gunicorn. If set to ``True``, gunicorn starts without preloading, which is slower, 1484 | # uses more memory, and may cause race conditions. Avoid setting this to ``True`` in production. 1485 | # 1486 | # Variable: AIRFLOW__WEBSERVER__RELOAD_ON_PLUGIN_CHANGE 1487 | # 1488 | reload_on_plugin_change = False 1489 | 1490 | # Secret key used to run your flask app. It should be as random as possible. However, when running 1491 | # more than 1 instances of webserver, make sure all of them use the same ``secret_key`` otherwise 1492 | # one of them will error with "CSRF session token is missing". 1493 | # The webserver key is also used to authorize requests to Celery workers when logs are retrieved. 1494 | # The token generated using the secret key has a short expiry time though - make sure that time on 1495 | # ALL the machines that you run airflow components on is synchronized (for example using ntpd) 1496 | # otherwise you might get "forbidden" errors when the logs are accessed. 1497 | # 1498 | # Variable: AIRFLOW__WEBSERVER__SECRET_KEY 1499 | # 1500 | secret_key = L6oQmxmjwK0yQH+Ltg0JhQ== 1501 | 1502 | # Number of workers to run the Gunicorn web server 1503 | # 1504 | # Variable: AIRFLOW__WEBSERVER__WORKERS 1505 | # 1506 | workers = 4 1507 | 1508 | # The worker class gunicorn should use. Choices include 1509 | # ``sync`` (default), ``eventlet``, ``gevent``. 1510 | # 1511 | # .. warning:: 1512 | # 1513 | # When using ``gevent`` you might also want to set the ``_AIRFLOW_PATCH_GEVENT`` 1514 | # environment variable to ``"1"`` to make sure gevent patching is done as early as possible. 1515 | # 1516 | # Be careful to set ``_AIRFLOW_PATCH_GEVENT`` only on the web server as gevent patching may 1517 | # affect the scheduler behavior via the ``multiprocessing`` sockets module and cause crash. 
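# Editorial note on ``secret_key`` above: a value committed to version control (as here) should be
# treated as compromised and rotated; the description only requires that the key be random and
# identical across all webserver instances. One way to generate a fresh value, shown as a sketch:
#
# .. code-block:: python
#
#     import secrets
#
#     # paste the output into secret_key, or export it as AIRFLOW__WEBSERVER__SECRET_KEY
#     print(secrets.token_urlsafe(32))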
1518 | # 1519 | # See related Issues / PRs for more details: 1520 | # 1521 | # * https://github.com/benoitc/gunicorn/issues/2796 1522 | # * https://github.com/apache/airflow/issues/8212 1523 | # * https://github.com/apache/airflow/pull/28283 1524 | # 1525 | # Variable: AIRFLOW__WEBSERVER__WORKER_CLASS 1526 | # 1527 | worker_class = sync 1528 | 1529 | # Log files for the gunicorn webserver. '-' means log to stderr. 1530 | # 1531 | # Variable: AIRFLOW__WEBSERVER__ACCESS_LOGFILE 1532 | # 1533 | access_logfile = - 1534 | 1535 | # Log files for the gunicorn webserver. '-' means log to stderr. 1536 | # 1537 | # Variable: AIRFLOW__WEBSERVER__ERROR_LOGFILE 1538 | # 1539 | error_logfile = - 1540 | 1541 | # Access log format for gunicorn webserver. 1542 | # default format is ``%%(h)s %%(l)s %%(u)s %%(t)s "%%(r)s" %%(s)s %%(b)s "%%(f)s" "%%(a)s"`` 1543 | # See `Gunicorn Settings: 'access_log_format' Reference 1544 | # `__ for more details 1545 | # 1546 | # Variable: AIRFLOW__WEBSERVER__ACCESS_LOGFORMAT 1547 | # 1548 | access_logformat = 1549 | 1550 | # Expose the configuration file in the web server. Set to ``non-sensitive-only`` to show all values 1551 | # except those that have security implications. ``True`` shows all values. ``False`` hides the 1552 | # configuration completely. 1553 | # 1554 | # Variable: AIRFLOW__WEBSERVER__EXPOSE_CONFIG 1555 | # 1556 | expose_config = False 1557 | 1558 | # Expose hostname in the web server 1559 | # 1560 | # Variable: AIRFLOW__WEBSERVER__EXPOSE_HOSTNAME 1561 | # 1562 | expose_hostname = False 1563 | 1564 | # Expose stacktrace in the web server 1565 | # 1566 | # Variable: AIRFLOW__WEBSERVER__EXPOSE_STACKTRACE 1567 | # 1568 | expose_stacktrace = False 1569 | 1570 | # Default DAG view. Valid values are: ``grid``, ``graph``, ``duration``, ``gantt``, ``landing_times`` 1571 | # 1572 | # Variable: AIRFLOW__WEBSERVER__DAG_DEFAULT_VIEW 1573 | # 1574 | dag_default_view = grid 1575 | 1576 | # Default DAG orientation. Valid values are: 1577 | # ``LR`` (Left->Right), ``TB`` (Top->Bottom), ``RL`` (Right->Left), ``BT`` (Bottom->Top) 1578 | # 1579 | # Variable: AIRFLOW__WEBSERVER__DAG_ORIENTATION 1580 | # 1581 | dag_orientation = LR 1582 | 1583 | # Sorting order in grid view. Valid values are: ``topological``, ``hierarchical_alphabetical`` 1584 | # 1585 | # Variable: AIRFLOW__WEBSERVER__GRID_VIEW_SORTING_ORDER 1586 | # 1587 | grid_view_sorting_order = topological 1588 | 1589 | # The amount of time (in secs) webserver will wait for initial handshake 1590 | # while fetching logs from other worker machine 1591 | # 1592 | # Variable: AIRFLOW__WEBSERVER__LOG_FETCH_TIMEOUT_SEC 1593 | # 1594 | log_fetch_timeout_sec = 5 1595 | 1596 | # Time interval (in secs) to wait before next log fetching. 1597 | # 1598 | # Variable: AIRFLOW__WEBSERVER__LOG_FETCH_DELAY_SEC 1599 | # 1600 | log_fetch_delay_sec = 2 1601 | 1602 | # Distance away from page bottom to enable auto tailing. 1603 | # 1604 | # Variable: AIRFLOW__WEBSERVER__LOG_AUTO_TAILING_OFFSET 1605 | # 1606 | log_auto_tailing_offset = 30 1607 | 1608 | # Animation speed for auto tailing log display. 1609 | # 1610 | # Variable: AIRFLOW__WEBSERVER__LOG_ANIMATION_SPEED 1611 | # 1612 | log_animation_speed = 1000 1613 | 1614 | # By default, the webserver shows paused DAGs. 
Flip this to hide paused 1615 | # DAGs by default 1616 | # 1617 | # Variable: AIRFLOW__WEBSERVER__HIDE_PAUSED_DAGS_BY_DEFAULT 1618 | # 1619 | hide_paused_dags_by_default = False 1620 | 1621 | # Consistent page size across all listing views in the UI 1622 | # 1623 | # Variable: AIRFLOW__WEBSERVER__PAGE_SIZE 1624 | # 1625 | page_size = 100 1626 | 1627 | # Define the color of navigation bar 1628 | # 1629 | # Variable: AIRFLOW__WEBSERVER__NAVBAR_COLOR 1630 | # 1631 | navbar_color = #fff 1632 | 1633 | # Define the color of text in the navigation bar 1634 | # 1635 | # Variable: AIRFLOW__WEBSERVER__NAVBAR_TEXT_COLOR 1636 | # 1637 | navbar_text_color = #51504f 1638 | 1639 | # Define the color of navigation bar links when hovered 1640 | # 1641 | # Variable: AIRFLOW__WEBSERVER__NAVBAR_HOVER_COLOR 1642 | # 1643 | navbar_hover_color = #eee 1644 | 1645 | # Define the color of text in the navigation bar when hovered 1646 | # 1647 | # Variable: AIRFLOW__WEBSERVER__NAVBAR_TEXT_HOVER_COLOR 1648 | # 1649 | navbar_text_hover_color = #51504f 1650 | 1651 | # Define the color of the logo text 1652 | # 1653 | # Variable: AIRFLOW__WEBSERVER__NAVBAR_LOGO_TEXT_COLOR 1654 | # 1655 | navbar_logo_text_color = #51504f 1656 | 1657 | # Default dagrun to show in UI 1658 | # 1659 | # Variable: AIRFLOW__WEBSERVER__DEFAULT_DAG_RUN_DISPLAY_NUMBER 1660 | # 1661 | default_dag_run_display_number = 25 1662 | 1663 | # Enable werkzeug ``ProxyFix`` middleware for reverse proxy 1664 | # 1665 | # Variable: AIRFLOW__WEBSERVER__ENABLE_PROXY_FIX 1666 | # 1667 | enable_proxy_fix = False 1668 | 1669 | # Number of values to trust for ``X-Forwarded-For``. 1670 | # See `Werkzeug: X-Forwarded-For Proxy Fix 1671 | # `__ for more details. 1672 | # 1673 | # Variable: AIRFLOW__WEBSERVER__PROXY_FIX_X_FOR 1674 | # 1675 | proxy_fix_x_for = 1 1676 | 1677 | # Number of values to trust for ``X-Forwarded-Proto``. 1678 | # See `Werkzeug: X-Forwarded-For Proxy Fix 1679 | # `__ for more details. 1680 | # 1681 | # Variable: AIRFLOW__WEBSERVER__PROXY_FIX_X_PROTO 1682 | # 1683 | proxy_fix_x_proto = 1 1684 | 1685 | # Number of values to trust for ``X-Forwarded-Host``. 1686 | # See `Werkzeug: X-Forwarded-For Proxy Fix 1687 | # `__ for more details. 1688 | # 1689 | # Variable: AIRFLOW__WEBSERVER__PROXY_FIX_X_HOST 1690 | # 1691 | proxy_fix_x_host = 1 1692 | 1693 | # Number of values to trust for ``X-Forwarded-Port``. 1694 | # See `Werkzeug: X-Forwarded-For Proxy Fix 1695 | # `__ for more details. 1696 | # 1697 | # Variable: AIRFLOW__WEBSERVER__PROXY_FIX_X_PORT 1698 | # 1699 | proxy_fix_x_port = 1 1700 | 1701 | # Number of values to trust for ``X-Forwarded-Prefix``. 1702 | # See `Werkzeug: X-Forwarded-For Proxy Fix 1703 | # `__ for more details. 1704 | # 1705 | # Variable: AIRFLOW__WEBSERVER__PROXY_FIX_X_PREFIX 1706 | # 1707 | proxy_fix_x_prefix = 1 1708 | 1709 | # Set secure flag on session cookie 1710 | # 1711 | # Variable: AIRFLOW__WEBSERVER__COOKIE_SECURE 1712 | # 1713 | cookie_secure = False 1714 | 1715 | # Set samesite policy on session cookie 1716 | # 1717 | # Variable: AIRFLOW__WEBSERVER__COOKIE_SAMESITE 1718 | # 1719 | cookie_samesite = Lax 1720 | 1721 | # Default setting for wrap toggle on DAG code and TI log views. 
1722 | # 1723 | # Variable: AIRFLOW__WEBSERVER__DEFAULT_WRAP 1724 | # 1725 | default_wrap = False 1726 | 1727 | # Allow the UI to be rendered in a frame 1728 | # 1729 | # Variable: AIRFLOW__WEBSERVER__X_FRAME_ENABLED 1730 | # 1731 | x_frame_enabled = True 1732 | 1733 | # Send anonymous user activity to your analytics tool 1734 | # choose from ``google_analytics``, ``segment``, ``metarouter``, or ``matomo`` 1735 | # 1736 | # Variable: AIRFLOW__WEBSERVER__ANALYTICS_TOOL 1737 | # 1738 | # analytics_tool = 1739 | 1740 | # Unique ID of your account in the analytics tool 1741 | # 1742 | # Variable: AIRFLOW__WEBSERVER__ANALYTICS_ID 1743 | # 1744 | # analytics_id = 1745 | 1746 | # Your instances url, only applicable to Matomo. 1747 | # 1748 | # Example: analytics_url = https://your.matomo.instance.com/ 1749 | # 1750 | # Variable: AIRFLOW__WEBSERVER__ANALYTICS_URL 1751 | # 1752 | # analytics_url = 1753 | 1754 | # 'Recent Tasks' stats will show for old DagRuns if set 1755 | # 1756 | # Variable: AIRFLOW__WEBSERVER__SHOW_RECENT_STATS_FOR_COMPLETED_RUNS 1757 | # 1758 | show_recent_stats_for_completed_runs = True 1759 | 1760 | # The UI cookie lifetime in minutes. User will be logged out from UI after 1761 | # ``[webserver] session_lifetime_minutes`` of non-activity 1762 | # 1763 | # Variable: AIRFLOW__WEBSERVER__SESSION_LIFETIME_MINUTES 1764 | # 1765 | session_lifetime_minutes = 43200 1766 | 1767 | # Sets a custom page title for the DAGs overview page and site title for all pages 1768 | # 1769 | # Variable: AIRFLOW__WEBSERVER__INSTANCE_NAME 1770 | # 1771 | # instance_name = 1772 | 1773 | # Whether the custom page title for the DAGs overview page contains any Markup language 1774 | # 1775 | # Variable: AIRFLOW__WEBSERVER__INSTANCE_NAME_HAS_MARKUP 1776 | # 1777 | instance_name_has_markup = False 1778 | 1779 | # How frequently, in seconds, the DAG data will auto-refresh in graph or grid view 1780 | # when auto-refresh is turned on 1781 | # 1782 | # Variable: AIRFLOW__WEBSERVER__AUTO_REFRESH_INTERVAL 1783 | # 1784 | auto_refresh_interval = 3 1785 | 1786 | # Boolean for displaying warning for publicly viewable deployment 1787 | # 1788 | # Variable: AIRFLOW__WEBSERVER__WARN_DEPLOYMENT_EXPOSURE 1789 | # 1790 | warn_deployment_exposure = True 1791 | 1792 | # Comma separated string of view events to exclude from dag audit view. 1793 | # All other events will be added minus the ones passed here. 1794 | # The audit logs in the db will not be affected by this parameter. 1795 | # 1796 | # Example: audit_view_excluded_events = cli_task_run,running,success 1797 | # 1798 | # Variable: AIRFLOW__WEBSERVER__AUDIT_VIEW_EXCLUDED_EVENTS 1799 | # 1800 | # audit_view_excluded_events = 1801 | 1802 | # Comma separated string of view events to include in dag audit view. 1803 | # If passed, only these events will populate the dag audit view. 1804 | # The audit logs in the db will not be affected by this parameter. 1805 | # 1806 | # Example: audit_view_included_events = dagrun_cleared,failed 1807 | # 1808 | # Variable: AIRFLOW__WEBSERVER__AUDIT_VIEW_INCLUDED_EVENTS 1809 | # 1810 | # audit_view_included_events = 1811 | 1812 | # Boolean for running SwaggerUI in the webserver. 1813 | # 1814 | # Variable: AIRFLOW__WEBSERVER__ENABLE_SWAGGER_UI 1815 | # 1816 | enable_swagger_ui = True 1817 | 1818 | # Boolean for running Internal API in the webserver. 1819 | # 1820 | # Variable: AIRFLOW__WEBSERVER__RUN_INTERNAL_API 1821 | # 1822 | run_internal_api = False 1823 | 1824 | # The caching algorithm used by the webserver. 
Must be a valid hashlib function name. 1825 | # 1826 | # Example: caching_hash_method = sha256 1827 | # 1828 | # Variable: AIRFLOW__WEBSERVER__CACHING_HASH_METHOD 1829 | # 1830 | caching_hash_method = md5 1831 | 1832 | # Behavior of the trigger DAG run button for DAGs without params. ``False`` to skip and trigger 1833 | # without displaying a form to add a **dag_run.conf**, ``True`` to always display the form. 1834 | # The form is displayed always if parameters are defined. 1835 | # 1836 | # Variable: AIRFLOW__WEBSERVER__SHOW_TRIGGER_FORM_IF_NO_PARAMS 1837 | # 1838 | show_trigger_form_if_no_params = False 1839 | 1840 | # Number of recent DAG run configurations in the selector on the trigger web form. 1841 | # 1842 | # Example: num_recent_configurations_for_trigger = 10 1843 | # 1844 | # Variable: AIRFLOW__WEBSERVER__NUM_RECENT_CONFIGURATIONS_FOR_TRIGGER 1845 | # 1846 | num_recent_configurations_for_trigger = 5 1847 | 1848 | # A DAG author is able to provide any raw HTML into ``doc_md`` or params description in 1849 | # ``description_md`` for text formatting. This is including potentially unsafe javascript. 1850 | # Displaying the DAG or trigger form in web UI provides the DAG author the potential to 1851 | # inject malicious code into clients browsers. To ensure the web UI is safe by default, 1852 | # raw HTML is disabled by default. If you trust your DAG authors, you can enable HTML 1853 | # support in markdown by setting this option to ``True``. 1854 | # 1855 | # This parameter also enables the deprecated fields ``description_html`` and 1856 | # ``custom_html_form`` in DAG params until the feature is removed in a future version. 1857 | # 1858 | # Example: allow_raw_html_descriptions = False 1859 | # 1860 | # Variable: AIRFLOW__WEBSERVER__ALLOW_RAW_HTML_DESCRIPTIONS 1861 | # 1862 | allow_raw_html_descriptions = False 1863 | 1864 | # The maximum size of the request payload (in MB) that can be sent. 1865 | # 1866 | # Variable: AIRFLOW__WEBSERVER__ALLOWED_PAYLOAD_SIZE 1867 | # 1868 | allowed_payload_size = 1.0 1869 | 1870 | # Require confirmation when changing a DAG in the web UI. This is to prevent accidental changes 1871 | # to a DAG that may be running on sensitive environments like production. 1872 | # When set to ``True``, confirmation dialog will be shown when a user tries to Pause/Unpause, 1873 | # Trigger a DAG 1874 | # 1875 | # Variable: AIRFLOW__WEBSERVER__REQUIRE_CONFIRMATION_DAG_CHANGE 1876 | # 1877 | require_confirmation_dag_change = False 1878 | 1879 | [email] 1880 | # Configuration email backend and whether to 1881 | # send email alerts on retry or failure 1882 | 1883 | # Email backend to use 1884 | # 1885 | # Variable: AIRFLOW__EMAIL__EMAIL_BACKEND 1886 | # 1887 | email_backend = airflow.utils.email.send_email_smtp 1888 | 1889 | # Email connection to use 1890 | # 1891 | # Variable: AIRFLOW__EMAIL__EMAIL_CONN_ID 1892 | # 1893 | email_conn_id = smtp_default 1894 | 1895 | # Whether email alerts should be sent when a task is retried 1896 | # 1897 | # Variable: AIRFLOW__EMAIL__DEFAULT_EMAIL_ON_RETRY 1898 | # 1899 | default_email_on_retry = True 1900 | 1901 | # Whether email alerts should be sent when a task failed 1902 | # 1903 | # Variable: AIRFLOW__EMAIL__DEFAULT_EMAIL_ON_FAILURE 1904 | # 1905 | default_email_on_failure = True 1906 | 1907 | # File that will be used as the template for Email subject (which will be rendered using Jinja2). 1908 | # If not set, Airflow uses a base template. 
1909 | # 1910 | # Example: subject_template = /path/to/my_subject_template_file 1911 | # 1912 | # Variable: AIRFLOW__EMAIL__SUBJECT_TEMPLATE 1913 | # 1914 | # subject_template = 1915 | 1916 | # File that will be used as the template for Email content (which will be rendered using Jinja2). 1917 | # If not set, Airflow uses a base template. 1918 | # 1919 | # Example: html_content_template = /path/to/my_html_content_template_file 1920 | # 1921 | # Variable: AIRFLOW__EMAIL__HTML_CONTENT_TEMPLATE 1922 | # 1923 | # html_content_template = 1924 | 1925 | # Email address that will be used as sender address. 1926 | # It can either be raw email or the complete address in a format ``Sender Name `` 1927 | # 1928 | # Example: from_email = Airflow 1929 | # 1930 | # Variable: AIRFLOW__EMAIL__FROM_EMAIL 1931 | # 1932 | # from_email = 1933 | 1934 | # ssl context to use when using SMTP and IMAP SSL connections. By default, the context is "default" 1935 | # which sets it to ``ssl.create_default_context()`` which provides the right balance between 1936 | # compatibility and security, it however requires that certificates in your operating system are 1937 | # updated and that SMTP/IMAP servers of yours have valid certificates that have corresponding public 1938 | # keys installed on your machines. You can switch it to "none" if you want to disable checking 1939 | # of the certificates, but it is not recommended as it allows MITM (man-in-the-middle) attacks 1940 | # if your infrastructure is not sufficiently secured. It should only be set temporarily while you 1941 | # are fixing your certificate configuration. This can be typically done by upgrading to newer 1942 | # version of the operating system you run Airflow components on,by upgrading/refreshing proper 1943 | # certificates in the OS or by updating certificates for your mail servers. 1944 | # 1945 | # Example: ssl_context = default 1946 | # 1947 | # Variable: AIRFLOW__EMAIL__SSL_CONTEXT 1948 | # 1949 | ssl_context = default 1950 | 1951 | [smtp] 1952 | # If you want airflow to send emails on retries, failure, and you want to use 1953 | # the airflow.utils.email.send_email_smtp function, you have to configure an 1954 | # smtp server here 1955 | 1956 | # Specifies the host server address used by Airflow when sending out email notifications via SMTP. 1957 | # 1958 | # Variable: AIRFLOW__SMTP__SMTP_HOST 1959 | # 1960 | smtp_host = localhost 1961 | 1962 | # Determines whether to use the STARTTLS command when connecting to the SMTP server. 1963 | # 1964 | # Variable: AIRFLOW__SMTP__SMTP_STARTTLS 1965 | # 1966 | smtp_starttls = True 1967 | 1968 | # Determines whether to use an SSL connection when talking to the SMTP server. 1969 | # 1970 | # Variable: AIRFLOW__SMTP__SMTP_SSL 1971 | # 1972 | smtp_ssl = False 1973 | 1974 | # Username to authenticate when connecting to smtp server. 1975 | # 1976 | # Example: smtp_user = airflow 1977 | # 1978 | # Variable: AIRFLOW__SMTP__SMTP_USER 1979 | # 1980 | # smtp_user = 1981 | 1982 | # Password to authenticate when connecting to smtp server. 1983 | # 1984 | # Example: smtp_password = airflow 1985 | # 1986 | # Variable: AIRFLOW__SMTP__SMTP_PASSWORD 1987 | # 1988 | # smtp_password = 1989 | 1990 | # Defines the port number on which Airflow connects to the SMTP server to send email notifications. 1991 | # 1992 | # Variable: AIRFLOW__SMTP__SMTP_PORT 1993 | # 1994 | smtp_port = 25 1995 | 1996 | # Specifies the default **from** email address used when Airflow sends email notifications. 
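# Illustrative SMTP values (editorial note, not part of the stock configuration): a typical
# STARTTLS setup against an external relay might look like the following; the host, user and
# password are placeholders and should come from a secrets backend rather than this file.
#
# Example: smtp_host = smtp.example.com
# Example: smtp_starttls = True
# Example: smtp_ssl = False
# Example: smtp_port = 587
# Example: smtp_user = alerts@example.com
# Example: smtp_password = <app-password-from-secrets-backend>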
1997 | # 1998 | # Variable: AIRFLOW__SMTP__SMTP_MAIL_FROM 1999 | # 2000 | smtp_mail_from = airflow@example.com 2001 | 2002 | # Determines the maximum time (in seconds) the Apache Airflow system will wait for a 2003 | # connection to the SMTP server to be established. 2004 | # 2005 | # Variable: AIRFLOW__SMTP__SMTP_TIMEOUT 2006 | # 2007 | smtp_timeout = 30 2008 | 2009 | # Defines the maximum number of times Airflow will attempt to connect to the SMTP server. 2010 | # 2011 | # Variable: AIRFLOW__SMTP__SMTP_RETRY_LIMIT 2012 | # 2013 | smtp_retry_limit = 5 2014 | 2015 | [sentry] 2016 | # `Sentry `__ integration. Here you can supply 2017 | # additional configuration options based on the Python platform. 2018 | # See `Python / Configuration / Basic Options 2019 | # `__ for more details. 2020 | # Unsupported options: ``integrations``, ``in_app_include``, ``in_app_exclude``, 2021 | # ``ignore_errors``, ``before_breadcrumb``, ``transport``. 2022 | 2023 | # Enable error reporting to Sentry 2024 | # 2025 | # Variable: AIRFLOW__SENTRY__SENTRY_ON 2026 | # 2027 | sentry_on = false 2028 | 2029 | # 2030 | # Variable: AIRFLOW__SENTRY__SENTRY_DSN 2031 | # 2032 | sentry_dsn = 2033 | 2034 | # Dotted path to a before_send function that the sentry SDK should be configured to use. 2035 | # 2036 | # Variable: AIRFLOW__SENTRY__BEFORE_SEND 2037 | # 2038 | # before_send = 2039 | 2040 | [scheduler] 2041 | # Task instances listen for external kill signal (when you clear tasks 2042 | # from the CLI or the UI), this defines the frequency at which they should 2043 | # listen (in seconds). 2044 | # 2045 | # Variable: AIRFLOW__SCHEDULER__JOB_HEARTBEAT_SEC 2046 | # 2047 | job_heartbeat_sec = 5 2048 | 2049 | # The scheduler constantly tries to trigger new tasks (look at the 2050 | # scheduler section in the docs for more information). This defines 2051 | # how often the scheduler should run (in seconds). 2052 | # 2053 | # Variable: AIRFLOW__SCHEDULER__SCHEDULER_HEARTBEAT_SEC 2054 | # 2055 | scheduler_heartbeat_sec = 5 2056 | 2057 | # The frequency (in seconds) at which the LocalTaskJob should send heartbeat signals to the 2058 | # scheduler to notify it's still alive. If this value is set to 0, the heartbeat interval will default 2059 | # to the value of ``[scheduler] scheduler_zombie_task_threshold``. 2060 | # 2061 | # Variable: AIRFLOW__SCHEDULER__LOCAL_TASK_JOB_HEARTBEAT_SEC 2062 | # 2063 | local_task_job_heartbeat_sec = 0 2064 | 2065 | # The number of times to try to schedule each DAG file 2066 | # -1 indicates unlimited number 2067 | # 2068 | # Variable: AIRFLOW__SCHEDULER__NUM_RUNS 2069 | # 2070 | num_runs = -1 2071 | 2072 | # Controls how long the scheduler will sleep between loops, but if there was nothing to do 2073 | # in the loop. i.e. if it scheduled something then it will start the next loop 2074 | # iteration straight away. 2075 | # 2076 | # Variable: AIRFLOW__SCHEDULER__SCHEDULER_IDLE_SLEEP_TIME 2077 | # 2078 | scheduler_idle_sleep_time = 1 2079 | 2080 | # Number of seconds after which a DAG file is parsed. The DAG file is parsed every 2081 | # ``[scheduler] min_file_process_interval`` number of seconds. Updates to DAGs are reflected after 2082 | # this interval. Keeping this number low will increase CPU usage. 
2083 | # 2084 | # Variable: AIRFLOW__SCHEDULER__MIN_FILE_PROCESS_INTERVAL 2085 | # 2086 | min_file_process_interval = 30 2087 | 2088 | # How often (in seconds) to check for stale DAGs (DAGs which are no longer present in 2089 | # the expected files) which should be deactivated, as well as datasets that are no longer 2090 | # referenced and should be marked as orphaned. 2091 | # 2092 | # Variable: AIRFLOW__SCHEDULER__PARSING_CLEANUP_INTERVAL 2093 | # 2094 | parsing_cleanup_interval = 60 2095 | 2096 | # How long (in seconds) to wait after we have re-parsed a DAG file before deactivating stale 2097 | # DAGs (DAGs which are no longer present in the expected files). The reason why we need 2098 | # this threshold is to account for the time between when the file is parsed and when the 2099 | # DAG is loaded. The absolute maximum that this could take is ``[core] dag_file_processor_timeout``, 2100 | # but when you have a long timeout configured, it results in a significant delay in the 2101 | # deactivation of stale dags. 2102 | # 2103 | # Variable: AIRFLOW__SCHEDULER__STALE_DAG_THRESHOLD 2104 | # 2105 | stale_dag_threshold = 50 2106 | 2107 | # How often (in seconds) to scan the DAGs directory for new files. Default to 5 minutes. 2108 | # 2109 | # Variable: AIRFLOW__SCHEDULER__DAG_DIR_LIST_INTERVAL 2110 | # 2111 | dag_dir_list_interval = 300 2112 | 2113 | # How often should stats be printed to the logs. Setting to 0 will disable printing stats 2114 | # 2115 | # Variable: AIRFLOW__SCHEDULER__PRINT_STATS_INTERVAL 2116 | # 2117 | print_stats_interval = 30 2118 | 2119 | # How often (in seconds) should pool usage stats be sent to StatsD (if statsd_on is enabled) 2120 | # 2121 | # Variable: AIRFLOW__SCHEDULER__POOL_METRICS_INTERVAL 2122 | # 2123 | pool_metrics_interval = 5.0 2124 | 2125 | # If the last scheduler heartbeat happened more than ``[scheduler] scheduler_health_check_threshold`` 2126 | # ago (in seconds), scheduler is considered unhealthy. 2127 | # This is used by the health check in the **/health** endpoint and in ``airflow jobs check`` CLI 2128 | # for SchedulerJob. 
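# Editorial note tying together the two parsing intervals above: with ``dag_dir_list_interval = 300``
# a brand-new DAG file is typically discovered within about five minutes, while with
# ``min_file_process_interval = 30`` edits to an already-known file are re-parsed roughly every
# 30 seconds; lowering either value trades faster feedback for higher scheduler CPU usage.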
2129 | # 2130 | # Variable: AIRFLOW__SCHEDULER__SCHEDULER_HEALTH_CHECK_THRESHOLD 2131 | # 2132 | scheduler_health_check_threshold = 30 2133 | 2134 | # When you start a scheduler, airflow starts a tiny web server 2135 | # subprocess to serve a health check if this is set to ``True`` 2136 | # 2137 | # Variable: AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK 2138 | # 2139 | enable_health_check = False 2140 | 2141 | # When you start a scheduler, airflow starts a tiny web server 2142 | # subprocess to serve a health check on this host 2143 | # 2144 | # Variable: AIRFLOW__SCHEDULER__SCHEDULER_HEALTH_CHECK_SERVER_HOST 2145 | # 2146 | scheduler_health_check_server_host = 0.0.0.0 2147 | 2148 | # When you start a scheduler, airflow starts a tiny web server 2149 | # subprocess to serve a health check on this port 2150 | # 2151 | # Variable: AIRFLOW__SCHEDULER__SCHEDULER_HEALTH_CHECK_SERVER_PORT 2152 | # 2153 | scheduler_health_check_server_port = 8974 2154 | 2155 | # How often (in seconds) should the scheduler check for orphaned tasks and SchedulerJobs 2156 | # 2157 | # Variable: AIRFLOW__SCHEDULER__ORPHANED_TASKS_CHECK_INTERVAL 2158 | # 2159 | orphaned_tasks_check_interval = 300.0 2160 | 2161 | # Determines the directory where logs for the child processes of the scheduler will be stored 2162 | # 2163 | # Variable: AIRFLOW__SCHEDULER__CHILD_PROCESS_LOG_DIRECTORY 2164 | # 2165 | child_process_log_directory = /opt/airflow/logs/scheduler 2166 | 2167 | # Local task jobs periodically heartbeat to the DB. If the job has 2168 | # not heartbeat in this many seconds, the scheduler will mark the 2169 | # associated task instance as failed and will re-schedule the task. 2170 | # 2171 | # Variable: AIRFLOW__SCHEDULER__SCHEDULER_ZOMBIE_TASK_THRESHOLD 2172 | # 2173 | scheduler_zombie_task_threshold = 300 2174 | 2175 | # How often (in seconds) should the scheduler check for zombie tasks. 2176 | # 2177 | # Variable: AIRFLOW__SCHEDULER__ZOMBIE_DETECTION_INTERVAL 2178 | # 2179 | zombie_detection_interval = 10.0 2180 | 2181 | # Turn off scheduler catchup by setting this to ``False``. 2182 | # Default behavior is unchanged and 2183 | # Command Line Backfills still work, but the scheduler 2184 | # will not do scheduler catchup if this is ``False``, 2185 | # however it can be set on a per DAG basis in the 2186 | # DAG definition (catchup) 2187 | # 2188 | # Variable: AIRFLOW__SCHEDULER__CATCHUP_BY_DEFAULT 2189 | # 2190 | catchup_by_default = True 2191 | 2192 | # Setting this to ``True`` will make first task instance of a task 2193 | # ignore depends_on_past setting. A task instance will be considered 2194 | # as the first task instance of a task when there is no task instance 2195 | # in the DB with an execution_date earlier than it., i.e. no manual marking 2196 | # success will be needed for a newly added task to be scheduled. 2197 | # 2198 | # Variable: AIRFLOW__SCHEDULER__IGNORE_FIRST_DEPENDS_ON_PAST_BY_DEFAULT 2199 | # 2200 | ignore_first_depends_on_past_by_default = True 2201 | 2202 | # This changes the batch size of queries in the scheduling main loop. 2203 | # This should not be greater than ``[core] parallelism``. 2204 | # If this is too high, SQL query performance may be impacted by 2205 | # complexity of query predicate, and/or excessive locking. 2206 | # Additionally, you may hit the maximum allowable query length for your db. 
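# Editorial note on the health-check options above: when ``enable_health_check = True`` the
# scheduler starts a small HTTP server on ``scheduler_health_check_server_host``:
# ``scheduler_health_check_server_port`` (0.0.0.0:8974 with the values above), so a liveness probe
# or external monitor can poll it over HTTP, e.g. ``curl http://localhost:8974/health`` from inside
# the scheduler container (check the Airflow docs for the exact path in the deployed version).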
2207 | # Set this to 0 to use the value of ``[core] parallelism`` 2208 | # 2209 | # Variable: AIRFLOW__SCHEDULER__MAX_TIS_PER_QUERY 2210 | # 2211 | max_tis_per_query = 16 2212 | 2213 | # Should the scheduler issue ``SELECT ... FOR UPDATE`` in relevant queries. 2214 | # If this is set to ``False`` then you should not run more than a single 2215 | # scheduler at once 2216 | # 2217 | # Variable: AIRFLOW__SCHEDULER__USE_ROW_LEVEL_LOCKING 2218 | # 2219 | use_row_level_locking = True 2220 | 2221 | # Max number of DAGs to create DagRuns for per scheduler loop. 2222 | # 2223 | # Variable: AIRFLOW__SCHEDULER__MAX_DAGRUNS_TO_CREATE_PER_LOOP 2224 | # 2225 | max_dagruns_to_create_per_loop = 10 2226 | 2227 | # How many DagRuns should a scheduler examine (and lock) when scheduling 2228 | # and queuing tasks. 2229 | # 2230 | # Variable: AIRFLOW__SCHEDULER__MAX_DAGRUNS_PER_LOOP_TO_SCHEDULE 2231 | # 2232 | max_dagruns_per_loop_to_schedule = 20 2233 | 2234 | # Should the Task supervisor process perform a "mini scheduler" to attempt to schedule more tasks of the 2235 | # same DAG. Leaving this on will mean tasks in the same DAG execute quicker, but might starve out other 2236 | # dags in some circumstances 2237 | # 2238 | # Variable: AIRFLOW__SCHEDULER__SCHEDULE_AFTER_TASK_EXECUTION 2239 | # 2240 | schedule_after_task_execution = True 2241 | 2242 | # The scheduler reads dag files to extract the airflow modules that are going to be used, 2243 | # and imports them ahead of time to avoid having to re-do it for each parsing process. 2244 | # This flag can be set to ``False`` to disable this behavior in case an airflow module needs 2245 | # to be freshly imported each time (at the cost of increased DAG parsing time). 2246 | # 2247 | # Variable: AIRFLOW__SCHEDULER__PARSING_PRE_IMPORT_MODULES 2248 | # 2249 | parsing_pre_import_modules = True 2250 | 2251 | # The scheduler can run multiple processes in parallel to parse dags. 2252 | # This defines how many processes will run. 2253 | # 2254 | # Variable: AIRFLOW__SCHEDULER__PARSING_PROCESSES 2255 | # 2256 | parsing_processes = 2 2257 | 2258 | # One of ``modified_time``, ``random_seeded_by_host`` and ``alphabetical``. 2259 | # The scheduler will list and sort the dag files to decide the parsing order. 2260 | # 2261 | # * ``modified_time``: Sort by modified time of the files. This is useful on large scale to parse the 2262 | # recently modified DAGs first. 2263 | # * ``random_seeded_by_host``: Sort randomly across multiple Schedulers but with same order on the 2264 | # same host. This is useful when running with Scheduler in HA mode where each scheduler can 2265 | # parse different DAG files. 2266 | # * ``alphabetical``: Sort by filename 2267 | # 2268 | # Variable: AIRFLOW__SCHEDULER__FILE_PARSING_SORT_MODE 2269 | # 2270 | file_parsing_sort_mode = modified_time 2271 | 2272 | # Whether the dag processor is running as a standalone process or it is a subprocess of a scheduler 2273 | # job. 2274 | # 2275 | # Variable: AIRFLOW__SCHEDULER__STANDALONE_DAG_PROCESSOR 2276 | # 2277 | standalone_dag_processor = False 2278 | 2279 | # Only applicable if ``[scheduler] standalone_dag_processor`` is true and callbacks are stored 2280 | # in database. Contains maximum number of callbacks that are fetched during a single loop. 2281 | # 2282 | # Variable: AIRFLOW__SCHEDULER__MAX_CALLBACKS_PER_LOOP 2283 | # 2284 | max_callbacks_per_loop = 20 2285 | 2286 | # Only applicable if ``[scheduler] standalone_dag_processor`` is true. 
2287 | # Time in seconds after which dags, which were not updated by Dag Processor are deactivated. 2288 | # 2289 | # Variable: AIRFLOW__SCHEDULER__DAG_STALE_NOT_SEEN_DURATION 2290 | # 2291 | dag_stale_not_seen_duration = 600 2292 | 2293 | # Turn off scheduler use of cron intervals by setting this to ``False``. 2294 | # DAGs submitted manually in the web UI or with trigger_dag will still run. 2295 | # 2296 | # Variable: AIRFLOW__SCHEDULER__USE_JOB_SCHEDULE 2297 | # 2298 | use_job_schedule = True 2299 | 2300 | # Allow externally triggered DagRuns for Execution Dates in the future 2301 | # Only has effect if schedule_interval is set to None in DAG 2302 | # 2303 | # Variable: AIRFLOW__SCHEDULER__ALLOW_TRIGGER_IN_FUTURE 2304 | # 2305 | allow_trigger_in_future = False 2306 | 2307 | # How often to check for expired trigger requests that have not run yet. 2308 | # 2309 | # Variable: AIRFLOW__SCHEDULER__TRIGGER_TIMEOUT_CHECK_INTERVAL 2310 | # 2311 | trigger_timeout_check_interval = 15 2312 | 2313 | # Amount of time a task can be in the queued state before being retried or set to failed. 2314 | # 2315 | # Variable: AIRFLOW__SCHEDULER__TASK_QUEUED_TIMEOUT 2316 | # 2317 | task_queued_timeout = 600.0 2318 | 2319 | # How often to check for tasks that have been in the queued state for 2320 | # longer than ``[scheduler] task_queued_timeout``. 2321 | # 2322 | # Variable: AIRFLOW__SCHEDULER__TASK_QUEUED_TIMEOUT_CHECK_INTERVAL 2323 | # 2324 | task_queued_timeout_check_interval = 120.0 2325 | 2326 | # The run_id pattern used to verify the validity of user input to the run_id parameter when 2327 | # triggering a DAG. This pattern cannot change the pattern used by scheduler to generate run_id 2328 | # for scheduled DAG runs or DAG runs triggered without changing the run_id parameter. 2329 | # 2330 | # Variable: AIRFLOW__SCHEDULER__ALLOWED_RUN_ID_PATTERN 2331 | # 2332 | allowed_run_id_pattern = ^[A-Za-z0-9_.~:+-]+$ 2333 | 2334 | # Whether to create DAG runs that span an interval or one single point in time for cron schedules, when 2335 | # a cron string is provided to ``schedule`` argument of a DAG. 2336 | # 2337 | # * ``True``: **CronDataIntervalTimetable** is used, which is suitable 2338 | # for DAGs with well-defined data interval. You get contiguous intervals from the end of the previous 2339 | # interval up to the scheduled datetime. 2340 | # * ``False``: **CronTriggerTimetable** is used, which is closer to the behavior of cron itself. 2341 | # 2342 | # Notably, for **CronTriggerTimetable**, the logical date is the same as the time the DAG Run will 2343 | # try to schedule, while for **CronDataIntervalTimetable**, the logical date is the beginning of 2344 | # the data interval, but the DAG Run will try to schedule at the end of the data interval. 2345 | # 2346 | # Variable: AIRFLOW__SCHEDULER__CREATE_CRON_DATA_INTERVALS 2347 | # 2348 | create_cron_data_intervals = True 2349 | 2350 | [triggerer] 2351 | # How many triggers a single Triggerer will run at once, by default. 2352 | # 2353 | # Variable: AIRFLOW__TRIGGERER__DEFAULT_CAPACITY 2354 | # 2355 | default_capacity = 1000 2356 | 2357 | # How often to heartbeat the Triggerer job to ensure it hasn't been killed. 2358 | # 2359 | # Variable: AIRFLOW__TRIGGERER__JOB_HEARTBEAT_SEC 2360 | # 2361 | job_heartbeat_sec = 5 2362 | 2363 | # If the last triggerer heartbeat happened more than ``[triggerer] triggerer_health_check_threshold`` 2364 | # ago (in seconds), triggerer is considered unhealthy. 
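# Illustrative sketch (editorial note): the effect of ``allowed_run_id_pattern`` above can be
# checked locally with plain ``re`` before triggering a DAG with a custom run_id:
#
# .. code-block:: python
#
#     import re
#
#     ALLOWED = re.compile(r"^[A-Za-z0-9_.~:+-]+$")
#     print(bool(ALLOWED.match("manual__2024-01-01T00:00:00+00:00")))  # True
#     print(bool(ALLOWED.match("bad run id with spaces")))             # False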
2365 | # This is used by the health check in the **/health** endpoint and in ``airflow jobs check`` CLI 2366 | # for TriggererJob. 2367 | # 2368 | # Variable: AIRFLOW__TRIGGERER__TRIGGERER_HEALTH_CHECK_THRESHOLD 2369 | # 2370 | triggerer_health_check_threshold = 30 2371 | 2372 | [kerberos] 2373 | # Location of your ccache file once kinit has been performed. 2374 | # 2375 | # Variable: AIRFLOW__KERBEROS__CCACHE 2376 | # 2377 | ccache = /tmp/airflow_krb5_ccache 2378 | 2379 | # gets augmented with fqdn 2380 | # 2381 | # Variable: AIRFLOW__KERBEROS__PRINCIPAL 2382 | # 2383 | principal = airflow 2384 | 2385 | # Determines the frequency at which initialization or re-initialization processes occur. 2386 | # 2387 | # Variable: AIRFLOW__KERBEROS__REINIT_FREQUENCY 2388 | # 2389 | reinit_frequency = 3600 2390 | 2391 | # Path to the kinit executable 2392 | # 2393 | # Variable: AIRFLOW__KERBEROS__KINIT_PATH 2394 | # 2395 | kinit_path = kinit 2396 | 2397 | # Designates the path to the Kerberos keytab file for the Airflow user 2398 | # 2399 | # Variable: AIRFLOW__KERBEROS__KEYTAB 2400 | # 2401 | keytab = airflow.keytab 2402 | 2403 | # Allow to disable ticket forwardability. 2404 | # 2405 | # Variable: AIRFLOW__KERBEROS__FORWARDABLE 2406 | # 2407 | forwardable = True 2408 | 2409 | # Allow to remove source IP from token, useful when using token behind NATted Docker host. 2410 | # 2411 | # Variable: AIRFLOW__KERBEROS__INCLUDE_IP 2412 | # 2413 | include_ip = True 2414 | 2415 | [sensors] 2416 | # Sensor default timeout, 7 days by default (7 * 24 * 60 * 60). 2417 | # 2418 | # Variable: AIRFLOW__SENSORS__DEFAULT_TIMEOUT 2419 | # 2420 | default_timeout = 604800 2421 | 2422 | [usage_data_collection] 2423 | # Airflow integrates `Scarf `__ to collect basic platform and usage data 2424 | # during operation. This data assists Airflow maintainers in better understanding how Airflow is used. 2425 | # Insights gained from this telemetry are critical for prioritizing patches, minor releases, and 2426 | # security fixes. Additionally, this information supports key decisions related to the development road map. 2427 | # Check the FAQ doc for more information on what data is collected. 2428 | # 2429 | # Deployments can opt-out of analytics by setting the ``enabled`` option 2430 | # to ``False``, or the ``SCARF_ANALYTICS=false`` environment variable. 2431 | # Individual users can easily opt-out of analytics in various ways documented in the 2432 | # `Scarf Do Not Track docs `__. 2433 | 2434 | # Enable or disable usage data collection and sending. 2435 | # 2436 | # Variable: AIRFLOW__USAGE_DATA_COLLECTION__ENABLED 2437 | # 2438 | enabled = True 2439 | 2440 | [aws] 2441 | # This section contains settings for Amazon Web Services (AWS) integration. 2442 | 2443 | # session_factory = 2444 | cloudwatch_task_handler_json_serializer = airflow.providers.amazon.aws.log.cloudwatch_task_handler.json_serialize_legacy 2445 | 2446 | [aws_batch_executor] 2447 | # This section only applies if you are using the AwsBatchExecutor in 2448 | # Airflow's ``[core]`` configuration. 
2449 | # For more information on any of these execution parameters, see the link below: 2450 | # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/batch.html#Batch.Client.submit_job 2451 | # For boto3 credential management, see 2452 | # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html 2453 | 2454 | conn_id = aws_default 2455 | # region_name = 2456 | max_submit_job_attempts = 3 2457 | check_health_on_startup = True 2458 | # job_name = 2459 | # job_queue = 2460 | # job_definition = 2461 | # submit_job_kwargs = 2462 | 2463 | [aws_ecs_executor] 2464 | # This section only applies if you are using the AwsEcsExecutor in 2465 | # Airflow's ``[core]`` configuration. 2466 | # For more information on any of these execution parameters, see the link below: 2467 | # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/ecs/client/run_task.html 2468 | # For boto3 credential management, see 2469 | # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html 2470 | 2471 | conn_id = aws_default 2472 | # region_name = 2473 | assign_public_ip = False 2474 | # cluster = 2475 | # capacity_provider_strategy = 2476 | # container_name = 2477 | # launch_type = 2478 | platform_version = LATEST 2479 | # security_groups = 2480 | # subnets = 2481 | # task_definition = 2482 | max_run_task_attempts = 3 2483 | # run_task_kwargs = 2484 | check_health_on_startup = True 2485 | 2486 | [aws_auth_manager] 2487 | # This section only applies if you are using the AwsAuthManager. In other words, if you set 2488 | # ``[core] auth_manager = airflow.providers.amazon.aws.auth_manager.aws_auth_manager.AwsAuthManager`` in 2489 | # Airflow's configuration. 2490 | 2491 | enable = False 2492 | conn_id = aws_default 2493 | # region_name = 2494 | # saml_metadata_url = 2495 | # avp_policy_store_id = 2496 | 2497 | [celery_kubernetes_executor] 2498 | # This section only applies if you are using the ``CeleryKubernetesExecutor`` in 2499 | # ``[core]`` section above 2500 | 2501 | # Define when to send a task to ``KubernetesExecutor`` when using ``CeleryKubernetesExecutor``. 2502 | # When the queue of a task is the value of ``kubernetes_queue`` (default ``kubernetes``), 2503 | # the task is executed via ``KubernetesExecutor``, 2504 | # otherwise via ``CeleryExecutor`` 2505 | # 2506 | # Variable: AIRFLOW__CELERY_KUBERNETES_EXECUTOR__KUBERNETES_QUEUE 2507 | # 2508 | kubernetes_queue = kubernetes 2509 | 2510 | [celery] 2511 | # This section only applies if you are using the CeleryExecutor in 2512 | # ``[core]`` section above 2513 | 2514 | # The app name that will be used by celery 2515 | # 2516 | # Variable: AIRFLOW__CELERY__CELERY_APP_NAME 2517 | # 2518 | celery_app_name = airflow.providers.celery.executors.celery_executor 2519 | 2520 | # The concurrency that will be used when starting workers with the 2521 | # ``airflow celery worker`` command. This defines the number of task instances that 2522 | # a worker will take, so size up your workers based on the resources on 2523 | # your worker box and the nature of your tasks 2524 | # 2525 | # Variable: AIRFLOW__CELERY__WORKER_CONCURRENCY 2526 | # 2527 | worker_concurrency = 16 2528 | 2529 | # The maximum and minimum number of pool processes that will be used to dynamically resize 2530 | # the pool based on load.Enable autoscaling by providing max_concurrency,min_concurrency 2531 | # with the ``airflow celery worker`` command (always keep minimum processes, 2532 | # but grow to maximum if necessary). 
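# Illustrative sketch (editorial note): with ``CeleryKubernetesExecutor``, routing a single task to
# ``KubernetesExecutor`` is done in DAG code by setting the task's ``queue`` to the
# ``kubernetes_queue`` value above; task and callable names below are hypothetical.
#
# .. code-block:: python
#
#     from airflow.operators.python import PythonOperator
#
#     heavy_task = PythonOperator(
#         task_id="heavy_task",
#         python_callable=run_heavy_job,   # assumed to be defined elsewhere in the DAG file
#         queue="kubernetes",              # any other queue falls back to CeleryExecutor
#     )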
2533 | # Pick these numbers based on resources on worker box and the nature of the task. 2534 | # If autoscale option is available, worker_concurrency will be ignored. 2535 | # https://docs.celeryq.dev/en/latest/reference/celery.bin.worker.html#cmdoption-celery-worker-autoscale 2536 | # 2537 | # Example: worker_autoscale = 16,12 2538 | # 2539 | # Variable: AIRFLOW__CELERY__WORKER_AUTOSCALE 2540 | # 2541 | # worker_autoscale = 2542 | 2543 | # Used to increase the number of tasks that a worker prefetches which can improve performance. 2544 | # The number of processes multiplied by worker_prefetch_multiplier is the number of tasks 2545 | # that are prefetched by a worker. A value greater than 1 can result in tasks being unnecessarily 2546 | # blocked if there are multiple workers and one worker prefetches tasks that sit behind long 2547 | # running tasks while another worker has unutilized processes that are unable to process the already 2548 | # claimed blocked tasks. 2549 | # https://docs.celeryq.dev/en/stable/userguide/optimizing.html#prefetch-limits 2550 | # 2551 | # Variable: AIRFLOW__CELERY__WORKER_PREFETCH_MULTIPLIER 2552 | # 2553 | worker_prefetch_multiplier = 1 2554 | 2555 | # Specify if remote control of the workers is enabled. 2556 | # In some cases when the broker does not support remote control, Celery creates lots of 2557 | # ``.*reply-celery-pidbox`` queues. You can prevent this by setting this to false. 2558 | # However, with this disabled Flower won't work. 2559 | # https://docs.celeryq.dev/en/stable/getting-started/backends-and-brokers/index.html#broker-overview 2560 | # 2561 | # Variable: AIRFLOW__CELERY__WORKER_ENABLE_REMOTE_CONTROL 2562 | # 2563 | worker_enable_remote_control = true 2564 | 2565 | # The Celery broker URL. Celery supports RabbitMQ, Redis and experimentally 2566 | # a sqlalchemy database. Refer to the Celery documentation for more information. 2567 | # 2568 | # Variable: AIRFLOW__CELERY__BROKER_URL 2569 | # 2570 | broker_url = redis://redis:6379/0 2571 | 2572 | # The Celery result_backend. When a job finishes, it needs to update the 2573 | # metadata of the job. Therefore it will post a message on a message bus, 2574 | # or insert it into a database (depending of the backend) 2575 | # This status is used by the scheduler to update the state of the task 2576 | # The use of a database is highly recommended 2577 | # When not specified, sql_alchemy_conn with a db+ scheme prefix will be used 2578 | # https://docs.celeryq.dev/en/latest/userguide/configuration.html#task-result-backend-settings 2579 | # 2580 | # Example: result_backend = db+postgresql://postgres:airflow@postgres/airflow 2581 | # 2582 | # Variable: AIRFLOW__CELERY__RESULT_BACKEND 2583 | # 2584 | # result_backend = 2585 | 2586 | # Optional configuration dictionary to pass to the Celery result backend SQLAlchemy engine. 2587 | # 2588 | # Example: result_backend_sqlalchemy_engine_options = {"pool_recycle": 1800} 2589 | # 2590 | # Variable: AIRFLOW__CELERY__RESULT_BACKEND_SQLALCHEMY_ENGINE_OPTIONS 2591 | # 2592 | result_backend_sqlalchemy_engine_options = 2593 | 2594 | # Celery Flower is a sweet UI for Celery. Airflow has a shortcut to start 2595 | # it ``airflow celery flower``. 
This defines the IP that Celery Flower runs on 2596 | # 2597 | # Variable: AIRFLOW__CELERY__FLOWER_HOST 2598 | # 2599 | flower_host = 0.0.0.0 2600 | 2601 | # The root URL for Flower 2602 | # 2603 | # Example: flower_url_prefix = /flower 2604 | # 2605 | # Variable: AIRFLOW__CELERY__FLOWER_URL_PREFIX 2606 | # 2607 | flower_url_prefix = 2608 | 2609 | # This defines the port that Celery Flower runs on 2610 | # 2611 | # Variable: AIRFLOW__CELERY__FLOWER_PORT 2612 | # 2613 | flower_port = 5555 2614 | 2615 | # Securing Flower with Basic Authentication 2616 | # Accepts user:password pairs separated by a comma 2617 | # 2618 | # Example: flower_basic_auth = user1:password1,user2:password2 2619 | # 2620 | # Variable: AIRFLOW__CELERY__FLOWER_BASIC_AUTH 2621 | # 2622 | flower_basic_auth = 2623 | 2624 | # How many processes CeleryExecutor uses to sync task state. 2625 | # 0 means to use max(1, number of cores - 1) processes. 2626 | # 2627 | # Variable: AIRFLOW__CELERY__SYNC_PARALLELISM 2628 | # 2629 | sync_parallelism = 0 2630 | 2631 | # Import path for celery configuration options 2632 | # 2633 | # Variable: AIRFLOW__CELERY__CELERY_CONFIG_OPTIONS 2634 | # 2635 | celery_config_options = airflow.providers.celery.executors.default_celery.DEFAULT_CELERY_CONFIG 2636 | 2637 | # 2638 | # Variable: AIRFLOW__CELERY__SSL_ACTIVE 2639 | # 2640 | ssl_active = False 2641 | 2642 | # Path to the client key. 2643 | # 2644 | # Variable: AIRFLOW__CELERY__SSL_KEY 2645 | # 2646 | ssl_key = 2647 | 2648 | # Path to the client certificate. 2649 | # 2650 | # Variable: AIRFLOW__CELERY__SSL_CERT 2651 | # 2652 | ssl_cert = 2653 | 2654 | # Path to the CA certificate. 2655 | # 2656 | # Variable: AIRFLOW__CELERY__SSL_CACERT 2657 | # 2658 | ssl_cacert = 2659 | 2660 | # Celery Pool implementation. 2661 | # Choices include: ``prefork`` (default), ``eventlet``, ``gevent`` or ``solo``. 2662 | # See: 2663 | # https://docs.celeryq.dev/en/latest/userguide/workers.html#concurrency 2664 | # https://docs.celeryq.dev/en/latest/userguide/concurrency/eventlet.html 2665 | # 2666 | # Variable: AIRFLOW__CELERY__POOL 2667 | # 2668 | pool = prefork 2669 | 2670 | # The number of seconds to wait before timing out ``send_task_to_executor`` or 2671 | # ``fetch_celery_task_state`` operations. 2672 | # 2673 | # Variable: AIRFLOW__CELERY__OPERATION_TIMEOUT 2674 | # 2675 | operation_timeout = 1.0 2676 | 2677 | task_acks_late = True 2678 | # Celery task will report its status as 'started' when the task is executed by a worker. 2679 | # This is used in Airflow to keep track of the running tasks and if a Scheduler is restarted 2680 | # or run in HA mode, it can adopt the orphan tasks launched by previous SchedulerJob. 2681 | # 2682 | # Variable: AIRFLOW__CELERY__TASK_TRACK_STARTED 2683 | # 2684 | task_track_started = True 2685 | 2686 | # The Maximum number of retries for publishing task messages to the broker when failing 2687 | # due to ``AirflowTaskTimeout`` error before giving up and marking Task as failed. 2688 | # 2689 | # Variable: AIRFLOW__CELERY__TASK_PUBLISH_MAX_RETRIES 2690 | # 2691 | task_publish_max_retries = 3 2692 | 2693 | # Worker initialisation check to validate Metadata Database connection 2694 | # 2695 | # Variable: AIRFLOW__CELERY__WORKER_PRECHECK 2696 | # 2697 | worker_precheck = False 2698 | 2699 | [celery_broker_transport_options] 2700 | # This section is for specifying options which can be passed to the 2701 | # underlying celery broker transport. 
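# Editorial note on the bare ``task_acks_late = True`` entry above, which carries no description in
# this file: in Celery, late acknowledgement means a task message is acked only after the worker
# finishes executing it, so tasks running on a worker that dies can be redelivered instead of being
# lost; the trade-off is that such tasks may run more than once.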
See: 2702 | # https://docs.celeryq.dev/en/latest/userguide/configuration.html#std:setting-broker_transport_options 2703 | 2704 | # The visibility timeout defines the number of seconds to wait for the worker 2705 | # to acknowledge the task before the message is redelivered to another worker. 2706 | # Make sure to increase the visibility timeout to match the time of the longest 2707 | # ETA you're planning to use. 2708 | # visibility_timeout is only supported for Redis and SQS celery brokers. 2709 | # See: 2710 | # https://docs.celeryq.dev/en/stable/getting-started/backends-and-brokers/redis.html#visibility-timeout 2711 | # 2712 | # Example: visibility_timeout = 21600 2713 | # 2714 | # Variable: AIRFLOW__CELERY_BROKER_TRANSPORT_OPTIONS__VISIBILITY_TIMEOUT 2715 | # 2716 | # visibility_timeout = 2717 | 2718 | # The sentinel_kwargs parameter allows passing additional options to the Sentinel client. 2719 | # In a typical scenario where Redis Sentinel is used as the broker and Redis servers are 2720 | # password-protected, the password needs to be passed through this parameter. Although its 2721 | # type is string, it is required to pass a string that conforms to the dictionary format. 2722 | # See: 2723 | # https://docs.celeryq.dev/en/stable/getting-started/backends-and-brokers/redis.html#configuration 2724 | # 2725 | # Example: sentinel_kwargs = {"password": "password_for_redis_server"} 2726 | # 2727 | # Variable: AIRFLOW__CELERY_BROKER_TRANSPORT_OPTIONS__SENTINEL_KWARGS 2728 | # 2729 | # sentinel_kwargs = 2730 | 2731 | [local_kubernetes_executor] 2732 | # This section only applies if you are using the ``LocalKubernetesExecutor`` in 2733 | # ``[core]`` section above 2734 | 2735 | # Define when to send a task to ``KubernetesExecutor`` when using ``LocalKubernetesExecutor``. 2736 | # When the queue of a task is the value of ``kubernetes_queue`` (default ``kubernetes``), 2737 | # the task is executed via ``KubernetesExecutor``, 2738 | # otherwise via ``LocalExecutor`` 2739 | # 2740 | # Variable: AIRFLOW__LOCAL_KUBERNETES_EXECUTOR__KUBERNETES_QUEUE 2741 | # 2742 | kubernetes_queue = kubernetes 2743 | 2744 | [kubernetes_executor] 2745 | # Kwargs to override the default urllib3 Retry used in the kubernetes API client 2746 | # 2747 | # Example: api_client_retry_configuration = { "total": 3, "backoff_factor": 0.5 } 2748 | # 2749 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__API_CLIENT_RETRY_CONFIGURATION 2750 | # 2751 | api_client_retry_configuration = 2752 | 2753 | # Flag to control the information added to kubernetes executor logs for better traceability 2754 | # 2755 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__LOGS_TASK_METADATA 2756 | # 2757 | logs_task_metadata = False 2758 | 2759 | # Path to the YAML pod file that forms the basis for KubernetesExecutor workers. 2760 | # 2761 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__POD_TEMPLATE_FILE 2762 | # 2763 | pod_template_file = 2764 | 2765 | # The repository of the Kubernetes Image for the Worker to Run 2766 | # 2767 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__WORKER_CONTAINER_REPOSITORY 2768 | # 2769 | worker_container_repository = 2770 | 2771 | # The tag of the Kubernetes Image for the Worker to Run 2772 | # 2773 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__WORKER_CONTAINER_TAG 2774 | # 2775 | worker_container_tag = 2776 | 2777 | # The Kubernetes namespace where airflow workers should be created. 
Defaults to ``default`` 2778 | # 2779 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__NAMESPACE 2780 | # 2781 | namespace = default 2782 | 2783 | # If True, all worker pods will be deleted upon termination 2784 | # 2785 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__DELETE_WORKER_PODS 2786 | # 2787 | delete_worker_pods = True 2788 | 2789 | # If False (and delete_worker_pods is True), 2790 | # failed worker pods will not be deleted so users can investigate them. 2791 | # This only prevents removal of worker pods where the worker itself failed, 2792 | # not when the task it ran failed. 2793 | # 2794 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__DELETE_WORKER_PODS_ON_FAILURE 2795 | # 2796 | delete_worker_pods_on_failure = False 2797 | 2798 | worker_pod_pending_fatal_container_state_reasons = CreateContainerConfigError,ErrImagePull,CreateContainerError,ImageInspectError, InvalidImageName 2799 | # Number of Kubernetes Worker Pod creation calls per scheduler loop. 2800 | # Note that the current default of "1" will only launch a single pod 2801 | # per-heartbeat. It is HIGHLY recommended that users increase this 2802 | # number to match the tolerance of their kubernetes cluster for 2803 | # better performance. 2804 | # 2805 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__WORKER_PODS_CREATION_BATCH_SIZE 2806 | # 2807 | worker_pods_creation_batch_size = 1 2808 | 2809 | # Allows users to launch pods in multiple namespaces. 2810 | # Will require creating a cluster-role for the scheduler, 2811 | # or use multi_namespace_mode_namespace_list configuration. 2812 | # 2813 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__MULTI_NAMESPACE_MODE 2814 | # 2815 | multi_namespace_mode = False 2816 | 2817 | # If multi_namespace_mode is True while scheduler does not have a cluster-role, 2818 | # give the list of namespaces where the scheduler will schedule jobs 2819 | # Scheduler needs to have the necessary permissions in these namespaces. 2820 | # 2821 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__MULTI_NAMESPACE_MODE_NAMESPACE_LIST 2822 | # 2823 | multi_namespace_mode_namespace_list = 2824 | 2825 | # Use the service account kubernetes gives to pods to connect to kubernetes cluster. 2826 | # It's intended for clients that expect to be running inside a pod running on kubernetes. 2827 | # It will raise an exception if called from a process not running in a kubernetes environment. 2828 | # 2829 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__IN_CLUSTER 2830 | # 2831 | in_cluster = True 2832 | 2833 | # When running with in_cluster=False change the default cluster_context or config_file 2834 | # options to Kubernetes client. Leave blank these to use default behaviour like ``kubectl`` has. 2835 | # 2836 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__CLUSTER_CONTEXT 2837 | # 2838 | # cluster_context = 2839 | 2840 | # Path to the kubernetes configfile to be used when ``in_cluster`` is set to False 2841 | # 2842 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__CONFIG_FILE 2843 | # 2844 | # config_file = 2845 | 2846 | # Keyword parameters to pass while calling a kubernetes client core_v1_api methods 2847 | # from Kubernetes Executor provided as a single line formatted JSON dictionary string. 2848 | # List of supported params are similar for all core_v1_apis, hence a single config 2849 | # variable for all apis. 
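# Illustrative example (assuming the kubernetes Python client's
# ``_request_timeout`` keyword argument is what you want to tune):
# kube_client_request_args = {"_request_timeout": [60, 60]}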
See: 2850 | # https://raw.githubusercontent.com/kubernetes-client/python/41f11a09995efcd0142e25946adc7591431bfb2f/kubernetes/client/api/core_v1_api.py 2851 | # 2852 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__KUBE_CLIENT_REQUEST_ARGS 2853 | # 2854 | kube_client_request_args = 2855 | 2856 | # Optional keyword arguments to pass to the ``delete_namespaced_pod`` kubernetes client 2857 | # ``core_v1_api`` method when using the Kubernetes Executor. 2858 | # This should be an object and can contain any of the options listed in the ``v1DeleteOptions`` 2859 | # class defined here: 2860 | # https://github.com/kubernetes-client/python/blob/41f11a09995efcd0142e25946adc7591431bfb2f/kubernetes/client/models/v1_delete_options.py#L19 2861 | # 2862 | # Example: delete_option_kwargs = {"grace_period_seconds": 10} 2863 | # 2864 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__DELETE_OPTION_KWARGS 2865 | # 2866 | delete_option_kwargs = 2867 | 2868 | # Enables TCP keepalive mechanism. This prevents Kubernetes API requests to hang indefinitely 2869 | # when idle connection is time-outed on services like cloud load balancers or firewalls. 2870 | # 2871 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__ENABLE_TCP_KEEPALIVE 2872 | # 2873 | enable_tcp_keepalive = True 2874 | 2875 | # When the `enable_tcp_keepalive` option is enabled, TCP probes a connection that has 2876 | # been idle for `tcp_keep_idle` seconds. 2877 | # 2878 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__TCP_KEEP_IDLE 2879 | # 2880 | tcp_keep_idle = 120 2881 | 2882 | # When the `enable_tcp_keepalive` option is enabled, if Kubernetes API does not respond 2883 | # to a keepalive probe, TCP retransmits the probe after `tcp_keep_intvl` seconds. 2884 | # 2885 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__TCP_KEEP_INTVL 2886 | # 2887 | tcp_keep_intvl = 30 2888 | 2889 | # When the `enable_tcp_keepalive` option is enabled, if Kubernetes API does not respond 2890 | # to a keepalive probe, TCP retransmits the probe `tcp_keep_cnt number` of times before 2891 | # a connection is considered to be broken. 2892 | # 2893 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__TCP_KEEP_CNT 2894 | # 2895 | tcp_keep_cnt = 6 2896 | 2897 | # Set this to false to skip verifying SSL certificate of Kubernetes python client. 2898 | # 2899 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__VERIFY_SSL 2900 | # 2901 | verify_ssl = True 2902 | 2903 | # How often in seconds to check for task instances stuck in "queued" status without a pod 2904 | # 2905 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__WORKER_PODS_QUEUED_CHECK_INTERVAL 2906 | # 2907 | worker_pods_queued_check_interval = 60 2908 | 2909 | # Path to a CA certificate to be used by the Kubernetes client to verify the server's SSL certificate. 2910 | # 2911 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__SSL_CA_CERT 2912 | # 2913 | ssl_ca_cert = 2914 | 2915 | # The Maximum number of retries for queuing the task to the kubernetes scheduler when 2916 | # failing due to Kube API exceeded quota errors before giving up and marking task as failed. 2917 | # -1 for unlimited times. 2918 | # 2919 | # Variable: AIRFLOW__KUBERNETES_EXECUTOR__TASK_PUBLISH_MAX_RETRIES 2920 | # 2921 | task_publish_max_retries = 0 2922 | 2923 | [common.io] 2924 | # Common IO configuration section 2925 | 2926 | # Path to a location on object storage where XComs can be stored in url format. 
2927 | # 2928 | # Example: xcom_objectstorage_path = s3://conn_id@bucket/path 2929 | # 2930 | # Variable: AIRFLOW__COMMON.IO__XCOM_OBJECTSTORAGE_PATH 2931 | # 2932 | xcom_objectstorage_path = 2933 | 2934 | # Threshold in bytes for storing XComs in object storage. -1 means always store in the 2935 | # database. 0 means always store in object storage. Any positive number means 2936 | # it will be stored in object storage if the size of the value is greater than the threshold. 2937 | # 2938 | # Example: xcom_objectstorage_threshold = 1000000 2939 | # 2940 | # Variable: AIRFLOW__COMMON.IO__XCOM_OBJECTSTORAGE_THRESHOLD 2941 | # 2942 | xcom_objectstorage_threshold = -1 2943 | 2944 | # Compression algorithm to use when storing XComs in object storage. Supported algorithms 2945 | # are a.o.: snappy, zip, gzip, bz2, and lzma. If not specified, no compression will be used. 2946 | # Note that the compression algorithm must be available in the Python installation (e.g. 2947 | # python-snappy for snappy). Zip, gz, bz2 are available by default. 2948 | # 2949 | # Example: xcom_objectstorage_compression = gz 2950 | # 2951 | # Variable: AIRFLOW__COMMON.IO__XCOM_OBJECTSTORAGE_COMPRESSION 2952 | # 2953 | xcom_objectstorage_compression = 2954 | 2955 | [elasticsearch] 2956 | # Elasticsearch host 2957 | # 2958 | # Variable: AIRFLOW__ELASTICSEARCH__HOST 2959 | # 2960 | host = 2961 | 2962 | # Format of the log_id, which is used to query for a given tasks logs 2963 | # 2964 | # Variable: AIRFLOW__ELASTICSEARCH__LOG_ID_TEMPLATE 2965 | # 2966 | log_id_template = {dag_id}-{task_id}-{run_id}-{map_index}-{try_number} 2967 | 2968 | # Used to mark the end of a log stream for a task 2969 | # 2970 | # Variable: AIRFLOW__ELASTICSEARCH__END_OF_LOG_MARK 2971 | # 2972 | end_of_log_mark = end_of_log 2973 | 2974 | # Qualified URL for an elasticsearch frontend (like Kibana) with a template argument for log_id 2975 | # Code will construct log_id using the log_id template from the argument above. 2976 | # NOTE: scheme will default to https if one is not provided 2977 | # 2978 | # Example: frontend = http://localhost:5601/app/kibana#/discover?_a=(columns:!(message),query:(language:kuery,query:'log_id: "{log_id}"'),sort:!(log.offset,asc)) 2979 | # 2980 | # Variable: AIRFLOW__ELASTICSEARCH__FRONTEND 2981 | # 2982 | frontend = 2983 | 2984 | # Write the task logs to the stdout of the worker, rather than the default files 2985 | # 2986 | # Variable: AIRFLOW__ELASTICSEARCH__WRITE_STDOUT 2987 | # 2988 | write_stdout = False 2989 | 2990 | # Instead of the default log formatter, write the log lines as JSON 2991 | # 2992 | # Variable: AIRFLOW__ELASTICSEARCH__JSON_FORMAT 2993 | # 2994 | json_format = False 2995 | 2996 | # Log fields to also attach to the json output, if enabled 2997 | # 2998 | # Variable: AIRFLOW__ELASTICSEARCH__JSON_FIELDS 2999 | # 3000 | json_fields = asctime, filename, lineno, levelname, message 3001 | 3002 | # The field where host name is stored (normally either `host` or `host.name`) 3003 | # 3004 | # Variable: AIRFLOW__ELASTICSEARCH__HOST_FIELD 3005 | # 3006 | host_field = host 3007 | 3008 | # The field where offset is stored (normally either `offset` or `log.offset`) 3009 | # 3010 | # Variable: AIRFLOW__ELASTICSEARCH__OFFSET_FIELD 3011 | # 3012 | offset_field = offset 3013 | 3014 | # Comma separated list of index patterns to use when searching for logs (default: `_all`). 3015 | # The index_patterns_callable takes precedence over this. 
3016 | # 3017 | # Example: index_patterns = something-* 3018 | # 3019 | # Variable: AIRFLOW__ELASTICSEARCH__INDEX_PATTERNS 3020 | # 3021 | index_patterns = _all 3022 | 3023 | index_patterns_callable = 3024 | 3025 | [elasticsearch_configs] 3026 | # 3027 | # Variable: AIRFLOW__ELASTICSEARCH_CONFIGS__HTTP_COMPRESS 3028 | # 3029 | http_compress = False 3030 | 3031 | # 3032 | # Variable: AIRFLOW__ELASTICSEARCH_CONFIGS__VERIFY_CERTS 3033 | # 3034 | verify_certs = True 3035 | 3036 | [fab] 3037 | # This section contains configs specific to FAB provider. 3038 | 3039 | # Boolean for enabling rate limiting on authentication endpoints. 3040 | # 3041 | # Variable: AIRFLOW__FAB__AUTH_RATE_LIMITED 3042 | # 3043 | auth_rate_limited = True 3044 | 3045 | # Rate limit for authentication endpoints. 3046 | # 3047 | # Variable: AIRFLOW__FAB__AUTH_RATE_LIMIT 3048 | # 3049 | auth_rate_limit = 5 per 40 second 3050 | 3051 | # Update FAB permissions and sync security manager roles 3052 | # on webserver startup 3053 | # 3054 | # Variable: AIRFLOW__FAB__UPDATE_FAB_PERMS 3055 | # 3056 | update_fab_perms = True 3057 | 3058 | [imap] 3059 | # Options for IMAP provider. 3060 | 3061 | # ssl_context = 3062 | 3063 | [azure_remote_logging] 3064 | # Configuration that needs to be set for enable remote logging in Azure Blob Storage 3065 | 3066 | remote_wasb_log_container = airflow-logs 3067 | 3068 | [openlineage] 3069 | # This section applies settings for OpenLineage integration. 3070 | # More about configuration and it's precedence can be found at 3071 | # https://airflow.apache.org/docs/apache-airflow-providers-openlineage/stable/guides/user.html#transport-setup 3072 | 3073 | # Disable sending events without uninstalling the OpenLineage Provider by setting this to true. 3074 | # 3075 | # Variable: AIRFLOW__OPENLINEAGE__DISABLED 3076 | # 3077 | disabled = False 3078 | 3079 | # Exclude some Operators from emitting OpenLineage events by passing a string of semicolon separated 3080 | # full import paths of Operators to disable. 3081 | # 3082 | # Example: disabled_for_operators = airflow.providers.standard.operators.bash.BashOperator; airflow.providers.standard.operators.python.PythonOperator 3083 | # 3084 | # Variable: AIRFLOW__OPENLINEAGE__DISABLED_FOR_OPERATORS 3085 | # 3086 | disabled_for_operators = 3087 | 3088 | # If this setting is enabled, OpenLineage integration won't collect and emit metadata, 3089 | # unless you explicitly enable it per `DAG` or `Task` using `enable_lineage` method. 3090 | # 3091 | # Variable: AIRFLOW__OPENLINEAGE__SELECTIVE_ENABLE 3092 | # 3093 | selective_enable = False 3094 | 3095 | # Set namespace that the lineage data belongs to, so that if you use multiple OpenLineage producers, 3096 | # events coming from them will be logically separated. 3097 | # 3098 | # Example: namespace = my_airflow_instance_1 3099 | # 3100 | # Variable: AIRFLOW__OPENLINEAGE__NAMESPACE 3101 | # 3102 | # namespace = 3103 | 3104 | # Register custom OpenLineage Extractors by passing a string of semicolon separated full import paths. 3105 | # 3106 | # Example: extractors = full.path.to.ExtractorClass;full.path.to.AnotherExtractorClass 3107 | # 3108 | # Variable: AIRFLOW__OPENLINEAGE__EXTRACTORS 3109 | # 3110 | # extractors = 3111 | 3112 | # Register custom run facet functions by passing a string of semicolon separated full import paths. 
3113 | # 3114 | # Example: custom_run_facets = full.path.to.custom_facet_function;full.path.to.another_custom_facet_function 3115 | # 3116 | # Variable: AIRFLOW__OPENLINEAGE__CUSTOM_RUN_FACETS 3117 | # 3118 | custom_run_facets = 3119 | 3120 | # Specify the path to the YAML configuration file. 3121 | # This ensures backwards compatibility with passing config through the `openlineage.yml` file. 3122 | # 3123 | # Example: config_path = full/path/to/openlineage.yml 3124 | # 3125 | # Variable: AIRFLOW__OPENLINEAGE__CONFIG_PATH 3126 | # 3127 | config_path = 3128 | 3129 | # Pass OpenLineage Client transport configuration as JSON string. It should contain type of the 3130 | # transport and additional options (different for each transport type). For more details see: 3131 | # https://openlineage.io/docs/client/python/#built-in-transport-types 3132 | # 3133 | # Currently supported types are: 3134 | # 3135 | # * HTTP 3136 | # * Kafka 3137 | # * Console 3138 | # * File 3139 | # 3140 | # Example: transport = {"type": "http", "url": "http://localhost:5000", "endpoint": "api/v1/lineage"} 3141 | # 3142 | # Variable: AIRFLOW__OPENLINEAGE__TRANSPORT 3143 | # 3144 | transport = 3145 | 3146 | # Disable the inclusion of source code in OpenLineage events by setting this to `true`. 3147 | # By default, several Operators (e.g. Python, Bash) will include their source code in the events 3148 | # unless disabled. 3149 | # 3150 | # Variable: AIRFLOW__OPENLINEAGE__DISABLE_SOURCE_CODE 3151 | # 3152 | disable_source_code = False 3153 | 3154 | # Number of processes to utilize for processing DAG state changes 3155 | # in an asynchronous manner within the scheduler process. 3156 | # 3157 | # Variable: AIRFLOW__OPENLINEAGE__DAG_STATE_CHANGE_PROCESS_POOL_SIZE 3158 | # 3159 | dag_state_change_process_pool_size = 1 3160 | 3161 | # Maximum amount of time (in seconds) that OpenLineage can spend executing metadata extraction. 3162 | # 3163 | # Variable: AIRFLOW__OPENLINEAGE__EXECUTION_TIMEOUT 3164 | # 3165 | execution_timeout = 10 3166 | 3167 | # If true, OpenLineage event will include full task info - potentially containing large fields. 3168 | # 3169 | # Variable: AIRFLOW__OPENLINEAGE__INCLUDE_FULL_TASK_INFO 3170 | # 3171 | include_full_task_info = False 3172 | 3173 | # If true, OpenLineage events will include information useful for debugging - potentially 3174 | # containing large fields e.g. all installed packages and their versions. 3175 | # 3176 | # Variable: AIRFLOW__OPENLINEAGE__DEBUG_MODE 3177 | # 3178 | debug_mode = False 3179 | 3180 | [smtp_provider] 3181 | # Options for SMTP provider. 3182 | 3183 | # ssl context to use when using SMTP and IMAP SSL connections. By default, the context is "default" 3184 | # which sets it to ``ssl.create_default_context()`` which provides the right balance between 3185 | # compatibility and security, it however requires that certificates in your operating system are 3186 | # updated and that SMTP/IMAP servers of yours have valid certificates that have corresponding public 3187 | # keys installed on your machines. You can switch it to "none" if you want to disable checking 3188 | # of the certificates, but it is not recommended as it allows MITM (man-in-the-middle) attacks 3189 | # if your infrastructure is not sufficiently secured. It should only be set temporarily while you 3190 | # are fixing your certificate configuration. 
This can be typically done by upgrading to newer 3191 | # version of the operating system you run Airflow components on,by upgrading/refreshing proper 3192 | # certificates in the OS or by updating certificates for your mail servers. 3193 | # 3194 | # If you do not set this option explicitly, it will use Airflow "email.ssl_context" configuration, 3195 | # but if this configuration is not present, it will use "default" value. 3196 | # 3197 | # Example: ssl_context = default 3198 | # 3199 | # Variable: AIRFLOW__SMTP_PROVIDER__SSL_CONTEXT 3200 | # 3201 | # ssl_context = 3202 | 3203 | # Allows overriding of the standard templated email subject line when the SmtpNotifier is used. 3204 | # Must provide a path to the template. 3205 | # 3206 | # Example: templated_email_subject_path = path/to/override/email_subject.html 3207 | # 3208 | # Variable: AIRFLOW__SMTP_PROVIDER__TEMPLATED_EMAIL_SUBJECT_PATH 3209 | # 3210 | # templated_email_subject_path = 3211 | 3212 | # Allows overriding of the standard templated email path when the SmtpNotifier is used. Must provide 3213 | # a path to the template. 3214 | # 3215 | # Example: templated_html_content_path = path/to/override/email.html 3216 | # 3217 | # Variable: AIRFLOW__SMTP_PROVIDER__TEMPLATED_HTML_CONTENT_PATH 3218 | # 3219 | # templated_html_content_path = 3220 | -------------------------------------------------------------------------------- /dags/example.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.bash_operator import BashOperator 5 | from airflow.operators.python_operator import PythonOperator 6 | 7 | default_args = { 8 | "owner": "airflow", 9 | "depends_on_past": False, 10 | "start_date": datetime(2025, 1, 1), 11 | "retries": 1, 12 | "retry_delay": timedelta(minutes=5), 13 | } 14 | 15 | dag = DAG( 16 | dag_id="simple_example_dag", 17 | default_args=default_args, 18 | schedule_interval=timedelta(days=1), 19 | catchup=False, 20 | ) 21 | 22 | 23 | def print_hello(): 24 | print("Olá Mundo!!! 
\n Esta é a minha primeira tarefa") 25 | 26 | 27 | task1 = PythonOperator(task_id="print_hello", python_callable=print_hello, dag=dag) 28 | 29 | task2 = BashOperator( 30 | task_id="print_date", bash_command="date && sleep 5 & date", dag=dag 31 | ) 32 | 33 | task1 >> task2 34 | -------------------------------------------------------------------------------- /dags/execute_entities.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | from airflow import DAG 4 | from airflow.operators.dummy import DummyOperator 5 | from airflow.operators.python import PythonOperator 6 | from airflow.utils.task_group import TaskGroup 7 | from loguru import logger 8 | 9 | default_args = { 10 | "owner": "airflow", 11 | "depends_on_past": False, 12 | "retries": 3, 13 | "retry_delay": timedelta(minutes=1), 14 | } 15 | 16 | 17 | def get_endpoints(): 18 | from src.endpoints import Endpoints 19 | 20 | return Endpoints().get_all() 21 | 22 | 23 | def get_cutomers(endpoint: dict): 24 | from src.controllers.paginations import PaginationController 25 | 26 | resource = endpoint.get("resources", None) 27 | action = endpoint.get("action", None) 28 | params = endpoint.get("params", None) 29 | data_source = endpoint.get("data_source", None) 30 | pagination_type = endpoint.get("pagination_type", "per_page") 31 | page_label = endpoint.get("page_label", None) 32 | total_of_pages_label = endpoint.get("total_of_pages_label", None) 33 | records_label = endpoint.get("records_label", "registros") 34 | 35 | pagination = PaginationController() 36 | 37 | if pagination_type == "date_range": 38 | depends_on = endpoint.get("depends_on", None) 39 | 40 | if depends_on: 41 | from src.db.database import Database 42 | 43 | db = Database() 44 | 45 | try: 46 | accounts = db.select_from_table( 47 | table_name=depends_on, distinct_column="nCodCC" 48 | ) 49 | except Exception as e: 50 | logger.error( 51 | f"An error occurred while selecting from the table '{depends_on}': {e}" 52 | ) 53 | 54 | for account in accounts: 55 | params["nCodCC"] = account 56 | 57 | try: 58 | pagination.pagination( 59 | type=pagination_type, 60 | resource=resource, 61 | action=action, 62 | params=params, 63 | data_source=data_source, 64 | ) 65 | except Exception as e: 66 | logger.error(f"An error occurred while pagination: {e}") 67 | else: 68 | try: 69 | pagination.pagination( 70 | type=pagination_type, 71 | resource=resource, 72 | action=action, 73 | params=params, 74 | data_source=data_source, 75 | page_label=page_label, 76 | total_of_pages_label=total_of_pages_label, 77 | records_label=records_label, 78 | ) 79 | except Exception as e: 80 | logger.error(f"An error occurred while pagination: {e}") 81 | 82 | 83 | with DAG( 84 | "execute_entities", 85 | default_args=default_args, 86 | description="Execute entities", 87 | start_date=datetime(2025, 1, 1), 88 | schedule_interval="0 3 * * *", 89 | catchup=False, 90 | ) as dag: 91 | start = DummyOperator(task_id="start") 92 | end = DummyOperator(task_id="end") 93 | 94 | endpoints = get_endpoints() 95 | 96 | extract_endpoints = [e for e in endpoints if e.get("action") != "ListarExtrato"] 97 | excluded_extract_endpoints = [ 98 | e for e in endpoints if e.get("action") == "ListarExtrato" 99 | ] 100 | 101 | with TaskGroup("extract_and_load_omie_entities") as extract_group: 102 | for endpoint in extract_endpoints: 103 | tasks = PythonOperator( 104 | task_id=f"extract_and_load_{endpoint.get('action', None)}", 105 | python_callable=get_cutomers, 106 | 
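                # Descriptive note: one PythonOperator is generated per endpoint dict
                # returned by Endpoints().get_all(); op_kwargs hands that dict to the
                # callable at run time, so each task extracts and loads a single
                # Omie resource.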
op_kwargs={"endpoint": endpoint}, 107 | dag=dag, 108 | ) 109 | 110 | with TaskGroup("extract_and_load_omie_second_flow") as extract_second_group: 111 | for second_endpoint in excluded_extract_endpoints: 112 | second_tasks = PythonOperator( 113 | task_id=f"extract_and_load_{second_endpoint.get('action', None)}", 114 | python_callable=get_cutomers, 115 | op_kwargs={"endpoint": second_endpoint}, 116 | dag=dag, 117 | ) 118 | 119 | start >> extract_group >> extract_second_group >> end 120 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | x-airflow-common: 2 | &airflow-common 3 | build: . 4 | environment: 5 | &airflow-common-env 6 | AIRFLOW__CORE__EXECUTOR: CeleryExecutor 7 | AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 8 | AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow 9 | AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0 10 | AIRFLOW__CORE__FERNET_KEY: '' 11 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' 12 | AIRFLOW__CORE__LOAD_EXAMPLES: 'false' 13 | AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth,airflow.api.auth.backend.session' 14 | AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK: 'true' 15 | AIRFLOW__CORE__PARALLELISM: '4' 16 | AIRFLOW__CORE__DAG_CONCURRENCY: '2' 17 | AIRFLOW__CELERY__WORKER_CONCURRENCY: '2' 18 | # WARNING: Use _PIP_ADDITIONAL_REQUIREMENTS option ONLY for a quick checks 19 | # for other purpose (development, test and especially production usage) build/extend Airflow image. 20 | _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-} 21 | # The following line can be used to set a custom config file, stored in the local config folder 22 | # If you want to use it, outcomment it and replace airflow.cfg with the name of your config file 23 | AIRFLOW_CONFIG: '/opt/airflow/config/airflow.cfg' 24 | PYTHONPATH: /sources 25 | volumes: 26 | - ./dags:/opt/airflow/dags 27 | - ./logs:/opt/airflow/logs 28 | - ./config:/opt/airflow/config 29 | - ./plugins:/opt/airflow/plugins 30 | - ./src:/opt/airflow/src 31 | - ./requirements.txt:/opt/airflow/requirements.txt 32 | - ./.env:/opt/airflow/.env 33 | - ${AIRFLOW_PROJ_DIR:-.}:/sources 34 | user: "${AIRFLOW_UID:-50000}:0" 35 | depends_on: 36 | &airflow-common-depends-on 37 | redis: 38 | condition: service_healthy 39 | postgres: 40 | condition: service_healthy 41 | 42 | services: 43 | postgres: 44 | image: postgres:13 45 | environment: 46 | POSTGRES_USER: airflow 47 | POSTGRES_PASSWORD: airflow 48 | POSTGRES_DB: airflow 49 | volumes: 50 | - postgres-db-volume:/var/lib/postgresql/data 51 | healthcheck: 52 | test: ["CMD", "pg_isready", "-U", "airflow"] 53 | interval: 10s 54 | retries: 5 55 | start_period: 5s 56 | ports: 57 | - 5432:5432 58 | restart: always 59 | 60 | redis: 61 | image: redis:7.2-bookworm 62 | expose: 63 | - 6379 64 | healthcheck: 65 | test: ["CMD", "redis-cli", "ping"] 66 | interval: 10s 67 | timeout: 30s 68 | retries: 50 69 | start_period: 30s 70 | restart: always 71 | 72 | airflow-webserver: 73 | <<: *airflow-common 74 | command: webserver 75 | ports: 76 | - "8080:8080" 77 | healthcheck: 78 | test: ["CMD", "curl", "--fail", "http://localhost:8080/health"] 79 | interval: 30s 80 | timeout: 10s 81 | retries: 5 82 | start_period: 30s 83 | restart: always 84 | depends_on: 85 | <<: *airflow-common-depends-on 86 | airflow-init: 87 | condition: service_completed_successfully 88 | 89 | 
airflow-scheduler: 90 | <<: *airflow-common 91 | command: scheduler 92 | healthcheck: 93 | test: ["CMD", "curl", "--fail", "http://localhost:8974/health"] 94 | interval: 30s 95 | timeout: 10s 96 | retries: 5 97 | start_period: 30s 98 | restart: always 99 | depends_on: 100 | <<: *airflow-common-depends-on 101 | airflow-init: 102 | condition: service_completed_successfully 103 | 104 | airflow-worker: 105 | <<: *airflow-common 106 | command: celery worker 107 | healthcheck: 108 | test: 109 | - "CMD-SHELL" 110 | - 'celery --app airflow.providers.celery.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}" || celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"' 111 | interval: 30s 112 | timeout: 10s 113 | retries: 5 114 | start_period: 30s 115 | environment: 116 | <<: *airflow-common-env 117 | DUMB_INIT_SETSID: "0" 118 | restart: always 119 | depends_on: 120 | <<: *airflow-common-depends-on 121 | airflow-init: 122 | condition: service_completed_successfully 123 | 124 | # airflow-triggerer: 125 | # <<: *airflow-common 126 | # command: triggerer 127 | # healthcheck: 128 | # test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"'] 129 | # interval: 30s 130 | # timeout: 10s 131 | # retries: 5 132 | # start_period: 30s 133 | # restart: always 134 | # depends_on: 135 | # <<: *airflow-common-depends-on 136 | # airflow-init: 137 | # condition: service_completed_successfully 138 | 139 | airflow-init: 140 | <<: *airflow-common 141 | entrypoint: /bin/bash 142 | command: 143 | - -c 144 | - | 145 | if [[ -z "${AIRFLOW_UID}" ]]; then 146 | echo 147 | echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m" 148 | echo "If you are on Linux, you SHOULD follow the instructions below to set " 149 | echo "AIRFLOW_UID environment variable, otherwise files will be owned by root." 150 | echo "For other operating systems you can get rid of the warning with manually created .env file:" 151 | echo " See: https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#setting-the-right-airflow-user" 152 | echo 153 | fi 154 | one_meg=1048576 155 | mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg)) 156 | cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat) 157 | disk_available=$$(df / | tail -1 | awk '{print $$4}') 158 | warning_resources="false" 159 | if (( mem_available < 4000 )) ; then 160 | echo 161 | echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m" 162 | echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))" 163 | echo 164 | warning_resources="true" 165 | fi 166 | if (( cpus_available < 2 )); then 167 | echo 168 | echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m" 169 | echo "At least 2 CPUs recommended. You have $${cpus_available}" 170 | echo 171 | warning_resources="true" 172 | fi 173 | if (( disk_available < one_meg * 10 )); then 174 | echo 175 | echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m" 176 | echo "At least 10 GBs recommended. 
You have $$(numfmt --to iec $$((disk_available * 1024 )))" 177 | echo 178 | warning_resources="true" 179 | fi 180 | if [[ $${warning_resources} == "true" ]]; then 181 | echo 182 | echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m" 183 | echo "Please follow the instructions to increase amount of resources available:" 184 | echo " https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#before-you-begin" 185 | echo 186 | fi 187 | 188 | airflow db init 189 | 190 | # Cria o usuário Admin 191 | airflow users create \ 192 | --username airflow \ 193 | --password airflow \ 194 | --firstname Airflow \ 195 | --lastname Admin \ 196 | --role Admin \ 197 | --email airflow@example.com 198 | 199 | mkdir -p /sources/logs /sources/dags /sources/plugins /sources/src 200 | chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins,src} 201 | # yamllint enable rule:line-length 202 | environment: 203 | <<: *airflow-common-env 204 | _AIRFLOW_DB_MIGRATE: 'true' 205 | _AIRFLOW_WWW_USER_CREATE: 'true' 206 | _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} 207 | _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} 208 | _PIP_ADDITIONAL_REQUIREMENTS: '' 209 | user: "0:0" 210 | volumes: 211 | - ${AIRFLOW_PROJ_DIR:-.}:/sources 212 | 213 | # airflow-cli: 214 | # <<: *airflow-common 215 | # profiles: 216 | # - debug 217 | # environment: 218 | # <<: *airflow-common-env 219 | # CONNECTION_CHECK_MAX_COUNT: "0" 220 | # command: 221 | # - bash 222 | # - -c 223 | # - airflow 224 | 225 | # flower: 226 | # <<: *airflow-common 227 | # command: celery flower 228 | # profiles: 229 | # - flower 230 | # ports: 231 | # - "5555:5555" 232 | # healthcheck: 233 | # test: ["CMD", "curl", "--fail", "http://localhost:5555/"] 234 | # interval: 30s 235 | # timeout: 10s 236 | # retries: 5 237 | # start_period: 30s 238 | # restart: always 239 | # depends_on: 240 | # <<: *airflow-common-depends-on 241 | # airflow-init: 242 | # condition: service_completed_successfully 243 | 244 | volumes: 245 | postgres-db-volume: 246 | -------------------------------------------------------------------------------- /docs/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## [0.1.0] - 2025-03-03 4 | 5 | ### Performance Optimizations 6 | 7 | #### 1. Pagination System Improvements 8 | - **Concurrent Processing** 9 | - Added `ThreadPoolExecutor` for parallel page fetching 10 | - Configurable number of workers (default: 5) 11 | - Each page fetch runs in a separate thread 12 | - Improved overall data fetching speed 13 | 14 | - **Batch Processing** 15 | - Implemented batch processing with configurable size (default: 10 pages) 16 | - Reduced number of database operations 17 | - Better memory management 18 | - More efficient data handling 19 | 20 | #### 2. 
Database Optimizations 21 | - **Connection Pooling** 22 | - Added SQLAlchemy connection pooling with `QueuePool` 23 | - Configurable pool settings: 24 | ```python 25 | pool_size=5 26 | max_overflow=10 27 | pool_timeout=30 28 | pool_pre_ping=True 29 | ``` 30 | - Better connection management and reuse 31 | - Improved performance under concurrent loads 32 | 33 | - **Transaction Management** 34 | - Added `execute_with_transaction` method for proper transaction handling 35 | - Using `with self.engine.begin()` for automatic transaction management 36 | - Better error handling and rollback support 37 | - Proper cleanup of resources 38 | 39 | - **Data Type Handling** 40 | - Improved numeric type handling: 41 | ```python 42 | numeric_columns = [ 43 | 'nSaldo', 'nValorDocumento', 'nSaldoAnterior', 'nSaldoAtual', 44 | 'nSaldoConciliado', 'nSaldoProvisorio', 'nLimiteCreditoTotal', 45 | 'nSaldoDisponivel' 46 | ] 47 | ``` 48 | - Using proper SQLAlchemy types (Numeric(15,2) for decimals) 49 | - Better handling of NULL and empty values 50 | - Proper type conversion and validation 51 | 52 | 53 | ### Code Structure Improvements 54 | 55 | #### 1. Database Class Enhancements 56 | - **New Methods** 57 | - Added `table_exists` method 58 | - Added `execute_with_transaction` method 59 | - Improved `save_into_db` method 60 | - Better transaction management 61 | 62 | ### Why These Changes? 63 | 64 | 1. **Performance** 65 | - The concurrent processing significantly reduces data fetching time 66 | - Batch processing reduces database load 67 | - Connection pooling improves resource utilization 68 | - Better memory management prevents memory leaks 69 | 70 | 2. **Reliability** 71 | - Better transaction management prevents data corruption 72 | 73 | ### Configuration Examples 74 | 75 | 76 | #### Pagination Settings 77 | ```python 78 | self.batch_size = 10 # Number of pages per batch 79 | self.max_workers = 5 # Number of concurrent workers 80 | ``` 81 | 82 | ### Future Improvements 83 | 1. Add monitoring and metrics collection 84 | 2. Implement caching for frequently accessed data 85 | 3. Add more comprehensive error reporting 86 | 4. Implement data validation before saving 87 | 5. Add support for bulk operations 88 | 6. 
Improve logging and debugging capabilities 89 | 90 | ### Breaking Changes 91 | - Changed database column types for numeric fields 92 | - Modified transaction handling 93 | - Updated API retry mechanism 94 | - Changed batch processing behavior 95 | 96 | ### Dependencies 97 | - Added SQLAlchemy connection pooling 98 | - Updated pandas data type handling 99 | - Added concurrent.futures for parallel processing 100 | - Enhanced logging with loguru 101 | -------------------------------------------------------------------------------- /gh_2.67.0_windows_amd64.msi: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rphpacheco/omie_api_integration/24036cf0a4d0a3290c7ac3c45d2f967c4483ddbe/gh_2.67.0_windows_amd64.msi -------------------------------------------------------------------------------- /logs/scheduler/latest: -------------------------------------------------------------------------------- 1 | /opt/airflow/logs/scheduler/2025-02-06 2 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from src.controllers.paginations import PaginationController 2 | from src.endpoints import Endpoints 3 | 4 | endpoints = Endpoints() 5 | endpoints = endpoints.get_all() 6 | 7 | for endpoint in endpoints: 8 | resource = endpoint.get("resources", None) 9 | action = endpoint.get("action", None) 10 | params = endpoint.get("params", None) 11 | data_source = endpoint.get("data_source", None) 12 | pagination_type = endpoint.get("pagination_type", "per_page") 13 | page_label = endpoint.get("page_label", None) 14 | total_of_pages_label = endpoint.get("total_of_pages_label", None) 15 | records_label = endpoint.get("records_label", "registros") 16 | 17 | pagination = PaginationController() 18 | pagination = pagination.pagination( 19 | type=pagination_type, 20 | resource=resource, 21 | action=action, 22 | params=params, 23 | data_source=data_source, 24 | page_label=page_label, 25 | total_of_pages_label=total_of_pages_label, 26 | records_label=records_label, 27 | ) 28 | -------------------------------------------------------------------------------- /per_page.py: -------------------------------------------------------------------------------- 1 | from src.controllers.paginations import PaginationController 2 | from src.endpoints import Endpoints 3 | 4 | endpoints = Endpoints() 5 | endpoints = endpoints.get_endpoint(action="ListarExtrato") 6 | 7 | for endpoint in endpoints: 8 | resource = endpoint.get("resources", None) 9 | action = endpoint.get("action", None) 10 | params = endpoint.get("params", None) 11 | data_source = endpoint.get("data_source", None) 12 | pagination_type = endpoint.get("pagination_type", "per_page") 13 | 14 | pagination = PaginationController() 15 | 16 | if pagination_type == "date_range": 17 | depends_on = endpoint.get("depends_on", None) 18 | 19 | if depends_on: 20 | from src.db.database import Database 21 | 22 | db = Database() 23 | accounts = db.select_from_table( 24 | table_name=depends_on, distinct_column="nCodCC" 25 | ) 26 | 27 | for account in accounts: 28 | params["nCodCC"] = account 29 | 30 | pagination_execute = pagination.pagination( 31 | type=pagination_type, 32 | resource=resource, 33 | action=action, 34 | params=params, 35 | data_source=data_source, 36 | ) 37 | -------------------------------------------------------------------------------- /requirements.txt: 
-------------------------------------------------------------------------------- 1 | annotated-types==0.7.0 2 | certifi==2024.8.30 3 | charset-normalizer==3.4.0 4 | idna==3.10 5 | loguru==0.7.2 6 | numpy 7 | pandas==2.1.2 8 | psycopg2-binary==2.9.10 9 | pydantic==2.9.2 10 | pydantic-settings==2.6.0 11 | pydantic_core==2.23.4 12 | python-dateutil==2.9.0.post0 13 | python-dotenv==1.0.1 14 | pytz==2024.2 15 | requests==2.32.3 16 | setuptools==75.8.0 17 | six==1.16.0 18 | SQLAlchemy==1.4.51 19 | typing_extensions==4.12.2 20 | tzdata==2024.2 21 | urllib3==2.2.3 22 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rphpacheco/omie_api_integration/24036cf0a4d0a3290c7ac3c45d2f967c4483ddbe/src/__init__.py -------------------------------------------------------------------------------- /src/api/__init__.py: -------------------------------------------------------------------------------- 1 | from .api_instance import Api 2 | -------------------------------------------------------------------------------- /src/api/api_instance.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Union 2 | 3 | import requests 4 | from loguru import logger 5 | from requests.adapters import HTTPAdapter 6 | from requests.exceptions import RequestException 7 | from urllib3.util.retry import Retry 8 | 9 | 10 | class Session: 11 | """Manages HTTP session with retry mechanism.""" 12 | 13 | def __init__(self) -> None: 14 | self._session = requests.Session() 15 | self.retry = Retry( 16 | connect=1, 17 | read=1, 18 | total=5, 19 | backoff_factor=1, 20 | status_forcelist=[429, 500, 502, 503, 504], 21 | allowed_methods=["GET", "POST", "PUT", "DELETE"], 22 | respect_retry_after_header=True, 23 | ) 24 | self.adapter = HTTPAdapter(max_retries=self.retry) 25 | self._session.mount("http://", self.adapter) 26 | self._session.mount("https://", self.adapter) 27 | 28 | def get(self) -> Union[requests.Session, None]: 29 | return self._session 30 | 31 | 32 | class Api: 33 | def __init__( 34 | self, 35 | url: str, 36 | headers: dict = None, 37 | params: dict = None, 38 | json: dict = None, 39 | proxies: dict = None, 40 | ) -> None: 41 | self.url = url 42 | self.headers = headers 43 | self.params = params 44 | self.json = json 45 | self.verify = True 46 | self.proxies = proxies 47 | self.session = Session().get() 48 | self.timeout = 30 49 | 50 | def get(self) -> Union[requests.Response, None]: 51 | response = self.session.get( 52 | url=self.url, 53 | headers=self.headers, 54 | params=self.params, 55 | verify=self.verify, 56 | proxies=self.proxies, 57 | timeout=self.timeout, 58 | ) 59 | return response 60 | 61 | def post(self) -> Union[requests.Response, None]: 62 | response = self.session.post( 63 | url=self.url, 64 | headers=self.headers, 65 | params=self.params, 66 | json=self.json, 67 | verify=self.verify, 68 | proxies=self.proxies, 69 | timeout=self.timeout, 70 | ) 71 | return response 72 | 73 | def put(self) -> Union[requests.Response, None]: 74 | response = self.session.put( 75 | url=self.url, 76 | headers=self.headers, 77 | params=self.params, 78 | json=self.json, 79 | verify=self.verify, 80 | proxies=self.proxies, 81 | timeout=self.timeout, 82 | ) 83 | return response 84 | 85 | def delete(self) -> Union[requests.Response, None]: 86 | response = self.session.delete( 87 | url=self.url, 88 | headers=self.headers, 
89 | params=self.params, 90 | verify=self.verify, 91 | proxies=self.proxies, 92 | timeout=self.timeout, 93 | ) 94 | 95 | def request(self, method: Callable) -> Union[dict, str, None]: 96 | try: 97 | response = method() 98 | if 200 <= response.status_code < 300: 99 | try: 100 | return response.json() 101 | except ValueError: 102 | logger.warning( 103 | f"Status Code: {response.status_code}\n Success: Response content is not a JSON: {response.text}" 104 | ) 105 | return response.text 106 | else: 107 | logger.error( 108 | f"Status Code: {response.status_code}\n Error: {response.text}" 109 | ) 110 | return response.text 111 | except RequestException as error: 112 | return logger.error(f"Request failed: {error}") 113 | -------------------------------------------------------------------------------- /src/config/__init__.py: -------------------------------------------------------------------------------- 1 | from pydantic_settings import BaseSettings 2 | 3 | 4 | class Settings(BaseSettings): 5 | APP_KEY: str 6 | APP_SECRET: str 7 | BASE_URL: str 8 | DB_HOST: str 9 | DB_PORT: int 10 | DB_USERNAME: str 11 | DB_PASSWORD: str 12 | DB_NAME: str 13 | DATE_INIT: str = "01/01/2025" 14 | 15 | class Config: 16 | env_file = ".env" 17 | env_file_encoding = "utf-8" 18 | extra = "ignore" 19 | -------------------------------------------------------------------------------- /src/controllers/paginations/__init__.py: -------------------------------------------------------------------------------- 1 | from .paginations import PaginationController 2 | -------------------------------------------------------------------------------- /src/controllers/paginations/paginations.py: -------------------------------------------------------------------------------- 1 | import calendar 2 | from concurrent.futures import ThreadPoolExecutor, as_completed 3 | from datetime import datetime 4 | from typing import Literal 5 | 6 | from loguru import logger 7 | 8 | from src.api import Api 9 | from src.config import Settings 10 | from src.db import Database 11 | from src.utils.constants import HEADERS 12 | from src.utils.tools import ( 13 | generate_date_range, 14 | get_body_params_pagination, 15 | get_total_of_pages, 16 | ) 17 | 18 | settings = Settings() 19 | 20 | 21 | class PaginationController: 22 | def __init__(self) -> None: 23 | self.page = 1 24 | self.batch_size = 10 # Number of pages to process in each batch 25 | self.max_workers = 5 # Number of concurrent workers 26 | 27 | def fetch_page( 28 | self, 29 | page: int, 30 | resource: str, 31 | action: str, 32 | params: dict, 33 | page_label: str, 34 | data_source: str, 35 | records_label: str, 36 | ) -> tuple: 37 | """Fetch a single page of data from the API""" 38 | try: 39 | params[page_label] = page 40 | body = get_body_params_pagination( 41 | action=action, params=params, page=page, field_pagination=page_label 42 | ) 43 | 44 | api = Api( 45 | url=f"{settings.BASE_URL}{resource}", 46 | headers=HEADERS, 47 | json=body, 48 | params=params, 49 | ) 50 | response = api.request(api.post) 51 | 52 | records_fetched = response.get(records_label, 0) 53 | contents = response.get(data_source, []) 54 | 55 | # Remove blacklisted fields 56 | black_list = [ 57 | "tags", 58 | "recomendacoes", 59 | "homepage", 60 | "fax_ddd", 61 | "bloquear_exclusao", 62 | "produtor_rural", 63 | ] 64 | for content in contents: 65 | for item in black_list: 66 | if item in content: 67 | del content[item] 68 | 69 | logger.info(f"Page {page} has been fetched with {records_fetched} records.") 70 | return page, 
contents 71 | 72 | except Exception as e: 73 | logger.error(f"Error fetching page {page}: {e}") 74 | return page, None 75 | 76 | def process_batch(self, batch_pages: list, resource: str, db: Database) -> None: 77 | """Process a batch of pages and save to database""" 78 | try: 79 | all_contents = [] 80 | for page, contents in batch_pages: 81 | if contents: 82 | all_contents.extend(contents) 83 | 84 | if all_contents: 85 | if batch_pages[0][0] == 1: # First batch 86 | db.save_into_db(1, resource, all_contents, replace=True) 87 | else: 88 | db.save_into_db( 89 | batch_pages[0][0], resource, all_contents, replace=False 90 | ) 91 | 92 | except Exception as e: 93 | logger.error( 94 | f"Error processing batch starting with page {batch_pages[0][0]}: {e}" 95 | ) 96 | 97 | def per_page( 98 | self, 99 | resource: str, 100 | action: str, 101 | params: dict, 102 | data_source: str, 103 | page_label: str = "pagina", 104 | total_of_pages_label: str = "total_de_paginas", 105 | records_label: str = "registros", 106 | ): 107 | total_of_pages = get_total_of_pages( 108 | resource, action, params, page_label, total_of_pages_label, records_label 109 | ) 110 | 111 | db = Database() 112 | current_batch = [] 113 | 114 | with ThreadPoolExecutor(max_workers=self.max_workers) as executor: 115 | # Submit all pages for processing 116 | future_to_page = { 117 | executor.submit( 118 | self.fetch_page, 119 | page, 120 | resource, 121 | action, 122 | params.copy(), # Create a copy of params to avoid race conditions 123 | page_label, 124 | data_source, 125 | records_label, 126 | ): page 127 | for page in range(1, total_of_pages + 1) 128 | } 129 | 130 | for future in as_completed(future_to_page): 131 | page, contents = future.result() 132 | current_batch.append((page, contents)) 133 | 134 | # Process batch when it reaches batch_size or is the last batch 135 | if len(current_batch) >= self.batch_size or page == total_of_pages: 136 | self.process_batch(current_batch, resource, db) 137 | current_batch = [] 138 | 139 | def pagination( 140 | self, 141 | type: Literal["per_page", "date_range"], 142 | resource: str, 143 | action: str, 144 | params: dict, 145 | data_source: str, 146 | page_label: str = "pagina", 147 | total_of_pages_label: str = "total_de_paginas", 148 | records_label: str = "registros", 149 | ): 150 | match type: 151 | case "per_page": 152 | return self.per_page( 153 | resource=resource, 154 | action=action, 155 | params=params, 156 | data_source=data_source, 157 | page_label=page_label, 158 | total_of_pages_label=total_of_pages_label, 159 | records_label=records_label, 160 | ) 161 | case "date_range": 162 | return self.date_range( 163 | resource=resource, 164 | action=action, 165 | params=params, 166 | data_source=data_source, 167 | date_init=settings.DATE_INIT, 168 | ) 169 | 170 | def date_range( 171 | self, resource: str, action: str, params: dict, data_source: str, date_init: str 172 | ): 173 | dates = generate_date_range(date_init) 174 | 175 | for date in dates: 176 | date_obj = datetime.strptime(date, "%d/%m/%Y") 177 | last_day = calendar.monthrange(date_obj.year, date_obj.month)[1] 178 | end_of_month_date = date_obj.replace(day=last_day) 179 | end_of_month_date = end_of_month_date.strftime("%d/%m/%Y") 180 | 181 | params["dPeriodoInicial"] = date 182 | params["dPeriodoFinal"] = end_of_month_date 183 | 184 | body = get_body_params_pagination( 185 | action=action, 186 | params=params, 187 | ) 188 | 189 | api = Api(url=f"{settings.BASE_URL}{resource}", headers=HEADERS, json=body) 190 | response = 
api.request(api.post) 191 | 192 | records_fetched = len(response.get(f"{data_source}", 0)) 193 | 194 | logger.info( 195 | f"nCodCC: {params['nCodCC']} - Date {date} at {end_of_month_date} has been fetched with {records_fetched} records." 196 | ) 197 | 198 | db = Database() 199 | # Verificar este lance do parâmetro page em save_into_db 200 | db.save_into_db(self.page, resource, response) 201 | 202 | self.page += 1 203 | print(f"PAGE: {self.page}") 204 | -------------------------------------------------------------------------------- /src/db/__init__.py: -------------------------------------------------------------------------------- 1 | from .database import Database 2 | -------------------------------------------------------------------------------- /src/db/database.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pandas as pd 4 | from loguru import logger 5 | from sqlalchemy import create_engine, event, text, types 6 | from sqlalchemy.pool import QueuePool 7 | 8 | from src.config import Settings 9 | 10 | settings = Settings() 11 | 12 | 13 | class Database: 14 | """ 15 | A class used to manage interactions with a PostgreSQL database, including creating connections, 16 | retrieving table columns, updating table structures, and saving data. 17 | """ 18 | 19 | def __init__(self): 20 | """ 21 | Initializes the Database instance, establishing a connection to the database. 22 | 23 | Attributes: 24 | engine (sqlalchemy.engine.base.Engine): The SQLAlchemy engine used to connect to the database. 25 | connection (sqlalchemy.engine.base.Connection): The active connection to the database. 26 | """ 27 | self.engine = self.get_engine() 28 | self.connection = self.engine.connect() 29 | 30 | def get_engine(self): 31 | """Creates a SQLAlchemy engine with connection pooling""" 32 | connection_string = f"postgresql://{settings.DB_USERNAME}:{settings.DB_PASSWORD}@{settings.DB_HOST}:{settings.DB_PORT}/{settings.DB_NAME}" 33 | engine = create_engine( 34 | connection_string, 35 | poolclass=QueuePool, 36 | pool_size=5, 37 | max_overflow=10, 38 | pool_timeout=30, 39 | pool_pre_ping=True, 40 | ) 41 | 42 | # Add event listeners for connection pooling 43 | @event.listens_for(engine, "connect") 44 | def connect(dbapi_connection, connection_record): 45 | connection_record.info["pid"] = os.getpid() 46 | 47 | @event.listens_for(engine, "checkout") 48 | def checkout(dbapi_connection, connection_record, connection_proxy): 49 | pid = os.getpid() 50 | if connection_record.info["pid"] != pid: 51 | connection_record.connection = connection_proxy.connection = None 52 | from sqlalchemy import exc 53 | 54 | raise exc.DisconnectionError( 55 | "Connection record belongs to pid %s, " 56 | "attempting to check out in pid %s" 57 | % (connection_record.info["pid"], pid) 58 | ) 59 | 60 | return engine 61 | 62 | def execute_with_transaction(self, query, params=None): 63 | """Execute a query within a transaction""" 64 | with self.engine.begin() as connection: 65 | if params: 66 | return connection.execute(query, params) 67 | return connection.execute(query) 68 | 69 | def get_columns_of_db(self, table_name: str): 70 | """ 71 | Retrieves the column names of a specified table from the database. 72 | 73 | Args: 74 | table_name (str): The name of the table for which column names are retrieved. 75 | 76 | Returns: 77 | list: A list of column names in the specified table. 
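        Example (illustrative only; assumes a table named "clientes" has already
        been created by save_into_db):

            db = Database()
            columns = db.get_columns_of_db("clientes")
            # -> something like ["codigo_cliente_omie", "razao_social", ...]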
78 | """ 79 | query = text( 80 | f""" 81 | SELECT column_name 82 | FROM information_schema.columns 83 | WHERE table_name = '{table_name}'; 84 | """ 85 | ) 86 | result = self.execute_with_transaction(query) 87 | return [row[0] for row in result] 88 | 89 | def update_table_structure(self, table_name: str, new_columns): 90 | """Updates table structure to match new columns""" 91 | try: 92 | existing_columns = self.get_columns_of_db(table_name) 93 | with self.engine.begin() as connection: 94 | for column in new_columns: 95 | if column not in existing_columns: 96 | alter_query = text( 97 | f'ALTER TABLE {table_name} ADD COLUMN "{column}" TEXT' 98 | ) 99 | connection.execute(alter_query) 100 | except Exception as e: 101 | logger.error(f"Error updating table structure for {table_name}: {e}") 102 | raise 103 | 104 | def save_into_db( 105 | self, page: int, resource: str, content: dict, replace: bool = False 106 | ): 107 | """ 108 | Enhanced version of save_into_db that handles batch processing 109 | 110 | Args: 111 | page (int): The page number or batch start page 112 | resource (str): The resource identifier 113 | content (dict): The data to save 114 | replace (bool): Whether to replace the existing table (True for first batch) 115 | """ 116 | table_name = resource.split("/")[-2] 117 | 118 | try: 119 | # Convert content to DataFrame 120 | if isinstance(content, dict): 121 | for key, value in content.items(): 122 | if isinstance(value, list) and value and isinstance(value[0], dict): 123 | parent_keys = [k for k in content.keys() if k != key] 124 | df = pd.json_normalize( 125 | content, record_path=key, meta=parent_keys 126 | ) 127 | else: 128 | df = pd.json_normalize(content) 129 | 130 | # Convert numeric columns to appropriate types 131 | # TODO: Add more numeric columns to the list or make it dynamic 132 | numeric_columns = [ 133 | "nSaldo", 134 | "nValorDocumento", 135 | "nSaldoAnterior", 136 | "nSaldoAtual", 137 | "nSaldoConciliado", 138 | "nSaldoProvisorio", 139 | "nLimiteCreditoTotal", 140 | "nSaldoDisponivel", 141 | ] 142 | 143 | for col in df.columns: 144 | if col in numeric_columns and col in df.columns: 145 | df[col] = pd.to_numeric( 146 | df[col].replace(["", None], "0"), errors="coerce" 147 | ) 148 | elif df[col].dtype == "object": 149 | df[col] = df[col].astype(str) 150 | 151 | # Create table with correct column types if it doesn't exist 152 | if replace or not self.table_exists(table_name): 153 | with self.engine.begin() as connection: 154 | # Drop table if replacing 155 | if replace and self.table_exists(table_name): 156 | connection.execute(text(f"DROP TABLE IF EXISTS {table_name}")) 157 | 158 | # Create column definitions 159 | columns = [] 160 | for col in df.columns: 161 | if col in numeric_columns: 162 | columns.append(f'"{col}" NUMERIC(15,2)') 163 | else: 164 | columns.append(f'"{col}" TEXT') 165 | 166 | create_table_sql = f"CREATE TABLE IF NOT EXISTS {table_name} ({', '.join(columns)})" 167 | connection.execute(text(create_table_sql)) 168 | 169 | # Define SQLAlchemy types for columns 170 | dtype = {} 171 | for col in df.columns: 172 | if col in numeric_columns: 173 | dtype[col] = types.Numeric(15, 2) 174 | else: 175 | dtype[col] = types.Text() 176 | 177 | # Use SQLAlchemy engine directly for better performance 178 | df.to_sql( 179 | table_name, 180 | self.engine, 181 | if_exists="append", 182 | index=False, 183 | method="multi", 184 | chunksize=1000, 185 | dtype=dtype, 186 | ) 187 | 188 | logger.success( 189 | f"{'Replaced' if replace else 'Appended'} data into table 
{table_name} starting from page {page}" 190 | ) 191 | 192 | except Exception as e: 193 | logger.error(f"Error saving data into table {table_name}: {e}") 194 | raise 195 | 196 | def table_exists(self, table_name: str) -> bool: 197 | """Check if a table exists in the database""" 198 | query = text( 199 | """ 200 | SELECT EXISTS ( 201 | SELECT FROM information_schema.tables 202 | WHERE table_name = :table_name 203 | ) 204 | """ 205 | ) 206 | result = self.execute_with_transaction( 207 | query, {"table_name": table_name} 208 | ).scalar() 209 | return bool(result) 210 | 211 | def select_from_table(self, table_name: str, distinct_column: str = None): 212 | try: 213 | if distinct_column: 214 | query = text(f'SELECT DISTINCT "{distinct_column}" FROM {table_name}') 215 | result = self.execute_with_transaction(query) 216 | return [row[0] for row in result] 217 | else: 218 | query = text(f"SELECT * FROM {table_name}") 219 | result = self.execute_with_transaction(query) 220 | return [dict(row._mapping) for row in result] 221 | except Exception as e: 222 | logger.error(f"Error selecting data from table {table_name}: {e}") 223 | return None 224 | 225 | def __del__(self): 226 | """Ensure proper cleanup of database connections""" 227 | try: 228 | if hasattr(self, "connection"): 229 | self.connection.close() 230 | if hasattr(self, "engine"): 231 | self.engine.dispose() 232 | except Exception: 233 | pass 234 | -------------------------------------------------------------------------------- /src/endpoints/__init__.py: -------------------------------------------------------------------------------- 1 | from .endpoints import Endpoints 2 | -------------------------------------------------------------------------------- /src/endpoints/data/data.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "resources": "geral/clientes/", 4 | "action": "ListarClientes", 5 | "params": { 6 | "pagina": 1, 7 | "registros_por_pagina": 100, 8 | "apenas_importado_api": "N" 9 | }, 10 | "data_source": "clientes_cadastro", 11 | "page_label": "pagina" 12 | }, 13 | { 14 | "resources": "geral/categorias/", 15 | "action": "ListarCategorias", 16 | "params": { 17 | "pagina": 1, 18 | "registros_por_pagina": 100, 19 | "apenas_importado_api": "N" 20 | }, 21 | "data_source": "categoria_cadastro", 22 | "page_label": "pagina" 23 | }, 24 | { 25 | "resources": "geral/empresas/", 26 | "action": "ListarEmpresas", 27 | "params": { 28 | "pagina": 1, 29 | "registros_por_pagina": 100, 30 | "apenas_importado_api": "N" 31 | }, 32 | "data_source": "empresas_cadastro", 33 | "page_label": "pagina" 34 | }, 35 | { 36 | "resources": "geral/departamentos/", 37 | "action": "ListarDepartamentos", 38 | "params": { 39 | "pagina": 1, 40 | "registros_por_pagina": 100 41 | }, 42 | "data_source": "departamentos", 43 | "page_label": "pagina" 44 | }, 45 | { 46 | "resources": "financas/mf/", 47 | "action": "ListarMovimentos", 48 | "params": { 49 | "nPagina": 1, 50 | "nRegPorPagina": 100 51 | }, 52 | "data_source": "movimentos", 53 | "page_label": "nPagina", 54 | "total_of_pages_label": "nTotPaginas", 55 | "records_label": "nRegistros" 56 | }, 57 | { 58 | "resources": "geral/contacorrente/", 59 | "action": "ListarContasCorrentes", 60 | "params": { 61 | "pagina": 1, 62 | "registros_por_pagina": 100, 63 | "apenas_importado_api": "N" 64 | }, 65 | "data_source": "ListarContasCorrentes", 66 | "page_label": "pagina" 67 | }, 68 | { 69 | "resources": "financas/extrato/", 70 | "action": "ListarExtrato", 71 | "params": { 72 | "nCodCC": 0, 73 | 
"cCodIntCC": "", 74 | "dPeriodoInicial": "", 75 | "dPeriodoFinal": "" 76 | }, 77 | "data_source": "listaMovimentos", 78 | "pagination_type": "date_range", 79 | "depends_on": "contacorrente" 80 | }, 81 | { 82 | "resources": "geral/produtos/", 83 | "action": "ListarProdutos", 84 | "params": { 85 | "pagina": 1, 86 | "registros_por_pagina": 100, 87 | "apenas_importado_api": "N", 88 | "filtrar_apenas_omiepdv": "N" 89 | }, 90 | "data_source": "produto_servico_cadastro", 91 | "page_label": "pagina" 92 | }, 93 | { 94 | "resources": "financas/contapagar/", 95 | "action": "ListarContasPagar", 96 | "params": { 97 | "pagina": 1, 98 | "registros_por_pagina": 100, 99 | "apenas_importado_api": "N" 100 | }, 101 | "data_source": "conta_pagar_cadastro", 102 | "page_label": "pagina" 103 | }, 104 | { 105 | "resources": "financas/contareceber/", 106 | "action": "ListarContasReceber", 107 | "params": { 108 | "pagina": 1, 109 | "registros_por_pagina": 100, 110 | "apenas_importado_api": "N" 111 | }, 112 | "data_source": "conta_receber_cadastro", 113 | "page_label": "pagina" 114 | }, 115 | { 116 | "resources": "financas/pesquisartitulos/", 117 | "action": "PesquisarLancamentos", 118 | "params": { 119 | "nPagina": 1, 120 | "nRegPorPagina": 100 121 | }, 122 | "data_source": "titulosEncontrados", 123 | "page_label": "nPagina", 124 | "total_of_pages_label": "nTotPaginas", 125 | "records_label": "nRegistros" 126 | } 127 | 128 | 129 | ] 130 | -------------------------------------------------------------------------------- /src/endpoints/endpoints.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Optional 3 | 4 | 5 | def read_json(path: str) -> dict: 6 | with open(path, "r") as file: 7 | return json.load(file) 8 | 9 | 10 | class Endpoints: 11 | def __init__(self) -> None: 12 | self.path = "src/endpoints/data/data.json" 13 | self.endpoints = read_json(self.path) 14 | 15 | def get_endpoint( 16 | self, resource: Optional[str] = None, action: Optional[str] = None 17 | ) -> dict: 18 | if action: 19 | for endpoint in self.endpoints: 20 | if endpoint.get("action") == action: 21 | return [endpoint] 22 | elif resource: 23 | for endpoint in self.endpoints: 24 | if endpoint.get("resources") == resource: 25 | return [endpoint] 26 | else: 27 | raise Exception("Resource or action not found") 28 | 29 | def get_all(self) -> list: 30 | return self.endpoints 31 | -------------------------------------------------------------------------------- /src/utils/constants.py: -------------------------------------------------------------------------------- 1 | HEADERS = {"Content-Type": "application/json"} 2 | -------------------------------------------------------------------------------- /src/utils/tools.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import Optional 3 | 4 | from src.api import Api 5 | from src.config import Settings 6 | from src.utils.constants import HEADERS 7 | 8 | settings = Settings() 9 | 10 | 11 | def get_body_params_pagination( 12 | action: str, 13 | params: dict, 14 | page: Optional[int] = None, 15 | field_pagination: Optional[str] = None, 16 | ) -> dict: 17 | if field_pagination: 18 | params[field_pagination] = page 19 | 20 | return { 21 | "call": action, 22 | "app_key": settings.APP_KEY, 23 | "app_secret": settings.APP_SECRET, 24 | "param": [params], 25 | } 26 | 27 | 28 | def get_total_of_pages( 29 | resource: str, 30 | action: str, 31 | params: dict, 32 | 
page_label: Optional[str] = None, 33 | total_of_pages_label: Optional[str] = None, 34 | records_label: Optional[str] = None, 35 | ) -> int: 36 | page_label = "pagina" if page_label is None else page_label 37 | total_of_pages_label = ( 38 | "total_de_paginas" if total_of_pages_label is None else total_of_pages_label 39 | ) 40 | records_label = "registros" if records_label is None else records_label  # currently unused 41 | 42 | payload = get_body_params_pagination(action, params, 1, page_label) 43 | 44 | api = Api( 45 | url=f"{settings.BASE_URL}{resource}", 46 | headers=HEADERS, 47 | json=payload, 48 | params=params, 49 | ) 50 | response = api.request(api.post) 51 | total_of_pages = response.get(total_of_pages_label, 0) 52 | 53 | return total_of_pages 54 | 55 | 56 | def generate_date_range(start_date_str: str) -> list: 57 | def add_month(data): 58 | new_month = data.month + 1 59 | new_year = data.year 60 | if new_month > 12: 61 | new_month = 1 62 | new_year += 1 63 | 64 | return data.replace(month=new_month, year=new_year) 65 | 66 | start_date = datetime.strptime(start_date_str, "%d/%m/%Y") 67 | start_date = start_date.replace(day=1) 68 | # e.g. "25/01/2025" is snapped to "01/01/2025" (first day of the month) 69 | # so the result steps month by month: ["01/01/2025", "01/02/2025", ...] 70 | 71 | today = datetime.today() 72 | 73 | date_list = [] 74 | 75 | current_date = start_date 76 | while current_date <= today: 77 | date_list.append(current_date.strftime("%d/%m/%Y")) 78 | current_date = add_month(current_date) 79 | 80 | return date_list 81 | --------------------------------------------------------------------------------
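How these pieces fit together is not shown in this excerpt (the repository's main.py and the Airflow DAGs are not reproduced above), so the following is only a rough sketch of a driver that loads every endpoint definition, pages through it with the helpers from src/utils/tools.py, and hands each response to Database.save_into_db. Only the imported classes and functions come from the files above; the loop structure, the skipping of date_range endpoints, and the replace-on-first-page choice are illustrative assumptions, not the project's actual flow.

```python
# Hypothetical driver wiring the components shown above; the real main.py may differ.
from src.api import Api
from src.config import Settings
from src.db import Database
from src.endpoints import Endpoints
from src.utils.constants import HEADERS
from src.utils.tools import get_body_params_pagination, get_total_of_pages

settings = Settings()
db = Database()

for endpoint in Endpoints().get_all():
    # Date-range endpoints (e.g. financas/extrato/) depend on data fetched by
    # another endpoint first, so this simplified sketch skips them.
    if endpoint.get("pagination_type") == "date_range":
        continue

    resource = endpoint["resources"]
    action = endpoint["action"]
    params = dict(endpoint["params"])
    page_label = endpoint.get("page_label", "pagina")

    # Ask the API how many pages exist for this resource.
    total_pages = get_total_of_pages(
        resource,
        action,
        params,
        page_label=page_label,
        total_of_pages_label=endpoint.get("total_of_pages_label"),
        records_label=endpoint.get("records_label"),
    )

    for page in range(1, total_pages + 1):
        payload = get_body_params_pagination(action, params, page, page_label)
        api = Api(
            url=f"{settings.BASE_URL}{resource}",
            headers=HEADERS,
            json=payload,
            params=params,
        )
        response = api.request(api.post)
        # Replace the table on the first page, append on later ones.
        db.save_into_db(page, resource, response, replace=(page == 1))
```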