├── .github
│   └── workflows
│       └── databricks-ci.yml
├── .gitignore
├── LICENSE
├── README.md
├── functions
│   ├── __init__.py
│   ├── cleaning_utils.py
│   └── tests
│       ├── __init__.py
│       └── test_cleaning_utils.py
├── requirements.txt
└── resources
    └── images
        └── github-action-secrets.png
/.github/workflows/databricks-ci.yml:
--------------------------------------------------------------------------------
1 | name: Databricks CI
2 | on: [push, pull_request]
3 | jobs:
4 |   run-databricks-ci:
5 |     runs-on: ubuntu-latest
6 |     steps:
7 |       - uses: actions/checkout@v2
8 |       - run: python -V
9 |       - run: pip install virtualenv
10 |       - run: virtualenv venv
11 |       - run: source venv/bin/activate
12 |       - run: pip install -r requirements.txt
13 |       - run: |
14 |           echo "y
15 |           ${{ secrets.DATABRICKS_HOST }}
16 |           ${{ secrets.DATABRICKS_TOKEN }}
17 |           ${{ secrets.DATABRICKS_CLUSTER_ID }}
18 |           ${{ secrets.DATABRICKS_WORKSPACE_ORG_ID }}
19 |           15001" | databricks-connect configure
20 |       - run: pytest functions --junitxml=unit-testresults.xml
21 |       - name: Publish Unit Test Results
22 |         uses: EnricoMi/publish-unit-test-result-action@v1
23 |         if: always()
24 |         with:
25 |           files: unit-testresults.xml
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Jonathan Neo
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Introduction
2 |
3 | The code in this repository provides sample PySpark functions and sample PyTest unit tests.
4 |
5 | # Setting up
6 |
7 |
8 | Step 1: Create your python environment
9 |
10 | Using conda, you can create your python environment by running:
11 | ```
12 | conda create -n <environment_name> python=3.8
13 | ```
14 |
15 | And activate it by running:
16 | ```
17 | conda activate <environment_name>
18 | ```
19 |
20 | Note:
21 | - We are using `python=3.8` because Databricks Runtime 9.1 LTS, used in this demo, requires Python 3.8.
22 |
23 |
24 |
25 |
26 | Step 2: Install dependencies
27 |
28 | Using pip, you can install all dependencies by running:
29 |
30 | ```
31 | pip install -r requirements.txt
32 | ```
33 |
34 | Note:
35 | - This installs all dependencies listed in the `requirements.txt` file located at the root of this repository.
36 |
37 |
38 |
39 |
40 | Step 3: Create your Databricks Cluster
41 |
42 | For this demo, please create a Databricks Cluster with Runtime `9.1 LTS`. See instructions on how to create a cluster here: https://docs.databricks.com/clusters/create.html
43 |
44 | Databricks Runtime 9.1 LTS supports features such as files and modules in Repos, which allows us to modularise our code. The Python version installed on your local machine must match the Python version required by the selected Databricks Runtime; for this demo, Python 3.8 is compatible with Databricks Runtime 9.1 LTS. For all version mappings, see: https://docs.databricks.com/dev-tools/databricks-connect.html#requirements
45 |
46 |
47 |
48 |
49 | Step 4: Configure Databricks Connect
50 |
51 | Databricks Connect allows you to run PySpark code from your local machine against a remote Databricks cluster.
52 |
53 | To configure the connection, run:
54 |
55 | ```
56 | databricks-connect configure
57 | ```
58 |
59 | You will be prompted for the following information:
60 | - Databricks Host
61 | - Databricks Token
62 | - Cluster ID
63 | - Org ID
64 | - Port
65 |
66 | You can obtain most of the necessary values by navigating to your cluster in your Databricks workspace and referring to the URL, as broken down in the example below (a short sketch after the list shows where the configured values end up).
67 |
68 | For example:
69 | - Full URL: `https://dbc-12345.cloud.databricks.com/?o=987654321#setting/clusters/my-987-cluster/configuration`
70 | - Databricks Host: `https://dbc-12345.cloud.databricks.com`
71 | - Databricks Token: see instructions on how to generate your databricks token here: https://docs.databricks.com/dev-tools/api/latest/authentication.html
72 | - Cluster ID: `my-987-cluster`
73 | - Org ID: `987654321`
74 | - Port: `15001` (leave as default)
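
Once you run `databricks-connect configure`, the values are typically stored as JSON in a file in your home directory (commonly `~/.databricks-connect`). The snippet below is a minimal sketch, assuming that default location, for checking which keys were written without printing the token:

```python
# Minimal sketch: assumes the default config location used by databricks-connect.
import json
from pathlib import Path

config_path = Path.home() / ".databricks-connect"
if config_path.exists():
    config = json.loads(config_path.read_text())
    # Print only the key names so the token itself is not displayed.
    print("Configured keys:", sorted(config.keys()))
else:
    print("No databricks-connect configuration found yet.")
```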
75 |
76 |
77 |
78 |
79 | Step 5: Validate Databricks Connect
80 |
81 | Validate that you are able to achieve Databricks Connect connectivity from your local machine by running:
82 |
83 | ```
84 | databricks-connect test
85 | ```
86 |
87 | You should see the following response (below is shortened):
88 | ```
89 | * Simple Scala test passed
90 | * Testing python command
91 | * Simple PySpark test passed
92 | * Testing dbutils.fs
93 | * Simple dbutils test passed
94 | * All tests passed.
95 | ```
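
As an optional extra check (not part of `databricks-connect test`), a trivial PySpark job run from your local environment should now execute on the remote cluster:

```python
from pyspark.sql import SparkSession

# With Databricks Connect configured, this SparkSession is backed by the remote cluster.
spark = SparkSession.builder.getOrCreate()

# A trivial job: if this prints 10, the round trip to the cluster works.
print(spark.range(10).count())
```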
96 |
97 |
98 |
99 | # Unit tests
100 |
101 | Unit tests are performed using PyTest on your local development environment. These same tests can be executed as part of a CI/CD pipeline so that code is always tested before merging into the production branch (e.g. `main`).
102 |
103 |
104 | Writing tests
105 |
106 | To understand how to write unit tests, refer to the two files below:
107 |
108 | `functions/cleaning_utils.py`
109 | ```python
110 | def lowercase_all_column_names(df:DataFrame)->DataFrame:
111 | """
112 | Convert all column names to lower case.
113 | """
114 | for col in df.columns:
115 | df = df.withColumnRenamed(col, col.lower())
116 | return df
117 | ```
118 |
119 |
120 | The code above is a PySpark function that accepts a Spark DataFrame, performs some cleaning/transformation, and returns a Spark DataFrame.
121 |
122 | We want to unit test this PySpark function to ensure that the results returned are as expected, and that future changes to the function won't break those expectations.
123 |
124 | To test this PySpark function, we write the following unit test:
125 |
126 | `functions/tests/test_cleaning_utils.py`
127 | ```python
128 | from pyspark.sql import Row, SparkSession
129 | import pandas as pd
130 | from datetime import datetime
131 | from ..cleaning_utils import *
132 |
133 | def test_lowercase_all_columns():
134 |     # ASSEMBLE
135 |     test_data = [
136 |         {
137 |             "ID": 1,
138 |             "First_Name": "Bob",
139 |             "Last_Name": "Builder",
140 |             "Age": 24
141 |         },
142 |         {
143 |             "ID": 2,
144 |             "First_Name": "Sam",
145 |             "Last_Name": "Smith",
146 |             "Age": 41
147 |         }
148 |     ]
149 |
150 |     spark = SparkSession.builder.getOrCreate()
151 |     test_df = spark.createDataFrame(map(lambda x: Row(**x), test_data))
152 |
153 |     # ACT
154 |     output_df = lowercase_all_column_names(test_df)
155 |
156 |     output_df_as_pd = output_df.toPandas()
157 |
158 |     expected_output_df = pd.DataFrame({
159 |         "id": [1, 2],
160 |         "first_name": ["Bob", "Sam"],
161 |         "last_name": ["Builder", "Smith"],
162 |         "age": [24, 41]
163 |     })
164 |     # ASSERT
165 |     pd.testing.assert_frame_equal(left=expected_output_df, right=output_df_as_pd, check_exact=True)
166 | ```
167 |
168 | The test above does 3 things:
169 |
170 | 1. **Arrange**: Create a dummy Spark DataFrame.
171 | 2. **Act**: Invoke our PySpark function, passing in the dummy Spark DataFrame.
172 | 3. **Assert**: Check that the data returned after the transformation matches our expectation. The result is a pass or a fail.
173 |
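As your test suite grows, you may prefer to create the SparkSession once and share it across tests. The repository does not do this, but a minimal sketch using a pytest fixture in a hypothetical `functions/tests/conftest.py` would look like:

```python
# functions/tests/conftest.py (hypothetical file, not part of this repository)
import pytest
from pyspark.sql import SparkSession

@pytest.fixture(scope="session")
def spark():
    # A single SparkSession shared by every test in the session.
    return SparkSession.builder.getOrCreate()
```

Each test would then accept `spark` as an argument instead of calling `SparkSession.builder.getOrCreate()` itself.
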
174 | When developing your tests, you may wish to run your `test_*.py` file to validate that the code can be executed. You can do so by running:
175 | ```
176 | python -m functions.tests.test_cleaning_utils
177 | ```
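
Running the module this way mainly confirms that it and its imports load without errors; pytest is still what executes the test functions. If you also want the command above to run the tests directly, one option (an addition, not present in the repository's test file) is a main guard at the bottom of the module:

```python
# Optional addition to functions/tests/test_cleaning_utils.py (not in the original file).
if __name__ == "__main__":
    test_lowercase_all_columns()
    test_uppercase_all_columns()
    test_add_metadata()
    print("All tests executed without assertion errors.")
```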
178 |
179 |
180 | The benefit of using PyTest is that test results can be exported in the JUnit XML format, a standard test report format supported by GitHub, Azure DevOps, GitLab, and many other platforms.
181 |
182 |
183 |
184 |
185 | Running tests
186 | To run all tests in the functions folder, run:
187 |
188 | ```
189 | pytest functions
190 | ```
191 |
192 | You should see the following output:
193 | ```
194 | ======= test session starts =======
195 | collected 3 items
196 | functions/tests/test_cleaning_utils.py ... [100%]
197 | ======= 3 passed in 16.40s =======
198 | ```
199 |
200 |
201 | # Continuous Integration (CI)
202 |
203 |
204 | GitHub Actions
205 |
206 | To configure GitHub Actions CI pipelines, follow the steps below:
207 |
208 | Step 1: Create .github folder
209 |
210 | At the root of your repository, create the following folders: `.github/workflows`
211 |
212 | GitHub Actions will look for any `.yml` files stored in `.github/workflows`.
213 |
214 | Step 2: Create your secrets
215 |
216 | Create the following GitHub repository secrets, using the same values you provided when configuring Databricks Connect locally.
217 |
218 | - `DATABRICKS_HOST`
219 | - `DATABRICKS_TOKEN`
220 | - `DATABRICKS_CLUSTER_ID`
221 | - `DATABRICKS_WORKSPACE_ORG_ID`
222 |
223 | 
224 |
225 | For more information about how to create secrets, see: https://docs.github.com/en/actions/security-guides/encrypted-secrets
226 |
227 |
228 | Step 3: Create your yml file
229 |
230 | Create a new `.yml` file with a name of your choice, e.g. `databricks-ci.yml`, inside the `.github/workflows` folder.
231 |
232 | Below is sample code for a working unit test pipeline with published test results.
233 |
234 | ```yml
235 | name: Databricks CI
236 | on: [push, pull_request]
237 | jobs:
238 |   run-databricks-ci:
239 |     runs-on: ubuntu-latest
240 |     steps:
241 |       - uses: actions/checkout@v2
242 |       - run: python -V
243 |       - run: pip install virtualenv
244 |       - run: virtualenv venv
245 |       - run: source venv/bin/activate
246 |       - run: pip install -r requirements.txt
247 |       - run: |
248 |           echo "y
249 |           ${{ secrets.DATABRICKS_HOST }}
250 |           ${{ secrets.DATABRICKS_TOKEN }}
251 |           ${{ secrets.DATABRICKS_CLUSTER_ID }}
252 |           ${{ secrets.DATABRICKS_WORKSPACE_ORG_ID }}
253 |           15001" | databricks-connect configure
254 |       - run: pytest functions --junitxml=unit-testresults.xml
255 |       - name: Publish Unit Test Results
256 |         uses: EnricoMi/publish-unit-test-result-action@v1
257 |         if: always()
258 |         with:
259 |           files: unit-testresults.xml
260 | ```
261 |
262 | YML explained:
263 |
264 | ```yml
265 | name: Databricks CI
266 | on: [push, pull_request]
267 | ```
268 |
269 | The `name` key allows you to specify the name of your pipeline, e.g. `Databricks CI`.
270 |
271 | The `on` key defines which triggers will kick off the pipeline, e.g. `[push, pull_request]`.
272 |
273 | ```yml
274 | jobs:
275 |   run-databricks-ci:
276 |     runs-on: ubuntu-latest
277 | ```
278 |
279 | `jobs` defines one or more jobs to run; here a single job, `run-databricks-ci`, contains multiple steps.
280 |
281 | The job runs on `ubuntu-latest`, which comes pre-installed with tools such as Python. For details on the Python version and what other tools are pre-installed, see: https://docs.github.com/en/actions/using-github-hosted-runners/about-github-hosted-runners#preinstalled-software
282 |
283 |
284 | ```yml
285 | steps:
286 |   - uses: actions/checkout@v2
287 |   - run: python -V
288 |   - run: pip install virtualenv
289 |   - run: virtualenv venv
290 |   - run: source venv/bin/activate
291 |   - run: pip install -r requirements.txt
292 |
293 | ```
294 |
295 | `- uses: actions/checkout@v2` checks out the repository onto the runner.
296 |
297 | `- run: python -V` prints the installed Python version.
298 |
299 | `- run: pip install virtualenv` installs the `virtualenv` package.
300 |
301 | `- run: virtualenv venv` creates a virtual environment with the name `venv`.
302 |
303 | `- run: source venv/bin/activate` activates the newly created virtual environment.
304 |
305 | `- run: pip install -r requirements.txt` installs the dependencies specified in the `requirements.txt` file.
306 |
307 | ```yml
308 | - run: |
309 | echo "y
310 | ${{ secrets.DATABRICKS_HOST }}
311 | ${{ secrets.DATABRICKS_TOKEN }}
312 | ${{ secrets.DATABRICKS_CLUSTER_ID }}
313 | ${{ secrets.DATABRICKS_WORKSPACE_ORG_ID }}
314 | 15001" | databricks-connect configure
315 | ```
316 |
317 | `echo "..." | databricks-connect configure` invokes the `databricks-connect configure` command and pipes the configuration values into its interactive prompts: the leading `y` answers the initial confirmation prompt, followed by the host, token, cluster ID, org ID, and port taken from the secrets defined earlier.
318 |
319 | ```yml
320 | - run: pytest functions --junitxml=unit-testresults.xml
321 | ```
322 |
323 | The above runs `pytest` on the `functions` folder and writes the results in JUnit XML format to a file path that we specify, e.g. `unit-testresults.xml`.
324 |
325 |
326 | ```yml
327 | - name: Publish Unit Test Results
328 |   uses: EnricoMi/publish-unit-test-result-action@v1
329 |   if: always()
330 |   with:
331 |     files: unit-testresults.xml
332 | ```
333 |
334 | The above publishes the `unit-testresults.xml` results using the third-party action `EnricoMi/publish-unit-test-result-action@v1`. The `if: always()` condition ensures the results are published even if an earlier step (such as the test run) fails.
335 |
336 |
337 |
--------------------------------------------------------------------------------
/functions/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jonathanneo/databricks-unit-testing/e3f792330cb772c7f27a58b6c028936258f7ee0b/functions/__init__.py
--------------------------------------------------------------------------------
/functions/cleaning_utils.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql import DataFrame, functions as F
2 |
3 | def lowercase_all_column_names(df:DataFrame)->DataFrame:
4 | """
5 | Convert all column names to lower case.
6 | """
7 | for col in df.columns:
8 | df = df.withColumnRenamed(col, col.lower())
9 | return df
10 |
11 | def uppercase_all_column_names(df:DataFrame)->DataFrame:
12 | """
13 | Convert all column names to upper case.
14 | """
15 | for col in df.columns:
16 | df = df.withColumnRenamed(col, col.upper())
17 | return df
18 |
19 | def add_metadata(df:DataFrame, field_dict:dict)->DataFrame:
20 | for pair in field_dict.items():
21 | df = df.withColumn(pair[0], F.lit(pair[1]))
22 | return df
--------------------------------------------------------------------------------
/functions/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jonathanneo/databricks-unit-testing/e3f792330cb772c7f27a58b6c028936258f7ee0b/functions/tests/__init__.py
--------------------------------------------------------------------------------
/functions/tests/test_cleaning_utils.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql import Row, SparkSession
2 | import pandas as pd
3 | from datetime import datetime
4 |
5 | from ..cleaning_utils import *
6 |
7 | def test_lowercase_all_columns():
8 |     # ASSEMBLE
9 |     test_data = [
10 |         {
11 |             "ID": 1,
12 |             "First_Name": "Bob",
13 |             "Last_Name": "Builder",
14 |             "Age": 24
15 |         },
16 |         {
17 |             "ID": 2,
18 |             "First_Name": "Sam",
19 |             "Last_Name": "Smith",
20 |             "Age": 41
21 |         }
22 |     ]
23 |
24 |     spark = SparkSession.builder.getOrCreate()
25 |     test_df = spark.createDataFrame(map(lambda x: Row(**x), test_data))
26 |
27 |     # ACT
28 |     output_df = lowercase_all_column_names(test_df)
29 |
30 |     output_df_as_pd = output_df.toPandas()
31 |
32 |     expected_output_df = pd.DataFrame({
33 |         "id": [1, 2],
34 |         "first_name": ["Bob", "Sam"],
35 |         "last_name": ["Builder", "Smith"],
36 |         "age": [24, 41]
37 |     })
38 |     # ASSERT
39 |     pd.testing.assert_frame_equal(left=expected_output_df, right=output_df_as_pd, check_exact=True)
40 |
41 | def test_uppercase_all_columns():
42 |     # ASSEMBLE
43 |     test_data = [
44 |         {
45 |             "ID": 1,
46 |             "First_Name": "Bob",
47 |             "Last_Name": "Builder",
48 |             "Age": 24
49 |         },
50 |         {
51 |             "ID": 2,
52 |             "First_Name": "Sam",
53 |             "Last_Name": "Smith",
54 |             "Age": 41
55 |         }
56 |     ]
57 |
58 |     spark = SparkSession.builder.getOrCreate()
59 |     test_df = spark.createDataFrame(map(lambda x: Row(**x), test_data))
60 |
61 |     # ACT
62 |     output_df = uppercase_all_column_names(test_df)
63 |
64 |     output_df_as_pd = output_df.toPandas()
65 |
66 |     expected_output_df = pd.DataFrame({
67 |         "ID": [1, 2],
68 |         "FIRST_NAME": ["Bob", "Sam"],
69 |         "LAST_NAME": ["Builder", "Smith"],
70 |         "AGE": [24, 41]
71 |     })
72 |     # ASSERT
73 |     pd.testing.assert_frame_equal(left=expected_output_df, right=output_df_as_pd, check_exact=True)
74 |
75 |
76 | def test_add_metadata():
77 |     # ASSEMBLE
78 |     test_data = [
79 |         {
80 |             "id": 1,
81 |             "first_name": "Bob",
82 |             "last_name": "Builder",
83 |             "age": 24
84 |         },
85 |         {
86 |             "id": 2,
87 |             "first_name": "Sam",
88 |             "last_name": "Smith",
89 |             "age": 41
90 |         }
91 |     ]
92 |
93 |     now = datetime.now()
94 |     field_dict = {
95 |         "task_id": 1,
96 |         "ingested_at": now
97 |     }
98 |     spark = SparkSession.builder.getOrCreate()
99 |     test_df = spark.createDataFrame(map(lambda x: Row(**x), test_data))
100 |
101 |     # ACT
102 |     output_df = add_metadata(df=test_df, field_dict=field_dict)
103 |
104 |     output_df_as_pd = output_df.toPandas()
105 |
106 |     expected_output_df = pd.DataFrame({
107 |         "id": [1, 2],
108 |         "first_name": ["Bob", "Sam"],
109 |         "last_name": ["Builder", "Smith"],
110 |         "age": [24, 41],
111 |         "task_id": [1, 1],
112 |         "ingested_at": [now, now]
113 |     })
114 |     # ASSERT
115 |     pd.testing.assert_frame_equal(left=expected_output_df, right=output_df_as_pd, check_exact=True, check_dtype=False)
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | attrs==21.2.0
2 | certifi==2021.10.8
3 | databricks-connect==9.1.2
4 | iniconfig==1.1.1
5 | numpy==1.21.3
6 | packaging==21.2
7 | pandas==1.3.4
8 | pluggy==1.0.0
9 | py==1.10.0
10 | py4j==0.10.9
11 | pyarrow==6.0.0
12 | pyparsing==2.4.7
13 | pytest==6.2.5
14 | python-dateutil==2.8.2
15 | pytz==2021.3
16 | six==1.16.0
17 | toml==0.10.2
18 |
--------------------------------------------------------------------------------
/resources/images/github-action-secrets.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jonathanneo/databricks-unit-testing/e3f792330cb772c7f27a58b6c028936258f7ee0b/resources/images/github-action-secrets.png
--------------------------------------------------------------------------------