├── .gitignore
├── .devcontainer
│   ├── .postCreateCommand.sh
│   └── devcontainer.json
├── requirements.txt
├── main.py
├── .github
│   └── dependabot.yml
└── dbfs_modified.py
/.gitignore:
--------------------------------------------------------------------------------
# environment variables
.env

# pycache
__pycache__/
--------------------------------------------------------------------------------
/.devcontainer/.postCreateCommand.sh:
--------------------------------------------------------------------------------
#!/bin/bash
sudo apt-get update && sudo apt-get -y upgrade

# Install FUSE
sudo apt-get install -y fuse

# Install required packages
pip install -r requirements.txt
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
certifi==2024.2.2
charset-normalizer==3.3.2
fsspec==2024.3.1
fusepy==3.0.1
gitdb==4.0.11
GitPython==3.1.41
idna==3.7
requests==2.31.0
setuptools==69.0.3
smmap==5.0.1
urllib3==2.2.1
wheel==0.43.0
python-dotenv==1.0.1
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import os
from pathlib import Path

from dotenv import load_dotenv
from fsspec import fuse

from dbfs_modified import DatabricksFileSystem

load_dotenv()

mount_path = Path("dbfs")
mount_path.mkdir(exist_ok=True)

fs = DatabricksFileSystem(
    instance=os.getenv("DATABRICKS_INSTANCE"), token=os.getenv("DATABRICKS_TOKEN")
)
fuse.run(
    fs, path="/", mount_point=str(mount_path.absolute()), threads=False, foreground=True
)
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
# To get started with Dependabot version updates, you'll need to specify which
# package ecosystems to update and where the package manifests are located.
# Please see the documentation for more information:
# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
# https://containers.dev/guide/dependabot

version: 2
updates:
  - package-ecosystem: "devcontainers"
    directory: "/"
    schedule:
      interval: weekly
--------------------------------------------------------------------------------
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/python
{
    "name": "Python 3",
    // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
    "image": "mcr.microsoft.com/devcontainers/python:1-3.12-bullseye",
    "customizations": {
        "vscode": {
            "extensions": [
                "ms-python.isort",
                "ms-python.black-formatter"
            ]
        }
    },

    // Features to add to the dev container. More info: https://containers.dev/features.
    // "features": {},

    // Use 'forwardPorts' to make a list of ports inside the container available locally.
20 | // "forwardPorts": [], 21 | 22 | // Use 'postCreateCommand' to run commands after the container is created. 23 | "postCreateCommand": "bash .devcontainer/.postCreateCommand.sh", 24 | 25 | // Configure tool-specific properties. 26 | // "customizations": {}, 27 | 28 | // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. 29 | "remoteUser": "root", 30 | 31 | // Uncomment the next line to run docker args 32 | "runArgs": ["--privileged"] 33 | } 34 | -------------------------------------------------------------------------------- /dbfs_modified.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import urllib 3 | 4 | import requests 5 | import requests.exceptions 6 | from requests.adapters import HTTPAdapter, Retry 7 | 8 | from fsspec import AbstractFileSystem 9 | from fsspec.spec import AbstractBufferedFile 10 | 11 | 12 | class DatabricksException(Exception): 13 | """ 14 | Helper class for exceptions raised in this module. 15 | """ 16 | 17 | def __init__(self, error_code, message): 18 | """Create a new DatabricksException""" 19 | super().__init__(message) 20 | 21 | self.error_code = error_code 22 | self.message = message 23 | 24 | 25 | class DatabricksFileSystem(AbstractFileSystem): 26 | """ 27 | Get access to the Databricks filesystem implementation over HTTP. 28 | Can be used inside and outside of a databricks cluster. 29 | """ 30 | 31 | def __init__(self, instance, token, **kwargs): 32 | """ 33 | Create a new DatabricksFileSystem. 34 | 35 | Parameters 36 | ---------- 37 | instance: str 38 | The instance URL of the databricks cluster. 39 | For example for an Azure databricks cluster, this 40 | has the form adb-..azuredatabricks.net. 41 | token: str 42 | Your personal token. Find out more 43 | here: https://docs.databricks.com/dev-tools/api/latest/authentication.html 44 | """ 45 | self.instance = instance 46 | self.token = token 47 | self.session = requests.Session() 48 | self.retries = Retry( 49 | total=10, 50 | backoff_factor=0.05, 51 | status_forcelist=[408, 429, 500, 502, 503, 504], 52 | ) 53 | 54 | self.session.mount("https://", HTTPAdapter(max_retries=self.retries)) 55 | self.session.headers.update({"Authorization": f"Bearer {self.token}"}) 56 | 57 | super().__init__(**kwargs) 58 | 59 | def ls(self, path, detail=True, **kwargs): 60 | """ 61 | List the contents of the given path. 62 | 63 | Parameters 64 | ---------- 65 | path: str 66 | Absolute path 67 | detail: bool 68 | Return not only the list of filenames, 69 | but also additional information on file sizes 70 | and types. 
71 | """ 72 | # if len(path) == 0: 73 | # path = "/" 74 | # print(path) 75 | 76 | out = self._ls_from_cache(path) 77 | 78 | if not out or path != self.old_path: 79 | print("Running the API call") 80 | try: 81 | if path == "": 82 | path = "/" 83 | r = self._send_to_api( 84 | method="get", endpoint="list", json={"path": path} 85 | ) 86 | except DatabricksException as e: 87 | if e.error_code == "RESOURCE_DOES_NOT_EXIST": 88 | raise FileNotFoundError(e.message) 89 | 90 | raise e 91 | files = r["files"] 92 | out = [ 93 | { 94 | "name": o["path"], 95 | "type": "directory" if o["is_dir"] else "file", 96 | "size": o["file_size"], 97 | } 98 | for o in files 99 | ] 100 | self.dircache[path] = out 101 | else: 102 | print("Using the cache") 103 | 104 | self.old_path = path 105 | 106 | if detail: 107 | return out 108 | return [o["name"] for o in out] 109 | 110 | def makedirs(self, path, exist_ok=True): 111 | """ 112 | Create a given absolute path and all of its parents. 113 | 114 | Parameters 115 | ---------- 116 | path: str 117 | Absolute path to create 118 | exist_ok: bool 119 | If false, checks if the folder 120 | exists before creating it (and raises an 121 | Exception if this is the case) 122 | """ 123 | if not exist_ok: 124 | try: 125 | # If the following succeeds, the path is already present 126 | self._send_to_api( 127 | method="get", endpoint="get-status", json={"path": path} 128 | ) 129 | raise FileExistsError(f"Path {path} already exists") 130 | except DatabricksException as e: 131 | if e.error_code == "RESOURCE_DOES_NOT_EXIST": 132 | pass 133 | 134 | try: 135 | self._send_to_api(method="post", endpoint="mkdirs", json={"path": path}) 136 | except DatabricksException as e: 137 | if e.error_code == "RESOURCE_ALREADY_EXISTS": 138 | raise FileExistsError(e.message) 139 | 140 | raise e 141 | self.invalidate_cache(self._parent(path)) 142 | 143 | def mkdir(self, path, create_parents=True, **kwargs): 144 | """ 145 | Create a given absolute path and all of its parents. 146 | 147 | Parameters 148 | ---------- 149 | path: str 150 | Absolute path to create 151 | create_parents: bool 152 | Whether to create all parents or not. 153 | "False" is not implemented so far. 154 | """ 155 | if not create_parents: 156 | raise NotImplementedError 157 | 158 | self.mkdirs(path, **kwargs) 159 | 160 | def rm(self, path, recursive=False, **kwargs): 161 | """ 162 | Remove the file or folder at the given absolute path. 163 | 164 | Parameters 165 | ---------- 166 | path: str 167 | Absolute path what to remove 168 | recursive: bool 169 | Recursively delete all files in a folder. 170 | """ 171 | try: 172 | self._send_to_api( 173 | method="post", 174 | endpoint="delete", 175 | json={"path": path, "recursive": recursive}, 176 | ) 177 | except DatabricksException as e: 178 | # This is not really an exception, it just means 179 | # not everything was deleted so far 180 | if e.error_code == "PARTIAL_DELETE": 181 | self.rm(path=path, recursive=recursive) 182 | elif e.error_code == "IO_ERROR": 183 | # Using the same exception as the os module would use here 184 | raise OSError(e.message) 185 | 186 | raise e 187 | self.invalidate_cache(self._parent(path)) 188 | 189 | def mv( 190 | self, source_path, destination_path, recursive=False, maxdepth=None, **kwargs 191 | ): 192 | """ 193 | Move a source to a destination path. 194 | 195 | A note from the original [databricks API manual] 196 | (https://docs.databricks.com/dev-tools/api/latest/dbfs.html#move). 

        When moving a large number of files the API call will time out after
        approximately 60s, potentially resulting in partially moved data.
        Therefore, for operations that move more than 10k files, we strongly
        discourage using the DBFS REST API.

        Parameters
        ----------
        source_path: str
            From where to move (absolute path)
        destination_path: str
            To where to move (absolute path)
        recursive: bool
            Not implemented so far.
        maxdepth:
            Not implemented so far.
        """
        if recursive:
            raise NotImplementedError
        if maxdepth:
            raise NotImplementedError

        try:
            self._send_to_api(
                method="post",
                endpoint="move",
                json={"source_path": source_path, "destination_path": destination_path},
            )
        except DatabricksException as e:
            if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                raise FileNotFoundError(e.message)
            elif e.error_code == "RESOURCE_ALREADY_EXISTS":
                raise FileExistsError(e.message)

            raise e
        self.invalidate_cache(self._parent(source_path))
        self.invalidate_cache(self._parent(destination_path))

    def _open(self, path, mode="rb", block_size="default", **kwargs):
        """
        Override the base class method to make sure a DatabricksFile is created.
        All arguments are copied from the base method.

        Only the default block size is allowed.
        """
        return DatabricksFile(self, path, mode=mode, block_size=block_size, **kwargs)

    def _send_to_api(self, method, endpoint, json):
        """
        Send the given JSON to the DBFS API
        using a GET or POST request (specified by the argument `method`).

        Parameters
        ----------
        method: str
            Which HTTP method to use for communication; "get" or "post".
        endpoint: str
            Where to send the request to (last part of the API URL)
        json: dict
            Dictionary of information to send
        """
        if method == "post":
            session_call = self.session.post
        elif method == "get":
            session_call = self.session.get
        else:
            raise ValueError(f"Do not understand method {method}")

        url = urllib.parse.urljoin(f"https://{self.instance}/api/2.0/dbfs/", endpoint)

        r = session_call(url, json=json)

        # The DBFS API returns JSON, also in case of an exception.
        # We want to preserve this information as well as possible.
        try:
            r.raise_for_status()
        except requests.HTTPError as e:
            # try to extract a JSON error message
            # if that fails, fall back to the original exception
            try:
                exception_json = e.response.json()
            except Exception:
                raise e

            raise DatabricksException(**exception_json)

        return r.json()

    def _create_handle(self, path, overwrite=True):
        """
        Internal function to create a handle, which can be used to
        write blocks of a file to DBFS.
        A handle has a unique identifier which needs to be passed
        with every write during this transaction.
        The handle is active for 10 minutes - after that a new
        write transaction needs to be created.
        Make sure to close the handle after you are finished.

        Parameters
        ----------
        path: str
            Absolute path for this file.
        overwrite: bool
            If a file already exists at this location, either overwrite
            it or raise an exception.
302 | """ 303 | try: 304 | r = self._send_to_api( 305 | method="post", 306 | endpoint="create", 307 | json={"path": path, "overwrite": overwrite}, 308 | ) 309 | return r["handle"] 310 | except DatabricksException as e: 311 | if e.error_code == "RESOURCE_ALREADY_EXISTS": 312 | raise FileExistsError(e.message) 313 | 314 | raise e 315 | 316 | def _close_handle(self, handle): 317 | """ 318 | Close a handle, which was opened by :func:`_create_handle`. 319 | 320 | Parameters 321 | ---------- 322 | handle: str 323 | Which handle to close. 324 | """ 325 | try: 326 | self._send_to_api(method="post", endpoint="close", json={"handle": handle}) 327 | except DatabricksException as e: 328 | if e.error_code == "RESOURCE_DOES_NOT_EXIST": 329 | raise FileNotFoundError(e.message) 330 | 331 | raise e 332 | 333 | def _add_data(self, handle, data): 334 | """ 335 | Upload data to an already opened file handle 336 | (opened by :func:`_create_handle`). 337 | The maximal allowed data size is 1MB after 338 | conversion to base64. 339 | Remember to close the handle when you are finished. 340 | 341 | Parameters 342 | ---------- 343 | handle: str 344 | Which handle to upload data to. 345 | data: bytes 346 | Block of data to add to the handle. 347 | """ 348 | data = base64.b64encode(data).decode() 349 | try: 350 | self._send_to_api( 351 | method="post", 352 | endpoint="add-block", 353 | json={"handle": handle, "data": data}, 354 | ) 355 | except DatabricksException as e: 356 | if e.error_code == "RESOURCE_DOES_NOT_EXIST": 357 | raise FileNotFoundError(e.message) 358 | elif e.error_code == "MAX_BLOCK_SIZE_EXCEEDED": 359 | raise ValueError(e.message) 360 | 361 | raise e 362 | 363 | def _get_data(self, path, start, end): 364 | """ 365 | Download data in bytes from a given absolute path in a block 366 | from [start, start+length]. 367 | The maximum number of allowed bytes to read is 1MB. 368 | 369 | Parameters 370 | ---------- 371 | path: str 372 | Absolute path to download data from 373 | start: int 374 | Start position of the block 375 | end: int 376 | End position of the block 377 | """ 378 | try: 379 | r = self._send_to_api( 380 | method="get", 381 | endpoint="read", 382 | json={"path": path, "offset": start, "length": end - start}, 383 | ) 384 | return base64.b64decode(r["data"]) 385 | except DatabricksException as e: 386 | if e.error_code == "RESOURCE_DOES_NOT_EXIST": 387 | raise FileNotFoundError(e.message) 388 | elif e.error_code in ["INVALID_PARAMETER_VALUE", "MAX_READ_SIZE_EXCEEDED"]: 389 | raise ValueError(e.message) 390 | 391 | raise e 392 | 393 | def invalidate_cache(self, path=None): 394 | if path is None: 395 | self.dircache.clear() 396 | else: 397 | self.dircache.pop(path, None) 398 | super().invalidate_cache(path) 399 | 400 | 401 | class DatabricksFile(AbstractBufferedFile): 402 | """ 403 | Helper class for files referenced in the DatabricksFileSystem. 404 | """ 405 | 406 | DEFAULT_BLOCK_SIZE = 1 * 2**20 # only allowed block size 407 | 408 | def __init__( 409 | self, 410 | fs, 411 | path, 412 | mode="rb", 413 | block_size="default", 414 | autocommit=True, 415 | cache_type="readahead", 416 | cache_options=None, 417 | **kwargs, 418 | ): 419 | """ 420 | Create a new instance of the DatabricksFile. 421 | 422 | The blocksize needs to be the default one. 
423 | """ 424 | if block_size is None or block_size == "default": 425 | block_size = self.DEFAULT_BLOCK_SIZE 426 | 427 | assert ( 428 | block_size == self.DEFAULT_BLOCK_SIZE 429 | ), f"Only the default block size is allowed, not {block_size}" 430 | 431 | super().__init__( 432 | fs, 433 | path, 434 | mode=mode, 435 | block_size=block_size, 436 | autocommit=autocommit, 437 | cache_type=cache_type, 438 | cache_options=cache_options or {}, 439 | **kwargs, 440 | ) 441 | 442 | def _initiate_upload(self): 443 | """Internal function to start a file upload""" 444 | self.handle = self.fs._create_handle(self.path) 445 | 446 | def _upload_chunk(self, final=False): 447 | """Internal function to add a chunk of data to a started upload""" 448 | self.buffer.seek(0) 449 | data = self.buffer.getvalue() 450 | 451 | data_chunks = [ 452 | data[start:end] for start, end in self._to_sized_blocks(len(data)) 453 | ] 454 | 455 | for data_chunk in data_chunks: 456 | self.fs._add_data(handle=self.handle, data=data_chunk) 457 | 458 | if final: 459 | self.fs._close_handle(handle=self.handle) 460 | return True 461 | 462 | def _fetch_range(self, start, end): 463 | """Internal function to download a block of data""" 464 | return_buffer = b"" 465 | length = end - start 466 | for chunk_start, chunk_end in self._to_sized_blocks(length, start): 467 | return_buffer += self.fs._get_data( 468 | path=self.path, start=chunk_start, end=chunk_end 469 | ) 470 | 471 | return return_buffer 472 | 473 | def _to_sized_blocks(self, length, start=0): 474 | """Helper function to split a range from 0 to total_length into bloksizes""" 475 | end = start + length 476 | for data_chunk in range(start, end, self.blocksize): 477 | data_start = data_chunk 478 | data_end = min(end, data_chunk + self.blocksize) 479 | yield data_start, data_end 480 | --------------------------------------------------------------------------------