├── .gitignore
├── .devcontainer
│   ├── .postCreateCommand.sh
│   └── devcontainer.json
├── requirements.txt
├── main.py
├── .github
│   └── dependabot.yml
└── dbfs_modified.py
/.gitignore:
--------------------------------------------------------------------------------
# environment variables
.env

# pycache
__pycache__/
--------------------------------------------------------------------------------
/.devcontainer/.postCreateCommand.sh:
--------------------------------------------------------------------------------
#!/bin/bash
sudo apt-get update && sudo apt-get -y upgrade

# Install FUSE
sudo apt-get install -y fuse

# Install required packages
pip install -r requirements.txt
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
certifi==2024.2.2
charset-normalizer==3.3.2
fsspec==2024.3.1
fusepy==3.0.1
gitdb==4.0.11
GitPython==3.1.41
idna==3.7
requests==2.31.0
setuptools==69.0.3
smmap==5.0.1
urllib3==2.2.1
wheel==0.43.0
python-dotenv==1.0.1
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import os
from pathlib import Path

from dotenv import load_dotenv
from fsspec import fuse

from dbfs_modified import DatabricksFileSystem

load_dotenv()

mount_path = Path("dbfs")
mount_path.mkdir(exist_ok=True)

fs = DatabricksFileSystem(
    instance=os.getenv("DATABRICKS_INSTANCE"), token=os.getenv("DATABRICKS_TOKEN")
)
fuse.run(
    fs, path="/", mount_point=str(mount_path.absolute()), threads=False, foreground=True
)
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
# To get started with Dependabot version updates, you'll need to specify which
# package ecosystems to update and where the package manifests are located.
# Please see the documentation for more information:
# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
# https://containers.dev/guide/dependabot

version: 2
updates:
  - package-ecosystem: "devcontainers"
    directory: "/"
    schedule:
      interval: weekly
--------------------------------------------------------------------------------
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
// For format details, see https://aka.ms/devcontainer.json. For config options, see the
// README at: https://github.com/devcontainers/templates/tree/main/src/python
{
    "name": "Python 3",
    // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
    "image": "mcr.microsoft.com/devcontainers/python:1-3.12-bullseye",
    "customizations": {
        "vscode": {
            "extensions": [
                "ms-python.isort",
                "ms-python.black-formatter"
            ]
        }
    },

    // Features to add to the dev container. More info: https://containers.dev/features.
    // "features": {},

    // Use 'forwardPorts' to make a list of ports inside the container available locally.
20 | // "forwardPorts": [], 21 | 22 | // Use 'postCreateCommand' to run commands after the container is created. 23 | "postCreateCommand": "bash .devcontainer/.postCreateCommand.sh", 24 | 25 | // Configure tool-specific properties. 26 | // "customizations": {}, 27 | 28 | // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. 29 | "remoteUser": "root", 30 | 31 | // Uncomment the next line to run docker args 32 | "runArgs": ["--privileged"] 33 | } 34 | -------------------------------------------------------------------------------- /dbfs_modified.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import urllib 3 | 4 | import requests 5 | import requests.exceptions 6 | from requests.adapters import HTTPAdapter, Retry 7 | 8 | from fsspec import AbstractFileSystem 9 | from fsspec.spec import AbstractBufferedFile 10 | 11 | 12 | class DatabricksException(Exception): 13 | """ 14 | Helper class for exceptions raised in this module. 15 | """ 16 | 17 | def __init__(self, error_code, message): 18 | """Create a new DatabricksException""" 19 | super().__init__(message) 20 | 21 | self.error_code = error_code 22 | self.message = message 23 | 24 | 25 | class DatabricksFileSystem(AbstractFileSystem): 26 | """ 27 | Get access to the Databricks filesystem implementation over HTTP. 28 | Can be used inside and outside of a databricks cluster. 29 | """ 30 | 31 | def __init__(self, instance, token, **kwargs): 32 | """ 33 | Create a new DatabricksFileSystem. 34 | 35 | Parameters 36 | ---------- 37 | instance: str 38 | The instance URL of the databricks cluster. 39 | For example for an Azure databricks cluster, this 40 | has the form adb-..azuredatabricks.net. 41 | token: str 42 | Your personal token. Find out more 43 | here: https://docs.databricks.com/dev-tools/api/latest/authentication.html 44 | """ 45 | self.instance = instance 46 | self.token = token 47 | self.session = requests.Session() 48 | self.retries = Retry( 49 | total=10, 50 | backoff_factor=0.05, 51 | status_forcelist=[408, 429, 500, 502, 503, 504], 52 | ) 53 | 54 | self.session.mount("https://", HTTPAdapter(max_retries=self.retries)) 55 | self.session.headers.update({"Authorization": f"Bearer {self.token}"}) 56 | 57 | super().__init__(**kwargs) 58 | 59 | def ls(self, path, detail=True, **kwargs): 60 | """ 61 | List the contents of the given path. 62 | 63 | Parameters 64 | ---------- 65 | path: str 66 | Absolute path 67 | detail: bool 68 | Return not only the list of filenames, 69 | but also additional information on file sizes 70 | and types. 
71 | """ 72 | # if len(path) == 0: 73 | # path = "/" 74 | # print(path) 75 | 76 | out = self._ls_from_cache(path) 77 | 78 | if not out or path != self.old_path: 79 | print("Running the API call") 80 | try: 81 | if path == "": 82 | path = "/" 83 | r = self._send_to_api( 84 | method="get", endpoint="list", json={"path": path} 85 | ) 86 | except DatabricksException as e: 87 | if e.error_code == "RESOURCE_DOES_NOT_EXIST": 88 | raise FileNotFoundError(e.message) 89 | 90 | raise e 91 | files = r["files"] 92 | out = [ 93 | { 94 | "name": o["path"], 95 | "type": "directory" if o["is_dir"] else "file", 96 | "size": o["file_size"], 97 | } 98 | for o in files 99 | ] 100 | self.dircache[path] = out 101 | else: 102 | print("Using the cache") 103 | 104 | self.old_path = path 105 | 106 | if detail: 107 | return out 108 | return [o["name"] for o in out] 109 | 110 | def makedirs(self, path, exist_ok=True): 111 | """ 112 | Create a given absolute path and all of its parents. 113 | 114 | Parameters 115 | ---------- 116 | path: str 117 | Absolute path to create 118 | exist_ok: bool 119 | If false, checks if the folder 120 | exists before creating it (and raises an 121 | Exception if this is the case) 122 | """ 123 | if not exist_ok: 124 | try: 125 | # If the following succeeds, the path is already present 126 | self._send_to_api( 127 | method="get", endpoint="get-status", json={"path": path} 128 | ) 129 | raise FileExistsError(f"Path {path} already exists") 130 | except DatabricksException as e: 131 | if e.error_code == "RESOURCE_DOES_NOT_EXIST": 132 | pass 133 | 134 | try: 135 | self._send_to_api(method="post", endpoint="mkdirs", json={"path": path}) 136 | except DatabricksException as e: 137 | if e.error_code == "RESOURCE_ALREADY_EXISTS": 138 | raise FileExistsError(e.message) 139 | 140 | raise e 141 | self.invalidate_cache(self._parent(path)) 142 | 143 | def mkdir(self, path, create_parents=True, **kwargs): 144 | """ 145 | Create a given absolute path and all of its parents. 146 | 147 | Parameters 148 | ---------- 149 | path: str 150 | Absolute path to create 151 | create_parents: bool 152 | Whether to create all parents or not. 153 | "False" is not implemented so far. 154 | """ 155 | if not create_parents: 156 | raise NotImplementedError 157 | 158 | self.mkdirs(path, **kwargs) 159 | 160 | def rm(self, path, recursive=False, **kwargs): 161 | """ 162 | Remove the file or folder at the given absolute path. 163 | 164 | Parameters 165 | ---------- 166 | path: str 167 | Absolute path what to remove 168 | recursive: bool 169 | Recursively delete all files in a folder. 170 | """ 171 | try: 172 | self._send_to_api( 173 | method="post", 174 | endpoint="delete", 175 | json={"path": path, "recursive": recursive}, 176 | ) 177 | except DatabricksException as e: 178 | # This is not really an exception, it just means 179 | # not everything was deleted so far 180 | if e.error_code == "PARTIAL_DELETE": 181 | self.rm(path=path, recursive=recursive) 182 | elif e.error_code == "IO_ERROR": 183 | # Using the same exception as the os module would use here 184 | raise OSError(e.message) 185 | 186 | raise e 187 | self.invalidate_cache(self._parent(path)) 188 | 189 | def mv( 190 | self, source_path, destination_path, recursive=False, maxdepth=None, **kwargs 191 | ): 192 | """ 193 | Move a source to a destination path. 194 | 195 | A note from the original [databricks API manual] 196 | (https://docs.databricks.com/dev-tools/api/latest/dbfs.html#move). 

        When moving a large number of files the API call will time out after
        approximately 60s, potentially resulting in partially moved data.
        Therefore, for operations that move more than 10k files, we strongly
        discourage using the DBFS REST API.

        Parameters
        ----------
        source_path: str
            From where to move (absolute path)
        destination_path: str
            To where to move (absolute path)
        recursive: bool
            Not implemented so far.
        maxdepth:
            Not implemented so far.
        """
        if recursive:
            raise NotImplementedError
        if maxdepth:
            raise NotImplementedError

        try:
            self._send_to_api(
                method="post",
                endpoint="move",
                json={"source_path": source_path, "destination_path": destination_path},
            )
        except DatabricksException as e:
            if e.error_code == "RESOURCE_DOES_NOT_EXIST":
                raise FileNotFoundError(e.message)
            elif e.error_code == "RESOURCE_ALREADY_EXISTS":
                raise FileExistsError(e.message)

            raise e
        self.invalidate_cache(self._parent(source_path))
        self.invalidate_cache(self._parent(destination_path))

    def _open(self, path, mode="rb", block_size="default", **kwargs):
        """
        Override the base class method to make sure a DatabricksFile is created.
        All arguments are copied from the base method.

        Only the default block size is allowed.
        """
        return DatabricksFile(self, path, mode=mode, block_size=block_size, **kwargs)

    def _send_to_api(self, method, endpoint, json):
        """
        Send the given JSON to the DBFS API
        using a GET or POST request (specified by the argument `method`).

        Parameters
        ----------
        method: str
            Which HTTP method to use for communication; "get" or "post".
        endpoint: str
            Where to send the request to (last part of the API URL)
        json: dict
            Dictionary of information to send
        """
        if method == "post":
            session_call = self.session.post
        elif method == "get":
            session_call = self.session.get
        else:
            raise ValueError(f"Do not understand method {method}")

        url = urllib.parse.urljoin(f"https://{self.instance}/api/2.0/dbfs/", endpoint)

        r = session_call(url, json=json)

        # The DBFS API returns JSON, also in case of an exception.
        # We want to preserve this information as well as possible.
        try:
            r.raise_for_status()
        except requests.HTTPError as e:
            # try to extract a JSON error message
            # if that fails, fall back to the original exception
            try:
                exception_json = e.response.json()
            except Exception:
                raise e

            raise DatabricksException(**exception_json)

        return r.json()

    def _create_handle(self, path, overwrite=True):
        """
        Internal function to create a handle, which can be used to
        write blocks of a file to DBFS.
        A handle has a unique identifier which needs to be passed
        with every write during this transaction.
        The handle is active for 10 minutes - after that a new
        write transaction needs to be created.
        Make sure to close the handle after you are finished.

        Parameters
        ----------
        path: str
            Absolute path for this file.
        overwrite: bool
            If a file already exists at this location, either overwrite
            it or raise an exception.
302 | """ 303 | try: 304 | r = self._send_to_api( 305 | method="post", 306 | endpoint="create", 307 | json={"path": path, "overwrite": overwrite}, 308 | ) 309 | return r["handle"] 310 | except DatabricksException as e: 311 | if e.error_code == "RESOURCE_ALREADY_EXISTS": 312 | raise FileExistsError(e.message) 313 | 314 | raise e 315 | 316 | def _close_handle(self, handle): 317 | """ 318 | Close a handle, which was opened by :func:`_create_handle`. 319 | 320 | Parameters 321 | ---------- 322 | handle: str 323 | Which handle to close. 324 | """ 325 | try: 326 | self._send_to_api(method="post", endpoint="close", json={"handle": handle}) 327 | except DatabricksException as e: 328 | if e.error_code == "RESOURCE_DOES_NOT_EXIST": 329 | raise FileNotFoundError(e.message) 330 | 331 | raise e 332 | 333 | def _add_data(self, handle, data): 334 | """ 335 | Upload data to an already opened file handle 336 | (opened by :func:`_create_handle`). 337 | The maximal allowed data size is 1MB after 338 | conversion to base64. 339 | Remember to close the handle when you are finished. 340 | 341 | Parameters 342 | ---------- 343 | handle: str 344 | Which handle to upload data to. 345 | data: bytes 346 | Block of data to add to the handle. 347 | """ 348 | data = base64.b64encode(data).decode() 349 | try: 350 | self._send_to_api( 351 | method="post", 352 | endpoint="add-block", 353 | json={"handle": handle, "data": data}, 354 | ) 355 | except DatabricksException as e: 356 | if e.error_code == "RESOURCE_DOES_NOT_EXIST": 357 | raise FileNotFoundError(e.message) 358 | elif e.error_code == "MAX_BLOCK_SIZE_EXCEEDED": 359 | raise ValueError(e.message) 360 | 361 | raise e 362 | 363 | def _get_data(self, path, start, end): 364 | """ 365 | Download data in bytes from a given absolute path in a block 366 | from [start, start+length]. 367 | The maximum number of allowed bytes to read is 1MB. 368 | 369 | Parameters 370 | ---------- 371 | path: str 372 | Absolute path to download data from 373 | start: int 374 | Start position of the block 375 | end: int 376 | End position of the block 377 | """ 378 | try: 379 | r = self._send_to_api( 380 | method="get", 381 | endpoint="read", 382 | json={"path": path, "offset": start, "length": end - start}, 383 | ) 384 | return base64.b64decode(r["data"]) 385 | except DatabricksException as e: 386 | if e.error_code == "RESOURCE_DOES_NOT_EXIST": 387 | raise FileNotFoundError(e.message) 388 | elif e.error_code in ["INVALID_PARAMETER_VALUE", "MAX_READ_SIZE_EXCEEDED"]: 389 | raise ValueError(e.message) 390 | 391 | raise e 392 | 393 | def invalidate_cache(self, path=None): 394 | if path is None: 395 | self.dircache.clear() 396 | else: 397 | self.dircache.pop(path, None) 398 | super().invalidate_cache(path) 399 | 400 | 401 | class DatabricksFile(AbstractBufferedFile): 402 | """ 403 | Helper class for files referenced in the DatabricksFileSystem. 404 | """ 405 | 406 | DEFAULT_BLOCK_SIZE = 1 * 2**20 # only allowed block size 407 | 408 | def __init__( 409 | self, 410 | fs, 411 | path, 412 | mode="rb", 413 | block_size="default", 414 | autocommit=True, 415 | cache_type="readahead", 416 | cache_options=None, 417 | **kwargs, 418 | ): 419 | """ 420 | Create a new instance of the DatabricksFile. 421 | 422 | The blocksize needs to be the default one. 
423 | """ 424 | if block_size is None or block_size == "default": 425 | block_size = self.DEFAULT_BLOCK_SIZE 426 | 427 | assert ( 428 | block_size == self.DEFAULT_BLOCK_SIZE 429 | ), f"Only the default block size is allowed, not {block_size}" 430 | 431 | super().__init__( 432 | fs, 433 | path, 434 | mode=mode, 435 | block_size=block_size, 436 | autocommit=autocommit, 437 | cache_type=cache_type, 438 | cache_options=cache_options or {}, 439 | **kwargs, 440 | ) 441 | 442 | def _initiate_upload(self): 443 | """Internal function to start a file upload""" 444 | self.handle = self.fs._create_handle(self.path) 445 | 446 | def _upload_chunk(self, final=False): 447 | """Internal function to add a chunk of data to a started upload""" 448 | self.buffer.seek(0) 449 | data = self.buffer.getvalue() 450 | 451 | data_chunks = [ 452 | data[start:end] for start, end in self._to_sized_blocks(len(data)) 453 | ] 454 | 455 | for data_chunk in data_chunks: 456 | self.fs._add_data(handle=self.handle, data=data_chunk) 457 | 458 | if final: 459 | self.fs._close_handle(handle=self.handle) 460 | return True 461 | 462 | def _fetch_range(self, start, end): 463 | """Internal function to download a block of data""" 464 | return_buffer = b"" 465 | length = end - start 466 | for chunk_start, chunk_end in self._to_sized_blocks(length, start): 467 | return_buffer += self.fs._get_data( 468 | path=self.path, start=chunk_start, end=chunk_end 469 | ) 470 | 471 | return return_buffer 472 | 473 | def _to_sized_blocks(self, length, start=0): 474 | """Helper function to split a range from 0 to total_length into bloksizes""" 475 | end = start + length 476 | for data_chunk in range(start, end, self.blocksize): 477 | data_start = data_chunk 478 | data_end = min(end, data_chunk + self.blocksize) 479 | yield data_start, data_end 480 | --------------------------------------------------------------------------------