├── codecov.yml ├── tests ├── __init__.py ├── test_memory_store.py ├── test_format.py └── test_disk_store.py ├── .coveragerc ├── .flake8 ├── requirements_dev.txt ├── .vscode └── settings.json ├── Makefile ├── memory_store.py ├── .github └── workflows │ └── build.yml ├── LICENSE ├── example.py ├── notes.md ├── README.md ├── .gitignore ├── format.py └── disk_store.py /codecov.yml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | omit = 4 | example.py 5 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 88 3 | extend-ignore = E203 4 | -------------------------------------------------------------------------------- /requirements_dev.txt: -------------------------------------------------------------------------------- 1 | black>=22.1.0 2 | coverage>=6.3.2 3 | flake8>=4.0.1 4 | ipdb>=0.13.9 5 | mypy>=0.950 6 | pytype>=2022.4.26 7 | pytest>=7.1.2 8 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "[python]": { 3 | "editor.defaultFormatter": "ms-python.black-formatter" 4 | }, 5 | "python.formatting.provider": "none" 6 | } -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | run: 2 | python3 example.py 3 | 4 | test: 5 | python -m 
class MemoryStorage:
    """
    In-memory key value store backed by a plain dictionary.

    Offers the same operations as the disk based store (``set``, ``get``, ``close``
    and the dictionary style ``store[key]`` access), so either store can be used in
    the same calling code. Nothing is persisted: the data lives only as long as the
    object does.
    """

    def __init__(self) -> None:
        self.data: dict[str, str] = {}

    def set(self, key: str, value: str) -> None:
        """Store ``value`` under ``key``, replacing any previous value."""
        self.data[key] = value

    def get(self, key: str) -> str:
        """Return the value stored under ``key``, or ``""`` when the key is unknown."""
        return self.data.get(key, "")

    def close(self) -> bool:
        """Release the store. There is nothing to release, so always return True."""
        # NOTE: ideally, I would want this to have () -> None signature, but for some
        # reason mypy complains about this:
        #
        # tests/test_memory_store.py:19: error: "close" of "MemoryStorage" does not
        # return a value
        #
        # check here for more: https://github.com/python/mypy/issues/6549
        return True

    def __setitem__(self, key: str, value: str) -> None:
        """Dictionary style write: ``store[key] = value``."""
        self.set(key, value)

    def __getitem__(self, item: str) -> str:
        """Dictionary style read; like ``get``, a missing key yields ``""``."""
        return self.get(item)
-------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: build 2 | 3 | on: 4 | push: 5 | branches: 6 | - 'master' 7 | pull_request: {} 8 | 9 | jobs: 10 | lint: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/setup-python@v3 14 | - uses: actions/checkout@v3 15 | - name: install dependencies 16 | run: | 17 | python -m pip install --upgrade pip 18 | pip install -r requirements_dev.txt 19 | - name: lint 20 | run: | 21 | make lint 22 | - name: coverage 23 | run: | 24 | make coverage 25 | - name: Upload Coverage to Codecov 26 | uses: codecov/codecov-action@v2 27 | 28 | tests: 29 | runs-on: ${{ matrix.os }} 30 | strategy: 31 | matrix: 32 | os: [ ubuntu-latest, macos-latest, windows-latest ] 33 | python-version: ["3.x", "pypy-3.9"] 34 | steps: 35 | - uses: actions/checkout@v3 36 | - uses: actions/setup-python@v3 37 | with: 38 | python-version: ${{ matrix.python-version }} 39 | - name: tests 40 | run: | 41 | make test 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Avinash Sajjanshetty 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
from memory_store import MemoryStorage
from disk_store import DiskStorage


def memory_db() -> None:
    """Exercise the in-memory store: read a missing key, set it, read it back."""
    store = MemoryStorage()
    print(store.get("name"))
    store.set("name", "jojo")
    print(store.get("name"), "jojo")


def store_db() -> None:
    """Exercise the disk store with a single key kept in ``data.db``."""
    store = DiskStorage("data.db")
    try:
        # on the first run, this will print empty string, but on the next run
        # it should print the value from the disk
        print(store.get("name"))
        store.set("name", "haha")
        print(store.get("name"))
    finally:
        # close the file even if a read or write above fails
        store.close()


def store_books() -> None:
    """Write a handful of book/author pairs to ``books.db`` and read them back."""
    store = DiskStorage("books.db")
    books = {
        "crime and punishment": "dostoevsky",
        "anna karenina": "tolstoy",
        "war and peace": "tolstoy",
        "hamlet": "shakespeare",
        "othello": "shakespeare",
        "brave new world": "huxley",
        "dune": "frank herbert",
    }
    try:
        for k, v in books.items():
            store.set(k, v)
            print(f"set k={k}, v={v}")
            print(f"get k={k}, v={store.get(k)}")

        # second pass: every key must still resolve once all writes are done
        for k in books:
            print(f"get k={k}, v={store.get(k)}")
    finally:
        # close the file even if a read or write above fails
        store.close()


if __name__ == "__main__":
    # memory_db()
    store_db()
    store_books()
-------------------------------------------------------------------------------- 1 | ## Auto annotate types 2 | 3 | 1. install pyannotate and pytest 4 | 5 | pip install pyannotate pytest 6 | 7 | 2. Add the following as `conftest.py` [source](https://github.com/dropbox/pyannotate/blob/a01510d/example/example_conftest.py) 8 | 9 | ```python 10 | # Configuration for pytest to automatically collect types. 11 | # Thanks to Guilherme Salgado. 12 | 13 | import pytest 14 | 15 | 16 | def pytest_collection_finish(session): 17 | """Handle the pytest collection finish hook: configure pyannotate. 18 | Explicitly delay importing `collect_types` until all tests have 19 | been collected. This gives gevent a chance to monkey patch the 20 | world before importing pyannotate. 21 | """ 22 | from pyannotate_runtime import collect_types 23 | collect_types.init_types_collection() 24 | 25 | 26 | @pytest.fixture(autouse=True) 27 | def collect_types_fixture(): 28 | from pyannotate_runtime import collect_types 29 | collect_types.start() 30 | yield 31 | collect_types.stop() 32 | 33 | 34 | def pytest_sessionfinish(session, exitstatus): 35 | from pyannotate_runtime import collect_types 36 | collect_types.dump_stats("type_info.json") 37 | ``` 38 | 39 | 3. run pytest and it will automatically generate the type info: 40 | 41 | pytest 42 | 43 | 4. the type info would be generated in `type_info.json`. 
Use this to write to the files: 44 | 45 | pyannotate -w --type-info type_info.json disk_store.py example.py format.py memory_store.py tests/test_disk_store.py tests/test_format.py tests/test_memory_store.py -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CDB - Disk based Log Structured Hash Table Store 2 | 3 | ![made-with-python](https://img.shields.io/badge/Made%20with-Python-1f425f.svg) 4 | [![build](https://github.com/avinassh/cdb/actions/workflows/build.yml/badge.svg)](https://github.com/avinassh/cdb/actions/workflows/build.yml) 5 | [![codecov](https://codecov.io/gh/avinassh/cdb/branch/master/graph/badge.svg?token=9SA8Q4L7AZ)](https://codecov.io/gh/avinassh/cdb) 6 | [![GitHub license](https://badgen.net/github/license/Naereen/Strapdown.js)](https://github.com/avinassh/cdb/blob/master/LICENSE) 7 | 8 | ![architecture](https://user-images.githubusercontent.com/640792/166490746-fb41709e-cdb5-4c9a-a58b-f4e6d530b5c7.png) 9 | 10 | (educational) build your own disk based KV store 11 | 12 | ## What Next? 
13 | 14 | cdb has the following limitations; improving them would be a great challenge for the next step: 15 | 16 | 17 | ## Line Count 18 | 19 | ```shell 20 | $ tokei -f format.py disk_store.py 21 | 22 | =============================================================================== 23 | Language Files Lines Code Comments Blanks 24 | =============================================================================== 25 | Python 2 383 255 103 25 26 | ------------------------------------------------------------------------------- 27 | disk_store.py 196 114 70 12 28 | format.py 187 141 33 13 29 | =============================================================================== 30 | Total 2 383 255 103 25 31 | =============================================================================== 32 | ``` 33 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # ignore all .db files 132 | *.db -------------------------------------------------------------------------------- /format.py: -------------------------------------------------------------------------------- 1 | """ 2 | format module provides encode/decode functions for serialisation and deserialisation 3 | operations 4 | 5 | format module is generic, does not have any disk or memory specific code. 6 | 7 | The disk storage deals with bytes; you cannot just store a string or object without 8 | converting it to bytes. The programming languages provide abstractions where you don't 9 | have to think about all this when storing things in memory (i.e. RAM). Consider the 10 | following example where you are storing stuff in a hash table: 11 | 12 | books = {} 13 | books["hamlet"] = "shakespeare" 14 | books["anna karenina"] = "tolstoy" 15 | 16 | In the above, the language deals with all the complexities: 17 | 18 | - allocating space on the RAM so that it can store data of `books` 19 | - whenever you add data to `books`, convert that to bytes and keep it in the memory 20 | - whenever the size of `books` increases, move that to somewhere in the RAM so that 21 | we can add new items 22 | 23 | Unfortunately, when it comes to disks, we have to do all this by ourselves, write 24 | code which can allocate space, convert objects to/from bytes and many other operations. 
import struct

# A record header is three unsigned 32-bit integers: timestamp, key size, value size.
# The explicit little-endian prefix ("<") pins the byte order and disables native
# padding, so a database file written on one machine can be read on another.
_HEADER_FORMAT = "<III"
_HEADER_SIZE = struct.calcsize(_HEADER_FORMAT)  # 12 bytes


def encode_header(timestamp: int, key_size: int, value_size: int) -> bytes:
    """
    Pack the record header into its fixed 12 byte representation.

    Args:
        timestamp: write time of the record.
        key_size: length of the encoded key, in bytes.
        value_size: length of the encoded value, in bytes.

    Returns:
        The 12 header bytes.

    Raises:
        struct.error: if any field is negative or does not fit in 4 bytes.
    """
    return struct.pack(_HEADER_FORMAT, timestamp, key_size, value_size)


def encode_kv(timestamp: int, key: str, value: str) -> tuple[int, bytes]:
    """
    Serialise one key value pair into a record: header, then key, then value.

    Args:
        timestamp: write time of the record.
        key: the key, stored as UTF-8.
        value: the value, stored as UTF-8.

    Returns:
        A tuple of the payload size (key bytes + value bytes, header excluded) and
        the complete record bytes.

    Raises:
        struct.error: if the timestamp or either size does not fit in 4 bytes.
    """
    # Sizes must be measured on the encoded bytes: a character outside ASCII takes
    # more than one byte, and a size based on len(str) would truncate the data.
    key_bytes = key.encode("utf-8")
    value_bytes = value.encode("utf-8")
    header = encode_header(
        timestamp=timestamp, key_size=len(key_bytes), value_size=len(value_bytes)
    )
    return len(key_bytes) + len(value_bytes), header + key_bytes + value_bytes


def decode_kv(data: bytes) -> tuple[int, str, str]:
    """
    Deserialise one complete record produced by ``encode_kv``.

    Args:
        data: exactly one record, i.e. the header followed by the key and value.

    Returns:
        A tuple of timestamp, key and value.

    Raises:
        struct.error: if ``data`` is shorter or longer than the header announces.
    """
    timestamp, key_size, value_size = decode_header(data[:_HEADER_SIZE])
    key, value = struct.unpack(f"<{key_size}s{value_size}s", data[_HEADER_SIZE:])
    return timestamp, key.decode("utf-8"), value.decode("utf-8")


def decode_header(data: bytes) -> tuple[int, int, int]:
    """
    Unpack the 12 header bytes into timestamp, key size and value size.

    Raises:
        struct.error: if ``data`` is not exactly 12 bytes long.
    """
    timestamp, key_size, value_size = struct.unpack(_HEADER_FORMAT, data)
    return timestamp, key_size, value_size
import random
import struct
import time
import typing
import unittest
import uuid

from format import encode_header, decode_header, encode_kv, decode_kv

# TODO: use correct value
HEADER_SIZE: typing.Final[int] = 0


def get_random_header() -> tuple[int, int, int]:
    """Return three random header fields, each within the unsigned 4 byte range."""
    # every header field is stored in 4 bytes, which caps the value at 2**32 - 1
    upper_bound: int = (2**32) - 1
    timestamp = random.randint(0, upper_bound)
    key_size = random.randint(0, upper_bound)
    val_size = random.randint(0, upper_bound)
    return timestamp, key_size, val_size


def get_random_kv() -> tuple[int, str, str, int]:
    """Return a timestamp, a random key, a random value and the expected size."""
    timestamp = int(time.time())
    key = str(uuid.uuid4())
    val = str(uuid.uuid4())
    # key and value are both UUID strings, so the payload is twice a UUID's length
    expected_size = HEADER_SIZE + (2 * len(str(uuid.uuid4())))
    return timestamp, key, val, expected_size


class Header(typing.NamedTuple):
    timestamp: int
    key_size: int
    val_size: int


class KeyValue(typing.NamedTuple):
    timestamp: int
    key: str
    val: str
    sz: int


class TestHeaderOp(unittest.TestCase):
    def header_test(self, tt: Header) -> None:
        """Encode ``tt`` and check that decoding gives back the same three fields."""
        encoded = encode_header(tt.timestamp, tt.key_size, tt.val_size)
        timestamp, key_size, val_size = decode_header(encoded)
        self.assertEqual(tt.timestamp, timestamp)
        self.assertEqual(tt.key_size, key_size)
        self.assertEqual(tt.val_size, val_size)

    def test_header_serialisation(self) -> None:
        cases: typing.List[Header] = [
            Header(10, 10, 10),
            Header(0, 0, 0),
            Header(10000, 10000, 10000),
        ]
        for case in cases:
            self.header_test(case)

    def test_random(self) -> None:
        for _ in range(100):
            self.header_test(Header(*get_random_header()))

    def test_bad(self) -> None:
        # trying to encode an int with size more than 4 bytes should raise an error
        with self.assertRaises(struct.error):
            encode_header(2**32, 5, 5)


class TestEncodeKV(unittest.TestCase):
    def kv_test(self, tt: KeyValue) -> None:
        """Encode ``tt`` and check the decoded fields and the reported size."""
        size, encoded = encode_kv(tt.timestamp, tt.key, tt.val)
        timestamp, key, val = decode_kv(encoded)
        self.assertEqual(tt.timestamp, timestamp)
        self.assertEqual(tt.key, key)
        self.assertEqual(tt.val, val)
        self.assertEqual(tt.sz, size)

    def test_KV_serialisation(self) -> None:
        cases: typing.List[KeyValue] = [
            KeyValue(10, "hello", "world", HEADER_SIZE + 10),
            KeyValue(0, "", "", HEADER_SIZE),
        ]
        for case in cases:
            self.kv_test(case)

    def test_random(self) -> None:
        for _ in range(100):
            self.kv_test(KeyValue(*get_random_kv()))
import os
import tempfile
import typing
import unittest

from disk_store import DiskStorage


class TempStorageFile:
    """
    TempStorageFile provides a wrapper over the temporary files which are used in
    testing.

    Python has two APIs to create temporary files, tempfile.TemporaryFile and
    tempfile.mkstemp. Files created by tempfile.TemporaryFile get deleted as soon as
    they are closed. Since we need to do tests for persistence, we might open and
    close a file multiple times. Files created using tempfile.mkstemp don't have this
    limitation, but they have to be deleted manually. They don't get deleted when the
    file descriptor goes out of scope or our program has exited.

    Args:
        path (str): path to the file where our data needs to be stored. If the path
            parameter is empty, then a temporary file will be created using the
            tempfile API
    """

    def __init__(self, path: typing.Optional[str] = None) -> None:
        if path:
            self.path = path
            return

        # mkstemp hands back an open descriptor; only the path is needed here
        fd, self.path = tempfile.mkstemp()
        os.close(fd)

    def clean_up(self) -> None:
        # NOTE: you might be tempted to use the destructor method `__del__`, however
        # destructor method gets called whenever the object goes out of scope, and it
        # will delete our database file. Having a separate method would give us better
        # control.
        os.remove(self.path)


class TestDiskCDB(unittest.TestCase):
    def setUp(self) -> None:
        self.file: TempStorageFile = TempStorageFile()

    def tearDown(self) -> None:
        self.file.clean_up()

    def test_get(self) -> None:
        store = DiskStorage(file_name=self.file.path)
        store.set("name", "jojo")
        self.assertEqual(store.get("name"), "jojo")
        store.close()

    def test_invalid_key(self) -> None:
        store = DiskStorage(file_name=self.file.path)
        self.assertEqual(store.get("some key"), "")
        store.close()

    def test_dict_api(self) -> None:
        store = DiskStorage(file_name=self.file.path)
        store["name"] = "jojo"
        self.assertEqual(store["name"], "jojo")
        store.close()

    def test_persistence(self) -> None:
        store = DiskStorage(file_name=self.file.path)

        tests = {
            "crime and punishment": "dostoevsky",
            "anna karenina": "tolstoy",
            "war and peace": "tolstoy",
            "hamlet": "shakespeare",
            "othello": "shakespeare",
            "brave new world": "huxley",
            "dune": "frank herbert",
        }
        for k, v in tests.items():
            store.set(k, v)
            self.assertEqual(store.get(k), v)
        store.close()

        # a fresh instance must rebuild its index from the file alone
        store = DiskStorage(file_name=self.file.path)
        for k, v in tests.items():
            self.assertEqual(store.get(k), v)
        store.close()


class TestDiskCDBExistingFile(unittest.TestCase):
    def test_get_new_file(self) -> None:
        t = TempStorageFile(path="temp.db")
        store = DiskStorage(file_name=t.path)
        # The file exists from here on. Register its removal right away: a failed
        # assertion would otherwise leave temp.db, with its data, for the next run.
        self.addCleanup(t.clean_up)
        try:
            store.set("name", "jojo")
            self.assertEqual(store.get("name"), "jojo")
        finally:
            store.close()

        # check for key again
        store = DiskStorage(file_name=t.path)
        try:
            self.assertEqual(store.get("name"), "jojo")
        finally:
            store.close()
-------------------------------------------------------------------------------- 1 | """ 2 | disk_store module implements DiskStorage class which implements the KV store on the 3 | disk 4 | 5 | DiskStorage provides two simple operations to get and set key value pairs. Both key and 6 | value need to be of string type. All the data is persisted to disk. During startup, 7 | DiskStorage loads all the existing KV pair metadata. It will throw an error if the 8 | file is invalid or corrupt. 9 | 10 | Do note that if the database file is large, then the initialisation will take time 11 | accordingly. The initialisation is also a blocking operation, till it is completed 12 | the DB cannot be used. 13 | 14 | Typical usage example: 15 | 16 | disk: DiskStorage = DiskStorage(file_name="books.db") 17 | disk.set(key="othello", value="shakespeare") 18 | author: str = disk.get("othello") 19 | # it supports the dictionary style API too: 20 | disk["hamlet"] = "shakespeare" 21 | """ 22 | import os.path 23 | import time 24 | import typing 25 | import struct 26 | 27 | from format import encode_kv, decode_kv, decode_header 28 | 29 | 30 | # DiskStorage is a Log-Structured Hash Table as described in the BitCask paper. We 31 | # keep appending the data to a file, like a log. DiskStorage maintains an in-memory 32 | # hash table called KeyDir, which keeps the row's location on the disk. 33 | # 34 | # The idea is simple yet brilliant: 35 | # - Write the record to the disk 36 | # - Update the internal hash table to point to that byte offset 37 | # - Whenever we get a read request, check the internal hash table for the address, 38 | # fetch that and return 39 | # 40 | # KeyDir does not store values, only their locations. 41 | # 42 | # The above approach solves a lot of problems: 43 | # - Writes are insanely fast since you are just appending to the file 44 | # - Reads are insanely fast since you do only one disk seek. 
#   In B-Tree backed storage, there could be 2-3 disk seeks
#
# However, there are drawbacks too:
# - We need to maintain an in-memory hash table KeyDir. A database with a large
#   number of keys would require more RAM
# - Since we need to build the KeyDir at initialisation, it will affect the startup
#   time too
# - Deleted keys need to be purged from the file to reduce the file size
#
# Read the paper for more details: https://riak.com/assets/bitcask-intro.pdf


class DiskStorage:
    """
    Implements the KV store on the disk

    Every write appends one record (header, key, value) to the file. ``keydir`` maps
    each key to the location of its latest value, so a read is a single seek. The
    object can also be used as a context manager, which closes the file on exit.

    Args:
        file_name (str): name of the file where all the data will be written. Just
            passing the file name will save the data in the current directory. You may
            pass the full file location too.

    Raises:
        struct.error: if the existing file ends in a truncated record or a record
            does not match its header.
    """

    # size in bytes of the fixed record header: timestamp, key size, value size
    _HEADER_SIZE: typing.Final[int] = 12

    def __init__(self, file_name: str = "data.db"):
        self.file_id = file_name
        # append mode creates the file when it is missing and makes every write land
        # at the end of the log
        self.file_handle = open(file_name, "ab+")
        self.keydir: dict[str, dict[str, typing.Any]] = {}
        self.offset = 0
        try:
            self._load_keydir()
        except Exception:
            # do not leak the handle when the existing file cannot be read
            self.file_handle.close()
            raise

    def _load_keydir(self) -> None:
        """Rebuild ``keydir`` by scanning every record already present in the file."""
        file_size = os.path.getsize(self.file_id)
        # a separate read handle leaves the position of the append handle untouched
        with open(self.file_id, "rb") as log:
            while self.offset < file_size:
                header_bytes = log.read(self._HEADER_SIZE)
                timestamp, key_size, value_size = decode_header(header_bytes)
                payload = log.read(key_size + value_size)
                _, key, _ = decode_kv(header_bytes + payload)
                # Advance by the byte sizes from the header. The length of the decoded
                # strings differs from it as soon as the data is not plain ASCII.
                self.offset += self._HEADER_SIZE + key_size + value_size
                self.keydir[key] = {
                    "file_id": self.file_id,
                    "value_sz": value_size,
                    "value_pos": self.offset - value_size,
                    "tstamp": timestamp,
                }

    def set(self, key: str, value: str) -> None:
        """Append the pair to the file, sync it to disk and index its location."""
        current_timestamp = int(time.time())
        _, kv_data = encode_kv(timestamp=current_timestamp, key=key, value=value)
        self.file_handle.write(kv_data)
        self.file_handle.flush()
        os.fsync(self.file_handle.fileno())

        # Read the value size back from the header that was just written, so the
        # index always agrees with the bytes on disk.
        _, _, value_size = decode_header(kv_data[: self._HEADER_SIZE])
        self.offset = self.file_handle.tell()
        self.keydir[key] = {
            "file_id": self.file_id,
            "value_sz": value_size,
            "value_pos": self.offset - value_size,
            "tstamp": current_timestamp,
        }

    def get(self, key: str) -> str:
        """Return the latest value stored under ``key``, or ``""`` if it is unknown."""
        if key not in self.keydir:
            return ""

        metadata = self.keydir[key]
        value_sz = metadata["value_sz"]
        with open(metadata["file_id"], "rb") as f:
            f.seek(metadata["value_pos"])
            value_bytes = f.read(value_sz)
        # unpack raises struct.error when fewer bytes than expected could be read
        [value] = struct.unpack(f"{value_sz}s", value_bytes)
        return value.decode("utf-8")

    def close(self) -> None:
        """Flush pending data to disk and close the file. Safe to call repeatedly."""
        if self.file_handle.closed:
            return
        self.file_handle.flush()
        os.fsync(self.file_handle.fileno())
        self.file_handle.close()

    def __enter__(self) -> "DiskStorage":
        return self

    def __exit__(self, *exc_info: object) -> None:
        self.close()

    def __setitem__(self, key: str, value: str) -> None:
        return self.set(key, value)

    def __getitem__(self, item: str) -> str:
        return self.get(item)