├── codecov.yml
├── tests
    ├── __init__.py
    ├── test_memory_store.py
    ├── test_format.py
    └── test_disk_store.py
├── .coveragerc
├── .flake8
├── requirements_dev.txt
├── .vscode
    └── settings.json
├── Makefile
├── memory_store.py
├── .github
    └── workflows
    │   └── build.yml
├── LICENSE
├── example.py
├── notes.md
├── README.md
├── .gitignore
├── format.py
└── disk_store.py


/codecov.yml:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | branch = True
3 | omit =
4 |     example.py
5 | 


--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 88
3 | extend-ignore = E203
4 | 


--------------------------------------------------------------------------------
/requirements_dev.txt:
--------------------------------------------------------------------------------
1 | black>=22.1.0
2 | coverage>=6.3.2
3 | flake8>=4.0.1
4 | ipdb>=0.13.9
5 | mypy>=0.950
6 | pytype>=2022.4.26
7 | pytest>=7.1.2
8 | 


--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 |     "[python]": {
3 |         "editor.defaultFormatter": "ms-python.black-formatter"
4 |     },
5 |     "python.formatting.provider": "none"
6 | }


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | run:
 2 | 	python3 example.py
 3 | 
 4 | test:
 5 | 	python -m unittest discover -vvv ./tests -p '*.py' -b
 6 | 
 7 | lint:
 8 | 	black --check --diff .
 9 | 	flake8 .
10 | 	mypy --strict .
11 | 	pytype .
12 | 
13 | coverage:
14 | 	coverage run -m unittest discover -vvv ./tests -p '*.py' -b
15 | 	coverage report -m
16 | 
17 | html: coverage
18 | 	coverage html
19 | 	open htmlcov/index.html
20 | 
21 | clean:
22 | 	python setup.py clean
23 | 	rm -rf build dist cdbpie.egg-info
24 | 
25 | build: clean
26 | 	python setup.py sdist bdist_wheel
27 | 


--------------------------------------------------------------------------------
/tests/test_memory_store.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from memory_store import MemoryStorage
 4 | 
 5 | 
 6 | class TestInMemoryCDB(unittest.TestCase):
 7 |     def test_get(self) -> None:
 8 |         store = MemoryStorage()
 9 |         store.set("name", "jojo")
10 |         self.assertEqual(store.get("name"), "jojo")
11 | 
12 |     def test_invalid_key(self) -> None:
13 |         store = MemoryStorage()
14 |         self.assertEqual(store.get("some key"), "")
15 | 
16 |     def test_close(self) -> None:
17 |         store = MemoryStorage()
18 |         self.assertTrue(store.close())
19 | 


--------------------------------------------------------------------------------
/memory_store.py:
--------------------------------------------------------------------------------
 1 | class MemoryStorage:
 2 |     def __init__(self) -> None:
 3 |         self.data: dict[str, str] = {}
 4 | 
 5 |     def set(self, key: str, value: str) -> None:
 6 |         self.data[key] = value
 7 | 
 8 |     def get(self, key: str) -> str:
 9 |         return self.data.get(key, "")
10 | 
11 |     def close(self) -> bool:
12 |         # NOTE: ideally, I would want this to have () -> None signature, but for some
13 |         # reason mypy complains about this:
14 |         #
15 |         # tests/test_memory_store.py:19: error: "close" of "MemoryStorage" does not
16 |         #   return a value
17 |         #
18 |         # check here for more: https://github.com/python/mypy/issues/6549
19 |         return True
20 | 


--------------------------------------------------------------------------------
/.github/workflows/build.yml:
--------------------------------------------------------------------------------
 1 | name: build
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - 'master'
 7 |   pull_request: {}
 8 | 
 9 | jobs:
10 |   lint:
11 |     runs-on: ubuntu-latest
12 |     steps:
13 |       - uses: actions/setup-python@v3
14 |       - uses: actions/checkout@v3
15 |       - name: install dependencies
16 |         run: |
17 |           python -m pip install --upgrade pip
18 |           pip install -r requirements_dev.txt
19 |       - name: lint
20 |         run: |
21 |           make lint
22 |       - name: coverage
23 |         run: |
24 |           make coverage
25 |       - name: Upload Coverage to Codecov
26 |         uses: codecov/codecov-action@v2
27 | 
28 |   tests:
29 |     runs-on: ${{ matrix.os }}
30 |     strategy:
31 |       matrix:
32 |         os: [ ubuntu-latest, macos-latest, windows-latest ]
33 |         python-version: ["3.x", "pypy-3.9"]
34 |     steps:
35 |       - uses: actions/checkout@v3
36 |       - uses: actions/setup-python@v3
37 |         with:
38 |           python-version: ${{ matrix.python-version }}
39 |       - name: tests
40 |         run: |
41 |           make test
42 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 Avinash Sajjanshetty <opensource@avi.im>
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/example.py:
--------------------------------------------------------------------------------
 1 | from memory_store import MemoryStorage
 2 | from disk_store import DiskStorage
 3 | 
 4 | 
 5 | def memory_db() -> None:
 6 |     store = MemoryStorage()
 7 |     print(store.get("name"))
 8 |     store.set("name", "jojo")
 9 |     print(store.get("name"), "jojo")
10 | 
11 | 
def store_db() -> None:
    """Demo of the disk store: read, overwrite and re-read a single key in data.db."""
    db = DiskStorage("data.db")
    # the very first run prints an empty string; later runs should print the value
    # that an earlier run left on disk
    before = db.get("name")
    print(before)
    db.set("name", "haha")
    after = db.get("name")
    print(after)
    db.close()
20 | 
21 | 
def store_books() -> None:
    """Demo of the disk store: write several title/author pairs to books.db and
    read each one back, first right after its write and then again in a second
    pass."""
    library = DiskStorage("books.db")
    authors_by_title = {
        "crime and punishment": "dostoevsky",
        "anna karenina": "tolstoy",
        "war and peace": "tolstoy",
        "hamlet": "shakespeare",
        "othello": "shakespeare",
        "brave new world": "huxley",
        "dune": "frank herbert",
    }
    for title, author in authors_by_title.items():
        library.set(title, author)
        print(f"set k={title}, v={author}")
        print(f"get k={title}, v={library.get(title)}")

    # second pass: every key is read again once all writes are done
    for title in authors_by_title:
        print(f"get k={title}, v={library.get(title)}")
    library.close()
41 | 
42 | 
if __name__ == "__main__":
    # Script entry point (this is what `make run` executes): runs the two
    # disk-backed demos. The in-memory demo is left disabled.
    # memory_db()
    store_db()
    store_books()
47 | 


--------------------------------------------------------------------------------
/notes.md:
--------------------------------------------------------------------------------
 1 | ## Auto annotate types
 2 | 
 3 | 1. install pyannotate and pytest
 4 | 	
 5 | 		pip install pyannotate pytest
 6 | 
 7 | 2. Add the following as `conftest.py` [source](https://github.com/dropbox/pyannotate/blob/a01510d/example/example_conftest.py)
 8 | 
 9 | ```python
10 | # Configuration for pytest to automatically collect types.
11 | # Thanks to Guilherme Salgado.
12 | 
13 | import pytest
14 | 
15 | 
16 | def pytest_collection_finish(session):
17 |     """Handle the pytest collection finish hook: configure pyannotate.
18 |     Explicitly delay importing `collect_types` until all tests have
19 |     been collected.  This gives gevent a chance to monkey patch the
20 |     world before importing pyannotate.
21 |     """
22 |     from pyannotate_runtime import collect_types
23 |     collect_types.init_types_collection()
24 | 
25 | 
26 | @pytest.fixture(autouse=True)
27 | def collect_types_fixture():
28 |     from pyannotate_runtime import collect_types
29 |     collect_types.start()
30 |     yield
31 |     collect_types.stop()
32 | 
33 | 
34 | def pytest_sessionfinish(session, exitstatus):
35 |     from pyannotate_runtime import collect_types
36 |     collect_types.dump_stats("type_info.json")
37 | ```
38 | 
39 | 3. run pytest and it will automatically generate the type info:
40 | 	
41 |         pytest
42 | 
43 | 4. the type info would be generated in `type_info.json`. Use this to write to the files:
44 | 
45 | 	    pyannotate -w --type-info type_info.json disk_store.py example.py format.py memory_store.py tests/test_disk_store.py tests/test_format.py tests/test_memory_store.py


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # CDB - Disk based Log Structured Hash Table Store
 2 | 
 3 | ![made-with-python](https://img.shields.io/badge/Made%20with-Python-1f425f.svg)
 4 | [![build](https://github.com/avinassh/cdb/actions/workflows/build.yml/badge.svg)](https://github.com/avinassh/cdb/actions/workflows/build.yml)
 5 | [![codecov](https://codecov.io/gh/avinassh/cdb/branch/master/graph/badge.svg?token=9SA8Q4L7AZ)](https://codecov.io/gh/avinassh/cdb)
 6 | [![GitHub license](https://badgen.net/github/license/avinassh/cdb)](https://github.com/avinassh/cdb/blob/master/LICENSE)
 7 | 
 8 | ![architecture](https://user-images.githubusercontent.com/640792/166490746-fb41709e-cdb5-4c9a-a58b-f4e6d530b5c7.png)
 9 | 
10 | (educational) build your own disk based KV store
11 | 
12 | ## What Next?
13 | 
14 | cdb has the following limitations; improving them would be a great challenge for the next step:
15 | 
16 | 
17 | ## Line Count
18 | 
19 | ```shell
20 | $ tokei -f format.py disk_store.py
21 | 
22 | ===============================================================================
23 |  Language            Files        Lines         Code     Comments       Blanks
24 | ===============================================================================
25 |  Python                  2          383          255          103           25
26 | -------------------------------------------------------------------------------
27 |  disk_store.py                      196          114           70           12
28 |  format.py                          187          141           33           13
29 | ===============================================================================
30 |  Total                   2          383          255          103           25
31 | ===============================================================================
32 | ```
33 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 
131 | # ignore all .db files
132 | *.db


--------------------------------------------------------------------------------
/format.py:
--------------------------------------------------------------------------------
 1 | """
 2 | format module provides encode/decode functions for serialisation and deserialisation
 3 | operations
 4 | 
 5 | format module is generic, does not have any disk or memory specific code.
 6 | 
 7 | The disk storage deals with bytes; you cannot just store a string or object without
 8 | converting it to bytes. The programming languages provide abstractions where you don't
 9 | have to think about all this when storing things in memory (i.e. RAM). Consider the
10 | following example where you are storing stuff in a hash table:
11 | 
12 |     books = {}
13 |     books["hamlet"] = "shakespeare"
14 |     books["anna karenina"] = "tolstoy"
15 | 
16 | In the above, the language deals with all the complexities:
17 | 
18 |     - allocating space on the RAM so that it can store data of `books`
19 |     - whenever you add data to `books`, convert that to bytes and keep it in the memory
20 |     - whenever the size of `books` increases, move that to somewhere in the RAM so that
21 |       we can add new items
22 | 
23 | Unfortunately, when it comes to disks, we have to do all this by ourselves, write
24 | code which can allocate space, convert objects to/from bytes and many other operations.
25 | 
26 | format module provides two functions which help us with serialisation of data.
27 | 
28 |     encode_kv - takes the key value pair and encodes them into bytes
29 |     decode_kv - takes a bunch of bytes and decodes them into key value pairs
30 | 
31 | **workshop note**
32 | 
33 | For the workshop, the functions will have the following signature:
34 | 
35 |     def encode_kv(timestamp: int, key: str, value: str) -> tuple[int, bytes]
36 |     def decode_kv(data: bytes) -> tuple[int, str, str]
37 | """
38 | import struct
39 | 
# Header layout: three unsigned 32-bit integers (timestamp, key size, value size).
# The "<" prefix pins little-endian byte order with standard sizes and no padding,
# so a database file does not depend on the native layout of the machine that
# wrote it.
_HEADER_FORMAT = "<III"
_HEADER_SIZE = struct.calcsize(_HEADER_FORMAT)  # 12 bytes


def encode_header(timestamp: int, key_size: int, value_size: int) -> bytes:
    """Pack the three header fields into 12 bytes.

    Args:
        timestamp: record timestamp, must fit in an unsigned 32-bit integer
        key_size: size of the encoded key in bytes
        value_size: size of the encoded value in bytes

    Raises:
        struct.error: if any field is negative or does not fit in 4 bytes
    """
    return struct.pack(_HEADER_FORMAT, timestamp, key_size, value_size)


def encode_kv(timestamp: int, key: str, value: str) -> tuple[int, bytes]:
    """Serialise one record as header + key bytes + value bytes.

    The key and value are encoded to UTF-8 first and the header stores their
    *byte* lengths; measuring the strings in characters would truncate any
    non-ASCII text.

    Returns:
        A tuple ``(size, data)`` where ``size`` is the combined byte length of the
        encoded key and value (the header is not counted) and ``data`` is the
        complete record including the header.
    """
    key_bytes = key.encode("utf-8")
    value_bytes = value.encode("utf-8")
    header = encode_header(
        timestamp=timestamp, key_size=len(key_bytes), value_size=len(value_bytes)
    )
    return len(key_bytes) + len(value_bytes), header + key_bytes + value_bytes


def decode_kv(data: bytes) -> tuple[int, str, str]:
    """Deserialise one complete record produced by ``encode_kv``.

    Args:
        data: exactly one record, i.e. the header followed by the key and value

    Returns:
        A tuple ``(timestamp, key, value)``.

    Raises:
        struct.error: if ``data`` does not have the length announced by its header
    """
    timestamp, key_size, value_size = decode_header(data[:_HEADER_SIZE])
    key, value = struct.unpack(f"<{key_size}s{value_size}s", data[_HEADER_SIZE:])
    return timestamp, key.decode("utf-8"), value.decode("utf-8")


def decode_header(data: bytes) -> tuple[int, int, int]:
    """Unpack a 12-byte header into ``(timestamp, key_size, value_size)``.

    Raises:
        struct.error: if ``data`` is not exactly 12 bytes long
    """
    timestamp, key_size, value_size = struct.unpack(_HEADER_FORMAT, data)
    return timestamp, key_size, value_size
61 | 


--------------------------------------------------------------------------------
/tests/test_format.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | import struct
 3 | import time
 4 | import typing
 5 | import unittest
 6 | import uuid
 7 | 
 8 | from format import encode_header, decode_header, encode_kv, decode_kv
 9 | 
# TODO: use correct value
# NOTE: 0 agrees with the size that encode_kv returns at the moment, which counts
# only the key and the value and leaves the header out.
HEADER_SIZE: typing.Final[int] = 0
12 | 
13 | 
def get_random_header() -> tuple[int, int, int]:
    """Return three random integers usable as (timestamp, key_size, val_size)."""
    # each header field is stored in 4 bytes, so the largest usable value is the
    # biggest unsigned 32-bit integer
    upper_bound: int = (2**32) - 1
    timestamp = random.randint(0, upper_bound)
    key_size = random.randint(0, upper_bound)
    val_size = random.randint(0, upper_bound)
    return timestamp, key_size, val_size
20 | 
21 | 
def get_random_kv() -> tuple[int, str, str, int]:
    """Return (timestamp, key, value, expected size) for a random record.

    The key and the value are UUID4 strings; the expected size is HEADER_SIZE plus
    the lengths of both strings.
    """
    timestamp = int(time.time())
    key = str(uuid.uuid4())
    value = str(uuid.uuid4())
    expected_size = HEADER_SIZE + len(key) + len(value)
    return timestamp, key, value, expected_size
29 | 
30 | 
class Header(typing.NamedTuple):
    """One header round-trip test case: the three integers handed to encode_header
    and expected back from decode_header."""

    # first header field
    timestamp: int
    # second header field
    key_size: int
    # third header field
    val_size: int
35 | 
36 | 
class KeyValue(typing.NamedTuple):
    """One key/value round-trip test case: the inputs handed to encode_kv plus the
    size it is expected to report."""

    # timestamp passed to encode_kv and expected back from decode_kv
    timestamp: int
    # key passed to encode_kv and expected back from decode_kv
    key: str
    # value passed to encode_kv and expected back from decode_kv
    val: str
    # expected first element of encode_kv's return value
    sz: int
42 | 
43 | 
class TestHeaderOp(unittest.TestCase):
    """Round-trip checks for encode_header / decode_header."""

    def header_test(self, tt: Header) -> None:
        # encode the case, decode the result and compare field by field
        encoded = encode_header(tt.timestamp, tt.key_size, tt.val_size)
        timestamp, key_size, val_size = decode_header(encoded)
        self.assertEqual(tt.timestamp, timestamp)
        self.assertEqual(tt.key_size, key_size)
        self.assertEqual(tt.val_size, val_size)

    def test_header_serialisation(self) -> None:
        cases = (
            Header(10, 10, 10),
            Header(0, 0, 0),
            Header(10000, 10000, 10000),
        )
        for case in cases:
            self.header_test(case)

    def test_random(self) -> None:
        for _ in range(100):
            self.header_test(Header(*get_random_header()))

    def test_bad(self) -> None:
        # an int that needs more than 4 bytes cannot be encoded and must raise
        with self.assertRaises(struct.error):
            encode_header(2**32, 5, 5)
69 | 
70 | 
class TestEncodeKV(unittest.TestCase):
    """Round-trip checks for encode_kv / decode_kv."""

    def kv_test(self, tt: KeyValue) -> None:
        # encode the case, decode the bytes and compare every field plus the size
        size, encoded = encode_kv(tt.timestamp, tt.key, tt.val)
        timestamp, key, val = decode_kv(encoded)
        self.assertEqual(tt.timestamp, timestamp)
        self.assertEqual(tt.key, key)
        self.assertEqual(tt.val, val)
        self.assertEqual(tt.sz, size)

    def test_KV_serialisation(self) -> None:
        cases = (
            KeyValue(10, "hello", "world", HEADER_SIZE + 10),
            KeyValue(0, "", "", HEADER_SIZE),
        )
        for case in cases:
            self.kv_test(case)

    def test_random(self) -> None:
        for _ in range(100):
            self.kv_test(KeyValue(*get_random_kv()))
92 | 


--------------------------------------------------------------------------------
/tests/test_disk_store.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import tempfile
  3 | import typing
  4 | import unittest
  5 | 
  6 | from disk_store import DiskStorage
  7 | 
  8 | 
  9 | class TempStorageFile:
 10 |     """
 11 |     TempStorageFile provides a wrapper over the temporary files which are used in
 12 |     testing.
 13 | 
 14 |     Python has two APIs to create temporary files, tempfile.TemporaryFile and
 15 |     tempfile.mkstemp. Files created by tempfile.TemporaryFile gets deleted as soon as
 16 |     they are closed. Since we need to do tests for persistence, we might open and
 17 |     close a file multiple times. Files created using tempfile.mkstemp don't have this
 18 |     limitation, but they have to deleted manually. They don't get deleted when the file
 19 |     descriptor is out scope or our program has exited.
 20 | 
 21 |     Args:
 22 |         path (str): path to the file where our data needs to be stored. If the path
 23 |             parameter is empty, then a temporary will be created using tempfile API
 24 |     """
 25 | 
 26 |     def __init__(self, path: typing.Optional[str] = None):
 27 |         if path:
 28 |             self.path = path
 29 |             return
 30 | 
 31 |         fd, self.path = tempfile.mkstemp()
 32 |         os.close(fd)
 33 | 
 34 |     def clean_up(self) -> None:
 35 |         # NOTE: you might be tempted to use the destructor method `__del__`, however
 36 |         # destructor method gets called whenever the object goes out of scope, and it
 37 |         # will delete our database file. Having a separate method would give us better
 38 |         # control.
 39 |         os.remove(self.path)
 40 | 
 41 | 
 42 | class TestDiskCDB(unittest.TestCase):
 43 |     def setUp(self) -> None:
 44 |         self.file: TempStorageFile = TempStorageFile()
 45 | 
 46 |     def tearDown(self) -> None:
 47 |         self.file.clean_up()
 48 | 
 49 |     def test_get(self) -> None:
 50 |         store = DiskStorage(file_name=self.file.path)
 51 |         store.set("name", "jojo")
 52 |         self.assertEqual(store.get("name"), "jojo")
 53 |         store.close()
 54 | 
 55 |     def test_invalid_key(self) -> None:
 56 |         store = DiskStorage(file_name=self.file.path)
 57 |         self.assertEqual(store.get("some key"), "")
 58 |         store.close()
 59 | 
 60 |     def test_dict_api(self) -> None:
 61 |         store = DiskStorage(file_name=self.file.path)
 62 |         store["name"] = "jojo"
 63 |         self.assertEqual(store["name"], "jojo")
 64 |         store.close()
 65 | 
 66 |     def test_persistence(self) -> None:
 67 |         store = DiskStorage(file_name=self.file.path)
 68 | 
 69 |         tests = {
 70 |             "crime and punishment": "dostoevsky",
 71 |             "anna karenina": "tolstoy",
 72 |             "war and peace": "tolstoy",
 73 |             "hamlet": "shakespeare",
 74 |             "othello": "shakespeare",
 75 |             "brave new world": "huxley",
 76 |             "dune": "frank herbert",
 77 |         }
 78 |         for k, v in tests.items():
 79 |             store.set(k, v)
 80 |             self.assertEqual(store.get(k), v)
 81 |         store.close()
 82 | 
 83 |         store = DiskStorage(file_name=self.file.path)
 84 |         for k, v in tests.items():
 85 |             self.assertEqual(store.get(k), v)
 86 |         store.close()
 87 | 
 88 | 
 89 | class TestDiskCDBExistingFile(unittest.TestCase):
 90 |     def test_get_new_file(self) -> None:
 91 |         t = TempStorageFile(path="temp.db")
 92 |         store = DiskStorage(file_name=t.path)
 93 |         store.set("name", "jojo")
 94 |         self.assertEqual(store.get("name"), "jojo")
 95 |         store.close()
 96 | 
 97 |         # check for key again
 98 |         store = DiskStorage(file_name=t.path)
 99 |         self.assertEqual(store.get("name"), "jojo")
100 |         store.close()
101 |         t.clean_up()
102 | 


--------------------------------------------------------------------------------
/disk_store.py:
--------------------------------------------------------------------------------
  1 | """
  2 | disk_store module implements DiskStorage class which implements the KV store on the
  3 | disk
  4 | 
  5 | DiskStorage provides two simple operations to get and set key value pairs. Both key and
  6 | value need to be of string type. All the data is persisted to disk. During startup,
  7 | DiskStorage loads all the existing KV pair metadata.  It will throw an error if the
  8 | file is invalid or corrupt.
  9 | 
 10 | Do note that if the database file is large, then the initialisation will take time
 11 | accordingly. The initialisation is also a blocking operation, till it is completed
 12 | the DB cannot be used.
 13 | 
 14 | Typical usage example:
 15 | 
 16 |     disk: DiskStorage = DiskStorage(file_name="books.db")
 17 |     disk.set(key="othello", value="shakespeare")
 18 |     author: str = disk.get("othello")
 19 |     # it also supports dictionary style API too:
 20 |     disk["hamlet"] = "shakespeare"
 21 | """
 22 | import os.path
 23 | import time
 24 | import typing
 25 | import struct
 26 | 
 27 | from format import encode_kv, decode_kv, decode_header
 28 | 
 29 | 
 30 | # DiskStorage is a Log-Structured Hash Table as described in the BitCask paper. We
 31 | # keep appending the data to a file, like a log. DiskStorage maintains an in-memory
 32 | # hash table called KeyDir, which keeps the row's location on the disk.
 33 | #
 34 | # The idea is simple yet brilliant:
 35 | #   - Write the record to the disk
 36 | #   - Update the internal hash table to point to that byte offset
 37 | #   - Whenever we get a read request, check the internal hash table for the address,
 38 | #       fetch that and return
 39 | #
 40 | # KeyDir does not store values, only their locations.
 41 | #
 42 | # The above approach solves a lot of problems:
 43 | #   - Writes are insanely fast since you are just appending to the file
 44 | #   - Reads are insanely fast since you do only one disk seek. In B-Tree backed
 45 | #       storage, there could be 2-3 disk seeks
 46 | #
 47 | # However, there are drawbacks too:
 48 | #   - We need to maintain an in-memory hash table KeyDir. A database with a large
 49 | #       number of keys would require more RAM
 50 | #   - Since we need to build the KeyDir at initialisation, it will affect the startup
 51 | #       time too
 52 | #   - Deleted keys need to be purged from the file to reduce the file size
 53 | #
 54 | # Read the paper for more details: https://riak.com/assets/bitcask-intro.pdf
 55 | 
 56 | 
 57 | class DiskStorage:
 58 |     """
 59 |     Implements the KV store on the disk
 60 | 
 61 |     Args:
 62 |         file_name (str): name of the file where all the data will be written. Just
 63 |             passing the file name will save the data in the current directory. You may
 64 |             pass the full file location too.
 65 |     """
 66 | 
 67 |     def __init__(self, file_name: str = "data.db"):
 68 |         self.file_id = file_name
 69 |         self.file_handle = open(file_name, "ab+")
 70 |         self.keydir = {}
 71 |         self.offset = 0
 72 |         while self.offset < os.path.getsize(file_name):
 73 |             self.file_handle.seek(self.offset)
 74 |             header_bytes = self.file_handle.read(12)
 75 |             [timestamp, key_size, value_size] = decode_header(header_bytes)
 76 |             self.file_handle.seek(self.offset)
 77 |             kv_bytes = self.file_handle.read(12 + key_size + value_size)
 78 |             [_, key, value] = decode_kv(kv_bytes)
 79 |             key_size = len(key)
 80 |             value_size = len(value)
 81 |             self.offset = self.offset + 12 + key_size + value_size
 82 |             self.keydir[key] = {
 83 |                 "file_id": file_name,
 84 |                 "value_sz": value_size,
 85 |                 "value_pos": self.offset - value_size,
 86 |                 "tstamp": timestamp,
 87 |             }
 88 | 
 89 |     def set(self, key: str, value: str) -> None:
 90 |         current_timestamp = int(time.time())
 91 |         [_, kv_data] = encode_kv(timestamp=current_timestamp, key=key, value=value)
 92 |         self.file_handle.write(kv_data)
 93 |         self.file_handle.flush()
 94 |         os.fsync(self.file_handle.fileno())
 95 | 
 96 |         if key not in self.keydir:
 97 |             self.keydir[key] = {}
 98 |         self.keydir[key]["file_id"] = self.file_id
 99 |         self.keydir[key]["tstamp"] = current_timestamp
100 |         self.keydir[key]["value_sz"] = len(value)
101 |         self.keydir[key]["value_pos"] = self.file_handle.tell() - len(value)
102 | 
103 |     def get(self, key: str) -> str:
104 |         value = ""
105 |         if key not in self.keydir:
106 |             return value
107 | 
108 |         metadata = self.keydir[key]
109 |         file_path = metadata["file_id"]
110 |         with open(file_path, "rb") as f:
111 |             value_sz = metadata["value_sz"]
112 |             value_pos = metadata["value_pos"]
113 |             f.seek(value_pos)
114 |             value_bytes = f.read(value_sz)
115 |             [value] = struct.unpack(f"{value_sz}s", value_bytes)
116 |         return value.decode("utf-8")
117 | 
118 |     def close(self) -> None:
119 |         self.file_handle.flush()
120 |         os.fsync(self.file_handle.fileno())
121 |         self.file_handle.close()
122 | 
123 |     def __setitem__(self, key: str, value: str) -> None:
124 |         return self.set(key, value)
125 | 
126 |     def __getitem__(self, item: str) -> str:
127 |         return self.get(item)
128 | 


--------------------------------------------------------------------------------