├── LICENSE
├── README.md
├── build
│   └── lib
│       └── cache_to_disk
│           ├── __init__.py
│           └── cache_to_disk.py
├── cache_to_disk.egg-info
│   ├── PKG-INFO
│   ├── SOURCES.txt
│   ├── dependency_links.txt
│   └── top_level.txt
├── cache_to_disk
│   └── __init__.py
└── setup.py

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Stewart Renehan

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# cache_to_disk
Local disk caching decorator for python functions with auto-invalidation.

This is intended to cache functions that both take a long time to run and have return values that are too large to cache in memory with a tool such as Redis. The results of the function are pickled and saved to a file, then unpickled and returned the next time the function is called. The caching is argument-specific, so if the function is called with different arguments, it will be run again. The caching decorator accepts an integer representing the number of days to cache the function for. After this many days, the file for that function will be deleted the next time the module is imported.

# Installation
```bash
pip install cache_to_disk
```

# Functions:
cache_to_disk(n_days_to_cache)
delete_disk_caches_for_function(function_name)
delete_old_disk_caches()


# Examples:
**cache_to_disk**
```python
"""
This example caches the function "my_function" for 3 days.
"""

from cache_to_disk import cache_to_disk

@cache_to_disk(3)
def my_function():
    to_return = []
    for i in range(10000):
        for j in range(i):
            to_return.append(i * j ** .23)
    return to_return
```
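**default cache age**

Because this fork gives `n_days_to_cache` a default value (`DEFAULT_CACHE_AGE`, 7 days), the decorator can also be applied with empty parentheses. A minimal sketch; the decorated function is illustrative only:
```python
"""
This example caches "my_other_function" for the default of 7 days.
"""

from cache_to_disk import cache_to_disk

@cache_to_disk()  # No argument: cached for DEFAULT_CACHE_AGE (7) days
def my_other_function(n):
    return sum(i * i for i in range(n))
```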
**delete_disk_caches_for_function**

```python
"""
This example invalidates all of the caches for the function "my_function". Caches are invalidated automatically when they expire, but this should be used when the function definition has changed and you want it to re-run.
"""

from cache_to_disk import delete_disk_caches_for_function
delete_disk_caches_for_function('my_function')
```

**runtime_accounting**
```python
"""
This example shows how to check the run-time cache accounting, which shows hits, misses and nocache events
"""

from cache_to_disk import cache_to_disk

@cache_to_disk(3)
def query_registrar(host, port, query):
    socket = tcp_connect(host, port)
    socket.send(query)
    response = b''
    while True:
        buf = read_wrapper(socket)
        if buf is None:
            break
        response += buf
    return response

query_registrar('whois.verisign-grs.com', 43, b'test.com')
query_registrar('whois.verisign-grs.com', 43, b'test.com')
query_registrar('whois.verisign-grs.com', 43, b'test.com')
query_registrar('whois.verisign-grs.com', 43, b'test.com')
print(query_registrar.cache_info())
```

**nocache**
```python
"""
This example shows how to inhibit caching depending on certain conditions, such as a network failure, while still returning a value
"""
from cache_to_disk import cache_to_disk, NoCacheCondition
from random import randint

@cache_to_disk(3)
def query_registrar(host, port, query):
    socket = tcp_connect(host, port)
    socket.send(query)
    response = b''
    while True:
        try:
            if randint(0, 5) > 3:
                # Simulate a spurious failure like SIGPIPE/EPIPE
                raise socket.error
            buf = read_wrapper(socket)

            if buf is None:
                break
            response += buf
        except socket.error:
            # To the user, functionally equivalent to `return response`, except
            # the result is not cached, so it can be retried immediately or later
            raise NoCacheCondition(function_value=response)
    return response

query_registrar('whois.verisign-grs.com', 43, b'test.com')
query_registrar('whois.verisign-grs.com', 43, b'test.com')
query_registrar('whois.verisign-grs.com', 43, b'test.com')
query_registrar('whois.verisign-grs.com', 43, b'test.com')
print(query_registrar.cache_info())
```
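**cache_size and cache_clear**

In addition to `cache_info()` (shown above), this fork attaches `cache_size()` and `cache_clear()` to the decorated function. A minimal sketch of both; the function and its timing are illustrative only:
```python
"""
This example shows how to inspect, and then permanently delete, the on-disk cache entries for a single function.
"""

from time import sleep

from cache_to_disk import cache_to_disk

@cache_to_disk(1)
def slow_square(n):
    sleep(2)  # Stand-in for an expensive computation
    return n * n

slow_square(2)
slow_square(3)
print(slow_square.cache_size())  # 2 -- one entry per distinct argument set
slow_square.cache_clear()        # Removes this function's entries from disk
print(slow_square.cache_size())  # None -- no entries remain for this function
```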
--------------------------------------------------------------------------------
/build/lib/cache_to_disk/__init__.py:
--------------------------------------------------------------------------------
"""cache_to_disk: Cache the results of functions persistently on disk

Original Work, Copyright (c) 2018 Stewart Renehan, MIT License
Author: https://github.com/sarenehan
Project: https://github.com/sarenehan/cache_to_disk

Modifications:
Author: https://github.com/mzpqnxow
Project: https://github.com/mzpqnxow/cache_to_disk/tree/feature/nocache

This modified version adds the following:
- Accounting of hits, misses and nocache events
- cache_info(), cache_clear(), cache_size(), cache_get_raw() interfaces accessible
  via the function itself for convenience
- NoCacheCondition exception, a simple interface for a user to prevent a
  specific function result from being cached, while still passing a return
  value to the caller
- Minor refactoring of the decorator, for easier reading
- Minor refactoring of delete_old_disk_caches(), to reduce logical blocks
  and depth of indentation
- Default cache age value (DEFAULT_CACHE_AGE)
- Special unlimited age value (UNLIMITED_CACHE_AGE)
- Use of logging module (but defaulting to NullHandler)
- Minor PEP8 / cosmetic changes
- Minor cosmetic changes to file path generation (use of os.path.join, a constant
  for the directory/file path)
- Support getting cache directory or filename from environment:
    Cache metadata filename: $DISK_CACHE_FILENAME
    Base directory for cache files: $DISK_CACHE_DIR
- Expansion of shell variables and tilde-user values for directories/files
"""
# Standard Library
import json
import logging
import os
import pickle
import warnings
from collections import namedtuple
from copy import deepcopy
from datetime import datetime
from os import getenv
from os.path import (
    dirname,
    exists as file_exists,
    expanduser, expandvars, getmtime,
    isfile,
    join as join_path,
    realpath)


logger = logging.getLogger(__name__)

if not logger.handlers:
    # Don't log unless user explicitly adds a handler
    logger.addHandler(logging.NullHandler())

MAX_PICKLE_BYTES = 2 ** 31 - 1
DISK_CACHE_DIR = expanduser(expandvars(
    getenv('DISK_CACHE_DIR', join_path(dirname(realpath(__file__)), 'disk_cache'))))
DISK_CACHE_FILE = expanduser(expandvars(join_path(
    DISK_CACHE_DIR, getenv('DISK_CACHE_FILENAME', 'cache_to_disk_caches.json'))))

# Specify 0 for cache age days to keep forever; not recommended for obvious reasons
UNLIMITED_CACHE_AGE = 0
DEFAULT_CACHE_AGE = 7

_TOTAL_NUMCACHE_KEY = 'total_number_of_cache_to_disks'

# Run-time cache data, stolen from the Python functools.lru_cache implementation.
# Events resulting in nocache are cache misses that complete, but instruct cache_to_disk to
# not store the result. Useful, for example, in a function that makes a network request and
# experiences a failure that is considered likely to be temporary. This is accomplished in
# the user function by raising NoCacheCondition
_CacheInfo = namedtuple('CacheInfo', ['hits', 'misses', 'nocache'])

# This is probably unnecessary ...
# logger.debug('cache_to_disk package loaded; using DISK_CACHE_DIR=%s',
#              os.path.relpath(DISK_CACHE_DIR, '.'))


class NoCacheCondition(Exception):
    """Custom exception for user function to raise to prevent caching on a per-call basis

    The function_value kwarg can be set as a kwarg to return a value other than None to the
    original caller

    Example
    -------
    The following contrived example will return a value to the caller but avoids it being
    cached. In this example, a socket exception is considered a failure, but there is some
    value in returning a partial response to the caller in cases such as SIGPIPE/EPIPE in
    the read loop

    On a socket exception, the function will effectively return either an empty bytes
    buffer or a bytes buffer with partial response data, depending on where the network
    exception occurred

        @cache_to_disk(7)
        def network_query(hostname, port, query):
            response = b''
            try:
                socket = tcp_connect(hostname)
                socket.send(query)
                while True:
                    # Build the response incrementally
                    buf = read_bytes(socket, 1024)
                    if buf is None:
                        break
                    response += buf
            except socket.error:
                raise NoCacheCondition(function_value=response)

            return response
    """
    __slots__ = ['function_value']

    def __init__(self, function_value=None):
        self.function_value = function_value
        logger.info('NoCacheCondition caught in cache_to_disk')


def write_cache_file(cache_metadata_dict):
    """Dump an object as JSON to a file"""
    with open(DISK_CACHE_FILE, 'w') as f:
        return json.dump(cache_metadata_dict, f)


def load_cache_metadata_json():
    """Load a JSON file, create it with empty cache structure if it doesn't exist"""
    try:
        with open(DISK_CACHE_FILE, 'r') as f:
            return json.load(f)
    except FileNotFoundError:
        write_cache_file({_TOTAL_NUMCACHE_KEY: 0})
        return {_TOTAL_NUMCACHE_KEY: 0}


def ensure_dir(directory):
    """Create a directory tree if it doesn't already exist"""
    if not file_exists(directory):
        os.makedirs(directory)
        write_cache_file({_TOTAL_NUMCACHE_KEY: 0})


def pickle_big_data(data, file_path):
    """Write a pickled Python object to a file in chunks"""
    bytes_out = pickle.dumps(data, protocol=4)
    with open(file_path, 'wb') as f_out:
        for idx in range(0, len(bytes_out), MAX_PICKLE_BYTES):
            f_out.write(bytes_out[idx:idx + MAX_PICKLE_BYTES])


def unpickle_big_data(file_path):
    """Return a Python object from a file containing pickled data in chunks"""
    try:
        with open(file_path, 'rb') as f:
            return pickle.load(f)
    except Exception:  # noqa, pylint: disable=broad-except
        bytes_in = bytearray(0)
        input_size = os.path.getsize(file_path)
        with open(file_path, 'rb') as f_in:
            for _ in range(0, input_size, MAX_PICKLE_BYTES):
                bytes_in += f_in.read(MAX_PICKLE_BYTES)
        return pickle.loads(bytes_in)


def get_age_of_file(filename, unit='days'):
    """Return the age of a file in the given unit (an attribute of a datetime.timedelta, e.g. 'days')"""
    age = (datetime.today() - datetime.fromtimestamp(getmtime(filename)))
    return getattr(age, unit)


def get_files_in_directory(directory):
    """Return all files in a directory, non-recursive"""
    return [
        f for f in os.listdir(directory) if
        isfile(join_path(directory, f))
    ]


def delete_old_disk_caches():
    cache_metadata = load_cache_metadata_json()
    new_cache_metadata = deepcopy(cache_metadata)
    cache_changed = False
    for function_name, function_caches in cache_metadata.items():
        if function_name == _TOTAL_NUMCACHE_KEY:
            continue
        to_keep = []
        for function_cache in function_caches:
            max_age_days = int(function_cache['max_age_days'])
            file_name = join_path(DISK_CACHE_DIR, function_cache['file_name'])
            if not file_exists(file_name):
                cache_changed = True
                continue
            # Keep the entry unless it is expired AND age limiting is enabled
            if not get_age_of_file(file_name) > max_age_days != UNLIMITED_CACHE_AGE:
                to_keep.append(function_cache)
                continue
            logger.info('Removing stale cache file %s, > %d days', file_name, max_age_days)
            cache_changed = True
            os.remove(file_name)
        if to_keep:
            new_cache_metadata[function_name] = to_keep
        else:
            # Every cache for this function was stale; drop its metadata entry
            new_cache_metadata.pop(function_name, None)
    if cache_changed:
        write_cache_file(new_cache_metadata)


def get_disk_cache_for_function(function_name):
    cache_metadata = load_cache_metadata_json()
    return cache_metadata.get(function_name, None)


def get_disk_cache_size_for_function(function_name):
    """Return the current number of entries in the cache for a function by name"""
    function_cache = get_disk_cache_for_function(function_name)
    return None if function_cache is None else len(function_cache)


def delete_disk_caches_for_function(function_name):
    logger.debug('Removing cache entries for %s', function_name)
    n_deleted = 0
    cache_metadata = load_cache_metadata_json()
    if function_name not in cache_metadata:
        return

    functions_to_delete_cache_for = cache_metadata.pop(function_name)
    for function_cache in functions_to_delete_cache_for:
        file_name = join_path(DISK_CACHE_DIR, function_cache['file_name'])
        os.remove(file_name)
        n_deleted += 1
    logger.debug('Removed %s cache entries for %s', n_deleted, function_name)
    write_cache_file(cache_metadata)


def cache_exists(cache_metadata, function_name, *args, **kwargs):
    if function_name not in cache_metadata:
        return False, None
    new_caches_for_function = []
    cache_changed = False
    for function_cache in cache_metadata[function_name]:
        if function_cache['args'] == str(args) and (
                function_cache['kwargs'] == str(kwargs)):
            max_age_days = int(function_cache['max_age_days'])
            file_name = join_path(DISK_CACHE_DIR, function_cache['file_name'])
            if file_exists(file_name):
                # Expired and age limiting enabled: remove the stale cache file
                if get_age_of_file(file_name) > max_age_days != UNLIMITED_CACHE_AGE:
                    os.remove(file_name)
                    cache_changed = True
                else:
                    function_value = unpickle_big_data(file_name)
                    return True, function_value
            else:
                cache_changed = True
        else:
            new_caches_for_function.append(function_cache)
    if cache_changed:
        if new_caches_for_function:
            cache_metadata[function_name] = new_caches_for_function
        else:
            cache_metadata.pop(function_name)
        write_cache_file(cache_metadata)
    return False, None


def cache_function_value(
        function_value,
        n_days_to_cache,
        cache_metadata,
        function_name,
        *args,
        **kwargs):
    if function_name == _TOTAL_NUMCACHE_KEY:
        raise Exception(
            "Can't cache function named %s" % _TOTAL_NUMCACHE_KEY)
    function_caches = cache_metadata.get(function_name, [])
    new_file_name = str(int(cache_metadata[_TOTAL_NUMCACHE_KEY]) + 1) + '.pkl'
    new_cache = {
        'args': str(args),
        'kwargs': str(kwargs),
        'file_name': new_file_name,
        'max_age_days': n_days_to_cache
    }
    pickle_big_data(function_value, join_path(DISK_CACHE_DIR, new_file_name))
    function_caches.append(new_cache)
    cache_metadata[function_name] = function_caches
    cache_metadata[_TOTAL_NUMCACHE_KEY] = int(cache_metadata[_TOTAL_NUMCACHE_KEY]) + 1
    write_cache_file(cache_metadata)


def cache_to_disk(n_days_to_cache=DEFAULT_CACHE_AGE):
    """Cache to disk"""
    if n_days_to_cache == UNLIMITED_CACHE_AGE:
        warnings.warn('Using an unlimited age cache is not recommended', stacklevel=3)
    if isinstance(n_days_to_cache, int):
        if n_days_to_cache < 0:
            n_days_to_cache = 0
    elif n_days_to_cache is not None:
        raise TypeError('Expected n_days_to_cache to be an integer or None')

    def decorating_function(original_function):
        wrapper = _cache_to_disk_wrapper(original_function, n_days_to_cache, _CacheInfo)
        return wrapper

    return decorating_function


def _cache_to_disk_wrapper(original_func, n_days_to_cache, _CacheInfo):  # noqa, pylint: disable=invalid-name
    hits = misses = nocache = 0

    def wrapper(*args, **kwargs):
        nonlocal hits, misses, nocache
        cache_metadata = load_cache_metadata_json()
        already_cached, function_value = cache_exists(
            cache_metadata, original_func.__name__, *args, **kwargs)
        if already_cached:
            logger.debug('Cache HIT on %s (hits=%s, misses=%s, nocache=%s)',
                         original_func.__name__, hits, misses, nocache)
            hits += 1
            return function_value

        logger.debug('Cache MISS on %s (hits=%s, misses=%s, nocache=%s)',
                     original_func.__name__, hits, misses, nocache)
        logger.debug(' -- MISS ARGS: (%s)', ','.join(
            [str(arg) for arg in args]))
        logger.debug(' -- MISS KWARGS: (%s)', ','.join(
            ['{}={}'.format(str(k), str(v)) for k, v in kwargs.items()]))
        misses += 1

        try:
            function_value = original_func(*args, **kwargs)
        except NoCacheCondition as err:
            nocache += 1
            logger.debug('%s() threw NoCacheCondition exception; no new cache entry', original_func.__name__)
            function_value = err.function_value
        else:
            logger.debug('%s() returned, adding cache entry', original_func.__name__)
            cache_function_value(
                function_value,
                n_days_to_cache,
                cache_metadata,
                original_func.__name__,
                *args,
                **kwargs)
        return function_value

    def cache_info():
        """Report runtime cache statistics"""
        return _CacheInfo(hits, misses, nocache)

    def cache_clear():
        """Clear the cache permanently from disk for this function"""
        logger.info('Cache clear requested for %s(); %s items in cache ...',
                    original_func.__name__, cache_size())
        delete_disk_caches_for_function(original_func.__name__)

    def cache_size():
        """Return the number of cached entries for this function"""
        return get_disk_cache_size_for_function(original_func.__name__)

    def cache_get_raw():
        """Return the raw cache object for this function as a list of dicts"""
        warnings.warn('This is an internal interface and should not be used lightly', stacklevel=3)
        return get_disk_cache_for_function(original_func.__name__)

    wrapper.cache_info = cache_info
    wrapper.cache_clear = cache_clear
    wrapper.cache_size = cache_size
    wrapper.cache_get_raw = cache_get_raw
    return wrapper


ensure_dir(DISK_CACHE_DIR)
delete_old_disk_caches()
--------------------------------------------------------------------------------
/build/lib/cache_to_disk/cache_to_disk.py:
--------------------------------------------------------------------------------
# Standard Library
from datetime import datetime
from os.path import isfile, join, exists, getmtime
import os
import pickle
import zlib

max_bytes = 2**31 - 1
disk_cache_dir = os.path.dirname(os.path.realpath(__file__)) + '/disk_cache/'


def pickle_big_data(data, file_path):
    bytes_out = pickle.dumps(data, protocol=4)
    with open(file_path, 'wb') as f_out:
        for idx in range(0, len(bytes_out), max_bytes):
            f_out.write(bytes_out[idx:idx + max_bytes])


def unpickle_big_data(file_path):
    try:
        with open(file_path, 'rb') as f:
            return pickle.load(f)
    except Exception:
        bytes_in = bytearray(0)
        input_size = os.path.getsize(file_path)
        with open(file_path, 'rb') as f_in:
            for _ in range(0, input_size, max_bytes):
                bytes_in += f_in.read(max_bytes)
        return pickle.loads(bytes_in)


def get_age_of_file(filename, unit='days'):
    age = (datetime.today() - datetime.fromtimestamp(getmtime(filename)))
    return getattr(age, unit)


def get_files_in_directory(directory):
    return [
        f for f in os.listdir(directory) if
        isfile(join(directory, f))
    ]


def delete_old_disk_caches():
    n_deleted = 0
    deleted_caches = []
    for file in get_files_in_directory(disk_cache_dir):
        max_age_days = int(file.split('_')[-1].replace('.pkl', ''))
        if get_age_of_file(disk_cache_dir + file) > max_age_days:
            os.remove(disk_cache_dir + file)
            deleted_caches.append(file)
            n_deleted += 1
    print('Expired {} caches:'.format(n_deleted))
    for deleted_cache in deleted_caches:
        print('\t{}'.format(deleted_cache))


def delete_disk_caches_for_function(function_name):
    n_deleted = 0
    for file in get_files_in_directory(disk_cache_dir):
        cached_function = '_'.join(file.split('_')[1:-1])
        if function_name == cached_function:
            os.remove(disk_cache_dir + file)
            n_deleted += 1
    print('Removed {} caches for {}'.format(n_deleted, function_name))


def cache_to_disk(n_days_to_cache):
    def decorator(original_func):
        delete_old_disk_caches()

        def new_func(*args, **kwargs):
            prefix_str = original_func.__name__ + '::' + str(args) + str(
                kwargs)
            prefix = zlib.adler32(prefix_str.encode())
            filename = '{}_{}_{}.pkl'.format(
                prefix, original_func.__name__, n_days_to_cache)
            file_path = disk_cache_dir + filename
            if exists(file_path):
                return unpickle_big_data(file_path)
            function_value = original_func(*args, **kwargs)
            pickle_big_data(function_value, file_path)
            return function_value
        return new_func
    return decorator
--------------------------------------------------------------------------------
/cache_to_disk.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
Metadata-Version: 2.1
Name: cache-to-disk
Version: 2.0.0
Summary: Local disk caching decorator for python functions.
Home-page: https://github.com/sarenehan/cache_to_disk
Author: Stewart Renehan
Author-email: sarenehan@gmail.com
License: UNKNOWN
Description: # cache_to_disk
        Local disk caching decorator for python functions with auto-invalidation.
        
        This is intended to cache functions that both take a long time to run and have return values that are too large to cache in memory with a tool such as Redis. The results of the function are pickled and saved to a file, then unpickled and returned the next time the function is called. The caching is argument-specific, so if the function is called with different arguments, it will be run again. The caching decorator accepts an integer representing the number of days to cache the function for. After this many days, the file for that function will be deleted the next time the module is imported.
        
        # Installation
        ```bash
        pip install cache_to_disk
        ```
        
        # Functions:
        cache_to_disk(n_days_to_cache)
        delete_disk_caches_for_function(function_name)
        delete_old_disk_caches()
        
        
        # Examples:
        **cache_to_disk**
        ```python
        """
        This example caches the function "my_function" for 3 days.
        """
        
        from cache_to_disk import cache_to_disk
        
        @cache_to_disk(3)
        def my_function():
            to_return = []
            for i in range(10000):
                for j in range(i):
                    to_return.append(i * j ** .23)
            return to_return
        ```
        **delete_disk_caches_for_function**
        
        ```python
        """
        This example invalidates all of the caches for the function "my_function". Caches are invalidated automatically when they expire, but this should be used when the function definition has changed and you want it to re-run.
        """
        
        from cache_to_disk import delete_disk_caches_for_function
        delete_disk_caches_for_function('my_function')
        ```
        
        **runtime_accounting**
        ```python
        """
        This example shows how to check the run-time cache accounting, which shows hits, misses and nocache events
        """
        
        from cache_to_disk import cache_to_disk
        
        @cache_to_disk(3)
        def query_registrar(host, port, query):
            socket = tcp_connect(host, port)
            socket.send(query)
            response = b''
            while True:
                buf = read_wrapper(socket)
                if buf is None:
                    break
                response += buf
            return response
        
        query_registrar('whois.verisign-grs.com', 43, b'test.com')
        query_registrar('whois.verisign-grs.com', 43, b'test.com')
        query_registrar('whois.verisign-grs.com', 43, b'test.com')
        query_registrar('whois.verisign-grs.com', 43, b'test.com')
        print(query_registrar.cache_info())
        ```
        
        **nocache**
        ```python
        """
        This example shows how to inhibit caching depending on certain conditions, such as a network failure, while still returning a value
        """
        from cache_to_disk import cache_to_disk, NoCacheCondition
        from random import randint
        @cache_to_disk(3)
        def query_registrar(host, port, query):
            socket = tcp_connect(host, port)
            socket.send(query)
            response = b''
            while True:
                try:
                    if randint(0, 5) > 3:
                        # Simulate a spurious failure like SIGPIPE/EPIPE
                        raise socket.error
                    buf = read_wrapper(socket)
        
                    if buf is None:
                        break
                    response += buf
                except socket.error:
                    # To the user, functionally equivalent to `return response`, except
                    # the result is not cached, so it can be retried immediately or later
                    raise NoCacheCondition(function_value=response)
            return response
        
        query_registrar('whois.verisign-grs.com', 43, b'test.com')
        query_registrar('whois.verisign-grs.com', 43, b'test.com')
        query_registrar('whois.verisign-grs.com', 43, b'test.com')
        query_registrar('whois.verisign-grs.com', 43, b'test.com')
        print(query_registrar.cache_info())
        ```

Platform: UNKNOWN
Classifier: Programming Language :: Python :: 3
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Description-Content-Type: text/markdown
--------------------------------------------------------------------------------
/cache_to_disk.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
README.md
setup.py
cache_to_disk/__init__.py
cache_to_disk.egg-info/PKG-INFO
cache_to_disk.egg-info/SOURCES.txt
cache_to_disk.egg-info/dependency_links.txt
cache_to_disk.egg-info/top_level.txt
--------------------------------------------------------------------------------
/cache_to_disk.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/cache_to_disk.egg-info/top_level.txt:
--------------------------------------------------------------------------------
cache_to_disk
--------------------------------------------------------------------------------
/cache_to_disk/__init__.py:
--------------------------------------------------------------------------------
"""cache_to_disk: Cache the results of functions persistently on disk

Original Work, Copyright (c) 2018 Stewart Renehan, MIT License
Author: https://github.com/sarenehan
Project: https://github.com/sarenehan/cache_to_disk

Modifications:
Author: https://github.com/mzpqnxow
Project: https://github.com/mzpqnxow/cache_to_disk/tree/feature/nocache

This modified version adds the following:
- Accounting of hits, misses and nocache events
- cache_info(), cache_clear(), cache_size(), cache_get_raw() interfaces accessible
  via the function itself for convenience
- NoCacheCondition exception, a simple interface for a user to prevent a
  specific function result from being cached, while still passing a return
  value to the caller
- Minor refactoring of the decorator, for easier reading
- Minor refactoring of delete_old_disk_caches(), to reduce logical blocks
  and depth of indentation
- Default cache age value (DEFAULT_CACHE_AGE)
- Special unlimited age value (UNLIMITED_CACHE_AGE)
- Use of logging module (but defaulting to NullHandler)
- Minor PEP8 / cosmetic changes
- Minor cosmetic changes to file path generation (use of os.path.join, a constant
  for the directory/file path)
- Support getting cache directory or filename from environment:
    Cache metadata filename: $DISK_CACHE_FILENAME
    Base directory for cache files: $DISK_CACHE_DIR
- Expansion of shell variables and tilde-user values for directories/files
"""
# Standard Library
import json
import logging
import os
import pickle
import warnings
from collections import namedtuple
from copy import deepcopy
from datetime import datetime
from os import getenv
from os.path import (
    dirname,
    exists as file_exists,
    expanduser, expandvars, getmtime,
    isfile,
    join as join_path,
    realpath)


logger = logging.getLogger(__name__)

if not logger.handlers:
    # Don't log unless user explicitly adds a handler
    logger.addHandler(logging.NullHandler())

MAX_PICKLE_BYTES = 2 ** 31 - 1
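# The two assignments below read $DISK_CACHE_DIR and $DISK_CACHE_FILENAME when
# the module is imported, so the cache location can be redirected without code
# changes. For example (the path is illustrative only):
#
#   import os
#   os.environ['DISK_CACHE_DIR'] = '/tmp/my_disk_cache'
#   import cache_to_disk  # the override takes effect at import time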
DISK_CACHE_DIR = expanduser(expandvars(
    getenv('DISK_CACHE_DIR', join_path(dirname(realpath(__file__)), 'disk_cache'))))
DISK_CACHE_FILE = expanduser(expandvars(join_path(
    DISK_CACHE_DIR, getenv('DISK_CACHE_FILENAME', 'cache_to_disk_caches.json'))))

# Specify 0 for cache age days to keep forever; not recommended for obvious reasons
UNLIMITED_CACHE_AGE = 0
DEFAULT_CACHE_AGE = 7

_TOTAL_NUMCACHE_KEY = 'total_number_of_cache_to_disks'

# Run-time cache data, stolen from the Python functools.lru_cache implementation.
# Events resulting in nocache are cache misses that complete, but instruct cache_to_disk to
# not store the result. Useful, for example, in a function that makes a network request and
# experiences a failure that is considered likely to be temporary. This is accomplished in
# the user function by raising NoCacheCondition
_CacheInfo = namedtuple('CacheInfo', ['hits', 'misses', 'nocache'])

# This is probably unnecessary ...
# logger.debug('cache_to_disk package loaded; using DISK_CACHE_DIR=%s',
#              os.path.relpath(DISK_CACHE_DIR, '.'))


class NoCacheCondition(Exception):
    """Custom exception for user function to raise to prevent caching on a per-call basis

    The function_value kwarg can be set as a kwarg to return a value other than None to the
    original caller

    Example
    -------
    The following contrived example will return a value to the caller but avoids it being
    cached. In this example, a socket exception is considered a failure, but there is some
    value in returning a partial response to the caller in cases such as SIGPIPE/EPIPE in
    the read loop

    On a socket exception, the function will effectively return either an empty bytes
    buffer or a bytes buffer with partial response data, depending on where the network
    exception occurred

        @cache_to_disk(7)
        def network_query(hostname, port, query):
            response = b''
            try:
                socket = tcp_connect(hostname)
                socket.send(query)
                while True:
                    # Build the response incrementally
                    buf = read_bytes(socket, 1024)
                    if buf is None:
                        break
                    response += buf
            except socket.error:
                raise NoCacheCondition(function_value=response)

            return response
    """
    __slots__ = ['function_value']

    def __init__(self, function_value=None):
        self.function_value = function_value
        logger.info('NoCacheCondition caught in cache_to_disk')


def write_cache_file(cache_metadata_dict):
    """Dump an object as JSON to a file"""
    with open(DISK_CACHE_FILE, 'w') as f:
        return json.dump(cache_metadata_dict, f)


def load_cache_metadata_json():
    """Load a JSON file, create it with empty cache structure if it doesn't exist"""
    try:
        with open(DISK_CACHE_FILE, 'r') as f:
            return json.load(f)
    except FileNotFoundError:
        write_cache_file({_TOTAL_NUMCACHE_KEY: 0})
        return {_TOTAL_NUMCACHE_KEY: 0}


def ensure_dir(directory):
    """Create a directory tree if it doesn't already exist"""
    if not file_exists(directory):
        os.makedirs(directory)
        write_cache_file({_TOTAL_NUMCACHE_KEY: 0})


def pickle_big_data(data, file_path):
    """Write a pickled Python object to a file in chunks"""
    bytes_out = pickle.dumps(data, protocol=4)
    with open(file_path, 'wb') as f_out:
        for idx in range(0, len(bytes_out), MAX_PICKLE_BYTES):
            f_out.write(bytes_out[idx:idx + MAX_PICKLE_BYTES])


def unpickle_big_data(file_path):
    """Return a Python object from a file containing pickled data in chunks"""
    try:
        with open(file_path, 'rb') as f:
            return pickle.load(f)
    except Exception:  # noqa, pylint: disable=broad-except
        bytes_in = bytearray(0)
        input_size = os.path.getsize(file_path)
        with open(file_path, 'rb') as f_in:
            for _ in range(0, input_size, MAX_PICKLE_BYTES):
                bytes_in += f_in.read(MAX_PICKLE_BYTES)
        return pickle.loads(bytes_in)


def get_age_of_file(filename, unit='days'):
    """Return the age of a file in the given unit (an attribute of a datetime.timedelta, e.g. 'days')"""
    age = (datetime.today() - datetime.fromtimestamp(getmtime(filename)))
    return getattr(age, unit)


def get_files_in_directory(directory):
    """Return all files in a directory, non-recursive"""
    return [
        f for f in os.listdir(directory) if
        isfile(join_path(directory, f))
    ]


def delete_old_disk_caches():
    cache_metadata = load_cache_metadata_json()
    new_cache_metadata = deepcopy(cache_metadata)
    cache_changed = False
    for function_name, function_caches in cache_metadata.items():
        if function_name == _TOTAL_NUMCACHE_KEY:
            continue
        to_keep = []
        for function_cache in function_caches:
            max_age_days = int(function_cache['max_age_days'])
            file_name = join_path(DISK_CACHE_DIR, function_cache['file_name'])
            if not file_exists(file_name):
                cache_changed = True
                continue
            # Keep the entry unless it is expired AND age limiting is enabled
            if not get_age_of_file(file_name) > max_age_days != UNLIMITED_CACHE_AGE:
                to_keep.append(function_cache)
                continue
            logger.info('Removing stale cache file %s, > %d days', file_name, max_age_days)
            cache_changed = True
            os.remove(file_name)
        if to_keep:
            new_cache_metadata[function_name] = to_keep
        else:
            # Every cache for this function was stale; drop its metadata entry
            new_cache_metadata.pop(function_name, None)
    if cache_changed:
        write_cache_file(new_cache_metadata)


def get_disk_cache_for_function(function_name):
    cache_metadata = load_cache_metadata_json()
    return cache_metadata.get(function_name, None)


def get_disk_cache_size_for_function(function_name):
    """Return the current number of entries in the cache for a function by name"""
    function_cache = get_disk_cache_for_function(function_name)
    return None if function_cache is None else len(function_cache)


def delete_disk_caches_for_function(function_name):
    logger.debug('Removing cache entries for %s', function_name)
    n_deleted = 0
    cache_metadata = load_cache_metadata_json()
    if function_name not in cache_metadata:
        return

    functions_to_delete_cache_for = cache_metadata.pop(function_name)
    for function_cache in functions_to_delete_cache_for:
        file_name = join_path(DISK_CACHE_DIR, function_cache['file_name'])
        os.remove(file_name)
        n_deleted += 1
    logger.debug('Removed %s cache entries for %s', n_deleted, function_name)
    write_cache_file(cache_metadata)


def cache_exists(cache_metadata, function_name, *args, **kwargs):
    if function_name not in cache_metadata:
        return False, None
    new_caches_for_function = []
    cache_changed = False
    for function_cache in cache_metadata[function_name]:
        if function_cache['args'] == str(args) and (
                function_cache['kwargs'] == str(kwargs)):
            max_age_days = int(function_cache['max_age_days'])
            file_name = join_path(DISK_CACHE_DIR, function_cache['file_name'])
            if file_exists(file_name):
                # Expired and age limiting enabled: remove the stale cache file
                if get_age_of_file(file_name) > max_age_days != UNLIMITED_CACHE_AGE:
                    os.remove(file_name)
                    cache_changed = True
                else:
                    function_value = unpickle_big_data(file_name)
                    return True, function_value
            else:
                cache_changed = True
        else:
            new_caches_for_function.append(function_cache)
    if cache_changed:
        if new_caches_for_function:
            cache_metadata[function_name] = new_caches_for_function
        else:
            cache_metadata.pop(function_name)
        write_cache_file(cache_metadata)
    return False, None


def cache_function_value(
        function_value,
        n_days_to_cache,
        cache_metadata,
        function_name,
        *args,
        **kwargs):
    if function_name == _TOTAL_NUMCACHE_KEY:
        raise Exception(
            "Can't cache function named %s" % _TOTAL_NUMCACHE_KEY)
    function_caches = cache_metadata.get(function_name, [])
    new_file_name = str(int(cache_metadata[_TOTAL_NUMCACHE_KEY]) + 1) + '.pkl'
    new_cache = {
        'args': str(args),
        'kwargs': str(kwargs),
        'file_name': new_file_name,
        'max_age_days': n_days_to_cache
    }
    pickle_big_data(function_value, join_path(DISK_CACHE_DIR, new_file_name))
    function_caches.append(new_cache)
    cache_metadata[function_name] = function_caches
    cache_metadata[_TOTAL_NUMCACHE_KEY] = int(cache_metadata[_TOTAL_NUMCACHE_KEY]) + 1
    write_cache_file(cache_metadata)


def cache_to_disk(n_days_to_cache=DEFAULT_CACHE_AGE):
    """Cache to disk"""
    if n_days_to_cache == UNLIMITED_CACHE_AGE:
        warnings.warn('Using an unlimited age cache is not recommended', stacklevel=3)
    if isinstance(n_days_to_cache, int):
        if n_days_to_cache < 0:
            n_days_to_cache = 0
    elif n_days_to_cache is not None:
        raise TypeError('Expected n_days_to_cache to be an integer or None')

    def decorating_function(original_function):
        wrapper = _cache_to_disk_wrapper(original_function, n_days_to_cache, _CacheInfo)
        return wrapper

    return decorating_function


def _cache_to_disk_wrapper(original_func, n_days_to_cache, _CacheInfo):  # noqa, pylint: disable=invalid-name
    hits = misses = nocache = 0

    def wrapper(*args, **kwargs):
        nonlocal hits, misses, nocache
        cache_metadata = load_cache_metadata_json()
        already_cached, function_value = cache_exists(
            cache_metadata, original_func.__name__, *args, **kwargs)
        if already_cached:
            logger.debug('Cache HIT on %s (hits=%s, misses=%s, nocache=%s)',
                         original_func.__name__, hits, misses, nocache)
            hits += 1
            return function_value

        logger.debug('Cache MISS on %s (hits=%s, misses=%s, nocache=%s)',
                     original_func.__name__, hits, misses, nocache)
        logger.debug(' -- MISS ARGS: (%s)', ','.join(
            [str(arg) for arg in args]))
        logger.debug(' -- MISS KWARGS: (%s)', ','.join(
            ['{}={}'.format(str(k), str(v)) for k, v in kwargs.items()]))
        misses += 1

        try:
            function_value = original_func(*args, **kwargs)
        except NoCacheCondition as err:
            nocache += 1
            logger.debug('%s() threw NoCacheCondition exception; no new cache entry', original_func.__name__)
            function_value = err.function_value
        else:
            logger.debug('%s() returned, adding cache entry', original_func.__name__)
            cache_function_value(
                function_value,
                n_days_to_cache,
                cache_metadata,
                original_func.__name__,
                *args,
                **kwargs)
        return function_value

    def cache_info():
        """Report runtime cache statistics"""
        return _CacheInfo(hits, misses, nocache)

    def cache_clear():
        """Clear the cache permanently from disk for this function"""
        logger.info('Cache clear requested for %s(); %s items in cache ...',
                    original_func.__name__, cache_size())
        delete_disk_caches_for_function(original_func.__name__)

    def cache_size():
        """Return the number of cached entries for this function"""
        return get_disk_cache_size_for_function(original_func.__name__)

    def cache_get_raw():
        """Return the raw cache object for this function as a list of dicts"""
        warnings.warn('This is an internal interface and should not be used lightly', stacklevel=3)
        return get_disk_cache_for_function(original_func.__name__)

    wrapper.cache_info = cache_info
    wrapper.cache_clear = cache_clear
    wrapper.cache_size = cache_size
    wrapper.cache_get_raw = cache_get_raw
    return wrapper


ensure_dir(DISK_CACHE_DIR)
delete_old_disk_caches()
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
import setuptools

with open("README.md", "r") as fh:
    long_description = fh.read()

setuptools.setup(
    name="cache_to_disk",
    version="2.0.0",
    author="Stewart Renehan",
    author_email="sarenehan@gmail.com",
    description="Local disk caching decorator for python functions.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/sarenehan/cache_to_disk",
    packages=setuptools.find_packages(),
    classifiers=(
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ),
)
--------------------------------------------------------------------------------