├── .gitignore
├── LICENSE
├── README.md
├── pydirwatch
│   ├── __init__.py
│   └── watch.py
├── pyproject.toml
└── test
    ├── integration_STOP.py
    ├── stress_test.py
    └── test_watch.py

/.gitignore:
--------------------------------------------------------------------------------
#vscode junk
.vscode/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

#Listen history
*.temp

# C extensions
*.so
#MacOS files
*.DS_Store

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2024 noahridge

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# python-directory-watch

A minimal Python package for watching a directory for new files. Supports filtering by file type and a persistent history across restarts.

Designed to allow files to be processed as they are created, such as reading new CSV data files into a database.

*Note: this library is experimental and may change without warning.*

## Usage

The ```listen``` generator yields the path of each new file found in the provided directory. New files are discovered by polling the directory. The generator blocks while no new files are found, so it should be run in a separate thread or Python process.
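
For example, a minimal sketch of running the listener in a background thread and handing new paths to the main thread with a ```queue.Queue``` (the queue, the worker function, and the ```test_dir``` directory here are illustrative, not part of the package):

```python
import queue
import threading
from pathlib import Path

from pydirwatch import listen


def watch_worker(new_files: queue.Queue) -> None:
    # listen() blocks between new files, so run it as a daemon thread and let
    # the interpreter exit without joining it.
    for new_file_path in listen(Path("test_dir"), pattern="*.txt"):
        new_files.put(new_file_path)


new_files = queue.Queue()
threading.Thread(target=watch_worker, args=(new_files,), daemon=True).start()

while True:
    new_file_path = new_files.get()  # blocks until the watcher finds a file
    print(f"{new_file_path}")
```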

The ```mangage_history``` context manager provides a mechanism to write a history file to disk. This is simply a text file containing a newline delimited list of paths which have already been processed. The history manager returns a Python set of string paths. Any items added to this set will automatically be written to the file when the context manager exits.

The ```listen``` generator can be used without persistent history. In that case, every file matching the pattern will be yielded each time the generator is started.

```python
from pathlib import Path
import traceback

from pydirwatch import listen

for new_file_path in listen(Path("test_dir"), pattern="*.txt"):

    try:
        # DO STUFF with new_file_path
        print(f"{new_file_path}")

    except Exception:
        # For use cases such as writing to a database, exceptions should often
        # be handled without stopping the Python process.
        traceback.print_exc()
```

The ```listen_with_history``` generator will automatically write the persistent history file to disk when the generator exits. The ```errors``` keyword argument determines how exceptions raised within the body of the loop are treated. If an exception occurs within the body of the loop, the path yielded on that iteration will not be written to the history file.

```python
from pathlib import Path
import traceback

from pydirwatch import listen_with_history

for new_file_path in listen_with_history(Path("test_dir"), pattern="*.txt", errors="raise"):

    try:
        # DO STUFF with new_file_path
        print(f"{new_file_path}")

    except Exception:
        # For use cases such as writing to a database, exceptions should often
        # be handled without stopping the Python process.
        traceback.print_exc()
```

If more control over history and exception handling is desired, the ```mangage_history``` context manager and the ```listen``` generator can be called directly. By creating a history file and using the ```mangage_history``` context manager, history can be persisted on disk. Only files not present in the history will be yielded when the generator starts. A path for the history file should be provided to ```mangage_history```. The paths returned from ```mangage_history``` are plain strings; ```listen``` converts them to ```pathlib.Path``` objects internally.

```python
from pathlib import Path
import traceback

from pydirwatch import listen, mangage_history

history_file = (Path.cwd().resolve() / Path("~pydirwatch_history.temp"))

with mangage_history(history_file) as history:

    for new_file_path in listen(Path("test_dir"), history_paths=history, pattern="*.txt"):

        try:
            # DO STUFF with new_file_path
            print(f"{new_file_path}")

            # ADD to history if successful
            history.add(new_file_path)

        except Exception:
            # For use cases such as writing to a database, exceptions should often
            # be handled without stopping the Python process.
            traceback.print_exc()
```
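
The history file can also be seeded before a listener is started. A minimal sketch, reusing the hypothetical ```test_dir``` directory from the examples above, that records every file already present so a later run only yields files created afterwards:

```python
from pathlib import Path

from pydirwatch import mangage_history

history_file = Path.cwd().resolve() / Path("~pydirwatch_history.temp")

with mangage_history(history_file) as history:
    # Items added to the set are written to the history file when the
    # context manager exits.
    history.update(str(p.resolve()) for p in Path("test_dir").glob("*.txt"))
```

A listener started afterwards with ```history_paths=history``` (or ```listen_with_history``` pointed at the same history file) will skip these files.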

## Installation

```
pip install git+https://github.com/noahridge/python-directory-watch.git
```

--------------------------------------------------------------------------------
/pydirwatch/__init__.py:
--------------------------------------------------------------------------------
from .watch import listen, mangage_history, listen_with_history

__all__ = ["listen", "mangage_history", "listen_with_history"]

--------------------------------------------------------------------------------
/pydirwatch/watch.py:
--------------------------------------------------------------------------------
# Allows builtin generics (set[str], tuple[int, int]) in annotations on Python 3.8.
from __future__ import annotations

from pathlib import Path
from contextlib import contextmanager
from typing import Generator
from os import PathLike
from traceback import print_exc
import time
import os
from dataclasses import dataclass


@dataclass
class FileInfo:
    path: Path
    inode: tuple[int, int]

    @classmethod
    def from_path(cls, path: os.PathLike):
        st = os.stat(path)
        return cls(Path(path), (st.st_ino, st.st_dev))

    def __hash__(self):
        return hash(self.inode)

    def __eq__(self, other: object) -> bool:
        # Two FileInfo objects refer to the same file when their
        # (inode, device) pairs match, regardless of the path used.
        if not isinstance(other, FileInfo):
            return NotImplemented
        return self.inode == other.inode


@contextmanager
def mangage_history(history_file: Path) -> Generator[set[str], None, None]:
    """Context manager which reads and writes a file of newline delimited strings.
    Yields a set of the strings contained in the file, or an empty set;
    any items added to the set will be written to the file when the context manager exits.
    Creates the file if the history_file argument is not an existing filepath.

    Parameters
    ----------
    history_file : Path
        Path to the history file location.

    Yields
    ------
    Generator[set[str], None, None]
        A set containing the items from the newline delimited file.
    """
    history_file.touch()

    try:
        with open(history_file, mode="r") as f:
            initial = set([x.strip() for x in f.readlines()])
        yield initial

    finally:
        with open(history_file, mode="w") as f:
            for file in initial:
                f.write(f"{file}\n")


def listen(
    path: Path,
    *,
    history_paths: set[PathLike] = set(),
    pattern: str = "*",
    resolve_paths: bool = True,
    polling_rate=10,
) -> Generator[Path, None, None]:
    """Generator which polls a directory for new files and yields each new path found. Blocks until a new file is found.

    Parameters
    ----------
    path : Path
        The directory which should be watched for new files.
    history_paths : set[PathLike], optional
        Set of path objects which should be ignored (usually because they were already processed), by default an empty set()
    pattern : str, optional
        Unix glob pattern used to filter the paths found. Conforms to the patterns allowed in pathlib.Path.glob, by default "*"
    resolve_paths : bool, optional
        If True, all paths found are resolved to absolute paths using pathlib.Path.resolve, by default True
    polling_rate : int, optional
        Number of times per second the directory is polled for new files, by default 10

    Yields
    ------
    Generator[Path, None, None]
        Paths to new files found in the directory. Blocks until a new file is found.

    Raises
    ------
    ValueError
        If path is not a pathlib.Path object or is not an existing directory.
    ValueError
        If history_paths is not a python set.
    ValueError
        If pattern is not a str.
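
    Examples
    --------
    Illustrative sketch; ``process`` stands in for user code and ``incoming``
    is a hypothetical directory.

    >>> from pathlib import Path
    >>> for new_file in listen(Path("incoming"), pattern="*.csv"):  # doctest: +SKIP
    ...     process(new_file)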
    """
    if not isinstance(pattern, str):
        raise ValueError("Input pattern must be a str object.")

    if not isinstance(path, Path):
        raise ValueError("Input path must be a pathlib.Path object")

    if not isinstance(history_paths, set):
        raise ValueError(
            f"history_paths object must be a python set. Object provided is of type {type(history_paths)}"
        )

    if not path.is_dir():
        raise ValueError(f"Input path must be a directory; '{path}' is not a directory.")

    history_paths_converted = set()
    for hist_path in history_paths:
        history_paths_converted.add(Path(hist_path))

    yield from _listen(
        path,
        history_paths=history_paths_converted,
        pattern=pattern,
        resolve_paths=resolve_paths,
        polling_rate=polling_rate,
    )


def _listen(
    path: Path,
    *,
    history_paths: set[PathLike] = set(),
    pattern: str = "*",
    resolve_paths=True,
    polling_rate=10,
) -> Generator[Path, None, None]:
    history_info = set([FileInfo.from_path(p) for p in history_paths])

    while True:
        time.sleep(1 / polling_rate)
        if resolve_paths:
            items = set([FileInfo.from_path((p.resolve())) for p in path.glob(pattern)])
        else:
            items = set([FileInfo.from_path(p) for p in path.glob(pattern)])

        new_items = items.difference(history_info)

        if new_items:
            for item in new_items:
                yield item.path

                history_info.add(item)


def listen_with_history(
    path,
    *,
    pattern="*",
    resolve_paths=True,
    history_filepath=Path("~pydirwatch_history.tmp"),
    errors="raise",
    polling_rate=10,
):
    """Generator which polls a directory for new files and yields each new path found. Blocks until a new file is found.
    When the generator exits it saves the history of files read to disk. Restarting the generator with the same history file
    will skip any files found in the directory which match those recorded in the history file.

    Parameters
    ----------
    path : Path
        The directory which should be watched for new files.
    pattern : str, optional
        Unix glob pattern used to filter the paths found. Conforms to the patterns allowed in pathlib.Path.glob, by default "*"
    resolve_paths : bool, optional
        If True, all paths found are resolved to absolute paths using pathlib.Path.resolve, by default True
    history_filepath : pathlib.Path, optional
        Filepath of the history file used to store the persistent history of files read. The file will be created if it
        does not exist, by default Path("~pydirwatch_history.tmp")
    errors : str, optional
        Treatment of exceptions raised while a yielded path is being handled: "raise" (default) re-raises,
        "warn" prints the traceback and continues, "suppress" ignores them.
    polling_rate : int, optional
        Number of times per second the directory is polled for new files, by default 10

    Yields
    ------
    Generator[Path, None, None]
        Paths to new files found in the directory. Blocks until a new file is found.
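
    Examples
    --------
    Illustrative sketch; ``process`` stands in for user code and ``incoming``
    is a hypothetical directory.

    >>> from pathlib import Path
    >>> for new_file in listen_with_history(Path("incoming"), errors="warn"):  # doctest: +SKIP
    ...     process(new_file)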
    """
    with mangage_history(history_filepath) as h:
        for new_path in listen(
            path=path,
            history_paths=h,
            resolve_paths=resolve_paths,
            pattern=pattern,
            polling_rate=polling_rate,
        ):
            try:
                yield new_path
                h.add(new_path)

            except Exception:
                if errors == "suppress":
                    pass
                elif errors == "warn":
                    print_exc()
                else:
                    raise
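

if __name__ == "__main__":
    # Minimal manual smoke test: watch the current working directory and print
    # each new file as it appears (stop with Ctrl+C). The directory and pattern
    # here are illustrative; adjust them as needed.
    for new_file in listen(Path.cwd(), pattern="*"):
        print(new_file)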

--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[build-system]
requires = ["hatchling","hatch-vcs"]
build-backend = "hatchling.build"

[tool.hatch.version]
# path = "pydirwatch/__init__.py"
source = "vcs"

[project]
name = "python-directory-watch"
dynamic = ["version"]
# version = "0.00.00"
requires-python = ">= 3.8"
# optional-dependencies = ["pytest>=7.1"]
authors = [{name = "Noah Ridge", email = "hello@noahcridge.com"}]
readme = "README.md"


[tool.hatch.build.targets.wheel]
packages = ["/pydirwatch"]

--------------------------------------------------------------------------------
/test/integration_STOP.py:
--------------------------------------------------------------------------------
from pydirwatch import listen_with_history
import pytest
from pathlib import Path
from threading import Thread, Lock
import time


@pytest.fixture(scope="session")
def listen_dir(tmp_path_factory):
    p = tmp_path_factory.mktemp("test_listen_dir")

    return p


@pytest.fixture(scope="session")
def history_path(tmp_path_factory):
    p = tmp_path_factory.mktemp("history_file")
    file = p / Path("~history_file.tmp")

    return file


def test_listen_with_history(listen_dir, history_path):
    lock = Lock()

    def make_new_files(lock: Lock, start, stop, paths):
        for i in range(start, stop):
            time.sleep(0.1)
            with lock:
                p = listen_dir / f"test_item_{i}"
                p.touch()

                paths.append(p)

    paths1 = []

    make_file_thread = Thread(target=make_new_files, args=(lock, 0, 100, paths1))
    make_file_thread.start()

    for idx, new_path in enumerate(
        listen_with_history(listen_dir, history_filepath=history_path)
    ):
        with lock:
            assert new_path == paths1[idx]

        if idx == 90:
            break

    make_file_thread.join()
    time.sleep(3)

    # time.sleep(2)
    # paths2 = []
    # make_file_thread = Thread(target= make_new_files, args = (lock,100, 200, paths2 ))
    # make_file_thread.start()

    # for idx, new_path in enumerate(listen_with_history(listen_dir, history_filepath=history_path)):
    #     with lock:
    #         print(f"{new_path =}")
    #         # print(f"{paths[idx] = }")
    #         print(idx)
    #         print(paths2)
    #         assert new_path == paths2[idx]

    #         if idx == 90:
    #             break

--------------------------------------------------------------------------------
/test/stress_test.py:
--------------------------------------------------------------------------------
from pydirwatch import listen, mangage_history
from threading import Thread, Event
import time
import pytest
from shutil import rmtree
from pathlib import Path


def test_stress():
    found_paths = set()
    exit_event = Event()

    def run_listener(listen_path, hist, exit_event):
        for path in listen(listen_path, history_paths=hist):
            # print(path)
            found_paths.add(path)
            if exit_event.is_set():
                print("exited")
                break

    made_paths = set()

    def make_files(listen_path, exit_event):
        for idx in range(100):
            time.sleep(0.001)
            p = Path(listen_path) / f"testfile_{idx}"
            p.touch()
            made_paths.add(p.resolve())

    try:
        p = Path("test/test_dir")
        p.mkdir(exist_ok=False)

        existing = set()
        for idx in range(1_000):
            exp = Path(p) / f"testfile_{idx}_existing"
            exp.touch()
            existing.add(exp.resolve())
        time.sleep(1)

        make_files_thread = Thread(target=make_files, args=(p, exit_event))

        listen_thread = Thread(
            target=run_listener, args=(p, existing, exit_event), daemon=True
        )

        listen_thread.start()
        make_files_thread.start()

        make_files_thread.join()

        time.sleep(1)
        exit_event.set()
        print(found_paths)
        print(made_paths)

        assert found_paths == made_paths

    finally:
        rmtree((p.resolve()))

--------------------------------------------------------------------------------
/test/test_watch.py:
--------------------------------------------------------------------------------
from pydirwatch import listen, mangage_history
import pytest
from pathlib import Path


@pytest.fixture()
def empty_history_file(tmpdir):
    p = Path(tmpdir) / "temphistory.tmp"
    p.touch()

    return p


@pytest.fixture()
def filled_history_file(tmpdir):
    p = Path(tmpdir) / "temphistory2.tmp"
    p.touch()
    with p.open(mode="w") as f:
        f.write("djkfhskjhf\n")

    return p


def test_manage_history(empty_history_file):
    items = set(["fasjhf", "sdfhslfh", "sjdfhsjkdhf", "skjdfhskdjf"])

    with mangage_history(empty_history_file) as h:
        h.update(set(items))

    with open(empty_history_file) as f:
        updated_hist = set([x.strip() for x in f.readlines()])

    assert items == updated_hist


def test_manage_history_filled(filled_history_file):
    items = set(["fasjhf", "sdfhslfh", "sjdfhsjkdhf", "skjdfhskdjf"])

    with mangage_history(filled_history_file) as h:
        h.update(set(items))

    with open(filled_history_file) as f:
        updated_hist = set([x.strip() for x in f.readlines()])

    items.add("djkfhskjhf")

    assert updated_hist == items


@pytest.fixture(scope="session")
def listen_dir(tmp_path_factory):
    p = tmp_path_factory.mktemp("test_listen_dir")

    return p


def test_listen(listen_dir):
    new_file = Path(listen_dir) / Path("mytestfile.test")
    new_file.touch()

    listen_gen = listen(listen_dir)

    assert next(listen_gen) == new_file

    new_file_2 = Path(listen_dir) / Path("mytestfile2.test")
    new_file_2.touch()

    assert next(listen_gen) == new_file_2


def test_listen_errors(listen_dir):
    with pytest.raises(ValueError) as exec_info:
        next(listen(listen_dir, history_paths=["sjdfhsdjf"]))
    assert "history_paths object must be a python set." in str(exec_info.value)

    with pytest.raises(ValueError) as exec_info:
        next(listen(listen_dir, pattern=223))
    assert "Input pattern must be a str object" in str(exec_info.value)

    with pytest.raises(ValueError) as exec_info:
        next(listen("test/test"))
    assert "Input path must be a pathlib.Path object" in str(exec_info.value)

    # with pytest.warns(UserWarning, match = "not a pathlib.Path object"):

    #     next(listen(listen_dir,history_paths=set(["test/sf/sdf","sdfs/sdfsdg"])))
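

def test_listen_pattern(tmp_path):
    # Illustrative check that the glob pattern filters which files are yielded;
    # only the matching file should come back from the generator.
    matching = tmp_path / "data.csv"
    matching.touch()
    (tmp_path / "ignored.txt").touch()

    listen_gen = listen(tmp_path, pattern="*.csv")

    assert next(listen_gen) == matching.resolve()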
--------------------------------------------------------------------------------