├── .gitignore ├── LICENCE ├── README.md ├── database.ini.example ├── main.py ├── requirement.txt └── src ├── __init__.py ├── db ├── __init__.py ├── db.py └── utils.py └── ml.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. For a more nuclear 167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 168 | #.idea/ 169 | 170 | # Ruff stuff: 171 | .ruff_cache/ 172 | 173 | # PyPI configuration file 174 | .pypirc 175 | 176 | *.JPG 177 | database.ini -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | Copyright © 2025 Darshan Rander 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Picarch 2 | 3 | Picarch is a Python project for face detection and image similarity search using [insightface](https://insightface.ai/) and PostgreSQL. The project detects faces in images, encodes them, stores the embeddings along with image paths in a PostgreSQL database, and allows searching for similar images. 4 | 5 | ## Inspiration 6 | 7 | I had a collection of 12k+ photos and was too lazy go through all the photos and find pictures with me, so I built this project. 8 | 9 | I overengineered a problem and I'm opensourcing it so you don't have to - clone it and save some time in you life 🙃 10 | 11 | ## Features 12 | 13 | - Face detection and embedding using InsightFace. 14 | - Image storage and similarity search using PostgreSQL with vector data. 15 | - Command line interface to encode images, search for similar faces, and manage the database. 16 | 17 | ## Setup 18 | 19 | ### Prerequisites 20 | 21 | - Python 3.12+ 22 | - PostgreSQL database server 23 | 24 | ### Installation 25 | 26 | 1. Clone the repository and navigate into the project directory: 27 | 28 | ```bash 29 | git clone https://github.com/SirusCodes/picarch.git 30 | cd picarch 31 | ``` 32 | 33 | 2. Create a virtual environment (optional but recommended): 34 | 35 | ```bash 36 | python -m venv venv 37 | source venv/bin/activate # On Windows: venv\Scripts\activate 38 | ``` 39 | 40 | 3. Install the required packages: 41 | 42 | ```bash 43 | pip install -r requirement.txt 44 | ``` 45 | 46 | 4. Setup the database: 47 | 48 | Setup [pgvector in Postgres](https://medium.com/@besttechreads/step-by-step-guide-to-installing-pgvector-and-loading-data-in-postgresql-f2cffb5dec43) or you can use a [docker image](https://hub.docker.com/r/pgvector/pgvector). 49 | 50 | ## Running the Project 51 | 52 | Picarch provides several command line commands: 53 | 54 | ### 1. Encode Images 55 | 56 | Recursively search a directory for images, encode faces, and store the embeddings in the database. 57 | 58 | ```bash 59 | python main.py encode 60 | ``` 61 | 62 | > [!NOTE] 63 | > This will take time. I ran it overnight. 64 | 65 | ### 2. Search for a Person 66 | 67 | Provide an image of a face to search for similar images in the database. 68 | 69 | ```bash 70 | python main.py search [--output ] 71 | ``` 72 | 73 | ### 3. Truncate Tables 74 | 75 | Truncate all image and embedding tables in the database. 76 | 77 | ```bash 78 | python main.py truncate 79 | ``` 80 | 81 | ### 4. Drop Tables 82 | 83 | Drop the tables from the database. 84 | 85 | ```bash 86 | python main.py drop 87 | ``` 88 | 89 | ## Project Structure 90 | 91 | - `src/` 92 | - `ml.py`: Contains the image encoding functions. 93 | - `db/`: Contains database utils and classes to handle PostgreSQL operations. 94 | - `database.ini`: Configuration file for the PostgreSQL connection. 95 | - `requirement.txt`: Lists the project dependencies. 96 | - `main.py`: CLI entry point for the project. 97 | -------------------------------------------------------------------------------- /database.ini.example: -------------------------------------------------------------------------------- 1 | [postgresql] 2 | host=HOST 3 | database=DB_NAME 4 | user=DB_USER 5 | password=DB_PASSWORD -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import concurrent.futures 2 | from multiprocessing import Queue 3 | import os 4 | import shutil 5 | import threading 6 | import time 7 | import logging 8 | from rich.progress import Progress, TimeElapsedColumn, MofNCompleteColumn 9 | import argparse 10 | 11 | import src.ml as ml 12 | import src.db.db as db 13 | 14 | logger = logging.getLogger('picarch') 15 | 16 | logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') 17 | 18 | progress = Progress( 19 | *Progress.get_default_columns(), 20 | TimeElapsedColumn(), 21 | MofNCompleteColumn(), 22 | ) 23 | 24 | def encode_image(image_uri: str): 25 | try: 26 | embeddings = ml.encode(image_uri) 27 | return embeddings 28 | except Exception as e: 29 | logger.error(f"Process: Error embedding {image_uri}: {e}") 30 | 31 | def update_db(picarch: db.Picarch, queue: Queue): 32 | logger.debug("Thread: Database thread started") 33 | try: 34 | while True: 35 | if not queue.empty(): 36 | image_uri, embeddings = queue.get() 37 | if image_uri is None: 38 | logger.info("Thread: Database thread received None, exiting") 39 | break 40 | if embeddings is None: 41 | continue 42 | image_id = picarch.insert_image_path(image_uri) 43 | picarch.insert_image_embeddings(image_id, embeddings) 44 | else: 45 | time.sleep(1) 46 | except Exception as e: 47 | logger.error(f"Thread: Database thread error: {e}") 48 | 49 | def get_images_from_path(path): 50 | image_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', '.webp') 51 | images = [] 52 | for root, dirs, files in os.walk(path): 53 | for file in files: 54 | if file.lower().endswith(image_extensions): 55 | images.append(os.path.join(root, file)) 56 | logger.info(f"Main: Found {len(images)} image(s).") 57 | return images 58 | 59 | def run_truncate(picarch: db.Picarch): 60 | logger.info("Truncating the database...") 61 | picarch.trunate_tables() 62 | logger.info("Database truncated.") 63 | 64 | def run_drop(picarch: db.Picarch): 65 | logger.info("Dropping the database...") 66 | picarch.drop_tables() 67 | logger.info("Database dropped.") 68 | 69 | def main(): 70 | parser = argparse.ArgumentParser(description="Picarch CLI") 71 | subparsers = parser.add_subparsers(dest="command", help="Sub-command help") 72 | 73 | encode_parser = subparsers.add_parser("encode", help="Encode images from a given directory") 74 | encode_parser.add_argument("path", help="Path to search for images") 75 | 76 | search = subparsers.add_parser("search", help="Search for person in images") 77 | search.add_argument("path", help="Person to search for") 78 | search.add_argument("--output", help="Output folder for found images", default="find") 79 | 80 | subparsers.add_parser("truncate", help="Truncate the tables") 81 | 82 | subparsers.add_parser("drop", help="Drop the tables") 83 | 84 | 85 | args = parser.parse_args() 86 | 87 | picarch = db.Picarch() 88 | 89 | if args.command == "encode": 90 | run_processes(args.path, picarch) 91 | elif args.command == "truncate": 92 | run_truncate(picarch) 93 | elif args.command == "drop": 94 | run_drop(picarch) 95 | elif args.command == "search": 96 | run_search(args.output, args.path, picarch) 97 | else: 98 | parser.print_help() 99 | 100 | def run_search(output: str,path: str , picarch: db.Picarch): 101 | logger.info("Searching for person...") 102 | embeddings = ml.encode(path) 103 | if len(embeddings) != 1: 104 | logger.error("Please provide exactly one face to search for.") 105 | return 106 | results = picarch.get_image_similar_to_embedding(embeddings[0]) 107 | logger.info(f"Found {len(results)} result(s).") 108 | find_folder = os.path.join(os.getcwd(), output) 109 | os.makedirs(find_folder, exist_ok=True) 110 | for result in results: 111 | destination = os.path.join(find_folder, os.path.basename(result[0])) 112 | try: 113 | if os.path.exists(result[0]): 114 | shutil.copy2(result[0], destination) 115 | except Exception as e: 116 | logger.error(f"Error copying {result[0]} to {destination}: {e}") 117 | logger.info("Done searching.") 118 | 119 | def get_images_not_embedded(picarch: db.Picarch, images: list[str]): 120 | logger.info("Main: Checking for images that need embedding...") 121 | embedded_images = picarch.get_all_image_paths() 122 | embedded_images = [image[0] for image in embedded_images] 123 | logger.info(f"Main: Found {len(embedded_images)} embedded image(s).") 124 | 125 | job_remaining = [] 126 | for image in images: 127 | if image not in embedded_images: 128 | job_remaining.append(image) 129 | logger.info(f"Main: Found {len(job_remaining)} image(s) to embed.") 130 | return job_remaining 131 | 132 | def run_processes(path, picarch): 133 | images = get_images_from_path(path) 134 | 135 | images = get_images_not_embedded(picarch, images) 136 | 137 | queue = Queue() 138 | db_thread = threading.Thread(target=update_db, args=(picarch, queue,)) 139 | db_thread.start() 140 | 141 | logger.info("Main: embedding images...") 142 | with progress: 143 | task = progress.add_task("[cyan]Embedding images...", total=len(images)) 144 | with concurrent.futures.ProcessPoolExecutor() as executor: 145 | for uri, embedding in zip(images, executor.map(encode_image, images)): 146 | progress.update(task, advance=1) 147 | if embedding is None or len(embedding) == 0: 148 | progress.console.log(f"[bold red]Main: No face found in {uri}[/bold red]") 149 | continue 150 | progress.console.log(f"Embedded {uri}") 151 | queue.put((uri, embedding)) 152 | logger.info("Main: Done embedding images.") 153 | 154 | queue.put((None, None)) 155 | db_thread.join() 156 | 157 | if __name__ == "__main__": 158 | main() -------------------------------------------------------------------------------- /requirement.txt: -------------------------------------------------------------------------------- 1 | psycopg2==2.9.10 2 | setuptools==77.0.3 3 | opencv-python==4.11.0.86 4 | insightface==0.7.3 -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SirusCodes/picarch/e61fdd0f8532f3c25d4731ef2edbd33c816449ec/src/__init__.py -------------------------------------------------------------------------------- /src/db/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SirusCodes/picarch/e61fdd0f8532f3c25d4731ef2edbd33c816449ec/src/db/__init__.py -------------------------------------------------------------------------------- /src/db/db.py: -------------------------------------------------------------------------------- 1 | import psycopg2 2 | from pgvector.psycopg2 import register_vector 3 | from . import utils 4 | 5 | class Picarch: 6 | def __init__(self): 7 | self.conn = utils.connect(utils.load_config()) 8 | self.create_tables(self.conn) 9 | 10 | def insert_image_path(self, path: str) -> int: 11 | """ insert a new image path into the image_paths table """ 12 | sql = """INSERT INTO image_paths(path) VALUES(%s) RETURNING id;""" 13 | with self.conn.cursor() as cur: 14 | cur.execute(sql, (path,)) 15 | image_id = cur.fetchone()[0] 16 | self.conn.commit() 17 | return image_id 18 | 19 | def get_all_image_paths(self) -> list: 20 | """ get all image paths from the image_paths table """ 21 | sql = """SELECT path FROM image_paths;""" 22 | with self.conn.cursor() as cur: 23 | cur.execute(sql) 24 | return cur.fetchall() 25 | 26 | 27 | def get_image_similar_to_embedding(self, embedding: list) -> list: 28 | """ get similar images to the given image embedding """ 29 | sql = """ 30 | SELECT DISTINCT ON (ip.path) 31 | ip.path, ie.embeddings <=> %s::vector as distance 32 | FROM image_embeddings ie 33 | JOIN image_paths ip ON ie.image_id = ip.id 34 | WHERE 1 - (ie.embeddings <=> %s::vector) >= 0.4 35 | """ 36 | with self.conn.cursor() as cur: 37 | cur.execute(sql, (embedding, embedding,)) 38 | return cur.fetchall() 39 | 40 | def insert_image_embeddings(self, image_id: int, embeddings: list[list]): 41 | """ insert multiple image embeddings into the image_embeddings table """ 42 | for embedding in embeddings: 43 | if len(embedding) != 512: 44 | raise ValueError("Each embedding must have exactly 512 elements.", image_id) 45 | sql = """INSERT INTO image_embeddings(image_id, embeddings) VALUES(%s, %s);""" 46 | with self.conn.cursor() as cur: 47 | for embedding in embeddings: 48 | cur.execute(sql, (image_id, embedding)) 49 | self.conn.commit() 50 | 51 | 52 | def create_tables(self, conn: psycopg2.extensions.connection): 53 | """ create tables in the PostgreSQL database """ 54 | commands = ( 55 | """ 56 | CREATE EXTENSION IF NOT EXISTS vector; 57 | """, 58 | """ 59 | CREATE TABLE IF NOT EXISTS image_paths ( 60 | id SERIAL PRIMARY KEY, 61 | path TEXT NOT NULL UNIQUE 62 | ) 63 | """, 64 | """ 65 | CREATE TABLE IF NOT EXISTS image_embeddings ( 66 | id SERIAL PRIMARY KEY, 67 | image_id INTEGER NOT NULL, 68 | embeddings VECTOR(512) NOT NULL, 69 | FOREIGN KEY (image_id) REFERENCES image_paths (id) ON DELETE CASCADE 70 | ) 71 | """, 72 | """ 73 | CREATE TABLE IF NOT EXISTS image_clusters ( 74 | id SERIAL PRIMARY KEY, 75 | image_id INTEGER NOT NULL, 76 | cluster INTEGER NOT NULL, 77 | FOREIGN KEY (image_id) REFERENCES image_paths (id) ON DELETE CASCADE 78 | ) 79 | """ 80 | ) 81 | 82 | try: 83 | # create a cursor 84 | with conn.cursor() as cur: 85 | # create table one by one 86 | for command in commands: 87 | cur.execute(command) 88 | # close communication with the PostgreSQL database server 89 | cur.close() 90 | # commit the changes 91 | conn.commit() 92 | register_vector(conn) 93 | print('Tables created successfully.') 94 | except (Exception, psycopg2.DatabaseError) as error: 95 | print(error) 96 | 97 | def trunate_tables(self): 98 | """ truncate tables in the PostgreSQL database """ 99 | commands = ( 100 | """ 101 | TRUNCATE TABLE image_paths CASCADE; 102 | """, 103 | """ 104 | TRUNCATE TABLE image_embeddings CASCADE; 105 | """, 106 | """ 107 | TRUNCATE TABLE image_clusters CASCADE; 108 | """ 109 | ) 110 | 111 | try: 112 | # create a cursor 113 | with self.conn.cursor() as cur: 114 | # create table one by one 115 | for command in commands: 116 | cur.execute(command) 117 | # close communication with the PostgreSQL database server 118 | cur.close() 119 | # commit the changes 120 | self.conn.commit() 121 | print('Tables truncated successfully.') 122 | except (Exception, psycopg2.DatabaseError) as error: 123 | print(error) 124 | 125 | def drop_tables(self): 126 | """ drop tables in the PostgreSQL database """ 127 | commands = ( 128 | """ 129 | DROP TABLE IF EXISTS image_paths CASCADE; 130 | """, 131 | """ 132 | DROP TABLE IF EXISTS image_embeddings CASCADE; 133 | """, 134 | """ 135 | DROP TABLE IF EXISTS image_clusters CASCADE; 136 | """ 137 | ) 138 | 139 | try: 140 | # create a cursor 141 | with self.conn.cursor() as cur: 142 | # create table one by one 143 | for command in commands: 144 | cur.execute(command) 145 | # close communication with the PostgreSQL database server 146 | cur.close() 147 | # commit the changes 148 | self.conn.commit() 149 | print('Tables dropped successfully.') 150 | except (Exception, psycopg2.DatabaseError) as error: 151 | print(error) 152 | 153 | def __del__(self): 154 | self.conn.close() -------------------------------------------------------------------------------- /src/db/utils.py: -------------------------------------------------------------------------------- 1 | from configparser import ConfigParser 2 | 3 | import psycopg2 4 | import numpy 5 | 6 | def load_config(filename='database.ini', section='postgresql'): 7 | parser = ConfigParser() 8 | parser.read(filename) 9 | 10 | # get section, default to postgresql 11 | config = {} 12 | if parser.has_section(section): 13 | params = parser.items(section) 14 | for param in params: 15 | config[param[0]] = param[1] 16 | else: 17 | raise Exception('Section {0} not found in the {1} file'.format(section, filename)) 18 | 19 | return config 20 | 21 | def connect(config): 22 | """ Connect to the PostgreSQL database server """ 23 | try: 24 | # connecting to the PostgreSQL server 25 | with psycopg2.connect(**config) as conn: 26 | print('Connected to the PostgreSQL server.') 27 | return conn 28 | except (psycopg2.DatabaseError, Exception) as error: 29 | print(error) 30 | -------------------------------------------------------------------------------- /src/ml.py: -------------------------------------------------------------------------------- 1 | import PIL 2 | import numpy 3 | import cv2 4 | from insightface.app import FaceAnalysis 5 | 6 | app = FaceAnalysis(name='buffalo_l', providers=['CPUExecutionProvider']) # Use 'CUDAExecutionProvider' for GPU 7 | app.prepare(ctx_id=-1) 8 | 9 | def encode(image_uri: str) -> list[numpy.float32]: 10 | image = PIL.Image.open(image_uri) 11 | image = numpy.array(image) 12 | small_frame = cv2.resize(image, (0, 0), fx=0.25, fy=0.25) 13 | rgb_small_frame = cv2.cvtColor(small_frame, cv2.COLOR_BGR2RGB) 14 | 15 | faces = app.get(rgb_small_frame) 16 | 17 | if faces is None: 18 | return [] 19 | 20 | return [face.embedding for face in faces] 21 | 22 | --------------------------------------------------------------------------------