├── .gitignore ├── README.md ├── openvid ├── README.md ├── openvid.py ├── openvid_part_id_parquet.py └── zipstream.py └── video_processing ├── README.md ├── add_aesthetic_laion_score.py ├── add_captions.py ├── add_motion_score.py ├── add_nsfw_score.py ├── add_shot_categories.py ├── add_watermark_laion_score.py ├── extract_frames.py ├── folder_to_parquet.py ├── modules ├── __init__.py ├── aesthetic_laion.py ├── caption_object_ocr.py ├── frames.py ├── nsfw.py ├── optical_flow.py └── watermark_laion.py ├── reference_video_similarity.py ├── requirements.txt ├── scene_split.py └── video_to_scenes.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 162 | #.idea/ 163 | 164 | .ruff_cache/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # video-dataset-scripts 2 | 3 | Tooling for image generation datasets is well established, with [`img2dataset`](https://github.com/rom1504/img2dataset) covering large scale and various community guides, scripts and UIs covering the small scale. 4 | 5 | Our goal is to make tooling for video generation datasets as established by creating open video dataset scripts suited for small scale, with [`video2dataset`](https://github.com/iejMac/video2dataset) covering large scale. 6 | 7 | *“If I have seen further it is by standing on the shoulders of giants”* 8 | 9 | The repository consists of the tooling we are developing to make it easy for the community to build their own datasets for fine-tuning video generation models. Check out our introduction [blog post](https://huggingface.co/blog/vid_ds_scripts) to learn more and head to the [`video_processing`](./video_processing) directory to get started. 10 | 11 | We plan to keep this repository easy to follow along and nimble. 12 | 13 | ## Features planned 14 | 15 | - [ ] Age detector 16 | - [ ] Scene categorizer 17 | -------------------------------------------------------------------------------- /openvid/README.md: -------------------------------------------------------------------------------- 1 | # OpenVid 2 | 3 | Script to filter and download videos from [datasets/nkp37/OpenVid-1M](https://huggingface.co/datasets/nkp37/OpenVid-1M) without downloading the entire dataset. 4 | 5 | ## Usage 6 | 7 | 1. Download [`openvid.parquet`](https://huggingface.co/datasets/bigdata-pw/OpenVid-1M/resolve/main/openvid.parquet?download=true) from [datasets/bigdata-pw/OpenVid-1M](https://huggingface.co/datasets/bigdata-pw/OpenVid-1M), this version has part numbers linked to each filename. 8 | 2. Edit `PARQUET_PATH` and `BASE_PATH`. 9 | 3. Optionally change the filtering, `aesthetic = df.loc[df["aesthetic score"] >= 7]` 10 | 4. Run the script 11 | 12 | This will 13 | 1. Read then filter the parquet 14 | 2. Get the zip central directory for each part number from the filtered set 15 | - Only 64KB per part is downloaded 16 | 3. Extract all filenames, offsets and sizes from each central directory 17 | 4. 
Filter the extracted filenames to what we want 18 | 5. Download each video to `BASE_PATH`, using 8 threads. 19 | 20 | `aesthetic score >= 7` (without the multipart zips) filters to 17247 videos and downloads only ~118GB instead of ~7TB for the full set! 21 | 22 | ## TODO 23 | 24 | Support downloading from the multipart zips 25 | 26 | # OpenVid part id parquet 27 | 28 | [`openvid.parquet`](https://huggingface.co/datasets/bigdata-pw/OpenVid-1M/resolve/main/openvid.parquet?download=true) from [datasets/bigdata-pw/OpenVid-1M](https://huggingface.co/datasets/bigdata-pw/OpenVid-1M) was produced using `openvid_part_id_parquet.py`. 29 | 30 | ## Usage 31 | 32 | 1. Download [OpenVid-1M.csv](https://huggingface.co/datasets/nkp37/OpenVid-1M/resolve/main/data/train/OpenVid-1M.csv?download=true) 33 | 2. Run the script 34 | 35 | This will 36 | 1. Read the csv into a dataframe 37 | 2. Get the zip central directory for each part number 38 | - Only 64KB per part is downloaded 39 | 3. Extract filenames from each central directory 40 | 4. Merge `part_id` into the dataframe according to filename 41 | 5. Save `openvid.parquet` 42 | -------------------------------------------------------------------------------- /openvid/openvid.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ThreadPoolExecutor, as_completed 2 | import pandas as pd 3 | from tqdm import tqdm 4 | 5 | from zipstream import ZipStream 6 | 7 | PARQUET_PATH = "openvid.parquet" 8 | BASE_PATH = "H:/openvid" 9 | 10 | # skip these for now 11 | MULTI_PART = {73, 76, 78, 83, 88, 89, 92, 95, 96, 102, 103, 111, 118, 183, 184, 185} 12 | 13 | URL = "https://huggingface.co/datasets/nkp37/OpenVid-1M/resolve/main/OpenVid_part{part}.zip?download=true" 14 | 15 | 16 | df = pd.read_parquet(PARQUET_PATH) 17 | 18 | aesthetic = df.loc[df["aesthetic score"] >= 7] 19 | aesthetic = aesthetic.loc[~df["part_id"].isin(MULTI_PART)] 20 | part_ids = list(aesthetic["part_id"].unique()) 21 | filenames = set(aesthetic["video"]) 22 | 23 | 24 | for part_id in part_ids: 25 | stream = ZipStream(URL.format(part=part_id)) 26 | files = list(filter(lambda file: file.filename.split("/")[-1] in filenames, stream.files)) 27 | 28 | with ThreadPoolExecutor(max_workers=8) as executor: 29 | pbar = tqdm(desc="download", total=len(files)) 30 | futures = {} 31 | for file in files: 32 | filename = file.filename.split("/")[-1] 33 | futures[executor.submit(file.download, filename, BASE_PATH)] = file 34 | for future in as_completed(futures): 35 | _ = future.result() 36 | pbar.update() 37 | -------------------------------------------------------------------------------- /openvid/openvid_part_id_parquet.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from zipstream import ZipStream 3 | import tqdm 4 | 5 | df = pd.read_csv("OpenVid-1M.csv") 6 | 7 | part_ids = list(range(0, 183)) 8 | for multi_part in {73, 76, 78, 83, 88, 89, 92, 95, 96, 102, 103, 111, 118}: 9 | part_ids.remove(multi_part) 10 | 11 | url = "https://huggingface.co/datasets/nkp37/OpenVid-1M/resolve/main/OpenVid_part{part}.zip?download=true" 12 | 13 | filename_part = [] 14 | 15 | for part_id in tqdm.tqdm(part_ids): 16 | stream = ZipStream(url.format(part=part_id)) 17 | filename_part.extend( 18 | [ 19 | { 20 | "video": file.filename.split("/")[-1], 21 | "part_id": part_id, 22 | "file_offset": file.file_offset, 23 | "file_size": file.file_size, 24 | } 25 | for file in stream.files 26 | ] 27 | ) 28 | 29 | # for 
split parts we get 1 byte of part a to find the size 30 | # for part b the central directory offset is - size of part a 31 | url_multipart_a = ( 32 | "https://huggingface.co/datasets/nkp37/OpenVid-1M/resolve/main/OpenVid_part{part}_partaa?download=true" 33 | ) 34 | url_multipart = "https://huggingface.co/datasets/nkp37/OpenVid-1M/resolve/main/OpenVid_part{part}_partab?download=true" 35 | 36 | for part_id in tqdm.tqdm({73, 76, 78, 83, 88, 89, 92, 95, 96, 102, 103, 111, 118, 183, 184, 185}): 37 | offset = ZipStream.size(url_multipart_a.format(part=part_id)) 38 | stream = ZipStream(url_multipart.format(part=part_id), offset=offset) 39 | filename_part.extend( 40 | [ 41 | { 42 | "video": file.filename.split("/")[-1], 43 | "part_id": part_id, 44 | "file_offset": file.file_offset, 45 | "file_size": file.file_size, 46 | } 47 | for file in stream.files 48 | ] 49 | ) 50 | 51 | data = pd.DataFrame(filename_part) 52 | 53 | df = df.merge(data, how="left") 54 | df["part_id"] = df["part_id"].astype(pd.Int64Dtype()) 55 | df["file_offset"] = df["file_offset"].astype(pd.Int64Dtype()) 56 | df["file_size"] = df["file_size"].astype(pd.Int64Dtype()) 57 | 58 | df.to_parquet("openvid.parquet") 59 | -------------------------------------------------------------------------------- /openvid/zipstream.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | import pathlib 3 | import requests 4 | import struct 5 | import tqdm 6 | from typing import Optional 7 | import zlib 8 | 9 | 10 | @dataclass 11 | class LocalFileHeader: 12 | signature: bytes 13 | version: int 14 | flag: int 15 | method: int 16 | modification_time: int 17 | modification_date: int 18 | crc32: int 19 | compressed_size: int 20 | uncompressed_size: int 21 | file_name_length: int 22 | extra_field_length: int 23 | 24 | 25 | @dataclass 26 | class CentralDirectoryFileHeader: 27 | signature: bytes 28 | version: int 29 | minimum_version: int 30 | flag: int 31 | method: int 32 | modification_time: int 33 | modification_date: int 34 | crc32: int 35 | compressed_size: int 36 | uncompressed_size: int 37 | file_name_length: int 38 | extra_field_length: int 39 | file_comment_length: int 40 | disk_number: int 41 | internal_file_attributes: int 42 | external_file_attributes: int 43 | relative_offset: int 44 | 45 | 46 | class ZipStreamFile: 47 | def __init__( 48 | self, 49 | url: str, 50 | filename: str, 51 | file_offset: int, 52 | file_size: int, 53 | ): 54 | self.url = url 55 | self.filename = filename 56 | self.file_offset = file_offset 57 | self.file_size = file_size 58 | 59 | def download( 60 | self, 61 | filename: Optional[str] = None, 62 | base_path: Optional[str] = None, 63 | ): 64 | struct_format = "<4sHHHHHIIIHH" 65 | struct_size = struct.calcsize(struct_format) 66 | headers = {"Range": f"bytes={self.file_offset}-{self.file_offset+struct_size-1}"} 67 | local_file_header = requests.get(self.url, headers=headers, stream=True).content 68 | local_file_header = LocalFileHeader(*struct.unpack(struct_format, local_file_header)) 69 | data_offset = struct_size + local_file_header.file_name_length + local_file_header.extra_field_length 70 | headers = {"Range": f"bytes={self.file_offset+data_offset}-{self.file_offset+data_offset+self.file_size-1}"} 71 | data = requests.get(self.url, headers=headers, stream=True).content 72 | if local_file_header.method == 8: 73 | data = zlib.decompress(data, -15) 74 | elif local_file_header.method != 0: 75 | raise ValueError("Unsupported compression method.") 76 | 
filename = filename or self.filename
77 | if base_path is not None and filename is not None:
78 | with open(f"{base_path}/{filename}", "wb") as f:
79 | f.write(data)
80 | return data
81 |
82 | def __repr__(self):
83 | return f"ZipStreamFile(\n\turl={self.url},\n\tfilename={self.filename},\n\tfile_offset={self.file_offset},\n\tfile_size={self.file_size}\n)"
84 |
85 |
86 | class ZipStream:
87 | tail_size: int = 65536
88 |
89 | @classmethod
90 | def size(self, url: str):
91 | headers = {"Range": f"bytes=-1"}
92 | return int(requests.get(url, headers=headers).headers["Content-Range"].split("/")[-1])
93 |
94 | @classmethod
95 | def get_central_directory(self, url: str, offset: Optional[int] = None):
96 | headers = {"Range": f"bytes=-{self.tail_size}"}
97 | tail_data = requests.get(url, headers=headers, stream=True).content
98 | zip64_eocd = b"\x50\x4b\x06\x06"
99 | eocd_offset = tail_data.rfind(zip64_eocd)
100 | eocd = tail_data[eocd_offset:]
101 | cd_offset = int.from_bytes(eocd[48 : 48 + 8], byteorder="little")
102 | if offset is not None:
103 | cd_offset -= offset  # shift the offset into this part of a multi-part archive
104 | headers = {"Range": f"bytes={cd_offset}-"}
105 | central_directory = requests.get(url, headers=headers, stream=True).content
106 | return central_directory
107 |
108 | @classmethod
109 | def get_files(self, url: str, central_directory: bytes, file_to_get: str = None):
110 | files = []
111 | offset = 0
112 | while offset <= len(central_directory):
113 | file, offset = ZipStream.get_file(url=url, central_directory=central_directory, offset=offset)
114 | if file is None:
115 | continue
116 | if file_to_get is None:
117 | files.append(file)
118 | elif file_to_get is not None and file_to_get in file.filename:
119 | return file
120 | return files
121 |
122 | @classmethod
123 | def get_file(self, url: str, central_directory: bytes, offset: int):
124 | struct_format = "<4sHHHHHHIIIHHHHHII"
125 | struct_size = struct.calcsize(struct_format)
126 | buffer = central_directory[offset : offset + struct_size]
127 | if len(buffer) < struct_size:
128 | return None, offset + struct_size
129 | central_directory_file_header = CentralDirectoryFileHeader(*struct.unpack(struct_format, buffer))
130 | filename = central_directory[
131 | offset + struct_size : offset + struct_size + central_directory_file_header.file_name_length
132 | ].decode("utf-8")
133 | next_offset = (
134 | offset
135 | + struct_size
136 | + central_directory_file_header.file_name_length
137 | + central_directory_file_header.extra_field_length
138 | + central_directory_file_header.file_comment_length
139 | )
140 | if not filename:
141 | return None, next_offset
142 | is_zip64 = (central_directory_file_header.compressed_size == 2**32 - 1) or (
143 | central_directory_file_header.relative_offset == 2**32 - 1
144 | )
145 | if is_zip64:
146 | extra = central_directory[
147 | offset + struct_size + central_directory_file_header.file_name_length : next_offset
148 | ]
149 | central_directory_file_header.relative_offset = int.from_bytes(extra[-8:], byteorder="little")
150 | return (
151 | ZipStreamFile(
152 | url=url,
153 | filename=filename,
154 | file_offset=central_directory_file_header.relative_offset,
155 | file_size=central_directory_file_header.compressed_size,
156 | ),
157 | next_offset,
158 | )
159 |
160 | def __init__(
161 | self,
162 | url: str,
163 | central_directory: Optional[bytes] = None,
164 | offset: Optional[int] = None,
165 | ):
166 | self.url = url
167 | central_directory = central_directory or ZipStream.get_central_directory(url=self.url, offset=offset)
168 |
self.central_directory = central_directory
169 | self.files = ZipStream.get_files(url=self.url, central_directory=self.central_directory)
170 |
--------------------------------------------------------------------------------
/video_processing/README.md:
--------------------------------------------------------------------------------
1 | # Video Processing
2 |
3 | ## Overview of the available filters
4 | * Watermark detection
5 | * Aesthetic scoring
6 | * NSFW scoring
7 | * Motion scoring
8 | * Filtering videos w.r.t. reference videos/images
9 | * Shot categories (color, lighting, composition, etc.)
10 |
11 | ## Prerequisite
12 | The examples use the folder `cakeify/`; this can be any folder with videos.
13 |
14 | ## Folder to Parquet
15 |
16 | The first step: this creates a basic parquet with a `file` column, which is the filename of each video in `path`.
17 |
18 | Other scripts join to this parquet.
19 |
20 | ```sh
21 | python folder_to_parquet.py --path cakeify/ --out-path cakeify.parquet
22 | ```
23 |
24 | ## Extract frames
25 |
26 | The second step: this extracts up to 3 key frames for use in captioning, watermark detection, etc.
27 |
28 | If there are 1 or more key frames, we take the `first`.
29 | If there are only 2 key frames, we take the `first` and `last`.
30 | If there are 3 or more key frames, we take the `first`, `mid` and `last`.
31 |
32 | ```sh
33 | python extract_frames.py --path cakeify/ --frames-path frames/ --parquet-path cakeify.parquet --parquet-out-path cakeify.parquet
34 | ```
35 |
36 | `--path` is the folder with videos.
37 | `--frames-path` is the folder where frames are saved.
38 | `--parquet-path` is the `--out-path` from the first step.
39 | `--parquet-out-path` if you want different versions e.g. `--parquet-out-path cakeify_frames.parquet`
40 |
41 | ## Add Captions
42 |
43 | This will use Florence-2 `microsoft/Florence-2-large` to run `<CAPTION>`, `<DETAILED_CAPTION>`, `<DENSE_REGION_CAPTION>` and `<OCR_WITH_REGION>` on extracted key frames.
44 |
45 | This uses extracted frames from step 2.
46 |
47 | The lists of results are added to the dataframe as `caption`, `detailed_caption`, `region_caption` and `ocr` columns.
48 |
49 | ```sh
50 | python add_captions.py --path frames/ --parquet-path cakeify.parquet --parquet-out-path cakeify.parquet --device cuda --dtype float16
51 | ```
52 |
53 | `--path` is the folder with **frames**.
54 | `--parquet-path` is the `--out-path` from the first step or the `--parquet-out-path` from step 2 if you changed it.
55 | `--parquet-out-path` if you want different versions e.g. `--parquet-out-path cakeify_captions.parquet`
56 |
57 |
58 | ## Add Watermark Laion Score
59 |
60 | This will use [LAION-5B-WatermarkDetection](https://github.com/LAION-AI/LAION-5B-WatermarkDetection) to detect watermarks on extracted frames.
61 |
62 | This uses extracted frames from step 2.
63 |
64 | The list of scores is added to the dataframe as the `pwatermark` column.
65 |
66 | ```sh
67 | python add_watermark_laion_score.py --path frames/ --parquet-path cakeify.parquet --parquet-out-path cakeify.parquet --device cpu
68 | ```
69 |
70 | It will automatically download the weights for the watermark scorer from [here](https://huggingface.co/finetrainers/laion-watermark-detection). You can also specify your own through the `--model` argument.
71 |
72 | `--path` is the folder with **frames**.
73 | `--parquet-path` is the `--out-path` from the first step or the `--parquet-out-path` from step 2 if you changed it.
74 | `--parquet-out-path` if you want different versions e.g. `--parquet-out-path cakeify_captions.parquet`
75 | `--device cuda` is optional as this model is fast on CPU.
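
If you want to score individual frames outside of the parquet workflow, the module functions used by the script can be called directly. A minimal sketch (assumes it is run from the `video_processing/` directory so that `modules` is importable, and that `frames/` contains frames extracted in step 2; the frame name below is just an example):

```python
from PIL import Image

from modules import load_watermark_laion, run_watermark_laion

# Loads the EfficientNet-B3 watermark detector; when model_path is None the weights
# are downloaded from the Hub (same behaviour as the --model argument).
load_watermark_laion(device="cpu", model_path=None)

frame = Image.open("frames/-IvRtqwaetM-Scene-050_0.jpg")  # any extracted frame
scores = run_watermark_laion([frame])  # tensor with one pwatermark score per image
print(scores[0].item())
```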
76 |
77 |
78 | ## Add Aesthetic Laion Score
79 |
80 | This will use [improved-aesthetic-predictor](https://github.com/christophschuhmann/improved-aesthetic-predictor) to predict an aesthetic score on extracted frames.
81 |
82 | This uses extracted frames from step 2.
83 |
84 | The list of scores is added to the dataframe as the `aesthetic_score` column.
85 |
86 | ```sh
87 | python add_aesthetic_laion_score.py --path frames/ --parquet-path cakeify.parquet --parquet-out-path cakeify.parquet --device cpu --dtype float32
88 | ```
89 |
90 | It will automatically download the MLP params for the aesthetics predictor from [here](https://huggingface.co/trl-lib/ddpo-aesthetic-predictor). You can also specify your own through the `--model` argument.
91 |
92 | `--path` is the folder with **frames**.
93 | `--parquet-path` is the `--out-path` from the first step or the `--parquet-out-path` from step 2 if you changed it.
94 | `--parquet-out-path` if you want different versions e.g. `--parquet-out-path cakeify_captions.parquet`
95 |
96 | Usable on CPU at around 1s per image, but `--device cuda` and `--dtype float16` are recommended for performance.
97 |
98 | ## Add NSFW Score
99 |
100 | This will use the [Falconsai/nsfw_image_detection](https://huggingface.co/Falconsai/nsfw_image_detection) model to predict an NSFW score on a frame-by-frame basis.
101 |
102 | This uses extracted frames from step 2.
103 |
104 | The list of labels is added to the dataframe as the `nsfw_status` column.
105 |
106 | ```sh
107 | python add_nsfw_score.py --path frames/ --parquet-path cakeify.parquet --parquet-out-path cakeify.parquet --device cuda
108 | ```
109 |
110 | `--path` is the folder with **frames**.
111 | `--parquet-path` is the `--out-path` from the first step or the `--parquet-out-path` from step 2 if you changed it.
112 | `--parquet-out-path` if you want different versions e.g. `--parquet-out-path cakeify_captions.parquet`
113 |
114 | Usable on CPU at around 1s per image, but `--device cuda` is recommended for performance.
115 |
116 | Output should look like so:
117 |
118 | ```sh
119 | nsfw_status
120 | 0 [normal]
121 | 1 [normal]
122 | 2 [normal]
123 | file frames nsfw_status
124 | 0 -IvRtqwaetM-Scene-050.mp4 [-IvRtqwaetM-Scene-050_0.jpg] [normal]
125 | 1 -IvRtqwaetM-Scene-002.mp4 [-IvRtqwaetM-Scene-002_0.jpg] [normal]
126 | 2 -IvRtqwaetM-Scene-005.mp4 [-IvRtqwaetM-Scene-005_0.jpg] [normal]
127 | ```
128 |
129 | ## Add Motion Score
130 |
131 | This will use opencv to calculate a "motion score" with `OpticalFlowFarneback` and `OpticalFlowPyrLK` on extracted key frames.
132 |
133 | Different from captions and watermark, this will use all key frames; if there is only 1 key frame, we also read the first frame of the video.
134 |
135 | The scores are added to the dataframe with `motion_fb` and `motion_lk` columns.
136 |
137 | ```sh
138 | python add_motion_score.py --path cakeify/ --parquet-out-path cakeify.parquet --parquet-path cakeify.parquet
139 | ```
140 |
141 | `--path` is the folder with **videos**.
142 | `--parquet-path` is the `--out-path` from the first step or the `--parquet-out-path` from another step if you changed it.
143 | `--parquet-out-path` if you want different versions e.g. `--parquet-out-path cakeify_motion_score.parquet`
144 |
145 | ## Add Shot Categories
146 |
147 | This will use a fine-tune of Florence-2, [diffusers/shot-categorizer-v0](https://huggingface.co/diffusers/shot-categorizer-v0), to infer shot information on the key video frames.
We follow the same strategy as above: 148 | 149 | > Different than captions and watermark, this will use all key frames, if there is only 1 key frame, we also read the first frame of the video. 150 | 151 | The categories are added to the dataframe with `color`, `lighting`, `lighting_type`, and 152 | `composition` columns. 153 | 154 | ```sh 155 | python add_shot_categories.py --path frames/ --parquet-path cakeify.parquet --parquet-out-path cakeify.parquet --device cuda --dtype float16 156 | ``` 157 | 158 | `--path` is the folder with **frames**. 159 | `--parquet-path` is the `--out-path` from the first step or the `--parquet-out-path` from step 2 if you changed it. 160 | `--parquet-out-path` if you want to different versions `--parquet-out-path cakeify_captions.parquet` 161 | 162 | Sample output: 163 | 164 | ```sh 165 | color ... composition 166 | 0 [Desaturated, Black and White, Desaturated, Bl... ... [Center, Balanced] 167 | 1 [Desaturated, Black and White] ... [Balanced] 168 | 2 [Desaturated, Black and White] ... [Center] 169 | 3 [Desaturated, Black and White, Desaturated, Bl... ... [Left heavy, Left heavy] 170 | 4 [Desaturated, Black and White] ... [Balanced] 171 | .. ... ... ... 172 | ``` 173 | 174 | ## Example Output 175 | 176 | ``` 177 | file motion_fb motion_lk caption detailed_caption 178 | 0 -h5KF2SffqI-Scene-002.mp4 -6.782037e-08 0.061066 [listerine cool mint mouthwash] [The image shows a bottle of listerine cool mi... 179 | 1 -h5KF2SffqI-Scene-003.mp4 4.928587e-01 0.654230 [A small aloe vera plant in a brown pot on a b... [The image shows an aloe vera plant in a pot o... 180 | 2 -h5KF2SffqI-Scene-006.mp4 4.287588e+00 1.033444 [A woman in black gloves is decorating a cake ... [The image shows a woman wearing a black dress... 181 | 3 -h5KF2SffqI-Scene-011.mp4 4.042791e-06 0.034311 [A jar of Nutella sitting on top of a wooden t... [The image shows a jar of Nutella sitting on t... 182 | 4 -h5KF2SffqI-Scene-012.mp4 -4.261375e-01 1.351952 [A bottle of Dove deep moisture body wash sitt... [The image shows a bottle of Dove Deep Moistur... 183 | 5 -h5KF2SffqI-Scene-019.mp4 -4.995294e-01 0.177173 [A person cutting a bowl of dog food with a kn... [The image shows a person cutting into a red b... 184 | 6 -h5KF2SffqI-Scene-023.mp4 9.713798e-07 0.012338 [A wireless router sitting on top of a wooden ... [The image shows a TP-Link TL-WR940N 300Mbps W... 185 | 7 -h5KF2SffqI-Scene-026.mp4 -1.478333e-05 0.059160 [A bottle of ranch dressing with a knife in it.] [The image shows a person using a knife to cut... 186 | 8 7TAIQso5waY-Scene-014.mp4 -1.127474e-05 0.004962 [A person cutting up a box of french fries wit... [The image shows a person cutting out a McDona... 187 | 9 7TAIQso5waY-Scene-075.mp4 1.749514e-06 0.035628 [A person holding a cake with a fox face on it.] [The image shows a person holding a cake with ... 188 | 10 7TAIQso5waY-Scene-079.mp4 9.967135e-06 0.033474 [A person cutting a cake with a knife on a tab... [The image shows a person cutting a cake with ... 189 | 11 GJ2M77Yz60c-Scene-025.mp4 -1.363216e-06 0.025201 [A bottle of school glue sitting on top of a w... [The image shows a bottle of Elmer's School Gl... 190 | 12 GJ2M77Yz60c-Scene-063.mp4 -1.828094e-06 0.023520 [A can of coca cola sitting on top of a table.] [The image shows a can of Coca Cola sitting on... 191 | 13 GJ2M77Yz60c-Scene-071.mp4 -2.134615e-06 0.010385 [A wireless router sitting on top of a wooden ... [The image shows a TP-Link TL-WR940N 300Mbps W... 
192 | 14 GJ2M77Yz60c-Scene-227.mp4 1.133161e-01 0.928008 [A cup of kfc chicken with a knife sticking ou... [The image shows a cup of KFC chicken nuggets ...
193 | ```
194 |
195 | ## Video to Scenes
196 |
197 | This will split a video into scenes using `pyscenedetect`. Videos are transcoded to ensure exact cuts; note that we could implement a lossless `copy` version, however cuts would need to be snapped to keyframes, which may produce bad clips (part scene A, part scene B).
198 |
199 | ```sh
200 | python video_to_scenes.py --path cakeify/ --out-path cakeify_dataset/ --threshold 27 --min-scene-len 15
201 | # optionally --duration NUMBER_OF_FRAMES to limit duration of scene detection
202 | ```
203 |
204 | ## Example workflow
205 |
206 | Example workflow for the [crush](https://huggingface.co/datasets/bigdata-pw/crush) dataset.
207 |
208 | ```sh
209 | git clone https://github.com/huggingface/dataset-scripts
210 | mkdir raw_video
211 | cd raw_video
212 | yt-dlp -f "bv*[ext=mp4][height<=1080]+ba[ext=m4a]/b[ext=mp4] / bv*+ba/b" -o "%(id)s" https://www.youtube.com/playlist?list=PLlFv9Xg5Kmt17Dh70nXJpjaezzGT-gQV5
213 | cd ..
214 | python dataset-scripts/video_processing/video_to_scenes.py --path raw_video/ --out-path crush/ --threshold 27 --min-scene-len 15
215 | python dataset-scripts/video_processing/folder_to_parquet.py --path crush/ --out-path crush.parquet
216 | python dataset-scripts/video_processing/extract_frames.py --path crush/ --frames-path frames/ --parquet-path crush.parquet --parquet-out-path crush.parquet
217 | python dataset-scripts/video_processing/add_captions.py --path frames/ --parquet-path crush.parquet --parquet-out-path crush.parquet --device cuda --dtype float16
218 | python dataset-scripts/video_processing/add_watermark_laion_score.py --path frames/ --parquet-path crush.parquet --parquet-out-path crush.parquet --device cpu
219 | python dataset-scripts/video_processing/add_aesthetic_laion_score.py --path frames/ --parquet-path crush.parquet --parquet-out-path crush.parquet --device cpu --dtype float32
220 | python dataset-scripts/video_processing/add_motion_score.py --path crush/ --parquet-path crush.parquet --parquet-out-path crush.parquet
221 | ```
222 |
223 | ### General steps
224 |
225 | 1. Download source videos
226 | 2. Extract scenes (`video_to_scenes`)
227 | 3. Create parquet (`folder_to_parquet`) on the extracted scenes
228 | 4. Extract frames from the scenes (`extract_frames`)
229 | 5. Run any of the other scripts
230 |
231 | Note: the motion score script uses the videos; motion score likely performs better with more frames, so it uses all the key frames (and an additional frame from the video if there's only 1). Other scripts use the extracted frames for performance.
232 |
233 | ## Filtering
234 |
235 | ```python
236 | import pandas as pd
237 |
238 | df = pd.read_parquet("crush.parquet")
239 |
240 | # mean pwatermark < 0.5
241 | import numpy as np
242 | df[df.pwatermark.apply(lambda x: np.mean(x) < 0.5)]
243 | # or sum(x) / len(x)
244 |
245 | # first frame pwatermark < 0.1
246 | df[df.pwatermark.apply(lambda x: x[0] < 0.1)]
247 |
248 | # all pwatermark < 0.1
249 | df = df[df.pwatermark.apply(lambda x: all(i < 0.1 for i in x))]
250 |
251 | # aesthetic > 5.0
252 | df = df[df.aesthetic_score.apply(lambda x: all(i > 5.0 for i in x))]
253 |
254 | df.to_parquet("crush_smol.parquet")
255 | ```
256 |
257 | ## Reference filtering
258 |
259 | You may filter your videos by matching them against a reference video/image for better control.
We provide the `reference_video_similarity.py` script for this purpose. It can be called like so: 260 | 261 | ```bash 262 | python reference_video_similarity.py --videos_folder=... --reference=reference_image.png 263 | ``` 264 | 265 | The `--videos_folder` should contain the videos at the top-level. `--reference` can either be an image or a video. You can pass a list of references too like so: 266 | 267 | ```bash 268 | python reference_video_similarity.py --videos_folder=... \ 269 | --reference=reference_image_1.png,reference_image_2.png 270 | ``` 271 | 272 | As a third option, you can also pass a folder containing the reference images or videos to `--reference`: 273 | 274 | ```bash 275 | python reference_video_similarity.py --videos_folder=... --reference= 276 | ``` 277 | 278 | You can vary the `--max_num_frames` and the `--batch_size` arguments to control the memory consumption. 279 | 280 | At the end of the execution of the script, you should expect to see parquet file having `video_path`s and the `similarity` scores. 281 | 282 | We leverage the vision encoder of SigLIP ([`google/siglip-so400m-patch14-384`](https://hf.co/google/siglip-so400m-patch14-384)) for this. 283 | -------------------------------------------------------------------------------- /video_processing/add_aesthetic_laion_score.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pathlib 3 | from PIL import Image 4 | from argparse import ArgumentParser 5 | from tqdm import tqdm 6 | from modules import load_aesthetic_laion, run_aesthetic_laion, separate_key_frames_from_row 7 | 8 | parser = ArgumentParser() 9 | parser.add_argument("--path", type=str, required=True) 10 | parser.add_argument("--parquet-path", type=str, required=True) 11 | parser.add_argument("--parquet-out-path", type=str, required=True) 12 | parser.add_argument("--device", type=str, required=True) 13 | parser.add_argument("--model", type=str, default=None) 14 | parser.add_argument("--dtype", type=str, required=True) 15 | args = parser.parse_args() 16 | path = pathlib.Path(args.path) 17 | parquet_path = pathlib.Path(args.parquet_path) 18 | parquet_out_path = pathlib.Path(args.parquet_out_path) 19 | device = args.device 20 | dtype = args.dtype 21 | model_path = args.model 22 | 23 | load_aesthetic_laion(device=device, model_path=model_path, dtype=dtype) 24 | 25 | df = pd.read_parquet(parquet_path) 26 | 27 | data = [] 28 | with tqdm() as pbar: 29 | for _, row in df.iterrows(): 30 | pbar.set_description(row["file"]) 31 | key_frames, first, mid, last = separate_key_frames_from_row(path, row) 32 | pbar.set_postfix_str(f"{len(key_frames)} key frames") 33 | frames = [frame for frame in [first, mid, last] if frame is not None] 34 | scores = [tensor.cpu().item() for tensor in run_aesthetic_laion(frames)] 35 | data.append({"aesthetic_score": scores}) 36 | pbar.update() 37 | 38 | aesthetic_df = pd.DataFrame(data) 39 | 40 | print(aesthetic_df) 41 | 42 | df = df.join(aesthetic_df) 43 | 44 | print(df) 45 | 46 | df.to_parquet(parquet_out_path) 47 | -------------------------------------------------------------------------------- /video_processing/add_captions.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pathlib 3 | from PIL import Image 4 | from argparse import ArgumentParser 5 | from tqdm import tqdm 6 | from modules import run, load_florence, separate_key_frames_from_row 7 | 8 | parser = ArgumentParser() 9 | 
parser.add_argument("--path", type=str, required=True) 10 | parser.add_argument("--parquet-path", type=str, required=True) 11 | parser.add_argument("--parquet-out-path", type=str, required=True) 12 | parser.add_argument("--device", type=str, required=True) 13 | parser.add_argument("--dtype", type=str, required=True) 14 | args = parser.parse_args() 15 | path = pathlib.Path(args.path) 16 | parquet_path = pathlib.Path(args.parquet_path) 17 | parquet_out_path = pathlib.Path(args.parquet_out_path) 18 | device = args.device 19 | dtype = args.dtype 20 | 21 | 22 | load_florence( 23 | hf_hub_or_path="microsoft/Florence-2-large", 24 | device=device, 25 | dtype=dtype, 26 | ) 27 | 28 | 29 | df = pd.read_parquet(parquet_path) 30 | 31 | task_prompt = [ 32 | "", 33 | "", 34 | "", 35 | "", 36 | ] 37 | 38 | data = [] 39 | with tqdm() as pbar: 40 | for _, row in df.iterrows(): 41 | pbar.set_description(row["file"]) 42 | key_frames, first, mid, last = separate_key_frames_from_row(path, row) 43 | pbar.set_postfix_str(f"{len(key_frames)} key frames") 44 | 45 | frames = [first] 46 | first = run(first, task_prompt=task_prompt) 47 | caption = [first[""]] 48 | detailed_caption = [first[""]] 49 | region_caption = [first[""]] 50 | ocr_region = [first[""]] 51 | if mid: 52 | frames.append(mid) 53 | mid = run(mid, task_prompt=task_prompt) 54 | caption.append(mid[""]) 55 | detailed_caption.append(mid[""]) 56 | region_caption.append(mid[""]) 57 | ocr_region.append(mid[""]) 58 | if last: 59 | frames.append(last) 60 | last = run(last, task_prompt=task_prompt) 61 | caption.append(last[""]) 62 | detailed_caption.append(last[""]) 63 | region_caption.append(last[""]) 64 | ocr_region.append(last[""]) 65 | row = { 66 | "caption": caption, 67 | "detailed_caption": detailed_caption, 68 | "region_caption": region_caption, 69 | "ocr": ocr_region, 70 | } 71 | data.append(row) 72 | pbar.update() 73 | 74 | caption_df = pd.DataFrame(data) 75 | 76 | print(caption_df) 77 | 78 | df = df.join(caption_df) 79 | 80 | print(df) 81 | 82 | df.to_parquet(parquet_out_path) 83 | -------------------------------------------------------------------------------- /video_processing/add_motion_score.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pathlib 3 | from argparse import ArgumentParser 4 | from tqdm import tqdm 5 | from modules import ( 6 | compute_farneback_optical_flow, 7 | compute_lk_optical_flow, 8 | _downscale_maps, 9 | _motion_score, 10 | get_frames, 11 | get_key_frames, 12 | ) 13 | 14 | parser = ArgumentParser() 15 | parser.add_argument("--path", type=str, required=True) 16 | parser.add_argument("--parquet-path", type=str, required=True) 17 | parser.add_argument("--parquet-out-path", type=str, required=True) 18 | args = parser.parse_args() 19 | path = pathlib.Path(args.path) 20 | parquet_path = pathlib.Path(args.parquet_path) 21 | parquet_out_path = pathlib.Path(args.parquet_out_path) 22 | 23 | df = pd.read_parquet(parquet_path) 24 | 25 | data = [] 26 | with tqdm() as pbar: 27 | for _, row in df.iterrows(): 28 | video = path.joinpath(row["file"]) 29 | pbar.set_description(video.name) 30 | key_frames = get_key_frames(video) 31 | if len(key_frames) == 1: 32 | frame = list(next(get_frames(video)))[0] 33 | key_frames.insert(0, frame.to_image()) 34 | pbar.set_postfix_str(f"{len(key_frames)} key frames") 35 | farneback, _, _, _ = compute_farneback_optical_flow(key_frames) 36 | farneback = _motion_score(_downscale_maps(farneback)) 37 | lucas_kanade = 
_motion_score(compute_lk_optical_flow(key_frames)) 38 | data.append({"motion_fb": farneback, "motion_lk": lucas_kanade}) 39 | pbar.update() 40 | 41 | 42 | motion_df = pd.DataFrame(data) 43 | 44 | print(motion_df) 45 | 46 | df = df.join(motion_df) 47 | 48 | print(df) 49 | 50 | df.to_parquet(parquet_out_path) 51 | -------------------------------------------------------------------------------- /video_processing/add_nsfw_score.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pathlib 3 | from PIL import Image 4 | from argparse import ArgumentParser 5 | from tqdm import tqdm 6 | from modules import load_nsfw, run_nsfw, separate_key_frames_from_row 7 | 8 | 9 | parser = ArgumentParser() 10 | parser.add_argument("--path", type=str, required=True) 11 | parser.add_argument("--parquet-path", type=str, required=True) 12 | parser.add_argument("--parquet-out-path", type=str, required=True) 13 | parser.add_argument("--device", type=str, required=True) 14 | args = parser.parse_args() 15 | path = pathlib.Path(args.path) 16 | parquet_path = pathlib.Path(args.parquet_path) 17 | parquet_out_path = pathlib.Path(args.parquet_out_path) 18 | device = args.device 19 | 20 | load_nsfw(device) 21 | 22 | df = pd.read_parquet(parquet_path) 23 | 24 | data = [] 25 | with tqdm() as pbar: 26 | for _, row in df.iterrows(): 27 | pbar.set_description(row["file"]) 28 | key_frames, first, mid, last = separate_key_frames_from_row(path, row) 29 | pbar.set_postfix_str(f"{len(key_frames)} key frames") 30 | frames = [frame for frame in [first, mid, last] if frame is not None] 31 | labels = [label for label in run_nsfw(frames)] 32 | data.append({"nsfw_status": labels}) 33 | pbar.update() 34 | 35 | nsfw_df = pd.DataFrame(data) 36 | 37 | print(nsfw_df) 38 | 39 | df = df.join(nsfw_df) 40 | 41 | print(df) 42 | 43 | df.to_parquet(parquet_out_path) 44 | -------------------------------------------------------------------------------- /video_processing/add_shot_categories.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pathlib 3 | from argparse import ArgumentParser 4 | from tqdm import tqdm 5 | from PIL import Image 6 | from modules import run, load_florence, separate_key_frames_from_row 7 | 8 | parser = ArgumentParser() 9 | parser.add_argument("--path", type=str, required=True) 10 | parser.add_argument("--parquet-path", type=str, required=True) 11 | parser.add_argument("--parquet-out-path", type=str, required=True) 12 | parser.add_argument("--device", type=str, required=True) 13 | parser.add_argument("--dtype", type=str, required=True) 14 | args = parser.parse_args() 15 | path = pathlib.Path(args.path) 16 | parquet_path = pathlib.Path(args.parquet_path) 17 | parquet_out_path = pathlib.Path(args.parquet_out_path) 18 | device = args.device 19 | dtype = args.dtype 20 | 21 | load_florence(hf_hub_or_path="diffusers/shot-categorizer-v0", device=device, dtype=dtype, check_task_types=False) 22 | 23 | df = pd.read_parquet(parquet_path) 24 | 25 | task_prompt = ["", "", "", ""] 26 | 27 | data = [] 28 | with tqdm() as pbar: 29 | for _, row in df.iterrows(): 30 | pbar.set_description(row["file"]) 31 | key_frames, first, mid, last = separate_key_frames_from_row(path, row) 32 | pbar.set_postfix_str(f"{len(key_frames)} key frames") 33 | 34 | frames = [first] 35 | first = run(first, task_prompt=task_prompt) 36 | color = [first[""]] 37 | lighting = [first[""]] 38 | lighting_type = [first[""]] 39 | composition = 
[first[""]] 40 | 41 | if mid: 42 | frames.append(mid) 43 | mid = run(mid, task_prompt=task_prompt) 44 | color.append(mid[""]) 45 | lighting.append(mid[""]) 46 | lighting_type.append(mid[""]) 47 | composition.append(mid[""]) 48 | 49 | if last: 50 | frames.append(last) 51 | last = run(last, task_prompt=task_prompt) 52 | color.append(last[""]) 53 | lighting.append(last[""]) 54 | lighting_type.append(last[""]) 55 | composition.append(last[""]) 56 | 57 | row = { 58 | "color": color, 59 | "lighting": lighting, 60 | "lighting_type": lighting_type, 61 | "composition": composition, 62 | } 63 | data.append(row) 64 | pbar.update() 65 | 66 | shot_categorized_df = pd.DataFrame(data) 67 | 68 | print(shot_categorized_df) 69 | 70 | df = df.join(shot_categorized_df) 71 | 72 | print(df) 73 | 74 | df.to_parquet(parquet_out_path) 75 | -------------------------------------------------------------------------------- /video_processing/add_watermark_laion_score.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pathlib 3 | from PIL import Image 4 | from argparse import ArgumentParser 5 | from tqdm import tqdm 6 | from modules import load_watermark_laion, run_watermark_laion, separate_key_frames_from_row 7 | 8 | parser = ArgumentParser() 9 | parser.add_argument("--path", type=str, required=True) 10 | parser.add_argument("--parquet-path", type=str, required=True) 11 | parser.add_argument("--parquet-out-path", type=str, required=True) 12 | parser.add_argument("--device", type=str, required=True) 13 | parser.add_argument("--model", type=str, default=None) 14 | args = parser.parse_args() 15 | path = pathlib.Path(args.path) 16 | parquet_path = pathlib.Path(args.parquet_path) 17 | parquet_out_path = pathlib.Path(args.parquet_out_path) 18 | device = args.device 19 | model_path = args.model 20 | 21 | load_watermark_laion(device=device, model_path=model_path) 22 | 23 | df = pd.read_parquet(parquet_path) 24 | 25 | data = [] 26 | with tqdm() as pbar: 27 | for _, row in df.iterrows(): 28 | pbar.set_description(row["file"]) 29 | key_frames, first, mid, last = separate_key_frames_from_row(path, row) 30 | pbar.set_postfix_str(f"{len(key_frames)} key frames") 31 | frames = [frame for frame in [first, mid, last] if frame is not None] 32 | scores = [tensor.cpu().item() for tensor in run_watermark_laion(frames)] 33 | data.append({"pwatermark": scores}) 34 | pbar.update() 35 | 36 | watermark_df = pd.DataFrame(data) 37 | 38 | print(watermark_df) 39 | 40 | df = df.join(watermark_df) 41 | 42 | print(df) 43 | 44 | df.to_parquet(parquet_out_path) 45 | -------------------------------------------------------------------------------- /video_processing/extract_frames.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pathlib 3 | from argparse import ArgumentParser 4 | from tqdm import tqdm 5 | from modules import get_key_frames 6 | 7 | parser = ArgumentParser() 8 | parser.add_argument("--path", type=str, required=True) 9 | parser.add_argument("--frames-path", type=str, required=True) 10 | parser.add_argument("--parquet-path", type=str, required=True) 11 | parser.add_argument("--parquet-out-path", type=str, required=True) 12 | args = parser.parse_args() 13 | path = pathlib.Path(args.path) 14 | frames_path = pathlib.Path(args.frames_path) 15 | parquet_path = pathlib.Path(args.parquet_path) 16 | parquet_out_path = pathlib.Path(args.parquet_out_path) 17 | 18 | df = pd.read_parquet(parquet_path) 19 | 20 | if 
"frames" in df.columns: 21 | print("`frames` already found.") 22 | exit() 23 | 24 | data = [] 25 | with tqdm() as pbar: 26 | for _, row in df.iterrows(): 27 | video = path.joinpath(row["file"]) 28 | frames_dir = video.parent.with_name("frames") 29 | if not frames_dir.exists(): 30 | frames_dir.mkdir(parents=True, exist_ok=True) 31 | pbar.set_description(video.name) 32 | key_frames = get_key_frames(video) 33 | pbar.set_postfix_str(f"{len(key_frames)} key frames") 34 | first = key_frames[0] 35 | mid = None 36 | last = None 37 | if len(key_frames) == 2: 38 | last = key_frames[1] 39 | elif len(key_frames) > 2: 40 | mid = key_frames[len(key_frames) // 2] 41 | last = key_frames[-1] 42 | frames = [] 43 | for idx, frame in enumerate([first, mid, last]): 44 | if frame is None: 45 | continue 46 | frame_path = frames_dir.joinpath(f"{video.stem}_{idx}.jpg") 47 | if not frame_path.exists(): 48 | frame.save(frame_path) 49 | frames.append(frame_path.name) 50 | data.append({"frames": frames}) 51 | 52 | 53 | frames_df = pd.DataFrame(data) 54 | 55 | print(frames_df) 56 | 57 | df = df.join(frames_df) 58 | 59 | print(df) 60 | 61 | df.to_parquet(parquet_out_path) 62 | -------------------------------------------------------------------------------- /video_processing/folder_to_parquet.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pathlib 3 | from argparse import ArgumentParser 4 | 5 | parser = ArgumentParser() 6 | parser.add_argument("--path", type=str, required=True) 7 | parser.add_argument("--out-path", type=str, required=True) 8 | args = parser.parse_args() 9 | path = pathlib.Path(args.path) 10 | out_path = pathlib.Path(args.out_path) 11 | 12 | EXTENSIONS = {"avi", "mkv", "mp4"} 13 | 14 | videos = [] 15 | for extension in EXTENSIONS: 16 | videos.extend(list(path.glob(f"*.{extension}"))) 17 | 18 | data = [] 19 | for video in videos: 20 | data.append({"file": video.name}) 21 | 22 | df = pd.DataFrame(data) 23 | 24 | print(df) 25 | 26 | df.to_parquet(out_path, compression="snappy") 27 | -------------------------------------------------------------------------------- /video_processing/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .aesthetic_laion import AestheticScorer, run_aesthetic_laion, load_aesthetic_laion 2 | from .watermark_laion import run_watermark_laion, load_watermark_laion 3 | from .optical_flow import compute_lk_optical_flow, compute_farneback_optical_flow, _downscale_maps, _motion_score 4 | from .caption_object_ocr import run, load_florence 5 | from .nsfw import load_nsfw, run_nsfw 6 | from .frames import get_frames, get_key_frames, separate_key_frames_from_row 7 | -------------------------------------------------------------------------------- /video_processing/modules/aesthetic_laion.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from transformers import CLIPVisionModelWithProjection, CLIPProcessor 4 | from huggingface_hub import hf_hub_download 5 | 6 | MODEL = None 7 | 8 | 9 | class MLP(nn.Module): 10 | def __init__(self): 11 | super().__init__() 12 | self.layers = nn.Sequential( 13 | nn.Linear(768, 1024), 14 | nn.Dropout(0.2), 15 | nn.Linear(1024, 128), 16 | nn.Dropout(0.2), 17 | nn.Linear(128, 64), 18 | nn.Dropout(0.1), 19 | nn.Linear(64, 16), 20 | nn.Linear(16, 1), 21 | ) 22 | 23 | @torch.no_grad() 24 | def forward(self, embed): 25 | return self.layers(embed) 26 | 27 | 28 | 
class AestheticScorer(torch.nn.Module): 29 | def __init__(self, dtype, path): 30 | super().__init__() 31 | self.clip = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14") 32 | self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14") 33 | 34 | self.mlp = MLP() 35 | if path is None: 36 | path = hf_hub_download("trl-lib/ddpo-aesthetic-predictor", "aesthetic-model.pth") 37 | state_dict = torch.load(path, weights_only=True, map_location=torch.device("cpu")) 38 | self.mlp.load_state_dict(state_dict) 39 | self.dtype = dtype 40 | self.eval() 41 | 42 | @torch.no_grad() 43 | def __call__(self, images): 44 | device = next(self.parameters()).device 45 | inputs = self.processor(images=images, return_tensors="pt") 46 | inputs = {k: v.to(self.dtype).to(device) for k, v in inputs.items()} 47 | embed = self.clip(**inputs)[0] 48 | # normalize embedding 49 | embed = embed / torch.linalg.vector_norm(embed, dim=-1, keepdim=True) 50 | return self.mlp(embed).squeeze(1) 51 | 52 | 53 | def load_aesthetic_laion(model_path, device, dtype): 54 | global MODEL 55 | dtype = getattr(torch, dtype) 56 | MODEL = AestheticScorer(dtype=dtype, path=model_path).to(device) 57 | 58 | 59 | @torch.no_grad 60 | def run_aesthetic_laion(image): 61 | if not isinstance(image, list): 62 | image = [image] 63 | return MODEL(image) 64 | -------------------------------------------------------------------------------- /video_processing/modules/caption_object_ocr.py: -------------------------------------------------------------------------------- 1 | FLORENCE = None 2 | 3 | 4 | def load_florence( 5 | hf_hub_or_path="microsoft/Florence-2-large", 6 | device="cpu", 7 | dtype="float32", 8 | check_task_types=True, 9 | ): 10 | global FLORENCE 11 | from florence_tool import FlorenceTool 12 | 13 | FLORENCE = FlorenceTool( 14 | hf_hub_or_path=hf_hub_or_path, device=device, dtype=dtype, check_task_types=check_task_types 15 | ) 16 | FLORENCE.load_model() 17 | 18 | 19 | def run( 20 | image, 21 | task_prompt, 22 | ): 23 | if FLORENCE is None: 24 | load_florence() 25 | return FLORENCE.run( 26 | image=image, 27 | task_prompt=task_prompt, 28 | ) 29 | -------------------------------------------------------------------------------- /video_processing/modules/frames.py: -------------------------------------------------------------------------------- 1 | import av 2 | from PIL import Image 3 | from pathlib import Path 4 | from typing import Iterator, List, Union 5 | 6 | 7 | def get_key_frames(path: Union[Path, str]) -> List[Image.Image]: 8 | frames = [] 9 | container = av.open(str(path)) 10 | stream = container.streams.video[0] 11 | stream.codec_context.skip_frame = "NONKEY" 12 | for _, frame in enumerate(container.decode(stream)): 13 | frames.append(frame.to_image()) 14 | container.close() 15 | return frames 16 | 17 | 18 | def get_frames(path: Union[Path, str]) -> Iterator[av.VideoFrame]: 19 | container = av.open(str(path)) 20 | stream = container.streams.video[0] 21 | yield container.decode(stream) 22 | 23 | 24 | def separate_key_frames_from_row(path: Path, row: dict[str, list]): 25 | key_frames = [Image.open(path.joinpath(key_frame)) for key_frame in row["frames"]] 26 | first = key_frames[0] 27 | mid = None 28 | last = None 29 | if len(key_frames) == 2: 30 | last = key_frames[1] 31 | elif len(key_frames) > 2: 32 | mid = key_frames[len(key_frames) // 2] 33 | last = key_frames[-1] 34 | return key_frames, first, mid, last 35 | -------------------------------------------------------------------------------- 
/video_processing/modules/nsfw.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForImageClassification, AutoImageProcessor 2 | import torch 3 | 4 | MODEL_ID = "Falconsai/nsfw_image_detection" 5 | 6 | MODEL, PROCESSOR = None, None 7 | 8 | 9 | def load_nsfw(device): 10 | global MODEL, PROCESSOR 11 | MODEL = AutoModelForImageClassification.from_pretrained(MODEL_ID).eval().to(device) 12 | PROCESSOR = AutoImageProcessor.from_pretrained(MODEL_ID) 13 | 14 | 15 | @torch.no_grad() 16 | def run_nsfw(image): 17 | if not isinstance(image, list): 18 | image = [image] 19 | inputs = PROCESSOR(images=image, return_tensors="pt").to(MODEL.device) 20 | outputs = MODEL(**inputs).logits 21 | predicted_labels = outputs.argmax(-1) 22 | return [MODEL.config.id2label[p.cpu().item()] for p in predicted_labels] 23 | -------------------------------------------------------------------------------- /video_processing/modules/optical_flow.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from PIL import Image 4 | 5 | 6 | def compute_farneback_optical_flow(frames): 7 | prev_gray = cv2.cvtColor(np.array(frames[0]), cv2.COLOR_BGR2GRAY) 8 | flow_maps = [] 9 | magnitudes = [] 10 | angles = [] 11 | images = [] 12 | hsv = np.zeros_like(frames[0]) 13 | hsv[..., 1] = 255 14 | 15 | for frame in frames[1:]: 16 | gray = cv2.cvtColor(np.array(frame), cv2.COLOR_BGR2GRAY) 17 | flow_map = cv2.calcOpticalFlowFarneback( 18 | prev_gray, 19 | gray, 20 | flow=None, 21 | pyr_scale=0.5, 22 | levels=3, 23 | winsize=15, 24 | iterations=3, 25 | poly_n=5, 26 | poly_sigma=1.2, 27 | flags=0, 28 | ) 29 | magnitude, angle = cv2.cartToPolar(flow_map[..., 0], flow_map[..., 1]) 30 | hsv[..., 0] = angle * 180 / np.pi / 2 31 | hsv[..., 2] = cv2.normalize(magnitude, None, 0, 255, cv2.NORM_MINMAX) 32 | bgr = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR) 33 | flow_maps.append(flow_map) 34 | magnitudes.append(magnitude) 35 | angles.append(angle) 36 | images.append(bgr) 37 | prev_gray = gray 38 | return flow_maps, magnitudes, angles, images 39 | 40 | 41 | def compute_lk_optical_flow(frames): 42 | # params for ShiTomasi corner detection 43 | maxCorners = 50 44 | feature_params = dict(maxCorners=maxCorners, qualityLevel=0.3, minDistance=7, blockSize=7) 45 | # Parameters for lucas kanade optical flow 46 | lk_params = dict( 47 | winSize=(15, 15), 48 | maxLevel=2, 49 | criteria=(cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, 10, 0.03), 50 | ) 51 | # Create some random colors 52 | color = np.random.randint(0, 255, (maxCorners, 3)) 53 | # Take first frame and find corners in it 54 | old_frame = frames[0] 55 | old_gray = cv2.cvtColor(np.array(old_frame), cv2.COLOR_BGR2GRAY) 56 | p0 = cv2.goodFeaturesToTrack(old_gray, mask=None, **feature_params) 57 | # Create a mask image for drawing purposes 58 | mask = np.zeros_like(old_frame) 59 | 60 | for frame in frames[1:]: 61 | frame_gray = cv2.cvtColor(np.array(frame), cv2.COLOR_BGR2GRAY) 62 | # calculate optical flow 63 | p1, st, err = cv2.calcOpticalFlowPyrLK(old_gray, frame_gray, p0, None, **lk_params) 64 | # Select good points 65 | if p1 is not None: 66 | good_new = p1[st == 1] 67 | good_old = p0[st == 1] 68 | # draw the tracks 69 | for i, (new, old) in enumerate(zip(good_new, good_old)): 70 | a, b = new.ravel() 71 | c, d = old.ravel() 72 | mask = cv2.line(mask, (int(a), int(b)), (int(c), int(d)), color[i].tolist(), 2) 73 | old_gray = frame_gray.copy() 74 | p0 = good_new.reshape(-1, 
1, 2) 75 | return mask 76 | 77 | 78 | def _downscale_maps(flow_maps, downscale_size: int = 16): 79 | return [ 80 | cv2.resize( 81 | flow, 82 | (downscale_size, int(flow.shape[0] * (downscale_size / flow.shape[1]))), 83 | interpolation=cv2.INTER_AREA, 84 | ) 85 | for flow in flow_maps 86 | ] 87 | 88 | 89 | def _motion_score(flow_maps): 90 | average_flow_map = np.mean(np.array(flow_maps), axis=0) 91 | return np.mean(average_flow_map) 92 | 93 | 94 | def _to_image(flow_maps): 95 | return [Image.fromarray(np.array(flow_map)) for flow_map in flow_maps] 96 | -------------------------------------------------------------------------------- /video_processing/modules/watermark_laion.py: -------------------------------------------------------------------------------- 1 | import timm 2 | import torch 3 | import torch.nn as nn 4 | import torchvision.transforms as T 5 | from huggingface_hub import hf_hub_download 6 | 7 | MODEL, TRANSFORMS = None, None 8 | 9 | 10 | def load_watermark_laion(device, model_path): 11 | global MODEL, TRANSFORMS 12 | TRANSFORMS = T.Compose( 13 | [ 14 | T.Resize((256, 256)), 15 | T.ToTensor(), 16 | T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), 17 | ] 18 | ) 19 | 20 | MODEL = timm.create_model("efficientnet_b3", pretrained=False, num_classes=2) 21 | MODEL.classifier = nn.Sequential( 22 | nn.Linear(in_features=1536, out_features=625), 23 | nn.ReLU(), 24 | nn.Dropout(p=0.3), 25 | nn.Linear(in_features=625, out_features=256), 26 | nn.ReLU(), 27 | nn.Linear(in_features=256, out_features=2), 28 | ) 29 | if model_path is None: 30 | model_path = hf_hub_download("finetrainers/laion-watermark-detection", "watermark_model_v1.pt") 31 | state_dict = torch.load(model_path, weights_only=True) 32 | MODEL.load_state_dict(state_dict) 33 | MODEL.eval().to(device) 34 | 35 | 36 | @torch.no_grad 37 | def run_watermark_laion(image): 38 | if not isinstance(image, list): 39 | image = [image] 40 | pixel_values = torch.stack([TRANSFORMS(_image) for _image in image]) 41 | return nn.functional.softmax(MODEL(pixel_values), dim=1)[:, 0] 42 | -------------------------------------------------------------------------------- /video_processing/reference_video_similarity.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from transformers import SiglipVisionModel, SiglipImageProcessor 4 | from PIL import Image 5 | from tqdm import tqdm 6 | import numpy as np 7 | import argparse 8 | import pandas as pd 9 | from modules import get_frames 10 | 11 | 12 | def compute_video_embedding(frames, model, preprocessor, device, dtype): 13 | """ 14 | Compute video embeddings. `frames` can either be frames of a single video or a list of list of 15 | frames from multiple videos. 
16 |     """
17 |     if not frames:
18 |         return None
19 | 
20 |     if isinstance(frames[0], list):
21 |         video_embeddings = []
22 |         flat_frames = []
23 |         video_lengths = []
24 | 
25 |         for video in frames:
26 |             video_lengths.append(len(video))
27 |             flat_frames.extend(video)
28 | 
29 |         all_input = preprocessor(images=flat_frames, return_tensors="pt").to(device)
30 |         with torch.no_grad(), torch.autocast(torch.device(device).type, dtype=dtype):
31 |             embeddings = model(**all_input).pooler_output
32 |             embeddings = embeddings / embeddings.norm(dim=-1, keepdim=True)
33 |             embeddings = embeddings.cpu()
34 | 
35 |         # Group the embeddings back by video
36 |         index = 0
37 |         for length in video_lengths:
38 |             video_emb = embeddings[index : index + length].mean(dim=0)
39 |             video_emb = video_emb / video_emb.norm()
40 |             video_embeddings.append(video_emb.numpy())
41 |             index += length
42 | 
43 |         return video_embeddings
44 |     else:
45 |         all_input = preprocessor(images=frames, return_tensors="pt").to(device)
46 |         with torch.no_grad(), torch.autocast(torch.device(device).type, dtype=dtype):
47 |             embeddings = model(**all_input).pooler_output
48 |             embeddings = embeddings / embeddings.norm(dim=-1, keepdim=True)
49 |             embeddings = embeddings.cpu()
50 | 
51 |         video_embedding = embeddings.mean(dim=0)
52 |         video_embedding = video_embedding / video_embedding.norm()
53 |         return video_embedding.numpy()
54 | 
55 | 
56 | def compute_image_embedding(image_path, model, preprocessor, device, dtype):
57 |     """
58 |     Computes an embedding for a single image.
59 |     """
60 |     image = Image.open(image_path).convert("RGB")
61 |     image_input = preprocessor(image, return_tensors="pt").to(device)
62 |     with torch.no_grad(), torch.autocast(torch.device(device).type, dtype=dtype):
63 |         embedding = model(**image_input).pooler_output
64 |         embedding = embedding / embedding.norm(dim=-1, keepdim=True)
65 |     return embedding.cpu().numpy().flatten()
66 | 
67 | 
68 | def compute_reference_embedding(ref_path, model, preprocessor, device, dtype):
69 |     """
70 |     Computes the embedding for a reference file (image or video).
71 |     """
72 |     video_extensions = (".mp4", ".avi", ".mov", ".mkv")
73 |     if ref_path.lower().endswith(video_extensions):
74 |         frames = get_frames(ref_path)
75 |         frames = next(iter(frames))
76 |         frames = [frame.to_image() for frame in frames]
77 |         return compute_video_embedding(frames, model, preprocessor, device, dtype)
78 |     else:
79 |         return compute_image_embedding(ref_path, model, preprocessor, device, dtype)
80 | 
81 | 
82 | @torch.no_grad()
83 | @torch.inference_mode()
84 | def main(args):
85 |     # List video files in the folder (supports common video extensions)
86 |     video_extensions = (".mp4", ".avi", ".mov", ".mkv")
87 |     video_files = [
88 |         os.path.join(args.videos_folder, f)
89 |         for f in os.listdir(args.videos_folder)
90 |         if f.lower().endswith(video_extensions)
91 |     ]
92 |     print(f"Total video files: {len(video_files)}")
93 |     assert video_files
94 | 
95 |     # Load model.
96 |     device = "cuda" if torch.cuda.is_available() else "cpu"
97 |     dtype = (
98 |         torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8 else torch.float16
99 |     )
100 |     model = SiglipVisionModel.from_pretrained(
101 |         "google/siglip-so400m-patch14-384", attn_implementation="flash_attention_2"
102 |     ).to(device)
103 |     preprocessor = SiglipImageProcessor.from_pretrained("google/siglip-so400m-patch14-384")
104 | 
105 |     # Process each reference file and average their embeddings.
106 |     ref_embeddings = []
107 |     if os.path.isdir(args.reference):
108 |         allow_extensions = video_extensions + (".png", ".jpg", ".jpeg")
109 |         reference = [
110 |             os.path.join(args.reference, f) for f in os.listdir(args.reference) if f.lower().endswith(allow_extensions)
111 |         ]
112 |     else:
113 |         reference = args.reference.split(",")
114 | 
115 |     assert reference
116 | 
117 |     for ref in reference:
118 |         emb = compute_reference_embedding(ref, model, preprocessor, device, dtype)
119 |         if emb is not None:
120 |             ref_embeddings.append(emb)
121 |         else:
122 |             print(f"Could not compute embedding for reference: {ref}")
123 | 
124 |     if len(ref_embeddings) == 0:
125 |         print("No valid reference embeddings found!")
126 |         return
127 | 
128 |     ref_embedding = np.mean(ref_embeddings, axis=0)
129 |     ref_embedding = ref_embedding / np.linalg.norm(ref_embedding)
130 | 
131 |     results = []
132 |     batch_frames = []  # To collect frames for a batch of videos
133 |     batch_paths = []  # To keep track of corresponding video paths
134 |     pbar = tqdm(video_files, desc="Computing video embeddings.")
135 | 
136 |     for video_path in pbar:
137 |         pbar.set_postfix_str(f"{video_path}")
138 | 
139 |         frames_generator = get_frames(video_path)
140 |         try:
141 |             frames_batch = next(iter(frames_generator))
142 |         except StopIteration:
143 |             print(f"Could not extract frames from {video_path}")
144 |             continue
145 | 
146 |         frames = [frame.to_image() for frame in frames_batch]
147 |         if not frames:
148 |             print(f"Could not extract frames from {video_path}")
149 |             continue
150 | 
151 |         frames = frames[: args.max_num_frames]
152 |         batch_frames.append(frames)
153 |         batch_paths.append(video_path)
154 | 
155 |         if len(batch_frames) == args.batch_size:
156 |             video_embeddings = compute_video_embedding(batch_frames, model, preprocessor, device, dtype)
157 |             for path, video_embedding in zip(batch_paths, video_embeddings):
158 |                 if video_embedding is not None:
159 |                     similarity = np.dot(ref_embedding, video_embedding)
160 |                     results.append((path.split("/")[-1], similarity))
161 |             batch_frames = []
162 |             batch_paths = []
163 | 
164 |     # Remaining.
165 |     if batch_frames:
166 |         video_embeddings = compute_video_embedding(batch_frames, model, preprocessor, device, dtype)
167 |         for path, video_embedding in zip(batch_paths, video_embeddings):
168 |             if video_embedding is not None:
169 |                 similarity = np.dot(ref_embedding, video_embedding)
170 |                 results.append((path.split("/")[-1], similarity))
171 | 
172 |     # Sort videos by similarity score (higher means more similar).
173 |     results.sort(key=lambda x: x[1], reverse=True)
174 | 
175 |     # Write results to a parquet file.
176 |     df = pd.DataFrame(results, columns=["video_path", "similarity"])
177 |     df.to_parquet(args.parquet_out_path, index=False)
178 | 
179 |     print(f"\nResults saved to {args.parquet_out_path}")
180 | 
181 | 
182 | if __name__ == "__main__":
183 |     parser = argparse.ArgumentParser()
184 |     parser.add_argument(
185 |         "--videos_folder",
186 |         type=str,
187 |         required=True,
188 |         help="Path to folder containing videos.",
189 |     )
190 |     parser.add_argument(
191 |         "--reference",
192 |         type=str,
193 |         required=True,
194 |         help="Reference image/video file(s): a directory or a comma-separated list of paths.",
195 |     )
196 |     parser.add_argument(
197 |         "--max_num_frames",
198 |         type=int,
199 |         default=24,
200 |         help="Maximum number of frames to use per video.",
201 |     )
202 |     parser.add_argument(
203 |         "--batch_size",
204 |         type=int,
205 |         default=16,
206 |         help="How many videos to embed per batch.",
207 |     )
208 |     parser.add_argument(
209 |         "--parquet_out_path",
210 |         type=str,
211 |         default="results.parquet",
212 |         help="Path to the output parquet file.",
213 |     )
214 |     args = parser.parse_args()
215 |     main(args)
216 | 
--------------------------------------------------------------------------------
/video_processing/requirements.txt:
--------------------------------------------------------------------------------
1 | av
2 | Pillow
3 | git+https://github.com/bigdata-pw/florence-tool.git
4 | transformers
5 | accelerate
--------------------------------------------------------------------------------
/video_processing/scene_split.py:
--------------------------------------------------------------------------------
1 | from scenedetect import open_video, SceneManager, ContentDetector, split_video_ffmpeg
2 | 
3 | 
4 | def get_scenes(path: str, threshold: int = 27, min_scene_len: int = 15, duration: int = None, **kwargs):
5 |     detector = ContentDetector(threshold=threshold, min_scene_len=min_scene_len, **kwargs)
6 |     scene_manager = SceneManager()
7 |     scene_manager.add_detector(detector)
8 |     video = open_video(path)
9 |     scene_manager.detect_scenes(video=video, duration=duration, show_progress=True)
10 |     scenes = scene_manager.get_scene_list()
11 |     return scenes
12 | 
--------------------------------------------------------------------------------
/video_processing/video_to_scenes.py:
--------------------------------------------------------------------------------
1 | from scene_split import get_scenes, split_video_ffmpeg
2 | import pathlib
3 | from argparse import ArgumentParser
4 | from tqdm import tqdm
5 | 
6 | 
7 | parser = ArgumentParser()
8 | parser.add_argument("--path", type=str, required=True)
9 | parser.add_argument("--out-path", type=str, required=True)
10 | parser.add_argument("--threshold", type=int, default=27)
11 | parser.add_argument("--min-scene-len", type=int, default=15)
12 | parser.add_argument("--duration", type=int, default=None)
13 | args = parser.parse_args()
14 | path = pathlib.Path(args.path)
15 | out_path = pathlib.Path(args.out_path)
16 | threshold = args.threshold
17 | min_scene_len = args.min_scene_len
18 | duration = args.duration
19 | 
20 | EXTENSIONS = {"avi", "mkv", "mp4"}
21 | 
22 | videos = []
23 | for extension in EXTENSIONS:
24 |     videos.extend(list(path.glob(f"*.{extension}")))
25 | 
26 | for video in tqdm(videos):
27 |     scenes = get_scenes(str(video), threshold=threshold, min_scene_len=min_scene_len, duration=duration)
28 |     split_video_ffmpeg(str(video), scene_list=scenes, output_dir=str(out_path))
29 | 
--------------------------------------------------------------------------------