├── .gitignore ├── README.md ├── openvid ├── README.md ├── openvid.py ├── openvid_part_id_parquet.py └── zipstream.py └── video_processing ├── README.md ├── add_aesthetic_laion_score.py ├── add_captions.py ├── add_motion_score.py ├── add_nsfw_score.py ├── add_shot_categories.py ├── add_watermark_laion_score.py ├── extract_frames.py ├── folder_to_parquet.py ├── modules ├── __init__.py ├── aesthetic_laion.py ├── caption_object_ocr.py ├── frames.py ├── nsfw.py ├── optical_flow.py └── watermark_laion.py ├── reference_video_similarity.py ├── requirements.txt ├── scene_split.py └── video_to_scenes.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 162 | #.idea/ 163 | 164 | .ruff_cache/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # video-dataset-scripts 2 | 3 | Tooling for image generation datasets is well established, with [`img2dataset`](https://github.com/rom1504/img2dataset) covering large scale and various community guides, scripts and UIs covering the small scale. 4 | 5 | Our goal is to make tooling for video generation datasets as established by creating open video dataset scripts suited for small scale, with [`video2dataset`](https://github.com/iejMac/video2dataset) covering large scale. 6 | 7 | *“If I have seen further it is by standing on the shoulders of giants”* 8 | 9 | The repository consists of the tooling we are developing to make it easy for the community to build their own datasets for fine-tuning video generation models. Check out our introduction [blog post](https://huggingface.co/blog/vid_ds_scripts) to learn more and head to the [`video_processing`](./video_processing) directory to get started. 10 | 11 | We plan to keep this repository easy to follow along and nimble. 12 | 13 | ## Features planned 14 | 15 | - [ ] Age detector 16 | - [ ] Scene categorizer 17 | -------------------------------------------------------------------------------- /openvid/README.md: -------------------------------------------------------------------------------- 1 | # OpenVid 2 | 3 | Script to filter and download videos from [datasets/nkp37/OpenVid-1M](https://huggingface.co/datasets/nkp37/OpenVid-1M) without downloading the entire dataset. 4 | 5 | ## Usage 6 | 7 | 1. Download [`openvid.parquet`](https://huggingface.co/datasets/bigdata-pw/OpenVid-1M/resolve/main/openvid.parquet?download=true) from [datasets/bigdata-pw/OpenVid-1M](https://huggingface.co/datasets/bigdata-pw/OpenVid-1M), this version has part numbers linked to each filename. 8 | 2. Edit `PARQUET_PATH` and `BASE_PATH`. 9 | 3. Optionally change the filtering, `aesthetic = df.loc[df["aesthetic score"] >= 7]` 10 | 4. Run the script 11 | 12 | This will 13 | 1. Read then filter the parquet 14 | 2. Get the zip central directory for each part number from the filtered set 15 | - Only 64KB per part is downloaded 16 | 3. Extract all filenames, offsets and sizes from each central directory 17 | 4. 
Filter the extracted filenames to what we want 18 | 5. Download each video to `BASE_PATH`, using 8 threads. 19 | 20 | `aesthetic score >= 7` (without the multipart zips) filters to 17247 videos and downloads only ~118GB instead of ~7TB for the full set! 21 | 22 | ## TODO 23 | 24 | Support downloading from the multipart zips 25 | 26 | # OpenVid part id parquet 27 | 28 | [`openvid.parquet`](https://huggingface.co/datasets/bigdata-pw/OpenVid-1M/resolve/main/openvid.parquet?download=true) from [datasets/bigdata-pw/OpenVid-1M](https://huggingface.co/datasets/bigdata-pw/OpenVid-1M) was produced using `openvid_part_id_parquet.py`. 29 | 30 | ## Usage 31 | 32 | 1. Download [OpenVid-1M.csv](https://huggingface.co/datasets/nkp37/OpenVid-1M/resolve/main/data/train/OpenVid-1M.csv?download=true) 33 | 2. Run the script 34 | 35 | This will 36 | 1. Read the csv into a dataframe 37 | 2. Get the zip central directory for each part number 38 | - Only 64KB per part is downloaded 39 | 3. Extract filenames from each central directory 40 | 4. Merge `part_id` into the dataframe according to filename 41 | 5. Save `openvid.parquet` 42 | -------------------------------------------------------------------------------- /openvid/openvid.py: -------------------------------------------------------------------------------- 1 | from concurrent.futures import ThreadPoolExecutor, as_completed 2 | import pandas as pd 3 | from tqdm import tqdm 4 | 5 | from zipstream import ZipStream 6 | 7 | PARQUET_PATH = "openvid.parquet" 8 | BASE_PATH = "H:/openvid" 9 | 10 | # skip these for now 11 | MULTI_PART = {73, 76, 78, 83, 88, 89, 92, 95, 96, 102, 103, 111, 118, 183, 184, 185} 12 | 13 | URL = "https://huggingface.co/datasets/nkp37/OpenVid-1M/resolve/main/OpenVid_part{part}.zip?download=true" 14 | 15 | 16 | df = pd.read_parquet(PARQUET_PATH) 17 | 18 | aesthetic = df.loc[df["aesthetic score"] >= 7] 19 | aesthetic = aesthetic.loc[~df["part_id"].isin(MULTI_PART)] 20 | part_ids = list(aesthetic["part_id"].unique()) 21 | filenames = set(aesthetic["video"]) 22 | 23 | 24 | for part_id in part_ids: 25 | stream = ZipStream(URL.format(part=part_id)) 26 | files = list(filter(lambda file: file.filename.split("/")[-1] in filenames, stream.files)) 27 | 28 | with ThreadPoolExecutor(max_workers=8) as executor: 29 | pbar = tqdm(desc="download", total=len(files)) 30 | futures = {} 31 | for file in files: 32 | filename = file.filename.split("/")[-1] 33 | futures[executor.submit(file.download, filename, BASE_PATH)] = file 34 | for future in as_completed(futures): 35 | _ = future.result() 36 | pbar.update() 37 | -------------------------------------------------------------------------------- /openvid/openvid_part_id_parquet.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from zipstream import ZipStream 3 | import tqdm 4 | 5 | df = pd.read_csv("OpenVid-1M.csv") 6 | 7 | part_ids = list(range(0, 183)) 8 | for multi_part in {73, 76, 78, 83, 88, 89, 92, 95, 96, 102, 103, 111, 118}: 9 | part_ids.remove(multi_part) 10 | 11 | url = "https://huggingface.co/datasets/nkp37/OpenVid-1M/resolve/main/OpenVid_part{part}.zip?download=true" 12 | 13 | filename_part = [] 14 | 15 | for part_id in tqdm.tqdm(part_ids): 16 | stream = ZipStream(url.format(part=part_id)) 17 | filename_part.extend( 18 | [ 19 | { 20 | "video": file.filename.split("/")[-1], 21 | "part_id": part_id, 22 | "file_offset": file.file_offset, 23 | "file_size": file.file_size, 24 | } 25 | for file in stream.files 26 | ] 27 | ) 28 | 29 | # for 
split parts we get 1 byte of part a to find the size 30 | # for part b the central directory offset is - size of part a 31 | url_multipart_a = ( 32 | "https://huggingface.co/datasets/nkp37/OpenVid-1M/resolve/main/OpenVid_part{part}_partaa?download=true" 33 | ) 34 | url_multipart = "https://huggingface.co/datasets/nkp37/OpenVid-1M/resolve/main/OpenVid_part{part}_partab?download=true" 35 | 36 | for part_id in tqdm.tqdm({73, 76, 78, 83, 88, 89, 92, 95, 96, 102, 103, 111, 118, 183, 184, 185}): 37 | offset = ZipStream.size(url_multipart_a.format(part=part_id)) 38 | stream = ZipStream(url_multipart.format(part=part_id), offset=offset) 39 | filename_part.extend( 40 | [ 41 | { 42 | "video": file.filename.split("/")[-1], 43 | "part_id": part_id, 44 | "file_offset": file.file_offset, 45 | "file_size": file.file_size, 46 | } 47 | for file in stream.files 48 | ] 49 | ) 50 | 51 | data = pd.DataFrame(filename_part) 52 | 53 | df = df.merge(data, how="left") 54 | df["part_id"] = df["part_id"].astype(pd.Int64Dtype()) 55 | df["file_offset"] = df["file_offset"].astype(pd.Int64Dtype()) 56 | df["file_size"] = df["file_size"].astype(pd.Int64Dtype()) 57 | 58 | df.to_parquet("openvid.parquet") 59 | -------------------------------------------------------------------------------- /openvid/zipstream.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | import pathlib 3 | import requests 4 | import struct 5 | import tqdm 6 | from typing import Optional 7 | import zlib 8 | 9 | 10 | @dataclass 11 | class LocalFileHeader: 12 | signature: bytes 13 | version: int 14 | flag: int 15 | method: int 16 | modification_time: int 17 | modification_date: int 18 | crc32: int 19 | compressed_size: int 20 | uncompressed_size: int 21 | file_name_length: int 22 | extra_field_length: int 23 | 24 | 25 | @dataclass 26 | class CentralDirectoryFileHeader: 27 | signature: bytes 28 | version: int 29 | minimum_version: int 30 | flag: int 31 | method: int 32 | modification_time: int 33 | modification_date: int 34 | crc32: int 35 | compressed_size: int 36 | uncompressed_size: int 37 | file_name_length: int 38 | extra_field_length: int 39 | file_comment_length: int 40 | disk_number: int 41 | internal_file_attributes: int 42 | external_file_attributes: int 43 | relative_offset: int 44 | 45 | 46 | class ZipStreamFile: 47 | def __init__( 48 | self, 49 | url: str, 50 | filename: str, 51 | file_offset: int, 52 | file_size: int, 53 | ): 54 | self.url = url 55 | self.filename = filename 56 | self.file_offset = file_offset 57 | self.file_size = file_size 58 | 59 | def download( 60 | self, 61 | filename: Optional[str] = None, 62 | base_path: Optional[str] = None, 63 | ): 64 | struct_format = "<4sHHHHHIIIHH" 65 | struct_size = struct.calcsize(struct_format) 66 | headers = {"Range": f"bytes={self.file_offset}-{self.file_offset+struct_size-1}"} 67 | local_file_header = requests.get(self.url, headers=headers, stream=True).content 68 | local_file_header = LocalFileHeader(*struct.unpack(struct_format, local_file_header)) 69 | data_offset = struct_size + local_file_header.file_name_length + local_file_header.extra_field_length 70 | headers = {"Range": f"bytes={self.file_offset+data_offset}-{self.file_offset+data_offset+self.file_size-1}"} 71 | data = requests.get(self.url, headers=headers, stream=True).content 72 | if local_file_header.method == 8: 73 | data = zlib.decompress(data, -15) 74 | elif local_file_header.method != 0: 75 | raise ValueError("Unsupported compression method.") 76 | 
filename = filename or self.filename
77 | if base_path is not None and filename is not None:
78 | with open(f"{base_path}/{filename}", "wb") as f:
79 | f.write(data)
80 | return data
81 |
82 | def __repr__(self):
83 | return f"ZipStreamFile(\n\turl={self.url},\n\tfilename={self.filename},\n\tfile_offset={self.file_offset},\n\tfile_size={self.file_size}\n)"
84 |
85 |
86 | class ZipStream:
87 | tail_size: int = 65536
88 |
89 | @classmethod
90 | def size(self, url: str):
91 | headers = {"Range": f"bytes=-1"}
92 | return int(requests.get(url, headers=headers).headers["Content-Range"].split("/")[-1])
93 |
94 | @classmethod
95 | def get_central_directory(self, url: str, offset: Optional[int] = None):
96 | headers = {"Range": f"bytes=-{self.tail_size}"}
97 | tail_data = requests.get(url, headers=headers, stream=True).content
98 | zip64_eocd = b"\x50\x4b\x06\x06"
99 | eocd_offset = tail_data.rfind(zip64_eocd)
100 | eocd = tail_data[eocd_offset:]
101 | cd_offset = int.from_bytes(eocd[48 : 48 + 8], byteorder="little")
102 | if offset is not None:
103 | cd_offset -= offset  # shift the offset into this part of a multi-part archive
104 | headers = {"Range": f"bytes={cd_offset}-"}
105 | central_directory = requests.get(url, headers=headers, stream=True).content
106 | return central_directory
107 |
108 | @classmethod
109 | def get_files(self, url: str, central_directory: bytes, file_to_get: str = None):
110 | files = []
111 | offset = 0
112 | while offset <= len(central_directory):
113 | file, offset = ZipStream.get_file(url=url, central_directory=central_directory, offset=offset)
114 | if file is None:
115 | continue
116 | if file_to_get is None:
117 | files.append(file)
118 | elif file_to_get is not None and file_to_get in file.filename:
119 | return file
120 | return files
121 |
122 | @classmethod
123 | def get_file(self, url: str, central_directory: bytes, offset: int):
124 | struct_format = "<4sHHHHHHIIIHHHHHII"
125 | struct_size = struct.calcsize(struct_format)
126 | buffer = central_directory[offset : offset + struct_size]
127 | if len(buffer) < struct_size:
128 | return None, offset + struct_size
129 | central_directory_file_header = CentralDirectoryFileHeader(*struct.unpack(struct_format, buffer))
130 | filename = central_directory[
131 | offset + struct_size : offset + struct_size + central_directory_file_header.file_name_length
132 | ].decode("utf-8")
133 | next_offset = (
134 | offset
135 | + struct_size
136 | + central_directory_file_header.file_name_length
137 | + central_directory_file_header.extra_field_length
138 | + central_directory_file_header.file_comment_length
139 | )
140 | if not filename:
141 | return None, next_offset
142 | is_zip64 = (central_directory_file_header.compressed_size == 2**32 - 1) or (
143 | central_directory_file_header.relative_offset == 2**32 - 1
144 | )
145 | if is_zip64:
146 | extra = central_directory[
147 | offset + struct_size + central_directory_file_header.file_name_length : next_offset
148 | ]
149 | central_directory_file_header.relative_offset = int.from_bytes(extra[-8:], byteorder="little")
150 | return (
151 | ZipStreamFile(
152 | url=url,
153 | filename=filename,
154 | file_offset=central_directory_file_header.relative_offset,
155 | file_size=central_directory_file_header.compressed_size,
156 | ),
157 | next_offset,
158 | )
159 |
160 | def __init__(
161 | self,
162 | url: str,
163 | central_directory: Optional[bytes] = None,
164 | offset: Optional[int] = None,
165 | ):
166 | self.url = url
167 | central_directory = central_directory or ZipStream.get_central_directory(url=self.url, offset=offset)
168 |
self.central_directory = central_directory
169 | self.files = ZipStream.get_files(url=self.url, central_directory=self.central_directory)
170 |
--------------------------------------------------------------------------------
/video_processing/README.md:
--------------------------------------------------------------------------------
1 | # Video Processing
2 |
3 | ## Overview of the available filters
4 | * Watermark detection
5 | * Aesthetic scoring
6 | * NSFW scoring
7 | * Motion scoring
8 | * Filtering videos w.r.t. reference videos/images
9 | * Shot categories (color, lighting, composition, etc.)
10 |
11 | ## Prerequisite
12 | The examples use the folder `cakeify/`; this can be any folder with videos.
13 |
14 | ## Folder to Parquet
15 |
16 | The first step: this creates a basic parquet with a `file` column, which is the filename of each video in `path`.
17 |
18 | Other scripts join to this parquet.
19 |
20 | ```sh
21 | python folder_to_parquet.py --path cakeify/ --out-path cakeify.parquet
22 | ```
23 |
24 | ## Extract frames
25 |
26 | The second step: this extracts up to 3 key frames for use in captioning, watermark detection, etc.
27 |
28 | If there are 1 or more key frames, we take the `first`.
29 | If there are only 2 key frames, we take the `first` and `last`.
30 | If there are 3 or more key frames, we take the `first`, `mid` and `last`.
31 |
32 | ```sh
33 | python extract_frames.py --path cakeify/ --frames-path frames/ --parquet-path cakeify.parquet --parquet-out-path cakeify.parquet
34 | ```
35 |
36 | `--path` is the folder with videos.
37 | `--frames-path` is the folder where frames are saved.
38 | `--parquet-path` is the `--out-path` from the first step.
39 | `--parquet-out-path` if you want different versions e.g. `--parquet-out-path cakeify_frames.parquet`
40 |
41 | ## Add Captions
42 |
43 | This will use Florence-2 `microsoft/Florence-2-large` to run `<CAPTION>`, `<DETAILED_CAPTION>`, `<DENSE_REGION_CAPTION>` and `<OCR_WITH_REGION>` on extracted key frames.
44 |
45 | This uses extracted frames from step 2.
46 |
47 | The lists of results are added to the dataframe as `caption`, `detailed_caption`, `region_caption` and `ocr` columns.
48 |
49 | ```sh
50 | python add_captions.py --path frames/ --parquet-path cakeify.parquet --parquet-out-path cakeify.parquet --device cuda --dtype float16
51 | ```
52 |
53 | `--path` is the folder with **frames**.
54 | `--parquet-path` is the `--out-path` from the first step or the `--parquet-out-path` from step 2 if you changed it.
55 | `--parquet-out-path` if you want different versions e.g. `--parquet-out-path cakeify_captions.parquet`
56 |
57 |
58 | ## Add Watermark Laion Score
59 |
60 | This will use [LAION-5B-WatermarkDetection](https://github.com/LAION-AI/LAION-5B-WatermarkDetection) to detect watermarks on extracted frames.
61 |
62 | This uses extracted frames from step 2.
63 |
64 | The list of scores is added to the dataframe as the `pwatermark` column.
65 |
66 | ```sh
67 | python add_watermark_laion_score.py --path frames/ --parquet-path cakeify.parquet --parquet-out-path cakeify.parquet --device cpu
68 | ```
69 |
70 | It will automatically download the weights for the watermark scorer from [here](https://huggingface.co/finetrainers/laion-watermark-detection). You can also specify your own through the `--model` argument.
71 |
72 | `--path` is the folder with **frames**.
73 | `--parquet-path` is the `--out-path` from the first step or the `--parquet-out-path` from step 2 if you changed it.
74 | `--parquet-out-path` if you want different versions e.g. `--parquet-out-path cakeify_captions.parquet`
75 | `--device cuda` is optional as this model is fast on CPU.
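
If you want to score individual frames outside of the parquet workflow, the module functions used by the script can be called directly. A minimal sketch (assumes it is run from the `video_processing/` directory so that `modules` is importable, and that `frames/` contains frames extracted in step 2; the frame name below is just an example):

```python
from PIL import Image

from modules import load_watermark_laion, run_watermark_laion

# Loads the EfficientNet-B3 watermark detector; when model_path is None the weights
# are downloaded from the Hub (same behaviour as the --model argument).
load_watermark_laion(device="cpu", model_path=None)

frame = Image.open("frames/-IvRtqwaetM-Scene-050_0.jpg")  # any extracted frame
scores = run_watermark_laion([frame])  # tensor with one pwatermark score per image
print(scores[0].item())
```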
76 |
77 |
78 | ## Add Aesthetic Laion Score
79 |
80 | This will use [improved-aesthetic-predictor](https://github.com/christophschuhmann/improved-aesthetic-predictor) to predict an aesthetic score on extracted frames.
81 |
82 | This uses extracted frames from step 2.
83 |
84 | The list of scores is added to the dataframe as the `aesthetic_score` column.
85 |
86 | ```sh
87 | python add_aesthetic_laion_score.py --path frames/ --parquet-path cakeify.parquet --parquet-out-path cakeify.parquet --device cpu --dtype float32
88 | ```
89 |
90 | It will automatically download the MLP params for the aesthetics predictor from [here](https://huggingface.co/trl-lib/ddpo-aesthetic-predictor). You can also specify your own through the `--model` argument.
91 |
92 | `--path` is the folder with **frames**.
93 | `--parquet-path` is the `--out-path` from the first step or the `--parquet-out-path` from step 2 if you changed it.
94 | `--parquet-out-path` if you want different versions e.g. `--parquet-out-path cakeify_captions.parquet`
95 |
96 | Usable on CPU at around 1s per image, but `--device cuda` and `--dtype float16` are recommended for performance.
97 |
98 | ## Add NSFW Score
99 |
100 | This will use the [Falconsai/nsfw_image_detection](https://huggingface.co/Falconsai/nsfw_image_detection) model to predict an NSFW score on a frame-by-frame basis.
101 |
102 | This uses extracted frames from step 2.
103 |
104 | The list of labels is added to the dataframe as the `nsfw_status` column.
105 |
106 | ```sh
107 | python add_nsfw_score.py --path frames/ --parquet-path cakeify.parquet --parquet-out-path cakeify.parquet --device cuda
108 | ```
109 |
110 | `--path` is the folder with **frames**.
111 | `--parquet-path` is the `--out-path` from the first step or the `--parquet-out-path` from step 2 if you changed it.
112 | `--parquet-out-path` if you want different versions e.g. `--parquet-out-path cakeify_captions.parquet`
113 |
114 | Usable on CPU at around 1s per image, but `--device cuda` is recommended for performance.
115 |
116 | Output should look like so:
117 |
118 | ```sh
119 | nsfw_status
120 | 0 [normal]
121 | 1 [normal]
122 | 2 [normal]
123 | file frames nsfw_status
124 | 0 -IvRtqwaetM-Scene-050.mp4 [-IvRtqwaetM-Scene-050_0.jpg] [normal]
125 | 1 -IvRtqwaetM-Scene-002.mp4 [-IvRtqwaetM-Scene-002_0.jpg] [normal]
126 | 2 -IvRtqwaetM-Scene-005.mp4 [-IvRtqwaetM-Scene-005_0.jpg] [normal]
127 | ```
128 |
129 | ## Add Motion Score
130 |
131 | This will use opencv to calculate a "motion score" with `OpticalFlowFarneback` and `OpticalFlowPyrLK` on extracted key frames.
132 |
133 | Different from captions and watermark, this will use all key frames; if there is only 1 key frame, we also read the first frame of the video.
134 |
135 | The scores are added to the dataframe with `motion_fb` and `motion_lk` columns.
136 |
137 | ```sh
138 | python add_motion_score.py --path cakeify/ --parquet-out-path cakeify.parquet --parquet-path cakeify.parquet
139 | ```
140 |
141 | `--path` is the folder with **videos**.
142 | `--parquet-path` is the `--out-path` from the first step or the `--parquet-out-path` from another step if you changed it.
143 | `--parquet-out-path` if you want different versions e.g. `--parquet-out-path cakeify_motion_score.parquet`
144 |
145 | ## Add Shot Categories
146 |
147 | This will use a fine-tune of Florence-2, [diffusers/shot-categorizer-v0](https://huggingface.co/diffusers/shot-categorizer-v0), to infer shot information on the key video frames.
We follow the same strategy as above: 148 | 149 | > Different than captions and watermark, this will use all key frames, if there is only 1 key frame, we also read the first frame of the video. 150 | 151 | The categories are added to the dataframe with `color`, `lighting`, `lighting_type`, and 152 | `composition` columns. 153 | 154 | ```sh 155 | python add_shot_categories.py --path frames/ --parquet-path cakeify.parquet --parquet-out-path cakeify.parquet --device cuda --dtype float16 156 | ``` 157 | 158 | `--path` is the folder with **frames**. 159 | `--parquet-path` is the `--out-path` from the first step or the `--parquet-out-path` from step 2 if you changed it. 160 | `--parquet-out-path` if you want to different versions `--parquet-out-path cakeify_captions.parquet` 161 | 162 | Sample output: 163 | 164 | ```sh 165 | color ... composition 166 | 0 [Desaturated, Black and White, Desaturated, Bl... ... [Center, Balanced] 167 | 1 [Desaturated, Black and White] ... [Balanced] 168 | 2 [Desaturated, Black and White] ... [Center] 169 | 3 [Desaturated, Black and White, Desaturated, Bl... ... [Left heavy, Left heavy] 170 | 4 [Desaturated, Black and White] ... [Balanced] 171 | .. ... ... ... 172 | ``` 173 | 174 | ## Example Output 175 | 176 | ``` 177 | file motion_fb motion_lk caption detailed_caption 178 | 0 -h5KF2SffqI-Scene-002.mp4 -6.782037e-08 0.061066 [listerine cool mint mouthwash] [The image shows a bottle of listerine cool mi... 179 | 1 -h5KF2SffqI-Scene-003.mp4 4.928587e-01 0.654230 [A small aloe vera plant in a brown pot on a b... [The image shows an aloe vera plant in a pot o... 180 | 2 -h5KF2SffqI-Scene-006.mp4 4.287588e+00 1.033444 [A woman in black gloves is decorating a cake ... [The image shows a woman wearing a black dress... 181 | 3 -h5KF2SffqI-Scene-011.mp4 4.042791e-06 0.034311 [A jar of Nutella sitting on top of a wooden t... [The image shows a jar of Nutella sitting on t... 182 | 4 -h5KF2SffqI-Scene-012.mp4 -4.261375e-01 1.351952 [A bottle of Dove deep moisture body wash sitt... [The image shows a bottle of Dove Deep Moistur... 183 | 5 -h5KF2SffqI-Scene-019.mp4 -4.995294e-01 0.177173 [A person cutting a bowl of dog food with a kn... [The image shows a person cutting into a red b... 184 | 6 -h5KF2SffqI-Scene-023.mp4 9.713798e-07 0.012338 [A wireless router sitting on top of a wooden ... [The image shows a TP-Link TL-WR940N 300Mbps W... 185 | 7 -h5KF2SffqI-Scene-026.mp4 -1.478333e-05 0.059160 [A bottle of ranch dressing with a knife in it.] [The image shows a person using a knife to cut... 186 | 8 7TAIQso5waY-Scene-014.mp4 -1.127474e-05 0.004962 [A person cutting up a box of french fries wit... [The image shows a person cutting out a McDona... 187 | 9 7TAIQso5waY-Scene-075.mp4 1.749514e-06 0.035628 [A person holding a cake with a fox face on it.] [The image shows a person holding a cake with ... 188 | 10 7TAIQso5waY-Scene-079.mp4 9.967135e-06 0.033474 [A person cutting a cake with a knife on a tab... [The image shows a person cutting a cake with ... 189 | 11 GJ2M77Yz60c-Scene-025.mp4 -1.363216e-06 0.025201 [A bottle of school glue sitting on top of a w... [The image shows a bottle of Elmer's School Gl... 190 | 12 GJ2M77Yz60c-Scene-063.mp4 -1.828094e-06 0.023520 [A can of coca cola sitting on top of a table.] [The image shows a can of Coca Cola sitting on... 191 | 13 GJ2M77Yz60c-Scene-071.mp4 -2.134615e-06 0.010385 [A wireless router sitting on top of a wooden ... [The image shows a TP-Link TL-WR940N 300Mbps W... 
192 | 14 GJ2M77Yz60c-Scene-227.mp4 1.133161e-01 0.928008 [A cup of kfc chicken with a knife sticking ou... [The image shows a cup of KFC chicken nuggets ...
193 | ```
194 |
195 | ## Video to Scenes
196 |
197 | This will split a video into scenes using `pyscenedetect`. Videos are transcoded to ensure exact cuts; note that we could implement a lossless `copy` version, however cuts would need to be snapped to keyframes, which may produce bad clips (part scene A, part scene B).
198 |
199 | ```sh
200 | python video_to_scenes.py --path cakeify/ --out-path cakeify_dataset/ --threshold 27 --min-scene-len 15
201 | # optionally --duration NUMBER_OF_FRAMES to limit duration of scene detection
202 | ```
203 |
204 | ## Example workflow
205 |
206 | Example workflow for the [crush](https://huggingface.co/datasets/bigdata-pw/crush) dataset.
207 |
208 | ```sh
209 | git clone https://github.com/huggingface/dataset-scripts
210 | mkdir raw_video
211 | cd raw_video
212 | yt-dlp -f "bv*[ext=mp4][height<=1080]+ba[ext=m4a]/b[ext=mp4] / bv*+ba/b" -o "%(id)s" https://www.youtube.com/playlist?list=PLlFv9Xg5Kmt17Dh70nXJpjaezzGT-gQV5
213 | cd ..
214 | python dataset-scripts/video_processing/video_to_scenes.py --path raw_video/ --out-path crush/ --threshold 27 --min-scene-len 15
215 | python dataset-scripts/video_processing/folder_to_parquet.py --path crush/ --out-path crush.parquet
216 | python dataset-scripts/video_processing/extract_frames.py --path crush/ --frames-path frames/ --parquet-path crush.parquet --parquet-out-path crush.parquet
217 | python dataset-scripts/video_processing/add_captions.py --path frames/ --parquet-path crush.parquet --parquet-out-path crush.parquet --device cuda --dtype float16
218 | python dataset-scripts/video_processing/add_watermark_laion_score.py --path frames/ --parquet-path crush.parquet --parquet-out-path crush.parquet --device cpu
219 | python dataset-scripts/video_processing/add_aesthetic_laion_score.py --path frames/ --parquet-path crush.parquet --parquet-out-path crush.parquet --device cpu --dtype float32
220 | python dataset-scripts/video_processing/add_motion_score.py --path crush/ --parquet-path crush.parquet --parquet-out-path crush.parquet
221 | ```
222 |
223 | ### General steps
224 |
225 | 1. Download source videos
226 | 2. Extract scenes (`video_to_scenes`)
227 | 3. Create parquet (`folder_to_parquet`) on the extracted scenes
228 | 4. Extract frames from the scenes (`extract_frames`)
229 | 5. Run any of the other scripts
230 |
231 | Note: the motion score script uses the videos; motion score likely performs better with more frames, so it uses all the key frames (and an additional frame from the video if there's only 1). Other scripts use the extracted frames for performance.
232 |
233 | ## Filtering
234 |
235 | ```python
236 | import pandas as pd
237 |
238 | df = pd.read_parquet("crush.parquet")
239 |
240 | # mean pwatermark < 0.5
241 | import numpy as np
242 | df[df.pwatermark.apply(lambda x: np.mean(x) < 0.5)]
243 | # or sum(x) / len(x)
244 |
245 | # first frame pwatermark < 0.1
246 | df[df.pwatermark.apply(lambda x: x[0] < 0.1)]
247 |
248 | # all pwatermark < 0.1
249 | df = df[df.pwatermark.apply(lambda x: all(i < 0.1 for i in x))]
250 |
251 | # aesthetic > 5.0
252 | df = df[df.aesthetic_score.apply(lambda x: all(i > 5.0 for i in x))]
253 |
254 | df.to_parquet("crush_smol.parquet")
255 | ```
256 |
257 | ## Reference filtering
258 |
259 | You may filter your videos by matching them against a reference video/image for better control.
We provide the `reference_video_similarity.py` script for this purpose. It can be called like so: 260 | 261 | ```bash 262 | python reference_video_similarity.py --videos_folder=... --reference=reference_image.png 263 | ``` 264 | 265 | The `--videos_folder` should contain the videos at the top-level. `--reference` can either be an image or a video. You can pass a list of references too like so: 266 | 267 | ```bash 268 | python reference_video_similarity.py --videos_folder=... \ 269 | --reference=reference_image_1.png,reference_image_2.png 270 | ``` 271 | 272 | As a third option, you can also pass a folder containing the reference images or videos to `--reference`: 273 | 274 | ```bash 275 | python reference_video_similarity.py --videos_folder=... --reference= 276 | ``` 277 | 278 | You can vary the `--max_num_frames` and the `--batch_size` arguments to control the memory consumption. 279 | 280 | At the end of the execution of the script, you should expect to see parquet file having `video_path`s and the `similarity` scores. 281 | 282 | We leverage the vision encoder of SigLIP ([`google/siglip-so400m-patch14-384`](https://hf.co/google/siglip-so400m-patch14-384)) for this. 283 | -------------------------------------------------------------------------------- /video_processing/add_aesthetic_laion_score.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pathlib 3 | from PIL import Image 4 | from argparse import ArgumentParser 5 | from tqdm import tqdm 6 | from modules import load_aesthetic_laion, run_aesthetic_laion, separate_key_frames_from_row 7 | 8 | parser = ArgumentParser() 9 | parser.add_argument("--path", type=str, required=True) 10 | parser.add_argument("--parquet-path", type=str, required=True) 11 | parser.add_argument("--parquet-out-path", type=str, required=True) 12 | parser.add_argument("--device", type=str, required=True) 13 | parser.add_argument("--model", type=str, default=None) 14 | parser.add_argument("--dtype", type=str, required=True) 15 | args = parser.parse_args() 16 | path = pathlib.Path(args.path) 17 | parquet_path = pathlib.Path(args.parquet_path) 18 | parquet_out_path = pathlib.Path(args.parquet_out_path) 19 | device = args.device 20 | dtype = args.dtype 21 | model_path = args.model 22 | 23 | load_aesthetic_laion(device=device, model_path=model_path, dtype=dtype) 24 | 25 | df = pd.read_parquet(parquet_path) 26 | 27 | data = [] 28 | with tqdm() as pbar: 29 | for _, row in df.iterrows(): 30 | pbar.set_description(row["file"]) 31 | key_frames, first, mid, last = separate_key_frames_from_row(path, row) 32 | pbar.set_postfix_str(f"{len(key_frames)} key frames") 33 | frames = [frame for frame in [first, mid, last] if frame is not None] 34 | scores = [tensor.cpu().item() for tensor in run_aesthetic_laion(frames)] 35 | data.append({"aesthetic_score": scores}) 36 | pbar.update() 37 | 38 | aesthetic_df = pd.DataFrame(data) 39 | 40 | print(aesthetic_df) 41 | 42 | df = df.join(aesthetic_df) 43 | 44 | print(df) 45 | 46 | df.to_parquet(parquet_out_path) 47 | -------------------------------------------------------------------------------- /video_processing/add_captions.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pathlib 3 | from PIL import Image 4 | from argparse import ArgumentParser 5 | from tqdm import tqdm 6 | from modules import run, load_florence, separate_key_frames_from_row 7 | 8 | parser = ArgumentParser() 9 | 
parser.add_argument("--path", type=str, required=True) 10 | parser.add_argument("--parquet-path", type=str, required=True) 11 | parser.add_argument("--parquet-out-path", type=str, required=True) 12 | parser.add_argument("--device", type=str, required=True) 13 | parser.add_argument("--dtype", type=str, required=True) 14 | args = parser.parse_args() 15 | path = pathlib.Path(args.path) 16 | parquet_path = pathlib.Path(args.parquet_path) 17 | parquet_out_path = pathlib.Path(args.parquet_out_path) 18 | device = args.device 19 | dtype = args.dtype 20 | 21 | 22 | load_florence( 23 | hf_hub_or_path="microsoft/Florence-2-large", 24 | device=device, 25 | dtype=dtype, 26 | ) 27 | 28 | 29 | df = pd.read_parquet(parquet_path) 30 | 31 | task_prompt = [ 32 | "", 33 | "", 34 | "", 35 | "", 36 | ] 37 | 38 | data = [] 39 | with tqdm() as pbar: 40 | for _, row in df.iterrows(): 41 | pbar.set_description(row["file"]) 42 | key_frames, first, mid, last = separate_key_frames_from_row(path, row) 43 | pbar.set_postfix_str(f"{len(key_frames)} key frames") 44 | 45 | frames = [first] 46 | first = run(first, task_prompt=task_prompt) 47 | caption = [first[""]] 48 | detailed_caption = [first[""]] 49 | region_caption = [first[""]] 50 | ocr_region = [first[""]] 51 | if mid: 52 | frames.append(mid) 53 | mid = run(mid, task_prompt=task_prompt) 54 | caption.append(mid[""]) 55 | detailed_caption.append(mid[""]) 56 | region_caption.append(mid[""]) 57 | ocr_region.append(mid[""]) 58 | if last: 59 | frames.append(last) 60 | last = run(last, task_prompt=task_prompt) 61 | caption.append(last[""]) 62 | detailed_caption.append(last[""]) 63 | region_caption.append(last[""]) 64 | ocr_region.append(last[""]) 65 | row = { 66 | "caption": caption, 67 | "detailed_caption": detailed_caption, 68 | "region_caption": region_caption, 69 | "ocr": ocr_region, 70 | } 71 | data.append(row) 72 | pbar.update() 73 | 74 | caption_df = pd.DataFrame(data) 75 | 76 | print(caption_df) 77 | 78 | df = df.join(caption_df) 79 | 80 | print(df) 81 | 82 | df.to_parquet(parquet_out_path) 83 | -------------------------------------------------------------------------------- /video_processing/add_motion_score.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pathlib 3 | from argparse import ArgumentParser 4 | from tqdm import tqdm 5 | from modules import ( 6 | compute_farneback_optical_flow, 7 | compute_lk_optical_flow, 8 | _downscale_maps, 9 | _motion_score, 10 | get_frames, 11 | get_key_frames, 12 | ) 13 | 14 | parser = ArgumentParser() 15 | parser.add_argument("--path", type=str, required=True) 16 | parser.add_argument("--parquet-path", type=str, required=True) 17 | parser.add_argument("--parquet-out-path", type=str, required=True) 18 | args = parser.parse_args() 19 | path = pathlib.Path(args.path) 20 | parquet_path = pathlib.Path(args.parquet_path) 21 | parquet_out_path = pathlib.Path(args.parquet_out_path) 22 | 23 | df = pd.read_parquet(parquet_path) 24 | 25 | data = [] 26 | with tqdm() as pbar: 27 | for _, row in df.iterrows(): 28 | video = path.joinpath(row["file"]) 29 | pbar.set_description(video.name) 30 | key_frames = get_key_frames(video) 31 | if len(key_frames) == 1: 32 | frame = list(next(get_frames(video)))[0] 33 | key_frames.insert(0, frame.to_image()) 34 | pbar.set_postfix_str(f"{len(key_frames)} key frames") 35 | farneback, _, _, _ = compute_farneback_optical_flow(key_frames) 36 | farneback = _motion_score(_downscale_maps(farneback)) 37 | lucas_kanade = 
_motion_score(compute_lk_optical_flow(key_frames)) 38 | data.append({"motion_fb": farneback, "motion_lk": lucas_kanade}) 39 | pbar.update() 40 | 41 | 42 | motion_df = pd.DataFrame(data) 43 | 44 | print(motion_df) 45 | 46 | df = df.join(motion_df) 47 | 48 | print(df) 49 | 50 | df.to_parquet(parquet_out_path) 51 | -------------------------------------------------------------------------------- /video_processing/add_nsfw_score.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pathlib 3 | from PIL import Image 4 | from argparse import ArgumentParser 5 | from tqdm import tqdm 6 | from modules import load_nsfw, run_nsfw, separate_key_frames_from_row 7 | 8 | 9 | parser = ArgumentParser() 10 | parser.add_argument("--path", type=str, required=True) 11 | parser.add_argument("--parquet-path", type=str, required=True) 12 | parser.add_argument("--parquet-out-path", type=str, required=True) 13 | parser.add_argument("--device", type=str, required=True) 14 | args = parser.parse_args() 15 | path = pathlib.Path(args.path) 16 | parquet_path = pathlib.Path(args.parquet_path) 17 | parquet_out_path = pathlib.Path(args.parquet_out_path) 18 | device = args.device 19 | 20 | load_nsfw(device) 21 | 22 | df = pd.read_parquet(parquet_path) 23 | 24 | data = [] 25 | with tqdm() as pbar: 26 | for _, row in df.iterrows(): 27 | pbar.set_description(row["file"]) 28 | key_frames, first, mid, last = separate_key_frames_from_row(path, row) 29 | pbar.set_postfix_str(f"{len(key_frames)} key frames") 30 | frames = [frame for frame in [first, mid, last] if frame is not None] 31 | labels = [label for label in run_nsfw(frames)] 32 | data.append({"nsfw_status": labels}) 33 | pbar.update() 34 | 35 | nsfw_df = pd.DataFrame(data) 36 | 37 | print(nsfw_df) 38 | 39 | df = df.join(nsfw_df) 40 | 41 | print(df) 42 | 43 | df.to_parquet(parquet_out_path) 44 | -------------------------------------------------------------------------------- /video_processing/add_shot_categories.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pathlib 3 | from argparse import ArgumentParser 4 | from tqdm import tqdm 5 | from PIL import Image 6 | from modules import run, load_florence, separate_key_frames_from_row 7 | 8 | parser = ArgumentParser() 9 | parser.add_argument("--path", type=str, required=True) 10 | parser.add_argument("--parquet-path", type=str, required=True) 11 | parser.add_argument("--parquet-out-path", type=str, required=True) 12 | parser.add_argument("--device", type=str, required=True) 13 | parser.add_argument("--dtype", type=str, required=True) 14 | args = parser.parse_args() 15 | path = pathlib.Path(args.path) 16 | parquet_path = pathlib.Path(args.parquet_path) 17 | parquet_out_path = pathlib.Path(args.parquet_out_path) 18 | device = args.device 19 | dtype = args.dtype 20 | 21 | load_florence(hf_hub_or_path="diffusers/shot-categorizer-v0", device=device, dtype=dtype, check_task_types=False) 22 | 23 | df = pd.read_parquet(parquet_path) 24 | 25 | task_prompt = ["", "", "", ""] 26 | 27 | data = [] 28 | with tqdm() as pbar: 29 | for _, row in df.iterrows(): 30 | pbar.set_description(row["file"]) 31 | key_frames, first, mid, last = separate_key_frames_from_row(path, row) 32 | pbar.set_postfix_str(f"{len(key_frames)} key frames") 33 | 34 | frames = [first] 35 | first = run(first, task_prompt=task_prompt) 36 | color = [first[""]] 37 | lighting = [first[""]] 38 | lighting_type = [first[""]] 39 | composition = 
[first[""]] 40 | 41 | if mid: 42 | frames.append(mid) 43 | mid = run(mid, task_prompt=task_prompt) 44 | color.append(mid[""]) 45 | lighting.append(mid[""]) 46 | lighting_type.append(mid[""]) 47 | composition.append(mid[""]) 48 | 49 | if last: 50 | frames.append(last) 51 | last = run(last, task_prompt=task_prompt) 52 | color.append(last[""]) 53 | lighting.append(last[""]) 54 | lighting_type.append(last[""]) 55 | composition.append(last[""]) 56 | 57 | row = { 58 | "color": color, 59 | "lighting": lighting, 60 | "lighting_type": lighting_type, 61 | "composition": composition, 62 | } 63 | data.append(row) 64 | pbar.update() 65 | 66 | shot_categorized_df = pd.DataFrame(data) 67 | 68 | print(shot_categorized_df) 69 | 70 | df = df.join(shot_categorized_df) 71 | 72 | print(df) 73 | 74 | df.to_parquet(parquet_out_path) 75 | -------------------------------------------------------------------------------- /video_processing/add_watermark_laion_score.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pathlib 3 | from PIL import Image 4 | from argparse import ArgumentParser 5 | from tqdm import tqdm 6 | from modules import load_watermark_laion, run_watermark_laion, separate_key_frames_from_row 7 | 8 | parser = ArgumentParser() 9 | parser.add_argument("--path", type=str, required=True) 10 | parser.add_argument("--parquet-path", type=str, required=True) 11 | parser.add_argument("--parquet-out-path", type=str, required=True) 12 | parser.add_argument("--device", type=str, required=True) 13 | parser.add_argument("--model", type=str, default=None) 14 | args = parser.parse_args() 15 | path = pathlib.Path(args.path) 16 | parquet_path = pathlib.Path(args.parquet_path) 17 | parquet_out_path = pathlib.Path(args.parquet_out_path) 18 | device = args.device 19 | model_path = args.model 20 | 21 | load_watermark_laion(device=device, model_path=model_path) 22 | 23 | df = pd.read_parquet(parquet_path) 24 | 25 | data = [] 26 | with tqdm() as pbar: 27 | for _, row in df.iterrows(): 28 | pbar.set_description(row["file"]) 29 | key_frames, first, mid, last = separate_key_frames_from_row(path, row) 30 | pbar.set_postfix_str(f"{len(key_frames)} key frames") 31 | frames = [frame for frame in [first, mid, last] if frame is not None] 32 | scores = [tensor.cpu().item() for tensor in run_watermark_laion(frames)] 33 | data.append({"pwatermark": scores}) 34 | pbar.update() 35 | 36 | watermark_df = pd.DataFrame(data) 37 | 38 | print(watermark_df) 39 | 40 | df = df.join(watermark_df) 41 | 42 | print(df) 43 | 44 | df.to_parquet(parquet_out_path) 45 | -------------------------------------------------------------------------------- /video_processing/extract_frames.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pathlib 3 | from argparse import ArgumentParser 4 | from tqdm import tqdm 5 | from modules import get_key_frames 6 | 7 | parser = ArgumentParser() 8 | parser.add_argument("--path", type=str, required=True) 9 | parser.add_argument("--frames-path", type=str, required=True) 10 | parser.add_argument("--parquet-path", type=str, required=True) 11 | parser.add_argument("--parquet-out-path", type=str, required=True) 12 | args = parser.parse_args() 13 | path = pathlib.Path(args.path) 14 | frames_path = pathlib.Path(args.frames_path) 15 | parquet_path = pathlib.Path(args.parquet_path) 16 | parquet_out_path = pathlib.Path(args.parquet_out_path) 17 | 18 | df = pd.read_parquet(parquet_path) 19 | 20 | if 
"frames" in df.columns: 21 | print("`frames` already found.") 22 | exit() 23 | 24 | data = [] 25 | with tqdm() as pbar: 26 | for _, row in df.iterrows(): 27 | video = path.joinpath(row["file"]) 28 | frames_dir = video.parent.with_name("frames") 29 | if not frames_dir.exists(): 30 | frames_dir.mkdir(parents=True, exist_ok=True) 31 | pbar.set_description(video.name) 32 | key_frames = get_key_frames(video) 33 | pbar.set_postfix_str(f"{len(key_frames)} key frames") 34 | first = key_frames[0] 35 | mid = None 36 | last = None 37 | if len(key_frames) == 2: 38 | last = key_frames[1] 39 | elif len(key_frames) > 2: 40 | mid = key_frames[len(key_frames) // 2] 41 | last = key_frames[-1] 42 | frames = [] 43 | for idx, frame in enumerate([first, mid, last]): 44 | if frame is None: 45 | continue 46 | frame_path = frames_dir.joinpath(f"{video.stem}_{idx}.jpg") 47 | if not frame_path.exists(): 48 | frame.save(frame_path) 49 | frames.append(frame_path.name) 50 | data.append({"frames": frames}) 51 | 52 | 53 | frames_df = pd.DataFrame(data) 54 | 55 | print(frames_df) 56 | 57 | df = df.join(frames_df) 58 | 59 | print(df) 60 | 61 | df.to_parquet(parquet_out_path) 62 | -------------------------------------------------------------------------------- /video_processing/folder_to_parquet.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pathlib 3 | from argparse import ArgumentParser 4 | 5 | parser = ArgumentParser() 6 | parser.add_argument("--path", type=str, required=True) 7 | parser.add_argument("--out-path", type=str, required=True) 8 | args = parser.parse_args() 9 | path = pathlib.Path(args.path) 10 | out_path = pathlib.Path(args.out_path) 11 | 12 | EXTENSIONS = {"avi", "mkv", "mp4"} 13 | 14 | videos = [] 15 | for extension in EXTENSIONS: 16 | videos.extend(list(path.glob(f"*.{extension}"))) 17 | 18 | data = [] 19 | for video in videos: 20 | data.append({"file": video.name}) 21 | 22 | df = pd.DataFrame(data) 23 | 24 | print(df) 25 | 26 | df.to_parquet(out_path, compression="snappy") 27 | -------------------------------------------------------------------------------- /video_processing/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .aesthetic_laion import AestheticScorer, run_aesthetic_laion, load_aesthetic_laion 2 | from .watermark_laion import run_watermark_laion, load_watermark_laion 3 | from .optical_flow import compute_lk_optical_flow, compute_farneback_optical_flow, _downscale_maps, _motion_score 4 | from .caption_object_ocr import run, load_florence 5 | from .nsfw import load_nsfw, run_nsfw 6 | from .frames import get_frames, get_key_frames, separate_key_frames_from_row 7 | -------------------------------------------------------------------------------- /video_processing/modules/aesthetic_laion.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from transformers import CLIPVisionModelWithProjection, CLIPProcessor 4 | from huggingface_hub import hf_hub_download 5 | 6 | MODEL = None 7 | 8 | 9 | class MLP(nn.Module): 10 | def __init__(self): 11 | super().__init__() 12 | self.layers = nn.Sequential( 13 | nn.Linear(768, 1024), 14 | nn.Dropout(0.2), 15 | nn.Linear(1024, 128), 16 | nn.Dropout(0.2), 17 | nn.Linear(128, 64), 18 | nn.Dropout(0.1), 19 | nn.Linear(64, 16), 20 | nn.Linear(16, 1), 21 | ) 22 | 23 | @torch.no_grad() 24 | def forward(self, embed): 25 | return self.layers(embed) 26 | 27 | 28 | 
class AestheticScorer(torch.nn.Module): 29 | def __init__(self, dtype, path): 30 | super().__init__() 31 | self.clip = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14") 32 | self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14") 33 | 34 | self.mlp = MLP() 35 | if path is None: 36 | path = hf_hub_download("trl-lib/ddpo-aesthetic-predictor", "aesthetic-model.pth") 37 | state_dict = torch.load(path, weights_only=True, map_location=torch.device("cpu")) 38 | self.mlp.load_state_dict(state_dict) 39 | self.dtype = dtype 40 | self.eval() 41 | 42 | @torch.no_grad() 43 | def __call__(self, images): 44 | device = next(self.parameters()).device 45 | inputs = self.processor(images=images, return_tensors="pt") 46 | inputs = {k: v.to(self.dtype).to(device) for k, v in inputs.items()} 47 | embed = self.clip(**inputs)[0] 48 | # normalize embedding 49 | embed = embed / torch.linalg.vector_norm(embed, dim=-1, keepdim=True) 50 | return self.mlp(embed).squeeze(1) 51 | 52 | 53 | def load_aesthetic_laion(model_path, device, dtype): 54 | global MODEL 55 | dtype = getattr(torch, dtype) 56 | MODEL = AestheticScorer(dtype=dtype, path=model_path).to(device) 57 | 58 | 59 | @torch.no_grad 60 | def run_aesthetic_laion(image): 61 | if not isinstance(image, list): 62 | image = [image] 63 | return MODEL(image) 64 | -------------------------------------------------------------------------------- /video_processing/modules/caption_object_ocr.py: -------------------------------------------------------------------------------- 1 | FLORENCE = None 2 | 3 | 4 | def load_florence( 5 | hf_hub_or_path="microsoft/Florence-2-large", 6 | device="cpu", 7 | dtype="float32", 8 | check_task_types=True, 9 | ): 10 | global FLORENCE 11 | from florence_tool import FlorenceTool 12 | 13 | FLORENCE = FlorenceTool( 14 | hf_hub_or_path=hf_hub_or_path, device=device, dtype=dtype, check_task_types=check_task_types 15 | ) 16 | FLORENCE.load_model() 17 | 18 | 19 | def run( 20 | image, 21 | task_prompt, 22 | ): 23 | if FLORENCE is None: 24 | load_florence() 25 | return FLORENCE.run( 26 | image=image, 27 | task_prompt=task_prompt, 28 | ) 29 | -------------------------------------------------------------------------------- /video_processing/modules/frames.py: -------------------------------------------------------------------------------- 1 | import av 2 | from PIL import Image 3 | from pathlib import Path 4 | from typing import Iterator, List, Union 5 | 6 | 7 | def get_key_frames(path: Union[Path, str]) -> List[Image.Image]: 8 | frames = [] 9 | container = av.open(str(path)) 10 | stream = container.streams.video[0] 11 | stream.codec_context.skip_frame = "NONKEY" 12 | for _, frame in enumerate(container.decode(stream)): 13 | frames.append(frame.to_image()) 14 | container.close() 15 | return frames 16 | 17 | 18 | def get_frames(path: Union[Path, str]) -> Iterator[av.VideoFrame]: 19 | container = av.open(str(path)) 20 | stream = container.streams.video[0] 21 | yield container.decode(stream) 22 | 23 | 24 | def separate_key_frames_from_row(path: Path, row: dict[str, list]): 25 | key_frames = [Image.open(path.joinpath(key_frame)) for key_frame in row["frames"]] 26 | first = key_frames[0] 27 | mid = None 28 | last = None 29 | if len(key_frames) == 2: 30 | last = key_frames[1] 31 | elif len(key_frames) > 2: 32 | mid = key_frames[len(key_frames) // 2] 33 | last = key_frames[-1] 34 | return key_frames, first, mid, last 35 | -------------------------------------------------------------------------------- 
/video_processing/modules/nsfw.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForImageClassification, AutoImageProcessor 2 | import torch 3 | 4 | MODEL_ID = "Falconsai/nsfw_image_detection" 5 | 6 | MODEL, PROCESSOR = None, None 7 | 8 | 9 | def load_nsfw(device): 10 | global MODEL, PROCESSOR 11 | MODEL = AutoModelForImageClassification.from_pretrained(MODEL_ID).eval().to(device) 12 | PROCESSOR = AutoImageProcessor.from_pretrained(MODEL_ID) 13 | 14 | 15 | @torch.no_grad() 16 | def run_nsfw(image): 17 | if not isinstance(image, list): 18 | image = [image] 19 | inputs = PROCESSOR(images=image, return_tensors="pt").to(MODEL.device) 20 | outputs = MODEL(**inputs).logits 21 | predicted_labels = outputs.argmax(-1) 22 | return [MODEL.config.id2label[p.cpu().item()] for p in predicted_labels] 23 | -------------------------------------------------------------------------------- /video_processing/modules/optical_flow.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from PIL import Image 4 | 5 | 6 | def compute_farneback_optical_flow(frames): 7 | prev_gray = cv2.cvtColor(np.array(frames[0]), cv2.COLOR_BGR2GRAY) 8 | flow_maps = [] 9 | magnitudes = [] 10 | angles = [] 11 | images = [] 12 | hsv = np.zeros_like(frames[0]) 13 | hsv[..., 1] = 255 14 | 15 | for frame in frames[1:]: 16 | gray = cv2.cvtColor(np.array(frame), cv2.COLOR_BGR2GRAY) 17 | flow_map = cv2.calcOpticalFlowFarneback( 18 | prev_gray, 19 | gray, 20 | flow=None, 21 | pyr_scale=0.5, 22 | levels=3, 23 | winsize=15, 24 | iterations=3, 25 | poly_n=5, 26 | poly_sigma=1.2, 27 | flags=0, 28 | ) 29 | magnitude, angle = cv2.cartToPolar(flow_map[..., 0], flow_map[..., 1]) 30 | hsv[..., 0] = angle * 180 / np.pi / 2 31 | hsv[..., 2] = cv2.normalize(magnitude, None, 0, 255, cv2.NORM_MINMAX) 32 | bgr = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR) 33 | flow_maps.append(flow_map) 34 | magnitudes.append(magnitude) 35 | angles.append(angle) 36 | images.append(bgr) 37 | prev_gray = gray 38 | return flow_maps, magnitudes, angles, images 39 | 40 | 41 | def compute_lk_optical_flow(frames): 42 | # params for ShiTomasi corner detection 43 | maxCorners = 50 44 | feature_params = dict(maxCorners=maxCorners, qualityLevel=0.3, minDistance=7, blockSize=7) 45 | # Parameters for lucas kanade optical flow 46 | lk_params = dict( 47 | winSize=(15, 15), 48 | maxLevel=2, 49 | criteria=(cv2.TERM_CRITERIA_EPS | cv2.TERM_CRITERIA_COUNT, 10, 0.03), 50 | ) 51 | # Create some random colors 52 | color = np.random.randint(0, 255, (maxCorners, 3)) 53 | # Take first frame and find corners in it 54 | old_frame = frames[0] 55 | old_gray = cv2.cvtColor(np.array(old_frame), cv2.COLOR_BGR2GRAY) 56 | p0 = cv2.goodFeaturesToTrack(old_gray, mask=None, **feature_params) 57 | # Create a mask image for drawing purposes 58 | mask = np.zeros_like(old_frame) 59 | 60 | for frame in frames[1:]: 61 | frame_gray = cv2.cvtColor(np.array(frame), cv2.COLOR_BGR2GRAY) 62 | # calculate optical flow 63 | p1, st, err = cv2.calcOpticalFlowPyrLK(old_gray, frame_gray, p0, None, **lk_params) 64 | # Select good points 65 | if p1 is not None: 66 | good_new = p1[st == 1] 67 | good_old = p0[st == 1] 68 | # draw the tracks 69 | for i, (new, old) in enumerate(zip(good_new, good_old)): 70 | a, b = new.ravel() 71 | c, d = old.ravel() 72 | mask = cv2.line(mask, (int(a), int(b)), (int(c), int(d)), color[i].tolist(), 2) 73 | old_gray = frame_gray.copy() 74 | p0 = good_new.reshape(-1, 
1, 2) 75 | return mask 76 | 77 | 78 | def _downscale_maps(flow_maps, downscale_size: int = 16): 79 | return [ 80 | cv2.resize( 81 | flow, 82 | (downscale_size, int(flow.shape[0] * (downscale_size / flow.shape[1]))), 83 | interpolation=cv2.INTER_AREA, 84 | ) 85 | for flow in flow_maps 86 | ] 87 | 88 | 89 | def _motion_score(flow_maps): 90 | average_flow_map = np.mean(np.array(flow_maps), axis=0) 91 | return np.mean(average_flow_map) 92 | 93 | 94 | def _to_image(flow_maps): 95 | return [Image.fromarray(np.array(flow_map)) for flow_map in flow_maps] 96 | -------------------------------------------------------------------------------- /video_processing/modules/watermark_laion.py: -------------------------------------------------------------------------------- 1 | import timm 2 | import torch 3 | import torch.nn as nn 4 | import torchvision.transforms as T 5 | from huggingface_hub import hf_hub_download 6 | 7 | MODEL, TRANSFORMS = None, None 8 | 9 | 10 | def load_watermark_laion(device, model_path): 11 | global MODEL, TRANSFORMS 12 | TRANSFORMS = T.Compose( 13 | [ 14 | T.Resize((256, 256)), 15 | T.ToTensor(), 16 | T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), 17 | ] 18 | ) 19 | 20 | MODEL = timm.create_model("efficientnet_b3", pretrained=False, num_classes=2) 21 | MODEL.classifier = nn.Sequential( 22 | nn.Linear(in_features=1536, out_features=625), 23 | nn.ReLU(), 24 | nn.Dropout(p=0.3), 25 | nn.Linear(in_features=625, out_features=256), 26 | nn.ReLU(), 27 | nn.Linear(in_features=256, out_features=2), 28 | ) 29 | if model_path is None: 30 | model_path = hf_hub_download("finetrainers/laion-watermark-detection", "watermark_model_v1.pt") 31 | state_dict = torch.load(model_path, weights_only=True) 32 | MODEL.load_state_dict(state_dict) 33 | MODEL.eval().to(device) 34 | 35 | 36 | @torch.no_grad 37 | def run_watermark_laion(image): 38 | if not isinstance(image, list): 39 | image = [image] 40 | pixel_values = torch.stack([TRANSFORMS(_image) for _image in image]) 41 | return nn.functional.softmax(MODEL(pixel_values), dim=1)[:, 0] 42 | -------------------------------------------------------------------------------- /video_processing/reference_video_similarity.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from transformers import SiglipVisionModel, SiglipImageProcessor 4 | from PIL import Image 5 | from tqdm import tqdm 6 | import numpy as np 7 | import argparse 8 | import pandas as pd 9 | from modules import get_frames 10 | 11 | 12 | def compute_video_embedding(frames, model, preprocessor, device, dtype): 13 | """ 14 | Compute video embeddings. `frames` can either be frames of a single video or a list of list of 15 | frames from multiple videos. 
16 |     """
17 |     if not frames:
18 |         return None
19 | 
20 |     if isinstance(frames[0], list):
21 |         video_embeddings = []
22 |         flat_frames = []
23 |         video_lengths = []
24 | 
25 |         for video in frames:
26 |             video_lengths.append(len(video))
27 |             flat_frames.extend(video)
28 | 
29 |         all_input = preprocessor(images=flat_frames, return_tensors="pt").to(device)
30 |         with torch.no_grad(), torch.autocast(torch.device(device).type, dtype=dtype):
31 |             embeddings = model(**all_input).pooler_output
32 |             embeddings = embeddings / embeddings.norm(dim=-1, keepdim=True)
33 |             embeddings = embeddings.cpu()
34 | 
35 |         # Group the embeddings back by video
36 |         index = 0
37 |         for length in video_lengths:
38 |             video_emb = embeddings[index : index + length].mean(dim=0)
39 |             video_emb = video_emb / video_emb.norm()
40 |             video_embeddings.append(video_emb.numpy())
41 |             index += length
42 | 
43 |         return video_embeddings
44 |     else:
45 |         all_input = preprocessor(images=frames, return_tensors="pt").to(device)
46 |         with torch.no_grad(), torch.autocast(torch.device(device).type, dtype=dtype):
47 |             embeddings = model(**all_input).pooler_output
48 |             embeddings = embeddings / embeddings.norm(dim=-1, keepdim=True)
49 |             embeddings = embeddings.cpu()
50 | 
51 |         video_embedding = embeddings.mean(dim=0)
52 |         video_embedding = video_embedding / video_embedding.norm()
53 |         return video_embedding.numpy()
54 | 
55 | 
56 | def compute_image_embedding(image_path, model, preprocessor, device, dtype):
57 |     """
58 |     Computes an embedding for a single image.
59 |     """
60 |     image = Image.open(image_path).convert("RGB")
61 |     image_input = preprocessor(image, return_tensors="pt").to(device)
62 |     with torch.no_grad(), torch.autocast(torch.device(device).type, dtype=dtype):
63 |         embedding = model(**image_input).pooler_output
64 |         embedding = embedding / embedding.norm(dim=-1, keepdim=True)
65 |     return embedding.cpu().numpy().flatten()
66 | 
67 | 
68 | def compute_reference_embedding(ref_path, model, preprocessor, device, dtype):
69 |     """
70 |     Computes the embedding for a reference file (image or video).
71 |     """
72 |     video_extensions = (".mp4", ".avi", ".mov", ".mkv")
73 |     if ref_path.lower().endswith(video_extensions):
74 |         frames = get_frames(ref_path)
75 |         frames = next(iter(frames))
76 |         frames = [frame.to_image() for frame in frames]
77 |         return compute_video_embedding(frames, model, preprocessor, device, dtype)
78 |     else:
79 |         return compute_image_embedding(ref_path, model, preprocessor, device, dtype)
80 | 
81 | 
82 | @torch.no_grad()
83 | @torch.inference_mode()
84 | def main(args):
85 |     # List video files in the folder (supports common video extensions)
86 |     video_extensions = (".mp4", ".avi", ".mov", ".mkv")
87 |     video_files = [
88 |         os.path.join(args.videos_folder, f)
89 |         for f in os.listdir(args.videos_folder)
90 |         if f.lower().endswith(video_extensions)
91 |     ]
92 |     print(f"Total video files: {len(video_files)}")
93 |     assert video_files
94 | 
95 |     # Load model.
96 |     device = "cuda" if torch.cuda.is_available() else "cpu"
97 |     dtype = (
98 |         torch.bfloat16 if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8 else torch.float16
99 |     )
100 |     model = SiglipVisionModel.from_pretrained(
101 |         "google/siglip-so400m-patch14-384", attn_implementation="flash_attention_2"
102 |     ).to(device)
103 |     preprocessor = SiglipImageProcessor.from_pretrained("google/siglip-so400m-patch14-384")
104 | 
105 |     # Process each reference file and average their embeddings.
106 |     ref_embeddings = []
107 |     if os.path.isdir(args.reference):
108 |         allow_extensions = video_extensions + (".png", ".jpg", ".jpeg")
109 |         reference = [
110 |             os.path.join(args.reference, f) for f in os.listdir(args.reference) if f.lower().endswith(allow_extensions)
111 |         ]
112 |     else:
113 |         reference = args.reference.split(",")
114 | 
115 |     assert reference
116 | 
117 |     for ref in reference:
118 |         emb = compute_reference_embedding(ref, model, preprocessor, device, dtype)
119 |         if emb is not None:
120 |             ref_embeddings.append(emb)
121 |         else:
122 |             print(f"Could not compute embedding for reference: {ref}")
123 | 
124 |     if len(ref_embeddings) == 0:
125 |         print("No valid reference embeddings found!")
126 |         return
127 | 
128 |     ref_embedding = np.mean(ref_embeddings, axis=0)
129 |     ref_embedding = ref_embedding / np.linalg.norm(ref_embedding)
130 | 
131 |     results = []
132 |     batch_frames = []  # To collect frames for a batch of videos
133 |     batch_paths = []  # To keep track of corresponding video paths
134 |     pbar = tqdm(video_files, desc="Computing video embeddings.")
135 | 
136 |     for video_path in pbar:
137 |         pbar.set_postfix_str(f"{video_path}")
138 | 
139 |         frames_generator = get_frames(video_path)
140 |         try:
141 |             frames_batch = next(iter(frames_generator))
142 |         except StopIteration:
143 |             print(f"Could not extract frames from {video_path}")
144 |             continue
145 | 
146 |         frames = [frame.to_image() for frame in frames_batch]
147 |         if not frames:
148 |             print(f"Could not extract frames from {video_path}")
149 |             continue
150 | 
151 |         frames = frames[: args.max_num_frames]
152 |         batch_frames.append(frames)
153 |         batch_paths.append(video_path)
154 | 
155 |         if len(batch_frames) == args.batch_size:
156 |             video_embeddings = compute_video_embedding(batch_frames, model, preprocessor, device, dtype)
157 |             for path, video_embedding in zip(batch_paths, video_embeddings):
158 |                 if video_embedding is not None:
159 |                     similarity = np.dot(ref_embedding, video_embedding)
160 |                     results.append((path.split("/")[-1], similarity))
161 |             batch_frames = []
162 |             batch_paths = []
163 | 
164 |     # Remaining.
165 |     if batch_frames:
166 |         video_embeddings = compute_video_embedding(batch_frames, model, preprocessor, device, dtype)
167 |         for path, video_embedding in zip(batch_paths, video_embeddings):
168 |             if video_embedding is not None:
169 |                 similarity = np.dot(ref_embedding, video_embedding)
170 |                 results.append((path.split("/")[-1], similarity))
171 | 
172 |     # Sort videos by similarity score (higher means more similar).
173 |     results.sort(key=lambda x: x[1], reverse=True)
174 | 
175 |     # Write results to a parquet file.
176 |     df = pd.DataFrame(results, columns=["video_path", "similarity"])
177 |     df.to_parquet(args.parquet_out_path, index=False)
178 | 
179 |     print(f"\nResults saved to {args.parquet_out_path}")
180 | 
181 | 
182 | if __name__ == "__main__":
183 |     parser = argparse.ArgumentParser()
184 |     parser.add_argument(
185 |         "--videos_folder",
186 |         type=str,
187 |         required=True,
188 |         help="Path to folder containing videos.",
189 |     )
190 |     parser.add_argument(
191 |         "--reference",
192 |         type=str,
193 |         required=True,
194 |         help="Reference image/video file(s): a directory or a comma-separated list of paths.",
195 |     )
196 |     parser.add_argument(
197 |         "--max_num_frames",
198 |         type=int,
199 |         default=24,
200 |         help="Maximum number of frames to use per video.",
201 |     )
202 |     parser.add_argument(
203 |         "--batch_size",
204 |         type=int,
205 |         default=16,
206 |         help="How many videos to embed per batch.",
207 |     )
208 |     parser.add_argument(
209 |         "--parquet_out_path",
210 |         type=str,
211 |         default="results.parquet",
212 |         help="Path to the output parquet file.",
213 |     )
214 |     args = parser.parse_args()
215 |     main(args)
216 | 
--------------------------------------------------------------------------------
/video_processing/requirements.txt:
--------------------------------------------------------------------------------
1 | av
2 | Pillow
3 | git+https://github.com/bigdata-pw/florence-tool.git
4 | transformers
5 | accelerate
--------------------------------------------------------------------------------
/video_processing/scene_split.py:
--------------------------------------------------------------------------------
1 | from scenedetect import open_video, SceneManager, ContentDetector, split_video_ffmpeg
2 | 
3 | 
4 | def get_scenes(path: str, threshold: int = 27, min_scene_len: int = 15, duration: int = None, **kwargs):
5 |     detector = ContentDetector(threshold=threshold, min_scene_len=min_scene_len, **kwargs)
6 |     scene_manager = SceneManager()
7 |     scene_manager.add_detector(detector)
8 |     video = open_video(path)
9 |     scene_manager.detect_scenes(video=video, duration=duration, show_progress=True)
10 |     scenes = scene_manager.get_scene_list()
11 |     return scenes
12 | 
--------------------------------------------------------------------------------
/video_processing/video_to_scenes.py:
--------------------------------------------------------------------------------
1 | from scene_split import get_scenes, split_video_ffmpeg
2 | import pathlib
3 | from argparse import ArgumentParser
4 | from tqdm import tqdm
5 | 
6 | 
7 | parser = ArgumentParser()
8 | parser.add_argument("--path", type=str, required=True)
9 | parser.add_argument("--out-path", type=str, required=True)
10 | parser.add_argument("--threshold", type=int, default=27)
11 | parser.add_argument("--min-scene-len", type=int, default=15)
12 | parser.add_argument("--duration", type=int, default=None)
13 | args = parser.parse_args()
14 | path = pathlib.Path(args.path)
15 | out_path = pathlib.Path(args.out_path)
16 | threshold = args.threshold
17 | min_scene_len = args.min_scene_len
18 | duration = args.duration
19 | 
20 | EXTENSIONS = {"avi", "mkv", "mp4"}
21 | 
22 | videos = []
23 | for extension in EXTENSIONS:
24 |     videos.extend(list(path.glob(f"*.{extension}")))
25 | 
26 | for video in tqdm(videos):
27 |     scenes = get_scenes(str(video), threshold=threshold, min_scene_len=min_scene_len, duration=duration)
28 |     split_video_ffmpeg(str(video), scene_list=scenes, output_dir=str(out_path))
29 | 
--------------------------------------------------------------------------------