├── .github └── workflows │ └── python-publish.yml ├── .gitignore ├── Makefile ├── README.md ├── docker └── Dockerfile ├── encoded_video ├── __init__.py ├── encoded_video.py └── utils.py ├── examples └── encoded_video_demo.ipynb ├── requirements.txt └── setup.py /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflows will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | deploy: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: '3.x' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install setuptools wheel twine 25 | - name: Build and publish 26 | env: 27 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 28 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 29 | run: | 30 | python setup.py sdist bdist_wheel 31 | twine upload dist/* 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before 
PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | examples/ 132 | .DS_Store -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: quality style 2 | 3 | DOCKER_FILE=docker/Dockerfile 4 | PYTHON_VERSION?=3.7 5 | SRC?=$(shell 'pwd') 6 | 7 | check_dirs := encoded_video examples 8 | 9 | help: 10 | @cat Makefile 11 | 12 | build: 13 | docker build -t encoded-video --build-arg python_version=$(PYTHON_VERSION) -f $(DOCKER_FILE) . 14 | bash: build 15 | docker run --rm -it -v $(SRC):/src/workspace encoded-video bash 16 | 17 | quality: 18 | black --check $(check_dirs) 19 | isort --check-only $(check_dirs) 20 | flake8 $(check_dirs) 21 | 22 | style: 23 | black $(check_dirs) 24 | isort $(check_dirs) 25 | 26 | test: 27 | pytest -sv tests/ 28 | 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # encoded-video 2 | 3 | Open In Colab 4 | 5 | Utilities for serializing/deserializing videos w/ `pyav` and `numpy`. 6 | 7 | ## Purpose 8 | 9 | 1. Have a helpful API for working with videos 10 | 2. Liberate myself from relying on `torch` or `tensorflow` to do the above 11 | 3. 
Serialize/deserialize videos without writing directly to file (helpful for sending/receiving videos over APIs) 12 | 13 | ## Acknowledgments 14 | 15 | This is more or less a `torch`-less version of `EncodedVideo` from [`pytorchvideo`](https://github.com/facebookresearch/pytorchvideo). 16 | 17 | ## Setup 18 | 19 | ``` 20 | pip install encoded-video 21 | ``` 22 | 23 | ## Usage 24 | 25 | ```python 26 | import numpy as np 27 | from encoded_video import bytes_to_video, read_video, video_to_bytes 28 | 29 | vid = read_video('archery.mp4') 30 | video_arr = vid['video'] # (T, H, W, C) 31 | audio_arr = vid['audio'] # (S,) 32 | 33 | out_bytes = video_to_bytes( 34 | video_arr, 35 | fps=30, 36 | audio_array=np.expand_dims(audio_arr, 0), 37 | audio_fps=vid['audio_fps'], 38 | audio_codec='aac' 39 | ) 40 | 41 | restored_video = bytes_to_video(out_bytes) 42 | ``` 43 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM debian:latest 2 | LABEL maintainer="naterawdata@gmail.com" 3 | 4 | ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 5 | 6 | # Install system packages 7 | RUN apt-get update && apt-get install -y --no-install-recommends \ 8 | bzip2 \ 9 | git \ 10 | libglib2.0-0 \ 11 | libxext6 \ 12 | libsm6 \ 13 | libxrender1 \ 14 | wget \ 15 | libffi-dev \ 16 | libxml2-dev \ 17 | libxslt-dev \ 18 | lib32z1-dev \ 19 | vim \ 20 | python-dev \ 21 | pkg-config \ 22 | libssl-dev && \ 23 | rm -rf /var/lib/apt/lists/* 24 | 25 | RUN apt-get update && apt-get upgrade -y && apt-get install -y \ 26 | libavformat-dev libavcodec-dev libavdevice-dev \ 27 | libavutil-dev libswscale-dev libswresample-dev libavfilter-dev 28 | 29 | 30 | # General dependencies
 31 | # RUN apt-get install -y python-dev pkg-config 32 | 33 | # Library components 34 | # RUN apt-get install -y \ 35 | # libavformat-dev libavcodec-dev libavdevice-dev \ 36 | # libavutil-dev libswscale-dev libswresample-dev 
libavfilter-dev 37 | 38 | # Install conda 39 | ENV CONDA_DIR /opt/conda 40 | ENV PATH $CONDA_DIR/bin:$PATH 41 | ENV CONDA_INSTALLER="Miniconda3-4.5.11-Linux-x86_64.sh" 42 | 43 | RUN wget --quiet --no-check-certificate https://repo.continuum.io/miniconda/${CONDA_INSTALLER} && \ 44 | /bin/bash /${CONDA_INSTALLER} -f -b -p $CONDA_DIR && \ 45 | rm ${CONDA_INSTALLER} && \ 46 | echo export PATH=$CONDA_DIR/bin:'$PATH' > /etc/profile.d/conda.sh 47 | 48 | # Make requirements accessible to container 49 | COPY requirements.txt /tmp/ 50 | 51 | # Install specific python version and dependencies 52 | ARG python_version=3.7 53 | RUN conda install -y python=${python_version} && \ 54 | pip install --upgrade pip && \ 55 | pip install -r /tmp/requirements.txt 56 | 57 | # Add a pythonpath - gives python context to workdir 58 | ENV PYTHONPATH=/src/workspace 59 | 60 | WORKDIR /src/workspace/ 61 | 62 | # Copy over entire source dir to workspace 63 | COPY . /src/workspace/ 64 | 65 | # Install package 66 | RUN python setup.py develop 67 | 68 | CMD ["/bin/bash"] -------------------------------------------------------------------------------- /encoded_video/__init__.py: -------------------------------------------------------------------------------- 1 | from .encoded_video import EncodedVideo 2 | from .utils import bytes_to_video, read_video, video_to_bytes, write_video 3 | 4 | __version__ = "0.0.2" 5 | -------------------------------------------------------------------------------- /encoded_video/encoded_video.py: -------------------------------------------------------------------------------- 1 | import io 2 | import logging 3 | import math 4 | import pathlib 5 | from typing import BinaryIO, Dict, List, Optional, Tuple 6 | 7 | import av 8 | import numpy as np 9 | from iopath.common.file_io import g_pathmgr 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | av.logging.set_level(av.logging.ERROR) 14 | if not hasattr(av.video.frame.VideoFrame, "pict_type"): 15 | av = ImportError( 16 | 
"""\ 17 | Your version of PyAV is too old for the necessary video operations in torchvision. 18 | If you are on Python 3.5, you will have to build from source (the conda-forge 19 | packages are not up-to-date). See 20 | https://github.com/mikeboers/PyAV#installation for instructions on how to 21 | install PyAV on your system. 22 | """ 23 | ) 24 | 25 | 26 | def thwc_to_cthw(data: np.ndarray) -> np.ndarray: 27 | """ 28 | Permute array from (time, height, weight, channel) to 29 | (channel, height, width, time). 30 | """ 31 | # return data.permute(3, 0, 1, 2) 32 | return np.transpose(data, (3, 0, 1, 2)) 33 | 34 | 35 | def secs_to_pts(time_in_seconds: float, time_base: float, start_pts: float) -> float: 36 | """ 37 | Converts a time (in seconds) to the given time base and start_pts offset 38 | presentation time. 39 | 40 | Returns: 41 | pts (float): The time in the given time base. 42 | """ 43 | if time_in_seconds == math.inf: 44 | return math.inf 45 | 46 | time_base = float(time_base) 47 | return int(time_in_seconds / time_base) + start_pts 48 | 49 | 50 | def pts_to_secs(time_in_seconds: float, time_base: float, start_pts: float) -> float: 51 | """ 52 | Converts a present time with the given time base and start_pts offset to seconds. 53 | 54 | Returns: 55 | time_in_seconds (float): The corresponding time in seconds. 56 | """ 57 | if time_in_seconds == math.inf: 58 | return math.inf 59 | 60 | return (time_in_seconds - start_pts) * float(time_base) 61 | 62 | 63 | class EncodedVideo(object): 64 | """ 65 | EncodedVideo is an abstraction for accessing clips from an encoded video using 66 | PyAV as the decoding backend. It supports selective decoding when header information 67 | is available. 68 | """ 69 | 70 | def __init__( 71 | self, 72 | file: BinaryIO, 73 | video_name: Optional[str] = None, 74 | decode_audio: bool = True, 75 | ) -> None: 76 | """ 77 | Args: 78 | file (BinaryIO): a file-like object (e.g. io.BytesIO or io.StringIO) that 79 | contains the encoded video. 
80 | """ 81 | self._video_name = video_name 82 | self._decode_audio = decode_audio 83 | 84 | try: 85 | self._container = av.open(file) 86 | except Exception as e: 87 | raise RuntimeError(f"Failed to open video {video_name}. {e}") 88 | 89 | if self._container is None or len(self._container.streams.video) == 0: 90 | raise RuntimeError(f"Video stream not found {video_name}") 91 | 92 | # Retrieve video header information if available. 93 | video_stream = self._container.streams.video[0] 94 | self._video_time_base = video_stream.time_base 95 | self._video_start_pts = video_stream.start_time 96 | if self._video_start_pts is None: 97 | self._video_start_pts = 0.0 98 | 99 | video_duration = video_stream.duration 100 | 101 | # Retrieve audio header information if available. 102 | audio_duration = None 103 | self._has_audio = None 104 | if self._decode_audio: 105 | self._has_audio = self._container.streams.audio 106 | if self._has_audio: 107 | self._audio_time_base = self._container.streams.audio[0].time_base 108 | self._audio_start_pts = self._container.streams.audio[0].start_time 109 | if self._audio_start_pts is None: 110 | self._audio_start_pts = 0.0 111 | 112 | audio_duration = self._container.streams.audio[0].duration 113 | 114 | # If duration isn't found in header the whole video is decoded to 115 | # determine the duration. 116 | self._video, self._audio, self._selective_decoding = (None, None, True) 117 | if audio_duration is None and video_duration is None: 118 | self._selective_decoding = False 119 | self._video, self._audio = self._pyav_decode_video() 120 | if self._video is None: 121 | raise RuntimeError("Unable to decode video stream") 122 | 123 | video_duration = self._video[-1][1] 124 | if self._audio is not None: 125 | audio_duration = self._audio[-1][1] 126 | 127 | # Take the largest duration of either video or duration stream. 
128 | if audio_duration is not None and video_duration is not None: 129 | self._duration = max( 130 | pts_to_secs( 131 | video_duration, self._video_time_base, self._video_start_pts 132 | ), 133 | pts_to_secs( 134 | audio_duration, self._audio_time_base, self._audio_start_pts 135 | ), 136 | ) 137 | elif video_duration is not None: 138 | self._duration = pts_to_secs( 139 | video_duration, self._video_time_base, self._video_start_pts 140 | ) 141 | 142 | elif audio_duration is not None: 143 | self._duration = pts_to_secs( 144 | audio_duration, self._audio_time_base, self._audio_start_pts 145 | ) 146 | 147 | @classmethod 148 | def from_path( 149 | cls, file_path: str, decode_audio: bool = True, decoder: str = "pyav" 150 | ): 151 | """ 152 | Fetches the given video path using PathManager (allowing remote uris to be 153 | fetched) and constructs the EncodedVideo object. 154 | 155 | Args: 156 | file_path (str): a PathManager file-path. 157 | """ 158 | # We read the file with PathManager so that we can read from remote uris. 159 | with g_pathmgr.open(file_path, "rb") as fh: 160 | video_file = io.BytesIO(fh.read()) 161 | 162 | return cls(video_file, pathlib.Path(file_path).name, decode_audio) 163 | 164 | @property 165 | def name(self) -> Optional[str]: 166 | """ 167 | Returns: 168 | name: the name of the stored video if set. 169 | """ 170 | return self._video_name 171 | 172 | @property 173 | def duration(self) -> float: 174 | """ 175 | Returns: 176 | duration: the video's duration/end-time in seconds. 177 | """ 178 | return self._duration 179 | 180 | def get_clip( 181 | self, start_sec: float, end_sec: float 182 | ) -> Dict[str, Optional[np.ndarray]]: 183 | """ 184 | Retrieves frames from the encoded video at the specified start and end times 185 | in seconds (the video always starts at 0 seconds). 
186 | 187 | Args: 188 | start_sec (float): the clip start time in seconds 189 | end_sec (float): the clip end time in seconds 190 | Returns: 191 | clip_data: 192 | A dictionary mapping the entries at "video" and "audio" to numpy arrays. 193 | 194 | "video": An array of the clip's RGB frames with shape: 195 | (time, height, width, channel). The frames are of type np.float32 and 196 | in the range [0, 255]. 197 | 198 | "audio": An array of the clip's audio samples with shape: 199 | (samples,). The samples are mono (channel-averaged) and 200 | of type np.float32. 201 | 202 | An entry is None if no video or audio is found within the time range. 203 | 204 | """ 205 | if self._selective_decoding: 206 | self._video, self._audio = self._pyav_decode_video(start_sec, end_sec) 207 | 208 | video_frames = None 209 | if self._video is not None: 210 | video_start_pts = secs_to_pts( 211 | start_sec, self._video_time_base, self._video_start_pts 212 | ) 213 | video_end_pts = secs_to_pts( 214 | end_sec, self._video_time_base, self._video_start_pts 215 | ) 216 | 217 | video_frames = [ 218 | f 219 | for f, pts in self._video 220 | if pts >= video_start_pts and pts <= video_end_pts 221 | ] 222 | 223 | audio_samples = None 224 | if self._has_audio and self._audio is not None: 225 | audio_start_pts = secs_to_pts( 226 | start_sec, self._audio_time_base, self._audio_start_pts 227 | ) 228 | audio_end_pts = secs_to_pts( 229 | end_sec, self._audio_time_base, self._audio_start_pts 230 | ) 231 | audio_samples = [ 232 | f 233 | for f, pts in self._audio 234 | if pts >= audio_start_pts and pts <= audio_end_pts 235 | ] 236 | audio_samples = np.concatenate(audio_samples, axis=0) 237 | audio_samples = audio_samples.astype(np.float32) 238 | 239 | if video_frames is None or len(video_frames) == 0: 240 | logger.debug( 241 | f"No video found within {start_sec} and {end_sec} seconds. " 242 | f"Video starts at time 0 and ends at {self.duration}." 
243 | ) 244 | 245 | video_frames = None 246 | 247 | if video_frames is not None: 248 | video_frames = np.stack(video_frames).astype(np.float32) 249 | 250 | return { 251 | "video": video_frames, 252 | "audio": audio_samples, 253 | } 254 | 255 | def close(self): 256 | """ 257 | Closes the internal video container. 258 | """ 259 | if self._container is not None: 260 | self._container.close() 261 | 262 | def _pyav_decode_video( 263 | self, start_secs: float = 0.0, end_secs: float = math.inf 264 | ) -> Tuple[Optional[List], Optional[List]]: 265 | """ 266 | Selectively decodes video (and audio, if present) between start_secs and 267 | end_secs, given in seconds. 268 | """ 269 | video_and_pts = None 270 | audio_and_pts = None 271 | try: 272 | pyav_video_frames, _ = _pyav_decode_stream( 273 | self._container, 274 | secs_to_pts(start_secs, self._video_time_base, self._video_start_pts), 275 | secs_to_pts(end_secs, self._video_time_base, self._video_start_pts), 276 | self._container.streams.video[0], 277 | {"video": 0}, 278 | ) 279 | if len(pyav_video_frames) > 0: 280 | video_and_pts = [ 281 | (frame.to_rgb().to_ndarray(), frame.pts) 282 | for frame in pyav_video_frames 283 | ] 284 | 285 | if self._has_audio: 286 | pyav_audio_frames, _ = _pyav_decode_stream( 287 | self._container, 288 | secs_to_pts( 289 | start_secs, self._audio_time_base, self._audio_start_pts 290 | ), 291 | secs_to_pts(end_secs, self._audio_time_base, self._audio_start_pts), 292 | self._container.streams.audio[0], 293 | {"audio": 0}, 294 | ) 295 | 296 | if len(pyav_audio_frames) > 0: 297 | audio_and_pts = [ 298 | ( 299 | np.mean(frame.to_ndarray(), axis=0), 300 | frame.pts, 301 | ) 302 | for frame in pyav_audio_frames 303 | ] 304 | 305 | except Exception as e: 306 | logger.debug(f"Failed to decode video: {self._video_name}. 
{e}") 307 | 308 | return video_and_pts, audio_and_pts 309 | 310 | 311 | def _pyav_decode_stream( 312 | container: av.container.input.InputContainer, 313 | start_pts: float, 314 | end_pts: float, 315 | stream: av.video.stream.VideoStream, 316 | stream_name: dict, 317 | buffer_size: int = 0, 318 | ) -> Tuple[List, float]: 319 | """ 320 | Decode the video with PyAV decoder. 321 | Args: 322 | container (container): PyAV container. 323 | start_pts (int): the starting Presentation TimeStamp to fetch the 324 | video frames. 325 | end_pts (int): the ending Presentation TimeStamp of the decoded frames. 326 | stream (stream): PyAV stream. 327 | stream_name (dict): a dictionary of streams. For example, {"video": 0} 328 | means video stream at stream index 0. 329 | Returns: 330 | result (list): list of decoded frames. 331 | max_pts (int): max Presentation TimeStamp of the video sequence. 332 | """ 333 | 334 | # Seeking in the stream is imprecise. Thus, seek to an earlier pts by a 335 | # margin pts. 
336 | margin = 1024 337 | seek_offset = max(start_pts - margin, 0) 338 | container.seek(int(seek_offset), any_frame=False, backward=True, stream=stream) 339 | frames = {} 340 | max_pts = 0 341 | for frame in container.decode(**stream_name): 342 | max_pts = max(max_pts, frame.pts) 343 | if frame.pts >= start_pts and frame.pts <= end_pts: 344 | frames[frame.pts] = frame 345 | elif frame.pts > end_pts: 346 | break 347 | 348 | result = [frames[pts] for pts in sorted(frames)] 349 | return result, max_pts 350 | -------------------------------------------------------------------------------- /encoded_video/utils.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | from typing import Any, Dict, Optional 3 | 4 | import av 5 | import numpy as np 6 | 7 | from encoded_video import EncodedVideo 8 | 9 | 10 | def write_video( 11 | filename: str, 12 | video_array: np.ndarray, 13 | fps: float, 14 | video_codec: str = "libx264", 15 | options: Optional[Dict[str, Any]] = None, 16 | audio_array: Optional[np.ndarray] = None, 17 | audio_fps: Optional[float] = None, 18 | audio_codec: Optional[str] = None, 19 | audio_options: Optional[Dict[str, Any]] = None, 20 | ) -> None: 21 | """ 22 | Writes a 4d tensor in [T, H, W, C] format in a video file 23 | 24 | Args: 25 | filename (str): path where the video will be saved 26 | video_array (Tensor[T, H, W, C]): tensor containing the individual frames, 27 | as a uint8 tensor in [T, H, W, C] format 28 | fps (Number): video frames per second 29 | video_codec (str): the name of the video codec, i.e. "libx264", "h264", etc. 30 | options (Dict): dictionary containing options to be passed into the PyAV video stream 31 | audio_array (Tensor[C, N]): tensor containing the audio, where C is the number of channels 32 | and N is the number of samples 33 | audio_fps (Number): audio sample rate, typically 44100 or 48000 34 | audio_codec (str): the name of the audio codec, i.e. "mp3", "aac", etc. 
35 | audio_options (Dict): dictionary containing options to be passed into the PyAV audio stream 36 | """ 37 | # import torch 38 | # video_array = torch.as_tensor(video_array, dtype=torch.uint8).numpy() 39 | video_array = video_array.astype(np.uint8) 40 | 41 | # PyAV does not support fractional frame rates and will raise an 42 | # OverflowException otherwise, so round fps to the nearest integer. 43 | if isinstance(fps, float): 44 | fps = np.round(fps) 45 | 46 | with av.open(filename, mode="w") as container: 47 | stream = container.add_stream(video_codec, rate=fps) 48 | stream.width = video_array.shape[2] 49 | stream.height = video_array.shape[1] 50 | stream.pix_fmt = "yuv420p" if video_codec != "libx264rgb" else "rgb24" 51 | stream.options = options or {} 52 | 53 | if audio_array is not None: 54 | audio_format_dtypes = { 55 | "dbl": "<f8", 56 | "dblp": "<f8", 57 | "flt": "<f4", 58 | "fltp": "<f4", 59 | "s16": "<i2", 60 | "s16p": "<i2", 61 | "s32": "<i4", 62 | "s32p": "<i4", 63 | "u8": "u1", 64 | "u8p": "u1", 65 | } 66 | a_stream = container.add_stream(audio_codec, rate=audio_fps) 67 | a_stream.options = audio_options or {} 68 | 69 | num_channels = audio_array.shape[0] 70 | audio_layout = "stereo" if num_channels > 1 else "mono" 71 | audio_sample_fmt = container.streams.audio[0].format.name 72 | 73 | format_dtype = np.dtype(audio_format_dtypes[audio_sample_fmt]) 74 | audio_array = audio_array.astype(format_dtype) 75 | 76 | frame = av.AudioFrame.from_ndarray( 77 | audio_array, format=audio_sample_fmt, layout=audio_layout 78 | ) 79 | 80 | frame.sample_rate = audio_fps 81 | 82 | for packet in a_stream.encode(frame): 83 | container.mux(packet) 84 | 85 | for packet in a_stream.encode(): 86 | container.mux(packet) 87 | 88 | for img in video_array: 89 | frame = av.VideoFrame.from_ndarray(img, format="rgb24") 90 | frame.pict_type = "NONE" 91 | for packet in stream.encode(frame): 92 | container.mux(packet) 93 | 94 | # Flush stream 95 | for packet in stream.encode(): 96 | container.mux(packet) 97 | 98 | 99 | def video_to_bytes( 100 | video_array: np.ndarray, 101 | fps: float, 102 | video_codec: str = "libx264", 103 | options: Optional[Dict[str, Any]] = None, 104 | audio_array: Optional[np.ndarray] = None, 105 | audio_fps: Optional[float] = None, 106 | audio_codec: Optional[str] = None, 107 | audio_options: Optional[Dict[str, Any]] = None, 108 | ) -> bytes: 
109 | 110 | """ 111 | Writes a 4d tensor in [T, H, W, C] format to buffer 112 | 113 | Args: 114 | video_array (Tensor[T, H, W, C]): tensor containing the individual frames, 115 | as a uint8 tensor in [T, H, W, C] format 116 | fps (Number): video frames per second 117 | video_codec (str): the name of the video codec, i.e. "libx264", "h264", etc. 118 | options (Dict): dictionary containing options to be passed into the PyAV video stream 119 | audio_array (Tensor[C, N]): tensor containing the audio, where C is the number of channels 120 | and N is the number of samples 121 | audio_fps (Number): audio sample rate, typically 44100 or 48000 122 | audio_codec (str): the name of the audio codec, i.e. "mp3", "aac", etc. 123 | audio_options (Dict): dictionary containing options to be passed into the PyAV audio stream 124 | """ 125 | 126 | bytes_mp4 = bytes() 127 | out_file = BytesIO(bytes_mp4) 128 | 129 | # Add dummy file name to stream, as write_video will be looking for it 130 | out_file.name = "out.mp4" 131 | 132 | # writes to out_file 133 | write_video( 134 | out_file, 135 | video_array, 136 | fps, 137 | video_codec, 138 | options, 139 | audio_array, 140 | audio_fps, 141 | audio_codec, 142 | audio_options, 143 | ) 144 | 145 | # Return the bytes 146 | return out_file.getvalue() 147 | 148 | 149 | def bytes_to_video(bpayload) -> Dict[str, Any]: 150 | """Take in memory video bytes and return a video clip dict containing frames, audio, and metadata""" 151 | vid = EncodedVideo(BytesIO(bpayload)) 152 | clip = vid.get_clip(0, vid.duration) 153 | clip["duration"] = vid.duration 154 | clip["fps"] = float(vid._container.streams.video[0].average_rate) 155 | clip["audio_fps"] = ( 156 | None if not vid._has_audio else vid._container.streams.audio[0].sample_rate 157 | ) 158 | return clip 159 | 160 | 161 | def read_video(filepath): 162 | """Read a video from file""" 163 | with open(filepath, "rb") as f: 164 | bpayload = f.read() 165 | return bytes_to_video(bpayload) 166 | 
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | av 2 | iopath 3 | numpy 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | with open("requirements.txt", "r") as f: 4 | requirements = f.read().splitlines() 5 | 6 | setup( 7 | name='encoded-video', 8 | packages=find_packages(exclude=['examples']), 9 | version='0.0.2', 10 | # license=, # TODO - codebase heavily draws from pytorchvideo. need to issue license correctly. 11 | description='Video utilities', 12 | author='Nathan Raw', 13 | author_email='naterawdata@gmail.com', 14 | url='https://github.com/nateraw/encoded-video', 15 | install_requires=requirements, 16 | ) 17 | --------------------------------------------------------------------------------
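The pts helpers in `encoded_video/encoded_video.py` convert between wall-clock seconds and a stream's presentation-timestamp units. A minimal, self-contained sketch of that round trip is below; the logic mirrors `secs_to_pts`/`pts_to_secs` above, but the `time_base` of 1/1024 and `start_pts` of 900 are made-up example values (real video streams commonly use time bases like 1/90000):

```python
import math
from fractions import Fraction

def secs_to_pts(time_in_seconds, time_base, start_pts):
    # Convert seconds to pts units: divide by the time base, offset by start_pts.
    if time_in_seconds == math.inf:
        return math.inf
    return int(time_in_seconds / float(time_base)) + start_pts

def pts_to_secs(pts, time_base, start_pts):
    # Invert the conversion: remove the start offset, multiply by the time base.
    if pts == math.inf:
        return math.inf
    return (pts - start_pts) * float(time_base)

# Example stream parameters (hypothetical, chosen so the arithmetic is exact).
time_base = Fraction(1, 1024)
start_pts = 900

pts = secs_to_pts(2.5, time_base, start_pts)
secs = pts_to_secs(pts, time_base, start_pts)
print(pts, secs)  # 3460 2.5
```

Note the `int()` truncation in `secs_to_pts`: pts values are inherently quantized, which is one reason `_pyav_decode_stream` seeks to an earlier timestamp by a fixed margin before filtering frames by pts.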