├── .github
└── workflows
│ └── python-publish.yml
├── .gitignore
├── Makefile
├── README.md
├── docker
└── Dockerfile
├── encoded_video
├── __init__.py
├── encoded_video.py
└── utils.py
├── examples
└── encoded_video_demo.ipynb
├── requirements.txt
└── setup.py
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
# This workflow will upload a Python Package using Twine when a release is created
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries

name: Upload Python Package

on:
  release:
    types: [created]

jobs:
  deploy:

    runs-on: ubuntu-latest

    steps:
    - uses: actions/checkout@v2
    - name: Set up Python
      uses: actions/setup-python@v2
      with:
        python-version: '3.x'
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install setuptools wheel twine
    - name: Build and publish
      env:
        # PyPI credentials are supplied via repository secrets.
        TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
        TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
      run: |
        python setup.py sdist bdist_wheel
        twine upload dist/*
32 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 | examples/
132 | .DS_Store
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
# Declare every non-file target as phony so a stray file named e.g. "test"
# or "build" cannot shadow the target.
.PHONY: help build bash quality style test

DOCKER_FILE=docker/Dockerfile
PYTHON_VERSION?=3.7
# Unquoted pwd: $(shell) runs the command directly; the quotes added nothing.
SRC?=$(shell pwd)

check_dirs := encoded_video examples

help:
	@cat Makefile

build:
	docker build -t encoded-video --build-arg python_version=$(PYTHON_VERSION) -f $(DOCKER_FILE) .

bash: build
	docker run --rm -it -v $(SRC):/src/workspace encoded-video bash

quality:
	black --check $(check_dirs)
	isort --check-only $(check_dirs)
	flake8 $(check_dirs)

style:
	black $(check_dirs)
	isort $(check_dirs)

test:
	pytest -sv tests/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # encoded-video
2 |
3 |
4 |
5 | Utilities for serializing/deserializing videos w/ `pyav` and `numpy`.
6 |
7 | ## Purpose
8 |
9 | 1. Have a helpful API for working with videos
10 | 2. Liberate myself from relying on `torch` or `tensorflow` to do the above
11 | 3. Serialize/deserialize videos without writing directly to file (helpful for sending/receiving videos over APIs)
12 |
13 | ## Acknowledgments
14 |
15 | This is more or less a `torch`-less version of `EncodedVideo` from [`pytorchvideo`](https://github.com/facebookresearch/pytorchvideo).
16 |
17 | ## Setup
18 |
19 | ```
20 | pip install encoded-video
21 | ```
22 |
23 | ## Usage
24 |
25 | ```python
26 | import numpy as np
27 | from encoded_video import bytes_to_video, read_video, video_to_bytes
28 |
29 | vid = read_video('archery.mp4')
30 | video_arr = vid['video'] # (T, H, W, C)
31 | audio_arr = vid['audio'] # (S,)
32 |
33 | out_bytes = video_to_bytes(
34 | video_arr,
35 | fps=30,
36 | audio_array=np.expand_dims(audio_arr, 0),
37 | audio_fps=vid['audio_fps'],
38 | audio_codec='aac'
39 | )
40 |
41 | restored_video = bytes_to_video(out_bytes)
42 | ```
43 |
--------------------------------------------------------------------------------
/docker/Dockerfile:
--------------------------------------------------------------------------------
FROM debian:latest
LABEL maintainer="naterawdata@gmail.com"

ENV LANG=C.UTF-8 LC_ALL=C.UTF-8

# Install system packages
RUN apt-get update && apt-get install -y --no-install-recommends \
      bzip2 \
      git \
      libglib2.0-0 \
      libxext6 \
      libsm6 \
      libxrender1 \
      wget \
      libffi-dev \
      libxml2-dev \
      libxslt-dev \
      lib32z1-dev \
      vim \
      python-dev \
      pkg-config \
      libssl-dev && \
    rm -rf /var/lib/apt/lists/*

# FFmpeg development libraries required to build/run PyAV.
RUN apt-get update && apt-get upgrade -y && apt-get install -y \
    libavformat-dev libavcodec-dev libavdevice-dev \
    libavutil-dev libswscale-dev libswresample-dev libavfilter-dev


# General dependencies
# RUN apt-get install -y python-dev pkg-config

# Library components
# RUN apt-get install -y \
#    libavformat-dev libavcodec-dev libavdevice-dev \
#    libavutil-dev libswscale-dev libswresample-dev libavfilter-dev

# Install conda
ENV CONDA_DIR /opt/conda
ENV PATH $CONDA_DIR/bin:$PATH
ENV CONDA_INSTALLER="Miniconda3-4.5.11-Linux-x86_64.sh"

RUN wget --quiet --no-check-certificate https://repo.continuum.io/miniconda/${CONDA_INSTALLER} && \
    /bin/bash /${CONDA_INSTALLER} -f -b -p $CONDA_DIR && \
    rm ${CONDA_INSTALLER} && \
    echo export PATH=$CONDA_DIR/bin:'$PATH' > /etc/profile.d/conda.sh

# Make requirements accessible to container
COPY requirements.txt /tmp/

# Install specific python version and dependencies
ARG python_version=3.7
RUN conda install -y python=${python_version} && \
    pip install --upgrade pip && \
    pip install -r /tmp/requirements.txt

# Add a pythonpath - gives python context to workdir
ENV PYTHONPATH=/src/workspace

WORKDIR /src/workspace/

# Copy over entire source dir to workspace
COPY . /src/workspace/

# Install package
RUN python setup.py develop

CMD ["/bin/bash"]
--------------------------------------------------------------------------------
/encoded_video/__init__.py:
--------------------------------------------------------------------------------
from .encoded_video import EncodedVideo
from .utils import bytes_to_video, read_video, video_to_bytes, write_video

# Package version; keep in sync with the version declared in setup.py.
__version__ = "0.0.2"
5 |
--------------------------------------------------------------------------------
/encoded_video/encoded_video.py:
--------------------------------------------------------------------------------
1 | import io
2 | import logging
3 | import math
4 | import pathlib
5 | from typing import BinaryIO, Dict, List, Optional, Tuple
6 |
7 | import av
8 | import numpy as np
9 | from iopath.common.file_io import g_pathmgr
10 |
11 | logger = logging.getLogger(__name__)
12 |
13 | av.logging.set_level(av.logging.ERROR)
14 | if not hasattr(av.video.frame.VideoFrame, "pict_type"):
15 | av = ImportError(
16 | """\
17 | Your version of PyAV is too old for the necessary video operations in torchvision.
18 | If you are on Python 3.5, you will have to build from source (the conda-forge
19 | packages are not up-to-date). See
20 | https://github.com/mikeboers/PyAV#installation for instructions on how to
21 | install PyAV on your system.
22 | """
23 | )
24 |
25 |
def thwc_to_cthw(data: np.ndarray) -> np.ndarray:
    """
    Permute an array from (time, height, width, channel) to
    (channel, time, height, width) order.
    """
    # Equivalent of torch's permute(3, 0, 1, 2) for a numpy array.
    return data.transpose(3, 0, 1, 2)
33 |
34 |
def secs_to_pts(time_in_seconds: float, time_base: float, start_pts: float) -> float:
    """
    Converts a time (in seconds) to a presentation timestamp in the given
    time base, shifted by the stream's start_pts offset.

    Returns:
        pts (float): The time in the given time base.
    """
    # Infinity has no finite pts representation; pass it straight through.
    if time_in_seconds == math.inf:
        return math.inf

    # Truncate toward zero to land on a whole pts, then apply the offset.
    return int(time_in_seconds / float(time_base)) + start_pts
48 |
49 |
def pts_to_secs(time_in_seconds: float, time_base: float, start_pts: float) -> float:
    """
    Converts a presentation time with the given time base and start_pts offset
    to seconds.

    NOTE(review): despite its name, the first argument is a presentation
    timestamp (pts), not seconds — every caller in this module passes a pts
    value. Renaming it would break keyword callers, so it is only documented.

    Returns:
        time_in_seconds (float): The corresponding time in seconds.
    """
    # Infinity has no finite pts representation; pass it straight through.
    if time_in_seconds == math.inf:
        return math.inf

    return (time_in_seconds - start_pts) * float(time_base)
61 |
62 |
class EncodedVideo(object):
    """
    EncodedVideo is an abstraction for accessing clips from an encoded video using
    PyAV as the decoding backend. It supports selective decoding when header information
    is available.
    """

    def __init__(
        self,
        file: BinaryIO,
        video_name: Optional[str] = None,
        decode_audio: bool = True,
    ) -> None:
        """
        Args:
            file (BinaryIO): a file-like object (e.g. io.BytesIO or io.StringIO) that
                contains the encoded video.
            video_name (Optional[str]): a name used only for error/log messages.
            decode_audio (bool): if False, audio streams are ignored entirely.

        Raises:
            RuntimeError: if the container cannot be opened, contains no video
                stream, or (when headers carry no duration) full decoding fails.
        """
        self._video_name = video_name
        self._decode_audio = decode_audio

        try:
            self._container = av.open(file)
        except Exception as e:
            raise RuntimeError(f"Failed to open video {video_name}. {e}")

        if self._container is None or len(self._container.streams.video) == 0:
            raise RuntimeError(f"Video stream not found {video_name}")

        # Retrieve video header information if available.
        video_stream = self._container.streams.video[0]
        self._video_time_base = video_stream.time_base
        # start_time may be absent from the header; default the offset to 0.
        self._video_start_pts = video_stream.start_time
        if self._video_start_pts is None:
            self._video_start_pts = 0.0

        video_duration = video_stream.duration

        # Retrieve audio header information if available.
        audio_duration = None
        # Truthy when decode_audio is set and the container exposes at least
        # one audio stream (PyAV yields an empty stream collection otherwise).
        self._has_audio = None
        if self._decode_audio:
            self._has_audio = self._container.streams.audio
            if self._has_audio:
                self._audio_time_base = self._container.streams.audio[0].time_base
                self._audio_start_pts = self._container.streams.audio[0].start_time
                if self._audio_start_pts is None:
                    self._audio_start_pts = 0.0

                audio_duration = self._container.streams.audio[0].duration

        # If duration isn't found in header the whole video is decoded to
        # determine the duration.
        self._video, self._audio, self._selective_decoding = (None, None, True)
        if audio_duration is None and video_duration is None:
            self._selective_decoding = False
            self._video, self._audio = self._pyav_decode_video()
            if self._video is None:
                raise RuntimeError("Unable to decode video stream")

            # Decoded entries are (frame_data, pts) tuples, so the last
            # entry's pts serves as the stream duration.
            video_duration = self._video[-1][1]
            if self._audio is not None:
                audio_duration = self._audio[-1][1]

        # Take the largest duration of either the video or audio stream.
        if audio_duration is not None and video_duration is not None:
            self._duration = max(
                pts_to_secs(
                    video_duration, self._video_time_base, self._video_start_pts
                ),
                pts_to_secs(
                    audio_duration, self._audio_time_base, self._audio_start_pts
                ),
            )
        elif video_duration is not None:
            self._duration = pts_to_secs(
                video_duration, self._video_time_base, self._video_start_pts
            )

        elif audio_duration is not None:
            self._duration = pts_to_secs(
                audio_duration, self._audio_time_base, self._audio_start_pts
            )

    @classmethod
    def from_path(
        cls, file_path: str, decode_audio: bool = True, decoder: str = "pyav"
    ):
        """
        Fetches the given video path using PathManager (allowing remote uris to be
        fetched) and constructs the EncodedVideo object.

        Args:
            file_path (str): a PathManager file-path.
            decode_audio (bool): forwarded to the constructor.
            decoder (str): accepted for API compatibility; only "pyav" is
                implemented and the value is currently ignored.
        """
        # We read the file with PathManager so that we can read from remote uris.
        with g_pathmgr.open(file_path, "rb") as fh:
            video_file = io.BytesIO(fh.read())

        return cls(video_file, pathlib.Path(file_path).name, decode_audio)

    @property
    def name(self) -> Optional[str]:
        """
        Returns:
            name: the name of the stored video if set.
        """
        return self._video_name

    @property
    def duration(self) -> float:
        """
        Returns:
            duration: the video's duration/end-time in seconds.
        """
        return self._duration

    def get_clip(
        self, start_sec: float, end_sec: float
    ) -> Dict[str, Optional[np.ndarray]]:
        """
        Retrieves frames from the encoded video at the specified start and end times
        in seconds (the video always starts at 0 seconds).

        Args:
            start_sec (float): the clip start time in seconds
            end_sec (float): the clip end time in seconds
        Returns:
            clip_data:
                A dictionary mapping the entries at "video" and "audio" to numpy arrays.

                "video": An array of the clip's RGB frames with shape:
                (time, height, width, channel), since frames are stacked in
                their decoded (H, W, C) layout. The frames are of type
                np.float32 with values in the range [0 - 255].

                "audio": An array of the clip's audio samples with shape:
                (samples,), of type np.float32 (values are raw decoded
                samples, not limited to [0 - 255]).

                Returns None if no video or audio found within time range.

        """
        if self._selective_decoding:
            self._video, self._audio = self._pyav_decode_video(start_sec, end_sec)

        video_frames = None
        if self._video is not None:
            video_start_pts = secs_to_pts(
                start_sec, self._video_time_base, self._video_start_pts
            )
            video_end_pts = secs_to_pts(
                end_sec, self._video_time_base, self._video_start_pts
            )

            # Keep only frames whose pts falls inside the requested window.
            video_frames = [
                f
                for f, pts in self._video
                if pts >= video_start_pts and pts <= video_end_pts
            ]

        audio_samples = None
        if self._has_audio and self._audio is not None:
            audio_start_pts = secs_to_pts(
                start_sec, self._audio_time_base, self._audio_start_pts
            )
            audio_end_pts = secs_to_pts(
                end_sec, self._audio_time_base, self._audio_start_pts
            )
            audio_samples = [
                f
                for f, pts in self._audio
                if pts >= audio_start_pts and pts <= audio_end_pts
            ]
            # Per-frame sample arrays are joined into one flat sample array.
            audio_samples = np.concatenate(audio_samples, axis=0)
            audio_samples = audio_samples.astype(np.float32)

        if video_frames is None or len(video_frames) == 0:
            logger.debug(
                f"No video found within {start_sec} and {end_sec} seconds. "
                f"Video starts at time 0 and ends at {self.duration}."
            )

            video_frames = None

        if video_frames is not None:
            video_frames = np.stack(video_frames).astype(np.float32)

        return {
            "video": video_frames,
            "audio": audio_samples,
        }

    def close(self):
        """
        Closes the internal video container.
        """
        if self._container is not None:
            self._container.close()

    def _pyav_decode_video(
        self, start_secs: float = 0.0, end_secs: float = math.inf
    ) -> Tuple[Optional[List], Optional[List]]:
        """
        Selectively decodes a video between start_pts and end_pts in time units of the
        self._video's timebase.

        Returns:
            A (video, audio) pair where each element is a list of
            (ndarray, pts) tuples, or None if that stream yielded no frames
            or decoding failed.
        """
        video_and_pts = None
        audio_and_pts = None
        try:
            pyav_video_frames, _ = _pyav_decode_stream(
                self._container,
                secs_to_pts(start_secs, self._video_time_base, self._video_start_pts),
                secs_to_pts(end_secs, self._video_time_base, self._video_start_pts),
                self._container.streams.video[0],
                {"video": 0},
            )
            if len(pyav_video_frames) > 0:
                # Convert each frame to an RGB (H, W, C) ndarray, keeping its pts.
                video_and_pts = [
                    (frame.to_rgb().to_ndarray(), frame.pts)
                    for frame in pyav_video_frames
                ]

            if self._has_audio:
                pyav_audio_frames, _ = _pyav_decode_stream(
                    self._container,
                    secs_to_pts(
                        start_secs, self._audio_time_base, self._audio_start_pts
                    ),
                    secs_to_pts(end_secs, self._audio_time_base, self._audio_start_pts),
                    self._container.streams.audio[0],
                    {"audio": 0},
                )

                if len(pyav_audio_frames) > 0:
                    # Downmix multi-channel audio to mono by averaging channels.
                    audio_and_pts = [
                        (
                            np.mean(frame.to_ndarray(), axis=0),
                            frame.pts,
                        )
                        for frame in pyav_audio_frames
                    ]

        except Exception as e:
            # Best-effort decode: failures are logged and (None, None)-style
            # partial results are returned rather than raised.
            logger.debug(f"Failed to decode video: {self._video_name}. {e}")

        return video_and_pts, audio_and_pts
309 |
310 |
def _pyav_decode_stream(
    container: av.container.input.InputContainer,
    start_pts: float,
    end_pts: float,
    stream: av.video.stream.VideoStream,
    stream_name: dict,
    buffer_size: int = 0,
) -> Tuple[List, float]:
    """
    Decode the video with PyAV decoder.
    Args:
        container (container): PyAV container.
        start_pts (int): the starting Presentation TimeStamp to fetch the
            video frames.
        end_pts (int): the ending Presentation TimeStamp of the decoded frames.
        stream (stream): PyAV stream.
        stream_name (dict): a dictionary of streams. For example, {"video": 0}
            means video stream at stream index 0.
        buffer_size (int): currently unused; kept for API compatibility.
    Returns:
        result (list): list of decoded frames.
        max_pts (int): max Presentation TimeStamp of the video sequence.
    """

    # Seeking in the stream is imprecise. Thus, seek to an earlier pts by a
    # margin pts.
    margin = 1024
    seek_offset = max(start_pts - margin, 0)
    container.seek(int(seek_offset), any_frame=False, backward=True, stream=stream)
    frames = {}
    max_pts = 0
    for frame in container.decode(**stream_name):
        # NOTE(review): frame.pts can be None for some malformed streams,
        # which would raise a TypeError here — confirm callers never hit that.
        max_pts = max(max_pts, frame.pts)
        if frame.pts >= start_pts and frame.pts <= end_pts:
            frames[frame.pts] = frame
        elif frame.pts > end_pts:
            # Frames arrive in pts order past the seek point, so we can stop.
            break

    # Frames decoded after a seek may start before start_pts; the dict keyed
    # by pts is re-sorted to return frames in presentation order.
    result = [frames[pts] for pts in sorted(frames)]
    return result, max_pts
350 |
--------------------------------------------------------------------------------
/encoded_video/utils.py:
--------------------------------------------------------------------------------
1 | from io import BytesIO
2 | from typing import Any, Dict, Optional
3 |
4 | import av
5 | import numpy as np
6 |
7 | from encoded_video import EncodedVideo
8 |
9 |
def write_video(
    filename: str,
    video_array: np.ndarray,
    fps: float,
    video_codec: str = "libx264",
    options: Optional[Dict[str, Any]] = None,
    audio_array: Optional[np.ndarray] = None,
    audio_fps: Optional[float] = None,
    audio_codec: Optional[str] = None,
    audio_options: Optional[Dict[str, Any]] = None,
) -> None:
    """
    Writes a 4d tensor in [T, H, W, C] format in a video file

    Args:
        filename (str): path where the video will be saved
        video_array (Tensor[T, H, W, C]): tensor containing the individual frames,
            as a uint8 tensor in [T, H, W, C] format
        fps (Number): video frames per second
        video_codec (str): the name of the video codec, i.e. "libx264", "h264", etc.
        options (Dict): dictionary containing options to be passed into the PyAV video stream
        audio_array (Tensor[C, N]): tensor containing the audio, where C is the number of channels
            and N is the number of samples
        audio_fps (Number): audio sample rate, typically 44100 or 48000
        audio_codec (str): the name of the audio codec, i.e. "mp3", "aac", etc.
        audio_options (Dict): dictionary containing options to be passed into the PyAV audio stream
    """
    video_array = video_array.astype(np.uint8)

    # PyAV does not support floating point numbers with decimal point
    # and will throw OverflowException in case this is not the case
    if isinstance(fps, float):
        fps = np.round(fps)

    with av.open(filename, mode="w") as container:
        stream = container.add_stream(video_codec, rate=fps)
        stream.width = video_array.shape[2]
        stream.height = video_array.shape[1]
        # libx264rgb encodes RGB directly; everything else expects yuv420p.
        stream.pix_fmt = "yuv420p" if video_codec != "libx264rgb" else "rgb24"
        stream.options = options or {}

        if audio_array is not None:
            # Map PyAV audio sample format names to the numpy dtypes that
            # AudioFrame.from_ndarray expects (little-endian where relevant).
            # Reconstructed from torchvision.io.write_video, which this
            # function mirrors; the original table was corrupted in-source.
            audio_format_dtypes = {
                "dbl": "<f8",
                "dblp": "<f8",
                "flt": "<f4",
                "fltp": "<f4",
                "s16": "<i2",
                "s16p": "<i2",
                "s32": "<i4",
                "s32p": "<i4",
                "u8": "u1",
                "u8p": "u1",
            }
            a_stream = container.add_stream(audio_codec, rate=audio_fps)
            a_stream.options = audio_options or {}

            # Channel count (first axis of [C, N]) picks the channel layout.
            num_channels = audio_array.shape[0]
            audio_layout = "stereo" if num_channels > 1 else "mono"
            audio_sample_fmt = container.streams.audio[0].format.name

            format_dtype = np.dtype(audio_format_dtypes[audio_sample_fmt])
            audio_array = audio_array.astype(format_dtype)

            frame = av.AudioFrame.from_ndarray(
                audio_array, format=audio_sample_fmt, layout=audio_layout
            )

            frame.sample_rate = audio_fps

            for packet in a_stream.encode(frame):
                container.mux(packet)

            # Flush any buffered audio packets.
            for packet in a_stream.encode():
                container.mux(packet)

        for img in video_array:
            frame = av.VideoFrame.from_ndarray(img, format="rgb24")
            frame.pict_type = "NONE"
            for packet in stream.encode(frame):
                container.mux(packet)

        # Flush stream
        for packet in stream.encode():
            container.mux(packet)
97 |
98 |
def video_to_bytes(
    video_array: np.ndarray,
    fps: float,
    video_codec: str = "libx264",
    options: Optional[Dict[str, Any]] = None,
    audio_array: Optional[np.ndarray] = None,
    audio_fps: Optional[float] = None,
    audio_codec: Optional[str] = None,
    audio_options: Optional[Dict[str, Any]] = None,
) -> bytes:

    """
    Encodes a 4d tensor in [T, H, W, C] format into an in-memory video payload.

    Args:
        video_array (Tensor[T, H, W, C]): tensor containing the individual frames,
            as a uint8 tensor in [T, H, W, C] format
        fps (Number): video frames per second
        video_codec (str): the name of the video codec, i.e. "libx264", "h264", etc.
        options (Dict): dictionary containing options to be passed into the PyAV video stream
        audio_array (Tensor[C, N]): tensor containing the audio, where C is the number of channels
            and N is the number of samples
        audio_fps (Number): audio sample rate, typically 44100 or 48000
        audio_codec (str): the name of the audio codec, i.e. "mp3", "aac", etc.
        audio_options (Dict): dictionary containing options to be passed into the PyAV audio stream
    """

    buffer = BytesIO(bytes())

    # write_video infers the container format from the file name, so the
    # in-memory buffer is given a dummy .mp4 name.
    buffer.name = "out.mp4"

    # Encode directly into the buffer instead of touching the filesystem.
    write_video(
        buffer,
        video_array,
        fps,
        video_codec,
        options,
        audio_array,
        audio_fps,
        audio_codec,
        audio_options,
    )

    return buffer.getvalue()
147 |
148 |
def bytes_to_video(bpayload) -> Dict[str, Any]:
    """Decode in-memory video bytes into a clip dict holding frames, audio, and metadata."""
    video = EncodedVideo(BytesIO(bpayload))
    clip = video.get_clip(0, video.duration)
    clip["duration"] = video.duration
    # NOTE: reaches into EncodedVideo's private container for stream metadata.
    clip["fps"] = float(video._container.streams.video[0].average_rate)
    if video._has_audio:
        clip["audio_fps"] = video._container.streams.audio[0].sample_rate
    else:
        clip["audio_fps"] = None
    return clip
159 |
160 |
def read_video(filepath):
    """Load a video file from disk and decode it into a clip dict."""
    with open(filepath, "rb") as video_file:
        payload = video_file.read()
    return bytes_to_video(payload)
166 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | av
2 | iopath
3 | numpy
4 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import find_packages, setup

# Runtime dependencies are read from requirements.txt so pip installs and
# the requirements file stay in sync.
with open("requirements.txt", "r") as f:
    requirements = f.read().splitlines()

setup(
    name='encoded-video',
    packages=find_packages(exclude=['examples']),
    version='0.0.2',  # keep in sync with encoded_video.__version__
    # license=, # TODO - codebase heavily draws from pytorchvideo. need to issue license correctly.
    description='Video utilities',
    author='Nathan Raw',
    author_email='naterawdata@gmail.com',
    url='https://github.com/nateraw/encoded-video',
    install_requires=requirements,
)
17 |
--------------------------------------------------------------------------------