├── .github
    └── workflows
    │   ├── docker-build.yml
    │   └── python-app.yml
├── .gitignore
├── .ruff.toml
├── .streamlit
    └── config.toml
├── Dockerfile
├── LICENSE
├── README.md
├── data
    └── input
    │   ├── blank.csv
    │   ├── blank.jpg
    │   ├── blank.zip
    │   └── test_input.txt
├── docker-compose.yml
├── requirements.test
├── requirements.txt
├── tests
    ├── __init__.py
    └── test_app.py
└── ytdatakit
    ├── __init__.py
    ├── about.py
    ├── about
        ├── __init__.py
        └── app.py
    ├── app.py
    ├── youtube_channel_downloader
        ├── __init__.py
        ├── app.py
        ├── callbacks.py
        ├── config.py
        ├── state.py
        └── yt_channel_download.py
    ├── youtube_downloader
        ├── __init__.py
        ├── app.py
        ├── callbacks.py
        ├── config.py
        ├── state.py
        └── yt_download.py
    ├── youtube_thumbnail_downloader
        ├── __init__.py
        ├── app.py
        ├── callbacks.py
        ├── config.py
        ├── state.py
        ├── yt_thumbnail_downloader.py
        └── zip.py
    └── youtube_transcript_downloader
        ├── __init__.py
        ├── app.py
        ├── callbacks.py
        ├── config.py
        ├── state.py
        └── yt_transcript_download.py


/.github/workflows/docker-build.yml:
--------------------------------------------------------------------------------
 1 | name: 'DockerBuild'
 2 | 
 3 | # Build and publish the multi-arch image whenever a version tag (v*) is pushed.
 4 | on:
 5 |   push:
 6 |     tags:
 7 |       - 'v*'
 8 | 
 9 | jobs:
10 |   Build_And_Push:
11 |     runs-on: ubuntu-22.04
12 |     permissions:
13 |       contents: read
14 |       packages: write
15 | 
16 |     steps:
17 |       - name: Checkout
18 |         uses: actions/checkout@v4
19 | 
20 |       - name: Log in to Github Docker Image Registry
21 |         uses: docker/login-action@v3
22 |         with:
23 |           registry: ghcr.io
24 |           username: ${{ github.actor }}
25 |           password: ${{ secrets.TOKEN_GITHUB }}
26 | 
27 |       # Uncomment to use Docker Hub
28 |       # - name: Login to Docker Hub
29 |       #   uses: docker/login-action@v3
30 |       #   with:
31 |       #     username: ${{ secrets.USERNAME_DOCKERHUB }}
32 |       #     password: ${{ secrets.TOKEN_DOCKERHUB }}
33 | 
34 |       - name: Docker Meta
35 |         id: meta
36 |         uses: docker/metadata-action@v5
37 |         with:
38 |           # Publish under the repository owner's namespace, not under whoever
39 |           # pushed the tag, so the image path stays stable across maintainers.
40 |           images: |
41 |             ghcr.io/${{ github.repository_owner }}/ytdatakit
42 |           # ${{ secrets.USERNAME_DOCKERHUB }}/ytdatakit
43 |           tags: type=ref,event=tag
44 |           flavor: latest=true
45 | 
46 |       - name: Set up QEMU
47 |         uses: docker/setup-qemu-action@v3
48 | 
49 |       - name: Set Buildx
50 |         uses: docker/setup-buildx-action@v3
51 | 
52 |       - name: Build and Upload for AMD64 and ARM64
53 |         uses: docker/build-push-action@v5
54 |         with:
55 |           context: .
56 |           platforms: linux/amd64,linux/arm64
57 |           push: true
58 |           tags: ${{ steps.meta.outputs.tags }}
59 |           labels: ${{ steps.meta.outputs.labels }}


--------------------------------------------------------------------------------
/.github/workflows/python-app.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python
 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
 3 | 
 4 | name: Python application
 5 | 
 6 | # Changes that only touch README, CONTRIBUTING or LICENSE files do not trigger a run.
 7 | on:
 8 |   push:
 9 |     branches: [ "main" ]
10 |     paths-ignore:
11 |       - '**/README.md'
12 |       - '**/CONTRIBUTING.md'
13 |       - '**LICENSE'
14 |   pull_request:
15 |     branches: [ "main" ]
16 |     paths-ignore:
17 |       - '**/README.md'
18 |       - '**/CONTRIBUTING.md'
19 |       - '**LICENSE'
20 | 
21 | jobs:
22 |   ruff:
23 |     name: lint with ruff
24 |     runs-on: ubuntu-latest
25 |     timeout-minutes: 3
26 |     steps:
27 |       - uses: actions/checkout@v4
28 |       - uses: actions/setup-python@v5
29 |       - uses: chartboost/ruff-action@v1
30 |         with:
31 |           args: 'format --check'
32 |           config: .ruff.toml
33 |   test:
34 |     name: run pytest
35 |     runs-on: ubuntu-latest
36 |     timeout-minutes: 5
37 |     steps:
38 |       - name: Checkout code
39 |         uses: actions/checkout@v4
40 |       - name: Set up Python
41 |         uses: actions/setup-python@v5
42 |         with:
43 |           python-version: '3.10'
44 |       - name: Install dependencies
45 |         run: |
46 |           python -m pip install --upgrade pip
47 |           pip install -r requirements.test
48 |           pip install -r requirements.txt
49 |       - name: Run pytest
50 |         # PYTHONPATH=. lets the tests import the top-level `tests` and `ytdatakit` packages.
51 |         run: |
52 |           PYTHONPATH=. python3.10 -m pytest tests/test_app.py


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | scratch.ipynb
  2 | *.mp4a
  3 | *.mp3
  4 | *.mp4
  5 | venv*
  6 | *.db
  7 | *.faiss
  8 | .DS_Store
  9 | ._.DS_Store
 10 | **/.DS_Store
 11 | **/._.DS_Store
 12 | **/.env*
 13 | bug_reports/
 14 | .ruff_cache/
 15 | .vscode
 16 | notebook_tests/
 17 | scratch_notebooks/
 18 | demos/
 19 | release_notes/
 20 | site/
 21 | tests/test_files/text/test_preprocessed/*
 22 | !.ruff.toml
 23 | push_pypi.sh
 24 | 
 25 | 
 26 | # Byte-compiled / optimized / DLL files
 27 | __pycache__/
 28 | *.py[cod]
 29 | *$py.class
 30 | 
 31 | # C extensions
 32 | *.so
 33 | 
 34 | # Distribution / packaging
 35 | .Python
 36 | build/
 37 | develop-eggs/
 38 | dist/
 39 | downloads/
 40 | eggs/
 41 | .eggs/
 42 | lib/
 43 | lib64/
 44 | parts/
 45 | sdist/
 46 | var/
 47 | wheels/
 48 | share/python-wheels/
 49 | *.egg-info/
 50 | .installed.cfg
 51 | *.egg
 52 | MANIFEST
 53 | 
 54 | # PyInstaller
 55 | #  Usually these files are written by a python script from a template
 56 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 57 | *.manifest
 58 | *.spec
 59 | 
 60 | # Installer logs
 61 | pip-log.txt
 62 | pip-delete-this-directory.txt
 63 | 
 64 | # Unit test / coverage reports
 65 | htmlcov/
 66 | .tox/
 67 | .nox/
 68 | .coverage
 69 | .coverage.*
 70 | .cache
 71 | nosetests.xml
 72 | coverage.xml
 73 | *.cover
 74 | *.py,cover
 75 | .hypothesis/
 76 | .pytest_cache/
 77 | cover/
 78 | 
 79 | # Translations
 80 | *.mo
 81 | *.pot
 82 | 
 83 | # Django stuff:
 84 | *.log
 85 | local_settings.py
 86 | db.sqlite3
 87 | db.sqlite3-journal
 88 | 
 89 | # Flask stuff:
 90 | instance/
 91 | .webassets-cache
 92 | 
 93 | # Scrapy stuff:
 94 | .scrapy
 95 | 
 96 | # Sphinx documentation
 97 | docs/_build/
 98 | 
 99 | # PyBuilder
100 | .pybuilder/
101 | target/
102 | 
103 | # Jupyter Notebook
104 | .ipynb_checkpoints
105 | 
106 | # IPython
107 | profile_default/
108 | ipython_config.py
109 | 
110 | # pyenv
111 | #   For a library or package, you might want to ignore these files since the code is
112 | #   intended to run in multiple environments; otherwise, check them in:
113 | # .python-version
114 | 
115 | # pipenv
116 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
117 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
118 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
119 | #   install all needed dependencies.
120 | #Pipfile.lock
121 | 
122 | # poetry
123 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
124 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
125 | #   commonly ignored for libraries.
126 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
127 | #poetry.lock
128 | 
129 | # pdm
130 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
131 | #pdm.lock
132 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
133 | #   in version control.
134 | #   https://pdm.fming.dev/#use-with-ide
135 | .pdm.toml
136 | 
137 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
138 | __pypackages__/
139 | 
140 | # Celery stuff
141 | celerybeat-schedule
142 | celerybeat.pid
143 | 
144 | # SageMath parsed files
145 | *.sage.py
146 | 
147 | # Environments
148 | .venv
149 | venv/
150 | venv.bak/
151 | 
152 | # Spyder project settings
153 | .spyderproject
154 | .spyproject
155 | 
156 | # Rope project settings
157 | .ropeproject
158 | 
159 | # mkdocs documentation
160 | /site
161 | 
162 | # mypy
163 | .mypy_cache/
164 | .dmypy.json
165 | dmypy.json
166 | 
167 | # Pyre type checker
168 | .pyre/
169 | 
170 | # pytype static type analyzer
171 | .pytype/
172 | 
173 | # Cython debug symbols
174 | cython_debug/


--------------------------------------------------------------------------------
/.ruff.toml:
--------------------------------------------------------------------------------
 1 | line-length = 150
 2 | target-version = "py38"
 3 | lint.select = ["E", "W"]
 4 | lint.fixable = ["ALL"]
 5 | lint.ignore = ["E501", "E999", "E402"]
 6 | exclude = [
 7 |     ".bzr",
 8 |     ".direnv",
 9 |     ".eggs",
10 |     ".git",
11 |     ".git-rewrite",
12 |     ".hg",
13 |     ".ipynb_checkpoints",
14 |     ".mypy_cache",
15 |     ".nox",
16 |     ".pants.d",
17 |     ".pyenv",
18 |     ".pytest_cache",
19 |     ".pytype",
20 |     ".ruff_cache",
21 |     ".svn",
22 |     ".tox",
23 |     ".venv",
24 |     ".vscode",
25 |     "__pypackages__",
26 |     "_build",
27 |     "buck-out",
28 |     "build",
29 |     "dist",
30 |     "node_modules",
31 |     "site-packages",
32 |     "venv",
33 |     "tests",
34 |     "scratch_notebooks",
35 |     "release_notes",
36 |     "notebook_tests",
37 |     "demos"
38 | ]
39 | extend-include = ["*.ipynb"]


--------------------------------------------------------------------------------
/.streamlit/config.toml:
--------------------------------------------------------------------------------
1 | [browser]
2 | gatherUsageStats = false
3 | 
4 | [server]
5 | runOnSave = true
6 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.10-slim
 2 | 
 3 | # curl is needed by the HEALTHCHECK below and is not installed by any other step.
 4 | RUN apt-get update && apt-get install -y \
 5 |     build-essential \
 6 |     curl \
 7 |     ffmpeg \
 8 |     && rm -rf /var/lib/apt/lists/*
 9 | 
10 | WORKDIR /home
11 | 
12 | ENV PYTHONPATH=.
13 | 
14 | # Install dependencies before copying the source so code-only changes reuse the cached layer.
15 | COPY requirements.txt /home/requirements.txt
16 | RUN pip3 install --no-cache-dir -r /home/requirements.txt
17 | 
18 | COPY ytdatakit /home/ytdatakit
19 | COPY .streamlit /home/.streamlit
20 | 
21 | EXPOSE 8502
22 | 
23 | HEALTHCHECK CMD curl --fail http://localhost:8502/_stcore/health || exit 1
24 | 
25 | ENTRYPOINT ["streamlit", "run", "/home/ytdatakit/app.py", "--server.port=8502", "--server.address=0.0.0.0"]


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | [![Python application](https://github.com/neonwatty/ytdatakit/actions/workflows/python-app.yml/badge.svg)](https://github.com/neonwatty/ytdatakit/actions/workflows/python-app.yml)
 2 | 
 3 | # A free Python app for downloading YouTube videos, transcripts, thumbnails, and channel data all in one place
 4 | 
 5 | A simple app that lets you conveniently download youtube videos, transcripts, thumbnails, and channel data. All in one place.  Can be easily run locally using python or docker.
 6 | 
 7 | <p align="center">
 8 | <img align="center" src="https://github.com/jermwatt/readme_gifs/blob/main/ytdatakit.gif" height="325">
 9 | </p>
10 | 
11 | 
12 | To run the app install the associated `requirements.txt` and run
13 | 
14 | ```sh
15 | python -m streamlit run ytdatakit/app.py
16 | ```
17 | 
18 | Or run via Docker
19 | 
20 | ```sh
21 | docker compose up
22 | ```
23 | 


--------------------------------------------------------------------------------
/data/input/blank.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neonwatty/python-youtube-data-kit/4892ba613962a51b5919b4bd161cd88781e0c34a/data/input/blank.csv


--------------------------------------------------------------------------------
/data/input/blank.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neonwatty/python-youtube-data-kit/4892ba613962a51b5919b4bd161cd88781e0c34a/data/input/blank.jpg


--------------------------------------------------------------------------------
/data/input/blank.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neonwatty/python-youtube-data-kit/4892ba613962a51b5919b4bd161cd88781e0c34a/data/input/blank.zip


--------------------------------------------------------------------------------
/data/input/test_input.txt:
--------------------------------------------------------------------------------
1 | https://www.youtube.com/watch?v=CYFeKbWO0gc
2 | https://www.youtube.com/shorts/4w30B2ZpbkM
3 | https://www.youtube.com/watch?v=9fpkyTQ8Eis
4 | https://www.youtube.com/shorts/xrPeKdKzErU
5 | https://www.youtube.com/shorts/kdiYlTvpAvk


--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | services:
2 |   ytdatakit:
3 |     image: ghcr.io/neonwatty/ytdatakit:latest
4 |     container_name: ytdatakit
5 |     ports:
6 |       # Quoted so the HOST:CONTAINER pair is always read as a string.
7 |       - "8502:8502"
8 |     volumes:
9 |       - ./data:/home/data


--------------------------------------------------------------------------------
/requirements.test:
--------------------------------------------------------------------------------
1 | pytest
2 | pytest-subtests
3 | ruff


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | yt-dlp
2 | scrapetube
3 | youtube-transcript-api
4 | pandas
5 | streamlit


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | cwd = os.getcwd()
4 | CONTAINER_NAME = "ytdatakit"
5 | APP_FILE = "ytdatakit/app.py"
6 | 


--------------------------------------------------------------------------------
/tests/test_app.py:
--------------------------------------------------------------------------------
 1 | import subprocess
 2 | import pytest
 3 | import time
 4 | from tests import APP_FILE
 5 | 
 6 | 
 7 | @pytest.fixture(scope="module")
 8 | def start_app():
 9 |     cmd = f"python -m streamlit run {APP_FILE} --server.headless true"
10 |     process = subprocess.Popen(
11 |         cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
12 |     )
13 |     time.sleep(5)
14 |     yield process
15 |     process.terminate()
16 |     process.wait()
17 | 
18 | 
19 | def test_streamlit(subtests, start_app):
20 |     """Check the app process is still running, then that it exits after terminate()."""
21 |     with subtests.test(msg="server up"):
22 |         assert start_app.poll() is None, "app failed to start"
23 |     with subtests.test(msg="streamlit down"):
24 |         start_app.terminate()
25 |         time.sleep(2)
26 |         assert start_app.poll() is not None, "app failed to stop"
27 | 
28 | 


--------------------------------------------------------------------------------
/ytdatakit/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neonwatty/python-youtube-data-kit/4892ba613962a51b5919b4bd161cd88781e0c34a/ytdatakit/__init__.py


--------------------------------------------------------------------------------
/ytdatakit/about.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | 
 3 | 
 4 | def about():
 5 |     return st.markdown(
 6 |         (
 7 |             "### About \n"
 8 |             "Some notes on how this works: \n\n"
 9 |             "1.  **youtube / google login**: you do **not** need to be logged into a google account to use the app, with one exception: age restricted videos"
10 |             "2.  **age restricted videos**: this app cannot fetch age restricted videos yet, which requires a user login to google / youtube - this feature is not yet available"
11 |             "3.  **video resolution**: not all videos have all possible resolutions, so you may not be able to fetch the resolution you want for some videos (as they don't exist) \n"
12 |             "4.  **recommended hardware**: this is a very light weight app, so minimum specs should work fine"
13 |             "5.  **proxies**: there is an option in the yt_download module to enter proxy server ips"
14 |         )
15 |     )
16 | 


--------------------------------------------------------------------------------
/ytdatakit/about/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neonwatty/python-youtube-data-kit/4892ba613962a51b5919b4bd161cd88781e0c34a/ytdatakit/about/__init__.py


--------------------------------------------------------------------------------
/ytdatakit/about/app.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | 
 3 | 
 4 | def app():
 5 |     st.markdown(
 6 |         "### YTDatakit \n\n"
 7 |         "**Download YouTube videos, transcripts, thumbnails, and channel data - all in one place.** \n\n"
 8 |         "One app per tab - as detailed below. \n\n"
 9 |         "**Tab 1:**  💡 About - you are here. \n\n"
10 |         "**Tab 2:**  🎞️ Video downloader - enter a YouTube / Shorts url and download its mp4 file. \n\n"
11 |         "**Tab 3:**  📜 Transcript downloader - download multiple YouTube / Shorts transcripts at once. \n\n"
12 |         "**Tab 4:**  📌 Thumbnail downloader - download multiple YouTube / Shorts thumbnails at once. \n\n"
13 |         "**Tab 5:**  📕 Channel downloader - download all YouTube video ids associated with a channel name. \n\n"
14 |         ""
15 |         "Each app is illustrated in the gif below. \n \n"
16 |         "![Alt Text](https://github.com/neonwatty/readme_gifs/blob/main/ytdatakit.gif?raw=true)"
17 |     )
18 | 


--------------------------------------------------------------------------------
/ytdatakit/app.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | from ytdatakit.about.app import app as about_page
 3 | from ytdatakit.youtube_downloader.app import app as video_downloader
 4 | from ytdatakit.youtube_transcript_downloader.app import app as transcript_downloader
 5 | from ytdatakit.youtube_thumbnail_downloader.app import app as thumbnail_downloader
 6 | from ytdatakit.youtube_channel_downloader.app import app as channel_downloader
 7 | 
 8 | app_name = "ytdatakit"
 9 | st.set_page_config(page_title=app_name)
10 | st.title(app_name)
11 | st.markdown("###### Run this app locally by pulling [the official repo](https://github.com/neonwatty/ytdatakit)")
12 | 
13 | 
14 | tab1, tab2, tab3, tab4, tab5 = st.tabs(
15 |     ["💡 About", "🎞️ Video downloader", "📜 Transcript downloader", "📌 Thumbnail downloader", "📕 Channel downloader"]
16 | )
17 | 
18 | with tab1:
19 |     about_page()
20 | with tab2:
21 |     video_downloader()
22 | with tab3:
23 |     transcript_downloader()
24 | with tab4:
25 |     thumbnail_downloader()
26 | with tab5:
27 |     channel_downloader()
28 | 


--------------------------------------------------------------------------------
/ytdatakit/youtube_channel_downloader/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neonwatty/python-youtube-data-kit/4892ba613962a51b5919b4bd161cd88781e0c34a/ytdatakit/youtube_channel_downloader/__init__.py


--------------------------------------------------------------------------------
/ytdatakit/youtube_channel_downloader/app.py:
--------------------------------------------------------------------------------
 1 | from ytdatakit.youtube_channel_downloader.callbacks import fetch_channel_videos
 2 | from ytdatakit.youtube_channel_downloader.state import state_init, state_reset
 3 | import streamlit as st
 4 | 
 5 | 
 6 | def app():
 7 |     state_init()
 8 |     st.markdown(
 9 |         """
10 |     <style>
11 |     .element-container:has(style){
12 |         display: none;
13 |     }
14 |     #button-download {
15 |         display: none;
16 |     }
17 |     .element-container:has(#button-download) {
18 |         display: none;
19 |     }
20 |     .element-container:has(#button-download) + div button {
21 |         background-color: green;
22 |         border-color: green;
23 |         }
24 |     #button-fetch {
25 |         display: none;
26 |     }
27 |     .element-container:has(#button-fetch) {
28 |         display: none;
29 |     }
30 |     .element-container:has(#button-fetch) + div button {
31 |         background-color: blue;
32 |         border-color: blue;
33 |         }
34 |     </style>
35 |     """,
36 |         unsafe_allow_html=True,
37 |     )
38 | 
39 |     st.markdown(
40 |         """
41 |     <style>
42 |     .custom-font {
43 |         font-size:7.5px !important;
44 |         color: transparent;
45 |     }
46 |     </style>
47 |     """,
48 |         unsafe_allow_html=True,
49 |     )
50 | 
51 |     video_channel_col_a, video_channel_col_b, video_channel_col_c, video_channel_col_empty = st.columns([4, 3, 2, 2])
52 |     with video_channel_col_a:
53 |         channel_name = st.text_input(
54 |             value=st.session_state.channel_name,
55 |             label="🔗 paste YouTube channel name here",
56 |             placeholder="e.g., littletfitness",
57 |         )
58 |     with video_channel_col_b:
59 |         st.markdown('<p class="custom-font">fetch</p>', unsafe_allow_html=True)
60 |         st.markdown('<span id="button-fetch"></span>', unsafe_allow_html=True)
61 |         fetch_btn = st.button(
62 |             "fetch channel video ids",
63 |             type="primary",
64 |         )
65 |         if fetch_btn:
66 |             if channel_name != st.session_state.channel_name:
67 |                 state_reset()
68 |             if st.session_state.channel_fetch_count == 0:
69 |                 df_table, df_download = fetch_channel_videos(channel_name)
70 |                 st.session_state.channel_data_table = df_table
71 |                 st.session_state.channel_data_download = df_download
72 |                 st.session_state.channel_fetch_count += 1
73 | 
74 |     with video_channel_col_c:
75 |         st.markdown('<p class="custom-font">fetch</p>', unsafe_allow_html=True)
76 |         st.markdown('<span id="button-download"></span>', unsafe_allow_html=True)
77 |         st.download_button(
78 |             label="download",
79 |             data=st.session_state.channel_data_download,
80 |             file_name="channel_data.csv",
81 |             mime="text/csv",
82 |             disabled=False if st.session_state.channel_fetch_count > 0 else True,
83 |             type="primary",
84 |         )
85 |     with st.container(border=True):
86 |         st.table(st.session_state.channel_data_table.head(10))
87 | 


--------------------------------------------------------------------------------
/ytdatakit/youtube_channel_downloader/callbacks.py:
--------------------------------------------------------------------------------
 1 | from ytdatakit.youtube_channel_downloader.yt_channel_download import get_channel_videos
 2 | import pandas as pd
 3 | import streamlit as st
 4 | 
 5 | 
 6 | @st.cache_data
 7 | def convert_df(df: pd.DataFrame) -> "csv":
 8 |     # IMPORTANT: Cache the conversion to prevent computation on every rerun
 9 |     return df.to_csv().encode("utf-8")
10 | 
11 | 
def fetch_channel_videos(channel_name: str):
    """Look up a channel's videos and package them for display and download.

    Returns ``(table, csv_bytes)`` where ``table`` is a DataFrame with the
    columns ``youtube_url`` and ``video_id`` and ``csv_bytes`` is the result of
    ``convert_df(table)``. Returns ``(None, None)`` when ``get_channel_videos``
    reports a failure by returning ``None`` for either list.
    """
    ids, urls = get_channel_videos(channel_name)
    if ids is None or urls is None:
        return None, None

    table = pd.DataFrame(columns=["youtube_url", "video_id"])
    table["youtube_url"] = urls
    table["video_id"] = ids
    return table, convert_df(table)
22 | 


--------------------------------------------------------------------------------
/ytdatakit/youtube_channel_downloader/config.py:
--------------------------------------------------------------------------------
# Default YouTube channel name for the channel downloader.
default_channel_name = "Monkhaus"
2 | 


--------------------------------------------------------------------------------
/ytdatakit/youtube_channel_downloader/state.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | import pandas as pd
 3 | from ytdatakit.youtube_channel_downloader.config import default_channel_name
 4 | 
 5 | 
 6 | def state_init():
 7 |     df = pd.DataFrame(columns=["youtube_url", "video_id"])
 8 |     if "channel_data_table" not in st.session_state:
 9 |         st.session_state.channel_data_table = df
10 |     if "channel_data_download" not in st.session_state:
11 |         st.session_state.channel_data_download = df.to_csv().encode("utf-8")
12 |     if "channel_name" not in st.session_state:
13 |         st.session_state.channel_name = default_channel_name
14 |     if "channel_fetch_count" not in st.session_state:
15 |         st.session_state.channel_fetch_count = 0
16 | 
17 | 
def state_reset():
    """Discard previously fetched channel results.

    Replaces the stored table and CSV payload with empty ones (columns
    ``youtube_url`` and ``video_id``) and sets the fetch count back to 0.
    The assignments are unconditional: guarding them with ``not in`` checks
    would leave the old results in place, because the keys already exist once
    ``state_init`` has run.
    """
    empty_table = pd.DataFrame(columns=["youtube_url", "video_id"])
    st.session_state.channel_data_table = empty_table
    st.session_state.channel_data_download = empty_table.to_csv().encode("utf-8")
    st.session_state.channel_fetch_count = 0
25 | 


--------------------------------------------------------------------------------
/ytdatakit/youtube_channel_downloader/yt_channel_download.py:
--------------------------------------------------------------------------------
 1 | import yt_dlp
 2 | import scrapetube
 3 | from typing import Tuple
 4 | 
 5 | 
 6 | def get_channel_id_from_name(channel_name: str) -> str | None:
 7 |     ydl_opts = {
 8 |         "quiet": True,
 9 |         "skip_download": True,
10 |         "extract_flat": True,
11 |         "force_generic_extractor": True,
12 |     }
13 | 
14 |     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
15 |         try:
16 |             info = ydl.extract_info(f"ytsearch1:{channel_name}", download=False)
17 |             return info["entries"][0]["channel_id"]
18 |         except Exception as e:
19 |             print(f"FAILURE: get_channel_id_from_name failed with exception {e}")
20 |             return None
21 | 
22 | 
def get_videourl_from_channel_id(channel_id: str) -> Tuple[list, list] | Tuple[None, None]:
    """List the video ids and watch URLs that scrapetube yields for ``channel_id``.

    Returns ``(video_ids, video_urls)`` with both lists in the same order, or
    ``(None, None)`` after printing the error if anything raises.
    """
    watch_prefix = "https://www.youtube.com/watch?v="
    try:
        video_ids = [video["videoId"] for video in scrapetube.get_channel(channel_id)]
        video_urls = [watch_prefix + vid for vid in video_ids]
        return video_ids, video_urls
    except Exception as e:
        print(f"FAILURE: get_videourls_from_channel_id failed with exception {e}")
        return None, None
37 | 
38 | 
def get_channel_videos(channel_name: str) -> Tuple[list, list] | Tuple[None, None]:
    """Resolve ``channel_name`` to a channel id and list that channel's videos.

    Returns ``(video_ids, video_urls)`` on success. Returns ``(None, None)``
    when the channel id cannot be resolved, when the video listing fails, or
    when an unexpected exception is raised (which is printed).
    """
    try:
        print("INFO: starting channel video id puller...")
        outcome = (None, None)
        channel_id = get_channel_id_from_name(channel_name)
        if channel_id is not None:
            ids, urls = get_videourl_from_channel_id(channel_id)
            # Only a fully populated pair counts as success.
            if not (ids is None or urls is None):
                outcome = (ids, urls)
        print("...done!")
        return outcome
    except Exception as e:
        print(f"FAILURE: get_channel_videos failed with exception {e}")
        return None, None
57 | 


--------------------------------------------------------------------------------
/ytdatakit/youtube_downloader/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
# Absolute path of the directory that contains this file.
base_dir = os.path.split(os.path.abspath(__file__))[0]
# Parent directory of base_dir.
main_dir = os.path.split(base_dir)[0]
5 | 


--------------------------------------------------------------------------------
/ytdatakit/youtube_downloader/app.py:
--------------------------------------------------------------------------------
 1 | from ytdatakit.youtube_downloader.config import video_choices
 2 | from ytdatakit.youtube_downloader.callbacks import callback_download_video
 3 | from ytdatakit.youtube_downloader.state import state_init
 4 | import streamlit as st
 5 | 
 6 | 
 7 | def app():
 8 |     state_init()
 9 |     st.markdown(
10 |         """
11 |     <style>
12 |     .element-container:has(style){
13 |         display: none;
14 |     }
15 |     #button-download {
16 |         display: none;
17 |     }
18 |     .element-container:has(#button-download) {
19 |         display: none;
20 |     }
21 |     .element-container:has(#button-download) + div button {
22 |         background-color: green;
23 |         border-color: green;
24 |         }
25 |     #button-fetch {
26 |         display: none;
27 |     }
28 |     .element-container:has(#button-fetch) {
29 |         display: none;
30 |     }
31 |     .element-container:has(#button-fetch) + div button {
32 |         background-color: blue;
33 |         border-color: blue;
34 |         }
35 |     </style>
36 |     """,
37 |         unsafe_allow_html=True,
38 |     )
39 | 
40 |     st.markdown(
41 |         """
42 |     <style>
43 |     .custom-font {
44 |         font-size:7.5px !important;
45 |         color: transparent;
46 |     }
47 |     </style>
48 |     """,
49 |         unsafe_allow_html=True,
50 |     )
51 | 
52 |     video_download_col_a, video_download_col_b, video_download_col_c = st.columns([4, 3, 2])
53 |     with video_download_col_a:
54 |         url_input = st.text_input(
55 |             value="https://www.youtube.com/watch?v=qQgyoHsknIk",
56 |             label="🔗 Paste YouTube / Shorts URL here",
57 |             placeholder="e.g., https://www.youtube.com/watch?v=.",
58 |             key="youtube_download_text_input",
59 |         )
60 |     with video_download_col_b:
61 |         resolution_dropdown = st.selectbox(options=video_choices, index=st.session_state.youtube_download_resolution_index, label="video resolution")
62 |     with video_download_col_c:
63 |         st.markdown('<p class="custom-font">fetch</p>', unsafe_allow_html=True)
64 |         st.markdown('<span id="button-fetch"></span>', unsafe_allow_html=True)
65 |         st.button(
66 |             "fetch video",
67 |             type="primary",
68 |             on_click=callback_download_video,
69 |             args=(
70 |                 url_input,
71 |                 resolution_dropdown,
72 |             ),
73 |             key="youtube_download_fetch_button",
74 |         )
75 |     with st.container(border=True):
76 |         with open(st.session_state.youtube_download_location, "rb") as file:
77 |             st.markdown('<span id="button-download"></span>', unsafe_allow_html=True)
78 |             st.download_button(
79 |                 label="download video",
80 |                 data=file,
81 |                 file_name=st.session_state.youtube_download_location.split("/")[-1],
82 |                 mime="video/mp4",
83 |                 type="primary",
84 |             )
85 |         st.video(data=st.session_state.youtube_download_location, format="video/mp4")
86 | 


--------------------------------------------------------------------------------
/ytdatakit/youtube_downloader/callbacks.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | from ytdatakit.youtube_downloader.yt_download import download_video
 3 | from ytdatakit.youtube_downloader.state import default_youtube_download_location
 4 | from ytdatakit.youtube_downloader.config import video_choices
 5 | 
 6 | 
 7 | def callback_download_video(url_input: str, resolution_dropdown: str) -> None:
 8 |     temporary_video_location = download_video(url_input, default_youtube_download_location(), st.session_state.resolution_dropdown)
 9 |     st.session_state.youtube_download_location = temporary_video_location
10 |     st.session_state.youtube_download_resolution_index = video_choices.index(resolution_dropdown)
11 | 


--------------------------------------------------------------------------------
/ytdatakit/youtube_downloader/config.py:
--------------------------------------------------------------------------------
# Name of the application.
app_name = "ytdatakit"
# Selectable video resolutions; "best" places no height limit on the download.
video_choices = ["best", "1080", "720", "360"]
# Placeholder video path used before any video has been downloaded.
default_clip_video_path = "./data/input/blank.mp4"
# NOTE(review): named "gif" but points at a .jpg placeholder - confirm intended use.
default_clip_gif_path = "./data/input/blank.jpg"
5 | 


--------------------------------------------------------------------------------
/ytdatakit/youtube_downloader/state.py:
--------------------------------------------------------------------------------
 1 | from ytdatakit.youtube_downloader.config import video_choices, default_clip_video_path
 2 | import streamlit as st
 3 | import tempfile
 4 | 
 5 | 
 6 | def default_youtube_download_location():
 7 |     with tempfile.TemporaryDirectory() as tmpdirname:
 8 |         return tmpdirname
 9 | 
10 | 
def state_init():
    """Seed the video downloader's session-state keys that are not set yet.

    Existing values are left untouched.
    """
    state = st.session_state
    defaults = (
        ("resolution_dropdown", video_choices),
        ("youtube_download_location", default_clip_video_path),
        ("youtube_download_resolution_index", 0),
    )
    for key, default_value in defaults:
        if key not in state:
            state[key] = default_value
18 | 


--------------------------------------------------------------------------------
/ytdatakit/youtube_downloader/yt_download.py:
--------------------------------------------------------------------------------
 1 | import yt_dlp
 2 | from yt_dlp import YoutubeDL
 3 | import re
 4 | 
 5 | 
 6 | def is_valid_youtube_url(url: str) -> bool:
 7 |     if not isinstance(url, str):
 8 |         return False
 9 |     pattern = r"^https://www\.youtube\.com/watch\?v=[A-Za-z0-9_-]{11}$"  # youtube vido ids are always 11 chars long
10 |     if "shorts" in url:
11 |         pattern = r"^https://www\.youtube\.com/shorts/[A-Za-z0-9_-]{11}$"  # youtube vido ids are always 11 chars long
12 |     return re.match(pattern, url) is not None
13 | 
14 | 
def download_video(url: str, savedir: str, resolution_dropdown: str, my_proxies: dict = {}) -> str:
    """Download a YouTube / Shorts video as mp4 and return the saved file path.

    Args:
        url: video URL; must satisfy ``is_valid_youtube_url``.
        savedir: directory part of the output path.
        resolution_dropdown: "1080", "720" or "360" caps the video height at
            that value; any other value selects the best available quality.
        my_proxies: accepted for interface compatibility; not used here.

    Returns:
        ``savedir/<title>.mp4`` where every non-alphanumeric character of the
        title is replaced by a space, or ``savedir/<video id>.mp4`` when the
        metadata has no title.

    Raises:
        ValueError: for an invalid URL or any failure during extraction or
            download (the original exception is chained as the cause).
    """
    try:
        print("Downloading video from youtube...")
        if not is_valid_youtube_url(url):
            raise ValueError(f"invalid input url: {url}")

        # First pass: fetch metadata only, to derive the output file name.
        with YoutubeDL() as ydl:
            info_dict = ydl.extract_info(url, download=False)
        video_id = info_dict.get("id", None)
        video_title = info_dict.get("title", None)

        # Check for a missing title before sanitising it; re.sub(None) raises.
        if video_title is None:
            savepath = savedir + "/" + video_id + ".mp4"
        else:
            savepath = savedir + "/" + re.sub(r"[^a-zA-Z0-9]", " ", video_title) + ".mp4"

        # Only the format selector depends on the requested resolution.
        video_format = "bestvideo+bestaudio/best"
        if resolution_dropdown in ("1080", "720", "360"):
            video_format = f"bestvideo[height<={resolution_dropdown}]+bestaudio/best"
        ydl_opts = {
            "format": video_format,
            "merge_output_format": "mp4",
            "outtmpl": savepath,
        }

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])

        print("...done!")
        return savepath
    except Exception as e:
        raise ValueError(f"yt_download failed with exception {e}") from e
66 | 


--------------------------------------------------------------------------------
/ytdatakit/youtube_thumbnail_downloader/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neonwatty/python-youtube-data-kit/4892ba613962a51b5919b4bd161cd88781e0c34a/ytdatakit/youtube_thumbnail_downloader/__init__.py


--------------------------------------------------------------------------------
/ytdatakit/youtube_thumbnail_downloader/app.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | from ytdatakit.youtube_thumbnail_downloader.state import state_init
 3 | from ytdatakit.youtube_thumbnail_downloader.callbacks import fetch_thumbnails
 4 | 
 5 | 
 6 | def app():
 7 |     state_init()
 8 |     st.markdown(
 9 |         """
10 |     <style>
11 |     .element-container:has(style){
12 |         display: none;
13 |     }
14 |     #button-download {
15 |         display: none;
16 |     }
17 |     .element-container:has(#button-download) {
18 |         display: none;
19 |     }
20 |     .element-container:has(#button-download) + div button {
21 |         background-color: green;
22 |         border-color: green;
23 |         }
24 |     #button-fetch {
25 |         display: none;
26 |     }
27 |     .element-container:has(#button-fetch) {
28 |         display: none;
29 |     }
30 |     .element-container:has(#button-fetch) + div button {
31 |         background-color: blue;
32 |         border-color: blue;
33 |         }
34 |     </style>
35 |     """,
36 |         unsafe_allow_html=True,
37 |     )
38 | 
39 |     base = st.container(border=True)
40 |     with base:
41 |         text_urls = st.text_area(
42 |             "youtube urls separated by commas",
43 |             value=st.session_state.thumbnail_text_input_urls if "thumbnail_text_input_urls" in st.session_state else "",
44 |             placeholder="https://www.youtube.com/shorts/o7a9hx-Pqyo, https://www.youtube.com/shorts/xkAYLnIsfX4, ....",
45 |             key="thumbnail_urls_input",
46 |         )
47 |         st.thumbnail_text_input_urls = text_urls
48 |         uploaded_file = st.file_uploader("Choose a File", type=["txt"], key="thumbanils_file_uploader")
49 |         thumbnail_col_1, thumbnail_col_2, thumbnail_col_3 = st.columns([5, 8, 8])
50 |         with thumbnail_col_1:
51 |             st.markdown('<span id="button-fetch"></span>', unsafe_allow_html=True)
52 |             st.button(label="fetch thumbnails", type="primary", on_click=fetch_thumbnails, args=(uploaded_file, text_urls))
53 | 
54 |         with thumbnail_col_2:
55 |             st.markdown('<span id="button-download"></span>', unsafe_allow_html=True)
56 |             if "thumbnails_zip_path" in st.session_state:
57 |                 with open(st.session_state.thumbnails_zip_path, "rb") as file:
58 |                     st.download_button(
59 |                         label="download thumbnails",
60 |                         data=file,  # st.session_state.thumbnails_zip_path if "thumbnails_zip_path" in st.session_state else "./data/input/blank.zip",
61 |                         file_name="thumbnails.zip",
62 |                         mime="application/zip",
63 |                         type="primary",
64 |                         disabled=True if st.session_state.thumbnail_fetch_count == 0 else False,
65 |                     )
66 |             else:
67 |                 st.download_button(
68 |                     label="download thumbnails",
69 |                     data="./data/input/blank.zip",
70 |                     file_name="thumbnails.zip",
71 |                     mime="application/zip",
72 |                     type="primary",
73 |                     disabled=True,
74 |                 )
75 | 
76 |         with st.container(border=True):
77 |             for ind, thumbnail_savepath in enumerate(st.session_state.thumbnail_savepaths):
78 |                 title = st.session_state.thumbnail_data_entries[ind]["video_title"]
79 |                 thumbnail_savepath = st.session_state.thumbnail_savepaths[ind]
80 |                 with st.container(border=True):
81 |                     a, b, c = st.columns([1, 3, 1])
82 |                     with b:
83 |                         st.subheader(title)
84 |                         st.image(thumbnail_savepath)
85 |                         with open(thumbnail_savepath, "rb") as file:
86 |                             st.download_button(
87 |                                 label="download thumbnail",
88 |                                 data=file,
89 |                                 file_name=title + ".jpg",
90 |                                 mime="image/jpg",
91 |                                 key=f"{title} download",
92 |                                 type="primary",
93 |                             )
94 | 


--------------------------------------------------------------------------------
/ytdatakit/youtube_thumbnail_downloader/callbacks.py:
--------------------------------------------------------------------------------
 1 | from ytdatakit.youtube_thumbnail_downloader.yt_thumbnail_downloader import get_batch_thumbnails
 2 | from ytdatakit.youtube_thumbnail_downloader.zip import zip_images
 3 | from ytdatakit.youtube_thumbnail_downloader.state import reset_state
 4 | from ytdatakit.youtube_thumbnail_downloader.config import default_thumbnail_location
 5 | import streamlit as st
 6 | from io import StringIO
 7 | import tempfile
 8 | 
 9 | 
def default_temp_savdir():
    """Create a fresh temporary directory and return its path.

    ``tempfile.mkdtemp`` is used instead of ``tempfile.TemporaryDirectory``
    because the latter removes the directory when its ``with`` block exits,
    which would hand the caller a path that no longer exists. The caller owns
    the directory and is responsible for removing it.
    """
    return tempfile.mkdtemp()
13 | 
14 | 
def urls_normalizer(uploaded_file: "st.uploaded", text_urls: str) -> list:
    """Collect the URLs to process from either the uploaded file or the text box.

    Args:
        uploaded_file: uploaded file object or ``None``. Only files whose
            ``type`` is ``"text/plain"`` are read, one URL per line.
        text_urls: comma-separated URLs typed by the user, or ``None``.

    Returns:
        List of whitespace-stripped URLs. Empty entries (blank lines, trailing
        or doubled commas) are dropped. Empty list when no input is given.

    Supplying both a file and non-blank text shows a warning and calls
    ``st.stop()``.
    """
    has_text = text_urls is not None and len(text_urls.strip()) > 0

    if uploaded_file is not None and has_text:
        st.warning("you can enter urls manually or from file but not both", icon="⚠️")
        st.stop()

    youtube_urls = []
    if uploaded_file is not None:
        if uploaded_file.type == "text/plain":
            stringio = StringIO(uploaded_file.read().decode("utf-8"))
            youtube_urls = [line.strip() for line in stringio if line.strip()]
    elif has_text:
        youtube_urls = [part.strip() for part in text_urls.split(",") if part.strip()]
    return youtube_urls
40 | 
41 | 
def fetch_logic(youtube_urls: list) -> None:
    """Fetch thumbnails for ``youtube_urls`` and publish results to session state.

    When the URL list differs from the one stored in
    ``st.session_state.thumbnail_raw_urls`` the stored list is replaced and
    ``reset_state()`` is called. A fetch only runs while
    ``thumbnail_fetch_count`` is 0. It then stores the saved image paths, the
    per-video data entries and the path of a ``thumbnails.zip`` archive built
    from the images, and increments the fetch count.
    """
    if youtube_urls != st.session_state.thumbnail_raw_urls:
        st.session_state.thumbnail_raw_urls = youtube_urls
        reset_state()
    if st.session_state.thumbnail_fetch_count == 0:
        st.session_state.local_thumbnail_location = default_thumbnail_location()
        # Every fetch gets its own existing directory. Deriving the directory
        # by dropping trailing components of a temp path resolves to the
        # shared temp root, where concurrent sessions would overwrite each
        # other's images and thumbnails.zip.
        savedir = tempfile.mkdtemp(prefix="ytdatakit_thumbnails_")
        thumbnail_savepaths, thumbnail_data_entries = get_batch_thumbnails(youtube_urls, savedir)
        st.session_state.thumbnail_savepaths = thumbnail_savepaths
        st.session_state.thumbnail_data_entries = thumbnail_data_entries
        st.session_state.thumbnail_fetch_count += 1

        # zip_images reads its target path from session state, so set it first.
        st.session_state.thumbnails_zip_path = savedir + "/" + "thumbnails.zip"
        zip_images(thumbnail_savepaths)
56 | 
57 | 
def fetch_thumbnails(uploaded_file, text_urls):
    """Normalize the two URL inputs into one list and run the fetch on it."""
    fetch_logic(urls_normalizer(uploaded_file, text_urls))
62 | 


--------------------------------------------------------------------------------
/ytdatakit/youtube_thumbnail_downloader/config.py:
--------------------------------------------------------------------------------
 1 | import tempfile
 2 | import uuid
 3 | 
# Default values for the thumbnail downloader's state.
# Default for the raw URL input (an empty string).
default_thumbnail_raw_urls = ""
# Default list of saved thumbnail paths (empty).
default_thumbnail_savepaths = []
# Default list of per-video data entries (empty).
default_thumbnail_data_entries = []
# Default content of the URL text input.
default_thumbnail_text_input_urls = ""
# Default path of the thumbnails zip archive.
default_thumbnails_zip_path = "./data/input/blank.zip"
 9 | 
10 | 
def default_thumbnail_location():
    """Return a unique ``.jpg`` path of the form ``<tmpdir>/temp_<uuid4>.jpg``.

    The temporary directory is created by ``tempfile.TemporaryDirectory`` and
    has been removed again by the time the path is returned, so only the path
    string is provided; the directory itself does not exist.
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        unique_name = "temp_" + str(uuid.uuid4()) + ".jpg"
        location = scratch_dir + "/" + unique_name
    return location
14 | 


--------------------------------------------------------------------------------
/ytdatakit/youtube_thumbnail_downloader/state.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | from ytdatakit.youtube_thumbnail_downloader.config import (
 3 |     default_thumbnail_raw_urls,
 4 |     default_thumbnail_savepaths,
 5 |     default_thumbnail_location,
 6 |     default_thumbnail_data_entries,
 7 |     default_thumbnail_text_input_urls,
 8 |     default_thumbnails_zip_path,
 9 | )
10 | 
11 | 
def state_init():
    """Seed the thumbnail downloader's session-state keys that are not set yet.

    Existing values are never overwritten. The list defaults are copied so the
    session never shares the module-level default list objects.
    """
    if "thumbnail_raw_urls" not in st.session_state:
        st.session_state.thumbnail_raw_urls = default_thumbnail_raw_urls
    if "thumbnail_savepaths" not in st.session_state:
        st.session_state.thumbnail_savepaths = list(default_thumbnail_savepaths)
    if "thumbnail_data_entries" not in st.session_state:
        st.session_state.thumbnail_data_entries = list(default_thumbnail_data_entries)
    if "thumbnail_fetch_count" not in st.session_state:
        st.session_state.thumbnail_fetch_count = 0
    # Guard on the key that is actually assigned. Checking a different key
    # would regenerate the location on every rerun.
    if "local_thumbnail_location" not in st.session_state:
        st.session_state.local_thumbnail_location = default_thumbnail_location()
    if "youtube_thumbnails_expander" not in st.session_state:
        st.session_state.youtube_thumbnails_expander = False
25 | 
26 | 
def reset_state():
    """Return the thumbnail downloader's fetch results to their defaults.

    Resets the saved paths, the per-video data entries, the text-input
    content, the zip path, the fetch count and the expander flag. The two
    lists are reset together so they stay index-aligned, and they are copied
    so the session never shares the module-level default list objects.
    """
    st.session_state.thumbnail_savepaths = list(default_thumbnail_savepaths)
    st.session_state.thumbnail_data_entries = list(default_thumbnail_data_entries)
    st.session_state.thumbnail_text_input_urls = default_thumbnail_text_input_urls
    st.session_state.thumbnails_zip_path = default_thumbnails_zip_path

    st.session_state.thumbnail_fetch_count = 0
    st.session_state.youtube_thumbnails_expander = False
35 | 


--------------------------------------------------------------------------------
/ytdatakit/youtube_thumbnail_downloader/yt_thumbnail_downloader.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import requests
 3 | from yt_dlp import YoutubeDL
 4 | 
 5 | 
 6 | def is_valid_youtube_url(url: str) -> bool:
 7 |     if not isinstance(url, str):
 8 |         return False
 9 |     pattern = r"^https://www\.youtube\.com/watch\?v=[A-Za-z0-9_-]{11}$"  # youtube vido ids are always 11 chars long
10 |     if "shorts" in url:
11 |         pattern = r"^https://www\.youtube\.com/shorts/[A-Za-z0-9_-]{11}$"  # youtube vido ids are always 11 chars long
12 |     return re.match(pattern, url) is not None
13 | 
14 | 
def download_thumbnail(yt_thumbnail_url: str, savepath: str) -> None:
    """Download the image at ``yt_thumbnail_url`` and write it to ``savepath``.

    Raises:
        requests.RequestException: on connection problems, on timeout, or when
            the server answers with an HTTP error status. In that case nothing
            is written, so an error page is never saved as an image.
    """
    # The timeout keeps a stalled connection from blocking the batch.
    response = requests.get(yt_thumbnail_url, timeout=30)
    response.raise_for_status()
    with open(savepath, "wb") as handler:
        handler.write(response.content)
19 | 
20 | 
def get_youtube_thumbnail_url(video_id: str) -> dict:
    """Map each thumbnail variant name to its ``img.youtube.com`` URL for ``video_id``.

    The variants are ``default``, ``mqdefault``, ``hqdefault``, ``sddefault``
    and ``maxresdefault``. Returns ``None`` when ``video_id`` is empty or
    otherwise falsy.
    """
    if not video_id:
        return None
    variant_names = ("default", "mqdefault", "hqdefault", "sddefault", "maxresdefault")
    return {name: f"https://img.youtube.com/vi/{video_id}/{name}.jpg" for name in variant_names}
30 | 
31 | 
def pull_yt_data(url: str, savedir: str, my_proxies: dict = {}) -> tuple:
    """Download the ``hqdefault`` thumbnail of one video and describe the video.

    Args:
        url: video URL; must satisfy ``is_valid_youtube_url``.
        savedir: directory the thumbnail is written into.
        my_proxies: accepted for interface compatibility; not used here.

    Returns:
        ``(savepath, entry)``. ``entry`` is a dict with the keys
        ``video_url``, ``video_id`` and ``video_title`` (the unmodified
        title). ``savepath`` is ``savedir/<name>.jpg``, where ``<name>`` is
        the title with all non-alphanumeric characters removed, or the video
        id when the title is missing or has no alphanumeric characters.

    Raises:
        ValueError: for an invalid URL, missing video id, or any failure
            during extraction or download (the original exception is chained).
    """
    try:
        if not is_valid_youtube_url(url):
            raise ValueError(f"invalid input url: {url}")

        with YoutubeDL() as ydl:
            info_dict = ydl.extract_info(url, download=False)
        video_id = info_dict.get("id", None)
        video_title = info_dict.get("title", None)
        entry = {
            "video_url": url,
            "video_id": video_id,
            "video_title": video_title,
        }

        # Without an id there is no thumbnail URL; fail instead of returning
        # a path to a file that was never written.
        if not video_id:
            raise ValueError(f"no video id found for url: {url}")

        # Sanitise only when a title exists; fall back to the id when the
        # sanitised title is empty, to avoid a bare ".jpg" file name.
        safe_title = re.sub(r"[^a-zA-Z0-9]", "", video_title) if video_title is not None else ""
        savepath = savedir + "/" + (safe_title or video_id) + ".jpg"

        thumbnail_url = get_youtube_thumbnail_url(video_id)["hqdefault"]
        download_thumbnail(thumbnail_url, savepath)
        print("...done!")
        return savepath, entry
    except Exception as e:
        raise ValueError(f"yt_download failed with exception {e}") from e
60 | 
61 | 
def get_batch_thumbnails(yt_urls: list, savedir: str, my_proxies: dict = {}):
    """Download a thumbnail for every url in ``yt_urls``, skipping failures.

    Returns a ``(savepaths, entries)`` pair of lists covering only the urls
    that ``pull_yt_data`` handled without raising; each failure is printed.
    """
    saved_paths, metadata = [], []
    for url in yt_urls:
        try:
            path, entry = pull_yt_data(url, savedir, my_proxies)
        except Exception as e:
            print(f"url {url} failed with exception {e}")
            continue
        saved_paths.append(path)
        metadata.append(entry)
    return saved_paths, metadata
74 | 


--------------------------------------------------------------------------------
/ytdatakit/youtube_thumbnail_downloader/zip.py:
--------------------------------------------------------------------------------
 1 | import zipfile
 2 | import os
 3 | import streamlit as st
 4 | 
 5 | 
 6 | def zip_images(image_paths: list):
 7 |     print("INFO: zipping images...")
 8 |     zip_filename = st.session_state.thumbnails_zip_path
 9 |     with zipfile.ZipFile(zip_filename, "w") as zipf:
10 |         for image_path in image_paths:
11 |             _, filename = os.path.split(image_path)
12 |             zipf.write(image_path, arcname=filename)
13 |             print(f"Added {filename} to the zip file.")
14 |     print(f"...done!  images have been zipped into {zip_filename}")
15 | 


--------------------------------------------------------------------------------
/ytdatakit/youtube_transcript_downloader/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neonwatty/python-youtube-data-kit/4892ba613962a51b5919b4bd161cd88781e0c34a/ytdatakit/youtube_transcript_downloader/__init__.py


--------------------------------------------------------------------------------
/ytdatakit/youtube_transcript_downloader/app.py:
--------------------------------------------------------------------------------
 1 | from ytdatakit.youtube_transcript_downloader.callbacks import fetch_transcripts
 2 | from ytdatakit.youtube_transcript_downloader.state import state_init
 3 | import streamlit as st
 4 | 
 5 | 
 6 | def app():
 7 |     state_init()
 8 |     st.markdown(
 9 |         """
10 |     <style>
11 |     .element-container:has(style){
12 |         display: none;
13 |     }
14 |     #button-download {
15 |         display: none;
16 |     }
17 |     .element-container:has(#button-download) {
18 |         display: none;
19 |     }
20 |     .element-container:has(#button-download) + div button {
21 |         background-color: green;
22 |         border-color: green;
23 |         }
24 |     #button-fetch {
25 |         display: none;
26 |     }
27 |     .element-container:has(#button-fetch) {
28 |         display: none;
29 |     }
30 |     .element-container:has(#button-fetch) + div button {
31 |         background-color: blue;
32 |         border-color: blue;
33 |         }
34 |     </style>
35 |     """,
36 |         unsafe_allow_html=True,
37 |     )
38 | 
39 |     base = st.container(border=True)
40 |     with base:
41 |         text_urls = st.text_area(
42 |             "youtube urls separated by commas",
43 |             value=st.session_state.transcript_raw_urls,
44 |             placeholder="https://www.youtube.com/shorts/o7a9hx-Pqyo, https://www.youtube.com/shorts/xkAYLnIsfX4, ....",
45 |             key="transcript_urls_input",
46 |         )
47 |         uploaded_file = st.file_uploader("Choose a File", type=["txt"], key="transcripts_file_uploader")
48 |         transcript_col_1, transcript_col_2, transcript_col_3 = st.columns([3, 4, 6])
49 |         with transcript_col_1:
50 |             st.markdown('<span id="button-fetch"></span>', unsafe_allow_html=True)
51 |             fetch_btn = st.button(
52 |                 label="fetch transcripts",
53 |                 type="primary",
54 |             )
55 |             if fetch_btn:
56 |                 df_table, df_download = fetch_transcripts(uploaded_file, text_urls)
57 |                 st.session_state.transcript_data_table = df_table
58 |                 st.session_state.transcript_data_download = df_download
59 |         with transcript_col_2:
60 |             st.markdown('<span id="button-download"></span>', unsafe_allow_html=True)
61 |             st.download_button(
62 |                 label="download transcripts",
63 |                 data=st.session_state.transcript_data_download,
64 |                 file_name="transcripts.csv",
65 |                 mime="text/csv",
66 |                 disabled=False,
67 |                 type="primary",
68 |             )
69 |         with st.container(border=True):
70 |             st.table(st.session_state.transcript_data_table)
71 | 


--------------------------------------------------------------------------------
/ytdatakit/youtube_transcript_downloader/callbacks.py:
--------------------------------------------------------------------------------
 1 | from ytdatakit.youtube_transcript_downloader.yt_transcript_download import get_batch_transcripts
 2 | from io import StringIO
 3 | import pandas as pd
 4 | import streamlit as st
 5 | import copy
 6 | 
 7 | 
 8 | @st.cache_data
 9 | def convert_df(df: pd.DataFrame) -> "csv":
10 |     # IMPORTANT: Cache the conversion to prevent computation on every rerun
11 |     return df.to_csv().encode("utf-8")
12 | 
13 | 
def fetch_transcripts(uploaded_file, text_urls):
    """Collect YouTube urls from exactly one input source and fetch transcripts.

    Args:
        uploaded_file: Uploaded file object or None. Only files whose ``type``
            is "text/plain" are read, one url per line.
        text_urls: Comma-separated urls typed by the user, or None/empty.

    Returns:
        ``(df_table, df_download)``: a string-typed DataFrame whose
        "transcript" column is truncated for display, and the full data as
        UTF-8 encoded CSV bytes. Both are always returned, even when no url
        was supplied or no transcript could be fetched.

    Supplying both a file and typed urls shows a warning and halts the script
    through ``st.stop()``.
    """
    typed_urls = text_urls.strip() if text_urls is not None else ""

    if uploaded_file is not None and typed_urls:
        st.warning("you can enter urls manually or from file but not both", icon="⚠️")
        st.stop()

    youtube_urls = []
    if uploaded_file is not None:
        if uploaded_file.type == "text/plain":
            stringio = StringIO(uploaded_file.read().decode("utf-8"))
            youtube_urls = [line.strip() for line in stringio]
    elif typed_urls:
        youtube_urls = [part.strip() for part in typed_urls.split(",")]

    # Blank lines and stray commas leave empty strings behind; drop them.
    youtube_urls = [candidate for candidate in youtube_urls if candidate]

    batch_transcripts = get_batch_transcripts(youtube_urls)
    # Naming the columns keeps "transcript" present when nothing was fetched,
    # so the truncation below cannot fail with KeyError on an empty result.
    df = pd.DataFrame(batch_transcripts, columns=["youtube_url", "video_id", "transcript"])
    df_download = convert_df(df)

    def truncate_and_append(text, length, suffix):
        if len(text) > length:
            return text[:length] + suffix
        return text

    max_length = 100
    suffix = "..."
    # astype returns a new frame, so the downloadable data stays untruncated.
    df_table = df.astype(str)
    df_table["transcript"] = df_table["transcript"].apply(lambda x: truncate_and_append(x, max_length, suffix))
    return df_table, df_download
54 | 


--------------------------------------------------------------------------------
/ytdatakit/youtube_transcript_downloader/config.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/neonwatty/python-youtube-data-kit/4892ba613962a51b5919b4bd161cd88781e0c34a/ytdatakit/youtube_transcript_downloader/config.py


--------------------------------------------------------------------------------
/ytdatakit/youtube_transcript_downloader/state.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | import pandas as pd
 3 | 
 4 | 
 5 | def state_init():
 6 |     df = pd.DataFrame(columns=["youtube_url", "video_id", "transcript"])
 7 |     if "transcript_raw_urls" not in st.session_state:
 8 |         st.session_state.transcript_raw_urls = ""
 9 |     if "transcript_data_table" not in st.session_state:
10 |         st.session_state.transcript_data_table = df
11 |     if "transcript_data_download" not in st.session_state:
12 |         st.session_state.transcript_data_download = df.to_csv().encode("utf-8")
13 | 


--------------------------------------------------------------------------------
/ytdatakit/youtube_transcript_downloader/yt_transcript_download.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | from typing import List, Dict
 3 | from youtube_transcript_api import YouTubeTranscriptApi
 4 | 
 5 | 
 6 | def is_valid_youtube_url(url: str) -> bool:
 7 |     if not isinstance(url, str):
 8 |         return False
 9 |     pattern = r"^https://www\.youtube\.com/watch\?v=[A-Za-z0-9_-]{11}$"  # youtube vido ids are always 11 chars long
10 |     if "shorts" in url:
11 |         pattern = r"^https://www\.youtube\.com/shorts/[A-Za-z0-9_-]{11}$"  # youtube vido ids are always 11 chars long
12 |     return re.match(pattern, url) is not None
13 | 
14 | 
def get_single_transcript(youtube_url: str) -> dict:
    """Fetch the transcript for one YouTube video.

    Returns:
        A dict with the keys "youtube_url", "video_id" and "transcript". When
        the raised error says subtitles are disabled, "transcript" holds that
        message instead of the transcript data. Returns None, after printing
        the reason, for an invalid url or any other failure.
    """
    if not is_valid_youtube_url(youtube_url):
        print(f"FAILURE: youtube_url is not valid - {youtube_url}")
        return None

    # Decide by the path segment, not the bare substring "shorts", which can
    # also occur inside the video id of a watch URL.
    if "/shorts/" in youtube_url:
        video_id = youtube_url.split("/")[-1]
    else:
        video_id = youtube_url.split("=")[-1]

    entry = {"youtube_url": youtube_url, "video_id": video_id}
    try:
        entry["transcript"] = YouTubeTranscriptApi.get_transcript(video_id)
    except Exception as e:
        if "Subtitles are disabled for this video" not in str(e):
            print(e)
            return None
        # Keep a row for the video so the caller can see why it has no text.
        entry["transcript"] = "Subtitles are disabled for this video"
    return entry
39 | 
40 | 
def get_batch_transcripts(youtube_urls: List[str]) -> List[Dict]:
    """Fetch transcripts for several urls, keeping only the ones that succeed.

    Urls for which ``get_single_transcript`` returns None are left out. If
    anything raises, the failure is printed and an empty list is returned.
    """
    try:
        fetched = map(get_single_transcript, youtube_urls)
        return [entry for entry in fetched if entry is not None]
    except Exception as e:
        print(f"FAILURE: get_batch_transcripts function failed with exception {e}")
        return []
52 | 


--------------------------------------------------------------------------------