├── .github └── workflows │ ├── docker-build.yml │ └── python-app.yml ├── .gitignore ├── .ruff.toml ├── .streamlit └── config.toml ├── Dockerfile ├── LICENSE ├── README.md ├── data └── input │ ├── blank.csv │ ├── blank.jpg │ ├── blank.zip │ └── test_input.txt ├── docker-compose.yml ├── requirements.test ├── requirements.txt ├── tests ├── __init__.py └── test_app.py └── ytdatakit ├── __init__.py ├── about.py ├── about ├── __init__.py └── app.py ├── app.py ├── youtube_channel_downloader ├── __init__.py ├── app.py ├── callbacks.py ├── config.py ├── state.py └── yt_channel_download.py ├── youtube_downloader ├── __init__.py ├── app.py ├── callbacks.py ├── config.py ├── state.py └── yt_download.py ├── youtube_thumbnail_downloader ├── __init__.py ├── app.py ├── callbacks.py ├── config.py ├── state.py ├── yt_thumbnail_downloader.py └── zip.py └── youtube_transcript_downloader ├── __init__.py ├── app.py ├── callbacks.py ├── config.py ├── state.py └── yt_transcript_download.py /.github/workflows/docker-build.yml: -------------------------------------------------------------------------------- 1 | name: 'DockerBuild' 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | 8 | jobs: 9 | Build_And_Push: 10 | runs-on: ubuntu-22.04 11 | permissions: 12 | contents: read 13 | packages: write 14 | 15 | steps: 16 | - name: Checkout 17 | uses: actions/checkout@v2 18 | 19 | - name: Log in to Github Docker Image Registry 20 | uses: docker/login-action@v3 21 | with: 22 | registry: ghcr.io 23 | username: ${{ github.actor }} 24 | password: ${{ secrets.TOKEN_GITHUB }} 25 | 26 | # Uncomment to use Docker Hub 27 | # - name: Login to Docker Hub 28 | # uses: docker/login-action@v3 29 | # with: 30 | # username: ${{ secrets.USERNAME_DOCKERHUB }} 31 | # password: ${{ secrets.TOKEN_DOCKERHUB }} 32 | 33 | - name: Docker Meta 34 | id: meta 35 | uses: docker/metadata-action@v4 36 | with: 37 | images: | 38 | ghcr.io/${{ github.actor }}/ytdatakit 39 | # ${{ secrets.DOCKERHUB_USERNAME }}/meme-search 40 | tags: type=ref,event=tag 41 | flavor: latest=true 42 | 43 | - name: Set up QEMU 44 | uses: docker/setup-qemu-action@v3 45 | 46 | - name: Set Buildx 47 | uses: docker/setup-buildx-action@v3 48 | 49 | - name: Build and Upload for AMD64 and ARM64 50 | uses: docker/build-push-action@v4 51 | with: 52 | context: . 53 | platforms: linux/amd64,linux/arm64 54 | push: true 55 | tags: ${{ steps.meta.outputs.tags }} 56 | labels: ${{ steps.meta.outputs.labels }} -------------------------------------------------------------------------------- /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Python application 5 | 6 | on: 7 | push: 8 | branches: [ "main" ] 9 | paths-ignore: 10 | - '**/README.md' 11 | - '**/CONTRIBUTING.md' 12 | - '**LICENSE' 13 | pull_request: 14 | branches: [ "main" ] 15 | paths-ignore: 16 | - '**/README.md' 17 | - '**/CONTRIBUTING.md' 18 | - '**LICENSE' 19 | 20 | jobs: 21 | ruff: 22 | name: lint with ruff 23 | runs-on: ubuntu-latest 24 | timeout-minutes: 3 25 | steps: 26 | - uses: actions/checkout@v4 27 | - uses: actions/setup-python@v2 28 | - uses: chartboost/ruff-action@v1 29 | with: 30 | args: 'format --check' 31 | config: .ruff.toml 32 | test: 33 | name: run pytest 34 | runs-on: ubuntu-latest 35 | timeout-minutes: 5 36 | steps: 37 | - name: Checkout code 38 | uses: actions/checkout@v2 39 | - name: Set up Python 40 | uses: actions/setup-python@v2 41 | with: 42 | python-version: '3.10' 43 | - name: Install dependencies 44 | run: | 45 | python -m pip install --upgrade pip 46 | pip install -r requirements.test 47 | pip install -r requirements.txt 48 | - name: Run pytest 49 | run: | 50 | PYTHONPATH=. python3.10 -m pytest tests/test_app.py -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | scratch.ipynb 2 | *.mp4a 3 | *.mp3 4 | *.mp4 5 | venv* 6 | *.db 7 | *.faiss 8 | .DS_Store 9 | ._.DS_Store 10 | **/.DS_Store 11 | **/._.DS_Store 12 | **/.env* 13 | bug_reports/ 14 | .ruff_cache/ 15 | .vscode 16 | notebook_tests/ 17 | scratch_notebooks/ 18 | demos/ 19 | release_notes/ 20 | site/ 21 | tests/test_files/text/test_preprocessed/* 22 | !.ruff.toml 23 | push_pypi.sh 24 | 25 | 26 | # Byte-compiled / optimized / DLL files 27 | __pycache__/ 28 | *.py[cod] 29 | *$py.class 30 | 31 | # C extensions 32 | *.so 33 | 34 | # Distribution / packaging 35 | .Python 36 | build/ 37 | develop-eggs/ 38 | dist/ 39 | downloads/ 40 | eggs/ 41 | .eggs/ 42 | lib/ 43 | lib64/ 44 | parts/ 45 | sdist/ 46 | var/ 47 | wheels/ 48 | share/python-wheels/ 49 | *.egg-info/ 50 | .installed.cfg 51 | *.egg 52 | MANIFEST 53 | 54 | # PyInstaller 55 | # Usually these files are written by a python script from a template 56 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 57 | *.manifest 58 | *.spec 59 | 60 | # Installer logs 61 | pip-log.txt 62 | pip-delete-this-directory.txt 63 | 64 | # Unit test / coverage reports 65 | htmlcov/ 66 | .tox/ 67 | .nox/ 68 | .coverage 69 | .coverage.* 70 | .cache 71 | nosetests.xml 72 | coverage.xml 73 | *.cover 74 | *.py,cover 75 | .hypothesis/ 76 | .pytest_cache/ 77 | cover/ 78 | 79 | # Translations 80 | *.mo 81 | *.pot 82 | 83 | # Django stuff: 84 | *.log 85 | local_settings.py 86 | db.sqlite3 87 | db.sqlite3-journal 88 | 89 | # Flask stuff: 90 | instance/ 91 | .webassets-cache 92 | 93 | # Scrapy stuff: 94 | .scrapy 95 | 96 | # Sphinx documentation 97 | docs/_build/ 98 | 99 | # PyBuilder 100 | .pybuilder/ 101 | target/ 102 | 103 | # Jupyter Notebook 104 | .ipynb_checkpoints 105 | 106 | # IPython 107 | profile_default/ 108 | ipython_config.py 109 | 110 | # pyenv 111 | # For a library or package, you might want to ignore these files since the code is 112 | # intended to run in multiple environments; otherwise, check them in: 113 | # .python-version 114 | 115 | # pipenv 116 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 117 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 118 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 119 | # install all needed dependencies. 120 | #Pipfile.lock 121 | 122 | # poetry 123 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 124 | # This is especially recommended for binary packages to ensure reproducibility, and is more 125 | # commonly ignored for libraries. 126 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 127 | #poetry.lock 128 | 129 | # pdm 130 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 131 | #pdm.lock 132 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 133 | # in version control. 134 | # https://pdm.fming.dev/#use-with-ide 135 | .pdm.toml 136 | 137 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 138 | __pypackages__/ 139 | 140 | # Celery stuff 141 | celerybeat-schedule 142 | celerybeat.pid 143 | 144 | # SageMath parsed files 145 | *.sage.py 146 | 147 | # Environments 148 | .venv 149 | venv/ 150 | venv.bak/ 151 | 152 | # Spyder project settings 153 | .spyderproject 154 | .spyproject 155 | 156 | # Rope project settings 157 | .ropeproject 158 | 159 | # mkdocs documentation 160 | /site 161 | 162 | # mypy 163 | .mypy_cache/ 164 | .dmypy.json 165 | dmypy.json 166 | 167 | # Pyre type checker 168 | .pyre/ 169 | 170 | # pytype static type analyzer 171 | .pytype/ 172 | 173 | # Cython debug symbols 174 | cython_debug/ -------------------------------------------------------------------------------- /.ruff.toml: -------------------------------------------------------------------------------- 1 | line-length = 150 2 | target-version = "py38" 3 | lint.select = ["E", "W"] 4 | lint.fixable = ["ALL"] 5 | lint.ignore = ["E501", "E999", "E402"] 6 | exclude = [ 7 | ".bzr", 8 | ".direnv", 9 | ".eggs", 10 | ".git", 11 | ".git-rewrite", 12 | ".hg", 13 | ".ipynb_checkpoints", 14 | ".mypy_cache", 15 | ".nox", 16 | ".pants.d", 17 | ".pyenv", 18 | ".pytest_cache", 19 | ".pytype", 20 | ".ruff_cache", 21 | ".svn", 22 | ".tox", 23 | ".venv", 24 | ".vscode", 25 | "__pypackages__", 26 | "_build", 27 | "buck-out", 28 | "build", 29 | "dist", 30 | "node_modules", 31 | "site-packages", 32 | "venv", 33 | "tests", 34 | "scratch_notebooks", 35 | "release_notes", 36 | "notebook_tests", 37 | "demos" 38 | ] 39 | extend-include = ["*.ipynb"] -------------------------------------------------------------------------------- /.streamlit/config.toml: -------------------------------------------------------------------------------- 1 | [browser] 2 | gatherUsageStats = false 3 | 4 | [server] 5 | runOnSave = true 6 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10-slim 2 | 3 | RUN apt-get update && apt-get install -y \ 4 | build-essential \ 5 | software-properties-common \ 6 | ffmpeg \ 7 | && rm -rf /var/lib/apt/lists/* 8 | 9 | WORKDIR /home 10 | 11 | ENV PYTHONPATH=. 12 | 13 | COPY requirements.txt /home/requirements.txt 14 | COPY ytdatakit /home/ytdatakit 15 | COPY .streamlit /home/.streamlit 16 | RUN pip3 install -r /home/requirements.txt 17 | 18 | EXPOSE 8502 19 | 20 | HEALTHCHECK CMD curl --fail http://localhost:8502/_stcore/health || exit 1 21 | 22 | ENTRYPOINT ["streamlit", "run", "/home/ytdatakit/app.py", "--server.port=8502", "--server.address=0.0.0.0"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [](https://github.com/neonwatty/ytdatakit/actions/workflows/python-app.yml/python-app.yml) 2 | 3 | # A free Python app for downloading YouTube videos, transcripts, thumbnails, and channel data all in one place 4 | 5 | A simple app that lets you conveniently download youtube videos, transcripts, thumbnails, and channel data. All in one place. Can be easily run locally using python or docker. 6 | 7 |
8 |
9 |
fetch
', unsafe_allow_html=True) 60 | st.markdown('', unsafe_allow_html=True) 61 | fetch_btn = st.button( 62 | "fetch channel video ids", 63 | type="primary", 64 | ) 65 | if fetch_btn: 66 | if channel_name != st.session_state.channel_name: 67 | state_reset() 68 | if st.session_state.channel_fetch_count == 0: 69 | df_table, df_download = fetch_channel_videos(channel_name) 70 | st.session_state.channel_data_table = df_table 71 | st.session_state.channel_data_download = df_download 72 | st.session_state.channel_fetch_count += 1 73 | 74 | with video_channel_col_c: 75 | st.markdown('fetch
', unsafe_allow_html=True) 76 | st.markdown('', unsafe_allow_html=True) 77 | st.download_button( 78 | label="download", 79 | data=st.session_state.channel_data_download, 80 | file_name="channel_data.csv", 81 | mime="text/csv", 82 | disabled=False if st.session_state.channel_fetch_count > 0 else True, 83 | type="primary", 84 | ) 85 | with st.container(border=True): 86 | st.table(st.session_state.channel_data_table.head(10)) 87 | -------------------------------------------------------------------------------- /ytdatakit/youtube_channel_downloader/callbacks.py: -------------------------------------------------------------------------------- 1 | from ytdatakit.youtube_channel_downloader.yt_channel_download import get_channel_videos 2 | import pandas as pd 3 | import streamlit as st 4 | 5 | 6 | @st.cache_data 7 | def convert_df(df: pd.DataFrame) -> "csv": 8 | # IMPORTANT: Cache the conversion to prevent computation on every rerun 9 | return df.to_csv().encode("utf-8") 10 | 11 | 12 | def fetch_channel_videos(channel_name: str): 13 | # with st.spinner(text="channel video ids pull in progress..."): 14 | video_ids, video_urls = get_channel_videos(channel_name) 15 | if video_ids is not None and video_urls is not None: 16 | df_table = pd.DataFrame(columns=["youtube_url", "video_id"]) 17 | df_table["youtube_url"] = video_urls 18 | df_table["video_id"] = video_ids 19 | df_download = convert_df(df_table) 20 | return df_table, df_download 21 | return None, None 22 | -------------------------------------------------------------------------------- /ytdatakit/youtube_channel_downloader/config.py: -------------------------------------------------------------------------------- 1 | default_channel_name = "Monkhaus" 2 | -------------------------------------------------------------------------------- /ytdatakit/youtube_channel_downloader/state.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | from ytdatakit.youtube_channel_downloader.config import default_channel_name 4 | 5 | 6 | def state_init(): 7 | df = pd.DataFrame(columns=["youtube_url", "video_id"]) 8 | if "channel_data_table" not in st.session_state: 9 | st.session_state.channel_data_table = df 10 | if "channel_data_download" not in st.session_state: 11 | st.session_state.channel_data_download = df.to_csv().encode("utf-8") 12 | if "channel_name" not in st.session_state: 13 | st.session_state.channel_name = default_channel_name 14 | if "channel_fetch_count" not in st.session_state: 15 | st.session_state.channel_fetch_count = 0 16 | 17 | 18 | def state_reset(): 19 | df = pd.DataFrame(columns=["youtube_url", "video_id"]) 20 | if "channel_data_table" not in st.session_state: 21 | st.session_state.channel_data_table = df 22 | if "channel_data_download" not in st.session_state: 23 | st.session_state.channel_data_download = df.to_csv().encode("utf-8") 24 | st.session_state.channel_fetch_count = 0 25 | -------------------------------------------------------------------------------- /ytdatakit/youtube_channel_downloader/yt_channel_download.py: -------------------------------------------------------------------------------- 1 | import yt_dlp 2 | import scrapetube 3 | from typing import Tuple 4 | 5 | 6 | def get_channel_id_from_name(channel_name: str) -> str | None: 7 | ydl_opts = { 8 | "quiet": True, 9 | "skip_download": True, 10 | "extract_flat": True, 11 | "force_generic_extractor": True, 12 | } 13 | 14 | with yt_dlp.YoutubeDL(ydl_opts) as ydl: 15 | try: 16 | info = ydl.extract_info(f"ytsearch1:{channel_name}", download=False) 17 | return info["entries"][0]["channel_id"] 18 | except Exception as e: 19 | print(f"FAILURE: get_channel_id_from_name failed with exception {e}") 20 | return None 21 | 22 | 23 | def get_videourl_from_channel_id(channel_id: str) -> Tuple[list, list] | Tuple[None, None]: 24 | try: 25 | videos = scrapetube.get_channel(channel_id) 26 | video_urls = [] 27 | video_ids = [] 28 | for video in videos: 29 | vid = video["videoId"] 30 | vurl = "https://www.youtube.com/watch?v=" + vid 31 | video_ids.append(vid) 32 | video_urls.append(vurl) 33 | return video_ids, video_urls 34 | except Exception as e: 35 | print(f"FAILURE: get_videourls_from_channel_id failed with exception {e}") 36 | return None, None 37 | 38 | 39 | def get_channel_videos(channel_name: str) -> Tuple[list, list] | Tuple[None, None]: 40 | try: 41 | print("INFO: starting channel video id puller...") 42 | channel_id = get_channel_id_from_name(channel_name) 43 | if channel_id is not None: 44 | video_ids, video_urls = get_videourl_from_channel_id(channel_id) 45 | if video_ids is not None and video_urls is not None: 46 | print("...done!") 47 | return video_ids, video_urls 48 | else: 49 | print("...done!") 50 | return None, None 51 | else: 52 | print("...done!") 53 | return None, None 54 | except Exception as e: 55 | print(f"FAILURE: get_channel_videos failed with exception {e}") 56 | return None, None 57 | -------------------------------------------------------------------------------- /ytdatakit/youtube_downloader/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | base_dir = os.path.dirname(os.path.abspath(__file__)) 4 | main_dir = os.path.dirname(base_dir) 5 | -------------------------------------------------------------------------------- /ytdatakit/youtube_downloader/app.py: -------------------------------------------------------------------------------- 1 | from ytdatakit.youtube_downloader.config import video_choices 2 | from ytdatakit.youtube_downloader.callbacks import callback_download_video 3 | from ytdatakit.youtube_downloader.state import state_init 4 | import streamlit as st 5 | 6 | 7 | def app(): 8 | state_init() 9 | st.markdown( 10 | """ 11 | 36 | """, 37 | unsafe_allow_html=True, 38 | ) 39 | 40 | st.markdown( 41 | """ 42 | 48 | """, 49 | unsafe_allow_html=True, 50 | ) 51 | 52 | video_download_col_a, video_download_col_b, video_download_col_c = st.columns([4, 3, 2]) 53 | with video_download_col_a: 54 | url_input = st.text_input( 55 | value="https://www.youtube.com/watch?v=qQgyoHsknIk", 56 | label="🔗 Paste YouTube / Shorts URL here", 57 | placeholder="e.g., https://www.youtube.com/watch?v=.", 58 | key="youtube_download_text_input", 59 | ) 60 | with video_download_col_b: 61 | resolution_dropdown = st.selectbox(options=video_choices, index=st.session_state.youtube_download_resolution_index, label="video resolution") 62 | with video_download_col_c: 63 | st.markdown('fetch
', unsafe_allow_html=True) 64 | st.markdown('', unsafe_allow_html=True) 65 | st.button( 66 | "fetch video", 67 | type="primary", 68 | on_click=callback_download_video, 69 | args=( 70 | url_input, 71 | resolution_dropdown, 72 | ), 73 | key="youtube_download_fetch_button", 74 | ) 75 | with st.container(border=True): 76 | with open(st.session_state.youtube_download_location, "rb") as file: 77 | st.markdown('', unsafe_allow_html=True) 78 | st.download_button( 79 | label="download video", 80 | data=file, 81 | file_name=st.session_state.youtube_download_location.split("/")[-1], 82 | mime="video/mp4", 83 | type="primary", 84 | ) 85 | st.video(data=st.session_state.youtube_download_location, format="video/mp4") 86 | -------------------------------------------------------------------------------- /ytdatakit/youtube_downloader/callbacks.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from ytdatakit.youtube_downloader.yt_download import download_video 3 | from ytdatakit.youtube_downloader.state import default_youtube_download_location 4 | from ytdatakit.youtube_downloader.config import video_choices 5 | 6 | 7 | def callback_download_video(url_input: str, resolution_dropdown: str) -> None: 8 | temporary_video_location = download_video(url_input, default_youtube_download_location(), st.session_state.resolution_dropdown) 9 | st.session_state.youtube_download_location = temporary_video_location 10 | st.session_state.youtube_download_resolution_index = video_choices.index(resolution_dropdown) 11 | -------------------------------------------------------------------------------- /ytdatakit/youtube_downloader/config.py: -------------------------------------------------------------------------------- 1 | app_name = "ytdatakit" 2 | video_choices = ["best", "1080", "720", "360"] 3 | default_clip_video_path = "./data/input/blank.mp4" 4 | default_clip_gif_path = "./data/input/blank.jpg" 5 | -------------------------------------------------------------------------------- /ytdatakit/youtube_downloader/state.py: -------------------------------------------------------------------------------- 1 | from ytdatakit.youtube_downloader.config import video_choices, default_clip_video_path 2 | import streamlit as st 3 | import tempfile 4 | 5 | 6 | def default_youtube_download_location(): 7 | with tempfile.TemporaryDirectory() as tmpdirname: 8 | return tmpdirname 9 | 10 | 11 | def state_init(): 12 | if "resolution_dropdown" not in st.session_state: 13 | st.session_state.resolution_dropdown = video_choices 14 | if "youtube_download_location" not in st.session_state: 15 | st.session_state.youtube_download_location = default_clip_video_path 16 | if "youtube_download_resolution_index" not in st.session_state: 17 | st.session_state.youtube_download_resolution_index = 0 18 | -------------------------------------------------------------------------------- /ytdatakit/youtube_downloader/yt_download.py: -------------------------------------------------------------------------------- 1 | import yt_dlp 2 | from yt_dlp import YoutubeDL 3 | import re 4 | 5 | 6 | def is_valid_youtube_url(url: str) -> bool: 7 | if not isinstance(url, str): 8 | return False 9 | pattern = r"^https://www\.youtube\.com/watch\?v=[A-Za-z0-9_-]{11}$" # youtube vido ids are always 11 chars long 10 | if "shorts" in url: 11 | pattern = r"^https://www\.youtube\.com/shorts/[A-Za-z0-9_-]{11}$" # youtube vido ids are always 11 chars long 12 | return re.match(pattern, url) is not None 13 | 14 | 15 | def download_video(url: str, savedir: str, resolution_dropdown: str, my_proxies: dict = {}) -> str: 16 | try: 17 | print("Downloading video from youtube...") 18 | if is_valid_youtube_url(url): 19 | with YoutubeDL() as ydl: 20 | info_dict = ydl.extract_info(url, download=False) 21 | video_url = info_dict.get("url", None) 22 | video_id = info_dict.get("id", None) 23 | video_title = info_dict.get("title", None) 24 | video_title = re.sub(r"[^a-zA-Z0-9]", " ", video_title) 25 | 26 | if video_title is None: 27 | savepath = savedir + "/" + video_id + ".mp4" 28 | else: 29 | savepath = savedir + "/" + video_title + ".mp4" 30 | 31 | ydl_opts = { 32 | "format": "bestvideo+bestaudio/best", 33 | "merge_output_format": "mp4", 34 | "outtmpl": savepath, 35 | } 36 | if resolution_dropdown == "1080": 37 | ydl_opts = { 38 | "format": "bestvideo[height<=1080]+bestaudio/best", 39 | "merge_output_format": "mp4", 40 | "outtmpl": savepath, 41 | } 42 | 43 | if resolution_dropdown == "720": 44 | ydl_opts = { 45 | "format": "bestvideo[height<=720]+bestaudio/best", 46 | "merge_output_format": "mp4", 47 | "outtmpl": savepath, 48 | } 49 | 50 | if resolution_dropdown == "360": 51 | ydl_opts = { 52 | "format": "bestvideo[height<=360]+bestaudio/best", 53 | "merge_output_format": "mp4", 54 | "outtmpl": savepath, 55 | } 56 | 57 | with yt_dlp.YoutubeDL(ydl_opts) as ydl: 58 | ydl.download([url]) 59 | 60 | print("...done!") 61 | return savepath 62 | else: 63 | raise ValueError(f"invalid input url: {url}") 64 | except Exception as e: 65 | raise ValueError(f"yt_download failed with exception {e}") 66 | -------------------------------------------------------------------------------- /ytdatakit/youtube_thumbnail_downloader/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neonwatty/python-youtube-data-kit/4892ba613962a51b5919b4bd161cd88781e0c34a/ytdatakit/youtube_thumbnail_downloader/__init__.py -------------------------------------------------------------------------------- /ytdatakit/youtube_thumbnail_downloader/app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from ytdatakit.youtube_thumbnail_downloader.state import state_init 3 | from ytdatakit.youtube_thumbnail_downloader.callbacks import fetch_thumbnails 4 | 5 | 6 | def app(): 7 | state_init() 8 | st.markdown( 9 | """ 10 | 35 | """, 36 | unsafe_allow_html=True, 37 | ) 38 | 39 | base = st.container(border=True) 40 | with base: 41 | text_urls = st.text_area( 42 | "youtube urls separated by commas", 43 | value=st.session_state.thumbnail_text_input_urls if "thumbnail_text_input_urls" in st.session_state else "", 44 | placeholder="https://www.youtube.com/shorts/o7a9hx-Pqyo, https://www.youtube.com/shorts/xkAYLnIsfX4, ....", 45 | key="thumbnail_urls_input", 46 | ) 47 | st.thumbnail_text_input_urls = text_urls 48 | uploaded_file = st.file_uploader("Choose a File", type=["txt"], key="thumbanils_file_uploader") 49 | thumbnail_col_1, thumbnail_col_2, thumbnail_col_3 = st.columns([5, 8, 8]) 50 | with thumbnail_col_1: 51 | st.markdown('', unsafe_allow_html=True) 52 | st.button(label="fetch thumbnails", type="primary", on_click=fetch_thumbnails, args=(uploaded_file, text_urls)) 53 | 54 | with thumbnail_col_2: 55 | st.markdown('', unsafe_allow_html=True) 56 | if "thumbnails_zip_path" in st.session_state: 57 | with open(st.session_state.thumbnails_zip_path, "rb") as file: 58 | st.download_button( 59 | label="download thumbnails", 60 | data=file, # st.session_state.thumbnails_zip_path if "thumbnails_zip_path" in st.session_state else "./data/input/blank.zip", 61 | file_name="thumbnails.zip", 62 | mime="application/zip", 63 | type="primary", 64 | disabled=True if st.session_state.thumbnail_fetch_count == 0 else False, 65 | ) 66 | else: 67 | st.download_button( 68 | label="download thumbnails", 69 | data="./data/input/blank.zip", 70 | file_name="thumbnails.zip", 71 | mime="application/zip", 72 | type="primary", 73 | disabled=True, 74 | ) 75 | 76 | with st.container(border=True): 77 | for ind, thumbnail_savepath in enumerate(st.session_state.thumbnail_savepaths): 78 | title = st.session_state.thumbnail_data_entries[ind]["video_title"] 79 | thumbnail_savepath = st.session_state.thumbnail_savepaths[ind] 80 | with st.container(border=True): 81 | a, b, c = st.columns([1, 3, 1]) 82 | with b: 83 | st.subheader(title) 84 | st.image(thumbnail_savepath) 85 | with open(thumbnail_savepath, "rb") as file: 86 | st.download_button( 87 | label="download thumbnail", 88 | data=file, 89 | file_name=title + ".jpg", 90 | mime="image/jpg", 91 | key=f"{title} download", 92 | type="primary", 93 | ) 94 | -------------------------------------------------------------------------------- /ytdatakit/youtube_thumbnail_downloader/callbacks.py: -------------------------------------------------------------------------------- 1 | from ytdatakit.youtube_thumbnail_downloader.yt_thumbnail_downloader import get_batch_thumbnails 2 | from ytdatakit.youtube_thumbnail_downloader.zip import zip_images 3 | from ytdatakit.youtube_thumbnail_downloader.state import reset_state 4 | from ytdatakit.youtube_thumbnail_downloader.config import default_thumbnail_location 5 | import streamlit as st 6 | from io import StringIO 7 | import tempfile 8 | 9 | 10 | def default_temp_savdir(): 11 | with tempfile.TemporaryDirectory() as tmpdirname: 12 | return tmpdirname 13 | 14 | 15 | def urls_normalizer(uploaded_file: "st.uploaded", text_urls: str) -> list: 16 | youtube_urls = [] 17 | if uploaded_file is not None: 18 | if text_urls is not None: 19 | if len(text_urls.strip()) > 0: 20 | st.warning("you can enter urls manually or from file but not both", icon="⚠️") 21 | st.stop() 22 | 23 | if uploaded_file.type == "text/plain": 24 | stringio = StringIO(uploaded_file.read().decode("utf-8")) 25 | for line in stringio: 26 | youtube_urls.append(line.strip()) 27 | if text_urls is not None: 28 | if len(text_urls.strip()) > 0: 29 | if uploaded_file is not None: 30 | st.warning("you can enter urls manually or from file but not both", icon="⚠️") 31 | st.stop() 32 | try: 33 | text_urls_split = text_urls.split(",") 34 | text_urls_split = [v.strip() for v in text_urls_split] 35 | youtube_urls = text_urls_split 36 | except: # noqa E722 37 | st.warning("please check your manually entered urls", icon="⚠️") 38 | st.stop() 39 | return youtube_urls 40 | 41 | 42 | def fetch_logic(youtube_urls: list) -> None: 43 | if youtube_urls != st.session_state.thumbnail_raw_urls: 44 | st.session_state.thumbnail_raw_urls = youtube_urls 45 | reset_state() 46 | if st.session_state.thumbnail_fetch_count == 0: 47 | st.session_state.local_thumbnail_location = default_thumbnail_location() 48 | savedir = "/".join(st.session_state.local_thumbnail_location.split("/")[:-2]) 49 | thumbnail_savepaths, thumbnail_data_entries = get_batch_thumbnails(youtube_urls, savedir) 50 | st.session_state.thumbnail_savepaths = thumbnail_savepaths 51 | st.session_state.thumbnail_data_entries = thumbnail_data_entries 52 | st.session_state.thumbnail_fetch_count += 1 53 | 54 | st.session_state.thumbnails_zip_path = savedir + "/" + "thumbnails.zip" 55 | zip_images(thumbnail_savepaths) 56 | 57 | 58 | def fetch_thumbnails(uploaded_file, text_urls): 59 | # with st.spinner(text="thumbnail pull in progress..."): 60 | youtube_urls = urls_normalizer(uploaded_file, text_urls) 61 | fetch_logic(youtube_urls) 62 | -------------------------------------------------------------------------------- /ytdatakit/youtube_thumbnail_downloader/config.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | import uuid 3 | 4 | default_thumbnail_raw_urls = "" 5 | default_thumbnail_savepaths = [] 6 | default_thumbnail_data_entries = [] 7 | default_thumbnail_text_input_urls = "" 8 | default_thumbnails_zip_path = "./data/input/blank.zip" 9 | 10 | 11 | def default_thumbnail_location(): 12 | with tempfile.TemporaryDirectory() as tmpdirname: 13 | return tmpdirname + "/temp_" + str(uuid.uuid4()) + ".jpg" 14 | -------------------------------------------------------------------------------- /ytdatakit/youtube_thumbnail_downloader/state.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from ytdatakit.youtube_thumbnail_downloader.config import ( 3 | default_thumbnail_raw_urls, 4 | default_thumbnail_savepaths, 5 | default_thumbnail_location, 6 | default_thumbnail_data_entries, 7 | default_thumbnail_text_input_urls, 8 | default_thumbnails_zip_path, 9 | ) 10 | 11 | 12 | def state_init(): 13 | if "thumbnail_raw_urls" not in st.session_state: 14 | st.session_state.thumbnail_raw_urls = default_thumbnail_raw_urls 15 | if "thumbnail_savepaths" not in st.session_state: 16 | st.session_state.thumbnail_savepaths = default_thumbnail_savepaths 17 | if "thumbnail_data_entries" not in st.session_state: 18 | st.session_state.thumbnail_data_entries = default_thumbnail_data_entries 19 | if "thumbnail_fetch_count" not in st.session_state: 20 | st.session_state.thumbnail_fetch_count = 0 21 | if "default_thumbnail_location" not in st.session_state: 22 | st.session_state.local_thumbnail_location = default_thumbnail_location() 23 | if "youtube_thumbnails_expander" not in st.session_state: 24 | st.session_state.youtube_thumbnails_expander = False 25 | 26 | 27 | def reset_state(): 28 | st.session_state.thumbnail_savepaths = default_thumbnail_savepaths 29 | st.session_state.thumbnail_text_input_urls = default_thumbnail_text_input_urls 30 | st.session_state.thumbnails_zip_path = default_thumbnails_zip_path 31 | st.session_state.thumbnail_text_input_urls = "" 32 | 33 | st.session_state.thumbnail_fetch_count = 0 34 | st.session_state.youtube_thumbnails_expander = False 35 | -------------------------------------------------------------------------------- /ytdatakit/youtube_thumbnail_downloader/yt_thumbnail_downloader.py: -------------------------------------------------------------------------------- 1 | import re 2 | import requests 3 | from yt_dlp import YoutubeDL 4 | 5 | 6 | def is_valid_youtube_url(url: str) -> bool: 7 | if not isinstance(url, str): 8 | return False 9 | pattern = r"^https://www\.youtube\.com/watch\?v=[A-Za-z0-9_-]{11}$" # youtube vido ids are always 11 chars long 10 | if "shorts" in url: 11 | pattern = r"^https://www\.youtube\.com/shorts/[A-Za-z0-9_-]{11}$" # youtube vido ids are always 11 chars long 12 | return re.match(pattern, url) is not None 13 | 14 | 15 | def download_thumbnail(yt_thumbnail_url: str, savepath: str) -> None: 16 | img_data = requests.get(yt_thumbnail_url).content 17 | with open(savepath, "wb") as handler: 18 | handler.write(img_data) 19 | 20 | 21 | def get_youtube_thumbnail_url(video_id: str) -> dict: 22 | if video_id: 23 | return { 24 | "default": f"https://img.youtube.com/vi/{video_id}/default.jpg", 25 | "mqdefault": f"https://img.youtube.com/vi/{video_id}/mqdefault.jpg", 26 | "hqdefault": f"https://img.youtube.com/vi/{video_id}/hqdefault.jpg", 27 | "sddefault": f"https://img.youtube.com/vi/{video_id}/sddefault.jpg", 28 | "maxresdefault": f"https://img.youtube.com/vi/{video_id}/maxresdefault.jpg", 29 | } 30 | 31 | 32 | def pull_yt_data(url: str, savedir: str, my_proxies: dict = {}) -> tuple: 33 | try: 34 | if is_valid_youtube_url(url): 35 | with YoutubeDL() as ydl: 36 | info_dict = ydl.extract_info(url, download=False) 37 | video_url = info_dict.get("url", None) 38 | video_id = info_dict.get("id", None) 39 | video_title = info_dict.get("title", None) 40 | entry = {} 41 | entry["video_url"] = url 42 | entry["video_id"] = video_id 43 | entry["video_title"] = video_title 44 | video_title = re.sub(r"[^a-zA-Z0-9]", "", video_title) 45 | 46 | if video_title is None: 47 | savepath = savedir + "/" + video_id + ".jpg" 48 | else: 49 | savepath = savedir + "/" + video_title + ".jpg" 50 | 51 | if video_id: 52 | thumbnail_url = get_youtube_thumbnail_url(video_id)["hqdefault"] 53 | download_thumbnail(thumbnail_url, savepath) 54 | print("...done!") 55 | return savepath, entry 56 | else: 57 | raise ValueError(f"invalid input url: {url}") 58 | except Exception as e: 59 | raise ValueError(f"yt_download failed with exception {e}") 60 | 61 | 62 | def get_batch_thumbnails(yt_urls: list, savedir: str, my_proxies: dict = {}): 63 | thumbnail_savepaths = [] 64 | entries = [] 65 | for url in yt_urls: 66 | try: 67 | thumbnail_savepath, data_entry = pull_yt_data(url, savedir, my_proxies) 68 | thumbnail_savepaths.append(thumbnail_savepath) 69 | entries.append(data_entry) 70 | except Exception as e: 71 | print(f"url {url} failed with exception {e}") 72 | pass 73 | return thumbnail_savepaths, entries 74 | -------------------------------------------------------------------------------- /ytdatakit/youtube_thumbnail_downloader/zip.py: -------------------------------------------------------------------------------- 1 | import zipfile 2 | import os 3 | import streamlit as st 4 | 5 | 6 | def zip_images(image_paths: list): 7 | print("INFO: zipping images...") 8 | zip_filename = st.session_state.thumbnails_zip_path 9 | with zipfile.ZipFile(zip_filename, "w") as zipf: 10 | for image_path in image_paths: 11 | _, filename = os.path.split(image_path) 12 | zipf.write(image_path, arcname=filename) 13 | print(f"Added {filename} to the zip file.") 14 | print(f"...done! images have been zipped into {zip_filename}") 15 | -------------------------------------------------------------------------------- /ytdatakit/youtube_transcript_downloader/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neonwatty/python-youtube-data-kit/4892ba613962a51b5919b4bd161cd88781e0c34a/ytdatakit/youtube_transcript_downloader/__init__.py -------------------------------------------------------------------------------- /ytdatakit/youtube_transcript_downloader/app.py: -------------------------------------------------------------------------------- 1 | from ytdatakit.youtube_transcript_downloader.callbacks import fetch_transcripts 2 | from ytdatakit.youtube_transcript_downloader.state import state_init 3 | import streamlit as st 4 | 5 | 6 | def app(): 7 | state_init() 8 | st.markdown( 9 | """ 10 | 35 | """, 36 | unsafe_allow_html=True, 37 | ) 38 | 39 | base = st.container(border=True) 40 | with base: 41 | text_urls = st.text_area( 42 | "youtube urls separated by commas", 43 | value=st.session_state.transcript_raw_urls, 44 | placeholder="https://www.youtube.com/shorts/o7a9hx-Pqyo, https://www.youtube.com/shorts/xkAYLnIsfX4, ....", 45 | key="transcript_urls_input", 46 | ) 47 | uploaded_file = st.file_uploader("Choose a File", type=["txt"], key="transcripts_file_uploader") 48 | transcript_col_1, transcript_col_2, transcript_col_3 = st.columns([3, 4, 6]) 49 | with transcript_col_1: 50 | st.markdown('', unsafe_allow_html=True) 51 | fetch_btn = st.button( 52 | label="fetch transcripts", 53 | type="primary", 54 | ) 55 | if fetch_btn: 56 | df_table, df_download = fetch_transcripts(uploaded_file, text_urls) 57 | st.session_state.transcript_data_table = df_table 58 | st.session_state.transcript_data_download = df_download 59 | with transcript_col_2: 60 | st.markdown('', unsafe_allow_html=True) 61 | st.download_button( 62 | label="download transcripts", 63 | data=st.session_state.transcript_data_download, 64 | file_name="transcripts.csv", 65 | mime="text/csv", 66 | disabled=False, 67 | type="primary", 68 | ) 69 | with st.container(border=True): 70 | st.table(st.session_state.transcript_data_table) 71 | -------------------------------------------------------------------------------- /ytdatakit/youtube_transcript_downloader/callbacks.py: -------------------------------------------------------------------------------- 1 | from ytdatakit.youtube_transcript_downloader.yt_transcript_download import get_batch_transcripts 2 | from io import StringIO 3 | import pandas as pd 4 | import streamlit as st 5 | import copy 6 | 7 | 8 | @st.cache_data 9 | def convert_df(df: pd.DataFrame) -> "csv": 10 | # IMPORTANT: Cache the conversion to prevent computation on every rerun 11 | return df.to_csv().encode("utf-8") 12 | 13 | 14 | def fetch_transcripts(uploaded_file, text_urls): 15 | # with st.spinner(text="transcript pull in progress..."): 16 | youtube_urls = [] 17 | if uploaded_file is not None: 18 | if text_urls is not None: 19 | if len(text_urls.strip()) > 0: 20 | st.warning("you can enter urls manually or from file but not both", icon="⚠️") 21 | st.stop() 22 | 23 | if uploaded_file.type == "text/plain": 24 | stringio = StringIO(uploaded_file.read().decode("utf-8")) 25 | for line in stringio: 26 | youtube_urls.append(line.strip()) 27 | if text_urls is not None: 28 | if len(text_urls.strip()) > 0: 29 | if uploaded_file is not None: 30 | st.warning("you can enter urls manually or from file but not both", icon="⚠️") 31 | st.stop() 32 | try: 33 | text_urls_split = text_urls.split(",") 34 | text_urls_split = [v.strip() for v in text_urls_split] 35 | youtube_urls = text_urls_split 36 | except: # noqa E722 37 | st.warning("please check your manually entered urls", icon="⚠️") 38 | st.stop() 39 | 40 | batch_transcripts = get_batch_transcripts(youtube_urls) 41 | df = pd.DataFrame(batch_transcripts) 42 | df_download = convert_df(df) 43 | 44 | def truncate_and_append(text, length, suffix): 45 | if len(text) > length: 46 | return text[:length] + suffix 47 | return text 48 | 49 | max_length = 100 50 | suffix = "..." 51 | df_table = copy.deepcopy(df).astype(str) 52 | df_table["transcript"] = df_table["transcript"].apply(lambda x: truncate_and_append(x, max_length, suffix)) 53 | return df_table, df_download 54 | -------------------------------------------------------------------------------- /ytdatakit/youtube_transcript_downloader/config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/neonwatty/python-youtube-data-kit/4892ba613962a51b5919b4bd161cd88781e0c34a/ytdatakit/youtube_transcript_downloader/config.py -------------------------------------------------------------------------------- /ytdatakit/youtube_transcript_downloader/state.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | 4 | 5 | def state_init(): 6 | df = pd.DataFrame(columns=["youtube_url", "video_id", "transcript"]) 7 | if "transcript_raw_urls" not in st.session_state: 8 | st.session_state.transcript_raw_urls = "" 9 | if "transcript_data_table" not in st.session_state: 10 | st.session_state.transcript_data_table = df 11 | if "transcript_data_download" not in st.session_state: 12 | st.session_state.transcript_data_download = df.to_csv().encode("utf-8") 13 | -------------------------------------------------------------------------------- /ytdatakit/youtube_transcript_downloader/yt_transcript_download.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import List, Dict 3 | from youtube_transcript_api import YouTubeTranscriptApi 4 | 5 | 6 | def is_valid_youtube_url(url: str) -> bool: 7 | if not isinstance(url, str): 8 | return False 9 | pattern = r"^https://www\.youtube\.com/watch\?v=[A-Za-z0-9_-]{11}$" # youtube vido ids are always 11 chars long 10 | if "shorts" in url: 11 | pattern = r"^https://www\.youtube\.com/shorts/[A-Za-z0-9_-]{11}$" # youtube vido ids are always 11 chars long 12 | return re.match(pattern, url) is not None 13 | 14 | 15 | def get_single_transcript(youtube_url: str) -> dict: 16 | if is_valid_youtube_url(youtube_url): 17 | if "shorts" in youtube_url: 18 | video_id = youtube_url.split("/")[-1] 19 | else: 20 | video_id = youtube_url.split("=")[-1] 21 | try: 22 | video_transcript = YouTubeTranscriptApi.get_transcript(video_id) 23 | entry = {} 24 | entry["youtube_url"] = youtube_url 25 | entry["video_id"] = video_id 26 | entry["transcript"] = video_transcript 27 | return entry 28 | except Exception as e: 29 | if "Subtitles are disabled for this video" in str(e): 30 | entry = {} 31 | entry["youtube_url"] = youtube_url 32 | entry["video_id"] = video_id 33 | entry["transcript"] = "Subtitles are disabled for this video" 34 | return entry 35 | else: 36 | print(e) 37 | else: 38 | print(f"FAILURE: youtube_url is not valid - {youtube_url}") 39 | 40 | 41 | def get_batch_transcripts(youtube_urls: List[str]) -> List[Dict]: 42 | try: 43 | entries = [] 44 | for i, youtube_url in enumerate(youtube_urls): 45 | entry = get_single_transcript(youtube_url) 46 | if entry is not None: 47 | entries.append(entry) 48 | return entries 49 | except Exception as e: 50 | print(f"FAILURE: get_batch_transcripts function failed with exception {e}") 51 | return [] 52 | --------------------------------------------------------------------------------