├── .gitignore ├── Assets ├── example20.jpg ├── example_0.0.jpg ├── example_0.1.jpg ├── example_0.2.jpg ├── example_0.3.jpg ├── example_0.4.jpg ├── example_0.5.jpg ├── example_0.6.jpg ├── example_1.1.jpg └── title.jpg ├── LICENSE ├── README.md ├── Samples.md ├── YT_Scrape.py ├── old_history.py ├── requirements.txt └── src ├── __init__.py ├── create_new.py ├── download_these.py ├── downloading.py ├── early_views.py ├── entire_channel.py ├── get_api_key.py ├── get_channel_details.py ├── get_channel_id.py ├── get_channel_playlists.py ├── get_channel_videos.py ├── get_playlist_videos.py ├── get_video_stats.py ├── import_downloaded_items.py ├── load_history.py ├── most_watched.py ├── oldest_videos.py ├── subscriptions.py └── vidsPerTime.py /.gitignore: -------------------------------------------------------------------------------- 1 | venv 2 | youtube.db 3 | youtube1.db 4 | youtube2.db 5 | test.py 6 | test2.py 7 | takeout 8 | __pycache__ 9 | *.txt 10 | *.exe 11 | *.log -------------------------------------------------------------------------------- /Assets/example20.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CriticalHunter/Youtube_Scraper/dca7c71823cb49e014a8cdf08dd8e391ddfbd57f/Assets/example20.jpg -------------------------------------------------------------------------------- /Assets/example_0.0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CriticalHunter/Youtube_Scraper/dca7c71823cb49e014a8cdf08dd8e391ddfbd57f/Assets/example_0.0.jpg -------------------------------------------------------------------------------- /Assets/example_0.1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CriticalHunter/Youtube_Scraper/dca7c71823cb49e014a8cdf08dd8e391ddfbd57f/Assets/example_0.1.jpg 
-------------------------------------------------------------------------------- /Assets/example_0.2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CriticalHunter/Youtube_Scraper/dca7c71823cb49e014a8cdf08dd8e391ddfbd57f/Assets/example_0.2.jpg -------------------------------------------------------------------------------- /Assets/example_0.3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CriticalHunter/Youtube_Scraper/dca7c71823cb49e014a8cdf08dd8e391ddfbd57f/Assets/example_0.3.jpg -------------------------------------------------------------------------------- /Assets/example_0.4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CriticalHunter/Youtube_Scraper/dca7c71823cb49e014a8cdf08dd8e391ddfbd57f/Assets/example_0.4.jpg -------------------------------------------------------------------------------- /Assets/example_0.5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CriticalHunter/Youtube_Scraper/dca7c71823cb49e014a8cdf08dd8e391ddfbd57f/Assets/example_0.5.jpg -------------------------------------------------------------------------------- /Assets/example_0.6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CriticalHunter/Youtube_Scraper/dca7c71823cb49e014a8cdf08dd8e391ddfbd57f/Assets/example_0.6.jpg -------------------------------------------------------------------------------- /Assets/example_1.1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CriticalHunter/Youtube_Scraper/dca7c71823cb49e014a8cdf08dd8e391ddfbd57f/Assets/example_1.1.jpg -------------------------------------------------------------------------------- /Assets/title.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/CriticalHunter/Youtube_Scraper/dca7c71823cb49e014a8cdf08dd8e391ddfbd57f/Assets/title.jpg -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 T. Sambit Suranjan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | There was an inactivity of more than 2 years on this project. 2 | I am achieving this due to multiple reasons. 3 | Yet I will create a separate private repo for this and will work on adding new metrics and visualizations although at a slow pace. 
4 | If it ends nice will merge that with this repo. 5 | 6 | # Youtube_scrape 7 | 8 | 9 |

10 | 11 | MIT license 14 | Twitter 15 | 16 | Scrape data about an entire Channel or just a Playlist, using Youtube API. No OAuth is required. 17 | 18 | ## :heavy_check_mark: Features 19 | 20 | Following features are available : 21 | 22 | ![CLI](/Assets/example_0.0.jpg) 23 | 24 | 1. **create_new** : 25 | 1. It creates a sqlite database to store all data. 26 | 2. Database will be placed in the same folder as the project file, named 'youtube.db' 27 | 3. It will have 4 tables - tb_channel, tb_playlist, tb_videos, video_history 28 | 4. You can use programs like [DB Browser](https://sqlitebrowser.org) , which is lightweight, to view the database. 29 | 2. **Oldest Video on A Topic** : 30 | 1. It is an isolate program, that can be run independently. 31 | 2. It doesn't depend on main code or any database. 32 | 3. **Scrape A Channel**: 33 | 1. Allows to scrape Channel Details and it's playlists. 34 | 2. It can also scrape details for each video of that channel. 35 | 1. If this option is not chosen, the playlist table won't have Playlist Duration. 36 | 4. **Scrape A Single Playlist**: 37 | 1. Allows to scrape info about a single Playlist and details about all it's videos. 38 | 5. **Load Your History**: 39 | 1. Make sure you have downloaded google Takeouts for your account to the PWD. 40 | 2. Make sure you have follwing path './takeout/history/watch-history.html' 41 | 3. Option to keep videos of your history on a separate table or integrate them with main table tb_videos 42 | 1. In order to use next features, you have to integrate them. 43 | 6. **Most Watched Video**: 44 | 1. You can list your most watched 'n' videos 45 | 7. **Early Viewed**: 46 | 1. You can list 'n' videos, which you saw earliest after they were uploaded. 47 | 2. There are some discrepencies, as many videos are reuploaded after you have seen it. 48 | 1. Program ignores those 49 | 3. It now only works when you watched it in IST. 50 | 8. **Generate Download List**: 51 | 1. 
This will create a text file, that will list Youtube URLs that can be downloaded by Youtube-DL or IDM etc. 52 | 2. It will select videos which are marked 'Worth = 1' i the database. 53 | 1. This operation is to be done by the user directly on the database (using DB Browser or such) 54 | 3. There is option to list videos of a single Channel or from entire DAtabase. 55 | 4. *Caution* : Once a video is processed by this function, it will be marked 'Is_Downloaded = 1'. Next time this function is run, new video IDs will be considered. 56 | 1. Hence User must make sure, all videos in *download_list.txt* are downloaded before rewriting the file. 57 | 58 | ## :computer: Setup Guide 59 | 60 | Below is a detailed guide on setting up the environment. 61 | 62 | ### Youtube API 63 | 64 | First you need to have you Youtube API key. Below is a link of a video, that will guide you. **Watch from 0:00 - 5:30** 65 | [![Getting Youtube API Key](https://img.youtube.com/vi/th5_9woFJmk/0.jpg)](https://www.youtube.com/watch?v=th5_9woFJmk) 66 | 67 | 1. **Note - Youtube API is rate limited to 10000 hits/day.** 68 | 2. You can view your quotas at [here - console](https://console.cloud.google.com/iam-admin/quotas) 69 | 3. Cost of operations is decribed [here -Youtube API docs](https://developers.google.com/youtube/v3/docs) 70 | 4. Code has been optimized to decrease quota usage. You can easily work with 50000 videos/day. For more please check your quota limit. 71 | 72 | ### Installation 73 | 74 | You need to install google-api-python-client to run this project. [github API link](https://github.com/googleapis/google-api-python-client) 75 | Install this library in a [virtualenv](https://virtualenv.pypa.io/en/latest/) using pip. 76 | 77 | #### Mac/Linux 78 | 79 | ``` 80 | pip3 install virtualenv 81 | virtualenv venv 82 | . 
venv/bin/activate 83 | pip3 install -r requirements.txt 84 | ``` 85 | 86 | #### Windows 87 | 88 | ``` 89 | pip3 install virtualenv 90 | virtualenv venv 91 | venv\Scripts\activate 92 | pip3 install -r requirements.txt 93 | ``` 94 | 95 | ## Working Guide 96 | 97 | 1. Get Your Youtube API key as shown in above video. 98 | 2. Pip install the requirements.txt 99 | 3. Run the program YT_Scrape.py 100 | 101 | The script will ask for required data in the command line and is pretty self-explanatory (Once it runs) 102 | 103 | [View Samples](/Samples.md) 104 | 105 | ## :hearts: Contributing 106 | 107 | There are several ways to help. 108 | 109 | 1. **Spread the word:** More users means more possible people testing and contributing to the app which in turn means better stability and possibly more and better features. You can [![Twitter](https://img.shields.io/twitter/url?style=social&url=https%3A%2F%2Fgithub.com%2FCriticalHunter%2FYoutube_Scraper.git)](https://twitter.com/intent/tweet?text=Wow:&url=https%3A%2F%2Fgithub.com%2FCriticalHunter%2FYoutube_stats.git) or share it on [LinkedIn](http://www.linkedin.com/shareArticle?mini=true&url=https://github.com/CriticalHunter/Youtube_Scraper.git). Every little bit helps ! 110 | 2. **[Make a feature or improvement request](https://github.com/CriticalHunter/Youtube_Scraper/issues/new)**: Something can be be done better? Something essential missing? Let us know! 111 | 3. **[Report bugs](https://github.com/CriticalHunter/Youtube_Scraper/issues/new)** 112 | 4. **Contribute**: You don't have to be programmer to help. 113 | 114 | 1. **Treat Me A Coffee Instead** [Paypal](https://paypal.me/CriticalHunter23) 115 | 116 | ### Pull Requests 117 | 118 | **Pull requests** are of course very welcome! Please make sure to also include the issue number in your commit message, if you're fixing a particular issue (e.g.: `feat: add nice feature with the number #31`). 
119 | -------------------------------------------------------------------------------- /Samples.md: -------------------------------------------------------------------------------- 1 | 2 | ### Getting Channel ID 3 | 4 | ![example_1.1](/Assets/example_1.1.jpg) 5 | 6 | 7 | 8 | ### Getting Playlist ID 9 | 10 | ![example2](/Assets/example20.jpg) 11 | 12 | ## Database (in DB Browser) sample results 13 | ### Database Schema 14 | ![example_0.1](/Assets/example_0.1.jpg) 15 | ### tb_channels Table 16 | ![example_0.2](/Assets/example_0.2.jpg) 17 | ### tb_playlists Table 18 | ![example_0.3](/Assets/example_0.3.jpg) 19 | ### tb_videos Table 20 | ![example_0.4](/Assets/example_0.4.jpg) 21 | 22 | ![example_0.5](/Assets/example_0.5.jpg) 23 | 24 | ### video_history Table 25 | ![example_0.6](/Assets/example_0.6.jpg) -------------------------------------------------------------------------------- /YT_Scrape.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, unicode_literals 2 | import re, six, os, sys, json 3 | 4 | 5 | from pyfiglet import Figlet, figlet_format 6 | from pprint import pprint 7 | from PyInquirer import style_from_dict, Token, prompt, Validator, ValidationError 8 | from termcolor import colored 9 | import argparse 10 | 11 | 12 | from src.create_new import dbase 13 | from src.get_api_key import api_key 14 | from src.get_channel_details import get_channel_details 15 | from src.entire_channel import entire_channel 16 | from src.get_playlist_videos import get_playlist_videos 17 | from src.load_history import load_history 18 | from src.most_watched import most_watched 19 | from src.early_views import early_views 20 | from src.download_these import download_n 21 | 22 | from src.downloading import * 23 | 24 | def log1(string, color, figlet=False): 25 | if colored: 26 | if not figlet: 27 | six.print_(colored(string, color)) 28 | else: 29 | six.print_(colored(figlet_format( 30 | string, font='doom'), 
color)) 31 | else: 32 | six.print_(string) 33 | 34 | log1("Youtube_Scraper", color="blue", figlet=True) 35 | log1("Welcome to Youtube_Scraper", "green") 36 | 37 | style = style_from_dict({ 38 | Token.QuestionMark: '#E91E63 bold', 39 | Token.Selected: '#673AB7 bold', 40 | Token.Instruction: '', # default 41 | Token.Answer: '#2196f3 bold', 42 | Token.Question: '', 43 | }) 44 | 45 | 46 | 47 | class NumberValidator(Validator): 48 | def validate(self, document): 49 | try: 50 | int(document.text) 51 | except ValueError: 52 | raise ValidationError( 53 | message='Please enter a number', 54 | cursor_position=len(document.text)) # Move cursor to end 55 | 56 | 57 | print('Please Choose the desired Options') 58 | print('Press "ctrl+C" to escape at any point\n') 59 | 60 | 61 | dbase() 62 | 63 | if not os.path.exists("key.txt"): 64 | questions = [ 65 | { 66 | 'type': 'input', 67 | 'name': 'API', 68 | 'message': '"key.txt" file not found. Please enter your Youtube API key ' 69 | },] 70 | answers = prompt(questions, style=style) 71 | with open ('key.txt','w') as f: 72 | f.write(answers['API']) 73 | youtube_instance = api_key() 74 | youtube_instance.get_api_key() 75 | youtube = youtube_instance.get_youtube() 76 | if youtube == None: 77 | sys.exit() 78 | 79 | try: 80 | questions = [ 81 | { 82 | 'type': 'list', 83 | 'name': 'operation', 84 | 'message': 'What do you want to do?', 85 | 'choices': ['Find oldest videos on a topic', 'Scrape a Channel','Scrape a Single Playlist' ,'Load Your History','Most Watched Video','Early Viewed Video','Generate Download List','Download Videos using YoutubeDL'], 86 | 'filter': lambda val: val.lower() 87 | }, 88 | { 89 | 'type': 'list', 90 | 'name': 'Channel', 91 | 'message': 'Select Further \n Scraping all videos for a big channel will surpass your free API Limit', 92 | 'choices': ['Scrape Everything for a channel', 'Just Channel Stats (Individual video stats are not scraped)'], 93 | 'when': lambda answers: answers['operation'] == 'scrape a channel' 
94 | }, 95 | { 96 | 'type': 'input', 97 | 'name': 'channelID', 98 | 'message': 'Enter the Channel ID (leave it blank to pick channels from Channels.txt)', 99 | 'when': lambda answers: answers['operation'] == 'scrape a channel' and answers['Channel'] != '' 100 | }, 101 | { 102 | 'type': 'input', 103 | 'name': 'playlistID', 104 | 'message': 'Enter the Playlist ID', 105 | 'when': lambda answers: answers['operation'] == 'scrape a single playlist' 106 | }, 107 | { 108 | 'type': 'list', 109 | 'name': 'Download', 110 | 'message': 'What should the list contain?', 111 | 'choices': ['Videos from a single Channel', 'Videos from entire database'], 112 | 'when': lambda answers: answers['operation'] == 'generate download list' 113 | }, 114 | { 115 | 'type': 'confirm', 116 | 'name': 'import', 117 | 'message': 'Do you want to import your video_history into main table(tb_videos) too? ', 118 | 'default': False, 119 | 'when': lambda answers: answers['operation'] == 'load your history' 120 | }, 121 | { 122 | 'type': 'list', 123 | 'name': 'Quality', 124 | 'message': 'What Quality you want to download? 
(Make sure videos are listed in "download.txt" file)', 125 | 'choices': ['4k/Best Available','1080p','720p','360p'], 126 | 'when': lambda answers: answers['operation'] == 'download videos using youtubedl' 127 | }, 128 | ] 129 | 130 | answers = prompt(questions, style=style) 131 | 132 | 133 | if answers['operation'] == 'find oldest videos on a topic': 134 | os.system("python .\src\oldest_videos.py -h") 135 | 136 | elif answers['operation'] == 'scrape a channel': 137 | if answers['channelID'] == '': 138 | with open("Channels.txt") as f: 139 | for line in f: 140 | new_Ch_ID = line[0]+'C'+line[2:] 141 | new_Ch_ID = new_Ch_ID.strip() 142 | print(new_Ch_ID) 143 | if answers['Channel'] == 'Just Channel Stats (Individual video stats are not scraped)': 144 | get_channel_details(youtube,new_Ch_ID) 145 | elif answers['Channel'] == 'Scrape Everything for a channel': 146 | entire_channel(youtube,new_Ch_ID) 147 | 148 | else: 149 | Ch_ID = answers['channelID'] 150 | new_Ch_ID = Ch_ID[0]+'C'+Ch_ID[2:] 151 | if answers['Channel'] == 'Just Channel Stats (Individual video stats are not scraped)': 152 | get_channel_details(youtube,new_Ch_ID) 153 | elif answers['Channel'] == 'Scrape Everything for a channel': 154 | entire_channel(youtube,new_Ch_ID) 155 | 156 | elif answers['operation'] == 'scrape a single playlist': 157 | get_playlist_videos(youtube,answers['playlistID']) 158 | 159 | elif answers['operation'] == 'load your history': 160 | if answers['import'] == True: 161 | res = 'y' 162 | elif answers['import'] == False: 163 | res = 'n' 164 | print("Please Wait ...") 165 | load_history(res) 166 | 167 | elif answers['operation'] == 'most watched video': 168 | print("If your watch history is not loaded in database, it will give empty result") 169 | print("Please enter, How many items to retrieve e.g. 
10 for Top 10 \n") 170 | n = int(input()) 171 | most_watched(n) 172 | 173 | elif answers['operation'] == 'early viewed video': 174 | print("If your watch history is not loaded in database, it will give empty result") 175 | print("Please enter, How many items to retrieve e.g. 10 for Top 10 \n") 176 | n = int(input()) 177 | early_views(n) 178 | 179 | elif answers['operation'] == 'generate download list': 180 | if answers['Download'] == 'Videos from a single Channel': 181 | print("It will list videos that are marked 'Is-Good' and is present in your database") 182 | chc = input("Please enter the channel ID \t") 183 | print("Please enter, How many items the list will contain \n") 184 | n = int(input()) 185 | download_n(chc,n) 186 | elif answers['Download'] == 'Videos from entire database': 187 | print("It will list videos that are marked 'Is-Good' and is present in your database") 188 | chc = '' 189 | print("Please enter, How many items the list will contain \n") 190 | n = int(input()) 191 | download_n(chc,n) 192 | elif answers['operation'] == 'download videos using youtubedl': 193 | print("\nIt will download all the videos that are listed in download.txt") 194 | print("Do you want to replace file names (_ in place of space) and convert thumbnail images (from WEBP to JPEG) ?\n") 195 | chc2 = input("Please enter Y/N \t") 196 | if chc2 == 'Y' or chc2 == 'Yes': 197 | if answers['Quality'] == '4k/Best Available': 198 | download_files('4k') 199 | elif answers['Quality'] == '1080p': 200 | download_files(1080) 201 | elif answers['Quality'] == '720p': 202 | download_files(720) 203 | elif answers['Quality'] == '360p': 204 | download_files(360) 205 | replace2('D:\Youtube') 206 | convertWebp2jpgInDirectory('D:\Youtube') 207 | else: 208 | if answers['Quality'] == '4k/Best Available': 209 | download_files('4k') 210 | elif answers['Quality'] == '1080p': 211 | download_files(1080) 212 | elif answers['Quality'] == '720p': 213 | download_files(720) 214 | elif answers['Quality'] == 
import pprint
from bs4 import BeautifulSoup
import re
import sqlite3
import time  # kept for compatibility; no longer shadowed below
import datetime, pytz

# One-off importer: parse a locally saved copy of Google Takeout's
# watch-history page ("test.html"), write "(date) (HH:MM:00 AM/PM) (video id)"
# lines for every "Watched ..." entry into watched.txt, then load those rows
# into the video_history table of youtube.db.

with open("test.html", encoding='utf-8') as fp:  # local copy of the Takeout page
    soup = BeautifulSoup(fp, features='lxml')

# Keep a prettified copy of the HTML for manual inspection/debugging.
soup_str = soup.prettify()
with open('temp.html', 'w', encoding='utf-8') as wr:
    wr.write(soup_str)

# Each Takeout history entry lives in a <c-wiz class="xDtZAf"> element.
tags = soup.find_all("c-wiz", {"class": "xDtZAf"})
# IMPROVEMENT: open watched.txt once instead of reopening it per entry.
with open("watched.txt", 'a', encoding='utf-8') as f:
    for tag in tags:
        date = tag.get('data-date')
        foo = tag.find("div", {"class": "QTGV3c"})
        temp = foo.get_text()
        watched = temp.split(' ')[0]
        if watched == 'Watched':
            try:
                bar = foo.a.get('href')
                vid_id = bar[-11:]  # YouTube video IDs are the last 11 chars of the URL
                # BUG FIX: the original assigned this to `time`, shadowing the
                # imported `time` module; use distinct local names instead.
                time_div = tag.find("div", {"class": "H3Q9vf XTnvW"})
                time_text = time_div.get_text()
                f.write(date)
                f.write(' ')
                # BUG FIX: raw string for the regex; the original non-raw '\d'
                # only worked because '\d' happens to survive as a literal.
                tm = re.findall(r'\d+:\d+ .M', time_text)[0]
                tm1 = tm.split(':')[0]
                tm2 = tm.split(':')[1]
                tm21 = tm2.split(' ')[0]
                tm22 = tm2.split(' ')[1]
                # Zero-pad the hour and force seconds to :00 -> "HH:MM:00 AM/PM".
                if len(tm1) == 1:
                    tm = '0' + tm1 + ':' + tm21 + ':' + '00' + ' ' + tm22
                else:
                    tm = tm1 + ':' + tm21 + ':' + '00' + ' ' + tm22
                f.write(tm)
                f.write(' ')
                f.write(vid_id)
                f.write('\n')
            except (AttributeError, IndexError):
                # BUG FIX: narrowed from a bare `except:`.  AttributeError:
                # the entry has no <a> link; IndexError: no "HH:MM AM/PM"
                # timestamp matched.  Such entries are skipped on purpose.
                pass

with open("watched.txt", 'r', encoding='utf-8') as fhand:
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()
    for line in fhand:
        # Line layout: "<date> <HH:MM:00 AM/PM> <11-char video id>\n";
        # everything but the trailing 12 chars is the timestamp.
        stamp = line[0:-12]
        p = '%Y%m%d %I:%M:%S %p '
        epoch = datetime.datetime.strptime(stamp, p)
        # dtobj3=dtobj1.replace(tzinfo=pytz.UTC) #replace method
        # dtobj_kolkata=dtobj3.astimezone(pytz.timezone("Asia/Kolkata"))
        # epoch = dtobj_kolkata.timestamp()
        new_format = epoch.strftime('%b %d, %Y, %I:%M:%S %p')
        vid_id = line[-12:-1]
        # NOTE(review): the current video_history schema in src/create_new.py
        # has 6 columns; this legacy 4-value INSERT matches the pre-migration
        # schema only -- confirm before running against a migrated database.
        cur.execute("INSERT OR IGNORE INTO video_history VALUES (?,?,?,?)", (vid_id, new_format, epoch.timestamp(), 0))
    conn.commit()
    conn.close()
import sqlite3
import os


def create_new():
    """Create youtube.db in the working directory with the current schema.

    Tables created: tb_channels, tb_playlists, tb_videos, video_history and
    yt_downloaded.  Every statement uses CREATE TABLE IF NOT EXISTS, so the
    call is idempotent.
    """
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()

    cur.execute("""CREATE TABLE IF NOT EXISTS tb_channels (
        Channel_ID TEXT PRIMARY KEY,
        Channel_title TEXT,
        Published_At TEXT NOT NULL,
        Country TEXT,
        View_Count INTEGER,
        Subscriber_Count INTEGER,
        Video_Count INTEGER,
        Playlist_Count INTEGER,
        Channel_Duration INTEGER,
        Duration_in_Text TEXT,
        Is_Deleted INTEGER,
        Deleted_Videos INTEGER,
        Downloaded_Videos INTEGER,
        Folder_Size_GB REAL,
        Channel_last_Scraped TEXT,
        Auto_Update INTEGER,
        Description TEXT
        )
    """)

    cur.execute("""CREATE TABLE IF NOT EXISTS tb_playlists(
        Playlist_ID TEXT PRIMARY KEY,
        Playlist_title TEXT,
        Channel_ID TEXT NOT NULL,
        Channel_Title TEXT NOT NULL,
        Published_At TEXT NOT NULL,
        Current_Video_Count INTEGER,
        Playlist_Seconds INTEGER,
        Playlist_Duration TEXT,
        Is_Seen INTEGER,
        Worth INTEGER,
        Is_Removed INTEGER,
        Deleted_Videos INTEGER,
        Downloaded_Videos INTEGER,
        Folder_Size_GB REAL,
        Playlist_last_Scraped TEXT,
        Auto_Update INTEGER
        )
    """)

    cur.execute("""CREATE TABLE IF NOT EXISTS tb_videos (
        Video_ID TEXT PRIMARY KEY,
        Video_title TEXT,
        Is_Seen INTEGER,
        Worth INTEGER,
        Upload_playlistId TEXT,
        Playlist_ID TEXT,
        Published_At TEXT NOT NULL,
        epoch REAL NOT NULL,
        Channel_ID TEXT NOT NULL,
        Channel_Title TEXT NOT NULL,
        View_Count INTEGER,
        Like_Count INTEGER,
        Dislike_Count INTEGER,
        Upvote_Ratio REAL,
        Comment_Count INTEGER,
        Duration TEXT,
        video_seconds INTEGER,
        Is_Licensed INTEGER,
        Is_Deleted INTEGER,
        Is_Downloaded INTEGER
        )
    """)

    # A video can be watched many times, hence the composite primary key.
    cur.execute("""CREATE TABLE IF NOT EXISTS video_history (
        Video_ID TEXT NOT NULL,
        Title TEXT,
        Watched_at TEXT,
        epoch REAL NOT NULL,
        Is_in_Main INTEGER,
        Is_Deleted INTEGER,
        PRIMARY KEY ( Video_ID, epoch)
        )
    """)

    # `bitrate` has no declared type: SQLite allows a bare column name.
    cur.execute("""CREATE TABLE IF NOT EXISTS yt_downloaded (
        Video_ID TEXT PRIMARY KEY,
        Resolution TEXT,
        Raw_Size INTEGER,
        Size REAL,
        vid_type TEXT,
        FPS TEXT,
        bitrate,
        Audio_Type TEXT,
        Frequency INTEGER,
        Channels TEXT,
        Is_In_Main INTEGER
        )
    """)

    conn.commit()  # Push the data into database
    conn.close()


def migrate():
    """Upgrade a pre-release youtube.db in place to the current schema.

    Strategy per table: rename the old table aside, recreate it with the
    original (pre-release) column set, copy the rows back, then ALTER in the
    columns that were added after the initial release.
    """
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()

    cur.execute("PRAGMA foreign_keys=off")
    cur.execute("BEGIN TRANSACTION")

    # --- tb_channels -------------------------------------------------------
    cur.execute("ALTER TABLE tb_channels RENAME TO _tb_channels_old")
    cur.execute("""
        CREATE TABLE IF NOT EXISTS tb_channels (
        Channel_ID TEXT PRIMARY KEY,
        Channel_title TEXT,
        Published_At TEXT NOT NULL,
        Country TEXT,
        View_Count INTEGER,
        Subscriber_Count INTEGER,
        Video_Count INTEGER,
        Playlist_Count INTEGER
        )
    """)
    cur.execute("INSERT INTO tb_channels SELECT * FROM _tb_channels_old")
    try:
        # These columns were added after the initial release of this code;
        # adding one that already exists raises OperationalError, which
        # deliberately aborts the rest of the batch.
        cur.execute("ALTER TABLE tb_channels ADD COLUMN Channel_Duration INTEGER")
        cur.execute("ALTER TABLE tb_channels ADD COLUMN Duration_in_Text TEXT")
        cur.execute("ALTER TABLE tb_channels ADD COLUMN Is_Deleted INTEGER")
        cur.execute("ALTER TABLE tb_channels ADD COLUMN Deleted_Videos INTEGER")
        cur.execute("ALTER TABLE tb_channels ADD COLUMN Downloaded_Videos INTEGER")
        cur.execute("ALTER TABLE tb_channels ADD COLUMN Folder_Size_GB REAL")
        cur.execute("ALTER TABLE tb_channels ADD COLUMN Channel_last_Scraped TEXT")
        cur.execute("ALTER TABLE tb_channels ADD COLUMN Auto_Update INTEGER")
        cur.execute("ALTER TABLE tb_channels ADD COLUMN Description TEXT")
    except sqlite3.OperationalError:
        # BUG FIX: narrowed from a bare `except:`.
        pass
    cur.execute("DROP TABLE _tb_channels_old")

    # --- tb_playlists ------------------------------------------------------
    cur.execute("ALTER TABLE tb_playlists RENAME TO _tb_playlists_old")
    cur.execute("""CREATE TABLE IF NOT EXISTS tb_playlists(
        Playlist_ID TEXT PRIMARY KEY,
        Playlist_title TEXT,
        Channel_ID TEXT NOT NULL,
        Channel_Title TEXT NOT NULL,
        Published_At TEXT NOT NULL,
        Item_Count INTEGER,
        Playlist_Seconds INTEGER,
        Playlist_Duration TEXT,
        Is_Seen INTEGER,
        Worth INTEGER
        )
    """)
    cur.execute("INSERT INTO tb_playlists SELECT * FROM _tb_playlists_old")
    try:
        # Post-release additions, same abort-on-existing behavior as above.
        cur.execute("ALTER TABLE tb_playlists ADD COLUMN Is_Removed INTEGER")
        cur.execute("ALTER TABLE tb_playlists ADD COLUMN Deleted_Videos INTEGER")
        cur.execute("ALTER TABLE tb_playlists ADD COLUMN Downloaded_Videos INTEGER")
        cur.execute("ALTER TABLE tb_playlists ADD COLUMN Folder_Size_GB REAL")
        cur.execute("ALTER TABLE tb_playlists ADD COLUMN Playlist_last_Scraped TEXT")
        cur.execute("ALTER TABLE tb_playlists ADD COLUMN Auto_Update INTEGER")
        cur.execute("ALTER TABLE tb_playlists RENAME COLUMN Item_Count TO Current_Video_Count")
    except sqlite3.OperationalError:
        # BUG FIX: narrowed from a bare `except:`.
        pass
    cur.execute("DROP TABLE _tb_playlists_old")

    # --- tb_videos: rebuilt with the same column set (no additions) --------
    cur.execute("ALTER TABLE tb_videos RENAME TO _tb_videos_old")
    cur.execute("""CREATE TABLE IF NOT EXISTS tb_videos (
        Video_ID TEXT PRIMARY KEY,
        Video_title TEXT,
        Is_Seen INTEGER,
        Worth INTEGER,
        Upload_playlistId TEXT,
        Playlist_ID TEXT,
        Published_At TEXT NOT NULL,
        epoch REAL NOT NULL,
        Channel_ID TEXT NOT NULL,
        Channel_Title TEXT NOT NULL,
        View_Count INTEGER,
        Like_Count INTEGER,
        Dislike_Count INTEGER,
        Upvote_Ratio REAL,
        Comment_Count INTEGER,
        Duration TEXT,
        video_seconds INTEGER,
        Is_Licensed INTEGER,
        Is_Deleted INTEGER,
        Is_Downloaded INTEGER
        )
    """)
    cur.execute("INSERT INTO tb_videos SELECT * FROM _tb_videos_old")
    cur.execute("DROP TABLE _tb_videos_old")

    # --- yt_downloaded replaces the old tb_downloaded table ----------------
    cur.execute("""CREATE TABLE IF NOT EXISTS yt_downloaded (
        Video_ID TEXT PRIMARY KEY,
        Resolution TEXT,
        Raw_Size INTEGER,
        Size REAL,
        vid_type TEXT,
        FPS TEXT,
        bitrate,
        Audio_Type TEXT,
        Frequency INTEGER,
        Channels TEXT,
        Is_In_Main INTEGER
        )
    """)
    try:
        cur.execute("DROP TABLE tb_downloaded")
    except sqlite3.OperationalError:
        # Nothing to drop on databases that never had the old table.
        pass
    cur.execute("PRAGMA foreign_keys=on")
    conn.commit()  # Push the data into database
    conn.close()


def dbase():
    """Ensure youtube.db exists and is on the current schema.

    Creates a fresh database when the file is missing; otherwise probes for a
    post-release column and runs migrate() when it is absent.
    """
    if not os.path.exists("youtube.db"):
        create_new()
    else:
        conn = sqlite3.connect('youtube.db')
        cur = conn.cursor()
        try:
            # Deleted_Videos only exists on the current schema.
            cur.execute("SELECT Deleted_Videos FROM tb_channels")
        except sqlite3.OperationalError:
            migrate()
        finally:
            conn.close()  # BUG FIX: the original leaked this connection


if __name__ == "__main__":
    dbase()
import sqlite3


def download_n(chc='', n=50):
    """Write up to *n* YouTube watch URLs into download.txt (overwriting it).

    Selects videos marked Worth = 1 and Is_Downloaded = 0 from tb_videos in
    youtube.db; *chc* restricts the selection to one Channel_ID, '' selects
    from the entire database.  Side effect: every listed video is marked
    Is_Downloaded = 1, so the caller must download the current file before
    regenerating it.
    """
    with open("download.txt", 'w', encoding='utf-8') as fp:
        conn = sqlite3.connect('youtube.db')
        cur = conn.cursor()
        if chc == '':
            cur.execute("SELECT Video_ID FROM tb_videos WHERE Worth = 1 and Is_Downloaded = 0 LIMIT ?", (n,))
        else:
            # BUG FIX: the original wrapped this in try/except to report a bad
            # channel ID, but sqlite does not raise for an unknown Channel_ID
            # (it just returns no rows) -- and had the except fired, the
            # fetchall below would have crashed on an unexecuted cursor.
            cur.execute("SELECT Video_ID FROM tb_videos WHERE Worth = 1 and Is_Downloaded = 0 and Channel_ID = ? LIMIT ?", (chc, n))
        down_list = cur.fetchall()
        for (vid,) in down_list:
            link = "https://www.youtube.com/watch?v=" + vid
            cur.execute("UPDATE tb_videos SET Is_Downloaded = 1 WHERE Video_ID = ?", (vid,))
            fp.write(link)
            fp.write('\n')
        conn.commit()  # Push the data into database
        conn.close()


if __name__ == "__main__":
    pass
'youtube-dl --add-metadata --write-info-json --write-thumbnail --force-ipv4 \ 18 | --sleep-interval 3 --max-sleep-interval 6 --ignore-errors --no-continue --no-overwrites \ 19 | --download-archive archive.log -f "bestvideo[height<=720]+(bestaudio[acodec^=opus]/bestaudio)/bestvideo[height<=360]" \ 20 | --merge-output-format "mkv" -o "D:/Youtube/%(uploader)s/%(upload_date)s_%(title)s %(id)s.%(ext)s" -a download.txt' 21 | commandline360 = 'youtube-dl --add-metadata --write-info-json --write-thumbnail --force-ipv4 \ 22 | --sleep-interval 3 --max-sleep-interval 6 --ignore-errors --no-continue --no-overwrites \ 23 | --download-archive archive.log -f "bestvideo[height<=360]+(bestaudio[acodec^=opus]/bestaudio)/bestvideo[height<=360]" \ 24 | --merge-output-format "mkv" -o "D:/Youtube/%(uploader)s/%(upload_date)s_%(title)s %(id)s.%(ext)s" -a download.txt' 25 | if ch == 720: 26 | commandline = commandline720 27 | elif ch == '4k': 28 | commandline = commandline4k 29 | elif ch == 1080: 30 | commandline = commandline1080 31 | elif ch == 360: 32 | commandline = commandline360 33 | os.system(commandline) 34 | 35 | def replace2(parent): 36 | for path, folders, files in os.walk(parent): 37 | for f in files: 38 | os.rename(os.path.join(path, f), os.path.join(path, f.replace(' ', '__'))) 39 | for i in range(len(folders)): 40 | new_name = folders[i].replace(' ', '_').replace('.', '_').replace("'", '') 41 | try: 42 | os.rename(os.path.join(path, folders[i]), os.path.join(path, new_name)) 43 | except FileExistsError: 44 | pass 45 | folders[i] = new_name 46 | # Traverse the specified directory, display all file names under the directory 47 | def convertWebp2jpgInDirectory(dir): 48 | if os.path.isdir(dir): 49 | allfiles = os.listdir(dir) 50 | for fi in allfiles: 51 | fi_d = os.path.join(dir, fi) 52 | if os.path.isdir(fi_d): 53 | convertWebp2jpgInDirectory(fi_d) 54 | else: 55 | if fi_d.endswith(".webp"): 56 | webp = os.path.join(dir, fi_d) 57 | webp = '"'+webp+'"' 58 | filename = 
def _orphan_webp_tail(webp):
    """Tail of downloading.py's convertWebp2jpgInDirectory().

    NOTE(review): this chunk starts in the middle of that function, so its
    remaining statements are preserved here (wrapped so the file parses)
    rather than re-indented into a scope we cannot see.
    """
    filename = webp.split("\\")[-1]

    filedir = "\\".join(webp.split("\\")[:-1])

    newfilename = filename.replace(".webp", '.jpg')
    jpg = "%s\%s" % (filedir, newfilename)
    # jpg = '"'+jpg+'"'
    commandline = "dwebp %s -o %s" % (webp, jpg)

    os.system(commandline)
    print(webp + " ------> conversion succeeded")

    deleteline = "rm " + webp
    os.system(deleteline)

if __name__ == '__main__':
    pass
    # download_files()
    # convertWebp2jpgInDirectory("D:\Youtube")
    # replace('D:\Youtube')
# ----------------------------------------------------------------------------
# /src/early_views.py
# ----------------------------------------------------------------------------
import sqlite3

def early_views(n=5):
    """Print the *n* history entries that were watched soonest after upload.

    The watch-history epochs are local (IST = UTC+5:30, i.e. 19800 s ahead),
    while tb_videos.epoch is derived from the UTC publish time; the constant
    19800 below corrects the difference before converting to minutes.
    """
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()
    cur.execute("SELECT video_history.Video_ID, video_history.epoch -tb_videos.epoch As diff,video_history.epoch,tb_videos.epoch,tb_videos.Video_title,tb_videos.epoch, Watched_at FROM video_history \
        LEFT OUTER JOIN tb_videos on tb_videos.Video_id = video_history.Video_ID WHERE (diff-19800) > 0 GROUP BY video_history.Video_ID ORDER BY diff ASC ;")
    rows = cur.fetchmany(n)
    print("Video ID"," Diff in Min","\t","Published AT(UTC)"," Watched AT (IST)","\tVideo Title")
    print("-------------------------------------------------------------------------------------------------------")
    for row in rows:
        # Minutes between upload and first watch, rendered as a right-aligned
        # integer part glued to a ".xx" fractional part.
        minutes = (int(row[1]) - 19800) / 60
        whole = "{:6d}".format(int(minutes // 1))
        frac = "{:.2f}".format(minutes % 1).replace('0.', '.')
        print(row[0], '\t', whole + frac, '\t', row[2], '\t', row[3], '\t', row[4])
    conn.commit()
    conn.close()

if __name__ == "__main__":
    pass
# ----------------------------------------------------------------------------
# /src/entire_channel.py:
# ----------------------------------------------------------------------------
# /src/entire_channel.py
# ----------------------------------------------------------------------------
from src.get_channel_playlists import get_channel_playlists
from src.get_channel_details import get_channel_details, get_channel_length
from src.get_playlist_videos import get_playlist_videos
from src.get_channel_videos import get_channel_videos

def entire_channel(youtube, ch_id):
    """Scrape a channel end to end.

    Order matters: channel details first, then every original/imported
    playlist, then uploads that belong to no playlist, and finally the
    aggregate duration / deleted-video counters.

    youtube: built YouTube Data API client.
    ch_id:   channel ID to scrape.
    """
    ec = True  # "entire channel" mode, forwarded to the helpers
    get_channel_details(youtube, ch_id, ec=ec)
    playlists_list = get_channel_playlists(youtube, ch_id)
    count = 0
    print('\nThere are ', len(playlists_list), ' original/imported playlists\n')
    for playlist in playlists_list:
        count += 1
        print('\nParsing playlist ', count, ' \\ ', len(playlists_list))
        try:
            get_playlist_videos(youtube, playlist, ec=ec, ch_id=ch_id)
        except Exception as e:
            # BUG FIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit and hid the failure reason.
            print("Error getting Playlist :", playlist)
            print("Reason :", e)
    get_channel_videos(youtube, ch_id)
    get_channel_length(ch_id)
# ----------------------------------------------------------------------------
# /src/get_api_key.py
# ----------------------------------------------------------------------------
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from httplib2 import ServerNotFoundError
from google.auth.exceptions import DefaultCredentialsError
'''
Get the API key and save it in a text file named, key.txt in parent folder.
The method to get a youtube API key is well illustrated in the Youtube Video in the README page.
'''
# ''' (closing quote of get_api_key.py's module docstring, opened in the previous chunk)
class api_key():
    """Holder that builds and hands out a YouTube Data API v3 client."""

    def __init__(self):
        # Populated by get_api_key(); stays None when key loading fails.
        self.youtube = None

    def get_api_key(self):
        """Read the key from key.txt and build the API client.

        On any failure the error is reported on stdout and self.youtube is
        left as None; callers check via get_youtube().
        """
        try:
            with open('key.txt') as key_file:
                # BUG FIX: strip surrounding whitespace — a trailing newline
                # in key.txt previously produced an invalid developerKey.
                key = key_file.read().strip()
            youtube = build('youtube', 'v3', developerKey=key)
            self.youtube = youtube

        except HttpError:
            print("\nAPI Key is wrong")
            print("Please recheck the API key or generate a new key.\nThen modify the 'key.txt' file with new Key\n")

        except ServerNotFoundError:
            print("\nUnable to connect to internet...")
            print("Please Check Your Internet Connection.\n")

        except DefaultCredentialsError:
            print("\n'key.txt' is Blank.")
            print("Please save your API key there and then continue.\n")

        except FileNotFoundError:
            print("\nNo such file: 'key.txt'")
            print("Please create a file named 'key.txt' and place your Youtube API key in it.\n")

        except Exception as e:
            print(e)
            print("Oops!", e.__class__, "occurred.")

    def get_youtube(self):
        """Return the built client, or None if get_api_key() has not succeeded."""
        return self.youtube

if __name__ == "__main__":
    youtube_instance = api_key()
    youtube_instance.get_api_key()
    youtube = youtube_instance.get_youtube()
    print(youtube)
# ----------------------------------------------------------------------------
# /src/get_channel_details.py
# ----------------------------------------------------------------------------
import sqlite3,datetime
from src.get_channel_playlists import get_channel_playlists
import sys

def get_channel_details(youtube,channel_id,single=False,playlistID='',ec=False):
    """Fetch channel snippet/statistics and upsert them into tb_channels.

    NOTE(review): this function continues in the next chunk; only the part
    visible here is documented/adjusted.
    """
    request = youtube.channels().list(part="snippet,statistics",
                                      id=channel_id
                                      ).execute()

    # print(request['items'][0])
    Channel_Id = channel_id
    # flag1: the API still returns this channel; flag2: channel already in DB.
    flag1 = True
    flag2 = True
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()
    try:
        Channel_title = request['items'][0]['snippet']['title']
    except (KeyError, IndexError):
        # BUG FIX: narrowed from a bare `except:` — only a deleted/unknown
        # channel (missing 'items') should flip this flag.
        flag1 = False
try: 22 | cur.execute("SELECT Channel_ID from tb_channels WHERE Channel_ID = ? ",(Channel_Id,)) 23 | temp = cur.fetchone() 24 | if temp is None: 25 | flag2 = False 26 | except: 27 | flag2 = False 28 | cur.execute("SELECT Is_Deleted from tb_channels WHERE Channel_ID = ? ",(Channel_Id,)) 29 | flag3 = cur.fetchone() 30 | if flag3 is None: 31 | pass 32 | else: 33 | flag3 = flag3[0] 34 | if flag1 == False and flag2 == False: 35 | print("Channel ID not valid") 36 | sys.exit() 37 | if flag1 == False and flag2 == True and flag3 == 1: 38 | print("Channel was already Deleted") 39 | conn.commit() # Push the data into database 40 | conn.close() 41 | sys.exit() 42 | if flag1 == False and flag2 == True and flag3 == 0: 43 | cur.execute("SELECT Channel_Id from tb_channels") 44 | cur.execute("UPDATE tb_channels SET Is_Deleted = ? WHERE Channel_ID = ? ",(1,Channel_Id)) 45 | cur.execute("UPDATE tb_channels SET Auto_Update = ? WHERE Channel_ID = ? ",(0,Channel_Id)) 46 | print("Channel is Deleted and now updated in Database") 47 | conn.commit() # Push the data into database 48 | conn.close() 49 | sys.exit() 50 | 51 | Description = request['items'][0]['snippet']['description'] 52 | Published_At = request['items'][0]['snippet']['publishedAt'] 53 | try: 54 | Country = request['items'][0]['snippet']['country'] 55 | except: 56 | Country = None 57 | View_Count = request['items'][0]['statistics']['viewCount'] 58 | try: 59 | Subscriber_Count = request['items'][0]['statistics']['subscriberCount'] 60 | except: 61 | Subscriber_Count = None 62 | Video_Count = request['items'][0]['statistics']['videoCount'] 63 | if ec == False: 64 | Channel_last_Scraped = 'Never' 65 | else: 66 | Channel_last_Scraped = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') 67 | 68 | cur.execute("SELECT Downloaded_Videos FROM tb_channels WHERE Channel_ID = ?" 
,(Channel_Id,)) 69 | temp = cur.fetchone() 70 | try: 71 | temp = temp[0] 72 | if temp > 0: 73 | Downloaded_Videos = temp 74 | else: 75 | Downloaded_Videos = 0 76 | except: 77 | Downloaded_Videos = 0 78 | cur.execute("SELECT Folder_Size_GB FROM tb_channels WHERE Channel_ID = ?" ,(Channel_Id,)) 79 | temp = cur.fetchone() 80 | try: 81 | temp = temp[0] 82 | if temp > 0: 83 | Folder_Size_GB = temp 84 | else: 85 | Folder_Size_GB = 0 86 | except: 87 | Folder_Size_GB = 0 88 | 89 | params = (Channel_Id,Channel_title,Published_At,Country,View_Count,Subscriber_Count,Video_Count,0,0,'First, Scrape Entire Channel',0,0,Downloaded_Videos,Folder_Size_GB) 90 | 91 | conn = sqlite3.connect('youtube.db') 92 | cur = conn.cursor() 93 | cur.execute("INSERT OR REPLACE INTO tb_channels VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?,?,?, ?,?, 'Never',1,'')", params) 94 | conn.commit() # Push the data into database 95 | conn.close() 96 | conn = sqlite3.connect('youtube.db') 97 | cur = conn.cursor() 98 | cur.execute("UPDATE tb_channels SET Channel_last_Scraped = ? WHERE Channel_ID = ? ",(Channel_last_Scraped,Channel_Id)) 99 | cur.execute("UPDATE tb_channels SET Description = ? WHERE Channel_ID = ? ",(Description,Channel_Id)) 100 | conn.commit() # Push the data into database 101 | conn.close() 102 | get_channel_playlists(youtube,Channel_Id,single,playlistID) 103 | 104 | def get_channel_length(Channel_Id): 105 | conn = sqlite3.connect('youtube.db') 106 | cur = conn.cursor() 107 | cur.execute("SELECT SUM(video_seconds) FROM tb_videos WHERE Channel_ID = ? ",(Channel_Id,)) 108 | tot = cur.fetchone() 109 | tot = tot[0] 110 | if tot is None: 111 | tot = 0 # For channels with 0 Original Videos (e.g. Hasan Minaj) 112 | Duration_in_Text = str(datetime.timedelta(seconds = tot)) 113 | cur.execute("UPDATE tb_channels SET Duration_in_Text = ? WHERE Channel_ID = ? ",(Duration_in_Text,Channel_Id)) 114 | cur.execute("UPDATE tb_channels SET Channel_Duration = ? WHERE Channel_ID = ? 
",(tot,Channel_Id)) 115 | cur.execute("SELECT COUNT(Video_ID) FROM tb_videos WHERE Is_Deleted = ? AND Channel_ID = ? ",(1,Channel_Id)) 116 | num = cur.fetchone() 117 | num=num[0] 118 | cur.execute("UPDATE tb_channels SET Deleted_Videos = ? WHERE Channel_ID = ? ",(num,Channel_Id)) 119 | conn.commit() 120 | conn.close() 121 | 122 | if __name__ == "__main__": 123 | pass -------------------------------------------------------------------------------- /src/get_channel_id.py: -------------------------------------------------------------------------------- 1 | from get_api_key import api_key 2 | import argparse 3 | import os 4 | 5 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,\ 6 | description='Get CHannel ID for CHannel User Name, Hopefully!!!') 7 | parser.add_argument("User", help='Enter Channel User Name') 8 | args = parser.parse_args() 9 | 10 | 11 | 12 | def get_channel_id(youtube,ch_name): 13 | request = youtube.channels().list( 14 | part="snippet,contentDetails,statistics", 15 | forUsername=ch_name 16 | ) 17 | response = request.execute() 18 | try: 19 | sub_count = int(response['items'][0]['statistics']['subscriberCount']) 20 | if sub_count > 1000000: 21 | sub_count = str(sub_count / 1000000) 22 | sub_count = sub_count + 'M Subscribers' 23 | elif sub_count > 1000: 24 | sub_count = str(sub_count / 1000) 25 | sub_count = sub_count + 'K Subscribers' 26 | else: 27 | sub_count = str(sub_count) + ' Subscribers' 28 | ch_id = response['items'][0]['id'] 29 | 30 | print(" ") 31 | print(sub_count) 32 | print(ch_id) 33 | return ch_id 34 | except KeyError: 35 | print(" ") 36 | print(" Error : Channel not Found ") 37 | print(" ") 38 | 39 | if __name__ == "__main__": 40 | youtube_instance = api_key() 41 | youtube_instance.get_api_key() 42 | youtube = youtube_instance.get_youtube() 43 | get_channel_id(youtube,args.User) -------------------------------------------------------------------------------- /src/get_channel_playlists.py: 
# ----------------------------------------------------------------------------
import sqlite3, time

def _stored(cur, column, playlist_id):
    """Return the previously stored value of *column* for this playlist, or None.

    *column* is always a literal from this module, never user input, so the
    string concatenation below is safe.
    """
    cur.execute("SELECT " + column + " FROM tb_playlists WHERE Playlist_ID = ?", (playlist_id,))
    row = cur.fetchone()
    return row[0] if row is not None else None

def _stored_count(cur, column, playlist_id):
    """Like _stored(), but coerced to a non-negative int (0 on any problem)."""
    try:
        value = int(_stored(cur, column, playlist_id))
    except (TypeError, ValueError):
        return 0
    return value if value > 0 else 0

def get_channel_playlists(youtube, channel_id, single=False, playlistID=''):
    """Scrape a channel's playlists into tb_playlists.

    youtube:    built YouTube Data API client.
    channel_id: channel whose playlists are fetched (50 per page).
    single:     when True, only the playlist matching *playlistID* is stored.
    Flags the user may have set earlier (Is_Seen, Worth) and the local
    download counters are preserved across the INSERT OR REPLACE.
    Returns the de-duplicated list of playlist IDs that were found.
    """
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()

    playlist_ids = []
    next_page_token = None

    while 1:
        res = youtube.playlists().list(part="snippet,contentDetails",
                                       channelId=channel_id,
                                       pageToken=next_page_token,
                                       maxResults=50
                                       ).execute()
        # BUG FIX: the original accumulated every page into one list and then
        # re-iterated the WHOLE list after each fetch, re-writing earlier
        # playlists once per page (quadratic work, redundant DB writes).
        # Processing each page exactly once yields the same final DB state.
        for playlist in res['items']:
            Playlist_ID = playlist['id']
            playlist_ids.append(Playlist_ID)
            if (single == True and playlist['id'] == playlistID) or single == False:
                Playlist_title = playlist['snippet']['title']
                Channel_Id = playlist['snippet']['channelId']
                Channel_Title = playlist['snippet']['channelTitle']
                Published_At = playlist['snippet']['publishedAt']
                Current_Video_Count = playlist['contentDetails']['itemCount']
                Playlist_Seconds = 0
                Playlist_Duration = '0'
                # 0 = not seen, 1 = seen — keep whatever was stored before.
                Is_Seen = 1 if _stored(cur, "Is_Seen", Playlist_ID) == 1 else 0
                Worth = 1 if _stored(cur, "Worth", Playlist_ID) == 1 else 0
                Downloaded_Videos = _stored_count(cur, "Downloaded_Videos", Playlist_ID)
                Folder_Size_GB = _stored_count(cur, "Folder_Size_GB", Playlist_ID)
                params = (Playlist_ID,Playlist_title,Channel_Id,Channel_Title,Published_At,Current_Video_Count,Playlist_Seconds,Playlist_Duration,Is_Seen,Worth,0,0,Downloaded_Videos,Folder_Size_GB)
                cur.execute("INSERT OR REPLACE INTO tb_playlists VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,?, 0, 1)", params)
                last_time = time.time()
                cur.execute("UPDATE tb_playlists SET Playlist_last_Scraped = ? WHERE Playlist_ID = ? ",(last_time,Playlist_ID))
        next_page_token = res.get('nextPageToken')
        if next_page_token is None:
            break

    # De-duplicate (a playlist can only legitimately appear once anyway).
    playlist_ids = list(set(playlist_ids))
    count = len(playlist_ids)
    cur.execute("UPDATE tb_channels SET Playlist_Count = ? WHERE Channel_ID = ? ",(count,channel_id))

    conn.commit()  # Push the data into database
    conn.close()

    return playlist_ids

if __name__ == "__main__":
    pass
# ----------------------------------------------------------------------------
# /src/get_channel_videos.py
# ----------------------------------------------------------------------------
import sqlite3

from src.get_video_stats import get_videos_stats

def get_channel_videos(youtube,channel_id):
    """Collect the channel's uploads that are in no playlist.

    NOTE(review): this function continues in the next chunk; the visible
    part is kept as-is apart from documentation.
    """
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()

    res = youtube.channels().list(id=channel_id,
                                  part='contentDetails').execute()

    # The special "uploads" playlist contains every public upload.
    playlist_id = res['items'][0]['contentDetails']['relatedPlaylists']['uploads']

    videos = []
    next_page_token = None
    new_video_ids = []

    try:
        while 1:
            res = youtube.playlistItems().list(playlistId=playlist_id,
                                               part='snippet',
                                               maxResults=50,
                                               pageToken=next_page_token).execute()
            videos += res['items']
            next_page_token = res.get('nextPageToken')

            video_ids = list(map(lambda x:x['snippet']['resourceId']['videoId'], videos))

            if next_page_token is None:
                break
    except:
        # NOTE(review): bare except kept verbatim — the function is only
        # partially visible here, so its error contract was not changed.
        print("Channel has no Original Videos")
        video_ids = []
    CVids = []
    # (statement completed across the chunk boundary)
    cur.execute("SELECT Video_ID FROM tb_videos WHERE Channel_ID=? AND Playlist_ID IS NOT NULL",(channel_id,))
AND Playlist_ID IS NOT NULL",(channel_id,)) 37 | temp = cur.fetchall() 38 | for item in temp: 39 | CVids.append(item[0]) 40 | CVids = set(CVids) 41 | video_ids = set(video_ids) 42 | diff = video_ids - CVids 43 | new_video_ids = list(diff) 44 | conn.commit() # Push the data into database 45 | conn.close() 46 | 47 | print('\nParsing ',len(new_video_ids),' videos, which are not in any playlist') 48 | get_videos_stats(youtube,new_video_ids,flag=1) 49 | 50 | if __name__ == "__main__": 51 | pass 52 | 53 | -------------------------------------------------------------------------------- /src/get_playlist_videos.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | from datetime import timedelta 3 | 4 | from src.get_video_stats import get_videos_stats 5 | from src.get_channel_details import get_channel_details 6 | 7 | def get_playlist_videos(youtube,playlistID,ec=False,ch_id=None): 8 | 9 | ch_ID = 'skip' 10 | conn = sqlite3.connect('youtube.db') 11 | cur = conn.cursor() 12 | 13 | videos = [] 14 | next_page_token = None 15 | video_IDS = [] 16 | while 1: 17 | res = youtube.playlistItems().list(part="snippet", 18 | maxResults=50, 19 | playlistId=playlistID, 20 | pageToken=next_page_token 21 | ).execute() 22 | videos += res['items'] 23 | next_page_token = res.get('nextPageToken') 24 | 25 | if next_page_token is None: 26 | break 27 | 28 | 29 | for video in videos: 30 | 31 | Video_id = video['snippet']['resourceId']['videoId']; video_IDS.append(Video_id) 32 | try: 33 | ch_ID = video['snippet']['channelId'] 34 | except: 35 | ch_ID = 'skip' 36 | if ec == True: 37 | params = (Video_id,"",0,0,ch_id,None,None,0,ch_id,'',0,0,0,0,0,'',0,0,1,0) 38 | cur.execute("INSERT OR IGNORE INTO tb_videos VALUES (?, ?, ?,? ,?, ?, ?, ?,?, ?,?,?,?,?,?,?,?,?,?,?)", params) 39 | else: 40 | params = (Video_id,"",0,0,"","","") 41 | cur.execute("INSERT OR IGNORE INTO tb_videos VALUES (?, ?, ?,? 
,?, ?, ?, 0,'', '',0,0,0,0,0,'',0,0,0,0)", params) 42 | 43 | 44 | conn.commit() # Push the data into database 45 | conn.close() 46 | 47 | if ch_ID == 'skip': 48 | conn = sqlite3.connect('youtube.db') 49 | cur = conn.cursor() 50 | cur.execute("SELECT Current_Video_Count FROM tb_playlists WHERE playlist_ID = ? ",(playlistID,)) 51 | num = cur.fetchone() 52 | num=num[0] 53 | print(num) 54 | if num == 0: 55 | cur.execute("UPDATE tb_playlists SET Is_Removed = ? WHERE playlist_ID = ? ",(1,playlistID)) 56 | conn.commit() # Push the data into database 57 | conn.close() 58 | return 0 59 | else: 60 | if ec == False: 61 | get_channel_details(youtube,ch_ID,True,playlistID) 62 | 63 | Playlist_Seconds,num_new = get_videos_stats(youtube,video_IDS,1,playlistID) 64 | print('Videos in this playlist =',num_new) 65 | Playlist_Duration = str(timedelta(seconds = Playlist_Seconds)) 66 | conn = sqlite3.connect('youtube.db') 67 | cur = conn.cursor() 68 | 69 | cur.execute("SELECT Current_Video_Count FROM tb_playlists WHERE playlist_ID = ? ",(playlistID,)) 70 | num = cur.fetchone() 71 | num=num[0] 72 | if num != num_new: 73 | cur.execute("UPDATE tb_playlists SET Current_Video_Count = ? WHERE playlist_ID = ? ",(num_new,playlistID)) 74 | 75 | cur.execute("UPDATE tb_playlists SET Playlist_Seconds = ? WHERE playlist_ID = ? ",(Playlist_Seconds,playlistID)) 76 | cur.execute("UPDATE tb_playlists SET Playlist_Duration = ? WHERE playlist_ID = ? ",(Playlist_Duration,playlistID)) 77 | cur.execute("SELECT COUNT(Video_ID) FROM tb_videos WHERE Is_Deleted = ? AND playlist_ID = ? ",(1,playlistID)) 78 | num = cur.fetchone() 79 | num=num[0] 80 | cur.execute("UPDATE tb_playlists SET Deleted_Videos = ? WHERE playlist_ID = ? 
",(num,playlistID)) 81 | conn.commit() # Push the data into database 82 | conn.close() 83 | 84 | if __name__ == "__main__": 85 | pass -------------------------------------------------------------------------------- /src/get_video_stats.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | import sqlite3, time 3 | from os import path 4 | import sys 5 | 6 | def get_videos_stats(youtube,video_ids,flag=1,playlistID = None): 7 | oflag = flag 8 | if not path.exists('youtube.db'): 9 | print("Please Create the database First") 10 | sys.exit() 11 | else: 12 | pass 13 | 14 | conn = sqlite3.connect('youtube.db') 15 | cur = conn.cursor() 16 | count1 = 0 17 | stats = [] 18 | tot_len = 0 19 | for i in range(0, len(video_ids), 50): 20 | res = youtube.videos().list(id=','.join(video_ids[i:i+50]), 21 | part='snippet,statistics,contentDetails').execute() 22 | stats += res['items'] 23 | 24 | new_ids = [] 25 | for video in stats: 26 | count1 += 1 27 | try: 28 | Video_id = video['id'] 29 | except: 30 | Video_id = '' 31 | flag = 3 32 | new_ids.append(Video_id) 33 | Video_title = video['snippet']['title'] 34 | Upload_playlistId = video['snippet']['channelId'] 35 | 36 | if playlistID is not None: 37 | Playlist_Id = playlistID # When call is from a playlist 38 | else: 39 | cur.execute("SELECT Playlist_ID FROM tb_videos WHERE Video_ID = ?" 
,(Video_id,)) 40 | result = cur.fetchone() 41 | if result is None: 42 | Playlist_Id = None 43 | else: 44 | if type(result) is tuple: 45 | Playlist_Id = result[0] 46 | elif type(result) is str: 47 | Playlist_Id = result 48 | else: 49 | Playlist_Id = None 50 | Published_At = video['snippet']['publishedAt'] 51 | date_format = "%Y-%m-%dT%H:%M:%SZ" 52 | epoch = float(time.mktime(time.strptime(Published_At, date_format))) 53 | Channel_Id = video['snippet']['channelId'] 54 | Channel_Title = video['snippet']['channelTitle'] 55 | try: 56 | View_Count = video['statistics']['viewCount'] 57 | except: 58 | View_Count = 1111 59 | cur.execute("SELECT View_Count FROM tb_videos WHERE Video_ID = ?" ,(Video_id,)) 60 | temp = cur.fetchone() 61 | try: 62 | temp = temp[0] 63 | if View_Count < temp: 64 | continue 65 | except: 66 | pass 67 | try: 68 | Like_Count = video['statistics']['likeCount'] 69 | except: 70 | Like_Count = 0 71 | try: 72 | Dislike_Count = video['statistics']['dislikeCount'] 73 | except: 74 | Dislike_Count = 0 75 | try: 76 | Upvote_Ratio = round(((int(Like_Count)/(int(Like_Count)+(int(Dislike_Count))))*100),3) 77 | except: 78 | Upvote_Ratio = 0 79 | try: 80 | Comment_Count = video['statistics']['commentCount'] 81 | except: 82 | Comment_Count = 0 83 | try: 84 | Duration = str(video['contentDetails']['duration']) 85 | Duration = Duration.replace('PT','') 86 | hh=mm=ss = '00' 87 | if Duration.find('H') != -1: 88 | hh = Duration.split('H')[0] 89 | temp = hh+'H' 90 | if len(hh) == 1: 91 | hh = '0'+hh 92 | Duration = Duration.replace(temp,'') 93 | if Duration.find('M') != -1: 94 | mm = Duration.split('M')[0] 95 | temp = mm+'M' 96 | if len(mm) == 1: 97 | mm = '0'+mm 98 | Duration = Duration.replace(temp,'') 99 | if Duration.find('S') != -1: 100 | ss = Duration.split('S')[0] 101 | if len(ss) == 1: 102 | ss = '0'+ss 103 | Duration = (hh+':'+mm+':'+ss) 104 | video_seconds = timedelta(hours = int(hh), 105 | minutes= int(mm), 106 | seconds= int(ss)).total_seconds() 107 | # if 
playlistID is not None: 108 | tot_len += video_seconds 109 | except: 110 | Duration = '0' 111 | video_seconds = 0 112 | 113 | try: 114 | Is_Licensed = video['contentDetails']['licensedContent'] 115 | except: 116 | Is_Licensed = 0 117 | cur.execute("SELECT Is_Seen FROM tb_videos WHERE Video_ID = ?" ,(Video_id,)) 118 | temp = cur.fetchone() 119 | try: 120 | temp = temp[0] 121 | if temp == 1: 122 | Is_Seen = 1 123 | else: 124 | Is_Seen = 0 125 | except: 126 | Is_Seen = 0 127 | # 0 = not seen 1 = seen 128 | cur.execute("SELECT Worth FROM tb_videos WHERE Video_ID = ?" ,(Video_id,)) 129 | temp = cur.fetchone() 130 | try: 131 | temp = temp[0] 132 | if temp == 1: 133 | Worth = 1 134 | else: 135 | Worth = 0 136 | except: 137 | Worth = 0 138 | 139 | cur.execute("SELECT Is_Downloaded FROM tb_videos WHERE Video_ID = ?" ,(Video_id,)) 140 | temp = cur.fetchone() 141 | try: 142 | temp = temp[0] 143 | if temp == 1: 144 | Is_Downloaded = 1 145 | else: 146 | Is_Downloaded = 0 147 | except: 148 | Is_Downloaded = 0 149 | Is_Deleted = 0 150 | if flag == 1 or flag == 2: 151 | Is_Deleted = 0 152 | elif flag == 3: 153 | Is_Deleted = 1 154 | print(Video_id,' is deleted') 155 | cur.execute("UPDATE tb_videos SET IS_Deleted = 1 WHERE Video_ID = ?",(Video_id,)) 156 | flag = oflag 157 | params = (Video_id,Video_title,Is_Seen,Worth,Upload_playlistId,Playlist_Id,Published_At,epoch,Channel_Id,Channel_Title,View_Count,Like_Count,Dislike_Count,Upvote_Ratio,Comment_Count,Duration,video_seconds,Is_Licensed,Is_Deleted,Is_Downloaded) 158 | if flag == 1: 159 | cur.execute("INSERT OR REPLACE INTO tb_videos VALUES (?, ?, ?, ?, ?, ?, ? ,? ,? ,? ,? ,? , ?, ?, ?, ?, ?, ?, ?, ?)", params) 160 | elif flag == 2: 161 | cur.execute("INSERT OR IGNORE INTO tb_videos VALUES (?, ?, ?, ?, ?, ?, ? ,? ,? ,? ,? ,? 
, ?, ?, ?, ?, ?, ?, ?, ?)", params) 162 | conn.commit() 163 | conn.close() 164 | 165 | video_ids = set(video_ids) 166 | new_ids = set(new_ids) 167 | num_new = len(new_ids) 168 | diff = video_ids-new_ids 169 | if len(diff) > 0: 170 | conn = sqlite3.connect('youtube.db') 171 | cur = conn.cursor() 172 | for item in diff: 173 | print(item,' not available') 174 | try: 175 | params = (item,'Not Available',0,0,Channel_Id,playlistID,'','',Channel_Id,Channel_Title,'','','','','','','','',1,0) 176 | cur.execute("INSERT OR IGNORE INTO tb_videos VALUES (?, ?, ?, ?, ?, ?, ? ,? ,? ,? ,? ,? , ?, ?, ?, ?, ?, ?, ?, ?)", params) 177 | cur.execute("UPDATE tb_videos SET IS_Deleted = 1 WHERE Video_ID = ?",(item,)) 178 | except: 179 | pass 180 | conn.commit() 181 | conn.close() 182 | if tot_len > 0: 183 | return tot_len,num_new 184 | 185 | if __name__ == "__main__": 186 | pass -------------------------------------------------------------------------------- /src/import_downloaded_items.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import subprocess, os,re 3 | #SELECT * FROM tb_videos WHERE Video_ID IN (SELECT Video_ID FROM yt_downloaded) ORDER BY Is_Seen 4 | # Sanity Check 5 | from os import listdir 6 | from os.path import isfile, join 7 | import sqlite3 8 | 9 | from get_api_key import api_key 10 | from get_video_stats import get_videos_stats 11 | 12 | def update_local(vid_path): 13 | vid_path1 = '"'+vid_path+'"' 14 | command = "./ffmpeg -i "+vid_path1+" -hide_banner" 15 | try: 16 | with open('log1.txt', "w",encoding='utf-8') as outfile: #latin-1 17 | subprocess.run(command, stderr=subprocess.STDOUT,stdout=outfile) 18 | except Exception as e: 19 | print(e) 20 | with open('log1.txt', "r",encoding='utf-8') as fhand: 21 | for line in fhand: 22 | line=line.lstrip() 23 | temp_match = line[0:11] 24 | if temp_match == 'Stream #0:0': 25 | result = re.findall('\d+x\d+', line) 26 | Resolution = result[0] 27 | result = 
re.findall('[0-9.]+ fps', line) 28 | try: 29 | fps = result[0] 30 | fps = fps.strip(' fps') 31 | except: 32 | fps = 0 33 | if line.startswith('Duration:'): 34 | result = re.findall('\d+ ', line) 35 | bitrate = result[0] 36 | if line.startswith('Stream #0:1'): 37 | result = re.findall('Audio: [a-zA-Z]+,', line) 38 | temp = result[0] 39 | Audio_Type = temp.strip(',') 40 | Audio_Type = Audio_Type[7:] 41 | result = re.findall('\d+ Hz', line) 42 | temp = result[0] 43 | Frequency = temp.strip(' Hz') 44 | result = re.findall('Hz, \w+,', line) 45 | temp = result[0] 46 | Channels = temp.strip('Hz ,') 47 | raw_size = Path(vid_path).stat().st_size 48 | size = raw_size/(1024*1024) 49 | size = round(size,3) 50 | return (Resolution,raw_size,size,fps,bitrate,Audio_Type,Frequency,Channels) 51 | 52 | def import_vids(): 53 | mypath = ('D:\\Youtube1') 54 | conn = sqlite3.connect('C:\\Users\\Sambit\\Desktop\\Projects\\Youtube\\Youtube_Scraper\\youtube.db') 55 | cur = conn.cursor() 56 | # r=root, d=directories, f = files 57 | for r, d, f in os.walk(mypath): 58 | for file in f: 59 | if file.endswith(("mp4", "mkv", "flv", "wmv", "avi", "mpg", "mpeg")): 60 | vid_path = os.path.join(r, file) 61 | vid_id = vid_path[-15:-4] 62 | with open ("skip_files.txt") as f: 63 | if vid_id in f.read(): 64 | continue 65 | vid_type = vid_path[-3:] 66 | cur.execute("SELECT Video_ID FROM yt_downloaded WHERE Video_ID = ?",(vid_id,)) 67 | if (cur.fetchone()) is None: 68 | Resolution,raw_size,size,fps,bitrate,Audio_Type,Frequency,Channels = update_local(vid_path) 69 | params = (vid_id,Resolution,raw_size,size,vid_type,fps,bitrate,Audio_Type,Frequency,Channels,0) 70 | cur.execute("INSERT OR REPLACE INTO yt_downloaded VALUES (?,?,?,?,?,?,?,?,?,?,?)",params) 71 | 72 | conn.commit() 73 | conn.close() 74 | 75 | def update_vids(): 76 | def is_in_main(): 77 | conn = sqlite3.connect('C:\\Users\\Sambit\\Desktop\\Projects\\Youtube\\Youtube_Scraper\\youtube.db') 78 | cur = conn.cursor() 79 | cur.execute("UPDATE 
yt_downloaded SET Is_In_Main = 1 WHERE Video_ID IN (SELECT Video_ID FROM yt_downloaded \ 80 | WHERE Video_ID IN (SELECT Video_ID FROM tb_videos))") 81 | conn.commit() 82 | conn.close() 83 | # is_in_main() 84 | for i in range(2000): 85 | conn = sqlite3.connect('C:\\Users\\Sambit\\Desktop\\Projects\\Youtube\\Youtube_Scraper\\youtube.db') 86 | cur = conn.cursor() 87 | cur.execute("SELECT Count(*) FROM yt_downloaded") 88 | tot = cur.fetchone() 89 | cur.execute("SELECT Video_ID FROM yt_downloaded WHERE Is_In_Main = 0 LIMIT 50") 90 | temp = cur.fetchall() 91 | if len(temp) < 1: 92 | print("All Videos (locally downloaded) are now in main table tb_videos") 93 | break 94 | result = [] 95 | for item in temp: 96 | cur.execute("UPDATE yt_downloaded SET Is_In_Main = 1 WHERE Video_ID = ?",(item[0],)) 97 | result.append(item[0]) 98 | 99 | conn.commit() 100 | conn.close() 101 | 102 | print('Parsing Downloaded Videos :',(i*50),' / ',tot[0],end="\r") 103 | print(' ') 104 | youtube_instance = api_key() 105 | youtube_instance.get_api_key() 106 | youtube = youtube_instance.get_youtube() 107 | get_videos_stats(youtube,result,1) 108 | conn = sqlite3.connect('youtube.db') 109 | cur = conn.cursor() 110 | for item in result: 111 | print('New Item added successfully :',item) 112 | cur.execute("UPDATE tb_videos SET Is_Downloaded = 1 WHERE Video_ID = ?",(item,)) 113 | cur.execute("UPDATE tb_videos SET Is_Seen = 1 WHERE Video_ID = ?",(item,)) 114 | cur.execute("UPDATE tb_videos SET Worth = 1 WHERE Video_ID = ?",(item,)) 115 | conn.commit() 116 | conn.close() 117 | is_in_main() 118 | 119 | if __name__ == "__main__": 120 | import_vids() 121 | update_vids() 122 | -------------------------------------------------------------------------------- /src/load_history.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | import sqlite3, time 3 | from bs4 import BeautifulSoup 4 | 5 | from src.get_video_stats import 
get_videos_stats 6 | from src.get_api_key import api_key 7 | 8 | 9 | def update_title(): 10 | conn = sqlite3.connect('youtube.db') 11 | cur = conn.cursor() 12 | cur.execute("SELECT Video_ID FROM video_history WHERE (Title IS NULL OR Title = '') AND Is_Deleted = 0") 13 | temp = cur.fetchall() 14 | for item in temp: 15 | cur.execute("SELECT Video_title FROM tb_videos WHERE Video_ID = ?",(item[0],)) 16 | tit = cur.fetchone() 17 | cur.execute("UPDATE video_history SET Title = ? WHERE Video_ID = ?",(tit[0],item[0])) 18 | conn.commit() 19 | conn.close() 20 | def update_deleted(): 21 | conn = sqlite3.connect('youtube.db') 22 | cur = conn.cursor() 23 | cur.execute("UPDATE video_history SET Is_Deleted = 1 WHERE Video_ID NOT IN (SELECT Video_ID FROM video_history WHERE Video_ID IN (SELECT Video_ID FROM tb_videos))") 24 | conn.commit() 25 | conn.close() 26 | def update_is_seen(): 27 | conn = sqlite3.connect('youtube.db') 28 | cur = conn.cursor() 29 | cur.execute("UPDATE tb_videos SET Is_Seen = 1 WHERE Video_ID IN (SELECT Video_ID FROM tb_videos \ 30 | WHERE Video_ID IN (SELECT Video_ID FROM video_history))") 31 | conn.commit() 32 | conn.close() 33 | 34 | def update_is_in_main(): 35 | conn = sqlite3.connect('youtube.db') 36 | cur = conn.cursor() 37 | cur.execute("UPDATE video_history SET Is_in_Main = 1 WHERE Video_ID IN (SELECT Video_ID FROM video_history \ 38 | WHERE Video_ID IN (SELECT Video_ID FROM tb_videos))") 39 | 40 | conn.commit() 41 | conn.close() 42 | 43 | def update_history(youtube): 44 | for i in range(2000): 45 | conn = sqlite3.connect('youtube.db') 46 | cur = conn.cursor() 47 | cur.execute("SELECT Count(*) FROM video_history") 48 | tot = cur.fetchone() 49 | cur.execute("SELECT Video_ID FROM video_history WHERE Is_in_Main = 0 AND Is_Deleted = 0 LIMIT 50;") 50 | temp = cur.fetchall() 51 | if len(temp) < 2: 52 | print("All Videos From Watched History are now in main table tb_videos") 53 | break 54 | result = [] 55 | for item in temp: 56 | cur.execute("UPDATE 
video_history SET Is_in_Main = 1 WHERE Video_ID = ?",(item[0],)) 57 | result.append(item[0]) 58 | 59 | conn.commit() 60 | conn.close() 61 | print('Parsing Watch History Videos :',(i*50),' / ',tot[0],end="\r") 62 | get_videos_stats(youtube,result,1) 63 | update_is_in_main() 64 | 65 | def load_history(res='n'): 66 | count_loc_prog = 0 67 | with open("takeout/history/watch-history.html",encoding='utf-8') as fp: 68 | conn = sqlite3.connect('youtube.db') 69 | cur = conn.cursor() 70 | 71 | soup = BeautifulSoup(fp,'lxml') 72 | soup = soup.body 73 | 74 | videos = soup.find_all("div", {"class": "content-cell mdl-cell mdl-cell--6-col mdl-typography--body-1"}) 75 | 76 | print(len(videos)) 77 | 78 | for video in videos: 79 | count_loc_prog += 1 80 | if count_loc_prog % 500 == 0: 81 | print('Loading into Database : ',count_loc_prog,' / ',len(videos),end="\r") 82 | tags = video.find_all('a') 83 | # try: 84 | 85 | if tags == []: 86 | continue 87 | 88 | V_link = tags[0].get('href') 89 | V_link = V_link.split('=')[-1] 90 | br_tags = video.find_all('br') 91 | for tag in br_tags: 92 | watched_at = str(tag.next_sibling) 93 | if watched_at[-3:-1] == 'IS': 94 | final_time = (watched_at) 95 | temp = final_time.replace('IST','+0530') 96 | epoch = time.mktime(time.strptime(temp, "%b %d, %Y, %I:%M:%S %p %z")) 97 | cur.execute("INSERT OR IGNORE INTO video_history VALUES (?,?,?,?,?,?)", (V_link,'',final_time,epoch,0,0)) 98 | 99 | 100 | conn.commit() # Push the data into database 101 | conn.close() 102 | print("\n Loaded \n") 103 | 104 | if res == 'y' or res == "Y": 105 | youtube_instance = api_key() 106 | youtube_instance.get_api_key() 107 | youtube = youtube_instance.get_youtube() 108 | update_history(youtube) 109 | update_title() 110 | update_deleted() 111 | update_is_seen() 112 | update_is_in_main() 113 | 114 | if __name__ == "__main__": 115 | pass -------------------------------------------------------------------------------- /src/most_watched.py: 
-------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | def most_watched(n=5): 4 | conn = sqlite3.connect('youtube.db') 5 | cur = conn.cursor() 6 | cur.execute("SELECT video_history.Video_ID,COUNT(video_history.Video_ID) AS cnt, Video_title FROM video_history \ 7 | LEFT OUTER JOIN tb_videos on tb_videos.Video_ID = video_history.Video_ID \ 8 | GROUP BY video_history.Video_ID ORDER BY cnt DESC;") 9 | results = cur.fetchmany(n) 10 | print("\t"," Video Link","\t","\t","\t"," Times Watched","\t","\t"," Video Name") 11 | print("-------------------------------------------------------------------------------------------------------") 12 | for result in results: 13 | Link = "https://www.youtube.com/watch?v="+result[0] 14 | if result[2] is None: 15 | title = "Video is not available in local database" 16 | else: 17 | title = result[2] 18 | print(Link,'\t',result[1],'\t',title) 19 | conn.commit() 20 | conn.close() 21 | 22 | if __name__ == "__main__": 23 | pass -------------------------------------------------------------------------------- /src/oldest_videos.py: -------------------------------------------------------------------------------- 1 | from get_api_key import api_key 2 | 3 | import argparse 4 | import os 5 | from datetime import datetime 6 | 7 | youtube_instance = api_key() 8 | youtube_instance.get_api_key() 9 | youtube = youtube_instance.get_youtube() 10 | 11 | parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,\ 12 | description='Explore the oldest videos on a Topic',\ 13 | epilog='''Examples \n .\oldest_videos.py tesla \n .\oldest_videos.py "game of thrones" -n 15 -s 2012''') 14 | parser.add_argument("topic", help='Enter the topic') 15 | group2 = parser.add_argument_group() 16 | group2.add_argument('-n','--max_results',type=int, metavar='', default=5, help='The script will display "n" results') 17 | group2.add_argument('-s','--start_year',type=int, metavar='', default=2005, 
help='By default, it will search from 2005') 18 | group2.add_argument('-e','--end_year',type=int, metavar='', default=2010, help='By default, it will search till 2010') 19 | 20 | parser.add_argument('-o','--output', action='store_true', help='output to a File') 21 | 22 | 23 | args = parser.parse_args() 24 | 25 | 26 | def oldest_videos_on_a_topic(topic,Max_limit,start_yr,end_yr): 27 | if args.output: 28 | f = open("old_videos.txt",'w',encoding = 'utf-8') 29 | f.close() 30 | else: 31 | print('\n') 32 | print('Video ID','\t','Upload Date/Time','\t','Video Title') 33 | print('--------','\t','----------------','\t','-----------') 34 | limit = 0 35 | global youtube 36 | start_time = datetime(year=2005, month=4, day=1).strftime('%Y-%m-%dT%H:%M:%SZ') 37 | end_time = datetime(year=2010, month=1, day=1).strftime('%Y-%m-%dT%H:%M:%SZ') 38 | 39 | res = youtube.search().list(part='snippet', 40 | q=topic, 41 | type='video', 42 | publishedAfter=start_time, 43 | publishedBefore=end_time, 44 | maxResults=50).execute() 45 | for item in sorted(res['items'], key=lambda x:x['snippet']['publishedAt']): 46 | title = str(item['snippet']['title']).replace(''',"'").replace('"','"') 47 | if topic.lower() in title.lower(): 48 | limit += 1 49 | date_format = "%Y-%m-%dT%H:%M:%SZ" 50 | publishedAt = datetime.strptime(item['snippet']['publishedAt'], date_format) 51 | if args.output: 52 | f = open("old_videos.txt",'a',encoding = 'utf-8') 53 | f.write(item['id']['videoId']+'\t\t'+str(publishedAt)+'\t\t'+ title ) 54 | f.write('\n') 55 | f.close() 56 | else: 57 | print(item['id']['videoId'],'\t',publishedAt,'\t', title ) 58 | if limit == Max_limit: 59 | break 60 | else: 61 | continue 62 | 63 | if args.output: 64 | print('\nDone! 
Check the file old_video.txt\n') 65 | else: 66 | print('\n') 67 | 68 | if __name__ == "__main__": 69 | key = input("Enter key\n") 70 | youtube = get_api_key(key) 71 | oldest_videos_on_a_topic(args.topic,args.max_results,args.start_year,args.end_year) -------------------------------------------------------------------------------- /src/subscriptions.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | with open("takeout/subscriptions/subscriptions.json", encoding="utf-8") as f: 4 | subs = json.load(f) 5 | i = 0 6 | sub_list = [] 7 | for sub in subs: 8 | temp = (sub["contentDetails"]["totalItemCount"], sub["snippet"]["resourceId"]["channelId"], sub["snippet"]["title"] ) 9 | # temp = (sub["snippet"]["title"]) 10 | # sub_list.append(temp.title()) 11 | sub_list.append(temp) 12 | sub_list = list(set(sub_list)) 13 | new_lst = [] 14 | for sub in sub_list: 15 | temp = sub[2].title() 16 | sub = (sub[0], sub[1], temp) 17 | new_lst.append(sub) 18 | new_lst.sort(key=lambda x:x[2]) 19 | for sub in new_lst: 20 | print(sub) -------------------------------------------------------------------------------- /src/vidsPerTime.py: -------------------------------------------------------------------------------- 1 | import sqlite3,datetime,calendar 2 | 3 | 4 | 5 | def absolute_dates(): 6 | year_2020 = 1577836800 7 | conn = sqlite3.connect('youtube.db') 8 | cur = conn.cursor() 9 | cur.execute("SELECT MIN(epoch) from video_history") 10 | result = cur.fetchone() 11 | oldest = int(result[0]) 12 | cur.execute("SELECT MAX(epoch) from video_history") 13 | result = cur.fetchone() 14 | newest = int(result[0]) 15 | 16 | years = [2018,2019,2020,2021] 17 | months = [x for x in range(1,13)] 18 | dates = [] 19 | # days = [x for x in range(1,29)] 20 | # for year in years: 21 | # for month in months: 22 | # for day in days: 23 | # date = (year,month,day) 24 | # dates.append(date) 25 | # for date in dates: 26 | # print(date) 27 | cal = 
calendar.calendar 28 | temp = calendar.itermonthdates(2021, 1) 29 | print(cal) 30 | conn.close() 31 | 32 | def relative_dates(): 33 | conn = sqlite3.connect('youtube.db') 34 | cur = conn.cursor() 35 | 36 | # Absolute, Relative 37 | 38 | start = 1568523230 39 | end = 1609459199 40 | max_res = 0 41 | for start in range (1577836800,1578614400,3600): 42 | end = start + 31622399 43 | cur.execute("SELECT COUNT(Video_ID) from video_history WHERE epoch > ? AND epoch < ?",(start,end)) 44 | result = cur.fetchone() 45 | result = int(result[0]) 46 | 47 | if result > max_res: 48 | max_res = result 49 | start_year = start 50 | end_year = end 51 | if start % 10000 == 0: 52 | print(start) 53 | conn.close() 54 | start_year = datetime.datetime.utcfromtimestamp(start_year).replace(tzinfo=datetime.timezone.utc) 55 | end_year = datetime.datetime.utcfromtimestamp(end_year).replace(tzinfo=datetime.timezone.utc) 56 | print(max_res,start_year,end_year) 57 | 58 | relative_dates() 59 | 60 | 61 | --------------------------------------------------------------------------------