├── .gitignore
├── Assets
├── example20.jpg
├── example_0.0.jpg
├── example_0.1.jpg
├── example_0.2.jpg
├── example_0.3.jpg
├── example_0.4.jpg
├── example_0.5.jpg
├── example_0.6.jpg
├── example_1.1.jpg
└── title.jpg
├── LICENSE
├── README.md
├── Samples.md
├── YT_Scrape.py
├── old_history.py
├── requirements.txt
└── src
├── __init__.py
├── create_new.py
├── download_these.py
├── downloading.py
├── early_views.py
├── entire_channel.py
├── get_api_key.py
├── get_channel_details.py
├── get_channel_id.py
├── get_channel_playlists.py
├── get_channel_videos.py
├── get_playlist_videos.py
├── get_video_stats.py
├── import_downloaded_items.py
├── load_history.py
├── most_watched.py
├── oldest_videos.py
├── subscriptions.py
└── vidsPerTime.py
/.gitignore:
--------------------------------------------------------------------------------
1 | venv
2 | youtube.db
3 | youtube1.db
4 | youtube2.db
5 | test.py
6 | test2.py
7 | takeout
8 | __pycache__
9 | *.txt
10 | *.exe
11 | *.log
--------------------------------------------------------------------------------
/Assets/example20.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CriticalHunter/Youtube_Scraper/dca7c71823cb49e014a8cdf08dd8e391ddfbd57f/Assets/example20.jpg
--------------------------------------------------------------------------------
/Assets/example_0.0.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CriticalHunter/Youtube_Scraper/dca7c71823cb49e014a8cdf08dd8e391ddfbd57f/Assets/example_0.0.jpg
--------------------------------------------------------------------------------
/Assets/example_0.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CriticalHunter/Youtube_Scraper/dca7c71823cb49e014a8cdf08dd8e391ddfbd57f/Assets/example_0.1.jpg
--------------------------------------------------------------------------------
/Assets/example_0.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CriticalHunter/Youtube_Scraper/dca7c71823cb49e014a8cdf08dd8e391ddfbd57f/Assets/example_0.2.jpg
--------------------------------------------------------------------------------
/Assets/example_0.3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CriticalHunter/Youtube_Scraper/dca7c71823cb49e014a8cdf08dd8e391ddfbd57f/Assets/example_0.3.jpg
--------------------------------------------------------------------------------
/Assets/example_0.4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CriticalHunter/Youtube_Scraper/dca7c71823cb49e014a8cdf08dd8e391ddfbd57f/Assets/example_0.4.jpg
--------------------------------------------------------------------------------
/Assets/example_0.5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CriticalHunter/Youtube_Scraper/dca7c71823cb49e014a8cdf08dd8e391ddfbd57f/Assets/example_0.5.jpg
--------------------------------------------------------------------------------
/Assets/example_0.6.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CriticalHunter/Youtube_Scraper/dca7c71823cb49e014a8cdf08dd8e391ddfbd57f/Assets/example_0.6.jpg
--------------------------------------------------------------------------------
/Assets/example_1.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CriticalHunter/Youtube_Scraper/dca7c71823cb49e014a8cdf08dd8e391ddfbd57f/Assets/example_1.1.jpg
--------------------------------------------------------------------------------
/Assets/title.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CriticalHunter/Youtube_Scraper/dca7c71823cb49e014a8cdf08dd8e391ddfbd57f/Assets/title.jpg
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 T. Sambit Suranjan
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | There was an inactivity of more than 2 years on this project.
2 | I am archiving this due to multiple reasons.
3 | Yet I will create a separate private repo for this and will work on adding new metrics and visualizations although at a slow pace.
4 | If it ends nice will merge that with this repo.
5 |
6 | # Youtube_scrape
7 |
8 |
9 |
10 |
11 |
14 |
15 |
16 | Scrape data about an entire Channel or just a Playlist, using Youtube API. No OAuth is required.
17 |
18 | ## :heavy_check_mark: Features
19 |
20 | Following features are available :
21 |
22 | 
23 |
24 | 1. **create_new** :
25 | 1. It creates a sqlite database to store all data.
26 | 2. Database will be placed in the same folder as the project file, named 'youtube.db'
27 | 3. It will have 4 tables - tb_channel, tb_playlist, tb_videos, video_history
28 | 4. You can use programs like [DB Browser](https://sqlitebrowser.org) , which is lightweight, to view the database.
29 | 2. **Oldest Video on A Topic** :
30 | 1. It is an isolate program, that can be run independently.
31 | 2. It doesn't depend on main code or any database.
32 | 3. **Scrape A Channel**:
33 | 1. Allows scraping of Channel Details and its playlists.
34 | 2. It can also scrape details for each video of that channel.
35 | 1. If this option is not chosen, the playlist table won't have Playlist Duration.
36 | 4. **Scrape A Single Playlist**:
37 | 1. Allows scraping info about a single Playlist and details about all its videos.
38 | 5. **Load Your History**:
39 | 1. Make sure you have downloaded google Takeouts for your account to the PWD.
40 | 2. Make sure you have the following path './takeout/history/watch-history.html'
41 | 3. Option to keep videos of your history on a separate table or integrate them with main table tb_videos
42 | 1. In order to use next features, you have to integrate them.
43 | 6. **Most Watched Video**:
44 | 1. You can list your most watched 'n' videos
45 | 7. **Early Viewed**:
46 | 1. You can list 'n' videos, which you saw earliest after they were uploaded.
47 | 2. There are some discrepancies, as many videos are reuploaded after you have seen them.
48 | 1. Program ignores those
49 | 3. It now only works when you watched it in IST.
50 | 8. **Generate Download List**:
51 | 1. This will create a text file, that will list Youtube URLs that can be downloaded by Youtube-DL or IDM etc.
52 | 2. It will select videos which are marked 'Worth = 1' in the database.
53 | 1. This operation is to be done by the user directly on the database (using DB Browser or such)
54 | 3. There is an option to list videos of a single Channel or from the entire Database.
55 | 4. *Caution* : Once a video is processed by this function, it will be marked 'Is_Downloaded = 1'. Next time this function is run, new video IDs will be considered.
56 | 1. Hence User must make sure, all videos in *download_list.txt* are downloaded before rewriting the file.
57 |
58 | ## :computer: Setup Guide
59 |
60 | Below is a detailed guide on setting up the environment.
61 |
62 | ### Youtube API
63 |
64 | First you need to have your Youtube API key. Below is a link to a video that will guide you. **Watch from 0:00 - 5:30**
65 | [](https://www.youtube.com/watch?v=th5_9woFJmk)
66 |
67 | 1. **Note - Youtube API is rate limited to 10000 hits/day.**
68 | 2. You can view your quotas at [here - console](https://console.cloud.google.com/iam-admin/quotas)
69 | 3. Cost of operations is described [here - Youtube API docs](https://developers.google.com/youtube/v3/docs)
70 | 4. Code has been optimized to decrease quota usage. You can easily work with 50000 videos/day. For more please check your quota limit.
71 |
72 | ### Installation
73 |
74 | You need to install google-api-python-client to run this project. [github API link](https://github.com/googleapis/google-api-python-client)
75 | Install this library in a [virtualenv](https://virtualenv.pypa.io/en/latest/) using pip.
76 |
77 | #### Mac/Linux
78 |
79 | ```
80 | pip3 install virtualenv
81 | virtualenv venv
82 | . venv/bin/activate
83 | pip3 install -r requirements.txt
84 | ```
85 |
86 | #### Windows
87 |
88 | ```
89 | pip3 install virtualenv
90 | virtualenv venv
91 | venv\Scripts\activate
92 | pip3 install -r requirements.txt
93 | ```
94 |
95 | ## Working Guide
96 |
97 | 1. Get Your Youtube API key as shown in above video.
98 | 2. Pip install the requirements.txt
99 | 3. Run the program YT_Scrape.py
100 |
101 | The script will ask for required data in the command line and is pretty self-explanatory (Once it runs)
102 |
103 | [View Samples](/Samples.md)
104 |
105 | ## :hearts: Contributing
106 |
107 | There are several ways to help.
108 |
109 | 1. **Spread the word:** More users means more possible people testing and contributing to the app which in turn means better stability and possibly more and better features. You can [](https://twitter.com/intent/tweet?text=Wow:&url=https%3A%2F%2Fgithub.com%2FCriticalHunter%2FYoutube_stats.git) or share it on [LinkedIn](http://www.linkedin.com/shareArticle?mini=true&url=https://github.com/CriticalHunter/Youtube_Scraper.git). Every little bit helps !
110 | 2. **[Make a feature or improvement request](https://github.com/CriticalHunter/Youtube_Scraper/issues/new)**: Something can be done better? Something essential missing? Let us know!
111 | 3. **[Report bugs](https://github.com/CriticalHunter/Youtube_Scraper/issues/new)**
112 | 4. **Contribute**: You don't have to be programmer to help.
113 |
114 | 1. **Treat Me To A Coffee Instead** [Paypal](https://paypal.me/CriticalHunter23)
115 |
116 | ### Pull Requests
117 |
118 | **Pull requests** are of course very welcome! Please make sure to also include the issue number in your commit message, if you're fixing a particular issue (e.g.: `feat: add nice feature with the number #31`).
119 |
--------------------------------------------------------------------------------
/Samples.md:
--------------------------------------------------------------------------------
1 |
2 | ### Getting Channel ID
3 |
4 | 
5 |
6 |
7 |
8 | ### Getting Playlist ID
9 |
10 | 
11 |
12 | ## Database (in DB Browser) sample results
13 | ### Database Schema
14 | 
15 | ### tb_channels Table
16 | 
17 | ### tb_playlists Table
18 | 
19 | ### tb_videos Table
20 | 
21 |
22 | 
23 |
24 | ### video_history Table
25 | 
--------------------------------------------------------------------------------
/YT_Scrape.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, unicode_literals
2 | import re, six, os, sys, json
3 |
4 |
5 | from pyfiglet import Figlet, figlet_format
6 | from pprint import pprint
7 | from PyInquirer import style_from_dict, Token, prompt, Validator, ValidationError
8 | from termcolor import colored
9 | import argparse
10 |
11 |
12 | from src.create_new import dbase
13 | from src.get_api_key import api_key
14 | from src.get_channel_details import get_channel_details
15 | from src.entire_channel import entire_channel
16 | from src.get_playlist_videos import get_playlist_videos
17 | from src.load_history import load_history
18 | from src.most_watched import most_watched
19 | from src.early_views import early_views
20 | from src.download_these import download_n
21 |
22 | from src.downloading import *
23 |
def log1(string, color, figlet=False):
    """Print *string* to stdout in the given *color*.

    When *figlet* is True the text is first rendered as ASCII art using
    pyfiglet's 'doom' font. Falls back to a plain print if termcolor's
    `colored` is unavailable (as imported here it is always truthy, so
    that branch is dormant).
    """
    if not colored:
        six.print_(string)
        return
    rendered = figlet_format(string, font='doom') if figlet else string
    six.print_(colored(rendered, color))
33 |
# Start-up banner and greeting.
log1("Youtube_Scraper", color="blue", figlet=True)
log1("Welcome to Youtube_Scraper", "green")

# Colour scheme applied to every PyInquirer prompt below.
style = style_from_dict({
    Token.QuestionMark: '#E91E63 bold',
    Token.Selected: '#673AB7 bold',
    Token.Instruction: '',  # default
    Token.Answer: '#2196f3 bold',
    Token.Question: '',
})
44 |
45 |
46 |
class NumberValidator(Validator):
    """PyInquirer validator that accepts only text parseable as an int."""

    def validate(self, document):
        text = document.text
        try:
            int(text)
        except ValueError:
            # Reject the input, leaving the cursor at the end of it.
            raise ValidationError(
                message='Please enter a number',
                cursor_position=len(text),
            )
55 |
56 |
print('Please Choose the desired Options')
print('Press "ctrl+C" to escape at any point\n')


# Create or migrate the local SQLite database before doing anything else.
dbase()

# First run: ask for the YouTube API key and cache it in key.txt.
if not os.path.exists("key.txt"):
    questions = [
        {
            'type': 'input',
            'name': 'API',
            'message': '"key.txt" file not found. Please enter your Youtube API key '
        },]
    answers = prompt(questions, style=style)
    with open('key.txt','w') as f:
        f.write(answers['API'])
# Build the authenticated YouTube API client; abort when the key is unusable.
youtube_instance = api_key()
youtube_instance.get_api_key()
youtube = youtube_instance.get_youtube()
if youtube == None:
    sys.exit()
78 |
# Interactive main menu. Each follow-up question is gated with a 'when'
# callback so only the questions relevant to the chosen operation are asked.
# The whole flow runs inside one try/except so any failure is reported as a
# single printed message instead of a traceback.
try:
    questions = [
        {
            'type': 'list',
            'name': 'operation',
            'message': 'What do you want to do?',
            'choices': ['Find oldest videos on a topic', 'Scrape a Channel','Scrape a Single Playlist' ,'Load Your History','Most Watched Video','Early Viewed Video','Generate Download List','Download Videos using YoutubeDL'],
            # Answers are lower-cased so the comparisons below can use
            # lower-case literals.
            'filter': lambda val: val.lower()
        },
        {
            'type': 'list',
            'name': 'Channel',
            'message': 'Select Further \n Scraping all videos for a big channel will surpass your free API Limit',
            'choices': ['Scrape Everything for a channel', 'Just Channel Stats (Individual video stats are not scraped)'],
            'when': lambda answers: answers['operation'] == 'scrape a channel'
        },
        {
            'type': 'input',
            'name': 'channelID',
            'message': 'Enter the Channel ID (leave it blank to pick channels from Channels.txt)',
            'when': lambda answers: answers['operation'] == 'scrape a channel' and answers['Channel'] != ''
        },
        {
            'type': 'input',
            'name': 'playlistID',
            'message': 'Enter the Playlist ID',
            'when': lambda answers: answers['operation'] == 'scrape a single playlist'
        },
        {
            'type': 'list',
            'name': 'Download',
            'message': 'What should the list contain?',
            'choices': ['Videos from a single Channel', 'Videos from entire database'],
            'when': lambda answers: answers['operation'] == 'generate download list'
        },
        {
            'type': 'confirm',
            'name': 'import',
            'message': 'Do you want to import your video_history into main table(tb_videos) too? ',
            'default': False,
            'when': lambda answers: answers['operation'] == 'load your history'
        },
        {
            'type': 'list',
            'name': 'Quality',
            'message': 'What Quality you want to download? (Make sure videos are listed in "download.txt" file)',
            'choices': ['4k/Best Available','1080p','720p','360p'],
            'when': lambda answers: answers['operation'] == 'download videos using youtubedl'
        },
    ]

    answers = prompt(questions, style=style)


    if answers['operation'] == 'find oldest videos on a topic':
        # Delegates to the standalone script's argparse help screen.
        # NOTE(review): Windows-style path; '\s' and '\o' are not recognised
        # escapes so the backslashes survive literally, but this call is not
        # portable to Linux/macOS.
        os.system("python .\src\oldest_videos.py -h")

    elif answers['operation'] == 'scrape a channel':
        if answers['channelID'] == '':
            # Blank ID: scrape every channel ID listed in Channels.txt.
            with open("Channels.txt") as f:
                for line in f:
                    # Force the second character to 'C' (e.g. an uploads
                    # playlist id 'UU...' becomes the channel id 'UC...').
                    # Presumably normalises pasted IDs — TODO confirm.
                    new_Ch_ID = line[0]+'C'+line[2:]
                    new_Ch_ID = new_Ch_ID.strip()
                    print(new_Ch_ID)
                    if answers['Channel'] == 'Just Channel Stats (Individual video stats are not scraped)':
                        get_channel_details(youtube,new_Ch_ID)
                    elif answers['Channel'] == 'Scrape Everything for a channel':
                        entire_channel(youtube,new_Ch_ID)

        else:
            # Single channel entered interactively; same ID normalisation.
            Ch_ID = answers['channelID']
            new_Ch_ID = Ch_ID[0]+'C'+Ch_ID[2:]
            if answers['Channel'] == 'Just Channel Stats (Individual video stats are not scraped)':
                get_channel_details(youtube,new_Ch_ID)
            elif answers['Channel'] == 'Scrape Everything for a channel':
                entire_channel(youtube,new_Ch_ID)

    elif answers['operation'] == 'scrape a single playlist':
        get_playlist_videos(youtube,answers['playlistID'])

    elif answers['operation'] == 'load your history':
        # load_history expects 'y'/'n' for "also merge into tb_videos".
        if answers['import'] == True:
            res = 'y'
        elif answers['import'] == False:
            res = 'n'
        print("Please Wait ...")
        load_history(res)

    elif answers['operation'] == 'most watched video':
        print("If your watch history is not loaded in database, it will give empty result")
        print("Please enter, How many items to retrieve e.g. 10 for Top 10 \n")
        n = int(input())
        most_watched(n)

    elif answers['operation'] == 'early viewed video':
        print("If your watch history is not loaded in database, it will give empty result")
        print("Please enter, How many items to retrieve e.g. 10 for Top 10 \n")
        n = int(input())
        early_views(n)

    elif answers['operation'] == 'generate download list':
        if answers['Download'] == 'Videos from a single Channel':
            print("It will list videos that are marked 'Is-Good' and is present in your database")
            chc = input("Please enter the channel ID \t")
            print("Please enter, How many items the list will contain \n")
            n = int(input())
            download_n(chc,n)
        elif answers['Download'] == 'Videos from entire database':
            print("It will list videos that are marked 'Is-Good' and is present in your database")
            chc = ''  # empty channel ID means "whole database" to download_n
            print("Please enter, How many items the list will contain \n")
            n = int(input())
            download_n(chc,n)
    elif answers['operation'] == 'download videos using youtubedl':
        print("\nIt will download all the videos that are listed in download.txt")
        print("Do you want to replace file names (_ in place of space) and convert thumbnail images (from WEBP to JPEG) ?\n")
        chc2 = input("Please enter Y/N \t")
        if chc2 == 'Y' or chc2 == 'Yes':
            if answers['Quality'] == '4k/Best Available':
                download_files('4k')
            elif answers['Quality'] == '1080p':
                download_files(1080)
            elif answers['Quality'] == '720p':
                download_files(720)
            elif answers['Quality'] == '360p':
                download_files(360)
            # Post-process the download folder (hard-coded Windows location).
            replace2('D:\Youtube')
            convertWebp2jpgInDirectory('D:\Youtube')
        else:
            if answers['Quality'] == '4k/Best Available':
                download_files('4k')
            elif answers['Quality'] == '1080p':
                download_files(1080)
            elif answers['Quality'] == '720p':
                download_files(720)
            elif answers['Quality'] == '360p':
                download_files(360)

except Exception as e:
    # Single catch-all boundary: report the error and fall through to exit.
    print(e)
--------------------------------------------------------------------------------
/old_history.py:
--------------------------------------------------------------------------------
1 | import pprint
2 | from bs4 import BeautifulSoup
3 | import re
4 | import sqlite3
5 | import time
6 | import datetime,pytz
7 |
# One-off helper: parse a saved Google activity HTML dump (test.html),
# extract watched-video dates/times/IDs into watched.txt, then load that
# file into the video_history table of youtube.db.

with open("test.html",encoding='utf-8') as fp: # To open a local html file
    soup = BeautifulSoup(fp,features='lxml')

# Write a prettified copy of the HTML for manual inspection.
soup_str = soup.prettify() # Prettify the HTML, but it becomes String
with open('temp.html','w',encoding='utf-8') as wr:
    wr.write(soup_str)

# Each activity entry lives in a <c-wiz class="xDtZAf"> element.
tags = soup.find_all("c-wiz", {"class": "xDtZAf"})
for tag in tags:
    with open("watched.txt",'a',encoding='utf-8') as f:
        date = tag.get('data-date')  # YYYYMMDD, consumed by strptime below
        foo = tag.find("div", {"class": "QTGV3c"})
        temp = (foo.get_text())
        watched = temp.split(' ')
        watched = watched[0]
        # Only entries whose text starts with 'Watched' count.
        if watched == 'Watched':
            try:
                bar = foo.a.get('href')
                vid_id = bar[-11:]  # YouTube video IDs are 11 characters
                # NOTE(review): this rebinds the imported `time` module to a
                # tag; harmless here since the module is never used after.
                time = tag.find("div", {"class": "H3Q9vf XTnvW"})
                time = time.get_text()
                f.write(date)
                f.write(' ')
                # Extract 'H:MM AM/PM' and normalise to 'HH:MM:00 AM/PM'.
                tm = re.findall('\d+:\d+ .M',time)[0]
                tm1=tm.split(':')[0]
                tm2=tm.split(':')[1]
                tm21 = tm2.split(' ')[0]
                tm22 = tm2.split(' ')[1]
                if len(tm1)==1:
                    tm='0'+tm1+':'+tm21+':'+'00'+' '+tm22
                else:
                    tm=tm1+':'+tm21+':'+'00'+' '+tm22
                f.write(tm)
                f.write(' ')
                f.write(vid_id)
                f.write('\n')
            except:
                # Entries without a link or a timestamp are skipped on purpose.
                pass

# Second pass: load every line of watched.txt into the database.
with open("watched.txt",'r',encoding='utf-8') as fhand:
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()
    for line in fhand:
        time = line[0:-12]  # everything before the trailing 11-char video id
        p='%Y%m%d %I:%M:%S %p '
        epoch = (datetime.datetime.strptime(time, p))
        # dtobj3=dtobj1.replace(tzinfo=pytz.UTC) #replace method
        # dtobj_kolkata=dtobj3.astimezone(pytz.timezone("Asia/Kolkata"))
        # epoch = dtobj_kolkata.timestamp()
        new_format = epoch.strftime('%b %d, %Y, %I:%M:%S %p')
        vid_id = line[-12:-1]
        # NOTE(review): inserts 4 values — matches the pre-migration
        # 4-column video_history schema, not the current 6-column one.
        cur.execute("INSERT OR IGNORE INTO video_history VALUES (?,?,?,?)", (vid_id,new_format,epoch.timestamp(),0))
    conn.commit()
    conn.close()
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | astroid==2.4.2
2 | atomicwrites==1.4.0
3 | attrs==20.3.0
4 | beautifulsoup4==4.9.3
5 | cachetools==4.2.0
6 | certifi==2024.7.4
7 | chardet==4.0.0
8 | colorama==0.4.4
9 | ffmpeg-python==0.2.0
10 | future==0.18.3
11 | google-api-core==1.24.1
12 | google-api-python-client==1.12.8
13 | google-auth==1.24.0
14 | google-auth-httplib2==0.0.4
15 | googleapis-common-protos==1.52.0
16 | httplib2==0.19.1
17 | idna==3.7
18 | iniconfig==1.1.1
19 | isort==5.7.0
20 | lazy-object-proxy==1.4.3
21 | lxml==4.9.1
22 | mccabe==0.6.1
23 | packaging==20.8
24 | pluggy==0.13.1
25 | prompt-toolkit==1.0.14
26 | protobuf==3.18.3
27 | py==1.10.0
28 | pyasn1==0.4.8
29 | pyasn1-modules==0.2.8
30 | pyfiglet==0.8.post1
31 | Pygments==2.15.0
32 | PyInquirer==1.0.3
33 | pylint==2.6.0
34 | pyparsing==2.4.7
35 | pytest==6.2.1
36 | pytz==2020.4
37 | regex==2020.11.13
38 | requests==2.32.2
39 | rsa==4.7
40 | six==1.15.0
41 | soupsieve==2.1
42 | termcolor==1.1.0
43 | toml==0.10.2
44 | uritemplate==3.0.1
45 | urllib3==1.26.19
46 | wcwidth==0.2.5
47 | wrapt==1.12.1
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CriticalHunter/Youtube_Scraper/dca7c71823cb49e014a8cdf08dd8e391ddfbd57f/src/__init__.py
--------------------------------------------------------------------------------
/src/create_new.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 | import os
3 |
def create_new(db_path='youtube.db'):
    """Create the SQLite database with every table used by the scraper.

    Tables: tb_channels, tb_playlists, tb_videos, video_history and
    yt_downloaded. All statements use CREATE TABLE IF NOT EXISTS, so
    calling this against an existing database only adds missing tables.

    Args:
        db_path: Location of the database file. Defaults to 'youtube.db'
            in the working directory, matching the behaviour existing
            callers rely on.
    """
    # NOTE: in yt_downloaded the 'bitrate' column is deliberately left
    # untyped (SQLite permits typeless columns); kept as-is for schema
    # compatibility with databases created by earlier versions.
    ddl_statements = (
        """CREATE TABLE IF NOT EXISTS tb_channels (
            Channel_ID TEXT PRIMARY KEY,
            Channel_title TEXT,
            Published_At TEXT NOT NULL,
            Country TEXT,
            View_Count INTEGER,
            Subscriber_Count INTEGER,
            Video_Count INTEGER,
            Playlist_Count INTEGER,
            Channel_Duration INTEGER,
            Duration_in_Text TEXT,
            Is_Deleted INTEGER,
            Deleted_Videos INTEGER,
            Downloaded_Videos INTEGER,
            Folder_Size_GB REAL,
            Channel_last_Scraped TEXT,
            Auto_Update INTEGER,
            Description TEXT
        )""",
        """CREATE TABLE IF NOT EXISTS tb_playlists(
            Playlist_ID TEXT PRIMARY KEY,
            Playlist_title TEXT,
            Channel_ID TEXT NOT NULL,
            Channel_Title TEXT NOT NULL,
            Published_At TEXT NOT NULL,
            Current_Video_Count INTEGER,
            Playlist_Seconds INTEGER,
            Playlist_Duration TEXT,
            Is_Seen INTEGER,
            Worth INTEGER,
            Is_Removed INTEGER,
            Deleted_Videos INTEGER,
            Downloaded_Videos INTEGER,
            Folder_Size_GB REAL,
            Playlist_last_Scraped TEXT,
            Auto_Update INTEGER
        )""",
        """CREATE TABLE IF NOT EXISTS tb_videos (
            Video_ID TEXT PRIMARY KEY,
            Video_title TEXT,
            Is_Seen INTEGER,
            Worth INTEGER,
            Upload_playlistId TEXT,
            Playlist_ID TEXT,
            Published_At TEXT NOT NULL,
            epoch REAL NOT NULL,
            Channel_ID TEXT NOT NULL,
            Channel_Title TEXT NOT NULL,
            View_Count INTEGER,
            Like_Count INTEGER,
            Dislike_Count INTEGER,
            Upvote_Ratio REAL,
            Comment_Count INTEGER,
            Duration TEXT,
            video_seconds INTEGER,
            Is_Licensed INTEGER,
            Is_Deleted INTEGER,
            Is_Downloaded INTEGER
        )""",
        # A video can legitimately be watched more than once, hence the
        # composite (Video_ID, epoch) primary key.
        """CREATE TABLE IF NOT EXISTS video_history (
            Video_ID TEXT NOT NULL,
            Title TEXT,
            Watched_at TEXT,
            epoch REAL NOT NULL,
            Is_in_Main INTEGER,
            Is_Deleted INTEGER,
            PRIMARY KEY ( Video_ID, epoch)
        )""",
        """CREATE TABLE IF NOT EXISTS yt_downloaded (
            Video_ID TEXT PRIMARY KEY,
            Resolution TEXT,
            Raw_Size INTEGER,
            Size REAL,
            vid_type TEXT,
            FPS TEXT,
            bitrate,
            Audio_Type TEXT,
            Frequency INTEGER,
            Channels TEXT,
            Is_In_Main INTEGER
        )""",
    )

    conn = sqlite3.connect(db_path)
    try:
        cur = conn.cursor()
        for statement in ddl_statements:
            cur.execute(statement)
        conn.commit()  # Push the data into database
    finally:
        # Always release the connection, even when a CREATE fails
        # (the original version leaked it on error).
        conn.close()
106 |
def migrate():
    """Upgrade a first-release 'youtube.db' to the current schema, in place.

    Uses SQLite's rename -> recreate -> copy -> drop pattern for the three
    main tables, then ALTERs in the columns introduced after the initial
    release. The bare excepts around the ALTER batches are deliberate so
    that re-running against a partially-migrated file does not fail.
    """
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()

    # Disable FK checks while tables are renamed and dropped mid-copy.
    cur.execute("PRAGMA foreign_keys=off")
    cur.execute("BEGIN TRANSACTION")

    # --- tb_channels: rebuild with the original columns, copy data over ---
    cur.execute("ALTER TABLE tb_channels RENAME TO _tb_channels_old")
    cur.execute("""
    CREATE TABLE IF NOT EXISTS tb_channels (
        Channel_ID TEXT PRIMARY KEY,
        Channel_title TEXT,
        Published_At TEXT NOT NULL,
        Country TEXT,
        View_Count INTEGER,
        Subscriber_Count INTEGER,
        Video_Count INTEGER,
        Playlist_Count INTEGER
    )

    """)
    cur.execute("INSERT INTO tb_channels SELECT * FROM _tb_channels_old")
    try:
        # Columns introduced after the initial release.
        cur.execute("ALTER TABLE tb_channels ADD COLUMN Channel_Duration INTEGER")
        cur.execute("ALTER TABLE tb_channels ADD COLUMN Duration_in_Text TEXT")
        cur.execute("ALTER TABLE tb_channels ADD COLUMN Is_Deleted INTEGER")
        cur.execute("ALTER TABLE tb_channels ADD COLUMN Deleted_Videos INTEGER")
        cur.execute("ALTER TABLE tb_channels ADD COLUMN Downloaded_Videos INTEGER")
        cur.execute("ALTER TABLE tb_channels ADD COLUMN Folder_Size_GB REAL")
        cur.execute("ALTER TABLE tb_channels ADD COLUMN Channel_last_Scraped TEXT")
        cur.execute("ALTER TABLE tb_channels ADD COLUMN Auto_Update INTEGER")
        cur.execute("ALTER TABLE tb_channels ADD COLUMN Description TEXT")
    except:
        # These stats were added after the initial release of this code;
        # they may already exist, in which case the ALTERs fail harmlessly.
        pass
    cur.execute("DROP TABLE _tb_channels_old")

    # --- tb_playlists: same rebuild/copy pattern ---
    cur.execute("ALTER TABLE tb_playlists RENAME TO _tb_playlists_old")
    cur.execute("""CREATE TABLE IF NOT EXISTS tb_playlists(
        Playlist_ID TEXT PRIMARY KEY,
        Playlist_title TEXT,
        Channel_ID TEXT NOT NULL,
        Channel_Title TEXT NOT NULL,
        Published_At TEXT NOT NULL,
        Item_Count INTEGER,
        Playlist_Seconds INTEGER,
        Playlist_Duration TEXT,
        Is_Seen INTEGER,
        Worth INTEGER
    )
    """)
    cur.execute("INSERT INTO tb_playlists SELECT * FROM _tb_playlists_old")
    try:
        # Post-release columns, plus a rename of Item_Count.
        cur.execute("ALTER TABLE tb_playlists ADD COLUMN Is_Removed INTEGER")
        cur.execute("ALTER TABLE tb_playlists ADD COLUMN Deleted_Videos INTEGER")
        cur.execute("ALTER TABLE tb_playlists ADD COLUMN Downloaded_Videos INTEGER")
        cur.execute("ALTER TABLE tb_playlists ADD COLUMN Folder_Size_GB REAL")
        cur.execute("ALTER TABLE tb_playlists ADD COLUMN Playlist_last_Scraped TEXT")
        cur.execute("ALTER TABLE tb_playlists ADD COLUMN Auto_Update INTEGER")
        cur.execute("ALTER TABLE tb_playlists RENAME COLUMN Item_Count TO Current_Video_Count")
    except:
        # These stats were added after the initial release of this code.
        pass
    cur.execute("DROP TABLE _tb_playlists_old")

    # --- tb_videos: rebuild straight into the current shape ---
    cur.execute("ALTER TABLE tb_videos RENAME TO _tb_videos_old")
    cur.execute("""CREATE TABLE IF NOT EXISTS tb_videos (
        Video_ID TEXT PRIMARY KEY,
        Video_title TEXT,
        Is_Seen INTEGER,
        Worth INTEGER,
        Upload_playlistId TEXT,
        Playlist_ID TEXT,
        Published_At TEXT NOT NULL,
        epoch REAL NOT NULL,
        Channel_ID TEXT NOT NULL,
        Channel_Title TEXT NOT NULL,
        View_Count INTEGER,
        Like_Count INTEGER,
        Dislike_Count INTEGER,
        Upvote_Ratio REAL,
        Comment_Count INTEGER,
        Duration TEXT,
        video_seconds INTEGER,
        Is_Licensed INTEGER,
        Is_Deleted INTEGER,
        Is_Downloaded INTEGER
    )
    """)
    cur.execute("INSERT INTO tb_videos SELECT * FROM _tb_videos_old")
    cur.execute("DROP TABLE _tb_videos_old")

    # yt_downloaded is new in this schema; 'bitrate' is intentionally
    # untyped (SQLite allows typeless columns).
    cur.execute("""CREATE TABLE IF NOT EXISTS yt_downloaded (
        Video_ID TEXT PRIMARY KEY,
        Resolution TEXT,
        Raw_Size INTEGER,
        Size REAL,
        vid_type TEXT,
        FPS TEXT,
        bitrate,
        Audio_Type TEXT,
        Frequency INTEGER,
        Channels TEXT,
        Is_In_Main INTEGER
    )

    """)
    try:
        # Drop the table yt_downloaded replaces, if it ever existed.
        cur.execute("DROP TABLE tb_downloaded")
    except:
        pass
    cur.execute("PRAGMA foreign_keys=on")
    conn.commit()  # Push the data into database
    conn.close()
221 |
def dbase():
    """Ensure 'youtube.db' exists and matches the current schema.

    Creates a fresh database when the file is missing; otherwise probes
    for a column added after the initial release
    (tb_channels.Deleted_Videos) and migrates the database when the probe
    fails.
    """
    if not os.path.exists("youtube.db"):
        create_new()
        return
    conn = sqlite3.connect('youtube.db')
    try:
        # Probe a late-addition column; its absence means an old schema.
        conn.execute("SELECT Deleted_Videos FROM tb_channels")
        needs_migration = False
    except sqlite3.Error:
        # Narrowed from a bare except: only database errors imply an
        # outdated schema.
        needs_migration = True
    finally:
        # Bug fix: the original never closed this connection.
        conn.close()
    if needs_migration:
        migrate()
232 |
233 |
# Allow "python src/create_new.py" to (re)initialise the database directly.
if __name__ == "__main__":
    dbase()
--------------------------------------------------------------------------------
/src/download_these.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 |
def download_n(chc='',n=50):
    """Write up to *n* pending video URLs to download.txt and mark them.

    Selects rows from tb_videos where Worth = 1 and Is_Downloaded = 0
    (optionally restricted to one channel), writes one watch-URL per line
    to 'download.txt', and flags each selected row Is_Downloaded = 1 so
    the next run picks fresh videos.

    Args:
        chc: Channel_ID to restrict the list to; '' means the whole database.
        n:   Maximum number of URLs to emit.
    """
    conn = sqlite3.connect('youtube.db')
    try:
        cur = conn.cursor()
        if chc == '':
            cur.execute(
                "SELECT Video_ID FROM tb_videos WHERE Worth = 1 and Is_Downloaded = 0 LIMIT ?",
                (n,))
        else:
            cur.execute(
                "SELECT Video_ID FROM tb_videos WHERE Worth = 1 and Is_Downloaded = 0 and Channel_ID = ? LIMIT ?",
                (chc,n))
        down_list = cur.fetchall()
        # Open the output file only after the query succeeded, so a failed
        # run no longer truncates an existing download.txt.
        with open("download.txt",'w',encoding='utf-8') as fp:
            for item in down_list:
                link = "https://www.youtube.com/watch?v="+item[0]
                cur.execute("UPDATE tb_videos SET Is_Downloaded = 1 WHERE Video_ID = ?",(item[0],))
                fp.write(link)
                fp.write('\n')
        conn.commit()  # Push the data into database
    except sqlite3.Error:
        # Narrowed from a bare except; also fixes the original bug of
        # calling fetchall() on a cursor whose execute had failed.
        print("Please enter correct Channel ID")
    finally:
        # Bug fix: always release the connection, even on error.
        conn.close()
22 |
# Importable module; nothing to do when run directly.
if __name__ == "__main__":
    pass
--------------------------------------------------------------------------------
/src/downloading.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 |
4 | '''
5 | make It run on both linux / and windows \
6 | '''
7 |
def download_files(ch=720):
    """Invoke youtube-dl on the links in download.txt at the requested quality.

    ch: target quality — 360, 720, 1080, or the string '4k'.
        Raises ValueError for anything else (previously an unknown value
        crashed with an unhelpful NameError on `commandline`).
    """
    # Only the -f format selector differs between qualities; everything
    # else about the command line is identical (was four near-identical
    # hard-coded strings).
    format_selectors = {
        '4k': "bestvideo+(bestaudio[acodec^=opus]/bestaudio)/bestvideo[height<=360]",
        1080: "bestvideo[height<=1080]+(bestaudio[acodec^=opus]/bestaudio)/bestvideo[height<=360]",
        720: "bestvideo[height<=720]+(bestaudio[acodec^=opus]/bestaudio)/bestvideo[height<=360]",
        360: "bestvideo[height<=360]+(bestaudio[acodec^=opus]/bestaudio)/bestvideo[height<=360]",
    }
    try:
        selector = format_selectors[ch]
    except KeyError:
        raise ValueError("ch must be one of 360, 720, 1080 or '4k', got %r" % (ch,))
    commandline = (
        'youtube-dl --add-metadata --write-info-json --write-thumbnail --force-ipv4 '
        '--sleep-interval 3 --max-sleep-interval 6 --ignore-errors --no-continue --no-overwrites '
        '--download-archive archive.log -f "' + selector + '" '
        '--merge-output-format "mkv" -o "D:/Youtube/%(uploader)s/%(upload_date)s_%(title)s %(id)s.%(ext)s" -a download.txt'
    )
    os.system(commandline)
34 |
def replace2(parent):
    """Recursively rename files and folders under *parent* to remove spaces.

    Files: ' ' -> '__'.  Folders: ' ' -> '_', '.' -> '_', "'" removed.
    Folder entries in os.walk's list are patched in place so the walk
    descends into the renamed directories.
    """
    for path, folders, files in os.walk(parent):
        for name in files:
            try:
                os.rename(os.path.join(path, name),
                          os.path.join(path, name.replace(' ', '__')))
            except FileExistsError:
                # Same guard the folder branch already had: skip name clashes
                # (raised on Windows when the target exists).
                pass
        for i, folder in enumerate(folders):
            new_name = folder.replace(' ', '_').replace('.', '_').replace("'", '')
            try:
                os.rename(os.path.join(path, folder), os.path.join(path, new_name))
            except FileExistsError:
                pass
            folders[i] = new_name
def convertWebp2jpgInDirectory(dir):
    """Traverse *dir* recursively and convert every .webp file to .jpg
    using the external `dwebp` tool, deleting each converted source file.

    Paths are built with os.path and deletion uses os.remove, so this
    works on both Windows and Linux; the old version re-joined an already
    joined path (only correct for absolute Windows paths), spliced
    backslashes by hand, and shelled out to the non-portable `rm`.
    """
    if not os.path.isdir(dir):
        return
    for entry in os.listdir(dir):
        full_path = os.path.join(dir, entry)
        if os.path.isdir(full_path):
            convertWebp2jpgInDirectory(full_path)
        elif full_path.endswith(".webp"):
            # Same basename, .jpg extension, same directory.
            jpg_path = os.path.splitext(full_path)[0] + '.jpg'
            commandline = 'dwebp "%s" -o "%s"' % (full_path, jpg_path)
            os.system(commandline)
            print(full_path + " ------> conversion succeeded")
            os.remove(full_path)  # portable replacement for `rm`

if __name__ == '__main__':
    pass
    # download_files()
    # convertWebp2jpgInDirectory("D:/Youtube")
    # replace('D:/Youtube')
--------------------------------------------------------------------------------
/src/early_views.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 |
def early_views(n=5, tz_offset=19800):
    """Print the *n* videos that were watched soonest after publication.

    n:         number of rows to print.
    tz_offset: seconds between the watch timestamps and the published
               timestamps (UTC).  Defaults to 19800 s (IST, UTC+5:30),
               which was previously hard-coded.
    """
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()
    # diff = watched epoch - published epoch; a row only makes sense when
    # the watch happened after publication once the timezone gap is removed.
    cur.execute(
        "SELECT video_history.Video_ID, video_history.epoch - tb_videos.epoch As diff,"
        " video_history.epoch, tb_videos.epoch, tb_videos.Video_title, tb_videos.epoch, Watched_at"
        " FROM video_history"
        " LEFT OUTER JOIN tb_videos on tb_videos.Video_id = video_history.Video_ID"
        " WHERE (diff - ?) > 0 GROUP BY video_history.Video_ID ORDER BY diff ASC ;",
        (tz_offset,))
    results = cur.fetchmany(n)
    print("Video ID", " Diff in Min", "\t", "Published AT(UTC)", " Watched AT (IST)", "\tVideo Title")
    print("-------------------------------------------------------------------------------------------------------")
    for result in results:
        link = result[0]
        # Minutes between watch and publish, printed as whole minutes plus a
        # two-decimal fractional part (e.g. "   503.33").
        minutes = (int(result[1]) - tz_offset) / 60
        whole = "{:6d}".format(int(minutes // 1))
        frac = "{:.2f}".format(minutes % 1).replace('0.', '.')
        print(link, '\t', whole + frac, '\t', result[2], '\t', result[3], '\t', result[4])
    conn.close()

if __name__ == "__main__":
    pass
--------------------------------------------------------------------------------
/src/entire_channel.py:
--------------------------------------------------------------------------------
1 | from src.get_channel_playlists import get_channel_playlists
2 | from src.get_channel_details import get_channel_details, get_channel_length
3 | from src.get_playlist_videos import get_playlist_videos
4 | from src.get_channel_videos import get_channel_videos
5 |
def entire_channel(youtube, ch_id):
    """Scrape a whole channel: its details, every playlist, the uploads
    that belong to no playlist, and finally the aggregate duration.

    youtube: an authenticated googleapiclient YouTube service object.
    ch_id:   the channel's ID string.
    """
    ec = True  # "entire channel" mode for the downstream helpers
    get_channel_details(youtube, ch_id, ec=ec)
    playlists_list = get_channel_playlists(youtube, ch_id)
    print('\nThere are ', len(playlists_list), ' original/imported playlists\n')
    for count, playlist in enumerate(playlists_list, start=1):
        print('\nParsing playlist ', count, ' \\ ', len(playlists_list))
        try:
            get_playlist_videos(youtube, playlist, ec=ec, ch_id=ch_id)
        except Exception:  # was a bare except: keep going past one bad playlist
            print("Error getting Playlist :", playlist)
    get_channel_videos(youtube, ch_id)
    get_channel_length(ch_id)
--------------------------------------------------------------------------------
/src/get_api_key.py:
--------------------------------------------------------------------------------
1 | from googleapiclient.discovery import build
2 | from googleapiclient.errors import HttpError
3 | from httplib2 import ServerNotFoundError
4 | from google.auth.exceptions import DefaultCredentialsError
5 | '''
6 | Get the API key and save it in a text file named, key.txt in parent folder.
7 | The method to get a youtube API key is well illustrated in the Youtube Video in the README page.
8 | '''
class api_key():
    """Builds and holds a YouTube Data API v3 service object.

    The API key is read from 'key.txt' in the working directory; the
    method to obtain a key is illustrated in the README.
    """
    def __init__(self):
        # The googleapiclient service object; None until get_api_key() succeeds.
        self.youtube = None

    def get_api_key(self):
        """Read key.txt and build the YouTube service, reporting each
        common failure (bad key, no network, blank or missing file) on stdout."""
        try:
            with open('key.txt') as key_file:
                # strip() so a trailing newline or stray whitespace in
                # key.txt does not invalidate an otherwise correct key.
                api_key = key_file.read().strip()
            youtube = build('youtube', 'v3', developerKey=api_key)
            self.youtube = youtube

        except HttpError:
            print("\nAPI Key is wrong")
            print("Please recheck the API key or generate a new key.\nThen modify the 'key.txt' file with new Key\n")

        except ServerNotFoundError:
            print("\nUnable to connect to internet...")
            print("Please Check Your Internet Connection.\n")

        except DefaultCredentialsError:
            print("\n'key.txt' is Blank.")
            print("Please save your API key there and then continue.\n")

        except FileNotFoundError:
            print("\nNo such file: 'key.txt'")
            print("Please create a file named 'key.txt' and place your Youtube API key in it.\n")

        except Exception as e:
            print(e)
            print("Oops!", e.__class__, "occurred.")

    def get_youtube(self):
        """Return the built service object, or None if get_api_key() has
        not run successfully."""
        return self.youtube

if __name__ == "__main__":
    youtube_instance = api_key()
    youtube_instance.get_api_key()
    youtube = youtube_instance.get_youtube()
    print(youtube)
--------------------------------------------------------------------------------
/src/get_channel_details.py:
--------------------------------------------------------------------------------
1 | import sqlite3,datetime
2 | from src.get_channel_playlists import get_channel_playlists
3 | import sys
4 |
def get_channel_details(youtube,channel_id,single=False,playlistID='',ec=False):
    """Fetch a channel's snippet/statistics from the API and upsert it
    into tb_channels, then refresh its playlists.

    youtube:    googleapiclient YouTube service object.
    channel_id: channel to fetch.
    single:     forwarded to get_channel_playlists — only playlistID is written.
    playlistID: forwarded to get_channel_playlists together with single.
    ec:         True when called from an entire-channel scrape; controls
                the Channel_last_Scraped timestamp written.

    Side effects: writes tb_channels, and calls sys.exit() when the
    channel ID is invalid or the channel turns out to be deleted.
    """

    request = youtube.channels().list(part="snippet,statistics",
        id=channel_id
        ).execute()

    # print(request['items'][0])
    Channel_Id = channel_id
    # flag1: the API returned data for this channel (it still exists).
    # flag2: the channel is already present in tb_channels.
    flag1 = True
    flag2 = True
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()
    try:
        Channel_title = request['items'][0]['snippet']['title']
    except:
        flag1 = False
    try:
        cur.execute("SELECT Channel_ID from tb_channels WHERE Channel_ID = ? ",(Channel_Id,))
        temp = cur.fetchone()
        if temp is None:
            flag2 = False
    except:
        flag2 = False
    # flag3: the Is_Deleted marker previously stored for this channel (or None).
    cur.execute("SELECT Is_Deleted from tb_channels WHERE Channel_ID = ? ",(Channel_Id,))
    flag3 = cur.fetchone()
    if flag3 is None:
        pass
    else:
        flag3 = flag3[0]
    # Unknown to both the API and the local DB: nothing we can do.
    if flag1 == False and flag2 == False:
        print("Channel ID not valid")
        sys.exit()
    # Known locally and already marked deleted.
    if flag1 == False and flag2 == True and flag3 == 1:
        print("Channel was already Deleted")
        conn.commit() # Push the data into database
        conn.close()
        sys.exit()
    # Known locally but gone from YouTube: record the deletion now.
    if flag1 == False and flag2 == True and flag3 == 0:
        cur.execute("SELECT Channel_Id from tb_channels")
        cur.execute("UPDATE tb_channels SET Is_Deleted = ? WHERE Channel_ID = ? ",(1,Channel_Id))
        cur.execute("UPDATE tb_channels SET Auto_Update = ? WHERE Channel_ID = ? ",(0,Channel_Id))
        print("Channel is Deleted and now updated in Database")
        conn.commit() # Push the data into database
        conn.close()
        sys.exit()

    Description = request['items'][0]['snippet']['description']
    Published_At = request['items'][0]['snippet']['publishedAt']
    try:
        Country = request['items'][0]['snippet']['country']
    except:
        Country = None  # country is optional in the API response
    View_Count = request['items'][0]['statistics']['viewCount']
    try:
        Subscriber_Count = request['items'][0]['statistics']['subscriberCount']
    except:
        Subscriber_Count = None  # hidden subscriber counts are omitted by the API
    Video_Count = request['items'][0]['statistics']['videoCount']
    if ec == False:
        Channel_last_Scraped = 'Never'
    else:
        Channel_last_Scraped = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    # Carry previously recorded download bookkeeping across the upsert so
    # the INSERT OR REPLACE below does not wipe it.
    cur.execute("SELECT Downloaded_Videos FROM tb_channels WHERE Channel_ID = ?" ,(Channel_Id,))
    temp = cur.fetchone()
    try:
        temp = temp[0]
        if temp > 0:
            Downloaded_Videos = temp
        else:
            Downloaded_Videos = 0
    except:
        Downloaded_Videos = 0
    cur.execute("SELECT Folder_Size_GB FROM tb_channels WHERE Channel_ID = ?" ,(Channel_Id,))
    temp = cur.fetchone()
    try:
        temp = temp[0]
        if temp > 0:
            Folder_Size_GB = temp
        else:
            Folder_Size_GB = 0
    except:
        Folder_Size_GB = 0

    params = (Channel_Id,Channel_title,Published_At,Country,View_Count,Subscriber_Count,Video_Count,0,0,'First, Scrape Entire Channel',0,0,Downloaded_Videos,Folder_Size_GB)

    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()
    cur.execute("INSERT OR REPLACE INTO tb_channels VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?,?,?, ?,?, 'Never',1,'')", params)
    conn.commit() # Push the data into database
    conn.close()
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()
    # Timestamp and description are written separately after the upsert.
    cur.execute("UPDATE tb_channels SET Channel_last_Scraped = ? WHERE Channel_ID = ? ",(Channel_last_Scraped,Channel_Id))
    cur.execute("UPDATE tb_channels SET Description = ? WHERE Channel_ID = ? ",(Description,Channel_Id))
    conn.commit() # Push the data into database
    conn.close()
    get_channel_playlists(youtube,Channel_Id,single,playlistID)
103 |
def get_channel_length(Channel_Id):
    """Refresh a channel's aggregate stats in tb_channels: total duration
    (both seconds and a human-readable string) and the count of its
    deleted videos, all derived from tb_videos."""
    connection = sqlite3.connect('youtube.db')
    cursor = connection.cursor()

    # Total runtime of every stored video for this channel.
    cursor.execute("SELECT SUM(video_seconds) FROM tb_videos WHERE Channel_ID = ? ", (Channel_Id,))
    total_seconds = cursor.fetchone()[0]
    if total_seconds is None:
        total_seconds = 0 # For channels with 0 Original Videos (e.g. Hasan Minaj)
    readable = str(datetime.timedelta(seconds = total_seconds))
    cursor.execute("UPDATE tb_channels SET Duration_in_Text = ? WHERE Channel_ID = ? ", (readable, Channel_Id))
    cursor.execute("UPDATE tb_channels SET Channel_Duration = ? WHERE Channel_ID = ? ", (total_seconds, Channel_Id))

    # How many of the channel's videos have been deleted from YouTube.
    cursor.execute("SELECT COUNT(Video_ID) FROM tb_videos WHERE Is_Deleted = ? AND Channel_ID = ? ", (1, Channel_Id))
    deleted_count = cursor.fetchone()[0]
    cursor.execute("UPDATE tb_channels SET Deleted_Videos = ? WHERE Channel_ID = ? ", (deleted_count, Channel_Id))

    connection.commit()
    connection.close()

if __name__ == "__main__":
    pass
--------------------------------------------------------------------------------
/src/get_channel_id.py:
--------------------------------------------------------------------------------
1 | from get_api_key import api_key
2 | import argparse
3 | import os
4 |
# CLI for running this module as a script: resolve a channel's legacy
# username to its channel ID.
# NOTE(review): parse_args() runs at import time, so importing this module
# without the positional argument exits with a usage error — confirm the
# intent before moving this under the __main__ guard.
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,\
    description='Get CHannel ID for CHannel User Name, Hopefully!!!')
parser.add_argument("User", help='Enter Channel User Name')
args = parser.parse_args()
9 |
10 |
11 |
def get_channel_id(youtube,ch_name):
    """Look up a channel by its legacy username; print and return its ID.

    Prints a human-friendly subscriber count and the channel ID on
    success.  Prints an error message and returns None when the API
    response contains no matching channel.
    """
    response = youtube.channels().list(
        part="snippet,contentDetails,statistics",
        forUsername=ch_name
    ).execute()
    try:
        subscribers = int(response['items'][0]['statistics']['subscriberCount'])
        # Render the count with M/K unit suffixes.
        if subscribers > 1000000:
            sub_count = str(subscribers / 1000000) + 'M Subscribers'
        elif subscribers > 1000:
            sub_count = str(subscribers / 1000) + 'K Subscribers'
        else:
            sub_count = str(subscribers) + ' Subscribers'
        ch_id = response['items'][0]['id']

        print(" ")
        print(sub_count)
        print(ch_id)
        return ch_id
    except KeyError:
        # No 'items'/'statistics' in the response — unknown username.
        print(" ")
        print(" Error : Channel not Found ")
        print(" ")

if __name__ == "__main__":
    youtube_instance = api_key()
    youtube_instance.get_api_key()
    youtube = youtube_instance.get_youtube()
    get_channel_id(youtube,args.User)
--------------------------------------------------------------------------------
/src/get_channel_playlists.py:
--------------------------------------------------------------------------------
1 | import sqlite3, time
2 |
def get_channel_playlists(youtube,channel_id,single=False,playlistID=''):
    """Upsert every playlist of *channel_id* into tb_playlists and return
    the deduplicated list of playlist IDs found.

    single/playlistID: when single is True only the playlist whose ID
    equals playlistID is written (all IDs are still collected/returned).

    Bookkeeping columns the scraper does not own (Is_Seen, Worth,
    Downloaded_Videos, Folder_Size_GB) are carried over from any existing
    row so the INSERT OR REPLACE does not erase them.
    """
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()

    def _carry_over(column, playlist_id, as_flag):
        """Previously stored bookkeeping value for playlist_id, or 0.

        as_flag=True returns 1 only when the stored value is exactly 1;
        otherwise the stored value is returned when positive.  `column`
        is always one of four fixed literals, never user input.
        """
        try:
            cur.execute("SELECT %s FROM tb_playlists WHERE Playlist_ID = ?" % column,
                        (playlist_id,))
            value = int(cur.fetchone()[0])
            if as_flag:
                return 1 if value == 1 else 0
            return value if value > 0 else 0
        except Exception:
            return 0

    playlist_ids = []
    next_page_token = None

    while 1:
        res = youtube.playlists().list(part="snippet,contentDetails",
                                       channelId=channel_id,
                                       pageToken=next_page_token,
                                       maxResults=50
                                       ).execute()
        next_page_token = res.get('nextPageToken')

        # Process only this page's items; the old code re-walked the whole
        # accumulated list on every page, redoing earlier playlists
        # (accidental quadratic work and duplicate DB writes).
        for playlist in res['items']:
            Playlist_ID = playlist['id']
            playlist_ids.append(Playlist_ID)
            if single == True and playlist['id'] != playlistID:
                continue

            Playlist_title = playlist['snippet']['title']
            Channel_Id = playlist['snippet']['channelId']
            Channel_Title = playlist['snippet']['channelTitle']
            Published_At = playlist['snippet']['publishedAt']
            Current_Video_Count = playlist['contentDetails']['itemCount']
            # Seconds/duration are filled in later by the playlist scraper.
            Playlist_Seconds = 0
            Playlist_Duration = '0'
            # 0 = not seen, 1 = seen
            Is_Seen = _carry_over("Is_Seen", Playlist_ID, as_flag=True)
            Worth = _carry_over("Worth", Playlist_ID, as_flag=True)
            Downloaded_Videos = _carry_over("Downloaded_Videos", Playlist_ID, as_flag=False)
            Folder_Size_GB = _carry_over("Folder_Size_GB", Playlist_ID, as_flag=False)

            params = (Playlist_ID,Playlist_title,Channel_Id,Channel_Title,Published_At,Current_Video_Count,Playlist_Seconds,Playlist_Duration,Is_Seen,Worth,0,0,Downloaded_Videos,Folder_Size_GB)
            cur.execute("INSERT OR REPLACE INTO tb_playlists VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,?, 0, 1)", params)
            last_time = time.time()
            cur.execute("UPDATE tb_playlists SET Playlist_last_Scraped = ? WHERE Playlist_ID = ? ",(last_time,Playlist_ID))

        if next_page_token is None:
            break

    # Deduplicate and record the playlist count on the channel row.
    playlist_ids = list(set(playlist_ids))
    cur.execute("UPDATE tb_channels SET Playlist_Count = ? WHERE Channel_ID = ? ",(len(playlist_ids),channel_id))

    conn.commit() # Push the data into database
    conn.close()

    return playlist_ids

if __name__ == "__main__":
    pass
--------------------------------------------------------------------------------
/src/get_channel_videos.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 |
3 | from src.get_video_stats import get_videos_stats
4 |
def get_channel_videos(youtube,channel_id):
    """Collect every upload of *channel_id* that is not already stored
    with a playlist and hand the new IDs to get_videos_stats.

    Uses the channel's special "uploads" playlist to enumerate videos.
    """
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()

    res = youtube.channels().list(id=channel_id,
                                  part='contentDetails').execute()

    playlist_id = res['items'][0]['contentDetails']['relatedPlaylists']['uploads']

    videos = []
    next_page_token = None

    try:
        while 1:
            res = youtube.playlistItems().list(playlistId=playlist_id,
                                               part='snippet',
                                               maxResults=50,
                                               pageToken=next_page_token).execute()
            videos += res['items']
            next_page_token = res.get('nextPageToken')
            if next_page_token is None:
                break
        # Build the ID list once after paging; the old code re-mapped the
        # whole accumulated list on every page (accidental quadratic work).
        video_ids = list(map(lambda x: x['snippet']['resourceId']['videoId'], videos))
    except Exception:  # was a bare except
        print("Channel has no Original Videos")
        video_ids = []

    # Videos already attached to a playlist in the DB do not need re-parsing.
    CVids = []
    cur.execute("SELECT Video_ID FROM tb_videos WHERE Channel_ID=? AND Playlist_ID IS NOT NULL",(channel_id,))
    for row in cur.fetchall():
        CVids.append(row[0])
    new_video_ids = list(set(video_ids) - set(CVids))
    conn.commit() # Push the data into database
    conn.close()

    print('\nParsing ',len(new_video_ids),' videos, which are not in any playlist')
    get_videos_stats(youtube,new_video_ids,flag=1)

if __name__ == "__main__":
    pass
--------------------------------------------------------------------------------
/src/get_playlist_videos.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 | from datetime import timedelta
3 |
4 | from src.get_video_stats import get_videos_stats
5 | from src.get_channel_details import get_channel_details
6 |
def get_playlist_videos(youtube,playlistID,ec=False,ch_id=None):
    """Insert placeholder rows for every video in *playlistID*, let
    get_videos_stats fill in the real data, then refresh the playlist's
    aggregate columns in tb_playlists.

    ec:    True when scraping an entire channel; placeholder rows then
           carry ch_id and a different flag layout.
    ch_id: the owning channel's ID (only used when ec is True).
    """

    # 'skip' means no video in the playlist exposed a channelId — e.g. an
    # empty or fully-removed playlist.
    ch_ID = 'skip'
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()

    videos = []
    next_page_token = None
    video_IDS = []
    # Page through the playlist 50 items at a time.
    while 1:
        res = youtube.playlistItems().list(part="snippet",
            maxResults=50,
            playlistId=playlistID,
            pageToken=next_page_token
            ).execute()
        videos += res['items']
        next_page_token = res.get('nextPageToken')

        if next_page_token is None:
            break


    for video in videos:

        Video_id = video['snippet']['resourceId']['videoId']; video_IDS.append(Video_id)
        try:
            ch_ID = video['snippet']['channelId']
        except:
            ch_ID = 'skip'
        # Placeholder rows only: INSERT OR IGNORE keeps any existing real row.
        if ec == True:
            params = (Video_id,"",0,0,ch_id,None,None,0,ch_id,'',0,0,0,0,0,'',0,0,1,0)
            cur.execute("INSERT OR IGNORE INTO tb_videos VALUES (?, ?, ?,? ,?, ?, ?, ?,?, ?,?,?,?,?,?,?,?,?,?,?)", params)
        else:
            params = (Video_id,"",0,0,"","","")
            cur.execute("INSERT OR IGNORE INTO tb_videos VALUES (?, ?, ?,? ,?, ?, ?, 0,'', '',0,0,0,0,0,'',0,0,0,0)", params)


    conn.commit() # Push the data into database
    conn.close()

    if ch_ID == 'skip':
        # No usable videos: mark the playlist removed when it is empty.
        conn = sqlite3.connect('youtube.db')
        cur = conn.cursor()
        cur.execute("SELECT Current_Video_Count FROM tb_playlists WHERE playlist_ID = ? ",(playlistID,))
        num = cur.fetchone()
        num=num[0]
        print(num)
        if num == 0:
            cur.execute("UPDATE tb_playlists SET Is_Removed = ? WHERE playlist_ID = ? ",(1,playlistID))
        conn.commit() # Push the data into database
        conn.close()
        return 0
    else:
        # Outside an entire-channel scrape, make sure the owning channel
        # has its details row before the videos are parsed.
        if ec == False:
            get_channel_details(youtube,ch_ID,True,playlistID)

    Playlist_Seconds,num_new = get_videos_stats(youtube,video_IDS,1,playlistID)
    print('Videos in this playlist =',num_new)
    Playlist_Duration = str(timedelta(seconds = Playlist_Seconds))
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()

    # Refresh the playlist's aggregate columns from the freshly parsed data.
    cur.execute("SELECT Current_Video_Count FROM tb_playlists WHERE playlist_ID = ? ",(playlistID,))
    num = cur.fetchone()
    num=num[0]
    if num != num_new:
        cur.execute("UPDATE tb_playlists SET Current_Video_Count = ? WHERE playlist_ID = ? ",(num_new,playlistID))

    cur.execute("UPDATE tb_playlists SET Playlist_Seconds = ? WHERE playlist_ID = ? ",(Playlist_Seconds,playlistID))
    cur.execute("UPDATE tb_playlists SET Playlist_Duration = ? WHERE playlist_ID = ? ",(Playlist_Duration,playlistID))
    cur.execute("SELECT COUNT(Video_ID) FROM tb_videos WHERE Is_Deleted = ? AND playlist_ID = ? ",(1,playlistID))
    num = cur.fetchone()
    num=num[0]
    cur.execute("UPDATE tb_playlists SET Deleted_Videos = ? WHERE playlist_ID = ? ",(num,playlistID))
    conn.commit() # Push the data into database
    conn.close()

if __name__ == "__main__":
    pass
--------------------------------------------------------------------------------
/src/get_video_stats.py:
--------------------------------------------------------------------------------
1 | from datetime import timedelta
2 | import sqlite3, time
3 | from os import path
4 | import sys
5 |
def get_videos_stats(youtube,video_ids,flag=1,playlistID = None):
    """Fetch statistics/contentDetails for *video_ids* (batched 50 per
    API call) and upsert them into tb_videos.

    flag: 1 -> INSERT OR REPLACE, 2 -> INSERT OR IGNORE; internally set
          to 3 for an item that comes back without an id (marked deleted)
          and then restored.
    playlistID: when given, stamped on every row, and the playlist's
          total seconds are accumulated.

    Returns (total_seconds, distinct_video_count) — but only when
    total_seconds > 0; otherwise returns None implicitly, so callers that
    unpack the result assume at least one timed video.
    """
    oflag = flag  # remembered so the per-video 'deleted' override can be undone
    if not path.exists('youtube.db'):
        print("Please Create the database First")
        sys.exit()
    else:
        pass

    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()
    count1 = 0
    stats = []
    tot_len = 0
    # The API accepts at most 50 IDs per videos().list call.
    for i in range(0, len(video_ids), 50):
        res = youtube.videos().list(id=','.join(video_ids[i:i+50]),
                   part='snippet,statistics,contentDetails').execute()
        stats += res['items']

    new_ids = []
    for video in stats:
        count1 += 1
        try:
            Video_id = video['id']
        except:
            Video_id = ''
            flag = 3  # item came back without an id: treat as deleted
        new_ids.append(Video_id)
        Video_title = video['snippet']['title']
        Upload_playlistId = video['snippet']['channelId']

        if playlistID is not None:
            Playlist_Id = playlistID # When call is from a playlist
        else:
            # Keep whatever playlist the video was already linked to.
            cur.execute("SELECT Playlist_ID FROM tb_videos WHERE Video_ID = ?" ,(Video_id,))
            result = cur.fetchone()
            if result is None:
                Playlist_Id = None
            else:
                if type(result) is tuple:
                    Playlist_Id = result[0]
                elif type(result) is str:
                    Playlist_Id = result
                else:
                    Playlist_Id = None
        Published_At = video['snippet']['publishedAt']
        date_format = "%Y-%m-%dT%H:%M:%SZ"
        epoch = float(time.mktime(time.strptime(Published_At, date_format)))
        Channel_Id = video['snippet']['channelId']
        Channel_Title = video['snippet']['channelTitle']
        try:
            View_Count = video['statistics']['viewCount']
        except:
            View_Count = 1111  # sentinel for videos with hidden view counts
        # Skip the row when the stored view count is higher than the fresh
        # one.  NOTE(review): View_Count comes back as a string; if the
        # stored value is numeric, the < raises TypeError, which the
        # except silently swallows — confirm the intended comparison.
        cur.execute("SELECT View_Count FROM tb_videos WHERE Video_ID = ?" ,(Video_id,))
        temp = cur.fetchone()
        try:
            temp = temp[0]
            if View_Count < temp:
                continue
        except:
            pass
        try:
            Like_Count = video['statistics']['likeCount']
        except:
            Like_Count = 0
        try:
            Dislike_Count = video['statistics']['dislikeCount']
        except:
            Dislike_Count = 0
        try:
            Upvote_Ratio = round(((int(Like_Count)/(int(Like_Count)+(int(Dislike_Count))))*100),3)
        except:
            Upvote_Ratio = 0  # zero likes+dislikes -> division by zero
        try:
            Comment_Count = video['statistics']['commentCount']
        except:
            Comment_Count = 0
        try:
            # Convert an ISO-8601 duration like PT1H2M3S into HH:MM:SS and
            # total seconds; each matched component is zero-padded and
            # stripped from the working string.
            Duration = str(video['contentDetails']['duration'])
            Duration = Duration.replace('PT','')
            hh=mm=ss = '00'
            if Duration.find('H') != -1:
                hh = Duration.split('H')[0]
                temp = hh+'H'
                if len(hh) == 1:
                    hh = '0'+hh
                Duration = Duration.replace(temp,'')
            if Duration.find('M') != -1:
                mm = Duration.split('M')[0]
                temp = mm+'M'
                if len(mm) == 1:
                    mm = '0'+mm
                Duration = Duration.replace(temp,'')
            if Duration.find('S') != -1:
                ss = Duration.split('S')[0]
                if len(ss) == 1:
                    ss = '0'+ss
            Duration = (hh+':'+mm+':'+ss)
            video_seconds = timedelta(hours = int(hh),
                        minutes= int(mm),
                        seconds= int(ss)).total_seconds()
            # if playlistID is not None:
            tot_len += video_seconds
        except:
            Duration = '0'
            video_seconds = 0

        try:
            Is_Licensed = video['contentDetails']['licensedContent']
        except:
            Is_Licensed = 0
        # Carry over user bookkeeping so the upsert does not reset it.
        cur.execute("SELECT Is_Seen FROM tb_videos WHERE Video_ID = ?" ,(Video_id,))
        temp = cur.fetchone()
        try:
            temp = temp[0]
            if temp == 1:
                Is_Seen = 1
            else:
                Is_Seen = 0
        except:
            Is_Seen = 0
        # 0 = not seen 1 = seen
        cur.execute("SELECT Worth FROM tb_videos WHERE Video_ID = ?" ,(Video_id,))
        temp = cur.fetchone()
        try:
            temp = temp[0]
            if temp == 1:
                Worth = 1
            else:
                Worth = 0
        except:
            Worth = 0

        cur.execute("SELECT Is_Downloaded FROM tb_videos WHERE Video_ID = ?" ,(Video_id,))
        temp = cur.fetchone()
        try:
            temp = temp[0]
            if temp == 1:
                Is_Downloaded = 1
            else:
                Is_Downloaded = 0
        except:
            Is_Downloaded = 0
        Is_Deleted = 0
        if flag == 1 or flag == 2:
            Is_Deleted = 0
        elif flag == 3:
            Is_Deleted = 1
            print(Video_id,' is deleted')
            cur.execute("UPDATE tb_videos SET IS_Deleted = 1 WHERE Video_ID = ?",(Video_id,))
        flag = oflag  # restore the caller's insert mode for later videos
        params = (Video_id,Video_title,Is_Seen,Worth,Upload_playlistId,Playlist_Id,Published_At,epoch,Channel_Id,Channel_Title,View_Count,Like_Count,Dislike_Count,Upvote_Ratio,Comment_Count,Duration,video_seconds,Is_Licensed,Is_Deleted,Is_Downloaded)
        if flag == 1:
            cur.execute("INSERT OR REPLACE INTO tb_videos VALUES (?, ?, ?, ?, ?, ?, ? ,? ,? ,? ,? ,? , ?, ?, ?, ?, ?, ?, ?, ?)", params)
        elif flag == 2:
            cur.execute("INSERT OR IGNORE INTO tb_videos VALUES (?, ?, ?, ?, ?, ?, ? ,? ,? ,? ,? ,? , ?, ?, ?, ?, ?, ?, ?, ?)", params)
    conn.commit()
    conn.close()

    # IDs that were requested but not returned by the API are gone from
    # YouTube: insert a stub row for each and flag it deleted.
    video_ids = set(video_ids)
    new_ids = set(new_ids)
    num_new = len(new_ids)
    diff = video_ids-new_ids
    if len(diff) > 0:
        conn = sqlite3.connect('youtube.db')
        cur = conn.cursor()
        for item in diff:
            print(item,' not available')
            try:
                # NOTE(review): Channel_Id/Channel_Title here are whatever
                # the last processed video left behind, and are unbound
                # when the API returned nothing at all — the bare except
                # hides that case.
                params = (item,'Not Available',0,0,Channel_Id,playlistID,'','',Channel_Id,Channel_Title,'','','','','','','','',1,0)
                cur.execute("INSERT OR IGNORE INTO tb_videos VALUES (?, ?, ?, ?, ?, ?, ? ,? ,? ,? ,? ,? , ?, ?, ?, ?, ?, ?, ?, ?)", params)
                cur.execute("UPDATE tb_videos SET IS_Deleted = 1 WHERE Video_ID = ?",(item,))
            except:
                pass
        conn.commit()
        conn.close()
    if tot_len > 0:
        return tot_len,num_new

if __name__ == "__main__":
    pass
--------------------------------------------------------------------------------
/src/import_downloaded_items.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | import subprocess, os,re
3 | #SELECT * FROM tb_videos WHERE Video_ID IN (SELECT Video_ID FROM yt_downloaded) ORDER BY Is_Seen
4 | # Sanity Check
5 | from os import listdir
6 | from os.path import isfile, join
7 | import sqlite3
8 |
9 | from get_api_key import api_key
10 | from get_video_stats import get_videos_stats
11 |
def update_local(vid_path):
    """Probe *vid_path* with ffmpeg and return its media properties.

    Returns (Resolution, raw_size, size_MB, fps, bitrate, Audio_Type,
    Frequency, Channels).  Fields that cannot be parsed from the ffmpeg
    output fall back to '' / 0 instead of raising (the old version died
    with an UnboundLocalError on unparseable files).  Regex patterns are
    now raw strings, fixing invalid-escape-sequence warnings.
    """
    # Defaults so a file ffmpeg cannot analyse still yields a full tuple.
    Resolution = ''
    fps = 0
    bitrate = 0
    Audio_Type = ''
    Frequency = 0
    Channels = ''

    quoted = '"' + vid_path + '"'
    command = "./ffmpeg -i " + quoted + " -hide_banner"
    try:
        # ffmpeg writes the stream description to stderr; capture all of it.
        with open('log1.txt', "w", encoding='utf-8') as outfile:  # latin-1
            subprocess.run(command, stderr=subprocess.STDOUT, stdout=outfile)
    except Exception as e:
        print(e)
    with open('log1.txt', "r", encoding='utf-8') as fhand:
        for line in fhand:
            line = line.lstrip()
            if line[0:11] == 'Stream #0:0':
                # Video stream: resolution and frame rate.
                result = re.findall(r'\d+x\d+', line)
                if result:
                    Resolution = result[0]
                result = re.findall(r'[0-9.]+ fps', line)
                try:
                    fps = result[0].strip(' fps')
                except Exception:
                    fps = 0
            if line.startswith('Duration:'):
                result = re.findall(r'\d+ ', line)
                if result:
                    bitrate = result[0]
            if line.startswith('Stream #0:1'):
                # Audio stream: codec name, sample rate, channel layout.
                result = re.findall(r'Audio: [a-zA-Z]+,', line)
                if result:
                    Audio_Type = result[0].strip(',')[7:]
                result = re.findall(r'\d+ Hz', line)
                if result:
                    Frequency = result[0].strip(' Hz')
                result = re.findall(r'Hz, \w+,', line)
                if result:
                    Channels = result[0].strip('Hz ,')
    raw_size = Path(vid_path).stat().st_size
    size = round(raw_size / (1024 * 1024), 3)
    return (Resolution, raw_size, size, fps, bitrate, Audio_Type, Frequency, Channels)
51 |
def import_vids(mypath='D:\\Youtube1', db_path='C:\\Users\\Sambit\\Desktop\\Projects\\Youtube\\Youtube_Scraper\\youtube.db'):
    """Walk *mypath* for video files and register any new ones in the
    yt_downloaded table of the database at *db_path*.

    Both locations were hard-coded; they are now parameters whose defaults
    preserve the old behavior.  Video IDs listed in skip_files.txt are
    ignored.  The 11-character YouTube ID is assumed to sit immediately
    before the 3-character extension ("..._<id>.mkv") — TODO confirm for
    4-character extensions like .mpeg.
    """
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    # r=root, d=directories, f = files
    for r, d, f in os.walk(mypath):
        for file in f:
            if file.endswith(("mp4", "mkv", "flv", "wmv", "avi", "mpg", "mpeg")):
                vid_path = os.path.join(r, file)
                vid_id = vid_path[-15:-4]
                # was `as f`, which shadowed the walk's file list
                with open("skip_files.txt") as skip_fp:
                    if vid_id in skip_fp.read():
                        continue
                vid_type = vid_path[-3:]
                cur.execute("SELECT Video_ID FROM yt_downloaded WHERE Video_ID = ?", (vid_id,))
                if cur.fetchone() is None:
                    # Only probe files we have not catalogued yet.
                    Resolution, raw_size, size, fps, bitrate, Audio_Type, Frequency, Channels = update_local(vid_path)
                    params = (vid_id, Resolution, raw_size, size, vid_type, fps, bitrate, Audio_Type, Frequency, Channels, 0)
                    cur.execute("INSERT OR REPLACE INTO yt_downloaded VALUES (?,?,?,?,?,?,?,?,?,?,?)", params)

    conn.commit()
    conn.close()
74 |
def update_vids():
    """Push locally downloaded videos into the main tb_videos table.

    Processes yt_downloaded rows in batches of 50: fetches their stats from
    the YouTube API (get_videos_stats inserts them into tb_videos), then flags
    each as downloaded/seen/worth in tb_videos.
    """
    # One canonical DB path for the whole function (see BUG FIX note below).
    db_path = 'C:\\Users\\Sambit\\Desktop\\Projects\\Youtube\\Youtube_Scraper\\youtube.db'

    def is_in_main():
        # Sync flag: mark yt_downloaded rows whose ID already exists in tb_videos.
        # Simplified from a redundant double-nested subquery — the UPDATE already
        # targets yt_downloaded rows, so the inner self-select added nothing.
        conn = sqlite3.connect(db_path)
        cur = conn.cursor()
        cur.execute("UPDATE yt_downloaded SET Is_In_Main = 1 WHERE Video_ID IN "
                    "(SELECT Video_ID FROM tb_videos)")
        conn.commit()
        conn.close()
    # is_in_main()
    for i in range(2000):  # hard cap on batches (2000 * 50 = 100k videos)
        conn = sqlite3.connect(db_path)
        cur = conn.cursor()
        cur.execute("SELECT Count(*) FROM yt_downloaded")
        tot = cur.fetchone()
        cur.execute("SELECT Video_ID FROM yt_downloaded WHERE Is_In_Main = 0 LIMIT 50")
        temp = cur.fetchall()
        if len(temp) < 1:
            print("All Videos (locally downloaded) are now in main table tb_videos")
            conn.close()
            break
        result = []
        for item in temp:
            # Flag optimistically before the API call so the batch is not re-fetched.
            cur.execute("UPDATE yt_downloaded SET Is_In_Main = 1 WHERE Video_ID = ?", (item[0],))
            result.append(item[0])

        conn.commit()
        conn.close()

        print('Parsing Downloaded Videos :', (i * 50), ' / ', tot[0], end="\r")
        print(' ')
        youtube_instance = api_key()
        youtube_instance.get_api_key()
        youtube = youtube_instance.get_youtube()
        get_videos_stats(youtube, result, 1)
        # BUG FIX: this connection previously used the relative path 'youtube.db',
        # which silently targets a *different* database whenever the script is not
        # run from the project root; use the same absolute path as the rest.
        conn = sqlite3.connect(db_path)
        cur = conn.cursor()
        for item in result:
            print('New Item added successfully :', item)
            # Merged the three separate UPDATE statements into one.
            cur.execute("UPDATE tb_videos SET Is_Downloaded = 1, Is_Seen = 1, Worth = 1 WHERE Video_ID = ?", (item,))
        conn.commit()
        conn.close()
    is_in_main()
118 |
if __name__ == "__main__":
    # Import newly downloaded files first, then push them into the main table.
    import_vids()
    update_vids()
122 |
--------------------------------------------------------------------------------
/src/load_history.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 | import sqlite3, time
3 | from bs4 import BeautifulSoup
4 |
5 | from src.get_video_stats import get_videos_stats
6 | from src.get_api_key import api_key
7 |
8 |
def update_title():
    """Back-fill missing titles in video_history from the tb_videos table."""
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()
    cur.execute("SELECT Video_ID FROM video_history WHERE (Title IS NULL OR Title = '') AND Is_Deleted = 0")
    temp = cur.fetchall()
    for item in temp:
        cur.execute("SELECT Video_title FROM tb_videos WHERE Video_ID = ?", (item[0],))
        tit = cur.fetchone()
        # BUG FIX: a video not (yet) present in tb_videos yields no row; the
        # original crashed here with "'NoneType' object is not subscriptable".
        if tit is not None:
            cur.execute("UPDATE video_history SET Title = ? WHERE Video_ID = ?", (tit[0], item[0]))
    conn.commit()
    conn.close()
def update_deleted():
    """Flag video_history rows whose video no longer exists in tb_videos."""
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()
    # Simplified from the original double-nested subquery: a history entry is
    # "deleted" exactly when its ID is absent from the main tb_videos table.
    cur.execute("UPDATE video_history SET Is_Deleted = 1 WHERE Video_ID NOT IN (SELECT Video_ID FROM tb_videos)")
    conn.commit()
    conn.close()
def update_is_seen():
    """Mark every tb_videos entry that appears in the watch history as seen."""
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()
    # Simplified from the original double-nested subquery: the UPDATE already
    # targets tb_videos, so one membership check against video_history suffices.
    cur.execute("UPDATE tb_videos SET Is_Seen = 1 WHERE Video_ID IN (SELECT Video_ID FROM video_history)")
    conn.commit()
    conn.close()
33 |
def update_is_in_main():
    """Mark history rows whose video already exists in the main tb_videos table."""
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()
    # Simplified from the original double-nested subquery: the UPDATE already
    # targets video_history, so one membership check against tb_videos suffices.
    cur.execute("UPDATE video_history SET Is_in_Main = 1 WHERE Video_ID IN (SELECT Video_ID FROM tb_videos)")
    conn.commit()
    conn.close()
42 |
def update_history(youtube):
    """Fetch API stats for watch-history videos not yet in tb_videos, in batches of 50.

    youtube: an authorized YouTube API client, passed through to get_videos_stats.
    """
    for i in range(2000):  # hard cap on batches (2000 * 50 = 100k videos)
        conn = sqlite3.connect('youtube.db')
        cur = conn.cursor()
        cur.execute("SELECT Count(*) FROM video_history")
        tot = cur.fetchone()
        cur.execute("SELECT Video_ID FROM video_history WHERE Is_in_Main = 0 AND Is_Deleted = 0 LIMIT 50;")
        temp = cur.fetchall()
        # BUG FIX: was `len(temp) < 2`, which silently dropped a final batch of
        # exactly one video; stop only when the batch is empty. Also close the
        # connection before breaking (the original leaked it here).
        if not temp:
            print("All Videos From Watched History are now in main table tb_videos")
            conn.close()
            break
        result = []
        for item in temp:
            # Flag optimistically before the API call so the batch is not re-fetched.
            cur.execute("UPDATE video_history SET Is_in_Main = 1 WHERE Video_ID = ?", (item[0],))
            result.append(item[0])

        conn.commit()
        conn.close()
        print('Parsing Watch History Videos :', (i * 50), ' / ', tot[0], end="\r")
        get_videos_stats(youtube, result, 1)
    update_is_in_main()
64 |
def load_history(res='n'):
    """Parse the Google Takeout watch-history HTML and load it into video_history.

    res: 'y'/'Y' additionally fetches API stats for every new video and
    refreshes the derived Title / Is_Deleted / Is_Seen / Is_in_Main columns.
    """
    count_loc_prog = 0
    with open("takeout/history/watch-history.html",encoding='utf-8') as fp:
        conn = sqlite3.connect('youtube.db')
        cur = conn.cursor()

        soup = BeautifulSoup(fp,'lxml')
        soup = soup.body

        # Each watched entry lives in one of these Takeout content cells.
        videos = soup.find_all("div", {"class": "content-cell mdl-cell mdl-cell--6-col mdl-typography--body-1"})

        print(len(videos))

        for video in videos:
            count_loc_prog += 1
            if count_loc_prog % 500 == 0:
                print('Loading into Database : ',count_loc_prog,' / ',len(videos),end="\r")
            tags = video.find_all('a')

            # Cells with no anchor carry no video link; nothing to record.
            if tags == []:
                continue

            # First anchor's href ends in "...watch?v=<video_id>"; keep the ID.
            V_link = tags[0].get('href')
            V_link = V_link.split('=')[-1]
            br_tags = video.find_all('br')
            for tag in br_tags:
                watched_at = str(tag.next_sibling)
                # Crude timestamp detector: chars [-3:-1] are 'IS' for lines
                # ending in "IST" (the final 'T' is not compared).
                if watched_at[-3:-1] == 'IS':
                    final_time = (watched_at)
                    temp = final_time.replace('IST','+0530')
                    # NOTE(review): time.mktime interprets the struct_time in the
                    # machine's *local* timezone and ignores the parsed %z offset,
                    # so this epoch is only correct on an IST machine -- confirm.
                    epoch = time.mktime(time.strptime(temp, "%b %d, %Y, %I:%M:%S %p %z"))
                    # 6 columns: Video_ID, Title(blank), watched text, epoch,
                    # Is_in_Main=0, Is_Deleted=0. Duplicates are ignored.
                    cur.execute("INSERT OR IGNORE INTO video_history VALUES (?,?,?,?,?,?)", (V_link,'',final_time,epoch,0,0))


        conn.commit() # Push the data into database
        conn.close()
    print("\n Loaded \n")

    if res == 'y' or res == "Y":
        # Optional enrichment pass via the YouTube Data API.
        youtube_instance = api_key()
        youtube_instance.get_api_key()
        youtube = youtube_instance.get_youtube()
        update_history(youtube)
        update_title()
        update_deleted()
        update_is_seen()
        update_is_in_main()
113 |
if __name__ == "__main__":
    # No standalone entry point; import this module and call load_history().
    pass
--------------------------------------------------------------------------------
/src/most_watched.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 |
def most_watched(n=5):
    """Print the n most re-watched videos from the watch history.

    Joins against tb_videos for the title; videos missing from the local
    database are labelled as such.
    """
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()
    cur.execute("SELECT video_history.Video_ID,COUNT(video_history.Video_ID) AS cnt, Video_title FROM video_history \
                LEFT OUTER JOIN tb_videos on tb_videos.Video_ID = video_history.Video_ID \
                GROUP BY video_history.Video_ID ORDER BY cnt DESC;")
    results = cur.fetchmany(n)
    print("\t"," Video Link","\t","\t","\t"," Times Watched","\t","\t"," Video Name")
    print("-------------------------------------------------------------------------------------------------------")
    for result in results:
        Link = "https://www.youtube.com/watch?v="+result[0]
        # A NULL title means the LEFT JOIN found no matching tb_videos row.
        if result[2] is None:
            title = "Video is not available in local database"
        else:
            title = result[2]
        print(Link,'\t',result[1],'\t',title)
    # FIX (minor): removed the needless commit() -- this function only reads.
    conn.close()
21 |
if __name__ == "__main__":
    # No standalone entry point; import this module and call most_watched().
    pass
--------------------------------------------------------------------------------
/src/oldest_videos.py:
--------------------------------------------------------------------------------
1 | from get_api_key import api_key
2 |
3 | import argparse
4 | import os
5 | from datetime import datetime
6 |
# Build an authorized YouTube API client at import time (reads the stored key).
youtube_instance = api_key()
youtube_instance.get_api_key()
youtube = youtube_instance.get_youtube()

# NOTE(review): argparse also runs at import time, so importing this module
# without CLI arguments exits with a usage error -- confirm intended.
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,\
    description='Explore the oldest videos on a Topic',\
    epilog='''Examples \n .\oldest_videos.py tesla \n .\oldest_videos.py "game of thrones" -n 15 -s 2012''')
parser.add_argument("topic", help='Enter the topic')
group2 = parser.add_argument_group()
group2.add_argument('-n','--max_results',type=int, metavar='', default=5, help='The script will display "n" results')
group2.add_argument('-s','--start_year',type=int, metavar='', default=2005, help='By default, it will search from 2005')
group2.add_argument('-e','--end_year',type=int, metavar='', default=2010, help='By default, it will search till 2010')

parser.add_argument('-o','--output', action='store_true', help='output to a File')


args = parser.parse_args()
24 |
25 |
def oldest_videos_on_a_topic(topic,Max_limit,start_yr,end_yr):
    """Search YouTube for the oldest videos on *topic* and print or save up to
    Max_limit of them.

    topic     : search phrase; only results whose title contains it are kept.
    Max_limit : stop after this many matching videos.
    start_yr  : earliest upload year (window starts Jan 1st of this year).
    end_yr    : upload year to search up to (exclusive, Jan 1st).
    """
    global youtube
    if args.output:
        # Open the output file once instead of reopening it per result.
        out = open("old_videos.txt",'w',encoding = 'utf-8')
    else:
        out = None
        print('\n')
        print('Video ID','\t','Upload Date/Time','\t','Video Title')
        print('--------','\t','----------------','\t','-----------')
    limit = 0
    # BUG FIX: the time window previously hard-coded 2005..2010 and silently
    # ignored the start_yr/end_yr arguments, so -s / -e had no effect. (Start
    # now uses Jan 1 rather than the original Apr 1; no YouTube videos predate
    # late April 2005, so the default window is effectively unchanged.)
    start_time = datetime(year=start_yr, month=1, day=1).strftime('%Y-%m-%dT%H:%M:%SZ')
    end_time = datetime(year=end_yr, month=1, day=1).strftime('%Y-%m-%dT%H:%M:%SZ')

    res = youtube.search().list(part='snippet',
                                q=topic,
                                type='video',
                                publishedAfter=start_time,
                                publishedBefore=end_time,
                                maxResults=50).execute()
    try:
        for item in sorted(res['items'], key=lambda x:x['snippet']['publishedAt']):
            # The API HTML-escapes quotes in titles; undo that before matching.
            title = str(item['snippet']['title']).replace('&#39;',"'").replace('&quot;','"')
            if topic.lower() not in title.lower():
                continue
            limit += 1
            publishedAt = datetime.strptime(item['snippet']['publishedAt'], "%Y-%m-%dT%H:%M:%SZ")
            if out is not None:
                out.write(item['id']['videoId']+'\t\t'+str(publishedAt)+'\t\t'+ title )
                out.write('\n')
            else:
                print(item['id']['videoId'],'\t',publishedAt,'\t', title )
            if limit == Max_limit:
                break
    finally:
        if out is not None:
            out.close()

    if args.output:
        print('\nDone! Check the file old_video.txt\n')
    else:
        print('\n')
67 |
if __name__ == "__main__":
    # BUG FIX: this previously prompted for a key and called get_api_key(key),
    # a function that does not exist in scope (NameError). The module-level
    # bootstrap above already built the `youtube` client, so just run the search.
    oldest_videos_on_a_topic(args.topic,args.max_results,args.start_year,args.end_year)
--------------------------------------------------------------------------------
/src/subscriptions.py:
--------------------------------------------------------------------------------
1 | import json
2 |
with open("takeout/subscriptions/subscriptions.json", encoding="utf-8") as f:
    subs = json.load(f)

# Deduplicate (item_count, channel_id, title) triples from the Takeout
# subscriptions export. (Former unused counter `i` removed.)
sub_list = {
    (sub["contentDetails"]["totalItemCount"],
     sub["snippet"]["resourceId"]["channelId"],
     sub["snippet"]["title"])
    for sub in subs
}

# Title-case the channel names and print them sorted alphabetically by name.
new_lst = sorted(
    ((count, chan_id, name.title()) for count, chan_id, name in sub_list),
    key=lambda entry: entry[2],
)
for sub in new_lst:
    print(sub)
--------------------------------------------------------------------------------
/src/vidsPerTime.py:
--------------------------------------------------------------------------------
1 | import sqlite3,datetime,calendar
2 |
3 |
4 |
def absolute_dates():
    """Exploratory helper: inspect the watch-history date range.

    NOTE(review): this function looks unfinished -- `oldest`/`newest` and the
    years/months lists are computed but never used; it is never called by
    this module (only relative_dates() runs).
    """
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()
    cur.execute("SELECT MIN(epoch) from video_history")
    oldest = int(cur.fetchone()[0])
    cur.execute("SELECT MAX(epoch) from video_history")
    newest = int(cur.fetchone()[0])

    years = [2018, 2019, 2020, 2021]
    months = [x for x in range(1, 13)]
    dates = []
    # BUG FIX: calendar.itermonthdates is a method of calendar.Calendar, not a
    # module-level function -- the original raised AttributeError here.
    cal = calendar.Calendar()
    temp = cal.itermonthdates(2021, 1)
    print(cal)
    conn.close()
31 |
def relative_dates():
    """Find and print the 366-day window with the most watch-history entries.

    Window starts slide hourly across 2020-01-01 .. 2020-01-10 (epoch range
    1577836800 .. 1578614400); the winner is reported as UTC datetimes.
    """
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()

    max_res = 0
    # BUG FIX: the best-window bounds are now initialised up front; previously
    # they were only assigned inside the `if`, so a scan where every window was
    # empty crashed with NameError at the datetime conversion below. (The dead
    # `start`/`end` pre-assignments were removed.)
    start_year = 1577836800
    end_year = start_year + 31622399
    for start in range(1577836800, 1578614400, 3600):
        end = start + 31622399  # 366 days minus one second
        cur.execute("SELECT COUNT(Video_ID) from video_history WHERE epoch > ? AND epoch < ?", (start, end))
        count = int(cur.fetchone()[0])

        if count > max_res:
            max_res = count
            start_year = start
            end_year = end
        if start % 10000 == 0:
            print(start)  # coarse progress indicator
    conn.close()
    # Timezone-aware equivalent of the deprecated utcfromtimestamp().replace().
    start_year = datetime.datetime.fromtimestamp(start_year, tz=datetime.timezone.utc)
    end_year = datetime.datetime.fromtimestamp(end_year, tz=datetime.timezone.utc)
    print(max_res, start_year, end_year)
57 |
58 | relative_dates()
59 |
60 |
61 |
--------------------------------------------------------------------------------