├── .gitignore
├── Assets
├── example20.jpg
├── example_0.0.jpg
├── example_0.1.jpg
├── example_0.2.jpg
├── example_0.3.jpg
├── example_0.4.jpg
├── example_0.5.jpg
├── example_0.6.jpg
├── example_1.1.jpg
└── title.jpg
├── LICENSE
├── README.md
├── Samples.md
├── YT_Scrape.py
├── old_history.py
├── requirements.txt
└── src
├── __init__.py
├── create_new.py
├── download_these.py
├── downloading.py
├── early_views.py
├── entire_channel.py
├── get_api_key.py
├── get_channel_details.py
├── get_channel_id.py
├── get_channel_playlists.py
├── get_channel_videos.py
├── get_playlist_videos.py
├── get_video_stats.py
├── import_downloaded_items.py
├── load_history.py
├── most_watched.py
├── oldest_videos.py
├── subscriptions.py
└── vidsPerTime.py
/.gitignore:
--------------------------------------------------------------------------------
1 | venv
2 | youtube.db
3 | youtube1.db
4 | youtube2.db
5 | test.py
6 | test2.py
7 | takeout
8 | __pycache__
9 | *.txt
10 | *.exe
11 | *.log
--------------------------------------------------------------------------------
/Assets/example20.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CriticalHunter/Youtube_Scraper/dca7c71823cb49e014a8cdf08dd8e391ddfbd57f/Assets/example20.jpg
--------------------------------------------------------------------------------
/Assets/example_0.0.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CriticalHunter/Youtube_Scraper/dca7c71823cb49e014a8cdf08dd8e391ddfbd57f/Assets/example_0.0.jpg
--------------------------------------------------------------------------------
/Assets/example_0.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CriticalHunter/Youtube_Scraper/dca7c71823cb49e014a8cdf08dd8e391ddfbd57f/Assets/example_0.1.jpg
--------------------------------------------------------------------------------
/Assets/example_0.2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CriticalHunter/Youtube_Scraper/dca7c71823cb49e014a8cdf08dd8e391ddfbd57f/Assets/example_0.2.jpg
--------------------------------------------------------------------------------
/Assets/example_0.3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CriticalHunter/Youtube_Scraper/dca7c71823cb49e014a8cdf08dd8e391ddfbd57f/Assets/example_0.3.jpg
--------------------------------------------------------------------------------
/Assets/example_0.4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CriticalHunter/Youtube_Scraper/dca7c71823cb49e014a8cdf08dd8e391ddfbd57f/Assets/example_0.4.jpg
--------------------------------------------------------------------------------
/Assets/example_0.5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CriticalHunter/Youtube_Scraper/dca7c71823cb49e014a8cdf08dd8e391ddfbd57f/Assets/example_0.5.jpg
--------------------------------------------------------------------------------
/Assets/example_0.6.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CriticalHunter/Youtube_Scraper/dca7c71823cb49e014a8cdf08dd8e391ddfbd57f/Assets/example_0.6.jpg
--------------------------------------------------------------------------------
/Assets/example_1.1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CriticalHunter/Youtube_Scraper/dca7c71823cb49e014a8cdf08dd8e391ddfbd57f/Assets/example_1.1.jpg
--------------------------------------------------------------------------------
/Assets/title.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CriticalHunter/Youtube_Scraper/dca7c71823cb49e014a8cdf08dd8e391ddfbd57f/Assets/title.jpg
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 T. Sambit Suranjan
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | There was an inactivity of more than 2 years on this project.
2 | I am archiving this due to multiple reasons.
3 | Yet I will create a separate private repo for this and will work on adding new metrics and visualizations although at a slow pace.
4 | If it ends nice will merge that with this repo.
5 |
6 | # Youtube_scrape
7 |
8 |
9 |
10 |
11 |
14 |
15 |
16 | Scrape data about an entire Channel or just a Playlist, using Youtube API. No OAuth is required.
17 |
18 | ## :heavy_check_mark: Features
19 |
20 | Following features are available :
21 |
22 | 
23 |
24 | 1. **create_new** :
25 | 1. It creates a sqlite database to store all data.
26 | 2. Database will be placed in the same folder as the project file, named 'youtube.db'
27 | 3. It will have 4 tables - tb_channel, tb_playlist, tb_videos, video_history
28 | 4. You can use programs like [DB Browser](https://sqlitebrowser.org) , which is lightweight, to view the database.
29 | 2. **Oldest Video on A Topic** :
30 | 1. It is an isolate program, that can be run independently.
31 | 2. It doesn't depend on main code or any database.
32 | 3. **Scrape A Channel**:
33 | 1. Allows scraping of Channel Details and its playlists.
34 | 2. It can also scrape details for each video of that channel.
35 | 1. If this option is not chosen, the playlist table won't have Playlist Duration.
36 | 4. **Scrape A Single Playlist**:
37 | 1. Allows scraping info about a single Playlist and details about all its videos.
38 | 5. **Load Your History**:
39 | 1. Make sure you have downloaded google Takeouts for your account to the PWD.
40 | 2. Make sure you have the following path './takeout/history/watch-history.html'
41 | 3. Option to keep videos of your history on a separate table or integrate them with main table tb_videos
42 | 1. In order to use next features, you have to integrate them.
43 | 6. **Most Watched Video**:
44 | 1. You can list your most watched 'n' videos
45 | 7. **Early Viewed**:
46 | 1. You can list 'n' videos, which you saw earliest after they were uploaded.
47 | 2. There are some discrepancies, as many videos are reuploaded after you have seen them.
48 | 1. Program ignores those
49 | 3. It now only works when you watched it in IST.
50 | 8. **Generate Download List**:
51 | 1. This will create a text file, that will list Youtube URLs that can be downloaded by Youtube-DL or IDM etc.
52 | 2. It will select videos which are marked 'Worth = 1' in the database.
53 | 1. This operation is to be done by the user directly on the database (using DB Browser or such)
54 | 3. There is an option to list videos of a single Channel or from the entire Database.
55 | 4. *Caution* : Once a video is processed by this function, it will be marked 'Is_Downloaded = 1'. Next time this function is run, new video IDs will be considered.
56 | 1. Hence User must make sure, all videos in *download_list.txt* are downloaded before rewriting the file.
57 |
58 | ## :computer: Setup Guide
59 |
60 | Below is a detailed guide on setting up the environment.
61 |
62 | ### Youtube API
63 |
64 | First you need to have your Youtube API key. Below is a link to a video that will guide you. **Watch from 0:00 - 5:30**
65 | [](https://www.youtube.com/watch?v=th5_9woFJmk)
66 |
67 | 1. **Note - Youtube API is rate limited to 10000 hits/day.**
68 | 2. You can view your quotas at [here - console](https://console.cloud.google.com/iam-admin/quotas)
69 | 3. Cost of operations is described [here - Youtube API docs](https://developers.google.com/youtube/v3/docs)
70 | 4. Code has been optimized to decrease quota usage. You can easily work with 50000 videos/day. For more please check your quota limit.
71 |
72 | ### Installation
73 |
74 | You need to install google-api-python-client to run this project. [github API link](https://github.com/googleapis/google-api-python-client)
75 | Install this library in a [virtualenv](https://virtualenv.pypa.io/en/latest/) using pip.
76 |
77 | #### Mac/Linux
78 |
79 | ```
80 | pip3 install virtualenv
81 | virtualenv venv
82 | . venv/bin/activate
83 | pip3 install -r requirements.txt
84 | ```
85 |
86 | #### Windows
87 |
88 | ```
89 | pip3 install virtualenv
90 | virtualenv venv
91 | venv\Scripts\activate
92 | pip3 install -r requirements.txt
93 | ```
94 |
95 | ## Working Guide
96 |
97 | 1. Get Your Youtube API key as shown in above video.
98 | 2. Pip install the requirements.txt
99 | 3. Run the program YT_Scrape.py
100 |
101 | The script will ask for required data in the command line and is pretty self-explanatory (Once it runs)
102 |
103 | [View Samples](/Samples.md)
104 |
105 | ## :hearts: Contributing
106 |
107 | There are several ways to help.
108 |
109 | 1. **Spread the word:** More users means more possible people testing and contributing to the app which in turn means better stability and possibly more and better features. You can [](https://twitter.com/intent/tweet?text=Wow:&url=https%3A%2F%2Fgithub.com%2FCriticalHunter%2FYoutube_stats.git) or share it on [LinkedIn](http://www.linkedin.com/shareArticle?mini=true&url=https://github.com/CriticalHunter/Youtube_Scraper.git). Every little bit helps !
110 | 2. **[Make a feature or improvement request](https://github.com/CriticalHunter/Youtube_Scraper/issues/new)**: Something can be done better? Something essential missing? Let us know!
111 | 3. **[Report bugs](https://github.com/CriticalHunter/Youtube_Scraper/issues/new)**
112 | 4. **Contribute**: You don't have to be programmer to help.
113 |
114 | 1. **Treat Me To A Coffee Instead** [Paypal](https://paypal.me/CriticalHunter23)
115 |
116 | ### Pull Requests
117 |
118 | **Pull requests** are of course very welcome! Please make sure to also include the issue number in your commit message, if you're fixing a particular issue (e.g.: `feat: add nice feature with the number #31`).
119 |
--------------------------------------------------------------------------------
/Samples.md:
--------------------------------------------------------------------------------
1 |
2 | ### Getting Channel ID
3 |
4 | 
5 |
6 |
7 |
8 | ### Getting Playlist ID
9 |
10 | 
11 |
12 | ## Database (in DB Browser) sample results
13 | ### Database Schema
14 | 
15 | ### tb_channels Table
16 | 
17 | ### tb_playlists Table
18 | 
19 | ### tb_videos Table
20 | 
21 |
22 | 
23 |
24 | ### video_history Table
25 | 
--------------------------------------------------------------------------------
/YT_Scrape.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, unicode_literals
2 | import re, six, os, sys, json
3 |
4 |
5 | from pyfiglet import Figlet, figlet_format
6 | from pprint import pprint
7 | from PyInquirer import style_from_dict, Token, prompt, Validator, ValidationError
8 | from termcolor import colored
9 | import argparse
10 |
11 |
12 | from src.create_new import dbase
13 | from src.get_api_key import api_key
14 | from src.get_channel_details import get_channel_details
15 | from src.entire_channel import entire_channel
16 | from src.get_playlist_videos import get_playlist_videos
17 | from src.load_history import load_history
18 | from src.most_watched import most_watched
19 | from src.early_views import early_views
20 | from src.download_these import download_n
21 |
22 | from src.downloading import *
23 |
def log1(string, color, figlet=False):
    """Print *string* to stdout in the given *color*.

    When *figlet* is True the text is first rendered as ASCII art using
    pyfiglet's 'doom' font. Falls back to a plain print if termcolor's
    `colored` is unavailable (as imported here it is always truthy, so
    that branch is dormant).
    """
    if not colored:
        six.print_(string)
        return
    rendered = figlet_format(string, font='doom') if figlet else string
    six.print_(colored(rendered, color))
33 |
# Start-up banner and greeting.
log1("Youtube_Scraper", color="blue", figlet=True)
log1("Welcome to Youtube_Scraper", "green")

# Colour scheme applied to every PyInquirer prompt below.
style = style_from_dict({
    Token.QuestionMark: '#E91E63 bold',
    Token.Selected: '#673AB7 bold',
    Token.Instruction: '',  # default
    Token.Answer: '#2196f3 bold',
    Token.Question: '',
})
44 |
45 |
46 |
class NumberValidator(Validator):
    """PyInquirer validator that accepts only text parseable as an int."""

    def validate(self, document):
        text = document.text
        try:
            int(text)
        except ValueError:
            # Reject the input, leaving the cursor at the end of it.
            raise ValidationError(
                message='Please enter a number',
                cursor_position=len(text),
            )
55 |
56 |
print('Please Choose the desired Options')
print('Press "ctrl+C" to escape at any point\n')


# Create or migrate the local SQLite database before doing anything else.
dbase()

# First run: ask for the YouTube API key and cache it in key.txt.
if not os.path.exists("key.txt"):
    questions = [
        {
            'type': 'input',
            'name': 'API',
            'message': '"key.txt" file not found. Please enter your Youtube API key '
        },]
    answers = prompt(questions, style=style)
    with open('key.txt','w') as f:
        f.write(answers['API'])
# Build the authenticated YouTube API client; abort when the key is unusable.
youtube_instance = api_key()
youtube_instance.get_api_key()
youtube = youtube_instance.get_youtube()
if youtube == None:
    sys.exit()
78 |
# Interactive main menu. Each follow-up question is gated with a 'when'
# callback so only the questions relevant to the chosen operation are asked.
# The whole flow runs inside one try/except so any failure is reported as a
# single printed message instead of a traceback.
try:
    questions = [
        {
            'type': 'list',
            'name': 'operation',
            'message': 'What do you want to do?',
            'choices': ['Find oldest videos on a topic', 'Scrape a Channel','Scrape a Single Playlist' ,'Load Your History','Most Watched Video','Early Viewed Video','Generate Download List','Download Videos using YoutubeDL'],
            # Answers are lower-cased so the comparisons below can use
            # lower-case literals.
            'filter': lambda val: val.lower()
        },
        {
            'type': 'list',
            'name': 'Channel',
            'message': 'Select Further \n Scraping all videos for a big channel will surpass your free API Limit',
            'choices': ['Scrape Everything for a channel', 'Just Channel Stats (Individual video stats are not scraped)'],
            'when': lambda answers: answers['operation'] == 'scrape a channel'
        },
        {
            'type': 'input',
            'name': 'channelID',
            'message': 'Enter the Channel ID (leave it blank to pick channels from Channels.txt)',
            'when': lambda answers: answers['operation'] == 'scrape a channel' and answers['Channel'] != ''
        },
        {
            'type': 'input',
            'name': 'playlistID',
            'message': 'Enter the Playlist ID',
            'when': lambda answers: answers['operation'] == 'scrape a single playlist'
        },
        {
            'type': 'list',
            'name': 'Download',
            'message': 'What should the list contain?',
            'choices': ['Videos from a single Channel', 'Videos from entire database'],
            'when': lambda answers: answers['operation'] == 'generate download list'
        },
        {
            'type': 'confirm',
            'name': 'import',
            'message': 'Do you want to import your video_history into main table(tb_videos) too? ',
            'default': False,
            'when': lambda answers: answers['operation'] == 'load your history'
        },
        {
            'type': 'list',
            'name': 'Quality',
            'message': 'What Quality you want to download? (Make sure videos are listed in "download.txt" file)',
            'choices': ['4k/Best Available','1080p','720p','360p'],
            'when': lambda answers: answers['operation'] == 'download videos using youtubedl'
        },
    ]

    answers = prompt(questions, style=style)


    if answers['operation'] == 'find oldest videos on a topic':
        # Delegates to the standalone script's argparse help screen.
        # NOTE(review): Windows-style path; '\s' and '\o' are not recognised
        # escapes so the backslashes survive literally, but this call is not
        # portable to Linux/macOS.
        os.system("python .\src\oldest_videos.py -h")

    elif answers['operation'] == 'scrape a channel':
        if answers['channelID'] == '':
            # Blank ID: scrape every channel ID listed in Channels.txt.
            with open("Channels.txt") as f:
                for line in f:
                    # Force the second character to 'C' (e.g. an uploads
                    # playlist id 'UU...' becomes the channel id 'UC...').
                    # Presumably normalises pasted IDs — TODO confirm.
                    new_Ch_ID = line[0]+'C'+line[2:]
                    new_Ch_ID = new_Ch_ID.strip()
                    print(new_Ch_ID)
                    if answers['Channel'] == 'Just Channel Stats (Individual video stats are not scraped)':
                        get_channel_details(youtube,new_Ch_ID)
                    elif answers['Channel'] == 'Scrape Everything for a channel':
                        entire_channel(youtube,new_Ch_ID)

        else:
            # Single channel entered interactively; same ID normalisation.
            Ch_ID = answers['channelID']
            new_Ch_ID = Ch_ID[0]+'C'+Ch_ID[2:]
            if answers['Channel'] == 'Just Channel Stats (Individual video stats are not scraped)':
                get_channel_details(youtube,new_Ch_ID)
            elif answers['Channel'] == 'Scrape Everything for a channel':
                entire_channel(youtube,new_Ch_ID)

    elif answers['operation'] == 'scrape a single playlist':
        get_playlist_videos(youtube,answers['playlistID'])

    elif answers['operation'] == 'load your history':
        # load_history expects 'y'/'n' for "also merge into tb_videos".
        if answers['import'] == True:
            res = 'y'
        elif answers['import'] == False:
            res = 'n'
        print("Please Wait ...")
        load_history(res)

    elif answers['operation'] == 'most watched video':
        print("If your watch history is not loaded in database, it will give empty result")
        print("Please enter, How many items to retrieve e.g. 10 for Top 10 \n")
        n = int(input())
        most_watched(n)

    elif answers['operation'] == 'early viewed video':
        print("If your watch history is not loaded in database, it will give empty result")
        print("Please enter, How many items to retrieve e.g. 10 for Top 10 \n")
        n = int(input())
        early_views(n)

    elif answers['operation'] == 'generate download list':
        if answers['Download'] == 'Videos from a single Channel':
            print("It will list videos that are marked 'Is-Good' and is present in your database")
            chc = input("Please enter the channel ID \t")
            print("Please enter, How many items the list will contain \n")
            n = int(input())
            download_n(chc,n)
        elif answers['Download'] == 'Videos from entire database':
            print("It will list videos that are marked 'Is-Good' and is present in your database")
            chc = ''  # empty channel ID means "whole database" to download_n
            print("Please enter, How many items the list will contain \n")
            n = int(input())
            download_n(chc,n)
    elif answers['operation'] == 'download videos using youtubedl':
        print("\nIt will download all the videos that are listed in download.txt")
        print("Do you want to replace file names (_ in place of space) and convert thumbnail images (from WEBP to JPEG) ?\n")
        chc2 = input("Please enter Y/N \t")
        if chc2 == 'Y' or chc2 == 'Yes':
            if answers['Quality'] == '4k/Best Available':
                download_files('4k')
            elif answers['Quality'] == '1080p':
                download_files(1080)
            elif answers['Quality'] == '720p':
                download_files(720)
            elif answers['Quality'] == '360p':
                download_files(360)
            # Post-process the download folder (hard-coded Windows location).
            replace2('D:\Youtube')
            convertWebp2jpgInDirectory('D:\Youtube')
        else:
            if answers['Quality'] == '4k/Best Available':
                download_files('4k')
            elif answers['Quality'] == '1080p':
                download_files(1080)
            elif answers['Quality'] == '720p':
                download_files(720)
            elif answers['Quality'] == '360p':
                download_files(360)

except Exception as e:
    # Single catch-all boundary: report the error and fall through to exit.
    print(e)
--------------------------------------------------------------------------------
/old_history.py:
--------------------------------------------------------------------------------
1 | import pprint
2 | from bs4 import BeautifulSoup
3 | import re
4 | import sqlite3
5 | import time
6 | import datetime,pytz
7 |
# One-off helper: parse a saved Google activity HTML dump (test.html),
# extract watched-video dates/times/IDs into watched.txt, then load that
# file into the video_history table of youtube.db.

with open("test.html",encoding='utf-8') as fp: # To open a local html file
    soup = BeautifulSoup(fp,features='lxml')

# Write a prettified copy of the HTML for manual inspection.
soup_str = soup.prettify() # Prettify the HTML, but it becomes String
with open('temp.html','w',encoding='utf-8') as wr:
    wr.write(soup_str)

# Each activity entry lives in a <c-wiz class="xDtZAf"> element.
tags = soup.find_all("c-wiz", {"class": "xDtZAf"})
for tag in tags:
    with open("watched.txt",'a',encoding='utf-8') as f:
        date = tag.get('data-date')  # YYYYMMDD, consumed by strptime below
        foo = tag.find("div", {"class": "QTGV3c"})
        temp = (foo.get_text())
        watched = temp.split(' ')
        watched = watched[0]
        # Only entries whose text starts with 'Watched' count.
        if watched == 'Watched':
            try:
                bar = foo.a.get('href')
                vid_id = bar[-11:]  # YouTube video IDs are 11 characters
                # NOTE(review): this rebinds the imported `time` module to a
                # tag; harmless here since the module is never used after.
                time = tag.find("div", {"class": "H3Q9vf XTnvW"})
                time = time.get_text()
                f.write(date)
                f.write(' ')
                # Extract 'H:MM AM/PM' and normalise to 'HH:MM:00 AM/PM'.
                tm = re.findall('\d+:\d+ .M',time)[0]
                tm1=tm.split(':')[0]
                tm2=tm.split(':')[1]
                tm21 = tm2.split(' ')[0]
                tm22 = tm2.split(' ')[1]
                if len(tm1)==1:
                    tm='0'+tm1+':'+tm21+':'+'00'+' '+tm22
                else:
                    tm=tm1+':'+tm21+':'+'00'+' '+tm22
                f.write(tm)
                f.write(' ')
                f.write(vid_id)
                f.write('\n')
            except:
                # Entries without a link or a timestamp are skipped on purpose.
                pass

# Second pass: load every line of watched.txt into the database.
with open("watched.txt",'r',encoding='utf-8') as fhand:
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()
    for line in fhand:
        time = line[0:-12]  # everything before the trailing 11-char video id
        p='%Y%m%d %I:%M:%S %p '
        epoch = (datetime.datetime.strptime(time, p))
        # dtobj3=dtobj1.replace(tzinfo=pytz.UTC) #replace method
        # dtobj_kolkata=dtobj3.astimezone(pytz.timezone("Asia/Kolkata"))
        # epoch = dtobj_kolkata.timestamp()
        new_format = epoch.strftime('%b %d, %Y, %I:%M:%S %p')
        vid_id = line[-12:-1]
        # NOTE(review): inserts 4 values — matches the pre-migration
        # 4-column video_history schema, not the current 6-column one.
        cur.execute("INSERT OR IGNORE INTO video_history VALUES (?,?,?,?)", (vid_id,new_format,epoch.timestamp(),0))
    conn.commit()
    conn.close()
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | astroid==2.4.2
2 | atomicwrites==1.4.0
3 | attrs==20.3.0
4 | beautifulsoup4==4.9.3
5 | cachetools==4.2.0
6 | certifi==2024.7.4
7 | chardet==4.0.0
8 | colorama==0.4.4
9 | ffmpeg-python==0.2.0
10 | future==0.18.3
11 | google-api-core==1.24.1
12 | google-api-python-client==1.12.8
13 | google-auth==1.24.0
14 | google-auth-httplib2==0.0.4
15 | googleapis-common-protos==1.52.0
16 | httplib2==0.19.1
17 | idna==3.7
18 | iniconfig==1.1.1
19 | isort==5.7.0
20 | lazy-object-proxy==1.4.3
21 | lxml==4.9.1
22 | mccabe==0.6.1
23 | packaging==20.8
24 | pluggy==0.13.1
25 | prompt-toolkit==1.0.14
26 | protobuf==3.18.3
27 | py==1.10.0
28 | pyasn1==0.4.8
29 | pyasn1-modules==0.2.8
30 | pyfiglet==0.8.post1
31 | Pygments==2.15.0
32 | PyInquirer==1.0.3
33 | pylint==2.6.0
34 | pyparsing==2.4.7
35 | pytest==6.2.1
36 | pytz==2020.4
37 | regex==2020.11.13
38 | requests==2.32.2
39 | rsa==4.7
40 | six==1.15.0
41 | soupsieve==2.1
42 | termcolor==1.1.0
43 | toml==0.10.2
44 | uritemplate==3.0.1
45 | urllib3==1.26.19
46 | wcwidth==0.2.5
47 | wrapt==1.12.1
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CriticalHunter/Youtube_Scraper/dca7c71823cb49e014a8cdf08dd8e391ddfbd57f/src/__init__.py
--------------------------------------------------------------------------------
/src/create_new.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 | import os
3 |
def create_new(db_path='youtube.db'):
    """Create the SQLite database with every table used by the scraper.

    Tables: tb_channels, tb_playlists, tb_videos, video_history and
    yt_downloaded. All statements use CREATE TABLE IF NOT EXISTS, so
    calling this against an existing database only adds missing tables.

    Args:
        db_path: Location of the database file. Defaults to 'youtube.db'
            in the working directory, matching the behaviour existing
            callers rely on.
    """
    # NOTE: in yt_downloaded the 'bitrate' column is deliberately left
    # untyped (SQLite permits typeless columns); kept as-is for schema
    # compatibility with databases created by earlier versions.
    ddl_statements = (
        """CREATE TABLE IF NOT EXISTS tb_channels (
            Channel_ID TEXT PRIMARY KEY,
            Channel_title TEXT,
            Published_At TEXT NOT NULL,
            Country TEXT,
            View_Count INTEGER,
            Subscriber_Count INTEGER,
            Video_Count INTEGER,
            Playlist_Count INTEGER,
            Channel_Duration INTEGER,
            Duration_in_Text TEXT,
            Is_Deleted INTEGER,
            Deleted_Videos INTEGER,
            Downloaded_Videos INTEGER,
            Folder_Size_GB REAL,
            Channel_last_Scraped TEXT,
            Auto_Update INTEGER,
            Description TEXT
        )""",
        """CREATE TABLE IF NOT EXISTS tb_playlists(
            Playlist_ID TEXT PRIMARY KEY,
            Playlist_title TEXT,
            Channel_ID TEXT NOT NULL,
            Channel_Title TEXT NOT NULL,
            Published_At TEXT NOT NULL,
            Current_Video_Count INTEGER,
            Playlist_Seconds INTEGER,
            Playlist_Duration TEXT,
            Is_Seen INTEGER,
            Worth INTEGER,
            Is_Removed INTEGER,
            Deleted_Videos INTEGER,
            Downloaded_Videos INTEGER,
            Folder_Size_GB REAL,
            Playlist_last_Scraped TEXT,
            Auto_Update INTEGER
        )""",
        """CREATE TABLE IF NOT EXISTS tb_videos (
            Video_ID TEXT PRIMARY KEY,
            Video_title TEXT,
            Is_Seen INTEGER,
            Worth INTEGER,
            Upload_playlistId TEXT,
            Playlist_ID TEXT,
            Published_At TEXT NOT NULL,
            epoch REAL NOT NULL,
            Channel_ID TEXT NOT NULL,
            Channel_Title TEXT NOT NULL,
            View_Count INTEGER,
            Like_Count INTEGER,
            Dislike_Count INTEGER,
            Upvote_Ratio REAL,
            Comment_Count INTEGER,
            Duration TEXT,
            video_seconds INTEGER,
            Is_Licensed INTEGER,
            Is_Deleted INTEGER,
            Is_Downloaded INTEGER
        )""",
        # A video can legitimately be watched more than once, hence the
        # composite (Video_ID, epoch) primary key.
        """CREATE TABLE IF NOT EXISTS video_history (
            Video_ID TEXT NOT NULL,
            Title TEXT,
            Watched_at TEXT,
            epoch REAL NOT NULL,
            Is_in_Main INTEGER,
            Is_Deleted INTEGER,
            PRIMARY KEY ( Video_ID, epoch)
        )""",
        """CREATE TABLE IF NOT EXISTS yt_downloaded (
            Video_ID TEXT PRIMARY KEY,
            Resolution TEXT,
            Raw_Size INTEGER,
            Size REAL,
            vid_type TEXT,
            FPS TEXT,
            bitrate,
            Audio_Type TEXT,
            Frequency INTEGER,
            Channels TEXT,
            Is_In_Main INTEGER
        )""",
    )

    conn = sqlite3.connect(db_path)
    try:
        cur = conn.cursor()
        for statement in ddl_statements:
            cur.execute(statement)
        conn.commit()  # Push the data into database
    finally:
        # Always release the connection, even when a CREATE fails
        # (the original version leaked it on error).
        conn.close()
106 |
def migrate():
    """Upgrade a first-release 'youtube.db' to the current schema, in place.

    Uses SQLite's rename -> recreate -> copy -> drop pattern for the three
    main tables, then ALTERs in the columns introduced after the initial
    release. The bare excepts around the ALTER batches are deliberate so
    that re-running against a partially-migrated file does not fail.
    """
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()

    # Disable FK checks while tables are renamed and dropped mid-copy.
    cur.execute("PRAGMA foreign_keys=off")
    cur.execute("BEGIN TRANSACTION")

    # --- tb_channels: rebuild with the original columns, copy data over ---
    cur.execute("ALTER TABLE tb_channels RENAME TO _tb_channels_old")
    cur.execute("""
    CREATE TABLE IF NOT EXISTS tb_channels (
        Channel_ID TEXT PRIMARY KEY,
        Channel_title TEXT,
        Published_At TEXT NOT NULL,
        Country TEXT,
        View_Count INTEGER,
        Subscriber_Count INTEGER,
        Video_Count INTEGER,
        Playlist_Count INTEGER
    )

    """)
    cur.execute("INSERT INTO tb_channels SELECT * FROM _tb_channels_old")
    try:
        # Columns introduced after the initial release.
        cur.execute("ALTER TABLE tb_channels ADD COLUMN Channel_Duration INTEGER")
        cur.execute("ALTER TABLE tb_channels ADD COLUMN Duration_in_Text TEXT")
        cur.execute("ALTER TABLE tb_channels ADD COLUMN Is_Deleted INTEGER")
        cur.execute("ALTER TABLE tb_channels ADD COLUMN Deleted_Videos INTEGER")
        cur.execute("ALTER TABLE tb_channels ADD COLUMN Downloaded_Videos INTEGER")
        cur.execute("ALTER TABLE tb_channels ADD COLUMN Folder_Size_GB REAL")
        cur.execute("ALTER TABLE tb_channels ADD COLUMN Channel_last_Scraped TEXT")
        cur.execute("ALTER TABLE tb_channels ADD COLUMN Auto_Update INTEGER")
        cur.execute("ALTER TABLE tb_channels ADD COLUMN Description TEXT")
    except:
        # These stats were added after the initial release of this code;
        # they may already exist, in which case the ALTERs fail harmlessly.
        pass
    cur.execute("DROP TABLE _tb_channels_old")

    # --- tb_playlists: same rebuild/copy pattern ---
    cur.execute("ALTER TABLE tb_playlists RENAME TO _tb_playlists_old")
    cur.execute("""CREATE TABLE IF NOT EXISTS tb_playlists(
        Playlist_ID TEXT PRIMARY KEY,
        Playlist_title TEXT,
        Channel_ID TEXT NOT NULL,
        Channel_Title TEXT NOT NULL,
        Published_At TEXT NOT NULL,
        Item_Count INTEGER,
        Playlist_Seconds INTEGER,
        Playlist_Duration TEXT,
        Is_Seen INTEGER,
        Worth INTEGER
    )
    """)
    cur.execute("INSERT INTO tb_playlists SELECT * FROM _tb_playlists_old")
    try:
        # Post-release columns, plus a rename of Item_Count.
        cur.execute("ALTER TABLE tb_playlists ADD COLUMN Is_Removed INTEGER")
        cur.execute("ALTER TABLE tb_playlists ADD COLUMN Deleted_Videos INTEGER")
        cur.execute("ALTER TABLE tb_playlists ADD COLUMN Downloaded_Videos INTEGER")
        cur.execute("ALTER TABLE tb_playlists ADD COLUMN Folder_Size_GB REAL")
        cur.execute("ALTER TABLE tb_playlists ADD COLUMN Playlist_last_Scraped TEXT")
        cur.execute("ALTER TABLE tb_playlists ADD COLUMN Auto_Update INTEGER")
        cur.execute("ALTER TABLE tb_playlists RENAME COLUMN Item_Count TO Current_Video_Count")
    except:
        # These stats were added after the initial release of this code.
        pass
    cur.execute("DROP TABLE _tb_playlists_old")

    # --- tb_videos: rebuild straight into the current shape ---
    cur.execute("ALTER TABLE tb_videos RENAME TO _tb_videos_old")
    cur.execute("""CREATE TABLE IF NOT EXISTS tb_videos (
        Video_ID TEXT PRIMARY KEY,
        Video_title TEXT,
        Is_Seen INTEGER,
        Worth INTEGER,
        Upload_playlistId TEXT,
        Playlist_ID TEXT,
        Published_At TEXT NOT NULL,
        epoch REAL NOT NULL,
        Channel_ID TEXT NOT NULL,
        Channel_Title TEXT NOT NULL,
        View_Count INTEGER,
        Like_Count INTEGER,
        Dislike_Count INTEGER,
        Upvote_Ratio REAL,
        Comment_Count INTEGER,
        Duration TEXT,
        video_seconds INTEGER,
        Is_Licensed INTEGER,
        Is_Deleted INTEGER,
        Is_Downloaded INTEGER
    )
    """)
    cur.execute("INSERT INTO tb_videos SELECT * FROM _tb_videos_old")
    cur.execute("DROP TABLE _tb_videos_old")

    # yt_downloaded is new in this schema; 'bitrate' is intentionally
    # untyped (SQLite allows typeless columns).
    cur.execute("""CREATE TABLE IF NOT EXISTS yt_downloaded (
        Video_ID TEXT PRIMARY KEY,
        Resolution TEXT,
        Raw_Size INTEGER,
        Size REAL,
        vid_type TEXT,
        FPS TEXT,
        bitrate,
        Audio_Type TEXT,
        Frequency INTEGER,
        Channels TEXT,
        Is_In_Main INTEGER
    )

    """)
    try:
        # Drop the table yt_downloaded replaces, if it ever existed.
        cur.execute("DROP TABLE tb_downloaded")
    except:
        pass
    cur.execute("PRAGMA foreign_keys=on")
    conn.commit()  # Push the data into database
    conn.close()
221 |
def dbase():
    """Ensure 'youtube.db' exists and matches the current schema.

    Creates a fresh database when the file is missing; otherwise probes
    for a column added after the initial release
    (tb_channels.Deleted_Videos) and migrates the database when the probe
    fails.
    """
    if not os.path.exists("youtube.db"):
        create_new()
        return
    conn = sqlite3.connect('youtube.db')
    try:
        # Probe a late-addition column; its absence means an old schema.
        conn.execute("SELECT Deleted_Videos FROM tb_channels")
        needs_migration = False
    except sqlite3.Error:
        # Narrowed from a bare except: only database errors imply an
        # outdated schema.
        needs_migration = True
    finally:
        # Bug fix: the original never closed this connection.
        conn.close()
    if needs_migration:
        migrate()
232 |
233 |
# Allow "python src/create_new.py" to (re)initialise the database directly.
if __name__ == "__main__":
    dbase()
--------------------------------------------------------------------------------
/src/download_these.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 |
def download_n(chc='',n=50):
    """Write up to *n* pending video URLs to download.txt and mark them.

    Selects rows from tb_videos where Worth = 1 and Is_Downloaded = 0
    (optionally restricted to one channel), writes one watch-URL per line
    to 'download.txt', and flags each selected row Is_Downloaded = 1 so
    the next run picks fresh videos.

    Args:
        chc: Channel_ID to restrict the list to; '' means the whole database.
        n:   Maximum number of URLs to emit.
    """
    conn = sqlite3.connect('youtube.db')
    try:
        cur = conn.cursor()
        if chc == '':
            cur.execute(
                "SELECT Video_ID FROM tb_videos WHERE Worth = 1 and Is_Downloaded = 0 LIMIT ?",
                (n,))
        else:
            cur.execute(
                "SELECT Video_ID FROM tb_videos WHERE Worth = 1 and Is_Downloaded = 0 and Channel_ID = ? LIMIT ?",
                (chc,n))
        down_list = cur.fetchall()
        # Open the output file only after the query succeeded, so a failed
        # run no longer truncates an existing download.txt.
        with open("download.txt",'w',encoding='utf-8') as fp:
            for item in down_list:
                link = "https://www.youtube.com/watch?v="+item[0]
                cur.execute("UPDATE tb_videos SET Is_Downloaded = 1 WHERE Video_ID = ?",(item[0],))
                fp.write(link)
                fp.write('\n')
        conn.commit()  # Push the data into database
    except sqlite3.Error:
        # Narrowed from a bare except; also fixes the original bug of
        # calling fetchall() on a cursor whose execute had failed.
        print("Please enter correct Channel ID")
    finally:
        # Bug fix: always release the connection, even on error.
        conn.close()
22 |
# Importable module; nothing to do when run directly.
if __name__ == "__main__":
    pass
--------------------------------------------------------------------------------
/src/downloading.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 |
4 | '''
5 | make It run on both linux / and windows \
6 | '''
7 |
def download_files(ch=720):
    """Invoke youtube-dl on the links in download.txt at the requested quality.

    ch: target quality — 360, 720, 1080, or the string '4k'.
        Raises ValueError for anything else (previously an unknown value
        crashed with an unhelpful NameError on `commandline`).
    """
    # Only the -f format selector differs between qualities; everything
    # else about the command line is identical (was four near-identical
    # hard-coded strings).
    format_selectors = {
        '4k': "bestvideo+(bestaudio[acodec^=opus]/bestaudio)/bestvideo[height<=360]",
        1080: "bestvideo[height<=1080]+(bestaudio[acodec^=opus]/bestaudio)/bestvideo[height<=360]",
        720: "bestvideo[height<=720]+(bestaudio[acodec^=opus]/bestaudio)/bestvideo[height<=360]",
        360: "bestvideo[height<=360]+(bestaudio[acodec^=opus]/bestaudio)/bestvideo[height<=360]",
    }
    try:
        selector = format_selectors[ch]
    except KeyError:
        raise ValueError("ch must be one of 360, 720, 1080 or '4k', got %r" % (ch,))
    commandline = (
        'youtube-dl --add-metadata --write-info-json --write-thumbnail --force-ipv4 '
        '--sleep-interval 3 --max-sleep-interval 6 --ignore-errors --no-continue --no-overwrites '
        '--download-archive archive.log -f "' + selector + '" '
        '--merge-output-format "mkv" -o "D:/Youtube/%(uploader)s/%(upload_date)s_%(title)s %(id)s.%(ext)s" -a download.txt'
    )
    os.system(commandline)
34 |
def replace2(parent):
    """Recursively rename files and folders under *parent* to remove spaces.

    Files: ' ' -> '__'.  Folders: ' ' -> '_', '.' -> '_', "'" removed.
    Folder entries in os.walk's list are patched in place so the walk
    descends into the renamed directories.
    """
    for path, folders, files in os.walk(parent):
        for name in files:
            try:
                os.rename(os.path.join(path, name),
                          os.path.join(path, name.replace(' ', '__')))
            except FileExistsError:
                # Same guard the folder branch already had: skip name clashes
                # (raised on Windows when the target exists).
                pass
        for i, folder in enumerate(folders):
            new_name = folder.replace(' ', '_').replace('.', '_').replace("'", '')
            try:
                os.rename(os.path.join(path, folder), os.path.join(path, new_name))
            except FileExistsError:
                pass
            folders[i] = new_name
def convertWebp2jpgInDirectory(dir):
    """Traverse *dir* recursively and convert every .webp file to .jpg
    using the external `dwebp` tool, deleting each converted source file.

    Paths are built with os.path and deletion uses os.remove, so this
    works on both Windows and Linux; the old version re-joined an already
    joined path (only correct for absolute Windows paths), spliced
    backslashes by hand, and shelled out to the non-portable `rm`.
    """
    if not os.path.isdir(dir):
        return
    for entry in os.listdir(dir):
        full_path = os.path.join(dir, entry)
        if os.path.isdir(full_path):
            convertWebp2jpgInDirectory(full_path)
        elif full_path.endswith(".webp"):
            # Same basename, .jpg extension, same directory.
            jpg_path = os.path.splitext(full_path)[0] + '.jpg'
            commandline = 'dwebp "%s" -o "%s"' % (full_path, jpg_path)
            os.system(commandline)
            print(full_path + " ------> conversion succeeded")
            os.remove(full_path)  # portable replacement for `rm`

if __name__ == '__main__':
    pass
    # download_files()
    # convertWebp2jpgInDirectory("D:/Youtube")
    # replace('D:/Youtube')
--------------------------------------------------------------------------------
/src/early_views.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 |
def early_views(n=5, tz_offset=19800):
    """Print the *n* videos that were watched soonest after publication.

    n:         number of rows to print.
    tz_offset: seconds between the watch timestamps and the published
               timestamps (UTC).  Defaults to 19800 s (IST, UTC+5:30),
               which was previously hard-coded.
    """
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()
    # diff = watched epoch - published epoch; a row only makes sense when
    # the watch happened after publication once the timezone gap is removed.
    cur.execute(
        "SELECT video_history.Video_ID, video_history.epoch - tb_videos.epoch As diff,"
        " video_history.epoch, tb_videos.epoch, tb_videos.Video_title, tb_videos.epoch, Watched_at"
        " FROM video_history"
        " LEFT OUTER JOIN tb_videos on tb_videos.Video_id = video_history.Video_ID"
        " WHERE (diff - ?) > 0 GROUP BY video_history.Video_ID ORDER BY diff ASC ;",
        (tz_offset,))
    results = cur.fetchmany(n)
    print("Video ID", " Diff in Min", "\t", "Published AT(UTC)", " Watched AT (IST)", "\tVideo Title")
    print("-------------------------------------------------------------------------------------------------------")
    for result in results:
        link = result[0]
        # Minutes between watch and publish, printed as whole minutes plus a
        # two-decimal fractional part (e.g. "   503.33").
        minutes = (int(result[1]) - tz_offset) / 60
        whole = "{:6d}".format(int(minutes // 1))
        frac = "{:.2f}".format(minutes % 1).replace('0.', '.')
        print(link, '\t', whole + frac, '\t', result[2], '\t', result[3], '\t', result[4])
    conn.close()

if __name__ == "__main__":
    pass
--------------------------------------------------------------------------------
/src/entire_channel.py:
--------------------------------------------------------------------------------
1 | from src.get_channel_playlists import get_channel_playlists
2 | from src.get_channel_details import get_channel_details, get_channel_length
3 | from src.get_playlist_videos import get_playlist_videos
4 | from src.get_channel_videos import get_channel_videos
5 |
def entire_channel(youtube, ch_id):
    """Scrape a whole channel: its details, every playlist, the uploads
    that belong to no playlist, and finally the aggregate duration.

    youtube: an authenticated googleapiclient YouTube service object.
    ch_id:   the channel's ID string.
    """
    ec = True  # "entire channel" mode for the downstream helpers
    get_channel_details(youtube, ch_id, ec=ec)
    playlists_list = get_channel_playlists(youtube, ch_id)
    print('\nThere are ', len(playlists_list), ' original/imported playlists\n')
    for count, playlist in enumerate(playlists_list, start=1):
        print('\nParsing playlist ', count, ' \\ ', len(playlists_list))
        try:
            get_playlist_videos(youtube, playlist, ec=ec, ch_id=ch_id)
        except Exception:  # was a bare except: keep going past one bad playlist
            print("Error getting Playlist :", playlist)
    get_channel_videos(youtube, ch_id)
    get_channel_length(ch_id)
--------------------------------------------------------------------------------
/src/get_api_key.py:
--------------------------------------------------------------------------------
1 | from googleapiclient.discovery import build
2 | from googleapiclient.errors import HttpError
3 | from httplib2 import ServerNotFoundError
4 | from google.auth.exceptions import DefaultCredentialsError
5 | '''
6 | Get the API key and save it in a text file named, key.txt in parent folder.
7 | The method to get a youtube API key is well illustrated in the Youtube Video in the README page.
8 | '''
class api_key():
    """Builds and holds a YouTube Data API v3 service object.

    The API key is read from 'key.txt' in the working directory; the
    method to obtain a key is illustrated in the README.
    """
    def __init__(self):
        # The googleapiclient service object; None until get_api_key() succeeds.
        self.youtube = None

    def get_api_key(self):
        """Read key.txt and build the YouTube service, reporting each
        common failure (bad key, no network, blank or missing file) on stdout."""
        try:
            with open('key.txt') as key_file:
                # strip() so a trailing newline or stray whitespace in
                # key.txt does not invalidate an otherwise correct key.
                api_key = key_file.read().strip()
            youtube = build('youtube', 'v3', developerKey=api_key)
            self.youtube = youtube

        except HttpError:
            print("\nAPI Key is wrong")
            print("Please recheck the API key or generate a new key.\nThen modify the 'key.txt' file with new Key\n")

        except ServerNotFoundError:
            print("\nUnable to connect to internet...")
            print("Please Check Your Internet Connection.\n")

        except DefaultCredentialsError:
            print("\n'key.txt' is Blank.")
            print("Please save your API key there and then continue.\n")

        except FileNotFoundError:
            print("\nNo such file: 'key.txt'")
            print("Please create a file named 'key.txt' and place your Youtube API key in it.\n")

        except Exception as e:
            print(e)
            print("Oops!", e.__class__, "occurred.")

    def get_youtube(self):
        """Return the built service object, or None if get_api_key() has
        not run successfully."""
        return self.youtube

if __name__ == "__main__":
    youtube_instance = api_key()
    youtube_instance.get_api_key()
    youtube = youtube_instance.get_youtube()
    print(youtube)
--------------------------------------------------------------------------------
/src/get_channel_details.py:
--------------------------------------------------------------------------------
1 | import sqlite3,datetime
2 | from src.get_channel_playlists import get_channel_playlists
3 | import sys
4 |
def get_channel_details(youtube,channel_id,single=False,playlistID='',ec=False):
    """Fetch a channel's snippet/statistics from the API and upsert it
    into tb_channels, then refresh its playlists.

    youtube:    googleapiclient YouTube service object.
    channel_id: channel to fetch.
    single:     forwarded to get_channel_playlists — only playlistID is written.
    playlistID: forwarded to get_channel_playlists together with single.
    ec:         True when called from an entire-channel scrape; controls
                the Channel_last_Scraped timestamp written.

    Side effects: writes tb_channels, and calls sys.exit() when the
    channel ID is invalid or the channel turns out to be deleted.
    """

    request = youtube.channels().list(part="snippet,statistics",
        id=channel_id
        ).execute()

    # print(request['items'][0])
    Channel_Id = channel_id
    # flag1: the API returned data for this channel (it still exists).
    # flag2: the channel is already present in tb_channels.
    flag1 = True
    flag2 = True
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()
    try:
        Channel_title = request['items'][0]['snippet']['title']
    except:
        flag1 = False
    try:
        cur.execute("SELECT Channel_ID from tb_channels WHERE Channel_ID = ? ",(Channel_Id,))
        temp = cur.fetchone()
        if temp is None:
            flag2 = False
    except:
        flag2 = False
    # flag3: the Is_Deleted marker previously stored for this channel (or None).
    cur.execute("SELECT Is_Deleted from tb_channels WHERE Channel_ID = ? ",(Channel_Id,))
    flag3 = cur.fetchone()
    if flag3 is None:
        pass
    else:
        flag3 = flag3[0]
    # Unknown to both the API and the local DB: nothing we can do.
    if flag1 == False and flag2 == False:
        print("Channel ID not valid")
        sys.exit()
    # Known locally and already marked deleted.
    if flag1 == False and flag2 == True and flag3 == 1:
        print("Channel was already Deleted")
        conn.commit() # Push the data into database
        conn.close()
        sys.exit()
    # Known locally but gone from YouTube: record the deletion now.
    if flag1 == False and flag2 == True and flag3 == 0:
        cur.execute("SELECT Channel_Id from tb_channels")
        cur.execute("UPDATE tb_channels SET Is_Deleted = ? WHERE Channel_ID = ? ",(1,Channel_Id))
        cur.execute("UPDATE tb_channels SET Auto_Update = ? WHERE Channel_ID = ? ",(0,Channel_Id))
        print("Channel is Deleted and now updated in Database")
        conn.commit() # Push the data into database
        conn.close()
        sys.exit()

    Description = request['items'][0]['snippet']['description']
    Published_At = request['items'][0]['snippet']['publishedAt']
    try:
        Country = request['items'][0]['snippet']['country']
    except:
        Country = None  # country is optional in the API response
    View_Count = request['items'][0]['statistics']['viewCount']
    try:
        Subscriber_Count = request['items'][0]['statistics']['subscriberCount']
    except:
        Subscriber_Count = None  # hidden subscriber counts are omitted by the API
    Video_Count = request['items'][0]['statistics']['videoCount']
    if ec == False:
        Channel_last_Scraped = 'Never'
    else:
        Channel_last_Scraped = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    # Carry previously recorded download bookkeeping across the upsert so
    # the INSERT OR REPLACE below does not wipe it.
    cur.execute("SELECT Downloaded_Videos FROM tb_channels WHERE Channel_ID = ?" ,(Channel_Id,))
    temp = cur.fetchone()
    try:
        temp = temp[0]
        if temp > 0:
            Downloaded_Videos = temp
        else:
            Downloaded_Videos = 0
    except:
        Downloaded_Videos = 0
    cur.execute("SELECT Folder_Size_GB FROM tb_channels WHERE Channel_ID = ?" ,(Channel_Id,))
    temp = cur.fetchone()
    try:
        temp = temp[0]
        if temp > 0:
            Folder_Size_GB = temp
        else:
            Folder_Size_GB = 0
    except:
        Folder_Size_GB = 0

    params = (Channel_Id,Channel_title,Published_At,Country,View_Count,Subscriber_Count,Video_Count,0,0,'First, Scrape Entire Channel',0,0,Downloaded_Videos,Folder_Size_GB)

    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()
    cur.execute("INSERT OR REPLACE INTO tb_channels VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?,?,?, ?,?, 'Never',1,'')", params)
    conn.commit() # Push the data into database
    conn.close()
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()
    # Timestamp and description are written separately after the upsert.
    cur.execute("UPDATE tb_channels SET Channel_last_Scraped = ? WHERE Channel_ID = ? ",(Channel_last_Scraped,Channel_Id))
    cur.execute("UPDATE tb_channels SET Description = ? WHERE Channel_ID = ? ",(Description,Channel_Id))
    conn.commit() # Push the data into database
    conn.close()
    get_channel_playlists(youtube,Channel_Id,single,playlistID)
103 |
def get_channel_length(Channel_Id):
    """Refresh a channel's aggregate stats in tb_channels: total duration
    (both seconds and a human-readable string) and the count of its
    deleted videos, all derived from tb_videos."""
    connection = sqlite3.connect('youtube.db')
    cursor = connection.cursor()

    # Total runtime of every stored video for this channel.
    cursor.execute("SELECT SUM(video_seconds) FROM tb_videos WHERE Channel_ID = ? ", (Channel_Id,))
    total_seconds = cursor.fetchone()[0]
    if total_seconds is None:
        total_seconds = 0 # For channels with 0 Original Videos (e.g. Hasan Minaj)
    readable = str(datetime.timedelta(seconds = total_seconds))
    cursor.execute("UPDATE tb_channels SET Duration_in_Text = ? WHERE Channel_ID = ? ", (readable, Channel_Id))
    cursor.execute("UPDATE tb_channels SET Channel_Duration = ? WHERE Channel_ID = ? ", (total_seconds, Channel_Id))

    # How many of the channel's videos have been deleted from YouTube.
    cursor.execute("SELECT COUNT(Video_ID) FROM tb_videos WHERE Is_Deleted = ? AND Channel_ID = ? ", (1, Channel_Id))
    deleted_count = cursor.fetchone()[0]
    cursor.execute("UPDATE tb_channels SET Deleted_Videos = ? WHERE Channel_ID = ? ", (deleted_count, Channel_Id))

    connection.commit()
    connection.close()

if __name__ == "__main__":
    pass
--------------------------------------------------------------------------------
/src/get_channel_id.py:
--------------------------------------------------------------------------------
1 | from get_api_key import api_key
2 | import argparse
3 | import os
4 |
# CLI for running this module as a script: resolve a channel's legacy
# username to its channel ID.
# NOTE(review): parse_args() runs at import time, so importing this module
# without the positional argument exits with a usage error — confirm the
# intent before moving this under the __main__ guard.
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,\
    description='Get CHannel ID for CHannel User Name, Hopefully!!!')
parser.add_argument("User", help='Enter Channel User Name')
args = parser.parse_args()
9 |
10 |
11 |
def get_channel_id(youtube,ch_name):
    """Look up a channel by its legacy username; print and return its ID.

    Prints a human-friendly subscriber count and the channel ID on
    success.  Prints an error message and returns None when the API
    response contains no matching channel.
    """
    response = youtube.channels().list(
        part="snippet,contentDetails,statistics",
        forUsername=ch_name
    ).execute()
    try:
        subscribers = int(response['items'][0]['statistics']['subscriberCount'])
        # Render the count with M/K unit suffixes.
        if subscribers > 1000000:
            sub_count = str(subscribers / 1000000) + 'M Subscribers'
        elif subscribers > 1000:
            sub_count = str(subscribers / 1000) + 'K Subscribers'
        else:
            sub_count = str(subscribers) + ' Subscribers'
        ch_id = response['items'][0]['id']

        print(" ")
        print(sub_count)
        print(ch_id)
        return ch_id
    except KeyError:
        # No 'items'/'statistics' in the response — unknown username.
        print(" ")
        print(" Error : Channel not Found ")
        print(" ")

if __name__ == "__main__":
    youtube_instance = api_key()
    youtube_instance.get_api_key()
    youtube = youtube_instance.get_youtube()
    get_channel_id(youtube,args.User)
--------------------------------------------------------------------------------
/src/get_channel_playlists.py:
--------------------------------------------------------------------------------
1 | import sqlite3, time
2 |
def get_channel_playlists(youtube,channel_id,single=False,playlistID=''):
    """Upsert every playlist of *channel_id* into tb_playlists and return
    the deduplicated list of playlist IDs found.

    single/playlistID: when single is True only the playlist whose ID
    equals playlistID is written (all IDs are still collected/returned).

    Bookkeeping columns the scraper does not own (Is_Seen, Worth,
    Downloaded_Videos, Folder_Size_GB) are carried over from any existing
    row so the INSERT OR REPLACE does not erase them.
    """
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()

    def _carry_over(column, playlist_id, as_flag):
        """Previously stored bookkeeping value for playlist_id, or 0.

        as_flag=True returns 1 only when the stored value is exactly 1;
        otherwise the stored value is returned when positive.  `column`
        is always one of four fixed literals, never user input.
        """
        try:
            cur.execute("SELECT %s FROM tb_playlists WHERE Playlist_ID = ?" % column,
                        (playlist_id,))
            value = int(cur.fetchone()[0])
            if as_flag:
                return 1 if value == 1 else 0
            return value if value > 0 else 0
        except Exception:
            return 0

    playlist_ids = []
    next_page_token = None

    while 1:
        res = youtube.playlists().list(part="snippet,contentDetails",
                                       channelId=channel_id,
                                       pageToken=next_page_token,
                                       maxResults=50
                                       ).execute()
        next_page_token = res.get('nextPageToken')

        # Process only this page's items; the old code re-walked the whole
        # accumulated list on every page, redoing earlier playlists
        # (accidental quadratic work and duplicate DB writes).
        for playlist in res['items']:
            Playlist_ID = playlist['id']
            playlist_ids.append(Playlist_ID)
            if single == True and playlist['id'] != playlistID:
                continue

            Playlist_title = playlist['snippet']['title']
            Channel_Id = playlist['snippet']['channelId']
            Channel_Title = playlist['snippet']['channelTitle']
            Published_At = playlist['snippet']['publishedAt']
            Current_Video_Count = playlist['contentDetails']['itemCount']
            # Seconds/duration are filled in later by the playlist scraper.
            Playlist_Seconds = 0
            Playlist_Duration = '0'
            # 0 = not seen, 1 = seen
            Is_Seen = _carry_over("Is_Seen", Playlist_ID, as_flag=True)
            Worth = _carry_over("Worth", Playlist_ID, as_flag=True)
            Downloaded_Videos = _carry_over("Downloaded_Videos", Playlist_ID, as_flag=False)
            Folder_Size_GB = _carry_over("Folder_Size_GB", Playlist_ID, as_flag=False)

            params = (Playlist_ID,Playlist_title,Channel_Id,Channel_Title,Published_At,Current_Video_Count,Playlist_Seconds,Playlist_Duration,Is_Seen,Worth,0,0,Downloaded_Videos,Folder_Size_GB)
            cur.execute("INSERT OR REPLACE INTO tb_playlists VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,?, 0, 1)", params)
            last_time = time.time()
            cur.execute("UPDATE tb_playlists SET Playlist_last_Scraped = ? WHERE Playlist_ID = ? ",(last_time,Playlist_ID))

        if next_page_token is None:
            break

    # Deduplicate and record the playlist count on the channel row.
    playlist_ids = list(set(playlist_ids))
    cur.execute("UPDATE tb_channels SET Playlist_Count = ? WHERE Channel_ID = ? ",(len(playlist_ids),channel_id))

    conn.commit() # Push the data into database
    conn.close()

    return playlist_ids

if __name__ == "__main__":
    pass
--------------------------------------------------------------------------------
/src/get_channel_videos.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 |
3 | from src.get_video_stats import get_videos_stats
4 |
def get_channel_videos(youtube,channel_id):
    """Collect every upload of *channel_id* that is not already stored
    with a playlist and hand the new IDs to get_videos_stats.

    Uses the channel's special "uploads" playlist to enumerate videos.
    """
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()

    res = youtube.channels().list(id=channel_id,
                                  part='contentDetails').execute()

    playlist_id = res['items'][0]['contentDetails']['relatedPlaylists']['uploads']

    videos = []
    next_page_token = None

    try:
        while 1:
            res = youtube.playlistItems().list(playlistId=playlist_id,
                                               part='snippet',
                                               maxResults=50,
                                               pageToken=next_page_token).execute()
            videos += res['items']
            next_page_token = res.get('nextPageToken')
            if next_page_token is None:
                break
        # Build the ID list once after paging; the old code re-mapped the
        # whole accumulated list on every page (accidental quadratic work).
        video_ids = list(map(lambda x: x['snippet']['resourceId']['videoId'], videos))
    except Exception:  # was a bare except
        print("Channel has no Original Videos")
        video_ids = []

    # Videos already attached to a playlist in the DB do not need re-parsing.
    CVids = []
    cur.execute("SELECT Video_ID FROM tb_videos WHERE Channel_ID=? AND Playlist_ID IS NOT NULL",(channel_id,))
    for row in cur.fetchall():
        CVids.append(row[0])
    new_video_ids = list(set(video_ids) - set(CVids))
    conn.commit() # Push the data into database
    conn.close()

    print('\nParsing ',len(new_video_ids),' videos, which are not in any playlist')
    get_videos_stats(youtube,new_video_ids,flag=1)

if __name__ == "__main__":
    pass
--------------------------------------------------------------------------------
/src/get_playlist_videos.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 | from datetime import timedelta
3 |
4 | from src.get_video_stats import get_videos_stats
5 | from src.get_channel_details import get_channel_details
6 |
def get_playlist_videos(youtube,playlistID,ec=False,ch_id=None):
    """Insert placeholder rows for every video in *playlistID*, let
    get_videos_stats fill in the real data, then refresh the playlist's
    aggregate columns in tb_playlists.

    ec:    True when scraping an entire channel; placeholder rows then
           carry ch_id and a different flag layout.
    ch_id: the owning channel's ID (only used when ec is True).
    """

    # 'skip' means no video in the playlist exposed a channelId — e.g. an
    # empty or fully-removed playlist.
    ch_ID = 'skip'
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()

    videos = []
    next_page_token = None
    video_IDS = []
    # Page through the playlist 50 items at a time.
    while 1:
        res = youtube.playlistItems().list(part="snippet",
            maxResults=50,
            playlistId=playlistID,
            pageToken=next_page_token
            ).execute()
        videos += res['items']
        next_page_token = res.get('nextPageToken')

        if next_page_token is None:
            break


    for video in videos:

        Video_id = video['snippet']['resourceId']['videoId']; video_IDS.append(Video_id)
        try:
            ch_ID = video['snippet']['channelId']
        except:
            ch_ID = 'skip'
        # Placeholder rows only: INSERT OR IGNORE keeps any existing real row.
        if ec == True:
            params = (Video_id,"",0,0,ch_id,None,None,0,ch_id,'',0,0,0,0,0,'',0,0,1,0)
            cur.execute("INSERT OR IGNORE INTO tb_videos VALUES (?, ?, ?,? ,?, ?, ?, ?,?, ?,?,?,?,?,?,?,?,?,?,?)", params)
        else:
            params = (Video_id,"",0,0,"","","")
            cur.execute("INSERT OR IGNORE INTO tb_videos VALUES (?, ?, ?,? ,?, ?, ?, 0,'', '',0,0,0,0,0,'',0,0,0,0)", params)


    conn.commit() # Push the data into database
    conn.close()

    if ch_ID == 'skip':
        # No usable videos: mark the playlist removed when it is empty.
        conn = sqlite3.connect('youtube.db')
        cur = conn.cursor()
        cur.execute("SELECT Current_Video_Count FROM tb_playlists WHERE playlist_ID = ? ",(playlistID,))
        num = cur.fetchone()
        num=num[0]
        print(num)
        if num == 0:
            cur.execute("UPDATE tb_playlists SET Is_Removed = ? WHERE playlist_ID = ? ",(1,playlistID))
        conn.commit() # Push the data into database
        conn.close()
        return 0
    else:
        # Outside an entire-channel scrape, make sure the owning channel
        # has its details row before the videos are parsed.
        if ec == False:
            get_channel_details(youtube,ch_ID,True,playlistID)

    Playlist_Seconds,num_new = get_videos_stats(youtube,video_IDS,1,playlistID)
    print('Videos in this playlist =',num_new)
    Playlist_Duration = str(timedelta(seconds = Playlist_Seconds))
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()

    # Refresh the playlist's aggregate columns from the freshly parsed data.
    cur.execute("SELECT Current_Video_Count FROM tb_playlists WHERE playlist_ID = ? ",(playlistID,))
    num = cur.fetchone()
    num=num[0]
    if num != num_new:
        cur.execute("UPDATE tb_playlists SET Current_Video_Count = ? WHERE playlist_ID = ? ",(num_new,playlistID))

    cur.execute("UPDATE tb_playlists SET Playlist_Seconds = ? WHERE playlist_ID = ? ",(Playlist_Seconds,playlistID))
    cur.execute("UPDATE tb_playlists SET Playlist_Duration = ? WHERE playlist_ID = ? ",(Playlist_Duration,playlistID))
    cur.execute("SELECT COUNT(Video_ID) FROM tb_videos WHERE Is_Deleted = ? AND playlist_ID = ? ",(1,playlistID))
    num = cur.fetchone()
    num=num[0]
    cur.execute("UPDATE tb_playlists SET Deleted_Videos = ? WHERE playlist_ID = ? ",(num,playlistID))
    conn.commit() # Push the data into database
    conn.close()

if __name__ == "__main__":
    pass
--------------------------------------------------------------------------------
/src/get_video_stats.py:
--------------------------------------------------------------------------------
1 | from datetime import timedelta
2 | import sqlite3, time
3 | from os import path
4 | import sys
5 |
def get_videos_stats(youtube,video_ids,flag=1,playlistID = None):
    """Fetch statistics/contentDetails for *video_ids* (batched 50 per
    API call) and upsert them into tb_videos.

    flag: 1 -> INSERT OR REPLACE, 2 -> INSERT OR IGNORE; internally set
          to 3 for an item that comes back without an id (marked deleted)
          and then restored.
    playlistID: when given, stamped on every row, and the playlist's
          total seconds are accumulated.

    Returns (total_seconds, distinct_video_count) — but only when
    total_seconds > 0; otherwise returns None implicitly, so callers that
    unpack the result assume at least one timed video.
    """
    oflag = flag  # remembered so the per-video 'deleted' override can be undone
    if not path.exists('youtube.db'):
        print("Please Create the database First")
        sys.exit()
    else:
        pass

    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()
    count1 = 0
    stats = []
    tot_len = 0
    # The API accepts at most 50 IDs per videos().list call.
    for i in range(0, len(video_ids), 50):
        res = youtube.videos().list(id=','.join(video_ids[i:i+50]),
                   part='snippet,statistics,contentDetails').execute()
        stats += res['items']

    new_ids = []
    for video in stats:
        count1 += 1
        try:
            Video_id = video['id']
        except:
            Video_id = ''
            flag = 3  # item came back without an id: treat as deleted
        new_ids.append(Video_id)
        Video_title = video['snippet']['title']
        Upload_playlistId = video['snippet']['channelId']

        if playlistID is not None:
            Playlist_Id = playlistID # When call is from a playlist
        else:
            # Keep whatever playlist the video was already linked to.
            cur.execute("SELECT Playlist_ID FROM tb_videos WHERE Video_ID = ?" ,(Video_id,))
            result = cur.fetchone()
            if result is None:
                Playlist_Id = None
            else:
                if type(result) is tuple:
                    Playlist_Id = result[0]
                elif type(result) is str:
                    Playlist_Id = result
                else:
                    Playlist_Id = None
        Published_At = video['snippet']['publishedAt']
        date_format = "%Y-%m-%dT%H:%M:%SZ"
        epoch = float(time.mktime(time.strptime(Published_At, date_format)))
        Channel_Id = video['snippet']['channelId']
        Channel_Title = video['snippet']['channelTitle']
        try:
            View_Count = video['statistics']['viewCount']
        except:
            View_Count = 1111  # sentinel for videos with hidden view counts
        # Skip the row when the stored view count is higher than the fresh
        # one.  NOTE(review): View_Count comes back as a string; if the
        # stored value is numeric, the < raises TypeError, which the
        # except silently swallows — confirm the intended comparison.
        cur.execute("SELECT View_Count FROM tb_videos WHERE Video_ID = ?" ,(Video_id,))
        temp = cur.fetchone()
        try:
            temp = temp[0]
            if View_Count < temp:
                continue
        except:
            pass
        try:
            Like_Count = video['statistics']['likeCount']
        except:
            Like_Count = 0
        try:
            Dislike_Count = video['statistics']['dislikeCount']
        except:
            Dislike_Count = 0
        try:
            Upvote_Ratio = round(((int(Like_Count)/(int(Like_Count)+(int(Dislike_Count))))*100),3)
        except:
            Upvote_Ratio = 0  # zero likes+dislikes -> division by zero
        try:
            Comment_Count = video['statistics']['commentCount']
        except:
            Comment_Count = 0
        try:
            # Convert an ISO-8601 duration like PT1H2M3S into HH:MM:SS and
            # total seconds; each matched component is zero-padded and
            # stripped from the working string.
            Duration = str(video['contentDetails']['duration'])
            Duration = Duration.replace('PT','')
            hh=mm=ss = '00'
            if Duration.find('H') != -1:
                hh = Duration.split('H')[0]
                temp = hh+'H'
                if len(hh) == 1:
                    hh = '0'+hh
                Duration = Duration.replace(temp,'')
            if Duration.find('M') != -1:
                mm = Duration.split('M')[0]
                temp = mm+'M'
                if len(mm) == 1:
                    mm = '0'+mm
                Duration = Duration.replace(temp,'')
            if Duration.find('S') != -1:
                ss = Duration.split('S')[0]
                if len(ss) == 1:
                    ss = '0'+ss
            Duration = (hh+':'+mm+':'+ss)
            video_seconds = timedelta(hours = int(hh),
                        minutes= int(mm),
                        seconds= int(ss)).total_seconds()
            # if playlistID is not None:
            tot_len += video_seconds
        except:
            Duration = '0'
            video_seconds = 0

        try:
            Is_Licensed = video['contentDetails']['licensedContent']
        except:
            Is_Licensed = 0
        # Carry over user bookkeeping so the upsert does not reset it.
        cur.execute("SELECT Is_Seen FROM tb_videos WHERE Video_ID = ?" ,(Video_id,))
        temp = cur.fetchone()
        try:
            temp = temp[0]
            if temp == 1:
                Is_Seen = 1
            else:
                Is_Seen = 0
        except:
            Is_Seen = 0
        # 0 = not seen 1 = seen
        cur.execute("SELECT Worth FROM tb_videos WHERE Video_ID = ?" ,(Video_id,))
        temp = cur.fetchone()
        try:
            temp = temp[0]
            if temp == 1:
                Worth = 1
            else:
                Worth = 0
        except:
            Worth = 0

        cur.execute("SELECT Is_Downloaded FROM tb_videos WHERE Video_ID = ?" ,(Video_id,))
        temp = cur.fetchone()
        try:
            temp = temp[0]
            if temp == 1:
                Is_Downloaded = 1
            else:
                Is_Downloaded = 0
        except:
            Is_Downloaded = 0
        Is_Deleted = 0
        if flag == 1 or flag == 2:
            Is_Deleted = 0
        elif flag == 3:
            Is_Deleted = 1
            print(Video_id,' is deleted')
            cur.execute("UPDATE tb_videos SET IS_Deleted = 1 WHERE Video_ID = ?",(Video_id,))
        flag = oflag  # restore the caller's insert mode for later videos
        params = (Video_id,Video_title,Is_Seen,Worth,Upload_playlistId,Playlist_Id,Published_At,epoch,Channel_Id,Channel_Title,View_Count,Like_Count,Dislike_Count,Upvote_Ratio,Comment_Count,Duration,video_seconds,Is_Licensed,Is_Deleted,Is_Downloaded)
        if flag == 1:
            cur.execute("INSERT OR REPLACE INTO tb_videos VALUES (?, ?, ?, ?, ?, ?, ? ,? ,? ,? ,? ,? , ?, ?, ?, ?, ?, ?, ?, ?)", params)
        elif flag == 2:
            cur.execute("INSERT OR IGNORE INTO tb_videos VALUES (?, ?, ?, ?, ?, ?, ? ,? ,? ,? ,? ,? , ?, ?, ?, ?, ?, ?, ?, ?)", params)
    conn.commit()
    conn.close()

    # IDs that were requested but not returned by the API are gone from
    # YouTube: insert a stub row for each and flag it deleted.
    video_ids = set(video_ids)
    new_ids = set(new_ids)
    num_new = len(new_ids)
    diff = video_ids-new_ids
    if len(diff) > 0:
        conn = sqlite3.connect('youtube.db')
        cur = conn.cursor()
        for item in diff:
            print(item,' not available')
            try:
                # NOTE(review): Channel_Id/Channel_Title here are whatever
                # the last processed video left behind, and are unbound
                # when the API returned nothing at all — the bare except
                # hides that case.
                params = (item,'Not Available',0,0,Channel_Id,playlistID,'','',Channel_Id,Channel_Title,'','','','','','','','',1,0)
                cur.execute("INSERT OR IGNORE INTO tb_videos VALUES (?, ?, ?, ?, ?, ?, ? ,? ,? ,? ,? ,? , ?, ?, ?, ?, ?, ?, ?, ?)", params)
                cur.execute("UPDATE tb_videos SET IS_Deleted = 1 WHERE Video_ID = ?",(item,))
            except:
                pass
        conn.commit()
        conn.close()
    if tot_len > 0:
        return tot_len,num_new

if __name__ == "__main__":
    pass
--------------------------------------------------------------------------------
/src/import_downloaded_items.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | import subprocess, os,re
3 | #SELECT * FROM tb_videos WHERE Video_ID IN (SELECT Video_ID FROM yt_downloaded) ORDER BY Is_Seen
4 | # Sanity Check
5 | from os import listdir
6 | from os.path import isfile, join
7 | import sqlite3
8 |
9 | from get_api_key import api_key
10 | from get_video_stats import get_videos_stats
11 |
def update_local(vid_path):
    """Probe *vid_path* with ffmpeg and return its media properties.

    Returns (Resolution, raw_size, size_MB, fps, bitrate, Audio_Type,
    Frequency, Channels).  Fields that cannot be parsed from the ffmpeg
    output fall back to '' / 0 instead of raising (the old version died
    with an UnboundLocalError on unparseable files).  Regex patterns are
    now raw strings, fixing invalid-escape-sequence warnings.
    """
    # Defaults so a file ffmpeg cannot analyse still yields a full tuple.
    Resolution = ''
    fps = 0
    bitrate = 0
    Audio_Type = ''
    Frequency = 0
    Channels = ''

    quoted = '"' + vid_path + '"'
    command = "./ffmpeg -i " + quoted + " -hide_banner"
    try:
        # ffmpeg writes the stream description to stderr; capture all of it.
        with open('log1.txt', "w", encoding='utf-8') as outfile:  # latin-1
            subprocess.run(command, stderr=subprocess.STDOUT, stdout=outfile)
    except Exception as e:
        print(e)
    with open('log1.txt', "r", encoding='utf-8') as fhand:
        for line in fhand:
            line = line.lstrip()
            if line[0:11] == 'Stream #0:0':
                # Video stream: resolution and frame rate.
                result = re.findall(r'\d+x\d+', line)
                if result:
                    Resolution = result[0]
                result = re.findall(r'[0-9.]+ fps', line)
                try:
                    fps = result[0].strip(' fps')
                except Exception:
                    fps = 0
            if line.startswith('Duration:'):
                result = re.findall(r'\d+ ', line)
                if result:
                    bitrate = result[0]
            if line.startswith('Stream #0:1'):
                # Audio stream: codec name, sample rate, channel layout.
                result = re.findall(r'Audio: [a-zA-Z]+,', line)
                if result:
                    Audio_Type = result[0].strip(',')[7:]
                result = re.findall(r'\d+ Hz', line)
                if result:
                    Frequency = result[0].strip(' Hz')
                result = re.findall(r'Hz, \w+,', line)
                if result:
                    Channels = result[0].strip('Hz ,')
    raw_size = Path(vid_path).stat().st_size
    size = round(raw_size / (1024 * 1024), 3)
    return (Resolution, raw_size, size, fps, bitrate, Audio_Type, Frequency, Channels)
51 |
def import_vids(mypath='D:\\Youtube1', db_path='C:\\Users\\Sambit\\Desktop\\Projects\\Youtube\\Youtube_Scraper\\youtube.db'):
    """Walk *mypath* for video files and register any new ones in the
    yt_downloaded table of the database at *db_path*.

    Both locations were hard-coded; they are now parameters whose defaults
    preserve the old behavior.  Video IDs listed in skip_files.txt are
    ignored.  The 11-character YouTube ID is assumed to sit immediately
    before the 3-character extension ("..._<id>.mkv") — TODO confirm for
    4-character extensions like .mpeg.
    """
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    # r=root, d=directories, f = files
    for r, d, f in os.walk(mypath):
        for file in f:
            if file.endswith(("mp4", "mkv", "flv", "wmv", "avi", "mpg", "mpeg")):
                vid_path = os.path.join(r, file)
                vid_id = vid_path[-15:-4]
                # was `as f`, which shadowed the walk's file list
                with open("skip_files.txt") as skip_fp:
                    if vid_id in skip_fp.read():
                        continue
                vid_type = vid_path[-3:]
                cur.execute("SELECT Video_ID FROM yt_downloaded WHERE Video_ID = ?", (vid_id,))
                if cur.fetchone() is None:
                    # Only probe files we have not catalogued yet.
                    Resolution, raw_size, size, fps, bitrate, Audio_Type, Frequency, Channels = update_local(vid_path)
                    params = (vid_id, Resolution, raw_size, size, vid_type, fps, bitrate, Audio_Type, Frequency, Channels, 0)
                    cur.execute("INSERT OR REPLACE INTO yt_downloaded VALUES (?,?,?,?,?,?,?,?,?,?,?)", params)

    conn.commit()
    conn.close()
74 |
def update_vids():
    """Push locally downloaded videos into the main tb_videos table.

    Processes yt_downloaded rows in batches of 50: fetches their stats from
    the YouTube API (get_videos_stats inserts them into tb_videos), then flags
    each as downloaded/seen/worth in tb_videos.
    """
    # One canonical DB path for the whole function (see BUG FIX note below).
    db_path = 'C:\\Users\\Sambit\\Desktop\\Projects\\Youtube\\Youtube_Scraper\\youtube.db'

    def is_in_main():
        # Sync flag: mark yt_downloaded rows whose ID already exists in tb_videos.
        # Simplified from a redundant double-nested subquery — the UPDATE already
        # targets yt_downloaded rows, so the inner self-select added nothing.
        conn = sqlite3.connect(db_path)
        cur = conn.cursor()
        cur.execute("UPDATE yt_downloaded SET Is_In_Main = 1 WHERE Video_ID IN "
                    "(SELECT Video_ID FROM tb_videos)")
        conn.commit()
        conn.close()
    # is_in_main()
    for i in range(2000):  # hard cap on batches (2000 * 50 = 100k videos)
        conn = sqlite3.connect(db_path)
        cur = conn.cursor()
        cur.execute("SELECT Count(*) FROM yt_downloaded")
        tot = cur.fetchone()
        cur.execute("SELECT Video_ID FROM yt_downloaded WHERE Is_In_Main = 0 LIMIT 50")
        temp = cur.fetchall()
        if len(temp) < 1:
            print("All Videos (locally downloaded) are now in main table tb_videos")
            conn.close()
            break
        result = []
        for item in temp:
            # Flag optimistically before the API call so the batch is not re-fetched.
            cur.execute("UPDATE yt_downloaded SET Is_In_Main = 1 WHERE Video_ID = ?", (item[0],))
            result.append(item[0])

        conn.commit()
        conn.close()

        print('Parsing Downloaded Videos :', (i * 50), ' / ', tot[0], end="\r")
        print(' ')
        youtube_instance = api_key()
        youtube_instance.get_api_key()
        youtube = youtube_instance.get_youtube()
        get_videos_stats(youtube, result, 1)
        # BUG FIX: this connection previously used the relative path 'youtube.db',
        # which silently targets a *different* database whenever the script is not
        # run from the project root; use the same absolute path as the rest.
        conn = sqlite3.connect(db_path)
        cur = conn.cursor()
        for item in result:
            print('New Item added successfully :', item)
            # Merged the three separate UPDATE statements into one.
            cur.execute("UPDATE tb_videos SET Is_Downloaded = 1, Is_Seen = 1, Worth = 1 WHERE Video_ID = ?", (item,))
        conn.commit()
        conn.close()
    is_in_main()
118 |
if __name__ == "__main__":
    # Import newly downloaded files first, then push them into the main table.
    import_vids()
    update_vids()
122 |
--------------------------------------------------------------------------------
/src/load_history.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 | import sqlite3, time
3 | from bs4 import BeautifulSoup
4 |
5 | from src.get_video_stats import get_videos_stats
6 | from src.get_api_key import api_key
7 |
8 |
def update_title():
    """Back-fill missing titles in video_history from the tb_videos table."""
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()
    cur.execute("SELECT Video_ID FROM video_history WHERE (Title IS NULL OR Title = '') AND Is_Deleted = 0")
    temp = cur.fetchall()
    for item in temp:
        cur.execute("SELECT Video_title FROM tb_videos WHERE Video_ID = ?", (item[0],))
        tit = cur.fetchone()
        # BUG FIX: a video not (yet) present in tb_videos yields no row; the
        # original crashed here with "'NoneType' object is not subscriptable".
        if tit is not None:
            cur.execute("UPDATE video_history SET Title = ? WHERE Video_ID = ?", (tit[0], item[0]))
    conn.commit()
    conn.close()
def update_deleted():
    """Flag video_history rows whose video no longer exists in tb_videos."""
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()
    # Simplified from the original double-nested subquery: a history entry is
    # "deleted" exactly when its ID is absent from the main tb_videos table.
    cur.execute("UPDATE video_history SET Is_Deleted = 1 WHERE Video_ID NOT IN (SELECT Video_ID FROM tb_videos)")
    conn.commit()
    conn.close()
def update_is_seen():
    """Mark every tb_videos entry that appears in the watch history as seen."""
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()
    # Simplified from the original double-nested subquery: the UPDATE already
    # targets tb_videos, so one membership check against video_history suffices.
    cur.execute("UPDATE tb_videos SET Is_Seen = 1 WHERE Video_ID IN (SELECT Video_ID FROM video_history)")
    conn.commit()
    conn.close()
33 |
def update_is_in_main():
    """Mark history rows whose video already exists in the main tb_videos table."""
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()
    # Simplified from the original double-nested subquery: the UPDATE already
    # targets video_history, so one membership check against tb_videos suffices.
    cur.execute("UPDATE video_history SET Is_in_Main = 1 WHERE Video_ID IN (SELECT Video_ID FROM tb_videos)")
    conn.commit()
    conn.close()
42 |
def update_history(youtube):
    """Fetch API stats for watch-history videos not yet in tb_videos, in batches of 50.

    youtube: an authorized YouTube API client, passed through to get_videos_stats.
    """
    for i in range(2000):  # hard cap on batches (2000 * 50 = 100k videos)
        conn = sqlite3.connect('youtube.db')
        cur = conn.cursor()
        cur.execute("SELECT Count(*) FROM video_history")
        tot = cur.fetchone()
        cur.execute("SELECT Video_ID FROM video_history WHERE Is_in_Main = 0 AND Is_Deleted = 0 LIMIT 50;")
        temp = cur.fetchall()
        # BUG FIX: was `len(temp) < 2`, which silently dropped a final batch of
        # exactly one video; stop only when the batch is empty. Also close the
        # connection before breaking (the original leaked it here).
        if not temp:
            print("All Videos From Watched History are now in main table tb_videos")
            conn.close()
            break
        result = []
        for item in temp:
            # Flag optimistically before the API call so the batch is not re-fetched.
            cur.execute("UPDATE video_history SET Is_in_Main = 1 WHERE Video_ID = ?", (item[0],))
            result.append(item[0])

        conn.commit()
        conn.close()
        print('Parsing Watch History Videos :', (i * 50), ' / ', tot[0], end="\r")
        get_videos_stats(youtube, result, 1)
    update_is_in_main()
64 |
def load_history(res='n'):
    """Parse the Google Takeout watch-history HTML and load it into video_history.

    res: 'y'/'Y' additionally fetches API stats for every new video and
    refreshes the derived Title / Is_Deleted / Is_Seen / Is_in_Main columns.
    """
    count_loc_prog = 0
    with open("takeout/history/watch-history.html",encoding='utf-8') as fp:
        conn = sqlite3.connect('youtube.db')
        cur = conn.cursor()

        soup = BeautifulSoup(fp,'lxml')
        soup = soup.body

        # Each watched entry lives in one of these Takeout content cells.
        videos = soup.find_all("div", {"class": "content-cell mdl-cell mdl-cell--6-col mdl-typography--body-1"})

        print(len(videos))

        for video in videos:
            count_loc_prog += 1
            if count_loc_prog % 500 == 0:
                print('Loading into Database : ',count_loc_prog,' / ',len(videos),end="\r")
            tags = video.find_all('a')

            # Cells with no anchor carry no video link; nothing to record.
            if tags == []:
                continue

            # First anchor's href ends in "...watch?v=<video_id>"; keep the ID.
            V_link = tags[0].get('href')
            V_link = V_link.split('=')[-1]
            br_tags = video.find_all('br')
            for tag in br_tags:
                watched_at = str(tag.next_sibling)
                # Crude timestamp detector: chars [-3:-1] are 'IS' for lines
                # ending in "IST" (the final 'T' is not compared).
                if watched_at[-3:-1] == 'IS':
                    final_time = (watched_at)
                    temp = final_time.replace('IST','+0530')
                    # NOTE(review): time.mktime interprets the struct_time in the
                    # machine's *local* timezone and ignores the parsed %z offset,
                    # so this epoch is only correct on an IST machine -- confirm.
                    epoch = time.mktime(time.strptime(temp, "%b %d, %Y, %I:%M:%S %p %z"))
                    # 6 columns: Video_ID, Title(blank), watched text, epoch,
                    # Is_in_Main=0, Is_Deleted=0. Duplicates are ignored.
                    cur.execute("INSERT OR IGNORE INTO video_history VALUES (?,?,?,?,?,?)", (V_link,'',final_time,epoch,0,0))


        conn.commit() # Push the data into database
        conn.close()
    print("\n Loaded \n")

    if res == 'y' or res == "Y":
        # Optional enrichment pass via the YouTube Data API.
        youtube_instance = api_key()
        youtube_instance.get_api_key()
        youtube = youtube_instance.get_youtube()
        update_history(youtube)
        update_title()
        update_deleted()
        update_is_seen()
        update_is_in_main()
113 |
if __name__ == "__main__":
    # No standalone entry point; import this module and call load_history().
    pass
--------------------------------------------------------------------------------
/src/most_watched.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 |
def most_watched(n=5):
    """Print the n most re-watched videos from the watch history.

    Joins against tb_videos for the title; videos missing from the local
    database are labelled as such.
    """
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()
    cur.execute("SELECT video_history.Video_ID,COUNT(video_history.Video_ID) AS cnt, Video_title FROM video_history \
                LEFT OUTER JOIN tb_videos on tb_videos.Video_ID = video_history.Video_ID \
                GROUP BY video_history.Video_ID ORDER BY cnt DESC;")
    results = cur.fetchmany(n)
    print("\t"," Video Link","\t","\t","\t"," Times Watched","\t","\t"," Video Name")
    print("-------------------------------------------------------------------------------------------------------")
    for result in results:
        Link = "https://www.youtube.com/watch?v="+result[0]
        # A NULL title means the LEFT JOIN found no matching tb_videos row.
        if result[2] is None:
            title = "Video is not available in local database"
        else:
            title = result[2]
        print(Link,'\t',result[1],'\t',title)
    # FIX (minor): removed the needless commit() -- this function only reads.
    conn.close()
21 |
if __name__ == "__main__":
    # No standalone entry point; import this module and call most_watched().
    pass
--------------------------------------------------------------------------------
/src/oldest_videos.py:
--------------------------------------------------------------------------------
1 | from get_api_key import api_key
2 |
3 | import argparse
4 | import os
5 | from datetime import datetime
6 |
# Build an authorized YouTube API client at import time (reads the stored key).
youtube_instance = api_key()
youtube_instance.get_api_key()
youtube = youtube_instance.get_youtube()

# NOTE(review): argparse also runs at import time, so importing this module
# without CLI arguments exits with a usage error -- confirm intended.
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,\
    description='Explore the oldest videos on a Topic',\
    epilog='''Examples \n .\oldest_videos.py tesla \n .\oldest_videos.py "game of thrones" -n 15 -s 2012''')
parser.add_argument("topic", help='Enter the topic')
group2 = parser.add_argument_group()
group2.add_argument('-n','--max_results',type=int, metavar='', default=5, help='The script will display "n" results')
group2.add_argument('-s','--start_year',type=int, metavar='', default=2005, help='By default, it will search from 2005')
group2.add_argument('-e','--end_year',type=int, metavar='', default=2010, help='By default, it will search till 2010')

parser.add_argument('-o','--output', action='store_true', help='output to a File')


args = parser.parse_args()
24 |
25 |
def oldest_videos_on_a_topic(topic,Max_limit,start_yr,end_yr):
    """Search YouTube for the oldest videos on *topic* and print or save up to
    Max_limit of them.

    topic     : search phrase; only results whose title contains it are kept.
    Max_limit : stop after this many matching videos.
    start_yr  : earliest upload year (window starts Jan 1st of this year).
    end_yr    : upload year to search up to (exclusive, Jan 1st).
    """
    global youtube
    if args.output:
        # Open the output file once instead of reopening it per result.
        out = open("old_videos.txt",'w',encoding = 'utf-8')
    else:
        out = None
        print('\n')
        print('Video ID','\t','Upload Date/Time','\t','Video Title')
        print('--------','\t','----------------','\t','-----------')
    limit = 0
    # BUG FIX: the time window previously hard-coded 2005..2010 and silently
    # ignored the start_yr/end_yr arguments, so -s / -e had no effect. (Start
    # now uses Jan 1 rather than the original Apr 1; no YouTube videos predate
    # late April 2005, so the default window is effectively unchanged.)
    start_time = datetime(year=start_yr, month=1, day=1).strftime('%Y-%m-%dT%H:%M:%SZ')
    end_time = datetime(year=end_yr, month=1, day=1).strftime('%Y-%m-%dT%H:%M:%SZ')

    res = youtube.search().list(part='snippet',
                                q=topic,
                                type='video',
                                publishedAfter=start_time,
                                publishedBefore=end_time,
                                maxResults=50).execute()
    try:
        for item in sorted(res['items'], key=lambda x:x['snippet']['publishedAt']):
            # The API HTML-escapes quotes in titles; undo that before matching.
            title = str(item['snippet']['title']).replace('&#39;',"'").replace('&quot;','"')
            if topic.lower() not in title.lower():
                continue
            limit += 1
            publishedAt = datetime.strptime(item['snippet']['publishedAt'], "%Y-%m-%dT%H:%M:%SZ")
            if out is not None:
                out.write(item['id']['videoId']+'\t\t'+str(publishedAt)+'\t\t'+ title )
                out.write('\n')
            else:
                print(item['id']['videoId'],'\t',publishedAt,'\t', title )
            if limit == Max_limit:
                break
    finally:
        if out is not None:
            out.close()

    if args.output:
        print('\nDone! Check the file old_video.txt\n')
    else:
        print('\n')
67 |
if __name__ == "__main__":
    # BUG FIX: this previously prompted for a key and called get_api_key(key),
    # a function that does not exist in scope (NameError). The module-level
    # bootstrap above already built the `youtube` client, so just run the search.
    oldest_videos_on_a_topic(args.topic,args.max_results,args.start_year,args.end_year)
--------------------------------------------------------------------------------
/src/subscriptions.py:
--------------------------------------------------------------------------------
1 | import json
2 |
with open("takeout/subscriptions/subscriptions.json", encoding="utf-8") as f:
    subs = json.load(f)

# Deduplicate (item_count, channel_id, title) triples from the Takeout
# subscriptions export. (Former unused counter `i` removed.)
sub_list = {
    (sub["contentDetails"]["totalItemCount"],
     sub["snippet"]["resourceId"]["channelId"],
     sub["snippet"]["title"])
    for sub in subs
}

# Title-case the channel names and print them sorted alphabetically by name.
new_lst = sorted(
    ((count, chan_id, name.title()) for count, chan_id, name in sub_list),
    key=lambda entry: entry[2],
)
for sub in new_lst:
    print(sub)
--------------------------------------------------------------------------------
/src/vidsPerTime.py:
--------------------------------------------------------------------------------
1 | import sqlite3,datetime,calendar
2 |
3 |
4 |
def absolute_dates():
    """Exploratory helper: inspect the watch-history date range.

    NOTE(review): this function looks unfinished -- `oldest`/`newest` and the
    years/months lists are computed but never used; it is never called by
    this module (only relative_dates() runs).
    """
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()
    cur.execute("SELECT MIN(epoch) from video_history")
    oldest = int(cur.fetchone()[0])
    cur.execute("SELECT MAX(epoch) from video_history")
    newest = int(cur.fetchone()[0])

    years = [2018, 2019, 2020, 2021]
    months = [x for x in range(1, 13)]
    dates = []
    # BUG FIX: calendar.itermonthdates is a method of calendar.Calendar, not a
    # module-level function -- the original raised AttributeError here.
    cal = calendar.Calendar()
    temp = cal.itermonthdates(2021, 1)
    print(cal)
    conn.close()
31 |
def relative_dates():
    """Find and print the 366-day window with the most watch-history entries.

    Window starts slide hourly across 2020-01-01 .. 2020-01-10 (epoch range
    1577836800 .. 1578614400); the winner is reported as UTC datetimes.
    """
    conn = sqlite3.connect('youtube.db')
    cur = conn.cursor()

    max_res = 0
    # BUG FIX: the best-window bounds are now initialised up front; previously
    # they were only assigned inside the `if`, so a scan where every window was
    # empty crashed with NameError at the datetime conversion below. (The dead
    # `start`/`end` pre-assignments were removed.)
    start_year = 1577836800
    end_year = start_year + 31622399
    for start in range(1577836800, 1578614400, 3600):
        end = start + 31622399  # 366 days minus one second
        cur.execute("SELECT COUNT(Video_ID) from video_history WHERE epoch > ? AND epoch < ?", (start, end))
        count = int(cur.fetchone()[0])

        if count > max_res:
            max_res = count
            start_year = start
            end_year = end
        if start % 10000 == 0:
            print(start)  # coarse progress indicator
    conn.close()
    # Timezone-aware equivalent of the deprecated utcfromtimestamp().replace().
    start_year = datetime.datetime.fromtimestamp(start_year, tz=datetime.timezone.utc)
    end_year = datetime.datetime.fromtimestamp(end_year, tz=datetime.timezone.utc)
    print(max_res, start_year, end_year)
57 |
58 | relative_dates()
59 |
60 |
61 |
--------------------------------------------------------------------------------