├── .gitignore
├── geckodriver.exe
├── file_maker.py
├── link-extractor.py
└── transcriptor.py


/.gitignore:
--------------------------------------------------------------------------------
1 | /test.txt
2 | /video_links.txt


--------------------------------------------------------------------------------
/geckodriver.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Adityaadpandey/data_extractor-neutralizer/HEAD/geckodriver.exe


--------------------------------------------------------------------------------
/file_maker.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | def remove_timestamp(line):
 4 |     # Define pattern to match timestamp at the beginning of the line
 5 |     timestamp_pattern = re.compile(r'^\d+:\d+:\d+\.\d+\s')
 6 | 
 7 |     # Remove timestamp from the line
 8 |     return re.sub(timestamp_pattern, '', line)
 9 | 
def clean_text(input_file, output_file):
    """Write *input_file* to *output_file* minus its first line and timestamps.

    The input is read completely (as UTF-8) before the output is opened.
    The first line is discarded and every remaining line is passed through
    ``remove_timestamp`` before being written out as UTF-8.
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        raw_lines = list(source)

    # The first line is dropped, so only the rest needs its timestamp stripped.
    body = [remove_timestamp(raw) for raw in raw_lines[1:]]

    with open(output_file, 'w', encoding='utf-8') as sink:
        sink.write(''.join(body))
if __name__ == "__main__":
    # Script entry point: the input and output file names are fixed.
    source_path, target_path = "chc.txt", "tes.txt"
    clean_text(source_path, target_path)
    print("Timestamps removed successfully.")
28 | 


--------------------------------------------------------------------------------
/link-extractor.py:
--------------------------------------------------------------------------------
 1 | from youtubesearchpython import VideosSearch
 2 | from langdetect import detect
 3 | 
 4 | def get_video_links(search_query, max_results=50):
 5 |     video_links = []
 6 | 
 7 |     while len(video_links) < max_results:
 8 |         videos_search = VideosSearch(search_query, limit=min(20, max_results - len(video_links)))
 9 |         results = videos_search.result()
10 | 
11 |         for video in results['result']:
12 |             try:
13 |                 title = video['title']
14 |                 # Detect language based on the title
15 |                 if detect(title) == 'en':
16 |                     video_links.append(video['link'])
17 | 
18 |                     if len(video_links) >= max_results:
19 |                         break
20 |             except KeyError:
21 |                 pass
22 | 
23 |         if 'nextPageToken' not in results:
24 |             break
25 | 
26 |         videos_search = VideosSearch(search_query, limit=min(20, max_results - len(video_links)), pageToken=results['nextPageToken'])
27 | 
28 |     return video_links
29 | 
def save_links_to_file(video_links, file_path='video_links.txt'):
    """Write every link in *video_links* to *file_path*, one per line.

    Any existing file at *file_path* is overwritten.
    """
    with open(file_path, 'w') as out:
        out.writelines(url + '\n' for url in video_links)
34 | 
if __name__ == "__main__":
    # Script entry point: ask for a query, search, and store the links in
    # the default output file of save_links_to_file.
    query = input("Enter your YouTube search query: ")
    save_links_to_file(get_video_links(query))
    print("\nList of Video Links saved to 'video_links.txt'")
43 | 


--------------------------------------------------------------------------------
/transcriptor.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from selenium import webdriver
 3 | from selenium.webdriver.common.by import By
 4 | from selenium.webdriver.common.keys import Keys
 5 | from selenium.webdriver.support.ui import WebDriverWait
 6 | from selenium.webdriver.support import expected_conditions as EC
 7 | 
 8 | from selenium.webdriver.firefox.options import Options
 9 | import time
10 | 
def download_transcript(youtube_link):
    """Download the transcript of one YouTube video through the Tactiq web tool.

    Starts a headless Firefox session, opens the Tactiq transcript page,
    submits *youtube_link* in its form and clicks the download button. The
    browser is always closed afterwards.

    Args:
        youtube_link: URL of the YouTube video whose transcript is wanted.

    Returns:
        True if the download button was found and clicked, False if a
        browser step failed. Failures are printed rather than raised so one
        bad link does not abort a batch.
    """
    # Imported locally so this function stays self-contained; selenium is
    # already a dependency of this script.
    from selenium.common.exceptions import WebDriverException

    options = Options()
    # Use the command-line flag: the `options.headless` attribute is
    # deprecated/removed in recent Selenium 4 releases and is then ignored.
    options.add_argument("-headless")

    # PATH entries must be directories, so add the folder that contains the
    # executable, and only once, not again on every call.
    geckodriver_path = 'C:/Users/adity/test/geckodriver.exe'  # Replace with the actual path
    geckodriver_directory = os.path.dirname(geckodriver_path)
    current_path = os.environ.get('PATH', '')
    if geckodriver_directory not in current_path.split(os.pathsep):
        os.environ['PATH'] = current_path + os.pathsep + geckodriver_directory

    driver = webdriver.Firefox(options=options)

    try:
        driver.get('https://tactiq.io/tools/youtube-transcript#youtube-form-link')
        time.sleep(2)  # Allow time for the page to settle

        # Find the input field; poll for up to 10 s instead of a fragile 1 s.
        input_field = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "yt-2"))
        )

        # Paste the YouTube link and submit the form
        input_field.send_keys(youtube_link)
        input_field.send_keys(Keys.RETURN)

        # Wait for the transcript to load (adjust the sleep duration as needed)
        time.sleep(5)

        # Find the download button and click it
        driver.find_element(By.ID, "download").click()

        # Wait for the download to complete (adjust the sleep duration as needed)
        time.sleep(3)
        return True
    except WebDriverException as error:
        # Covers timeouts and missing elements; report instead of silently
        # swallowing every exception (including KeyboardInterrupt).
        print(f"Could not download transcript for {youtube_link}: {error}")
        return False
    finally:
        driver.quit()
48 | 
if __name__ == "__main__":
    # Script entry point: the links file name is fixed and matches the
    # default output path of save_links_to_file in link-extractor.py.
    links_path = "video_links.txt"

    # Read every link up front so the file is closed before any browser work.
    with open(links_path, 'r') as handle:
        pending = handle.read().splitlines()

    for url in pending:
        download_transcript(url)

    print("Transcripts downloaded successfully.")
60 | 


--------------------------------------------------------------------------------