# Directory layout:
#   ├── .gitignore
#   ├── geckodriver.exe
#   ├── file_maker.py
#   ├── link-extractor.py
#   └── transcriptor.py
#
# Each "/<name>:" banner below marks the start of one file of the repository.
# The three Python sections are independent scripts; each keeps its own
# imports and its own ``__main__`` entry point.

# --------------------------------------------------------------------------------
# /.gitignore:
# --------------------------------------------------------------------------------
#   /test.txt
#   /video_links.txt

# --------------------------------------------------------------------------------
# /geckodriver.exe:
# --------------------------------------------------------------------------------
#   https://raw.githubusercontent.com/Adityaadpandey/data_extractor-neutralizer/HEAD/geckodriver.exe

# --------------------------------------------------------------------------------
# /file_maker.py:
# --------------------------------------------------------------------------------
import re

# Timestamp at the very start of a line ("H:MM:SS.fff" followed by one
# whitespace character). Compiled once instead of on every call.
_TIMESTAMP_PATTERN = re.compile(r'^\d+:\d+:\d+\.\d+\s')


def remove_timestamp(line):
    """Return *line* with a leading ``H:MM:SS.fff`` timestamp removed.

    Only a timestamp at the start of the line is stripped, together with the
    single whitespace character that follows it. Lines without a leading
    timestamp are returned unchanged.
    """
    return _TIMESTAMP_PATTERN.sub('', line)


def clean_text(input_file, output_file):
    """Copy *input_file* to *output_file* without timestamps or first line.

    The first line of the input is dropped and every remaining line has its
    leading timestamp removed. Both files are handled as UTF-8.
    """
    with open(input_file, 'r', encoding='utf-8') as infile:
        lines = infile.readlines()

    # Skip the first line, then strip the timestamp from every other line.
    cleaned_lines = [remove_timestamp(line) for line in lines[1:]]

    with open(output_file, 'w', encoding='utf-8') as outfile:
        outfile.writelines(cleaned_lines)


if __name__ == "__main__":
    input_file_path = "chc.txt"
    output_file_path = "tes.txt"

    clean_text(input_file_path, output_file_path)
    print("Timestamps removed successfully.")

# --------------------------------------------------------------------------------
# /link-extractor.py:
# --------------------------------------------------------------------------------


def get_video_links(search_query, max_results=50):
    """Return up to *max_results* links of videos whose title looks English.

    Search results are fetched page by page and each title is run through
    ``langdetect``; only videos detected as ``'en'`` are kept. Videos without
    a title or link, and titles the detector cannot classify, are skipped.
    Every link appears at most once in the returned list.
    """
    if max_results <= 0:
        return []

    # Imported here so the file helpers in this module stay usable when the
    # search/detection packages are not installed.
    from langdetect import detect
    from langdetect.lang_detect_exception import LangDetectException
    from youtubesearchpython import VideosSearch

    video_links = []
    seen_links = set()

    # One search object is created and then advanced with next(). Creating a
    # fresh VideosSearch per iteration would restart at the first page.
    videos_search = VideosSearch(search_query, limit=min(20, max_results))

    while len(video_links) < max_results:
        page = videos_search.result().get('result') or []
        if not page:
            break

        for video in page:
            title = video.get('title')
            link = video.get('link')
            if not title or not link or link in seen_links:
                continue

            try:
                is_english = detect(title) == 'en'
            except LangDetectException:
                # Raised for titles with nothing to analyse (e.g. only
                # digits or emoji); such videos are skipped.
                continue

            if is_english:
                seen_links.add(link)
                video_links.append(link)
                if len(video_links) >= max_results:
                    break

        # next() loads the following page and reports whether there was one.
        if len(video_links) < max_results and not videos_search.next():
            break

    return video_links


def save_links_to_file(video_links, file_path='video_links.txt'):
    """Write *video_links* to *file_path*, one link per line (UTF-8)."""
    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(f"{link}\n" for link in video_links)


if __name__ == "__main__":
    search_query = input("Enter your YouTube search query: ")

    video_links = get_video_links(search_query)

    save_links_to_file(video_links)

    print("\nList of Video Links saved to 'video_links.txt'")

# --------------------------------------------------------------------------------
# /transcriptor.py:
# --------------------------------------------------------------------------------
import os
import time

TRANSCRIPT_TOOL_URL = 'https://tactiq.io/tools/youtube-transcript#youtube-form-link'

# geckodriver.exe ships next to this script in the repository, so it is
# looked up there by default instead of in a machine-specific location.
_DEFAULT_GECKODRIVER = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), 'geckodriver.exe'
)


def _add_driver_directory_to_path(executable_path):
    """Append the directory containing *executable_path* to PATH, once."""
    # PATH entries must be directories; the executable's own path is useless.
    directory = os.path.dirname(os.path.abspath(executable_path))
    current = os.environ.get('PATH', '')
    if directory not in current.split(os.pathsep):
        os.environ['PATH'] = (
            f"{current}{os.pathsep}{directory}" if current else directory
        )


def download_transcript(youtube_link, *, geckodriver_path=None, timeout=15,
                        transcript_wait=5):
    """Download the transcript of *youtube_link* through the tactiq.io tool.

    A headless Firefox session opens the transcript tool, types the link into
    the form field with id ``yt-2``, submits it with Return, and clicks the
    element with id ``download``.

    Args:
        youtube_link: URL of the video.
        geckodriver_path: Path to the geckodriver executable. Defaults to
            ``geckodriver.exe`` in this script's directory.
        timeout: Maximum seconds to wait for the form field and for the
            download button.
        transcript_wait: Seconds to pause after submitting the form before
            looking for the download button.

    Returns:
        True if the download button was clicked, False if the browser
        interaction failed (the failure is printed).
    """
    # Imported here so this module can be imported without Selenium installed.
    from selenium import webdriver
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.common.keys import Keys
    from selenium.webdriver.firefox.options import Options
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    options = Options()
    # Remove this argument to watch the browser window while it works.
    options.add_argument('-headless')

    _add_driver_directory_to_path(geckodriver_path or _DEFAULT_GECKODRIVER)

    driver = webdriver.Firefox(options=options)

    try:
        driver.get(TRANSCRIPT_TOOL_URL)
        wait = WebDriverWait(driver, timeout)

        # Find the input field, paste the YouTube link and submit the form.
        input_field = wait.until(
            EC.presence_of_element_located((By.ID, "yt-2"))
        )
        input_field.send_keys(youtube_link)
        input_field.send_keys(Keys.RETURN)

        # Give the transcript time to load before looking for the button,
        # then tolerate slower loads with an explicit wait.
        time.sleep(transcript_wait)
        download_button = wait.until(
            EC.element_to_be_clickable((By.ID, "download"))
        )
        download_button.click()

        # Wait for the download to complete before the browser is closed.
        time.sleep(3)
        return True
    except WebDriverException as exc:
        print(f"Could not download transcript for {youtube_link}: {exc}")
        return False
    finally:
        driver.quit()


if __name__ == "__main__":
    input_file = "video_links.txt"

    with open(input_file, 'r', encoding='utf-8') as file:
        # Blank lines are not links; skip them.
        youtube_links = [line.strip() for line in file if line.strip()]

    failed_links = []
    for link in youtube_links:
        if not download_transcript(link):
            failed_links.append(link)

    if failed_links:
        print(f"{len(failed_links)} of {len(youtube_links)} transcripts "
              "could not be downloaded.")
    else:
        print("Transcripts downloaded successfully.")