├── streamlit_app ├── __init__.py ├── styles │ ├── __init__.py │ └── css.py ├── components │ ├── __init__.py │ ├── progress.py │ ├── main_panel.py │ └── sidebar.py ├── core │ ├── __init__.py │ ├── keyframes_processor.py │ └── collection_runner.py └── utils │ ├── __init__.py │ ├── session_state.py │ └── file_browser.py ├── databases ├── __init__.py ├── utilities.py └── sql_manager.py ├── data_collectors ├── __init__.py ├── utilities.py └── collector.py ├── images └── streamlit-interface.png ├── .gitignore ├── config └── config.ini ├── media_handlers ├── __init__.py ├── video_downloader.py └── session_manager.py ├── requirements.txt ├── setup.py ├── app.py ├── utils └── __init__.py ├── main.py └── README.md /streamlit_app/__init__.py: -------------------------------------------------------------------------------- 1 | # Streamlit UI package -------------------------------------------------------------------------------- /streamlit_app/styles/__init__.py: -------------------------------------------------------------------------------- 1 | # Styles and CSS -------------------------------------------------------------------------------- /streamlit_app/components/__init__.py: -------------------------------------------------------------------------------- 1 | # UI Components -------------------------------------------------------------------------------- /streamlit_app/core/__init__.py: -------------------------------------------------------------------------------- 1 | # Core business logic -------------------------------------------------------------------------------- /streamlit_app/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Streamlit utilities -------------------------------------------------------------------------------- /databases/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from .sql_manager import SQLDatabaseManager 3 | -------------------------------------------------------------------------------- /data_collectors/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from .collector import TikTokDataCollector 3 | -------------------------------------------------------------------------------- /images/streamlit-interface.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/estebanpdl/tik-spyder/HEAD/images/streamlit-interface.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore Cache 2 | .ipynb_checkpoints/ 3 | __pycache__/ 4 | .vscode/ 5 | 6 | # package metadata 7 | *.egg-info 8 | -------------------------------------------------------------------------------- /config/config.ini: -------------------------------------------------------------------------------- 1 | [SerpAPI Key] 2 | api_key = your_serp_api_key 3 | 4 | [Apify Token] 5 | apify_token = your_apify_token 6 | -------------------------------------------------------------------------------- /media_handlers/__init__.py: -------------------------------------------------------------------------------- 1 | from .session_manager import RequestSession 2 | from .video_downloader import VideoDownloader 3 | -------------------------------------------------------------------------------- /requirements.txt: 
-------------------------------------------------------------------------------- 1 | aiohttp 2 | apify-client 3 | httpx 4 | pandas 5 | PySocks 6 | requests 7 | serpapi 8 | stem 9 | streamlit 10 | tqdm 11 | yt-dlp[default] 12 | -------------------------------------------------------------------------------- /streamlit_app/utils/session_state.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # import modules 4 | import streamlit as st 5 | import time 6 | 7 | def initialize_session_state(): 8 | """Initialize session state variables""" 9 | if 'output_dir' not in st.session_state: 10 | timestamp = int(time.time()) 11 | st.session_state.output_dir = f'./tikspyder-data/{timestamp}' -------------------------------------------------------------------------------- /streamlit_app/utils/file_browser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # import modules 4 | import tkinter as tk 5 | 6 | # import submodules 7 | from tkinter import filedialog 8 | 9 | def select_directory(): 10 | """Create a directory picker dialog""" 11 | root = tk.Tk() 12 | root.withdraw() 13 | root.wm_attributes('-topmost', 1) 14 | folder_path = filedialog.askdirectory() 15 | root.destroy() 16 | return folder_path -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # import modules 4 | import os 5 | from setuptools import setup, find_packages 6 | 7 | setup( 8 | name="tikspyder", 9 | version="0.1.0", 10 | packages=find_packages(), 11 | install_requires=[ 12 | "aiohttp", 13 | "apify-client", 14 | "pandas", 15 | "PySocks", 16 | "requests", 17 | "serpapi", 18 | "stem", 19 | "streamlit", 20 | "tqdm", 21 | "yt-dlp[default]" 22 | ], 23 | entry_points={ 24 | 'console_scripts': [ 25 | 'tikspyder=main:main', 26 | ], 27 | }, 28 | python_requires='>=3.6', 29 | author="Esteban Ponce de Leon", 30 | description="A tool for collecting TikTok data", 31 | long_description=open('README.md', encoding='utf-8').read() if os.path.exists('README.md') else '', 32 | long_description_content_type="text/markdown", 33 | ) 34 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # import modules 4 | import streamlit as st 5 | import os 6 | 7 | # local imports 8 | from streamlit_app.styles.css import load_css 9 | from streamlit_app.utils.session_state import initialize_session_state 10 | from streamlit_app.components.sidebar import render_sidebar 11 | from streamlit_app.components.main_panel import render_main_panel 12 | from streamlit_app.core.collection_runner import run_collection, validate_input 13 | from utils import get_config_attrs, get_project_root 14 | 15 | # Configure Streamlit page 16 | st.set_page_config( 17 | page_title="TikSpyder - TikTok Data Collection", 18 | page_icon="🕷️", 19 | layout="wide", 20 | initial_sidebar_state="expanded" 21 | ) 22 | 23 | # Set theme programmatically to dark 24 | st._config.set_option('theme.base', 'dark') 25 | st._config.set_option('theme.backgroundColor', '#0e1117') 26 | st._config.set_option('theme.secondaryBackgroundColor', '#262730') 27 | st._config.set_option('theme.textColor', '#ffffff') 28 | 29 | def main(): 30 | """Main application entry point""" 31 | # Load 
styling 32 | load_css() 33 | 34 | # Initialize session state 35 | initialize_session_state() 36 | 37 | # Get project configuration 38 | project_root = get_project_root() 39 | config_path = os.path.join(project_root, 'config') 40 | config_attrs = get_config_attrs(config_path) 41 | 42 | # Main header 43 | st.markdown('

<h1>🕷️ TikSpyder</h1>

', unsafe_allow_html=True) 44 | st.markdown('

<p>Advanced TikTok Data Collection</p>

', unsafe_allow_html=True) 45 | 46 | # Render UI components 47 | search_config, apify_config = render_sidebar() 48 | collection_config, start_collection = render_main_panel() 49 | 50 | # Handle collection start 51 | if start_collection: 52 | if validate_input(search_config): 53 | run_collection(search_config, apify_config, collection_config, config_attrs) 54 | 55 | if __name__ == '__main__': 56 | main() -------------------------------------------------------------------------------- /streamlit_app/components/progress.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # import modules 4 | import streamlit as st 5 | import time 6 | 7 | # local imports 8 | from ..styles.css import create_status_badge 9 | 10 | def create_progress_tracker(): 11 | """Create and return progress tracking components""" 12 | # Create status container 13 | status_container = st.container() 14 | 15 | with status_container: 16 | st.markdown("### 🔄 Collection Progress") 17 | 18 | # Create progress indicators 19 | overall_progress = st.progress(0) 20 | status_text = st.empty() 21 | step_container = st.container() 22 | 23 | # Collection steps with icons and descriptions 24 | steps = [ 25 | ("🔍", "Initializing search parameters..."), 26 | ("📡", "Collecting search results..."), 27 | ("🖼️", "Gathering image thumbnails..."), 28 | ("🚀", "Running Apify integration..."), 29 | ("📁", "Generating data files..."), 30 | ("📹", "Downloading videos..."), 31 | ("🎞️", "Extracting keyframes..."), 32 | ("✅", "Collection complete!") 33 | ] 34 | 35 | step_progress = {} 36 | for i, (icon, desc) in enumerate(steps): 37 | step_progress[i] = step_container.empty() 38 | 39 | return overall_progress, status_text, step_progress, steps 40 | 41 | def update_progress(step_num, overall_progress, status_text, step_progress, steps, message=None, progress_value=None): 42 | """Update progress indicators""" 43 | if step_num < len(steps): 44 | icon, desc = steps[step_num] 45 | step_progress[step_num].markdown(f"{icon} {desc}") 46 | 47 | if message: 48 | status_text.markdown(create_status_badge(message, "warning"), unsafe_allow_html=True) 49 | 50 | if progress_value is not None: 51 | overall_progress.progress(progress_value) 52 | 53 | time.sleep(0.1) # Allow UI to update 54 | 55 | def mark_step_complete(step_num, step_progress, message): 56 | """Mark a step as completed""" 57 | step_progress[step_num].markdown(f"✅ {message}") -------------------------------------------------------------------------------- /streamlit_app/core/keyframes_processor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # import modules 4 | import os 5 | import glob 6 | import subprocess 7 | from concurrent.futures import ThreadPoolExecutor, as_completed 8 | 9 | def extract_keyframes_sync(output_dir, max_workers=3): 10 | """Synchronous keyframes extraction - no async conflicts""" 11 | # Build keyframes path 12 | keyframes_path = f'{output_dir}/keyframes' 13 | if not os.path.exists(keyframes_path): 14 | os.makedirs(keyframes_path) 15 | 16 | # Get all video files 17 | video_path = f'{output_dir}/downloaded_videos' 18 | if not os.path.exists(video_path): 19 | return 20 | 21 | files = glob.glob(f'{video_path}/*.mp4') 22 | if not files: 23 | return 24 | 25 | # Videos already processed 26 | processed_videos = [] 27 | if os.path.exists(keyframes_path): 28 | processed_videos = [d for d in os.listdir(keyframes_path) 29 | if 
os.path.isdir(os.path.join(keyframes_path, d))] 30 | 31 | def extract_single_video_keyframes(file): 32 | """Extract keyframes from a single video file""" 33 | try: 34 | # Get id from video filename 35 | video_id = os.path.basename(file).split('.')[0] 36 | if video_id in processed_videos: 37 | return 38 | 39 | # Create subdirectory for this video_id 40 | video_keyframes_dir = f'{keyframes_path}/{video_id}' 41 | if not os.path.exists(video_keyframes_dir): 42 | os.makedirs(video_keyframes_dir) 43 | 44 | # FFmpeg command to extract keyframes 45 | cmd = [ 46 | 'ffmpeg', 47 | '-i', file, 48 | '-vf', 'select=eq(pict_type\\,I)', 49 | '-vsync', 'vfr', 50 | '-q:v', '2', 51 | '-y', # Overwrite output files 52 | f'{video_keyframes_dir}/keyframe_%04d.jpg' 53 | ] 54 | 55 | # Run FFmpeg synchronously 56 | subprocess.run( 57 | cmd, 58 | stdout=subprocess.PIPE, 59 | stderr=subprocess.PIPE, 60 | text=True 61 | ) 62 | 63 | except Exception: 64 | # Silently handle errors 65 | pass 66 | 67 | # Process videos with controlled concurrency 68 | max_workers = min(max_workers, len(files)) 69 | 70 | with ThreadPoolExecutor(max_workers=max_workers) as executor: 71 | # Submit all tasks 72 | future_to_file = {executor.submit(extract_single_video_keyframes, file): file 73 | for file in files} 74 | 75 | # Process completed tasks silently 76 | for future in as_completed(future_to_file): 77 | result = future.result() 78 | # Silently handle results - no UI spam 79 | pass -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # import modules 4 | import os 5 | 6 | # typing 7 | from typing import Dict 8 | 9 | # import submodules 10 | from configparser import ConfigParser 11 | from datetime import datetime 12 | 13 | def get_project_root(): 14 | """Get the project root directory.""" 15 | # Get the directory where main.py is located 16 | current_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 17 | return current_dir 18 | 19 | ''' 20 | Get configuration attributes 21 | 22 | ''' 23 | def get_config_attrs(config_dir=None) -> Dict: 24 | ''' 25 | Retrieves configuration attributes from configuration files. 26 | 27 | :param config_dir: Optional path to the config directory. 28 | If None, uses the default path. 29 | :return: A dictionary containing the SerpAPI and Apify credentials. 30 | ''' 31 | if config_dir is None: 32 | project_root = get_project_root() 33 | config_dir = os.path.join(project_root, 'config') 34 | 35 | path = os.path.join(config_dir, 'config.ini') 36 | 37 | # config parser 38 | config = ConfigParser() 39 | config.read(path) 40 | 41 | # Get credentials from both sections 42 | credentials = {} 43 | 44 | # SerpAPI credentials 45 | if 'SerpAPI Key' in config: 46 | credentials.update(dict(config['SerpAPI Key'])) 47 | 48 | # Apify credentials 49 | if 'Apify Token' in config: 50 | credentials.update(dict(config['Apify Token'])) 51 | 52 | return credentials 53 | 54 | ''' 55 | Verify date format 56 | 57 | ''' 58 | def is_valid_date(date_str: str) -> bool: 59 | ''' 60 | Verifies if the given date string is in the format YYYY-MM-DD. 61 | 62 | :param date_str: The date string to verify. 63 | :return: True if the date string is valid, False otherwise. 
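
    Illustrative example (hypothetical inputs; behavior follows
    datetime.strptime with the '%Y-%m-%d' format):

        >>> is_valid_date('2024-01-31')
        True
        >>> is_valid_date('31-01-2024')
        False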
64 | ''' 65 | try: 66 | # Attempt to parse the date string with the expected format 67 | datetime.strptime(date_str, '%Y-%m-%d') 68 | return True 69 | except ValueError: 70 | # If a ValueError is raised, the format is incorrect 71 | return False 72 | 73 | def verify_date_argument(args: Dict, key: str) -> None: 74 | ''' 75 | Verifies that a date argument in args is correctly formatted. 76 | 77 | :param args: Dictionary containing command line arguments and options. 78 | :param key: The key in args to check for a valid date. 79 | :raises ValueError: If the date is not in the correct format. 80 | ''' 81 | if key in args: 82 | if not is_valid_date(args[key]): 83 | raise ValueError( 84 | f"The date for '{key}' argument is not in the correct " 85 | "format. Use this format: YYYY-MM-DD." 86 | ) 87 | 88 | ''' 89 | Create output data path 90 | 91 | ''' 92 | def create_output_data_path(path: str) -> None: 93 | ''' 94 | Creates the specified directory path if it does not already exist. 95 | 96 | :param path: The directory path to create. 97 | :return: None 98 | ''' 99 | if not os.path.exists(path): 100 | os.makedirs(path) 101 | 102 | -------------------------------------------------------------------------------- /streamlit_app/components/main_panel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # import modules 4 | import streamlit as st 5 | 6 | # local imports 7 | from dataclasses import dataclass 8 | from ..utils.file_browser import select_directory 9 | 10 | @dataclass 11 | class CollectionConfig: 12 | """Configuration for collection settings""" 13 | download_videos: bool = True 14 | use_tor: bool = False 15 | max_workers: int = 5 16 | output_dir: str = '' 17 | 18 | def render_main_panel(): 19 | """Render main content panels and return configuration""" 20 | 21 | # Main Content Area - Better organized panels 22 | st.markdown("## ⚙️ Collection Settings") 23 | 24 | # Download Settings Panel 25 | with st.container(): 26 | st.markdown("### 📥 Download & Processing Settings") 27 | st.markdown("") # Add consistent spacing 28 | 29 | col1, col2, col3 = st.columns([1, 1, 1]) 30 | 31 | with col1: 32 | st.markdown("**📹 Download Videos**") 33 | download_videos = st.toggle( 34 | "Enable video downloads", 35 | value=True, 36 | help="Download TikTok videos to local storage", 37 | label_visibility="collapsed" 38 | ) 39 | 40 | with col2: 41 | st.markdown("**🔒 Use Tor Network**") 42 | use_tor = st.toggle( 43 | "Enable Tor for downloads", 44 | help="Enable Tor for anonymous downloads", 45 | label_visibility="collapsed" 46 | ) 47 | 48 | with col3: 49 | max_workers = st.number_input( 50 | '⚡ **Concurrent Workers**', 51 | min_value=1, 52 | max_value=20, 53 | value=5, 54 | help='Number of concurrent download workers' 55 | ) 56 | 57 | st.markdown("---") 58 | 59 | # Output Configuration Panel 60 | with st.container(): 61 | st.markdown("### 📂 Output Configuration") 62 | 63 | # Properly aligned output directory input and browse button 64 | col1, col2 = st.columns([6, 1]) 65 | 66 | with col1: 67 | output_dir = st.text_input( 68 | '**Output Directory**', 69 | value=st.session_state.output_dir, 70 | help='Directory where all collected data will be saved', 71 | placeholder='Enter output directory path...', 72 | label_visibility="visible" 73 | ) 74 | if output_dir != st.session_state.output_dir: 75 | st.session_state.output_dir = output_dir 76 | 77 | with col2: 78 | # Add spacing to align button with input field 79 | st.markdown("
<br>
", unsafe_allow_html=True) 80 | if st.button('📁', help="Browse for directory", use_container_width=True): 81 | path = select_directory() 82 | if path: 83 | st.session_state.output_dir = path 84 | st.rerun() 85 | 86 | st.markdown("---") 87 | 88 | # Centered Action Button 89 | col1, col2, col3 = st.columns([1, 2, 1]) 90 | with col2: 91 | start_collection = st.button( 92 | '🚀 **Start Data Collection**', 93 | use_container_width=True, 94 | type="primary" 95 | ) 96 | 97 | return CollectionConfig( 98 | download_videos=download_videos, 99 | use_tor=use_tor, 100 | max_workers=max_workers, 101 | output_dir=st.session_state.output_dir 102 | ), start_collection -------------------------------------------------------------------------------- /data_collectors/utilities.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # typing 4 | from typing import Dict, List 5 | 6 | ''' 7 | Build search query 8 | 9 | ''' 10 | def advanced_search_options(args: Dict) -> str: 11 | ''' 12 | Builds advanced search options based on the provided arguments. 13 | 14 | :param args: Dictionary containing the command line arguments and options. 15 | :return: A formatted query string with advanced search options. 16 | ''' 17 | before = args.get('before', '') 18 | after = args.get('after', '') 19 | 20 | advanced_search = { 21 | 'before': before, 22 | 'after': after 23 | } 24 | 25 | response = [ 26 | f'{k}:{v}' for k, v in advanced_search.items() if v 27 | ] 28 | 29 | return ' '.join(response) 30 | 31 | def build_site_query(site: str, user: str = None, tag: str = None, q: str = '') -> str: 32 | ''' 33 | Builds a site-specific search query based on the provided parameters. 34 | 35 | :param site: TikTok's site domain. 36 | :param user: Optional username to search for content from a specific user. 37 | :param tag: Optional tag to search for content with a specific tag. 38 | :param q: Optional search terms to include in the query. 39 | :return: A formatted site search query string. 40 | ''' 41 | if user is not None: 42 | # remove @ prefix if present 43 | clean_user = user[1:] if user.startswith('@') else user 44 | return f'site:{site}/@{clean_user}/* {q}'.strip() 45 | elif tag is not None: 46 | # remove # prefix if present 47 | clean_tag = tag[1:] if tag.startswith('#') else tag 48 | return f'site:{site}/tag/{clean_tag}/* {q}'.strip() 49 | else: 50 | # normal site search 51 | return f'site:{site}/* {q}'.strip() 52 | 53 | def search_query(args: Dict) -> str: 54 | ''' 55 | Builds the search query string based on the command line arguments. 56 | 57 | :param args: Dictionary containing the command line arguments and options. 58 | :return: A formatted query string. 59 | ''' 60 | q = args.get('q') or '' 61 | advanced_search = advanced_search_options(args) 62 | 63 | return f'{q} {advanced_search}'.strip() 64 | 65 | ''' 66 | Select SerpAPI parameters 67 | 68 | ''' 69 | def select_serpapi_parameters(args: Dict) -> Dict: 70 | ''' 71 | Filters the command line arguments to include only the default SerpAPI 72 | parameters. 73 | 74 | :param args: Dictionary containing the command line arguments and options. 75 | :return: A dictionary containing only the relevant SerpAPI parameters. 
76 | ''' 77 | default_serpapi_parameters = [ 78 | 'q', 79 | 'google_domain', 80 | 'gl', 81 | 'hl', 82 | 'cr', 83 | 'lr', 84 | 'safe' 85 | ] 86 | 87 | # filter and return only the relevant SerpAPI parameters 88 | params = { 89 | k: v for k, v in args.items() if k in default_serpapi_parameters and v 90 | } 91 | 92 | # add new parameters 93 | params['engine'] = 'google' 94 | params['start'] = 0 95 | params['nfpr'] = 1 96 | params['num'] = 100 97 | 98 | return params 99 | 100 | ''' 101 | Extract relevant keys from SerpAPI response 102 | 103 | ''' 104 | def extract_results_keys(data: List[Dict], result_type: str) -> List[Dict]: 105 | ''' 106 | Filters the SerpAPI response data to include only entries with 'link' 107 | containing 'video', and returns a list of dictionaries with specified 108 | default keys. 109 | 110 | :param data: List of dictionaries containing the SerpAPI response data. 111 | :param result_type: Type of SerpAPI response: 'search_result' or 112 | 'image_result' 113 | :return: A list of dictionaries, each containing the specified default 114 | keys from the SerpAPI response. 115 | ''' 116 | key_mapping = { 117 | 'search_result': [ 118 | 'source', 119 | 'title', 120 | 'snippet', 121 | 'link', 122 | 'thumbnail', 123 | 'video_link', 124 | 'snippet_highlighted_words', 125 | 'displayed_link' 126 | ], 127 | 'image_result': [ 128 | 'source', 129 | 'thumbnail', 130 | 'title', 131 | 'link', 132 | 'serpapi_related_content_link' 133 | ] 134 | } 135 | 136 | selected_keys = key_mapping.get(result_type, []) 137 | 138 | # filter data to include only entries with 'link' containing 'video' 139 | d = [ 140 | i for i in data if 'link' in i and '/video/' in i['link'] 141 | and 'tiktok.com' in i['link'] 142 | ] 143 | 144 | # return list of dictionaries with specified default keys 145 | return [ 146 | { 147 | k: i[k] for k in selected_keys if k in i 148 | } for i in d 149 | ] 150 | 151 | ''' 152 | Extract relevant keys from related content 153 | ''' 154 | def extract_related_content_keys(data: List[Dict]) -> List[Dict]: 155 | ''' 156 | Filters related content data and returns a list of dictionaries with 157 | specified default keys. 158 | 159 | :param data: List of dictionaries containing related content data. 160 | :return: A list of dictionaries, each containing the specified default 161 | keys for the related content. 
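
    Illustrative example (hypothetical entry; keys outside the default
    mapping, such as 'position', are dropped):

        >>> extract_related_content_keys([{'source': 'TikTok',
        ...     'link': 'https://www.tiktok.com/@user/video/123',
        ...     'position': 1}])
        [{'source': 'TikTok', 'link': 'https://www.tiktok.com/@user/video/123'}]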
162 | ''' 163 | key_mapping = [ 164 | 'source', 165 | 'link', 166 | 'thumbnail', 167 | 'title' 168 | ] 169 | 170 | # return list of dictionaries with specified default keys 171 | return [ 172 | { 173 | k: i[k] for k in key_mapping if k in i 174 | } for i in data 175 | ] 176 | -------------------------------------------------------------------------------- /streamlit_app/styles/css.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # import modules 4 | import streamlit as st 5 | 6 | def load_css(): 7 | """Load custom CSS for TikTok-inspired theme""" 8 | st.markdown(""" 9 | 182 | """, unsafe_allow_html=True) 183 | 184 | def create_status_badge(text, status_type): 185 | """Create a status badge with specified type""" 186 | return f'{text}' -------------------------------------------------------------------------------- /streamlit_app/components/sidebar.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # import modules 4 | import streamlit as st 5 | 6 | # import submodules 7 | from dataclasses import dataclass 8 | from typing import Optional 9 | from datetime import date 10 | 11 | @dataclass 12 | class SearchConfig: 13 | """Configuration for search parameters""" 14 | query: Optional[str] = None 15 | user: Optional[str] = None 16 | tag: Optional[str] = None 17 | after_date: Optional[date] = None 18 | before_date: Optional[date] = None 19 | google_domain: str = 'google.com' 20 | gl: Optional[str] = None 21 | hl: Optional[str] = None 22 | cr: Optional[str] = None 23 | lr: Optional[str] = None 24 | safe: str = 'active' 25 | depth: int = 3 26 | 27 | @dataclass 28 | class ApifyConfig: 29 | """Configuration for Apify integration""" 30 | use_apify: bool = False 31 | number_of_results: int = 25 32 | oldest_post_date: Optional[date] = None 33 | newest_post_date: Optional[date] = None 34 | 35 | def render_sidebar(): 36 | """Render sidebar components and return configuration""" 37 | with st.sidebar: 38 | st.markdown("### 🎯 Search Configuration") 39 | st.markdown("") # Add consistent spacing 40 | 41 | # Search Type Selection 42 | search_tab = st.radio( 43 | "**Search Type**", 44 | ["🔍 Keyword", "👤 User Profile", "🏷️ Hashtag"], 45 | horizontal=True 46 | ) 47 | 48 | st.markdown("") # Add spacing after radio buttons 49 | 50 | # Search input based on type 51 | query = user = tag = None 52 | 53 | if search_tab == "🔍 Keyword": 54 | query = st.text_input( 55 | 'Search Keywords', 56 | placeholder='Enter keywords to search for...', 57 | help='Search for TikTok content using keywords' 58 | ) 59 | elif search_tab == "👤 User Profile": 60 | user = st.text_input( 61 | 'TikTok Username', 62 | placeholder='username (without @)', 63 | help='Enter TikTok username without @ symbol' 64 | ) 65 | else: # Hashtag search 66 | tag = st.text_input( 67 | 'Hashtag', 68 | placeholder='hashtag (with or without #)', 69 | help='Enter hashtag with or without # symbol' 70 | ) 71 | 72 | st.markdown("") # Add spacing before divider 73 | st.markdown("---") 74 | st.markdown("") # Add spacing after divider 75 | 76 | # Date Filters Section 77 | st.markdown("### 📅 Date Filters") 78 | st.markdown("") # Add consistent spacing 79 | col1, col2 = st.columns(2) 80 | with col1: 81 | after_date = st.date_input( 82 | 'After Date', 83 | value=None, 84 | help='Posts after this date' 85 | ) 86 | with col2: 87 | before_date = st.date_input( 88 | 'Before Date', 89 | value=None, 90 | help='Posts before this date' 91 | ) 92 
| 93 | st.markdown("") # Add spacing before divider 94 | st.markdown("---") 95 | st.markdown("") # Add spacing after divider 96 | 97 | # Apify Integration Section 98 | st.markdown("### 🚀 Apify Integration") 99 | st.markdown("") # Add consistent spacing 100 | 101 | use_apify = st.toggle( 102 | "**Enable Apify**", 103 | help="Enhanced data collection with Apify" 104 | ) 105 | 106 | st.markdown("") # Add spacing after toggle 107 | 108 | if use_apify: 109 | number_of_results = st.number_input( 110 | 'Results Count', 111 | min_value=1, 112 | max_value=1000, 113 | value=25, 114 | help='Number of results to collect' 115 | ) 116 | 117 | st.markdown("") # Add spacing before subsection 118 | st.markdown("**Apify Date Filters**") 119 | st.markdown("") # Add spacing after subsection title 120 | 121 | col1, col2 = st.columns(2) 122 | with col1: 123 | oldest_post_date = st.date_input( 124 | 'Oldest Post', 125 | help='Oldest post date' 126 | ) 127 | with col2: 128 | newest_post_date = st.date_input( 129 | 'Newest Post', 130 | help='Newest post date' 131 | ) 132 | else: 133 | number_of_results = 25 134 | oldest_post_date = None 135 | newest_post_date = None 136 | 137 | st.markdown("---") 138 | 139 | # Advanced Search Options 140 | with st.expander("⚙️ Advanced Search Options"): 141 | st.markdown("**Google Search Settings**") 142 | 143 | # Domain setting (full width) 144 | google_domain = st.text_input( 145 | 'Domain', 146 | value='google.com', 147 | help='e.g., google.com, google.co.uk' 148 | ) 149 | 150 | # Country and Language settings (2 columns) 151 | col1, col2 = st.columns(2) 152 | with col1: 153 | gl = st.text_input( 154 | 'Country Code (GL)', 155 | help='e.g., us, uk, de', 156 | placeholder='us' 157 | ) 158 | cr = st.text_input( 159 | 'Country Restriction', 160 | help='Restrict to specific countries', 161 | placeholder='countryUS' 162 | ) 163 | with col2: 164 | hl = st.text_input( 165 | 'Language Code (HL)', 166 | help='e.g., en, es, fr', 167 | placeholder='en' 168 | ) 169 | lr = st.text_input( 170 | 'Language Restriction', 171 | help='Restrict to specific languages', 172 | placeholder='lang_en' 173 | ) 174 | 175 | # Search settings (2 columns) 176 | col3, col4 = st.columns(2) 177 | with col3: 178 | safe = st.selectbox( 179 | 'Safe Search', 180 | options=['active', 'off'], 181 | index=0, 182 | help='Adult content filter' 183 | ) 184 | with col4: 185 | depth = st.slider( 186 | 'Search Depth', 187 | min_value=1, 188 | max_value=10, 189 | value=3, 190 | help='Related content iterations' 191 | ) 192 | 193 | # Return configuration objects 194 | search_config = SearchConfig( 195 | query=query, 196 | user=user, 197 | tag=tag, 198 | after_date=after_date, 199 | before_date=before_date, 200 | google_domain=google_domain, 201 | gl=gl if gl else None, 202 | hl=hl if hl else None, 203 | cr=cr if cr else None, 204 | lr=lr if lr else None, 205 | safe=safe, 206 | depth=depth 207 | ) 208 | 209 | apify_config = ApifyConfig( 210 | use_apify=use_apify, 211 | number_of_results=number_of_results, 212 | oldest_post_date=oldest_post_date, 213 | newest_post_date=newest_post_date 214 | ) 215 | 216 | return search_config, apify_config -------------------------------------------------------------------------------- /streamlit_app/core/collection_runner.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # import modules 4 | import streamlit as st 5 | import asyncio 6 | import time 7 | 8 | # import submodules 9 | from concurrent.futures import 
ThreadPoolExecutor 10 | 11 | # local imports 12 | from data_collectors import TikTokDataCollector 13 | from media_handlers import VideoDownloader 14 | from utils import create_output_data_path 15 | from ..components.progress import create_progress_tracker, update_progress, \ 16 | mark_step_complete 17 | from ..styles.css import create_status_badge 18 | from .keyframes_processor import extract_keyframes_sync 19 | 20 | def build_args_dict(search_config, apify_config, collection_config, config_attrs): 21 | """Build arguments dictionary for collection""" 22 | args = { 23 | 'q': search_config.query, 24 | 'user': search_config.user, 25 | 'tag': search_config.tag, 26 | 'google_domain': search_config.google_domain, 27 | 'gl': search_config.gl, 28 | 'hl': search_config.hl, 29 | 'cr': search_config.cr, 30 | 'lr': search_config.lr, 31 | 'safe': search_config.safe, 32 | 'depth': search_config.depth, 33 | 'before': search_config.before_date.strftime('%Y-%m-%d') if search_config.before_date else None, 34 | 'after': search_config.after_date.strftime('%Y-%m-%d') if search_config.after_date else None, 35 | 'download': collection_config.download_videos, 36 | 'use_tor': collection_config.use_tor, 37 | 'max_workers': collection_config.max_workers, 38 | 'output': collection_config.output_dir, 39 | 'apify': apify_config.use_apify, 40 | 'number_of_results': apify_config.number_of_results 41 | } 42 | 43 | # Add Apify-specific arguments if enabled 44 | if apify_config.use_apify: 45 | args.update({ 46 | 'oldest_post_date': apify_config.oldest_post_date.strftime('%Y-%m-%d') if apify_config.oldest_post_date else None, 47 | 'newest_post_date': apify_config.newest_post_date.strftime('%Y-%m-%d') if apify_config.newest_post_date else None 48 | }) 49 | 50 | # Merge configuration attributes with user arguments 51 | args = {**args, **config_attrs} 52 | 53 | return args 54 | 55 | def validate_input(search_config): 56 | """Validate search input""" 57 | if not search_config.query and not search_config.user and not search_config.tag: 58 | st.error('🚨 Please enter a search term, username, or hashtag to continue!') 59 | return False 60 | return True 61 | 62 | def run_collection(search_config, apify_config, collection_config, config_attrs): 63 | """Enhanced collection function with better progress tracking and feedback""" 64 | 65 | # Build arguments 66 | args = build_args_dict(search_config, apify_config, collection_config, config_attrs) 67 | 68 | # Create progress tracker 69 | overall_progress, status_text, step_progress, steps = create_progress_tracker() 70 | 71 | def run_collection_thread(): 72 | """Run collection in separate thread with own event loop""" 73 | # Create new event loop for this thread 74 | loop = asyncio.new_event_loop() 75 | asyncio.set_event_loop(loop) 76 | 77 | try: 78 | # Create collector in this thread 79 | collector = TikTokDataCollector(args=args) 80 | 81 | # Execute the main collection process 82 | collector.collect_search_data() 83 | 84 | # Generate files 85 | collector.generate_data_files() 86 | 87 | # Get collected videos for download 88 | collected_videos = collector.get_collected_videos() if args['download'] else [] 89 | 90 | return collector, collected_videos 91 | 92 | finally: 93 | loop.close() 94 | 95 | try: 96 | # Create output directory 97 | create_output_data_path(args['output']) 98 | 99 | # Step 1: Initialize 100 | update_progress(0, overall_progress, status_text, step_progress, steps, progress_value=10) 101 | 102 | # Step 2: Start data collection process 103 | update_progress(1, 
overall_progress, status_text, step_progress, steps, "Searching...", 25) 104 | 105 | # Step 3: Show image collection 106 | update_progress(2, overall_progress, status_text, step_progress, steps, progress_value=35) 107 | 108 | # Step 4: Show Apify preparation 109 | if args['apify']: 110 | update_progress(3, overall_progress, status_text, step_progress, steps, "Preparing Apify...", 45) 111 | else: 112 | step_progress[3].markdown(f"⏭️ Apify integration skipped") 113 | overall_progress.progress(45) 114 | time.sleep(0.1) 115 | 116 | # Run collection in separate thread to avoid asyncio conflicts 117 | with ThreadPoolExecutor() as executor: 118 | future = executor.submit(run_collection_thread) 119 | collector, collected_videos = future.result() 120 | 121 | # Mark data collection steps as complete 122 | mark_step_complete(1, step_progress, "Search results collected") 123 | mark_step_complete(2, step_progress, "Image thumbnails gathered") 124 | if args['apify']: 125 | mark_step_complete(3, step_progress, "Apify integration completed") 126 | 127 | overall_progress.progress(65) 128 | 129 | # Step 5: Generate files (already done in thread) 130 | update_progress(4, overall_progress, status_text, step_progress, steps, "Generating Files...", 75) 131 | mark_step_complete(4, step_progress, "Data files generated") 132 | 133 | # Step 6: Download videos 134 | if args['download']: 135 | update_progress(5, overall_progress, status_text, step_progress, steps, "Downloading...", 80) 136 | 137 | if collected_videos: 138 | st.info(f'📹 Found {len(collected_videos)} videos to download') 139 | 140 | downloader = VideoDownloader( 141 | output=args['output'], 142 | use_tor=args['use_tor'] 143 | ) 144 | downloader.start_download( 145 | urls=collected_videos, 146 | max_workers=args['max_workers'] 147 | ) 148 | 149 | mark_step_complete(5, step_progress, f"{len(collected_videos)} videos downloaded") 150 | else: 151 | mark_step_complete(5, step_progress, "No new videos to download") 152 | else: 153 | mark_step_complete(5, step_progress, "Video download disabled") 154 | 155 | # Step 7: Extract keyframes from available videos 156 | update_progress(6, overall_progress, status_text, step_progress, steps, "Extracting Keyframes...", 90) 157 | 158 | # Extract keyframes from any videos in the output directory 159 | try: 160 | extract_keyframes_sync(args['output'], args['max_workers']) 161 | mark_step_complete(6, step_progress, "Keyframes extracted") 162 | except Exception as e: 163 | step_progress[6].markdown(f"⚠️ Keyframe extraction failed: {str(e)}") 164 | 165 | # Step 8: Complete 166 | overall_progress.progress(100) 167 | update_progress(7, overall_progress, status_text, step_progress, steps) 168 | status_text.markdown(create_status_badge("Success", "success"), unsafe_allow_html=True) 169 | 170 | # Success message with results 171 | st.success('🎉 Collection completed successfully!') 172 | 173 | # Show output location 174 | st.metric("📂 Output Location", args['output']) 175 | 176 | # Show file explorer link 177 | st.markdown(f""" 178 |
179 | 📁 Results saved to:
180 | {args['output']} 181 |
182 | """, unsafe_allow_html=True) 183 | 184 | except Exception as e: 185 | status_text.markdown(create_status_badge("Error", "error"), unsafe_allow_html=True) 186 | st.error(f'❌ An error occurred during collection: {str(e)}') 187 | st.exception(e) -------------------------------------------------------------------------------- /media_handlers/video_downloader.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # import modules 4 | import os 5 | import time 6 | 7 | # threads 8 | from concurrent.futures import ThreadPoolExecutor, as_completed 9 | 10 | # typing 11 | from typing import List 12 | 13 | # pathlib 14 | from pathlib import Path 15 | 16 | # progress bar 17 | from tqdm import tqdm 18 | 19 | # yt_dlp module 20 | from yt_dlp import YoutubeDL 21 | 22 | # stem module 23 | from stem import Signal 24 | from stem.control import Controller 25 | 26 | # Video downloader class 27 | class VideoDownloader: 28 | ''' 29 | VideoDownloader class 30 | 31 | This class handles the downloading of TikTok videos and their audio using 32 | yt-dlp and threading for concurrent downloads. 33 | ''' 34 | def __init__(self, output: str, use_tor: bool = False) -> None: 35 | ''' 36 | Initializes the VideoDownloader with default download options. 37 | Downloads both video and audio when initialized. 38 | 39 | :param output: The original directory path provided by the user 40 | :param use_tor: Boolean indicating whether to use Tor for downloads 41 | ''' 42 | # initialize Tor proxy settings 43 | self.use_tor = use_tor 44 | self.proxy = 'socks5://127.0.0.1:9050' 45 | 46 | # Common options for both video and audio 47 | common_options = { 48 | 'no_warnings': True, 49 | 'quiet': True, 50 | 'ignoreerrors': True, 51 | 'noprogress': True 52 | } 53 | 54 | if self.use_tor: 55 | common_options['proxy'] = self.proxy 56 | 57 | # video download options 58 | self.video_options = { 59 | **common_options, 60 | 'format': '(bv*+ba/b)[vcodec!=?h265]', 61 | 'outtmpl': self._build_output_directory(output, 'downloaded_videos') 62 | } 63 | 64 | # audio download options 65 | self.audio_options = { 66 | **common_options, 67 | 'format': 'bestaudio/best', 68 | 'outtmpl': self._build_output_directory(output, 'downloaded_audios'), 69 | 'postprocessors': [{ 70 | 'key': 'FFmpegExtractAudio', 71 | 'preferredcodec': 'mp3', 72 | }] 73 | } 74 | 75 | def _sanitize_output_path(self, output: str) -> str: 76 | ''' 77 | Ensures the given path uses forward slashes and does not end with a 78 | slash. 79 | 80 | :param output: The original directory path provided by the user 81 | :return: A sanitized directory path with forward slashes and no 82 | trailing slash. 83 | ''' 84 | # create a Path object and normalize the path 85 | path = Path(output) 86 | 87 | # path with the correct separators for the current OS 88 | output = str(path.as_posix()) 89 | 90 | # remove any trailing slashes 91 | output = output.rstrip('/') 92 | 93 | return output 94 | 95 | def _build_output_directory(self, output: str, dir_name: str) -> str: 96 | ''' 97 | Builds and sanitizes the output directory path for downloading videos. 98 | 99 | :param output: The original directory path provided by the user 100 | :param dir_name: Name of the subdirectory (videos or audio) 101 | :return: The full path for saving downloaded files with the filename 102 | template. 
103 | ''' 104 | output = self._sanitize_output_path(output=output) 105 | path = f'{output}/{dir_name}' 106 | 107 | # ensure the directory exists 108 | if not os.path.exists(path): 109 | os.makedirs(path) 110 | 111 | return f'{path}/%(id)s.%(ext)s' 112 | 113 | def renew_tor_ip(self) -> None: 114 | ''' 115 | Requests a new Tor circuit to change the IP address. 116 | ''' 117 | try: 118 | with Controller.from_port(port=9051) as controller: 119 | controller.authenticate() 120 | controller.signal(Signal.NEWNYM) 121 | time.sleep(5) 122 | except Exception as e: 123 | print (f'Error renewing Tor IP: {e}') 124 | 125 | def download_content(self, url: str) -> None: 126 | ''' 127 | Downloads both video and audio from the specified URL using yt-dlp. 128 | 129 | :param url: The URL of the TikTok video to download. 130 | ''' 131 | max_attempts = 3 if self.use_tor else 1 132 | for attempt in range(max_attempts): 133 | try: 134 | # download video 135 | with YoutubeDL(self.video_options) as ydl: 136 | ydl.download(url) 137 | 138 | # download audio 139 | with YoutubeDL(self.audio_options) as ydl: 140 | ydl.download(url) 141 | 142 | return 143 | 144 | except Exception as e: 145 | print (f'Error downloading {url}: {e}') 146 | 147 | if self.use_tor and attempt < max_attempts - 1: 148 | print ('Renewing Tor circuit...') 149 | self.renew_tor_ip() 150 | 151 | # wait for circuit to be established 152 | time.sleep(5) 153 | else: 154 | break 155 | 156 | def download_videos(self, urls: List[str], max_workers: int) -> None: 157 | ''' 158 | Downloads multiple videos concurrently using a thread pool. 159 | 160 | :param urls: A list of TikTok video URLs to download. 161 | :param max_workers: The maximum number of threads to use for 162 | downloading. 163 | ''' 164 | with ThreadPoolExecutor(max_workers=max_workers) as executor: 165 | future_to_url = { 166 | executor.submit(self.download_content, url): url 167 | for url in urls 168 | } 169 | for future in tqdm( 170 | as_completed(future_to_url), 171 | total=len(future_to_url), 172 | desc='Downloading content' 173 | ): 174 | url = future_to_url[future] 175 | try: 176 | future.result() 177 | except Exception as e: 178 | print (f'{url} generated an exception: {e}') 179 | 180 | def _test_tor_connection(self) -> bool: 181 | ''' 182 | Tests if Tor is available and working. 183 | 184 | :return: True if Tor is available and working, False otherwise. 185 | ''' 186 | try: 187 | # test if port is open 188 | import socket 189 | sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 190 | result = sock.connect_ex(('127.0.0.1', 9050)) 191 | if result != 0: 192 | print ('\n\n') 193 | print ('Tor SOCKS port (9050) is not open. Is Tor running?') 194 | print ('Falling back to normal connection.\n') 195 | return False 196 | 197 | # if port is open, test connection 198 | import requests 199 | print ('\n\nTesting Tor connection...') 200 | response = requests.get( 201 | 'https://check.torproject.org/api/ip', 202 | proxies={ 203 | 'http': self.proxy, 204 | 'https': self.proxy 205 | }, 206 | timeout=10 207 | ) 208 | 209 | if response.status_code == 200: 210 | data = response.json() 211 | print (f'Tor connection successful. Exit node IP: {data.get("IP")}\n\n') 212 | return True 213 | else: 214 | print ('Tor enabled but connection check failed. Using normal connection.\n\n') 215 | return False 216 | 217 | except Exception as e: 218 | print (f'\nTor connection failed ({e}). 
Using normal connection.\n') 219 | return False 220 | 221 | def start_download(self, urls: List[str], max_workers: int) -> None: 222 | ''' 223 | Starts the download process for a list of TikTok video URLs. 224 | 225 | :param urls: A list of TikTok video URLs to download. 226 | :param max_workers: The maximum number of threads to use for 227 | downloading. Default is 5. 228 | ''' 229 | if self.use_tor: 230 | # test Tor connection and update use_tor flag accordingly 231 | self.use_tor = self._test_tor_connection() 232 | 233 | # remove proxy settings if Tor connection failed 234 | if not self.use_tor: 235 | for options in [self.video_options, self.audio_options]: 236 | options.pop('proxy', None) 237 | 238 | print ('> Starting download...\n') 239 | 240 | # download videos 241 | self.download_videos(urls=urls, max_workers=max_workers) 242 | 243 | print ('\n\nDownload complete.') 244 | -------------------------------------------------------------------------------- /media_handlers/session_manager.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # import modules 4 | import os 5 | import glob 6 | import aiohttp 7 | import asyncio 8 | import requests 9 | import subprocess 10 | 11 | # progress bar 12 | from tqdm import tqdm 13 | 14 | # aiohttp 15 | from aiohttp import ClientSession 16 | 17 | # typing 18 | from typing import Dict, List 19 | 20 | # HTTP session class 21 | class RequestSession: 22 | ''' 23 | RequestSession 24 | 25 | This class handles HTTP requests and asynchronous tasks for interacting 26 | with the SerpAPI response and processing related content links 27 | 28 | ''' 29 | def __init__(self) -> None: 30 | ''' 31 | Initializes the RequestSession object. 32 | ''' 33 | # request session 34 | headers = {'accept': 'application/json'} 35 | self.req_session = requests.Session() 36 | self.req_session.headers.update(headers) 37 | 38 | # asynchronous event loop 39 | self.loop = asyncio.get_event_loop() 40 | 41 | def load_related_content(self, url: str, api_key: str) -> List[Dict]: 42 | ''' 43 | Loads related content from the given URL using the provided API key. 44 | 45 | :param url: The URL to load related content from. 46 | :param api_key: SerpAPI key for authentication. 47 | :return: A list of dictionaries containing the related content data. 48 | ''' 49 | params = {'api_key': api_key} 50 | 51 | def fetch_content(url: str) -> Dict: 52 | response = self.req_session.get(url, params=params) 53 | response.raise_for_status() 54 | return response.json() 55 | 56 | try: 57 | content = fetch_content(url) 58 | see_more_link = content.get('serpapi_see_more_link') 59 | if see_more_link: 60 | content = fetch_content(see_more_link) 61 | return content 62 | except requests.RequestException as e: 63 | print (f'An error occurred: {e}') 64 | return {} 65 | 66 | def _build_media_filename_path(self, output: str, link: str, file_extension: str) -> str: 67 | ''' 68 | Builds the filename path for saving the image based on the TikTok link. 69 | 70 | :param output: The directory path where the images will be saved. 71 | :param link: The TikTok link from which to extract the post ID. 72 | :param file_extension: The file extension of the media file. 73 | :return: The full path (including filename) where the image will be 74 | saved. 
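
    Illustrative example (hypothetical link and post ID on a
    hypothetical `session` instance):

        >>> session._build_media_filename_path(
        ...     'output/thumbnails',
        ...     'https://www.tiktok.com/@user/video/7241234567890123456?lang=en',
        ...     'png')
        'output/thumbnails/7241234567890123456.png'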
75 | ''' 76 | post_id = link.split('/')[-1].split('?')[0] 77 | return f'{output}/{post_id}.{file_extension}' 78 | 79 | async def fetch_file(self, session: ClientSession, url: str, 80 | filename: str) -> None: 81 | ''' 82 | Fetches a file from a URL and saves it to the output directory. 83 | 84 | :param session: The aiohttp ClientSession object. 85 | :param url: The URL of the file to download. 86 | :param filename: The path (including filename) where the file will be 87 | saved. 88 | ''' 89 | try: 90 | async with session.get(url) as res: 91 | if res.status == 200: 92 | file_data = await res.read() 93 | with open(filename, 'wb') as f: 94 | f.write(file_data) 95 | else: 96 | print ( 97 | f'Failed to download {url}, status code: {res.status}' 98 | ) 99 | except Exception as e: 100 | print (f'An error occurred while downloading {url}: {e}') 101 | 102 | async def download_files(self, urls: List[str], links: List[str], 103 | output: str, file_extension: str) -> None: 104 | ''' 105 | Downloads files from a list of URLs asynchronously. 106 | 107 | :param urls: A list of file URLs to download. 108 | :param links: A list of TikTok links corresponding to the files. 109 | :param output: The directory path where the files will be saved. 110 | :param file_extension: The file extension of the media file. 111 | ''' 112 | async with aiohttp.ClientSession() as session: 113 | tasks = [ 114 | self.fetch_file( 115 | session=session, url=url, 116 | filename=self._build_media_filename_path(output, link, file_extension) 117 | ) for url, link in zip(urls, links) 118 | ] 119 | await asyncio.gather(*tasks) 120 | 121 | def start_media_download(self, urls: List[str], links: List[str], 122 | output: str, media_type: str) -> None: 123 | ''' 124 | Starts the asynchronous download of files from a list of URLs. 125 | 126 | :param urls: A list of file URLs to download. 127 | :param links: A list of TikTok links corresponding to the files. 128 | :param output: The directory path where the files will be saved. 129 | :param media_type: The type of media to download. 130 | ''' 131 | media_object = { 132 | 'image': { 133 | 'path': 'thumbnails', 134 | 'file_extension': 'png' 135 | }, 136 | 'video': { 137 | 'path': 'downloaded_videos', 138 | 'file_extension': 'mp4' 139 | } 140 | } 141 | 142 | path = f'{output}/{media_object[media_type]["path"]}' 143 | if not os.path.exists(path): 144 | os.makedirs(path) 145 | 146 | file_extension = media_object[media_type]['file_extension'] 147 | self.loop.run_until_complete( 148 | self.download_files(urls=urls, links=links, output=path, 149 | file_extension=file_extension) 150 | ) 151 | 152 | def extract_audio_from_videos(self, output: str) -> None: 153 | ''' 154 | Extracts audio from video files. 155 | 156 | :param output: The directory path where audios will be saved. 
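
    For each <id>.mp4 under <output>/downloaded_videos, this runs an
    FFmpeg command equivalent to (illustrative):

        ffmpeg -i <output>/downloaded_videos/<id>.mp4 -q:a 0 -map a -y <output>/downloaded_audios/<id>.mp3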
157 | ''' 158 | # build audio path 159 | audio_path = f'{output}/downloaded_audios' 160 | if not os.path.exists(audio_path): 161 | os.makedirs(audio_path) 162 | 163 | # get all video files 164 | path = f'{output}/downloaded_videos' 165 | files = glob.glob(f'{path}/*.mp4') 166 | 167 | # extract audio from each video 168 | for file in files: 169 | try: 170 | # get id from video filename 171 | video_id = os.path.basename(file).split('.')[0] 172 | 173 | # FFmpeg command to extract audio 174 | cmd = [ 175 | 'ffmpeg', 176 | '-i', file, 177 | '-q:a', '0', 178 | '-map', 'a', 179 | '-y', 180 | f'{audio_path}/{video_id}.mp3' 181 | ] 182 | 183 | subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 184 | except Exception as e: 185 | print (f'Error extracting audio: {e}') 186 | 187 | def extract_keyframes_from_videos(self, output: str, max_concurrent: int) -> None: 188 | ''' 189 | Extracts keyframes from video files. 190 | 191 | :param output: The directory path where keyframes will be saved. 192 | :param max_concurrent: Maximum number of concurrent ffmpeg processes. 193 | ''' 194 | # build keyframes path 195 | keyframes_path = f'{output}/keyframes' 196 | if not os.path.exists(keyframes_path): 197 | os.makedirs(keyframes_path) 198 | 199 | # get all video files 200 | path = f'{output}/downloaded_videos' 201 | files = glob.glob(f'{path}/*.mp4') 202 | 203 | # videos ids already processed 204 | processed_videos = [i.split('\\')[-1] for i in glob.glob(f'{keyframes_path}/*')] 205 | 206 | async def extract_keyframes(file, pbar): 207 | try: 208 | # get id from video filename 209 | video_id = os.path.basename(file).split('.')[0] 210 | if video_id not in processed_videos: 211 | # create subdirectory for this video_id 212 | video_keyframes_dir = f'{keyframes_path}/{video_id}' 213 | if not os.path.exists(video_keyframes_dir): 214 | os.makedirs(video_keyframes_dir) 215 | 216 | # FFmpeg command to extract keyframes 217 | cmd = [ 218 | 'ffmpeg', 219 | '-i', file, 220 | '-vf', 'select=eq(pict_type\\,I)', 221 | '-vsync', 'vfr', 222 | '-q:v', '2', 223 | f'{video_keyframes_dir}/keyframe_%04d.jpg' 224 | ] 225 | 226 | # run FFmpeg as async subprocess 227 | process = await asyncio.create_subprocess_exec( 228 | *cmd, 229 | stdout=asyncio.subprocess.PIPE, 230 | stderr=asyncio.subprocess.PIPE 231 | ) 232 | await process.communicate() 233 | except Exception as e: 234 | print (f'Error extracting keyframes: {e}') 235 | finally: 236 | pbar.update(1) 237 | 238 | async def process_all_videos(): 239 | # create progress bar in the main thread 240 | pbar = tqdm(total=len(files), desc='Extracting keyframes', unit='video') 241 | 242 | # use semaphore to limit concurrent processes 243 | semaphore = asyncio.Semaphore(max_concurrent) 244 | 245 | async def process_with_semaphore(file): 246 | async with semaphore: 247 | await extract_keyframes(file, pbar) 248 | 249 | # create tasks for all videos 250 | tasks = [process_with_semaphore(file) for file in files] 251 | await asyncio.gather(*tasks) 252 | 253 | pbar.close() 254 | 255 | # run the async event loop 256 | self.loop.run_until_complete(process_all_videos()) 257 | -------------------------------------------------------------------------------- /databases/utilities.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # import modules 4 | import re 5 | import json 6 | 7 | # typing 8 | from typing import Dict, Tuple 9 | 10 | ''' 11 | Extract likes and comments from snippet 12 | 13 | ''' 14 | def 
extract_likes_comments(text: str) -> Tuple: 15 | ''' 16 | Extracts likes and comments from a given text. 17 | 18 | :param text: The text containing likes and comments. 19 | :return: A tuple containing the extracted likes and comments, or None if 20 | not found. 21 | ''' 22 | # define regex patterns for likes and comments 23 | likes_pattern = re.compile( 24 | r'(\d+(?:[\d,.]*\d+)?(?:[KM])?) Likes', 25 | re.IGNORECASE 26 | ) 27 | 28 | comments_pattern = re.compile( 29 | r'(\d+(?:[\d,.]*\d+)?(?:[KM])?) Comments', 30 | re.IGNORECASE 31 | ) 32 | 33 | # search for likes and comments in the text 34 | likes_match = likes_pattern.search(text) 35 | comments_match = comments_pattern.search(text) 36 | 37 | # extract the matched groups or return None if not found 38 | likes = likes_match.group(1) if likes_match else None 39 | comments = comments_match.group(1) if comments_match else None 40 | 41 | return likes, comments 42 | 43 | ''' 44 | Extract fields from the field link 45 | 46 | ''' 47 | def extract_author_post_id(link: str) -> Tuple: 48 | ''' 49 | Extracts the author, link to the author's page, and post ID from a TikTok 50 | video link. 51 | 52 | :param link: The TikTok video link. 53 | :return: A tuple containing the author's username, link to the author's 54 | page, and the post ID. 55 | ''' 56 | author = link.split('/')[3].replace('@', '') 57 | link_to_author = f'https://www.tiktok.com/@{author}' 58 | post_id = link.split('/')[-1].split('?')[0] 59 | 60 | return author, link_to_author, post_id 61 | 62 | ''' 63 | Get items and keys from search results entries 64 | 65 | ''' 66 | def get_items_from_search_results(entry: Dict) -> Tuple: 67 | ''' 68 | Extracts and processes specific fields from a data entry. 69 | 70 | :param entry: A dictionary containing the data entry. 71 | :return: A tuple containing the extracted and processed values for the 72 | fields. 73 | ''' 74 | # get values 75 | title = entry.get('title', '') 76 | snippet = entry.get('snippet', '') 77 | link = entry.get('link', '') 78 | 79 | # process new fields from data 80 | likes, comments = extract_likes_comments(snippet) 81 | title_snippet = f'{title} {snippet}' 82 | author, link_to_author, post_id = extract_author_post_id(link) 83 | 84 | 85 | return ( 86 | entry.get('source', None), 87 | entry.get('title', None), 88 | entry.get('snippet', None), 89 | entry.get('link', None), 90 | entry.get('thumbnail', None), 91 | entry.get('video_link', None), 92 | ', '.join(entry.get('snippet_highlighted_words', [])) if entry.get( 93 | 'snippet_highlighted_words' 94 | ) else None, 95 | entry.get('displayed_link', None), 96 | title_snippet, 97 | likes, 98 | comments, 99 | author, 100 | link_to_author, 101 | post_id 102 | ) 103 | 104 | ''' 105 | Get items and keys from images results entries 106 | 107 | ''' 108 | def get_items_from_images_results(entry: Dict) -> Tuple: 109 | ''' 110 | Extracts and processes specific fields from an image results entry. 111 | 112 | :param entry: A dictionary containing the image results entry. 113 | :return: A tuple containing the extracted and processed values for the 114 | fields. 
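
    Illustrative example (hypothetical entry; keys missing from the
    entry come back as None):

        >>> get_items_from_images_results({'source': 'TikTok',
        ...     'link': 'https://www.tiktok.com/@user/video/123'})
        ('TikTok', None, 'https://www.tiktok.com/@user/video/123', None,
         'user', 'https://www.tiktok.com/@user', '123')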
115 | ''' 116 | # get values 117 | link = entry.get('link', '') 118 | 119 | # process new fields from data 120 | author, link_to_author, post_id = extract_author_post_id(link) 121 | 122 | return ( 123 | entry.get('source', None), 124 | entry.get('title', None), 125 | entry.get('link', None), 126 | entry.get('thumbnail', None), 127 | author, 128 | link_to_author, 129 | post_id 130 | ) 131 | 132 | ''' 133 | Get items and keys from related content entries 134 | 135 | ''' 136 | def get_items_from_related_content(entry: Dict) -> Tuple: 137 | ''' 138 | Extracts and processes specific fields from a related content entry. 139 | 140 | :param entry: A dictionary containing the related content entry. 141 | :return: A tuple containing the extracted and processed values for the 142 | fields. 143 | ''' 144 | return ( 145 | entry.get('source', None), 146 | entry.get('link', None), 147 | entry.get('thumbnail', None), 148 | entry.get('title', None) 149 | ) 150 | 151 | ''' 152 | Get items and keys from apify profile data 153 | 154 | ''' 155 | def get_items_from_apify_profile_data(entry: Dict) -> Tuple: 156 | ''' 157 | Extracts and processes specific fields from an apify profile data entry. 158 | 159 | :param entry: A dictionary containing the apify profile data entry. 160 | :return: A tuple containing the extracted and processed values for the 161 | fields. 162 | ''' 163 | # convert lists to JSON strings 164 | hashtags = entry.get('hashtags', []) or [] 165 | hashtags_json_str = json.dumps([h.get('name', '') for h in hashtags]) 166 | 167 | 168 | return ( 169 | entry.get('id', None), 170 | entry.get('text', None), 171 | entry.get('textLanguage', None), 172 | entry.get('createTime', None), 173 | entry.get('createTimeISO', None), 174 | entry.get('isAd', None), 175 | entry.get('webVideoUrl', None), 176 | 177 | # author metadata 178 | entry.get('authorMeta', {}).get('id', None), 179 | entry.get('authorMeta', {}).get('name', None), 180 | entry.get('authorMeta', {}).get('profileUrl', None), 181 | entry.get('authorMeta', {}).get('bioLink', None), 182 | entry.get('authorMeta', {}).get('signature', None), 183 | entry.get('authorMeta', {}).get('nickName', None), 184 | entry.get('authorMeta', {}).get('verified', None), 185 | entry.get('authorMeta', {}).get('avatar', None), 186 | entry.get('authorMeta', {}).get('privateAccount', None), 187 | entry.get('authorMeta', {}).get('region', None), 188 | entry.get('authorMeta', {}).get('following', None), 189 | entry.get('authorMeta', {}).get('friends', None), 190 | entry.get('authorMeta', {}).get('fans', None), 191 | entry.get('authorMeta', {}).get('heart', None), 192 | entry.get('authorMeta', {}).get('video', None), 193 | entry.get('authorMeta', {}).get('digg', None), 194 | 195 | # music metadata 196 | entry.get('musicMeta', {}).get('musicId', None), 197 | entry.get('musicMeta', {}).get('musicName', None), 198 | entry.get('musicMeta', {}).get('musicAuthor', None), 199 | entry.get('musicMeta', {}).get('musicOriginal', None), 200 | 201 | # video metadata 202 | entry.get('videoMeta', {}).get('duration', None), 203 | entry.get('videoMeta', {}).get('coverUrl', None), 204 | entry.get('videoMeta', {}).get('downloadAddr', None), 205 | 206 | # engagement metrics 207 | entry.get('diggCount', None), 208 | entry.get('shareCount', None), 209 | entry.get('playCount', None), 210 | entry.get('collectCount', None), 211 | entry.get('commentCount', None), 212 | 213 | # hashtags 214 | hashtags_json_str, 215 | 216 | # additional metadata 217 | entry.get('isSlideshow', None), 218 | 
entry.get('isPinned', None), 219 | entry.get('isSponsored', None), 220 | entry.get('input') or entry.get('searchQuery'), 221 | entry.get('fromProfileSection', None) 222 | ) 223 | 224 | ''' 225 | Get items and keys from apify hashtag data 226 | 227 | ''' 228 | def get_items_from_apify_hashtag_data(entry: Dict) -> Tuple: 229 | ''' 230 | Extracts and processes specific fields from an apify hashtag data entry. 231 | 232 | :param entry: A dictionary containing the apify hashtag data entry. 233 | :return: A tuple containing the extracted and processed values for the 234 | fields. 235 | ''' 236 | # convert lists to JSON strings 237 | hashtags = entry.get('hashtags', []) or [] 238 | hashtags_json_str = json.dumps([h.get('name', '') for h in hashtags]) 239 | 240 | 241 | return ( 242 | entry.get('id', None), 243 | entry.get('text', None), 244 | entry.get('textLanguage', None), 245 | entry.get('createTime', None), 246 | entry.get('createTimeISO', None), 247 | entry.get('isAd', None), 248 | entry.get('webVideoUrl', None), 249 | 250 | # author metadata 251 | entry.get('authorMeta', {}).get('id', None), 252 | entry.get('authorMeta', {}).get('name', None), 253 | entry.get('authorMeta', {}).get('profileUrl', None), 254 | entry.get('authorMeta', {}).get('bioLink', None), 255 | entry.get('authorMeta', {}).get('signature', None), 256 | entry.get('authorMeta', {}).get('nickName', None), 257 | entry.get('authorMeta', {}).get('verified', None), 258 | entry.get('authorMeta', {}).get('avatar', None), 259 | entry.get('authorMeta', {}).get('privateAccount', None), 260 | entry.get('authorMeta', {}).get('region', None), 261 | entry.get('authorMeta', {}).get('following', None), 262 | entry.get('authorMeta', {}).get('friends', None), 263 | entry.get('authorMeta', {}).get('fans', None), 264 | entry.get('authorMeta', {}).get('heart', None), 265 | entry.get('authorMeta', {}).get('video', None), 266 | entry.get('authorMeta', {}).get('digg', None), 267 | 268 | # music metadata 269 | entry.get('musicMeta', {}).get('musicId', None), 270 | entry.get('musicMeta', {}).get('musicName', None), 271 | entry.get('musicMeta', {}).get('musicAuthor', None), 272 | entry.get('musicMeta', {}).get('musicOriginal', None), 273 | 274 | # video metadata 275 | entry.get('videoMeta', {}).get('duration', None), 276 | entry.get('videoMeta', {}).get('coverUrl', None), 277 | entry.get('videoMeta', {}).get('downloadAddr', None), 278 | 279 | # engagement metrics 280 | entry.get('diggCount', None), 281 | entry.get('shareCount', None), 282 | entry.get('playCount', None), 283 | entry.get('collectCount', None), 284 | entry.get('commentCount', None), 285 | 286 | # hashtags 287 | hashtags_json_str, 288 | 289 | # additional metadata 290 | entry.get('isSlideshow', None), 291 | entry.get('isPinned', None), 292 | entry.get('isSponsored', None), 293 | entry.get('input', None), 294 | entry.get('searchHashtag', {}).get('views', None) 295 | ) 296 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # import modules 4 | import time 5 | import os 6 | 7 | # import argparse 8 | from argparse import ( 9 | ArgumentParser, RawTextHelpFormatter, SUPPRESS 10 | ) 11 | 12 | # import utils 13 | from utils import get_config_attrs, verify_date_argument, \ 14 | create_output_data_path, get_project_root 15 | 16 | # TikTok data collector 17 | from data_collectors import TikTokDataCollector 18 | 19 | # video downloader 20 | from 
media_handlers import VideoDownloader, RequestSession
 21 | 
 22 | def launch_streamlit_app():
 23 | '''Launch the Streamlit web interface'''
 24 | import subprocess
 25 | import sys
 26 | 
 27 | # start process
 28 | log_text = f'''
 29 | > Starting program at: {time.ctime()}
 30 | 
 31 | '''
 32 | print ('\n\n' + ' '.join(log_text.split()).strip())
 33 | 
 34 | print ('\n')
 35 | print('> Launching TikSpyder Streamlit Interface...')
 36 | print('<< Press Ctrl+C to stop the server >>')
 37 | print ('\n')
 38 | print('-' * 50)
 39 | 
 40 | try:
 41 | # Launch streamlit run app.py
 42 | subprocess.run([
 43 | sys.executable,
 44 | "-m", "streamlit", "run",
 45 | os.path.join(os.path.dirname(__file__), 'app.py')
 46 | ], check=True)
 47 | except subprocess.CalledProcessError as e:
 48 | print(f'> Failed to launch Streamlit: {e}')
 49 | print('> Make sure streamlit is installed: pip install streamlit')
 50 | sys.exit(1)
 51 | except KeyboardInterrupt:
 52 | # end process
 53 | print ('\n')
 54 | print('-' * 50)
 55 | log_text = f'''
 56 | > Ending program at: {time.ctime()}
 57 | 
 58 | '''
 59 | print ('\n\n' + ' '.join(log_text.split()).strip())
 60 | sys.exit(0)
 61 | 
 62 | def main():
 63 | # Get current working directory (where command was executed)
 64 | execution_dir = os.getcwd()
 65 | 
 66 | # Get project root directory (where the package is installed)
 67 | project_root = get_project_root()
 68 | 
 69 | # Set up project paths for later use instead of changing directories
 70 | project_paths = {
 71 | 'root': project_root,
 72 | 'config': os.path.join(project_root, 'config'),
 73 | 'execution': execution_dir
 74 | }
 75 | 
 76 | '''
 77 | Arguments
 78 | 
 79 | '''
 80 | formatter = lambda prog: RawTextHelpFormatter(
 81 | prog,
 82 | indent_increment=2,
 83 | max_help_position=52,
 84 | width=None
 85 | )
 86 | 
 87 | parser = ArgumentParser(
 88 | prog='TikSpyder',
 89 | description='Command Line Arguments.',
 90 | formatter_class=formatter,
 91 | add_help=False
 92 | )
 93 | 
 94 | # help arguments
 95 | help_arguments = parser.add_argument_group('Help options')
 96 | help_arguments.add_argument(
 97 | '-h',
 98 | '--help',
 99 | action='help',
 100 | default=SUPPRESS,
 101 | help='Show this help message and exit.'
 102 | )
 103 | 
 104 | # SerpAPI arguments
 105 | serpapi_arguments = parser.add_argument_group('SerpAPI options')
 106 | 
 107 | ''' query '''
 108 | serpapi_arguments.add_argument(
 109 | '--q',
 110 | type=str,
 111 | required=False,
 112 | metavar='',
 113 | help='The search term or phrase for which to retrieve TikTok data.'
 114 | )
 115 | 
 116 | ''' user '''
 117 | serpapi_arguments.add_argument(
 118 | '--user',
 119 | type=str,
 120 | required=False,
 121 | metavar='',
 122 | help='Specify a TikTok user to search for videos from.'
 123 | )
 124 | 
 125 | ''' tag '''
 126 | serpapi_arguments.add_argument(
 127 | '--tag',
 128 | type=str,
 129 | required=False,
 130 | metavar='',
 131 | help='Specify a TikTok tag to search for videos from.'
 132 | )
 133 | 
 134 | ''' google domain '''
 135 | serpapi_arguments.add_argument(
 136 | '--google-domain',
 137 | type=str,
 138 | required=False,
 139 | default='google.com',
 140 | metavar='',
 141 | help='Defines the Google domain to use. It defaults to google.com.'
 142 | )
 143 | 
 144 | ''' gl > country '''
 145 | serpapi_arguments.add_argument(
 146 | '--gl',
 147 | type=str,
 148 | required=False,
 149 | metavar='',
 150 | help=(
 151 | "Defines the country to use for the search. Two-letter country "
 152 | "code."
153 | ) 154 | ) 155 | 156 | ''' hl > language ''' 157 | serpapi_arguments.add_argument( 158 | '--hl', 159 | type=str, 160 | required=False, 161 | metavar='', 162 | help=( 163 | "Defines the language to use for the search. Two-letter language " 164 | "code." 165 | ) 166 | ) 167 | 168 | ''' cr > multiple countries ''' 169 | serpapi_arguments.add_argument( 170 | '--cr', 171 | type=str, 172 | required=False, 173 | metavar='', 174 | help='Defines one or multiple countries to limit the search to.' 175 | ) 176 | 177 | ''' safe > adult content filter ''' 178 | serpapi_arguments.add_argument( 179 | '--safe', 180 | type=str, 181 | required=False, 182 | default='active', 183 | choices=['active', 'off'], 184 | metavar='', 185 | help='Level of filtering for adult content. Options: active (default), off' 186 | ) 187 | 188 | ''' lr > one or multiple languages ''' 189 | serpapi_arguments.add_argument( 190 | '--lr', 191 | type=str, 192 | required=False, 193 | metavar='', 194 | help='Defines one or multiple languages to limit the search to.' 195 | ) 196 | 197 | ''' depth > defines number of iterations for related content ''' 198 | serpapi_arguments.add_argument( 199 | '--depth', 200 | type=int, 201 | required=False, 202 | default=3, 203 | metavar='', 204 | help='Depth of iterations to follow related content links.' 205 | ) 206 | 207 | # Google advanced search arguments 208 | google_advanced_search_arguments = parser.add_argument_group( 209 | 'Google advanced search options' 210 | ) 211 | 212 | ''' search for posts before a given date ''' 213 | google_advanced_search_arguments.add_argument( 214 | '--before', 215 | type=str, 216 | required=False, 217 | metavar='', 218 | help=( 219 | "Limit results to posts published before the specified date. " 220 | "Format: YYYY-MM-DD." 221 | ) 222 | ) 223 | 224 | ''' search for posts after a given date ''' 225 | google_advanced_search_arguments.add_argument( 226 | '--after', 227 | type=str, 228 | required=False, 229 | metavar='', 230 | help=( 231 | "Limit results to posts published after the specified date. " 232 | "Format: YYYY-MM-DD." 233 | ) 234 | ) 235 | 236 | # Apify optional arguments 237 | apify_arguments = parser.add_argument_group( 238 | 'Optional Apify arguments' 239 | ) 240 | 241 | ''' apify integration ''' 242 | apify_arguments.add_argument( 243 | '--apify', 244 | action='store_true', 245 | required=False, 246 | help='Specify whether to use Apify integration.' 247 | ) 248 | 249 | apify_arguments.add_argument( 250 | '--oldest-post-date', 251 | type=str, 252 | required=False, 253 | metavar='', 254 | help=( 255 | "Filter posts newer than the specified date. " 256 | "Format: YYYY-MM-DD." 257 | ) 258 | ) 259 | 260 | apify_arguments.add_argument( 261 | '--newest-post-date', 262 | type=str, 263 | required=False, 264 | metavar='', 265 | help=( 266 | "Filter posts older than the specified date. " 267 | "Format: YYYY-MM-DD." 268 | ) 269 | ) 270 | 271 | apify_arguments.add_argument( 272 | '--number-of-results', 273 | type=int, 274 | default=25, 275 | required=False, 276 | metavar='', 277 | help=( 278 | "Specify the number of results to return from Apify. Default: 25" 279 | ) 280 | ) 281 | 282 | # optional arguments 283 | optional_arguments = parser.add_argument_group( 284 | 'Optional arguments and parameters' 285 | ) 286 | 287 | ''' use tor ''' 288 | optional_arguments.add_argument( 289 | '--use-tor', 290 | action='store_true', 291 | required=False, 292 | help='Specify whether to use Tor for downloading TikTok videos.' 
293 | ) 294 | 295 | ''' download TikTok results ''' 296 | optional_arguments.add_argument( 297 | '-d', 298 | '--download', 299 | action='store_true', 300 | required=False, 301 | help='Specify whether to download TikTok videos from SerpAPI and Apify.' 302 | ) 303 | 304 | ''' max workers > maximum number of threads ''' 305 | optional_arguments.add_argument( 306 | '-w', 307 | '--max-workers', 308 | type=int, 309 | required=False, 310 | metavar='', 311 | help=( 312 | "Specify the maximum number of threads to use for downloading " 313 | "TikTok videos and extracting keyframes." 314 | ) 315 | ) 316 | 317 | ''' output ''' 318 | optional_arguments.add_argument( 319 | '-o', 320 | '--output', 321 | type=str, 322 | required=False, 323 | default=f'./tikspyder-data/{int(time.time())}', 324 | metavar='', 325 | help=( 326 | "Specify output directory path. If not provided, data is " 327 | "saved in the current working directory in a folder named " 328 | "tikspyder-data" 329 | ) 330 | ) 331 | 332 | ''' launch streamlit app ''' 333 | optional_arguments.add_argument( 334 | '--app', 335 | action='store_true', 336 | required=False, 337 | help='Launch the Streamlit web interface instead of using CLI mode.' 338 | ) 339 | 340 | # parse arguments 341 | args = vars(parser.parse_args()) 342 | 343 | # check if user wants to launch Streamlit app 344 | if args.get('app'): 345 | launch_streamlit_app() 346 | return 347 | 348 | # validate that either a query, username or tag was provided 349 | if all(arg is None for arg in [args['user'], args['q'], args['tag']]): 350 | raise ValueError('Either --user, --q or --tag must be provided.') 351 | 352 | # raise error if both user and tag are provided 353 | if args['user'] and args['tag']: 354 | raise ValueError('Both --user and --tag were provided. 
Only one can be used.') 355 | 356 | # merging SerpAPI configuration attrs with the existing arguments 357 | config_attrs = get_config_attrs(project_paths['config']) 358 | args = {**args, **config_attrs} 359 | 360 | # verify provided dates 361 | for date_key in ['before', 'after']: 362 | if args[date_key] is not None: 363 | verify_date_argument(args, date_key) 364 | 365 | # start process 366 | log_text = f''' 367 | > Starting program at: {time.ctime()} 368 | 369 | ''' 370 | print ('\n\n' + ' '.join(log_text.split()).strip()) 371 | 372 | # create the output data path if not exists 373 | output = args['output'] 374 | create_output_data_path(output) 375 | 376 | # TikTokDataCollector instance 377 | collector = TikTokDataCollector(args=args) 378 | 379 | # TikTok data collection call 380 | collector.collect_search_data() 381 | 382 | # read SQL database and generate csv file 383 | collector.generate_data_files() 384 | 385 | # download videos 386 | if args['download']: 387 | print ('') 388 | print ('-' * 30) 389 | print ('> Downloading videos...') 390 | 391 | # get tiktok urls 392 | collected_videos = collector.get_collected_videos() 393 | 394 | if collected_videos: 395 | print (f'\n> Found {len(collected_videos)} videos to download.') 396 | 397 | # define max workers 398 | max_workers = args['max_workers'] if args['max_workers'] else 5 399 | downloader = VideoDownloader(output=output, use_tor=args['use_tor']) 400 | 401 | # start download 402 | downloader.start_download(urls=collected_videos, max_workers=max_workers) 403 | else: 404 | print ('\n> Search results did not return any videos to download.') 405 | 406 | # extract keyframes 407 | print ('\n') 408 | print ('-' * 30) 409 | print ('Extracting keyframes...') 410 | request_session = RequestSession() 411 | 412 | # define max workers 413 | max_workers = args['max_workers'] if args['max_workers'] else 3 414 | request_session.extract_keyframes_from_videos( 415 | output=output, 416 | max_concurrent=max_workers 417 | ) 418 | print ('\n') 419 | print ('-' * 30) 420 | 421 | # end process 422 | log_text = f''' 423 | > Ending program at: {time.ctime()} 424 | 425 | ''' 426 | print ('\n\n' + ' '.join(log_text.split()).strip()) 427 | 428 | if __name__ == '__main__': 429 | main() 430 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | # **TikSpyder** 4 | 5 |
6 | 7 |
8 | 9 | `TikSpyder` is a command-line tool designed to collect TikTok data using SerpAPI for Google search results and Apify for TikTok data extraction. The tool supports video downloading via yt-dlp and uses Python's asynchronous capabilities and multithreading for efficient data collection. 10 | 11 |
12 |
13 | 14 |
15 | 16 | [![GitHub forks](https://img.shields.io/github/forks/estebanpdl/tik-spyder.svg?style=social&label=Fork&maxAge=2592000)](https://GitHub.com/estebanpdl/tik-spyder/network/) 17 | [![GitHub stars](https://img.shields.io/github/stars/estebanpdl/tik-spyder?style=social)](https://github.com/estebanpdl/tik-spyder/stargazers) 18 | [![Open Source](https://badges.frapsoft.com/os/v1/open-source.svg?v=103)](https://x.com/estebanpdl) 19 | [![Made-with-python](https://img.shields.io/badge/Made%20with-Python-1f425f.svg)](https://www.python.org/) 20 | [![Twitter estebanpdl](https://badgen.net/badge/icon/twitter?icon=twitter&label)](https://x.com/estebanpdl) 21 | [![Buy Me A Coffee](https://img.shields.io/badge/buy%20me%20a%20coffee-donate-yellow.svg)](https://buymeacoffee.com/estebanpdl) 22 | 23 |
24 | 25 |
26 | 27 | ## 🔧 **Companion Tools** 28 | 29 | | Tool | Description | Access | 30 | |------|-------------|--------| 31 | | 🎙️ Audio Transcription | Transcribe audio files from TikTok videos | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1qMcMsS2YI9btXGfFN1sCviQeB7RSKqUH) | 32 | 33 |
34 | 
 35 | ## 🖥️ **User Interface Options**
 36 | 
 37 | TikSpyder provides **two ways** to interact with the tool:
 38 | 
 39 | ### **1. 🎨 Streamlit Web Interface (Recommended for Non-Technical Users)**
 40 | A modern, user-friendly web interface with a TikTok-inspired dark theme that makes data collection accessible to everyone.
 41 | 
 42 | ![TikSpyder Streamlit Interface](images/streamlit-interface.png)
 43 | 
 44 | **Features:**
 45 | - 🎯 **Intuitive Configuration**: Easy search setup with a tabbed interface for keywords, users, or hashtags
 46 | - 📅 **Visual Date Filters**: Calendar widgets for precise date range selection
 47 | - 🚀 **Apify Integration**: Simple toggle to enable enhanced data collection
 48 | - ⚙️ **Advanced Options**: Collapsible section for Google search parameters
 49 | - 📥 **Download Settings**: Visual controls for video downloads and Tor network usage
 50 | - 📂 **File Browser**: Point-and-click directory selection
 51 | - 📊 **Real-time Progress**: Live progress tracking with step-by-step status updates
 52 | 
 53 | **Launch the Interface:**
 54 | 
 55 | **Method 1 (Recommended):**
 56 | ```sh
 57 | # Using package installation
 58 | tikspyder --app
 59 | 
 60 | # Using standard installation
 61 | python main.py --app
 62 | ```
 63 | 
 64 | **Method 2 (Direct):**
 65 | ```sh
 66 | streamlit run app.py
 67 | ```
 68 | 
 69 | ### **2. ⌨️ Command Line Interface (For Advanced Users)**
 70 | Full-featured command-line tool for automation and scripting scenarios.
 71 | 
 72 | ## 🔍 **Description**
 73 | 
 74 | TikSpyder offers two main methods of data collection:
 75 | 1. **Google Search Results**: Using SerpAPI to find TikTok videos based on search queries
 76 | 2. **Apify Data Collection**: Using Apify to collect videos directly from TikTok profiles or keywords
 77 | 
 78 | The tool supports various filtering options, including date ranges and content types, and can download both videos and thumbnails. Data is stored in a SQLite database and can be exported to CSV files for further analysis.
 79 | 
 80 | Given the dynamic nature of search results and the constantly evolving landscape of TikTok's platform, it's important to note that the data collected by TikSpyder represents a sample rather than a comprehensive dataset. However, this sample can still be valuable for monitoring trends and identifying emerging narratives in the information ecosystem.
 81 | 
 82 | To get the most out of TikSpyder, **it is recommended to test your query using Google's advanced search features. This can help refine your search query, improve the relevance of your results, and test specific keywords more effectively**. By taking advantage of these features, you can ensure that you're collecting the most relevant data for your research or analysis.
 83 | 
 84 | 
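For reference, TikSpyder's collectors wrap your term in a `site:tiktok.com` query, so you can preview roughly what will be collected by testing something similar directly on Google. A hypothetical example (the exact query the tool builds internally may differ, and the `after:`/`before:` operators shown here correspond conceptually to the `--after`/`--before` options):

```
site:tiktok.com "F-16" (Ukraine OR Russia) after:2024-02-01 before:2024-05-31
```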
85 | 
 86 | ## 🚀 **Features**
 87 | 
 88 | ### **Core Functionality**
 89 | - 🔍 Collects TikTok video links using SerpAPI and Apify
 90 | - 🖼️ Collects and downloads thumbnails for TikTok videos
 91 | - 🔗 Collects content related to the search query
 92 | - 💾 Stores collected data in a SQLite database
 93 | - 📊 Exports data to CSV files for analysis
 94 | - 📹 Downloads TikTok videos using yt-dlp
 95 | - 🎞️ Extracts keyframes from downloaded videos
 96 | - ⚡ Supports asynchronous and multithreaded downloading for improved performance
 97 | - 🔒 Supports the Tor network for enhanced privacy and rate-limiting avoidance
 98 | 
 99 | ### **User Interfaces**
 100 | - 🎨 **Modern Streamlit Web Interface**: User-friendly GUI with a TikTok-inspired dark theme
 101 | - ⌨️ **Command Line Interface**: Full-featured CLI for automation and advanced users
 102 | - 🎯 **Search Types**: Support for keywords, user profiles, and hashtag searches
 103 | - 📅 **Date Range Filtering**: Precise temporal data collection controls
 104 | 
 105 | 
106 | 107 | ## ⚙️ **Requirements** 108 | 109 | ### **System Requirements** 110 | - [Python](https://www.python.org/) >= 3.11.7 111 | - [ffmpeg](https://ffmpeg.org/) (for video processing and keyframe extraction) 112 | 113 | ### **API Keys & Services** 114 | - [SerpAPI key](https://serpapi.com/) (required for Google search functionality) 115 | - [Apify API token](https://apify.com/) (optional, for direct TikTok profile scraping) 116 | 117 | ### **Optional Components** 118 | - [Tor Browser](https://www.torproject.org/) (optional, for enhanced privacy during downloads) 119 | 120 | ### **Platform-Specific Requirements** 121 | - **All Platforms**: Python libraries listed in `requirements.txt` 122 | - **Streamlit Interface**: Automatically installed with requirements 123 | - **Linux Users**: For GUI components, install tkinter: `sudo apt-get install python3-tk` (Ubuntu/Debian) 124 | 125 |
126 | 
 127 | ## 🔧 **Installation**
 128 | 
 129 | ### **Method 1: Standard Installation**
 130 | 
 131 | 1. Clone the repository
 132 | 
 133 | ```sh
 134 | git clone https://github.com/estebanpdl/tik-spyder.git
 135 | cd tik-spyder
 136 | ```
 137 | 
 138 | 2. Install the required packages
 139 | 
 140 | ```sh
 141 | pip install -r requirements.txt
 142 | ```
 143 | 
 144 | or
 145 | 
 146 | ```sh
 147 | pip3 install -r requirements.txt
 148 | ```
 149 | 
 150 | ### **Method 2: Package Installation (Recommended)**
 151 | 
 152 | This method installs TikSpyder as a package, making the `tikspyder` command available from anywhere on your system.
 153 | 
 154 | 1. Clone the repository
 155 | 
 156 | ```sh
 157 | git clone https://github.com/estebanpdl/tik-spyder.git
 158 | cd tik-spyder
 159 | ```
 160 | 
 161 | 2. Install the package in editable mode
 162 | 
 163 | ```sh
 164 | pip install -e .
 165 | ```
 166 | 
 167 | or
 168 | 
 169 | ```sh
 170 | pip3 install -e .
 171 | ```
 172 | 
 173 | After installation, you can use `tikspyder` directly from any directory instead of `python main.py`.
 174 | 
 175 | ### **Configuration**
 176 | 
 177 | 3. Once you obtain an API key from SerpAPI and a token from Apify, populate the config/config.ini file with the values described below. Replace `your_serp_api_key` and `your_apify_token` with your actual API key and token. A quick way to sanity-check the file is shown after the template.
 178 | 
 179 | ```ini
 180 | 
 181 | [SerpAPI Key]
 182 | api_key = your_serp_api_key
 183 | 
 184 | [Apify Token]
 185 | apify_token = your_apify_token
 186 | ```
 187 | 
 188 | 
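If you want to verify the configuration is readable before a first run, a minimal sketch using Python's built-in `configparser` (section and key names taken from the template above):

```python
# minimal sketch: confirm config/config.ini is readable and populated
# (section and key names match the template above)
from configparser import ConfigParser

config = ConfigParser()
config.read('config/config.ini')

api_key = config['SerpAPI Key']['api_key']
apify_token = config['Apify Token']['apify_token']

# placeholders left unchanged mean the file has not been populated yet
print('SerpAPI key set:', api_key != 'your_serp_api_key')
print('Apify token set:', apify_token != 'your_apify_token')
```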
189 | 
 190 | ## 📚 **Usage**
 191 | 
 192 | TikSpyder offers two interface options to suit different user preferences and use cases:
 193 | 
 194 | ## 🎨 **Streamlit Web Interface Usage**
 195 | 
 196 | The Streamlit interface provides an intuitive, visual way to configure and run data collection tasks.
 197 | 
 198 | ### **Launch the Interface**
 199 | 
 200 | ```sh
 201 | # Navigate to TikSpyder directory
 202 | cd tik-spyder
 203 | 
 204 | # Launch the Streamlit app
 205 | streamlit run app.py
 206 | ```
 207 | 
 208 | The interface will automatically open in your default web browser at `http://localhost:8501`.
 209 | 
 210 | ### **Using the Interface**
 211 | 
 212 | 1. **🎯 Configure Search**: Choose between keyword, user profile, or hashtag search
 213 | 2. **📅 Set Date Filters**: Use calendar widgets to define your collection timeframe
 214 | 3. **🚀 Enable Apify** (Optional): Toggle for enhanced direct TikTok data collection
 215 | 4. **⚙️ Adjust Advanced Options**: Fine-tune Google search parameters if needed
 216 | 5. **📥 Configure Downloads**: Set video download preferences and worker counts
 217 | 6. **📂 Choose Output Directory**: Select where your data will be saved
 218 | 7. **🚀 Start Collection**: Click the centered "Start Data Collection" button
 219 | 
 220 | ---
 221 | 
 222 | ## ⌨️ **Command Line Interface Usage**
 223 | 
 224 | For advanced users and automation scenarios, TikSpyder provides a full-featured CLI.
 225 | 
 226 | ### **Using Package Installation (Method 2)**
 227 | 
 228 | ```sh
 229 | tikspyder [OPTIONS]
 230 | ```
 231 | 
 232 | ### **Using Standard Installation (Method 1)**
 233 | 
 234 | ```sh
 235 | python main.py [OPTIONS]
 236 | ```
 237 | 
 238 | ### **Command Line Arguments**
 239 | 
 240 | ```sh
 241 | # Package installation
 242 | tikspyder --help
 243 | 
 244 | # or
 245 | tikspyder -h
 246 | 
 247 | # Standard installation
 248 | python main.py --help
 249 | 
 250 | # or
 251 | python main.py -h
 252 | ```
 253 | 
 254 | ```
 255 | Command Line Arguments.
 256 | 
 257 | Help options:
 258 |   -h, --help           Show this help message and exit.
 259 | 
 260 | SerpAPI options:
 261 |   --q                  The search term or phrase for which to retrieve TikTok data.
 262 |   --user               Specify a TikTok user to search for videos from.
 263 |   --tag                Specify a TikTok tag to search for videos from.
 264 |   --google-domain      Defines the Google domain to use. It defaults to google.com.
 265 |   --gl                 Defines the country to use for the search. Two-letter country code.
 266 |   --hl                 Defines the language to use for the search. Two-letter language code.
 267 |   --cr                 Defines one or multiple countries to limit the search to.
 268 |   --safe               Level of filtering for adult content. Options: active (default), off
 269 |   --lr                 Defines one or multiple languages to limit the search to.
 270 |   --depth              Depth of iterations to follow related content links.
 271 | 
 272 | Google advanced search options:
 273 |   --before             Limit results to posts published before the specified date. Format: YYYY-MM-DD.
 274 |   --after              Limit results to posts published after the specified date. Format: YYYY-MM-DD.
 275 | 
 276 | Optional Apify arguments:
 277 |   --apify              Specify whether to use Apify integration.
 278 |   --oldest-post-date   Filter posts newer than the specified date. Format: YYYY-MM-DD.
 279 |   --newest-post-date   Filter posts older than the specified date. Format: YYYY-MM-DD.
 280 |   --number-of-results  Specify the number of results to return from Apify. Default: 25
 281 | 
 282 | Optional arguments and parameters:
 283 |   --app                Launch the Streamlit web interface instead of using CLI mode.
284 |   --use-tor            Specify whether to use Tor for downloading TikTok videos.
 285 |   -d, --download       Specify whether to download TikTok videos from SerpAPI and Apify.
 286 |   -w , --max-workers   Specify the maximum number of threads to use for downloading TikTok videos and extracting keyframes.
 287 |   -o , --output        Specify output directory path. If not provided, data is saved in the current working directory in a folder named `tikspyder-data`
 288 | ```
 289 | 
 290 | ### **Example Usage**
 291 | 
 292 | 1. Search-based collection:
 293 | 
 294 | ```sh
 295 | # Using package installation (Method 2)
 296 | tikspyder --q "F-16 AND Enemy AND (Ukraine OR Russia)" --gl us --hl en --after 2024-02-01 --before 2024-05-31 --output {output_directory}/ --download
 297 | 
 298 | # Using standard installation (Method 1)
 299 | python main.py --q "F-16 AND Enemy AND (Ukraine OR Russia)" --gl us --hl en --after 2024-02-01 --before 2024-05-31 --output {output_directory}/ --download
 300 | 
 301 | # Note: Replace '{output_directory}' with the desired output path.
 302 | ```
 303 | 
 304 | 2. Profile-based collection:
 305 | 
 306 | ```sh
 307 | # Using package installation (Method 2)
 308 | tikspyder --q Trump --user username --output {output_directory}/ --download --apify --oldest-post-date 2025-01-01
 309 | 
 310 | # Using standard installation (Method 1)
 311 | python main.py --q Trump --user username --output {output_directory}/ --download --apify --oldest-post-date 2025-01-01
 312 | 
 313 | # Note: Replace '{output_directory}' with the desired output path.
 314 | ```
 315 | 
 316 | 3. Tag-based collection:
 317 | ```sh
 318 | # Using package installation (Method 2)
 319 | tikspyder --tag sinaloa --apify --oldest-post-date 2025-08-01 --number-of-results 50 --output {output_directory}/ --download
 320 | 
 321 | # Using standard installation (Method 1)
 322 | python main.py --tag sinaloa --apify --oldest-post-date 2025-08-01 --number-of-results 50 --output {output_directory}/ --download
 323 | 
 324 | # Note: Replace '{output_directory}' with the desired output path.
 325 | ```
 326 | 
 327 | ### **Tor Integration**
 328 | You can use the Tor network when downloading TikTok videos to enhance privacy and avoid rate limiting. To use this feature:
 329 | 
 330 | 1. Make sure Tor Browser is installed and running
 331 | 2. Configure your torrc file with:
 332 | 
 333 | ```
 334 | ## Enable SOCKS proxy
 335 | SocksPort 9050
 336 | 
 337 | ## Enable Control Port for IP rotation
 338 | ControlPort 9051
 339 | CookieAuthentication 1
 340 | ```
 341 | 
 342 | 3. Use the `--use-tor` flag when running the script. If the Tor connection fails, the script will automatically fall back to a normal connection. A minimal connectivity check is shown below.
 343 | 
 344 | 
 345 | 
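As a quick connectivity check — a minimal sketch, not part of TikSpyder itself — you can use `requests` (with PySocks) and `stem`, both already in `requirements.txt`, to route a request through the SOCKS port and ask for a fresh circuit via the control port configured above:

```python
# minimal sketch: route a request through Tor's SOCKS proxy and rotate the
# exit IP via the control port (ports match the torrc snippet above)
import requests

from stem import Signal
from stem.control import Controller

PROXIES = {
    'http': 'socks5h://127.0.0.1:9050',
    'https': 'socks5h://127.0.0.1:9050',
}

def current_ip() -> str:
    # httpbin.org/ip is just an example endpoint for inspecting the exit IP
    response = requests.get(
        'https://httpbin.org/ip', proxies=PROXIES, timeout=30
    )
    return response.json()['origin']

def rotate_ip() -> None:
    with Controller.from_port(port=9051) as controller:
        # uses the cookie enabled by `CookieAuthentication 1`
        controller.authenticate()
        controller.signal(Signal.NEWNYM)

print('before rotation:', current_ip())
rotate_ip()
print('after rotation:', current_ip())
```

Note that Tor rate-limits `NEWNYM` requests, so rapid consecutive rotations may return the same exit IP.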
346 | 347 | ## ☕ Support 348 | 349 | If you find TikSpyder helpful, please consider buying me a coffee to support ongoing development and maintenance. Your donation will help me continue to improve the tool and add new features. 350 | 351 | [![Buy Me A Coffee](https://img.shields.io/badge/buy%20me%20a%20coffee-donate-yellow.svg?style=for-the-badge&logo=buy-me-a-coffee&logoColor=white)](https://buymeacoffee.com/estebanpdl) 352 | 353 |
354 | -------------------------------------------------------------------------------- /data_collectors/collector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # import modules 4 | import os 5 | import time 6 | import json 7 | import uuid 8 | import httpx 9 | 10 | # typing 11 | from typing import Dict, List 12 | 13 | # SerpAPI module 14 | import serpapi 15 | 16 | # Apify client 17 | from apify_client import ApifyClient 18 | 19 | # local dependencies 20 | from .utilities import ( 21 | search_query, 22 | select_serpapi_parameters, 23 | extract_results_keys, 24 | extract_related_content_keys, 25 | build_site_query 26 | ) 27 | 28 | # utils 29 | from pathlib import Path 30 | 31 | # SQLManager 32 | from databases import SQLDatabaseManager 33 | 34 | # Media handlers 35 | from media_handlers import RequestSession 36 | 37 | # SerpAPI collector class 38 | class TikTokDataCollector: 39 | ''' 40 | TikTokDataCollector collects TikTok data from Google search results 41 | using SerpAPI. 42 | ''' 43 | 44 | def __init__(self, args: Dict) -> None: 45 | ''' 46 | Initializes TikTokDataCollector with the given parameters and options 47 | from the command line. 48 | 49 | :param args: Dict containing the command line arguments and options 50 | ''' 51 | # get output data path 52 | self.output = self._sanitize_output_path(args['output']) 53 | 54 | # endpoint for SerpAPI 55 | self.api_key = args['api_key'] 56 | self.endpoint = 'https://serpapi.com/search' 57 | 58 | # Apify token 59 | self.apify_token = args['apify_token'] 60 | 61 | # main site: tiktok.com 62 | self.site = 'tiktok.com' 63 | 64 | # build the search query string 65 | q = search_query(args=args) 66 | 67 | # get provided user and tag 68 | self.user = args['user'] 69 | self.tag = args['tag'] 70 | 71 | # build advanced search query using utility function 72 | self.query = build_site_query( 73 | site=self.site, user=self.user, tag=self.tag, q=q 74 | ) 75 | 76 | # update the query parameter in args 77 | args['q'] = self.query 78 | 79 | # store the parameters 80 | self.parameters = select_serpapi_parameters(args) 81 | 82 | # SerpAPI client 83 | self.client = serpapi.Client(api_key=self.api_key) 84 | 85 | # Apify client 86 | self.run_apify = args['apify'] 87 | if self.run_apify: 88 | if self.user is not None or self.tag is not None: 89 | self.should_download_videos = args['download'] 90 | self.apify_client = ApifyClient(self.apify_token) 91 | 92 | # optional date filters 93 | self.oldest_post_date = args['oldest_post_date'] 94 | self.newest_post_date = args['newest_post_date'] 95 | 96 | # number of results 97 | self.number_of_results = args['number_of_results'] 98 | 99 | # database connection 100 | self.sql_database = SQLDatabaseManager(self.output, self.run_apify) 101 | 102 | # connections 103 | self.related_content_urls = [] 104 | self.related_content_depth = args['depth'] 105 | self.http_session = RequestSession() 106 | 107 | def _sanitize_output_path(self, output: str) -> str: 108 | ''' 109 | Ensures the given path uses forward slashes and does not end with a 110 | slash. 111 | 112 | :param output: The original directory path. 113 | :return: A sanitized directory path with forward slashes and no 114 | trailing slash. 
115 | ''' 116 | # create a Path object and normalize the path 117 | path = Path(output) 118 | 119 | # path with the correct separators for the current OS 120 | output = str(path.as_posix()) 121 | 122 | # remove any trailing slashes 123 | output = output.rstrip('/') 124 | 125 | return output 126 | 127 | def collect_search_results(self) -> None: 128 | ''' 129 | Makes an API call to SerpAPI and processes the response data. 130 | 131 | Fetches data based on the initialized parameters and handles pagination 132 | to retrieve data from all available pages. 133 | ''' 134 | print (f'\nAPI call to Google search results\n') 135 | print (f'> search query: {self.query}') 136 | result_type = 'search_result' 137 | try: 138 | api_response = self.client.search(self.parameters) 139 | print ('\n> Searching...') 140 | 141 | # save raw data 142 | self._save_raw_data( 143 | self.output, 144 | result_type=result_type, 145 | data=api_response.data 146 | ) 147 | 148 | # found results 149 | found_results = False 150 | 151 | # process search results 152 | self._process_search_results(api_response.data) 153 | if api_response.data.get('organic_results', []): 154 | found_results = True 155 | 156 | # get next page 157 | next_page = api_response.next_page_url 158 | while next_page: 159 | # get new API response 160 | next_response = api_response.next_page() 161 | 162 | # save raw data 163 | self._save_raw_data( 164 | self.output, 165 | result_type=result_type, 166 | data=next_response.data 167 | ) 168 | 169 | # process search results 170 | self._process_search_results(next_response.data) 171 | 172 | # get next page 173 | next_page = next_response.next_page_url 174 | 175 | # update api_response for the next iteration 176 | api_response = next_response 177 | 178 | # chill out 179 | time.sleep(2) 180 | 181 | # api call status 182 | print ('> Done') 183 | 184 | if not found_results: 185 | print ('No organic results found.') 186 | 187 | except Exception as e: 188 | print (f'An error occurred during the API call: {e}') 189 | 190 | def _process_search_results(self, data: Dict) -> None: 191 | ''' 192 | Processes the response data from SerpAPI, extracting organic results 193 | and inserting them into the SQL database. 194 | 195 | :param data: SerpAPI raw data response 196 | ''' 197 | # get organic search results 198 | field = 'organic_results' 199 | result_type = 'search_result' 200 | results = data.get(field, []) 201 | if results: 202 | d = extract_results_keys(results, result_type=result_type) 203 | 204 | # write results in SQL database 205 | if d: 206 | self.sql_database.insert_search_results(d) 207 | 208 | def collect_image_results(self) -> None: 209 | ''' 210 | Makes an API call to SerpAPI to collect image thumbnails from Google 211 | Images. 
212 | ''' 213 | # Google Images API 214 | self.parameters['tbm'] = 'isch' 215 | 216 | # collect images 217 | print (f'\n\nAPI call to Google images') 218 | result_type = 'image_result' 219 | try: 220 | api_response = self.client.search(self.parameters) 221 | print ('\n> Searching images...') 222 | 223 | # save raw data 224 | self._save_raw_data( 225 | self.output, 226 | result_type=result_type, 227 | data=api_response.data 228 | ) 229 | 230 | # found results 231 | found_results = False 232 | 233 | # process images results 234 | self._process_images_results(api_response.data) 235 | if api_response.data.get('images_results', []): 236 | found_results = True 237 | print (f'> Downloading images results...') 238 | 239 | # get next page 240 | next_page = api_response.next_page_url 241 | while next_page: 242 | next_response = api_response.next_page() 243 | 244 | # save raw data 245 | self._save_raw_data( 246 | self.output, 247 | result_type=result_type, 248 | data=next_response.data 249 | ) 250 | 251 | # process image results 252 | self._process_images_results(next_response.data) 253 | 254 | # get next page 255 | next_page = next_response.next_page_url 256 | 257 | # update api_response for the next iteration 258 | api_response = next_response 259 | 260 | # chill out 261 | time.sleep(2) 262 | 263 | # api call status 264 | print ('> Done') 265 | 266 | if not found_results: 267 | print ('No image results found in the response.') 268 | 269 | except Exception as e: 270 | print (f'An error occurred during the API call: {e}') 271 | 272 | # collect related content 273 | print (f'\n\nCollecting related content') 274 | if self.related_content_urls: 275 | self.related_content_urls = self.related_content_urls[ 276 | :self.related_content_depth 277 | ] 278 | for url in self.related_content_urls: 279 | self._collect_related_content(url=url) 280 | print ('> Done') 281 | else: 282 | print ('No related content found.') 283 | 284 | def _process_images_results(self, data: Dict) -> None: 285 | ''' 286 | Processes the response data from SerpAPI, extracting thumbnails 287 | and inserting related data into the SQL database. 288 | 289 | :param data: SerpAPI raw data response 290 | ''' 291 | # get image results 292 | field = 'images_results' 293 | result_type = 'image_result' 294 | results = data.get(field, []) 295 | if results: 296 | d = extract_results_keys(results, result_type=result_type) 297 | 298 | # write results in SQL database 299 | if d: 300 | self.sql_database.insert_images_results(d) 301 | 302 | # download images 303 | thumbnails = [i['thumbnail'] for i in d] 304 | links = [i['link'] for i in d] 305 | self.http_session.start_media_download( 306 | urls=thumbnails, 307 | links=links, 308 | output=self.output, 309 | media_type='image' 310 | ) 311 | 312 | # save related content urls 313 | key = 'serpapi_related_content_link' 314 | self.related_content_urls += [ 315 | i[key] for i in d if key in i 316 | ] 317 | 318 | def _collect_related_content(self, url: str) -> None: 319 | ''' 320 | Collects related content from the given URL. 321 | 322 | :param url: The URL to load related content from. 
323 | ''' 324 | result_type = 'related_content' 325 | content = self.http_session.load_related_content( 326 | url=url, 327 | api_key=self.api_key 328 | ) 329 | 330 | # save raw data 331 | self._save_raw_data( 332 | self.output, 333 | result_type=result_type, 334 | data=content 335 | ) 336 | 337 | # process related content 338 | self._process_related_content(content) 339 | 340 | def _process_related_content(self, content: Dict) -> None: 341 | ''' 342 | Processes the related content data. 343 | 344 | :param content: A dictionary containing the related content data. 345 | ''' 346 | # get related content 347 | possible_fields = ['related_content', 'images_results'] 348 | related_content = [] 349 | for field in possible_fields: 350 | related_content = content.get(field, None) 351 | if related_content is not None: 352 | break 353 | 354 | if related_content: 355 | d = extract_related_content_keys(related_content) 356 | 357 | # write results in SQL database 358 | if d: 359 | self.sql_database.insert_related_content(d) 360 | else: 361 | print ('No results found in this URL') 362 | 363 | def _apify_tiktok_profile_scraper(self) -> None: 364 | ''' 365 | Collects search data using Apify. 366 | ''' 367 | print ('\n\nCollecting user data with Apify') 368 | 369 | # get the search results 370 | run_input = { 371 | 'profiles': [self.user], 372 | 'profileScrapeSections': ['videos'], 373 | 'profileSorting': 'latest', 374 | 'resultsPerPage': self.number_of_results, 375 | 'excludePinnedPosts': False, 376 | 'shouldDownloadVideos': self.should_download_videos, 377 | 'shouldDownloadCovers': True, 378 | 'shouldDownloadSubtitles': False, 379 | 'shouldDownloadSlideshowImages': False, 380 | 'shouldDownloadAvatars': True 381 | } 382 | 383 | # add optional date filters 384 | if self.oldest_post_date: 385 | run_input['oldestPostDate'] = self.oldest_post_date 386 | if self.newest_post_date: 387 | run_input['newestPostDate'] = self.newest_post_date 388 | 389 | # run the Apify actor 390 | apify_actor_key = '0FXVyOXXEmdGcV88a' 391 | try: 392 | run = self.apify_client.actor(apify_actor_key).call( 393 | run_input=run_input 394 | ) 395 | 396 | # store data 397 | store_data = [] 398 | for item in self.apify_client.dataset(run['defaultDatasetId']).iterate_items(): 399 | store_data.append(item) 400 | 401 | # write raw data 402 | if store_data: 403 | self._save_raw_data( 404 | self.output, 405 | result_type='apify_profile_data', 406 | data=store_data 407 | ) 408 | 409 | # process data 410 | self._process_apify_profile_data(store_data) 411 | else: 412 | print ('No data found in the Apify run.') 413 | except httpx.LocalProtocolError as e: 414 | print ('Warning: Apify API token is either missing or invalid. Skipping Apify integration.') 415 | 416 | def _process_apify_profile_data(self, data: Dict) -> None: 417 | ''' 418 | Processes the Apify profile data. 419 | 420 | :param data: A dictionary containing the Apify profile data. 
421 | ''' 422 | # insert data into SQL database 423 | self.sql_database.insert_apify_profile_data(data) 424 | 425 | # downloading images 426 | thumbnails = [] 427 | links = [] 428 | for item in data: 429 | try: 430 | thumbnails.append(item['videoMeta']['coverUrl']) 431 | links.append(item['webVideoUrl']) 432 | except KeyError: 433 | pass 434 | 435 | self.http_session.start_media_download( 436 | urls=thumbnails, 437 | links=links, 438 | output=self.output, 439 | media_type='image' 440 | ) 441 | print ('> Thumbnails downloaded') 442 | 443 | # get videos from Apify collected data 444 | if self.should_download_videos: 445 | videos = [] 446 | tiktok_links = [] 447 | for item in data: 448 | try: 449 | videos.append(item['videoMeta']['downloadAddr']) 450 | tiktok_links.append(item['webVideoUrl']) 451 | except KeyError: 452 | pass 453 | 454 | # download videos 455 | self.http_session.start_media_download( 456 | urls=videos, 457 | links=tiktok_links, 458 | output=self.output, 459 | media_type='video' 460 | ) 461 | print ('> Videos downloaded') 462 | 463 | # extract audio from videos 464 | print ('> Extracting audio from videos...') 465 | self.http_session.extract_audio_from_videos(self.output) 466 | print ('> Done') 467 | 468 | return 469 | 470 | def _apify_tiktok_hashtag_scraper(self) -> None: 471 | ''' 472 | Collects hashtag data using Apify. 473 | ''' 474 | print ('\n\nCollecting hashtag data with Apify') 475 | 476 | # get the hashtag results 477 | run_input = { 478 | 'hashtags': [self.tag], 479 | 'resultsPerPage': self.number_of_results, 480 | 'searchSection': '/video', 481 | 'searchQueries': [self.tag], 482 | 'excludePinnedPosts': False, 483 | 'shouldDownloadVideos': self.should_download_videos, 484 | 'shouldDownloadCovers': True, 485 | 'shouldDownloadSubtitles': False, 486 | 'shouldDownloadSlideshowImages': False, 487 | 'shouldDownloadAvatars': True 488 | } 489 | 490 | # run the Apify actor 491 | apify_actor_key = 'OtzYfK1ndEGdwWFKQ' 492 | try: 493 | run = self.apify_client.actor(apify_actor_key).call( 494 | run_input=run_input 495 | ) 496 | 497 | # store data 498 | store_data = [] 499 | for item in self.apify_client.dataset(run['defaultDatasetId']).iterate_items(): 500 | store_data.append(item) 501 | 502 | # write raw data 503 | if store_data: 504 | self._save_raw_data( 505 | self.output, 506 | result_type='apify_hashtag_data', 507 | data=store_data 508 | ) 509 | 510 | # process data 511 | self._process_apify_hashtag_data(store_data) 512 | else: 513 | print ('No data found in the Apify run.') 514 | except httpx.LocalProtocolError as e: 515 | print ('Warning: Apify API token is either missing or invalid. Skipping Apify integration.') 516 | 517 | def _process_apify_hashtag_data(self, data: Dict) -> None: 518 | ''' 519 | Processes the Apify hashtag data. 520 | 521 | :param data: A dictionary containing the Apify hashtag data. 
522 | ''' 523 | # insert data into SQL database 524 | self.sql_database.insert_apify_hashtag_data(data) 525 | 526 | # downloading images 527 | thumbnails = [] 528 | links = [] 529 | for item in data: 530 | try: 531 | thumbnails.append(item['videoMeta']['coverUrl']) 532 | links.append(item['webVideoUrl']) 533 | except KeyError: 534 | pass 535 | 536 | self.http_session.start_media_download( 537 | urls=thumbnails, 538 | links=links, 539 | output=self.output, 540 | media_type='image' 541 | ) 542 | print ('> Thumbnails downloaded') 543 | 544 | # get videos from Apify collected data 545 | if self.should_download_videos: 546 | videos = [] 547 | tiktok_links = [] 548 | for item in data: 549 | try: 550 | videos.append(item['videoMeta']['downloadAddr']) 551 | tiktok_links.append(item['webVideoUrl']) 552 | except KeyError: 553 | pass 554 | 555 | # download videos 556 | self.http_session.start_media_download( 557 | urls=videos, 558 | links=tiktok_links, 559 | output=self.output, 560 | media_type='video' 561 | ) 562 | print ('> Videos downloaded') 563 | 564 | # extract audio from videos 565 | print ('> Extracting audio from videos...') 566 | self.http_session.extract_audio_from_videos(self.output) 567 | print ('> Done') 568 | 569 | return 570 | 571 | def _save_raw_data(self, output: str, result_type: str, data: Dict) -> None: 572 | ''' 573 | Saves the raw data response from SerpAPI in a JSON file. 574 | 575 | :param output: The directory path where the raw data should be saved. 576 | :param result_type: Type of SerpAPI response: 'search_result', 577 | 'image_result', 'related_content', or Apify response 578 | :param data: The raw data response from SerpAPI to be saved. 579 | ''' 580 | # create the directory structure if it does not exist 581 | folder = f'{output}/raw_data/{result_type}' 582 | if not os.path.exists(folder): 583 | os.makedirs(folder) 584 | 585 | # create a timestamp for the file name 586 | stamp = int(time.time()) 587 | uuid_code = str(uuid.uuid4()).split('-')[-1] 588 | 589 | # convert the data to a JSON string 590 | obj = json.dumps(data, ensure_ascii=False, indent=2) 591 | 592 | # write the JSON string to a file 593 | file_path = f'{folder}/{result_type}_{stamp}_{uuid_code}.json' 594 | with open(file_path, encoding='utf-8', mode='w') as writer: 595 | writer.write(obj) 596 | 597 | def collect_search_data(self) -> None: 598 | ''' 599 | Collects both search results and corresponding image thumbnails. 600 | ''' 601 | print ('\n\n') 602 | print ('-' * 30) 603 | print ('Starting data collection process...\n') 604 | 605 | self.collect_search_results() 606 | self.collect_image_results() 607 | 608 | if self.run_apify: 609 | if self.user is not None: 610 | self._apify_tiktok_profile_scraper() 611 | elif self.tag is not None: 612 | self._apify_tiktok_hashtag_scraper() 613 | 614 | print ('\n\nData collection complete.') 615 | print ('-' * 30) 616 | 617 | def generate_data_files(self) -> None: 618 | ''' 619 | Selects all data from SQL tables and generates CSV files 620 | ''' 621 | print (f'\n\nGenerating CSV files') 622 | self.sql_database.fetch_all_data() 623 | print ('> Done') 624 | 625 | def get_collected_videos(self) -> List[str]: 626 | ''' 627 | Retrieves all collected video links from the SQL database. 628 | 629 | :return: A list of unique video links. 
630 | ''' 631 | return self.sql_database.get_collected_videos( 632 | include_user_related_content=self.user is not None 633 | ) 634 | 635 | def get_all_collected_videos(self) -> List[str]: 636 | ''' 637 | Retrieves all unique video links from the query_search_results, 638 | images_results, and Apify tables. 639 | ''' 640 | return self.sql_database.get_all_collected_videos() 641 | -------------------------------------------------------------------------------- /databases/sql_manager.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # import modules 4 | import os 5 | import sqlite3 6 | import pandas as pd 7 | 8 | # SQL submodules 9 | from sqlite3 import Error 10 | 11 | # typing 12 | from typing import List, Optional 13 | 14 | # Database Manager utilities 15 | from .utilities import get_items_from_search_results, \ 16 | get_items_from_images_results, get_items_from_related_content, \ 17 | get_items_from_apify_profile_data, get_items_from_apify_hashtag_data, \ 18 | extract_author_post_id 19 | 20 | # SQLDatabaseManager class 21 | class SQLDatabaseManager: 22 | ''' 23 | SQLDatabaseManager 24 | 25 | This class provides an abstracted interface for interacting with a SQL 26 | database. 27 | ''' 28 | def __init__(self, output: str, run_apify: bool) -> None: 29 | ''' 30 | Initializes the SQLDatabaseManager with the given output path. 31 | 32 | :param output: The directory path where the database file will be 33 | created. 34 | :param run_apify: Whether to run the apify profile scraper. 35 | ''' 36 | self.output = output 37 | self.sql_database_file = f'{self.output}/database.sql' 38 | 39 | # create required SQL tables for data processing - SerpAPI 40 | self.create_search_results_table() 41 | self.create_images_results_table() 42 | self.create_related_content_table() 43 | 44 | # create required SQL tables for data processing - Apify 45 | self.create_apify_profile_scraper_table() 46 | self.create_apify_hashtag_scraper_table() 47 | 48 | def create_sql_connection(self) -> Optional[sqlite3.Connection]: 49 | ''' 50 | Creates a SQL connection. 51 | 52 | :return: A SQLite connection object or None if an error occurred 53 | ''' 54 | try: 55 | conn = sqlite3.connect(self.sql_database_file) 56 | return conn 57 | except Error as e: 58 | print (f'An error occurred: {e}') 59 | return None 60 | 61 | def create_search_results_table(self) -> None: 62 | ''' 63 | Creates the query_search_results table if it does not already exist. 64 | ''' 65 | # set cursor 66 | conn = self.create_sql_connection() 67 | if conn is not None: 68 | cursor = conn.cursor() 69 | 70 | try: 71 | cursor.execute( 72 | ''' 73 | CREATE TABLE IF NOT EXISTS query_search_results ( 74 | record_id INTEGER PRIMARY KEY AUTOINCREMENT, 75 | source TEXT, 76 | title TEXT, 77 | snippet TEXT, 78 | link TEXT UNIQUE, 79 | thumbnail TEXT, 80 | video_link TEXT, 81 | snippet_highlighted_words TEXT, 82 | displayed_link TEXT, 83 | title_snippet TEXT, 84 | likes TEXT, 85 | comments TEXT, 86 | author TEXT, 87 | link_to_author TEXT, 88 | post_id TEXT UNIQUE 89 | ); 90 | ''' 91 | ) 92 | 93 | # commit changes 94 | conn.commit() 95 | except Error as e: 96 | print (f'An error occurred: {e}') 97 | finally: 98 | conn.close() 99 | else: 100 | print ('Failed to create the database connection.') 101 | 102 | def insert_search_results(self, data: List) -> None: 103 | ''' 104 | Inserts data into the query_search_results table. 105 | 106 | :param data: A list of dictionaries containing the data to insert. 
107 | ''' 108 | conn = self.create_sql_connection() 109 | if conn is not None: 110 | cursor = conn.cursor() 111 | 112 | try: 113 | for entry in data: 114 | cursor.execute( 115 | ''' 116 | INSERT OR IGNORE INTO query_search_results ( 117 | source, title, snippet, link, thumbnail, 118 | video_link, snippet_highlighted_words, 119 | displayed_link, title_snippet, likes, comments, 120 | author, link_to_author, post_id 121 | ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 122 | ''', 123 | get_items_from_search_results(entry) 124 | ) 125 | 126 | # commit changes 127 | conn.commit() 128 | 129 | except Error as e: 130 | print (f'An error occurred while inserting data: {e}') 131 | finally: 132 | conn.close() 133 | else: 134 | print ('Failed to create the database connection.') 135 | 136 | def create_images_results_table(self) -> None: 137 | ''' 138 | Creates the images_results table if it does not already exist. 139 | ''' 140 | # set cursor 141 | conn = self.create_sql_connection() 142 | if conn is not None: 143 | cursor = conn.cursor() 144 | 145 | try: 146 | cursor.execute( 147 | ''' 148 | CREATE TABLE IF NOT EXISTS images_results ( 149 | record_id INTEGER PRIMARY KEY AUTOINCREMENT, 150 | source TEXT, 151 | title TEXT, 152 | link TEXT UNIQUE, 153 | thumbnail TEXT, 154 | author TEXT, 155 | link_to_author TEXT, 156 | post_id TEXT UNIQUE 157 | ); 158 | ''' 159 | ) 160 | 161 | # commit changes 162 | conn.commit() 163 | except Error as e: 164 | print (f'An error occurred: {e}') 165 | finally: 166 | conn.close() 167 | else: 168 | print ('Failed to create the database connection.') 169 | 170 | def insert_images_results(self, data: List) -> None: 171 | ''' 172 | Inserts data into the images_results table. 173 | 174 | :param data: A list of dictionaries containing the data to insert. 175 | ''' 176 | conn = self.create_sql_connection() 177 | if conn is not None: 178 | cursor = conn.cursor() 179 | 180 | try: 181 | for entry in data: 182 | cursor.execute( 183 | ''' 184 | INSERT OR IGNORE INTO images_results ( 185 | source, title, link, thumbnail, author, 186 | link_to_author, post_id 187 | ) VALUES (?, ?, ?, ?, ?, ?, ?) 188 | ''', 189 | get_items_from_images_results(entry) 190 | ) 191 | 192 | # commit changes 193 | conn.commit() 194 | 195 | except Error as e: 196 | print (f'An error occurred while inserting data: {e}') 197 | finally: 198 | conn.close() 199 | else: 200 | print ('Failed to create the database connection.') 201 | 202 | def create_related_content_table(self) -> None: 203 | ''' 204 | Creates the related_content table if it does not already exist. 205 | ''' 206 | # set cursor 207 | conn = self.create_sql_connection() 208 | if conn is not None: 209 | cursor = conn.cursor() 210 | 211 | try: 212 | cursor.execute( 213 | ''' 214 | CREATE TABLE IF NOT EXISTS related_content ( 215 | record_id INTEGER PRIMARY KEY AUTOINCREMENT, 216 | source TEXT, 217 | link TEXT UNIQUE, 218 | thumbnail TEXT, 219 | title TEXT 220 | ); 221 | ''' 222 | ) 223 | 224 | # commit changes 225 | conn.commit() 226 | except Error as e: 227 | print (f'An error occurred: {e}') 228 | finally: 229 | conn.close() 230 | else: 231 | print ('Failed to create the database connection.') 232 | 233 | def insert_related_content(self, data: List) -> None: 234 | ''' 235 | Inserts data into the related_content table. 236 | 237 | :param data: A list of dictionaries containing the data to insert. 
238 | ''' 239 | conn = self.create_sql_connection() 240 | if conn is not None: 241 | cursor = conn.cursor() 242 | 243 | try: 244 | for entry in data: 245 | cursor.execute( 246 | ''' 247 | INSERT OR IGNORE INTO related_content ( 248 | source, link, thumbnail, title 249 | ) VALUES (?, ?, ?, ?) 250 | ''', 251 | get_items_from_related_content(entry) 252 | ) 253 | 254 | # commit changes 255 | conn.commit() 256 | 257 | except Error as e: 258 | print (f'An error occurred while inserting data: {e}') 259 | finally: 260 | conn.close() 261 | else: 262 | print ('Failed to create the database connection.') 263 | 264 | def create_apify_profile_scraper_table(self) -> None: 265 | ''' 266 | Creates the apify_profile_scraper table if it does not already exist. 267 | ''' 268 | conn = self.create_sql_connection() 269 | if conn is not None: 270 | cursor = conn.cursor() 271 | 272 | try: 273 | cursor.execute( 274 | ''' 275 | CREATE TABLE IF NOT EXISTS apify_profile_scraper ( 276 | id TEXT PRIMARY KEY, 277 | text TEXT, 278 | text_language TEXT, 279 | create_time INTEGER, 280 | create_time_iso TEXT, 281 | is_ad BOOLEAN, 282 | web_video_url TEXT UNIQUE, 283 | 284 | author_id TEXT, 285 | author_name TEXT, 286 | author_profile_url TEXT, 287 | author_bio_link TEXT, 288 | author_signature TEXT, 289 | author_nickname TEXT, 290 | author_verified BOOLEAN, 291 | author_avatar TEXT, 292 | author_private_account BOOLEAN, 293 | author_region TEXT, 294 | author_following INTEGER, 295 | author_friends INTEGER, 296 | author_fans INTEGER, 297 | author_heart INTEGER, 298 | author_video INTEGER, 299 | author_digg INTEGER, 300 | 301 | music_id TEXT, 302 | music_name TEXT, 303 | music_author TEXT, 304 | music_original BOOLEAN, 305 | 306 | video_duration INTEGER, 307 | video_thumbnail TEXT, 308 | video_download_url TEXT, 309 | 310 | digg_count INTEGER, 311 | share_count INTEGER, 312 | play_count INTEGER, 313 | collect_count INTEGER, 314 | comment_count INTEGER, 315 | 316 | hashtags TEXT, 317 | is_slideshow BOOLEAN, 318 | is_pinned BOOLEAN, 319 | is_sponsored BOOLEAN, 320 | input_username TEXT, 321 | from_profile_section TEXT, 322 | 323 | UNIQUE (id, web_video_url) 324 | ON CONFLICT REPLACE 325 | ); 326 | ''' 327 | ) 328 | 329 | # commit changes 330 | conn.commit() 331 | except Error as e: 332 | print (f'An error occurred: {e}') 333 | finally: 334 | conn.close() 335 | else: 336 | print ('Failed to create the database connection.') 337 | 338 | def create_apify_hashtag_scraper_table(self) -> None: 339 | ''' 340 | Creates the apify_hashtag_scraper table if it does not already exist. 
341 | ''' 342 | conn = self.create_sql_connection() 343 | if conn is not None: 344 | cursor = conn.cursor() 345 | 346 | try: 347 | cursor.execute( 348 | ''' 349 | CREATE TABLE IF NOT EXISTS apify_hashtag_scraper ( 350 | id TEXT PRIMARY KEY, 351 | text TEXT, 352 | text_language TEXT, 353 | create_time INTEGER, 354 | create_time_iso TEXT, 355 | is_ad BOOLEAN, 356 | web_video_url TEXT UNIQUE, 357 | 358 | author_id TEXT, 359 | author_name TEXT, 360 | author_profile_url TEXT, 361 | author_bio_link TEXT, 362 | author_signature TEXT, 363 | author_nickname TEXT, 364 | author_verified BOOLEAN, 365 | author_avatar TEXT, 366 | author_private_account BOOLEAN, 367 | author_region TEXT, 368 | author_following INTEGER, 369 | author_friends INTEGER, 370 | author_fans INTEGER, 371 | author_heart INTEGER, 372 | author_video INTEGER, 373 | author_digg INTEGER, 374 | 375 | music_id TEXT, 376 | music_name TEXT, 377 | music_author TEXT, 378 | music_original BOOLEAN, 379 | 380 | video_duration INTEGER, 381 | video_thumbnail TEXT, 382 | video_download_url TEXT, 383 | 384 | digg_count INTEGER, 385 | share_count INTEGER, 386 | play_count INTEGER, 387 | collect_count INTEGER, 388 | comment_count INTEGER, 389 | 390 | hashtags TEXT, 391 | is_slideshow BOOLEAN, 392 | is_pinned BOOLEAN, 393 | is_sponsored BOOLEAN, 394 | input_search TEXT, 395 | search_hashtag_views INTEGER, 396 | 397 | UNIQUE (id, web_video_url) 398 | ON CONFLICT REPLACE 399 | ); 400 | ''' 401 | ) 402 | 403 | # commit changes 404 | conn.commit() 405 | except Error as e: 406 | print (f'An error occurred: {e}') 407 | finally: 408 | conn.close() 409 | else: 410 | print ('Failed to create the database connection.') 411 | 412 | def insert_apify_profile_data(self, data: List) -> None: 413 | ''' 414 | Inserts data into the apify_profile_scraper table. 415 | 416 | :param data: A list of dictionaries containing the data to insert. 417 | ''' 418 | conn = self.create_sql_connection() 419 | if conn is not None: 420 | cursor = conn.cursor() 421 | 422 | try: 423 | for entry in data: 424 | cursor.execute( 425 | ''' 426 | INSERT OR REPLACE INTO apify_profile_scraper ( 427 | id, text, text_language, create_time, create_time_iso, 428 | is_ad, web_video_url, author_id, author_name, 429 | author_profile_url, author_bio_link, author_signature, 430 | author_nickname, author_verified, author_avatar, 431 | author_private_account, author_region, author_following, 432 | author_friends, author_fans, author_heart, author_video, 433 | author_digg, music_id, music_name, music_author, 434 | music_original, video_duration, video_thumbnail, 435 | video_download_url, digg_count, share_count, play_count, 436 | collect_count, comment_count, hashtags, is_slideshow, 437 | is_pinned, is_sponsored, input_username, 438 | from_profile_section 439 | ) VALUES ( 440 | ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 441 | ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 442 | ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? 443 | ) 444 | ''', 445 | get_items_from_apify_profile_data(entry) 446 | ) 447 | 448 | # commit changes 449 | conn.commit() 450 | except Error as e: 451 | print (f'An error occurred while inserting data: {e}') 452 | finally: 453 | conn.close() 454 | else: 455 | print ('Failed to create the database connection.') 456 | 457 | def insert_apify_hashtag_data(self, data: List) -> None: 458 | ''' 459 | Inserts data into the apify_hashtag_scraper table. 460 | 461 | :param data: A list of dictionaries containing the data to insert. 
462 |         '''
463 |         conn = self.create_sql_connection()
464 |         if conn is not None:
465 |             cursor = conn.cursor()
466 | 
467 |             try:
468 |                 for entry in data:
469 |                     cursor.execute(
470 |                         '''
471 |                         INSERT OR REPLACE INTO apify_hashtag_scraper (
472 |                             id, text, text_language, create_time, create_time_iso,
473 |                             is_ad, web_video_url, author_id, author_name,
474 |                             author_profile_url, author_bio_link, author_signature,
475 |                             author_nickname, author_verified, author_avatar,
476 |                             author_private_account, author_region, author_following,
477 |                             author_friends, author_fans, author_heart, author_video,
478 |                             author_digg, music_id, music_name, music_author,
479 |                             music_original, video_duration, video_thumbnail,
480 |                             video_download_url, digg_count, share_count, play_count,
481 |                             collect_count, comment_count, hashtags, is_slideshow,
482 |                             is_pinned, is_sponsored, input_search,
483 |                             search_hashtag_views
484 |                         ) VALUES (
485 |                             ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
486 |                             ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
487 |                             ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?
488 |                         )
489 |                         ''',
490 |                         get_items_from_apify_hashtag_data(entry)
491 |                     )
492 | 
493 |                 # commit changes
494 |                 conn.commit()
495 |             except Error as e:
496 |                 print(f'An error occurred while inserting data: {e}')
497 |             finally:
498 |                 conn.close()
499 |         else:
500 |             print('Failed to create the database connection.')
501 | 
502 |     def fetch_all_data(self) -> None:
503 |         '''
504 |         Fetches all data from the SQL tables and exports each table to CSV.
505 |         '''
506 |         tables = [
507 |             'query_search_results',
508 |             'images_results',
509 |             'related_content',
510 |             'apify_profile_scraper',
511 |             'apify_hashtag_scraper'
512 |         ]
513 |         conn = self.create_sql_connection()
514 |         if conn is not None:
515 |             try:
516 |                 for t in tables:
517 |                     q = f'''
518 |                     SELECT *
519 |                     FROM {t}
520 |                     '''
521 |                     # fetch data
522 |                     df = pd.read_sql_query(q, conn)
523 | 
524 |                     # save data
525 |                     save_path = f'{self.output}/{t}.csv'
526 |                     df.to_csv(
527 |                         save_path,
528 |                         index=False,
529 |                         encoding='utf-8'
530 |                     )
531 | 
532 |             except Exception as e:  # pd.read_sql_query failures are not sqlite3.Error instances
533 |                 print(f'An error occurred while fetching data from {t}: {e}')
534 |             finally:
535 |                 conn.close()
536 | 
537 |     def get_collected_videos(self, include_user_related_content: bool) -> List:
538 |         '''
539 |         Retrieves all unique video links from the query_search_results and
540 |         images_results tables that have not been downloaded yet.
541 | 
542 |         :param include_user_related_content: Whether to include user-related
543 |             content from Google search results in the returned list of links.
544 |         :return: A list of unique video links.
545 |         '''
546 |         data = []
547 |         conn = self.create_sql_connection()
548 |         if conn is not None:
549 |             cursor = conn.cursor()
550 | 
551 |             try:
552 |                 # get all video links from database
553 |                 cursor.execute(
554 |                     '''
555 |                     SELECT link
556 |                     FROM query_search_results
557 |                     UNION
558 |                     SELECT link
559 |                     FROM images_results
560 |                     '''
561 |                 )
562 | 
563 |                 # fetch all links
564 |                 all_links = [i[0] for i in cursor.fetchall()]
565 | 
566 |                 if include_user_related_content and all_links:
567 |                     # get user from the first link (the check above avoids an IndexError on empty results)
568 |                     user = extract_author_post_id(all_links[0])[0]
569 | 
570 |                     # get all user-related content links that match the user's TikTok video pattern
571 |                     cursor.execute(
572 |                         '''
573 |                         SELECT link
574 |                         FROM related_content
575 |                         WHERE link LIKE ?
576 |                         ''',
577 |                         (f'https://www.tiktok.com/@{user}/video/%',)
578 |                     )
579 | 
580 |                     # fetch all links
581 |                     all_links.extend([i[0] for i in cursor.fetchall()])
582 | 
583 |                 # remove duplicates
584 |                 all_links = list(set(all_links))
585 | 
586 |                 # get list of already downloaded videos
587 |                 videos_dir = os.path.join(self.output, 'downloaded_videos')
588 | 
589 |                 if os.path.exists(videos_dir):
590 |                     # get existing video ids
591 |                     existing_ids = {
592 |                         os.path.splitext(f)[0]
593 |                         for f in os.listdir(videos_dir)
594 |                         if os.path.isfile(os.path.join(videos_dir, f))
595 |                     }
596 | 
597 |                     # filter out links whose IDs are already downloaded
598 |                     data = [
599 |                         link for link in all_links
600 |                         if extract_author_post_id(link)[2] not in existing_ids
601 |                     ]
602 |                 else:
603 |                     data = all_links
604 |             except Error as e:
605 |                 print(f'An error occurred while retrieving data: {e}')
606 |             finally:
607 |                 conn.close()
608 | 
609 |         return data
610 | 
611 |     def get_all_collected_videos(self) -> List:
612 |         '''
613 |         Retrieves all unique video links from the query_search_results,
614 |         images_results, and Apify tables. :return: A list of unique links.
615 |         '''
616 |         conn = self.create_sql_connection()
617 |         if conn is not None:
618 |             cursor = conn.cursor()
619 | 
620 |             try:
621 |                 # get all video links from database
622 |                 cursor.execute(
623 |                     '''
624 |                     SELECT web_video_url
625 |                     FROM apify_profile_scraper
626 |                     UNION
627 |                     SELECT web_video_url
628 |                     FROM apify_hashtag_scraper
629 |                     UNION
630 |                     SELECT link
631 |                     FROM query_search_results
632 |                     UNION
633 |                     SELECT link
634 |                     FROM images_results
635 |                     '''
636 |                 )
637 | 
638 |                 # fetch all links
639 |                 all_links = [i[0] for i in cursor.fetchall()]
640 | 
641 |                 # remove duplicates
642 |                 all_links = list(set(all_links))
643 | 
644 |                 return all_links
645 |             except Error as e:
646 |                 print(f'An error occurred while retrieving data: {e}')
647 |             finally:
648 |                 conn.close()
649 | 
650 |         return []
651 | 
--------------------------------------------------------------------------------
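A minimal usage sketch of how the methods above fit together, for orientation only. The constructor arguments, the `apify_items` variable, and the file name are assumptions for illustration; none of them appear in sql_manager.py (only the class name `SQLDatabaseManager` and `self.output` are taken from the source).

# usage_sketch.py: hypothetical example, not a file in the repository
from databases import SQLDatabaseManager

# assumed constructor: takes the output directory that fetch_all_data()
# and get_collected_videos() read via self.output
db = SQLDatabaseManager(output='./tikspyder-data/1700000000')

# create the destination table, then upsert a batch of Apify actor results
db.create_apify_profile_scraper_table()
db.insert_apify_profile_data(apify_items)  # apify_items: list of dicts

# export every table to <output>/<table>.csv
db.fetch_all_data()

# video links not yet present in <output>/downloaded_videos/
pending = db.get_collected_videos(include_user_related_content=False)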