├── streamlit_app ├── __init__.py ├── styles │ ├── __init__.py │ └── css.py ├── components │ ├── __init__.py │ ├── progress.py │ ├── main_panel.py │ └── sidebar.py ├── core │ ├── __init__.py │ ├── keyframes_processor.py │ └── collection_runner.py └── utils │ ├── __init__.py │ ├── session_state.py │ └── file_browser.py ├── databases ├── __init__.py ├── utilities.py └── sql_manager.py ├── data_collectors ├── __init__.py ├── utilities.py └── collector.py ├── images └── streamlit-interface.png ├── .gitignore ├── config └── config.ini ├── media_handlers ├── __init__.py ├── video_downloader.py └── session_manager.py ├── requirements.txt ├── setup.py ├── app.py ├── utils └── __init__.py ├── main.py └── README.md /streamlit_app/__init__.py: -------------------------------------------------------------------------------- 1 | # Streamlit UI package -------------------------------------------------------------------------------- /streamlit_app/styles/__init__.py: -------------------------------------------------------------------------------- 1 | # Styles and CSS -------------------------------------------------------------------------------- /streamlit_app/components/__init__.py: -------------------------------------------------------------------------------- 1 | # UI Components -------------------------------------------------------------------------------- /streamlit_app/core/__init__.py: -------------------------------------------------------------------------------- 1 | # Core business logic -------------------------------------------------------------------------------- /streamlit_app/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Streamlit utilities -------------------------------------------------------------------------------- /databases/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from .sql_manager import SQLDatabaseManager 3 | -------------------------------------------------------------------------------- /data_collectors/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from .collector import TikTokDataCollector 3 | -------------------------------------------------------------------------------- /images/streamlit-interface.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/estebanpdl/tik-spyder/HEAD/images/streamlit-interface.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore Cache 2 | .ipynb_checkpoints/ 3 | __pycache__/ 4 | .vscode/ 5 | 6 | # package metadata 7 | *.egg-info 8 | -------------------------------------------------------------------------------- /config/config.ini: -------------------------------------------------------------------------------- 1 | [SerpAPI Key] 2 | api_key = your_serp_api_key 3 | 4 | [Apify Token] 5 | apify_token = your_apify_token 6 | -------------------------------------------------------------------------------- /media_handlers/__init__.py: -------------------------------------------------------------------------------- 1 | from .session_manager import RequestSession 2 | from .video_downloader import VideoDownloader 3 | -------------------------------------------------------------------------------- /requirements.txt: 
-------------------------------------------------------------------------------- 1 | aiohttp 2 | apify-client 3 | httpx 4 | pandas 5 | PySocks 6 | requests 7 | serpapi 8 | stem 9 | streamlit 10 | tqdm 11 | yt-dlp[default] 12 | -------------------------------------------------------------------------------- /streamlit_app/utils/session_state.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # import modules 4 | import streamlit as st 5 | import time 6 | 7 | def initialize_session_state(): 8 | """Initialize session state variables""" 9 | if 'output_dir' not in st.session_state: 10 | timestamp = int(time.time()) 11 | st.session_state.output_dir = f'./tikspyder-data/{timestamp}' -------------------------------------------------------------------------------- /streamlit_app/utils/file_browser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # import modules 4 | import tkinter as tk 5 | 6 | # import submodules 7 | from tkinter import filedialog 8 | 9 | def select_directory(): 10 | """Create a directory picker dialog""" 11 | root = tk.Tk() 12 | root.withdraw() 13 | root.wm_attributes('-topmost', 1) 14 | folder_path = filedialog.askdirectory() 15 | root.destroy() 16 | return folder_path -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # import modules 4 | import os 5 | from setuptools import setup, find_packages 6 | 7 | setup( 8 | name="tikspyder", 9 | version="0.1.0", 10 | packages=find_packages(), 11 | install_requires=[ 12 | "aiohttp", 13 | "apify-client", 14 | "pandas", 15 | "PySocks", 16 | "requests", 17 | "serpapi", 18 | "stem", 19 | "streamlit", 20 | "tqdm", 21 | "yt-dlp[default]" 22 | ], 23 | entry_points={ 24 | 'console_scripts': [ 25 | 'tikspyder=main:main', 26 | ], 27 | }, 28 | python_requires='>=3.6', 29 | author="Esteban Ponce de Leon", 30 | description="A tool for collecting TikTok data", 31 | long_description=open('README.md', encoding='utf-8').read() if os.path.exists('README.md') else '', 32 | long_description_content_type="text/markdown", 33 | ) 34 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # import modules 4 | import streamlit as st 5 | import os 6 | 7 | # local imports 8 | from streamlit_app.styles.css import load_css 9 | from streamlit_app.utils.session_state import initialize_session_state 10 | from streamlit_app.components.sidebar import render_sidebar 11 | from streamlit_app.components.main_panel import render_main_panel 12 | from streamlit_app.core.collection_runner import run_collection, validate_input 13 | from utils import get_config_attrs, get_project_root 14 | 15 | # Configure Streamlit page 16 | st.set_page_config( 17 | page_title="TikSpyder - TikTok Data Collection", 18 | page_icon="🕷️", 19 | layout="wide", 20 | initial_sidebar_state="expanded" 21 | ) 22 | 23 | # Set theme programmatically to dark 24 | st._config.set_option('theme.base', 'dark') 25 | st._config.set_option('theme.backgroundColor', '#0e1117') 26 | st._config.set_option('theme.secondaryBackgroundColor', '#262730') 27 | st._config.set_option('theme.textColor', '#ffffff') 28 | 29 | def main(): 30 | """Main application entry point""" 31 | # Load 
styling 32 | load_css() 33 | 34 | # Initialize session state 35 | initialize_session_state() 36 | 37 | # Get project configuration 38 | project_root = get_project_root() 39 | config_path = os.path.join(project_root, 'config') 40 | config_attrs = get_config_attrs(config_path) 41 | 42 | # Main header 43 | st.markdown('

<h1>🕷️ TikSpyder</h1>

', unsafe_allow_html=True) 44 | st.markdown('

<p>Advanced TikTok Data Collection</p>

', unsafe_allow_html=True) 45 | 46 | # Render UI components 47 | search_config, apify_config = render_sidebar() 48 | collection_config, start_collection = render_main_panel() 49 | 50 | # Handle collection start 51 | if start_collection: 52 | if validate_input(search_config): 53 | run_collection(search_config, apify_config, collection_config, config_attrs) 54 | 55 | if __name__ == '__main__': 56 | main() -------------------------------------------------------------------------------- /streamlit_app/components/progress.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # import modules 4 | import streamlit as st 5 | import time 6 | 7 | # local imports 8 | from ..styles.css import create_status_badge 9 | 10 | def create_progress_tracker(): 11 | """Create and return progress tracking components""" 12 | # Create status container 13 | status_container = st.container() 14 | 15 | with status_container: 16 | st.markdown("### 🔄 Collection Progress") 17 | 18 | # Create progress indicators 19 | overall_progress = st.progress(0) 20 | status_text = st.empty() 21 | step_container = st.container() 22 | 23 | # Collection steps with icons and descriptions 24 | steps = [ 25 | ("🔍", "Initializing search parameters..."), 26 | ("📡", "Collecting search results..."), 27 | ("🖼️", "Gathering image thumbnails..."), 28 | ("🚀", "Running Apify integration..."), 29 | ("📁", "Generating data files..."), 30 | ("📹", "Downloading videos..."), 31 | ("🎞️", "Extracting keyframes..."), 32 | ("✅", "Collection complete!") 33 | ] 34 | 35 | step_progress = {} 36 | for i, (icon, desc) in enumerate(steps): 37 | step_progress[i] = step_container.empty() 38 | 39 | return overall_progress, status_text, step_progress, steps 40 | 41 | def update_progress(step_num, overall_progress, status_text, step_progress, steps, message=None, progress_value=None): 42 | """Update progress indicators""" 43 | if step_num < len(steps): 44 | icon, desc = steps[step_num] 45 | step_progress[step_num].markdown(f"{icon} {desc}") 46 | 47 | if message: 48 | status_text.markdown(create_status_badge(message, "warning"), unsafe_allow_html=True) 49 | 50 | if progress_value is not None: 51 | overall_progress.progress(progress_value) 52 | 53 | time.sleep(0.1) # Allow UI to update 54 | 55 | def mark_step_complete(step_num, step_progress, message): 56 | """Mark a step as completed""" 57 | step_progress[step_num].markdown(f"✅ {message}") -------------------------------------------------------------------------------- /streamlit_app/core/keyframes_processor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # import modules 4 | import os 5 | import glob 6 | import subprocess 7 | from concurrent.futures import ThreadPoolExecutor, as_completed 8 | 9 | def extract_keyframes_sync(output_dir, max_workers=3): 10 | """Synchronous keyframes extraction - no async conflicts""" 11 | # Build keyframes path 12 | keyframes_path = f'{output_dir}/keyframes' 13 | if not os.path.exists(keyframes_path): 14 | os.makedirs(keyframes_path) 15 | 16 | # Get all video files 17 | video_path = f'{output_dir}/downloaded_videos' 18 | if not os.path.exists(video_path): 19 | return 20 | 21 | files = glob.glob(f'{video_path}/*.mp4') 22 | if not files: 23 | return 24 | 25 | # Videos already processed 26 | processed_videos = [] 27 | if os.path.exists(keyframes_path): 28 | processed_videos = [d for d in os.listdir(keyframes_path) 29 | if 
os.path.isdir(os.path.join(keyframes_path, d))] 30 | 31 | def extract_single_video_keyframes(file): 32 | """Extract keyframes from a single video file""" 33 | try: 34 | # Get id from video filename 35 | video_id = os.path.basename(file).split('.')[0] 36 | if video_id in processed_videos: 37 | return 38 | 39 | # Create subdirectory for this video_id 40 | video_keyframes_dir = f'{keyframes_path}/{video_id}' 41 | if not os.path.exists(video_keyframes_dir): 42 | os.makedirs(video_keyframes_dir) 43 | 44 | # FFmpeg command to extract keyframes 45 | cmd = [ 46 | 'ffmpeg', 47 | '-i', file, 48 | '-vf', 'select=eq(pict_type\\,I)', 49 | '-vsync', 'vfr', 50 | '-q:v', '2', 51 | '-y', # Overwrite output files 52 | f'{video_keyframes_dir}/keyframe_%04d.jpg' 53 | ] 54 | 55 | # Run FFmpeg synchronously 56 | subprocess.run( 57 | cmd, 58 | stdout=subprocess.PIPE, 59 | stderr=subprocess.PIPE, 60 | text=True 61 | ) 62 | 63 | except Exception: 64 | # Silently handle errors 65 | pass 66 | 67 | # Process videos with controlled concurrency 68 | max_workers = min(max_workers, len(files)) 69 | 70 | with ThreadPoolExecutor(max_workers=max_workers) as executor: 71 | # Submit all tasks 72 | future_to_file = {executor.submit(extract_single_video_keyframes, file): file 73 | for file in files} 74 | 75 | # Process completed tasks silently 76 | for future in as_completed(future_to_file): 77 | result = future.result() 78 | # Silently handle results - no UI spam 79 | pass -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # import modules 4 | import os 5 | 6 | # typing 7 | from typing import Dict 8 | 9 | # import submodules 10 | from configparser import ConfigParser 11 | from datetime import datetime 12 | 13 | def get_project_root(): 14 | """Get the project root directory.""" 15 | # Get the directory where main.py is located 16 | current_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 17 | return current_dir 18 | 19 | ''' 20 | Get configuration attributes 21 | 22 | ''' 23 | def get_config_attrs(config_dir=None) -> Dict: 24 | ''' 25 | Retrieves configuration attributes from configuration files. 26 | 27 | :param config_dir: Optional path to the config directory. 28 | If None, uses the default path. 29 | :return: A dictionary containing the SerpAPI and Apify credentials. 30 | ''' 31 | if config_dir is None: 32 | project_root = get_project_root() 33 | config_dir = os.path.join(project_root, 'config') 34 | 35 | path = os.path.join(config_dir, 'config.ini') 36 | 37 | # config parser 38 | config = ConfigParser() 39 | config.read(path) 40 | 41 | # Get credentials from both sections 42 | credentials = {} 43 | 44 | # SerpAPI credentials 45 | if 'SerpAPI Key' in config: 46 | credentials.update(dict(config['SerpAPI Key'])) 47 | 48 | # Apify credentials 49 | if 'Apify Token' in config: 50 | credentials.update(dict(config['Apify Token'])) 51 | 52 | return credentials 53 | 54 | ''' 55 | Verify date format 56 | 57 | ''' 58 | def is_valid_date(date_str: str) -> bool: 59 | ''' 60 | Verifies if the given date string is in the format YYYY-MM-DD. 61 | 62 | :param date_str: The date string to verify. 63 | :return: True if the date string is valid, False otherwise. 
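
    Illustrative example (hypothetical inputs; behavior follows
    datetime.strptime with the '%Y-%m-%d' format):

        >>> is_valid_date('2024-01-31')
        True
        >>> is_valid_date('31-01-2024')
        False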
64 | ''' 65 | try: 66 | # Attempt to parse the date string with the expected format 67 | datetime.strptime(date_str, '%Y-%m-%d') 68 | return True 69 | except ValueError: 70 | # If a ValueError is raised, the format is incorrect 71 | return False 72 | 73 | def verify_date_argument(args: Dict, key: str) -> None: 74 | ''' 75 | Verifies that a date argument in args is correctly formatted. 76 | 77 | :param args: Dictionary containing command line arguments and options. 78 | :param key: The key in args to check for a valid date. 79 | :raises ValueError: If the date is not in the correct format. 80 | ''' 81 | if key in args: 82 | if not is_valid_date(args[key]): 83 | raise ValueError( 84 | f"The date for '{key}' argument is not in the correct " 85 | "format. Use this format: YYYY-MM-DD." 86 | ) 87 | 88 | ''' 89 | Create output data path 90 | 91 | ''' 92 | def create_output_data_path(path: str) -> None: 93 | ''' 94 | Creates the specified directory path if it does not already exist. 95 | 96 | :param path: The directory path to create. 97 | :return: None 98 | ''' 99 | if not os.path.exists(path): 100 | os.makedirs(path) 101 | 102 | -------------------------------------------------------------------------------- /streamlit_app/components/main_panel.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # import modules 4 | import streamlit as st 5 | 6 | # local imports 7 | from dataclasses import dataclass 8 | from ..utils.file_browser import select_directory 9 | 10 | @dataclass 11 | class CollectionConfig: 12 | """Configuration for collection settings""" 13 | download_videos: bool = True 14 | use_tor: bool = False 15 | max_workers: int = 5 16 | output_dir: str = '' 17 | 18 | def render_main_panel(): 19 | """Render main content panels and return configuration""" 20 | 21 | # Main Content Area - Better organized panels 22 | st.markdown("## ⚙️ Collection Settings") 23 | 24 | # Download Settings Panel 25 | with st.container(): 26 | st.markdown("### 📥 Download & Processing Settings") 27 | st.markdown("") # Add consistent spacing 28 | 29 | col1, col2, col3 = st.columns([1, 1, 1]) 30 | 31 | with col1: 32 | st.markdown("**📹 Download Videos**") 33 | download_videos = st.toggle( 34 | "Enable video downloads", 35 | value=True, 36 | help="Download TikTok videos to local storage", 37 | label_visibility="collapsed" 38 | ) 39 | 40 | with col2: 41 | st.markdown("**🔒 Use Tor Network**") 42 | use_tor = st.toggle( 43 | "Enable Tor for downloads", 44 | help="Enable Tor for anonymous downloads", 45 | label_visibility="collapsed" 46 | ) 47 | 48 | with col3: 49 | max_workers = st.number_input( 50 | '⚡ **Concurrent Workers**', 51 | min_value=1, 52 | max_value=20, 53 | value=5, 54 | help='Number of concurrent download workers' 55 | ) 56 | 57 | st.markdown("---") 58 | 59 | # Output Configuration Panel 60 | with st.container(): 61 | st.markdown("### 📂 Output Configuration") 62 | 63 | # Properly aligned output directory input and browse button 64 | col1, col2 = st.columns([6, 1]) 65 | 66 | with col1: 67 | output_dir = st.text_input( 68 | '**Output Directory**', 69 | value=st.session_state.output_dir, 70 | help='Directory where all collected data will be saved', 71 | placeholder='Enter output directory path...', 72 | label_visibility="visible" 73 | ) 74 | if output_dir != st.session_state.output_dir: 75 | st.session_state.output_dir = output_dir 76 | 77 | with col2: 78 | # Add spacing to align button with input field 79 | st.markdown("
<br>
", unsafe_allow_html=True) 80 | if st.button('📁', help="Browse for directory", use_container_width=True): 81 | path = select_directory() 82 | if path: 83 | st.session_state.output_dir = path 84 | st.rerun() 85 | 86 | st.markdown("---") 87 | 88 | # Centered Action Button 89 | col1, col2, col3 = st.columns([1, 2, 1]) 90 | with col2: 91 | start_collection = st.button( 92 | '🚀 **Start Data Collection**', 93 | use_container_width=True, 94 | type="primary" 95 | ) 96 | 97 | return CollectionConfig( 98 | download_videos=download_videos, 99 | use_tor=use_tor, 100 | max_workers=max_workers, 101 | output_dir=st.session_state.output_dir 102 | ), start_collection -------------------------------------------------------------------------------- /data_collectors/utilities.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # typing 4 | from typing import Dict, List 5 | 6 | ''' 7 | Build search query 8 | 9 | ''' 10 | def advanced_search_options(args: Dict) -> str: 11 | ''' 12 | Builds advanced search options based on the provided arguments. 13 | 14 | :param args: Dictionary containing the command line arguments and options. 15 | :return: A formatted query string with advanced search options. 16 | ''' 17 | before = args.get('before', '') 18 | after = args.get('after', '') 19 | 20 | advanced_search = { 21 | 'before': before, 22 | 'after': after 23 | } 24 | 25 | response = [ 26 | f'{k}:{v}' for k, v in advanced_search.items() if v 27 | ] 28 | 29 | return ' '.join(response) 30 | 31 | def build_site_query(site: str, user: str = None, tag: str = None, q: str = '') -> str: 32 | ''' 33 | Builds a site-specific search query based on the provided parameters. 34 | 35 | :param site: TikTok's site domain. 36 | :param user: Optional username to search for content from a specific user. 37 | :param tag: Optional tag to search for content with a specific tag. 38 | :param q: Optional search terms to include in the query. 39 | :return: A formatted site search query string. 40 | ''' 41 | if user is not None: 42 | # remove @ prefix if present 43 | clean_user = user[1:] if user.startswith('@') else user 44 | return f'site:{site}/@{clean_user}/* {q}'.strip() 45 | elif tag is not None: 46 | # remove # prefix if present 47 | clean_tag = tag[1:] if tag.startswith('#') else tag 48 | return f'site:{site}/tag/{clean_tag}/* {q}'.strip() 49 | else: 50 | # normal site search 51 | return f'site:{site}/* {q}'.strip() 52 | 53 | def search_query(args: Dict) -> str: 54 | ''' 55 | Builds the search query string based on the command line arguments. 56 | 57 | :param args: Dictionary containing the command line arguments and options. 58 | :return: A formatted query string. 59 | ''' 60 | q = args.get('q') or '' 61 | advanced_search = advanced_search_options(args) 62 | 63 | return f'{q} {advanced_search}'.strip() 64 | 65 | ''' 66 | Select SerpAPI parameters 67 | 68 | ''' 69 | def select_serpapi_parameters(args: Dict) -> Dict: 70 | ''' 71 | Filters the command line arguments to include only the default SerpAPI 72 | parameters. 73 | 74 | :param args: Dictionary containing the command line arguments and options. 75 | :return: A dictionary containing only the relevant SerpAPI parameters. 
76 | ''' 77 | default_serpapi_parameters = [ 78 | 'q', 79 | 'google_domain', 80 | 'gl', 81 | 'hl', 82 | 'cr', 83 | 'lr', 84 | 'safe' 85 | ] 86 | 87 | # filter and return only the relevant SerpAPI parameters 88 | params = { 89 | k: v for k, v in args.items() if k in default_serpapi_parameters and v 90 | } 91 | 92 | # add new parameters 93 | params['engine'] = 'google' 94 | params['start'] = 0 95 | params['nfpr'] = 1 96 | params['num'] = 100 97 | 98 | return params 99 | 100 | ''' 101 | Extract relevant keys from SerpAPI response 102 | 103 | ''' 104 | def extract_results_keys(data: List[Dict], result_type: str) -> List[Dict]: 105 | ''' 106 | Filters the SerpAPI response data to include only entries with 'link' 107 | containing 'video', and returns a list of dictionaries with specified 108 | default keys. 109 | 110 | :param data: List of dictionaries containing the SerpAPI response data. 111 | :param result_type: Type of SerpAPI response: 'search_result' or 112 | 'image_result' 113 | :return: A list of dictionaries, each containing the specified default 114 | keys from the SerpAPI response. 115 | ''' 116 | key_mapping = { 117 | 'search_result': [ 118 | 'source', 119 | 'title', 120 | 'snippet', 121 | 'link', 122 | 'thumbnail', 123 | 'video_link', 124 | 'snippet_highlighted_words', 125 | 'displayed_link' 126 | ], 127 | 'image_result': [ 128 | 'source', 129 | 'thumbnail', 130 | 'title', 131 | 'link', 132 | 'serpapi_related_content_link' 133 | ] 134 | } 135 | 136 | selected_keys = key_mapping.get(result_type, []) 137 | 138 | # filter data to include only entries with 'link' containing 'video' 139 | d = [ 140 | i for i in data if 'link' in i and '/video/' in i['link'] 141 | and 'tiktok.com' in i['link'] 142 | ] 143 | 144 | # return list of dictionaries with specified default keys 145 | return [ 146 | { 147 | k: i[k] for k in selected_keys if k in i 148 | } for i in d 149 | ] 150 | 151 | ''' 152 | Extract relevant keys from related content 153 | ''' 154 | def extract_related_content_keys(data: List[Dict]) -> List[Dict]: 155 | ''' 156 | Filters related content data and returns a list of dictionaries with 157 | specified default keys. 158 | 159 | :param data: List of dictionaries containing related content data. 160 | :return: A list of dictionaries, each containing the specified default 161 | keys for the related content. 
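
    Illustrative example (hypothetical entry; keys outside the default
    mapping, such as 'position', are dropped):

        >>> extract_related_content_keys([{'source': 'TikTok',
        ...     'link': 'https://www.tiktok.com/@user/video/123',
        ...     'position': 1}])
        [{'source': 'TikTok', 'link': 'https://www.tiktok.com/@user/video/123'}]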
162 | ''' 163 | key_mapping = [ 164 | 'source', 165 | 'link', 166 | 'thumbnail', 167 | 'title' 168 | ] 169 | 170 | # return list of dictionaries with specified default keys 171 | return [ 172 | { 173 | k: i[k] for k in key_mapping if k in i 174 | } for i in data 175 | ] 176 | -------------------------------------------------------------------------------- /streamlit_app/styles/css.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # import modules 4 | import streamlit as st 5 | 6 | def load_css(): 7 | """Load custom CSS for TikTok-inspired theme""" 8 | st.markdown(""" 9 | 182 | """, unsafe_allow_html=True) 183 | 184 | def create_status_badge(text, status_type): 185 | """Create a status badge with specified type""" 186 | return f'{text}' -------------------------------------------------------------------------------- /streamlit_app/components/sidebar.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # import modules 4 | import streamlit as st 5 | 6 | # import submodules 7 | from dataclasses import dataclass 8 | from typing import Optional 9 | from datetime import date 10 | 11 | @dataclass 12 | class SearchConfig: 13 | """Configuration for search parameters""" 14 | query: Optional[str] = None 15 | user: Optional[str] = None 16 | tag: Optional[str] = None 17 | after_date: Optional[date] = None 18 | before_date: Optional[date] = None 19 | google_domain: str = 'google.com' 20 | gl: Optional[str] = None 21 | hl: Optional[str] = None 22 | cr: Optional[str] = None 23 | lr: Optional[str] = None 24 | safe: str = 'active' 25 | depth: int = 3 26 | 27 | @dataclass 28 | class ApifyConfig: 29 | """Configuration for Apify integration""" 30 | use_apify: bool = False 31 | number_of_results: int = 25 32 | oldest_post_date: Optional[date] = None 33 | newest_post_date: Optional[date] = None 34 | 35 | def render_sidebar(): 36 | """Render sidebar components and return configuration""" 37 | with st.sidebar: 38 | st.markdown("### 🎯 Search Configuration") 39 | st.markdown("") # Add consistent spacing 40 | 41 | # Search Type Selection 42 | search_tab = st.radio( 43 | "**Search Type**", 44 | ["🔍 Keyword", "👤 User Profile", "🏷️ Hashtag"], 45 | horizontal=True 46 | ) 47 | 48 | st.markdown("") # Add spacing after radio buttons 49 | 50 | # Search input based on type 51 | query = user = tag = None 52 | 53 | if search_tab == "🔍 Keyword": 54 | query = st.text_input( 55 | 'Search Keywords', 56 | placeholder='Enter keywords to search for...', 57 | help='Search for TikTok content using keywords' 58 | ) 59 | elif search_tab == "👤 User Profile": 60 | user = st.text_input( 61 | 'TikTok Username', 62 | placeholder='username (without @)', 63 | help='Enter TikTok username without @ symbol' 64 | ) 65 | else: # Hashtag search 66 | tag = st.text_input( 67 | 'Hashtag', 68 | placeholder='hashtag (with or without #)', 69 | help='Enter hashtag with or without # symbol' 70 | ) 71 | 72 | st.markdown("") # Add spacing before divider 73 | st.markdown("---") 74 | st.markdown("") # Add spacing after divider 75 | 76 | # Date Filters Section 77 | st.markdown("### 📅 Date Filters") 78 | st.markdown("") # Add consistent spacing 79 | col1, col2 = st.columns(2) 80 | with col1: 81 | after_date = st.date_input( 82 | 'After Date', 83 | value=None, 84 | help='Posts after this date' 85 | ) 86 | with col2: 87 | before_date = st.date_input( 88 | 'Before Date', 89 | value=None, 90 | help='Posts before this date' 91 | ) 92 
| 93 | st.markdown("") # Add spacing before divider 94 | st.markdown("---") 95 | st.markdown("") # Add spacing after divider 96 | 97 | # Apify Integration Section 98 | st.markdown("### 🚀 Apify Integration") 99 | st.markdown("") # Add consistent spacing 100 | 101 | use_apify = st.toggle( 102 | "**Enable Apify**", 103 | help="Enhanced data collection with Apify" 104 | ) 105 | 106 | st.markdown("") # Add spacing after toggle 107 | 108 | if use_apify: 109 | number_of_results = st.number_input( 110 | 'Results Count', 111 | min_value=1, 112 | max_value=1000, 113 | value=25, 114 | help='Number of results to collect' 115 | ) 116 | 117 | st.markdown("") # Add spacing before subsection 118 | st.markdown("**Apify Date Filters**") 119 | st.markdown("") # Add spacing after subsection title 120 | 121 | col1, col2 = st.columns(2) 122 | with col1: 123 | oldest_post_date = st.date_input( 124 | 'Oldest Post', 125 | help='Oldest post date' 126 | ) 127 | with col2: 128 | newest_post_date = st.date_input( 129 | 'Newest Post', 130 | help='Newest post date' 131 | ) 132 | else: 133 | number_of_results = 25 134 | oldest_post_date = None 135 | newest_post_date = None 136 | 137 | st.markdown("---") 138 | 139 | # Advanced Search Options 140 | with st.expander("⚙️ Advanced Search Options"): 141 | st.markdown("**Google Search Settings**") 142 | 143 | # Domain setting (full width) 144 | google_domain = st.text_input( 145 | 'Domain', 146 | value='google.com', 147 | help='e.g., google.com, google.co.uk' 148 | ) 149 | 150 | # Country and Language settings (2 columns) 151 | col1, col2 = st.columns(2) 152 | with col1: 153 | gl = st.text_input( 154 | 'Country Code (GL)', 155 | help='e.g., us, uk, de', 156 | placeholder='us' 157 | ) 158 | cr = st.text_input( 159 | 'Country Restriction', 160 | help='Restrict to specific countries', 161 | placeholder='countryUS' 162 | ) 163 | with col2: 164 | hl = st.text_input( 165 | 'Language Code (HL)', 166 | help='e.g., en, es, fr', 167 | placeholder='en' 168 | ) 169 | lr = st.text_input( 170 | 'Language Restriction', 171 | help='Restrict to specific languages', 172 | placeholder='lang_en' 173 | ) 174 | 175 | # Search settings (2 columns) 176 | col3, col4 = st.columns(2) 177 | with col3: 178 | safe = st.selectbox( 179 | 'Safe Search', 180 | options=['active', 'off'], 181 | index=0, 182 | help='Adult content filter' 183 | ) 184 | with col4: 185 | depth = st.slider( 186 | 'Search Depth', 187 | min_value=1, 188 | max_value=10, 189 | value=3, 190 | help='Related content iterations' 191 | ) 192 | 193 | # Return configuration objects 194 | search_config = SearchConfig( 195 | query=query, 196 | user=user, 197 | tag=tag, 198 | after_date=after_date, 199 | before_date=before_date, 200 | google_domain=google_domain, 201 | gl=gl if gl else None, 202 | hl=hl if hl else None, 203 | cr=cr if cr else None, 204 | lr=lr if lr else None, 205 | safe=safe, 206 | depth=depth 207 | ) 208 | 209 | apify_config = ApifyConfig( 210 | use_apify=use_apify, 211 | number_of_results=number_of_results, 212 | oldest_post_date=oldest_post_date, 213 | newest_post_date=newest_post_date 214 | ) 215 | 216 | return search_config, apify_config -------------------------------------------------------------------------------- /streamlit_app/core/collection_runner.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # import modules 4 | import streamlit as st 5 | import asyncio 6 | import time 7 | 8 | # import submodules 9 | from concurrent.futures import 
ThreadPoolExecutor 10 | 11 | # local imports 12 | from data_collectors import TikTokDataCollector 13 | from media_handlers import VideoDownloader 14 | from utils import create_output_data_path 15 | from ..components.progress import create_progress_tracker, update_progress, \ 16 | mark_step_complete 17 | from ..styles.css import create_status_badge 18 | from .keyframes_processor import extract_keyframes_sync 19 | 20 | def build_args_dict(search_config, apify_config, collection_config, config_attrs): 21 | """Build arguments dictionary for collection""" 22 | args = { 23 | 'q': search_config.query, 24 | 'user': search_config.user, 25 | 'tag': search_config.tag, 26 | 'google_domain': search_config.google_domain, 27 | 'gl': search_config.gl, 28 | 'hl': search_config.hl, 29 | 'cr': search_config.cr, 30 | 'lr': search_config.lr, 31 | 'safe': search_config.safe, 32 | 'depth': search_config.depth, 33 | 'before': search_config.before_date.strftime('%Y-%m-%d') if search_config.before_date else None, 34 | 'after': search_config.after_date.strftime('%Y-%m-%d') if search_config.after_date else None, 35 | 'download': collection_config.download_videos, 36 | 'use_tor': collection_config.use_tor, 37 | 'max_workers': collection_config.max_workers, 38 | 'output': collection_config.output_dir, 39 | 'apify': apify_config.use_apify, 40 | 'number_of_results': apify_config.number_of_results 41 | } 42 | 43 | # Add Apify-specific arguments if enabled 44 | if apify_config.use_apify: 45 | args.update({ 46 | 'oldest_post_date': apify_config.oldest_post_date.strftime('%Y-%m-%d') if apify_config.oldest_post_date else None, 47 | 'newest_post_date': apify_config.newest_post_date.strftime('%Y-%m-%d') if apify_config.newest_post_date else None 48 | }) 49 | 50 | # Merge configuration attributes with user arguments 51 | args = {**args, **config_attrs} 52 | 53 | return args 54 | 55 | def validate_input(search_config): 56 | """Validate search input""" 57 | if not search_config.query and not search_config.user and not search_config.tag: 58 | st.error('🚨 Please enter a search term, username, or hashtag to continue!') 59 | return False 60 | return True 61 | 62 | def run_collection(search_config, apify_config, collection_config, config_attrs): 63 | """Enhanced collection function with better progress tracking and feedback""" 64 | 65 | # Build arguments 66 | args = build_args_dict(search_config, apify_config, collection_config, config_attrs) 67 | 68 | # Create progress tracker 69 | overall_progress, status_text, step_progress, steps = create_progress_tracker() 70 | 71 | def run_collection_thread(): 72 | """Run collection in separate thread with own event loop""" 73 | # Create new event loop for this thread 74 | loop = asyncio.new_event_loop() 75 | asyncio.set_event_loop(loop) 76 | 77 | try: 78 | # Create collector in this thread 79 | collector = TikTokDataCollector(args=args) 80 | 81 | # Execute the main collection process 82 | collector.collect_search_data() 83 | 84 | # Generate files 85 | collector.generate_data_files() 86 | 87 | # Get collected videos for download 88 | collected_videos = collector.get_collected_videos() if args['download'] else [] 89 | 90 | return collector, collected_videos 91 | 92 | finally: 93 | loop.close() 94 | 95 | try: 96 | # Create output directory 97 | create_output_data_path(args['output']) 98 | 99 | # Step 1: Initialize 100 | update_progress(0, overall_progress, status_text, step_progress, steps, progress_value=10) 101 | 102 | # Step 2: Start data collection process 103 | update_progress(1, 
overall_progress, status_text, step_progress, steps, "Searching...", 25) 104 | 105 | # Step 3: Show image collection 106 | update_progress(2, overall_progress, status_text, step_progress, steps, progress_value=35) 107 | 108 | # Step 4: Show Apify preparation 109 | if args['apify']: 110 | update_progress(3, overall_progress, status_text, step_progress, steps, "Preparing Apify...", 45) 111 | else: 112 | step_progress[3].markdown(f"⏭️ Apify integration skipped") 113 | overall_progress.progress(45) 114 | time.sleep(0.1) 115 | 116 | # Run collection in separate thread to avoid asyncio conflicts 117 | with ThreadPoolExecutor() as executor: 118 | future = executor.submit(run_collection_thread) 119 | collector, collected_videos = future.result() 120 | 121 | # Mark data collection steps as complete 122 | mark_step_complete(1, step_progress, "Search results collected") 123 | mark_step_complete(2, step_progress, "Image thumbnails gathered") 124 | if args['apify']: 125 | mark_step_complete(3, step_progress, "Apify integration completed") 126 | 127 | overall_progress.progress(65) 128 | 129 | # Step 5: Generate files (already done in thread) 130 | update_progress(4, overall_progress, status_text, step_progress, steps, "Generating Files...", 75) 131 | mark_step_complete(4, step_progress, "Data files generated") 132 | 133 | # Step 6: Download videos 134 | if args['download']: 135 | update_progress(5, overall_progress, status_text, step_progress, steps, "Downloading...", 80) 136 | 137 | if collected_videos: 138 | st.info(f'📹 Found {len(collected_videos)} videos to download') 139 | 140 | downloader = VideoDownloader( 141 | output=args['output'], 142 | use_tor=args['use_tor'] 143 | ) 144 | downloader.start_download( 145 | urls=collected_videos, 146 | max_workers=args['max_workers'] 147 | ) 148 | 149 | mark_step_complete(5, step_progress, f"{len(collected_videos)} videos downloaded") 150 | else: 151 | mark_step_complete(5, step_progress, "No new videos to download") 152 | else: 153 | mark_step_complete(5, step_progress, "Video download disabled") 154 | 155 | # Step 7: Extract keyframes from available videos 156 | update_progress(6, overall_progress, status_text, step_progress, steps, "Extracting Keyframes...", 90) 157 | 158 | # Extract keyframes from any videos in the output directory 159 | try: 160 | extract_keyframes_sync(args['output'], args['max_workers']) 161 | mark_step_complete(6, step_progress, "Keyframes extracted") 162 | except Exception as e: 163 | step_progress[6].markdown(f"⚠️ Keyframe extraction failed: {str(e)}") 164 | 165 | # Step 8: Complete 166 | overall_progress.progress(100) 167 | update_progress(7, overall_progress, status_text, step_progress, steps) 168 | status_text.markdown(create_status_badge("Success", "success"), unsafe_allow_html=True) 169 | 170 | # Success message with results 171 | st.success('🎉 Collection completed successfully!') 172 | 173 | # Show output location 174 | st.metric("📂 Output Location", args['output']) 175 | 176 | # Show file explorer link 177 | st.markdown(f""" 178 |
179 | 📁 Results saved to:
180 | {args['output']} 181 |
182 | """, unsafe_allow_html=True) 183 | 184 | except Exception as e: 185 | status_text.markdown(create_status_badge("Error", "error"), unsafe_allow_html=True) 186 | st.error(f'❌ An error occurred during collection: {str(e)}') 187 | st.exception(e) -------------------------------------------------------------------------------- /media_handlers/video_downloader.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # import modules 4 | import os 5 | import time 6 | 7 | # threads 8 | from concurrent.futures import ThreadPoolExecutor, as_completed 9 | 10 | # typing 11 | from typing import List 12 | 13 | # pathlib 14 | from pathlib import Path 15 | 16 | # progress bar 17 | from tqdm import tqdm 18 | 19 | # yt_dlp module 20 | from yt_dlp import YoutubeDL 21 | 22 | # stem module 23 | from stem import Signal 24 | from stem.control import Controller 25 | 26 | # Video downloader class 27 | class VideoDownloader: 28 | ''' 29 | VideoDownloader class 30 | 31 | This class handles the downloading of TikTok videos and their audio using 32 | yt-dlp and threading for concurrent downloads. 33 | ''' 34 | def __init__(self, output: str, use_tor: bool = False) -> None: 35 | ''' 36 | Initializes the VideoDownloader with default download options. 37 | Downloads both video and audio when initialized. 38 | 39 | :param output: The original directory path provided by the user 40 | :param use_tor: Boolean indicating whether to use Tor for downloads 41 | ''' 42 | # initialize Tor proxy settings 43 | self.use_tor = use_tor 44 | self.proxy = 'socks5://127.0.0.1:9050' 45 | 46 | # Common options for both video and audio 47 | common_options = { 48 | 'no_warnings': True, 49 | 'quiet': True, 50 | 'ignoreerrors': True, 51 | 'noprogress': True 52 | } 53 | 54 | if self.use_tor: 55 | common_options['proxy'] = self.proxy 56 | 57 | # video download options 58 | self.video_options = { 59 | **common_options, 60 | 'format': '(bv*+ba/b)[vcodec!=?h265]', 61 | 'outtmpl': self._build_output_directory(output, 'downloaded_videos') 62 | } 63 | 64 | # audio download options 65 | self.audio_options = { 66 | **common_options, 67 | 'format': 'bestaudio/best', 68 | 'outtmpl': self._build_output_directory(output, 'downloaded_audios'), 69 | 'postprocessors': [{ 70 | 'key': 'FFmpegExtractAudio', 71 | 'preferredcodec': 'mp3', 72 | }] 73 | } 74 | 75 | def _sanitize_output_path(self, output: str) -> str: 76 | ''' 77 | Ensures the given path uses forward slashes and does not end with a 78 | slash. 79 | 80 | :param output: The original directory path provided by the user 81 | :return: A sanitized directory path with forward slashes and no 82 | trailing slash. 83 | ''' 84 | # create a Path object and normalize the path 85 | path = Path(output) 86 | 87 | # path with the correct separators for the current OS 88 | output = str(path.as_posix()) 89 | 90 | # remove any trailing slashes 91 | output = output.rstrip('/') 92 | 93 | return output 94 | 95 | def _build_output_directory(self, output: str, dir_name: str) -> str: 96 | ''' 97 | Builds and sanitizes the output directory path for downloading videos. 98 | 99 | :param output: The original directory path provided by the user 100 | :param dir_name: Name of the subdirectory (videos or audio) 101 | :return: The full path for saving downloaded files with the filename 102 | template. 
103 | ''' 104 | output = self._sanitize_output_path(output=output) 105 | path = f'{output}/{dir_name}' 106 | 107 | # ensure the directory exists 108 | if not os.path.exists(path): 109 | os.makedirs(path) 110 | 111 | return f'{path}/%(id)s.%(ext)s' 112 | 113 | def renew_tor_ip(self) -> None: 114 | ''' 115 | Requests a new Tor circuit to change the IP address. 116 | ''' 117 | try: 118 | with Controller.from_port(port=9051) as controller: 119 | controller.authenticate() 120 | controller.signal(Signal.NEWNYM) 121 | time.sleep(5) 122 | except Exception as e: 123 | print (f'Error renewing Tor IP: {e}') 124 | 125 | def download_content(self, url: str) -> None: 126 | ''' 127 | Downloads both video and audio from the specified URL using yt-dlp. 128 | 129 | :param url: The URL of the TikTok video to download. 130 | ''' 131 | max_attempts = 3 if self.use_tor else 1 132 | for attempt in range(max_attempts): 133 | try: 134 | # download video 135 | with YoutubeDL(self.video_options) as ydl: 136 | ydl.download(url) 137 | 138 | # download audio 139 | with YoutubeDL(self.audio_options) as ydl: 140 | ydl.download(url) 141 | 142 | return 143 | 144 | except Exception as e: 145 | print (f'Error downloading {url}: {e}') 146 | 147 | if self.use_tor and attempt < max_attempts - 1: 148 | print ('Renewing Tor circuit...') 149 | self.renew_tor_ip() 150 | 151 | # wait for circuit to be established 152 | time.sleep(5) 153 | else: 154 | break 155 | 156 | def download_videos(self, urls: List[str], max_workers: int) -> None: 157 | ''' 158 | Downloads multiple videos concurrently using a thread pool. 159 | 160 | :param urls: A list of TikTok video URLs to download. 161 | :param max_workers: The maximum number of threads to use for 162 | downloading. 163 | ''' 164 | with ThreadPoolExecutor(max_workers=max_workers) as executor: 165 | future_to_url = { 166 | executor.submit(self.download_content, url): url 167 | for url in urls 168 | } 169 | for future in tqdm( 170 | as_completed(future_to_url), 171 | total=len(future_to_url), 172 | desc='Downloading content' 173 | ): 174 | url = future_to_url[future] 175 | try: 176 | future.result() 177 | except Exception as e: 178 | print (f'{url} generated an exception: {e}') 179 | 180 | def _test_tor_connection(self) -> bool: 181 | ''' 182 | Tests if Tor is available and working. 183 | 184 | :return: True if Tor is available and working, False otherwise. 185 | ''' 186 | try: 187 | # test if port is open 188 | import socket 189 | sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 190 | result = sock.connect_ex(('127.0.0.1', 9050)) 191 | if result != 0: 192 | print ('\n\n') 193 | print ('Tor SOCKS port (9050) is not open. Is Tor running?') 194 | print ('Falling back to normal connection.\n') 195 | return False 196 | 197 | # if port is open, test connection 198 | import requests 199 | print ('\n\nTesting Tor connection...') 200 | response = requests.get( 201 | 'https://check.torproject.org/api/ip', 202 | proxies={ 203 | 'http': self.proxy, 204 | 'https': self.proxy 205 | }, 206 | timeout=10 207 | ) 208 | 209 | if response.status_code == 200: 210 | data = response.json() 211 | print (f'Tor connection successful. Exit node IP: {data.get("IP")}\n\n') 212 | return True 213 | else: 214 | print ('Tor enabled but connection check failed. Using normal connection.\n\n') 215 | return False 216 | 217 | except Exception as e: 218 | print (f'\nTor connection failed ({e}). 
Using normal connection.\n') 219 | return False 220 | 221 | def start_download(self, urls: List[str], max_workers: int) -> None: 222 | ''' 223 | Starts the download process for a list of TikTok video URLs. 224 | 225 | :param urls: A list of TikTok video URLs to download. 226 | :param max_workers: The maximum number of threads to use for 227 | downloading. Default is 5. 228 | ''' 229 | if self.use_tor: 230 | # test Tor connection and update use_tor flag accordingly 231 | self.use_tor = self._test_tor_connection() 232 | 233 | # remove proxy settings if Tor connection failed 234 | if not self.use_tor: 235 | for options in [self.video_options, self.audio_options]: 236 | options.pop('proxy', None) 237 | 238 | print ('> Starting download...\n') 239 | 240 | # download videos 241 | self.download_videos(urls=urls, max_workers=max_workers) 242 | 243 | print ('\n\nDownload complete.') 244 | -------------------------------------------------------------------------------- /media_handlers/session_manager.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # import modules 4 | import os 5 | import glob 6 | import aiohttp 7 | import asyncio 8 | import requests 9 | import subprocess 10 | 11 | # progress bar 12 | from tqdm import tqdm 13 | 14 | # aiohttp 15 | from aiohttp import ClientSession 16 | 17 | # typing 18 | from typing import Dict, List 19 | 20 | # HTTP session class 21 | class RequestSession: 22 | ''' 23 | RequestSession 24 | 25 | This class handles HTTP requests and asynchronous tasks for interacting 26 | with the SerpAPI response and processing related content links 27 | 28 | ''' 29 | def __init__(self) -> None: 30 | ''' 31 | Initializes the RequestSession object. 32 | ''' 33 | # request session 34 | headers = {'accept': 'application/json'} 35 | self.req_session = requests.Session() 36 | self.req_session.headers.update(headers) 37 | 38 | # asynchronous event loop 39 | self.loop = asyncio.get_event_loop() 40 | 41 | def load_related_content(self, url: str, api_key: str) -> List[Dict]: 42 | ''' 43 | Loads related content from the given URL using the provided API key. 44 | 45 | :param url: The URL to load related content from. 46 | :param api_key: SerpAPI key for authentication. 47 | :return: A list of dictionaries containing the related content data. 48 | ''' 49 | params = {'api_key': api_key} 50 | 51 | def fetch_content(url: str) -> Dict: 52 | response = self.req_session.get(url, params=params) 53 | response.raise_for_status() 54 | return response.json() 55 | 56 | try: 57 | content = fetch_content(url) 58 | see_more_link = content.get('serpapi_see_more_link') 59 | if see_more_link: 60 | content = fetch_content(see_more_link) 61 | return content 62 | except requests.RequestException as e: 63 | print (f'An error occurred: {e}') 64 | return {} 65 | 66 | def _build_media_filename_path(self, output: str, link: str, file_extension: str) -> str: 67 | ''' 68 | Builds the filename path for saving the image based on the TikTok link. 69 | 70 | :param output: The directory path where the images will be saved. 71 | :param link: The TikTok link from which to extract the post ID. 72 | :param file_extension: The file extension of the media file. 73 | :return: The full path (including filename) where the image will be 74 | saved. 
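
    Illustrative example (hypothetical link and post ID on a
    hypothetical `session` instance):

        >>> session._build_media_filename_path(
        ...     'output/thumbnails',
        ...     'https://www.tiktok.com/@user/video/7241234567890123456?lang=en',
        ...     'png')
        'output/thumbnails/7241234567890123456.png'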
75 | ''' 76 | post_id = link.split('/')[-1].split('?')[0] 77 | return f'{output}/{post_id}.{file_extension}' 78 | 79 | async def fetch_file(self, session: ClientSession, url: str, 80 | filename: str) -> None: 81 | ''' 82 | Fetches a file from a URL and saves it to the output directory. 83 | 84 | :param session: The aiohttp ClientSession object. 85 | :param url: The URL of the file to download. 86 | :param filename: The path (including filename) where the file will be 87 | saved. 88 | ''' 89 | try: 90 | async with session.get(url) as res: 91 | if res.status == 200: 92 | file_data = await res.read() 93 | with open(filename, 'wb') as f: 94 | f.write(file_data) 95 | else: 96 | print ( 97 | f'Failed to download {url}, status code: {res.status}' 98 | ) 99 | except Exception as e: 100 | print (f'An error occurred while downloading {url}: {e}') 101 | 102 | async def download_files(self, urls: List[str], links: List[str], 103 | output: str, file_extension: str) -> None: 104 | ''' 105 | Downloads files from a list of URLs asynchronously. 106 | 107 | :param urls: A list of file URLs to download. 108 | :param links: A list of TikTok links corresponding to the files. 109 | :param output: The directory path where the files will be saved. 110 | :param file_extension: The file extension of the media file. 111 | ''' 112 | async with aiohttp.ClientSession() as session: 113 | tasks = [ 114 | self.fetch_file( 115 | session=session, url=url, 116 | filename=self._build_media_filename_path(output, link, file_extension) 117 | ) for url, link in zip(urls, links) 118 | ] 119 | await asyncio.gather(*tasks) 120 | 121 | def start_media_download(self, urls: List[str], links: List[str], 122 | output: str, media_type: str) -> None: 123 | ''' 124 | Starts the asynchronous download of files from a list of URLs. 125 | 126 | :param urls: A list of file URLs to download. 127 | :param links: A list of TikTok links corresponding to the files. 128 | :param output: The directory path where the files will be saved. 129 | :param media_type: The type of media to download. 130 | ''' 131 | media_object = { 132 | 'image': { 133 | 'path': 'thumbnails', 134 | 'file_extension': 'png' 135 | }, 136 | 'video': { 137 | 'path': 'downloaded_videos', 138 | 'file_extension': 'mp4' 139 | } 140 | } 141 | 142 | path = f'{output}/{media_object[media_type]["path"]}' 143 | if not os.path.exists(path): 144 | os.makedirs(path) 145 | 146 | file_extension = media_object[media_type]['file_extension'] 147 | self.loop.run_until_complete( 148 | self.download_files(urls=urls, links=links, output=path, 149 | file_extension=file_extension) 150 | ) 151 | 152 | def extract_audio_from_videos(self, output: str) -> None: 153 | ''' 154 | Extracts audio from video files. 155 | 156 | :param output: The directory path where audios will be saved. 
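
    For each <id>.mp4 under <output>/downloaded_videos, this runs an
    FFmpeg command equivalent to (illustrative):

        ffmpeg -i <output>/downloaded_videos/<id>.mp4 -q:a 0 -map a -y <output>/downloaded_audios/<id>.mp3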
157 | ''' 158 | # build audio path 159 | audio_path = f'{output}/downloaded_audios' 160 | if not os.path.exists(audio_path): 161 | os.makedirs(audio_path) 162 | 163 | # get all video files 164 | path = f'{output}/downloaded_videos' 165 | files = glob.glob(f'{path}/*.mp4') 166 | 167 | # extract audio from each video 168 | for file in files: 169 | try: 170 | # get id from video filename 171 | video_id = os.path.basename(file).split('.')[0] 172 | 173 | # FFmpeg command to extract audio 174 | cmd = [ 175 | 'ffmpeg', 176 | '-i', file, 177 | '-q:a', '0', 178 | '-map', 'a', 179 | '-y', 180 | f'{audio_path}/{video_id}.mp3' 181 | ] 182 | 183 | subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 184 | except Exception as e: 185 | print (f'Error extracting audio: {e}') 186 | 187 | def extract_keyframes_from_videos(self, output: str, max_concurrent: int) -> None: 188 | ''' 189 | Extracts keyframes from video files. 190 | 191 | :param output: The directory path where keyframes will be saved. 192 | :param max_concurrent: Maximum number of concurrent ffmpeg processes. 193 | ''' 194 | # build keyframes path 195 | keyframes_path = f'{output}/keyframes' 196 | if not os.path.exists(keyframes_path): 197 | os.makedirs(keyframes_path) 198 | 199 | # get all video files 200 | path = f'{output}/downloaded_videos' 201 | files = glob.glob(f'{path}/*.mp4') 202 | 203 | # videos ids already processed 204 | processed_videos = [i.split('\\')[-1] for i in glob.glob(f'{keyframes_path}/*')] 205 | 206 | async def extract_keyframes(file, pbar): 207 | try: 208 | # get id from video filename 209 | video_id = os.path.basename(file).split('.')[0] 210 | if video_id not in processed_videos: 211 | # create subdirectory for this video_id 212 | video_keyframes_dir = f'{keyframes_path}/{video_id}' 213 | if not os.path.exists(video_keyframes_dir): 214 | os.makedirs(video_keyframes_dir) 215 | 216 | # FFmpeg command to extract keyframes 217 | cmd = [ 218 | 'ffmpeg', 219 | '-i', file, 220 | '-vf', 'select=eq(pict_type\\,I)', 221 | '-vsync', 'vfr', 222 | '-q:v', '2', 223 | f'{video_keyframes_dir}/keyframe_%04d.jpg' 224 | ] 225 | 226 | # run FFmpeg as async subprocess 227 | process = await asyncio.create_subprocess_exec( 228 | *cmd, 229 | stdout=asyncio.subprocess.PIPE, 230 | stderr=asyncio.subprocess.PIPE 231 | ) 232 | await process.communicate() 233 | except Exception as e: 234 | print (f'Error extracting keyframes: {e}') 235 | finally: 236 | pbar.update(1) 237 | 238 | async def process_all_videos(): 239 | # create progress bar in the main thread 240 | pbar = tqdm(total=len(files), desc='Extracting keyframes', unit='video') 241 | 242 | # use semaphore to limit concurrent processes 243 | semaphore = asyncio.Semaphore(max_concurrent) 244 | 245 | async def process_with_semaphore(file): 246 | async with semaphore: 247 | await extract_keyframes(file, pbar) 248 | 249 | # create tasks for all videos 250 | tasks = [process_with_semaphore(file) for file in files] 251 | await asyncio.gather(*tasks) 252 | 253 | pbar.close() 254 | 255 | # run the async event loop 256 | self.loop.run_until_complete(process_all_videos()) 257 | -------------------------------------------------------------------------------- /databases/utilities.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # import modules 4 | import re 5 | import json 6 | 7 | # typing 8 | from typing import Dict, Tuple 9 | 10 | ''' 11 | Extract likes and comments from snippet 12 | 13 | ''' 14 | def 
extract_likes_comments(text: str) -> Tuple: 15 | ''' 16 | Extracts likes and comments from a given text. 17 | 18 | :param text: The text containing likes and comments. 19 | :return: A tuple containing the extracted likes and comments, or None if 20 | not found. 21 | ''' 22 | # define regex patterns for likes and comments 23 | likes_pattern = re.compile( 24 | r'(\d+(?:[\d,.]*\d+)?(?:[KM])?) Likes', 25 | re.IGNORECASE 26 | ) 27 | 28 | comments_pattern = re.compile( 29 | r'(\d+(?:[\d,.]*\d+)?(?:[KM])?) Comments', 30 | re.IGNORECASE 31 | ) 32 | 33 | # search for likes and comments in the text 34 | likes_match = likes_pattern.search(text) 35 | comments_match = comments_pattern.search(text) 36 | 37 | # extract the matched groups or return None if not found 38 | likes = likes_match.group(1) if likes_match else None 39 | comments = comments_match.group(1) if comments_match else None 40 | 41 | return likes, comments 42 | 43 | ''' 44 | Extract fields from the field link 45 | 46 | ''' 47 | def extract_author_post_id(link: str) -> Tuple: 48 | ''' 49 | Extracts the author, link to the author's page, and post ID from a TikTok 50 | video link. 51 | 52 | :param link: The TikTok video link. 53 | :return: A tuple containing the author's username, link to the author's 54 | page, and the post ID. 55 | ''' 56 | author = link.split('/')[3].replace('@', '') 57 | link_to_author = f'https://www.tiktok.com/@{author}' 58 | post_id = link.split('/')[-1].split('?')[0] 59 | 60 | return author, link_to_author, post_id 61 | 62 | ''' 63 | Get items and keys from search results entries 64 | 65 | ''' 66 | def get_items_from_search_results(entry: Dict) -> Tuple: 67 | ''' 68 | Extracts and processes specific fields from a data entry. 69 | 70 | :param entry: A dictionary containing the data entry. 71 | :return: A tuple containing the extracted and processed values for the 72 | fields. 73 | ''' 74 | # get values 75 | title = entry.get('title', '') 76 | snippet = entry.get('snippet', '') 77 | link = entry.get('link', '') 78 | 79 | # process new fields from data 80 | likes, comments = extract_likes_comments(snippet) 81 | title_snippet = f'{title} {snippet}' 82 | author, link_to_author, post_id = extract_author_post_id(link) 83 | 84 | 85 | return ( 86 | entry.get('source', None), 87 | entry.get('title', None), 88 | entry.get('snippet', None), 89 | entry.get('link', None), 90 | entry.get('thumbnail', None), 91 | entry.get('video_link', None), 92 | ', '.join(entry.get('snippet_highlighted_words', [])) if entry.get( 93 | 'snippet_highlighted_words' 94 | ) else None, 95 | entry.get('displayed_link', None), 96 | title_snippet, 97 | likes, 98 | comments, 99 | author, 100 | link_to_author, 101 | post_id 102 | ) 103 | 104 | ''' 105 | Get items and keys from images results entries 106 | 107 | ''' 108 | def get_items_from_images_results(entry: Dict) -> Tuple: 109 | ''' 110 | Extracts and processes specific fields from an image results entry. 111 | 112 | :param entry: A dictionary containing the image results entry. 113 | :return: A tuple containing the extracted and processed values for the 114 | fields. 
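
    Illustrative example (hypothetical entry; keys missing from the
    entry come back as None):

        >>> get_items_from_images_results({'source': 'TikTok',
        ...     'link': 'https://www.tiktok.com/@user/video/123'})
        ('TikTok', None, 'https://www.tiktok.com/@user/video/123', None,
         'user', 'https://www.tiktok.com/@user', '123')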
115 | ''' 116 | # get values 117 | link = entry.get('link', '') 118 | 119 | # process new fields from data 120 | author, link_to_author, post_id = extract_author_post_id(link) 121 | 122 | return ( 123 | entry.get('source', None), 124 | entry.get('title', None), 125 | entry.get('link', None), 126 | entry.get('thumbnail', None), 127 | author, 128 | link_to_author, 129 | post_id 130 | ) 131 | 132 | ''' 133 | Get items and keys from related content entries 134 | 135 | ''' 136 | def get_items_from_related_content(entry: Dict) -> Tuple: 137 | ''' 138 | Extracts and processes specific fields from a related content entry. 139 | 140 | :param entry: A dictionary containing the related content entry. 141 | :return: A tuple containing the extracted and processed values for the 142 | fields. 143 | ''' 144 | return ( 145 | entry.get('source', None), 146 | entry.get('link', None), 147 | entry.get('thumbnail', None), 148 | entry.get('title', None) 149 | ) 150 | 151 | ''' 152 | Get items and keys from apify profile data 153 | 154 | ''' 155 | def get_items_from_apify_profile_data(entry: Dict) -> Tuple: 156 | ''' 157 | Extracts and processes specific fields from an apify profile data entry. 158 | 159 | :param entry: A dictionary containing the apify profile data entry. 160 | :return: A tuple containing the extracted and processed values for the 161 | fields. 162 | ''' 163 | # convert lists to JSON strings 164 | hashtags = entry.get('hashtags', []) or [] 165 | hashtags_json_str = json.dumps([h.get('name', '') for h in hashtags]) 166 | 167 | 168 | return ( 169 | entry.get('id', None), 170 | entry.get('text', None), 171 | entry.get('textLanguage', None), 172 | entry.get('createTime', None), 173 | entry.get('createTimeISO', None), 174 | entry.get('isAd', None), 175 | entry.get('webVideoUrl', None), 176 | 177 | # author metadata 178 | entry.get('authorMeta', {}).get('id', None), 179 | entry.get('authorMeta', {}).get('name', None), 180 | entry.get('authorMeta', {}).get('profileUrl', None), 181 | entry.get('authorMeta', {}).get('bioLink', None), 182 | entry.get('authorMeta', {}).get('signature', None), 183 | entry.get('authorMeta', {}).get('nickName', None), 184 | entry.get('authorMeta', {}).get('verified', None), 185 | entry.get('authorMeta', {}).get('avatar', None), 186 | entry.get('authorMeta', {}).get('privateAccount', None), 187 | entry.get('authorMeta', {}).get('region', None), 188 | entry.get('authorMeta', {}).get('following', None), 189 | entry.get('authorMeta', {}).get('friends', None), 190 | entry.get('authorMeta', {}).get('fans', None), 191 | entry.get('authorMeta', {}).get('heart', None), 192 | entry.get('authorMeta', {}).get('video', None), 193 | entry.get('authorMeta', {}).get('digg', None), 194 | 195 | # music metadata 196 | entry.get('musicMeta', {}).get('musicId', None), 197 | entry.get('musicMeta', {}).get('musicName', None), 198 | entry.get('musicMeta', {}).get('musicAuthor', None), 199 | entry.get('musicMeta', {}).get('musicOriginal', None), 200 | 201 | # video metadata 202 | entry.get('videoMeta', {}).get('duration', None), 203 | entry.get('videoMeta', {}).get('coverUrl', None), 204 | entry.get('videoMeta', {}).get('downloadAddr', None), 205 | 206 | # engagement metrics 207 | entry.get('diggCount', None), 208 | entry.get('shareCount', None), 209 | entry.get('playCount', None), 210 | entry.get('collectCount', None), 211 | entry.get('commentCount', None), 212 | 213 | # hashtags 214 | hashtags_json_str, 215 | 216 | # additional metadata 217 | entry.get('isSlideshow', None), 218 | 
entry.get('isPinned', None), 219 | entry.get('isSponsored', None), 220 | entry.get('input') or entry.get('searchQuery'), 221 | entry.get('fromProfileSection', None) 222 | ) 223 | 224 | ''' 225 | Get items and keys from apify hashtag data 226 | 227 | ''' 228 | def get_items_from_apify_hashtag_data(entry: Dict) -> Tuple: 229 | ''' 230 | Extracts and processes specific fields from an apify hashtag data entry. 231 | 232 | :param entry: A dictionary containing the apify hashtag data entry. 233 | :return: A tuple containing the extracted and processed values for the 234 | fields. 235 | ''' 236 | # convert lists to JSON strings 237 | hashtags = entry.get('hashtags', []) or [] 238 | hashtags_json_str = json.dumps([h.get('name', '') for h in hashtags]) 239 | 240 | 241 | return ( 242 | entry.get('id', None), 243 | entry.get('text', None), 244 | entry.get('textLanguage', None), 245 | entry.get('createTime', None), 246 | entry.get('createTimeISO', None), 247 | entry.get('isAd', None), 248 | entry.get('webVideoUrl', None), 249 | 250 | # author metadata 251 | entry.get('authorMeta', {}).get('id', None), 252 | entry.get('authorMeta', {}).get('name', None), 253 | entry.get('authorMeta', {}).get('profileUrl', None), 254 | entry.get('authorMeta', {}).get('bioLink', None), 255 | entry.get('authorMeta', {}).get('signature', None), 256 | entry.get('authorMeta', {}).get('nickName', None), 257 | entry.get('authorMeta', {}).get('verified', None), 258 | entry.get('authorMeta', {}).get('avatar', None), 259 | entry.get('authorMeta', {}).get('privateAccount', None), 260 | entry.get('authorMeta', {}).get('region', None), 261 | entry.get('authorMeta', {}).get('following', None), 262 | entry.get('authorMeta', {}).get('friends', None), 263 | entry.get('authorMeta', {}).get('fans', None), 264 | entry.get('authorMeta', {}).get('heart', None), 265 | entry.get('authorMeta', {}).get('video', None), 266 | entry.get('authorMeta', {}).get('digg', None), 267 | 268 | # music metadata 269 | entry.get('musicMeta', {}).get('musicId', None), 270 | entry.get('musicMeta', {}).get('musicName', None), 271 | entry.get('musicMeta', {}).get('musicAuthor', None), 272 | entry.get('musicMeta', {}).get('musicOriginal', None), 273 | 274 | # video metadata 275 | entry.get('videoMeta', {}).get('duration', None), 276 | entry.get('videoMeta', {}).get('coverUrl', None), 277 | entry.get('videoMeta', {}).get('downloadAddr', None), 278 | 279 | # engagement metrics 280 | entry.get('diggCount', None), 281 | entry.get('shareCount', None), 282 | entry.get('playCount', None), 283 | entry.get('collectCount', None), 284 | entry.get('commentCount', None), 285 | 286 | # hashtags 287 | hashtags_json_str, 288 | 289 | # additional metadata 290 | entry.get('isSlideshow', None), 291 | entry.get('isPinned', None), 292 | entry.get('isSponsored', None), 293 | entry.get('input', None), 294 | entry.get('searchHashtag', {}).get('views', None) 295 | ) 296 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # import modules 4 | import time 5 | import os 6 | 7 | # import argparse 8 | from argparse import ( 9 | ArgumentParser, RawTextHelpFormatter, SUPPRESS 10 | ) 11 | 12 | # import utils 13 | from utils import get_config_attrs, verify_date_argument, \ 14 | create_output_data_path, get_project_root 15 | 16 | # TikTok data collector 17 | from data_collectors import TikTokDataCollector 18 | 19 | # video downloader 20 | from 
media_handlers import VideoDownloader, RequestSession
 21 | 
 22 | def launch_streamlit_app():
 23 | '''Launch the Streamlit web interface'''
 24 | import subprocess
 25 | import sys
 26 | 
 27 | # start process
 28 | log_text = f'''
 29 | > Starting program at: {time.ctime()}
 30 | 
 31 | '''
 32 | print ('\n\n' + ' '.join(log_text.split()).strip())
 33 | 
 34 | print ('\n')
 35 | print('> Launching TikSpyder Streamlit Interface...')
 36 | print('<< Press Ctrl+C to stop the server >>')
 37 | print ('\n')
 38 | print('-' * 50)
 39 | 
 40 | try:
 41 | # Launch streamlit run app.py
 42 | subprocess.run([
 43 | sys.executable,
 44 | "-m", "streamlit", "run",
 45 | os.path.join(os.path.dirname(__file__), 'app.py')
 46 | ], check=True)
 47 | except subprocess.CalledProcessError as e:
 48 | print(f'> Failed to launch Streamlit: {e}')
 49 | print('> Make sure streamlit is installed: pip install streamlit')
 50 | sys.exit(1)
 51 | except KeyboardInterrupt:
 52 | # end process
 53 | print ('\n')
 54 | print('-' * 50)
 55 | log_text = f'''
 56 | > Ending program at: {time.ctime()}
 57 | 
 58 | '''
 59 | print ('\n\n' + ' '.join(log_text.split()).strip())
 60 | sys.exit(0)
 61 | 
 62 | def main():
 63 | # Get current working directory (where command was executed)
 64 | execution_dir = os.getcwd()
 65 | 
 66 | # Get project root directory (where the package is installed)
 67 | project_root = get_project_root()
 68 | 
 69 | # Set up project paths for later use instead of changing directories
 70 | project_paths = {
 71 | 'root': project_root,
 72 | 'config': os.path.join(project_root, 'config'),
 73 | 'execution': execution_dir
 74 | }
 75 | 
 76 | '''
 77 | Arguments
 78 | 
 79 | '''
 80 | formatter = lambda prog: RawTextHelpFormatter(
 81 | prog,
 82 | indent_increment=2,
 83 | max_help_position=52,
 84 | width=None
 85 | )
 86 | 
 87 | parser = ArgumentParser(
 88 | prog='TikSpyder',
 89 | description='Command Line Arguments.',
 90 | formatter_class=formatter,
 91 | add_help=False
 92 | )
 93 | 
 94 | # help arguments
 95 | help_arguments = parser.add_argument_group('Help options')
 96 | help_arguments.add_argument(
 97 | '-h',
 98 | '--help',
 99 | action='help',
 100 | default=SUPPRESS,
 101 | help='Show this help message and exit.'
 102 | )
 103 | 
 104 | # SerpAPI arguments
 105 | serpapi_arguments = parser.add_argument_group('SerpAPI options')
 106 | 
 107 | ''' query '''
 108 | serpapi_arguments.add_argument(
 109 | '--q',
 110 | type=str,
 111 | required=False,
 112 | metavar='',
 113 | help='The search term or phrase for which to retrieve TikTok data.'
 114 | )
 115 | 
 116 | ''' user '''
 117 | serpapi_arguments.add_argument(
 118 | '--user',
 119 | type=str,
 120 | required=False,
 121 | metavar='',
 122 | help='Specify a TikTok user to search for videos from.'
 123 | )
 124 | 
 125 | ''' tag '''
 126 | serpapi_arguments.add_argument(
 127 | '--tag',
 128 | type=str,
 129 | required=False,
 130 | metavar='',
 131 | help='Specify a TikTok tag to search for videos from.'
 132 | )
 133 | 
 134 | ''' google domain '''
 135 | serpapi_arguments.add_argument(
 136 | '--google-domain',
 137 | type=str,
 138 | required=False,
 139 | default='google.com',
 140 | metavar='',
 141 | help='Defines the Google domain to use. It defaults to google.com.'
 142 | )
 143 | 
 144 | ''' gl > country '''
 145 | serpapi_arguments.add_argument(
 146 | '--gl',
 147 | type=str,
 148 | required=False,
 149 | metavar='',
 150 | help=(
 151 | "Defines the country to use for the search. Two-letter country "
 152 | "code."
153 | ) 154 | ) 155 | 156 | ''' hl > language ''' 157 | serpapi_arguments.add_argument( 158 | '--hl', 159 | type=str, 160 | required=False, 161 | metavar='', 162 | help=( 163 | "Defines the language to use for the search. Two-letter language " 164 | "code." 165 | ) 166 | ) 167 | 168 | ''' cr > multiple countries ''' 169 | serpapi_arguments.add_argument( 170 | '--cr', 171 | type=str, 172 | required=False, 173 | metavar='', 174 | help='Defines one or multiple countries to limit the search to.' 175 | ) 176 | 177 | ''' safe > adult content filter ''' 178 | serpapi_arguments.add_argument( 179 | '--safe', 180 | type=str, 181 | required=False, 182 | default='active', 183 | choices=['active', 'off'], 184 | metavar='', 185 | help='Level of filtering for adult content. Options: active (default), off' 186 | ) 187 | 188 | ''' lr > one or multiple languages ''' 189 | serpapi_arguments.add_argument( 190 | '--lr', 191 | type=str, 192 | required=False, 193 | metavar='', 194 | help='Defines one or multiple languages to limit the search to.' 195 | ) 196 | 197 | ''' depth > defines number of iterations for related content ''' 198 | serpapi_arguments.add_argument( 199 | '--depth', 200 | type=int, 201 | required=False, 202 | default=3, 203 | metavar='', 204 | help='Depth of iterations to follow related content links.' 205 | ) 206 | 207 | # Google advanced search arguments 208 | google_advanced_search_arguments = parser.add_argument_group( 209 | 'Google advanced search options' 210 | ) 211 | 212 | ''' search for posts before a given date ''' 213 | google_advanced_search_arguments.add_argument( 214 | '--before', 215 | type=str, 216 | required=False, 217 | metavar='', 218 | help=( 219 | "Limit results to posts published before the specified date. " 220 | "Format: YYYY-MM-DD." 221 | ) 222 | ) 223 | 224 | ''' search for posts after a given date ''' 225 | google_advanced_search_arguments.add_argument( 226 | '--after', 227 | type=str, 228 | required=False, 229 | metavar='', 230 | help=( 231 | "Limit results to posts published after the specified date. " 232 | "Format: YYYY-MM-DD." 233 | ) 234 | ) 235 | 236 | # Apify optional arguments 237 | apify_arguments = parser.add_argument_group( 238 | 'Optional Apify arguments' 239 | ) 240 | 241 | ''' apify integration ''' 242 | apify_arguments.add_argument( 243 | '--apify', 244 | action='store_true', 245 | required=False, 246 | help='Specify whether to use Apify integration.' 247 | ) 248 | 249 | apify_arguments.add_argument( 250 | '--oldest-post-date', 251 | type=str, 252 | required=False, 253 | metavar='', 254 | help=( 255 | "Filter posts newer than the specified date. " 256 | "Format: YYYY-MM-DD." 257 | ) 258 | ) 259 | 260 | apify_arguments.add_argument( 261 | '--newest-post-date', 262 | type=str, 263 | required=False, 264 | metavar='', 265 | help=( 266 | "Filter posts older than the specified date. " 267 | "Format: YYYY-MM-DD." 268 | ) 269 | ) 270 | 271 | apify_arguments.add_argument( 272 | '--number-of-results', 273 | type=int, 274 | default=25, 275 | required=False, 276 | metavar='', 277 | help=( 278 | "Specify the number of results to return from Apify. Default: 25" 279 | ) 280 | ) 281 | 282 | # optional arguments 283 | optional_arguments = parser.add_argument_group( 284 | 'Optional arguments and parameters' 285 | ) 286 | 287 | ''' use tor ''' 288 | optional_arguments.add_argument( 289 | '--use-tor', 290 | action='store_true', 291 | required=False, 292 | help='Specify whether to use Tor for downloading TikTok videos.' 
293 | ) 294 | 295 | ''' download TikTok results ''' 296 | optional_arguments.add_argument( 297 | '-d', 298 | '--download', 299 | action='store_true', 300 | required=False, 301 | help='Specify whether to download TikTok videos from SerpAPI and Apify.' 302 | ) 303 | 304 | ''' max workers > maximum number of threads ''' 305 | optional_arguments.add_argument( 306 | '-w', 307 | '--max-workers', 308 | type=int, 309 | required=False, 310 | metavar='', 311 | help=( 312 | "Specify the maximum number of threads to use for downloading " 313 | "TikTok videos and extracting keyframes." 314 | ) 315 | ) 316 | 317 | ''' output ''' 318 | optional_arguments.add_argument( 319 | '-o', 320 | '--output', 321 | type=str, 322 | required=False, 323 | default=f'./tikspyder-data/{int(time.time())}', 324 | metavar='', 325 | help=( 326 | "Specify output directory path. If not provided, data is " 327 | "saved in the current working directory in a folder named " 328 | "tikspyder-data" 329 | ) 330 | ) 331 | 332 | ''' launch streamlit app ''' 333 | optional_arguments.add_argument( 334 | '--app', 335 | action='store_true', 336 | required=False, 337 | help='Launch the Streamlit web interface instead of using CLI mode.' 338 | ) 339 | 340 | # parse arguments 341 | args = vars(parser.parse_args()) 342 | 343 | # check if user wants to launch Streamlit app 344 | if args.get('app'): 345 | launch_streamlit_app() 346 | return 347 | 348 | # validate that either a query, username or tag was provided 349 | if all(arg is None for arg in [args['user'], args['q'], args['tag']]): 350 | raise ValueError('Either --user, --q or --tag must be provided.') 351 | 352 | # raise error if both user and tag are provided 353 | if args['user'] and args['tag']: 354 | raise ValueError('Both --user and --tag were provided. 
Only one can be used.') 355 | 356 | # merging SerpAPI configuration attrs with the existing arguments 357 | config_attrs = get_config_attrs(project_paths['config']) 358 | args = {**args, **config_attrs} 359 | 360 | # verify provided dates 361 | for date_key in ['before', 'after']: 362 | if args[date_key] is not None: 363 | verify_date_argument(args, date_key) 364 | 365 | # start process 366 | log_text = f''' 367 | > Starting program at: {time.ctime()} 368 | 369 | ''' 370 | print ('\n\n' + ' '.join(log_text.split()).strip()) 371 | 372 | # create the output data path if not exists 373 | output = args['output'] 374 | create_output_data_path(output) 375 | 376 | # TikTokDataCollector instance 377 | collector = TikTokDataCollector(args=args) 378 | 379 | # TikTok data collection call 380 | collector.collect_search_data() 381 | 382 | # read SQL database and generate csv file 383 | collector.generate_data_files() 384 | 385 | # download videos 386 | if args['download']: 387 | print ('') 388 | print ('-' * 30) 389 | print ('> Downloading videos...') 390 | 391 | # get tiktok urls 392 | collected_videos = collector.get_collected_videos() 393 | 394 | if collected_videos: 395 | print (f'\n> Found {len(collected_videos)} videos to download.') 396 | 397 | # define max workers 398 | max_workers = args['max_workers'] if args['max_workers'] else 5 399 | downloader = VideoDownloader(output=output, use_tor=args['use_tor']) 400 | 401 | # start download 402 | downloader.start_download(urls=collected_videos, max_workers=max_workers) 403 | else: 404 | print ('\n> Search results did not return any videos to download.') 405 | 406 | # extract keyframes 407 | print ('\n') 408 | print ('-' * 30) 409 | print ('Extracting keyframes...') 410 | request_session = RequestSession() 411 | 412 | # define max workers 413 | max_workers = args['max_workers'] if args['max_workers'] else 3 414 | request_session.extract_keyframes_from_videos( 415 | output=output, 416 | max_concurrent=max_workers 417 | ) 418 | print ('\n') 419 | print ('-' * 30) 420 | 421 | # end process 422 | log_text = f''' 423 | > Ending program at: {time.ctime()} 424 | 425 | ''' 426 | print ('\n\n' + ' '.join(log_text.split()).strip()) 427 | 428 | if __name__ == '__main__': 429 | main() 430 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | # **TikSpyder** 4 | 5 |
6 | 7 |
8 | 9 | `TikSpyder` is a command-line tool designed to collect TikTok data using SerpAPI for Google search results and Apify for TikTok data extraction. The tool supports video downloading via yt-dlp and uses Python's asynchronous capabilities and multithreading for efficient data collection. 10 | 11 |
12 |
13 | 14 |
15 | 16 | [![GitHub forks](https://img.shields.io/github/forks/estebanpdl/tik-spyder.svg?style=social&label=Fork&maxAge=2592000)](https://GitHub.com/estebanpdl/tik-spyder/network/) 17 | [![GitHub stars](https://img.shields.io/github/stars/estebanpdl/tik-spyder?style=social)](https://github.com/estebanpdl/tik-spyder/stargazers) 18 | [![Open Source](https://badges.frapsoft.com/os/v1/open-source.svg?v=103)](https://x.com/estebanpdl) 19 | [![Made-with-python](https://img.shields.io/badge/Made%20with-Python-1f425f.svg)](https://www.python.org/) 20 | [![Twitter estebanpdl](https://badgen.net/badge/icon/twitter?icon=twitter&label)](https://x.com/estebanpdl) 21 | [![Buy Me A Coffee](https://img.shields.io/badge/buy%20me%20a%20coffee-donate-yellow.svg)](https://buymeacoffee.com/estebanpdl) 22 | 23 |
24 | 25 |
26 | 27 | ## 🔧 **Companion Tools** 28 | 29 | | Tool | Description | Access | 30 | |------|-------------|--------| 31 | | 🎙️ Audio Transcription | Transcribe audio files from TikTok videos | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1qMcMsS2YI9btXGfFN1sCviQeB7RSKqUH) | 32 | 33 |
34 | 
 35 | ## 🖥️ **User Interface Options**
 36 | 
 37 | TikSpyder provides **two ways** to interact with the tool:
 38 | 
 39 | ### **1. 🎨 Streamlit Web Interface (Recommended for Non-Technical Users)**
 40 | A modern, user-friendly web interface with a TikTok-inspired dark theme that makes data collection accessible to everyone.
 41 | 
 42 | ![TikSpyder Streamlit Interface](images/streamlit-interface.png)
 43 | 
 44 | **Features:**
 45 | - 🎯 **Intuitive Configuration**: Easy search setup with a tabbed interface for keywords, users, or hashtags
 46 | - 📅 **Visual Date Filters**: Calendar widgets for precise date range selection
 47 | - 🚀 **Apify Integration**: Simple toggle to enable enhanced data collection
 48 | - ⚙️ **Advanced Options**: Collapsible section for Google search parameters
 49 | - 📥 **Download Settings**: Visual controls for video downloads and Tor network usage
 50 | - 📂 **File Browser**: Point-and-click directory selection
 51 | - 📊 **Real-time Progress**: Live progress tracking with step-by-step status updates
 52 | 
 53 | **Launch the Interface:**
 54 | 
 55 | **Method 1 (Recommended):**
 56 | ```sh
 57 | # Using package installation
 58 | tikspyder --app
 59 | 
 60 | # Using standard installation
 61 | python main.py --app
 62 | ```
 63 | 
 64 | **Method 2 (Direct):**
 65 | ```sh
 66 | streamlit run app.py
 67 | ```
 68 | 
 69 | ### **2. ⌨️ Command Line Interface (For Advanced Users)**
 70 | Full-featured command-line tool for automation and scripting scenarios.
 71 | 
 72 | ## 🔍 **Description**
 73 | 
 74 | TikSpyder offers two main methods of data collection:
 75 | 1. **Google Search Results**: Using SerpAPI to find TikTok videos based on search queries
 76 | 2. **Apify Data Collection**: Using Apify to collect videos directly from TikTok profiles or keywords
 77 | 
 78 | The tool supports various filtering options, including date ranges and content types, and can download both videos and thumbnails. Data is stored in a SQLite database and can be exported to CSV files for further analysis.
 79 | 
 80 | Given the dynamic nature of search results and the constantly evolving landscape of TikTok's platform, it's important to note that the data collected by TikSpyder represents a sample rather than a comprehensive dataset. However, this sample can still be valuable for monitoring trends and identifying emerging narratives in the information ecosystem.
 81 | 
 82 | To get the most out of TikSpyder, **it is recommended to test your query using Google's advanced search features. This can help refine your search query, improve the relevance of your results, and test specific keywords more effectively**. By taking advantage of these features, you can ensure that you're collecting the most relevant data for your research or analysis.
 83 | 
 84 | 
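For reference, TikSpyder's collectors wrap your term in a `site:tiktok.com` query, so you can preview roughly what will be collected by testing something similar directly on Google. A hypothetical example (the exact query the tool builds internally may differ, and the `after:`/`before:` operators shown here correspond conceptually to the `--after`/`--before` options):

```
site:tiktok.com "F-16" (Ukraine OR Russia) after:2024-02-01 before:2024-05-31
```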
85 | 
 86 | ## 🚀 **Features**
 87 | 
 88 | ### **Core Functionality**
 89 | - 🔍 Collects TikTok video links using SerpAPI and Apify
 90 | - 🖼️ Collects and downloads thumbnails for TikTok videos
 91 | - 🔗 Collects content related to the search query
 92 | - 💾 Stores collected data in a SQLite database
 93 | - 📊 Exports data to CSV files for analysis
 94 | - 📹 Downloads TikTok videos using yt-dlp
 95 | - 🎞️ Extracts keyframes from downloaded videos
 96 | - ⚡ Supports asynchronous and multithreaded downloading for improved performance
 97 | - 🔒 Supports the Tor network for enhanced privacy and rate-limiting avoidance
 98 | 
 99 | ### **User Interfaces**
 100 | - 🎨 **Modern Streamlit Web Interface**: User-friendly GUI with a TikTok-inspired dark theme
 101 | - ⌨️ **Command Line Interface**: Full-featured CLI for automation and advanced users
 102 | - 🎯 **Search Types**: Support for keywords, user profiles, and hashtag searches
 103 | - 📅 **Date Range Filtering**: Precise temporal data collection controls
 104 | 
 105 | 
106 | 107 | ## ⚙️ **Requirements** 108 | 109 | ### **System Requirements** 110 | - [Python](https://www.python.org/) >= 3.11.7 111 | - [ffmpeg](https://ffmpeg.org/) (for video processing and keyframe extraction) 112 | 113 | ### **API Keys & Services** 114 | - [SerpAPI key](https://serpapi.com/) (required for Google search functionality) 115 | - [Apify API token](https://apify.com/) (optional, for direct TikTok profile scraping) 116 | 117 | ### **Optional Components** 118 | - [Tor Browser](https://www.torproject.org/) (optional, for enhanced privacy during downloads) 119 | 120 | ### **Platform-Specific Requirements** 121 | - **All Platforms**: Python libraries listed in `requirements.txt` 122 | - **Streamlit Interface**: Automatically installed with requirements 123 | - **Linux Users**: For GUI components, install tkinter: `sudo apt-get install python3-tk` (Ubuntu/Debian) 124 | 125 |
126 | 
 127 | ## 🔧 **Installation**
 128 | 
 129 | ### **Method 1: Standard Installation**
 130 | 
 131 | 1. Clone the repository
 132 | 
 133 | ```sh
 134 | git clone https://github.com/estebanpdl/tik-spyder.git
 135 | cd tik-spyder
 136 | ```
 137 | 
 138 | 2. Install the required packages
 139 | 
 140 | ```sh
 141 | pip install -r requirements.txt
 142 | ```
 143 | 
 144 | or
 145 | 
 146 | ```sh
 147 | pip3 install -r requirements.txt
 148 | ```
 149 | 
 150 | ### **Method 2: Package Installation (Recommended)**
 151 | 
 152 | This method installs TikSpyder as a package, making the `tikspyder` command available from anywhere on your system.
 153 | 
 154 | 1. Clone the repository
 155 | 
 156 | ```sh
 157 | git clone https://github.com/estebanpdl/tik-spyder.git
 158 | cd tik-spyder
 159 | ```
 160 | 
 161 | 2. Install the package in editable mode
 162 | 
 163 | ```sh
 164 | pip install -e .
 165 | ```
 166 | 
 167 | or
 168 | 
 169 | ```sh
 170 | pip3 install -e .
 171 | ```
 172 | 
 173 | After installation, you can use `tikspyder` directly from any directory instead of `python main.py`.
 174 | 
 175 | ### **Configuration**
 176 | 
 177 | 3. Once you obtain an API key from SerpAPI and a token from Apify, populate the config/config.ini file with the values described below. Replace `your_serp_api_key` and `your_apify_token` with your actual API key and token. A quick way to sanity-check the file is shown after the template.
 178 | 
 179 | ```ini
 180 | 
 181 | [SerpAPI Key]
 182 | api_key = your_serp_api_key
 183 | 
 184 | [Apify Token]
 185 | apify_token = your_apify_token
 186 | ```
 187 | 
 188 | 
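If you want to verify the configuration is readable before a first run, a minimal sketch using Python's built-in `configparser` (section and key names taken from the template above):

```python
# minimal sketch: confirm config/config.ini is readable and populated
# (section and key names match the template above)
from configparser import ConfigParser

config = ConfigParser()
config.read('config/config.ini')

api_key = config['SerpAPI Key']['api_key']
apify_token = config['Apify Token']['apify_token']

# placeholders left unchanged mean the file has not been populated yet
print('SerpAPI key set:', api_key != 'your_serp_api_key')
print('Apify token set:', apify_token != 'your_apify_token')
```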
189 | 
 190 | ## 📚 **Usage**
 191 | 
 192 | TikSpyder offers two interface options to suit different user preferences and use cases:
 193 | 
 194 | ## 🎨 **Streamlit Web Interface Usage**
 195 | 
 196 | The Streamlit interface provides an intuitive, visual way to configure and run data collection tasks.
 197 | 
 198 | ### **Launch the Interface**
 199 | 
 200 | ```sh
 201 | # Navigate to TikSpyder directory
 202 | cd tik-spyder
 203 | 
 204 | # Launch the Streamlit app
 205 | streamlit run app.py
 206 | ```
 207 | 
 208 | The interface will automatically open in your default web browser at `http://localhost:8501`.
 209 | 
 210 | ### **Using the Interface**
 211 | 
 212 | 1. **🎯 Configure Search**: Choose between keyword, user profile, or hashtag search
 213 | 2. **📅 Set Date Filters**: Use calendar widgets to define your collection timeframe
 214 | 3. **🚀 Enable Apify** (Optional): Toggle for enhanced direct TikTok data collection
 215 | 4. **⚙️ Adjust Advanced Options**: Fine-tune Google search parameters if needed
 216 | 5. **📥 Configure Downloads**: Set video download preferences and worker counts
 217 | 6. **📂 Choose Output Directory**: Select where your data will be saved
 218 | 7. **🚀 Start Collection**: Click the centered "Start Data Collection" button
 219 | 
 220 | ---
 221 | 
 222 | ## ⌨️ **Command Line Interface Usage**
 223 | 
 224 | For advanced users and automation scenarios, TikSpyder provides a full-featured CLI.
 225 | 
 226 | ### **Using Package Installation (Method 2)**
 227 | 
 228 | ```sh
 229 | tikspyder [OPTIONS]
 230 | ```
 231 | 
 232 | ### **Using Standard Installation (Method 1)**
 233 | 
 234 | ```sh
 235 | python main.py [OPTIONS]
 236 | ```
 237 | 
 238 | ### **Command Line Arguments**
 239 | 
 240 | ```sh
 241 | # Package installation
 242 | tikspyder --help
 243 | 
 244 | # or
 245 | tikspyder -h
 246 | 
 247 | # Standard installation
 248 | python main.py --help
 249 | 
 250 | # or
 251 | python main.py -h
 252 | ```
 253 | 
 254 | ```
 255 | Command Line Arguments.
 256 | 
 257 | Help options:
 258 |   -h, --help           Show this help message and exit.
 259 | 
 260 | SerpAPI options:
 261 |   --q                  The search term or phrase for which to retrieve TikTok data.
 262 |   --user               Specify a TikTok user to search for videos from.
 263 |   --tag                Specify a TikTok tag to search for videos from.
 264 |   --google-domain      Defines the Google domain to use. It defaults to google.com.
 265 |   --gl                 Defines the country to use for the search. Two-letter country code.
 266 |   --hl                 Defines the language to use for the search. Two-letter language code.
 267 |   --cr                 Defines one or multiple countries to limit the search to.
 268 |   --safe               Level of filtering for adult content. Options: active (default), off
 269 |   --lr                 Defines one or multiple languages to limit the search to.
 270 |   --depth              Depth of iterations to follow related content links.
 271 | 
 272 | Google advanced search options:
 273 |   --before             Limit results to posts published before the specified date. Format: YYYY-MM-DD.
 274 |   --after              Limit results to posts published after the specified date. Format: YYYY-MM-DD.
 275 | 
 276 | Optional Apify arguments:
 277 |   --apify              Specify whether to use Apify integration.
 278 |   --oldest-post-date   Filter posts newer than the specified date. Format: YYYY-MM-DD.
 279 |   --newest-post-date   Filter posts older than the specified date. Format: YYYY-MM-DD.
 280 |   --number-of-results  Specify the number of results to return from Apify. Default: 25
 281 | 
 282 | Optional arguments and parameters:
 283 |   --app                Launch the Streamlit web interface instead of using CLI mode.
284 |   --use-tor            Specify whether to use Tor for downloading TikTok videos.
 285 |   -d, --download       Specify whether to download TikTok videos from SerpAPI and Apify.
 286 |   -w , --max-workers   Specify the maximum number of threads to use for downloading TikTok videos and extracting keyframes.
 287 |   -o , --output        Specify output directory path. If not provided, data is saved in the current working directory in a folder named `tikspyder-data`
 288 | ```
 289 | 
 290 | ### **Example Usage**
 291 | 
 292 | 1. Search-based collection:
 293 | 
 294 | ```sh
 295 | # Using package installation (Method 2)
 296 | tikspyder --q "F-16 AND Enemy AND (Ukraine OR Russia)" --gl us --hl en --after 2024-02-01 --before 2024-05-31 --output {output_directory}/ --download
 297 | 
 298 | # Using standard installation (Method 1)
 299 | python main.py --q "F-16 AND Enemy AND (Ukraine OR Russia)" --gl us --hl en --after 2024-02-01 --before 2024-05-31 --output {output_directory}/ --download
 300 | 
 301 | # Note: Replace '{output_directory}' with the desired output path.
 302 | ```
 303 | 
 304 | 2. Profile-based collection:
 305 | 
 306 | ```sh
 307 | # Using package installation (Method 2)
 308 | tikspyder --q Trump --user username --output {output_directory}/ --download --apify --oldest-post-date 2025-01-01
 309 | 
 310 | # Using standard installation (Method 1)
 311 | python main.py --q Trump --user username --output {output_directory}/ --download --apify --oldest-post-date 2025-01-01
 312 | 
 313 | # Note: Replace '{output_directory}' with the desired output path.
 314 | ```
 315 | 
 316 | 3. Tag-based collection:
 317 | ```sh
 318 | # Using package installation (Method 2)
 319 | tikspyder --tag sinaloa --apify --oldest-post-date 2025-08-01 --number-of-results 50 --output {output_directory}/ --download
 320 | 
 321 | # Using standard installation (Method 1)
 322 | python main.py --tag sinaloa --apify --oldest-post-date 2025-08-01 --number-of-results 50 --output {output_directory}/ --download
 323 | 
 324 | # Note: Replace '{output_directory}' with the desired output path.
 325 | ```
 326 | 
 327 | ### **Tor Integration**
 328 | You can use the Tor network when downloading TikTok videos to enhance privacy and avoid rate limiting. To use this feature:
 329 | 
 330 | 1. Make sure Tor Browser is installed and running
 331 | 2. Configure your torrc file with:
 332 | 
 333 | ```
 334 | ## Enable SOCKS proxy
 335 | SocksPort 9050
 336 | 
 337 | ## Enable Control Port for IP rotation
 338 | ControlPort 9051
 339 | CookieAuthentication 1
 340 | ```
 341 | 
 342 | 3. Use the `--use-tor` flag when running the script. If the Tor connection fails, the script will automatically fall back to a normal connection. A minimal connectivity check is shown below.
 343 | 
 344 | 
 345 | 
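As a quick connectivity check — a minimal sketch, not part of TikSpyder itself — you can use `requests` (with PySocks) and `stem`, both already in `requirements.txt`, to route a request through the SOCKS port and ask for a fresh circuit via the control port configured above:

```python
# minimal sketch: route a request through Tor's SOCKS proxy and rotate the
# exit IP via the control port (ports match the torrc snippet above)
import requests

from stem import Signal
from stem.control import Controller

PROXIES = {
    'http': 'socks5h://127.0.0.1:9050',
    'https': 'socks5h://127.0.0.1:9050',
}

def current_ip() -> str:
    # httpbin.org/ip is just an example endpoint for inspecting the exit IP
    response = requests.get(
        'https://httpbin.org/ip', proxies=PROXIES, timeout=30
    )
    return response.json()['origin']

def rotate_ip() -> None:
    with Controller.from_port(port=9051) as controller:
        # uses the cookie enabled by `CookieAuthentication 1`
        controller.authenticate()
        controller.signal(Signal.NEWNYM)

print('before rotation:', current_ip())
rotate_ip()
print('after rotation:', current_ip())
```

Note that Tor rate-limits `NEWNYM` requests, so rapid consecutive rotations may return the same exit IP.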
346 | 347 | ## ☕ Support 348 | 349 | If you find TikSpyder helpful, please consider buying me a coffee to support ongoing development and maintenance. Your donation will help me continue to improve the tool and add new features. 350 | 351 | [![Buy Me A Coffee](https://img.shields.io/badge/buy%20me%20a%20coffee-donate-yellow.svg?style=for-the-badge&logo=buy-me-a-coffee&logoColor=white)](https://buymeacoffee.com/estebanpdl) 352 | 353 |
354 | -------------------------------------------------------------------------------- /data_collectors/collector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # import modules 4 | import os 5 | import time 6 | import json 7 | import uuid 8 | import httpx 9 | 10 | # typing 11 | from typing import Dict, List 12 | 13 | # SerpAPI module 14 | import serpapi 15 | 16 | # Apify client 17 | from apify_client import ApifyClient 18 | 19 | # local dependencies 20 | from .utilities import ( 21 | search_query, 22 | select_serpapi_parameters, 23 | extract_results_keys, 24 | extract_related_content_keys, 25 | build_site_query 26 | ) 27 | 28 | # utils 29 | from pathlib import Path 30 | 31 | # SQLManager 32 | from databases import SQLDatabaseManager 33 | 34 | # Media handlers 35 | from media_handlers import RequestSession 36 | 37 | # SerpAPI collector class 38 | class TikTokDataCollector: 39 | ''' 40 | TikTokDataCollector collects TikTok data from Google search results 41 | using SerpAPI. 42 | ''' 43 | 44 | def __init__(self, args: Dict) -> None: 45 | ''' 46 | Initializes TikTokDataCollector with the given parameters and options 47 | from the command line. 48 | 49 | :param args: Dict containing the command line arguments and options 50 | ''' 51 | # get output data path 52 | self.output = self._sanitize_output_path(args['output']) 53 | 54 | # endpoint for SerpAPI 55 | self.api_key = args['api_key'] 56 | self.endpoint = 'https://serpapi.com/search' 57 | 58 | # Apify token 59 | self.apify_token = args['apify_token'] 60 | 61 | # main site: tiktok.com 62 | self.site = 'tiktok.com' 63 | 64 | # build the search query string 65 | q = search_query(args=args) 66 | 67 | # get provided user and tag 68 | self.user = args['user'] 69 | self.tag = args['tag'] 70 | 71 | # build advanced search query using utility function 72 | self.query = build_site_query( 73 | site=self.site, user=self.user, tag=self.tag, q=q 74 | ) 75 | 76 | # update the query parameter in args 77 | args['q'] = self.query 78 | 79 | # store the parameters 80 | self.parameters = select_serpapi_parameters(args) 81 | 82 | # SerpAPI client 83 | self.client = serpapi.Client(api_key=self.api_key) 84 | 85 | # Apify client 86 | self.run_apify = args['apify'] 87 | if self.run_apify: 88 | if self.user is not None or self.tag is not None: 89 | self.should_download_videos = args['download'] 90 | self.apify_client = ApifyClient(self.apify_token) 91 | 92 | # optional date filters 93 | self.oldest_post_date = args['oldest_post_date'] 94 | self.newest_post_date = args['newest_post_date'] 95 | 96 | # number of results 97 | self.number_of_results = args['number_of_results'] 98 | 99 | # database connection 100 | self.sql_database = SQLDatabaseManager(self.output, self.run_apify) 101 | 102 | # connections 103 | self.related_content_urls = [] 104 | self.related_content_depth = args['depth'] 105 | self.http_session = RequestSession() 106 | 107 | def _sanitize_output_path(self, output: str) -> str: 108 | ''' 109 | Ensures the given path uses forward slashes and does not end with a 110 | slash. 111 | 112 | :param output: The original directory path. 113 | :return: A sanitized directory path with forward slashes and no 114 | trailing slash. 
115 | ''' 116 | # create a Path object and normalize the path 117 | path = Path(output) 118 | 119 | # path with the correct separators for the current OS 120 | output = str(path.as_posix()) 121 | 122 | # remove any trailing slashes 123 | output = output.rstrip('/') 124 | 125 | return output 126 | 127 | def collect_search_results(self) -> None: 128 | ''' 129 | Makes an API call to SerpAPI and processes the response data. 130 | 131 | Fetches data based on the initialized parameters and handles pagination 132 | to retrieve data from all available pages. 133 | ''' 134 | print (f'\nAPI call to Google search results\n') 135 | print (f'> search query: {self.query}') 136 | result_type = 'search_result' 137 | try: 138 | api_response = self.client.search(self.parameters) 139 | print ('\n> Searching...') 140 | 141 | # save raw data 142 | self._save_raw_data( 143 | self.output, 144 | result_type=result_type, 145 | data=api_response.data 146 | ) 147 | 148 | # found results 149 | found_results = False 150 | 151 | # process search results 152 | self._process_search_results(api_response.data) 153 | if api_response.data.get('organic_results', []): 154 | found_results = True 155 | 156 | # get next page 157 | next_page = api_response.next_page_url 158 | while next_page: 159 | # get new API response 160 | next_response = api_response.next_page() 161 | 162 | # save raw data 163 | self._save_raw_data( 164 | self.output, 165 | result_type=result_type, 166 | data=next_response.data 167 | ) 168 | 169 | # process search results 170 | self._process_search_results(next_response.data) 171 | 172 | # get next page 173 | next_page = next_response.next_page_url 174 | 175 | # update api_response for the next iteration 176 | api_response = next_response 177 | 178 | # chill out 179 | time.sleep(2) 180 | 181 | # api call status 182 | print ('> Done') 183 | 184 | if not found_results: 185 | print ('No organic results found.') 186 | 187 | except Exception as e: 188 | print (f'An error occurred during the API call: {e}') 189 | 190 | def _process_search_results(self, data: Dict) -> None: 191 | ''' 192 | Processes the response data from SerpAPI, extracting organic results 193 | and inserting them into the SQL database. 194 | 195 | :param data: SerpAPI raw data response 196 | ''' 197 | # get organic search results 198 | field = 'organic_results' 199 | result_type = 'search_result' 200 | results = data.get(field, []) 201 | if results: 202 | d = extract_results_keys(results, result_type=result_type) 203 | 204 | # write results in SQL database 205 | if d: 206 | self.sql_database.insert_search_results(d) 207 | 208 | def collect_image_results(self) -> None: 209 | ''' 210 | Makes an API call to SerpAPI to collect image thumbnails from Google 211 | Images. 
212 | ''' 213 | # Google Images API 214 | self.parameters['tbm'] = 'isch' 215 | 216 | # collect images 217 | print (f'\n\nAPI call to Google images') 218 | result_type = 'image_result' 219 | try: 220 | api_response = self.client.search(self.parameters) 221 | print ('\n> Searching images...') 222 | 223 | # save raw data 224 | self._save_raw_data( 225 | self.output, 226 | result_type=result_type, 227 | data=api_response.data 228 | ) 229 | 230 | # found results 231 | found_results = False 232 | 233 | # process images results 234 | self._process_images_results(api_response.data) 235 | if api_response.data.get('images_results', []): 236 | found_results = True 237 | print (f'> Downloading images results...') 238 | 239 | # get next page 240 | next_page = api_response.next_page_url 241 | while next_page: 242 | next_response = api_response.next_page() 243 | 244 | # save raw data 245 | self._save_raw_data( 246 | self.output, 247 | result_type=result_type, 248 | data=next_response.data 249 | ) 250 | 251 | # process image results 252 | self._process_images_results(next_response.data) 253 | 254 | # get next page 255 | next_page = next_response.next_page_url 256 | 257 | # update api_response for the next iteration 258 | api_response = next_response 259 | 260 | # chill out 261 | time.sleep(2) 262 | 263 | # api call status 264 | print ('> Done') 265 | 266 | if not found_results: 267 | print ('No image results found in the response.') 268 | 269 | except Exception as e: 270 | print (f'An error occurred during the API call: {e}') 271 | 272 | # collect related content 273 | print (f'\n\nCollecting related content') 274 | if self.related_content_urls: 275 | self.related_content_urls = self.related_content_urls[ 276 | :self.related_content_depth 277 | ] 278 | for url in self.related_content_urls: 279 | self._collect_related_content(url=url) 280 | print ('> Done') 281 | else: 282 | print ('No related content found.') 283 | 284 | def _process_images_results(self, data: Dict) -> None: 285 | ''' 286 | Processes the response data from SerpAPI, extracting thumbnails 287 | and inserting related data into the SQL database. 288 | 289 | :param data: SerpAPI raw data response 290 | ''' 291 | # get image results 292 | field = 'images_results' 293 | result_type = 'image_result' 294 | results = data.get(field, []) 295 | if results: 296 | d = extract_results_keys(results, result_type=result_type) 297 | 298 | # write results in SQL database 299 | if d: 300 | self.sql_database.insert_images_results(d) 301 | 302 | # download images 303 | thumbnails = [i['thumbnail'] for i in d] 304 | links = [i['link'] for i in d] 305 | self.http_session.start_media_download( 306 | urls=thumbnails, 307 | links=links, 308 | output=self.output, 309 | media_type='image' 310 | ) 311 | 312 | # save related content urls 313 | key = 'serpapi_related_content_link' 314 | self.related_content_urls += [ 315 | i[key] for i in d if key in i 316 | ] 317 | 318 | def _collect_related_content(self, url: str) -> None: 319 | ''' 320 | Collects related content from the given URL. 321 | 322 | :param url: The URL to load related content from. 
323 | ''' 324 | result_type = 'related_content' 325 | content = self.http_session.load_related_content( 326 | url=url, 327 | api_key=self.api_key 328 | ) 329 | 330 | # save raw data 331 | self._save_raw_data( 332 | self.output, 333 | result_type=result_type, 334 | data=content 335 | ) 336 | 337 | # process related content 338 | self._process_related_content(content) 339 | 340 | def _process_related_content(self, content: Dict) -> None: 341 | ''' 342 | Processes the related content data. 343 | 344 | :param content: A dictionary containing the related content data. 345 | ''' 346 | # get related content 347 | possible_fields = ['related_content', 'images_results'] 348 | related_content = [] 349 | for field in possible_fields: 350 | related_content = content.get(field, None) 351 | if related_content is not None: 352 | break 353 | 354 | if related_content: 355 | d = extract_related_content_keys(related_content) 356 | 357 | # write results in SQL database 358 | if d: 359 | self.sql_database.insert_related_content(d) 360 | else: 361 | print ('No results found in this URL') 362 | 363 | def _apify_tiktok_profile_scraper(self) -> None: 364 | ''' 365 | Collects search data using Apify. 366 | ''' 367 | print ('\n\nCollecting user data with Apify') 368 | 369 | # get the search results 370 | run_input = { 371 | 'profiles': [self.user], 372 | 'profileScrapeSections': ['videos'], 373 | 'profileSorting': 'latest', 374 | 'resultsPerPage': self.number_of_results, 375 | 'excludePinnedPosts': False, 376 | 'shouldDownloadVideos': self.should_download_videos, 377 | 'shouldDownloadCovers': True, 378 | 'shouldDownloadSubtitles': False, 379 | 'shouldDownloadSlideshowImages': False, 380 | 'shouldDownloadAvatars': True 381 | } 382 | 383 | # add optional date filters 384 | if self.oldest_post_date: 385 | run_input['oldestPostDate'] = self.oldest_post_date 386 | if self.newest_post_date: 387 | run_input['newestPostDate'] = self.newest_post_date 388 | 389 | # run the Apify actor 390 | apify_actor_key = '0FXVyOXXEmdGcV88a' 391 | try: 392 | run = self.apify_client.actor(apify_actor_key).call( 393 | run_input=run_input 394 | ) 395 | 396 | # store data 397 | store_data = [] 398 | for item in self.apify_client.dataset(run['defaultDatasetId']).iterate_items(): 399 | store_data.append(item) 400 | 401 | # write raw data 402 | if store_data: 403 | self._save_raw_data( 404 | self.output, 405 | result_type='apify_profile_data', 406 | data=store_data 407 | ) 408 | 409 | # process data 410 | self._process_apify_profile_data(store_data) 411 | else: 412 | print ('No data found in the Apify run.') 413 | except httpx.LocalProtocolError as e: 414 | print ('Warning: Apify API token is either missing or invalid. Skipping Apify integration.') 415 | 416 | def _process_apify_profile_data(self, data: Dict) -> None: 417 | ''' 418 | Processes the Apify profile data. 419 | 420 | :param data: A dictionary containing the Apify profile data. 
421 | ''' 422 | # insert data into SQL database 423 | self.sql_database.insert_apify_profile_data(data) 424 | 425 | # downloading images 426 | thumbnails = [] 427 | links = [] 428 | for item in data: 429 | try: 430 | thumbnails.append(item['videoMeta']['coverUrl']) 431 | links.append(item['webVideoUrl']) 432 | except KeyError: 433 | pass 434 | 435 | self.http_session.start_media_download( 436 | urls=thumbnails, 437 | links=links, 438 | output=self.output, 439 | media_type='image' 440 | ) 441 | print ('> Thumbnails downloaded') 442 | 443 | # get videos from Apify collected data 444 | if self.should_download_videos: 445 | videos = [] 446 | tiktok_links = [] 447 | for item in data: 448 | try: 449 | videos.append(item['videoMeta']['downloadAddr']) 450 | tiktok_links.append(item['webVideoUrl']) 451 | except KeyError: 452 | pass 453 | 454 | # download videos 455 | self.http_session.start_media_download( 456 | urls=videos, 457 | links=tiktok_links, 458 | output=self.output, 459 | media_type='video' 460 | ) 461 | print ('> Videos downloaded') 462 | 463 | # extract audio from videos 464 | print ('> Extracting audio from videos...') 465 | self.http_session.extract_audio_from_videos(self.output) 466 | print ('> Done') 467 | 468 | return 469 | 470 | def _apify_tiktok_hashtag_scraper(self) -> None: 471 | ''' 472 | Collects hashtag data using Apify. 473 | ''' 474 | print ('\n\nCollecting hashtag data with Apify') 475 | 476 | # get the hashtag results 477 | run_input = { 478 | 'hashtags': [self.tag], 479 | 'resultsPerPage': self.number_of_results, 480 | 'searchSection': '/video', 481 | 'searchQueries': [self.tag], 482 | 'excludePinnedPosts': False, 483 | 'shouldDownloadVideos': self.should_download_videos, 484 | 'shouldDownloadCovers': True, 485 | 'shouldDownloadSubtitles': False, 486 | 'shouldDownloadSlideshowImages': False, 487 | 'shouldDownloadAvatars': True 488 | } 489 | 490 | # run the Apify actor 491 | apify_actor_key = 'OtzYfK1ndEGdwWFKQ' 492 | try: 493 | run = self.apify_client.actor(apify_actor_key).call( 494 | run_input=run_input 495 | ) 496 | 497 | # store data 498 | store_data = [] 499 | for item in self.apify_client.dataset(run['defaultDatasetId']).iterate_items(): 500 | store_data.append(item) 501 | 502 | # write raw data 503 | if store_data: 504 | self._save_raw_data( 505 | self.output, 506 | result_type='apify_hashtag_data', 507 | data=store_data 508 | ) 509 | 510 | # process data 511 | self._process_apify_hashtag_data(store_data) 512 | else: 513 | print ('No data found in the Apify run.') 514 | except httpx.LocalProtocolError as e: 515 | print ('Warning: Apify API token is either missing or invalid. Skipping Apify integration.') 516 | 517 | def _process_apify_hashtag_data(self, data: Dict) -> None: 518 | ''' 519 | Processes the Apify hashtag data. 520 | 521 | :param data: A dictionary containing the Apify hashtag data. 
522 | ''' 523 | # insert data into SQL database 524 | self.sql_database.insert_apify_hashtag_data(data) 525 | 526 | # downloading images 527 | thumbnails = [] 528 | links = [] 529 | for item in data: 530 | try: 531 | thumbnails.append(item['videoMeta']['coverUrl']) 532 | links.append(item['webVideoUrl']) 533 | except KeyError: 534 | pass 535 | 536 | self.http_session.start_media_download( 537 | urls=thumbnails, 538 | links=links, 539 | output=self.output, 540 | media_type='image' 541 | ) 542 | print ('> Thumbnails downloaded') 543 | 544 | # get videos from Apify collected data 545 | if self.should_download_videos: 546 | videos = [] 547 | tiktok_links = [] 548 | for item in data: 549 | try: 550 | videos.append(item['videoMeta']['downloadAddr']) 551 | tiktok_links.append(item['webVideoUrl']) 552 | except KeyError: 553 | pass 554 | 555 | # download videos 556 | self.http_session.start_media_download( 557 | urls=videos, 558 | links=tiktok_links, 559 | output=self.output, 560 | media_type='video' 561 | ) 562 | print ('> Videos downloaded') 563 | 564 | # extract audio from videos 565 | print ('> Extracting audio from videos...') 566 | self.http_session.extract_audio_from_videos(self.output) 567 | print ('> Done') 568 | 569 | return 570 | 571 | def _save_raw_data(self, output: str, result_type: str, data: Dict) -> None: 572 | ''' 573 | Saves the raw data response from SerpAPI in a JSON file. 574 | 575 | :param output: The directory path where the raw data should be saved. 576 | :param result_type: Type of SerpAPI response: 'search_result', 577 | 'image_result', 'related_content', or Apify response 578 | :param data: The raw data response from SerpAPI to be saved. 579 | ''' 580 | # create the directory structure if it does not exist 581 | folder = f'{output}/raw_data/{result_type}' 582 | if not os.path.exists(folder): 583 | os.makedirs(folder) 584 | 585 | # create a timestamp for the file name 586 | stamp = int(time.time()) 587 | uuid_code = str(uuid.uuid4()).split('-')[-1] 588 | 589 | # convert the data to a JSON string 590 | obj = json.dumps(data, ensure_ascii=False, indent=2) 591 | 592 | # write the JSON string to a file 593 | file_path = f'{folder}/{result_type}_{stamp}_{uuid_code}.json' 594 | with open(file_path, encoding='utf-8', mode='w') as writer: 595 | writer.write(obj) 596 | 597 | def collect_search_data(self) -> None: 598 | ''' 599 | Collects both search results and corresponding image thumbnails. 600 | ''' 601 | print ('\n\n') 602 | print ('-' * 30) 603 | print ('Starting data collection process...\n') 604 | 605 | self.collect_search_results() 606 | self.collect_image_results() 607 | 608 | if self.run_apify: 609 | if self.user is not None: 610 | self._apify_tiktok_profile_scraper() 611 | elif self.tag is not None: 612 | self._apify_tiktok_hashtag_scraper() 613 | 614 | print ('\n\nData collection complete.') 615 | print ('-' * 30) 616 | 617 | def generate_data_files(self) -> None: 618 | ''' 619 | Selects all data from SQL tables and generates CSV files 620 | ''' 621 | print (f'\n\nGenerating CSV files') 622 | self.sql_database.fetch_all_data() 623 | print ('> Done') 624 | 625 | def get_collected_videos(self) -> List[str]: 626 | ''' 627 | Retrieves all collected video links from the SQL database. 628 | 629 | :return: A list of unique video links. 
630 | ''' 631 | return self.sql_database.get_collected_videos( 632 | include_user_related_content=self.user is not None 633 | ) 634 | 635 | def get_all_collected_videos(self) -> List[str]: 636 | ''' 637 | Retrieves all unique video links from the query_search_results, 638 | images_results, and Apify tables. 639 | ''' 640 | return self.sql_database.get_all_collected_videos() 641 | -------------------------------------------------------------------------------- /databases/sql_manager.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # import modules 4 | import os 5 | import sqlite3 6 | import pandas as pd 7 | 8 | # SQL submodules 9 | from sqlite3 import Error 10 | 11 | # typing 12 | from typing import List, Optional 13 | 14 | # Database Manager utilities 15 | from .utilities import get_items_from_search_results, \ 16 | get_items_from_images_results, get_items_from_related_content, \ 17 | get_items_from_apify_profile_data, get_items_from_apify_hashtag_data, \ 18 | extract_author_post_id 19 | 20 | # SQLDatabaseManager class 21 | class SQLDatabaseManager: 22 | ''' 23 | SQLDatabaseManager 24 | 25 | This class provides an abstracted interface for interacting with a SQL 26 | database. 27 | ''' 28 | def __init__(self, output: str, run_apify: bool) -> None: 29 | ''' 30 | Initializes the SQLDatabaseManager with the given output path. 31 | 32 | :param output: The directory path where the database file will be 33 | created. 34 | :param run_apify: Whether to run the apify profile scraper. 35 | ''' 36 | self.output = output 37 | self.sql_database_file = f'{self.output}/database.sql' 38 | 39 | # create required SQL tables for data processing - SerpAPI 40 | self.create_search_results_table() 41 | self.create_images_results_table() 42 | self.create_related_content_table() 43 | 44 | # create required SQL tables for data processing - Apify 45 | self.create_apify_profile_scraper_table() 46 | self.create_apify_hashtag_scraper_table() 47 | 48 | def create_sql_connection(self) -> Optional[sqlite3.Connection]: 49 | ''' 50 | Creates a SQL connection. 51 | 52 | :return: A SQLite connection object or None if an error occurred 53 | ''' 54 | try: 55 | conn = sqlite3.connect(self.sql_database_file) 56 | return conn 57 | except Error as e: 58 | print (f'An error occurred: {e}') 59 | return None 60 | 61 | def create_search_results_table(self) -> None: 62 | ''' 63 | Creates the query_search_results table if it does not already exist. 64 | ''' 65 | # set cursor 66 | conn = self.create_sql_connection() 67 | if conn is not None: 68 | cursor = conn.cursor() 69 | 70 | try: 71 | cursor.execute( 72 | ''' 73 | CREATE TABLE IF NOT EXISTS query_search_results ( 74 | record_id INTEGER PRIMARY KEY AUTOINCREMENT, 75 | source TEXT, 76 | title TEXT, 77 | snippet TEXT, 78 | link TEXT UNIQUE, 79 | thumbnail TEXT, 80 | video_link TEXT, 81 | snippet_highlighted_words TEXT, 82 | displayed_link TEXT, 83 | title_snippet TEXT, 84 | likes TEXT, 85 | comments TEXT, 86 | author TEXT, 87 | link_to_author TEXT, 88 | post_id TEXT UNIQUE 89 | ); 90 | ''' 91 | ) 92 | 93 | # commit changes 94 | conn.commit() 95 | except Error as e: 96 | print (f'An error occurred: {e}') 97 | finally: 98 | conn.close() 99 | else: 100 | print ('Failed to create the database connection.') 101 | 102 | def insert_search_results(self, data: List) -> None: 103 | ''' 104 | Inserts data into the query_search_results table. 105 | 106 | :param data: A list of dictionaries containing the data to insert. 
107 | ''' 108 | conn = self.create_sql_connection() 109 | if conn is not None: 110 | cursor = conn.cursor() 111 | 112 | try: 113 | for entry in data: 114 | cursor.execute( 115 | ''' 116 | INSERT OR IGNORE INTO query_search_results ( 117 | source, title, snippet, link, thumbnail, 118 | video_link, snippet_highlighted_words, 119 | displayed_link, title_snippet, likes, comments, 120 | author, link_to_author, post_id 121 | ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 122 | ''', 123 | get_items_from_search_results(entry) 124 | ) 125 | 126 | # commit changes 127 | conn.commit() 128 | 129 | except Error as e: 130 | print (f'An error occurred while inserting data: {e}') 131 | finally: 132 | conn.close() 133 | else: 134 | print ('Failed to create the database connection.') 135 | 136 | def create_images_results_table(self) -> None: 137 | ''' 138 | Creates the images_results table if it does not already exist. 139 | ''' 140 | # set cursor 141 | conn = self.create_sql_connection() 142 | if conn is not None: 143 | cursor = conn.cursor() 144 | 145 | try: 146 | cursor.execute( 147 | ''' 148 | CREATE TABLE IF NOT EXISTS images_results ( 149 | record_id INTEGER PRIMARY KEY AUTOINCREMENT, 150 | source TEXT, 151 | title TEXT, 152 | link TEXT UNIQUE, 153 | thumbnail TEXT, 154 | author TEXT, 155 | link_to_author TEXT, 156 | post_id TEXT UNIQUE 157 | ); 158 | ''' 159 | ) 160 | 161 | # commit changes 162 | conn.commit() 163 | except Error as e: 164 | print (f'An error occurred: {e}') 165 | finally: 166 | conn.close() 167 | else: 168 | print ('Failed to create the database connection.') 169 | 170 | def insert_images_results(self, data: List) -> None: 171 | ''' 172 | Inserts data into the images_results table. 173 | 174 | :param data: A list of dictionaries containing the data to insert. 175 | ''' 176 | conn = self.create_sql_connection() 177 | if conn is not None: 178 | cursor = conn.cursor() 179 | 180 | try: 181 | for entry in data: 182 | cursor.execute( 183 | ''' 184 | INSERT OR IGNORE INTO images_results ( 185 | source, title, link, thumbnail, author, 186 | link_to_author, post_id 187 | ) VALUES (?, ?, ?, ?, ?, ?, ?) 188 | ''', 189 | get_items_from_images_results(entry) 190 | ) 191 | 192 | # commit changes 193 | conn.commit() 194 | 195 | except Error as e: 196 | print (f'An error occurred while inserting data: {e}') 197 | finally: 198 | conn.close() 199 | else: 200 | print ('Failed to create the database connection.') 201 | 202 | def create_related_content_table(self) -> None: 203 | ''' 204 | Creates the related_content table if it does not already exist. 205 | ''' 206 | # set cursor 207 | conn = self.create_sql_connection() 208 | if conn is not None: 209 | cursor = conn.cursor() 210 | 211 | try: 212 | cursor.execute( 213 | ''' 214 | CREATE TABLE IF NOT EXISTS related_content ( 215 | record_id INTEGER PRIMARY KEY AUTOINCREMENT, 216 | source TEXT, 217 | link TEXT UNIQUE, 218 | thumbnail TEXT, 219 | title TEXT 220 | ); 221 | ''' 222 | ) 223 | 224 | # commit changes 225 | conn.commit() 226 | except Error as e: 227 | print (f'An error occurred: {e}') 228 | finally: 229 | conn.close() 230 | else: 231 | print ('Failed to create the database connection.') 232 | 233 | def insert_related_content(self, data: List) -> None: 234 | ''' 235 | Inserts data into the related_content table. 236 | 237 | :param data: A list of dictionaries containing the data to insert. 
238 | ''' 239 | conn = self.create_sql_connection() 240 | if conn is not None: 241 | cursor = conn.cursor() 242 | 243 | try: 244 | for entry in data: 245 | cursor.execute( 246 | ''' 247 | INSERT OR IGNORE INTO related_content ( 248 | source, link, thumbnail, title 249 | ) VALUES (?, ?, ?, ?) 250 | ''', 251 | get_items_from_related_content(entry) 252 | ) 253 | 254 | # commit changes 255 | conn.commit() 256 | 257 | except Error as e: 258 | print (f'An error occurred while inserting data: {e}') 259 | finally: 260 | conn.close() 261 | else: 262 | print ('Failed to create the database connection.') 263 | 264 | def create_apify_profile_scraper_table(self) -> None: 265 | ''' 266 | Creates the apify_profile_scraper table if it does not already exist. 267 | ''' 268 | conn = self.create_sql_connection() 269 | if conn is not None: 270 | cursor = conn.cursor() 271 | 272 | try: 273 | cursor.execute( 274 | ''' 275 | CREATE TABLE IF NOT EXISTS apify_profile_scraper ( 276 | id TEXT PRIMARY KEY, 277 | text TEXT, 278 | text_language TEXT, 279 | create_time INTEGER, 280 | create_time_iso TEXT, 281 | is_ad BOOLEAN, 282 | web_video_url TEXT UNIQUE, 283 | 284 | author_id TEXT, 285 | author_name TEXT, 286 | author_profile_url TEXT, 287 | author_bio_link TEXT, 288 | author_signature TEXT, 289 | author_nickname TEXT, 290 | author_verified BOOLEAN, 291 | author_avatar TEXT, 292 | author_private_account BOOLEAN, 293 | author_region TEXT, 294 | author_following INTEGER, 295 | author_friends INTEGER, 296 | author_fans INTEGER, 297 | author_heart INTEGER, 298 | author_video INTEGER, 299 | author_digg INTEGER, 300 | 301 | music_id TEXT, 302 | music_name TEXT, 303 | music_author TEXT, 304 | music_original BOOLEAN, 305 | 306 | video_duration INTEGER, 307 | video_thumbnail TEXT, 308 | video_download_url TEXT, 309 | 310 | digg_count INTEGER, 311 | share_count INTEGER, 312 | play_count INTEGER, 313 | collect_count INTEGER, 314 | comment_count INTEGER, 315 | 316 | hashtags TEXT, 317 | is_slideshow BOOLEAN, 318 | is_pinned BOOLEAN, 319 | is_sponsored BOOLEAN, 320 | input_username TEXT, 321 | from_profile_section TEXT, 322 | 323 | UNIQUE (id, web_video_url) 324 | ON CONFLICT REPLACE 325 | ); 326 | ''' 327 | ) 328 | 329 | # commit changes 330 | conn.commit() 331 | except Error as e: 332 | print (f'An error occurred: {e}') 333 | finally: 334 | conn.close() 335 | else: 336 | print ('Failed to create the database connection.') 337 | 338 | def create_apify_hashtag_scraper_table(self) -> None: 339 | ''' 340 | Creates the apify_hashtag_scraper table if it does not already exist. 
341 | ''' 342 | conn = self.create_sql_connection() 343 | if conn is not None: 344 | cursor = conn.cursor() 345 | 346 | try: 347 | cursor.execute( 348 | ''' 349 | CREATE TABLE IF NOT EXISTS apify_hashtag_scraper ( 350 | id TEXT PRIMARY KEY, 351 | text TEXT, 352 | text_language TEXT, 353 | create_time INTEGER, 354 | create_time_iso TEXT, 355 | is_ad BOOLEAN, 356 | web_video_url TEXT UNIQUE, 357 | 358 | author_id TEXT, 359 | author_name TEXT, 360 | author_profile_url TEXT, 361 | author_bio_link TEXT, 362 | author_signature TEXT, 363 | author_nickname TEXT, 364 | author_verified BOOLEAN, 365 | author_avatar TEXT, 366 | author_private_account BOOLEAN, 367 | author_region TEXT, 368 | author_following INTEGER, 369 | author_friends INTEGER, 370 | author_fans INTEGER, 371 | author_heart INTEGER, 372 | author_video INTEGER, 373 | author_digg INTEGER, 374 | 375 | music_id TEXT, 376 | music_name TEXT, 377 | music_author TEXT, 378 | music_original BOOLEAN, 379 | 380 | video_duration INTEGER, 381 | video_thumbnail TEXT, 382 | video_download_url TEXT, 383 | 384 | digg_count INTEGER, 385 | share_count INTEGER, 386 | play_count INTEGER, 387 | collect_count INTEGER, 388 | comment_count INTEGER, 389 | 390 | hashtags TEXT, 391 | is_slideshow BOOLEAN, 392 | is_pinned BOOLEAN, 393 | is_sponsored BOOLEAN, 394 | input_search TEXT, 395 | search_hashtag_views INTEGER, 396 | 397 | UNIQUE (id, web_video_url) 398 | ON CONFLICT REPLACE 399 | ); 400 | ''' 401 | ) 402 | 403 | # commit changes 404 | conn.commit() 405 | except Error as e: 406 | print (f'An error occurred: {e}') 407 | finally: 408 | conn.close() 409 | else: 410 | print ('Failed to create the database connection.') 411 | 412 | def insert_apify_profile_data(self, data: List) -> None: 413 | ''' 414 | Inserts data into the apify_profile_scraper table. 415 | 416 | :param data: A list of dictionaries containing the data to insert. 417 | ''' 418 | conn = self.create_sql_connection() 419 | if conn is not None: 420 | cursor = conn.cursor() 421 | 422 | try: 423 | for entry in data: 424 | cursor.execute( 425 | ''' 426 | INSERT OR REPLACE INTO apify_profile_scraper ( 427 | id, text, text_language, create_time, create_time_iso, 428 | is_ad, web_video_url, author_id, author_name, 429 | author_profile_url, author_bio_link, author_signature, 430 | author_nickname, author_verified, author_avatar, 431 | author_private_account, author_region, author_following, 432 | author_friends, author_fans, author_heart, author_video, 433 | author_digg, music_id, music_name, music_author, 434 | music_original, video_duration, video_thumbnail, 435 | video_download_url, digg_count, share_count, play_count, 436 | collect_count, comment_count, hashtags, is_slideshow, 437 | is_pinned, is_sponsored, input_username, 438 | from_profile_section 439 | ) VALUES ( 440 | ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 441 | ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 442 | ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? 443 | ) 444 | ''', 445 | get_items_from_apify_profile_data(entry) 446 | ) 447 | 448 | # commit changes 449 | conn.commit() 450 | except Error as e: 451 | print (f'An error occurred while inserting data: {e}') 452 | finally: 453 | conn.close() 454 | else: 455 | print ('Failed to create the database connection.') 456 | 457 | def insert_apify_hashtag_data(self, data: List) -> None: 458 | ''' 459 | Inserts data into the apify_hashtag_scraper table. 460 | 461 | :param data: A list of dictionaries containing the data to insert. 
462 |         '''
463 |         conn = self.create_sql_connection()
464 |         if conn is not None:
465 |             cursor = conn.cursor()
466 | 
467 |             try:
468 |                 for entry in data:
469 |                     cursor.execute(
470 |                         '''
471 |                         INSERT OR REPLACE INTO apify_hashtag_scraper (
472 |                             id, text, text_language, create_time, create_time_iso,
473 |                             is_ad, web_video_url, author_id, author_name,
474 |                             author_profile_url, author_bio_link, author_signature,
475 |                             author_nickname, author_verified, author_avatar,
476 |                             author_private_account, author_region, author_following,
477 |                             author_friends, author_fans, author_heart, author_video,
478 |                             author_digg, music_id, music_name, music_author,
479 |                             music_original, video_duration, video_thumbnail,
480 |                             video_download_url, digg_count, share_count, play_count,
481 |                             collect_count, comment_count, hashtags, is_slideshow,
482 |                             is_pinned, is_sponsored, input_search,
483 |                             search_hashtag_views
484 |                         ) VALUES (
485 |                             ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
486 |                             ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
487 |                             ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?
488 |                         )
489 |                         ''',
490 |                         get_items_from_apify_hashtag_data(entry)
491 |                     )
492 | 
493 |                 # commit changes
494 |                 conn.commit()
495 |             except Error as e:
496 |                 print(f'An error occurred while inserting data: {e}')
497 |             finally:
498 |                 conn.close()
499 |         else:
500 |             print('Failed to create the database connection.')
501 | 
502 |     def fetch_all_data(self) -> None:
503 |         '''
504 |         Fetches all data from the SQL tables and exports each table to CSV.
505 |         '''
506 |         tables = [
507 |             'query_search_results',
508 |             'images_results',
509 |             'related_content',
510 |             'apify_profile_scraper',
511 |             'apify_hashtag_scraper'
512 |         ]
513 |         conn = self.create_sql_connection()
514 |         if conn is not None:
515 |             try:
516 |                 for t in tables:
517 |                     q = f'''
518 |                     SELECT *
519 |                     FROM {t}
520 |                     '''
521 |                     # fetch data
522 |                     df = pd.read_sql_query(q, conn)
523 | 
524 |                     # save data
525 |                     save_path = f'{self.output}/{t}.csv'
526 |                     df.to_csv(
527 |                         save_path,
528 |                         index=False,
529 |                         encoding='utf-8'
530 |                     )
531 | 
532 |             except Exception as e:  # pd.read_sql_query failures are not sqlite3.Error instances
533 |                 print(f'An error occurred while fetching data from {t}: {e}')
534 |             finally:
535 |                 conn.close()
536 | 
537 |     def get_collected_videos(self, include_user_related_content: bool) -> List:
538 |         '''
539 |         Retrieves all unique video links from the query_search_results and
540 |         images_results tables that have not been downloaded yet.
541 | 
542 |         :param include_user_related_content: Whether to include user-related
543 |             content from Google search results in the returned list of links.
544 |         :return: A list of unique video links.
545 |         '''
546 |         data = []
547 |         conn = self.create_sql_connection()
548 |         if conn is not None:
549 |             cursor = conn.cursor()
550 | 
551 |             try:
552 |                 # get all video links from database
553 |                 cursor.execute(
554 |                     '''
555 |                     SELECT link
556 |                     FROM query_search_results
557 |                     UNION
558 |                     SELECT link
559 |                     FROM images_results
560 |                     '''
561 |                 )
562 | 
563 |                 # fetch all links
564 |                 all_links = [i[0] for i in cursor.fetchall()]
565 | 
566 |                 if include_user_related_content and all_links:
567 |                     # get user from the first link (the check above avoids an IndexError on empty results)
568 |                     user = extract_author_post_id(all_links[0])[0]
569 | 
570 |                     # get all user-related content links that match the user's TikTok video pattern
571 |                     cursor.execute(
572 |                         '''
573 |                         SELECT link
574 |                         FROM related_content
575 |                         WHERE link LIKE ?
576 |                         ''',
577 |                         (f'https://www.tiktok.com/@{user}/video/%',)
578 |                     )
579 | 
580 |                     # fetch all links
581 |                     all_links.extend([i[0] for i in cursor.fetchall()])
582 | 
583 |                 # remove duplicates
584 |                 all_links = list(set(all_links))
585 | 
586 |                 # get list of already downloaded videos
587 |                 videos_dir = os.path.join(self.output, 'downloaded_videos')
588 | 
589 |                 if os.path.exists(videos_dir):
590 |                     # get existing video ids
591 |                     existing_ids = {
592 |                         os.path.splitext(f)[0]
593 |                         for f in os.listdir(videos_dir)
594 |                         if os.path.isfile(os.path.join(videos_dir, f))
595 |                     }
596 | 
597 |                     # filter out links whose IDs are already downloaded
598 |                     data = [
599 |                         link for link in all_links
600 |                         if extract_author_post_id(link)[2] not in existing_ids
601 |                     ]
602 |                 else:
603 |                     data = all_links
604 |             except Error as e:
605 |                 print(f'An error occurred while retrieving data: {e}')
606 |             finally:
607 |                 conn.close()
608 | 
609 |         return data
610 | 
611 |     def get_all_collected_videos(self) -> List:
612 |         '''
613 |         Retrieves all unique video links from the query_search_results,
614 |         images_results, and Apify tables. :return: A list of unique links.
615 |         '''
616 |         conn = self.create_sql_connection()
617 |         if conn is not None:
618 |             cursor = conn.cursor()
619 | 
620 |             try:
621 |                 # get all video links from database
622 |                 cursor.execute(
623 |                     '''
624 |                     SELECT web_video_url
625 |                     FROM apify_profile_scraper
626 |                     UNION
627 |                     SELECT web_video_url
628 |                     FROM apify_hashtag_scraper
629 |                     UNION
630 |                     SELECT link
631 |                     FROM query_search_results
632 |                     UNION
633 |                     SELECT link
634 |                     FROM images_results
635 |                     '''
636 |                 )
637 | 
638 |                 # fetch all links
639 |                 all_links = [i[0] for i in cursor.fetchall()]
640 | 
641 |                 # remove duplicates
642 |                 all_links = list(set(all_links))
643 | 
644 |                 return all_links
645 |             except Error as e:
646 |                 print(f'An error occurred while retrieving data: {e}')
647 |             finally:
648 |                 conn.close()
649 | 
650 |         return []
651 | 
--------------------------------------------------------------------------------
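A minimal usage sketch of how the methods above fit together, for orientation only. The constructor arguments, the `apify_items` variable, and the file name are assumptions for illustration; none of them appear in sql_manager.py (only the class name `SQLDatabaseManager` and `self.output` are taken from the source).

# usage_sketch.py: hypothetical example, not a file in the repository
from databases import SQLDatabaseManager

# assumed constructor: takes the output directory that fetch_all_data()
# and get_collected_videos() read via self.output
db = SQLDatabaseManager(output='./tikspyder-data/1700000000')

# create the destination table, then upsert a batch of Apify actor results
db.create_apify_profile_scraper_table()
db.insert_apify_profile_data(apify_items)  # apify_items: list of dicts

# export every table to <output>/<table>.csv
db.fetch_all_data()

# video links not yet present in <output>/downloaded_videos/
pending = db.get_collected_videos(include_user_related_content=False)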