├── streamlit_app
│   ├── __init__.py
│   ├── styles
│   │   ├── __init__.py
│   │   └── css.py
│   ├── components
│   │   ├── __init__.py
│   │   ├── progress.py
│   │   ├── main_panel.py
│   │   └── sidebar.py
│   ├── core
│   │   ├── __init__.py
│   │   ├── keyframes_processor.py
│   │   └── collection_runner.py
│   └── utils
│       ├── __init__.py
│       ├── session_state.py
│       └── file_browser.py
├── databases
│   ├── __init__.py
│   ├── utilities.py
│   └── sql_manager.py
├── data_collectors
│   ├── __init__.py
│   ├── utilities.py
│   └── collector.py
├── images
│   └── streamlit-interface.png
├── .gitignore
├── config
│   └── config.ini
├── media_handlers
│   ├── __init__.py
│   ├── video_downloader.py
│   └── session_manager.py
├── requirements.txt
├── setup.py
├── app.py
├── utils
│   └── __init__.py
├── main.py
└── README.md
/streamlit_app/__init__.py:
--------------------------------------------------------------------------------
1 | # Streamlit UI package
--------------------------------------------------------------------------------
/streamlit_app/styles/__init__.py:
--------------------------------------------------------------------------------
1 | # Styles and CSS
--------------------------------------------------------------------------------
/streamlit_app/components/__init__.py:
--------------------------------------------------------------------------------
1 | # UI Components
--------------------------------------------------------------------------------
/streamlit_app/core/__init__.py:
--------------------------------------------------------------------------------
1 | # Core business logic
--------------------------------------------------------------------------------
/streamlit_app/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Streamlit utilities
--------------------------------------------------------------------------------
/databases/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from .sql_manager import SQLDatabaseManager
3 |
--------------------------------------------------------------------------------
/data_collectors/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from .collector import TikTokDataCollector
3 |
--------------------------------------------------------------------------------
/images/streamlit-interface.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/estebanpdl/tik-spyder/HEAD/images/streamlit-interface.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Ignore Cache
2 | .ipynb_checkpoints/
3 | __pycache__/
4 | .vscode/
5 |
6 | # package metadata
7 | *.egg-info
8 |
--------------------------------------------------------------------------------
/config/config.ini:
--------------------------------------------------------------------------------
1 | [SerpAPI Key]
2 | api_key = your_serp_api_key
3 |
4 | [Apify Token]
5 | apify_token = your_apify_token
6 |
--------------------------------------------------------------------------------
/media_handlers/__init__.py:
--------------------------------------------------------------------------------
1 | from .session_manager import RequestSession
2 | from .video_downloader import VideoDownloader
3 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | aiohttp
2 | apify-client
3 | httpx
4 | pandas
5 | PySocks
6 | requests
7 | serpapi
8 | stem
9 | streamlit
10 | tqdm
11 | yt-dlp[default]
12 |
--------------------------------------------------------------------------------
/streamlit_app/utils/session_state.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # import modules
4 | import streamlit as st
5 | import time
6 |
7 | def initialize_session_state():
8 | """Initialize session state variables"""
9 | if 'output_dir' not in st.session_state:
10 | timestamp = int(time.time())
11 | st.session_state.output_dir = f'./tikspyder-data/{timestamp}'
--------------------------------------------------------------------------------
/streamlit_app/utils/file_browser.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # import modules
4 | import tkinter as tk
5 |
6 | # import submodules
7 | from tkinter import filedialog
8 |
9 | def select_directory():
10 | """Create a directory picker dialog"""
11 | root = tk.Tk()
12 | root.withdraw()
13 | root.wm_attributes('-topmost', 1)
14 | folder_path = filedialog.askdirectory()
15 | root.destroy()
16 | return folder_path
--------------------------------------------------------------------------------
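
A defensive-usage sketch for the picker above (illustrative, not part of the
repository): tkinter needs a display server, so on a headless machine tk.Tk()
raises tk.TclError, and a caller can fall back to manual path entry.

    import tkinter as tk

    from streamlit_app.utils.file_browser import select_directory

    def select_directory_or_none():
        try:
            # an empty string means the dialog was cancelled
            return select_directory() or None
        except tk.TclError:
            # no display available (e.g. a remote server)
            return None
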
/setup.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # import modules
4 | import os
5 | from setuptools import setup, find_packages
6 |
7 | setup(
8 | name="tikspyder",
9 | version="0.1.0",
10 | packages=find_packages(),
11 | install_requires=[
12 | "aiohttp",
13 | "apify-client",
14 | "pandas",
15 | "PySocks",
16 | "requests",
17 | "serpapi",
18 | "stem",
19 | "streamlit",
20 | "tqdm",
21 | "yt-dlp[default]"
22 | ],
23 | entry_points={
24 | 'console_scripts': [
25 | 'tikspyder=main:main',
26 | ],
27 | },
28 | python_requires='>=3.7',  # dataclasses (used in streamlit_app) need 3.7+
29 | author="Esteban Ponce de Leon",
30 | description="A tool for collecting TikTok data",
31 | long_description=open('README.md', encoding='utf-8').read() if os.path.exists('README.md') else '',
32 | long_description_content_type="text/markdown",
33 | )
34 |
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # import modules
4 | import streamlit as st
5 | import os
6 |
7 | # local imports
8 | from streamlit_app.styles.css import load_css
9 | from streamlit_app.utils.session_state import initialize_session_state
10 | from streamlit_app.components.sidebar import render_sidebar
11 | from streamlit_app.components.main_panel import render_main_panel
12 | from streamlit_app.core.collection_runner import run_collection, validate_input
13 | from utils import get_config_attrs, get_project_root
14 |
15 | # Configure Streamlit page
16 | st.set_page_config(
17 | page_title="TikSpyder - TikTok Data Collection",
18 | page_icon="🕷️",
19 | layout="wide",
20 | initial_sidebar_state="expanded"
21 | )
22 |
23 | # Set theme programmatically to dark
24 | st._config.set_option('theme.base', 'dark')
25 | st._config.set_option('theme.backgroundColor', '#0e1117')
26 | st._config.set_option('theme.secondaryBackgroundColor', '#262730')
27 | st._config.set_option('theme.textColor', '#ffffff')
28 |
29 | def main():
30 | """Main application entry point"""
31 | # Load styling
32 | load_css()
33 |
34 | # Initialize session state
35 | initialize_session_state()
36 |
37 | # Get project configuration
38 | project_root = get_project_root()
39 | config_path = os.path.join(project_root, 'config')
40 | config_attrs = get_config_attrs(config_path)
41 |
42 | # Main header
43 | st.markdown('<h1>🕷️ TikSpyder</h1>', unsafe_allow_html=True)
44 | st.markdown('<p>Advanced TikTok Data Collection</p>', unsafe_allow_html=True)
45 |
46 | # Render UI components
47 | search_config, apify_config = render_sidebar()
48 | collection_config, start_collection = render_main_panel()
49 |
50 | # Handle collection start
51 | if start_collection:
52 | if validate_input(search_config):
53 | run_collection(search_config, apify_config, collection_config, config_attrs)
54 |
55 | if __name__ == '__main__':
56 | main()
--------------------------------------------------------------------------------
/streamlit_app/components/progress.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # import modules
4 | import streamlit as st
5 | import time
6 |
7 | # local imports
8 | from ..styles.css import create_status_badge
9 |
10 | def create_progress_tracker():
11 | """Create and return progress tracking components"""
12 | # Create status container
13 | status_container = st.container()
14 |
15 | with status_container:
16 | st.markdown("### 🔄 Collection Progress")
17 |
18 | # Create progress indicators
19 | overall_progress = st.progress(0)
20 | status_text = st.empty()
21 | step_container = st.container()
22 |
23 | # Collection steps with icons and descriptions
24 | steps = [
25 | ("🔍", "Initializing search parameters..."),
26 | ("📡", "Collecting search results..."),
27 | ("🖼️", "Gathering image thumbnails..."),
28 | ("🚀", "Running Apify integration..."),
29 | ("📁", "Generating data files..."),
30 | ("📹", "Downloading videos..."),
31 | ("🎞️", "Extracting keyframes..."),
32 | ("✅", "Collection complete!")
33 | ]
34 |
35 | step_progress = {}
36 | for i, (icon, desc) in enumerate(steps):
37 | step_progress[i] = step_container.empty()
38 |
39 | return overall_progress, status_text, step_progress, steps
40 |
41 | def update_progress(step_num, overall_progress, status_text, step_progress, steps, message=None, progress_value=None):
42 | """Update progress indicators"""
43 | if step_num < len(steps):
44 | icon, desc = steps[step_num]
45 | step_progress[step_num].markdown(f"{icon} {desc}")
46 |
47 | if message:
48 | status_text.markdown(create_status_badge(message, "warning"), unsafe_allow_html=True)
49 |
50 | if progress_value is not None:
51 | overall_progress.progress(progress_value)
52 |
53 | time.sleep(0.1) # Allow UI to update
54 |
55 | def mark_step_complete(step_num, step_progress, message):
56 | """Mark a step as completed"""
57 | step_progress[step_num].markdown(f"✅ {message}")
--------------------------------------------------------------------------------
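
A minimal usage sketch of the progress API above (illustrative; it assumes a
running Streamlit script, mirroring how collection_runner.py drives it):

    from streamlit_app.components.progress import (
        create_progress_tracker, update_progress, mark_step_complete
    )

    overall, status, step_slots, steps = create_progress_tracker()
    update_progress(0, overall, status, step_slots, steps, progress_value=10)
    update_progress(1, overall, status, step_slots, steps, "Searching...", 25)
    mark_step_complete(1, step_slots, "Search results collected")
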
/streamlit_app/core/keyframes_processor.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # import modules
4 | import os
5 | import glob
6 | import subprocess
7 | from concurrent.futures import ThreadPoolExecutor, as_completed
8 |
9 | def extract_keyframes_sync(output_dir, max_workers=3):
10 | """Synchronous keyframes extraction - no async conflicts"""
11 | # Build keyframes path
12 | keyframes_path = f'{output_dir}/keyframes'
13 | if not os.path.exists(keyframes_path):
14 | os.makedirs(keyframes_path)
15 |
16 | # Get all video files
17 | video_path = f'{output_dir}/downloaded_videos'
18 | if not os.path.exists(video_path):
19 | return
20 |
21 | files = glob.glob(f'{video_path}/*.mp4')
22 | if not files:
23 | return
24 |
25 | # Videos already processed
26 | processed_videos = []
27 | if os.path.exists(keyframes_path):
28 | processed_videos = [d for d in os.listdir(keyframes_path)
29 | if os.path.isdir(os.path.join(keyframes_path, d))]
30 |
31 | def extract_single_video_keyframes(file):
32 | """Extract keyframes from a single video file"""
33 | try:
34 | # Get id from video filename
35 | video_id = os.path.basename(file).split('.')[0]
36 | if video_id in processed_videos:
37 | return
38 |
39 | # Create subdirectory for this video_id
40 | video_keyframes_dir = f'{keyframes_path}/{video_id}'
41 | if not os.path.exists(video_keyframes_dir):
42 | os.makedirs(video_keyframes_dir)
43 |
44 | # FFmpeg command to extract keyframes
45 | cmd = [
46 | 'ffmpeg',
47 | '-i', file,
48 | '-vf', 'select=eq(pict_type\\,I)',
49 | '-vsync', 'vfr',
50 | '-q:v', '2',
51 | '-y', # Overwrite output files
52 | f'{video_keyframes_dir}/keyframe_%04d.jpg'
53 | ]
54 |
55 | # Run FFmpeg synchronously
56 | subprocess.run(
57 | cmd,
58 | stdout=subprocess.PIPE,
59 | stderr=subprocess.PIPE,
60 | text=True
61 | )
62 |
63 | except Exception:
64 | # Silently handle errors
65 | pass
66 |
67 | # Process videos with controlled concurrency
68 | max_workers = min(max_workers, len(files))
69 |
70 | with ThreadPoolExecutor(max_workers=max_workers) as executor:
71 | # Submit all tasks
72 | future_to_file = {executor.submit(extract_single_video_keyframes, file): file
73 | for file in files}
74 |
75 | # Process completed tasks silently
76 | for future in as_completed(future_to_file):
77 | # drain completed futures quietly; each worker already
78 | # swallows its own errors, so there is nothing to report
79 | future.result()
--------------------------------------------------------------------------------
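
A minimal call of the helper above (the output directory is a placeholder; the
comment shows the roughly equivalent shell invocation the module builds, which
keeps only intra-coded frames and writes JPEGs at quality 2):

    # Roughly equivalent to running, per video:
    #   ffmpeg -i video.mp4 -vf "select=eq(pict_type\,I)" -vsync vfr -q:v 2 \
    #       -y keyframes/keyframe_%04d.jpg
    from streamlit_app.core.keyframes_processor import extract_keyframes_sync

    extract_keyframes_sync('./tikspyder-data/1700000000', max_workers=3)
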
/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # import modules
4 | import os
5 |
6 | # typing
7 | from typing import Dict
8 |
9 | # import submodules
10 | from configparser import ConfigParser
11 | from datetime import datetime
12 |
13 | def get_project_root():
14 | """Get the project root directory."""
15 | # Get the directory where main.py is located
16 | current_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
17 | return current_dir
18 |
19 | '''
20 | Get configuration attributes
21 |
22 | '''
23 | def get_config_attrs(config_dir=None) -> Dict:
24 | '''
25 | Retrieves configuration attributes from configuration files.
26 |
27 | :param config_dir: Optional path to the config directory.
28 | If None, uses the default path.
29 | :return: A dictionary containing the SerpAPI and Apify credentials.
30 | '''
31 | if config_dir is None:
32 | project_root = get_project_root()
33 | config_dir = os.path.join(project_root, 'config')
34 |
35 | path = os.path.join(config_dir, 'config.ini')
36 |
37 | # config parser
38 | config = ConfigParser()
39 | config.read(path)
40 |
41 | # Get credentials from both sections
42 | credentials = {}
43 |
44 | # SerpAPI credentials
45 | if 'SerpAPI Key' in config:
46 | credentials.update(dict(config['SerpAPI Key']))
47 |
48 | # Apify credentials
49 | if 'Apify Token' in config:
50 | credentials.update(dict(config['Apify Token']))
51 |
52 | return credentials
53 |
54 | '''
55 | Verify date format
56 |
57 | '''
58 | def is_valid_date(date_str: str) -> bool:
59 | '''
60 | Verifies if the given date string is in the format YYYY-MM-DD.
61 |
62 | :param date_str: The date string to verify.
63 | :return: True if the date string is valid, False otherwise.
64 | '''
65 | try:
66 | # Attempt to parse the date string with the expected format
67 | datetime.strptime(date_str, '%Y-%m-%d')
68 | return True
69 | except ValueError:
70 | # If a ValueError is raised, the format is incorrect
71 | return False
72 |
73 | def verify_date_argument(args: Dict, key: str) -> None:
74 | '''
75 | Verifies that a date argument in args is correctly formatted.
76 |
77 | :param args: Dictionary containing command line arguments and options.
78 | :param key: The key in args to check for a valid date.
79 | :raises ValueError: If the date is not in the correct format.
80 | '''
81 | if key in args:
82 | if not is_valid_date(args[key]):
83 | raise ValueError(
84 | f"The date for '{key}' argument is not in the correct "
85 | "format. Use this format: YYYY-MM-DD."
86 | )
87 |
88 | '''
89 | Create output data path
90 |
91 | '''
92 | def create_output_data_path(path: str) -> None:
93 | '''
94 | Creates the specified directory path if it does not already exist.
95 |
96 | :param path: The directory path to create.
97 | :return: None
98 | '''
99 | if not os.path.exists(path):
100 | os.makedirs(path)
101 |
102 |
--------------------------------------------------------------------------------
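
A short sketch of the helpers above (illustrative; it assumes config/config.ini
is filled in as shown earlier):

    from utils import (
        get_config_attrs, verify_date_argument, create_output_data_path
    )

    creds = get_config_attrs()  # e.g. {'api_key': '...', 'apify_token': '...'}
    args = {'after': '2024-01-01', 'before': '2024-06-30'}
    verify_date_argument(args, 'after')  # raises ValueError on a bad format
    create_output_data_path('./tikspyder-data/demo')
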
/streamlit_app/components/main_panel.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # import modules
4 | import streamlit as st
5 |
6 | # local imports
7 | from dataclasses import dataclass
8 | from ..utils.file_browser import select_directory
9 |
10 | @dataclass
11 | class CollectionConfig:
12 | """Configuration for collection settings"""
13 | download_videos: bool = True
14 | use_tor: bool = False
15 | max_workers: int = 5
16 | output_dir: str = ''
17 |
18 | def render_main_panel():
19 | """Render main content panels and return configuration"""
20 |
21 | # Main Content Area - Better organized panels
22 | st.markdown("## ⚙️ Collection Settings")
23 |
24 | # Download Settings Panel
25 | with st.container():
26 | st.markdown("### 📥 Download & Processing Settings")
27 | st.markdown("") # Add consistent spacing
28 |
29 | col1, col2, col3 = st.columns([1, 1, 1])
30 |
31 | with col1:
32 | st.markdown("**📹 Download Videos**")
33 | download_videos = st.toggle(
34 | "Enable video downloads",
35 | value=True,
36 | help="Download TikTok videos to local storage",
37 | label_visibility="collapsed"
38 | )
39 |
40 | with col2:
41 | st.markdown("**🔒 Use Tor Network**")
42 | use_tor = st.toggle(
43 | "Enable Tor for downloads",
44 | help="Enable Tor for anonymous downloads",
45 | label_visibility="collapsed"
46 | )
47 |
48 | with col3:
49 | max_workers = st.number_input(
50 | '⚡ **Concurrent Workers**',
51 | min_value=1,
52 | max_value=20,
53 | value=5,
54 | help='Number of concurrent download workers'
55 | )
56 |
57 | st.markdown("---")
58 |
59 | # Output Configuration Panel
60 | with st.container():
61 | st.markdown("### 📂 Output Configuration")
62 |
63 | # Properly aligned output directory input and browse button
64 | col1, col2 = st.columns([6, 1])
65 |
66 | with col1:
67 | output_dir = st.text_input(
68 | '**Output Directory**',
69 | value=st.session_state.output_dir,
70 | help='Directory where all collected data will be saved',
71 | placeholder='Enter output directory path...',
72 | label_visibility="visible"
73 | )
74 | if output_dir != st.session_state.output_dir:
75 | st.session_state.output_dir = output_dir
76 |
77 | with col2:
78 | # Add spacing to align button with input field
79 | st.markdown("
", unsafe_allow_html=True)
80 | if st.button('📁', help="Browse for directory", use_container_width=True):
81 | path = select_directory()
82 | if path:
83 | st.session_state.output_dir = path
84 | st.rerun()
85 |
86 | st.markdown("---")
87 |
88 | # Centered Action Button
89 | col1, col2, col3 = st.columns([1, 2, 1])
90 | with col2:
91 | start_collection = st.button(
92 | '🚀 **Start Data Collection**',
93 | use_container_width=True,
94 | type="primary"
95 | )
96 |
97 | return CollectionConfig(
98 | download_videos=download_videos,
99 | use_tor=use_tor,
100 | max_workers=max_workers,
101 | output_dir=st.session_state.output_dir
102 | ), start_collection
--------------------------------------------------------------------------------
/data_collectors/utilities.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # typing
4 | from typing import Dict, List
5 |
6 | '''
7 | Build search query
8 |
9 | '''
10 | def advanced_search_options(args: Dict) -> str:
11 | '''
12 | Builds advanced search options based on the provided arguments.
13 |
14 | :param args: Dictionary containing the command line arguments and options.
15 | :return: A formatted query string with advanced search options.
16 | '''
17 | before = args.get('before', '')
18 | after = args.get('after', '')
19 |
20 | advanced_search = {
21 | 'before': before,
22 | 'after': after
23 | }
24 |
25 | response = [
26 | f'{k}:{v}' for k, v in advanced_search.items() if v
27 | ]
28 |
29 | return ' '.join(response)
30 |
31 | def build_site_query(site: str, user: str = None, tag: str = None, q: str = '') -> str:
32 | '''
33 | Builds a site-specific search query based on the provided parameters.
34 |
35 | :param site: TikTok's site domain.
36 | :param user: Optional username to search for content from a specific user.
37 | :param tag: Optional tag to search for content with a specific tag.
38 | :param q: Optional search terms to include in the query.
39 | :return: A formatted site search query string.
40 | '''
41 | if user is not None:
42 | # remove @ prefix if present
43 | clean_user = user[1:] if user.startswith('@') else user
44 | return f'site:{site}/@{clean_user}/* {q}'.strip()
45 | elif tag is not None:
46 | # remove # prefix if present
47 | clean_tag = tag[1:] if tag.startswith('#') else tag
48 | return f'site:{site}/tag/{clean_tag}/* {q}'.strip()
49 | else:
50 | # normal site search
51 | return f'site:{site}/* {q}'.strip()
52 |
53 | def search_query(args: Dict) -> str:
54 | '''
55 | Builds the search query string based on the command line arguments.
56 |
57 | :param args: Dictionary containing the command line arguments and options.
58 | :return: A formatted query string.
59 | '''
60 | q = args.get('q') or ''
61 | advanced_search = advanced_search_options(args)
62 |
63 | return f'{q} {advanced_search}'.strip()
64 |
65 | '''
66 | Select SerpAPI parameters
67 |
68 | '''
69 | def select_serpapi_parameters(args: Dict) -> Dict:
70 | '''
71 | Filters the command line arguments to include only the default SerpAPI
72 | parameters.
73 |
74 | :param args: Dictionary containing the command line arguments and options.
75 | :return: A dictionary containing only the relevant SerpAPI parameters.
76 | '''
77 | default_serpapi_parameters = [
78 | 'q',
79 | 'google_domain',
80 | 'gl',
81 | 'hl',
82 | 'cr',
83 | 'lr',
84 | 'safe'
85 | ]
86 |
87 | # filter and return only the relevant SerpAPI parameters
88 | params = {
89 | k: v for k, v in args.items() if k in default_serpapi_parameters and v
90 | }
91 |
92 | # add new parameters
93 | params['engine'] = 'google'
94 | params['start'] = 0
95 | params['nfpr'] = 1
96 | params['num'] = 100
97 |
98 | return params
99 |
100 | '''
101 | Extract relevant keys from SerpAPI response
102 |
103 | '''
104 | def extract_results_keys(data: List[Dict], result_type: str) -> List[Dict]:
105 | '''
106 | Filters the SerpAPI response data to include only entries with 'link'
107 | containing 'video', and returns a list of dictionaries with specified
108 | default keys.
109 |
110 | :param data: List of dictionaries containing the SerpAPI response data.
111 | :param result_type: Type of SerpAPI response: 'search_result' or
112 | 'image_result'
113 | :return: A list of dictionaries, each containing the specified default
114 | keys from the SerpAPI response.
115 | '''
116 | key_mapping = {
117 | 'search_result': [
118 | 'source',
119 | 'title',
120 | 'snippet',
121 | 'link',
122 | 'thumbnail',
123 | 'video_link',
124 | 'snippet_highlighted_words',
125 | 'displayed_link'
126 | ],
127 | 'image_result': [
128 | 'source',
129 | 'thumbnail',
130 | 'title',
131 | 'link',
132 | 'serpapi_related_content_link'
133 | ]
134 | }
135 |
136 | selected_keys = key_mapping.get(result_type, [])
137 |
138 | # filter data to include only entries with 'link' containing 'video'
139 | d = [
140 | i for i in data if 'link' in i and '/video/' in i['link']
141 | and 'tiktok.com' in i['link']
142 | ]
143 |
144 | # return list of dictionaries with specified default keys
145 | return [
146 | {
147 | k: i[k] for k in selected_keys if k in i
148 | } for i in d
149 | ]
150 |
151 | '''
152 | Extract relevant keys from related content
153 | '''
154 | def extract_related_content_keys(data: List[Dict]) -> List[Dict]:
155 | '''
156 | Filters related content data and returns a list of dictionaries with
157 | specified default keys.
158 |
159 | :param data: List of dictionaries containing related content data.
160 | :return: A list of dictionaries, each containing the specified default
161 | keys for the related content.
162 | '''
163 | key_mapping = [
164 | 'source',
165 | 'link',
166 | 'thumbnail',
167 | 'title'
168 | ]
169 |
170 | # return list of dictionaries with specified default keys
171 | return [
172 | {
173 | k: i[k] for k in key_mapping if k in i
174 | } for i in data
175 | ]
176 |
--------------------------------------------------------------------------------
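
Worked examples for the query builders above (the expected strings follow
directly from the templates in this module; values are placeholders):

    from data_collectors.utilities import build_site_query, search_query

    build_site_query('tiktok.com', user='@some_user', q='protest')
    # -> 'site:tiktok.com/@some_user/* protest'

    build_site_query('tiktok.com', tag='#news')
    # -> 'site:tiktok.com/tag/news/*'

    search_query({'q': 'protest', 'after': '2024-01-01', 'before': ''})
    # -> 'protest after:2024-01-01'
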
/streamlit_app/styles/css.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # import modules
4 | import streamlit as st
5 |
6 | def load_css():
7 | """Load custom CSS for TikTok-inspired theme"""
8 | st.markdown("""
9 |
182 | """, unsafe_allow_html=True)
183 |
184 | def create_status_badge(text, status_type):
185 | """Create a status badge with specified type"""
186 | return f'<span class="status-{status_type}">{text}</span>'
--------------------------------------------------------------------------------
/streamlit_app/components/sidebar.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # import modules
4 | import streamlit as st
5 |
6 | # import submodules
7 | from dataclasses import dataclass
8 | from typing import Optional
9 | from datetime import date
10 |
11 | @dataclass
12 | class SearchConfig:
13 | """Configuration for search parameters"""
14 | query: Optional[str] = None
15 | user: Optional[str] = None
16 | tag: Optional[str] = None
17 | after_date: Optional[date] = None
18 | before_date: Optional[date] = None
19 | google_domain: str = 'google.com'
20 | gl: Optional[str] = None
21 | hl: Optional[str] = None
22 | cr: Optional[str] = None
23 | lr: Optional[str] = None
24 | safe: str = 'active'
25 | depth: int = 3
26 |
27 | @dataclass
28 | class ApifyConfig:
29 | """Configuration for Apify integration"""
30 | use_apify: bool = False
31 | number_of_results: int = 25
32 | oldest_post_date: Optional[date] = None
33 | newest_post_date: Optional[date] = None
34 |
35 | def render_sidebar():
36 | """Render sidebar components and return configuration"""
37 | with st.sidebar:
38 | st.markdown("### 🎯 Search Configuration")
39 | st.markdown("") # Add consistent spacing
40 |
41 | # Search Type Selection
42 | search_tab = st.radio(
43 | "**Search Type**",
44 | ["🔍 Keyword", "👤 User Profile", "🏷️ Hashtag"],
45 | horizontal=True
46 | )
47 |
48 | st.markdown("") # Add spacing after radio buttons
49 |
50 | # Search input based on type
51 | query = user = tag = None
52 |
53 | if search_tab == "🔍 Keyword":
54 | query = st.text_input(
55 | 'Search Keywords',
56 | placeholder='Enter keywords to search for...',
57 | help='Search for TikTok content using keywords'
58 | )
59 | elif search_tab == "👤 User Profile":
60 | user = st.text_input(
61 | 'TikTok Username',
62 | placeholder='username (without @)',
63 | help='Enter TikTok username without @ symbol'
64 | )
65 | else: # Hashtag search
66 | tag = st.text_input(
67 | 'Hashtag',
68 | placeholder='hashtag (with or without #)',
69 | help='Enter hashtag with or without # symbol'
70 | )
71 |
72 | st.markdown("") # Add spacing before divider
73 | st.markdown("---")
74 | st.markdown("") # Add spacing after divider
75 |
76 | # Date Filters Section
77 | st.markdown("### 📅 Date Filters")
78 | st.markdown("") # Add consistent spacing
79 | col1, col2 = st.columns(2)
80 | with col1:
81 | after_date = st.date_input(
82 | 'After Date',
83 | value=None,
84 | help='Posts after this date'
85 | )
86 | with col2:
87 | before_date = st.date_input(
88 | 'Before Date',
89 | value=None,
90 | help='Posts before this date'
91 | )
92 |
93 | st.markdown("") # Add spacing before divider
94 | st.markdown("---")
95 | st.markdown("") # Add spacing after divider
96 |
97 | # Apify Integration Section
98 | st.markdown("### 🚀 Apify Integration")
99 | st.markdown("") # Add consistent spacing
100 |
101 | use_apify = st.toggle(
102 | "**Enable Apify**",
103 | help="Enhanced data collection with Apify"
104 | )
105 |
106 | st.markdown("") # Add spacing after toggle
107 |
108 | if use_apify:
109 | number_of_results = st.number_input(
110 | 'Results Count',
111 | min_value=1,
112 | max_value=1000,
113 | value=25,
114 | help='Number of results to collect'
115 | )
116 |
117 | st.markdown("") # Add spacing before subsection
118 | st.markdown("**Apify Date Filters**")
119 | st.markdown("") # Add spacing after subsection title
120 |
121 | col1, col2 = st.columns(2)
122 | with col1:
123 | oldest_post_date = st.date_input(
124 | 'Oldest Post',
125 | help='Oldest post date'
126 | )
127 | with col2:
128 | newest_post_date = st.date_input(
129 | 'Newest Post',
130 | help='Newest post date'
131 | )
132 | else:
133 | number_of_results = 25
134 | oldest_post_date = None
135 | newest_post_date = None
136 |
137 | st.markdown("---")
138 |
139 | # Advanced Search Options
140 | with st.expander("⚙️ Advanced Search Options"):
141 | st.markdown("**Google Search Settings**")
142 |
143 | # Domain setting (full width)
144 | google_domain = st.text_input(
145 | 'Domain',
146 | value='google.com',
147 | help='e.g., google.com, google.co.uk'
148 | )
149 |
150 | # Country and Language settings (2 columns)
151 | col1, col2 = st.columns(2)
152 | with col1:
153 | gl = st.text_input(
154 | 'Country Code (GL)',
155 | help='e.g., us, uk, de',
156 | placeholder='us'
157 | )
158 | cr = st.text_input(
159 | 'Country Restriction',
160 | help='Restrict to specific countries',
161 | placeholder='countryUS'
162 | )
163 | with col2:
164 | hl = st.text_input(
165 | 'Language Code (HL)',
166 | help='e.g., en, es, fr',
167 | placeholder='en'
168 | )
169 | lr = st.text_input(
170 | 'Language Restriction',
171 | help='Restrict to specific languages',
172 | placeholder='lang_en'
173 | )
174 |
175 | # Search settings (2 columns)
176 | col3, col4 = st.columns(2)
177 | with col3:
178 | safe = st.selectbox(
179 | 'Safe Search',
180 | options=['active', 'off'],
181 | index=0,
182 | help='Adult content filter'
183 | )
184 | with col4:
185 | depth = st.slider(
186 | 'Search Depth',
187 | min_value=1,
188 | max_value=10,
189 | value=3,
190 | help='Related content iterations'
191 | )
192 |
193 | # Return configuration objects
194 | search_config = SearchConfig(
195 | query=query,
196 | user=user,
197 | tag=tag,
198 | after_date=after_date,
199 | before_date=before_date,
200 | google_domain=google_domain,
201 | gl=gl if gl else None,
202 | hl=hl if hl else None,
203 | cr=cr if cr else None,
204 | lr=lr if lr else None,
205 | safe=safe,
206 | depth=depth
207 | )
208 |
209 | apify_config = ApifyConfig(
210 | use_apify=use_apify,
211 | number_of_results=number_of_results,
212 | oldest_post_date=oldest_post_date,
213 | newest_post_date=newest_post_date
214 | )
215 |
216 | return search_config, apify_config
--------------------------------------------------------------------------------
/streamlit_app/core/collection_runner.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # import modules
4 | import streamlit as st
5 | import asyncio
6 | import time
7 |
8 | # import submodules
9 | from concurrent.futures import ThreadPoolExecutor
10 |
11 | # local imports
12 | from data_collectors import TikTokDataCollector
13 | from media_handlers import VideoDownloader
14 | from utils import create_output_data_path
15 | from ..components.progress import create_progress_tracker, update_progress, \
16 | mark_step_complete
17 | from ..styles.css import create_status_badge
18 | from .keyframes_processor import extract_keyframes_sync
19 |
20 | def build_args_dict(search_config, apify_config, collection_config, config_attrs):
21 | """Build arguments dictionary for collection"""
22 | args = {
23 | 'q': search_config.query,
24 | 'user': search_config.user,
25 | 'tag': search_config.tag,
26 | 'google_domain': search_config.google_domain,
27 | 'gl': search_config.gl,
28 | 'hl': search_config.hl,
29 | 'cr': search_config.cr,
30 | 'lr': search_config.lr,
31 | 'safe': search_config.safe,
32 | 'depth': search_config.depth,
33 | 'before': search_config.before_date.strftime('%Y-%m-%d') if search_config.before_date else None,
34 | 'after': search_config.after_date.strftime('%Y-%m-%d') if search_config.after_date else None,
35 | 'download': collection_config.download_videos,
36 | 'use_tor': collection_config.use_tor,
37 | 'max_workers': collection_config.max_workers,
38 | 'output': collection_config.output_dir,
39 | 'apify': apify_config.use_apify,
40 | 'number_of_results': apify_config.number_of_results
41 | }
42 |
43 | # Add Apify-specific arguments if enabled
44 | if apify_config.use_apify:
45 | args.update({
46 | 'oldest_post_date': apify_config.oldest_post_date.strftime('%Y-%m-%d') if apify_config.oldest_post_date else None,
47 | 'newest_post_date': apify_config.newest_post_date.strftime('%Y-%m-%d') if apify_config.newest_post_date else None
48 | })
49 |
50 | # Merge configuration attributes with user arguments
51 | args = {**args, **config_attrs}
52 |
53 | return args
54 |
55 | def validate_input(search_config):
56 | """Validate search input"""
57 | if not search_config.query and not search_config.user and not search_config.tag:
58 | st.error('🚨 Please enter a search term, username, or hashtag to continue!')
59 | return False
60 | return True
61 |
62 | def run_collection(search_config, apify_config, collection_config, config_attrs):
63 | """Enhanced collection function with better progress tracking and feedback"""
64 |
65 | # Build arguments
66 | args = build_args_dict(search_config, apify_config, collection_config, config_attrs)
67 |
68 | # Create progress tracker
69 | overall_progress, status_text, step_progress, steps = create_progress_tracker()
70 |
71 | def run_collection_thread():
72 | """Run collection in separate thread with own event loop"""
73 | # Create new event loop for this thread
74 | loop = asyncio.new_event_loop()
75 | asyncio.set_event_loop(loop)
76 |
77 | try:
78 | # Create collector in this thread
79 | collector = TikTokDataCollector(args=args)
80 |
81 | # Execute the main collection process
82 | collector.collect_search_data()
83 |
84 | # Generate files
85 | collector.generate_data_files()
86 |
87 | # Get collected videos for download
88 | collected_videos = collector.get_collected_videos() if args['download'] else []
89 |
90 | return collector, collected_videos
91 |
92 | finally:
93 | loop.close()
94 |
95 | try:
96 | # Create output directory
97 | create_output_data_path(args['output'])
98 |
99 | # Step 1: Initialize
100 | update_progress(0, overall_progress, status_text, step_progress, steps, progress_value=10)
101 |
102 | # Step 2: Start data collection process
103 | update_progress(1, overall_progress, status_text, step_progress, steps, "Searching...", 25)
104 |
105 | # Step 3: Show image collection
106 | update_progress(2, overall_progress, status_text, step_progress, steps, progress_value=35)
107 |
108 | # Step 4: Show Apify preparation
109 | if args['apify']:
110 | update_progress(3, overall_progress, status_text, step_progress, steps, "Preparing Apify...", 45)
111 | else:
112 | step_progress[3].markdown(f"⏭️ Apify integration skipped")
113 | overall_progress.progress(45)
114 | time.sleep(0.1)
115 |
116 | # Run collection in separate thread to avoid asyncio conflicts
117 | with ThreadPoolExecutor() as executor:
118 | future = executor.submit(run_collection_thread)
119 | collector, collected_videos = future.result()
120 |
121 | # Mark data collection steps as complete
122 | mark_step_complete(1, step_progress, "Search results collected")
123 | mark_step_complete(2, step_progress, "Image thumbnails gathered")
124 | if args['apify']:
125 | mark_step_complete(3, step_progress, "Apify integration completed")
126 |
127 | overall_progress.progress(65)
128 |
129 | # Step 5: Generate files (already done in thread)
130 | update_progress(4, overall_progress, status_text, step_progress, steps, "Generating Files...", 75)
131 | mark_step_complete(4, step_progress, "Data files generated")
132 |
133 | # Step 6: Download videos
134 | if args['download']:
135 | update_progress(5, overall_progress, status_text, step_progress, steps, "Downloading...", 80)
136 |
137 | if collected_videos:
138 | st.info(f'📹 Found {len(collected_videos)} videos to download')
139 |
140 | downloader = VideoDownloader(
141 | output=args['output'],
142 | use_tor=args['use_tor']
143 | )
144 | downloader.start_download(
145 | urls=collected_videos,
146 | max_workers=args['max_workers']
147 | )
148 |
149 | mark_step_complete(5, step_progress, f"{len(collected_videos)} videos downloaded")
150 | else:
151 | mark_step_complete(5, step_progress, "No new videos to download")
152 | else:
153 | mark_step_complete(5, step_progress, "Video download disabled")
154 |
155 | # Step 7: Extract keyframes from available videos
156 | update_progress(6, overall_progress, status_text, step_progress, steps, "Extracting Keyframes...", 90)
157 |
158 | # Extract keyframes from any videos in the output directory
159 | try:
160 | extract_keyframes_sync(args['output'], args['max_workers'])
161 | mark_step_complete(6, step_progress, "Keyframes extracted")
162 | except Exception as e:
163 | step_progress[6].markdown(f"⚠️ Keyframe extraction failed: {str(e)}")
164 |
165 | # Step 8: Complete
166 | overall_progress.progress(100)
167 | update_progress(7, overall_progress, status_text, step_progress, steps)
168 | status_text.markdown(create_status_badge("Success", "success"), unsafe_allow_html=True)
169 |
170 | # Success message with results
171 | st.success('🎉 Collection completed successfully!')
172 |
173 | # Show output location
174 | st.metric("📂 Output Location", args['output'])
175 |
176 | # Show file explorer link
177 | st.markdown(f"""
178 |
179 | 📁 Results saved to:
180 | {args['output']}
181 |
182 | """, unsafe_allow_html=True)
183 |
184 | except Exception as e:
185 | status_text.markdown(create_status_badge("Error", "error"), unsafe_allow_html=True)
186 | st.error(f'❌ An error occurred during collection: {str(e)}')
187 | st.exception(e)
--------------------------------------------------------------------------------
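
A minimal sketch of how the configuration dataclasses feed build_args_dict
(illustrative; the credentials dictionary would come from get_config_attrs()):

    from datetime import date

    from streamlit_app.components.main_panel import CollectionConfig
    from streamlit_app.components.sidebar import ApifyConfig, SearchConfig
    from streamlit_app.core.collection_runner import build_args_dict

    args = build_args_dict(
        SearchConfig(query='protest', after_date=date(2024, 1, 1)),
        ApifyConfig(use_apify=False),
        CollectionConfig(output_dir='./tikspyder-data/demo'),
        {'api_key': 'your_serp_api_key'}
    )
    # args['after'] == '2024-01-01'; args['api_key'] merged from config_attrs
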
/media_handlers/video_downloader.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # import modules
4 | import os
5 | import time
6 |
7 | # threads
8 | from concurrent.futures import ThreadPoolExecutor, as_completed
9 |
10 | # typing
11 | from typing import List
12 |
13 | # pathlib
14 | from pathlib import Path
15 |
16 | # progress bar
17 | from tqdm import tqdm
18 |
19 | # yt_dlp module
20 | from yt_dlp import YoutubeDL
21 |
22 | # stem module
23 | from stem import Signal
24 | from stem.control import Controller
25 |
26 | # Video downloader class
27 | class VideoDownloader:
28 | '''
29 | VideoDownloader class
30 |
31 | This class handles the downloading of TikTok videos and their audio using
32 | yt-dlp and threading for concurrent downloads.
33 | '''
34 | def __init__(self, output: str, use_tor: bool = False) -> None:
35 | '''
36 | Initializes the VideoDownloader with default download options.
37 | Downloads both video and audio when initialized.
38 |
39 | :param output: The original directory path provided by the user
40 | :param use_tor: Boolean indicating whether to use Tor for downloads
41 | '''
42 | # initialize Tor proxy settings
43 | self.use_tor = use_tor
44 | self.proxy = 'socks5://127.0.0.1:9050'
45 |
46 | # Common options for both video and audio
47 | common_options = {
48 | 'no_warnings': True,
49 | 'quiet': True,
50 | 'ignoreerrors': True,
51 | 'noprogress': True
52 | }
53 |
54 | if self.use_tor:
55 | common_options['proxy'] = self.proxy
56 |
57 | # video download options
58 | self.video_options = {
59 | **common_options,
60 | 'format': '(bv*+ba/b)[vcodec!=?h265]',
61 | 'outtmpl': self._build_output_directory(output, 'downloaded_videos')
62 | }
63 |
64 | # audio download options
65 | self.audio_options = {
66 | **common_options,
67 | 'format': 'bestaudio/best',
68 | 'outtmpl': self._build_output_directory(output, 'downloaded_audios'),
69 | 'postprocessors': [{
70 | 'key': 'FFmpegExtractAudio',
71 | 'preferredcodec': 'mp3',
72 | }]
73 | }
74 |
75 | def _sanitize_output_path(self, output: str) -> str:
76 | '''
77 | Ensures the given path uses forward slashes and does not end with a
78 | slash.
79 |
80 | :param output: The original directory path provided by the user
81 | :return: A sanitized directory path with forward slashes and no
82 | trailing slash.
83 | '''
84 | # create a Path object and normalize the path
85 | path = Path(output)
86 |
87 | # path with the correct separators for the current OS
88 | output = str(path.as_posix())
89 |
90 | # remove any trailing slashes
91 | output = output.rstrip('/')
92 |
93 | return output
94 |
95 | def _build_output_directory(self, output: str, dir_name: str) -> str:
96 | '''
97 | Builds and sanitizes the output directory path for downloading videos.
98 |
99 | :param output: The original directory path provided by the user
100 | :param dir_name: Name of the subdirectory (videos or audio)
101 | :return: The full path for saving downloaded files with the filename
102 | template.
103 | '''
104 | output = self._sanitize_output_path(output=output)
105 | path = f'{output}/{dir_name}'
106 |
107 | # ensure the directory exists
108 | if not os.path.exists(path):
109 | os.makedirs(path)
110 |
111 | return f'{path}/%(id)s.%(ext)s'
112 |
113 | def renew_tor_ip(self) -> None:
114 | '''
115 | Requests a new Tor circuit to change the IP address.
116 | '''
117 | try:
118 | with Controller.from_port(port=9051) as controller:
119 | controller.authenticate()
120 | controller.signal(Signal.NEWNYM)
121 | time.sleep(5)
122 | except Exception as e:
123 | print (f'Error renewing Tor IP: {e}')
124 |
125 | def download_content(self, url: str) -> None:
126 | '''
127 | Downloads both video and audio from the specified URL using yt-dlp.
128 |
129 | :param url: The URL of the TikTok video to download.
130 | '''
131 | max_attempts = 3 if self.use_tor else 1
132 | for attempt in range(max_attempts):
133 | try:
134 | # download video
135 | with YoutubeDL(self.video_options) as ydl:
136 | ydl.download([url])  # yt-dlp's download() takes a list of URLs
137 |
138 | # download audio
139 | with YoutubeDL(self.audio_options) as ydl:
140 | ydl.download([url])
141 |
142 | return
143 |
144 | except Exception as e:
145 | print (f'Error downloading {url}: {e}')
146 |
147 | if self.use_tor and attempt < max_attempts - 1:
148 | print ('Renewing Tor circuit...')
149 | self.renew_tor_ip()
150 |
151 | # wait for circuit to be established
152 | time.sleep(5)
153 | else:
154 | break
155 |
156 | def download_videos(self, urls: List[str], max_workers: int) -> None:
157 | '''
158 | Downloads multiple videos concurrently using a thread pool.
159 |
160 | :param urls: A list of TikTok video URLs to download.
161 | :param max_workers: The maximum number of threads to use for
162 | downloading.
163 | '''
164 | with ThreadPoolExecutor(max_workers=max_workers) as executor:
165 | future_to_url = {
166 | executor.submit(self.download_content, url): url
167 | for url in urls
168 | }
169 | for future in tqdm(
170 | as_completed(future_to_url),
171 | total=len(future_to_url),
172 | desc='Downloading content'
173 | ):
174 | url = future_to_url[future]
175 | try:
176 | future.result()
177 | except Exception as e:
178 | print (f'{url} generated an exception: {e}')
179 |
180 | def _test_tor_connection(self) -> bool:
181 | '''
182 | Tests if Tor is available and working.
183 |
184 | :return: True if Tor is available and working, False otherwise.
185 | '''
186 | try:
187 | # test if port is open
188 | import socket
189 | sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
190 | result = sock.connect_ex(('127.0.0.1', 9050))
191 | if result != 0:
192 | print ('\n\n')
193 | print ('Tor SOCKS port (9050) is not open. Is Tor running?')
194 | print ('Falling back to normal connection.\n')
195 | return False
196 |
197 | # if port is open, test connection
198 | import requests
199 | print ('\n\nTesting Tor connection...')
200 | response = requests.get(
201 | 'https://check.torproject.org/api/ip',
202 | proxies={
203 | 'http': self.proxy,
204 | 'https': self.proxy
205 | },
206 | timeout=10
207 | )
208 |
209 | if response.status_code == 200:
210 | data = response.json()
211 | print (f'Tor connection successful. Exit node IP: {data.get("IP")}\n\n')
212 | return True
213 | else:
214 | print ('Tor enabled but connection check failed. Using normal connection.\n\n')
215 | return False
216 |
217 | except Exception as e:
218 | print (f'\nTor connection failed ({e}). Using normal connection.\n')
219 | return False
220 |
221 | def start_download(self, urls: List[str], max_workers: int) -> None:
222 | '''
223 | Starts the download process for a list of TikTok video URLs.
224 |
225 | :param urls: A list of TikTok video URLs to download.
226 | :param max_workers: The maximum number of threads to use for
227 | downloading.
228 | '''
229 | if self.use_tor:
230 | # test Tor connection and update use_tor flag accordingly
231 | self.use_tor = self._test_tor_connection()
232 |
233 | # remove proxy settings if Tor connection failed
234 | if not self.use_tor:
235 | for options in [self.video_options, self.audio_options]:
236 | options.pop('proxy', None)
237 |
238 | print ('> Starting download...\n')
239 |
240 | # download videos
241 | self.download_videos(urls=urls, max_workers=max_workers)
242 |
243 | print ('\n\nDownload complete.')
244 |
--------------------------------------------------------------------------------
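
A minimal usage sketch of VideoDownloader (the URL is a placeholder). With
use_tor=True the downloader first probes 127.0.0.1:9050 and falls back to a
direct connection if Tor is unreachable:

    from media_handlers import VideoDownloader

    downloader = VideoDownloader(output='./tikspyder-data/demo', use_tor=False)
    downloader.start_download(
        urls=['https://www.tiktok.com/@someuser/video/7123456789'],
        max_workers=5
    )
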
/media_handlers/session_manager.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # import modules
4 | import os
5 | import glob
6 | import aiohttp
7 | import asyncio
8 | import requests
9 | import subprocess
10 |
11 | # progress bar
12 | from tqdm import tqdm
13 |
14 | # aiohttp
15 | from aiohttp import ClientSession
16 |
17 | # typing
18 | from typing import Dict, List
19 |
20 | # HTTP session class
21 | class RequestSession:
22 | '''
23 | RequestSession
24 |
25 | This class handles HTTP requests and asynchronous tasks for interacting
26 | with the SerpAPI response and processing related content links
27 |
28 | '''
29 | def __init__(self) -> None:
30 | '''
31 | Initializes the RequestSession object.
32 | '''
33 | # request session
34 | headers = {'accept': 'application/json'}
35 | self.req_session = requests.Session()
36 | self.req_session.headers.update(headers)
37 |
38 | # asynchronous event loop
39 | self.loop = asyncio.get_event_loop()
40 |
41 | def load_related_content(self, url: str, api_key: str) -> Dict:
42 | '''
43 | Loads related content from the given URL using the provided API key.
44 |
45 | :param url: The URL to load related content from.
46 | :param api_key: SerpAPI key for authentication.
47 | :return: A dictionary containing the related content data.
48 | '''
49 | params = {'api_key': api_key}
50 |
51 | def fetch_content(url: str) -> Dict:
52 | response = self.req_session.get(url, params=params)
53 | response.raise_for_status()
54 | return response.json()
55 |
56 | try:
57 | content = fetch_content(url)
58 | see_more_link = content.get('serpapi_see_more_link')
59 | if see_more_link:
60 | content = fetch_content(see_more_link)
61 | return content
62 | except requests.RequestException as e:
63 | print (f'An error occurred: {e}')
64 | return {}
65 |
66 | def _build_media_filename_path(self, output: str, link: str, file_extension: str) -> str:
67 | '''
68 | Builds the filename path for saving the image based on the TikTok link.
69 |
70 | :param output: The directory path where the images will be saved.
71 | :param link: The TikTok link from which to extract the post ID.
72 | :param file_extension: The file extension of the media file.
73 | :return: The full path (including filename) where the image will be
74 | saved.
75 | '''
76 | post_id = link.split('/')[-1].split('?')[0]
77 | return f'{output}/{post_id}.{file_extension}'
78 |
79 | async def fetch_file(self, session: ClientSession, url: str,
80 | filename: str) -> None:
81 | '''
82 | Fetches a file from a URL and saves it to the output directory.
83 |
84 | :param session: The aiohttp ClientSession object.
85 | :param url: The URL of the file to download.
86 | :param filename: The path (including filename) where the file will be
87 | saved.
88 | '''
89 | try:
90 | async with session.get(url) as res:
91 | if res.status == 200:
92 | file_data = await res.read()
93 | with open(filename, 'wb') as f:
94 | f.write(file_data)
95 | else:
96 | print (
97 | f'Failed to download {url}, status code: {res.status}'
98 | )
99 | except Exception as e:
100 | print (f'An error occurred while downloading {url}: {e}')
101 |
102 | async def download_files(self, urls: List[str], links: List[str],
103 | output: str, file_extension: str) -> None:
104 | '''
105 | Downloads files from a list of URLs asynchronously.
106 |
107 | :param urls: A list of file URLs to download.
108 | :param links: A list of TikTok links corresponding to the files.
109 | :param output: The directory path where the files will be saved.
110 | :param file_extension: The file extension of the media file.
111 | '''
112 | async with aiohttp.ClientSession() as session:
113 | tasks = [
114 | self.fetch_file(
115 | session=session, url=url,
116 | filename=self._build_media_filename_path(output, link, file_extension)
117 | ) for url, link in zip(urls, links)
118 | ]
119 | await asyncio.gather(*tasks)
120 |
121 | def start_media_download(self, urls: List[str], links: List[str],
122 | output: str, media_type: str) -> None:
123 | '''
124 | Starts the asynchronous download of files from a list of URLs.
125 |
126 | :param urls: A list of file URLs to download.
127 | :param links: A list of TikTok links corresponding to the files.
128 | :param output: The directory path where the files will be saved.
129 | :param media_type: The type of media to download.
130 | '''
131 | media_object = {
132 | 'image': {
133 | 'path': 'thumbnails',
134 | 'file_extension': 'png'
135 | },
136 | 'video': {
137 | 'path': 'downloaded_videos',
138 | 'file_extension': 'mp4'
139 | }
140 | }
141 |
142 | path = f'{output}/{media_object[media_type]["path"]}'
143 | if not os.path.exists(path):
144 | os.makedirs(path)
145 |
146 | file_extension = media_object[media_type]['file_extension']
147 | self.loop.run_until_complete(
148 | self.download_files(urls=urls, links=links, output=path,
149 | file_extension=file_extension)
150 | )
151 |
152 | def extract_audio_from_videos(self, output: str) -> None:
153 | '''
154 | Extracts audio from video files.
155 |
156 | :param output: The directory path where audios will be saved.
157 | '''
158 | # build audio path
159 | audio_path = f'{output}/downloaded_audios'
160 | if not os.path.exists(audio_path):
161 | os.makedirs(audio_path)
162 |
163 | # get all video files
164 | path = f'{output}/downloaded_videos'
165 | files = glob.glob(f'{path}/*.mp4')
166 |
167 | # extract audio from each video
168 | for file in files:
169 | try:
170 | # get id from video filename
171 | video_id = os.path.basename(file).split('.')[0]
172 |
173 | # FFmpeg command to extract audio
174 | cmd = [
175 | 'ffmpeg',
176 | '-i', file,
177 | '-q:a', '0',
178 | '-map', 'a',
179 | '-y',
180 | f'{audio_path}/{video_id}.mp3'
181 | ]
182 |
183 | subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
184 | except Exception as e:
185 | print (f'Error extracting audio: {e}')
186 |
187 | def extract_keyframes_from_videos(self, output: str, max_concurrent: int) -> None:
188 | '''
189 | Extracts keyframes from video files.
190 |
191 | :param output: The directory path where keyframes will be saved.
192 | :param max_concurrent: Maximum number of concurrent ffmpeg processes.
193 | '''
194 | # build keyframes path
195 | keyframes_path = f'{output}/keyframes'
196 | if not os.path.exists(keyframes_path):
197 | os.makedirs(keyframes_path)
198 |
199 | # get all video files
200 | path = f'{output}/downloaded_videos'
201 | files = glob.glob(f'{path}/*.mp4')
202 |
203 | # videos ids already processed
204 | processed_videos = [os.path.basename(i) for i in glob.glob(f'{keyframes_path}/*')]
205 |
206 | async def extract_keyframes(file, pbar):
207 | try:
208 | # get id from video filename
209 | video_id = os.path.basename(file).split('.')[0]
210 | if video_id not in processed_videos:
211 | # create subdirectory for this video_id
212 | video_keyframes_dir = f'{keyframes_path}/{video_id}'
213 | if not os.path.exists(video_keyframes_dir):
214 | os.makedirs(video_keyframes_dir)
215 |
216 | # FFmpeg command to extract keyframes
217 | cmd = [
218 | 'ffmpeg',
219 | '-i', file,
220 | '-vf', 'select=eq(pict_type\\,I)',
221 | '-vsync', 'vfr',
222 | '-q:v', '2',
223 | f'{video_keyframes_dir}/keyframe_%04d.jpg'
224 | ]
225 |
226 | # run FFmpeg as async subprocess
227 | process = await asyncio.create_subprocess_exec(
228 | *cmd,
229 | stdout=asyncio.subprocess.PIPE,
230 | stderr=asyncio.subprocess.PIPE
231 | )
232 | await process.communicate()
233 | except Exception as e:
234 | print (f'Error extracting keyframes: {e}')
235 | finally:
236 | pbar.update(1)
237 |
238 | async def process_all_videos():
239 | # create progress bar in the main thread
240 | pbar = tqdm(total=len(files), desc='Extracting keyframes', unit='video')
241 |
242 | # use semaphore to limit concurrent processes
243 | semaphore = asyncio.Semaphore(max_concurrent)
244 |
245 | async def process_with_semaphore(file):
246 | async with semaphore:
247 | await extract_keyframes(file, pbar)
248 |
249 | # create tasks for all videos
250 | tasks = [process_with_semaphore(file) for file in files]
251 | await asyncio.gather(*tasks)
252 |
253 | pbar.close()
254 |
255 | # run the async event loop
256 | self.loop.run_until_complete(process_all_videos())
257 |
--------------------------------------------------------------------------------
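
A minimal sketch of the thumbnail download path (URLs and links are
placeholders): media_type='image' writes one PNG per post, named after the
post ID extracted from each TikTok link, into <output>/thumbnails:

    from media_handlers import RequestSession

    session = RequestSession()
    session.start_media_download(
        urls=['https://p16-sign.tiktokcdn-us.com/obj/some-thumbnail'],
        links=['https://www.tiktok.com/@someuser/video/7123456789'],
        output='./tikspyder-data/demo',
        media_type='image'
    )
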
/databases/utilities.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # import modules
4 | import re
5 | import json
6 |
7 | # typing
8 | from typing import Dict, Tuple
9 |
10 | '''
11 | Extract likes and comments from snippet
12 |
13 | '''
14 | def extract_likes_comments(text: str) -> Tuple:
15 | '''
16 | Extracts likes and comments from a given text.
17 |
18 | :param text: The text containing likes and comments.
19 | :return: A tuple containing the extracted likes and comments, or None if
20 | not found.
21 | '''
22 | # define regex patterns for likes and comments
23 | likes_pattern = re.compile(
24 | r'(\d+(?:[\d,.]*\d+)?(?:[KM])?) Likes',
25 | re.IGNORECASE
26 | )
27 |
28 | comments_pattern = re.compile(
29 | r'(\d+(?:[\d,.]*\d+)?(?:[KM])?) Comments',
30 | re.IGNORECASE
31 | )
32 |
33 | # search for likes and comments in the text
34 | likes_match = likes_pattern.search(text)
35 | comments_match = comments_pattern.search(text)
36 |
37 | # extract the matched groups or return None if not found
38 | likes = likes_match.group(1) if likes_match else None
39 | comments = comments_match.group(1) if comments_match else None
40 |
41 | return likes, comments
42 |
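# Illustrative check (annotation, not part of the module): given the snippet
# "277.8K Likes, 1,862 Comments. Some caption text.", the patterns above
# yield extract_likes_comments(...) == ('277.8K', '1,862').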
43 | '''
44 | Extract fields from the link field
45 |
46 | '''
47 | def extract_author_post_id(link: str) -> Tuple:
48 | '''
49 | Extracts the author, link to the author's page, and post ID from a TikTok
50 | video link.
51 |
52 | :param link: The TikTok video link.
53 | :return: A tuple containing the author's username, link to the author's
54 | page, and the post ID.
55 | '''
56 | author = link.split('/')[3].replace('@', '')
57 | link_to_author = f'https://www.tiktok.com/@{author}'
58 | post_id = link.split('/')[-1].split('?')[0]
59 |
60 | return author, link_to_author, post_id
61 |
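# Illustrative check (annotation, not part of the module):
# extract_author_post_id('https://www.tiktok.com/@someuser/video/7123456789?lang=en')
# == ('someuser', 'https://www.tiktok.com/@someuser', '7123456789')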
62 | '''
63 | Get items and keys from search results entries
64 |
65 | '''
66 | def get_items_from_search_results(entry: Dict) -> Tuple:
67 | '''
68 | Extracts and processes specific fields from a data entry.
69 |
70 | :param entry: A dictionary containing the data entry.
71 | :return: A tuple containing the extracted and processed values for the
72 | fields.
73 | '''
74 | # get values
75 | title = entry.get('title', '')
76 | snippet = entry.get('snippet', '')
77 | link = entry.get('link', '')
78 |
79 | # process new fields from data
80 | likes, comments = extract_likes_comments(snippet)
81 | title_snippet = f'{title} {snippet}'
82 | author, link_to_author, post_id = extract_author_post_id(link)
83 |
84 |
85 | return (
86 | entry.get('source', None),
87 | entry.get('title', None),
88 | entry.get('snippet', None),
89 | entry.get('link', None),
90 | entry.get('thumbnail', None),
91 | entry.get('video_link', None),
92 | ', '.join(entry.get('snippet_highlighted_words', [])) if entry.get(
93 | 'snippet_highlighted_words'
94 | ) else None,
95 | entry.get('displayed_link', None),
96 | title_snippet,
97 | likes,
98 | comments,
99 | author,
100 | link_to_author,
101 | post_id
102 | )
103 |
104 | '''
105 | Get items and keys from images results entries
106 |
107 | '''
108 | def get_items_from_images_results(entry: Dict) -> Tuple:
109 | '''
110 | Extracts and processes specific fields from an image results entry.
111 |
112 | :param entry: A dictionary containing the image results entry.
113 | :return: A tuple containing the extracted and processed values for the
114 | fields.
115 | '''
116 | # get values
117 | link = entry.get('link', '')
118 |
119 | # process new fields from data
120 | author, link_to_author, post_id = extract_author_post_id(link)
121 |
122 | return (
123 | entry.get('source', None),
124 | entry.get('title', None),
125 | entry.get('link', None),
126 | entry.get('thumbnail', None),
127 | author,
128 | link_to_author,
129 | post_id
130 | )
131 |
132 | '''
133 | Get items and keys from related content entries
134 |
135 | '''
136 | def get_items_from_related_content(entry: Dict) -> Tuple:
137 | '''
138 | Extracts and processes specific fields from a related content entry.
139 |
140 | :param entry: A dictionary containing the related content entry.
141 | :return: A tuple containing the extracted and processed values for the
142 | fields.
143 | '''
144 | return (
145 | entry.get('source', None),
146 | entry.get('link', None),
147 | entry.get('thumbnail', None),
148 | entry.get('title', None)
149 | )
150 |
151 | '''
152 | Get items and keys from apify profile data
153 |
154 | '''
155 | def get_items_from_apify_profile_data(entry: Dict) -> Tuple:
156 | '''
157 | Extracts and processes specific fields from an apify profile data entry.
158 |
159 | :param entry: A dictionary containing the apify profile data entry.
160 | :return: A tuple containing the extracted and processed values for the
161 | fields.
162 | '''
163 | # convert lists to JSON strings
164 | hashtags = entry.get('hashtags', []) or []
165 | hashtags_json_str = json.dumps([h.get('name', '') for h in hashtags])
166 |
167 |
168 | return (
169 | entry.get('id', None),
170 | entry.get('text', None),
171 | entry.get('textLanguage', None),
172 | entry.get('createTime', None),
173 | entry.get('createTimeISO', None),
174 | entry.get('isAd', None),
175 | entry.get('webVideoUrl', None),
176 |
177 | # author metadata
178 | entry.get('authorMeta', {}).get('id', None),
179 | entry.get('authorMeta', {}).get('name', None),
180 | entry.get('authorMeta', {}).get('profileUrl', None),
181 | entry.get('authorMeta', {}).get('bioLink', None),
182 | entry.get('authorMeta', {}).get('signature', None),
183 | entry.get('authorMeta', {}).get('nickName', None),
184 | entry.get('authorMeta', {}).get('verified', None),
185 | entry.get('authorMeta', {}).get('avatar', None),
186 | entry.get('authorMeta', {}).get('privateAccount', None),
187 | entry.get('authorMeta', {}).get('region', None),
188 | entry.get('authorMeta', {}).get('following', None),
189 | entry.get('authorMeta', {}).get('friends', None),
190 | entry.get('authorMeta', {}).get('fans', None),
191 | entry.get('authorMeta', {}).get('heart', None),
192 | entry.get('authorMeta', {}).get('video', None),
193 | entry.get('authorMeta', {}).get('digg', None),
194 |
195 | # music metadata
196 | entry.get('musicMeta', {}).get('musicId', None),
197 | entry.get('musicMeta', {}).get('musicName', None),
198 | entry.get('musicMeta', {}).get('musicAuthor', None),
199 | entry.get('musicMeta', {}).get('musicOriginal', None),
200 |
201 | # video metadata
202 | entry.get('videoMeta', {}).get('duration', None),
203 | entry.get('videoMeta', {}).get('coverUrl', None),
204 | entry.get('videoMeta', {}).get('downloadAddr', None),
205 |
206 | # engagement metrics
207 | entry.get('diggCount', None),
208 | entry.get('shareCount', None),
209 | entry.get('playCount', None),
210 | entry.get('collectCount', None),
211 | entry.get('commentCount', None),
212 |
213 | # hashtags
214 | hashtags_json_str,
215 |
216 | # additional metadata
217 | entry.get('isSlideshow', None),
218 | entry.get('isPinned', None),
219 | entry.get('isSponsored', None),
220 | entry.get('input') or entry.get('searchQuery'),
221 | entry.get('fromProfileSection', None)
222 | )
223 |
224 | '''
225 | Get items and keys from apify hashtag data
226 |
227 | '''
228 | def get_items_from_apify_hashtag_data(entry: Dict) -> Tuple:
229 | '''
230 | Extracts and processes specific fields from an apify hashtag data entry.
231 |
232 | :param entry: A dictionary containing the apify hashtag data entry.
233 | :return: A tuple containing the extracted and processed values for the
234 | fields.
235 | '''
236 | # convert lists to JSON strings
237 | hashtags = entry.get('hashtags', []) or []
238 | hashtags_json_str = json.dumps([h.get('name', '') for h in hashtags])
239 |
240 |
241 | return (
242 | entry.get('id', None),
243 | entry.get('text', None),
244 | entry.get('textLanguage', None),
245 | entry.get('createTime', None),
246 | entry.get('createTimeISO', None),
247 | entry.get('isAd', None),
248 | entry.get('webVideoUrl', None),
249 |
250 | # author metadata
251 | entry.get('authorMeta', {}).get('id', None),
252 | entry.get('authorMeta', {}).get('name', None),
253 | entry.get('authorMeta', {}).get('profileUrl', None),
254 | entry.get('authorMeta', {}).get('bioLink', None),
255 | entry.get('authorMeta', {}).get('signature', None),
256 | entry.get('authorMeta', {}).get('nickName', None),
257 | entry.get('authorMeta', {}).get('verified', None),
258 | entry.get('authorMeta', {}).get('avatar', None),
259 | entry.get('authorMeta', {}).get('privateAccount', None),
260 | entry.get('authorMeta', {}).get('region', None),
261 | entry.get('authorMeta', {}).get('following', None),
262 | entry.get('authorMeta', {}).get('friends', None),
263 | entry.get('authorMeta', {}).get('fans', None),
264 | entry.get('authorMeta', {}).get('heart', None),
265 | entry.get('authorMeta', {}).get('video', None),
266 | entry.get('authorMeta', {}).get('digg', None),
267 |
268 | # music metadata
269 | entry.get('musicMeta', {}).get('musicId', None),
270 | entry.get('musicMeta', {}).get('musicName', None),
271 | entry.get('musicMeta', {}).get('musicAuthor', None),
272 | entry.get('musicMeta', {}).get('musicOriginal', None),
273 |
274 | # video metadata
275 | entry.get('videoMeta', {}).get('duration', None),
276 | entry.get('videoMeta', {}).get('coverUrl', None),
277 | entry.get('videoMeta', {}).get('downloadAddr', None),
278 |
279 | # engagement metrics
280 | entry.get('diggCount', None),
281 | entry.get('shareCount', None),
282 | entry.get('playCount', None),
283 | entry.get('collectCount', None),
284 | entry.get('commentCount', None),
285 |
286 | # hashtags
287 | hashtags_json_str,
288 |
289 | # additional metadata
290 | entry.get('isSlideshow', None),
291 | entry.get('isPinned', None),
292 | entry.get('isSponsored', None),
293 | entry.get('input', None),
294 | entry.get('searchHashtag', {}).get('views', None)
295 | )
296 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # import modules
4 | import time
5 | import os
6 |
7 | # import argparse
8 | from argparse import (
9 | ArgumentParser, RawTextHelpFormatter, SUPPRESS
10 | )
11 |
12 | # import utils
13 | from utils import get_config_attrs, verify_date_argument, \
14 | create_output_data_path, get_project_root
15 |
16 | # TikTok data collector
17 | from data_collectors import TikTokDataCollector
18 |
19 | # video downloader
20 | from media_handlers import VideoDownloader, RequestSession
21 |
22 | def launch_streamlit_app():
23 | '''Launch the Streamlit web interface'''
24 | import subprocess
25 | import sys
26 |
27 | # start process
28 | log_text = f'''
29 | > Starting program at: {time.ctime()}
30 |
31 | '''
32 | print ('\n\n' + ' '.join(log_text.split()).strip())
33 |
34 | print ('\n')
35 | print('> Launching TikSpyder Streamlit Interface...')
36 | print('<< Press Ctrl+C to stop the server >>')
37 | print ('\n')
38 | print('-' * 50)
39 |
40 | try:
41 | # Launch streamlit run app.py
42 | subprocess.run([
43 | sys.executable,
44 | "-m", "streamlit", "run",
45 | os.path.join(os.path.dirname(__file__), 'app.py')
46 | ], check=True)
47 | except subprocess.CalledProcessError as e:
48 | print(f'> Failed to launch Streamlit: {e}')
49 | print('> Make sure streamlit is installed: pip install streamlit')
50 | sys.exit(1)
51 | except KeyboardInterrupt:
52 | # end process
53 | print ('\n')
54 | print('-' * 50)
55 | log_text = f'''
56 | > Ending program at: {time.ctime()}
57 |
58 | '''
59 | print ('\n\n' + ' '.join(log_text.split()).strip())
60 | sys.exit(0)
61 |
62 | def main():
63 | # Get current working directory (where command was executed)
64 | execution_dir = os.getcwd()
65 |
66 | # Get project root directory (where the package is installed)
67 | project_root = get_project_root()
68 |
69 | # Set up project paths for later use instead of changing directories
70 | project_paths = {
71 | 'root': project_root,
72 | 'config': os.path.join(project_root, 'config'),
73 | 'execution': execution_dir
74 | }
75 |
76 | '''
77 | Arguments
78 |
79 | '''
80 | formatter = lambda prog: RawTextHelpFormatter(
81 | prog,
82 | indent_increment=2,
83 | max_help_position=52,
84 | width=None
85 | )
86 |
87 | parser = ArgumentParser(
88 | prog='TikSpyder',
89 | description='Command Line Arguments.',
90 | formatter_class=formatter,
91 | add_help=False
92 | )
93 |
94 | # help arguments
95 | help_arguments = parser.add_argument_group('Help options')
96 | help_arguments.add_argument(
97 | '-h',
98 | '--help',
99 | action='help',
100 | default=SUPPRESS,
101 | help='Show this help message and exit.'
102 | )
103 |
104 | # SerpAPI arguments
105 | serpapi_arguments = parser.add_argument_group('SerpAPI options')
106 |
107 | ''' query '''
108 | serpapi_arguments.add_argument(
109 | '--q',
110 | type=str,
111 | required=False,
112 | metavar='',
113 |         help='The search term or phrase for which to retrieve TikTok data.'
114 | )
115 |
116 | ''' user '''
117 | serpapi_arguments.add_argument(
118 | '--user',
119 | type=str,
120 | required=False,
121 | metavar='',
122 | help='Specify a TikTok user to search for videos from.'
123 | )
124 |
125 | ''' tag '''
126 | serpapi_arguments.add_argument(
127 | '--tag',
128 | type=str,
129 | required=False,
130 | metavar='',
131 | help='Specify a TikTok tag to search for videos from.'
132 | )
133 |
134 | ''' google domain '''
135 | serpapi_arguments.add_argument(
136 | '--google-domain',
137 | type=str,
138 | required=False,
139 | default='google.com',
140 | metavar='',
141 | help='Defines the Google domain to use. It defaults to google.com.'
142 | )
143 |
144 | ''' gl > country '''
145 | serpapi_arguments.add_argument(
146 | '--gl',
147 | type=str,
148 | required=False,
149 | metavar='',
150 | help=(
151 | "Defines the country to use for the search. Two-letter country "
152 | "code."
153 | )
154 | )
155 |
156 | ''' hl > language '''
157 | serpapi_arguments.add_argument(
158 | '--hl',
159 | type=str,
160 | required=False,
161 | metavar='',
162 | help=(
163 | "Defines the language to use for the search. Two-letter language "
164 | "code."
165 | )
166 | )
167 |
168 | ''' cr > multiple countries '''
169 | serpapi_arguments.add_argument(
170 | '--cr',
171 | type=str,
172 | required=False,
173 | metavar='',
174 | help='Defines one or multiple countries to limit the search to.'
175 | )
176 |
177 | ''' safe > adult content filter '''
178 | serpapi_arguments.add_argument(
179 | '--safe',
180 | type=str,
181 | required=False,
182 | default='active',
183 | choices=['active', 'off'],
184 | metavar='',
185 | help='Level of filtering for adult content. Options: active (default), off'
186 | )
187 |
188 | ''' lr > one or multiple languages '''
189 | serpapi_arguments.add_argument(
190 | '--lr',
191 | type=str,
192 | required=False,
193 | metavar='',
194 | help='Defines one or multiple languages to limit the search to.'
195 | )
196 |
197 | ''' depth > defines number of iterations for related content '''
198 | serpapi_arguments.add_argument(
199 | '--depth',
200 | type=int,
201 | required=False,
202 | default=3,
203 | metavar='',
204 | help='Depth of iterations to follow related content links.'
205 | )
206 |
207 | # Google advanced search arguments
208 | google_advanced_search_arguments = parser.add_argument_group(
209 | 'Google advanced search options'
210 | )
211 |
212 | ''' search for posts before a given date '''
213 | google_advanced_search_arguments.add_argument(
214 | '--before',
215 | type=str,
216 | required=False,
217 | metavar='',
218 | help=(
219 | "Limit results to posts published before the specified date. "
220 | "Format: YYYY-MM-DD."
221 | )
222 | )
223 |
224 | ''' search for posts after a given date '''
225 | google_advanced_search_arguments.add_argument(
226 | '--after',
227 | type=str,
228 | required=False,
229 | metavar='',
230 | help=(
231 | "Limit results to posts published after the specified date. "
232 | "Format: YYYY-MM-DD."
233 | )
234 | )
235 |
236 | # Apify optional arguments
237 | apify_arguments = parser.add_argument_group(
238 | 'Optional Apify arguments'
239 | )
240 |
241 | ''' apify integration '''
242 | apify_arguments.add_argument(
243 | '--apify',
244 | action='store_true',
245 | required=False,
246 | help='Specify whether to use Apify integration.'
247 | )
248 |
249 | apify_arguments.add_argument(
250 | '--oldest-post-date',
251 | type=str,
252 | required=False,
253 | metavar='',
254 | help=(
255 | "Filter posts newer than the specified date. "
256 | "Format: YYYY-MM-DD."
257 | )
258 | )
259 |
260 | apify_arguments.add_argument(
261 | '--newest-post-date',
262 | type=str,
263 | required=False,
264 | metavar='',
265 | help=(
266 | "Filter posts older than the specified date. "
267 | "Format: YYYY-MM-DD."
268 | )
269 | )
270 |
271 | apify_arguments.add_argument(
272 | '--number-of-results',
273 | type=int,
274 | default=25,
275 | required=False,
276 | metavar='',
277 | help=(
278 | "Specify the number of results to return from Apify. Default: 25"
279 | )
280 | )
281 |
282 | # optional arguments
283 | optional_arguments = parser.add_argument_group(
284 | 'Optional arguments and parameters'
285 | )
286 |
287 | ''' use tor '''
288 | optional_arguments.add_argument(
289 | '--use-tor',
290 | action='store_true',
291 | required=False,
292 | help='Specify whether to use Tor for downloading TikTok videos.'
293 | )
294 |
295 | ''' download TikTok results '''
296 | optional_arguments.add_argument(
297 | '-d',
298 | '--download',
299 | action='store_true',
300 | required=False,
301 | help='Specify whether to download TikTok videos from SerpAPI and Apify.'
302 | )
303 |
304 | ''' max workers > maximum number of threads '''
305 | optional_arguments.add_argument(
306 | '-w',
307 | '--max-workers',
308 | type=int,
309 | required=False,
310 | metavar='',
311 | help=(
312 | "Specify the maximum number of threads to use for downloading "
313 | "TikTok videos and extracting keyframes."
314 | )
315 | )
316 |
317 | ''' output '''
318 | optional_arguments.add_argument(
319 | '-o',
320 | '--output',
321 | type=str,
322 | required=False,
323 | default=f'./tikspyder-data/{int(time.time())}',
324 | metavar='',
325 | help=(
326 | "Specify output directory path. If not provided, data is "
327 |             "saved in the current working directory in a timestamped "
328 |             "folder under tikspyder-data."
329 | )
330 | )
331 |
332 | ''' launch streamlit app '''
333 | optional_arguments.add_argument(
334 | '--app',
335 | action='store_true',
336 | required=False,
337 | help='Launch the Streamlit web interface instead of using CLI mode.'
338 | )
339 |
340 | # parse arguments
341 | args = vars(parser.parse_args())
342 |
343 | # check if user wants to launch Streamlit app
344 | if args.get('app'):
345 | launch_streamlit_app()
346 | return
347 |
348 | # validate that either a query, username or tag was provided
349 | if all(arg is None for arg in [args['user'], args['q'], args['tag']]):
350 | raise ValueError('Either --user, --q or --tag must be provided.')
351 |
352 | # raise error if both user and tag are provided
353 | if args['user'] and args['tag']:
354 | raise ValueError('Both --user and --tag were provided. Only one can be used.')
355 |
356 | # merging SerpAPI configuration attrs with the existing arguments
357 | config_attrs = get_config_attrs(project_paths['config'])
358 | args = {**args, **config_attrs}
359 |
360 | # verify provided dates
361 | for date_key in ['before', 'after']:
362 | if args[date_key] is not None:
363 | verify_date_argument(args, date_key)
364 |
365 | # start process
366 | log_text = f'''
367 | > Starting program at: {time.ctime()}
368 |
369 | '''
370 | print ('\n\n' + ' '.join(log_text.split()).strip())
371 |
372 | # create the output data path if not exists
373 | output = args['output']
374 | create_output_data_path(output)
375 |
376 | # TikTokDataCollector instance
377 | collector = TikTokDataCollector(args=args)
378 |
379 | # TikTok data collection call
380 | collector.collect_search_data()
381 |
382 | # read SQL database and generate csv file
383 | collector.generate_data_files()
384 |
385 | # download videos
386 | if args['download']:
387 | print ('')
388 | print ('-' * 30)
389 | print ('> Downloading videos...')
390 |
391 | # get tiktok urls
392 | collected_videos = collector.get_collected_videos()
393 |
394 | if collected_videos:
395 | print (f'\n> Found {len(collected_videos)} videos to download.')
396 |
397 | # define max workers
398 | max_workers = args['max_workers'] if args['max_workers'] else 5
399 | downloader = VideoDownloader(output=output, use_tor=args['use_tor'])
400 |
401 | # start download
402 | downloader.start_download(urls=collected_videos, max_workers=max_workers)
403 | else:
404 | print ('\n> Search results did not return any videos to download.')
405 |
406 | # extract keyframes
407 | print ('\n')
408 | print ('-' * 30)
409 | print ('Extracting keyframes...')
410 | request_session = RequestSession()
411 |
412 | # define max workers
413 | max_workers = args['max_workers'] if args['max_workers'] else 3
414 | request_session.extract_keyframes_from_videos(
415 | output=output,
416 | max_concurrent=max_workers
417 | )
418 | print ('\n')
419 | print ('-' * 30)
420 |
421 | # end process
422 | log_text = f'''
423 | > Ending program at: {time.ctime()}
424 |
425 | '''
426 | print ('\n\n' + ' '.join(log_text.split()).strip())
427 |
428 | if __name__ == '__main__':
429 | main()
430 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # **TikSpyder**
4 |
5 |
6 |
7 |
8 |
9 | `TikSpyder` is a command-line tool designed to collect TikTok data using SerpAPI for Google search results and Apify for TikTok data extraction. The tool supports video downloading via yt-dlp and uses Python's asynchronous capabilities and multithreading for efficient data collection.
10 |
11 |
12 |
13 |
14 |
15 |
16 | [](https://GitHub.com/estebanpdl/tik-spyder/network/)
17 | [](https://github.com/estebanpdl/tik-spyder/stargazers)
18 | [](https://x.com/estebanpdl)
19 | [](https://www.python.org/)
20 | [](https://x.com/estebanpdl)
21 | [](https://buymeacoffee.com/estebanpdl)
22 |
23 |
24 |
25 |
26 |
27 | ## 🔧 **Companion Tools**
28 |
29 | | Tool | Description | Access |
30 | |------|-------------|--------|
31 | | 🎙️ Audio Transcription | Transcribe audio files from TikTok videos | [](https://colab.research.google.com/drive/1qMcMsS2YI9btXGfFN1sCviQeB7RSKqUH) |
32 |
33 |
34 |
35 | ## 🖥️ **User Interface Options**
36 |
37 | TikSpyder provides **two ways** to interact with the tool:
38 |
39 | ### **1. 🎨 Streamlit Web Interface (Recommended for Non-Technical Users)**
40 | A modern, user-friendly web interface with a TikTok-inspired dark theme that makes data collection accessible to everyone.
41 |
42 | 
43 |
44 | **Features:**
45 | - 🎯 **Intuitive Configuration**: Easy search setup with tabbed interface for keywords, users, or hashtags
46 | - 📅 **Visual Date Filters**: Calendar widgets for precise date range selection
47 | - 🚀 **Apify Integration**: Simple toggle to enable enhanced data collection
48 | - ⚙️ **Advanced Options**: Collapsible section for Google search parameters
49 | - 📥 **Download Settings**: Visual controls for video downloads and Tor network usage
50 | - 📂 **File Browser**: Point-and-click directory selection
51 | - 📊 **Real-time Progress**: Live progress tracking with step-by-step status updates
52 |
53 | **Launch the Interface:**
54 |
55 | **Method 1 (Recommended):**
56 | ```sh
57 | # Using package installation
58 | tikspyder --app
59 |
60 | # Using standard installation
61 | python main.py --app
62 | ```
63 |
64 | **Method 2 (Direct):**
65 | ```sh
66 | streamlit run app.py
67 | ```
68 |
69 | ### **2. ⌨️ Command Line Interface (For Advanced Users)**
70 | Full-featured command-line tool for automation and scripting scenarios.
71 |
72 | ## 🔍 **Description**
73 |
74 | TikSpyder offers two main methods of data collection:
75 | 1. **Google Search Results**: Using SerpAPI to find TikTok videos based on search queries
76 | 2. **Apify Data Collection**: Using Apify to collect videos directly from TikTok profiles, hashtags, or keyword searches
77 |
78 | The tool supports various filtering options, including date ranges and content types, and can download both videos and thumbnails. Data is stored in a SQLite database and can be exported to CSV files for further analysis.
79 |
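 | For example, once a run finishes you can inspect the collected data with the standard `sqlite3` command-line tool. The `database.sql` file name and the `query_search_results` table below come from TikSpyder's own SQLite schema; replace `{output_directory}` with your output path:
 |
 | ```sh
 | sqlite3 {output_directory}/database.sql \
 |   "SELECT author, likes, comments, link FROM query_search_results LIMIT 5;"
 | ```
 |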
80 | Given the dynamic nature of search results and the constantly evolving landscape of TikTok's platform, it's important to note that the data collected by TikSpyder represents a sample rather than a comprehensive dataset. However, this sample can still be valuable for monitoring trends and identifying emerging narratives in the information ecosystem.
81 |
82 | To get the most out of TikSpyder, **it is recommended to test your query using Google's advanced search features first. Doing so can help refine your query, improve the relevance of your results, and validate specific keywords more effectively**. By taking advantage of these features, you can ensure that you're collecting the most relevant data for your research or analysis.
83 |
84 |
85 |
86 | ## 🚀 **Features**
87 |
88 | ### **Core Functionality**
89 | - 🔍 Collects TikTok video links using SerpAPI and Apify
90 | - 🖼️ Collects and downloads thumbnails for TikTok videos
91 | - 🔗 Collects related content to the search query
92 | - 💾 Stores collected data in SQLite database
93 | - 📊 Exports data to CSV files for analysis
94 | - 📹 Downloads TikTok videos using yt-dlp
95 | - 🎞️ Extracts keyframes from downloaded videos
96 | - ⚡ Supports asynchronous and multithreaded downloading for improved performance
97 | - 🔒 Supports the Tor network for enhanced privacy and rate-limit avoidance
98 |
99 | ### **User Interfaces**
100 | - 🎨 **Modern Streamlit Web Interface**: User-friendly GUI with TikTok-inspired dark theme
101 | - ⌨️ **Command Line Interface**: Full-featured CLI for automation and advanced users
102 | - 🎯 **Search Types**: Support for keywords, user profiles, and hashtag searches
103 | - 📅 **Date Range Filtering**: Precise temporal data collection controls
104 |
105 |
106 |
107 | ## ⚙️ **Requirements**
108 |
109 | ### **System Requirements**
110 | - [Python](https://www.python.org/) >= 3.11.7
111 | - [ffmpeg](https://ffmpeg.org/) (for video processing and keyframe extraction)
112 |
113 | ### **API Keys & Services**
114 | - [SerpAPI key](https://serpapi.com/) (required for Google search functionality)
115 | - [Apify API token](https://apify.com/) (optional, for direct TikTok profile scraping)
116 |
117 | ### **Optional Components**
118 | - [Tor Browser](https://www.torproject.org/) (optional, for enhanced privacy during downloads)
119 |
120 | ### **Platform-Specific Requirements**
121 | - **All Platforms**: Python libraries listed in `requirements.txt`
122 | - **Streamlit Interface**: Automatically installed with requirements
123 | - **Linux Users**: For GUI components, install tkinter: `sudo apt-get install python3-tk` (Ubuntu/Debian)
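 |
 | You can quickly confirm the core prerequisites from a terminal using their standard version flags:
 |
 | ```sh
 | python --version   # should report 3.11.7 or newer
 | ffmpeg -version    # confirms ffmpeg is on your PATH
 | ```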
124 |
125 |
126 |
127 | ## 🔧 **Installation**
128 |
129 | ### **Method 1: Standard Installation**
130 |
131 | 1. Clone the repository
132 |
133 | ```sh
134 | git clone https://github.com/estebanpdl/tik-spyder.git
135 | cd tik-spyder
136 | ```
137 |
138 | 2. Install the required packages
139 |
140 | ```sh
141 | pip install -r requirements.txt
142 | ```
143 |
144 | or
145 |
146 | ```sh
147 | pip3 install -r requirements.txt
148 | ```
149 |
150 | ### **Method 2: Package Installation (Recommended)**
151 |
152 | This method installs TikSpyder as a package, making the `tikspyder` command available from anywhere on your system.
153 |
154 | 1. Clone the repository
155 |
156 | ```sh
157 | git clone https://github.com/estebanpdl/tik-spyder.git
158 | cd tik-spyder
159 | ```
160 |
161 | 2. Install the package in editable mode
162 |
163 | ```sh
164 | pip install -e .
165 | ```
166 |
167 | or
168 |
169 | ```sh
170 | pip3 install -e .
171 | ```
172 |
173 | After installation, you can use `tikspyder` directly from any directory instead of `python main.py`.
174 |
175 | ### **Configuration**
176 |
177 | 3. Once you obtain your SerpAPI key and Apify token, populate the `config/config.ini` file with those values, replacing the `your_serp_api_key` and `your_apify_token` placeholders shown below.
178 |
179 | ```ini
180 |
181 | [SerpAPI Key]
182 | api_key = your_serp_api_key
183 |
184 | [Apify Token]
185 | apify_token = your_apify_token
186 | ```
187 |
188 |
189 |
190 | ## 📚 **Usage**
191 |
192 | TikSpyder offers two interface options to suit different user preferences and use cases:
193 |
194 | ## 🎨 **Streamlit Web Interface Usage**
195 |
196 | The Streamlit interface provides an intuitive, visual way to configure and run data collection tasks.
197 |
198 | ### **Launch the Interface**
199 |
200 | ```sh
201 | # Navigate to TikSpyder directory
202 | cd tik-spyder
203 |
204 | # Launch the Streamlit app
205 | streamlit run app.py
206 | ```
207 |
208 | The interface will automatically open in your default web browser at `http://localhost:8501`.
209 |
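 | If port 8501 is already in use, Streamlit's standard `--server.port` flag lets you serve the app on another port, for example:
 |
 | ```sh
 | streamlit run app.py --server.port 8502
 | ```
 |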
210 | ### **Using the Interface**
211 |
212 | 1. **🎯 Configure Search**: Choose between keyword, user profile, or hashtag search
213 | 2. **📅 Set Date Filters**: Use calendar widgets to define your collection timeframe
214 | 3. **🚀 Enable Apify** (Optional): Toggle for enhanced direct TikTok data collection
215 | 4. **⚙️ Adjust Advanced Options**: Fine-tune Google search parameters if needed
216 | 5. **📥 Configure Downloads**: Set video download preferences and worker counts
217 | 6. **📂 Choose Output Directory**: Select where your data will be saved
218 | 7. **🚀 Start Collection**: Click the centered "Start Data Collection" button
219 |
220 | ---
221 |
222 | ## ⌨️ **Command Line Interface Usage**
223 |
224 | For advanced users and automation scenarios, TikSpyder provides a full-featured CLI.
225 |
226 | ### **Using Package Installation (Method 2)**
227 |
228 | ```sh
229 | tikspyder [OPTIONS]
230 | ```
231 |
232 | ### **Using Standard Installation (Method 1)**
233 |
234 | ```sh
235 | python main.py [OPTIONS]
236 | ```
237 |
238 | ### **Command Line Arguments**
239 |
240 | ```sh
241 | # Package installation
242 | tikspyder --help
243 |
244 | # or
245 | tikspyder -h
246 |
247 | # Standard installation
248 | python main.py --help
249 |
250 | # or
251 | python main.py -h
252 | ```
253 |
254 | ```
255 | Command Line Arguments.
256 |
257 | Help options:
258 | -h, --help Show this help message and exit.
259 |
260 | SerpAPI options:
261 |   --q                  The search term or phrase for which to retrieve TikTok data.
262 | --user Specify a TikTok user to search for videos from.
263 | --tag Specify a TikTok tag to search for videos from.
264 | --google-domain Defines the Google domain to use. It defaults to google.com.
265 | --gl Defines the country to use for the search. Two-letter country code.
266 | --hl Defines the language to use for the search. Two-letter language code.
267 | --cr Defines one or multiple countries to limit the search to.
268 | --safe Level of filtering for adult content. Options: active (default), off
269 | --lr Defines one or multiple languages to limit the search to.
270 | --depth Depth of iterations to follow related content links.
271 |
272 | Google advanced search options:
273 | --before Limit results to posts published before the specified date. Format: YYYY-MM-DD.
274 | --after Limit results to posts published after the specified date. Format: YYYY-MM-DD.
275 |
276 | Optional Apify arguments:
277 | --apify Specify whether to use Apify integration.
278 | --oldest-post-date Filter posts newer than the specified date. Format: YYYY-MM-DD.
279 | --newest-post-date Filter posts older than the specified date. Format: YYYY-MM-DD.
280 | --number-of-results Specify the number of results to return from Apify. Default: 25
281 |
282 | Optional arguments and parameters:
283 | --app Launch the Streamlit web interface instead of using CLI mode.
284 | --use-tor Specify whether to use Tor for downloading TikTok videos.
285 | -d, --download Specify whether to download TikTok videos from SerpAPI and Apify.
286 | -w , --max-workers Specify the maximum number of threads to use for downloading TikTok videos and extracting keyframes.
287 |   -o , --output        Specify output directory path. If not provided, data is saved in the current working directory in a timestamped folder under `tikspyder-data`.
288 | ```
289 |
290 | ### **Example Usage**
291 |
292 | 1. Search-based collection:
293 |
294 | ```sh
295 | # Using package installation (Method 2)
296 | tikspyder --q "F-16 AND Enemy AND (Ukraine OR Russia)" --gl us --hl en --after 2024-02-01 --before 2024-05-31 --output {output_directory}/ --download
297 |
298 | # Using standard installation (Method 1)
299 | python main.py --q "F-16 AND Enemy AND (Ukraine OR Russia)" --gl us --hl en --after 2024-02-01 --before 2024-05-31 --output {output_directory}/ --download
300 |
301 | # Note: Replace '{output_directory}' with the desired output path.
302 | ```
303 |
304 | 2. Profile-based collection:
305 |
306 | ```sh
307 | # Using package installation (Method 2)
308 | tikspyder --q Trump --user username --output {output_directory}/ --download --apify --oldest-post-date 2025-01-01
309 |
310 | # Using standard installation (Method 1)
311 | python main.py --q Trump --user username --output {output_directory}/ --download --apify --oldest-post-date 2025-01-01
312 |
313 | # Note: Replace '{output_directory}' with the desired output path.
314 | ```
315 |
316 | 3. Tag-based collection:
317 | ```sh
318 | # Using package installation (Method 2)
319 | tikspyder --tag sinaloa --apify --oldest-post-date 2025-08-01 --number-of-results 50 --output {output_directory}/ --download
320 |
321 | # Using standard installation (Method 1)
322 | python main.py --tag sinaloa --apify --oldest-post-date 2025-08-01 --number-of-results 50 --output {output_directory}/ --download
323 |
324 | # Note: Replace '{output_directory}' with the desired output path.
325 | ```
326 |
327 | ### Tor Integration
328 | You can use the Tor network when downloading TikTok videos to enhance privacy and avoid rate limiting. To use this feature:
329 |
330 | 1. Make sure Tor Browser is installed and running
331 | 2. Configure your torrc file with:
332 |
333 | ```
334 | ## Enable SOCKS proxy
335 | SocksPort 9050
336 |
337 | ## Enable Control Port for IP rotation
338 | ControlPort 9051
339 | CookieAuthentication 1
340 | ```
341 |
342 | 3. Use the `--use-tor` flag when running the script. If the Tor connection fails, the script will automatically fall back to a normal connection.
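 |
 | To confirm the SOCKS proxy is reachable before starting a collection, you can test it with `curl`; the Tor Project's check endpoint below is a common convenience for this and is not part of TikSpyder:
 |
 | ```sh
 | curl --socks5-hostname 127.0.0.1:9050 https://check.torproject.org/api/ip
 | ```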
343 |
344 |
345 |
346 |
347 | ## ☕ Support
348 |
349 | If you find TikSpyder helpful, please consider buying me a coffee to support ongoing development and maintenance. Your donation will help me continue to improve the tool and add new features.
350 |
351 | [](https://buymeacoffee.com/estebanpdl)
352 |
353 |
354 |
--------------------------------------------------------------------------------
/data_collectors/collector.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # import modules
4 | import os
5 | import time
6 | import json
7 | import uuid
8 | import httpx
9 |
10 | # typing
11 | from typing import Dict, List
12 |
13 | # SerpAPI module
14 | import serpapi
15 |
16 | # Apify client
17 | from apify_client import ApifyClient
18 |
19 | # local dependencies
20 | from .utilities import (
21 | search_query,
22 | select_serpapi_parameters,
23 | extract_results_keys,
24 | extract_related_content_keys,
25 | build_site_query
26 | )
27 |
28 | # utils
29 | from pathlib import Path
30 |
31 | # SQLManager
32 | from databases import SQLDatabaseManager
33 |
34 | # Media handlers
35 | from media_handlers import RequestSession
36 |
37 | # SerpAPI collector class
38 | class TikTokDataCollector:
39 | '''
40 |     TikTokDataCollector collects TikTok data from Google search results
41 |     using SerpAPI and, optionally, directly from TikTok via Apify.
42 | '''
43 |
44 | def __init__(self, args: Dict) -> None:
45 | '''
46 | Initializes TikTokDataCollector with the given parameters and options
47 | from the command line.
48 |
49 | :param args: Dict containing the command line arguments and options
50 | '''
51 | # get output data path
52 | self.output = self._sanitize_output_path(args['output'])
53 |
54 | # endpoint for SerpAPI
55 | self.api_key = args['api_key']
56 | self.endpoint = 'https://serpapi.com/search'
57 |
58 | # Apify token
59 | self.apify_token = args['apify_token']
60 |
61 | # main site: tiktok.com
62 | self.site = 'tiktok.com'
63 |
64 | # build the search query string
65 | q = search_query(args=args)
66 |
67 | # get provided user and tag
68 | self.user = args['user']
69 | self.tag = args['tag']
70 |
71 | # build advanced search query using utility function
72 | self.query = build_site_query(
73 | site=self.site, user=self.user, tag=self.tag, q=q
74 | )
75 |
76 | # update the query parameter in args
77 | args['q'] = self.query
78 |
79 | # store the parameters
80 | self.parameters = select_serpapi_parameters(args)
81 |
82 | # SerpAPI client
83 | self.client = serpapi.Client(api_key=self.api_key)
84 |
85 | # Apify client
86 | self.run_apify = args['apify']
87 | if self.run_apify:
88 | if self.user is not None or self.tag is not None:
89 | self.should_download_videos = args['download']
90 | self.apify_client = ApifyClient(self.apify_token)
91 |
92 | # optional date filters
93 | self.oldest_post_date = args['oldest_post_date']
94 | self.newest_post_date = args['newest_post_date']
95 |
96 | # number of results
97 | self.number_of_results = args['number_of_results']
98 |
99 | # database connection
100 | self.sql_database = SQLDatabaseManager(self.output, self.run_apify)
101 |
102 | # connections
103 | self.related_content_urls = []
104 | self.related_content_depth = args['depth']
105 | self.http_session = RequestSession()
106 |
107 | def _sanitize_output_path(self, output: str) -> str:
108 | '''
109 | Ensures the given path uses forward slashes and does not end with a
110 | slash.
111 |
112 | :param output: The original directory path.
113 | :return: A sanitized directory path with forward slashes and no
114 | trailing slash.
115 | '''
116 | # create a Path object and normalize the path
117 | path = Path(output)
118 |
119 | # path with the correct separators for the current OS
120 | output = str(path.as_posix())
121 |
122 | # remove any trailing slashes
123 | output = output.rstrip('/')
124 |
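 |         # illustrative examples (assumed inputs):
 |         #   'C:\\data\\tikspyder\\' -> 'C:/data/tikspyder' (on Windows)
 |         #   'data/out///'           -> 'data/out'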
125 | return output
126 |
127 | def collect_search_results(self) -> None:
128 | '''
129 | Makes an API call to SerpAPI and processes the response data.
130 |
131 | Fetches data based on the initialized parameters and handles pagination
132 | to retrieve data from all available pages.
133 | '''
134 | print (f'\nAPI call to Google search results\n')
135 | print (f'> search query: {self.query}')
136 | result_type = 'search_result'
137 | try:
138 | api_response = self.client.search(self.parameters)
139 | print ('\n> Searching...')
140 |
141 | # save raw data
142 | self._save_raw_data(
143 | self.output,
144 | result_type=result_type,
145 | data=api_response.data
146 | )
147 |
148 | # found results
149 | found_results = False
150 |
151 | # process search results
152 | self._process_search_results(api_response.data)
153 | if api_response.data.get('organic_results', []):
154 | found_results = True
155 |
156 | # get next page
157 | next_page = api_response.next_page_url
158 | while next_page:
159 | # get new API response
160 | next_response = api_response.next_page()
161 |
162 | # save raw data
163 | self._save_raw_data(
164 | self.output,
165 | result_type=result_type,
166 | data=next_response.data
167 | )
168 |
169 | # process search results
170 | self._process_search_results(next_response.data)
171 |
172 | # get next page
173 | next_page = next_response.next_page_url
174 |
175 | # update api_response for the next iteration
176 | api_response = next_response
177 |
178 | # chill out
179 | time.sleep(2)
180 |
181 | # api call status
182 | print ('> Done')
183 |
184 | if not found_results:
185 | print ('No organic results found.')
186 |
187 | except Exception as e:
188 | print (f'An error occurred during the API call: {e}')
189 |
190 | def _process_search_results(self, data: Dict) -> None:
191 | '''
192 | Processes the response data from SerpAPI, extracting organic results
193 | and inserting them into the SQL database.
194 |
195 | :param data: SerpAPI raw data response
196 | '''
197 | # get organic search results
198 | field = 'organic_results'
199 | result_type = 'search_result'
200 | results = data.get(field, [])
201 | if results:
202 | d = extract_results_keys(results, result_type=result_type)
203 |
204 | # write results in SQL database
205 | if d:
206 | self.sql_database.insert_search_results(d)
207 |
208 | def collect_image_results(self) -> None:
209 | '''
210 | Makes an API call to SerpAPI to collect image thumbnails from Google
211 | Images.
212 | '''
213 | # Google Images API
214 | self.parameters['tbm'] = 'isch'
215 |
216 | # collect images
217 | print (f'\n\nAPI call to Google images')
218 | result_type = 'image_result'
219 | try:
220 | api_response = self.client.search(self.parameters)
221 | print ('\n> Searching images...')
222 |
223 | # save raw data
224 | self._save_raw_data(
225 | self.output,
226 | result_type=result_type,
227 | data=api_response.data
228 | )
229 |
230 | # found results
231 | found_results = False
232 |
233 | # process images results
234 | self._process_images_results(api_response.data)
235 | if api_response.data.get('images_results', []):
236 | found_results = True
237 | print (f'> Downloading images results...')
238 |
239 | # get next page
240 | next_page = api_response.next_page_url
241 | while next_page:
242 | next_response = api_response.next_page()
243 |
244 | # save raw data
245 | self._save_raw_data(
246 | self.output,
247 | result_type=result_type,
248 | data=next_response.data
249 | )
250 |
251 | # process image results
252 | self._process_images_results(next_response.data)
253 |
254 | # get next page
255 | next_page = next_response.next_page_url
256 |
257 | # update api_response for the next iteration
258 | api_response = next_response
259 |
260 | # chill out
261 | time.sleep(2)
262 |
263 | # api call status
264 | print ('> Done')
265 |
266 | if not found_results:
267 | print ('No image results found in the response.')
268 |
269 | except Exception as e:
270 | print (f'An error occurred during the API call: {e}')
271 |
272 | # collect related content
273 | print (f'\n\nCollecting related content')
274 | if self.related_content_urls:
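 |             # keep only the first --depth related-content URLs before
 |             # following them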
275 | self.related_content_urls = self.related_content_urls[
276 | :self.related_content_depth
277 | ]
278 | for url in self.related_content_urls:
279 | self._collect_related_content(url=url)
280 | print ('> Done')
281 | else:
282 | print ('No related content found.')
283 |
284 | def _process_images_results(self, data: Dict) -> None:
285 | '''
286 | Processes the response data from SerpAPI, extracting thumbnails
287 | and inserting related data into the SQL database.
288 |
289 | :param data: SerpAPI raw data response
290 | '''
291 | # get image results
292 | field = 'images_results'
293 | result_type = 'image_result'
294 | results = data.get(field, [])
295 | if results:
296 | d = extract_results_keys(results, result_type=result_type)
297 |
298 | # write results in SQL database
299 | if d:
300 | self.sql_database.insert_images_results(d)
301 |
302 | # download images
303 | thumbnails = [i['thumbnail'] for i in d]
304 | links = [i['link'] for i in d]
305 | self.http_session.start_media_download(
306 | urls=thumbnails,
307 | links=links,
308 | output=self.output,
309 | media_type='image'
310 | )
311 |
312 | # save related content urls
313 | key = 'serpapi_related_content_link'
314 | self.related_content_urls += [
315 | i[key] for i in d if key in i
316 | ]
317 |
318 | def _collect_related_content(self, url: str) -> None:
319 | '''
320 | Collects related content from the given URL.
321 |
322 | :param url: The URL to load related content from.
323 | '''
324 | result_type = 'related_content'
325 | content = self.http_session.load_related_content(
326 | url=url,
327 | api_key=self.api_key
328 | )
329 |
330 | # save raw data
331 | self._save_raw_data(
332 | self.output,
333 | result_type=result_type,
334 | data=content
335 | )
336 |
337 | # process related content
338 | self._process_related_content(content)
339 |
340 | def _process_related_content(self, content: Dict) -> None:
341 | '''
342 | Processes the related content data.
343 |
344 | :param content: A dictionary containing the related content data.
345 | '''
346 | # get related content
347 | possible_fields = ['related_content', 'images_results']
348 | related_content = []
349 | for field in possible_fields:
350 | related_content = content.get(field, None)
351 | if related_content is not None:
352 | break
353 |
354 | if related_content:
355 | d = extract_related_content_keys(related_content)
356 |
357 | # write results in SQL database
358 | if d:
359 | self.sql_database.insert_related_content(d)
360 | else:
361 |             print ('No results found at this URL')
362 |
363 | def _apify_tiktok_profile_scraper(self) -> None:
364 | '''
365 | Collects search data using Apify.
366 | '''
367 | print ('\n\nCollecting user data with Apify')
368 |
369 | # get the search results
370 | run_input = {
371 | 'profiles': [self.user],
372 | 'profileScrapeSections': ['videos'],
373 | 'profileSorting': 'latest',
374 | 'resultsPerPage': self.number_of_results,
375 | 'excludePinnedPosts': False,
376 | 'shouldDownloadVideos': self.should_download_videos,
377 | 'shouldDownloadCovers': True,
378 | 'shouldDownloadSubtitles': False,
379 | 'shouldDownloadSlideshowImages': False,
380 | 'shouldDownloadAvatars': True
381 | }
382 |
383 | # add optional date filters
384 | if self.oldest_post_date:
385 | run_input['oldestPostDate'] = self.oldest_post_date
386 | if self.newest_post_date:
387 | run_input['newestPostDate'] = self.newest_post_date
388 |
389 | # run the Apify actor
390 | apify_actor_key = '0FXVyOXXEmdGcV88a'
391 | try:
392 | run = self.apify_client.actor(apify_actor_key).call(
393 | run_input=run_input
394 | )
395 |
396 | # store data
397 | store_data = []
398 | for item in self.apify_client.dataset(run['defaultDatasetId']).iterate_items():
399 | store_data.append(item)
400 |
401 | # write raw data
402 | if store_data:
403 | self._save_raw_data(
404 | self.output,
405 | result_type='apify_profile_data',
406 | data=store_data
407 | )
408 |
409 | # process data
410 | self._process_apify_profile_data(store_data)
411 | else:
412 | print ('No data found in the Apify run.')
413 | except httpx.LocalProtocolError as e:
414 | print ('Warning: Apify API token is either missing or invalid. Skipping Apify integration.')
415 |
416 |     def _process_apify_profile_data(self, data: List) -> None:
417 | '''
418 | Processes the Apify profile data.
419 |
420 |         :param data: A list of dictionaries with Apify profile data entries.
421 | '''
422 | # insert data into SQL database
423 | self.sql_database.insert_apify_profile_data(data)
424 |
425 | # downloading images
426 | thumbnails = []
427 | links = []
428 | for item in data:
429 | try:
430 | thumbnails.append(item['videoMeta']['coverUrl'])
431 | links.append(item['webVideoUrl'])
432 | except KeyError:
433 | pass
434 |
435 | self.http_session.start_media_download(
436 | urls=thumbnails,
437 | links=links,
438 | output=self.output,
439 | media_type='image'
440 | )
441 | print ('> Thumbnails downloaded')
442 |
443 | # get videos from Apify collected data
444 | if self.should_download_videos:
445 | videos = []
446 | tiktok_links = []
447 | for item in data:
448 | try:
449 | videos.append(item['videoMeta']['downloadAddr'])
450 | tiktok_links.append(item['webVideoUrl'])
451 | except KeyError:
452 | pass
453 |
454 | # download videos
455 | self.http_session.start_media_download(
456 | urls=videos,
457 | links=tiktok_links,
458 | output=self.output,
459 | media_type='video'
460 | )
461 | print ('> Videos downloaded')
462 |
463 | # extract audio from videos
464 | print ('> Extracting audio from videos...')
465 | self.http_session.extract_audio_from_videos(self.output)
466 | print ('> Done')
467 |
468 | return
469 |
470 | def _apify_tiktok_hashtag_scraper(self) -> None:
471 | '''
472 | Collects hashtag data using Apify.
473 | '''
474 | print ('\n\nCollecting hashtag data with Apify')
475 |
476 | # get the hashtag results
477 | run_input = {
478 | 'hashtags': [self.tag],
479 | 'resultsPerPage': self.number_of_results,
480 | 'searchSection': '/video',
481 | 'searchQueries': [self.tag],
482 | 'excludePinnedPosts': False,
483 | 'shouldDownloadVideos': self.should_download_videos,
484 | 'shouldDownloadCovers': True,
485 | 'shouldDownloadSubtitles': False,
486 | 'shouldDownloadSlideshowImages': False,
487 | 'shouldDownloadAvatars': True
488 | }
489 |
490 | # run the Apify actor
491 | apify_actor_key = 'OtzYfK1ndEGdwWFKQ'
492 | try:
493 | run = self.apify_client.actor(apify_actor_key).call(
494 | run_input=run_input
495 | )
496 |
497 | # store data
498 | store_data = []
499 | for item in self.apify_client.dataset(run['defaultDatasetId']).iterate_items():
500 | store_data.append(item)
501 |
502 | # write raw data
503 | if store_data:
504 | self._save_raw_data(
505 | self.output,
506 | result_type='apify_hashtag_data',
507 | data=store_data
508 | )
509 |
510 | # process data
511 | self._process_apify_hashtag_data(store_data)
512 | else:
513 | print ('No data found in the Apify run.')
514 | except httpx.LocalProtocolError as e:
515 | print ('Warning: Apify API token is either missing or invalid. Skipping Apify integration.')
516 |
517 |     def _process_apify_hashtag_data(self, data: List) -> None:
518 | '''
519 | Processes the Apify hashtag data.
520 |
521 |         :param data: A list of dictionaries with Apify hashtag data entries.
522 | '''
523 | # insert data into SQL database
524 | self.sql_database.insert_apify_hashtag_data(data)
525 |
526 | # downloading images
527 | thumbnails = []
528 | links = []
529 | for item in data:
530 | try:
531 | thumbnails.append(item['videoMeta']['coverUrl'])
532 | links.append(item['webVideoUrl'])
533 | except KeyError:
534 | pass
535 |
536 | self.http_session.start_media_download(
537 | urls=thumbnails,
538 | links=links,
539 | output=self.output,
540 | media_type='image'
541 | )
542 | print ('> Thumbnails downloaded')
543 |
544 | # get videos from Apify collected data
545 | if self.should_download_videos:
546 | videos = []
547 | tiktok_links = []
548 | for item in data:
549 | try:
550 | videos.append(item['videoMeta']['downloadAddr'])
551 | tiktok_links.append(item['webVideoUrl'])
552 | except KeyError:
553 | pass
554 |
555 | # download videos
556 | self.http_session.start_media_download(
557 | urls=videos,
558 | links=tiktok_links,
559 | output=self.output,
560 | media_type='video'
561 | )
562 | print ('> Videos downloaded')
563 |
564 | # extract audio from videos
565 | print ('> Extracting audio from videos...')
566 | self.http_session.extract_audio_from_videos(self.output)
567 | print ('> Done')
568 |
569 | return
570 |
571 | def _save_raw_data(self, output: str, result_type: str, data: Dict) -> None:
572 | '''
573 | Saves the raw data response from SerpAPI in a JSON file.
574 |
575 | :param output: The directory path where the raw data should be saved.
576 | :param result_type: Type of SerpAPI response: 'search_result',
577 | 'image_result', 'related_content', or Apify response
578 | :param data: The raw data response from SerpAPI to be saved.
579 | '''
580 | # create the directory structure if it does not exist
581 | folder = f'{output}/raw_data/{result_type}'
582 | if not os.path.exists(folder):
583 | os.makedirs(folder)
584 |
585 | # create a timestamp for the file name
586 | stamp = int(time.time())
587 | uuid_code = str(uuid.uuid4()).split('-')[-1]
588 |
589 | # convert the data to a JSON string
590 | obj = json.dumps(data, ensure_ascii=False, indent=2)
591 |
592 | # write the JSON string to a file
593 | file_path = f'{folder}/{result_type}_{stamp}_{uuid_code}.json'
594 | with open(file_path, encoding='utf-8', mode='w') as writer:
595 | writer.write(obj)
596 |
597 | def collect_search_data(self) -> None:
598 | '''
599 | Collects both search results and corresponding image thumbnails.
600 | '''
601 | print ('\n\n')
602 | print ('-' * 30)
603 | print ('Starting data collection process...\n')
604 |
605 | self.collect_search_results()
606 | self.collect_image_results()
607 |
608 | if self.run_apify:
609 | if self.user is not None:
610 | self._apify_tiktok_profile_scraper()
611 | elif self.tag is not None:
612 | self._apify_tiktok_hashtag_scraper()
613 |
614 | print ('\n\nData collection complete.')
615 | print ('-' * 30)
616 |
617 | def generate_data_files(self) -> None:
618 | '''
619 | Selects all data from SQL tables and generates CSV files
620 | '''
621 | print (f'\n\nGenerating CSV files')
622 | self.sql_database.fetch_all_data()
623 | print ('> Done')
624 |
625 | def get_collected_videos(self) -> List[str]:
626 | '''
627 | Retrieves all collected video links from the SQL database.
628 |
629 | :return: A list of unique video links.
630 | '''
631 | return self.sql_database.get_collected_videos(
632 | include_user_related_content=self.user is not None
633 | )
634 |
635 | def get_all_collected_videos(self) -> List[str]:
636 | '''
637 | Retrieves all unique video links from the query_search_results,
638 | images_results, and Apify tables.
639 | '''
640 | return self.sql_database.get_all_collected_videos()
641 |
--------------------------------------------------------------------------------
/databases/sql_manager.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # import modules
4 | import os
5 | import sqlite3
6 | import pandas as pd
7 |
8 | # SQL submodules
9 | from sqlite3 import Error
10 |
11 | # typing
12 | from typing import List, Optional
13 |
14 | # Database Manager utilities
15 | from .utilities import get_items_from_search_results, \
16 | get_items_from_images_results, get_items_from_related_content, \
17 | get_items_from_apify_profile_data, get_items_from_apify_hashtag_data, \
18 | extract_author_post_id
19 |
20 | # SQLDatabaseManager class
21 | class SQLDatabaseManager:
22 | '''
23 | SQLDatabaseManager
24 |
25 | This class provides an abstracted interface for interacting with a SQL
26 | database.
27 | '''
28 | def __init__(self, output: str, run_apify: bool) -> None:
29 | '''
30 | Initializes the SQLDatabaseManager with the given output path.
31 |
32 | :param output: The directory path where the database file will be
33 | created.
34 | :param run_apify: Whether to run the apify profile scraper.
35 | '''
36 | self.output = output
37 | self.sql_database_file = f'{self.output}/database.sql'
38 |
39 | # create required SQL tables for data processing - SerpAPI
40 | self.create_search_results_table()
41 | self.create_images_results_table()
42 | self.create_related_content_table()
43 |
44 | # create required SQL tables for data processing - Apify
45 | self.create_apify_profile_scraper_table()
46 | self.create_apify_hashtag_scraper_table()
47 |
48 | def create_sql_connection(self) -> Optional[sqlite3.Connection]:
49 | '''
50 | Creates a SQL connection.
51 |
52 | :return: A SQLite connection object or None if an error occurred
53 | '''
54 | try:
55 | conn = sqlite3.connect(self.sql_database_file)
56 | return conn
57 | except Error as e:
58 | print (f'An error occurred: {e}')
59 | return None
60 |
61 | def create_search_results_table(self) -> None:
62 | '''
63 | Creates the query_search_results table if it does not already exist.
64 | '''
65 | # set cursor
66 | conn = self.create_sql_connection()
67 | if conn is not None:
68 | cursor = conn.cursor()
69 |
70 | try:
71 | cursor.execute(
72 | '''
73 | CREATE TABLE IF NOT EXISTS query_search_results (
74 | record_id INTEGER PRIMARY KEY AUTOINCREMENT,
75 | source TEXT,
76 | title TEXT,
77 | snippet TEXT,
78 | link TEXT UNIQUE,
79 | thumbnail TEXT,
80 | video_link TEXT,
81 | snippet_highlighted_words TEXT,
82 | displayed_link TEXT,
83 | title_snippet TEXT,
84 | likes TEXT,
85 | comments TEXT,
86 | author TEXT,
87 | link_to_author TEXT,
88 | post_id TEXT UNIQUE
89 | );
90 | '''
91 | )
92 |
93 | # commit changes
94 | conn.commit()
95 | except Error as e:
96 | print (f'An error occurred: {e}')
97 | finally:
98 | conn.close()
99 | else:
100 | print ('Failed to create the database connection.')
101 |
102 | def insert_search_results(self, data: List) -> None:
103 | '''
104 | Inserts data into the query_search_results table.
105 |
106 | :param data: A list of dictionaries containing the data to insert.
107 | '''
108 | conn = self.create_sql_connection()
109 | if conn is not None:
110 | cursor = conn.cursor()
111 |
112 | try:
113 | for entry in data:
114 | cursor.execute(
115 | '''
116 | INSERT OR IGNORE INTO query_search_results (
117 | source, title, snippet, link, thumbnail,
118 | video_link, snippet_highlighted_words,
119 | displayed_link, title_snippet, likes, comments,
120 | author, link_to_author, post_id
121 | ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
122 | ''',
123 | get_items_from_search_results(entry)
124 | )
125 |
126 | # commit changes
127 | conn.commit()
128 |
129 | except Error as e:
130 | print (f'An error occurred while inserting data: {e}')
131 | finally:
132 | conn.close()
133 | else:
134 | print ('Failed to create the database connection.')
135 |
136 | def create_images_results_table(self) -> None:
137 | '''
138 | Creates the images_results table if it does not already exist.
139 | '''
140 | # set cursor
141 | conn = self.create_sql_connection()
142 | if conn is not None:
143 | cursor = conn.cursor()
144 |
145 | try:
146 | cursor.execute(
147 | '''
148 | CREATE TABLE IF NOT EXISTS images_results (
149 | record_id INTEGER PRIMARY KEY AUTOINCREMENT,
150 | source TEXT,
151 | title TEXT,
152 | link TEXT UNIQUE,
153 | thumbnail TEXT,
154 | author TEXT,
155 | link_to_author TEXT,
156 | post_id TEXT UNIQUE
157 | );
158 | '''
159 | )
160 |
161 | # commit changes
162 | conn.commit()
163 | except Error as e:
164 | print (f'An error occurred: {e}')
165 | finally:
166 | conn.close()
167 | else:
168 | print ('Failed to create the database connection.')
169 |
170 | def insert_images_results(self, data: List) -> None:
171 | '''
172 | Inserts data into the images_results table.
173 |
174 | :param data: A list of dictionaries containing the data to insert.
175 | '''
176 | conn = self.create_sql_connection()
177 | if conn is not None:
178 | cursor = conn.cursor()
179 |
180 | try:
181 | for entry in data:
182 | cursor.execute(
183 | '''
184 | INSERT OR IGNORE INTO images_results (
185 | source, title, link, thumbnail, author,
186 | link_to_author, post_id
187 | ) VALUES (?, ?, ?, ?, ?, ?, ?)
188 | ''',
189 | get_items_from_images_results(entry)
190 | )
191 |
192 | # commit changes
193 | conn.commit()
194 |
195 | except Error as e:
196 | print (f'An error occurred while inserting data: {e}')
197 | finally:
198 | conn.close()
199 | else:
200 | print ('Failed to create the database connection.')
201 |
202 | def create_related_content_table(self) -> None:
203 | '''
204 | Creates the related_content table if it does not already exist.
205 | '''
206 | # set cursor
207 | conn = self.create_sql_connection()
208 | if conn is not None:
209 | cursor = conn.cursor()
210 |
211 | try:
212 | cursor.execute(
213 | '''
214 | CREATE TABLE IF NOT EXISTS related_content (
215 | record_id INTEGER PRIMARY KEY AUTOINCREMENT,
216 | source TEXT,
217 | link TEXT UNIQUE,
218 | thumbnail TEXT,
219 | title TEXT
220 | );
221 | '''
222 | )
223 |
224 | # commit changes
225 | conn.commit()
226 | except Error as e:
227 | print (f'An error occurred: {e}')
228 | finally:
229 | conn.close()
230 | else:
231 | print ('Failed to create the database connection.')
232 |
233 | def insert_related_content(self, data: List) -> None:
234 | '''
235 | Inserts data into the related_content table.
236 |
237 | :param data: A list of dictionaries containing the data to insert.
238 | '''
239 | conn = self.create_sql_connection()
240 | if conn is not None:
241 | cursor = conn.cursor()
242 |
243 | try:
244 | for entry in data:
245 | cursor.execute(
246 | '''
247 | INSERT OR IGNORE INTO related_content (
248 | source, link, thumbnail, title
249 | ) VALUES (?, ?, ?, ?)
250 | ''',
251 | get_items_from_related_content(entry)
252 | )
253 |
254 | # commit changes
255 | conn.commit()
256 |
257 | except Error as e:
258 | print(f'An error occurred while inserting data: {e}')
259 | finally:
260 | conn.close()
261 | else:
262 | print('Failed to create the database connection.')
263 |
264 | def create_apify_profile_scraper_table(self) -> None:
265 | '''
266 | Creates the apify_profile_scraper table if it does not already exist.
267 | '''
268 | conn = self.create_sql_connection()
269 | if conn is not None:
270 | cursor = conn.cursor()
271 |
272 | try:
273 | cursor.execute(
274 | '''
275 | CREATE TABLE IF NOT EXISTS apify_profile_scraper (
276 | id TEXT PRIMARY KEY,
277 | text TEXT,
278 | text_language TEXT,
279 | create_time INTEGER,
280 | create_time_iso TEXT,
281 | is_ad BOOLEAN,
282 | web_video_url TEXT UNIQUE,
283 |
284 | author_id TEXT,
285 | author_name TEXT,
286 | author_profile_url TEXT,
287 | author_bio_link TEXT,
288 | author_signature TEXT,
289 | author_nickname TEXT,
290 | author_verified BOOLEAN,
291 | author_avatar TEXT,
292 | author_private_account BOOLEAN,
293 | author_region TEXT,
294 | author_following INTEGER,
295 | author_friends INTEGER,
296 | author_fans INTEGER,
297 | author_heart INTEGER,
298 | author_video INTEGER,
299 | author_digg INTEGER,
300 |
301 | music_id TEXT,
302 | music_name TEXT,
303 | music_author TEXT,
304 | music_original BOOLEAN,
305 |
306 | video_duration INTEGER,
307 | video_thumbnail TEXT,
308 | video_download_url TEXT,
309 |
310 | digg_count INTEGER,
311 | share_count INTEGER,
312 | play_count INTEGER,
313 | collect_count INTEGER,
314 | comment_count INTEGER,
315 |
316 | hashtags TEXT,
317 | is_slideshow BOOLEAN,
318 | is_pinned BOOLEAN,
319 | is_sponsored BOOLEAN,
320 | input_username TEXT,
321 | from_profile_section TEXT,
322 |
323 | UNIQUE (id, web_video_url)
324 | ON CONFLICT REPLACE
325 | );
326 | '''
327 | )
328 |
329 | # commit changes
330 | conn.commit()
331 | except Error as e:
332 | print(f'An error occurred: {e}')
333 | finally:
334 | conn.close()
335 | else:
336 | print('Failed to create the database connection.')
337 |
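# Editor's note -- the table-level UNIQUE (id, web_video_url) ON CONFLICT
# REPLACE clause, combined with the INSERT OR REPLACE used by
# insert_apify_profile_data() below, gives last-write-wins semantics:
# re-scraping a post overwrites its earlier row, which is useful because
# engagement counts change over time. A minimal, self-contained illustration
# (toy table, not part of this schema):
#
#     import sqlite3
#     conn = sqlite3.connect(':memory:')
#     conn.execute('CREATE TABLE t (id TEXT, views INTEGER, '
#                  'UNIQUE (id) ON CONFLICT REPLACE)')
#     conn.execute("INSERT INTO t VALUES ('abc', 10)")
#     conn.execute("INSERT INTO t VALUES ('abc', 25)")
#     print(conn.execute('SELECT views FROM t').fetchone()[0])  # prints 25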
338 | def create_apify_hashtag_scraper_table(self) -> None:
339 | '''
340 | Creates the apify_hashtag_scraper table if it does not already exist.
341 | '''
342 | conn = self.create_sql_connection()
343 | if conn is not None:
344 | cursor = conn.cursor()
345 |
346 | try:
347 | cursor.execute(
348 | '''
349 | CREATE TABLE IF NOT EXISTS apify_hashtag_scraper (
350 | id TEXT PRIMARY KEY,
351 | text TEXT,
352 | text_language TEXT,
353 | create_time INTEGER,
354 | create_time_iso TEXT,
355 | is_ad BOOLEAN,
356 | web_video_url TEXT UNIQUE,
357 |
358 | author_id TEXT,
359 | author_name TEXT,
360 | author_profile_url TEXT,
361 | author_bio_link TEXT,
362 | author_signature TEXT,
363 | author_nickname TEXT,
364 | author_verified BOOLEAN,
365 | author_avatar TEXT,
366 | author_private_account BOOLEAN,
367 | author_region TEXT,
368 | author_following INTEGER,
369 | author_friends INTEGER,
370 | author_fans INTEGER,
371 | author_heart INTEGER,
372 | author_video INTEGER,
373 | author_digg INTEGER,
374 |
375 | music_id TEXT,
376 | music_name TEXT,
377 | music_author TEXT,
378 | music_original BOOLEAN,
379 |
380 | video_duration INTEGER,
381 | video_thumbnail TEXT,
382 | video_download_url TEXT,
383 |
384 | digg_count INTEGER,
385 | share_count INTEGER,
386 | play_count INTEGER,
387 | collect_count INTEGER,
388 | comment_count INTEGER,
389 |
390 | hashtags TEXT,
391 | is_slideshow BOOLEAN,
392 | is_pinned BOOLEAN,
393 | is_sponsored BOOLEAN,
394 | input_search TEXT,
395 | search_hashtag_views INTEGER,
396 |
397 | UNIQUE (id, web_video_url)
398 | ON CONFLICT REPLACE
399 | );
400 | '''
401 | )
402 |
403 | # commit changes
404 | conn.commit()
405 | except Error as e:
406 | print(f'An error occurred: {e}')
407 | finally:
408 | conn.close()
409 | else:
410 | print('Failed to create the database connection.')
411 |
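# Editor's note (refactor sketch, hedged -- not part of the original design):
# apify_profile_scraper and apify_hashtag_scraper share every column except
# the last two (input_username/from_profile_section vs.
# input_search/search_hashtag_views), so both CREATE TABLE statements could
# be generated from one template; APIFY_SHARED_COLUMNS below stands for the
# common column block and is left elided on purpose:
#
#     APIFY_SHARED_COLUMNS = '...'  # the shared column definitions above
#
#     def build_apify_ddl(table_name: str, extra_columns: str) -> str:
#         return (
#             f'CREATE TABLE IF NOT EXISTS {table_name} ('
#             f'{APIFY_SHARED_COLUMNS}, {extra_columns}, '
#             f'UNIQUE (id, web_video_url) ON CONFLICT REPLACE);'
#         )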
412 | def insert_apify_profile_data(self, data: List) -> None:
413 | '''
414 | Inserts data into the apify_profile_scraper table.
415 |
416 | :param data: A list of dictionaries containing the data to insert.
417 | '''
418 | conn = self.create_sql_connection()
419 | if conn is not None:
420 | cursor = conn.cursor()
421 |
422 | try:
423 | for entry in data:
424 | cursor.execute(
425 | '''
426 | INSERT OR REPLACE INTO apify_profile_scraper (
427 | id, text, text_language, create_time, create_time_iso,
428 | is_ad, web_video_url, author_id, author_name,
429 | author_profile_url, author_bio_link, author_signature,
430 | author_nickname, author_verified, author_avatar,
431 | author_private_account, author_region, author_following,
432 | author_friends, author_fans, author_heart, author_video,
433 | author_digg, music_id, music_name, music_author,
434 | music_original, video_duration, video_thumbnail,
435 | video_download_url, digg_count, share_count, play_count,
436 | collect_count, comment_count, hashtags, is_slideshow,
437 | is_pinned, is_sponsored, input_username,
438 | from_profile_section
439 | ) VALUES (
440 | ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
441 | ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
442 | ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?
443 | )
444 | ''',
445 | get_items_from_apify_profile_data(entry)
446 | )
447 |
448 | # commit changes
449 | conn.commit()
450 | except Error as e:
451 | print(f'An error occurred while inserting data: {e}')
452 | finally:
453 | conn.close()
454 | else:
455 | print('Failed to create the database connection.')
456 |
457 | def insert_apify_hashtag_data(self, data: List) -> None:
458 | '''
459 | Inserts data into the apify_hashtag_scraper table.
460 |
461 | :param data: A list of dictionaries containing the data to insert.
462 | '''
463 | conn = self.create_sql_connection()
464 | if conn is not None:
465 | cursor = conn.cursor()
466 |
467 | try:
468 | for entry in data:
469 | cursor.execute(
470 | '''
471 | INSERT OR REPLACE INTO apify_hashtag_scraper (
472 | id, text, text_language, create_time, create_time_iso,
473 | is_ad, web_video_url, author_id, author_name,
474 | author_profile_url, author_bio_link, author_signature,
475 | author_nickname, author_verified, author_avatar,
476 | author_private_account, author_region, author_following,
477 | author_friends, author_fans, author_heart, author_video,
478 | author_digg, music_id, music_name, music_author,
479 | music_original, video_duration, video_thumbnail,
480 | video_download_url, digg_count, share_count, play_count,
481 | collect_count, comment_count, hashtags, is_slideshow,
482 | is_pinned, is_sponsored, input_search,
483 | search_hashtag_views
484 | ) VALUES (
485 | ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
486 | ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?,
487 | ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?
488 | )
489 | ''',
490 | get_items_from_apify_hashtag_data(entry)
491 | )
492 |
493 | # commit changes
494 | conn.commit()
495 | except Error as e:
496 | print(f'An error occurred while inserting data: {e}')
497 | finally:
498 | conn.close()
499 | else:
500 | print('Failed to create the database connection.')
501 |
502 | def fetch_all_data(self) -> None:
503 | '''
504 | Fetches all data from the SQL tables and exports each table to a CSV file in the output directory.
505 | '''
506 | tables = [
507 | 'query_search_results',
508 | 'images_results',
509 | 'related_content',
510 | 'apify_profile_scraper',
511 | 'apify_hashtag_scraper'
512 | ]
513 | conn = self.create_sql_connection()
514 | if conn is not None:
515 | try:
516 | for t in tables:
517 | q = f'''
518 | SELECT *
519 | FROM {t}
520 | '''
521 | # fetch data
522 | df = pd.read_sql_query(q, conn)
523 |
524 | # save data
525 | save_path = f'{self.output}/{t}.csv'
526 | df.to_csv(
527 | save_path,
528 | index=False,
529 | encoding='utf-8'
530 | )
531 |
532 | except Error as e:
533 | print(f'An error occurred while fetching data from {t}: {e}')
534 | finally:
535 | conn.close()
536 |
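# Editor's note -- fetch_all_data() writes one CSV per table into
# self.output (e.g. <output>/query_search_results.csv). A hedged downstream
# sketch; 'collected_data' is a hypothetical output directory:
#
#     import pandas as pd
#     df = pd.read_csv('collected_data/apify_profile_scraper.csv')
#     print(df['web_video_url'].nunique(), 'unique videos exported')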
537 | def get_collected_videos(self, include_user_related_content: bool) -> List:
538 | '''
539 | Retrieves all unique video links from the query_search_results and
540 | images_results tables that have not been downloaded yet.
541 |
542 | :param include_user_related_content: Whether to include user-related
543 | content from Google search results in the returned list of links.
544 | :return: A list of unique video links.
545 | '''
546 | data = []
547 | conn = self.create_sql_connection()
548 | if conn is not None:
549 | cursor = conn.cursor()
550 |
551 | try:
552 | # get all video links from database
553 | cursor.execute(
554 | '''
555 | SELECT link
556 | FROM query_search_results
557 | UNION
558 | SELECT link
559 | FROM images_results
560 | '''
561 | )
562 |
563 | # fetch all links
564 | all_links = [i[0] for i in cursor.fetchall()]
565 |
566 | if include_user_related_content and all_links:
567 | # get user from the first link (the all_links guard avoids an IndexError when nothing was collected)
568 | user = extract_author_post_id(all_links[0])[0]
569 |
570 | # get user-related content links from the database that match this user's TikTok video URL pattern
571 | cursor.execute(
572 | '''
573 | SELECT link
574 | FROM related_content
575 | WHERE link LIKE ?
576 | ''',
577 | (f'https://www.tiktok.com/@{user}/video/%',)
578 | )
579 |
580 | # fetch all links
581 | all_links.extend([i[0] for i in cursor.fetchall()])
582 |
583 | # remove duplicates
584 | all_links = list(set(all_links))
585 |
586 | # get list of already downloaded videos
587 | videos_dir = os.path.join(self.output, 'downloaded_videos')
588 |
589 | if os.path.exists(videos_dir):
590 | # get existing video ids
591 | existing_ids = {
592 | os.path.splitext(f)[0]
593 | for f in os.listdir(videos_dir)
594 | if os.path.isfile(os.path.join(videos_dir, f))
595 | }
596 |
597 | # filter out links whose IDs are already downloaded
598 | data = [
599 | link for link in all_links
600 | if extract_author_post_id(link)[2] not in existing_ids
601 | ]
602 | else:
603 | data = all_links
604 | except Error as e:
605 | print(f'An error occurred while retrieving data: {e}')
606 | finally:
607 | conn.close()
608 |
609 | return data
610 |
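# Editor's note -- get_collected_videos() assumes extract_author_post_id()
# returns an indexable value whose element 0 is the author handle and whose
# element 2 is the post id (both usages appear above). Illustrative shape
# only; the helper and its middle element are defined elsewhere in this
# project:
#
#     # extract_author_post_id('https://www.tiktok.com/@user/video/724...')
#     # -> ('user', <...>, '724...')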
611 | def get_all_collected_videos(self) -> List:
612 | '''
613 | Retrieves all unique video links from the query_search_results,
614 | images_results, and Apify tables.
615 | '''
616 | conn = self.create_sql_connection()
617 | if conn is not None:
618 | cursor = conn.cursor()
619 |
620 | try:
621 | # get all video links from database
622 | cursor.execute(
623 | '''
624 | SELECT web_video_url
625 | FROM apify_profile_scraper
626 | UNION
627 | SELECT web_video_url
628 | FROM apify_hashtag_scraper
629 | UNION
630 | SELECT link
631 | FROM query_search_results
632 | UNION
633 | SELECT link
634 | FROM images_results
635 | '''
636 | )
637 |
638 | # fetch all links
639 | all_links = [i[0] for i in cursor.fetchall()]
640 |
641 | # remove duplicates (defensive; the UNION above already de-duplicates)
642 | all_links = list(set(all_links))
643 |
644 | return all_links
645 | except Error as e:
646 | print(f'An error occurred while retrieving data: {e}')
647 | finally:
648 | conn.close()
649 |
650 | return []
651 |
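# Editor's note -- a hedged end-to-end sketch. The manager class is
# constructed earlier in this file and its exact signature is not shown
# here, so the class name and constructor arguments below are assumptions:
#
#     manager = SQLDatabaseManager(output='collected_data')  # hypothetical args
#     manager.create_images_results_table()
#     manager.insert_images_results(scraped_entries)  # scraped_entries: list of dicts
#     pending = manager.get_collected_videos(include_user_related_content=False)
#     manager.fetch_all_data()  # exports every table as CSV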
--------------------------------------------------------------------------------