├── .gitignore ├── requirements.txt ├── LICENSE ├── table_create_query.txt ├── README.md ├── feed_crawler.py ├── channel_crawler.py └── test.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | venv 2 | ~.ipynb_checkpoints 3 | .xlsx -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.9.3 2 | bs4==0.0.1 3 | certifi==2020.12.5 4 | chardet==4.0.0 5 | colorama==0.4.4 6 | configparser==5.0.1 7 | crayons==0.4.0 8 | idna==2.10 9 | numpy==1.20.1 10 | pandas==1.2.2 11 | psycopg2==2.8.6 12 | python-dateutil==2.8.1 13 | pytz==2021.1 14 | requests==2.25.1 15 | selenium==3.141.0 16 | six==1.15.0 17 | soupsieve==2.2 18 | SQLAlchemy==1.3.23 19 | urllib3==1.26.3 20 | webdriver-manager==3.3.0 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 dongho 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /table_create_query.txt: -------------------------------------------------------------------------------- 1 | create table public.channel_list( 2 | channel_id varchar(50), 3 | channel_user_id varchar(50), 4 | channel_secret_id varchar(100), 5 | channel_nickname varchar(50), 6 | channel_registered_date date, 7 | primary key(channel_id) 8 | ); 9 | 10 | create table public.channel_info_daily( 11 | channel_crawl_date date, 12 | channel_id varchar(50), 13 | following_count integer, 14 | follower_count integer, 15 | heart_count integer, 16 | digg_count integer, 17 | video_count integer, 18 | foreign key(channel_id) references channel_list(channel_id), 19 | primary key(channel_crawl_date,channel_id) 20 | ); 21 | 22 | create table public.video_list( 23 | video_id varchar(50), 24 | channel_id varchar(50), 25 | video_create_date date, 26 | video_description text, 27 | video_hashtag text, 28 | foreign key (channel_id) references channel_list(channel_id), 29 | primary key(video_id) 30 | ); 31 | 32 | create table public.video_info_daily( 33 | video_crawl_date date, 34 | video_id varchar(50), 35 | digg_count integer, 36 | share_count integer, 37 | comment_count integer, 38 | play_count bigint, 39 | foreign key (video_id) references video_list(video_id), 40 | primary key (video_crawl_date,video_id) 41 | ); 42 | 43 | create table public.hashtag_info_daily( 44 | tag_crawl_date date, 45 | tag_id varchar(50), 46 | tag_title varchar(50), 47 | tag_description text, 48 | tag_video_count integer, 49 | tag_view_count bigint, 50 | primary key (tag_id, tag_crawl_date) 51 | ); 52 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TikTok-Crawler 2 | 3 | This repository implements Tiktok's channel and challenge crawler(channel_crawler.py) and recommended feed crawler(feed_crawler.py). The composition of this repo includes code to insert and load data into the attached Postgre DB. It analyzes requests that web elements from inside TikTok without using API and sends queries to TikTok servers to crawl. In case of channel information when crawling, it is executed by inserting the id of the channel user as the input factor. 4 | 5 | 본 레포지토리에서는 틱톡의 채널정보/일간인기 해시태그 정보(channel_crawler.py), 추천 피드정보(feed_crawler.py)를 구현했다. 본 레포의 구성은 첨부된 Postgresql table에 데이터를 삽입하고 적재하는 코드까지 포함된다. API를 사용하지 않고 틱톡 내부에서 웹 요소를 끌어오는 요청을 분석해 쿼리를 틱톡 서버로 보내 스크래핑한다. 크롤링시 채널 정보같은 경우 채널 user의 id를 입력 인자로 넣어주면 실행된다. 6 | 7 | You can see dev log here! 8 | [https://www.notion.so/hobbeskim/TikTok-Crawler-cf374f0f14674f97b3edd278c953ec99] 9 | 10 | ## Have To Prepare 11 | - Postgresql server 12 | - selenum(only for feed_crawler.py) 13 | 14 | ## Requirements 15 | - channel_crawler.py 16 | - TikTok Account(a.k.a channel) following, follower 17 | - channel's video list 18 | - videos(upload at the channel) reactions 19 | - tiktok hot hashtag(a.k.a challenge) 20 | - all this requirements are stored in DB daily or once 21 | 22 | - feed_crawler.py 23 | - daily recommended feed 24 | - feed posts reactions 25 | 26 | ## Schema and SQL queries 27 | ![schema](https://user-images.githubusercontent.com/57410044/108812814-1e40d480-75f3-11eb-8cdf-102613edd54d.png) 28 | ## Execution 29 | pip install -r requirements.txt 30 | 31 | ### set db_connection_info 32 | 33 | ### CLI 34 | 35 | python channel_crawler.py 36 | 37 | ### enter 38 | > channel_id 39 | 40 | python feed_crawler.py 41 | ![run channel](https://user-images.githubusercontent.com/57410044/108812718-fc475200-75f2-11eb-9ce7-e23d0c5fd2f5.png) 42 | ![run feed](https://user-images.githubusercontent.com/57410044/108812685-ea65af00-75f2-11eb-8577-c1e800e2f739.png) 43 | ![result channel](https://user-images.githubusercontent.com/57410044/108812899-416b8400-75f3-11eb-8ca2-40c620c9941c.png) 44 | ![result feed](https://user-images.githubusercontent.com/57410044/108812958-6233d980-75f3-11eb-842f-2bc03c4e6f63.png) 45 | 46 | 47 | -------------------------------------------------------------------------------- /feed_crawler.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | from selenium import webdriver 4 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities 5 | from webdriver_manager.chrome import ChromeDriverManager 6 | import time 7 | import random 8 | from bs4 import BeautifulSoup 9 | import pandas as pd 10 | import datetime 11 | import sqlalchemy 12 | from sqlalchemy import create_engine 13 | 14 | class TikTokFeedCrawler(object): 15 | """[summary] 16 | ver 1.0, 작성자 : 김동호@DMK, 작성일:2021.02.19, 최근 수정일:2021.02.19 17 | 틱톡 웹에서 일별로 추천되는 피드의 정보를 크롤링하는 함수(셀레늄 의존) 18 | """ 19 | 20 | def __init__(self): 21 | self.db_connection_info = 'postgresql://username:userpwd@localhost:5432/crawler' 22 | self.tiktok_url = 'https://www.tiktok.com/' 23 | self.feed_url = 'https://t.tiktok.com/api/recommend/item_list/?aid=1988&app_name=tiktok_web&device_platform=web&referer=&root_referer=&user_agent=Mozilla%2F5.0+(Windows+NT+10.0%3B+Win64%3B+x64)+AppleWebKit%2F537.36+(KHTML,+like+Gecko)+Chrome%2F88.0.4324.150+Safari%2F537.36&cookie_enabled=true&screen_width=2560&screen_height=1440&browser_language=ko-KR&browser_platform=Win32&browser_name=Mozilla&browser_version=5.0+(Windows+NT+10.0%3B+Win64%3B+x64)+AppleWebKit%2F537.36+(KHTML,+like+Gecko)+Chrome%2F88.0.4324.150+Safari%2F537.36&browser_online=true&ac=4g&timezone_name=Asia%2FSeoul&priority_region=&verifyFp=verify_kl8znk7j_UCx7rIre_7LBx_4qud_BioE_XdjXw1dexl2R&appId=1180®ion=KR&appType=t&isAndroid=false&isMobile=false&isIOS=false&OS=windows&did=6923212441551719681&count=30&itemID=1' 24 | self.max_feed_count = 100 25 | 26 | def parse_tik(self, page_source: str): 27 | """[summary] 28 | 셀레늄으로 긁어온 페이지 소스에서 개별 피드의 정보를 크롤링 하는 함수 29 | Args: 30 | page_source (str): 페이지 소스 31 | 32 | Returns: 33 | res_list(list of dict): 개별 피드의 정보의 리스트 34 | """ 35 | 36 | res_list = [] 37 | html = BeautifulSoup(page_source, 'html.parser') 38 | 39 | for x in html.select('span.lazyload-wrapper'): 40 | content = x.select('div.feed-item-content') 41 | if not content: 42 | continue 43 | content = content[0] 44 | video_id = str(content.select( 45 | 'a.item-video-card-wrapper')[0]['href']).split('/')[-1] 46 | channel_user_id = content.select('h3.author-uniqueId')[0].text 47 | channel_nickname = content.select('h4.author-nickname')[0].text 48 | caption = content.select('div.tt-video-meta-caption') 49 | video_url = content.select('a.item-video-card-wrapper')[0]['href'] 50 | hashtags = [] 51 | if not caption: 52 | caption_text = "" 53 | else: 54 | caption = caption[0] 55 | if not caption.select('a'): 56 | pass 57 | else: 58 | hashtags = [a.text for a in caption.select('a')] 59 | caption_text = caption.text 60 | 61 | video_hashtag = ','.join(hashtags) # 해시태그 결합 구분자 , 62 | music = content.select('div.tt-video-music')[0].text 63 | reaction_params = {k['title']: k.text for k in content.select( 64 | 'div.pc-action-bar strong')} 65 | print('rect:', reaction_params) 66 | units = {"K": 1000, "M": 1000000, "B": 1000000000} # 정수형 변환 및 칼럼이름 변경 67 | coverted_reaction = {} 68 | for key, value in reaction_params.items(): 69 | try: 70 | coverted_reaction[key] = int(value) 71 | except ValueError: 72 | unit = value[-1] 73 | new_value = float(value[:-1]) 74 | coverted_reaction[key] = int(new_value*units[unit]) 75 | 76 | info_params = { 77 | 'feed_crawl_date': datetime.date.today().isoformat(), 78 | 'video_id': video_id, 79 | 'channel_user_id': channel_user_id, 80 | 'channel_nickname': channel_nickname, 81 | 'video_description': caption_text, 82 | 'video_hashtag': video_hashtag, 83 | 'video_music': music, 84 | 'video_url': video_url, 85 | 'digg_count': coverted_reaction['like'], 86 | 'share_count': coverted_reaction['share'], 87 | 'comment_count': coverted_reaction['comment'] 88 | } 89 | res_list.append(info_params) 90 | 91 | return res_list 92 | 93 | def crawl_feed_info(self): 94 | """[summary] 95 | 틱톡 피드를 크롤링하는 메서드 96 | Returns: 97 | feed_info(pd.DateFrame): 벌크 삽입용 데이터 프레임 98 | """ 99 | url = self.feed_url 100 | res = requests.get(url) 101 | driver = webdriver.Chrome(ChromeDriverManager().install()) 102 | caps = DesiredCapabilities.CHROME 103 | caps['goog:loggingPrefs'] = {'performance': 'ALL'} 104 | driver.get(self.tiktok_url) 105 | 106 | is_next = True 107 | while is_next: 108 | try: 109 | driver.execute_script( 110 | "window.scrollTo(0, document.body.scrollHeight);") 111 | except: 112 | is_next = False 113 | time.sleep(random.uniform(3, 5)) 114 | res = self.parse_tik(driver.page_source) 115 | if len(res) > self.max_feed_count: 116 | is_next = False 117 | 118 | feed_info = pd.DataFrame.from_records(res) 119 | return feed_info 120 | 121 | def insert_db_feed_info(self,feed_info:pd.DataFrame): 122 | """[summary] 123 | Postgre DB에 feed_info 를 insert 하는 함수 124 | Args: 125 | feed_info (pd.DataFrame): 피드 정보 데이터 프레임 126 | """ 127 | engine = create_engine(self.db_connection_info) 128 | 129 | feed_info.to_sql(name='feed_info_daily', 130 | con=engine, 131 | schema='public', 132 | if_exists='append', 133 | index=False, 134 | dtype={ 135 | 'feed_crawl_date': sqlalchemy.Date, 136 | 'video_id': sqlalchemy.types.VARCHAR(50), 137 | 'channel_user_id': sqlalchemy.types.VARCHAR(50), 138 | 'channel_nickname': sqlalchemy.types.VARCHAR(50), 139 | 'video_description': sqlalchemy.types.Text, 140 | 'video_hashtag': sqlalchemy.types.Text, 141 | 'video_music': sqlalchemy.types.VARCHAR(300), 142 | 'video_url': sqlalchemy.types.Text, 143 | 'digg_count': sqlalchemy.types.INTEGER(), 144 | 'share_count': sqlalchemy.types.INTEGER(), 145 | 'comment_count': sqlalchemy.types.INTEGER() 146 | }) 147 | engine.dispose() 148 | 149 | def start(self): 150 | try: 151 | feed_info = self.crawl_feed_info() 152 | self.insert_db_feed_info(feed_info) 153 | except Exception as e : 154 | print(e) 155 | print("CAN NOT CRAWL FEED INFO") 156 | 157 | if __name__ == '__main__': 158 | 159 | crawler = TikTokFeedCrawler() 160 | crawler.start() 161 | print('done') 162 | -------------------------------------------------------------------------------- /channel_crawler.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import time 4 | import datetime 5 | import json 6 | import random 7 | from sqlalchemy import create_engine 8 | 9 | class TikTokChannelCrawler(object): 10 | """ 11 | ver 1.0, 작성자 : 김동호@DMK, 작성일:2021.02.19, 최근 수정일:2021.02.19 12 | 채널의 이름을 인자로 받아, 13 | 틱톡 개별 채널과 채널의 비디오를 크롤링하고 DB에 삽입까지 수행하는 메서드들이 포함된 클래스입니다. 14 | """ 15 | def __init__(self, user_id): 16 | self.db_connection_info = 'postgresql://username:userpwd@localhost:5432/crawler' 17 | self.channel_id = "" # 1232124541234 정수문자열형태 18 | self.secret_id = "" # MAS21das123asd 해싱형태 19 | self.user_id = user_id 20 | self.base_url = { 21 | 'app_name': 'tiktok_web', 22 | 'device_platform': 'web', 23 | 'referer': 'https:%2F%2Fwww.google.com%2F', 24 | 'root_referer': 'https:%2F%2Fwww.google.com%2F', 25 | 'user_agent': 'Mozilla%2F5.0+(Macintosh%3B+Intel+Mac+OS+X+10_15_5)+AppleWebKit%2F537.36+(KHTML,+like+Gecko)+Chrome%2F88.0.4324.146+Safari%2F537.36', 26 | 'cookie_enabled': 'true', 27 | 'screen_width': '1920', 28 | 'screen_height': '1080', 29 | 'browser_language':'ko-KR', 30 | 'browser_platform': 'MacIntel', 31 | 'browser_name': 'Mozilla', 32 | 'browser_version':'5.0+(Macintosh%3B+Intel+Mac+OS+X+10_15_5)+AppleWebKit%2F537.36+(KHTML,+like+Gecko)+Chrome%2F88.0.4324.146+Safari%2F537.36', 33 | 'browser_online': 'true', 34 | 'ac': '4g', 35 | 'timezone_name': 'Asia%2Fseoul', 36 | 'page_referer': 'https:%2F%2Fwww.tiktok.com%2Fsearch%3Fq%3D%25EC%25A0%259C%25EB%25A6%25AC%26lang%3Dko-KR', 37 | 'priority_region': '', # 없음 38 | 'verifyFp': 'verify_kkqg77dw_hai6t3Ps_dkIT_41yF_ApMY_G7euFo5nq2xH', 39 | 'appId': '1180', 40 | 'region': 'KR', 41 | 'appType': 't', 42 | 'isAndroid': 'false', 43 | 'isMobile': 'false', 44 | 'isIOS': 'false', 45 | 'OS': 'mac', 46 | 'did': '6925283638187689473', 47 | 'count': '30', 48 | 'cursor': '', #UNIX 시간값 - hashtag 엔 미존재 49 | 'language': 'ko-KR', 50 | 'secUid': '' #secret_id 로 세팅 필요 - hashtag엔 미존재 51 | } 52 | 53 | def convert_user_id_to_secret_id(self, user_id): 54 | """[summary] 55 | 입력받은 채널(틱톡커)의 user 아이디(변경가능 값)을 크롤링 url 생성을 위한 secUID로 전환합니다. 56 | Args: 57 | user_id (string): 사용자 ID 58 | 59 | Returns: 60 | secret_id(string): 사용자 secret_id(틱톡 서버 내부용) 61 | """ 62 | try: 63 | url = f"https://www.tiktok.com/@{user_id}" 64 | res = requests.get(url) 65 | secuid_start_index = res.text.find('secUid') 66 | secret_id = res.text[int(secuid_start_index):].split('"')[2] 67 | return secret_id 68 | 69 | except Exception as e: 70 | print("CAN'T GET USER SECRET ID") 71 | print(e) 72 | return "" 73 | 74 | def crawl_channel_info(self, secret_id): 75 | """[summary] 76 | 채널 정보를 크롤링 하는 메소드 닉네임,채널ID,팔로잉,팔로워,좋아요수,좋아요한수,총비디오수 등을 크롤링 77 | Args: 78 | secret_id (string): 사용자 secret_id(틱톡 서버 내부용) 79 | 80 | Returns: 81 | channel_info[dict]: 채널 정보의 딕셔너리 82 | """ 83 | try: 84 | now = str(time.time()).split('.')[0] 85 | url = 'https://t.tiktok.com/api/post/item_list/?aid=1988' 86 | for key,value in self.base_url.items(): 87 | if key == 'cursor': 88 | url = url + '&' + key + '=' + now + '000' 89 | elif key == 'secUid': 90 | url = url + '&' + key + '=' + secret_id 91 | else: 92 | url = url + '&' + key + '=' + value 93 | except Exception as e: 94 | print(e) 95 | 96 | try: 97 | print(url) 98 | res = requests.get(url) 99 | res = json.loads(str(res.text)) 100 | author_info = res['itemList'][0]['author'] 101 | author_stat = res['itemList'][0]['authorStats'] 102 | channel_info = { 103 | 'channel_id':author_info['id'], 104 | 'channel_user_id':author_info['uniqueId'], 105 | 'channel_secret_id':secret_id, 106 | 'channel_nickname':author_info['nickname'], 107 | 'channel_registered_date':datetime.date.today().isoformat(), 108 | 'channel_crawl_date':datetime.date.today().isoformat(), 109 | 'following_count':author_stat['followingCount'], 110 | 'follower_count':author_stat['followerCount'], 111 | 'heart_count':author_stat['heartCount'], 112 | 'video_count':author_stat['videoCount'], 113 | 'digg_count':author_stat['diggCount'] 114 | } 115 | 116 | return channel_info # dict 117 | except Exception as e: 118 | print("CAN'T GET CHANNEL INFO") 119 | print(e) 120 | 121 | def crawl_video_info(self, secret_id): 122 | """[summary] 123 | 채널이 업로드한 video 목록과 개별 video 들의 정보를 크롤링하는 메서드 124 | 비디오ID,생성날짜,설명,해시태그,좋아요수,공유수,댓글수,재생수 등을 크롤링 125 | Args: 126 | secret_id (string): 사용자 secret_id(틱톡 서버 내부용) 127 | 128 | Returns: 129 | video_info_list(list of dict): 비디오 정보가 담긴 딕셔너리의 리스트 130 | """ 131 | 132 | video_info_list = [] 133 | 134 | #초기 커서 설정 (현재 시점) 135 | now = str(time.time()).split('.')[0] 136 | cursor = now + '000' 137 | 138 | while True: 139 | 140 | url = 'https://t.tiktok.com/api/post/item_list/?aid=1988' 141 | print('CURRENT CURSOR : ', cursor) 142 | for key,value in self.base_url.items(): 143 | if key == 'cursor': 144 | url = url + '&' + key + '=' + cursor 145 | elif key == 'secUid': 146 | url = url + '&' + key + '=' + secret_id 147 | else: 148 | url = url + '&' + key + '=' + value 149 | try: 150 | time.sleep(random.randrange(1, 3) +random.random()) 151 | res = requests.get(url) # randomized requesting interval 152 | res = json.loads(str(res.text)) 153 | for item in res['itemList']: 154 | hash_list = [] 155 | try:# if video does not have hashtag 156 | for hashtag in item['challenges']: 157 | hash_list.append(hashtag['title']) 158 | except: 159 | hash_list.append("None") 160 | except Exception as e: 161 | print(e) 162 | try: 163 | video_info = { 164 | 'video_id' : item['id'], 165 | 'channel_id' : item['author']['id'], 166 | 'video_create_date' : datetime.datetime.fromtimestamp(item['createTime']).date().isoformat(), 167 | 'video_crawl_date' : datetime.date.today().isoformat(), 168 | 'video_description' : item['desc'], 169 | 'video_hashtag': ','.join(hash_list), 170 | 'digg_count':item['stats']['diggCount'], 171 | 'share_count':item['stats']['shareCount'], 172 | 'comment_count':item['stats']['commentCount'], 173 | 'play_count':item['stats']['playCount'], 174 | } 175 | video_info_list.append(video_info) 176 | print(res['hasMore']) 177 | if res['hasMore'] == True: 178 | cursor = res['cursor'] 179 | print('NEXT CURSOR :',cursor) 180 | elif res['hasMore'] == False: 181 | break 182 | except Exception as e: 183 | print(e) 184 | 185 | return video_info_list 186 | 187 | def crawl_hashtag_info(self): 188 | """[summary] 189 | 틱톡의 일간 인기 hashtag 를 크롤링해주는 메소드 190 | 태그이름, 설명, 태그 비디오 수, 태그 시청 수 191 | Returns: 192 | hashtag_info_list(list of dict): 개별 해시태그의 정보(딕셔너리)의 리스트 193 | """ 194 | hashtag_info_list = [] 195 | 196 | url = 'https://t.tiktok.com/api/discover/challenge/?aid=1988' 197 | for key,value in self.base_url.items(): 198 | if key == 'cursor' or key == 'secUid':#hashtag 를 가져오는 url에서 필요없는 key,value 배제 199 | pass 200 | else: 201 | url = url + '&' + key + '=' + value 202 | url = url + '&discoverType=0&needItemList=false&keyWord=&offset=0'#hashtag에 필요한 인자들 추가 203 | 204 | res = requests.get(url) 205 | res = json.loads(res.text) 206 | 207 | for challenge in res['challengeInfoList']: 208 | hashtag_info = { 209 | 'tag_crawl_date' : datetime.date.today().isoformat(), 210 | 'tag_id': challenge['challenge']['id'], 211 | 'tag_title': challenge['challenge']['title'], 212 | 'tag_description': challenge['challenge']['desc'], 213 | 'tag_video_count': challenge['stats']['videoCount'], 214 | 'tag_view_count': challenge['stats']['viewCount'], 215 | } 216 | hashtag_info_list.append(hashtag_info) 217 | return hashtag_info_list 218 | 219 | def upsert_db_channel_info(self, channel_info): 220 | """[summary] 221 | 크롤링한 채널 정보를 DB의 channel_list 테이블에 upsert 해주는 함수 222 | Args: 223 | channel_info (dict): 채널의 정보 224 | """ 225 | 226 | db_engine = create_engine(self.db_connection_info) 227 | 228 | try: 229 | query = ''' 230 | insert into channel_list (channel_id,channel_user_id,channel_secret_id,channel_nickname,channel_registered_date) 231 | values ( %(channel_id)s,%(channel_user_id)s,%(channel_secret_id)s,%(channel_nickname)s,%(channel_registered_date)s) 232 | on conflict (channel_id) DO UPDATE SET channel_user_id = %(channel_user_id)s, channel_nickname = %(channel_nickname)s; 233 | ''' 234 | params = { 235 | 'channel_id':channel_info['channel_id'], 236 | 'channel_user_id':channel_info['channel_user_id'], 237 | 'channel_secret_id':channel_info['channel_secret_id'], 238 | 'channel_nickname':channel_info['channel_nickname'], 239 | 'channel_registered_date':channel_info['channel_registered_date'] 240 | } 241 | db_engine.execute(query,params)## at DB channel_info table update 242 | except Exception as e: 243 | print(e) 244 | try: 245 | query = ''' 246 | insert into channel_info_daily (channel_crawl_date,channel_id,following_count,follower_count,heart_count,digg_count,video_count) 247 | values (%(channel_crawl_date)s,%(channel_id)s,%(following_count)s,%(follower_count)s,%(heart_count)s,%(digg_count)s,%(video_count)s) 248 | on conflict do nothing; 249 | ''' 250 | params = { 251 | 'channel_crawl_date':channel_info['channel_crawl_date'], 252 | 'channel_id':channel_info['channel_id'], 253 | 'following_count':channel_info['following_count'], 254 | 'follower_count':channel_info['follower_count'], 255 | 'heart_count':channel_info['heart_count'], 256 | 'digg_count':channel_info['digg_count'], 257 | 'video_count':channel_info['video_count'] 258 | } 259 | db_engine.execute(query,params)## at DB channel_info table update 260 | except Exception as e: 261 | print(e) 262 | 263 | db_engine.dispose() 264 | print("UPSERT DB CHANNEL INFO& DAILY TABLES DONE") 265 | 266 | def upsert_db_video_info(self, video_info_list): 267 | """[summary] 268 | 크롤링한 비디오 정보를 DB의 video_list 테이블 및 video_info_daily에 upsert 해주는 함수 269 | Args: 270 | video_info_list (list of dict): 개별 비디오의 정보 리스트 271 | """ 272 | db_engine = create_engine(self.db_connection_info) # set DB engine by attribute's db info 273 | 274 | for video_info in video_info_list: 275 | try:# video_list tabel upsert sequence 276 | query = ''' 277 | insert into video_list(video_id,channel_id,video_create_date,video_description,video_hashtag) 278 | values (%(video_id)s,%(channel_id)s,%(video_create_date)s,%(video_description)s,%(video_hashtag)s) 279 | on conflict do nothing; 280 | ''' 281 | params = { 282 | 'video_id' : video_info['video_id'], 283 | 'channel_id' : video_info['channel_id'], 284 | 'video_create_date' : video_info['video_create_date'], 285 | 'video_description' : video_info['video_description'], 286 | 'video_hashtag': video_info['video_hashtag'] 287 | } 288 | db_engine.execute(query,params) 289 | except Exception as e: 290 | print(e) 291 | 292 | try: 293 | query = ''' 294 | insert into video_info_daily(video_crawl_date, video_id, digg_count, share_count,comment_count,play_count) 295 | values (%(video_crawl_date)s, %(video_id)s, %(digg_count)s, %(share_count)s,%(comment_count)s,%(play_count)s) 296 | on conflict (video_crawl_date,video_id) do update set digg_count =%(digg_count)s, share_count = %(share_count)s, comment_count = %(comment_count)s,play_count = %(play_count)s; 297 | ''' 298 | params = { 299 | 'video_crawl_date' : video_info['video_crawl_date'], 300 | 'video_id' : video_info['video_id'], 301 | 'digg_count':video_info['digg_count'], 302 | 'share_count':video_info['share_count'], 303 | 'comment_count':video_info['comment_count'], 304 | 'play_count':video_info['play_count'], 305 | } 306 | db_engine.execute(query,params) 307 | except Exception as e: 308 | print(e) 309 | db_engine.dispose() 310 | print('UPSERT DB VIDEO INFO&DAILY TABLE DONE') 311 | 312 | def upsert_db_hashtag_info(self, hashtag_info_list): 313 | """[summary] 314 | 크롤링한 해시태그 정보를 DB의 hashtag_info_list 테이블에 upsert 해주는 함수 315 | Args: 316 | hashtag_info_list (list of dict): 개별 해시태그 정보의 리스트 317 | """ 318 | 319 | db_engine = create_engine(self.db_connection_info) # set DB engine by attribute's db info 320 | 321 | for hashtag in hashtag_info_list: 322 | try: 323 | query = ''' 324 | insert into hashtag_info_daily(tag_crawl_date, tag_id, tag_title, tag_description,tag_video_count,tag_view_count) 325 | values (%(tag_crawl_date)s, %(tag_id)s, %(tag_title)s, %(tag_description)s,%(tag_video_count)s,%(tag_view_count)s) 326 | on conflict (tag_crawl_date,tag_id) do nothing; 327 | ''' 328 | params = { 329 | 'tag_crawl_date' : hashtag['tag_crawl_date'], 330 | 'tag_id': hashtag['tag_id'], 331 | 'tag_title': hashtag['tag_title'], 332 | 'tag_description': hashtag['tag_description'], 333 | 'tag_video_count': hashtag['tag_video_count'], 334 | 'tag_view_count': hashtag['tag_view_count'], 335 | } 336 | db_engine.execute(query,params) 337 | except Exception as e: 338 | print(e) 339 | db_engine.dispose() 340 | print("UPSERT DB HASHTAG DONE.") 341 | 342 | def start(self): 343 | 344 | secret_id = self.convert_user_id_to_secret_id(self.user_id) 345 | self.secret_id = secret_id 346 | print("CHANNEL SECRET ID : ", secret_id) 347 | 348 | try: # channel & video crawling 349 | channel_info = self.crawl_channel_info(self.secret_id) 350 | video_info_list = self.crawl_video_info(self.secret_id) 351 | self.upsert_db_channel_info(channel_info) 352 | self.upsert_db_video_info(video_info_list) 353 | except Exception as e: 354 | print(e) 355 | 356 | try: # tag crawling 357 | hashtag_info_list = self.crawl_hashtag_info() 358 | self.upsert_db_hashtag_info(hashtag_info_list) 359 | except Exception as e: 360 | print(e) 361 | 362 | if __name__ == "__main__": 363 | 364 | print("INPUT TIKTOK CHANNEL ID : ") 365 | user_id = input() 366 | crawler = TikTokChannelCrawler(user_id) 367 | crawler.start() 368 | print("ALL DONE") 369 | -------------------------------------------------------------------------------- /test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 18, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "url = '''\n", 10 | "https://t.tiktok.com/api/post/item_list/?aid=1988&app_name=tiktok_web&device_platform=web&referer=&root_referer=&user_agent=Mozilla%2F5.0+(Macintosh%3B+Intel+Mac+OS+X+10_15_5)+AppleWebKit%2F537.36+(KHTML,+like+Gecko)+Chrome%2F88.0.4324.146+Safari%2F537.36&cookie_enabled=true&screen_width=2560&screen_height=1440&browser_language=ko-KR&browser_platform=MacIntel&browser_name=Mozilla&browser_version=5.0+(Macintosh%3B+Intel+Mac+OS+X+10_15_5)+AppleWebKit%2F537.36+(KHTML,+like+Gecko)+Chrome%2F88.0.4324.146+Safari%2F537.36&browser_online=true&ac=4g&timezone_name=Asia%2FSeoul&page_referer=https:%2F%2Fwww.tiktok.com%2F%40leesiyoung38%3Flang%3Dko-KR%26is_copy_url%3D1%26is_from_webapp%3Dv1&priority_region=&verifyFp=verify_kkqg77dw_hai6t3Ps_dkIT_41yF_ApMY_G7euFo5nq2xH&appId=1180®ion=KR&appType=t&isAndroid=false&isMobile=true&isIOS=true&OS=mac&did=6925283638187689473&count=30&cursor=1608874137000&secUid=MS4wLjABAAAAqmwhSDkAi7fNy9dgVPn16pUYEl6ArLzVKCQjh2clHqleiAk25sy9AgruSwfrUFON&language=ko-KR\n", 11 | "'''" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 39, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import requests\n", 21 | "from bs4 import BeautifulSoup\n", 22 | "import json" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 45, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "response = requests.get(url)" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 46, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stdout", 41 | "output_type": "stream", 42 | "text": [ 43 | "\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "print(response)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 49, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "ename": "JSONDecodeError", 58 | "evalue": "Expecting value: line 1 column 1 (char 0)", 59 | "output_type": "error", 60 | "traceback": [ 61 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 62 | "\u001b[0;31mJSONDecodeError\u001b[0m Traceback (most recent call last)", 63 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m#response = str(response.text)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mresponse\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mloads\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresponse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 64 | "\u001b[0;32m~/opt/anaconda3/lib/python3.8/json/__init__.py\u001b[0m in \u001b[0;36mloads\u001b[0;34m(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)\u001b[0m\n\u001b[1;32m 355\u001b[0m \u001b[0mparse_int\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mparse_float\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mand\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 356\u001b[0m parse_constant is None and object_pairs_hook is None and not kw):\n\u001b[0;32m--> 357\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_default_decoder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 358\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcls\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 359\u001b[0m \u001b[0mcls\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mJSONDecoder\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 65 | "\u001b[0;32m~/opt/anaconda3/lib/python3.8/json/decoder.py\u001b[0m in \u001b[0;36mdecode\u001b[0;34m(self, s, _w)\u001b[0m\n\u001b[1;32m 335\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 336\u001b[0m \"\"\"\n\u001b[0;32m--> 337\u001b[0;31m \u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mraw_decode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0midx\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0m_w\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 338\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_w\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 339\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 66 | "\u001b[0;32m~/opt/anaconda3/lib/python3.8/json/decoder.py\u001b[0m in \u001b[0;36mraw_decode\u001b[0;34m(self, s, idx)\u001b[0m\n\u001b[1;32m 353\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscan_once\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0midx\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 354\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mStopIteration\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 355\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mJSONDecodeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Expecting value\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 356\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 67 | "\u001b[0;31mJSONDecodeError\u001b[0m: Expecting value: line 1 column 1 (char 0)" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "#response = str(response.text)\n", 73 | "response= json.loads(response)" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 50, 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "ename": "TypeError", 83 | "evalue": "string indices must be integers", 84 | "output_type": "error", 85 | "traceback": [ 86 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 87 | "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", 88 | "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mresponse\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'itemList'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'authorStats'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", 89 | "\u001b[0;31mTypeError\u001b[0m: string indices must be integers" 90 | ] 91 | } 92 | ], 93 | "source": [ 94 | "print(response['itemList'][0]['authorStats'])" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 51, 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | "data": { 104 | "text/plain": [ 105 | "'MS4wLjABAAAAfFdqZCPoOYKe3SvonJn7v2C9UTuw66YoN5EPfaYb-jWuCTx6SGsXecYbbumyo_lH'" 106 | ] 107 | }, 108 | "execution_count": 51, 109 | "metadata": {}, 110 | "output_type": "execute_result" 111 | } 112 | ], 113 | "source": [ 114 | "url = 'https://www.tiktok.com/@yujin__o_o?'\n", 115 | "response =requests.get(url)\n", 116 | "start = response.text.find('secUid')\n", 117 | "response.text[int(start):].split('\"')[2]" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 52, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "data": { 127 | "text/plain": [ 128 | "\"\\n\\n\\n제리유진 (@yujin__o_o) TikTok | 제리유진 님의 최신 TikTok 동영상 시청\\n\\nTikTokLog inFor YouFollowingLog in to follow creators, like videos, and view comments.Log inAboutNewsroomContactCareersByteDanceTikTok for GoodAdvertiseDevelopersTransparencyHelpSafetyTermsPrivacyCreate PortalCommunity GuidelinesCopyrightMore© 2021 TikTokEnglishEnglishالعربيةDeutsch (Deutschland)EspañolSuomi (Suomi)FrançaisBahasa Indonesia (Indonesia)日本語(日本)한국어 (대한민국)Bahasa Melayu (Malaysia)Русскийไทย (ไทย)Türkçe (Türkiye)Tiếng Việt (Việt Nam)繁體中文Afrikaansעברית (ישראל)Basa Jawa (Indonesia)Cebuano (Pilipinas)Čeština (Česká republika)Italiano (Italia)Magyar (Magyarország)Nederlands (Nederland)Polski (Polska)Português (Brasil)Română (Romania)Svenska (Sverige)KiswahiliFilipino (Pilipinas)Ελληνικά (Ελλάδα)isiZuluУкраїнська (Україна)اردوमराठीहिंदीবাঙ্গালি (ভারত)ਪੰਜਾਬੀ (ਭਾਰਤ)ગુજરાતીଓଡିଆதமிழ்తెలుగుಕನ್ನಡമലയാളംမြန်မာ (မြန်မာ)ខ្មែរ (កម្ពុជា)yujin__o_o 제리유진Follow84Following131.7KFollowers497.8KLikes처음처럼 유진✌🏻\\nYoutube: Jerry제리\\ninsta: yujin__o_o\\n😡Don't use my picture😡VideosLiked\"" 129 | ] 130 | }, 131 | "execution_count": 52, 132 | "metadata": {}, 133 | "output_type": "execute_result" 134 | } 135 | ], 136 | "source": [ 137 | "soup = BeautifulSoup(response.text,'html.parser')\n", 138 | "soup.text" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 17, 179 | "metadata": { 180 | "scrolled": true 181 | }, 182 | "outputs": [ 183 | { 184 | "name": "stdout", 185 | "output_type": "stream", 186 | "text": [ 187 | "47\n", 188 | "7.4M\n", 189 | "121.1M\n" 190 | ] 191 | } 192 | ], 193 | "source": [ 194 | "#틱톡 계정의 팔로잉 / 팔로워 / 좋아요 수\n", 195 | "for div in html.select('h2.count-infos strong'):\n", 196 | " print(div)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 20, 209 | "metadata": { 210 | "scrolled": false 211 | }, 212 | "outputs": [ 213 | { 214 | "name": "stdout", 215 | "output_type": "stream", 216 | "text": [ 217 | "
크리에이터를 팔로우하고, 동영상에 \"좋아요\"를 표시하고, 댓글을 보려면 로그인하세요.
More
© 2021 TikTok

한국어 (대한민국)

\"이시영

leesiyoung38

이시영

47팔로잉
7.4M팔로워
121.1M좋아요

이시영_Lee Si Young(actress)🇰🇷\n", 218 | "유튭⬇️⬇️⬇️youtube❤

동영상

좋아요

\n" 219 | ] 220 | } 221 | ], 222 | "source": [ 223 | "for div in html.select('#main > div.jsx-977026505.main-body.page-with-header.middle'):\n", 224 | " print(div)" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 28, 230 | "metadata": {}, 231 | "outputs": [ 232 | { 233 | "data": { 234 | "text/plain": [ 235 | "SplitResult(scheme='https', netloc='t.tiktok.com', path='/api/post/item_list/', query='aid=1988&app_name=tiktok_web&device_platform=web&referer=&root_referer=&user_agent=Mozilla%2F5.0+(Macintosh%3B+Intel+Mac+OS+X+10_15_5)+AppleWebKit%2F537.36+(KHTML,+like+Gecko)+Chrome%2F88.0.4324.146+Safari%2F537.36&cookie_enabled=true&screen_width=2560&screen_height=1440&browser_language=ko-KR&browser_platform=MacIntel&browser_name=Mozilla&browser_version=5.0+(Macintosh%3B+Intel+Mac+OS+X+10_15_5)+AppleWebKit%2F537.36+(KHTML,+like+Gecko)+Chrome%2F88.0.4324.146+Safari%2F537.36&browser_online=true&ac=4g&timezone_name=Asia%2FSeoul&page_referer=https:%2F%2Fwww.tiktok.com%2F%40leesiyoung38%3Flang%3Dko-KR%26is_copy_url%3D1%26is_from_webapp%3Dv1&priority_region=&verifyFp=verify_kkqg77dw_hai6t3Ps_dkIT_41yF_ApMY_G7euFo5nq2xH&appId=1180®ion=KR&appType=t&isAndroid=false&isMobile=false&isIOS=false&OS=mac&did=6925283638187689473&count=30&cursor=1611878446000&secUid=MS4wLjABAAAAqmwhSDkAi7fNy9dgVPn16pUYEl6ArLzVKCQjh2clHqleiAk25sy9AgruSwfrUFON&language=ko-KR', fragment='')" 236 | ] 237 | }, 238 | "execution_count": 28, 239 | "metadata": {}, 240 | "output_type": "execute_result" 241 | } 242 | ], 243 | "source": [ 244 | "from urllib.parse import urlsplit\n", 245 | "url = 'https://t.tiktok.com/api/post/item_list/?aid=1988&app_name=tiktok_web&device_platform=web&referer=&root_referer=&user_agent=Mozilla%2F5.0+(Macintosh%3B+Intel+Mac+OS+X+10_15_5)+AppleWebKit%2F537.36+(KHTML,+like+Gecko)+Chrome%2F88.0.4324.146+Safari%2F537.36&cookie_enabled=true&screen_width=2560&screen_height=1440&browser_language=ko-KR&browser_platform=MacIntel&browser_name=Mozilla&browser_version=5.0+(Macintosh%3B+Intel+Mac+OS+X+10_15_5)+AppleWebKit%2F537.36+(KHTML,+like+Gecko)+Chrome%2F88.0.4324.146+Safari%2F537.36&browser_online=true&ac=4g&timezone_name=Asia%2FSeoul&page_referer=https:%2F%2Fwww.tiktok.com%2F%40leesiyoung38%3Flang%3Dko-KR%26is_copy_url%3D1%26is_from_webapp%3Dv1&priority_region=&verifyFp=verify_kkqg77dw_hai6t3Ps_dkIT_41yF_ApMY_G7euFo5nq2xH&appId=1180®ion=KR&appType=t&isAndroid=false&isMobile=false&isIOS=false&OS=mac&did=6925283638187689473&count=30&cursor=1611878446000&secUid=MS4wLjABAAAAqmwhSDkAi7fNy9dgVPn16pUYEl6ArLzVKCQjh2clHqleiAk25sy9AgruSwfrUFON&language=ko-KR'\n", 246 | "urlsplit(url)" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [] 255 | } 256 | ], 257 | "metadata": { 258 | "kernelspec": { 259 | "display_name": "Python 3", 260 | "language": "python", 261 | "name": "python3" 262 | }, 263 | "language_info": { 264 | "codemirror_mode": { 265 | "name": "ipython", 266 | "version": 3 267 | }, 268 | "file_extension": ".py", 269 | "mimetype": "text/x-python", 270 | "name": "python", 271 | "nbconvert_exporter": "python", 272 | "pygments_lexer": "ipython3", 273 | "version": "3.8.3" 274 | } 275 | }, 276 | "nbformat": 4, 277 | "nbformat_minor": 4 278 | } 279 | --------------------------------------------------------------------------------