├── .gitignore
├── CHANGELOG.md
├── LICENSE
├── README.md
├── fb_graphql_scraper
│   ├── __init__.py
│   ├── base
│   │   ├── __init__.py
│   │   └── base_page.py
│   ├── example.py
│   ├── facebook_graphql_scraper.py
│   ├── pages
│   │   ├── __init__.py
│   │   └── page_optional.py
│   ├── tests
│   │   └── __init__.py
│   └── utils
│       ├── __init__.py
│       ├── locator.py
│       ├── parser.py
│       └── utils.py
├── requirements.txt
└── setup.py
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | **/__pycache__/
2 | .DS_Store
3 | .pypirc
4 | .env
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # Changelog
2 | 
3 | All notable changes to this project will be documented in this file.
4 | This project adheres to [Semantic Versioning](https://semver.org/) and follows the [Keep a Changelog](https://keepachangelog.com/) format.
5 | 
6 | ---
7 | 
8 | ## [1.1.2] - 2025-05-11
9 | 
10 | ### Added
11 | - Introduced `open_browser` parameter in the `FacebookGraphqlScraper` initializer:
12 |   Allows opening the browser for manual Facebook login and easier debugging
13 | - Added `get_posts_image(post_id)` utility function:
14 |   Retrieves embedded post images by parsing the post preview page
15 | 
16 | ### Changed
17 | - Refactored `get_user_posts` function:
18 |   - Restored `display_progress` parameter to improve visibility of scraping progress
19 |   - Mitigated issues with `days_limit` causing restarts from the beginning, improving efficiency
20 | - Redesigned `requests_flow`:
21 |   Switched from using only `before_time` to an alternative fallback method to bypass Facebook's enhanced anti-scraping mechanism
22 | - Modified `base_page.py`:
23 |   Enabled browser mode toggling based on `open_browser` flag
24 | 
25 | ### Fixed
26 | - Fixed premature termination logic in `get_user_posts` that caused incomplete post collection
27 | - Improved debugging experience by providing clearer runtime outputs
28 | 
29 | ---
30 | 
31 | ## [1.1.1] - Previous Version
32 | 
33 | ### Added
34 | - Initial working version of `get_user_posts` function
35 | - Basic GraphQL request flow for Facebook post scraping
36 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) <2024>
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 | 
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Facebook GraphQL Scraper
2 | 
3 | ## Install
4 | 
5 | To install the latest release from PyPI:
6 | 
7 | ```sh
8 | pip install facebook-graphql-scraper
9 | ```
10 | 
11 | ## Requirements
12 | 
13 | ```sh
14 | ipython==8.19.0
15 | pytz==2023.3.post1
16 | selenium_wire==5.1.0
17 | tqdm==4.66.1
18 | ```
19 | 
20 | # Support Me
21 | 
22 | If you enjoy this project and would like to support me, please consider donating 🙌
23 | Your support will help me continue developing this project and working on other exciting ideas!
24 | 
25 | ## 💖 Ways to Support:
26 | 
27 | - **PayPal**: [https://www.paypal.me/faustren1z](https://www.paypal.me/faustren1z)
28 | - **Buy Me a Coffee**: [https://buymeacoffee.com/faustren1z](https://buymeacoffee.com/faustren1z)
29 | 
30 | Thank you for your support!! 🎉
31 | 
32 | ### Usage
33 | 
34 | You can choose between two methods to collect user post data.
35 | - **Please set up the chromedriver path first**
36 | - **Log in with your account credentials**: scrape while logged in to your Facebook account
37 | - **Without logging in**: the scraper clicks the X icon to close the login prompt and scrapes without an account
38 | - **Difference**: for some personal accounts, the user's posts cannot be browsed without logging into a Facebook account.
39 | 
40 | ```python
41 | # -*- coding: utf-8 -*-
42 | from fb_graphql_scraper.facebook_graphql_scraper import FacebookGraphqlScraper as fb_graphql_scraper
43 | 
44 | 
45 | ## Example.1 - without logging in
46 | if __name__ == "__main__":
47 |     facebook_user_name = "love.yuweishao"
48 |     facebook_user_id = "100044253168423"
49 |     days_limit = 100 # Number of days within which to scrape posts
50 |     driver_path = "/Users/hongshangren/Downloads/chromedriver-mac-arm64_136/chromedriver"
51 |     fb_spider = fb_graphql_scraper(driver_path=driver_path, open_browser=False)
52 |     res = fb_spider.get_user_posts(fb_username_or_userid=facebook_user_id, days_limit=days_limit,display_progress=True)
53 |     # print(res)
54 | 
55 | 
56 | ## Example.2 - log in to your Facebook account to collect data
57 | # if __name__ == "__main__":
58 | #     facebook_user_name = "love.yuweishao"
59 | #     facebook_user_id = "100044253168423"
60 | #     fb_account = "facebook_account"
61 | #     fb_pwd = "facebook_password"
62 | #     days_limit = 30 # Number of days within which to scrape posts
63 | #     driver_path = "/Users/hongshangren/Downloads/chromedriver-mac-arm64_136/chromedriver"
64 | #     fb_spider = fb_graphql_scraper(fb_account=fb_account,fb_pwd=fb_pwd, driver_path=driver_path, open_browser=False)
65 | #     res = fb_spider.get_user_posts(fb_username_or_userid=facebook_user_name, days_limit=days_limit,display_progress=True)
66 | #     print(res)
67 | 
68 | 
69 | 
70 | ```
71 | 
72 | ### Optional parameters
73 | 
74 | - **display_progress**:
75 |   A boolean value (`True` or `False`).
76 |   If set to `True`, the scraper will display how many days of posts remain to be collected based on your `days_limit`.
77 |   For example, if `days_limit=180`, it will scrape posts from today back to 180 days ago.
78 |   During the process, the remaining days will be printed and decrease gradually until reaching 0 or below, at which point scraping stops.
79 |   Example output:
80 |   `439 more days of posts to collect.`
81 | 
82 | - **open_browser**:
83 |   If set to `True`, the scraper will launch a browser window.
84 | This allows login-based scraping (if `fb_account` and `fb_pwd` are provided), which may access more content. 85 | However, this mode consumes more memory and **does not guarantee that your Facebook account will avoid being blocked**. 86 | It is also useful for debugging if scraping fails or unexpected behavior occurs. 87 | 88 | - **fb_username_or_userid**: 89 | The Facebook Group ID, Fan Page ID, User ID, or User Name to scrape posts from. 90 | 91 | - **days_limit**: 92 | The number of days of posts to retrieve, counting backwards from today. 93 | 94 | - **fb_account**: 95 | Your Facebook account (Login-based scraping is still under maintenance.) 96 | 97 | - **fb_pwd**: 98 | Your Facebook account password (Login-based scraping is still under maintenance.) 99 | 100 | 101 | 102 | ## Result example 103 | 104 | ```python 105 | {'fb_username_or_userid': '100044253168423', 106 | 'profile': ['任何工作事宜請洽 高先生', 107 | '聯絡信箱:hawa00328@gmail.com', 108 | '聯絡電話:0975-386-266', 109 | 'Page', 110 | ' · 演員', 111 | 'hawa00328@gmail.com', 112 | 'Not yet rated (0 Reviews)', 113 | '\ufeff', 114 | '1,484,829 followers'], 115 | 'data': [{'post_id': '1245565493595211', 116 | 'post_url': 'https://www.facebook.com/1245565493595211', 117 | 'username_or_userid': '100044253168423', 118 | 'owing_profile': {'__typename': 'User', 119 | 'name': '邵雨薇', 120 | 'short_name': '邵雨薇', 121 | 'id': '100044253168423'}, 122 | 'published_date': Timestamp('2025-05-09 09:14:42'), 123 | 'published_date2': '2025-05-09', 124 | 'time': 1746782082, 125 | 'reaction_count.count': 3566, 126 | 'comment_rendering_instance.comments.total_count': 55, 127 | 'share_count.count': 13, 128 | 'sub_reactions': {'讚': 3273, '大心': 283, '加油': 6, '哈': 2, '哇': 2}, 129 | 'context': '溫柔的大貓咪\n緬因貓~~~~~~\n好喜歡❤️❤️❤️', 130 | 'video_view_count': None}, 131 | {'post_id': '1243688160449611', 132 | 'post_url': 'https://www.facebook.com/1243688160449611', 133 | 'username_or_userid': '100044253168423', 134 | 'owing_profile': {'__typename': 'User', 135 | 'name': '邵雨薇', 136 | 'short_name': '邵雨薇', 137 | 'id': '100044253168423'}, 138 | 'published_date': Timestamp('2025-05-06 12:38:46'), 139 | 'published_date2': '2025-05-06', 140 | 'time': 1746535126, 141 | 'reaction_count.count': 3270, 142 | 'comment_rendering_instance.comments.total_count': 59, 143 | 'share_count.count': 22, 144 | 'sub_reactions': {'讚': 2978, '大心': 282, '加油': 8, '哈': 2}, 145 | 'context': '💛', 146 | 'video_view_count': None}, 147 | {'post_id': '1242879413863819', 148 | 'post_url': 'https://www.facebook.com/1242879413863819', 149 | 'username_or_userid': '100044253168423', 150 | 'owing_profile': {'__typename': 'User', 151 | 'name': '邵雨薇', 152 | 'short_name': '邵雨薇', 153 | 'id': '100044253168423'}, 154 | 'published_date': Timestamp('2025-05-05 10:02:32'), 155 | 'published_date2': '2025-05-05', 156 | 'time': 1746439352, 157 | 'reaction_count.count': 3868, 158 | 'comment_rendering_instance.comments.total_count': 55, 159 | 'share_count.count': 28, 160 | 'sub_reactions': {'讚': 3493, '大心': 362, '加油': 9, '哈': 3, '哇': 1}, 161 | 'context': '愛的表達方式有很多,\n真誠言語直接的愛、\n以行動表達溫度的愛,\n又或是充滿美麗魔法的愛! 
\n\n母親節就給媽媽一份加滿心意以及滿滿美麗的禮物吧!\n#潤姬桃子 的愛的魔法\n祝媽媽母親節快樂💗\n\n@uruhime.momoko.official', 162 | 'video_view_count': None}, 163 | {'post_id': '1239140660904361', 164 | 'post_url': 'https://www.facebook.com/1239140660904361', 165 | 'username_or_userid': '100044253168423', 166 | 'owing_profile': {'__typename': 'User', 167 | 'name': '邵雨薇', 168 | 'short_name': '邵雨薇', 169 | 'id': '100044253168423'}, 170 | 'published_date': Timestamp('2025-04-30 09:01:18'), 171 | 'published_date2': '2025-04-30', 172 | 'time': 1746003678, 173 | 'reaction_count.count': 3455, 174 | 'comment_rendering_instance.comments.total_count': 42, 175 | 'share_count.count': 12, 176 | 'sub_reactions': {'讚': 3249, '大心': 199, '哈': 4, '加油': 2, '哇': 1}, 177 | 'context': '紐約碎片。\n\n沒注意到主人在,\n拍完往後轉抖了一大下。\n點點頭🙂\u200d↕️對了主人比個大拇指(意義不明?)', 178 | 'video_view_count': None}, 179 | {'post_id': '1237090651109362', 180 | 'post_url': 'https://www.facebook.com/1237090651109362', 181 | 'username_or_userid': '100044253168423', 182 | 'owing_profile': {'__typename': 'User', 183 | 'name': '邵雨薇', 184 | 'short_name': '邵雨薇', 185 | 'id': '100044253168423'}, 186 | 'published_date': Timestamp('2025-04-27 12:56:19'), 187 | 'published_date2': '2025-04-27', 188 | 'time': 1745758579, 189 | 'reaction_count.count': 4682, 190 | 'comment_rendering_instance.comments.total_count': 25, 191 | 'share_count.count': 12, 192 | 'sub_reactions': {'讚': 4354, '大心': 311, '加油': 11, '哈': 5, '哇': 1}, 193 | 'context': '回家抱老迪 (請自動忽略阿爸)\n迪底撿回來也11年了,希望你也健康幸福。\n希望家人們都平安健康快樂。\n\n#迪底是阿筆的第一個兄弟', 194 | 'video_view_count': None}, 195 | {'post_id': '1236471601171267', 196 | 'post_url': 'https://www.facebook.com/1236471601171267', 197 | 'username_or_userid': '100044253168423', 198 | 'owing_profile': {'__typename': 'User', 199 | 'name': '邵雨薇', 200 | 'short_name': '邵雨薇', 201 | 'id': '100044253168423'}, 202 | 'published_date': Timestamp('2025-04-26 16:23:29'), 203 | 'published_date2': '2025-04-26', 204 | 'time': 1745684609, 205 | 'reaction_count.count': 3004, 206 | 'comment_rendering_instance.comments.total_count': 41, 207 | 'share_count.count': 13, 208 | 'sub_reactions': {'讚': 2789, '大心': 210, '哈': 3, '加油': 2}, 209 | 'context': '剛在坐高鐵時,覺得時間實在是過得太快了。\n還來不及消化感受些什麼,轉頭又得先離開。\n一天當三天用確實感覺很精彩,\n但是不是錯過太多細節了呢? 晚安', 210 | 'video_view_count': None}, 211 | {'post_id': '1235381784613582', 212 | 'post_url': 'https://www.facebook.com/1235381784613582', 213 | 'username_or_userid': '100044253168423', 214 | 'owing_profile': {'__typename': 'User', 215 | 'name': '邵雨薇', 216 | 'short_name': '邵雨薇', 217 | 'id': '100044253168423'}, 218 | 'published_date': Timestamp('2025-04-25 05:49:56'), 219 | 'published_date2': '2025-04-25', 220 | 'time': 1745560196, 221 | 'reaction_count.count': 6846, 222 | 'comment_rendering_instance.comments.total_count': 101, 223 | 'share_count.count': 40, 224 | 'sub_reactions': {'讚': 6405, '大心': 408, '加油': 19, '哈': 14}, 225 | 'context': '偶爾需要遇見一道彩虹,\n雨後剛轉天晴時,就像一個新希望。', 226 | 'video_view_count': None} 227 | ] 228 | } 229 | ``` 230 | 231 | ### Notes 232 | - If you choose to collect data by logging into your account, you may face the risk of your account being blocked, even if this program only scrolls through Facebook web pages. 
233 | - Reaction Categories (EN): [`like`, `haha`, `angry`, `love`, `care`, `wow`, `sad`]
234 | - Reaction Categories (TW): [`讚`, `哈`, `怒`, `大心`, `加油`, `哇`, `嗚`]
235 | 
236 | 
237 | 
238 | 
239 | ## To-Do
240 | 
241 | - Login-based scraping
--------------------------------------------------------------------------------
/fb_graphql_scraper/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FaustRen/facebook-graphql-scraper/a53f1970a7b430170dab83fa2f81e19c4354dfeb/fb_graphql_scraper/__init__.py
--------------------------------------------------------------------------------
/fb_graphql_scraper/base/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FaustRen/facebook-graphql-scraper/a53f1970a7b430170dab83fa2f81e19c4354dfeb/fb_graphql_scraper/base/__init__.py
--------------------------------------------------------------------------------
/fb_graphql_scraper/base/base_page.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | from seleniumwire import webdriver
3 | from selenium.webdriver.chrome.service import Service
4 | 
5 | class BasePage:
6 |     def __init__(self, driver_path: str, open_browser: bool = False):
7 |         chrome_options = self._build_options(open_browser)
8 |         service = Service(driver_path)
9 |         self.driver = webdriver.Chrome(service=service, options=chrome_options)
10 |         self.driver.maximize_window()
11 | 
12 |     @staticmethod
13 |     def _build_options(open_browser: bool) -> webdriver.ChromeOptions:
14 |         options = webdriver.ChromeOptions()
15 |         options.add_argument("--disable-blink-features")
16 |         options.add_argument("--disable-notifications")
17 |         options.add_argument("--disable-blink-features=AutomationControlled")
18 |         if not open_browser:
19 |             options.add_argument("--headless=new")
20 |             options.add_argument("--blink-settings=imagesEnabled=false")
21 |         return options
22 | 
--------------------------------------------------------------------------------
/fb_graphql_scraper/example.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from fb_graphql_scraper.facebook_graphql_scraper import FacebookGraphqlScraper as fb_graphql_scraper
3 | 
4 | 
5 | ## Example.1 - without logging in
6 | if __name__ == "__main__":
7 |     facebook_user_name = "love.yuweishao"
8 |     facebook_user_id = "100044253168423"
9 |     days_limit = 100 # Number of days within which to scrape posts
10 |     driver_path = "/Users/hongshangren/Downloads/chromedriver-mac-arm64_136/chromedriver"
11 |     fb_spider = fb_graphql_scraper(driver_path=driver_path, open_browser=False)
12 |     res = fb_spider.get_user_posts(fb_username_or_userid=facebook_user_id, days_limit=days_limit,display_progress=True)
13 | 
14 | 
15 | ## Example.2 - log in to your Facebook account to collect data
16 | # if __name__ == "__main__":
17 | #     facebook_user_name = "love.yuweishao"
18 | #     facebook_user_id = "100044253168423"
19 | #     fb_account = "facebook_account"
20 | #     fb_pwd = "facebook_password"
21 | #     days_limit = 30 # Number of days within which to scrape posts
22 | #     driver_path = "/Users/hongshangren/Downloads/chromedriver-mac-arm64_136/chromedriver"
23 | #     fb_spider = fb_graphql_scraper(fb_account=fb_account,fb_pwd=fb_pwd, driver_path=driver_path, open_browser=False)
24 | #     res = fb_spider.get_user_posts(fb_username_or_userid=facebook_user_name, days_limit=days_limit,display_progress=True)
25 | # print(res) 26 | 27 | -------------------------------------------------------------------------------- /fb_graphql_scraper/facebook_graphql_scraper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pandas as pd 3 | import time 4 | import json 5 | from bs4 import BeautifulSoup 6 | import requests 7 | from urllib.parse import parse_qs, unquote 8 | from fb_graphql_scraper.base.base_page import BasePage 9 | from fb_graphql_scraper.pages.page_optional import PageOptional 10 | from fb_graphql_scraper.utils.parser import RequestsParser 11 | from fb_graphql_scraper.utils.locator import * 12 | from fb_graphql_scraper.utils.utils import * 13 | 14 | 15 | class FacebookSettings: 16 | """ How to use: 17 | from fb_graphql_scraper.facebook_graphql_scraper import FacebookGraphqlScraper as fb_graphql_scraper 18 | 19 | # >> Example.1 - without logging in 20 | if __name__ == "__main__": 21 | facebook_user_name = "love.yuweishao" 22 | facebook_user_id = "100044253168423" 23 | days_limit = 30 # Number of days within which to scrape posts 24 | driver_path = "/Users/renren/Desktop/FB_graphql_scraper拷貝/fb_graphql_scraper/resources/chromedriver-mac-arm64/chromedriver" 25 | fb_spider = fb_graphql_scraper(driver_path=driver_path) 26 | res = fb_spider.get_user_posts(fb_username_or_userid=facebook_user_name, days_limit=days_limit,display_progress=True) 27 | print(res) 28 | 29 | # >> Example.2 - login in your facebook account to collect data 30 | # if __name__ == "__main__": 31 | # facebook_user_name = "love.yuweishao" 32 | # facebook_user_id = "100044253168423" 33 | # fb_account = "facebook_account" 34 | # fb_pwd = "facebook_paswword" 35 | # days_limit = 30 # Number of days within which to scrape posts 36 | # driver_path = "/Users/renren/Desktop/FB_graphql_scraper拷貝/fb_graphql_scraper/resources/chromedriver-mac-arm64/chromedriver" 37 | # fb_spider = fb_graphql_scraper(fb_account=fb_account,fb_pwd=fb_pwd,driver_path=driver_path) 38 | # res = fb_spider.get_user_posts(fb_username_or_userid=facebook_user_name, days_limit=days_limit,display_progress=True) 39 | # print(res) 40 | """ 41 | def __init__(self, fb_account: str = None, fb_pwd: str = None, driver_path: str = None, open_browser: bool = False): 42 | super().__init__() 43 | self.fb_account = fb_account 44 | self.fb_pwd = fb_pwd 45 | self.driver_path = driver_path 46 | self._set_spider( 47 | driver_path=driver_path, 48 | open_browser=open_browser 49 | ) 50 | self._set_container() 51 | self._set_stop_point() 52 | 53 | def _set_spider(self, driver_path, open_browser): 54 | """Description: Auto login account or click "X" button to continue, 55 | but some accounts cannot display info if you don't login account 56 | Args: url (str): target user which you want to collect data.""" 57 | self.base_page = BasePage( 58 | driver_path=driver_path, 59 | open_browser=open_browser 60 | ) 61 | self.page_optional = PageOptional( 62 | driver=self.base_page.driver, 63 | fb_account=self.fb_account, 64 | fb_pwd=self.fb_pwd 65 | ) 66 | time.sleep(3) 67 | self.requests_parser = RequestsParser(driver=self.page_optional.driver) 68 | 69 | def _set_container(self): 70 | self.post_id_list = [] 71 | self.reaction_count_list = [] 72 | self.profile_feed = [] 73 | self.res = { 74 | "post_caption": [], 75 | "post_date": [], 76 | "post_likes": [], 77 | "comment_share_type": [], 78 | "comment_share_value": [] 79 | } 80 | 81 | def _set_stop_point(self): 82 | self.pre_diff_days = float("-inf") 83 | 
self.counts_of_same_diff_days = 0 84 | 85 | 86 | class FacebookGraphqlScraper(FacebookSettings): 87 | def __init__(self, fb_account: str = None, fb_pwd: str = None, driver_path: str = None, open_browser: bool = False): 88 | super().__init__(fb_account=fb_account, fb_pwd=fb_pwd, driver_path=driver_path,open_browser=open_browser) 89 | 90 | def check_progress(self, days_limit: int = 61, display_progress:bool=True): 91 | """Check the published date of collected posts""" 92 | driver_requests = self.page_optional.driver.requests 93 | tmp_creation_array = [] 94 | # 取得當前頁面最底部貼文 95 | for i in range(len(driver_requests)-1, -1, -1): 96 | req = driver_requests[i] 97 | req_response, req_url = req.response, req.url 98 | body_out = self.requests_parser.get_graphql_body_content( 99 | req_response=req_response, req_url=req_url) 100 | 101 | if body_out: 102 | for each_body in body_out: 103 | json_data = json.loads(each_body) 104 | try: 105 | each_res = json_data['data']['node'].copy() 106 | each_feedback = find_feedback_with_subscription_target_id( 107 | each_res) 108 | if each_feedback: 109 | creation_time = find_creation(json_data) 110 | tmp_creation_array.append(int(creation_time)) 111 | except Exception as e: # 可以直接略過, 表示此graphql內容並非貼文 112 | pass 113 | diff_days = days_difference_from_now( 114 | tmp_creation_array=tmp_creation_array) 115 | if self.pre_diff_days == diff_days: 116 | self.counts_of_same_diff_days += 1 117 | else: 118 | self.counts_of_same_diff_days = 0 119 | self.pre_diff_days = max(diff_days, self.pre_diff_days) 120 | if display_progress: 121 | print(f"To access posts acquired within the past {self.pre_diff_days} days.") # 已取得n日內貼文 122 | return is_date_exceed_limit(max_days_ago=diff_days, days_limit=days_limit) 123 | 124 | def get_profile_feed(self, dict_in:dict={"data-pagelet": "ProfileTilesFeed_0"}): 125 | time.sleep(2) 126 | page_source = (self.page_optional.driver.page_source) 127 | soup = BeautifulSoup(page_source, "html.parser") 128 | target_div = soup.find("div", dict_in) 129 | if target_div: 130 | texts = target_div.find_all(text=True) 131 | return texts[2::] 132 | 133 | def get_plugin_page_followers(self, fb_username_or_userid): 134 | """透過嵌入式貼文取得粉絲專頁追蹤人數""" 135 | plugin_page_url = f"https://www.facebook.com/plugins/page.php?href=https%3A%2F%2Fwww.facebook.com%2F{fb_username_or_userid}&tabs=timeline&width=340&height=500&small_header=false&adapt_container_width=true&hide_cover=false&show_facepile=true&appId&locale=en_us" 136 | plugin_response = requests.get(url=plugin_page_url) 137 | plugin_soup = BeautifulSoup(plugin_response.text, "html.parser") 138 | plugin_soup = plugin_soup.find("div", class_="_1drq") 139 | if not plugin_soup: 140 | return plugin_soup 141 | return plugin_soup.text 142 | 143 | def format_data(self, res_in, fb_username_or_userid, new_reactions): 144 | final_res = pd.json_normalize(res_in) 145 | final_res['context'] = self.requests_parser.context_list 146 | final_res['username_or_userid'] = fb_username_or_userid 147 | final_res['owing_profile'] = self.requests_parser.owning_profile 148 | final_res['sub_reactions'] = new_reactions 149 | final_res['post_url'] = "https://www.facebook.com/" + final_res['post_id'] 150 | final_res['time'] = self.requests_parser.creation_list 151 | final_res['published_date'] = pd.to_datetime(final_res['time'], unit='s') 152 | final_res['published_date2'] = final_res['published_date'].dt.strftime('%Y-%m-%d') 153 | final_res = final_res[[ 154 | 'post_id', 155 | 'post_url', 156 | 'username_or_userid', 157 | 'owing_profile', 158 | 
'published_date', 159 | 'published_date2', 160 | 'time', 161 | 'reaction_count.count', 162 | 'comment_rendering_instance.comments.total_count', 163 | 'share_count.count', 164 | 'sub_reactions', 165 | 'context', 166 | 'video_view_count', 167 | ]].to_dict(orient="records") 168 | filtered_post_id = [] 169 | filtered_data = [] 170 | for each_data in list(final_res): 171 | if each_data["post_id"] not in filtered_post_id: 172 | filtered_data.append(each_data) 173 | filtered_post_id.append(each_data["post_id"]) 174 | return filtered_data 175 | 176 | def process_reactions(self, res_in): 177 | reactions_out = [] 178 | for each_res in res_in: 179 | each_reactions = each_res['top_reactions']['edges'] 180 | processed_reactions = self.requests_parser.process_reactions( 181 | reactions_in=each_reactions) 182 | reactions_out.append(processed_reactions) 183 | return reactions_out 184 | 185 | def get_init_payload(self): 186 | requests_list = self.page_optional.driver.requests 187 | for req in requests_list: 188 | if req.url == "https://www.facebook.com/api/graphql/": 189 | payload = req.body.decode('utf-8') # 解碼成字串 190 | break 191 | first_payload = self.requests_parser.extract_first_payload(payload=payload) 192 | return first_payload 193 | 194 | 195 | def get_user_posts(self, fb_username_or_userid: str, days_limit: int = 61, display_progress:bool=True) -> dict: 196 | url = f"https://www.facebook.com/{fb_username_or_userid}?locale=en_us" # 建立完整user連結 197 | self.page_optional.load_next_page(url=url, clear_limit=20)# driver 跳至該連結 198 | self.page_optional.load_next_page(url=url, clear_limit=20)# 徹底清除requests避免參雜上一用戶資料 199 | self.requests_parser._clean_res() # 清空所有用於儲存結果的array 200 | self._set_container() # 清空用於儲存貼文資訊的array 201 | self._set_stop_point() # 設置/重置停止條件 | 停止條件: 瀏覽器無法往下取得更多貼文(n次) or 已取得目標天數內貼文 202 | 203 | # If you did not login, click X button 204 | if self.fb_account == None: 205 | self.page_optional.click_reject_login_button() 206 | time.sleep(2) 207 | self.page_optional.scroll_window_with_parameter("4000") 208 | for _ in range(30): 209 | try: 210 | init_payload = self.get_init_payload() 211 | payload_variables = init_payload.get("variables") 212 | user_id = str(payload_variables["id"]) 213 | doc_id = str(init_payload.get("doc_id")) 214 | print("Collect posts wihout loggin in.") 215 | break 216 | except Exception as e: 217 | print("Wait 1 second to load page") 218 | time.sleep(1) 219 | 220 | # Get profile information 221 | try: 222 | if self.fb_account == None: 223 | profile_feed = self.get_profile_feed(dict_in={"class": "x1yztbdb"}) 224 | else: 225 | profile_feed = self.get_profile_feed() 226 | 227 | except Exception as e: 228 | try: 229 | if self.fb_account != None: 230 | profile_feed = self.get_profile_feed(dict_in={"class": "x1yztbdb"}) 231 | else: 232 | profile_feed = self.get_profile_feed() 233 | 234 | except Exception as e: 235 | print("Collect profile info failed, profile info will be empty array.") 236 | profile_feed = [] 237 | 238 | if "Page" in profile_feed: 239 | followers = self.get_plugin_page_followers(fb_username_or_userid=fb_username_or_userid) 240 | if followers: profile_feed.append(followers) 241 | 242 | # collect data without login 243 | if self.fb_account == None: 244 | res = self.requests_flow(doc_id = doc_id, fb_username_or_userid=user_id, days_limit=days_limit, profile_feed=profile_feed, display_progress=display_progress) 245 | return res 246 | 247 | # Scroll page 248 | # print("-------------------- Another execute process is started.......... 
--------------------") 249 | counts_of_round = 0 250 | for _ in range(1000): # max rounds of scrolling page 251 | self.page_optional.scroll_window() 252 | if counts_of_round >= 5: # Check progress every 5 times you scroll the page 253 | if display_progress: 254 | print("Check spider progress..") 255 | if self.check_progress(days_limit=days_limit,display_progress=display_progress): 256 | break 257 | # If you find that the published dates 258 | # of the collected posts are on the same day five times in a row, 259 | # it may mean that the page has scrolled to the bottom. 260 | elif self.counts_of_same_diff_days >= 5: 261 | break 262 | else: 263 | counts_of_round = 0 264 | 265 | counts_of_round += 1 266 | pause(0.7) 267 | 268 | # Collect data, extract graphql from driver requests. 269 | driver_requests = self.page_optional.driver.requests 270 | for req in driver_requests: 271 | req_response, req_url = req.response, req.url 272 | body_out = self.requests_parser.get_graphql_body_content( 273 | req_response=req_response, req_url=req_url) 274 | if body_out: 275 | self.requests_parser.parse_body(body_content=body_out) 276 | res_out = self.requests_parser.collect_posts() 277 | new_reactions = self.process_reactions(res_in=res_out) 278 | 279 | # 建立result 280 | final_res = self.format_data( 281 | res_in=res_out, 282 | fb_username_or_userid=fb_username_or_userid, 283 | new_reactions=new_reactions 284 | ) 285 | return { 286 | "fb_username_or_userid": fb_username_or_userid, 287 | "profile": profile_feed, 288 | "data": final_res, 289 | } 290 | 291 | def requests_flow(self, doc_id:str, fb_username_or_userid:str, days_limit:int, profile_feed:list, display_progress=True): 292 | """ 293 | Fetch more posts from a user's Facebook profile using the requests module. 294 | 295 | Flow: 296 | 1. Get the document ID of the target Facebook profile. 297 | 2. Use the requests module to fetch data from the profile. 298 | 3. Continuously fetch data by checking for new posts until the specified days limit is reached. 299 | 300 | Args: 301 | doc_id (str): The document ID of the target Facebook account. 302 | fb_username_or_userid (str): The Facebook username or user ID of the target account. 303 | days_limit (int): The number of days for which to fetch posts (limits the time range of retrieved posts). 304 | profile_feed (list): A list containing the posts retrieved from the target profile. 305 | 306 | Helper Functions: 307 | 1. get_before_time: 308 | Retrieves Facebook posts from a specified time period before the current date. 309 | 310 | 2. get_payload: 311 | Prepares the payload for the next round of requests to the server. 312 | 313 | 3. get_next_page_status: 314 | Checks whether the target Facebook user has more posts available for retrieval. 315 | 316 | 4. compare_timestamp: 317 | Verifies whether a retrieved post falls within the specified time period for collection. 
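        Returns:
            dict: a dict with the keys "fb_username_or_userid", "profile", and "data"
            (the same structure as the dict returned by get_user_posts).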
318 | """ 319 | 320 | url = "https://www.facebook.com/api/graphql/" 321 | before_time = get_before_time() 322 | loop_limit = 5000 323 | is_first_time = True 324 | # Extract data 325 | for i in range(loop_limit): 326 | if is_first_time: 327 | payload_in = get_payload( 328 | doc_id_in=doc_id, 329 | id_in=fb_username_or_userid, 330 | before_time=before_time 331 | ) 332 | is_first_time = False 333 | 334 | # if not the first tiime send request, use function 'get_next_payload' for extracting end cursor to scrape next round 335 | elif not is_first_time: 336 | next_cursor = get_next_cursor(body_content_in=body_content) 337 | payload_in = get_next_payload( 338 | doc_id_in=doc_id, 339 | id_in=fb_username_or_userid, 340 | before_time=before_time, # input before_time 341 | cursor_in=next_cursor 342 | ) 343 | 344 | response = requests.post( 345 | url=url, 346 | data=payload_in, 347 | ) 348 | body = response.content 349 | decoded_body = body.decode("utf-8") 350 | body_content = decoded_body.split("\n") 351 | self.requests_parser.parse_body(body_content=body_content) 352 | 353 | # Check progress 354 | next_page_status = get_next_page_status(body_content=body_content) 355 | 356 | before_time = str(self.requests_parser.creation_list[-1]) 357 | if not next_page_status: 358 | print("There are no more posts.") 359 | break 360 | 361 | # date_object = int(datetime.strptime(before_time, "%Y-%m-%d")) 362 | if compare_timestamp(timestamp=int(before_time), days_limit=days_limit, display_progress=display_progress): 363 | print(f"The scraper has successfully retrieved posts from the past {str(days_limit)} days.") 364 | break 365 | 366 | res_out = self.requests_parser.collect_posts() 367 | new_reactions = self.process_reactions(res_in=res_out) 368 | # create result 369 | final_res = self.format_data( 370 | res_in=res_out, 371 | fb_username_or_userid=fb_username_or_userid, 372 | new_reactions=new_reactions 373 | ) 374 | return { 375 | "fb_username_or_userid": fb_username_or_userid, 376 | "profile": profile_feed, 377 | "data": final_res, 378 | } -------------------------------------------------------------------------------- /fb_graphql_scraper/pages/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FaustRen/facebook-graphql-scraper/a53f1970a7b430170dab83fa2f81e19c4354dfeb/fb_graphql_scraper/pages/__init__.py -------------------------------------------------------------------------------- /fb_graphql_scraper/pages/page_optional.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from fb_graphql_scraper.utils.locator import * 3 | from selenium.webdriver.common.by import By 4 | from selenium.webdriver.support.ui import WebDriverWait 5 | from selenium.webdriver.support import expected_conditions as EC 6 | from selenium.webdriver.common.action_chains import ActionChains 7 | from selenium.webdriver.common.keys import Keys 8 | import time 9 | 10 | 11 | class PageOptional(object): 12 | def __init__(self, driver=None, fb_account: str = None, fb_pwd: str = None): 13 | self.locator = PageLocators 14 | self.xpath_elements = PageXpath 15 | self.class_elements = PageClass 16 | self.page_text = PageText 17 | self.driver = driver 18 | self.fb_account = fb_account 19 | self.fb_pwd = fb_pwd 20 | 21 | # Loggin account 22 | if self.fb_account and self.fb_pwd: 23 | login_page_url = "https://www.facebook.com/login" 24 | self.driver.get(url=login_page_url) 25 | self.login_page() 26 | 27 | def 
login_page(self): 28 | try: 29 | self.login_account(user=self.fb_account, 30 | password=self.fb_pwd, 31 | ) 32 | time.sleep(5) 33 | except Exception as e: 34 | print(f"Login faield, message: {e}") 35 | 36 | def clean_requests(self): 37 | print(f"Before cleaning driver requests, the number of requests are: {len(self.driver.requests)}") 38 | try: 39 | print("Try to clear driver requests..") 40 | del self.driver.requests 41 | print(f"Clear, the number of requests are: {len(self.driver.requests)}") 42 | except Exception as e: 43 | print(f"Clear unsuccessfully, message: {e}") 44 | 45 | def get_in_url(self): 46 | self.driver.get(url=self.url) 47 | 48 | def login_account(self, user: str, password: str): 49 | user_element = self.driver.find_element(By.NAME, "email") 50 | user_element.send_keys(user) 51 | password_element = self.driver.find_element(By.NAME, "pass") 52 | password_element.send_keys(password) 53 | password_element.send_keys(Keys.ENTER) 54 | 55 | def scroll_window(self): 56 | self.driver.execute_script( 57 | "window.scrollTo(0,document.body.scrollHeight)") 58 | 59 | def scroll_window_with_parameter(self, parameter_in: str): 60 | self.driver.execute_script(f"window.scrollBy(0, {parameter_in});") 61 | 62 | def set_browser_zoom_percent(self, zoom_percent: int): 63 | zoom_percent = str(zoom_percent) 64 | self.driver.execute_script( 65 | f"document.body.style.zoom='{zoom_percent}%'") 66 | 67 | def move_to_element(self, element_in): 68 | ActionChains(self.driver).move_to_element(element_in).perform() 69 | 70 | def load_next_page(self, url:str, clear_limit:int=20): 71 | """>> Move on to target facebook user page, 72 | before moving, clean driver's requests first, 73 | or driver would store previous account's data. 74 | Args: url (str): user(kol) links""" 75 | i = 0 76 | while i <= clear_limit: 77 | self.clean_requests() 78 | if len(self.driver.requests) == 0: 79 | print("Clear all driver requests already!") 80 | break 81 | i += 1 82 | self.driver.get(url=url) 83 | 84 | def click_display_button(self): 85 | elements = self.driver.find_elements(self.locator.DISPLAY_MORE) 86 | for _ in range(10): 87 | for each_element in elements: 88 | if each_element.text == self.page_text.DISPLAY_MORE or each_element.text == self.page_text.DISPLAY_MORE2: 89 | self.move_to_element(element_in=each_element) 90 | self.scroll_window_with_parameter(parameter_in="500") 91 | try: 92 | each_element.click() 93 | elements = self.driver.find_elements( 94 | self.locator.DISPLAY_MORE) 95 | except Exception as e: 96 | print( 97 | f"Click display more unsucessfully, error message:\n{e}") 98 | 99 | def click_display_button2(self): 100 | display_more_xpath = f"//div[@class='{PageClass.DISPLAY_MORE}' and @role='{PageRoleValue.DISPLAY_MORE}' and text()='{PageText.DISPLAY_MORE}']" 101 | elements = self.driver.find_elements(By.XPATH, display_more_xpath) 102 | for _ in range(10): 103 | for each_element in elements: 104 | if each_element.text == self.page_text.DISPLAY_MORE or each_element.text == self.page_text.DISPLAY_MORE2: 105 | self.move_to_element(element_in=each_element) 106 | self.scroll_window_with_parameter(parameter_in="500") 107 | try: 108 | each_element.click() 109 | elements = self.driver.find_elements( 110 | self.locator.DISPLAY_MORE) 111 | except Exception as e: 112 | print( 113 | f"Click display more unsucessfully, error message:\n{e}") 114 | 115 | def click_reject_login_button(self): 116 | try: 117 | reject_login_button = WebDriverWait(self.driver, 10).until( 118 | 
EC.visibility_of_element_located((self.locator.CLOSELOGIN))) 119 | reject_login_button.click() 120 | except Exception as e: 121 | print(f"Click reject button failed, message:{e}") 122 | 123 | def quit_driver(self): 124 | self.driver.quit() 125 | 126 | def close_driver(self): 127 | self.driver.close() 128 | -------------------------------------------------------------------------------- /fb_graphql_scraper/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FaustRen/facebook-graphql-scraper/a53f1970a7b430170dab83fa2f81e19c4354dfeb/fb_graphql_scraper/tests/__init__.py -------------------------------------------------------------------------------- /fb_graphql_scraper/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FaustRen/facebook-graphql-scraper/a53f1970a7b430170dab83fa2f81e19c4354dfeb/fb_graphql_scraper/utils/__init__.py -------------------------------------------------------------------------------- /fb_graphql_scraper/utils/locator.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from selenium.webdriver.common.by import By 3 | 4 | 5 | class PageXpath(object): 6 | CLOSE_LOGIN_BUTTON = "/html/body/div[1]/div/div[1]/div/div[5]/div/div/div[1]/div/div[2]/div/div/div/div[1]/div/i" 7 | 8 | 9 | class PageClass(object): 10 | DISPLAY_MORE = "x1i10hfl xjbqb8w x1ejq31n xd10rxx x1sy0etr x17r0tee x972fbf xcfux6l x1qhh985 xm0m39n x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz xt0b8zv xzsf02u x1s688f" 11 | CONTENTS = "x1yztbdb x1n2onr6 xh8yej3 x1ja2u2z" 12 | CAPTION = "x193iq5w xeuugli x13faqbe x1vvkbs x1xmvt09 x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x xudqn12 x3x7a5m x6prxxf xvq8zen xo1l8bm xzsf02u x1yc453h" 13 | CAPTION2 = "xdj266r x11i5rnm xat24cr x1mh8g0r x1vvkbs x126k92a" 14 | POSTDATE = "x1i10hfl xjbqb8w x1ejq31n xd10rxx x1sy0etr x17r0tee x972fbf xcfux6l x1qhh985 xm0m39n x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz x1heor9g xt0b8zv xo1l8bm" 15 | POSTDATE = "x1i10hfl xjbqb8w x1ejq31n xd10rxx x1sy0etr x17r0tee x972fbf xcfux6l x1qhh985 xm0m39n x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz x1heor9g xt0b8zv xo1l8bm" 16 | LIKES = "x6s0dn4 x78zum5 x1iyjqo2 x6ikm8r x10wlt62" 17 | COMMENT_SHARE_PARENTS = "x9f619 x1n2onr6 x1ja2u2z x78zum5 x2lah0s x1qughib x1qjc9v5 xozqiw3 x1q0g3np xykv574 xbmpl8g x4cne27 xifccgj" 18 | COMMENT_SHARE_CHILD = "x193iq5w xeuugli x13faqbe x1vvkbs x1xmvt09 x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x xudqn12 x3x7a5m x6prxxf xvq8zen xo1l8bm xi81zsa" 19 | 20 | 21 | class SoupElement(object): 22 | pass 23 | 24 | 25 | class PageText(object): 26 | DISPLAY_MORE = "查看更多" 27 | DISPLAY_MORE2 = "顯示更多" 28 | 29 | 30 | class PageRoleValue(object): 31 | DISPLAY_MORE = "button" 32 | 33 | 34 | class PageLocators(object): 35 | CLOSELOGIN = ( 36 | By.XPATH, "/html/body/div[1]/div/div[1]/div/div[5]/div/div/div[1]/div/div[2]/div/div/div/div[1]/div/i") 37 | DISPLAY_MORE = ( 38 | By.XPATH, f"//div[@class='{PageClass.DISPLAY_MORE}' and @role='{PageRoleValue.DISPLAY_MORE}' and text()='{PageText.DISPLAY_MORE}']") 39 | 40 | LOGGINUSR1 = ( 41 | 
"/html/body/div[1]/div[1]/div[1]/div/div/div/div[2]/div/div[1]/form/div[1]/div[1]/input" 42 | ) 43 | LOGGINPWD1 = ( 44 | "/html/body/div[1]/div[1]/div[1]/div/div/div/div[2]/div/div[1]/form/div[1]/div[2]/div/input" 45 | ) 46 | 47 | LOGGINUSR2 = ( 48 | "/html/body/div[1]/div/div[1]/div/div[5]/div/div/div[1]/div/div[2]/div/div/div/div[2]/form/div/div[3]/div/label/div/div/input") 49 | LOGGINPWD2 = ( 50 | "/html/body/div[1]/div/div[1]/div/div[5]/div/div/div[1]/div/div[2]/div/div/div/div[2]/form/div/div[4]/div/label/div/div/input") 51 | 52 | # version.3: facebook login page 53 | LOGGINUSR3 = (By.NAME, "email") 54 | LOGGINPWD3 = (By.NAME, "pass") -------------------------------------------------------------------------------- /fb_graphql_scraper/utils/parser.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import pandas as pd 3 | from seleniumwire.utils import decode 4 | import json 5 | from urllib.parse import parse_qs, unquote 6 | from fb_graphql_scraper.utils.utils import * 7 | 8 | 9 | class RequestsParser(object): 10 | def __init__(self, driver) -> None: 11 | self.driver = driver 12 | self.reaction_names = ["讚", "哈", "怒", "大心", "加油", "哇", "嗚"] 13 | self.en_reaction_names = ["like", "haha", "angry", "love", "care", "sorry", "wow"] 14 | 15 | def get_graphql_body_content(self, req_response, req_url): 16 | target_url = "https://www.facebook.com/api/graphql/" 17 | if req_response and req_url == target_url: 18 | response = req_response 19 | body = decode(response.body, response.headers.get( 20 | 'Content-Encoding', 'identity')) 21 | body_content = body.decode("utf-8").split("\n") 22 | return body_content 23 | return None 24 | 25 | def _clean_res(self): 26 | self.res_new = [] 27 | self.feedback_list = [] 28 | self.context_list = [] 29 | self.creation_list = [] 30 | self.author_id_list = [] 31 | self.author_id_list2 = [] 32 | self.owning_profile = [] 33 | 34 | def parse_body(self, body_content): 35 | for each_body in body_content: 36 | json_data = json.loads(each_body) 37 | self.res_new.append(json_data) 38 | try: 39 | each_res = json_data['data']['node'].copy() 40 | each_feedback = find_feedback_with_subscription_target_id( 41 | each_res) 42 | if each_feedback: 43 | self.feedback_list.append(each_feedback) 44 | message_text = find_message_text(json_data) 45 | creation_time = find_creation(json_data) 46 | owing_profile = find_owning_profile(json_data) 47 | if message_text: 48 | self.context_list.append(message_text) 49 | elif not message_text: 50 | self.context_list.append(None) 51 | if creation_time: 52 | self.creation_list.append(creation_time) 53 | self.owning_profile.append(owing_profile) 54 | 55 | # Did not display or record error message at here 56 | except Exception as e: 57 | pass 58 | 59 | def collect_posts(self): 60 | res_out = [] 61 | for each in self.feedback_list: 62 | res_out.append({ 63 | "post_id": each['subscription_target_id'], 64 | "reaction_count": each['reaction_count'], 65 | "top_reactions": each['top_reactions'], 66 | "share_count": each['share_count'], 67 | "comment_rendering_instance": each['comment_rendering_instance'], 68 | "video_view_count": each['video_view_count'] 69 | }) 70 | return res_out 71 | 72 | def convert_res_to_df(self, res_in): 73 | df_res = pd.json_normalize(res_in) 74 | df_res = df_res[[ 75 | 'post_id', 76 | 'reaction_count.count', 77 | 'comment_rendering_instance.comments.total_count', 78 | 'share_count.count', 79 | 'top_reactions.edges', 80 | 'video_view_count' 81 | ]] 82 | return df_res 
83 | 84 | def process_reactions(self, reactions_in) -> dict: 85 | """Extract sub reaction value: 86 | Args: 87 | reactions_in (_type_): _description_ 88 | Returns: 89 | _dict_: { 90 | "like": value, 91 | "haha": value, 92 | "angry": value, 93 | "love": value, 94 | "care": value, 95 | "sorry": value, 96 | "wow": value 97 | } 98 | Note: 99 | """ 100 | reaction_hash = {} 101 | for each_react in reactions_in: 102 | reaction_hash[each_react['node']['localized_name'] 103 | ] = each_react['reaction_count'] # get reaction value 104 | return reaction_hash 105 | 106 | def extract_first_payload(self, payload:str): 107 | parsed_data = parse_qs(payload) 108 | decoded_data = {unquote(k): [unquote(v) for v in vals] for k, vals in parsed_data.items()} # 解碼 keys 和 values 109 | first_payload = {k: v[0] for k, v in decoded_data.items()} # 如果只需要第一個值作為字典中的單一值 110 | payload_variables = json.loads(first_payload['variables']) 111 | first_payload['variables'] = payload_variables 112 | return first_payload -------------------------------------------------------------------------------- /fb_graphql_scraper/utils/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import concurrent.futures as futures 3 | import requests 4 | import re 5 | from bs4 import BeautifulSoup 6 | from datetime import datetime, timedelta 7 | import pytz 8 | import time 9 | import json 10 | 11 | 12 | # if key: 'subscription_target_id' in feedback, store this feedback 13 | def find_feedback_with_subscription_target_id(data): 14 | if isinstance(data, dict): 15 | if 'feedback' in data and isinstance(data['feedback'], dict): 16 | feedback = data['feedback'] 17 | if 'subscription_target_id' in list(feedback.keys()): 18 | return feedback 19 | 20 | # Traverse the values of the dictionary and continue recursively searching 21 | for value in data.values(): 22 | result = find_feedback_with_subscription_target_id(value) 23 | if result: 24 | return result 25 | 26 | # If it is a list, traverse each element in the list and continue recursively searching 27 | elif isinstance(data, list): 28 | for item in data: 29 | result = find_feedback_with_subscription_target_id(item) 30 | if result: 31 | return result 32 | 33 | # If no matching feedback is found, return None 34 | return None 35 | 36 | 37 | def find_message_text(data): 38 | if isinstance(data, dict): 39 | # type is dict,check 'story' key 40 | if 'story' in data: 41 | # if key 'story's value type is dict, and include 'message' key 42 | if isinstance(data['story'], dict) and 'message' in data['story']: 43 | # if key 'message's value type is dict, and include 'text' key 44 | if isinstance(data['story']['message'], dict) and 'text' in data['story']['message']: 45 | # return 'text' key 46 | return data['story']['message']['text'] 47 | 48 | # recursively check each value in dict if can not find anything 49 | for value in data.values(): 50 | result = find_message_text(value) 51 | if result: 52 | return result 53 | elif isinstance(data, list): 54 | # if array, check each element recursively 55 | for item in data: 56 | result = find_message_text(item) 57 | if result: 58 | return result 59 | # 如果沒有符合條件的值,return None 60 | return None 61 | 62 | 63 | def find_creation(data): 64 | if isinstance(data, dict): 65 | # If it's a dictionary, check if it contains the 'story' key 66 | if 'story' in data: 67 | # If the value of the 'story' key is a dictionary and contains the 'creation_time' key 68 | if isinstance(data['story'], dict) and 'creation_time' in 
data['story']: 69 | # Return the value of the 'creation_time' key 70 | return data['story']['creation_time'] 71 | 72 | # If no matching condition is found, recursively check each value in the dictionary 73 | for value in data.values(): 74 | result = find_creation(value) 75 | if result: 76 | return result 77 | 78 | elif isinstance(data, list): 79 | # If it's a list, recursively check each element in the list 80 | for item in data: 81 | result = find_creation(item) 82 | if result: 83 | return result 84 | # If no matching condition is found, return None 85 | return None 86 | 87 | 88 | def find_actors(data): 89 | if isinstance(data, dict): 90 | # If it's a dictionary, check if it contains the 'story' key 91 | if 'story' in data: 92 | # If the value of the 'story' key is a dictionary and contains the 'actors' key 93 | if isinstance(data['story'], dict) and 'actors' in data['story']: 94 | # Return the value of the 'id' key under 'actors' 95 | return data['story']['actors']['id'] 96 | 97 | # If no matching condition is found, recursively check each value in the dictionary 98 | for value in data.values(): 99 | result = find_actors(value) 100 | if result: 101 | return result 102 | 103 | elif isinstance(data, list): 104 | # If it's a list, recursively check each element in the list 105 | for item in data: 106 | result = find_actors(item) 107 | if result: 108 | return result 109 | # If no matching condition is found, return None 110 | return None 111 | 112 | 113 | def find_owning_profile(data): 114 | if isinstance(data, dict): 115 | # If it's a dictionary, check if it contains the 'story' key 116 | if 'owning_profile' in data: 117 | # If the value of the 'story' key is a dictionary and contains the 'actors' key 118 | if isinstance(data['owning_profile'], dict): 119 | # Return the value of the 'id' key under 'actors' 120 | return data['owning_profile'] 121 | 122 | # If no matching condition is found, recursively check each value in the dictionary 123 | for value in data.values(): 124 | result = find_owning_profile(value) 125 | if result: 126 | return result 127 | 128 | elif isinstance(data, list): 129 | # If it's a list, recursively check each element in the list 130 | for item in data: 131 | result = find_owning_profile(item) 132 | if result: 133 | return result 134 | # If no matching condition is found, return None 135 | return None 136 | 137 | 138 | def timeout(timelimit): 139 | def decorator(func): 140 | def decorated(*args, **kwargs): 141 | with futures.ThreadPoolExecutor(max_workers=1) as executor: 142 | future = executor.submit(func, *args, **kwargs) 143 | try: 144 | result = future.result(timelimit) 145 | except futures.TimeoutError: 146 | print('Time out!') 147 | raise TimeoutError from None 148 | else: 149 | pass 150 | executor._threads.clear() 151 | futures.thread._threads_queues.clear() 152 | return result 153 | return decorated 154 | return decorator 155 | 156 | 157 | def get_current_time(timezone="Asia/Taipei"): 158 | current_time_utc = datetime.utcnow() 159 | target_timezone = pytz.timezone(timezone) 160 | target_current_time = current_time_utc.replace( 161 | tzinfo=pytz.utc).astimezone(target_timezone) 162 | return target_current_time 163 | 164 | 165 | def days_difference_from_now(tmp_creation_array: list) -> int: 166 | """計算第一次發文日期與當前日間隔天數 167 | 168 | Args: 169 | tmp_creation_array (list): _description_ 170 | 171 | Returns: 172 | int: 間隔天數 173 | """ 174 | timestamp = min(tmp_creation_array) 175 | current_date_time = datetime.now() 176 | date_time_obj = 
datetime.fromtimestamp(timestamp) 177 | difference = current_date_time - date_time_obj 178 | return difference.days 179 | 180 | 181 | def is_date_exceed_limit(max_days_ago, days_limit: int = 61): 182 | if max_days_ago > days_limit: 183 | return True 184 | return False 185 | 186 | def pause(pause_time: int = 1): 187 | time.sleep(pause_time) 188 | 189 | 190 | def get_payload(doc_id_in: str, id_in: str, before_time: str = None): 191 | variables_dict = { 192 | "afterTime": None, 193 | "beforeTime": before_time, 194 | "count": 3, 195 | "cursor": None, 196 | "feedLocation": "TIMELINE", 197 | "feedbackSource": 0, 198 | "focusCommentID": None, 199 | "memorializedSplitTimeFilter": None, 200 | "omitPinnedPost": True, 201 | "postedBy": {"group": "OWNER"}, 202 | "privacy": {"exclusivity": "INCLUSIVE", "filter": "ALL"}, 203 | "privacySelectorRenderLocation": "COMET_STREAM", 204 | "renderLocation": "timeline", 205 | "scale": 3, 206 | "stream_count": 1, 207 | "taggedInOnly": False, 208 | "useDefaultActor": False, 209 | "id": id_in, 210 | "__relay_internal__pv__CometImmersivePhotoCanUserDisable3DMotionrelayprovider": False, 211 | "__relay_internal__pv__IsWorkUserrelayprovider": False, 212 | "__relay_internal__pv__IsMergQAPollsrelayprovider": False, 213 | "__relay_internal__pv__CometUFIReactionsEnableShortNamerelayprovider": False, 214 | "__relay_internal__pv__CometUFIShareActionMigrationrelayprovider": False, 215 | "__relay_internal__pv__StoriesArmadilloReplyEnabledrelayprovider": False, 216 | "__relay_internal__pv__StoriesTrayShouldShowMetadatarelayprovider": False, 217 | "__relay_internal__pv__StoriesRingrelayprovider": False, 218 | "__relay_internal__pv__EventCometCardImage_prefetchEventImagerelayprovider": False 219 | } 220 | 221 | payload_out = { 222 | "variables": json.dumps(variables_dict), 223 | "doc_id": doc_id_in 224 | } 225 | return payload_out 226 | 227 | def get_next_payload( 228 | doc_id_in:str, 229 | id_in:str, 230 | before_time:str, 231 | cursor_in:str 232 | ): 233 | variables_dict = { 234 | "afterTime": None, 235 | "beforeTime": before_time, 236 | "count": 3, 237 | "cursor": cursor_in, 238 | "feedLocation": "TIMELINE", 239 | "feedbackSource": 0, 240 | "focusCommentID": None, 241 | "memorializedSplitTimeFilter": None, 242 | "omitPinnedPost": True, 243 | "postedBy": {"group": "OWNER"}, 244 | "privacy": {"exclusivity": "INCLUSIVE", "filter": "ALL"}, 245 | "privacySelectorRenderLocation": "COMET_STREAM", 246 | "renderLocation": "timeline", 247 | "scale": 3, 248 | "stream_count": 1, 249 | "taggedInOnly": False, 250 | "useDefaultActor": False, 251 | "id": id_in, 252 | "__relay_internal__pv__CometImmersivePhotoCanUserDisable3DMotionrelayprovider": False, 253 | "__relay_internal__pv__IsWorkUserrelayprovider": False, 254 | "__relay_internal__pv__IsMergQAPollsrelayprovider": False, 255 | "__relay_internal__pv__CometUFIReactionsEnableShortNamerelayprovider": False, 256 | "__relay_internal__pv__CometUFIShareActionMigrationrelayprovider": False, 257 | "__relay_internal__pv__StoriesArmadilloReplyEnabledrelayprovider": False, 258 | "__relay_internal__pv__StoriesTrayShouldShowMetadatarelayprovider": False, 259 | "__relay_internal__pv__StoriesRingrelayprovider": False, 260 | "__relay_internal__pv__EventCometCardImage_prefetchEventImagerelayprovider": False 261 | } 262 | payload_out = { 263 | "variables": json.dumps(variables_dict), 264 | "doc_id": doc_id_in 265 | } 266 | return payload_out 267 | 268 | def get_next_cursor(body_content_in): 269 | for i in range(len(body_content_in)-1, -1, -1): 270 | try: 
271 | json_tail = json.loads(body_content_in[i]) 272 | nex_cursor = json_tail.get("data").get( 273 | "page_info").get("end_cursor") 274 | return nex_cursor 275 | except AttributeError: 276 | pass 277 | 278 | def get_next_page_status(body_content): 279 | for each_body in body_content: 280 | try: 281 | tmp_json = json.loads(each_body) 282 | next_page_status = tmp_json.get("data").get( 283 | "page_info").get("has_next_page") 284 | return next_page_status 285 | except Exception as e: 286 | pass 287 | return True # sometimes, scraper can not collect API's "has_next" info, Program choose return True, I will improve this step in the near future. 288 | 289 | 290 | def compare_timestamp(timestamp: int, days_limit: int, display_progress: bool) -> bool: 291 | timestamp_date = datetime.utcfromtimestamp(timestamp).date() 292 | current_date = datetime.utcnow().date() 293 | past_date = current_date - timedelta(days=days_limit) 294 | if display_progress: 295 | days_remaining = (timestamp_date - past_date).days 296 | if days_remaining > 0: 297 | print(f"{days_remaining} more days of posts to collect.") 298 | else: 299 | print("Target days reached or exceeded.") 300 | return timestamp_date < past_date 301 | 302 | 303 | def get_before_time(time_zone='Asia/Taipei'): 304 | location_tz = pytz.timezone(time_zone) 305 | current_time = datetime.now(location_tz) 306 | timestamp = str(int(current_time.timestamp())) 307 | return timestamp 308 | 309 | def get_posts_image(post_id:str): 310 | url = f"https://www.facebook.com/plugins/post.php?href=https%3A%2F%2Fwww.facebook.com%2Ftoolbox003%2Fposts%2F{post_id}&show_text=true&width=800" 311 | """You can check out the content through the link 312 | to better understand what I'm talking about haha""" 313 | response = requests.get(url=url) 314 | response.status_code 315 | soup = BeautifulSoup(response.text, "html.parser") 316 | pattern = re.compile(r"^https://scontent") 317 | all_src_links = [tag['src'] for tag in soup.find_all(src=pattern)] 318 | return all_src_links -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ipython==8.19.0 2 | pytz==2023.3.post1 3 | selenium_wire==5.1.0 4 | tqdm==4.66.1 5 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import setuptools 3 | from setuptools import setup, find_packages 4 | 5 | setup( 6 | name='facebook-graphql-scraper', 7 | version='1.1.2', 8 | packages=[ 9 | "fb_graphql_scraper", 10 | "fb_graphql_scraper.pages", 11 | "fb_graphql_scraper.base", 12 | "fb_graphql_scraper.tests", 13 | "fb_graphql_scraper.utils", 14 | ], 15 | license='MIT', 16 | description='Implement Facebook scraper for post data retrieval', 17 | long_description=open('README.md').read(), 18 | long_description_content_type='text/markdown', 19 | author='FaustRen', 20 | author_email='faustren1z@gmail.com', 21 | url='https://github.com/FaustRen/FB_graphql_scraper', 22 | classifiers=[ 23 | "Programming Language :: Python :: 3.11", 24 | "License :: OSI Approved :: MIT License", 25 | "Operating System :: OS Independent", 26 | ], 27 | python_requires='>=3.11', 28 | ) --------------------------------------------------------------------------------
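As a closing note on consuming the scraper's output: the `data` list returned by `get_user_posts` (see the result example in the README above) is a flat list with one dict per post, so it can be handed directly to pandas for further analysis or export. A minimal sketch, assuming `res` holds the return value of `get_user_posts` and that pandas is installed:

```python
import pandas as pd

# res is assumed to be the dict returned by fb_spider.get_user_posts(...)
posts = pd.DataFrame(res["data"])   # one row per post, columns as shown in the README result example
posts = posts.sort_values("time")   # "time" is a unix timestamp in seconds, so this orders posts oldest-first
posts.to_csv(f"{res['fb_username_or_userid']}_posts.csv", index=False)
```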