├── .gitignore
├── FacebookPostsScraper.py
├── LICENSE
├── README.md
├── main.py
└── requirements.txt

/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 | 
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 | 
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 | 
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 | 
54 | # Translations
55 | *.mo
56 | *.pot
57 | 
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 | 
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 | 
68 | # Scrapy stuff:
69 | .scrapy
70 | 
71 | # Sphinx documentation
72 | docs/_build/
73 | 
74 | # PyBuilder
75 | target/
76 | 
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 | 
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 | 
84 | # pyenv
85 | .python-version
86 | 
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 | 
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 | 
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 
--------------------------------------------------------------------------------
/FacebookPostsScraper.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import pickle
4 | import os
5 | from urllib.parse import urlparse, unquote
6 | from urllib.parse import parse_qs
7 | import pandas as pd
8 | import json
9 | 
10 | 
11 | class FacebookPostsScraper:
12 | 
13 |     # We need the email and password to access Facebook, and optionally the text in the Url that identifies the "view full post".
14 |     def __init__(self, email, password, post_url_text='Full Story'):
15 |         self.email = email
16 |         self.password = password
17 |         self.headers = {  # This is the important part: the Nokia C3 User Agent
18 |             'User-Agent': 'NokiaC3-00/5.0 (07.20) Profile/MIDP-2.1 Configuration/CLDC-1.1 Mozilla/5.0 AppleWebKit/420+ (KHTML, like Gecko) Safari/420+'
19 |         }
20 |         self.session = requests.session()  # Create the session for the next requests
21 |         self.cookies_path = 'session_facebook.cki'  # Name of the file used to store the session cookies.
22 | 
23 |         # At some point we need to find the link text that points to the full post. In my case Facebook is in
24 |         # English, which is why it says 'Full Story'; change this to match your language.
25 |         # Some translations:
26 |         # - English: 'Full Story'
27 |         # - Spanish: 'Historia completa'
28 |         self.post_url_text = post_url_text
29 | 
30 |         # If no cookie file exists yet, we make the login request to Facebook;
31 |         # otherwise we just load the stored cookies to reuse the previous session.
32 |         if self.new_session():
33 |             self.login()
34 | 
35 |         self.posts = []  # Store the scraped posts
36 | 
37 |     # Check whether we already have a saved session or need to log in to Facebook
38 |     def new_session(self):
39 |         if not os.path.exists(self.cookies_path):
40 |             return True
41 | 
42 |         with open(self.cookies_path, 'rb') as f:
43 |             cookies = pickle.load(f)
44 |         self.session.cookies = cookies
45 |         return False
46 | 
47 |     # Utility function to make the requests and convert the response to a soup object if necessary
48 |     def make_request(self, url, method='GET', data=None, is_soup=True):
49 |         if len(url) == 0:
50 |             raise Exception('Empty Url')
51 | 
52 |         if method == 'GET':
53 |             resp = self.session.get(url, headers=self.headers)
54 |         elif method == 'POST':
55 |             resp = self.session.post(url, headers=self.headers, data=data)
56 |         else:
57 |             raise Exception(f'Method [{method}] Not Supported')
58 | 
59 |         if resp.status_code != 200:
60 |             raise Exception(f'Error [{resp.status_code}] > {url}')
61 | 
62 |         if is_soup:
63 |             return BeautifulSoup(resp.text, 'lxml')
64 |         return resp
65 | 
66 |     # The first time, we log in
67 |     def login(self):
68 |         # Get the HTML content of the mobile Facebook login page
69 |         url_home = "https://m.facebook.com/"
70 |         soup = self.make_request(url_home)
71 |         if soup is None:
72 |             raise Exception("Couldn't load the Login Page")
73 | 
74 |         # Here we need to extract these tokens from the login page
75 |         lsd = soup.find("input", {"name": "lsd"}).get("value")
76 |         jazoest = soup.find("input", {"name": "jazoest"}).get("value")
77 |         m_ts = soup.find("input", {"name": "m_ts"}).get("value")
78 |         li = soup.find("input", {"name": "li"}).get("value")
79 |         try_number = soup.find("input", {"name": "try_number"}).get("value")
80 |         unrecognized_tries = soup.find("input", {"name": "unrecognized_tries"}).get("value")
81 | 
82 |         # This is the url where the login params are sent to Facebook
83 |         url_login = "https://m.facebook.com/login/device-based/regular/login/?refsrc=https%3A%2F%2Fm.facebook.com%2F&lwv=100&refid=8"
84 |         payload = {
85 |             "lsd": lsd,
86 |             "jazoest": jazoest,
87 |             "m_ts": m_ts,
88 |             "li": li,
89 |             "try_number": try_number,
90 |             "unrecognized_tries": unrecognized_tries,
91 |             "email": self.email,
92 |             "pass": self.password,
93 |             "login": "Iniciar sesión",
94 |             "prefill_contact_point": "",
95 |             "prefill_source": "",
96 |             "prefill_type": "",
97 |             "first_prefill_source": "",
98 |             "first_prefill_type": "",
99 |             "had_cp_prefilled": "false",
100 |             "had_password_prefilled": "false",
"false", 101 | "is_smart_lock": "false", 102 | "_fb_noscript": "true" 103 | } 104 | soup = self.make_request(url_login, method='POST', data=payload, is_soup=True) 105 | if soup is None: 106 | raise Exception(f"The login request couldn't be made: {url_login}") 107 | 108 | redirect = soup.select_one('a') 109 | if not redirect: 110 | raise Exception("Please log in desktop/mobile Facebook and change your password") 111 | 112 | url_redirect = redirect.get('href', '') 113 | resp = self.make_request(url_redirect) 114 | if resp is None: 115 | raise Exception(f"The login request couldn't be made: {url_redirect}") 116 | 117 | # Finally we get the cookies from the session and save it in a file for future usage 118 | cookies = self.session.cookies 119 | f = open(self.cookies_path, 'wb') 120 | pickle.dump(cookies, f) 121 | 122 | return {'code': 200} 123 | 124 | # Scrap a list of profiles 125 | def get_posts_from_list(self, profiles): 126 | data = [] 127 | n = len(profiles) 128 | 129 | for idx in range(n): 130 | profile = profiles[idx] 131 | print(f'{idx + 1}/{n}. {profile}') 132 | 133 | posts = self.get_posts_from_profile(profile) 134 | data.append(posts) 135 | 136 | return data 137 | 138 | # This is the extraction point! 139 | def get_posts_from_profile(self, url_profile): 140 | # Prepare the Url to point to the posts feed 141 | if "www." in url_profile: url_profile = url_profile.replace('www.', 'm.') 142 | if 'v=timeline' not in url_profile: 143 | if '?' in url_profile: 144 | url_profile = f'{url_profile}&v=timeline' 145 | else: 146 | url_profile = f'{url_profile}?v=timeline' 147 | 148 | is_group = '/groups/' in url_profile 149 | 150 | # Make a simple GET request 151 | soup = self.make_request(url_profile) 152 | if soup is None: 153 | print(f"Couldn't load the Page: {url_profile}") 154 | return [] 155 | 156 | # Now the extraction... 157 | css_profile = '.storyStream > div' # Select the posts from a user profile 158 | css_page = '#recent > div > div > div' # Select the posts from a Facebook page 159 | css_group = '#m_group_stories_container > div > div' # Select the posts from a Facebook group 160 | raw_data = soup.select(f'{css_profile} , {css_page} , {css_group}') # Now join and scrape it 161 | posts = [] 162 | for item in raw_data: # Now, for every post... 163 | published = item.select_one('abbr') # Get the formatted datetime of published 164 | description = item.select('p') # Get list of all p tag, they compose the description 165 | images = item.select('a > img') # Get list of all images 166 | _external_links = item.select('p a') # Get list of any link in the description, this are external links 167 | post_url = item.find('a', text=self.post_url_text) # Get the url to point this post. 168 | like_url = item.find('a', text='Like') # Get the Like url. 
169 | 
170 |             # Clean the publish date
171 |             if published is not None:
172 |                 published = published.get_text()
173 |             else:
174 |                 published = ''
175 | 
176 |             # Join all the text in p tags, else set an empty string
177 |             if len(description) > 0:
178 |                 description = '\n'.join([d.get_text() for d in description])
179 |             else:
180 |                 description = ''
181 | 
182 |             # Get all the image links
183 |             images = [image.get('src', '') for image in images]
184 | 
185 |             # Clean the post link
186 |             if post_url is not None:
187 |                 post_url = post_url.get('href', '')
188 |                 if len(post_url) > 0:
189 |                     post_url = f'https://www.facebook.com{post_url}'
190 |                     p_url = urlparse(post_url)
191 |                     qs = parse_qs(p_url.query)
192 |                     if not is_group:
193 |                         post_url = f'{p_url.scheme}://{p_url.hostname}{p_url.path}?story_fbid={qs["story_fbid"][0]}&id={qs["id"][0]}'
194 |                     else:
195 |                         post_url = f'{p_url.scheme}://{p_url.hostname}{p_url.path}/permalink/{qs["id"][0]}/'
196 |             else:
197 |                 post_url = ''
198 | 
199 |             # Clean the Like link
200 |             if like_url is not None:
201 |                 like_url = like_url.get('href', '')
202 |                 if len(like_url) > 0:
203 |                     like_url = f'https://m.facebook.com{like_url}'
204 |             else:
205 |                 like_url = ''
206 | 
207 |             # Get the list of external links in the post description, if any
208 |             external_links = []
209 |             for link in _external_links:
210 |                 link = link.get('href', '')
211 |                 try:
212 |                     a = link.index("u=") + 2
213 |                     z = link.index("&h=")
214 |                     link = unquote(link[a:z])
215 |                     link = link.split("?fbclid=")[0]
216 |                     external_links.append(link)
217 |                 except ValueError:
218 |                     continue
219 |             post = {'published': published, 'description': description, 'images': images,
220 |                     'post_url': post_url, 'external_links': external_links, 'like_url': like_url}
221 |             posts.append(post)
222 |             self.posts.append(post)
223 |         return posts
224 | 
225 |     def posts_to_csv(self, filename):
226 |         if not filename.endswith('.csv'):
227 |             filename = f'{filename}.csv'
228 | 
229 |         df = pd.DataFrame(self.posts)
230 |         df.to_csv(filename)
231 | 
232 |     def posts_to_excel(self, filename):
233 |         if not filename.endswith('.xlsx'):
234 |             filename = f'{filename}.xlsx'
235 | 
236 |         df = pd.DataFrame(self.posts)
237 |         df.to_excel(filename)
238 | 
239 |     def posts_to_json(self, filename):
240 |         if not filename.endswith('.json'):
241 |             filename = f'{filename}.json'
242 | 
243 |         # Dump the whole list at once so the output is valid JSON; writing entries
244 |         # one by one left a trailing comma before the closing bracket.
245 |         with open(filename, 'w') as f:
246 |             json.dump(self.posts, f)
247 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2020 Maxwell Smith
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # FacebookPostsScraper
2 | 
3 | Scraper for posts in Facebook user profiles, pages and groups.
4 | 
5 | Extracts a list of dicts with:
6 | 
7 | | params         | description |
8 | | -------------- | ----------- |
9 | | published      | Formatted datetime of publication |
10 | | description    | Post text content |
11 | | images         | List of images in the post |
12 | | post_url       | The unique post url |
13 | | external_links | External links found in the description |
14 | | like_url       | The Like url |
15 | 
16 | ## Installation
17 | 
18 | 1. Get [Python](https://www.python.org/downloads/) (Python 3.7+ recommended)
19 | 
20 | 2. Clone or download this repository
21 | 
22 | ```shell script
23 | git clone https://github.com/adeoy/FacebookPostsScraper.git
24 | ```
25 | 
26 | 3. Install the Python requirements
27 | 
28 | ```shell script
29 | pip install -r requirements.txt
30 | ```
31 | 
32 | 4. Follow the examples.
33 | 
34 | ## Description
35 | 
36 | ### The FacebookPostsScraper Class
37 | 
38 | Constructor params:
39 | 
40 | | param         | description |
41 | | ------------- | ----------- |
42 | | email         | Your email to access Facebook |
43 | | password      | Your password to access Facebook |
44 | | post_url_text | The text of the link that opens a post in the mobile version; set it when your Facebook isn't in English |
45 | 
46 | When the object is instantiated it logs in automatically, sets up the session and saves a cookie file for future use.
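The session is stored in a cookie file named `session_facebook.cki` next to the script; delete that file to force a fresh login on the next run. Below is a minimal sketch of a non-English setup (the credentials are placeholders, and the Spanish link text is taken from the translations listed in the class):

```python
from FacebookPostsScraper import FacebookPostsScraper as Fps

# Placeholder credentials -- replace with your own.
email = 'YOUR_EMAIL'
password = 'YOUR_PASSWORD'

# If your Facebook UI is in Spanish, the link that opens a post says
# 'Historia completa' instead of 'Full Story', so pass it explicitly.
fps = Fps(email, password, post_url_text='Historia completa')

# The first run logs in and writes session_facebook.cki; later runs find the
# cookie file and reuse the stored session instead of logging in again.
```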
47 | 
48 | ### Methods
49 | 
50 | `get_posts_from_profile`
51 | 
52 | params:
53 | 
54 | - profile url
55 | 
56 | return:
57 | 
58 | - list of dicts with the data described above
59 | 
60 | `get_posts_from_list`
61 | 
62 | params:
63 | 
64 | - list of profile urls
65 | 
66 | return:
67 | 
68 | - list of lists of dicts with the data described above
69 | 
70 | ## Examples
71 | 
72 | ### Example with a single url
73 | 
74 | ```python
75 | from FacebookPostsScraper import FacebookPostsScraper as Fps
76 | from pprint import pprint as pp
77 | 
78 | # Enter your Facebook email and password
79 | email = 'YOUR_EMAIL'
80 | password = 'YOUR_PASSWORD'
81 | 
82 | # Instantiate an object
83 | fps = Fps(email, password, post_url_text='Full Story')
84 | 
85 | # Example with a single profile
86 | single_profile = 'https://www.facebook.com/BillGates'
87 | data = fps.get_posts_from_profile(single_profile)
88 | pp(data)
89 | 
90 | fps.posts_to_csv('my_posts')  # You can export the posts as a CSV document
91 | # fps.posts_to_excel('my_posts')  # You can export the posts as an Excel document
92 | # fps.posts_to_json('my_posts')  # You can export the posts as a JSON document
93 | ```
94 | 
95 | ### Example with multiple urls
96 | 
97 | ```python
98 | from FacebookPostsScraper import FacebookPostsScraper as Fps
99 | from pprint import pprint as pp
100 | 
101 | # Enter your Facebook email and password
102 | email = 'YOUR_EMAIL'
103 | password = 'YOUR_PASSWORD'
104 | 
105 | # Instantiate an object
106 | fps = Fps(email, password, post_url_text='Full Story')
107 | 
108 | # Example with multiple profiles
109 | profiles = [
110 |     'https://www.facebook.com/zuck',  # User profile
111 |     'https://www.facebook.com/thepracticaldev',  # Facebook page
112 |     'https://www.facebook.com/groups/python'  # Facebook group
113 | ]
114 | data = fps.get_posts_from_list(profiles)
115 | pp(data)
116 | 
117 | fps.posts_to_csv('my_posts')  # You can export the posts as a CSV document
118 | # fps.posts_to_excel('my_posts')  # You can export the posts as an Excel document
119 | # fps.posts_to_json('my_posts')  # You can export the posts as a JSON document
120 | ```
121 | 
122 | ## Questions
123 | 
124 | Feel free to ask anything you want in the Issues section.
125 | 
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | from FacebookPostsScraper import FacebookPostsScraper as Fps
2 | from pprint import pprint as pp
3 | 
4 | 
5 | def main():
6 |     # Enter your Facebook email and password
7 |     email = 'YOUR_EMAIL'
8 |     password = 'YOUR_PASSWORD'
9 | 
10 |     # Instantiate an object
11 |     fps = Fps(email, password, post_url_text='Full Story')
12 | 
13 |     # Example with a single profile
14 |     single_profile = 'https://www.facebook.com/BillGates'
15 |     data = fps.get_posts_from_profile(single_profile)
16 |     pp(data)
17 | 
18 |     # Example with multiple profiles
19 |     profiles = [
20 |         'https://www.facebook.com/zuck',  # User profile
21 |         'https://www.facebook.com/thepracticaldev',  # Facebook page
22 |         'https://www.facebook.com/groups/python'  # Facebook group
23 |     ]
24 | 
25 |     data = fps.get_posts_from_list(profiles)
26 |     pp(data)
27 | 
28 |     fps.posts_to_csv('my_posts')  # You can export the posts as a CSV document
29 |     # fps.posts_to_excel('my_posts')  # You can export the posts as an Excel document
30 |     # fps.posts_to_json('my_posts')  # You can export the posts as a JSON document
31 | 
32 | 
33 | if __name__ == '__main__':
34 |     main()
35 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4==4.9.0
2 | certifi==2020.4.5.1
3 | chardet==3.0.4
4 | et-xmlfile==1.0.1
5 | idna==2.9
6 | jdcal==1.4.1
7 | lxml==4.5.0
8 | numpy==1.19.0
9 | openpyxl==3.0.4
10 | pandas==1.0.5
11 | python-dateutil==2.8.1
12 | pytz==2020.1
13 | requests==2.23.0
14 | six==1.15.0
15 | soupsieve==2.0
16 | urllib3==1.25.9
17 | 
--------------------------------------------------------------------------------