├── test
│   ├── __init__.py
│   └── test_utils.py
├── scraper
│   ├── __init__.py
│   ├── __main__.py
│   ├── utils.py
│   └── scraper.py
├── requirements.txt
├── credentials.yaml
├── input.txt
├── .github
│   ├── workflows
│   │   └── main.yaml
│   └── ISSUE_TEMPLATE
│       ├── feature_request.md
│       └── bug_report.md
├── LICENSE
├── setup.py
├── selectors.json
├── .gitignore
├── params.json
└── README.md

/test/__init__.py:
--------------------------------------------------------------------------------
 1 | 
--------------------------------------------------------------------------------
/scraper/__init__.py:
--------------------------------------------------------------------------------
 1 | __version__ = "0.0.1"
 2 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | selenium==3.141.0
 2 | pyyaml
 3 | webdriver_manager
--------------------------------------------------------------------------------
/scraper/__main__.py:
--------------------------------------------------------------------------------
 1 | from .scraper import scraper
 2 | 
 3 | scraper()
 4 | 
--------------------------------------------------------------------------------
/credentials.yaml:
--------------------------------------------------------------------------------
 1 | email: email@email.fr
 2 | password: my_plain_password
 3 | 
--------------------------------------------------------------------------------
/input.txt:
--------------------------------------------------------------------------------
 1 | # Lines starting with # and empty lines will be ignored
 2 | https://www.facebook.com/andrew.ng.96
 3 | https://www.facebook.com/zuck
--------------------------------------------------------------------------------
/.github/workflows/main.yaml:
--------------------------------------------------------------------------------
 1 | name: Lint
 2 | 
 3 | # Trigger the workflow on push or pull request
 4 | on: [push, pull_request]
 5 | 
 6 | jobs:
 7 |   lint:
 8 |     name: Check code formatting with Black
 9 |     runs-on: ubuntu-latest
 10 |     steps:
 11 |       - name: Check out code
 12 |         uses: actions/checkout@v1
 13 | 
 14 |       - name: Setup Python 3
 15 |         uses: actions/setup-python@v1
 16 |         with:
 17 |           python-version: "3.x"
 18 | 
 19 |       - name: Install Black
 20 |         run: pip3 install black
 21 | 
 22 |       - name: Check formatting
 23 |         run: black --check scraper
 24 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for this project
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
 10 | **Is your feature request related to a problem? Please describe.**
 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
 12 | 
 13 | **Describe the solution you'd like**
 14 | A clear and concise description of what you want to happen.
 15 | 
 16 | **Describe alternatives you've considered**
 17 | A clear and concise description of any alternative solutions or features you've considered.
 18 | 
 19 | **Additional context**
 20 | Add any other context or screenshots about the feature request here.
21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Configure '…' 16 | 2. Run '…' 17 | 3. … 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots or command output** 24 | If applicable, add screenshots or `output logs` to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. Linux] 28 | - Python version [e.g. 3.7.5] 29 | - Chrome web driver version [e.g 81] 30 | 31 | **Additional context** 32 | Add any other context about the problem here. 33 | -------------------------------------------------------------------------------- /test/test_utils.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | from scraper import utils 3 | 4 | 5 | class Test(TestCase): 6 | def test_identify_url(self): 7 | self.assertEqual( 8 | utils.identify_url("https://www.facebook.com/groups/123456789694/?fref=nf"), 9 | 2, 10 | ) 11 | self.assertEqual( 12 | utils.identify_url("https://www.facebook.com/groups/123456789694"), 2 13 | ) 14 | self.assertEqual( 15 | utils.identify_url( 16 | "https://www.facebook.com/groups/12345645546/permalink/213453415513/" 17 | ), 18 | 3, 19 | ) 20 | self.assertEqual( 21 | utils.identify_url("https://www.facebook.com/dfsdfsdf.sdfsdfs"), 0, 22 | ) 23 | self.assertEqual( 24 | utils.identify_url("https://www.facebook.com/sdfsdfsd/posts/123456784684"), 25 | 1, 26 | ) 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 harismuneer, Hassaan-Elahi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | from scraper import __version__ 4 | 5 | 6 | with open("README.md", "r", encoding="utf-8") as fh: 7 | long_description = fh.read() 8 | 9 | setuptools.setup( 10 | name="ultimate-facebook-scraper", 11 | version=__version__, 12 | author="Haris Muneer", 13 | author_email="haris.muneer@conradlabs.com", 14 | license="MIT", 15 | keywords="Facebook Scraper", 16 | description="A bot which scrapes almost everything about a Facebook user's profile", 17 | long_description_content_type="text/markdown", 18 | long_description=long_description, 19 | url="https://github.com/harismuneer/Ultimate-Facebook-Scraper", 20 | packages=setuptools.find_packages(), 21 | classifiers=[ 22 | "Development Status :: 4 - Beta", 23 | "Programming Language :: Python :: 3", 24 | "Programming Language :: Python :: 3.7", 25 | "License :: OSI Approved :: MIT License", 26 | "Operating System :: OS Independent", 27 | ], 28 | python_requires=">=3.6", 29 | extras_require={"dev": ["black", "twine", "wheel"],}, 30 | install_requires=["selenium==3.141.0", "pyyaml", "webdriver_manager"], 31 | entry_points={ 32 | "console_scripts": ["ultimate-facebook-scraper=scraper.__main__:scraper",], 33 | }, 34 | ) 35 | -------------------------------------------------------------------------------- /selectors.json: -------------------------------------------------------------------------------- 1 | { 2 | "status": "//div[contains(@class, '_5pbx')]", 3 | "sections_bar": "//*[@class='_3cz'][1]/div[2]/div[1]", 4 | "status_exc": ".//div[@class='userContent']", 5 | "temp": ".//div[@class='_3x-2']", 6 | "title": ".//span[@class='fwb fcg']", 7 | "title_exc1": ".//span[@class='fcg']", 8 | "title_exc2": ".//span[@class='fwn fcg']", 9 | "title_element": ".//div[@class='_1dwg _1w_m']", 10 | "background_img_links": "//*[contains(@id, 'pic_')]/div/i", 11 | "firefox_profile_path": "/home/zeryx/.mozilla/firefox/0n8gmjoz.bot", 12 | "facebook_https_prefix": "https://", 13 | "facebook_link_body": ".facebook.com/", 14 | "spotlight": "spotlight", 15 | "default_image": "10354686_10150004552801856_220367501106153455_n.jpg", 16 | "height_script": "return document.body.scrollHeight", 17 | "scroll_script": "window.scrollTo(0, document.body.scrollHeight);", 18 | "title_text": "fb-timeline-cover-name", 19 | "profilePicThumb": "profilePicThumb", 20 | "fb_link": "https://en-gb.facebook.com/", 21 | "single_post" : ".//div[contains(@class, '_5pcr')]", 22 | "post_photos": ".//a[contains(@class, '_5dec') or contains(@class, '_4-eo')]", 23 | "post_photo_small" : ".//img[contains(@class, '_46-i')]", 24 | "post_photo_small_opt1" : ".//img[contains(@class, 'scaledImageFitWidth') or contains(@class, 'scaledImageFitHeight')]", 25 | "comment_section" : ".//*[@class='commentable_item']", 26 | "comment" : ".//div[@aria-label='Comment']", 27 | "comment_author" : ".//a[@class='_6qw4']", 28 | "comment_text" : ".//span[contains(@class,'_3l3x')]", 29 | "more_comment_replies": ".//a[contains(@class,'_4sxc _42ft')]", 30 | "comment_see_more_link" : ".//a[contains(@class,'_5v47 fss')]", 31 | "comment_reply" : "..//..//div[@aria-label='Comment reply']" 32 | 33 | } 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.gitignore.io/api/python 2 | 
# Edit at https://www.gitignore.io/?templates=python 3 | 4 | ### Python ### 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | pip-wheel-metadata/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # pyenv 71 | .python-version 72 | 73 | # pipenv 74 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 75 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 76 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 77 | # install all needed dependencies. 78 | #Pipfile.lock 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # SageMath parsed files 84 | *.sage.py 85 | 86 | # Spyder project settings 87 | .spyderproject 88 | .spyproject 89 | 90 | # Rope project settings 91 | .ropeproject 92 | 93 | # Mr Developer 94 | .mr.developer.cfg 95 | .project 96 | .pydevproject 97 | 98 | # mkdocs documentation 99 | /site 100 | 101 | # mypy 102 | .mypy_cache/ 103 | .dmypy.json 104 | dmypy.json 105 | 106 | # Pyre type checker 107 | .pyre/ 108 | 109 | # End of https://www.gitignore.io/api/python 110 | 111 | ## IDE 112 | .vscode 113 | .idea 114 | 115 | ## Generated data 116 | /data 117 | scraper/credentials.yaml 118 | scraper/data/ 119 | scraper/debug.log 120 | 121 | ##misplaced configuration files 122 | scraper/selectors.json 123 | scraper/params.json 124 | scraper/input.txt 125 | 126 | ./credentials.yaml 127 | 128 | ## Python venv 129 | venv 130 | -------------------------------------------------------------------------------- /params.json: -------------------------------------------------------------------------------- 1 | { 2 | "Friends": { 3 | "scan_list": [ 4 | "All", 5 | "Mutual Friends", 6 | "Following", 7 | "Followers", 8 | "Work", 9 | "College", 10 | "Current City", 11 | "Hometown" 12 | ], 13 | "section": [ 14 | "/friends", 15 | "/friends_mutual", 16 | "/following", 17 | "/followers", 18 | "/friends_work", 19 | "/friends_college", 20 | "/friends_current_city", 21 | "/friends_hometown" 22 | ], 23 | "elements_path": [ 24 | "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a", 25 | "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a", 26 | "//*[contains(@class,'_3i9')][1]/div/div/ul/li[1]/div[2]/div/div/div/div/div[2]/ul/li/div/a", 27 | "//*[contains(@class,'fbProfileBrowserListItem')]/div/a", 28 | "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a", 29 | 
"//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a", 30 | "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a", 31 | "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a" 32 | ], 33 | "file_names": [ 34 | "All Friends.txt", 35 | "Mutual Friends.txt", 36 | "Following.txt", 37 | "Followers.txt", 38 | "Work Friends.txt", 39 | "College Friends.txt", 40 | "Current City Friends.txt", 41 | "Hometown Friends.txt" 42 | ], 43 | "save_status": 0 44 | }, 45 | "Members": { 46 | "scan_list": [ 47 | "All" 48 | ], 49 | "section": [ 50 | "/members" 51 | ], 52 | "elements_path": [ 53 | "//*[contains(@id,'pagelet_group_members')][1]/div[2]/div/ul/li/div/a" 54 | ], 55 | "file_names": [ 56 | "All Members.txt" 57 | ], 58 | "save_status": 0 59 | }, 60 | "Photos": { 61 | "scan_list": [ 62 | "'s Photos", 63 | "Photos of" 64 | ], 65 | "section": [ 66 | "/photos_all", 67 | "/photos_of" 68 | ], 69 | "elements_path": [ 70 | "//*[contains(@id, 'pic_')]", 71 | "//*[contains(@id, 'pic_')]" 72 | ], 73 | "file_names": [ 74 | "Uploaded Photos.txt", 75 | "Tagged Photos.txt" 76 | ], 77 | "save_status": 1 78 | }, 79 | "Videos": { 80 | "scan_list": [ 81 | "'s Videos", 82 | "Videos of" 83 | ], 84 | "section": [ 85 | "/videos_by", 86 | "/videos_of" 87 | ], 88 | "elements_path": [ 89 | "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul", 90 | "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul" 91 | ], 92 | "file_names": [ 93 | "Uploaded Videos.txt", 94 | "Tagged Videos.txt" 95 | ], 96 | "save_status": 2 97 | }, 98 | "About": { 99 | "scan_list": [], 100 | "section": [ 101 | "/about?section=overview", 102 | "/about?section=education", 103 | "/about?section=living", 104 | "/about?section=contact-info", 105 | "/about?section=relationship", 106 | "/about?section=bio", 107 | "/about?section=year-overviews" 108 | ], 109 | "elements_path": [ 110 | "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div", 111 | "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div", 112 | "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div", 113 | "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div", 114 | "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div", 115 | "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div", 116 | "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div" 117 | ], 118 | "file_names": [ 119 | "Overview.txt", 120 | "Work and Education.txt", 121 | "Places Lived.txt", 122 | "Contact and Basic Info.txt", 123 | "Family and Relationships.txt", 124 | "Details About.txt", 125 | "Life Events.txt" 126 | ], 127 | "save_status": 3 128 | }, 129 | "Posts": { 130 | "scan_list": [], 131 | "section": [], 132 | "elements_path": [ 133 | "//div[@class='_5pcb _4b0l _2q8l']" 134 | ], 135 | "file_names": [ 136 | "Posts.txt" 137 | ], 138 | "save_status": 4 139 | }, 140 | "GroupPosts": { 141 | "scan_list": [], 142 | "section": [], 143 | "elements_path": [ 144 | "//div[@class='_4-u2 mbm _4mrt _5jmm _5pat _5v3q _7cqq _4-u8']" 145 | ], 146 | "file_names": [ 147 | "Posts.txt" 148 | ], 149 | "save_status": 5 150 | } 151 | } -------------------------------------------------------------------------------- /scraper/utils.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | from calendar import 
month_abbr
 5 | 
 6 | from selenium.common.exceptions import TimeoutException, NoSuchElementException
 7 | from selenium.webdriver.support.ui import WebDriverWait
 8 | 
 9 | 
 10 | # -----------------------------------------------------------------------------
 11 | #
 12 | # -----------------------------------------------------------------------------
 13 | def to_bool(x):
 14 |     if x in ["False", "0", 0, False]:
 15 |         return False
 16 |     elif x in ["True", "1", 1, True]:
 17 |         return True
 18 |     else:
 19 |         raise argparse.ArgumentTypeError("Boolean value expected")
 20 | 
 21 | 
 22 | def create_post_link(post_id, selectors):
 23 |     return (
 24 |         selectors["facebook_https_prefix"] + selectors["facebook_link_body"] + post_id
 25 |     )
 26 | 
 27 | 
 28 | # -----------------------------------------------------------------------------
 29 | #
 30 | # -----------------------------------------------------------------------------
 31 | def create_folder(folder):
 32 |     if not os.path.exists(folder):
 33 |         os.mkdir(folder)
 34 | 
 35 | 
 36 | # -------------------------------------------------------------
 37 | # Helper functions for Page scrolling
 38 | # -------------------------------------------------------------
 39 | # check if the page height changed after a scroll
 40 | def check_height(driver, selectors, old_height):
 41 |     new_height = driver.execute_script(selectors.get("height_script"))
 42 |     return new_height != old_height
 43 | 
 44 | 
 45 | # scroll the page up to `total_scrolls` times, waiting at most `scroll_time`
 46 | # seconds for new content to load after each scroll
 47 | def scroll(total_scrolls, driver, selectors, scroll_time):
 48 |     global old_height
 49 |     current_scrolls = 0
 50 | 
 51 |     while True:
 52 |         try:
 53 |             if current_scrolls == total_scrolls:
 54 |                 return
 55 | 
 56 |             old_height = driver.execute_script(selectors.get("height_script"))
 57 |             driver.execute_script(selectors.get("scroll_script"))
 58 |             WebDriverWait(driver, scroll_time, 0.05).until(
 59 |                 lambda driver: check_height(driver, selectors, old_height)
 60 |             )
 61 |             current_scrolls += 1
 62 |         except TimeoutException:
 63 |             break
 64 | 
 65 |     return
 66 | 
 67 | 
 68 | # -----------------------------------------------------------------------------
 69 | # Helper Functions for Posts
 70 | # -----------------------------------------------------------------------------
 71 | 
 72 | 
 73 | def get_status(x, selectors):
 74 |     status = ""
 75 |     try:
 76 |         status = x.find_element_by_xpath(
 77 |             selectors.get("status")
 78 |         ).text  # use _1xnd for Pages
 79 |     except Exception:
 80 |         try:
 81 |             status = x.find_element_by_xpath(selectors.get("status_exc")).text
 82 |         except Exception:
 83 |             pass
 84 |     return status
 85 | 
 86 | 
 87 | def get_post_id(x):
 88 |     post_id = -1
 89 |     try:
 90 |         post_id = x.get_attribute("id")
 91 |         post_id = post_id.split(":")[-1]
 92 |     except Exception:
 93 |         pass
 94 |     return post_id
 95 | 
 96 | 
 97 | def get_group_post_id(x):
 98 |     post_id = -1
 99 |     try:
 100 |         post_id = x.get_attribute("id")
 101 | 
 102 |         post_id = post_id.split("_")[-1]
 103 |         if ";" in post_id:
 104 |             post_id = post_id.split(";")
 105 |             post_id = post_id[2]
 106 |         else:
 107 |             post_id = post_id.split(":")[0]
 108 |     except Exception:
 109 |         pass
 110 |     return post_id
 111 | 
 112 | 
 113 | def get_photo_link(x, selectors, small_photo):
 114 |     link = ""
 115 |     try:
 116 |         if small_photo:
 117 |             link = x.find_element_by_xpath(
 118 |                 selectors.get("post_photo_small")
 119 |             ).get_attribute("src")
 120 |         else:
 121 |             link = x.get_attribute("data-ploi")
 122 |     except NoSuchElementException:
 123 |         try:
 124 |             link = x.find_element_by_xpath(
 125 |                 selectors.get("post_photo_small_opt1")
 126 |             ).get_attribute("src")
 127 |         except AttributeError:
 128 |             pass
 129 |         except Exception:
 130 |             print("Exception (get_photo_link):", sys.exc_info()[0])
 131 |     except Exception:
 132 |         print("Exception (get_photo_link):", sys.exc_info()[0])
 133 |     return link
134 | 
 135 | 
 136 | def get_post_photos_links(x, selectors, small_photo):
 137 |     links = []
 138 |     photos = safe_find_elements_by_xpath(x, selectors.get("post_photos"))
 139 |     if photos is not None:
 140 |         for el in photos:
 141 |             links.append(get_photo_link(el, selectors, small_photo))
 142 |     return links
 143 | 
 144 | 
 145 | def get_div_links(x, tag, selectors):
 146 |     try:
 147 |         temp = x.find_element_by_xpath(selectors.get("temp"))
 148 |         return temp.find_element_by_tag_name(tag)
 149 |     except Exception:
 150 |         return ""
 151 | 
 152 | 
 153 | def get_title_links(title):
 154 |     links = title.find_elements_by_tag_name("a")
 155 |     return links[-1].text, links[-1].get_attribute("href")
 156 | 
 157 | 
 158 | def get_title(x, selectors):
 159 |     title = ""
 160 |     try:
 161 |         title = x.find_element_by_xpath(selectors.get("title"))
 162 |     except Exception:
 163 |         try:
 164 |             title = x.find_element_by_xpath(selectors.get("title_exc1"))
 165 |         except Exception:
 166 |             try:
 167 |                 title = x.find_element_by_xpath(selectors.get("title_exc2"))
 168 |             except Exception:
 169 |                 pass
 170 |     finally:
 171 |         return title
 172 | 
 173 | 
 174 | def get_time(x):
 175 |     """Normalize a post's timestamp (the abbr element's title attribute,
 176 |     e.g. "Wednesday, January 15, 2020 at 9:30 PM") to "DD-MM-YYYY HH:MM"."""
 177 |     time = ""
 178 |     try:
 179 |         time = x.find_element_by_tag_name("abbr").get_attribute("title")
 180 |         month_and_day = time.split(", ")[1].split()
 181 |         tokens = time.split()
 182 |         day = "%02d" % int(month_and_day[1])
 183 |         month = "%02d" % list(month_abbr).index(month_and_day[0][:3])
 184 |         year = tokens[3]
 185 |         hour = "%02d" % int(tokens[5].split(":")[0])
 186 |         minute = tokens[5].split(":")[1]
 187 |         time = day + "-" + month + "-" + year + " " + hour + ":" + minute
 188 |     except Exception:
 189 |         pass
 190 |     finally:
 191 |         return time
 192 | 
 193 | 
 194 | def identify_url(url):
 195 |     """
 196 |     A possible way to identify the link.
 197 |     Not exhaustive!
 198 |     :param url:
 199 |     :return:
 200 |         0 - Profile
 201 |         1 - Profile post
 202 |         2 - Group
 203 |         3 - Group post
 204 |     """
 205 |     if "groups" in url:
 206 |         if "permalink" in url:
 207 |             return 3
 208 |         else:
 209 |             return 2
 210 |     elif "posts" in url:
 211 |         return 1
 212 |     else:
 213 |         return 0
 214 | 
 215 | 
 216 | def safe_find_elements_by_xpath(driver, xpath):
 217 |     try:
 218 |         return driver.find_elements_by_xpath(xpath)
 219 |     except NoSuchElementException:
 220 |         return None
 221 | 
 222 | 
 223 | def get_replies(comment_element, selectors):
 224 |     replies = []
 225 |     data = comment_element.find_elements_by_xpath(selectors.get("comment_reply"))
 226 |     for d in data:
 227 |         try:
 228 |             author = d.find_element_by_xpath(selectors.get("comment_author")).text
 229 |             text = d.find_element_by_xpath(selectors.get("comment_text")).text
 230 |             replies.append([author, text])
 231 |         except Exception:
 232 |             pass
 233 |     return replies
 234 | 
 235 | 
 236 | def safe_find_element_by_id(driver, elem_id):
 237 |     try:
 238 |         return driver.find_element_by_id(elem_id)
 239 |     except NoSuchElementException:
 240 |         return None
 241 | 
--------------------------------------------------------------------------------
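As a quick illustration of how this URL classifier drives the rest of the scraper — a minimal sketch, with the return codes taken from `identify_url`'s docstring and the sample URLs borrowed from `test/test_utils.py`:

```python
from scraper import utils

# 0 = profile, 1 = profile post, 2 = group, 3 = group post
assert utils.identify_url("https://www.facebook.com/zuck") == 0
assert utils.identify_url("https://www.facebook.com/sdfsdfsd/posts/123456784684") == 1
assert utils.identify_url("https://www.facebook.com/groups/123456789694") == 2
assert (
    utils.identify_url(
        "https://www.facebook.com/groups/12345645546/permalink/213453415513/"
    )
    == 3
)
```

`scraper()` in `scraper/scraper.py` calls this on every URL read from `input.txt` to decide whether to scrape a profile, a group, or a single group post.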
/README.md:
--------------------------------------------------------------------------------
 1 | # Ultimate Facebook Scraper (UFS)
 2 | 
 3 | Tooling that automates your social media interactions to collect posts, photos, videos, friends, followers and much more on Facebook.
 4 | 
 5 | ### Featured by Top Security Blogs and OSINT Researchers
 6 | 
 7 | BlackArch · Hakin9 · Kali Linux Tutorials · Security Online · Kitploit · Kali Tools · Journalist ToolBox · OneHack
 8 | 
 9 | ### 2nd Spot in Top Trending Python Repositories on GitHub
 10 | 
 11 | UFS trended among top Python repositories on GitHub for 3 consecutive weeks 🌟
 12 | 
 101 | ## News Updates 🏆
 102 | * UFS got included as an official tool in the [BlackArch Linux distribution](https://blackarch.org/social.html)!
 103 | * UFS got listed among the [top 20 hacking tools in 2019](https://www.kitploit.com/2019/12/top-20-most-popular-hacking-tools-in.html)!
 104 | 
 105 | ## Features 🚀
 106 | 
 107 | A bot that scrapes almost everything about a user's Facebook profile, including:
 108 | 
 109 | - uploaded photos
 110 | - tagged photos
 111 | - videos
 112 | - friends list and their profile photos (including Followers, Following, Work Friends, College Friends, etc.)
 113 | - all public posts/statuses available on the user's timeline
 114 | - **NEW:** Now you can scrape Facebook group posts.
 115 | 
 116 | Data is scraped in an organized format for use in educational and research projects. This scraper does not use Facebook's Graph API, so there are no rate-limiting issues.
 117 | 
 118 | **This tool is being used by thousands of developers weekly and we are pretty amazed at this response! Thank you! 🎉**
 119 | 
 120 | For **citing/referencing** this tool in your research, check the 'Citation' section below.
 121 | 
 122 | ## Note 🤝
 123 | 
 124 | This tool uses XPaths of **divs** to extract data. Since Facebook updates its site frequently, these divs change, and the XPaths have to be updated accordingly for data to be scraped correctly.
 125 | 
 126 | The developers have devoted a lot of time and effort to building and maintaining this tool. **To keep this amazing tool alive, we need support from you geeks.**
 127 | 
 128 | The code is intuitive and easy to understand. If you find that data is no longer being scraped from profiles, Facebook has most likely updated its site; please update the relevant XPaths and open a pull request. Much appreciated!
 129 | 
 130 | ## Sample
 131 | 
 132 | 

133 | 134 |

135 | 136 | ## Screenshot 137 | 138 |

139 | 140 |

141 | 
 142 | ---
 143 | 
 144 | ## Usage 🔧
 145 | 
 146 | ### Installation 💻
 147 | 
 148 | You will need to:
 149 | 
 150 | - Install the latest version of [Google Chrome](https://www.google.com/chrome/).
 151 | - Install [Python 3](https://www.python.org/downloads/)
 152 | - Have a Facebook account without 2FA enabled
 153 | 
 154 | ```bash
 155 | git clone https://github.com/harismuneer/Ultimate-Facebook-Scraper.git
 156 | cd Ultimate-Facebook-Scraper
 157 | 
 158 | # Install Python requirements
 159 | pip install -e .
 160 | ```
 161 | 
 162 | The code is multi-platform and is tested on both Windows and Linux.
 163 | The Chrome driver is downloaded automatically by the webdriver_manager package.
 164 | 
 165 | ### How to Run
 166 | 
 167 | - Fill your Facebook credentials into [`credentials.yaml`](credentials.yaml)
 168 | - Edit the [`input.txt`](input.txt) file and add the profile, group, and individual group post links you want to scrape, each link on a new line (a sample `input.txt` is sketched after the Important Message section below).
 169 | 
 170 | Make sure each link ends with just the username or ID number and nothing else, and that it is in the format mentioned above.
 171 | 
 172 | Run the `ultimate-facebook-scraper` command, or invoke the script directly! 🚀
 173 | 
 174 | ```bash
 175 | python scraper/scraper.py
 176 | ```
 177 | 
 178 | > Note: There are two modes for downloading friends' profile pics and the user's photos: large size and small size. By default, small size is used because it is very quick; large size mode takes time proportional to the number of pictures to download.
 179 | 
 180 | You can personalize your scraping needs using the command-line arguments:
 181 | 
 182 | ```bash
 183 | python scraper/scraper.py \
 184 |     --uploaded_photos True \
 185 |     --friends_photos True \
 186 |     --friends_small_size True \
 187 |     --photos_small_size True \
 188 |     --total_scrolls 2500 \
 189 |     --scroll_time 8
 190 | ```
 191 | 
 192 | These are the default values, so there is no need to write them out if you are just testing or are okay with them.
 193 | 
 194 | 
 195 | ## Chromium
 196 | 
 197 | Chromium users can add `--chromium True` to run using the Chromium browser.
 198 | 
 199 | ```bash
 200 | python scraper/scraper.py \
 201 |     --uploaded_photos True \
 202 |     --photos_small_size True \
 203 |     --total_scrolls 2500 \
 204 |     --scroll_time 8 \
 205 |     --chromium True
 206 | ```
 207 | 
 208 | 
 209 | ---
 210 | 
 211 | ## Citation 📚
 212 | 
 213 | 
 214 | 
 215 | 
 216 | 
 217 | If you use this tool for your research, then kindly cite it. Click the badge above for the complete citation for this tool and for different citation formats like IEEE, APA, etc.
 218 | 
 219 | ---
 220 | 
 221 | ## Important Message ⚠️
 222 | 
 223 | This tool is for research purposes only. Hence, the developers of this tool won't be responsible for any misuse of data collected using it. It is used by many researchers and open-source intelligence (OSINT) analysts.
 224 | 
 225 | This tool will not work if your account has 2FA enabled. You must disable 2FA before using the tool.
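For reference, a working `input.txt` looks like the sample shipped with the repo — one clean link per line, with `#` comments and blank lines ignored. (The group links below reuse the placeholder IDs from `test/test_utils.py`; substitute real ones.)

```text
# Lines starting with # and empty lines will be ignored
https://www.facebook.com/andrew.ng.96
https://www.facebook.com/zuck
https://www.facebook.com/groups/123456789694
https://www.facebook.com/groups/12345645546/permalink/213453415513/
```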
226 | 227 | --- 228 | 229 | ## Authors 👋 230 | 231 | You can get in touch with us on our LinkedIn Profiles: 232 | 233 | #### Haris Muneer 234 | 235 | [![LinkedIn Link](https://img.shields.io/badge/Connect-harismuneer-blue.svg?logo=linkedin&longCache=true&style=social&label=Connect)](https://www.linkedin.com/in/harismuneer) 236 | 237 | You can also follow my GitHub Profile to stay updated about my latest projects: [![GitHub Follow](https://img.shields.io/badge/Connect-harismuneer-blue.svg?logo=Github&longCache=true&style=social&label=Follow)](https://github.com/harismuneer) 238 | 239 | #### Hassaan Elahi 240 | 241 | [![LinkedIn Link](https://img.shields.io/badge/Connect-Hassaan--Elahi-blue.svg?logo=linkedin&longCache=true&style=social&label=Connect)](https://www.linkedin.com/in/hassaan-elahi/) 242 | 243 | You can also follow my GitHub Profile to stay updated about my latest projects: [![GitHub Follow](https://img.shields.io/badge/Connect-Hassaan--Elahi-blue.svg?logo=Github&longCache=true&style=social&label=Follow)](https://github.com/Hassaan-Elahi) 244 | 245 | If you liked the repo then please support it by giving it a star ⭐! 246 | 247 | ## For Future 🔮 248 | 249 | Shoutout to geeks willing to contribute to this project. Please have a look at the [UFS kanban board](https://github.com/harismuneer/Ultimate-Facebook-Scraper/projects/1) for a list of things to be done. 250 | 251 | There are a lot of features that can be added to this tool like adding support for pages, groups, comments etc! Please contribute :) 252 | 253 | ## Contributions Welcome ✨ 254 | 255 | ![forthebadge](https://forthebadge.com/images/badges/built-with-love.svg) 256 | 257 | If you find any bug in the code or have any improvements in mind then feel free to generate a pull request. 258 | 259 | > Note: We use [Black](https://pypi.org/project/black/) to lint Python files. Please use it in order to have a valid pull request 😉 260 | 261 | ## Issues 🔨 262 | 263 | [![GitHub Issues](https://img.shields.io/github/issues/harismuneer/Ultimate-Facebook-Scraper.svg?style=flat&label=Issues&maxAge=2592000)](https://www.github.com/harismuneer/Ultimate-Facebook-Scraper/issues) 264 | 265 | If you face any issue, you can create a new issue in the Issues Tab and I will be glad to help you out. 
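Before opening a pull request, it helps to run Black locally — the lint workflow in `.github/workflows/main.yaml` runs the same check and will fail on unformatted code:

```bash
pip3 install black
black scraper          # reformat the package in place
black --check scraper  # the exact check CI runs
```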
266 | 267 | ## License 📄 268 | 269 | [![MIT](https://img.shields.io/cocoapods/l/AFNetworking.svg?style=style&label=License&maxAge=2592000)](LICENSE) 270 | 271 | Copyright (c) 2018-present, harismuneer, Hassaan-Elahi 272 | -------------------------------------------------------------------------------- /scraper/scraper.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import sys 4 | import urllib.request 5 | import yaml 6 | import utils 7 | import argparse 8 | 9 | from selenium import webdriver 10 | from selenium.common.exceptions import NoSuchElementException 11 | from selenium.webdriver.chrome.options import Options 12 | from selenium.webdriver.common.by import By 13 | from selenium.webdriver.support import expected_conditions as EC 14 | from selenium.webdriver.support.ui import WebDriverWait 15 | from webdriver_manager.chrome import ChromeDriverManager 16 | 17 | 18 | def get_facebook_images_url(img_links): 19 | urls = [] 20 | 21 | for link in img_links: 22 | if link != "None": 23 | valid_url_found = False 24 | driver.get(link) 25 | 26 | try: 27 | while not valid_url_found: 28 | WebDriverWait(driver, 30).until( 29 | EC.presence_of_element_located( 30 | (By.CLASS_NAME, selectors.get("spotlight")) 31 | ) 32 | ) 33 | element = driver.find_element_by_class_name( 34 | selectors.get("spotlight") 35 | ) 36 | img_url = element.get_attribute("src") 37 | 38 | if img_url.find(".gif") == -1: 39 | valid_url_found = True 40 | urls.append(img_url) 41 | except Exception: 42 | urls.append("None") 43 | else: 44 | urls.append("None") 45 | 46 | return urls 47 | 48 | 49 | # ------------------------------------------------------------- 50 | # ------------------------------------------------------------- 51 | 52 | # takes a url and downloads image from that url 53 | def image_downloader(img_links, folder_name): 54 | """ 55 | Download images from a list of image urls. 
56 | :param img_links: 57 | :param folder_name: 58 | :return: list of image names downloaded 59 | """ 60 | img_names = [] 61 | 62 | try: 63 | parent = os.getcwd() 64 | try: 65 | folder = os.path.join(os.getcwd(), folder_name) 66 | utils.create_folder(folder) 67 | os.chdir(folder) 68 | except Exception: 69 | print("Error in changing directory.") 70 | 71 | for link in img_links: 72 | img_name = "None" 73 | 74 | if link != "None": 75 | img_name = (link.split(".jpg")[0]).split("/")[-1] + ".jpg" 76 | 77 | # this is the image id when there's no profile pic 78 | if img_name == selectors.get("default_image"): 79 | img_name = "None" 80 | else: 81 | try: 82 | urllib.request.urlretrieve(link, img_name) 83 | except Exception: 84 | img_name = "None" 85 | 86 | img_names.append(img_name) 87 | 88 | os.chdir(parent) 89 | except Exception: 90 | print("Exception (image_downloader):", sys.exc_info()[0]) 91 | return img_names 92 | 93 | 94 | # ------------------------------------------------------------- 95 | # ------------------------------------------------------------- 96 | 97 | 98 | def extract_and_write_posts(elements, filename): 99 | try: 100 | f = open(filename, "w", newline="\r\n", encoding="utf-8") 101 | f.writelines( 102 | " TIME || TYPE || TITLE || STATUS || LINKS(Shared Posts/Shared Links etc) || POST_ID " 103 | + "\n" 104 | + "\n" 105 | ) 106 | ids = [] 107 | for x in elements: 108 | try: 109 | link = "" 110 | # id 111 | post_id = utils.get_post_id(x) 112 | ids.append(post_id) 113 | 114 | # time 115 | time = utils.get_time(x) 116 | 117 | link, status, title, post_type = get_status_and_title(link, x) 118 | 119 | line = ( 120 | str(time) 121 | + " || " 122 | + str(post_type) 123 | + " || " 124 | + str(title) 125 | + " || " 126 | + str(status) 127 | + " || " 128 | + str(link) 129 | + " || " 130 | + str(post_id) 131 | + "\n" 132 | ) 133 | 134 | try: 135 | f.writelines(line) 136 | except Exception: 137 | print("Posts: Could not map encoded characters") 138 | except Exception: 139 | pass 140 | f.close() 141 | except ValueError: 142 | print("Exception (extract_and_write_posts)", "Status =", sys.exc_info()[0]) 143 | except Exception: 144 | print("Exception (extract_and_write_posts)", "Status =", sys.exc_info()[0]) 145 | return 146 | 147 | 148 | def get_status_and_title(link, x): 149 | # title 150 | title = utils.get_title(x, selectors) 151 | if title.text.find("shared a memory") != -1: 152 | x = x.find_element_by_xpath(selectors.get("title_element")) 153 | title = utils.get_title(x, selectors) 154 | status = utils.get_status(x, selectors) 155 | if title.text == driver.find_element_by_id(selectors.get("title_text")).text: 156 | if status == "": 157 | temp = utils.get_div_links(x, "img", selectors) 158 | if temp == "": # no image tag which means . 
it is not a life event
 159 |                 link = utils.get_div_links(x, "a", selectors).get_attribute("href")
 160 |                 post_type = "status update without text"
 161 |             else:
 162 |                 post_type = "life event"
 163 |                 link = utils.get_div_links(x, "a", selectors).get_attribute("href")
 164 |                 status = utils.get_div_links(x, "a", selectors).text
 165 |         else:
 166 |             post_type = "status update"
 167 |             if utils.get_div_links(x, "a", selectors) != "":
 168 |                 link = utils.get_div_links(x, "a", selectors).get_attribute("href")
 169 | 
 170 |     elif title.text.find(" shared ") != -1:
 171 |         x1, link = utils.get_title_links(title)
 172 |         post_type = "shared " + x1
 173 |     elif title.text.find(" at ") != -1 or title.text.find(" in ") != -1:
 174 |         if title.text.find(" at ") != -1:
 175 |             x1, link = utils.get_title_links(title)
 176 |             post_type = "check in"
 177 |         elif title.text.find(" in ") != -1:
 178 |             post_type = "check in"  # "... in <place>" titles are check-ins as well
 179 |             status = utils.get_div_links(x, "a", selectors).text
 180 |     elif title.text.find(" added ") != -1 and title.text.find("photo") != -1:
 181 |         post_type = "added photo"
 182 |         link = utils.get_div_links(x, "a", selectors).get_attribute("href")
 183 |     elif title.text.find(" added ") != -1 and title.text.find("video") != -1:
 184 |         post_type = "added video"
 185 |         link = utils.get_div_links(x, "a", selectors).get_attribute("href")
 186 | 
 187 |     else:
 188 |         post_type = "others"
 189 |     if not isinstance(title, str):
 190 |         title = title.text
 191 |     status = status.replace("\n", " ")
 192 |     title = title.replace("\n", " ")
 193 |     return link, status, title, post_type
 194 | 
 195 | 
 196 | def extract_and_write_group_posts(elements, filename):
 197 |     try:
 198 |         f = create_post_file(filename)
 199 |         ids = []
 200 |         for x in elements:
 201 |             try:
 202 |                 # id
 203 |                 post_id = utils.get_group_post_id(x)
 204 |                 ids.append(post_id)
 205 |             except Exception:
 206 |                 pass
 207 |         total = len(ids)
 208 |         i = 0
 209 |         for post_id in ids:
 210 |             i += 1
 211 |             try:
 212 |                 add_group_post_to_file(f, filename, post_id, i, total, reload=True)
 213 |             except ValueError:
 214 |                 pass
 215 |         f.close()
 216 |     except ValueError:
 217 |         print("Exception (extract_and_write_group_posts)", "Status =", sys.exc_info()[0])
 218 |     except Exception:
 219 |         print("Exception (extract_and_write_group_posts)", "Status =", sys.exc_info()[0])
 220 |     return
 221 | 
 222 | 
 223 | def add_group_post_to_file(f, filename, post_id, number=1, total=1, reload=False):
 224 |     print("Scraping Post(" + post_id + "). 
" + str(number) + " of " + str(total)) 225 | photos_dir = os.path.dirname(filename) 226 | if reload: 227 | driver.get(utils.create_post_link(post_id, selectors)) 228 | line = get_group_post_as_line(post_id, photos_dir) 229 | try: 230 | f.writelines(line) 231 | except Exception: 232 | print("Posts: Could not map encoded characters") 233 | 234 | 235 | def create_post_file(filename): 236 | """ 237 | Creates post file and header 238 | :param filename: 239 | :return: file 240 | """ 241 | f = open(filename, "w", newline="\r\n", encoding="utf-8") 242 | f.writelines( 243 | "TIME || TYPE || TITLE || STATUS || LINKS(Shared Posts/Shared Links etc) || POST_ID || " 244 | "PHOTO || COMMENTS " + "\n" 245 | ) 246 | return f 247 | 248 | 249 | # ------------------------------------------------------------- 250 | # ------------------------------------------------------------- 251 | 252 | 253 | def save_to_file(name, elements, status, current_section): 254 | """helper function used to save links to files""" 255 | 256 | # status 0 = dealing with friends list 257 | # status 1 = dealing with photos 258 | # status 2 = dealing with videos 259 | # status 3 = dealing with about section 260 | # status 4 = dealing with posts 261 | # status 5 = dealing with group posts 262 | 263 | try: 264 | f = None # file pointer 265 | 266 | if status != 4 and status != 5: 267 | f = open(name, "w", encoding="utf-8", newline="\r\n") 268 | 269 | results = [] 270 | img_names = [] 271 | 272 | # dealing with Friends 273 | if status == 0: 274 | # get profile links of friends 275 | results = [x.get_attribute("href") for x in elements] 276 | results = [create_original_link(x) for x in results] 277 | 278 | # get names of friends 279 | people_names = [ 280 | x.find_element_by_tag_name("img").get_attribute("aria-label") 281 | for x in elements 282 | ] 283 | 284 | # download friends' photos 285 | try: 286 | if download_friends_photos: 287 | if friends_small_size: 288 | img_links = [ 289 | x.find_element_by_css_selector("img").get_attribute("src") 290 | for x in elements 291 | ] 292 | else: 293 | links = [] 294 | for friend in results: 295 | try: 296 | driver.get(friend) 297 | WebDriverWait(driver, 30).until( 298 | EC.presence_of_element_located( 299 | ( 300 | By.CLASS_NAME, 301 | selectors.get("profilePicThumb"), 302 | ) 303 | ) 304 | ) 305 | l = driver.find_element_by_class_name( 306 | selectors.get("profilePicThumb") 307 | ).get_attribute("href") 308 | except Exception: 309 | l = "None" 310 | 311 | links.append(l) 312 | 313 | for i, _ in enumerate(links): 314 | if links[i] is None: 315 | links[i] = "None" 316 | elif links[i].find("picture/view") != -1: 317 | links[i] = "None" 318 | 319 | img_links = get_facebook_images_url(links) 320 | 321 | folder_names = [ 322 | "Friend's Photos", 323 | "Mutual Friends' Photos", 324 | "Following's Photos", 325 | "Follower's Photos", 326 | "Work Friends Photos", 327 | "College Friends Photos", 328 | "Current City Friends Photos", 329 | "Hometown Friends Photos", 330 | ] 331 | print("Downloading " + folder_names[current_section]) 332 | 333 | img_names = image_downloader( 334 | img_links, folder_names[current_section] 335 | ) 336 | else: 337 | img_names = ["None"] * len(results) 338 | except Exception: 339 | print( 340 | "Exception (Images)", 341 | str(status), 342 | "Status =", 343 | current_section, 344 | sys.exc_info()[0], 345 | ) 346 | 347 | # dealing with Photos 348 | elif status == 1: 349 | results = [x.get_attribute("href") for x in elements] 350 | results.pop(0) 351 | 352 | try: 353 | if 
download_uploaded_photos: 354 | if photos_small_size: 355 | background_img_links = driver.find_elements_by_xpath( 356 | selectors.get("background_img_links") 357 | ) 358 | background_img_links = [ 359 | x.get_attribute("style") for x in background_img_links 360 | ] 361 | background_img_links = [ 362 | ((x.split("(")[1]).split(")")[0]).strip('"') 363 | for x in background_img_links 364 | ] 365 | else: 366 | background_img_links = get_facebook_images_url(results) 367 | 368 | folder_names = ["Uploaded Photos", "Tagged Photos"] 369 | print("Downloading " + folder_names[current_section]) 370 | 371 | img_names = image_downloader( 372 | background_img_links, folder_names[current_section] 373 | ) 374 | else: 375 | img_names = ["None"] * len(results) 376 | except Exception: 377 | print( 378 | "Exception (Images)", 379 | str(status), 380 | "Status =", 381 | current_section, 382 | sys.exc_info()[0], 383 | ) 384 | 385 | # dealing with Videos 386 | elif status == 2: 387 | results = elements[0].find_elements_by_css_selector("li") 388 | results = [ 389 | x.find_element_by_css_selector("a").get_attribute("href") 390 | for x in results 391 | ] 392 | 393 | try: 394 | if results[0][0] == "/": 395 | results = [r.pop(0) for r in results] 396 | results = [(selectors.get("fb_link") + x) for x in results] 397 | except Exception: 398 | pass 399 | 400 | # dealing with About Section 401 | elif status == 3: 402 | results = elements[0].text 403 | f.writelines(results) 404 | 405 | # dealing with Posts 406 | elif status == 4: 407 | extract_and_write_posts(elements, name) 408 | return 409 | 410 | # dealing with Group Posts 411 | elif status == 5: 412 | extract_and_write_group_posts(elements, name) 413 | return 414 | 415 | """Write results to file""" 416 | if status == 0: 417 | for i, _ in enumerate(results): 418 | # friend's profile link 419 | f.writelines(results[i]) 420 | f.write(",") 421 | 422 | # friend's name 423 | f.writelines(people_names[i]) 424 | f.write(",") 425 | 426 | # friend's downloaded picture id 427 | f.writelines(img_names[i]) 428 | f.write("\n") 429 | 430 | elif status == 1: 431 | for i, _ in enumerate(results): 432 | # image's link 433 | f.writelines(results[i]) 434 | f.write(",") 435 | 436 | # downloaded picture id 437 | f.writelines(img_names[i]) 438 | f.write("\n") 439 | 440 | elif status == 2: 441 | for x in results: 442 | f.writelines(x + "\n") 443 | 444 | f.close() 445 | 446 | except Exception: 447 | print("Exception (save_to_file)", "Status =", str(status), sys.exc_info()[0]) 448 | 449 | return 450 | 451 | 452 | # ---------------------------------------------------------------------------- 453 | # ----------------------------------------------------------------------------- 454 | 455 | 456 | def scrape_data(url, scan_list, section, elements_path, save_status, file_names): 457 | """Given some parameters, this function can scrap friends/photos/videos/about/posts(statuses) of a profile""" 458 | page = [] 459 | 460 | if save_status == 4 or save_status == 5: 461 | page.append(url) 462 | 463 | page += [url + s for s in section] 464 | 465 | for i, _ in enumerate(scan_list): 466 | try: 467 | driver.get(page[i]) 468 | 469 | if ( 470 | (save_status == 0) or (save_status == 1) or (save_status == 2) 471 | ): # Only run this for friends, photos and videos 472 | 473 | # the bar which contains all the sections 474 | sections_bar = driver.find_element_by_xpath( 475 | selectors.get("sections_bar") 476 | ) 477 | 478 | if sections_bar.text.find(scan_list[i]) == -1: 479 | continue 480 | 481 | if save_status != 3: 
482 | utils.scroll(total_scrolls, driver, selectors, scroll_time) 483 | pass 484 | 485 | data = driver.find_elements_by_xpath(elements_path[i]) 486 | 487 | save_to_file(file_names[i], data, save_status, i) 488 | 489 | except Exception: 490 | print( 491 | "Exception (scrape_data)", 492 | str(i), 493 | "Status =", 494 | str(save_status), 495 | sys.exc_info()[0], 496 | ) 497 | 498 | 499 | # ----------------------------------------------------------------------------- 500 | # ----------------------------------------------------------------------------- 501 | 502 | 503 | def create_original_link(url): 504 | if url.find(".php") != -1: 505 | original_link = ( 506 | facebook_https_prefix + facebook_link_body + ((url.split("="))[1]) 507 | ) 508 | 509 | if original_link.find("&") != -1: 510 | original_link = original_link.split("&")[0] 511 | 512 | elif url.find("fnr_t") != -1: 513 | original_link = ( 514 | facebook_https_prefix 515 | + facebook_link_body 516 | + ((url.split("/"))[-1].split("?")[0]) 517 | ) 518 | elif url.find("_tab") != -1: 519 | original_link = ( 520 | facebook_https_prefix 521 | + facebook_link_body 522 | + (url.split("?")[0]).split("/")[-1] 523 | ) 524 | else: 525 | original_link = url 526 | 527 | return original_link 528 | 529 | 530 | def scrap_profile(): 531 | data_folder = os.path.join(os.getcwd(), "data") 532 | utils.create_folder(data_folder) 533 | os.chdir(data_folder) 534 | 535 | # execute for all profiles given in input.txt file 536 | url = driver.current_url 537 | user_id = create_original_link(url) 538 | 539 | print("\nScraping:", user_id) 540 | 541 | try: 542 | target_dir = os.path.join(data_folder, user_id.split("/")[-1]) 543 | utils.create_folder(target_dir) 544 | os.chdir(target_dir) 545 | except Exception: 546 | print("Some error occurred in creating the profile directory.") 547 | os.chdir("../..") 548 | return 549 | 550 | to_scrap = ["Friends", "Photos", "Videos", "About", "Posts"] 551 | for item in to_scrap: 552 | print("----------------------------------------") 553 | print("Scraping {}..".format(item)) 554 | 555 | if item == "Posts": 556 | scan_list = [None] 557 | elif item == "About": 558 | scan_list = [None] * 7 559 | else: 560 | scan_list = params[item]["scan_list"] 561 | 562 | section = params[item]["section"] 563 | elements_path = params[item]["elements_path"] 564 | file_names = params[item]["file_names"] 565 | save_status = params[item]["save_status"] 566 | 567 | scrape_data(user_id, scan_list, section, elements_path, save_status, file_names) 568 | 569 | print("{} Done!".format(item)) 570 | 571 | print("Finished Scraping Profile " + str(user_id) + ".") 572 | os.chdir("../..") 573 | 574 | return 575 | 576 | 577 | def get_comments(): 578 | comments = [] 579 | try: 580 | data = driver.find_element_by_xpath(selectors.get("comment_section")) 581 | reply_links = driver.find_elements_by_xpath( 582 | selectors.get("more_comment_replies") 583 | ) 584 | for link in reply_links: 585 | try: 586 | driver.execute_script("arguments[0].click();", link) 587 | except Exception: 588 | pass 589 | see_more_links = driver.find_elements_by_xpath( 590 | selectors.get("comment_see_more_link") 591 | ) 592 | for link in see_more_links: 593 | try: 594 | driver.execute_script("arguments[0].click();", link) 595 | except Exception: 596 | pass 597 | data = data.find_elements_by_xpath(selectors.get("comment")) 598 | for d in data: 599 | try: 600 | author = d.find_element_by_xpath(selectors.get("comment_author")).text 601 | text = 
d.find_element_by_xpath(selectors.get("comment_text")).text 602 | replies = utils.get_replies(d, selectors) 603 | comments.append([author, text, replies]) 604 | except Exception: 605 | pass 606 | except Exception: 607 | pass 608 | return comments 609 | 610 | 611 | def get_group_post_as_line(post_id, photos_dir): 612 | try: 613 | data = driver.find_element_by_xpath(selectors.get("single_post")) 614 | time = utils.get_time(data) 615 | title = utils.get_title(data, selectors).text 616 | # link, status, title, type = get_status_and_title(title,data) 617 | link = utils.get_div_links(data, "a", selectors) 618 | if link != "": 619 | link = link.get_attribute("href") 620 | post_type = "" 621 | status = '"' + utils.get_status(data, selectors).replace("\r\n", " ") + '"' 622 | photos = utils.get_post_photos_links(data, selectors, photos_small_size) 623 | comments = get_comments() 624 | photos = image_downloader(photos, photos_dir) 625 | line = ( 626 | str(time) 627 | + "||" 628 | + str(post_type) 629 | + "||" 630 | + str(title) 631 | + "||" 632 | + str(status) 633 | + "||" 634 | + str(link) 635 | + "||" 636 | + str(post_id) 637 | + "||" 638 | + str(photos) 639 | + "||" 640 | + str(comments) 641 | + "\n" 642 | ) 643 | return line 644 | except Exception: 645 | return "" 646 | 647 | 648 | def create_folders(): 649 | """ 650 | Creates folder for saving data (profile, post or group) according to current driver url 651 | Changes current dir to target_dir 652 | :return: target_dir or None in case of failure 653 | """ 654 | folder = os.path.join(os.getcwd(), "data") 655 | utils.create_folder(folder) 656 | os.chdir(folder) 657 | try: 658 | item_id = get_item_id(driver.current_url) 659 | target_dir = os.path.join(folder, item_id) 660 | utils.create_folder(target_dir) 661 | os.chdir(target_dir) 662 | return target_dir 663 | except Exception: 664 | print("Some error occurred in creating the group directory.") 665 | os.chdir("../..") 666 | return None 667 | 668 | 669 | def get_item_id(url): 670 | """ 671 | Gets item id from url 672 | :param url: facebook url string 673 | :return: item id or empty string in case of failure 674 | """ 675 | ret = "" 676 | try: 677 | link = create_original_link(url) 678 | ret = link.split("/")[-1] 679 | if ret.strip() == "": 680 | ret = link.split("/")[-2] 681 | except Exception as e: 682 | print("Failed to get id: " + format(e)) 683 | return ret 684 | 685 | 686 | def scrape_group(url): 687 | if create_folders() is None: 688 | return 689 | group_id = get_item_id(url) 690 | # execute for all profiles given in input.txt file 691 | print("\nScraping:", group_id) 692 | 693 | to_scrap = ["GroupPosts"] # , "Photos", "Videos", "About"] 694 | for item in to_scrap: 695 | print("----------------------------------------") 696 | print("Scraping {}..".format(item)) 697 | 698 | if item == "GroupPosts": 699 | scan_list = [None] 700 | elif item == "About": 701 | scan_list = [None] * 7 702 | else: 703 | scan_list = params[item]["scan_list"] 704 | 705 | section = params[item]["section"] 706 | elements_path = params[item]["elements_path"] 707 | file_names = params[item]["file_names"] 708 | save_status = params[item]["save_status"] 709 | 710 | scrape_data(url, scan_list, section, elements_path, save_status, file_names) 711 | 712 | print("{} Done!".format(item)) 713 | 714 | print("Finished Scraping Group " + str(group_id) + ".") 715 | os.chdir("../..") 716 | 717 | return 718 | 719 | 720 | # ----------------------------------------------------------------------------- 721 | # 
-----------------------------------------------------------------------------
 722 | 
 723 | 
 724 | def login(email, password):
 725 |     """Log into our own profile."""
 726 | 
 727 |     try:
 728 |         global driver
 729 | 
 730 |         options = Options()
 731 | 
 732 |         # disable Chrome's notification pop-ups and info bars, and mute audio
 733 |         options.add_argument("--disable-notifications")
 734 |         options.add_argument("--disable-infobars")
 735 |         options.add_argument("--mute-audio")
 736 | 
 737 |         if headless:
 738 |             options.add_argument("--headless")
 739 | 
 740 |         try:
 741 |             if chromium:
 742 |                 # use the chromedriver found on PATH (works for Chromium installs)
 743 |                 driver = webdriver.Chrome(
 744 |                     options=options
 745 |                 )
 746 |             else:
 747 |                 driver = webdriver.Chrome(
 748 |                     executable_path=ChromeDriverManager().install(), options=options
 749 |                 )
 750 |         except Exception:
 751 |             print("Error loading Chrome webdriver:", sys.exc_info()[0])
 752 |             exit(1)
 753 | 
 754 |         fb_path = facebook_https_prefix + facebook_link_body
 755 |         driver.get(fb_path)
 756 |         driver.maximize_window()
 757 | 
 758 |         # fill in the login form
 759 |         driver.find_element_by_name("email").send_keys(email)
 760 |         driver.find_element_by_name("pass").send_keys(password)
 761 | 
 762 |         try:
 763 |             # click the login button
 764 |             driver.find_element_by_id("loginbutton").click()
 765 |         except NoSuchElementException:
 766 |             # Facebook new design
 767 |             driver.find_element_by_name("login").click()
 768 | 
 769 |         # if your account uses multi-factor authentication
 770 |         mfa_code_input = utils.safe_find_element_by_id(driver, "approvals_code")
 771 | 
 772 |         if mfa_code_input is None:
 773 |             return
 774 | 
 775 |         mfa_code_input.send_keys(input("Enter MFA code: "))
 776 |         driver.find_element_by_id("checkpointSubmitButton").click()
 777 | 
 778 |         # there are so many screens asking you to verify things; just skip them all
 779 |         while (
 780 |             utils.safe_find_element_by_id(driver, "checkpointSubmitButton") is not None
 781 |         ):
 782 |             dont_save_browser_radio = utils.safe_find_element_by_id(driver, "u_0_3")
 783 |             if dont_save_browser_radio is not None:
 784 |                 dont_save_browser_radio.click()
 785 | 
 786 |             driver.find_element_by_id("checkpointSubmitButton").click()
 787 | 
 788 |     except Exception:
 789 |         print("There was some error during login.")
 790 |         print(sys.exc_info()[0])
 791 |         exit(1)
 792 | 
 793 | 
 794 | # -----------------------------------------------------------------------------
 795 | # -----------------------------------------------------------------------------
 796 | 
 797 | 
798 | def scraper(**kwargs):
 799 |     with open("credentials.yaml", "r") as ymlfile:
 800 |         cfg = yaml.safe_load(stream=ymlfile)
 801 | 
 802 |     if ("password" not in cfg) or ("email" not in cfg):
 803 |         print("Your email or password is missing. Kindly write them in credentials.yaml")
 804 |         exit(1)
 805 |     urls = [
 806 |         facebook_https_prefix + facebook_link_body + get_item_id(line)
 807 |         for line in open("input.txt", newline="\r\n")
 808 |         if not line.lstrip().startswith("#") and not line.strip() == ""
 809 |     ]
 810 | 
 811 |     if len(urls) > 0:
 812 |         print("\nStarting Scraping...")
 813 |         login(cfg["email"], cfg["password"])
 814 |         for url in urls:
 815 |             driver.get(url)
 816 |             link_type = utils.identify_url(driver.current_url)
 817 |             if link_type == 0:
 818 |                 scrap_profile()
 819 |             elif link_type == 1:
 820 |                 # scrap_post(url)
 821 |                 pass
 822 |             elif link_type == 2:
 823 |                 scrape_group(driver.current_url)
 824 |             elif link_type == 3:
 825 |                 file_name = params["GroupPosts"]["file_names"][0]
 826 |                 item_id = get_item_id(driver.current_url)
 827 |                 if create_folders() is None:
 828 |                     continue
 829 |                 f = create_post_file(file_name)
 830 |                 add_group_post_to_file(f, file_name, item_id)
 831 |                 f.close()
 832 |                 os.chdir("../..")
 833 |         driver.close()
 834 |     else:
 835 |         print("Input file is empty.")
 836 | 
 837 | 
 838 | # -------------------------------------------------------------
 839 | # -------------------------------------------------------------
 840 | # -------------------------------------------------------------
 841 | 
 842 | if __name__ == "__main__":
 843 |     ap = argparse.ArgumentParser()
 844 |     # command-line options; see the README's Usage section for examples
 845 |     ap.add_argument(
 846 |         "-dup",
 847 |         "--uploaded_photos",
 848 |         help="download the user's uploaded photos?",
 849 |         default=True,
 850 |     )
 851 |     ap.add_argument(
 852 |         "-dfp", "--friends_photos", help="download friends' profile photos?", default=True
 853 |     )
 854 |     ap.add_argument(
 855 |         "-fss",
 856 |         "--friends_small_size",
 857 |         help="Download friends' pictures in small size?",
 858 |         default=True,
 859 |     )
 860 |     ap.add_argument(
 861 |         "-pss",
 862 |         "--photos_small_size",
 863 |         help="Download photos in small size?",
 864 |         default=True,
 865 |     )
 866 |     ap.add_argument(
 867 |         "-ts",
 868 |         "--total_scrolls",
 869 |         help="How many times should I scroll down?",
 870 |         default=2500,
 871 |     )
 872 |     ap.add_argument(
 873 |         "-st", "--scroll_time", help="Seconds to wait for each scroll to load?", default=8
 874 |     )
 875 |     ap.add_argument(
 876 |         "--chromium",
 877 |         nargs='?',
 878 |         const=True,
 879 |         help="Should I use Chromium instead?",
 880 |         default=False
 881 |     )
 882 |     ap.add_argument(
 883 |         "--headless",
 884 |         nargs='?',
 885 |         const=True,
 886 |         help="Should I run in a headless browser?",
 887 |         default=False
 888 |     )
 889 | 
 890 |     args = vars(ap.parse_args())
 891 |     print(args)
 892 | 
 893 |     # ---------------------------------------------------------
 894 |     # Global Variables
 895 |     # ---------------------------------------------------------
 896 | 
 897 |     # whether to download photos or not
 898 |     download_uploaded_photos = utils.to_bool(args["uploaded_photos"])
 899 |     download_friends_photos = utils.to_bool(args["friends_photos"])
 900 | 
 901 |     # whether to download the full image or its thumbnail (small size);
 902 |     # small size is very quick, while full size opens each photo
 903 |     # individually to download it, which takes much more time
 904 |     friends_small_size = utils.to_bool(args["friends_small_size"])
 905 |     photos_small_size = utils.to_bool(args["photos_small_size"])
 906 | 
 907 |     total_scrolls = int(args["total_scrolls"])
 908 |     scroll_time = int(args["scroll_time"])
 909 |     chromium = utils.to_bool(args["chromium"])
 910 |     headless = utils.to_bool(args["headless"])
 911 | 
 912 |     current_scrolls = 0
 913 |     old_height = 0
 914 | 
 915 |     driver = 
None 916 | 917 | with open("selectors.json") as a, open("params.json") as b: 918 | selectors = json.load(a) 919 | params = json.load(b) 920 | 921 | firefox_profile_path = selectors.get("firefox_profile_path") 922 | facebook_https_prefix = selectors.get("facebook_https_prefix") 923 | facebook_link_body = selectors.get("facebook_link_body") 924 | 925 | # get things rolling 926 | scraper() 927 | --------------------------------------------------------------------------------