├── test
│   ├── __init__.py
│   └── test_utils.py
├── scraper
│   ├── __init__.py
│   ├── __main__.py
│   ├── utils.py
│   └── scraper.py
├── requirements.txt
├── credentials.yaml
├── input.txt
├── .github
│   ├── workflows
│   │   └── main.yaml
│   └── ISSUE_TEMPLATE
│       ├── feature_request.md
│       └── bug_report.md
├── LICENSE
├── setup.py
├── selectors.json
├── .gitignore
├── params.json
└── README.md
/test/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/scraper/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.0.1"
2 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | selenium==3.141.0
2 | pyyaml
3 | webdriver_manager
--------------------------------------------------------------------------------
/scraper/__main__.py:
--------------------------------------------------------------------------------
1 | from .scraper import scraper
2 |
3 | scraper()
4 |
--------------------------------------------------------------------------------
/credentials.yaml:
--------------------------------------------------------------------------------
1 | email: email@email.fr
2 | password: my_plain_password
3 |
--------------------------------------------------------------------------------
/input.txt:
--------------------------------------------------------------------------------
1 | #Lines starting with # and empty lines will be ignored
2 | https://www.facebook.com/andrew.ng.96
3 | https://www.facebook.com/zuck
--------------------------------------------------------------------------------
/.github/workflows/main.yaml:
--------------------------------------------------------------------------------
1 | name: Lint
2 |
3 | # Trigger the workflow on push or pull request
4 | on: [push, pull_request]
5 |
6 | jobs:
7 | lint:
8 |     name: Lint code with Black
9 | runs-on: ubuntu-latest
10 | steps:
11 |       - name: Check out code
12 | uses: actions/checkout@v1
13 |
14 | - name: Setup Python 3
15 | uses: actions/setup-python@v1
16 | with:
17 | python-version: "3.x"
18 |
19 | - name: Install Black
20 | run: pip3 install black
21 |
22 | - name: Check lint
23 | run: black --check scraper
24 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: bug
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Configure '…'
16 | 2. Run '…'
17 | 3. …
18 | 4. See error
19 |
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 |
23 | **Screenshots or command output**
24 | If applicable, add screenshots or `output logs` to help explain your problem.
25 |
26 | **Desktop (please complete the following information):**
27 | - OS: [e.g. Linux]
28 | - Python version [e.g. 3.7.5]
29 | - Chrome web driver version [e.g 81]
30 |
31 | **Additional context**
32 | Add any other context about the problem here.
33 |
--------------------------------------------------------------------------------
/test/test_utils.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 | from scraper import utils
3 |
4 |
5 | class Test(TestCase):
6 | def test_identify_url(self):
7 | self.assertEqual(
8 | utils.identify_url("https://www.facebook.com/groups/123456789694/?fref=nf"),
9 | 2,
10 | )
11 | self.assertEqual(
12 | utils.identify_url("https://www.facebook.com/groups/123456789694"), 2
13 | )
14 | self.assertEqual(
15 | utils.identify_url(
16 | "https://www.facebook.com/groups/12345645546/permalink/213453415513/"
17 | ),
18 | 3,
19 | )
20 | self.assertEqual(
21 | utils.identify_url("https://www.facebook.com/dfsdfsdf.sdfsdfs"), 0,
22 | )
23 | self.assertEqual(
24 | utils.identify_url("https://www.facebook.com/sdfsdfsd/posts/123456784684"),
25 | 1,
26 | )
27 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 harismuneer, Hassaan-Elahi
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import setuptools
2 |
3 | from scraper import __version__
4 |
5 |
6 | with open("README.md", "r", encoding="utf-8") as fh:
7 | long_description = fh.read()
8 |
9 | setuptools.setup(
10 | name="ultimate-facebook-scraper",
11 | version=__version__,
12 | author="Haris Muneer",
13 | author_email="haris.muneer@conradlabs.com",
14 | license="MIT",
15 | keywords="Facebook Scraper",
16 | description="A bot which scrapes almost everything about a Facebook user's profile",
17 | long_description_content_type="text/markdown",
18 | long_description=long_description,
19 | url="https://github.com/harismuneer/Ultimate-Facebook-Scraper",
20 | packages=setuptools.find_packages(),
21 | classifiers=[
22 | "Development Status :: 4 - Beta",
23 | "Programming Language :: Python :: 3",
24 | "Programming Language :: Python :: 3.7",
25 | "License :: OSI Approved :: MIT License",
26 | "Operating System :: OS Independent",
27 | ],
28 | python_requires=">=3.6",
29 | extras_require={"dev": ["black", "twine", "wheel"],},
30 | install_requires=["selenium==3.141.0", "pyyaml", "webdriver_manager"],
31 | entry_points={
32 | "console_scripts": ["ultimate-facebook-scraper=scraper.__main__:scraper",],
33 | },
34 | )
35 |
--------------------------------------------------------------------------------
/selectors.json:
--------------------------------------------------------------------------------
1 | {
2 | "status": "//div[contains(@class, '_5pbx')]",
3 | "sections_bar": "//*[@class='_3cz'][1]/div[2]/div[1]",
4 | "status_exc": ".//div[@class='userContent']",
5 | "temp": ".//div[@class='_3x-2']",
6 | "title": ".//span[@class='fwb fcg']",
7 | "title_exc1": ".//span[@class='fcg']",
8 | "title_exc2": ".//span[@class='fwn fcg']",
9 | "title_element": ".//div[@class='_1dwg _1w_m']",
10 | "background_img_links": "//*[contains(@id, 'pic_')]/div/i",
11 | "firefox_profile_path": "/home/zeryx/.mozilla/firefox/0n8gmjoz.bot",
12 | "facebook_https_prefix": "https://",
13 | "facebook_link_body": ".facebook.com/",
14 | "spotlight": "spotlight",
15 | "default_image": "10354686_10150004552801856_220367501106153455_n.jpg",
16 | "height_script": "return document.body.scrollHeight",
17 | "scroll_script": "window.scrollTo(0, document.body.scrollHeight);",
18 | "title_text": "fb-timeline-cover-name",
19 | "profilePicThumb": "profilePicThumb",
20 | "fb_link": "https://en-gb.facebook.com/",
21 | "single_post" : ".//div[contains(@class, '_5pcr')]",
22 | "post_photos": ".//a[contains(@class, '_5dec') or contains(@class, '_4-eo')]",
23 | "post_photo_small" : ".//img[contains(@class, '_46-i')]",
24 | "post_photo_small_opt1" : ".//img[contains(@class, 'scaledImageFitWidth') or contains(@class, 'scaledImageFitHeight')]",
25 | "comment_section" : ".//*[@class='commentable_item']",
26 | "comment" : ".//div[@aria-label='Comment']",
27 | "comment_author" : ".//a[@class='_6qw4']",
28 | "comment_text" : ".//span[contains(@class,'_3l3x')]",
29 | "more_comment_replies": ".//a[contains(@class,'_4sxc _42ft')]",
30 | "comment_see_more_link" : ".//a[contains(@class,'_5v47 fss')]",
31 | "comment_reply" : "..//..//div[@aria-label='Comment reply']"
32 |
33 | }
34 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by https://www.gitignore.io/api/python
2 | # Edit at https://www.gitignore.io/?templates=python
3 |
4 | ### Python ###
5 | # Byte-compiled / optimized / DLL files
6 | __pycache__/
7 | *.py[cod]
8 | *$py.class
9 |
10 | # C extensions
11 | *.so
12 |
13 | # Distribution / packaging
14 | .Python
15 | build/
16 | develop-eggs/
17 | dist/
18 | downloads/
19 | eggs/
20 | .eggs/
21 | lib/
22 | lib64/
23 | parts/
24 | sdist/
25 | var/
26 | wheels/
27 | pip-wheel-metadata/
28 | share/python-wheels/
29 | *.egg-info/
30 | .installed.cfg
31 | *.egg
32 | MANIFEST
33 |
34 | # PyInstaller
35 | # Usually these files are written by a python script from a template
36 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
37 | *.manifest
38 | *.spec
39 |
40 | # Installer logs
41 | pip-log.txt
42 | pip-delete-this-directory.txt
43 |
44 | # Unit test / coverage reports
45 | htmlcov/
46 | .tox/
47 | .nox/
48 | .coverage
49 | .coverage.*
50 | .cache
51 | nosetests.xml
52 | coverage.xml
53 | *.cover
54 | .hypothesis/
55 | .pytest_cache/
56 |
57 | # Translations
58 | *.mo
59 | *.pot
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # pyenv
71 | .python-version
72 |
73 | # pipenv
74 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
75 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
76 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
77 | # install all needed dependencies.
78 | #Pipfile.lock
79 |
80 | # celery beat schedule file
81 | celerybeat-schedule
82 |
83 | # SageMath parsed files
84 | *.sage.py
85 |
86 | # Spyder project settings
87 | .spyderproject
88 | .spyproject
89 |
90 | # Rope project settings
91 | .ropeproject
92 |
93 | # Mr Developer
94 | .mr.developer.cfg
95 | .project
96 | .pydevproject
97 |
98 | # mkdocs documentation
99 | /site
100 |
101 | # mypy
102 | .mypy_cache/
103 | .dmypy.json
104 | dmypy.json
105 |
106 | # Pyre type checker
107 | .pyre/
108 |
109 | # End of https://www.gitignore.io/api/python
110 |
111 | ## IDE
112 | .vscode
113 | .idea
114 |
115 | ## Generated data
116 | /data
117 | scraper/credentials.yaml
118 | scraper/data/
119 | scraper/debug.log
120 |
121 | ##misplaced configuration files
122 | scraper/selectors.json
123 | scraper/params.json
124 | scraper/input.txt
125 |
126 | ./credentials.yaml
127 |
128 | ## Python venv
129 | venv
130 |
--------------------------------------------------------------------------------
/params.json:
--------------------------------------------------------------------------------
1 | {
2 | "Friends": {
3 | "scan_list": [
4 | "All",
5 | "Mutual Friends",
6 | "Following",
7 | "Followers",
8 | "Work",
9 | "College",
10 | "Current City",
11 | "Hometown"
12 | ],
13 | "section": [
14 | "/friends",
15 | "/friends_mutual",
16 | "/following",
17 | "/followers",
18 | "/friends_work",
19 | "/friends_college",
20 | "/friends_current_city",
21 | "/friends_hometown"
22 | ],
23 | "elements_path": [
24 | "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
25 | "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
26 | "//*[contains(@class,'_3i9')][1]/div/div/ul/li[1]/div[2]/div/div/div/div/div[2]/ul/li/div/a",
27 | "//*[contains(@class,'fbProfileBrowserListItem')]/div/a",
28 | "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
29 | "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
30 | "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a",
31 | "//*[contains(@id,'pagelet_timeline_medley_friends')][1]/div[2]/div/ul/li/div/a"
32 | ],
33 | "file_names": [
34 | "All Friends.txt",
35 | "Mutual Friends.txt",
36 | "Following.txt",
37 | "Followers.txt",
38 | "Work Friends.txt",
39 | "College Friends.txt",
40 | "Current City Friends.txt",
41 | "Hometown Friends.txt"
42 | ],
43 | "save_status": 0
44 | },
45 | "Members": {
46 | "scan_list": [
47 | "All"
48 | ],
49 | "section": [
50 | "/members"
51 | ],
52 | "elements_path": [
53 | "//*[contains(@id,'pagelet_group_members')][1]/div[2]/div/ul/li/div/a"
54 | ],
55 | "file_names": [
56 | "All Members.txt"
57 | ],
58 | "save_status": 0
59 | },
60 | "Photos": {
61 | "scan_list": [
62 | "'s Photos",
63 | "Photos of"
64 | ],
65 | "section": [
66 | "/photos_all",
67 | "/photos_of"
68 | ],
69 | "elements_path": [
70 | "//*[contains(@id, 'pic_')]",
71 | "//*[contains(@id, 'pic_')]"
72 | ],
73 | "file_names": [
74 | "Uploaded Photos.txt",
75 | "Tagged Photos.txt"
76 | ],
77 | "save_status": 1
78 | },
79 | "Videos": {
80 | "scan_list": [
81 | "'s Videos",
82 | "Videos of"
83 | ],
84 | "section": [
85 | "/videos_by",
86 | "/videos_of"
87 | ],
88 | "elements_path": [
89 | "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul",
90 | "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul"
91 | ],
92 | "file_names": [
93 | "Uploaded Videos.txt",
94 | "Tagged Videos.txt"
95 | ],
96 | "save_status": 2
97 | },
98 | "About": {
99 | "scan_list": [],
100 | "section": [
101 | "/about?section=overview",
102 | "/about?section=education",
103 | "/about?section=living",
104 | "/about?section=contact-info",
105 | "/about?section=relationship",
106 | "/about?section=bio",
107 | "/about?section=year-overviews"
108 | ],
109 | "elements_path": [
110 | "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
111 | "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
112 | "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
113 | "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
114 | "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
115 | "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div",
116 | "//*[contains(@id, 'pagelet_timeline_app_collection_')]/ul/li/div/div[2]/div/div"
117 | ],
118 | "file_names": [
119 | "Overview.txt",
120 | "Work and Education.txt",
121 | "Places Lived.txt",
122 | "Contact and Basic Info.txt",
123 | "Family and Relationships.txt",
124 | "Details About.txt",
125 | "Life Events.txt"
126 | ],
127 | "save_status": 3
128 | },
129 | "Posts": {
130 | "scan_list": [],
131 | "section": [],
132 | "elements_path": [
133 | "//div[@class='_5pcb _4b0l _2q8l']"
134 | ],
135 | "file_names": [
136 | "Posts.txt"
137 | ],
138 | "save_status": 4
139 | },
140 | "GroupPosts": {
141 | "scan_list": [],
142 | "section": [],
143 | "elements_path": [
144 | "//div[@class='_4-u2 mbm _4mrt _5jmm _5pat _5v3q _7cqq _4-u8']"
145 | ],
146 | "file_names": [
147 | "Posts.txt"
148 | ],
149 | "save_status": 5
150 | }
151 | }
--------------------------------------------------------------------------------
/scraper/utils.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import sys
4 | import calendar
5 |
6 | from selenium.common.exceptions import TimeoutException, NoSuchElementException
7 | from selenium.webdriver.support.ui import WebDriverWait
8 |
9 |
10 | # -----------------------------------------------------------------------------
11 | #
12 | # -----------------------------------------------------------------------------
13 | def to_bool(x):
14 | if x in ["False", "0", 0, False]:
15 | return False
16 | elif x in ["True", "1", 1, True]:
17 | return True
18 | else:
19 | raise argparse.ArgumentTypeError("Boolean value expected")
20 |
21 |
22 | def create_post_link(post_id, selectors):
23 | return (
24 | selectors["facebook_https_prefix"] + selectors["facebook_link_body"] + post_id
25 | )
26 |
27 |
28 | # -----------------------------------------------------------------------------
29 | #
30 | # -----------------------------------------------------------------------------
31 | def create_folder(folder):
32 | if not os.path.exists(folder):
33 | os.mkdir(folder)
34 |
35 |
36 | # -------------------------------------------------------------
37 | # Helper functions for Page scrolling
38 | # -------------------------------------------------------------
39 | # check if height changed
40 | def check_height(driver, selectors, old_height):
41 | new_height = driver.execute_script(selectors.get("height_script"))
42 | return new_height != old_height
43 |
44 |
45 | # helper function: used to scroll the page
46 | def scroll(total_scrolls, driver, selectors, scroll_time):
47 | global old_height
48 | current_scrolls = 0
49 |
50 | while True:
51 | try:
52 | if current_scrolls == total_scrolls:
53 | return
54 |
55 | old_height = driver.execute_script(selectors.get("height_script"))
56 | driver.execute_script(selectors.get("scroll_script"))
57 | WebDriverWait(driver, scroll_time, 0.05).until(
58 | lambda driver: check_height(driver, selectors, old_height)
59 | )
60 | current_scrolls += 1
61 | except TimeoutException:
62 | break
63 |
64 | return
65 |
66 |
67 | # -----------------------------------------------------------------------------
68 | # Helper Functions for Posts
69 | # -----------------------------------------------------------------------------
70 |
71 |
72 | def get_status(x, selectors):
73 | status = ""
74 | try:
75 | status = x.find_element_by_xpath(
76 | selectors.get("status")
77 | ).text # use _1xnd for Pages
78 | except Exception:
79 | try:
80 | status = x.find_element_by_xpath(selectors.get("status_exc")).text
81 | except Exception:
82 | pass
83 | return status
84 |
85 |
86 | def get_post_id(x):
87 | post_id = -1
88 | try:
89 | post_id = x.get_attribute("id")
90 | post_id = post_id.split(":")[-1]
91 | except Exception:
92 | pass
93 | return post_id
94 |
95 |
96 | def get_group_post_id(x):
97 | post_id = -1
98 | try:
99 | post_id = x.get_attribute("id")
100 |
101 | post_id = post_id.split("_")[-1]
102 | if ";" in post_id:
103 | post_id = post_id.split(";")
104 | post_id = post_id[2]
105 | else:
106 | post_id = post_id.split(":")[0]
107 | except Exception:
108 | pass
109 | return post_id
110 |
111 |
112 | def get_photo_link(x, selectors, small_photo):
113 | link = ""
114 | try:
115 | if small_photo:
116 | link = x.find_element_by_xpath(
117 | selectors.get("post_photo_small")
118 | ).get_attribute("src")
119 | else:
120 | link = x.get_attribute("data-ploi")
121 | except NoSuchElementException:
122 | try:
123 | link = x.find_element_by_xpath(
124 | selectors.get("post_photo_small_opt1")
125 | ).get_attribute("src")
126 | except AttributeError:
127 | pass
128 | except Exception:
129 | print("Exception (get_post_photo_link):", sys.exc_info()[0])
130 | except Exception:
131 | print("Exception (get_post_photo_link):", sys.exc_info()[0])
132 | return link
133 |
134 |
135 | def get_post_photos_links(x, selectors, small_photo):
136 | links = []
137 | photos = safe_find_elements_by_xpath(x, selectors.get("post_photos"))
138 | if photos is not None:
139 | for el in photos:
140 | links.append(get_photo_link(el, selectors, small_photo))
141 | return links
142 |
143 |
144 | def get_div_links(x, tag, selectors):
145 | try:
146 | temp = x.find_element_by_xpath(selectors.get("temp"))
147 | return temp.find_element_by_tag_name(tag)
148 | except Exception:
149 | return ""
150 |
151 |
152 | def get_title_links(title):
153 | l = title.find_elements_by_tag_name("a")
154 | return l[-1].text, l[-1].get_attribute("href")
155 |
156 |
157 | def get_title(x, selectors):
158 | title = ""
159 | try:
160 | title = x.find_element_by_xpath(selectors.get("title"))
161 | except Exception:
162 | try:
163 | title = x.find_element_by_xpath(selectors.get("title_exc1"))
164 | except Exception:
165 | try:
166 | title = x.find_element_by_xpath(selectors.get("title_exc2"))
167 | except Exception:
168 | pass
169 | finally:
170 | return title
171 |
172 |
173 | def get_time(x):
174 | time = ""
175 | try:
176 | time = x.find_element_by_tag_name("abbr").get_attribute("title")
177 | time = (
178 | str("%02d" % int(time.split(", ")[1].split()[1]),)
179 | + "-"
180 | + str(
181 | (
182 | "%02d"
183 | % (
184 | int(
185 | (
186 | list(calendar.month_abbr).index(
187 | time.split(", ")[1].split()[0][:3]
188 | )
189 | )
190 | ),
191 | )
192 | )
193 | )
194 | + "-"
195 | + time.split()[3]
196 | + " "
197 | + str("%02d" % int(time.split()[5].split(":")[0]))
198 | + ":"
199 | + str(time.split()[5].split(":")[1])
200 | )
201 | except Exception:
202 | pass
203 |
204 | finally:
205 | return time
206 |
207 |
208 | def identify_url(url):
209 | """
210 | A possible way to identify the link.
211 | Not Exhaustive!
212 | :param url:
213 | :return:
214 | 0 - Profile
215 | 1 - Profile post
216 | 2 - Group
217 | 3 - Group post
218 | """
219 | if "groups" in url:
220 | if "permalink" in url:
221 | return 3
222 | else:
223 | return 2
224 | elif "posts" in url:
225 | return 1
226 | else:
227 | return 0
228 |
229 |
230 | def safe_find_elements_by_xpath(driver, xpath):
231 | try:
232 | return driver.find_elements_by_xpath(xpath)
233 | except NoSuchElementException:
234 | return None
235 |
236 |
237 | def get_replies(comment_element, selectors):
238 | replies = []
239 | data = comment_element.find_elements_by_xpath(selectors.get("comment_reply"))
240 | for d in data:
241 | try:
242 | author = d.find_element_by_xpath(selectors.get("comment_author")).text
243 | text = d.find_element_by_xpath(selectors.get("comment_text")).text
244 | replies.append([author, text])
245 | except Exception:
246 | pass
247 | return replies
248 |
249 |
250 | def safe_find_element_by_id(driver, elem_id):
251 | try:
252 | return driver.find_element_by_id(elem_id)
253 | except NoSuchElementException:
254 | return None
255 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
7 | # Ultimate Facebook Scraper (UFS)
10 | Tooling that automates your social media interactions to collect posts, photos, videos, friends, followers and much more on Facebook.
41 | ### Featured by Top Security Blogs and OSINT Researchers
90 | ### 2nd Spot in Top Trending Python Repositories on GitHub
92 | UFS trended among top Python repositories on GitHub for 3 consecutive weeks 🌟
101 | ## News Updates 🏆
102 | * UFS got included as an official tool in the [BlackArch Linux distribution](https://blackarch.org/social.html)!
103 | * UFS got listed among the [top 20 hacking tools in 2019](https://www.kitploit.com/2019/12/top-20-most-popular-hacking-tools-in.html)!
104 |
105 | ## Features 🚀
106 |
107 | A bot which scrapes almost everything about a user's Facebook profile including:
108 |
109 | - uploaded photos
110 | - tagged photos
111 | - videos
112 | - friends list and their profile photos (including Followers, Following, Work Friends, College Friends etc)
113 | - and all public posts/statuses available on the user's timeline
114 | - **NEW:** Now you can scrape Facebook group posts.
115 |
116 | Data is scraped in an organized format for educational and research use (see the example layout below). Because this scraper does not use Facebook's Graph API, it is not subject to the API's rate limits.
117 |
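As a rough sketch of that layout: each profile listed in `input.txt` gets its own sub-folder under `data/`, with text files named as in `params.json` and, when photo downloads are enabled, folders of downloaded images. For the sample `zuck` profile it would look roughly like this:

```text
data/
└── zuck/
    ├── All Friends.txt
    ├── Uploaded Photos.txt
    ├── Tagged Photos.txt
    ├── Uploaded Videos.txt
    ├── Overview.txt
    ├── Posts.txt
    └── Uploaded Photos/   (downloaded images, if enabled)
```
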
118 | **This tool is being used by thousands of developers weekly and we are pretty amazed at this response! Thank you!🎉**
119 |
120 | For **citing/referencing** this tool for your research, check the 'Citation' section below.
121 |
122 | ## Note 🤝
123 |
124 | This tool extracts data using XPath selectors for specific **'div'** elements. Since Facebook updates its site frequently, those 'divs' change, and the selectors have to be updated accordingly for scraping to keep working.
125 |
126 | The developers have devoted a lot of time and effort to building and maintaining this tool. **In order to keep this amazing tool alive, we need support from you geeks.**
127 |
128 | The code is intuitive and easy to understand, so if you find that data is no longer being scraped from profiles, Facebook has most likely updated its site; update the relevant XPaths (they live in `selectors.json` and `params.json`, see the example below) and open a pull request. Much appreciated!
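
For example, these are a few of the entries in `selectors.json` at the repo root; a markup change can often be followed by tweaking an entry here (or an `elements_path` entry in `params.json`) without touching the Python code:

```json
{
    "status": "//div[contains(@class, '_5pbx')]",
    "comment": ".//div[@aria-label='Comment']",
    "comment_author": ".//a[@class='_6qw4']"
}
```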
129 |
130 | ## Sample
131 |
132 |
133 |
134 |
135 |
136 | ## Screenshot
137 |
138 |
139 |
140 |
141 |
142 | ---
143 |
144 | ## Usage 🔧
145 |
146 | ### Installation 💻
147 |
148 | You will need to:
149 |
150 | - Install latest version of [Google Chrome](https://www.google.com/chrome/).
151 | - Install [Python 3](https://www.python.org/downloads/)
152 | - Have a Facebook account without 2FA enabled
153 |
154 | ```bash
155 | git clone https://github.com/harismuneer/Ultimate-Facebook-Scraper.git
156 | cd Ultimate-Facebook-Scraper
157 |
158 | # Install Python requirements
159 | pip install -e .
160 | ```
161 |
162 | The code is multi-platform and is tested on both Windows and Linux.
163 | The Chrome driver is downloaded automatically by the webdriver_manager package.
164 |
165 | ### How to Run
166 |
167 | - Fill your Facebook credentials into [`credentials.yaml`](credentials.yaml)
168 | - Edit the [`input.txt`](input.txt) file and add links to profiles, groups, and individual group posts, one per line, in the following format:
169 |
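The sample [`input.txt`](input.txt) shipped with the repo looks like this:

```text
#Lines starting with # and empty lines will be ignored
https://www.facebook.com/andrew.ng.96
https://www.facebook.com/zuck
```
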
170 | Make sure each link ends with just the username or ID number, with nothing extra appended, and that it follows the format shown above.
171 |
172 | Run the `ultimate-facebook-scraper` command (installed by `pip install -e .`), or run the script directly! 🚀
173 |
174 | ```bash
175 | python scraper/scraper.py
176 | ```
177 |
178 | > Note: Friends' profile pictures and the user's photos can be downloaded in two modes: large size and small size. The default is small size because it is much quicker; large-size mode opens each photo individually and takes time proportional to the number of pictures to download.
179 |
180 | You can personalize your scraping needs using the command-line arguments:
181 |
182 | ```bash
183 | python scraper/scraper.py \
184 | --uploaded_photos True \
185 | --friends_photos True \
186 | --friends_small_size True \
187 | --photos_small_size True \
188 | --total_scrolls 2500 \
189 | --scroll_time 8
190 | ```
191 |
192 | Note that these are the default values, so you can omit them if you are just testing or are happy with the defaults. To switch to large-size downloads, see the example below.
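
For example, to download full-size friends' pictures and photos instead of thumbnails (much slower, since each photo is opened individually), you could run:

```bash
python scraper/scraper.py \
    --friends_small_size False \
    --photos_small_size False
```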
193 |
194 |
195 | ## Chromium
196 |
197 | Chromium users can add `--chromium True` to run using the Chromium browser.
198 |
199 | ```bash
200 | python scraper/scraper.py \
201 | --uploaded_photos True \
202 | --photos_small_size True \
203 | --total_scrolls 2500 \
204 | --scroll_time 8 \
205 | --chromium True
206 | ```
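
Independently of the browser choice, a `--headless` flag is also defined in `scraper/scraper.py`, so a run with no visible browser window would look roughly like this:

```bash
python scraper/scraper.py --headless True
```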
207 |
208 |
209 | ---
210 |
211 | ## Citation 📚
212 |
213 |
214 |
215 |
216 |
217 | If you use this tool for your research, then kindly cite it. Click the badge above for the complete citation and different citation formats such as IEEE and APA.
218 |
219 | ---
220 |
221 | ## Important Message ⚠️
222 |
223 | This tool is for research purposes only; the developers will not be responsible for any misuse of data collected with it. It is used by many researchers and open source intelligence (OSINT) analysts.
224 |
225 | This tool will not work if your account has two-factor authentication (2FA) enabled. You must disable 2FA before using it.
226 |
227 | ---
228 |
229 | ## Authors 👋
230 |
231 | You can get in touch with us on our LinkedIn Profiles:
232 |
233 | #### Haris Muneer
234 |
235 | [](https://www.linkedin.com/in/harismuneer)
236 |
237 | You can also follow my GitHub Profile to stay updated about my latest projects: [](https://github.com/harismuneer)
238 |
239 | #### Hassaan Elahi
240 |
241 | [](https://www.linkedin.com/in/hassaan-elahi/)
242 |
243 | You can also follow my GitHub Profile to stay updated about my latest projects: [](https://github.com/Hassaan-Elahi)
244 |
245 | If you liked the repo then please support it by giving it a star ⭐!
246 |
247 | ## For Future 🔮
248 |
249 | Shoutout to geeks willing to contribute to this project. Please have a look at the [UFS kanban board](https://github.com/harismuneer/Ultimate-Facebook-Scraper/projects/1) for a list of things to be done.
250 |
251 | There are a lot of features that can be added to this tool like adding support for pages, groups, comments etc! Please contribute :)
252 |
253 | ## Contributions Welcome ✨
254 |
255 | 
256 |
257 | If you find any bug in the code or have any improvements in mind then feel free to generate a pull request.
258 |
259 | > Note: We use [Black](https://pypi.org/project/black/) to format Python files. Please run it on your changes so your pull request passes the lint check 😉
260 |
261 | ## Issues 🔨
262 |
263 | [](https://www.github.com/harismuneer/Ultimate-Facebook-Scraper/issues)
264 |
265 | If you face any issue, you can create a new issue in the Issues Tab and I will be glad to help you out.
266 |
267 | ## License 📄
268 |
269 | [](LICENSE)
270 |
271 | Copyright (c) 2018-present, harismuneer, Hassaan-Elahi
272 |
--------------------------------------------------------------------------------
/scraper/scraper.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import sys
4 | import urllib.request
5 | import yaml
6 | import utils
7 | import argparse
8 |
9 | from selenium import webdriver
10 | from selenium.common.exceptions import NoSuchElementException
11 | from selenium.webdriver.chrome.options import Options
12 | from selenium.webdriver.common.by import By
13 | from selenium.webdriver.support import expected_conditions as EC
14 | from selenium.webdriver.support.ui import WebDriverWait
15 | from webdriver_manager.chrome import ChromeDriverManager
16 |
17 |
18 | def get_facebook_images_url(img_links):
19 | urls = []
20 |
21 | for link in img_links:
22 | if link != "None":
23 | valid_url_found = False
24 | driver.get(link)
25 |
26 | try:
27 | while not valid_url_found:
28 | WebDriverWait(driver, 30).until(
29 | EC.presence_of_element_located(
30 | (By.CLASS_NAME, selectors.get("spotlight"))
31 | )
32 | )
33 | element = driver.find_element_by_class_name(
34 | selectors.get("spotlight")
35 | )
36 | img_url = element.get_attribute("src")
37 |
38 | if img_url.find(".gif") == -1:
39 | valid_url_found = True
40 | urls.append(img_url)
41 | except Exception:
42 | urls.append("None")
43 | else:
44 | urls.append("None")
45 |
46 | return urls
47 |
48 |
49 | # -------------------------------------------------------------
50 | # -------------------------------------------------------------
51 |
52 | # takes a url and downloads image from that url
53 | def image_downloader(img_links, folder_name):
54 | """
55 | Download images from a list of image urls.
56 | :param img_links:
57 | :param folder_name:
58 | :return: list of image names downloaded
59 | """
60 | img_names = []
61 |
62 | try:
63 | parent = os.getcwd()
64 | try:
65 | folder = os.path.join(os.getcwd(), folder_name)
66 | utils.create_folder(folder)
67 | os.chdir(folder)
68 | except Exception:
69 | print("Error in changing directory.")
70 |
71 | for link in img_links:
72 | img_name = "None"
73 |
74 | if link != "None":
75 | img_name = (link.split(".jpg")[0]).split("/")[-1] + ".jpg"
76 |
77 | # this is the image id when there's no profile pic
78 | if img_name == selectors.get("default_image"):
79 | img_name = "None"
80 | else:
81 | try:
82 | urllib.request.urlretrieve(link, img_name)
83 | except Exception:
84 | img_name = "None"
85 |
86 | img_names.append(img_name)
87 |
88 | os.chdir(parent)
89 | except Exception:
90 | print("Exception (image_downloader):", sys.exc_info()[0])
91 | return img_names
92 |
93 |
94 | # -------------------------------------------------------------
95 | # -------------------------------------------------------------
96 |
97 |
98 | def extract_and_write_posts(elements, filename):
99 | try:
100 | f = open(filename, "w", newline="\r\n", encoding="utf-8")
101 | f.writelines(
102 | " TIME || TYPE || TITLE || STATUS || LINKS(Shared Posts/Shared Links etc) || POST_ID "
103 | + "\n"
104 | + "\n"
105 | )
106 | ids = []
107 | for x in elements:
108 | try:
109 | link = ""
110 | # id
111 | post_id = utils.get_post_id(x)
112 | ids.append(post_id)
113 |
114 | # time
115 | time = utils.get_time(x)
116 |
117 | link, status, title, post_type = get_status_and_title(link, x)
118 |
119 | line = (
120 | str(time)
121 | + " || "
122 | + str(post_type)
123 | + " || "
124 | + str(title)
125 | + " || "
126 | + str(status)
127 | + " || "
128 | + str(link)
129 | + " || "
130 | + str(post_id)
131 | + "\n"
132 | )
133 |
134 | try:
135 | f.writelines(line)
136 | except Exception:
137 | print("Posts: Could not map encoded characters")
138 | except Exception:
139 | pass
140 | f.close()
141 | except ValueError:
142 | print("Exception (extract_and_write_posts)", "Status =", sys.exc_info()[0])
143 | except Exception:
144 | print("Exception (extract_and_write_posts)", "Status =", sys.exc_info()[0])
145 | return
146 |
147 |
148 | def get_status_and_title(link, x):
149 | # title
150 | title = utils.get_title(x, selectors)
151 | if title.text.find("shared a memory") != -1:
152 | x = x.find_element_by_xpath(selectors.get("title_element"))
153 | title = utils.get_title(x, selectors)
154 | status = utils.get_status(x, selectors)
155 | if title.text == driver.find_element_by_id(selectors.get("title_text")).text:
156 | if status == "":
157 | temp = utils.get_div_links(x, "img", selectors)
158 | if temp == "": # no image tag which means . it is not a life event
159 | link = utils.get_div_links(x, "a", selectors).get_attribute("href")
160 | post_type = "status update without text"
161 | else:
162 | post_type = "life event"
163 | link = utils.get_div_links(x, "a", selectors).get_attribute("href")
164 | status = utils.get_div_links(x, "a", selectors).text
165 | else:
166 | post_type = "status update"
167 | if utils.get_div_links(x, "a", selectors) != "":
168 | link = utils.get_div_links(x, "a", selectors).get_attribute("href")
169 |
170 | elif title.text.find(" shared ") != -1:
171 | x1, link = utils.get_title_links(title)
172 | post_type = "shared " + x1
173 | elif title.text.find(" at ") != -1 or title.text.find(" in ") != -1:
174 | if title.text.find(" at ") != -1:
175 | x1, link = utils.get_title_links(title)
176 | post_type = "check in"
177 | elif title.text.find(" in ") != 1:
178 | status = utils.get_div_links(x, "a", selectors).text
179 | elif title.text.find(" added ") != -1 and title.text.find("photo") != -1:
180 | post_type = "added photo"
181 | link = utils.get_div_links(x, "a", selectors).get_attribute("href")
182 |
183 | elif title.text.find(" added ") != -1 and title.text.find("video") != -1:
184 | post_type = "added video"
185 | link = utils.get_div_links(x, "a", selectors).get_attribute("href")
186 |
187 | else:
188 | post_type = "others"
189 | if not isinstance(title, str):
190 | title = title.text
191 | status = status.replace("\n", " ")
192 | title = title.replace("\n", " ")
193 | return link, status, title, post_type
194 |
195 |
196 | def extract_and_write_group_posts(elements, filename):
197 | try:
198 | f = create_post_file(filename)
199 | ids = []
200 | for x in elements:
201 | try:
202 | # id
203 | post_id = utils.get_group_post_id(x)
204 | ids.append(post_id)
205 | except Exception:
206 | pass
207 | total = len(ids)
208 | i = 0
209 | for post_id in ids:
210 | i += 1
211 | try:
212 | add_group_post_to_file(f, filename, post_id, i, total, reload=True)
213 | except ValueError:
214 | pass
215 | f.close()
216 | except ValueError:
217 | print("Exception (extract_and_write_posts)", "Status =", sys.exc_info()[0])
218 | except Exception:
219 | print("Exception (extract_and_write_posts)", "Status =", sys.exc_info()[0])
220 | return
221 |
222 |
223 | def add_group_post_to_file(f, filename, post_id, number=1, total=1, reload=False):
224 | print("Scraping Post(" + post_id + "). " + str(number) + " of " + str(total))
225 | photos_dir = os.path.dirname(filename)
226 | if reload:
227 | driver.get(utils.create_post_link(post_id, selectors))
228 | line = get_group_post_as_line(post_id, photos_dir)
229 | try:
230 | f.writelines(line)
231 | except Exception:
232 | print("Posts: Could not map encoded characters")
233 |
234 |
235 | def create_post_file(filename):
236 | """
237 | Creates post file and header
238 | :param filename:
239 | :return: file
240 | """
241 | f = open(filename, "w", newline="\r\n", encoding="utf-8")
242 | f.writelines(
243 | "TIME || TYPE || TITLE || STATUS || LINKS(Shared Posts/Shared Links etc) || POST_ID || "
244 | "PHOTO || COMMENTS " + "\n"
245 | )
246 | return f
247 |
248 |
249 | # -------------------------------------------------------------
250 | # -------------------------------------------------------------
251 |
252 |
253 | def save_to_file(name, elements, status, current_section):
254 | """helper function used to save links to files"""
255 |
256 | # status 0 = dealing with friends list
257 | # status 1 = dealing with photos
258 | # status 2 = dealing with videos
259 | # status 3 = dealing with about section
260 | # status 4 = dealing with posts
261 | # status 5 = dealing with group posts
262 |
263 | try:
264 | f = None # file pointer
265 |
266 | if status != 4 and status != 5:
267 | f = open(name, "w", encoding="utf-8", newline="\r\n")
268 |
269 | results = []
270 | img_names = []
271 |
272 | # dealing with Friends
273 | if status == 0:
274 | # get profile links of friends
275 | results = [x.get_attribute("href") for x in elements]
276 | results = [create_original_link(x) for x in results]
277 |
278 | # get names of friends
279 | people_names = [
280 | x.find_element_by_tag_name("img").get_attribute("aria-label")
281 | for x in elements
282 | ]
283 |
284 | # download friends' photos
285 | try:
286 | if download_friends_photos:
287 | if friends_small_size:
288 | img_links = [
289 | x.find_element_by_css_selector("img").get_attribute("src")
290 | for x in elements
291 | ]
292 | else:
293 | links = []
294 | for friend in results:
295 | try:
296 | driver.get(friend)
297 | WebDriverWait(driver, 30).until(
298 | EC.presence_of_element_located(
299 | (
300 | By.CLASS_NAME,
301 | selectors.get("profilePicThumb"),
302 | )
303 | )
304 | )
305 | l = driver.find_element_by_class_name(
306 | selectors.get("profilePicThumb")
307 | ).get_attribute("href")
308 | except Exception:
309 | l = "None"
310 |
311 | links.append(l)
312 |
313 | for i, _ in enumerate(links):
314 | if links[i] is None:
315 | links[i] = "None"
316 | elif links[i].find("picture/view") != -1:
317 | links[i] = "None"
318 |
319 | img_links = get_facebook_images_url(links)
320 |
321 | folder_names = [
322 | "Friend's Photos",
323 | "Mutual Friends' Photos",
324 | "Following's Photos",
325 | "Follower's Photos",
326 | "Work Friends Photos",
327 | "College Friends Photos",
328 | "Current City Friends Photos",
329 | "Hometown Friends Photos",
330 | ]
331 | print("Downloading " + folder_names[current_section])
332 |
333 | img_names = image_downloader(
334 | img_links, folder_names[current_section]
335 | )
336 | else:
337 | img_names = ["None"] * len(results)
338 | except Exception:
339 | print(
340 | "Exception (Images)",
341 | str(status),
342 | "Status =",
343 | current_section,
344 | sys.exc_info()[0],
345 | )
346 |
347 | # dealing with Photos
348 | elif status == 1:
349 | results = [x.get_attribute("href") for x in elements]
350 | results.pop(0)
351 |
352 | try:
353 | if download_uploaded_photos:
354 | if photos_small_size:
355 | background_img_links = driver.find_elements_by_xpath(
356 | selectors.get("background_img_links")
357 | )
358 | background_img_links = [
359 | x.get_attribute("style") for x in background_img_links
360 | ]
361 | background_img_links = [
362 | ((x.split("(")[1]).split(")")[0]).strip('"')
363 | for x in background_img_links
364 | ]
365 | else:
366 | background_img_links = get_facebook_images_url(results)
367 |
368 | folder_names = ["Uploaded Photos", "Tagged Photos"]
369 | print("Downloading " + folder_names[current_section])
370 |
371 | img_names = image_downloader(
372 | background_img_links, folder_names[current_section]
373 | )
374 | else:
375 | img_names = ["None"] * len(results)
376 | except Exception:
377 | print(
378 | "Exception (Images)",
379 | str(status),
380 | "Status =",
381 | current_section,
382 | sys.exc_info()[0],
383 | )
384 |
385 | # dealing with Videos
386 | elif status == 2:
387 | results = elements[0].find_elements_by_css_selector("li")
388 | results = [
389 | x.find_element_by_css_selector("a").get_attribute("href")
390 | for x in results
391 | ]
392 |
393 | try:
394 | if results[0][0] == "/":
395 |                     results = [r[1:] for r in results]
396 | results = [(selectors.get("fb_link") + x) for x in results]
397 | except Exception:
398 | pass
399 |
400 | # dealing with About Section
401 | elif status == 3:
402 | results = elements[0].text
403 | f.writelines(results)
404 |
405 | # dealing with Posts
406 | elif status == 4:
407 | extract_and_write_posts(elements, name)
408 | return
409 |
410 | # dealing with Group Posts
411 | elif status == 5:
412 | extract_and_write_group_posts(elements, name)
413 | return
414 |
415 | """Write results to file"""
416 | if status == 0:
417 | for i, _ in enumerate(results):
418 | # friend's profile link
419 | f.writelines(results[i])
420 | f.write(",")
421 |
422 | # friend's name
423 | f.writelines(people_names[i])
424 | f.write(",")
425 |
426 | # friend's downloaded picture id
427 | f.writelines(img_names[i])
428 | f.write("\n")
429 |
430 | elif status == 1:
431 | for i, _ in enumerate(results):
432 | # image's link
433 | f.writelines(results[i])
434 | f.write(",")
435 |
436 | # downloaded picture id
437 | f.writelines(img_names[i])
438 | f.write("\n")
439 |
440 | elif status == 2:
441 | for x in results:
442 | f.writelines(x + "\n")
443 |
444 | f.close()
445 |
446 | except Exception:
447 | print("Exception (save_to_file)", "Status =", str(status), sys.exc_info()[0])
448 |
449 | return
450 |
451 |
452 | # ----------------------------------------------------------------------------
453 | # -----------------------------------------------------------------------------
454 |
455 |
456 | def scrape_data(url, scan_list, section, elements_path, save_status, file_names):
457 | """Given some parameters, this function can scrap friends/photos/videos/about/posts(statuses) of a profile"""
458 | page = []
459 |
460 | if save_status == 4 or save_status == 5:
461 | page.append(url)
462 |
463 | page += [url + s for s in section]
464 |
465 | for i, _ in enumerate(scan_list):
466 | try:
467 | driver.get(page[i])
468 |
469 | if (
470 | (save_status == 0) or (save_status == 1) or (save_status == 2)
471 | ): # Only run this for friends, photos and videos
472 |
473 | # the bar which contains all the sections
474 | sections_bar = driver.find_element_by_xpath(
475 | selectors.get("sections_bar")
476 | )
477 |
478 | if sections_bar.text.find(scan_list[i]) == -1:
479 | continue
480 |
481 | if save_status != 3:
482 | utils.scroll(total_scrolls, driver, selectors, scroll_time)
483 | pass
484 |
485 | data = driver.find_elements_by_xpath(elements_path[i])
486 |
487 | save_to_file(file_names[i], data, save_status, i)
488 |
489 | except Exception:
490 | print(
491 | "Exception (scrape_data)",
492 | str(i),
493 | "Status =",
494 | str(save_status),
495 | sys.exc_info()[0],
496 | )
497 |
498 |
499 | # -----------------------------------------------------------------------------
500 | # -----------------------------------------------------------------------------
501 |
502 |
503 | def create_original_link(url):
504 | if url.find(".php") != -1:
505 | original_link = (
506 | facebook_https_prefix + facebook_link_body + ((url.split("="))[1])
507 | )
508 |
509 | if original_link.find("&") != -1:
510 | original_link = original_link.split("&")[0]
511 |
512 | elif url.find("fnr_t") != -1:
513 | original_link = (
514 | facebook_https_prefix
515 | + facebook_link_body
516 | + ((url.split("/"))[-1].split("?")[0])
517 | )
518 | elif url.find("_tab") != -1:
519 | original_link = (
520 | facebook_https_prefix
521 | + facebook_link_body
522 | + (url.split("?")[0]).split("/")[-1]
523 | )
524 | else:
525 | original_link = url
526 |
527 | return original_link
528 |
529 |
530 | def scrap_profile():
531 | data_folder = os.path.join(os.getcwd(), "data")
532 | utils.create_folder(data_folder)
533 | os.chdir(data_folder)
534 |
535 | # execute for all profiles given in input.txt file
536 | url = driver.current_url
537 | user_id = create_original_link(url)
538 |
539 | print("\nScraping:", user_id)
540 |
541 | try:
542 | target_dir = os.path.join(data_folder, user_id.split("/")[-1])
543 | utils.create_folder(target_dir)
544 | os.chdir(target_dir)
545 | except Exception:
546 | print("Some error occurred in creating the profile directory.")
547 | os.chdir("../..")
548 | return
549 |
550 | to_scrap = ["Friends", "Photos", "Videos", "About", "Posts"]
551 | for item in to_scrap:
552 | print("----------------------------------------")
553 | print("Scraping {}..".format(item))
554 |
555 | if item == "Posts":
556 | scan_list = [None]
557 | elif item == "About":
558 | scan_list = [None] * 7
559 | else:
560 | scan_list = params[item]["scan_list"]
561 |
562 | section = params[item]["section"]
563 | elements_path = params[item]["elements_path"]
564 | file_names = params[item]["file_names"]
565 | save_status = params[item]["save_status"]
566 |
567 | scrape_data(user_id, scan_list, section, elements_path, save_status, file_names)
568 |
569 | print("{} Done!".format(item))
570 |
571 | print("Finished Scraping Profile " + str(user_id) + ".")
572 | os.chdir("../..")
573 |
574 | return
575 |
576 |
577 | def get_comments():
578 | comments = []
579 | try:
580 | data = driver.find_element_by_xpath(selectors.get("comment_section"))
581 | reply_links = driver.find_elements_by_xpath(
582 | selectors.get("more_comment_replies")
583 | )
584 | for link in reply_links:
585 | try:
586 | driver.execute_script("arguments[0].click();", link)
587 | except Exception:
588 | pass
589 | see_more_links = driver.find_elements_by_xpath(
590 | selectors.get("comment_see_more_link")
591 | )
592 | for link in see_more_links:
593 | try:
594 | driver.execute_script("arguments[0].click();", link)
595 | except Exception:
596 | pass
597 | data = data.find_elements_by_xpath(selectors.get("comment"))
598 | for d in data:
599 | try:
600 | author = d.find_element_by_xpath(selectors.get("comment_author")).text
601 | text = d.find_element_by_xpath(selectors.get("comment_text")).text
602 | replies = utils.get_replies(d, selectors)
603 | comments.append([author, text, replies])
604 | except Exception:
605 | pass
606 | except Exception:
607 | pass
608 | return comments
609 |
610 |
611 | def get_group_post_as_line(post_id, photos_dir):
612 | try:
613 | data = driver.find_element_by_xpath(selectors.get("single_post"))
614 | time = utils.get_time(data)
615 | title = utils.get_title(data, selectors).text
616 | # link, status, title, type = get_status_and_title(title,data)
617 | link = utils.get_div_links(data, "a", selectors)
618 | if link != "":
619 | link = link.get_attribute("href")
620 | post_type = ""
621 | status = '"' + utils.get_status(data, selectors).replace("\r\n", " ") + '"'
622 | photos = utils.get_post_photos_links(data, selectors, photos_small_size)
623 | comments = get_comments()
624 | photos = image_downloader(photos, photos_dir)
625 | line = (
626 | str(time)
627 | + "||"
628 | + str(post_type)
629 | + "||"
630 | + str(title)
631 | + "||"
632 | + str(status)
633 | + "||"
634 | + str(link)
635 | + "||"
636 | + str(post_id)
637 | + "||"
638 | + str(photos)
639 | + "||"
640 | + str(comments)
641 | + "\n"
642 | )
643 | return line
644 | except Exception:
645 | return ""
646 |
647 |
648 | def create_folders():
649 | """
650 | Creates folder for saving data (profile, post or group) according to current driver url
651 | Changes current dir to target_dir
652 | :return: target_dir or None in case of failure
653 | """
654 | folder = os.path.join(os.getcwd(), "data")
655 | utils.create_folder(folder)
656 | os.chdir(folder)
657 | try:
658 | item_id = get_item_id(driver.current_url)
659 | target_dir = os.path.join(folder, item_id)
660 | utils.create_folder(target_dir)
661 | os.chdir(target_dir)
662 | return target_dir
663 | except Exception:
664 | print("Some error occurred in creating the group directory.")
665 | os.chdir("../..")
666 | return None
667 |
668 |
669 | def get_item_id(url):
670 | """
671 | Gets item id from url
672 | :param url: facebook url string
673 | :return: item id or empty string in case of failure
674 | """
675 | ret = ""
676 | try:
677 | link = create_original_link(url)
678 | ret = link.split("/")[-1]
679 | if ret.strip() == "":
680 | ret = link.split("/")[-2]
681 | except Exception as e:
682 | print("Failed to get id: " + format(e))
683 | return ret
684 |
685 |
686 | def scrape_group(url):
687 | if create_folders() is None:
688 | return
689 | group_id = get_item_id(url)
690 | # execute for all profiles given in input.txt file
691 | print("\nScraping:", group_id)
692 |
693 | to_scrap = ["GroupPosts"] # , "Photos", "Videos", "About"]
694 | for item in to_scrap:
695 | print("----------------------------------------")
696 | print("Scraping {}..".format(item))
697 |
698 | if item == "GroupPosts":
699 | scan_list = [None]
700 | elif item == "About":
701 | scan_list = [None] * 7
702 | else:
703 | scan_list = params[item]["scan_list"]
704 |
705 | section = params[item]["section"]
706 | elements_path = params[item]["elements_path"]
707 | file_names = params[item]["file_names"]
708 | save_status = params[item]["save_status"]
709 |
710 | scrape_data(url, scan_list, section, elements_path, save_status, file_names)
711 |
712 | print("{} Done!".format(item))
713 |
714 | print("Finished Scraping Group " + str(group_id) + ".")
715 | os.chdir("../..")
716 |
717 | return
718 |
719 |
720 | # -----------------------------------------------------------------------------
721 | # -----------------------------------------------------------------------------
722 |
723 |
724 | def login(email, password):
725 | """ Logging into our own profile """
726 |
727 | try:
728 | global driver
729 |
730 | options = Options()
731 |
732 | # Code to disable notifications pop up of Chrome Browser
733 | options.add_argument("--disable-notifications")
734 | options.add_argument("--disable-infobars")
735 | options.add_argument("--mute-audio")
736 |
737 | if headless:
738 | options.add_argument('--headless')
739 |
740 | try:
741 | if chromium:
742 | from selenium.webdriver import Chrome
743 | driver = webdriver.Chrome(
744 | options=options
745 | )
746 | else:
747 | driver = webdriver.Chrome(
748 | executable_path=ChromeDriverManager().install(), options=options
749 | )
750 | except Exception:
751 | print("Error loading chrome webdriver " + sys.exc_info()[0])
752 | exit(1)
753 |
754 | fb_path = facebook_https_prefix + facebook_link_body
755 | driver.get(fb_path)
756 | driver.maximize_window()
757 |
758 | # filling the form
759 | driver.find_element_by_name("email").send_keys(email)
760 | driver.find_element_by_name("pass").send_keys(password)
761 |
762 | try:
763 | # clicking on login button
764 | driver.find_element_by_id("loginbutton").click()
765 | except NoSuchElementException:
766 | # Facebook new design
767 | driver.find_element_by_name("login").click()
768 |
769 | # if your account uses multi factor authentication
770 | mfa_code_input = utils.safe_find_element_by_id(driver, "approvals_code")
771 |
772 | if mfa_code_input is None:
773 | return
774 |
775 | mfa_code_input.send_keys(input("Enter MFA code: "))
776 | driver.find_element_by_id("checkpointSubmitButton").click()
777 |
778 | # there are so many screens asking you to verify things. Just skip them all
779 | while (
780 | utils.safe_find_element_by_id(driver, "checkpointSubmitButton") is not None
781 | ):
782 | dont_save_browser_radio = utils.safe_find_element_by_id(driver, "u_0_3")
783 | if dont_save_browser_radio is not None:
784 | dont_save_browser_radio.click()
785 |
786 | driver.find_element_by_id("checkpointSubmitButton").click()
787 |
788 | except Exception:
789 | print("There's some error in log in.")
790 | print(sys.exc_info()[0])
791 | exit(1)
792 |
793 |
794 | # -----------------------------------------------------------------------------
795 | # -----------------------------------------------------------------------------
796 |
797 |
798 | def scraper(**kwargs):
799 | with open("credentials.yaml", "r") as ymlfile:
800 | cfg = yaml.safe_load(stream=ymlfile)
801 |
802 | if ("password" not in cfg) or ("email" not in cfg):
803 | print("Your email or password is missing. Kindly write them in credentials.txt")
804 | exit(1)
805 | urls = [
806 | facebook_https_prefix + facebook_link_body + get_item_id(line)
807 | for line in open("input.txt", newline="\r\n")
808 | if not line.lstrip().startswith("#") and not line.strip() == ""
809 | ]
810 |
811 | if len(urls) > 0:
812 | print("\nStarting Scraping...")
813 | login(cfg["email"], cfg["password"])
814 | for url in urls:
815 | driver.get(url)
816 | link_type = utils.identify_url(driver.current_url)
817 | if link_type == 0:
818 | scrap_profile()
819 | elif link_type == 1:
820 | # scrap_post(url)
821 | pass
822 | elif link_type == 2:
823 | scrape_group(driver.current_url)
824 | elif link_type == 3:
825 | file_name = params["GroupPosts"]["file_names"][0]
826 | item_id = get_item_id(driver.current_url)
827 | if create_folders() is None:
828 | continue
829 | f = create_post_file(file_name)
830 | add_group_post_to_file(f, file_name, item_id)
831 | f.close()
832 | os.chdir("../..")
833 | driver.close()
834 | else:
835 | print("Input file is empty.")
836 |
837 |
838 | # -------------------------------------------------------------
839 | # -------------------------------------------------------------
840 | # -------------------------------------------------------------
841 |
842 | if __name__ == "__main__":
843 | ap = argparse.ArgumentParser()
844 | # PLS CHECK IF HELP CAN BE BETTER / LESS AMBIGUOUS
845 | ap.add_argument(
846 | "-dup",
847 | "--uploaded_photos",
848 | help="download users' uploaded photos?",
849 | default=True,
850 | )
851 | ap.add_argument(
852 | "-dfp", "--friends_photos", help="download users' photos?", default=True
853 | )
854 | ap.add_argument(
855 | "-fss",
856 | "--friends_small_size",
857 | help="Download friends pictures in small size?",
858 | default=True,
859 | )
860 | ap.add_argument(
861 | "-pss",
862 | "--photos_small_size",
863 | help="Download photos in small size?",
864 | default=True,
865 | )
866 | ap.add_argument(
867 | "-ts",
868 | "--total_scrolls",
869 | help="How many times should I scroll down?",
870 | default=2500,
871 | )
872 | ap.add_argument(
873 | "-st", "--scroll_time", help="How much time should I take to scroll?", default=8
874 | )
875 | ap.add_argument(
876 | "--chromium",
877 | nargs='?',
878 | const=True,
879 | help="Should I use Chromium instead?",
880 | default=False
881 | )
882 | ap.add_argument(
883 | "--headless",
884 | nargs='?',
885 | const=True,
886 | help="Should I run in a headless browser?",
887 | default=False
888 | )
889 |
890 | args = vars(ap.parse_args())
891 | print(args)
892 |
893 | # ---------------------------------------------------------
894 | # Global Variables
895 | # ---------------------------------------------------------
896 |
897 | # whether to download photos or not
898 | download_uploaded_photos = utils.to_bool(args["uploaded_photos"])
899 | download_friends_photos = utils.to_bool(args["friends_photos"])
900 |
901 | # whether to download the full image or its thumbnail (small size)
902 | # if small size is True then it will be very quick else if its false then it will open each photo to download it
903 | # and it will take much more time
904 | friends_small_size = utils.to_bool(args["friends_small_size"])
905 | photos_small_size = utils.to_bool(args["photos_small_size"])
906 |
907 | total_scrolls = int(args["total_scrolls"])
908 | scroll_time = int(args["scroll_time"])
909 | chromium = utils.to_bool(args["chromium"])
910 | headless = utils.to_bool(args["headless"])
911 |
912 | current_scrolls = 0
913 | old_height = 0
914 |
915 | driver = None
916 |
917 | with open("selectors.json") as a, open("params.json") as b:
918 | selectors = json.load(a)
919 | params = json.load(b)
920 |
921 | firefox_profile_path = selectors.get("firefox_profile_path")
922 | facebook_https_prefix = selectors.get("facebook_https_prefix")
923 | facebook_link_body = selectors.get("facebook_link_body")
924 |
925 | # get things rolling
926 | scraper()
927 |
--------------------------------------------------------------------------------