├── .gitignore
├── FacebookPostsScraper.py
├── LICENSE
├── README.md
├── main.py
└── requirements.txt

/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 | 
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 | 
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 | 
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 | 
54 | # Translations
55 | *.mo
56 | *.pot
57 | 
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 | 
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 | 
68 | # Scrapy stuff:
69 | .scrapy
70 | 
71 | # Sphinx documentation
72 | docs/_build/
73 | 
74 | # PyBuilder
75 | target/
76 | 
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 | 
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 | 
84 | # pyenv
85 | .python-version
86 | 
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 | 
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 | 
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 
--------------------------------------------------------------------------------
/FacebookPostsScraper.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import pickle
4 | import os
5 | from urllib.parse import urlparse, unquote
6 | from urllib.parse import parse_qs
7 | import pandas as pd
8 | import json
9 | 
10 | 
11 | class FacebookPostsScraper:
12 | 
13 |     # We need the email and password to access Facebook, and optionally the text in the Url that identifies the "view full post".
14 |     def __init__(self, email, password, post_url_text='Full Story'):
15 |         self.email = email
16 |         self.password = password
17 |         self.headers = {  # This is the important part: the Nokia C3 User Agent
18 |             'User-Agent': 'NokiaC3-00/5.0 (07.20) Profile/MIDP-2.1 Configuration/CLDC-1.1 Mozilla/5.0 AppleWebKit/420+ (KHTML, like Gecko) Safari/420+'
19 |         }
20 |         self.session = requests.session()  # Create the session for the next requests
21 |         self.cookies_path = 'session_facebook.cki'  # Name of the file used to store the session cookies.
22 | 
23 |         # At some point we need to find the link text that points to the full post. In my case Facebook is in
24 |         # English, which is why it says 'Full Story'; change this to match your language.
25 |         # Some translations:
26 |         # - English: 'Full Story'
27 |         # - Spanish: 'Historia completa'
28 |         self.post_url_text = post_url_text
29 | 
30 |         # If no cookie file exists yet, we make the login request to Facebook;
31 |         # otherwise we just load the stored cookies to reuse the previous session.
32 |         if self.new_session():
33 |             self.login()
34 | 
35 |         self.posts = []  # Store the scraped posts
36 | 
37 |     # Check whether we already have a saved session or need to log in to Facebook
38 |     def new_session(self):
39 |         if not os.path.exists(self.cookies_path):
40 |             return True
41 | 
42 |         with open(self.cookies_path, 'rb') as f:
43 |             cookies = pickle.load(f)
44 |         self.session.cookies = cookies
45 |         return False
46 | 
47 |     # Utility function to make the requests and convert the response to a soup object if necessary
48 |     def make_request(self, url, method='GET', data=None, is_soup=True):
49 |         if len(url) == 0:
50 |             raise Exception('Empty Url')
51 | 
52 |         if method == 'GET':
53 |             resp = self.session.get(url, headers=self.headers)
54 |         elif method == 'POST':
55 |             resp = self.session.post(url, headers=self.headers, data=data)
56 |         else:
57 |             raise Exception(f'Method [{method}] Not Supported')
58 | 
59 |         if resp.status_code != 200:
60 |             raise Exception(f'Error [{resp.status_code}] > {url}')
61 | 
62 |         if is_soup:
63 |             return BeautifulSoup(resp.text, 'lxml')
64 |         return resp
65 | 
66 |     # The first time, we log in
67 |     def login(self):
68 |         # Get the HTML content of the mobile Facebook login page
69 |         url_home = "https://m.facebook.com/"
70 |         soup = self.make_request(url_home)
71 |         if soup is None:
72 |             raise Exception("Couldn't load the Login Page")
73 | 
74 |         # Here we need to extract these tokens from the login page
75 |         lsd = soup.find("input", {"name": "lsd"}).get("value")
76 |         jazoest = soup.find("input", {"name": "jazoest"}).get("value")
77 |         m_ts = soup.find("input", {"name": "m_ts"}).get("value")
78 |         li = soup.find("input", {"name": "li"}).get("value")
79 |         try_number = soup.find("input", {"name": "try_number"}).get("value")
80 |         unrecognized_tries = soup.find("input", {"name": "unrecognized_tries"}).get("value")
81 | 
82 |         # This is the url where the login params are sent to Facebook
83 |         url_login = "https://m.facebook.com/login/device-based/regular/login/?refsrc=https%3A%2F%2Fm.facebook.com%2F&lwv=100&refid=8"
84 |         payload = {
85 |             "lsd": lsd,
86 |             "jazoest": jazoest,
87 |             "m_ts": m_ts,
88 |             "li": li,
89 |             "try_number": try_number,
90 |             "unrecognized_tries": unrecognized_tries,
91 |             "email": self.email,
92 |             "pass": self.password,
93 |             "login": "Iniciar sesión",
94 |             "prefill_contact_point": "",
95 |             "prefill_source": "",
96 |             "prefill_type": "",
97 |             "first_prefill_source": "",
98 |             "first_prefill_type": "",
99 |             "had_cp_prefilled": "false",
100 |             "had_password_prefilled": "false",
"false", 101 | "is_smart_lock": "false", 102 | "_fb_noscript": "true" 103 | } 104 | soup = self.make_request(url_login, method='POST', data=payload, is_soup=True) 105 | if soup is None: 106 | raise Exception(f"The login request couldn't be made: {url_login}") 107 | 108 | redirect = soup.select_one('a') 109 | if not redirect: 110 | raise Exception("Please log in desktop/mobile Facebook and change your password") 111 | 112 | url_redirect = redirect.get('href', '') 113 | resp = self.make_request(url_redirect) 114 | if resp is None: 115 | raise Exception(f"The login request couldn't be made: {url_redirect}") 116 | 117 | # Finally we get the cookies from the session and save it in a file for future usage 118 | cookies = self.session.cookies 119 | f = open(self.cookies_path, 'wb') 120 | pickle.dump(cookies, f) 121 | 122 | return {'code': 200} 123 | 124 | # Scrap a list of profiles 125 | def get_posts_from_list(self, profiles): 126 | data = [] 127 | n = len(profiles) 128 | 129 | for idx in range(n): 130 | profile = profiles[idx] 131 | print(f'{idx + 1}/{n}. {profile}') 132 | 133 | posts = self.get_posts_from_profile(profile) 134 | data.append(posts) 135 | 136 | return data 137 | 138 | # This is the extraction point! 139 | def get_posts_from_profile(self, url_profile): 140 | # Prepare the Url to point to the posts feed 141 | if "www." in url_profile: url_profile = url_profile.replace('www.', 'm.') 142 | if 'v=timeline' not in url_profile: 143 | if '?' in url_profile: 144 | url_profile = f'{url_profile}&v=timeline' 145 | else: 146 | url_profile = f'{url_profile}?v=timeline' 147 | 148 | is_group = '/groups/' in url_profile 149 | 150 | # Make a simple GET request 151 | soup = self.make_request(url_profile) 152 | if soup is None: 153 | print(f"Couldn't load the Page: {url_profile}") 154 | return [] 155 | 156 | # Now the extraction... 157 | css_profile = '.storyStream > div' # Select the posts from a user profile 158 | css_page = '#recent > div > div > div' # Select the posts from a Facebook page 159 | css_group = '#m_group_stories_container > div > div' # Select the posts from a Facebook group 160 | raw_data = soup.select(f'{css_profile} , {css_page} , {css_group}') # Now join and scrape it 161 | posts = [] 162 | for item in raw_data: # Now, for every post... 163 | published = item.select_one('abbr') # Get the formatted datetime of published 164 | description = item.select('p') # Get list of all p tag, they compose the description 165 | images = item.select('a > img') # Get list of all images 166 | _external_links = item.select('p a') # Get list of any link in the description, this are external links 167 | post_url = item.find('a', text=self.post_url_text) # Get the url to point this post. 168 | like_url = item.find('a', text='Like') # Get the Like url. 
169 | 
170 |             # Clean the publish date
171 |             if published is not None:
172 |                 published = published.get_text()
173 |             else:
174 |                 published = ''
175 | 
176 |             # Join all the text in p tags, else set an empty string
177 |             if len(description) > 0:
178 |                 description = '\n'.join([d.get_text() for d in description])
179 |             else:
180 |                 description = ''
181 | 
182 |             # Get all the image links
183 |             images = [image.get('src', '') for image in images]
184 | 
185 |             # Clean the post link
186 |             if post_url is not None:
187 |                 post_url = post_url.get('href', '')
188 |                 if len(post_url) > 0:
189 |                     post_url = f'https://www.facebook.com{post_url}'
190 |                     p_url = urlparse(post_url)
191 |                     qs = parse_qs(p_url.query)
192 |                     if not is_group:
193 |                         post_url = f'{p_url.scheme}://{p_url.hostname}{p_url.path}?story_fbid={qs["story_fbid"][0]}&id={qs["id"][0]}'
194 |                     else:
195 |                         post_url = f'{p_url.scheme}://{p_url.hostname}{p_url.path}/permalink/{qs["id"][0]}/'
196 |             else:
197 |                 post_url = ''
198 | 
199 |             # Clean the Like link
200 |             if like_url is not None:
201 |                 like_url = like_url.get('href', '')
202 |                 if len(like_url) > 0:
203 |                     like_url = f'https://m.facebook.com{like_url}'
204 |             else:
205 |                 like_url = ''
206 | 
207 |             # Get the list of external links in the post description, if any
208 |             external_links = []
209 |             for link in _external_links:
210 |                 link = link.get('href', '')
211 |                 try:
212 |                     a = link.index("u=") + 2
213 |                     z = link.index("&h=")
214 |                     link = unquote(link[a:z])
215 |                     link = link.split("?fbclid=")[0]
216 |                     external_links.append(link)
217 |                 except ValueError:
218 |                     continue
219 |             post = {'published': published, 'description': description, 'images': images,
220 |                     'post_url': post_url, 'external_links': external_links, 'like_url': like_url}
221 |             posts.append(post)
222 |             self.posts.append(post)
223 |         return posts
224 | 
225 |     def posts_to_csv(self, filename):
226 |         if not filename.endswith('.csv'):
227 |             filename = f'{filename}.csv'
228 | 
229 |         df = pd.DataFrame(self.posts)
230 |         df.to_csv(filename)
231 | 
232 |     def posts_to_excel(self, filename):
233 |         if not filename.endswith('.xlsx'):
234 |             filename = f'{filename}.xlsx'
235 | 
236 |         df = pd.DataFrame(self.posts)
237 |         df.to_excel(filename)
238 | 
239 |     def posts_to_json(self, filename):
240 |         if not filename.endswith('.json'):
241 |             filename = f'{filename}.json'
242 | 
243 |         # Dump the whole list at once so the output is valid JSON; writing entries
244 |         # one by one left a trailing comma before the closing bracket.
245 |         with open(filename, 'w') as f:
246 |             json.dump(self.posts, f)
247 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2020 Maxwell Smith
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # FacebookPostsScraper
2 | 
3 | Scraper for posts in Facebook user profiles, pages and groups.
4 | 
5 | Extracts a list of dicts with:
6 | 
7 | | params         | description |
8 | | -------------- | ----------- |
9 | | published      | Formatted datetime of publication |
10 | | description    | Post text content |
11 | | images         | List of images in the post |
12 | | post_url       | The unique post url |
13 | | external_links | External links found in the description |
14 | | like_url       | The Like url |
15 | 
16 | ## Installation
17 | 
18 | 1. Get [Python](https://www.python.org/downloads/) (Python 3.7+ recommended)
19 | 
20 | 2. Clone or download this repository
21 | 
22 | ```shell script
23 | git clone https://github.com/adeoy/FacebookPostsScraper.git
24 | ```
25 | 
26 | 3. Install the Python requirements
27 | 
28 | ```shell script
29 | pip install -r requirements.txt
30 | ```
31 | 
32 | 4. Follow the examples.
33 | 
34 | ## Description
35 | 
36 | ### The FacebookPostsScraper Class
37 | 
38 | Constructor params:
39 | 
40 | | param         | description |
41 | | ------------- | ----------- |
42 | | email         | Your email to access Facebook |
43 | | password      | Your password to access Facebook |
44 | | post_url_text | The text of the link that opens a post in the mobile version; set it when your Facebook isn't in English |
45 | 
46 | When the object is instantiated it logs in automatically, sets up the session and saves a cookie file for future use.
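The session is stored in a cookie file named `session_facebook.cki` next to the script; delete that file to force a fresh login on the next run. Below is a minimal sketch of a non-English setup (the credentials are placeholders, and the Spanish link text is taken from the translations listed in the class):

```python
from FacebookPostsScraper import FacebookPostsScraper as Fps

# Placeholder credentials -- replace with your own.
email = 'YOUR_EMAIL'
password = 'YOUR_PASSWORD'

# If your Facebook UI is in Spanish, the link that opens a post says
# 'Historia completa' instead of 'Full Story', so pass it explicitly.
fps = Fps(email, password, post_url_text='Historia completa')

# The first run logs in and writes session_facebook.cki; later runs find the
# cookie file and reuse the stored session instead of logging in again.
```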
47 | 
48 | ### Methods
49 | 
50 | `get_posts_from_profile`
51 | 
52 | params:
53 | 
54 | - profile url
55 | 
56 | return:
57 | 
58 | - list of dicts with the data described above
59 | 
60 | `get_posts_from_list`
61 | 
62 | params:
63 | 
64 | - list of profile urls
65 | 
66 | return:
67 | 
68 | - list of lists of dicts with the data described above
69 | 
70 | ## Examples
71 | 
72 | ### Example with a single url
73 | 
74 | ```python
75 | from FacebookPostsScraper import FacebookPostsScraper as Fps
76 | from pprint import pprint as pp
77 | 
78 | # Enter your Facebook email and password
79 | email = 'YOUR_EMAIL'
80 | password = 'YOUR_PASSWORD'
81 | 
82 | # Instantiate an object
83 | fps = Fps(email, password, post_url_text='Full Story')
84 | 
85 | # Example with a single profile
86 | single_profile = 'https://www.facebook.com/BillGates'
87 | data = fps.get_posts_from_profile(single_profile)
88 | pp(data)
89 | 
90 | fps.posts_to_csv('my_posts')  # You can export the posts as a CSV document
91 | # fps.posts_to_excel('my_posts')  # You can export the posts as an Excel document
92 | # fps.posts_to_json('my_posts')  # You can export the posts as a JSON document
93 | ```
94 | 
95 | ### Example with multiple urls
96 | 
97 | ```python
98 | from FacebookPostsScraper import FacebookPostsScraper as Fps
99 | from pprint import pprint as pp
100 | 
101 | # Enter your Facebook email and password
102 | email = 'YOUR_EMAIL'
103 | password = 'YOUR_PASSWORD'
104 | 
105 | # Instantiate an object
106 | fps = Fps(email, password, post_url_text='Full Story')
107 | 
108 | # Example with multiple profiles
109 | profiles = [
110 |     'https://www.facebook.com/zuck',  # User profile
111 |     'https://www.facebook.com/thepracticaldev',  # Facebook page
112 |     'https://www.facebook.com/groups/python'  # Facebook group
113 | ]
114 | data = fps.get_posts_from_list(profiles)
115 | pp(data)
116 | 
117 | fps.posts_to_csv('my_posts')  # You can export the posts as a CSV document
118 | # fps.posts_to_excel('my_posts')  # You can export the posts as an Excel document
119 | # fps.posts_to_json('my_posts')  # You can export the posts as a JSON document
120 | ```
121 | 
122 | ## Questions
123 | 
124 | Feel free to ask anything you want in the Issues section.
125 | 
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | from FacebookPostsScraper import FacebookPostsScraper as Fps
2 | from pprint import pprint as pp
3 | 
4 | 
5 | def main():
6 |     # Enter your Facebook email and password
7 |     email = 'YOUR_EMAIL'
8 |     password = 'YOUR_PASSWORD'
9 | 
10 |     # Instantiate an object
11 |     fps = Fps(email, password, post_url_text='Full Story')
12 | 
13 |     # Example with a single profile
14 |     single_profile = 'https://www.facebook.com/BillGates'
15 |     data = fps.get_posts_from_profile(single_profile)
16 |     pp(data)
17 | 
18 |     # Example with multiple profiles
19 |     profiles = [
20 |         'https://www.facebook.com/zuck',  # User profile
21 |         'https://www.facebook.com/thepracticaldev',  # Facebook page
22 |         'https://www.facebook.com/groups/python'  # Facebook group
23 |     ]
24 | 
25 |     data = fps.get_posts_from_list(profiles)
26 |     pp(data)
27 | 
28 |     fps.posts_to_csv('my_posts')  # You can export the posts as a CSV document
29 |     # fps.posts_to_excel('my_posts')  # You can export the posts as an Excel document
30 |     # fps.posts_to_json('my_posts')  # You can export the posts as a JSON document
31 | 
32 | 
33 | if __name__ == '__main__':
34 |     main()
35 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4==4.9.0
2 | certifi==2020.4.5.1
3 | chardet==3.0.4
4 | et-xmlfile==1.0.1
5 | idna==2.9
6 | jdcal==1.4.1
7 | lxml==4.5.0
8 | numpy==1.19.0
9 | openpyxl==3.0.4
10 | pandas==1.0.5
11 | python-dateutil==2.8.1
12 | pytz==2020.1
13 | requests==2.23.0
14 | six==1.15.0
15 | soupsieve==2.0
16 | urllib3==1.25.9
17 | 
--------------------------------------------------------------------------------