├── .flake8 ├── .gitignore ├── LICENSE ├── README.md ├── facebook_scraper ├── __init__.py ├── __main__.py ├── constants.py ├── exceptions.py ├── extractors.py ├── facebook_scraper.py ├── fb_types.py ├── internal_classes.py ├── page_iterators.py └── utils.py ├── poetry.lock ├── pyproject.toml ├── requirements-dev.txt ├── requirements.txt └── tests ├── cassettes ├── TestGetGroupPosts.test_get_group_posts.yaml ├── TestGetGroupPosts.test_smoketest.yaml ├── TestGetPhotos.test_smoketest.yaml ├── TestGetPosts.test_get_posts.yaml ├── TestGetPosts.test_get_posts_fields_presence.yaml ├── TestGetPosts.test_get_posts_with_extra_info.yaml ├── TestGetPosts.test_get_posts_with_extra_info_fields_presence.yaml └── TestGetPosts.test_smoketest.yaml ├── manualTEst.py ├── test_get_posts.py ├── test_parse_date.py └── test_parse_duration.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 98 3 | ignore = 4 | E501 5 | W503 6 | per-file-ignores = 7 | utils.py:F401 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # IPython 79 | profile_default/ 80 | ipython_config.py 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # celery beat schedule file 86 | celerybeat-schedule 87 | 88 | # SageMath parsed files 89 | *.sage.py 90 | 91 | # Environments 92 | .env 93 | .venv 94 | env/ 95 | venv/ 96 | ENV/ 97 | env.bak/ 98 | venv.bak/ 99 | 100 | # PyCharm 101 | .idea 102 | 103 | # Spyder project settings 104 | .spyderproject 105 | .spyproject 106 | 107 | # Rope project settings 108 | .ropeproject 109 | 110 | # mkdocs documentation 111 | /site 112 | 113 | # mypy 114 | .mypy_cache/ 115 | .dmypy.json 116 | dmypy.json 117 | 118 | # Pyre type checker 119 | .pyre/ 120 | 121 | # vim 122 | *~ 123 | *.swp 124 | *.swo 125 | 126 | # VSCode 127 | .vscode 128 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Kevin Zúñiga 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Facebook Scraper 2 | 3 | [![PyPI download month](https://img.shields.io/pypi/dm/facebook-scraper.svg)](https://pypi.python.org/pypi/facebook-scraper/) 4 | [![PyPI download week](https://img.shields.io/pypi/dw/facebook-scraper.svg)](https://pypi.python.org/pypi/facebook-scraper/) 5 | [![PyPI download day](https://img.shields.io/pypi/dd/facebook-scraper.svg)](https://pypi.python.org/pypi/facebook-scraper/) 6 | 7 | [![PyPI version](https://img.shields.io/pypi/v/facebook-scraper?color=blue)](https://pypi.python.org/pypi/facebook-scraper/) 8 | [![PyPI pyversions](https://img.shields.io/pypi/pyversions/facebook-scraper.svg)](https://pypi.python.org/pypi/facebook-scraper/) 9 | [![GitHub commits since tagged version](https://img.shields.io/github/commits-since/kevinzg/facebook-scraper/v0.2.59)](https://github.com/kevinzg/facebook-scraper/commits/) 10 | 11 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) 12 | 13 | Scrape Facebook public pages without an API key. Inspired by [twitter-scraper](https://github.com/kennethreitz/twitter-scraper). 14 | 15 | 16 | ## Contributions 17 | We are moving a bit slowly on updates, so if you want to help, please check the [TODO](#to-do) section below. 18 | 19 | 20 | 21 | ## Install 22 | 23 | To install the latest release from PyPI (original version): 24 | 25 | ```sh 26 | pip install facebook-scraper 27 | ``` 28 | 29 | Or, to install the latest master branch of this repository: 30 | 31 | ```sh 32 | pip install git+https://github.com/moda20/facebook-scraper.git@master 33 | ``` 34 | 35 | Or, to force-update your installation after the branch has been updated: 36 | 37 | ```sh 38 | pip install --force-reinstall --no-deps git+https://github.com/moda20/facebook-scraper.git@master 39 | ``` 40 | 41 | And to add it to your requirements.txt manually: 42 | 43 | ``` 44 | facebook-scraper @ git+https://github.com/moda20/facebook-scraper.git@master 45 | ``` 46 | 47 | ## Usage 48 | 49 | To get everything running correctly, follow these steps: 50 | 51 | 1. Send the unique **page name, profile name, or ID** as the first parameter 2. Specify `base_url` and `start_url` to use the mbasic site instead 3. Load the mbasic headers you want to use from a file and inject them into the scraper. 54 | **You can get these headers by opening an example page in the developer tools and selecting a high-end device (such as a Samsung S20 Ultra). 55 | This will help with getting newer versions of posts and higher-fidelity images.** 56 | ```python 57 | >>> from facebook_scraper import get_posts, _scraper 58 | >>> import json 59 | >>> 60 | >>> with open('./mbasicHeaders.json', 'r') as file: 61 | ...     _scraper.mbasic_headers = json.load(file) 62 | ... 63 | >>> for post in get_posts('NintendoAmerica', base_url="https://mbasic.facebook.com", start_url="https://mbasic.facebook.com/NintendoAmerica?v=timeline", pages=1): 64 | ...     print(post['text'][:50]) 65 | ... 66 | The final step on the road to the Super Smash Bros 67 | We’re headed to PAX East 3/28-3/31 with new games 68 | ``` 69 | 70 | ### Optional parameters 71 | 72 | *(For the `get_posts` function)*. 73 | 74 | * **group**: group id, to scrape groups instead of pages. Default is `None`. 75 | * **pages**: how many pages of posts to request; the first 2 pages may have no results, so try a number greater than 2. Default is 10.
76 | * **timeout**: how many seconds to wait before timing out. Default is 30. 77 | * **credentials**: tuple of user and password to login before requesting the posts. Default is `None`. 78 | * **extra_info**: bool; if True, the function will try to do an extra request to get the post reactions. Default is False. 79 | * **youtube_dl**: bool, use Youtube-DL for (high-quality) video extraction. You need to have youtube-dl installed in your environment. Default is False. 80 | * **post_urls**: list, URLs or post IDs to extract posts from. Alternative to fetching based on username. 81 | * **cookies**: One of: 82 | - The path to a file containing cookies in Netscape or JSON format. You can extract cookies from your browser after logging into Facebook with an extension like [Get cookies.txt LOCALLY](https://chrome.google.com/webstore/detail/get-cookiestxt-locally/cclelndahbckbenkjhflpdbgdldlbecc) or [Cookie Quick Manager (Firefox)](https://addons.mozilla.org/en-US/firefox/addon/cookie-quick-manager/). Make sure that you include both the `c_user` cookie and the `xs` cookie; you will get an `InvalidCookies` exception if you don't. 83 | - A [CookieJar](https://docs.python.org/3.9/library/http.cookiejar.html#http.cookiejar.CookieJar) 84 | - A dictionary that can be converted to a CookieJar with [cookiejar_from_dict](https://2.python-requests.org/en/master/api/#requests.cookies.cookiejar_from_dict) 85 | - The string `"from_browser"` to try to extract Facebook cookies from your browser 86 | * **options**: Dictionary of options. 87 | * Set `options={"comments": True}` to extract comments. 88 | * Set `options={"reactors": True}` to extract the people reacting to the post. 89 | * Set `options={"reactions": True}` to extract the reactions of the post. Similar to `reactors`, but only extracts reactions and not the people who reacted, making only one request per post. 90 | * Both `comments` and `reactors` can also be set to a number to limit the number of comments/reactors to retrieve. 91 | * Set `options={"progress": True}` to get a `tqdm` progress bar while extracting comments and replies. 92 | * Set `options={"allow_extra_requests": False}` to disable making extra requests when extracting post data (required for some things like full text and image links). 93 | * Set `options={"posts_per_page": 200}` to request 200 posts per page. The default is 4. 94 | * Set `options={"image_hop_timeout": 2}` to delay image cycling by n seconds; this is useful to avoid pinging Facebook too frequently. 95 | * Set `options={"HQ_images_max_count": 2}` to limit the maximum number of returned images. 96 | * Set `options={"whitelist_methods": []}` to extract only specific sections of a post; this is useful to avoid spending requests on data you don't need. The available methods are listed below, followed by a usage sketch. 97 | 98 | | method name | description | 99 | |---------------------------|---------------------------------------------------------------------------| 100 | | extract_post_url | will try to extract the unique post URL | 101 | | extract_post_id | will try to extract the unique post_id | 102 | | extract_text | will try to extract the post's text and full text if needed | 103 | | extract_time | will try to extract the post's publishing timestamp | 104 | | extract_photo_link | will try to extract the post's photos, including HQ photos | 105 | | extract_image_lq* | will try to extract low quality images for posts | 106 | | extract_comments | will try to extract comments of a post, if enabled in options | 107 | | extract_shares | will try to extract shares of a post, if enabled in options | 108 | | extract_links | will try to extract links of a post | 109 | | extract_user_id | will try to extract the posting user's id, which can differ from page_id | 110 | | extract_username | will try to extract the poster's username | 111 | | extract_video | will try to extract the video link of a post | 112 | | extract_video_thumbnail | will try to extract the video thumbnail of a post | 113 | | extract_video_id | will try to extract the video's id from a post | 114 | | extract_video_meta | will try to extract the metadata of a video from a post | 115 | | extract_is_live | will try to extract whether a post's video was live or not | 116 | | extract_factcheck | will try to extract whether a post is fact checked or not | 117 | | extract_share_information | will try to extract sharing info (count) from a post | 118 | | extract_availability | will try to extract whether a post is available or not (in case of a 404) | 119 | | extract_listing | will try to extract a marketplace listing if found | 120 | | extract_with | will try to extract tagged accounts in a post ("user is with xxxxx") | 121 |
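The snippet below is a minimal sketch of how these parameters can be combined; the page name, cookie file, and option values are illustrative placeholders rather than recommended settings, and only options documented above are used.

```python
from facebook_scraper import get_posts

# Hypothetical example: scrape a public page with cookies and a custom options dictionary.
for post in get_posts(
    'NintendoAmerica',              # placeholder page name
    cookies='cookies.txt',          # Netscape/JSON file, CookieJar, dict, or "from_browser"
    pages=3,                        # number of pages to request
    options={
        "comments": 10,             # fetch at most 10 comments per post
        "reactors": True,           # also fetch the people who reacted
        "posts_per_page": 20,       # ask for 20 posts per page instead of the default 4
        "whitelist_methods": [      # only run these extraction methods
            "extract_post_id",
            "extract_text",
            "extract_time",
        ],
    },
):
    print(post['post_id'], post['text'][:50])
```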
122 | 123 | 124 | 125 | ## CLI usage 126 | 127 | ```sh 128 | $ facebook-scraper --filename nintendo_page_posts.csv --pages 10 nintendo 129 | ``` 130 | 131 | Run `facebook-scraper --help` for more details on CLI usage. 132 | 133 | **Note:** If you get a `UnicodeEncodeError`, try adding `--encoding utf-8`. 134 | 135 | ### Practical example: download comments of a post 136 | 137 | ```python 138 | """ 139 | Download comments for a public Facebook post. 140 | """ 141 | 142 | import facebook_scraper as fs 143 | 144 | # get POST_ID from the URL of the post, which can have the following structure: 145 | # https://mbasic.facebook.com/USER/posts/POST_ID 146 | # https://mbasic.facebook.com/groups/GROUP_ID/posts/POST_ID 147 | POST_ID = "https://mbasic.facebook.com/USER/posts/POST_ID" 148 | 149 | # number of comments to download -- set this to True to download all comments 150 | MAX_COMMENTS = 100 151 | 152 | # get the post (this gives a generator) 153 | gen = fs.get_posts( 154 | post_urls=[POST_ID], 155 | options={"comments": MAX_COMMENTS, "progress": True} 156 | ) 157 | 158 | # take the 1st element of the generator, which is the post we requested 159 | post = next(gen) 160 | 161 | # extract the comments part 162 | comments = post['comments_full'] 163 | 164 | # process comments as you want... 165 | for comment in comments: 166 | 167 | # e.g. ...print them 168 | print(comment) 169 | 170 | # e.g.
...get the replies for them 171 | for reply in comment['replies']: 172 | print(' ', reply) 173 | ``` 174 | 175 | ## Post example 176 | 177 | ```python 178 | {'available': True, 179 | 'comments': 459, 180 | 'comments_full': None, 181 | 'factcheck': None, 182 | 'fetched_time': datetime.datetime(2021, 4, 20, 13, 39, 53, 651417), 183 | 'image': 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-9/fr/cp0/e15/q65/58745049_2257182057699568_1761478225390731264_n.jpg?_nc_cat=111&ccb=1-3&_nc_sid=8024bb&_nc_ohc=ygH2fPmfQpAAX92ABYY&_nc_ht=scontent.fhlz2-1.fna&tp=14&oh=7a8a7b4904deb55ec696ae255fff97dd&oe=60A36717', 184 | 'images': ['https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-9/fr/cp0/e15/q65/58745049_2257182057699568_1761478225390731264_n.jpg?_nc_cat=111&ccb=1-3&_nc_sid=8024bb&_nc_ohc=ygH2fPmfQpAAX92ABYY&_nc_ht=scontent.fhlz2-1.fna&tp=14&oh=7a8a7b4904deb55ec696ae255fff97dd&oe=60A36717'], 185 | 'is_live': False, 186 | 'likes': 3509, 187 | 'link': 'https://www.nintendo.com/amiibo/line-up/', 188 | 'post_id': '2257188721032235', 189 | 'post_text': 'Don’t let this diminutive version of the Hero of Time fool you, ' 190 | 'Young Link is just as heroic as his fully grown version! Young ' 191 | 'Link joins the Super Smash Bros. series of amiibo figures!\n' 192 | '\n' 193 | 'https://www.nintendo.com/amiibo/line-up/', 194 | 'post_url': 'https://facebook.com/story.php?story_fbid=2257188721032235&id=119240841493711', 195 | 'reactions': {'haha': 22, 'like': 2657, 'love': 706, 'sorry': 1, 'wow': 123}, # if `extra_info` was set 196 | 'reactors': None, 197 | 'shared_post_id': None, 198 | 'shared_post_url': None, 199 | 'shared_text': '', 200 | 'shared_time': None, 201 | 'shared_user_id': None, 202 | 'shared_username': None, 203 | 'shares': 441, 204 | 'text': 'Don’t let this diminutive version of the Hero of Time fool you, ' 205 | 'Young Link is just as heroic as his fully grown version! Young Link ' 206 | 'joins the Super Smash Bros. series of amiibo figures!\n' 207 | '\n' 208 | 'https://www.nintendo.com/amiibo/line-up/', 209 | 'time': datetime.datetime(2019, 4, 30, 5, 0, 1), 210 | 'full_text':'Don’t let this diminutive version of the Hero of Time fool you, ' 211 | 'Young Link is just as heroic as his fully grown version! Young Link ' 212 | 'joins the Super Smash Bros. series of amiibo figures!', # !! This will only be present if the post_text and text is truncated 213 | 'user_id': '119240841493711', 214 | 'username': 'Nintendo', 215 | 'video': None, 216 | 'video_id': None, 217 | 'video_thumbnail': None, 218 | 'w3_fb_url': 'https://www.facebook.com/Nintendo/posts/2257188721032235'} 219 | ``` 220 | 221 | ### Notes 222 | 223 | - There is no guarantee that every field will be extracted (they might be `None`). 224 | - Group posts may be missing some fields like `time` and `post_url`. 225 | - Group scraping may return only one page and not work on private groups. 226 | - If you scrape too much, Facebook might temporarily ban your IP. 227 | - The vast majority of unique IDs on facebook (post IDs, video IDs, photo IDs, comment IDs, profile IDs, etc) can be appended to "https://www.facebook.com/" to result in a redirect to the corresponding object. 228 | - Some functions (such as extracting reactions) require you to be logged into Facebook (pass cookies). If something isn't working as expected, try pass cookies and see if that fixes it. 
229 | - Reaction Categories (EN): [`like`, `love`, `haha`, `sorry`, `wow`, `angry`, `care`] 230 | 231 | ## Comment & Reply example 232 | ```python 233 | {'comment_id': '1417925635669547', 234 | 'comment_url': 'https://facebook.com/1417925635669547', 235 | 'commenter_id': '100009665948953', 236 | 'commenter_url': 'https://facebook.com/tw0311?eav=AfZuEAOAat6KRX5WFplL0SNA4ZW78Z2O7W_sjwMApq67hZxXDwXh2WF2ezhICX1LCT4&fref=nf&rc=p&refid=52&__tn__=R&paipv=0', 237 | 'commenter_name': 'someone', 238 | 'commenter_meta': None, 239 | 'comment_text': 'something', 240 | 'comment_time': datetime.datetime(2023, 6, 23, 0, 0), 241 | 'comment_image': 'https://scontent.ftpe8-2.fna.fbcdn.net/m1/v/t6/An_UvxJXg9tdnLU3Y5qjPi0200MLilhzPXUgxzGjQzUMaNcmjdZA6anyrngvkdub33NZzZhd51fpCAEzNHFhko5aKRFP5fS1w_lKwYrzcNLupv27.png?ccb=10-5&oh=00_AfCdlpCwAg-SHhniMQ16uElFHh-OG8kGGmLAzvOY5_WZgw&oe=64BE3279&_nc_sid=7da55a', 242 | 'comment_reactors': [ 243 | {'name': 'Tom', 'link': 'https://facebook.com/ryan.dwayne?eav=AfaxdKIITTXyZj4H-eanXQgoxzOa8Vag6XkGXXDisGzh_W74RYZSXxlFZBofR4jUIOg&fref=pb&paipv=0', 'type': 'like'}, 244 | {'name': 'Macy', 'link': 'https://facebook.com/profile.php?id=100000112053053&eav=AfZ5iWlNN-EjjSwVNQl7E2HiVp25AUZMqfoPvLRZGnbUAQxuLeN8nl6xnnQTJB3uxDM&fref=pb&paipv=0', 'type': 'like'}], 245 | 'comment_reactions': {'like': 2}, 246 | 'comment_reaction_count': 2, 247 | 'replies': [ 248 | {'comment_id': '793761608817229', 249 | 'comment_url': 'https://facebook.com/793761608817229', 250 | 'commenter_id': '100022377272712', 251 | 'commenter_url': 'https://facebook.com/brizanne.torres?eav=Afab9uP4ByIMn1xaYK0UDd1SRU8e5Zu7faKEx6qTzLKD2vp_bB1xLDGvTwEd6u8A7jY&fref=nf&rc=p&__tn__=R&paipv=0', 252 | 'commenter_name': 'David', 253 | 'commenter_meta': None, 254 | 'comment_text': 'something', 255 | 'comment_time': datetime.datetime(2023, 6, 23, 18, 0), 256 | 'comment_image': None, 257 | 'comment_reactors': [], 258 | 'comment_reactions': {'love': 2}, 259 | 'comment_reaction_count': None} 260 | ] 261 | } 262 | ``` 263 | 264 | 265 | ## Profiles 266 | 267 | The `get_profile` function can extract information from a profile's about section. Pass in the account name or ID as the first parameter. 268 | Note that Facebook serves different information depending on whether you're logged in (cookies parameter), such as Date of birth and Gender. Usage: 269 | 270 | ```python 271 | from facebook_scraper import get_profile 272 | get_profile("zuck") # Or get_profile("zuck", cookies="cookies.txt") 273 | ``` 274 | 275 | Outputs: 276 | 277 | ```python 278 | {'About': "I'm trying to make the world a more open place.", 279 | 'Education': 'Harvard University\n' 280 | 'Computer Science and Psychology\n' 281 | '30 August 2002 - 30 April 2004\n' 282 | 'Phillips Exeter Academy\n' 283 | 'Classics\n' 284 | 'School year 2002\n' 285 | 'Ardsley High School\n' 286 | 'High School\n' 287 | 'September 1998 - June 2000', 288 | 'Favourite Quotes': '"Fortune favors the bold."\n' 289 | '- Virgil, Aeneid X.284\n' 290 | '\n' 291 | '"All children are artists. 
The problem is how to remain ' 292 | 'an artist once you grow up."\n' 293 | '- Pablo Picasso\n' 294 | '\n' 295 | '"Make things as simple as possible but no simpler."\n' 296 | '- Albert Einstein', 297 | 'Name': 'Mark Zuckerberg', 298 | 'Places lived': [{'link': '/profile.php?id=104022926303756&refid=17', 299 | 'text': 'Palo Alto, California', 300 | 'type': 'Current town/city'}, 301 | {'link': '/profile.php?id=105506396148790&refid=17', 302 | 'text': 'Dobbs Ferry, New York', 303 | 'type': 'Home town'}], 304 | 'Work': 'Chan Zuckerberg Initiative\n' 305 | '1 December 2015 - Present\n' 306 | 'Facebook\n' 307 | 'Founder and CEO\n' 308 | '4 February 2004 - Present\n' 309 | 'Palo Alto, California\n' 310 | 'Bringing the world closer together.'} 311 | ``` 312 | 313 | To extract friends, pass the argument `friends=True`, or to limit the amount of friends retrieved, set `friends` to the desired number. 314 | 315 | ## Group info 316 | 317 | The `get_group_info` function can extract info about a group. Pass in the group name or ID as the first parameter. 318 | Note that in order to see the list of admins, you need to be logged in (cookies parameter). 319 | 320 | Usage: 321 | 322 | ```python 323 | from facebook_scraper import get_group_info 324 | get_group_info("makeupartistsgroup") # or get_group_info("makeupartistsgroup", cookies="cookies.txt") 325 | ``` 326 | 327 | Output: 328 | 329 | ```python 330 | {'admins': [{'link': '/africanstylemagazinecom/?refid=18', 331 | 'name': 'African Style Magazine'}, 332 | {'link': '/connectfluencer/?refid=18', 333 | 'name': 'Everythingbrightandbeautiful'}, 334 | {'link': '/Kaakakigroup/?refid=18', 'name': 'Kaakaki Group'}, 335 | {'link': '/opentohelp/?refid=18', 'name': 'Open to Help'}], 336 | 'id': '579169815767106', 337 | 'members': 6814229, 338 | 'name': 'HAIRSTYLES', 339 | 'type': 'Public group'} 340 | ``` 341 | 342 | ## Write to a CSV file directly 343 | 344 | The library also provides a `write_posts_to_csv()` function that writes posts directly to the disk and is able to resume scraping from the address of the last page. It is very useful when scraping large pages as the data is saved continuously and scraping can be resumed in case of an error. Here is an example to fetch the posts of a group 100 pages at a time and save them in separate files. 345 | 346 | ```python 347 | import facebook_scraper as fs 348 | 349 | # Saves the first 100 pages 350 | for i in range(1, 101): 351 | fs.write_posts_to_csv( 352 | group=GROUP_ID, # The method uses get_posts internally so you can use the same arguments and they will be passed along 353 | page_limit=100, 354 | timeout=60, 355 | options={ 356 | 'allow_extra_requests': False 357 | }, 358 | filename=f'./data/messages_{i}.csv', # Will throw an error if the file already exists 359 | resume_file='next_page.txt', # Will save a link to the next page in this file after fetching it and use it when starting. 
360 | matching='.+', # A regex can be used to filter all the posts matching a certain pattern (here, we accept anything) 361 | not_matching='^Warning', # And likewise those that don't fit a pattern (here, we filter out all posts starting with "Warning") 362 | keys=[ 363 | 'post_id', 364 | 'text', 365 | 'timestamp', 366 | 'time', 367 | 'user_id' 368 | ], # List of the keys that should be saved for each post, will save all keys if not set 369 | format='csv', # Output file format, can be csv or json, defaults to csv 370 | days_limit=3650 # Number of days for the oldest post to fetch, defaults to 3650 371 | ) 372 | 373 | ``` 374 | 375 | 376 | ## Funny Graphics 377 | 378 | [![Star History Chart](https://api.star-history.com/svg?repos=moda20/facebook-scraper&type=Date)](https://star-history.com/#moda20/facebook-scraper&Date) 379 | ## To-Do 380 | 381 | - CLI update to work with the latest script updates (NEEDS HELP) 382 | - Async support 383 | - ~~Image galleries~~ (`images` entry) 384 | - ~~Profiles or post authors~~ (`get_profile()`) 385 | - ~~Comments~~ (with `options={'comments': True}`) 386 | 387 | ## Alternatives and related projects 388 | 389 | - [facebook-post-scraper](https://github.com/brutalsavage/facebook-post-scraper). Has comments. Uses Selenium. 390 | - [facebook-scraper-selenium](https://github.com/apurvmishra99/facebook-scraper-selenium). "Scrape posts from any group or user into a .csv file without needing to register for any API access". 391 | - [Ultimate Facebook Scraper](https://github.com/harismuneer/Ultimate-Facebook-Scraper). "Scrapes almost everything about a Facebook user's profile". Uses Selenium. 392 | - [Unofficial APIs](https://github.com/Rolstenhouse/unofficial-apis). List of unofficial APIs for various services, none for Facebook for now, but might be worth to check in the future. 393 | - [major-scrapy-spiders](https://github.com/talhashraf/major-scrapy-spiders). Has a profile spider for Scrapy. 394 | - [facebook-page-post-scraper](https://github.com/minimaxir/facebook-page-post-scraper). Seems abandoned. 395 | - [FBLYZE](https://github.com/isaacmg/fb_scraper). Fork (?). 396 | - [RSSHub](https://github.com/DIYgod/RSSHub/blob/master/lib/routes/facebook/page.js). Generates an RSS feed from Facebook pages. 397 | - [RSS-Bridge](https://github.com/RSS-Bridge/rss-bridge/blob/master/bridges/FacebookBridge.php). Also generates RSS feeds from Facebook pages. 398 | -------------------------------------------------------------------------------- /facebook_scraper/__init__.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | import locale 4 | import logging 5 | import pathlib 6 | import sys 7 | import warnings 8 | import pickle 9 | from typing import Any, Dict, Iterator, Optional, Set, Union 10 | 11 | from requests.cookies import cookiejar_from_dict 12 | 13 | from .constants import DEFAULT_REQUESTS_TIMEOUT, DEFAULT_COOKIES_FILE_PATH 14 | from .facebook_scraper import FacebookScraper 15 | from .fb_types import Credentials, Post, RawPost, Profile 16 | from .utils import html_element_to_string, parse_cookie_file 17 | from . 
import exceptions 18 | import traceback 19 | import time 20 | from datetime import datetime, timedelta 21 | import re 22 | import os 23 | 24 | 25 | _scraper = FacebookScraper() 26 | 27 | 28 | def set_cookies(cookies): 29 | if isinstance(cookies, str): 30 | if cookies == "from_browser": 31 | try: 32 | import browser_cookie3 33 | 34 | cookies = browser_cookie3.load(domain_name='.facebook.com') 35 | except: 36 | raise ModuleNotFoundError( 37 | "browser_cookie3 must be installed to use browser cookies" 38 | ) 39 | else: 40 | try: 41 | cookies = parse_cookie_file(cookies) 42 | except ValueError as e: 43 | raise exceptions.InvalidCookies(f"Cookies are in an invalid format: {e}") 44 | elif isinstance(cookies, dict): 45 | cookies = cookiejar_from_dict(cookies) 46 | if cookies is not None: 47 | cookie_names = [c.name for c in cookies] 48 | missing_cookies = [c for c in ['c_user', 'xs'] if c not in cookie_names] 49 | if missing_cookies: 50 | raise exceptions.InvalidCookies(f"Missing cookies with name(s): {missing_cookies}") 51 | _scraper.session.cookies.update(cookies) 52 | if not _scraper.is_logged_in(): 53 | raise exceptions.InvalidCookies(f"Cookies are not valid") 54 | 55 | 56 | def unset_cookies(): 57 | # Explicitly unset cookies to return to unauthenticated requests 58 | _scraper.session.cookies = cookiejar_from_dict({}) 59 | 60 | 61 | def set_proxy(proxy, verify=True): 62 | _scraper.set_proxy(proxy, verify) 63 | 64 | 65 | def set_user_agent(user_agent): 66 | _scraper.set_user_agent(user_agent) 67 | 68 | 69 | def set_noscript(noscript): 70 | _scraper.set_noscript(noscript) 71 | 72 | 73 | def get_profile( 74 | account: str, 75 | **kwargs, 76 | ) -> Profile: 77 | """Get a Facebook user's profile information 78 | Args: 79 | account(str): The account of the profile. 80 | cookies (Union[dict, CookieJar, str]): Cookie jar to use. 81 | Can also be a filename to load the cookies from a file (Netscape format). 82 | """ 83 | _scraper.requests_kwargs['timeout'] = kwargs.pop('timeout', DEFAULT_REQUESTS_TIMEOUT) 84 | cookies = kwargs.pop('cookies', None) 85 | set_cookies(cookies) 86 | return _scraper.get_profile(account, **kwargs) 87 | 88 | 89 | def get_reactors( 90 | post_id: Union[str, int], 91 | **kwargs, 92 | ) -> Iterator[dict]: 93 | """Get reactors for a given post ID 94 | Args: 95 | post_id(str): The post ID, as returned from get_posts 96 | cookies (Union[dict, CookieJar, str]): Cookie jar to use. 97 | Can also be a filename to load the cookies from a file (Netscape format). 98 | """ 99 | _scraper.requests_kwargs['timeout'] = kwargs.pop('timeout', DEFAULT_REQUESTS_TIMEOUT) 100 | cookies = kwargs.pop('cookies', None) 101 | set_cookies(cookies) 102 | return _scraper.get_reactors(post_id, **kwargs) 103 | 104 | 105 | def get_friends( 106 | account: str, 107 | **kwargs, 108 | ) -> Iterator[Profile]: 109 | """Get a Facebook user's friends 110 | Args: 111 | account(str): The account of the profile. 112 | cookies (Union[dict, CookieJar, str]): Cookie jar to use. 113 | Can also be a filename to load the cookies from a file (Netscape format). 114 | """ 115 | _scraper.requests_kwargs['timeout'] = kwargs.pop('timeout', DEFAULT_REQUESTS_TIMEOUT) 116 | cookies = kwargs.pop('cookies', None) 117 | set_cookies(cookies) 118 | return _scraper.get_friends(account, **kwargs) 119 | 120 | 121 | def get_page_info(account: str, **kwargs) -> Profile: 122 | """Get a page's information 123 | Args: 124 | account(str): The account of the profile. 125 | cookies (Union[dict, CookieJar, str]): Cookie jar to use. 
126 | Can also be a filename to load the cookies from a file (Netscape format). 127 | """ 128 | _scraper.requests_kwargs['timeout'] = kwargs.pop('timeout', DEFAULT_REQUESTS_TIMEOUT) 129 | cookies = kwargs.pop('cookies', None) 130 | set_cookies(cookies) 131 | return _scraper.get_page_info(account, **kwargs) 132 | 133 | 134 | def get_group_info(group: Union[str, int], **kwargs) -> Profile: 135 | """Get a group's profile information 136 | Args: 137 | group(str or int): The group name or ID 138 | cookies (Union[dict, CookieJar, str]): Cookie jar to use. 139 | Can also be a filename to load the cookies from a file (Netscape format). 140 | """ 141 | _scraper.requests_kwargs['timeout'] = kwargs.pop('timeout', DEFAULT_REQUESTS_TIMEOUT) 142 | cookies = kwargs.pop('cookies', None) 143 | set_cookies(cookies) 144 | return _scraper.get_group_info(group, **kwargs) 145 | 146 | 147 | def get_shop(account: str, **kwargs) -> Iterator[Post]: 148 | """Get a page's shop listings 149 | Args: 150 | account(str): The account of the profile. 151 | cookies (Union[dict, CookieJar, str]): Cookie jar to use. 152 | Can also be a filename to load the cookies from a file (Netscape format). 153 | """ 154 | _scraper.requests_kwargs['timeout'] = kwargs.pop('timeout', DEFAULT_REQUESTS_TIMEOUT) 155 | cookies = kwargs.pop('cookies', None) 156 | set_cookies(cookies) 157 | return _scraper.get_shop(account, **kwargs) 158 | 159 | 160 | def get_posts( 161 | account: Optional[str] = None, 162 | group: Union[str, int, None] = None, 163 | post_urls: Optional[Iterator[str]] = None, 164 | hashtag: Optional[str] = None, 165 | credentials: Optional[Credentials] = None, 166 | **kwargs, 167 | ) -> Iterator[Post]: 168 | """Get posts from a Facebook page or group. 169 | 170 | Args: 171 | account (str): The account of the page. 172 | group (int): The group id. 173 | post_urls ([str]): List of manually specified post URLs. 174 | credentials (Optional[Tuple[str, str]]): Tuple of email and password to login before scraping. 175 | timeout (int): Timeout for requests. 176 | page_limit (int): How many pages of posts to go through. 177 | Use None to try to get all of them. 178 | extra_info (bool): Set to True to try to get reactions. 179 | youtube_dl (bool): Use Youtube-DL for video extraction. 180 | cookies (Union[dict, CookieJar, str]): Cookie jar to use. 181 | Can also be a filename to load the cookies from a file (Netscape format). 182 | 183 | Yields: 184 | dict: The post representation in a dictionary. 
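        Example: a minimal sketch of typical usage (the page name 'nintendo' is just a placeholder):
            >>> for post in get_posts('nintendo', page_limit=1):
            ...     print(post['post_id'], post['text'][:50])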
185 | """ 186 | valid_args = sum(arg is not None for arg in (account, group, post_urls, hashtag)) 187 | 188 | if valid_args != 1: 189 | raise ValueError("You need to specify either account, group, or post_urls") 190 | 191 | _scraper.requests_kwargs['timeout'] = kwargs.pop('timeout', DEFAULT_REQUESTS_TIMEOUT) 192 | 193 | cookies = kwargs.pop('cookies', None) 194 | 195 | if cookies is not None and credentials is not None: 196 | raise ValueError("Can't use cookies and credentials arguments at the same time") 197 | set_cookies(cookies) 198 | 199 | options: Union[Dict[str, Any], Set[str]] = kwargs.setdefault('options', {}) 200 | if isinstance(options, set): 201 | warnings.warn("The options argument should be a dictionary.", stacklevel=2) 202 | options = {k: True for k in options} 203 | options.setdefault('account', account) 204 | 205 | # TODO: Add a better throttling mechanism 206 | if 'sleep' in kwargs: 207 | warnings.warn( 208 | "The sleep parameter has been removed, it won't have any effect.", stacklevel=2 209 | ) 210 | kwargs.pop('sleep') 211 | 212 | # TODO: Deprecate `pages` in favor of `page_limit` since it is less confusing 213 | if 'pages' in kwargs: 214 | kwargs['page_limit'] = kwargs.pop('pages') 215 | 216 | # TODO: Deprecate `extra_info` in favor of `options` 217 | if "reactions" not in options: 218 | options['reactions'] = kwargs.pop('extra_info', False) 219 | options['youtube_dl'] = kwargs.pop('youtube_dl', False) 220 | 221 | if credentials is not None: 222 | _scraper.login(*credentials) 223 | 224 | if account is not None: 225 | return _scraper.get_posts(account, **kwargs) 226 | 227 | elif group is not None: 228 | return _scraper.get_group_posts(group, **kwargs) 229 | 230 | elif hashtag is not None: 231 | return _scraper.get_posts_by_hashtag(hashtag, **kwargs) 232 | 233 | elif post_urls is not None: 234 | return _scraper.get_posts_by_url(post_urls, **kwargs) 235 | 236 | raise ValueError('No account nor group') 237 | 238 | 239 | def get_photos( 240 | account: str, 241 | credentials: Optional[Credentials] = None, 242 | **kwargs, 243 | ) -> Iterator[Post]: 244 | """Get photo posts from a Facebook page. 245 | 246 | Args: 247 | account (str): The account of the page. 248 | credentials (Optional[Tuple[str, str]]): Tuple of email and password to login before scraping. 249 | timeout (int): Timeout for requests. 250 | page_limit (int): How many pages of posts to go through. 251 | Use None to try to get all of them. 252 | extra_info (bool): Set to True to try to get reactions. 253 | youtube_dl (bool): Use Youtube-DL for video extraction. 254 | cookies (Union[dict, CookieJar, str]): Cookie jar to use. 255 | Can also be a filename to load the cookies from a file (Netscape format). 256 | 257 | Yields: 258 | dict: The post representation in a dictionary. 
259 | """ 260 | if account is None: 261 | raise ValueError("You need to specify account") 262 | 263 | _scraper.requests_kwargs['timeout'] = kwargs.pop('timeout', DEFAULT_REQUESTS_TIMEOUT) 264 | 265 | cookies = kwargs.pop('cookies', None) 266 | 267 | if cookies is not None and credentials is not None: 268 | raise ValueError("Can't use cookies and credentials arguments at the same time") 269 | set_cookies(cookies) 270 | 271 | options: Union[Dict[str, Any], Set[str]] = kwargs.setdefault('options', {}) 272 | if isinstance(options, set): 273 | warnings.warn("The options argument should be a dictionary.", stacklevel=2) 274 | options = {k: True for k in options} 275 | options.setdefault('account', account) 276 | 277 | # TODO: Add a better throttling mechanism 278 | if 'sleep' in kwargs: 279 | warnings.warn( 280 | "The sleep parameter has been removed, it won't have any effect.", stacklevel=2 281 | ) 282 | kwargs.pop('sleep') 283 | 284 | # TODO: Deprecate `pages` in favor of `page_limit` since it is less confusing 285 | if 'pages' in kwargs: 286 | kwargs['page_limit'] = kwargs.pop('pages') 287 | 288 | # TODO: Deprecate `extra_info` in favor of `options` 289 | options['reactions'] = kwargs.pop('extra_info', False) 290 | options['youtube_dl'] = kwargs.pop('youtube_dl', False) 291 | 292 | if credentials is not None: 293 | _scraper.login(*credentials) 294 | 295 | return _scraper.get_photos(account, **kwargs) 296 | 297 | 298 | def get_posts_by_search( 299 | word: str, 300 | credentials: Optional[Credentials] = None, 301 | **kwargs, 302 | ) -> Iterator[Post]: 303 | """Get posts by searching all of Facebook 304 | Args: 305 | word (str): The word for searching posts. 306 | credentials (Optional[Tuple[str, str]]): Tuple of email and password to login before scraping. 307 | timeout (int): Timeout for requests. 308 | page_limit (int): How many pages of posts to go through. 309 | Use None to try to get all of them. 310 | extra_info (bool): Set to True to try to get reactions. 311 | youtube_dl (bool): Use Youtube-DL for video extraction. 312 | cookies (Union[dict, CookieJar, str]): Cookie jar to use. 313 | Can also be a filename to load the cookies from a file (Netscape format). 314 | 315 | Yields: 316 | dict: The post representation in a dictionary. 
317 | """ 318 | if not word: 319 | raise ValueError("You need to specify word") 320 | 321 | _scraper.requests_kwargs['timeout'] = kwargs.pop('timeout', DEFAULT_REQUESTS_TIMEOUT) 322 | 323 | cookies = kwargs.pop('cookies', None) 324 | 325 | if cookies is not None and credentials is not None: 326 | raise ValueError("Can't use cookies and credentials arguments at the same time") 327 | set_cookies(cookies) 328 | 329 | options: Union[Dict[str, Any], Set[str]] = kwargs.setdefault('options', {}) 330 | if isinstance(options, set): 331 | warnings.warn("The options argument should be a dictionary.", stacklevel=2) 332 | options = {k: True for k in options} 333 | 334 | options.setdefault('word', word) 335 | 336 | # TODO: Add a better throttling mechanism 337 | if 'sleep' in kwargs: 338 | warnings.warn( 339 | "The sleep parameter has been removed, it won't have any effect.", stacklevel=2 340 | ) 341 | kwargs.pop('sleep') 342 | 343 | # TODO: Deprecate `pages` in favor of `page_limit` since it is less confusing 344 | if 'pages' in kwargs: 345 | kwargs['page_limit'] = kwargs.pop('pages') 346 | 347 | # TODO: Deprecate `extra_info` in favor of `options` 348 | if "reactions" not in options: 349 | options['reactions'] = kwargs.pop('extra_info', False) 350 | options['youtube_dl'] = kwargs.pop('youtube_dl', False) 351 | 352 | if credentials is not None: 353 | _scraper.login(*credentials) 354 | 355 | if word is not None: 356 | return _scraper.get_posts_by_search(word, **kwargs) 357 | 358 | raise ValueError('No account nor group') 359 | 360 | 361 | def write_post_to_disk(post: Post, source: RawPost, location: pathlib.Path): 362 | post_id = post['post_id'] 363 | filename = f'{post_id}.html' 364 | 365 | logger.debug("Writing post %s", post_id) 366 | with open(location.joinpath(filename), mode='wt') as f: 367 | f.write('\n') 370 | f.write(html_element_to_string(source, pretty=True)) 371 | 372 | 373 | def write_posts_to_csv( 374 | account: Optional[str] = None, 375 | group: Union[str, int, None] = None, 376 | filename: str = None, 377 | encoding: str = None, 378 | **kwargs, 379 | ): 380 | """Write posts from an account or group to a CSV or JSON file 381 | 382 | Args: 383 | account (str): Facebook account name e.g. "nike" or "nintendo" 384 | group (Union[str, int, None]): Facebook group id e.g. 676845025728409 385 | filename (str): Filename, defaults to _posts.csv 386 | encoding (str): Encoding for the output file, defaults to locale.getpreferredencoding() 387 | credentials (Optional[Tuple[str, str]]): Tuple of email and password to login before scraping. Defaults to scrape anonymously 388 | timeout (Optional[int]): Timeout for requests. 389 | page_limit (Optional[int]): How many pages of posts to go through. 390 | Use None to try to get all of them. 391 | extra_info (Optional[bool]): Set to True to try to get reactions. 392 | dump_location (Optional[pathlib.Path]): Location where to write the HTML source of the posts. 393 | """ 394 | dump_location = kwargs.pop('dump_location', None) # For dumping HTML to disk, for debugging 395 | if dump_location is not None: 396 | dump_location.mkdir(exist_ok=True) 397 | kwargs["remove_source"] = False 398 | 399 | # Set a default filename, based on the account name with the appropriate extension 400 | if filename is None: 401 | filename = str(account or group) + "_posts." 
+ kwargs.get("format") 402 | 403 | if encoding is None: 404 | encoding = locale.getpreferredencoding() 405 | 406 | if os.path.isfile(filename): 407 | raise FileExistsError(f"{filename} exists") 408 | 409 | if filename == "-": 410 | output_file = sys.stdout 411 | else: 412 | output_file = open(filename, 'w', newline='', encoding=encoding) 413 | 414 | first_post = True 415 | 416 | sleep = kwargs.pop("sleep", 0) 417 | 418 | days_limit = kwargs.get("days_limit", 3650) 419 | max_post_time = datetime.now() - timedelta(days=days_limit) 420 | 421 | start_url = None 422 | resume_file = kwargs.get("resume_file") 423 | if resume_file: 424 | try: 425 | with open(resume_file, "r") as f: 426 | existing_url = f.readline().strip() 427 | logger.debug("Existing URL:" + existing_url) 428 | if existing_url: 429 | start_url = existing_url 430 | except FileNotFoundError: 431 | pass 432 | 433 | def handle_pagination_url(url): 434 | if resume_file: 435 | with open(resume_file, "w") as f: 436 | f.write(url + "\n") 437 | 438 | keys = kwargs.get("keys") 439 | 440 | try: 441 | for post in get_posts( 442 | account=account, 443 | group=group, 444 | start_url=start_url, 445 | request_url_callback=handle_pagination_url, 446 | **kwargs, 447 | ): 448 | if dump_location is not None: 449 | source = post.pop('source') 450 | try: 451 | write_post_to_disk(post, source, dump_location) 452 | except Exception: 453 | logger.exception("Error writing post to disk") 454 | elif post.get("source"): 455 | post["source"] = post["source"].html 456 | if first_post: 457 | if kwargs.get("format") == "json": 458 | output_file.write("[\n") 459 | else: 460 | if not keys: 461 | keys = list(post.keys()) 462 | dict_writer = csv.DictWriter(output_file, keys, extrasaction='ignore') 463 | dict_writer.writeheader() 464 | else: 465 | if kwargs.get("format") == "json": 466 | output_file.write(",") 467 | match = None 468 | if post["text"]: 469 | match = re.search(kwargs.get("matching", '.+'), post["text"], flags=re.IGNORECASE) 470 | if kwargs.get("not_matching") and re.search( 471 | kwargs.get("not_matching"), post["text"], flags=re.IGNORECASE 472 | ): 473 | match = None 474 | if match: 475 | if kwargs.get("format") == "json": 476 | if keys: 477 | post = {k: v for k, v in post.items() if k in keys} 478 | json.dump(post, output_file, default=str, indent=4) 479 | else: 480 | dict_writer.writerow(post) 481 | if not first_post and post["time"] and post["time"] < max_post_time: 482 | logger.debug( 483 | f"Reached days_limit - {post['time']} is more than {days_limit} days old (older than {max_post_time})" 484 | ) 485 | break 486 | first_post = False 487 | time.sleep(sleep) 488 | except KeyboardInterrupt: 489 | pass 490 | except Exception as e: 491 | traceback.print_exc() 492 | 493 | if kwargs.get("format") == "json": 494 | output_file.write("\n]") 495 | if first_post: 496 | print("Couldn't get any posts.", file=sys.stderr) 497 | output_file.close() 498 | 499 | 500 | def get_groups_by_search( 501 | word: str, 502 | **kwargs, 503 | ): 504 | """Searches Facebook groups and yields ids for each result 505 | on the first page""" 506 | _scraper.requests_kwargs['timeout'] = kwargs.pop('timeout', DEFAULT_REQUESTS_TIMEOUT) 507 | cookies = kwargs.pop('cookies', None) 508 | set_cookies(cookies) 509 | return _scraper.get_groups_by_search(word, **kwargs) 510 | 511 | 512 | def enable_logging(level=logging.DEBUG): 513 | handler = logging.StreamHandler() 514 | handler.setLevel(level) 515 | 516 | logger.addHandler(handler) 517 | logger.setLevel(level) 518 | 519 | 520 | def 
use_persistent_session(email: str, password: str, cookies_file_path=DEFAULT_COOKIES_FILE_PATH): 521 | """Login persistently to Facebook and save cookies to a file (default: ".fb-cookies.pckl"). This is highly recommended if you want to scrape several times a day because it will keep your session alive instead of logging in every time (which can be flagged as suspicious by Facebook). 522 | 523 | Args: 524 | email (str): email address to login. 525 | password (str): password to login. 526 | cookies_file_path (str, optional): path to the file in which to save cookies. Defaults to ".fb-cookies.pckl". 527 | 528 | Raises: 529 | exceptions.InvalidCredentials: if the credentials are invalid. 530 | 531 | Returns: 532 | Boolean: True if the login was successful, False otherwise. 533 | """ 534 | try: 535 | with open(cookies_file_path, "rb") as f: 536 | cookies = pickle.load(f) 537 | logger.debug("Loaded cookies from %s", cookies_file_path) 538 | except FileNotFoundError: 539 | logger.error("No cookies file found at %s", cookies_file_path) 540 | cookies = None 541 | try: 542 | if not cookies: 543 | raise exceptions.InvalidCookies() 544 | set_cookies(cookies) 545 | logger.debug("Successfully logged in with cookies") 546 | except exceptions.InvalidCookies: 547 | logger.exception("Invalid cookies, trying to login with credentials") 548 | _scraper.login(email, password) 549 | cookies = _scraper.session.cookies 550 | with open(cookies_file_path, "wb") as f: 551 | pickle.dump(cookies, f) 552 | set_cookies(cookies) 553 | logger.debug("Successfully logged in with credentials") 554 | 555 | 556 | # Disable logging by default 557 | logger = logging.getLogger(__name__) 558 | logger.addHandler(logging.NullHandler()) 559 | -------------------------------------------------------------------------------- /facebook_scraper/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import pathlib 4 | import datetime 5 | import sys 6 | import locale 7 | import json 8 | import csv 9 | 10 | from . 
import enable_logging, write_posts_to_csv, get_profile 11 | 12 | 13 | def run(): 14 | """facebook-scraper entry point when used as a script""" 15 | parser = argparse.ArgumentParser( 16 | prog='facebook-scraper', 17 | description="Scrape Facebook public pages without an API key", 18 | ) 19 | parser.add_argument('account', type=str, help="Facebook account or group") 20 | parser.add_argument('-f', '--filename', type=str, help="Output filename") 21 | parser.add_argument('-p', '--pages', type=int, help="Number of pages to download", default=10) 22 | parser.add_argument( 23 | '-s', '--sleep', type=float, help="How long to sleep for between posts", default=0 24 | ) 25 | parser.add_argument( 26 | '-t', 27 | '--timeout', 28 | type=int, 29 | help="How long to wait in seconds for Facebook servers before aborting", 30 | default=10, 31 | ) 32 | parser.add_argument('-g', '--group', action='store_true', help="Use group scraper") 33 | parser.add_argument('-v', '--verbose', action='count', help="Enable logging", default=0) 34 | parser.add_argument('-c', '--cookies', type=str, help="Path to a cookies file") 35 | parser.add_argument('--comments', action='store_true', help="Extract comments") 36 | parser.add_argument('-r', '--reactions', action='store_true', help="Extract reactions") 37 | parser.add_argument('-rs', '--reactors', action='store_true', help="Extract reactors") 38 | parser.add_argument( 39 | '--dump', 40 | type=pathlib.Path, 41 | dest='dump_location', 42 | help="Location where to save the HTML source of the posts (useful for debugging)", 43 | default=None, 44 | ) 45 | parser.add_argument( 46 | '--encoding', 47 | action='store', 48 | help="Encoding for the output file", 49 | default=None, 50 | ) 51 | parser.add_argument( 52 | '-fmt', 53 | '--format', 54 | type=str.lower, 55 | choices=["csv", "json"], 56 | default="csv", 57 | help="What format to export as", 58 | ) 59 | parser.add_argument( 60 | '-d', 61 | '--days-limit', 62 | dest='days_limit', 63 | default=3650, 64 | type=int, 65 | help="Number of days to download", 66 | ) 67 | parser.add_argument( 68 | '-rf', 69 | '--resume-file', 70 | type=str, 71 | help="Filename to store the last pagination URL in, for resuming", 72 | ) 73 | parser.add_argument( 74 | '-ner', 75 | '--no-extra-requests', 76 | dest='allow_extra_requests', 77 | action='store_false', 78 | help="Disable making extra requests (for things like high quality image URLs)", 79 | ) 80 | parser.add_argument( 81 | '-k', 82 | '--keys', 83 | type=lambda s: s.split(sep=","), 84 | help="Comma separated list of which keys or columns to return. This lets you filter to just your desired outputs.", 85 | ) 86 | parser.add_argument( 87 | '-m', 88 | '--matching', 89 | type=str, 90 | default=".+", 91 | help='Filter to just posts matching regex expression', 92 | ) 93 | parser.add_argument( 94 | '-nm', 95 | '--not-matching', 96 | type=str, 97 | help='Filter to just posts not matching regex expression', 98 | ) 99 | parser.add_argument( 100 | '--extra-info ', 101 | dest='extra_info', 102 | action='store_true', 103 | help="Try to do an extra request to get the post reactions. Default is False", 104 | default=False, 105 | ) 106 | parser.add_argument( 107 | '--use-youtube-dl', 108 | dest='youtube_dl', 109 | action='store_true', 110 | help='Use Youtube-DL for (high-quality) video extraction. You need to have youtube-dl installed on your environment. 
Default is False.', 111 | default=False, 112 | ) 113 | parser.add_argument( 114 | '--profile', 115 | action='store_true', 116 | help="Extract an account's profile", 117 | default=False, 118 | ) 119 | parser.add_argument( 120 | '--friends', type=int, help='When extracting a profile, how many friends to extract' 121 | ) 122 | parser.add_argument( 123 | '-ppp', 124 | '--posts-per-page', 125 | dest='posts_per_page', 126 | default=4, 127 | type=int, 128 | help="Number of posts to fetch per page", 129 | ) 130 | parser.add_argument( 131 | '--source', 132 | action='store_true', 133 | help="Include HTML source", 134 | default=False, 135 | ) 136 | 137 | args = parser.parse_args() 138 | 139 | # Enable logging 140 | if args.verbose > 0: 141 | args.verbose = min(args.verbose, 3) 142 | level = {1: logging.WARNING, 2: logging.INFO, 3: logging.DEBUG}[args.verbose] 143 | enable_logging(level) 144 | 145 | if args.profile: 146 | # Set a default filename, based on the account name with the appropriate extension 147 | if args.filename is None: 148 | args.filename = str(args.account) + "_profile." + args.format 149 | 150 | if args.encoding is None: 151 | encoding = locale.getpreferredencoding() 152 | 153 | if args.filename == "-": 154 | output_file = sys.stdout 155 | else: 156 | output_file = open(args.filename, 'w', newline='', encoding=encoding) 157 | 158 | profile = get_profile(args.account, friends=args.friends, cookies=args.cookies) 159 | 160 | if args.format == "json": 161 | json.dump(profile, output_file, default=str, indent=4) 162 | else: 163 | dict_writer = csv.DictWriter(output_file, profile.keys()) 164 | dict_writer.writeheader() 165 | dict_writer.writerow(profile) 166 | output_file.close() 167 | else: 168 | # Choose the right argument to pass to write_posts_to_csv (group or account) 169 | account_type = 'group' if args.group else 'account' 170 | kwargs = { 171 | account_type: args.account, 172 | "format": args.format, 173 | "days_limit": args.days_limit, 174 | "resume_file": args.resume_file, 175 | "cookies": args.cookies, 176 | "timeout": args.timeout, 177 | "sleep": args.sleep, 178 | "keys": args.keys, 179 | "matching": args.matching, 180 | "not_matching": args.not_matching, 181 | "options": { 182 | "reactions": args.reactions, 183 | "reactors": args.reactors, 184 | "comments": args.comments, 185 | "allow_extra_requests": args.allow_extra_requests, 186 | "posts_per_page": args.posts_per_page, 187 | }, 188 | "youtube_dl": args.youtube_dl, 189 | "extra_info": args.extra_info, 190 | "remove_source": not args.source, 191 | } 192 | 193 | write_posts_to_csv( 194 | **kwargs, 195 | filename=args.filename, 196 | pages=args.pages, 197 | encoding=args.encoding, 198 | dump_location=args.dump_location, 199 | ) 200 | 201 | 202 | if __name__ == '__main__': 203 | run() 204 | -------------------------------------------------------------------------------- /facebook_scraper/constants.py: -------------------------------------------------------------------------------- 1 | FB_BASE_URL = 'https://facebook.com/' 2 | FB_W3_BASE_URL = 'https://www.facebook.com/' 3 | FB_MOBILE_BASE_URL = 'https://m.facebook.com/' 4 | FB_MBASIC_BASE_URL = 'https://mbasic.facebook.com/' 5 | 6 | DEFAULT_REQUESTS_TIMEOUT = 120 7 | DEFAULT_PAGE_LIMIT = 10 8 | 9 | DEFAULT_COOKIES_FILE_PATH = '.fb-cookies.pckl' 10 | -------------------------------------------------------------------------------- /facebook_scraper/exceptions.py: -------------------------------------------------------------------------------- 1 | class NotFound(Exception): 2 | 
'''Post, page or profile not found / doesn't exist / deleted''' 3 | 4 | pass 5 | 6 | 7 | class TemporarilyBanned(Exception): 8 | '''User account rate limited''' 9 | 10 | pass 11 | 12 | 13 | class AccountDisabled(Exception): 14 | '''User account disabled, with option to appeal''' 15 | 16 | pass 17 | 18 | 19 | class InvalidCookies(Exception): 20 | '''Cookies file passed but missing cookies''' 21 | 22 | pass 23 | 24 | 25 | class LoginRequired(Exception): 26 | '''Facebook requires a login to see this''' 27 | 28 | pass 29 | 30 | 31 | class LoginError(Exception): 32 | '''Failed to log in''' 33 | 34 | pass 35 | 36 | 37 | class UnexpectedResponse(Exception): 38 | '''Facebook served something weird''' 39 | 40 | pass 41 | -------------------------------------------------------------------------------- /facebook_scraper/facebook_scraper.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import logging 3 | from urllib.parse import urljoin 4 | import warnings 5 | import re 6 | from functools import partial 7 | from typing import Iterator, Union 8 | import json 9 | import demjson3 as demjson 10 | from urllib.parse import parse_qs, urlparse, unquote 11 | from datetime import datetime 12 | import os 13 | 14 | from bs4 import BeautifulSoup 15 | from requests import RequestException 16 | from requests_html import HTMLSession 17 | 18 | from . import utils 19 | from .constants import ( 20 | DEFAULT_PAGE_LIMIT, 21 | FB_BASE_URL, 22 | FB_MOBILE_BASE_URL, 23 | FB_W3_BASE_URL, 24 | FB_MBASIC_BASE_URL, 25 | ) 26 | from .extractors import ( 27 | extract_group_post, 28 | extract_post, 29 | extract_photo_post, 30 | extract_story_post, 31 | PostExtractor, 32 | extract_hashtag_post, 33 | ) 34 | from .fb_types import Post, Profile 35 | from .page_iterators import ( 36 | iter_group_pages, 37 | iter_pages, 38 | iter_photos, 39 | iter_search_pages, 40 | iter_hashtag_pages, PageParser, 41 | ) 42 | from . 
import exceptions 43 | 44 | 45 | logger = logging.getLogger(__name__) 46 | 47 | 48 | class FacebookScraper: 49 | """Class for creating FacebookScraper Iterators""" 50 | 51 | base_url = FB_MOBILE_BASE_URL 52 | default_headers = { 53 | "Accept": "*/*", 54 | "Connection": "keep-alive", 55 | "Accept-Encoding": "gzip,deflate", 56 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15", 57 | } 58 | have_checked_locale = False 59 | 60 | def __init__(self, session=None, requests_kwargs=None): 61 | if session is None: 62 | session = HTMLSession() 63 | session.headers.update(self.default_headers) 64 | 65 | if requests_kwargs is None: 66 | requests_kwargs = {} 67 | 68 | self.session = session 69 | self.requests_kwargs = requests_kwargs 70 | self.request_count = 0 71 | self.mbasic_headers = None 72 | 73 | def set_user_agent(self, user_agent): 74 | self.session.headers["User-Agent"] = user_agent 75 | 76 | def set_noscript(self, noscript): 77 | if noscript: 78 | self.session.cookies.set("noscript", "1") 79 | else: 80 | self.session.cookies.set("noscript", "0") 81 | 82 | def set_proxy(self, proxy, verify=True): 83 | self.requests_kwargs.update( 84 | {'proxies': {'http': proxy, 'https': proxy}, 'verify': verify} 85 | ) 86 | ip = self.get( 87 | "http://lumtest.com/myip.json", headers={"Accept": "application/json"} 88 | ).json() 89 | logger.debug(f"Proxy details: {ip}") 90 | 91 | def get_posts(self, account: str, **kwargs) -> Iterator[Post]: 92 | kwargs["scraper"] = self 93 | iter_pages_fn = partial(iter_pages, account=account, request_fn=self.get, **kwargs) 94 | return self._generic_get_posts(extract_post, iter_pages_fn, **kwargs) 95 | 96 | def get_reactors(self, post_id: int, **kwargs) -> Iterator[dict]: 97 | reaction_url = ( 98 | f'https://m.facebook.com/ufi/reaction/profile/browser/?ft_ent_identifier={post_id}' 99 | ) 100 | logger.debug(f"Fetching {reaction_url}") 101 | response = self.get(reaction_url) 102 | extractor = PostExtractor(response.html, kwargs, self.get, full_post_html=response.html, scraper=self) 103 | return extractor.extract_reactors(response) 104 | 105 | def get_photos(self, account: str, **kwargs) -> Iterator[Post]: 106 | kwargs["scraper"] = self 107 | iter_pages_fn = partial(iter_photos, account=account, request_fn=self.get, **kwargs) 108 | return self._generic_get_posts(extract_post, iter_pages_fn, **kwargs) 109 | 110 | def get_posts_by_hashtag(self, hashtag: str, **kwargs) -> Iterator[Post]: 111 | kwargs["scraper"] = self 112 | kwargs["base_url"] = FB_MBASIC_BASE_URL 113 | iter_pages_fn = partial( 114 | iter_hashtag_pages, hashtag=hashtag, request_fn=self.get, **kwargs 115 | ) 116 | return self._generic_get_posts(extract_hashtag_post, iter_pages_fn, **kwargs) 117 | 118 | def get_posts_by_url( 119 | self, post_urls, options={}, remove_source=True, **kwargs 120 | ) -> Iterator[Post]: 121 | kwargs["scraper"] = self 122 | if self.session.cookies.get("noscript") == "1": 123 | options["noscript"] = True 124 | for post_url in post_urls: 125 | url = str(post_url) 126 | if url.startswith(FB_BASE_URL): 127 | url = url.replace(FB_BASE_URL, FB_MBASIC_BASE_URL) 128 | if url.startswith(FB_W3_BASE_URL): 129 | url = url.replace(FB_W3_BASE_URL, FB_MBASIC_BASE_URL) 130 | if not url.startswith(FB_MOBILE_BASE_URL): 131 | url = utils.urljoin(FB_MBASIC_BASE_URL, url) 132 | 133 | post = {"original_request_url": post_url, "post_url": url} 134 | logger.debug(f"Requesting page from: {url}") 135 | response = self.get(url) 136 | 
options["response_url"] = response.url 137 | photo_post = False 138 | if "/stories/" in url or "/story/" in url: 139 | elem = response.html.find("#story_viewer_content", first=True) 140 | else: 141 | # top_level_post_id is not used anymore 142 | elem = response.html.find('[data-ft]', first=True) 143 | if not elem: 144 | elem = response.html.find('div.async_like', first=True) 145 | if response.html.find("div.msg", first=True): 146 | photo_post = True 147 | elem = response.html 148 | if not elem: 149 | logger.warning("No raw posts (
elements) were found in this page.") 150 | else: 151 | comments_area = response.html.find('div.ufi', first=True) 152 | if comments_area: 153 | # Makes likes/shares regexes work 154 | try: 155 | elem = utils.make_html_element( 156 | elem.html.replace("", comments_area.html + "") 157 | ) 158 | except ValueError as e: 159 | logger.debug(e) 160 | 161 | if photo_post: 162 | post.update( 163 | extract_photo_post( 164 | elem, 165 | request_fn=self.get, 166 | options=options, 167 | full_post_html=response.html, 168 | **kwargs, 169 | ) 170 | ) 171 | elif url.startswith(utils.urljoin(FB_MBASIC_BASE_URL, "/groups/")): 172 | post.update( 173 | extract_group_post( 174 | elem, 175 | request_fn=self.get, 176 | options=options, 177 | full_post_html=response.html, 178 | **kwargs, 179 | ) 180 | ) 181 | elif "/stories/" in url or "/story/" in url: 182 | post.update( 183 | extract_story_post( 184 | elem, 185 | request_fn=self.get, 186 | options=options, 187 | full_post_html=response.html, 188 | **kwargs, 189 | ) 190 | ) 191 | else: 192 | post.update( 193 | extract_post( 194 | elem, 195 | request_fn=self.get, 196 | options=options, 197 | full_post_html=response.html, 198 | **kwargs, 199 | ) 200 | ) 201 | if not post.get("post_url"): 202 | post["post_url"] = url 203 | if remove_source: 204 | post.pop('source', None) 205 | yield post 206 | 207 | def get_posts_by_search(self, word: str, **kwargs) -> Iterator[Post]: 208 | kwargs["scraper"] = self 209 | iter_pages_fn = partial(iter_search_pages, word=word, request_fn=self.get, **kwargs) 210 | return self._generic_get_posts(extract_post, iter_pages_fn, **kwargs) 211 | 212 | def get_friends(self, account, **kwargs) -> Iterator[Profile]: 213 | friend_opt = kwargs.get("friends") 214 | limit = None 215 | if type(friend_opt) in [int, float]: 216 | limit = friend_opt 217 | friend_url = kwargs.pop("start_url", None) 218 | if not friend_url: 219 | friend_url = utils.urljoin(FB_MOBILE_BASE_URL, f'/{account}/friends/') 220 | request_url_callback = kwargs.get('request_url_callback') 221 | friends_found = 0 222 | while friend_url: 223 | logger.debug(f"Requesting page from: {friend_url}") 224 | response = self.get(friend_url) 225 | elems = response.html.find('div[class="timeline"] > div > div') 226 | logger.debug(f"Found {len(elems)} friends") 227 | for elem in elems: 228 | name = elem.find("h3>a,h1>a", first=True) 229 | if not name: 230 | continue 231 | # Tagline 232 | tagline = elem.find("span.fcg", first=True) 233 | if tagline: 234 | tagline = tagline.text 235 | else: 236 | tagline = "" 237 | # Profile Picture 238 | profile_picture = elem.find("i.profpic", first=True).attrs.get("style") 239 | match = re.search(r"url\('(.+)'\)", profile_picture) 240 | if match: 241 | profile_picture = utils.decode_css_url(match.groups()[0]) 242 | # User ID if present, not present if no "add friend" 243 | user_id = elem.find("a.touchable[data-store]", first=True) 244 | if user_id: 245 | user_id = json.loads(user_id.attrs["data-store"]).get("id") 246 | else: 247 | user_id = "" 248 | 249 | friend = { 250 | "id": user_id, 251 | "link": name.attrs.get("href"), 252 | "name": name.text, 253 | "profile_picture": profile_picture, 254 | "tagline": tagline, 255 | } 256 | yield friend 257 | friends_found += 1 258 | if limit and friends_found > limit: 259 | return 260 | more = re.search(r'm_more_friends",href:"([^"]+)"', response.text) 261 | if more: 262 | friend_url = utils.urljoin(FB_MOBILE_BASE_URL, more.group(1)) 263 | if request_url_callback: 264 | request_url_callback(friend_url) 265 | else: 266 | 
return 267 | 268 | def get_collection(self, more_url, limit=None, **kwargs) -> Iterator[Profile]: 269 | request_url_callback = kwargs.get('request_url_callback') 270 | count = 0 271 | while more_url: 272 | logger.debug(f"Requesting page from: {more_url}") 273 | response = self.get(more_url) 274 | if response.text.startswith("for (;;);"): 275 | prefix_length = len('for (;;);') 276 | data = json.loads(response.text[prefix_length:]) # Strip 'for (;;);' 277 | for action in data['payload']['actions']: 278 | if action['cmd'] == 'append' and action['html']: 279 | element = utils.make_html_element( 280 | action['html'], 281 | url=FB_MOBILE_BASE_URL, 282 | ) 283 | elems = element.find('a.touchable') 284 | html = element.text 285 | elif action['cmd'] == 'script': 286 | more_url = re.search( 287 | r'("\\/timeline\\/app_collection\\/more\\/[^"]+")', action["code"] 288 | ) 289 | if more_url: 290 | more_url = more_url.group(1) 291 | more_url = json.loads(more_url) 292 | else: 293 | elems = response.html.find('#timelineBody a.touchable') 294 | more_url = re.search( 295 | r'href:"(/timeline/app_collection/more/[^"]+)"', response.text 296 | ) 297 | if more_url: 298 | more_url = more_url.group(1) 299 | logger.debug(f"Found {len(elems)} elems") 300 | for elem in elems: 301 | name = elem.find("strong", first=True).text 302 | link = elem.attrs.get("href") 303 | try: 304 | tagline = elem.find("div.twoLines", first=True).text 305 | except: 306 | tagline = None 307 | profile_picture = elem.find("i.profpic", first=True).attrs.get("style") 308 | match = re.search(r"url\('(.+)'\)", profile_picture) 309 | if match: 310 | profile_picture = utils.decode_css_url(match.groups()[0]) 311 | result = { 312 | "link": link, 313 | "name": name, 314 | "profile_picture": profile_picture, 315 | "tagline": tagline, 316 | } 317 | yield result 318 | count += 1 319 | if type(limit) in [int, float] and count > limit: 320 | return 321 | if more_url and request_url_callback: 322 | request_url_callback(more_url) 323 | 324 | def get_profile(self, account, **kwargs) -> Profile: 325 | account = account.replace("profile.php?id=", "") 326 | result = {} 327 | 328 | if kwargs.get("allow_extra_requests", True): 329 | logger.debug(f"Requesting page from: {account}") 330 | response = self.get(account) 331 | try: 332 | top_post = response.html.find( 333 | '[data-ft*="top_level_post_id"]:not([data-sigil="m-see-translate-link"])', 334 | first=True, 335 | ) 336 | assert top_post is not None 337 | top_post = PostExtractor(top_post, kwargs, self.get).extract_post() 338 | top_post.pop("source") 339 | result["top_post"] = top_post 340 | except Exception as e: 341 | logger.error(f"Unable to extract top_post {type(e)}:{e}") 342 | 343 | try: 344 | result["Friend_count"] = utils.parse_int( 345 | response.html.find("a[data-store*='friends']>div>div")[-1].text.split()[0] 346 | ) 347 | except Exception as e: 348 | result["Friend_count"] = None 349 | logger.error(f"Friend_count extraction failed: {e}") 350 | try: 351 | result["Follower_count"] = utils.parse_int( 352 | response.html.find( 353 | "div[data-sigil*='profile-intro-card-log']", 354 | containing="Followed by", 355 | first=True, 356 | ).text 357 | ) 358 | except Exception as e: 359 | result["Follower_count"] = None 360 | logger.error(f"Follower_count extraction failed: {e}") 361 | try: 362 | following_url = f'/{account}?v=following' 363 | logger.debug(f"Fetching {following_url}") 364 | following_response = self.get(following_url) 365 | result["Following_count"] = utils.parse_int( 366 | 
following_response.html.find("div[role='heading']", first=True).text 367 | ) 368 | except Exception as e: 369 | result["Following_count"] = None 370 | logger.error(f"Following_count extraction failed: {e}") 371 | 372 | photo_links = response.html.find("a[href^='/photo.php']") 373 | if len(photo_links) == 1: 374 | profile_photo = photo_links[0] 375 | response = self.get(profile_photo.attrs.get("href")) 376 | extractor = PostExtractor(response.html, kwargs, self.get) 377 | result["profile_picture"] = extractor.extract_photo_link_HQ(response.html.html) 378 | elif len(photo_links) >= 2: 379 | cover_photo = photo_links[0] 380 | result["cover_photo_text"] = cover_photo.attrs.get("title") 381 | # Check if there is a cover photo or not 382 | if result["cover_photo_text"] is not None: 383 | response = self.get(cover_photo.attrs.get("href")) 384 | extractor = PostExtractor(response.html, kwargs, self.get) 385 | result["cover_photo"] = extractor.extract_photo_link_HQ(response.html.html) 386 | 387 | profile_photo = photo_links[1] 388 | response = self.get(profile_photo.attrs.get("href")) 389 | result["profile_picture"] = extractor.extract_photo_link_HQ( 390 | response.html.html 391 | ) 392 | else: 393 | result["cover_photo"] = None 394 | profile_photo = photo_links[0] 395 | response = self.get(profile_photo.attrs.get("href")) 396 | extractor = PostExtractor(response.html, kwargs, self.get) 397 | result["profile_picture"] = extractor.extract_photo_link_HQ( 398 | response.html.html 399 | ) 400 | else: 401 | cover_photo = response.html.find( 402 | "div[data-sigil='cover-photo']>i.img", first=True 403 | ) 404 | if cover_photo: 405 | match = re.search(r"url\('(.+)'\)", cover_photo.attrs["style"]) 406 | if match: 407 | result["cover_photo"] = utils.decode_css_url(match.groups()[0]) 408 | profpic = response.html.find("img.profpic", first=True) 409 | if profpic: 410 | result["profile_picture"] = profpic.attrs["src"] 411 | 412 | about_url = utils.urljoin(FB_MOBILE_BASE_URL, f'/{account}/about/') 413 | logger.debug(f"Requesting page from: {about_url}") 414 | response = self.get(about_url) 415 | match = re.search(r'entity_id:(\d+)', response.html.html) 416 | if match: 417 | result["id"] = match.group(1) 418 | # Profile name is in the title 419 | title = response.html.find("title", first=True).text 420 | if " | " in title: 421 | title = title.split(" | ")[0] 422 | result["Name"] = title 423 | 424 | about = response.html.find("div#main_column,div.aboutme", first=True) 425 | if not about: 426 | logger.warning("No about section found") 427 | return result 428 | for card in about.find("div[data-sigil='profile-card']"): 429 | header = card.find("header", first=True).text 430 | if header.startswith("About"): 431 | header = "About" # Truncate strings like "About Mark" 432 | if header in ["Work, Education"]: 433 | experience = [] 434 | for elem in card.find("div.experience"): 435 | xp = {} 436 | try: 437 | xp["link"] = elem.find("a", first=True).attrs["href"] 438 | except: 439 | pass 440 | bits = elem.text.split("\n") 441 | if len(bits) == 2: 442 | xp["text"], xp["type"] = bits 443 | elif len(bits) == 3: 444 | xp["text"], xp["type"], xp["year"] = bits 445 | else: 446 | xp["text"] = elem.text 447 | experience.append(xp) 448 | result[header] = experience 449 | elif header == "Places lived": 450 | places = [] 451 | for elem in card.find("div.touchable"): 452 | place = {} 453 | try: 454 | place["link"] = elem.find("a", first=True).attrs["href"] 455 | except: 456 | pass 457 | if "\n" in elem.text: 458 | place["text"], 
place["type"] = elem.text.split("\n") 459 | else: 460 | place["text"] = elem.text 461 | places.append(place) 462 | result[header] = places 463 | else: 464 | bits = card.text.split("\n")[1:] # Remove header 465 | if len(bits) >= 3 and header == "Relationship": 466 | result[header] = {"to": bits[0], "type": bits[1], "since": bits[2]} 467 | elif len(bits) == 1: 468 | result[header] = bits[0] 469 | elif ( 470 | header 471 | in [ 472 | "Contact Info", 473 | "Basic Info", 474 | "Education", 475 | "Family Members", 476 | "Other names", 477 | ] 478 | and len(bits) % 2 == 0 479 | ): # Divisible by two, assume pairs 480 | pairs = {} 481 | for i in range(0, len(bits), 2): 482 | if bits[i + 1] == "Websites": 483 | if "Websites" not in pairs: 484 | pairs["Websites"] = [] 485 | pairs["Websites"].append(bits[i]) 486 | else: 487 | pairs[bits[i + 1]] = bits[i] 488 | result[header] = pairs 489 | else: 490 | result[header] = "\n".join(bits) 491 | if kwargs.get("friends"): 492 | result["Friends"] = list(self.get_friends(account, **kwargs)) 493 | if kwargs.get("followers"): 494 | result["Followers"] = list( 495 | self.get_collection( 496 | f'/{account}?v=followers', limit=kwargs.get("followers"), **kwargs 497 | ) 498 | ) 499 | if kwargs.get("following"): 500 | result["Following"] = list( 501 | self.get_collection( 502 | f'/{account}?v=following', limit=kwargs.get("following"), **kwargs 503 | ) 504 | ) 505 | 506 | # Likes 507 | if result.get("id") and kwargs.get("likes"): 508 | likes_url = utils.urljoin( 509 | FB_MOBILE_BASE_URL, 510 | f'timeline/app_section/?section_token={result["id"]}:2409997254', 511 | ) 512 | logger.debug(f"Requesting page from: {likes_url}") 513 | response = self.get(likes_url) 514 | result["likes_by_category"] = {} 515 | for elem in response.html.find('header[data-sigil="profile-card-header"]'): 516 | count, category = elem.text.split("\n") 517 | count = utils.parse_int(count) 518 | if category == "All Likes": 519 | result["likes_count"] = count 520 | result["likes_by_category"][category] = count 521 | 522 | all_likes_url = utils.urljoin( 523 | FB_MOBILE_BASE_URL, 524 | f'timeline/app_collection/?collection_token={result["id"]}:2409997254:96', 525 | ) 526 | logger.debug(f"Requesting page from: {all_likes_url}") 527 | response = self.get(all_likes_url) 528 | result["likes"] = [] 529 | for elem in response.html.find("div._1a5p"): 530 | result["likes"].append( 531 | { 532 | "name": elem.text, 533 | "link": elem.find("a", first=True).attrs.get("href"), 534 | } 535 | ) 536 | more_url = re.search(r'href:"(/timeline/app_collection/more/[^"]+)"', response.text) 537 | if more_url: 538 | more_url = more_url.group(1) 539 | while more_url: 540 | logger.debug(f"Fetching {more_url}") 541 | response = self.get(more_url) 542 | prefix_length = len('for (;;);') 543 | data = json.loads(response.text[prefix_length:]) # Strip 'for (;;);' 544 | for action in data['payload']['actions']: 545 | if action['cmd'] == 'append' and action['html']: 546 | element = utils.make_html_element( 547 | action['html'], 548 | url=FB_MOBILE_BASE_URL, 549 | ) 550 | for elem in element.find("div._1a5p"): 551 | result["likes"].append( 552 | { 553 | "name": elem.text, 554 | "link": elem.find("a", first=True).attrs.get("href"), 555 | } 556 | ) 557 | elif action['cmd'] == 'script': 558 | more_url = re.search( 559 | r'("\\/timeline\\/app_collection\\/more\\/[^"]+")', action["code"] 560 | ) 561 | if more_url: 562 | more_url = more_url.group(1) 563 | more_url = json.loads(more_url) 564 | 565 | return result 566 | 567 | def 
get_page_reviews(self, page, **kwargs) -> Iterator[Post]: 568 | more_url = f"/{page}/reviews" 569 | while more_url: 570 | logger.debug(f"Fetching {more_url}") 571 | response = self.get(more_url) 572 | if response.text.startswith("for (;;);"): 573 | prefix_length = len('for (;;);') 574 | data = json.loads(response.text[prefix_length:]) # Strip 'for (;;);' 575 | for action in data['payload']['actions']: 576 | if action['cmd'] == 'replace' and action['html']: 577 | element = utils.make_html_element( 578 | action['html'], 579 | url=FB_MOBILE_BASE_URL, 580 | ) 581 | elems = element.find('#page_suggestions_on_liking ~ div') 582 | elif action['cmd'] == 'script': 583 | more_url = re.search( 584 | r'see_more_cards_id","href":"([^"]+)"', action["code"] 585 | ) 586 | if more_url: 587 | more_url = more_url.group(1) 588 | more_url = utils.decode_css_url(more_url) 589 | more_url = more_url.replace("\\", "") 590 | else: 591 | elems = response.html.find('#page_suggestions_on_liking ~ div') 592 | more_url = re.search(r'see_more_cards_id",href:"([^"]+)"', response.text) 593 | if more_url: 594 | more_url = more_url.group(1) 595 | 596 | for elem in elems: 597 | header_elem = elem.find("div[data-nt='FB:TEXT4']:has(span)", first=True) 598 | if not header_elem: 599 | continue 600 | bits = list(header_elem.element.itertext()) 601 | username = bits[0].strip() 602 | recommends = "recommends" in header_elem.text 603 | links = header_elem.find("a") 604 | if len(links) == 2: 605 | user_url = utils.urljoin(FB_BASE_URL, links[0].attrs["href"]) 606 | else: 607 | user_url = None 608 | text_elem = elem.find("div[data-nt='FB:FEED_TEXT'] span p", first=True) 609 | if text_elem: 610 | text = text_elem.text 611 | else: 612 | text = None 613 | date_element = elem.find("abbr[data-store*='time']", first=True) 614 | time = json.loads(date_element.attrs["data-store"])["time"] 615 | yield { 616 | "user_url": user_url, 617 | "username": username, 618 | "profile_picture": elem.find("img", first=True).attrs["src"], 619 | "text": text, 620 | "header": header_elem.text, 621 | "time": datetime.fromtimestamp(time), 622 | "timestamp": time, 623 | "recommends": recommends, 624 | "post_url": utils.urljoin( 625 | FB_BASE_URL, elem.find("a[href*='story']", first=True).attrs["href"] 626 | ), 627 | } 628 | 629 | def get_page_info(self, page, **kwargs) -> Profile: 630 | result = {} 631 | 632 | try: 633 | logger.debug("getting page info using mbasic url") 634 | # mbasic info 635 | page_url = utils.urljoin(FB_MBASIC_BASE_URL, page) 636 | resp = self.get(page_url) 637 | container = resp.html.find("div#objects_container", first=True) 638 | name_element = container.find("strong", first=True) 639 | result["name"] = name_element.text 640 | soupElement = BeautifulSoup(container.html, features='html.parser') 641 | ancestorElement = soupElement.select('strong')[0].find_parent('div') 642 | if ancestorElement.find_parent('span'): 643 | ancestorElement = ancestorElement.find_parent('span').find_parent('div') 644 | description_element = ancestorElement.find_next_sibling("div") 645 | logger.debug("description_element") 646 | logger.debug(description_element) 647 | result["description"] = description_element.text 648 | result['category'] = soupElement.select('#category span')[1].text 649 | 650 | # getting basic info for a page 651 | def has_text(element): 652 | return element.get_text(strip=True) != '' and not len(element.find_all(True)) > 0 653 | 654 | contact_info_elements = soupElement.select("#contact-info")[0].find_all(has_text) 655 | basic_info_elements = 
soupElement.select("#basic-info")[0].find_all(has_text) 656 | result['contact_info'] = {contact_info_elements[i].text: contact_info_elements[i + 1].text for i in range(1, len(contact_info_elements), 2)} 657 | result['basic_info'] = {basic_info_elements[i].text: basic_info_elements[i + 1].text for i in range(0, len(basic_info_elements), 2)} 658 | logger.debug("getting page_id and user_id usong page_info from PageParser") 659 | page_basic_info = PageParser(resp).get_page_info() 660 | result.update(page_basic_info) 661 | 662 | result['url'] = resp.url 663 | logger.debug("extracting HQ profile photo") 664 | extractor = PostExtractor(resp.html, kwargs, self.get) 665 | image_elements = resp.html.find("a[href^='/photo.php']") 666 | result["cover_picture"] = extractor.extract_photo_link_HQ(response=None, useMbasic=True, mbasicUrl=utils.urljoin(FB_MBASIC_BASE_URL, image_elements[0].attrs['href'])) 667 | result["profile_picture"] = extractor.extract_photo_link_HQ(response=None, useMbasic=True, mbasicUrl=utils.urljoin(FB_MBASIC_BASE_URL, image_elements[1].attrs['href'])) 668 | 669 | except Exception as e: 670 | logger.error(f"Unable to extract page info: {e}") 671 | return result 672 | 673 | def get_group_info(self, group, **kwargs) -> Profile: 674 | self.set_user_agent( 675 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8" 676 | ) 677 | url = f'/groups/{group}' 678 | logger.debug(f"Requesting page from: {url}") 679 | resp = self.get(url).html 680 | try: 681 | url = resp.find("a[href*='?view=info']", first=True).attrs["href"] 682 | url += "&sfd=1" # Add parameter to get full "about"-text 683 | except AttributeError: 684 | raise exceptions.UnexpectedResponse("Unable to resolve view=info URL") 685 | logger.debug(f"Requesting page from: {url}") 686 | resp = self.get(url).html 687 | result = {} 688 | result["id"] = re.search(r'/groups/(\d+)', url).group(1) 689 | try: 690 | result["name"] = resp.find("header h3", first=True).text 691 | result["type"] = resp.find("header div", first=True).text 692 | members = resp.find("div[data-testid='m_group_sections_members']", first=True) 693 | result["members"] = utils.parse_int(members.text) 694 | except AttributeError: 695 | raise exceptions.UnexpectedResponse("Unable to get one of name, type, or members") 696 | 697 | # Try to extract the group description 698 | try: 699 | # Directly tageting the weird generated class names is not optimal, but it's the best i could do. 
700 | about_div = resp.find("._52jc._55wr", first=True) 701 | 702 | # Removing the -tags that are converted to linebreaks by .text 703 | from requests_html import HTML 704 | 705 | no_word_breaks = HTML(html=about_div.html.replace("", "")) 706 | 707 | result["about"] = no_word_breaks.text 708 | except: 709 | result["about"] = None 710 | 711 | try: 712 | url = members.find("a", first=True).attrs.get("href") 713 | logger.debug(f"Requesting page from: {url}") 714 | 715 | resp = self.get(url).html 716 | url = resp.find("a[href*='listType=list_admin_moderator']", first=True) 717 | if kwargs.get("admins", True): 718 | if url: 719 | url = url.attrs.get("href") 720 | logger.debug(f"Requesting page from: {url}") 721 | try: 722 | respAdmins = self.get(url).html 723 | except: 724 | raise exceptions.UnexpectedResponse("Unable to get admin list") 725 | else: 726 | respAdmins = resp 727 | # Test if we are a member that can add new members 728 | if re.match( 729 | "/groups/members/search", 730 | respAdmins.find( 731 | "div:nth-child(1)>div:nth-child(1) a:not(.touchable)", first=True 732 | ).attrs.get('href'), 733 | ): 734 | admins = respAdmins.find("div:nth-of-type(2)>div.touchable a:not(.touchable)") 735 | else: 736 | admins = respAdmins.find("div:first-child>div.touchable a:not(.touchable)") 737 | result["admins"] = [ 738 | { 739 | "name": e.text, 740 | "link": utils.filter_query_params(e.attrs["href"], blacklist=["refid"]), 741 | } 742 | for e in admins 743 | ] 744 | 745 | url = resp.find("a[href*='listType=list_nonfriend_nonadmin']", first=True) 746 | if kwargs.get("members", True): 747 | if url: 748 | url = url.attrs["href"] 749 | members = [] 750 | while url: 751 | logger.debug(f"Requesting page from: {url}") 752 | resp = self.get(url).html 753 | elems = resp.find("#root div.touchable a:not(.touchable)") 754 | members.extend([{"name": e.text, "link": e.attrs["href"]} for e in elems]) 755 | more = re.search(r'"m_more_item",href:"([^"]+)', resp.text) 756 | if more: 757 | url = more.group(1) 758 | else: 759 | url = None 760 | result["other_members"] = [m for m in members if m not in result["admins"]] 761 | else: 762 | logger.warning("No other members listed") 763 | except exceptions.LoginRequired as e: 764 | pass 765 | return result 766 | 767 | def get_shop(self, page, **kwargs) -> Iterator[Post]: 768 | self.set_user_agent( 769 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8" 770 | ) 771 | self.set_noscript(True) 772 | url = f"{page}/shop/" 773 | logger.debug(f"Fetching {url}") 774 | resp = self.get(url) 775 | more_links = resp.html.find("a[href]", containing="See More") 776 | if more_links: 777 | url = more_links[-1].attrs["href"] 778 | logger.debug(f"Fetching {url}") 779 | resp = self.get(url) 780 | items = resp.html.find("div.be") 781 | results = [] 782 | for item in items: 783 | link_elem = item.find("div.bl a", first=True) 784 | name = link_elem.text 785 | link = link_elem.attrs["href"] 786 | image = item.find("img", first=True).attrs["src"] 787 | price = item.find("div.bl")[-1].text 788 | result = {"name": name, "link": link, "image": image, "price": price} 789 | results.append(result) 790 | return results 791 | 792 | def get_group_posts(self, group: Union[str, int], **kwargs) -> Iterator[Post]: 793 | kwargs["scraper"] = self 794 | self.set_user_agent( 795 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8" 796 | ) 797 | iter_pages_fn = 
partial(iter_group_pages, group=group, request_fn=self.get, **kwargs) 798 | return self._generic_get_posts(extract_group_post, iter_pages_fn, **kwargs) 799 | 800 | def check_locale(self, response): 801 | if self.have_checked_locale: 802 | return 803 | match = re.search(r'"IntlCurrentLocale",\[\],{code:"(\w{2}_\w{2})"}', response.text) 804 | if match: 805 | locale = match.groups(1)[0] 806 | if locale != "en_US": 807 | warnings.warn( 808 | f"Facebook language detected as {locale} - for best results, set to en_US" 809 | ) 810 | self.have_checked_locale = True 811 | 812 | def get(self, url, **kwargs): 813 | try: 814 | self.request_count += 1 815 | url = str(url) 816 | if not url.startswith("http"): 817 | url = utils.urljoin(FB_MOBILE_BASE_URL, url) 818 | 819 | if kwargs.get("post"): 820 | kwargs.pop("post") 821 | response = self.session.post(url=url, **kwargs) 822 | else: 823 | if url.startswith(FB_MBASIC_BASE_URL) and self.mbasic_headers is not None: 824 | self.session.headers.clear() 825 | self.session.headers.update(self.mbasic_headers) 826 | response = self.session.get(url=url, **self.requests_kwargs, **kwargs) 827 | if url.startswith(FB_MBASIC_BASE_URL) and self.mbasic_headers is not None: 828 | self.session.headers.clear() 829 | self.session.headers.update(self.default_headers) 830 | DEBUG = False 831 | if DEBUG: 832 | for filename in os.listdir("."): 833 | if filename.endswith(".html") and filename.replace(".html", "") in url: 834 | logger.debug(f"Replacing {url} content with {filename}") 835 | with open(filename) as f: 836 | response.html.html = f.read() 837 | response.html.html = response.html.html.replace('', '') 838 | response.raise_for_status() 839 | self.check_locale(response) 840 | 841 | # Special handling for video posts that redirect to /watch/ 842 | if response.url == "https://m.facebook.com/watch/?ref=watch_permalink": 843 | post_url = re.search("\d+", url).group() 844 | if post_url: 845 | url = utils.urljoin( 846 | FB_MOBILE_BASE_URL, 847 | f"story.php?story_fbid={post_url}&id=1&m_entstream_source=timeline", 848 | ) 849 | post = {"original_request_url": post_url, "post_url": url} 850 | logger.debug(f"Requesting page from: {url}") 851 | response = self.get(url) 852 | if "/watch/" in response.url: 853 | video_id = parse_qs(urlparse(response.url).query).get("v")[0] 854 | url = f"story.php?story_fbid={video_id}&id={video_id}&m_entstream_source=video_home&player_suborigin=entry_point&player_format=permalink" 855 | logger.debug(f"Fetching {url}") 856 | response = self.get(url) 857 | 858 | if "cookie/consent-page" in response.url: 859 | response = self.submit_form(response) 860 | if ( 861 | response.url.startswith(FB_MOBILE_BASE_URL) 862 | and not response.html.find("script", first=True) 863 | and "script" not in response.html.html 864 | and self.session.cookies.get("noscript") != "1" 865 | ): 866 | warnings.warn( 867 | f"Facebook served mbasic/noscript content unexpectedly on {response.url}" 868 | ) 869 | if response.html.find("h1,h2", containing="Unsupported Browser"): 870 | warnings.warn(f"Facebook says 'Unsupported Browser'") 871 | title = response.html.find("title", first=True) 872 | not_found_titles = ["page not found", "content not found"] 873 | temp_ban_titles = [ 874 | "you can't use this feature at the moment", 875 | "you can't use this feature right now", 876 | "you’re temporarily blocked", 877 | ] 878 | if "checkpoint" in response.url: 879 | if response.html.find("h1", containing="We suspended your account"): 880 | raise exceptions.AccountDisabled("Your Account Has Been 
Disabled") 881 | if title: 882 | if title.text.lower() in not_found_titles: 883 | raise exceptions.NotFound(title.text) 884 | elif title.text.lower() == "error": 885 | raise exceptions.UnexpectedResponse("Your request couldn't be processed") 886 | elif title.text.lower() in temp_ban_titles: 887 | raise exceptions.TemporarilyBanned(title.text) 888 | elif ">your account has been disabled<" in response.html.html.lower(): 889 | raise exceptions.AccountDisabled("Your Account Has Been Disabled") 890 | elif ( 891 | ">We saw unusual activity on your account. This may mean that someone has used your account without your knowledge.<" 892 | in response.html.html 893 | ): 894 | raise exceptions.AccountDisabled("Your Account Has Been Locked") 895 | elif ( 896 | title.text == "Log in to Facebook | Facebook" 897 | or response.url.startswith(utils.urljoin(FB_MOBILE_BASE_URL, "login")) 898 | or response.url.startswith(utils.urljoin(FB_W3_BASE_URL, "login")) 899 | or response.url.startswith(utils.urljoin(FB_MBASIC_BASE_URL, "login")) 900 | ): 901 | raise exceptions.LoginRequired( 902 | "A login (cookies) is required to see this page" 903 | ) 904 | return response 905 | except RequestException as ex: 906 | logger.exception("Exception while requesting URL: %s\nException: %r", url, ex) 907 | raise 908 | 909 | def submit_form(self, response, extra_data={}): 910 | action = response.html.find("form", first=True).attrs.get('action') 911 | url = utils.urljoin(self.base_url, action) 912 | elems = response.html.find("input[name][value]") 913 | data = {elem.attrs['name']: elem.attrs['value'] for elem in elems} 914 | data.update(extra_data) 915 | response = self.session.post(url, data=data, **self.requests_kwargs) 916 | return response 917 | 918 | def login(self, email: str, password: str): 919 | response = self.get(self.base_url) 920 | 921 | datr_cookie = re.search('(?<=_js_datr",")[^"]+', response.html.html) 922 | if datr_cookie: 923 | cookie_value = datr_cookie.group() 924 | self.session.cookies.set('datr', cookie_value) 925 | 926 | response = self.submit_form( 927 | response, {"email": email, "pass": password, "_fb_noscript": None} 928 | ) 929 | 930 | login_error = response.html.find('#login_error', first=True) 931 | if login_error: 932 | raise exceptions.LoginError(login_error.text) 933 | 934 | if "enter login code to continue" in response.text.lower(): 935 | token = input("Enter 2FA token: ") 936 | response = self.submit_form(response, {"approvals_code": token}) 937 | strong = response.html.find("strong", first=True) 938 | if strong and strong.text.startswith("The login code you entered doesn't match"): 939 | raise exceptions.LoginError(strong.text) 940 | # Remember Browser 941 | response = self.submit_form(response, {"name_action_selected": "save_device"}) 942 | if "review recent login" in response.text.lower(): 943 | response = self.submit_form(response) 944 | # Login near {location} from {browser} on {OS} ({time}). Unset "This wasn't me", leaving "This was me" set. 945 | response = self.submit_form(response, {"submit[This wasn't me]": None}) 946 | # Remember Browser. Please save the browser that you just verified. You won't have to enter a code when you log in from browsers that you've saved. 947 | response = self.submit_form(response, {"name_action_selected": "save_device"}) 948 | 949 | if "login approval needed" in response.text.lower() or "checkpoint" in response.url: 950 | input( 951 | "Login approval needed. From a browser logged into this account, approve this login from your notifications. 
Press enter once you've approved it." 952 | ) 953 | response = self.submit_form(response, {"submit[Continue]": "Continue"}) 954 | if "the password that you entered is incorrect" in response.text.lower(): 955 | raise exceptions.LoginError("The password that you entered is incorrect") 956 | if 'c_user' not in self.session.cookies: 957 | with open("login_error.html", "w") as f: 958 | f.write(response.text) 959 | raise exceptions.LoginError("Login unsuccessful") 960 | 961 | def is_logged_in(self) -> bool: 962 | try: 963 | self.get('https://facebook.com/settings') 964 | return True 965 | except exceptions.LoginRequired: 966 | return False 967 | 968 | def _generic_get_posts( 969 | self, 970 | extract_post_fn, 971 | iter_pages_fn, 972 | page_limit=DEFAULT_PAGE_LIMIT, 973 | options=None, 974 | remove_source=True, 975 | latest_date=None, 976 | max_past_limit=5, 977 | **kwargs, 978 | ): 979 | if options is None: 980 | options = {} 981 | elif isinstance(options, set): 982 | warnings.warn("The options argument should be a dictionary.", stacklevel=3) 983 | options = {k: True for k in options} 984 | if self.session.cookies.get("noscript") == "1": 985 | options["noscript"] = True 986 | 987 | if page_limit and page_limit <= 2: 988 | warnings.warn( 989 | "A low page limit (<=2) might return no results, try increasing the limit", 990 | stacklevel=3, 991 | ) 992 | 993 | # if latest_date is specified, iterate until the date is reached n times in a row (recurrent_past_posts) 994 | if latest_date is not None: 995 | # Pinned posts repeat themselves over time, so ignore them 996 | pinned_posts = [] 997 | 998 | # Stats 999 | null_date_posts = 0 1000 | total_scraped_posts = 0 1001 | 1002 | # Helpers 1003 | recurrent_past_posts = 0 1004 | show_every = 50 1005 | done = False 1006 | 1007 | for page in iter_pages_fn(): 1008 | for post_element in page: 1009 | try: 1010 | post = extract_post_fn( 1011 | post_element, options=options, request_fn=self.get, **kwargs 1012 | ) 1013 | 1014 | if remove_source: 1015 | post.pop("source", None) 1016 | 1017 | # date is None, no way to check latest_date, yield it 1018 | if post["time"] is None: 1019 | null_date_posts += 1 1020 | 1021 | # date is above latest_date, yield it 1022 | if post["time"] > latest_date: 1023 | recurrent_past_posts = 0 1024 | 1025 | # if any of above, yield the post and continue 1026 | if post["time"] is None or post["time"] > latest_date: 1027 | total_scraped_posts += 1 1028 | if total_scraped_posts % show_every == 0: 1029 | logger.info("Posts scraped: %s", total_scraped_posts) 1030 | 1031 | yield post 1032 | continue 1033 | 1034 | # else, the date is behind the date limit 1035 | recurrent_past_posts += 1 1036 | 1037 | # and it has reached the max_past_limit posts 1038 | if recurrent_past_posts >= max_past_limit: 1039 | done = True 1040 | logger.info( 1041 | "Sequential posts behind latest_date reached. Stopping scraping." 1042 | ) 1043 | logger.info( 1044 | "Posts with null date: %s", 1045 | null_date_posts, 1046 | ) 1047 | break 1048 | 1049 | # or the text is not banned (repeated) 1050 | if post["text"] is not None and post["text"] not in pinned_posts: 1051 | pinned_posts.append(post["text"]) 1052 | logger.warning( 1053 | "Sequential post #%s behind the date limit: %s. Ignored (in logs) from now on.", 1054 | recurrent_past_posts, 1055 | post["time"], 1056 | ) 1057 | 1058 | except Exception as e: 1059 | logger.exception( 1060 | "An exception has occured during scraping: %s. 
Omitting the post...", 1061 | e, 1062 | ) 1063 | 1064 | # if max_past_limit, stop 1065 | if done: 1066 | break 1067 | 1068 | # else, iterate over pages as usual 1069 | else: 1070 | counter = itertools.count(0) if page_limit is None else range(page_limit) 1071 | 1072 | logger.debug("Starting to iterate pages") 1073 | for i, page in zip(counter, iter_pages_fn()): 1074 | logger.debug("Extracting posts from page %s", i) 1075 | # extra_info is already in the kwargs, so we pop it out 1076 | kwargs.pop("extra_info", None) 1077 | for post_element in page: 1078 | post = extract_post_fn( 1079 | post_element, 1080 | options=options, 1081 | request_fn=self.get, 1082 | extra_info=page.extra_info, 1083 | **kwargs, 1084 | ) 1085 | if remove_source: 1086 | post.pop('source', None) 1087 | yield post 1088 | 1089 | def get_groups_by_search(self, word: str, **kwargs): 1090 | group_search_url = utils.urljoin(FB_MOBILE_BASE_URL, f"search/groups/?q={word}") 1091 | r = self.get(group_search_url) 1092 | for group_element in r.html.find('div[role="button"]'): 1093 | button_id = group_element.attrs["id"] 1094 | group_id = self.find_group_id(button_id, r.text) 1095 | try: 1096 | yield self.get_group_info(group_id) 1097 | except AttributeError: 1098 | continue 1099 | 1100 | @staticmethod 1101 | def find_group_id(button_id, raw_html): 1102 | """Each group button has an id, which appears later in the script 1103 | tag followed by the group id.""" 1104 | s = raw_html[raw_html.rfind(button_id) :] 1105 | group_id = s[s.find("result_id:") :].split(",")[0].split(":")[1] 1106 | return int(group_id) 1107 | -------------------------------------------------------------------------------- /facebook_scraper/fb_types.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable, Dict, Iterable, Tuple 2 | 3 | from requests import Response 4 | from requests_html import Element 5 | 6 | 7 | URL = str 8 | Options = Dict[str, Any] 9 | Post = Dict[str, Any] 10 | Profile = Dict[str, Any] 11 | RequestFunction = Callable[[URL], Response] 12 | RawPage = Element 13 | RawPost = Element 14 | Page = Iterable[RawPost] 15 | Credentials = Tuple[str, str] 16 | -------------------------------------------------------------------------------- /facebook_scraper/internal_classes.py: -------------------------------------------------------------------------------- 1 | class PageClass: 2 | def __init__(self, raw_posts, extra_info=None): 3 | self.raw_posts = raw_posts 4 | self.extra_info = extra_info 5 | super().__init__() 6 | 7 | def __iter__(self): 8 | return iter(self.raw_posts) -------------------------------------------------------------------------------- /facebook_scraper/page_iterators.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import re 4 | import textwrap 5 | from typing import Iterator, Optional, Union 6 | import time 7 | 8 | from requests.exceptions import HTTPError 9 | import warnings 10 | 11 | from . import utils 12 | from .constants import FB_MOBILE_BASE_URL, FB_MBASIC_BASE_URL 13 | 14 | from .fb_types import URL, Page, RawPage, RequestFunction, Response 15 | from . 
import exceptions 16 | from .internal_classes import PageClass 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | def iter_hashtag_pages(hashtag: str, request_fn: RequestFunction, **kwargs) -> Iterator[Page]: 22 | start_url = kwargs.pop("start_url", None) 23 | if not start_url: 24 | start_url = utils.urljoin(FB_MBASIC_BASE_URL, f'/hashtag/{hashtag}/') 25 | try: 26 | request_fn(start_url) 27 | except Exception as ex: 28 | logger.error(ex) 29 | return generic_iter_pages(start_url, HashtagPageParser, request_fn, **kwargs) 30 | 31 | 32 | def iter_pages(account: str, request_fn: RequestFunction, **kwargs) -> Iterator[Page]: 33 | start_url = kwargs.pop("start_url", None) 34 | if not start_url: 35 | start_url = utils.urljoin( 36 | FB_MOBILE_BASE_URL, 37 | f'/{account}', 38 | ) 39 | return generic_iter_pages(start_url, PageParser, request_fn, **kwargs) 40 | 41 | 42 | def iter_group_pages( 43 | group: Union[str, int], request_fn: RequestFunction, **kwargs 44 | ) -> Iterator[Page]: 45 | start_url = kwargs.pop("start_url", None) 46 | 47 | if not start_url: 48 | start_url = utils.urljoin(FB_MOBILE_BASE_URL, f'groups/{group}/') 49 | 50 | return generic_iter_pages(start_url, GroupPageParser, request_fn, **kwargs) 51 | 52 | 53 | def iter_search_pages(word: str, request_fn: RequestFunction, **kwargs) -> Iterator[Page]: 54 | start_url = kwargs.pop("start_url", None) 55 | if not start_url: 56 | start_url = utils.urljoin( 57 | FB_MOBILE_BASE_URL, 58 | f'/search/posts?q={word}' 59 | f'&filters=eyJyZWNlbnRfcG9zdHM6MCI6IntcIm5hbWVcIjpcInJlY2VudF9wb3N0c1wiLFwiYXJnc1wiOlwiXCJ9In0%3D', 60 | ) 61 | try: 62 | request_fn(start_url) 63 | except Exception as ex: 64 | logger.error(ex) 65 | start_url = utils.urljoin(FB_MOBILE_BASE_URL, f'/search/posts?q={word}') 66 | return generic_iter_pages(start_url, SearchPageParser, request_fn, **kwargs) 67 | 68 | 69 | def iter_photos(account: str, request_fn: RequestFunction, **kwargs) -> Iterator[Page]: 70 | start_url = utils.urljoin(FB_MOBILE_BASE_URL, f'/{account}/photos/') 71 | return generic_iter_pages(start_url, PhotosPageParser, request_fn, **kwargs) 72 | 73 | 74 | def generic_iter_pages( 75 | start_url, page_parser_cls, request_fn: RequestFunction, **kwargs 76 | ) -> Iterator[PageClass]: 77 | next_url = start_url 78 | 79 | base_url = kwargs.get('base_url', FB_MOBILE_BASE_URL) 80 | request_url_callback = kwargs.get('request_url_callback') 81 | while next_url: 82 | # Execute callback of starting a new URL request 83 | if request_url_callback: 84 | # The callback can return an exit code to stop the iteration 85 | # This is useful in the cases where the requests triggers an infinite redirect loop. 86 | exit_code = request_url_callback(next_url) 87 | if exit_code: 88 | logger.debug("Exit code %s received from request_url_callback, exiting", exit_code) 89 | break 90 | 91 | RETRY_LIMIT = 6 92 | for retry in range(1, RETRY_LIMIT + 1): 93 | try: 94 | logger.debug("Requesting page from: %s", next_url) 95 | response = request_fn(next_url) 96 | break 97 | except HTTPError as e: 98 | if e.response.status_code == 500 and retry < RETRY_LIMIT: 99 | sleep_duration = retry * 2 100 | logger.debug( 101 | f"Caught exception, retry number {retry}. 
Sleeping for {sleep_duration}s" 102 | ) 103 | if retry == (RETRY_LIMIT / 2): 104 | logger.debug("Requesting noscript") 105 | kwargs["scraper"].set_noscript(True) 106 | time.sleep(sleep_duration) 107 | else: 108 | raise 109 | 110 | logger.debug("Parsing page response") 111 | parser = page_parser_cls(response) 112 | 113 | page = parser.get_page() 114 | 115 | # TODO: If page is actually an iterable calling len(page) might consume it 116 | logger.debug("Got %s raw posts from page", len(page.raw_posts)) 117 | yield page 118 | 119 | logger.debug("Looking for next page URL") 120 | next_page = parser.get_next_page() 121 | if next_page: 122 | posts_per_page = kwargs.get("options", {}).get("posts_per_page") 123 | if posts_per_page: 124 | next_page = next_page.replace("num_to_fetch=4", f"num_to_fetch={posts_per_page}") 125 | next_url = utils.urljoin(base_url, next_page) 126 | next_url = next_url.replace("amp;", f"") 127 | else: 128 | logger.info("Page parser did not find next page URL") 129 | next_url = None 130 | 131 | 132 | class PageParser: 133 | """Class for Parsing a single page on a Page""" 134 | 135 | json_prefix = 'for (;;);' 136 | 137 | cursor_regex = re.compile(r'href[:=]"(/page_content[^"]+)"') # First request 138 | cursor_regex_2 = re.compile(r'href"[:=]"(\\/page_content[^"]+)"') # Other requests 139 | cursor_regex_3 = re.compile( 140 | r'href:"(/profile/timeline/stream/\?cursor[^"]+)"' 141 | ) # scroll/cursor based, first request 142 | cursor_regex_4 = re.compile( 143 | r'href\\":\\"\\+(/profile\\+/timeline\\+/stream[^"]+)\"' 144 | ) # scroll/cursor based, other requests 145 | # adding new regex for the cursor 146 | cursor_regex_5 = re.compile( 147 | r'href="(/profile/timeline/stream/\?cursor[^"]+)"' 148 | ) # scroll/cursor based, first request 149 | 150 | def __init__(self, response: Response): 151 | self.response = response 152 | self.html = None 153 | self.cursor_blob = None 154 | 155 | self._parse() 156 | 157 | def get_page(self) -> PageClass: 158 | # Select only elements that have the data-ft attribute 159 | # it seems top_level_post_id is not always present, an update on the app is needed here but in case it's there 160 | # we can use it 161 | page = self._get_page('article[data-ft*="top_level_post_id"]', 'article') 162 | if (len(page) == 0): 163 | # TODO remove the backward compatible article selector 164 | page = self._get_page('article[data-ft], div[role="article"][data-ft]', 'article') 165 | return PageClass(page, self.get_page_info()) 166 | 167 | def get_page_info(self): 168 | more_page_element = self.html.find('a[href*="/mbasic/more/?owner_id"]', first=True) 169 | # TODO [Code quality] Refactor the regex search to use globally available 170 | message_page_element = self.html.find('a[href^="/messages/thread/"]', first=True) 171 | page_id_match = re.search(r'/messages/thread/(\d+)/', message_page_element.attrs.get('href')) if message_page_element else None 172 | 173 | # type 2 of page_id matching 174 | if page_id_match is None: 175 | logger.debug("trying type 2 of page_id matching against \"intent://user/\"") 176 | message_page_element = self.html.find('a[href^="intent://user/"]', first=True) 177 | page_id_match = re.search(r'intent://user/(\d+)/', 178 | message_page_element.attrs.get('href')) if message_page_element else None 179 | return { 180 | 'user_id': self.html.find('a[href^="/mbasic/more/?owner_id"]', first=True) 181 | .attrs.get('href') 182 | .split('owner_id=')[1] 183 | .split('&')[0] 184 | if more_page_element 185 | else None, 186 | 'page_id': page_id_match.group(1) 
if page_id_match else None 187 | } 188 | 189 | def get_raw_page(self) -> RawPage: 190 | return self.html 191 | 192 | def get_next_page(self) -> Optional[URL]: 193 | assert self.cursor_blob is not None 194 | 195 | match = self.cursor_regex.search(self.cursor_blob) 196 | if match: 197 | return utils.unquote(match.groups()[0]).replace("&", "&") 198 | 199 | match = self.cursor_regex_2.search(self.cursor_blob) 200 | if match: 201 | value = match.groups()[0] 202 | return utils.unquote( 203 | value.encode('utf-8').decode('unicode_escape').replace('\\/', '/') 204 | ).replace("&", "&") 205 | 206 | match = self.cursor_regex_3.search(self.cursor_blob) 207 | if match: 208 | return match.groups()[0] 209 | 210 | match = self.cursor_regex_4.search(self.response.text) 211 | if match: 212 | value = match.groups()[0] 213 | return re.sub(r'\\+/', '/', value) 214 | 215 | match = self.cursor_regex_5.search(self.cursor_blob) 216 | if match: 217 | return match.groups()[0] 218 | return None 219 | 220 | def _parse(self): 221 | if self.response.text.startswith(self.json_prefix): 222 | self._parse_json() 223 | else: 224 | self._parse_html() 225 | 226 | def _parse_html(self): 227 | self.html = self.response.html 228 | self.cursor_blob = self.response.text 229 | 230 | def _parse_json(self): 231 | prefix_length = len(self.json_prefix) 232 | data = json.loads(self.response.text[prefix_length:]) # Strip 'for (;;);' 233 | 234 | for action in data.get('payload', data)['actions']: 235 | if action['cmd'] == 'replace': 236 | self.html = utils.make_html_element(action['html'], url=FB_MOBILE_BASE_URL) 237 | self.cursor_blob = self.html.html 238 | elif action['cmd'] == 'script': 239 | self.cursor_blob = action['code'] 240 | 241 | assert self.html is not None 242 | 243 | def _get_page(self, selection, selection_name) -> Page: 244 | raw_page = self.get_raw_page() 245 | raw_posts = raw_page.find(selection) 246 | # This is not an issue anymore as fb doesn't send bad HTML anymore 247 | # TODO Remove this in the future as it's not needed 248 | #for post in raw_posts: 249 | #if not post.find("footer"): 250 | # This is not an issue anymore as fb doesn't send bad HTML anymore 251 | # Due to malformed HTML served by Facebook, lxml might misinterpret where the footer should go in article elements 252 | # If we limit the parsing just to the section element, it fixes it 253 | # Please forgive me for parsing HTML with regex 254 | #logger.warning(f"No footer in article - reparsing HTML within
element") 255 | #html = re.search(r'(.+)
', raw_page.html).group(1) 256 | #raw_page = utils.make_html_element(html=html) 257 | #raw_posts = raw_page.find(selection) 258 | #break 259 | 260 | if not raw_posts: 261 | logger.warning( 262 | "No raw posts (<%s> elements) were found in this page." % selection_name 263 | ) 264 | if logger.isEnabledFor(logging.DEBUG): 265 | content = textwrap.indent( 266 | raw_page.text, 267 | prefix='| ', 268 | predicate=lambda _: True, 269 | ) 270 | sep = '+' + '-' * 60 271 | logger.debug("The page url is: %s", self.response.url) 272 | logger.debug("The page content is:\n%s\n%s%s\n", sep, content, sep) 273 | 274 | return raw_posts 275 | 276 | 277 | class GroupPageParser(PageParser): 278 | """Class for parsing a single page of a group""" 279 | 280 | cursor_regex_3 = re.compile(r'href[=:]"(\/groups\/[^"]+bac=[^"]+)"') # for Group requests 281 | cursor_regex_3_basic_new = re.compile( 282 | r'href[=:]"(\/groups\/[^"]+bacr=[^"]+)"' 283 | ) # for mbasic Group requests 2023 284 | 285 | def get_next_page(self) -> Optional[URL]: 286 | next_page = super().get_next_page() 287 | if next_page: 288 | return next_page 289 | 290 | assert self.cursor_blob is not None 291 | logger.debug("using extra page processor") 292 | match = self.cursor_regex_3.search(self.cursor_blob) 293 | if match: 294 | value = match.groups()[0] 295 | return value.encode('utf-8').decode('unicode_escape').replace('\\/', '/') 296 | else: 297 | match = self.cursor_regex_3_basic_new.search(self.cursor_blob) 298 | return ( 299 | match.groups()[0].encode('utf-8').decode('unicode_escape').replace('\\/', '/') 300 | if match 301 | else None 302 | ) 303 | return None 304 | 305 | def _parse(self): 306 | self._parse_html() 307 | 308 | 309 | class PhotosPageParser(PageParser): 310 | cursor_regex = re.compile(r'href:"(/photos/pandora/[^"]+)"') 311 | cursor_regex_2 = re.compile(r'href":"(\\/photos\\/pandora\\/[^"]+)"') 312 | 313 | def get_page(self) -> Page: 314 | return super()._get_page('div._5v64', "div._5v64") 315 | 316 | def get_next_page(self) -> Optional[URL]: 317 | if self.cursor_blob is not None: 318 | match = self.cursor_regex.search(self.cursor_blob) 319 | if match: 320 | return match.groups()[0] 321 | 322 | match = self.cursor_regex_2.search(self.cursor_blob) 323 | if match: 324 | value = match.groups()[0] 325 | return value.encode('utf-8').decode('unicode_escape').replace('\\/', '/') 326 | 327 | 328 | class SearchPageParser(PageParser): 329 | cursor_regex = re.compile(r'href[:=]"[^"]+(/search/[^"]+)"') 330 | cursor_regex_2 = re.compile(r'href":"[^"]+(/search/[^"]+)"') 331 | 332 | def get_next_page(self) -> Optional[URL]: 333 | if self.cursor_blob is not None: 334 | match = self.cursor_regex.search(self.cursor_blob) 335 | if match: 336 | return match.groups()[0] 337 | 338 | match = self.cursor_regex_2.search(self.cursor_blob) 339 | if match: 340 | value = match.groups()[0] 341 | return value.encode('utf-8').decode('unicode_escape').replace('\\/', '/') 342 | 343 | 344 | class HashtagPageParser(PageParser): 345 | cursor_regex = re.compile(r'(\/hashtag\/[a-z]+\/\?cursor=[^"]+).*$') 346 | 347 | def get_page(self) -> Page: 348 | return super()._get_page('article', 'article') 349 | 350 | def get_next_page(self) -> Optional[URL]: 351 | assert self.cursor_blob is not None 352 | 353 | match = self.cursor_regex.search(self.cursor_blob) 354 | if match: 355 | return utils.unquote(match.groups()[0]).replace("&", "&") 356 | 357 | return None 358 | -------------------------------------------------------------------------------- /facebook_scraper/utils.py: 
-------------------------------------------------------------------------------- 1 | import codecs 2 | import re 3 | from datetime import datetime, timedelta 4 | import calendar 5 | from typing import Optional 6 | from urllib.parse import parse_qsl, unquote, urlencode, urljoin, urlparse, urlunparse 7 | 8 | import dateparser 9 | import lxml.html 10 | from bs4 import BeautifulSoup 11 | from requests.cookies import RequestsCookieJar 12 | from requests_html import DEFAULT_URL, Element, PyQuery 13 | import json 14 | import traceback 15 | 16 | from . import exceptions 17 | import logging 18 | import time 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | def find_and_search(node, selector, pattern, cast=str): 24 | container = node.find(selector, first=True) 25 | match = container and pattern.search(container.html) 26 | return match and cast(match.groups()[0]) 27 | 28 | 29 | def parse_int(value: str) -> int: 30 | return int(''.join(filter(lambda c: c.isdigit(), value))) 31 | 32 | 33 | def convert_numeric_abbr(s): 34 | mapping = {'k': 1000, 'm': 1e6} 35 | s = s.replace(",", "") 36 | if s[-1].isalpha(): 37 | return int(float(s[:-1]) * mapping[s[-1].lower()]) 38 | return int(s) 39 | 40 | 41 | def parse_duration(s) -> int: 42 | match = re.search(r'T(?P\d+H)?(?P\d+M)?(?P\d+S)', s) 43 | if match: 44 | result = 0 45 | for k, v in match.groupdict().items(): 46 | if v: 47 | if k == 'hours': 48 | result += int(v.strip("H")) * 60 * 60 49 | elif k == "minutes": 50 | result += int(v.strip("M")) * 60 51 | elif k == "seconds": 52 | result += int(v.strip("S")) 53 | return result 54 | 55 | 56 | def decode_css_url(url: str) -> str: 57 | url = re.sub(r'\\(..) ', r'\\x\g<1>', url) 58 | url, _ = codecs.unicode_escape_decode(url) 59 | url, _ = codecs.unicode_escape_decode(url) 60 | return url 61 | 62 | 63 | def get_background_image_url(style): 64 | match = re.search(r"url\('(.+)'\)", style) 65 | return decode_css_url(match.groups()[0]) 66 | 67 | 68 | def filter_query_params(url, whitelist=None, blacklist=None) -> str: 69 | def is_valid_param(param): 70 | if whitelist is not None: 71 | return param in whitelist 72 | if blacklist is not None: 73 | return param not in blacklist 74 | return True # Do nothing 75 | 76 | parsed_url = urlparse(url) 77 | query_params = parse_qsl(parsed_url.query) 78 | query_string = urlencode([(k, v) for k, v in query_params if is_valid_param(k)]) 79 | return urlunparse(parsed_url._replace(query=query_string)) 80 | 81 | 82 | def combine_url_params(url1, url2) -> str: 83 | parsed_url = urlparse(url1) 84 | parsed_url2 = urlparse(url2) 85 | query_params = parse_qsl(parsed_url.query) + parse_qsl(parsed_url2.query) 86 | query_string = urlencode([(k, v) for k, v in query_params]) 87 | return urlunparse(parsed_url._replace(query=query_string)) 88 | 89 | 90 | def remove_control_characters(html): 91 | # type: (t.Text) -> t.Text 92 | """ 93 | Strip invalid XML characters that `lxml` cannot parse. 
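    Characters in the invalid XML ranges (U+0000-U+0008, U+000B, U+000C, U+000E-U+001F,
    surrogates and U+FFFE/U+FFFF) are removed whether they appear literally or as numeric
    character references; remaining non-ASCII text is kept as XML character references.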
94 | """ 95 | # See: https://github.com/html5lib/html5lib-python/issues/96 96 | # 97 | # The XML 1.0 spec defines the valid character range as: 98 | # Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] 99 | # 100 | # We can instead match the invalid characters by inverting that range into: 101 | # InvalidChar ::= #xb | #xc | #xFFFE | #xFFFF | [#x0-#x8] | [#xe-#x1F] | [#xD800-#xDFFF] 102 | # 103 | # Sources: 104 | # https://www.w3.org/TR/REC-xml/#charsets, 105 | # https://lsimons.wordpress.com/2011/03/17/stripping-illegal-characters-out-of-xml-in-python/ 106 | def strip_illegal_xml_characters(s, default, base=10): 107 | # Compare the "invalid XML character range" numerically 108 | n = int(s, base) 109 | if ( 110 | n in (0xB, 0xC, 0xFFFE, 0xFFFF) 111 | or 0x0 <= n <= 0x8 112 | or 0xE <= n <= 0x1F 113 | or 0xD800 <= n <= 0xDFFF 114 | ): 115 | return "" 116 | return default 117 | 118 | # We encode all non-ascii characters to XML char-refs, so for example "💖" becomes: "💖" 119 | # Otherwise we'd remove emojis by mistake on narrow-unicode builds of Python 120 | html = html.encode("ascii", "xmlcharrefreplace").decode("utf-8") 121 | html = re.sub( 122 | r"&#(\d+);?", lambda c: strip_illegal_xml_characters(c.group(1), c.group(0)), html 123 | ) 124 | html = re.sub( 125 | r"&#[xX]([0-9a-fA-F]+);?", 126 | lambda c: strip_illegal_xml_characters(c.group(1), c.group(0), base=16), 127 | html, 128 | ) 129 | # A regex matching the "invalid XML character range" 130 | html = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFFFE\uFFFF]").sub("", html) 131 | return html 132 | 133 | 134 | def make_html_element(html: str, url=DEFAULT_URL) -> Element: 135 | html = remove_control_characters(html) 136 | pq_element = PyQuery(html)[0] # PyQuery is a list, so we take the first element 137 | return Element(element=pq_element, url=url) 138 | 139 | 140 | month = ( 141 | r"Jan(?:uary)?|" 142 | r"Feb(?:ruary)?|" 143 | r"Mar(?:ch)?|" 144 | r"Apr(?:il)?|" 145 | r"May|" 146 | r"Jun(?:e)?|" 147 | r"Jul(?:y)?|" 148 | r"Aug(?:ust)?|" 149 | r"Sep(?:tember)?|" 150 | r"Oct(?:ober)?|" 151 | r"Nov(?:ember)?|" 152 | r"Dec(?:ember)?" 153 | ) 154 | day_of_week = r"Mon|" r"Tue|" r"Wed|" r"Thu|" r"Fri|" r"Sat|" r"Sun" 155 | day_of_month = r"\d{1,2}" 156 | specific_date_md = f"(?:{month}) {day_of_month}" + r"(?:,? \d{4})?" 157 | specific_date_dm = f"{day_of_month} (?:{month})" + r"(?:,? \d{4})?" 158 | 159 | date = f"{specific_date_md}|{specific_date_dm}|Today|Yesterday" 160 | 161 | hour = r"\d{1,2}" 162 | minute = r"\d{2}" 163 | period = r"AM|PM|" 164 | 165 | exact_time = f"(?:{date}) at {hour}:{minute} ?(?:{period})" 166 | relative_time_years = r'\b\d{1,2} yr' 167 | relative_time_months = r'\b\d{1,2} (?:mth|mo)' 168 | relative_time_weeks = r'\b\d{1,2} wk' 169 | relative_time_hours = r"\b\d{1,2} ?h(?:rs?)?" 170 | relative_time_mins = r"\b\d{1,2} ?mins?" 171 | relative_time = f"{relative_time_years}|{relative_time_months}|{relative_time_weeks}|{relative_time_hours}|{relative_time_mins}" 172 | 173 | datetime_regex = re.compile(fr"({exact_time}|{relative_time})", re.IGNORECASE) 174 | day_of_week_regex = re.compile(fr"({day_of_week})", re.IGNORECASE) 175 | 176 | 177 | def parse_datetime(text: str, search=True) -> Optional[datetime]: 178 | """Looks for a string that looks like a date and parses it into a datetime object. 179 | 180 | Uses a regex to look for the date in the string. 181 | Uses dateparser to parse the date (not thread safe). 182 | 183 | Args: 184 | text: The text where the date should be. 
185 | search: If false, skip the regex search and try to parse the complete string. 186 | 187 | Returns: 188 | The datetime object, or None if it couldn't find a date. 189 | """ 190 | settings = { 191 | 'RELATIVE_BASE': datetime.today().replace(minute=0, hour=0, second=0, microsecond=0) 192 | } 193 | if search: 194 | time_match = datetime_regex.search(text) 195 | dow_match = day_of_week_regex.search(text) 196 | if time_match: 197 | text = time_match.group(0).replace("mth", "month") 198 | elif dow_match: 199 | text = dow_match.group(0) 200 | today = calendar.day_abbr[datetime.today().weekday()] 201 | if text == today: 202 | # Fix for dateparser misinterpreting "last Monday" as today if today is Monday 203 | return dateparser.parse(text, settings=settings) - timedelta(days=7) 204 | 205 | result = dateparser.parse(text, settings=settings) 206 | if result: 207 | return result.replace(microsecond=0) 208 | return None 209 | 210 | 211 | def html_element_to_string(element: Element, pretty=False) -> str: 212 | html = lxml.html.tostring(element.element, encoding='unicode') 213 | if pretty: 214 | html = BeautifulSoup(html, features='html.parser').prettify() 215 | return html 216 | 217 | 218 | def parse_cookie_file(filename: str) -> RequestsCookieJar: 219 | jar = RequestsCookieJar() 220 | 221 | with open(filename, mode='rt') as file: 222 | data = file.read() 223 | 224 | try: 225 | data = json.loads(data) 226 | if type(data) is list: 227 | for c in data: 228 | expires = c.get("expirationDate") or c.get("Expires raw") 229 | if expires: 230 | expires = int(expires) 231 | if "Name raw" in c: 232 | # Cookie Quick Manager JSON format 233 | host = c["Host raw"].replace("https://", "").strip("/") 234 | jar.set( 235 | c["Name raw"], 236 | c["Content raw"], 237 | domain=host, 238 | path=c["Path raw"], 239 | expires=expires, 240 | ) 241 | else: 242 | # EditThisCookie JSON format 243 | jar.set( 244 | c["name"], 245 | c["value"], 246 | domain=c["domain"], 247 | path=c["path"], 248 | secure=c["secure"], 249 | expires=expires, 250 | ) 251 | elif type(data) is dict: 252 | for k, v in data.items(): 253 | if type(v) is dict: 254 | jar.set(k, v["value"]) 255 | else: 256 | jar.set(k, v) 257 | except json.decoder.JSONDecodeError: 258 | # Netscape format 259 | for i, line in enumerate(data.splitlines()): 260 | line = line.strip() 261 | if line == "" or line.startswith('#'): 262 | continue 263 | 264 | try: 265 | domain, _, path, secure, expires, name, value = line.split('\t') 266 | except Exception as e: 267 | raise exceptions.InvalidCookies(f"Can't parse line {i + 1}: '{line}'") 268 | secure = secure.lower() == 'true' 269 | expires = None if expires == '0' else int(expires) 270 | 271 | jar.set(name, value, domain=domain, path=path, secure=secure, expires=expires) 272 | 273 | return jar 274 | 275 | 276 | def safe_consume(generator, sleep=0): 277 | result = [] 278 | try: 279 | for item in generator: 280 | result.append(item) 281 | time.sleep(sleep) 282 | except Exception as e: 283 | traceback.print_exc() 284 | logger.error(f"Exception when consuming {generator}: {type(e)}: {str(e)}") 285 | return result 286 | 287 | 288 | reaction_lookup = { 289 | '1': { 290 | 'color': '#2078f4', 291 | 'display_name': 'Like', 292 | 'is_deprecated': False, 293 | 'is_visible': True, 294 | 'name': 'like', 295 | 'type': 1, 296 | }, 297 | '10': { 298 | 'color': '#f0ba15', 299 | 'display_name': 'Confused', 300 | 'is_deprecated': True, 301 | 'is_visible': False, 302 | 'name': 'confused', 303 | 'type': 10, 304 | }, 305 | '11': { 306 | 'color': 
'#7e64c4', 307 | 'display_name': 'Thankful', 308 | 'is_deprecated': False, 309 | 'is_visible': True, 310 | 'name': 'dorothy', 311 | 'type': 11, 312 | }, 313 | '12': { 314 | 'color': '#ec7ebd', 315 | 'display_name': 'Pride', 316 | 'is_deprecated': False, 317 | 'is_visible': True, 318 | 'name': 'toto', 319 | 'type': 12, 320 | }, 321 | '13': { 322 | 'color': '#f0ba15', 323 | 'display_name': 'Selfie', 324 | 'is_deprecated': False, 325 | 'is_visible': False, 326 | 'name': 'selfie', 327 | 'type': 13, 328 | }, 329 | '14': { 330 | 'color': '#f0ba15', 331 | 'display_name': 'React', 332 | 'is_deprecated': True, 333 | 'is_visible': False, 334 | 'name': 'flame', 335 | 'type': 14, 336 | }, 337 | '15': { 338 | 'color': '#f0ba15', 339 | 'display_name': 'React', 340 | 'is_deprecated': True, 341 | 'is_visible': False, 342 | 'name': 'plane', 343 | 'type': 15, 344 | }, 345 | '16': { 346 | 'color': '#f7b125', 347 | 'display_name': 'Care', 348 | 'is_deprecated': False, 349 | 'is_visible': True, 350 | 'name': 'support', 351 | 'type': 16, 352 | }, 353 | '2': { 354 | 'color': '#f33e58', 355 | 'display_name': 'Love', 356 | 'is_deprecated': False, 357 | 'is_visible': True, 358 | 'name': 'love', 359 | 'type': 2, 360 | }, 361 | '3': { 362 | 'color': '#f7b125', 363 | 'display_name': 'Wow', 364 | 'is_deprecated': False, 365 | 'is_visible': True, 366 | 'name': 'wow', 367 | 'type': 3, 368 | }, 369 | '4': { 370 | 'color': '#f7b125', 371 | 'display_name': 'Haha', 372 | 'is_deprecated': False, 373 | 'is_visible': True, 374 | 'name': 'haha', 375 | 'type': 4, 376 | }, 377 | '5': { 378 | 'color': '#f0ba15', 379 | 'display_name': 'Yay', 380 | 'is_deprecated': True, 381 | 'is_visible': False, 382 | 'name': 'yay', 383 | 'type': 5, 384 | }, 385 | '7': { 386 | 'color': '#f7b125', 387 | 'display_name': 'Sad', 388 | 'is_deprecated': False, 389 | 'is_visible': True, 390 | 'name': 'sorry', 391 | 'type': 7, 392 | }, 393 | '8': { 394 | 'color': '#e9710f', 395 | 'display_name': 'Angry', 396 | 'is_deprecated': False, 397 | 'is_visible': True, 398 | 'name': 'anger', 399 | 'type': 8, 400 | }, 401 | '1635855486666999': { 402 | 'color': '#2078f4', 403 | 'display_name': 'Like', 404 | 'is_deprecated': False, 405 | 'is_visible': True, 406 | 'name': 'like', 407 | 'type': 1635855486666999, 408 | }, 409 | '613557422527858': { 410 | 'color': '#f7b125', 411 | 'display_name': 'Care', 412 | 'is_deprecated': False, 413 | 'is_visible': True, 414 | 'name': 'support', 415 | 'type': 613557422527858, 416 | }, 417 | '1678524932434102': { 418 | 'color': '#f33e58', 419 | 'display_name': 'Love', 420 | 'is_deprecated': False, 421 | 'is_visible': True, 422 | 'name': 'love', 423 | 'type': 1678524932434102, 424 | }, 425 | '478547315650144': { 426 | 'color': '#f7b125', 427 | 'display_name': 'Wow', 428 | 'is_deprecated': False, 429 | 'is_visible': True, 430 | 'name': 'wow', 431 | 'type': 478547315650144, 432 | }, 433 | '115940658764963': { 434 | 'color': '#f7b125', 435 | 'display_name': 'Haha', 436 | 'is_deprecated': False, 437 | 'is_visible': True, 438 | 'name': 'haha', 439 | 'type': 115940658764963, 440 | }, 441 | '908563459236466': { 442 | 'color': '#f7b125', 443 | 'display_name': 'Sad', 444 | 'is_deprecated': False, 445 | 'is_visible': True, 446 | 'name': 'sorry', 447 | 'type': 908563459236466, 448 | }, 449 | '444813342392137': { 450 | 'color': '#e9710f', 451 | 'display_name': 'Angry', 452 | 'is_deprecated': False, 453 | 'is_visible': True, 454 | 'name': 'anger', 455 | 'type': 444813342392137, 456 | }, 457 | } 458 | 459 | emoji_class_lookup = { 460 | 'sx_0ae260': 
'care', 461 | 'sx_0e815d': 'haha', 462 | 'sx_199220': 'angry', 463 | 'sx_3a00ef': 'like', 464 | 'sx_3ecf2a': 'sad', 465 | 'sx_78dbdd': 'angry', 466 | 'sx_a35dca': 'love', 467 | 'sx_c3ed6c': 'sad', 468 | 'sx_ce3068': 'haha', 469 | 'sx_d80e3a': 'wow', 470 | 'sx_d8e63d': 'care', 471 | 'sx_e303cc': 'like', 472 | 'sx_f21116': 'love', 473 | 'sx_f75acf': 'wow', 474 | 'sx_a70a0c': 'like', 475 | } 476 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "facebook-scraper" 3 | version = "0.2.60" 4 | description = "Scrape Facebook public pages without an API key" 5 | authors = ["Kevin Zúñiga "] 6 | license = "MIT" 7 | readme = "README.md" 8 | repository = "https://github.com/kevinzg/facebook-scraper" 9 | 10 | [tool.poetry.dependencies] 11 | python = "^3.6" 12 | requests-html = "^0.10.0" 13 | youtube_dl = {version = "*", optional=true} 14 | browser-cookie3 = {version = "*", optional=true} 15 | dateparser = "^1.0.0" 16 | demjson3 = "^3.0.5" 17 | 18 | [tool.poetry.dev-dependencies] 19 | ipdb = {version = "*", python = "^3.7"} 20 | ipython = {version = "*", python = "^3.7"} 21 | pytest = "^6.2.2" 22 | pytest-vcr = "^1.0.2" 23 | 24 | [tool.poetry.extras] 25 | youtube-dl = ["youtube_dl"] 26 | browser-cookie3 = ["browser-cookie3"] 27 | 28 | [tool.poetry.scripts] 29 | facebook-scraper = 'facebook_scraper.__main__:run' 30 | 31 | [tool.black] 32 | line-length = 98 33 | target-version = ["py36"] 34 | skip-string-normalization = true 35 | 36 | [build-system] 37 | requires = ["poetry-core>=1.0.0"] 38 | build-backend = "poetry.core.masonry.api" 39 | 40 | [tool.pytest.ini_options] 41 | filterwarnings = [ 42 | "ignore::UserWarning", 43 | ] -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | appdirs==1.4.4; python_version >= "3.5" and python_full_version >= "3.6.0" \ 2 | --hash=sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128 \ 3 | --hash=sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41 4 | appnope==0.1.2; python_version >= "3.7" and python_version < "4.0" and sys_platform == "darwin" \ 5 | --hash=sha256:93aa393e9d6c54c5cd570ccadd8edad61ea0c4b9ea7a01409020c9aa019eb442 \ 6 | --hash=sha256:dd83cd4b5b460958838f6eb3000c660b1f9caf2a5b1de4264e941512f603258a 7 | atomicwrites==1.4.0; python_version >= "3.6" and python_full_version < "3.0.0" and sys_platform == "win32" or sys_platform == "win32" and python_version >= "3.6" and python_full_version >= "3.4.0" \ 8 | --hash=sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197 \ 9 | --hash=sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a 10 | attrs==20.3.0; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.6" \ 11 | --hash=sha256:31b2eced602aa8423c2aea9c76a724617ed67cf9513173fd3a4f03e3a929c7e6 \ 12 | --hash=sha256:832aa3cde19744e49938b91fea06d69ecb9e649c93ba974535d08ad92164f700 13 | backcall==0.2.0; python_version >= "3.7" and python_version < "4.0" \ 14 | --hash=sha256:fbbce6a29f263178a1f7915c1940bde0ec2b2a967566fe1c65c1dfb7422bd255 \ 15 | --hash=sha256:5cbdbf27be5e7cfadb448baf0aa95508f91f2bbc6c6437cd9cd06e2a4c215e1e 16 | beautifulsoup4==4.9.3; python_full_version >= "3.6.0" \ 17 | 
--hash=sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35 \ 18 | --hash=sha256:fff47e031e34ec82bf17e00da8f592fe7de69aeea38be00523c04623c04fb666 \ 19 | --hash=sha256:84729e322ad1d5b4d25f805bfa05b902dd96450f43842c4e99067d5e1369eb25 20 | bs4==0.0.1; python_full_version >= "3.6.0" \ 21 | --hash=sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a 22 | certifi==2020.12.5; python_full_version >= "3.6.0" \ 23 | --hash=sha256:719a74fb9e33b9bd44cc7f3a8d94bc35e4049deebe19ba7d8e108280cfd59830 \ 24 | --hash=sha256:1a4995114262bffbc2413b159f2a1a480c969de6e6eb13ee966d470af86af59c 25 | chardet==4.0.0; python_full_version >= "3.6.0" \ 26 | --hash=sha256:f864054d66fd9118f2e67044ac8981a54775ec5b67aed0441892edb553d21da5 \ 27 | --hash=sha256:0d6f53a15db4120f2b08c94f11e7d93d2c911ee118b6b30a04ec3ee8310179fa 28 | colorama==0.4.4; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" and sys_platform == "win32" or python_version >= "3.7" and python_version < "4.0" and sys_platform == "win32" and python_full_version >= "3.5.0" \ 29 | --hash=sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2 \ 30 | --hash=sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b 31 | cssselect==1.1.0; python_full_version >= "3.6.0" \ 32 | --hash=sha256:f612ee47b749c877ebae5bb77035d8f4202c6ad0f0fc1271b3c18ad6c4468ecf \ 33 | --hash=sha256:f95f8dedd925fd8f54edb3d2dfb44c190d9d18512377d3c1e2388d16126879bc 34 | dateparser==1.0.0; python_version >= "3.5" \ 35 | --hash=sha256:17202df32c7a36e773136ff353aa3767e987f8b3e27374c39fd21a30a803d6f8 \ 36 | --hash=sha256:159cc4e01a593706a15cd4e269a0b3345edf3aef8bf9278a57dac8adf5bf1e4a 37 | decorator==4.4.2; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.2.0" \ 38 | --hash=sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760 \ 39 | --hash=sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7 40 | fake-useragent==0.1.11; python_full_version >= "3.6.0" \ 41 | --hash=sha256:c104998b750eb097eefc28ae28e92d66397598d2cf41a31aa45d5559ef1adf35 42 | idna==2.10; python_full_version >= "3.6.0" and python_version >= "3.6" \ 43 | --hash=sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0 \ 44 | --hash=sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6 45 | importlib-metadata==3.7.3; python_version < "3.8" and python_version >= "3.6" and (python_version >= "3.6" and python_full_version < "3.0.0" and python_version < "3.8" or python_full_version >= "3.4.0" and python_version >= "3.6" and python_version < "3.8") \ 46 | --hash=sha256:b74159469b464a99cb8cc3e21973e4d96e05d3024d337313fedb618a6e86e6f4 \ 47 | --hash=sha256:742add720a20d0467df2f444ae41704000f50e1234f46174b51f9c6031a1bd71 48 | iniconfig==1.1.1; python_version >= "3.6" \ 49 | --hash=sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3 \ 50 | --hash=sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32 51 | ipdb==0.13.7; python_version >= "3.7" and python_version < "4.0" \ 52 | --hash=sha256:178c367a61c1039e44e17c56fcc4a6e7dc11b33561261382d419b6ddb4401810 53 | ipython-genutils==0.2.0; python_version >= "3.7" and python_version < "4.0" \ 54 | --hash=sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8 \ 55 | --hash=sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8 56 | ipython==7.21.0; 
python_version >= "3.7" and python_version < "4.0" \ 57 | --hash=sha256:34207ffb2f653bced2bc8e3756c1db86e7d93e44ed049daae9814fed66d408ec \ 58 | --hash=sha256:04323f72d5b85b606330b6d7e2dc8d2683ad46c3905e955aa96ecc7a99388e70 59 | jedi==0.18.0; python_version >= "3.7" and python_version < "4.0" \ 60 | --hash=sha256:18456d83f65f400ab0c2d3319e48520420ef43b23a086fdc05dff34132f0fb93 \ 61 | --hash=sha256:92550a404bad8afed881a137ec9a461fed49eca661414be45059329614ed0707 62 | lxml==4.9.1; python_full_version >= "3.6.0" \ 63 | --hash=sha256:04da965dfebb5dac2619cb90fcf93efdb35b3c6994fea58a157a834f2f94b318 \ 64 | --hash=sha256:0538747a9d7827ce3e16a8fdd201a99e661c7dee3c96c885d8ecba3c35d1032c \ 65 | --hash=sha256:0645e934e940107e2fdbe7c5b6fb8ec6232444260752598bc4d09511bd056c0b \ 66 | --hash=sha256:079b68f197c796e42aa80b1f739f058dcee796dc725cc9a1be0cdb08fc45b000 \ 67 | --hash=sha256:0f3f0059891d3254c7b5fb935330d6db38d6519ecd238ca4fce93c234b4a0f73 \ 68 | --hash=sha256:10d2017f9150248563bb579cd0d07c61c58da85c922b780060dcc9a3aa9f432d \ 69 | --hash=sha256:1355755b62c28950f9ce123c7a41460ed9743c699905cbe664a5bcc5c9c7c7fb \ 70 | --hash=sha256:13c90064b224e10c14dcdf8086688d3f0e612db53766e7478d7754703295c7c8 \ 71 | --hash=sha256:1423631e3d51008871299525b541413c9b6c6423593e89f9c4cfbe8460afc0a2 \ 72 | --hash=sha256:1436cf0063bba7888e43f1ba8d58824f085410ea2025befe81150aceb123e345 \ 73 | --hash=sha256:1a7c59c6ffd6ef5db362b798f350e24ab2cfa5700d53ac6681918f314a4d3b94 \ 74 | --hash=sha256:1e1cf47774373777936c5aabad489fef7b1c087dcd1f426b621fda9dcc12994e \ 75 | --hash=sha256:206a51077773c6c5d2ce1991327cda719063a47adc02bd703c56a662cdb6c58b \ 76 | --hash=sha256:21fb3d24ab430fc538a96e9fbb9b150029914805d551deeac7d7822f64631dfc \ 77 | --hash=sha256:27e590352c76156f50f538dbcebd1925317a0f70540f7dc8c97d2931c595783a \ 78 | --hash=sha256:287605bede6bd36e930577c5925fcea17cb30453d96a7b4c63c14a257118dbb9 \ 79 | --hash=sha256:2aaf6a0a6465d39b5ca69688fce82d20088c1838534982996ec46633dc7ad6cc \ 80 | --hash=sha256:32a73c53783becdb7eaf75a2a1525ea8e49379fb7248c3eeefb9412123536387 \ 81 | --hash=sha256:41fb58868b816c202e8881fd0f179a4644ce6e7cbbb248ef0283a34b73ec73bb \ 82 | --hash=sha256:4780677767dd52b99f0af1f123bc2c22873d30b474aa0e2fc3fe5e02217687c7 \ 83 | --hash=sha256:4878e667ebabe9b65e785ac8da4d48886fe81193a84bbe49f12acff8f7a383a4 \ 84 | --hash=sha256:487c8e61d7acc50b8be82bda8c8d21d20e133c3cbf41bd8ad7eb1aaeb3f07c97 \ 85 | --hash=sha256:4beea0f31491bc086991b97517b9683e5cfb369205dac0148ef685ac12a20a67 \ 86 | --hash=sha256:4cfbe42c686f33944e12f45a27d25a492cc0e43e1dc1da5d6a87cbcaf2e95627 \ 87 | --hash=sha256:4d5bae0a37af799207140652a700f21a85946f107a199bcb06720b13a4f1f0b7 \ 88 | --hash=sha256:4e285b5f2bf321fc0857b491b5028c5f276ec0c873b985d58d7748ece1d770dd \ 89 | --hash=sha256:57e4d637258703d14171b54203fd6822fda218c6c2658a7d30816b10995f29f3 \ 90 | --hash=sha256:5974895115737a74a00b321e339b9c3f45c20275d226398ae79ac008d908bff7 \ 91 | --hash=sha256:5ef87fca280fb15342726bd5f980f6faf8b84a5287fcc2d4962ea8af88b35130 \ 92 | --hash=sha256:603a464c2e67d8a546ddaa206d98e3246e5db05594b97db844c2f0a1af37cf5b \ 93 | --hash=sha256:6653071f4f9bac46fbc30f3c7838b0e9063ee335908c5d61fb7a4a86c8fd2036 \ 94 | --hash=sha256:6ca2264f341dd81e41f3fffecec6e446aa2121e0b8d026fb5130e02de1402785 \ 95 | --hash=sha256:6d279033bf614953c3fc4a0aa9ac33a21e8044ca72d4fa8b9273fe75359d5cca \ 96 | --hash=sha256:6d949f53ad4fc7cf02c44d6678e7ff05ec5f5552b235b9e136bd52e9bf730b91 \ 97 | --hash=sha256:6daa662aba22ef3258934105be2dd9afa5bb45748f4f702a3b39a5bf53a1f4dc \ 98 | 
--hash=sha256:6eafc048ea3f1b3c136c71a86db393be36b5b3d9c87b1c25204e7d397cee9536 \ 99 | --hash=sha256:830c88747dce8a3e7525defa68afd742b4580df6aa2fdd6f0855481e3994d391 \ 100 | --hash=sha256:86e92728ef3fc842c50a5cb1d5ba2bc66db7da08a7af53fb3da79e202d1b2cd3 \ 101 | --hash=sha256:8caf4d16b31961e964c62194ea3e26a0e9561cdf72eecb1781458b67ec83423d \ 102 | --hash=sha256:8d1a92d8e90b286d491e5626af53afef2ba04da33e82e30744795c71880eaa21 \ 103 | --hash=sha256:8f0a4d179c9a941eb80c3a63cdb495e539e064f8054230844dcf2fcb812b71d3 \ 104 | --hash=sha256:9232b09f5efee6a495a99ae6824881940d6447debe272ea400c02e3b68aad85d \ 105 | --hash=sha256:927a9dd016d6033bc12e0bf5dee1dde140235fc8d0d51099353c76081c03dc29 \ 106 | --hash=sha256:93e414e3206779ef41e5ff2448067213febf260ba747fc65389a3ddaa3fb8715 \ 107 | --hash=sha256:98cafc618614d72b02185ac583c6f7796202062c41d2eeecdf07820bad3295ed \ 108 | --hash=sha256:9c3a88d20e4fe4a2a4a84bf439a5ac9c9aba400b85244c63a1ab7088f85d9d25 \ 109 | --hash=sha256:9f36de4cd0c262dd9927886cc2305aa3f2210db437aa4fed3fb4940b8bf4592c \ 110 | --hash=sha256:a60f90bba4c37962cbf210f0188ecca87daafdf60271f4c6948606e4dabf8785 \ 111 | --hash=sha256:a614e4afed58c14254e67862456d212c4dcceebab2eaa44d627c2ca04bf86837 \ 112 | --hash=sha256:ae06c1e4bc60ee076292e582a7512f304abdf6c70db59b56745cca1684f875a4 \ 113 | --hash=sha256:b122a188cd292c4d2fcd78d04f863b789ef43aa129b233d7c9004de08693728b \ 114 | --hash=sha256:b570da8cd0012f4af9fa76a5635cd31f707473e65a5a335b186069d5c7121ff2 \ 115 | --hash=sha256:bcaa1c495ce623966d9fc8a187da80082334236a2a1c7e141763ffaf7a405067 \ 116 | --hash=sha256:bd34f6d1810d9354dc7e35158aa6cc33456be7706df4420819af6ed966e85448 \ 117 | --hash=sha256:be9eb06489bc975c38706902cbc6888f39e946b81383abc2838d186f0e8b6a9d \ 118 | --hash=sha256:c4b2e0559b68455c085fb0f6178e9752c4be3bba104d6e881eb5573b399d1eb2 \ 119 | --hash=sha256:c62e8dd9754b7debda0c5ba59d34509c4688f853588d75b53c3791983faa96fc \ 120 | --hash=sha256:c852b1530083a620cb0de5f3cd6826f19862bafeaf77586f1aef326e49d95f0c \ 121 | --hash=sha256:d9fc0bf3ff86c17348dfc5d322f627d78273eba545db865c3cd14b3f19e57fa5 \ 122 | --hash=sha256:dad7b164905d3e534883281c050180afcf1e230c3d4a54e8038aa5cfcf312b84 \ 123 | --hash=sha256:e5f66bdf0976ec667fc4594d2812a00b07ed14d1b44259d19a41ae3fff99f2b8 \ 124 | --hash=sha256:e8f0c9d65da595cfe91713bc1222af9ecabd37971762cb830dea2fc3b3bb2acf \ 125 | --hash=sha256:edffbe3c510d8f4bf8640e02ca019e48a9b72357318383ca60e3330c23aaffc7 \ 126 | --hash=sha256:eea5d6443b093e1545ad0210e6cf27f920482bfcf5c77cdc8596aec73523bb7e \ 127 | --hash=sha256:ef72013e20dd5ba86a8ae1aed7f56f31d3374189aa8b433e7b12ad182c0d2dfb \ 128 | --hash=sha256:f05251bbc2145349b8d0b77c0d4e5f3b228418807b1ee27cefb11f69ed3d233b \ 129 | --hash=sha256:f1be258c4d3dc609e654a1dc59d37b17d7fef05df912c01fc2e15eb43a9735f3 \ 130 | --hash=sha256:f9ced82717c7ec65a67667bb05865ffe38af0e835cdd78728f1209c8fffe0cad \ 131 | --hash=sha256:fe17d10b97fdf58155f858606bddb4e037b805a60ae023c009f760d8361a4eb8 \ 132 | --hash=sha256:fe749b052bb7233fe5d072fcb549221a8cb1a16725c47c37e42b0b9cb3ff2c3f 133 | multidict==5.1.0; python_version >= "3.6" \ 134 | --hash=sha256:b7993704f1a4b204e71debe6095150d43b2ee6150fa4f44d6d966ec356a8d61f \ 135 | --hash=sha256:9dd6e9b1a913d096ac95d0399bd737e00f2af1e1594a787e00f7975778c8b2bf \ 136 | --hash=sha256:f21756997ad8ef815d8ef3d34edd98804ab5ea337feedcd62fb52d22bf531281 \ 137 | --hash=sha256:1ab820665e67373de5802acae069a6a05567ae234ddb129f31d290fc3d1aa56d \ 138 | --hash=sha256:9436dc58c123f07b230383083855593550c4d301d2532045a17ccf6eca505f6d \ 139 | 
--hash=sha256:830f57206cc96ed0ccf68304141fec9481a096c4d2e2831f311bde1c404401da \ 140 | --hash=sha256:2e68965192c4ea61fff1b81c14ff712fc7dc15d2bd120602e4a3494ea6584224 \ 141 | --hash=sha256:2f1a132f1c88724674271d636e6b7351477c27722f2ed789f719f9e3545a3d26 \ 142 | --hash=sha256:3a4f32116f8f72ecf2a29dabfb27b23ab7cdc0ba807e8459e59a93a9be9506f6 \ 143 | --hash=sha256:46c73e09ad374a6d876c599f2328161bcd95e280f84d2060cf57991dec5cfe76 \ 144 | --hash=sha256:018132dbd8688c7a69ad89c4a3f39ea2f9f33302ebe567a879da8f4ca73f0d0a \ 145 | --hash=sha256:4b186eb7d6ae7c06eb4392411189469e6a820da81447f46c0072a41c748ab73f \ 146 | --hash=sha256:3a041b76d13706b7fff23b9fc83117c7b8fe8d5fe9e6be45eee72b9baa75f348 \ 147 | --hash=sha256:051012ccee979b2b06be928a6150d237aec75dd6bf2d1eeeb190baf2b05abc93 \ 148 | --hash=sha256:6a4d5ce640e37b0efcc8441caeea8f43a06addace2335bd11151bc02d2ee31f9 \ 149 | --hash=sha256:5cf3443199b83ed9e955f511b5b241fd3ae004e3cb81c58ec10f4fe47c7dce37 \ 150 | --hash=sha256:f200755768dc19c6f4e2b672421e0ebb3dd54c38d5a4f262b872d8cfcc9e93b5 \ 151 | --hash=sha256:05c20b68e512166fddba59a918773ba002fdd77800cad9f55b59790030bab632 \ 152 | --hash=sha256:54fd1e83a184e19c598d5e70ba508196fd0bbdd676ce159feb412a4a6664f952 \ 153 | --hash=sha256:0e3c84e6c67eba89c2dbcee08504ba8644ab4284863452450520dad8f1e89b79 \ 154 | --hash=sha256:dc862056f76443a0db4509116c5cd480fe1b6a2d45512a653f9a855cc0517456 \ 155 | --hash=sha256:0e929169f9c090dae0646a011c8b058e5e5fb391466016b39d21745b48817fd7 \ 156 | --hash=sha256:d81eddcb12d608cc08081fa88d046c78afb1bf8107e6feab5d43503fea74a635 \ 157 | --hash=sha256:585fd452dd7782130d112f7ddf3473ffdd521414674c33876187e101b588738a \ 158 | --hash=sha256:37e5438e1c78931df5d3c0c78ae049092877e5e9c02dd1ff5abb9cf27a5914ea \ 159 | --hash=sha256:07b42215124aedecc6083f1ce6b7e5ec5b50047afa701f3442054373a6deb656 \ 160 | --hash=sha256:929006d3c2d923788ba153ad0de8ed2e5ed39fdbe8e7be21e2f22ed06c6783d3 \ 161 | --hash=sha256:b797515be8743b771aa868f83563f789bbd4b236659ba52243b735d80b29ed93 \ 162 | --hash=sha256:d5c65bdf4484872c4af3150aeebe101ba560dcfb34488d9a8ff8dbcd21079647 \ 163 | --hash=sha256:b47a43177a5e65b771b80db71e7be76c0ba23cc8aa73eeeb089ed5219cdbe27d \ 164 | --hash=sha256:806068d4f86cb06af37cd65821554f98240a19ce646d3cd24e1c33587f313eb8 \ 165 | --hash=sha256:46dd362c2f045095c920162e9307de5ffd0a1bfbba0a6e990b344366f55a30c1 \ 166 | --hash=sha256:ace010325c787c378afd7f7c1ac66b26313b3344628652eacd149bdd23c68841 \ 167 | --hash=sha256:ecc771ab628ea281517e24fd2c52e8f31c41e66652d07599ad8818abaad38cda \ 168 | --hash=sha256:fc13a9524bc18b6fb6e0dbec3533ba0496bbed167c56d0aabefd965584557d80 \ 169 | --hash=sha256:7df80d07818b385f3129180369079bd6934cf70469f99daaebfac89dca288359 \ 170 | --hash=sha256:25b4e5f22d3a37ddf3effc0710ba692cfc792c2b9edfb9c05aefe823256e84d5 171 | packaging==20.9; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.6" \ 172 | --hash=sha256:67714da7f7bc052e064859c05c595155bd1ee9f69f76557e21f051443c20947a \ 173 | --hash=sha256:5b327ac1320dc863dca72f4514ecc086f31186744b84a230374cc1fd776feae5 174 | parse==1.19.0; python_full_version >= "3.6.0" \ 175 | --hash=sha256:9ff82852bcb65d139813e2a5197627a94966245c897796760a3a2a8eb66f020b 176 | parso==0.8.1; python_version >= "3.7" and python_version < "4.0" \ 177 | --hash=sha256:15b00182f472319383252c18d5913b69269590616c947747bc50bf4ac768f410 \ 178 | --hash=sha256:8519430ad07087d4c997fda3a7918f7cfa27cb58972a8c89c2a0295a1c940e9e 179 | pexpect==4.8.0; python_version >= "3.7" and python_version < "4.0" and 
sys_platform != "win32" \ 180 | --hash=sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937 \ 181 | --hash=sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c 182 | pickleshare==0.7.5; python_version >= "3.7" and python_version < "4.0" \ 183 | --hash=sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56 \ 184 | --hash=sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca 185 | pluggy==0.13.1; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.6" \ 186 | --hash=sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d \ 187 | --hash=sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0 188 | prompt-toolkit==3.0.17; python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.6.1" \ 189 | --hash=sha256:4cea7d09e46723885cb8bc54678175453e5071e9449821dce6f017b1d1fbfc1a \ 190 | --hash=sha256:9397a7162cf45449147ad6042fa37983a081b8a73363a5253dd4072666333137 191 | ptyprocess==0.7.0; python_version >= "3.7" and python_version < "4.0" and sys_platform != "win32" \ 192 | --hash=sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35 \ 193 | --hash=sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220 194 | py==1.10.0; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.6" \ 195 | --hash=sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a \ 196 | --hash=sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3 197 | pyee==8.1.0; python_version >= "3.5" and python_full_version >= "3.6.0" \ 198 | --hash=sha256:383973b63ad7ed5e3c0311f8b179c52981f9e7b3eaea0e9a830d13ec34dde65f \ 199 | --hash=sha256:92dacc5bd2bdb8f95aa8dd2585d47ca1c4840e2adb95ccf90034d64f725bfd31 200 | pygments==2.8.1; python_version >= "3.7" and python_version < "4.0" \ 201 | --hash=sha256:534ef71d539ae97d4c3a4cf7d6f110f214b0e687e92f9cb9d2a3b0d3101289c8 \ 202 | --hash=sha256:2656e1a6edcdabf4275f9a3640db59fd5de107d88e8663c5d4e9a0fa62f77f94 203 | pyparsing==2.4.7; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.6" \ 204 | --hash=sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b \ 205 | --hash=sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1 206 | pyppeteer==0.0.25; python_version >= "3.5" and python_full_version >= "3.6.0" \ 207 | --hash=sha256:51fe769b722a1718043b74d12c20420f29e0dd9eeea2b66652b7f93a9ad465dd 208 | pyquery==1.4.3; python_full_version >= "3.6.0" \ 209 | --hash=sha256:1fc33b7699455ed25c75282bc8f80ace1ac078b0dda5a933dacbd8b1c1f83963 \ 210 | --hash=sha256:a388eefb6bc4a55350de0316fbd97cda999ae669b6743ae5b99102ba54f5aa72 211 | pytest-vcr==1.0.2 \ 212 | --hash=sha256:23ee51b75abbcc43d926272773aae4f39f93aceb75ed56852d0bf618f92e1896 \ 213 | --hash=sha256:2f316e0539399bea0296e8b8401145c62b6f85e9066af7e57b6151481b0d6d9c 214 | pytest==6.2.2; python_version >= "3.6" \ 215 | --hash=sha256:b574b57423e818210672e07ca1fa90aaf194a4f63f3ab909a2c67ebb22913839 \ 216 | --hash=sha256:9d1edf9e7d0b84d72ea3dbcdfd22b35fb543a5e8f2a60092dd578936bf63d7f9 217 | python-dateutil==2.8.1; python_version >= "3.5" and python_full_version < "3.0.0" or python_full_version >= "3.3.0" and python_version >= "3.5" \ 218 | --hash=sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c \ 219 | 
--hash=sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a 220 | pytz==2021.1; python_version >= "3.5" \ 221 | --hash=sha256:eb10ce3e7736052ed3623d49975ce333bcd712c7bb19a58b9e2089d4057d0798 \ 222 | --hash=sha256:83a4a90894bf38e243cf052c8b58f381bfe9a7a483f6a9cab140bc7f702ac4da 223 | pyyaml==5.4.1; python_version >= "3.5" and python_full_version < "3.0.0" or python_full_version >= "3.6.0" and python_version >= "3.5" \ 224 | --hash=sha256:3b2b1824fe7112845700f815ff6a489360226a5609b96ec2190a45e62a9fc922 \ 225 | --hash=sha256:129def1b7c1bf22faffd67b8f3724645203b79d8f4cc81f674654d9902cb4393 \ 226 | --hash=sha256:4465124ef1b18d9ace298060f4eccc64b0850899ac4ac53294547536533800c8 \ 227 | --hash=sha256:bb4191dfc9306777bc594117aee052446b3fa88737cd13b7188d0e7aa8162185 \ 228 | --hash=sha256:6c78645d400265a062508ae399b60b8c167bf003db364ecb26dcab2bda048253 \ 229 | --hash=sha256:4e0583d24c881e14342eaf4ec5fbc97f934b999a6828693a99157fde912540cc \ 230 | --hash=sha256:3bd0e463264cf257d1ffd2e40223b197271046d09dadf73a0fe82b9c1fc385a5 \ 231 | --hash=sha256:e4fac90784481d221a8e4b1162afa7c47ed953be40d31ab4629ae917510051df \ 232 | --hash=sha256:5accb17103e43963b80e6f837831f38d314a0495500067cb25afab2e8d7a4018 \ 233 | --hash=sha256:e1d4970ea66be07ae37a3c2e48b5ec63f7ba6804bdddfdbd3cfd954d25a82e63 \ 234 | --hash=sha256:dd5de0646207f053eb0d6c74ae45ba98c3395a571a2891858e87df7c9b9bd51b \ 235 | --hash=sha256:08682f6b72c722394747bddaf0aa62277e02557c0fd1c42cb853016a38f8dedf \ 236 | --hash=sha256:d2d9808ea7b4af864f35ea216be506ecec180628aced0704e34aca0b040ffe46 \ 237 | --hash=sha256:8c1be557ee92a20f184922c7b6424e8ab6691788e6d86137c5d93c1a6ec1b8fb \ 238 | --hash=sha256:fa5ae20527d8e831e8230cbffd9f8fe952815b2b7dae6ffec25318803a7528fc \ 239 | --hash=sha256:0f5f5786c0e09baddcd8b4b45f20a7b5d61a7e7e99846e3c799b05c7c53fa696 \ 240 | --hash=sha256:294db365efa064d00b8d1ef65d8ea2c3426ac366c0c4368d930bf1c5fb497f77 \ 241 | --hash=sha256:74c1485f7707cf707a7aef42ef6322b8f97921bd89be2ab6317fd782c2d53183 \ 242 | --hash=sha256:49d4cdd9065b9b6e206d0595fee27a96b5dd22618e7520c33204a4a3239d5b10 \ 243 | --hash=sha256:c20cfa2d49991c8b4147af39859b167664f2ad4561704ee74c1de03318e898db \ 244 | --hash=sha256:607774cbba28732bfa802b54baa7484215f530991055bb562efbed5b2f20a45e 245 | regex==2021.3.17; python_version >= "3.5" \ 246 | --hash=sha256:b97ec5d299c10d96617cc851b2e0f81ba5d9d6248413cd374ef7f3a8871ee4a6 \ 247 | --hash=sha256:cb4ee827857a5ad9b8ae34d3c8cc51151cb4a3fe082c12ec20ec73e63cc7c6f0 \ 248 | --hash=sha256:633497504e2a485a70a3268d4fc403fe3063a50a50eed1039083e9471ad0101c \ 249 | --hash=sha256:a59a2ee329b3de764b21495d78c92ab00b4ea79acef0f7ae8c1067f773570afa \ 250 | --hash=sha256:f85d6f41e34f6a2d1607e312820971872944f1661a73d33e1e82d35ea3305e14 \ 251 | --hash=sha256:4651f839dbde0816798e698626af6a2469eee6d9964824bb5386091255a1694f \ 252 | --hash=sha256:39c44532d0e4f1639a89e52355b949573e1e2c5116106a395642cbbae0ff9bcd \ 253 | --hash=sha256:3d9a7e215e02bd7646a91fb8bcba30bc55fd42a719d6b35cf80e5bae31d9134e \ 254 | --hash=sha256:159fac1a4731409c830d32913f13f68346d6b8e39650ed5d704a9ce2f9ef9cb3 \ 255 | --hash=sha256:13f50969028e81765ed2a1c5fcfdc246c245cf8d47986d5172e82ab1a0c42ee5 \ 256 | --hash=sha256:b9d8d286c53fe0cbc6d20bf3d583cabcd1499d89034524e3b94c93a5ab85ca90 \ 257 | --hash=sha256:201e2619a77b21a7780580ab7b5ce43835e242d3e20fef50f66a8df0542e437f \ 258 | --hash=sha256:d47d359545b0ccad29d572ecd52c9da945de7cd6cf9c0cfcb0269f76d3555689 \ 259 | --hash=sha256:ea2f41445852c660ba7c3ebf7d70b3779b20d9ca8ba54485a17740db49f46932 \ 260 | 
--hash=sha256:486a5f8e11e1f5bbfcad87f7c7745eb14796642323e7e1829a331f87a713daaa \ 261 | --hash=sha256:18e25e0afe1cf0f62781a150c1454b2113785401ba285c745acf10c8ca8917df \ 262 | --hash=sha256:a2ee026f4156789df8644d23ef423e6194fad0bc53575534101bb1de5d67e8ce \ 263 | --hash=sha256:4c0788010a93ace8a174d73e7c6c9d3e6e3b7ad99a453c8ee8c975ddd9965643 \ 264 | --hash=sha256:575a832e09d237ae5fedb825a7a5bc6a116090dd57d6417d4f3b75121c73e3be \ 265 | --hash=sha256:8e65e3e4c6feadf6770e2ad89ad3deb524bcb03d8dc679f381d0568c024e0deb \ 266 | --hash=sha256:a0df9a0ad2aad49ea3c7f65edd2ffb3d5c59589b85992a6006354f6fb109bb18 \ 267 | --hash=sha256:b98bc9db003f1079caf07b610377ed1ac2e2c11acc2bea4892e28cc5b509d8d5 \ 268 | --hash=sha256:808404898e9a765e4058bf3d7607d0629000e0a14a6782ccbb089296b76fa8fe \ 269 | --hash=sha256:5770a51180d85ea468234bc7987f5597803a4c3d7463e7323322fe4a1b181578 \ 270 | --hash=sha256:976a54d44fd043d958a69b18705a910a8376196c6b6ee5f2596ffc11bff4420d \ 271 | --hash=sha256:63f3ca8451e5ff7133ffbec9eda641aeab2001be1a01878990f6c87e3c44b9d5 \ 272 | --hash=sha256:bcd945175c29a672f13fce13a11893556cd440e37c1b643d6eeab1988c8b209c \ 273 | --hash=sha256:3d9356add82cff75413bec360c1eca3e58db4a9f5dafa1f19650958a81e3249d \ 274 | --hash=sha256:f5d0c921c99297354cecc5a416ee4280bd3f20fd81b9fb671ca6be71499c3fdf \ 275 | --hash=sha256:14de88eda0976020528efc92d0a1f8830e2fb0de2ae6005a6fc4e062553031fa \ 276 | --hash=sha256:4c2e364491406b7888c2ad4428245fc56c327e34a5dfe58fd40df272b3c3dab3 \ 277 | --hash=sha256:8bd4f91f3fb1c9b1380d6894bd5b4a519409135bec14c0c80151e58394a4e88a \ 278 | --hash=sha256:882f53afe31ef0425b405a3f601c0009b44206ea7f55ee1c606aad3cc213a52c \ 279 | --hash=sha256:07ef35301b4484bce843831e7039a84e19d8d33b3f8b2f9aab86c376813d0139 \ 280 | --hash=sha256:360a01b5fa2ad35b3113ae0c07fb544ad180603fa3b1f074f52d98c1096fa15e \ 281 | --hash=sha256:709f65bb2fa9825f09892617d01246002097f8f9b6dde8d1bb4083cf554701ba \ 282 | --hash=sha256:c66221e947d7207457f8b6f42b12f613b09efa9669f65a587a2a71f6a0e4d106 \ 283 | --hash=sha256:c782da0e45aff131f0bed6e66fbcfa589ff2862fc719b83a88640daa01a5aff7 \ 284 | --hash=sha256:dc9963aacb7da5177e40874585d7407c0f93fb9d7518ec58b86e562f633f36cd \ 285 | --hash=sha256:a0d04128e005142260de3733591ddf476e4902c0c23c1af237d9acf3c96e1b38 \ 286 | --hash=sha256:4b8a1fb724904139149a43e172850f35aa6ea97fb0545244dc0b805e0154ed68 287 | requests-html==0.10.0; python_full_version >= "3.6.0" \ 288 | --hash=sha256:7e929ecfed95fb1d0994bb368295d6d7c4d06b03fcb900c33d7d0b17e6003947 \ 289 | --hash=sha256:cb8a78cf829c4eca9d6233f28524f65dd2bfaafb4bdbbc407f0a0b8f487df6e2 290 | requests==2.25.1; python_full_version >= "3.6.0" \ 291 | --hash=sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e \ 292 | --hash=sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804 293 | six==1.15.0; python_full_version >= "3.6.0" and python_version >= "3.5" and (python_version >= "3.5" and python_full_version < "3.0.0" or python_full_version >= "3.3.0" and python_version >= "3.5") \ 294 | --hash=sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced \ 295 | --hash=sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259 296 | soupsieve==2.2.1; python_version >= "3.6" and python_full_version >= "3.6.0" \ 297 | --hash=sha256:c2c1c2d44f158cdbddab7824a9af8c4f83c76b1e23e049479aa432feb6c4c23b \ 298 | --hash=sha256:052774848f448cf19c7e959adf5566904d525f33a3f8b6ba6f6f8f26ec7de0cc 299 | toml==0.10.2; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.3.0" and 
python_version >= "3.6" \ 300 | --hash=sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b \ 301 | --hash=sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f 302 | tqdm==4.59.0; python_version >= "3.5" and python_full_version >= "3.6.0" \ 303 | --hash=sha256:9fdf349068d047d4cfbe24862c425883af1db29bcddf4b0eeb2524f6fbdb23c7 \ 304 | --hash=sha256:d666ae29164da3e517fcf125e41d4fe96e5bb375cd87ff9763f6b38b5592fe33 305 | traitlets==5.0.5; python_version >= "3.7" and python_version < "4.0" \ 306 | --hash=sha256:69ff3f9d5351f31a7ad80443c2674b7099df13cc41fc5fa6e2f6d3b0330b0426 \ 307 | --hash=sha256:178f4ce988f69189f7e523337a3e11d91c786ded9360174a3d9ca83e79bc5396 308 | typing-extensions==3.7.4.3; python_version < "3.8" and python_version >= "3.6" \ 309 | --hash=sha256:dafc7639cde7f1b6e1acc0f457842a83e722ccca8eef5270af2d74792619a89f \ 310 | --hash=sha256:7cb407020f00f7bfc3cb3e7881628838e69d8f3fcab2f64742a5e76b2f841918 \ 311 | --hash=sha256:99d4073b617d30288f569d3f13d2bd7548c3a7e4c8de87db09a9d29bb3a4a60c 312 | tzlocal==2.1; python_version >= "3.5" \ 313 | --hash=sha256:e2cb6c6b5b604af38597403e9852872d7f534962ae2954c7f35efcb1ccacf4a4 \ 314 | --hash=sha256:643c97c5294aedc737780a49d9df30889321cbe1204eac2c2ec6134035a92e44 315 | urllib3==1.26.4; python_full_version >= "3.6.0" and python_version < "4" and python_version >= "3.5" \ 316 | --hash=sha256:2f4da4594db7e1e110a944bb1b551fdf4e6c136ad42e4234131391e21eb5b0df \ 317 | --hash=sha256:e7b021f7241115872f92f43c6508082facffbd1c048e3c6e2bb9c2a157e28937 318 | vcrpy==4.1.1; python_version >= "3.5" \ 319 | --hash=sha256:12c3fcdae7b88ecf11fc0d3e6d77586549d4575a2ceee18e82eee75c1f626162 \ 320 | --hash=sha256:57095bf22fc0a2d99ee9674cdafebed0f3ba763018582450706f7d3a74fff599 321 | w3lib==1.22.0; python_full_version >= "3.6.0" \ 322 | --hash=sha256:0161d55537063e00d95a241663ede3395c4c6d7b777972ba2fd58bbab2001e53 \ 323 | --hash=sha256:0ad6d0203157d61149fd45aaed2e24f53902989c32fc1dccc2e2bfba371560df 324 | wcwidth==0.2.5; python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.6.1" \ 325 | --hash=sha256:beb4802a9cebb9144e99086eff703a642a13d6a0052920003a230f3294bbe784 \ 326 | --hash=sha256:c4d647b99872929fdb7bdcaa4fbe7f01413ed3d98077df798530e5b04f116c83 327 | websockets==9.1; python_version >= "3.6" and python_full_version >= "3.6.0" \ 328 | --hash=sha256:0dd4eb8e0bbf365d6f652711ce21b8fd2b596f873d32aabb0fbb53ec604418cc \ 329 | --hash=sha256:1d0971cc7251aeff955aa742ec541ee8aaea4bb2ebf0245748fbec62f744a37e \ 330 | --hash=sha256:1d6b4fddb12ab9adf87b843cd4316c4bd602db8d5efd2fb83147f0458fe85135 \ 331 | --hash=sha256:230a3506df6b5f446fed2398e58dcaafdff12d67fe1397dff196411a9e820d02 \ 332 | --hash=sha256:276d2339ebf0df4f45df453923ebd2270b87900eda5dfd4a6b0cfa15f82111c3 \ 333 | --hash=sha256:2cf04601633a4ec176b9cc3d3e73789c037641001dbfaf7c411f89cd3e04fcaf \ 334 | --hash=sha256:3ddff38894c7857c476feb3538dd847514379d6dc844961dc99f04b0384b1b1b \ 335 | --hash=sha256:48c222feb3ced18f3dc61168ca18952a22fb88e5eb8902d2bf1b50faefdc34a2 \ 336 | --hash=sha256:51d04df04ed9d08077d10ccbe21e6805791b78eac49d16d30a1f1fe2e44ba0af \ 337 | --hash=sha256:597c28f3aa7a09e8c070a86b03107094ee5cdafcc0d55f2f2eac92faac8dc67d \ 338 | --hash=sha256:5c8f0d82ea2468282e08b0cf5307f3ad022290ed50c45d5cb7767957ca782880 \ 339 | --hash=sha256:7189e51955f9268b2bdd6cc537e0faa06f8fffda7fb386e5922c6391de51b077 \ 340 | --hash=sha256:7df3596838b2a0c07c6f6d67752c53859a54993d4f062689fdf547cb56d0f84f \ 341 | 
--hash=sha256:826ccf85d4514609219725ba4a7abd569228c2c9f1968e8be05be366f68291ec \ 342 | --hash=sha256:836d14eb53b500fd92bd5db2fc5894f7c72b634f9c2a28f546f75967503d8e25 \ 343 | --hash=sha256:85db8090ba94e22d964498a47fdd933b8875a1add6ebc514c7ac8703eb97bbf0 \ 344 | --hash=sha256:85e701a6c316b7067f1e8675c638036a796fe5116783a4c932e7eb8e305a3ffe \ 345 | --hash=sha256:900589e19200be76dd7cbaa95e9771605b5ce3f62512d039fb3bc5da9014912a \ 346 | --hash=sha256:9147868bb0cc01e6846606cd65cbf9c58598f187b96d14dd1ca17338b08793bb \ 347 | --hash=sha256:9e7fdc775fe7403dbd8bc883ba59576a6232eac96dacb56512daacf7af5d618d \ 348 | --hash=sha256:ab5ee15d3462198c794c49ccd31773d8a2b8c17d622aa184f669d2b98c2f0857 \ 349 | --hash=sha256:ad893d889bc700a5835e0a95a3e4f2c39e91577ab232a3dc03c262a0f8fc4b5c \ 350 | --hash=sha256:b2e71c4670ebe1067fa8632f0d081e47254ee2d3d409de54168b43b0ba9147e0 \ 351 | --hash=sha256:b43b13e5622c5a53ab12f3272e6f42f1ce37cd5b6684b2676cb365403295cd40 \ 352 | --hash=sha256:b4ad84b156cf50529b8ac5cc1638c2cf8680490e3fccb6121316c8c02620a2e4 \ 353 | --hash=sha256:be5fd35e99970518547edc906efab29afd392319f020c3c58b0e1a158e16ed20 \ 354 | --hash=sha256:caa68c95bc1776d3521f81eeb4d5b9438be92514ec2a79fececda814099c8314 \ 355 | --hash=sha256:d144b350045c53c8ff09aa1cfa955012dd32f00c7e0862c199edcabb1a8b32da \ 356 | --hash=sha256:d2c2d9b24d3c65b5a02cac12cbb4e4194e590314519ed49db2f67ef561c3cf58 \ 357 | --hash=sha256:e9e5fd6dbdf95d99bc03732ded1fc8ef22ebbc05999ac7e0c7bf57fe6e4e5ae2 \ 358 | --hash=sha256:ebf459a1c069f9866d8569439c06193c586e72c9330db1390af7c6a0a32c4afd \ 359 | --hash=sha256:f31722f1c033c198aa4a39a01905951c00bd1c74f922e8afc1b1c62adbcdd56a \ 360 | --hash=sha256:f68c352a68e5fdf1e97288d5cec9296664c590c25932a8476224124aaf90dbcd 361 | wrapt==1.12.1; python_version >= "3.5" \ 362 | --hash=sha256:b62ffa81fb85f4332a4f609cab4ac40709470da05643a082ec1eb88e6d9b97d7 363 | yarl==1.6.3; python_version >= "3.6" \ 364 | --hash=sha256:0355a701b3998dcd832d0dc47cc5dedf3874f966ac7f870e0f3a6788d802d434 \ 365 | --hash=sha256:bafb450deef6861815ed579c7a6113a879a6ef58aed4c3a4be54400ae8871478 \ 366 | --hash=sha256:547f7665ad50fa8563150ed079f8e805e63dd85def6674c97efd78eed6c224a6 \ 367 | --hash=sha256:63f90b20ca654b3ecc7a8d62c03ffa46999595f0167d6450fa8383bab252987e \ 368 | --hash=sha256:97b5bdc450d63c3ba30a127d018b866ea94e65655efaf889ebeabc20f7d12406 \ 369 | --hash=sha256:d8d07d102f17b68966e2de0e07bfd6e139c7c02ef06d3a0f8d2f0f055e13bb76 \ 370 | --hash=sha256:15263c3b0b47968c1d90daa89f21fcc889bb4b1aac5555580d74565de6836366 \ 371 | --hash=sha256:b5dfc9a40c198334f4f3f55880ecf910adebdcb2a0b9a9c23c9345faa9185721 \ 372 | --hash=sha256:b2e9a456c121e26d13c29251f8267541bd75e6a1ccf9e859179701c36a078643 \ 373 | --hash=sha256:ce3beb46a72d9f2190f9e1027886bfc513702d748047b548b05dab7dfb584d2e \ 374 | --hash=sha256:2ce4c621d21326a4a5500c25031e102af589edb50c09b321049e388b3934eec3 \ 375 | --hash=sha256:d26608cf178efb8faa5ff0f2d2e77c208f471c5a3709e577a7b3fd0445703ac8 \ 376 | --hash=sha256:4c5bcfc3ed226bf6419f7a33982fb4b8ec2e45785a0561eb99274ebbf09fdd6a \ 377 | --hash=sha256:4736eaee5626db8d9cda9eb5282028cc834e2aeb194e0d8b50217d707e98bb5c \ 378 | --hash=sha256:68dc568889b1c13f1e4745c96b931cc94fdd0defe92a72c2b8ce01091b22e35f \ 379 | --hash=sha256:7356644cbed76119d0b6bd32ffba704d30d747e0c217109d7979a7bc36c4d970 \ 380 | --hash=sha256:00d7ad91b6583602eb9c1d085a2cf281ada267e9a197e8b7cae487dadbfa293e \ 381 | --hash=sha256:69ee97c71fee1f63d04c945f56d5d726483c4762845400a6795a3b75d56b6c50 \ 382 | 
--hash=sha256:e46fba844f4895b36f4c398c5af062a9808d1f26b2999c58909517384d5deda2 \ 383 | --hash=sha256:31ede6e8c4329fb81c86706ba8f6bf661a924b53ba191b27aa5fcee5714d18ec \ 384 | --hash=sha256:fcbb48a93e8699eae920f8d92f7160c03567b421bc17362a9ffbbd706a816f71 \ 385 | --hash=sha256:72a660bdd24497e3e84f5519e57a9ee9220b6f3ac4d45056961bf22838ce20cc \ 386 | --hash=sha256:324ba3d3c6fee56e2e0b0d09bf5c73824b9f08234339d2b788af65e60040c959 \ 387 | --hash=sha256:e6b5460dc5ad42ad2b36cca524491dfcaffbfd9c8df50508bddc354e787b8dc2 \ 388 | --hash=sha256:6d6283d8e0631b617edf0fd726353cb76630b83a089a40933043894e7f6721e2 \ 389 | --hash=sha256:9ede61b0854e267fd565e7527e2f2eb3ef8858b301319be0604177690e1a3896 \ 390 | --hash=sha256:f0b059678fd549c66b89bed03efcabb009075bd131c248ecdf087bdb6faba24a \ 391 | --hash=sha256:329412812ecfc94a57cd37c9d547579510a9e83c516bc069470db5f75684629e \ 392 | --hash=sha256:c49ff66d479d38ab863c50f7bb27dee97c6627c5fe60697de15529da9c3de724 \ 393 | --hash=sha256:f040bcc6725c821a4c0665f3aa96a4d0805a7aaf2caf266d256b8ed71b9f041c \ 394 | --hash=sha256:d5c32c82990e4ac4d8150fd7652b972216b204de4e83a122546dce571c1bdf25 \ 395 | --hash=sha256:d597767fcd2c3dc49d6eea360c458b65643d1e4dbed91361cf5e36e53c1f8c96 \ 396 | --hash=sha256:8aa3decd5e0e852dc68335abf5478a518b41bf2ab2f330fe44916399efedfae0 \ 397 | --hash=sha256:73494d5b71099ae8cb8754f1df131c11d433b387efab7b51849e7e1e851f07a4 \ 398 | --hash=sha256:5b883e458058f8d6099e4420f0cc2567989032b5f34b271c0827de9f1079a424 \ 399 | --hash=sha256:4953fb0b4fdb7e08b2f3b3be80a00d28c5c8a2056bb066169de00e6501b986b6 \ 400 | --hash=sha256:8a9066529240171b68893d60dca86a763eae2139dd42f42106b03cf4b426bf10 401 | zipp==3.4.1; python_version < "3.8" and python_version >= "3.6" \ 402 | --hash=sha256:51cb66cc54621609dd593d1787f286ee42a5c0adbb4b29abea5a63edc3e03098 \ 403 | --hash=sha256:3607921face881ba3e026887d8150cca609d517579abe052ac81fc5aeffdbd76 404 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | appdirs==1.4.4; python_version >= "3.5" and python_full_version >= "3.6.0" \ 2 | --hash=sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128 \ 3 | --hash=sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41 4 | beautifulsoup4==4.9.3; python_full_version >= "3.6.0" \ 5 | --hash=sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35 \ 6 | --hash=sha256:fff47e031e34ec82bf17e00da8f592fe7de69aeea38be00523c04623c04fb666 \ 7 | --hash=sha256:84729e322ad1d5b4d25f805bfa05b902dd96450f43842c4e99067d5e1369eb25 8 | bs4==0.0.1; python_full_version >= "3.6.0" \ 9 | --hash=sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a 10 | certifi==2020.12.5; python_full_version >= "3.6.0" \ 11 | --hash=sha256:719a74fb9e33b9bd44cc7f3a8d94bc35e4049deebe19ba7d8e108280cfd59830 \ 12 | --hash=sha256:1a4995114262bffbc2413b159f2a1a480c969de6e6eb13ee966d470af86af59c 13 | chardet==4.0.0; python_full_version >= "3.6.0" \ 14 | --hash=sha256:f864054d66fd9118f2e67044ac8981a54775ec5b67aed0441892edb553d21da5 \ 15 | --hash=sha256:0d6f53a15db4120f2b08c94f11e7d93d2c911ee118b6b30a04ec3ee8310179fa 16 | cssselect==1.1.0; python_full_version >= "3.6.0" \ 17 | --hash=sha256:f612ee47b749c877ebae5bb77035d8f4202c6ad0f0fc1271b3c18ad6c4468ecf \ 18 | --hash=sha256:f95f8dedd925fd8f54edb3d2dfb44c190d9d18512377d3c1e2388d16126879bc 19 | dateparser==1.0.0; python_version >= "3.5" \ 20 | 
--hash=sha256:17202df32c7a36e773136ff353aa3767e987f8b3e27374c39fd21a30a803d6f8 \ 21 | --hash=sha256:159cc4e01a593706a15cd4e269a0b3345edf3aef8bf9278a57dac8adf5bf1e4a 22 | demjson3==3.0.5 \ 23 | --hash=sha256:ab9aabdd85695f3684fc296f39766a2730f6c8de81d23f7048073dfe2f616d80 24 | fake-useragent==0.1.11; python_full_version >= "3.6.0" \ 25 | --hash=sha256:c104998b750eb097eefc28ae28e92d66397598d2cf41a31aa45d5559ef1adf35 26 | idna==2.10; python_full_version >= "3.6.0" \ 27 | --hash=sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0 \ 28 | --hash=sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6 29 | lxml==4.9.1; python_full_version >= "3.6.0" \ 30 | --hash=sha256:04da965dfebb5dac2619cb90fcf93efdb35b3c6994fea58a157a834f2f94b318 \ 31 | --hash=sha256:0538747a9d7827ce3e16a8fdd201a99e661c7dee3c96c885d8ecba3c35d1032c \ 32 | --hash=sha256:0645e934e940107e2fdbe7c5b6fb8ec6232444260752598bc4d09511bd056c0b \ 33 | --hash=sha256:079b68f197c796e42aa80b1f739f058dcee796dc725cc9a1be0cdb08fc45b000 \ 34 | --hash=sha256:0f3f0059891d3254c7b5fb935330d6db38d6519ecd238ca4fce93c234b4a0f73 \ 35 | --hash=sha256:10d2017f9150248563bb579cd0d07c61c58da85c922b780060dcc9a3aa9f432d \ 36 | --hash=sha256:1355755b62c28950f9ce123c7a41460ed9743c699905cbe664a5bcc5c9c7c7fb \ 37 | --hash=sha256:13c90064b224e10c14dcdf8086688d3f0e612db53766e7478d7754703295c7c8 \ 38 | --hash=sha256:1423631e3d51008871299525b541413c9b6c6423593e89f9c4cfbe8460afc0a2 \ 39 | --hash=sha256:1436cf0063bba7888e43f1ba8d58824f085410ea2025befe81150aceb123e345 \ 40 | --hash=sha256:1a7c59c6ffd6ef5db362b798f350e24ab2cfa5700d53ac6681918f314a4d3b94 \ 41 | --hash=sha256:1e1cf47774373777936c5aabad489fef7b1c087dcd1f426b621fda9dcc12994e \ 42 | --hash=sha256:206a51077773c6c5d2ce1991327cda719063a47adc02bd703c56a662cdb6c58b \ 43 | --hash=sha256:21fb3d24ab430fc538a96e9fbb9b150029914805d551deeac7d7822f64631dfc \ 44 | --hash=sha256:27e590352c76156f50f538dbcebd1925317a0f70540f7dc8c97d2931c595783a \ 45 | --hash=sha256:287605bede6bd36e930577c5925fcea17cb30453d96a7b4c63c14a257118dbb9 \ 46 | --hash=sha256:2aaf6a0a6465d39b5ca69688fce82d20088c1838534982996ec46633dc7ad6cc \ 47 | --hash=sha256:32a73c53783becdb7eaf75a2a1525ea8e49379fb7248c3eeefb9412123536387 \ 48 | --hash=sha256:41fb58868b816c202e8881fd0f179a4644ce6e7cbbb248ef0283a34b73ec73bb \ 49 | --hash=sha256:4780677767dd52b99f0af1f123bc2c22873d30b474aa0e2fc3fe5e02217687c7 \ 50 | --hash=sha256:4878e667ebabe9b65e785ac8da4d48886fe81193a84bbe49f12acff8f7a383a4 \ 51 | --hash=sha256:487c8e61d7acc50b8be82bda8c8d21d20e133c3cbf41bd8ad7eb1aaeb3f07c97 \ 52 | --hash=sha256:4beea0f31491bc086991b97517b9683e5cfb369205dac0148ef685ac12a20a67 \ 53 | --hash=sha256:4cfbe42c686f33944e12f45a27d25a492cc0e43e1dc1da5d6a87cbcaf2e95627 \ 54 | --hash=sha256:4d5bae0a37af799207140652a700f21a85946f107a199bcb06720b13a4f1f0b7 \ 55 | --hash=sha256:4e285b5f2bf321fc0857b491b5028c5f276ec0c873b985d58d7748ece1d770dd \ 56 | --hash=sha256:57e4d637258703d14171b54203fd6822fda218c6c2658a7d30816b10995f29f3 \ 57 | --hash=sha256:5974895115737a74a00b321e339b9c3f45c20275d226398ae79ac008d908bff7 \ 58 | --hash=sha256:5ef87fca280fb15342726bd5f980f6faf8b84a5287fcc2d4962ea8af88b35130 \ 59 | --hash=sha256:603a464c2e67d8a546ddaa206d98e3246e5db05594b97db844c2f0a1af37cf5b \ 60 | --hash=sha256:6653071f4f9bac46fbc30f3c7838b0e9063ee335908c5d61fb7a4a86c8fd2036 \ 61 | --hash=sha256:6ca2264f341dd81e41f3fffecec6e446aa2121e0b8d026fb5130e02de1402785 \ 62 | --hash=sha256:6d279033bf614953c3fc4a0aa9ac33a21e8044ca72d4fa8b9273fe75359d5cca \ 63 | 
--hash=sha256:6d949f53ad4fc7cf02c44d6678e7ff05ec5f5552b235b9e136bd52e9bf730b91 \ 64 | --hash=sha256:6daa662aba22ef3258934105be2dd9afa5bb45748f4f702a3b39a5bf53a1f4dc \ 65 | --hash=sha256:6eafc048ea3f1b3c136c71a86db393be36b5b3d9c87b1c25204e7d397cee9536 \ 66 | --hash=sha256:830c88747dce8a3e7525defa68afd742b4580df6aa2fdd6f0855481e3994d391 \ 67 | --hash=sha256:86e92728ef3fc842c50a5cb1d5ba2bc66db7da08a7af53fb3da79e202d1b2cd3 \ 68 | --hash=sha256:8caf4d16b31961e964c62194ea3e26a0e9561cdf72eecb1781458b67ec83423d \ 69 | --hash=sha256:8d1a92d8e90b286d491e5626af53afef2ba04da33e82e30744795c71880eaa21 \ 70 | --hash=sha256:8f0a4d179c9a941eb80c3a63cdb495e539e064f8054230844dcf2fcb812b71d3 \ 71 | --hash=sha256:9232b09f5efee6a495a99ae6824881940d6447debe272ea400c02e3b68aad85d \ 72 | --hash=sha256:927a9dd016d6033bc12e0bf5dee1dde140235fc8d0d51099353c76081c03dc29 \ 73 | --hash=sha256:93e414e3206779ef41e5ff2448067213febf260ba747fc65389a3ddaa3fb8715 \ 74 | --hash=sha256:98cafc618614d72b02185ac583c6f7796202062c41d2eeecdf07820bad3295ed \ 75 | --hash=sha256:9c3a88d20e4fe4a2a4a84bf439a5ac9c9aba400b85244c63a1ab7088f85d9d25 \ 76 | --hash=sha256:9f36de4cd0c262dd9927886cc2305aa3f2210db437aa4fed3fb4940b8bf4592c \ 77 | --hash=sha256:a60f90bba4c37962cbf210f0188ecca87daafdf60271f4c6948606e4dabf8785 \ 78 | --hash=sha256:a614e4afed58c14254e67862456d212c4dcceebab2eaa44d627c2ca04bf86837 \ 79 | --hash=sha256:ae06c1e4bc60ee076292e582a7512f304abdf6c70db59b56745cca1684f875a4 \ 80 | --hash=sha256:b122a188cd292c4d2fcd78d04f863b789ef43aa129b233d7c9004de08693728b \ 81 | --hash=sha256:b570da8cd0012f4af9fa76a5635cd31f707473e65a5a335b186069d5c7121ff2 \ 82 | --hash=sha256:bcaa1c495ce623966d9fc8a187da80082334236a2a1c7e141763ffaf7a405067 \ 83 | --hash=sha256:bd34f6d1810d9354dc7e35158aa6cc33456be7706df4420819af6ed966e85448 \ 84 | --hash=sha256:be9eb06489bc975c38706902cbc6888f39e946b81383abc2838d186f0e8b6a9d \ 85 | --hash=sha256:c4b2e0559b68455c085fb0f6178e9752c4be3bba104d6e881eb5573b399d1eb2 \ 86 | --hash=sha256:c62e8dd9754b7debda0c5ba59d34509c4688f853588d75b53c3791983faa96fc \ 87 | --hash=sha256:c852b1530083a620cb0de5f3cd6826f19862bafeaf77586f1aef326e49d95f0c \ 88 | --hash=sha256:d9fc0bf3ff86c17348dfc5d322f627d78273eba545db865c3cd14b3f19e57fa5 \ 89 | --hash=sha256:dad7b164905d3e534883281c050180afcf1e230c3d4a54e8038aa5cfcf312b84 \ 90 | --hash=sha256:e5f66bdf0976ec667fc4594d2812a00b07ed14d1b44259d19a41ae3fff99f2b8 \ 91 | --hash=sha256:e8f0c9d65da595cfe91713bc1222af9ecabd37971762cb830dea2fc3b3bb2acf \ 92 | --hash=sha256:edffbe3c510d8f4bf8640e02ca019e48a9b72357318383ca60e3330c23aaffc7 \ 93 | --hash=sha256:eea5d6443b093e1545ad0210e6cf27f920482bfcf5c77cdc8596aec73523bb7e \ 94 | --hash=sha256:ef72013e20dd5ba86a8ae1aed7f56f31d3374189aa8b433e7b12ad182c0d2dfb \ 95 | --hash=sha256:f05251bbc2145349b8d0b77c0d4e5f3b228418807b1ee27cefb11f69ed3d233b \ 96 | --hash=sha256:f1be258c4d3dc609e654a1dc59d37b17d7fef05df912c01fc2e15eb43a9735f3 \ 97 | --hash=sha256:f9ced82717c7ec65a67667bb05865ffe38af0e835cdd78728f1209c8fffe0cad \ 98 | --hash=sha256:fe17d10b97fdf58155f858606bddb4e037b805a60ae023c009f760d8361a4eb8 \ 99 | --hash=sha256:fe749b052bb7233fe5d072fcb549221a8cb1a16725c47c37e42b0b9cb3ff2c3f 100 | parse==1.19.0; python_full_version >= "3.6.0" \ 101 | --hash=sha256:9ff82852bcb65d139813e2a5197627a94966245c897796760a3a2a8eb66f020b 102 | pyee==8.1.0; python_version >= "3.5" and python_full_version >= "3.6.0" \ 103 | --hash=sha256:383973b63ad7ed5e3c0311f8b179c52981f9e7b3eaea0e9a830d13ec34dde65f \ 104 | 
--hash=sha256:92dacc5bd2bdb8f95aa8dd2585d47ca1c4840e2adb95ccf90034d64f725bfd31 105 | pyppeteer==0.0.25; python_version >= "3.5" and python_full_version >= "3.6.0" \ 106 | --hash=sha256:51fe769b722a1718043b74d12c20420f29e0dd9eeea2b66652b7f93a9ad465dd 107 | pyquery==1.4.3; python_full_version >= "3.6.0" \ 108 | --hash=sha256:1fc33b7699455ed25c75282bc8f80ace1ac078b0dda5a933dacbd8b1c1f83963 \ 109 | --hash=sha256:a388eefb6bc4a55350de0316fbd97cda999ae669b6743ae5b99102ba54f5aa72 110 | python-dateutil==2.8.1; python_version >= "3.5" and python_full_version < "3.0.0" or python_full_version >= "3.3.0" and python_version >= "3.5" \ 111 | --hash=sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c \ 112 | --hash=sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a 113 | pytz==2021.1; python_version >= "3.5" \ 114 | --hash=sha256:eb10ce3e7736052ed3623d49975ce333bcd712c7bb19a58b9e2089d4057d0798 \ 115 | --hash=sha256:83a4a90894bf38e243cf052c8b58f381bfe9a7a483f6a9cab140bc7f702ac4da 116 | regex==2021.3.17; python_version >= "3.5" \ 117 | --hash=sha256:b97ec5d299c10d96617cc851b2e0f81ba5d9d6248413cd374ef7f3a8871ee4a6 \ 118 | --hash=sha256:cb4ee827857a5ad9b8ae34d3c8cc51151cb4a3fe082c12ec20ec73e63cc7c6f0 \ 119 | --hash=sha256:633497504e2a485a70a3268d4fc403fe3063a50a50eed1039083e9471ad0101c \ 120 | --hash=sha256:a59a2ee329b3de764b21495d78c92ab00b4ea79acef0f7ae8c1067f773570afa \ 121 | --hash=sha256:f85d6f41e34f6a2d1607e312820971872944f1661a73d33e1e82d35ea3305e14 \ 122 | --hash=sha256:4651f839dbde0816798e698626af6a2469eee6d9964824bb5386091255a1694f \ 123 | --hash=sha256:39c44532d0e4f1639a89e52355b949573e1e2c5116106a395642cbbae0ff9bcd \ 124 | --hash=sha256:3d9a7e215e02bd7646a91fb8bcba30bc55fd42a719d6b35cf80e5bae31d9134e \ 125 | --hash=sha256:159fac1a4731409c830d32913f13f68346d6b8e39650ed5d704a9ce2f9ef9cb3 \ 126 | --hash=sha256:13f50969028e81765ed2a1c5fcfdc246c245cf8d47986d5172e82ab1a0c42ee5 \ 127 | --hash=sha256:b9d8d286c53fe0cbc6d20bf3d583cabcd1499d89034524e3b94c93a5ab85ca90 \ 128 | --hash=sha256:201e2619a77b21a7780580ab7b5ce43835e242d3e20fef50f66a8df0542e437f \ 129 | --hash=sha256:d47d359545b0ccad29d572ecd52c9da945de7cd6cf9c0cfcb0269f76d3555689 \ 130 | --hash=sha256:ea2f41445852c660ba7c3ebf7d70b3779b20d9ca8ba54485a17740db49f46932 \ 131 | --hash=sha256:486a5f8e11e1f5bbfcad87f7c7745eb14796642323e7e1829a331f87a713daaa \ 132 | --hash=sha256:18e25e0afe1cf0f62781a150c1454b2113785401ba285c745acf10c8ca8917df \ 133 | --hash=sha256:a2ee026f4156789df8644d23ef423e6194fad0bc53575534101bb1de5d67e8ce \ 134 | --hash=sha256:4c0788010a93ace8a174d73e7c6c9d3e6e3b7ad99a453c8ee8c975ddd9965643 \ 135 | --hash=sha256:575a832e09d237ae5fedb825a7a5bc6a116090dd57d6417d4f3b75121c73e3be \ 136 | --hash=sha256:8e65e3e4c6feadf6770e2ad89ad3deb524bcb03d8dc679f381d0568c024e0deb \ 137 | --hash=sha256:a0df9a0ad2aad49ea3c7f65edd2ffb3d5c59589b85992a6006354f6fb109bb18 \ 138 | --hash=sha256:b98bc9db003f1079caf07b610377ed1ac2e2c11acc2bea4892e28cc5b509d8d5 \ 139 | --hash=sha256:808404898e9a765e4058bf3d7607d0629000e0a14a6782ccbb089296b76fa8fe \ 140 | --hash=sha256:5770a51180d85ea468234bc7987f5597803a4c3d7463e7323322fe4a1b181578 \ 141 | --hash=sha256:976a54d44fd043d958a69b18705a910a8376196c6b6ee5f2596ffc11bff4420d \ 142 | --hash=sha256:63f3ca8451e5ff7133ffbec9eda641aeab2001be1a01878990f6c87e3c44b9d5 \ 143 | --hash=sha256:bcd945175c29a672f13fce13a11893556cd440e37c1b643d6eeab1988c8b209c \ 144 | --hash=sha256:3d9356add82cff75413bec360c1eca3e58db4a9f5dafa1f19650958a81e3249d \ 145 | 
--hash=sha256:f5d0c921c99297354cecc5a416ee4280bd3f20fd81b9fb671ca6be71499c3fdf \ 146 | --hash=sha256:14de88eda0976020528efc92d0a1f8830e2fb0de2ae6005a6fc4e062553031fa \ 147 | --hash=sha256:4c2e364491406b7888c2ad4428245fc56c327e34a5dfe58fd40df272b3c3dab3 \ 148 | --hash=sha256:8bd4f91f3fb1c9b1380d6894bd5b4a519409135bec14c0c80151e58394a4e88a \ 149 | --hash=sha256:882f53afe31ef0425b405a3f601c0009b44206ea7f55ee1c606aad3cc213a52c \ 150 | --hash=sha256:07ef35301b4484bce843831e7039a84e19d8d33b3f8b2f9aab86c376813d0139 \ 151 | --hash=sha256:360a01b5fa2ad35b3113ae0c07fb544ad180603fa3b1f074f52d98c1096fa15e \ 152 | --hash=sha256:709f65bb2fa9825f09892617d01246002097f8f9b6dde8d1bb4083cf554701ba \ 153 | --hash=sha256:c66221e947d7207457f8b6f42b12f613b09efa9669f65a587a2a71f6a0e4d106 \ 154 | --hash=sha256:c782da0e45aff131f0bed6e66fbcfa589ff2862fc719b83a88640daa01a5aff7 \ 155 | --hash=sha256:dc9963aacb7da5177e40874585d7407c0f93fb9d7518ec58b86e562f633f36cd \ 156 | --hash=sha256:a0d04128e005142260de3733591ddf476e4902c0c23c1af237d9acf3c96e1b38 \ 157 | --hash=sha256:4b8a1fb724904139149a43e172850f35aa6ea97fb0545244dc0b805e0154ed68 158 | requests-html==0.10.0; python_full_version >= "3.6.0" \ 159 | --hash=sha256:7e929ecfed95fb1d0994bb368295d6d7c4d06b03fcb900c33d7d0b17e6003947 \ 160 | --hash=sha256:cb8a78cf829c4eca9d6233f28524f65dd2bfaafb4bdbbc407f0a0b8f487df6e2 161 | requests==2.25.1; python_full_version >= "3.6.0" \ 162 | --hash=sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e \ 163 | --hash=sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804 164 | six==1.15.0; python_full_version >= "3.6.0" and python_version >= "3.5" \ 165 | --hash=sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced \ 166 | --hash=sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259 167 | soupsieve==2.2.1; python_version >= "3.6" and python_full_version >= "3.6.0" \ 168 | --hash=sha256:c2c1c2d44f158cdbddab7824a9af8c4f83c76b1e23e049479aa432feb6c4c23b \ 169 | --hash=sha256:052774848f448cf19c7e959adf5566904d525f33a3f8b6ba6f6f8f26ec7de0cc 170 | tqdm==4.59.0; python_version >= "3.5" and python_full_version >= "3.6.0" \ 171 | --hash=sha256:9fdf349068d047d4cfbe24862c425883af1db29bcddf4b0eeb2524f6fbdb23c7 \ 172 | --hash=sha256:d666ae29164da3e517fcf125e41d4fe96e5bb375cd87ff9763f6b38b5592fe33 173 | tzlocal==2.1; python_version >= "3.5" \ 174 | --hash=sha256:e2cb6c6b5b604af38597403e9852872d7f534962ae2954c7f35efcb1ccacf4a4 \ 175 | --hash=sha256:643c97c5294aedc737780a49d9df30889321cbe1204eac2c2ec6134035a92e44 176 | urllib3==1.26.5; python_full_version >= "3.6.0" and python_version < "4" and python_version >= "3.5" \ 177 | --hash=sha256:753a0374df26658f99d826cfe40394a686d05985786d946fbe4165b5148f5a7c \ 178 | --hash=sha256:a7acd0977125325f516bda9735fa7142b909a8d01e8b2e4c8108d0984e6e0098 179 | w3lib==1.22.0; python_full_version >= "3.6.0" \ 180 | --hash=sha256:0161d55537063e00d95a241663ede3395c4c6d7b777972ba2fd58bbab2001e53 \ 181 | --hash=sha256:0ad6d0203157d61149fd45aaed2e24f53902989c32fc1dccc2e2bfba371560df 182 | websockets==9.1; python_version >= "3.6" and python_full_version >= "3.6.0" \ 183 | --hash=sha256:0dd4eb8e0bbf365d6f652711ce21b8fd2b596f873d32aabb0fbb53ec604418cc \ 184 | --hash=sha256:1d0971cc7251aeff955aa742ec541ee8aaea4bb2ebf0245748fbec62f744a37e \ 185 | --hash=sha256:1d6b4fddb12ab9adf87b843cd4316c4bd602db8d5efd2fb83147f0458fe85135 \ 186 | --hash=sha256:230a3506df6b5f446fed2398e58dcaafdff12d67fe1397dff196411a9e820d02 \ 187 | 
--hash=sha256:276d2339ebf0df4f45df453923ebd2270b87900eda5dfd4a6b0cfa15f82111c3 \ 188 | --hash=sha256:2cf04601633a4ec176b9cc3d3e73789c037641001dbfaf7c411f89cd3e04fcaf \ 189 | --hash=sha256:3ddff38894c7857c476feb3538dd847514379d6dc844961dc99f04b0384b1b1b \ 190 | --hash=sha256:48c222feb3ced18f3dc61168ca18952a22fb88e5eb8902d2bf1b50faefdc34a2 \ 191 | --hash=sha256:51d04df04ed9d08077d10ccbe21e6805791b78eac49d16d30a1f1fe2e44ba0af \ 192 | --hash=sha256:597c28f3aa7a09e8c070a86b03107094ee5cdafcc0d55f2f2eac92faac8dc67d \ 193 | --hash=sha256:5c8f0d82ea2468282e08b0cf5307f3ad022290ed50c45d5cb7767957ca782880 \ 194 | --hash=sha256:7189e51955f9268b2bdd6cc537e0faa06f8fffda7fb386e5922c6391de51b077 \ 195 | --hash=sha256:7df3596838b2a0c07c6f6d67752c53859a54993d4f062689fdf547cb56d0f84f \ 196 | --hash=sha256:826ccf85d4514609219725ba4a7abd569228c2c9f1968e8be05be366f68291ec \ 197 | --hash=sha256:836d14eb53b500fd92bd5db2fc5894f7c72b634f9c2a28f546f75967503d8e25 \ 198 | --hash=sha256:85db8090ba94e22d964498a47fdd933b8875a1add6ebc514c7ac8703eb97bbf0 \ 199 | --hash=sha256:85e701a6c316b7067f1e8675c638036a796fe5116783a4c932e7eb8e305a3ffe \ 200 | --hash=sha256:900589e19200be76dd7cbaa95e9771605b5ce3f62512d039fb3bc5da9014912a \ 201 | --hash=sha256:9147868bb0cc01e6846606cd65cbf9c58598f187b96d14dd1ca17338b08793bb \ 202 | --hash=sha256:9e7fdc775fe7403dbd8bc883ba59576a6232eac96dacb56512daacf7af5d618d \ 203 | --hash=sha256:ab5ee15d3462198c794c49ccd31773d8a2b8c17d622aa184f669d2b98c2f0857 \ 204 | --hash=sha256:ad893d889bc700a5835e0a95a3e4f2c39e91577ab232a3dc03c262a0f8fc4b5c \ 205 | --hash=sha256:b2e71c4670ebe1067fa8632f0d081e47254ee2d3d409de54168b43b0ba9147e0 \ 206 | --hash=sha256:b43b13e5622c5a53ab12f3272e6f42f1ce37cd5b6684b2676cb365403295cd40 \ 207 | --hash=sha256:b4ad84b156cf50529b8ac5cc1638c2cf8680490e3fccb6121316c8c02620a2e4 \ 208 | --hash=sha256:be5fd35e99970518547edc906efab29afd392319f020c3c58b0e1a158e16ed20 \ 209 | --hash=sha256:caa68c95bc1776d3521f81eeb4d5b9438be92514ec2a79fececda814099c8314 \ 210 | --hash=sha256:d144b350045c53c8ff09aa1cfa955012dd32f00c7e0862c199edcabb1a8b32da \ 211 | --hash=sha256:d2c2d9b24d3c65b5a02cac12cbb4e4194e590314519ed49db2f67ef561c3cf58 \ 212 | --hash=sha256:e9e5fd6dbdf95d99bc03732ded1fc8ef22ebbc05999ac7e0c7bf57fe6e4e5ae2 \ 213 | --hash=sha256:ebf459a1c069f9866d8569439c06193c586e72c9330db1390af7c6a0a32c4afd \ 214 | --hash=sha256:f31722f1c033c198aa4a39a01905951c00bd1c74f922e8afc1b1c62adbcdd56a \ 215 | --hash=sha256:f68c352a68e5fdf1e97288d5cec9296664c590c25932a8476224124aaf90dbcd 216 | -------------------------------------------------------------------------------- /tests/manualTEst.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from facebook_scraper import get_posts 4 | 5 | post = next(get_posts(account=100089065833006, start_url="https://mbasic.facebook.com/?v=timeline", cookies='../cks.txt')) 6 | print(post['images']) 7 | -------------------------------------------------------------------------------- /tests/test_get_posts.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import pytest 4 | 5 | from facebook_scraper import * 6 | 7 | 8 | @pytest.mark.vcr() 9 | class TestGetPosts: 10 | def test_get_posts(self): 11 | expected_post = { 12 | 'available': True, 13 | 'comments': 149, 14 | 'comments_full': None, 15 | 'factcheck': None, 16 | 'image': 
'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-9/fr/cp0/e15/q65/96724875_3065146506903115_4237164853036318720_n.jpg?_nc_cat=103&ccb=1-3&_nc_sid=8024bb&_nc_ohc=SvpNqSK7ILIAX93ehWM&_nc_ht=scontent.fhlz2-1.fna&tp=14&oh=d32fa3269feeaf6904d78a512f41ab26&oe=60E673C5', 17 | 'image_id': '3065146500236449', 18 | 'image_ids': [ 19 | '3065146500236449', 20 | '3065146626903103', 21 | '3065146783569754', 22 | '3065146886903077', 23 | ], 24 | 'image_lowquality': 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-9/cp0/e15/q65/p720x720/96724875_3065146506903115_4237164853036318720_n.jpg?_nc_cat=103&ccb=1-3&_nc_sid=8024bb&_nc_ohc=SvpNqSK7ILIAX93ehWM&_nc_ht=scontent.fhlz2-1.fna&tp=3&oh=426e258c934177d9ded48435efaecc6c&oe=60E74054', 25 | 'images': [ 26 | 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-9/fr/cp0/e15/q65/96724875_3065146506903115_4237164853036318720_n.jpg?_nc_cat=103&ccb=1-3&_nc_sid=8024bb&_nc_ohc=SvpNqSK7ILIAX93ehWM&_nc_ht=scontent.fhlz2-1.fna&tp=14&oh=d32fa3269feeaf6904d78a512f41ab26&oe=60E673C5', 27 | 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-9/fr/cp0/e15/q65/96657922_3065146630236436_9052202957155598336_n.jpg?_nc_cat=101&ccb=1-3&_nc_sid=8024bb&_nc_ohc=MwI_Au5sC60AX93Dkix&_nc_ht=scontent.fhlz2-1.fna&tp=14&oh=b947668e646a0e7614671deff90dc9a3&oe=60E41393', 28 | 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-9/fr/cp0/e15/q65/96557798_3065146790236420_838564679184809984_n.jpg?_nc_cat=103&ccb=1-3&_nc_sid=8024bb&_nc_ohc=ydkcrs8kPykAX_0Fdn4&_nc_ht=scontent.fhlz2-1.fna&tp=14&oh=7884c93d73b2a9f806baf829c8f941b0&oe=60E4D7FB', 29 | 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-9/fr/cp0/e15/q65/96688092_3065146896903076_7861539131082407936_n.jpg?_nc_cat=108&ccb=1-3&_nc_sid=8024bb&_nc_ohc=vqgGsFXTmO4AX82bX5z&_nc_ht=scontent.fhlz2-1.fna&tp=14&oh=379eb1c4551d74a13a4cafb07524288e&oe=60E6753F', 30 | ], 31 | 'images_description': [ 32 | 'No photo description available.', 33 | 'No photo description available.', 34 | 'No photo description available.', 35 | 'No photo description available.', 36 | ], 37 | 'images_lowquality': [ 38 | 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-9/cp0/e15/q65/p720x720/96724875_3065146506903115_4237164853036318720_n.jpg?_nc_cat=103&ccb=1-3&_nc_sid=8024bb&_nc_ohc=SvpNqSK7ILIAX93ehWM&_nc_ht=scontent.fhlz2-1.fna&tp=3&oh=426e258c934177d9ded48435efaecc6c&oe=60E74054', 39 | 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-0/cp0/e15/q65/s640x640/96657922_3065146630236436_9052202957155598336_n.jpg?_nc_cat=101&ccb=1-3&_nc_sid=8024bb&_nc_ohc=MwI_Au5sC60AX93Dkix&_nc_ht=scontent.fhlz2-1.fna&tp=9&oh=5c016bd47d3d9ab3ba997b48dbc21a97&oe=60E75F2D', 40 | 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-0/cp0/e15/q65/s640x640/96557798_3065146790236420_838564679184809984_n.jpg?_nc_cat=103&ccb=1-3&_nc_sid=8024bb&_nc_ohc=ydkcrs8kPykAX_0Fdn4&_nc_ht=scontent.fhlz2-1.fna&tp=9&oh=ca962fe95d846cbd6e4e78b0884572c9&oe=60E51308', 41 | 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-0/cp0/e15/q65/s640x640/96688092_3065146896903076_7861539131082407936_n.jpg?_nc_cat=108&ccb=1-3&_nc_sid=8024bb&_nc_ohc=vqgGsFXTmO4AX82bX5z&_nc_ht=scontent.fhlz2-1.fna&tp=9&oh=7e9da116d24a9faee2fe15c16d7dea8f&oe=60E3DD81', 42 | ], 43 | 'images_lowquality_description': [ 44 | 'No photo description available.', 45 | 'No photo description available.', 46 | 'No photo description available.', 47 | 'No photo description available.', 48 | ], 49 | 'is_live': False, 50 | 'likes': 1615, 51 | 'link': 'https://www.nintendo.com/wallpapers/', 52 | 'original_request_url': 3065154550235644, 53 | 'post_id': 
'3065154550235644', 54 | 'post_text': 'Check out these themed wallpapers and many more at the link ' 55 | 'below for your personal use! We hope you enjoy them!\n' 56 | '\n' 57 | 'https://www.nintendo.com/wallpapers/', 58 | 'post_url': 'https://facebook.com/story.php?story_fbid=3065154550235644&id=119240841493711', 59 | 'reaction_count': None, 60 | 'reactions': None, 61 | 'reactors': None, 62 | 'shared_post_id': None, 63 | 'shared_post_url': None, 64 | 'shared_text': '', 65 | 'shared_time': None, 66 | 'shared_user_id': None, 67 | 'shared_username': None, 68 | 'shares': 281, 69 | 'text': 'Check out these themed wallpapers and many more at the link below ' 70 | 'for your personal use! We hope you enjoy them!\n' 71 | '\n' 72 | 'https://www.nintendo.com/wallpapers/', 73 | 'time': datetime.datetime(2020, 5, 13, 13, 1, 18), 74 | 'user_id': '119240841493711', 75 | 'user_url': 'https://facebook.com/Nintendo/?refid=52&__tn__=C-R', 76 | 'username': 'Nintendo', 77 | 'video': None, 78 | 'video_duration_seconds': None, 79 | 'video_height': None, 80 | 'video_id': None, 81 | 'video_quality': None, 82 | 'video_size_MB': None, 83 | 'video_thumbnail': None, 84 | 'video_watches': None, 85 | 'video_width': None, 86 | 'w3_fb_url': None, 87 | } 88 | 89 | post = next(get_posts(post_urls=[3065154550235644])) 90 | print(post) 91 | assert post == expected_post 92 | 93 | def test_get_posts_with_extra_info(self): 94 | expected_post = { 95 | 'available': True, 96 | 'comments': 149, 97 | 'comments_full': None, 98 | 'factcheck': None, 99 | 'fetched_time': datetime.datetime(2021, 6, 9, 10, 31, 43, 834002), 100 | 'image': 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-9/fr/cp0/e15/q65/96724875_3065146506903115_4237164853036318720_n.jpg?_nc_cat=103&ccb=1-3&_nc_sid=8024bb&efg=eyJpIjoidCJ9&_nc_ohc=SvpNqSK7ILIAX93ehWM&_nc_ht=scontent.fhlz2-1.fna&tp=14&oh=d32fa3269feeaf6904d78a512f41ab26&oe=60E673C5', 101 | 'image_id': '3065146500236449', 102 | 'image_ids': [ 103 | '3065146500236449', 104 | '3065146626903103', 105 | '3065146783569754', 106 | '3065146886903077', 107 | ], 108 | 'image_lowquality': 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-9/cp0/e15/q65/p720x720/96724875_3065146506903115_4237164853036318720_n.jpg?_nc_cat=103&ccb=1-3&_nc_sid=8024bb&efg=eyJpIjoidCJ9&_nc_ohc=SvpNqSK7ILIAX93ehWM&_nc_ht=scontent.fhlz2-1.fna&tp=3&oh=426e258c934177d9ded48435efaecc6c&oe=60E74054', 109 | 'images': [ 110 | 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-9/fr/cp0/e15/q65/96724875_3065146506903115_4237164853036318720_n.jpg?_nc_cat=103&ccb=1-3&_nc_sid=8024bb&efg=eyJpIjoidCJ9&_nc_ohc=SvpNqSK7ILIAX93ehWM&_nc_ht=scontent.fhlz2-1.fna&tp=14&oh=d32fa3269feeaf6904d78a512f41ab26&oe=60E673C5', 111 | 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-9/fr/cp0/e15/q65/96657922_3065146630236436_9052202957155598336_n.jpg?_nc_cat=101&ccb=1-3&_nc_sid=8024bb&efg=eyJpIjoidCJ9&_nc_ohc=MwI_Au5sC60AX93Dkix&tn=8omYOUODC-SvWcRg&_nc_ht=scontent.fhlz2-1.fna&tp=14&oh=607e4783ada8c14a5d0fe50eaed35b74&oe=60E41393', 112 | 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-9/fr/cp0/e15/q65/96557798_3065146790236420_838564679184809984_n.jpg?_nc_cat=103&ccb=1-3&_nc_sid=8024bb&efg=eyJpIjoidCJ9&_nc_ohc=ydkcrs8kPykAX_0Fdn4&_nc_ht=scontent.fhlz2-1.fna&tp=14&oh=7884c93d73b2a9f806baf829c8f941b0&oe=60E4D7FB', 113 | 
'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-9/fr/cp0/e15/q65/96688092_3065146896903076_7861539131082407936_n.jpg?_nc_cat=108&ccb=1-3&_nc_sid=8024bb&efg=eyJpIjoidCJ9&_nc_ohc=vqgGsFXTmO4AX82bX5z&_nc_ht=scontent.fhlz2-1.fna&tp=14&oh=379eb1c4551d74a13a4cafb07524288e&oe=60E6753F', 114 | ], 115 | 'images_description': [ 116 | 'No photo description available.', 117 | 'No photo description available.', 118 | 'No photo description available.', 119 | 'No photo description available.', 120 | ], 121 | 'images_lowquality': [ 122 | 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-9/cp0/e15/q65/p720x720/96724875_3065146506903115_4237164853036318720_n.jpg?_nc_cat=103&ccb=1-3&_nc_sid=8024bb&efg=eyJpIjoidCJ9&_nc_ohc=SvpNqSK7ILIAX93ehWM&_nc_ht=scontent.fhlz2-1.fna&tp=3&oh=426e258c934177d9ded48435efaecc6c&oe=60E74054', 123 | 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-0/cp0/e15/q65/s640x640/96657922_3065146630236436_9052202957155598336_n.jpg?_nc_cat=101&ccb=1-3&_nc_sid=8024bb&efg=eyJpIjoidCJ9&_nc_ohc=MwI_Au5sC60AX93Dkix&tn=8omYOUODC-SvWcRg&_nc_ht=scontent.fhlz2-1.fna&tp=9&oh=85385c57a98cbd698d746ddafc29a61c&oe=60E75F2D', 124 | 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-0/cp0/e15/q65/s640x640/96557798_3065146790236420_838564679184809984_n.jpg?_nc_cat=103&ccb=1-3&_nc_sid=8024bb&efg=eyJpIjoidCJ9&_nc_ohc=ydkcrs8kPykAX_0Fdn4&_nc_ht=scontent.fhlz2-1.fna&tp=9&oh=ca962fe95d846cbd6e4e78b0884572c9&oe=60E51308', 125 | 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-0/cp0/e15/q65/s640x640/96688092_3065146896903076_7861539131082407936_n.jpg?_nc_cat=108&ccb=1-3&_nc_sid=8024bb&efg=eyJpIjoidCJ9&_nc_ohc=vqgGsFXTmO4AX82bX5z&_nc_ht=scontent.fhlz2-1.fna&tp=9&oh=7e9da116d24a9faee2fe15c16d7dea8f&oe=60E3DD81', 126 | ], 127 | 'images_lowquality_description': [ 128 | 'No photo description available.', 129 | 'No photo description available.', 130 | 'No photo description available.', 131 | 'No photo description available.', 132 | ], 133 | 'is_live': False, 134 | 'likes': 1615, 135 | 'link': 'https://www.nintendo.com/wallpapers/?fbclid=IwAR3uYocTphYdr6YYAznKWWdMBZ-p_Id3uNTFJ3_3lHwjnL3H7rRIEvb8yY8', 136 | 'original_request_url': 3065154550235644, 137 | 'post_id': '3065154550235644', 138 | 'post_text': 'Check out these themed wallpapers and many more at the link ' 139 | 'below for your personal use! We hope you enjoy them!\n' 140 | '\n' 141 | 'https://www.nintendo.com/wallpapers/', 142 | 'post_url': 'https://facebook.com/story.php?story_fbid=3065154550235644&id=119240841493711', 143 | 'reaction_count': 2117, 144 | 'reactions': { 145 | 'angry': 3, 146 | 'care': 92, 147 | 'haha': 4, 148 | 'like': 1615, 149 | 'love': 381, 150 | 'wow': 22, 151 | }, 152 | 'reactors': [], 153 | 'shared_post_id': None, 154 | 'shared_post_url': None, 155 | 'shared_text': '', 156 | 'shared_time': None, 157 | 'shared_user_id': None, 158 | 'shared_username': None, 159 | 'shares': 281, 160 | 'text': 'Check out these themed wallpapers and many more at the link below ' 161 | 'for your personal use! 
We hope you enjoy them!\n' 162 | '\n' 163 | 'https://www.nintendo.com/wallpapers/', 164 | 'time': datetime.datetime(2020, 5, 13, 13, 1), 165 | 'user_id': '119240841493711', 166 | 'user_url': 'https://facebook.com/Nintendo/?refid=52&__tn__=C-R', 167 | 'username': 'Nintendo', 168 | 'video': None, 169 | 'video_duration_seconds': None, 170 | 'video_height': None, 171 | 'video_id': None, 172 | 'video_quality': None, 173 | 'video_size_MB': None, 174 | 'video_thumbnail': None, 175 | 'video_watches': None, 176 | 'video_width': None, 177 | 'w3_fb_url': 'https://www.facebook.com/story.php?story_fbid=3065154550235644&id=119240841493711', 178 | } 179 | 180 | post = next( 181 | get_posts(post_urls=[3065154550235644], extra_info=True, cookies="cookies.txt") 182 | ) 183 | 184 | fields_to_ignore = ["fetched_time", "link"] 185 | for field in fields_to_ignore: 186 | post.pop(field) # Do not check this field 187 | expected_post.pop(field) 188 | 189 | assert post == expected_post 190 | 191 | def test_get_posts_fields_presence(self): 192 | posts = list(get_posts(account='Nintendo', pages=2, extra_info=True)) 193 | 194 | assert len(posts) == 6 195 | 196 | for post in posts: 197 | assert 'post_id' in post 198 | assert 'text' in post 199 | assert 'time' in post 200 | assert 'image' in post 201 | assert 'video' in post 202 | assert 'likes' in post 203 | assert 'comments' in post 204 | assert 'shares' in post 205 | assert 'post_url' in post 206 | assert 'link' in post 207 | 208 | def test_get_posts_with_extra_info_fields_presence(self): 209 | posts = list( 210 | get_posts(account='Nintendo', pages=2, cookies="cookies.txt", extra_info=True) 211 | ) 212 | 213 | assert len(posts) == 6 214 | 215 | for post in posts: 216 | assert 'post_id' in post 217 | assert 'text' in post 218 | assert 'time' in post 219 | assert 'video' in post 220 | assert 'image' in post 221 | assert 'likes' in post 222 | assert 'comments' in post 223 | assert 'shares' in post 224 | assert 'post_url' in post 225 | assert 'link' in post 226 | assert 'shares' in post 227 | assert 'likes' in post 228 | assert 'reactions' in post 229 | assert 'comments' in post 230 | assert 'w3_fb_url' in post 231 | assert 'fetched_time' in post 232 | 233 | def test_smoketest(self): 234 | list(get_posts(account='Nintendo', pages=2)) 235 | 236 | 237 | @pytest.mark.vcr() 238 | class TestGetGroupPosts: 239 | def test_get_group_posts(self): 240 | expected_post = { 241 | 'available': True, 242 | 'comments': 1, 243 | 'comments_full': None, 244 | 'factcheck': None, 245 | 'image': None, 246 | 'image_id': None, 247 | 'image_ids': [], 248 | 'image_lowquality': None, 249 | 'images': [], 250 | 'images_description': [], 251 | 'images_lowquality': [], 252 | 'images_lowquality_description': [], 253 | 'is_live': False, 254 | 'likes': 32, 255 | 'link': None, 256 | 'post_id': '1629606003787605', 257 | 'post_text': 'Hola!, This group is aimed to create opportunities for South ' 258 | 'American students in Computer Science and related fields.\n' 259 | '\n' 260 | 'Hope this will help us to know what we are doing in our work, ' 261 | 'achievements to be recognized, increase fairness in our area, ' 262 | 'and maybe conferences where we might meet.\n' 263 | '\n' 264 | 'Professors and professionals are also welcomed to share their ' 265 | 'experiences and to collaborate among us and learn together.\n' 266 | '\n' 267 | 'Some short rules for a happy co-existence:\n' 268 | '1. No business advertisement or spam.\n' 269 | '2. 
Topics relevant to Computing, Computer Science, Software ' 270 | 'Engineering, and Education.\n' 271 | '3. Political and religious advertisement are not allowed.', 272 | 'post_url': 'https://m.facebook.com/groups/southamericansincomputing/permalink/1629606003787605/', 273 | 'reaction_count': None, 274 | 'reactions': None, 275 | 'reactors': None, 276 | 'shared_post_id': None, 277 | 'shared_post_url': None, 278 | 'shared_text': '', 279 | 'shared_time': None, 280 | 'shared_user_id': None, 281 | 'shared_username': None, 282 | 'shares': 0, 283 | 'text': 'Hola!, This group is aimed to create opportunities for South ' 284 | 'American students in Computer Science and related fields.\n' 285 | '\n' 286 | 'Hope this will help us to know what we are doing in our work, ' 287 | 'achievements to be recognized, increase fairness in our area, and ' 288 | 'maybe conferences where we might meet.\n' 289 | '\n' 290 | 'Professors and professionals are also welcomed to share their ' 291 | 'experiences and to collaborate among us and learn together.\n' 292 | '\n' 293 | 'Some short rules for a happy co-existence:\n' 294 | '1. No business advertisement or spam.\n' 295 | '2. Topics relevant to Computing, Computer Science, Software ' 296 | 'Engineering, and Education.\n' 297 | '3. Political and religious advertisement are not allowed.', 298 | 'time': datetime.datetime(2018, 4, 4, 8, 2, 42), 299 | 'user_id': 757122227, 300 | 'user_url': 'https://facebook.com/omarflorez?groupid=117507531664134&refid=18&_ft_=top_level_post_id.1629606003787605%3Acontent_owner_id_new.757122227%3Apage_id.117507531664134%3Astory_location.6%3Atds_flgs.3%3Aott.AX_xo0_Tl6A-u34K%3Apage_insights.%7B%22117507531664134%22%3A%7B%22page_id%22%3A117507531664134%2C%22page_id_type%22%3A%22group%22%2C%22actor_id%22%3A757122227%2C%22dm%22%3A%7B%22isShare%22%3A0%2C%22originalPostOwnerID%22%3A0%7D%2C%22psn%22%3A%22EntGroupDescriptionChangeCreationStory%22%2C%22post_context%22%3A%7B%22object_fbtype%22%3A657%2C%22publish_time%22%3A1522785762%2C%22story_name%22%3A%22EntGroupDescriptionChangeCreationStory%22%2C%22story_fbid%22%3A%5B1629606003787605%5D%7D%2C%22role%22%3A1%2C%22sl%22%3A6%7D%7D&__tn__=C-R', 301 | 'username': 'Omar U. 
Florez', 302 | 'video': None, 303 | 'video_duration_seconds': None, 304 | 'video_height': None, 305 | 'video_id': None, 306 | 'video_quality': None, 307 | 'video_size_MB': None, 308 | 'video_thumbnail': None, 309 | 'video_watches': None, 310 | 'video_width': None, 311 | 'w3_fb_url': None, 312 | } 313 | 314 | unset_cookies() 315 | post = next(get_posts(group=117507531664134)) 316 | 317 | assert post == expected_post 318 | 319 | # todo: add a case with requesting a group post with start_url=None 320 | 321 | def test_smoketest(self): 322 | list(get_posts(group=117507531664134, pages=2)) 323 | 324 | 325 | @pytest.mark.vcr() 326 | class TestGetPhotos: 327 | def test_smoketest(self): 328 | list(get_photos(account='Nintendo', pages=2)) 329 | -------------------------------------------------------------------------------- /tests/test_parse_date.py: -------------------------------------------------------------------------------- 1 | from facebook_scraper.utils import parse_datetime 2 | 3 | 4 | class TestParseDate: 5 | dates = [ 6 | 'Oct 1 at 1:00 PM', 7 | 'Oct 1 at 11:00 PM', 8 | 'Oct 16 at 1:00 PM', 9 | 'Oct 16 at 11:00 PM', 10 | 'October 1 at 1:00 PM', 11 | 'October 1 at 11:00 PM', 12 | 'October 16 at 1:00 PM', 13 | 'October 16 at 11:00 PM', 14 | 'October 1, 2019 at 1:00 PM', 15 | 'October 1, 2019 at 11:00 PM', 16 | 'October 16, 2019 at 1:00 PM', 17 | 'October 16, 2019 at 11:00 PM', 18 | 'Yesterday at 1:00 PM', 19 | 'Yesterday at 11:00 PM', 20 | 'Today at 1:00 PM', 21 | 'Today at 11:00 PM', 22 | 'Yesterday at 1:00 PM', 23 | 'Yesterday at 11:00 PM', 24 | 'Yesterday at 15:28', 25 | '7 November at 20:01', 26 | '1h', 27 | '16h', 28 | '1hrs', 29 | '16hrs', 30 | '1 hr', 31 | '16 hrs', 32 | '1 min', 33 | '50 mins', 34 | 'on Sat', 35 | '1 wk', 36 | '2 wks', 37 | '1 yr', 38 | '2 yrs', 39 | '1 mth', 40 | '4 mths', 41 | 'last Tue', 42 | 'last Mon', 43 | '11 mos', 44 | '1 mo', 45 | 'Just now', 46 | ] 47 | 48 | def test_all_dates(self): 49 | for date in self.dates: 50 | try: 51 | assert parse_datetime(date) is not None 52 | except AssertionError as e: 53 | print(f'Failed to parse {date}') 54 | raise e 55 | -------------------------------------------------------------------------------- /tests/test_parse_duration.py: -------------------------------------------------------------------------------- 1 | from facebook_scraper.utils import parse_duration 2 | 3 | 4 | class TestParseDuration: 5 | durations = ['T26S', 'T33M8S', 'T1H28M15S'] 6 | 7 | def test_all_durations(self): 8 | for duration in self.durations: 9 | try: 10 | assert parse_duration(duration) is not None 11 | except AssertionError as e: 12 | print(f'Failed to parse {duration}') 13 | raise e 14 | --------------------------------------------------------------------------------
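Taken together, the fixtures above exercise most of the public scraping API: get_posts with account, post_urls, or group (optionally with cookies and extra_info), and get_photos for photo listings. The sketch below simply strings those calls together for quick reference; it is not a file in the repository, and the account name, post/group IDs, and cookies.txt path are reused from the test fixtures purely for illustration. Unlike the VCR-backed tests, running it issues live requests to Facebook.

# Illustrative usage sketch (not part of the repository); identifiers below are
# reused from the test fixtures above purely for demonstration.
from facebook_scraper import get_posts, get_photos

# Page timeline: iterate the first two result pages, as in
# TestGetPosts.test_get_posts_fields_presence.
for post in get_posts(account='Nintendo', pages=2):
    print(post['post_id'], post['time'], post['likes'], post['post_url'])

# Single post by ID, as in TestGetPosts.test_get_posts.
post = next(get_posts(post_urls=[3065154550235644]))
print(post['text'])

# extra_info=True adds reaction breakdowns and w3_fb_url but needs cookies,
# as in TestGetPosts.test_get_posts_with_extra_info.
post = next(
    get_posts(post_urls=[3065154550235644], extra_info=True, cookies='cookies.txt')
)
print(post['reactions'])

# Group posts and photo listings follow the same generator pattern.
group_post = next(get_posts(group=117507531664134))
photos = list(get_photos(account='Nintendo', pages=2))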