├── .flake8 ├── .gitignore ├── LICENSE ├── README.md ├── facebook_scraper ├── __init__.py ├── __main__.py ├── constants.py ├── exceptions.py ├── extractors.py ├── facebook_scraper.py ├── fb_types.py ├── internal_classes.py ├── page_iterators.py └── utils.py ├── poetry.lock ├── pyproject.toml ├── requirements-dev.txt ├── requirements.txt └── tests ├── cassettes ├── TestGetGroupPosts.test_get_group_posts.yaml ├── TestGetGroupPosts.test_smoketest.yaml ├── TestGetPhotos.test_smoketest.yaml ├── TestGetPosts.test_get_posts.yaml ├── TestGetPosts.test_get_posts_fields_presence.yaml ├── TestGetPosts.test_get_posts_with_extra_info.yaml ├── TestGetPosts.test_get_posts_with_extra_info_fields_presence.yaml └── TestGetPosts.test_smoketest.yaml ├── manualTEst.py ├── test_get_posts.py ├── test_parse_date.py └── test_parse_duration.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 98 3 | ignore = 4 | E501 5 | W503 6 | per-file-ignores = 7 | utils.py:F401 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # IPython 79 | profile_default/ 80 | ipython_config.py 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # celery beat schedule file 86 | celerybeat-schedule 87 | 88 | # SageMath parsed files 89 | *.sage.py 90 | 91 | # Environments 92 | .env 93 | .venv 94 | env/ 95 | venv/ 96 | ENV/ 97 | env.bak/ 98 | venv.bak/ 99 | 100 | # PyCharm 101 | .idea 102 | 103 | # Spyder project settings 104 | .spyderproject 105 | .spyproject 106 | 107 | # Rope project settings 108 | .ropeproject 109 | 110 | # mkdocs documentation 111 | /site 112 | 113 | # mypy 114 | .mypy_cache/ 115 | .dmypy.json 116 | dmypy.json 117 | 118 | # Pyre type checker 119 | .pyre/ 120 | 121 | # vim 122 | *~ 123 | *.swp 124 | *.swo 125 | 126 | # VSCode 127 | .vscode 128 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Kevin Zúñiga 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Facebook Scraper 2 | 3 | [![PyPI download month](https://img.shields.io/pypi/dm/facebook-scraper.svg)](https://pypi.python.org/pypi/facebook-scraper/) 4 | [![PyPI download week](https://img.shields.io/pypi/dw/facebook-scraper.svg)](https://pypi.python.org/pypi/facebook-scraper/) 5 | [![PyPI download day](https://img.shields.io/pypi/dd/facebook-scraper.svg)](https://pypi.python.org/pypi/facebook-scraper/) 6 | 7 | [![PyPI version](https://img.shields.io/pypi/v/facebook-scraper?color=blue)](https://pypi.python.org/pypi/facebook-scraper/) 8 | [![PyPI pyversions](https://img.shields.io/pypi/pyversions/facebook-scraper.svg)](https://pypi.python.org/pypi/facebook-scraper/) 9 | [![GitHub commits since tagged version](https://img.shields.io/github/commits-since/kevinzg/facebook-scraper/v0.2.59)](https://github.com/kevinzg/facebook-scraper/commits/) 10 | 11 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) 12 | 13 | Scrape Facebook public pages without an API key. Inspired by [twitter-scraper](https://github.com/kennethreitz/twitter-scraper). 14 | 15 | 16 | ## Contributions 17 | We are moving a bit slowly on updates, so if you want to help, please check the [TODO](#to-do) section below. 18 | 19 | 20 | 21 | ## Install 22 | 23 | To install the latest release from PyPI (original version): 24 | 25 | ```sh 26 | pip install facebook-scraper 27 | ``` 28 | 29 | Or, to install the latest master branch of this repository: 30 | 31 | ```sh 32 | pip install git+https://github.com/moda20/facebook-scraper.git@master 33 | ``` 34 | 35 | Or, to force-update your installation after the branch has been updated: 36 | 37 | ```sh 38 | pip install --force-reinstall --no-deps git+https://github.com/moda20/facebook-scraper.git@master 39 | ``` 40 | 41 | And to add it to your requirements.txt manually: 42 | 43 | ``` 44 | facebook-scraper @ git+https://github.com/moda20/facebook-scraper.git@master 45 | ``` 46 | 47 | ## Usage 48 | 49 | To get everything running correctly, follow these steps: 50 | 51 | 1. Send the unique **page name, profile name, or ID** as the first parameter 2. Specify `base_url` and `start_url` to use the mbasic site instead 3. Load the mbasic headers you want to use from a file and inject them into the scraper. 54 | **You can get these headers by opening an example page in the developer tools and selecting a high-end device (such as a Samsung S20 Ultra). 55 | This will help with getting newer versions of posts and higher-fidelity images.** 56 | ```python 57 | >>> from facebook_scraper import get_posts, _scraper 58 | >>> import json 59 | >>> 60 | >>> with open('./mbasicHeaders.json', 'r') as file: 61 | ...     _scraper.mbasic_headers = json.load(file) 62 | ... 63 | >>> for post in get_posts('NintendoAmerica', base_url="https://mbasic.facebook.com", start_url="https://mbasic.facebook.com/NintendoAmerica?v=timeline", pages=1): 64 | ...     print(post['text'][:50]) 65 | ... 66 | The final step on the road to the Super Smash Bros 67 | We’re headed to PAX East 3/28-3/31 with new games 68 | ``` 69 | 70 | ### Optional parameters 71 | 72 | *(For the `get_posts` function)*. 73 | 74 | * **group**: group id, to scrape groups instead of pages. Default is `None`. 75 | * **pages**: how many pages of posts to request; the first 2 pages may have no results, so try a number greater than 2. Default is 10.
76 | * **timeout**: how many seconds to wait before timing out. Default is 30. 77 | * **credentials**: tuple of user and password to login before requesting the posts. Default is `None`. 78 | * **extra_info**: bool; if True, the function will try to do an extra request to get the post reactions. Default is False. 79 | * **youtube_dl**: bool, use Youtube-DL for (high-quality) video extraction. You need to have youtube-dl installed in your environment. Default is False. 80 | * **post_urls**: list, URLs or post IDs to extract posts from. Alternative to fetching based on username. 81 | * **cookies**: One of: 82 | - The path to a file containing cookies in Netscape or JSON format. You can extract cookies from your browser after logging into Facebook with an extension like [Get cookies.txt LOCALLY](https://chrome.google.com/webstore/detail/get-cookiestxt-locally/cclelndahbckbenkjhflpdbgdldlbecc) or [Cookie Quick Manager (Firefox)](https://addons.mozilla.org/en-US/firefox/addon/cookie-quick-manager/). Make sure that you include both the `c_user` cookie and the `xs` cookie; you will get an `InvalidCookies` exception if you don't. 83 | - A [CookieJar](https://docs.python.org/3.9/library/http.cookiejar.html#http.cookiejar.CookieJar) 84 | - A dictionary that can be converted to a CookieJar with [cookiejar_from_dict](https://2.python-requests.org/en/master/api/#requests.cookies.cookiejar_from_dict) 85 | - The string `"from_browser"` to try to extract Facebook cookies from your browser 86 | * **options**: Dictionary of options. 87 | * Set `options={"comments": True}` to extract comments. 88 | * Set `options={"reactors": True}` to extract the people reacting to the post. 89 | * Set `options={"reactions": True}` to extract the reactions of the post. Similar to `reactors`, but only extracts reactions and not the people who reacted, making only one request per post. 90 | * Both `comments` and `reactors` can also be set to a number to limit the number of comments/reactors to retrieve. 91 | * Set `options={"progress": True}` to get a `tqdm` progress bar while extracting comments and replies. 92 | * Set `options={"allow_extra_requests": False}` to disable making extra requests when extracting post data (required for some things like full text and image links). 93 | * Set `options={"posts_per_page": 200}` to request 200 posts per page. The default is 4. 94 | * Set `options={"image_hop_timeout": 2}` to delay image cycling by n seconds; this is useful to avoid pinging Facebook too frequently. 95 | * Set `options={"HQ_images_max_count": 2}` to limit the maximum number of returned images. 96 | * Set `options={"whitelist_methods": []}` to extract only specific sections of a post; this is useful to avoid spending requests on data you don't need. The available methods are listed below, followed by a usage sketch. 97 | 98 | | method name | description | 99 | |---------------------------|---------------------------------------------------------------------------| 100 | | extract_post_url | will try to extract the unique post URL | 101 | | extract_post_id | will try to extract the unique post_id | 102 | | extract_text | will try to extract the post's text and full text if needed | 103 | | extract_time | will try to extract the post's publishing timestamp | 104 | | extract_photo_link | will try to extract the post's photos, including HQ photos | 105 | | extract_image_lq* | will try to extract low quality images for posts | 106 | | extract_comments | will try to extract comments of a post, if enabled in options | 107 | | extract_shares | will try to extract shares of a post, if enabled in options | 108 | | extract_links | will try to extract links of a post | 109 | | extract_user_id | will try to extract the posting user's id, which can differ from page_id | 110 | | extract_username | will try to extract the poster's username | 111 | | extract_video | will try to extract the video link of a post | 112 | | extract_video_thumbnail | will try to extract the video thumbnail of a post | 113 | | extract_video_id | will try to extract the video's id from a post | 114 | | extract_video_meta | will try to extract the metadata of a video from a post | 115 | | extract_is_live | will try to extract whether a post's video was live or not | 116 | | extract_factcheck | will try to extract whether a post is fact checked or not | 117 | | extract_share_information | will try to extract sharing info (count) from a post | 118 | | extract_availability | will try to extract whether a post is available or not (in case of a 404) | 119 | | extract_listing | will try to extract a marketplace listing if found | 120 | | extract_with | will try to extract tagged accounts in a post ("user is with xxxxx") | 121 |
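The snippet below is a minimal sketch of how these parameters can be combined; the page name, cookie file, and option values are illustrative placeholders rather than recommended settings, and only options documented above are used.

```python
from facebook_scraper import get_posts

# Hypothetical example: scrape a public page with cookies and a custom options dictionary.
for post in get_posts(
    'NintendoAmerica',              # placeholder page name
    cookies='cookies.txt',          # Netscape/JSON file, CookieJar, dict, or "from_browser"
    pages=3,                        # number of pages to request
    options={
        "comments": 10,             # fetch at most 10 comments per post
        "reactors": True,           # also fetch the people who reacted
        "posts_per_page": 20,       # ask for 20 posts per page instead of the default 4
        "whitelist_methods": [      # only run these extraction methods
            "extract_post_id",
            "extract_text",
            "extract_time",
        ],
    },
):
    print(post['post_id'], post['text'][:50])
```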
122 | 123 | 124 | 125 | ## CLI usage 126 | 127 | ```sh 128 | $ facebook-scraper --filename nintendo_page_posts.csv --pages 10 nintendo 129 | ``` 130 | 131 | Run `facebook-scraper --help` for more details on CLI usage. 132 | 133 | **Note:** If you get a `UnicodeEncodeError`, try adding `--encoding utf-8`. 134 | 135 | ### Practical example: download comments of a post 136 | 137 | ```python 138 | """ 139 | Download comments for a public Facebook post. 140 | """ 141 | 142 | import facebook_scraper as fs 143 | 144 | # get POST_ID from the URL of the post, which can have the following structure: 145 | # https://mbasic.facebook.com/USER/posts/POST_ID 146 | # https://mbasic.facebook.com/groups/GROUP_ID/posts/POST_ID 147 | POST_ID = "https://mbasic.facebook.com/USER/posts/POST_ID" 148 | 149 | # number of comments to download -- set this to True to download all comments 150 | MAX_COMMENTS = 100 151 | 152 | # get the post (this gives a generator) 153 | gen = fs.get_posts( 154 | post_urls=[POST_ID], 155 | options={"comments": MAX_COMMENTS, "progress": True} 156 | ) 157 | 158 | # take the 1st element of the generator, which is the post we requested 159 | post = next(gen) 160 | 161 | # extract the comments part 162 | comments = post['comments_full'] 163 | 164 | # process comments as you want... 165 | for comment in comments: 166 | 167 | # e.g. ...print them 168 | print(comment) 169 | 170 | # e.g.
...get the replies for them 171 | for reply in comment['replies']: 172 | print(' ', reply) 173 | ``` 174 | 175 | ## Post example 176 | 177 | ```python 178 | {'available': True, 179 | 'comments': 459, 180 | 'comments_full': None, 181 | 'factcheck': None, 182 | 'fetched_time': datetime.datetime(2021, 4, 20, 13, 39, 53, 651417), 183 | 'image': 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-9/fr/cp0/e15/q65/58745049_2257182057699568_1761478225390731264_n.jpg?_nc_cat=111&ccb=1-3&_nc_sid=8024bb&_nc_ohc=ygH2fPmfQpAAX92ABYY&_nc_ht=scontent.fhlz2-1.fna&tp=14&oh=7a8a7b4904deb55ec696ae255fff97dd&oe=60A36717', 184 | 'images': ['https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-9/fr/cp0/e15/q65/58745049_2257182057699568_1761478225390731264_n.jpg?_nc_cat=111&ccb=1-3&_nc_sid=8024bb&_nc_ohc=ygH2fPmfQpAAX92ABYY&_nc_ht=scontent.fhlz2-1.fna&tp=14&oh=7a8a7b4904deb55ec696ae255fff97dd&oe=60A36717'], 185 | 'is_live': False, 186 | 'likes': 3509, 187 | 'link': 'https://www.nintendo.com/amiibo/line-up/', 188 | 'post_id': '2257188721032235', 189 | 'post_text': 'Don’t let this diminutive version of the Hero of Time fool you, ' 190 | 'Young Link is just as heroic as his fully grown version! Young ' 191 | 'Link joins the Super Smash Bros. series of amiibo figures!\n' 192 | '\n' 193 | 'https://www.nintendo.com/amiibo/line-up/', 194 | 'post_url': 'https://facebook.com/story.php?story_fbid=2257188721032235&id=119240841493711', 195 | 'reactions': {'haha': 22, 'like': 2657, 'love': 706, 'sorry': 1, 'wow': 123}, # if `extra_info` was set 196 | 'reactors': None, 197 | 'shared_post_id': None, 198 | 'shared_post_url': None, 199 | 'shared_text': '', 200 | 'shared_time': None, 201 | 'shared_user_id': None, 202 | 'shared_username': None, 203 | 'shares': 441, 204 | 'text': 'Don’t let this diminutive version of the Hero of Time fool you, ' 205 | 'Young Link is just as heroic as his fully grown version! Young Link ' 206 | 'joins the Super Smash Bros. series of amiibo figures!\n' 207 | '\n' 208 | 'https://www.nintendo.com/amiibo/line-up/', 209 | 'time': datetime.datetime(2019, 4, 30, 5, 0, 1), 210 | 'full_text':'Don’t let this diminutive version of the Hero of Time fool you, ' 211 | 'Young Link is just as heroic as his fully grown version! Young Link ' 212 | 'joins the Super Smash Bros. series of amiibo figures!', # !! This will only be present if the post_text and text is truncated 213 | 'user_id': '119240841493711', 214 | 'username': 'Nintendo', 215 | 'video': None, 216 | 'video_id': None, 217 | 'video_thumbnail': None, 218 | 'w3_fb_url': 'https://www.facebook.com/Nintendo/posts/2257188721032235'} 219 | ``` 220 | 221 | ### Notes 222 | 223 | - There is no guarantee that every field will be extracted (they might be `None`). 224 | - Group posts may be missing some fields like `time` and `post_url`. 225 | - Group scraping may return only one page and not work on private groups. 226 | - If you scrape too much, Facebook might temporarily ban your IP. 227 | - The vast majority of unique IDs on facebook (post IDs, video IDs, photo IDs, comment IDs, profile IDs, etc) can be appended to "https://www.facebook.com/" to result in a redirect to the corresponding object. 228 | - Some functions (such as extracting reactions) require you to be logged into Facebook (pass cookies). If something isn't working as expected, try pass cookies and see if that fixes it. 
229 | - Reaction Categories (EN): [`like`, `love`, `haha`, `sorry`, `wow`, `angry`, `care`] 230 | 231 | ## Comment & Reply example 232 | ```python 233 | {'comment_id': '1417925635669547', 234 | 'comment_url': 'https://facebook.com/1417925635669547', 235 | 'commenter_id': '100009665948953', 236 | 'commenter_url': 'https://facebook.com/tw0311?eav=AfZuEAOAat6KRX5WFplL0SNA4ZW78Z2O7W_sjwMApq67hZxXDwXh2WF2ezhICX1LCT4&fref=nf&rc=p&refid=52&__tn__=R&paipv=0', 237 | 'commenter_name': 'someone', 238 | 'commenter_meta': None, 239 | 'comment_text': 'something', 240 | 'comment_time': datetime.datetime(2023, 6, 23, 0, 0), 241 | 'comment_image': 'https://scontent.ftpe8-2.fna.fbcdn.net/m1/v/t6/An_UvxJXg9tdnLU3Y5qjPi0200MLilhzPXUgxzGjQzUMaNcmjdZA6anyrngvkdub33NZzZhd51fpCAEzNHFhko5aKRFP5fS1w_lKwYrzcNLupv27.png?ccb=10-5&oh=00_AfCdlpCwAg-SHhniMQ16uElFHh-OG8kGGmLAzvOY5_WZgw&oe=64BE3279&_nc_sid=7da55a', 242 | 'comment_reactors': [ 243 | {'name': 'Tom', 'link': 'https://facebook.com/ryan.dwayne?eav=AfaxdKIITTXyZj4H-eanXQgoxzOa8Vag6XkGXXDisGzh_W74RYZSXxlFZBofR4jUIOg&fref=pb&paipv=0', 'type': 'like'}, 244 | {'name': 'Macy', 'link': 'https://facebook.com/profile.php?id=100000112053053&eav=AfZ5iWlNN-EjjSwVNQl7E2HiVp25AUZMqfoPvLRZGnbUAQxuLeN8nl6xnnQTJB3uxDM&fref=pb&paipv=0', 'type': 'like'}], 245 | 'comment_reactions': {'like': 2}, 246 | 'comment_reaction_count': 2, 247 | 'replies': [ 248 | {'comment_id': '793761608817229', 249 | 'comment_url': 'https://facebook.com/793761608817229', 250 | 'commenter_id': '100022377272712', 251 | 'commenter_url': 'https://facebook.com/brizanne.torres?eav=Afab9uP4ByIMn1xaYK0UDd1SRU8e5Zu7faKEx6qTzLKD2vp_bB1xLDGvTwEd6u8A7jY&fref=nf&rc=p&__tn__=R&paipv=0', 252 | 'commenter_name': 'David', 253 | 'commenter_meta': None, 254 | 'comment_text': 'something', 255 | 'comment_time': datetime.datetime(2023, 6, 23, 18, 0), 256 | 'comment_image': None, 257 | 'comment_reactors': [], 258 | 'comment_reactions': {'love': 2}, 259 | 'comment_reaction_count': None} 260 | ] 261 | } 262 | ``` 263 | 264 | 265 | ## Profiles 266 | 267 | The `get_profile` function can extract information from a profile's about section. Pass in the account name or ID as the first parameter. 268 | Note that Facebook serves different information depending on whether you're logged in (cookies parameter), such as Date of birth and Gender. Usage: 269 | 270 | ```python 271 | from facebook_scraper import get_profile 272 | get_profile("zuck") # Or get_profile("zuck", cookies="cookies.txt") 273 | ``` 274 | 275 | Outputs: 276 | 277 | ```python 278 | {'About': "I'm trying to make the world a more open place.", 279 | 'Education': 'Harvard University\n' 280 | 'Computer Science and Psychology\n' 281 | '30 August 2002 - 30 April 2004\n' 282 | 'Phillips Exeter Academy\n' 283 | 'Classics\n' 284 | 'School year 2002\n' 285 | 'Ardsley High School\n' 286 | 'High School\n' 287 | 'September 1998 - June 2000', 288 | 'Favourite Quotes': '"Fortune favors the bold."\n' 289 | '- Virgil, Aeneid X.284\n' 290 | '\n' 291 | '"All children are artists. 
The problem is how to remain ' 292 | 'an artist once you grow up."\n' 293 | '- Pablo Picasso\n' 294 | '\n' 295 | '"Make things as simple as possible but no simpler."\n' 296 | '- Albert Einstein', 297 | 'Name': 'Mark Zuckerberg', 298 | 'Places lived': [{'link': '/profile.php?id=104022926303756&refid=17', 299 | 'text': 'Palo Alto, California', 300 | 'type': 'Current town/city'}, 301 | {'link': '/profile.php?id=105506396148790&refid=17', 302 | 'text': 'Dobbs Ferry, New York', 303 | 'type': 'Home town'}], 304 | 'Work': 'Chan Zuckerberg Initiative\n' 305 | '1 December 2015 - Present\n' 306 | 'Facebook\n' 307 | 'Founder and CEO\n' 308 | '4 February 2004 - Present\n' 309 | 'Palo Alto, California\n' 310 | 'Bringing the world closer together.'} 311 | ``` 312 | 313 | To extract friends, pass the argument `friends=True`, or to limit the amount of friends retrieved, set `friends` to the desired number. 314 | 315 | ## Group info 316 | 317 | The `get_group_info` function can extract info about a group. Pass in the group name or ID as the first parameter. 318 | Note that in order to see the list of admins, you need to be logged in (cookies parameter). 319 | 320 | Usage: 321 | 322 | ```python 323 | from facebook_scraper import get_group_info 324 | get_group_info("makeupartistsgroup") # or get_group_info("makeupartistsgroup", cookies="cookies.txt") 325 | ``` 326 | 327 | Output: 328 | 329 | ```python 330 | {'admins': [{'link': '/africanstylemagazinecom/?refid=18', 331 | 'name': 'African Style Magazine'}, 332 | {'link': '/connectfluencer/?refid=18', 333 | 'name': 'Everythingbrightandbeautiful'}, 334 | {'link': '/Kaakakigroup/?refid=18', 'name': 'Kaakaki Group'}, 335 | {'link': '/opentohelp/?refid=18', 'name': 'Open to Help'}], 336 | 'id': '579169815767106', 337 | 'members': 6814229, 338 | 'name': 'HAIRSTYLES', 339 | 'type': 'Public group'} 340 | ``` 341 | 342 | ## Write to a CSV file directly 343 | 344 | The library also provides a `write_posts_to_csv()` function that writes posts directly to the disk and is able to resume scraping from the address of the last page. It is very useful when scraping large pages as the data is saved continuously and scraping can be resumed in case of an error. Here is an example to fetch the posts of a group 100 pages at a time and save them in separate files. 345 | 346 | ```python 347 | import facebook_scraper as fs 348 | 349 | # Saves the first 100 pages 350 | for i in range(1, 101): 351 | fs.write_posts_to_csv( 352 | group=GROUP_ID, # The method uses get_posts internally so you can use the same arguments and they will be passed along 353 | page_limit=100, 354 | timeout=60, 355 | options={ 356 | 'allow_extra_requests': False 357 | }, 358 | filename=f'./data/messages_{i}.csv', # Will throw an error if the file already exists 359 | resume_file='next_page.txt', # Will save a link to the next page in this file after fetching it and use it when starting. 
360 | matching='.+', # A regex can be used to filter all the posts matching a certain pattern (here, we accept anything) 361 | not_matching='^Warning', # And likewise those that don't fit a pattern (here, we filter out all posts starting with "Warning") 362 | keys=[ 363 | 'post_id', 364 | 'text', 365 | 'timestamp', 366 | 'time', 367 | 'user_id' 368 | ], # List of the keys that should be saved for each post, will save all keys if not set 369 | format='csv', # Output file format, can be csv or json, defaults to csv 370 | days_limit=3650 # Number of days for the oldest post to fetch, defaults to 3650 371 | ) 372 | 373 | ``` 374 | 375 | 376 | ## Funny Graphics 377 | 378 | [![Star History Chart](https://api.star-history.com/svg?repos=moda20/facebook-scraper&type=Date)](https://star-history.com/#moda20/facebook-scraper&Date) 379 | ## To-Do 380 | 381 | - CLI update to work with the latest script updates (NEEDS HELP) 382 | - Async support 383 | - ~~Image galleries~~ (`images` entry) 384 | - ~~Profiles or post authors~~ (`get_profile()`) 385 | - ~~Comments~~ (with `options={'comments': True}`) 386 | 387 | ## Alternatives and related projects 388 | 389 | - [facebook-post-scraper](https://github.com/brutalsavage/facebook-post-scraper). Has comments. Uses Selenium. 390 | - [facebook-scraper-selenium](https://github.com/apurvmishra99/facebook-scraper-selenium). "Scrape posts from any group or user into a .csv file without needing to register for any API access". 391 | - [Ultimate Facebook Scraper](https://github.com/harismuneer/Ultimate-Facebook-Scraper). "Scrapes almost everything about a Facebook user's profile". Uses Selenium. 392 | - [Unofficial APIs](https://github.com/Rolstenhouse/unofficial-apis). List of unofficial APIs for various services, none for Facebook for now, but might be worth to check in the future. 393 | - [major-scrapy-spiders](https://github.com/talhashraf/major-scrapy-spiders). Has a profile spider for Scrapy. 394 | - [facebook-page-post-scraper](https://github.com/minimaxir/facebook-page-post-scraper). Seems abandoned. 395 | - [FBLYZE](https://github.com/isaacmg/fb_scraper). Fork (?). 396 | - [RSSHub](https://github.com/DIYgod/RSSHub/blob/master/lib/routes/facebook/page.js). Generates an RSS feed from Facebook pages. 397 | - [RSS-Bridge](https://github.com/RSS-Bridge/rss-bridge/blob/master/bridges/FacebookBridge.php). Also generates RSS feeds from Facebook pages. 398 | -------------------------------------------------------------------------------- /facebook_scraper/__init__.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | import locale 4 | import logging 5 | import pathlib 6 | import sys 7 | import warnings 8 | import pickle 9 | from typing import Any, Dict, Iterator, Optional, Set, Union 10 | 11 | from requests.cookies import cookiejar_from_dict 12 | 13 | from .constants import DEFAULT_REQUESTS_TIMEOUT, DEFAULT_COOKIES_FILE_PATH 14 | from .facebook_scraper import FacebookScraper 15 | from .fb_types import Credentials, Post, RawPost, Profile 16 | from .utils import html_element_to_string, parse_cookie_file 17 | from . 
import exceptions 18 | import traceback 19 | import time 20 | from datetime import datetime, timedelta 21 | import re 22 | import os 23 | 24 | 25 | _scraper = FacebookScraper() 26 | 27 | 28 | def set_cookies(cookies): 29 | if isinstance(cookies, str): 30 | if cookies == "from_browser": 31 | try: 32 | import browser_cookie3 33 | 34 | cookies = browser_cookie3.load(domain_name='.facebook.com') 35 | except: 36 | raise ModuleNotFoundError( 37 | "browser_cookie3 must be installed to use browser cookies" 38 | ) 39 | else: 40 | try: 41 | cookies = parse_cookie_file(cookies) 42 | except ValueError as e: 43 | raise exceptions.InvalidCookies(f"Cookies are in an invalid format: {e}") 44 | elif isinstance(cookies, dict): 45 | cookies = cookiejar_from_dict(cookies) 46 | if cookies is not None: 47 | cookie_names = [c.name for c in cookies] 48 | missing_cookies = [c for c in ['c_user', 'xs'] if c not in cookie_names] 49 | if missing_cookies: 50 | raise exceptions.InvalidCookies(f"Missing cookies with name(s): {missing_cookies}") 51 | _scraper.session.cookies.update(cookies) 52 | if not _scraper.is_logged_in(): 53 | raise exceptions.InvalidCookies(f"Cookies are not valid") 54 | 55 | 56 | def unset_cookies(): 57 | # Explicitly unset cookies to return to unauthenticated requests 58 | _scraper.session.cookies = cookiejar_from_dict({}) 59 | 60 | 61 | def set_proxy(proxy, verify=True): 62 | _scraper.set_proxy(proxy, verify) 63 | 64 | 65 | def set_user_agent(user_agent): 66 | _scraper.set_user_agent(user_agent) 67 | 68 | 69 | def set_noscript(noscript): 70 | _scraper.set_noscript(noscript) 71 | 72 | 73 | def get_profile( 74 | account: str, 75 | **kwargs, 76 | ) -> Profile: 77 | """Get a Facebook user's profile information 78 | Args: 79 | account(str): The account of the profile. 80 | cookies (Union[dict, CookieJar, str]): Cookie jar to use. 81 | Can also be a filename to load the cookies from a file (Netscape format). 82 | """ 83 | _scraper.requests_kwargs['timeout'] = kwargs.pop('timeout', DEFAULT_REQUESTS_TIMEOUT) 84 | cookies = kwargs.pop('cookies', None) 85 | set_cookies(cookies) 86 | return _scraper.get_profile(account, **kwargs) 87 | 88 | 89 | def get_reactors( 90 | post_id: Union[str, int], 91 | **kwargs, 92 | ) -> Iterator[dict]: 93 | """Get reactors for a given post ID 94 | Args: 95 | post_id(str): The post ID, as returned from get_posts 96 | cookies (Union[dict, CookieJar, str]): Cookie jar to use. 97 | Can also be a filename to load the cookies from a file (Netscape format). 98 | """ 99 | _scraper.requests_kwargs['timeout'] = kwargs.pop('timeout', DEFAULT_REQUESTS_TIMEOUT) 100 | cookies = kwargs.pop('cookies', None) 101 | set_cookies(cookies) 102 | return _scraper.get_reactors(post_id, **kwargs) 103 | 104 | 105 | def get_friends( 106 | account: str, 107 | **kwargs, 108 | ) -> Iterator[Profile]: 109 | """Get a Facebook user's friends 110 | Args: 111 | account(str): The account of the profile. 112 | cookies (Union[dict, CookieJar, str]): Cookie jar to use. 113 | Can also be a filename to load the cookies from a file (Netscape format). 114 | """ 115 | _scraper.requests_kwargs['timeout'] = kwargs.pop('timeout', DEFAULT_REQUESTS_TIMEOUT) 116 | cookies = kwargs.pop('cookies', None) 117 | set_cookies(cookies) 118 | return _scraper.get_friends(account, **kwargs) 119 | 120 | 121 | def get_page_info(account: str, **kwargs) -> Profile: 122 | """Get a page's information 123 | Args: 124 | account(str): The account of the profile. 125 | cookies (Union[dict, CookieJar, str]): Cookie jar to use. 
126 | Can also be a filename to load the cookies from a file (Netscape format). 127 | """ 128 | _scraper.requests_kwargs['timeout'] = kwargs.pop('timeout', DEFAULT_REQUESTS_TIMEOUT) 129 | cookies = kwargs.pop('cookies', None) 130 | set_cookies(cookies) 131 | return _scraper.get_page_info(account, **kwargs) 132 | 133 | 134 | def get_group_info(group: Union[str, int], **kwargs) -> Profile: 135 | """Get a group's profile information 136 | Args: 137 | group(str or int): The group name or ID 138 | cookies (Union[dict, CookieJar, str]): Cookie jar to use. 139 | Can also be a filename to load the cookies from a file (Netscape format). 140 | """ 141 | _scraper.requests_kwargs['timeout'] = kwargs.pop('timeout', DEFAULT_REQUESTS_TIMEOUT) 142 | cookies = kwargs.pop('cookies', None) 143 | set_cookies(cookies) 144 | return _scraper.get_group_info(group, **kwargs) 145 | 146 | 147 | def get_shop(account: str, **kwargs) -> Iterator[Post]: 148 | """Get a page's shop listings 149 | Args: 150 | account(str): The account of the profile. 151 | cookies (Union[dict, CookieJar, str]): Cookie jar to use. 152 | Can also be a filename to load the cookies from a file (Netscape format). 153 | """ 154 | _scraper.requests_kwargs['timeout'] = kwargs.pop('timeout', DEFAULT_REQUESTS_TIMEOUT) 155 | cookies = kwargs.pop('cookies', None) 156 | set_cookies(cookies) 157 | return _scraper.get_shop(account, **kwargs) 158 | 159 | 160 | def get_posts( 161 | account: Optional[str] = None, 162 | group: Union[str, int, None] = None, 163 | post_urls: Optional[Iterator[str]] = None, 164 | hashtag: Optional[str] = None, 165 | credentials: Optional[Credentials] = None, 166 | **kwargs, 167 | ) -> Iterator[Post]: 168 | """Get posts from a Facebook page or group. 169 | 170 | Args: 171 | account (str): The account of the page. 172 | group (int): The group id. 173 | post_urls ([str]): List of manually specified post URLs. 174 | credentials (Optional[Tuple[str, str]]): Tuple of email and password to login before scraping. 175 | timeout (int): Timeout for requests. 176 | page_limit (int): How many pages of posts to go through. 177 | Use None to try to get all of them. 178 | extra_info (bool): Set to True to try to get reactions. 179 | youtube_dl (bool): Use Youtube-DL for video extraction. 180 | cookies (Union[dict, CookieJar, str]): Cookie jar to use. 181 | Can also be a filename to load the cookies from a file (Netscape format). 182 | 183 | Yields: 184 | dict: The post representation in a dictionary. 
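        Example: a minimal sketch of typical usage (the page name 'nintendo' is just a placeholder):
            >>> for post in get_posts('nintendo', page_limit=1):
            ...     print(post['post_id'], post['text'][:50])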
185 | """ 186 | valid_args = sum(arg is not None for arg in (account, group, post_urls, hashtag)) 187 | 188 | if valid_args != 1: 189 | raise ValueError("You need to specify either account, group, or post_urls") 190 | 191 | _scraper.requests_kwargs['timeout'] = kwargs.pop('timeout', DEFAULT_REQUESTS_TIMEOUT) 192 | 193 | cookies = kwargs.pop('cookies', None) 194 | 195 | if cookies is not None and credentials is not None: 196 | raise ValueError("Can't use cookies and credentials arguments at the same time") 197 | set_cookies(cookies) 198 | 199 | options: Union[Dict[str, Any], Set[str]] = kwargs.setdefault('options', {}) 200 | if isinstance(options, set): 201 | warnings.warn("The options argument should be a dictionary.", stacklevel=2) 202 | options = {k: True for k in options} 203 | options.setdefault('account', account) 204 | 205 | # TODO: Add a better throttling mechanism 206 | if 'sleep' in kwargs: 207 | warnings.warn( 208 | "The sleep parameter has been removed, it won't have any effect.", stacklevel=2 209 | ) 210 | kwargs.pop('sleep') 211 | 212 | # TODO: Deprecate `pages` in favor of `page_limit` since it is less confusing 213 | if 'pages' in kwargs: 214 | kwargs['page_limit'] = kwargs.pop('pages') 215 | 216 | # TODO: Deprecate `extra_info` in favor of `options` 217 | if "reactions" not in options: 218 | options['reactions'] = kwargs.pop('extra_info', False) 219 | options['youtube_dl'] = kwargs.pop('youtube_dl', False) 220 | 221 | if credentials is not None: 222 | _scraper.login(*credentials) 223 | 224 | if account is not None: 225 | return _scraper.get_posts(account, **kwargs) 226 | 227 | elif group is not None: 228 | return _scraper.get_group_posts(group, **kwargs) 229 | 230 | elif hashtag is not None: 231 | return _scraper.get_posts_by_hashtag(hashtag, **kwargs) 232 | 233 | elif post_urls is not None: 234 | return _scraper.get_posts_by_url(post_urls, **kwargs) 235 | 236 | raise ValueError('No account nor group') 237 | 238 | 239 | def get_photos( 240 | account: str, 241 | credentials: Optional[Credentials] = None, 242 | **kwargs, 243 | ) -> Iterator[Post]: 244 | """Get photo posts from a Facebook page. 245 | 246 | Args: 247 | account (str): The account of the page. 248 | credentials (Optional[Tuple[str, str]]): Tuple of email and password to login before scraping. 249 | timeout (int): Timeout for requests. 250 | page_limit (int): How many pages of posts to go through. 251 | Use None to try to get all of them. 252 | extra_info (bool): Set to True to try to get reactions. 253 | youtube_dl (bool): Use Youtube-DL for video extraction. 254 | cookies (Union[dict, CookieJar, str]): Cookie jar to use. 255 | Can also be a filename to load the cookies from a file (Netscape format). 256 | 257 | Yields: 258 | dict: The post representation in a dictionary. 
259 | """ 260 | if account is None: 261 | raise ValueError("You need to specify account") 262 | 263 | _scraper.requests_kwargs['timeout'] = kwargs.pop('timeout', DEFAULT_REQUESTS_TIMEOUT) 264 | 265 | cookies = kwargs.pop('cookies', None) 266 | 267 | if cookies is not None and credentials is not None: 268 | raise ValueError("Can't use cookies and credentials arguments at the same time") 269 | set_cookies(cookies) 270 | 271 | options: Union[Dict[str, Any], Set[str]] = kwargs.setdefault('options', {}) 272 | if isinstance(options, set): 273 | warnings.warn("The options argument should be a dictionary.", stacklevel=2) 274 | options = {k: True for k in options} 275 | options.setdefault('account', account) 276 | 277 | # TODO: Add a better throttling mechanism 278 | if 'sleep' in kwargs: 279 | warnings.warn( 280 | "The sleep parameter has been removed, it won't have any effect.", stacklevel=2 281 | ) 282 | kwargs.pop('sleep') 283 | 284 | # TODO: Deprecate `pages` in favor of `page_limit` since it is less confusing 285 | if 'pages' in kwargs: 286 | kwargs['page_limit'] = kwargs.pop('pages') 287 | 288 | # TODO: Deprecate `extra_info` in favor of `options` 289 | options['reactions'] = kwargs.pop('extra_info', False) 290 | options['youtube_dl'] = kwargs.pop('youtube_dl', False) 291 | 292 | if credentials is not None: 293 | _scraper.login(*credentials) 294 | 295 | return _scraper.get_photos(account, **kwargs) 296 | 297 | 298 | def get_posts_by_search( 299 | word: str, 300 | credentials: Optional[Credentials] = None, 301 | **kwargs, 302 | ) -> Iterator[Post]: 303 | """Get posts by searching all of Facebook 304 | Args: 305 | word (str): The word for searching posts. 306 | credentials (Optional[Tuple[str, str]]): Tuple of email and password to login before scraping. 307 | timeout (int): Timeout for requests. 308 | page_limit (int): How many pages of posts to go through. 309 | Use None to try to get all of them. 310 | extra_info (bool): Set to True to try to get reactions. 311 | youtube_dl (bool): Use Youtube-DL for video extraction. 312 | cookies (Union[dict, CookieJar, str]): Cookie jar to use. 313 | Can also be a filename to load the cookies from a file (Netscape format). 314 | 315 | Yields: 316 | dict: The post representation in a dictionary. 
317 | """ 318 | if not word: 319 | raise ValueError("You need to specify word") 320 | 321 | _scraper.requests_kwargs['timeout'] = kwargs.pop('timeout', DEFAULT_REQUESTS_TIMEOUT) 322 | 323 | cookies = kwargs.pop('cookies', None) 324 | 325 | if cookies is not None and credentials is not None: 326 | raise ValueError("Can't use cookies and credentials arguments at the same time") 327 | set_cookies(cookies) 328 | 329 | options: Union[Dict[str, Any], Set[str]] = kwargs.setdefault('options', {}) 330 | if isinstance(options, set): 331 | warnings.warn("The options argument should be a dictionary.", stacklevel=2) 332 | options = {k: True for k in options} 333 | 334 | options.setdefault('word', word) 335 | 336 | # TODO: Add a better throttling mechanism 337 | if 'sleep' in kwargs: 338 | warnings.warn( 339 | "The sleep parameter has been removed, it won't have any effect.", stacklevel=2 340 | ) 341 | kwargs.pop('sleep') 342 | 343 | # TODO: Deprecate `pages` in favor of `page_limit` since it is less confusing 344 | if 'pages' in kwargs: 345 | kwargs['page_limit'] = kwargs.pop('pages') 346 | 347 | # TODO: Deprecate `extra_info` in favor of `options` 348 | if "reactions" not in options: 349 | options['reactions'] = kwargs.pop('extra_info', False) 350 | options['youtube_dl'] = kwargs.pop('youtube_dl', False) 351 | 352 | if credentials is not None: 353 | _scraper.login(*credentials) 354 | 355 | if word is not None: 356 | return _scraper.get_posts_by_search(word, **kwargs) 357 | 358 | raise ValueError('No account nor group') 359 | 360 | 361 | def write_post_to_disk(post: Post, source: RawPost, location: pathlib.Path): 362 | post_id = post['post_id'] 363 | filename = f'{post_id}.html' 364 | 365 | logger.debug("Writing post %s", post_id) 366 | with open(location.joinpath(filename), mode='wt') as f: 367 | f.write('\n') 370 | f.write(html_element_to_string(source, pretty=True)) 371 | 372 | 373 | def write_posts_to_csv( 374 | account: Optional[str] = None, 375 | group: Union[str, int, None] = None, 376 | filename: str = None, 377 | encoding: str = None, 378 | **kwargs, 379 | ): 380 | """Write posts from an account or group to a CSV or JSON file 381 | 382 | Args: 383 | account (str): Facebook account name e.g. "nike" or "nintendo" 384 | group (Union[str, int, None]): Facebook group id e.g. 676845025728409 385 | filename (str): Filename, defaults to _posts.csv 386 | encoding (str): Encoding for the output file, defaults to locale.getpreferredencoding() 387 | credentials (Optional[Tuple[str, str]]): Tuple of email and password to login before scraping. Defaults to scrape anonymously 388 | timeout (Optional[int]): Timeout for requests. 389 | page_limit (Optional[int]): How many pages of posts to go through. 390 | Use None to try to get all of them. 391 | extra_info (Optional[bool]): Set to True to try to get reactions. 392 | dump_location (Optional[pathlib.Path]): Location where to write the HTML source of the posts. 393 | """ 394 | dump_location = kwargs.pop('dump_location', None) # For dumping HTML to disk, for debugging 395 | if dump_location is not None: 396 | dump_location.mkdir(exist_ok=True) 397 | kwargs["remove_source"] = False 398 | 399 | # Set a default filename, based on the account name with the appropriate extension 400 | if filename is None: 401 | filename = str(account or group) + "_posts." 
+ kwargs.get("format") 402 | 403 | if encoding is None: 404 | encoding = locale.getpreferredencoding() 405 | 406 | if os.path.isfile(filename): 407 | raise FileExistsError(f"{filename} exists") 408 | 409 | if filename == "-": 410 | output_file = sys.stdout 411 | else: 412 | output_file = open(filename, 'w', newline='', encoding=encoding) 413 | 414 | first_post = True 415 | 416 | sleep = kwargs.pop("sleep", 0) 417 | 418 | days_limit = kwargs.get("days_limit", 3650) 419 | max_post_time = datetime.now() - timedelta(days=days_limit) 420 | 421 | start_url = None 422 | resume_file = kwargs.get("resume_file") 423 | if resume_file: 424 | try: 425 | with open(resume_file, "r") as f: 426 | existing_url = f.readline().strip() 427 | logger.debug("Existing URL:" + existing_url) 428 | if existing_url: 429 | start_url = existing_url 430 | except FileNotFoundError: 431 | pass 432 | 433 | def handle_pagination_url(url): 434 | if resume_file: 435 | with open(resume_file, "w") as f: 436 | f.write(url + "\n") 437 | 438 | keys = kwargs.get("keys") 439 | 440 | try: 441 | for post in get_posts( 442 | account=account, 443 | group=group, 444 | start_url=start_url, 445 | request_url_callback=handle_pagination_url, 446 | **kwargs, 447 | ): 448 | if dump_location is not None: 449 | source = post.pop('source') 450 | try: 451 | write_post_to_disk(post, source, dump_location) 452 | except Exception: 453 | logger.exception("Error writing post to disk") 454 | elif post.get("source"): 455 | post["source"] = post["source"].html 456 | if first_post: 457 | if kwargs.get("format") == "json": 458 | output_file.write("[\n") 459 | else: 460 | if not keys: 461 | keys = list(post.keys()) 462 | dict_writer = csv.DictWriter(output_file, keys, extrasaction='ignore') 463 | dict_writer.writeheader() 464 | else: 465 | if kwargs.get("format") == "json": 466 | output_file.write(",") 467 | match = None 468 | if post["text"]: 469 | match = re.search(kwargs.get("matching", '.+'), post["text"], flags=re.IGNORECASE) 470 | if kwargs.get("not_matching") and re.search( 471 | kwargs.get("not_matching"), post["text"], flags=re.IGNORECASE 472 | ): 473 | match = None 474 | if match: 475 | if kwargs.get("format") == "json": 476 | if keys: 477 | post = {k: v for k, v in post.items() if k in keys} 478 | json.dump(post, output_file, default=str, indent=4) 479 | else: 480 | dict_writer.writerow(post) 481 | if not first_post and post["time"] and post["time"] < max_post_time: 482 | logger.debug( 483 | f"Reached days_limit - {post['time']} is more than {days_limit} days old (older than {max_post_time})" 484 | ) 485 | break 486 | first_post = False 487 | time.sleep(sleep) 488 | except KeyboardInterrupt: 489 | pass 490 | except Exception as e: 491 | traceback.print_exc() 492 | 493 | if kwargs.get("format") == "json": 494 | output_file.write("\n]") 495 | if first_post: 496 | print("Couldn't get any posts.", file=sys.stderr) 497 | output_file.close() 498 | 499 | 500 | def get_groups_by_search( 501 | word: str, 502 | **kwargs, 503 | ): 504 | """Searches Facebook groups and yields ids for each result 505 | on the first page""" 506 | _scraper.requests_kwargs['timeout'] = kwargs.pop('timeout', DEFAULT_REQUESTS_TIMEOUT) 507 | cookies = kwargs.pop('cookies', None) 508 | set_cookies(cookies) 509 | return _scraper.get_groups_by_search(word, **kwargs) 510 | 511 | 512 | def enable_logging(level=logging.DEBUG): 513 | handler = logging.StreamHandler() 514 | handler.setLevel(level) 515 | 516 | logger.addHandler(handler) 517 | logger.setLevel(level) 518 | 519 | 520 | def 
use_persistent_session(email: str, password: str, cookies_file_path=DEFAULT_COOKIES_FILE_PATH): 521 | """Login persistently to Facebook and save cookies to a file (default: ".fb-cookies.pckl"). This is highly recommended if you want to scrape several times a day because it will keep your session alive instead of logging in every time (which can be flagged as suspicious by Facebook). 522 | 523 | Args: 524 | email (str): email address to login. 525 | password (str): password to login. 526 | cookies_file_path (str, optional): path to the file in which to save cookies. Defaults to ".fb-cookies.pckl". 527 | 528 | Raises: 529 | exceptions.InvalidCredentials: if the credentials are invalid. 530 | 531 | Returns: 532 | Boolean: True if the login was successful, False otherwise. 533 | """ 534 | try: 535 | with open(cookies_file_path, "rb") as f: 536 | cookies = pickle.load(f) 537 | logger.debug("Loaded cookies from %s", cookies_file_path) 538 | except FileNotFoundError: 539 | logger.error("No cookies file found at %s", cookies_file_path) 540 | cookies = None 541 | try: 542 | if not cookies: 543 | raise exceptions.InvalidCookies() 544 | set_cookies(cookies) 545 | logger.debug("Successfully logged in with cookies") 546 | except exceptions.InvalidCookies: 547 | logger.exception("Invalid cookies, trying to login with credentials") 548 | _scraper.login(email, password) 549 | cookies = _scraper.session.cookies 550 | with open(cookies_file_path, "wb") as f: 551 | pickle.dump(cookies, f) 552 | set_cookies(cookies) 553 | logger.debug("Successfully logged in with credentials") 554 | 555 | 556 | # Disable logging by default 557 | logger = logging.getLogger(__name__) 558 | logger.addHandler(logging.NullHandler()) 559 | -------------------------------------------------------------------------------- /facebook_scraper/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import pathlib 4 | import datetime 5 | import sys 6 | import locale 7 | import json 8 | import csv 9 | 10 | from . 
import enable_logging, write_posts_to_csv, get_profile 11 | 12 | 13 | def run(): 14 | """facebook-scraper entry point when used as a script""" 15 | parser = argparse.ArgumentParser( 16 | prog='facebook-scraper', 17 | description="Scrape Facebook public pages without an API key", 18 | ) 19 | parser.add_argument('account', type=str, help="Facebook account or group") 20 | parser.add_argument('-f', '--filename', type=str, help="Output filename") 21 | parser.add_argument('-p', '--pages', type=int, help="Number of pages to download", default=10) 22 | parser.add_argument( 23 | '-s', '--sleep', type=float, help="How long to sleep for between posts", default=0 24 | ) 25 | parser.add_argument( 26 | '-t', 27 | '--timeout', 28 | type=int, 29 | help="How long to wait in seconds for Facebook servers before aborting", 30 | default=10, 31 | ) 32 | parser.add_argument('-g', '--group', action='store_true', help="Use group scraper") 33 | parser.add_argument('-v', '--verbose', action='count', help="Enable logging", default=0) 34 | parser.add_argument('-c', '--cookies', type=str, help="Path to a cookies file") 35 | parser.add_argument('--comments', action='store_true', help="Extract comments") 36 | parser.add_argument('-r', '--reactions', action='store_true', help="Extract reactions") 37 | parser.add_argument('-rs', '--reactors', action='store_true', help="Extract reactors") 38 | parser.add_argument( 39 | '--dump', 40 | type=pathlib.Path, 41 | dest='dump_location', 42 | help="Location where to save the HTML source of the posts (useful for debugging)", 43 | default=None, 44 | ) 45 | parser.add_argument( 46 | '--encoding', 47 | action='store', 48 | help="Encoding for the output file", 49 | default=None, 50 | ) 51 | parser.add_argument( 52 | '-fmt', 53 | '--format', 54 | type=str.lower, 55 | choices=["csv", "json"], 56 | default="csv", 57 | help="What format to export as", 58 | ) 59 | parser.add_argument( 60 | '-d', 61 | '--days-limit', 62 | dest='days_limit', 63 | default=3650, 64 | type=int, 65 | help="Number of days to download", 66 | ) 67 | parser.add_argument( 68 | '-rf', 69 | '--resume-file', 70 | type=str, 71 | help="Filename to store the last pagination URL in, for resuming", 72 | ) 73 | parser.add_argument( 74 | '-ner', 75 | '--no-extra-requests', 76 | dest='allow_extra_requests', 77 | action='store_false', 78 | help="Disable making extra requests (for things like high quality image URLs)", 79 | ) 80 | parser.add_argument( 81 | '-k', 82 | '--keys', 83 | type=lambda s: s.split(sep=","), 84 | help="Comma separated list of which keys or columns to return. This lets you filter to just your desired outputs.", 85 | ) 86 | parser.add_argument( 87 | '-m', 88 | '--matching', 89 | type=str, 90 | default=".+", 91 | help='Filter to just posts matching regex expression', 92 | ) 93 | parser.add_argument( 94 | '-nm', 95 | '--not-matching', 96 | type=str, 97 | help='Filter to just posts not matching regex expression', 98 | ) 99 | parser.add_argument( 100 | '--extra-info ', 101 | dest='extra_info', 102 | action='store_true', 103 | help="Try to do an extra request to get the post reactions. Default is False", 104 | default=False, 105 | ) 106 | parser.add_argument( 107 | '--use-youtube-dl', 108 | dest='youtube_dl', 109 | action='store_true', 110 | help='Use Youtube-DL for (high-quality) video extraction. You need to have youtube-dl installed on your environment. 
Default is False.', 111 | default=False, 112 | ) 113 | parser.add_argument( 114 | '--profile', 115 | action='store_true', 116 | help="Extract an account's profile", 117 | default=False, 118 | ) 119 | parser.add_argument( 120 | '--friends', type=int, help='When extracting a profile, how many friends to extract' 121 | ) 122 | parser.add_argument( 123 | '-ppp', 124 | '--posts-per-page', 125 | dest='posts_per_page', 126 | default=4, 127 | type=int, 128 | help="Number of posts to fetch per page", 129 | ) 130 | parser.add_argument( 131 | '--source', 132 | action='store_true', 133 | help="Include HTML source", 134 | default=False, 135 | ) 136 | 137 | args = parser.parse_args() 138 | 139 | # Enable logging 140 | if args.verbose > 0: 141 | args.verbose = min(args.verbose, 3) 142 | level = {1: logging.WARNING, 2: logging.INFO, 3: logging.DEBUG}[args.verbose] 143 | enable_logging(level) 144 | 145 | if args.profile: 146 | # Set a default filename, based on the account name with the appropriate extension 147 | if args.filename is None: 148 | args.filename = str(args.account) + "_profile." + args.format 149 | 150 | if args.encoding is None: 151 | encoding = locale.getpreferredencoding() 152 | 153 | if args.filename == "-": 154 | output_file = sys.stdout 155 | else: 156 | output_file = open(args.filename, 'w', newline='', encoding=encoding) 157 | 158 | profile = get_profile(args.account, friends=args.friends, cookies=args.cookies) 159 | 160 | if args.format == "json": 161 | json.dump(profile, output_file, default=str, indent=4) 162 | else: 163 | dict_writer = csv.DictWriter(output_file, profile.keys()) 164 | dict_writer.writeheader() 165 | dict_writer.writerow(profile) 166 | output_file.close() 167 | else: 168 | # Choose the right argument to pass to write_posts_to_csv (group or account) 169 | account_type = 'group' if args.group else 'account' 170 | kwargs = { 171 | account_type: args.account, 172 | "format": args.format, 173 | "days_limit": args.days_limit, 174 | "resume_file": args.resume_file, 175 | "cookies": args.cookies, 176 | "timeout": args.timeout, 177 | "sleep": args.sleep, 178 | "keys": args.keys, 179 | "matching": args.matching, 180 | "not_matching": args.not_matching, 181 | "options": { 182 | "reactions": args.reactions, 183 | "reactors": args.reactors, 184 | "comments": args.comments, 185 | "allow_extra_requests": args.allow_extra_requests, 186 | "posts_per_page": args.posts_per_page, 187 | }, 188 | "youtube_dl": args.youtube_dl, 189 | "extra_info": args.extra_info, 190 | "remove_source": not args.source, 191 | } 192 | 193 | write_posts_to_csv( 194 | **kwargs, 195 | filename=args.filename, 196 | pages=args.pages, 197 | encoding=args.encoding, 198 | dump_location=args.dump_location, 199 | ) 200 | 201 | 202 | if __name__ == '__main__': 203 | run() 204 | -------------------------------------------------------------------------------- /facebook_scraper/constants.py: -------------------------------------------------------------------------------- 1 | FB_BASE_URL = 'https://facebook.com/' 2 | FB_W3_BASE_URL = 'https://www.facebook.com/' 3 | FB_MOBILE_BASE_URL = 'https://m.facebook.com/' 4 | FB_MBASIC_BASE_URL = 'https://mbasic.facebook.com/' 5 | 6 | DEFAULT_REQUESTS_TIMEOUT = 120 7 | DEFAULT_PAGE_LIMIT = 10 8 | 9 | DEFAULT_COOKIES_FILE_PATH = '.fb-cookies.pckl' 10 | -------------------------------------------------------------------------------- /facebook_scraper/exceptions.py: -------------------------------------------------------------------------------- 1 | class NotFound(Exception): 2 | 
'''Post, page or profile not found / doesn't exist / deleted''' 3 | 4 | pass 5 | 6 | 7 | class TemporarilyBanned(Exception): 8 | '''User account rate limited''' 9 | 10 | pass 11 | 12 | 13 | class AccountDisabled(Exception): 14 | '''User account disabled, with option to appeal''' 15 | 16 | pass 17 | 18 | 19 | class InvalidCookies(Exception): 20 | '''Cookies file passed but missing cookies''' 21 | 22 | pass 23 | 24 | 25 | class LoginRequired(Exception): 26 | '''Facebook requires a login to see this''' 27 | 28 | pass 29 | 30 | 31 | class LoginError(Exception): 32 | '''Failed to log in''' 33 | 34 | pass 35 | 36 | 37 | class UnexpectedResponse(Exception): 38 | '''Facebook served something weird''' 39 | 40 | pass 41 | -------------------------------------------------------------------------------- /facebook_scraper/facebook_scraper.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import logging 3 | from urllib.parse import urljoin 4 | import warnings 5 | import re 6 | from functools import partial 7 | from typing import Iterator, Union 8 | import json 9 | import demjson3 as demjson 10 | from urllib.parse import parse_qs, urlparse, unquote 11 | from datetime import datetime 12 | import os 13 | 14 | from bs4 import BeautifulSoup 15 | from requests import RequestException 16 | from requests_html import HTMLSession 17 | 18 | from . import utils 19 | from .constants import ( 20 | DEFAULT_PAGE_LIMIT, 21 | FB_BASE_URL, 22 | FB_MOBILE_BASE_URL, 23 | FB_W3_BASE_URL, 24 | FB_MBASIC_BASE_URL, 25 | ) 26 | from .extractors import ( 27 | extract_group_post, 28 | extract_post, 29 | extract_photo_post, 30 | extract_story_post, 31 | PostExtractor, 32 | extract_hashtag_post, 33 | ) 34 | from .fb_types import Post, Profile 35 | from .page_iterators import ( 36 | iter_group_pages, 37 | iter_pages, 38 | iter_photos, 39 | iter_search_pages, 40 | iter_hashtag_pages, PageParser, 41 | ) 42 | from . 
import exceptions 43 | 44 | 45 | logger = logging.getLogger(__name__) 46 | 47 | 48 | class FacebookScraper: 49 | """Class for creating FacebookScraper Iterators""" 50 | 51 | base_url = FB_MOBILE_BASE_URL 52 | default_headers = { 53 | "Accept": "*/*", 54 | "Connection": "keep-alive", 55 | "Accept-Encoding": "gzip,deflate", 56 | "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.1 Safari/605.1.15", 57 | } 58 | have_checked_locale = False 59 | 60 | def __init__(self, session=None, requests_kwargs=None): 61 | if session is None: 62 | session = HTMLSession() 63 | session.headers.update(self.default_headers) 64 | 65 | if requests_kwargs is None: 66 | requests_kwargs = {} 67 | 68 | self.session = session 69 | self.requests_kwargs = requests_kwargs 70 | self.request_count = 0 71 | self.mbasic_headers = None 72 | 73 | def set_user_agent(self, user_agent): 74 | self.session.headers["User-Agent"] = user_agent 75 | 76 | def set_noscript(self, noscript): 77 | if noscript: 78 | self.session.cookies.set("noscript", "1") 79 | else: 80 | self.session.cookies.set("noscript", "0") 81 | 82 | def set_proxy(self, proxy, verify=True): 83 | self.requests_kwargs.update( 84 | {'proxies': {'http': proxy, 'https': proxy}, 'verify': verify} 85 | ) 86 | ip = self.get( 87 | "http://lumtest.com/myip.json", headers={"Accept": "application/json"} 88 | ).json() 89 | logger.debug(f"Proxy details: {ip}") 90 | 91 | def get_posts(self, account: str, **kwargs) -> Iterator[Post]: 92 | kwargs["scraper"] = self 93 | iter_pages_fn = partial(iter_pages, account=account, request_fn=self.get, **kwargs) 94 | return self._generic_get_posts(extract_post, iter_pages_fn, **kwargs) 95 | 96 | def get_reactors(self, post_id: int, **kwargs) -> Iterator[dict]: 97 | reaction_url = ( 98 | f'https://m.facebook.com/ufi/reaction/profile/browser/?ft_ent_identifier={post_id}' 99 | ) 100 | logger.debug(f"Fetching {reaction_url}") 101 | response = self.get(reaction_url) 102 | extractor = PostExtractor(response.html, kwargs, self.get, full_post_html=response.html, scraper=self) 103 | return extractor.extract_reactors(response) 104 | 105 | def get_photos(self, account: str, **kwargs) -> Iterator[Post]: 106 | kwargs["scraper"] = self 107 | iter_pages_fn = partial(iter_photos, account=account, request_fn=self.get, **kwargs) 108 | return self._generic_get_posts(extract_post, iter_pages_fn, **kwargs) 109 | 110 | def get_posts_by_hashtag(self, hashtag: str, **kwargs) -> Iterator[Post]: 111 | kwargs["scraper"] = self 112 | kwargs["base_url"] = FB_MBASIC_BASE_URL 113 | iter_pages_fn = partial( 114 | iter_hashtag_pages, hashtag=hashtag, request_fn=self.get, **kwargs 115 | ) 116 | return self._generic_get_posts(extract_hashtag_post, iter_pages_fn, **kwargs) 117 | 118 | def get_posts_by_url( 119 | self, post_urls, options={}, remove_source=True, **kwargs 120 | ) -> Iterator[Post]: 121 | kwargs["scraper"] = self 122 | if self.session.cookies.get("noscript") == "1": 123 | options["noscript"] = True 124 | for post_url in post_urls: 125 | url = str(post_url) 126 | if url.startswith(FB_BASE_URL): 127 | url = url.replace(FB_BASE_URL, FB_MBASIC_BASE_URL) 128 | if url.startswith(FB_W3_BASE_URL): 129 | url = url.replace(FB_W3_BASE_URL, FB_MBASIC_BASE_URL) 130 | if not url.startswith(FB_MOBILE_BASE_URL): 131 | url = utils.urljoin(FB_MBASIC_BASE_URL, url) 132 | 133 | post = {"original_request_url": post_url, "post_url": url} 134 | logger.debug(f"Requesting page from: {url}") 135 | response = self.get(url) 136 | 
options["response_url"] = response.url 137 | photo_post = False 138 | if "/stories/" in url or "/story/" in url: 139 | elem = response.html.find("#story_viewer_content", first=True) 140 | else: 141 | # top_level_post_id is not used anymore 142 | elem = response.html.find('[data-ft]', first=True) 143 | if not elem: 144 | elem = response.html.find('div.async_like', first=True) 145 | if response.html.find("div.msg", first=True): 146 | photo_post = True 147 | elem = response.html 148 | if not elem: 149 | logger.warning("No raw posts (
elements) were found in this page.") 150 | else: 151 | comments_area = response.html.find('div.ufi', first=True) 152 | if comments_area: 153 | # Makes likes/shares regexes work 154 | try: 155 | elem = utils.make_html_element( 156 | elem.html.replace("", comments_area.html + "") 157 | ) 158 | except ValueError as e: 159 | logger.debug(e) 160 | 161 | if photo_post: 162 | post.update( 163 | extract_photo_post( 164 | elem, 165 | request_fn=self.get, 166 | options=options, 167 | full_post_html=response.html, 168 | **kwargs, 169 | ) 170 | ) 171 | elif url.startswith(utils.urljoin(FB_MBASIC_BASE_URL, "/groups/")): 172 | post.update( 173 | extract_group_post( 174 | elem, 175 | request_fn=self.get, 176 | options=options, 177 | full_post_html=response.html, 178 | **kwargs, 179 | ) 180 | ) 181 | elif "/stories/" in url or "/story/" in url: 182 | post.update( 183 | extract_story_post( 184 | elem, 185 | request_fn=self.get, 186 | options=options, 187 | full_post_html=response.html, 188 | **kwargs, 189 | ) 190 | ) 191 | else: 192 | post.update( 193 | extract_post( 194 | elem, 195 | request_fn=self.get, 196 | options=options, 197 | full_post_html=response.html, 198 | **kwargs, 199 | ) 200 | ) 201 | if not post.get("post_url"): 202 | post["post_url"] = url 203 | if remove_source: 204 | post.pop('source', None) 205 | yield post 206 | 207 | def get_posts_by_search(self, word: str, **kwargs) -> Iterator[Post]: 208 | kwargs["scraper"] = self 209 | iter_pages_fn = partial(iter_search_pages, word=word, request_fn=self.get, **kwargs) 210 | return self._generic_get_posts(extract_post, iter_pages_fn, **kwargs) 211 | 212 | def get_friends(self, account, **kwargs) -> Iterator[Profile]: 213 | friend_opt = kwargs.get("friends") 214 | limit = None 215 | if type(friend_opt) in [int, float]: 216 | limit = friend_opt 217 | friend_url = kwargs.pop("start_url", None) 218 | if not friend_url: 219 | friend_url = utils.urljoin(FB_MOBILE_BASE_URL, f'/{account}/friends/') 220 | request_url_callback = kwargs.get('request_url_callback') 221 | friends_found = 0 222 | while friend_url: 223 | logger.debug(f"Requesting page from: {friend_url}") 224 | response = self.get(friend_url) 225 | elems = response.html.find('div[class="timeline"] > div > div') 226 | logger.debug(f"Found {len(elems)} friends") 227 | for elem in elems: 228 | name = elem.find("h3>a,h1>a", first=True) 229 | if not name: 230 | continue 231 | # Tagline 232 | tagline = elem.find("span.fcg", first=True) 233 | if tagline: 234 | tagline = tagline.text 235 | else: 236 | tagline = "" 237 | # Profile Picture 238 | profile_picture = elem.find("i.profpic", first=True).attrs.get("style") 239 | match = re.search(r"url\('(.+)'\)", profile_picture) 240 | if match: 241 | profile_picture = utils.decode_css_url(match.groups()[0]) 242 | # User ID if present, not present if no "add friend" 243 | user_id = elem.find("a.touchable[data-store]", first=True) 244 | if user_id: 245 | user_id = json.loads(user_id.attrs["data-store"]).get("id") 246 | else: 247 | user_id = "" 248 | 249 | friend = { 250 | "id": user_id, 251 | "link": name.attrs.get("href"), 252 | "name": name.text, 253 | "profile_picture": profile_picture, 254 | "tagline": tagline, 255 | } 256 | yield friend 257 | friends_found += 1 258 | if limit and friends_found > limit: 259 | return 260 | more = re.search(r'm_more_friends",href:"([^"]+)"', response.text) 261 | if more: 262 | friend_url = utils.urljoin(FB_MOBILE_BASE_URL, more.group(1)) 263 | if request_url_callback: 264 | request_url_callback(friend_url) 265 | else: 266 | 
return 267 | 268 | def get_collection(self, more_url, limit=None, **kwargs) -> Iterator[Profile]: 269 | request_url_callback = kwargs.get('request_url_callback') 270 | count = 0 271 | while more_url: 272 | logger.debug(f"Requesting page from: {more_url}") 273 | response = self.get(more_url) 274 | if response.text.startswith("for (;;);"): 275 | prefix_length = len('for (;;);') 276 | data = json.loads(response.text[prefix_length:]) # Strip 'for (;;);' 277 | for action in data['payload']['actions']: 278 | if action['cmd'] == 'append' and action['html']: 279 | element = utils.make_html_element( 280 | action['html'], 281 | url=FB_MOBILE_BASE_URL, 282 | ) 283 | elems = element.find('a.touchable') 284 | html = element.text 285 | elif action['cmd'] == 'script': 286 | more_url = re.search( 287 | r'("\\/timeline\\/app_collection\\/more\\/[^"]+")', action["code"] 288 | ) 289 | if more_url: 290 | more_url = more_url.group(1) 291 | more_url = json.loads(more_url) 292 | else: 293 | elems = response.html.find('#timelineBody a.touchable') 294 | more_url = re.search( 295 | r'href:"(/timeline/app_collection/more/[^"]+)"', response.text 296 | ) 297 | if more_url: 298 | more_url = more_url.group(1) 299 | logger.debug(f"Found {len(elems)} elems") 300 | for elem in elems: 301 | name = elem.find("strong", first=True).text 302 | link = elem.attrs.get("href") 303 | try: 304 | tagline = elem.find("div.twoLines", first=True).text 305 | except: 306 | tagline = None 307 | profile_picture = elem.find("i.profpic", first=True).attrs.get("style") 308 | match = re.search(r"url\('(.+)'\)", profile_picture) 309 | if match: 310 | profile_picture = utils.decode_css_url(match.groups()[0]) 311 | result = { 312 | "link": link, 313 | "name": name, 314 | "profile_picture": profile_picture, 315 | "tagline": tagline, 316 | } 317 | yield result 318 | count += 1 319 | if type(limit) in [int, float] and count > limit: 320 | return 321 | if more_url and request_url_callback: 322 | request_url_callback(more_url) 323 | 324 | def get_profile(self, account, **kwargs) -> Profile: 325 | account = account.replace("profile.php?id=", "") 326 | result = {} 327 | 328 | if kwargs.get("allow_extra_requests", True): 329 | logger.debug(f"Requesting page from: {account}") 330 | response = self.get(account) 331 | try: 332 | top_post = response.html.find( 333 | '[data-ft*="top_level_post_id"]:not([data-sigil="m-see-translate-link"])', 334 | first=True, 335 | ) 336 | assert top_post is not None 337 | top_post = PostExtractor(top_post, kwargs, self.get).extract_post() 338 | top_post.pop("source") 339 | result["top_post"] = top_post 340 | except Exception as e: 341 | logger.error(f"Unable to extract top_post {type(e)}:{e}") 342 | 343 | try: 344 | result["Friend_count"] = utils.parse_int( 345 | response.html.find("a[data-store*='friends']>div>div")[-1].text.split()[0] 346 | ) 347 | except Exception as e: 348 | result["Friend_count"] = None 349 | logger.error(f"Friend_count extraction failed: {e}") 350 | try: 351 | result["Follower_count"] = utils.parse_int( 352 | response.html.find( 353 | "div[data-sigil*='profile-intro-card-log']", 354 | containing="Followed by", 355 | first=True, 356 | ).text 357 | ) 358 | except Exception as e: 359 | result["Follower_count"] = None 360 | logger.error(f"Follower_count extraction failed: {e}") 361 | try: 362 | following_url = f'/{account}?v=following' 363 | logger.debug(f"Fetching {following_url}") 364 | following_response = self.get(following_url) 365 | result["Following_count"] = utils.parse_int( 366 | 
following_response.html.find("div[role='heading']", first=True).text 367 | ) 368 | except Exception as e: 369 | result["Following_count"] = None 370 | logger.error(f"Following_count extraction failed: {e}") 371 | 372 | photo_links = response.html.find("a[href^='/photo.php']") 373 | if len(photo_links) == 1: 374 | profile_photo = photo_links[0] 375 | response = self.get(profile_photo.attrs.get("href")) 376 | extractor = PostExtractor(response.html, kwargs, self.get) 377 | result["profile_picture"] = extractor.extract_photo_link_HQ(response.html.html) 378 | elif len(photo_links) >= 2: 379 | cover_photo = photo_links[0] 380 | result["cover_photo_text"] = cover_photo.attrs.get("title") 381 | # Check if there is a cover photo or not 382 | if result["cover_photo_text"] is not None: 383 | response = self.get(cover_photo.attrs.get("href")) 384 | extractor = PostExtractor(response.html, kwargs, self.get) 385 | result["cover_photo"] = extractor.extract_photo_link_HQ(response.html.html) 386 | 387 | profile_photo = photo_links[1] 388 | response = self.get(profile_photo.attrs.get("href")) 389 | result["profile_picture"] = extractor.extract_photo_link_HQ( 390 | response.html.html 391 | ) 392 | else: 393 | result["cover_photo"] = None 394 | profile_photo = photo_links[0] 395 | response = self.get(profile_photo.attrs.get("href")) 396 | extractor = PostExtractor(response.html, kwargs, self.get) 397 | result["profile_picture"] = extractor.extract_photo_link_HQ( 398 | response.html.html 399 | ) 400 | else: 401 | cover_photo = response.html.find( 402 | "div[data-sigil='cover-photo']>i.img", first=True 403 | ) 404 | if cover_photo: 405 | match = re.search(r"url\('(.+)'\)", cover_photo.attrs["style"]) 406 | if match: 407 | result["cover_photo"] = utils.decode_css_url(match.groups()[0]) 408 | profpic = response.html.find("img.profpic", first=True) 409 | if profpic: 410 | result["profile_picture"] = profpic.attrs["src"] 411 | 412 | about_url = utils.urljoin(FB_MOBILE_BASE_URL, f'/{account}/about/') 413 | logger.debug(f"Requesting page from: {about_url}") 414 | response = self.get(about_url) 415 | match = re.search(r'entity_id:(\d+)', response.html.html) 416 | if match: 417 | result["id"] = match.group(1) 418 | # Profile name is in the title 419 | title = response.html.find("title", first=True).text 420 | if " | " in title: 421 | title = title.split(" | ")[0] 422 | result["Name"] = title 423 | 424 | about = response.html.find("div#main_column,div.aboutme", first=True) 425 | if not about: 426 | logger.warning("No about section found") 427 | return result 428 | for card in about.find("div[data-sigil='profile-card']"): 429 | header = card.find("header", first=True).text 430 | if header.startswith("About"): 431 | header = "About" # Truncate strings like "About Mark" 432 | if header in ["Work, Education"]: 433 | experience = [] 434 | for elem in card.find("div.experience"): 435 | xp = {} 436 | try: 437 | xp["link"] = elem.find("a", first=True).attrs["href"] 438 | except: 439 | pass 440 | bits = elem.text.split("\n") 441 | if len(bits) == 2: 442 | xp["text"], xp["type"] = bits 443 | elif len(bits) == 3: 444 | xp["text"], xp["type"], xp["year"] = bits 445 | else: 446 | xp["text"] = elem.text 447 | experience.append(xp) 448 | result[header] = experience 449 | elif header == "Places lived": 450 | places = [] 451 | for elem in card.find("div.touchable"): 452 | place = {} 453 | try: 454 | place["link"] = elem.find("a", first=True).attrs["href"] 455 | except: 456 | pass 457 | if "\n" in elem.text: 458 | place["text"], 
place["type"] = elem.text.split("\n") 459 | else: 460 | place["text"] = elem.text 461 | places.append(place) 462 | result[header] = places 463 | else: 464 | bits = card.text.split("\n")[1:] # Remove header 465 | if len(bits) >= 3 and header == "Relationship": 466 | result[header] = {"to": bits[0], "type": bits[1], "since": bits[2]} 467 | elif len(bits) == 1: 468 | result[header] = bits[0] 469 | elif ( 470 | header 471 | in [ 472 | "Contact Info", 473 | "Basic Info", 474 | "Education", 475 | "Family Members", 476 | "Other names", 477 | ] 478 | and len(bits) % 2 == 0 479 | ): # Divisible by two, assume pairs 480 | pairs = {} 481 | for i in range(0, len(bits), 2): 482 | if bits[i + 1] == "Websites": 483 | if "Websites" not in pairs: 484 | pairs["Websites"] = [] 485 | pairs["Websites"].append(bits[i]) 486 | else: 487 | pairs[bits[i + 1]] = bits[i] 488 | result[header] = pairs 489 | else: 490 | result[header] = "\n".join(bits) 491 | if kwargs.get("friends"): 492 | result["Friends"] = list(self.get_friends(account, **kwargs)) 493 | if kwargs.get("followers"): 494 | result["Followers"] = list( 495 | self.get_collection( 496 | f'/{account}?v=followers', limit=kwargs.get("followers"), **kwargs 497 | ) 498 | ) 499 | if kwargs.get("following"): 500 | result["Following"] = list( 501 | self.get_collection( 502 | f'/{account}?v=following', limit=kwargs.get("following"), **kwargs 503 | ) 504 | ) 505 | 506 | # Likes 507 | if result.get("id") and kwargs.get("likes"): 508 | likes_url = utils.urljoin( 509 | FB_MOBILE_BASE_URL, 510 | f'timeline/app_section/?section_token={result["id"]}:2409997254', 511 | ) 512 | logger.debug(f"Requesting page from: {likes_url}") 513 | response = self.get(likes_url) 514 | result["likes_by_category"] = {} 515 | for elem in response.html.find('header[data-sigil="profile-card-header"]'): 516 | count, category = elem.text.split("\n") 517 | count = utils.parse_int(count) 518 | if category == "All Likes": 519 | result["likes_count"] = count 520 | result["likes_by_category"][category] = count 521 | 522 | all_likes_url = utils.urljoin( 523 | FB_MOBILE_BASE_URL, 524 | f'timeline/app_collection/?collection_token={result["id"]}:2409997254:96', 525 | ) 526 | logger.debug(f"Requesting page from: {all_likes_url}") 527 | response = self.get(all_likes_url) 528 | result["likes"] = [] 529 | for elem in response.html.find("div._1a5p"): 530 | result["likes"].append( 531 | { 532 | "name": elem.text, 533 | "link": elem.find("a", first=True).attrs.get("href"), 534 | } 535 | ) 536 | more_url = re.search(r'href:"(/timeline/app_collection/more/[^"]+)"', response.text) 537 | if more_url: 538 | more_url = more_url.group(1) 539 | while more_url: 540 | logger.debug(f"Fetching {more_url}") 541 | response = self.get(more_url) 542 | prefix_length = len('for (;;);') 543 | data = json.loads(response.text[prefix_length:]) # Strip 'for (;;);' 544 | for action in data['payload']['actions']: 545 | if action['cmd'] == 'append' and action['html']: 546 | element = utils.make_html_element( 547 | action['html'], 548 | url=FB_MOBILE_BASE_URL, 549 | ) 550 | for elem in element.find("div._1a5p"): 551 | result["likes"].append( 552 | { 553 | "name": elem.text, 554 | "link": elem.find("a", first=True).attrs.get("href"), 555 | } 556 | ) 557 | elif action['cmd'] == 'script': 558 | more_url = re.search( 559 | r'("\\/timeline\\/app_collection\\/more\\/[^"]+")', action["code"] 560 | ) 561 | if more_url: 562 | more_url = more_url.group(1) 563 | more_url = json.loads(more_url) 564 | 565 | return result 566 | 567 | def 
get_page_reviews(self, page, **kwargs) -> Iterator[Post]: 568 | more_url = f"/{page}/reviews" 569 | while more_url: 570 | logger.debug(f"Fetching {more_url}") 571 | response = self.get(more_url) 572 | if response.text.startswith("for (;;);"): 573 | prefix_length = len('for (;;);') 574 | data = json.loads(response.text[prefix_length:]) # Strip 'for (;;);' 575 | for action in data['payload']['actions']: 576 | if action['cmd'] == 'replace' and action['html']: 577 | element = utils.make_html_element( 578 | action['html'], 579 | url=FB_MOBILE_BASE_URL, 580 | ) 581 | elems = element.find('#page_suggestions_on_liking ~ div') 582 | elif action['cmd'] == 'script': 583 | more_url = re.search( 584 | r'see_more_cards_id","href":"([^"]+)"', action["code"] 585 | ) 586 | if more_url: 587 | more_url = more_url.group(1) 588 | more_url = utils.decode_css_url(more_url) 589 | more_url = more_url.replace("\\", "") 590 | else: 591 | elems = response.html.find('#page_suggestions_on_liking ~ div') 592 | more_url = re.search(r'see_more_cards_id",href:"([^"]+)"', response.text) 593 | if more_url: 594 | more_url = more_url.group(1) 595 | 596 | for elem in elems: 597 | header_elem = elem.find("div[data-nt='FB:TEXT4']:has(span)", first=True) 598 | if not header_elem: 599 | continue 600 | bits = list(header_elem.element.itertext()) 601 | username = bits[0].strip() 602 | recommends = "recommends" in header_elem.text 603 | links = header_elem.find("a") 604 | if len(links) == 2: 605 | user_url = utils.urljoin(FB_BASE_URL, links[0].attrs["href"]) 606 | else: 607 | user_url = None 608 | text_elem = elem.find("div[data-nt='FB:FEED_TEXT'] span p", first=True) 609 | if text_elem: 610 | text = text_elem.text 611 | else: 612 | text = None 613 | date_element = elem.find("abbr[data-store*='time']", first=True) 614 | time = json.loads(date_element.attrs["data-store"])["time"] 615 | yield { 616 | "user_url": user_url, 617 | "username": username, 618 | "profile_picture": elem.find("img", first=True).attrs["src"], 619 | "text": text, 620 | "header": header_elem.text, 621 | "time": datetime.fromtimestamp(time), 622 | "timestamp": time, 623 | "recommends": recommends, 624 | "post_url": utils.urljoin( 625 | FB_BASE_URL, elem.find("a[href*='story']", first=True).attrs["href"] 626 | ), 627 | } 628 | 629 | def get_page_info(self, page, **kwargs) -> Profile: 630 | result = {} 631 | 632 | try: 633 | logger.debug("getting page info using mbasic url") 634 | # mbasic info 635 | page_url = utils.urljoin(FB_MBASIC_BASE_URL, page) 636 | resp = self.get(page_url) 637 | container = resp.html.find("div#objects_container", first=True) 638 | name_element = container.find("strong", first=True) 639 | result["name"] = name_element.text 640 | soupElement = BeautifulSoup(container.html, features='html.parser') 641 | ancestorElement = soupElement.select('strong')[0].find_parent('div') 642 | if ancestorElement.find_parent('span'): 643 | ancestorElement = ancestorElement.find_parent('span').find_parent('div') 644 | description_element = ancestorElement.find_next_sibling("div") 645 | logger.debug("description_element") 646 | logger.debug(description_element) 647 | result["description"] = description_element.text 648 | result['category'] = soupElement.select('#category span')[1].text 649 | 650 | # getting basic info for a page 651 | def has_text(element): 652 | return element.get_text(strip=True) != '' and not len(element.find_all(True)) > 0 653 | 654 | contact_info_elements = soupElement.select("#contact-info")[0].find_all(has_text) 655 | basic_info_elements = 
soupElement.select("#basic-info")[0].find_all(has_text) 656 | result['contact_info'] = {contact_info_elements[i].text: contact_info_elements[i + 1].text for i in range(1, len(contact_info_elements), 2)} 657 | result['basic_info'] = {basic_info_elements[i].text: basic_info_elements[i + 1].text for i in range(0, len(basic_info_elements), 2)} 658 | logger.debug("getting page_id and user_id usong page_info from PageParser") 659 | page_basic_info = PageParser(resp).get_page_info() 660 | result.update(page_basic_info) 661 | 662 | result['url'] = resp.url 663 | logger.debug("extracting HQ profile photo") 664 | extractor = PostExtractor(resp.html, kwargs, self.get) 665 | image_elements = resp.html.find("a[href^='/photo.php']") 666 | result["cover_picture"] = extractor.extract_photo_link_HQ(response=None, useMbasic=True, mbasicUrl=utils.urljoin(FB_MBASIC_BASE_URL, image_elements[0].attrs['href'])) 667 | result["profile_picture"] = extractor.extract_photo_link_HQ(response=None, useMbasic=True, mbasicUrl=utils.urljoin(FB_MBASIC_BASE_URL, image_elements[1].attrs['href'])) 668 | 669 | except Exception as e: 670 | logger.error(f"Unable to extract page info: {e}") 671 | return result 672 | 673 | def get_group_info(self, group, **kwargs) -> Profile: 674 | self.set_user_agent( 675 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8" 676 | ) 677 | url = f'/groups/{group}' 678 | logger.debug(f"Requesting page from: {url}") 679 | resp = self.get(url).html 680 | try: 681 | url = resp.find("a[href*='?view=info']", first=True).attrs["href"] 682 | url += "&sfd=1" # Add parameter to get full "about"-text 683 | except AttributeError: 684 | raise exceptions.UnexpectedResponse("Unable to resolve view=info URL") 685 | logger.debug(f"Requesting page from: {url}") 686 | resp = self.get(url).html 687 | result = {} 688 | result["id"] = re.search(r'/groups/(\d+)', url).group(1) 689 | try: 690 | result["name"] = resp.find("header h3", first=True).text 691 | result["type"] = resp.find("header div", first=True).text 692 | members = resp.find("div[data-testid='m_group_sections_members']", first=True) 693 | result["members"] = utils.parse_int(members.text) 694 | except AttributeError: 695 | raise exceptions.UnexpectedResponse("Unable to get one of name, type, or members") 696 | 697 | # Try to extract the group description 698 | try: 699 | # Directly tageting the weird generated class names is not optimal, but it's the best i could do. 
700 | about_div = resp.find("._52jc._55wr", first=True) 701 | 702 | # Removing the -tags that are converted to linebreaks by .text 703 | from requests_html import HTML 704 | 705 | no_word_breaks = HTML(html=about_div.html.replace("", "")) 706 | 707 | result["about"] = no_word_breaks.text 708 | except: 709 | result["about"] = None 710 | 711 | try: 712 | url = members.find("a", first=True).attrs.get("href") 713 | logger.debug(f"Requesting page from: {url}") 714 | 715 | resp = self.get(url).html 716 | url = resp.find("a[href*='listType=list_admin_moderator']", first=True) 717 | if kwargs.get("admins", True): 718 | if url: 719 | url = url.attrs.get("href") 720 | logger.debug(f"Requesting page from: {url}") 721 | try: 722 | respAdmins = self.get(url).html 723 | except: 724 | raise exceptions.UnexpectedResponse("Unable to get admin list") 725 | else: 726 | respAdmins = resp 727 | # Test if we are a member that can add new members 728 | if re.match( 729 | "/groups/members/search", 730 | respAdmins.find( 731 | "div:nth-child(1)>div:nth-child(1) a:not(.touchable)", first=True 732 | ).attrs.get('href'), 733 | ): 734 | admins = respAdmins.find("div:nth-of-type(2)>div.touchable a:not(.touchable)") 735 | else: 736 | admins = respAdmins.find("div:first-child>div.touchable a:not(.touchable)") 737 | result["admins"] = [ 738 | { 739 | "name": e.text, 740 | "link": utils.filter_query_params(e.attrs["href"], blacklist=["refid"]), 741 | } 742 | for e in admins 743 | ] 744 | 745 | url = resp.find("a[href*='listType=list_nonfriend_nonadmin']", first=True) 746 | if kwargs.get("members", True): 747 | if url: 748 | url = url.attrs["href"] 749 | members = [] 750 | while url: 751 | logger.debug(f"Requesting page from: {url}") 752 | resp = self.get(url).html 753 | elems = resp.find("#root div.touchable a:not(.touchable)") 754 | members.extend([{"name": e.text, "link": e.attrs["href"]} for e in elems]) 755 | more = re.search(r'"m_more_item",href:"([^"]+)', resp.text) 756 | if more: 757 | url = more.group(1) 758 | else: 759 | url = None 760 | result["other_members"] = [m for m in members if m not in result["admins"]] 761 | else: 762 | logger.warning("No other members listed") 763 | except exceptions.LoginRequired as e: 764 | pass 765 | return result 766 | 767 | def get_shop(self, page, **kwargs) -> Iterator[Post]: 768 | self.set_user_agent( 769 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8" 770 | ) 771 | self.set_noscript(True) 772 | url = f"{page}/shop/" 773 | logger.debug(f"Fetching {url}") 774 | resp = self.get(url) 775 | more_links = resp.html.find("a[href]", containing="See More") 776 | if more_links: 777 | url = more_links[-1].attrs["href"] 778 | logger.debug(f"Fetching {url}") 779 | resp = self.get(url) 780 | items = resp.html.find("div.be") 781 | results = [] 782 | for item in items: 783 | link_elem = item.find("div.bl a", first=True) 784 | name = link_elem.text 785 | link = link_elem.attrs["href"] 786 | image = item.find("img", first=True).attrs["src"] 787 | price = item.find("div.bl")[-1].text 788 | result = {"name": name, "link": link, "image": image, "price": price} 789 | results.append(result) 790 | return results 791 | 792 | def get_group_posts(self, group: Union[str, int], **kwargs) -> Iterator[Post]: 793 | kwargs["scraper"] = self 794 | self.set_user_agent( 795 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8" 796 | ) 797 | iter_pages_fn = 
partial(iter_group_pages, group=group, request_fn=self.get, **kwargs) 798 | return self._generic_get_posts(extract_group_post, iter_pages_fn, **kwargs) 799 | 800 | def check_locale(self, response): 801 | if self.have_checked_locale: 802 | return 803 | match = re.search(r'"IntlCurrentLocale",\[\],{code:"(\w{2}_\w{2})"}', response.text) 804 | if match: 805 | locale = match.groups(1)[0] 806 | if locale != "en_US": 807 | warnings.warn( 808 | f"Facebook language detected as {locale} - for best results, set to en_US" 809 | ) 810 | self.have_checked_locale = True 811 | 812 | def get(self, url, **kwargs): 813 | try: 814 | self.request_count += 1 815 | url = str(url) 816 | if not url.startswith("http"): 817 | url = utils.urljoin(FB_MOBILE_BASE_URL, url) 818 | 819 | if kwargs.get("post"): 820 | kwargs.pop("post") 821 | response = self.session.post(url=url, **kwargs) 822 | else: 823 | if url.startswith(FB_MBASIC_BASE_URL) and self.mbasic_headers is not None: 824 | self.session.headers.clear() 825 | self.session.headers.update(self.mbasic_headers) 826 | response = self.session.get(url=url, **self.requests_kwargs, **kwargs) 827 | if url.startswith(FB_MBASIC_BASE_URL) and self.mbasic_headers is not None: 828 | self.session.headers.clear() 829 | self.session.headers.update(self.default_headers) 830 | DEBUG = False 831 | if DEBUG: 832 | for filename in os.listdir("."): 833 | if filename.endswith(".html") and filename.replace(".html", "") in url: 834 | logger.debug(f"Replacing {url} content with {filename}") 835 | with open(filename) as f: 836 | response.html.html = f.read() 837 | response.html.html = response.html.html.replace('', '') 838 | response.raise_for_status() 839 | self.check_locale(response) 840 | 841 | # Special handling for video posts that redirect to /watch/ 842 | if response.url == "https://m.facebook.com/watch/?ref=watch_permalink": 843 | post_url = re.search("\d+", url).group() 844 | if post_url: 845 | url = utils.urljoin( 846 | FB_MOBILE_BASE_URL, 847 | f"story.php?story_fbid={post_url}&id=1&m_entstream_source=timeline", 848 | ) 849 | post = {"original_request_url": post_url, "post_url": url} 850 | logger.debug(f"Requesting page from: {url}") 851 | response = self.get(url) 852 | if "/watch/" in response.url: 853 | video_id = parse_qs(urlparse(response.url).query).get("v")[0] 854 | url = f"story.php?story_fbid={video_id}&id={video_id}&m_entstream_source=video_home&player_suborigin=entry_point&player_format=permalink" 855 | logger.debug(f"Fetching {url}") 856 | response = self.get(url) 857 | 858 | if "cookie/consent-page" in response.url: 859 | response = self.submit_form(response) 860 | if ( 861 | response.url.startswith(FB_MOBILE_BASE_URL) 862 | and not response.html.find("script", first=True) 863 | and "script" not in response.html.html 864 | and self.session.cookies.get("noscript") != "1" 865 | ): 866 | warnings.warn( 867 | f"Facebook served mbasic/noscript content unexpectedly on {response.url}" 868 | ) 869 | if response.html.find("h1,h2", containing="Unsupported Browser"): 870 | warnings.warn(f"Facebook says 'Unsupported Browser'") 871 | title = response.html.find("title", first=True) 872 | not_found_titles = ["page not found", "content not found"] 873 | temp_ban_titles = [ 874 | "you can't use this feature at the moment", 875 | "you can't use this feature right now", 876 | "you’re temporarily blocked", 877 | ] 878 | if "checkpoint" in response.url: 879 | if response.html.find("h1", containing="We suspended your account"): 880 | raise exceptions.AccountDisabled("Your Account Has Been 
Disabled") 881 | if title: 882 | if title.text.lower() in not_found_titles: 883 | raise exceptions.NotFound(title.text) 884 | elif title.text.lower() == "error": 885 | raise exceptions.UnexpectedResponse("Your request couldn't be processed") 886 | elif title.text.lower() in temp_ban_titles: 887 | raise exceptions.TemporarilyBanned(title.text) 888 | elif ">your account has been disabled<" in response.html.html.lower(): 889 | raise exceptions.AccountDisabled("Your Account Has Been Disabled") 890 | elif ( 891 | ">We saw unusual activity on your account. This may mean that someone has used your account without your knowledge.<" 892 | in response.html.html 893 | ): 894 | raise exceptions.AccountDisabled("Your Account Has Been Locked") 895 | elif ( 896 | title.text == "Log in to Facebook | Facebook" 897 | or response.url.startswith(utils.urljoin(FB_MOBILE_BASE_URL, "login")) 898 | or response.url.startswith(utils.urljoin(FB_W3_BASE_URL, "login")) 899 | or response.url.startswith(utils.urljoin(FB_MBASIC_BASE_URL, "login")) 900 | ): 901 | raise exceptions.LoginRequired( 902 | "A login (cookies) is required to see this page" 903 | ) 904 | return response 905 | except RequestException as ex: 906 | logger.exception("Exception while requesting URL: %s\nException: %r", url, ex) 907 | raise 908 | 909 | def submit_form(self, response, extra_data={}): 910 | action = response.html.find("form", first=True).attrs.get('action') 911 | url = utils.urljoin(self.base_url, action) 912 | elems = response.html.find("input[name][value]") 913 | data = {elem.attrs['name']: elem.attrs['value'] for elem in elems} 914 | data.update(extra_data) 915 | response = self.session.post(url, data=data, **self.requests_kwargs) 916 | return response 917 | 918 | def login(self, email: str, password: str): 919 | response = self.get(self.base_url) 920 | 921 | datr_cookie = re.search('(?<=_js_datr",")[^"]+', response.html.html) 922 | if datr_cookie: 923 | cookie_value = datr_cookie.group() 924 | self.session.cookies.set('datr', cookie_value) 925 | 926 | response = self.submit_form( 927 | response, {"email": email, "pass": password, "_fb_noscript": None} 928 | ) 929 | 930 | login_error = response.html.find('#login_error', first=True) 931 | if login_error: 932 | raise exceptions.LoginError(login_error.text) 933 | 934 | if "enter login code to continue" in response.text.lower(): 935 | token = input("Enter 2FA token: ") 936 | response = self.submit_form(response, {"approvals_code": token}) 937 | strong = response.html.find("strong", first=True) 938 | if strong and strong.text.startswith("The login code you entered doesn't match"): 939 | raise exceptions.LoginError(strong.text) 940 | # Remember Browser 941 | response = self.submit_form(response, {"name_action_selected": "save_device"}) 942 | if "review recent login" in response.text.lower(): 943 | response = self.submit_form(response) 944 | # Login near {location} from {browser} on {OS} ({time}). Unset "This wasn't me", leaving "This was me" set. 945 | response = self.submit_form(response, {"submit[This wasn't me]": None}) 946 | # Remember Browser. Please save the browser that you just verified. You won't have to enter a code when you log in from browsers that you've saved. 947 | response = self.submit_form(response, {"name_action_selected": "save_device"}) 948 | 949 | if "login approval needed" in response.text.lower() or "checkpoint" in response.url: 950 | input( 951 | "Login approval needed. From a browser logged into this account, approve this login from your notifications. 
Press enter once you've approved it." 952 | ) 953 | response = self.submit_form(response, {"submit[Continue]": "Continue"}) 954 | if "the password that you entered is incorrect" in response.text.lower(): 955 | raise exceptions.LoginError("The password that you entered is incorrect") 956 | if 'c_user' not in self.session.cookies: 957 | with open("login_error.html", "w") as f: 958 | f.write(response.text) 959 | raise exceptions.LoginError("Login unsuccessful") 960 | 961 | def is_logged_in(self) -> bool: 962 | try: 963 | self.get('https://facebook.com/settings') 964 | return True 965 | except exceptions.LoginRequired: 966 | return False 967 | 968 | def _generic_get_posts( 969 | self, 970 | extract_post_fn, 971 | iter_pages_fn, 972 | page_limit=DEFAULT_PAGE_LIMIT, 973 | options=None, 974 | remove_source=True, 975 | latest_date=None, 976 | max_past_limit=5, 977 | **kwargs, 978 | ): 979 | if options is None: 980 | options = {} 981 | elif isinstance(options, set): 982 | warnings.warn("The options argument should be a dictionary.", stacklevel=3) 983 | options = {k: True for k in options} 984 | if self.session.cookies.get("noscript") == "1": 985 | options["noscript"] = True 986 | 987 | if page_limit and page_limit <= 2: 988 | warnings.warn( 989 | "A low page limit (<=2) might return no results, try increasing the limit", 990 | stacklevel=3, 991 | ) 992 | 993 | # if latest_date is specified, iterate until the date is reached n times in a row (recurrent_past_posts) 994 | if latest_date is not None: 995 | # Pinned posts repeat themselves over time, so ignore them 996 | pinned_posts = [] 997 | 998 | # Stats 999 | null_date_posts = 0 1000 | total_scraped_posts = 0 1001 | 1002 | # Helpers 1003 | recurrent_past_posts = 0 1004 | show_every = 50 1005 | done = False 1006 | 1007 | for page in iter_pages_fn(): 1008 | for post_element in page: 1009 | try: 1010 | post = extract_post_fn( 1011 | post_element, options=options, request_fn=self.get, **kwargs 1012 | ) 1013 | 1014 | if remove_source: 1015 | post.pop("source", None) 1016 | 1017 | # date is None, no way to check latest_date, yield it 1018 | if post["time"] is None: 1019 | null_date_posts += 1 1020 | 1021 | # date is above latest_date, yield it 1022 | if post["time"] > latest_date: 1023 | recurrent_past_posts = 0 1024 | 1025 | # if any of above, yield the post and continue 1026 | if post["time"] is None or post["time"] > latest_date: 1027 | total_scraped_posts += 1 1028 | if total_scraped_posts % show_every == 0: 1029 | logger.info("Posts scraped: %s", total_scraped_posts) 1030 | 1031 | yield post 1032 | continue 1033 | 1034 | # else, the date is behind the date limit 1035 | recurrent_past_posts += 1 1036 | 1037 | # and it has reached the max_past_limit posts 1038 | if recurrent_past_posts >= max_past_limit: 1039 | done = True 1040 | logger.info( 1041 | "Sequential posts behind latest_date reached. Stopping scraping." 1042 | ) 1043 | logger.info( 1044 | "Posts with null date: %s", 1045 | null_date_posts, 1046 | ) 1047 | break 1048 | 1049 | # or the text is not banned (repeated) 1050 | if post["text"] is not None and post["text"] not in pinned_posts: 1051 | pinned_posts.append(post["text"]) 1052 | logger.warning( 1053 | "Sequential post #%s behind the date limit: %s. Ignored (in logs) from now on.", 1054 | recurrent_past_posts, 1055 | post["time"], 1056 | ) 1057 | 1058 | except Exception as e: 1059 | logger.exception( 1060 | "An exception has occured during scraping: %s. 
Omitting the post...", 1061 | e, 1062 | ) 1063 | 1064 | # if max_past_limit, stop 1065 | if done: 1066 | break 1067 | 1068 | # else, iterate over pages as usual 1069 | else: 1070 | counter = itertools.count(0) if page_limit is None else range(page_limit) 1071 | 1072 | logger.debug("Starting to iterate pages") 1073 | for i, page in zip(counter, iter_pages_fn()): 1074 | logger.debug("Extracting posts from page %s", i) 1075 | # extra_info is already in the kwargs, so we pop it out 1076 | kwargs.pop("extra_info", None) 1077 | for post_element in page: 1078 | post = extract_post_fn( 1079 | post_element, 1080 | options=options, 1081 | request_fn=self.get, 1082 | extra_info=page.extra_info, 1083 | **kwargs, 1084 | ) 1085 | if remove_source: 1086 | post.pop('source', None) 1087 | yield post 1088 | 1089 | def get_groups_by_search(self, word: str, **kwargs): 1090 | group_search_url = utils.urljoin(FB_MOBILE_BASE_URL, f"search/groups/?q={word}") 1091 | r = self.get(group_search_url) 1092 | for group_element in r.html.find('div[role="button"]'): 1093 | button_id = group_element.attrs["id"] 1094 | group_id = self.find_group_id(button_id, r.text) 1095 | try: 1096 | yield self.get_group_info(group_id) 1097 | except AttributeError: 1098 | continue 1099 | 1100 | @staticmethod 1101 | def find_group_id(button_id, raw_html): 1102 | """Each group button has an id, which appears later in the script 1103 | tag followed by the group id.""" 1104 | s = raw_html[raw_html.rfind(button_id) :] 1105 | group_id = s[s.find("result_id:") :].split(",")[0].split(":")[1] 1106 | return int(group_id) 1107 | -------------------------------------------------------------------------------- /facebook_scraper/fb_types.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable, Dict, Iterable, Tuple 2 | 3 | from requests import Response 4 | from requests_html import Element 5 | 6 | 7 | URL = str 8 | Options = Dict[str, Any] 9 | Post = Dict[str, Any] 10 | Profile = Dict[str, Any] 11 | RequestFunction = Callable[[URL], Response] 12 | RawPage = Element 13 | RawPost = Element 14 | Page = Iterable[RawPost] 15 | Credentials = Tuple[str, str] 16 | -------------------------------------------------------------------------------- /facebook_scraper/internal_classes.py: -------------------------------------------------------------------------------- 1 | class PageClass: 2 | def __init__(self, raw_posts, extra_info=None): 3 | self.raw_posts = raw_posts 4 | self.extra_info = extra_info 5 | super().__init__() 6 | 7 | def __iter__(self): 8 | return iter(self.raw_posts) -------------------------------------------------------------------------------- /facebook_scraper/page_iterators.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import re 4 | import textwrap 5 | from typing import Iterator, Optional, Union 6 | import time 7 | 8 | from requests.exceptions import HTTPError 9 | import warnings 10 | 11 | from . import utils 12 | from .constants import FB_MOBILE_BASE_URL, FB_MBASIC_BASE_URL 13 | 14 | from .fb_types import URL, Page, RawPage, RequestFunction, Response 15 | from . 
import exceptions 16 | from .internal_classes import PageClass 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | def iter_hashtag_pages(hashtag: str, request_fn: RequestFunction, **kwargs) -> Iterator[Page]: 22 | start_url = kwargs.pop("start_url", None) 23 | if not start_url: 24 | start_url = utils.urljoin(FB_MBASIC_BASE_URL, f'/hashtag/{hashtag}/') 25 | try: 26 | request_fn(start_url) 27 | except Exception as ex: 28 | logger.error(ex) 29 | return generic_iter_pages(start_url, HashtagPageParser, request_fn, **kwargs) 30 | 31 | 32 | def iter_pages(account: str, request_fn: RequestFunction, **kwargs) -> Iterator[Page]: 33 | start_url = kwargs.pop("start_url", None) 34 | if not start_url: 35 | start_url = utils.urljoin( 36 | FB_MOBILE_BASE_URL, 37 | f'/{account}', 38 | ) 39 | return generic_iter_pages(start_url, PageParser, request_fn, **kwargs) 40 | 41 | 42 | def iter_group_pages( 43 | group: Union[str, int], request_fn: RequestFunction, **kwargs 44 | ) -> Iterator[Page]: 45 | start_url = kwargs.pop("start_url", None) 46 | 47 | if not start_url: 48 | start_url = utils.urljoin(FB_MOBILE_BASE_URL, f'groups/{group}/') 49 | 50 | return generic_iter_pages(start_url, GroupPageParser, request_fn, **kwargs) 51 | 52 | 53 | def iter_search_pages(word: str, request_fn: RequestFunction, **kwargs) -> Iterator[Page]: 54 | start_url = kwargs.pop("start_url", None) 55 | if not start_url: 56 | start_url = utils.urljoin( 57 | FB_MOBILE_BASE_URL, 58 | f'/search/posts?q={word}' 59 | f'&filters=eyJyZWNlbnRfcG9zdHM6MCI6IntcIm5hbWVcIjpcInJlY2VudF9wb3N0c1wiLFwiYXJnc1wiOlwiXCJ9In0%3D', 60 | ) 61 | try: 62 | request_fn(start_url) 63 | except Exception as ex: 64 | logger.error(ex) 65 | start_url = utils.urljoin(FB_MOBILE_BASE_URL, f'/search/posts?q={word}') 66 | return generic_iter_pages(start_url, SearchPageParser, request_fn, **kwargs) 67 | 68 | 69 | def iter_photos(account: str, request_fn: RequestFunction, **kwargs) -> Iterator[Page]: 70 | start_url = utils.urljoin(FB_MOBILE_BASE_URL, f'/{account}/photos/') 71 | return generic_iter_pages(start_url, PhotosPageParser, request_fn, **kwargs) 72 | 73 | 74 | def generic_iter_pages( 75 | start_url, page_parser_cls, request_fn: RequestFunction, **kwargs 76 | ) -> Iterator[PageClass]: 77 | next_url = start_url 78 | 79 | base_url = kwargs.get('base_url', FB_MOBILE_BASE_URL) 80 | request_url_callback = kwargs.get('request_url_callback') 81 | while next_url: 82 | # Execute callback of starting a new URL request 83 | if request_url_callback: 84 | # The callback can return an exit code to stop the iteration 85 | # This is useful in the cases where the requests triggers an infinite redirect loop. 86 | exit_code = request_url_callback(next_url) 87 | if exit_code: 88 | logger.debug("Exit code %s received from request_url_callback, exiting", exit_code) 89 | break 90 | 91 | RETRY_LIMIT = 6 92 | for retry in range(1, RETRY_LIMIT + 1): 93 | try: 94 | logger.debug("Requesting page from: %s", next_url) 95 | response = request_fn(next_url) 96 | break 97 | except HTTPError as e: 98 | if e.response.status_code == 500 and retry < RETRY_LIMIT: 99 | sleep_duration = retry * 2 100 | logger.debug( 101 | f"Caught exception, retry number {retry}. 
Sleeping for {sleep_duration}s" 102 | ) 103 | if retry == (RETRY_LIMIT / 2): 104 | logger.debug("Requesting noscript") 105 | kwargs["scraper"].set_noscript(True) 106 | time.sleep(sleep_duration) 107 | else: 108 | raise 109 | 110 | logger.debug("Parsing page response") 111 | parser = page_parser_cls(response) 112 | 113 | page = parser.get_page() 114 | 115 | # TODO: If page is actually an iterable calling len(page) might consume it 116 | logger.debug("Got %s raw posts from page", len(page.raw_posts)) 117 | yield page 118 | 119 | logger.debug("Looking for next page URL") 120 | next_page = parser.get_next_page() 121 | if next_page: 122 | posts_per_page = kwargs.get("options", {}).get("posts_per_page") 123 | if posts_per_page: 124 | next_page = next_page.replace("num_to_fetch=4", f"num_to_fetch={posts_per_page}") 125 | next_url = utils.urljoin(base_url, next_page) 126 | next_url = next_url.replace("amp;", f"") 127 | else: 128 | logger.info("Page parser did not find next page URL") 129 | next_url = None 130 | 131 | 132 | class PageParser: 133 | """Class for Parsing a single page on a Page""" 134 | 135 | json_prefix = 'for (;;);' 136 | 137 | cursor_regex = re.compile(r'href[:=]"(/page_content[^"]+)"') # First request 138 | cursor_regex_2 = re.compile(r'href"[:=]"(\\/page_content[^"]+)"') # Other requests 139 | cursor_regex_3 = re.compile( 140 | r'href:"(/profile/timeline/stream/\?cursor[^"]+)"' 141 | ) # scroll/cursor based, first request 142 | cursor_regex_4 = re.compile( 143 | r'href\\":\\"\\+(/profile\\+/timeline\\+/stream[^"]+)\"' 144 | ) # scroll/cursor based, other requests 145 | # adding new regex for the cursor 146 | cursor_regex_5 = re.compile( 147 | r'href="(/profile/timeline/stream/\?cursor[^"]+)"' 148 | ) # scroll/cursor based, first request 149 | 150 | def __init__(self, response: Response): 151 | self.response = response 152 | self.html = None 153 | self.cursor_blob = None 154 | 155 | self._parse() 156 | 157 | def get_page(self) -> PageClass: 158 | # Select only elements that have the data-ft attribute 159 | # it seems top_level_post_id is not always present, an update on the app is needed here but in case it's there 160 | # we can use it 161 | page = self._get_page('article[data-ft*="top_level_post_id"]', 'article') 162 | if (len(page) == 0): 163 | # TODO remove the backward compatible article selector 164 | page = self._get_page('article[data-ft], div[role="article"][data-ft]', 'article') 165 | return PageClass(page, self.get_page_info()) 166 | 167 | def get_page_info(self): 168 | more_page_element = self.html.find('a[href*="/mbasic/more/?owner_id"]', first=True) 169 | # TODO [Code quality] Refactor the regex search to use globally available 170 | message_page_element = self.html.find('a[href^="/messages/thread/"]', first=True) 171 | page_id_match = re.search(r'/messages/thread/(\d+)/', message_page_element.attrs.get('href')) if message_page_element else None 172 | 173 | # type 2 of page_id matching 174 | if page_id_match is None: 175 | logger.debug("trying type 2 of page_id matching against \"intent://user/\"") 176 | message_page_element = self.html.find('a[href^="intent://user/"]', first=True) 177 | page_id_match = re.search(r'intent://user/(\d+)/', 178 | message_page_element.attrs.get('href')) if message_page_element else None 179 | return { 180 | 'user_id': self.html.find('a[href^="/mbasic/more/?owner_id"]', first=True) 181 | .attrs.get('href') 182 | .split('owner_id=')[1] 183 | .split('&')[0] 184 | if more_page_element 185 | else None, 186 | 'page_id': page_id_match.group(1) 
if page_id_match else None 187 | } 188 | 189 | def get_raw_page(self) -> RawPage: 190 | return self.html 191 | 192 | def get_next_page(self) -> Optional[URL]: 193 | assert self.cursor_blob is not None 194 | 195 | match = self.cursor_regex.search(self.cursor_blob) 196 | if match: 197 | return utils.unquote(match.groups()[0]).replace("&", "&") 198 | 199 | match = self.cursor_regex_2.search(self.cursor_blob) 200 | if match: 201 | value = match.groups()[0] 202 | return utils.unquote( 203 | value.encode('utf-8').decode('unicode_escape').replace('\\/', '/') 204 | ).replace("&", "&") 205 | 206 | match = self.cursor_regex_3.search(self.cursor_blob) 207 | if match: 208 | return match.groups()[0] 209 | 210 | match = self.cursor_regex_4.search(self.response.text) 211 | if match: 212 | value = match.groups()[0] 213 | return re.sub(r'\\+/', '/', value) 214 | 215 | match = self.cursor_regex_5.search(self.cursor_blob) 216 | if match: 217 | return match.groups()[0] 218 | return None 219 | 220 | def _parse(self): 221 | if self.response.text.startswith(self.json_prefix): 222 | self._parse_json() 223 | else: 224 | self._parse_html() 225 | 226 | def _parse_html(self): 227 | self.html = self.response.html 228 | self.cursor_blob = self.response.text 229 | 230 | def _parse_json(self): 231 | prefix_length = len(self.json_prefix) 232 | data = json.loads(self.response.text[prefix_length:]) # Strip 'for (;;);' 233 | 234 | for action in data.get('payload', data)['actions']: 235 | if action['cmd'] == 'replace': 236 | self.html = utils.make_html_element(action['html'], url=FB_MOBILE_BASE_URL) 237 | self.cursor_blob = self.html.html 238 | elif action['cmd'] == 'script': 239 | self.cursor_blob = action['code'] 240 | 241 | assert self.html is not None 242 | 243 | def _get_page(self, selection, selection_name) -> Page: 244 | raw_page = self.get_raw_page() 245 | raw_posts = raw_page.find(selection) 246 | # This is not an issue anymore as fb doesn't send bad HTML anymore 247 | # TODO Remove this in the future as it's not needed 248 | #for post in raw_posts: 249 | #if not post.find("footer"): 250 | # This is not an issue anymore as fb doesn't send bad HTML anymore 251 | # Due to malformed HTML served by Facebook, lxml might misinterpret where the footer should go in article elements 252 | # If we limit the parsing just to the section element, it fixes it 253 | # Please forgive me for parsing HTML with regex 254 | #logger.warning(f"No footer in article - reparsing HTML within
element") 255 | #html = re.search(r'(.+)
', raw_page.html).group(1) 256 | #raw_page = utils.make_html_element(html=html) 257 | #raw_posts = raw_page.find(selection) 258 | #break 259 | 260 | if not raw_posts: 261 | logger.warning( 262 | "No raw posts (<%s> elements) were found in this page." % selection_name 263 | ) 264 | if logger.isEnabledFor(logging.DEBUG): 265 | content = textwrap.indent( 266 | raw_page.text, 267 | prefix='| ', 268 | predicate=lambda _: True, 269 | ) 270 | sep = '+' + '-' * 60 271 | logger.debug("The page url is: %s", self.response.url) 272 | logger.debug("The page content is:\n%s\n%s%s\n", sep, content, sep) 273 | 274 | return raw_posts 275 | 276 | 277 | class GroupPageParser(PageParser): 278 | """Class for parsing a single page of a group""" 279 | 280 | cursor_regex_3 = re.compile(r'href[=:]"(\/groups\/[^"]+bac=[^"]+)"') # for Group requests 281 | cursor_regex_3_basic_new = re.compile( 282 | r'href[=:]"(\/groups\/[^"]+bacr=[^"]+)"' 283 | ) # for mbasic Group requests 2023 284 | 285 | def get_next_page(self) -> Optional[URL]: 286 | next_page = super().get_next_page() 287 | if next_page: 288 | return next_page 289 | 290 | assert self.cursor_blob is not None 291 | logger.debug("using extra page processor") 292 | match = self.cursor_regex_3.search(self.cursor_blob) 293 | if match: 294 | value = match.groups()[0] 295 | return value.encode('utf-8').decode('unicode_escape').replace('\\/', '/') 296 | else: 297 | match = self.cursor_regex_3_basic_new.search(self.cursor_blob) 298 | return ( 299 | match.groups()[0].encode('utf-8').decode('unicode_escape').replace('\\/', '/') 300 | if match 301 | else None 302 | ) 303 | return None 304 | 305 | def _parse(self): 306 | self._parse_html() 307 | 308 | 309 | class PhotosPageParser(PageParser): 310 | cursor_regex = re.compile(r'href:"(/photos/pandora/[^"]+)"') 311 | cursor_regex_2 = re.compile(r'href":"(\\/photos\\/pandora\\/[^"]+)"') 312 | 313 | def get_page(self) -> Page: 314 | return super()._get_page('div._5v64', "div._5v64") 315 | 316 | def get_next_page(self) -> Optional[URL]: 317 | if self.cursor_blob is not None: 318 | match = self.cursor_regex.search(self.cursor_blob) 319 | if match: 320 | return match.groups()[0] 321 | 322 | match = self.cursor_regex_2.search(self.cursor_blob) 323 | if match: 324 | value = match.groups()[0] 325 | return value.encode('utf-8').decode('unicode_escape').replace('\\/', '/') 326 | 327 | 328 | class SearchPageParser(PageParser): 329 | cursor_regex = re.compile(r'href[:=]"[^"]+(/search/[^"]+)"') 330 | cursor_regex_2 = re.compile(r'href":"[^"]+(/search/[^"]+)"') 331 | 332 | def get_next_page(self) -> Optional[URL]: 333 | if self.cursor_blob is not None: 334 | match = self.cursor_regex.search(self.cursor_blob) 335 | if match: 336 | return match.groups()[0] 337 | 338 | match = self.cursor_regex_2.search(self.cursor_blob) 339 | if match: 340 | value = match.groups()[0] 341 | return value.encode('utf-8').decode('unicode_escape').replace('\\/', '/') 342 | 343 | 344 | class HashtagPageParser(PageParser): 345 | cursor_regex = re.compile(r'(\/hashtag\/[a-z]+\/\?cursor=[^"]+).*$') 346 | 347 | def get_page(self) -> Page: 348 | return super()._get_page('article', 'article') 349 | 350 | def get_next_page(self) -> Optional[URL]: 351 | assert self.cursor_blob is not None 352 | 353 | match = self.cursor_regex.search(self.cursor_blob) 354 | if match: 355 | return utils.unquote(match.groups()[0]).replace("&", "&") 356 | 357 | return None 358 | -------------------------------------------------------------------------------- /facebook_scraper/utils.py: 
-------------------------------------------------------------------------------- 1 | import codecs 2 | import re 3 | from datetime import datetime, timedelta 4 | import calendar 5 | from typing import Optional 6 | from urllib.parse import parse_qsl, unquote, urlencode, urljoin, urlparse, urlunparse 7 | 8 | import dateparser 9 | import lxml.html 10 | from bs4 import BeautifulSoup 11 | from requests.cookies import RequestsCookieJar 12 | from requests_html import DEFAULT_URL, Element, PyQuery 13 | import json 14 | import traceback 15 | 16 | from . import exceptions 17 | import logging 18 | import time 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | def find_and_search(node, selector, pattern, cast=str): 24 | container = node.find(selector, first=True) 25 | match = container and pattern.search(container.html) 26 | return match and cast(match.groups()[0]) 27 | 28 | 29 | def parse_int(value: str) -> int: 30 | return int(''.join(filter(lambda c: c.isdigit(), value))) 31 | 32 | 33 | def convert_numeric_abbr(s): 34 | mapping = {'k': 1000, 'm': 1e6} 35 | s = s.replace(",", "") 36 | if s[-1].isalpha(): 37 | return int(float(s[:-1]) * mapping[s[-1].lower()]) 38 | return int(s) 39 | 40 | 41 | def parse_duration(s) -> int: 42 | match = re.search(r'T(?P\d+H)?(?P\d+M)?(?P\d+S)', s) 43 | if match: 44 | result = 0 45 | for k, v in match.groupdict().items(): 46 | if v: 47 | if k == 'hours': 48 | result += int(v.strip("H")) * 60 * 60 49 | elif k == "minutes": 50 | result += int(v.strip("M")) * 60 51 | elif k == "seconds": 52 | result += int(v.strip("S")) 53 | return result 54 | 55 | 56 | def decode_css_url(url: str) -> str: 57 | url = re.sub(r'\\(..) ', r'\\x\g<1>', url) 58 | url, _ = codecs.unicode_escape_decode(url) 59 | url, _ = codecs.unicode_escape_decode(url) 60 | return url 61 | 62 | 63 | def get_background_image_url(style): 64 | match = re.search(r"url\('(.+)'\)", style) 65 | return decode_css_url(match.groups()[0]) 66 | 67 | 68 | def filter_query_params(url, whitelist=None, blacklist=None) -> str: 69 | def is_valid_param(param): 70 | if whitelist is not None: 71 | return param in whitelist 72 | if blacklist is not None: 73 | return param not in blacklist 74 | return True # Do nothing 75 | 76 | parsed_url = urlparse(url) 77 | query_params = parse_qsl(parsed_url.query) 78 | query_string = urlencode([(k, v) for k, v in query_params if is_valid_param(k)]) 79 | return urlunparse(parsed_url._replace(query=query_string)) 80 | 81 | 82 | def combine_url_params(url1, url2) -> str: 83 | parsed_url = urlparse(url1) 84 | parsed_url2 = urlparse(url2) 85 | query_params = parse_qsl(parsed_url.query) + parse_qsl(parsed_url2.query) 86 | query_string = urlencode([(k, v) for k, v in query_params]) 87 | return urlunparse(parsed_url._replace(query=query_string)) 88 | 89 | 90 | def remove_control_characters(html): 91 | # type: (t.Text) -> t.Text 92 | """ 93 | Strip invalid XML characters that `lxml` cannot parse. 
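    Characters in the invalid XML ranges (U+0000-U+0008, U+000B, U+000C, U+000E-U+001F,
    surrogates and U+FFFE/U+FFFF) are removed whether they appear literally or as numeric
    character references; remaining non-ASCII text is kept as XML character references.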
94 | """ 95 | # See: https://github.com/html5lib/html5lib-python/issues/96 96 | # 97 | # The XML 1.0 spec defines the valid character range as: 98 | # Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] 99 | # 100 | # We can instead match the invalid characters by inverting that range into: 101 | # InvalidChar ::= #xb | #xc | #xFFFE | #xFFFF | [#x0-#x8] | [#xe-#x1F] | [#xD800-#xDFFF] 102 | # 103 | # Sources: 104 | # https://www.w3.org/TR/REC-xml/#charsets, 105 | # https://lsimons.wordpress.com/2011/03/17/stripping-illegal-characters-out-of-xml-in-python/ 106 | def strip_illegal_xml_characters(s, default, base=10): 107 | # Compare the "invalid XML character range" numerically 108 | n = int(s, base) 109 | if ( 110 | n in (0xB, 0xC, 0xFFFE, 0xFFFF) 111 | or 0x0 <= n <= 0x8 112 | or 0xE <= n <= 0x1F 113 | or 0xD800 <= n <= 0xDFFF 114 | ): 115 | return "" 116 | return default 117 | 118 | # We encode all non-ascii characters to XML char-refs, so for example "💖" becomes: "💖" 119 | # Otherwise we'd remove emojis by mistake on narrow-unicode builds of Python 120 | html = html.encode("ascii", "xmlcharrefreplace").decode("utf-8") 121 | html = re.sub( 122 | r"&#(\d+);?", lambda c: strip_illegal_xml_characters(c.group(1), c.group(0)), html 123 | ) 124 | html = re.sub( 125 | r"&#[xX]([0-9a-fA-F]+);?", 126 | lambda c: strip_illegal_xml_characters(c.group(1), c.group(0), base=16), 127 | html, 128 | ) 129 | # A regex matching the "invalid XML character range" 130 | html = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFFFE\uFFFF]").sub("", html) 131 | return html 132 | 133 | 134 | def make_html_element(html: str, url=DEFAULT_URL) -> Element: 135 | html = remove_control_characters(html) 136 | pq_element = PyQuery(html)[0] # PyQuery is a list, so we take the first element 137 | return Element(element=pq_element, url=url) 138 | 139 | 140 | month = ( 141 | r"Jan(?:uary)?|" 142 | r"Feb(?:ruary)?|" 143 | r"Mar(?:ch)?|" 144 | r"Apr(?:il)?|" 145 | r"May|" 146 | r"Jun(?:e)?|" 147 | r"Jul(?:y)?|" 148 | r"Aug(?:ust)?|" 149 | r"Sep(?:tember)?|" 150 | r"Oct(?:ober)?|" 151 | r"Nov(?:ember)?|" 152 | r"Dec(?:ember)?" 153 | ) 154 | day_of_week = r"Mon|" r"Tue|" r"Wed|" r"Thu|" r"Fri|" r"Sat|" r"Sun" 155 | day_of_month = r"\d{1,2}" 156 | specific_date_md = f"(?:{month}) {day_of_month}" + r"(?:,? \d{4})?" 157 | specific_date_dm = f"{day_of_month} (?:{month})" + r"(?:,? \d{4})?" 158 | 159 | date = f"{specific_date_md}|{specific_date_dm}|Today|Yesterday" 160 | 161 | hour = r"\d{1,2}" 162 | minute = r"\d{2}" 163 | period = r"AM|PM|" 164 | 165 | exact_time = f"(?:{date}) at {hour}:{minute} ?(?:{period})" 166 | relative_time_years = r'\b\d{1,2} yr' 167 | relative_time_months = r'\b\d{1,2} (?:mth|mo)' 168 | relative_time_weeks = r'\b\d{1,2} wk' 169 | relative_time_hours = r"\b\d{1,2} ?h(?:rs?)?" 170 | relative_time_mins = r"\b\d{1,2} ?mins?" 171 | relative_time = f"{relative_time_years}|{relative_time_months}|{relative_time_weeks}|{relative_time_hours}|{relative_time_mins}" 172 | 173 | datetime_regex = re.compile(fr"({exact_time}|{relative_time})", re.IGNORECASE) 174 | day_of_week_regex = re.compile(fr"({day_of_week})", re.IGNORECASE) 175 | 176 | 177 | def parse_datetime(text: str, search=True) -> Optional[datetime]: 178 | """Looks for a string that looks like a date and parses it into a datetime object. 179 | 180 | Uses a regex to look for the date in the string. 181 | Uses dateparser to parse the date (not thread safe). 182 | 183 | Args: 184 | text: The text where the date should be. 
185 | search: If false, skip the regex search and try to parse the complete string. 186 | 187 | Returns: 188 | The datetime object, or None if it couldn't find a date. 189 | """ 190 | settings = { 191 | 'RELATIVE_BASE': datetime.today().replace(minute=0, hour=0, second=0, microsecond=0) 192 | } 193 | if search: 194 | time_match = datetime_regex.search(text) 195 | dow_match = day_of_week_regex.search(text) 196 | if time_match: 197 | text = time_match.group(0).replace("mth", "month") 198 | elif dow_match: 199 | text = dow_match.group(0) 200 | today = calendar.day_abbr[datetime.today().weekday()] 201 | if text == today: 202 | # Fix for dateparser misinterpreting "last Monday" as today if today is Monday 203 | return dateparser.parse(text, settings=settings) - timedelta(days=7) 204 | 205 | result = dateparser.parse(text, settings=settings) 206 | if result: 207 | return result.replace(microsecond=0) 208 | return None 209 | 210 | 211 | def html_element_to_string(element: Element, pretty=False) -> str: 212 | html = lxml.html.tostring(element.element, encoding='unicode') 213 | if pretty: 214 | html = BeautifulSoup(html, features='html.parser').prettify() 215 | return html 216 | 217 | 218 | def parse_cookie_file(filename: str) -> RequestsCookieJar: 219 | jar = RequestsCookieJar() 220 | 221 | with open(filename, mode='rt') as file: 222 | data = file.read() 223 | 224 | try: 225 | data = json.loads(data) 226 | if type(data) is list: 227 | for c in data: 228 | expires = c.get("expirationDate") or c.get("Expires raw") 229 | if expires: 230 | expires = int(expires) 231 | if "Name raw" in c: 232 | # Cookie Quick Manager JSON format 233 | host = c["Host raw"].replace("https://", "").strip("/") 234 | jar.set( 235 | c["Name raw"], 236 | c["Content raw"], 237 | domain=host, 238 | path=c["Path raw"], 239 | expires=expires, 240 | ) 241 | else: 242 | # EditThisCookie JSON format 243 | jar.set( 244 | c["name"], 245 | c["value"], 246 | domain=c["domain"], 247 | path=c["path"], 248 | secure=c["secure"], 249 | expires=expires, 250 | ) 251 | elif type(data) is dict: 252 | for k, v in data.items(): 253 | if type(v) is dict: 254 | jar.set(k, v["value"]) 255 | else: 256 | jar.set(k, v) 257 | except json.decoder.JSONDecodeError: 258 | # Netscape format 259 | for i, line in enumerate(data.splitlines()): 260 | line = line.strip() 261 | if line == "" or line.startswith('#'): 262 | continue 263 | 264 | try: 265 | domain, _, path, secure, expires, name, value = line.split('\t') 266 | except Exception as e: 267 | raise exceptions.InvalidCookies(f"Can't parse line {i + 1}: '{line}'") 268 | secure = secure.lower() == 'true' 269 | expires = None if expires == '0' else int(expires) 270 | 271 | jar.set(name, value, domain=domain, path=path, secure=secure, expires=expires) 272 | 273 | return jar 274 | 275 | 276 | def safe_consume(generator, sleep=0): 277 | result = [] 278 | try: 279 | for item in generator: 280 | result.append(item) 281 | time.sleep(sleep) 282 | except Exception as e: 283 | traceback.print_exc() 284 | logger.error(f"Exception when consuming {generator}: {type(e)}: {str(e)}") 285 | return result 286 | 287 | 288 | reaction_lookup = { 289 | '1': { 290 | 'color': '#2078f4', 291 | 'display_name': 'Like', 292 | 'is_deprecated': False, 293 | 'is_visible': True, 294 | 'name': 'like', 295 | 'type': 1, 296 | }, 297 | '10': { 298 | 'color': '#f0ba15', 299 | 'display_name': 'Confused', 300 | 'is_deprecated': True, 301 | 'is_visible': False, 302 | 'name': 'confused', 303 | 'type': 10, 304 | }, 305 | '11': { 306 | 'color': 
'#7e64c4', 307 | 'display_name': 'Thankful', 308 | 'is_deprecated': False, 309 | 'is_visible': True, 310 | 'name': 'dorothy', 311 | 'type': 11, 312 | }, 313 | '12': { 314 | 'color': '#ec7ebd', 315 | 'display_name': 'Pride', 316 | 'is_deprecated': False, 317 | 'is_visible': True, 318 | 'name': 'toto', 319 | 'type': 12, 320 | }, 321 | '13': { 322 | 'color': '#f0ba15', 323 | 'display_name': 'Selfie', 324 | 'is_deprecated': False, 325 | 'is_visible': False, 326 | 'name': 'selfie', 327 | 'type': 13, 328 | }, 329 | '14': { 330 | 'color': '#f0ba15', 331 | 'display_name': 'React', 332 | 'is_deprecated': True, 333 | 'is_visible': False, 334 | 'name': 'flame', 335 | 'type': 14, 336 | }, 337 | '15': { 338 | 'color': '#f0ba15', 339 | 'display_name': 'React', 340 | 'is_deprecated': True, 341 | 'is_visible': False, 342 | 'name': 'plane', 343 | 'type': 15, 344 | }, 345 | '16': { 346 | 'color': '#f7b125', 347 | 'display_name': 'Care', 348 | 'is_deprecated': False, 349 | 'is_visible': True, 350 | 'name': 'support', 351 | 'type': 16, 352 | }, 353 | '2': { 354 | 'color': '#f33e58', 355 | 'display_name': 'Love', 356 | 'is_deprecated': False, 357 | 'is_visible': True, 358 | 'name': 'love', 359 | 'type': 2, 360 | }, 361 | '3': { 362 | 'color': '#f7b125', 363 | 'display_name': 'Wow', 364 | 'is_deprecated': False, 365 | 'is_visible': True, 366 | 'name': 'wow', 367 | 'type': 3, 368 | }, 369 | '4': { 370 | 'color': '#f7b125', 371 | 'display_name': 'Haha', 372 | 'is_deprecated': False, 373 | 'is_visible': True, 374 | 'name': 'haha', 375 | 'type': 4, 376 | }, 377 | '5': { 378 | 'color': '#f0ba15', 379 | 'display_name': 'Yay', 380 | 'is_deprecated': True, 381 | 'is_visible': False, 382 | 'name': 'yay', 383 | 'type': 5, 384 | }, 385 | '7': { 386 | 'color': '#f7b125', 387 | 'display_name': 'Sad', 388 | 'is_deprecated': False, 389 | 'is_visible': True, 390 | 'name': 'sorry', 391 | 'type': 7, 392 | }, 393 | '8': { 394 | 'color': '#e9710f', 395 | 'display_name': 'Angry', 396 | 'is_deprecated': False, 397 | 'is_visible': True, 398 | 'name': 'anger', 399 | 'type': 8, 400 | }, 401 | '1635855486666999': { 402 | 'color': '#2078f4', 403 | 'display_name': 'Like', 404 | 'is_deprecated': False, 405 | 'is_visible': True, 406 | 'name': 'like', 407 | 'type': 1635855486666999, 408 | }, 409 | '613557422527858': { 410 | 'color': '#f7b125', 411 | 'display_name': 'Care', 412 | 'is_deprecated': False, 413 | 'is_visible': True, 414 | 'name': 'support', 415 | 'type': 613557422527858, 416 | }, 417 | '1678524932434102': { 418 | 'color': '#f33e58', 419 | 'display_name': 'Love', 420 | 'is_deprecated': False, 421 | 'is_visible': True, 422 | 'name': 'love', 423 | 'type': 1678524932434102, 424 | }, 425 | '478547315650144': { 426 | 'color': '#f7b125', 427 | 'display_name': 'Wow', 428 | 'is_deprecated': False, 429 | 'is_visible': True, 430 | 'name': 'wow', 431 | 'type': 478547315650144, 432 | }, 433 | '115940658764963': { 434 | 'color': '#f7b125', 435 | 'display_name': 'Haha', 436 | 'is_deprecated': False, 437 | 'is_visible': True, 438 | 'name': 'haha', 439 | 'type': 115940658764963, 440 | }, 441 | '908563459236466': { 442 | 'color': '#f7b125', 443 | 'display_name': 'Sad', 444 | 'is_deprecated': False, 445 | 'is_visible': True, 446 | 'name': 'sorry', 447 | 'type': 908563459236466, 448 | }, 449 | '444813342392137': { 450 | 'color': '#e9710f', 451 | 'display_name': 'Angry', 452 | 'is_deprecated': False, 453 | 'is_visible': True, 454 | 'name': 'anger', 455 | 'type': 444813342392137, 456 | }, 457 | } 458 | 459 | emoji_class_lookup = { 460 | 'sx_0ae260': 
'care', 461 | 'sx_0e815d': 'haha', 462 | 'sx_199220': 'angry', 463 | 'sx_3a00ef': 'like', 464 | 'sx_3ecf2a': 'sad', 465 | 'sx_78dbdd': 'angry', 466 | 'sx_a35dca': 'love', 467 | 'sx_c3ed6c': 'sad', 468 | 'sx_ce3068': 'haha', 469 | 'sx_d80e3a': 'wow', 470 | 'sx_d8e63d': 'care', 471 | 'sx_e303cc': 'like', 472 | 'sx_f21116': 'love', 473 | 'sx_f75acf': 'wow', 474 | 'sx_a70a0c': 'like', 475 | } 476 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "facebook-scraper" 3 | version = "0.2.60" 4 | description = "Scrape Facebook public pages without an API key" 5 | authors = ["Kevin Zúñiga "] 6 | license = "MIT" 7 | readme = "README.md" 8 | repository = "https://github.com/kevinzg/facebook-scraper" 9 | 10 | [tool.poetry.dependencies] 11 | python = "^3.6" 12 | requests-html = "^0.10.0" 13 | youtube_dl = {version = "*", optional=true} 14 | browser-cookie3 = {version = "*", optional=true} 15 | dateparser = "^1.0.0" 16 | demjson3 = "^3.0.5" 17 | 18 | [tool.poetry.dev-dependencies] 19 | ipdb = {version = "*", python = "^3.7"} 20 | ipython = {version = "*", python = "^3.7"} 21 | pytest = "^6.2.2" 22 | pytest-vcr = "^1.0.2" 23 | 24 | [tool.poetry.extras] 25 | youtube-dl = ["youtube_dl"] 26 | browser-cookie3 = ["browser-cookie3"] 27 | 28 | [tool.poetry.scripts] 29 | facebook-scraper = 'facebook_scraper.__main__:run' 30 | 31 | [tool.black] 32 | line-length = 98 33 | target-version = ["py36"] 34 | skip-string-normalization = true 35 | 36 | [build-system] 37 | requires = ["poetry-core>=1.0.0"] 38 | build-backend = "poetry.core.masonry.api" 39 | 40 | [tool.pytest.ini_options] 41 | filterwarnings = [ 42 | "ignore::UserWarning", 43 | ] -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | appdirs==1.4.4; python_version >= "3.5" and python_full_version >= "3.6.0" \ 2 | --hash=sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128 \ 3 | --hash=sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41 4 | appnope==0.1.2; python_version >= "3.7" and python_version < "4.0" and sys_platform == "darwin" \ 5 | --hash=sha256:93aa393e9d6c54c5cd570ccadd8edad61ea0c4b9ea7a01409020c9aa019eb442 \ 6 | --hash=sha256:dd83cd4b5b460958838f6eb3000c660b1f9caf2a5b1de4264e941512f603258a 7 | atomicwrites==1.4.0; python_version >= "3.6" and python_full_version < "3.0.0" and sys_platform == "win32" or sys_platform == "win32" and python_version >= "3.6" and python_full_version >= "3.4.0" \ 8 | --hash=sha256:6d1784dea7c0c8d4a5172b6c620f40b6e4cbfdf96d783691f2e1302a7b88e197 \ 9 | --hash=sha256:ae70396ad1a434f9c7046fd2dd196fc04b12f9e91ffb859164193be8b6168a7a 10 | attrs==20.3.0; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.6" \ 11 | --hash=sha256:31b2eced602aa8423c2aea9c76a724617ed67cf9513173fd3a4f03e3a929c7e6 \ 12 | --hash=sha256:832aa3cde19744e49938b91fea06d69ecb9e649c93ba974535d08ad92164f700 13 | backcall==0.2.0; python_version >= "3.7" and python_version < "4.0" \ 14 | --hash=sha256:fbbce6a29f263178a1f7915c1940bde0ec2b2a967566fe1c65c1dfb7422bd255 \ 15 | --hash=sha256:5cbdbf27be5e7cfadb448baf0aa95508f91f2bbc6c6437cd9cd06e2a4c215e1e 16 | beautifulsoup4==4.9.3; python_full_version >= "3.6.0" \ 17 | 
--hash=sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35 \ 18 | --hash=sha256:fff47e031e34ec82bf17e00da8f592fe7de69aeea38be00523c04623c04fb666 \ 19 | --hash=sha256:84729e322ad1d5b4d25f805bfa05b902dd96450f43842c4e99067d5e1369eb25 20 | bs4==0.0.1; python_full_version >= "3.6.0" \ 21 | --hash=sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a 22 | certifi==2020.12.5; python_full_version >= "3.6.0" \ 23 | --hash=sha256:719a74fb9e33b9bd44cc7f3a8d94bc35e4049deebe19ba7d8e108280cfd59830 \ 24 | --hash=sha256:1a4995114262bffbc2413b159f2a1a480c969de6e6eb13ee966d470af86af59c 25 | chardet==4.0.0; python_full_version >= "3.6.0" \ 26 | --hash=sha256:f864054d66fd9118f2e67044ac8981a54775ec5b67aed0441892edb553d21da5 \ 27 | --hash=sha256:0d6f53a15db4120f2b08c94f11e7d93d2c911ee118b6b30a04ec3ee8310179fa 28 | colorama==0.4.4; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" and sys_platform == "win32" or python_version >= "3.7" and python_version < "4.0" and sys_platform == "win32" and python_full_version >= "3.5.0" \ 29 | --hash=sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2 \ 30 | --hash=sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b 31 | cssselect==1.1.0; python_full_version >= "3.6.0" \ 32 | --hash=sha256:f612ee47b749c877ebae5bb77035d8f4202c6ad0f0fc1271b3c18ad6c4468ecf \ 33 | --hash=sha256:f95f8dedd925fd8f54edb3d2dfb44c190d9d18512377d3c1e2388d16126879bc 34 | dateparser==1.0.0; python_version >= "3.5" \ 35 | --hash=sha256:17202df32c7a36e773136ff353aa3767e987f8b3e27374c39fd21a30a803d6f8 \ 36 | --hash=sha256:159cc4e01a593706a15cd4e269a0b3345edf3aef8bf9278a57dac8adf5bf1e4a 37 | decorator==4.4.2; python_version >= "3.7" and python_full_version < "3.0.0" and python_version < "4.0" or python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.2.0" \ 38 | --hash=sha256:41fa54c2a0cc4ba648be4fd43cff00aedf5b9465c9bf18d64325bc225f08f760 \ 39 | --hash=sha256:e3a62f0520172440ca0dcc823749319382e377f37f140a0b99ef45fecb84bfe7 40 | fake-useragent==0.1.11; python_full_version >= "3.6.0" \ 41 | --hash=sha256:c104998b750eb097eefc28ae28e92d66397598d2cf41a31aa45d5559ef1adf35 42 | idna==2.10; python_full_version >= "3.6.0" and python_version >= "3.6" \ 43 | --hash=sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0 \ 44 | --hash=sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6 45 | importlib-metadata==3.7.3; python_version < "3.8" and python_version >= "3.6" and (python_version >= "3.6" and python_full_version < "3.0.0" and python_version < "3.8" or python_full_version >= "3.4.0" and python_version >= "3.6" and python_version < "3.8") \ 46 | --hash=sha256:b74159469b464a99cb8cc3e21973e4d96e05d3024d337313fedb618a6e86e6f4 \ 47 | --hash=sha256:742add720a20d0467df2f444ae41704000f50e1234f46174b51f9c6031a1bd71 48 | iniconfig==1.1.1; python_version >= "3.6" \ 49 | --hash=sha256:011e24c64b7f47f6ebd835bb12a743f2fbe9a26d4cecaa7f53bc4f35ee9da8b3 \ 50 | --hash=sha256:bc3af051d7d14b2ee5ef9969666def0cd1a000e121eaea580d4a313df4b37f32 51 | ipdb==0.13.7; python_version >= "3.7" and python_version < "4.0" \ 52 | --hash=sha256:178c367a61c1039e44e17c56fcc4a6e7dc11b33561261382d419b6ddb4401810 53 | ipython-genutils==0.2.0; python_version >= "3.7" and python_version < "4.0" \ 54 | --hash=sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8 \ 55 | --hash=sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8 56 | ipython==7.21.0; 
python_version >= "3.7" and python_version < "4.0" \ 57 | --hash=sha256:34207ffb2f653bced2bc8e3756c1db86e7d93e44ed049daae9814fed66d408ec \ 58 | --hash=sha256:04323f72d5b85b606330b6d7e2dc8d2683ad46c3905e955aa96ecc7a99388e70 59 | jedi==0.18.0; python_version >= "3.7" and python_version < "4.0" \ 60 | --hash=sha256:18456d83f65f400ab0c2d3319e48520420ef43b23a086fdc05dff34132f0fb93 \ 61 | --hash=sha256:92550a404bad8afed881a137ec9a461fed49eca661414be45059329614ed0707 62 | lxml==4.9.1; python_full_version >= "3.6.0" \ 63 | --hash=sha256:04da965dfebb5dac2619cb90fcf93efdb35b3c6994fea58a157a834f2f94b318 \ 64 | --hash=sha256:0538747a9d7827ce3e16a8fdd201a99e661c7dee3c96c885d8ecba3c35d1032c \ 65 | --hash=sha256:0645e934e940107e2fdbe7c5b6fb8ec6232444260752598bc4d09511bd056c0b \ 66 | --hash=sha256:079b68f197c796e42aa80b1f739f058dcee796dc725cc9a1be0cdb08fc45b000 \ 67 | --hash=sha256:0f3f0059891d3254c7b5fb935330d6db38d6519ecd238ca4fce93c234b4a0f73 \ 68 | --hash=sha256:10d2017f9150248563bb579cd0d07c61c58da85c922b780060dcc9a3aa9f432d \ 69 | --hash=sha256:1355755b62c28950f9ce123c7a41460ed9743c699905cbe664a5bcc5c9c7c7fb \ 70 | --hash=sha256:13c90064b224e10c14dcdf8086688d3f0e612db53766e7478d7754703295c7c8 \ 71 | --hash=sha256:1423631e3d51008871299525b541413c9b6c6423593e89f9c4cfbe8460afc0a2 \ 72 | --hash=sha256:1436cf0063bba7888e43f1ba8d58824f085410ea2025befe81150aceb123e345 \ 73 | --hash=sha256:1a7c59c6ffd6ef5db362b798f350e24ab2cfa5700d53ac6681918f314a4d3b94 \ 74 | --hash=sha256:1e1cf47774373777936c5aabad489fef7b1c087dcd1f426b621fda9dcc12994e \ 75 | --hash=sha256:206a51077773c6c5d2ce1991327cda719063a47adc02bd703c56a662cdb6c58b \ 76 | --hash=sha256:21fb3d24ab430fc538a96e9fbb9b150029914805d551deeac7d7822f64631dfc \ 77 | --hash=sha256:27e590352c76156f50f538dbcebd1925317a0f70540f7dc8c97d2931c595783a \ 78 | --hash=sha256:287605bede6bd36e930577c5925fcea17cb30453d96a7b4c63c14a257118dbb9 \ 79 | --hash=sha256:2aaf6a0a6465d39b5ca69688fce82d20088c1838534982996ec46633dc7ad6cc \ 80 | --hash=sha256:32a73c53783becdb7eaf75a2a1525ea8e49379fb7248c3eeefb9412123536387 \ 81 | --hash=sha256:41fb58868b816c202e8881fd0f179a4644ce6e7cbbb248ef0283a34b73ec73bb \ 82 | --hash=sha256:4780677767dd52b99f0af1f123bc2c22873d30b474aa0e2fc3fe5e02217687c7 \ 83 | --hash=sha256:4878e667ebabe9b65e785ac8da4d48886fe81193a84bbe49f12acff8f7a383a4 \ 84 | --hash=sha256:487c8e61d7acc50b8be82bda8c8d21d20e133c3cbf41bd8ad7eb1aaeb3f07c97 \ 85 | --hash=sha256:4beea0f31491bc086991b97517b9683e5cfb369205dac0148ef685ac12a20a67 \ 86 | --hash=sha256:4cfbe42c686f33944e12f45a27d25a492cc0e43e1dc1da5d6a87cbcaf2e95627 \ 87 | --hash=sha256:4d5bae0a37af799207140652a700f21a85946f107a199bcb06720b13a4f1f0b7 \ 88 | --hash=sha256:4e285b5f2bf321fc0857b491b5028c5f276ec0c873b985d58d7748ece1d770dd \ 89 | --hash=sha256:57e4d637258703d14171b54203fd6822fda218c6c2658a7d30816b10995f29f3 \ 90 | --hash=sha256:5974895115737a74a00b321e339b9c3f45c20275d226398ae79ac008d908bff7 \ 91 | --hash=sha256:5ef87fca280fb15342726bd5f980f6faf8b84a5287fcc2d4962ea8af88b35130 \ 92 | --hash=sha256:603a464c2e67d8a546ddaa206d98e3246e5db05594b97db844c2f0a1af37cf5b \ 93 | --hash=sha256:6653071f4f9bac46fbc30f3c7838b0e9063ee335908c5d61fb7a4a86c8fd2036 \ 94 | --hash=sha256:6ca2264f341dd81e41f3fffecec6e446aa2121e0b8d026fb5130e02de1402785 \ 95 | --hash=sha256:6d279033bf614953c3fc4a0aa9ac33a21e8044ca72d4fa8b9273fe75359d5cca \ 96 | --hash=sha256:6d949f53ad4fc7cf02c44d6678e7ff05ec5f5552b235b9e136bd52e9bf730b91 \ 97 | --hash=sha256:6daa662aba22ef3258934105be2dd9afa5bb45748f4f702a3b39a5bf53a1f4dc \ 98 | 
--hash=sha256:6eafc048ea3f1b3c136c71a86db393be36b5b3d9c87b1c25204e7d397cee9536 \ 99 | --hash=sha256:830c88747dce8a3e7525defa68afd742b4580df6aa2fdd6f0855481e3994d391 \ 100 | --hash=sha256:86e92728ef3fc842c50a5cb1d5ba2bc66db7da08a7af53fb3da79e202d1b2cd3 \ 101 | --hash=sha256:8caf4d16b31961e964c62194ea3e26a0e9561cdf72eecb1781458b67ec83423d \ 102 | --hash=sha256:8d1a92d8e90b286d491e5626af53afef2ba04da33e82e30744795c71880eaa21 \ 103 | --hash=sha256:8f0a4d179c9a941eb80c3a63cdb495e539e064f8054230844dcf2fcb812b71d3 \ 104 | --hash=sha256:9232b09f5efee6a495a99ae6824881940d6447debe272ea400c02e3b68aad85d \ 105 | --hash=sha256:927a9dd016d6033bc12e0bf5dee1dde140235fc8d0d51099353c76081c03dc29 \ 106 | --hash=sha256:93e414e3206779ef41e5ff2448067213febf260ba747fc65389a3ddaa3fb8715 \ 107 | --hash=sha256:98cafc618614d72b02185ac583c6f7796202062c41d2eeecdf07820bad3295ed \ 108 | --hash=sha256:9c3a88d20e4fe4a2a4a84bf439a5ac9c9aba400b85244c63a1ab7088f85d9d25 \ 109 | --hash=sha256:9f36de4cd0c262dd9927886cc2305aa3f2210db437aa4fed3fb4940b8bf4592c \ 110 | --hash=sha256:a60f90bba4c37962cbf210f0188ecca87daafdf60271f4c6948606e4dabf8785 \ 111 | --hash=sha256:a614e4afed58c14254e67862456d212c4dcceebab2eaa44d627c2ca04bf86837 \ 112 | --hash=sha256:ae06c1e4bc60ee076292e582a7512f304abdf6c70db59b56745cca1684f875a4 \ 113 | --hash=sha256:b122a188cd292c4d2fcd78d04f863b789ef43aa129b233d7c9004de08693728b \ 114 | --hash=sha256:b570da8cd0012f4af9fa76a5635cd31f707473e65a5a335b186069d5c7121ff2 \ 115 | --hash=sha256:bcaa1c495ce623966d9fc8a187da80082334236a2a1c7e141763ffaf7a405067 \ 116 | --hash=sha256:bd34f6d1810d9354dc7e35158aa6cc33456be7706df4420819af6ed966e85448 \ 117 | --hash=sha256:be9eb06489bc975c38706902cbc6888f39e946b81383abc2838d186f0e8b6a9d \ 118 | --hash=sha256:c4b2e0559b68455c085fb0f6178e9752c4be3bba104d6e881eb5573b399d1eb2 \ 119 | --hash=sha256:c62e8dd9754b7debda0c5ba59d34509c4688f853588d75b53c3791983faa96fc \ 120 | --hash=sha256:c852b1530083a620cb0de5f3cd6826f19862bafeaf77586f1aef326e49d95f0c \ 121 | --hash=sha256:d9fc0bf3ff86c17348dfc5d322f627d78273eba545db865c3cd14b3f19e57fa5 \ 122 | --hash=sha256:dad7b164905d3e534883281c050180afcf1e230c3d4a54e8038aa5cfcf312b84 \ 123 | --hash=sha256:e5f66bdf0976ec667fc4594d2812a00b07ed14d1b44259d19a41ae3fff99f2b8 \ 124 | --hash=sha256:e8f0c9d65da595cfe91713bc1222af9ecabd37971762cb830dea2fc3b3bb2acf \ 125 | --hash=sha256:edffbe3c510d8f4bf8640e02ca019e48a9b72357318383ca60e3330c23aaffc7 \ 126 | --hash=sha256:eea5d6443b093e1545ad0210e6cf27f920482bfcf5c77cdc8596aec73523bb7e \ 127 | --hash=sha256:ef72013e20dd5ba86a8ae1aed7f56f31d3374189aa8b433e7b12ad182c0d2dfb \ 128 | --hash=sha256:f05251bbc2145349b8d0b77c0d4e5f3b228418807b1ee27cefb11f69ed3d233b \ 129 | --hash=sha256:f1be258c4d3dc609e654a1dc59d37b17d7fef05df912c01fc2e15eb43a9735f3 \ 130 | --hash=sha256:f9ced82717c7ec65a67667bb05865ffe38af0e835cdd78728f1209c8fffe0cad \ 131 | --hash=sha256:fe17d10b97fdf58155f858606bddb4e037b805a60ae023c009f760d8361a4eb8 \ 132 | --hash=sha256:fe749b052bb7233fe5d072fcb549221a8cb1a16725c47c37e42b0b9cb3ff2c3f 133 | multidict==5.1.0; python_version >= "3.6" \ 134 | --hash=sha256:b7993704f1a4b204e71debe6095150d43b2ee6150fa4f44d6d966ec356a8d61f \ 135 | --hash=sha256:9dd6e9b1a913d096ac95d0399bd737e00f2af1e1594a787e00f7975778c8b2bf \ 136 | --hash=sha256:f21756997ad8ef815d8ef3d34edd98804ab5ea337feedcd62fb52d22bf531281 \ 137 | --hash=sha256:1ab820665e67373de5802acae069a6a05567ae234ddb129f31d290fc3d1aa56d \ 138 | --hash=sha256:9436dc58c123f07b230383083855593550c4d301d2532045a17ccf6eca505f6d \ 139 | 
--hash=sha256:830f57206cc96ed0ccf68304141fec9481a096c4d2e2831f311bde1c404401da \ 140 | --hash=sha256:2e68965192c4ea61fff1b81c14ff712fc7dc15d2bd120602e4a3494ea6584224 \ 141 | --hash=sha256:2f1a132f1c88724674271d636e6b7351477c27722f2ed789f719f9e3545a3d26 \ 142 | --hash=sha256:3a4f32116f8f72ecf2a29dabfb27b23ab7cdc0ba807e8459e59a93a9be9506f6 \ 143 | --hash=sha256:46c73e09ad374a6d876c599f2328161bcd95e280f84d2060cf57991dec5cfe76 \ 144 | --hash=sha256:018132dbd8688c7a69ad89c4a3f39ea2f9f33302ebe567a879da8f4ca73f0d0a \ 145 | --hash=sha256:4b186eb7d6ae7c06eb4392411189469e6a820da81447f46c0072a41c748ab73f \ 146 | --hash=sha256:3a041b76d13706b7fff23b9fc83117c7b8fe8d5fe9e6be45eee72b9baa75f348 \ 147 | --hash=sha256:051012ccee979b2b06be928a6150d237aec75dd6bf2d1eeeb190baf2b05abc93 \ 148 | --hash=sha256:6a4d5ce640e37b0efcc8441caeea8f43a06addace2335bd11151bc02d2ee31f9 \ 149 | --hash=sha256:5cf3443199b83ed9e955f511b5b241fd3ae004e3cb81c58ec10f4fe47c7dce37 \ 150 | --hash=sha256:f200755768dc19c6f4e2b672421e0ebb3dd54c38d5a4f262b872d8cfcc9e93b5 \ 151 | --hash=sha256:05c20b68e512166fddba59a918773ba002fdd77800cad9f55b59790030bab632 \ 152 | --hash=sha256:54fd1e83a184e19c598d5e70ba508196fd0bbdd676ce159feb412a4a6664f952 \ 153 | --hash=sha256:0e3c84e6c67eba89c2dbcee08504ba8644ab4284863452450520dad8f1e89b79 \ 154 | --hash=sha256:dc862056f76443a0db4509116c5cd480fe1b6a2d45512a653f9a855cc0517456 \ 155 | --hash=sha256:0e929169f9c090dae0646a011c8b058e5e5fb391466016b39d21745b48817fd7 \ 156 | --hash=sha256:d81eddcb12d608cc08081fa88d046c78afb1bf8107e6feab5d43503fea74a635 \ 157 | --hash=sha256:585fd452dd7782130d112f7ddf3473ffdd521414674c33876187e101b588738a \ 158 | --hash=sha256:37e5438e1c78931df5d3c0c78ae049092877e5e9c02dd1ff5abb9cf27a5914ea \ 159 | --hash=sha256:07b42215124aedecc6083f1ce6b7e5ec5b50047afa701f3442054373a6deb656 \ 160 | --hash=sha256:929006d3c2d923788ba153ad0de8ed2e5ed39fdbe8e7be21e2f22ed06c6783d3 \ 161 | --hash=sha256:b797515be8743b771aa868f83563f789bbd4b236659ba52243b735d80b29ed93 \ 162 | --hash=sha256:d5c65bdf4484872c4af3150aeebe101ba560dcfb34488d9a8ff8dbcd21079647 \ 163 | --hash=sha256:b47a43177a5e65b771b80db71e7be76c0ba23cc8aa73eeeb089ed5219cdbe27d \ 164 | --hash=sha256:806068d4f86cb06af37cd65821554f98240a19ce646d3cd24e1c33587f313eb8 \ 165 | --hash=sha256:46dd362c2f045095c920162e9307de5ffd0a1bfbba0a6e990b344366f55a30c1 \ 166 | --hash=sha256:ace010325c787c378afd7f7c1ac66b26313b3344628652eacd149bdd23c68841 \ 167 | --hash=sha256:ecc771ab628ea281517e24fd2c52e8f31c41e66652d07599ad8818abaad38cda \ 168 | --hash=sha256:fc13a9524bc18b6fb6e0dbec3533ba0496bbed167c56d0aabefd965584557d80 \ 169 | --hash=sha256:7df80d07818b385f3129180369079bd6934cf70469f99daaebfac89dca288359 \ 170 | --hash=sha256:25b4e5f22d3a37ddf3effc0710ba692cfc792c2b9edfb9c05aefe823256e84d5 171 | packaging==20.9; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.6" \ 172 | --hash=sha256:67714da7f7bc052e064859c05c595155bd1ee9f69f76557e21f051443c20947a \ 173 | --hash=sha256:5b327ac1320dc863dca72f4514ecc086f31186744b84a230374cc1fd776feae5 174 | parse==1.19.0; python_full_version >= "3.6.0" \ 175 | --hash=sha256:9ff82852bcb65d139813e2a5197627a94966245c897796760a3a2a8eb66f020b 176 | parso==0.8.1; python_version >= "3.7" and python_version < "4.0" \ 177 | --hash=sha256:15b00182f472319383252c18d5913b69269590616c947747bc50bf4ac768f410 \ 178 | --hash=sha256:8519430ad07087d4c997fda3a7918f7cfa27cb58972a8c89c2a0295a1c940e9e 179 | pexpect==4.8.0; python_version >= "3.7" and python_version < "4.0" and 
sys_platform != "win32" \ 180 | --hash=sha256:0b48a55dcb3c05f3329815901ea4fc1537514d6ba867a152b581d69ae3710937 \ 181 | --hash=sha256:fc65a43959d153d0114afe13997d439c22823a27cefceb5ff35c2178c6784c0c 182 | pickleshare==0.7.5; python_version >= "3.7" and python_version < "4.0" \ 183 | --hash=sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56 \ 184 | --hash=sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca 185 | pluggy==0.13.1; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.6" \ 186 | --hash=sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d \ 187 | --hash=sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0 188 | prompt-toolkit==3.0.17; python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.6.1" \ 189 | --hash=sha256:4cea7d09e46723885cb8bc54678175453e5071e9449821dce6f017b1d1fbfc1a \ 190 | --hash=sha256:9397a7162cf45449147ad6042fa37983a081b8a73363a5253dd4072666333137 191 | ptyprocess==0.7.0; python_version >= "3.7" and python_version < "4.0" and sys_platform != "win32" \ 192 | --hash=sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35 \ 193 | --hash=sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220 194 | py==1.10.0; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.6" \ 195 | --hash=sha256:3b80836aa6d1feeaa108e046da6423ab8f6ceda6468545ae8d02d9d58d18818a \ 196 | --hash=sha256:21b81bda15b66ef5e1a777a21c4dcd9c20ad3efd0b3f817e7a809035269e1bd3 197 | pyee==8.1.0; python_version >= "3.5" and python_full_version >= "3.6.0" \ 198 | --hash=sha256:383973b63ad7ed5e3c0311f8b179c52981f9e7b3eaea0e9a830d13ec34dde65f \ 199 | --hash=sha256:92dacc5bd2bdb8f95aa8dd2585d47ca1c4840e2adb95ccf90034d64f725bfd31 200 | pygments==2.8.1; python_version >= "3.7" and python_version < "4.0" \ 201 | --hash=sha256:534ef71d539ae97d4c3a4cf7d6f110f214b0e687e92f9cb9d2a3b0d3101289c8 \ 202 | --hash=sha256:2656e1a6edcdabf4275f9a3640db59fd5de107d88e8663c5d4e9a0fa62f77f94 203 | pyparsing==2.4.7; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.4.0" and python_version >= "3.6" \ 204 | --hash=sha256:ef9d7589ef3c200abe66653d3f1ab1033c3c419ae9b9bdb1240a85b024efc88b \ 205 | --hash=sha256:c203ec8783bf771a155b207279b9bccb8dea02d8f0c9e5f8ead507bc3246ecc1 206 | pyppeteer==0.0.25; python_version >= "3.5" and python_full_version >= "3.6.0" \ 207 | --hash=sha256:51fe769b722a1718043b74d12c20420f29e0dd9eeea2b66652b7f93a9ad465dd 208 | pyquery==1.4.3; python_full_version >= "3.6.0" \ 209 | --hash=sha256:1fc33b7699455ed25c75282bc8f80ace1ac078b0dda5a933dacbd8b1c1f83963 \ 210 | --hash=sha256:a388eefb6bc4a55350de0316fbd97cda999ae669b6743ae5b99102ba54f5aa72 211 | pytest-vcr==1.0.2 \ 212 | --hash=sha256:23ee51b75abbcc43d926272773aae4f39f93aceb75ed56852d0bf618f92e1896 \ 213 | --hash=sha256:2f316e0539399bea0296e8b8401145c62b6f85e9066af7e57b6151481b0d6d9c 214 | pytest==6.2.2; python_version >= "3.6" \ 215 | --hash=sha256:b574b57423e818210672e07ca1fa90aaf194a4f63f3ab909a2c67ebb22913839 \ 216 | --hash=sha256:9d1edf9e7d0b84d72ea3dbcdfd22b35fb543a5e8f2a60092dd578936bf63d7f9 217 | python-dateutil==2.8.1; python_version >= "3.5" and python_full_version < "3.0.0" or python_full_version >= "3.3.0" and python_version >= "3.5" \ 218 | --hash=sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c \ 219 | 
--hash=sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a 220 | pytz==2021.1; python_version >= "3.5" \ 221 | --hash=sha256:eb10ce3e7736052ed3623d49975ce333bcd712c7bb19a58b9e2089d4057d0798 \ 222 | --hash=sha256:83a4a90894bf38e243cf052c8b58f381bfe9a7a483f6a9cab140bc7f702ac4da 223 | pyyaml==5.4.1; python_version >= "3.5" and python_full_version < "3.0.0" or python_full_version >= "3.6.0" and python_version >= "3.5" \ 224 | --hash=sha256:3b2b1824fe7112845700f815ff6a489360226a5609b96ec2190a45e62a9fc922 \ 225 | --hash=sha256:129def1b7c1bf22faffd67b8f3724645203b79d8f4cc81f674654d9902cb4393 \ 226 | --hash=sha256:4465124ef1b18d9ace298060f4eccc64b0850899ac4ac53294547536533800c8 \ 227 | --hash=sha256:bb4191dfc9306777bc594117aee052446b3fa88737cd13b7188d0e7aa8162185 \ 228 | --hash=sha256:6c78645d400265a062508ae399b60b8c167bf003db364ecb26dcab2bda048253 \ 229 | --hash=sha256:4e0583d24c881e14342eaf4ec5fbc97f934b999a6828693a99157fde912540cc \ 230 | --hash=sha256:3bd0e463264cf257d1ffd2e40223b197271046d09dadf73a0fe82b9c1fc385a5 \ 231 | --hash=sha256:e4fac90784481d221a8e4b1162afa7c47ed953be40d31ab4629ae917510051df \ 232 | --hash=sha256:5accb17103e43963b80e6f837831f38d314a0495500067cb25afab2e8d7a4018 \ 233 | --hash=sha256:e1d4970ea66be07ae37a3c2e48b5ec63f7ba6804bdddfdbd3cfd954d25a82e63 \ 234 | --hash=sha256:dd5de0646207f053eb0d6c74ae45ba98c3395a571a2891858e87df7c9b9bd51b \ 235 | --hash=sha256:08682f6b72c722394747bddaf0aa62277e02557c0fd1c42cb853016a38f8dedf \ 236 | --hash=sha256:d2d9808ea7b4af864f35ea216be506ecec180628aced0704e34aca0b040ffe46 \ 237 | --hash=sha256:8c1be557ee92a20f184922c7b6424e8ab6691788e6d86137c5d93c1a6ec1b8fb \ 238 | --hash=sha256:fa5ae20527d8e831e8230cbffd9f8fe952815b2b7dae6ffec25318803a7528fc \ 239 | --hash=sha256:0f5f5786c0e09baddcd8b4b45f20a7b5d61a7e7e99846e3c799b05c7c53fa696 \ 240 | --hash=sha256:294db365efa064d00b8d1ef65d8ea2c3426ac366c0c4368d930bf1c5fb497f77 \ 241 | --hash=sha256:74c1485f7707cf707a7aef42ef6322b8f97921bd89be2ab6317fd782c2d53183 \ 242 | --hash=sha256:49d4cdd9065b9b6e206d0595fee27a96b5dd22618e7520c33204a4a3239d5b10 \ 243 | --hash=sha256:c20cfa2d49991c8b4147af39859b167664f2ad4561704ee74c1de03318e898db \ 244 | --hash=sha256:607774cbba28732bfa802b54baa7484215f530991055bb562efbed5b2f20a45e 245 | regex==2021.3.17; python_version >= "3.5" \ 246 | --hash=sha256:b97ec5d299c10d96617cc851b2e0f81ba5d9d6248413cd374ef7f3a8871ee4a6 \ 247 | --hash=sha256:cb4ee827857a5ad9b8ae34d3c8cc51151cb4a3fe082c12ec20ec73e63cc7c6f0 \ 248 | --hash=sha256:633497504e2a485a70a3268d4fc403fe3063a50a50eed1039083e9471ad0101c \ 249 | --hash=sha256:a59a2ee329b3de764b21495d78c92ab00b4ea79acef0f7ae8c1067f773570afa \ 250 | --hash=sha256:f85d6f41e34f6a2d1607e312820971872944f1661a73d33e1e82d35ea3305e14 \ 251 | --hash=sha256:4651f839dbde0816798e698626af6a2469eee6d9964824bb5386091255a1694f \ 252 | --hash=sha256:39c44532d0e4f1639a89e52355b949573e1e2c5116106a395642cbbae0ff9bcd \ 253 | --hash=sha256:3d9a7e215e02bd7646a91fb8bcba30bc55fd42a719d6b35cf80e5bae31d9134e \ 254 | --hash=sha256:159fac1a4731409c830d32913f13f68346d6b8e39650ed5d704a9ce2f9ef9cb3 \ 255 | --hash=sha256:13f50969028e81765ed2a1c5fcfdc246c245cf8d47986d5172e82ab1a0c42ee5 \ 256 | --hash=sha256:b9d8d286c53fe0cbc6d20bf3d583cabcd1499d89034524e3b94c93a5ab85ca90 \ 257 | --hash=sha256:201e2619a77b21a7780580ab7b5ce43835e242d3e20fef50f66a8df0542e437f \ 258 | --hash=sha256:d47d359545b0ccad29d572ecd52c9da945de7cd6cf9c0cfcb0269f76d3555689 \ 259 | --hash=sha256:ea2f41445852c660ba7c3ebf7d70b3779b20d9ca8ba54485a17740db49f46932 \ 260 | 
--hash=sha256:486a5f8e11e1f5bbfcad87f7c7745eb14796642323e7e1829a331f87a713daaa \ 261 | --hash=sha256:18e25e0afe1cf0f62781a150c1454b2113785401ba285c745acf10c8ca8917df \ 262 | --hash=sha256:a2ee026f4156789df8644d23ef423e6194fad0bc53575534101bb1de5d67e8ce \ 263 | --hash=sha256:4c0788010a93ace8a174d73e7c6c9d3e6e3b7ad99a453c8ee8c975ddd9965643 \ 264 | --hash=sha256:575a832e09d237ae5fedb825a7a5bc6a116090dd57d6417d4f3b75121c73e3be \ 265 | --hash=sha256:8e65e3e4c6feadf6770e2ad89ad3deb524bcb03d8dc679f381d0568c024e0deb \ 266 | --hash=sha256:a0df9a0ad2aad49ea3c7f65edd2ffb3d5c59589b85992a6006354f6fb109bb18 \ 267 | --hash=sha256:b98bc9db003f1079caf07b610377ed1ac2e2c11acc2bea4892e28cc5b509d8d5 \ 268 | --hash=sha256:808404898e9a765e4058bf3d7607d0629000e0a14a6782ccbb089296b76fa8fe \ 269 | --hash=sha256:5770a51180d85ea468234bc7987f5597803a4c3d7463e7323322fe4a1b181578 \ 270 | --hash=sha256:976a54d44fd043d958a69b18705a910a8376196c6b6ee5f2596ffc11bff4420d \ 271 | --hash=sha256:63f3ca8451e5ff7133ffbec9eda641aeab2001be1a01878990f6c87e3c44b9d5 \ 272 | --hash=sha256:bcd945175c29a672f13fce13a11893556cd440e37c1b643d6eeab1988c8b209c \ 273 | --hash=sha256:3d9356add82cff75413bec360c1eca3e58db4a9f5dafa1f19650958a81e3249d \ 274 | --hash=sha256:f5d0c921c99297354cecc5a416ee4280bd3f20fd81b9fb671ca6be71499c3fdf \ 275 | --hash=sha256:14de88eda0976020528efc92d0a1f8830e2fb0de2ae6005a6fc4e062553031fa \ 276 | --hash=sha256:4c2e364491406b7888c2ad4428245fc56c327e34a5dfe58fd40df272b3c3dab3 \ 277 | --hash=sha256:8bd4f91f3fb1c9b1380d6894bd5b4a519409135bec14c0c80151e58394a4e88a \ 278 | --hash=sha256:882f53afe31ef0425b405a3f601c0009b44206ea7f55ee1c606aad3cc213a52c \ 279 | --hash=sha256:07ef35301b4484bce843831e7039a84e19d8d33b3f8b2f9aab86c376813d0139 \ 280 | --hash=sha256:360a01b5fa2ad35b3113ae0c07fb544ad180603fa3b1f074f52d98c1096fa15e \ 281 | --hash=sha256:709f65bb2fa9825f09892617d01246002097f8f9b6dde8d1bb4083cf554701ba \ 282 | --hash=sha256:c66221e947d7207457f8b6f42b12f613b09efa9669f65a587a2a71f6a0e4d106 \ 283 | --hash=sha256:c782da0e45aff131f0bed6e66fbcfa589ff2862fc719b83a88640daa01a5aff7 \ 284 | --hash=sha256:dc9963aacb7da5177e40874585d7407c0f93fb9d7518ec58b86e562f633f36cd \ 285 | --hash=sha256:a0d04128e005142260de3733591ddf476e4902c0c23c1af237d9acf3c96e1b38 \ 286 | --hash=sha256:4b8a1fb724904139149a43e172850f35aa6ea97fb0545244dc0b805e0154ed68 287 | requests-html==0.10.0; python_full_version >= "3.6.0" \ 288 | --hash=sha256:7e929ecfed95fb1d0994bb368295d6d7c4d06b03fcb900c33d7d0b17e6003947 \ 289 | --hash=sha256:cb8a78cf829c4eca9d6233f28524f65dd2bfaafb4bdbbc407f0a0b8f487df6e2 290 | requests==2.25.1; python_full_version >= "3.6.0" \ 291 | --hash=sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e \ 292 | --hash=sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804 293 | six==1.15.0; python_full_version >= "3.6.0" and python_version >= "3.5" and (python_version >= "3.5" and python_full_version < "3.0.0" or python_full_version >= "3.3.0" and python_version >= "3.5") \ 294 | --hash=sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced \ 295 | --hash=sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259 296 | soupsieve==2.2.1; python_version >= "3.6" and python_full_version >= "3.6.0" \ 297 | --hash=sha256:c2c1c2d44f158cdbddab7824a9af8c4f83c76b1e23e049479aa432feb6c4c23b \ 298 | --hash=sha256:052774848f448cf19c7e959adf5566904d525f33a3f8b6ba6f6f8f26ec7de0cc 299 | toml==0.10.2; python_version >= "3.6" and python_full_version < "3.0.0" or python_full_version >= "3.3.0" and 
python_version >= "3.6" \ 300 | --hash=sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b \ 301 | --hash=sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f 302 | tqdm==4.59.0; python_version >= "3.5" and python_full_version >= "3.6.0" \ 303 | --hash=sha256:9fdf349068d047d4cfbe24862c425883af1db29bcddf4b0eeb2524f6fbdb23c7 \ 304 | --hash=sha256:d666ae29164da3e517fcf125e41d4fe96e5bb375cd87ff9763f6b38b5592fe33 305 | traitlets==5.0.5; python_version >= "3.7" and python_version < "4.0" \ 306 | --hash=sha256:69ff3f9d5351f31a7ad80443c2674b7099df13cc41fc5fa6e2f6d3b0330b0426 \ 307 | --hash=sha256:178f4ce988f69189f7e523337a3e11d91c786ded9360174a3d9ca83e79bc5396 308 | typing-extensions==3.7.4.3; python_version < "3.8" and python_version >= "3.6" \ 309 | --hash=sha256:dafc7639cde7f1b6e1acc0f457842a83e722ccca8eef5270af2d74792619a89f \ 310 | --hash=sha256:7cb407020f00f7bfc3cb3e7881628838e69d8f3fcab2f64742a5e76b2f841918 \ 311 | --hash=sha256:99d4073b617d30288f569d3f13d2bd7548c3a7e4c8de87db09a9d29bb3a4a60c 312 | tzlocal==2.1; python_version >= "3.5" \ 313 | --hash=sha256:e2cb6c6b5b604af38597403e9852872d7f534962ae2954c7f35efcb1ccacf4a4 \ 314 | --hash=sha256:643c97c5294aedc737780a49d9df30889321cbe1204eac2c2ec6134035a92e44 315 | urllib3==1.26.4; python_full_version >= "3.6.0" and python_version < "4" and python_version >= "3.5" \ 316 | --hash=sha256:2f4da4594db7e1e110a944bb1b551fdf4e6c136ad42e4234131391e21eb5b0df \ 317 | --hash=sha256:e7b021f7241115872f92f43c6508082facffbd1c048e3c6e2bb9c2a157e28937 318 | vcrpy==4.1.1; python_version >= "3.5" \ 319 | --hash=sha256:12c3fcdae7b88ecf11fc0d3e6d77586549d4575a2ceee18e82eee75c1f626162 \ 320 | --hash=sha256:57095bf22fc0a2d99ee9674cdafebed0f3ba763018582450706f7d3a74fff599 321 | w3lib==1.22.0; python_full_version >= "3.6.0" \ 322 | --hash=sha256:0161d55537063e00d95a241663ede3395c4c6d7b777972ba2fd58bbab2001e53 \ 323 | --hash=sha256:0ad6d0203157d61149fd45aaed2e24f53902989c32fc1dccc2e2bfba371560df 324 | wcwidth==0.2.5; python_version >= "3.7" and python_version < "4.0" and python_full_version >= "3.6.1" \ 325 | --hash=sha256:beb4802a9cebb9144e99086eff703a642a13d6a0052920003a230f3294bbe784 \ 326 | --hash=sha256:c4d647b99872929fdb7bdcaa4fbe7f01413ed3d98077df798530e5b04f116c83 327 | websockets==9.1; python_version >= "3.6" and python_full_version >= "3.6.0" \ 328 | --hash=sha256:0dd4eb8e0bbf365d6f652711ce21b8fd2b596f873d32aabb0fbb53ec604418cc \ 329 | --hash=sha256:1d0971cc7251aeff955aa742ec541ee8aaea4bb2ebf0245748fbec62f744a37e \ 330 | --hash=sha256:1d6b4fddb12ab9adf87b843cd4316c4bd602db8d5efd2fb83147f0458fe85135 \ 331 | --hash=sha256:230a3506df6b5f446fed2398e58dcaafdff12d67fe1397dff196411a9e820d02 \ 332 | --hash=sha256:276d2339ebf0df4f45df453923ebd2270b87900eda5dfd4a6b0cfa15f82111c3 \ 333 | --hash=sha256:2cf04601633a4ec176b9cc3d3e73789c037641001dbfaf7c411f89cd3e04fcaf \ 334 | --hash=sha256:3ddff38894c7857c476feb3538dd847514379d6dc844961dc99f04b0384b1b1b \ 335 | --hash=sha256:48c222feb3ced18f3dc61168ca18952a22fb88e5eb8902d2bf1b50faefdc34a2 \ 336 | --hash=sha256:51d04df04ed9d08077d10ccbe21e6805791b78eac49d16d30a1f1fe2e44ba0af \ 337 | --hash=sha256:597c28f3aa7a09e8c070a86b03107094ee5cdafcc0d55f2f2eac92faac8dc67d \ 338 | --hash=sha256:5c8f0d82ea2468282e08b0cf5307f3ad022290ed50c45d5cb7767957ca782880 \ 339 | --hash=sha256:7189e51955f9268b2bdd6cc537e0faa06f8fffda7fb386e5922c6391de51b077 \ 340 | --hash=sha256:7df3596838b2a0c07c6f6d67752c53859a54993d4f062689fdf547cb56d0f84f \ 341 | 
--hash=sha256:826ccf85d4514609219725ba4a7abd569228c2c9f1968e8be05be366f68291ec \ 342 | --hash=sha256:836d14eb53b500fd92bd5db2fc5894f7c72b634f9c2a28f546f75967503d8e25 \ 343 | --hash=sha256:85db8090ba94e22d964498a47fdd933b8875a1add6ebc514c7ac8703eb97bbf0 \ 344 | --hash=sha256:85e701a6c316b7067f1e8675c638036a796fe5116783a4c932e7eb8e305a3ffe \ 345 | --hash=sha256:900589e19200be76dd7cbaa95e9771605b5ce3f62512d039fb3bc5da9014912a \ 346 | --hash=sha256:9147868bb0cc01e6846606cd65cbf9c58598f187b96d14dd1ca17338b08793bb \ 347 | --hash=sha256:9e7fdc775fe7403dbd8bc883ba59576a6232eac96dacb56512daacf7af5d618d \ 348 | --hash=sha256:ab5ee15d3462198c794c49ccd31773d8a2b8c17d622aa184f669d2b98c2f0857 \ 349 | --hash=sha256:ad893d889bc700a5835e0a95a3e4f2c39e91577ab232a3dc03c262a0f8fc4b5c \ 350 | --hash=sha256:b2e71c4670ebe1067fa8632f0d081e47254ee2d3d409de54168b43b0ba9147e0 \ 351 | --hash=sha256:b43b13e5622c5a53ab12f3272e6f42f1ce37cd5b6684b2676cb365403295cd40 \ 352 | --hash=sha256:b4ad84b156cf50529b8ac5cc1638c2cf8680490e3fccb6121316c8c02620a2e4 \ 353 | --hash=sha256:be5fd35e99970518547edc906efab29afd392319f020c3c58b0e1a158e16ed20 \ 354 | --hash=sha256:caa68c95bc1776d3521f81eeb4d5b9438be92514ec2a79fececda814099c8314 \ 355 | --hash=sha256:d144b350045c53c8ff09aa1cfa955012dd32f00c7e0862c199edcabb1a8b32da \ 356 | --hash=sha256:d2c2d9b24d3c65b5a02cac12cbb4e4194e590314519ed49db2f67ef561c3cf58 \ 357 | --hash=sha256:e9e5fd6dbdf95d99bc03732ded1fc8ef22ebbc05999ac7e0c7bf57fe6e4e5ae2 \ 358 | --hash=sha256:ebf459a1c069f9866d8569439c06193c586e72c9330db1390af7c6a0a32c4afd \ 359 | --hash=sha256:f31722f1c033c198aa4a39a01905951c00bd1c74f922e8afc1b1c62adbcdd56a \ 360 | --hash=sha256:f68c352a68e5fdf1e97288d5cec9296664c590c25932a8476224124aaf90dbcd 361 | wrapt==1.12.1; python_version >= "3.5" \ 362 | --hash=sha256:b62ffa81fb85f4332a4f609cab4ac40709470da05643a082ec1eb88e6d9b97d7 363 | yarl==1.6.3; python_version >= "3.6" \ 364 | --hash=sha256:0355a701b3998dcd832d0dc47cc5dedf3874f966ac7f870e0f3a6788d802d434 \ 365 | --hash=sha256:bafb450deef6861815ed579c7a6113a879a6ef58aed4c3a4be54400ae8871478 \ 366 | --hash=sha256:547f7665ad50fa8563150ed079f8e805e63dd85def6674c97efd78eed6c224a6 \ 367 | --hash=sha256:63f90b20ca654b3ecc7a8d62c03ffa46999595f0167d6450fa8383bab252987e \ 368 | --hash=sha256:97b5bdc450d63c3ba30a127d018b866ea94e65655efaf889ebeabc20f7d12406 \ 369 | --hash=sha256:d8d07d102f17b68966e2de0e07bfd6e139c7c02ef06d3a0f8d2f0f055e13bb76 \ 370 | --hash=sha256:15263c3b0b47968c1d90daa89f21fcc889bb4b1aac5555580d74565de6836366 \ 371 | --hash=sha256:b5dfc9a40c198334f4f3f55880ecf910adebdcb2a0b9a9c23c9345faa9185721 \ 372 | --hash=sha256:b2e9a456c121e26d13c29251f8267541bd75e6a1ccf9e859179701c36a078643 \ 373 | --hash=sha256:ce3beb46a72d9f2190f9e1027886bfc513702d748047b548b05dab7dfb584d2e \ 374 | --hash=sha256:2ce4c621d21326a4a5500c25031e102af589edb50c09b321049e388b3934eec3 \ 375 | --hash=sha256:d26608cf178efb8faa5ff0f2d2e77c208f471c5a3709e577a7b3fd0445703ac8 \ 376 | --hash=sha256:4c5bcfc3ed226bf6419f7a33982fb4b8ec2e45785a0561eb99274ebbf09fdd6a \ 377 | --hash=sha256:4736eaee5626db8d9cda9eb5282028cc834e2aeb194e0d8b50217d707e98bb5c \ 378 | --hash=sha256:68dc568889b1c13f1e4745c96b931cc94fdd0defe92a72c2b8ce01091b22e35f \ 379 | --hash=sha256:7356644cbed76119d0b6bd32ffba704d30d747e0c217109d7979a7bc36c4d970 \ 380 | --hash=sha256:00d7ad91b6583602eb9c1d085a2cf281ada267e9a197e8b7cae487dadbfa293e \ 381 | --hash=sha256:69ee97c71fee1f63d04c945f56d5d726483c4762845400a6795a3b75d56b6c50 \ 382 | 
--hash=sha256:e46fba844f4895b36f4c398c5af062a9808d1f26b2999c58909517384d5deda2 \ 383 | --hash=sha256:31ede6e8c4329fb81c86706ba8f6bf661a924b53ba191b27aa5fcee5714d18ec \ 384 | --hash=sha256:fcbb48a93e8699eae920f8d92f7160c03567b421bc17362a9ffbbd706a816f71 \ 385 | --hash=sha256:72a660bdd24497e3e84f5519e57a9ee9220b6f3ac4d45056961bf22838ce20cc \ 386 | --hash=sha256:324ba3d3c6fee56e2e0b0d09bf5c73824b9f08234339d2b788af65e60040c959 \ 387 | --hash=sha256:e6b5460dc5ad42ad2b36cca524491dfcaffbfd9c8df50508bddc354e787b8dc2 \ 388 | --hash=sha256:6d6283d8e0631b617edf0fd726353cb76630b83a089a40933043894e7f6721e2 \ 389 | --hash=sha256:9ede61b0854e267fd565e7527e2f2eb3ef8858b301319be0604177690e1a3896 \ 390 | --hash=sha256:f0b059678fd549c66b89bed03efcabb009075bd131c248ecdf087bdb6faba24a \ 391 | --hash=sha256:329412812ecfc94a57cd37c9d547579510a9e83c516bc069470db5f75684629e \ 392 | --hash=sha256:c49ff66d479d38ab863c50f7bb27dee97c6627c5fe60697de15529da9c3de724 \ 393 | --hash=sha256:f040bcc6725c821a4c0665f3aa96a4d0805a7aaf2caf266d256b8ed71b9f041c \ 394 | --hash=sha256:d5c32c82990e4ac4d8150fd7652b972216b204de4e83a122546dce571c1bdf25 \ 395 | --hash=sha256:d597767fcd2c3dc49d6eea360c458b65643d1e4dbed91361cf5e36e53c1f8c96 \ 396 | --hash=sha256:8aa3decd5e0e852dc68335abf5478a518b41bf2ab2f330fe44916399efedfae0 \ 397 | --hash=sha256:73494d5b71099ae8cb8754f1df131c11d433b387efab7b51849e7e1e851f07a4 \ 398 | --hash=sha256:5b883e458058f8d6099e4420f0cc2567989032b5f34b271c0827de9f1079a424 \ 399 | --hash=sha256:4953fb0b4fdb7e08b2f3b3be80a00d28c5c8a2056bb066169de00e6501b986b6 \ 400 | --hash=sha256:8a9066529240171b68893d60dca86a763eae2139dd42f42106b03cf4b426bf10 401 | zipp==3.4.1; python_version < "3.8" and python_version >= "3.6" \ 402 | --hash=sha256:51cb66cc54621609dd593d1787f286ee42a5c0adbb4b29abea5a63edc3e03098 \ 403 | --hash=sha256:3607921face881ba3e026887d8150cca609d517579abe052ac81fc5aeffdbd76 404 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | appdirs==1.4.4; python_version >= "3.5" and python_full_version >= "3.6.0" \ 2 | --hash=sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128 \ 3 | --hash=sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41 4 | beautifulsoup4==4.9.3; python_full_version >= "3.6.0" \ 5 | --hash=sha256:4c98143716ef1cb40bf7f39a8e3eec8f8b009509e74904ba3a7b315431577e35 \ 6 | --hash=sha256:fff47e031e34ec82bf17e00da8f592fe7de69aeea38be00523c04623c04fb666 \ 7 | --hash=sha256:84729e322ad1d5b4d25f805bfa05b902dd96450f43842c4e99067d5e1369eb25 8 | bs4==0.0.1; python_full_version >= "3.6.0" \ 9 | --hash=sha256:36ecea1fd7cc5c0c6e4a1ff075df26d50da647b75376626cc186e2212886dd3a 10 | certifi==2020.12.5; python_full_version >= "3.6.0" \ 11 | --hash=sha256:719a74fb9e33b9bd44cc7f3a8d94bc35e4049deebe19ba7d8e108280cfd59830 \ 12 | --hash=sha256:1a4995114262bffbc2413b159f2a1a480c969de6e6eb13ee966d470af86af59c 13 | chardet==4.0.0; python_full_version >= "3.6.0" \ 14 | --hash=sha256:f864054d66fd9118f2e67044ac8981a54775ec5b67aed0441892edb553d21da5 \ 15 | --hash=sha256:0d6f53a15db4120f2b08c94f11e7d93d2c911ee118b6b30a04ec3ee8310179fa 16 | cssselect==1.1.0; python_full_version >= "3.6.0" \ 17 | --hash=sha256:f612ee47b749c877ebae5bb77035d8f4202c6ad0f0fc1271b3c18ad6c4468ecf \ 18 | --hash=sha256:f95f8dedd925fd8f54edb3d2dfb44c190d9d18512377d3c1e2388d16126879bc 19 | dateparser==1.0.0; python_version >= "3.5" \ 20 | 
--hash=sha256:17202df32c7a36e773136ff353aa3767e987f8b3e27374c39fd21a30a803d6f8 \ 21 | --hash=sha256:159cc4e01a593706a15cd4e269a0b3345edf3aef8bf9278a57dac8adf5bf1e4a 22 | demjson3==3.0.5 \ 23 | --hash=sha256:ab9aabdd85695f3684fc296f39766a2730f6c8de81d23f7048073dfe2f616d80 24 | fake-useragent==0.1.11; python_full_version >= "3.6.0" \ 25 | --hash=sha256:c104998b750eb097eefc28ae28e92d66397598d2cf41a31aa45d5559ef1adf35 26 | idna==2.10; python_full_version >= "3.6.0" \ 27 | --hash=sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0 \ 28 | --hash=sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6 29 | lxml==4.9.1; python_full_version >= "3.6.0" \ 30 | --hash=sha256:04da965dfebb5dac2619cb90fcf93efdb35b3c6994fea58a157a834f2f94b318 \ 31 | --hash=sha256:0538747a9d7827ce3e16a8fdd201a99e661c7dee3c96c885d8ecba3c35d1032c \ 32 | --hash=sha256:0645e934e940107e2fdbe7c5b6fb8ec6232444260752598bc4d09511bd056c0b \ 33 | --hash=sha256:079b68f197c796e42aa80b1f739f058dcee796dc725cc9a1be0cdb08fc45b000 \ 34 | --hash=sha256:0f3f0059891d3254c7b5fb935330d6db38d6519ecd238ca4fce93c234b4a0f73 \ 35 | --hash=sha256:10d2017f9150248563bb579cd0d07c61c58da85c922b780060dcc9a3aa9f432d \ 36 | --hash=sha256:1355755b62c28950f9ce123c7a41460ed9743c699905cbe664a5bcc5c9c7c7fb \ 37 | --hash=sha256:13c90064b224e10c14dcdf8086688d3f0e612db53766e7478d7754703295c7c8 \ 38 | --hash=sha256:1423631e3d51008871299525b541413c9b6c6423593e89f9c4cfbe8460afc0a2 \ 39 | --hash=sha256:1436cf0063bba7888e43f1ba8d58824f085410ea2025befe81150aceb123e345 \ 40 | --hash=sha256:1a7c59c6ffd6ef5db362b798f350e24ab2cfa5700d53ac6681918f314a4d3b94 \ 41 | --hash=sha256:1e1cf47774373777936c5aabad489fef7b1c087dcd1f426b621fda9dcc12994e \ 42 | --hash=sha256:206a51077773c6c5d2ce1991327cda719063a47adc02bd703c56a662cdb6c58b \ 43 | --hash=sha256:21fb3d24ab430fc538a96e9fbb9b150029914805d551deeac7d7822f64631dfc \ 44 | --hash=sha256:27e590352c76156f50f538dbcebd1925317a0f70540f7dc8c97d2931c595783a \ 45 | --hash=sha256:287605bede6bd36e930577c5925fcea17cb30453d96a7b4c63c14a257118dbb9 \ 46 | --hash=sha256:2aaf6a0a6465d39b5ca69688fce82d20088c1838534982996ec46633dc7ad6cc \ 47 | --hash=sha256:32a73c53783becdb7eaf75a2a1525ea8e49379fb7248c3eeefb9412123536387 \ 48 | --hash=sha256:41fb58868b816c202e8881fd0f179a4644ce6e7cbbb248ef0283a34b73ec73bb \ 49 | --hash=sha256:4780677767dd52b99f0af1f123bc2c22873d30b474aa0e2fc3fe5e02217687c7 \ 50 | --hash=sha256:4878e667ebabe9b65e785ac8da4d48886fe81193a84bbe49f12acff8f7a383a4 \ 51 | --hash=sha256:487c8e61d7acc50b8be82bda8c8d21d20e133c3cbf41bd8ad7eb1aaeb3f07c97 \ 52 | --hash=sha256:4beea0f31491bc086991b97517b9683e5cfb369205dac0148ef685ac12a20a67 \ 53 | --hash=sha256:4cfbe42c686f33944e12f45a27d25a492cc0e43e1dc1da5d6a87cbcaf2e95627 \ 54 | --hash=sha256:4d5bae0a37af799207140652a700f21a85946f107a199bcb06720b13a4f1f0b7 \ 55 | --hash=sha256:4e285b5f2bf321fc0857b491b5028c5f276ec0c873b985d58d7748ece1d770dd \ 56 | --hash=sha256:57e4d637258703d14171b54203fd6822fda218c6c2658a7d30816b10995f29f3 \ 57 | --hash=sha256:5974895115737a74a00b321e339b9c3f45c20275d226398ae79ac008d908bff7 \ 58 | --hash=sha256:5ef87fca280fb15342726bd5f980f6faf8b84a5287fcc2d4962ea8af88b35130 \ 59 | --hash=sha256:603a464c2e67d8a546ddaa206d98e3246e5db05594b97db844c2f0a1af37cf5b \ 60 | --hash=sha256:6653071f4f9bac46fbc30f3c7838b0e9063ee335908c5d61fb7a4a86c8fd2036 \ 61 | --hash=sha256:6ca2264f341dd81e41f3fffecec6e446aa2121e0b8d026fb5130e02de1402785 \ 62 | --hash=sha256:6d279033bf614953c3fc4a0aa9ac33a21e8044ca72d4fa8b9273fe75359d5cca \ 63 | 
--hash=sha256:6d949f53ad4fc7cf02c44d6678e7ff05ec5f5552b235b9e136bd52e9bf730b91 \ 64 | --hash=sha256:6daa662aba22ef3258934105be2dd9afa5bb45748f4f702a3b39a5bf53a1f4dc \ 65 | --hash=sha256:6eafc048ea3f1b3c136c71a86db393be36b5b3d9c87b1c25204e7d397cee9536 \ 66 | --hash=sha256:830c88747dce8a3e7525defa68afd742b4580df6aa2fdd6f0855481e3994d391 \ 67 | --hash=sha256:86e92728ef3fc842c50a5cb1d5ba2bc66db7da08a7af53fb3da79e202d1b2cd3 \ 68 | --hash=sha256:8caf4d16b31961e964c62194ea3e26a0e9561cdf72eecb1781458b67ec83423d \ 69 | --hash=sha256:8d1a92d8e90b286d491e5626af53afef2ba04da33e82e30744795c71880eaa21 \ 70 | --hash=sha256:8f0a4d179c9a941eb80c3a63cdb495e539e064f8054230844dcf2fcb812b71d3 \ 71 | --hash=sha256:9232b09f5efee6a495a99ae6824881940d6447debe272ea400c02e3b68aad85d \ 72 | --hash=sha256:927a9dd016d6033bc12e0bf5dee1dde140235fc8d0d51099353c76081c03dc29 \ 73 | --hash=sha256:93e414e3206779ef41e5ff2448067213febf260ba747fc65389a3ddaa3fb8715 \ 74 | --hash=sha256:98cafc618614d72b02185ac583c6f7796202062c41d2eeecdf07820bad3295ed \ 75 | --hash=sha256:9c3a88d20e4fe4a2a4a84bf439a5ac9c9aba400b85244c63a1ab7088f85d9d25 \ 76 | --hash=sha256:9f36de4cd0c262dd9927886cc2305aa3f2210db437aa4fed3fb4940b8bf4592c \ 77 | --hash=sha256:a60f90bba4c37962cbf210f0188ecca87daafdf60271f4c6948606e4dabf8785 \ 78 | --hash=sha256:a614e4afed58c14254e67862456d212c4dcceebab2eaa44d627c2ca04bf86837 \ 79 | --hash=sha256:ae06c1e4bc60ee076292e582a7512f304abdf6c70db59b56745cca1684f875a4 \ 80 | --hash=sha256:b122a188cd292c4d2fcd78d04f863b789ef43aa129b233d7c9004de08693728b \ 81 | --hash=sha256:b570da8cd0012f4af9fa76a5635cd31f707473e65a5a335b186069d5c7121ff2 \ 82 | --hash=sha256:bcaa1c495ce623966d9fc8a187da80082334236a2a1c7e141763ffaf7a405067 \ 83 | --hash=sha256:bd34f6d1810d9354dc7e35158aa6cc33456be7706df4420819af6ed966e85448 \ 84 | --hash=sha256:be9eb06489bc975c38706902cbc6888f39e946b81383abc2838d186f0e8b6a9d \ 85 | --hash=sha256:c4b2e0559b68455c085fb0f6178e9752c4be3bba104d6e881eb5573b399d1eb2 \ 86 | --hash=sha256:c62e8dd9754b7debda0c5ba59d34509c4688f853588d75b53c3791983faa96fc \ 87 | --hash=sha256:c852b1530083a620cb0de5f3cd6826f19862bafeaf77586f1aef326e49d95f0c \ 88 | --hash=sha256:d9fc0bf3ff86c17348dfc5d322f627d78273eba545db865c3cd14b3f19e57fa5 \ 89 | --hash=sha256:dad7b164905d3e534883281c050180afcf1e230c3d4a54e8038aa5cfcf312b84 \ 90 | --hash=sha256:e5f66bdf0976ec667fc4594d2812a00b07ed14d1b44259d19a41ae3fff99f2b8 \ 91 | --hash=sha256:e8f0c9d65da595cfe91713bc1222af9ecabd37971762cb830dea2fc3b3bb2acf \ 92 | --hash=sha256:edffbe3c510d8f4bf8640e02ca019e48a9b72357318383ca60e3330c23aaffc7 \ 93 | --hash=sha256:eea5d6443b093e1545ad0210e6cf27f920482bfcf5c77cdc8596aec73523bb7e \ 94 | --hash=sha256:ef72013e20dd5ba86a8ae1aed7f56f31d3374189aa8b433e7b12ad182c0d2dfb \ 95 | --hash=sha256:f05251bbc2145349b8d0b77c0d4e5f3b228418807b1ee27cefb11f69ed3d233b \ 96 | --hash=sha256:f1be258c4d3dc609e654a1dc59d37b17d7fef05df912c01fc2e15eb43a9735f3 \ 97 | --hash=sha256:f9ced82717c7ec65a67667bb05865ffe38af0e835cdd78728f1209c8fffe0cad \ 98 | --hash=sha256:fe17d10b97fdf58155f858606bddb4e037b805a60ae023c009f760d8361a4eb8 \ 99 | --hash=sha256:fe749b052bb7233fe5d072fcb549221a8cb1a16725c47c37e42b0b9cb3ff2c3f 100 | parse==1.19.0; python_full_version >= "3.6.0" \ 101 | --hash=sha256:9ff82852bcb65d139813e2a5197627a94966245c897796760a3a2a8eb66f020b 102 | pyee==8.1.0; python_version >= "3.5" and python_full_version >= "3.6.0" \ 103 | --hash=sha256:383973b63ad7ed5e3c0311f8b179c52981f9e7b3eaea0e9a830d13ec34dde65f \ 104 | 
--hash=sha256:92dacc5bd2bdb8f95aa8dd2585d47ca1c4840e2adb95ccf90034d64f725bfd31 105 | pyppeteer==0.0.25; python_version >= "3.5" and python_full_version >= "3.6.0" \ 106 | --hash=sha256:51fe769b722a1718043b74d12c20420f29e0dd9eeea2b66652b7f93a9ad465dd 107 | pyquery==1.4.3; python_full_version >= "3.6.0" \ 108 | --hash=sha256:1fc33b7699455ed25c75282bc8f80ace1ac078b0dda5a933dacbd8b1c1f83963 \ 109 | --hash=sha256:a388eefb6bc4a55350de0316fbd97cda999ae669b6743ae5b99102ba54f5aa72 110 | python-dateutil==2.8.1; python_version >= "3.5" and python_full_version < "3.0.0" or python_full_version >= "3.3.0" and python_version >= "3.5" \ 111 | --hash=sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c \ 112 | --hash=sha256:75bb3f31ea686f1197762692a9ee6a7550b59fc6ca3a1f4b5d7e32fb98e2da2a 113 | pytz==2021.1; python_version >= "3.5" \ 114 | --hash=sha256:eb10ce3e7736052ed3623d49975ce333bcd712c7bb19a58b9e2089d4057d0798 \ 115 | --hash=sha256:83a4a90894bf38e243cf052c8b58f381bfe9a7a483f6a9cab140bc7f702ac4da 116 | regex==2021.3.17; python_version >= "3.5" \ 117 | --hash=sha256:b97ec5d299c10d96617cc851b2e0f81ba5d9d6248413cd374ef7f3a8871ee4a6 \ 118 | --hash=sha256:cb4ee827857a5ad9b8ae34d3c8cc51151cb4a3fe082c12ec20ec73e63cc7c6f0 \ 119 | --hash=sha256:633497504e2a485a70a3268d4fc403fe3063a50a50eed1039083e9471ad0101c \ 120 | --hash=sha256:a59a2ee329b3de764b21495d78c92ab00b4ea79acef0f7ae8c1067f773570afa \ 121 | --hash=sha256:f85d6f41e34f6a2d1607e312820971872944f1661a73d33e1e82d35ea3305e14 \ 122 | --hash=sha256:4651f839dbde0816798e698626af6a2469eee6d9964824bb5386091255a1694f \ 123 | --hash=sha256:39c44532d0e4f1639a89e52355b949573e1e2c5116106a395642cbbae0ff9bcd \ 124 | --hash=sha256:3d9a7e215e02bd7646a91fb8bcba30bc55fd42a719d6b35cf80e5bae31d9134e \ 125 | --hash=sha256:159fac1a4731409c830d32913f13f68346d6b8e39650ed5d704a9ce2f9ef9cb3 \ 126 | --hash=sha256:13f50969028e81765ed2a1c5fcfdc246c245cf8d47986d5172e82ab1a0c42ee5 \ 127 | --hash=sha256:b9d8d286c53fe0cbc6d20bf3d583cabcd1499d89034524e3b94c93a5ab85ca90 \ 128 | --hash=sha256:201e2619a77b21a7780580ab7b5ce43835e242d3e20fef50f66a8df0542e437f \ 129 | --hash=sha256:d47d359545b0ccad29d572ecd52c9da945de7cd6cf9c0cfcb0269f76d3555689 \ 130 | --hash=sha256:ea2f41445852c660ba7c3ebf7d70b3779b20d9ca8ba54485a17740db49f46932 \ 131 | --hash=sha256:486a5f8e11e1f5bbfcad87f7c7745eb14796642323e7e1829a331f87a713daaa \ 132 | --hash=sha256:18e25e0afe1cf0f62781a150c1454b2113785401ba285c745acf10c8ca8917df \ 133 | --hash=sha256:a2ee026f4156789df8644d23ef423e6194fad0bc53575534101bb1de5d67e8ce \ 134 | --hash=sha256:4c0788010a93ace8a174d73e7c6c9d3e6e3b7ad99a453c8ee8c975ddd9965643 \ 135 | --hash=sha256:575a832e09d237ae5fedb825a7a5bc6a116090dd57d6417d4f3b75121c73e3be \ 136 | --hash=sha256:8e65e3e4c6feadf6770e2ad89ad3deb524bcb03d8dc679f381d0568c024e0deb \ 137 | --hash=sha256:a0df9a0ad2aad49ea3c7f65edd2ffb3d5c59589b85992a6006354f6fb109bb18 \ 138 | --hash=sha256:b98bc9db003f1079caf07b610377ed1ac2e2c11acc2bea4892e28cc5b509d8d5 \ 139 | --hash=sha256:808404898e9a765e4058bf3d7607d0629000e0a14a6782ccbb089296b76fa8fe \ 140 | --hash=sha256:5770a51180d85ea468234bc7987f5597803a4c3d7463e7323322fe4a1b181578 \ 141 | --hash=sha256:976a54d44fd043d958a69b18705a910a8376196c6b6ee5f2596ffc11bff4420d \ 142 | --hash=sha256:63f3ca8451e5ff7133ffbec9eda641aeab2001be1a01878990f6c87e3c44b9d5 \ 143 | --hash=sha256:bcd945175c29a672f13fce13a11893556cd440e37c1b643d6eeab1988c8b209c \ 144 | --hash=sha256:3d9356add82cff75413bec360c1eca3e58db4a9f5dafa1f19650958a81e3249d \ 145 | 
--hash=sha256:f5d0c921c99297354cecc5a416ee4280bd3f20fd81b9fb671ca6be71499c3fdf \ 146 | --hash=sha256:14de88eda0976020528efc92d0a1f8830e2fb0de2ae6005a6fc4e062553031fa \ 147 | --hash=sha256:4c2e364491406b7888c2ad4428245fc56c327e34a5dfe58fd40df272b3c3dab3 \ 148 | --hash=sha256:8bd4f91f3fb1c9b1380d6894bd5b4a519409135bec14c0c80151e58394a4e88a \ 149 | --hash=sha256:882f53afe31ef0425b405a3f601c0009b44206ea7f55ee1c606aad3cc213a52c \ 150 | --hash=sha256:07ef35301b4484bce843831e7039a84e19d8d33b3f8b2f9aab86c376813d0139 \ 151 | --hash=sha256:360a01b5fa2ad35b3113ae0c07fb544ad180603fa3b1f074f52d98c1096fa15e \ 152 | --hash=sha256:709f65bb2fa9825f09892617d01246002097f8f9b6dde8d1bb4083cf554701ba \ 153 | --hash=sha256:c66221e947d7207457f8b6f42b12f613b09efa9669f65a587a2a71f6a0e4d106 \ 154 | --hash=sha256:c782da0e45aff131f0bed6e66fbcfa589ff2862fc719b83a88640daa01a5aff7 \ 155 | --hash=sha256:dc9963aacb7da5177e40874585d7407c0f93fb9d7518ec58b86e562f633f36cd \ 156 | --hash=sha256:a0d04128e005142260de3733591ddf476e4902c0c23c1af237d9acf3c96e1b38 \ 157 | --hash=sha256:4b8a1fb724904139149a43e172850f35aa6ea97fb0545244dc0b805e0154ed68 158 | requests-html==0.10.0; python_full_version >= "3.6.0" \ 159 | --hash=sha256:7e929ecfed95fb1d0994bb368295d6d7c4d06b03fcb900c33d7d0b17e6003947 \ 160 | --hash=sha256:cb8a78cf829c4eca9d6233f28524f65dd2bfaafb4bdbbc407f0a0b8f487df6e2 161 | requests==2.25.1; python_full_version >= "3.6.0" \ 162 | --hash=sha256:c210084e36a42ae6b9219e00e48287def368a26d03a048ddad7bfee44f75871e \ 163 | --hash=sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804 164 | six==1.15.0; python_full_version >= "3.6.0" and python_version >= "3.5" \ 165 | --hash=sha256:8b74bedcbbbaca38ff6d7491d76f2b06b3592611af620f8426e82dddb04a5ced \ 166 | --hash=sha256:30639c035cdb23534cd4aa2dd52c3bf48f06e5f4a941509c8bafd8ce11080259 167 | soupsieve==2.2.1; python_version >= "3.6" and python_full_version >= "3.6.0" \ 168 | --hash=sha256:c2c1c2d44f158cdbddab7824a9af8c4f83c76b1e23e049479aa432feb6c4c23b \ 169 | --hash=sha256:052774848f448cf19c7e959adf5566904d525f33a3f8b6ba6f6f8f26ec7de0cc 170 | tqdm==4.59.0; python_version >= "3.5" and python_full_version >= "3.6.0" \ 171 | --hash=sha256:9fdf349068d047d4cfbe24862c425883af1db29bcddf4b0eeb2524f6fbdb23c7 \ 172 | --hash=sha256:d666ae29164da3e517fcf125e41d4fe96e5bb375cd87ff9763f6b38b5592fe33 173 | tzlocal==2.1; python_version >= "3.5" \ 174 | --hash=sha256:e2cb6c6b5b604af38597403e9852872d7f534962ae2954c7f35efcb1ccacf4a4 \ 175 | --hash=sha256:643c97c5294aedc737780a49d9df30889321cbe1204eac2c2ec6134035a92e44 176 | urllib3==1.26.5; python_full_version >= "3.6.0" and python_version < "4" and python_version >= "3.5" \ 177 | --hash=sha256:753a0374df26658f99d826cfe40394a686d05985786d946fbe4165b5148f5a7c \ 178 | --hash=sha256:a7acd0977125325f516bda9735fa7142b909a8d01e8b2e4c8108d0984e6e0098 179 | w3lib==1.22.0; python_full_version >= "3.6.0" \ 180 | --hash=sha256:0161d55537063e00d95a241663ede3395c4c6d7b777972ba2fd58bbab2001e53 \ 181 | --hash=sha256:0ad6d0203157d61149fd45aaed2e24f53902989c32fc1dccc2e2bfba371560df 182 | websockets==9.1; python_version >= "3.6" and python_full_version >= "3.6.0" \ 183 | --hash=sha256:0dd4eb8e0bbf365d6f652711ce21b8fd2b596f873d32aabb0fbb53ec604418cc \ 184 | --hash=sha256:1d0971cc7251aeff955aa742ec541ee8aaea4bb2ebf0245748fbec62f744a37e \ 185 | --hash=sha256:1d6b4fddb12ab9adf87b843cd4316c4bd602db8d5efd2fb83147f0458fe85135 \ 186 | --hash=sha256:230a3506df6b5f446fed2398e58dcaafdff12d67fe1397dff196411a9e820d02 \ 187 | 
--hash=sha256:276d2339ebf0df4f45df453923ebd2270b87900eda5dfd4a6b0cfa15f82111c3 \ 188 | --hash=sha256:2cf04601633a4ec176b9cc3d3e73789c037641001dbfaf7c411f89cd3e04fcaf \ 189 | --hash=sha256:3ddff38894c7857c476feb3538dd847514379d6dc844961dc99f04b0384b1b1b \ 190 | --hash=sha256:48c222feb3ced18f3dc61168ca18952a22fb88e5eb8902d2bf1b50faefdc34a2 \ 191 | --hash=sha256:51d04df04ed9d08077d10ccbe21e6805791b78eac49d16d30a1f1fe2e44ba0af \ 192 | --hash=sha256:597c28f3aa7a09e8c070a86b03107094ee5cdafcc0d55f2f2eac92faac8dc67d \ 193 | --hash=sha256:5c8f0d82ea2468282e08b0cf5307f3ad022290ed50c45d5cb7767957ca782880 \ 194 | --hash=sha256:7189e51955f9268b2bdd6cc537e0faa06f8fffda7fb386e5922c6391de51b077 \ 195 | --hash=sha256:7df3596838b2a0c07c6f6d67752c53859a54993d4f062689fdf547cb56d0f84f \ 196 | --hash=sha256:826ccf85d4514609219725ba4a7abd569228c2c9f1968e8be05be366f68291ec \ 197 | --hash=sha256:836d14eb53b500fd92bd5db2fc5894f7c72b634f9c2a28f546f75967503d8e25 \ 198 | --hash=sha256:85db8090ba94e22d964498a47fdd933b8875a1add6ebc514c7ac8703eb97bbf0 \ 199 | --hash=sha256:85e701a6c316b7067f1e8675c638036a796fe5116783a4c932e7eb8e305a3ffe \ 200 | --hash=sha256:900589e19200be76dd7cbaa95e9771605b5ce3f62512d039fb3bc5da9014912a \ 201 | --hash=sha256:9147868bb0cc01e6846606cd65cbf9c58598f187b96d14dd1ca17338b08793bb \ 202 | --hash=sha256:9e7fdc775fe7403dbd8bc883ba59576a6232eac96dacb56512daacf7af5d618d \ 203 | --hash=sha256:ab5ee15d3462198c794c49ccd31773d8a2b8c17d622aa184f669d2b98c2f0857 \ 204 | --hash=sha256:ad893d889bc700a5835e0a95a3e4f2c39e91577ab232a3dc03c262a0f8fc4b5c \ 205 | --hash=sha256:b2e71c4670ebe1067fa8632f0d081e47254ee2d3d409de54168b43b0ba9147e0 \ 206 | --hash=sha256:b43b13e5622c5a53ab12f3272e6f42f1ce37cd5b6684b2676cb365403295cd40 \ 207 | --hash=sha256:b4ad84b156cf50529b8ac5cc1638c2cf8680490e3fccb6121316c8c02620a2e4 \ 208 | --hash=sha256:be5fd35e99970518547edc906efab29afd392319f020c3c58b0e1a158e16ed20 \ 209 | --hash=sha256:caa68c95bc1776d3521f81eeb4d5b9438be92514ec2a79fececda814099c8314 \ 210 | --hash=sha256:d144b350045c53c8ff09aa1cfa955012dd32f00c7e0862c199edcabb1a8b32da \ 211 | --hash=sha256:d2c2d9b24d3c65b5a02cac12cbb4e4194e590314519ed49db2f67ef561c3cf58 \ 212 | --hash=sha256:e9e5fd6dbdf95d99bc03732ded1fc8ef22ebbc05999ac7e0c7bf57fe6e4e5ae2 \ 213 | --hash=sha256:ebf459a1c069f9866d8569439c06193c586e72c9330db1390af7c6a0a32c4afd \ 214 | --hash=sha256:f31722f1c033c198aa4a39a01905951c00bd1c74f922e8afc1b1c62adbcdd56a \ 215 | --hash=sha256:f68c352a68e5fdf1e97288d5cec9296664c590c25932a8476224124aaf90dbcd 216 | -------------------------------------------------------------------------------- /tests/manualTEst.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from facebook_scraper import get_posts 4 | 5 | post = next(get_posts(account=100089065833006, start_url="https://mbasic.facebook.com/?v=timeline", cookies='../cks.txt')) 6 | print(post['images']) 7 | -------------------------------------------------------------------------------- /tests/test_get_posts.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import pytest 4 | 5 | from facebook_scraper import * 6 | 7 | 8 | @pytest.mark.vcr() 9 | class TestGetPosts: 10 | def test_get_posts(self): 11 | expected_post = { 12 | 'available': True, 13 | 'comments': 149, 14 | 'comments_full': None, 15 | 'factcheck': None, 16 | 'image': 
'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-9/fr/cp0/e15/q65/96724875_3065146506903115_4237164853036318720_n.jpg?_nc_cat=103&ccb=1-3&_nc_sid=8024bb&_nc_ohc=SvpNqSK7ILIAX93ehWM&_nc_ht=scontent.fhlz2-1.fna&tp=14&oh=d32fa3269feeaf6904d78a512f41ab26&oe=60E673C5', 17 | 'image_id': '3065146500236449', 18 | 'image_ids': [ 19 | '3065146500236449', 20 | '3065146626903103', 21 | '3065146783569754', 22 | '3065146886903077', 23 | ], 24 | 'image_lowquality': 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-9/cp0/e15/q65/p720x720/96724875_3065146506903115_4237164853036318720_n.jpg?_nc_cat=103&ccb=1-3&_nc_sid=8024bb&_nc_ohc=SvpNqSK7ILIAX93ehWM&_nc_ht=scontent.fhlz2-1.fna&tp=3&oh=426e258c934177d9ded48435efaecc6c&oe=60E74054', 25 | 'images': [ 26 | 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-9/fr/cp0/e15/q65/96724875_3065146506903115_4237164853036318720_n.jpg?_nc_cat=103&ccb=1-3&_nc_sid=8024bb&_nc_ohc=SvpNqSK7ILIAX93ehWM&_nc_ht=scontent.fhlz2-1.fna&tp=14&oh=d32fa3269feeaf6904d78a512f41ab26&oe=60E673C5', 27 | 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-9/fr/cp0/e15/q65/96657922_3065146630236436_9052202957155598336_n.jpg?_nc_cat=101&ccb=1-3&_nc_sid=8024bb&_nc_ohc=MwI_Au5sC60AX93Dkix&_nc_ht=scontent.fhlz2-1.fna&tp=14&oh=b947668e646a0e7614671deff90dc9a3&oe=60E41393', 28 | 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-9/fr/cp0/e15/q65/96557798_3065146790236420_838564679184809984_n.jpg?_nc_cat=103&ccb=1-3&_nc_sid=8024bb&_nc_ohc=ydkcrs8kPykAX_0Fdn4&_nc_ht=scontent.fhlz2-1.fna&tp=14&oh=7884c93d73b2a9f806baf829c8f941b0&oe=60E4D7FB', 29 | 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-9/fr/cp0/e15/q65/96688092_3065146896903076_7861539131082407936_n.jpg?_nc_cat=108&ccb=1-3&_nc_sid=8024bb&_nc_ohc=vqgGsFXTmO4AX82bX5z&_nc_ht=scontent.fhlz2-1.fna&tp=14&oh=379eb1c4551d74a13a4cafb07524288e&oe=60E6753F', 30 | ], 31 | 'images_description': [ 32 | 'No photo description available.', 33 | 'No photo description available.', 34 | 'No photo description available.', 35 | 'No photo description available.', 36 | ], 37 | 'images_lowquality': [ 38 | 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-9/cp0/e15/q65/p720x720/96724875_3065146506903115_4237164853036318720_n.jpg?_nc_cat=103&ccb=1-3&_nc_sid=8024bb&_nc_ohc=SvpNqSK7ILIAX93ehWM&_nc_ht=scontent.fhlz2-1.fna&tp=3&oh=426e258c934177d9ded48435efaecc6c&oe=60E74054', 39 | 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-0/cp0/e15/q65/s640x640/96657922_3065146630236436_9052202957155598336_n.jpg?_nc_cat=101&ccb=1-3&_nc_sid=8024bb&_nc_ohc=MwI_Au5sC60AX93Dkix&_nc_ht=scontent.fhlz2-1.fna&tp=9&oh=5c016bd47d3d9ab3ba997b48dbc21a97&oe=60E75F2D', 40 | 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-0/cp0/e15/q65/s640x640/96557798_3065146790236420_838564679184809984_n.jpg?_nc_cat=103&ccb=1-3&_nc_sid=8024bb&_nc_ohc=ydkcrs8kPykAX_0Fdn4&_nc_ht=scontent.fhlz2-1.fna&tp=9&oh=ca962fe95d846cbd6e4e78b0884572c9&oe=60E51308', 41 | 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-0/cp0/e15/q65/s640x640/96688092_3065146896903076_7861539131082407936_n.jpg?_nc_cat=108&ccb=1-3&_nc_sid=8024bb&_nc_ohc=vqgGsFXTmO4AX82bX5z&_nc_ht=scontent.fhlz2-1.fna&tp=9&oh=7e9da116d24a9faee2fe15c16d7dea8f&oe=60E3DD81', 42 | ], 43 | 'images_lowquality_description': [ 44 | 'No photo description available.', 45 | 'No photo description available.', 46 | 'No photo description available.', 47 | 'No photo description available.', 48 | ], 49 | 'is_live': False, 50 | 'likes': 1615, 51 | 'link': 'https://www.nintendo.com/wallpapers/', 52 | 'original_request_url': 3065154550235644, 53 | 'post_id': 
'3065154550235644', 54 | 'post_text': 'Check out these themed wallpapers and many more at the link ' 55 | 'below for your personal use! We hope you enjoy them!\n' 56 | '\n' 57 | 'https://www.nintendo.com/wallpapers/', 58 | 'post_url': 'https://facebook.com/story.php?story_fbid=3065154550235644&id=119240841493711', 59 | 'reaction_count': None, 60 | 'reactions': None, 61 | 'reactors': None, 62 | 'shared_post_id': None, 63 | 'shared_post_url': None, 64 | 'shared_text': '', 65 | 'shared_time': None, 66 | 'shared_user_id': None, 67 | 'shared_username': None, 68 | 'shares': 281, 69 | 'text': 'Check out these themed wallpapers and many more at the link below ' 70 | 'for your personal use! We hope you enjoy them!\n' 71 | '\n' 72 | 'https://www.nintendo.com/wallpapers/', 73 | 'time': datetime.datetime(2020, 5, 13, 13, 1, 18), 74 | 'user_id': '119240841493711', 75 | 'user_url': 'https://facebook.com/Nintendo/?refid=52&__tn__=C-R', 76 | 'username': 'Nintendo', 77 | 'video': None, 78 | 'video_duration_seconds': None, 79 | 'video_height': None, 80 | 'video_id': None, 81 | 'video_quality': None, 82 | 'video_size_MB': None, 83 | 'video_thumbnail': None, 84 | 'video_watches': None, 85 | 'video_width': None, 86 | 'w3_fb_url': None, 87 | } 88 | 89 | post = next(get_posts(post_urls=[3065154550235644])) 90 | print(post) 91 | assert post == expected_post 92 | 93 | def test_get_posts_with_extra_info(self): 94 | expected_post = { 95 | 'available': True, 96 | 'comments': 149, 97 | 'comments_full': None, 98 | 'factcheck': None, 99 | 'fetched_time': datetime.datetime(2021, 6, 9, 10, 31, 43, 834002), 100 | 'image': 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-9/fr/cp0/e15/q65/96724875_3065146506903115_4237164853036318720_n.jpg?_nc_cat=103&ccb=1-3&_nc_sid=8024bb&efg=eyJpIjoidCJ9&_nc_ohc=SvpNqSK7ILIAX93ehWM&_nc_ht=scontent.fhlz2-1.fna&tp=14&oh=d32fa3269feeaf6904d78a512f41ab26&oe=60E673C5', 101 | 'image_id': '3065146500236449', 102 | 'image_ids': [ 103 | '3065146500236449', 104 | '3065146626903103', 105 | '3065146783569754', 106 | '3065146886903077', 107 | ], 108 | 'image_lowquality': 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-9/cp0/e15/q65/p720x720/96724875_3065146506903115_4237164853036318720_n.jpg?_nc_cat=103&ccb=1-3&_nc_sid=8024bb&efg=eyJpIjoidCJ9&_nc_ohc=SvpNqSK7ILIAX93ehWM&_nc_ht=scontent.fhlz2-1.fna&tp=3&oh=426e258c934177d9ded48435efaecc6c&oe=60E74054', 109 | 'images': [ 110 | 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-9/fr/cp0/e15/q65/96724875_3065146506903115_4237164853036318720_n.jpg?_nc_cat=103&ccb=1-3&_nc_sid=8024bb&efg=eyJpIjoidCJ9&_nc_ohc=SvpNqSK7ILIAX93ehWM&_nc_ht=scontent.fhlz2-1.fna&tp=14&oh=d32fa3269feeaf6904d78a512f41ab26&oe=60E673C5', 111 | 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-9/fr/cp0/e15/q65/96657922_3065146630236436_9052202957155598336_n.jpg?_nc_cat=101&ccb=1-3&_nc_sid=8024bb&efg=eyJpIjoidCJ9&_nc_ohc=MwI_Au5sC60AX93Dkix&tn=8omYOUODC-SvWcRg&_nc_ht=scontent.fhlz2-1.fna&tp=14&oh=607e4783ada8c14a5d0fe50eaed35b74&oe=60E41393', 112 | 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-9/fr/cp0/e15/q65/96557798_3065146790236420_838564679184809984_n.jpg?_nc_cat=103&ccb=1-3&_nc_sid=8024bb&efg=eyJpIjoidCJ9&_nc_ohc=ydkcrs8kPykAX_0Fdn4&_nc_ht=scontent.fhlz2-1.fna&tp=14&oh=7884c93d73b2a9f806baf829c8f941b0&oe=60E4D7FB', 113 | 
'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-9/fr/cp0/e15/q65/96688092_3065146896903076_7861539131082407936_n.jpg?_nc_cat=108&ccb=1-3&_nc_sid=8024bb&efg=eyJpIjoidCJ9&_nc_ohc=vqgGsFXTmO4AX82bX5z&_nc_ht=scontent.fhlz2-1.fna&tp=14&oh=379eb1c4551d74a13a4cafb07524288e&oe=60E6753F', 114 | ], 115 | 'images_description': [ 116 | 'No photo description available.', 117 | 'No photo description available.', 118 | 'No photo description available.', 119 | 'No photo description available.', 120 | ], 121 | 'images_lowquality': [ 122 | 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-9/cp0/e15/q65/p720x720/96724875_3065146506903115_4237164853036318720_n.jpg?_nc_cat=103&ccb=1-3&_nc_sid=8024bb&efg=eyJpIjoidCJ9&_nc_ohc=SvpNqSK7ILIAX93ehWM&_nc_ht=scontent.fhlz2-1.fna&tp=3&oh=426e258c934177d9ded48435efaecc6c&oe=60E74054', 123 | 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-0/cp0/e15/q65/s640x640/96657922_3065146630236436_9052202957155598336_n.jpg?_nc_cat=101&ccb=1-3&_nc_sid=8024bb&efg=eyJpIjoidCJ9&_nc_ohc=MwI_Au5sC60AX93Dkix&tn=8omYOUODC-SvWcRg&_nc_ht=scontent.fhlz2-1.fna&tp=9&oh=85385c57a98cbd698d746ddafc29a61c&oe=60E75F2D', 124 | 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-0/cp0/e15/q65/s640x640/96557798_3065146790236420_838564679184809984_n.jpg?_nc_cat=103&ccb=1-3&_nc_sid=8024bb&efg=eyJpIjoidCJ9&_nc_ohc=ydkcrs8kPykAX_0Fdn4&_nc_ht=scontent.fhlz2-1.fna&tp=9&oh=ca962fe95d846cbd6e4e78b0884572c9&oe=60E51308', 125 | 'https://scontent.fhlz2-1.fna.fbcdn.net/v/t1.6435-0/cp0/e15/q65/s640x640/96688092_3065146896903076_7861539131082407936_n.jpg?_nc_cat=108&ccb=1-3&_nc_sid=8024bb&efg=eyJpIjoidCJ9&_nc_ohc=vqgGsFXTmO4AX82bX5z&_nc_ht=scontent.fhlz2-1.fna&tp=9&oh=7e9da116d24a9faee2fe15c16d7dea8f&oe=60E3DD81', 126 | ], 127 | 'images_lowquality_description': [ 128 | 'No photo description available.', 129 | 'No photo description available.', 130 | 'No photo description available.', 131 | 'No photo description available.', 132 | ], 133 | 'is_live': False, 134 | 'likes': 1615, 135 | 'link': 'https://www.nintendo.com/wallpapers/?fbclid=IwAR3uYocTphYdr6YYAznKWWdMBZ-p_Id3uNTFJ3_3lHwjnL3H7rRIEvb8yY8', 136 | 'original_request_url': 3065154550235644, 137 | 'post_id': '3065154550235644', 138 | 'post_text': 'Check out these themed wallpapers and many more at the link ' 139 | 'below for your personal use! We hope you enjoy them!\n' 140 | '\n' 141 | 'https://www.nintendo.com/wallpapers/', 142 | 'post_url': 'https://facebook.com/story.php?story_fbid=3065154550235644&id=119240841493711', 143 | 'reaction_count': 2117, 144 | 'reactions': { 145 | 'angry': 3, 146 | 'care': 92, 147 | 'haha': 4, 148 | 'like': 1615, 149 | 'love': 381, 150 | 'wow': 22, 151 | }, 152 | 'reactors': [], 153 | 'shared_post_id': None, 154 | 'shared_post_url': None, 155 | 'shared_text': '', 156 | 'shared_time': None, 157 | 'shared_user_id': None, 158 | 'shared_username': None, 159 | 'shares': 281, 160 | 'text': 'Check out these themed wallpapers and many more at the link below ' 161 | 'for your personal use! 
We hope you enjoy them!\n' 162 | '\n' 163 | 'https://www.nintendo.com/wallpapers/', 164 | 'time': datetime.datetime(2020, 5, 13, 13, 1), 165 | 'user_id': '119240841493711', 166 | 'user_url': 'https://facebook.com/Nintendo/?refid=52&__tn__=C-R', 167 | 'username': 'Nintendo', 168 | 'video': None, 169 | 'video_duration_seconds': None, 170 | 'video_height': None, 171 | 'video_id': None, 172 | 'video_quality': None, 173 | 'video_size_MB': None, 174 | 'video_thumbnail': None, 175 | 'video_watches': None, 176 | 'video_width': None, 177 | 'w3_fb_url': 'https://www.facebook.com/story.php?story_fbid=3065154550235644&id=119240841493711', 178 | } 179 | 180 | post = next( 181 | get_posts(post_urls=[3065154550235644], extra_info=True, cookies="cookies.txt") 182 | ) 183 | 184 | fields_to_ignore = ["fetched_time", "link"] 185 | for field in fields_to_ignore: 186 | post.pop(field) # Do not check this field 187 | expected_post.pop(field) 188 | 189 | assert post == expected_post 190 | 191 | def test_get_posts_fields_presence(self): 192 | posts = list(get_posts(account='Nintendo', pages=2, extra_info=True)) 193 | 194 | assert len(posts) == 6 195 | 196 | for post in posts: 197 | assert 'post_id' in post 198 | assert 'text' in post 199 | assert 'time' in post 200 | assert 'image' in post 201 | assert 'video' in post 202 | assert 'likes' in post 203 | assert 'comments' in post 204 | assert 'shares' in post 205 | assert 'post_url' in post 206 | assert 'link' in post 207 | 208 | def test_get_posts_with_extra_info_fields_presence(self): 209 | posts = list( 210 | get_posts(account='Nintendo', pages=2, cookies="cookies.txt", extra_info=True) 211 | ) 212 | 213 | assert len(posts) == 6 214 | 215 | for post in posts: 216 | assert 'post_id' in post 217 | assert 'text' in post 218 | assert 'time' in post 219 | assert 'video' in post 220 | assert 'image' in post 221 | assert 'likes' in post 222 | assert 'comments' in post 223 | assert 'shares' in post 224 | assert 'post_url' in post 225 | assert 'link' in post 226 | assert 'shares' in post 227 | assert 'likes' in post 228 | assert 'reactions' in post 229 | assert 'comments' in post 230 | assert 'w3_fb_url' in post 231 | assert 'fetched_time' in post 232 | 233 | def test_smoketest(self): 234 | list(get_posts(account='Nintendo', pages=2)) 235 | 236 | 237 | @pytest.mark.vcr() 238 | class TestGetGroupPosts: 239 | def test_get_group_posts(self): 240 | expected_post = { 241 | 'available': True, 242 | 'comments': 1, 243 | 'comments_full': None, 244 | 'factcheck': None, 245 | 'image': None, 246 | 'image_id': None, 247 | 'image_ids': [], 248 | 'image_lowquality': None, 249 | 'images': [], 250 | 'images_description': [], 251 | 'images_lowquality': [], 252 | 'images_lowquality_description': [], 253 | 'is_live': False, 254 | 'likes': 32, 255 | 'link': None, 256 | 'post_id': '1629606003787605', 257 | 'post_text': 'Hola!, This group is aimed to create opportunities for South ' 258 | 'American students in Computer Science and related fields.\n' 259 | '\n' 260 | 'Hope this will help us to know what we are doing in our work, ' 261 | 'achievements to be recognized, increase fairness in our area, ' 262 | 'and maybe conferences where we might meet.\n' 263 | '\n' 264 | 'Professors and professionals are also welcomed to share their ' 265 | 'experiences and to collaborate among us and learn together.\n' 266 | '\n' 267 | 'Some short rules for a happy co-existence:\n' 268 | '1. No business advertisement or spam.\n' 269 | '2. 
Topics relevant to Computing, Computer Science, Software ' 270 | 'Engineering, and Education.\n' 271 | '3. Political and religious advertisement are not allowed.', 272 | 'post_url': 'https://m.facebook.com/groups/southamericansincomputing/permalink/1629606003787605/', 273 | 'reaction_count': None, 274 | 'reactions': None, 275 | 'reactors': None, 276 | 'shared_post_id': None, 277 | 'shared_post_url': None, 278 | 'shared_text': '', 279 | 'shared_time': None, 280 | 'shared_user_id': None, 281 | 'shared_username': None, 282 | 'shares': 0, 283 | 'text': 'Hola!, This group is aimed to create opportunities for South ' 284 | 'American students in Computer Science and related fields.\n' 285 | '\n' 286 | 'Hope this will help us to know what we are doing in our work, ' 287 | 'achievements to be recognized, increase fairness in our area, and ' 288 | 'maybe conferences where we might meet.\n' 289 | '\n' 290 | 'Professors and professionals are also welcomed to share their ' 291 | 'experiences and to collaborate among us and learn together.\n' 292 | '\n' 293 | 'Some short rules for a happy co-existence:\n' 294 | '1. No business advertisement or spam.\n' 295 | '2. Topics relevant to Computing, Computer Science, Software ' 296 | 'Engineering, and Education.\n' 297 | '3. Political and religious advertisement are not allowed.', 298 | 'time': datetime.datetime(2018, 4, 4, 8, 2, 42), 299 | 'user_id': 757122227, 300 | 'user_url': 'https://facebook.com/omarflorez?groupid=117507531664134&refid=18&_ft_=top_level_post_id.1629606003787605%3Acontent_owner_id_new.757122227%3Apage_id.117507531664134%3Astory_location.6%3Atds_flgs.3%3Aott.AX_xo0_Tl6A-u34K%3Apage_insights.%7B%22117507531664134%22%3A%7B%22page_id%22%3A117507531664134%2C%22page_id_type%22%3A%22group%22%2C%22actor_id%22%3A757122227%2C%22dm%22%3A%7B%22isShare%22%3A0%2C%22originalPostOwnerID%22%3A0%7D%2C%22psn%22%3A%22EntGroupDescriptionChangeCreationStory%22%2C%22post_context%22%3A%7B%22object_fbtype%22%3A657%2C%22publish_time%22%3A1522785762%2C%22story_name%22%3A%22EntGroupDescriptionChangeCreationStory%22%2C%22story_fbid%22%3A%5B1629606003787605%5D%7D%2C%22role%22%3A1%2C%22sl%22%3A6%7D%7D&__tn__=C-R', 301 | 'username': 'Omar U. 
Florez', 302 | 'video': None, 303 | 'video_duration_seconds': None, 304 | 'video_height': None, 305 | 'video_id': None, 306 | 'video_quality': None, 307 | 'video_size_MB': None, 308 | 'video_thumbnail': None, 309 | 'video_watches': None, 310 | 'video_width': None, 311 | 'w3_fb_url': None, 312 | } 313 | 314 | unset_cookies() 315 | post = next(get_posts(group=117507531664134)) 316 | 317 | assert post == expected_post 318 | 319 | # todo: add a case with requesting a group post with start_url=None 320 | 321 | def test_smoketest(self): 322 | list(get_posts(group=117507531664134, pages=2)) 323 | 324 | 325 | @pytest.mark.vcr() 326 | class TestGetPhotos: 327 | def test_smoketest(self): 328 | list(get_photos(account='Nintendo', pages=2)) 329 | -------------------------------------------------------------------------------- /tests/test_parse_date.py: -------------------------------------------------------------------------------- 1 | from facebook_scraper.utils import parse_datetime 2 | 3 | 4 | class TestParseDate: 5 | dates = [ 6 | 'Oct 1 at 1:00 PM', 7 | 'Oct 1 at 11:00 PM', 8 | 'Oct 16 at 1:00 PM', 9 | 'Oct 16 at 11:00 PM', 10 | 'October 1 at 1:00 PM', 11 | 'October 1 at 11:00 PM', 12 | 'October 16 at 1:00 PM', 13 | 'October 16 at 11:00 PM', 14 | 'October 1, 2019 at 1:00 PM', 15 | 'October 1, 2019 at 11:00 PM', 16 | 'October 16, 2019 at 1:00 PM', 17 | 'October 16, 2019 at 11:00 PM', 18 | 'Yesterday at 1:00 PM', 19 | 'Yesterday at 11:00 PM', 20 | 'Today at 1:00 PM', 21 | 'Today at 11:00 PM', 22 | 'Yesterday at 1:00 PM', 23 | 'Yesterday at 11:00 PM', 24 | 'Yesterday at 15:28', 25 | '7 November at 20:01', 26 | '1h', 27 | '16h', 28 | '1hrs', 29 | '16hrs', 30 | '1 hr', 31 | '16 hrs', 32 | '1 min', 33 | '50 mins', 34 | 'on Sat', 35 | '1 wk', 36 | '2 wks', 37 | '1 yr', 38 | '2 yrs', 39 | '1 mth', 40 | '4 mths', 41 | 'last Tue', 42 | 'last Mon', 43 | '11 mos', 44 | '1 mo', 45 | 'Just now', 46 | ] 47 | 48 | def test_all_dates(self): 49 | for date in self.dates: 50 | try: 51 | assert parse_datetime(date) is not None 52 | except AssertionError as e: 53 | print(f'Failed to parse {date}') 54 | raise e 55 | -------------------------------------------------------------------------------- /tests/test_parse_duration.py: -------------------------------------------------------------------------------- 1 | from facebook_scraper.utils import parse_duration 2 | 3 | 4 | class TestParseDuration: 5 | durations = ['T26S', 'T33M8S', 'T1H28M15S'] 6 | 7 | def test_all_durations(self): 8 | for duration in self.durations: 9 | try: 10 | assert parse_duration(duration) is not None 11 | except AssertionError as e: 12 | print(f'Failed to parse {duration}') 13 | raise e 14 | --------------------------------------------------------------------------------
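Taken together, the fixtures above exercise most of the public scraping API: get_posts with account, post_urls, or group (optionally with cookies and extra_info), and get_photos for photo listings. The sketch below simply strings those calls together for quick reference; it is not a file in the repository, and the account name, post/group IDs, and cookies.txt path are reused from the test fixtures purely for illustration. Unlike the VCR-backed tests, running it issues live requests to Facebook.

# Illustrative usage sketch (not part of the repository); identifiers below are
# reused from the test fixtures above purely for demonstration.
from facebook_scraper import get_posts, get_photos

# Page timeline: iterate the first two result pages, as in
# TestGetPosts.test_get_posts_fields_presence.
for post in get_posts(account='Nintendo', pages=2):
    print(post['post_id'], post['time'], post['likes'], post['post_url'])

# Single post by ID, as in TestGetPosts.test_get_posts.
post = next(get_posts(post_urls=[3065154550235644]))
print(post['text'])

# extra_info=True adds reaction breakdowns and w3_fb_url but needs cookies,
# as in TestGetPosts.test_get_posts_with_extra_info.
post = next(
    get_posts(post_urls=[3065154550235644], extra_info=True, cookies='cookies.txt')
)
print(post['reactions'])

# Group posts and photo listings follow the same generator pattern.
group_post = next(get_posts(group=117507531664134))
photos = list(get_photos(account='Nintendo', pages=2))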