├── .gitignore ├── LICENSE ├── README.md ├── comments.png ├── fbcrawl ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-37.pyc │ ├── items.cpython-37.pyc │ ├── pipelines.cpython-37.pyc │ └── settings.cpython-37.pyc ├── items.py ├── middlewares.py ├── pipelines.py ├── settings.py └── spiders │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-37.pyc │ ├── comments.cpython-37.pyc │ └── fbcrawl.cpython-37.pyc │ ├── comments.py │ ├── events.py │ ├── fbcrawl.py │ └── profiles.py ├── runner_facebook.sh ├── scrapy.cfg └── trump.png /.gitignore: -------------------------------------------------------------------------------- 1 | **/__pycache__/ 2 | 3 | .* 4 | !.gitignore 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # fbcrawl 2 | Fbcrawl is an advanced crawler for Facebook, written in python, based on the [Scrapy](https://scrapy.org/) framework. 3 | 4 | # UNMAINTAINED 5 | For an undefined period I will be unable to review issues, fix bugs and merge pull requests. As I have been the sole contributor to the project, it's likely that the code will remain frozen at the current stage. 
6 | 7 | Anybody who is skilled enough and willing to participate may open a dedicated issue or contact me at my email address: rugantio AT gmail DOT com 8 | 9 | I will be back, but in the meantime I'd appreciate it if this became a community project. 10 | 11 | ## DONATIONS 12 | Fbcrawl is free software. It is not "free as in beer" nor "free as in speech", it is "free as a toilet": it is always available and working, but someone has to keep it clean and tidy, and at the moment I am the only one, since it is not a community project. Please consider making a donation: it will keep this project alive, and if I see actual interest from people I will get on with the [TODO](https://github.com/rugantio/fbcrawl/blob/master/README.md#TODO) list. One of my long-term goals is to refactor the framework with a GUI, database connections and graph visualizations. These tasks would take at least a couple of months of work, and I will be able to afford them only with your support! Thank you :) 13 | 14 | [![paypal](https://www.paypalobjects.com/en_US/IT/i/btn/btn_donateCC_LG.gif)](https://www.paypal.com/cgi-bin/webscr?cmd=_donations&business=G96T8U8W7UZDL&currency_code=EUR&source=url) 15 | 16 | ## DISCLAIMER 17 | This software is not authorized by Facebook and doesn't follow Facebook's [robots.txt](https://www.facebook.com/robots.txt). Scraping without Facebook's explicit written permission is a violation of its [terms and conditions on scraping](http://www.facebook.com/apps/site_scraping_tos_terms.php) and can potentially lead to a [lawsuit](https://petewarden.com/2010/04/05/how-i-got-sued-by-facebook/). 18 | 19 | This software is provided as is, for educational purposes, to show how a crawler can be made to recursively parse a Facebook page. Use at your own risk. 20 | 21 | # Introduction 22 | 23 |
24 | [image: trump.png (Donald Trump)] 25 |
26 | 27 | EDIT: fbcrawl can now crawl comments! Check out the "how to crawl comments" section! 28 | 29 | What features can fbcrawl obtain? Everything that you see in the table is crawled by default. I decided to simplify the timestamp feature, leaving out the hour, and to ignore comments and commentators, which are going to be parsed post-by-post by another crawler. 30 | 31 | Fbcrawl makes use of a static mobile version of Facebook, unknown to many: [https://mbasic.facebook.com](https://mbasic.facebook.com). Because it's all plain HTML, we can navigate easily through the pages without having to emulate a browser or inject JavaScript code. 32 | 33 | ## Installation 34 | Requirements are: **python3** (python2 is also supported) and the **scrapy** framework, which should pull in the other needed libs as dependencies (twisted, libxml2 etc.). 35 | 36 | Scrapy can be installed through your distribution's package manager (on my Arch box it is simply called "scrapy") or through Python's package system, by typing: 37 | 38 | ```pip install scrapy``` 39 | 40 | ## Architecture 41 | Scrapy works through an engine that granularly manages every step of the crawling process. 42 | 43 | 44 | 45 | The project is thus divided into several files that serve different purposes: 46 | 47 | \fbcrawl 48 |
     49 | README.md -- this file 50 |
     51 | scrapy.cfg -- ini-style file that defines the project 52 |
     53 | \fbcrawl 54 |
          55 | \_\_init\_\_.py
         57 | **items.py** -- defines the fields that we want to export 58 |
         59 | middlewares.py 60 |
         61 | **pipelines.py** -- defines how we handle each item (the set of fields) 62 |
         63 | **settings.py** -- all the parameter settings of the project 64 |
         65 | \spiders 66 |
              67 | \_\_init\_\_.py
             69 | **fbcrawl.py** -- implements the spider for posts 70 |
              71 | **comments.py** -- implements the spider for comments 72 | 73 | ## How to crawl a page (fbcrawl.py) 74 | The core of the crawler is this spider class, `fbcrawl`. On init, it navigates to `mbasic.facebook.com` and logs into Facebook according to the provided `credentials`, passed as parameters at execution time (see "How to use"). Then the `parse_page` method is called with the `page` name given at runtime, and the crawling process begins, recursively retrieving all the posts found on every page. For each post it retrieves all the features, using the callback `parse_post`, and all the reactions, using `parse_reactions`. 75 | 76 | The webpages are parsed and the fields are extracted using **XPath** selectors. These selectors are implemented on top of the Python lib `lxml`, so they are very fast. 77 | 78 | Thanks to XPath, scrapy can navigate the webpage as a DOM model, much as one would navigate a filesystem, with several pattern-matching features. If you know nothing about XPath, [this guide](https://blog.scrapinghub.com/2016/10/27/an-introduction-to-xpath-with-examples/) and [this cheatsheet](https://devhints.io/xpath) can be helpful. Other resources are the original [W3C docs](https://www.w3.org/TR/2017/REC-xpath-31-20170321/) and [XPath functions](https://docs.oracle.com/cd/E35413_01/doc.722/e35419/dev_xpath_functions.htm). 79 | 80 | The XPath expressions are easy to obtain using Firefox's or Chromium's dev tools, but sometimes the field relative to a property changes location, which is something to keep in mind. For example, notice how I had to handle the `source` field using the pipe `|`, which is the OR operator: `new.add_xpath('source', '//span/strong/a/text() | //div/a/strong/text() | //td/div/h3/strong/a/text()')`. This kind of juggling is helpful to maintain consistency of the data in our table. The control over the data and the policy to apply are often implemented in the Item Pipeline. 81 | 82 | So the parse methods populate Item fields (explained in the next section) and pass control over to the Item Loader. 83 | 84 | Refer to Scrapy's [Spider documentation](https://docs.scrapy.org/en/latest/topics/spiders.html) for more info. 85 | 86 | ## Items (items.py) 87 | This file defines an Item class, so that the fields that we have extracted can be grouped into Items and organized in a more concise manner. Item objects are simple containers used to collect the scraped data. They provide a dictionary-like API with a convenient syntax for declaring their available fields. 88 | 89 | I have extracted every field present in the post elements and added a few local ones. Namely, for each article we have: 90 | 91 | ``` 92 | source - name of the post publisher; if the post is shared, it's the original one 93 | shared_from - if the post is shared, the profile name of the original post creator 94 | date - timestamp in datetime.date() format 95 | text - full text of the post, if empty it's a pic or a video 96 | reactions - total number of reactions 97 | likes - number of likes 98 | ahah - number of ahah 99 | love - number of love 100 | wow - number of wow 101 | sigh - number of sigh 102 | grrr - number of grrr 103 | comments - number of comments 104 | url - relative link to the post 105 | ``` 106 | Notice that this file is also used to modify the fields that we want to change before deciding what to do with the items.
To accomplish these kinds of tasks, scrapy provides a series of built-in "`processors`" (such as the `input_processor`) and functions (such as `TakeFirst()`) that we can use to adjust the fields we want. These are explained in the official [Item Loaders](https://docs.scrapy.org/en/latest/topics/loaders.html) section of the documentation. 107 | 108 | Also refer to Scrapy's [Item documentation](https://docs.scrapy.org/en/latest/topics/items.html) for more info. 109 | 110 | ## Settings (settings.py) 111 | Scrapy is a very powerful framework and it allows complex tweaking to be put in place. In this project we changed only a handful of settings, but keep in mind that there are a lot of them. 112 | To make the crawler synchronous and get all the items one-by-one, so that they are chronologically ordered in the final CSV, you can set CONCURRENT_REQUESTS = 1 in settings.py. 113 | 114 | Pipelines are useful methods to manipulate items, as you can see from the [official guide](https://doc.scrapy.org/en/latest/topics/item-pipeline.html). In our project I have prepared a pipeline to drop all the posts that were made before a certain date; you can check out the code in `pipelines.py`. Pipelines are not enabled by default, they need to be declared here. Since we can define more than one of them, a number in the 0-1000 range is used to indicate priority (lower runs first). This is why we have set: 115 | ``` 116 | ITEM_PIPELINES = { 117 | 'fbcrawl.pipelines.FbcrawlPipeline': 300, 118 | } 119 | ``` 120 | Besides dropping items according to their timestamp, we can also export them locally to a CSV or a JSON file. In case we choose to create a CSV file, we need to specify the order of the columns by explicitly setting: 121 | ``` 122 | FEED_EXPORT_FIELDS = ["source", "date", "text", "reactions","likes","ahah","love","wow","sigh","grrr","comments","url"] 123 | ``` 124 | Scrapy's default behavior is to follow robots.txt guidelines, so we need to disable this by setting `ROBOTSTXT_OBEY = False`. 125 | 126 | ## How to use 127 | 128 | Make sure that scrapy is installed, and clone this repository. Navigate to the project's top-level directory and launch scrapy with: 129 | ``` 130 | scrapy crawl fb -a email="EMAILTOLOGIN" -a password="PASSWORDTOLOGIN" -a page="NAMEOFTHEPAGETOCRAWL" -a date="2018-01-01" -a lang="it" -o DUMPFILE.csv 131 | 132 | ``` 133 | For example, let's say I want to crawl Donald Trump's page: 134 | ``` 135 | scrapy crawl fb -a email="barackobama@gmail.com" -a password="10wnyu31" -a page="DonaldTrump" -a date="2018-01-01" -a lang="it" -o Trump.csv 136 | ``` 137 | The **email** and **password** are valid Facebook credentials; the login might be cumbersome and some exceptions are handled, like the "save-device" checkpoint. 138 | 139 | The **page** parameter is the name of the page, although full links (with the Facebook domain inside) are also understood. 140 | 141 | The **date** parameter tells fbcrawl when to stop going back in time; it's optional, and the default behavior is to crawl everything available, back to the beginning of 2014. 142 | 143 | The **lang** parameter is a recent addition and specifies the language of the Facebook interface. If the language is not supported, the crawler will **fail**; in this case, change your interface language from within Facebook (settings -> language).
The crawler currently supports just a handful of languages: Italian ("it") is the original and best supported and will return a datetime format for every post; English (en), Spanish (es), French (fr) and Portuguese (pt) will also work for crawling, but the timestamp of the post will not be in year-month-day format. If not provided, the interface language will be inferred and, if it's supported, will be chosen accordingly. 144 | 145 | By design scrapy is **asynchronous**: it will not return time-ordered rows, and you can see that the datetime is not linear. Scrapy makes 16 concurrent requests, which allows it to crawl a Facebook page recursively really quickly. If you want the crawling (and the CSV) ordered **chronologically**, you can add **-s CONCURRENT_REQUESTS=1** at runtime or change the parameter in the settings; keep in mind that crawling will be a lot slower. 146 | 147 | While the crawling occurs you can check that the spiders are working correctly in the console; to show more information, change the last line of settings.py to `LOG_LEVEL = 'DEBUG'`. At the end of the process, if everything has been done right, the result can be visualized as a table. 148 | 149 | The "-o" option states that the result is to be saved in a .csv file (comma separated values), similar to a txt file that can be interpreted as a table. Fbcrawl could also save to JSON easily, but this feature is not implemented. 150 | Keep in mind that the default behavior is to append the crawled items at the bottom of an already existing file, not to overwrite it, so you might want to prefix your scrapy command with something like `rm OLDTABLE.csv; scrapy crawl fb etc.`. There are many other ways of exporting; check out the [exporter reference](https://doc.scrapy.org/en/latest/topics/exporters.html) if you want to know more. 151 | 152 | More information regarding Scrapy's [Deployment](https://doc.scrapy.org/en/latest/topics/deploy.html) and [Common Practices](https://doc.scrapy.org/en/latest/topics/practices.html) is available in the official documentation. 153 | 154 | ## How to crawl comments (comments.py) 155 | 156 | A new spider is now dedicated to crawling all the comments of a post (not a page!). 157 | 158 |
159 | [image: comments.png (Trump comments)] 160 |
161 | 162 | You can try it out with: 163 | 164 | ``` 165 | scrapy crawl comments -a email="EMAILTOLOGIN" -a password="PASSWORDTOLOGIN" -a page="LINKOFTHEPOSTTOCRAWL" -o DUMPFILE.csv 166 | ``` 167 | 168 | Usage is similar to the fb spider, the only difference being the -a page parameter, which is now the link to a post. Make sure that the `page` option is a proper post link, for example: 169 | 170 | ``` 171 | rm trump_comments.csv; scrapy crawl comments -a email="obama@gmail.com" -a password="cm380jixke" -a page="https://mbasic.facebook.com/story.php?story_fbid=10162169751605725&id=153080620724" -o trump_comments.csv 172 | ``` 173 | 174 | 175 | (!) Some comments are duplicated. This is because Facebook sometimes chooses to display a comment both on one page and on the next. There are several ways of handling this unwanted (although interesting in its own right) behavior. It's not possible to leave scrapy's duplicate filter on, because this would make the crawler quit when it encounters duplicates, leaving out many comments. The best way of handling duplicates is to clean the CSV afterwards using pandas or the csv Python module. 176 | For example, with pandas: 177 | ``` 178 | import pandas as pd 179 | df = pd.read_csv('./trump.csv') 180 | df2 = df.drop_duplicates() 181 | df2.to_csv('./trump.csv',index=False) 182 | ``` 183 | Another option would be to integrate this process with scrapy, writing a pipeline that checks all the fields for duplicates and drops the items that are caught (however, the crawling would be slower); see the sketch at the end of this section. 184 | 185 | The comments spider is able to crawl all nested replied-to comments. The root comment is indicated as `ROOT` in the `reply-to` column, and the replies have a reference to the profile name that they are answering to, in the same `reply-to` column. They usually follow the `ROOT` comment in order, but it's not easy to perfectly turn off concurrency in scrapy, so that might not always be the case. For regular comments that don't get replies, the `reply-to` column is empty. 186 | 187 | The supported Facebook interfaces at the moment are `EN` and `IT`; they can be specified via the optional `-a lang` parameter and are guessed otherwise. The difference between the language interfaces is in the way the date is handled. The `EN` interface just retrieves the datetime as a string, precise to the minute. The `IT` interface processes the datetime and yields a Python `datetime` format, useful for doing time series analysis with pandas, for example. 188 | 189 | To turn off concurrency, the `CONCURRENT_REQUESTS` parameter is set to `1`; this slows down the crawler but yields a tidier CSV in the end. If you don't care about row order, you can increase the parameter to a higher number and crawling will be faster. 190 | 191 | Reactions are the total number of reactions that the comment gets; a finer subdivision into types of reactions is not implemented.
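As an illustration of the pipeline approach mentioned above, here is a minimal sketch of a deduplication pipeline. It is not part of fbcrawl: the class name `DuplicatesPipeline` and the choice to fingerprint every declared field are assumptions made for the example.

```
from scrapy.exceptions import DropItem

class DuplicatesPipeline(object):
    # Illustrative sketch: drop any item whose fields exactly match an item already seen.

    def __init__(self):
        self.seen = set()

    def process_item(self, item, spider):
        # Build a hashable fingerprint from all declared fields; two comments
        # with identical content are treated as duplicates.
        fingerprint = tuple(str(item.get(field)) for field in sorted(item.fields))
        if fingerprint in self.seen:
            raise DropItem('duplicate comment: %s' % item.get('url'))
        self.seen.add(fingerprint)
        return item
```

Like `FbcrawlPipeline`, such a pipeline would only run after being enabled in `settings.py` through `ITEM_PIPELINES` (e.g. with priority 300).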
192 | 193 | 194 | # TODO 195 | ## Idea Brainstorm 196 | ~~The crawler only works in italian:~~ 197 | * ~~add english interface support~~ 198 | * ~~add spanish interface support~~ 199 | * ~~add french interface support~~ 200 | * ~~add portuguese interface support~~ 201 | 202 | ~~Crawling starts from the beginning of 2017, it needs to go back until 2006:~~ 203 | * ~~write appropriate recursive functions in parse_page~~ 204 | 205 | ~~Retrieve CSV timely ordered:~~ 206 | * ~~Implement synchronous crawling~~ 207 | 208 | ~~Comments and commentators are not parsed:~~ 209 | * ~~write a spider that crawls all the comments from a given post~~ 210 | * ~~scrape total number of reactions from comments~~ 211 | * ~~add features representing connections between commentators (-> reply-to)~~ 212 | 213 | The number of shares is not retrieved, it is not available in `mbasic.facebook.com`. Also the number of comments field only counts direct comments and not reply comments, because that's how mbasic works. To fix both of these issues: 214 | * extract URL of post and use m.facebook.com to retrieve these data 215 | 216 | Some other interesting features can be derived. Comments and commentators can be related better to post and page: 217 | * count comments from same commentator under a post 218 | 219 | Better handling of data: 220 | * merge comments and posts and use JSON instead of CSV 221 | * add postgresql pipeline for simple CSV 222 | * add mongodb pipeline for more complex JSON 223 | 224 | Integrate data visualization: 225 | * display reactions and other features (comments, shares etc.) as timeseries 226 | * add networkx or graph-tools support to display connections (features as links) between posts and people (nodes) 227 | * inegrate gephi or save out to gephi 228 | 229 | The script is not very user friendly: 230 | * create a gui using pyqt 231 | -------------------------------------------------------------------------------- /comments.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rugantio/fbcrawl/bda7d6a7da49a57c8a0863b6679c013f74fdb4c1/comments.png -------------------------------------------------------------------------------- /fbcrawl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rugantio/fbcrawl/bda7d6a7da49a57c8a0863b6679c013f74fdb4c1/fbcrawl/__init__.py -------------------------------------------------------------------------------- /fbcrawl/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rugantio/fbcrawl/bda7d6a7da49a57c8a0863b6679c013f74fdb4c1/fbcrawl/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /fbcrawl/__pycache__/items.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rugantio/fbcrawl/bda7d6a7da49a57c8a0863b6679c013f74fdb4c1/fbcrawl/__pycache__/items.cpython-37.pyc -------------------------------------------------------------------------------- /fbcrawl/__pycache__/pipelines.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rugantio/fbcrawl/bda7d6a7da49a57c8a0863b6679c013f74fdb4c1/fbcrawl/__pycache__/pipelines.cpython-37.pyc -------------------------------------------------------------------------------- 
/fbcrawl/__pycache__/settings.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rugantio/fbcrawl/bda7d6a7da49a57c8a0863b6679c013f74fdb4c1/fbcrawl/__pycache__/settings.cpython-37.pyc -------------------------------------------------------------------------------- /fbcrawl/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | from scrapy.loader.processors import TakeFirst, Join, MapCompose 10 | from datetime import datetime, timedelta 11 | 12 | def comments_strip(string,loader_context): 13 | lang = loader_context['lang'] 14 | if lang == 'it': 15 | if string[0].rfind('Commenta') != -1: 16 | return 17 | else: 18 | return string[0].rstrip(' commenti') 19 | 20 | elif lang == 'en': 21 | if(string[0] == 'Share'): 22 | return '0' 23 | new_string = string[0].rstrip(' Comments') 24 | while new_string.rfind(',') != -1: 25 | new_string = new_string[0:new_string.rfind(',')] + new_string[new_string.rfind(',')+1:] 26 | return new_string 27 | else: 28 | return string 29 | 30 | def reactions_strip(string,loader_context): 31 | lang = loader_context['lang'] 32 | if lang == 'it': 33 | newstring = string[0] 34 | #19.298.873 35 | if len(newstring.split()) == 1: 36 | while newstring.rfind('.') != -1: 37 | newstring = newstring[0:newstring.rfind('.')] + newstring[newstring.rfind('.')+1:] 38 | return newstring 39 | #Pamela, Luigi e altri 4 40 | else: 41 | return string 42 | friends = newstring.count(' e ') + newstring.count(',') 43 | newstring = newstring.split()[::-1][0] 44 | while newstring.rfind('.') != -1: 45 | newstring = newstring[0:newstring.rfind('.')] + newstring[newstring.rfind('.')+1:] 46 | return int(newstring) + friends 47 | elif lang == 'en': 48 | newstring = string[0] 49 | #19,298,873 50 | if len(newstring.split()) == 1: 51 | while newstring.rfind(',') != -1: 52 | newstring = newstring[0:newstring.rfind(',')] + newstring[newstring.rfind(',')+1:] 53 | return newstring 54 | #Mark and other 254,134 55 | elif newstring.split()[::-1][1].isdigit(): 56 | friends = newstring.count(' and ') + newstring.count(',') 57 | newstring = newstring.split()[::-1][1] 58 | while newstring.rfind(',') != -1: 59 | newstring = newstring[0:newstring.rfind(',')] + newstring[newstring.rfind(',')+1:] 60 | return int(newstring) + friends 61 | #Philip and 1K others 62 | else: 63 | return newstring 64 | else: 65 | return string 66 | 67 | def url_strip(url): 68 | fullurl = url[0] 69 | #catchin '&id=' is enough to identify the post 70 | i = fullurl.find('&id=') 71 | if i != -1: 72 | return fullurl[:i+4] + fullurl[i+4:].split('&')[0] 73 | else: #catch photos 74 | i = fullurl.find('/photos/') 75 | if i != -1: 76 | return fullurl[:i+8] + fullurl[i+8:].split('/?')[0] 77 | else: #catch albums 78 | i = fullurl.find('/albums/') 79 | if i != -1: 80 | return fullurl[:i+8] + fullurl[i+8:].split('/?')[0] 81 | else: 82 | return fullurl 83 | 84 | def parse_date(date,loader_context): 85 | import json 86 | 87 | d = json.loads(date[0]) #nested dict of features 88 | flat_d = dict() #only retain 'leaves' of d tree 89 | 90 | def recursive_items(dictionary): 91 | ''' 92 | Get most nested key:value pair of nested dict 93 | ''' 94 | for key, value in dictionary.items(): 95 | if type(value) is dict: 96 | yield from recursive_items(value) 97 | 
else: 98 | yield (key, value) 99 | 100 | for key, value in recursive_items(d): 101 | flat_d[key] = value 102 | 103 | #returns timestamp in localtime conversion from linux timestamp UTC 104 | ret = str(datetime.fromtimestamp(flat_d['publish_time'])) if 'publish_time' in flat_d else None 105 | return ret 106 | 107 | def parse_date2(init_date,loader_context): 108 | lang = loader_context['lang'] 109 | # ============================================================================= 110 | # Italian - status:final 111 | # ============================================================================= 112 | if lang == 'it': 113 | months = { 114 | 'gennaio':1, 115 | 'febbraio':2, 116 | 'marzo':3, 117 | 'aprile':4, 118 | 'maggio':5, 119 | 'giugno':6, 120 | 'luglio':7, 121 | 'agosto':8, 122 | 'settembre':9, 123 | 'ottobre':10, 124 | 'novembre':11, 125 | 'dicembre':12 126 | } 127 | 128 | months_abbr = { 129 | 'gen':1, 130 | 'feb':2, 131 | 'mar':3, 132 | 'apr':4, 133 | 'mag':5, 134 | 'giu':6, 135 | 'lug':7, 136 | 'ago':8, 137 | 'set':9, 138 | 'ott':10, 139 | 'nov':11, 140 | 'dic':12 141 | } 142 | 143 | giorni = { 144 | 'lunedì':0, 145 | 'martedì':1, 146 | 'mercoledì':2, 147 | 'giovedì':3, 148 | 'venerdì':4, 149 | 'sabato':5, 150 | 'domenica':6 151 | } 152 | 153 | date = init_date[0].split() 154 | year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today 155 | 156 | l = len(date) 157 | 158 | #sanity check 159 | if l == 0: 160 | return 'Error: no data' 161 | 162 | #adesso, ieri, 4h, 50min 163 | elif l == 1: 164 | if date[0].isalpha(): 165 | if date[0].lower() == 'ieri': 166 | day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) 167 | #check that yesterday was not in another month 168 | month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) 169 | elif date[0].lower() == 'adesso': 170 | return datetime(year,month,day).date() #return today 171 | else: #not recognized, (return date or init_date) 172 | return date 173 | else: 174 | #4h, 50min (exploit future parsing) 175 | l = 2 176 | new_date = [x for x in date[0] if x.isdigit()] 177 | date[0] = ''.join(new_date) 178 | new_date = [x for x in date[0] if not(x.isdigit())] 179 | date[1] = ''.join(new_date) 180 | # l = 2 181 | elif l == 2: 182 | #22 min (oggi) 183 | if date[1] == 'min': 184 | if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) >= 0: 185 | return datetime(year,month,day).date() 186 | #22 min (ieri) 187 | else: 188 | day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) 189 | month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) 190 | return datetime(year,month,day).date() 191 | #4 h (oggi) 192 | elif date[1] == 'h': 193 | if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0: 194 | return datetime(year,month,day).date() 195 | #4 h (ieri) 196 | else: 197 | day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) 198 | month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) 199 | return datetime(year,month,day).date() 200 | #2 gen 201 | elif len(date[1]) == 3 and date[1].isalpha(): 202 | day = int(date[0]) 203 | month = months_abbr[date[1].lower()] 204 | return datetime(year,month,day).date() 205 | #2 gennaio 206 | elif len(date[1]) > 3 and date[1].isalpha(): 207 | day = int(date[0]) 208 | month = months[date[1]] 209 | return datetime(year,month,day).date() 210 | #parsing failed 211 | else: 212 | return date 213 | # l = 3 214 | elif l == 3: 215 | #21 giu 2017 216 | if len(date[1]) == 3 and 
date[2].isdigit(): 217 | day = int(date[0]) 218 | month = months_abbr[date[1]] 219 | year = int(date[2]) 220 | return datetime(year,month,day).date() 221 | #21 giugno 2017 222 | elif len(date[1]) > 3 and date[2].isdigit(): 223 | day = int(date[0]) 224 | month = months[date[1]] 225 | year = int(date[2]) 226 | return datetime(year,month,day).date() 227 | #9 ore fa 228 | elif date[0].isdigit() and date[1][:2] == 'or': 229 | if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) >= 0: 230 | return datetime(year,month,day).date() 231 | #9 ore fa (ieri) 232 | else: 233 | day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) 234 | month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) 235 | return datetime(year,month,day).date() 236 | #7 minuti fa 237 | elif date[0].isdigit() and date[1][:3] == 'min': 238 | return datetime(year,month,day).date() 239 | 240 | #ieri alle 20:45 241 | elif date[0].lower() == 'ieri' and date[1] == 'alle': 242 | day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) 243 | month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) 244 | return datetime(year,month,day).date() 245 | #oggi alle 11:11 246 | elif date[0].lower() == 'oggi' and date[1] == 'alle': 247 | return datetime(year,month,day).date() 248 | #lunedì alle 12:34 249 | elif date[0].isalpha() and date[1] == 'alle': 250 | today = datetime.now().weekday() #today as a weekday 251 | weekday = giorni[date[0].lower()] #day to be match as number weekday 252 | #weekday is chronologically always lower than day 253 | delta = today - weekday 254 | if delta >= 0: 255 | day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2]) 256 | month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1]) 257 | return datetime(year,month,day).date() 258 | #lunedì = 0 sabato = 6, mar 1 ven 5 259 | else: 260 | delta += 8 261 | day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2]) 262 | month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1]) 263 | return datetime(year,month,day).date() 264 | #parsing failed 265 | else: 266 | return date 267 | # l = 4 268 | elif l == 4: 269 | #Ieri alle ore 23:32 270 | if date[0].lower() == 'ieri' and date[1] == 'alle': 271 | day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) 272 | month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) 273 | return datetime(year,month,day).date() 274 | #domenica alle ore 19:29 275 | elif date[0].isalpha() and date[1] == 'alle': 276 | today = datetime.now().weekday() #today as a weekday 277 | weekday = giorni[date[0].lower()] #day to be match as number weekday 278 | #weekday is chronologically always lower than day 279 | delta = today - weekday 280 | if delta >= 0: 281 | day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2]) 282 | month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1]) 283 | return datetime(year,month,day).date() 284 | #lunedì = 0 sabato = 6, mar 1 ven 5 285 | else: 286 | delta += 8 287 | day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2]) 288 | month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1]) 289 | return datetime(year,month,day).date() 290 | #parsing failed 291 | else: 292 | return date 293 | # l = 5 294 | elif l == 5: 295 | if date[2] == 'alle': 296 | #29 feb alle ore 21:49 297 | if len(date[1]) == 3: 298 | day = int(date[0]) 299 | month = months_abbr[date[1].lower()] 300 | return 
datetime(year,month,day).date() 301 | #29 febbraio alle ore 21:49 302 | else: 303 | day = int(date[0]) 304 | month = months[date[1].lower()] 305 | return datetime(year,month,day).date() 306 | #parsing failed 307 | else: 308 | return date 309 | # l = 6 310 | elif l == 6: 311 | if date[3] == 'alle': 312 | #29 feb 2016 alle ore 21:49 313 | if len(date[1]) == 3: 314 | day = int(date[0]) 315 | month = months_abbr[date[1].lower()] 316 | year = int(date[2]) 317 | return datetime(year,month,day).date() 318 | #29 febbraio 2016 alle ore 21:49 319 | else: 320 | day = int(date[0]) 321 | month = months[date[1].lower()] 322 | year = int(date[2]) 323 | return datetime(year,month,day).date() 324 | #parsing failed 325 | else: 326 | return date 327 | # ============================================================================= 328 | # English - status:beta 329 | # ============================================================================= 330 | elif lang == 'en': 331 | months = { 332 | 'january':1, 333 | 'february':2, 334 | 'march':3, 335 | 'april':4, 336 | 'may':5, 337 | 'june':6, 338 | 'july':7, 339 | 'august':8, 340 | 'september':9, 341 | 'october':10, 342 | 'november':11, 343 | 'december':12 344 | } 345 | 346 | months_abbr = { 347 | 'jan':1, 348 | 'feb':2, 349 | 'mar':3, 350 | 'apr':4, 351 | 'may':5, 352 | 'jun':6, 353 | 'jul':7, 354 | 'aug':8, 355 | 'sep':9, 356 | 'oct':10, 357 | 'nov':11, 358 | 'dec':12 359 | } 360 | 361 | days = { 362 | 'monday':0, 363 | 'tuesday':1, 364 | 'wednesday':2, 365 | 'thursday':3, 366 | 'friday':4, 367 | 'saturday':5, 368 | 'sunday':6 369 | } 370 | 371 | date = init_date[0].split() 372 | year, month, day = [int(i) for i in str(datetime.now().date()).split(sep='-')] #default is today 373 | 374 | l = len(date) 375 | 376 | #sanity check 377 | if l == 0: 378 | return 'Error: no data' 379 | 380 | #Yesterday, Now, 4hr, 50mins 381 | elif l == 1: 382 | if date[0].isalpha(): 383 | if date[0].lower() == 'yesterday': 384 | day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) 385 | #check that yesterday was not in another month 386 | month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) 387 | elif date[0].lower() == 'now': 388 | return datetime(year,month,day).date() #return today 389 | else: #not recognized, (return date or init_date) 390 | return date 391 | else: 392 | #4h, 50min (exploit future parsing) 393 | l = 2 394 | new_date = [x for x in date[0] if x.isdigit()] 395 | date[0] = ''.join(new_date) 396 | new_date = [x for x in date[0] if not(x.isdigit())] 397 | date[1] = ''.join(new_date) 398 | # l = 2 399 | elif l == 2: 400 | if date[1] == 'now': 401 | return datetime(year,month,day).date() 402 | #22 min (ieri) 403 | if date[1] == 'min' or date[1] == 'mins': 404 | if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) < 0 and int(str(datetime.now().time()).split(sep=':')[0])==0: 405 | day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) 406 | month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) 407 | return datetime(year,month,day).date() 408 | #22 min (oggi) 409 | else: 410 | return datetime(year,month,day).date() 411 | 412 | #4 h (ieri) 413 | elif date[1] == 'hr' or date[1] == 'hrs': 414 | if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) < 0: 415 | day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) 416 | month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) 417 | return datetime(year,month,day).date() 418 | #4 h (oggi) 419 | else: 420 | 
return datetime(year,month,day).date() 421 | 422 | #2 jan 423 | elif len(date[1]) == 3 and date[1].isalpha(): 424 | day = int(date[0]) 425 | month = months_abbr[date[1].lower()] 426 | return datetime(year,month,day).date() 427 | #2 january 428 | elif len(date[1]) > 3 and date[1].isalpha(): 429 | day = int(date[0]) 430 | month = months[date[1]] 431 | return datetime(year,month,day).date() 432 | #jan 2 433 | elif len(date[0]) == 3 and date[0].isalpha(): 434 | day = int(date[1]) 435 | month = months_abbr[date[0].lower()] 436 | return datetime(year,month,day).date() 437 | #january 2 438 | elif len(date[0]) > 3 and date[0].isalpha(): 439 | day = int(date[1]) 440 | month = months[date[0]] 441 | return datetime(year,month,day).date() 442 | #parsing failed 443 | else: 444 | return date 445 | return date 446 | # l = 3 447 | elif l == 3: 448 | #5 hours ago 449 | if date[2] == 'ago': 450 | if date[1] == 'hour' or date[1] == 'hours' or date[1] == 'hr' or date[1] == 'hrs': 451 | # 5 hours ago (yesterday) 452 | if int(str(datetime.now().time()).split(sep=':')[0]) - int(date[0]) < 0: 453 | day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) 454 | month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) 455 | return datetime(year,month,day).date() 456 | # 5 hours ago (today) 457 | else: 458 | return datetime(year,month,day).date() 459 | #10 minutes ago 460 | elif date[1] == 'minute' or date[1] == 'minutes' or date[1] == 'min' or date[1] == 'mins': 461 | #22 minutes ago (yesterday) 462 | if int(str(datetime.now().time()).split(sep=':')[1]) - int(date[0]) < 0 and int(str(datetime.now().time()).split(sep=':')[0])==0: 463 | day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) 464 | month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) 465 | return datetime(year,month,day).date() 466 | #22 minutes ago (today) 467 | else: 468 | return datetime(year,month,day).date() 469 | else: 470 | return date 471 | else: 472 | #21 Jun 2017 473 | if len(date[1]) == 3 and date[1].isalpha() and date[2].isdigit(): 474 | day = int(date[0]) 475 | month = months_abbr[date[1].lower()] 476 | year = int(date[2]) 477 | return datetime(year,month,day).date() 478 | #21 June 2017 479 | elif len(date[1]) > 3 and date[1].isalpha() and date[2].isdigit(): 480 | day = int(date[0]) 481 | month = months[date[1].lower()] 482 | year = int(date[2]) 483 | return datetime(year,month,day).date() 484 | #Jul 11, 2016 485 | elif len(date[0]) == 3 and len(date[1]) == 3 and date[0].isalpha(): 486 | day = int(date[1][:-1]) 487 | month = months_abbr[date[0].lower()] 488 | year = int(date[2]) 489 | return datetime(year,month,day).date() 490 | #parsing failed 491 | else: 492 | return date 493 | # l = 4 494 | elif l == 4: 495 | #yesterday at 23:32 PM 496 | if date[0].lower() == 'yesterday' and date[1] == 'at': 497 | day = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[2]) 498 | month = int(str(datetime.now().date()-timedelta(1)).split(sep='-')[1]) 499 | return datetime(year,month,day).date() 500 | #Thursday at 4:27 PM 501 | elif date[1] == 'at': 502 | today = datetime.now().weekday() #today as a weekday 503 | weekday = days[date[0].lower()] #day to be match as number weekday 504 | #weekday is chronologically always lower than day 505 | delta = today - weekday 506 | if delta >= 0: 507 | day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2]) 508 | month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1]) 509 | return datetime(year,month,day).date() 510 | 
#monday = 0 saturday = 6 511 | else: 512 | delta += 8 513 | day = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[2]) 514 | month = int(str(datetime.now().date()-timedelta(delta)).split(sep='-')[1]) 515 | return datetime(year,month,day).date() 516 | #parsing failed 517 | else: 518 | return date 519 | # l = 5 520 | elif l == 5: 521 | if date[2] == 'at': 522 | #Jan 29 at 10:00 PM 523 | if len(date[0]) == 3: 524 | day = int(date[1]) 525 | month = months_abbr[date[0].lower()] 526 | return datetime(year,month,day).date() 527 | #29 febbraio alle ore 21:49 528 | else: 529 | day = int(date[1]) 530 | month = months[date[0].lower()] 531 | return datetime(year,month,day).date() 532 | #parsing failed 533 | else: 534 | return date 535 | # l = 6 536 | elif l == 6: 537 | if date[3] == 'at': 538 | date[1] 539 | #Aug 25, 2016 at 7:00 PM 540 | if len(date[0]) == 3: 541 | day = int(date[1][:-1]) 542 | month = months_abbr[date[0].lower()] 543 | year = int(date[2]) 544 | return datetime(year,month,day).date() 545 | #August 25, 2016 at 7:00 PM 546 | else: 547 | day = int(date[1][:-1]) 548 | month = months[date[0].lower()] 549 | year = int(date[2]) 550 | return datetime(year,month,day).date() 551 | #parsing failed 552 | else: 553 | return date 554 | # l > 6 555 | #parsing failed - l too big 556 | else: 557 | return date 558 | #parsing failed - language not supported 559 | else: 560 | return init_date 561 | 562 | def id_strip(post_id): 563 | import json 564 | d = json.loads(post_id[::-1][0]) #nested dict of features 565 | return str(d['top_level_post_id']) 566 | 567 | 568 | class FbcrawlItem(scrapy.Item): 569 | source = scrapy.Field() 570 | date = scrapy.Field() 571 | text = scrapy.Field( 572 | output_processor=Join(separator=u'') 573 | ) # full text of the post 574 | comments = scrapy.Field( 575 | output_processor=comments_strip 576 | ) 577 | reactions = scrapy.Field( 578 | output_processor=reactions_strip 579 | ) # num of reactions 580 | likes = scrapy.Field( 581 | output_processor=reactions_strip 582 | ) 583 | ahah = scrapy.Field( 584 | output_processor=reactions_strip 585 | ) 586 | love = scrapy.Field( 587 | output_processor=reactions_strip 588 | ) 589 | wow = scrapy.Field( 590 | output_processor=reactions_strip 591 | ) 592 | sigh = scrapy.Field( 593 | output_processor=reactions_strip 594 | ) 595 | grrr = scrapy.Field( 596 | output_processor=reactions_strip 597 | ) 598 | share = scrapy.Field() # num of shares 599 | url = scrapy.Field( 600 | output_processor=url_strip 601 | ) 602 | post_id = scrapy.Field( 603 | output_processor=id_strip 604 | ) 605 | shared_from = scrapy.Field() 606 | 607 | class CommentsItem(scrapy.Item): 608 | source = scrapy.Field() 609 | reply_to=scrapy.Field() 610 | date = scrapy.Field( # when was the post published 611 | output_processor=parse_date2 612 | ) 613 | text = scrapy.Field( 614 | output_processor=Join(separator=u'') 615 | ) # full text of the post 616 | reactions = scrapy.Field( 617 | output_processor=reactions_strip 618 | ) # num of reactions 619 | likes = scrapy.Field( 620 | output_processor=reactions_strip 621 | ) 622 | source_url = scrapy.Field() 623 | url = scrapy.Field() 624 | ahah = scrapy.Field() 625 | love = scrapy.Field() 626 | wow = scrapy.Field() 627 | sigh = scrapy.Field() 628 | grrr = scrapy.Field() 629 | share = scrapy.Field() # num of shares 630 | 631 | class ProfileItem(scrapy.Item): 632 | name = scrapy.Field() 633 | gender = scrapy.Field() 634 | birthday = scrapy.Field() 635 | current_city = scrapy.Field() 636 | hometown = scrapy.Field() 637 | 
work = scrapy.Field() 638 | education = scrapy.Field() 639 | interested_in = scrapy.Field() 640 | page = scrapy.Field() 641 | 642 | class EventsItem(scrapy.Item): 643 | name = scrapy.Field() 644 | location = scrapy.Field() 645 | where = scrapy.Field() 646 | photo = scrapy.Field() 647 | start_date = scrapy.Field() 648 | end_date = scrapy.Field() 649 | description = scrapy.Field() 650 | -------------------------------------------------------------------------------- /fbcrawl/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class FbcrawlSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class FbcrawlDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 
85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /fbcrawl/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | from scrapy.exceptions import DropItem 9 | from datetime import datetime 10 | 11 | class FbcrawlPipeline(object): 12 | pass 13 | # def process_item(self, item, spider): 14 | # if item['date'] < datetime(2017,1,1).date(): 15 | # raise DropItem("Dropping element because it's older than 01/01/2017") 16 | # elif item['date'] > datetime(2018,3,4).date(): 17 | # raise DropItem("Dropping element because it's newer than 04/03/2018") 18 | # else: 19 | # return item 20 | -------------------------------------------------------------------------------- /fbcrawl/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for fbcrawl project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'fbcrawl' 13 | 14 | SPIDER_MODULES = ['fbcrawl.spiders'] 15 | NEWSPIDER_MODULE = 'fbcrawl.spiders' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36' 19 | # Obey robots.txt rules 20 | ROBOTSTXT_OBEY = False 21 | 22 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 23 | CONCURRENT_REQUESTS = 16 24 | 25 | # Configure a delay for requests for the same website (default: 0) 26 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 27 | # See also autothrottle settings and docs 28 | DOWNLOAD_DELAY = 3 29 | 30 | # The download delay setting will honor only one of: 31 | #CONCURRENT_REQUESTS_PER_DOMAIN = 1 32 | #CONCURRENT_REQUESTS_PER_IP = 16 33 | 34 | # Disable cookies (enabled by default) 35 | #COOKIES_ENABLED = False 36 | 37 | # Disable Telnet Console (enabled by default) 38 | #TELNETCONSOLE_ENABLED = False 39 | 40 | # Override the default request headers: 41 | #DEFAULT_REQUEST_HEADERS = { 42 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 43 | # 'Accept-Language': 'en', 44 | #} 45 | 46 | # Enable or disable spider middlewares 47 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 48 | #SPIDER_MIDDLEWARES = { 49 | # 'fbcrawl.middlewares.FbcrawlSpiderMiddleware': 543, 50 | #} 51 | 52 | # Enable or disable downloader middlewares 53 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 54 | #DOWNLOADER_MIDDLEWARES = { 55 | # 'fbcrawl.middlewares.FbcrawlDownloaderMiddleware': 543, 56 | #} 57 | 58 | # Enable or disable extensions 59 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 60 | #EXTENSIONS = { 61 | # 'scrapy.extensions.telnet.TelnetConsole': None, 62 | #} 63 | 64 | # Configure item pipelines 65 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 66 | #ITEM_PIPELINES = { 67 | # 'fbcrawl.pipelines.FbcrawlPipeline': 300, 68 | #} 69 | 70 | # Enable and configure the AutoThrottle extension (disabled by default) 71 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 72 | #AUTOTHROTTLE_ENABLED = True 73 | # The initial download delay 74 | #AUTOTHROTTLE_START_DELAY = 5 75 | # The maximum download delay to be set in case of high latencies 76 | #AUTOTHROTTLE_MAX_DELAY = 60 77 | # The average number of requests Scrapy should be sending in parallel to 78 | # each remote server 79 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 80 | # Enable showing throttling stats for every response received: 81 | #AUTOTHROTTLE_DEBUG = False 82 | 83 | # Enable and configure HTTP caching (disabled by default) 84 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 85 | #HTTPCACHE_ENABLED = True 86 | #HTTPCACHE_EXPIRATION_SECS = 0 87 | #HTTPCACHE_DIR = 'httpcache' 88 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 89 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 90 | #FEED_EXPORT_FIELDS = ["source", "date", "text", "reactions","likes","ahah","love","wow","sigh","grrr","comments","url"] # specifies the order of the column to export as CSV 91 | 
URLLENGTH_LIMIT = 99999 92 | FEED_EXPORT_ENCODING = 'utf-8' 93 | DUPEFILTER_DEBUG = True 94 | LOG_LEVEL = 'INFO' 95 | #LOG_LEVEL = 'DEBUG' 96 | -------------------------------------------------------------------------------- /fbcrawl/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /fbcrawl/spiders/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rugantio/fbcrawl/bda7d6a7da49a57c8a0863b6679c013f74fdb4c1/fbcrawl/spiders/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /fbcrawl/spiders/__pycache__/comments.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rugantio/fbcrawl/bda7d6a7da49a57c8a0863b6679c013f74fdb4c1/fbcrawl/spiders/__pycache__/comments.cpython-37.pyc -------------------------------------------------------------------------------- /fbcrawl/spiders/__pycache__/fbcrawl.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rugantio/fbcrawl/bda7d6a7da49a57c8a0863b6679c013f74fdb4c1/fbcrawl/spiders/__pycache__/fbcrawl.cpython-37.pyc -------------------------------------------------------------------------------- /fbcrawl/spiders/comments.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | 3 | from scrapy.loader import ItemLoader 4 | from scrapy.exceptions import CloseSpider 5 | from fbcrawl.spiders.fbcrawl import FacebookSpider 6 | from fbcrawl.items import CommentsItem, parse_date, parse_date2 7 | 8 | from datetime import datetime 9 | 10 | class CommentsSpider(FacebookSpider): 11 | """ 12 | Parse FB comments, given a post (needs credentials) 13 | """ 14 | name = "comments" 15 | custom_settings = { 16 | 'FEED_EXPORT_FIELDS': ['source','reply_to','date','reactions','text', \ 17 | 'source_url','url'], 18 | 'DUPEFILTER_CLASS' : 'scrapy.dupefilters.BaseDupeFilter', 19 | 'CONCURRENT_REQUESTS' : 1 20 | } 21 | 22 | def __init__(self, *args, **kwargs): 23 | if 'post' in kwargs and 'page' in kwargs: 24 | raise AttributeError('You need to specifiy only one between post and page') 25 | elif 'post' in kwargs: 26 | self.page = kwargs['post'] 27 | self.type = 'post' 28 | elif 'page' in kwargs: 29 | self.type = 'page' 30 | 31 | super().__init__(*args,**kwargs) 32 | 33 | def parse_page(self, response): 34 | ''' 35 | ''' 36 | if self.type == 'post': 37 | yield scrapy.Request(url=response.url, 38 | callback=self.parse_post, 39 | priority=10, 40 | meta={'index':1}) 41 | elif self.type == 'page': 42 | #select all posts 43 | for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"): 44 | many_features = post.xpath('./@data-ft').get() 45 | date = [] 46 | date.append(many_features) 47 | date = parse_date(date,{'lang':self.lang}) 48 | current_date = datetime.strptime(date,'%Y-%m-%d %H:%M:%S') if date is not None else date 49 | 50 | if current_date is None: 51 | date_string = post.xpath('.//abbr/text()').get() 52 | date = parse_date2([date_string],{'lang':self.lang}) 53 | current_date = 
datetime(date.year,date.month,date.day) if date is not None else date 54 | date = str(date) 55 | 56 | if abs(self.count) + 1 > self.max: 57 | raise CloseSpider('Reached max num of post: {}. Crawling finished'.format(abs(self.count))) 58 | self.logger.info('Parsing post n = {}, post_date = {}'.format(abs(self.count)+1,date)) 59 | 60 | #returns full post-link in a list 61 | post = post.xpath(".//a[contains(@href,'footer')]/@href").extract() 62 | temp_post = response.urljoin(post[0]) 63 | self.count -= 1 64 | yield scrapy.Request(temp_post, 65 | self.parse_post, 66 | priority = self.count, 67 | meta={'index':1}) 68 | 69 | #load following page, try to click on "more" 70 | #after few pages have been scraped, the "more" link might disappears 71 | #if not present look for the highest year not parsed yet 72 | #click once on the year and go back to clicking "more" 73 | 74 | #new_page is different for groups 75 | if self.group == 1: 76 | new_page = response.xpath("//div[contains(@id,'stories_container')]/div[2]/a/@href").extract() 77 | else: 78 | new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract() 79 | #this is why lang is needed 80 | 81 | if not new_page: 82 | self.logger.info('[!] "more" link not found, will look for a "year" link') 83 | #self.k is the year link that we look for 84 | if response.meta['flag'] == self.k and self.k >= self.year: 85 | xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href" 86 | new_page = response.xpath(xpath).extract() 87 | if new_page: 88 | new_page = response.urljoin(new_page[0]) 89 | self.k -= 1 90 | self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page)) 91 | yield scrapy.Request(new_page, 92 | callback=self.parse_page, 93 | priority = -1000, 94 | meta={'flag':self.k}) 95 | else: 96 | while not new_page: #sometimes the years are skipped this handles small year gaps 97 | self.logger.info('Link not found for year {}, trying with previous year {}'.format(self.k,self.k-1)) 98 | self.k -= 1 99 | if self.k < self.year: 100 | raise CloseSpider('Reached date: {}. Crawling finished'.format(self.date)) 101 | xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href" 102 | new_page = response.xpath(xpath).extract() 103 | self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page)) 104 | new_page = response.urljoin(new_page[0]) 105 | self.k -= 1 106 | yield scrapy.Request(new_page, 107 | callback=self.parse_page, 108 | priority = -1000, 109 | meta={'flag':self.k}) 110 | else: 111 | self.logger.info('Crawling has finished with no errors!') 112 | else: 113 | new_page = response.urljoin(new_page[0]) 114 | if 'flag' in response.meta: 115 | self.logger.info('Page scraped, clicking on "more"! new_page = {}'.format(new_page)) 116 | yield scrapy.Request(new_page, 117 | callback=self.parse_page, 118 | priority = -1000, 119 | meta={'flag':response.meta['flag']}) 120 | else: 121 | self.logger.info('First page scraped, clicking on "more"! 
new_page = {}'.format(new_page)) 122 | yield scrapy.Request(new_page, 123 | callback=self.parse_page, 124 | priority = -1000, 125 | meta={'flag':self.k}) 126 | 127 | def parse_post(self, response): 128 | ''' 129 | parse post does multiple things: 130 | 1) loads replied-to-comments page one-by-one (for DFS) 131 | 2) call parse_reply on the nested comments 132 | 3) adds simple (not-replied-to) comments 133 | 4) follows to new comment page 134 | ''' 135 | #load replied-to comments pages 136 | #select nested comment one-by-one matching with the index: response.meta['index'] 137 | path = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]' + '['+ str(response.meta['index']) + ']' 138 | group_flag = response.meta['group'] if 'group' in response.meta else None 139 | 140 | for reply in response.xpath(path): 141 | source = reply.xpath('.//h3/a/text()').extract() 142 | answer = reply.xpath('.//a[contains(@href,"repl")]/@href').extract() 143 | ans = response.urljoin(answer[::-1][0]) 144 | self.logger.info('{} nested comment'.format(str(response.meta['index']))) 145 | yield scrapy.Request(ans, 146 | callback=self.parse_reply, 147 | priority=1000, 148 | meta={'reply_to':source, 149 | 'url':response.url, 150 | 'index':response.meta['index'], 151 | 'flag':'init', 152 | 'group':group_flag}) 153 | #load regular comments 154 | if not response.xpath(path): #prevents from exec 155 | path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]' 156 | for i,reply in enumerate(response.xpath(path2)): 157 | self.logger.info('{} regular comment'.format(i+1)) 158 | new = ItemLoader(item=CommentsItem(),selector=reply) 159 | new.context['lang'] = self.lang 160 | new.add_xpath('source','.//h3/a/text()') 161 | new.add_xpath('source_url','.//h3/a/@href') 162 | new.add_xpath('text','.//div[h3]/div[1]//text()') 163 | new.add_xpath('date','.//abbr/text()') 164 | new.add_xpath('reactions','.//a[contains(@href,"reaction/profile")]//text()') 165 | new.add_value('url',response.url) 166 | yield new.load_item() 167 | 168 | #new comment page 169 | if not response.xpath(path): 170 | #for groups 171 | next_xpath = './/div[contains(@id,"see_next")]' 172 | prev_xpath = './/div[contains(@id,"see_prev")]' 173 | if not response.xpath(next_xpath) or group_flag == 1: 174 | for next_page in response.xpath(prev_xpath): 175 | new_page = next_page.xpath('.//@href').extract() 176 | new_page = response.urljoin(new_page[0]) 177 | self.logger.info('New page to be crawled {}'.format(new_page)) 178 | yield scrapy.Request(new_page, 179 | callback=self.parse_post, 180 | meta={'index':1, 181 | 'group':1}) 182 | else: 183 | for next_page in response.xpath(next_xpath): 184 | new_page = next_page.xpath('.//@href').extract() 185 | new_page = response.urljoin(new_page[0]) 186 | self.logger.info('New page to be crawled {}'.format(new_page)) 187 | yield scrapy.Request(new_page, 188 | callback=self.parse_post, 189 | meta={'index':1, 190 | 'group':group_flag}) 191 | 192 | def parse_reply(self,response): 193 | ''' 194 | parse reply to comments, root comment is added if flag 195 | ''' 196 | # from scrapy.utils.response import open_in_browser 197 | # open_in_browser(response) 198 | 199 | if response.meta['flag'] == 'init': 200 | #parse root comment 201 | for root in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)!=1 and contains("0123456789", 
substring(@id,1,1))]'): 202 | new = ItemLoader(item=CommentsItem(),selector=root) 203 | new.context['lang'] = self.lang 204 | new.add_xpath('source','.//h3/a/text()') 205 | new.add_xpath('source_url','.//h3/a/@href') 206 | new.add_value('reply_to','ROOT') 207 | new.add_xpath('text','.//div[1]//text()') 208 | new.add_xpath('date','.//abbr/text()') 209 | new.add_xpath('reactions','.//a[contains(@href,"reaction/profile")]//text()') 210 | new.add_value('url',response.url) 211 | yield new.load_item() 212 | #parse all replies in the page 213 | for reply in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'): 214 | new = ItemLoader(item=CommentsItem(),selector=reply) 215 | new.context['lang'] = self.lang 216 | new.add_xpath('source','.//h3/a/text()') 217 | new.add_xpath('source_url','.//h3/a/@href') 218 | new.add_value('reply_to',response.meta['reply_to']) 219 | new.add_xpath('text','.//div[h3]/div[1]//text()') 220 | new.add_xpath('date','.//abbr/text()') 221 | new.add_xpath('reactions','.//a[contains(@href,"reaction/profile")]//text()') 222 | new.add_value('url',response.url) 223 | yield new.load_item() 224 | 225 | back = response.xpath('//div[contains(@id,"comment_replies_more_1")]/a/@href').extract() 226 | if back: 227 | self.logger.info('Back found, more nested comments') 228 | back_page = response.urljoin(back[0]) 229 | yield scrapy.Request(back_page, 230 | callback=self.parse_reply, 231 | priority = 1000, 232 | meta={'reply_to':response.meta['reply_to'], 233 | 'flag':'back', 234 | 'url':response.meta['url'], 235 | 'index':response.meta['index'], 236 | 'group':response.meta['group']}) 237 | 238 | else: 239 | next_reply = response.meta['url'] 240 | self.logger.info('Nested comments crawl finished, heading to proper page: {}'.format(response.meta['url'])) 241 | yield scrapy.Request(next_reply, 242 | callback=self.parse_post, 243 | meta={'index':response.meta['index']+1, 244 | 'group':response.meta['group']}) 245 | 246 | elif response.meta['flag'] == 'back': 247 | #parse all comments 248 | for reply in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'): 249 | new = ItemLoader(item=CommentsItem(),selector=reply) 250 | new.context['lang'] = self.lang 251 | new.add_xpath('source','.//h3/a/text()') 252 | new.add_xpath('source_url','.//h3/a/@href') 253 | new.add_value('reply_to',response.meta['reply_to']) 254 | new.add_xpath('text','.//div[h3]/div[1]//text()') 255 | new.add_xpath('date','.//abbr/text()') 256 | new.add_xpath('reactions','.//a[contains(@href,"reaction/profile")]//text()') 257 | new.add_value('url',response.url) 258 | yield new.load_item() 259 | #keep going backwards 260 | back = response.xpath('//div[contains(@id,"comment_replies_more_1")]/a/@href').extract() 261 | self.logger.info('Back found, more nested comments') 262 | if back: 263 | back_page = response.urljoin(back[0]) 264 | yield scrapy.Request(back_page, 265 | callback=self.parse_reply, 266 | priority=1000, 267 | meta={'reply_to':response.meta['reply_to'], 268 | 'flag':'back', 269 | 'url':response.meta['url'], 270 | 'index':response.meta['index'], 271 | 'group':response.meta['group']}) 272 | 273 | else: 274 | next_reply = response.meta['url'] 275 | self.logger.info('Nested comments crawl finished, heading to home page: {}'.format(response.meta['url'])) 276 | yield scrapy.Request(next_reply, 277 | callback=self.parse_post, 278 | meta={'index':response.meta['index']+1, 279 | 
'group':response.meta['group']}) 280 | 281 | # ============================================================================= 282 | # CRAWL REACTIONS 283 | # ============================================================================= 284 | # def parse_reactions(self,response): 285 | # new = ItemLoader(item=CommentsItem(),response=response, parent=response.meta['item']) 286 | # new.context['lang'] = self.lang 287 | # new.add_xpath('likes',"//a[contains(@href,'reaction_type=1')]/span/text()") 288 | # new.add_xpath('ahah',"//a[contains(@href,'reaction_type=4')]/span/text()") 289 | # new.add_xpath('love',"//a[contains(@href,'reaction_type=2')]/span/text()") 290 | # new.add_xpath('wow',"//a[contains(@href,'reaction_type=3')]/span/text()") 291 | # new.add_xpath('sigh',"//a[contains(@href,'reaction_type=7')]/span/text()") 292 | # new.add_xpath('grrr',"//a[contains(@href,'reaction_type=8')]/span/text()") 293 | # yield new.load_item() 294 | # 295 | # #substitute 296 | # yield new.load_item() 297 | # ‾‾‾‾‾‾‾‾‾|‾‾‾‾‾‾‾‾‾‾‾ 298 | # _________v___ 299 | # #response --> reply/root 300 | # reactions = response.xpath(".//a[contains(@href,'reaction/profile')]/@href") 301 | # reactions = response.urljoin(reactions[0].extract()) 302 | # if reactions: 303 | # yield scrapy.Request(reactions, callback=self.parse_reactions, meta={'item':new}) 304 | # else: 305 | # yield new.load_item() -------------------------------------------------------------------------------- /fbcrawl/spiders/events.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | 3 | from scrapy.loader import ItemLoader 4 | from scrapy.exceptions import CloseSpider 5 | from fbcrawl.spiders.fbcrawl import FacebookSpider 6 | from fbcrawl.items import EventsItem, parse_date, parse_date2 7 | 8 | from datetime import datetime 9 | 10 | class EventsSpider(FacebookSpider): 11 | """ 12 | Parse FB events, given a page (needs credentials) 13 | """ 14 | name = "events" 15 | custom_settings = { 16 | 'FEED_EXPORT_FIELDS': ['name','where','location','photo','start_date', \ 17 | 'end_date','description'], 18 | 'DUPEFILTER_CLASS' : 'scrapy.dupefilters.BaseDupeFilter', 19 | 'CONCURRENT_REQUESTS' : 1 20 | } 21 | 22 | def __init__(self, *args, **kwargs): 23 | self.page = kwargs['page'] 24 | super().__init__(*args,**kwargs) 25 | 26 | def parse_page(self, response): 27 | yield scrapy.Request(url=response.urljoin('%s/events' % self.page), 28 | callback=self.parse_events, 29 | priority=10, 30 | meta={'index':1}) 31 | 32 | def parse_events(self, response): 33 | TABLE_XPATH='/html/body/div/div/div[2]/div/table/tbody/tr/td/div[2]/div/div/div[2]/div/table/tbody/tr' 34 | for event in response.xpath(TABLE_XPATH): 35 | url = event.xpath('//td/div/div/span[3]/div/a[1]/@href').extract_first() 36 | yield response.follow(url, callback=self.parse_event) 37 | 38 | def parse_event(self, response): 39 | EVENT_NAME='/html/body/div/div/div[2]/div/table/tbody/tr/td/div[2]/div[2]/div[1]/h3/text()' 40 | EVENT_WHERE='/html/body/div/div/div[2]/div/table/tbody/tr/td/div[3]/div/div[2]/table/tbody/tr/td[2]/dt/div/text()' 41 | EVENT_LOCATION='/html/body/div/div/div[2]/div/table/tbody/tr/td/div[3]/div/div[2]/table/tbody/tr/td[2]/dd/div/text()' 42 | DATE='/html/body/div/div/div[2]/div/table/tbody/tr/td/div[3]/div/div[1]/table/tbody/tr/td[2]/dt/div/text()' 43 | EVENT_DESCRIPTION='/html/body/div/div/div[2]/div/table/tbody/tr/td/table/tbody/tr/td/div[2]/div[2]/div[2]/div[2]/text()' 44 | 
EVENT_COVER='/html/body/div/div/div[2]/div/table/tbody/tr/td/div[2]/div[1]/a/img/@src' 45 | date = response.xpath(DATE).extract_first() 46 | start_date = date.split('–')[0].strip() if date else None 47 | end_date = date.split('–')[1].strip() if date and '–' in date else None 48 | name = response.xpath(EVENT_NAME).extract_first() 49 | self.logger.info('Parsing event %s' % name) 50 | yield EventsItem( 51 | name=name, 52 | where=response.xpath(EVENT_WHERE).extract_first(), 53 | location=response.xpath(EVENT_LOCATION).extract_first(), 54 | photo=response.xpath(EVENT_COVER).extract_first(), 55 | start_date=start_date, 56 | end_date=end_date, 57 | description=response.xpath(EVENT_DESCRIPTION).extract_first() 58 | ) 59 | -------------------------------------------------------------------------------- /fbcrawl/spiders/fbcrawl.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | import logging 3 | 4 | from scrapy.loader import ItemLoader 5 | from scrapy.http import FormRequest 6 | from scrapy.exceptions import CloseSpider 7 | from fbcrawl.items import FbcrawlItem, parse_date, parse_date2 8 | from datetime import datetime 9 | 10 | class FacebookSpider(scrapy.Spider): 11 | ''' 12 | Parse FB pages (needs credentials) 13 | ''' 14 | name = 'fb' 15 | custom_settings = { 16 | 'FEED_EXPORT_FIELDS': ['source','shared_from','date','text', \ 17 | 'reactions','likes','ahah','love','wow', \ 18 | 'sigh','grrr','comments','post_id','url'], 19 | 'DUPEFILTER_CLASS' : 'scrapy.dupefilters.BaseDupeFilter', 20 | } 21 | 22 | def __init__(self, *args, **kwargs): 23 | #turn off annoying logging, set LOG_LEVEL=DEBUG in settings.py to see more logs 24 | logger = logging.getLogger('scrapy.middleware') 25 | logger.setLevel(logging.WARNING) 26 | 27 | super().__init__(*args,**kwargs) 28 | 29 | #email & pass need to be passed as attributes!
30 | if 'email' not in kwargs or 'password' not in kwargs: 31 | raise AttributeError('You need to provide valid email and password:\n' 32 | 'scrapy fb -a email="EMAIL" -a password="PASSWORD"') 33 | else: 34 | self.logger.info('Email and password provided, will be used to log in') 35 | 36 | #page name parsing (added support for full urls) 37 | if 'page' in kwargs: 38 | if self.page.find('/groups/') != -1: 39 | self.group = 1 40 | else: 41 | self.group = 0 42 | if self.page.find('https://www.facebook.com/') != -1: 43 | self.page = self.page[25:] 44 | elif self.page.find('https://mbasic.facebook.com/') != -1: 45 | self.page = self.page[28:] 46 | elif self.page.find('https://m.facebook.com/') != -1: 47 | self.page = self.page[23:] 48 | 49 | 50 | #parse date 51 | if 'date' not in kwargs: 52 | self.logger.info('Date attribute not provided, scraping date set to 2004-02-04 (fb launch date)') 53 | self.date = datetime(2004,2,4) 54 | else: 55 | self.date = datetime.strptime(kwargs['date'],'%Y-%m-%d') 56 | self.logger.info('Date attribute provided, fbcrawl will stop crawling at {}'.format(kwargs['date'])) 57 | self.year = self.date.year 58 | 59 | #parse lang, if not provided (but is supported) it will be guessed in parse_home 60 | if 'lang' not in kwargs: 61 | self.logger.info('Language attribute not provided, fbcrawl will try to guess it from the fb interface') 62 | self.logger.info('To specify, add the lang parameter: scrapy fb -a lang="LANGUAGE"') 63 | self.logger.info('Currently choices for "LANGUAGE" are: "en", "es", "fr", "it", "pt"') 64 | self.lang = '_' 65 | elif self.lang == 'en' or self.lang == 'es' or self.lang == 'fr' or self.lang == 'it' or self.lang == 'pt': 66 | self.logger.info('Language attribute recognized, using "{}" for the facebook interface'.format(self.lang)) 67 | else: 68 | self.logger.info('Lang "{}" not currently supported'.format(self.lang)) 69 | self.logger.info('Currently supported languages are: "en", "es", "fr", "it", "pt"') 70 | self.logger.info('Change your interface lang from facebook settings and try again') 71 | raise AttributeError('Language provided not currently supported') 72 | 73 | #max num of posts to crawl 74 | if 'max' not in kwargs: 75 | self.max = int(10e5) 76 | else: 77 | self.max = int(kwargs['max']) 78 | 79 | #current year, this variable is needed for proper parse_page recursion 80 | self.k = datetime.now().year 81 | #count number of posts, used to enforce DFS and insert posts orderly in the csv 82 | self.count = 0 83 | 84 | self.start_urls = ['https://mbasic.facebook.com'] 85 | 86 | def parse(self, response): 87 | ''' 88 | Handle login with provided credentials 89 | ''' 90 | return FormRequest.from_response( 91 | response, 92 | formxpath='//form[contains(@action, "login")]', 93 | formdata={'email': self.email,'pass': self.password}, 94 | callback=self.parse_home 95 | ) 96 | 97 | def parse_home(self, response): 98 | ''' 99 | This method has multiple purposes: 100 | 1) Handle failed logins due to facebook 'save-device' redirection 101 | 2) Set language interface, if not already provided 102 | 3) Navigate to given page 103 | ''' 104 | #handle 'save-device' redirection 105 | if response.xpath("//div/a[contains(@href,'save-device')]"): 106 | self.logger.info('Going through the "save-device" checkpoint') 107 | return FormRequest.from_response( 108 | response, 109 | formdata={'name_action_selected': 'dont_save'}, 110 | callback=self.parse_home 111 | ) 112 | 113 | #set language interface 114 | if self.lang == '_': 115 | if 
response.xpath("//input[@placeholder='Search Facebook']"): 116 | self.logger.info('Language recognized: lang="en"') 117 | self.lang = 'en' 118 | elif response.xpath("//input[@placeholder='Buscar en Facebook']"): 119 | self.logger.info('Language recognized: lang="es"') 120 | self.lang = 'es' 121 | elif response.xpath("//input[@placeholder='Rechercher sur Facebook']"): 122 | self.logger.info('Language recognized: lang="fr"') 123 | self.lang = 'fr' 124 | elif response.xpath("//input[@placeholder='Cerca su Facebook']"): 125 | self.logger.info('Language recognized: lang="it"') 126 | self.lang = 'it' 127 | elif response.xpath("//input[@placeholder='Pesquisa no Facebook']"): 128 | self.logger.info('Language recognized: lang="pt"') 129 | self.lang = 'pt' 130 | else: 131 | raise AttributeError('Language not recognized\n' 132 | 'Change your interface lang from facebook ' 133 | 'and try again') 134 | 135 | #navigate to provided page 136 | href = response.urljoin(self.page) 137 | self.logger.info('Scraping facebook page {}'.format(href)) 138 | return scrapy.Request(url=href,callback=self.parse_page,meta={'index':1}) 139 | 140 | def parse_page(self, response): 141 | ''' 142 | Parse the given page selecting the posts. 143 | Then ask recursively for another page. 144 | ''' 145 | # #open page in browser for debug 146 | # from scrapy.utils.response import open_in_browser 147 | # open_in_browser(response) 148 | 149 | #select all posts 150 | for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"): 151 | 152 | many_features = post.xpath('./@data-ft').get() 153 | date = [] 154 | date.append(many_features) 155 | date = parse_date(date,{'lang':self.lang}) 156 | current_date = datetime.strptime(date,'%Y-%m-%d %H:%M:%S') if date is not None else date 157 | 158 | if current_date is None: 159 | date_string = post.xpath('.//abbr/text()').get() 160 | date = parse_date2([date_string],{'lang':self.lang}) 161 | current_date = datetime(date.year,date.month,date.day) if date is not None else date 162 | date = str(date) 163 | 164 | #if 'date' argument is reached stop crawling 165 | if self.date > current_date: 166 | raise CloseSpider('Reached date: {}'.format(self.date)) 167 | 168 | new = ItemLoader(item=FbcrawlItem(),selector=post) 169 | if abs(self.count) + 1 > self.max: 170 | raise CloseSpider('Reached max num of post: {}. 
Crawling finished'.format(abs(self.count))) 171 | self.logger.info('Parsing post n = {}, post_date = {}'.format(abs(self.count)+1,date)) 172 | new.add_xpath('comments', './div[2]/div[2]/a[1]/text()') 173 | new.add_value('date',date) 174 | new.add_xpath('post_id','./@data-ft') 175 | new.add_xpath('url', ".//a[contains(@href,'footer')]/@href") 176 | #page_url #new.add_value('url',response.url) 177 | 178 | #returns full post-link in a list 179 | post = post.xpath(".//a[contains(@href,'footer')]/@href").extract() 180 | temp_post = response.urljoin(post[0]) 181 | self.count -= 1 182 | yield scrapy.Request(temp_post, self.parse_post, priority = self.count, meta={'item':new}) 183 | 184 | #load following page, try to click on "more" 185 | #after few pages have been scraped, the "more" link might disappears 186 | #if not present look for the highest year not parsed yet 187 | #click once on the year and go back to clicking "more" 188 | 189 | #new_page is different for groups 190 | if self.group == 1: 191 | new_page = response.xpath("//div[contains(@id,'stories_container')]/div[2]/a/@href").extract() 192 | else: 193 | new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract() 194 | #this is why lang is needed ^^^^^^^^^^^^^^^^^^^^^^^^^^ 195 | 196 | if not new_page: 197 | self.logger.info('[!] "more" link not found, will look for a "year" link') 198 | #self.k is the year link that we look for 199 | if response.meta['flag'] == self.k and self.k >= self.year: 200 | xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href" 201 | new_page = response.xpath(xpath).extract() 202 | if new_page: 203 | new_page = response.urljoin(new_page[0]) 204 | self.k -= 1 205 | self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page)) 206 | yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k}) 207 | else: 208 | while not new_page: #sometimes the years are skipped this handles small year gaps 209 | self.logger.info('Link not found for year {}, trying with previous year {}'.format(self.k,self.k-1)) 210 | self.k -= 1 211 | if self.k < self.year: 212 | raise CloseSpider('Reached date: {}. Crawling finished'.format(self.date)) 213 | xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href" 214 | new_page = response.xpath(xpath).extract() 215 | self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page)) 216 | new_page = response.urljoin(new_page[0]) 217 | self.k -= 1 218 | yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k}) 219 | else: 220 | self.logger.info('Crawling has finished with no errors!') 221 | else: 222 | new_page = response.urljoin(new_page[0]) 223 | if 'flag' in response.meta: 224 | self.logger.info('Page scraped, clicking on "more"! new_page = {}'.format(new_page)) 225 | yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':response.meta['flag']}) 226 | else: 227 | self.logger.info('First page scraped, clicking on "more"! 
new_page = {}'.format(new_page)) 228 | yield scrapy.Request(new_page, callback=self.parse_page, meta={'flag':self.k}) 229 | 230 | def parse_post(self,response): 231 | new = ItemLoader(item=FbcrawlItem(),response=response,parent=response.meta['item']) 232 | new.context['lang'] = self.lang 233 | new.add_xpath('source', "//td/div/h3/strong/a/text() | //span/strong/a/text() | //div/div/div/a[contains(@href,'post_id')]/strong/text()") 234 | new.add_xpath('shared_from','//div[contains(@data-ft,"top_level_post_id") and contains(@data-ft,\'"isShare":1\')]/div/div[3]//strong/a/text()') 235 | # new.add_xpath('date','//div/div/abbr/text()') 236 | new.add_xpath('text','//div[@data-ft]//p//text() | //div[@data-ft]/div[@class]/div[@class]/text()') 237 | 238 | #check reactions for old posts 239 | check_reactions = response.xpath("//a[contains(@href,'reaction/profile')]/div/div/text()").get() 240 | if not check_reactions: 241 | yield new.load_item() 242 | else: 243 | new.add_xpath('reactions',"//a[contains(@href,'reaction/profile')]/div/div/text()") 244 | reactions = response.xpath("//div[contains(@id,'sentence')]/a[contains(@href,'reaction/profile')]/@href") 245 | reactions = response.urljoin(reactions[0].extract()) 246 | yield scrapy.Request(reactions, callback=self.parse_reactions, meta={'item':new}) 247 | 248 | def parse_reactions(self,response): 249 | new = ItemLoader(item=FbcrawlItem(),response=response, parent=response.meta['item']) 250 | new.context['lang'] = self.lang 251 | new.add_xpath('likes',"//a[contains(@href,'reaction_type=1')]/span/text()") 252 | new.add_xpath('ahah',"//a[contains(@href,'reaction_type=4')]/span/text()") 253 | new.add_xpath('love',"//a[contains(@href,'reaction_type=2')]/span/text()") 254 | new.add_xpath('wow',"//a[contains(@href,'reaction_type=3')]/span/text()") 255 | new.add_xpath('sigh',"//a[contains(@href,'reaction_type=7')]/span/text()") 256 | new.add_xpath('grrr',"//a[contains(@href,'reaction_type=8')]/span/text()") 257 | yield new.load_item() -------------------------------------------------------------------------------- /fbcrawl/spiders/profiles.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | 3 | from scrapy.loader import ItemLoader 4 | from scrapy.exceptions import CloseSpider 5 | from fbcrawl.spiders.fbcrawl import FacebookSpider 6 | from fbcrawl.items import ProfileItem, parse_date, parse_date2 7 | 8 | from datetime import datetime 9 | 10 | class ProfileSpider(FacebookSpider): 11 | """ 12 | Parse FB profiles 13 | """ 14 | name = "profiles" 15 | custom_settings = { 16 | 'FEED_EXPORT_FIELDS': ['name','gender','birthday','current_city', 17 | 'hometown','work','education','interested_in', 18 | 'page'], 19 | 'DUPEFILTER_CLASS' : 'scrapy.dupefilters.BaseDupeFilter', 20 | 'CONCURRENT_REQUESTS' : 1 21 | } 22 | 23 | def __init__(self, *args, **kwargs): 24 | if 'post' in kwargs and 'page' in kwargs: 25 | raise AttributeError('You need to specifiy only one between post and page') 26 | elif 'post' in kwargs: 27 | self.page = kwargs['post'] 28 | self.type = 'post' 29 | elif 'page' in kwargs: 30 | self.type = 'page' 31 | 32 | super().__init__(*args,**kwargs) 33 | 34 | def parse_page(self, response): 35 | ''' 36 | ''' 37 | if self.type == 'post': 38 | yield scrapy.Request(url=response.url, 39 | callback=self.parse_post, 40 | priority=10, 41 | meta={'index':1}) 42 | elif self.type == 'page': 43 | #select all posts 44 | for post in response.xpath("//div[contains(@data-ft,'top_level_post_id')]"): 45 | many_features = 
post.xpath('./@data-ft').get() 46 | date = [] 47 | date.append(many_features) 48 | date = parse_date(date,{'lang':self.lang}) 49 | current_date = datetime.strptime(date,'%Y-%m-%d %H:%M:%S') if date is not None else date 50 | 51 | if current_date is None: 52 | date_string = post.xpath('.//abbr/text()').get() 53 | date = parse_date2([date_string],{'lang':self.lang}) 54 | current_date = datetime(date.year,date.month,date.day) if date is not None else date 55 | date = str(date) 56 | 57 | if abs(self.count) + 1 > self.max: 58 | raise CloseSpider('Reached max num of post: {}. Crawling finished'.format(abs(self.count))) 59 | self.logger.info('Parsing post n = {}, post_date = {}'.format(abs(self.count)+1,date)) 60 | 61 | #returns full post-link in a list 62 | post = post.xpath(".//a[contains(@href,'footer')]/@href").extract() 63 | temp_post = response.urljoin(post[0]) 64 | self.count -= 1 65 | yield scrapy.Request(temp_post, 66 | self.parse_post, 67 | priority = self.count, 68 | meta={'index':1}) 69 | 70 | #load following page, try to click on "more" 71 | #after few pages have been scraped, the "more" link might disappears 72 | #if not present look for the highest year not parsed yet 73 | #click once on the year and go back to clicking "more" 74 | 75 | #new_page is different for groups 76 | if self.group == 1: 77 | new_page = response.xpath("//div[contains(@id,'stories_container')]/div[2]/a/@href").extract() 78 | else: 79 | new_page = response.xpath("//div[2]/a[contains(@href,'timestart=') and not(contains(text(),'ent')) and not(contains(text(),number()))]/@href").extract() 80 | #this is why lang is needed 81 | 82 | if not new_page: 83 | self.logger.info('[!] "more" link not found, will look for a "year" link') 84 | #self.k is the year link that we look for 85 | if response.meta['flag'] == self.k and self.k >= self.year: 86 | xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href" 87 | new_page = response.xpath(xpath).extract() 88 | if new_page: 89 | new_page = response.urljoin(new_page[0]) 90 | self.k -= 1 91 | self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page)) 92 | yield scrapy.Request(new_page, 93 | callback=self.parse_page, 94 | priority = -1000, 95 | meta={'flag':self.k}) 96 | else: 97 | while not new_page: #sometimes the years are skipped this handles small year gaps 98 | self.logger.info('Link not found for year {}, trying with previous year {}'.format(self.k,self.k-1)) 99 | self.k -= 1 100 | if self.k < self.year: 101 | raise CloseSpider('Reached date: {}. Crawling finished'.format(self.date)) 102 | xpath = "//div/a[contains(@href,'time') and contains(text(),'" + str(self.k) + "')]/@href" 103 | new_page = response.xpath(xpath).extract() 104 | self.logger.info('Found a link for year "{}", new_page = {}'.format(self.k,new_page)) 105 | new_page = response.urljoin(new_page[0]) 106 | self.k -= 1 107 | yield scrapy.Request(new_page, 108 | callback=self.parse_page, 109 | priority = -1000, 110 | meta={'flag':self.k}) 111 | else: 112 | self.logger.info('Crawling has finished with no errors!') 113 | else: 114 | new_page = response.urljoin(new_page[0]) 115 | if 'flag' in response.meta: 116 | self.logger.info('Page scraped, clicking on "more"! new_page = {}'.format(new_page)) 117 | yield scrapy.Request(new_page, 118 | callback=self.parse_page, 119 | priority = -1000, 120 | meta={'flag':response.meta['flag']}) 121 | else: 122 | self.logger.info('First page scraped, clicking on "more"! 
new_page = {}'.format(new_page)) 123 | yield scrapy.Request(new_page, 124 | callback=self.parse_page, 125 | priority = -1000, 126 | meta={'flag':self.k}) 127 | 128 | def parse_post(self, response): 129 | ''' 130 | parse post does multiple things: 131 | 1) loads replied-to-comments page one-by-one (for DFS) 132 | 2) call parse_reply on the nested comments 133 | 3) adds simple (not-replied-to) comments 134 | 4) follows to new comment page 135 | ''' 136 | #load replied-to comments pages 137 | #select nested comment one-by-one matching with the index: response.meta['index'] 138 | path = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and .//div[contains(@id,"comment_replies")]]' + '['+ str(response.meta['index']) + ']' 139 | group_flag = response.meta['group'] if 'group' in response.meta else None 140 | 141 | for reply in response.xpath(path): 142 | rep = reply.xpath('.//h3/a/@href').get() 143 | profile = 'https://mbasic.facebook.com' + rep[:rep.find('?rc')] + '/about' 144 | yield scrapy.Request(profile, 145 | callback=self.parse_profile, 146 | priority=1000, 147 | meta={'url':response.url, 148 | 'index':response.meta['index'], 149 | 'flag':'init', 150 | 'group':group_flag}) 151 | #load regular comments 152 | if not response.xpath(path): #prevents from exec 153 | path2 = './/div[string-length(@class) = 2 and count(@id)=1 and contains("0123456789", substring(@id,1,1)) and not(.//div[contains(@id,"comment_replies")])]' 154 | for i,reply in enumerate(response.xpath(path2)): 155 | self.logger.info('{} regular comment'.format(i+1)) 156 | rep = reply.xpath('.//h3/a/@href').get() 157 | profile = 'https://mbasic.facebook.com' + rep[:rep.find('?rc')] + '/about' 158 | yield scrapy.Request(profile, 159 | callback=self.parse_profile, 160 | priority=1000, 161 | meta={'url':response.url, 162 | 'index':response.meta['index'], 163 | 'flag':'init', 164 | 'group':group_flag}) 165 | 166 | #new comment page 167 | if not response.xpath(path): 168 | #for groups 169 | next_xpath = './/div[contains(@id,"see_next")]' 170 | prev_xpath = './/div[contains(@id,"see_prev")]' 171 | if not response.xpath(next_xpath) or group_flag == 1: 172 | for next_page in response.xpath(prev_xpath): 173 | new_page = next_page.xpath('.//@href').extract() 174 | new_page = response.urljoin(new_page[0]) 175 | self.logger.info('New page to be crawled {}'.format(new_page)) 176 | yield scrapy.Request(new_page, 177 | callback=self.parse_post, 178 | meta={'index':1, 179 | 'group':1}) 180 | else: 181 | for next_page in response.xpath(next_xpath): 182 | new_page = next_page.xpath('.//@href').extract() 183 | new_page = response.urljoin(new_page[0]) 184 | self.logger.info('New page to be crawled {}'.format(new_page)) 185 | yield scrapy.Request(new_page, 186 | callback=self.parse_post, 187 | meta={'index':1, 188 | 'group':group_flag}) 189 | 190 | def parse_reply(self,response): 191 | ''' 192 | parse reply to comments, root comment is added if flag 193 | ''' 194 | # from scrapy.utils.response import open_in_browser 195 | # open_in_browser(response) 196 | 197 | if response.meta['flag'] == 'init': 198 | #parse root comment 199 | for root in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)!=1 and contains("0123456789", substring(@id,1,1))]'): 200 | rep = root.xpath('.//h3/a/@href').get() 201 | profile = 'https://mbasic.facebook.com' + rep[:rep.find('?rc')] + '/about' 202 | yield scrapy.Request(profile, 203 | callback=self.parse_profile, 204 | priority=1000, 205 | meta={'url':response.url, 
'index':response.meta['index'], 207 | 'flag':'init', 208 | 'group':response.meta['group']}) 209 | #parse all replies in the page 210 | for reply in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'): 211 | rep = reply.xpath('.//h3/a/@href').get() 212 | profile = 'https://mbasic.facebook.com' + rep[:rep.find('?rc')] + '/about' 213 | yield scrapy.Request(profile, 214 | callback=self.parse_profile, 215 | priority=1000, 216 | meta={'url':response.url, 217 | 'index':response.meta['index'], 218 | 'flag':'init', 219 | 'group':response.meta['group']}) 220 | 221 | back = response.xpath('//div[contains(@id,"comment_replies_more_1")]/a/@href').extract() 222 | if back: 223 | self.logger.info('Back found, more nested comments') 224 | back_page = response.urljoin(back[0]) 225 | yield scrapy.Request(back_page, 226 | callback=self.parse_reply, 227 | priority = 1000, 228 | meta={'reply_to':response.meta['reply_to'], 229 | 'flag':'back', 230 | 'url':response.meta['url'], 231 | 'index':response.meta['index'], 232 | 'group':response.meta['group']}) 233 | 234 | else: 235 | next_reply = response.meta['url'] 236 | self.logger.info('Nested comments crawl finished, heading to proper page: {}'.format(response.meta['url'])) 237 | yield scrapy.Request(next_reply, 238 | callback=self.parse_post, 239 | meta={'index':response.meta['index']+1, 240 | 'group':response.meta['group']}) 241 | 242 | elif response.meta['flag'] == 'back': 243 | #parse all comments 244 | for reply in response.xpath('//div[contains(@id,"root")]/div/div/div[count(@id)=1 and contains("0123456789", substring(@id,1,1))]'): 245 | rep = reply.xpath('.//h3/a/@href').extract()[0] 246 | profile = 'https://mbasic.facebook.com' + rep[:rep.find('?rc')] + '/about' 247 | yield scrapy.Request(profile, 248 | callback=self.parse_profile, 249 | priority=1000, 250 | meta={'url':response.url, 251 | 'index':response.meta['index'], 252 | 'flag':'init', 253 | 'group':response.meta['group']}) 254 | #keep going backwards 255 | back = response.xpath('//div[contains(@id,"comment_replies_more_1")]/a/@href').extract() 256 | self.logger.info('Back found, more nested comments') 257 | if back: 258 | back_page = response.urljoin(back[0]) 259 | yield scrapy.Request(back_page, 260 | callback=self.parse_reply, 261 | priority=1000, 262 | meta={'reply_to':response.meta['reply_to'], 263 | 'flag':'back', 264 | 'url':response.meta['url'], 265 | 'index':response.meta['index'], 266 | 'group':response.meta['group']}) 267 | 268 | else: 269 | next_reply = response.meta['url'] 270 | self.logger.info('Nested comments crawl finished, heading to home page: {}'.format(response.meta['url'])) 271 | yield scrapy.Request(next_reply, 272 | callback=self.parse_post, 273 | meta={'index':response.meta['index']+1, 274 | 'group':response.meta['group']}) 275 | 276 | 277 | def parse_profile(self,response): 278 | new = ItemLoader(item=ProfileItem(),response=response) 279 | self.logger.info('Crawling profile info') 280 | new.add_xpath('name','//span/div/span/strong/text()') 281 | new.add_xpath('gender',"//div[@id='basic-info']//div[@title='Gender']//div/text()") 282 | new.add_xpath('birthday',"//div[@id='basic-info']//div[@title='Birthday']//div/text()") 283 | new.add_xpath('current_city',"//div[@id='living']//div[@title='Current City']//a/text()") 284 | new.add_xpath('hometown',"//div[@id='living']//div[@title='Hometown']//a/text()") 285 | new.add_xpath('work',"//div[@id='work']//a/text()") 286 |
new.add_xpath('education',"//div[@id='education']//a/text()") 287 | new.add_xpath('interested_in',"//div[@id='interested-in']//div[not(contains(text(),'Interested In'))]/text()") 288 | new.add_xpath('page',"//div[@id='contact-info']//div[@title='Facebook']//div/text()") 289 | yield new.load_item() 290 | -------------------------------------------------------------------------------- /runner_facebook.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | OLD=$PWD 4 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 5 | 6 | cd "$DIR" 7 | 8 | if [ -d "/usr/local/crawler-venv" ] 9 | then 10 | source /usr/local/crawler-venv/bin/activate 11 | fi 12 | 13 | FOLDER=$(date +"%Y-%m") 14 | mkdir -p "$OLD/$FOLDER" 15 | 16 | DAY=$(date +"%d") 17 | mkdir -p "$OLD/$FOLDER/$DAY/facebook" 18 | 19 | TIME=$(date +"%s") 20 | 21 | source ~/.secrets 22 | 23 | scrapy crawl events -a email="$FACEBOOK_EMAIL" -a password="$FACEBOOK_PASSWORD" -a page="$1" -o "$OLD/$FOLDER/$DAY/facebook/${TIME}_$1.csv" 24 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = fbcrawl.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = fbcrawl 12 | -------------------------------------------------------------------------------- /trump.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rugantio/fbcrawl/bda7d6a7da49a57c8a0863b6679c013f74fdb4c1/trump.png --------------------------------------------------------------------------------
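A minimal usage sketch, mirroring the call in runner_facebook.sh and assuming only the spider names and -a arguments defined in fbcrawl/spiders (email, password, page/post, plus the optional lang, date and max); values in CAPS are placeholders:

    scrapy crawl fb -a email="EMAIL" -a password="PASSWORD" -a page="PAGENAME" -a lang="en" -a date="2018-01-01" -o posts.csv
    scrapy crawl comments -a email="EMAIL" -a password="PASSWORD" -a post="POST_URL" -o comments.csv
    scrapy crawl events -a email="EMAIL" -a password="PASSWORD" -a page="PAGENAME" -o events.csv
    scrapy crawl profiles -a email="EMAIL" -a password="PASSWORD" -a page="PAGENAME" -o profiles.csv

The -o flag is scrapy's standard feed export; the column order of each CSV comes from the FEED_EXPORT_FIELDS declared in the spider's custom_settings.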