├── README.md
└── scraper.py

/README.md:
--------------------------------------------------------------------------------

# Non-API-FaceBook-Scraper [2019]
_Scrape public posts from any page or user into a .csv file without having to register for any API access_

![result](https://i.imgur.com/EgObJWb.png)
____

### How to use it?

First, make sure you have Selenium >= 3.141.0, Firefox and geckodriver installed.

Use `scraper.py` to collect the data.
```
usage: scraper.py [-h] [--pages PAGES [PAGES ...]] [-d DEPTH]

Non API public FB miner

optional arguments:
  -h, --help            show this help message and exit
  --pages PAGES [PAGES ...]
                        List the pages you want to scrape for recent posts
  -d DEPTH, --depth DEPTH
                        How many recent posts you want to gather, in
                        multiples of (roughly) 8
```
Example: ```python scraper.py --pages DonaldTrump senatorsanders -d 20```
____
The output is written to `posts.csv` inside the script folder.
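If you would rather drive the collector from your own code than from the command line, a minimal sketch looks like the following. It assumes `scraper.py` sits in the same folder and that its command-line setup lives under the `if __name__ == "__main__"` guard, so importing it does not start a scrape by itself:

```python
# Hypothetical programmatic use of the Collector class defined in scraper.py.
# Instantiating Collector opens a Firefox window and writes the CSV header.
from scraper import Collector

collector = Collector(pages=["DonaldTrump", "senatorsanders"], depth=20)
collector.collect()  # appends one row per scraped post to posts.csv
```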
--------------------------------------------------------------------------------
/scraper.py:
--------------------------------------------------------------------------------

from selenium import webdriver
import time
import argparse
import csv


class Collector(object):
    """Collector of recent Facebook posts.

    Note: We bypass the Facebook Graph API by driving a Selenium
    Firefox instance instead. This is against Facebook's guidelines
    and therefore not allowed.

    USE THIS FOR EDUCATIONAL PURPOSES ONLY. DO NOT ACTUALLY RUN IT.
    """

    def __init__(self, pages=["oxfess"], corpus_file="posts.csv", depth=5, delay=2):
        super(Collector, self).__init__()
        self.pages = pages
        self.dump = corpus_file
        self.depth = depth + 1
        self.delay = delay
        # browser instance
        self.browser = webdriver.Firefox()

        # create the CSV header
        with open(self.dump, "w", newline='', encoding="utf-8") as save_file:
            writer = csv.writer(save_file)
            writer.writerow(["Source", "utime", "Text"])

    def strip(self, string):
        """Helper that drops hashtag words and keeps only alphanumeric
        characters, spaces, periods and commas."""
        words = string.split()
        words = [word for word in words if "#" not in word]
        string = " ".join(words)
        clean = ""
        for c in string:
            if str.isalnum(c) or (c in [" ", ".", ","]):
                clean += c
        return clean

    def collect_page(self, page):
        # navigate to the page
        self.browser.get('https://www.facebook.com/' + page + '/')

        # scroll down depth times, waiting delay seconds between scrolls
        # so that new posts can load
        for scroll in range(self.depth):

            # scroll down to the bottom
            self.browser.execute_script(
                "window.scrollTo(0, document.body.scrollHeight);")

            # wait for the page to load
            time.sleep(self.delay)

        # once the full page is loaded, we can start scraping
        with open(self.dump, "a+", newline='', encoding="utf-8") as save_file:
            writer = csv.writer(save_file)
            posts = self.browser.find_elements_by_class_name(
                "userContentWrapper")

            for post in posts:

                # first CSV column: the page name (e.g. "DonaldTrump")
                analysis = [page]

                # second column: the Unix timestamp of the post
                time_element = post.find_element_by_css_selector("abbr")
                utime = time_element.get_attribute("data-utime")
                analysis.append(utime)

                # third column: the post text
                text = ""
                text_elements = post.find_elements_by_css_selector("p")
                for p in text_elements:
                    text += self.strip(p.text)
                analysis.append(text)

                # write the row to the csv
                writer.writerow(analysis)

    def collect(self):
        for page in self.pages:
            self.collect_page(page)
        # close the Firefox window once every page has been scraped
        self.browser.quit()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Non API public FB miner')

    parser.add_argument('--pages', nargs='+',
                        dest="pages", default=["oxfess"],
                        help="List the pages you want to scrape for recent posts")

    parser.add_argument("-d", "--depth", action="store",
                        dest="depth", default=5, type=int,
                        help="How many recent posts you want to gather -- in multiples of (roughly) 8.")

    args = parser.parse_args()

    collector = Collector(pages=args.pages, depth=args.depth)
    collector.collect()

--------------------------------------------------------------------------------
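A note on Selenium versions: the `find_element_by_*` / `find_elements_by_*` helpers used in `scraper.py` exist in Selenium 3 (the version the README asks for) but were removed in Selenium 4. On a newer Selenium the same lookups are written with `By` locators; a rough sketch, under the assumption that Facebook's markup still exposes the same class names and selectors (which it may not):

```python
from selenium import webdriver
from selenium.webdriver.common.by import By

browser = webdriver.Firefox()
browser.get("https://www.facebook.com/oxfess/")

# Same queries as in scraper.py, written with Selenium 4 locators.
# Assumes the markup still uses the "userContentWrapper" class and
# <abbr data-utime="..."> / <p> elements, which Facebook can change at any time.
posts = browser.find_elements(By.CLASS_NAME, "userContentWrapper")
for post in posts:
    utime = post.find_element(By.CSS_SELECTOR, "abbr").get_attribute("data-utime")
    text = "".join(p.text for p in post.find_elements(By.CSS_SELECTOR, "p"))

browser.quit()
```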