├── testing
├── test
├── test.sh
├── README.md
├── .gitignore
└── Facebook_Scraper.py

/testing:
--------------------------------------------------------------------------------
jdf

--------------------------------------------------------------------------------
/test:
--------------------------------------------------------------------------------

dbfdbf

--------------------------------------------------------------------------------
/test.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Print the subject lines of the two most recent commits
IFS=$'\n'
commits=($(git log -n 2 --pretty=%s))
echo "${commits[0]}"
echo "${commits[1]}"

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Facebook-images-scraper
=======================

Scrapes a Facebook page's images and saves them to your disk. The URL must be of the form http://www.facebook.com/the_page_you_need_to_scrape/photos_stream. Only the new version of Facebook pages is supported.

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.py[cod]

# C extensions
*.so

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg
lib
lib64

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox
nosetests.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject

--------------------------------------------------------------------------------
/Facebook_Scraper.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
from bs4 import BeautifulSoup, Comment
import urlparse
from urllib2 import urlopen
import urllib2
from urllib import urlretrieve
import os
import sys
import json
from threading import Thread
import httplib
import requests
import re


# Start one thread per image so the downloads run in parallel
def download_parallel(function, images, output):
    for i in images:
        Thread(target=function, args=(i, output,)).start()


# Download a single image into the output directory
def download(image, output):
    filename = image.split("/")[-1]
    outpath = os.path.join(output, filename)
    urlretrieve(image, outpath)


# Issue the AJAX request Facebook uses for infinite scrolling so we get the
# next batch of images in the photo stream to download
def extract_from_ajax(download, output, load_url, last_fbid):
    url = "http://graph.facebook.com/" + load_url
    connect = requests.get(url)
    o = connect.json()

    ajaxurl = "https://www.facebook.com/ajax/pagelet/generic.php/TimelinePhotosStreamPagelet?ajaxpipe=1&ajaxpipe_token=AXif26PCkG9XwzHk&no_script_path=1&data=%7B%22scroll_load%22%3Atrue%2C%22last_fbid%22%3A"+last_fbid+"%2C%22fetch_size%22%3A32%2C%22profile_id%22%3A"+o['id']+"%2C%22tab_key%22%3A%22photos_stream%22%2C%22sk%22%3A%22photos_stream%22%7D&__user=0&__a=1&__dyn=7w86i&__req=jsonp_2&__adt=2"

    connection = requests.get(ajaxurl)

    obj = connection.content
    pattern_one = re.findall("http:\\\\/\\\\/[a-z]+-[a-z]+.[a-z]+.[a-z]+.[a-z]+.\\\\/[a-z]+-[a-z]+-[a-z0-9]+\\\\/[a-z0-9]+\\\\/[0-9]+_[0-9]+_[0-9]+_[a-z]+.jpg", obj)
    pattern_two = re.findall("https:\\\\/\\\\/[a-z]+-[a-z]+-[a-z]+-[a-z]+.[a-z]+.[a-z]+\\\\/[a-z]+-[a-z]+-[a-z0-9]+\\\\/[a-z0-9]+\\\\/[0-9]+_[0-9]+_[0-9]+_[a-z]+.jpg", obj)
    nextLinks = []
    for i in pattern_one + pattern_two:
        # Strip the JSON escaping backslashes to recover the plain URL
        link = i.replace("\\", "")
        nextLinks.append(link)
        print link

    download_parallel(download, nextLinks, output)

    # Fewer than two links means there is no further page to fetch
    if len(nextLinks) < 2:
        return None
    last_link = nextLinks[-2]
    last_id = last_link.split("_")
    if last_id:
        return last_id[-3]
    return None


def main(url, output, load_url):
    links = []
    soup = BeautifulSoup(urlopen(url))
    parsed = list(urlparse.urlparse(url))
    # The image markup is embedded inside HTML comments, so search those
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    for comment in comments:
        test = BeautifulSoup(comment)
        nameTags = test.findAll('div', {"data-starred-src": True})
        if nameTags:
            for i in nameTags:
                links.append(i['data-starred-src'])

    download_parallel(download, links, output)

    # AJAX loading for further images
    lastImage = links[-2]
    ret = lastImage.split("_")
    lastid = extract_from_ajax(download, output, load_url, ret[-3])

    # Repeat the AJAX request until the last image is reached
    while lastid:
        lastid = extract_from_ajax(download, output, load_url, lastid)


if __name__ == "__main__":
    url = sys.argv[-1]
    # Change this to your download location
    output = "/Users/rkarth/downloads"
    all_parts = url.split("/")
    load_url = all_parts[-2]
    main(url, output, load_url)

--------------------------------------------------------------------------------