├── testing
├── test
├── test.sh
├── README.md
├── .gitignore
└── Facebook_Scraper.py

/testing:
--------------------------------------------------------------------------------
jdf

--------------------------------------------------------------------------------
/test:
--------------------------------------------------------------------------------

dbfdbf

--------------------------------------------------------------------------------
/test.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Print the subject lines of the two most recent commits
IFS=$'\n'
commits=($(git log -n 2 --pretty=%s))
echo "${commits[0]}"
echo "${commits[1]}"

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Facebook-images-scraper
=======================

Scrapes a Facebook page's images and saves them to your disk. The URL must be of the form http://www.facebook.com/the_page_you_need_to_scrape/photos_stream. Only the new version of Facebook pages is supported.

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.py[cod]

# C extensions
*.so

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg
lib
lib64

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox
nosetests.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject

--------------------------------------------------------------------------------
/Facebook_Scraper.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
from bs4 import BeautifulSoup, Comment
import urlparse
from urllib2 import urlopen
import urllib2
from urllib import urlretrieve
import os
import sys
import json
from threading import Thread
import httplib
import requests
import re


# Start one thread per image so the downloads run in parallel
def download_parallel(function, images, output):
    for i in images:
        Thread(target=function, args=(i, output,)).start()


# Download a single image into the output directory
def download(image, output):
    filename = image.split("/")[-1]
    outpath = os.path.join(output, filename)
    urlretrieve(image, outpath)


# Issue the AJAX request Facebook uses for infinite scrolling so we get the
# next batch of images in the photo stream to download
def extract_from_ajax(download, output, load_url, last_fbid):
    url = "http://graph.facebook.com/" + load_url
    connect = requests.get(url)
    o = connect.json()

    ajaxurl = "https://www.facebook.com/ajax/pagelet/generic.php/TimelinePhotosStreamPagelet?ajaxpipe=1&ajaxpipe_token=AXif26PCkG9XwzHk&no_script_path=1&data=%7B%22scroll_load%22%3Atrue%2C%22last_fbid%22%3A"+last_fbid+"%2C%22fetch_size%22%3A32%2C%22profile_id%22%3A"+o['id']+"%2C%22tab_key%22%3A%22photos_stream%22%2C%22sk%22%3A%22photos_stream%22%7D&__user=0&__a=1&__dyn=7w86i&__req=jsonp_2&__adt=2"

    connection = requests.get(ajaxurl)

    obj = connection.content
    pattern_one = re.findall("http:\\\\/\\\\/[a-z]+-[a-z]+.[a-z]+.[a-z]+.[a-z]+.\\\\/[a-z]+-[a-z]+-[a-z0-9]+\\\\/[a-z0-9]+\\\\/[0-9]+_[0-9]+_[0-9]+_[a-z]+.jpg", obj)
    pattern_two = re.findall("https:\\\\/\\\\/[a-z]+-[a-z]+-[a-z]+-[a-z]+.[a-z]+.[a-z]+\\\\/[a-z]+-[a-z]+-[a-z0-9]+\\\\/[a-z0-9]+\\\\/[0-9]+_[0-9]+_[0-9]+_[a-z]+.jpg", obj)
    nextLinks = []
    for i in pattern_one + pattern_two:
        # Strip the JSON escaping backslashes to recover the plain URL
        link = i.replace("\\", "")
        nextLinks.append(link)
        print link

    download_parallel(download, nextLinks, output)

    # Fewer than two links means there is no further page to fetch
    if len(nextLinks) < 2:
        return None
    last_link = nextLinks[-2]
    last_id = last_link.split("_")
    if last_id:
        return last_id[-3]
    return None


def main(url, output, load_url):
    links = []
    soup = BeautifulSoup(urlopen(url))
    parsed = list(urlparse.urlparse(url))
    # The image markup is embedded inside HTML comments, so search those
    comments = soup.findAll(text=lambda text: isinstance(text, Comment))
    for comment in comments:
        test = BeautifulSoup(comment)
        nameTags = test.findAll('div', {"data-starred-src": True})
        if nameTags:
            for i in nameTags:
                links.append(i['data-starred-src'])

    download_parallel(download, links, output)

    # AJAX loading for further images
    lastImage = links[-2]
    ret = lastImage.split("_")
    lastid = extract_from_ajax(download, output, load_url, ret[-3])

    # Repeat the AJAX request until the last image is reached
    while lastid:
        lastid = extract_from_ajax(download, output, load_url, lastid)


if __name__ == "__main__":
    url = sys.argv[-1]
    # Change this to your download location
    output = "/Users/rkarth/downloads"
    all_parts = url.split("/")
    load_url = all_parts[-2]
    main(url, output, load_url)

--------------------------------------------------------------------------------