├── 4chan.py
├── README.md
└── crontab


/4chan.py:
--------------------------------------------------------------------------------
 1 | from bs4 import BeautifulSoup
 2 | import urllib
 3 | import urllib2
 4 | import os
 5 | 
 6 | import time
 7 | 
 8 | start = time.time()
 9 | 
def _fetch_soup(url):
    """Download *url* and return its HTML parsed with the lxml parser."""
    response = urllib2.urlopen(url)
    try:
        return BeautifulSoup(response.read(), "lxml")
    finally:
        # Always release the connection, even if reading or parsing fails.
        response.close()


def get_threads(url, board, dir):
    """Download every thumbnail-linked image from each thread on a board page.

    The board index at ``url`` is scanned for ``a.replylink`` anchors. Each
    linked thread is fetched, and every ``a.fileThumb`` target in it is saved
    under ``<save_root>/thread/<thread id>/<file name>``.

    Args:
        url: Address of the board index page to scan for threads.
        board: Passed in by the caller but not used here; kept so the
            signature stays compatible.
        dir: Board name (e.g. ``"b"``) used to build each thread URL.

    Returns:
        None. Progress and skipped items are printed to stdout.

    Raises:
        urllib2.URLError: If the board index page itself cannot be fetched.
    """
    # Change this to the directory you want to save to. This was for a USB drive.
    save_root = "/media/4chan"

    index_page = _fetch_soup(url)

    for link in index_page.find_all("a", attrs={"class": "replylink"}):
        # The thread id is taken from the second '/'-separated component of
        # the href (presumably "thread/<id>"); skip anchors without one.
        href_parts = link['href'].split('/')
        if len(href_parts) < 2:
            continue
        thread_id = href_parts[1]
        print(thread_id)

        thread_url = "http://boards.4chan.org/" + dir + "/thread/" + thread_id
        try:
            thread_page = _fetch_soup(thread_url)
        except urllib2.URLError as error:
            # A thread may disappear between listing and fetching; one dead
            # thread should not abort the whole run.
            print("Skipping thread {}: {}".format(thread_id, error))
            continue

        thread_dir = os.path.join(save_root, "thread", thread_id)
        if not os.path.exists(thread_dir):
            os.makedirs(thread_dir)

        for image in thread_page.find_all("a", attrs={"class": "fileThumb"}):
            image_href = image['href']
            # Use the last path component as the file name so this works for
            # any board, not only hrefs containing "/b/".
            file_name = image_href.rsplit('/', 1)[-1]
            try:
                urllib.urlretrieve("http:" + image_href,
                                   os.path.join(thread_dir, file_name))
            except IOError as error:
                # Keep going when a single image fails to download.
                print("Skipping image {}: {}".format(image_href, error))
37 | 
38 | 
39 | 
# Sub-domains ("prepend") and board names ("append") to scrape; every
# sub-domain/board combination is visited once.
prepend = ["boards"]
append = ["b"]

for dir in append:
    directory_message = "This is the directory: " + dir
    for board in prepend:
        print(board)
        url = "http://{}.4chan.org/{}".format(board, dir)
        print(directory_message)
        get_threads(url, board, dir)

# Report the total wall-clock run time in seconds.
end = time.time()
print(end - start)
53 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 4chan-b-scraper
2 | Script that continually scrapes /b/.
3 | 
4 | Simply run the script and pick the directory to save files to.
5 | 


--------------------------------------------------------------------------------
/crontab:
--------------------------------------------------------------------------------
 1 | # Edit this file to introduce tasks to be run by cron.
 2 | #
 3 | # Each task to run has to be defined through a single line
 4 | # indicating with different fields when the task will be run
 5 | # and what command to run for the task
 6 | #
 7 | # To define the time you can provide concrete values for
 8 | # minute (m), hour (h), day of month (dom), month (mon),
 9 | # and day of week (dow) or use '*' in these fields (for 'any').
10 | # Notice that tasks will be started based on the cron's system
11 | # daemon's notion of time and timezones.
12 | #
13 | # Output of the crontab jobs (including errors) is sent through
14 | # email to the user the crontab file belongs to (unless redirected).
15 | #
16 | # For example, you can run a backup of all your user accounts
17 | # at 5 a.m every week with:
18 | # 0 5 * * 1 tar -zcf /var/backups/home.tgz /home/
19 | #
20 | # For more information see the manual pages of crontab(5) and cron(8)
21 | #
22 | # m h  dom mon dow   command
23 | 0 0 */2 * * python /home/craigslist/Rasp1/final_craigslist.py
24 | */10 * * * * python /home/4chan/main.py
25 | 


--------------------------------------------------------------------------------