├── 4chan.py ├── README.md └── crontab /4chan.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import urllib 3 | import urllib2 4 | import os 5 | 6 | import time 7 | 8 | start = time.time() 9 | 10 | def get_threads(url, board, dir): 11 | 12 | 13 | response = urllib2.urlopen(url) 14 | soup = BeautifulSoup(response.read(), "lxml") 15 | 16 | links = soup.find_all("a", attrs={"class": "replylink"}) 17 | 18 | for link in links: 19 | link_string = link['href'] 20 | thread_id = link_string.split('/') 21 | print thread_id[1] 22 | thread_url = "http://boards.4chan.org/" + dir + "/thread/" + thread_id[1] 23 | thread_response = urllib2.urlopen(thread_url) 24 | 25 | image_urls = BeautifulSoup(thread_response.read(), "lxml") 26 | images = image_urls.find_all("a", attrs={"class": "fileThumb"}) 27 | # Chage this to the path directory you want to save it to. This was for a usb drive. 28 | directory = os.path.dirname("/media/4chan/" + thread_id[1]) 29 | 30 | 31 | if not os.path.exists(directory + "/thread/" + thread_id[1]): 32 | os.makedirs(directory + "/thread/" + thread_id[1]) 33 | for image in images: 34 | string = image['href'] 35 | one = string.split('/b/') 36 | urllib.urlretrieve("http:" + image['href'], directory + "/thread/" + thread_id[1] + "/" + one[1]) 37 | 38 | 39 | 40 | prepend = ["boards",] 41 | append = ['b',] 42 | 43 | for dir in append: 44 | for board in prepend: 45 | print board 46 | url = "http://{}.4chan.org/{}".format(board, dir) 47 | print "This is the directory: " + dir 48 | get_threads(url, board, dir) 49 | 50 | 51 | end = time.time() 52 | print(end - start) 53 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 4chan-b-scraper 2 | script that continually scraps /b/ 3 | 4 | Simply run the script and pic the selected directory to save files to 5 | -------------------------------------------------------------------------------- /crontab: -------------------------------------------------------------------------------- 1 | # Edit this file to introduce tasks to be run by cron. 2 | # 3 | # Each task to run has to be defined through a single line 4 | # indicating with different fields when the task will be run 5 | # and what command to run for the task 6 | # 7 | # To define the time you can provide concrete values for 8 | # minute (m), hour (h), day of month (dom), month (mon), 9 | # and day of week (dow) or use '*' in these fields (for 'any').# 10 | # Notice that tasks will be started based on the cron's system 11 | # daemon's notion of time and timezones. 12 | # 13 | # Output of the crontab jobs (including errors) is sent through 14 | # email to the user the crontab file belongs to (unless redirected). 15 | # 16 | # For example, you can run a backup of all your user accounts 17 | # at 5 a.m every week with: 18 | # 0 5 * * 1 tar -zcf /var/backups/home.tgz /home/ 19 | # 20 | # For more information see the manual pages of crontab(5) and cron(8) 21 | # 22 | # m h dom mon dow command 23 | 0 0 */2 * * python /home/craigslist/Rasp1/final_craigslist.py 24 | */10 * * * * python /home/4chan/main.py 25 | --------------------------------------------------------------------------------