├── .gitignore ├── Python3 ├── TestData │ ├── Claymore-1.jpg │ ├── Claymore-2.jpg │ ├── Emilia-1.jpg │ ├── Emilia-10.png │ ├── Emilia-2.jpg │ ├── Emilia-3.jpg │ ├── Emilia-4.jpg │ ├── Emilia-5.jpg │ ├── Emilia-6.jpg │ ├── Emilia-7.jpg │ ├── Emilia-8.png │ ├── Emilia-9.png │ ├── Hellsing-1.jpg │ ├── Light-1.jpg │ ├── Makise-1.jpg │ ├── Re-Class Battleship.png │ ├── Re-Zero-1.jpg │ ├── Rem-1.png │ ├── Seras-1.jpg │ ├── Seras-2.jpg │ ├── Seras-3.jpg │ ├── Seras-4.jpg │ ├── Seras-5.jpg │ ├── Seras-6.jpg │ ├── Seras-7.jpg │ ├── Seras-8.jpg │ └── Wrong Aspect Ratio │ │ └── Emilia-1.png ├── functions.py └── iqdb.py ├── README.md ├── econtalk.org.py ├── functions.py ├── lyndaleigh.com.py ├── pantyhoseplaza.com.py ├── thechive.com.py ├── thesandbornmaps.cudl.colorado.edu.py ├── wall.alphacoders.com.py ├── wotd.dictionary.com.py └── x-art.com.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | .venv/ 83 | venv/ 84 | ENV/ 85 | 86 | # Spyder project settings 87 | .spyderproject 88 | 89 | # Rope project settings 90 | .ropeproject 91 | -------------------------------------------------------------------------------- /Python3/TestData/Claymore-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Claymore-1.jpg -------------------------------------------------------------------------------- /Python3/TestData/Claymore-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Claymore-2.jpg -------------------------------------------------------------------------------- /Python3/TestData/Emilia-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Emilia-1.jpg -------------------------------------------------------------------------------- /Python3/TestData/Emilia-10.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Emilia-10.png -------------------------------------------------------------------------------- /Python3/TestData/Emilia-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Emilia-2.jpg -------------------------------------------------------------------------------- /Python3/TestData/Emilia-3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Emilia-3.jpg -------------------------------------------------------------------------------- /Python3/TestData/Emilia-4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Emilia-4.jpg -------------------------------------------------------------------------------- /Python3/TestData/Emilia-5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Emilia-5.jpg -------------------------------------------------------------------------------- /Python3/TestData/Emilia-6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Emilia-6.jpg -------------------------------------------------------------------------------- /Python3/TestData/Emilia-7.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Emilia-7.jpg -------------------------------------------------------------------------------- /Python3/TestData/Emilia-8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Emilia-8.png -------------------------------------------------------------------------------- /Python3/TestData/Emilia-9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Emilia-9.png -------------------------------------------------------------------------------- /Python3/TestData/Hellsing-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Hellsing-1.jpg -------------------------------------------------------------------------------- /Python3/TestData/Light-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Light-1.jpg 
-------------------------------------------------------------------------------- /Python3/TestData/Makise-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Makise-1.jpg -------------------------------------------------------------------------------- /Python3/TestData/Re-Class Battleship.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Re-Class Battleship.png -------------------------------------------------------------------------------- /Python3/TestData/Re-Zero-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Re-Zero-1.jpg -------------------------------------------------------------------------------- /Python3/TestData/Rem-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Rem-1.png -------------------------------------------------------------------------------- /Python3/TestData/Seras-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Seras-1.jpg -------------------------------------------------------------------------------- /Python3/TestData/Seras-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Seras-2.jpg -------------------------------------------------------------------------------- /Python3/TestData/Seras-3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Seras-3.jpg -------------------------------------------------------------------------------- /Python3/TestData/Seras-4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Seras-4.jpg -------------------------------------------------------------------------------- /Python3/TestData/Seras-5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Seras-5.jpg -------------------------------------------------------------------------------- /Python3/TestData/Seras-6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Seras-6.jpg -------------------------------------------------------------------------------- /Python3/TestData/Seras-7.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Seras-7.jpg -------------------------------------------------------------------------------- /Python3/TestData/Seras-8.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Seras-8.jpg -------------------------------------------------------------------------------- /Python3/TestData/Wrong Aspect Ratio/Emilia-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Wrong Aspect Ratio/Emilia-1.png -------------------------------------------------------------------------------- /Python3/functions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | from bs4 import BeautifulSoup 3 | import requests 4 | 5 | def getSoup(url, data_ = {}, file = {}): 6 | r = requests.post(url, data=data_, files=file) 7 | return BeautifulSoup(r.text, 'lxml') 8 | -------------------------------------------------------------------------------- /Python3/iqdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import getopt, requests, sys 3 | from bs4 import BeautifulSoup 4 | from functions import getSoup 5 | 6 | def usage(): 7 | print("./iqdb.py -i ") 8 | 9 | imageFile = None 10 | 11 | try: 12 | opts, args = getopt.getopt(sys.argv[1:], 'hi:') 13 | except getopt.GetoptError: 14 | usage() 15 | sys.exit(2) 16 | for opt, arg in opts: 17 | if opt == '-h': 18 | usage() 19 | sys.exit() 20 | elif opt == '-i': 21 | imageFile = arg 22 | else: 23 | print("Unsupported option and/or argument") 24 | sys.exit(2) 25 | 26 | print("Input file is: " + imageFile) 27 | iqdbSoup = getSoup("http://iqdb.org/", {}, {'file': open(imageFile, 'rb')}) 28 | #print(iqdbSoup.find('div', {'class': 'pages'}).prettify()) 29 | for result in iqdbSoup.find('div', {'class': 'pages'}).findAll('table'): 30 | t1 = result.findAll('tr')[0].findAll('th')[0].text 31 | if t1 != "Your image": 32 | #print(result.prettify()) 33 | print("Image Info:") 34 | print("\t" + t1) 35 | t2 = result.find('td', {'class': 'image'}).find('a')['href'] 36 | if t2[:2] == "//": 37 | t2 = "http:" + t2 38 | print("\t\tSource:\t\t" + t2) 39 | t3 = result.find('img', {'class': 'service-icon'}).nextSibling 40 | print("\t\tSource Page:\t" + t3) 41 | whs = result.findAll('tr')[3].find('td').text.split(' ') 42 | width = int(whs[0].split('×')[0]) 43 | height = int(whs[0].split('×')[1]) 44 | safe = whs[1][1:-1] 45 | print("\t\tWidth:\t\t" + str(width)) 46 | print("\t\tHeight:\t\t" + str(height)) 47 | print("\t\tSafety Status:\t" + safe) 48 | 49 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Scraping-Scripts 2 | ## Disclaimer 3 | These scripts are provided as-is. I assume no liability of any damage made by usage of these scripts. 
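The `Python3` folder also contains `iqdb.py`, a reverse-image lookup built on the small `getSoup` helper in `Python3/functions.py`: it POSTs an image to iqdb.org and walks the result tables for the source links. A minimal sketch of that flow (the helper is inlined here, the image path is only a placeholder, and error handling is omitted):

```python
#!/usr/bin/python3
# Minimal sketch of the iqdb.py flow: POST an image to iqdb.org and
# list the matches it returns. "TestData/Emilia-1.jpg" is only an example path.
from bs4 import BeautifulSoup
import requests

def get_soup(url, data=None, files=None):
    # Same idea as getSoup in Python3/functions.py: POST, then parse with lxml.
    r = requests.post(url, data=data or {}, files=files or {})
    return BeautifulSoup(r.text, "lxml")

with open("TestData/Emilia-1.jpg", "rb") as image:
    soup = get_soup("http://iqdb.org/", files={"file": image})

for table in soup.find("div", {"class": "pages"}).find_all("table"):
    header = table.find("th")
    if header and header.text != "Your image":
        link = table.find("td", {"class": "image"}).find("a")["href"]
        print(header.text, "http:" + link if link.startswith("//") else link)
```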
4 | ## Usage
5 | ### You need the functions.py file for almost every script
6 | ### pantyhoseplaza.com.py
7 | **Edit the script to define your download directory**
8 | The script takes three parameters:
9 | - The page number of the Movie Episodes category
10 | - The username to use to log in
11 | - The password to use to log in
12 | 
13 | The script downloads the JSON-ized metadata, the thumbnail and the videos in all available qualities.
14 | If you have any suggestions, submit a pull request!
15 | 
16 | **IF ANY ERRORS POP UP, RERUN THE SCRIPT; IF THEY KEEP OCCURRING, SUBMIT AN ISSUE**
17 | Thx :)
18 | 
19 | ### wotd.dictionary.com.py
20 | *Requested by /u/nsq1*
21 | 
22 | Three different usages:
23 | - All until today: python wotd.dictionary.com.py "/mnt/what/ever/directory/"
24 | - Specific date: python wotd.dictionary.com.py "/mnt/what/ever/directory/" yyyy/mm/dd -single
25 | - Date range: python wotd.dictionary.com.py "/mnt/what/ever/directory/" yyyy/mm/dd yyyy/mm/dd
26 | 
27 | There is currently one problem with the script: any date before 2014/03/01 throws a 403. If anyone finds a way to fix it, submit a pull request!
28 | ### thesandbornmaps.cudl.colorado.edu.py
29 | *Requested by /u/WhiskeyQuebec*
30 | 
31 | Options and arguments:
32 | - -s, --simple Constructs the simple/flat directory structure
33 | - -h, --help Shows this text
34 | - --from= Start at the given document number
35 | - --to= End with the given document number
36 | - --save-dir= Store at this location
37 | 
38 | ### wall.alphacoders.com.py
39 | *Requested by myself :P*
40 | 
41 | Options and arguments:
42 | - -h, --help Shows this printout
43 | - --update Stops at the first wallpaper that is already downloaded
44 | - --save-dir= Store at this location
45 | 
46 | ### thechive.com.py
47 | *Requested by /u/Broadsid3*
48 | 
49 | No options or arguments. Just run the script. I'll add them later when I have more time.
50 | All posts with no valid date format will be stored in the NonParsable folder.
51 | 
52 | ## Donate
53 | If you like my work and want to donate, here's the button! :)
54 | Actually there is no button. I have a personal PayPal account and can't set it up.
55 | Here's the email instead: nicba1010@gmail.com 56 | -------------------------------------------------------------------------------- /econtalk.org.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | from bs4 import BeautifulSoup 4 | from functions import getSoup, fileDl, ensureDir 5 | from datetime import datetime 6 | 7 | baseDir = "/root/econtalk.org/" 8 | baseUrl = "http://www.econtalk.org/" 9 | archiveSoup = getSoup(baseUrl + "archives.html") 10 | 11 | tableRows = archiveSoup.find('div', {'class': 'archive-individual archive-date-based archive'}).findAll('tr') 12 | for tableRow in tableRows: 13 | if tableRows.index(tableRow) == 0: 14 | continue 15 | date = datetime.strptime(tableRow.find('td', {'width': '5%'}).text.strip(), "%Y/%m/%d") 16 | extra = len(tableRow.findAll('td')[2].text.strip()) != 0 17 | name = tableRow.find('a').text 18 | dirName = date.strftime("%Y-%m-%d") + (" Extra " if extra else " ") + "- " + name + "/" 19 | url = tableRow.find('a')['href'] 20 | ensureDir(baseDir + dirName) 21 | print(dirName[:-1]) 22 | if not extra: 23 | podcastSoup = getSoup(url) 24 | url1 = podcastSoup.find('a', text="Download")['href'] 25 | print("\t" + url1) 26 | fileDl(url1, baseDir + dirName, "\t\t") 27 | print("\t" + url) 28 | fileDl(url, baseDir + dirName, "\t\t") 29 | -------------------------------------------------------------------------------- /functions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import urllib2, os, sys, math, urllib 3 | from bs4 import BeautifulSoup 4 | from datetime import timedelta, date 5 | import time 6 | 7 | class outputcolors: 8 | OKGREEN = '\033[92m' 9 | OKBLUE = '\033[94m' 10 | WARNING = '\033[93m' 11 | FAIL = '\033[91m' 12 | ENDC = '\033[0m' 13 | 14 | def getStatus(url): 15 | try: 16 | connection = urllib2.urlopen(url) 17 | code = connection.getcode() 18 | connection.close() 19 | return code 20 | except urllib2.HTTPError, e: 21 | return e.getcode() 22 | 23 | def roundUpTo(x, base): 24 | return int(base * math.ceil(float(x) / base)) 25 | 26 | def roundDownTo(x, base): 27 | return int(base * math.floor(float(x) / base)) 28 | 29 | def ensureDir(f): 30 | if not os.path.exists(f): 31 | os.makedirs(f) 32 | 33 | def replaceTab(s, tabstop = 4): 34 | result = str() 35 | for c in s: 36 | if c == '\t': 37 | while (len(result) % tabstop != 0): 38 | result += ' '; 39 | else: 40 | result += c 41 | return result 42 | 43 | def fileDl(url, dir, prepend, fileName = "?"): 44 | if fileName == "?": 45 | fileName = url.split('/')[-1] 46 | request = urllib2.Request(url) 47 | u = urllib2.urlopen(request) 48 | meta = u.info() 49 | fileSize = -1 50 | try: 51 | fileSize = int(meta.getheaders("Content-Length")[0]) 52 | except Exception: 53 | pass 54 | if os.path.exists(dir + fileName): 55 | if os.stat(dir + fileName).st_size == fileSize: 56 | print(prepend + outputcolors.OKBLUE + "File already downloaded!" + outputcolors.ENDC) 57 | return 42 58 | else: 59 | print(prepend + outputcolors.WARNING + "File downloaded but not fully! Restarting download..." + outputcolors.ENDC) 60 | else: 61 | print(prepend + outputcolors.WARNING + "Downloading file..." + outputcolors.ENDC) 62 | fileHandle = open(dir + fileName, 'wb') 63 | print(prepend + ("Downloading: %s Bytes: %s" % (fileName, "???" 
if (fileSize == -1) else fileSize))) 64 | fileSizeDl = 0 65 | blockSize = 65536 66 | while True: 67 | buffer = u.read(blockSize) 68 | if not buffer: 69 | break 70 | fileSizeDl += len(buffer) 71 | fileHandle.write(buffer) 72 | status = prepend + r"%12d [%3.2f%%]" % (fileSizeDl, -1.0 if (fileSize == -1) else (fileSizeDl * 100. / fileSize)) 73 | status = "\r" + status 74 | print status, 75 | fileHandle.close() 76 | print("\n" + prepend + outputcolors.OKGREEN + "Done :)" + outputcolors.ENDC) 77 | return 1 78 | 79 | def fileDlWithAuth(url, auth, dir, prepend): 80 | fileName = url.split('/')[-1] 81 | request = urllib2.Request(url) 82 | request.add_header("Authorization", "Basic %s" % auth) 83 | u = urllib2.urlopen(request) 84 | meta = u.info() 85 | fileSize = -1 86 | try: 87 | fileSize = int(meta.getheaders("Content-Length")[0]) 88 | except Exception: 89 | pass 90 | if os.path.exists(dir + fileName): 91 | if os.stat(dir + fileName).st_size == fileSize: 92 | print(prepend + outputcolors.OKBLUE + "File already downloaded!" + outputcolors.ENDC) 93 | return 42 94 | else: 95 | print(prepend + outputcolors.WARNING + "File downloaded but not fully! Restarting download..." + outputcolors.ENDC) 96 | else: 97 | print(prepend + outputcolors.WARNING + "Downloading file..." + outputcolors.ENDC) 98 | fileHandle = open(dir + fileName, 'wb') 99 | print(prepend + ("Downloading: %s Bytes: %s" % (fileName, "???" if (fileSize == -1) else fileSize))) 100 | fileSizeDl = 0 101 | blockSize = 65536 102 | while True: 103 | buffer = u.read(blockSize) 104 | if not buffer: 105 | break 106 | fileSizeDl += len(buffer) 107 | fileHandle.write(buffer) 108 | status = prepend + r"%12d [%3.2f%%]" % (fileSizeDl, -1.0 if (fileSize == -1) else (fileSizeDl * 100. / fileSize)) 109 | status = "\r" + status 110 | print status, 111 | fileHandle.close() 112 | print("\n" + prepend + outputcolors.OKGREEN + "Done :)" + outputcolors.ENDC) 113 | 114 | def getSoup(url): 115 | try: 116 | return BeautifulSoup(urllib2.urlopen(urllib2.Request(url)), "lxml") 117 | except urllib2.HTTPError, e: 118 | print("retrying in 5s") 119 | time.sleep(5) 120 | return getSoup(url) 121 | 122 | def daterange(start_date, end_date): 123 | for n in range(int ((end_date - start_date).days)): 124 | yield start_date + timedelta(n) 125 | -------------------------------------------------------------------------------- /lyndaleigh.com.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | from requests.auth import HTTPBasicAuth 4 | import requests 5 | import sys 6 | from bs4 import BeautifulSoup 7 | import getpass 8 | import time 9 | import magic 10 | import os 11 | import mimetypes 12 | import re 13 | from datetime import datetime 14 | import json 15 | 16 | def ensureDir(directory): 17 | if not os.path.exists(directory): 18 | os.makedirs(directory) 19 | def sCodeChk(resp, prepend = ""): 20 | if DEBUG: 21 | print(prepend + "URL: {}\n".format(resp.url) + prepend + " Response Code: {:3d}".format(resp.status_code)) 22 | return resp.status_code 23 | def ensureLoad(url, s, prepend = ""): 24 | loaded = s.get(url) 25 | bs = BeautifulSoup(loaded.content, 'lxml') 26 | if sCodeChk(loaded, prepend) != 200 or len(bs.prettify()) < 400: 27 | print(prepend + "Retrying in 15 seconds... 
Bs Size " + str(len(bs.prettify()))) 28 | time.sleep(15) 29 | loaded = ensureLoad(url, s) #Recursive call 30 | return loaded 31 | def fileDL(url, file_name, s): 32 | he = 0 33 | with open(file_name, "wb") as f: 34 | print("Downloading {}...".format(file_name)) 35 | response = s.get(url, stream=True) 36 | total_length = response.headers.get('content-length') 37 | print(response.headers) 38 | if total_length is None: 39 | f.write(response.content) 40 | else: 41 | dl = 0 42 | total_length = int(total_length) 43 | for data in response.iter_content(chunk_size=4096): 44 | dl += len(data) 45 | f.write(data) 46 | done = int(50 * dl / total_length) 47 | sys.stdout.write("\r[%s%s]" % ('=' * done, ' ' * (50-done)) ) 48 | sys.stdout.flush() 49 | he = response.headers 50 | return he 51 | base = "http://lyndaleigh.com" 52 | base_url = base + "/members/" 53 | session = requests.Session() 54 | 55 | username = input("Username: ") 56 | password = getpass.getpass("Password: ") 57 | DEBUG = True 58 | basefolder = input("Save Folder: ") 59 | 60 | session.auth = (username, password) 61 | 62 | latest = ensureLoad(base_url + "index.php/latest-updates.html", session) 63 | 64 | while sCodeChk(latest) == 500: 65 | print("Sleeping for 5 seconds...") 66 | time.sleep(5) 67 | latest = session.get(base_url + "index.php/latest-updates.html") 68 | 69 | latest_soup = BeautifulSoup(latest.content, 'lxml') 70 | last_page = int(int(latest_soup.find('a', { 'title' : "End" })['href'].split('=')[-1]) / 45) 71 | 72 | base_gallery_url = base_url + "index.php/latest-updates.html?start=" 73 | 74 | for page in range(last_page + 1): 75 | gallery_page_url = base_gallery_url + str(page * 45) 76 | gallery_page = ensureLoad(gallery_page_url, session) 77 | gallery_page_soup = BeautifulSoup(gallery_page.content, 'lxml') 78 | item_num = 0 79 | for item in gallery_page_soup.findAll('div', { 'class' : 'itemContainer' }): 80 | item_num = item_num + 1 81 | item_type = item.findAll('img', { 'class' : 'uk-responsive-width uk-align-center' })[0]['alt'] 82 | is_video = True if item_type == "Lynda Leigh Video Update" else False 83 | item_url = base + item.find('a', { 'class' : 'uk-thumbnail' })['href'] 84 | item_text_part = item.find('div', { 'class' : 'uk-thumbnail-caption' }) 85 | item_title = item_text_part.findAll('strong')[0].text.strip().replace('/', " of ") 86 | item_desc = item_text_part.find('span', { 'style' : ['font-size: 8pt; line-height: 6px;', 'font-size:9pt;', 'font-size: 8pt; line-height: 5px;', 'font-size: 9pt'] }).text.strip() 87 | item_date = "" 88 | try: 89 | item_date = (datetime.strptime(item_text_part.find(text=re.compile(r'ADDED')).parent.nextSibling.strip(), "%d-%b-%y")).strftime("%Y-%m-%d") 90 | except ValueError: 91 | item_date = (datetime.strptime(item_text_part.find(text=re.compile(r'ADDED')).parent.nextSibling.strip(), "%d-%m-%y")).strftime("%Y-%m-%d") 92 | print('Page {:2d} item {:2d}\n\tType: \t{}\n\tTitle: \t{}\n\tDesc: \t{}'.format(page + 1, item_num, item_type, item_title, item_desc)) 93 | info = {'isvideo': is_video, 'url': item_url, 'name': item_title, 'description': item_desc, 'date': item_date, 'size': 0} 94 | item_page = ensureLoad(item_url, session, "\t") 95 | item_page_soup = BeautifulSoup(item_page.content, 'lxml') 96 | high_res_dl = "" 97 | if is_video and len(item_page_soup.findAll(text='Live Members ONLY..!')) == 0: 98 | #VIDEO DOWNLOAD AND PARSE 99 | try: 100 | high_res_dl = base + item_page_soup.find(text='1080p MPEG').parent.parent['href'] 101 | except AttributeError: 102 | try: 103 | high_res_dl = base 
+ item_page_soup.find(text='MPEG').parent['href'] 104 | except AttributeError: 105 | try: 106 | high_res_dl = base + item_page_soup.find(text='HD MPEG ').parent['href'] 107 | except AttributeError: 108 | try: 109 | high_res_dl = base + item_page_soup.find(text='MPEG 1.4gb').parent['href'] 110 | except AttributeError: 111 | try: 112 | high_res_dl = base + item_page_soup.find(text='MP4').parent['href'] 113 | except AttributeError: 114 | try: 115 | high_res_dl = base + item_page_soup.find(text='720p MPEG').parent.parent['href'] 116 | except AttributeError: 117 | try: 118 | high_res_dl = base + item_page_soup.find(text='Right Click this link to save LOW RES 720p movie').parent['href'] 119 | except AttributeError: 120 | try: 121 | high_res_dl = base + item_page_soup.find(text='HD MPEG 808mb').parent['href'] 122 | except AttributeError: 123 | try: 124 | high_res_dl = item_page_soup.find('source')['src'] 125 | except AttributeError: 126 | print("ERROR") 127 | exit() 128 | except KeyError: 129 | high_res_dl = base + item_page_soup.find(text='1080p MPEG').parent['href'] 130 | if len(high_res_dl): 131 | directory = basefolder + item_date + " " + item_title + "/" 132 | ensureDir(directory) 133 | header = fileDL(high_res_dl, directory + item_title + ".undef", session) 134 | info['size'] = header.get('content-length') 135 | mime = magic.Magic(mime=True) 136 | print("\nExtension detected: " + mime.from_file(directory + item_title + ".undef")) 137 | os.rename(directory + item_title + ".undef", os.path.splitext(directory + item_title + ".undef")[0] + mimetypes.guess_extension(mime.from_file(directory + item_title + ".undef"))) 138 | with open(directory + "data.json", 'w') as outfile: 139 | json.dump(info, outfile) 140 | elif len(item_page_soup.findAll(text='Live Members ONLY..!')) == 0: 141 | #PHOTO DOWNLOAD AND PARSE 142 | try: 143 | high_res_dl = base + item_page_soup.find(text='Hi Res').parent['href'] 144 | except AttributeError: 145 | try: 146 | high_res_dl = base + item_page_soup.find(text='Web Res').parent['href'] 147 | except AttributeError: 148 | try: 149 | high_res_dl = base + item_page_soup.find(text='Med Res').parent['href'] 150 | except AttributeError: 151 | try: 152 | high_res_dl = base + item_page_soup.find(text='Low Res').parent['href'] 153 | except AttributeError: 154 | exit() 155 | print(item_page_soup.prettify()) 156 | print(str(len(item_page_soup.prettify()))) 157 | if len(high_res_dl): 158 | directory = basefolder + item_date + " - Gallery - " + item_title + "/" 159 | ensureDir(directory) 160 | header = fileDL(high_res_dl, directory + item_title + ".undef", session) 161 | info['size'] = header.get('content-length') 162 | mime = magic.Magic(mime=True) 163 | print("\nExtension detected: " + mime.from_file(directory + item_title + ".undef")) 164 | os.rename(directory + item_title + ".undef", os.path.splitext(directory + item_title + ".undef")[0] + mimetypes.guess_extension(mime.from_file(directory + item_title + ".undef"))) 165 | with open(directory + "data.json", 'w') as outfile: 166 | json.dump(info, outfile) 167 | -------------------------------------------------------------------------------- /pantyhoseplaza.com.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import urllib2, base64, sys, re, os, json 3 | from bs4 import BeautifulSoup 4 | from functions import outputcolors, ensureDir, fileDlWithAuth 5 | 6 | if len(sys.argv) != 4: 7 | print("Scraping script for pantyhoseplaza.com porn size.") 8 | print("Usage:\n\tpython 
pantyhoseplaza.com.py pageNumber username password") 9 | sys.exit() 10 | username = sys.argv[2] 11 | password = sys.argv[3] 12 | baseUrl = "http://www.pantyhoseplaza.com/members/" 13 | baseDir = "/mnt/san/" 14 | regex = re.compile(".*Format.*") 15 | 16 | request = urllib2.Request(baseUrl + "content.php?show=videos§ion=37&page=" + sys.argv[1]) 17 | base64string = base64.encodestring('%s:%s' % (username, password)).replace('\n', '') 18 | request.add_header("Authorization", "Basic %s" % base64string) 19 | result = urllib2.urlopen(request) 20 | 21 | rootSoup = BeautifulSoup(result, "lxml") 22 | 23 | for table in rootSoup.findAll('table', { "bgcolor" : "#1d1d1d" }): 24 | anchor = table.find('a', { "style" : "color:#FF0000" }) 25 | name = anchor.text.strip() 26 | videoUrl = baseUrl + anchor['href'] 27 | description = table.findAll('tr')[1].find('div').text.strip() 28 | dirName = baseDir + "pantyhoseplaza.com/" + name.replace(":", "") 29 | ensureDir(dirName) 30 | print(name + "\n\t" + videoUrl) 31 | requestVid = urllib2.Request(videoUrl) 32 | requestVid.add_header("Authorization", "Basic %s" % base64string) 33 | resultVid = urllib2.urlopen(requestVid) 34 | vidSoup = BeautifulSoup(resultVid, "lxml") 35 | imageUrl = baseUrl + vidSoup.find('img', { "style" : "border-color:#990000" })['src'] 36 | print("\tIMAGE: " + imageUrl) 37 | fileDlWithAuth(imageUrl, base64string, dirName + "/", "\t") 38 | data = {'Name' : name, 'Description' : description} 39 | with open(dirName + '/data.json', 'w') as outfile: 40 | json.dump(data, outfile) 41 | for vidDiv in vidSoup.findAll('div'): 42 | if regex.match(vidDiv.text.strip()): 43 | trueVideoUrl = baseUrl + vidDiv.find('a')['href'] 44 | videoSize = vidDiv.findAll('a')[1].text.strip() 45 | print("\t\t" + videoSize + " => " + trueVideoUrl) 46 | trueVidRequest = urllib2.Request(trueVideoUrl) 47 | trueVidRequest.add_header("Authorization", "Basic %s" % base64string) 48 | trueVidResult = urllib2.urlopen(trueVidRequest) 49 | trueVidSoup = BeautifulSoup(trueVidResult, "lxml") 50 | trueVideoDownloadUrl = baseUrl + trueVidSoup.find('a', text="Click here to download the full length video!")['href'] 51 | print("\t\t\tVIDEO SOURCE URL: " + trueVideoDownloadUrl) 52 | fileDlWithAuth(trueVideoDownloadUrl, base64string, dirName + "/", "\t\t\t") 53 | -------------------------------------------------------------------------------- /thechive.com.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import division 3 | import sys 4 | from datetime import datetime 5 | from functions import getSoup, getStatus, fileDl, ensureDir 6 | 7 | baseDir = "/root/thechive.com/" 8 | baseUrl = "http://thechive.com/page/" 9 | filter = ("Photo", "Photos", "photos", "photo") 10 | 11 | def findLastPage(increment = 1000, last = 0, lastStatus = 200): 12 | imul = 1 13 | if lastStatus == 200: 14 | shouldBe = 404 15 | else: 16 | shouldBe = 200 17 | imul = -1 18 | lastPlusInc = increment * imul + last 19 | newStatus = getStatus(baseUrl + str(lastPlusInc) + "/") 20 | newIncr = increment // 2 if shouldBe == newStatus else increment 21 | print ("\t" + str(newStatus) + " @ " + str(lastPlusInc) + " next status should be " + str(shouldBe) + ", in/decrement will be " + str(newIncr)) 22 | if increment == 1: 23 | if lastStatus == 200: 24 | if getStatus(baseUrl + str(last + 1) + "/") == 404: 25 | return last 26 | newIncr = 1 27 | return findLastPage(newIncr, lastPlusInc, newStatus) 28 | 29 | print("Invoking findLastPage()") 30 | lastPage = 
findLastPage() 31 | print("Last page is " + str(lastPage)) 32 | 33 | for page in range(0, lastPage + 1): 34 | pageSoup = getSoup("http://thechive.com/page/" + str(page) + "/") 35 | print("Page " + str(page) + " of " + str(lastPage)) 36 | for article in pageSoup.findAll('article', { "role" : "article" }): 37 | date = article.find('time').text.strip() 38 | h3 = article.find('h3', { "class" : "post-title entry-title card-title" }) 39 | name = h3.text.strip() 40 | url = h3.find('a')['href'] 41 | if any(x in name for x in filter): 42 | print("\tName: " + name + "\n\t\tDate: " + date) 43 | dateFolder = "NonParsable/" 44 | try: 45 | dateFolder = datetime.strptime(date, '%b %d, %Y').strftime("%Y/%m/%d/") 46 | except ValueError: 47 | print("\t\tGoing to NonParsable folder") 48 | ensureDir(baseDir + dateFolder + name + "/") 49 | postSoup = getSoup(url) 50 | for countTag in postSoup.findAll('div', { "class" : "count-tag" }): 51 | try: 52 | img = countTag.parent.find('img') 53 | imgSrc = img['src'].split('?')[0] + "?quality=100" 54 | imgName = img['src'].split('?')[0].split('/')[-1] 55 | if any(x in img['class'] for x in { "gif-animate" }): 56 | imgSrc = img['data-gifsrc'].split('?')[0] 57 | imgName = img['data-gifsrc'].split('?')[0].split('/')[-1] 58 | print("\t\t\tImage" + countTag.text + ": " + imgSrc) 59 | fileDl(imgSrc, baseDir + dateFolder + name + "/", "\t\t\t\t", imgName) 60 | except: 61 | 62 | pass 63 | -------------------------------------------------------------------------------- /thesandbornmaps.cudl.colorado.edu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import urllib2, sys, re, getopt 3 | from bs4 import BeautifulSoup 4 | from functions import getSoup, fileDl, ensureDir, roundUpTo, roundDownTo 5 | 6 | baseUrl = "http://cudl.colorado.edu/luna/servlet/view/all?sort=city%2Cdate%2Csheet&os=" 7 | baseDir = "/root/maps/" 8 | 9 | def usage(): 10 | print("University of Colorado Sanborn Maps Scraper. Requested by /u/WhiskeyQuebec. Made by /u/nicba1010.") 11 | print("Specify directory or it'll all go to your root folder!!!") 12 | print("Options available: ") 13 | print("\t-s, --simple\tConstructs the simple/flat directory structure") 14 | print("\t-h, --help\tShows this text") 15 | print("\t--from=\t\tStart at the given document number") 16 | print("\t--to=\t\tEnd with the given document number") 17 | print("\t--save-dir=\tStore at this location") 18 | 19 | startDoc = 0 20 | endDoc = -1 21 | simple = False 22 | 23 | try: 24 | opts, args = getopt.getopt(sys.argv[1:], "hs", ["help", "simple", "from=", "to=", "save-dir="]) 25 | except getopt.GetoptError as err: 26 | print str(err) 27 | usage() 28 | sys.exit() 29 | 30 | for o, a in opts: 31 | if o in ("-s", "--simple"): 32 | simple = True 33 | elif o in ("-h", "--help"): 34 | usage() 35 | sys.exit() 36 | elif o == "--from": 37 | startDoc = int(a) 38 | elif o == "--to": 39 | endDoc = int(a) 40 | elif o == "--save-dir": 41 | baseDir = a 42 | else: 43 | usage() 44 | assert False, "unhandled option" 45 | sys.exit() 46 | 47 | ensureDir(baseDir) 48 | documentSoup = getSoup(baseUrl + "0") 49 | documentTotal = int(documentSoup.find('div', { "id" : "PageRange" }).text.split('of')[1].strip().replace(',','')) 50 | print str(documentTotal) + " documents to download. Let's get started!" 
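# How the 50-per-page windowing below works (illustrative numbers only, not
# taken from the script): the listing is paged via the "os=" offset, so the
# requested --from/--to range is snapped to 50-document boundaries with
# roundDownTo/roundUpTo (imported from functions.py above), and documents
# outside the range are skipped one by one inside the loop.
# For example, assuming --from=73 and --to=128:
#   roundDownTo(73, 50) == 50    # offset of the page containing document 73
#   roundUpTo(128, 50) == 150    # first offset past the last wanted document
# so only os=50 and os=100 are fetched, and documents 51-72 are skipped.
assert roundDownTo(73, 50) == 50 and roundUpTo(128, 50) == 150  # sanity check of the rounding helpers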
51 | if endDoc == -1: 52 | endDoc = documentTotal 53 | 54 | documentNum = 1 + roundDownTo(startDoc, 50) 55 | print("Scanning range from " + str(roundDownTo(startDoc, 50)) + " to " + str(documentTotal if roundUpTo(endDoc, 50) > documentTotal else roundUpTo(endDoc, 50))) 56 | for i in range(roundDownTo(startDoc, 50), documentTotal if roundUpTo(endDoc, 50) > documentTotal else roundUpTo(endDoc, 50), 50): 57 | print("Documents " + str(i) + " to " + str(i+50)) 58 | groupSoup = getSoup(baseUrl + str(i)) 59 | for mediaContainer in groupSoup.findAll('div', { "class" : "mediaContainer" }): 60 | if documentNum < startDoc: 61 | documentNum += 1 62 | print("Skiping Document " + str(documentNum)) 63 | continue 64 | elif documentNum > endDoc: 65 | print("My job here is done!") 66 | sys.exit(420) 67 | print("\tDocument " + str(documentNum)) 68 | documentNum += 1 69 | blockQuotes = mediaContainer.findAll('blockquote') 70 | try: 71 | print("\t\tCity: \t\t" + blockQuotes[0].text.strip()) 72 | print("\t\tDate: \t\t" + blockQuotes[1].text.strip()) 73 | print("\t\tSheet: \t\t" + blockQuotes[2].text.strip()) 74 | except Exception: 75 | pass 76 | singleDocumentUrl = mediaContainer.find('a')['href'] 77 | print("\t\tDoc Url: \t" + singleDocumentUrl) 78 | singleDocumentSoup = getSoup(singleDocumentUrl) 79 | theJavaScript = singleDocumentSoup.find('div', { "class" : "controlStrip" }).nextSibling.nextSibling 80 | theJP2Url = str(theJavaScript).split("openPdfInWindow")[1].splitlines()[5].strip()[11:-38] 81 | print("\t\tJP2 Url: \t" + theJP2Url) 82 | theXMLUrl = "" 83 | theXMLId = theJP2Url.split('/')[-1][:-4] 84 | if theXMLId == "bou00003": 85 | theXMLUrl = "http://ucblibraries.colorado.edu/systems/digitalinitiatives/xml/bou00.xml" 86 | else: 87 | try: 88 | theXMLUrl = singleDocumentSoup.find('td', text=re.compile(r'METS XML View')).parent.nextSibling.nextSibling.find('a')['href'] 89 | except TypeError: 90 | theXMLUrl = raw_input("\t\tThe XML is not valid, check the page manually and try to find the real XML file, input the url here: ") 91 | print("\t\tXML Url: \t" + str(theXMLUrl)) 92 | if not simple: 93 | ensureDir(baseDir + blockQuotes[0].text.strip() + "/" + blockQuotes[1].text.strip() + "/") 94 | fileDl(theXMLUrl, baseDir if simple else (baseDir + blockQuotes[0].text.strip() + "/" + blockQuotes[1].text.strip() + "/"), "\t\t\t") 95 | print("\t\tXML ID: \t" + theXMLId) 96 | xmlSoup = getSoup(theXMLUrl) 97 | admId = xmlSoup.find('filegrp').find('file', { "id" : theXMLId })['admid'] 98 | imageWidth = int(xmlSoup.find('techmd', { "id" : admId }).find('imagewidth').text.strip()) 99 | imageHeight = int(xmlSoup.find('techmd', { "id" : admId }).find('imagelength').text.strip()) 100 | print("\t\tWidth: \t\t" + str(imageWidth)) 101 | print("\t\tHeight: \t" + str(imageHeight)) 102 | finalUrl = theJP2Url + "&x=0&y=0&width=" + str(imageWidth) + "&height=" + str(imageHeight) 103 | print("\t\tFinal Url: \t" + finalUrl) 104 | fileDl(finalUrl, baseDir if simple else (baseDir + blockQuotes[0].text.strip() + "/" + blockQuotes[1].text.strip() + "/"), "\t\t\t", theXMLId + ".jp2") 105 | -------------------------------------------------------------------------------- /wall.alphacoders.com.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys, getopt 3 | from bs4 import BeautifulSoup 4 | from functions import getSoup, ensureDir, fileDl 5 | 6 | def usage(): 7 | print("wall.alphacoders.com scraping script. 
Made by /u/nicba1010") 8 | print("Options and arguments:") 9 | print("\t-h, --help\tShows this printout") 10 | print("\t--update\tStops at first found already downloaded") 11 | print("\t--save-dir=\tStore at this location") 12 | 13 | bUrl = "https://wall.alphacoders.com/" 14 | baseUrl = bUrl + "newest_wallpapers.php?page=" 15 | baseDir = "/root/wall.alphacoders.com/" 16 | update = False 17 | stop = False 18 | 19 | try: 20 | opts, args = getopt.getopt(sys.argv[1:], "h", ["help", "update", "save-dir="]) 21 | except getopt.GetoptError as err: 22 | print str(err) 23 | usage() 24 | sys.exit() 25 | 26 | for o, a in opts: 27 | if o == "--update": 28 | update = True 29 | elif o in ("-h", "--help"): 30 | usage() 31 | sys.exit() 32 | elif o == "--save-dir": 33 | baseDir = a 34 | else: 35 | usage() 36 | assert False, "unhandled option" 37 | sys.exit() 38 | 39 | def processWallpaper(url): 40 | wallpaperSoup = getSoup(url) 41 | wallpaperOriginalUrl = wallpaperSoup.find('span', { "class" : "btn btn-success download-button" })['data-href'] 42 | sys.stdout.write("\t\tOriginal Wallpaper Url: " + wallpaperOriginalUrl + "\n\t\t\t") 43 | categories = wallpaperSoup.find('div', { "class" : "floatright" }).findAll('strong') 44 | name = wallpaperSoup.find('div', {'class': 'container center'}).find('div').text.strip().replace("/",".") 45 | tags = wallpaperSoup.findAll('div', {'style': 'padding:5px 10px; margin:1px; display:inline-block;'}) 46 | tagArray = [None]*len(tags) 47 | taglist = "" 48 | index = 0 49 | if len(tags) > 0: 50 | for tag in tags: 51 | tagArray[index] = tag.text.strip() 52 | index += 1 53 | tagArray.sort() 54 | for tag in tagArray: 55 | taglist += "[" + tag + "]" 56 | fileName = taglist + name + ((" " if len(taglist) > 0 else "") if len(name) == 0 else " - ") + wallpaperOriginalUrl.split('/')[-4] + "." 
+ wallpaperOriginalUrl.split('/')[-2] 57 | directoryStructure = baseDir 58 | for i in range(0, len(categories)): 59 | sys.stdout.write(categories[i].text.strip() + ("" if i == (len(categories) - 1) else " => ")) 60 | directoryStructure += categories[i].text.strip() + "/" 61 | sys.stdout.write("\n\t\t\t\tSaving to: " + directoryStructure + fileName + "\n") 62 | ensureDir(directoryStructure) 63 | retval = fileDl(wallpaperOriginalUrl, directoryStructure, "\t\t\t\t\t", fileName) 64 | if int(retval) == 42 and update: 65 | global stop 66 | stop = True 67 | 68 | wallSoup = getSoup(baseUrl + "0") 69 | totalPages = int(wallSoup.find('ul', { "class" : "pagination pagination" }).findAll('li')[-1].find('a')['href'].split('=')[1]) 70 | for i in range(0, totalPages+1): 71 | print("Scraping page " + str(i) + "...") 72 | for thumbContainer in getSoup(baseUrl + str(i)).findAll('div', { "class" : "thumb-container-big " }): 73 | wallpaperUrl = bUrl + thumbContainer.find('a')['href'] 74 | print ("\tbig.php url: " + wallpaperUrl) 75 | processWallpaper(wallpaperUrl) 76 | if stop: 77 | sys.exit(420) 78 | -------------------------------------------------------------------------------- /wotd.dictionary.com.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import urllib2, sys 3 | from bs4 import BeautifulSoup 4 | from functions import getSoup, daterange, fileDl, ensureDir 5 | from datetime import date, datetime 6 | 7 | def dlForDate(singleDate): 8 | print("Getting Word of the Day for: " + singleDate.strftime("%Y/%m/%d")) 9 | wordSoup = getSoup("http://www.dictionary.com/wordoftheday/" + singleDate.strftime("%Y/%m/%d") + "/") 10 | url = wordSoup.find('meta', { "property" : "og:image" })['content'] 11 | print("\tDownloading:" + url) 12 | fileDl(url, sys.argv[1], "\t\t") 13 | 14 | if (len(sys.argv) < 2) or sys.argv[1] == "--help": 15 | print("Scraping script for dictionary.com/wordoftheday/ first WOTD: 1999/5/3") 16 | print("Usage:\n\tAll until today: python wotd.dictionary.com.py \"/mnt/what/ever/directory/\"") 17 | print("\tSpecific date: python wotd.dictionary.com.py \"/mnt/what/ever/directory/\" yyyy/mm/dd -single") 18 | print("\tDate range: python wotd.dictionary.com.py \"/mnt/what/ever/directory/\" yyyy/mm/dd yyyy/mm/dd") 19 | sys.exit() 20 | 21 | startDate = date(1999, 5, 3) 22 | endDate = date.today() 23 | ensureDir(sys.argv[1]) 24 | 25 | if len(sys.argv) == 4 and sys.argv[3] == "-single": 26 | startDate = datetime.strptime(sys.argv[2], "%Y/%m/%d") 27 | print startDate 28 | dlForDate(startDate) 29 | sys.exit() 30 | elif len(sys.argv) == 4 and sys.argv[3] != "-single": 31 | startDate = datetime.strptime(sys.argv[2], "%Y/%m/%d") 32 | endDate = datetime.strptime(sys.argv[3], "%Y/%m/%d") 33 | 34 | for singleDate in daterange(startDate, endDate): 35 | dlForDate(singleDate) 36 | -------------------------------------------------------------------------------- /x-art.com.py: -------------------------------------------------------------------------------- 1 | import mechanize 2 | import os, sys 3 | import os.path 4 | import cookielib 5 | from bs4 import BeautifulSoup 6 | import html2text 7 | from datetime import datetime 8 | import requests 9 | import shutil 10 | 11 | 12 | class HeadRequest(mechanize.Request): 13 | def get_method(self): 14 | return "HEAD" 15 | 16 | 17 | def ensureDir(f): 18 | if not os.path.exists(f): 19 | os.makedirs(f) 20 | 21 | 22 | def videoDl(browseItem): 23 | coverUrl = None 24 | try: 25 | coverUrl = 
browseItem.find('img')['data-interchange'].split(' ')[4][1:-1] 26 | except: 27 | coverUrl = browseItem.find('img')['src'] 28 | print("\tALT Cover:\t" + coverUrl) 29 | date = datetime.strptime(browseItem.find('h2').text.strip(), "%b %d, %Y") 30 | videoLink = browseItem.parent['href'] 31 | videoHtml = br.open(videoLink).read() 32 | videoSoup = BeautifulSoup(videoHtml, 'lxml') 33 | wideCoverUrl = \ 34 | videoSoup.find('div', {'id': 'mediaplayer'}).parent.findAll('script')[2].text.split('"image"')[1][3:].split( 35 | '"')[0] 36 | rowSingle = videoSoup.find('div', {'class': 'row single'}) 37 | name = rowSingle.findAll('div', {'class': 'small-12 medium-12 large-12 columns'})[1].find('h1').text.strip() 38 | dropDl = videoSoup.find('ul', {'id': 'drop-download'}) 39 | biggest = "" 40 | biggestSize = 0 41 | print(name + "\n\tURL:\t" + videoLink) 42 | for a in dropDl.findAll('a'): 43 | if int(a.text.strip().split('(')[1].split(' MB)')[0]) > biggestSize: 44 | biggestSize = int(a.text.strip().split('(')[1].split(' MB)')[0]) 45 | biggest = a['href'] 46 | print("\tSize:\t" + str(biggestSize) + "\n\tDL URL:\t" + biggest) 47 | dlDir = baseDir + date.strftime("%Y-%m-%d") + " - " + name + "/" 48 | ensureDir(dlDir) 49 | contentLength = str(br.open(HeadRequest(biggest)).info()).splitlines()[2].split(' ')[1].strip() 50 | print("\tDL Dir:\t" + dlDir) 51 | with open(dlDir + "page.html", "w") as htmlFile: 52 | htmlFile.write(videoHtml) 53 | try: 54 | br.retrieve(coverUrl.strip(), dlDir + "portrait.jpg") 55 | except Exception as e: 56 | r = requests.get(coverUrl.strip(), cookies=cj, stream=True) 57 | with open(dlDir + "portrait.jpg", "wb") as portraitFile: 58 | r.raw.decode_content = True 59 | shutil.copyfileobj(r.raw, portraitFile) 60 | try: 61 | br.retrieve(wideCoverUrl.strip(), dlDir + "wide.jpg") 62 | except Exception as e: 63 | r = requests.get(wideCoverUrl.strip(), cookies=cj, stream=True) 64 | with open(dlDir + "wide.jpg", "wb") as wideFile: 65 | r.raw.decode_content = True 66 | shutil.copyfileobj(r.raw, wideFile) 67 | while True: 68 | if os.path.isfile(dlDir + name + ".mp4"): 69 | if int(contentLength) > os.stat(dlDir + name + ".mp4").st_size: 70 | print("\tFilesize too low:\t" + str(os.stat(dlDir + name + ".mp4").st_size)) 71 | print("\tShould be:\t\t" + contentLength) 72 | os.remove(dlDir + name + ".mp4") 73 | else: 74 | return 75 | br.retrieve(biggest, dlDir + name + ".mp4") 76 | 77 | 78 | def galleryDl(browseItem): 79 | coverUrl = None 80 | try: 81 | coverUrl = browseItem.find('img')['data-interchange'].split(' ')[4][1:-1] 82 | except: 83 | coverUrl = browseItem.find('img')['src'] 84 | print("\tALT Cover:\t" + coverUrl) 85 | date = datetime.strptime(browseItem.find('h2').text.strip(), "%b %d, %Y") 86 | videoLink = browseItem.parent['href'] 87 | videoHtml = br.open(videoLink).read() 88 | videoSoup = BeautifulSoup(videoHtml, 'lxml') 89 | bigCoverUrl = \ 90 | videoSoup.find('div', {'class': 'small-12 medium-12 large-6 columns media'}).find('img')['src'].replace('sml', 'lrg') 91 | rowSingle = videoSoup.find('div', {'class': 'row single info-fixed'}) 92 | name = rowSingle.findAll('div', {'class': 'small-12 medium-12 large-12 columns'})[0].find('h1').text.strip() 93 | dropDl = rowSingle.find('ul', {'id': 'drop-download'}) 94 | biggest = "" 95 | biggestSize = 0 96 | print(name + "\n\tURL:\t" + videoLink) 97 | for a in dropDl.findAll('a'): 98 | if int(a.text.strip().split(' ')[0]) > biggestSize: 99 | biggestSize = int(a.text.strip().split(' ')[0]) 100 | biggest = a['href'] 101 | print("\tSize:\t" + 
str(biggestSize) + "\n\tDL URL:\t" + biggest) 102 | dlDir = baseDir + date.strftime("%Y-%m-%d") + " - [Gallery] - " + name + "/" 103 | ensureDir(dlDir) 104 | contentLength = str(br.open(HeadRequest(biggest)).info()).splitlines()[2].split(' ')[1].strip() 105 | print("\tDL Dir:\t" + dlDir) 106 | with open(dlDir + "page.html", "w") as htmlFile: 107 | htmlFile.write(videoHtml) 108 | try: 109 | br.retrieve(coverUrl.strip(), dlDir + "portrait.jpg") 110 | except Exception as e: 111 | r = requests.get(coverUrl.strip(), cookies=cj, stream=True) 112 | with open(dlDir + "portrait.jpg", "wb") as portraitFile: 113 | r.raw.decode_content = True 114 | shutil.copyfileobj(r.raw, portraitFile) 115 | try: 116 | br.retrieve(bigCoverUrl.strip(), dlDir + "bigPortrait.jpg") 117 | except Exception as e: 118 | r = requests.get(coverUrl.strip(), cookies=cj, stream=True) 119 | with open(dlDir + "bigPortrait.jpg", "wb") as wideFile: 120 | r.raw.decode_content = True 121 | shutil.copyfileobj(r.raw, wideFile) 122 | while True: 123 | if os.path.isfile(dlDir + name + ".zip"): 124 | if int(contentLength) > os.stat(dlDir + name + ".zip").st_size: 125 | print("\tFilesize too low:\t" + str(os.stat(dlDir + name + ".zip").st_size)) 126 | print("\tShould be:\t\t" + contentLength) 127 | os.remove(dlDir + name + ".zip") 128 | else: 129 | return 130 | br.retrieve(biggest, dlDir + name + ".zip") 131 | 132 | 133 | # Browser 134 | br = mechanize.Browser() 135 | 136 | # Cookie Jar 137 | cj = cookielib.LWPCookieJar() 138 | br.set_cookiejar(cj) 139 | 140 | # Browser options 141 | br.set_handle_equiv(True) 142 | br.set_handle_gzip(True) 143 | br.set_handle_redirect(True) 144 | br.set_handle_referer(True) 145 | br.set_handle_robots(False) 146 | br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1) 147 | 148 | br.addheaders = [('User-agent', 'Chrome')] 149 | 150 | br.open('http://x-art.com/members') 151 | 152 | br.select_form(nr=0) 153 | 154 | br.form['uid'] = '' 155 | br.form['pwd'] = '' 156 | 157 | br.submit() 158 | 159 | baseDir = "/mnt/san/Operation X-ART/" 160 | 161 | for i in range(1, 40): 162 | try: 163 | if sys.argv[1] == "-svid": 164 | continue 165 | except: 166 | pass 167 | pageHtml = br.open('http://www.x-art.com/members/videos/recent/All/' + str(i) + '/').read() 168 | pageSoup = BeautifulSoup(pageHtml, 'lxml') 169 | for browseItem in pageSoup.findAll('div', {'class': 'browse-item'}): 170 | videoDl(browseItem) 171 | 172 | for i in reversed(range(1, 64)): 173 | print("Page: " + str(i)) 174 | try: 175 | if sys.argv[1] == "-spic": 176 | continue 177 | except: 178 | pass 179 | pageHtml = br.open('http://www.x-art.com/members/galleries/recent/All/' + str(i) + '/').read() 180 | pageSoup = BeautifulSoup(pageHtml, 'lxml') 181 | for browseItem in pageSoup.findAll('div', {'class': 'browse-item'}): 182 | if len(browseItem.findAll('h2')[1].text.replace('Images', '').strip()) > 0: 183 | galleryDl(browseItem) 184 | if i < 15: 185 | raw_input("Press Enter to continue...") 186 | --------------------------------------------------------------------------------
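A closing note on a pattern shared by most of these scripts: fileDl/fileDlWithAuth in functions.py, fileDL in lyndaleigh.com.py and the retry loops in x-art.com.py all compare the server's Content-Length against the size of any file already on disk, skip the download when the sizes match, and otherwise stream the body to disk in fixed-size chunks. A minimal Python 3 sketch of that idea using requests (the function name, chunk size and progress output are illustrative, not taken from the repository):

#!/usr/bin/python3
# Sketch of the "skip if an identically sized copy exists" download pattern.
import os
import requests

def download(url, path, chunk_size=65536, session=None):
    # e.g. download("https://example.com/file.jpg", "/tmp/file.jpg")
    s = session or requests.Session()
    with s.get(url, stream=True) as response:
        response.raise_for_status()
        expected = int(response.headers.get("Content-Length", -1))
        # Skip the transfer when a file of the expected size is already on disk.
        if expected != -1 and os.path.isfile(path) and os.path.getsize(path) == expected:
            print("File already downloaded!")
            return
        done = 0
        with open(path, "wb") as handle:
            for chunk in response.iter_content(chunk_size=chunk_size):
                handle.write(chunk)
                done += len(chunk)
                print("\r%12d / %s bytes" % (done, expected if expected != -1 else "???"), end="")
    print("\nDone :)")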