├── .gitignore ├── Python3 ├── TestData │ ├── Claymore-1.jpg │ ├── Claymore-2.jpg │ ├── Emilia-1.jpg │ ├── Emilia-10.png │ ├── Emilia-2.jpg │ ├── Emilia-3.jpg │ ├── Emilia-4.jpg │ ├── Emilia-5.jpg │ ├── Emilia-6.jpg │ ├── Emilia-7.jpg │ ├── Emilia-8.png │ ├── Emilia-9.png │ ├── Hellsing-1.jpg │ ├── Light-1.jpg │ ├── Makise-1.jpg │ ├── Re-Class Battleship.png │ ├── Re-Zero-1.jpg │ ├── Rem-1.png │ ├── Seras-1.jpg │ ├── Seras-2.jpg │ ├── Seras-3.jpg │ ├── Seras-4.jpg │ ├── Seras-5.jpg │ ├── Seras-6.jpg │ ├── Seras-7.jpg │ ├── Seras-8.jpg │ └── Wrong Aspect Ratio │ │ └── Emilia-1.png ├── functions.py └── iqdb.py ├── README.md ├── econtalk.org.py ├── functions.py ├── lyndaleigh.com.py ├── pantyhoseplaza.com.py ├── thechive.com.py ├── thesandbornmaps.cudl.colorado.edu.py ├── wall.alphacoders.com.py ├── wotd.dictionary.com.py └── x-art.com.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | .venv/ 83 | venv/ 84 | ENV/ 85 | 86 | # Spyder project settings 87 | .spyderproject 88 | 89 | # Rope project settings 90 | .ropeproject 91 | -------------------------------------------------------------------------------- /Python3/TestData/Claymore-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Claymore-1.jpg -------------------------------------------------------------------------------- /Python3/TestData/Claymore-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Claymore-2.jpg -------------------------------------------------------------------------------- /Python3/TestData/Emilia-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Emilia-1.jpg -------------------------------------------------------------------------------- /Python3/TestData/Emilia-10.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Emilia-10.png -------------------------------------------------------------------------------- /Python3/TestData/Emilia-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Emilia-2.jpg -------------------------------------------------------------------------------- /Python3/TestData/Emilia-3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Emilia-3.jpg -------------------------------------------------------------------------------- /Python3/TestData/Emilia-4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Emilia-4.jpg -------------------------------------------------------------------------------- /Python3/TestData/Emilia-5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Emilia-5.jpg -------------------------------------------------------------------------------- /Python3/TestData/Emilia-6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Emilia-6.jpg -------------------------------------------------------------------------------- /Python3/TestData/Emilia-7.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Emilia-7.jpg -------------------------------------------------------------------------------- /Python3/TestData/Emilia-8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Emilia-8.png -------------------------------------------------------------------------------- /Python3/TestData/Emilia-9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Emilia-9.png -------------------------------------------------------------------------------- /Python3/TestData/Hellsing-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Hellsing-1.jpg -------------------------------------------------------------------------------- /Python3/TestData/Light-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Light-1.jpg 
-------------------------------------------------------------------------------- /Python3/TestData/Makise-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Makise-1.jpg -------------------------------------------------------------------------------- /Python3/TestData/Re-Class Battleship.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Re-Class Battleship.png -------------------------------------------------------------------------------- /Python3/TestData/Re-Zero-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Re-Zero-1.jpg -------------------------------------------------------------------------------- /Python3/TestData/Rem-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Rem-1.png -------------------------------------------------------------------------------- /Python3/TestData/Seras-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Seras-1.jpg -------------------------------------------------------------------------------- /Python3/TestData/Seras-2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Seras-2.jpg -------------------------------------------------------------------------------- /Python3/TestData/Seras-3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Seras-3.jpg -------------------------------------------------------------------------------- /Python3/TestData/Seras-4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Seras-4.jpg -------------------------------------------------------------------------------- /Python3/TestData/Seras-5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Seras-5.jpg -------------------------------------------------------------------------------- /Python3/TestData/Seras-6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Seras-6.jpg -------------------------------------------------------------------------------- /Python3/TestData/Seras-7.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Seras-7.jpg -------------------------------------------------------------------------------- /Python3/TestData/Seras-8.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Seras-8.jpg -------------------------------------------------------------------------------- /Python3/TestData/Wrong Aspect Ratio/Emilia-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nicba1010/Scraping-Scripts/d5a3571d59c3586e8b2d53be11c0a4801f1342e1/Python3/TestData/Wrong Aspect Ratio/Emilia-1.png -------------------------------------------------------------------------------- /Python3/functions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | from bs4 import BeautifulSoup 3 | import requests 4 | 5 | def getSoup(url, data_ = {}, file = {}): 6 | r = requests.post(url, data=data_, files=file) 7 | return BeautifulSoup(r.text, 'lxml') 8 | -------------------------------------------------------------------------------- /Python3/iqdb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | import getopt, requests, sys 3 | from bs4 import BeautifulSoup 4 | from functions import getSoup 5 | 6 | def usage(): 7 | print("./iqdb.py -i ") 8 | 9 | imageFile = None 10 | 11 | try: 12 | opts, args = getopt.getopt(sys.argv[1:], 'hi:') 13 | except getopt.GetoptError: 14 | usage() 15 | sys.exit(2) 16 | for opt, arg in opts: 17 | if opt == '-h': 18 | usage() 19 | sys.exit() 20 | elif opt == '-i': 21 | imageFile = arg 22 | else: 23 | print("Unsupported option and/or argument") 24 | sys.exit(2) 25 | 26 | print("Input file is: " + imageFile) 27 | iqdbSoup = getSoup("http://iqdb.org/", {}, {'file': open(imageFile, 'rb')}) 28 | #print(iqdbSoup.find('div', {'class': 'pages'}).prettify()) 29 | for result in iqdbSoup.find('div', {'class': 'pages'}).findAll('table'): 30 | t1 = result.findAll('tr')[0].findAll('th')[0].text 31 | if t1 != "Your image": 32 | #print(result.prettify()) 33 | print("Image Info:") 34 | print("\t" + t1) 35 | t2 = result.find('td', {'class': 'image'}).find('a')['href'] 36 | if t2[:2] == "//": 37 | t2 = "http:" + t2 38 | print("\t\tSource:\t\t" + t2) 39 | t3 = result.find('img', {'class': 'service-icon'}).nextSibling 40 | print("\t\tSource Page:\t" + t3) 41 | whs = result.findAll('tr')[3].find('td').text.split(' ') 42 | width = int(whs[0].split('×')[0]) 43 | height = int(whs[0].split('×')[1]) 44 | safe = whs[1][1:-1] 45 | print("\t\tWidth:\t\t" + str(width)) 46 | print("\t\tHeight:\t\t" + str(height)) 47 | print("\t\tSafety Status:\t" + safe) 48 | 49 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Scraping-Scripts 2 | ## Disclaimer 3 | These scripts are provided as-is. I assume no liability of any damage made by usage of these scripts. 
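The `Python3` folder also contains `iqdb.py`, a reverse-image lookup built on the small `getSoup` helper in `Python3/functions.py`: it POSTs an image to iqdb.org and walks the result tables for the source links. A minimal sketch of that flow (the helper is inlined here, the image path is only a placeholder, and error handling is omitted):

```python
#!/usr/bin/python3
# Minimal sketch of the iqdb.py flow: POST an image to iqdb.org and
# list the matches it returns. "TestData/Emilia-1.jpg" is only an example path.
from bs4 import BeautifulSoup
import requests

def get_soup(url, data=None, files=None):
    # Same idea as getSoup in Python3/functions.py: POST, then parse with lxml.
    r = requests.post(url, data=data or {}, files=files or {})
    return BeautifulSoup(r.text, "lxml")

with open("TestData/Emilia-1.jpg", "rb") as image:
    soup = get_soup("http://iqdb.org/", files={"file": image})

for table in soup.find("div", {"class": "pages"}).find_all("table"):
    header = table.find("th")
    if header and header.text != "Your image":
        link = table.find("td", {"class": "image"}).find("a")["href"]
        print(header.text, "http:" + link if link.startswith("//") else link)
```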
4 | ## Usage
5 | ### You need the functions.py file for almost every script
6 | ### pantyhoseplaza.com.py
7 | **Edit the script to define your download directory**
8 | The script takes three parameters:
9 | - The page number of the Movie Episodes category
10 | - The username to use to log in
11 | - The password to use to log in
12 | 
13 | The script downloads the JSON-ized metadata, the thumbnail and the videos in all available qualities.
14 | If you have any suggestions, submit a pull request!
15 | 
16 | **IF ANY ERRORS POP UP, RERUN THE SCRIPT; IF THEY KEEP OCCURRING, SUBMIT AN ISSUE**
17 | Thx :)
18 | 
19 | ### wotd.dictionary.com.py
20 | *Requested by /u/nsq1*
21 | 
22 | Three different usages:
23 | - All until today: python wotd.dictionary.com.py "/mnt/what/ever/directory/"
24 | - Specific date: python wotd.dictionary.com.py "/mnt/what/ever/directory/" yyyy/mm/dd -single
25 | - Date range: python wotd.dictionary.com.py "/mnt/what/ever/directory/" yyyy/mm/dd yyyy/mm/dd
26 | 
27 | There is currently one problem with the script: any date before 2014/03/01 throws a 403. If anyone finds a way to fix it, submit a pull request!
28 | ### thesandbornmaps.cudl.colorado.edu.py
29 | *Requested by /u/WhiskeyQuebec*
30 | 
31 | Options and arguments:
32 | - -s, --simple Constructs the simple/flat directory structure
33 | - -h, --help Shows this text
34 | - --from= Start at the given document number
35 | - --to= End with the given document number
36 | - --save-dir= Store at this location
37 | 
38 | ### wall.alphacoders.com.py
39 | *Requested by myself :P*
40 | 
41 | Options and arguments:
42 | - -h, --help Shows this printout
43 | - --update Stops at the first wallpaper that is already downloaded
44 | - --save-dir= Store at this location
45 | 
46 | ### thechive.com.py
47 | *Requested by /u/Broadsid3*
48 | 
49 | No options or arguments. Just run the script. I'll add them later when I have more time.
50 | All posts with no valid date format will be stored in the NonParsable folder.
51 | 
52 | ## Donate
53 | If you like my work and want to donate, here's the button! :)
54 | Actually there is no button. I have a personal PayPal account and can't set it up.
55 | Here's the email instead: nicba1010@gmail.com 56 | -------------------------------------------------------------------------------- /econtalk.org.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys 3 | from bs4 import BeautifulSoup 4 | from functions import getSoup, fileDl, ensureDir 5 | from datetime import datetime 6 | 7 | baseDir = "/root/econtalk.org/" 8 | baseUrl = "http://www.econtalk.org/" 9 | archiveSoup = getSoup(baseUrl + "archives.html") 10 | 11 | tableRows = archiveSoup.find('div', {'class': 'archive-individual archive-date-based archive'}).findAll('tr') 12 | for tableRow in tableRows: 13 | if tableRows.index(tableRow) == 0: 14 | continue 15 | date = datetime.strptime(tableRow.find('td', {'width': '5%'}).text.strip(), "%Y/%m/%d") 16 | extra = len(tableRow.findAll('td')[2].text.strip()) != 0 17 | name = tableRow.find('a').text 18 | dirName = date.strftime("%Y-%m-%d") + (" Extra " if extra else " ") + "- " + name + "/" 19 | url = tableRow.find('a')['href'] 20 | ensureDir(baseDir + dirName) 21 | print(dirName[:-1]) 22 | if not extra: 23 | podcastSoup = getSoup(url) 24 | url1 = podcastSoup.find('a', text="Download")['href'] 25 | print("\t" + url1) 26 | fileDl(url1, baseDir + dirName, "\t\t") 27 | print("\t" + url) 28 | fileDl(url, baseDir + dirName, "\t\t") 29 | -------------------------------------------------------------------------------- /functions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import urllib2, os, sys, math, urllib 3 | from bs4 import BeautifulSoup 4 | from datetime import timedelta, date 5 | import time 6 | 7 | class outputcolors: 8 | OKGREEN = '\033[92m' 9 | OKBLUE = '\033[94m' 10 | WARNING = '\033[93m' 11 | FAIL = '\033[91m' 12 | ENDC = '\033[0m' 13 | 14 | def getStatus(url): 15 | try: 16 | connection = urllib2.urlopen(url) 17 | code = connection.getcode() 18 | connection.close() 19 | return code 20 | except urllib2.HTTPError, e: 21 | return e.getcode() 22 | 23 | def roundUpTo(x, base): 24 | return int(base * math.ceil(float(x) / base)) 25 | 26 | def roundDownTo(x, base): 27 | return int(base * math.floor(float(x) / base)) 28 | 29 | def ensureDir(f): 30 | if not os.path.exists(f): 31 | os.makedirs(f) 32 | 33 | def replaceTab(s, tabstop = 4): 34 | result = str() 35 | for c in s: 36 | if c == '\t': 37 | while (len(result) % tabstop != 0): 38 | result += ' '; 39 | else: 40 | result += c 41 | return result 42 | 43 | def fileDl(url, dir, prepend, fileName = "?"): 44 | if fileName == "?": 45 | fileName = url.split('/')[-1] 46 | request = urllib2.Request(url) 47 | u = urllib2.urlopen(request) 48 | meta = u.info() 49 | fileSize = -1 50 | try: 51 | fileSize = int(meta.getheaders("Content-Length")[0]) 52 | except Exception: 53 | pass 54 | if os.path.exists(dir + fileName): 55 | if os.stat(dir + fileName).st_size == fileSize: 56 | print(prepend + outputcolors.OKBLUE + "File already downloaded!" + outputcolors.ENDC) 57 | return 42 58 | else: 59 | print(prepend + outputcolors.WARNING + "File downloaded but not fully! Restarting download..." + outputcolors.ENDC) 60 | else: 61 | print(prepend + outputcolors.WARNING + "Downloading file..." + outputcolors.ENDC) 62 | fileHandle = open(dir + fileName, 'wb') 63 | print(prepend + ("Downloading: %s Bytes: %s" % (fileName, "???" 
if (fileSize == -1) else fileSize))) 64 | fileSizeDl = 0 65 | blockSize = 65536 66 | while True: 67 | buffer = u.read(blockSize) 68 | if not buffer: 69 | break 70 | fileSizeDl += len(buffer) 71 | fileHandle.write(buffer) 72 | status = prepend + r"%12d [%3.2f%%]" % (fileSizeDl, -1.0 if (fileSize == -1) else (fileSizeDl * 100. / fileSize)) 73 | status = "\r" + status 74 | print status, 75 | fileHandle.close() 76 | print("\n" + prepend + outputcolors.OKGREEN + "Done :)" + outputcolors.ENDC) 77 | return 1 78 | 79 | def fileDlWithAuth(url, auth, dir, prepend): 80 | fileName = url.split('/')[-1] 81 | request = urllib2.Request(url) 82 | request.add_header("Authorization", "Basic %s" % auth) 83 | u = urllib2.urlopen(request) 84 | meta = u.info() 85 | fileSize = -1 86 | try: 87 | fileSize = int(meta.getheaders("Content-Length")[0]) 88 | except Exception: 89 | pass 90 | if os.path.exists(dir + fileName): 91 | if os.stat(dir + fileName).st_size == fileSize: 92 | print(prepend + outputcolors.OKBLUE + "File already downloaded!" + outputcolors.ENDC) 93 | return 42 94 | else: 95 | print(prepend + outputcolors.WARNING + "File downloaded but not fully! Restarting download..." + outputcolors.ENDC) 96 | else: 97 | print(prepend + outputcolors.WARNING + "Downloading file..." + outputcolors.ENDC) 98 | fileHandle = open(dir + fileName, 'wb') 99 | print(prepend + ("Downloading: %s Bytes: %s" % (fileName, "???" if (fileSize == -1) else fileSize))) 100 | fileSizeDl = 0 101 | blockSize = 65536 102 | while True: 103 | buffer = u.read(blockSize) 104 | if not buffer: 105 | break 106 | fileSizeDl += len(buffer) 107 | fileHandle.write(buffer) 108 | status = prepend + r"%12d [%3.2f%%]" % (fileSizeDl, -1.0 if (fileSize == -1) else (fileSizeDl * 100. / fileSize)) 109 | status = "\r" + status 110 | print status, 111 | fileHandle.close() 112 | print("\n" + prepend + outputcolors.OKGREEN + "Done :)" + outputcolors.ENDC) 113 | 114 | def getSoup(url): 115 | try: 116 | return BeautifulSoup(urllib2.urlopen(urllib2.Request(url)), "lxml") 117 | except urllib2.HTTPError, e: 118 | print("retrying in 5s") 119 | time.sleep(5) 120 | return getSoup(url) 121 | 122 | def daterange(start_date, end_date): 123 | for n in range(int ((end_date - start_date).days)): 124 | yield start_date + timedelta(n) 125 | -------------------------------------------------------------------------------- /lyndaleigh.com.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | from requests.auth import HTTPBasicAuth 4 | import requests 5 | import sys 6 | from bs4 import BeautifulSoup 7 | import getpass 8 | import time 9 | import magic 10 | import os 11 | import mimetypes 12 | import re 13 | from datetime import datetime 14 | import json 15 | 16 | def ensureDir(directory): 17 | if not os.path.exists(directory): 18 | os.makedirs(directory) 19 | def sCodeChk(resp, prepend = ""): 20 | if DEBUG: 21 | print(prepend + "URL: {}\n".format(resp.url) + prepend + " Response Code: {:3d}".format(resp.status_code)) 22 | return resp.status_code 23 | def ensureLoad(url, s, prepend = ""): 24 | loaded = s.get(url) 25 | bs = BeautifulSoup(loaded.content, 'lxml') 26 | if sCodeChk(loaded, prepend) != 200 or len(bs.prettify()) < 400: 27 | print(prepend + "Retrying in 15 seconds... 
Bs Size " + str(len(bs.prettify()))) 28 | time.sleep(15) 29 | loaded = ensureLoad(url, s) #Recursive call 30 | return loaded 31 | def fileDL(url, file_name, s): 32 | he = 0 33 | with open(file_name, "wb") as f: 34 | print("Downloading {}...".format(file_name)) 35 | response = s.get(url, stream=True) 36 | total_length = response.headers.get('content-length') 37 | print(response.headers) 38 | if total_length is None: 39 | f.write(response.content) 40 | else: 41 | dl = 0 42 | total_length = int(total_length) 43 | for data in response.iter_content(chunk_size=4096): 44 | dl += len(data) 45 | f.write(data) 46 | done = int(50 * dl / total_length) 47 | sys.stdout.write("\r[%s%s]" % ('=' * done, ' ' * (50-done)) ) 48 | sys.stdout.flush() 49 | he = response.headers 50 | return he 51 | base = "http://lyndaleigh.com" 52 | base_url = base + "/members/" 53 | session = requests.Session() 54 | 55 | username = input("Username: ") 56 | password = getpass.getpass("Password: ") 57 | DEBUG = True 58 | basefolder = input("Save Folder: ") 59 | 60 | session.auth = (username, password) 61 | 62 | latest = ensureLoad(base_url + "index.php/latest-updates.html", session) 63 | 64 | while sCodeChk(latest) == 500: 65 | print("Sleeping for 5 seconds...") 66 | time.sleep(5) 67 | latest = session.get(base_url + "index.php/latest-updates.html") 68 | 69 | latest_soup = BeautifulSoup(latest.content, 'lxml') 70 | last_page = int(int(latest_soup.find('a', { 'title' : "End" })['href'].split('=')[-1]) / 45) 71 | 72 | base_gallery_url = base_url + "index.php/latest-updates.html?start=" 73 | 74 | for page in range(last_page + 1): 75 | gallery_page_url = base_gallery_url + str(page * 45) 76 | gallery_page = ensureLoad(gallery_page_url, session) 77 | gallery_page_soup = BeautifulSoup(gallery_page.content, 'lxml') 78 | item_num = 0 79 | for item in gallery_page_soup.findAll('div', { 'class' : 'itemContainer' }): 80 | item_num = item_num + 1 81 | item_type = item.findAll('img', { 'class' : 'uk-responsive-width uk-align-center' })[0]['alt'] 82 | is_video = True if item_type == "Lynda Leigh Video Update" else False 83 | item_url = base + item.find('a', { 'class' : 'uk-thumbnail' })['href'] 84 | item_text_part = item.find('div', { 'class' : 'uk-thumbnail-caption' }) 85 | item_title = item_text_part.findAll('strong')[0].text.strip().replace('/', " of ") 86 | item_desc = item_text_part.find('span', { 'style' : ['font-size: 8pt; line-height: 6px;', 'font-size:9pt;', 'font-size: 8pt; line-height: 5px;', 'font-size: 9pt'] }).text.strip() 87 | item_date = "" 88 | try: 89 | item_date = (datetime.strptime(item_text_part.find(text=re.compile(r'ADDED')).parent.nextSibling.strip(), "%d-%b-%y")).strftime("%Y-%m-%d") 90 | except ValueError: 91 | item_date = (datetime.strptime(item_text_part.find(text=re.compile(r'ADDED')).parent.nextSibling.strip(), "%d-%m-%y")).strftime("%Y-%m-%d") 92 | print('Page {:2d} item {:2d}\n\tType: \t{}\n\tTitle: \t{}\n\tDesc: \t{}'.format(page + 1, item_num, item_type, item_title, item_desc)) 93 | info = {'isvideo': is_video, 'url': item_url, 'name': item_title, 'description': item_desc, 'date': item_date, 'size': 0} 94 | item_page = ensureLoad(item_url, session, "\t") 95 | item_page_soup = BeautifulSoup(item_page.content, 'lxml') 96 | high_res_dl = "" 97 | if is_video and len(item_page_soup.findAll(text='Live Members ONLY..!')) == 0: 98 | #VIDEO DOWNLOAD AND PARSE 99 | try: 100 | high_res_dl = base + item_page_soup.find(text='1080p MPEG').parent.parent['href'] 101 | except AttributeError: 102 | try: 103 | high_res_dl = base 
+ item_page_soup.find(text='MPEG').parent['href'] 104 | except AttributeError: 105 | try: 106 | high_res_dl = base + item_page_soup.find(text='HD MPEG ').parent['href'] 107 | except AttributeError: 108 | try: 109 | high_res_dl = base + item_page_soup.find(text='MPEG 1.4gb').parent['href'] 110 | except AttributeError: 111 | try: 112 | high_res_dl = base + item_page_soup.find(text='MP4').parent['href'] 113 | except AttributeError: 114 | try: 115 | high_res_dl = base + item_page_soup.find(text='720p MPEG').parent.parent['href'] 116 | except AttributeError: 117 | try: 118 | high_res_dl = base + item_page_soup.find(text='Right Click this link to save LOW RES 720p movie').parent['href'] 119 | except AttributeError: 120 | try: 121 | high_res_dl = base + item_page_soup.find(text='HD MPEG 808mb').parent['href'] 122 | except AttributeError: 123 | try: 124 | high_res_dl = item_page_soup.find('source')['src'] 125 | except AttributeError: 126 | print("ERROR") 127 | exit() 128 | except KeyError: 129 | high_res_dl = base + item_page_soup.find(text='1080p MPEG').parent['href'] 130 | if len(high_res_dl): 131 | directory = basefolder + item_date + " " + item_title + "/" 132 | ensureDir(directory) 133 | header = fileDL(high_res_dl, directory + item_title + ".undef", session) 134 | info['size'] = header.get('content-length') 135 | mime = magic.Magic(mime=True) 136 | print("\nExtension detected: " + mime.from_file(directory + item_title + ".undef")) 137 | os.rename(directory + item_title + ".undef", os.path.splitext(directory + item_title + ".undef")[0] + mimetypes.guess_extension(mime.from_file(directory + item_title + ".undef"))) 138 | with open(directory + "data.json", 'w') as outfile: 139 | json.dump(info, outfile) 140 | elif len(item_page_soup.findAll(text='Live Members ONLY..!')) == 0: 141 | #PHOTO DOWNLOAD AND PARSE 142 | try: 143 | high_res_dl = base + item_page_soup.find(text='Hi Res').parent['href'] 144 | except AttributeError: 145 | try: 146 | high_res_dl = base + item_page_soup.find(text='Web Res').parent['href'] 147 | except AttributeError: 148 | try: 149 | high_res_dl = base + item_page_soup.find(text='Med Res').parent['href'] 150 | except AttributeError: 151 | try: 152 | high_res_dl = base + item_page_soup.find(text='Low Res').parent['href'] 153 | except AttributeError: 154 | exit() 155 | print(item_page_soup.prettify()) 156 | print(str(len(item_page_soup.prettify()))) 157 | if len(high_res_dl): 158 | directory = basefolder + item_date + " - Gallery - " + item_title + "/" 159 | ensureDir(directory) 160 | header = fileDL(high_res_dl, directory + item_title + ".undef", session) 161 | info['size'] = header.get('content-length') 162 | mime = magic.Magic(mime=True) 163 | print("\nExtension detected: " + mime.from_file(directory + item_title + ".undef")) 164 | os.rename(directory + item_title + ".undef", os.path.splitext(directory + item_title + ".undef")[0] + mimetypes.guess_extension(mime.from_file(directory + item_title + ".undef"))) 165 | with open(directory + "data.json", 'w') as outfile: 166 | json.dump(info, outfile) 167 | -------------------------------------------------------------------------------- /pantyhoseplaza.com.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import urllib2, base64, sys, re, os, json 3 | from bs4 import BeautifulSoup 4 | from functions import outputcolors, ensureDir, fileDlWithAuth 5 | 6 | if len(sys.argv) != 4: 7 | print("Scraping script for pantyhoseplaza.com porn size.") 8 | print("Usage:\n\tpython 
pantyhoseplaza.com.py pageNumber username password") 9 | sys.exit() 10 | username = sys.argv[2] 11 | password = sys.argv[3] 12 | baseUrl = "http://www.pantyhoseplaza.com/members/" 13 | baseDir = "/mnt/san/" 14 | regex = re.compile(".*Format.*") 15 | 16 | request = urllib2.Request(baseUrl + "content.php?show=videos§ion=37&page=" + sys.argv[1]) 17 | base64string = base64.encodestring('%s:%s' % (username, password)).replace('\n', '') 18 | request.add_header("Authorization", "Basic %s" % base64string) 19 | result = urllib2.urlopen(request) 20 | 21 | rootSoup = BeautifulSoup(result, "lxml") 22 | 23 | for table in rootSoup.findAll('table', { "bgcolor" : "#1d1d1d" }): 24 | anchor = table.find('a', { "style" : "color:#FF0000" }) 25 | name = anchor.text.strip() 26 | videoUrl = baseUrl + anchor['href'] 27 | description = table.findAll('tr')[1].find('div').text.strip() 28 | dirName = baseDir + "pantyhoseplaza.com/" + name.replace(":", "") 29 | ensureDir(dirName) 30 | print(name + "\n\t" + videoUrl) 31 | requestVid = urllib2.Request(videoUrl) 32 | requestVid.add_header("Authorization", "Basic %s" % base64string) 33 | resultVid = urllib2.urlopen(requestVid) 34 | vidSoup = BeautifulSoup(resultVid, "lxml") 35 | imageUrl = baseUrl + vidSoup.find('img', { "style" : "border-color:#990000" })['src'] 36 | print("\tIMAGE: " + imageUrl) 37 | fileDlWithAuth(imageUrl, base64string, dirName + "/", "\t") 38 | data = {'Name' : name, 'Description' : description} 39 | with open(dirName + '/data.json', 'w') as outfile: 40 | json.dump(data, outfile) 41 | for vidDiv in vidSoup.findAll('div'): 42 | if regex.match(vidDiv.text.strip()): 43 | trueVideoUrl = baseUrl + vidDiv.find('a')['href'] 44 | videoSize = vidDiv.findAll('a')[1].text.strip() 45 | print("\t\t" + videoSize + " => " + trueVideoUrl) 46 | trueVidRequest = urllib2.Request(trueVideoUrl) 47 | trueVidRequest.add_header("Authorization", "Basic %s" % base64string) 48 | trueVidResult = urllib2.urlopen(trueVidRequest) 49 | trueVidSoup = BeautifulSoup(trueVidResult, "lxml") 50 | trueVideoDownloadUrl = baseUrl + trueVidSoup.find('a', text="Click here to download the full length video!")['href'] 51 | print("\t\t\tVIDEO SOURCE URL: " + trueVideoDownloadUrl) 52 | fileDlWithAuth(trueVideoDownloadUrl, base64string, dirName + "/", "\t\t\t") 53 | -------------------------------------------------------------------------------- /thechive.com.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import division 3 | import sys 4 | from datetime import datetime 5 | from functions import getSoup, getStatus, fileDl, ensureDir 6 | 7 | baseDir = "/root/thechive.com/" 8 | baseUrl = "http://thechive.com/page/" 9 | filter = ("Photo", "Photos", "photos", "photo") 10 | 11 | def findLastPage(increment = 1000, last = 0, lastStatus = 200): 12 | imul = 1 13 | if lastStatus == 200: 14 | shouldBe = 404 15 | else: 16 | shouldBe = 200 17 | imul = -1 18 | lastPlusInc = increment * imul + last 19 | newStatus = getStatus(baseUrl + str(lastPlusInc) + "/") 20 | newIncr = increment // 2 if shouldBe == newStatus else increment 21 | print ("\t" + str(newStatus) + " @ " + str(lastPlusInc) + " next status should be " + str(shouldBe) + ", in/decrement will be " + str(newIncr)) 22 | if increment == 1: 23 | if lastStatus == 200: 24 | if getStatus(baseUrl + str(last + 1) + "/") == 404: 25 | return last 26 | newIncr = 1 27 | return findLastPage(newIncr, lastPlusInc, newStatus) 28 | 29 | print("Invoking findLastPage()") 30 | lastPage = 
findLastPage() 31 | print("Last page is " + str(lastPage)) 32 | 33 | for page in range(0, lastPage + 1): 34 | pageSoup = getSoup("http://thechive.com/page/" + str(page) + "/") 35 | print("Page " + str(page) + " of " + str(lastPage)) 36 | for article in pageSoup.findAll('article', { "role" : "article" }): 37 | date = article.find('time').text.strip() 38 | h3 = article.find('h3', { "class" : "post-title entry-title card-title" }) 39 | name = h3.text.strip() 40 | url = h3.find('a')['href'] 41 | if any(x in name for x in filter): 42 | print("\tName: " + name + "\n\t\tDate: " + date) 43 | dateFolder = "NonParsable/" 44 | try: 45 | dateFolder = datetime.strptime(date, '%b %d, %Y').strftime("%Y/%m/%d/") 46 | except ValueError: 47 | print("\t\tGoing to NonParsable folder") 48 | ensureDir(baseDir + dateFolder + name + "/") 49 | postSoup = getSoup(url) 50 | for countTag in postSoup.findAll('div', { "class" : "count-tag" }): 51 | try: 52 | img = countTag.parent.find('img') 53 | imgSrc = img['src'].split('?')[0] + "?quality=100" 54 | imgName = img['src'].split('?')[0].split('/')[-1] 55 | if any(x in img['class'] for x in { "gif-animate" }): 56 | imgSrc = img['data-gifsrc'].split('?')[0] 57 | imgName = img['data-gifsrc'].split('?')[0].split('/')[-1] 58 | print("\t\t\tImage" + countTag.text + ": " + imgSrc) 59 | fileDl(imgSrc, baseDir + dateFolder + name + "/", "\t\t\t\t", imgName) 60 | except: 61 | 62 | pass 63 | -------------------------------------------------------------------------------- /thesandbornmaps.cudl.colorado.edu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import urllib2, sys, re, getopt 3 | from bs4 import BeautifulSoup 4 | from functions import getSoup, fileDl, ensureDir, roundUpTo, roundDownTo 5 | 6 | baseUrl = "http://cudl.colorado.edu/luna/servlet/view/all?sort=city%2Cdate%2Csheet&os=" 7 | baseDir = "/root/maps/" 8 | 9 | def usage(): 10 | print("University of Colorado Sanborn Maps Scraper. Requested by /u/WhiskeyQuebec. Made by /u/nicba1010.") 11 | print("Specify directory or it'll all go to your root folder!!!") 12 | print("Options available: ") 13 | print("\t-s, --simple\tConstructs the simple/flat directory structure") 14 | print("\t-h, --help\tShows this text") 15 | print("\t--from=\t\tStart at the given document number") 16 | print("\t--to=\t\tEnd with the given document number") 17 | print("\t--save-dir=\tStore at this location") 18 | 19 | startDoc = 0 20 | endDoc = -1 21 | simple = False 22 | 23 | try: 24 | opts, args = getopt.getopt(sys.argv[1:], "hs", ["help", "simple", "from=", "to=", "save-dir="]) 25 | except getopt.GetoptError as err: 26 | print str(err) 27 | usage() 28 | sys.exit() 29 | 30 | for o, a in opts: 31 | if o in ("-s", "--simple"): 32 | simple = True 33 | elif o in ("-h", "--help"): 34 | usage() 35 | sys.exit() 36 | elif o == "--from": 37 | startDoc = int(a) 38 | elif o == "--to": 39 | endDoc = int(a) 40 | elif o == "--save-dir": 41 | baseDir = a 42 | else: 43 | usage() 44 | assert False, "unhandled option" 45 | sys.exit() 46 | 47 | ensureDir(baseDir) 48 | documentSoup = getSoup(baseUrl + "0") 49 | documentTotal = int(documentSoup.find('div', { "id" : "PageRange" }).text.split('of')[1].strip().replace(',','')) 50 | print str(documentTotal) + " documents to download. Let's get started!" 
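# How the 50-per-page windowing below works (illustrative numbers only, not
# taken from the script): the listing is paged via the "os=" offset, so the
# requested --from/--to range is snapped to 50-document boundaries with
# roundDownTo/roundUpTo (imported from functions.py above), and documents
# outside the range are skipped one by one inside the loop.
# For example, assuming --from=73 and --to=128:
#   roundDownTo(73, 50) == 50    # offset of the page containing document 73
#   roundUpTo(128, 50) == 150    # first offset past the last wanted document
# so only os=50 and os=100 are fetched, and documents 51-72 are skipped.
assert roundDownTo(73, 50) == 50 and roundUpTo(128, 50) == 150  # sanity check of the rounding helpers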
51 | if endDoc == -1: 52 | endDoc = documentTotal 53 | 54 | documentNum = 1 + roundDownTo(startDoc, 50) 55 | print("Scanning range from " + str(roundDownTo(startDoc, 50)) + " to " + str(documentTotal if roundUpTo(endDoc, 50) > documentTotal else roundUpTo(endDoc, 50))) 56 | for i in range(roundDownTo(startDoc, 50), documentTotal if roundUpTo(endDoc, 50) > documentTotal else roundUpTo(endDoc, 50), 50): 57 | print("Documents " + str(i) + " to " + str(i+50)) 58 | groupSoup = getSoup(baseUrl + str(i)) 59 | for mediaContainer in groupSoup.findAll('div', { "class" : "mediaContainer" }): 60 | if documentNum < startDoc: 61 | documentNum += 1 62 | print("Skiping Document " + str(documentNum)) 63 | continue 64 | elif documentNum > endDoc: 65 | print("My job here is done!") 66 | sys.exit(420) 67 | print("\tDocument " + str(documentNum)) 68 | documentNum += 1 69 | blockQuotes = mediaContainer.findAll('blockquote') 70 | try: 71 | print("\t\tCity: \t\t" + blockQuotes[0].text.strip()) 72 | print("\t\tDate: \t\t" + blockQuotes[1].text.strip()) 73 | print("\t\tSheet: \t\t" + blockQuotes[2].text.strip()) 74 | except Exception: 75 | pass 76 | singleDocumentUrl = mediaContainer.find('a')['href'] 77 | print("\t\tDoc Url: \t" + singleDocumentUrl) 78 | singleDocumentSoup = getSoup(singleDocumentUrl) 79 | theJavaScript = singleDocumentSoup.find('div', { "class" : "controlStrip" }).nextSibling.nextSibling 80 | theJP2Url = str(theJavaScript).split("openPdfInWindow")[1].splitlines()[5].strip()[11:-38] 81 | print("\t\tJP2 Url: \t" + theJP2Url) 82 | theXMLUrl = "" 83 | theXMLId = theJP2Url.split('/')[-1][:-4] 84 | if theXMLId == "bou00003": 85 | theXMLUrl = "http://ucblibraries.colorado.edu/systems/digitalinitiatives/xml/bou00.xml" 86 | else: 87 | try: 88 | theXMLUrl = singleDocumentSoup.find('td', text=re.compile(r'METS XML View')).parent.nextSibling.nextSibling.find('a')['href'] 89 | except TypeError: 90 | theXMLUrl = raw_input("\t\tThe XML is not valid, check the page manually and try to find the real XML file, input the url here: ") 91 | print("\t\tXML Url: \t" + str(theXMLUrl)) 92 | if not simple: 93 | ensureDir(baseDir + blockQuotes[0].text.strip() + "/" + blockQuotes[1].text.strip() + "/") 94 | fileDl(theXMLUrl, baseDir if simple else (baseDir + blockQuotes[0].text.strip() + "/" + blockQuotes[1].text.strip() + "/"), "\t\t\t") 95 | print("\t\tXML ID: \t" + theXMLId) 96 | xmlSoup = getSoup(theXMLUrl) 97 | admId = xmlSoup.find('filegrp').find('file', { "id" : theXMLId })['admid'] 98 | imageWidth = int(xmlSoup.find('techmd', { "id" : admId }).find('imagewidth').text.strip()) 99 | imageHeight = int(xmlSoup.find('techmd', { "id" : admId }).find('imagelength').text.strip()) 100 | print("\t\tWidth: \t\t" + str(imageWidth)) 101 | print("\t\tHeight: \t" + str(imageHeight)) 102 | finalUrl = theJP2Url + "&x=0&y=0&width=" + str(imageWidth) + "&height=" + str(imageHeight) 103 | print("\t\tFinal Url: \t" + finalUrl) 104 | fileDl(finalUrl, baseDir if simple else (baseDir + blockQuotes[0].text.strip() + "/" + blockQuotes[1].text.strip() + "/"), "\t\t\t", theXMLId + ".jp2") 105 | -------------------------------------------------------------------------------- /wall.alphacoders.com.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import sys, getopt 3 | from bs4 import BeautifulSoup 4 | from functions import getSoup, ensureDir, fileDl 5 | 6 | def usage(): 7 | print("wall.alphacoders.com scraping script. 
Made by /u/nicba1010") 8 | print("Options and arguments:") 9 | print("\t-h, --help\tShows this printout") 10 | print("\t--update\tStops at first found already downloaded") 11 | print("\t--save-dir=\tStore at this location") 12 | 13 | bUrl = "https://wall.alphacoders.com/" 14 | baseUrl = bUrl + "newest_wallpapers.php?page=" 15 | baseDir = "/root/wall.alphacoders.com/" 16 | update = False 17 | stop = False 18 | 19 | try: 20 | opts, args = getopt.getopt(sys.argv[1:], "h", ["help", "update", "save-dir="]) 21 | except getopt.GetoptError as err: 22 | print str(err) 23 | usage() 24 | sys.exit() 25 | 26 | for o, a in opts: 27 | if o == "--update": 28 | update = True 29 | elif o in ("-h", "--help"): 30 | usage() 31 | sys.exit() 32 | elif o == "--save-dir": 33 | baseDir = a 34 | else: 35 | usage() 36 | assert False, "unhandled option" 37 | sys.exit() 38 | 39 | def processWallpaper(url): 40 | wallpaperSoup = getSoup(url) 41 | wallpaperOriginalUrl = wallpaperSoup.find('span', { "class" : "btn btn-success download-button" })['data-href'] 42 | sys.stdout.write("\t\tOriginal Wallpaper Url: " + wallpaperOriginalUrl + "\n\t\t\t") 43 | categories = wallpaperSoup.find('div', { "class" : "floatright" }).findAll('strong') 44 | name = wallpaperSoup.find('div', {'class': 'container center'}).find('div').text.strip().replace("/",".") 45 | tags = wallpaperSoup.findAll('div', {'style': 'padding:5px 10px; margin:1px; display:inline-block;'}) 46 | tagArray = [None]*len(tags) 47 | taglist = "" 48 | index = 0 49 | if len(tags) > 0: 50 | for tag in tags: 51 | tagArray[index] = tag.text.strip() 52 | index += 1 53 | tagArray.sort() 54 | for tag in tagArray: 55 | taglist += "[" + tag + "]" 56 | fileName = taglist + name + ((" " if len(taglist) > 0 else "") if len(name) == 0 else " - ") + wallpaperOriginalUrl.split('/')[-4] + "." 
+ wallpaperOriginalUrl.split('/')[-2] 57 | directoryStructure = baseDir 58 | for i in range(0, len(categories)): 59 | sys.stdout.write(categories[i].text.strip() + ("" if i == (len(categories) - 1) else " => ")) 60 | directoryStructure += categories[i].text.strip() + "/" 61 | sys.stdout.write("\n\t\t\t\tSaving to: " + directoryStructure + fileName + "\n") 62 | ensureDir(directoryStructure) 63 | retval = fileDl(wallpaperOriginalUrl, directoryStructure, "\t\t\t\t\t", fileName) 64 | if int(retval) == 42 and update: 65 | global stop 66 | stop = True 67 | 68 | wallSoup = getSoup(baseUrl + "0") 69 | totalPages = int(wallSoup.find('ul', { "class" : "pagination pagination" }).findAll('li')[-1].find('a')['href'].split('=')[1]) 70 | for i in range(0, totalPages+1): 71 | print("Scraping page " + str(i) + "...") 72 | for thumbContainer in getSoup(baseUrl + str(i)).findAll('div', { "class" : "thumb-container-big " }): 73 | wallpaperUrl = bUrl + thumbContainer.find('a')['href'] 74 | print ("\tbig.php url: " + wallpaperUrl) 75 | processWallpaper(wallpaperUrl) 76 | if stop: 77 | sys.exit(420) 78 | -------------------------------------------------------------------------------- /wotd.dictionary.com.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import urllib2, sys 3 | from bs4 import BeautifulSoup 4 | from functions import getSoup, daterange, fileDl, ensureDir 5 | from datetime import date, datetime 6 | 7 | def dlForDate(singleDate): 8 | print("Getting Word of the Day for: " + singleDate.strftime("%Y/%m/%d")) 9 | wordSoup = getSoup("http://www.dictionary.com/wordoftheday/" + singleDate.strftime("%Y/%m/%d") + "/") 10 | url = wordSoup.find('meta', { "property" : "og:image" })['content'] 11 | print("\tDownloading:" + url) 12 | fileDl(url, sys.argv[1], "\t\t") 13 | 14 | if (len(sys.argv) < 2) or sys.argv[1] == "--help": 15 | print("Scraping script for dictionary.com/wordoftheday/ first WOTD: 1999/5/3") 16 | print("Usage:\n\tAll until today: python wotd.dictionary.com.py \"/mnt/what/ever/directory/\"") 17 | print("\tSpecific date: python wotd.dictionary.com.py \"/mnt/what/ever/directory/\" yyyy/mm/dd -single") 18 | print("\tDate range: python wotd.dictionary.com.py \"/mnt/what/ever/directory/\" yyyy/mm/dd yyyy/mm/dd") 19 | sys.exit() 20 | 21 | startDate = date(1999, 5, 3) 22 | endDate = date.today() 23 | ensureDir(sys.argv[1]) 24 | 25 | if len(sys.argv) == 4 and sys.argv[3] == "-single": 26 | startDate = datetime.strptime(sys.argv[2], "%Y/%m/%d") 27 | print startDate 28 | dlForDate(startDate) 29 | sys.exit() 30 | elif len(sys.argv) == 4 and sys.argv[3] != "-single": 31 | startDate = datetime.strptime(sys.argv[2], "%Y/%m/%d") 32 | endDate = datetime.strptime(sys.argv[3], "%Y/%m/%d") 33 | 34 | for singleDate in daterange(startDate, endDate): 35 | dlForDate(singleDate) 36 | -------------------------------------------------------------------------------- /x-art.com.py: -------------------------------------------------------------------------------- 1 | import mechanize 2 | import os, sys 3 | import os.path 4 | import cookielib 5 | from bs4 import BeautifulSoup 6 | import html2text 7 | from datetime import datetime 8 | import requests 9 | import shutil 10 | 11 | 12 | class HeadRequest(mechanize.Request): 13 | def get_method(self): 14 | return "HEAD" 15 | 16 | 17 | def ensureDir(f): 18 | if not os.path.exists(f): 19 | os.makedirs(f) 20 | 21 | 22 | def videoDl(browseItem): 23 | coverUrl = None 24 | try: 25 | coverUrl = 
browseItem.find('img')['data-interchange'].split(' ')[4][1:-1] 26 | except: 27 | coverUrl = browseItem.find('img')['src'] 28 | print("\tALT Cover:\t" + coverUrl) 29 | date = datetime.strptime(browseItem.find('h2').text.strip(), "%b %d, %Y") 30 | videoLink = browseItem.parent['href'] 31 | videoHtml = br.open(videoLink).read() 32 | videoSoup = BeautifulSoup(videoHtml, 'lxml') 33 | wideCoverUrl = \ 34 | videoSoup.find('div', {'id': 'mediaplayer'}).parent.findAll('script')[2].text.split('"image"')[1][3:].split( 35 | '"')[0] 36 | rowSingle = videoSoup.find('div', {'class': 'row single'}) 37 | name = rowSingle.findAll('div', {'class': 'small-12 medium-12 large-12 columns'})[1].find('h1').text.strip() 38 | dropDl = videoSoup.find('ul', {'id': 'drop-download'}) 39 | biggest = "" 40 | biggestSize = 0 41 | print(name + "\n\tURL:\t" + videoLink) 42 | for a in dropDl.findAll('a'): 43 | if int(a.text.strip().split('(')[1].split(' MB)')[0]) > biggestSize: 44 | biggestSize = int(a.text.strip().split('(')[1].split(' MB)')[0]) 45 | biggest = a['href'] 46 | print("\tSize:\t" + str(biggestSize) + "\n\tDL URL:\t" + biggest) 47 | dlDir = baseDir + date.strftime("%Y-%m-%d") + " - " + name + "/" 48 | ensureDir(dlDir) 49 | contentLength = str(br.open(HeadRequest(biggest)).info()).splitlines()[2].split(' ')[1].strip() 50 | print("\tDL Dir:\t" + dlDir) 51 | with open(dlDir + "page.html", "w") as htmlFile: 52 | htmlFile.write(videoHtml) 53 | try: 54 | br.retrieve(coverUrl.strip(), dlDir + "portrait.jpg") 55 | except Exception as e: 56 | r = requests.get(coverUrl.strip(), cookies=cj, stream=True) 57 | with open(dlDir + "portrait.jpg", "wb") as portraitFile: 58 | r.raw.decode_content = True 59 | shutil.copyfileobj(r.raw, portraitFile) 60 | try: 61 | br.retrieve(wideCoverUrl.strip(), dlDir + "wide.jpg") 62 | except Exception as e: 63 | r = requests.get(wideCoverUrl.strip(), cookies=cj, stream=True) 64 | with open(dlDir + "wide.jpg", "wb") as wideFile: 65 | r.raw.decode_content = True 66 | shutil.copyfileobj(r.raw, wideFile) 67 | while True: 68 | if os.path.isfile(dlDir + name + ".mp4"): 69 | if int(contentLength) > os.stat(dlDir + name + ".mp4").st_size: 70 | print("\tFilesize too low:\t" + str(os.stat(dlDir + name + ".mp4").st_size)) 71 | print("\tShould be:\t\t" + contentLength) 72 | os.remove(dlDir + name + ".mp4") 73 | else: 74 | return 75 | br.retrieve(biggest, dlDir + name + ".mp4") 76 | 77 | 78 | def galleryDl(browseItem): 79 | coverUrl = None 80 | try: 81 | coverUrl = browseItem.find('img')['data-interchange'].split(' ')[4][1:-1] 82 | except: 83 | coverUrl = browseItem.find('img')['src'] 84 | print("\tALT Cover:\t" + coverUrl) 85 | date = datetime.strptime(browseItem.find('h2').text.strip(), "%b %d, %Y") 86 | videoLink = browseItem.parent['href'] 87 | videoHtml = br.open(videoLink).read() 88 | videoSoup = BeautifulSoup(videoHtml, 'lxml') 89 | bigCoverUrl = \ 90 | videoSoup.find('div', {'class': 'small-12 medium-12 large-6 columns media'}).find('img')['src'].replace('sml', 'lrg') 91 | rowSingle = videoSoup.find('div', {'class': 'row single info-fixed'}) 92 | name = rowSingle.findAll('div', {'class': 'small-12 medium-12 large-12 columns'})[0].find('h1').text.strip() 93 | dropDl = rowSingle.find('ul', {'id': 'drop-download'}) 94 | biggest = "" 95 | biggestSize = 0 96 | print(name + "\n\tURL:\t" + videoLink) 97 | for a in dropDl.findAll('a'): 98 | if int(a.text.strip().split(' ')[0]) > biggestSize: 99 | biggestSize = int(a.text.strip().split(' ')[0]) 100 | biggest = a['href'] 101 | print("\tSize:\t" + 
str(biggestSize) + "\n\tDL URL:\t" + biggest) 102 | dlDir = baseDir + date.strftime("%Y-%m-%d") + " - [Gallery] - " + name + "/" 103 | ensureDir(dlDir) 104 | contentLength = str(br.open(HeadRequest(biggest)).info()).splitlines()[2].split(' ')[1].strip() 105 | print("\tDL Dir:\t" + dlDir) 106 | with open(dlDir + "page.html", "w") as htmlFile: 107 | htmlFile.write(videoHtml) 108 | try: 109 | br.retrieve(coverUrl.strip(), dlDir + "portrait.jpg") 110 | except Exception as e: 111 | r = requests.get(coverUrl.strip(), cookies=cj, stream=True) 112 | with open(dlDir + "portrait.jpg", "wb") as portraitFile: 113 | r.raw.decode_content = True 114 | shutil.copyfileobj(r.raw, portraitFile) 115 | try: 116 | br.retrieve(bigCoverUrl.strip(), dlDir + "bigPortrait.jpg") 117 | except Exception as e: 118 | r = requests.get(coverUrl.strip(), cookies=cj, stream=True) 119 | with open(dlDir + "bigPortrait.jpg", "wb") as wideFile: 120 | r.raw.decode_content = True 121 | shutil.copyfileobj(r.raw, wideFile) 122 | while True: 123 | if os.path.isfile(dlDir + name + ".zip"): 124 | if int(contentLength) > os.stat(dlDir + name + ".zip").st_size: 125 | print("\tFilesize too low:\t" + str(os.stat(dlDir + name + ".zip").st_size)) 126 | print("\tShould be:\t\t" + contentLength) 127 | os.remove(dlDir + name + ".zip") 128 | else: 129 | return 130 | br.retrieve(biggest, dlDir + name + ".zip") 131 | 132 | 133 | # Browser 134 | br = mechanize.Browser() 135 | 136 | # Cookie Jar 137 | cj = cookielib.LWPCookieJar() 138 | br.set_cookiejar(cj) 139 | 140 | # Browser options 141 | br.set_handle_equiv(True) 142 | br.set_handle_gzip(True) 143 | br.set_handle_redirect(True) 144 | br.set_handle_referer(True) 145 | br.set_handle_robots(False) 146 | br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1) 147 | 148 | br.addheaders = [('User-agent', 'Chrome')] 149 | 150 | br.open('http://x-art.com/members') 151 | 152 | br.select_form(nr=0) 153 | 154 | br.form['uid'] = '' 155 | br.form['pwd'] = '' 156 | 157 | br.submit() 158 | 159 | baseDir = "/mnt/san/Operation X-ART/" 160 | 161 | for i in range(1, 40): 162 | try: 163 | if sys.argv[1] == "-svid": 164 | continue 165 | except: 166 | pass 167 | pageHtml = br.open('http://www.x-art.com/members/videos/recent/All/' + str(i) + '/').read() 168 | pageSoup = BeautifulSoup(pageHtml, 'lxml') 169 | for browseItem in pageSoup.findAll('div', {'class': 'browse-item'}): 170 | videoDl(browseItem) 171 | 172 | for i in reversed(range(1, 64)): 173 | print("Page: " + str(i)) 174 | try: 175 | if sys.argv[1] == "-spic": 176 | continue 177 | except: 178 | pass 179 | pageHtml = br.open('http://www.x-art.com/members/galleries/recent/All/' + str(i) + '/').read() 180 | pageSoup = BeautifulSoup(pageHtml, 'lxml') 181 | for browseItem in pageSoup.findAll('div', {'class': 'browse-item'}): 182 | if len(browseItem.findAll('h2')[1].text.replace('Images', '').strip()) > 0: 183 | galleryDl(browseItem) 184 | if i < 15: 185 | raw_input("Press Enter to continue...") 186 | --------------------------------------------------------------------------------
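A closing note on a pattern shared by most of these scripts: fileDl/fileDlWithAuth in functions.py, fileDL in lyndaleigh.com.py and the retry loops in x-art.com.py all compare the server's Content-Length against the size of any file already on disk, skip the download when the sizes match, and otherwise stream the body to disk in fixed-size chunks. A minimal Python 3 sketch of that idea using requests (the function name, chunk size and progress output are illustrative, not taken from the repository):

#!/usr/bin/python3
# Sketch of the "skip if an identically sized copy exists" download pattern.
import os
import requests

def download(url, path, chunk_size=65536, session=None):
    # e.g. download("https://example.com/file.jpg", "/tmp/file.jpg")
    s = session or requests.Session()
    with s.get(url, stream=True) as response:
        response.raise_for_status()
        expected = int(response.headers.get("Content-Length", -1))
        # Skip the transfer when a file of the expected size is already on disk.
        if expected != -1 and os.path.isfile(path) and os.path.getsize(path) == expected:
            print("File already downloaded!")
            return
        done = 0
        with open(path, "wb") as handle:
            for chunk in response.iter_content(chunk_size=chunk_size):
                handle.write(chunk)
                done += len(chunk)
                print("\r%12d / %s bytes" % (done, expected if expected != -1 else "???"), end="")
    print("\nDone :)")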