├── .gitignore ├── 1024.py ├── README.md ├── convert └── socks.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.jpg 3 | *.png 4 | download_list.txt 5 | -------------------------------------------------------------------------------- /1024.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | #encoding=utf-8 3 | # Author: Aaron 4 | # Last modified: 2014-10-22 23:42 5 | # Filename: 1024.py 6 | # Description: 7 | 8 | import sys 9 | import time 10 | import socks 11 | import socket 12 | import urllib2 13 | import urllib 14 | import re 15 | import os 16 | import threading 17 | 18 | SOCKS_PORT = 1080 19 | 20 | USING_PROXY = False 21 | 22 | def get_url(): 23 | pass 24 | if len(sys.argv) != 3: 25 | print >> sys.stderr, "need a url and output name" 26 | sys.exit(-1) 27 | return sys.argv[1] 28 | 29 | def get_outfname(): 30 | pass 31 | if len(sys.argv) != 3: 32 | print >> sys.stderr, "need a url and output name" 33 | sys.exit(-1) 34 | return sys.argv[2] 35 | 36 | def get_html(url): 37 | pass 38 | global SOCKS_PORT 39 | global USING_PROXY 40 | if USING_PROXY: 41 | socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", SOCKS_PORT) 42 | socket.socket = socks.socksocket 43 | 44 | req = urllib2.Request(url, headers={'User-Agent' : "Magic Browser"}) 45 | response = urllib2.urlopen(req) 46 | return response.read() 47 | 48 | def find_image_urls(html): 49 | p = re.compile(r'https?://\S+\.jpe?g') 50 | urls = p.findall(html) 51 | #urls = re.findall(r'https?://\S+\.(jpeg|jpg|png)', html) 52 | if len(urls) == 0: 53 | print >> sys.stderr, "Can not find image urls" 54 | 55 | # filter www.viidii.info redirection 56 | urls = map(lambda url: 'www.viidii.info' in url and urllib.unquote_plus(url)[urllib.unquote_plus(url).find('url=')+4:] or url, urls) 57 | 58 | domain_count = {} 59 | for url in urls: 60 | domain = url.split('/')[2] 61 | domain = 
'.'.join(domain.split('.')[-2:]) 62 | if domain not in domain_count: 63 | domain_count[domain] = 1 64 | else: 65 | domain_count[domain] += 1 66 | domain_count = sorted(domain_count.items(), key=lambda x:x[1], reverse=True) 67 | 68 | top_domain = domain_count[0][0] 69 | 70 | urls = filter(lambda x : top_domain in x, urls) 71 | 72 | out_f = file('download_list.txt', 'w') 73 | for url in urls: 74 | out_f.write(url+'\n') 75 | out_f.close 76 | 77 | return urls 78 | 79 | 80 | def download_images_wget(image_urls): 81 | os.system('rm -rf tmp; mkdir tmp') 82 | os.system("cd tmp; wget -i ../download_list.txt; cd -") 83 | 84 | def download_images(image_urls): 85 | total_images_count = len(image_urls) 86 | os.system('rm -rf tmp; mkdir tmp') 87 | print >> sys.stderr, "Downloading %d images ..." % (total_images_count) 88 | threads = [] 89 | for i in xrange(0, total_images_count): 90 | image_url = image_urls[i] 91 | t = threading.Thread(target=down_load_single_image, args=(image_url, i)) 92 | threads.append(t) 93 | 94 | for t in threads: 95 | while threading.activeCount() > 11: 96 | time.sleep(1) 97 | t.start() 98 | while threading.activeCount() > 1: 99 | time.sleep(1) 100 | 101 | class AppURLopener(urllib.FancyURLopener): 102 | pass 103 | version="Mozilla/5.0" 104 | 105 | def down_load_single_image(image_url, index): 106 | global SOCKS_PORT 107 | global USING_PROXY 108 | max_retry = 3 109 | is_done = False 110 | for cur_try in xrange(1, max_retry+1): 111 | print >> sys.stderr, "Start downloading image %d attempt %d, url: %s" % (index,cur_try, image_url) 112 | if USING_PROXY: 113 | socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", SOCKS_PORT) 114 | socket.setdefaulttimeout(20) 115 | socket.socket = socks.socksocket 116 | try: 117 | urllib._urlopener = AppURLopener() 118 | urllib.urlretrieve(image_url, 'tmp/%d.jpg' % (index+1)) 119 | except Exception,e: 120 | print >> sys.stderr, e 121 | continue 122 | print >> sys.stderr, "Finish downloading image %d on attempt %d, url: 
%s" % (index,cur_try, image_url) 123 | is_done = True 124 | return 125 | if False == is_done: 126 | print >> sys.stderr, "Failed downloading image %d, url: %s" % (index, image_url) 127 | 128 | 129 | 130 | def create_pdf(fname): 131 | os.system("./convert tmp/*.jpg %s.pdf" % fname) 132 | 133 | 134 | if __name__ == '__main__': 135 | url = get_url() 136 | html = get_html(url) 137 | image_urls = find_image_urls(html) 138 | download_images(image_urls) 139 | fname = get_outfname() 140 | create_pdf(fname) 141 | print "Thanks to Aaron, good person one life flat safe, 1024 Amen!" 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | #1024 to PDF# 2 | ## Descriprion ## 3 | - Download all jpg or jpeg images on a webpage and merge them into one PDF. 4 | - Surpporting MacOS or Linux, with Python2.7. Lower version may work too. 5 | - if USING_PROXY is True, 1024 need a socks5 proxy to do its work, set the proxy to 127.0.0.1:7777, otherwise you may want to change the source code. 6 | 7 | ##Usage## 8 | `python 1024.py PAGE_URL PDF_FNAME.pdf` 9 | 10 | example: 11 | 12 | `python 1024.py http:://t66y.com/xx/xx/xx.html ~/Desktop/My_1024.pdf` 13 | 14 | 15 | -------------------------------------------------------------------------------- /convert: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scenix007/1024toPDF/6e0856cf3eb56913d019b3f856744cd1bd8c67af/convert -------------------------------------------------------------------------------- /socks.py: -------------------------------------------------------------------------------- 1 | """SocksiPy - Python SOCKS module. 2 | Version 1.00 3 | 4 | Copyright 2006 Dan-Haim. All rights reserved. 
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.
3. Neither the name of Dan Haim nor the names of his contributors may be used
   to endorse or promote products derived from this software without specific
   prior written permission.

THIS SOFTWARE IS PROVIDED BY DAN HAIM "AS IS" AND ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
EVENT SHALL DAN HAIM OR HIS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA
OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


This module provides a standard socket-like interface for Python
for tunneling connections through SOCKS proxies.
30 | 31 | """ 32 | 33 | import socket 34 | import struct 35 | 36 | PROXY_TYPE_SOCKS4 = 1 37 | PROXY_TYPE_SOCKS5 = 2 38 | PROXY_TYPE_HTTP = 3 39 | 40 | _defaultproxy = None 41 | _orgsocket = socket.socket 42 | 43 | class ProxyError(Exception): 44 | def __init__(self, value): 45 | self.value = value 46 | def __str__(self): 47 | return repr(self.value) 48 | 49 | class GeneralProxyError(ProxyError): 50 | def __init__(self, value): 51 | self.value = value 52 | def __str__(self): 53 | return repr(self.value) 54 | 55 | class Socks5AuthError(ProxyError): 56 | def __init__(self, value): 57 | self.value = value 58 | def __str__(self): 59 | return repr(self.value) 60 | 61 | class Socks5Error(ProxyError): 62 | def __init__(self, value): 63 | self.value = value 64 | def __str__(self): 65 | return repr(self.value) 66 | 67 | class Socks4Error(ProxyError): 68 | def __init__(self, value): 69 | self.value = value 70 | def __str__(self): 71 | return repr(self.value) 72 | 73 | class HTTPError(ProxyError): 74 | def __init__(self, value): 75 | self.value = value 76 | def __str__(self): 77 | return repr(self.value) 78 | 79 | _generalerrors = ("success", 80 | "invalid data", 81 | "not connected", 82 | "not available", 83 | "bad proxy type", 84 | "bad input") 85 | 86 | _socks5errors = ("succeeded", 87 | "general SOCKS server failure", 88 | "connection not allowed by ruleset", 89 | "Network unreachable", 90 | "Host unreachable", 91 | "Connection refused", 92 | "TTL expired", 93 | "Command not supported", 94 | "Address type not supported", 95 | "Unknown error") 96 | 97 | _socks5autherrors = ("succeeded", 98 | "authentication is required", 99 | "all offered authentication methods were rejected", 100 | "unknown username or invalid password", 101 | "unknown error") 102 | 103 | _socks4errors = ("request granted", 104 | "request rejected or failed", 105 | "request rejected because SOCKS server cannot connect to identd on the client", 106 | "request rejected because the client program and identd 
report different user-ids", 107 | "unknown error") 108 | 109 | def setdefaultproxy(proxytype=None,addr=None,port=None,rdns=True,username=None,password=None): 110 | """setdefaultproxy(proxytype, addr[, port[, rdns[, username[, password]]]]) 111 | Sets a default proxy which all further socksocket objects will use, 112 | unless explicitly changed. 113 | """ 114 | global _defaultproxy 115 | _defaultproxy = (proxytype,addr,port,rdns,username,password) 116 | 117 | class socksocket(socket.socket): 118 | """socksocket([family[, type[, proto]]]) -> socket object 119 | 120 | Open a SOCKS enabled socket. The parameters are the same as 121 | those of the standard socket init. In order for SOCKS to work, 122 | you must specify family=AF_INET, type=SOCK_STREAM and proto=0. 123 | """ 124 | 125 | def __init__(self, family=socket.AF_INET, type=socket.SOCK_STREAM, proto=0, _sock=None): 126 | _orgsocket.__init__(self,family,type,proto,_sock) 127 | if _defaultproxy != None: 128 | self.__proxy = _defaultproxy 129 | else: 130 | self.__proxy = (None, None, None, None, None, None) 131 | self.__proxysockname = None 132 | self.__proxypeername = None 133 | 134 | def __recvall(self, bytes): 135 | """__recvall(bytes) -> data 136 | Receive EXACTLY the number of bytes requested from the socket. 137 | Blocks until the required number of bytes have been received. 138 | """ 139 | data = "" 140 | while len(data) < bytes: 141 | data = data + self.recv(bytes-len(data)) 142 | return data 143 | 144 | def setproxy(self,proxytype=None,addr=None,port=None,rdns=True,username=None,password=None): 145 | """setproxy(proxytype, addr[, port[, rdns[, username[, password]]]]) 146 | Sets the proxy to be used. 147 | proxytype - The type of the proxy to be used. Three types 148 | are supported: PROXY_TYPE_SOCKS4 (including socks4a), 149 | PROXY_TYPE_SOCKS5 and PROXY_TYPE_HTTP 150 | addr - The address of the server (IP or DNS). 151 | port - The port of the server. 
Defaults to 1080 for SOCKS 152 | servers and 8080 for HTTP proxy servers. 153 | rdns - Should DNS queries be preformed on the remote side 154 | (rather than the local side). The default is True. 155 | Note: This has no effect with SOCKS4 servers. 156 | username - Username to authenticate with to the server. 157 | The default is no authentication. 158 | password - Password to authenticate with to the server. 159 | Only relevant when username is also provided. 160 | """ 161 | self.__proxy = (proxytype,addr,port,rdns,username,password) 162 | 163 | def __negotiatesocks5(self,destaddr,destport): 164 | """__negotiatesocks5(self,destaddr,destport) 165 | Negotiates a connection through a SOCKS5 server. 166 | """ 167 | # First we'll send the authentication packages we support. 168 | if (self.__proxy[4]!=None) and (self.__proxy[5]!=None): 169 | # The username/password details were supplied to the 170 | # setproxy method so we support the USERNAME/PASSWORD 171 | # authentication (in addition to the standard none). 172 | self.sendall("\x05\x02\x00\x02") 173 | else: 174 | # No username/password were entered, therefore we 175 | # only support connections with no authentication. 176 | self.sendall("\x05\x01\x00") 177 | # We'll receive the server's response to determine which 178 | # method was selected 179 | chosenauth = self.__recvall(2) 180 | if chosenauth[0] != "\x05": 181 | self.close() 182 | raise GeneralProxyError((1,_generalerrors[1])) 183 | # Check the chosen authentication method 184 | if chosenauth[1] == "\x00": 185 | # No authentication is required 186 | pass 187 | elif chosenauth[1] == "\x02": 188 | # Okay, we need to perform a basic username/password 189 | # authentication. 
190 | self.sendall("\x01" + chr(len(self.__proxy[4])) + self.__proxy[4] + chr(len(self.proxy[5])) + self.__proxy[5]) 191 | authstat = self.__recvall(2) 192 | if authstat[0] != "\x01": 193 | # Bad response 194 | self.close() 195 | raise GeneralProxyError((1,_generalerrors[1])) 196 | if authstat[1] != "\x00": 197 | # Authentication failed 198 | self.close() 199 | raise Socks5AuthError,((3,_socks5autherrors[3])) 200 | # Authentication succeeded 201 | else: 202 | # Reaching here is always bad 203 | self.close() 204 | if chosenauth[1] == "\xFF": 205 | raise Socks5AuthError((2,_socks5autherrors[2])) 206 | else: 207 | raise GeneralProxyError((1,_generalerrors[1])) 208 | # Now we can request the actual connection 209 | req = "\x05\x01\x00" 210 | # If the given destination address is an IP address, we'll 211 | # use the IPv4 address request even if remote resolving was specified. 212 | try: 213 | ipaddr = socket.inet_aton(destaddr) 214 | req = req + "\x01" + ipaddr 215 | except socket.error: 216 | # Well it's not an IP number, so it's probably a DNS name. 
217 | if self.__proxy[3]==True: 218 | # Resolve remotely 219 | ipaddr = None 220 | req = req + "\x03" + chr(len(destaddr)) + destaddr 221 | else: 222 | # Resolve locally 223 | ipaddr = socket.inet_aton(socket.gethostbyname(destaddr)) 224 | req = req + "\x01" + ipaddr 225 | req = req + struct.pack(">H",destport) 226 | self.sendall(req) 227 | # Get the response 228 | resp = self.__recvall(4) 229 | if resp[0] != "\x05": 230 | self.close() 231 | raise GeneralProxyError((1,_generalerrors[1])) 232 | elif resp[1] != "\x00": 233 | # Connection failed 234 | self.close() 235 | if ord(resp[1])<=8: 236 | raise Socks5Error(ord(resp[1]),_generalerrors[ord(resp[1])]) 237 | else: 238 | raise Socks5Error(9,_generalerrors[9]) 239 | # Get the bound address/port 240 | elif resp[3] == "\x01": 241 | boundaddr = self.__recvall(4) 242 | elif resp[3] == "\x03": 243 | resp = resp + self.recv(1) 244 | boundaddr = self.__recvall(resp[4]) 245 | else: 246 | self.close() 247 | raise GeneralProxyError((1,_generalerrors[1])) 248 | boundport = struct.unpack(">H",self.__recvall(2))[0] 249 | self.__proxysockname = (boundaddr,boundport) 250 | if ipaddr != None: 251 | self.__proxypeername = (socket.inet_ntoa(ipaddr),destport) 252 | else: 253 | self.__proxypeername = (destaddr,destport) 254 | 255 | def getproxysockname(self): 256 | """getsockname() -> address info 257 | Returns the bound IP address and port number at the proxy. 258 | """ 259 | return self.__proxysockname 260 | 261 | def getproxypeername(self): 262 | """getproxypeername() -> address info 263 | Returns the IP and port number of the proxy. 
264 | """ 265 | return _orgsocket.getpeername(self) 266 | 267 | def getpeername(self): 268 | """getpeername() -> address info 269 | Returns the IP address and port number of the destination 270 | machine (note: getproxypeername returns the proxy) 271 | """ 272 | return self.__proxypeername 273 | 274 | def __negotiatesocks4(self,destaddr,destport): 275 | """__negotiatesocks4(self,destaddr,destport) 276 | Negotiates a connection through a SOCKS4 server. 277 | """ 278 | # Check if the destination address provided is an IP address 279 | rmtrslv = False 280 | try: 281 | ipaddr = socket.inet_aton(destaddr) 282 | except socket.error: 283 | # It's a DNS name. Check where it should be resolved. 284 | if self.__proxy[3]==True: 285 | ipaddr = "\x00\x00\x00\x01" 286 | rmtrslv = True 287 | else: 288 | ipaddr = socket.inet_aton(socket.gethostbyname(destaddr)) 289 | # Construct the request packet 290 | req = "\x04\x01" + struct.pack(">H",destport) + ipaddr 291 | # The username parameter is considered userid for SOCKS4 292 | if self.__proxy[4] != None: 293 | req = req + self.__proxy[4] 294 | req = req + "\x00" 295 | # DNS name if remote resolving is required 296 | # NOTE: This is actually an extension to the SOCKS4 protocol 297 | # called SOCKS4A and may not be supported in all cases. 
298 | if rmtrslv==True: 299 | req = req + destaddr + "\x00" 300 | self.sendall(req) 301 | # Get the response from the server 302 | resp = self.__recvall(8) 303 | if resp[0] != "\x00": 304 | # Bad data 305 | self.close() 306 | raise GeneralProxyError((1,_generalerrors[1])) 307 | if resp[1] != "\x5A": 308 | # Server returned an error 309 | self.close() 310 | if ord(resp[1]) in (91,92,93): 311 | self.close() 312 | raise Socks4Error((ord(resp[1]),_socks4errors[ord(resp[1])-90])) 313 | else: 314 | raise Socks4Error((94,_socks4errors[4])) 315 | # Get the bound address/port 316 | self.__proxysockname = (socket.inet_ntoa(resp[4:]),struct.unpack(">H",resp[2:4])[0]) 317 | if rmtrslv != None: 318 | self.__proxypeername = (socket.inet_ntoa(ipaddr),destport) 319 | else: 320 | self.__proxypeername = (destaddr,destport) 321 | 322 | def __negotiatehttp(self,destaddr,destport): 323 | """__negotiatehttp(self,destaddr,destport) 324 | Negotiates a connection through an HTTP server. 325 | """ 326 | # If we need to resolve locally, we do this now 327 | if self.__proxy[3] == False: 328 | addr = socket.gethostbyname(destaddr) 329 | else: 330 | addr = destaddr 331 | self.sendall("CONNECT " + addr + ":" + str(destport) + " HTTP/1.1\r\n" + "Host: " + destaddr + "\r\n\r\n") 332 | # We read the response until we get the string "\r\n\r\n" 333 | resp = self.recv(1) 334 | while resp.find("\r\n\r\n")==-1: 335 | resp = resp + self.recv(1) 336 | # We just need the first line to check if the connection 337 | # was successful 338 | statusline = resp.splitlines()[0].split(" ",2) 339 | if statusline[0] not in ("HTTP/1.0","HTTP/1.1"): 340 | self.close() 341 | raise GeneralProxyError((1,_generalerrors[1])) 342 | try: 343 | statuscode = int(statusline[1]) 344 | except ValueError: 345 | self.close() 346 | raise GeneralProxyError((1,_generalerrors[1])) 347 | if statuscode != 200: 348 | self.close() 349 | raise HTTPError((statuscode,statusline[2])) 350 | self.__proxysockname = ("0.0.0.0",0) 351 | 
self.__proxypeername = (addr,destport) 352 | 353 | def connect(self,destpair): 354 | """connect(self,despair) 355 | Connects to the specified destination through a proxy. 356 | destpar - A tuple of the IP/DNS address and the port number. 357 | (identical to socket's connect). 358 | To select the proxy server use setproxy(). 359 | """ 360 | # Do a minimal input check first 361 | if (type(destpair) in (list,tuple)==False) or (len(destpair)<2) or (type(destpair[0])!=str) or (type(destpair[1])!=int): 362 | raise GeneralProxyError((5,_generalerrors[5])) 363 | if self.__proxy[0] == PROXY_TYPE_SOCKS5: 364 | if self.__proxy[2] != None: 365 | portnum = self.__proxy[2] 366 | else: 367 | portnum = 1080 368 | _orgsocket.connect(self,(self.__proxy[1],portnum)) 369 | self.__negotiatesocks5(destpair[0],destpair[1]) 370 | elif self.__proxy[0] == PROXY_TYPE_SOCKS4: 371 | if self.__proxy[2] != None: 372 | portnum = self.__proxy[2] 373 | else: 374 | portnum = 1080 375 | _orgsocket.connect(self,(self.__proxy[1],portnum)) 376 | self.__negotiatesocks4(destpair[0],destpair[1]) 377 | elif self.__proxy[0] == PROXY_TYPE_HTTP: 378 | if self.__proxy[2] != None: 379 | portnum = self.__proxy[2] 380 | else: 381 | portnum = 8080 382 | _orgsocket.connect(self,(self.__proxy[1],portnum)) 383 | self.__negotiatehttp(destpair[0],destpair[1]) 384 | elif self.__proxy[0] == None: 385 | _orgsocket.connect(self,(destpair[0],destpair[1])) 386 | else: 387 | raise GeneralProxyError((4,_generalerrors[4])) 388 | --------------------------------------------------------------------------------