├── .gitignore ├── Download.py ├── Api.py ├── README.md ├── main.py └── PostParser.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | 21 | # Installer logs 22 | pip-log.txt 23 | 24 | # Unit test / coverage reports 25 | .coverage 26 | .tox 27 | nosetests.xml 28 | 29 | # Translations 30 | *.mo 31 | 32 | # Mr Developer 33 | .mr.developer.cfg 34 | .project 35 | .pydevproject 36 | -------------------------------------------------------------------------------- /Download.py: -------------------------------------------------------------------------------- 1 | __author__ = 'rast' 2 | 3 | import urllib2 4 | from os.path import join, exists, isfile, splitext 5 | import sys 6 | import logging 7 | 8 | def download(urllist, root_dir): 9 | """Thanks to StackOverflow guys""" 10 | for url, name, subdir in urllist: 11 | if name is None: 12 | name = url.split('/')[-1] 13 | dir = join(root_dir, subdir) 14 | fname = join(dir, name) 15 | u = urllib2.urlopen(url) 16 | 17 | #file might exist, so add (1) or (2) etc 18 | counter = 1 19 | if exists(fname) and isfile(fname): 20 | name, ext = splitext(fname) 21 | fname = name + " ({})".format(counter) + ext 22 | while exists(fname) and isfile(fname): 23 | counter += 1 24 | name, ext = splitext(fname) 25 | fname = name[:-4] + " ({})".format(counter) + ext 26 | logging.info(u"Start dl: {}".format(fname)) 27 | f = open(fname, 'wb') 28 | meta = u.info() 29 | file_size = int(meta.getheaders("Content-Length")[0]) 30 | sys.stdout.write("Downloading: %s (%s kb)\n" % (fname.encode('ascii', 'ignore'), file_size/1024)) 31 | 32 | file_size_dl = 0 33 | block_sz = 8192 34 | while True: 35 | buffer = u.read(block_sz) 36 | if not buffer: 37 | break 38 | 39 | file_size_dl += len(buffer) 40 | 
f.write(buffer) 41 | status = r"%10d [%3.2f%%]" % (file_size_dl, file_size_dl * 100. / file_size) 42 | status = status + chr(8)*(len(status)+1) 43 | sys.stdout.write(status) 44 | 45 | f.close() 46 | logging.info(u" End dl: {}".format(fname)) -------------------------------------------------------------------------------- /Api.py: -------------------------------------------------------------------------------- 1 | __author__ = 'rast' 2 | 3 | import json 4 | import urllib2 5 | from urllib import urlencode 6 | import re 7 | from time import sleep 8 | import logging 9 | 10 | def auth(args): 11 | """Interact with user to get access_token""" 12 | 13 | url = "https://oauth.vk.com/oauth/authorize?" + \ 14 | "redirect_uri=https://oauth.vk.com/blank.html&response_type=token&" + \ 15 | "client_id=%s&scope=%s&display=wap" % (args.app_id, ",".join(args.access_rights)) 16 | 17 | print("Please open this url:\n\n\t{}\n".format(url)) 18 | raw_url = raw_input("Grant access to your acc and copy resulting URL here: ") 19 | res = re.search('access_token=([0-9A-Fa-f]+)', raw_url, re.I) 20 | if res is not None: 21 | return res.groups()[0] 22 | else: 23 | return None 24 | 25 | def captcha(data): 26 | """Ask user to solve captcha""" 27 | logging.debug("Captcha needed..") 28 | print("VK thinks you're a bot - and you are ;)") 29 | print("They want you to solve CAPTCHA. 
Please open this URL, and type here a captcha solution:") 30 | print("\n\t{}\n".format(data[u'error'][u'captcha_img'])) 31 | solution = raw_input("Solution = ").strip() 32 | return data[u'error'][u'captcha_sid'], solution 33 | 34 | 35 | def call_api(method, params, args): 36 | while True: 37 | if isinstance(params, list): 38 | params_list = [kv for kv in params] 39 | elif isinstance(params, dict): 40 | params_list = params.items() 41 | else: 42 | params_list = [params] 43 | params_list.append(("access_token", args.token)) 44 | url = "https://api.vk.com/method/%s?%s" % (method, urlencode(params_list)) 45 | 46 | json_stuff = urllib2.urlopen(url).read() 47 | result = json.loads(json_stuff) 48 | if u'error' in result.keys(): 49 | if result[u'error'][u'error_code'] == 6: # too many requests 50 | logging.debug("Too many requests per second, sleeping..") 51 | sleep(1) 52 | continue 53 | elif result[u'error'][u'error_code'] == 14: # captcha needed :\ 54 | sid, key = captcha(result) 55 | params.extend([(u"captcha_sid", sid), (u"captcha_key", key)]) 56 | continue 57 | else: 58 | msg = "API call resulted in error ({}): {}".format(result[u'error'][u'error_code'], 59 | result[u'error'][u'error_msg']) 60 | logging.error(msg) 61 | raise RuntimeError(msg) 62 | else: 63 | logging.debug("API call succeeded: {}".format(url)) 64 | break 65 | 66 | if not u'response' in result.keys(): 67 | msg = "API call result has no response" 68 | logging.error(msg) 69 | raise RuntimeError(msg) 70 | else: 71 | #logging.debug("API call answer: {}".format(str(result[u'response']))) 72 | return result[u'response'], json_stuff 73 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | vkd (en) 2 | === 3 | WARNING: it is dead, not supported and does not work. See https://github.com/Rast1234/VOffline instead. 
4 | === 5 | 6 | VK dumper - save stuff from your vk.com to local drive 7 | --- 8 | 9 | **Features:** 10 | 11 | * Download almost anything from your wall: 12 | * Posts 13 | * Multimedia attachments 14 | * Comments 15 | * Comment attachments 16 | * Some extra raw info for your history 17 | * Download audio tracks 18 | * Correct file naming 19 | * Text stored also (if available) 20 | * Sort by playlists (as subfolders) 21 | * Download docs 22 | * Auto-change filename if exists 23 | * Write correct extension 24 | * **TODO:** Downloading video 25 | * **TODO:** Downloading notes 26 | 27 | **Known limitations, bugs and other considerations:** 28 | 29 | * Unable to download videos (Anyone need this?) 30 | * Unable to download note comments (Maybe an unneeded feature?) 31 | * Tested on a particular user, audio tested on a group 32 | * Can't grant access by itself, so you need to enter auth token manually (see below) 33 | * If wall, documents or audio list has been modified during work, it **will** cause unpredictable things. 34 | * Sometimes you need to solve CAPTCHA (interactively) 35 | 36 | Usage and requirements 37 | --- 38 | You need **Python 2.7** only for this to work. Should work under different OSes, not tested. 39 | For the full list of available arguments see `--help` output. 40 | Essential options described below: 41 | * `-i / --id` User ID or group ID to dump. Group ID should be prefixed with `-`, e.g. `-123456` 42 | * `-a / --app_id` Application ID, see below. 43 | * `-m / --mode` Working mode. Supported modes are `wall`, `docs` and `audio`, you may specify all at the same time. Script will download your wall, docs or audio tracks, respectively. 44 | 45 | How-to 46 | --- 47 | 1. Clone this repo 48 | 2. Register your own [vk app here](https://vk.com/editapp?act=create) to get Application ID. Be sure to select **standalone** application type. 49 | 3. Write down your Application ID (e.g. **1234567**). 50 | 4. 
Find your [profile ID here](https://vk.com/settings) near the bottom of the page. For example, **1**. 51 | 5. What do you want to dump? `audio`, `wall` or `docs`? 52 | 6. Now specify everything to the script: 53 | 54 | main.py --app_id 1234567 --id 1 --mode wall audio docs 55 | 56 | 7. Script will ask you to go to a given URL, so, do it :) 57 | 8. Give access for this application to your profile. 58 | 9. You will be redirected to a white page with text about security. 59 | 10. Copy URL of this page. Yeah, I know, VK tells you not to do that for security reasons. But this application is the app you have just registered, and this script is open-source, so go and read sources if you don't trust me.. 60 | 11. Paste the URL into the script, it is waiting for you! 61 | 12. Enjoy downloading process. This will take a lot of time, though. 62 | 63 | You may set limits of posts and audio tracks to download, change directory to store data, etc. See `--help` output. 64 | 65 | Results 66 | --- 67 | According to working mode and wall posts' contents, corresponding dirs and files will be created. 68 | Everything will be stored in specified directory, say, `some_dir`: 69 | 70 | some_dir (base directory) 71 | +---- 9876543 (user id) 72 | +---- docs (documents stored here) 73 | | +---- cat.gif 74 | | +---- my_archive.zip 75 | | +---- ... 76 | +---- audio (audio tracks and texts) 77 | | +---- Artist1 - Track.mp3 78 | | +---- Artist2 - Track.mp3 79 | | +---- Artist2 - Track.mp3.txt (text for that song) 80 | | +---- Album1 (audio album name) 81 | | | +---- Artist - track.mp3 (tracks in that album) 82 | | +---- ... 83 | +---- post_1234 (wall post id) 84 | | +---- text.html (post text) 85 | | +---- image.jpg (any multimedia attachments) 86 | | +---- music.mp3 87 | | +---- ... 
88 | | +---- media_urls.txt (list of attachments' urls) 89 | | +---- comments.json (raw comments, reply from vk server) 90 | | +---- raw.json (raw post, reply from vk server) 91 | | +---- note_1234 (note, if attached to post) 92 | | | +---- text.html (note text) 93 | | | +---- raw.json (raw note, reply from vk server) 94 | | +---- comments (comments dir) 95 | | +---- text.html (all comments' text) 96 | | +---- raw.json (raw comments, reply from vk server) 97 | | +---- image.jpg (any multimedia attachments) 98 | | +---- music.mp3 99 | | +---- ... 100 | | +---- media_urls.txt (list of attachments' urls) 101 | +---- post_1235 102 | | +---- ... 103 | +---- ... 104 | 105 | 106 | 107 | Bugs 108 | --- 109 | Any bug reports, pul requests appreciated. Open an issue [here](https://github.com/Rast1234/vkd/issues) or [PM me](https://vk.com/rast1234) 110 | 111 | Credits: 112 | --- 113 | 114 | Me :) 115 | http://habrahabr.ru/post/143972/ (call_api function) 116 | http://stackoverflow.com/questions/22676/how-do-i-download-a-file-over-http-using-python (fancy progressbar) 117 | Pavel Durov for great social network and its buggy API :) 118 | 119 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | 3 | """Save everything from your VK wall""" 4 | 5 | __author__ = "Rast" 6 | 7 | import logging 8 | import argparse 9 | from collections import defaultdict 10 | from PostParser import PostParser 11 | from Api import call_api, auth 12 | import os 13 | 14 | def arg_parse(): 15 | argparser = argparse.ArgumentParser() 16 | argparser.add_argument("-d", "--dir", 17 | type=str, 18 | help="Directory to store dumped data", 19 | dest="directory", 20 | required=False, 21 | default=".") 22 | argparser.add_argument("-i", "--id", 23 | type=int, 24 | help="User ID to dump. 
To dump a group, specify its ID with '-' prefix", 25 | metavar="USER_ID|-GROUP_ID", 26 | dest="id", 27 | required=True) 28 | argparser.add_argument("-t", "--token", 29 | type=str, 30 | help="Access token, generated by VK for session", 31 | dest="token", 32 | required=False) 33 | argparser.add_argument("-a", "--app_id", 34 | type=int, 35 | help="Your application ID to access VK API", 36 | dest="app_id", 37 | required=True) 38 | argparser.add_argument("-m", "--mode", 39 | option_strings=['wall', 'audio', 'video', 'notes', 'docs'], 40 | nargs="+", 41 | help="What to dump. Possible values: "+', '.join(['wall', 'audio', 'video', 'notes']), 42 | dest="mode", 43 | required=True) 44 | 45 | argparser.add_argument("--wall_start", 46 | type=int, 47 | help="Post number to start from (first is 0)", 48 | dest="wall_start", 49 | required=False, 50 | metavar="INT", 51 | default=0) 52 | argparser.add_argument("--wall_end", 53 | type=int, 54 | help="Post number to end at (0 = all posts)", 55 | dest="wall_end", 56 | required=False, 57 | metavar="INT", 58 | default=0) 59 | 60 | argparser.add_argument("--audio_start", 61 | type=int, 62 | help="Audio number to start from (first is 0)", 63 | dest="audio_start", 64 | required=False, 65 | metavar="INT", 66 | default=0) 67 | argparser.add_argument("--audio_end", 68 | type=int, 69 | help="Audio number to end at (0 = all audios)", 70 | dest="audio_end", 71 | required=False, 72 | metavar="INT", 73 | default=0) 74 | 75 | argparser.add_argument("--video_start", 76 | type=int, 77 | help="Video number to start from (first is 0)", 78 | dest="video_start", 79 | required=False, 80 | metavar="INT", 81 | default=0) 82 | argparser.add_argument("--video_end", 83 | type=int, 84 | help="Video number to end at (0 = all videos)", 85 | dest="video_end", 86 | required=False, 87 | metavar="INT", 88 | default=0) 89 | 90 | argparser.add_argument("--notes_start", 91 | type=int, 92 | help="Note number to start from (first is 0)", 93 | dest="notes_start", 94 | 
required=False, 95 | metavar="INT", 96 | default=0) 97 | argparser.add_argument("--notes_end", 98 | type=int, 99 | help="Note number to end at (0 = all notes)", 100 | dest="notes_end", 101 | required=False, 102 | metavar="INT", 103 | default=0) 104 | 105 | argparser.add_argument("--docs_start", 106 | type=int, 107 | help="Document number to start from (first is 0)", 108 | dest="docs_start", 109 | required=False, 110 | metavar="INT", 111 | default=0) 112 | argparser.add_argument("--docs_end", 113 | type=int, 114 | help="Document number to end at (0 = all docs)", 115 | dest="docs_end", 116 | required=False, 117 | metavar="INT", 118 | default=0) 119 | 120 | argparser.add_argument("-v", "--verbose", action="store_true", 121 | help="Print more info to STDOUT while processing") 122 | argparser.add_argument("--no-download", 123 | action="store_true", 124 | help="Do not download attachments, only store links", 125 | dest="no_download", 126 | required=False) 127 | args = argparser.parse_args() 128 | return args 129 | 130 | def process_post(number, post_data, post_parser, json_stuff): 131 | """Post-processing :)""" 132 | data = defaultdict(lambda: "", post_data[1]) 133 | post_parser(number, data, json_stuff) 134 | 135 | def process_audio(number, audio_data, post_parser, json_stuff): 136 | """Audio-processing""" 137 | #data = defaultdict(lambda: "", audio_data[1]) 138 | try: 139 | data = {'attachments': [{'type': 'audio', 140 | 'audio': audio_data[0], 141 | }], 142 | 'id' : 'audio' 143 | } 144 | 145 | post_parser(number, data, json_stuff) 146 | except IndexError: # deleted :( 147 | logging.warning("Deleted track: {}".format(str(audio_data))) 148 | return 149 | 150 | def process_doc(number, doc_data, post_parser, json_stuff): 151 | """Doc-processing""" 152 | data = {'attachments': [{'type': 'doc', 153 | 'doc': doc_data, 154 | }], 155 | 'id' : 'doc' 156 | } 157 | post_parser(number, data, json_stuff) 158 | 159 | 160 | def ranges(start, end, count): 161 | """Determine ranges""" 
162 | if end == 0: 163 | end = count 164 | if not 0 <= start < count + 1: 165 | raise RuntimeError("Start argument not in valid range") 166 | if not start <= end <= count: 167 | raise RuntimeError("End argument not in valid range") 168 | logging.info("Working range: from {} to {}".format(start, end)) 169 | total = end - start 170 | return start, end, total 171 | 172 | def main(): 173 | """Main function""" 174 | 175 | args = arg_parse() 176 | args.access_rights = ["wall", "audio", "friends", "notes", "video", "docs"] 177 | args.token = auth(args) if args.token is None else args.token 178 | if args.token is None: 179 | raise RuntimeError("Access token not found") 180 | 181 | 182 | if 'wall' in args.mode: 183 | #determine posts count 184 | (response, json_stuff) = call_api("wall.get", [("owner_id", args.id), ("count", 1), ("offset", 0)], args) 185 | count = response[0] 186 | logging.info("Total posts: {}".format(count)) 187 | print("Wall dowload start") 188 | args.wall_start, args.wall_end, total = ranges(args.wall_start, args.wall_end, count) 189 | counter = 0.0 # float for % 190 | post_parser = PostParser(args.directory, str(args.id), args) 191 | for x in xrange(args.wall_start, args.wall_end): 192 | if args.verbose and counter % 10 == 0: 193 | print("\nDone: {:.2%} ({})".format(counter / total, int(counter))) 194 | (post, json_stuff) = call_api("wall.get", [("owner_id", args.id), ("count", 1), ("offset", x)], args) 195 | process_post(("wall post", x), post, post_parser, json_stuff) 196 | counter += 1 197 | if args.verbose: 198 | print("\nDone: {:.2%} ({})".format(float(total) / total, int(total))) 199 | 200 | if 'audio' in args.mode: 201 | #determine audio count 202 | (response, json_stuff) = call_api("audio.getCount", [("oid", args.id)], args) 203 | count = response 204 | logging.info("Total audio tracks: {}".format(count)) 205 | print("Audio dowload start") 206 | args.audio_start, args.audio_end, total = ranges(args.audio_start, args.audio_end, count) 207 | 
counter = 0.0 # float for % 208 | #audio_dir = os.path.join(str(args.id), 'audio') 209 | audio_dir = str(args.id) 210 | post_parser = PostParser(args.directory, audio_dir, args) 211 | id_param = "uid" if args.id > 0 else "gid" 212 | args.id *= -1 if args.id < 0 else 1 213 | for x in xrange(args.audio_start, args.audio_end): 214 | if args.verbose and counter % 10 == 0: 215 | print("\nDone: {:.2%} ({})".format(counter / total, int(counter))) 216 | (audio, json_stuff) = call_api("audio.get", [(id_param, args.id), ("count", 1), ("offset", x)], args) 217 | process_audio(("audiotrack", x), audio, post_parser, json_stuff) 218 | counter += 1 219 | if args.verbose: 220 | print("\nDone: {:.2%} ({})".format(float(total) / total, int(total))) 221 | 222 | if 'video' in args.mode: 223 | raise NotImplementedError("Video mode is not written yet, sorry :(") 224 | if 'notes' in args.mode: 225 | raise NotImplementedError("Notes mode is not written yet, sorry :(") 226 | if 'docs' in args.mode: 227 | # get ALL docs 228 | (response, json_stuff) = call_api("docs.get", [("oid", args.id)], args) 229 | count = response[0] 230 | data = response[1:] 231 | logging.info("Total documents: {}".format(count)) 232 | print("Wall dowload start") 233 | args.docs_start, args.docs_end, total = ranges(args.docs_start, args.docs_end, count) 234 | counter = 0.0 # float for % 235 | docs_dir = str(args.id) 236 | post_parser = PostParser(args.directory, docs_dir, args) 237 | data = data[args.docs_start:args.docs_end] 238 | num = args.docs_start 239 | for x in data: 240 | if args.verbose and counter % 10 == 0: 241 | print("\nDone: {:.2%} ({})".format(counter / total, int(counter))) 242 | process_doc(("document", num), x, post_parser, json_stuff) 243 | counter += 1 244 | num += 1 245 | if args.verbose: 246 | print("\nDone: {:.2%} ({})".format(float(total) / total, int(total))) 247 | 248 | if __name__ == '__main__': 249 | logging.basicConfig(format=u"""%(filename).6s : %(lineno)4d #%(levelname)8s [%(asctime)s] 
%(message)s""", 250 | level=logging.DEBUG, 251 | filename=u'report.log') 252 | ok = False 253 | try: 254 | logging.info("Start") 255 | main() 256 | logging.info("End") 257 | ok = True 258 | print("") 259 | except KeyboardInterrupt: 260 | logging.critical("Interrupted by keystroke") 261 | print "\nWhy, cruel world?.." 262 | finally: 263 | if not ok: 264 | logging.critical("Fail") 265 | -------------------------------------------------------------------------------- /PostParser.py: -------------------------------------------------------------------------------- 1 | __author__ = 'rast' 2 | 3 | import logging 4 | from os import path, makedirs 5 | import json 6 | #from ThreadedDownload import ThreadedDownload # buggy like hell 7 | from Download import download 8 | from Api import call_api 9 | from collections import defaultdict 10 | import re 11 | 12 | def make_dir(base_dir, name): 13 | """Make new dir into base dir, return concatenation""" 14 | if path.exists(base_dir) and path.isdir(base_dir): 15 | directory = path.join(base_dir, name) 16 | if path.exists(directory) and path.isdir(directory): 17 | #raise RuntimeError("Directory already exists: {}".format(directory)) 18 | return directory 19 | else: 20 | makedirs(directory) 21 | return directory 22 | else: 23 | raise RuntimeError("Directory does not exist: {}".format(base_dir)) 24 | 25 | def escape(name): 26 | """Escape the filename""" 27 | result = unicode(re.sub('[^+=\-()$!#%&,.\w\s]', '_', name, flags=re.UNICODE).strip()) 28 | #print("\t{}\n\t{}".format(name, result)) 29 | return result[:250] 30 | 31 | 32 | class PostParser(object): 33 | """Parses given post into data lists (text, music, photos, info, etc.) 34 | 35 | parse post - store useful data: 36 | id (of the post) 37 | to_id (always user?) 38 | from_id (post author) 39 | date (unix timestamp, convert to time) 40 | text (unicode) 41 | attachments: (multimedia!) 42 | type (type name) 43 | : 44 | ... 
45 | comments: (obvious) 46 | count 47 | can_post (0|1) 48 | likes: (people list) 49 | count 50 | user_likes (if user liked it) 51 | can_like 52 | can_publish 53 | reposts: (people list) 54 | count 55 | user_reposted (0|1) 56 | signer_id (if group, and if post is signed) 57 | copy_owner_id (if repost, author's id) 58 | copy_post_id (if repost, original post id) 59 | copy_text (if repost, user's response) 60 | 61 | """ 62 | 63 | def __init__(self, base_dir, subdir, args): 64 | """Make directory for current user""" 65 | self.directory = make_dir(base_dir, subdir) 66 | self.args = args 67 | 68 | def __call__(self, tpl, raw_data, json_stuff): 69 | """Process whole post into directory""" 70 | keys = [] 71 | funcs = [] 72 | self.urls = [] 73 | self.prefix = tpl[0] 74 | self.number = tpl[1] 75 | ignore = ['id', 'to_id', 'from_id', 'date', 76 | 'likes', 'reposts', 'signer_id', 77 | 'copy_owner_id', 'copy_post_id', 'copy_post_date', 78 | 'copy_post_type', 'reply_count', 'post_type', 79 | 'post_source', 'online', 'attachment', 'copy_text', 80 | 'media', 'can_edit', 81 | # comments fix 82 | 'uid', 'cid', 'reply_to_cid', 'reply_to_uid', 83 | 'reply_owner_id', 'reply_post_id', 84 | ] 85 | for k in raw_data.keys(): 86 | if k in ignore: 87 | continue 88 | try: 89 | f = getattr(self, k) 90 | keys.append(k) 91 | funcs.append(f) 92 | except AttributeError: 93 | logging.warning("Not implemented: {}".format(k)) 94 | logging.info("Saving: {} for {}".format(', '.join(keys), raw_data['id'])) 95 | self.post_directory = make_dir(self.directory, str(raw_data['id'])) 96 | 97 | self.save_raw(json_stuff) 98 | for (f, k) in zip(funcs, keys): 99 | f(k, raw_data) 100 | 101 | if self.urls and not self.args.no_download: 102 | download(self.urls, 103 | self.post_directory, 104 | ) 105 | 106 | def text(self, key, raw_data): 107 | """Save text of the note""" 108 | text = raw_data['text'] 109 | users_text = raw_data['copy_text'] 110 | stuff = '' 111 | if raw_data['copy_post_id'] == '': # user's post 
112 | if text == '': 113 | return 114 | else: 115 | stuff = '

Text:

\n' + text 116 | else: # repost 117 | if text == '': 118 | if users_text == '': 119 | return 120 | else: 121 | stuff = '

Text:

\n' + users_text 122 | else: 123 | if users_text == '': 124 | stuff = '

Original text:

\n' + text 125 | else: 126 | stuff = "

User's text:

\n" + users_text + \ 127 | '

Original text:

\n' + text 128 | 129 | 130 | f_name = path.join(self.post_directory, 'text.html') 131 | out_file = open(f_name, 'a+') 132 | out_file.write(stuff.encode("utf-8")) 133 | out_file.close() 134 | 135 | def attachments(self, key, raw_data): 136 | """Save all attachments""" 137 | f_args = [] 138 | funcs = [] 139 | for att in raw_data[key]: 140 | t = att['type'] 141 | k = 'dl_' + t 142 | try: 143 | f = getattr(self, k) 144 | f_args.append(att[t]) 145 | funcs.append(f) 146 | except AttributeError: 147 | logging.warning("Not implemented downloader: {}".format(t)) 148 | for (f, a) in zip(funcs, f_args): 149 | f(a) 150 | 151 | def comments(self, key, data): 152 | """Save all comments""" 153 | count = data[key]['count'] 154 | if count == 0: 155 | return 156 | comments = [count, ] 157 | for x in xrange(data[key]['count']): 158 | (comment_data, json_stuff) = call_api("wall.getComments", 159 | [("owner_id", self.args.id), 160 | ("post_id", data["id"]), 161 | ("sort", "asc"), 162 | ("offset", x), 163 | ("count", 1), 164 | ("preview_length", 0), 165 | ("need_likes", 1), 166 | ("v", 4.4), 167 | ], self.args) 168 | comments.append(comment_data[1]) 169 | cdata = defaultdict(lambda: '', comment_data[1]) 170 | pp = PostParser(self.post_directory, 'comments', self.args) 171 | pp(('comment to ',self.number), cdata, json_stuff) 172 | json_data = json.dumps(comments, indent=4, ensure_ascii=False) 173 | f_name = path.join(self.post_directory, 'comments.json') 174 | out_file = open(f_name, 'a+') 175 | out_file.write(json_data.encode('utf-8')) 176 | out_file.close() 177 | 178 | def save_raw(self, data): 179 | """Save raw post data""" 180 | data = json.loads(data) 181 | data = json.dumps(data, indent=4, ensure_ascii=False) 182 | 183 | f_name = path.join(self.post_directory, 'raw.json') 184 | out_file = open(f_name, 'a+') 185 | out_file.write(data.encode('utf-8')) 186 | out_file.close() 187 | 188 | def save_url(self, url, name=None, subdir=''): 189 | if name is not None: 190 | name = escape(name) 
191 | self.urls.append((url, name, subdir)) 192 | f_name = path.join(self.post_directory, 'media_urls.txt') 193 | out_file = open(f_name, 'a+') 194 | out_file.write(url) 195 | out_file.write('\n') 196 | out_file.close() 197 | 198 | def dl_photo(self, data): 199 | """Download a photo 200 | vk is a bit crazy, it stores photo in a bunch of sizes: 201 | src 202 | src_small 203 | src_big 204 | src_xbig 205 | src_xxbig 206 | src_xxxbig 207 | (and what else?) 208 | """ 209 | sizes = ['src_xxxbig', 'src_xxbig', 'src_xbig', 'src_big', 'src', 'src_small'] 210 | url = None 211 | for s in sizes: 212 | try: 213 | url = data[s] # try to get biggest size 214 | break 215 | except KeyError: 216 | pass 217 | if url is None: 218 | logging.error("Unable to get photo url!") 219 | else: 220 | self.save_url(url) 221 | 222 | def dl_link(self, data): 223 | """Store links in a file""" 224 | url = data['url'] 225 | f_name = path.join(self.post_directory, 'links.txt') 226 | out_file = open(f_name, 'a+') 227 | out_file.write(url) 228 | out_file.write('\n') 229 | out_file.close() 230 | 231 | def dl_photos_list(self, data): 232 | """Download list of photos""" 233 | for x in data: 234 | self.dl_photo(x) 235 | 236 | def dl_audio(self, data): 237 | initial_data = data 238 | aid = data["aid"] 239 | owner = data["owner_id"] 240 | request = "{}_{}".format(owner, aid) 241 | (audio_data, json_stuff) = call_api("audio.getById", [("audios", request), ], self.args) 242 | album = 'no_album' 243 | try: 244 | data = audio_data[0] 245 | artist = data['artist'][:100] 246 | title= data['title'][:100] 247 | name = u"{} - {}.mp3".format(artist, title) 248 | #album = data['album'] # API changed, no time to fix 249 | #album = get_album_name(owner, album, self.args) 250 | #album = escape(album) 251 | make_dir(self.post_directory, album) 252 | self.save_url(data["url"], name, album) 253 | except IndexError: # deleted :( 254 | logging.warning("Deleted track: {}".format(str(initial_data))) 255 | return 256 | 257 | # 
store lyrics if any 258 | try: 259 | lid = data["lyrics_id"] 260 | except KeyError: 261 | return 262 | (lyrics_data, json_stuff) = call_api("audio.getLyrics", [("lyrics_id", lid), ], self.args) 263 | text = lyrics_data["text"].encode('utf-8') 264 | name = escape(name) 265 | f_name = path.join(self.post_directory, album) 266 | f_name = path.join(f_name, name+'.txt') 267 | # escape! 268 | out_file = open(f_name, 'a+') 269 | out_file.write(text) 270 | out_file.write('\n') 271 | out_file.close() 272 | 273 | 274 | """Download video 275 | There's a walkaround: 276 | http://habrahabr.ru/sandbox/57173/ 277 | But this requires authorization as another app 278 | 279 | def dl_video(self, data): 280 | 281 | #print data 282 | """ 283 | 284 | 285 | def dl_doc(self, data): 286 | """Download document (GIFs, etc.)""" 287 | url = data["url"] 288 | name = data["title"] 289 | name, ext = path.splitext(name) 290 | name = name + '.' + data["ext"] 291 | self.save_url(url, name) 292 | 293 | def dl_note(self, data): 294 | """Download note, not comments""" 295 | (note_data, json_stuff) = call_api("notes.getById", [ 296 | ("owner_id", data["owner_id"]), 297 | ("nid", data["nid"]), 298 | ], self.args) 299 | stuff = u"

{title}

\n{text}".format(**note_data) 300 | ndir = make_dir(self.post_directory, 'note_'+note_data["id"]) 301 | f_name = path.join(ndir, 'text.html') 302 | out_file = open(f_name, 'a+') 303 | out_file.write(stuff.encode("utf-8")) 304 | out_file.close() 305 | 306 | ndata = json.dumps(note_data, indent=4, ensure_ascii=False) 307 | 308 | f_name = path.join(ndir, 'raw.json') 309 | out_file = open(f_name, 'a+') 310 | out_file.write(ndata.encode("utf-8")) 311 | out_file.close() 312 | --------------------------------------------------------------------------------