├── README.md
├── caa-downloader
│   ├── README.md
│   ├── caa-downloader.py
│   └── requirements.txt
├── porn3dx-downloader
│   ├── README.md
│   ├── hydrus_sidecar_routers.png
│   ├── porn3dx-downloader.py
│   └── requirements.txt
├── twscrape-wrapper
│   ├── .gitignore
│   ├── README.md
│   └── twscrape-wrapper.py
├── vrchat-asset-downloader
│   ├── .gitignore
│   ├── README.md
│   ├── requirements.txt
│   └── vrchat-asset-downloader.py
└── vroid-hub-downloader
    ├── README.md
    ├── requirements.txt
    └── vroid-hub-downloader.py

/README.md:
--------------------------------------------------------------------------------
1 | # misc-scripts
2 | 
3 | A variety of Python scripts I've written, mostly for downloading, scraping and otherwise ripping things off the web which require a little effort or would be convenient to automate.
4 | 
5 | Each directory has a README describing the script and its usage to assist anyone else trying to use it.
6 | 
--------------------------------------------------------------------------------
/caa-downloader/README.md:
--------------------------------------------------------------------------------
1 | # caa-downloader
2 | Download art for a [MusicBrainz](https://musicbrainz.org/) release from the [Cover Art Archive](https://coverartarchive.org/).
3 | 
4 | ### Usage
5 | ```sh
6 | usage: caa-downloader.py [-h] [-d DIRECTORY] [-s SIZE] [RELEASES ...]
7 | 
8 | positional arguments:
9 | RELEASES releases to download i.e.
10 | 3791c620-7ba4-3db0-bda8-2b060f31a7b8
11 | https://musicbrainz.org/release/3791c620-7ba4-3db0-bda8-2b060f31a7b8
12 | beta.musicbrainz.org/release/3791c620-7ba4-3db0-bda8-2b060f31a7b8/discids
13 | 
14 | options:
15 | -h, --help show this help message and exit
16 | -d DIRECTORY, --directory DIRECTORY
17 | save directory (defaults to current)
18 | -s SIZE, --size SIZE image download size (250, 500, 1200, original)
19 | ```
20 | 
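A typical invocation (using the release ID from the usage text above; the `-s` and `-d` values are only illustrative) might look like:

```sh
# grab the 500px thumbnails for one release into ./covers
./caa-downloader.py -s 500 -d covers https://musicbrainz.org/release/3791c620-7ba4-3db0-bda8-2b060f31a7b8
```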
--------------------------------------------------------------------------------
/caa-downloader/caa-downloader.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 | import json
4 | import os
5 | import re
6 | import requests
7 | from requests_toolbelt import sessions
8 | import shutil
9 | from tqdm.auto import tqdm
10 | import urllib.parse as urlparse
11 | 
12 | VALID_THUMBS = [250, 500, 1200]
13 | API = "http://coverartarchive.org/release/"
14 | RELEASE_REGEX = r"^(?:(?:https?:\/\/)?(?:.*?\.)?musicbrainz\.org\/release\/)?(?P<release_id>[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})(?:\/.+)?$"
15 | BLOCK_SIZE = 1024
16 | 
17 | 
18 | def download_image(i_url, filename):
19 |     file_path = os.path.join(args.directory, filename)
20 |     file_r = requests.get(i_url, stream=True, allow_redirects=True)
21 |     if not file_r.ok:
22 |         print(f"could not get art for {filename}")
23 |     total_size = int(file_r.headers['content-length'])
24 |     term_width = shutil.get_terminal_size((80, 20))[0]
25 |     with tqdm.wrapattr(open(file_path, "wb"), "write",
26 |                        desc=f"{filename}", bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{rate_fmt}{postfix}]",
27 |                        ncols=int(term_width * 0.8), total=total_size,
28 |                        unit="B", unit_scale=True, unit_divisor=BLOCK_SIZE
29 |                        ) as file_h:
30 |         for chunk in file_r.iter_content(BLOCK_SIZE):
31 |             file_h.write(chunk)
32 |     return
33 | 
34 | 
35 | def download_covers(s, r_id):
36 |     print(f"requesting art for {r_id}")
37 |     covers_r = s.get(f"{r_id}")
38 |     if not covers_r.ok:
39 |         print(f"error: could not find art for release {r_id}")
40 |         return
41 |     covers_j = covers_r.json()
42 |     print(f"found {len(covers_j['images'])} images")
43 |     for image_i in range(0, len(covers_j["images"])):
44 |         image_j = covers_j["images"][image_i]
45 |         image_url = None
46 |         if args.size == "original":
47 |             image_url = image_j["image"]
48 |         else:
49 |             image_url = image_j["thumbnails"][args.size]
50 |         filename = f"{str(image_i)}_" + "+".join(image_j["types"])
51 |         if len(image_j["comment"]) > 0:
52 |             comment = image_j["comment"]
53 |             filename += f" ({comment})"
54 |         filename_clean = re.sub(r"[^\w\-_\. \[\]\(\)\+]", "_", filename)
55 |         filename_clean += "." + image_url.split(".")[-1]
56 |         download_image(image_url, filename_clean)
57 |     print(f"finished retrieving art for {r_id}")
58 | 
59 | 
60 | def main():
61 |     if len(args.release_list) == 0:
62 |         parser.print_usage()
63 |         return
64 |     elif args.size != "original" and args.size not in [str(t) for t in VALID_THUMBS]:  # args.size is a string, compare against string forms
65 |         print(f"invalid size specified ({args.size})")
66 |         return
67 |     elif not os.path.isdir(args.directory):
68 |         os.makedirs(args.directory)
69 |     api_session = sessions.BaseUrlSession(base_url=API)
70 |     for release in args.release_list:
71 |         release_id_m = re.search(RELEASE_REGEX, release)
72 |         if release_id_m:
73 |             release_id = release_id_m.group("release_id")
74 |             download_covers(api_session, release_id)
75 |         else:
76 |             print(f"could not parse release id from '{release}'")
77 | 
78 | 
79 | parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
80 | parser.add_argument("-d", "--directory", type=str,
81 |                     help="save directory (defaults to current)", default=os.getcwd())
82 | parser.add_argument("-s", "--size", type=str, default="original",
83 |                     help="image download size (250, 500, 1200, original)")
84 | parser.add_argument("release_list", metavar="RELEASES", nargs="*",
85 |                     help="releases to download i.e.\n3791c620-7ba4-3db0-bda8-2b060f31a7b8\nhttps://musicbrainz.org/release/3791c620-7ba4-3db0-bda8-2b060f31a7b8\nbeta.musicbrainz.org/release/3791c620-7ba4-3db0-bda8-2b060f31a7b8/discids")
86 | args = parser.parse_args()
87 | 
88 | if __name__ == "__main__":
89 |     main()
90 | 
--------------------------------------------------------------------------------
/caa-downloader/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
2 | requests_toolbelt
3 | tqdm
4 | 
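The index that `download_covers` walks is plain JSON from the Cover Art Archive; a minimal sketch of the same lookup, with field names taken from the accesses above and the 500px thumbnail picked as an example, could look like:

```py
import requests

release_id = "3791c620-7ba4-3db0-bda8-2b060f31a7b8"  # example release from the README
index = requests.get(f"https://coverartarchive.org/release/{release_id}").json()
for position, image in enumerate(index["images"]):
    # each entry lists its cover types, a full-size "image" url and pre-scaled "thumbnails" keyed by size
    print(position, "+".join(image["types"]), image["thumbnails"].get("500", image["image"]))
```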
--------------------------------------------------------------------------------
/porn3dx-downloader/README.md:
--------------------------------------------------------------------------------
1 | # porn3dx-downloader
2 | Downloads videos and images from posts on [Porn3dx](https://porn3dx.com). The available video formats are better than those you can download with an account, and any encrypted video playlists are detected and decrypted.
3 | 
4 | The `--write-sidecars` option can be used in conjunction with `hydrus_sidecar_routers.png` to import the downloaded files along with tags and other metadata into [hydrus](https://github.com/hydrusnetwork/hydrus).
5 | 
6 | ### Usage
7 | ```sh
8 | usage: porn3dx-downloader.py [-h] [-V] [-d DIRECTORY] [--write-sidecars] [-f FORMAT] [-F] [POSTS ...]
9 | 
10 | positional arguments:
11 | POSTS post url
12 | 
13 | options:
14 | -h, --help show this help message and exit
15 | -V, --verbose print debugging information
16 | -d DIRECTORY, --directory DIRECTORY
17 | save directory (defaults to current)
18 | --write-sidecars write sidecars for urls, timestamps, tags and description notes
19 | -f FORMAT, --format FORMAT
20 | video format, specified by NAME or the keyword 'best'
21 | -F, --list-formats list available formats
22 | ```
23 | 
--------------------------------------------------------------------------------
/porn3dx-downloader/hydrus_sidecar_routers.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CetaceanNation/misc-scripts/cf070eda5e5c00a510c5355d00daf658240ddc50/porn3dx-downloader/hydrus_sidecar_routers.png
--------------------------------------------------------------------------------
/porn3dx-downloader/porn3dx-downloader.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 | from bs4 import BeautifulSoup as bs
4 | import copy
5 | from Crypto.Cipher import AES
6 | from datetime import datetime
7 | from enum import Enum
8 | import json
9 | import js2py
10 | import os
11 | import re
12 | import requests
13 | from subprocess import Popen, PIPE, STDOUT
14 | import tempfile
15 | import urllib.parse as urlparse
16 | 
17 | HOST = "https://porn3dx.com/"
18 | EMBED_HOST = "https://iframe.mediadelivery.net/"
19 | DRM_ACTIVATION_HOST = "https://video-987.mediadelivery.net/"
20 | HEADERS = {
21 |     "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/114.0"}
22 | 
23 | POST_REGEX = r".*(?P<url>porn3dx\.com\/post\/(?P<id>\d+)).*"
24 | GUID_REGEX = r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}"
25 | PING_TOKEN_REGEX = r";setTimeout\(function\(\)\{var\ [a-z]=\"(?P<ping_token>" + GUID_REGEX + \
26 |     r")\";var\ [a-z]=(?P<secret_function>function\([a-z]+\)\{.*toLowerCase\(\)\});"
27 | PLAYLIST_REGEX = r"https?:\/\/iframe\.mediadelivery\.net\/" + GUID_REGEX + \
28 |     r"\/playlist.drm\?contextId=(?P<context_id>" + \
29 |     GUID_REGEX + r")&secret=" + GUID_REGEX
30 | 
31 | # 7/14/2022, 3:23:37 PM
32 | # new Date(Date.UTC(2022, 6, 14, 15, 23, 37)).toLocaleString()
33 | XTIME_REGEX = r".*UTC\(((\d+(?:,\ )?)+)\).*"
34 | TAG_CATEGORY_REGEX = r".*bg-(\w+)-100.*"
35 | TAG_CATEGORY_MAP = {
36 |     "yellow": "series",
37 |     "green": "character",
38 |     "purple": "medium",
39 |     "blue": ""
40 | }
41 | 
42 | 
43 | class LogLevel(Enum):
44 |     BASIC = 1
45 |     VERBOSE = 2
46 | 
47 | 
48 | def print_log(component, message, level=LogLevel.BASIC, overwrite=False):
49 |     if level == LogLevel.VERBOSE and not args.verbose:
50 |         return
51 |     if overwrite:
52 |         print(f"[{component}] {message}", end="\r")
53 |     else:
54 |         print(f"[{component}] {message}")
55 | 
56 | 
57 | def get_arguments():
58 |     parser.add_argument("-V", "--verbose", action="store_true",
59 |                         help="print debugging information")
60 |     parser.add_argument("-d", "--directory", type=str,
61 |                         help="save directory (defaults to current)", default=os.getcwd())
62 |     parser.add_argument("--write-sidecars", action="store_true",
63 |                         help="write sidecars for urls, timestamps, tags and description notes")
64 |     parser.add_argument("-f", "--format", type=str,
65 |                         help="video format, specified by NAME or the keyword \'best\'", default="best")
66 |     parser.add_argument("-F", "--list-formats",
67 |                         action="store_true", help="list available formats")
68 |     parser.add_argument("posts", metavar="POSTS",
nargs="*", help="post url") 69 | return parser.parse_args() 70 | 71 | # based on parts of https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/common.py 72 | 73 | 74 | def get_m3u8_info(session, playlist_url, referer_url): 75 | m3u8_info = [] 76 | print_log("get-m3u8-info", 77 | f"retrieving playlist from {playlist_url}", LogLevel.VERBOSE) 78 | m3u8_r = session.get(playlist_url, headers={"Referer": referer_url}) 79 | if not m3u8_r.ok: 80 | print_log("get-m3u8-info", 81 | f"failed to retrieve playlist from {playlist_url}") 82 | m3u8_text = m3u8_r.text 83 | media_details = None 84 | format_details = None 85 | for line in m3u8_text.splitlines(): 86 | if line.startswith("#EXT-X-STREAM-INF:"): 87 | # parse format details 88 | format_details = parse_m3u8_attributes(line) 89 | elif not line.startswith("#") and len(line.strip()) > 0: 90 | if format_details: 91 | if "RESOLUTION" in format_details: 92 | media_name = format_details["RESOLUTION"].split("x")[ 93 | 1] + "p" 94 | else: 95 | media_name = line.split("/")[0] 96 | m3u8_info += [{"location": line, "name": media_name, 97 | "bandwidth": format_details["BANDWIDTH"], "res": format_details["RESOLUTION"]}] 98 | return m3u8_info 99 | 100 | # https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/utils.py#L5495 101 | 102 | 103 | def parse_m3u8_attributes(attrib): 104 | info = {} 105 | for (key, val) in re.findall(r'(?P[A-Z0-9-]+)=(?P"[^"]+"|[^",]+)(?:,|$)', attrib): 106 | if val.startswith("\""): 107 | val = val[1:-1] 108 | info[key] = val 109 | return info 110 | 111 | 112 | def print_formats(formats_list): 113 | print(f"{'NAME':<10} {'BANDWIDTH':<10} {'RESOLUTION':<10}") 114 | print(f"{'-' * 10} {'-' * 10} {'-' * 10}") 115 | for format_settings in formats_list: 116 | print(f"{format_settings['name']:<10} " + 117 | f"{format_settings['bandwidth']:<10} " + 118 | f"{format_settings['res']:<10} ") 119 | 120 | 121 | def write_frag(session, frag_url, frag_name, key_context): 122 | try: 123 | with open(frag_name, "wb") as frag_file: 124 | video_frag_r = session.get( 125 | frag_url, headers={"Origin": EMBED_HOST, "Referer": EMBED_HOST}) 126 | if not video_frag_r.ok: 127 | print_log( 128 | f"dl:{post_id}", f"failed to download video fragment '{frag_name}'") 129 | return False 130 | # Fragments are small, decrypt in memory then write to disk 131 | frag_bytes = key_context.decrypt(video_frag_r.content) 132 | frag_file.write(frag_bytes) 133 | return True 134 | except: 135 | print_log(f"dl:{post_id}", 136 | f"exception downloading video fragment '{frag_name}'") 137 | return False 138 | 139 | 140 | def download_stream(session, index, post_data, downloading_format, drm_session, referer_url): 141 | post_id = post_data["id"] 142 | file_name = post_data["basefilename"] 143 | res_name = downloading_format["name"][:-1] 144 | context_id = drm_session["id"] 145 | refresh_token = drm_session["token"] 146 | refresh_function = drm_session["function"] 147 | output_file_name = f"{file_name}.{index}.mp4" 148 | output_file_path = os.path.abspath( 149 | os.path.join(args.directory, output_file_name)) 150 | if os.path.isfile(output_file_path): 151 | print_log(f"dl:{post_id}", "file exists, skipping download") 152 | return output_file_path, post_data 153 | playlist_r = session.get(downloading_format["location"], headers={ 154 | "Referer": referer_url}) 155 | if not playlist_r.ok: 156 | print_log(f"dl:{post_id}", "failed to retrieve post playlist") 157 | return 158 | playlist_text = playlist_r.text 159 | key_context = None 160 | key_count = 0 161 | 
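    # The variant playlist below is walked line by line: every #EXT-X-KEY entry
    # yields a key URI plus IV and opens a fresh AES-CBC context, each media URI
    # is fetched and decrypted with the current context, and a DRM "ping" is sent
    # for every new key so the mediadelivery session stays valid; the decrypted
    # .ts fragments are then concatenated into a single .mp4 with ffmpeg.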
frag_files = [] 162 | for line in playlist_text.splitlines(): 163 | if line.startswith("#EXT-X-KEY:"): 164 | # New key for decrypting fragments 165 | key_attr = parse_m3u8_attributes(line) 166 | key_r = session.get(key_attr["URI"], headers={ 167 | "Origin": EMBED_HOST, "Referer": EMBED_HOST}) 168 | if not key_r.ok: 169 | print_log(f"key-context:{post_id}", 170 | "failed to retrieve key for segments") 171 | continue 172 | key_bytes = key_r.content 173 | iv_bytes = bytearray.fromhex(key_attr["IV"][2:]) 174 | print_log( 175 | f"key-context:{post_id}", f"new key context [IV: {iv_bytes.hex()}, K: {key_bytes.hex()}]", LogLevel.VERBOSE) 176 | key_context = AES.new(key_bytes, AES.MODE_CBC, iv_bytes) 177 | key_count += 1 178 | # Refresh DRM context 179 | time_in_video = float(key_count) 180 | refresh_string = f"{refresh_token}_{context_id}_{time_in_video}_false_{res_name}" 181 | refresh_hash = refresh_function(refresh_string) 182 | refresh_url = DRM_ACTIVATION_HOST + \ 183 | f".drm/{context_id}/ping?hash={refresh_hash}&time={time_in_video}&paused=false&resolution={res_name}" 184 | print_log( 185 | f"drm:{post_id}", f"refreshing session; {refresh_url}", LogLevel.VERBOSE) 186 | refresh_r = session.get(refresh_url, headers={ 187 | "Origin": EMBED_HOST, "Referer": EMBED_HOST}) 188 | if not refresh_r.ok: 189 | print_log( 190 | f"drm:{post_id}", "failed to refresh the drm session, will continue but likely to fail if the video is long") 191 | elif not line.startswith("#"): 192 | # Write the fragment 193 | frag_file_name = os.path.abspath(os.path.join( 194 | args.directory, f"{file_name}.{index}.{len(frag_files)}.ts")) 195 | if write_frag(session, line, frag_file_name, key_context): 196 | frag_files.append(frag_file_name) 197 | # Use ffmpeg to concatenate all the fragments into a single output file 198 | print_log(f"mpeg-convert:{post_id}", f"merging fragments") 199 | with tempfile.NamedTemporaryFile(mode="w", suffix=".txt") as frag_list: 200 | for frag_name in frag_files: 201 | frag_list.write(f"file '{frag_name}'\n") 202 | frag_list.flush() 203 | ffmpeg_list = ["ffmpeg", "-hide_banner", "-y", "-f", "concat", 204 | "-safe", "0", "-i", frag_list.name, "-c", "copy", output_file_path] 205 | print_log("ffmpeg", f"args: {ffmpeg_list}", LogLevel.VERBOSE) 206 | try: 207 | ffmpeg_process = Popen(ffmpeg_list, stdout=PIPE, stderr=PIPE) 208 | stdout, stderr = ffmpeg_process.communicate() 209 | except Exception: 210 | print_log(f"mpeg-convert:{post_id}", "failure in executing ffmpeg") 211 | print_log( 212 | "ffmpeg", f"stdout: {str(stdout)}\n\nstderr: {str(stderr)}", LogLevel.VERBOSE) 213 | return 214 | frag_list.close() 215 | # Cleanup only if file is found 216 | if os.path.isfile(output_file_path): 217 | for frag_name in frag_files: 218 | os.remove(frag_name) 219 | return output_file_path, post_data 220 | print_log(f"mpeg-convert:{post_id}", "could not find output file") 221 | return 222 | 223 | 224 | def download_video(session, index, post_data, content_soup): 225 | post_id = post_data["id"] 226 | iframe_url = content_soup.find("iframe")["src"] 227 | if not iframe_url: 228 | print_log(f"info:{post_id}", "could not find embed url in post page") 229 | return 230 | # Download embed to get formats playlist url 231 | iframe_r = session.get(iframe_url, headers={"Referer": HOST}) 232 | if not iframe_r.ok: 233 | print_log(f"info:{post_id}", "failed to retrieve video embed") 234 | return 235 | iframe_soup = bs(iframe_r.content, "html.parser") 236 | iframe_script = iframe_soup.find_all("script")[-1].string 237 | # Extract 
formats playlist url from embed script 238 | playlist_m = re.search(PLAYLIST_REGEX, iframe_script) 239 | if not playlist_m: 240 | print_log(f"info:{post_id}", 241 | "could not find format playlist url in embed") 242 | return 243 | playlist_url = playlist_m.group(0) 244 | context_id = playlist_m.group("context_id") 245 | # Get available formats 246 | formats_list = get_m3u8_info(session, playlist_url, iframe_url) 247 | if args.list_formats: 248 | print_log(f"info:{post_id}", "available formats:") 249 | print_formats(formats_list) 250 | return 251 | # Activate DRM session 252 | drm_session = {} 253 | activation_url = urlparse.urljoin( 254 | DRM_ACTIVATION_HOST, f".drm/{context_id}/activate") 255 | if not session.get(activation_url, headers={"Origin": EMBED_HOST, "Referer": EMBED_HOST}).ok: 256 | print_log( 257 | f"drm:{post_id}", "failed to activate drm context, download will not proceed") 258 | return 259 | print_log(f"drm:{post_id}", 260 | f"activated drm context {context_id}", LogLevel.VERBOSE) 261 | drm_session["id"] = context_id 262 | # Extract refresh token from embed script 263 | token_m = re.search(PING_TOKEN_REGEX, iframe_script) 264 | if not token_m: 265 | print_log(f"drm:{post_id}", 266 | "could not find ping refresh token in embed") 267 | return 268 | drm_session["token"] = token_m.group("ping_token") 269 | secret_script = token_m.group("secret_function") 270 | drm_session["function"] = js2py.eval_js(secret_script) 271 | # Select preferred format 272 | downloading_format = None 273 | best_bitrate = 0 274 | for format_settings in formats_list: 275 | if args.format == "best": 276 | if int(format_settings["bandwidth"]) > best_bitrate: 277 | downloading_format = format_settings 278 | best_bitrate = int(format_settings["bandwidth"]) 279 | elif args.format == format_settings["name"]: 280 | downloading_format = format_settings 281 | break 282 | if not downloading_format: 283 | print_log( 284 | f"info:{post_id}", f"the specified format could not be found: {args.format}") 285 | return 286 | downloading_format["location"] = urlparse.urljoin( 287 | playlist_url, format_settings["location"]) 288 | format_name = downloading_format["name"] 289 | print_log(f"info:{post_id}", f"downloading format {format_name}") 290 | return download_stream(session, index, post_data, downloading_format, drm_session, iframe_url) 291 | 292 | 293 | def download_image(session, index, post_data, content_soup): 294 | post_id = post_data["id"] 295 | file_name = post_data["basefilename"] 296 | image_url = content_soup.find("picture").div.img["src"].strip() 297 | post_data["urls"].append(image_url) 298 | image_ext = os.path.splitext(urlparse.urlparse(image_url).path)[1] 299 | output_file_name = f"{file_name}.{index}{image_ext}" 300 | output_file_path = os.path.join(args.directory, output_file_name) 301 | if os.path.isfile(output_file_path): 302 | print_log(f"dl:{post_id}", "file exists, skipping download") 303 | return output_file_path, post_data 304 | with open(output_file_path, "wb") as image_file: 305 | image_r = session.get(image_url) 306 | if not image_r.ok: 307 | print_log(f"dl:{post_id}", "failed to retrieve image content") 308 | return 309 | image_file.write(image_r.content) 310 | return output_file_path, post_data 311 | 312 | 313 | def get_content_caption(post_data, content_soup): 314 | caption_divs = content_soup.find_all("div", recursive=False)[ 315 | 1].find_all("div", recursive=False) 316 | if len(caption_divs) > 1: 317 | caption_text = caption_divs[1].string.strip() 318 | 
post_data["description"].append("porn3dx caption: " + caption_text) 319 | 320 | 321 | def write_sidecar(path, data): 322 | if path and data: 323 | if len(data["urls"]) > 0: 324 | with open(f"{path}.urls.txt", "w") as urls_sidecar: 325 | for url in data["urls"]: 326 | urls_sidecar.write(f"{url}\n") 327 | if "timestamp" in data and data["timestamp"]: 328 | with open(f"{path}.time.txt", "w") as ts_sidecar: 329 | ts_sidecar.write(str(data["timestamp"])) 330 | if "tags" in data and len(data["tags"]) > 0: 331 | with open(f"{path}.tags.json", "w") as tags_sidecar: 332 | json.dump(data["tags"], tags_sidecar, 333 | ensure_ascii=False, indent=4) 334 | if "description" in data and data["description"]: 335 | with open(f"{path}.note.json", "w") as note_sidecar: 336 | json.dump(data["description"], note_sidecar, 337 | ensure_ascii=False, indent=4) 338 | 339 | 340 | def get_post_data(post_id, soup): 341 | post_data = {} 342 | post_data["id"] = post_id 343 | canonical_url = soup.find("link", rel="canonical")["href"] 344 | post_data["urls"] = [canonical_url] 345 | post_data["basefilename"] = canonical_url.split("/")[-1] 346 | # Info, Tags, Discussion, More 347 | post_meta_divs = soup.find( 348 | id="aside-scroll").div.div.find_all("div", recursive=False) 349 | # User, Like & Share, Description, Stats, Share 350 | info_div = post_meta_divs[0].find_all("div", recursive=False) 351 | tags = [] 352 | post_user_block = info_div[0] 353 | post_desc_block = info_div[2] 354 | tags.append("title:" + post_desc_block.find("h1").string.strip()) 355 | tags.append("creator:" + post_user_block.find_all("a") 356 | [-1].string.strip()[1:]) 357 | desc_and_ts = post_desc_block.find_all("div", recursive=False) 358 | ts_index = 0 359 | post_data["description"] = [] 360 | if len(desc_and_ts) > 1: 361 | ts_index = 1 362 | for description_link in desc_and_ts[0].find_all("a"): 363 | description_link.string = description_link["href"] 364 | post_data["description"].append( 365 | "porn3dx description: " + desc_and_ts[0].get_text().strip()) 366 | xtime_text = desc_and_ts[ts_index].span["x-text"] 367 | date_text_m = re.search(XTIME_REGEX, xtime_text) 368 | if not date_text_m: 369 | print_log(f"info:{post_id}", f"failed parsing date '{xtime_text}'") 370 | return None 371 | date_values = list(map(int, date_text_m.group(1).split(", "))) 372 | post_data["timestamp"] = int(datetime(date_values[0], (date_values[1] + 1) % 373 | 11, date_values[2], date_values[3], date_values[4], date_values[5]).timestamp()) 374 | tag_block = post_meta_divs[1].find_all("div", recursive=False)[1] 375 | for tag_link in tag_block.find_all("a", recursive=False): 376 | tag_category = "" 377 | tag_text = tag_link.string.strip() 378 | for tag_class in tag_link["class"]: 379 | tag_category_m = re.search(TAG_CATEGORY_REGEX, tag_class) 380 | if not tag_category_m: 381 | continue 382 | category_color = tag_category_m.group(1) 383 | if category_color in TAG_CATEGORY_MAP: 384 | tag_category = TAG_CATEGORY_MAP[tag_category_m.group(1)] 385 | break 386 | else: 387 | print_log( 388 | f"info:{post_id}", f"could not map tag category for tag '{tag_text}'") 389 | print_log( 390 | f"info:{post_id}", f"tag category for tag '{tag_text}' resolves to color '{category_color}'") 391 | break 392 | tag_category = tag_category + ":" if tag_category else "" 393 | tags.append(tag_category + tag_text) 394 | post_data["tags"] = tags 395 | print_log(f"info:{post_id}", f"post data: {post_data}", LogLevel.VERBOSE) 396 | return post_data 397 | 398 | 399 | def download_post(session, post_id, 
post_url):
400 |     # Download page to extract iframe embed url
401 |     print_log(f"info:{post_id}", "retrieving post page")
402 |     post_page_r = session.get(post_url)
403 |     if not post_page_r.ok:
404 |         print_log(f"info:{post_id}", "failed to retrieve post page")
405 |         return
406 |     page_soup = bs(post_page_r.content, "html.parser")
407 |     post_data = get_post_data(post_id, page_soup)
408 |     if not post_data:
409 |         print_log(f"info:{post_id}", "failed parsing post data")
410 |         return
411 |     post_contents = page_soup.find("main", id="postView").find_all("div", recursive=False)[
412 |         1].find("div", recursive=False).find_all("div", recursive=False)
413 |     content_index = 0
414 |     for content in post_contents:
415 |         if content.find("iframe"):
416 |             print_log(f"info:{post_id}", "getting video")
417 |             content_result = download_video(
418 |                 session, content_index, copy.deepcopy(post_data), content)
419 |         elif content.find("picture"):
420 |             print_log(f"info:{post_id}", "getting image")
421 |             content_result = download_image(
422 |                 session, content_index, copy.deepcopy(post_data), content)
423 |         if content_result and args.write_sidecars:
424 |             content_path, content_post_data = content_result
425 |             get_content_caption(content_post_data, content)
426 |             write_sidecar(content_path, content_post_data)
427 |         content_index += 1
428 | 
429 | 
430 | def main():
431 |     if len(args.posts) == 0:
432 |         parser.print_usage()
433 |         return
434 |     elif not os.path.isdir(args.directory):
435 |         os.makedirs(args.directory)
436 |     s = requests.Session()
437 |     s.headers = HEADERS
438 |     for post in args.posts:
439 |         url_m = re.search(POST_REGEX, post)
440 |         if url_m:
441 |             post_url = "https://" + url_m.group("url")
442 |             download_post(s, url_m.group("id"), post_url)
443 | 
444 | 
445 | parser = argparse.ArgumentParser()
446 | args = get_arguments()
447 | 
448 | if __name__ == "__main__":
449 |     main()
450 | 
--------------------------------------------------------------------------------
/porn3dx-downloader/requirements.txt:
--------------------------------------------------------------------------------
1 | bs4
2 | datetime
3 | js2py
4 | pycryptodome
5 | requests
6 | 
--------------------------------------------------------------------------------
/twscrape-wrapper/.gitignore:
--------------------------------------------------------------------------------
1 | accounts*
2 | 
3 | 
--------------------------------------------------------------------------------
/twscrape-wrapper/README.md:
--------------------------------------------------------------------------------
1 | # twscrape-wrapper
2 | Scrapes tweets into jsonl format. This script makes use of [twscrape](https://github.com/vladkens/twscrape) to replicate the functional output of
3 | ```sh
4 | snscrape --jsonl twitter-user >> file_name.tweets.json
5 | ```
6 | from before Twitter became worse for scraping (among other things).
7 | 
8 | Before using, you must follow the instructions in the [README for twscrape](https://github.com/vladkens/twscrape#add-accounts) to add at least one account to an `accounts.db` file in the same directory as the script. Additional functionality, such as sorting saved files, deduplicating tweets in a file, and automatically identifying the last saved tweet so as to limit search queries, was added for convenience.
9 | 
10 | *Unfortunately, retweets can only be retrieved from an initial profile scrape.
[Advanced search](https://github.com/igorbrigadir/twitter-advanced-search) queries (used with the `save-past` operation) are unable to retrieve retweets properly with modern twitter beyond ~10 days from the present for undocumented reasons. See [#2](https://github.com/CetaceanNation/misc-scripts/issues/2).* 11 | 12 | ### Usage 13 | ```sh 14 | usage: twscrape-wrapper.py [-h] [-n] {save,save-past,sort,dedupe} filename [handle] 15 | 16 | positional arguments: 17 | {save,save-past,sort,dedupe} 18 | operation to perform. 'save' downloads tweets to a file ('save-past' works in reverse), 'sort' re-orders tweets in a file, 'dedupe' removes entries with duplicate ids. 19 | filename file prefix to write tweets to (will be appended with .tweets.json) 20 | handle handle of the account to download from 21 | 22 | options: 23 | -h, --help show this help message and exit 24 | -n prompt for overwriting the existing tweet file 25 | --download-media, -m iterate through scraped/sorted tweets and download all media 26 | ``` 27 | -------------------------------------------------------------------------------- /twscrape-wrapper/twscrape-wrapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import asyncio 4 | from collections import OrderedDict 5 | from contextlib import aclosing 6 | import datetime 7 | from http import cookiejar 8 | import json 9 | import os 10 | import requests 11 | import sys 12 | from time import sleep 13 | from twscrape import API, AccountsPool, gather 14 | from twscrape.logger import set_log_level 15 | 16 | OPERATIONS = ["save", "save-past", "sort", "dedupe", "cookies-to-string"] 17 | KEY_ORDER_TWEET = ["_type", "url", "date", "rawContent", "renderedContent", "id", "user", "replyCount", "retweetCount", "likeCount", "quoteCount", "conversationId", "lang", "source", "sourceUrl", "sourceLabel", "links", "media", "retweetedTweet", 18 | "quotedTweet", "inReplyToTweetId", "inReplyToUser", "mentionedUsers", "coordinates", "place", "hashtags", "cashtags", "card", "viewCount", "vibe", "content", "outlinks", "outlinksss", "tcooutlinks", "tcooutlinksss", "username"] 19 | KEY_ORDER_USER = ["_type", "username", "id", "displayname", "rawDescription", "renderedDescription", "descriptionLinks", "verified", "created", "followersCount", "friendsCount", "statusesCount", 20 | "favouritesCount", "listedCount", "mediaCount", "location", "protected", "link", "profileImageUrl", "profileBannerUrl", "label", "description", "descriptionUrls", "linkTcourl", "linkUrl", "url"] 21 | BACKWARDS_INTERVAL = 120 22 | MEDIA_HEADERS = { 23 | "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:128.0) Gecko/20100101 Firefox/128.0"} 24 | MEDIA_IMAGE_NAMES = ["orig", "large", "medium", "900x900", "small", "thumb"] 25 | MEDIA_SLEEP_BETWEEN_DOWNLOADS = 2 26 | 27 | 28 | def datetime_handler(x): 29 | if isinstance(x, datetime.datetime): 30 | return x.isoformat() 31 | raise TypeError("Unknown type") 32 | 33 | 34 | def datetime_to_search_string(dt): 35 | return dt.isoformat().replace("T", "_") + "_UTC" 36 | 37 | 38 | async def get_account_id(api, handle): 39 | try: 40 | twitter_user = await api.user_by_login(handle) 41 | return twitter_user.id 42 | except Exception: 43 | print("could not lookup user with that handle") 44 | sys.exit(1) 45 | 46 | 47 | def get_account_from_file(filepath): 48 | if os.path.isfile(filepath): 49 | try: 50 | with open(filepath, "r") as tweetsfile: 51 | last_tweet_json = tweetsfile.readline() 52 | last_tweet = 
json.loads(last_tweet_json) 53 | return last_tweet["user"]["username"] 54 | except Exception: 55 | print(f"failed reading the existing tweets file '{filepath}'") 56 | sys.exit(1) 57 | print(f"could not find the specified file '{filepath}'") 58 | sys.exit(1) 59 | 60 | 61 | def get_saved_tweets(filepath): 62 | saved_tweets = [] 63 | saved_tweet_jsons = [] 64 | print(f"looking for tweets in {filepath}") 65 | if os.path.isfile(filepath): 66 | try: 67 | with open(filepath, "r") as tweetsfile: 68 | saved_tweets = tweetsfile.readlines() 69 | if len(saved_tweets) > 0: 70 | for tweet_json in saved_tweets: 71 | saved_tweet_jsons.append(json.loads(tweet_json)) 72 | except Exception: 73 | print("failed reading the existing tweets file") 74 | sys.exit(1) 75 | print(f"found {len(saved_tweet_jsons)} saved tweets") 76 | return saved_tweet_jsons 77 | 78 | 79 | def dedupe_tweets(tweets): 80 | stored_ids = [] 81 | filtered_tweets = [] 82 | for tweet in tweets: 83 | if not tweet["id"] in stored_ids: 84 | stored_ids.append(tweet["id"]) 85 | filtered_tweets.append(tweet) 86 | return filtered_tweets 87 | 88 | 89 | def get_last_tweet(tweets, since=True): 90 | if len(tweets) > 0: 91 | last_tweet_date = datetime.datetime.fromisoformat( 92 | tweets[0]["date"]).timestamp() 93 | else: 94 | last_tweet_date = -1 95 | last_tweet_id = None 96 | for tweet in tweets: 97 | current_tweet_date = datetime.datetime.fromisoformat( 98 | tweet["date"]).timestamp() 99 | if (since and current_tweet_date > last_tweet_date) or (not since and current_tweet_date < last_tweet_date): 100 | last_tweet_date = current_tweet_date 101 | last_tweet_id = tweet["id"] 102 | return last_tweet_date, last_tweet_id 103 | 104 | 105 | async def gather_initial_tweets(api, account_handle): 106 | tweets = [] 107 | account_id = await get_account_id(api, account_handle) 108 | user_tweets = await gather(api.user_tweets(account_id)) 109 | for tweet in user_tweets: 110 | if tweet.user.username == account_handle: 111 | tweets.append(json.loads(json.dumps( 112 | tweet.dict(), default=datetime_handler))) 113 | return tweets 114 | 115 | 116 | async def gather_tweets(api, account_handle, last_timestamp): 117 | tweets = [] 118 | # Subtract a day to try ensuring overlap, prevents <24 hour difference issues 119 | last_datetime = datetime.datetime.fromtimestamp(last_timestamp) 120 | last_datetime -= datetime.timedelta(days=1) 121 | last_datetime_string = datetime_to_search_string(last_datetime) 122 | query_string = f"from:{account_handle} since:{last_datetime_string}" 123 | print(f"query: '{query_string}'") 124 | user_tweets = await gather(api.search(query_string)) 125 | for tweet in user_tweets: 126 | if tweet.user.username == account_handle: 127 | tweets.append(json.loads(json.dumps( 128 | tweet.dict(), default=datetime_handler))) 129 | return tweets 130 | 131 | 132 | async def gather_tweets_backwards(api, account_handle, latest_timestamp): 133 | tweets = [] 134 | # Add a day to try ensuring overlap, prevents <24 hour difference issues 135 | latest_datetime = datetime.datetime.fromtimestamp(latest_timestamp) 136 | latest_datetime += datetime.timedelta(days=1) 137 | while True: 138 | latest_datetime_string = datetime_to_search_string(latest_datetime) 139 | back_one_month = latest_datetime - \ 140 | datetime.timedelta(days=BACKWARDS_INTERVAL) 141 | back_one_month_string = datetime_to_search_string(back_one_month) 142 | query_string = f"from:{account_handle} since:{back_one_month_string} until:{latest_datetime_string}" 143 | print(f"query: '{query_string}'") 144 | 
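        # Each pass requests a since/until window BACKWARDS_INTERVAL (120) days wide
        # via twscrape's search, then the window slides further into the past; an
        # empty result is treated as the start of the account's history and ends
        # the loop.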
user_tweets = await gather(api.search(query_string)) 145 | if len(user_tweets) == 0: 146 | break 147 | for tweet in user_tweets: 148 | if tweet.user.username == account_handle: 149 | tweets.append(json.loads(json.dumps( 150 | tweet.dict(), default=datetime_handler))) 151 | latest_datetime = back_one_month + datetime.timedelta(days=1) 152 | return tweets 153 | 154 | 155 | def sort_tweets(tweets): 156 | return list(reversed(sorted(tweets, key=lambda t: datetime.datetime.fromisoformat(t["date"]).timestamp()))) 157 | 158 | 159 | def order_tweet_dict(tweet): 160 | ordered_tweet = OrderedDict((key, tweet.get(key)) 161 | for key in KEY_ORDER_TWEET) 162 | ordered_tweet["user"] = OrderedDict( 163 | (key, ordered_tweet["user"].get(key)) for key in KEY_ORDER_USER) 164 | return ordered_tweet 165 | 166 | 167 | def write_tweets(tweets, tmp_filepath, filepath, overwrite): 168 | with open(tmp_filepath, "w") as tmp_tweetsfile: 169 | for tweet in tweets: 170 | tweet_json = json.dumps(order_tweet_dict( 171 | tweet), default=datetime_handler) 172 | tmp_tweetsfile.write(f"{tweet_json}\n") 173 | overwrite = overwrite or input("overwrite existing file? (y/N): ") == "y" 174 | if overwrite: 175 | os.replace(tmp_filepath, filepath) 176 | 177 | 178 | def get_tweet_media(base_filepath, tweets): 179 | media_directory = base_filepath + ".media" 180 | if not os.path.isdir(media_directory): 181 | try: 182 | os.makedirs(media_directory) 183 | except: 184 | try: 185 | media_directory = os.path.join(os.getcwd(), media_directory) 186 | os.makedirs(media_directory) 187 | except: 188 | print( 189 | "could not find or make the media directory, skipping media downloads") 190 | return 191 | print(f"downloading media for {len(tweets)} tweets") 192 | media_count = 0 193 | for tweet in tweets: 194 | if "media" in tweet and tweet["media"]: 195 | tweet_url = tweet["url"] 196 | tweet_media = tweet["media"] 197 | media_index = 0 198 | tweet_id = tweet["id"] 199 | if "photos" in tweet_media and tweet_media["photos"]: 200 | images = tweet_media["photos"] 201 | for image in images: 202 | image_url = image["url"] 203 | image_url_path, image_ext = os.path.splitext( 204 | image_url.split("?")[0]) 205 | image_filename = f"{tweet_id}_{media_index}{image_ext}" 206 | image_fmt = image_ext[1:] 207 | for img_name in MEDIA_IMAGE_NAMES: 208 | image_url = f"{image_url_path}?format={image_fmt}&name={img_name}" 209 | image_res = download_media_file( 210 | tweet_url, image_url, os.path.join(media_directory, image_filename)) 211 | if image_res > 1: 212 | media_count += 1 213 | if image_res > 0: 214 | media_index += 1 215 | break 216 | if "videos" in tweet_media and tweet_media["videos"]: 217 | videos = tweet_media["videos"] 218 | for video in videos: 219 | video_variants_sorted = sorted( 220 | video["variants"], key=lambda v: v["bitrate"], reverse=True) 221 | for variant in video_variants_sorted: 222 | video_url = variant["url"] 223 | _, video_ext = os.path.splitext( 224 | video_url.split("?")[0]) 225 | video_filename = f"{tweet_id}_{media_index}{video_ext}" 226 | video_res = download_media_file( 227 | tweet_url, video_url, os.path.join(media_directory, video_filename)) 228 | if video_res > 1: 229 | media_count += 1 230 | if video_res > 0: 231 | media_index += 1 232 | break 233 | if "animated" in tweet_media and tweet_media["animated"]: 234 | animations = tweet_media["animated"] 235 | for animation in animations: 236 | animation_url = animation["videoUrl"] 237 | _, animation_ext = os.path.splitext( 238 | animation_url.split("?")[0]) 239 | 
animation_filename = f"{tweet_id}_{media_index}{animation_ext}" 240 | animation_res = download_media_file( 241 | tweet_url, animation_url, os.path.join(media_directory, animation_filename)) 242 | if animation_res > 1: 243 | media_count += 1 244 | if animation_res > 0: 245 | media_index += 1 246 | break 247 | print(f"downloaded {media_count} new media files") 248 | 249 | 250 | def download_media_file(tweet_url, media_url, filename): 251 | if os.path.isfile(filename): 252 | return 1 253 | try: 254 | media_r = requests.get(media_url, headers=MEDIA_HEADERS) 255 | if not media_r.ok: 256 | print(f"got bad response for media '{media_url}' from '{tweet_url}'") 257 | return 0 258 | with open(filename, "wb") as media_file: 259 | media_file.write(media_r.content) 260 | media_file.flush() 261 | sleep(MEDIA_SLEEP_BETWEEN_DOWNLOADS) 262 | except: 263 | print(f"got exception for media '{media_url}' from '{tweet_url}'") 264 | return 0 265 | return 2 if os.path.isfile(filename) else 0 266 | 267 | 268 | async def main(): 269 | parser = argparse.ArgumentParser( 270 | formatter_class=argparse.RawTextHelpFormatter) 271 | parser.add_argument("mode", choices=OPERATIONS, 272 | help="operation to perform. 'save' downloads tweets to a file ('save-past' works in reverse), 'sort' re-orders tweets in a file, 'dedupe' removes entries with duplicate ids. 'cookies-to-string' prints a Netscape HTTP Cookie File in a format compatible for adding accounts.") 273 | parser.add_argument( 274 | "filename", help="file prefix to write tweets to (will be appended with .tweets.json)") 275 | parser.add_argument("handle", nargs="?", 276 | help="handle of the account to download from") 277 | parser.add_argument("-n", action="store_false", 278 | help="prompt for overwriting the existing tweet file") 279 | parser.add_argument("--download-media", "-m", action="store_true", 280 | help="iterate through scraped/sorted tweets and download all media") 281 | args = parser.parse_args() 282 | if args.mode == OPERATIONS[4]: 283 | filepath = args.filename 284 | if not os.path.isfile(filepath): 285 | filepath = os.path.join(os.getcwd(), filepath) 286 | cookie_jar = cookiejar.MozillaCookieJar(filepath) 287 | try: 288 | cookie_jar.load() 289 | cookies_string = "" 290 | for cookie in cookie_jar: 291 | cookies_string += f"{cookie.name}={cookie.value}; " 292 | print(cookies_string) 293 | except: 294 | print("cookies could not be loaded, file format might not be valid") 295 | return 296 | base_filepath = args.filename.replace(".tweets.json", "") 297 | tmp_filepath = base_filepath + ".tmp.tweets.json" 298 | filepath = base_filepath + ".tweets.json" 299 | if not os.path.isfile(filepath): 300 | filepath = os.path.join(os.getcwd(), filepath) 301 | if args.mode == OPERATIONS[0] or args.mode == OPERATIONS[1]: 302 | since = args.mode == OPERATIONS[0] 303 | script_path = os.path.dirname(os.path.realpath(__file__)) 304 | api = API(AccountsPool(script_path + "/accounts.db")) 305 | account_handle = args.handle if args.handle else get_account_from_file( 306 | filepath) 307 | if not account_handle: 308 | print("could not get find handle in the provided file") 309 | sys.exit(1) 310 | print(f"getting tweets from account {account_handle}") 311 | saved_tweets = get_saved_tweets(filepath) 312 | last_saved_tweet_date, last_saved_tweet_id = get_last_tweet( 313 | saved_tweets, since) 314 | tweets_gathered = [] 315 | if last_saved_tweet_date < 0: 316 | print("no previous tweets, creating new file") 317 | tweets_gathered = await gather_initial_tweets(api, account_handle) 318 | 
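        # When the file already has tweets, the newest saved tweet (or the oldest
        # one for 'save-past') seeds the search below so only the missing range is
        # requested; the gather_* helpers pad the boundary by a day to keep some
        # overlap.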
else: 319 | last_saved_datetime = datetime.datetime.fromtimestamp( 320 | last_saved_tweet_date) 321 | print( 322 | f"retrieving tweets {'since' if since else 'until'} {last_saved_datetime.isoformat()}") 323 | if since: 324 | tweets_gathered = await gather_tweets(api, account_handle, last_saved_tweet_date) 325 | else: 326 | tweets_gathered = await gather_tweets_backwards(api, account_handle, last_saved_tweet_date) 327 | tweets_gathered = list(filter(lambda t: datetime.datetime.fromisoformat( 328 | t["date"]).timestamp() != last_saved_tweet_date, tweets_gathered)) 329 | if len(tweets_gathered) > 0: 330 | sorted_tweets_gathered = sort_tweets(tweets_gathered) 331 | last_retrieved_tweet_date = datetime.datetime.fromisoformat( 332 | sorted_tweets_gathered[-1]["date"]).timestamp() 333 | if last_saved_tweet_date > 0 and last_retrieved_tweet_date > last_saved_tweet_date: 334 | last_retrieved_datetime = datetime.datetime.fromtimestamp( 335 | last_retrieved_tweet_date) 336 | tweets_difference = last_retrieved_datetime - last_saved_datetime 337 | print( 338 | f"warning: oldest tweet retrieved is from {last_retrieved_datetime.isoformat()}, {tweets_difference.days} days difference") 339 | if args.download_media: 340 | get_tweet_media(base_filepath, sorted_tweets_gathered) 341 | sorted_tweets_gathered.extend(saved_tweets) 342 | tweets_filtered = dedupe_tweets(sorted_tweets_gathered) 343 | tweets_sorted = sort_tweets(tweets_filtered) 344 | print( 345 | f"scraped {len(tweets_sorted) - len(saved_tweets)} new tweets") 346 | write_tweets(tweets_sorted, tmp_filepath, filepath, args.n) 347 | else: 348 | print(f"no {'new' if since else 'prior'} tweets found") 349 | elif args.mode == OPERATIONS[2]: 350 | saved_tweets = get_saved_tweets(filepath) 351 | if len(saved_tweets) > 0: 352 | tweets_sorted = sort_tweets(saved_tweets) 353 | print(f"sorted tweets in {filepath}") 354 | write_tweets(tweets_sorted, tmp_filepath, filepath, args.n) 355 | if args.download_media: 356 | get_tweet_media(base_filepath, tweets_sorted) 357 | elif args.mode == OPERATIONS[3]: 358 | saved_tweets = get_saved_tweets(filepath) 359 | if len(saved_tweets) > 0: 360 | tweets_filtered = dedupe_tweets(saved_tweets) 361 | print( 362 | f"removed {len(saved_tweets) - len(tweets_filtered)} duplicates") 363 | write_tweets(tweets_filtered, tmp_filepath, filepath, args.n) 364 | if args.download_media: 365 | get_tweet_media(base_filepath, tweets_filtered) 366 | 367 | if __name__ == "__main__": 368 | asyncio.run(main()) 369 | -------------------------------------------------------------------------------- /vrchat-asset-downloader/.gitignore: -------------------------------------------------------------------------------- 1 | vrchat-token 2 | 3 | -------------------------------------------------------------------------------- /vrchat-asset-downloader/README.md: -------------------------------------------------------------------------------- 1 | # vrchat-asset-downloader 2 | Use the [VRChat](https://vrchat.com/home) API to download assets. **Only works with world maps**. 3 | 4 | ### Usage 5 | ```sh 6 | usage: vrchat-asset-downloader.py [-h] [-V] [-d DIRECTORY] [--write-thumbnail] [--write-json] [--dont-clean-json] [--verify] [--skip-download] 7 | [--revisions REVISIONS] [--list-revisions] 8 | [ASSET IDS ...] 9 | 10 | positional arguments: 11 | ASSET IDS world/avatar id(s) i.e. 
wrld_12345678-90ab-cdef-1234-567890abcdef 12 | 13 | options: 14 | -h, --help show this help message and exit 15 | -V, --verbose print debugging information 16 | -d DIRECTORY, --directory DIRECTORY 17 | save directory (defaults to current) 18 | --write-thumbnail save thumbnail for the asset (if used with '--revision all', all thumbnail revisions will be retrieved) 19 | --write-json write metadata to .json file(s) 20 | --dont-clean-json retain all json values when writing .json file(s) 21 | --verify whether or not to verify downloaded files against remote hashes 22 | --skip-download skip downloading the actual asset(s) 23 | --revisions REVISIONS 24 | valid values are the keywords 'all' and 'latest', or the revision integer itself 25 | --list-revisions list available revisions for the specified asset 26 | ``` 27 | -------------------------------------------------------------------------------- /vrchat-asset-downloader/requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | requests_toolbelt 3 | tqdm 4 | -------------------------------------------------------------------------------- /vrchat-asset-downloader/vrchat-asset-downloader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import base64 4 | from enum import Enum 5 | import hashlib 6 | import json 7 | import os 8 | import re 9 | import requests 10 | from requests_toolbelt import sessions 11 | import shutil 12 | import sys 13 | from tqdm.auto import tqdm 14 | import urllib.parse as urlparse 15 | 16 | # api.vrchat.cloud domain does not always return full json details 17 | API_URL = "https://vrchat.com/api/1/" 18 | HEADERS = {"Host": "vrchat.com", 19 | "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:91.0) Gecko/20100101 Firefox/91.0"} 20 | GUID_REGEX = r"_[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}" 21 | ASSET_REGEX = r"^(?Pwrld)" + GUID_REGEX + r"$" 22 | FILE_REGEX = r"^https?:\/\/api\.vrchat\.cloud\/api\/1\/file\/(?Pfile" + \ 23 | GUID_REGEX + r")\/[0-9]+\/file$" 24 | CLEAN_FILENAME_WIN = r"[\/\\:*?\"<>|]" 25 | CLEAN_FILENAME_POS = r"[\/]" 26 | BLOCK_SIZE = 1024 27 | REMOVE_FROM_JSON = ["favorites", "visits", "popularity", "heat", 28 | "publicOccupants", "privateOccupants", "occupants", "instances"] 29 | ASSET_TYPES = {"wrld": "world", "avtr": "avatar"} 30 | 31 | 32 | class LogLevel(Enum): 33 | BASIC = 1 34 | VERBOSE = 2 35 | 36 | 37 | def clean_filename(path): 38 | if os.name == "nt": 39 | return re.sub(CLEAN_FILENAME_WIN, "_", path) 40 | else: 41 | return re.sub(CLEAN_FILENAME_WIN, "_", path) 42 | 43 | 44 | def get_auth(s): 45 | config_r = s.get(f"config") 46 | if not config_r.ok: 47 | print_log("config", "failed to retrieve API key") 48 | print_log( 49 | "config", f"config endpoint returned status '{config_r.status_code}'", level=LogLevel.VERBOSE) 50 | sys.exit(1) 51 | config_j = config_r.json() 52 | if "clientApiKey" not in config_j or not config_j["clientApiKey"]: 53 | print_log("config", "failed to retrieve API key") 54 | print_log("config", f"config response lacks clientApiKey value", 55 | level=LogLevel.VERBOSE) 56 | sys.exit(1) 57 | clientKey = config_j["clientApiKey"] 58 | token_path = os.path.join(os.getcwd(), "vrchat-token") 59 | auth_cookie = None 60 | if os.path.isfile(token_path): 61 | print_log("auth", "reading saved token") 62 | with open(token_path, "r") as token_file: 63 | auth_cookie = token_file.read() 64 | if not auth_cookie or len(auth_cookie) == 0: 65 
| print_log( 66 | "auth", f"while logged in to vrchat.com, visit '{API_URL}auth?apiKey={clientKey}' in your browser") 67 | auth_cookie = input("copy your token value here: ") 68 | s.headers["Cookie"] = f"apiKey={clientKey}; auth={auth_cookie};" 69 | auth_r = s.get(f"auth?apiKey={clientKey}") 70 | if not auth_r.ok: 71 | print_log( 72 | "auth", "error: the token you provided does not appear to be valid") 73 | sys.exit(1) 74 | with open(token_path, "w") as token_file: 75 | token_file.write(auth_cookie) 76 | return clientKey 77 | 78 | 79 | def download_asset(a_type, a_id, s, api_key): 80 | url = f"{a_type}s/{a_id}?{urlparse.urlencode(api_key)}" 81 | r = s.get(url) 82 | if not r.ok: 83 | print_log(f"{a_type}", f"failed to retrieve API response for {a_id}") 84 | print_log( 85 | f"{a_type}", f"asset endpoint returned status '{r.status_code}'", level=LogLevel.VERBOSE) 86 | return 87 | asset_j = r.json() 88 | 89 | file_j = None 90 | if "assetUrl" in asset_j and asset_j["assetUrl"]: 91 | # this URL no longer returned, may not ever exist again 92 | asset_m = re.search(FILE_REGEX, asset_j["assetUrl"]) 93 | if asset_m: 94 | file_id = asset_m.group("file_id") 95 | print_log( 96 | f"{a_type}", f"found asset for '{asset_j['name']}' ({a_id})") 97 | file_j = get_file_json(file_id, s) 98 | asset_j["_assetFile"] = file_j 99 | else: 100 | print_log( 101 | f"{a_type}", f"could not find the asset url for '{asset_j['name']}' ({a_id})") 102 | print_log( 103 | f"{a_type}", f"assetUrl did not match expected pattern ('asset_j['assetUrl']')", level=LogLevel.VERBOSE) 104 | elif "unityPackages" in asset_j and len(asset_j["unityPackages"]) > 0: 105 | # new way to get asset file ids 106 | for unityPackage in asset_j["unityPackages"]: 107 | asset_m = re.search(FILE_REGEX, unityPackage["assetUrl"]) 108 | if asset_m: 109 | file_id = asset_m.group("file_id") 110 | print_log( 111 | f"{a_type}", f"found asset for '{asset_j['name']}' ({a_id})") 112 | file_j = get_file_json(file_id, s) 113 | asset_j["_assetFile"] = file_j 114 | else: 115 | print_log( 116 | f"{a_type}", f"could not find the asset url for '{asset_j['name']}' ({a_id})") 117 | print_log( 118 | f"{a_type}", f"assetUrl did not match expected pattern ('asset_j['assetUrl']')", level=LogLevel.VERBOSE) 119 | else: 120 | print_log( 121 | f"{a_type}", f"could not find the asset url for '{asset_j['name']}' ({a_id})") 122 | print_log(f"{a_type}", f"asset response lacks assetUrl value", 123 | level=LogLevel.VERBOSE) 124 | 125 | image_j = None 126 | if "imageUrl" in asset_j and asset_j["imageUrl"]: 127 | image_m = re.search(FILE_REGEX, asset_j["imageUrl"]) 128 | if image_m: 129 | image_id = image_m.group("file_id") 130 | print_log( 131 | f"{a_type}", f"found image for '{asset_j['name']}' ({a_id})") 132 | image_j = get_file_json(image_id, s) 133 | asset_j["_imageFile"] = image_j 134 | else: 135 | print_log( 136 | f"{a_type}", f"could not find the image url for '{asset_j['name']}' ({a_id})") 137 | print_log( 138 | f"{a_type}", f"imageUrl did not match expected pattern ('{asset_j['imageUrl']}')", level=LogLevel.VERBOSE) 139 | else: 140 | print_log( 141 | f"{a_type}", f"could not find the image url for '{asset_j['name']}' ({a_id})") 142 | print_log(f"{a_type}", f"asset response lacks imageUrl value", 143 | level=LogLevel.VERBOSE) 144 | 145 | if args.dont_clean_json: 146 | for key in REMOVE_FROM_JSON: 147 | if key in asset_j: 148 | asset_j.pop(key) 149 | 150 | if args.list_revisions: 151 | list_file_versions(file_j) 152 | return 153 | else: 154 | save_dir = os.path.join( 155 | 
args.directory, clean_filename(asset_j["name"])) 156 | if not os.path.isdir(save_dir): 157 | os.makedirs(save_dir) 158 | if args.write_json: 159 | json_filename = clean_filename(f"{a_id}.json") 160 | json_filepath = os.path.join(save_dir, json_filename) 161 | if os.path.isfile(f"{json_filepath}.tmp"): 162 | os.remove(f"{json_filepath}.tmp") 163 | print_log( 164 | f"{a_type}", f"writing asset information to '{json_filename}'") 165 | with open(f"{json_filepath}.tmp", "w") as json_file: 166 | json_file.write(json.dumps(asset_j)) 167 | if os.path.isfile(json_filepath): 168 | os.remove(json_filepath) 169 | os.rename(f"{json_filepath}.tmp", json_filepath) 170 | if image_j and args.write_thumbnail: 171 | download_file_from_json(image_j, save_dir, s) 172 | if asset_j and not args.skip_download: 173 | download_file_from_json(file_j, save_dir, s) 174 | print_log(f"{a_type}", f"finished '{asset_j['name']}' ({a_id})") 175 | 176 | 177 | def get_file_json(f_id, s): 178 | url = f"file/{f_id}" 179 | r = s.get(url) 180 | if not r.ok: 181 | print_log("file", f"failed to retrieve API response for {f_id}") 182 | print_log( 183 | "file", f"file endpoint returned status '{r.status_code}'", level=LogLevel.VERBOSE) 184 | return None 185 | file_j = r.json() 186 | return file_j 187 | 188 | 189 | def list_file_versions(file_j): 190 | print(f"{'VERSION':<7} {'CREATED AT':<24} {'SIZE (BYTES)':<12} {'MD5':<32}") 191 | print(f"{'-' * 7} {'-' * 24} {'-' * 12} {'-' * 32}") 192 | for revision in file_j["versions"][1:]: 193 | md5sum = base64.b64decode(revision["file"]["md5"]) 194 | file_size = revision['file']['sizeInBytes'] 195 | print(f" {revision['version']:<6} " + 196 | f"{revision['created_at']:<24} " + 197 | f"{str(file_size):>12} " + 198 | f"{md5sum.hex():<32}") 199 | 200 | 201 | def download_file_from_json(file_j, save_dir, s): 202 | get_versions = [] 203 | latest_rev = len(file_j["versions"]) - 1 204 | term_width = shutil.get_terminal_size((80, 20))[0] 205 | if args.revisions == "all": 206 | get_versions = [*range(1, latest_rev + 1)] 207 | elif args.revisions == "latest": 208 | get_versions.append(latest_rev) 209 | elif int(args.revisions) < 1 or int(args.revisions) > latest_rev: 210 | print_log( 211 | "file", f"error: revision specified out of range, try --list-revisions") 212 | return 213 | else: 214 | get_versions.append(int(args.revisions)) 215 | print_log("file", f"Downloading {file_j['name']}") 216 | for dl_num, dl_ver in enumerate(get_versions): 217 | cur_j = file_j["versions"][dl_ver]["file"] 218 | file_path = os.path.join(save_dir, clean_filename(cur_j["fileName"])) 219 | if os.path.isfile(file_path): 220 | print_log("file", f"'{cur_j['fileName']}' already exists") 221 | else: 222 | s.headers["Host"] = "api.vrchat.cloud" 223 | redirect_r = s.get( 224 | cur_j["url"], stream=True, allow_redirects=False) 225 | if not redirect_r.ok: 226 | print_log( 227 | f"file", f"could not retrieve file for '{file_j['id']}'") 228 | print_log( 229 | f"file", f"file url '{cur_j['url']}' returned status '{redirect_r.status_code}'", level=LogLevel.VERBOSE) 230 | break 231 | file_r = requests.get(redirect_r.headers["Location"], stream=True) 232 | if not file_r.ok: 233 | print_log( 234 | f"file", f"could not retrieve file for '{file_j['id']}'") 235 | print_log( 236 | f"file", f"file url '{cur_j['url']}' returned status '{file_r.status_code}'", level=LogLevel.VERBOSE) 237 | break 238 | total_size = int(cur_j["sizeInBytes"]) 239 | with tqdm.wrapattr(open(file_path, "wb"), "write", 240 | desc=f"[file] Rev {dl_ver} ({dl_num + 
1}/{len(get_versions)})", bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{rate_fmt}{postfix}]", 241 | ncols=int(term_width * 0.8), total=total_size, 242 | unit="B", unit_scale=True, unit_divisor=BLOCK_SIZE 243 | ) as file_h: 244 | for chunk in file_r.iter_content(BLOCK_SIZE): 245 | file_h.write(chunk) 246 | file_h.flush() 247 | if args.verify and os.path.isfile(file_path): 248 | verify_file(cur_j["fileName"], file_path, cur_j["md5"]) 249 | return 250 | 251 | 252 | def verify_file(file_name, file_path, md5b64): 253 | print_log("hash", f"verifying {file_name}...", overwrite=True) 254 | remote_md5 = base64.b64decode(md5b64) 255 | with open(file_path, "rb") as file_h: 256 | local_md5 = hashlib.md5() 257 | while chunk := file_h.read(BLOCK_SIZE): 258 | local_md5.update(chunk) 259 | if remote_md5 == local_md5.digest(): 260 | print_log("hash", f"'{file_name}' verified successfully") 261 | else: 262 | print_log("hash", f"'{file_name}' failed to verify") 263 | 264 | 265 | def print_log(component, message, level=LogLevel.BASIC, overwrite=False): 266 | if level == LogLevel.VERBOSE and not args.verbose: 267 | return 268 | if overwrite: 269 | print(f"[{component}] {message}", end="\r") 270 | else: 271 | print(f"[{component}] {message}") 272 | 273 | 274 | def get_arguments(): 275 | parser.add_argument("-V", "--verbose", action="store_true", 276 | help="print debugging information") 277 | parser.add_argument("-d", "--directory", type=str, 278 | help="save directory (defaults to current)", default=os.getcwd()) 279 | parser.add_argument("--write-thumbnail", action="store_true", 280 | help="save thumbnail for the asset (if used with '--revision all', all thumbnail revisions will be retrieved)") 281 | parser.add_argument("--write-json", action="store_true", 282 | help="write metadata to .json file(s)") 283 | parser.add_argument("--dont-clean-json", action="store_false", 284 | help="retain all json values when writing .json file(s)") 285 | parser.add_argument("--verify", action="store_true", 286 | help="whether or not to verify downloaded files against remote hashes", default=False) 287 | parser.add_argument("--skip-download", action="store_true", 288 | help="skip downloading the actual asset(s)") 289 | parser.add_argument("--revisions", type=str, 290 | help="valid values are the keywords 'all' and 'latest', or the revision integer itself", default="latest") 291 | parser.add_argument("--list-revisions", action="store_true", 292 | help="list available revisions for the specified asset") 293 | parser.add_argument("asset_id_list", metavar="ASSET IDS", nargs="*", 294 | help="world/avatar id(s) i.e. 
wrld_12345678-90ab-cdef-1234-567890abcdef") 295 | return parser.parse_args() 296 | 297 | 298 | def main(): 299 | if len(args.asset_id_list) == 0: 300 | parser.print_usage() 301 | return 302 | elif not os.path.isdir(args.directory): 303 | os.makedirs(args.directory) 304 | api_session = sessions.BaseUrlSession(base_url=API_URL) 305 | api_session.headers = HEADERS 306 | api_key = get_auth(api_session) 307 | api_key_t = {"apiKey": api_key} 308 | for asset_id in args.asset_id_list: 309 | asset_type_m = re.search(ASSET_REGEX, asset_id) 310 | if asset_type_m: 311 | asset_type = asset_type_m.group("asset_type") 312 | download_asset(ASSET_TYPES[asset_type], 313 | asset_id, api_session, api_key_t) 314 | else: 315 | print_log("vrchat-asset-downloader", 316 | f"id {asset_id} does not appear to be valid") 317 | 318 | 319 | parser = argparse.ArgumentParser() 320 | args = get_arguments() 321 | 322 | if __name__ == "__main__": 323 | main() 324 | -------------------------------------------------------------------------------- /vroid-hub-downloader/README.md: -------------------------------------------------------------------------------- 1 | # vroid-hub-downloader 2 | Downloads preview models (viewable in the browser) from [VRoid Hub](https://hub.vroid.com/). Handles decryption and decompression (assist from bin). 3 | 4 | These decrypted models do not "just work"; you will have to manually make the adjustments necessary to use them. See the comments on [this gist](https://gist.github.com/Pldare/ebf704c752a8d77ff9603d4adfe54083) for more info. 5 | 6 | ### Usage 7 | ```sh 8 | usage: vroid-hub-downloader.py [-h] [-d DIRECTORY] [--write-info-json] [vroid links/vrm files ...] 9 | 10 | positional arguments: 11 | vroid links/vrm files 12 | vroid hub links or encrypted vrm files i.e. 13 | https://hub.vroid.com/en/users/49620 14 | https://hub.vroid.com/en/characters/6819070713126783571/models/9038381612772945358 15 | 2520951134072570694.vrm 16 | 17 | options: 18 | -h, --help show this help message and exit 19 | -d DIRECTORY, --directory DIRECTORY 20 | save directory (defaults to current) 21 | --write-info-json write user/model json information for urls 22 | ``` 23 | -------------------------------------------------------------------------------- /vroid-hub-downloader/requirements.txt: -------------------------------------------------------------------------------- 1 | pycryptodome 2 | Requests 3 | zstandard 4 | -------------------------------------------------------------------------------- /vroid-hub-downloader/vroid-hub-downloader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | from Crypto.Cipher import AES 4 | import gzip 5 | import io 6 | import json 7 | import os 8 | import re 9 | import requests 10 | import sys 11 | import zstandard 12 | 13 | USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/114.0" 14 | HOST = "https://hub.vroid.com" 15 | API_VERSION = "11" 16 | MODEL_FILE_EXT = "glb" 17 | VROID_BASE = r"(?:https?:\/\/)?hub\.vroid\.com\/(?P<lang>[a-z]{2}\/)?" 
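# The patterns below match the VRoid Hub URLs this script accepts on the command line, e.g.
#   https://hub.vroid.com/en/users/49620  (a user's model listing)
#   https://hub.vroid.com/en/characters/6819070713126783571/models/9038381612772945358  (a single model page)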
18 | VROID_USER = VROID_BASE + r"users/(?P<user_id>\d+)" 19 | VROID_MODEL = VROID_BASE + \ 20 | r"characters\/(?P<char_id>\d+)\/models\/(?P<model_id>\d+)" 21 | 22 | 23 | def unpad(s): 24 | return s[:-ord(s[len(s)-1:])] 25 | 26 | 27 | def get_user_model_ids(user_id): 28 | model_ids = [] 29 | api_url = f"{HOST}/api/users/{user_id}/character_models?antisocial_or_hate_usage=&characterization_allowed_user=&corporate_commercial_use=&credit=&modification=&personal_commercial_use=&political_or_religious_usage=&redistribution=&sexual_expression=&violent_expression=" 30 | page_num = 1 31 | while api_url: 32 | user_r = requests.get( 33 | api_url, headers={"User-Agent": USER_AGENT, "X-Api-Version": API_VERSION}) 34 | if not user_r.ok: 35 | print( 36 | f"[user:{user_id}:page:{page_num}] got bad response from vroid hub, {user_r.status_code}") 37 | break 38 | user_j = user_r.json() 39 | if "next" in user_j["_links"]: 40 | api_url = HOST + user_j["_links"]["next"]["href"] 41 | else: 42 | api_url = None 43 | for model in user_j["data"]: 44 | model_ids.append(model["id"]) 45 | print(f"[user:{user_id}] found {len(model_ids)} models") 46 | return model_ids 47 | 48 | 49 | def download_preview_model(model_id): 50 | model_preview_url = f"{HOST}/api/character_models/{model_id}/optimized_preview" 51 | model_r = requests.get(model_preview_url, allow_redirects=True, headers={ 52 | "User-Agent": USER_AGENT, "X-Api-Version": API_VERSION}) 53 | if not model_r.ok: 54 | print( 55 | f"[model:{model_id}:preview] got bad response from vroid hub, {model_r.status_code}") 56 | print(f"[model:{model_id}:preview] {model_r.content.decode()}") 57 | return None 58 | return io.BytesIO(model_r.content) 59 | 60 | 61 | def decrypt_decompress_model(model_id, model_bytes, model_filename): 62 | if not os.path.isfile(model_filename): 63 | with open(model_filename, "wb") as dec_vrm: 64 | iv_bytes = model_bytes.read(16)  # encrypted previews start with a 16-byte IV followed by a 32-byte AES key 65 | key_bytes = model_bytes.read(32) 66 | key_context = AES.new(key_bytes, AES.MODE_CBC, iv_bytes) 67 | enc_data = model_bytes.read() 68 | dec_data = unpad(key_context.decrypt(enc_data))[4:]  # strip padding and a 4-byte prefix before the zstd stream 69 | dctx = zstandard.ZstdDecompressor() 70 | with dctx.stream_writer(dec_vrm) as decompressor: 71 | decompressor.write(dec_data) 72 | print( 73 | f"[model:{model_id}] wrote decrypted and decompressed model '{os.path.basename(model_filename)}'") 74 | else: 75 | print( 76 | f"[model:{model_id}] '{os.path.basename(model_filename)}' already exists") 77 | 78 | 79 | def download_model_from_vroid(model_id, subdir=None): 80 | model_path_base = os.path.join( 81 | subdir if subdir else args.directory, model_id) 82 | model_api_url = f"{HOST}/api/character_models/{model_id}" 83 | json_path = f"{model_path_base}.info.json" 84 | model_api_r = requests.get(model_api_url, headers={ 85 | "User-Agent": USER_AGENT, "X-Api-Version": API_VERSION}) 86 | if not model_api_r.ok: 87 | print( 88 | f"[model:{model_id}:api] got bad response from vroid hub, {model_api_r.status_code}") 89 | return 90 | model_api_j = model_api_r.json()["data"] 91 | if args.write_info_json and not os.path.isfile(json_path): 92 | with open(json_path, "w") as json_file: 93 | json_file.write(json.dumps(model_api_j)) 94 | print(f"[model:{model_id}:api] wrote '{os.path.basename(json_path)}'") 95 | elif args.write_info_json: 96 | print( 97 | f"[model:{model_id}:api] '{os.path.basename(json_path)}' already exists") 98 | if not "conversion_state" in model_api_j["character_model"]["latest_character_model_version"]: 99 | print( 100 | f"[model:{model_id}:api] warning: JSON response implies model preview does not exist, expecting 404") 101 | elif 
model_api_j["character_model"]["latest_character_model_version"]["conversion_state"]["current_state"] != "completed": 102 | print( 103 | f"[model:{model_id}:api] warning: JSON response implies model preview is not ready, expecting 404") 104 | enc_vrm = download_preview_model(model_id) 105 | if not enc_vrm: 106 | return 107 | decrypt_decompress_model( 108 | model_id, enc_vrm, f"{model_path_base}.{MODEL_FILE_EXT}") 109 | 110 | 111 | def download_user_from_vroid(user_id): 112 | user_api_url = f"{HOST}/api/users/{user_id}" 113 | user_api_r = requests.get(user_api_url, headers={ 114 | "User-Agent": USER_AGENT, "X-Api-Version": API_VERSION}) 115 | if not user_api_r.ok: 116 | print( 117 | f"[user:{user_id}:api] got bad response from vroid hub, user might not exist, {user_api_r.status_code}") 118 | return 119 | user_api_j = user_api_r.json() 120 | username = user_api_j["data"]["user"]["name"] 121 | user_base_path = os.path.join(args.directory, f"{username} ({user_id})") 122 | if not os.path.isdir(user_base_path): 123 | os.makedirs(user_base_path) 124 | json_path = f"{user_base_path}.info.json" 125 | if args.write_info_json: 126 | with open(json_path, "w") as json_file: 127 | json_file.write(json.dumps(user_api_j["data"])) 128 | print(f"[user:{user_id}:api] wrote '{os.path.basename(json_path)}'") 129 | model_ids = get_user_model_ids(user_id) 130 | for model_id in model_ids: 131 | download_model_from_vroid(model_id, user_base_path) 132 | 133 | 134 | parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) 135 | parser.add_argument("-d", "--directory", type=str, 136 | help="save directory (defaults to current)", default=os.getcwd()) 137 | parser.add_argument("--write-info-json", action="store_true", 138 | help="write user/model json information for urls") 139 | parser.add_argument("vrms", metavar="vroid links/vrm files", nargs="*", 140 | help="vroid hub links or encrypted vrm files i.e.\nhttps://hub.vroid.com/en/users/49620\nhttps://hub.vroid.com/en/characters/6819070713126783571/models/9038381612772945358\n2520951134072570694.vrm") 141 | args = parser.parse_args() 142 | 143 | if not os.path.isdir(args.directory): 144 | os.makedirs(args.directory) 145 | 146 | for vrm in args.vrms: 147 | vroid_usr_m = re.search(VROID_USER, vrm) 148 | model_m = re.search(VROID_MODEL, vrm) 149 | if vroid_usr_m: 150 | user_id = vroid_usr_m.group("user_id") 151 | download_user_from_vroid(user_id) 152 | elif model_m: 153 | model_id = model_m.group("model_id") 154 | download_model_from_vroid(model_id) 155 | else: 156 | if not os.path.isfile(vrm): 157 | print(f"could not find file at path '{vrm}'") 158 | continue 159 | with open(vrm, "rb") as vrm_file: 160 | enc_vrm = io.BytesIO(vrm_file.read()) 161 | model_filename = os.path.join( 162 | args.directory, f"{vrm}.decrypted.{MODEL_FILE_EXT}") 163 | decrypt_decompress_model(enc_vrm, model_filename) 164 | --------------------------------------------------------------------------------