├── README.md
├── caa-downloader
│   ├── README.md
│   ├── caa-downloader.py
│   └── requirements.txt
├── porn3dx-downloader
│   ├── README.md
│   ├── hydrus_sidecar_routers.png
│   ├── porn3dx-downloader.py
│   └── requirements.txt
├── twscrape-wrapper
│   ├── .gitignore
│   ├── README.md
│   └── twscrape-wrapper.py
├── vrchat-asset-downloader
│   ├── .gitignore
│   ├── README.md
│   ├── requirements.txt
│   └── vrchat-asset-downloader.py
└── vroid-hub-downloader
    ├── README.md
    ├── requirements.txt
    └── vroid-hub-downloader.py

/README.md:
--------------------------------------------------------------------------------
1 | # misc-scripts
2 | 
3 | A variety of Python scripts I've written, mostly for downloading, scraping and otherwise ripping things off the web which require a little effort or would be convenient to automate.
4 | 
5 | Each directory has a README describing the script and its usage to assist anyone else trying to use it.
6 | 
--------------------------------------------------------------------------------
/caa-downloader/README.md:
--------------------------------------------------------------------------------
1 | # caa-downloader
2 | Download art for a [MusicBrainz](https://musicbrainz.org/) release from the [Cover Art Archive](https://coverartarchive.org/).
3 | 
4 | ### Usage
5 | ```sh
6 | usage: caa-downloader.py [-h] [-d DIRECTORY] [-s SIZE] [RELEASES ...]
7 | 
8 | positional arguments:
9 | RELEASES releases to download i.e.
10 | 3791c620-7ba4-3db0-bda8-2b060f31a7b8
11 | https://musicbrainz.org/release/3791c620-7ba4-3db0-bda8-2b060f31a7b8
12 | beta.musicbrainz.org/release/3791c620-7ba4-3db0-bda8-2b060f31a7b8/discids
13 | 
14 | options:
15 | -h, --help show this help message and exit
16 | -d DIRECTORY, --directory DIRECTORY
17 | save directory (defaults to current)
18 | -s SIZE, --size SIZE image download size (250, 500, 1200, original)
19 | ```
20 | 
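A typical invocation (using the release ID from the usage text above; the `-s` and `-d` values are only illustrative) might look like:

```sh
# grab the 500px thumbnails for one release into ./covers
./caa-downloader.py -s 500 -d covers https://musicbrainz.org/release/3791c620-7ba4-3db0-bda8-2b060f31a7b8
```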
--------------------------------------------------------------------------------
/caa-downloader/caa-downloader.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 | import json
4 | import os
5 | import re
6 | import requests
7 | from requests_toolbelt import sessions
8 | import shutil
9 | from tqdm.auto import tqdm
10 | import urllib.parse as urlparse
11 | 
12 | VALID_THUMBS = [250, 500, 1200]
13 | API = "http://coverartarchive.org/release/"
14 | RELEASE_REGEX = r"^(?:(?:https?:\/\/)?(?:.*?\.)?musicbrainz\.org\/release\/)?(?P<release_id>[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})(?:\/.+)?$"
15 | BLOCK_SIZE = 1024
16 | 
17 | 
18 | def download_image(i_url, filename):
19 |     file_path = os.path.join(args.directory, filename)
20 |     file_r = requests.get(i_url, stream=True, allow_redirects=True)
21 |     if not file_r.ok:
22 |         print(f"could not get art for {filename}")
23 |     total_size = int(file_r.headers['content-length'])
24 |     term_width = shutil.get_terminal_size((80, 20))[0]
25 |     with tqdm.wrapattr(open(file_path, "wb"), "write",
26 |                        desc=f"{filename}", bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{rate_fmt}{postfix}]",
27 |                        ncols=int(term_width * 0.8), total=total_size,
28 |                        unit="B", unit_scale=True, unit_divisor=BLOCK_SIZE
29 |                        ) as file_h:
30 |         for chunk in file_r.iter_content(BLOCK_SIZE):
31 |             file_h.write(chunk)
32 |     return
33 | 
34 | 
35 | def download_covers(s, r_id):
36 |     print(f"requesting art for {r_id}")
37 |     covers_r = s.get(f"{r_id}")
38 |     if not covers_r.ok:
39 |         print(f"error: could not find art for release {r_id}")
40 |         return
41 |     covers_j = covers_r.json()
42 |     print(f"found {len(covers_j['images'])} images")
43 |     for image_i in range(0, len(covers_j["images"])):
44 |         image_j = covers_j["images"][image_i]
45 |         image_url = None
46 |         if args.size == "original":
47 |             image_url = image_j["image"]
48 |         else:
49 |             image_url = image_j["thumbnails"][args.size]
50 |         filename = f"{str(image_i)}_" + "+".join(image_j["types"])
51 |         if len(image_j["comment"]) > 0:
52 |             comment = image_j["comment"]
53 |             filename += f" ({comment})"
54 |         filename_clean = re.sub(r"[^\w\-_\. \[\]\(\)\+]", "_", filename)
55 |         filename_clean += "." + image_url.split(".")[-1]
56 |         download_image(image_url, filename_clean)
57 |     print(f"finished retrieving art for {r_id}")
58 | 
59 | 
60 | def main():
61 |     if len(args.release_list) == 0:
62 |         parser.print_usage()
63 |         return
64 |     elif args.size != "original" and args.size not in [str(t) for t in VALID_THUMBS]:  # args.size is a string, compare against string forms
65 |         print(f"invalid size specified ({args.size})")
66 |         return
67 |     elif not os.path.isdir(args.directory):
68 |         os.makedirs(args.directory)
69 |     api_session = sessions.BaseUrlSession(base_url=API)
70 |     for release in args.release_list:
71 |         release_id_m = re.search(RELEASE_REGEX, release)
72 |         if release_id_m:
73 |             release_id = release_id_m.group("release_id")
74 |             download_covers(api_session, release_id)
75 |         else:
76 |             print(f"could not parse release id from '{release}'")
77 | 
78 | 
79 | parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
80 | parser.add_argument("-d", "--directory", type=str,
81 |                     help="save directory (defaults to current)", default=os.getcwd())
82 | parser.add_argument("-s", "--size", type=str, default="original",
83 |                     help="image download size (250, 500, 1200, original)")
84 | parser.add_argument("release_list", metavar="RELEASES", nargs="*",
85 |                     help="releases to download i.e.\n3791c620-7ba4-3db0-bda8-2b060f31a7b8\nhttps://musicbrainz.org/release/3791c620-7ba4-3db0-bda8-2b060f31a7b8\nbeta.musicbrainz.org/release/3791c620-7ba4-3db0-bda8-2b060f31a7b8/discids")
86 | args = parser.parse_args()
87 | 
88 | if __name__ == "__main__":
89 |     main()
90 | 
--------------------------------------------------------------------------------
/caa-downloader/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
2 | requests_toolbelt
3 | tqdm
4 | 
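The index that `download_covers` walks is plain JSON from the Cover Art Archive; a minimal sketch of the same lookup, with field names taken from the accesses above and the 500px thumbnail picked as an example, could look like:

```py
import requests

release_id = "3791c620-7ba4-3db0-bda8-2b060f31a7b8"  # example release from the README
index = requests.get(f"https://coverartarchive.org/release/{release_id}").json()
for position, image in enumerate(index["images"]):
    # each entry lists its cover types, a full-size "image" url and pre-scaled "thumbnails" keyed by size
    print(position, "+".join(image["types"]), image["thumbnails"].get("500", image["image"]))
```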
--------------------------------------------------------------------------------
/porn3dx-downloader/README.md:
--------------------------------------------------------------------------------
1 | # porn3dx-downloader
2 | Downloads videos and images from posts on [Porn3dx](https://porn3dx.com). The available video formats are better than those you can download with an account, and any encrypted video playlists are detected and decrypted.
3 | 
4 | The `--write-sidecars` option can be used in conjunction with `hydrus_sidecar_routers.png` to import the downloaded files along with tags and other metadata into [hydrus](https://github.com/hydrusnetwork/hydrus).
5 | 
6 | ### Usage
7 | ```sh
8 | usage: porn3dx-downloader.py [-h] [-V] [-d DIRECTORY] [--write-sidecars] [-f FORMAT] [-F] [POSTS ...]
9 | 
10 | positional arguments:
11 | POSTS post url
12 | 
13 | options:
14 | -h, --help show this help message and exit
15 | -V, --verbose print debugging information
16 | -d DIRECTORY, --directory DIRECTORY
17 | save directory (defaults to current)
18 | --write-sidecars write sidecars for urls, timestamps, tags and description notes
19 | -f FORMAT, --format FORMAT
20 | video format, specified by NAME or the keyword 'best'
21 | -F, --list-formats list available formats
22 | ```
23 | 
--------------------------------------------------------------------------------
/porn3dx-downloader/hydrus_sidecar_routers.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CetaceanNation/misc-scripts/cf070eda5e5c00a510c5355d00daf658240ddc50/porn3dx-downloader/hydrus_sidecar_routers.png
--------------------------------------------------------------------------------
/porn3dx-downloader/porn3dx-downloader.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import argparse
3 | from bs4 import BeautifulSoup as bs
4 | import copy
5 | from Crypto.Cipher import AES
6 | from datetime import datetime
7 | from enum import Enum
8 | import json
9 | import js2py
10 | import os
11 | import re
12 | import requests
13 | from subprocess import Popen, PIPE, STDOUT
14 | import tempfile
15 | import urllib.parse as urlparse
16 | 
17 | HOST = "https://porn3dx.com/"
18 | EMBED_HOST = "https://iframe.mediadelivery.net/"
19 | DRM_ACTIVATION_HOST = "https://video-987.mediadelivery.net/"
20 | HEADERS = {
21 |     "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/114.0"}
22 | 
23 | POST_REGEX = r".*(?P<url>porn3dx\.com\/post\/(?P<id>\d+)).*"
24 | GUID_REGEX = r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}"
25 | PING_TOKEN_REGEX = r";setTimeout\(function\(\)\{var\ [a-z]=\"(?P<ping_token>" + GUID_REGEX + \
26 |     r")\";var\ [a-z]=(?P<secret_function>function\([a-z]+\)\{.*toLowerCase\(\)\});"
27 | PLAYLIST_REGEX = r"https?:\/\/iframe\.mediadelivery\.net\/" + GUID_REGEX + \
28 |     r"\/playlist.drm\?contextId=(?P<context_id>" + \
29 |     GUID_REGEX + r")&secret=" + GUID_REGEX
30 | 
31 | # 7/14/2022, 3:23:37 PM
32 | # new Date(Date.UTC(2022, 6, 14, 15, 23, 37)).toLocaleString()
33 | XTIME_REGEX = r".*UTC\(((\d+(?:,\ )?)+)\).*"
34 | TAG_CATEGORY_REGEX = r".*bg-(\w+)-100.*"
35 | TAG_CATEGORY_MAP = {
36 |     "yellow": "series",
37 |     "green": "character",
38 |     "purple": "medium",
39 |     "blue": ""
40 | }
41 | 
42 | 
43 | class LogLevel(Enum):
44 |     BASIC = 1
45 |     VERBOSE = 2
46 | 
47 | 
48 | def print_log(component, message, level=LogLevel.BASIC, overwrite=False):
49 |     if level == LogLevel.VERBOSE and not args.verbose:
50 |         return
51 |     if overwrite:
52 |         print(f"[{component}] {message}", end="\r")
53 |     else:
54 |         print(f"[{component}] {message}")
55 | 
56 | 
57 | def get_arguments():
58 |     parser.add_argument("-V", "--verbose", action="store_true",
59 |                         help="print debugging information")
60 |     parser.add_argument("-d", "--directory", type=str,
61 |                         help="save directory (defaults to current)", default=os.getcwd())
62 |     parser.add_argument("--write-sidecars", action="store_true",
63 |                         help="write sidecars for urls, timestamps, tags and description notes")
64 |     parser.add_argument("-f", "--format", type=str,
65 |                         help="video format, specified by NAME or the keyword \'best\'", default="best")
66 |     parser.add_argument("-F", "--list-formats",
67 |                         action="store_true", help="list available formats")
68 |     parser.add_argument("posts", metavar="POSTS",
nargs="*", help="post url") 69 | return parser.parse_args() 70 | 71 | # based on parts of https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/extractor/common.py 72 | 73 | 74 | def get_m3u8_info(session, playlist_url, referer_url): 75 | m3u8_info = [] 76 | print_log("get-m3u8-info", 77 | f"retrieving playlist from {playlist_url}", LogLevel.VERBOSE) 78 | m3u8_r = session.get(playlist_url, headers={"Referer": referer_url}) 79 | if not m3u8_r.ok: 80 | print_log("get-m3u8-info", 81 | f"failed to retrieve playlist from {playlist_url}") 82 | m3u8_text = m3u8_r.text 83 | media_details = None 84 | format_details = None 85 | for line in m3u8_text.splitlines(): 86 | if line.startswith("#EXT-X-STREAM-INF:"): 87 | # parse format details 88 | format_details = parse_m3u8_attributes(line) 89 | elif not line.startswith("#") and len(line.strip()) > 0: 90 | if format_details: 91 | if "RESOLUTION" in format_details: 92 | media_name = format_details["RESOLUTION"].split("x")[ 93 | 1] + "p" 94 | else: 95 | media_name = line.split("/")[0] 96 | m3u8_info += [{"location": line, "name": media_name, 97 | "bandwidth": format_details["BANDWIDTH"], "res": format_details["RESOLUTION"]}] 98 | return m3u8_info 99 | 100 | # https://github.com/ytdl-org/youtube-dl/blob/master/youtube_dl/utils.py#L5495 101 | 102 | 103 | def parse_m3u8_attributes(attrib): 104 | info = {} 105 | for (key, val) in re.findall(r'(?P[A-Z0-9-]+)=(?P"[^"]+"|[^",]+)(?:,|$)', attrib): 106 | if val.startswith("\""): 107 | val = val[1:-1] 108 | info[key] = val 109 | return info 110 | 111 | 112 | def print_formats(formats_list): 113 | print(f"{'NAME':<10} {'BANDWIDTH':<10} {'RESOLUTION':<10}") 114 | print(f"{'-' * 10} {'-' * 10} {'-' * 10}") 115 | for format_settings in formats_list: 116 | print(f"{format_settings['name']:<10} " + 117 | f"{format_settings['bandwidth']:<10} " + 118 | f"{format_settings['res']:<10} ") 119 | 120 | 121 | def write_frag(session, frag_url, frag_name, key_context): 122 | try: 123 | with open(frag_name, "wb") as frag_file: 124 | video_frag_r = session.get( 125 | frag_url, headers={"Origin": EMBED_HOST, "Referer": EMBED_HOST}) 126 | if not video_frag_r.ok: 127 | print_log( 128 | f"dl:{post_id}", f"failed to download video fragment '{frag_name}'") 129 | return False 130 | # Fragments are small, decrypt in memory then write to disk 131 | frag_bytes = key_context.decrypt(video_frag_r.content) 132 | frag_file.write(frag_bytes) 133 | return True 134 | except: 135 | print_log(f"dl:{post_id}", 136 | f"exception downloading video fragment '{frag_name}'") 137 | return False 138 | 139 | 140 | def download_stream(session, index, post_data, downloading_format, drm_session, referer_url): 141 | post_id = post_data["id"] 142 | file_name = post_data["basefilename"] 143 | res_name = downloading_format["name"][:-1] 144 | context_id = drm_session["id"] 145 | refresh_token = drm_session["token"] 146 | refresh_function = drm_session["function"] 147 | output_file_name = f"{file_name}.{index}.mp4" 148 | output_file_path = os.path.abspath( 149 | os.path.join(args.directory, output_file_name)) 150 | if os.path.isfile(output_file_path): 151 | print_log(f"dl:{post_id}", "file exists, skipping download") 152 | return output_file_path, post_data 153 | playlist_r = session.get(downloading_format["location"], headers={ 154 | "Referer": referer_url}) 155 | if not playlist_r.ok: 156 | print_log(f"dl:{post_id}", "failed to retrieve post playlist") 157 | return 158 | playlist_text = playlist_r.text 159 | key_context = None 160 | key_count = 0 161 | 
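    # The variant playlist below is walked line by line: every #EXT-X-KEY entry
    # yields a key URI plus IV and opens a fresh AES-CBC context, each media URI
    # is fetched and decrypted with the current context, and a DRM "ping" is sent
    # for every new key so the mediadelivery session stays valid; the decrypted
    # .ts fragments are then concatenated into a single .mp4 with ffmpeg.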
frag_files = [] 162 | for line in playlist_text.splitlines(): 163 | if line.startswith("#EXT-X-KEY:"): 164 | # New key for decrypting fragments 165 | key_attr = parse_m3u8_attributes(line) 166 | key_r = session.get(key_attr["URI"], headers={ 167 | "Origin": EMBED_HOST, "Referer": EMBED_HOST}) 168 | if not key_r.ok: 169 | print_log(f"key-context:{post_id}", 170 | "failed to retrieve key for segments") 171 | continue 172 | key_bytes = key_r.content 173 | iv_bytes = bytearray.fromhex(key_attr["IV"][2:]) 174 | print_log( 175 | f"key-context:{post_id}", f"new key context [IV: {iv_bytes.hex()}, K: {key_bytes.hex()}]", LogLevel.VERBOSE) 176 | key_context = AES.new(key_bytes, AES.MODE_CBC, iv_bytes) 177 | key_count += 1 178 | # Refresh DRM context 179 | time_in_video = float(key_count) 180 | refresh_string = f"{refresh_token}_{context_id}_{time_in_video}_false_{res_name}" 181 | refresh_hash = refresh_function(refresh_string) 182 | refresh_url = DRM_ACTIVATION_HOST + \ 183 | f".drm/{context_id}/ping?hash={refresh_hash}&time={time_in_video}&paused=false&resolution={res_name}" 184 | print_log( 185 | f"drm:{post_id}", f"refreshing session; {refresh_url}", LogLevel.VERBOSE) 186 | refresh_r = session.get(refresh_url, headers={ 187 | "Origin": EMBED_HOST, "Referer": EMBED_HOST}) 188 | if not refresh_r.ok: 189 | print_log( 190 | f"drm:{post_id}", "failed to refresh the drm session, will continue but likely to fail if the video is long") 191 | elif not line.startswith("#"): 192 | # Write the fragment 193 | frag_file_name = os.path.abspath(os.path.join( 194 | args.directory, f"{file_name}.{index}.{len(frag_files)}.ts")) 195 | if write_frag(session, line, frag_file_name, key_context): 196 | frag_files.append(frag_file_name) 197 | # Use ffmpeg to concatenate all the fragments into a single output file 198 | print_log(f"mpeg-convert:{post_id}", f"merging fragments") 199 | with tempfile.NamedTemporaryFile(mode="w", suffix=".txt") as frag_list: 200 | for frag_name in frag_files: 201 | frag_list.write(f"file '{frag_name}'\n") 202 | frag_list.flush() 203 | ffmpeg_list = ["ffmpeg", "-hide_banner", "-y", "-f", "concat", 204 | "-safe", "0", "-i", frag_list.name, "-c", "copy", output_file_path] 205 | print_log("ffmpeg", f"args: {ffmpeg_list}", LogLevel.VERBOSE) 206 | try: 207 | ffmpeg_process = Popen(ffmpeg_list, stdout=PIPE, stderr=PIPE) 208 | stdout, stderr = ffmpeg_process.communicate() 209 | except Exception: 210 | print_log(f"mpeg-convert:{post_id}", "failure in executing ffmpeg") 211 | print_log( 212 | "ffmpeg", f"stdout: {str(stdout)}\n\nstderr: {str(stderr)}", LogLevel.VERBOSE) 213 | return 214 | frag_list.close() 215 | # Cleanup only if file is found 216 | if os.path.isfile(output_file_path): 217 | for frag_name in frag_files: 218 | os.remove(frag_name) 219 | return output_file_path, post_data 220 | print_log(f"mpeg-convert:{post_id}", "could not find output file") 221 | return 222 | 223 | 224 | def download_video(session, index, post_data, content_soup): 225 | post_id = post_data["id"] 226 | iframe_url = content_soup.find("iframe")["src"] 227 | if not iframe_url: 228 | print_log(f"info:{post_id}", "could not find embed url in post page") 229 | return 230 | # Download embed to get formats playlist url 231 | iframe_r = session.get(iframe_url, headers={"Referer": HOST}) 232 | if not iframe_r.ok: 233 | print_log(f"info:{post_id}", "failed to retrieve video embed") 234 | return 235 | iframe_soup = bs(iframe_r.content, "html.parser") 236 | iframe_script = iframe_soup.find_all("script")[-1].string 237 | # Extract 
formats playlist url from embed script 238 | playlist_m = re.search(PLAYLIST_REGEX, iframe_script) 239 | if not playlist_m: 240 | print_log(f"info:{post_id}", 241 | "could not find format playlist url in embed") 242 | return 243 | playlist_url = playlist_m.group(0) 244 | context_id = playlist_m.group("context_id") 245 | # Get available formats 246 | formats_list = get_m3u8_info(session, playlist_url, iframe_url) 247 | if args.list_formats: 248 | print_log(f"info:{post_id}", "available formats:") 249 | print_formats(formats_list) 250 | return 251 | # Activate DRM session 252 | drm_session = {} 253 | activation_url = urlparse.urljoin( 254 | DRM_ACTIVATION_HOST, f".drm/{context_id}/activate") 255 | if not session.get(activation_url, headers={"Origin": EMBED_HOST, "Referer": EMBED_HOST}).ok: 256 | print_log( 257 | f"drm:{post_id}", "failed to activate drm context, download will not proceed") 258 | return 259 | print_log(f"drm:{post_id}", 260 | f"activated drm context {context_id}", LogLevel.VERBOSE) 261 | drm_session["id"] = context_id 262 | # Extract refresh token from embed script 263 | token_m = re.search(PING_TOKEN_REGEX, iframe_script) 264 | if not token_m: 265 | print_log(f"drm:{post_id}", 266 | "could not find ping refresh token in embed") 267 | return 268 | drm_session["token"] = token_m.group("ping_token") 269 | secret_script = token_m.group("secret_function") 270 | drm_session["function"] = js2py.eval_js(secret_script) 271 | # Select preferred format 272 | downloading_format = None 273 | best_bitrate = 0 274 | for format_settings in formats_list: 275 | if args.format == "best": 276 | if int(format_settings["bandwidth"]) > best_bitrate: 277 | downloading_format = format_settings 278 | best_bitrate = int(format_settings["bandwidth"]) 279 | elif args.format == format_settings["name"]: 280 | downloading_format = format_settings 281 | break 282 | if not downloading_format: 283 | print_log( 284 | f"info:{post_id}", f"the specified format could not be found: {args.format}") 285 | return 286 | downloading_format["location"] = urlparse.urljoin( 287 | playlist_url, format_settings["location"]) 288 | format_name = downloading_format["name"] 289 | print_log(f"info:{post_id}", f"downloading format {format_name}") 290 | return download_stream(session, index, post_data, downloading_format, drm_session, iframe_url) 291 | 292 | 293 | def download_image(session, index, post_data, content_soup): 294 | post_id = post_data["id"] 295 | file_name = post_data["basefilename"] 296 | image_url = content_soup.find("picture").div.img["src"].strip() 297 | post_data["urls"].append(image_url) 298 | image_ext = os.path.splitext(urlparse.urlparse(image_url).path)[1] 299 | output_file_name = f"{file_name}.{index}{image_ext}" 300 | output_file_path = os.path.join(args.directory, output_file_name) 301 | if os.path.isfile(output_file_path): 302 | print_log(f"dl:{post_id}", "file exists, skipping download") 303 | return output_file_path, post_data 304 | with open(output_file_path, "wb") as image_file: 305 | image_r = session.get(image_url) 306 | if not image_r.ok: 307 | print_log(f"dl:{post_id}", "failed to retrieve image content") 308 | return 309 | image_file.write(image_r.content) 310 | return output_file_path, post_data 311 | 312 | 313 | def get_content_caption(post_data, content_soup): 314 | caption_divs = content_soup.find_all("div", recursive=False)[ 315 | 1].find_all("div", recursive=False) 316 | if len(caption_divs) > 1: 317 | caption_text = caption_divs[1].string.strip() 318 | 
post_data["description"].append("porn3dx caption: " + caption_text) 319 | 320 | 321 | def write_sidecar(path, data): 322 | if path and data: 323 | if len(data["urls"]) > 0: 324 | with open(f"{path}.urls.txt", "w") as urls_sidecar: 325 | for url in data["urls"]: 326 | urls_sidecar.write(f"{url}\n") 327 | if "timestamp" in data and data["timestamp"]: 328 | with open(f"{path}.time.txt", "w") as ts_sidecar: 329 | ts_sidecar.write(str(data["timestamp"])) 330 | if "tags" in data and len(data["tags"]) > 0: 331 | with open(f"{path}.tags.json", "w") as tags_sidecar: 332 | json.dump(data["tags"], tags_sidecar, 333 | ensure_ascii=False, indent=4) 334 | if "description" in data and data["description"]: 335 | with open(f"{path}.note.json", "w") as note_sidecar: 336 | json.dump(data["description"], note_sidecar, 337 | ensure_ascii=False, indent=4) 338 | 339 | 340 | def get_post_data(post_id, soup): 341 | post_data = {} 342 | post_data["id"] = post_id 343 | canonical_url = soup.find("link", rel="canonical")["href"] 344 | post_data["urls"] = [canonical_url] 345 | post_data["basefilename"] = canonical_url.split("/")[-1] 346 | # Info, Tags, Discussion, More 347 | post_meta_divs = soup.find( 348 | id="aside-scroll").div.div.find_all("div", recursive=False) 349 | # User, Like & Share, Description, Stats, Share 350 | info_div = post_meta_divs[0].find_all("div", recursive=False) 351 | tags = [] 352 | post_user_block = info_div[0] 353 | post_desc_block = info_div[2] 354 | tags.append("title:" + post_desc_block.find("h1").string.strip()) 355 | tags.append("creator:" + post_user_block.find_all("a") 356 | [-1].string.strip()[1:]) 357 | desc_and_ts = post_desc_block.find_all("div", recursive=False) 358 | ts_index = 0 359 | post_data["description"] = [] 360 | if len(desc_and_ts) > 1: 361 | ts_index = 1 362 | for description_link in desc_and_ts[0].find_all("a"): 363 | description_link.string = description_link["href"] 364 | post_data["description"].append( 365 | "porn3dx description: " + desc_and_ts[0].get_text().strip()) 366 | xtime_text = desc_and_ts[ts_index].span["x-text"] 367 | date_text_m = re.search(XTIME_REGEX, xtime_text) 368 | if not date_text_m: 369 | print_log(f"info:{post_id}", f"failed parsing date '{xtime_text}'") 370 | return None 371 | date_values = list(map(int, date_text_m.group(1).split(", "))) 372 | post_data["timestamp"] = int(datetime(date_values[0], (date_values[1] + 1) % 373 | 11, date_values[2], date_values[3], date_values[4], date_values[5]).timestamp()) 374 | tag_block = post_meta_divs[1].find_all("div", recursive=False)[1] 375 | for tag_link in tag_block.find_all("a", recursive=False): 376 | tag_category = "" 377 | tag_text = tag_link.string.strip() 378 | for tag_class in tag_link["class"]: 379 | tag_category_m = re.search(TAG_CATEGORY_REGEX, tag_class) 380 | if not tag_category_m: 381 | continue 382 | category_color = tag_category_m.group(1) 383 | if category_color in TAG_CATEGORY_MAP: 384 | tag_category = TAG_CATEGORY_MAP[tag_category_m.group(1)] 385 | break 386 | else: 387 | print_log( 388 | f"info:{post_id}", f"could not map tag category for tag '{tag_text}'") 389 | print_log( 390 | f"info:{post_id}", f"tag category for tag '{tag_text}' resolves to color '{category_color}'") 391 | break 392 | tag_category = tag_category + ":" if tag_category else "" 393 | tags.append(tag_category + tag_text) 394 | post_data["tags"] = tags 395 | print_log(f"info:{post_id}", f"post data: {post_data}", LogLevel.VERBOSE) 396 | return post_data 397 | 398 | 399 | def download_post(session, post_id, 
post_url):
400 |     # Download page to extract iframe embed url
401 |     print_log(f"info:{post_id}", "retrieving post page")
402 |     post_page_r = session.get(post_url)
403 |     if not post_page_r.ok:
404 |         print_log(f"info:{post_id}", "failed to retrieve post page")
405 |         return
406 |     page_soup = bs(post_page_r.content, "html.parser")
407 |     post_data = get_post_data(post_id, page_soup)
408 |     if not post_data:
409 |         print_log(f"info:{post_id}", "failed parsing post data")
410 |         return
411 |     post_contents = page_soup.find("main", id="postView").find_all("div", recursive=False)[
412 |         1].find("div", recursive=False).find_all("div", recursive=False)
413 |     content_index = 0
414 |     for content in post_contents:
415 |         if content.find("iframe"):
416 |             print_log(f"info:{post_id}", "getting video")
417 |             content_result = download_video(
418 |                 session, content_index, copy.deepcopy(post_data), content)
419 |         elif content.find("picture"):
420 |             print_log(f"info:{post_id}", "getting image")
421 |             content_result = download_image(
422 |                 session, content_index, copy.deepcopy(post_data), content)
423 |         if content_result and args.write_sidecars:
424 |             content_path, content_post_data = content_result
425 |             get_content_caption(content_post_data, content)
426 |             write_sidecar(content_path, content_post_data)
427 |         content_index += 1
428 | 
429 | 
430 | def main():
431 |     if len(args.posts) == 0:
432 |         parser.print_usage()
433 |         return
434 |     elif not os.path.isdir(args.directory):
435 |         os.makedirs(args.directory)
436 |     s = requests.Session()
437 |     s.headers = HEADERS
438 |     for post in args.posts:
439 |         url_m = re.search(POST_REGEX, post)
440 |         if url_m:
441 |             post_url = "https://" + url_m.group("url")
442 |             download_post(s, url_m.group("id"), post_url)
443 | 
444 | 
445 | parser = argparse.ArgumentParser()
446 | args = get_arguments()
447 | 
448 | if __name__ == "__main__":
449 |     main()
450 | 
--------------------------------------------------------------------------------
/porn3dx-downloader/requirements.txt:
--------------------------------------------------------------------------------
1 | bs4
2 | datetime
3 | js2py
4 | pycryptodome
5 | requests
6 | 
--------------------------------------------------------------------------------
/twscrape-wrapper/.gitignore:
--------------------------------------------------------------------------------
1 | accounts*
2 | 
3 | 
--------------------------------------------------------------------------------
/twscrape-wrapper/README.md:
--------------------------------------------------------------------------------
1 | # twscrape-wrapper
2 | Scrapes tweets into jsonl format. This script makes use of [twscrape](https://github.com/vladkens/twscrape) to replicate the functional output of
3 | ```sh
4 | snscrape --jsonl twitter-user >> file_name.tweets.json
5 | ```
6 | from before Twitter became worse for scraping (among other things).
7 | 
8 | Before using, you must follow the instructions in the [README for twscrape](https://github.com/vladkens/twscrape#add-accounts) to add at least one account to an `accounts.db` file in the same directory as the script. Additional functionality, such as sorting saved files, deduplicating tweets in a file, and automatically identifying the last saved tweet so as to limit search queries, was added for convenience.
9 | 
10 | *Unfortunately, retweets can only be retrieved from an initial profile scrape.
[Advanced search](https://github.com/igorbrigadir/twitter-advanced-search) queries (used with the `save-past` operation) are unable to retrieve retweets properly with modern twitter beyond ~10 days from the present for undocumented reasons. See [#2](https://github.com/CetaceanNation/misc-scripts/issues/2).* 11 | 12 | ### Usage 13 | ```sh 14 | usage: twscrape-wrapper.py [-h] [-n] {save,save-past,sort,dedupe} filename [handle] 15 | 16 | positional arguments: 17 | {save,save-past,sort,dedupe} 18 | operation to perform. 'save' downloads tweets to a file ('save-past' works in reverse), 'sort' re-orders tweets in a file, 'dedupe' removes entries with duplicate ids. 19 | filename file prefix to write tweets to (will be appended with .tweets.json) 20 | handle handle of the account to download from 21 | 22 | options: 23 | -h, --help show this help message and exit 24 | -n prompt for overwriting the existing tweet file 25 | --download-media, -m iterate through scraped/sorted tweets and download all media 26 | ``` 27 | -------------------------------------------------------------------------------- /twscrape-wrapper/twscrape-wrapper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import asyncio 4 | from collections import OrderedDict 5 | from contextlib import aclosing 6 | import datetime 7 | from http import cookiejar 8 | import json 9 | import os 10 | import requests 11 | import sys 12 | from time import sleep 13 | from twscrape import API, AccountsPool, gather 14 | from twscrape.logger import set_log_level 15 | 16 | OPERATIONS = ["save", "save-past", "sort", "dedupe", "cookies-to-string"] 17 | KEY_ORDER_TWEET = ["_type", "url", "date", "rawContent", "renderedContent", "id", "user", "replyCount", "retweetCount", "likeCount", "quoteCount", "conversationId", "lang", "source", "sourceUrl", "sourceLabel", "links", "media", "retweetedTweet", 18 | "quotedTweet", "inReplyToTweetId", "inReplyToUser", "mentionedUsers", "coordinates", "place", "hashtags", "cashtags", "card", "viewCount", "vibe", "content", "outlinks", "outlinksss", "tcooutlinks", "tcooutlinksss", "username"] 19 | KEY_ORDER_USER = ["_type", "username", "id", "displayname", "rawDescription", "renderedDescription", "descriptionLinks", "verified", "created", "followersCount", "friendsCount", "statusesCount", 20 | "favouritesCount", "listedCount", "mediaCount", "location", "protected", "link", "profileImageUrl", "profileBannerUrl", "label", "description", "descriptionUrls", "linkTcourl", "linkUrl", "url"] 21 | BACKWARDS_INTERVAL = 120 22 | MEDIA_HEADERS = { 23 | "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:128.0) Gecko/20100101 Firefox/128.0"} 24 | MEDIA_IMAGE_NAMES = ["orig", "large", "medium", "900x900", "small", "thumb"] 25 | MEDIA_SLEEP_BETWEEN_DOWNLOADS = 2 26 | 27 | 28 | def datetime_handler(x): 29 | if isinstance(x, datetime.datetime): 30 | return x.isoformat() 31 | raise TypeError("Unknown type") 32 | 33 | 34 | def datetime_to_search_string(dt): 35 | return dt.isoformat().replace("T", "_") + "_UTC" 36 | 37 | 38 | async def get_account_id(api, handle): 39 | try: 40 | twitter_user = await api.user_by_login(handle) 41 | return twitter_user.id 42 | except Exception: 43 | print("could not lookup user with that handle") 44 | sys.exit(1) 45 | 46 | 47 | def get_account_from_file(filepath): 48 | if os.path.isfile(filepath): 49 | try: 50 | with open(filepath, "r") as tweetsfile: 51 | last_tweet_json = tweetsfile.readline() 52 | last_tweet = 
json.loads(last_tweet_json) 53 | return last_tweet["user"]["username"] 54 | except Exception: 55 | print(f"failed reading the existing tweets file '{filepath}'") 56 | sys.exit(1) 57 | print(f"could not find the specified file '{filepath}'") 58 | sys.exit(1) 59 | 60 | 61 | def get_saved_tweets(filepath): 62 | saved_tweets = [] 63 | saved_tweet_jsons = [] 64 | print(f"looking for tweets in {filepath}") 65 | if os.path.isfile(filepath): 66 | try: 67 | with open(filepath, "r") as tweetsfile: 68 | saved_tweets = tweetsfile.readlines() 69 | if len(saved_tweets) > 0: 70 | for tweet_json in saved_tweets: 71 | saved_tweet_jsons.append(json.loads(tweet_json)) 72 | except Exception: 73 | print("failed reading the existing tweets file") 74 | sys.exit(1) 75 | print(f"found {len(saved_tweet_jsons)} saved tweets") 76 | return saved_tweet_jsons 77 | 78 | 79 | def dedupe_tweets(tweets): 80 | stored_ids = [] 81 | filtered_tweets = [] 82 | for tweet in tweets: 83 | if not tweet["id"] in stored_ids: 84 | stored_ids.append(tweet["id"]) 85 | filtered_tweets.append(tweet) 86 | return filtered_tweets 87 | 88 | 89 | def get_last_tweet(tweets, since=True): 90 | if len(tweets) > 0: 91 | last_tweet_date = datetime.datetime.fromisoformat( 92 | tweets[0]["date"]).timestamp() 93 | else: 94 | last_tweet_date = -1 95 | last_tweet_id = None 96 | for tweet in tweets: 97 | current_tweet_date = datetime.datetime.fromisoformat( 98 | tweet["date"]).timestamp() 99 | if (since and current_tweet_date > last_tweet_date) or (not since and current_tweet_date < last_tweet_date): 100 | last_tweet_date = current_tweet_date 101 | last_tweet_id = tweet["id"] 102 | return last_tweet_date, last_tweet_id 103 | 104 | 105 | async def gather_initial_tweets(api, account_handle): 106 | tweets = [] 107 | account_id = await get_account_id(api, account_handle) 108 | user_tweets = await gather(api.user_tweets(account_id)) 109 | for tweet in user_tweets: 110 | if tweet.user.username == account_handle: 111 | tweets.append(json.loads(json.dumps( 112 | tweet.dict(), default=datetime_handler))) 113 | return tweets 114 | 115 | 116 | async def gather_tweets(api, account_handle, last_timestamp): 117 | tweets = [] 118 | # Subtract a day to try ensuring overlap, prevents <24 hour difference issues 119 | last_datetime = datetime.datetime.fromtimestamp(last_timestamp) 120 | last_datetime -= datetime.timedelta(days=1) 121 | last_datetime_string = datetime_to_search_string(last_datetime) 122 | query_string = f"from:{account_handle} since:{last_datetime_string}" 123 | print(f"query: '{query_string}'") 124 | user_tweets = await gather(api.search(query_string)) 125 | for tweet in user_tweets: 126 | if tweet.user.username == account_handle: 127 | tweets.append(json.loads(json.dumps( 128 | tweet.dict(), default=datetime_handler))) 129 | return tweets 130 | 131 | 132 | async def gather_tweets_backwards(api, account_handle, latest_timestamp): 133 | tweets = [] 134 | # Add a day to try ensuring overlap, prevents <24 hour difference issues 135 | latest_datetime = datetime.datetime.fromtimestamp(latest_timestamp) 136 | latest_datetime += datetime.timedelta(days=1) 137 | while True: 138 | latest_datetime_string = datetime_to_search_string(latest_datetime) 139 | back_one_month = latest_datetime - \ 140 | datetime.timedelta(days=BACKWARDS_INTERVAL) 141 | back_one_month_string = datetime_to_search_string(back_one_month) 142 | query_string = f"from:{account_handle} since:{back_one_month_string} until:{latest_datetime_string}" 143 | print(f"query: '{query_string}'") 144 | 
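        # Each pass requests a since/until window BACKWARDS_INTERVAL (120) days wide
        # via twscrape's search, then the window slides further into the past; an
        # empty result is treated as the start of the account's history and ends
        # the loop.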
user_tweets = await gather(api.search(query_string)) 145 | if len(user_tweets) == 0: 146 | break 147 | for tweet in user_tweets: 148 | if tweet.user.username == account_handle: 149 | tweets.append(json.loads(json.dumps( 150 | tweet.dict(), default=datetime_handler))) 151 | latest_datetime = back_one_month + datetime.timedelta(days=1) 152 | return tweets 153 | 154 | 155 | def sort_tweets(tweets): 156 | return list(reversed(sorted(tweets, key=lambda t: datetime.datetime.fromisoformat(t["date"]).timestamp()))) 157 | 158 | 159 | def order_tweet_dict(tweet): 160 | ordered_tweet = OrderedDict((key, tweet.get(key)) 161 | for key in KEY_ORDER_TWEET) 162 | ordered_tweet["user"] = OrderedDict( 163 | (key, ordered_tweet["user"].get(key)) for key in KEY_ORDER_USER) 164 | return ordered_tweet 165 | 166 | 167 | def write_tweets(tweets, tmp_filepath, filepath, overwrite): 168 | with open(tmp_filepath, "w") as tmp_tweetsfile: 169 | for tweet in tweets: 170 | tweet_json = json.dumps(order_tweet_dict( 171 | tweet), default=datetime_handler) 172 | tmp_tweetsfile.write(f"{tweet_json}\n") 173 | overwrite = overwrite or input("overwrite existing file? (y/N): ") == "y" 174 | if overwrite: 175 | os.replace(tmp_filepath, filepath) 176 | 177 | 178 | def get_tweet_media(base_filepath, tweets): 179 | media_directory = base_filepath + ".media" 180 | if not os.path.isdir(media_directory): 181 | try: 182 | os.makedirs(media_directory) 183 | except: 184 | try: 185 | media_directory = os.path.join(os.getcwd(), media_directory) 186 | os.makedirs(media_directory) 187 | except: 188 | print( 189 | "could not find or make the media directory, skipping media downloads") 190 | return 191 | print(f"downloading media for {len(tweets)} tweets") 192 | media_count = 0 193 | for tweet in tweets: 194 | if "media" in tweet and tweet["media"]: 195 | tweet_url = tweet["url"] 196 | tweet_media = tweet["media"] 197 | media_index = 0 198 | tweet_id = tweet["id"] 199 | if "photos" in tweet_media and tweet_media["photos"]: 200 | images = tweet_media["photos"] 201 | for image in images: 202 | image_url = image["url"] 203 | image_url_path, image_ext = os.path.splitext( 204 | image_url.split("?")[0]) 205 | image_filename = f"{tweet_id}_{media_index}{image_ext}" 206 | image_fmt = image_ext[1:] 207 | for img_name in MEDIA_IMAGE_NAMES: 208 | image_url = f"{image_url_path}?format={image_fmt}&name={img_name}" 209 | image_res = download_media_file( 210 | tweet_url, image_url, os.path.join(media_directory, image_filename)) 211 | if image_res > 1: 212 | media_count += 1 213 | if image_res > 0: 214 | media_index += 1 215 | break 216 | if "videos" in tweet_media and tweet_media["videos"]: 217 | videos = tweet_media["videos"] 218 | for video in videos: 219 | video_variants_sorted = sorted( 220 | video["variants"], key=lambda v: v["bitrate"], reverse=True) 221 | for variant in video_variants_sorted: 222 | video_url = variant["url"] 223 | _, video_ext = os.path.splitext( 224 | video_url.split("?")[0]) 225 | video_filename = f"{tweet_id}_{media_index}{video_ext}" 226 | video_res = download_media_file( 227 | tweet_url, video_url, os.path.join(media_directory, video_filename)) 228 | if video_res > 1: 229 | media_count += 1 230 | if video_res > 0: 231 | media_index += 1 232 | break 233 | if "animated" in tweet_media and tweet_media["animated"]: 234 | animations = tweet_media["animated"] 235 | for animation in animations: 236 | animation_url = animation["videoUrl"] 237 | _, animation_ext = os.path.splitext( 238 | animation_url.split("?")[0]) 239 | 
animation_filename = f"{tweet_id}_{media_index}{animation_ext}" 240 | animation_res = download_media_file( 241 | tweet_url, animation_url, os.path.join(media_directory, animation_filename)) 242 | if animation_res > 1: 243 | media_count += 1 244 | if animation_res > 0: 245 | media_index += 1 246 | break 247 | print(f"downloaded {media_count} new media files") 248 | 249 | 250 | def download_media_file(tweet_url, media_url, filename): 251 | if os.path.isfile(filename): 252 | return 1 253 | try: 254 | media_r = requests.get(media_url, headers=MEDIA_HEADERS) 255 | if not media_r.ok: 256 | print(f"got bad response for media '{media_url}' from '{tweet_url}'") 257 | return 0 258 | with open(filename, "wb") as media_file: 259 | media_file.write(media_r.content) 260 | media_file.flush() 261 | sleep(MEDIA_SLEEP_BETWEEN_DOWNLOADS) 262 | except: 263 | print(f"got exception for media '{media_url}' from '{tweet_url}'") 264 | return 0 265 | return 2 if os.path.isfile(filename) else 0 266 | 267 | 268 | async def main(): 269 | parser = argparse.ArgumentParser( 270 | formatter_class=argparse.RawTextHelpFormatter) 271 | parser.add_argument("mode", choices=OPERATIONS, 272 | help="operation to perform. 'save' downloads tweets to a file ('save-past' works in reverse), 'sort' re-orders tweets in a file, 'dedupe' removes entries with duplicate ids. 'cookies-to-string' prints a Netscape HTTP Cookie File in a format compatible for adding accounts.") 273 | parser.add_argument( 274 | "filename", help="file prefix to write tweets to (will be appended with .tweets.json)") 275 | parser.add_argument("handle", nargs="?", 276 | help="handle of the account to download from") 277 | parser.add_argument("-n", action="store_false", 278 | help="prompt for overwriting the existing tweet file") 279 | parser.add_argument("--download-media", "-m", action="store_true", 280 | help="iterate through scraped/sorted tweets and download all media") 281 | args = parser.parse_args() 282 | if args.mode == OPERATIONS[4]: 283 | filepath = args.filename 284 | if not os.path.isfile(filepath): 285 | filepath = os.path.join(os.getcwd(), filepath) 286 | cookie_jar = cookiejar.MozillaCookieJar(filepath) 287 | try: 288 | cookie_jar.load() 289 | cookies_string = "" 290 | for cookie in cookie_jar: 291 | cookies_string += f"{cookie.name}={cookie.value}; " 292 | print(cookies_string) 293 | except: 294 | print("cookies could not be loaded, file format might not be valid") 295 | return 296 | base_filepath = args.filename.replace(".tweets.json", "") 297 | tmp_filepath = base_filepath + ".tmp.tweets.json" 298 | filepath = base_filepath + ".tweets.json" 299 | if not os.path.isfile(filepath): 300 | filepath = os.path.join(os.getcwd(), filepath) 301 | if args.mode == OPERATIONS[0] or args.mode == OPERATIONS[1]: 302 | since = args.mode == OPERATIONS[0] 303 | script_path = os.path.dirname(os.path.realpath(__file__)) 304 | api = API(AccountsPool(script_path + "/accounts.db")) 305 | account_handle = args.handle if args.handle else get_account_from_file( 306 | filepath) 307 | if not account_handle: 308 | print("could not get find handle in the provided file") 309 | sys.exit(1) 310 | print(f"getting tweets from account {account_handle}") 311 | saved_tweets = get_saved_tweets(filepath) 312 | last_saved_tweet_date, last_saved_tweet_id = get_last_tweet( 313 | saved_tweets, since) 314 | tweets_gathered = [] 315 | if last_saved_tweet_date < 0: 316 | print("no previous tweets, creating new file") 317 | tweets_gathered = await gather_initial_tweets(api, account_handle) 318 | 
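        # When the file already has tweets, the newest saved tweet (or the oldest
        # one for 'save-past') seeds the search below so only the missing range is
        # requested; the gather_* helpers pad the boundary by a day to keep some
        # overlap.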
else: 319 | last_saved_datetime = datetime.datetime.fromtimestamp( 320 | last_saved_tweet_date) 321 | print( 322 | f"retrieving tweets {'since' if since else 'until'} {last_saved_datetime.isoformat()}") 323 | if since: 324 | tweets_gathered = await gather_tweets(api, account_handle, last_saved_tweet_date) 325 | else: 326 | tweets_gathered = await gather_tweets_backwards(api, account_handle, last_saved_tweet_date) 327 | tweets_gathered = list(filter(lambda t: datetime.datetime.fromisoformat( 328 | t["date"]).timestamp() != last_saved_tweet_date, tweets_gathered)) 329 | if len(tweets_gathered) > 0: 330 | sorted_tweets_gathered = sort_tweets(tweets_gathered) 331 | last_retrieved_tweet_date = datetime.datetime.fromisoformat( 332 | sorted_tweets_gathered[-1]["date"]).timestamp() 333 | if last_saved_tweet_date > 0 and last_retrieved_tweet_date > last_saved_tweet_date: 334 | last_retrieved_datetime = datetime.datetime.fromtimestamp( 335 | last_retrieved_tweet_date) 336 | tweets_difference = last_retrieved_datetime - last_saved_datetime 337 | print( 338 | f"warning: oldest tweet retrieved is from {last_retrieved_datetime.isoformat()}, {tweets_difference.days} days difference") 339 | if args.download_media: 340 | get_tweet_media(base_filepath, sorted_tweets_gathered) 341 | sorted_tweets_gathered.extend(saved_tweets) 342 | tweets_filtered = dedupe_tweets(sorted_tweets_gathered) 343 | tweets_sorted = sort_tweets(tweets_filtered) 344 | print( 345 | f"scraped {len(tweets_sorted) - len(saved_tweets)} new tweets") 346 | write_tweets(tweets_sorted, tmp_filepath, filepath, args.n) 347 | else: 348 | print(f"no {'new' if since else 'prior'} tweets found") 349 | elif args.mode == OPERATIONS[2]: 350 | saved_tweets = get_saved_tweets(filepath) 351 | if len(saved_tweets) > 0: 352 | tweets_sorted = sort_tweets(saved_tweets) 353 | print(f"sorted tweets in {filepath}") 354 | write_tweets(tweets_sorted, tmp_filepath, filepath, args.n) 355 | if args.download_media: 356 | get_tweet_media(base_filepath, tweets_sorted) 357 | elif args.mode == OPERATIONS[3]: 358 | saved_tweets = get_saved_tweets(filepath) 359 | if len(saved_tweets) > 0: 360 | tweets_filtered = dedupe_tweets(saved_tweets) 361 | print( 362 | f"removed {len(saved_tweets) - len(tweets_filtered)} duplicates") 363 | write_tweets(tweets_filtered, tmp_filepath, filepath, args.n) 364 | if args.download_media: 365 | get_tweet_media(base_filepath, tweets_filtered) 366 | 367 | if __name__ == "__main__": 368 | asyncio.run(main()) 369 | -------------------------------------------------------------------------------- /vrchat-asset-downloader/.gitignore: -------------------------------------------------------------------------------- 1 | vrchat-token 2 | 3 | -------------------------------------------------------------------------------- /vrchat-asset-downloader/README.md: -------------------------------------------------------------------------------- 1 | # vrchat-asset-downloader 2 | Use the [VRChat](https://vrchat.com/home) API to download assets. **Only works with world maps**. 3 | 4 | ### Usage 5 | ```sh 6 | usage: vrchat-asset-downloader.py [-h] [-V] [-d DIRECTORY] [--write-thumbnail] [--write-json] [--dont-clean-json] [--verify] [--skip-download] 7 | [--revisions REVISIONS] [--list-revisions] 8 | [ASSET IDS ...] 9 | 10 | positional arguments: 11 | ASSET IDS world/avatar id(s) i.e. 
wrld_12345678-90ab-cdef-1234-567890abcdef 12 | 13 | options: 14 | -h, --help show this help message and exit 15 | -V, --verbose print debugging information 16 | -d DIRECTORY, --directory DIRECTORY 17 | save directory (defaults to current) 18 | --write-thumbnail save thumbnail for the asset (if used with '--revision all', all thumbnail revisions will be retrieved) 19 | --write-json write metadata to .json file(s) 20 | --dont-clean-json retain all json values when writing .json file(s) 21 | --verify whether or not to verify downloaded files against remote hashes 22 | --skip-download skip downloading the actual asset(s) 23 | --revisions REVISIONS 24 | valid values are the keywords 'all' and 'latest', or the revision integer itself 25 | --list-revisions list available revisions for the specified asset 26 | ``` 27 | -------------------------------------------------------------------------------- /vrchat-asset-downloader/requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | requests_toolbelt 3 | tqdm 4 | -------------------------------------------------------------------------------- /vrchat-asset-downloader/vrchat-asset-downloader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | import base64 4 | from enum import Enum 5 | import hashlib 6 | import json 7 | import os 8 | import re 9 | import requests 10 | from requests_toolbelt import sessions 11 | import shutil 12 | import sys 13 | from tqdm.auto import tqdm 14 | import urllib.parse as urlparse 15 | 16 | # api.vrchat.cloud domain does not always return full json details 17 | API_URL = "https://vrchat.com/api/1/" 18 | HEADERS = {"Host": "vrchat.com", 19 | "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:91.0) Gecko/20100101 Firefox/91.0"} 20 | GUID_REGEX = r"_[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}" 21 | ASSET_REGEX = r"^(?Pwrld)" + GUID_REGEX + r"$" 22 | FILE_REGEX = r"^https?:\/\/api\.vrchat\.cloud\/api\/1\/file\/(?Pfile" + \ 23 | GUID_REGEX + r")\/[0-9]+\/file$" 24 | CLEAN_FILENAME_WIN = r"[\/\\:*?\"<>|]" 25 | CLEAN_FILENAME_POS = r"[\/]" 26 | BLOCK_SIZE = 1024 27 | REMOVE_FROM_JSON = ["favorites", "visits", "popularity", "heat", 28 | "publicOccupants", "privateOccupants", "occupants", "instances"] 29 | ASSET_TYPES = {"wrld": "world", "avtr": "avatar"} 30 | 31 | 32 | class LogLevel(Enum): 33 | BASIC = 1 34 | VERBOSE = 2 35 | 36 | 37 | def clean_filename(path): 38 | if os.name == "nt": 39 | return re.sub(CLEAN_FILENAME_WIN, "_", path) 40 | else: 41 | return re.sub(CLEAN_FILENAME_WIN, "_", path) 42 | 43 | 44 | def get_auth(s): 45 | config_r = s.get(f"config") 46 | if not config_r.ok: 47 | print_log("config", "failed to retrieve API key") 48 | print_log( 49 | "config", f"config endpoint returned status '{config_r.status_code}'", level=LogLevel.VERBOSE) 50 | sys.exit(1) 51 | config_j = config_r.json() 52 | if "clientApiKey" not in config_j or not config_j["clientApiKey"]: 53 | print_log("config", "failed to retrieve API key") 54 | print_log("config", f"config response lacks clientApiKey value", 55 | level=LogLevel.VERBOSE) 56 | sys.exit(1) 57 | clientKey = config_j["clientApiKey"] 58 | token_path = os.path.join(os.getcwd(), "vrchat-token") 59 | auth_cookie = None 60 | if os.path.isfile(token_path): 61 | print_log("auth", "reading saved token") 62 | with open(token_path, "r") as token_file: 63 | auth_cookie = token_file.read() 64 | if not auth_cookie or len(auth_cookie) == 0: 65 
| print_log( 66 | "auth", f"while logged in to vrchat.com, visit '{API_URL}auth?apiKey={clientKey}' in your browser") 67 | auth_cookie = input("copy your token value here: ") 68 | s.headers["Cookie"] = f"apiKey={clientKey}; auth={auth_cookie};" 69 | auth_r = s.get(f"auth?apiKey={clientKey}") 70 | if not auth_r.ok: 71 | print_log( 72 | "auth", "error: the token you provided does not appear to be valid") 73 | sys.exit(1) 74 | with open(token_path, "w") as token_file: 75 | token_file.write(auth_cookie) 76 | return clientKey 77 | 78 | 79 | def download_asset(a_type, a_id, s, api_key): 80 | url = f"{a_type}s/{a_id}?{urlparse.urlencode(api_key)}" 81 | r = s.get(url) 82 | if not r.ok: 83 | print_log(f"{a_type}", f"failed to retrieve API response for {a_id}") 84 | print_log( 85 | f"{a_type}", f"asset endpoint returned status '{r.status_code}'", level=LogLevel.VERBOSE) 86 | return 87 | asset_j = r.json() 88 | 89 | file_j = None 90 | if "assetUrl" in asset_j and asset_j["assetUrl"]: 91 | # this URL no longer returned, may not ever exist again 92 | asset_m = re.search(FILE_REGEX, asset_j["assetUrl"]) 93 | if asset_m: 94 | file_id = asset_m.group("file_id") 95 | print_log( 96 | f"{a_type}", f"found asset for '{asset_j['name']}' ({a_id})") 97 | file_j = get_file_json(file_id, s) 98 | asset_j["_assetFile"] = file_j 99 | else: 100 | print_log( 101 | f"{a_type}", f"could not find the asset url for '{asset_j['name']}' ({a_id})") 102 | print_log( 103 | f"{a_type}", f"assetUrl did not match expected pattern ('asset_j['assetUrl']')", level=LogLevel.VERBOSE) 104 | elif "unityPackages" in asset_j and len(asset_j["unityPackages"]) > 0: 105 | # new way to get asset file ids 106 | for unityPackage in asset_j["unityPackages"]: 107 | asset_m = re.search(FILE_REGEX, unityPackage["assetUrl"]) 108 | if asset_m: 109 | file_id = asset_m.group("file_id") 110 | print_log( 111 | f"{a_type}", f"found asset for '{asset_j['name']}' ({a_id})") 112 | file_j = get_file_json(file_id, s) 113 | asset_j["_assetFile"] = file_j 114 | else: 115 | print_log( 116 | f"{a_type}", f"could not find the asset url for '{asset_j['name']}' ({a_id})") 117 | print_log( 118 | f"{a_type}", f"assetUrl did not match expected pattern ('asset_j['assetUrl']')", level=LogLevel.VERBOSE) 119 | else: 120 | print_log( 121 | f"{a_type}", f"could not find the asset url for '{asset_j['name']}' ({a_id})") 122 | print_log(f"{a_type}", f"asset response lacks assetUrl value", 123 | level=LogLevel.VERBOSE) 124 | 125 | image_j = None 126 | if "imageUrl" in asset_j and asset_j["imageUrl"]: 127 | image_m = re.search(FILE_REGEX, asset_j["imageUrl"]) 128 | if image_m: 129 | image_id = image_m.group("file_id") 130 | print_log( 131 | f"{a_type}", f"found image for '{asset_j['name']}' ({a_id})") 132 | image_j = get_file_json(image_id, s) 133 | asset_j["_imageFile"] = image_j 134 | else: 135 | print_log( 136 | f"{a_type}", f"could not find the image url for '{asset_j['name']}' ({a_id})") 137 | print_log( 138 | f"{a_type}", f"imageUrl did not match expected pattern ('{asset_j['imageUrl']}')", level=LogLevel.VERBOSE) 139 | else: 140 | print_log( 141 | f"{a_type}", f"could not find the image url for '{asset_j['name']}' ({a_id})") 142 | print_log(f"{a_type}", f"asset response lacks imageUrl value", 143 | level=LogLevel.VERBOSE) 144 | 145 | if args.dont_clean_json: 146 | for key in REMOVE_FROM_JSON: 147 | if key in asset_j: 148 | asset_j.pop(key) 149 | 150 | if args.list_revisions: 151 | list_file_versions(file_j) 152 | return 153 | else: 154 | save_dir = os.path.join( 155 | 
args.directory, clean_filename(asset_j["name"])) 156 | if not os.path.isdir(save_dir): 157 | os.makedirs(save_dir) 158 | if args.write_json: 159 | json_filename = clean_filename(f"{a_id}.json") 160 | json_filepath = os.path.join(save_dir, json_filename) 161 | if os.path.isfile(f"{json_filepath}.tmp"): 162 | os.remove(f"{json_filepath}.tmp") 163 | print_log( 164 | f"{a_type}", f"writing asset information to '{json_filename}'") 165 | with open(f"{json_filepath}.tmp", "w") as json_file: 166 | json_file.write(json.dumps(asset_j)) 167 | if os.path.isfile(json_filepath): 168 | os.remove(json_filepath) 169 | os.rename(f"{json_filepath}.tmp", json_filepath) 170 | if image_j and args.write_thumbnail: 171 | download_file_from_json(image_j, save_dir, s) 172 | if asset_j and not args.skip_download: 173 | download_file_from_json(file_j, save_dir, s) 174 | print_log(f"{a_type}", f"finished '{asset_j['name']}' ({a_id})") 175 | 176 | 177 | def get_file_json(f_id, s): 178 | url = f"file/{f_id}" 179 | r = s.get(url) 180 | if not r.ok: 181 | print_log("file", f"failed to retrieve API response for {f_id}") 182 | print_log( 183 | "file", f"file endpoint returned status '{r.status_code}'", level=LogLevel.VERBOSE) 184 | return None 185 | file_j = r.json() 186 | return file_j 187 | 188 | 189 | def list_file_versions(file_j): 190 | print(f"{'VERSION':<7} {'CREATED AT':<24} {'SIZE (BYTES)':<12} {'MD5':<32}") 191 | print(f"{'-' * 7} {'-' * 24} {'-' * 12} {'-' * 32}") 192 | for revision in file_j["versions"][1:]: 193 | md5sum = base64.b64decode(revision["file"]["md5"]) 194 | file_size = revision['file']['sizeInBytes'] 195 | print(f" {revision['version']:<6} " + 196 | f"{revision['created_at']:<24} " + 197 | f"{str(file_size):>12} " + 198 | f"{md5sum.hex():<32}") 199 | 200 | 201 | def download_file_from_json(file_j, save_dir, s): 202 | get_versions = [] 203 | latest_rev = len(file_j["versions"]) - 1 204 | term_width = shutil.get_terminal_size((80, 20))[0] 205 | if args.revisions == "all": 206 | get_versions = [*range(1, latest_rev + 1)] 207 | elif args.revisions == "latest": 208 | get_versions.append(latest_rev) 209 | elif int(args.revisions) < 1 or int(args.revisions) > latest_rev: 210 | print_log( 211 | "file", f"error: revision specified out of range, try --list-revisions") 212 | return 213 | else: 214 | get_versions.append(int(args.revisions)) 215 | print_log("file", f"Downloading {file_j['name']}") 216 | for dl_num, dl_ver in enumerate(get_versions): 217 | cur_j = file_j["versions"][dl_ver]["file"] 218 | file_path = os.path.join(save_dir, clean_filename(cur_j["fileName"])) 219 | if os.path.isfile(file_path): 220 | print_log("file", f"'{cur_j['fileName']}' already exists") 221 | else: 222 | s.headers["Host"] = "api.vrchat.cloud" 223 | redirect_r = s.get( 224 | cur_j["url"], stream=True, allow_redirects=False) 225 | if not redirect_r.ok: 226 | print_log( 227 | f"file", f"could not retrieve file for '{file_j['id']}'") 228 | print_log( 229 | f"file", f"file url '{cur_j['url']}' returned status '{redirect_r.status_code}'", level=LogLevel.VERBOSE) 230 | break 231 | file_r = requests.get(redirect_r.headers["Location"], stream=True) 232 | if not file_r.ok: 233 | print_log( 234 | f"file", f"could not retrieve file for '{file_j['id']}'") 235 | print_log( 236 | f"file", f"file url '{cur_j['url']}' returned status '{file_r.status_code}'", level=LogLevel.VERBOSE) 237 | break 238 | total_size = int(cur_j["sizeInBytes"]) 239 | with tqdm.wrapattr(open(file_path, "wb"), "write", 240 | desc=f"[file] Rev {dl_ver} ({dl_num + 
1}/{len(get_versions)})", bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{rate_fmt}{postfix}]", 241 | ncols=int(term_width * 0.8), total=total_size, 242 | unit="B", unit_scale=True, unit_divisor=BLOCK_SIZE 243 | ) as file_h: 244 | for chunk in file_r.iter_content(BLOCK_SIZE): 245 | file_h.write(chunk) 246 | file_h.flush() 247 | if args.verify and os.path.isfile(file_path): 248 | verify_file(cur_j["fileName"], file_path, cur_j["md5"]) 249 | return 250 | 251 | 252 | def verify_file(file_name, file_path, md5b64): 253 | print_log("hash", f"verifying {file_name}...", overwrite=True) 254 | remote_md5 = base64.b64decode(md5b64) 255 | with open(file_path, "rb") as file_h: 256 | local_md5 = hashlib.md5() 257 | while chunk := file_h.read(BLOCK_SIZE): 258 | local_md5.update(chunk) 259 | if remote_md5 == local_md5.digest(): 260 | print_log("hash", f"'{file_name}' verified successfully") 261 | else: 262 | print_log("hash", f"'{file_name}' failed to verify") 263 | 264 | 265 | def print_log(component, message, level=LogLevel.BASIC, overwrite=False): 266 | if level == LogLevel.VERBOSE and not args.verbose: 267 | return 268 | if overwrite: 269 | print(f"[{component}] {message}", end="\r") 270 | else: 271 | print(f"[{component}] {message}") 272 | 273 | 274 | def get_arguments(): 275 | parser.add_argument("-V", "--verbose", action="store_true", 276 | help="print debugging information") 277 | parser.add_argument("-d", "--directory", type=str, 278 | help="save directory (defaults to current)", default=os.getcwd()) 279 | parser.add_argument("--write-thumbnail", action="store_true", 280 | help="save thumbnail for the asset (if used with '--revision all', all thumbnail revisions will be retrieved)") 281 | parser.add_argument("--write-json", action="store_true", 282 | help="write metadata to .json file(s)") 283 | parser.add_argument("--dont-clean-json", action="store_false", 284 | help="retain all json values when writing .json file(s)") 285 | parser.add_argument("--verify", action="store_true", 286 | help="whether or not to verify downloaded files against remote hashes", default=False) 287 | parser.add_argument("--skip-download", action="store_true", 288 | help="skip downloading the actual asset(s)") 289 | parser.add_argument("--revisions", type=str, 290 | help="valid values are the keywords 'all' and 'latest', or the revision integer itself", default="latest") 291 | parser.add_argument("--list-revisions", action="store_true", 292 | help="list available revisions for the specified asset") 293 | parser.add_argument("asset_id_list", metavar="ASSET IDS", nargs="*", 294 | help="world/avatar id(s) i.e. 
wrld_12345678-90ab-cdef-1234-567890abcdef") 295 | return parser.parse_args() 296 | 297 | 298 | def main(): 299 | if len(args.asset_id_list) == 0: 300 | parser.print_usage() 301 | return 302 | elif not os.path.isdir(args.directory): 303 | os.makedirs(args.directory) 304 | api_session = sessions.BaseUrlSession(base_url=API_URL) 305 | api_session.headers = HEADERS 306 | api_key = get_auth(api_session) 307 | api_key_t = {"apiKey": api_key} 308 | for asset_id in args.asset_id_list: 309 | asset_type_m = re.search(ASSET_REGEX, asset_id) 310 | if asset_type_m: 311 | asset_type = asset_type_m.group("asset_type") 312 | download_asset(ASSET_TYPES[asset_type], 313 | asset_id, api_session, api_key_t) 314 | else: 315 | print_log("vrchat-asset-downloader", 316 | f"id {asset_id} does not appear to be valid") 317 | 318 | 319 | parser = argparse.ArgumentParser() 320 | args = get_arguments() 321 | 322 | if __name__ == "__main__": 323 | main() 324 | -------------------------------------------------------------------------------- /vroid-hub-downloader/README.md: -------------------------------------------------------------------------------- 1 | # vroid-hub-downloader 2 | Downloads preview models (viewable in the browser) from [VRoid Hub](https://hub.vroid.com/). Handles decryption and decompression (assist from bin). 3 | 4 | These decrypted models do not "just work"; you will have to manually make the adjustments necessary to use them. See the comments on [this gist](https://gist.github.com/Pldare/ebf704c752a8d77ff9603d4adfe54083) for more info. 5 | 6 | ### Usage 7 | ```sh 8 | usage: vroid-hub-downloader.py [-h] [-d DIRECTORY] [--write-info-json] [vroid links/vrm files ...] 9 | 10 | positional arguments: 11 | vroid links/vrm files 12 | vroid hub links or encrypted vrm files i.e. 13 | https://hub.vroid.com/en/users/49620 14 | https://hub.vroid.com/en/characters/6819070713126783571/models/9038381612772945358 15 | 2520951134072570694.vrm 16 | 17 | options: 18 | -h, --help show this help message and exit 19 | -d DIRECTORY, --directory DIRECTORY 20 | save directory (defaults to current) 21 | --write-info-json write user/model json information for urls 22 | ``` 23 | -------------------------------------------------------------------------------- /vroid-hub-downloader/requirements.txt: -------------------------------------------------------------------------------- 1 | pycryptodome 2 | Requests 3 | zstandard 4 | -------------------------------------------------------------------------------- /vroid-hub-downloader/vroid-hub-downloader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import argparse 3 | from Crypto.Cipher import AES 4 | import gzip 5 | import io 6 | import json 7 | import os 8 | import re 9 | import requests 10 | import sys 11 | import zstandard 12 | 13 | USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/114.0" 14 | HOST = "https://hub.vroid.com" 15 | API_VERSION = "11" 16 | MODEL_FILE_EXT = "glb" 17 | VROID_BASE = r"(?:https?:\/\/)?hub\.vroid\.com\/(?P<lang>[a-z]{2}\/)?" 
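# The patterns below match the VRoid Hub URLs this script accepts on the command line, e.g.
#   https://hub.vroid.com/en/users/49620  (a user's model listing)
#   https://hub.vroid.com/en/characters/6819070713126783571/models/9038381612772945358  (a single model page)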
18 | VROID_USER = VROID_BASE + r"users/(?P<user_id>\d+)" 19 | VROID_MODEL = VROID_BASE + \ 20 | r"characters\/(?P<char_id>\d+)\/models\/(?P<model_id>\d+)" 21 | 22 | 23 | def unpad(s): 24 | return s[:-ord(s[len(s)-1:])] 25 | 26 | 27 | def get_user_model_ids(user_id): 28 | model_ids = [] 29 | api_url = f"{HOST}/api/users/{user_id}/character_models?antisocial_or_hate_usage=&characterization_allowed_user=&corporate_commercial_use=&credit=&modification=&personal_commercial_use=&political_or_religious_usage=&redistribution=&sexual_expression=&violent_expression=" 30 | page_num = 1 31 | while api_url: 32 | user_r = requests.get( 33 | api_url, headers={"User-Agent": USER_AGENT, "X-Api-Version": API_VERSION}) 34 | if not user_r.ok: 35 | print( 36 | f"[user:{user_id}:page:{page_num}] got bad response from vroid hub, {user_r.status_code}") 37 | break 38 | user_j = user_r.json() 39 | if "next" in user_j["_links"]: 40 | api_url = HOST + user_j["_links"]["next"]["href"] 41 | else: 42 | api_url = None 43 | for model in user_j["data"]: 44 | model_ids.append(model["id"]) 45 | print(f"[user:{user_id}] found {len(model_ids)} models") 46 | return model_ids 47 | 48 | 49 | def download_preview_model(model_id): 50 | model_preview_url = f"{HOST}/api/character_models/{model_id}/optimized_preview" 51 | model_r = requests.get(model_preview_url, allow_redirects=True, headers={ 52 | "User-Agent": USER_AGENT, "X-Api-Version": API_VERSION}) 53 | if not model_r.ok: 54 | print( 55 | f"[model:{model_id}:preview] got bad response from vroid hub, {model_r.status_code}") 56 | print(f"[model:{model_id}:preview] {model_r.content.decode()}") 57 | return None 58 | return io.BytesIO(model_r.content) 59 | 60 | 61 | def decrypt_decompress_model(model_id, model_bytes, model_filename): 62 | if not os.path.isfile(model_filename): 63 | with open(model_filename, "wb") as dec_vrm: 64 | iv_bytes = model_bytes.read(16)  # encrypted previews start with a 16-byte IV followed by a 32-byte AES key 65 | key_bytes = model_bytes.read(32) 66 | key_context = AES.new(key_bytes, AES.MODE_CBC, iv_bytes) 67 | enc_data = model_bytes.read() 68 | dec_data = unpad(key_context.decrypt(enc_data))[4:]  # strip padding and a 4-byte prefix before the zstd stream 69 | dctx = zstandard.ZstdDecompressor() 70 | with dctx.stream_writer(dec_vrm) as decompressor: 71 | decompressor.write(dec_data) 72 | print( 73 | f"[model:{model_id}] wrote decrypted and decompressed model '{os.path.basename(model_filename)}'") 74 | else: 75 | print( 76 | f"[model:{model_id}] '{os.path.basename(model_filename)}' already exists") 77 | 78 | 79 | def download_model_from_vroid(model_id, subdir=None): 80 | model_path_base = os.path.join( 81 | subdir if subdir else args.directory, model_id) 82 | model_api_url = f"{HOST}/api/character_models/{model_id}" 83 | json_path = f"{model_path_base}.info.json" 84 | model_api_r = requests.get(model_api_url, headers={ 85 | "User-Agent": USER_AGENT, "X-Api-Version": API_VERSION}) 86 | if not model_api_r.ok: 87 | print( 88 | f"[model:{model_id}:api] got bad response from vroid hub, {model_api_r.status_code}") 89 | return 90 | model_api_j = model_api_r.json()["data"] 91 | if args.write_info_json and not os.path.isfile(json_path): 92 | with open(json_path, "w") as json_file: 93 | json_file.write(json.dumps(model_api_j)) 94 | print(f"[model:{model_id}:api] wrote '{os.path.basename(json_path)}'") 95 | elif args.write_info_json: 96 | print( 97 | f"[model:{model_id}:api] '{os.path.basename(json_path)}' already exists") 98 | if not "conversion_state" in model_api_j["character_model"]["latest_character_model_version"]: 99 | print( 100 | f"[model:{model_id}:api] warning: JSON response implies model preview does not exist, expecting 404") 101 | elif 
model_api_j["character_model"]["latest_character_model_version"]["conversion_state"]["current_state"] != "completed": 102 | print( 103 | f"[model:{model_id}:api] warning: JSON response implies model preview is not ready, expecting 404") 104 | enc_vrm = download_preview_model(model_id) 105 | if not enc_vrm: 106 | return 107 | decrypt_decompress_model( 108 | model_id, enc_vrm, f"{model_path_base}.{MODEL_FILE_EXT}") 109 | 110 | 111 | def download_user_from_vroid(user_id): 112 | user_api_url = f"{HOST}/api/users/{user_id}" 113 | user_api_r = requests.get(user_api_url, headers={ 114 | "User-Agent": USER_AGENT, "X-Api-Version": API_VERSION}) 115 | if not user_api_r.ok: 116 | print( 117 | f"[user:{user_id}:api] got bad response from vroid hub, user might not exist, {user_api_r.status_code}") 118 | return 119 | user_api_j = user_api_r.json() 120 | username = user_api_j["data"]["user"]["name"] 121 | user_base_path = os.path.join(args.directory, f"{username} ({user_id})") 122 | if not os.path.isdir(user_base_path): 123 | os.makedirs(user_base_path) 124 | json_path = f"{user_base_path}.info.json" 125 | if args.write_info_json: 126 | with open(json_path, "w") as json_file: 127 | json_file.write(json.dumps(user_api_j["data"])) 128 | print(f"[user:{user_id}:api] wrote '{os.path.basename(json_path)}'") 129 | model_ids = get_user_model_ids(user_id) 130 | for model_id in model_ids: 131 | download_model_from_vroid(model_id, user_base_path) 132 | 133 | 134 | parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) 135 | parser.add_argument("-d", "--directory", type=str, 136 | help="save directory (defaults to current)", default=os.getcwd()) 137 | parser.add_argument("--write-info-json", action="store_true", 138 | help="write user/model json information for urls") 139 | parser.add_argument("vrms", metavar="vroid links/vrm files", nargs="*", 140 | help="vroid hub links or encrypted vrm files i.e.\nhttps://hub.vroid.com/en/users/49620\nhttps://hub.vroid.com/en/characters/6819070713126783571/models/9038381612772945358\n2520951134072570694.vrm") 141 | args = parser.parse_args() 142 | 143 | if not os.path.isdir(args.directory): 144 | os.makedirs(args.directory) 145 | 146 | for vrm in args.vrms: 147 | vroid_usr_m = re.search(VROID_USER, vrm) 148 | model_m = re.search(VROID_MODEL, vrm) 149 | if vroid_usr_m: 150 | user_id = vroid_usr_m.group("user_id") 151 | download_user_from_vroid(user_id) 152 | elif model_m: 153 | model_id = model_m.group("model_id") 154 | download_model_from_vroid(model_id) 155 | else: 156 | if not os.path.isfile(vrm): 157 | print(f"could not find file at path '{vrm}'") 158 | continue 159 | with open(vrm, "rb") as vrm_file: 160 | enc_vrm = io.BytesIO(vrm_file.read()) 161 | model_filename = os.path.join( 162 | args.directory, f"{vrm}.decrypted.{MODEL_FILE_EXT}") 163 | decrypt_decompress_model(enc_vrm, model_filename) 164 | --------------------------------------------------------------------------------