├── README.md ├── __init__.py ├── amazon_get_meta_info.py ├── amazon_get_real_url.py ├── amazon_get_season_urls.py ├── amazon_get_seasons_info.py ├── amazon_is_meta.py ├── amazon_meta_crawler.js ├── funi_request.py ├── getCookie.py ├── get_season_urls.py ├── hbomax_meta_crawler.js ├── log.py ├── netflix_meta_crawler.js ├── paramount_subtitle_trans.py ├── srtConvert.py ├── text ├── Cue.py ├── Mp4TtmlParser.py ├── Mp4VttParser.py ├── TextEngine.py ├── TtmlTextParser.py └── VttTextParser.py └── util ├── DataViewReader.py ├── Functional.py ├── Mp4BoxParsers.py ├── Mp4Parser.py ├── TextParser.py └── exceptions.py /README.md: -------------------------------------------------------------------------------- 1 | # StreamFab Python Modules 2 | 3 | This repository contains a collection of Python modules developed by StreamFab for various streaming-related tasks. 4 | 5 | ## Introduction 6 | 7 | StreamFab is a suite of tools and utilities designed to facilitate streaming media processing and manipulation. These Python modules offer functionality for tasks such as: 8 | 9 | - Stream capturing 10 | - Video format conversion 11 | - Subtitle extraction 12 | - Metadata retrieval 13 | - and more! 14 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThatNotEasy/StreamFab/157f7a91f75459e523076ab9d586c798f1a9c989/__init__.py -------------------------------------------------------------------------------- /amazon_get_meta_info.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import re, sys, string 4 | import bs4,json 5 | 6 | import logging 7 | logger = logging.getLogger() 8 | 9 | def get_seconds(str): 10 | try: 11 | seconds = 0 12 | if str=='': 13 | return seconds 14 | if '時間' in str and '分' in str: 15 | strs = str.split('時間') 16 | hours = strs[0] 17 | min_str = strs[1] 18 | min_strs = min_str.split('分') 19 | min = min_strs[0] 20 | seconds = 60 * 60 * int(hours) + 60 * int(min) 21 | elif 'h' in str: 22 | if ' ' in str: 23 | strs = str.split(' ') 24 | hours = strs[0] 25 | min = strs[2] 26 | seconds = 60 * 60 * int(hours) + 60 * int(min) 27 | else: 28 | if 'min' in str: 29 | str = str[:-3] 30 | strs = str.split('h') 31 | hours = strs[0] 32 | min = strs[1] 33 | seconds = 60 * 60 * int(hours) + 60 * int(min) 34 | else: 35 | str = str[:-1] 36 | seconds = 60 * 60 * int(str) 37 | else: 38 | if ' ' in str: 39 | strs = str.split(' ') 40 | min = strs[0] 41 | seconds = 60 * int(min) 42 | else: 43 | str = str[:-3] 44 | seconds = 60 * int(str) 45 | pass 46 | 47 | except Exception: 48 | return 0 49 | 50 | return seconds 51 | 52 | EU_REGION = "atv-ps-eu" 53 | JP_REGION = "atv-ps-fe" 54 | US_REGION = "atv-ps" 55 | 56 | ROW_NA = "ART4WZ8MWBX2Y" 57 | ROE_EU = "A3K6Y4MI8GDYMT" 58 | ROW_EU = "A2MFUE2XK8ZSSY" 59 | ROW_FE = "A15PK738MTQHSO" 60 | 61 | DE = "A1PA6795UKMFR9" 62 | JP = "A1VC38T7YXB528" 63 | UK = "A1F83G8C2ARO7P" 64 | US = "ATVPDKIKX0DER" 65 | 66 | def getDomainFromMarketplaceId(marketplaceID): 67 | 68 | if marketplaceID==ROW_NA or marketplaceID==ROE_EU or marketplaceID==ROW_EU or marketplaceID==ROW_FE: 69 | return ".primevideo.com" 70 | if marketplaceID==DE: 71 | return ".amazon.de" 72 | if marketplaceID==JP: 73 | return ".amazon.co.jp" 74 | if marketplaceID==UK: 75 | return ".amazon.co.uk" 76 | return ".amazon.com" 77 | 78 | def getAVTFromMarketplaceId(marketplaceID): 79 | 80 | if marketplaceID==ROW_NA or marketplaceID==US: 81 | return US_REGION 82 | if marketplaceID==ROE_EU or marketplaceID==ROW_EU or marketplaceID==UK or marketplaceID==DE: 83 | return EU_REGION 84 | if marketplaceID==ROW_FE or marketplaceID==JP: 85 | return JP_REGION 86 | raise("getAVTFromMarketplaceId error") 87 | 88 | def get_meta_info(webpage): 89 | amazonSoup = bs4.BeautifulSoup(webpage,'html5lib') 90 | 91 | reJson = r'"marketplaceID":"(.*?)"' 92 | marketplaceID = re.findall(reJson, webpage)[0] 93 | 94 | domain = getAVTFromMarketplaceId(marketplaceID) + getDomainFromMarketplaceId(marketplaceID) 95 | 96 | movie_play_url = '' 97 | movie_play_urls = amazonSoup.select('.dv-dp-node-playback') 98 | if len(movie_play_urls)!=0: 99 | movie_play_url = movie_play_urls[0].find("a")['href'] 100 | 101 | current_id = "" 102 | if amazonSoup.find("ol",class_="_3kgCxW")!=None: 103 | lis = amazonSoup.find("ol",class_="_3kgCxW").find_all("li") 104 | for index in range(len(lis)): 105 | li = lis[index] 106 | href = "" 107 | if li.find("a")==None: 108 | continue 109 | if not(li.find("a").has_attr('href')): 110 | continue 111 | href= li.find("a")['href'] 112 | episodeId = "" 113 | if li.find("label")!=None: 114 | episodeId = li.find("label")['for'].replace('selector-','') 115 | if href.split('ref')[0] == movie_play_url.split('ref')[0]: 116 | current_id = episodeId 117 | 118 | reLocale = r'"locale":"(.*?)"' 119 | locale = re.findall(reLocale, webpage)[0] 120 | 121 | reKeywordTitle = r'"keywords":\["(.*?)"\]' 122 | tmp_keyword_title = re.findall(reKeywordTitle, webpage) 123 | reTitle = r'"parentTitle":"(.*?)"' 124 | tmp_title = re.findall(reTitle, webpage) 125 | 126 | h1_title="" 127 | h1_elements = amazonSoup.findAll('h1') 128 | for h1_element in h1_elements: 129 | if h1_element.find('img') : 130 | h1_title=h1_element.find('img').attrs["alt"] 131 | pass 132 | 133 | main_element = amazonSoup.find('main',attrs={'id':'main','data-testid':'detailpage-main'}) 134 | 135 | if len(amazonSoup.select('._2Q73m9'))!=0: 136 | title = amazonSoup.select('._2Q73m9')[0].string 137 | elif len(amazonSoup.find_all(attrs={'data-automation-id':'title'}))!=0: 138 | title = amazonSoup.find_all(attrs={'data-automation-id':'title'})[0].string 139 | elif h1_title: 140 | title = h1_title 141 | elif len(tmp_keyword_title)!=0: 142 | title = tmp_keyword_title[0] 143 | elif len(tmp_title)!=0: 144 | title = tmp_title[0] 145 | else: 146 | title = amazonSoup.title.string 147 | 148 | release_time = '' 149 | release_times = amazonSoup.find_all(attrs={'data-automation-id':'release-year-badge'}) 150 | if len(release_times)==1: 151 | release_time = release_times[0].string 152 | runtime_strs = amazonSoup.find_all(attrs={'data-automation-id':'runtime-badge'}) 153 | runtime_str = "" 154 | if len(runtime_strs)==1: 155 | runtime_str = runtime_strs[0].string 156 | runtime = get_seconds(runtime_str) 157 | 158 | synopsis = '' 159 | e_synopsis = amazonSoup.select('._1wxob_') 160 | if len(e_synopsis)==0: 161 | e_synopsis = amazonSoup.select('._3qsVvm') 162 | if len(e_synopsis)==0 and main_element is not None: 163 | e_synopsis = main_element.select('._5tB6mN') 164 | if len(e_synopsis)==0 and main_element is not None: 165 | e_synopsis = main_element.select('.dv-dp-node-synopsis') 166 | if len(e_synopsis)!=0: 167 | synopsis = e_synopsis[0].string 168 | 169 | thumbs = '' 170 | e_thumb = amazonSoup.select('#atf-full') 171 | if len(e_thumb)!=0: 172 | thumbs = e_thumb[0]['src'] 173 | if not thumbs: 174 | e_thumb = amazonSoup.find("img", class_="_2x6L3o") 175 | if e_thumb: 176 | thumbs = e_thumb['src'] 177 | if not thumbs: 178 | div_thumb = amazonSoup.find(attrs={'data-automation-id':'hero-background'}) 179 | if div_thumb: 180 | e_thumb = div_thumb.find("img") 181 | if e_thumb: 182 | thumbs = e_thumb['src'] 183 | 184 | if current_id=="": 185 | reCurrentId = r'"pageTitleId":"(.*?)"' 186 | current_ids = re.findall(reCurrentId, webpage) 187 | if len(current_ids)!=0: 188 | current_id = re.findall(reCurrentId, webpage)[0] 189 | 190 | try: 191 | audios = [] 192 | subtitles = [] 193 | 194 | meta_info = amazonSoup.select('.dv-dp-node-meta-info') 195 | if (len(meta_info) > 0): 196 | l = len(meta_info[0].select('._2czKtE')) 197 | if (l==1): 198 | e_audios = amazonSoup.select('.dv-dp-node-meta-info')[0].select("._2czKtE")[0].select('dd')[0].text 199 | for e_audio in e_audios.split(','): 200 | if '…' in e_audio: 201 | e_audio = e_audio.split('…')[1] 202 | audios.append(e_audio) 203 | else: 204 | e_subtitles = amazonSoup.select('.dv-dp-node-meta-info')[0].select('._2czKtE')[0].select('dd')[0].text 205 | e_audios = amazonSoup.select('.dv-dp-node-meta-info')[0].select("._2czKtE")[1].select('dd')[0].text 206 | for e_audio in e_audios.split(','): 207 | if '…' in e_audio: 208 | e_audio = e_audio.split('…')[1] 209 | audios.append(e_audio) 210 | for e_subtitle in e_subtitles.split(','): 211 | if '…' in e_subtitle: 212 | e_subtitle = e_subtitle.split('…')[1] 213 | subtitles.append(e_subtitle) 214 | 215 | except Exception as e: 216 | audios = ['English'] 217 | subtitles = ['English'] 218 | # if 'primevideo.com' not in domain: 219 | # current_id = "" 220 | 221 | genres = [] 222 | if main_element is not None: 223 | genres_div = main_element.find('div', {'class': 'dv-node-dp-genres'}) 224 | if genres_div: 225 | genres_spans = genres_div.find_all('span', {'aria-label': True}) 226 | for span in genres_spans: 227 | genres.append(span.get('aria-label')) 228 | # if genres is not None or len(genres) != 0: 229 | # print(genres) 230 | 231 | 232 | directors = [] 233 | casts =[] 234 | try: 235 | if main_element is not None: 236 | div_product_detail = main_element.find(attrs={'data-automation-id':'btf-product-details'}) 237 | if div_product_detail: 238 | directors_dl = div_product_detail.find_all('dl') 239 | for dl in directors_dl: 240 | dt = dl.find('dt', string='Directors') 241 | if dt: 242 | directors = [director.text for director in dt.find_next_sibling('dd').find_all('a')] 243 | break 244 | 245 | casts_dl = div_product_detail.find_all('dl') 246 | for dl in casts_dl: 247 | dt = dl.find('dt', string='Starring') 248 | if dt: 249 | casts = [cast.text for cast in dt.find_next_sibling('dd').find_all('a')] 250 | break 251 | 252 | except Exception as e: 253 | logger.info(str(e)) 254 | directors = [] 255 | casts = [] 256 | 257 | # if casts: 258 | # print('casts :',casts) 259 | # if directors: 260 | # print("directors :",directors) 261 | 262 | 263 | info = { 264 | "id": current_id, 265 | "marketplaceID":marketplaceID, 266 | "domain":domain, 267 | "locale":locale, 268 | "movie_play_url":movie_play_url, 269 | "title": title, 270 | "release_time": release_time, 271 | "runtime": runtime, 272 | "runtime_str":runtime_str, 273 | "subtitles":subtitles, 274 | "audios":audios, 275 | "synopsis": synopsis, 276 | "thumbs": thumbs, 277 | "directors": directors, 278 | "casts": casts, 279 | "writers": [], 280 | "genres": genres, 281 | "moodTags": [] 282 | } 283 | 284 | return info 285 | 286 | def run(params): 287 | arrParams = list(params) 288 | log_path = '' 289 | for index in range(len(arrParams)): 290 | if index==0: 291 | log_path = arrParams[index] 292 | elif index==1: 293 | output_file = arrParams[index] 294 | else: 295 | input_file= arrParams[index] 296 | 297 | # set logging 298 | log_handler = logging.FileHandler(log_path) 299 | log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') 300 | log_handler.setFormatter(log_formatter) 301 | logger.addHandler(log_handler) 302 | logger.setLevel(logging.INFO) 303 | 304 | try: 305 | f = open(input_file,'r',encoding='UTF-8') 306 | webpage = f.read() 307 | f.close() 308 | 309 | info = get_meta_info(webpage) 310 | 311 | with open(output_file,"w",encoding='UTF-8') as f: 312 | json.dump(info,f) 313 | logger.info("get_meta_info complete...") 314 | 315 | except Exception as e: 316 | logger.exception(str(e)) 317 | 318 | import os 319 | if __name__ == '__main__': 320 | 321 | params = [ 322 | "./python.log", 323 | "./get_meta_info.json", 324 | "./amazonGetMetaInfoTempIn.html", 325 | ] 326 | 327 | if os.path.exists(params[0]): 328 | os.remove(params[0]) 329 | if os.path.exists(params[1]): 330 | os.remove(params[1]) 331 | 332 | run(params) -------------------------------------------------------------------------------- /amazon_get_real_url.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import re, sys, string 4 | import bs4,json 5 | 6 | import logging 7 | logger = logging.getLogger() 8 | 9 | def run(params): 10 | arrParams = list(params) 11 | log_path = '' 12 | for index in range(len(arrParams)): 13 | if index==0: 14 | log_path = arrParams[index] 15 | elif index==1: 16 | output_file = arrParams[index] 17 | else: 18 | input_file= arrParams[index] 19 | 20 | # set logging 21 | log_handler = logging.FileHandler(log_path) 22 | log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') 23 | log_handler.setFormatter(log_formatter) 24 | logger.addHandler(log_handler) 25 | logger.setLevel(logging.INFO) 26 | 27 | try: 28 | f = open(input_file,'r',encoding='UTF-8') 29 | webpage = f.read() 30 | f.close() 31 | 32 | amazonSoup = bs4.BeautifulSoup(webpage,'html5lib') 33 | expander = amazonSoup.find_all(attrs={'data-automation-id':'ep-expander'}) 34 | if len(expander)==0: 35 | real_url = "" 36 | else: 37 | real_url = expander[0]['href'] 38 | dict = { 39 | "real_url":real_url 40 | } 41 | with open(output_file,"w", encoding='UTF-8') as f: 42 | json.dump(dict,f) 43 | logger.info("get_real_url complete...") 44 | 45 | except Exception as e: 46 | logger.exception(str(e)) 47 | 48 | import os 49 | if __name__ == '__main__': 50 | 51 | params = [ 52 | "./log/python.log", 53 | "./result/get_real_url.json", 54 | "./html/season_1.html", 55 | ] 56 | 57 | if os.path.exists(params[0]): 58 | os.remove(params[0]) 59 | if os.path.exists(params[1]): 60 | os.remove(params[1]) 61 | 62 | run(params) -------------------------------------------------------------------------------- /amazon_get_season_urls.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import re, sys, string 4 | import bs4,json 5 | 6 | import logging 7 | logger = logging.getLogger() 8 | 9 | def run(params): 10 | arrParams = list(params) 11 | log_path = '' 12 | for index in range(len(arrParams)): 13 | if index==0: 14 | log_path = arrParams[index] 15 | elif index==1: 16 | output_file = arrParams[index] 17 | else: 18 | input_file= arrParams[index] 19 | 20 | # set logging 21 | log_handler = logging.FileHandler(log_path) 22 | log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') 23 | log_handler.setFormatter(log_formatter) 24 | logger.addHandler(log_handler) 25 | logger.setLevel(logging.INFO) 26 | 27 | try: 28 | f = open(input_file,'r',encoding='UTF-8') 29 | webpage = f.read() 30 | f.close() 31 | 32 | amazonSoup = bs4.BeautifulSoup(webpage,'html5lib') 33 | elems = amazonSoup.select('.dv-node-dp-seasons a') 34 | season_urls = [] 35 | for elem in elems: 36 | href = elem['href'] 37 | season_urls.append(href) 38 | dict = { 39 | "season_urls":season_urls 40 | } 41 | 42 | if season_urls is None or len(season_urls)==0: 43 | logger.warning("No season urls found '.dv-node-dp-seasons a' - trying '.dv-node-dp-seasons-default'...") 44 | elems = amazonSoup.select('.dv-node-dp-seasons-default a') 45 | season_urls = [] 46 | # 递归查找所有元素中的 herf 47 | for elem in elems: 48 | href = elem['href'] 49 | season_urls.append(href) 50 | dict = { 51 | "season_urls":season_urls 52 | } 53 | 54 | 55 | 56 | with open(output_file,"w", encoding='UTF-8') as f: 57 | json.dump(dict,f) 58 | logger.info("amazon_get_season_urls complete...") 59 | 60 | except Exception as e: 61 | logger.exception(str(e)) 62 | 63 | import os 64 | if __name__ == '__main__': 65 | 66 | params = [ 67 | "./amazon/log/python.log", 68 | "./amazon/result/get_season_urls.json", 69 | "./amazon/html/_de/Watch Good Omens - Season 1 _ Prime Video.html", 70 | ] 71 | 72 | if os.path.exists(params[0]): 73 | os.remove(params[0]) 74 | if os.path.exists(params[1]): 75 | os.remove(params[1]) 76 | 77 | run(params) -------------------------------------------------------------------------------- /amazon_get_seasons_info.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import re, sys, string 4 | import bs4,json 5 | 6 | import logging 7 | logger = logging.getLogger() 8 | 9 | def get_seconds(str): 10 | try: 11 | seconds = 0 12 | if str=='': 13 | return seconds 14 | if '時間' in str and '分' in str: 15 | strs = str.split('時間') 16 | hours = strs[0] 17 | min_str = strs[1] 18 | min_strs = min_str.split('分') 19 | min = min_strs[0] 20 | seconds = 60 * 60 * int(hours) + 60 * int(min) 21 | elif 'h' in str: 22 | if ' ' in str: 23 | strs = str.split(' ') 24 | hours = strs[0] 25 | min = strs[2] 26 | seconds = 60 * 60 * int(hours) + 60 * int(min) 27 | else: 28 | if 'min' in str: 29 | str = str[:-3] 30 | strs = str.split('h') 31 | hours = strs[0] 32 | min = strs[1] 33 | seconds = 60 * 60 * int(hours) + 60 * int(min) 34 | else: 35 | str = str[:-1] 36 | seconds = 60 * 60 * int(str) 37 | else: 38 | if ' ' in str: 39 | strs = str.split(' ') 40 | min = strs[0] 41 | seconds = 60 * int(min) 42 | else: 43 | str = str[:-3] 44 | seconds = 60 * int(str) 45 | pass 46 | 47 | except Exception: 48 | return 0 49 | 50 | return seconds 51 | 52 | def is_number(s): 53 | try: 54 | float(s) 55 | return True 56 | except ValueError: 57 | pass 58 | 59 | try: 60 | import unicodedata 61 | unicodedata.numeric(s) 62 | return True 63 | except (TypeError, ValueError): 64 | pass 65 | 66 | return False 67 | 68 | def get_episodes_info(episodes_data): 69 | episodes_array = [] 70 | count = 1 71 | 72 | for part in episodes_data: 73 | lis = part.find_all("li") 74 | for index in range(len(lis)): 75 | li = lis[index] 76 | 77 | title = '' 78 | js_title = li.find("div", class_="js-episode-title-name") 79 | if not js_title: 80 | js_title = li.find("div", class_="dv-episode-noplayback-title") 81 | if not js_title: 82 | js_title = li.find("div", class_="_1TQ2Rs") 83 | if not js_title: 84 | js_title = li.find("div", class_="izvPPq") 85 | 86 | if not js_title: 87 | js_title = li.find("span", class_="S4388s") 88 | if not js_title: 89 | js_title = li.find("span", class_="P1uAb6") 90 | 91 | 92 | if js_title: 93 | title = js_title.text 94 | 95 | synopsis = '' 96 | js_synopsis = li.find("div", class_="_3qsVvm") 97 | if js_synopsis: 98 | synopsis = js_synopsis.string 99 | 100 | if li.find("a")==None: 101 | count += 1 102 | continue 103 | 104 | if not(li.find("a").has_attr('href')): 105 | count += 1 106 | continue 107 | 108 | play_url = "" 109 | play_url = li.find("a")['href'] 110 | if play_url == None: 111 | count += 1 112 | continue 113 | 114 | if 'force_return_url' in play_url: 115 | count += 1 116 | continue 117 | 118 | episodeId = "" 119 | if li.find("label")!=None: 120 | episodeId = li.find("label")['for'].replace('selector-','') 121 | 122 | runtime_str = '' 123 | js_runtime = li.find("div", class_="_1DcCXQ _2_ujMf") 124 | if not js_runtime: 125 | js_runtime = li.find("div", class_="_3rBDNv _1AeKJC") 126 | if not js_runtime: 127 | js_runtime = li.find("div", class_="_1wFEYz ci7S35") 128 | if js_runtime: 129 | divs = js_runtime.find_all('div') 130 | if divs: 131 | if len(divs) > 1: 132 | runtime_str = divs[1].string 133 | else: 134 | runtime_str = divs[0].string 135 | 136 | runtime = get_seconds(runtime_str) 137 | 138 | thumbs_url = '' 139 | pic = li.find("picture") 140 | if pic: 141 | img = pic.find("img") 142 | thumbs_url = img['src'] 143 | 144 | if not thumbs_url: 145 | img = li.find("img") 146 | if img: 147 | thumbs_url = img['src'] 148 | 149 | # positionStr = title.replace(" ",'').split('.')[0] 150 | # if(is_number(positionStr)): 151 | # position = int(positionStr) 152 | # else: 153 | # position = count 154 | position = count 155 | 156 | e_obj = { 157 | "seq": position, 158 | "episodeId": episodeId, 159 | "play_url": play_url, 160 | "synopsis": synopsis, 161 | "title": title, 162 | "runtime": runtime, 163 | "runtime_str":runtime_str, 164 | "thumbs": { 165 | "url": thumbs_url 166 | } 167 | } 168 | 169 | count += 1 170 | episodes_array.append(e_obj) 171 | 172 | return episodes_array 173 | 174 | def get_extras_info(extras_data): 175 | extras = {} 176 | 177 | for tag in extras_data: 178 | extras_tag = '' 179 | js_title = tag.find(class_='ROp-tf') 180 | if not js_title: 181 | js_title = tag.find(class_='Dsc37Q') 182 | if js_title: 183 | extras_tag = js_title.text.replace(" ", "") 184 | 185 | extras_list = [] 186 | lis = [] 187 | ul = tag.find(class_="jxBPRE _28m62t") 188 | if ul: 189 | lis = ul.find_all("li") 190 | 191 | if not lis: 192 | lis = tag.find_all(class_="_1z3n6o") 193 | 194 | for index in range(len(lis)): 195 | li = lis[index] 196 | 197 | title = "" 198 | runtime_str = "" 199 | runtime = 0 200 | rating = "" 201 | play_id = "" 202 | play_url = "" 203 | synopsis = "" 204 | thumbs_url = "" 205 | 206 | js_playbutton = li.find('a', attrs={'data-automation-id':'extras-playbutton'}) 207 | if not js_playbutton: 208 | continue 209 | if js_playbutton: 210 | play_url = js_playbutton['href'] 211 | 212 | js_title = li.find("div", class_="vRplU5") 213 | if not js_title: 214 | js_title = li.find('span', class_="lTKTFD") 215 | if js_title: 216 | title = js_title.string 217 | 218 | js_runtime_str = li.find(attrs={'data-automation-id':'runtime-badge'}) 219 | if js_runtime_str: 220 | runtime_str = js_runtime_str.string 221 | runtime = get_seconds(runtime_str) 222 | 223 | js_rating = li.find(class_="_2BZ5w7") 224 | if not js_rating: 225 | js_rating = li.find('span', class_="G8xF_x") 226 | if js_rating: 227 | rating = js_rating.string 228 | 229 | js_synopsis = li.find(class_="_16wNxC") 230 | if not js_synopsis: 231 | js_synopsis = li.find(class_="rPtVMq") 232 | if js_synopsis: 233 | synopsis = js_synopsis.string 234 | 235 | js_thumbs = li.find('img') 236 | if js_thumbs: 237 | thumbs_url = js_thumbs['src'] 238 | 239 | js_data_asin = li.find(class_="_1Opa2_ dvui-packshot _3g93Un") 240 | if not js_data_asin: 241 | js_data_asin = li.find(class_="_1Opa2_ dvui-packshot _8eIApy") 242 | if js_data_asin: 243 | play_id = js_data_asin['data-asin'] 244 | 245 | meta = { 246 | "title": title, 247 | "runtime": runtime, 248 | "runtime_str": runtime_str, 249 | "rating": rating, 250 | "play_id": play_id, 251 | "play_url": play_url, 252 | "synopsis": synopsis, 253 | "thumbs": {"url":thumbs_url}, 254 | } 255 | extras_list.append(meta) 256 | 257 | extras[extras_tag] = extras_list 258 | 259 | return extras 260 | 261 | def get_seasons_info(webpages): 262 | 263 | all_seasons = [] 264 | 265 | for webpage in webpages: 266 | amazonSoup = bs4.BeautifulSoup(webpage,'html5lib') 267 | 268 | season_info = amazonSoup.find(class_="XR0d6P") 269 | if not season_info: 270 | season_info = amazonSoup.find(class_="GG33WY") 271 | # if not season_info: 272 | # season_info = amazonSoup.find(class_="dv-node-dp-seasons") 273 | if not season_info: 274 | continue 275 | 276 | pageTitleId = '' 277 | 278 | reCurrentId = r'"pageTitleId":"(.*?)"' 279 | page = re.findall(reCurrentId, webpage) 280 | if len(page) > 0: 281 | pageTitleId = page[0] 282 | 283 | if pageTitleId == '': 284 | reCurrentId = r'"originalURI":"(.*?)"' 285 | originalURI = re.findall(reCurrentId, webpage) 286 | if len(originalURI) > 0: 287 | originalURI = originalURI[0] 288 | match = re.search(r'/([A-Z0-9]+)\b', originalURI) 289 | if match: 290 | pageTitleId = match.group(1) 291 | 292 | big_title = '' 293 | titles = amazonSoup.select('._2Q73m9') 294 | if len(titles)!=0: 295 | big_title = titles[0].string 296 | elif len(amazonSoup.find_all(attrs={'data-automation-id':'title'}))!=0: 297 | big_title = amazonSoup.find_all(attrs={'data-automation-id':'title'})[0].string 298 | else: 299 | big_title = amazonSoup.title.string 300 | 301 | release_year = '' 302 | if len(amazonSoup.find_all(attrs={'data-automation-id':'release-year-badge'}))!=0: 303 | release_year = amazonSoup.find_all(attrs={'data-automation-id':'release-year-badge'})[0].string 304 | 305 | season_name = '' 306 | dv_node_dp_seasons = amazonSoup.find_all(class_="dv-node-dp-seasons") 307 | if len(dv_node_dp_seasons)==0: 308 | dv_node_dp_seasons = amazonSoup.find_all(class_='dv-node-dp-seasons-default') 309 | 310 | if len(dv_node_dp_seasons)!=0: 311 | e_season = dv_node_dp_seasons[0].find(class_="_36qUej") 312 | if not e_season: 313 | e_season = dv_node_dp_seasons[0].find(class_="_3R4jka") 314 | 315 | if e_season: 316 | season_name = e_season.string 317 | 318 | if not season_name: 319 | fclass = amazonSoup.find(class_="XqYSS8 dw87r6") 320 | if fclass: 321 | if len(fclass.select("._36qUej"))!=0: 322 | season_name = fclass.select("._36qUej")[0].string 323 | 324 | if not season_name: 325 | fclass =amazonSoup.find_all(class_="XqYSS8 _1J8qi6") 326 | if len(fclass)!=0: 327 | season_name = fclass[0].string 328 | 329 | if not season_name: 330 | data_automation_id = amazonSoup.find(attrs={'data-automation-id':'title'}) 331 | if data_automation_id: 332 | seasonNameDomList = data_automation_id.next_siblings 333 | for seasonNameDom in seasonNameDomList: 334 | if type(seasonNameDom) is not bs4.element.NavigableString: 335 | seasonNameDomStr = seasonNameDom.get_text().strip().replace('\n', '').replace('\r', '').strip() 336 | if "Season 1"==seasonNameDomStr: 337 | season_name = seasonNameDomStr 338 | elif "Season 1" in seasonNameDomStr: 339 | season_name = seasonNameDomStr.split('Season 1')[0].strip().replace('\n', '').replace('\r', '').strip() 340 | else: 341 | season_name = seasonNameDomStr 342 | break 343 | 344 | episodes = [] 345 | episodes_data = amazonSoup.find_all(class_='XR0d6P') 346 | if not episodes_data: 347 | episodes_data = amazonSoup.find_all(class_='GG33WY') 348 | if episodes_data: 349 | episodes = get_episodes_info(episodes_data) 350 | 351 | extras = {} 352 | extras_data = amazonSoup.select('._3g0WlT') 353 | if not extras_data: 354 | extras_data = amazonSoup.select('._3QUUKy') 355 | if not extras_data: 356 | extras_data = amazonSoup.select('._4fOMiL') 357 | if extras_data: 358 | extras = get_extras_info(extras_data) 359 | 360 | bShowMore = False 361 | total_episodes = 0 362 | ep_expander = amazonSoup.find_all(attrs={'data-automation-id':'ep-expander'}) 363 | if ep_expander: 364 | bShowMore = True 365 | text = ep_expander[0].text.strip() 366 | match = re.search(r'\d+', text) 367 | if match: 368 | total_episodes = int(match.group()) 369 | 370 | if amazonSoup.find(class_='_1NNx6V DwgwxH'): 371 | bShowMore = True 372 | 373 | seasons = { 374 | "id": pageTitleId, 375 | "title": big_title, 376 | "release_time": release_year, 377 | "season_name":season_name, 378 | "episodes": episodes, 379 | "extras": extras, 380 | "show_more": bShowMore, 381 | "total_episodes": total_episodes 382 | } 383 | 384 | all_seasons.append(seasons) 385 | 386 | obj = {"seasons": all_seasons} 387 | 388 | return obj 389 | 390 | def run(params): 391 | arrParams = list(params) 392 | log_path = '' 393 | webpages = [] 394 | for index in range(len(arrParams)): 395 | if index==0: 396 | log_path = arrParams[index] 397 | elif index==1: 398 | output_file = arrParams[index] 399 | else: 400 | strIn = arrParams[index] 401 | f = open(strIn,'r', encoding='UTF-8') 402 | webpages.append(f.read()) 403 | f.close() 404 | 405 | # set logging 406 | log_handler = logging.FileHandler(log_path) 407 | log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') 408 | log_handler.setFormatter(log_formatter) 409 | logger.addHandler(log_handler) 410 | logger.setLevel(logging.INFO) 411 | 412 | try: 413 | with open(output_file,"w",encoding='UTF-8') as f: 414 | season_info = get_seasons_info(webpages) 415 | json.dump(season_info,f) 416 | logger.info("amazon_get_seasons_info complete...") 417 | 418 | except Exception as e: 419 | logger.exception(str(e)) 420 | 421 | import os 422 | if __name__ == '__main__': 423 | 424 | params = [ 425 | "./amazon/log/python.log", 426 | "./amazon/result/get_seasons_info.json", 427 | "./amazon/html/_de/Watch Good Omens - Season 1 _ Prime Video.html", 428 | ] 429 | 430 | if os.path.exists(params[0]): 431 | os.remove(params[0]) 432 | if os.path.exists(params[1]): 433 | os.remove(params[1]) 434 | 435 | run(params) -------------------------------------------------------------------------------- /amazon_is_meta.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import re, sys, string 4 | import bs4,json 5 | 6 | import logging 7 | logger = logging.getLogger() 8 | 9 | def run(params): 10 | arrParams = list(params) 11 | log_path = '' 12 | for index in range(len(arrParams)): 13 | if index==0: 14 | log_path = arrParams[index] 15 | elif index==1: 16 | output_file = arrParams[index] 17 | else: 18 | input_file= arrParams[index] 19 | 20 | # set logging 21 | log_handler = logging.FileHandler(log_path) 22 | log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') 23 | log_handler.setFormatter(log_formatter) 24 | logger.addHandler(log_handler) 25 | logger.setLevel(logging.INFO) 26 | 27 | try: 28 | f = open(input_file,'r',encoding='UTF-8') 29 | webpage = f.read() 30 | f.close() 31 | 32 | amazonSoup = bs4.BeautifulSoup(webpage,'html5lib') 33 | #
34 | 35 | is_meta = False 36 | media_type = 'movie' 37 | 38 | main_element = amazonSoup.find('main',attrs={'id':'main','data-testid':'detailpage-main'}) 39 | if main_element is not None: 40 | e_synopsis = main_element.select('._1wxob_') 41 | if len(e_synopsis)==0: 42 | e_synopsis = main_element.select('._3qsVvm') 43 | 44 | if len(e_synopsis)==0: 45 | e_synopsis = main_element.select('._5tB6mN') 46 | 47 | e_thumb = main_element.select('#atf-full') 48 | if len(e_thumb)==0: 49 | e_thumb = main_element.select('.om7nme') 50 | 51 | if len(e_synopsis)==0 and len(e_thumb)==0: 52 | is_meta = False 53 | else: 54 | is_meta = True 55 | 56 | #[show, movie, is_live, has_live, upcoming] 57 | e_live = amazonSoup.find_all(attrs={'data-automation-id':'live-state-badge'}) 58 | if len(e_live) > 0: 59 | str = e_live[0].string 60 | if str == 'LIVE': 61 | media_type = 'is_live' 62 | elif str == 'UPCOMING': 63 | media_type = 'upcoming' 64 | else: 65 | media_type = 'has_live' 66 | else: 67 | e_episodes = amazonSoup.find_all(class_='XR0d6P') 68 | if not e_episodes: 69 | e_episodes = amazonSoup.find_all(class_='GG33WY') 70 | if len(e_episodes) > 0: 71 | media_type = 'show' 72 | else: 73 | media_type = 'movie' 74 | 75 | dict = { 76 | 'is_meta':is_meta, 77 | 'media_type':media_type 78 | } 79 | 80 | with open(output_file,"w", encoding='UTF-8') as f: 81 | json.dump(dict,f) 82 | logging.info("amazon_is_meta complete...") 83 | 84 | except Exception as e: 85 | logger.exception(str(e)) 86 | 87 | import os 88 | if __name__ == '__main__': 89 | 90 | params = [ 91 | "./log/python.log", 92 | "./result/is_meta.json", 93 | "./html/Live/live_Amazon.com_ Wu-Tang Clan & Nas_ NY State of Mind Tour at Climate Pledge Arena _ Movies & TV.html", 94 | #"./html/Watch Meg 2_ The Trench _ Prime Video.html", 95 | ] 96 | 97 | if os.path.exists(params[0]): 98 | os.remove(params[0]) 99 | if os.path.exists(params[1]): 100 | os.remove(params[1]) 101 | 102 | run(params) -------------------------------------------------------------------------------- /funi_request.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import logging 4 | def hello(): 5 | print('hello') 6 | pass 7 | 8 | def api_request(id, token, output_file): 9 | 10 | # logging.basicConfig(level=logging.INFO, 11 | # format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s', 12 | # datefmt='%m-%d %H:%M', 13 | # filename='e:\\myapp.log', 14 | # filemode='w') 15 | 16 | # console = logging.StreamHandler() 17 | # console.setLevel(logging.INFO) 18 | # # set a format which is simpler for console use 19 | # formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s') 20 | # # tell the handler to use this format 21 | # console.setFormatter(formatter) 22 | # logging.getLogger('').addHandler(console) 23 | 24 | hdr = { 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:70.0) Gecko/20100101 Firefox/70.0' } 25 | hdr['devicetype'] = 'Android Phone' 26 | hdr['Authorization'] = 'Token {}'.format(token) 27 | 28 | url= 'https://prod-api-funimationnow.dadcdigital.com/api/source/catalog/video/{}/signed'.format(id) 29 | print('{}'.format(url)) 30 | r= requests.get(url, headers=hdr) 31 | try: 32 | x = r.json() 33 | with open(output_file, "wb") as file: 34 | file.write(bytes(json.dumps(x), encoding='utf-8')) 35 | except: 36 | return None 37 | 38 | if __name__ == '__main__': 39 | # id = '1399964' 40 | id = '1398850' 41 | outputdir = 'C:\\Users\\fab\\AppData\\Local\\Temp\\' 42 | outputfile='{}{}.json'.format(outputdir, id) 43 | api_request(id, '5b506b51f20f11512db78d1ee944dad001a5d398', outputfile) 44 | -------------------------------------------------------------------------------- /getCookie.py: -------------------------------------------------------------------------------- 1 | # uncompyle6 version 3.9.1 2 | # Python bytecode version base 3.7.0 (3394) 3 | # Decompiled from: Python 3.9.6 (default, Jun 27 2024, 17:58:20) 4 | # [GCC 4.8.5 20150623 (Red Hat 4.8.5-44)] 5 | # Embedded file name: getCookie.py 6 | 7 | import os, re, sys, json, base64, sqlite3, random, configparser, logging, logging.config 8 | from win32crypt import CryptUnprotectData 9 | from cryptography.hazmat.primitives.ciphers.aead import AESGCM 10 | from abc import abstractmethod, ABCMeta 11 | NODE_ELK_TOKEN_ID = "elk_token_id" 12 | NODE_PREFIX = "_YY_TID_" 13 | NODE_WEB_TOKEN = "web_token" 14 | NODE_ITEM = "item" 15 | NODE_BROWSER = "browser" 16 | NODE_DOMAIN = "domain" 17 | NODE_TYPE = "type" 18 | VALUE_OLD = "old" 19 | VALUE_NEW = "new" 20 | CONFIG_NODE_PROFILE = "Profile" 21 | CONFIG_KEY_PATH = "path" 22 | 23 | class CookieParser(metaclass=ABCMeta): 24 | 25 | @abstractmethod 26 | def get_elk_data(self, host): 27 | pass 28 | 29 | 30 | class CookieParserBase(CookieParser): 31 | 32 | def get_elk_data(self, host_list): 33 | """ 34 | 从cookie数据中解析elk数据 35 | :param host: 网站域名,e.g. dvdfab.cn 36 | :return: 返回包含网站elk数据的字典 37 | """ 38 | host_elk_data = list() 39 | for host in host_list: 40 | cookie_data_list = self.get_cookie_data(host) 41 | for cookie_data in cookie_data_list: 42 | if NODE_ELK_TOKEN_ID in cookie_data: 43 | elk_old_data = dict() 44 | elk_old_data[NODE_BROWSER] = self.get_browser_name() 45 | elk_old_data[NODE_ITEM] = cookie_data[NODE_ELK_TOKEN_ID] 46 | elk_old_data[NODE_DOMAIN] = host 47 | elk_old_data[NODE_TYPE] = VALUE_OLD 48 | host_elk_data.append(elk_old_data) 49 | keys = cookie_data.keys() 50 | for key in keys: 51 | if key.startswith(NODE_PREFIX): 52 | data = key.split("_", 3) 53 | if len(data) == 4: 54 | elk_new_data = dict() 55 | elk_new_data[NODE_BROWSER] = self.get_browser_name() 56 | elk_new_data[NODE_ITEM] = data[3] 57 | elk_new_data[NODE_DOMAIN] = host 58 | elk_new_data[NODE_TYPE] = VALUE_NEW 59 | host_elk_data.append(elk_new_data) 60 | elif re.match("_YY_V\\d{1,2}_TID_", key): 61 | data = key.split("_", 4) 62 | if len(data) == 5: 63 | elk_new_data = dict() 64 | elk_new_data[NODE_BROWSER] = self.get_browser_name() 65 | elk_new_data[NODE_ITEM] = data[4] 66 | elk_new_data[NODE_DOMAIN] = host 67 | elk_new_data[NODE_TYPE] = data[2].lower() 68 | host_elk_data.append(elk_new_data) 69 | 70 | return host_elk_data 71 | 72 | @abstractmethod 73 | def get_cookie_data(self, host): 74 | """ 75 | 获取host对应的cookie数据 76 | :param host: 网站域名,e.g. dvdfab.cn 77 | :return: 返回包含网站cookie数据的字典的list 78 | """ 79 | pass 80 | 81 | @abstractmethod 82 | def get_sql(self, host): 83 | pass 84 | 85 | @abstractmethod 86 | def get_browser_name(self): 87 | pass 88 | 89 | have_log = False 90 | 91 | 92 | class CookieParserFireFox(CookieParserBase): 93 | __doc__ = "从profiles.ini中获取cookies目录名称" 94 | 95 | def get_cookies_path_from_profile(self): 96 | profile_path_list = list() 97 | mozilla_profile = os.path.join(os.getenv("APPDATA"), "Mozilla\\Firefox") 98 | mozilla_profile_ini = os.path.join(mozilla_profile, "profiles.ini") 99 | if not os.path.exists(mozilla_profile_ini): 100 | logger.warning("Profile.ini file is not found.") 101 | return profile_path_list 102 | try: 103 | profile = configparser.ConfigParser() 104 | profile.read(mozilla_profile_ini) 105 | for i in range(10): 106 | profile_section = CONFIG_NODE_PROFILE + str(i) 107 | if profile.has_section(profile_section): 108 | data_path = os.path.normpath(os.path.join(mozilla_profile, profile.get(profile_section, "Path"))) 109 | cookie_path = os.path.join(data_path, "cookies.sqlite") 110 | profile_path_list.append(cookie_path) 111 | 112 | except Exception as e: 113 | try: 114 | logger.warning("Read profiles.ini failed {}".format(e)) 115 | finally: 116 | e = None 117 | del e 118 | 119 | return profile_path_list 120 | 121 | def get_cookie_path(self): 122 | """ 123 | 获取浏览器cookies文件路径 124 | :return: cookie文件全路径 125 | """ 126 | s_cookiepath_common = os.environ["APPDATA"] + "\\Mozilla\\Firefox\\Profiles" 127 | profile_path_list = list() 128 | if not os.path.exists(s_cookiepath_common): 129 | return profile_path_list 130 | profile_path_list = self.get_cookies_path_from_profile() 131 | if len(profile_path_list) == 0: 132 | l_folds_arr = os.listdir(s_cookiepath_common) 133 | l_folds_end = [os.path.splitext(s_file)[-1][1[:None]] for s_file in l_folds_arr] 134 | if "default-release" in l_folds_end: 135 | cookie_fold_index = l_folds_end.index("default-release") 136 | else: 137 | if "default" in l_folds_end: 138 | cookie_fold_index = l_folds_end.index("default") 139 | else: 140 | if random.randint(0, 100) == 50: 141 | logger.error("Cannot find default folder: {}".format(str(l_folds_end))) 142 | cookie_fold_index = l_folds_end.index("default") 143 | else: 144 | logger.error("Cannot find default folder: {}".format(str(l_folds_end))) 145 | return profile_path_list 146 | cookie_fold = l_folds_arr[cookie_fold_index] 147 | cookie_path = os.path.join(s_cookiepath_common, cookie_fold) 148 | cookie_path = os.path.join(cookie_path, "cookies.sqlite") 149 | profile_path_list.append(cookie_path) 150 | return profile_path_list 151 | 152 | def get_sql(self, host): 153 | sql = "select host,name,value from moz_cookies where host='%s'" % host 154 | return sql 155 | 156 | def get_browser_name(self): 157 | return "FileFox" 158 | 159 | def get_cookie_data(self, host): 160 | cookie_data_list = list() 161 | cookie_path_list = self.get_cookie_path() 162 | for cookie_path in cookie_path_list: 163 | if not os.path.exists(cookie_path): 164 | continue 165 | if self.have_log == False: 166 | logger.info("Found cookies file, {}".format(cookie_path)) 167 | sql = self.get_sql(host) 168 | try: 169 | with sqlite3.connect(cookie_path) as conn: 170 | cur = conn.cursor() 171 | d_cookie = dict() 172 | res = cur.execute(sql).fetchall() 173 | for host_key, name, value in res: 174 | if name == "miniDialog": 175 | continue 176 | d_cookie[name] = value 177 | 178 | if d_cookie: 179 | cookie_data_list.append(d_cookie) 180 | except Exception as e: 181 | try: 182 | logger.warning("occur exception: {}".format(e)) 183 | finally: 184 | e = None 185 | del e 186 | 187 | self.have_log = True 188 | return cookie_data_list 189 | 190 | 191 | class CookieParserGoogleChome(CookieParserBase): 192 | __doc__ = "负责解析google chrome 浏览器cookie数据" 193 | 194 | def get_local_state(self): 195 | """ 196 | 获取Local State文件路径 197 | :return: 文件全路径 198 | """ 199 | s_path = "\\Google\\Chrome\\User Data\\Local State" 200 | s_local_state = os.environ["LOCALAPPDATA"] + s_path 201 | return s_local_state 202 | 203 | def get_cookie_path(self): 204 | """ 205 | s_browser_name: 浏览器名称(Google Chrome, Microsoft Edge) 206 | s_host: 域名 例如: '.umeng.com' 207 | """ 208 | s_cookie = "\\Google\\Chrome\\User Data\\Default\\Network\\Cookies" 209 | s_cookie_path = os.environ["LOCALAPPDATA"] + s_cookie 210 | return s_cookie_path 211 | 212 | def get_sql(self, host): 213 | sql = "select host_key,name,encrypted_value from cookies where host_key='%s'" % host 214 | return sql 215 | 216 | def get_browser_name(self): 217 | return "Chrome" 218 | 219 | def get_cookie_data(self, host): 220 | """ 221 | 根据Local State解析出加密key,根据key解密cookie数据 222 | :param host: 223 | :return: 224 | """ 225 | cookie_data_list = list() 226 | s_cookie_path = self.get_cookie_path() 227 | if not os.path.exists(s_cookie_path): 228 | if self.have_log == False: 229 | logger.warning("The {} cookie files not found, {}".format(self.get_browser_name(), s_cookie_path)) 230 | self.have_log = True 231 | return cookie_data_list 232 | sql = self.get_sql(host) 233 | s_local_state = self.get_local_state() 234 | try: 235 | with sqlite3.connect(s_cookie_path) as conn: 236 | p_cursor = conn.cursor() 237 | p_res = p_cursor.execute(sql).fetchall() 238 | p_cursor.close() 239 | key = self._CookieParserGoogleChome__pull_the_key(self._CookieParserGoogleChome__get_string(s_local_state)) 240 | d_cookie = dict() 241 | for s_host_key, s_name, c_encrypted_value in p_res: 242 | if c_encrypted_value[0[:3]] == b'v10': 243 | d_cookie[s_name] = self._CookieParserGoogleChome__decrypt_string(key, c_encrypted_value) 244 | else: 245 | d_cookie[s_name] = CryptUnprotectData(c_encrypted_value)[1].decode() 246 | 247 | cookie_data_list.append(d_cookie) 248 | except Exception as e: 249 | try: 250 | logger.warning("occur exception: {}".format(e)) 251 | finally: 252 | e = None 253 | del e 254 | 255 | return cookie_data_list 256 | 257 | def __get_string(self, s_local_state): 258 | with open(s_local_state, "r", encoding="utf-8") as f: 259 | s_encrtpted_key = json.load(f)["os_crypt"]["encrypted_key"] 260 | return s_encrtpted_key 261 | 262 | def __pull_the_key(self, base64_encrypted_key): 263 | c_encrypted_key_with_header = base64.b64decode(base64_encrypted_key) 264 | c_encrypted_key = c_encrypted_key_with_header[5[:None]] 265 | c_key = CryptUnprotectData(c_encrypted_key, None, None, None, 0)[1] 266 | return c_key 267 | 268 | def __decrypt_string(self, c_key, c_data): 269 | c_nonce, c_cipherbytes = c_data[3[:15]], c_data[15[:None]] 270 | aesgcm = AESGCM(c_key) 271 | c_plainbytes = aesgcm.decrypt(c_nonce, c_cipherbytes, None) 272 | s_plaintext = c_plainbytes.decode("utf-8") 273 | return s_plaintext 274 | 275 | 276 | class CoolkieParserEdge(CookieParserGoogleChome): 277 | __doc__ = "负责解析Edge浏览器cookie数据 " 278 | 279 | def get_cookie_path(self): 280 | s_cookie = "\\Microsoft\\Edge\\User Data\\Default\\Network\\Cookies" 281 | s_cookie_path = os.environ["LOCALAPPDATA"] + s_cookie 282 | return s_cookie_path 283 | 284 | def get_local_state(self): 285 | s_path = "\\Microsoft\\Edge\\User Data\\Local State" 286 | s_local_state = os.environ["LOCALAPPDATA"] + s_path 287 | return s_local_state 288 | 289 | def get_browser_name(self): 290 | return "Edge" 291 | 292 | 293 | def get_elk_data(host_list): 294 | """收集所有浏览器的elk数据""" 295 | data_dict = {'google_chrome':CookieParserGoogleChome(), 296 | 'edge':CoolkieParserEdge(), 297 | 'firefox':CookieParserFireFox()} 298 | total_data_dict = {} 299 | host_data_list = list() 300 | for key in data_dict: 301 | cookie_parser = data_dict.get(key) 302 | logger.info("Start get data for {} ...".format(key)) 303 | host_data_list += cookie_parser.get_elk_data(host_list) 304 | logger.info("End get data for {} ...".format(key)) 305 | 306 | total_data_dict[NODE_WEB_TOKEN] = host_data_list 307 | return total_data_dict 308 | 309 | 310 | def save_elk_data(data_dict, file): 311 | try: 312 | json_string = json.dumps(data_dict) 313 | json_string = base64.b64encode(json_string.encode("utf-8")) 314 | with open(file, "wb") as f: 315 | f.write(json_string) 316 | except Exception as e: 317 | try: 318 | logging.error("save file exception: {}".format(e)) 319 | finally: 320 | e = None 321 | del e 322 | 323 | 324 | def init_logging(): 325 | global logger 326 | logFilename = os.environ["TEMP"] + "\\devcon.log" 327 | logging.basicConfig(level=(logging.DEBUG), 328 | format="%(asctime)s-%(levelname)s-%(message)s", 329 | datefmt="%y-%m-%d %H:%M", 330 | filename=logFilename, 331 | filemode="w+") 332 | filehandler = logging.FileHandler(logFilename, encoding="utf-8") 333 | logging.getLogger().addHandler(filehandler) 334 | logger = logging.getLogger("devcon.log") 335 | 336 | 337 | if __name__ == "__main__": 338 | if len(sys.argv) < 1: 339 | print("Usage: python.exe output_file_path") 340 | sys.exit(1) 341 | init_logging() 342 | logger.info("App version: 2022-07-27-14-55") 343 | logger.info("Param: " + sys.argv[1]) 344 | host_list = [ 345 | '.dvdfab.cn', '.dvdfab.fr', '.dvdfab.at', '.dvdfab.co.jp', '.dvdfab.org', 346 | '.streamfab.com', '.streamfab.jp', '.streamfab.de', '.streamfab.fr', 347 | '.streamfab.tw'] 348 | logger.info("Start get elk data ...") 349 | try: 350 | data_dic = get_elk_data(host_list) 351 | except Exception as e: 352 | try: 353 | logger.error("Get elk data exception: {}".format(e)) 354 | finally: 355 | e = None 356 | del e 357 | 358 | logger.info("Start save elk data ...") 359 | file = sys.argv[1] 360 | save_elk_data(data_dic, file) 361 | logger.info("End") 362 | -------------------------------------------------------------------------------- /get_season_urls.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | import re, sys, string 5 | import bs4 6 | 7 | webpage = ''' 8 | The Website Title 9 | 10 |

Download my Python book from my website.

11 |

Learn Python the easy way!

12 |

By Al Sweigart

13 | 14 | ''' 15 | 16 | def createSrt_vtt(): 17 | exampleSoup = bs4.BeautifulSoup(webpage,'html5lib') 18 | elems = exampleSoup.select('#author') 19 | type(elems) 20 | # print (elems[0].getText()) 21 | # print(webpage) -------------------------------------------------------------------------------- /hbomax_meta_crawler.js: -------------------------------------------------------------------------------- 1 | 2 | function get_seasons_info(json_data, id) { 3 | var json_obj = JSON.parse(json_data) 4 | 5 | var season_num = 0; 6 | for (var i = 0; i < json_obj.length; i++) { 7 | var e_src = json_obj[i].body; 8 | 9 | if (e_src.seasonNumber !== undefined) { 10 | if (e_src.seasonNumber > season_num) { 11 | season_num = e_src.seasonNumber 12 | } 13 | } 14 | // 解决有些单 season 不能获取 tv 的问题 15 | // https://play.hbomax.com/page/urn:hbo:page:GVU2_3QlhmYNJjhsJAWUZ:type:series 16 | if (e_src.numberInSeries !== undefined) { 17 | season_num = 1 18 | } 19 | } 20 | 21 | src_seasons = new Array() 22 | for (var i = 0; i < season_num; i++) { 23 | obj = { 24 | "id": i + 1, 25 | "title": "Season " + String(i + 1), 26 | "release_time": "", 27 | "episodes": [] 28 | } 29 | src_seasons.push(obj) 30 | } 31 | 32 | var all_episodes = new Array() 33 | for (var i = 0; i < json_obj.length; i++) { 34 | 35 | var e_src = json_obj[i].body; 36 | if (e_src.seasonNumber === undefined && e_src.numberInSeries === undefined) { 37 | continue 38 | } 39 | thumbs = e_src.images.tileburnedin 40 | if (thumbs.indexOf("&size") != -1) { 41 | thumbs = thumbs.split('&size')[0] 42 | } 43 | 44 | var e_des = { 45 | "seq": e_src.numberInSeason, 46 | "seasonNumber": e_src.seasonNumber, 47 | "episodeId": String(e_src.numberInSeason), 48 | "runtime": e_src.duration, 49 | "url": 'https://play.hbomax.com/episode/' + e_src.references.viewable, 50 | "synopsis": e_src.titles.full, 51 | "sortInt": e_src.numberInSeason + e_src.seasonNumber, 52 | "title": e_src.titles.full, 53 | "thumbs": { 54 | "url": thumbs 55 | } 56 | } 57 | if (e_src.numberInSeries !== undefined) { 58 | e_des['seasonNumber'] = 1 59 | e_des['seq'] = e_src.numberInSeries 60 | e_des['episodeId'] = e_src.numberInSeries 61 | } 62 | all_episodes.push(e_des) 63 | } 64 | 65 | 66 | var sort_all_episodes = all_episodes.sort(function (a, b) { 67 | return (a.sortInt - b.sortInt); 68 | }) 69 | 70 | for (var i = 0; i < sort_all_episodes.length; i++) { 71 | e = sort_all_episodes[i] 72 | src_seasons[e.seasonNumber - 1].episodes.push(e) 73 | } 74 | 75 | var result = { 76 | "seasons": src_seasons 77 | } 78 | src_seasons_str = JSON.stringify(result) 79 | return src_seasons_str 80 | } 81 | 82 | function get_meta_info(json_data, id) { 83 | if (JSON.parse(json_data)[1].body.details === undefined) { 84 | json_obj = JSON.parse(json_data)[0] 85 | thumbs = json_obj.body.images.tileburnedin 86 | if (thumbs.indexOf("&size") != -1) { 87 | thumbs = thumbs.split('&size')[0] 88 | } 89 | 90 | all_obj = { 91 | "id": json_obj.id, 92 | "title": json_obj.body.titles.full, 93 | "runtime": json_obj.body.duration, 94 | "release_time": json_obj.body.releaseYear, 95 | "synopsis": json_obj.body.summaries.full, 96 | "thumbs": thumbs, 97 | "directors": [], 98 | "casts": [], 99 | "writers": [], 100 | "genres": [json_obj.body.ratingCode], 101 | "moodTags": [] 102 | } 103 | return JSON.stringify(all_obj) 104 | } else { 105 | json_obj = JSON.parse(json_data)[1] 106 | thumbs = json_obj.body.details.image.uri 107 | if (thumbs.indexOf("&size") != -1) { 108 | thumbs = thumbs.split('&size')[0] 109 | } 110 | 111 | all_obj = { 112 | "id": id, 113 | "title": json_obj.body.details.title, 114 | "runtime": "", 115 | "release_time": "", 116 | "synopsis": json_obj.body.details.description, 117 | "thumbs": thumbs, 118 | "directors": [], 119 | "casts": [], 120 | "writers": [], 121 | "genres": [], 122 | "moodTags": [] 123 | } 124 | return JSON.stringify(all_obj) 125 | } 126 | } -------------------------------------------------------------------------------- /log.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import logging 3 | import datetime 4 | from pathlib import Path 5 | 6 | 7 | def setup_logger(name: str, write_to_file: bool = False) -> logging.Logger: 8 | formatter = logging.Formatter('%(asctime)s %(name)s %(filename)s %(lineno)s : %(levelname)s %(message)s') 9 | log_time = datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S") 10 | if getattr(sys, 'frozen', False): 11 | log_folder_path = Path(sys.executable).parent / 'logs' 12 | else: 13 | log_folder_path = Path(__file__).parent.parent / 'logs' 14 | if log_folder_path.exists() is False: 15 | log_folder_path.mkdir() 16 | 17 | ch = logging.StreamHandler() 18 | ch.setLevel(logging.DEBUG) 19 | ch.setFormatter(formatter) 20 | lt = logging.getLogger(f'{name}') 21 | lt.setLevel(logging.DEBUG) 22 | lt.addHandler(ch) 23 | if write_to_file: 24 | log_file_path = log_folder_path / f'{name}-{log_time}.log' 25 | fh = logging.FileHandler(log_file_path.resolve().as_posix(), encoding='utf-8') 26 | fh.setLevel(logging.DEBUG) 27 | fh.setFormatter(formatter) 28 | lt.addHandler(fh) 29 | lt.info(f'log file -> {log_file_path}') 30 | return lt 31 | 32 | 33 | log = setup_logger('pyshaka') -------------------------------------------------------------------------------- /netflix_meta_crawler.js: -------------------------------------------------------------------------------- 1 | function get_seasons_info(json_data, id) { 2 | let json_obj = JSON.parse(json_data) 3 | let seasons_src = json_obj.video.seasons 4 | let seasons_des = new Array() 5 | 6 | for (let index = 0; index < seasons_src.length; index++) { 7 | const s_src = seasons_src[index]; 8 | let episodes_des = new Array() 9 | for (let j = 0; j < seasons_src[index].episodes.length; j++) { 10 | const e_src = seasons_src[index].episodes[j]; 11 | let e_des = { 12 | "seq": e_src.seq, 13 | "episodeId": String(e_src.episodeId), 14 | "url": 'https://www.netflix.com/watch/' + String(e_src.episodeId), 15 | "synopsis": e_src.synopsis, 16 | "title": e_src.title, 17 | "runtime": e_src.runtime, 18 | "thumbs": { 19 | "url": e_src.thumbs[0].url 20 | } 21 | } 22 | episodes_des.push(e_des) 23 | } 24 | let s_des = { 25 | "id": String(s_src.id), 26 | "release_time": String(s_src.year), 27 | "episodes": episodes_des, 28 | "seq": s_src.seq 29 | } 30 | 31 | seasons_des.push(s_des) 32 | } 33 | let all_obj = { 34 | "seasons": seasons_des 35 | } 36 | all_json = JSON.stringify(all_obj) 37 | return all_json 38 | } 39 | 40 | function get_meta_info(json_data, id) { 41 | 42 | let json_obj = JSON.parse(json_data) 43 | let video = json_obj.video 44 | let thumbs_url = video.artwork[0].url; 45 | let thumbs = { 46 | 'url': thumbs_url 47 | } 48 | 49 | all_obj = { 50 | "id": video.currentEpisode, 51 | "type": video.type, 52 | "title": video.title, 53 | "runtime": video.runtime, 54 | "release_time": video.year, 55 | "synopsis": video.synopsis, 56 | "thumbs": thumbs, 57 | "directors": [], 58 | "casts": [], 59 | "writers": [], 60 | "genres": [video.rating], 61 | "moodTags": [] 62 | } 63 | all_json = JSON.stringify(all_obj) 64 | return all_json 65 | } -------------------------------------------------------------------------------- /paramount_subtitle_trans.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from pathlib import Path 3 | from datetime import datetime 4 | from argparse import ArgumentParser 5 | 6 | from util.TextParser import TimeContext 7 | from text.Mp4VttParser import Mp4VttParser 8 | from text.Mp4TtmlParser import Mp4TtmlParser 9 | from text.Cue import Cue 10 | from log import log 11 | 12 | 13 | class CmdArgs: 14 | def __init__(self): 15 | self.debug = None # type: bool 16 | self.type = None # type: str 17 | self.timescale = None # type: int 18 | self.init_path = None # type: str 19 | self.segments_path = None # type: str 20 | self.segment_time = None # type: float 21 | 22 | 23 | def command_handler(args: CmdArgs): 24 | ''' 25 | 对命令参数进行校验和修正 26 | ''' 27 | assert args.type in ['wvtt', 'ttml'], f'not support {args.type} now' 28 | args.timescale = int(args.timescale) 29 | if args.init_path: 30 | args.init_path = args.init_path.strip() 31 | args.segments_path = args.segments_path.strip() 32 | args.segment_time = float(args.segment_time) 33 | 34 | 35 | def loop_nestedCues(lines: List[str], nestedCues: List[Cue], index: int, segment_time: float): 36 | payload = '' 37 | for cue in nestedCues: 38 | if len(cue.nestedCues) > 0: 39 | loop_nestedCues(lines, cue.nestedCues, index, segment_time) 40 | if cue.payload != '': 41 | if payload == '': 42 | payload = cue.payload 43 | else: 44 | payload = f'{payload} {cue.payload}' 45 | # 这里突然想不起注释掉的原因了 好像是会重复... 46 | # lines.append(cue) 47 | cue = nestedCues[0] 48 | payload = payload 49 | if payload != '': 50 | cue.payload = payload 51 | cue.startTime += segment_time * index 52 | cue.endTime += segment_time * index 53 | lines.append(cue) 54 | 55 | 56 | def compare(cue: Cue): 57 | return cue.startTime 58 | 59 | 60 | # def compare(cue1: Cue, cue2: Cue): 61 | # if cue1.startTime < cue2.startTime: 62 | # return -1 63 | # if cue1.startTime > cue2.startTime: 64 | # return 1 65 | # return 0 66 | 67 | 68 | def gentm(tm: float): 69 | return datetime.utcfromtimestamp(tm).strftime('%H:%M:%S.%f')[:-3] 70 | 71 | 72 | def test_parse_mp4vtt(): 73 | mp4vttparser = Mp4VttParser() 74 | vttInitSegment = Path("test/assets/vtt-init.mp4").read_bytes() 75 | mp4vttparser.parseInit(vttInitSegment) 76 | vttSegment = Path("test/assets/vtt-segment.mp4").read_bytes() 77 | timecontext = TimeContext(**{'periodStart': 0, 'segmentStart': 0, 'segmentEnd': 0}) 78 | mp4vttparser.parseMedia(vttSegment, timecontext) 79 | 80 | 81 | def parse(args: CmdArgs): 82 | if args.type == 'wvtt': 83 | parser = Mp4VttParser() 84 | elif args.type == 'ttml': 85 | parser = Mp4TtmlParser() 86 | else: 87 | assert 1 == 0, 'never should be here' 88 | if args.init_path: 89 | init_path = Path(args.init_path) 90 | parser.parseInit(init_path.read_bytes()) 91 | else: 92 | parser.set_timescale(args.timescale) 93 | segments_path = Path(args.segments_path) 94 | time = TimeContext(**{'periodStart': 0, 'segmentStart': 0, 'segmentEnd': 0}) 95 | index = 0 96 | cues = [] 97 | for segment_path in segments_path.iterdir(): 98 | if segment_path.is_dir(): 99 | if args.debug: 100 | log.debug(f'{segment_path} is not a file, skip it') 101 | continue 102 | if segment_path.suffix not in ['.mp4', '.m4s', '.dash', '.ts']: 103 | if args.debug: 104 | log.debug(f"{segment_path} suffix is not in ['.mp4', '.m4s', '.dash', '.ts'], skip it") 105 | continue 106 | if args.init_path and segment_path.name == init_path.name: 107 | if args.debug: 108 | log.debug(f"{segment_path} is init_path , skip it") 109 | continue 110 | if args.debug: 111 | log.debug(f'start parseMedia for {segment_path}') 112 | _cues = parser.parseMedia(segment_path.read_bytes(), time) 113 | 114 | for cue in _cues: 115 | cue.file = segment_path.name 116 | if len(cue.nestedCues) > 0: 117 | loop_nestedCues(cues, cue.nestedCues, index, args.segment_time) 118 | if cue.payload != '': 119 | cue.startTime += args.segment_time * index 120 | cue.endTime += args.segment_time * index 121 | cues.append(cue) 122 | index += 1 123 | # 按Cue.startTime从小到大排序 124 | cues.sort(key=compare) 125 | if args.debug: 126 | log.debug(f'cues count {len(cues)}') 127 | assert len(cues) > 0, 'ohh, it is a bug...' 128 | # 去重 129 | # 1. 如果当前行的endTime等于下一行的startTime 并且下一行内容与当前行相同 取下一行的endTime作为当前行的endTime 然后去除下一行 130 | # 2. 否则将下一行作为当前行 再次进行比较 直到比较结束 131 | offset = 0 132 | cues_fix = [] # type: List[Cue] 133 | cue = cues[offset] 134 | while offset < len(cues) - 1: 135 | offset += 1 136 | # 跳过空的行 137 | next_cue = cues[offset] 138 | if cue.payload == '': 139 | cue = next_cue 140 | continue 141 | if cue.payload == next_cue.payload and cue.endTime == next_cue.startTime: 142 | cue.endTime = next_cue.endTime 143 | else: 144 | cues_fix.append(cue) 145 | cue = next_cue 146 | # 最后一行也不能掉 147 | next_cue = cues[offset] 148 | if cue.payload == next_cue.payload and cue.endTime == next_cue.startTime: 149 | cue.endTime = next_cue.endTime 150 | else: 151 | cues_fix.append(cue) 152 | cue = next_cue 153 | if args.debug: 154 | log.debug(f'after reduce duplicated lines, now lines count is {len(cues_fix)}') 155 | # 先用列表放内容 最后join 156 | contents = ["WEBVTT"] # type: List[str] 157 | for cue in cues_fix: 158 | settings = cue._settings 159 | if settings != '': 160 | settings = ' ' + settings 161 | contents.append(f'{gentm(cue.startTime)} --> {gentm(cue.endTime)}{settings}\n{cue.payload}') 162 | content = '\n\n'.join(contents) 163 | segments_path.with_suffix(".vtt").write_text(content, encoding='utf-8') 164 | log.info(f'{len(cues_fix)} lines of subtitle was founded. (*^▽^*)') 165 | log.info(f'write to {segments_path.with_suffix(".vtt").resolve()}') 166 | 167 | 168 | def transVtt(infile , inpath): 169 | parser = Mp4VttParser() 170 | init_path = Path(infile) 171 | parser.parseInit(init_path.read_bytes()) 172 | segments_path = Path(inpath) 173 | 174 | time = TimeContext(**{'periodStart': 0, 'segmentStart': 0, 'segmentEnd': 0}) 175 | index = 0 176 | cues = [] 177 | for segment_path in segments_path.iterdir(): 178 | if segment_path.is_dir(): 179 | continue 180 | if segment_path.suffix not in ['.mp4', '.m4s', '.dash', '.ts']: 181 | continue 182 | _cues = parser.parseMedia(segment_path.read_bytes(), time) 183 | 184 | for cue in _cues: 185 | segment_time = 0 186 | cue.file = segment_path.name 187 | if len(cue.nestedCues) > 0: 188 | loop_nestedCues(cues, cue.nestedCues, index, segment_time) 189 | if cue.payload != '': 190 | cue.startTime += segment_time * index 191 | cue.endTime += segment_time * index 192 | cues.append(cue) 193 | index += 1 194 | # 按Cue.startTime从小到大排序 195 | cues.sort(key=compare) 196 | assert len(cues) > 0, 'ohh, it is a bug...' 197 | # 去重 198 | # 1. 如果当前行的endTime等于下一行的startTime 并且下一行内容与当前行相同 取下一行的endTime作为当前行的endTime 然后去除下一行 199 | # 2. 否则将下一行作为当前行 再次进行比较 直到比较结束 200 | offset = 0 201 | cues_fix = [] # type: List[Cue] 202 | cue = cues[offset] 203 | while offset < len(cues) - 1: 204 | offset += 1 205 | # 跳过空的行 206 | next_cue = cues[offset] 207 | if cue.payload == '': 208 | cue = next_cue 209 | continue 210 | if cue.payload == next_cue.payload and cue.endTime == next_cue.startTime: 211 | cue.endTime = next_cue.endTime 212 | else: 213 | cues_fix.append(cue) 214 | cue = next_cue 215 | # 最后一行也不能掉 216 | next_cue = cues[offset] 217 | if cue.payload == next_cue.payload and cue.endTime == next_cue.startTime: 218 | cue.endTime = next_cue.endTime 219 | else: 220 | cues_fix.append(cue) 221 | cue = next_cue 222 | # 先用列表放内容 最后join 223 | contents = ["WEBVTT"] # type: List[str] 224 | for cue in cues_fix: 225 | settings = cue._settings 226 | if settings != '': 227 | settings = ' ' + settings 228 | contents.append(f'{gentm(cue.startTime)} --> {gentm(cue.endTime)}{settings}\n{cue.payload}') 229 | content = '\n\n'.join(contents) 230 | segments_path.with_suffix(".vtt").write_text(content, encoding='utf-8') 231 | log.info(f'{len(cues_fix)} lines of subtitle was founded. (*^▽^*)') 232 | log.info(f'write to {segments_path.with_suffix(".vtt").resolve()}') 233 | 234 | 235 | def main(): 236 | 237 | parser = ArgumentParser( 238 | prog='dash-subtitle-extractor', 239 | usage='python -m main [OPTION]...', 240 | description='A tool that to parse subtitle embedded in DASH stream', 241 | add_help=True, 242 | ) 243 | parser.add_argument('-debug', '--debug', action='store_true', help='debug is needed') 244 | parser.add_argument('-type', '--type', choices=['wvtt', 'ttml'], help='subtitle codec, only support wvtt and ttml now') 245 | parser.add_argument('-timescale', '--timescale', default='1000', help='set timescale manually if no init segment') 246 | parser.add_argument('-init-path', '--init-path', help='init segment path') 247 | parser.add_argument('-segments-path', '--segments-path', help='segments folder path') 248 | parser.add_argument('-segment-time', '--segment-time', default='0', help='single segment duration, usually needed for ttml content, calculation method: d / timescale') 249 | args = parser.parse_args() # type: CmdArgs 250 | command_handler(args) 251 | parse(args) 252 | # python -m main --init-path "test/dashvtt_subtitle_WVTT_zh-TW/init.mp4" --segments-path "test/dashvtt_subtitle_WVTT_zh-TW" 253 | 254 | 255 | if __name__ == '__main__': 256 | main() -------------------------------------------------------------------------------- /srtConvert.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import codecs 3 | import math 4 | import os 5 | import re 6 | 7 | 8 | SUPPORTED_EXTENSIONS = [".xml", ".vtt"] 9 | 10 | 11 | def leading_zeros(value, digits=2): 12 | value = "000000" + str(value) 13 | return value[-digits:] 14 | 15 | 16 | def convert_time(raw_time): 17 | if int(raw_time) == 0: 18 | return "{}:{}:{},{}".format(0, 0, 0, 0) 19 | 20 | ms = '000' 21 | if len(raw_time) > 4: 22 | ms = leading_zeros(int(raw_time[:-4]) % 1000, 3) 23 | time_in_seconds = int(raw_time[:-7]) if len(raw_time) > 7 else 0 24 | second = leading_zeros(time_in_seconds % 60) 25 | minute = leading_zeros(int(math.floor(time_in_seconds / 60)) % 60) 26 | hour = leading_zeros(int(math.floor(time_in_seconds / 3600))) 27 | return "{}:{}:{},{}".format(hour, minute, second, ms) 28 | 29 | 30 | def xml_id_display_align_before(text): 31 | """ 32 | displayAlign="before" means the current sub will be displayed on top. 33 | That is and not at bottom. We check what's the xml:id associated to it 34 | to have an {\an8} position tag in the output file. 35 | """ 36 | align_before_re = re.compile(u'') 37 | has_align_before = re.search(align_before_re, text) 38 | if has_align_before: 39 | return has_align_before.group(1) 40 | return u"" 41 | 42 | 43 | def xml_get_cursive_style_ids(text): 44 | style_section = re.search("(.*)", text, flags=re.DOTALL) 45 | if not style_section: 46 | return [] 47 | style_ids_re = re.compile( 48 | ' ") 63 | if len(times[0]) == 9: 64 | times = ["00:" + t for t in times] 65 | return "{} --> {}".format(times[0], times[1].split(" ")[0]) 66 | 67 | 68 | def vtt_to_srt(text): 69 | if not text.startswith(u"\ufeffWEBVTT") and not text.startswith(u"WEBVTT"): 70 | raise Exception(".vtt format must start with WEBVTT, wrong file?") 71 | 72 | lines = [] 73 | current_sub_line = [] 74 | for line in text.split("\n"): 75 | if current_sub_line: 76 | current_sub_line.append(line) 77 | if not line: 78 | lines.append("\n".join(current_sub_line) + "\n") 79 | current_sub_line = [] 80 | 81 | elif " --> " in line: 82 | current_sub_line = [convert_vtt_time(line)] 83 | if current_sub_line: 84 | lines.append("\n".join(current_sub_line)) 85 | 86 | return "".join((u"{}\n{}".format(i, l) for i, l in enumerate(lines, 1))) 87 | 88 | 89 | def xml_to_srt(text): 90 | def append_subs(start, end, prev_content, format_time): 91 | subs.append({ 92 | "start_time": convert_time(start) if format_time else start, 93 | "end_time": convert_time(end) if format_time else end, 94 | "content": u"\n".join(prev_content), 95 | }) 96 | 97 | display_align_before = xml_id_display_align_before(text) 98 | begin_re = re.compile(u"\s*

(.*)

') 107 | 108 | # some span tags are used for italics, we'll replace them by and , 109 | # which is the standard for .srt files. We ignore all other uses. 110 | cursive_ids = xml_get_cursive_style_ids(text) 111 | span_start_re = re.compile(u'()+') 112 | span_id_re = re.compile(u'()+') 113 | span_end_re = re.compile(u'()+') 114 | br_re = re.compile(u'()+') 115 | fmt_t = True 116 | for s in sub_lines: 117 | span_start_tags = re.search(span_start_re, s) 118 | srt_cursive = u"" 119 | if span_start_tags: 120 | span_id = re.search(span_id_re, s) 121 | srt_cursive = u"" if span_id.groups()[1] in cursive_ids else u"" 122 | s = srt_cursive.join(s.split(span_start_tags.groups()[0])) 123 | 124 | string_region_re = r'(.*)

' 125 | s = re.sub(string_region_re, r'{\\an8}\2

', s) 126 | content = re.search(content_re, s).group(1) 127 | 128 | br_tags = re.search(br_re, content) 129 | if br_tags: 130 | content = u"\n".join(content.split(br_tags.group())) 131 | 132 | span_end_tags = re.search(span_end_re, content) 133 | if span_end_tags: 134 | srt_cursive = u"
" if srt_cursive else u"" 135 | content = srt_cursive.join(content.split(span_end_tags.group())) 136 | 137 | prev_start = prev_time["start"] 138 | start = re.search(start_re, s).group(1) 139 | end = re.search(end_re, s).group(1) 140 | if len(start.split(":")) > 1: 141 | fmt_t = False 142 | start = start.replace(".", ",") 143 | end = end.replace(".", ",") 144 | if (prev_start == start and prev_time["end"] == end) or not prev_start: 145 | # Fix for multiple lines starting at the same time 146 | prev_time = {"start": start, "end": end} 147 | prev_content.append(content) 148 | continue 149 | append_subs(prev_time["start"], prev_time["end"], prev_content, fmt_t) 150 | prev_time = {"start": start, "end": end} 151 | prev_content = [content] 152 | append_subs(start, end, prev_content, fmt_t) 153 | 154 | lines = (u"{}\n{} --> {}\n{}\n".format( 155 | s + 1, subs[s]["start_time"], subs[s]["end_time"], subs[s]["content"]) 156 | for s in range(len(subs))) 157 | return u"\n".join(lines) 158 | 159 | def createSrt_xml(inPath, outPath): 160 | with codecs.open(inPath, 'rb', "utf-8") as f: 161 | text = f.read() 162 | with codecs.open(outPath, 'wb', "utf-8") as f: 163 | f.write(to_srt(text, ".xml")) 164 | 165 | def createSrt_vtt(inPath, outPath): 166 | with codecs.open(inPath, 'rb', "utf-8") as f: 167 | text = f.read() 168 | with codecs.open(outPath, 'wb', "utf-8") as f: 169 | f.write(to_srt(text, ".vtt")) 170 | 171 | 172 | if __name__ == '__main__': 173 | main() 174 | -------------------------------------------------------------------------------- /text/Cue.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | 4 | class positionAlign(Enum): 5 | LEFT = 'line-left' 6 | RIGHT = 'line-right' 7 | CENTER = 'center' 8 | AUTO = 'auto' 9 | 10 | 11 | class textAlign(Enum): 12 | LEFT = 'left' 13 | RIGHT = 'right' 14 | CENTER = 'center' 15 | START = 'start' 16 | END = 'end' 17 | 18 | 19 | class displayAlign(Enum): 20 | BEFORE = 'before' 21 | CENTER = 'center' 22 | AFTER = 'after' 23 | 24 | 25 | class direction(Enum): 26 | HORIZONTAL_LEFT_TO_RIGHT = 'ltr' 27 | HORIZONTAL_RIGHT_TO_LEFT = 'rtl' 28 | 29 | 30 | class writingMode(Enum): 31 | HORIZONTAL_TOP_TO_BOTTOM = 'horizontal-tb' 32 | VERTICAL_LEFT_TO_RIGHT = 'vertical-lr' 33 | VERTICAL_RIGHT_TO_LEFT = 'vertical-rl' 34 | 35 | 36 | class lineInterpretation(Enum): 37 | LINE_NUMBER = 0 38 | PERCENTAGE = 1 39 | 40 | 41 | class lineAlign(Enum): 42 | CENTER = 'center' 43 | START = 'start' 44 | END = 'end' 45 | 46 | 47 | class defaultTextColor(Enum): 48 | white = '#FFF' 49 | lime = '#0F0' 50 | cyan = '#0FF' 51 | red = '#F00' 52 | yellow = '#FF0' 53 | magenta = '#F0F' 54 | blue = '#00F' 55 | black = '#000' 56 | 57 | 58 | class defaultTextBackgroundColor(Enum): 59 | bg_white = '#FFF' 60 | bg_lime = '#0F0' 61 | bg_cyan = '#0FF' 62 | bg_red = '#F00' 63 | bg_yellow = '#FF0' 64 | bg_magenta = '#F0F' 65 | bg_blue = '#00F' 66 | bg_black = '#000' 67 | 68 | 69 | class fontWeight(Enum): 70 | NORMAL = 400 71 | BOLD = 700 72 | 73 | 74 | class fontStyle(Enum): 75 | NORMAL = 'normal' 76 | ITALIC = 'italic' 77 | OBLIQUE = 'oblique' 78 | 79 | 80 | class textDecoration(Enum): 81 | UNDERLINE = 'underline' 82 | LINE_THROUGH = 'lineThrough' 83 | OVERLINE = 'overline' 84 | 85 | 86 | class Cue: 87 | 88 | def __init__(self, startTime: float, endTime: float, payload: str, _settings: str = ''): 89 | self.startTime = startTime 90 | self.direction = direction.HORIZONTAL_LEFT_TO_RIGHT 91 | self.endTime = endTime 92 | self.payload = payload 93 | self.region = CueRegion() 94 | self.position = None 95 | self.positionAlign = positionAlign.AUTO 96 | self.size = 0 97 | self.textAlign = textAlign.CENTER 98 | self.writingMode = writingMode.HORIZONTAL_TOP_TO_BOTTOM 99 | self.lineInterpretation = lineInterpretation.LINE_NUMBER 100 | self.line = None 101 | self.lineHeight = '' 102 | self.lineAlign = lineAlign.START 103 | self.displayAlign = displayAlign.AFTER 104 | self.color = '' 105 | self.backgroundColor = '' 106 | self.backgroundImage = '' 107 | self.border = '' 108 | self.fontSize = '' 109 | self.fontWeight = fontWeight.NORMAL 110 | self.fontStyle = fontStyle.NORMAL 111 | self.fontFamily = '' 112 | self.letterSpacing = '' 113 | self.linePadding = '' 114 | self.opacity = 1 115 | self.textDecoration = [] 116 | self.wrapLine = True 117 | self.id = '' 118 | self.nestedCues = [] 119 | self.lineBreak = False 120 | self.spacer = False 121 | self.cellResolution = {'columns': 32, 'rows': 15} 122 | self._settings = _settings 123 | 124 | @staticmethod 125 | def lineBreak(start: float, end: float) -> 'Cue': 126 | cue = Cue(start, end, '') 127 | cue.lineBreak = True 128 | return cue 129 | 130 | def clone(self): 131 | cue = Cue(0, 0, '') 132 | for k, v in self.__dict__.items(): 133 | if isinstance(v, list): 134 | v = v.copy() 135 | cue.__setattr__(k, v) 136 | return cue 137 | 138 | @staticmethod 139 | def equal(cue1: 'Cue', cue2: 'Cue') -> bool: 140 | if cue1.startTime != cue2.startTime or cue1.endTime != cue2.endTime or cue1.payload != cue2.payload: 141 | return False 142 | for k, v in cue1.__dict__.items(): 143 | if k == 'startTime' or k == 'endTime' or k == 'payload': 144 | pass 145 | elif k == 'nestedCues': 146 | if not Cue.equal(cue1.nestedCues, cue2.nestedCues): 147 | return False 148 | elif k == 'region' or k == 'cellResolution': 149 | for k2 in cue1.__getattribute__(k): 150 | if cue1.__getattribute__(k)[k2] != cue2.__getattribute__(k)[k2]: 151 | return False 152 | elif isinstance(cue1.__getattribute__(k), list): 153 | if cue1.__getattribute__(k) != cue2.__getattribute__(k): 154 | return False 155 | else: 156 | if cue1.__getattribute__(k) != cue1.__getattribute__(k): 157 | return False 158 | return True 159 | 160 | 161 | class units(Enum): 162 | PX = 0 163 | PERCENTAGE = 1 164 | LINES = 2 165 | 166 | 167 | class scrollMode(Enum): 168 | NONE = '' 169 | UP = 'up' 170 | 171 | 172 | class CueRegion: 173 | 174 | def __init__(self, **kwargs): 175 | self.id = '' 176 | self.viewportAnchorX = 0 177 | self.viewportAnchorY = 0 178 | self.regionAnchorX = 0 179 | self.regionAnchorY = 0 180 | self.width = 100 181 | self.height = 100 182 | self.heightUnits = units.PERCENTAGE 183 | self.widthUnits = units.PERCENTAGE 184 | self.viewportAnchorUnits = units.PERCENTAGE 185 | self.scroll = scrollMode.NONE 186 | -------------------------------------------------------------------------------- /text/Mp4TtmlParser.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from text.Cue import Cue 4 | from text.TtmlTextParser import TtmlTextParser 5 | from util.Mp4Parser import Mp4Parser, ParsedBox 6 | from util.exceptions import InvalidMp4TTML 7 | from util.TextParser import TimeContext 8 | 9 | 10 | class Mp4TtmlParser: 11 | 12 | def __init__(self): 13 | self.parser_ = TtmlTextParser() 14 | 15 | def set_timescale(self, timescale: int): 16 | pass 17 | 18 | def parseInit(self, data: memoryview): 19 | ''' 20 | 这个函数不调用也没什么问题 21 | ''' 22 | def stpp_callback(box: ParsedBox): 23 | nonlocal sawSTPP 24 | sawSTPP = True 25 | box.parser.stop() 26 | 27 | sawSTPP = False 28 | # 初始化解析器 29 | mp4parser = Mp4Parser() 30 | # 给要准备解析的box添加对应的解析函数 后面回调 31 | mp4parser = mp4parser.box('moov', Mp4Parser.children) 32 | mp4parser = mp4parser.box('trak', Mp4Parser.children) 33 | mp4parser = mp4parser.box('mdia', Mp4Parser.children) 34 | mp4parser = mp4parser.box('minf', Mp4Parser.children) 35 | mp4parser = mp4parser.box('stbl', Mp4Parser.children) 36 | mp4parser = mp4parser.fullBox('stsd', Mp4Parser.sampleDescription) 37 | mp4parser = mp4parser.box('stpp', stpp_callback) 38 | # 解析数据 39 | mp4parser = mp4parser.parse(data) 40 | 41 | if not sawSTPP: 42 | raise InvalidMp4TTML(f'is sawSTPP? {sawSTPP}') 43 | 44 | def parseMedia(self, data: memoryview, time: TimeContext, dont_raise: bool = True) -> List[Cue]: 45 | 46 | def mdat_callback(data: bytes): 47 | nonlocal payload 48 | nonlocal sawMDAT 49 | sawMDAT = True 50 | payload.extend(self.parser_.parseMedia(data, time)) 51 | 52 | sawMDAT = False 53 | payload = [] 54 | 55 | mp4parser = Mp4Parser() 56 | mp4parser = mp4parser.box('mdat', Mp4Parser.allData(mdat_callback)) 57 | mp4parser = mp4parser.parse(data, partialOkay=False) 58 | 59 | if not sawMDAT: 60 | if dont_raise: 61 | return payload 62 | else: 63 | raise InvalidMp4TTML(f'is sawMDAT? {sawMDAT}') 64 | return payload -------------------------------------------------------------------------------- /text/Mp4VttParser.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from text.Cue import Cue 3 | # from text.TextEngine import TextEngine 4 | from text.VttTextParser import VttTextParser 5 | from util.DataViewReader import DataViewReader, Endianness 6 | # from util.Error import Error 7 | # from util.Functional import Functional 8 | from util.Mp4Parser import Mp4Parser, ParsedBox 9 | from util.Mp4BoxParsers import Mp4BoxParsers, ParsedTRUNSample 10 | # from util.StringUtils import StringUtils 11 | # from util.TextParser import TextParser 12 | from util.TextParser import TimeContext 13 | from util.exceptions import InvalidMp4VTT 14 | from log import log 15 | 16 | 17 | class Mp4VttParser: 18 | 19 | def __init__(self): 20 | self.timescale_ = None # type: int 21 | 22 | def set_timescale(self, timescale: int): 23 | self.timescale_ = timescale 24 | 25 | def parseInit(self, data: memoryview): 26 | # log.info('parseInit start') 27 | 28 | def mdhd_callback(box: ParsedBox): 29 | assert box.version == 0 or box.version == 1, 'MDHD version can only be 0 or 1' 30 | parsedMDHDBox = Mp4BoxParsers.parseMDHD(box.reader, box.version) 31 | self.timescale_ = parsedMDHDBox.timescale 32 | 33 | def wvtt_callback(box: ParsedBox): 34 | nonlocal sawWVTT 35 | sawWVTT = True 36 | 37 | sawWVTT = False 38 | # 初始化解析器 39 | mp4parser = Mp4Parser() 40 | # 给要准备解析的box添加对应的解析函数 后面回调 41 | mp4parser = mp4parser.box('moov', Mp4Parser.children) 42 | mp4parser = mp4parser.box('trak', Mp4Parser.children) 43 | mp4parser = mp4parser.box('mdia', Mp4Parser.children) 44 | mp4parser = mp4parser.fullBox('mdhd', mdhd_callback) 45 | mp4parser = mp4parser.box('minf', Mp4Parser.children) 46 | mp4parser = mp4parser.box('stbl', Mp4Parser.children) 47 | mp4parser = mp4parser.fullBox('stsd', Mp4Parser.sampleDescription) 48 | mp4parser = mp4parser.box('wvtt', wvtt_callback) 49 | # 解析数据 50 | mp4parser = mp4parser.parse(data) 51 | 52 | if not self.timescale_: 53 | raise InvalidMp4VTT('Missing timescale for VTT content. It should be located in the MDHD.') 54 | 55 | if not sawWVTT: 56 | raise InvalidMp4VTT('A WVTT box should have been seen (a valid vtt init segment with no actual subtitles') 57 | 58 | def parseMedia(self, data: memoryview, time: TimeContext) -> List[Cue]: 59 | 60 | def tfdt_callback(box: ParsedBox): 61 | nonlocal baseTime 62 | nonlocal sawTFDT 63 | sawTFDT = True 64 | assert box.version == 0 or box.version == 1, 'TFDT version can only be 0 or 1' 65 | parsedTFDTBox = Mp4BoxParsers.parseTFDT(box.reader, box.version) 66 | baseTime = parsedTFDTBox.baseMediaDecodeTime 67 | 68 | def tfhd_callback(box: ParsedBox): 69 | nonlocal defaultDuration 70 | assert box.flags is not None, 'A TFHD box should have a valid flags value' 71 | parsedTFHDBox = Mp4BoxParsers.parseTFHD(box.reader, box.flags) 72 | defaultDuration = parsedTFHDBox.defaultSampleDuration 73 | 74 | def trun_callback(box: ParsedBox): 75 | nonlocal sawTRUN 76 | nonlocal presentations 77 | sawTRUN = True 78 | assert box.version is not None, 'A TRUN box should have a valid version value' 79 | assert box.version is not None, 'A TRUN box should have a valid flags value' 80 | parsedTRUNBox = Mp4BoxParsers.parseTRUN(box.reader, box.version, box.flags) 81 | presentations = parsedTRUNBox.sampleData 82 | 83 | def mdat_callback(data: bytes): 84 | nonlocal sawMDAT 85 | nonlocal rawPayload 86 | #assert not sawMDAT, 'VTT cues in mp4 with multiple MDAT are not currently supported' 87 | sawMDAT = True 88 | rawPayload = data 89 | 90 | if not self.timescale_: 91 | raise InvalidMp4VTT('No init segment for MP4+VTT!') 92 | 93 | baseTime = 0 94 | presentations = [] # type: List[ParsedTRUNSample] 95 | rawPayload = b'' # type: bytes 96 | cues = [] # type: List[Cue] 97 | 98 | sawTFDT = False 99 | sawTRUN = False 100 | sawMDAT = False 101 | defaultDuration = None 102 | 103 | mp4parser = Mp4Parser() 104 | mp4parser = mp4parser.box('moof', Mp4Parser.children) 105 | mp4parser = mp4parser.box('traf', Mp4Parser.children) 106 | mp4parser = mp4parser.fullBox('tfdt', tfdt_callback) 107 | mp4parser = mp4parser.fullBox('tfhd', tfhd_callback) 108 | mp4parser = mp4parser.fullBox('trun', trun_callback) 109 | mp4parser = mp4parser.box('mdat', Mp4Parser.allData(mdat_callback)) 110 | mp4parser = mp4parser.parse(data, partialOkay=False) 111 | 112 | if not sawMDAT and not sawTFDT and not sawTRUN: 113 | raise InvalidMp4VTT(f'A required box is missing. Is saw: MDAT {sawMDAT} TFDT {sawTFDT} TRUN {sawTRUN}') 114 | 115 | currentTime = baseTime 116 | 117 | reader = DataViewReader(rawPayload, Endianness.BIG_ENDIAN) 118 | for presentation in presentations: 119 | duration = presentation.sampleDuration or defaultDuration 120 | if presentation.sampleCompositionTimeOffset: 121 | startTime = baseTime + presentation.sampleCompositionTimeOffset 122 | else: 123 | startTime = currentTime 124 | currentTime = startTime + (duration or 0) 125 | totalSize = 0 126 | while True: 127 | # Read the payload size. 128 | payloadSize = reader.readUint32() 129 | totalSize += payloadSize 130 | # Skip the type. 131 | payloadType = reader.readUint32() 132 | payloadName = Mp4Parser.typeToString(payloadType) 133 | 134 | # Read the data payload. 135 | payload = None 136 | if payloadName == 'vttc': 137 | if payloadSize > 8: 138 | payload = reader.readBytes(payloadSize - 8) 139 | elif payloadName == 'vtte': 140 | # It's a vtte, which is a vtt cue that is empty. Ignore any data that does exist. 141 | reader.skip(payloadSize - 8) 142 | else: 143 | log.error(f'Unknown box {payloadName}! Skipping!') 144 | reader.skip(payloadSize - 8) 145 | 146 | if duration: 147 | if payload: 148 | assert self.timescale_ is not None, 'Timescale should not be null!' 149 | cue = Mp4VttParser.parseVTTC_( 150 | payload, 151 | time.periodStart + startTime / self.timescale_, 152 | time.periodStart + currentTime / self.timescale_ 153 | ) 154 | cues.append(cue) 155 | else: 156 | log.error('WVTT sample duration unknown, and no default found!') 157 | assert not presentation.sampleSize or totalSize <= presentation.sampleSize, 'The samples do not fit evenly into the sample sizes given in the TRUN box!' 158 | 159 | # 检查是不是应该结束循环 160 | if presentation.sampleSize and totalSize < presentation.sampleSize: 161 | continue 162 | else: 163 | break 164 | assert not reader.hasMoreData(), 'MDAT which contain VTT cues and non-VTT data are not currently supported!' 165 | # parseVTTC_ 有可能返回的是 None 这里过滤一下 166 | return [cue for cue in cues if cue] 167 | 168 | @staticmethod 169 | def parseVTTC_(data: bytes, startTime: float, endTime: float): 170 | 171 | def payl_callback(data: bytes): 172 | nonlocal payload 173 | payload = data.decode('utf-8') 174 | 175 | def iden_callback(data: bytes): 176 | nonlocal _id 177 | _id = data.decode('utf-8') 178 | 179 | def sttg_callback(data: bytes): 180 | nonlocal settings 181 | settings = data.decode('utf-8') 182 | 183 | payload = None 184 | _id = None 185 | settings = None 186 | 187 | mp4parser = Mp4Parser() 188 | mp4parser = mp4parser.box('payl', Mp4Parser.allData(payl_callback)) 189 | mp4parser = mp4parser.box('iden', Mp4Parser.allData(iden_callback)) 190 | mp4parser = mp4parser.box('sttg', Mp4Parser.allData(sttg_callback)) 191 | mp4parser = mp4parser.parse(data) 192 | 193 | if payload: 194 | return Mp4VttParser.assembleCue_(payload, _id, settings, startTime, endTime) 195 | else: 196 | return None 197 | 198 | @staticmethod 199 | def assembleCue_(payload: bytes, _id: str, settings: str, startTime: float, endTime: float): 200 | cue = Cue(startTime, endTime, '', _settings=settings) 201 | 202 | styles = {} 203 | VttTextParser.parseCueStyles(payload, cue, styles) 204 | 205 | if _id: 206 | cue.id = _id 207 | 208 | # if settings: 209 | # # TextParser not fully implemented yet 210 | # parser = TextParser(settings) 211 | # word = parser.readWord() 212 | # while word: 213 | # if not VttTextParser.parseCueSetting(cue, word, VTTRegions=[]): 214 | # log.warning(f'VTT parser encountered an invalid VTT setting: {word}, The setting will be ignored.') 215 | 216 | # parser.skipWhitespace() 217 | # word = parser.readWord() 218 | return cue -------------------------------------------------------------------------------- /text/TextEngine.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThatNotEasy/StreamFab/157f7a91f75459e523076ab9d586c798f1a9c989/text/TextEngine.py -------------------------------------------------------------------------------- /text/TtmlTextParser.py: -------------------------------------------------------------------------------- 1 | import re 2 | from xml.dom.minidom import parseString, Element, Node, Document 3 | from enum import Enum 4 | from typing import List, Union 5 | 6 | from text.Cue import Cue, CueRegion, units, direction, writingMode 7 | from text.Cue import textAlign, lineAlign, positionAlign, displayAlign 8 | from text.Cue import fontStyle, textDecoration 9 | from util.TextParser import TimeContext 10 | from util.exceptions import InvalidXML, InvalidTextCue 11 | from log import log 12 | 13 | document = Document() 14 | 15 | 16 | class RateInfo_: 17 | def __init__(self, frameRate: str, subFrameRate: str, frameRateMultiplier: str, tickRate: str): 18 | try: 19 | self.frameRate = float(frameRate) 20 | except Exception: 21 | self.frameRate = 30 22 | try: 23 | self.subFrameRate = float(subFrameRate) 24 | except Exception: 25 | self.subFrameRate = 1 26 | try: 27 | self.tickRate = float(tickRate) 28 | except Exception: 29 | self.tickRate = 0 30 | if self.tickRate == 0: 31 | if frameRate: 32 | self.tickRate = self.frameRate * self.subFrameRate 33 | else: 34 | self.tickRate = 1 35 | if frameRateMultiplier: 36 | multiplierResults = re.findall('^(\d+) (\d+)$', frameRateMultiplier) 37 | if len(multiplierResults) > 0: 38 | numerator = float(multiplierResults[1]) 39 | denominator = float(multiplierResults[2]) 40 | multiplierNum = numerator / denominator 41 | self.frameRate *= multiplierNum 42 | 43 | 44 | class TtmlTextParser: 45 | 46 | def parseInit(self): 47 | assert False, 'TTML does not have init segments' 48 | 49 | def parseMedia(self, data: bytes, time: TimeContext) -> List[Cue]: 50 | ttpNs = parameterNs_ 51 | ttsNs = styleNs_ 52 | text = data.decode('utf-8') 53 | cues = [] # type: List[Cue] 54 | xml = None 55 | 56 | if text == '': 57 | return cues 58 | try: 59 | xml = parseString(text) 60 | except Exception as e: 61 | log.error('xml parseString', exc_info=e) 62 | if xml is None: 63 | return cues 64 | parsererrors = xml.getElementsByTagName('parsererror') # type: List[Element] 65 | if len(parsererrors) > 0 and parsererrors[0]: 66 | raise InvalidXML('ttml parsererror') 67 | tts = xml.getElementsByTagName('tt') # type: List[Element] 68 | if len(tts) == 0: 69 | raise InvalidXML('TTML does not contain tag.') 70 | tt = tts[0] 71 | bodys = tt.getElementsByTagName('body') # type: List[Element] 72 | if len(bodys) == 0: 73 | return [] 74 | frameRate = tt.getAttributeNS(ttpNs, 'frameRate') 75 | subFrameRate = tt.getAttributeNS(ttpNs, 'subFrameRate') 76 | frameRateMultiplier = tt.getAttributeNS(ttpNs, 'frameRateMultiplier') 77 | tickRate = tt.getAttributeNS(ttpNs, 'tickRate') 78 | cellResolution = tt.getAttributeNS(ttpNs, 'cellResolution') 79 | spaceStyle = tt.getAttribute('xml:space') or 'default' 80 | extent = tt.getAttributeNS(ttsNs, 'extent') 81 | 82 | if spaceStyle != 'default' and spaceStyle != 'preserve': 83 | raise InvalidXML(f'Invalid xml:space value: {spaceStyle}') 84 | whitespaceTrim = spaceStyle == 'default' 85 | rateInfo = RateInfo_(frameRate, subFrameRate, frameRateMultiplier, tickRate) 86 | cellResolutionInfo = TtmlTextParser.getCellResolution_(cellResolution) 87 | 88 | metadatas = tt.getElementsByTagName('metadata') # type: List[Element] 89 | metadataElements = [] 90 | if len(metadatas) > 0: 91 | for childNode in metadatas[0].childNodes: 92 | if isinstance(childNode, Element): 93 | metadataElements.append(childNode) 94 | styles = tt.getElementsByTagName('style') # type: List[Element] 95 | regionElements = tt.getElementsByTagName('region') # type: List[Element] 96 | cueRegions = [] 97 | 98 | for region in regionElements: 99 | cueRegion = TtmlTextParser.parseCueRegion_(region, styles, extent) 100 | if cueRegion: 101 | cueRegions.append(cueRegion) 102 | 103 | body = bodys[0] 104 | if len([childNode for childNode in body.childNodes if isinstance(childNode, Element) and childNode.tagName == 'p']) > 0: 105 | raise InvalidTextCue('

can only be inside

in TTML') 106 | for divNode in body.childNodes: 107 | if isinstance(divNode, Element) is False: 108 | continue 109 | if divNode.tagName != 'div': 110 | continue 111 | has_p = False 112 | for pChildren in divNode.childNodes: 113 | if isinstance(pChildren, Element) is False: 114 | continue 115 | if pChildren.tagName == 'span': 116 | raise InvalidTextCue(' can only be inside

in TTML') 117 | if pChildren.tagName == 'p': 118 | has_p = True 119 | cue = TtmlTextParser.parseCue_(pChildren, time.periodStart, rateInfo, metadataElements, styles, regionElements, cueRegions, whitespaceTrim, False, cellResolutionInfo) 120 | if cue: 121 | cues.append(cue) 122 | if not has_p: 123 | cue = TtmlTextParser.parseCue_(divNode, time.periodStart, rateInfo, metadataElements, styles, regionElements, cueRegions, whitespaceTrim, False, cellResolutionInfo) 124 | if cue: 125 | cues.append(cue) 126 | return cues 127 | 128 | @staticmethod 129 | def parseCue_(cueNode: Union[Node, Element], offset, rateInfo, metadataElements, styles, regionElements, cueRegions, whitespaceTrim, isNested, cellResolution): 130 | cueElement = None # type: Element 131 | parentElement = cueNode.parentNode # type: Element 132 | 133 | if cueNode.nodeType == Node.TEXT_NODE: 134 | span = document.createElement('span') # tpye: Text 135 | span.appendChild(cueNode) 136 | cueElement = span 137 | else: 138 | assert cueNode.nodeType == Node.ELEMENT_NODE, 'nodeType should be ELEMENT_NODE!' 139 | cueElement = cueNode 140 | assert cueElement, 'cueElement should be non-None!' 141 | 142 | spaceStyle = cueElement.getAttribute('xml:space') or 'default' if whitespaceTrim else 'preserve' 143 | localWhitespaceTrim = spaceStyle == 'default' 144 | if cueElement.firstChild and cueElement.firstChild.nodeValue: 145 | # hasTextContent = re.match('\S', cueElement.firstChild.nodeValue) 146 | # \S 不匹配换行 但是js的test却会返回true 147 | # 所以python这里会误判 那么strip下达到修复效果 148 | hasTextContent = re.match('\S', cueElement.firstChild.nodeValue.strip()) 149 | else: 150 | hasTextContent = False 151 | hasTimeAttributes = cueElement.hasAttribute('begin') or cueElement.hasAttribute('end') or cueElement.hasAttribute('dur') 152 | if not hasTimeAttributes and not hasTextContent and cueElement.tagName != 'br': 153 | if not isNested: 154 | return None 155 | elif localWhitespaceTrim: 156 | return None 157 | start, end = TtmlTextParser.parseTime_(cueElement, rateInfo) 158 | while parentElement and parentElement.nodeType == Node.ELEMENT_NODE and parentElement.tagName != 'tt': 159 | start, end = TtmlTextParser.resolveTime_(parentElement, rateInfo, start, end) 160 | parentElement = parentElement.parentNode 161 | if start is None: 162 | start = 0 163 | start += offset 164 | if end is None: 165 | end = -1 166 | else: 167 | end += offset 168 | if cueElement.tagName == 'br': 169 | cue = Cue(start, end, '') 170 | cue.lineBreak = True 171 | return cue 172 | payload = '' 173 | nestedCues = [] 174 | flag = True 175 | for childNode in cueElement.childNodes: 176 | if childNode.nodeType != Node.TEXT_NODE: 177 | flag = False 178 | break 179 | if flag: 180 | payload: str = cueElement.firstChild.nodeValue 181 | if localWhitespaceTrim: 182 | payload = payload.strip() 183 | payload = re.sub('\s+', ' ', payload) 184 | else: 185 | for childNode in [_ for _ in cueElement.childNodes]: 186 | nestedCue = TtmlTextParser.parseCue_( 187 | childNode, 188 | offset, 189 | rateInfo, 190 | metadataElements, 191 | styles, 192 | regionElements, 193 | cueRegions, 194 | localWhitespaceTrim, 195 | True, 196 | cellResolution, 197 | ) 198 | if nestedCue: 199 | nestedCues.append(nestedCue) 200 | cue = Cue(start, end, payload) 201 | cue.nestedCues = nestedCues 202 | 203 | if cellResolution: 204 | cue.cellResolution = cellResolution 205 | 206 | regionElements = TtmlTextParser.getElementsFromCollection_(cueElement, 'region', regionElements, '') 207 | regionElement = None 208 | if len(regionElements) > 0 and regionElements[0].getAttribute('xml:id'): 209 | regionElement = regionElements[0] 210 | regionId = regionElement.getAttribute('xml:id') 211 | cue.region = [_ for _ in cueRegions if _.id == regionId][0] 212 | imageElement = None 213 | for nameSpace in smpteNsList_: 214 | imageElements = TtmlTextParser.getElementsFromCollection_(cueElement, 'backgroundImage', metadataElements, '#', nameSpace) 215 | if len(imageElements) > 0: 216 | imageElement = imageElements[0] 217 | break 218 | 219 | isLeaf = len(nestedCues) == 0 220 | 221 | TtmlTextParser.addStyle_( 222 | cue, 223 | cueElement, 224 | regionElement, 225 | imageElement, 226 | styles, 227 | isNested, 228 | isLeaf 229 | ) 230 | 231 | return cue 232 | 233 | @staticmethod 234 | def resolveTime_(parentElement, rateInfo: RateInfo_, start, end): 235 | # 这里有可能存在bug 236 | parentTime = TtmlTextParser.parseTime_(parentElement, rateInfo) 237 | 238 | if start is None: 239 | # No start time of your own? Inherit from the parent. 240 | start = parentTime[0] 241 | else: 242 | # Otherwise, the start time is relative to the parent's start time. 243 | if parentTime[0] is not None: 244 | start += parentTime[0] 245 | 246 | if end is None: 247 | # No end time of your own? Inherit from the parent. 248 | end = parentTime[1] 249 | else: 250 | # Otherwise, the end time is relative to the parent's _start_ time. 251 | # This is not a typo. Both times are relative to the parent's _start_. 252 | if parentTime[0] is not None: 253 | end += parentTime[0] 254 | 255 | return start, end 256 | 257 | @staticmethod 258 | def parseTime_(element: Element, rateInfo: RateInfo_): 259 | start = TtmlTextParser.parseTimeAttribute_(element.getAttribute('begin'), rateInfo) 260 | end = TtmlTextParser.parseTimeAttribute_(element.getAttribute('end'), rateInfo) 261 | duration = TtmlTextParser.parseTimeAttribute_(element.getAttribute('dur'), rateInfo) 262 | if end is None and duration is not None: 263 | end = start + duration 264 | return start, end 265 | 266 | @staticmethod 267 | def parseFramesTime_(rateInfo: RateInfo_, text): 268 | # 50t or 50.5t 269 | results = timeFramesFormat_.findall(text) 270 | frames = float(results[0]) 271 | return frames / rateInfo.frameRate 272 | 273 | @staticmethod 274 | def parseTickTime_(rateInfo: RateInfo_, text): 275 | # 50t or 50.5t 276 | results = timeTickFormat_.findall(text) 277 | ticks = float(results[0]) 278 | return ticks / rateInfo.tickRate 279 | 280 | @staticmethod 281 | def parseTimeFromRegex_(regex: re.Pattern, text: str) -> int: 282 | results = regex.findall(text) 283 | if len(results) == 0: 284 | return None 285 | if results[0][0] == '': 286 | return None 287 | 288 | hours = 0 289 | minutes = 0 290 | seconds = 0 291 | milliseconds = 0 292 | try: 293 | hours = int(results[0][0]) 294 | minutes = int(results[0][1]) 295 | seconds = float(results[0][2]) 296 | milliseconds = float(results[0][3]) 297 | except Exception: 298 | pass 299 | # 对于 timeColonFormatMilliseconds_ 来说 这里是匹配不到 milliseconds 的 300 | # 不过下一步计算的时候 由于seconds是小数 所以又修正了... 301 | 302 | return (milliseconds / 1000) + seconds + (minutes * 60) + (hours * 3600) 303 | 304 | @staticmethod 305 | def parseColonTimeWithFrames_(rateInfo: RateInfo_, text: str) -> int: 306 | # 01:02:43:07 ('07' is frames) or 01:02:43:07.1 (subframes) 307 | results = timeColonFormatFrames_.findall(text) 308 | 309 | hours = int(results[0][0]) 310 | minutes = int(results[0][1]) 311 | seconds = int(results[0][2]) 312 | frames = int(results[0][3]) 313 | subframes = int(results[0][4]) or 0 314 | 315 | frames += subframes / rateInfo.subFrameRate 316 | seconds += frames / rateInfo.frameRate 317 | 318 | return seconds + (minutes * 60) + (hours * 3600) 319 | 320 | @staticmethod 321 | def parseTimeAttribute_(text: str, rateInfo: RateInfo_): 322 | ret = None 323 | if timeColonFormatFrames_.match(text): 324 | ret = TtmlTextParser.parseColonTimeWithFrames_(rateInfo, text) 325 | elif timeColonFormat_.match(text): 326 | ret = TtmlTextParser.parseTimeFromRegex_(timeColonFormat_, text) 327 | elif timeColonFormatMilliseconds_.match(text): 328 | ret = TtmlTextParser.parseTimeFromRegex_(timeColonFormatMilliseconds_, text) 329 | elif timeFramesFormat_.match(text): 330 | ret = TtmlTextParser.parseFramesTime_(rateInfo, text) 331 | elif timeTickFormat_.match(text): 332 | ret = TtmlTextParser.parseTickTime_(rateInfo, text) 333 | elif timeHMSFormat_.match(text): 334 | ret = TtmlTextParser.parseTimeFromRegex_(timeHMSFormat_, text) 335 | elif text: 336 | raise InvalidTextCue('Could not parse cue time range in TTML') 337 | return ret 338 | 339 | @staticmethod 340 | def addStyle_(cue, cueElement, region, imageElement: Element, styles: List[Element], isNested: bool, isLeaf: bool): 341 | shouldInheritRegionStyles = isNested or isLeaf 342 | 343 | _direction = TtmlTextParser.getStyleAttribute_(cueElement, region, styles, 'direction', shouldInheritRegionStyles) 344 | if _direction == 'rtl': 345 | cue.direction = direction.HORIZONTAL_RIGHT_TO_LEFT 346 | 347 | _writingMode = TtmlTextParser.getStyleAttribute_(cueElement, region, styles, 'writingMode', shouldInheritRegionStyles) 348 | if _writingMode == 'tb' or _writingMode == 'tblr': 349 | cue.writingMode = writingMode.VERTICAL_LEFT_TO_RIGHT 350 | elif _writingMode == 'tbrl': 351 | cue.writingMode = writingMode.VERTICAL_RIGHT_TO_LEFT 352 | elif _writingMode == 'rltb' or _writingMode == 'rl': 353 | cue.direction = direction.HORIZONTAL_RIGHT_TO_LEFT 354 | elif _writingMode: 355 | cue.direction = direction.HORIZONTAL_LEFT_TO_RIGHT 356 | 357 | align = TtmlTextParser.getStyleAttribute_(cueElement, region, styles, 'textAlign', shouldInheritRegionStyles) 358 | if align: 359 | cue.positionAlign = textAlignToPositionAlign_[align] 360 | cue.lineAlign = textAlignToLineAlign_[align] 361 | 362 | assert textAlign.__members__.get(align.upper()), f'{align.upper()} Should be in Cue.textAlign values!' 363 | else: 364 | cue.textAlign = textAlign.START 365 | 366 | _displayAlign = TtmlTextParser.getStyleAttribute_(cueElement, region, styles, 'displayAlign', shouldInheritRegionStyles) 367 | if _displayAlign: 368 | assert displayAlign.__members__.get(_displayAlign.upper()), f'{_displayAlign.upper()} Should be in Cue.displayAlign values!' 369 | cue.displayAlign = displayAlign[_displayAlign.upper()] 370 | 371 | color = TtmlTextParser.getStyleAttribute_(cueElement, region, styles, 'color', shouldInheritRegionStyles) 372 | if color: 373 | cue.color = color 374 | 375 | backgroundColor = TtmlTextParser.getStyleAttribute_(cueElement, region, styles, 'backgroundColor', shouldInheritRegionStyles) 376 | if backgroundColor: 377 | cue.backgroundColor = backgroundColor 378 | 379 | border = TtmlTextParser.getStyleAttribute_(cueElement, region, styles, 'border', shouldInheritRegionStyles) 380 | if border: 381 | cue.border = border 382 | 383 | fontFamily = TtmlTextParser.getStyleAttribute_(cueElement, region, styles, 'fontFamily', shouldInheritRegionStyles) 384 | if fontFamily: 385 | cue.fontFamily = fontFamily 386 | 387 | fontWeight = TtmlTextParser.getStyleAttribute_(cueElement, region, styles, 'fontWeight', shouldInheritRegionStyles) 388 | if fontWeight and fontWeight == 'bold': 389 | cue.fontWeight = fontWeight.BOLD 390 | 391 | wrapOption = TtmlTextParser.getStyleAttribute_(cueElement, region, styles, 'wrapOption', shouldInheritRegionStyles) 392 | if wrapOption and wrapOption == 'noWrap': 393 | cue.wrapLine = False 394 | else: 395 | cue.wrapLine = True 396 | 397 | lineHeight = TtmlTextParser.getStyleAttribute_(cueElement, region, styles, 'lineHeight', shouldInheritRegionStyles) 398 | if lineHeight and unitValues_.match(lineHeight): 399 | cue.lineHeight = lineHeight 400 | 401 | fontSize = TtmlTextParser.getStyleAttribute_(cueElement, region, styles, 'fontSize', shouldInheritRegionStyles) 402 | 403 | if fontSize: 404 | isValidFontSizeUnit = unitValues_.match(fontSize) or percentValue_.match(fontSize) 405 | if isValidFontSizeUnit: 406 | cue.fontSize = fontSize 407 | 408 | _fontStyle = TtmlTextParser.getStyleAttribute_(cueElement, region, styles, 'fontStyle', shouldInheritRegionStyles) 409 | if _fontStyle: 410 | assert fontStyle.__members__.get(_fontStyle.upper()), f'{_fontStyle.upper()} Should be in Cue.fontStyle values!' 411 | cue.fontStyle = fontStyle[_fontStyle.upper()] 412 | 413 | if imageElement: 414 | backgroundImageType = imageElement.getAttribute('imageType') or imageElement.getAttribute('imagetype') 415 | backgroundImageEncoding = imageElement.getAttribute('encoding') 416 | backgroundImageData = imageElement.textContent.trim() 417 | if backgroundImageType == 'PNG' and backgroundImageEncoding == 'Base64' and backgroundImageData: 418 | cue.backgroundImage = 'data:image/pngbase64,' + backgroundImageData 419 | 420 | letterSpacing = TtmlTextParser.getStyleAttribute_(cueElement, region, styles, 'letterSpacing', shouldInheritRegionStyles) 421 | if letterSpacing and unitValues_.match(letterSpacing): 422 | cue.letterSpacing = letterSpacing 423 | 424 | linePadding = TtmlTextParser.getStyleAttribute_(cueElement, region, styles, 'linePadding', shouldInheritRegionStyles) 425 | if linePadding and unitValues_.match(linePadding): 426 | cue.linePadding = linePadding 427 | 428 | opacity = TtmlTextParser.getStyleAttribute_(cueElement, region, styles, 'opacity', shouldInheritRegionStyles) 429 | if opacity: 430 | cue.opacity = float(opacity) 431 | 432 | textDecorationRegion = TtmlTextParser.getStyleAttributeFromRegion_(region, styles, 'textDecoration') 433 | if textDecorationRegion: 434 | TtmlTextParser.addTextDecoration_(cue, textDecorationRegion) 435 | 436 | textDecorationElement = TtmlTextParser.getStyleAttributeFromElement_(cueElement, styles, 'textDecoration') 437 | if textDecorationElement: 438 | TtmlTextParser.addTextDecoration_(cue, textDecorationElement) 439 | 440 | @staticmethod 441 | def addTextDecoration_(cue: Cue, decoration): 442 | # 这里可能有问题 .value 443 | for value in decoration.split(' '): 444 | if value == 'underline': 445 | if textDecoration.UNDERLINE not in cue.textDecoration: 446 | cue.textDecoration.append(textDecoration.UNDERLINE) 447 | elif value == 'noUnderline': 448 | cue.textDecoration = [_ for _ in cue.textDecoration if textDecoration.UNDERLINE != _] 449 | elif value == 'lineThrough': 450 | if textDecoration.LINE_THROUGH not in cue.textDecoration: 451 | cue.textDecoration.append(textDecoration.LINE_THROUGH) 452 | elif value == 'noLineThrough': 453 | cue.textDecoration = [_ for _ in cue.textDecoration if textDecoration.LINE_THROUGH != _] 454 | elif value == 'overline': 455 | if textDecoration.OVERLINE not in cue.textDecoration: 456 | cue.textDecoration.append(textDecoration.OVERLINE) 457 | elif value == 'noOverline': 458 | cue.textDecoration = [_ for _ in cue.textDecoration if textDecoration.OVERLINE != _] 459 | 460 | @staticmethod 461 | def getStyleAttribute_(cueElement, region, styles, attribute, shouldInheritRegionStyles=True): 462 | attr = TtmlTextParser.getStyleAttributeFromElement_(cueElement, styles, attribute) 463 | if attr: 464 | return attr 465 | if shouldInheritRegionStyles: 466 | return TtmlTextParser.getStyleAttributeFromRegion_(region, styles, attribute) 467 | return None 468 | 469 | @staticmethod 470 | def parseCueRegion_(regionElement: Element, styles: List[Element], globalExtent: str): 471 | region = CueRegion() 472 | _id = regionElement.getAttribute('xml:id') 473 | if not _id: 474 | log.warning('TtmlTextParser parser encountered a region with no id. Region will be ignored.') 475 | return None 476 | region.id = _id 477 | globalResults = None 478 | if globalExtent: 479 | globalResults = percentValues_.findall(globalExtent) or pixelValues_.findall(globalExtent) 480 | if globalResults is not None and len(globalResults) == 2: 481 | globalWidth = int(globalResults[0][0]) 482 | globalHeight = int(globalResults[0][1]) 483 | else: 484 | globalWidth = None 485 | globalHeight = None 486 | results = None 487 | percentage = None 488 | 489 | extent = TtmlTextParser.getStyleAttributeFromRegion_(regionElement, styles, 'extent') 490 | if extent: 491 | percentage = percentValues_.findall(extent) 492 | results = percentage or pixelValues_.findall(extent) 493 | if results is not None: 494 | region.width = int(results[0][0]) 495 | region.height = int(results[0][1]) 496 | 497 | if not percentage: 498 | if globalWidth is not None: 499 | region.width = region.width * 100 / globalWidth 500 | if globalHeight is not None: 501 | region.height = region.height * 100 / globalHeight 502 | if percentage or globalWidth is not None: 503 | region.widthUnits = units.PERCENTAGE 504 | else: 505 | region.widthUnits = units.PX 506 | if percentage or globalHeight is not None: 507 | region.heightUnits = units.PERCENTAGE 508 | else: 509 | region.heightUnits = units.PX 510 | origin = TtmlTextParser.getStyleAttributeFromRegion_(regionElement, styles, 'origin') 511 | if origin: 512 | percentage = percentValues_.findall(origin) 513 | results = percentage or pixelValues_.findall(origin) 514 | if len(results) > 0: 515 | region.viewportAnchorX = int(results[0][0]) 516 | region.viewportAnchorY = int(results[0][1]) 517 | if len(percentage) == 0: 518 | if globalHeight is not None: 519 | region.viewportAnchorY = region.viewportAnchorY * 100 / globalHeight 520 | if globalWidth is not None: 521 | region.viewportAnchorX = region.viewportAnchorX * 100 / globalHeight 522 | if percentage or globalWidth is not None: 523 | region.viewportAnchorUnits = units.PERCENTAGE 524 | else: 525 | region.viewportAnchorUnits = units.PX 526 | return region 527 | 528 | @staticmethod 529 | def getInheritedStyleAttribute_(element: Element, styles, attribute): 530 | ttsNs = styleNs_ 531 | ebuttsNs = styleEbuttsNs_ 532 | 533 | inheritedStyles = TtmlTextParser.getElementsFromCollection_(element, 'style', styles, '') # tpye: List[Element] 534 | 535 | styleValue = None 536 | # The last value in our styles stack takes the precedence over the others 537 | for inheritedStyle in inheritedStyles: 538 | # Check ebu namespace first. 539 | styleAttributeValue = inheritedStyle.getAttributeNS(ebuttsNs, attribute) 540 | 541 | if not styleAttributeValue: 542 | # Fall back to tts namespace. 543 | styleAttributeValue = inheritedStyle.getAttributeNS(ttsNs, attribute) 544 | 545 | if not styleAttributeValue: 546 | # Next, check inheritance. 547 | # Styles can inherit from other styles, so traverse up that chain. 548 | styleAttributeValue = TtmlTextParser.getStyleAttributeFromElement_(inheritedStyle, styles, attribute) 549 | 550 | if styleAttributeValue: 551 | styleValue = styleAttributeValue 552 | 553 | return styleValue 554 | 555 | @staticmethod 556 | def getStyleAttributeFromElement_(cueElement: Element, styles, attribute: str): 557 | ttsNs = styleNs_ 558 | elementAttribute = cueElement.getAttributeNS(ttsNs, attribute) 559 | if elementAttribute: 560 | return elementAttribute 561 | return TtmlTextParser.getInheritedStyleAttribute_(cueElement, styles, attribute) 562 | 563 | @staticmethod 564 | def getInheritedAttribute_(element: Element, attributeName: str, nsName: str): 565 | ret = None 566 | while element: 567 | if nsName: 568 | ret = element.getAttributeNS(nsName, attributeName) 569 | else: 570 | ret = element.getAttribute(attributeName) 571 | if ret: 572 | break 573 | parentNode = element.parentNode 574 | if isinstance(parentNode, Element): 575 | element = parentNode 576 | else: 577 | break 578 | return ret 579 | 580 | @staticmethod 581 | def getElementsFromCollection_(element: Element, attributeName: str, collection: list, prefixName: str, nsName: str = None): 582 | items = [] 583 | if not element or len(collection) < 1: 584 | return items 585 | attributeValue = TtmlTextParser.getInheritedAttribute_(element, attributeName, nsName) 586 | if not attributeValue: 587 | return items 588 | itemNames = attributeValue.split(' ') 589 | for name in itemNames: 590 | for item in collection: 591 | if prefixName + item.getAttribute('xml:id') == name: 592 | items.append(item) 593 | break 594 | return items 595 | 596 | @staticmethod 597 | def getStyleAttributeFromRegion_(region: Element, styles, attribute): 598 | ttsNs = styleNs_ 599 | if not region: 600 | return None 601 | attr = region.getAttributeNS(ttsNs, attribute) 602 | if attr: 603 | return attr 604 | return TtmlTextParser.getInheritedStyleAttribute_(region, styles, attribute) 605 | 606 | @staticmethod 607 | def getCellResolution_(cellResolution: str): 608 | if cellResolution is None or cellResolution == '': 609 | return None 610 | matches = re.findall('^(\d+) (\d+)$', cellResolution) 611 | if len(matches) == 0: 612 | return None 613 | columns = int(matches[0][0]) 614 | rows = int(matches[0][1]) 615 | return {'columns': columns, 'rows': rows} 616 | 617 | 618 | # 50.17% 10% 619 | percentValues_ = re.compile('^(\d{1,2}(?:\.\d+)?|100(?:\.0+)?)% (\d{1,2}(?:\.\d+)?|100(?:\.0+)?)%$') 620 | 621 | # 0.6% 90% 622 | percentValue_ = re.compile('^(\d{1,2}(?:\.\d+)?|100)%$') 623 | 624 | # 100px, 8em, 0.80c 625 | unitValues_ = re.compile('^(\d+px|\d+em|\d*\.?\d+c)$') 626 | 627 | # 100px 628 | pixelValues_ = re.compile('^(\d+)px (\d+)px$') 629 | 630 | # 00:00:40:07 (7 frames) or 00:00:40:07.1 (7 frames, 1 subframe) 631 | timeColonFormatFrames_ = re.compile('^(\d{2,}):(\d{2}):(\d{2}):(\d{2})\.?(\d+)?$') 632 | 633 | # 00:00:40 or 00:40 634 | timeColonFormat_ = re.compile('^(?:(\d{2,}):)?(\d{2}):(\d{2})$') 635 | 636 | # 01:02:43.0345555 or 02:43.03 637 | timeColonFormatMilliseconds_ = re.compile('^(?:(\d{2,}):)?(\d{2}):(\d{2}\.\d{2,})$') 638 | 639 | # 75f or 75.5f 640 | timeFramesFormat_ = re.compile('^(\d*(?:\.\d*)?)f$') 641 | 642 | # 50t or 50.5t 643 | timeTickFormat_ = re.compile('^(\d*(?:\.\d*)?)t$') 644 | 645 | # 3.45h, 3m or 4.20s 646 | timeHMSFormat_ = re.compile('^(?:(\d*(?:\.\d*)?)h)?(?:(\d*(?:\.\d*)?)m)?(?:(\d*(?:\.\d*)?)s)?(?:(\d*(?:\.\d*)?)ms)?$') 647 | 648 | 649 | class textAlignToLineAlign_(Enum): 650 | left = lineAlign.START 651 | center = lineAlign.CENTER 652 | right = lineAlign.END 653 | start = lineAlign.START 654 | end = lineAlign.END 655 | 656 | 657 | class textAlignToPositionAlign_(Enum): 658 | left = positionAlign.LEFT 659 | center = positionAlign.CENTER 660 | right = positionAlign.RIGHT 661 | 662 | 663 | parameterNs_ = 'http://www.w3.org/ns/ttml#parameter' 664 | styleNs_ = 'http://www.w3.org/ns/ttml#styling' 665 | styleEbuttsNs_ = 'urn:ebu:tt:style' 666 | smpteNsList_ = [ 667 | 'http://www.smpte-ra.org/schemas/2052-1/2010/smpte-tt', 668 | 'http://www.smpte-ra.org/schemas/2052-1/2013/smpte-tt', 669 | ] 670 | -------------------------------------------------------------------------------- /text/VttTextParser.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Dict, List, Union 3 | from xml.dom.minidom import parseString, Node, Element, Text 4 | from xml.sax.saxutils import escape 5 | from text.Cue import Cue, defaultTextColor, fontStyle, fontWeight, textDecoration 6 | from log import log 7 | 8 | 9 | class VttTextParser: 10 | 11 | def __init__(self): 12 | pass 13 | 14 | def parseInit(self, data: bytes): 15 | assert False, 'VTT does not have init segments' 16 | 17 | def parseMedia(self, data: bytes, time: int): 18 | pass 19 | 20 | @staticmethod 21 | def parseCueStyles(payload: str, rootCue: Cue, styles: Dict[str, Cue]): 22 | if len(styles) == 0: 23 | VttTextParser.addDefaultTextColor_(styles) 24 | payload = VttTextParser.replaceColorPayload_(payload) 25 | xmlPayload = '' + escape(payload) + '' 26 | elements = parseString(xmlPayload).getElementsByTagName('span') # type: List[Element] 27 | if len(elements) > 0 and elements[0]: 28 | element = elements[0] 29 | cues = [] # type: List[Cue] 30 | childNodes = element.childNodes # type: List[Element] 31 | if len(childNodes) == 1: 32 | childNode = childNodes[0] 33 | if childNode.nodeType == Node.TEXT_NODE or childNode.nodeType == Node.CDATA_SECTION_NODE: 34 | rootCue.payload = payload 35 | return 36 | for childNode in childNodes: 37 | if childNode.nodeValue and childNode.nodeValue.startswith('i>'): 38 | continue 39 | VttTextParser.generateCueFromElement_(childNode, rootCue, cues, styles) 40 | rootCue.nestedCues = cues 41 | else: 42 | log.warning(f'The cue\'s markup could not be parsed: {payload}') 43 | rootCue.payload = payload 44 | 45 | @staticmethod 46 | def generateCueFromElement_(element: Union[Element, Text], rootCue: Cue, cues: List[Cue], styles: Dict[str, Cue]): 47 | nestedCue = rootCue.clone() 48 | if element.nodeType == Node.ELEMENT_NODE and element.nodeName: 49 | bold = fontWeight.BOLD 50 | italic = fontStyle.ITALIC 51 | underline = textDecoration.UNDERLINE 52 | tags = re.split('[ .]+', element.nodeName) 53 | for tag in tags: 54 | if styles.get(tag): 55 | VttTextParser.mergeStyle_(nestedCue, styles.get(tag)) 56 | if tag == 'b': 57 | nestedCue.fontWeight = bold 58 | elif tag == 'i': 59 | nestedCue.fontStyle = italic 60 | elif tag == 'u': 61 | nestedCue.textDecoration.append(underline) 62 | isTextNode = element.nodeType == Node.TEXT_NODE or element.nodeType == Node.CDATA_SECTION_NODE 63 | if isTextNode: 64 | # element 这里是 Text 类型 js的textContent对应这里的data 65 | textArr = element.data.split('\n') 66 | isFirst = True 67 | for text in textArr: 68 | if not isFirst: 69 | lineBreakCue = rootCue.clone() 70 | lineBreakCue.lineBreak = True 71 | cues.append(lineBreakCue) 72 | if len(text) > 0: 73 | textCue = nestedCue.clone() 74 | textCue.payload = text 75 | cues.append(textCue) 76 | isFirst = False 77 | else: 78 | for childNode in element.childNodes: 79 | VttTextParser.generateCueFromElement_(childNode, nestedCue, cues, styles) 80 | 81 | @staticmethod 82 | def replaceColorPayload_(payload: str): 83 | ''' 84 | 这里没有找到相关样本测试 可能有bug 85 | ''' 86 | names = [] 87 | nameStart = -1 88 | newPayload = '' 89 | 90 | newPayload = payload 91 | # for i in range(len(payload)): 92 | # if payload[i] == '/': 93 | # end = payload.index('>', i) 94 | # if end <= i: 95 | # return payload 96 | # tagEnd = payload[i + 1:end] 97 | # tagStart = names.pop(-1) 98 | # if not tagEnd or not tagStart: 99 | # return payload 100 | # elif tagStart == tagEnd: 101 | # newPayload += '/' + tagEnd + '>' 102 | # i += len(tagEnd) + 1 103 | # else: 104 | # if not tagStart.startswith('c.') or tagEnd != 'c': 105 | # return payload 106 | # newPayload += '/' + tagStart + '>' 107 | # i += len(tagEnd) + 1 108 | # else: 109 | # if payload[i] == '<': 110 | # nameStart = i + 1 111 | # elif payload[i] == '>': 112 | # if nameStart > 0: 113 | # names.append(payload[nameStart:i]) 114 | # nameStart = -1 115 | # newPayload += payload[i] 116 | return newPayload 117 | 118 | @staticmethod 119 | def addDefaultTextColor_(styles: Dict[str, Cue]): 120 | for key, value in defaultTextColor.__members__.items(): 121 | cue = Cue(0, 0, '') 122 | cue.color = value 123 | styles[key] = cue -------------------------------------------------------------------------------- /util/DataViewReader.py: -------------------------------------------------------------------------------- 1 | import struct 2 | from enum import Enum 3 | 4 | from util.exceptions import OutOfBoundsError 5 | from util.exceptions import IntOverflowError 6 | 7 | 8 | class Endianness(Enum): 9 | BIG_ENDIAN = 0 10 | LITTLE_ENDIAN = 1 11 | 12 | 13 | class DataView: 14 | ''' 15 | shaka/util/buffer_utils.js 16 | ''' 17 | def __init__(self, data: bytes): 18 | self.buffer = memoryview(bytearray(data)) 19 | # self.buffer = memoryview(bytearray([0x96, 0x87, 0xac])) 20 | self.byteLength = len(self.buffer) # type: int 21 | 22 | def getUint8(self): 23 | pass 24 | 25 | def getUint16(self): 26 | pass 27 | 28 | def getUint32(self, position: int, littleEndian: bool = False): 29 | # 这里记得切片长度要补齐4位 不然unpack会报错 30 | buf = self.buffer[position:position + 4].tobytes() 31 | if len(buf) < 4: 32 | buf = b'\x00' * (4 - len(buf)) + buf 33 | if littleEndian: 34 | return struct.unpack("I", buf)[0] 37 | 38 | def getUint64(self, position: int, littleEndian: bool = False): 39 | # 这里记得切片长度要补齐4位 不然 40 | buf = self.buffer[position:position + 4].tobytes() 41 | if len(buf) < 4: 42 | buf = b'\x00' * (4 - len(buf)) + buf 43 | if littleEndian: 44 | return struct.unpack("I", buf)[0] 47 | 48 | def getInt8(self): 49 | pass 50 | 51 | def getInt16(self): 52 | pass 53 | 54 | def getInt32(self, position: int, littleEndian: bool = False): 55 | buf = self.buffer[position:position + 4].tobytes() 56 | if len(buf) < 4: 57 | buf = b'\x00' * (4 - len(buf)) + buf 58 | if littleEndian: 59 | return struct.unpack("i", buf)[0] 62 | 63 | def getInt64(self): 64 | pass 65 | 66 | def readUint8(self): 67 | pass 68 | 69 | def readUint16(self): 70 | pass 71 | 72 | def readUint32(self): 73 | pass 74 | 75 | def readInt8(self): 76 | pass 77 | 78 | def readInt16(self): 79 | pass 80 | 81 | def readInt32(self): 82 | pass 83 | 84 | def readInt64(self): 85 | pass 86 | 87 | @staticmethod 88 | def toUint8(data: 'DataView', offset: int = 0, length: int = None): 89 | # 由于python中float('inf')表示无穷大 但不能作为索引 90 | # 所以这里直接将最大长度视为byteLength 91 | if length is None: 92 | length = data.byteLength 93 | return data.buffer[offset:offset + length].tobytes() 94 | 95 | 96 | class DataViewReader(DataView): 97 | ''' 98 | shaka/util/data_view_reader.js 99 | ''' 100 | 101 | def __init__(self, data: bytes, endianness: Endianness): 102 | self.dataView_ = DataView(data) # type: DataView 103 | self.littleEndian_ = endianness == Endianness.LITTLE_ENDIAN # type: bool 104 | self.position_ = 0 # type: int 105 | 106 | def getDataView(self) -> DataView: 107 | return self.dataView_ 108 | 109 | def hasMoreData(self) -> bool: 110 | return self.position_ < self.dataView_.byteLength 111 | 112 | def getPosition(self) -> int: 113 | return self.position_ 114 | 115 | def getLength(self) -> int: 116 | return self.dataView_.byteLength 117 | 118 | def readUint8(self): 119 | pass 120 | 121 | def readUint16(self): 122 | pass 123 | 124 | def readUint32(self) -> int: 125 | value = self.dataView_.getUint32(self.position_, self.littleEndian_) 126 | self.position_ += 4 127 | return value 128 | 129 | def readInt32(self): 130 | value = self.dataView_.getInt32(self.position_, self.littleEndian_) 131 | self.position_ += 4 132 | return value 133 | 134 | def readUint64(self) -> int: 135 | if self.littleEndian_: 136 | low = self.dataView_.getUint32(self.position_, True) 137 | high = self.dataView_.getUint32(self.position_ + 4, True) 138 | else: 139 | high = self.dataView_.getUint32(self.position_, False) 140 | low = self.dataView_.getUint32(self.position_ + 4, False) 141 | 142 | if high > 0x1FFFFF: 143 | raise IntOverflowError 144 | 145 | self.position_ += 8 146 | return (high * (2 ** 32)) + low 147 | 148 | def readBytes(self, length: int): 149 | assert length >= 0, 'Bad call to DataViewReader.readBytes' 150 | if self.position_ + length > self.dataView_.byteLength: 151 | raise OutOfBoundsError 152 | data = DataView.toUint8(self.dataView_, self.position_, length) 153 | self.position_ += length 154 | return data 155 | 156 | def skip(self, length: int): 157 | assert length >= 0, 'Bad call to DataViewReader.skip' 158 | if self.position_ + length > self.dataView_.byteLength: 159 | raise OutOfBoundsError 160 | self.position_ += length 161 | 162 | def rewind(self, length: int): 163 | pass 164 | 165 | def seek(self, position: int): 166 | pass 167 | 168 | def readTerminatedString(self): 169 | pass 170 | 171 | def outOfBounds_(self): 172 | pass -------------------------------------------------------------------------------- /util/Functional.py: -------------------------------------------------------------------------------- 1 | class Functional: 2 | 3 | @staticmethod 4 | def isNotNull(value) -> bool: 5 | return value is not None -------------------------------------------------------------------------------- /util/Mp4BoxParsers.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from util.DataViewReader import DataViewReader 3 | 4 | 5 | class ParsedTFHDBox: 6 | 7 | def __init__(self, **kwargs): 8 | self.trackId = kwargs['trackId'] # type: int 9 | self.defaultSampleDuration = kwargs['defaultSampleDuration'] # type: int 10 | self.defaultSampleSize = kwargs['defaultSampleSize'] # type: int 11 | 12 | 13 | class ParsedTFDTBox: 14 | 15 | def __init__(self, **kwargs): 16 | self.baseMediaDecodeTime = kwargs['baseMediaDecodeTime'] # type: int 17 | 18 | 19 | class ParsedMDHDBox: 20 | 21 | def __init__(self, **kwargs): 22 | self.timescale = kwargs['timescale'] # type: int 23 | 24 | 25 | class ParsedTREXBox: 26 | 27 | def __init__(self, **kwargs): 28 | self.defaultSampleDuration = kwargs['defaultSampleDuration'] # type: int 29 | self.defaultSampleSize = kwargs['defaultSampleSize'] # type: int 30 | 31 | 32 | class ParsedTRUNBox: 33 | 34 | def __init__(self, **kwargs): 35 | self.sampleCount = kwargs['sampleCount'] # type: int 36 | self.sampleData = kwargs['sampleData'] # type: List[ParsedTRUNSample] 37 | 38 | 39 | class ParsedTRUNSample: 40 | 41 | def __init__(self, **kwargs): 42 | self.sampleDuration = kwargs['sampleDuration'] # type: int 43 | self.sampleSize = kwargs['sampleSize'] # type: int 44 | self.sampleCompositionTimeOffset = kwargs['sampleCompositionTimeOffset'] # type: int 45 | 46 | 47 | class ParsedTKHDBox: 48 | 49 | def __init__(self, **kwargs): 50 | self.trackId = kwargs['trackId'] # type: int 51 | 52 | 53 | class Mp4BoxParsers: 54 | 55 | @staticmethod 56 | def parseTFHD(reader: DataViewReader, flags: int) -> ParsedTFHDBox: 57 | defaultSampleDuration = None 58 | defaultSampleSize = None 59 | 60 | # Read "track_ID" 61 | trackId = reader.readUint32() 62 | 63 | # Skip "base_data_offset" if present. 64 | if flags & 0x000001: 65 | reader.skip(8) 66 | 67 | # Skip "sample_description_index" if present. 68 | if flags & 0x000002: 69 | reader.skip(4) 70 | 71 | # Read "default_sample_duration" if present. 72 | if flags & 0x000008: 73 | defaultSampleDuration = reader.readUint32() 74 | 75 | # Read "default_sample_size" if present. 76 | if flags & 0x000010: 77 | defaultSampleSize = reader.readUint32() 78 | 79 | return ParsedTFHDBox(**{ 80 | 'trackId': trackId, 81 | 'defaultSampleDuration': defaultSampleDuration, 82 | 'defaultSampleSize': defaultSampleSize, 83 | }) 84 | 85 | @staticmethod 86 | def parseTFDT(reader: DataViewReader, version: int) -> ParsedTFDTBox: 87 | if version == 1: 88 | baseMediaDecodeTime = reader.readUint64() 89 | else: 90 | baseMediaDecodeTime = reader.readUint32() 91 | return ParsedTFDTBox(**{'baseMediaDecodeTime': baseMediaDecodeTime}) 92 | 93 | @staticmethod 94 | def parseMDHD(reader: DataViewReader, version: int) -> ParsedMDHDBox: 95 | if version == 1: 96 | # Skip "creation_time" 97 | reader.skip(8) 98 | # Skip "modification_time" 99 | reader.skip(8) 100 | else: 101 | # Skip "creation_time" 102 | reader.skip(4) 103 | # Skip "modification_time" 104 | reader.skip(4) 105 | timescale = reader.readUint32() 106 | return ParsedMDHDBox(**{'timescale': timescale}) 107 | 108 | @staticmethod 109 | def parseTREX(reader: DataViewReader) -> ParsedTREXBox: 110 | pass 111 | 112 | @staticmethod 113 | def parseTRUN(reader: DataViewReader, version: int, flags: int) -> ParsedTRUNBox: 114 | sampleCount = reader.readUint32() 115 | sampleData = [] 116 | 117 | # Skip "data_offset" if present. 118 | if flags & 0x000001: 119 | reader.skip(4) 120 | 121 | # Skip "first_sample_flags" if present. 122 | if flags & 0x000004: 123 | reader.skip(4) 124 | 125 | for _ in range(sampleCount): 126 | sample = ParsedTRUNSample(**{ 127 | 'sampleDuration': None, 128 | 'sampleSize': None, 129 | 'sampleCompositionTimeOffset': None, 130 | }) 131 | 132 | # Read "sample duration" if present. 133 | if flags & 0x000100: 134 | sample.sampleDuration = reader.readUint32() 135 | 136 | # Read "sample_size" if present. 137 | if flags & 0x000200: 138 | sample.sampleSize = reader.readUint32() 139 | 140 | # Skip "sample_flags" if present. 141 | if flags & 0x000400: 142 | reader.skip(4) 143 | 144 | # Read "sample_time_offset" if present. 145 | if flags & 0x000800: 146 | if version == 0: 147 | sample.sampleCompositionTimeOffset = reader.readUint32() 148 | else: 149 | sample.sampleCompositionTimeOffset = reader.readInt32() 150 | sampleData.append(sample) 151 | 152 | return ParsedTRUNBox(**{'sampleCount': sampleCount, 'sampleData': sampleData}) 153 | 154 | @staticmethod 155 | def parseTKHD(reader: DataViewReader, version: int) -> ParsedTKHDBox: 156 | pass -------------------------------------------------------------------------------- /util/Mp4Parser.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Callable 2 | from enum import Enum 3 | 4 | # from log import log 5 | from util.DataViewReader import DataViewReader, Endianness 6 | 7 | 8 | class ParsedBox: 9 | ''' 10 | js原本是在shaka.extern.ParsedBox中 11 | 但是python中做分离会引起循环导入的问题 12 | 加上ParsedBox定义是在externs/shaka/mp4_parser.js 13 | 于是这里就把ParsedBox放到这里了 14 | ''' 15 | def __init__(self, **kwargs): 16 | self.parser = kwargs['parser'] # type: Mp4Parser 17 | self.partialOkay = kwargs['partialOkay'] # type: bool 18 | self.start = kwargs['start'] # type: int 19 | self.size = kwargs['size'] # type: int 20 | self.version = kwargs['version'] # type: int 21 | self.flags = kwargs['flags'] # type: int 22 | self.reader = kwargs['reader'] # type: DataViewReader 23 | self.has64BitSize = kwargs['has64BitSize'] # type: bool 24 | 25 | 26 | class Mp4Parser: 27 | 28 | class BoxType_(Enum): 29 | BASIC_BOX = 0 30 | FULL_BOX = 1 31 | 32 | def __init__(self): 33 | self.headers_ = {} # type: Dict[int, Mp4Parser.BoxType_] 34 | self.boxDefinitions_ = {} # type: Dict[int, Callable] 35 | self.done_ = False # type: bool 36 | 37 | def box(self, _type: str, definition: Callable) -> 'Mp4Parser': 38 | typeCode = Mp4Parser.typeFromString_(_type) 39 | self.headers_[typeCode] = Mp4Parser.BoxType_.BASIC_BOX 40 | self.boxDefinitions_[typeCode] = definition 41 | return self 42 | 43 | def fullBox(self, _type: str, definition: Callable) -> 'Mp4Parser': 44 | typeCode = Mp4Parser.typeFromString_(_type) 45 | self.headers_[typeCode] = Mp4Parser.BoxType_.FULL_BOX 46 | self.boxDefinitions_[typeCode] = definition 47 | return self 48 | 49 | def stop(self): 50 | self.done_ = True 51 | 52 | def parse(self, data, partialOkay: bool = False, stopOnPartial: bool = False): 53 | reader = DataViewReader(data, Endianness.BIG_ENDIAN) 54 | self.done_ = False 55 | while reader.hasMoreData() and not self.done_: 56 | self.parseNext(0, reader, partialOkay, stopOnPartial) 57 | 58 | def parseNext(self, absStart: int, reader: DataViewReader, partialOkay: bool, stopOnPartial: bool = False): 59 | start = reader.getPosition() 60 | 61 | # size(4 bytes) + type(4 bytes) = 8 bytes 62 | if stopOnPartial and start + 8 > reader.getLength(): 63 | self.done_ = True 64 | return 65 | 66 | size = reader.readUint32() 67 | _type = reader.readUint32() 68 | name = Mp4Parser.typeToString(_type) 69 | has64BitSize = False 70 | # log.info(f'[{name}] Parsing MP4 box') 71 | 72 | if size == 0: 73 | size = reader.getLength() - start 74 | elif size == 1: 75 | if stopOnPartial and reader.getPosition() + 8 > reader.getLength(): 76 | self.done_ = True 77 | return 78 | size = reader.readUint64() 79 | has64BitSize = True 80 | # 和js不一样 py中不存在key会直接异常 所以这里用get方法 81 | boxDefinition = self.boxDefinitions_.get(_type) 82 | 83 | if boxDefinition: 84 | version = None 85 | flags = None 86 | 87 | if self.headers_[_type] == Mp4Parser.BoxType_.FULL_BOX: 88 | if stopOnPartial and reader.getPosition() + 4 > reader.getLength(): 89 | self.done_ = True 90 | return 91 | versionAndFlags = reader.readUint32() 92 | version = versionAndFlags >> 24 93 | flags = versionAndFlags & 0xFFFFFF 94 | 95 | end = start + size 96 | if partialOkay and end > reader.getLength(): 97 | end = reader.getLength() 98 | 99 | if stopOnPartial and end > reader.getLength(): 100 | self.done_ = True 101 | return 102 | payloadSize = end - reader.getPosition() 103 | payload = reader.readBytes(payloadSize) if payloadSize > 0 else b'' 104 | 105 | payloadReader = DataViewReader(payload, Endianness.BIG_ENDIAN) 106 | 107 | box = { 108 | 'parser': self, 109 | 'partialOkay': partialOkay or False, 110 | 'version': version, 111 | 'flags': flags, 112 | 'reader': payloadReader, 113 | 'size': size, 114 | 'start': start + absStart, 115 | 'has64BitSize': has64BitSize, 116 | } 117 | box = ParsedBox(**box) 118 | 119 | boxDefinition(box) 120 | else: 121 | skipLength = min(start + size - reader.getPosition(), reader.getLength() - reader.getPosition()) 122 | reader.skip(skipLength) 123 | 124 | @staticmethod 125 | def children(box: ParsedBox): 126 | headerSize = Mp4Parser.headerSize(box) 127 | while box.reader.hasMoreData() and not box.parser.done_: 128 | box.parser.parseNext(box.start + headerSize, box.reader, box.partialOkay) 129 | 130 | @staticmethod 131 | def sampleDescription(box: ParsedBox): 132 | headerSize = Mp4Parser.headerSize(box) 133 | count = box.reader.readUint32() 134 | for _ in range(count): 135 | box.parser.parseNext(box.start + headerSize, box.reader, box.partialOkay) 136 | if box.parser.done_: 137 | break 138 | 139 | @staticmethod 140 | def allData(callback: Callable): 141 | def alldata_callback(box: ParsedBox): 142 | _all = box.reader.getLength() - box.reader.getPosition() 143 | return callback(box.reader.readBytes(_all)) 144 | return alldata_callback 145 | 146 | @staticmethod 147 | def typeFromString_(name: str): 148 | assert len(name) == 4, 'Mp4 box names must be 4 characters long' 149 | 150 | code = 0 151 | for char in name: 152 | code = (code << 8) | ord(char) 153 | return code 154 | 155 | @staticmethod 156 | def typeToString(_type: int): 157 | name = bytes([ 158 | (_type >> 24) & 0xff, 159 | (_type >> 16) & 0xff, 160 | (_type >> 8) & 0xff, 161 | _type & 0xff 162 | ]).decode('utf-8') 163 | return name 164 | 165 | @staticmethod 166 | def headerSize(box: ParsedBox): 167 | return 8 + (8 if box.has64BitSize else 0) + (4 if box.flags is not None else 0) -------------------------------------------------------------------------------- /util/TextParser.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | class TimeContext: 5 | def __init__(self, **kwargs): 6 | self.periodStart = kwargs['periodStart'] # tpye: float 7 | self.segmentStart = kwargs['segmentStart'] # tpye: float 8 | self.segmentEnd = kwargs['segmentEnd'] # tpye: float 9 | 10 | 11 | class TextParser: 12 | 13 | def __init__(self, data: str): 14 | self.data_ = data 15 | self.position_ = 0 16 | 17 | def atEnd(self): 18 | return self.position_ == len(self.data_) 19 | 20 | def readLine(self): 21 | # assert 1 == 0, 'not implemented yet' 22 | return self.readRegexReturnCapture_('(.*?)(\n|$)', 1) 23 | 24 | def readWord(self): 25 | # assert 1 == 0, 'not implemented yet' 26 | return self.readRegexReturnCapture_('[^ \t\n]*', 0) 27 | 28 | def readRegexReturnCapture_(self, regex: str, index: int): 29 | if self.atEnd(): 30 | return None 31 | ret = self.readRegex(regex) 32 | if not ret: 33 | return None 34 | else: 35 | return ret[index] 36 | 37 | def readRegex(self, regex: str): 38 | index = self.indexOf_(regex) 39 | if self.atEnd() or index is None or index.position != self.position_: 40 | return None 41 | 42 | self.position_ += index.length 43 | return index.results 44 | 45 | def indexOf_(self, regex: str): 46 | # assert 1 == 0, 'not implemented yet' 47 | results = re.search(regex, self.data_[self.position_:]) 48 | if not results: 49 | return 50 | else: 51 | return IndexOf(results) 52 | 53 | 54 | class IndexOf: 55 | def __init__(self, results: re.Match): 56 | self.position = results.regs[0][0] 57 | self.length = len(results[0]) 58 | self.results = results -------------------------------------------------------------------------------- /util/exceptions.py: -------------------------------------------------------------------------------- 1 | class Error(Exception): 2 | '''Base class for shaka errors.''' 3 | 4 | 5 | class SeverityError(Error): 6 | '''Severity Error.''' 7 | 8 | 9 | class CategoryError(Error): 10 | '''Category Error.''' 11 | 12 | 13 | class InvalidMp4VTT(Error): 14 | '''Code INVALID_MP4_VTT Error.''' 15 | def __init__(self, reason: str): 16 | self.reason = reason 17 | 18 | def __str__(self): 19 | return self.reason 20 | 21 | 22 | class InvalidMp4TTML(Error): 23 | '''Code INVALID_MP4_TTML Error.''' 24 | def __init__(self, reason: str): 25 | self.reason = reason 26 | 27 | def __str__(self): 28 | return self.reason 29 | 30 | 31 | class InvalidXML(Error): 32 | '''Code INVALID_XML Error.''' 33 | def __init__(self, reason: str): 34 | self.reason = reason 35 | 36 | def __str__(self): 37 | return self.reason 38 | 39 | 40 | class InvalidTextCue(Error): 41 | '''Code INVALID_TEXT_CUE Error.''' 42 | def __init__(self, reason: str): 43 | self.reason = reason 44 | 45 | def __str__(self): 46 | return self.reason 47 | 48 | 49 | class OutOfBoundsError(Error): 50 | '''Code BUFFER_READ_OUT_OF_BOUNDS Error.''' 51 | 52 | 53 | class IntOverflowError(Error): 54 | '''Code JS_INTEGER_OVERFLOW Error.''' 55 | --------------------------------------------------------------------------------