├── README.md
├── __init__.py
├── amazon_get_meta_info.py
├── amazon_get_real_url.py
├── amazon_get_season_urls.py
├── amazon_get_seasons_info.py
├── amazon_is_meta.py
├── amazon_meta_crawler.js
├── funi_request.py
├── getCookie.py
├── get_season_urls.py
├── hbomax_meta_crawler.js
├── log.py
├── netflix_meta_crawler.js
├── paramount_subtitle_trans.py
├── srtConvert.py
├── text
    ├── Cue.py
    ├── Mp4TtmlParser.py
    ├── Mp4VttParser.py
    ├── TextEngine.py
    ├── TtmlTextParser.py
    └── VttTextParser.py
└── util
    ├── DataViewReader.py
    ├── Functional.py
    ├── Mp4BoxParsers.py
    ├── Mp4Parser.py
    ├── TextParser.py
    └── exceptions.py


/README.md:
--------------------------------------------------------------------------------
 1 | # StreamFab Python Modules
 2 | 
 3 | This repository contains a collection of Python modules developed by StreamFab for various streaming-related tasks.
 4 | 
 5 | ## Introduction
 6 | 
 7 | StreamFab is a suite of tools and utilities designed to facilitate streaming media processing and manipulation. These Python modules offer functionality for tasks such as:
 8 | 
 9 | - Stream capturing
10 | - Video format conversion
11 | - Subtitle extraction
12 | - Metadata retrieval
13 | - and more!
14 | 


--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThatNotEasy/StreamFab/157f7a91f75459e523076ab9d586c798f1a9c989/__init__.py


--------------------------------------------------------------------------------
/amazon_get_meta_info.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import re, sys, string
  4 | import bs4,json
  5 | 
  6 | import logging
  7 | logger = logging.getLogger()
  8 | 
  9 | def get_seconds(str):
 10 |     try:
 11 |         seconds = 0
 12 |         if str=='':
 13 |             return seconds
 14 |         if '時間' in str and '分' in str:
 15 |             strs = str.split('時間')
 16 |             hours = strs[0]
 17 |             min_str = strs[1]
 18 |             min_strs = min_str.split('分')
 19 |             min = min_strs[0]
 20 |             seconds = 60 * 60 * int(hours) + 60 * int(min)
 21 |         elif 'h' in str:
 22 |             if ' ' in str:
 23 |                 strs = str.split(' ')
 24 |                 hours = strs[0]
 25 |                 min = strs[2]
 26 |                 seconds = 60 * 60 * int(hours) + 60 * int(min)
 27 |             else:
 28 |                 if 'min' in str:
 29 |                     str = str[:-3]
 30 |                     strs = str.split('h')
 31 |                     hours = strs[0]
 32 |                     min = strs[1]
 33 |                     seconds = 60 * 60 * int(hours) + 60 * int(min)
 34 |                 else:
 35 |                     str = str[:-1]
 36 |                     seconds = 60 * 60 * int(str)
 37 |         else:
 38 |             if ' ' in str:
 39 |                 strs = str.split(' ')
 40 |                 min = strs[0]
 41 |                 seconds = 60 * int(min)
 42 |             else:
 43 |                 str = str[:-3]
 44 |                 seconds = 60 * int(str)
 45 |                 pass
 46 | 
 47 |     except Exception:
 48 |         return 0
 49 | 
 50 |     return seconds
 51 | 
 52 | EU_REGION = "atv-ps-eu"
 53 | JP_REGION = "atv-ps-fe"
 54 | US_REGION = "atv-ps"
 55 | 
 56 | ROW_NA = "ART4WZ8MWBX2Y"
 57 | ROE_EU = "A3K6Y4MI8GDYMT"
 58 | ROW_EU = "A2MFUE2XK8ZSSY"
 59 | ROW_FE = "A15PK738MTQHSO"
 60 | 
 61 | DE = "A1PA6795UKMFR9"
 62 | JP = "A1VC38T7YXB528"
 63 | UK = "A1F83G8C2ARO7P"
 64 | US = "ATVPDKIKX0DER"
 65 | 
 66 | def getDomainFromMarketplaceId(marketplaceID):
 67 | 
 68 |     if marketplaceID==ROW_NA or marketplaceID==ROE_EU or marketplaceID==ROW_EU or marketplaceID==ROW_FE:
 69 |         return ".primevideo.com"
 70 |     if marketplaceID==DE:
 71 |         return ".amazon.de"
 72 |     if marketplaceID==JP:
 73 |         return ".amazon.co.jp"
 74 |     if marketplaceID==UK:
 75 |         return ".amazon.co.uk"
 76 |     return ".amazon.com"
 77 | 
 78 | def getAVTFromMarketplaceId(marketplaceID):
 79 | 
 80 |     if marketplaceID==ROW_NA or marketplaceID==US:
 81 |         return US_REGION
 82 |     if marketplaceID==ROE_EU or marketplaceID==ROW_EU or marketplaceID==UK or marketplaceID==DE:
 83 |         return EU_REGION
 84 |     if marketplaceID==ROW_FE or marketplaceID==JP:
 85 |         return JP_REGION
 86 |     raise("getAVTFromMarketplaceId error")
 87 | 
 88 | def get_meta_info(webpage):
 89 |     amazonSoup = bs4.BeautifulSoup(webpage,'html5lib')
 90 | 
 91 |     reJson = r'"marketplaceID":"(.*?)"'
 92 |     marketplaceID = re.findall(reJson, webpage)[0]
 93 | 
 94 |     domain = getAVTFromMarketplaceId(marketplaceID) + getDomainFromMarketplaceId(marketplaceID)
 95 | 
 96 |     movie_play_url = ''
 97 |     movie_play_urls = amazonSoup.select('.dv-dp-node-playback')
 98 |     if len(movie_play_urls)!=0:
 99 |         movie_play_url = movie_play_urls[0].find("a")['href']
100 | 
101 |     current_id = ""
102 |     if amazonSoup.find("ol",class_="_3kgCxW")!=None:
103 |         lis = amazonSoup.find("ol",class_="_3kgCxW").find_all("li")
104 |         for index in range(len(lis)):
105 |             li = lis[index]
106 |             href = ""
107 |             if li.find("a")==None:
108 |                 continue
109 |             if not(li.find("a").has_attr('href')):
110 |                 continue
111 |             href= li.find("a")['href']
112 |             episodeId = ""
113 |             if li.find("label")!=None:
114 |                 episodeId = li.find("label")['for'].replace('selector-','')
115 |             if href.split('ref')[0] == movie_play_url.split('ref')[0]:
116 |                 current_id = episodeId
117 | 
118 |     reLocale = r'"locale":"(.*?)"'
119 |     locale = re.findall(reLocale, webpage)[0]
120 | 
121 |     reKeywordTitle = r'"keywords":\["(.*?)"\]'
122 |     tmp_keyword_title = re.findall(reKeywordTitle, webpage)
123 |     reTitle = r'"parentTitle":"(.*?)"'
124 |     tmp_title = re.findall(reTitle, webpage)
125 | 
126 |     h1_title=""
127 |     h1_elements = amazonSoup.findAll('h1')
128 |     for h1_element in h1_elements:
129 |         if h1_element.find('img') :
130 |             h1_title=h1_element.find('img').attrs["alt"]
131 |         pass
132 | 
133 |     main_element = amazonSoup.find('main',attrs={'id':'main','data-testid':'detailpage-main'})
134 | 
135 |     if len(amazonSoup.select('._2Q73m9'))!=0:
136 |         title = amazonSoup.select('._2Q73m9')[0].string
137 |     elif len(amazonSoup.find_all(attrs={'data-automation-id':'title'}))!=0:
138 |         title = amazonSoup.find_all(attrs={'data-automation-id':'title'})[0].string
139 |     elif h1_title:
140 |         title = h1_title
141 |     elif len(tmp_keyword_title)!=0:
142 |         title = tmp_keyword_title[0] 
143 |     elif len(tmp_title)!=0:
144 |         title = tmp_title[0]
145 |     else:
146 |         title = amazonSoup.title.string
147 |     
148 |     release_time = ''
149 |     release_times = amazonSoup.find_all(attrs={'data-automation-id':'release-year-badge'})
150 |     if len(release_times)==1:
151 |         release_time = release_times[0].string
152 |     runtime_strs = amazonSoup.find_all(attrs={'data-automation-id':'runtime-badge'})
153 |     runtime_str = ""
154 |     if len(runtime_strs)==1:
155 |         runtime_str = runtime_strs[0].string
156 |     runtime = get_seconds(runtime_str)
157 |     
158 |     synopsis = ''
159 |     e_synopsis = amazonSoup.select('._1wxob_')
160 |     if len(e_synopsis)==0:
161 |         e_synopsis = amazonSoup.select('._3qsVvm')
162 |     if len(e_synopsis)==0 and main_element is not None:
163 |         e_synopsis = main_element.select('._5tB6mN')
164 |     if len(e_synopsis)==0 and main_element is not None:
165 |         e_synopsis = main_element.select('.dv-dp-node-synopsis')
166 |     if len(e_synopsis)!=0:
167 |         synopsis = e_synopsis[0].string
168 | 
169 |     thumbs = ''
170 |     e_thumb = amazonSoup.select('#atf-full')
171 |     if len(e_thumb)!=0:
172 |         thumbs = e_thumb[0]['src']
173 |     if not thumbs:
174 |         e_thumb = amazonSoup.find("img", class_="_2x6L3o")
175 |         if e_thumb:
176 |             thumbs = e_thumb['src']
177 |     if not thumbs:
178 |         div_thumb = amazonSoup.find(attrs={'data-automation-id':'hero-background'})
179 |         if div_thumb:
180 |             e_thumb = div_thumb.find("img")
181 |         if e_thumb:
182 |             thumbs = e_thumb['src']
183 | 
184 |     if current_id=="":
185 |         reCurrentId = r'"pageTitleId":"(.*?)"'
186 |         current_ids = re.findall(reCurrentId, webpage)
187 |         if len(current_ids)!=0:
188 |             current_id = re.findall(reCurrentId, webpage)[0]
189 | 
190 |     try:
191 |         audios = []
192 |         subtitles = []
193 | 
194 |         meta_info = amazonSoup.select('.dv-dp-node-meta-info')
195 |         if (len(meta_info) > 0):
196 |             l = len(meta_info[0].select('._2czKtE'))
197 |             if (l==1):
198 |                 e_audios = amazonSoup.select('.dv-dp-node-meta-info')[0].select("._2czKtE")[0].select('dd')[0].text
199 |                 for e_audio in e_audios.split(','):
200 |                     if '…' in e_audio:
201 |                         e_audio = e_audio.split('…')[1]
202 |                     audios.append(e_audio)
203 |             else:
204 |                 e_subtitles = amazonSoup.select('.dv-dp-node-meta-info')[0].select('._2czKtE')[0].select('dd')[0].text
205 |                 e_audios = amazonSoup.select('.dv-dp-node-meta-info')[0].select("._2czKtE")[1].select('dd')[0].text
206 |                 for e_audio in e_audios.split(','):
207 |                     if '…' in e_audio:
208 |                         e_audio = e_audio.split('…')[1]
209 |                     audios.append(e_audio)
210 |                 for e_subtitle in e_subtitles.split(','):
211 |                     if '…' in e_subtitle:
212 |                         e_subtitle = e_subtitle.split('…')[1]
213 |                     subtitles.append(e_subtitle)
214 | 
215 |     except Exception as e:
216 |         audios = ['English']
217 |         subtitles = ['English']
218 |     # if 'primevideo.com' not in domain:
219 |         # current_id = ""
220 | 
221 |     genres = []
222 |     if main_element is not None:
223 |         genres_div = main_element.find('div', {'class': 'dv-node-dp-genres'})
224 |         if genres_div:
225 |             genres_spans = genres_div.find_all('span', {'aria-label': True})
226 |             for span in genres_spans:
227 |                 genres.append(span.get('aria-label'))
228 |         # if genres is not None or len(genres) != 0:
229 |             # print(genres)
230 | 
231 | 
232 |     directors = []
233 |     casts =[]
234 |     try:
235 |         if main_element is not None:
236 |             div_product_detail = main_element.find(attrs={'data-automation-id':'btf-product-details'})
237 |             if div_product_detail:
238 |                 directors_dl = div_product_detail.find_all('dl')
239 |                 for dl in directors_dl:
240 |                     dt = dl.find('dt', string='Directors')
241 |                     if dt:
242 |                         directors = [director.text for director in dt.find_next_sibling('dd').find_all('a')]
243 |                         break
244 | 
245 |                 casts_dl = div_product_detail.find_all('dl')
246 |                 for dl in casts_dl:
247 |                     dt = dl.find('dt', string='Starring')
248 |                     if dt:
249 |                         casts = [cast.text for cast in dt.find_next_sibling('dd').find_all('a')]
250 |                         break
251 | 
252 |     except Exception as e:
253 |         logger.info(str(e))
254 |         directors = []
255 |         casts = []
256 |     
257 |     # if casts:
258 |     #     print('casts :',casts)
259 |     # if directors:
260 |     #     print("directors :",directors)
261 | 
262 | 
263 |     info = {
264 |         "id": current_id,
265 |         "marketplaceID":marketplaceID,
266 |         "domain":domain,
267 |         "locale":locale,
268 |         "movie_play_url":movie_play_url,
269 |         "title": title,
270 |         "release_time": release_time,
271 |         "runtime": runtime,
272 |         "runtime_str":runtime_str,
273 |         "subtitles":subtitles,
274 |         "audios":audios,
275 |         "synopsis": synopsis,
276 |         "thumbs": thumbs,
277 |         "directors": directors,
278 |         "casts": casts,
279 |         "writers": [],
280 |         "genres": genres,
281 |         "moodTags": []
282 |     }
283 | 
284 |     return info
285 | 
286 | def run(params):
287 |     arrParams = list(params)
288 |     log_path = ''
289 |     for index in range(len(arrParams)):
290 |         if index==0:
291 |             log_path = arrParams[index]
292 |         elif index==1:
293 |             output_file = arrParams[index]
294 |         else:
295 |             input_file= arrParams[index]
296 |     
297 |     # set logging
298 |     log_handler = logging.FileHandler(log_path)
299 |     log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
300 |     log_handler.setFormatter(log_formatter)
301 |     logger.addHandler(log_handler)
302 |     logger.setLevel(logging.INFO)
303 | 
304 |     try:
305 |         f = open(input_file,'r',encoding='UTF-8')
306 |         webpage = f.read()
307 |         f.close()
308 | 
309 |         info = get_meta_info(webpage)
310 | 
311 |         with open(output_file,"w",encoding='UTF-8') as f:
312 |             json.dump(info,f)
313 |         logger.info("get_meta_info complete...")
314 | 
315 |     except Exception as e:
316 |         logger.exception(str(e))
317 | 
318 | import os
319 | if __name__ == '__main__':
320 | 
321 |     params = [
322 |         "./python.log",
323 |         "./get_meta_info.json",
324 |         "./amazonGetMetaInfoTempIn.html",
325 |     ]
326 | 
327 |     if os.path.exists(params[0]):
328 |         os.remove(params[0])
329 |     if os.path.exists(params[1]):
330 |         os.remove(params[1])
331 | 
332 |     run(params)


--------------------------------------------------------------------------------
/amazon_get_real_url.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import re, sys, string
 4 | import bs4,json
 5 | 
 6 | import logging
 7 | logger = logging.getLogger()
 8 | 
 9 | def run(params):
10 |     arrParams = list(params)
11 |     log_path = ''
12 |     for index in range(len(arrParams)):
13 |         if index==0:
14 |             log_path = arrParams[index]
15 |         elif index==1:
16 |             output_file = arrParams[index]
17 |         else:
18 |             input_file= arrParams[index]
19 | 
20 |     # set logging
21 |     log_handler = logging.FileHandler(log_path)
22 |     log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
23 |     log_handler.setFormatter(log_formatter)
24 |     logger.addHandler(log_handler)
25 |     logger.setLevel(logging.INFO)
26 | 
27 |     try:
28 |         f = open(input_file,'r',encoding='UTF-8')
29 |         webpage = f.read()
30 |         f.close()
31 | 
32 |         amazonSoup = bs4.BeautifulSoup(webpage,'html5lib')
33 |         expander = amazonSoup.find_all(attrs={'data-automation-id':'ep-expander'})
34 |         if len(expander)==0:
35 |             real_url = ""
36 |         else:
37 |             real_url = expander[0]['href']
38 |         dict = {
39 |             "real_url":real_url
40 |         }
41 |         with open(output_file,"w", encoding='UTF-8') as f:
42 |             json.dump(dict,f)
43 |             logger.info("get_real_url complete...")
44 | 
45 |     except Exception as e:
46 |         logger.exception(str(e))
47 | 
48 | import os
49 | if __name__ == '__main__':
50 | 
51 |     params = [
52 |         "./log/python.log",
53 |         "./result/get_real_url.json",
54 |         "./html/season_1.html",
55 |     ]
56 | 
57 |     if os.path.exists(params[0]):
58 |         os.remove(params[0])
59 |     if os.path.exists(params[1]):
60 |         os.remove(params[1])
61 | 
62 |     run(params)


--------------------------------------------------------------------------------
/amazon_get_season_urls.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import re, sys, string
 4 | import bs4,json
 5 | 
 6 | import logging
 7 | logger = logging.getLogger()
 8 | 
 9 | def run(params):
10 |     arrParams = list(params)
11 |     log_path = ''
12 |     for index in range(len(arrParams)):
13 |         if index==0:
14 |             log_path = arrParams[index]
15 |         elif index==1:
16 |             output_file = arrParams[index]
17 |         else:
18 |             input_file= arrParams[index]
19 | 
20 |     # set logging
21 |     log_handler = logging.FileHandler(log_path)
22 |     log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
23 |     log_handler.setFormatter(log_formatter)
24 |     logger.addHandler(log_handler)
25 |     logger.setLevel(logging.INFO)
26 | 
27 |     try:
28 |         f = open(input_file,'r',encoding='UTF-8')
29 |         webpage = f.read()
30 |         f.close()
31 | 
32 |         amazonSoup = bs4.BeautifulSoup(webpage,'html5lib')
33 |         elems = amazonSoup.select('.dv-node-dp-seasons a')
34 |         season_urls = []
35 |         for elem in elems:
36 |             href = elem['href']
37 |             season_urls.append(href)
38 |         dict = {
39 |             "season_urls":season_urls
40 |         }
41 | 
42 |         if season_urls is None or len(season_urls)==0:
43 |             logger.warning("No season urls found '.dv-node-dp-seasons a' - trying '.dv-node-dp-seasons-default'...")
44 |             elems = amazonSoup.select('.dv-node-dp-seasons-default a')
45 |             season_urls = []
46 |             # 递归查找所有元素中的 herf
47 |             for elem in elems:
48 |                 href = elem['href']
49 |                 season_urls.append(href)
50 |             dict = {
51 |                 "season_urls":season_urls
52 |             }
53 | 
54 | 
55 | 
56 |         with open(output_file,"w", encoding='UTF-8') as f:
57 |             json.dump(dict,f)
58 |             logger.info("amazon_get_season_urls complete...")
59 | 
60 |     except Exception as e:
61 |         logger.exception(str(e))
62 | 
63 | import os
64 | if __name__ == '__main__':
65 | 
66 |     params = [
67 |         "./amazon/log/python.log",
68 |         "./amazon/result/get_season_urls.json",
69 |         "./amazon/html/_de/Watch Good Omens - Season 1 _ Prime Video.html",
70 |     ]
71 | 
72 |     if os.path.exists(params[0]):
73 |         os.remove(params[0])
74 |     if os.path.exists(params[1]):
75 |         os.remove(params[1])
76 | 
77 |     run(params)


--------------------------------------------------------------------------------
/amazon_get_seasons_info.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import re, sys, string
  4 | import bs4,json
  5 | 
  6 | import logging
  7 | logger = logging.getLogger()
  8 | 
  9 | def get_seconds(str):
 10 |     try:
 11 |         seconds = 0
 12 |         if str=='':
 13 |             return seconds
 14 |         if '時間' in str and '分' in str:
 15 |             strs = str.split('時間')
 16 |             hours = strs[0]
 17 |             min_str = strs[1]
 18 |             min_strs = min_str.split('分')
 19 |             min = min_strs[0]
 20 |             seconds = 60 * 60 * int(hours) + 60 * int(min)
 21 |         elif 'h' in str:
 22 |             if ' ' in str:
 23 |                 strs = str.split(' ')
 24 |                 hours = strs[0]
 25 |                 min = strs[2]
 26 |                 seconds = 60 * 60 * int(hours) + 60 * int(min)
 27 |             else:
 28 |                 if 'min' in str:
 29 |                     str = str[:-3]
 30 |                     strs = str.split('h')
 31 |                     hours = strs[0]
 32 |                     min = strs[1]
 33 |                     seconds = 60 * 60 * int(hours) + 60 * int(min)
 34 |                 else:
 35 |                     str = str[:-1]
 36 |                     seconds = 60 * 60 * int(str)
 37 |         else:
 38 |             if ' ' in str:
 39 |                 strs = str.split(' ')
 40 |                 min = strs[0]
 41 |                 seconds = 60 * int(min)
 42 |             else:
 43 |                 str = str[:-3]
 44 |                 seconds = 60 * int(str)
 45 |                 pass
 46 | 
 47 |     except Exception:
 48 |         return 0
 49 | 
 50 |     return seconds
 51 | 
 52 | def is_number(s):
 53 |     try:
 54 |         float(s)
 55 |         return True
 56 |     except ValueError:
 57 |         pass
 58 | 
 59 |     try:
 60 |         import unicodedata
 61 |         unicodedata.numeric(s)
 62 |         return True
 63 |     except (TypeError, ValueError):
 64 |         pass
 65 | 
 66 |     return False
 67 | 
 68 | def get_episodes_info(episodes_data):
 69 |     episodes_array = []
 70 |     count = 1
 71 | 
 72 |     for part in episodes_data:
 73 |         lis = part.find_all("li")
 74 |         for index in range(len(lis)):
 75 |             li = lis[index]
 76 | 
 77 |             title = ''
 78 |             js_title = li.find("div", class_="js-episode-title-name")
 79 |             if not js_title:
 80 |                 js_title = li.find("div", class_="dv-episode-noplayback-title")
 81 |             if not js_title:
 82 |                 js_title = li.find("div", class_="_1TQ2Rs")
 83 |             if not js_title:
 84 |                 js_title = li.find("div", class_="izvPPq")
 85 | 
 86 |             if not js_title:
 87 |                 js_title = li.find("span", class_="S4388s")
 88 |             if not js_title:
 89 |                 js_title = li.find("span", class_="P1uAb6")
 90 | 
 91 | 
 92 |             if js_title:
 93 |                 title = js_title.text
 94 |             
 95 |             synopsis = ''
 96 |             js_synopsis = li.find("div", class_="_3qsVvm")
 97 |             if js_synopsis:
 98 |                 synopsis = js_synopsis.string
 99 |  
100 |             if li.find("a")==None:
101 |                 count += 1
102 |                 continue
103 | 
104 |             if not(li.find("a").has_attr('href')):
105 |                 count += 1
106 |                 continue
107 | 
108 |             play_url = ""
109 |             play_url = li.find("a")['href']
110 |             if play_url == None:
111 |                 count += 1
112 |                 continue
113 |             
114 |             if 'force_return_url' in play_url:
115 |                 count += 1
116 |                 continue
117 | 
118 |             episodeId = ""
119 |             if li.find("label")!=None:
120 |                 episodeId = li.find("label")['for'].replace('selector-','')
121 | 
122 |             runtime_str = ''
123 |             js_runtime = li.find("div", class_="_1DcCXQ _2_ujMf")
124 |             if not js_runtime:
125 |                 js_runtime = li.find("div", class_="_3rBDNv _1AeKJC")
126 |             if not js_runtime:
127 |                 js_runtime = li.find("div", class_="_1wFEYz ci7S35")
128 |             if js_runtime:
129 |                 divs = js_runtime.find_all('div')
130 |                 if divs:
131 |                     if len(divs) > 1:
132 |                         runtime_str = divs[1].string
133 |                     else:
134 |                         runtime_str = divs[0].string
135 |                 
136 |             runtime = get_seconds(runtime_str)
137 |             
138 |             thumbs_url = ''
139 |             pic = li.find("picture")
140 |             if pic:
141 |                 img = pic.find("img")
142 |                 thumbs_url = img['src']
143 |                 
144 |             if not thumbs_url:
145 |                 img = li.find("img")
146 |                 if img:
147 |                     thumbs_url = img['src']
148 | 
149 |             # positionStr = title.replace(" ",'').split('.')[0]
150 |             # if(is_number(positionStr)):
151 |             #     position = int(positionStr) 
152 |             # else:
153 |             #     position = count
154 |             position = count
155 | 
156 |             e_obj = {
157 |                     "seq": position,
158 |                     "episodeId": episodeId,
159 |                     "play_url": play_url,
160 |                     "synopsis": synopsis,
161 |                     "title": title,
162 |                     "runtime": runtime,
163 |                     "runtime_str":runtime_str,
164 |                     "thumbs": {
165 |                         "url": thumbs_url
166 |                     }
167 |                 }
168 | 
169 |             count += 1
170 |             episodes_array.append(e_obj)
171 | 
172 |     return episodes_array
173 | 
174 | def get_extras_info(extras_data):
175 |     extras = {}
176 | 
177 |     for tag in extras_data:
178 |         extras_tag = ''
179 |         js_title = tag.find(class_='ROp-tf')
180 |         if not js_title:
181 |             js_title = tag.find(class_='Dsc37Q')  
182 |         if js_title:
183 |             extras_tag = js_title.text.replace(" ", "")
184 |     
185 |         extras_list = []
186 |         lis = []
187 |         ul = tag.find(class_="jxBPRE _28m62t")
188 |         if ul:
189 |             lis = ul.find_all("li")
190 | 
191 |         if not lis:
192 |             lis = tag.find_all(class_="_1z3n6o")
193 | 
194 |         for index in range(len(lis)):
195 |             li = lis[index]
196 | 
197 |             title = ""
198 |             runtime_str = ""
199 |             runtime = 0
200 |             rating = ""
201 |             play_id = ""
202 |             play_url = ""
203 |             synopsis = ""
204 |             thumbs_url = ""
205 | 
206 |             js_playbutton = li.find('a', attrs={'data-automation-id':'extras-playbutton'})
207 |             if not js_playbutton:
208 |                 continue
209 |             if js_playbutton:
210 |                 play_url = js_playbutton['href']
211 | 
212 |             js_title = li.find("div", class_="vRplU5")
213 |             if not js_title:
214 |                 js_title = li.find('span', class_="lTKTFD")
215 |             if js_title:
216 |                 title = js_title.string
217 | 
218 |             js_runtime_str = li.find(attrs={'data-automation-id':'runtime-badge'})
219 |             if js_runtime_str:
220 |                 runtime_str = js_runtime_str.string
221 |                 runtime = get_seconds(runtime_str)
222 | 
223 |             js_rating = li.find(class_="_2BZ5w7")
224 |             if not js_rating:
225 |                 js_rating = li.find('span', class_="G8xF_x")
226 |             if js_rating:
227 |                 rating = js_rating.string
228 | 
229 |             js_synopsis = li.find(class_="_16wNxC")
230 |             if not js_synopsis:
231 |                 js_synopsis = li.find(class_="rPtVMq")
232 |             if js_synopsis:
233 |                 synopsis = js_synopsis.string
234 |             
235 |             js_thumbs = li.find('img')
236 |             if js_thumbs:
237 |                 thumbs_url = js_thumbs['src']
238 | 
239 |             js_data_asin = li.find(class_="_1Opa2_ dvui-packshot _3g93Un")
240 |             if not js_data_asin:
241 |                 js_data_asin = li.find(class_="_1Opa2_ dvui-packshot _8eIApy")
242 |             if js_data_asin:
243 |                 play_id = js_data_asin['data-asin']
244 | 
245 |             meta = {
246 |                 "title": title,
247 |                 "runtime": runtime,
248 |                 "runtime_str": runtime_str,
249 |                 "rating": rating,
250 |                 "play_id": play_id,
251 |                 "play_url": play_url,
252 |                 "synopsis": synopsis,
253 |                 "thumbs": {"url":thumbs_url},
254 |             }
255 |             extras_list.append(meta)
256 | 
257 |         extras[extras_tag] = extras_list
258 | 
259 |     return extras
260 | 
261 | def get_seasons_info(webpages):
262 | 
263 |     all_seasons = []
264 | 
265 |     for webpage in webpages:
266 |         amazonSoup = bs4.BeautifulSoup(webpage,'html5lib')
267 |         
268 |         season_info = amazonSoup.find(class_="XR0d6P")   
269 |         if not season_info:
270 |             season_info = amazonSoup.find(class_="GG33WY")
271 |         # if not season_info:
272 |         #     season_info = amazonSoup.find(class_="dv-node-dp-seasons")
273 |         if not season_info:
274 |             continue
275 | 
276 |         pageTitleId = ''
277 |         
278 |         reCurrentId = r'"pageTitleId":"(.*?)"'
279 |         page = re.findall(reCurrentId, webpage)
280 |         if len(page) > 0:
281 |             pageTitleId = page[0]
282 | 
283 |         if pageTitleId == '':
284 |             reCurrentId = r'"originalURI":"(.*?)"'
285 |             originalURI = re.findall(reCurrentId, webpage)
286 |             if len(originalURI) > 0:
287 |                 originalURI = originalURI[0]
288 |                 match = re.search(r'/([A-Z0-9]+)\b', originalURI)
289 |                 if match:
290 |                     pageTitleId = match.group(1)
291 | 
292 |         big_title = ''
293 |         titles = amazonSoup.select('._2Q73m9')
294 |         if len(titles)!=0:
295 |             big_title = titles[0].string
296 |         elif len(amazonSoup.find_all(attrs={'data-automation-id':'title'}))!=0:
297 |             big_title = amazonSoup.find_all(attrs={'data-automation-id':'title'})[0].string
298 |         else:
299 |             big_title = amazonSoup.title.string
300 |         
301 |         release_year = ''
302 |         if len(amazonSoup.find_all(attrs={'data-automation-id':'release-year-badge'}))!=0:
303 |             release_year = amazonSoup.find_all(attrs={'data-automation-id':'release-year-badge'})[0].string
304 | 
305 |         season_name = ''
306 |         dv_node_dp_seasons = amazonSoup.find_all(class_="dv-node-dp-seasons")
307 |         if len(dv_node_dp_seasons)==0:
308 |             dv_node_dp_seasons = amazonSoup.find_all(class_='dv-node-dp-seasons-default')
309 |         
310 |         if len(dv_node_dp_seasons)!=0:
311 |             e_season = dv_node_dp_seasons[0].find(class_="_36qUej")
312 |             if not e_season:
313 |                 e_season = dv_node_dp_seasons[0].find(class_="_3R4jka")
314 | 
315 |             if e_season:
316 |                 season_name = e_season.string
317 | 
318 |         if not season_name:   
319 |             fclass = amazonSoup.find(class_="XqYSS8 dw87r6")
320 |             if fclass:
321 |                 if len(fclass.select("._36qUej"))!=0:
322 |                     season_name = fclass.select("._36qUej")[0].string
323 |         
324 |         if not season_name:
325 |             fclass =amazonSoup.find_all(class_="XqYSS8 _1J8qi6")
326 |             if len(fclass)!=0:
327 |                 season_name = fclass[0].string
328 | 
329 |         if not season_name:
330 |             data_automation_id = amazonSoup.find(attrs={'data-automation-id':'title'})
331 |             if data_automation_id:
332 |                 seasonNameDomList = data_automation_id.next_siblings
333 |                 for seasonNameDom in seasonNameDomList:
334 |                     if type(seasonNameDom) is not bs4.element.NavigableString:
335 |                         seasonNameDomStr = seasonNameDom.get_text().strip().replace('\n', '').replace('\r', '').strip()
336 |                         if "Season 1"==seasonNameDomStr:
337 |                             season_name = seasonNameDomStr
338 |                         elif "Season 1" in seasonNameDomStr:
339 |                             season_name = seasonNameDomStr.split('Season 1')[0].strip().replace('\n', '').replace('\r', '').strip()
340 |                         else:
341 |                             season_name = seasonNameDomStr
342 |                         break
343 |                     
344 |         episodes = []
345 |         episodes_data = amazonSoup.find_all(class_='XR0d6P')
346 |         if not episodes_data:
347 |             episodes_data = amazonSoup.find_all(class_='GG33WY')
348 |         if episodes_data:
349 |             episodes = get_episodes_info(episodes_data)
350 |  
351 |         extras = {}
352 |         extras_data = amazonSoup.select('._3g0WlT')
353 |         if not extras_data:
354 |             extras_data = amazonSoup.select('._3QUUKy')
355 |         if not extras_data:
356 |             extras_data = amazonSoup.select('._4fOMiL')
357 |         if extras_data:
358 |             extras = get_extras_info(extras_data)
359 | 
360 |         bShowMore = False
361 |         total_episodes = 0
362 |         ep_expander = amazonSoup.find_all(attrs={'data-automation-id':'ep-expander'})
363 |         if ep_expander:
364 |             bShowMore = True
365 |             text = ep_expander[0].text.strip()
366 |             match = re.search(r'\d+', text)
367 |             if match:
368 |                 total_episodes = int(match.group())
369 | 
370 |         if amazonSoup.find(class_='_1NNx6V DwgwxH'):
371 |             bShowMore = True
372 | 
373 |         seasons = {
374 |                 "id": pageTitleId,
375 |                 "title": big_title,
376 |                 "release_time": release_year,
377 |                 "season_name":season_name,
378 |                 "episodes": episodes,
379 |                 "extras": extras,
380 |                 "show_more": bShowMore,
381 |                 "total_episodes": total_episodes
382 |             }
383 |         
384 |         all_seasons.append(seasons)
385 | 
386 |     obj = {"seasons": all_seasons}
387 | 
388 |     return obj
389 | 
390 | def run(params): 
391 |     arrParams = list(params)
392 |     log_path = ''
393 |     webpages = []   
394 |     for index in range(len(arrParams)):
395 |         if index==0:
396 |             log_path = arrParams[index]
397 |         elif index==1:
398 |             output_file = arrParams[index]
399 |         else:
400 |             strIn = arrParams[index]
401 |             f = open(strIn,'r', encoding='UTF-8')
402 |             webpages.append(f.read())
403 |             f.close()
404 | 
405 |     # set logging
406 |     log_handler = logging.FileHandler(log_path)
407 |     log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
408 |     log_handler.setFormatter(log_formatter)
409 |     logger.addHandler(log_handler)
410 |     logger.setLevel(logging.INFO)
411 | 
412 |     try:
413 |         with open(output_file,"w",encoding='UTF-8') as f:
414 |             season_info = get_seasons_info(webpages)
415 |             json.dump(season_info,f)
416 |             logger.info("amazon_get_seasons_info complete...")
417 |             
418 |     except Exception as e:
419 |         logger.exception(str(e))
420 | 
421 | import os
422 | if __name__ == '__main__':
423 | 
424 |     params = [
425 |         "./amazon/log/python.log",
426 |         "./amazon/result/get_seasons_info.json",
427 |         "./amazon/html/_de/Watch Good Omens - Season 1 _ Prime Video.html",
428 |     ]
429 | 
430 |     if os.path.exists(params[0]):
431 |         os.remove(params[0])
432 |     if os.path.exists(params[1]):
433 |         os.remove(params[1])
434 | 
435 |     run(params)


--------------------------------------------------------------------------------
/amazon_is_meta.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import re, sys, string
  4 | import bs4,json
  5 | 
  6 | import logging
  7 | logger = logging.getLogger()
  8 | 
  9 | def run(params):
 10 |     arrParams = list(params)
 11 |     log_path = ''
 12 |     for index in range(len(arrParams)):
 13 |         if index==0:
 14 |             log_path = arrParams[index]
 15 |         elif index==1:
 16 |             output_file = arrParams[index]
 17 |         else:
 18 |             input_file= arrParams[index]
 19 | 
 20 |     # set logging
 21 |     log_handler = logging.FileHandler(log_path)
 22 |     log_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
 23 |     log_handler.setFormatter(log_formatter)
 24 |     logger.addHandler(log_handler)
 25 |     logger.setLevel(logging.INFO)
 26 | 
 27 |     try:
 28 |         f = open(input_file,'r',encoding='UTF-8')
 29 |         webpage = f.read()
 30 |         f.close()
 31 | 
 32 |         amazonSoup = bs4.BeautifulSoup(webpage,'html5lib')
 33 |         #<main id="main" data-testid="detailpage-main"></main>
 34 | 
 35 |         is_meta = False
 36 |         media_type = 'movie'
 37 |         
 38 |         main_element = amazonSoup.find('main',attrs={'id':'main','data-testid':'detailpage-main'})
 39 |         if main_element is not None:
 40 |             e_synopsis = main_element.select('._1wxob_')
 41 |             if len(e_synopsis)==0:
 42 |                 e_synopsis = main_element.select('._3qsVvm')
 43 |             
 44 |             if len(e_synopsis)==0:
 45 |                 e_synopsis = main_element.select('._5tB6mN')
 46 | 
 47 |             e_thumb = main_element.select('#atf-full')
 48 |             if len(e_thumb)==0:
 49 |                 e_thumb = main_element.select('.om7nme')
 50 |                 
 51 |             if len(e_synopsis)==0 and len(e_thumb)==0:
 52 |                 is_meta = False
 53 |             else:
 54 |                 is_meta = True
 55 | 
 56 |             #[show, movie, is_live, has_live, upcoming]
 57 |             e_live = amazonSoup.find_all(attrs={'data-automation-id':'live-state-badge'})
 58 |             if len(e_live) > 0:
 59 |                 str = e_live[0].string
 60 |                 if str == 'LIVE':
 61 |                     media_type = 'is_live'
 62 |                 elif str == 'UPCOMING':
 63 |                     media_type = 'upcoming'
 64 |                 else:
 65 |                     media_type = 'has_live'
 66 |             else:
 67 |                 e_episodes = amazonSoup.find_all(class_='XR0d6P')
 68 |                 if not e_episodes:
 69 |                     e_episodes = amazonSoup.find_all(class_='GG33WY')
 70 |                 if len(e_episodes) > 0:
 71 |                     media_type = 'show'
 72 |                 else:
 73 |                     media_type = 'movie'
 74 | 
 75 |         dict = {
 76 |             'is_meta':is_meta,
 77 |             'media_type':media_type
 78 |         }
 79 | 
 80 |         with open(output_file,"w", encoding='UTF-8') as f:
 81 |             json.dump(dict,f)
 82 |             logging.info("amazon_is_meta complete...")
 83 |                 
 84 |     except Exception as e:
 85 |         logger.exception(str(e))
 86 | 
 87 | import os
 88 | if __name__ == '__main__':
 89 | 
 90 |     params = [
 91 |         "./log/python.log",
 92 |         "./result/is_meta.json",
 93 |         "./html/Live/live_Amazon.com_ Wu-Tang Clan & Nas_ NY State of Mind Tour at Climate Pledge Arena _ Movies & TV.html",
 94 |         #"./html/Watch Meg 2_ The Trench _ Prime Video.html",
 95 |     ]
 96 | 
 97 |     if os.path.exists(params[0]):
 98 |         os.remove(params[0])
 99 |     if os.path.exists(params[1]):
100 |         os.remove(params[1])
101 | 
102 |     run(params)


--------------------------------------------------------------------------------
/funi_request.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | import json
 3 | import logging
 4 | def hello():
 5 |     print('hello')
 6 |     pass
 7 | 
 8 | def api_request(id, token, output_file):
 9 | 
10 |     # logging.basicConfig(level=logging.INFO,
11 |     #                 format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
12 |     #                 datefmt='%m-%d %H:%M',
13 |     #                 filename='e:\\myapp.log',
14 |     #                 filemode='w')
15 | 
16 |     # console = logging.StreamHandler()
17 |     # console.setLevel(logging.INFO)
18 |     # # set a format which is simpler for console use
19 |     # formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s')
20 |     # # tell the handler to use this format
21 |     # console.setFormatter(formatter)
22 |     # logging.getLogger('').addHandler(console)
23 | 
24 |     hdr = { 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:70.0) Gecko/20100101 Firefox/70.0' }
25 |     hdr['devicetype'] = 'Android Phone'
26 |     hdr['Authorization'] = 'Token {}'.format(token)
27 | 
28 |     url= 'https://prod-api-funimationnow.dadcdigital.com/api/source/catalog/video/{}/signed'.format(id)
29 |     print('{}'.format(url))
30 |     r= requests.get(url, headers=hdr)
31 |     try:
32 |         x = r.json()
33 |         with open(output_file, "wb") as file:
34 |             file.write(bytes(json.dumps(x), encoding='utf-8'))
35 |     except:
36 |         return None
37 | 
38 | if __name__ == '__main__':
39 |     # id = '1399964'
40 |     id = '1398850'
41 |     outputdir = 'C:\\Users\\fab\\AppData\\Local\\Temp\\'
42 |     outputfile='{}{}.json'.format(outputdir, id)
43 |     api_request(id, '5b506b51f20f11512db78d1ee944dad001a5d398', outputfile)
44 | 


--------------------------------------------------------------------------------
/getCookie.py:
--------------------------------------------------------------------------------
  1 | # uncompyle6 version 3.9.1
  2 | # Python bytecode version base 3.7.0 (3394)
  3 | # Decompiled from: Python 3.9.6 (default, Jun 27 2024, 17:58:20) 
  4 | # [GCC 4.8.5 20150623 (Red Hat 4.8.5-44)]
  5 | # Embedded file name: getCookie.py
  6 | 
  7 | import os, re, sys, json, base64, sqlite3, random, configparser, logging, logging.config
  8 | from win32crypt import CryptUnprotectData
  9 | from cryptography.hazmat.primitives.ciphers.aead import AESGCM
 10 | from abc import abstractmethod, ABCMeta
 11 | NODE_ELK_TOKEN_ID = "elk_token_id"
 12 | NODE_PREFIX = "_YY_TID_"
 13 | NODE_WEB_TOKEN = "web_token"
 14 | NODE_ITEM = "item"
 15 | NODE_BROWSER = "browser"
 16 | NODE_DOMAIN = "domain"
 17 | NODE_TYPE = "type"
 18 | VALUE_OLD = "old"
 19 | VALUE_NEW = "new"
 20 | CONFIG_NODE_PROFILE = "Profile"
 21 | CONFIG_KEY_PATH = "path"
 22 | 
 23 | class CookieParser(metaclass=ABCMeta):
 24 | 
 25 |     @abstractmethod
 26 |     def get_elk_data(self, host):
 27 |         pass
 28 | 
 29 | 
 30 | class CookieParserBase(CookieParser):
 31 | 
 32 |     def get_elk_data(self, host_list):
 33 |         """
 34 |         从cookie数据中解析elk数据
 35 |         :param host: 网站域名，e.g. dvdfab.cn
 36 |         :return: 返回包含网站elk数据的字典
 37 |         """
 38 |         host_elk_data = list()
 39 |         for host in host_list:
 40 |             cookie_data_list = self.get_cookie_data(host)
 41 |             for cookie_data in cookie_data_list:
 42 |                 if NODE_ELK_TOKEN_ID in cookie_data:
 43 |                     elk_old_data = dict()
 44 |                     elk_old_data[NODE_BROWSER] = self.get_browser_name()
 45 |                     elk_old_data[NODE_ITEM] = cookie_data[NODE_ELK_TOKEN_ID]
 46 |                     elk_old_data[NODE_DOMAIN] = host
 47 |                     elk_old_data[NODE_TYPE] = VALUE_OLD
 48 |                     host_elk_data.append(elk_old_data)
 49 |                 keys = cookie_data.keys()
 50 |                 for key in keys:
 51 |                     if key.startswith(NODE_PREFIX):
 52 |                         data = key.split("_", 3)
 53 |                         if len(data) == 4:
 54 |                             elk_new_data = dict()
 55 |                             elk_new_data[NODE_BROWSER] = self.get_browser_name()
 56 |                             elk_new_data[NODE_ITEM] = data[3]
 57 |                             elk_new_data[NODE_DOMAIN] = host
 58 |                             elk_new_data[NODE_TYPE] = VALUE_NEW
 59 |                             host_elk_data.append(elk_new_data)
 60 |                         elif re.match("_YY_V\\d{1,2}_TID_", key):
 61 |                             data = key.split("_", 4)
 62 |                             if len(data) == 5:
 63 |                                 elk_new_data = dict()
 64 |                                 elk_new_data[NODE_BROWSER] = self.get_browser_name()
 65 |                                 elk_new_data[NODE_ITEM] = data[4]
 66 |                                 elk_new_data[NODE_DOMAIN] = host
 67 |                                 elk_new_data[NODE_TYPE] = data[2].lower()
 68 |                                 host_elk_data.append(elk_new_data)
 69 | 
 70 |         return host_elk_data
 71 | 
 72 |     @abstractmethod
 73 |     def get_cookie_data(self, host):
 74 |         """
 75 |         获取host对应的cookie数据
 76 |         :param host: 网站域名，e.g. dvdfab.cn
 77 |         :return: 返回包含网站cookie数据的字典的list
 78 |         """
 79 |         pass
 80 | 
 81 |     @abstractmethod
 82 |     def get_sql(self, host):
 83 |         pass
 84 | 
 85 |     @abstractmethod
 86 |     def get_browser_name(self):
 87 |         pass
 88 | 
 89 |     have_log = False
 90 | 
 91 | 
 92 | class CookieParserFireFox(CookieParserBase):
 93 |     __doc__ = "从profiles.ini中获取cookies目录名称"
 94 | 
 95 |     def get_cookies_path_from_profile(self):
 96 |         profile_path_list = list()
 97 |         mozilla_profile = os.path.join(os.getenv("APPDATA"), "Mozilla\\Firefox")
 98 |         mozilla_profile_ini = os.path.join(mozilla_profile, "profiles.ini")
 99 |         if not os.path.exists(mozilla_profile_ini):
100 |             logger.warning("Profile.ini file is not found.")
101 |             return profile_path_list
102 |         try:
103 |             profile = configparser.ConfigParser()
104 |             profile.read(mozilla_profile_ini)
105 |             for i in range(10):
106 |                 profile_section = CONFIG_NODE_PROFILE + str(i)
107 |                 if profile.has_section(profile_section):
108 |                     data_path = os.path.normpath(os.path.join(mozilla_profile, profile.get(profile_section, "Path")))
109 |                     cookie_path = os.path.join(data_path, "cookies.sqlite")
110 |                     profile_path_list.append(cookie_path)
111 | 
112 |         except Exception as e:
113 |             try:
114 |                 logger.warning("Read profiles.ini failed {}".format(e))
115 |             finally:
116 |                 e = None
117 |                 del e
118 | 
119 |         return profile_path_list
120 | 
121 |     def get_cookie_path(self):
122 |         """
123 |         获取浏览器cookies文件路径
124 |         :return: cookie文件全路径
125 |         """
126 |         s_cookiepath_common = os.environ["APPDATA"] + "\\Mozilla\\Firefox\\Profiles"
127 |         profile_path_list = list()
128 |         if not os.path.exists(s_cookiepath_common):
129 |             return profile_path_list
130 |         profile_path_list = self.get_cookies_path_from_profile()
131 |         if len(profile_path_list) == 0:
132 |             l_folds_arr = os.listdir(s_cookiepath_common)
133 |             l_folds_end = [os.path.splitext(s_file)[-1][1[:None]] for s_file in l_folds_arr]
134 |             if "default-release" in l_folds_end:
135 |                 cookie_fold_index = l_folds_end.index("default-release")
136 |             else:
137 |                 if "default" in l_folds_end:
138 |                     cookie_fold_index = l_folds_end.index("default")
139 |                 else:
140 |                     if random.randint(0, 100) == 50:
141 |                         logger.error("Cannot find default folder: {}".format(str(l_folds_end)))
142 |                         cookie_fold_index = l_folds_end.index("default")
143 |                     else:
144 |                         logger.error("Cannot find default folder: {}".format(str(l_folds_end)))
145 |                         return profile_path_list
146 |             cookie_fold = l_folds_arr[cookie_fold_index]
147 |             cookie_path = os.path.join(s_cookiepath_common, cookie_fold)
148 |             cookie_path = os.path.join(cookie_path, "cookies.sqlite")
149 |             profile_path_list.append(cookie_path)
150 |         return profile_path_list
151 | 
152 |     def get_sql(self, host):
153 |         sql = "select host,name,value from moz_cookies where host='%s'" % host
154 |         return sql
155 | 
156 |     def get_browser_name(self):
157 |         return "FileFox"
158 | 
159 |     def get_cookie_data(self, host):
160 |         cookie_data_list = list()
161 |         cookie_path_list = self.get_cookie_path()
162 |         for cookie_path in cookie_path_list:
163 |             if not os.path.exists(cookie_path):
164 |                 continue
165 |             if self.have_log == False:
166 |                 logger.info("Found cookies file, {}".format(cookie_path))
167 |             sql = self.get_sql(host)
168 |             try:
169 |                 with sqlite3.connect(cookie_path) as conn:
170 |                     cur = conn.cursor()
171 |                     d_cookie = dict()
172 |                     res = cur.execute(sql).fetchall()
173 |                     for host_key, name, value in res:
174 |                         if name == "miniDialog":
175 |                             continue
176 |                         d_cookie[name] = value
177 | 
178 |                     if d_cookie:
179 |                         cookie_data_list.append(d_cookie)
180 |             except Exception as e:
181 |                 try:
182 |                     logger.warning("occur exception: {}".format(e))
183 |                 finally:
184 |                     e = None
185 |                     del e
186 | 
187 |         self.have_log = True
188 |         return cookie_data_list
189 | 
190 | 
191 | class CookieParserGoogleChome(CookieParserBase):
192 |     __doc__ = "负责解析google chrome 浏览器cookie数据"
193 | 
194 |     def get_local_state(self):
195 |         """
196 |         获取Local State文件路径
197 |         :return: 文件全路径
198 |         """
199 |         s_path = "\\Google\\Chrome\\User Data\\Local State"
200 |         s_local_state = os.environ["LOCALAPPDATA"] + s_path
201 |         return s_local_state
202 | 
203 |     def get_cookie_path(self):
204 |         """
205 |                s_browser_name: 浏览器名称(Google Chrome, Microsoft Edge)
206 |                s_host: 域名   例如： '.umeng.com'
207 |         """
208 |         s_cookie = "\\Google\\Chrome\\User Data\\Default\\Network\\Cookies"
209 |         s_cookie_path = os.environ["LOCALAPPDATA"] + s_cookie
210 |         return s_cookie_path
211 | 
212 |     def get_sql(self, host):
213 |         sql = "select host_key,name,encrypted_value from cookies where host_key='%s'" % host
214 |         return sql
215 | 
216 |     def get_browser_name(self):
217 |         return "Chrome"
218 | 
219 |     def get_cookie_data(self, host):
220 |         """
221 |         根据Local State解析出加密key，根据key解密cookie数据
222 |         :param host:
223 |         :return:
224 |         """
225 |         cookie_data_list = list()
226 |         s_cookie_path = self.get_cookie_path()
227 |         if not os.path.exists(s_cookie_path):
228 |             if self.have_log == False:
229 |                 logger.warning("The {} cookie files not found, {}".format(self.get_browser_name(), s_cookie_path))
230 |                 self.have_log = True
231 |             return cookie_data_list
232 |         sql = self.get_sql(host)
233 |         s_local_state = self.get_local_state()
234 |         try:
235 |             with sqlite3.connect(s_cookie_path) as conn:
236 |                 p_cursor = conn.cursor()
237 |                 p_res = p_cursor.execute(sql).fetchall()
238 |                 p_cursor.close()
239 |                 key = self._CookieParserGoogleChome__pull_the_key(self._CookieParserGoogleChome__get_string(s_local_state))
240 |                 d_cookie = dict()
241 |                 for s_host_key, s_name, c_encrypted_value in p_res:
242 |                     if c_encrypted_value[0[:3]] == b'v10':
243 |                         d_cookie[s_name] = self._CookieParserGoogleChome__decrypt_string(key, c_encrypted_value)
244 |                     else:
245 |                         d_cookie[s_name] = CryptUnprotectData(c_encrypted_value)[1].decode()
246 | 
247 |                 cookie_data_list.append(d_cookie)
248 |         except Exception as e:
249 |             try:
250 |                 logger.warning("occur exception: {}".format(e))
251 |             finally:
252 |                 e = None
253 |                 del e
254 | 
255 |         return cookie_data_list
256 | 
257 |     def __get_string(self, s_local_state):
258 |         with open(s_local_state, "r", encoding="utf-8") as f:
259 |             s_encrtpted_key = json.load(f)["os_crypt"]["encrypted_key"]
260 |         return s_encrtpted_key
261 | 
262 |     def __pull_the_key(self, base64_encrypted_key):
263 |         c_encrypted_key_with_header = base64.b64decode(base64_encrypted_key)
264 |         c_encrypted_key = c_encrypted_key_with_header[5[:None]]
265 |         c_key = CryptUnprotectData(c_encrypted_key, None, None, None, 0)[1]
266 |         return c_key
267 | 
268 |     def __decrypt_string(self, c_key, c_data):
269 |         c_nonce, c_cipherbytes = c_data[3[:15]], c_data[15[:None]]
270 |         aesgcm = AESGCM(c_key)
271 |         c_plainbytes = aesgcm.decrypt(c_nonce, c_cipherbytes, None)
272 |         s_plaintext = c_plainbytes.decode("utf-8")
273 |         return s_plaintext
274 | 
275 | 
276 | class CoolkieParserEdge(CookieParserGoogleChome):
277 |     __doc__ = "负责解析Edge浏览器cookie数据 "
278 | 
279 |     def get_cookie_path(self):
280 |         s_cookie = "\\Microsoft\\Edge\\User Data\\Default\\Network\\Cookies"
281 |         s_cookie_path = os.environ["LOCALAPPDATA"] + s_cookie
282 |         return s_cookie_path
283 | 
284 |     def get_local_state(self):
285 |         s_path = "\\Microsoft\\Edge\\User Data\\Local State"
286 |         s_local_state = os.environ["LOCALAPPDATA"] + s_path
287 |         return s_local_state
288 | 
289 |     def get_browser_name(self):
290 |         return "Edge"
291 | 
292 | 
293 | def get_elk_data(host_list):
294 |     """收集所有浏览器的elk数据"""
295 |     data_dict = {'google_chrome':CookieParserGoogleChome(), 
296 |      'edge':CoolkieParserEdge(), 
297 |      'firefox':CookieParserFireFox()}
298 |     total_data_dict = {}
299 |     host_data_list = list()
300 |     for key in data_dict:
301 |         cookie_parser = data_dict.get(key)
302 |         logger.info("Start get data for {} ...".format(key))
303 |         host_data_list += cookie_parser.get_elk_data(host_list)
304 |         logger.info("End get data for {} ...".format(key))
305 | 
306 |     total_data_dict[NODE_WEB_TOKEN] = host_data_list
307 |     return total_data_dict
308 | 
309 | 
310 | def save_elk_data(data_dict, file):
311 |     try:
312 |         json_string = json.dumps(data_dict)
313 |         json_string = base64.b64encode(json_string.encode("utf-8"))
314 |         with open(file, "wb") as f:
315 |             f.write(json_string)
316 |     except Exception as e:
317 |         try:
318 |             logging.error("save file exception: {}".format(e))
319 |         finally:
320 |             e = None
321 |             del e
322 | 
323 | 
324 | def init_logging():
325 |     global logger
326 |     logFilename = os.environ["TEMP"] + "\\devcon.log"
327 |     logging.basicConfig(level=(logging.DEBUG),
328 |       format="%(asctime)s-%(levelname)s-%(message)s",
329 |       datefmt="%y-%m-%d %H:%M",
330 |       filename=logFilename,
331 |       filemode="w+")
332 |     filehandler = logging.FileHandler(logFilename, encoding="utf-8")
333 |     logging.getLogger().addHandler(filehandler)
334 |     logger = logging.getLogger("devcon.log")
335 | 
336 | 
337 | if __name__ == "__main__":
338 |     if len(sys.argv) < 1:
339 |         print("Usage: python.exe output_file_path")
340 |         sys.exit(1)
341 |     init_logging()
342 |     logger.info("App version: 2022-07-27-14-55")
343 |     logger.info("Param: " + sys.argv[1])
344 |     host_list = [
345 |      '.dvdfab.cn', '.dvdfab.fr', '.dvdfab.at', '.dvdfab.co.jp', '.dvdfab.org', 
346 |      '.streamfab.com', '.streamfab.jp', '.streamfab.de', '.streamfab.fr', 
347 |      '.streamfab.tw']
348 |     logger.info("Start get elk data ...")
349 |     try:
350 |         data_dic = get_elk_data(host_list)
351 |     except Exception as e:
352 |         try:
353 |             logger.error("Get elk data exception: {}".format(e))
354 |         finally:
355 |             e = None
356 |             del e
357 | 
358 |     logger.info("Start save elk data ...")
359 |     file = sys.argv[1]
360 |     save_elk_data(data_dic, file)
361 |     logger.info("End")
362 | 


--------------------------------------------------------------------------------
/get_season_urls.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | 
 4 | import re, sys, string
 5 | import bs4
 6 | 
 7 | webpage = '''
 8 | <html><head><title>The Website Title</title></head>
 9 | <body>
10 | <p>Download my <strong>Python</strong> book from <a href="http://inventwithpython.com">my website</a>.</p>
11 | <p class="slogan">Learn Python the easy way!</p>
12 | <p>By <span id="author">Al Sweigart</span></p>
13 | </body></html>
14 | '''
15 | 
16 | def createSrt_vtt():
17 |     exampleSoup = bs4.BeautifulSoup(webpage,'html5lib')
18 |     elems = exampleSoup.select('#author')
19 |     type(elems)
20 |     # print (elems[0].getText())
21 |     # print(webpage)


--------------------------------------------------------------------------------
/hbomax_meta_crawler.js:
--------------------------------------------------------------------------------
  1 | 
  2 | function get_seasons_info(json_data, id) {
  3 | 	var json_obj = JSON.parse(json_data)
  4 | 
  5 | 	var season_num = 0;
  6 | 	for (var i = 0; i < json_obj.length; i++) {
  7 | 		var e_src = json_obj[i].body;
  8 | 
  9 | 		if (e_src.seasonNumber !== undefined) {
 10 | 			if (e_src.seasonNumber > season_num) {
 11 | 				season_num = e_src.seasonNumber
 12 | 			}
 13 | 		}
 14 | 		// 解决有些单 season 不能获取 tv 的问题
 15 | 		// https://play.hbomax.com/page/urn:hbo:page:GVU2_3QlhmYNJjhsJAWUZ:type:series
 16 | 		if (e_src.numberInSeries !== undefined) {
 17 | 			season_num = 1
 18 | 		}
 19 | 	}
 20 | 
 21 | 	src_seasons = new Array()
 22 | 	for (var i = 0; i < season_num; i++) {
 23 | 		obj = {
 24 | 			"id": i + 1,
 25 | 			"title": "Season " + String(i + 1),
 26 | 			"release_time": "",
 27 | 			"episodes": []
 28 | 		}
 29 | 		src_seasons.push(obj)
 30 | 	}
 31 | 
 32 | 	var all_episodes = new Array()
 33 | 	for (var i = 0; i < json_obj.length; i++) {
 34 | 
 35 | 		var e_src = json_obj[i].body;
 36 | 		if (e_src.seasonNumber === undefined && e_src.numberInSeries === undefined) {
 37 | 			continue
 38 | 		}
 39 | 		thumbs = e_src.images.tileburnedin
 40 | 		if (thumbs.indexOf("&size") != -1) {
 41 | 			thumbs = thumbs.split('&size')[0]
 42 | 		}
 43 | 
 44 | 		var e_des = {
 45 | 			"seq": e_src.numberInSeason,
 46 | 			"seasonNumber": e_src.seasonNumber,
 47 | 			"episodeId": String(e_src.numberInSeason),
 48 | 			"runtime": e_src.duration,
 49 | 			"url": 'https://play.hbomax.com/episode/' + e_src.references.viewable,
 50 | 			"synopsis": e_src.titles.full,
 51 | 			"sortInt": e_src.numberInSeason + e_src.seasonNumber,
 52 | 			"title": e_src.titles.full,
 53 | 			"thumbs": {
 54 | 				"url": thumbs
 55 | 			}
 56 | 		}
 57 | 		if (e_src.numberInSeries !== undefined) {
 58 | 			e_des['seasonNumber'] = 1
 59 | 			e_des['seq'] = e_src.numberInSeries
 60 | 			e_des['episodeId'] = e_src.numberInSeries
 61 | 		}
 62 | 		all_episodes.push(e_des)
 63 | 	}
 64 | 
 65 | 
 66 | 	var sort_all_episodes = all_episodes.sort(function (a, b) {
 67 | 		return (a.sortInt - b.sortInt);
 68 | 	})
 69 | 
 70 | 	for (var i = 0; i < sort_all_episodes.length; i++) {
 71 | 		e = sort_all_episodes[i]
 72 | 		src_seasons[e.seasonNumber - 1].episodes.push(e)
 73 | 	}
 74 | 
 75 | 	var result = {
 76 | 		"seasons": src_seasons
 77 | 	}
 78 | 	src_seasons_str = JSON.stringify(result)
 79 | 	return src_seasons_str
 80 | }
 81 | 
 82 | function get_meta_info(json_data, id) {
 83 | 	if (JSON.parse(json_data)[1].body.details === undefined) {
 84 | 		json_obj = JSON.parse(json_data)[0]
 85 | 		thumbs = json_obj.body.images.tileburnedin
 86 | 		if (thumbs.indexOf("&size") != -1) {
 87 | 			thumbs = thumbs.split('&size')[0]
 88 | 		}
 89 | 
 90 | 		all_obj = {
 91 | 			"id": json_obj.id,
 92 | 			"title": json_obj.body.titles.full,
 93 | 			"runtime": json_obj.body.duration,
 94 | 			"release_time": json_obj.body.releaseYear,
 95 | 			"synopsis": json_obj.body.summaries.full,
 96 | 			"thumbs": thumbs,
 97 | 			"directors": [],
 98 | 			"casts": [],
 99 | 			"writers": [],
100 | 			"genres": [json_obj.body.ratingCode],
101 | 			"moodTags": []
102 | 		}
103 | 		return JSON.stringify(all_obj)
104 | 	} else {
105 | 		json_obj = JSON.parse(json_data)[1]
106 | 		thumbs = json_obj.body.details.image.uri
107 | 		if (thumbs.indexOf("&size") != -1) {
108 | 			thumbs = thumbs.split('&size')[0]
109 | 		}
110 | 
111 | 		all_obj = {
112 | 			"id": id,
113 | 			"title": json_obj.body.details.title,
114 | 			"runtime": "",
115 | 			"release_time": "",
116 | 			"synopsis": json_obj.body.details.description,
117 | 			"thumbs": thumbs,
118 | 			"directors": [],
119 | 			"casts": [],
120 | 			"writers": [],
121 | 			"genres": [],
122 | 			"moodTags": []
123 | 		}
124 | 		return JSON.stringify(all_obj)
125 | 	}
126 | }


--------------------------------------------------------------------------------
/log.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import logging
 3 | import datetime
 4 | from pathlib import Path
 5 | 
 6 | 
 7 | def setup_logger(name: str, write_to_file: bool = False) -> logging.Logger:
 8 |     formatter = logging.Formatter('%(asctime)s %(name)s %(filename)s %(lineno)s : %(levelname)s  %(message)s')
 9 |     log_time = datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S")
10 |     if getattr(sys, 'frozen', False):
11 |         log_folder_path = Path(sys.executable).parent / 'logs'
12 |     else:
13 |         log_folder_path = Path(__file__).parent.parent / 'logs'
14 |     if log_folder_path.exists() is False:
15 |         log_folder_path.mkdir()
16 | 
17 |     ch = logging.StreamHandler()
18 |     ch.setLevel(logging.DEBUG)
19 |     ch.setFormatter(formatter)
20 |     lt = logging.getLogger(f'{name}')
21 |     lt.setLevel(logging.DEBUG)
22 |     lt.addHandler(ch)
23 |     if write_to_file:
24 |         log_file_path = log_folder_path / f'{name}-{log_time}.log'
25 |         fh = logging.FileHandler(log_file_path.resolve().as_posix(), encoding='utf-8')
26 |         fh.setLevel(logging.DEBUG)
27 |         fh.setFormatter(formatter)
28 |         lt.addHandler(fh)
29 |         lt.info(f'log file -> {log_file_path}')
30 |     return lt
31 | 
32 | 
33 | log = setup_logger('pyshaka')


--------------------------------------------------------------------------------
/netflix_meta_crawler.js:
--------------------------------------------------------------------------------
 1 | function get_seasons_info(json_data, id) {
 2 |     let json_obj = JSON.parse(json_data)
 3 |     let seasons_src = json_obj.video.seasons
 4 |     let seasons_des = new Array()
 5 | 
 6 |     for (let index = 0; index < seasons_src.length; index++) {
 7 |         const s_src = seasons_src[index];
 8 |         let episodes_des = new Array()
 9 |         for (let j = 0; j < seasons_src[index].episodes.length; j++) {
10 |             const e_src = seasons_src[index].episodes[j];
11 |             let e_des = {
12 |                 "seq": e_src.seq,
13 |                 "episodeId": String(e_src.episodeId),
14 |                 "url": 'https://www.netflix.com/watch/' + String(e_src.episodeId),
15 |                 "synopsis": e_src.synopsis,
16 |                 "title": e_src.title,
17 | 				"runtime": e_src.runtime,
18 |                 "thumbs": {
19 |                     "url": e_src.thumbs[0].url
20 |                 }
21 |             }
22 |             episodes_des.push(e_des)
23 |         }
24 |         let s_des = {
25 |             "id": String(s_src.id),
26 |             "release_time": String(s_src.year),
27 |             "episodes": episodes_des,
28 |             "seq": s_src.seq
29 |         }
30 | 
31 |         seasons_des.push(s_des)
32 |     }
33 |     let all_obj = {
34 |         "seasons": seasons_des
35 |     }
36 |     all_json = JSON.stringify(all_obj)
37 |     return all_json
38 | }
39 | 
40 | function get_meta_info(json_data, id) {
41 | 
42 |     let json_obj = JSON.parse(json_data)
43 |     let video = json_obj.video
44 |     let thumbs_url = video.artwork[0].url;
45 |     let thumbs = {
46 |         'url': thumbs_url
47 |     }
48 | 
49 |     all_obj = {
50 |         "id": video.currentEpisode,
51 | 		"type": video.type,
52 |         "title": video.title,
53 |         "runtime": video.runtime,
54 |         "release_time": video.year,
55 |         "synopsis": video.synopsis,
56 |         "thumbs": thumbs,
57 |         "directors": [],
58 |         "casts": [],
59 |         "writers": [],
60 |         "genres": [video.rating],
61 |         "moodTags": []
62 |     }
63 |     all_json = JSON.stringify(all_obj)
64 |     return all_json
65 | }


--------------------------------------------------------------------------------
/paramount_subtitle_trans.py:
--------------------------------------------------------------------------------
  1 | from typing import List
  2 | from pathlib import Path
  3 | from datetime import datetime
  4 | from argparse import ArgumentParser
  5 | 
  6 | from util.TextParser import TimeContext
  7 | from text.Mp4VttParser import Mp4VttParser
  8 | from text.Mp4TtmlParser import Mp4TtmlParser
  9 | from text.Cue import Cue
 10 | from log import log
 11 | 
 12 | 
 13 | class CmdArgs:
 14 |     def __init__(self):
 15 |         self.debug = None # type: bool
 16 |         self.type = None # type: str
 17 |         self.timescale = None # type: int
 18 |         self.init_path = None # type: str
 19 |         self.segments_path = None # type: str
 20 |         self.segment_time = None # type: float
 21 | 
 22 | 
 23 | def command_handler(args: CmdArgs):
 24 |     '''
 25 |     对命令参数进行校验和修正
 26 |     '''
 27 |     assert args.type in ['wvtt', 'ttml'], f'not support {args.type} now'
 28 |     args.timescale = int(args.timescale)
 29 |     if args.init_path:
 30 |         args.init_path = args.init_path.strip()
 31 |     args.segments_path = args.segments_path.strip()
 32 |     args.segment_time = float(args.segment_time)
 33 | 
 34 | 
 35 | def loop_nestedCues(lines: List[str], nestedCues: List[Cue], index: int, segment_time: float):
 36 |     payload = ''
 37 |     for cue in nestedCues:
 38 |         if len(cue.nestedCues) > 0:
 39 |             loop_nestedCues(lines, cue.nestedCues, index, segment_time)
 40 |         if cue.payload != '':
 41 |             if payload == '':
 42 |                 payload = cue.payload
 43 |             else:
 44 |                 payload = f'{payload} {cue.payload}'
 45 |         # 这里突然想不起注释掉的原因了 好像是会重复...
 46 |         # lines.append(cue)
 47 |     cue = nestedCues[0]
 48 |     payload = payload
 49 |     if payload != '':
 50 |         cue.payload = payload
 51 |         cue.startTime += segment_time * index
 52 |         cue.endTime += segment_time * index
 53 |         lines.append(cue)
 54 | 
 55 | 
 56 | def compare(cue: Cue):
 57 |     return cue.startTime
 58 | 
 59 | 
 60 | # def compare(cue1: Cue, cue2: Cue):
 61 | #     if cue1.startTime < cue2.startTime:
 62 | #         return -1
 63 | #     if cue1.startTime > cue2.startTime:
 64 | #         return 1
 65 | #     return 0
 66 | 
 67 | 
 68 | def gentm(tm: float):
 69 |     return datetime.utcfromtimestamp(tm).strftime('%H:%M:%S.%f')[:-3]
 70 | 
 71 | 
 72 | def test_parse_mp4vtt():
 73 |     mp4vttparser = Mp4VttParser()
 74 |     vttInitSegment = Path("test/assets/vtt-init.mp4").read_bytes()
 75 |     mp4vttparser.parseInit(vttInitSegment)
 76 |     vttSegment = Path("test/assets/vtt-segment.mp4").read_bytes()
 77 |     timecontext = TimeContext(**{'periodStart': 0, 'segmentStart': 0, 'segmentEnd': 0})
 78 |     mp4vttparser.parseMedia(vttSegment, timecontext)
 79 | 
 80 | 
 81 | def parse(args: CmdArgs):
 82 |     if args.type == 'wvtt':
 83 |         parser = Mp4VttParser()
 84 |     elif args.type == 'ttml':
 85 |         parser = Mp4TtmlParser()
 86 |     else:
 87 |         assert 1 == 0, 'never should be here'
 88 |     if args.init_path:
 89 |         init_path = Path(args.init_path)
 90 |         parser.parseInit(init_path.read_bytes())
 91 |     else:
 92 |         parser.set_timescale(args.timescale)
 93 |     segments_path = Path(args.segments_path)
 94 |     time = TimeContext(**{'periodStart': 0, 'segmentStart': 0, 'segmentEnd': 0})
 95 |     index = 0
 96 |     cues = []
 97 |     for segment_path in segments_path.iterdir():
 98 |         if segment_path.is_dir():
 99 |             if args.debug:
100 |                 log.debug(f'{segment_path} is not a file, skip it')
101 |             continue
102 |         if segment_path.suffix not in ['.mp4', '.m4s', '.dash', '.ts']:
103 |             if args.debug:
104 |                 log.debug(f"{segment_path} suffix is not in ['.mp4', '.m4s', '.dash', '.ts'], skip it")
105 |             continue
106 |         if args.init_path and segment_path.name == init_path.name:
107 |             if args.debug:
108 |                 log.debug(f"{segment_path} is init_path , skip it")
109 |             continue
110 |         if args.debug:
111 |             log.debug(f'start parseMedia for {segment_path}')
112 |         _cues = parser.parseMedia(segment_path.read_bytes(), time)
113 | 
114 |         for cue in _cues:
115 |             cue.file = segment_path.name
116 |             if len(cue.nestedCues) > 0:
117 |                 loop_nestedCues(cues, cue.nestedCues, index, args.segment_time)
118 |             if cue.payload != '':
119 |                 cue.startTime += args.segment_time * index
120 |                 cue.endTime += args.segment_time * index
121 |                 cues.append(cue)
122 |         index += 1
123 |     # 按Cue.startTime从小到大排序
124 |     cues.sort(key=compare)
125 |     if args.debug:
126 |         log.debug(f'cues count {len(cues)}')
127 |     assert len(cues) > 0, 'ohh, it is a bug...'
128 |     # 去重
129 |     # 1. 如果当前行的endTime等于下一行的startTime 并且下一行内容与当前行相同 取下一行的endTime作为当前行的endTime 然后去除下一行
130 |     # 2. 否则将下一行作为当前行 再次进行比较 直到比较结束
131 |     offset = 0
132 |     cues_fix = [] # type: List[Cue]
133 |     cue = cues[offset]
134 |     while offset < len(cues) - 1:
135 |         offset += 1
136 |         # 跳过空的行
137 |         next_cue = cues[offset]
138 |         if cue.payload == '':
139 |             cue = next_cue
140 |             continue
141 |         if cue.payload == next_cue.payload and cue.endTime == next_cue.startTime:
142 |             cue.endTime = next_cue.endTime
143 |         else:
144 |             cues_fix.append(cue)
145 |             cue = next_cue
146 |     # 最后一行也不能掉
147 |     next_cue = cues[offset]
148 |     if cue.payload == next_cue.payload and cue.endTime == next_cue.startTime:
149 |         cue.endTime = next_cue.endTime
150 |     else:
151 |         cues_fix.append(cue)
152 |         cue = next_cue
153 |     if args.debug:
154 |         log.debug(f'after reduce duplicated lines, now lines count is {len(cues_fix)}')
155 |     # 先用列表放内容 最后join
156 |     contents = ["WEBVTT"] # type: List[str]
157 |     for cue in cues_fix:
158 |         settings = cue._settings
159 |         if settings != '':
160 |             settings = ' ' + settings
161 |         contents.append(f'{gentm(cue.startTime)} --> {gentm(cue.endTime)}{settings}\n{cue.payload}')
162 |     content = '\n\n'.join(contents)
163 |     segments_path.with_suffix(".vtt").write_text(content, encoding='utf-8')
164 |     log.info(f'{len(cues_fix)} lines of subtitle was founded. (*^▽^*)')
165 |     log.info(f'write to {segments_path.with_suffix(".vtt").resolve()}')
166 | 
167 | 
168 | def transVtt(infile , inpath):
169 |     parser = Mp4VttParser()
170 |     init_path = Path(infile)
171 |     parser.parseInit(init_path.read_bytes())
172 |     segments_path = Path(inpath)
173 |     
174 |     time = TimeContext(**{'periodStart': 0, 'segmentStart': 0, 'segmentEnd': 0})
175 |     index = 0
176 |     cues = []
177 |     for segment_path in segments_path.iterdir():
178 |         if segment_path.is_dir():
179 |             continue
180 |         if segment_path.suffix not in ['.mp4', '.m4s', '.dash', '.ts']:
181 |             continue
182 |         _cues = parser.parseMedia(segment_path.read_bytes(), time)
183 | 
184 |         for cue in _cues:
185 |             segment_time = 0
186 |             cue.file = segment_path.name
187 |             if len(cue.nestedCues) > 0:
188 |                 loop_nestedCues(cues, cue.nestedCues, index, segment_time)
189 |             if cue.payload != '':
190 |                 cue.startTime += segment_time * index
191 |                 cue.endTime += segment_time * index
192 |                 cues.append(cue)
193 |         index += 1
194 |     # 按Cue.startTime从小到大排序
195 |     cues.sort(key=compare)
196 |     assert len(cues) > 0, 'ohh, it is a bug...'
197 |     # 去重
198 |     # 1. 如果当前行的endTime等于下一行的startTime 并且下一行内容与当前行相同 取下一行的endTime作为当前行的endTime 然后去除下一行
199 |     # 2. 否则将下一行作为当前行 再次进行比较 直到比较结束
200 |     offset = 0
201 |     cues_fix = [] # type: List[Cue]
202 |     cue = cues[offset]
203 |     while offset < len(cues) - 1:
204 |         offset += 1
205 |         # 跳过空的行
206 |         next_cue = cues[offset]
207 |         if cue.payload == '':
208 |             cue = next_cue
209 |             continue
210 |         if cue.payload == next_cue.payload and cue.endTime == next_cue.startTime:
211 |             cue.endTime = next_cue.endTime
212 |         else:
213 |             cues_fix.append(cue)
214 |             cue = next_cue
215 |     # 最后一行也不能掉
216 |     next_cue = cues[offset]
217 |     if cue.payload == next_cue.payload and cue.endTime == next_cue.startTime:
218 |         cue.endTime = next_cue.endTime
219 |     else:
220 |         cues_fix.append(cue)
221 |         cue = next_cue
222 |     # 先用列表放内容 最后join
223 |     contents = ["WEBVTT"] # type: List[str]
224 |     for cue in cues_fix:
225 |         settings = cue._settings
226 |         if settings != '':
227 |             settings = ' ' + settings
228 |         contents.append(f'{gentm(cue.startTime)} --> {gentm(cue.endTime)}{settings}\n{cue.payload}')
229 |     content = '\n\n'.join(contents)
230 |     segments_path.with_suffix(".vtt").write_text(content, encoding='utf-8')
231 |     log.info(f'{len(cues_fix)} lines of subtitle was founded. (*^▽^*)')
232 |     log.info(f'write to {segments_path.with_suffix(".vtt").resolve()}')
233 | 
234 | 
235 | def main():
236 | 
237 |     parser = ArgumentParser(
238 |         prog='dash-subtitle-extractor',
239 |         usage='python -m main [OPTION]...',
240 |         description='A tool that to parse subtitle embedded in DASH stream',
241 |         add_help=True,
242 |     )
243 |     parser.add_argument('-debug', '--debug', action='store_true', help='debug is needed')
244 |     parser.add_argument('-type', '--type', choices=['wvtt', 'ttml'], help='subtitle codec, only support wvtt and ttml now')
245 |     parser.add_argument('-timescale', '--timescale', default='1000', help='set timescale manually if no init segment')
246 |     parser.add_argument('-init-path', '--init-path', help='init segment path')
247 |     parser.add_argument('-segments-path', '--segments-path', help='segments folder path')
248 |     parser.add_argument('-segment-time', '--segment-time', default='0', help='single segment duration, usually needed for ttml content, calculation method: d / timescale')
249 |     args = parser.parse_args() # type: CmdArgs
250 |     command_handler(args)
251 |     parse(args)
252 |     # python -m main --init-path "test/dashvtt_subtitle_WVTT_zh-TW/init.mp4" --segments-path "test/dashvtt_subtitle_WVTT_zh-TW"
253 | 
254 | 
255 | if __name__ == '__main__':
256 |     main()


--------------------------------------------------------------------------------
/srtConvert.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import codecs
  3 | import math
  4 | import os
  5 | import re
  6 | 
  7 | 
  8 | SUPPORTED_EXTENSIONS = [".xml", ".vtt"]
  9 | 
 10 | 
 11 | def leading_zeros(value, digits=2):
 12 |     value = "000000" + str(value)
 13 |     return value[-digits:]
 14 | 
 15 | 
 16 | def convert_time(raw_time):
 17 |     if int(raw_time) == 0:
 18 |         return "{}:{}:{},{}".format(0, 0, 0, 0)
 19 | 
 20 |     ms = '000'
 21 |     if len(raw_time) > 4:
 22 |         ms = leading_zeros(int(raw_time[:-4]) % 1000, 3)
 23 |     time_in_seconds = int(raw_time[:-7]) if len(raw_time) > 7 else 0
 24 |     second = leading_zeros(time_in_seconds % 60)
 25 |     minute = leading_zeros(int(math.floor(time_in_seconds / 60)) % 60)
 26 |     hour = leading_zeros(int(math.floor(time_in_seconds / 3600)))
 27 |     return "{}:{}:{},{}".format(hour, minute, second, ms)
 28 | 
 29 | 
 30 | def xml_id_display_align_before(text):
 31 |     """
 32 |     displayAlign="before" means the current sub will be displayed on top.
 33 |     That is and not at bottom. We check what's the xml:id associated to it
 34 |     to have an {\an8} position tag in the output file.
 35 |     """
 36 |     align_before_re = re.compile(u'<region.*tts:displayAlign=\"before\".*xml:id=\"(.*)\"/>')
 37 |     has_align_before = re.search(align_before_re, text)
 38 |     if has_align_before:
 39 |         return has_align_before.group(1)
 40 |     return u""
 41 | 
 42 | 
 43 | def xml_get_cursive_style_ids(text):
 44 |     style_section = re.search("<styling>(.*)</styling>", text, flags=re.DOTALL)
 45 |     if not style_section:
 46 |         return []
 47 |     style_ids_re = re.compile(
 48 |         '<style.* tts:fontStyle="italic".* xml:id=\"([a-zA-Z0-9_.]+)\"')
 49 |     return [re.search(style_ids_re, line).groups()[0]
 50 |             for line in style_section.group().split("\n")
 51 |             if re.search(style_ids_re, line)]
 52 | 
 53 | 
 54 | def to_srt(text, extension):
 55 |     if extension.lower() == ".xml":
 56 |         return xml_to_srt(text)
 57 |     if extension.lower() == ".vtt":
 58 |         return vtt_to_srt(text)
 59 | 
 60 | 
 61 | def convert_vtt_time(line):
 62 |     times = line.replace(".", ",").split(" --> ")
 63 |     if len(times[0]) == 9:
 64 |         times = ["00:" + t for t in times]
 65 |     return "{} --> {}".format(times[0], times[1].split(" ")[0])
 66 | 
 67 | 
 68 | def vtt_to_srt(text):
 69 |     if not text.startswith(u"\ufeffWEBVTT") and not text.startswith(u"WEBVTT"):
 70 |         raise Exception(".vtt format must start with WEBVTT, wrong file?")
 71 | 
 72 |     lines = []
 73 |     current_sub_line = []
 74 |     for line in text.split("\n"):
 75 |         if current_sub_line:
 76 |             current_sub_line.append(line)
 77 |             if not line:
 78 |                 lines.append("\n".join(current_sub_line) + "\n")
 79 |                 current_sub_line = []
 80 | 
 81 |         elif " --> " in line:
 82 |             current_sub_line = [convert_vtt_time(line)]
 83 |     if current_sub_line:
 84 |         lines.append("\n".join(current_sub_line))
 85 | 
 86 |     return "".join((u"{}\n{}".format(i, l) for i, l in enumerate(lines, 1)))
 87 | 
 88 | 
 89 | def xml_to_srt(text):
 90 |     def append_subs(start, end, prev_content, format_time):
 91 |         subs.append({
 92 |             "start_time": convert_time(start) if format_time else start,
 93 |             "end_time": convert_time(end) if format_time else end,
 94 |             "content": u"\n".join(prev_content),
 95 |         })
 96 | 
 97 |     display_align_before = xml_id_display_align_before(text)
 98 |     begin_re = re.compile(u"\s*<p begin=")
 99 |     sub_lines = (l for l in text.split("\n") if re.search(begin_re, l))
100 |     subs = []
101 |     prev_time = {"start": 0, "end": 0}
102 |     prev_content = []
103 |     start = end = ''
104 |     start_re = re.compile(u'begin\="([0-9:\.]*)')
105 |     end_re = re.compile(u'end\="([0-9:\.]*)')
106 |     content_re = re.compile(u'\">(.*)</p>')
107 | 
108 |     # some span tags are used for italics, we'll replace them by <i> and </i>,
109 |     # which is the standard for .srt files. We ignore all other uses.
110 |     cursive_ids = xml_get_cursive_style_ids(text)
111 |     span_start_re = re.compile(u'(<span style=\"[a-zA-Z0-9_.]+\">)+')
112 |     span_id_re = re.compile(u'(<span style=\"([a-zA-Z0-9_.]+)\">)+')
113 |     span_end_re = re.compile(u'(</span>)+')
114 |     br_re = re.compile(u'(<br\s*\/?>)+')
115 |     fmt_t = True
116 |     for s in sub_lines:
117 |         span_start_tags = re.search(span_start_re, s)
118 |         srt_cursive = u""
119 |         if span_start_tags:
120 |             span_id = re.search(span_id_re, s)
121 |             srt_cursive = u"<i>" if span_id.groups()[1] in cursive_ids else u""
122 |             s = srt_cursive.join(s.split(span_start_tags.groups()[0]))
123 | 
124 |         string_region_re = r'<p(.*region="' + display_align_before + r'".*")>(.*)</p>'
125 |         s = re.sub(string_region_re, r'<p\1>{\\an8}\2</p>', s)
126 |         content = re.search(content_re, s).group(1)
127 | 
128 |         br_tags = re.search(br_re, content)
129 |         if br_tags:
130 |             content = u"\n".join(content.split(br_tags.group()))
131 | 
132 |         span_end_tags = re.search(span_end_re, content)
133 |         if span_end_tags:
134 |             srt_cursive = u"</i>" if srt_cursive else u""
135 |             content = srt_cursive.join(content.split(span_end_tags.group()))
136 | 
137 |         prev_start = prev_time["start"]
138 |         start = re.search(start_re, s).group(1)
139 |         end = re.search(end_re, s).group(1)
140 |         if len(start.split(":")) > 1:
141 |             fmt_t = False
142 |             start = start.replace(".", ",")
143 |             end = end.replace(".", ",")
144 |         if (prev_start == start and prev_time["end"] == end) or not prev_start:
145 |             # Fix for multiple lines starting at the same time
146 |             prev_time = {"start": start, "end": end}
147 |             prev_content.append(content)
148 |             continue
149 |         append_subs(prev_time["start"], prev_time["end"], prev_content, fmt_t)
150 |         prev_time = {"start": start, "end": end}
151 |         prev_content = [content]
152 |     append_subs(start, end, prev_content, fmt_t)
153 | 
154 |     lines = (u"{}\n{} --> {}\n{}\n".format(
155 |         s + 1, subs[s]["start_time"], subs[s]["end_time"], subs[s]["content"])
156 |         for s in range(len(subs)))
157 |     return u"\n".join(lines)
158 | 
159 | def createSrt_xml(inPath, outPath):
160 |     with codecs.open(inPath, 'rb', "utf-8") as f:
161 |         text = f.read()
162 |     with codecs.open(outPath, 'wb', "utf-8") as f:
163 |         f.write(to_srt(text, ".xml"))
164 | 		
165 | def createSrt_vtt(inPath, outPath):
166 |     with codecs.open(inPath, 'rb', "utf-8") as f:
167 |         text = f.read()
168 |     with codecs.open(outPath, 'wb', "utf-8") as f:
169 |         f.write(to_srt(text, ".vtt"))
170 | 
171 | 
172 | if __name__ == '__main__':
173 |     main()
174 | 


--------------------------------------------------------------------------------
/text/Cue.py:
--------------------------------------------------------------------------------
  1 | from enum import Enum
  2 | 
  3 | 
  4 | class positionAlign(Enum):
  5 |     LEFT = 'line-left'
  6 |     RIGHT = 'line-right'
  7 |     CENTER = 'center'
  8 |     AUTO = 'auto'
  9 | 
 10 | 
 11 | class textAlign(Enum):
 12 |     LEFT = 'left'
 13 |     RIGHT = 'right'
 14 |     CENTER = 'center'
 15 |     START = 'start'
 16 |     END = 'end'
 17 | 
 18 | 
 19 | class displayAlign(Enum):
 20 |     BEFORE = 'before'
 21 |     CENTER = 'center'
 22 |     AFTER = 'after'
 23 | 
 24 | 
 25 | class direction(Enum):
 26 |     HORIZONTAL_LEFT_TO_RIGHT = 'ltr'
 27 |     HORIZONTAL_RIGHT_TO_LEFT = 'rtl'
 28 | 
 29 | 
 30 | class writingMode(Enum):
 31 |     HORIZONTAL_TOP_TO_BOTTOM = 'horizontal-tb'
 32 |     VERTICAL_LEFT_TO_RIGHT = 'vertical-lr'
 33 |     VERTICAL_RIGHT_TO_LEFT = 'vertical-rl'
 34 | 
 35 | 
 36 | class lineInterpretation(Enum):
 37 |     LINE_NUMBER = 0
 38 |     PERCENTAGE = 1
 39 | 
 40 | 
 41 | class lineAlign(Enum):
 42 |     CENTER = 'center'
 43 |     START = 'start'
 44 |     END = 'end'
 45 | 
 46 | 
 47 | class defaultTextColor(Enum):
 48 |     white = '#FFF'
 49 |     lime = '#0F0'
 50 |     cyan = '#0FF'
 51 |     red = '#F00'
 52 |     yellow = '#FF0'
 53 |     magenta = '#F0F'
 54 |     blue = '#00F'
 55 |     black = '#000'
 56 | 
 57 | 
 58 | class defaultTextBackgroundColor(Enum):
 59 |     bg_white = '#FFF'
 60 |     bg_lime = '#0F0'
 61 |     bg_cyan = '#0FF'
 62 |     bg_red = '#F00'
 63 |     bg_yellow = '#FF0'
 64 |     bg_magenta = '#F0F'
 65 |     bg_blue = '#00F'
 66 |     bg_black = '#000'
 67 | 
 68 | 
 69 | class fontWeight(Enum):
 70 |     NORMAL = 400
 71 |     BOLD = 700
 72 | 
 73 | 
 74 | class fontStyle(Enum):
 75 |     NORMAL = 'normal'
 76 |     ITALIC = 'italic'
 77 |     OBLIQUE = 'oblique'
 78 | 
 79 | 
 80 | class textDecoration(Enum):
 81 |     UNDERLINE = 'underline'
 82 |     LINE_THROUGH = 'lineThrough'
 83 |     OVERLINE = 'overline'
 84 | 
 85 | 
 86 | class Cue:
 87 | 
 88 |     def __init__(self, startTime: float, endTime: float, payload: str, _settings: str = ''):
 89 |         self.startTime = startTime
 90 |         self.direction = direction.HORIZONTAL_LEFT_TO_RIGHT
 91 |         self.endTime = endTime
 92 |         self.payload = payload
 93 |         self.region = CueRegion()
 94 |         self.position = None
 95 |         self.positionAlign = positionAlign.AUTO
 96 |         self.size = 0
 97 |         self.textAlign = textAlign.CENTER
 98 |         self.writingMode = writingMode.HORIZONTAL_TOP_TO_BOTTOM
 99 |         self.lineInterpretation = lineInterpretation.LINE_NUMBER
100 |         self.line = None
101 |         self.lineHeight = ''
102 |         self.lineAlign = lineAlign.START
103 |         self.displayAlign = displayAlign.AFTER
104 |         self.color = ''
105 |         self.backgroundColor = ''
106 |         self.backgroundImage = ''
107 |         self.border = ''
108 |         self.fontSize = ''
109 |         self.fontWeight = fontWeight.NORMAL
110 |         self.fontStyle = fontStyle.NORMAL
111 |         self.fontFamily = ''
112 |         self.letterSpacing = ''
113 |         self.linePadding = ''
114 |         self.opacity = 1
115 |         self.textDecoration = []
116 |         self.wrapLine = True
117 |         self.id = ''
118 |         self.nestedCues = []
119 |         self.lineBreak = False
120 |         self.spacer = False
121 |         self.cellResolution = {'columns': 32, 'rows': 15}
122 |         self._settings = _settings
123 | 
124 |     @staticmethod
125 |     def lineBreak(start: float, end: float) -> 'Cue':
126 |         cue = Cue(start, end, '')
127 |         cue.lineBreak = True
128 |         return cue
129 | 
130 |     def clone(self):
131 |         cue = Cue(0, 0, '')
132 |         for k, v in self.__dict__.items():
133 |             if isinstance(v, list):
134 |                 v = v.copy()
135 |             cue.__setattr__(k, v)
136 |         return cue
137 | 
138 |     @staticmethod
139 |     def equal(cue1: 'Cue', cue2: 'Cue') -> bool:
140 |         if cue1.startTime != cue2.startTime or cue1.endTime != cue2.endTime or cue1.payload != cue2.payload:
141 |             return False
142 |         for k, v in cue1.__dict__.items():
143 |             if k == 'startTime' or k == 'endTime' or k == 'payload':
144 |                 pass
145 |             elif k == 'nestedCues':
146 |                 if not Cue.equal(cue1.nestedCues, cue2.nestedCues):
147 |                     return False
148 |             elif k == 'region' or k == 'cellResolution':
149 |                 for k2 in cue1.__getattribute__(k):
150 |                     if cue1.__getattribute__(k)[k2] != cue2.__getattribute__(k)[k2]:
151 |                         return False
152 |             elif isinstance(cue1.__getattribute__(k), list):
153 |                 if cue1.__getattribute__(k) != cue2.__getattribute__(k):
154 |                     return False
155 |             else:
156 |                 if cue1.__getattribute__(k) != cue1.__getattribute__(k):
157 |                     return False
158 |         return True
159 | 
160 | 
161 | class units(Enum):
162 |     PX = 0
163 |     PERCENTAGE = 1
164 |     LINES = 2
165 | 
166 | 
167 | class scrollMode(Enum):
168 |     NONE = ''
169 |     UP = 'up'
170 | 
171 | 
172 | class CueRegion:
173 | 
174 |     def __init__(self, **kwargs):
175 |         self.id = ''
176 |         self.viewportAnchorX = 0
177 |         self.viewportAnchorY = 0
178 |         self.regionAnchorX = 0
179 |         self.regionAnchorY = 0
180 |         self.width = 100
181 |         self.height = 100
182 |         self.heightUnits = units.PERCENTAGE
183 |         self.widthUnits = units.PERCENTAGE
184 |         self.viewportAnchorUnits = units.PERCENTAGE
185 |         self.scroll = scrollMode.NONE
186 | 


--------------------------------------------------------------------------------
/text/Mp4TtmlParser.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | 
 3 | from text.Cue import Cue
 4 | from text.TtmlTextParser import TtmlTextParser
 5 | from util.Mp4Parser import Mp4Parser, ParsedBox
 6 | from util.exceptions import InvalidMp4TTML
 7 | from util.TextParser import TimeContext
 8 | 
 9 | 
10 | class Mp4TtmlParser:
11 | 
12 |     def __init__(self):
13 |         self.parser_ = TtmlTextParser()
14 | 
15 |     def set_timescale(self, timescale: int):
16 |         pass
17 | 
18 |     def parseInit(self, data: memoryview):
19 |         '''
20 |         这个函数不调用也没什么问题
21 |         '''
22 |         def stpp_callback(box: ParsedBox):
23 |             nonlocal sawSTPP
24 |             sawSTPP = True
25 |             box.parser.stop()
26 | 
27 |         sawSTPP = False
28 |         # 初始化解析器
29 |         mp4parser = Mp4Parser()
30 |         # 给要准备解析的box添加对应的解析函数 后面回调
31 |         mp4parser = mp4parser.box('moov', Mp4Parser.children)
32 |         mp4parser = mp4parser.box('trak', Mp4Parser.children)
33 |         mp4parser = mp4parser.box('mdia', Mp4Parser.children)
34 |         mp4parser = mp4parser.box('minf', Mp4Parser.children)
35 |         mp4parser = mp4parser.box('stbl', Mp4Parser.children)
36 |         mp4parser = mp4parser.fullBox('stsd', Mp4Parser.sampleDescription)
37 |         mp4parser = mp4parser.box('stpp', stpp_callback)
38 |         # 解析数据
39 |         mp4parser = mp4parser.parse(data)
40 | 
41 |         if not sawSTPP:
42 |             raise InvalidMp4TTML(f'is sawSTPP? {sawSTPP}')
43 | 
44 |     def parseMedia(self, data: memoryview, time: TimeContext, dont_raise: bool = True) -> List[Cue]:
45 | 
46 |         def mdat_callback(data: bytes):
47 |             nonlocal payload
48 |             nonlocal sawMDAT
49 |             sawMDAT = True
50 |             payload.extend(self.parser_.parseMedia(data, time))
51 | 
52 |         sawMDAT = False
53 |         payload = []
54 | 
55 |         mp4parser = Mp4Parser()
56 |         mp4parser = mp4parser.box('mdat', Mp4Parser.allData(mdat_callback))
57 |         mp4parser = mp4parser.parse(data, partialOkay=False)
58 | 
59 |         if not sawMDAT:
60 |             if dont_raise:
61 |                 return payload
62 |             else:
63 |                 raise InvalidMp4TTML(f'is sawMDAT? {sawMDAT}')
64 |         return payload


--------------------------------------------------------------------------------
/text/Mp4VttParser.py:
--------------------------------------------------------------------------------
  1 | from typing import List
  2 | from text.Cue import Cue
  3 | # from text.TextEngine import TextEngine
  4 | from text.VttTextParser import VttTextParser
  5 | from util.DataViewReader import DataViewReader, Endianness
  6 | # from util.Error import Error
  7 | # from util.Functional import Functional
  8 | from util.Mp4Parser import Mp4Parser, ParsedBox
  9 | from util.Mp4BoxParsers import Mp4BoxParsers, ParsedTRUNSample
 10 | # from util.StringUtils import StringUtils
 11 | # from util.TextParser import TextParser
 12 | from util.TextParser import TimeContext
 13 | from util.exceptions import InvalidMp4VTT
 14 | from log import log
 15 | 
 16 | 
 17 | class Mp4VttParser:
 18 | 
 19 |     def __init__(self):
 20 |         self.timescale_ = None # type: int
 21 | 
 22 |     def set_timescale(self, timescale: int):
 23 |         self.timescale_ = timescale
 24 | 
 25 |     def parseInit(self, data: memoryview):
 26 |         # log.info('parseInit start')
 27 | 
 28 |         def mdhd_callback(box: ParsedBox):
 29 |             assert box.version == 0 or box.version == 1, 'MDHD version can only be 0 or 1'
 30 |             parsedMDHDBox = Mp4BoxParsers.parseMDHD(box.reader, box.version)
 31 |             self.timescale_ = parsedMDHDBox.timescale
 32 | 
 33 |         def wvtt_callback(box: ParsedBox):
 34 |             nonlocal sawWVTT
 35 |             sawWVTT = True
 36 | 
 37 |         sawWVTT = False
 38 |         # 初始化解析器
 39 |         mp4parser = Mp4Parser()
 40 |         # 给要准备解析的box添加对应的解析函数 后面回调
 41 |         mp4parser = mp4parser.box('moov', Mp4Parser.children)
 42 |         mp4parser = mp4parser.box('trak', Mp4Parser.children)
 43 |         mp4parser = mp4parser.box('mdia', Mp4Parser.children)
 44 |         mp4parser = mp4parser.fullBox('mdhd', mdhd_callback)
 45 |         mp4parser = mp4parser.box('minf', Mp4Parser.children)
 46 |         mp4parser = mp4parser.box('stbl', Mp4Parser.children)
 47 |         mp4parser = mp4parser.fullBox('stsd', Mp4Parser.sampleDescription)
 48 |         mp4parser = mp4parser.box('wvtt', wvtt_callback)
 49 |         # 解析数据
 50 |         mp4parser = mp4parser.parse(data)
 51 | 
 52 |         if not self.timescale_:
 53 |             raise InvalidMp4VTT('Missing timescale for VTT content. It should be located in the MDHD.')
 54 | 
 55 |         if not sawWVTT:
 56 |             raise InvalidMp4VTT('A WVTT box should have been seen (a valid vtt init segment with no actual subtitles')
 57 | 
 58 |     def parseMedia(self, data: memoryview, time: TimeContext) -> List[Cue]:
 59 | 
 60 |         def tfdt_callback(box: ParsedBox):
 61 |             nonlocal baseTime
 62 |             nonlocal sawTFDT
 63 |             sawTFDT = True
 64 |             assert box.version == 0 or box.version == 1, 'TFDT version can only be 0 or 1'
 65 |             parsedTFDTBox = Mp4BoxParsers.parseTFDT(box.reader, box.version)
 66 |             baseTime = parsedTFDTBox.baseMediaDecodeTime
 67 | 
 68 |         def tfhd_callback(box: ParsedBox):
 69 |             nonlocal defaultDuration
 70 |             assert box.flags is not None, 'A TFHD box should have a valid flags value'
 71 |             parsedTFHDBox = Mp4BoxParsers.parseTFHD(box.reader, box.flags)
 72 |             defaultDuration = parsedTFHDBox.defaultSampleDuration
 73 | 
 74 |         def trun_callback(box: ParsedBox):
 75 |             nonlocal sawTRUN
 76 |             nonlocal presentations
 77 |             sawTRUN = True
 78 |             assert box.version is not None, 'A TRUN box should have a valid version value'
 79 |             assert box.version is not None, 'A TRUN box should have a valid flags value'
 80 |             parsedTRUNBox = Mp4BoxParsers.parseTRUN(box.reader, box.version, box.flags)
 81 |             presentations = parsedTRUNBox.sampleData
 82 | 
 83 |         def mdat_callback(data: bytes):
 84 |             nonlocal sawMDAT
 85 |             nonlocal rawPayload
 86 |             #assert not sawMDAT, 'VTT cues in mp4 with multiple MDAT are not currently supported'
 87 |             sawMDAT = True
 88 |             rawPayload = data
 89 | 
 90 |         if not self.timescale_:
 91 |             raise InvalidMp4VTT('No init segment for MP4+VTT!')
 92 | 
 93 |         baseTime = 0
 94 |         presentations = [] # type: List[ParsedTRUNSample]
 95 |         rawPayload = b'' # type: bytes
 96 |         cues = [] # type: List[Cue]
 97 | 
 98 |         sawTFDT = False
 99 |         sawTRUN = False
100 |         sawMDAT = False
101 |         defaultDuration = None
102 | 
103 |         mp4parser = Mp4Parser()
104 |         mp4parser = mp4parser.box('moof', Mp4Parser.children)
105 |         mp4parser = mp4parser.box('traf', Mp4Parser.children)
106 |         mp4parser = mp4parser.fullBox('tfdt', tfdt_callback)
107 |         mp4parser = mp4parser.fullBox('tfhd', tfhd_callback)
108 |         mp4parser = mp4parser.fullBox('trun', trun_callback)
109 |         mp4parser = mp4parser.box('mdat', Mp4Parser.allData(mdat_callback))
110 |         mp4parser = mp4parser.parse(data, partialOkay=False)
111 | 
112 |         if not sawMDAT and not sawTFDT and not sawTRUN:
113 |             raise InvalidMp4VTT(f'A required box is missing. Is saw: MDAT {sawMDAT} TFDT {sawTFDT} TRUN {sawTRUN}')
114 | 
115 |         currentTime = baseTime
116 | 
117 |         reader = DataViewReader(rawPayload, Endianness.BIG_ENDIAN)
118 |         for presentation in presentations:
119 |             duration = presentation.sampleDuration or defaultDuration
120 |             if presentation.sampleCompositionTimeOffset:
121 |                 startTime = baseTime + presentation.sampleCompositionTimeOffset
122 |             else:
123 |                 startTime = currentTime
124 |             currentTime = startTime + (duration or 0)
125 |             totalSize = 0
126 |             while True:
127 |                 # Read the payload size.
128 |                 payloadSize = reader.readUint32()
129 |                 totalSize += payloadSize
130 |                 # Skip the type.
131 |                 payloadType = reader.readUint32()
132 |                 payloadName = Mp4Parser.typeToString(payloadType)
133 | 
134 |                 # Read the data payload.
135 |                 payload = None
136 |                 if payloadName == 'vttc':
137 |                     if payloadSize > 8:
138 |                         payload = reader.readBytes(payloadSize - 8)
139 |                 elif payloadName == 'vtte':
140 |                     # It's a vtte, which is a vtt cue that is empty. Ignore any data that does exist.
141 |                     reader.skip(payloadSize - 8)
142 |                 else:
143 |                     log.error(f'Unknown box {payloadName}! Skipping!')
144 |                     reader.skip(payloadSize - 8)
145 | 
146 |                 if duration:
147 |                     if payload:
148 |                         assert self.timescale_ is not None, 'Timescale should not be null!'
149 |                         cue = Mp4VttParser.parseVTTC_(
150 |                             payload,
151 |                             time.periodStart + startTime / self.timescale_,
152 |                             time.periodStart + currentTime / self.timescale_
153 |                         )
154 |                         cues.append(cue)
155 |                 else:
156 |                     log.error('WVTT sample duration unknown, and no default found!')
157 |                 assert not presentation.sampleSize or totalSize <= presentation.sampleSize, 'The samples do not fit evenly into the sample sizes given in the TRUN box!'
158 | 
159 |                 # 检查是不是应该结束循环
160 |                 if presentation.sampleSize and totalSize < presentation.sampleSize:
161 |                     continue
162 |                 else:
163 |                     break
164 |         assert not reader.hasMoreData(), 'MDAT which contain VTT cues and non-VTT data are not currently supported!'
165 |         # parseVTTC_ 有可能返回的是 None 这里过滤一下
166 |         return [cue for cue in cues if cue]
167 | 
168 |     @staticmethod
169 |     def parseVTTC_(data: bytes, startTime: float, endTime: float):
170 | 
171 |         def payl_callback(data: bytes):
172 |             nonlocal payload
173 |             payload = data.decode('utf-8')
174 | 
175 |         def iden_callback(data: bytes):
176 |             nonlocal _id
177 |             _id = data.decode('utf-8')
178 | 
179 |         def sttg_callback(data: bytes):
180 |             nonlocal settings
181 |             settings = data.decode('utf-8')
182 | 
183 |         payload = None
184 |         _id = None
185 |         settings = None
186 | 
187 |         mp4parser = Mp4Parser()
188 |         mp4parser = mp4parser.box('payl', Mp4Parser.allData(payl_callback))
189 |         mp4parser = mp4parser.box('iden', Mp4Parser.allData(iden_callback))
190 |         mp4parser = mp4parser.box('sttg', Mp4Parser.allData(sttg_callback))
191 |         mp4parser = mp4parser.parse(data)
192 | 
193 |         if payload:
194 |             return Mp4VttParser.assembleCue_(payload, _id, settings, startTime, endTime)
195 |         else:
196 |             return None
197 | 
198 |     @staticmethod
199 |     def assembleCue_(payload: bytes, _id: str, settings: str, startTime: float, endTime: float):
200 |         cue = Cue(startTime, endTime, '', _settings=settings)
201 | 
202 |         styles = {}
203 |         VttTextParser.parseCueStyles(payload, cue, styles)
204 | 
205 |         if _id:
206 |             cue.id = _id
207 | 
208 |         # if settings:
209 |         #     # TextParser not fully implemented yet
210 |         #     parser = TextParser(settings)
211 |         #     word = parser.readWord()
212 |         #     while word:
213 |         #         if not VttTextParser.parseCueSetting(cue, word, VTTRegions=[]):
214 |         #             log.warning(f'VTT parser encountered an invalid VTT setting: {word}, The setting will be ignored.')
215 | 
216 |         #         parser.skipWhitespace()
217 |         #         word = parser.readWord()
218 |         return cue


--------------------------------------------------------------------------------
/text/TextEngine.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ThatNotEasy/StreamFab/157f7a91f75459e523076ab9d586c798f1a9c989/text/TextEngine.py


--------------------------------------------------------------------------------
/text/TtmlTextParser.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | from xml.dom.minidom import parseString, Element, Node, Document
  3 | from enum import Enum
  4 | from typing import List, Union
  5 | 
  6 | from text.Cue import Cue, CueRegion, units, direction, writingMode
  7 | from text.Cue import textAlign, lineAlign, positionAlign, displayAlign
  8 | from text.Cue import fontStyle, textDecoration
  9 | from util.TextParser import TimeContext
 10 | from util.exceptions import InvalidXML, InvalidTextCue
 11 | from log import log
 12 | 
 13 | document = Document()
 14 | 
 15 | 
 16 | class RateInfo_:
 17 |     def __init__(self, frameRate: str, subFrameRate: str, frameRateMultiplier: str, tickRate: str):
 18 |         try:
 19 |             self.frameRate = float(frameRate)
 20 |         except Exception:
 21 |             self.frameRate = 30
 22 |         try:
 23 |             self.subFrameRate = float(subFrameRate)
 24 |         except Exception:
 25 |             self.subFrameRate = 1
 26 |         try:
 27 |             self.tickRate = float(tickRate)
 28 |         except Exception:
 29 |             self.tickRate = 0
 30 |         if self.tickRate == 0:
 31 |             if frameRate:
 32 |                 self.tickRate = self.frameRate * self.subFrameRate
 33 |             else:
 34 |                 self.tickRate = 1
 35 |         if frameRateMultiplier:
 36 |             multiplierResults = re.findall('^(\d+) (\d+)$', frameRateMultiplier)
 37 |             if len(multiplierResults) > 0:
 38 |                 numerator = float(multiplierResults[1])
 39 |                 denominator = float(multiplierResults[2])
 40 |                 multiplierNum = numerator / denominator
 41 |                 self.frameRate *= multiplierNum
 42 | 
 43 | 
 44 | class TtmlTextParser:
 45 | 
 46 |     def parseInit(self):
 47 |         assert False, 'TTML does not have init segments'
 48 | 
 49 |     def parseMedia(self, data: bytes, time: TimeContext) -> List[Cue]:
 50 |         ttpNs = parameterNs_
 51 |         ttsNs = styleNs_
 52 |         text = data.decode('utf-8')
 53 |         cues = [] # type: List[Cue]
 54 |         xml = None
 55 | 
 56 |         if text == '':
 57 |             return cues
 58 |         try:
 59 |             xml = parseString(text)
 60 |         except Exception as e:
 61 |             log.error('xml parseString', exc_info=e)
 62 |         if xml is None:
 63 |             return cues
 64 |         parsererrors = xml.getElementsByTagName('parsererror') # type: List[Element]
 65 |         if len(parsererrors) > 0 and parsererrors[0]:
 66 |             raise InvalidXML('ttml parsererror')
 67 |         tts = xml.getElementsByTagName('tt') # type: List[Element]
 68 |         if len(tts) == 0:
 69 |             raise InvalidXML('TTML does not contain <tt> tag.')
 70 |         tt = tts[0]
 71 |         bodys = tt.getElementsByTagName('body') # type: List[Element]
 72 |         if len(bodys) == 0:
 73 |             return []
 74 |         frameRate = tt.getAttributeNS(ttpNs, 'frameRate')
 75 |         subFrameRate = tt.getAttributeNS(ttpNs, 'subFrameRate')
 76 |         frameRateMultiplier = tt.getAttributeNS(ttpNs, 'frameRateMultiplier')
 77 |         tickRate = tt.getAttributeNS(ttpNs, 'tickRate')
 78 |         cellResolution = tt.getAttributeNS(ttpNs, 'cellResolution')
 79 |         spaceStyle = tt.getAttribute('xml:space') or 'default'
 80 |         extent = tt.getAttributeNS(ttsNs, 'extent')
 81 | 
 82 |         if spaceStyle != 'default' and spaceStyle != 'preserve':
 83 |             raise InvalidXML(f'Invalid xml:space value: {spaceStyle}')
 84 |         whitespaceTrim = spaceStyle == 'default'
 85 |         rateInfo = RateInfo_(frameRate, subFrameRate, frameRateMultiplier, tickRate)
 86 |         cellResolutionInfo = TtmlTextParser.getCellResolution_(cellResolution)
 87 | 
 88 |         metadatas = tt.getElementsByTagName('metadata') # type: List[Element]
 89 |         metadataElements = []
 90 |         if len(metadatas) > 0:
 91 |             for childNode in metadatas[0].childNodes:
 92 |                 if isinstance(childNode, Element):
 93 |                     metadataElements.append(childNode)
 94 |         styles = tt.getElementsByTagName('style') # type: List[Element]
 95 |         regionElements = tt.getElementsByTagName('region') # type: List[Element]
 96 |         cueRegions = []
 97 | 
 98 |         for region in regionElements:
 99 |             cueRegion = TtmlTextParser.parseCueRegion_(region, styles, extent)
100 |             if cueRegion:
101 |                 cueRegions.append(cueRegion)
102 | 
103 |         body = bodys[0]
104 |         if len([childNode for childNode in body.childNodes if isinstance(childNode, Element) and childNode.tagName == 'p']) > 0:
105 |             raise InvalidTextCue('<p> can only be inside <div> in TTML')
106 |         for divNode in body.childNodes:
107 |             if isinstance(divNode, Element) is False:
108 |                 continue
109 |             if divNode.tagName != 'div':
110 |                 continue
111 |             has_p = False
112 |             for pChildren in divNode.childNodes:
113 |                 if isinstance(pChildren, Element) is False:
114 |                     continue
115 |                 if pChildren.tagName == 'span':
116 |                     raise InvalidTextCue('<span> can only be inside <p> in TTML')
117 |                 if pChildren.tagName == 'p':
118 |                     has_p = True
119 |                     cue = TtmlTextParser.parseCue_(pChildren, time.periodStart, rateInfo, metadataElements, styles, regionElements, cueRegions, whitespaceTrim, False, cellResolutionInfo)
120 |                     if cue:
121 |                         cues.append(cue)
122 |             if not has_p:
123 |                 cue = TtmlTextParser.parseCue_(divNode, time.periodStart, rateInfo, metadataElements, styles, regionElements, cueRegions, whitespaceTrim, False, cellResolutionInfo)
124 |                 if cue:
125 |                     cues.append(cue)
126 |         return cues
127 | 
128 |     @staticmethod
129 |     def parseCue_(cueNode: Union[Node, Element], offset, rateInfo, metadataElements, styles, regionElements, cueRegions, whitespaceTrim, isNested, cellResolution):
130 |         cueElement = None # type: Element
131 |         parentElement = cueNode.parentNode # type: Element
132 | 
133 |         if cueNode.nodeType == Node.TEXT_NODE:
134 |             span = document.createElement('span') # tpye: Text
135 |             span.appendChild(cueNode)
136 |             cueElement = span
137 |         else:
138 |             assert cueNode.nodeType == Node.ELEMENT_NODE, 'nodeType should be ELEMENT_NODE!'
139 |             cueElement = cueNode
140 |         assert cueElement, 'cueElement should be non-None!'
141 | 
142 |         spaceStyle = cueElement.getAttribute('xml:space') or 'default' if whitespaceTrim else 'preserve'
143 |         localWhitespaceTrim = spaceStyle == 'default'
144 |         if cueElement.firstChild and cueElement.firstChild.nodeValue:
145 |             # hasTextContent = re.match('\S', cueElement.firstChild.nodeValue)
146 |             # \S 不匹配换行 但是js的test却会返回true
147 |             # 所以python这里会误判 那么strip下达到修复效果
148 |             hasTextContent = re.match('\S', cueElement.firstChild.nodeValue.strip())
149 |         else:
150 |             hasTextContent = False
151 |         hasTimeAttributes = cueElement.hasAttribute('begin') or cueElement.hasAttribute('end') or cueElement.hasAttribute('dur')
152 |         if not hasTimeAttributes and not hasTextContent and cueElement.tagName != 'br':
153 |             if not isNested:
154 |                 return None
155 |             elif localWhitespaceTrim:
156 |                 return None
157 |         start, end = TtmlTextParser.parseTime_(cueElement, rateInfo)
158 |         while parentElement and parentElement.nodeType == Node.ELEMENT_NODE and parentElement.tagName != 'tt':
159 |             start, end = TtmlTextParser.resolveTime_(parentElement, rateInfo, start, end)
160 |             parentElement = parentElement.parentNode
161 |         if start is None:
162 |             start = 0
163 |         start += offset
164 |         if end is None:
165 |             end = -1
166 |         else:
167 |             end += offset
168 |         if cueElement.tagName == 'br':
169 |             cue = Cue(start, end, '')
170 |             cue.lineBreak = True
171 |             return cue
172 |         payload = ''
173 |         nestedCues = []
174 |         flag = True
175 |         for childNode in cueElement.childNodes:
176 |             if childNode.nodeType != Node.TEXT_NODE:
177 |                 flag = False
178 |                 break
179 |         if flag:
180 |             payload: str = cueElement.firstChild.nodeValue
181 |             if localWhitespaceTrim:
182 |                 payload = payload.strip()
183 |                 payload = re.sub('\s+', ' ', payload)
184 |         else:
185 |             for childNode in [_ for _ in cueElement.childNodes]:
186 |                 nestedCue = TtmlTextParser.parseCue_(
187 |                     childNode,
188 |                     offset,
189 |                     rateInfo,
190 |                     metadataElements,
191 |                     styles,
192 |                     regionElements,
193 |                     cueRegions,
194 |                     localWhitespaceTrim,
195 |                     True,
196 |                     cellResolution,
197 |                 )
198 |                 if nestedCue:
199 |                     nestedCues.append(nestedCue)
200 |         cue = Cue(start, end, payload)
201 |         cue.nestedCues = nestedCues
202 | 
203 |         if cellResolution:
204 |             cue.cellResolution = cellResolution
205 | 
206 |         regionElements = TtmlTextParser.getElementsFromCollection_(cueElement, 'region', regionElements, '')
207 |         regionElement = None
208 |         if len(regionElements) > 0 and regionElements[0].getAttribute('xml:id'):
209 |             regionElement = regionElements[0]
210 |             regionId = regionElement.getAttribute('xml:id')
211 |             cue.region = [_ for _ in cueRegions if _.id == regionId][0]
212 |         imageElement = None
213 |         for nameSpace in smpteNsList_:
214 |             imageElements = TtmlTextParser.getElementsFromCollection_(cueElement, 'backgroundImage', metadataElements, '#', nameSpace)
215 |             if len(imageElements) > 0:
216 |                 imageElement = imageElements[0]
217 |                 break
218 | 
219 |         isLeaf = len(nestedCues) == 0
220 | 
221 |         TtmlTextParser.addStyle_(
222 |             cue,
223 |             cueElement,
224 |             regionElement,
225 |             imageElement,
226 |             styles,
227 |             isNested,
228 |             isLeaf
229 |         )
230 | 
231 |         return cue
232 | 
233 |     @staticmethod
234 |     def resolveTime_(parentElement, rateInfo: RateInfo_, start, end):
235 |         # 这里有可能存在bug
236 |         parentTime = TtmlTextParser.parseTime_(parentElement, rateInfo)
237 | 
238 |         if start is None:
239 |             # No start time of your own?  Inherit from the parent.
240 |             start = parentTime[0]
241 |         else:
242 |             # Otherwise, the start time is relative to the parent's start time.
243 |             if parentTime[0] is not None:
244 |                 start += parentTime[0]
245 | 
246 |         if end is None:
247 |             # No end time of your own?  Inherit from the parent.
248 |             end = parentTime[1]
249 |         else:
250 |             # Otherwise, the end time is relative to the parent's _start_ time.
251 |             # This is not a typo.  Both times are relative to the parent's _start_.
252 |             if parentTime[0] is not None:
253 |                 end += parentTime[0]
254 | 
255 |         return start, end
256 | 
257 |     @staticmethod
258 |     def parseTime_(element: Element, rateInfo: RateInfo_):
259 |         start = TtmlTextParser.parseTimeAttribute_(element.getAttribute('begin'), rateInfo)
260 |         end = TtmlTextParser.parseTimeAttribute_(element.getAttribute('end'), rateInfo)
261 |         duration = TtmlTextParser.parseTimeAttribute_(element.getAttribute('dur'), rateInfo)
262 |         if end is None and duration is not None:
263 |             end = start + duration
264 |         return start, end
265 | 
266 |     @staticmethod
267 |     def parseFramesTime_(rateInfo: RateInfo_, text):
268 |         # 50t or 50.5t
269 |         results = timeFramesFormat_.findall(text)
270 |         frames = float(results[0])
271 |         return frames / rateInfo.frameRate
272 | 
273 |     @staticmethod
274 |     def parseTickTime_(rateInfo: RateInfo_, text):
275 |         # 50t or 50.5t
276 |         results = timeTickFormat_.findall(text)
277 |         ticks = float(results[0])
278 |         return ticks / rateInfo.tickRate
279 | 
280 |     @staticmethod
281 |     def parseTimeFromRegex_(regex: re.Pattern, text: str) -> int:
282 |         results = regex.findall(text)
283 |         if len(results) == 0:
284 |             return None
285 |         if results[0][0] == '':
286 |             return None
287 | 
288 |         hours = 0
289 |         minutes = 0
290 |         seconds = 0
291 |         milliseconds = 0
292 |         try:
293 |             hours = int(results[0][0])
294 |             minutes = int(results[0][1])
295 |             seconds = float(results[0][2])
296 |             milliseconds = float(results[0][3])
297 |         except Exception:
298 |             pass
299 |         # 对于 timeColonFormatMilliseconds_ 来说 这里是匹配不到 milliseconds 的
300 |         # 不过下一步计算的时候 由于seconds是小数 所以又修正了...
301 | 
302 |         return (milliseconds / 1000) + seconds + (minutes * 60) + (hours * 3600)
303 | 
304 |     @staticmethod
305 |     def parseColonTimeWithFrames_(rateInfo: RateInfo_, text: str) -> int:
306 |         # 01:02:43:07 ('07' is frames) or 01:02:43:07.1 (subframes)
307 |         results = timeColonFormatFrames_.findall(text)
308 | 
309 |         hours = int(results[0][0])
310 |         minutes = int(results[0][1])
311 |         seconds = int(results[0][2])
312 |         frames = int(results[0][3])
313 |         subframes = int(results[0][4]) or 0
314 | 
315 |         frames += subframes / rateInfo.subFrameRate
316 |         seconds += frames / rateInfo.frameRate
317 | 
318 |         return seconds + (minutes * 60) + (hours * 3600)
319 | 
320 |     @staticmethod
321 |     def parseTimeAttribute_(text: str, rateInfo: RateInfo_):
322 |         ret = None
323 |         if timeColonFormatFrames_.match(text):
324 |             ret = TtmlTextParser.parseColonTimeWithFrames_(rateInfo, text)
325 |         elif timeColonFormat_.match(text):
326 |             ret = TtmlTextParser.parseTimeFromRegex_(timeColonFormat_, text)
327 |         elif timeColonFormatMilliseconds_.match(text):
328 |             ret = TtmlTextParser.parseTimeFromRegex_(timeColonFormatMilliseconds_, text)
329 |         elif timeFramesFormat_.match(text):
330 |             ret = TtmlTextParser.parseFramesTime_(rateInfo, text)
331 |         elif timeTickFormat_.match(text):
332 |             ret = TtmlTextParser.parseTickTime_(rateInfo, text)
333 |         elif timeHMSFormat_.match(text):
334 |             ret = TtmlTextParser.parseTimeFromRegex_(timeHMSFormat_, text)
335 |         elif text:
336 |             raise InvalidTextCue('Could not parse cue time range in TTML')
337 |         return ret
338 | 
339 |     @staticmethod
340 |     def addStyle_(cue, cueElement, region, imageElement: Element, styles: List[Element], isNested: bool, isLeaf: bool):
341 |         shouldInheritRegionStyles = isNested or isLeaf
342 | 
343 |         _direction = TtmlTextParser.getStyleAttribute_(cueElement, region, styles, 'direction', shouldInheritRegionStyles)
344 |         if _direction == 'rtl':
345 |             cue.direction = direction.HORIZONTAL_RIGHT_TO_LEFT
346 | 
347 |         _writingMode = TtmlTextParser.getStyleAttribute_(cueElement, region, styles, 'writingMode', shouldInheritRegionStyles)
348 |         if _writingMode == 'tb' or _writingMode == 'tblr':
349 |             cue.writingMode = writingMode.VERTICAL_LEFT_TO_RIGHT
350 |         elif _writingMode == 'tbrl':
351 |             cue.writingMode = writingMode.VERTICAL_RIGHT_TO_LEFT
352 |         elif _writingMode == 'rltb' or _writingMode == 'rl':
353 |             cue.direction = direction.HORIZONTAL_RIGHT_TO_LEFT
354 |         elif _writingMode:
355 |             cue.direction = direction.HORIZONTAL_LEFT_TO_RIGHT
356 | 
357 |         align = TtmlTextParser.getStyleAttribute_(cueElement, region, styles, 'textAlign', shouldInheritRegionStyles)
358 |         if align:
359 |             cue.positionAlign = textAlignToPositionAlign_[align]
360 |             cue.lineAlign = textAlignToLineAlign_[align]
361 | 
362 |             assert textAlign.__members__.get(align.upper()), f'{align.upper()} Should be in Cue.textAlign values!'
363 |         else:
364 |             cue.textAlign = textAlign.START
365 | 
366 |         _displayAlign = TtmlTextParser.getStyleAttribute_(cueElement, region, styles, 'displayAlign', shouldInheritRegionStyles)
367 |         if _displayAlign:
368 |             assert displayAlign.__members__.get(_displayAlign.upper()), f'{_displayAlign.upper()} Should be in Cue.displayAlign values!'
369 |             cue.displayAlign = displayAlign[_displayAlign.upper()]
370 | 
371 |         color = TtmlTextParser.getStyleAttribute_(cueElement, region, styles, 'color', shouldInheritRegionStyles)
372 |         if color:
373 |             cue.color = color
374 | 
375 |         backgroundColor = TtmlTextParser.getStyleAttribute_(cueElement, region, styles, 'backgroundColor', shouldInheritRegionStyles)
376 |         if backgroundColor:
377 |             cue.backgroundColor = backgroundColor
378 | 
379 |         border = TtmlTextParser.getStyleAttribute_(cueElement, region, styles, 'border', shouldInheritRegionStyles)
380 |         if border:
381 |             cue.border = border
382 | 
383 |         fontFamily = TtmlTextParser.getStyleAttribute_(cueElement, region, styles, 'fontFamily', shouldInheritRegionStyles)
384 |         if fontFamily:
385 |             cue.fontFamily = fontFamily
386 | 
387 |         fontWeight = TtmlTextParser.getStyleAttribute_(cueElement, region, styles, 'fontWeight', shouldInheritRegionStyles)
388 |         if fontWeight and fontWeight == 'bold':
389 |             cue.fontWeight = fontWeight.BOLD
390 | 
391 |         wrapOption = TtmlTextParser.getStyleAttribute_(cueElement, region, styles, 'wrapOption', shouldInheritRegionStyles)
392 |         if wrapOption and wrapOption == 'noWrap':
393 |             cue.wrapLine = False
394 |         else:
395 |             cue.wrapLine = True
396 | 
397 |         lineHeight = TtmlTextParser.getStyleAttribute_(cueElement, region, styles, 'lineHeight', shouldInheritRegionStyles)
398 |         if lineHeight and unitValues_.match(lineHeight):
399 |             cue.lineHeight = lineHeight
400 | 
401 |         fontSize = TtmlTextParser.getStyleAttribute_(cueElement, region, styles, 'fontSize', shouldInheritRegionStyles)
402 | 
403 |         if fontSize:
404 |             isValidFontSizeUnit = unitValues_.match(fontSize) or percentValue_.match(fontSize)
405 |             if isValidFontSizeUnit:
406 |                 cue.fontSize = fontSize
407 | 
408 |         _fontStyle = TtmlTextParser.getStyleAttribute_(cueElement, region, styles, 'fontStyle', shouldInheritRegionStyles)
409 |         if _fontStyle:
410 |             assert fontStyle.__members__.get(_fontStyle.upper()), f'{_fontStyle.upper()} Should be in Cue.fontStyle values!'
411 |             cue.fontStyle = fontStyle[_fontStyle.upper()]
412 | 
413 |         if imageElement:
414 |             backgroundImageType = imageElement.getAttribute('imageType') or imageElement.getAttribute('imagetype')
415 |             backgroundImageEncoding = imageElement.getAttribute('encoding')
416 |             backgroundImageData = imageElement.textContent.trim()
417 |             if backgroundImageType == 'PNG' and backgroundImageEncoding == 'Base64' and backgroundImageData:
418 |                 cue.backgroundImage = 'data:image/pngbase64,' + backgroundImageData
419 | 
420 |         letterSpacing = TtmlTextParser.getStyleAttribute_(cueElement, region, styles, 'letterSpacing', shouldInheritRegionStyles)
421 |         if letterSpacing and unitValues_.match(letterSpacing):
422 |             cue.letterSpacing = letterSpacing
423 | 
424 |         linePadding = TtmlTextParser.getStyleAttribute_(cueElement, region, styles, 'linePadding', shouldInheritRegionStyles)
425 |         if linePadding and unitValues_.match(linePadding):
426 |             cue.linePadding = linePadding
427 | 
428 |         opacity = TtmlTextParser.getStyleAttribute_(cueElement, region, styles, 'opacity', shouldInheritRegionStyles)
429 |         if opacity:
430 |             cue.opacity = float(opacity)
431 | 
432 |         textDecorationRegion = TtmlTextParser.getStyleAttributeFromRegion_(region, styles, 'textDecoration')
433 |         if textDecorationRegion:
434 |             TtmlTextParser.addTextDecoration_(cue, textDecorationRegion)
435 | 
436 |         textDecorationElement = TtmlTextParser.getStyleAttributeFromElement_(cueElement, styles, 'textDecoration')
437 |         if textDecorationElement:
438 |             TtmlTextParser.addTextDecoration_(cue, textDecorationElement)
439 | 
440 |     @staticmethod
441 |     def addTextDecoration_(cue: Cue, decoration):
442 |         # 这里可能有问题 .value
443 |         for value in decoration.split(' '):
444 |             if value == 'underline':
445 |                 if textDecoration.UNDERLINE not in cue.textDecoration:
446 |                     cue.textDecoration.append(textDecoration.UNDERLINE)
447 |             elif value == 'noUnderline':
448 |                 cue.textDecoration = [_ for _ in cue.textDecoration if textDecoration.UNDERLINE != _]
449 |             elif value == 'lineThrough':
450 |                 if textDecoration.LINE_THROUGH not in cue.textDecoration:
451 |                     cue.textDecoration.append(textDecoration.LINE_THROUGH)
452 |             elif value == 'noLineThrough':
453 |                 cue.textDecoration = [_ for _ in cue.textDecoration if textDecoration.LINE_THROUGH != _]
454 |             elif value == 'overline':
455 |                 if textDecoration.OVERLINE not in cue.textDecoration:
456 |                     cue.textDecoration.append(textDecoration.OVERLINE)
457 |             elif value == 'noOverline':
458 |                 cue.textDecoration = [_ for _ in cue.textDecoration if textDecoration.OVERLINE != _]
459 | 
460 |     @staticmethod
461 |     def getStyleAttribute_(cueElement, region, styles, attribute, shouldInheritRegionStyles=True):
462 |         attr = TtmlTextParser.getStyleAttributeFromElement_(cueElement, styles, attribute)
463 |         if attr:
464 |             return attr
465 |         if shouldInheritRegionStyles:
466 |             return TtmlTextParser.getStyleAttributeFromRegion_(region, styles, attribute)
467 |         return None
468 | 
469 |     @staticmethod
470 |     def parseCueRegion_(regionElement: Element, styles: List[Element], globalExtent: str):
471 |         region = CueRegion()
472 |         _id = regionElement.getAttribute('xml:id')
473 |         if not _id:
474 |             log.warning('TtmlTextParser parser encountered a region with no id. Region will be ignored.')
475 |             return None
476 |         region.id = _id
477 |         globalResults = None
478 |         if globalExtent:
479 |             globalResults = percentValues_.findall(globalExtent) or pixelValues_.findall(globalExtent)
480 |         if globalResults is not None and len(globalResults) == 2:
481 |             globalWidth = int(globalResults[0][0])
482 |             globalHeight = int(globalResults[0][1])
483 |         else:
484 |             globalWidth = None
485 |             globalHeight = None
486 |         results = None
487 |         percentage = None
488 | 
489 |         extent = TtmlTextParser.getStyleAttributeFromRegion_(regionElement, styles, 'extent')
490 |         if extent:
491 |             percentage = percentValues_.findall(extent)
492 |             results = percentage or pixelValues_.findall(extent)
493 |             if results is not None:
494 |                 region.width = int(results[0][0])
495 |                 region.height = int(results[0][1])
496 | 
497 |                 if not percentage:
498 |                     if globalWidth is not None:
499 |                         region.width = region.width * 100 / globalWidth
500 |                     if globalHeight is not None:
501 |                         region.height = region.height * 100 / globalHeight
502 |                 if percentage or globalWidth is not None:
503 |                     region.widthUnits = units.PERCENTAGE
504 |                 else:
505 |                     region.widthUnits = units.PX
506 |                 if percentage or globalHeight is not None:
507 |                     region.heightUnits = units.PERCENTAGE
508 |                 else:
509 |                     region.heightUnits = units.PX
510 |         origin = TtmlTextParser.getStyleAttributeFromRegion_(regionElement, styles, 'origin')
511 |         if origin:
512 |             percentage = percentValues_.findall(origin)
513 |             results = percentage or pixelValues_.findall(origin)
514 |             if len(results) > 0:
515 |                 region.viewportAnchorX = int(results[0][0])
516 |                 region.viewportAnchorY = int(results[0][1])
517 |             if len(percentage) == 0:
518 |                 if globalHeight is not None:
519 |                     region.viewportAnchorY = region.viewportAnchorY * 100 / globalHeight
520 |                 if globalWidth is not None:
521 |                     region.viewportAnchorX = region.viewportAnchorX * 100 / globalHeight
522 |             if percentage or globalWidth is not None:
523 |                 region.viewportAnchorUnits = units.PERCENTAGE
524 |             else:
525 |                 region.viewportAnchorUnits = units.PX
526 |         return region
527 | 
528 |     @staticmethod
529 |     def getInheritedStyleAttribute_(element: Element, styles, attribute):
530 |         ttsNs = styleNs_
531 |         ebuttsNs = styleEbuttsNs_
532 | 
533 |         inheritedStyles = TtmlTextParser.getElementsFromCollection_(element, 'style', styles, '') # tpye: List[Element]
534 | 
535 |         styleValue = None
536 |         # The last value in our styles stack takes the precedence over the others
537 |         for inheritedStyle in inheritedStyles:
538 |             # Check ebu namespace first.
539 |             styleAttributeValue = inheritedStyle.getAttributeNS(ebuttsNs, attribute)
540 | 
541 |             if not styleAttributeValue:
542 |                 # Fall back to tts namespace.
543 |                 styleAttributeValue = inheritedStyle.getAttributeNS(ttsNs, attribute)
544 | 
545 |             if not styleAttributeValue:
546 |                 # Next, check inheritance.
547 |                 # Styles can inherit from other styles, so traverse up that chain.
548 |                 styleAttributeValue = TtmlTextParser.getStyleAttributeFromElement_(inheritedStyle, styles, attribute)
549 | 
550 |             if styleAttributeValue:
551 |                 styleValue = styleAttributeValue
552 | 
553 |         return styleValue
554 | 
555 |     @staticmethod
556 |     def getStyleAttributeFromElement_(cueElement: Element, styles, attribute: str):
557 |         ttsNs = styleNs_
558 |         elementAttribute = cueElement.getAttributeNS(ttsNs, attribute)
559 |         if elementAttribute:
560 |             return elementAttribute
561 |         return TtmlTextParser.getInheritedStyleAttribute_(cueElement, styles, attribute)
562 | 
563 |     @staticmethod
564 |     def getInheritedAttribute_(element: Element, attributeName: str, nsName: str):
565 |         ret = None
566 |         while element:
567 |             if nsName:
568 |                 ret = element.getAttributeNS(nsName, attributeName)
569 |             else:
570 |                 ret = element.getAttribute(attributeName)
571 |             if ret:
572 |                 break
573 |             parentNode = element.parentNode
574 |             if isinstance(parentNode, Element):
575 |                 element = parentNode
576 |             else:
577 |                 break
578 |         return ret
579 | 
580 |     @staticmethod
581 |     def getElementsFromCollection_(element: Element, attributeName: str, collection: list, prefixName: str, nsName: str = None):
582 |         items = []
583 |         if not element or len(collection) < 1:
584 |             return items
585 |         attributeValue = TtmlTextParser.getInheritedAttribute_(element, attributeName, nsName)
586 |         if not attributeValue:
587 |             return items
588 |         itemNames = attributeValue.split(' ')
589 |         for name in itemNames:
590 |             for item in collection:
591 |                 if prefixName + item.getAttribute('xml:id') == name:
592 |                     items.append(item)
593 |                     break
594 |         return items
595 | 
596 |     @staticmethod
597 |     def getStyleAttributeFromRegion_(region: Element, styles, attribute):
598 |         ttsNs = styleNs_
599 |         if not region:
600 |             return None
601 |         attr = region.getAttributeNS(ttsNs, attribute)
602 |         if attr:
603 |             return attr
604 |         return TtmlTextParser.getInheritedStyleAttribute_(region, styles, attribute)
605 | 
606 |     @staticmethod
607 |     def getCellResolution_(cellResolution: str):
608 |         if cellResolution is None or cellResolution == '':
609 |             return None
610 |         matches = re.findall('^(\d+) (\d+)$', cellResolution)
611 |         if len(matches) == 0:
612 |             return None
613 |         columns = int(matches[0][0])
614 |         rows = int(matches[0][1])
615 |         return {'columns': columns, 'rows': rows}
616 | 
617 | 
618 | # 50.17% 10%
619 | percentValues_ = re.compile('^(\d{1,2}(?:\.\d+)?|100(?:\.0+)?)% (\d{1,2}(?:\.\d+)?|100(?:\.0+)?)%$')
620 | 
621 | # 0.6% 90%
622 | percentValue_ = re.compile('^(\d{1,2}(?:\.\d+)?|100)%$')
623 | 
624 | # 100px, 8em, 0.80c
625 | unitValues_ = re.compile('^(\d+px|\d+em|\d*\.?\d+c)$')
626 | 
627 | # 100px
628 | pixelValues_ = re.compile('^(\d+)px (\d+)px$')
629 | 
630 | # 00:00:40:07 (7 frames) or 00:00:40:07.1 (7 frames, 1 subframe)
631 | timeColonFormatFrames_ = re.compile('^(\d{2,}):(\d{2}):(\d{2}):(\d{2})\.?(\d+)?$')
632 | 
633 | # 00:00:40 or 00:40
634 | timeColonFormat_ = re.compile('^(?:(\d{2,}):)?(\d{2}):(\d{2})$')
635 | 
636 | # 01:02:43.0345555 or 02:43.03
637 | timeColonFormatMilliseconds_ = re.compile('^(?:(\d{2,}):)?(\d{2}):(\d{2}\.\d{2,})$')
638 | 
639 | # 75f or 75.5f
640 | timeFramesFormat_ = re.compile('^(\d*(?:\.\d*)?)f$')
641 | 
642 | # 50t or 50.5t
643 | timeTickFormat_ = re.compile('^(\d*(?:\.\d*)?)t$')
644 | 
645 | # 3.45h, 3m or 4.20s
646 | timeHMSFormat_ = re.compile('^(?:(\d*(?:\.\d*)?)h)?(?:(\d*(?:\.\d*)?)m)?(?:(\d*(?:\.\d*)?)s)?(?:(\d*(?:\.\d*)?)ms)?$')
647 | 
648 | 
649 | class textAlignToLineAlign_(Enum):
650 |     left = lineAlign.START
651 |     center = lineAlign.CENTER
652 |     right = lineAlign.END
653 |     start = lineAlign.START
654 |     end = lineAlign.END
655 | 
656 | 
657 | class textAlignToPositionAlign_(Enum):
658 |     left = positionAlign.LEFT
659 |     center = positionAlign.CENTER
660 |     right = positionAlign.RIGHT
661 | 
662 | 
663 | parameterNs_ = 'http://www.w3.org/ns/ttml#parameter'
664 | styleNs_ = 'http://www.w3.org/ns/ttml#styling'
665 | styleEbuttsNs_ = 'urn:ebu:tt:style'
666 | smpteNsList_ = [
667 |     'http://www.smpte-ra.org/schemas/2052-1/2010/smpte-tt',
668 |     'http://www.smpte-ra.org/schemas/2052-1/2013/smpte-tt',
669 | ]
670 | 


--------------------------------------------------------------------------------
/text/VttTextParser.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | from typing import Dict, List, Union
  3 | from xml.dom.minidom import parseString, Node, Element, Text
  4 | from xml.sax.saxutils import escape
  5 | from text.Cue import Cue, defaultTextColor, fontStyle, fontWeight, textDecoration
  6 | from log import log
  7 | 
  8 | 
  9 | class VttTextParser:
 10 | 
 11 |     def __init__(self):
 12 |         pass
 13 | 
 14 |     def parseInit(self, data: bytes):
 15 |         assert False, 'VTT does not have init segments'
 16 | 
 17 |     def parseMedia(self, data: bytes, time: int):
 18 |         pass
 19 | 
 20 |     @staticmethod
 21 |     def parseCueStyles(payload: str, rootCue: Cue, styles: Dict[str, Cue]):
 22 |         if len(styles) == 0:
 23 |             VttTextParser.addDefaultTextColor_(styles)
 24 |         payload = VttTextParser.replaceColorPayload_(payload)
 25 |         xmlPayload = '<span>' + escape(payload) + '</span>'
 26 |         elements = parseString(xmlPayload).getElementsByTagName('span') # type: List[Element]
 27 |         if len(elements) > 0 and elements[0]:
 28 |             element = elements[0]
 29 |             cues = [] # type: List[Cue]
 30 |             childNodes = element.childNodes # type: List[Element]
 31 |             if len(childNodes) == 1:
 32 |                 childNode = childNodes[0]
 33 |                 if childNode.nodeType == Node.TEXT_NODE or childNode.nodeType == Node.CDATA_SECTION_NODE:
 34 |                     rootCue.payload = payload
 35 |                     return
 36 |             for childNode in childNodes:
 37 |                 if childNode.nodeValue and childNode.nodeValue.startswith('i>'):
 38 |                     continue
 39 |                 VttTextParser.generateCueFromElement_(childNode, rootCue, cues, styles)
 40 |             rootCue.nestedCues = cues
 41 |         else:
 42 |             log.warning(f'The cue\'s markup could not be parsed: {payload}')
 43 |             rootCue.payload = payload
 44 | 
 45 |     @staticmethod
 46 |     def generateCueFromElement_(element: Union[Element, Text], rootCue: Cue, cues: List[Cue], styles: Dict[str, Cue]):
 47 |         nestedCue = rootCue.clone()
 48 |         if element.nodeType == Node.ELEMENT_NODE and element.nodeName:
 49 |             bold = fontWeight.BOLD
 50 |             italic = fontStyle.ITALIC
 51 |             underline = textDecoration.UNDERLINE
 52 |             tags = re.split('[ .]+', element.nodeName)
 53 |             for tag in tags:
 54 |                 if styles.get(tag):
 55 |                     VttTextParser.mergeStyle_(nestedCue, styles.get(tag))
 56 |                 if tag == 'b':
 57 |                     nestedCue.fontWeight = bold
 58 |                 elif tag == 'i':
 59 |                     nestedCue.fontStyle = italic
 60 |                 elif tag == 'u':
 61 |                     nestedCue.textDecoration.append(underline)
 62 |         isTextNode = element.nodeType == Node.TEXT_NODE or element.nodeType == Node.CDATA_SECTION_NODE
 63 |         if isTextNode:
 64 |             # element 这里是 Text 类型 js的textContent对应这里的data
 65 |             textArr = element.data.split('\n')
 66 |             isFirst = True
 67 |             for text in textArr:
 68 |                 if not isFirst:
 69 |                     lineBreakCue = rootCue.clone()
 70 |                     lineBreakCue.lineBreak = True
 71 |                     cues.append(lineBreakCue)
 72 |                 if len(text) > 0:
 73 |                     textCue = nestedCue.clone()
 74 |                     textCue.payload = text
 75 |                     cues.append(textCue)
 76 |                 isFirst = False
 77 |         else:
 78 |             for childNode in element.childNodes:
 79 |                 VttTextParser.generateCueFromElement_(childNode, nestedCue, cues, styles)
 80 | 
 81 |     @staticmethod
 82 |     def replaceColorPayload_(payload: str):
 83 |         '''
 84 |         这里没有找到相关样本测试 可能有bug
 85 |         '''
 86 |         names = []
 87 |         nameStart = -1
 88 |         newPayload = ''
 89 | 
 90 |         newPayload = payload
 91 |         # for i in range(len(payload)):
 92 |         #     if payload[i] == '/':
 93 |         #         end = payload.index('>', i)
 94 |         #         if end <= i:
 95 |         #             return payload
 96 |         #         tagEnd = payload[i + 1:end]
 97 |         #         tagStart = names.pop(-1)
 98 |         #         if not tagEnd or not tagStart:
 99 |         #             return payload
100 |         #         elif tagStart == tagEnd:
101 |         #             newPayload += '/' + tagEnd + '>'
102 |         #             i += len(tagEnd) + 1
103 |         #         else:
104 |         #             if not tagStart.startswith('c.') or tagEnd != 'c':
105 |         #                 return payload
106 |         #             newPayload += '/' + tagStart + '>'
107 |         #             i += len(tagEnd) + 1
108 |         #     else:
109 |         #         if payload[i] == '<':
110 |         #             nameStart = i + 1
111 |         #         elif payload[i] == '>':
112 |         #             if nameStart > 0:
113 |         #                 names.append(payload[nameStart:i])
114 |         #                 nameStart = -1
115 |         #         newPayload += payload[i]
116 |         return newPayload
117 | 
118 |     @staticmethod
119 |     def addDefaultTextColor_(styles: Dict[str, Cue]):
120 |         for key, value in defaultTextColor.__members__.items():
121 |             cue = Cue(0, 0, '')
122 |             cue.color = value
123 |             styles[key] = cue


--------------------------------------------------------------------------------
/util/DataViewReader.py:
--------------------------------------------------------------------------------
  1 | import struct
  2 | from enum import Enum
  3 | 
  4 | from util.exceptions import OutOfBoundsError
  5 | from util.exceptions import IntOverflowError
  6 | 
  7 | 
  8 | class Endianness(Enum):
  9 |     BIG_ENDIAN = 0
 10 |     LITTLE_ENDIAN = 1
 11 | 
 12 | 
 13 | class DataView:
 14 |     '''
 15 |     shaka/util/buffer_utils.js
 16 |     '''
 17 |     def __init__(self, data: bytes):
 18 |         self.buffer = memoryview(bytearray(data))
 19 |         # self.buffer = memoryview(bytearray([0x96, 0x87, 0xac]))
 20 |         self.byteLength = len(self.buffer) # type: int
 21 | 
 22 |     def getUint8(self):
 23 |         pass
 24 | 
 25 |     def getUint16(self):
 26 |         pass
 27 | 
 28 |     def getUint32(self, position: int, littleEndian: bool = False):
 29 |         # 这里记得切片长度要补齐4位 不然unpack会报错
 30 |         buf = self.buffer[position:position + 4].tobytes()
 31 |         if len(buf) < 4:
 32 |             buf = b'\x00' * (4 - len(buf)) + buf
 33 |         if littleEndian:
 34 |             return struct.unpack("<I", buf)[0]
 35 |         else:
 36 |             return struct.unpack(">I", buf)[0]
 37 | 
 38 |     def getUint64(self, position: int, littleEndian: bool = False):
 39 |         # 这里记得切片长度要补齐4位 不然
 40 |         buf = self.buffer[position:position + 4].tobytes()
 41 |         if len(buf) < 4:
 42 |             buf = b'\x00' * (4 - len(buf)) + buf
 43 |         if littleEndian:
 44 |             return struct.unpack("<I", buf)[0]
 45 |         else:
 46 |             return struct.unpack(">I", buf)[0]
 47 | 
 48 |     def getInt8(self):
 49 |         pass
 50 | 
 51 |     def getInt16(self):
 52 |         pass
 53 | 
 54 |     def getInt32(self, position: int, littleEndian: bool = False):
 55 |         buf = self.buffer[position:position + 4].tobytes()
 56 |         if len(buf) < 4:
 57 |             buf = b'\x00' * (4 - len(buf)) + buf
 58 |         if littleEndian:
 59 |             return struct.unpack("<i", buf)[0]
 60 |         else:
 61 |             return struct.unpack(">i", buf)[0]
 62 | 
 63 |     def getInt64(self):
 64 |         pass
 65 | 
 66 |     def readUint8(self):
 67 |         pass
 68 | 
 69 |     def readUint16(self):
 70 |         pass
 71 | 
 72 |     def readUint32(self):
 73 |         pass
 74 | 
 75 |     def readInt8(self):
 76 |         pass
 77 | 
 78 |     def readInt16(self):
 79 |         pass
 80 | 
 81 |     def readInt32(self):
 82 |         pass
 83 | 
 84 |     def readInt64(self):
 85 |         pass
 86 | 
 87 |     @staticmethod
 88 |     def toUint8(data: 'DataView', offset: int = 0, length: int = None):
 89 |         # 由于python中float('inf')表示无穷大 但不能作为索引
 90 |         # 所以这里直接将最大长度视为byteLength
 91 |         if length is None:
 92 |             length = data.byteLength
 93 |         return data.buffer[offset:offset + length].tobytes()
 94 | 
 95 | 
 96 | class DataViewReader(DataView):
 97 |     '''
 98 |     shaka/util/data_view_reader.js
 99 |     '''
100 | 
101 |     def __init__(self, data: bytes, endianness: Endianness):
102 |         self.dataView_ = DataView(data) # type: DataView
103 |         self.littleEndian_ = endianness == Endianness.LITTLE_ENDIAN # type: bool
104 |         self.position_ = 0 # type: int
105 | 
106 |     def getDataView(self) -> DataView:
107 |         return self.dataView_
108 | 
109 |     def hasMoreData(self) -> bool:
110 |         return self.position_ < self.dataView_.byteLength
111 | 
112 |     def getPosition(self) -> int:
113 |         return self.position_
114 | 
115 |     def getLength(self) -> int:
116 |         return self.dataView_.byteLength
117 | 
118 |     def readUint8(self):
119 |         pass
120 | 
121 |     def readUint16(self):
122 |         pass
123 | 
124 |     def readUint32(self) -> int:
125 |         value = self.dataView_.getUint32(self.position_, self.littleEndian_)
126 |         self.position_ += 4
127 |         return value
128 | 
129 |     def readInt32(self):
130 |         value = self.dataView_.getInt32(self.position_, self.littleEndian_)
131 |         self.position_ += 4
132 |         return value
133 | 
134 |     def readUint64(self) -> int:
135 |         if self.littleEndian_:
136 |             low = self.dataView_.getUint32(self.position_, True)
137 |             high = self.dataView_.getUint32(self.position_ + 4, True)
138 |         else:
139 |             high = self.dataView_.getUint32(self.position_, False)
140 |             low = self.dataView_.getUint32(self.position_ + 4, False)
141 | 
142 |         if high > 0x1FFFFF:
143 |             raise IntOverflowError
144 | 
145 |         self.position_ += 8
146 |         return (high * (2 ** 32)) + low
147 | 
148 |     def readBytes(self, length: int):
149 |         assert length >= 0, 'Bad call to DataViewReader.readBytes'
150 |         if self.position_ + length > self.dataView_.byteLength:
151 |             raise OutOfBoundsError
152 |         data = DataView.toUint8(self.dataView_, self.position_, length)
153 |         self.position_ += length
154 |         return data
155 | 
156 |     def skip(self, length: int):
157 |         assert length >= 0, 'Bad call to DataViewReader.skip'
158 |         if self.position_ + length > self.dataView_.byteLength:
159 |             raise OutOfBoundsError
160 |         self.position_ += length
161 | 
162 |     def rewind(self, length: int):
163 |         pass
164 | 
165 |     def seek(self, position: int):
166 |         pass
167 | 
168 |     def readTerminatedString(self):
169 |         pass
170 | 
171 |     def outOfBounds_(self):
172 |         pass


--------------------------------------------------------------------------------
/util/Functional.py:
--------------------------------------------------------------------------------
1 | class Functional:
2 | 
3 |     @staticmethod
4 |     def isNotNull(value) -> bool:
5 |         return value is not None


--------------------------------------------------------------------------------
/util/Mp4BoxParsers.py:
--------------------------------------------------------------------------------
  1 | from typing import List
  2 | from util.DataViewReader import DataViewReader
  3 | 
  4 | 
  5 | class ParsedTFHDBox:
  6 | 
  7 |     def __init__(self, **kwargs):
  8 |         self.trackId = kwargs['trackId'] # type: int
  9 |         self.defaultSampleDuration = kwargs['defaultSampleDuration'] # type: int
 10 |         self.defaultSampleSize = kwargs['defaultSampleSize'] # type: int
 11 | 
 12 | 
 13 | class ParsedTFDTBox:
 14 | 
 15 |     def __init__(self, **kwargs):
 16 |         self.baseMediaDecodeTime = kwargs['baseMediaDecodeTime'] # type: int
 17 | 
 18 | 
 19 | class ParsedMDHDBox:
 20 | 
 21 |     def __init__(self, **kwargs):
 22 |         self.timescale = kwargs['timescale'] # type: int
 23 | 
 24 | 
 25 | class ParsedTREXBox:
 26 | 
 27 |     def __init__(self, **kwargs):
 28 |         self.defaultSampleDuration = kwargs['defaultSampleDuration'] # type: int
 29 |         self.defaultSampleSize = kwargs['defaultSampleSize'] # type: int
 30 | 
 31 | 
 32 | class ParsedTRUNBox:
 33 | 
 34 |     def __init__(self, **kwargs):
 35 |         self.sampleCount = kwargs['sampleCount'] # type: int
 36 |         self.sampleData = kwargs['sampleData'] # type: List[ParsedTRUNSample]
 37 | 
 38 | 
 39 | class ParsedTRUNSample:
 40 | 
 41 |     def __init__(self, **kwargs):
 42 |         self.sampleDuration = kwargs['sampleDuration'] # type: int
 43 |         self.sampleSize = kwargs['sampleSize'] # type: int
 44 |         self.sampleCompositionTimeOffset = kwargs['sampleCompositionTimeOffset'] # type: int
 45 | 
 46 | 
 47 | class ParsedTKHDBox:
 48 | 
 49 |     def __init__(self, **kwargs):
 50 |         self.trackId = kwargs['trackId'] # type: int
 51 | 
 52 | 
 53 | class Mp4BoxParsers:
 54 | 
 55 |     @staticmethod
 56 |     def parseTFHD(reader: DataViewReader, flags: int) -> ParsedTFHDBox:
 57 |         defaultSampleDuration = None
 58 |         defaultSampleSize = None
 59 | 
 60 |         # Read "track_ID"
 61 |         trackId = reader.readUint32()
 62 | 
 63 |         # Skip "base_data_offset" if present.
 64 |         if flags & 0x000001:
 65 |             reader.skip(8)
 66 | 
 67 |         # Skip "sample_description_index" if present.
 68 |         if flags & 0x000002:
 69 |             reader.skip(4)
 70 | 
 71 |         # Read "default_sample_duration" if present.
 72 |         if flags & 0x000008:
 73 |             defaultSampleDuration = reader.readUint32()
 74 | 
 75 |         # Read "default_sample_size" if present.
 76 |         if flags & 0x000010:
 77 |             defaultSampleSize = reader.readUint32()
 78 | 
 79 |         return ParsedTFHDBox(**{
 80 |             'trackId': trackId,
 81 |             'defaultSampleDuration': defaultSampleDuration,
 82 |             'defaultSampleSize': defaultSampleSize,
 83 |         })
 84 | 
 85 |     @staticmethod
 86 |     def parseTFDT(reader: DataViewReader, version: int) -> ParsedTFDTBox:
 87 |         if version == 1:
 88 |             baseMediaDecodeTime = reader.readUint64()
 89 |         else:
 90 |             baseMediaDecodeTime = reader.readUint32()
 91 |         return ParsedTFDTBox(**{'baseMediaDecodeTime': baseMediaDecodeTime})
 92 | 
 93 |     @staticmethod
 94 |     def parseMDHD(reader: DataViewReader, version: int) -> ParsedMDHDBox:
 95 |         if version == 1:
 96 |             # Skip "creation_time"
 97 |             reader.skip(8)
 98 |             # Skip "modification_time"
 99 |             reader.skip(8)
100 |         else:
101 |             # Skip "creation_time"
102 |             reader.skip(4)
103 |             # Skip "modification_time"
104 |             reader.skip(4)
105 |         timescale = reader.readUint32()
106 |         return ParsedMDHDBox(**{'timescale': timescale})
107 | 
108 |     @staticmethod
109 |     def parseTREX(reader: DataViewReader) -> ParsedTREXBox:
110 |         pass
111 | 
112 |     @staticmethod
113 |     def parseTRUN(reader: DataViewReader, version: int, flags: int) -> ParsedTRUNBox:
114 |         sampleCount = reader.readUint32()
115 |         sampleData = []
116 | 
117 |         # Skip "data_offset" if present.
118 |         if flags & 0x000001:
119 |             reader.skip(4)
120 | 
121 |         # Skip "first_sample_flags" if present.
122 |         if flags & 0x000004:
123 |             reader.skip(4)
124 | 
125 |         for _ in range(sampleCount):
126 |             sample = ParsedTRUNSample(**{
127 |                 'sampleDuration': None,
128 |                 'sampleSize': None,
129 |                 'sampleCompositionTimeOffset': None,
130 |             })
131 | 
132 |             # Read "sample duration" if present.
133 |             if flags & 0x000100:
134 |                 sample.sampleDuration = reader.readUint32()
135 | 
136 |             # Read "sample_size" if present.
137 |             if flags & 0x000200:
138 |                 sample.sampleSize = reader.readUint32()
139 | 
140 |             # Skip "sample_flags" if present.
141 |             if flags & 0x000400:
142 |                 reader.skip(4)
143 | 
144 |             # Read "sample_time_offset" if present.
145 |             if flags & 0x000800:
146 |                 if version == 0:
147 |                     sample.sampleCompositionTimeOffset = reader.readUint32()
148 |                 else:
149 |                     sample.sampleCompositionTimeOffset = reader.readInt32()
150 |             sampleData.append(sample)
151 | 
152 |         return ParsedTRUNBox(**{'sampleCount': sampleCount, 'sampleData': sampleData})
153 | 
154 |     @staticmethod
155 |     def parseTKHD(reader: DataViewReader, version: int) -> ParsedTKHDBox:
156 |         pass


--------------------------------------------------------------------------------
/util/Mp4Parser.py:
--------------------------------------------------------------------------------
  1 | from typing import Dict, Callable
  2 | from enum import Enum
  3 | 
  4 | # from log import log
  5 | from util.DataViewReader import DataViewReader, Endianness
  6 | 
  7 | 
  8 | class ParsedBox:
  9 |     '''
 10 |     js原本是在shaka.extern.ParsedBox中
 11 |     但是python中做分离会引起循环导入的问题
 12 |     加上ParsedBox定义是在externs/shaka/mp4_parser.js
 13 |     于是这里就把ParsedBox放到这里了
 14 |     '''
 15 |     def __init__(self, **kwargs):
 16 |         self.parser = kwargs['parser'] # type: Mp4Parser
 17 |         self.partialOkay = kwargs['partialOkay'] # type: bool
 18 |         self.start = kwargs['start'] # type: int
 19 |         self.size = kwargs['size'] # type: int
 20 |         self.version = kwargs['version'] # type: int
 21 |         self.flags = kwargs['flags'] # type: int
 22 |         self.reader = kwargs['reader'] # type: DataViewReader
 23 |         self.has64BitSize = kwargs['has64BitSize'] # type: bool
 24 | 
 25 | 
 26 | class Mp4Parser:
 27 | 
 28 |     class BoxType_(Enum):
 29 |         BASIC_BOX = 0
 30 |         FULL_BOX = 1
 31 | 
 32 |     def __init__(self):
 33 |         self.headers_ = {} # type: Dict[int, Mp4Parser.BoxType_]
 34 |         self.boxDefinitions_ = {} # type: Dict[int, Callable]
 35 |         self.done_ = False # type: bool
 36 | 
 37 |     def box(self, _type: str, definition: Callable) -> 'Mp4Parser':
 38 |         typeCode = Mp4Parser.typeFromString_(_type)
 39 |         self.headers_[typeCode] = Mp4Parser.BoxType_.BASIC_BOX
 40 |         self.boxDefinitions_[typeCode] = definition
 41 |         return self
 42 | 
 43 |     def fullBox(self, _type: str, definition: Callable) -> 'Mp4Parser':
 44 |         typeCode = Mp4Parser.typeFromString_(_type)
 45 |         self.headers_[typeCode] = Mp4Parser.BoxType_.FULL_BOX
 46 |         self.boxDefinitions_[typeCode] = definition
 47 |         return self
 48 | 
 49 |     def stop(self):
 50 |         self.done_ = True
 51 | 
 52 |     def parse(self, data, partialOkay: bool = False, stopOnPartial: bool = False):
 53 |         reader = DataViewReader(data, Endianness.BIG_ENDIAN)
 54 |         self.done_ = False
 55 |         while reader.hasMoreData() and not self.done_:
 56 |             self.parseNext(0, reader, partialOkay, stopOnPartial)
 57 | 
 58 |     def parseNext(self, absStart: int, reader: DataViewReader, partialOkay: bool, stopOnPartial: bool = False):
 59 |         start = reader.getPosition()
 60 | 
 61 |         # size(4 bytes) + type(4 bytes) = 8 bytes
 62 |         if stopOnPartial and start + 8 > reader.getLength():
 63 |             self.done_ = True
 64 |             return
 65 | 
 66 |         size = reader.readUint32()
 67 |         _type = reader.readUint32()
 68 |         name = Mp4Parser.typeToString(_type)
 69 |         has64BitSize = False
 70 |         # log.info(f'[{name}] Parsing MP4 box')
 71 | 
 72 |         if size == 0:
 73 |             size = reader.getLength() - start
 74 |         elif size == 1:
 75 |             if stopOnPartial and reader.getPosition() + 8 > reader.getLength():
 76 |                 self.done_ = True
 77 |                 return
 78 |             size = reader.readUint64()
 79 |             has64BitSize = True
 80 |         # 和js不一样 py中不存在key会直接异常 所以这里用get方法
 81 |         boxDefinition = self.boxDefinitions_.get(_type)
 82 | 
 83 |         if boxDefinition:
 84 |             version = None
 85 |             flags = None
 86 | 
 87 |             if self.headers_[_type] == Mp4Parser.BoxType_.FULL_BOX:
 88 |                 if stopOnPartial and reader.getPosition() + 4 > reader.getLength():
 89 |                     self.done_ = True
 90 |                     return
 91 |                 versionAndFlags = reader.readUint32()
 92 |                 version = versionAndFlags >> 24
 93 |                 flags = versionAndFlags & 0xFFFFFF
 94 | 
 95 |             end = start + size
 96 |             if partialOkay and end > reader.getLength():
 97 |                 end = reader.getLength()
 98 | 
 99 |             if stopOnPartial and end > reader.getLength():
100 |                 self.done_ = True
101 |                 return
102 |             payloadSize = end - reader.getPosition()
103 |             payload = reader.readBytes(payloadSize) if payloadSize > 0 else b''
104 | 
105 |             payloadReader = DataViewReader(payload, Endianness.BIG_ENDIAN)
106 | 
107 |             box = {
108 |                 'parser': self,
109 |                 'partialOkay': partialOkay or False,
110 |                 'version': version,
111 |                 'flags': flags,
112 |                 'reader': payloadReader,
113 |                 'size': size,
114 |                 'start': start + absStart,
115 |                 'has64BitSize': has64BitSize,
116 |             }
117 |             box = ParsedBox(**box)
118 | 
119 |             boxDefinition(box)
120 |         else:
121 |             skipLength = min(start + size - reader.getPosition(), reader.getLength() - reader.getPosition())
122 |             reader.skip(skipLength)
123 | 
124 |     @staticmethod
125 |     def children(box: ParsedBox):
126 |         headerSize = Mp4Parser.headerSize(box)
127 |         while box.reader.hasMoreData() and not box.parser.done_:
128 |             box.parser.parseNext(box.start + headerSize, box.reader, box.partialOkay)
129 | 
130 |     @staticmethod
131 |     def sampleDescription(box: ParsedBox):
132 |         headerSize = Mp4Parser.headerSize(box)
133 |         count = box.reader.readUint32()
134 |         for _ in range(count):
135 |             box.parser.parseNext(box.start + headerSize, box.reader, box.partialOkay)
136 |             if box.parser.done_:
137 |                 break
138 | 
139 |     @staticmethod
140 |     def allData(callback: Callable):
141 |         def alldata_callback(box: ParsedBox):
142 |             _all = box.reader.getLength() - box.reader.getPosition()
143 |             return callback(box.reader.readBytes(_all))
144 |         return alldata_callback
145 | 
146 |     @staticmethod
147 |     def typeFromString_(name: str):
148 |         assert len(name) == 4, 'Mp4 box names must be 4 characters long'
149 | 
150 |         code = 0
151 |         for char in name:
152 |             code = (code << 8) | ord(char)
153 |         return code
154 | 
155 |     @staticmethod
156 |     def typeToString(_type: int):
157 |         name = bytes([
158 |             (_type >> 24) & 0xff,
159 |             (_type >> 16) & 0xff,
160 |             (_type >> 8) & 0xff,
161 |             _type & 0xff
162 |         ]).decode('utf-8')
163 |         return name
164 | 
165 |     @staticmethod
166 |     def headerSize(box: ParsedBox):
167 |         return 8 + (8 if box.has64BitSize else 0) + (4 if box.flags is not None else 0)


--------------------------------------------------------------------------------
/util/TextParser.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | 
 4 | class TimeContext:
 5 |     def __init__(self, **kwargs):
 6 |         self.periodStart = kwargs['periodStart'] # tpye: float
 7 |         self.segmentStart = kwargs['segmentStart'] # tpye: float
 8 |         self.segmentEnd = kwargs['segmentEnd'] # tpye: float
 9 | 
10 | 
11 | class TextParser:
12 | 
13 |     def __init__(self, data: str):
14 |         self.data_ = data
15 |         self.position_ = 0
16 | 
17 |     def atEnd(self):
18 |         return self.position_ == len(self.data_)
19 | 
20 |     def readLine(self):
21 |         # assert 1 == 0, 'not implemented yet'
22 |         return self.readRegexReturnCapture_('(.*?)(\n|$)', 1)
23 | 
24 |     def readWord(self):
25 |         # assert 1 == 0, 'not implemented yet'
26 |         return self.readRegexReturnCapture_('[^ \t\n]*', 0)
27 | 
28 |     def readRegexReturnCapture_(self, regex: str, index: int):
29 |         if self.atEnd():
30 |             return None
31 |         ret = self.readRegex(regex)
32 |         if not ret:
33 |             return None
34 |         else:
35 |             return ret[index]
36 | 
37 |     def readRegex(self, regex: str):
38 |         index = self.indexOf_(regex)
39 |         if self.atEnd() or index is None or index.position != self.position_:
40 |             return None
41 | 
42 |         self.position_ += index.length
43 |         return index.results
44 | 
45 |     def indexOf_(self, regex: str):
46 |         # assert 1 == 0, 'not implemented yet'
47 |         results = re.search(regex, self.data_[self.position_:])
48 |         if not results:
49 |             return
50 |         else:
51 |             return IndexOf(results)
52 | 
53 | 
54 | class IndexOf:
55 |     def __init__(self, results: re.Match):
56 |         self.position = results.regs[0][0]
57 |         self.length = len(results[0])
58 |         self.results = results


--------------------------------------------------------------------------------
/util/exceptions.py:
--------------------------------------------------------------------------------
 1 | class Error(Exception):
 2 |     '''Base class for shaka errors.'''
 3 | 
 4 | 
 5 | class SeverityError(Error):
 6 |     '''Severity Error.'''
 7 | 
 8 | 
 9 | class CategoryError(Error):
10 |     '''Category Error.'''
11 | 
12 | 
13 | class InvalidMp4VTT(Error):
14 |     '''Code INVALID_MP4_VTT Error.'''
15 |     def __init__(self, reason: str):
16 |         self.reason = reason
17 | 
18 |     def __str__(self):
19 |         return self.reason
20 | 
21 | 
22 | class InvalidMp4TTML(Error):
23 |     '''Code INVALID_MP4_TTML Error.'''
24 |     def __init__(self, reason: str):
25 |         self.reason = reason
26 | 
27 |     def __str__(self):
28 |         return self.reason
29 | 
30 | 
31 | class InvalidXML(Error):
32 |     '''Code INVALID_XML Error.'''
33 |     def __init__(self, reason: str):
34 |         self.reason = reason
35 | 
36 |     def __str__(self):
37 |         return self.reason
38 | 
39 | 
40 | class InvalidTextCue(Error):
41 |     '''Code INVALID_TEXT_CUE Error.'''
42 |     def __init__(self, reason: str):
43 |         self.reason = reason
44 | 
45 |     def __str__(self):
46 |         return self.reason
47 | 
48 | 
49 | class OutOfBoundsError(Error):
50 |     '''Code BUFFER_READ_OUT_OF_BOUNDS Error.'''
51 | 
52 | 
53 | class IntOverflowError(Error):
54 |     '''Code JS_INTEGER_OVERFLOW Error.'''
55 | 


--------------------------------------------------------------------------------