├── test.mp3 ├── config.json ├── requirements.txt ├── Dockerfile ├── example.py ├── .gitignore ├── tools_language.py ├── acrcloud_logger.py ├── acrcloud_scan_files_libary.py ├── README.md ├── tools_str_sim.py ├── acrcloud_scan_files_python.py └── acrcloud_filter_libary.py /test.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/acrcloud/acrcloud_scan_files_python/HEAD/test.mp3 -------------------------------------------------------------------------------- /config.json: -------------------------------------------------------------------------------- 1 | { 2 | "host": "xxx", 3 | "access_key": "xxx", 4 | "access_secret": "xxx" 5 | } 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | python-Levenshtein 2 | fuzzywuzzy 3 | backports.csv 4 | requests 5 | openpyxl 6 | python-dateutil 7 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:2.7.16-slim-stretch 2 | 3 | COPY . /acr_scan_tool 4 | WORKDIR /acr_scan_tool 5 | RUN chmod +x /acr_scan_tool/acrcloud_scan_files_python.py 6 | 7 | ENV PATH=${PATH}:/acr_scan_tool 8 | 9 | RUN apt-get update \ 10 | && apt-get install -y --no-install-recommends git \ 11 | && apt-get purge -y --auto-remove \ 12 | && rm -rf /var/lib/apt/lists/* 13 | 14 | RUN pip install git+https://github.com/acrcloud/acrcloud_sdk_python 15 | RUN pip install fuzzywuzzy requests openpyxl python-dateutil backports.csv 16 | 17 | 18 | ENTRYPOINT ["acrcloud_scan_files_python.py"] 19 | -------------------------------------------------------------------------------- /example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | import os 5 | import sys 6 | import json 7 | from acrcloud_scan_files_libary import ACRCloud_Scan_Files 8 | 9 | if __name__ == "__main__": 10 | 11 | #ACRCloud Scan File Example 12 | is_debug = 1 #display the log info, or is_debug=0 13 | start_time = 0 #scan file start time(seconds) 14 | stop_time = 0 #scan file end time(seconds), or you can set it to the duration of file 15 | step = 10 #the length of each identified fragment (seconds) 16 | rec_length = step 17 | 18 | #your acrcloud project host, access_key, access_secret 19 | config = { 20 | "host": "your project host", 21 | "access_key": "your project access_key", 22 | "access_secret": "your project access_secret" 23 | } 24 | #export dir 25 | export_dir = "./" 26 | 27 | filepath = sys.argv[1] 28 | 29 | acr_sfile = ACRCloud_Scan_Files(config, is_debug) 30 | 31 | stop_time = acr_sfile.get_duration_by_file(filepath) 32 | 33 | """ 34 | #get a list of recognition results 35 | result_list = acr_sfile.recognize_file(filepath, start_time, stop_time, step, rec_length) 36 | #export to csv 37 | export_filename_csv = filepath + ".csv" 38 | acr_sfile.export_to_csv(result_list, export_filename_csv, export_dir) 39 | #export to xlsx 40 | export_filename_xlsx = filepath + ".xlsx" 41 | acr_sfile.export_to_xlsx(result_list, export_filename_xlsx, export_dir) 42 | """ 43 | 44 | #iterator to get the result of each fragment 45 | result_list2 = [] 46 | with open(filepath+"_raw_result.lst", "w") as wfile: 47 | for item in acr_sfile.for_recognize_file(filepath, start_time, stop_time, step, rec_length): 48 | 
result_list2.append(item) 49 | filename = item["file"] 50 | timestamp = item["timestamp"] 51 | res = acr_sfile.parse_data(item["result"]) 52 | title = res[2] 53 | print(filename, timestamp, title) 54 | wfile.write("{0}\n".format(json.dumps(item))) 55 | 56 | #get results with played-duration 57 | filter_results = acr_sfile.apply_filter(result_list2) 58 | #export the results to xlsx 59 | export_filtername_xlsx = filepath + "_with_duration.xlsx" 60 | acr_sfile.export_to_xlsx(filter_results, export_filtername_xlsx, export_dir) 61 | 62 | 63 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | env/ 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *,cover 48 | .hypothesis/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | 58 | # Flask instance folder 59 | instance/ 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # IPython Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # dotenv 80 | .env 81 | 82 | # virtualenv 83 | venv/ 84 | ENV/ 85 | 86 | # Spyder project settings 87 | .spyderproject 88 | 89 | # Rope project settings 90 | .ropeproject 91 | ### JetBrains template 92 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 93 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 94 | 95 | # User-specific stuff: 96 | .idea/workspace.xml 97 | .idea/tasks.xml 98 | .idea/dictionaries 99 | .idea/vcs.xml 100 | .idea/jsLibraryMappings.xml 101 | 102 | # Sensitive or high-churn files: 103 | .idea/dataSources.ids 104 | .idea/dataSources.xml 105 | .idea/dataSources.local.xml 106 | .idea/sqlDataSources.xml 107 | .idea/dynamic.xml 108 | .idea/uiDesigner.xml 109 | 110 | # Gradle: 111 | .idea/gradle.xml 112 | .idea/libraries 113 | 114 | # Mongo Explorer plugin: 115 | .idea/mongoSettings.xml 116 | 117 | ## File-based project format: 118 | *.iws 119 | 120 | ## Plugin-specific files: 121 | 122 | # IntelliJ 123 | /out/ 124 | 125 | # mpeltonen/sbt-idea plugin 126 | .idea_modules/ 127 | 128 | # JIRA plugin 129 | atlassian-ide-plugin.xml 130 | 131 | # Crashlytics plugin (for Android Studio and IntelliJ) 132 | com_crashlytics_export_strings.xml 133 | crashlytics.properties 134 | crashlytics-build.properties 135 | fabric.properties 136 | 137 | -------------------------------------------------------------------------------- /tools_language.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | class tools_language: 5 | def __init__(self): 6 | pass 7 | 8 | def is_chinese(self, uchar): 9 | if uchar >= u'\u4e00' and uchar<=u'\u9fa5': 10 | return True 11 | else: 12 | return False 13 | 14 | def is_CJK(self, uchar): 15 | """Check whether a unicode char is CJK (Chinese/Japanese/Korean) or in a related range""" 16 | if uchar >= u'\u3000' and uchar <= u'\u303f': 17 | return True 18 | elif uchar >= u'\u3040' and uchar <= u'\u309f': 19 | return True 20 | elif uchar >= u'\u30a0' and uchar <= u'\u30ff': 21 | return True 22 | elif uchar >= u'\uff00' and uchar <= u'\uffef': #full/half-width forms (the original upper bound u'\u30ff' made this branch unreachable) 23 | return True 24 | elif uchar >= u'\u4e00' and uchar <= u'\u9faf': 25 | return True 26 | elif uchar >= u'\u3400' and uchar <= u'\u4dbf': 27 | return True 28 | elif uchar >= u'\u0400' and uchar <= u'\u052f': #Cyrillic (Russian) 29 | return True 30 | elif uchar >= u'\uac00' and uchar <= u'\ud7ff': #Hangul (Korean) 31 | return True 32 | elif uchar >= u'\u4e00' and uchar <= u'\u9fa5': #Chinese 33 | return True 34 | elif uchar >= u'\uff61' and uchar <= u'\uff9f': #half-width katakana (Japanese) 35 | return True 36 | else: 37 | return False 38 | 39 | def is_number(self, uchar): 40 | if uchar >= u'\u0030' and uchar<=u'\u0039': #ASCII digits (the original upper bound u'\uffef' matched far more than digits) 41 | return True 42 | else: 43 | return False 44 | 45 | def is_alphabet(self, uchar): 46 | if (uchar >= u'\u0041' and uchar<=u'\u005a') or (uchar >= u'\u0061' and uchar<=u'\u007a'): 47 | return True 48 | else: 49 | return False 50 | 51 | def is_other(self, uchar): 52 | if not (self.is_chinese(uchar) or self.is_number(uchar) or self.is_alphabet(uchar)): 53 | return True 54 | else: 55 | return False 56 | 57 | def B2Q(self, uchar): 58 | inside_code=ord(uchar) 59 | if inside_code<0x0020 or inside_code>0x7e: 60 | return uchar 61 | if inside_code==0x0020: 62 | inside_code=0x3000 63 | else: 64 | inside_code+=0xfee0 65 | return unichr(inside_code) 66 | 67 | def Q2B(self, uchar): 68 | inside_code=ord(uchar) 69 | if inside_code==0x3000: 70 | inside_code=0x0020 71 | else: 72 | inside_code-=0xfee0 73 | if inside_code<0x0020 or inside_code>0x7e: 74 | return uchar 75 | return unichr(inside_code) 76 | 77 | def stringQ2B(self, ustring): 78 | return "".join([self.Q2B(uchar) for uchar in ustring]) 79 | 80 | def uniform(self, ustring): 81 | return self.stringQ2B(ustring).lower() 82 | 83 | def string2List(self, ustring): 84 | retList=[] 85 | utmp=[] 86 | for uchar in ustring: 87 | if self.is_other(uchar): 88 | if len(utmp)==0: 89 | continue 90 | else: 91 | retList.append("".join(utmp)) 92 | utmp=[] 93 | else: 94 | utmp.append(uchar) 95 | if len(utmp)!=0: 96 | retList.append("".join(utmp)) 97 | return retList 98 | 99 | def has_chinese(self, ustring): 100 | ustring_lower = ustring.lower() 101 | for uchar in ustring_lower: 102 | if self.is_chinese(uchar): 103 | return True 104 | return False 105 | 106 | def has_CJK(self, ustring): 107 | ustring_lower = ustring.lower() 108 | for uchar in ustring_lower: 109 | if self.is_CJK(uchar): 110 | return True 111 | return False 112 | 113 | 114 | -------------------------------------------------------------------------------- /acrcloud_logger.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | import os 5 | import sys 6 | import time 7 | import logging 8 | import traceback 9 | from logging.handlers import TimedRotatingFileHandler 10 | ''' 11 | traceback records log 12 | try: 13 | pass 14 | except Exception, e: 15 | logger.error('Failed to open file', exc_info=True) 16 | '''
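# Note: the format strings handed to ColoredFormatter below may embed the tokens
# $COLOR, $RESET and $BOLD, or any color name from the COLORS table such as $RED
# or $GREEN; ColoredFormatter.format() swaps each token for its ANSI escape sequence.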
17 | 18 | import logging 19 | 20 | BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = range(8) 21 | 22 | COLORS = { 23 | 'WARNING' : YELLOW, 24 | 'INFO' : GREEN, 25 | 'DEBUG' : BLUE, 26 | 'CRITICAL' : YELLOW, 27 | 'ERROR' : RED, 28 | 'RED' : RED, 29 | 'GREEN' : GREEN, 30 | 'YELLOW' : YELLOW, 31 | 'BLUE' : BLUE, 32 | 'MAGENTA' : MAGENTA, 33 | 'CYAN' : CYAN, 34 | 'WHITE' : WHITE, 35 | } 36 | 37 | RESET_SEQ = "\033[0m" 38 | COLOR_SEQ = "\033[1;%dm" 39 | BOLD_SEQ = "\033[1m" 40 | 41 | class ColoredFormatter(logging.Formatter): 42 | 43 | def __init__(self, *args, **kwargs): 44 | # can't do super(...) here because Formatter is an old school class 45 | logging.Formatter.__init__(self, *args, **kwargs) 46 | 47 | def format(self, record): 48 | levelname = record.levelname 49 | color = COLOR_SEQ % (30 + COLORS[levelname]) 50 | message = logging.Formatter.format(self, record) 51 | message = message.replace("$RESET", RESET_SEQ)\ 52 | .replace("$BOLD", BOLD_SEQ)\ 53 | .replace("$COLOR", color) 54 | for k,v in COLORS.items(): 55 | message = message.replace("$" + k, COLOR_SEQ % (v+30))\ 56 | .replace("$BG" + k, COLOR_SEQ % (v+40))\ 57 | .replace("$BG-" + k, COLOR_SEQ % (v+40)) 58 | return message + RESET_SEQ 59 | 60 | 61 | class AcrcloudLogger: 62 | 63 | def __init__(self, logname, loglevel = logging.INFO): 64 | self.logger = logging.getLogger(logname) 65 | self.logger.setLevel(loglevel) 66 | self.default_fmt = '%(asctime)s - %(name)s - %(levelname)8s - %(message)s' 67 | self.default_colorfmt = "$MAGENTA%(asctime)s$RESET - $COLOR%(name)-12s$RESET - $COLOR%(levelname)-6s$RESET - %(message)s" 68 | self.default_dir = './radioLog' 69 | 70 | def addFilehandler(self, logfile, logdir = None, fmt = '', loglevel = logging.INFO, when='D', interval=10, backupCount=1): 71 | try: 72 | filename = logfile 73 | if logdir is None: 74 | logdir = self.default_dir 75 | if not os.path.exists(logdir): 76 | os.makedirs(logdir) 77 | logfilepath = os.path.join(logdir, filename) 78 | #fhandler = logging.FileHandler(logfilepath) 79 | fhandler = TimedRotatingFileHandler(logfilepath, when, interval, backupCount) 80 | fhandler.setLevel(loglevel) 81 | formatter = logging.Formatter(fmt if fmt else self.default_fmt) 82 | fhandler.setFormatter(formatter) 83 | self.logger.addHandler(fhandler) 84 | return True 85 | except Exception as e: 86 | traceback.print_exc(file=sys.stdout) 87 | return False 88 | 89 | def addStreamHandler(self, fmt='', loglevel = logging.INFO): 90 | try: 91 | color_fmt = fmt if fmt else self.default_colorfmt 92 | shandler = logging.StreamHandler() 93 | shandler.setLevel(loglevel) 94 | color_formatter = ColoredFormatter(color_fmt) 95 | #f = logging.Formatter(self.default_fmt) 96 | shandler.setFormatter(color_formatter) 97 | self.logger.addHandler(shandler) 98 | return True 99 | except Exception as e: 100 | traceback.print_exc(file=sys.stdout) 101 | return False 102 | 103 | if __name__ == '__main__': 104 | 105 | dlog = AcrcloudLogger('test', logging.INFO) 106 | dlog.addFilehandler('test.log') 107 | dlog.addStreamHandler() 108 | #dlog.logger.warn("hel") 109 | """ 110 | for i in range(300): 111 | dlog.logger.warn('what!!!!!!!!!!!') 112 | #dlog.logger.info('hahhahah') 113 | #dlog.logger.error('it is monster!!') 114 | time.sleep(1) 115 | """ 116 | -------------------------------------------------------------------------------- /acrcloud_scan_files_libary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | import os 5 | import 
sys 6 | import time 7 | import json 8 | import codecs 9 | import logging 10 | import openpyxl 11 | from backports import csv 12 | from openpyxl import Workbook 13 | from acrcloud_filter_libary import FilterWorker 14 | from acrcloud_logger import AcrcloudLogger 15 | from acrcloud.recognizer import ACRCloudRecognizer 16 | 17 | if sys.version_info.major == 2: 18 | reload(sys) 19 | sys.setdefaultencoding("utf8") 20 | 21 | class ACRCloud_Scan_Files: 22 | 23 | def __init__(self, config, debug=1): 24 | self.openpyxl_version = ".".join(str(openpyxl.__version__).split(".")[:2]) 25 | self.config = config 26 | self.debug = debug 27 | self.init_log() 28 | self.re_handler = ACRCloudRecognizer(self.config) 29 | 30 | def init_log(self): 31 | log_level = logging.ERROR 32 | if self.debug == 1: 33 | log_level = logging.DEBUG 34 | 35 | shandler = logging.StreamHandler() 36 | #shandler.setLevel(log_level) 37 | self.log = logging.getLogger("ACRCloud_ScanFile") 38 | self.log.setLevel(log_level) 39 | self.log.addHandler(shandler) 40 | 41 | def as_text(self, value): 42 | if value is None: 43 | return "" 44 | return str(value) 45 | 46 | def get_duration_by_file(self, filepath): 47 | return int(ACRCloudRecognizer.get_duration_ms_by_file(filepath)/1000) 48 | 49 | def export_to_xlsx(self, result_list, export_filename="ACRCloud_ScanFile_Results.xlsx", export_dir="./"): 50 | try: 51 | results = [] 52 | for item in result_list: 53 | filename = item["file"] 54 | timestamp = item["timestamp"] 55 | jsoninfo = item["result"] 56 | if "status" in jsoninfo and jsoninfo["status"]["code"] == 0: 57 | row = self.parse_data(jsoninfo) 58 | row = [filename, timestamp] + list(row) 59 | results.append(row) 60 | results = sorted(results, key=lambda x:x[1]) 61 | 62 | wb = Workbook() 63 | sheet_music = wb.active 64 | sheet_music.title = "ACRCloud_Scan_File" 65 | 66 | header_row = ['filename', 'timestamp', 'custom_files_title', 'custom_acrid', 'title', 'artists', 'album', 67 | 'acrid', 'played_duration', 'label', 'isrc', 'upc', 'deezer', 'spotify', 'itunes', 'youtube'] 68 | 69 | sheet_music.append(header_row) 70 | for row in results: 71 | sheet_music.append(row) 72 | 73 | for column_cells in sheet_music.columns: 74 | length = max(len(self.as_text(cell.value)) for cell in column_cells) 75 | if length > 80: 76 | length = 80 #cap the column width (the original "length == 80" was a no-op comparison) 77 | if self.openpyxl_version >= "2.6": 78 | sheet_music.column_dimensions[column_cells[0].column_letter].width = length 79 | else: 80 | sheet_music.column_dimensions[column_cells[0].column].width = length 81 | 82 | export_filepath = os.path.join(export_dir, export_filename) 83 | wb.save(export_filepath) 84 | if self.debug: 85 | self.log.info("export_to_xlsx.Save Data to xlsx: {0}".format(export_filename)) 86 | except Exception as e: 87 | self.log.error("Error@export_to_xlsx", exc_info=True) 88 | 89 | def export_to_csv(self, result_list, export_filename="ACRCloud_ScanFile_Results.csv", export_dir="./"): 90 | try: 91 | results = [] 92 | for item in result_list: 93 | filename = item["file"] 94 | timestamp = item["timestamp"] 95 | jsoninfo = item["result"] 96 | if "status" in jsoninfo and jsoninfo["status"]["code"] == 0: 97 | row = self.parse_data(jsoninfo) 98 | row = [filename, timestamp] + list(row) 99 | results.append(row) 100 | 101 | results = sorted(results, key=lambda x:x[1]) 102 | 103 | export_filepath = os.path.join(export_dir, export_filename) 104 | 105 | with codecs.open(export_filepath, 'w', 'utf-8-sig') as f: 106 | head_row = ['filename', 'timestamp', 'custom_files_title', 'custom_acrid', 'title', 'artists',
'album', 107 | 'acrid', 'played_duration', 'label', 'isrc', 'upc', 'deezer', 'spotify', 'itunes', 'youtube'] 108 | dw = csv.writer(f) 109 | dw.writerow(head_row) 110 | dw.writerows(results) 111 | if self.debug: 112 | self.log.info("export_to_csv.Save Data to csv: {0}".format(export_filename)) 113 | except Exception as e: 114 | self.log.error("Error@export_to_csv", exc_info=True) 115 | 116 | def parse_data(self, jsoninfo): 117 | try: 118 | title, played_duration, isrc, upc, acrid, label, album = [""]*7 119 | artists, deezer, spotify, itunes, youtube, custom_files_title, audio_id, custom_acrid = [""]*8 120 | 121 | metadata = jsoninfo.get('metadata', {}) 122 | played_duration = metadata.get("played_duration", "") 123 | if "music" in metadata and len(metadata["music"]) > 0: 124 | item = metadata["music"][0] 125 | title = item.get("title", "") 126 | offset = item.get("play_offset_ms", "") 127 | isrc = item.get("external_ids", {"isrc":""}).get("isrc","") 128 | upc = item.get("external_ids", {"upc":""}).get("upc","") 129 | acrid = item.get("acrid","") 130 | label = item.get("label", "") 131 | album = item.get("album", {"name":""}).get("name", "") 132 | artists = ",".join([ ar["name"] for ar in item.get('artists', [{"name":""}]) if ar.get("name") ]) 133 | deezer = item.get("external_metadata", {"deezer":{"track":{"id":""}}}).get("deezer", {"track":{"id":""}}).get("track", {"id":""}).get("id", "") 134 | spotify = item.get("external_metadata", {"spotify":{"track":{"id":""}}}).get("spotify", {"track":{"id":""}}).get("track", {"id":""}).get("id", "") 135 | itunes = item.get("external_metadata", {"itunes":{"track":{"id":""}}}).get("itunes", {"track":{"id":""}}).get("track", {"id":""}).get("id", "") 136 | youtube = item.get("external_metadata", {"youtube":{"vid":""}}).get("youtube", {"vid":""}).get("vid", "") 137 | 138 | if "custom_files" in metadata and len(metadata["custom_files"]) > 0: 139 | custom_item = metadata["custom_files"][0] 140 | custom_files_title = custom_item.get("title", "") 141 | audio_id = custom_item.get("audio_id", "") 142 | custom_acrid = custom_item.get("acrid", "") 143 | except Exception as e: 144 | self.log.error("Error@parse_data") 145 | 146 | res = (custom_files_title, custom_acrid, title, artists, album, acrid, 147 | played_duration, label, isrc, upc, deezer, spotify, itunes, youtube,) 148 | 149 | return res 150 | 151 | def apply_filter(self, results): 152 | fworker = FilterWorker() 153 | result_new = fworker.apply_filter(results) 154 | return result_new 155 | 156 | def do_recognize(self, filepath, start_time, rec_length): 157 | current_time = time.strftime('%H:%M:%S', time.gmtime(start_time)) 158 | res_data = self.re_handler.recognize_by_file(filepath, start_time, rec_length) 159 | return filepath, current_time, res_data 160 | 161 | def for_recognize_file(self, filepath, start_time, stop_time, step, rec_length): 162 | try: 163 | for i in range(start_time, stop_time, step): 164 | filep, current_time, res_data = self.do_recognize(filepath, i, rec_length) 165 | if res_data: 166 | jsoninfo = json.loads(res_data) 167 | if "metadata" in jsoninfo and "timestamp_utc" in jsoninfo["metadata"]: 168 | jsoninfo["metadata"]["timestamp_utc"] = current_time 169 | else: 170 | jsoninfo = {} 171 | yield {"timestamp":current_time, "rec_length":rec_length, "result":jsoninfo, "file":filep} 172 | except Exception as e: 173 | self.log.error("Error@for_recognize_file", exc_info=True) 174 | 175 | def recognize_file(self, filepath, start_time, stop_time, step, rec_length): 176 | try: 177 | result_list 
= [] 178 | for i in range(start_time, stop_time, step): 179 | filep, current_time, res_data = self.do_recognize(filepath, i, rec_length) 180 | if res_data: 181 | jsoninfo = json.loads(res_data) 182 | try: 183 | if "metadata" in jsoninfo and "timestamp_utc" in jsoninfo["metadata"]: 184 | jsoninfo["metadata"]["timestamp_utc"] = current_time 185 | 186 | code = jsoninfo.get("status", {}).get("code") #use .get so a malformed response cannot raise before code is bound (it is read in the except block below) 187 | msg = jsoninfo.get("status", {}).get("msg", "") 188 | if code not in [0, 1001]: 189 | raise Exception("recognize_file.(timestamp: {0}, {1}, {2})".format(current_time, code, msg)) 190 | except Exception as e: 191 | if self.debug: 192 | self.log.error(e) 193 | else: 194 | print (e) 195 | if code in [3001, 3003, 3013]: 196 | break 197 | else: 198 | continue 199 | 200 | result_list.append({"timestamp":current_time, "rec_length":rec_length, "result":jsoninfo, "file":filep}) 201 | if self.debug: 202 | parse_info = self.parse_data(jsoninfo) 203 | self.log.info('recognize_file.(timestamp: {0}, title: {1})'.format(current_time, parse_info[2])) 204 | except Exception as e: 205 | self.log.error("Error@recognize_file", exc_info=True) 206 | return result_list 207 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # [Audio Recognition](https://www.acrcloud.com/music-recognition) -- File Scan Tool (Python Script) 2 | 3 | 4 | 5 | ## Overview 6 | [ACRCloud](https://www.acrcloud.com/) provides [Automatic Content Recognition](https://www.acrcloud.com/docs/introduction/automatic-content-recognition/) services for [Audio Fingerprinting](https://www.acrcloud.com/docs/introduction/audio-fingerprinting/) based applications such as **[Audio Recognition](https://www.acrcloud.com/music-recognition)** (supports music, video, ads for both online and offline), **[Broadcast Monitoring](https://www.acrcloud.com/broadcast-monitoring)**, **[Second Screen](https://www.acrcloud.com/second-screen-synchronization)**, **[Copyright Protection](https://www.acrcloud.com/copyright-protection-de-duplication)**, etc.
7 | 8 | This tool scans audio/video files and detects the audio content you want to recognize, such as music or ads. 9 | 10 | Supported formats: 11 | 12 | >>Audio: mp3, wav, m4a, flac, aac, amr, ape, ogg ...
13 | >>Video: mp4, mkv, wmv, flv, ts, avi ... 14 | 15 | ## Requirements 16 | 17 | **Notice: This tool only supports Python 2.** 18 | 19 | - Python 2.x 20 | - fuzzywuzzy 21 | - openpyxl 22 | - backports.csv 23 | - requests 24 | - Follow one of the tutorials to create a project and get your host, access_key and access_secret. 25 | 26 | 27 | ## Run as a Docker Container 28 | - Install Docker 29 | - If you are using Windows: Download [Docker Desktop for Windows](https://download.docker.com/win/stable/Docker%20for%20Windows%20Installer.exe) and install it. 30 | - If you are using macOS: Download [Docker Desktop for Mac](https://download.docker.com/mac/stable/Docker.dmg) and install it. 31 | - If you are using Linux: Open the terminal and run `bash <(curl -s https://get.docker.com/)` 32 | - Change the config file (config.json). 33 | - Run the following commands 34 | ``` 35 | git clone https://github.com/acrcloud/acrcloud_scan_files_python.git 36 | 37 | cd acrcloud_scan_files_python 38 | 39 | sudo docker build -t acrcloud/python_scan_tool . 40 | # Call it without arguments to display the full help 41 | sudo docker run --rm acrcloud/python_scan_tool 42 | 43 | # Basic usage 44 | sudo docker run --rm -v $(pwd):/tmp -v /Users/acrcloud/:/music/ acrcloud/python_scan_tool -f /music/test.mp4 -o /tmp 45 | 46 | You need to change /Users/acrcloud/ to the directory where your audio/video file is. 47 | The report file will be in the acrcloud_scan_files_python directory. 48 | ``` 49 | ## Installation 50 | 51 | On Windows, you must first install [Python](https://www.python.org/downloads/windows/) and [pip](https://pip.pypa.io/en/stable/installing/). 52 | 53 | Open your terminal and change to the script directory of acrcloud_scan_files_python-master. Then run the command: 54 | 55 | ``` 56 | pip install -r requirements.txt 57 | ``` 58 | ## Install ACRCloud Python SDK 59 | 60 | 61 | You can run the following command to install it. 62 | 63 | ``` 64 | python -m pip install git+https://github.com/acrcloud/acrcloud_sdk_python 65 | ``` 66 | 67 | Or you can download the [ACRCloud Python SDK](https://github.com/acrcloud/acrcloud_sdk_python) and install it with the following command. 68 | 69 | 70 | 71 | 72 | ``` 73 | sudo python setup.py install 74 | ``` 75 | 76 | ## For Windows 77 | 78 | ### Install Library 79 | Windows Runtime Library 80 | 81 | X86: [download and install Library(windows/vcredist_x86.exe)](https://www.microsoft.com/en-us/download/details.aspx?id=5555) 82 | 83 | x64: [download and install Library(windows/vcredist_x64.exe)](https://www.microsoft.com/en-us/download/details.aspx?id=14632) 84 | 85 | 86 | ## Usage for Scan File Tool: 87 | 88 | _ ____ ____ ____ _ _ 89 | / \ / ___| _ \ / ___| | ___ _ _ __| | 90 | / _ \| | | |_) | | | |/ _ \| | | |/ _` | 91 | / ___ \ |___| _ <| |___| | (_) | |_| | (_| | 92 | /_/ \_\____|_| \_\\____|_|\___/ \____|\____| 93 | 94 | Before you use this script, you must have your ACRCloud host, access_key and access_secret.
95 | If you don't have them yet, you can sign up at https://console.acrcloud.com/signup 96 | 97 | Change the content of config.json, filling in your host, access_key and access_secret: 98 | ``` 99 | { 100 | "host": "xxxxx", 101 | "access_key": "xxxxx", 102 | "access_secret": "xxxxx" 103 | } 104 | ``` 105 | 106 | ``` 107 | python acrcloud_scan_files_python.py -d folder_path 108 | python acrcloud_scan_files_python.py -f file_path 109 | python acrcloud_scan_files_python.py -h get_usage_help 110 | ``` 111 | 112 | ### Scan Folder Example: 113 | ``` 114 | python acrcloud_scan_files_python.py -d ~/music 115 | ``` 116 | ### Scan File Example: 117 | ``` 118 | python acrcloud_scan_files_python.py -f ~/testfiles/test.mp3 119 | ``` 120 | 121 | ### Add more params 122 | 123 | "-s" ---- scan step (the scan interval, in seconds). 124 | 125 | "-l" ---- recognizing length (how many seconds of audio to use for recognition; for example, -s 20 -l 10 reads 20 seconds of audio each step and uses the first 10 seconds to recognize). 126 | 127 | "-r" ---- scan range (for example, -r 5-20 recognizes the file starting from the 5th second and finishing at the 20th second). 128 | 129 | "-c" ---- set the config file path. 130 | 131 | "-w" ---- results with played duration (1-yes, 0-no); you must set the offset config for your access key, please contact support@acrcloud.com 132 | 133 | "-o" ---- set the directory to save the results. 134 | 135 | "-t" ---- set the type of the result file (csv[default] or xlsx). 136 | ``` 137 | If you want to change the scan interval or set the recognition range, add these params 138 | Example: 139 | python acrcloud_scan_files_python.py -f ~/testfiles/test.mp3 -s 30 -r 0-20 140 | python acrcloud_scan_files_python.py -d ~/music -s 30 -w 1 141 | ``` 142 | 143 | By default, the tool scans the folder this script is in. 144 | 145 | The results are saved in the folder this script is in. 146 | 147 | 148 | ## Usage for the Scan File Library 149 | 150 | An introduction to all of the library's APIs.
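The command-line options above map directly onto the library parameters documented below. A minimal sketch of the fragment timing, assuming `-s 20 -l 10 -r 5-60` (the exact option parsing lives in acrcloud_scan_files_python.py; `recognize_file` below iterates `range(start_time, stop_time, step)` and recognizes `rec_length` seconds per fragment):

```
# -r 5-60 -> start_time=5, stop_time=60; -s 20 -> step=20; -l 10 -> rec_length=10
start_time, stop_time, step, rec_length = 5, 60, 20, 10
for offset in range(start_time, stop_time, step):
    # each fragment starts at `offset` seconds and uses rec_length seconds of audio
    print("fragment: %d-%ds" % (offset, offset + rec_length))
# -> fragment: 5-15s / fragment: 25-35s / fragment: 45-55s
```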
151 | 152 | ### acrcloud_scan_files_libary.py 153 | 154 | ``` 155 | class ACRCloud_Scan_Files: 156 | def get_duration_by_file(self, filepath): 157 | #@param filepath : query file path 158 | #@return : total duration of the file (seconds) 159 | 160 | def export_to_xlsx(self, result_list, export_filename, export_dir): 161 | #@param result_list : the list of recognition results 162 | #@param export_filename : export to this file 163 | #@param export_dir : export to this directory 164 | 165 | def export_to_csv(self, result_list, export_filename, export_dir): 166 | #@param result_list : the list of recognition results 167 | #@param export_filename : export to this file 168 | #@param export_dir : export to this directory 169 | 170 | def parse_data(self, result): 171 | #@param result : one recognition result 172 | #@return : a tuple, as follows (matching the library code) 173 | # (custom_files_title, custom_acrid, title, artists, album, acrid, 174 | # played_duration, label, isrc, upc, deezer, spotify, itunes, youtube) 175 | 176 | def apply_filter(self, results): 177 | #@param results : the list of recognition results 178 | #@return : a list of results with played_duration 179 | 180 | def for_recognize_file(self, filepath, start_time, stop_time, step, rec_length): 181 | #@param filepath : query file path 182 | #@param start_time : the start offset to recognize (seconds) 183 | #@param stop_time : the end offset to recognize (seconds) 184 | #@param step : the interval between fragment start offsets (seconds) 185 | #@param rec_length : the duration of each fragment to recognize (seconds) 186 | #@return : an iterator yielding each recognition result 187 | 188 | def recognize_file(self, filepath, start_time, stop_time, step, rec_length): 189 | #@param filepath : query file path 190 | #@param start_time : the start offset to recognize (seconds) 191 | #@param stop_time : the end offset to recognize (seconds) 192 | #@param step : the interval between fragment start offsets (seconds) 193 | #@param rec_length : the duration of each fragment to recognize (seconds) 194 | #@return : the list of recognition results 195 | ``` 196 | 197 | ### Example 198 | 199 | Run: python example.py test.mp3 200 | 201 | ``` 202 | #!/usr/bin/env python 203 | #-*- coding:utf-8 -*- 204 | 205 | import os 206 | import sys 207 | from acrcloud_scan_files_libary import ACRCloud_Scan_Files 208 | 209 | if __name__ == "__main__": 210 | 211 | #ACRCloud Scan File Example 212 | is_debug = 1 #display the log info, or is_debug=0 213 | start_time = 0 #scan file start time (seconds) 214 | stop_time = 0 #scan file end time (seconds); you can also set it to the duration of the file 215 | step = 10 #the length of each identified fragment (seconds) 216 | rec_length = step 217 | 218 | #your acrcloud project host, access_key, access_secret 219 | config = { 220 | "host": "XXX", 221 | "access_key": "XXX", 222 | "access_secret": "XXX" 223 | } 224 | 225 | filepath = sys.argv[1] 226 | 227 | acr_sfile = ACRCloud_Scan_Files(config, is_debug) 228 | stop_time = acr_sfile.get_duration_by_file(filepath) 229 | #get a list of recognition results 230 | result_list = acr_sfile.recognize_file(filepath, start_time, stop_time, step, rec_length) 231 | 232 | #export the result 233 | export_dir = "./" 234 | #export to csv 235 | export_filename_csv = "test.csv" 236 | acr_sfile.export_to_csv(result_list, export_filename_csv, export_dir) 237 | #export to xlsx 238 | export_filename_xlsx = "test.xlsx" 239 | acr_sfile.export_to_xlsx(result_list, export_filename_xlsx, export_dir) 240 | 241 | #iterator to get the result of each fragment 242 | result_list2 = [] 243 | for item in acr_sfile.for_recognize_file(filepath, start_time, stop_time, step, rec_length): 244 |
result_list2.append(item) 245 | filename = item["file"] 246 | timestamp = item["timestamp"] 247 | res = acr_sfile.parse_data(item["result"]) 248 | title = res[2] 249 | print(filename, timestamp, title) 250 | 251 | #get results with played-duration 252 | filter_results = acr_sfile.apply_filter(result_list2) 253 | #export the results to xlsx 254 | export_filtername_xlsx = "test_with_duration.xlsx" 255 | acr_sfile.export_to_xlsx(filter_results, export_filtername_xlsx, export_dir) 256 | ``` 257 | -------------------------------------------------------------------------------- /tools_str_sim.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | """ 4 | author: hong 5 | Copyright (c) 2011 Adam Cohen 6 | ...... 7 | 8 | """ 9 | import re 10 | import sys 11 | import string 12 | from fuzzywuzzy import fuzz 13 | 14 | reload(sys) 15 | sys.setdefaultencoding("utf8") 16 | 17 | RE_SPECIAL_STRING = """[ \[\][]\(\)()\n\t\r,\.\:"'‘“<>《》!!??&]""" 18 | RE_SUB_STRING = "(\(.*\))|(\[.*\])|((.*))" 19 | THREADHOLD = 75 20 | 21 | #https://stackoverflow.com/questions/286921/efficiently-replace-all-accented-characters-in-a-string 22 | latin_map={ 23 | u"Á":"A", 24 | u"Ă":"A", 25 | u"Ắ":"A", 26 | u"Ặ":"A", 27 | u"Ằ":"A", 28 | u"Ẳ":"A", 29 | u"Ẵ":"A", 30 | u"Ǎ":"A", 31 | u"Â":"A", 32 | u"Ấ":"A", 33 | u"Ậ":"A", 34 | u"Ầ":"A", 35 | u"Ẩ":"A", 36 | u"Ẫ":"A", 37 | u"Ä":"A", 38 | u"Ǟ":"A", 39 | u"Ȧ":"A", 40 | u"Ǡ":"A", 41 | u"Ạ":"A", 42 | u"Ȁ":"A", 43 | u"À":"A", 44 | u"Ả":"A", 45 | u"Ȃ":"A", 46 | u"Ā":"A", 47 | u"Ą":"A", 48 | u"Å":"A", 49 | u"Ǻ":"A", 50 | u"Ḁ":"A", 51 | u"Ⱥ":"A", 52 | u"Ã":"A", 53 | u"Ꜳ":"AA", 54 | u"Æ":"AE", 55 | u"Ǽ":"AE", 56 | u"Ǣ":"AE", 57 | u"Ꜵ":"AO", 58 | u"Ꜷ":"AU", 59 | u"Ꜹ":"AV", 60 | u"Ꜻ":"AV", 61 | u"Ꜽ":"AY", 62 | u"Ḃ":"B", 63 | u"Ḅ":"B", 64 | u"Ɓ":"B", 65 | u"Ḇ":"B", 66 | u"Ƀ":"B", 67 | u"Ƃ":"B", 68 | u"Ć":"C", 69 | u"Č":"C", 70 | u"Ç":"C", 71 | u"Ḉ":"C", 72 | u"Ĉ":"C", 73 | u"Ċ":"C", 74 | u"Ƈ":"C", 75 | u"Ȼ":"C", 76 | u"Ď":"D", 77 | u"Ḑ":"D", 78 | u"Ḓ":"D", 79 | u"Ḋ":"D", 80 | u"Ḍ":"D", 81 | u"Ɗ":"D", 82 | u"Ḏ":"D", 83 | u"Dz":"D", 84 | u"Dž":"D", 85 | u"Đ":"D", 86 | u"Ƌ":"D", 87 | u"DZ":"DZ", 88 | u"DŽ":"DZ", 89 | u"É":"E", 90 | u"Ĕ":"E", 91 | u"Ě":"E", 92 | u"Ȩ":"E", 93 | u"Ḝ":"E", 94 | u"Ê":"E", 95 | u"Ế":"E", 96 | u"Ệ":"E", 97 | u"Ề":"E", 98 | u"Ể":"E", 99 | u"Ễ":"E", 100 | u"Ḙ":"E", 101 | u"Ë":"E", 102 | u"Ė":"E", 103 | u"Ẹ":"E", 104 | u"Ȅ":"E", 105 | u"È":"E", 106 | u"Ẻ":"E", 107 | u"Ȇ":"E", 108 | u"Ē":"E", 109 | u"Ḗ":"E", 110 | u"Ḕ":"E", 111 | u"Ę":"E", 112 | u"Ɇ":"E", 113 | u"Ẽ":"E", 114 | u"Ḛ":"E", 115 | u"Ꝫ":"ET", 116 | u"Ḟ":"F", 117 | u"Ƒ":"F", 118 | u"Ǵ":"G", 119 | u"Ğ":"G", 120 | u"Ǧ":"G", 121 | u"Ģ":"G", 122 | u"Ĝ":"G", 123 | u"Ġ":"G", 124 | u"Ɠ":"G", 125 | u"Ḡ":"G", 126 | u"Ǥ":"G", 127 | u"Ḫ":"H", 128 | u"Ȟ":"H", 129 | u"Ḩ":"H", 130 | u"Ĥ":"H", 131 | u"Ⱨ":"H", 132 | u"Ḧ":"H", 133 | u"Ḣ":"H", 134 | u"Ḥ":"H", 135 | u"Ħ":"H", 136 | u"Í":"I", 137 | u"Ĭ":"I", 138 | u"Ǐ":"I", 139 | u"Î":"I", 140 | u"Ï":"I", 141 | u"Ḯ":"I", 142 | u"İ":"I", 143 | u"Ị":"I", 144 | u"Ȉ":"I", 145 | u"Ì":"I", 146 | u"Ỉ":"I", 147 | u"Ȋ":"I", 148 | u"Ī":"I", 149 | u"Į":"I", 150 | u"Ɨ":"I", 151 | u"Ĩ":"I", 152 | u"Ḭ":"I", 153 | u"Ꝺ":"D", 154 | u"Ꝼ":"F", 155 | u"Ᵹ":"G", 156 | u"Ꞃ":"R", 157 | u"Ꞅ":"S", 158 | u"Ꞇ":"T", 159 | u"Ꝭ":"IS", 160 | u"Ĵ":"J", 161 | u"Ɉ":"J", 162 | u"Ḱ":"K", 163 | u"Ǩ":"K", 164 | u"Ķ":"K", 165 | u"Ⱪ":"K", 166 | u"Ꝃ":"K", 167 | u"Ḳ":"K", 168 | u"Ƙ":"K", 169 | u"Ḵ":"K", 170 | u"Ꝁ":"K", 171 | u"Ꝅ":"K", 172 |
u"Ĺ":"L", 173 | u"Ƚ":"L", 174 | u"Ľ":"L", 175 | u"Ļ":"L", 176 | u"Ḽ":"L", 177 | u"Ḷ":"L", 178 | u"Ḹ":"L", 179 | u"Ⱡ":"L", 180 | u"Ꝉ":"L", 181 | u"Ḻ":"L", 182 | u"Ŀ":"L", 183 | u"Ɫ":"L", 184 | u"Lj":"L", 185 | u"Ł":"L", 186 | u"LJ":"LJ", 187 | u"Ḿ":"M", 188 | u"Ṁ":"M", 189 | u"Ṃ":"M", 190 | u"Ɱ":"M", 191 | u"Ń":"N", 192 | u"Ň":"N", 193 | u"Ņ":"N", 194 | u"Ṋ":"N", 195 | u"Ṅ":"N", 196 | u"Ṇ":"N", 197 | u"Ǹ":"N", 198 | u"Ɲ":"N", 199 | u"Ṉ":"N", 200 | u"Ƞ":"N", 201 | u"Nj":"N", 202 | u"Ñ":"N", 203 | u"NJ":"NJ", 204 | u"Ó":"O", 205 | u"Ŏ":"O", 206 | u"Ǒ":"O", 207 | u"Ô":"O", 208 | u"Ố":"O", 209 | u"Ộ":"O", 210 | u"Ồ":"O", 211 | u"Ổ":"O", 212 | u"Ỗ":"O", 213 | u"Ö":"O", 214 | u"Ȫ":"O", 215 | u"Ȯ":"O", 216 | u"Ȱ":"O", 217 | u"Ọ":"O", 218 | u"Ő":"O", 219 | u"Ȍ":"O", 220 | u"Ò":"O", 221 | u"Ỏ":"O", 222 | u"Ơ":"O", 223 | u"Ớ":"O", 224 | u"Ợ":"O", 225 | u"Ờ":"O", 226 | u"Ở":"O", 227 | u"Ỡ":"O", 228 | u"Ȏ":"O", 229 | u"Ꝋ":"O", 230 | u"Ꝍ":"O", 231 | u"Ō":"O", 232 | u"Ṓ":"O", 233 | u"Ṑ":"O", 234 | u"Ɵ":"O", 235 | u"Ǫ":"O", 236 | u"Ǭ":"O", 237 | u"Ø":"O", 238 | u"Ǿ":"O", 239 | u"Õ":"O", 240 | u"Ṍ":"O", 241 | u"Ṏ":"O", 242 | u"Ȭ":"O", 243 | u"Ƣ":"OI", 244 | u"Ꝏ":"OO", 245 | u"Ɛ":"E", 246 | u"Ɔ":"O", 247 | u"Ȣ":"OU", 248 | u"Ṕ":"P", 249 | u"Ṗ":"P", 250 | u"Ꝓ":"P", 251 | u"Ƥ":"P", 252 | u"Ꝕ":"P", 253 | u"Ᵽ":"P", 254 | u"Ꝑ":"P", 255 | u"Ꝙ":"Q", 256 | u"Ꝗ":"Q", 257 | u"Ŕ":"R", 258 | u"Ř":"R", 259 | u"Ŗ":"R", 260 | u"Ṙ":"R", 261 | u"Ṛ":"R", 262 | u"Ṝ":"R", 263 | u"Ȑ":"R", 264 | u"Ȓ":"R", 265 | u"Ṟ":"R", 266 | u"Ɍ":"R", 267 | u"Ɽ":"R", 268 | u"Ꜿ":"C", 269 | u"Ǝ":"E", 270 | u"Ś":"S", 271 | u"Ṥ":"S", 272 | u"Š":"S", 273 | u"Ṧ":"S", 274 | u"Ş":"S", 275 | u"Ŝ":"S", 276 | u"Ș":"S", 277 | u"Ṡ":"S", 278 | u"Ṣ":"S", 279 | u"Ṩ":"S", 280 | u"Ť":"T", 281 | u"Ţ":"T", 282 | u"Ṱ":"T", 283 | u"Ț":"T", 284 | u"Ⱦ":"T", 285 | u"Ṫ":"T", 286 | u"Ṭ":"T", 287 | u"Ƭ":"T", 288 | u"Ṯ":"T", 289 | u"Ʈ":"T", 290 | u"Ŧ":"T", 291 | u"Ɐ":"A", 292 | u"Ꞁ":"L", 293 | u"Ɯ":"M", 294 | u"Ʌ":"V", 295 | u"Ꜩ":"TZ", 296 | u"Ú":"U", 297 | u"Ŭ":"U", 298 | u"Ǔ":"U", 299 | u"Û":"U", 300 | u"Ṷ":"U", 301 | u"Ü":"U", 302 | u"Ǘ":"U", 303 | u"Ǚ":"U", 304 | u"Ǜ":"U", 305 | u"Ǖ":"U", 306 | u"Ṳ":"U", 307 | u"Ụ":"U", 308 | u"Ű":"U", 309 | u"Ȕ":"U", 310 | u"Ù":"U", 311 | u"Ủ":"U", 312 | u"Ư":"U", 313 | u"Ứ":"U", 314 | u"Ự":"U", 315 | u"Ừ":"U", 316 | u"Ử":"U", 317 | u"Ữ":"U", 318 | u"Ȗ":"U", 319 | u"Ū":"U", 320 | u"Ṻ":"U", 321 | u"Ų":"U", 322 | u"Ů":"U", 323 | u"Ũ":"U", 324 | u"Ṹ":"U", 325 | u"Ṵ":"U", 326 | u"Ꝟ":"V", 327 | u"Ṿ":"V", 328 | u"Ʋ":"V", 329 | u"Ṽ":"V", 330 | u"Ꝡ":"VY", 331 | u"Ẃ":"W", 332 | u"Ŵ":"W", 333 | u"Ẅ":"W", 334 | u"Ẇ":"W", 335 | u"Ẉ":"W", 336 | u"Ẁ":"W", 337 | u"Ⱳ":"W", 338 | u"Ẍ":"X", 339 | u"Ẋ":"X", 340 | u"Ý":"Y", 341 | u"Ŷ":"Y", 342 | u"Ÿ":"Y", 343 | u"Ẏ":"Y", 344 | u"Ỵ":"Y", 345 | u"Ỳ":"Y", 346 | u"Ƴ":"Y", 347 | u"Ỷ":"Y", 348 | u"Ỿ":"Y", 349 | u"Ȳ":"Y", 350 | u"Ɏ":"Y", 351 | u"Ỹ":"Y", 352 | u"Ź":"Z", 353 | u"Ž":"Z", 354 | u"Ẑ":"Z", 355 | u"Ⱬ":"Z", 356 | u"Ż":"Z", 357 | u"Ẓ":"Z", 358 | u"Ȥ":"Z", 359 | u"Ẕ":"Z", 360 | u"Ƶ":"Z", 361 | u"IJ":"IJ", 362 | u"Œ":"OE", 363 | u"ᴀ":"A", 364 | u"ᴁ":"AE", 365 | u"ʙ":"B", 366 | u"ᴃ":"B", 367 | u"ᴄ":"C", 368 | u"ᴅ":"D", 369 | u"ᴇ":"E", 370 | u"ꜰ":"F", 371 | u"ɢ":"G", 372 | u"ʛ":"G", 373 | u"ʜ":"H", 374 | u"ɪ":"I", 375 | u"ʁ":"R", 376 | u"ᴊ":"J", 377 | u"ᴋ":"K", 378 | u"ʟ":"L", 379 | u"ᴌ":"L", 380 | u"ᴍ":"M", 381 | u"ɴ":"N", 382 | u"ᴏ":"O", 383 | u"ɶ":"OE", 384 | u"ᴐ":"O", 385 | u"ᴕ":"OU", 386 | u"ᴘ":"P", 387 | u"ʀ":"R", 388 | u"ᴎ":"N", 389 | u"ᴙ":"R", 390 | u"ꜱ":"S", 391 | u"ᴛ":"T", 392 | u"ⱻ":"E", 393 | 
u"ᴚ":"R", 394 | u"ᴜ":"U", 395 | u"ᴠ":"V", 396 | u"ᴡ":"W", 397 | u"ʏ":"Y", 398 | u"ᴢ":"Z", 399 | u"á":"a", 400 | #"á":"a", 401 | u"ă":"a", 402 | u"ắ":"a", 403 | u"ặ":"a", 404 | u"ằ":"a", 405 | u"ẳ":"a", 406 | u"ẵ":"a", 407 | u"ǎ":"a", 408 | u"â":"a", 409 | u"ấ":"a", 410 | u"ậ":"a", 411 | u"ầ":"a", 412 | u"ẩ":"a", 413 | u"ẫ":"a", 414 | u"ä":"a", 415 | u"ǟ":"a", 416 | u"ȧ":"a", 417 | u"ǡ":"a", 418 | u"ạ":"a", 419 | u"ȁ":"a", 420 | u"à":"a", 421 | u"ả":"a", 422 | u"ȃ":"a", 423 | u"ā":"a", 424 | u"ą":"a", 425 | u"ᶏ":"a", 426 | u"ẚ":"a", 427 | u"å":"a", 428 | u"ǻ":"a", 429 | u"ḁ":"a", 430 | u"ⱥ":"a", 431 | u"ã":"a", 432 | u"ꜳ":"aa", 433 | u"æ":"ae", 434 | u"ǽ":"ae", 435 | u"ǣ":"ae", 436 | u"ꜵ":"ao", 437 | u"ꜷ":"au", 438 | u"ꜹ":"av", 439 | u"ꜻ":"av", 440 | u"ꜽ":"ay", 441 | u"ḃ":"b", 442 | u"ḅ":"b", 443 | u"ɓ":"b", 444 | u"ḇ":"b", 445 | u"ᵬ":"b", 446 | u"ᶀ":"b", 447 | u"ƀ":"b", 448 | u"ƃ":"b", 449 | u"ɵ":"o", 450 | u"ć":"c", 451 | u"č":"c", 452 | u"ç":"c", 453 | u"ḉ":"c", 454 | u"ĉ":"c", 455 | u"ɕ":"c", 456 | u"ċ":"c", 457 | u"ƈ":"c", 458 | u"ȼ":"c", 459 | u"ď":"d", 460 | u"ḑ":"d", 461 | u"ḓ":"d", 462 | u"ȡ":"d", 463 | u"ḋ":"d", 464 | u"ḍ":"d", 465 | u"ɗ":"d", 466 | u"ᶑ":"d", 467 | u"ḏ":"d", 468 | u"ᵭ":"d", 469 | u"ᶁ":"d", 470 | u"đ":"d", 471 | u"ɖ":"d", 472 | u"ƌ":"d", 473 | u"ı":"i", 474 | u"ȷ":"j", 475 | u"ɟ":"j", 476 | u"ʄ":"j", 477 | u"dz":"dz", 478 | u"dž":"dz", 479 | u"é":"e", 480 | u"ĕ":"e", 481 | u"ě":"e", 482 | u"ȩ":"e", 483 | u"ḝ":"e", 484 | u"ê":"e", 485 | u"ế":"e", 486 | u"ệ":"e", 487 | u"ề":"e", 488 | u"ể":"e", 489 | u"ễ":"e", 490 | u"ḙ":"e", 491 | u"ë":"e", 492 | u"ė":"e", 493 | u"ẹ":"e", 494 | u"ȅ":"e", 495 | u"è":"e", 496 | u"ẻ":"e", 497 | u"ȇ":"e", 498 | u"ē":"e", 499 | u"ḗ":"e", 500 | u"ḕ":"e", 501 | u"ⱸ":"e", 502 | u"ę":"e", 503 | u"ᶒ":"e", 504 | u"ɇ":"e", 505 | u"ẽ":"e", 506 | u"ḛ":"e", 507 | u"ꝫ":"et", 508 | u"ḟ":"f", 509 | u"ƒ":"f", 510 | u"ᵮ":"f", 511 | u"ᶂ":"f", 512 | u"ǵ":"g", 513 | u"ğ":"g", 514 | u"ǧ":"g", 515 | u"ģ":"g", 516 | u"ĝ":"g", 517 | u"ġ":"g", 518 | u"ɠ":"g", 519 | u"ḡ":"g", 520 | u"ᶃ":"g", 521 | u"ǥ":"g", 522 | u"ḫ":"h", 523 | u"ȟ":"h", 524 | u"ḩ":"h", 525 | u"ĥ":"h", 526 | u"ⱨ":"h", 527 | u"ḧ":"h", 528 | u"ḣ":"h", 529 | u"ḥ":"h", 530 | u"ɦ":"h", 531 | u"ẖ":"h", 532 | u"ħ":"h", 533 | u"ƕ":"hv", 534 | u"í":"i", 535 | u"ĭ":"i", 536 | u"ǐ":"i", 537 | u"î":"i", 538 | u"ï":"i", 539 | u"ḯ":"i", 540 | u"ị":"i", 541 | u"ȉ":"i", 542 | u"ì":"i", 543 | u"ỉ":"i", 544 | u"ȋ":"i", 545 | u"ī":"i", 546 | u"į":"i", 547 | u"ᶖ":"i", 548 | u"ɨ":"i", 549 | u"ĩ":"i", 550 | u"ḭ":"i", 551 | u"ꝺ":"d", 552 | u"ꝼ":"f", 553 | u"ᵹ":"g", 554 | u"ꞃ":"r", 555 | u"ꞅ":"s", 556 | u"ꞇ":"t", 557 | u"ꝭ":"is", 558 | u"ǰ":"j", 559 | u"ĵ":"j", 560 | u"ʝ":"j", 561 | u"ɉ":"j", 562 | u"ḱ":"k", 563 | u"ǩ":"k", 564 | u"ķ":"k", 565 | u"ⱪ":"k", 566 | u"ꝃ":"k", 567 | u"ḳ":"k", 568 | u"ƙ":"k", 569 | u"ḵ":"k", 570 | u"ᶄ":"k", 571 | u"ꝁ":"k", 572 | u"ꝅ":"k", 573 | u"ĺ":"l", 574 | u"ƚ":"l", 575 | u"ɬ":"l", 576 | u"ľ":"l", 577 | u"ļ":"l", 578 | u"ḽ":"l", 579 | u"ȴ":"l", 580 | u"ḷ":"l", 581 | u"ḹ":"l", 582 | u"ⱡ":"l", 583 | u"ꝉ":"l", 584 | u"ḻ":"l", 585 | u"ŀ":"l", 586 | u"ɫ":"l", 587 | u"ᶅ":"l", 588 | u"ɭ":"l", 589 | u"ł":"l", 590 | u"lj":"lj", 591 | u"ſ":"s", 592 | u"ẜ":"s", 593 | u"ẛ":"s", 594 | u"ẝ":"s", 595 | u"ḿ":"m", 596 | u"ṁ":"m", 597 | u"ṃ":"m", 598 | u"ɱ":"m", 599 | u"ᵯ":"m", 600 | u"ᶆ":"m", 601 | u"ń":"n", 602 | u"ň":"n", 603 | u"ņ":"n", 604 | u"ṋ":"n", 605 | u"ȵ":"n", 606 | u"ṅ":"n", 607 | u"ṇ":"n", 608 | u"ǹ":"n", 609 | u"ɲ":"n", 610 | u"ṉ":"n", 611 | u"ƞ":"n", 612 | u"ᵰ":"n", 613 | u"ᶇ":"n", 614 | 
u"ɳ":"n", 615 | u"ñ":"n", 616 | u"nj":"nj", 617 | u"ó":"o", 618 | u"ŏ":"o", 619 | u"ǒ":"o", 620 | u"ô":"o", 621 | u"ố":"o", 622 | u"ộ":"o", 623 | u"ồ":"o", 624 | u"ổ":"o", 625 | u"ỗ":"o", 626 | u"ö":"o", 627 | u"ȫ":"o", 628 | u"ȯ":"o", 629 | u"ȱ":"o", 630 | u"ọ":"o", 631 | u"ő":"o", 632 | u"ȍ":"o", 633 | u"ò":"o", 634 | u"ỏ":"o", 635 | u"ơ":"o", 636 | u"ớ":"o", 637 | u"ợ":"o", 638 | u"ờ":"o", 639 | u"ở":"o", 640 | u"ỡ":"o", 641 | u"ȏ":"o", 642 | u"ꝋ":"o", 643 | u"ꝍ":"o", 644 | u"ⱺ":"o", 645 | u"ō":"o", 646 | u"ṓ":"o", 647 | u"ṑ":"o", 648 | u"ǫ":"o", 649 | u"ǭ":"o", 650 | u"ø":"o", 651 | u"ǿ":"o", 652 | u"õ":"o", 653 | u"ṍ":"o", 654 | u"ṏ":"o", 655 | u"ȭ":"o", 656 | u"ƣ":"oi", 657 | u"ꝏ":"oo", 658 | u"ɛ":"e", 659 | u"ᶓ":"e", 660 | u"ɔ":"o", 661 | u"ᶗ":"o", 662 | u"ȣ":"ou", 663 | u"ṕ":"p", 664 | u"ṗ":"p", 665 | u"ꝓ":"p", 666 | u"ƥ":"p", 667 | u"ᵱ":"p", 668 | u"ᶈ":"p", 669 | u"ꝕ":"p", 670 | u"ᵽ":"p", 671 | u"ꝑ":"p", 672 | u"ꝙ":"q", 673 | u"ʠ":"q", 674 | u"ɋ":"q", 675 | u"ꝗ":"q", 676 | u"ŕ":"r", 677 | u"ř":"r", 678 | u"ŗ":"r", 679 | u"ṙ":"r", 680 | u"ṛ":"r", 681 | u"ṝ":"r", 682 | u"ȑ":"r", 683 | u"ɾ":"r", 684 | u"ᵳ":"r", 685 | u"ȓ":"r", 686 | u"ṟ":"r", 687 | u"ɼ":"r", 688 | u"ᵲ":"r", 689 | u"ᶉ":"r", 690 | u"ɍ":"r", 691 | u"ɽ":"r", 692 | u"ↄ":"c", 693 | u"ꜿ":"c", 694 | u"ɘ":"e", 695 | u"ɿ":"r", 696 | u"ś":"s", 697 | u"ṥ":"s", 698 | u"š":"s", 699 | u"ṧ":"s", 700 | u"ş":"s", 701 | u"ŝ":"s", 702 | u"ș":"s", 703 | u"ṡ":"s", 704 | u"ṣ":"s", 705 | u"ṩ":"s", 706 | u"ʂ":"s", 707 | u"ᵴ":"s", 708 | u"ᶊ":"s", 709 | u"ȿ":"s", 710 | u"ɡ":"g", 711 | u"ᴑ":"o", 712 | u"ᴓ":"o", 713 | u"ᴝ":"u", 714 | u"ť":"t", 715 | u"ţ":"t", 716 | u"ṱ":"t", 717 | u"ț":"t", 718 | u"ȶ":"t", 719 | u"ẗ":"t", 720 | u"ⱦ":"t", 721 | u"ṫ":"t", 722 | u"ṭ":"t", 723 | u"ƭ":"t", 724 | u"ṯ":"t", 725 | u"ᵵ":"t", 726 | u"ƫ":"t", 727 | u"ʈ":"t", 728 | u"ŧ":"t", 729 | u"ᵺ":"th", 730 | u"ɐ":"a", 731 | u"ᴂ":"ae", 732 | u"ǝ":"e", 733 | u"ᵷ":"g", 734 | u"ɥ":"h", 735 | u"ʮ":"h", 736 | u"ʯ":"h", 737 | u"ᴉ":"i", 738 | u"ʞ":"k", 739 | u"ꞁ":"l", 740 | u"ɯ":"m", 741 | u"ɰ":"m", 742 | u"ᴔ":"oe", 743 | u"ɹ":"r", 744 | u"ɻ":"r", 745 | u"ɺ":"r", 746 | u"ⱹ":"r", 747 | u"ʇ":"t", 748 | u"ʌ":"v", 749 | u"ʍ":"w", 750 | u"ʎ":"y", 751 | u"ꜩ":"tz", 752 | u"ú":"u", 753 | u"ŭ":"u", 754 | u"ǔ":"u", 755 | u"û":"u", 756 | u"ṷ":"u", 757 | u"ü":"u", 758 | u"ǘ":"u", 759 | u"ǚ":"u", 760 | u"ǜ":"u", 761 | u"ǖ":"u", 762 | u"ṳ":"u", 763 | u"ụ":"u", 764 | u"ű":"u", 765 | u"ȕ":"u", 766 | u"ù":"u", 767 | u"ủ":"u", 768 | u"ư":"u", 769 | u"ứ":"u", 770 | u"ự":"u", 771 | u"ừ":"u", 772 | u"ử":"u", 773 | u"ữ":"u", 774 | u"ȗ":"u", 775 | u"ū":"u", 776 | u"ṻ":"u", 777 | u"ų":"u", 778 | u"ᶙ":"u", 779 | u"ů":"u", 780 | u"ũ":"u", 781 | u"ṹ":"u", 782 | u"ṵ":"u", 783 | u"ᵫ":"ue", 784 | u"ꝸ":"um", 785 | u"ⱴ":"v", 786 | u"ꝟ":"v", 787 | u"ṿ":"v", 788 | u"ʋ":"v", 789 | u"ᶌ":"v", 790 | u"ⱱ":"v", 791 | u"ṽ":"v", 792 | u"ꝡ":"vy", 793 | u"ẃ":"w", 794 | u"ŵ":"w", 795 | u"ẅ":"w", 796 | u"ẇ":"w", 797 | u"ẉ":"w", 798 | u"ẁ":"w", 799 | u"ⱳ":"w", 800 | u"ẘ":"w", 801 | u"ẍ":"x", 802 | u"ẋ":"x", 803 | u"ᶍ":"x", 804 | u"ý":"y", 805 | u"ŷ":"y", 806 | u"ÿ":"y", 807 | u"ẏ":"y", 808 | u"ỵ":"y", 809 | u"ỳ":"y", 810 | u"ƴ":"y", 811 | u"ỷ":"y", 812 | u"ỿ":"y", 813 | u"ȳ":"y", 814 | u"ẙ":"y", 815 | u"ɏ":"y", 816 | u"ỹ":"y", 817 | u"ź":"z", 818 | u"ž":"z", 819 | u"ẑ":"z", 820 | u"ʑ":"z", 821 | u"ⱬ":"z", 822 | u"ż":"z", 823 | u"ẓ":"z", 824 | u"ȥ":"z", 825 | u"ẕ":"z", 826 | u"ᵶ":"z", 827 | u"ᶎ":"z", 828 | u"ʐ":"z", 829 | u"ƶ":"z", 830 | u"ɀ":"z", 831 | u"ff":"ff", 832 | u"ffi":"ffi", 833 | u"ffl":"ffl", 834 | u"fi":"fi", 
835 | u"fl":"fl", 836 | u"ij":"ij", 837 | u"œ":"oe", 838 | u"st":"st", 839 | u"ₐ":"a", 840 | u"ₑ":"e", 841 | u"ᵢ":"i", 842 | u"ⱼ":"j", 843 | u"ₒ":"o", 844 | u"ᵣ":"r", 845 | u"ᵤ":"u", 846 | u"ᵥ":"v", 847 | u"ₓ":"x", 848 | } 849 | 850 | 851 | 852 | def latinize(old_str): 853 | old_str = old_str.lower() 854 | new_a = "" 855 | for a in old_str: 856 | new_a += str(latin_map.get(a, a)) 857 | return new_a 858 | 859 | def str_filter_sub(old_str): 860 | old_str_sub = re.sub(RE_SUB_STRING, "", old_str) 861 | new_str = re.sub(RE_SPECIAL_STRING, '', old_str_sub) 862 | return new_str 863 | 864 | def str_filter(old_str): 865 | return re.sub(RE_SPECIAL_STRING, '', old_str).strip() 866 | 867 | def remove_punct(input_str): 868 | if not input_str: 869 | return input_str 870 | del_estr = string.punctuation 871 | replace = " "*len(del_estr) 872 | tran_tab = string.maketrans(del_estr, replace) 873 | input_str = input_str.translate(tran_tab) 874 | return " ".join(input_str.split()) 875 | 876 | def str_sub(old_str): 877 | old_str = old_str.lower() 878 | new_str = re.sub(RE_SUB_STRING, "", old_str).strip() 879 | if new_str.find(" - ") != -1: 880 | new_str = new_str[:new_str.find(" - ")] 881 | new_str = latinize(new_str) 882 | new_str = remove_punct(new_str.strip()) 883 | return new_str 884 | 885 | def str_sim(str1_old, str2_old): 886 | ''' 887 | warning: do not str1=str(str1) 888 | ''' 889 | str1 = str(str1_old) 890 | str2 = str(str2_old) 891 | 892 | format_str1 = str_filter(str1.lower().strip()) 893 | format_str2 = str_filter(str2.lower().strip()) 894 | if format_str1 == format_str2 or format_str1.find(format_str2) != -1 or format_str2.find(format_str1) != -1: 895 | return True, "" 896 | 897 | format_str1 = str_filter_sub(str1.lower().strip()) 898 | format_str2 = str_filter_sub(str2.lower().strip()) 899 | ratio = fuzz.ratio(format_str1, format_str2) 900 | return ratio >= THREADHOLD or format_str1 == format_str2 or format_str1.find(format_str2) != -1 or format_str2.find(format_str1) != -1 , str(ratio) 901 | 902 | -------------------------------------------------------------------------------- /acrcloud_scan_files_python.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | import os 5 | import sys 6 | import time 7 | import json 8 | import codecs 9 | import optparse 10 | import logging 11 | import openpyxl 12 | from backports import csv 13 | from openpyxl import Workbook 14 | from acrcloud_logger import AcrcloudLogger 15 | from acrcloud_filter_libary import FilterWorker 16 | from acrcloud.recognizer import ACRCloudRecognizer 17 | 18 | if sys.version_info.major == 2: 19 | reload(sys) 20 | sys.setdefaultencoding("utf8") 21 | 22 | 23 | class ACRCloud_Scan_Files: 24 | 25 | def __init__(self, config_file): 26 | self.config = { 27 | 'host': '', 28 | 'access_key': '', 29 | 'access_secret': '', 30 | 'debug': False, 31 | 'timeout': 10 # seconds 32 | } 33 | self.openpyxl_version = ".".join(str(openpyxl.__version__).split(".")[:2]) 34 | self.config_file = config_file 35 | self.init_log() 36 | self.init_config() 37 | 38 | def init_log(self): 39 | self.dlog = AcrcloudLogger('ACRCloud_ScanF', logging.INFO) 40 | if not self.dlog.addFilehandler(logfile="log_scan_files.log", logdir="./", loglevel=logging.WARN): 41 | sys.exit(1) 42 | if not self.dlog.addStreamHandler(): 43 | sys.exit(1) 44 | 45 | def init_config(self): 46 | try: 47 | json_config = None 48 | with codecs.open(self.config_file, 'r') as f: 49 | json_config = 
json.loads(f.read()) 50 | for k in ["host", "access_key", "access_secret"]: 51 | if k in json_config and json_config[k].strip(): 52 | self.config[k] = str(json_config[k].strip()) 53 | else: 54 | self.dlog.logger.error("init_config.not found {0} from config.json, please check".format(k)) 55 | sys.exit(1) 56 | 57 | self.re_handler = ACRCloudRecognizer(self.config) 58 | if self.re_handler: 59 | self.dlog.logger.warning("init_config success!") 60 | except Exception as e: 61 | self.dlog.logger.error("init_config.error", exc_info=True) 62 | 63 | def read_file(self, infile, jFirst=True): 64 | with open(infile, "rb") as rfile: 65 | for line in rfile: 66 | if jFirst: 67 | jFirst = False 68 | continue 69 | yield line.strip() 70 | 71 | def write_error(self, file_path, error_time, error_detail): 72 | with open('error_scan.txt', 'a', ) as f: 73 | msg = file_path + '||' + str(error_time) + '||' + str(error_detail) + '\n' 74 | f.write(msg) 75 | 76 | def empty_error_scan(self): 77 | if os.path.exists('error_scan.txt'): 78 | os.remove('error_scan.txt') 79 | 80 | def export_to_csv(self, result_list, export_filename="ACRCloud_ScanFile_Results.csv", export_dir="./"): 81 | try: 82 | results = [] 83 | for item in result_list: 84 | filename = item["file"] 85 | timestamp = item["timestamp"] 86 | jsoninfo = item["result"] 87 | if "status" in jsoninfo and jsoninfo["status"]["code"] == 0: 88 | row = self.parse_data(jsoninfo) 89 | row = [filename, timestamp] + list(row) 90 | results.append(row) 91 | 92 | export_filepath = os.path.join(export_dir, export_filename) 93 | 94 | with codecs.open(export_filepath, 'w', 'utf-8-sig') as f: 95 | head_row = ['filename', 'timestamp', 'title', 'artists', 'album', 'acrid', 'played_duration', 'label', 96 | 'isrc', 'upc', 'deezer', 'spotify', 'itunes', 'youtube', 'custom_files_title', 'audio_id'] 97 | dw = csv.writer(f) 98 | dw.writerow(head_row) 99 | dw.writerows(results) 100 | self.dlog.logger.info("export_to_csv.Save Data to csv: {0}".format(export_filepath)) 101 | except Exception as e: 102 | self.dlog.logger.error("Error export_to_csv", exc_info=True) 103 | 104 | def export_to_json(self, result_list, export_filename="ACRCloud_ScanFile_Results.json", export_dir="./"): 105 | try: 106 | results = [] 107 | json_results = [] 108 | export_filepath = os.path.join(export_dir, export_filename) 109 | 110 | head_row = ['filename', 'timestamp', 'title', 'artists', 'album', 'acrid', 'played_duration', 'label', 111 | 'isrc', 'upc', 'deezer', 'spotify', 'itunes', 'youtube', 'custom_files_title', 'audio_id'] 112 | 113 | for item in result_list: 114 | filename = item["file"] 115 | timestamp = item["timestamp"] 116 | jsoninfo = item["result"] 117 | if "status" in jsoninfo and jsoninfo["status"]["code"] == 0: 118 | row = self.parse_data(jsoninfo) 119 | row = [filename, timestamp] + list(row) 120 | results.append(row) 121 | 122 | for i in results: 123 | new_results = {} #a fresh dict per row; reusing one dict (as before) made every JSON row identical 124 | for k in range(len(head_row)): 125 | new_results[head_row[k]] = i[k] 126 | json_results.append(new_results) 127 | 128 | 129 | with codecs.open(export_filepath, 'w', 'utf-8-sig') as f: 130 | f.write(json.dumps(json_results)) 131 | except Exception as e: 132 | self.dlog.logger.error("Error export_to_json", exc_info=True) 133 | 134 | def export_to_xlsx(self, result_list, export_filename="ACRCloud_ScanFile_Results.xlsx", export_dir="./"): 135 | try: 136 | wb = Workbook() 137 | sheet_channels = wb.active 138 | sheet_channels.title = "Results" 139 | head_row = ['filename', 'timestamp', 'title', 'artists', 'album', 'acrid',
'played_duration', 'label', 140 | 'isrc', 'upc', 'deezer', 'spotify', 'itunes', 'youtube', 'custom_files_title', 'audio_id'] 141 | sheet_channels.append(head_row) 142 | 143 | for item in result_list: 144 | filename = item["file"] 145 | timestamp = item["timestamp"] 146 | jsoninfo = item["result"] 147 | if "status" in jsoninfo and jsoninfo["status"]["code"] == 0: 148 | row = self.parse_data(jsoninfo) 149 | row = [filename, timestamp] + list(row) 150 | sheet_channels.append(row) 151 | 152 | export_filepath = os.path.join(export_dir, export_filename) 153 | 154 | for column_cells in sheet_channels.columns: 155 | length = max(len(str(cell.value) if cell.value else "") for cell in column_cells) 156 | if length > 100: 157 | length = 100 #cap the column width (the original "length == 100" was a no-op comparison) 158 | if self.openpyxl_version >= "2.6": 159 | sheet_channels.column_dimensions[column_cells[0].column_letter].width = length 160 | else: 161 | sheet_channels.column_dimensions[column_cells[0].column].width = length 162 | wb.save(export_filepath) 163 | 164 | self.dlog.logger.info("export_to_xlsx.Save Data to xlsx: {0}".format(export_filepath)) 165 | except Exception as e: 166 | self.dlog.logger.error("Error export_to_xlsx", exc_info=True) 167 | 168 | def parse_data(self, jsoninfo): 169 | try: 170 | title, played_duration, isrc, upc, acrid, label, album = [""] * 7 171 | artists, deezer, spotify, itunes, youtube, custom_files_title, audio_id = [""] * 7 172 | 173 | metadata = jsoninfo.get('metadata', {}) 174 | played_duration = metadata.get("played_duration", "") 175 | if "music" in metadata and len(metadata["music"]) > 0: 176 | item = metadata["music"][0] 177 | title = item.get("title", "") 178 | offset = item.get("play_offset_ms", "") 179 | if "external_ids" in item: 180 | if "isrc" in item["external_ids"]: 181 | isrc_obj = item["external_ids"]["isrc"] 182 | isrc = isrc_obj[0] if type(isrc_obj) == list else isrc_obj 183 | if "upc" in item["external_ids"]: 184 | upc_obj = item["external_ids"]["upc"] 185 | upc = upc_obj[0] if type(upc_obj) == list else upc_obj 186 | acrid = item.get("acrid", "") 187 | label = item.get("label", "") 188 | album = item.get("album", {"name": ""}).get("name", "") 189 | artists = ",".join([ar["name"] for ar in item.get('artists', [{"name": ""}]) if ar.get("name")]) 190 | if "external_metadata" in item: 191 | e_metadata = item["external_metadata"] 192 | if "deezer" in e_metadata: 193 | deezer_obj = e_metadata["deezer"] 194 | deezer = deezer_obj[0]["track"]["id"] if type(deezer_obj) == list else deezer_obj["track"]["id"] 195 | if "spotify" in e_metadata: 196 | spotify_obj = e_metadata["spotify"] 197 | spotify = spotify_obj[0]["track"]["id"] if type(spotify_obj) == list else spotify_obj["track"]["id"] 198 | if "itunes" in e_metadata: #itunes appears in head_row but was never filled; parse it like deezer/spotify 199 | itunes_obj = e_metadata["itunes"] 200 | itunes = itunes_obj[0]["track"]["id"] if type(itunes_obj) == list else itunes_obj["track"]["id"] 201 | if "youtube" in e_metadata: 202 | youtube_obj = e_metadata["youtube"] 203 | youtube = youtube_obj[0]["vid"] if type(youtube_obj) == list else youtube_obj["vid"] 204 | if "custom_files" in metadata and len(metadata["custom_files"]) > 0: 205 | custom_item = metadata["custom_files"][0] 206 | custom_files_title = custom_item.get("title", "") 207 | audio_id = custom_item.get("audio_id", "") 208 | except Exception as e: 209 | self.dlog.logger.error("parse_data.error.data:{0}".format(metadata), exc_info=True) 210 | 211 | res = (title, artists, album, acrid, played_duration, label, isrc, upc, deezer, spotify, itunes, youtube, custom_files_title, audio_id) 212 | return res 213 | 214 | def apply_filter(self, results): 215 | fworker = FilterWorker() 216 | result_new = fworker.apply_filter(results) 217 | return result_new 218 | 219
219 | def do_recognize(self, filepath, start_time, rec_length):
220 | current_time = time.strftime('%H:%M:%S', time.gmtime(start_time))  # computed before the try so the except path can return it
221 | try:
222 | res_data = self.re_handler.recognize_by_file(filepath, start_time, rec_length)
223 | return filepath, current_time, res_data
224 | except Exception as e:
225 | self.dlog.logger.error("do_recognize.error.({0}, {1}, {2})".format(filepath, start_time, rec_length),
226 | exc_info=True)
227 | return filepath, current_time, None
228 |
229 | def recognize_file(self, filepath, start_time, stop_time, step, rec_length, with_duration=0):
230 | self.dlog.logger.warning("scan_file.start_to_run: {0}".format(filepath))
231 |
232 | result = []
233 | for i in range(start_time, stop_time, step):
234 | filep, current_time, res_data = self.do_recognize(filepath, i, rec_length)
235 | try:
236 | print(res_data)
237 | jsoninfo = json.loads(res_data)
238 | code = jsoninfo['status']['code']
239 | msg = jsoninfo['status']['msg']
240 | if "status" in jsoninfo and jsoninfo["status"]["code"] == 0:
241 | result.append(
242 | {"timestamp": current_time, "rec_length": rec_length, "result": jsoninfo, "file": filep})
243 | res = self.parse_data(jsoninfo)
244 | # self.dlog.logger.info('recognize_file.(time:{0}, title: {1})'.format(current_time, res[0]))
245 | self.dlog.logger.info(
246 | 'recognize_file.(time:{0}, title: {1}, custom title: {2})'.format(current_time, res[0],
247 | res[-2]))
248 | if code == 2005:
249 | self.dlog.logger.warning('recognize_file.(time:{0}, code:{1}, Done!)'.format(current_time, code))
250 | break
251 | elif code == 1001:
252 | result.append(
253 | {"timestamp": current_time, "rec_length": rec_length, "result": jsoninfo, "file": filep})
254 | self.dlog.logger.info("recognize_file.(time:{0}, code:{1}, No_Result)".format(current_time, code))
255 | elif code == 3001:
256 | self.dlog.logger.error(
257 | 'recognize_file.(time:{0}, code:{1}, Missing/Invalid Access Key)'.format(current_time, code))
258 | break
259 | elif code == 3003:
260 | self.dlog.logger.error(
261 | 'recognize_file.(time:{0}, code:{1}, Limit exceeded)'.format(current_time, code))
262 | elif code == 3000:
263 | self.dlog.logger.error('recognize_file.(time:{0}, {1}, {2})'.format(current_time, code, msg))
264 | self.write_error(filepath, i, 'NETWORK ERROR')
265 | # the for loop already advances i by step; the old manual 'i += step' here was a no-op
266 | except Exception as e:
267 | self.dlog.logger.error('recognize_file.error', exc_info=True)
268 | self.write_error(filepath, i, 'JSON ERROR')
269 | return result
270 |
271 | def scan_file_main(self, option, start_time, stop_time):
272 | try:
273 | filepath = option.file_path
274 | step = option.step
275 | rec_length = option.rec_length
276 | with_duration = option.with_duration
277 | out_dir = option.out_dir
278 | if out_dir and not os.path.exists(out_dir):
279 | try:
280 | os.makedirs(out_dir)
281 | except Exception as e:
282 | self.dlog.logger.error("scan_file_main.create_out_dir_error:{0}, please check it!".format(out_dir),
283 | exc_info=True)
284 | return
285 |
286 | file_type = option.file_type
287 | if start_time == 0 and stop_time == 0:
288 | file_total_seconds = int(ACRCloudRecognizer.get_duration_ms_by_file(filepath) / 1000)
289 | results = self.recognize_file(filepath, start_time, file_total_seconds, step, rec_length, with_duration)
290 | else:
291 | results = self.recognize_file(filepath, start_time, stop_time, step, rec_length, with_duration)
292 |
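# Export step (below): results are written to out_dir as result-<input basename>.<ext>,
# and when -w/--with_duration is 1 a second, filtered set with the _with_duration
# suffix is produced from apply_filter().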
293 | filename_csv = 'result-' + os.path.basename(filepath.strip()) + '.csv'
294 | filename_xlsx = 'result-' + os.path.basename(filepath.strip()) + '.xlsx'
295 | filename_json = 'result-' + os.path.basename(filepath.strip()) + '.json'
296 |
297 | if results:
298 | if file_type == "csv":
299 | self.export_to_csv(results, filename_csv, out_dir)
300 | elif file_type == "json":
301 | self.export_to_json(results, filename_json, out_dir)
302 | else:
303 | self.export_to_xlsx(results, filename_xlsx, out_dir)
304 |
305 | if with_duration == 1:
306 | new_results = []
307 | if results:
308 | new_results = self.apply_filter(results)
309 |
310 | filename_with_duration_csv = 'result-' + os.path.basename(filepath.strip()) + '_with_duration.csv'
311 | filename_with_duration_xlsx = 'result-' + os.path.basename(filepath.strip()) + '_with_duration.xlsx'
312 | filename_with_duration_json = 'result-' + os.path.basename(filepath.strip()) + '_with_duration.json'
313 |
314 | if file_type == "csv":
315 | self.export_to_csv(new_results, filename_with_duration_csv, out_dir)
316 | elif file_type == "json":
317 |
318 | self.export_to_json(new_results, filename_with_duration_json, out_dir)
319 | else:
320 | self.export_to_xlsx(new_results, filename_with_duration_xlsx, out_dir)
321 | except Exception as e:
322 | self.dlog.logger.error("scan_file_main.error", exc_info=True)
323 | return
324 |
325 | def scan_folder_main(self, option, start_time, stop_time):
326 | try:
327 | path = option.folder_path
328 | file_list = os.listdir(path)
329 | for i in file_list:
330 | option.file_path = os.path.join(path, i)
331 | self.scan_file_main(option, start_time, stop_time)
332 | except Exception as e:
333 | self.dlog.logger.error("scan_folder_main.error", exc_info=True)
334 |
335 |
336 | if __name__ == '__main__':
337 | usage = r'''
338 | _ ____ ____ ____ _ _
339 | / \ / ___| _ \ / ___| | ___ _ _ __| |
340 | / _ \| | | |_) | | | |/ _ \| | | |/ _` |
341 | / ___ \ |___| _ <| |___| | (_) | |_| | (_| |
342 | /_/ \_\____|_| \_\\____|_|\___/ \____|\____|
343 |
344 | Usage:
345 | python acrcloud_scan_files_python.py -d folder_path
346 | python acrcloud_scan_files_python.py -f file_path
347 | Example:
348 | python acrcloud_scan_files_python.py -d ~/music
349 | python acrcloud_scan_files_python.py -f ~/testfiles/test.mp3
350 | If you want to change the scan interval or set a recognition range, you can add extra params
351 | Example:
352 | python acrcloud_scan_files_python.py -f ~/testfiles/test.mp3 -s 30 -r 0-20 -l 10
353 | python acrcloud_scan_files_python.py -d ~/music -s 30
354 | '''
355 |
356 | parser = optparse.OptionParser()
357 | parser.add_option('-f', '--file', dest='file_path', type='string', help='Scan file you want to recognize')
358 | parser.add_option('-c', '--config', dest='config', type='string', default="config.json", help='config file')
359 | parser.add_option('-d', '--folder', dest='folder_path', type='string', help='Scan folder you want to recognize')
360 | parser.add_option('-s', '--step', dest='step', type='int', default=10, help='step')
361 | parser.add_option('-l', '--rec_length', dest='rec_length', type='int', default=10, help='rec_length')
362 | parser.add_option('-e', '--error_file', dest='error_file', type='string', help='error scan file')
363 | parser.add_option('-r', '--range', dest='range', type='string', default='0-0', help='scan range: start-stop (seconds), e.g. 0-20')
364 | parser.add_option('-w', '--with_duration', dest="with_duration", type='int', default=0, help='with_duration')
365 | parser.add_option('-o', '--out_dir', dest="out_dir", type='string', default="./", help='out_dir')
366 | parser.add_option('-t', '--file_type', dest="file_type", type='string', default="csv", help='file_type')
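# The -r/--range value is split on '-' below into start and stop seconds; the
# default 0-0 makes scan_file_main derive the stop time from the file duration.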
367 | 368 | (options, args) = parser.parse_args() 369 | start = int(options.range.split('-')[0]) 370 | stop = int(options.range.split('-')[1]) 371 | 372 | asf = ACRCloud_Scan_Files(options.config) 373 | if options.file_path: 374 | asf.empty_error_scan() 375 | asf.scan_file_main(options, start, stop) 376 | elif options.folder_path: 377 | asf.empty_error_scan() 378 | asf.scan_folder_main(options, start, stop) 379 | else: 380 | print(usage) 381 | -------------------------------------------------------------------------------- /acrcloud_filter_libary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding:utf-8 -*- 3 | 4 | import os 5 | import sys 6 | import json 7 | import copy 8 | import math 9 | import datetime 10 | import traceback 11 | import tools_str_sim 12 | import acrcloud_logger 13 | from dateutil.relativedelta import * 14 | 15 | if sys.version_info.major == 2: 16 | reload(sys) 17 | sys.setdefaultencoding("utf8") 18 | 19 | NORESULT = "noResult" 20 | 21 | class ResultFilter: 22 | 23 | def __init__(self, dlog): 24 | self._dlog = dlog 25 | self._real_music = {} 26 | self._real_music_list_num = 3 27 | self._real_custom = {} 28 | self._real_custom_list_num = 3 29 | self._real_custom_valid_interval = 5*60 30 | self._delay_music = {} 31 | self._delay_music_last_result = {} 32 | self._delay_music_interval_threshold = 2*60 33 | self._delay_custom = {} 34 | self._delay_custom_played_duration_min = 2 35 | self._delay_list_max_num = 35 36 | self._delay_list_threshold = 120 37 | 38 | def get_mutil_result_title(self, data, itype='music', isize = 1): 39 | ret_list = [] 40 | index = 0 41 | json_res = data["result"] 42 | if json_res == NORESULT: 43 | return [NORESULT] 44 | try: 45 | if json_res['status']['code'] == 0: 46 | if itype == 'music': 47 | if 'metadata' in json_res and 'music' in json_res['metadata']: 48 | for item in json_res['metadata']['music']: 49 | ret_list.append(item['title']) 50 | index += 1 51 | if index >= isize: 52 | break 53 | elif 'metainfos' in json_res: 54 | for item in json_res['metainfos']: 55 | ret_list.append(item['title']) 56 | index += 1 57 | if index >= isize: 58 | break 59 | elif itype == 'custom': 60 | if 'metadata' in json_res and 'custom_files' in json_res['metadata']: 61 | for item in json_res['metadata']['custom_files']: 62 | ret_list.append(item['title']) 63 | index += 1 64 | if index >= isize: 65 | break 66 | except Exception as e: 67 | self._dlog.logger.error("Error@get_mutil_result_title", exc_info=True) 68 | self._dlog.logger.error("Error_Data: {0}".format(data)) 69 | return ret_list if ret_list else [NORESULT] 70 | 71 | def get_mutil_result_acrid(self, data, itype='music', isize = 1): 72 | ret_list = [] 73 | index = 0 74 | json_res = data["result"] 75 | if json_res == NORESULT: 76 | return [NORESULT] 77 | try: 78 | if json_res['status']['code'] == 0: 79 | if itype == 'music': 80 | if 'metadata' in json_res and 'music' in json_res['metadata']: 81 | for item in json_res['metadata']['music']: 82 | ret_list.append(item['acrid']) 83 | index += 1 84 | if index >= isize: 85 | break 86 | elif 'metainfos' in json_res: 87 | for item in json_res['metainfos']: 88 | ret_list.append(item['acrid']) 89 | index += 1 90 | if index >= isize: 91 | break 92 | elif itype == 'custom': 93 | if 'metadata' in json_res and 'custom_files' in json_res['metadata']: 94 | for item in json_res['metadata']['custom_files']: 95 | ret_list.append(item['acrid']) 96 | index += 1 97 | if index >= isize: 98 | break 99 | except 
Exception as e:
100 | self._dlog.logger.error("Error@get_mutil_result_acrid", exc_info=True)
101 | self._dlog.logger.error("Error_Data: {0}".format(json.dumps(data)))
102 | return ret_list if ret_list else [NORESULT]
103 |
104 | def swap_position(self, ret_title, ret_data, itype):
105 | json_res = ret_data["result"]
106 | meta_type = None
107 | music_list = []
108 | if itype == 'music':
109 | if 'metadata' in json_res:
110 | music_list = json_res['metadata']['music']
111 | elif 'metainfos' in json_res:
112 | music_list = json_res['metainfos']
113 | elif itype == 'custom':
114 | music_list = json_res['metadata']['custom_files']
115 |
116 | if music_list:
117 | ret_index = 0
118 | for index, item in enumerate(music_list):
119 | if itype == "music":
120 | if item['title'] == ret_title:
121 | ret_index = index
122 | break
123 | else:
124 | if item['acrid'] == ret_title:
125 | ret_index = index
126 | break
127 | if ret_index > 0:
128 | music_list[0], music_list[ret_index] = music_list[ret_index], music_list[0]
129 |
130 | def custom_result_append(self, ret_data, title, from_data, count, tmp_deal_title_map):
131 | ret_title_set = set()
132 | for item in ret_data['result']['metadata']['custom_files']:
133 | ret_title_set.add(item['acrid'])
134 |
135 | for item in from_data['result']['metadata']['custom_files']:
136 | acrid = item['acrid']
137 | if acrid == title and acrid not in ret_title_set:
138 | item['count'] = count
139 | ret_data['result']['metadata']['custom_files'].append(item)
140 | ret_title_set.add(acrid)
141 |
142 | for item in from_data['result']['metadata']['custom_files']:
143 | acrid = item['acrid']
144 | if acrid not in ret_title_set:
145 | if acrid in tmp_deal_title_map:
146 | item['count'] = tmp_deal_title_map[acrid]['count']
147 | ret_data['result']['metadata']['custom_files'].append(item)
148 |
149 | def get_play_offset(self, data, itype='music'):
150 | try:
151 | play_offset_ms = 0
152 | result = data['result']
153 | if result['status']['code'] == 1001:
154 | return 0
155 | if itype == 'music':
156 | play_offset_ms = result['metadata']['music'][0]['play_offset_ms']
157 | elif itype == 'custom':
158 | play_offset_ms = result['metadata']['custom_files'][0]['play_offset_ms']
159 | except Exception as e:
160 | self._dlog.logger.error("Error@Get_Play_Offset, error_data: {0}, {1}".format(itype, data), exc_info=True)
161 | return play_offset_ms/1000.0
162 |
163 | def get_db_play_offset(self, data, offset_type="begin", itype='music'):
164 | """
165 | itype : music or custom
166 | offset_type : begin or end offset
167 | """
168 | try:
169 | if offset_type not in ['begin', 'end']:
170 | self._dlog.logger.error("Error@Get_DB_Play_Offset.offset_type({0}) error".format(offset_type))
171 | return (None, self.get_play_offset(data, itype)) #if offset_type error, return play_offset_ms
172 |
173 | db_offset_key = "db_{0}_time_offset_ms".format(offset_type)
174 | sample_offset_key = "sample_{0}_time_offset_ms".format(offset_type)
175 |
176 | db_play_offset_ms = 0 #ms
177 | sample_play_offset_ms = 0
178 | result = data['result']
179 | if result['status']['code'] == 1001:
180 | return (0, 0)  # keep the (sample, db) tuple shape that callers unpack
181 | if itype == 'music':
182 | db_play_offset_ms = result['metadata']['music'][0][db_offset_key]
183 | sample_play_offset_ms = result['metadata']['music'][0][sample_offset_key]
184 | elif itype == 'custom':
185 | db_play_offset_ms = result['metadata']['custom_files'][0][db_offset_key]
186 | sample_play_offset_ms = result['metadata']['custom_files'][0][sample_offset_key]
187 |
188 | return
(int(sample_play_offset_ms)/1000.0, int(db_play_offset_ms)/1000.0) 189 | except Exception as e: 190 | self._dlog.logger.error("Error@please contact support@acrcloud.com to add offset config for your access_key") 191 | return (None, None) 192 | 193 | def get_duration(self, end_timestamp, start_timestamp): 194 | end = datetime.datetime.strptime(end_timestamp, '%H:%M:%S') 195 | start = datetime.datetime.strptime(start_timestamp, '%H:%M:%S') 196 | return (end - start).total_seconds() 197 | 198 | def get_duration_accurate(self, end_data, start_data, itype='music'): 199 | monitor_len = end_data.get('rec_length', 10) 200 | end_play_offset = self.get_play_offset(end_data, itype) 201 | start_play_offset = self.get_play_offset(start_data, itype) 202 | pre_seconds = max(20, monitor_len*2) 203 | if int(start_play_offset) < pre_seconds: 204 | start_play_offset = 0 205 | else: 206 | start_play_offset = start_play_offset - (monitor_len/2) 207 | return int(round(end_play_offset - start_play_offset)) 208 | 209 | def get_duration_accurate_use_db_offset(self, end_data, begin_data, isize, itype='music'): 210 | begin_timestamp = datetime.datetime.strptime(begin_data['timestamp'], "%H:%M:%S") 211 | 212 | monitor_len = end_data.get('rec_length', 10) 213 | 214 | end_sample_offset, end_db_offset = self.get_db_play_offset(end_data, 'end', itype) 215 | begin_sample_offset, begin_db_offset = self.get_db_play_offset(begin_data, 'begin', itype) 216 | for i in [ end_sample_offset, end_db_offset, begin_sample_offset, begin_db_offset]: 217 | if i is None: 218 | return 0, 0, 0, begin_data["timestamp"] 219 | 220 | accurate_begin_timestamp = (begin_timestamp + relativedelta(seconds=int(float(begin_sample_offset)))).strftime("%H:%M:%S") 221 | 222 | db_len = int(round(end_db_offset - begin_db_offset)) 223 | sample_len = int(round(end_sample_offset - begin_sample_offset + (isize-1)*monitor_len)) 224 | 225 | mix_len = 0 226 | if int(begin_sample_offset) == 0 and int(begin_db_offset) == 0: 227 | mix_len = (isize-1)*monitor_len + end_sample_offset 228 | elif int(begin_sample_offset) == 0: 229 | if begin_db_offset <= monitor_len: 230 | mix_len = (isize-1)*monitor_len + end_sample_offset 231 | else: 232 | mix_len = (isize-1)*monitor_len + end_sample_offset - begin_sample_offset 233 | elif int(begin_db_offset) == 0: 234 | mix_len = (isize-1)*monitor_len + end_sample_offset - begin_sample_offset 235 | else: 236 | mix_len = (isize-1)*monitor_len + end_sample_offset - begin_sample_offset 237 | mix_len = int(round(mix_len)) 238 | 239 | return sample_len, db_len, mix_len, accurate_begin_timestamp 240 | 241 | def judge_zero_item_contain_current_result(self, ret_sim_title, zero_data, itype="music"): 242 | """ 243 | itype: music => title is track name 244 | itype: custom => title is acrid 245 | """ 246 | try: 247 | is_contain = False 248 | if itype == "music": 249 | zero_title_list = self.get_mutil_result_title(zero_data, 'music', 5) 250 | elif itype == "custom": 251 | zero_title_list = self.get_mutil_result_acrid(zero_data, 'custom', 5) 252 | else: 253 | return is_contain 254 | 255 | for ztitle in zero_title_list: 256 | if ztitle == NORESULT: 257 | break 258 | sim_zero_title = self.tryStrSub(ztitle)[0] if itype == "music" else ztitle 259 | if sim_zero_title == ret_sim_title: 260 | is_contain = True 261 | self.swap_position(ztitle, zero_data, itype) 262 | break 263 | except Exception as e: 264 | self._dlog.logger.error("Error@judge_zero_item_contain_current_result", exc_info=True) 265 | return is_contain 266 | 267 | def 
judge_latter_item_contain_current_result(self, ret_sim_title, latter_data, itype="music"): 268 | """ 269 | itype: music => title is track name 270 | itype: custom => title is acrid 271 | """ 272 | try: 273 | is_contain = False 274 | latter_data_swaped = None 275 | if itype == "music": 276 | latter_title_list = self.get_mutil_result_title(latter_data, 'music', 5) 277 | elif itype == "custom": 278 | latter_title_list = self.get_mutil_result_acrid(latter_data, 'custom', 5) 279 | else: 280 | return is_contain, latter_data_swaped 281 | 282 | for ltitle in latter_title_list: 283 | if ltitle == NORESULT: 284 | break 285 | sim_latter_title = self.tryStrSub(ltitle)[0] if itype == "music" else ltitle 286 | if sim_latter_title == ret_sim_title: 287 | is_contain = True 288 | latter_data_swaped = copy.deepcopy(latter_data) 289 | self.swap_position(ltitle, latter_data_swaped, itype) 290 | break 291 | except Exception as e: 292 | self._dlog.logger.error("Error@judge_latter_item_contain_current_result", exc_info=True) 293 | return is_contain, latter_data_swaped 294 | 295 | def real_check_title_custom(self, stream_id, title, timestamp_obj): 296 | now_timestamp = timestamp_obj #datetime.datetime.utcnow() 297 | if stream_id not in self._real_custom: 298 | self._real_custom[stream_id] = [[('','')], ''] 299 | 300 | if len(self._real_custom[stream_id][0]) > self._real_custom_list_num: 301 | self._real_custom[stream_id][0] = self._real_custom[stream_id][0][-self._real_custom_list_num:] 302 | his_list_num = self._real_custom_list_num 303 | else: 304 | his_list_num = len(self._real_custom[stream_id][0]) 305 | 306 | for i in range(his_list_num-1, -1, -1): 307 | if self._real_custom[stream_id][0][i][0] == title: 308 | his_timestamp = self._real_custom[stream_id][0][i][1] 309 | his_time_obj = datetime.datetime.strptime(his_timestamp, '%H:%M:%S') 310 | if (now_timestamp - his_time_obj).total_seconds() <= self._real_custom_valid_interval: 311 | return True 312 | if title == NORESULT: 313 | break 314 | 315 | return False 316 | 317 | def checkResultSim(self, idx, curr_title, his_title, stream_id): 318 | if not curr_title or not his_title: 319 | return False 320 | sim, detail = tools_str_sim.str_sim(curr_title, his_title) 321 | if not sim and curr_title != NORESULT and his_title != NORESULT: 322 | pass 323 | return sim 324 | 325 | def checkSame(self, curr_title, stream_id): 326 | self._real_music[stream_id] = self._real_music.get(stream_id, [[''], '']) 327 | if len(self._real_music[stream_id][0]) > self._real_music_list_num: 328 | self._real_music[stream_id][0] = self._real_music[stream_id][0][-self._real_music_list_num:] 329 | his_max = self._real_music_list_num 330 | else: 331 | his_max = len(self._real_music[stream_id][0]) 332 | for i in range(his_max-1, -1, -1): 333 | if self.checkResultSim(i, curr_title, self._real_music[stream_id][0][i], stream_id): 334 | return True 335 | if curr_title == NORESULT: 336 | break 337 | return False 338 | 339 | def updateResultTitle(self, data, new_title): 340 | if new_title == NORESULT: 341 | return 342 | try: 343 | json_res = data["result"] 344 | metainfos = json_res.get("metainfos") 345 | metadata = json_res.get("metadata") 346 | if metainfos: 347 | metainfos[0]['title'] = new_title 348 | else: 349 | if metadata.get('music'): 350 | metadata['music'][0]['title'] = new_title 351 | else: 352 | metadata['custom_files'][0]['title'] = new_title 353 | except Exception as e: 354 | self._dlog.logger.error("Error@updateResultTitle", exc_info=True) 355 | 356 | def tryStrSub(self, try_str): 357 
| sub_str = tools_str_sim.str_sub(try_str) 358 | if len(sub_str) > 0 and len(try_str) > len(sub_str): 359 | return sub_str, True 360 | return try_str, False 361 | 362 | def tryUpdateResultTitle(self, data, itype): 363 | if itype == 'custom': 364 | title = self.get_mutil_result_title(data, 'custom', 1)[0] 365 | return title 366 | title = self.get_mutil_result_title(data, 'music', 1)[0] 367 | stream_id = data.get("stream_id") 368 | new_title, try_status = self.tryStrSub(title) 369 | if try_status: 370 | self.updateResultTitle(data, new_title) 371 | return new_title 372 | return title 373 | 374 | def deal_real_history(self, data): 375 | is_new = False 376 | result = None 377 | curr_title = self.get_mutil_result_title(data, 'music', 1)[0] 378 | stream_id = data.get("stream_id") 379 | if not stream_id: 380 | return result, is_new 381 | if curr_title == NORESULT: 382 | if not self.checkSame(curr_title, stream_id): 383 | self._real_music[stream_id][0].append(curr_title) 384 | self._real_music[stream_id][1] = data 385 | result = data 386 | is_new = True 387 | else: 388 | result = None 389 | is_new = False 390 | else: 391 | if self.checkSame(curr_title, stream_id): 392 | result = self._real_music[stream_id][1] 393 | is_new = False 394 | else: 395 | self._real_music[stream_id][0].append(curr_title) 396 | self._real_music[stream_id][1] = data 397 | result = data 398 | is_new = True 399 | 400 | return result, is_new 401 | 402 | def deal_delay_history(self, data): 403 | stream_id = data.get("stream_id") 404 | timestamp = data.get("timestamp") 405 | raw_title = self.get_mutil_result_title(data, 'music', 1)[0] 406 | sim_title = self.tryStrSub(raw_title) 407 | if stream_id not in self._delay_music: 408 | self._delay_music[stream_id] = [(raw_title, sim_title[0], timestamp, data)] 409 | else: 410 | self._delay_music[stream_id].append((raw_title, sim_title[0], timestamp, data)) 411 | 412 | if len(self._delay_music[stream_id]) > self._delay_list_max_num : 413 | return self.runDelayX_for_music_delay2(stream_id) 414 | else: 415 | return None 416 | 417 | def compute_played_duration(self, history_data, start_index, end_index, judge_zero_or_latter=True, itype="music"): 418 | retdata = history_data[start_index][-1] 419 | 420 | if itype == "music": 421 | ret_title = self.get_mutil_result_title(retdata, 'music', 1)[0] 422 | ret_sim_title = history_data[start_index][1] 423 | elif itype == "custom": 424 | ret_title = self.get_mutil_result_acrid(retdata, 'custom', 1)[0] 425 | ret_sim_title = ret_title 426 | 427 | if judge_zero_or_latter and start_index == 1: 428 | if self.judge_zero_item_contain_current_result(ret_sim_title, history_data[0][-1], itype): 429 | start_index = 0 430 | 431 | is_contain = False 432 | latter_data_swaped = None 433 | if judge_zero_or_latter and (end_index + 1 <= len(history_data) - 1): 434 | is_contain, latter_data_swaped = self.judge_latter_item_contain_current_result(ret_sim_title, history_data[end_index+1][-1], itype) 435 | 436 | if itype == "music": 437 | start_timestamp = history_data[start_index][2] 438 | end_timestamp = history_data[end_index][2] 439 | start_data = history_data[start_index][3] 440 | end_data = history_data[end_index][3] 441 | else: 442 | start_timestamp = history_data[start_index][1] 443 | end_timestamp = history_data[end_index][1] 444 | start_data = history_data[start_index][2] 445 | end_data = history_data[end_index][2] 446 | 447 | duration = self.get_duration(end_timestamp, start_timestamp) 448 | duration_accurate = self.get_duration_accurate(end_data, start_data, 
itype)
449 | isize = end_index - start_index + 1
450 | if is_contain:
451 | end_data = latter_data_swaped
452 | isize += 1
453 |
454 | sample_duration, db_duration, mix_duration, accurate_timestamp_utc = self.get_duration_accurate_use_db_offset(end_data, start_data, isize, itype)
455 |
456 | ret_dict = {
457 | "duration" : duration,
458 | "duration_accurate" : duration_accurate,
459 | "sample_duration" : sample_duration,
460 | "db_duration" : db_duration,
461 | "mix_duration" : mix_duration,
462 | "accurate_timestamp_utc" : accurate_timestamp_utc,
463 | }
464 | return ret_dict
465 |
466 | def get_data_duration_ms(self, data):
467 | try:
468 | duration_ms = -1
469 | json_res = data["result"]
470 | if json_res['status']['code'] == 0:
471 | if 'metadata' in json_res and 'music' in json_res['metadata']:
472 | if len(json_res['metadata']['music']) > 0:
473 | duration_ms = json_res["metadata"]["music"][0]["duration_ms"]
474 | except Exception as e:
475 | self._dlog.logger.error("Error@get_data_duration_ms", exc_info=True)
476 | return (duration_ms/1000.0) if duration_ms != -1 else duration_ms
477 |
478 | def get_time_diff(self, start_timestamp, end_timestamp, tformat="%Y-%m-%d %H:%M:%S"):
479 | try:
480 | diff_sec = 0
481 | start_obj = datetime.datetime.strptime(start_timestamp, tformat)
482 | end_obj = datetime.datetime.strptime(end_timestamp, tformat)
483 | diff_sec = int((end_obj - start_obj).total_seconds())
484 | except Exception as e:
485 | self._dlog.logger.error("Error@get_time_diff", exc_info=True)
486 | return diff_sec
487 |
488 | def remove_next_result_from_now_result_list_for_music_delay2(self, history_data, ret_data, max_index):
489 | #Just for music delay2 filter
490 | try:
491 | if ret_data and len(history_data) >= max_index+2:
492 | raw_title, sim_title, timestamp, next_data = history_data[max_index + 1]
493 | if next_data:
494 | next_title_list = self.get_mutil_result_title(next_data, 'music', 1)
495 | next_title_set = set(next_title_list)
496 | new_ret_music = []
497 | for index, item in enumerate(ret_data["result"]["metadata"]["music"]):
498 | if index == 0 or (item["title"] not in next_title_set):
499 | new_ret_music.append(item)
500 | ret_data["result"]["metadata"]["music"] = new_ret_music
501 | except Exception as e:
502 | self._dlog.logger.error("Error@remove_next_result_from_now_result_list_for_music_delay2", exc_info=True)
503 |
504 | def result_append_for_music_delay2(self, ret_data, title, from_data):
505 | try:
506 | ret_title_set = set()
507 | for item in ret_data['result']['metadata']['music']:
508 | sim_title = self.tryStrSub(item['title'])[0]
509 | ret_title_set.add(sim_title)
510 |
511 | for item in from_data['result']['metadata']['music']:
512 | from_title = item['title']
513 | sim_from_title = self.tryStrSub(from_title)[0]
514 | if sim_from_title == title and sim_from_title not in ret_title_set:
515 | ret_data['result']['metadata']['music'].append(item)
516 | ret_title_set.add(sim_from_title)
517 | except Exception as e:
518 | self._dlog.logger.error("Error@result_append_for_music_delay2", exc_info=True)
519 |
520 | def get_custom_duration_by_title(self, title, ret_data):
521 | try:
522 | duration = 0
523 | db_end_offset = 0
524 | for index, item in enumerate(ret_data["result"]["metadata"]["custom_files"]):
525 | # for custom results the "title" passed in is actually an acrid
526 | if title == item["acrid"]:
527 | duration_ms = int(item["duration_ms"])
528 | db_end_offset_ms = int(item["db_end_time_offset_ms"])
529 | if duration_ms >= 0:
530 | duration = int(duration_ms/1000)
531 | if db_end_offset_ms:
532 | db_end_offset = int(db_end_offset_ms/1000)
533 | except Exception as e:
534 | self._dlog.logger.error("Error@get_custom_duration_by_title, error_data:{0}".format(ret_data), exc_info=True)
535 | return duration, db_end_offset
536 |
537 | def get_music_duration_by_title(self, title, ret_data):
538 | try:
539 | duration = 0
540 | db_end_offset = 0
541 | if "metadata" in ret_data["result"] and "music" in ret_data["result"]["metadata"]:
542 | for index, item in enumerate(ret_data["result"]["metadata"]["music"]):
543 | if title == item["title"]:
544 | duration_ms = int(item["duration_ms"])
545 | db_end_offset_ms = int(item["db_end_time_offset_ms"])
546 | if duration_ms >= 0:
547 | duration = int(duration_ms/1000)
548 | if db_end_offset_ms:
549 | db_end_offset = int(db_end_offset_ms/1000)
550 | except Exception as e:
551 | self._dlog.logger.error("Error@get_music_duration_by_title, error_data:{0}".format(ret_data), exc_info=True)
552 | return duration, db_end_offset
553 |
554 | def delay_dynamic_judge_size(self, deal_title_map, history_data, itype):
555 | try:
556 | judge_size = 5
557 | if itype == "custom":
558 | title = sorted(deal_title_map.items(), key=lambda x:x[1]["score"], reverse=True)[0][0]
559 | else:
560 | title = list(deal_title_map.keys())[0]  # list(...) keeps this working on Python 3 as well
561 |
562 | index = deal_title_map[title]["index_list"][-1]
563 | if itype == "custom":
564 | ret_data = history_data[index][2]
565 | else:
566 | ret_data = history_data[index][3]
567 |
568 | monitor_len = ret_data.get("monitor_seconds", 10)
569 |
570 | if itype == "custom":
571 | duration, db_end_offset = self.get_custom_duration_by_title(title, ret_data)
572 | else:
573 | duration, db_end_offset = self.get_music_duration_by_title(title, ret_data)
574 |
575 | if db_end_offset > 0 and db_end_offset < duration:
576 | judge_size = abs(int(math.ceil(db_end_offset*1.0/monitor_len))) + 1
577 | if judge_size > 10:
578 | judge_size = 10
579 | if judge_size <= 3:
580 | judge_size = 3
581 | if itype == "custom":
582 | judge_size = 1
583 | except Exception as e:
584 | self._dlog.logger.error("Error@delay_dynamic_judge_size", exc_info=True)
585 | return judge_size+1
586 |
587 | def fill_ret_data_by_acrid_count(self, sorted_title_list, history_data):
588 | try:
589 | ret_data = None
590 | init_ret_data = True
591 | for sitem in sorted_title_list:
592 | sitem_title, sitem_map = sitem
593 | sitem_title = self.tryStrSub(sitem_title)[0]
594 | sitem_count = sitem_map["count"]
595 | acrid_count_map = {}
596 | for tindex in sitem_map["index_list"]:
597 | tdata = history_data[tindex][3]
598 | if init_ret_data:
599 | ret_data = copy.deepcopy(tdata)
600 | ret_data["result"]["metadata"]["music"] = []
601 | init_ret_data = False
602 | if "metadata" in tdata["result"] and "music" in tdata["result"]["metadata"]:
603 | for item in tdata['result']['metadata']['music']:
604 | sim_title = self.tryStrSub(item['title'])[0]
605 | if sim_title == sitem_title:
606 | acrid = item['acrid']
607 | if acrid not in acrid_count_map:
608 | acrid_count_map[acrid] = {"count":0, "info":item}
609 | acrid_count_map[acrid]["count"] += 1
610 | if ret_data is None:
611 | break
612 |
613 | acrid_count_map_sorted = sorted(acrid_count_map.items(), key=lambda x:x[1]["count"], reverse=True)
614 | for s_index, s_item in enumerate(acrid_count_map_sorted):
615 | ret_data["result"]["metadata"]["music"].append(s_item[1]["info"])
616 | if s_index >= 2:
617 | break
618 | if ret_data is not None and len(ret_data['result']['metadata']['music']) > 6:
619 |
ret_data['result']['metadata']['music'] = ret_data['result']['metadata']['music'][:6] 620 | except Exception as e: 621 | self._dlog.logger.error("Error@fill_ret_data_by_acrid_count", exc_info=True) 622 | return ret_data 623 | 624 | def get_music_data_offset(self, data): 625 | try: 626 | ret = { 627 | "monitor_len":0, 628 | "duration_ms":0, 629 | "s_begin_ms":0, 630 | "s_end_ms":0, 631 | "d_begin_ms":0, 632 | "d_end_ms":0 633 | } 634 | result = data.get("result") 635 | monitor_len = data.get("monitor_seconds", 10) 636 | ret["monitor_len"] = monitor_len 637 | if result and "metadata" in result and "music" in result["metadata"]: 638 | fitem = result["metadata"]["music"][0] 639 | ret["duration_ms"] = int(fitem["duration_ms"]) 640 | ret["s_begin_ms"] = int(fitem["sample_begin_time_offset_ms"]) 641 | ret["s_end_ms"] = int(fitem["sample_end_time_offset_ms"]) 642 | ret["d_begin_ms"] = int(fitem["db_begin_time_offset_ms"]) 643 | ret["d_end_ms"] = int(fitem["db_end_time_offset_ms"]) 644 | return ret 645 | except Exception as e: 646 | self._dlog.logger.error("Error@get_music_data_offset, error_data:{0}".format(data), exc_info=True) 647 | return None 648 | 649 | def check_if_is_break(self, index1, index2, data1, data2): 650 | try: 651 | is_break = False 652 | ret1 = self.get_music_data_offset(data1) 653 | ret2 = self.get_music_data_offset(data2) 654 | if ret1 and ret2: 655 | diff_db = ret2["d_end_ms"] - ret1["d_begin_ms"] 656 | if diff_db <= 0: 657 | return is_break 658 | timestamp1 = datetime.datetime.strptime(data1["timestamp"], "%H:%M:%S") 659 | timestamp2 = datetime.datetime.strptime(data2["timestamp"], "%H:%M:%S") 660 | monitor_len = ret1["monitor_len"] 661 | A1 = timestamp1 + relativedelta(seconds=int(ret1["s_begin_ms"]/1000)) 662 | A2 = timestamp2 + relativedelta(seconds=int(ret2["s_end_ms"]/1000)) 663 | B1 = int((A2 - A1).total_seconds()) 664 | B2 = (index2 - index1 - 1)*monitor_len + int(diff_db/1000) 665 | B3 = int(diff_db/1000) 666 | if abs(B3 - B1) <= 15: 667 | is_break = False 668 | elif abs(B2 - B1) <= 10: 669 | is_break = True 670 | except Exception as e: 671 | self._dlog.logger.error("Error@check_if_is_break", exc_info=True) 672 | return is_break 673 | 674 | def check_if_continuous(self, index1, index2, data1, data2): 675 | try: 676 | is_cont = True 677 | ret1 = self.get_music_data_offset(data1) 678 | ret2 = self.get_music_data_offset(data2) 679 | timestamp1 = datetime.datetime.strptime(data1["timestamp"], "%H:%M:%S") 680 | timestamp2 = datetime.datetime.strptime(data2["timestamp"], "%H:%M:%S") 681 | diff_sec = (timestamp2 - timestamp1).total_seconds() 682 | monitor_len = ret1["monitor_len"] 683 | if ret1 and ret2: 684 | for tmp_ret in [ret1, ret2]: 685 | if (tmp_ret["s_end_ms"] - tmp_ret["s_begin_ms"]) != (tmp_ret["d_end_ms"] - tmp_ret["d_begin_ms"]): 686 | return is_cont 687 | dur1 = ret1["d_end_ms"] - ret1["d_begin_ms"] 688 | dur2 = ret2["d_end_ms"] - ret2["d_begin_ms"] 689 | dur1 = dur1 if dur1 > 0 else 0 690 | dur2 = dur2 if dur2 > 0 else 0 691 | ret1_s_end = ret1["s_end_ms"] 692 | ret2_s_begin = ret2["s_begin_ms"] 693 | if index1+1 == index2 and abs(monitor_len*1000 - ret1_s_end) < 2500 and abs(ret2_s_begin) < 2500 and diff_sec < monitor_len*2: 694 | pass 695 | else: 696 | ifirst, iend = max(ret1["d_begin_ms"], ret2["d_begin_ms"]), min(ret1["d_end_ms"], ret2["d_end_ms"]) 697 | inter_dur = iend - ifirst 698 | if inter_dur > 0: 699 | min_dur = min(dur1, dur2) if min(dur1, dur2) > 0 else max(dur1, dur2) 700 | if min_dur > 0: 701 | inter_rate = (inter_dur*1.0/min_dur) 702 | if 
inter_dur >=2 and inter_rate >=0.8:
703 | is_cont = False
704 | except Exception as e:
705 | self._dlog.logger.error("Error@check_if_continuous", exc_info=True)
706 | return is_cont
707 |
708 | def runDelayX_for_music_delay2(self, stream_id):
709 | history_data = self._delay_music[stream_id]
710 | judge_zero_or_latter = True
711 |
712 | if len(history_data) >= self._delay_list_threshold:
713 | history_data = history_data[-(self._delay_list_threshold-1):]
714 |
715 | history_data_len = len(history_data)
716 | for ii in range((history_data_len-1), 0, -1):
717 | if history_data[-ii][0] == NORESULT:  # item[0] is the raw title string here (the custom variant stores a list)
718 | continue
719 | else:
720 | history_data = history_data[-(ii+1):]
721 | break
722 |
723 | first_not_noresult_index = -1
724 | for index, item in enumerate(history_data):
725 | if index == 0:
726 | continue
727 | if item[0] == NORESULT:
728 | first_not_noresult_index = index
729 | else:
730 | break
731 | if first_not_noresult_index != -1:
732 | history_data = history_data[first_not_noresult_index:]
733 | self._delay_music[stream_id] = history_data
734 | return None
735 |
736 | ########## Get Break Index ##########
737 | deal_title_map = {} #key:title, value:{'count':0, 'index_list':[]}
738 | break_index = 0
739 |
740 |
741 | for index, item in enumerate(history_data[1:]):
742 | index += 1
743 | raw_title, sim_title, timestamp, data = item
744 | if index!=1:
745 | flag_first = True
746 | flag_second = True
747 | if sim_title in deal_title_map:
748 | flag_first = False
749 | if flag_first:
750 | tmp_all_len = len(history_data)
751 | tmp_count = 0
752 | tmp_first_break_index = -1
753 | #tmp_judge_size = 2
754 | tmp_judge_size = self.delay_dynamic_judge_size(deal_title_map, history_data, "music")
755 | find_interval = False
756 | find_pre_last_index = index-1
757 | find_next_sim_index = -1
758 | for i in range(index, tmp_all_len):
759 | next_raw_title, next_sim_title, next_timestamp, next_data = history_data[i]
760 | tmp_list_flag = False
761 | if next_sim_title in deal_title_map:
762 | tmp_list_flag = True
763 | tmp_count = 0
764 | tmp_first_break_index = -1
765 | if find_interval == True:
766 | find_interval = False
767 | find_next_sim_index = i
768 | if find_next_sim_index - find_pre_last_index - 1 >= 8:
769 | is_break = self.check_if_is_break(find_pre_last_index, find_next_sim_index, history_data[find_pre_last_index][3], history_data[find_next_sim_index][3])
770 | if is_break:
771 | break_index = find_pre_last_index + 1
772 | break
773 | else:
774 | if find_interval == False:
775 | find_interval = True
776 | find_pre_last_index = i - 1
777 |
778 | if tmp_list_flag:
779 | continue
780 | else:
781 | tmp_count += 1
782 | if tmp_first_break_index == -1:
783 | tmp_first_break_index = i
784 | if tmp_count < tmp_judge_size:
785 | continue
786 | flag_second = True
787 | break_index = tmp_first_break_index if tmp_first_break_index != -1 else i
788 | break
789 |
790 | if flag_first and flag_second and deal_title_map:
791 | if break_index >0:
792 | for iii in range(index, break_index):
793 | tmp_raw_title, tmp_sim_title, tmp_timestamp, tmp_data = history_data[iii]
794 | if tmp_sim_title == NORESULT:
795 | continue
796 | if tmp_sim_title in deal_title_map:
797 | deal_title_map[tmp_sim_title]['count'] += 1
798 | deal_title_map[tmp_sim_title]['index_list'].append(iii)
799 | #**********************************************************
800 | sorted_dtitle = sorted(deal_title_map.items(), key = lambda x:x[1]['count'], reverse = True)
801 | sorted_fitem_title, sorted_fitem_map = sorted_dtitle[0]
802 | sfm_count = sorted_fitem_map["count"]
803 | cfirst_index, csecond_index = sorted(sorted_fitem_map["index_list"])[:2] if sfm_count >=2 else [0, 0]
804 | if sfm_count in [2, 3]: #or ((3 < sfm_count <= 10) and sfm_count < (break_index - index)):
805 | is_cont = self.check_if_continuous(cfirst_index, csecond_index, history_data[cfirst_index][3], history_data[csecond_index][3])
806 | if not is_cont:
807 | judge_zero_or_latter = False
808 | break_index = cfirst_index + 1
809 | deal_title_map = {sorted_fitem_title:{'count':1, 'index_list':[cfirst_index]}}
810 | #**********************************************************
811 | # break out of the scan loop
812 | break
813 |
814 | if sim_title == NORESULT:
815 | continue
816 | if sim_title not in deal_title_map:
817 | deal_title_map[sim_title] ={'count':0, 'index_list':[]}
818 | deal_title_map[sim_title]['count'] += 1
819 | deal_title_map[sim_title]['index_list'].append(index)
820 |
821 |
822 | ret_data = None
823 | duration_dict = {}
824 | duration = 0
825 | if break_index > 0 and deal_title_map:
826 | sorted_title_list = sorted(deal_title_map.items(), key = lambda x:x[1]['count'], reverse = True)
827 | ret_data = self.fill_ret_data_by_acrid_count(sorted_title_list, history_data)
828 | if ret_data and len(ret_data["result"]["metadata"]["music"]) == 0:
829 | ret_data = None
830 |
831 | index_range = set()
832 | for title in deal_title_map:
833 | index_range |= set(deal_title_map[title]['index_list'])
834 | min_index = min(index_range)
835 | max_index = max(index_range)
836 | duration_dict = self.compute_played_duration(history_data, min_index, max_index, judge_zero_or_latter, "music")
837 |
838 | self.remove_next_result_from_now_result_list_for_music_delay2(history_data, ret_data, max_index)
839 |
840 | if ret_data:
841 | duration = duration_dict["duration"]
842 | duration_accurate = duration_dict["duration_accurate"]
843 | sample_duration = duration_dict["sample_duration"]
844 | db_duration = duration_dict["db_duration"]
845 | mix_duration = duration_dict["mix_duration"]
846 | accurate_timestamp_utc = duration_dict["accurate_timestamp_utc"]
847 | ret_data['result']['metadata']['played_duration'] = abs(mix_duration)
848 | ret_data['result']['metadata']['timestamp_utc'] = accurate_timestamp_utc
849 | ret_data['timestamp'] = accurate_timestamp_utc
850 | if ret_data['result']['metadata']['played_duration'] <= 1:
851 | ret_data = None
852 |
853 | ########### cut history_data #############
854 | if break_index>=0:
855 | cut_index = break_index
856 | for i, item in enumerate(history_data[break_index:]):
857 | if item[0] == NORESULT:  # raw title string for the music filter
858 | cut_index = break_index + i + 1
859 | else:
860 | break
861 | cut_index = cut_index - 1 if cut_index >= 1 else cut_index
862 | history_data = history_data[cut_index:]
863 |
864 | reverse_index = -1
865 | for i, item in enumerate(history_data[::-1]):
866 | if item[0] == NORESULT:  # raw title string for the music filter
867 | reverse_index = i
868 | continue
869 | else:
870 | break
871 |
872 | if reverse_index != -1:
873 | new_cut_index = -1
874 | reverse_index = len(history_data) - reverse_index - 1
875 | if reverse_index in [0, 1]:
876 | history_data = []
877 | else:
878 | pass
879 |
880 | if judge_zero_or_latter == False and len(history_data) > 0:
881 | if history_data[0][0] != NORESULT:
882 | tmp_t, sim_tmp_t, tmp_timestamp, tmp_data = history_data[0]
883 | if tmp_data and "status" in tmp_data["result"]:
884 | tmp_data["result"]["status"]["code"] = 1001
885 | history_data[0] = (NORESULT, NORESULT, tmp_timestamp, tmp_data)
886 | self._delay_music[stream_id] = history_data
887 |
888 | return ret_data
889 |
890 |
891 | def deal_real_custom(self, data):
892 | is_new = False
893 | result = None
894 | curr_title = self.get_mutil_result_acrid(data, 'custom')[0]
895 |
896 | stream_id = data.get("stream_id")
897 | timestamp = data.get("timestamp")
898 | timestamp_obj = datetime.datetime.strptime(timestamp, "%H:%M:%S")
899 | if not stream_id:
900 | return result, is_new
901 | if curr_title == NORESULT:
902 | if not self.real_check_title_custom(stream_id, curr_title, timestamp_obj):
903 | self._real_custom[stream_id][0].append((curr_title, timestamp))
904 | self._real_custom[stream_id][1] = data
905 | result = data
906 | is_new = True
907 | else:
908 | result = None
909 | is_new = False
910 | else:
911 | if self.real_check_title_custom(stream_id, curr_title, timestamp_obj):
912 | result = self._real_custom[stream_id][1]
913 | is_new = False
914 | else:
915 | self._real_custom[stream_id][0].append((curr_title, timestamp))
916 | self._real_custom[stream_id][1] = data
917 | result = data
918 | is_new = True
919 | return result, is_new
920 |
921 | def deal_delay_custom(self, data):
922 | try:
923 | ret_result = None
924 | stream_id = data.get("stream_id")
925 | timestamp = data.get("timestamp")
926 | title_list = self.get_mutil_result_acrid(data, 'custom', 5)
927 | if stream_id not in self._delay_custom:
928 | self._delay_custom[stream_id] = [(title_list, timestamp, data)]
929 | else:
930 | self._delay_custom[stream_id].append((title_list, timestamp, data))
931 |
932 | if len(self._delay_custom[stream_id]) >= self._delay_list_max_num:
933 | ret_result = self.runDelayX_custom(stream_id)
934 | except Exception as e:
935 | self._dlog.logger.error("Error@deal_delay_custom", exc_info=True)
936 | return ret_result
937 |
938 | def remove_next_result_from_now_result_list(self, history_data, ret_data, max_index):
939 | #Just for custom delay filter
940 | try:
941 | if ret_data and len(history_data) >= max_index+2:
942 | acrid_list, timestamp, next_data = history_data[max_index + 1]
943 | if next_data:
944 | #update max size acrid_list to 20
945 | next_acrid_list = self.get_mutil_result_acrid(next_data, 'custom', 20)
946 | next_acrid_set = set(next_acrid_list)
947 | new_ret_custom_files = []
948 | for index, item in enumerate(ret_data["result"]["metadata"]["custom_files"]):
949 | if index == 0 or (item["acrid"] not in next_acrid_set):
950 | new_ret_custom_files.append(item)
951 | ret_data["result"]["metadata"]["custom_files"] = new_ret_custom_files
952 | except Exception as e:
953 | self._dlog.logger.error("Error@remove_next_result_from_now_result_list", exc_info=True)
954 |
955 | def get_custom_duration_by_acrid(self, title, ret_data): # renamed (was get_custom_duration_by_title) so it no longer shadows the two-value variant defined above
956 | try:
957 | duration = 0
958 | for index, item in enumerate(ret_data["result"]["metadata"]["custom_files"]):
959 | if title == item["acrid"]:
960 | duration_ms = int(item["duration_ms"])
961 | if duration_ms >= 0:
962 | duration = int(duration_ms/1000)
963 | except Exception as e:
964 | self._dlog.logger.error("Error@get_custom_duration_by_acrid, error_data:{0}".format(ret_data), exc_info=True)
965 | return duration
966 |
967 | def custom_delay_dynamic_judge_size(self, deal_title_map, history_data):
968 | try:
969 | judge_size = 6
970 | title = list(deal_title_map.keys())[0]
971 | index = deal_title_map[title]["index_list"][-1]
972 | ret_data = history_data[index][2]
973 | duration = self.get_custom_duration_by_acrid(title, ret_data)
974 | tmp_size = int(duration/10)
975 | if tmp_size <=6:
976 | judge_size = tmp_size if tmp_size > 1 else 2
977 |
elif tmp_size >= 18: 978 | judge_size = 18 979 | except Exception as e: 980 | self._dlog.logger.error("Error@custom_delay_dynamic_judge_size", exc_info=True) 981 | 982 | return judge_size if judge_size >= 2 else 2 983 | 984 | def runDelayX_custom(self, stream_id): 985 | history_data = self._delay_custom[stream_id] 986 | 987 | if len(history_data) >= self._delay_list_threshold: 988 | history_data = history_data[-(self._delay_list_threshold-1):] 989 | 990 | history_data_len = len(history_data) 991 | for ii in range((history_data_len-1), 0, -1): 992 | if history_data[-ii][0][0] == NORESULT: 993 | continue 994 | else: 995 | history_data = history_data[-(ii+1):] 996 | break 997 | 998 | first_not_noresult_index = -1 999 | for index, item in enumerate(history_data): 1000 | if index == 0: 1001 | continue 1002 | if len(item[0])>0 and item[0][0] == NORESULT: 1003 | first_not_noresult_index = index 1004 | else: 1005 | break 1006 | if first_not_noresult_index != -1: 1007 | history_data = history_data[first_not_noresult_index:] 1008 | self._delay_custom[stream_id] = history_data 1009 | return None 1010 | 1011 | deal_title_map = {} #key:title, value:{'count':0, 'index_list':[]} 1012 | tmp_deal_title_map = {} 1013 | break_index = 0 1014 | 1015 | for index, item in enumerate(history_data[1:]): 1016 | index += 1 1017 | title_list, timestamp, data = item 1018 | if index!=1: 1019 | flag_first = True 1020 | flag_second = True 1021 | for title in title_list[:3]: 1022 | if title in deal_title_map: 1023 | flag_first = False 1024 | if flag_first: 1025 | judge_size = self.custom_delay_dynamic_judge_size(deal_title_map, history_data) 1026 | for i in range(1,judge_size): 1027 | if index + i < len(history_data): 1028 | next_title_list, next_timestamp, next_data = history_data[index + i] 1029 | for title in next_title_list[:3]: 1030 | if title in deal_title_map: 1031 | flag_second = False 1032 | else: 1033 | flag_second = False 1034 | if flag_first and flag_second and deal_title_map: 1035 | break_index = index 1036 | break 1037 | 1038 | for i, title in enumerate(title_list): 1039 | if title == NORESULT: 1040 | continue 1041 | if i == 0: 1042 | if title not in deal_title_map: 1043 | deal_title_map[title] ={'count':0, 'index_list':[]} 1044 | deal_title_map[title]['count'] += 1 1045 | deal_title_map[title]['index_list'].append(index) 1046 | if title not in tmp_deal_title_map: 1047 | tmp_deal_title_map[title] = {'count':0, 'index_list':[]} 1048 | tmp_deal_title_map[title]['count'] += 1 1049 | tmp_deal_title_map[title]['index_list'].append(index) 1050 | 1051 | ########### New Deal Custom Result Add Count ########### 1052 | ret_data = None 1053 | duration_dict = {} 1054 | duration = 0 1055 | if break_index > 0 and deal_title_map: 1056 | tmp_count_map = {} 1057 | sorted_title_list = sorted(deal_title_map.items(), key = lambda x:x[1]['count'], reverse = True) 1058 | for sitem in sorted_title_list: 1059 | sitem_title, sitem_map = sitem 1060 | sitem_count = sitem_map["count"] 1061 | sitem_min_index = min(sitem_map["index_list"]) 1062 | if sitem_count not in tmp_count_map: 1063 | tmp_count_map[sitem_count] = [] 1064 | tmp_count_map[sitem_count].append((sitem_title, sitem_min_index)) 1065 | first_item_flag = True 1066 | for scount in sorted(tmp_count_map.keys(), reverse=True): 1067 | count_list = sorted(tmp_count_map[scount], key = lambda x:x[1]) 1068 | for ditem in count_list: 1069 | dtitle, dindex = ditem 1070 | from_data = history_data[dindex][2] 1071 | if first_item_flag: 1072 | first_item_flag = False 1073 | ret_data = 
copy.deepcopy(from_data) 1074 | ret_data["result"]["metadata"]["custom_files"] = [] 1075 | self.custom_result_append(ret_data, dtitle, from_data, scount, tmp_deal_title_map) 1076 | 1077 | index_range = set() 1078 | for title in deal_title_map: 1079 | index_range |= set(deal_title_map[title]['index_list']) 1080 | min_index = min(index_range) 1081 | max_index = max(index_range) 1082 | duration_dict = self.compute_played_duration(history_data, min_index, max_index, True, "custom") 1083 | 1084 | self.remove_next_result_from_now_result_list(history_data, ret_data, max_index) 1085 | 1086 | if ret_data: 1087 | duration = duration_dict["duration"] 1088 | duration_accurate = duration_dict["duration_accurate"] 1089 | sample_duration = duration_dict["sample_duration"] 1090 | db_duration = duration_dict["db_duration"] 1091 | mix_duration = duration_dict["mix_duration"] 1092 | accurate_timestamp_utc = duration_dict["accurate_timestamp_utc"] 1093 | ret_data['result']['metadata']['played_duration'] = abs(mix_duration) 1094 | ret_data['result']['metadata']['timestamp_utc'] = accurate_timestamp_utc 1095 | ret_data['timestamp'] = accurate_timestamp_utc 1096 | if ret_data['result']['metadata']['played_duration'] <= self._delay_custom_played_duration_min: 1097 | ret_data = None 1098 | 1099 | ########### cut history_data ############# 1100 | if break_index>=0: 1101 | cut_index = break_index 1102 | for i, item in enumerate(history_data[break_index:]): 1103 | if item[0][0] == NORESULT: 1104 | cut_index = break_index + i + 1 1105 | else: 1106 | break 1107 | cut_index = cut_index - 1 if cut_index >= 1 else cut_index 1108 | history_data = history_data[cut_index:] 1109 | 1110 | reverse_index = -1 1111 | for i, item in enumerate(history_data[::-1]): 1112 | if item[0][0] == NORESULT: 1113 | reverse_index = i 1114 | continue 1115 | else: 1116 | break 1117 | 1118 | if reverse_index != -1: 1119 | new_cut_index = -1 1120 | reverse_index = len(history_data) - reverse_index - 1 1121 | if reverse_index in [0, 1]: 1122 | history_data = [] 1123 | else: 1124 | pass 1125 | 1126 | self._delay_custom[stream_id] = history_data 1127 | return ret_data 1128 | 1129 | class FilterWorker: 1130 | def __init__(self): 1131 | self.tmp_no_result = {'status': {'msg': 'No result', 'code': 1001, 'version': '1.0'}, 'metadata': {}} 1132 | self._result_map = [] 1133 | self.init_logger() 1134 | self._result_filter = ResultFilter(self.dlog) 1135 | 1136 | def init_logger(self): 1137 | self.dlog = acrcloud_logger.AcrcloudLogger('Filter_Log') 1138 | self.dlog.addStreamHandler() 1139 | 1140 | def save_one_delay(self, old_data, isCustom=0): 1141 | data = None 1142 | if isCustom: 1143 | data = self._result_filter.deal_delay_custom(old_data) 1144 | else: 1145 | data = self._result_filter.deal_delay_history(old_data) 1146 | 1147 | if data is not None: 1148 | del data["stream_id"] 1149 | self._result_map.append(data) 1150 | return True 1151 | else: 1152 | return False 1153 | 1154 | def save_one(self, jsondata): 1155 | try: 1156 | timestamp = jsondata['timestamp'] 1157 | if jsondata['result']['status']['code'] != 0: 1158 | jsondata['result']['metadata'] = {'timestamp_utc':timestamp} 1159 | elif 'metadata' in jsondata['result']: 1160 | jsondata['result']['metadata']['timestamp_utc'] = timestamp 1161 | 1162 | tmp_no_result_json = {'status': {'msg': 'No result', 'code': 1001, 'version': '1.0'}, 'metadata': {'timestamp_utc': timestamp}} 1163 | 1164 | ret = False 1165 | custom_data = copy.deepcopy(jsondata) 1166 | if jsondata['result']['status']['code'] != 0: 1167 
| ret = self.save_one_delay(jsondata, 0) 1168 | ret = self.save_one_delay(custom_data, 1) 1169 | elif 'metadata' in jsondata['result'] and 'custom_files' in jsondata['result']['metadata']: 1170 | if 'music' in jsondata['result']['metadata']: 1171 | del custom_data['result']['metadata']['music'] 1172 | del jsondata['result']['metadata']['custom_files'] 1173 | ret = self.save_one_delay(jsondata, 0) 1174 | else: 1175 | jsondata['result'] = copy.deepcopy(tmp_no_result_json) 1176 | ret = self.save_one_delay(jsondata, 0) 1177 | ret = self.save_one_delay(custom_data, 1) 1178 | elif 'metadata' in jsondata['result'] and 'music' in jsondata['result']['metadata']: 1179 | custom_data['result'] = copy.deepcopy(tmp_no_result_json) 1180 | ret = self.save_one_delay(jsondata, 0) 1181 | except Exception as e: 1182 | self.dlog.logger.error("Error@save_one", exc_info=True) 1183 | return ret 1184 | 1185 | def do_filter(self, tmp_id, filepath, result, rec_length, timestamp): 1186 | try: 1187 | jsoninfo = { 1188 | "stream_id": tmp_id, 1189 | "file":filepath, 1190 | "rec_length": rec_length, 1191 | "result": result, 1192 | "timestamp": timestamp 1193 | } 1194 | self.save_one(jsoninfo) 1195 | except Exception as e: 1196 | self.dlog.logger.error("Error@do_filter", exc_info=True) 1197 | 1198 | def end_filter(self, tmp_id, rec_length, timestamp): 1199 | try: 1200 | tmp_no_result = copy.deepcopy(self.tmp_no_result) 1201 | for i in range(1, 60): 1202 | tmp_timestamp = datetime.datetime.strptime(timestamp, "%H:%M:%S") 1203 | new_timestamp = (tmp_timestamp + relativedelta(seconds=int(i*rec_length))).strftime("%H:%M:%S") 1204 | jsoninfo = { 1205 | "stream_id": tmp_id, 1206 | "rec_length": rec_length, 1207 | "result": tmp_no_result, 1208 | "timestamp": new_timestamp 1209 | } 1210 | self.save_one(jsoninfo) 1211 | except Exception as e: 1212 | self.dlog.logger.error("Error@end_filter", exc_info=True) 1213 | 1214 | def start_filter(self, tmp_id, rec_length, timestamp): 1215 | try: 1216 | tmp_no_result = copy.deepcopy(self.tmp_no_result) 1217 | for i in range(1, 0, -1): 1218 | new_timestamp = timestamp 1219 | jsoninfo = { 1220 | "stream_id": tmp_id, 1221 | "rec_length": rec_length, 1222 | "result": tmp_no_result, 1223 | "timestamp": new_timestamp 1224 | } 1225 | self.save_one(jsoninfo) 1226 | except Exception as e: 1227 | self.dlog.logger.error("Error@start_filter", exc_info=True) 1228 | 1229 | def apply_filter(self, result_list): 1230 | try: 1231 | appid = datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S") 1232 | rec_length = 10 1233 | timestamp = None 1234 | for index, item in enumerate(result_list): 1235 | filename = item["file"] 1236 | timestamp = item["timestamp"] 1237 | rec_length = item["rec_length"] 1238 | if index == 0: 1239 | self.start_filter(appid, rec_length, timestamp) 1240 | result = item["result"] 1241 | if "status" in result and result["status"]["code"] in [0, 1001]: 1242 | self.do_filter(appid, filename, result, rec_length, timestamp) 1243 | if timestamp is not None: 1244 | self.end_filter(appid, rec_length, timestamp) 1245 | except Exception as e: 1246 | self.dlog.logger.error("Error@apply_filter", exc_info=True) 1247 | return self._result_map 1248 | 1249 | def test(self): 1250 | a = '{"timestamp": "01 00:17:40", "rec_length": 10, "result": {"status": {"msg": "Success", "code": 0, "version": "1.0"}, "cost_time": 1.2630000114441, "result_type": 0, "metadata": {"timestamp_utc": "2018-08-02 14:44:39", "music": [{"album": {"name": "Solino"}, "play_offset_ms": 85200, "sample_begin_time_offset_ms": 300, "title": 
"La Bambola", "result_from": 1, "release_date": "2002-10-28", "sample_end_time_offset_ms": 9460, "genres": [{"name": "Pop"}], "label": "Amiga", "db_end_time_offset_ms": 85120, "score": 82, "db_begin_time_offset_ms": 75960, "artists": [{"name": "Patty Pravo"}], "duration_ms": 182200, "external_ids": {"isrc": "ITB006870616", "upc": "743219711328"}, "acrid": "27fef80da4dabc33591a2c08a08edaf0", "external_metadata": {"spotify": {"album": {"name": "Solino", "id": "0I3MXd5FYGAj6X9GOJepMb"}, "track": {"name": "La Bambola", "id": "5YT3WdXo5gBwZ0TlJiB0TE"}, "artists": [{"name": "Patty Pravo", "id": "2Yi5fknmHBqqKjHF6cXQyh"}]}, "deezer": {"album": {"name": "Solino", "id": "112016"}, "track": {"name": "La Bambola", "id": "1017795"}, "artists": [{"name": "Patty Pravo", "id": "58615"}]}, "youtube": {"vid": "UHCgZY-HX6U"}}}]}}, "file": "radioairplay_19/501.2018.06.19.04.00.00.mp3"}' 1251 | data = json.loads(a) 1252 | raw_title = self._result_filter.get_mutil_result_title(data, 'music', 1)[0] 1253 | sim_title = self._result_filter.tryStrSub(raw_title) 1254 | print(raw_title, sim_title) 1255 | --------------------------------------------------------------------------------