├── test.mp3
├── config.json
├── requirements.txt
├── Dockerfile
├── example.py
├── .gitignore
├── tools_language.py
├── acrcloud_logger.py
├── acrcloud_scan_files_libary.py
├── README.md
├── tools_str_sim.py
├── acrcloud_scan_files_python.py
└── acrcloud_filter_libary.py
/test.mp3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/acrcloud/acrcloud_scan_files_python/HEAD/test.mp3
--------------------------------------------------------------------------------
/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "host": "xxx",
3 | "access_key": "xxx",
4 | "access_secret": "xxx"
5 | }
6 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | python-Levenshtein
2 | fuzzywuzzy
3 | backports.csv
4 | requests
5 | openpyxl
6 | python-dateutil
7 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:2.7.16-slim-stretch
2 |
3 | COPY . /acr_scan_tool
4 | WORKDIR /acr_scan_tool
5 | RUN chmod +x /acr_scan_tool/acrcloud_scan_files_python.py
6 |
7 | ENV PATH=${PATH}:/acr_scan_tool
8 |
9 | RUN apt-get update \
10 | && apt-get install -y --no-install-recommends git \
11 | && apt-get purge -y --auto-remove \
12 | && rm -rf /var/lib/apt/lists/*
13 |
14 | RUN pip install git+https://github.com/acrcloud/acrcloud_sdk_python
15 | RUN pip install fuzzywuzzy requests openpyxl python-dateutil backports.csv
16 |
17 |
18 | ENTRYPOINT ["acrcloud_scan_files_python.py"]
19 |
--------------------------------------------------------------------------------
/example.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
#-*- coding:utf-8 -*-

"""Example: scan one media file with ACRCloud, print/record the result of
each fragment, then export an XLSX report with played durations.

Usage: python example.py <media_file>
"""

import os
import sys
import json
from acrcloud_scan_files_libary import ACRCloud_Scan_Files

if __name__ == "__main__":

    #ACRCloud Scan File Example
    is_debug = 1 #display the log info, or is_debug=0
    start_time = 0 #scan file start time(seconds)
    stop_time = 0 #scan file end time(seconds), or you can set it to the duration of file
    step = 10 #the length of each identified fragment (seconds)
    rec_length = step

    #your acrcloud project host, access_key, access_secret
    config = {
        "host": "your project host",
        "access_key": "your project access_key",
        "access_secret": "your project access_secret"
    }
    #export dir
    export_dir = "./"

    # the media file to scan is the first command-line argument
    filepath = sys.argv[1]

    acr_sfile = ACRCloud_Scan_Files(config, is_debug)

    # scan the whole file: stop at its duration
    stop_time = acr_sfile.get_duration_by_file(filepath)

    """
    #get a list of recognition results
    result_list = acr_sfile.recognize_file(filepath, start_time, stop_time, step, rec_length)
    #export to csv
    export_filename_csv = filepath + ".csv"
    acr_sfile.export_to_csv(result_list, export_filename_csv, export_dir)
    #export to xlsx
    export_filename_xlsx = filepath + ".xlsx"
    acr_sfile.export_to_xlsx(result_list, export_filename_xlsx, export_dir)
    """

    #iterator to get the result of each fragment
    result_list2 = []
    with open(filepath+"_raw_result.lst", "w") as wfile:
        for item in acr_sfile.for_recognize_file(filepath, start_time, stop_time, step, rec_length):
            result_list2.append(item)
            filename = item["file"]
            timestamp = item["timestamp"]
            res = acr_sfile.parse_data(item["result"])
            # parse_data returns a tuple; index 2 is the recognized title
            title = res[2]
            print(filename, timestamp, title)
            # keep one raw JSON result per line for later inspection
            wfile.write("{0}\n".format(json.dumps(item)))

    #get results with played-duration
    filter_results = acr_sfile.apply_filter(result_list2)
    #export the results to xlsx
    export_filtername_xlsx = filepath + "_with_duration.xlsx"
    acr_sfile.export_to_xlsx(filter_results, export_filtername_xlsx, export_dir)
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by .ignore support plugin (hsz.mobi)
2 | ### Python template
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | env/
14 | build/
15 | develop-eggs/
16 | dist/
17 | downloads/
18 | eggs/
19 | .eggs/
20 | lib/
21 | lib64/
22 | parts/
23 | sdist/
24 | var/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .coverage
43 | .coverage.*
44 | .cache
45 | nosetests.xml
46 | coverage.xml
47 | *,cover
48 | .hypothesis/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 |
58 | # Flask instance folder
59 | instance/
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # IPython Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # dotenv
80 | .env
81 |
82 | # virtualenv
83 | venv/
84 | ENV/
85 |
86 | # Spyder project settings
87 | .spyderproject
88 |
89 | # Rope project settings
90 | .ropeproject
91 | ### JetBrains template
92 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
93 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
94 |
95 | # User-specific stuff:
96 | .idea/workspace.xml
97 | .idea/tasks.xml
98 | .idea/dictionaries
99 | .idea/vcs.xml
100 | .idea/jsLibraryMappings.xml
101 |
102 | # Sensitive or high-churn files:
103 | .idea/dataSources.ids
104 | .idea/dataSources.xml
105 | .idea/dataSources.local.xml
106 | .idea/sqlDataSources.xml
107 | .idea/dynamic.xml
108 | .idea/uiDesigner.xml
109 |
110 | # Gradle:
111 | .idea/gradle.xml
112 | .idea/libraries
113 |
114 | # Mongo Explorer plugin:
115 | .idea/mongoSettings.xml
116 |
117 | ## File-based project format:
118 | *.iws
119 |
120 | ## Plugin-specific files:
121 |
122 | # IntelliJ
123 | /out/
124 |
125 | # mpeltonen/sbt-idea plugin
126 | .idea_modules/
127 |
128 | # JIRA plugin
129 | atlassian-ide-plugin.xml
130 |
131 | # Crashlytics plugin (for Android Studio and IntelliJ)
132 | com_crashlytics_export_strings.xml
133 | crashlytics.properties
134 | crashlytics-build.properties
135 | fabric.properties
136 |
137 |
--------------------------------------------------------------------------------
/tools_language.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #-*- coding:utf-8 -*-
3 |
# Python 2/3 compatibility: unichr() was removed in Python 3, where chr()
# already returns a unicode character.
try:
    _unichr = unichr
except NameError:
    _unichr = chr


class tools_language:
    """Helpers for classifying unicode characters (Chinese/CJK, digits,
    ASCII letters) and converting between full-width (SBC) and half-width
    (DBC) character forms."""

    def __init__(self):
        pass

    def is_chinese(self, uchar):
        """Return True if *uchar* is a common CJK unified ideograph."""
        return u'\u4e00' <= uchar <= u'\u9fa5'

    def is_CJK(self, uchar):
        """Return True if *uchar* falls in a CJK (Chinese/Japanese/Korean)
        range, or the Cyrillic range this project also treats as CJK."""
        if u'\u3000' <= uchar <= u'\u303f':      # CJK symbols and punctuation
            return True
        elif u'\u3040' <= uchar <= u'\u309f':    # Hiragana
            return True
        elif u'\u30a0' <= uchar <= u'\u30ff':    # Katakana
            return True
        elif u'\uff00' <= uchar <= u'\uffef':    # Fullwidth/halfwidth forms
            # BUGFIX: upper bound was u'\u30ff' (below the lower bound),
            # which made this branch unreachable.
            return True
        elif u'\u4e00' <= uchar <= u'\u9faf':    # CJK unified ideographs
            return True
        elif u'\u3400' <= uchar <= u'\u4dbf':    # CJK extension A
            return True
        elif u'\u0400' <= uchar <= u'\u052f':    # Cyrillic (Russian)
            return True
        elif u'\uac00' <= uchar <= u'\ud7ff':    # Hangul syllables (Korean)
            return True
        elif u'\u4e00' <= uchar <= u'\u9fa5':    # Chinese (subset of above)
            return True
        elif u'\uff61' <= uchar <= u'\uff9f':    # Halfwidth katakana
            return True
        else:
            return False

    def is_number(self, uchar):
        """Return True if *uchar* is an ASCII digit '0'-'9'."""
        # BUGFIX: the upper bound was u'\uffef', which classified almost any
        # character (letters, CJK, punctuation) as a number and broke
        # is_other()/string2List().
        return u'\u0030' <= uchar <= u'\u0039'

    def is_alphabet(self, uchar):
        """Return True if *uchar* is an ASCII letter A-Z or a-z."""
        return (u'\u0041' <= uchar <= u'\u005a') or (u'\u0061' <= uchar <= u'\u007a')

    def is_other(self, uchar):
        """Return True if *uchar* is neither Chinese, an ASCII digit, nor an
        ASCII letter."""
        return not (self.is_chinese(uchar) or self.is_number(uchar) or self.is_alphabet(uchar))

    def B2Q(self, uchar):
        """Convert a half-width (DBC) character to its full-width (SBC) form;
        characters outside the printable ASCII range are returned unchanged."""
        inside_code = ord(uchar)
        if inside_code < 0x0020 or inside_code > 0x7e:
            return uchar
        if inside_code == 0x0020:
            inside_code = 0x3000  # space has a dedicated full-width codepoint
        else:
            inside_code += 0xfee0
        return _unichr(inside_code)

    def Q2B(self, uchar):
        """Convert a full-width (SBC) character to its half-width (DBC) form;
        characters that are not full-width ASCII are returned unchanged."""
        inside_code = ord(uchar)
        if inside_code == 0x3000:
            inside_code = 0x0020
        else:
            inside_code -= 0xfee0
        if inside_code < 0x0020 or inside_code > 0x7e:
            return uchar
        return _unichr(inside_code)

    def stringQ2B(self, ustring):
        """Convert every full-width character in *ustring* to half-width."""
        return "".join([self.Q2B(uchar) for uchar in ustring])

    def uniform(self, ustring):
        """Normalize a string: full-width -> half-width, then lowercase."""
        return self.stringQ2B(ustring).lower()

    def string2List(self, ustring):
        """Split *ustring* into runs of Chinese/digit/letter characters,
        dropping separators (punctuation, whitespace, ...)."""
        ret_list = []
        current = []
        for uchar in ustring:
            if self.is_other(uchar):
                if current:
                    ret_list.append("".join(current))
                    current = []
            else:
                current.append(uchar)
        if current:
            ret_list.append("".join(current))
        return ret_list

    def has_chinese(self, ustring):
        """Return True if any character of *ustring* is Chinese."""
        return any(self.is_chinese(uchar) for uchar in ustring.lower())

    def has_CJK(self, ustring):
        """Return True if any character of *ustring* is CJK (see is_CJK)."""
        return any(self.is_CJK(uchar) for uchar in ustring.lower())
--------------------------------------------------------------------------------
/acrcloud_logger.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #-*- coding:utf-8 -*
3 |
4 | import os
5 | import sys
6 | import time
7 | import logging
8 | import traceback
9 | from logging.handlers import TimedRotatingFileHandler
10 | '''
11 | traceback records log
12 | try:
13 | pass
14 | except Exception, e:
15 | logger.error('Failed to open file', exc_info=True)
16 | '''
17 |
18 | import logging
19 |
# ANSI color indices; foreground escape code is 30 + index, background 40 + index.
BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = range(8)

# Map log level names (and plain color names) to ANSI color indices,
# consumed by ColoredFormatter below.
COLORS = {
    'WARNING' : YELLOW,
    'INFO' : GREEN,
    'DEBUG' : BLUE,
    'CRITICAL' : YELLOW,
    'ERROR' : RED,
    'RED' : RED,
    'GREEN' : GREEN,
    'YELLOW' : YELLOW,
    'BLUE' : BLUE,
    'MAGENTA' : MAGENTA,
    'CYAN' : CYAN,
    'WHITE' : WHITE,
}

# ANSI escape sequences: reset all attributes, set bold colored text, set bold.
RESET_SEQ = "\033[0m"
COLOR_SEQ = "\033[1;%dm"
BOLD_SEQ = "\033[1m"
40 |
class ColoredFormatter(logging.Formatter):
    """Formatter that expands $-placeholders ($RESET, $BOLD, $COLOR,
    $<NAME>, $BG<NAME>, $BG-<NAME>) into ANSI escape sequences and resets
    the terminal attributes at the end of each record."""

    def __init__(self, *args, **kwargs):
        # Formatter is an old-style class on Python 2, so super() is unusable.
        logging.Formatter.__init__(self, *args, **kwargs)

    def format(self, record):
        """Render *record*, substitute every color placeholder, append reset."""
        level_color = COLOR_SEQ % (30 + COLORS[record.levelname])
        text = logging.Formatter.format(self, record)
        text = text.replace("$RESET", RESET_SEQ)
        text = text.replace("$BOLD", BOLD_SEQ)
        text = text.replace("$COLOR", level_color)
        for name, code in COLORS.items():
            fg = COLOR_SEQ % (code + 30)
            bg = COLOR_SEQ % (code + 40)
            for placeholder, seq in (("$" + name, fg), ("$BG" + name, bg), ("$BG-" + name, bg)):
                text = text.replace(placeholder, seq)
        return text + RESET_SEQ
59 |
60 |
class AcrcloudLogger:
    """Thin wrapper around ``logging`` that wires a timed rotating file
    handler and/or a colored console handler onto a named logger."""

    def __init__(self, logname, loglevel = logging.INFO):
        self.logger = logging.getLogger(logname)
        self.logger.setLevel(loglevel)
        # Plain layout used for file output.
        self.default_fmt = '%(asctime)s - %(name)s - %(levelname)8s - %(message)s'
        # $-placeholder layout consumed by ColoredFormatter for console output.
        self.default_colorfmt = "$MAGENTA%(asctime)s$RESET - $COLOR%(name)-12s$RESET - $COLOR%(levelname)-6s$RESET - %(message)s"
        # Fallback directory when addFilehandler() gets no explicit logdir.
        self.default_dir = './radioLog'

    def addFilehandler(self, logfile, logdir = None, fmt = '', loglevel = logging.INFO, when='D', interval=10, backupCount=1):
        """Attach a TimedRotatingFileHandler writing to logdir/logfile.

        Returns True on success; on any error prints the traceback to
        stdout and returns False.
        """
        try:
            target_dir = self.default_dir if logdir is None else logdir
            if not os.path.exists(target_dir):
                os.makedirs(target_dir)
            handler = TimedRotatingFileHandler(os.path.join(target_dir, logfile), when, interval, backupCount)
            handler.setLevel(loglevel)
            handler.setFormatter(logging.Formatter(fmt if fmt else self.default_fmt))
            self.logger.addHandler(handler)
            return True
        except Exception:
            traceback.print_exc(file=sys.stdout)
            return False

    def addStreamHandler(self, fmt='', loglevel = logging.INFO):
        """Attach a colored StreamHandler; returns True/False like above."""
        try:
            handler = logging.StreamHandler()
            handler.setLevel(loglevel)
            handler.setFormatter(ColoredFormatter(fmt if fmt else self.default_colorfmt))
            self.logger.addHandler(handler)
            return True
        except Exception:
            traceback.print_exc(file=sys.stdout)
            return False
102 |
if __name__ == '__main__':

    # Smoke test: build a logger with both a rotating file handler and a
    # colored console handler attached.
    dlog = AcrcloudLogger('test', logging.INFO)
    dlog.addFilehandler('test.log')
    dlog.addStreamHandler()
    #dlog.logger.warn("hel")
    """
    for i in range(300):
        dlog.logger.warn('what!!!!!!!!!!!')
        #dlog.logger.info('hahhahah')
        #dlog.logger.error('it is monster!!')
        time.sleep(1)
    """
--------------------------------------------------------------------------------
/acrcloud_scan_files_libary.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #-*- coding:utf-8 -*-
3 |
4 | import os
5 | import sys
6 | import time
7 | import json
8 | import codecs
9 | import logging
10 | import openpyxl
11 | from backports import csv
12 | from openpyxl import Workbook
13 | from acrcloud_filter_libary import FilterWorker
14 | from acrcloud_logger import AcrcloudLogger
15 | from acrcloud.recognizer import ACRCloudRecognizer
16 |
17 | if sys.version_info.major == 2:
18 | reload(sys)
19 | sys.setdefaultencoding("utf8")
20 |
class ACRCloud_Scan_Files:
    """Scan audio/video files with the ACRCloud recognizer and export the
    recognition results to CSV/XLSX, optionally with played-duration info."""

    # Column headers shared by the CSV and XLSX exports.
    _HEADER_ROW = ['filename', 'timestamp', 'custom_files_title', 'custom_acrid', 'title', 'artists', 'album',
                   'acrid', 'played_duration', 'label', 'isrc', 'upc', 'deezer', 'spotify', 'itunes', 'youtube']

    def __init__(self, config, debug=1):
        """
        @param config : dict with keys "host", "access_key", "access_secret"
        @param debug  : 1 -> verbose logging (DEBUG level), 0 -> errors only
        """
        # "major.minor" string, kept for backward compatibility.
        self.openpyxl_version = ".".join(str(openpyxl.__version__).split(".")[:2])
        # Numeric (major, minor) used for version comparisons.
        # BUGFIX: the previous string comparison sorted "2.10" before "2.6",
        # picking the wrong openpyxl column API for newer versions.
        try:
            self._openpyxl_ver = tuple(int(x) for x in self.openpyxl_version.split("."))
        except ValueError:
            self._openpyxl_ver = (0, 0)
        self.config = config
        self.debug = debug
        self.init_log()
        self.re_handler = ACRCloudRecognizer(self.config)

    def init_log(self):
        """Create self.log (DEBUG when self.debug == 1, ERROR otherwise)."""
        log_level = logging.ERROR
        if self.debug == 1:
            log_level = logging.DEBUG

        shandler = logging.StreamHandler()
        self.log = logging.getLogger("ACRCloud_ScanFile")
        self.log.setLevel(log_level)
        self.log.addHandler(shandler)

    def as_text(self, value):
        """Return *value* as a string; None becomes the empty string."""
        if value is None:
            return ""
        return str(value)

    def get_duration_by_file(self, filepath):
        """Return the duration of *filepath* in whole seconds."""
        return int(ACRCloudRecognizer.get_duration_ms_by_file(filepath) / 1000)

    def _collect_rows(self, result_list):
        """Flatten successful results (status code 0) into spreadsheet rows
        sorted by timestamp; shared by export_to_xlsx / export_to_csv."""
        rows = []
        for item in result_list:
            jsoninfo = item["result"]
            if "status" in jsoninfo and jsoninfo["status"]["code"] == 0:
                rows.append([item["file"], item["timestamp"]] + list(self.parse_data(jsoninfo)))
        return sorted(rows, key=lambda x: x[1])

    def export_to_xlsx(self, result_list, export_filename="ACRCloud_ScanFile_Results.xlsx", export_dir="./"):
        """Write recognition results to export_dir/export_filename as XLSX.

        @param result_list : list of result dicts from recognize_file()
        """
        try:
            results = self._collect_rows(result_list)

            wb = Workbook()
            sheet_music = wb.active
            sheet_music.title = "ACRCloud_Scan_File"

            sheet_music.append(self._HEADER_ROW)
            for row in results:
                sheet_music.append(row)

            # Auto-size each column to its longest cell, capped at 80 chars.
            for column_cells in sheet_music.columns:
                length = max(len(self.as_text(cell.value)) for cell in column_cells)
                # BUGFIX: this used to read "length == 80" (a no-op
                # comparison), so the width cap was never applied.
                if length > 80:
                    length = 80
                if self._openpyxl_ver >= (2, 6):
                    # openpyxl >= 2.6 renamed Cell.column to column_letter
                    sheet_music.column_dimensions[column_cells[0].column_letter].width = length
                else:
                    sheet_music.column_dimensions[column_cells[0].column].width = length

            export_filepath = os.path.join(export_dir, export_filename)
            wb.save(export_filepath)
            if self.debug:
                self.log.info("export_to_xlsx.Save Data to xlsx: {0}".format(export_filename))
        except Exception as e:
            self.log.error("Error@export_to_xlsx", exc_info=True)

    def export_to_csv(self, result_list, export_filename="ACRCloud_ScanFile_Results.csv", export_dir="./"):
        """Write recognition results to export_dir/export_filename as CSV.

        @param result_list : list of result dicts from recognize_file()
        """
        try:
            results = self._collect_rows(result_list)

            export_filepath = os.path.join(export_dir, export_filename)

            # utf-8-sig writes a BOM so Excel detects the encoding correctly.
            with codecs.open(export_filepath, 'w', 'utf-8-sig') as f:
                dw = csv.writer(f)
                dw.writerow(self._HEADER_ROW)
                dw.writerows(results)
            if self.debug:
                self.log.info("export_to_csv.Save Data to csv: {0}".format(export_filename))
        except Exception as e:
            self.log.error("Error@export_to_csv", exc_info=True)

    def parse_data(self, jsoninfo):
        """Extract a flat tuple of fields from one recognition result.

        @param jsoninfo : decoded JSON dict of one recognition result
        @return tuple (custom_files_title, custom_acrid, title, artists,
                album, acrid, played_duration, label, isrc, upc, deezer,
                spotify, itunes, youtube); missing fields are empty strings.
        """
        try:
            title, played_duration, isrc, upc, acrid, label, album = [""] * 7
            artists, deezer, spotify, itunes, youtube, custom_files_title, audio_id, custom_acrid = [""] * 8

            metadata = jsoninfo.get('metadata', {})
            played_duration = metadata.get("played_duration", "")
            if "music" in metadata and len(metadata["music"]) > 0:
                item = metadata["music"][0]
                title = item.get("title", "")
                isrc = item.get("external_ids", {}).get("isrc", "")
                upc = item.get("external_ids", {}).get("upc", "")
                acrid = item.get("acrid", "")
                label = item.get("label", "")
                album = item.get("album", {}).get("name", "")
                artists = ",".join(ar["name"] for ar in item.get('artists', []) if ar.get("name"))
                external = item.get("external_metadata", {})
                deezer = external.get("deezer", {}).get("track", {}).get("id", "")
                spotify = external.get("spotify", {}).get("track", {}).get("id", "")
                itunes = external.get("itunes", {}).get("track", {}).get("id", "")
                youtube = external.get("youtube", {}).get("vid", "")

            if "custom_files" in metadata and len(metadata["custom_files"]) > 0:
                custom_item = metadata["custom_files"][0]
                custom_files_title = custom_item.get("title", "")
                audio_id = custom_item.get("audio_id", "")
                custom_acrid = custom_item.get("acrid", "")
        except Exception as e:
            self.log.error("Error@parse_data", exc_info=True)

        res = (custom_files_title, custom_acrid, title, artists, album, acrid,
               played_duration, label, isrc, upc, deezer, spotify, itunes, youtube,)

        return res

    def apply_filter(self, results):
        """Run FilterWorker over *results* to compute played durations."""
        fworker = FilterWorker()
        result_new = fworker.apply_filter(results)
        return result_new

    def do_recognize(self, filepath, start_time, rec_length):
        """Recognize one fragment of *filepath* starting at *start_time* for
        *rec_length* seconds.

        @return (filepath, "HH:MM:SS" offset string, raw JSON string or None)
        """
        current_time = time.strftime('%H:%M:%S', time.gmtime(start_time))
        res_data = self.re_handler.recognize_by_file(filepath, start_time, rec_length)
        return filepath, current_time, res_data

    def for_recognize_file(self, filepath, start_time, stop_time, step, rec_length):
        """Yield one result dict per fragment (generator variant of
        recognize_file; no status-code filtering is applied).

        @param filepath   : query file path
        @param start_time : first offset to recognize (seconds)
        @param stop_time  : end offset (seconds, exclusive)
        @param step       : distance between fragment starts (seconds)
        @param rec_length : duration of each fragment (seconds)
        """
        try:
            for i in range(start_time, stop_time, step):
                filep, current_time, res_data = self.do_recognize(filepath, i, rec_length)
                if res_data:
                    jsoninfo = json.loads(res_data)
                    if "metadata" in jsoninfo and "timestamp_utc" in jsoninfo["metadata"]:
                        # Replace the server timestamp with the in-file offset.
                        jsoninfo["metadata"]["timestamp_utc"] = current_time
                else:
                    jsoninfo = {}
                yield {"timestamp": current_time, "rec_length": rec_length, "result": jsoninfo, "file": filep}
        except Exception as e:
            self.log.error("Error@for_recognize_file", exc_info=True)

    def recognize_file(self, filepath, start_time, stop_time, step, rec_length):
        """Recognize the whole file fragment by fragment.

        Fragments whose status code is not 0/1001 are skipped; fatal codes
        (3001, 3003, 3013) abort the scan.

        @return list of result dicts ({"timestamp", "rec_length", "result", "file"})
        """
        result_list = []
        try:
            for i in range(start_time, stop_time, step):
                filep, current_time, res_data = self.do_recognize(filepath, i, rec_length)
                if res_data:
                    jsoninfo = json.loads(res_data)
                    # BUGFIX: pre-set code/msg so the except-handler below
                    # cannot raise NameError if status parsing itself fails.
                    code, msg = None, ""
                    try:
                        if "metadata" in jsoninfo and "timestamp_utc" in jsoninfo["metadata"]:
                            jsoninfo["metadata"]["timestamp_utc"] = current_time

                        code = jsoninfo["status"]["code"]
                        msg = jsoninfo["status"]["msg"]
                        if code not in [0, 1001]:
                            raise Exception("recognize_file.(timestamp: {0}, {1}, {2})".format(current_time, code, msg))
                    except Exception as e:
                        if self.debug:
                            self.log.error(e)
                        else:
                            print (e)
                        if code in [3001, 3003, 3013]:
                            # fatal account/limit errors: stop scanning
                            break
                        else:
                            continue

                    result_list.append({"timestamp": current_time, "rec_length": rec_length, "result": jsoninfo, "file": filep})
                    if self.debug:
                        parse_info = self.parse_data(jsoninfo)
                        # BUGFIX: index 2 of parse_data() is the title
                        # (index 0 is custom_files_title).
                        self.log.info('recognize_file.(timestamp: {0}, title: {1})'.format(current_time, parse_info[2]))
        except Exception as e:
            self.log.error("Error@recognize_file", exc_info=True)
        return result_list
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # [Audio Recognition](https://www.acrcloud.com/music-recognition) -- File Scan Tool (Python Script)
2 |
3 |
4 |
5 | ## Overview
6 | [ACRCloud](https://www.acrcloud.com/) provides [Automatic Content Recognition](https://www.acrcloud.com/docs/introduction/automatic-content-recognition/) services for [Audio Fingerprinting](https://www.acrcloud.com/docs/introduction/audio-fingerprinting/) based applications such as **[Audio Recognition](https://www.acrcloud.com/music-recognition)** (supports music, video, ads for both online and offline), **[Broadcast Monitoring](https://www.acrcloud.com/broadcast-monitoring)**, **[Second Screen](https://www.acrcloud.com/second-screen-synchronization)**, **[Copyright Protection](https://www.acrcloud.com/copyright-protection-de-duplication)** and etc.
7 |
8 | This tool can scan audio/video files and detect audios you want to recognize such as music, ads.
9 |
10 | Supported Format:
11 |
12 | >>Audio: mp3, wav, m4a, flac, aac, amr, ape, ogg ...
13 | >>Video: mp4, mkv, wmv, flv, ts, avi ...
14 |
15 | ## Requirements
16 |
17 | **Notice: This tool only supports Python 2.**
18 |
19 | - Python 2.x
20 | - fuzzywuzzy
21 | - openpyxl
22 | - backports.csv
23 | - requests
24 | - Follow one of the tutorials to create a project and get your host, access_key and access_secret.
25 |
26 |
27 | ## Run as a Docker Container
28 | - Install Docker
29 | - If you are using Windows: Download [Docker Desktop for Windows](https://download.docker.com/win/stable/Docker%20for%20Windows%20Installer.exe) and install.
30 | - If you are using MacOs: Download [Docker Desktop for Mac](https://download.docker.com/mac/stable/Docker.dmg) and install.
31 | - If you are using Linux: Open the Terminal and input `bash <(curl -s https://get.docker.com/)`
32 | - Change the config file (config.json).
33 | - Run following command
34 | ```
35 | git clone https://github.com/acrcloud/acrcloud_scan_files_python.git
36 |
37 | cd acrcloud_scan_files_python
38 |
39 | sudo docker build -t acrcloud/python_scan_tool .
40 | # Call it without arguments to display the full help
41 | sudo docker run --rm acrcloud/python_scan_tool
42 |
43 | # Basic usage
44 | sudo docker run --rm -v $(pwd):/tmp -v /Users/acrcloud/:/music/ acrcloud/python_scan_tool -f /music/test.mp4 -o /tmp
45 |
46 | You need to change /Users/acrcloud/ to the directory where your audio/video file is.
47 | And the report file will be in the acrcloud_scan_files_python directory.
48 | ```
49 | ## Installation
50 |
51 | For Windows System, you must install [Python](https://www.python.org/downloads/windows/) and [pip](https://pip.pypa.io/en/stable/installing/).
52 |
53 | Open your terminal and change to the script directory of acrcloud_scan_files_python-master. Then run the command:
54 |
55 | ```
56 | pip install -r requirements.txt
57 | ```
58 | ## Install ACRCloud Python SDK
59 |
60 |
61 | You can run the following command to install it.
62 |
63 | ```
64 | python -m pip install git+https://github.com/acrcloud/acrcloud_sdk_python
65 | ```
66 |
67 | Or you can download the sdk and install it by following command.
68 |
69 | [ACRCloud Python SDK](https://github.com/acrcloud/acrcloud_sdk_python).
70 |
71 |
72 | ```
73 | sudo python setup.py install
74 | ```
75 |
76 | ## For Windows
77 |
78 | ### Install Library
79 | Windows Runtime Library
80 |
81 | X86: [download and install Library(windows/vcredist_x86.exe)](https://www.microsoft.com/en-us/download/details.aspx?id=5555)
82 |
83 | x64: [download and install Library(windows/vcredist_x64.exe)](https://www.microsoft.com/en-us/download/details.aspx?id=14632)
84 |
85 |
86 | ## Usage for Scan File Tool:
87 |
88 | _ ____ ____ ____ _ _
89 | / \ / ___| _ \ / ___| | ___ _ _ __| |
90 | / _ \| | | |_) | | | |/ _ \| | | |/ _` |
91 | / ___ \ |___| _ <| |___| | (_) | |_| | (_| |
92 | /_/ \_\____|_| \_\\____|_|\___/ \____|\____|
93 |
94 | Before you use this script, you must have an ACRCloud host, access_key and access_secret.
95 | If you don't have these, you can register at https://console.acrcloud.com/signup
96 |
97 | Change the content of config.json, filling in your host, access_key and access_secret
98 | ```
99 | {
100 | "host": "xxxxx",
101 | "access_key": "xxxxx",
102 | "access_secret": "xxxxx"
103 | }
104 | ```
105 |
106 | ```
107 | python acrcloud_scan_files_python.py -d folder_path
108 | python acrcloud_scan_files_python.py -f file_path
109 | python acrcloud_scan_files_python.py -h get_usage_help
110 | ```
111 |
112 | ### Scan Folder Example:
113 | ```
114 | python acrcloud_scan_files_python.py -d ~/music
115 | ```
116 | ### Scan File Example:
117 | ```
118 | python acrcloud_scan_files_python.py -f ~/testfiles/test.mp3
119 | ```
120 |
121 | ### Add more params
122 |
123 | "-s" ---- scan step. (The scan interval.)
124 |
125 | "-l" ---- recognizing length. (how many seconds of audio to use for recognition. For example: -s 20 -l 10, it will get 20 seconds of audio each time and use the first 10 seconds of audio to recognize)
126 |
127 | "-r" ---- scan range. (The scan range. for example: -r 5-20, it will recognize the file starting from the 5th second and finish at the 20th second.)
128 |
129 | "-c" ---- set the config file path.
130 |
131 | "-w" ---- results with duration. (1-yes, 0-no), you must set offset config for your access key, pls contact support@acrcloud.com
132 |
133 | "-o" ---- set the directory to save the results
134 |
135 | "-t" ---- set the type of file.(csv[default] or xlsx).
136 | ```
137 | If you want to change scan interval or you want to set recognize range,you can add some params
138 | Example:
139 | python acrcloud_scan_files_python.py -f ~/testfiles/test.mp3 -s 30 -r 0-20
140 | python acrcloud_scan_files_python.py -d ~/music -s 30 -w 1
141 | ```
142 |
143 | By default, the script scans the folder it is located in.
144 | 
145 | The results are saved in the folder where this script is located.
146 |
147 |
148 | ## Usage for Scan File Libary
149 |
150 | Introduction all API.
151 |
152 | ### acrcloud_scan_files_libary.py
153 |
154 | ```
155 | class ACRCloud_Scan_Files:
156 | def get_duration_by_file(self, filepath):
157 | #@param filepath : query file path
158 | #@return : total duration of the file
159 |
160 | def export_to_xlsx(self, result_list, export_filename, export_dir):
161 | #@param result_list : the list of identification results
162 | #@param export_filename : export to this file
163 | #@param export_dir : export to this directory
164 |
165 | def export_to_csv(self, result_list, export_filename, export_dir):
166 | #@param result_list : the list of recognition results
167 | #@param export_filename : export to this file
168 | #@param export_dir : export to this directory
169 |
170 | def parse_data(self, result):
171 | #@param result : one recognition result
172 | #@return : a tuple, as follow
173 | # (title, artists, album, acrid, played_duration, label, isrc, upc,
174 | # deezer, spotify, itunes, youtube, custom_files_title, audio_id)
175 |
176 | def apply_filter(self, results):
177 | #@param results : the list of recognition results
178 | #@return : a list results with played_duration
179 |
180 | def for_recognize_file(self, filepath, start_time, stop_time, step, rec_length):
181 | #@param filepath : query file path
182 | #@param start_time : the start offset to recognize (seconds)
183 | #@param stop_time : the end offset to recognize (seconds)
184 | #@param rec_length : the duration of each fragment to recognize
185 | #@return : iterator to return the each recognition result
186 |
187 | def recognize_file(self, filepath, start_time, stop_time, step, rec_length):
188 | #@param filepath : query file path
189 | #@param start_time : the start offset to recognize (seconds)
190 | #@param stop_time : the end offset to recognize (seconds)
191 | #@param rec_length : the duration of each fragment to recognize
192 | #@return : the list of recognition results
193 | ```
194 |
195 | ### Example
196 |
197 | Run the example: python example.py test.mp3
198 |
199 | ```
200 | #!/usr/bin/env python
201 | #-*- coding:utf-8 -*-
202 |
203 | import os
204 | import sys
205 | from acrcloud_scan_files_libary import ACRCloud_Scan_Files
206 |
207 | if __name__ == "__main__":
208 |
209 | #ACRCloud Scan File Example
210 | is_debug = 1 #display the log info, or is_debug=0
211 | start_time = 0 #scan file start time(seconds)
212 | stop_time = 0 #scan file end time(seconds), or you can set it to the duration of file
213 | step = 10 #the length of each identified fragment (seconds)
214 | rec_length = step
215 |
216 | #your acrcloud project host, access_key, access_secret
217 | config = {
218 | "host": "XXX",
219 | "access_key":"XXX",
220 | "access_secret": "XXX"
221 | }
222 |
223 | filepath = sys.argv[1]
224 |
225 | acr_sfile = ACRCloud_Scan_Files(config, is_debug)
226 |
227 | stop_time = acr_sfile.get_duration_by_file(filepath)
228 |
229 | #get a list of recognition results
230 | result_list = acr_sfile.recognize_file(filepath, start_time, stop_time, step, rec_length)
231 |
232 | #export the result
233 | export_dir = "./"
234 | #export to csv
235 | export_filename_csv = "test.csv"
236 | acr_sfile.export_to_csv(result_list, export_filename_csv, export_dir)
237 | #export to xlsx
238 | export_filename_xlsx = "test.xlsx"
239 | acr_sfile.export_to_xlsx(result_list, export_filename_xlsx, export_dir)
240 |
241 | #iterator to get the result of each fragment
242 | result_list2 = []
243 | for item in acr_sfile.for_recognize_file(filepath, start_time, stop_time, step, rec_length):
244 | result_list2.append(item)
245 | filename = item["file"]
246 | timestamp = item["timestamp"]
247 | res = acr_sfile.parse_data(item["result"])
248 | title = res[0]
249 | print filename, timestamp, title
250 |
251 | #get results with played-duration
252 | filter_results = acr_sfile.apply_filter(result_list2)
253 | #export the results to xlsx
254 | export_filtername_xlsx = "test_with_duration.xlsx"
255 | acr_sfile.export_to_xlsx(filter_results, export_filtername_xlsx, export_dir)
256 | ```
257 |
--------------------------------------------------------------------------------
/tools_str_sim.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #-*- coding:utf-8 -*-
3 | """
4 | author: hong
5 | Copyright (c) 2011 Adam Cohen
6 | ......
7 |
8 | """
9 | import re
10 | import sys
11 | import string
12 | from fuzzywuzzy import fuzz
13 |
14 | reload(sys)
15 | sys.setdefaultencoding("utf8")
16 |
17 | RE_SPECIAL_STRING = """[ \[\][]\(\)()\n\t\r,\.\:"'‘“<>《》!!??&]"""
18 | RE_SUB_STRING = "(\(.*\))|(\[.*\])|((.*))"
19 | THREADHOLD = 75
20 |
21 | #https://stackoverflow.com/questions/286921/efficiently-replace-all-accented-characters-in-a-string
22 | latin_map={
23 | u"Á":"A",
24 | u"Ă":"A",
25 | u"Ắ":"A",
26 | u"Ặ":"A",
27 | u"Ằ":"A",
28 | u"Ẳ":"A",
29 | u"Ẵ":"A",
30 | u"Ǎ":"A",
31 | u"Â":"A",
32 | u"Ấ":"A",
33 | u"Ậ":"A",
34 | u"Ầ":"A",
35 | u"Ẩ":"A",
36 | u"Ẫ":"A",
37 | u"Ä":"A",
38 | u"Ǟ":"A",
39 | u"Ȧ":"A",
40 | u"Ǡ":"A",
41 | u"Ạ":"A",
42 | u"Ȁ":"A",
43 | u"À":"A",
44 | u"Ả":"A",
45 | u"Ȃ":"A",
46 | u"Ā":"A",
47 | u"Ą":"A",
48 | u"Å":"A",
49 | u"Ǻ":"A",
50 | u"Ḁ":"A",
51 | u"Ⱥ":"A",
52 | u"Ã":"A",
53 | u"Ꜳ":"AA",
54 | u"Æ":"AE",
55 | u"Ǽ":"AE",
56 | u"Ǣ":"AE",
57 | u"Ꜵ":"AO",
58 | u"Ꜷ":"AU",
59 | u"Ꜹ":"AV",
60 | u"Ꜻ":"AV",
61 | u"Ꜽ":"AY",
62 | u"Ḃ":"B",
63 | u"Ḅ":"B",
64 | u"Ɓ":"B",
65 | u"Ḇ":"B",
66 | u"Ƀ":"B",
67 | u"Ƃ":"B",
68 | u"Ć":"C",
69 | u"Č":"C",
70 | u"Ç":"C",
71 | u"Ḉ":"C",
72 | u"Ĉ":"C",
73 | u"Ċ":"C",
74 | u"Ƈ":"C",
75 | u"Ȼ":"C",
76 | u"Ď":"D",
77 | u"Ḑ":"D",
78 | u"Ḓ":"D",
79 | u"Ḋ":"D",
80 | u"Ḍ":"D",
81 | u"Ɗ":"D",
82 | u"Ḏ":"D",
83 | u"Dz":"D",
84 | u"Dž":"D",
85 | u"Đ":"D",
86 | u"Ƌ":"D",
87 | u"DZ":"DZ",
88 | u"DŽ":"DZ",
89 | u"É":"E",
90 | u"Ĕ":"E",
91 | u"Ě":"E",
92 | u"Ȩ":"E",
93 | u"Ḝ":"E",
94 | u"Ê":"E",
95 | u"Ế":"E",
96 | u"Ệ":"E",
97 | u"Ề":"E",
98 | u"Ể":"E",
99 | u"Ễ":"E",
100 | u"Ḙ":"E",
101 | u"Ë":"E",
102 | u"Ė":"E",
103 | u"Ẹ":"E",
104 | u"Ȅ":"E",
105 | u"È":"E",
106 | u"Ẻ":"E",
107 | u"Ȇ":"E",
108 | u"Ē":"E",
109 | u"Ḗ":"E",
110 | u"Ḕ":"E",
111 | u"Ę":"E",
112 | u"Ɇ":"E",
113 | u"Ẽ":"E",
114 | u"Ḛ":"E",
115 | u"Ꝫ":"ET",
116 | u"Ḟ":"F",
117 | u"Ƒ":"F",
118 | u"Ǵ":"G",
119 | u"Ğ":"G",
120 | u"Ǧ":"G",
121 | u"Ģ":"G",
122 | u"Ĝ":"G",
123 | u"Ġ":"G",
124 | u"Ɠ":"G",
125 | u"Ḡ":"G",
126 | u"Ǥ":"G",
127 | u"Ḫ":"H",
128 | u"Ȟ":"H",
129 | u"Ḩ":"H",
130 | u"Ĥ":"H",
131 | u"Ⱨ":"H",
132 | u"Ḧ":"H",
133 | u"Ḣ":"H",
134 | u"Ḥ":"H",
135 | u"Ħ":"H",
136 | u"Í":"I",
137 | u"Ĭ":"I",
138 | u"Ǐ":"I",
139 | u"Î":"I",
140 | u"Ï":"I",
141 | u"Ḯ":"I",
142 | u"İ":"I",
143 | u"Ị":"I",
144 | u"Ȉ":"I",
145 | u"Ì":"I",
146 | u"Ỉ":"I",
147 | u"Ȋ":"I",
148 | u"Ī":"I",
149 | u"Į":"I",
150 | u"Ɨ":"I",
151 | u"Ĩ":"I",
152 | u"Ḭ":"I",
153 | u"Ꝺ":"D",
154 | u"Ꝼ":"F",
155 | u"Ᵹ":"G",
156 | u"Ꞃ":"R",
157 | u"Ꞅ":"S",
158 | u"Ꞇ":"T",
159 | u"Ꝭ":"IS",
160 | u"Ĵ":"J",
161 | u"Ɉ":"J",
162 | u"Ḱ":"K",
163 | u"Ǩ":"K",
164 | u"Ķ":"K",
165 | u"Ⱪ":"K",
166 | u"Ꝃ":"K",
167 | u"Ḳ":"K",
168 | u"Ƙ":"K",
169 | u"Ḵ":"K",
170 | u"Ꝁ":"K",
171 | u"Ꝅ":"K",
172 | u"Ĺ":"L",
173 | u"Ƚ":"L",
174 | u"Ľ":"L",
175 | u"Ļ":"L",
176 | u"Ḽ":"L",
177 | u"Ḷ":"L",
178 | u"Ḹ":"L",
179 | u"Ⱡ":"L",
180 | u"Ꝉ":"L",
181 | u"Ḻ":"L",
182 | u"Ŀ":"L",
183 | u"Ɫ":"L",
184 | u"Lj":"L",
185 | u"Ł":"L",
186 | u"LJ":"LJ",
187 | u"Ḿ":"M",
188 | u"Ṁ":"M",
189 | u"Ṃ":"M",
190 | u"Ɱ":"M",
191 | u"Ń":"N",
192 | u"Ň":"N",
193 | u"Ņ":"N",
194 | u"Ṋ":"N",
195 | u"Ṅ":"N",
196 | u"Ṇ":"N",
197 | u"Ǹ":"N",
198 | u"Ɲ":"N",
199 | u"Ṉ":"N",
200 | u"Ƞ":"N",
201 | u"Nj":"N",
202 | u"Ñ":"N",
203 | u"NJ":"NJ",
204 | u"Ó":"O",
205 | u"Ŏ":"O",
206 | u"Ǒ":"O",
207 | u"Ô":"O",
208 | u"Ố":"O",
209 | u"Ộ":"O",
210 | u"Ồ":"O",
211 | u"Ổ":"O",
212 | u"Ỗ":"O",
213 | u"Ö":"O",
214 | u"Ȫ":"O",
215 | u"Ȯ":"O",
216 | u"Ȱ":"O",
217 | u"Ọ":"O",
218 | u"Ő":"O",
219 | u"Ȍ":"O",
220 | u"Ò":"O",
221 | u"Ỏ":"O",
222 | u"Ơ":"O",
223 | u"Ớ":"O",
224 | u"Ợ":"O",
225 | u"Ờ":"O",
226 | u"Ở":"O",
227 | u"Ỡ":"O",
228 | u"Ȏ":"O",
229 | u"Ꝋ":"O",
230 | u"Ꝍ":"O",
231 | u"Ō":"O",
232 | u"Ṓ":"O",
233 | u"Ṑ":"O",
234 | u"Ɵ":"O",
235 | u"Ǫ":"O",
236 | u"Ǭ":"O",
237 | u"Ø":"O",
238 | u"Ǿ":"O",
239 | u"Õ":"O",
240 | u"Ṍ":"O",
241 | u"Ṏ":"O",
242 | u"Ȭ":"O",
243 | u"Ƣ":"OI",
244 | u"Ꝏ":"OO",
245 | u"Ɛ":"E",
246 | u"Ɔ":"O",
247 | u"Ȣ":"OU",
248 | u"Ṕ":"P",
249 | u"Ṗ":"P",
250 | u"Ꝓ":"P",
251 | u"Ƥ":"P",
252 | u"Ꝕ":"P",
253 | u"Ᵽ":"P",
254 | u"Ꝑ":"P",
255 | u"Ꝙ":"Q",
256 | u"Ꝗ":"Q",
257 | u"Ŕ":"R",
258 | u"Ř":"R",
259 | u"Ŗ":"R",
260 | u"Ṙ":"R",
261 | u"Ṛ":"R",
262 | u"Ṝ":"R",
263 | u"Ȑ":"R",
264 | u"Ȓ":"R",
265 | u"Ṟ":"R",
266 | u"Ɍ":"R",
267 | u"Ɽ":"R",
268 | u"Ꜿ":"C",
269 | u"Ǝ":"E",
270 | u"Ś":"S",
271 | u"Ṥ":"S",
272 | u"Š":"S",
273 | u"Ṧ":"S",
274 | u"Ş":"S",
275 | u"Ŝ":"S",
276 | u"Ș":"S",
277 | u"Ṡ":"S",
278 | u"Ṣ":"S",
279 | u"Ṩ":"S",
280 | u"Ť":"T",
281 | u"Ţ":"T",
282 | u"Ṱ":"T",
283 | u"Ț":"T",
284 | u"Ⱦ":"T",
285 | u"Ṫ":"T",
286 | u"Ṭ":"T",
287 | u"Ƭ":"T",
288 | u"Ṯ":"T",
289 | u"Ʈ":"T",
290 | u"Ŧ":"T",
291 | u"Ɐ":"A",
292 | u"Ꞁ":"L",
293 | u"Ɯ":"M",
294 | u"Ʌ":"V",
295 | u"Ꜩ":"TZ",
296 | u"Ú":"U",
297 | u"Ŭ":"U",
298 | u"Ǔ":"U",
299 | u"Û":"U",
300 | u"Ṷ":"U",
301 | u"Ü":"U",
302 | u"Ǘ":"U",
303 | u"Ǚ":"U",
304 | u"Ǜ":"U",
305 | u"Ǖ":"U",
306 | u"Ṳ":"U",
307 | u"Ụ":"U",
308 | u"Ű":"U",
309 | u"Ȕ":"U",
310 | u"Ù":"U",
311 | u"Ủ":"U",
312 | u"Ư":"U",
313 | u"Ứ":"U",
314 | u"Ự":"U",
315 | u"Ừ":"U",
316 | u"Ử":"U",
317 | u"Ữ":"U",
318 | u"Ȗ":"U",
319 | u"Ū":"U",
320 | u"Ṻ":"U",
321 | u"Ų":"U",
322 | u"Ů":"U",
323 | u"Ũ":"U",
324 | u"Ṹ":"U",
325 | u"Ṵ":"U",
326 | u"Ꝟ":"V",
327 | u"Ṿ":"V",
328 | u"Ʋ":"V",
329 | u"Ṽ":"V",
330 | u"Ꝡ":"VY",
331 | u"Ẃ":"W",
332 | u"Ŵ":"W",
333 | u"Ẅ":"W",
334 | u"Ẇ":"W",
335 | u"Ẉ":"W",
336 | u"Ẁ":"W",
337 | u"Ⱳ":"W",
338 | u"Ẍ":"X",
339 | u"Ẋ":"X",
340 | u"Ý":"Y",
341 | u"Ŷ":"Y",
342 | u"Ÿ":"Y",
343 | u"Ẏ":"Y",
344 | u"Ỵ":"Y",
345 | u"Ỳ":"Y",
346 | u"Ƴ":"Y",
347 | u"Ỷ":"Y",
348 | u"Ỿ":"Y",
349 | u"Ȳ":"Y",
350 | u"Ɏ":"Y",
351 | u"Ỹ":"Y",
352 | u"Ź":"Z",
353 | u"Ž":"Z",
354 | u"Ẑ":"Z",
355 | u"Ⱬ":"Z",
356 | u"Ż":"Z",
357 | u"Ẓ":"Z",
358 | u"Ȥ":"Z",
359 | u"Ẕ":"Z",
360 | u"Ƶ":"Z",
361 | u"IJ":"IJ",
362 | u"Œ":"OE",
363 | u"ᴀ":"A",
364 | u"ᴁ":"AE",
365 | u"ʙ":"B",
366 | u"ᴃ":"B",
367 | u"ᴄ":"C",
368 | u"ᴅ":"D",
369 | u"ᴇ":"E",
370 | u"ꜰ":"F",
371 | u"ɢ":"G",
372 | u"ʛ":"G",
373 | u"ʜ":"H",
374 | u"ɪ":"I",
375 | u"ʁ":"R",
376 | u"ᴊ":"J",
377 | u"ᴋ":"K",
378 | u"ʟ":"L",
379 | u"ᴌ":"L",
380 | u"ᴍ":"M",
381 | u"ɴ":"N",
382 | u"ᴏ":"O",
383 | u"ɶ":"OE",
384 | u"ᴐ":"O",
385 | u"ᴕ":"OU",
386 | u"ᴘ":"P",
387 | u"ʀ":"R",
388 | u"ᴎ":"N",
389 | u"ᴙ":"R",
390 | u"ꜱ":"S",
391 | u"ᴛ":"T",
392 | u"ⱻ":"E",
393 | u"ᴚ":"R",
394 | u"ᴜ":"U",
395 | u"ᴠ":"V",
396 | u"ᴡ":"W",
397 | u"ʏ":"Y",
398 | u"ᴢ":"Z",
399 | u"á":"a",
400 | #"á":"a",
401 | u"ă":"a",
402 | u"ắ":"a",
403 | u"ặ":"a",
404 | u"ằ":"a",
405 | u"ẳ":"a",
406 | u"ẵ":"a",
407 | u"ǎ":"a",
408 | u"â":"a",
409 | u"ấ":"a",
410 | u"ậ":"a",
411 | u"ầ":"a",
412 | u"ẩ":"a",
413 | u"ẫ":"a",
414 | u"ä":"a",
415 | u"ǟ":"a",
416 | u"ȧ":"a",
417 | u"ǡ":"a",
418 | u"ạ":"a",
419 | u"ȁ":"a",
420 | u"à":"a",
421 | u"ả":"a",
422 | u"ȃ":"a",
423 | u"ā":"a",
424 | u"ą":"a",
425 | u"ᶏ":"a",
426 | u"ẚ":"a",
427 | u"å":"a",
428 | u"ǻ":"a",
429 | u"ḁ":"a",
430 | u"ⱥ":"a",
431 | u"ã":"a",
432 | u"ꜳ":"aa",
433 | u"æ":"ae",
434 | u"ǽ":"ae",
435 | u"ǣ":"ae",
436 | u"ꜵ":"ao",
437 | u"ꜷ":"au",
438 | u"ꜹ":"av",
439 | u"ꜻ":"av",
440 | u"ꜽ":"ay",
441 | u"ḃ":"b",
442 | u"ḅ":"b",
443 | u"ɓ":"b",
444 | u"ḇ":"b",
445 | u"ᵬ":"b",
446 | u"ᶀ":"b",
447 | u"ƀ":"b",
448 | u"ƃ":"b",
449 | u"ɵ":"o",
450 | u"ć":"c",
451 | u"č":"c",
452 | u"ç":"c",
453 | u"ḉ":"c",
454 | u"ĉ":"c",
455 | u"ɕ":"c",
456 | u"ċ":"c",
457 | u"ƈ":"c",
458 | u"ȼ":"c",
459 | u"ď":"d",
460 | u"ḑ":"d",
461 | u"ḓ":"d",
462 | u"ȡ":"d",
463 | u"ḋ":"d",
464 | u"ḍ":"d",
465 | u"ɗ":"d",
466 | u"ᶑ":"d",
467 | u"ḏ":"d",
468 | u"ᵭ":"d",
469 | u"ᶁ":"d",
470 | u"đ":"d",
471 | u"ɖ":"d",
472 | u"ƌ":"d",
473 | u"ı":"i",
474 | u"ȷ":"j",
475 | u"ɟ":"j",
476 | u"ʄ":"j",
477 | u"dz":"dz",
478 | u"dž":"dz",
479 | u"é":"e",
480 | u"ĕ":"e",
481 | u"ě":"e",
482 | u"ȩ":"e",
483 | u"ḝ":"e",
484 | u"ê":"e",
485 | u"ế":"e",
486 | u"ệ":"e",
487 | u"ề":"e",
488 | u"ể":"e",
489 | u"ễ":"e",
490 | u"ḙ":"e",
491 | u"ë":"e",
492 | u"ė":"e",
493 | u"ẹ":"e",
494 | u"ȅ":"e",
495 | u"è":"e",
496 | u"ẻ":"e",
497 | u"ȇ":"e",
498 | u"ē":"e",
499 | u"ḗ":"e",
500 | u"ḕ":"e",
501 | u"ⱸ":"e",
502 | u"ę":"e",
503 | u"ᶒ":"e",
504 | u"ɇ":"e",
505 | u"ẽ":"e",
506 | u"ḛ":"e",
507 | u"ꝫ":"et",
508 | u"ḟ":"f",
509 | u"ƒ":"f",
510 | u"ᵮ":"f",
511 | u"ᶂ":"f",
512 | u"ǵ":"g",
513 | u"ğ":"g",
514 | u"ǧ":"g",
515 | u"ģ":"g",
516 | u"ĝ":"g",
517 | u"ġ":"g",
518 | u"ɠ":"g",
519 | u"ḡ":"g",
520 | u"ᶃ":"g",
521 | u"ǥ":"g",
522 | u"ḫ":"h",
523 | u"ȟ":"h",
524 | u"ḩ":"h",
525 | u"ĥ":"h",
526 | u"ⱨ":"h",
527 | u"ḧ":"h",
528 | u"ḣ":"h",
529 | u"ḥ":"h",
530 | u"ɦ":"h",
531 | u"ẖ":"h",
532 | u"ħ":"h",
533 | u"ƕ":"hv",
534 | u"í":"i",
535 | u"ĭ":"i",
536 | u"ǐ":"i",
537 | u"î":"i",
538 | u"ï":"i",
539 | u"ḯ":"i",
540 | u"ị":"i",
541 | u"ȉ":"i",
542 | u"ì":"i",
543 | u"ỉ":"i",
544 | u"ȋ":"i",
545 | u"ī":"i",
546 | u"į":"i",
547 | u"ᶖ":"i",
548 | u"ɨ":"i",
549 | u"ĩ":"i",
550 | u"ḭ":"i",
551 | u"ꝺ":"d",
552 | u"ꝼ":"f",
553 | u"ᵹ":"g",
554 | u"ꞃ":"r",
555 | u"ꞅ":"s",
556 | u"ꞇ":"t",
557 | u"ꝭ":"is",
558 | u"ǰ":"j",
559 | u"ĵ":"j",
560 | u"ʝ":"j",
561 | u"ɉ":"j",
562 | u"ḱ":"k",
563 | u"ǩ":"k",
564 | u"ķ":"k",
565 | u"ⱪ":"k",
566 | u"ꝃ":"k",
567 | u"ḳ":"k",
568 | u"ƙ":"k",
569 | u"ḵ":"k",
570 | u"ᶄ":"k",
571 | u"ꝁ":"k",
572 | u"ꝅ":"k",
573 | u"ĺ":"l",
574 | u"ƚ":"l",
575 | u"ɬ":"l",
576 | u"ľ":"l",
577 | u"ļ":"l",
578 | u"ḽ":"l",
579 | u"ȴ":"l",
580 | u"ḷ":"l",
581 | u"ḹ":"l",
582 | u"ⱡ":"l",
583 | u"ꝉ":"l",
584 | u"ḻ":"l",
585 | u"ŀ":"l",
586 | u"ɫ":"l",
587 | u"ᶅ":"l",
588 | u"ɭ":"l",
589 | u"ł":"l",
590 | u"lj":"lj",
591 | u"ſ":"s",
592 | u"ẜ":"s",
593 | u"ẛ":"s",
594 | u"ẝ":"s",
595 | u"ḿ":"m",
596 | u"ṁ":"m",
597 | u"ṃ":"m",
598 | u"ɱ":"m",
599 | u"ᵯ":"m",
600 | u"ᶆ":"m",
601 | u"ń":"n",
602 | u"ň":"n",
603 | u"ņ":"n",
604 | u"ṋ":"n",
605 | u"ȵ":"n",
606 | u"ṅ":"n",
607 | u"ṇ":"n",
608 | u"ǹ":"n",
609 | u"ɲ":"n",
610 | u"ṉ":"n",
611 | u"ƞ":"n",
612 | u"ᵰ":"n",
613 | u"ᶇ":"n",
614 | u"ɳ":"n",
615 | u"ñ":"n",
616 | u"nj":"nj",
617 | u"ó":"o",
618 | u"ŏ":"o",
619 | u"ǒ":"o",
620 | u"ô":"o",
621 | u"ố":"o",
622 | u"ộ":"o",
623 | u"ồ":"o",
624 | u"ổ":"o",
625 | u"ỗ":"o",
626 | u"ö":"o",
627 | u"ȫ":"o",
628 | u"ȯ":"o",
629 | u"ȱ":"o",
630 | u"ọ":"o",
631 | u"ő":"o",
632 | u"ȍ":"o",
633 | u"ò":"o",
634 | u"ỏ":"o",
635 | u"ơ":"o",
636 | u"ớ":"o",
637 | u"ợ":"o",
638 | u"ờ":"o",
639 | u"ở":"o",
640 | u"ỡ":"o",
641 | u"ȏ":"o",
642 | u"ꝋ":"o",
643 | u"ꝍ":"o",
644 | u"ⱺ":"o",
645 | u"ō":"o",
646 | u"ṓ":"o",
647 | u"ṑ":"o",
648 | u"ǫ":"o",
649 | u"ǭ":"o",
650 | u"ø":"o",
651 | u"ǿ":"o",
652 | u"õ":"o",
653 | u"ṍ":"o",
654 | u"ṏ":"o",
655 | u"ȭ":"o",
656 | u"ƣ":"oi",
657 | u"ꝏ":"oo",
658 | u"ɛ":"e",
659 | u"ᶓ":"e",
660 | u"ɔ":"o",
661 | u"ᶗ":"o",
662 | u"ȣ":"ou",
663 | u"ṕ":"p",
664 | u"ṗ":"p",
665 | u"ꝓ":"p",
666 | u"ƥ":"p",
667 | u"ᵱ":"p",
668 | u"ᶈ":"p",
669 | u"ꝕ":"p",
670 | u"ᵽ":"p",
671 | u"ꝑ":"p",
672 | u"ꝙ":"q",
673 | u"ʠ":"q",
674 | u"ɋ":"q",
675 | u"ꝗ":"q",
676 | u"ŕ":"r",
677 | u"ř":"r",
678 | u"ŗ":"r",
679 | u"ṙ":"r",
680 | u"ṛ":"r",
681 | u"ṝ":"r",
682 | u"ȑ":"r",
683 | u"ɾ":"r",
684 | u"ᵳ":"r",
685 | u"ȓ":"r",
686 | u"ṟ":"r",
687 | u"ɼ":"r",
688 | u"ᵲ":"r",
689 | u"ᶉ":"r",
690 | u"ɍ":"r",
691 | u"ɽ":"r",
692 | u"ↄ":"c",
693 | u"ꜿ":"c",
694 | u"ɘ":"e",
695 | u"ɿ":"r",
696 | u"ś":"s",
697 | u"ṥ":"s",
698 | u"š":"s",
699 | u"ṧ":"s",
700 | u"ş":"s",
701 | u"ŝ":"s",
702 | u"ș":"s",
703 | u"ṡ":"s",
704 | u"ṣ":"s",
705 | u"ṩ":"s",
706 | u"ʂ":"s",
707 | u"ᵴ":"s",
708 | u"ᶊ":"s",
709 | u"ȿ":"s",
710 | u"ɡ":"g",
711 | u"ᴑ":"o",
712 | u"ᴓ":"o",
713 | u"ᴝ":"u",
714 | u"ť":"t",
715 | u"ţ":"t",
716 | u"ṱ":"t",
717 | u"ț":"t",
718 | u"ȶ":"t",
719 | u"ẗ":"t",
720 | u"ⱦ":"t",
721 | u"ṫ":"t",
722 | u"ṭ":"t",
723 | u"ƭ":"t",
724 | u"ṯ":"t",
725 | u"ᵵ":"t",
726 | u"ƫ":"t",
727 | u"ʈ":"t",
728 | u"ŧ":"t",
729 | u"ᵺ":"th",
730 | u"ɐ":"a",
731 | u"ᴂ":"ae",
732 | u"ǝ":"e",
733 | u"ᵷ":"g",
734 | u"ɥ":"h",
735 | u"ʮ":"h",
736 | u"ʯ":"h",
737 | u"ᴉ":"i",
738 | u"ʞ":"k",
739 | u"ꞁ":"l",
740 | u"ɯ":"m",
741 | u"ɰ":"m",
742 | u"ᴔ":"oe",
743 | u"ɹ":"r",
744 | u"ɻ":"r",
745 | u"ɺ":"r",
746 | u"ⱹ":"r",
747 | u"ʇ":"t",
748 | u"ʌ":"v",
749 | u"ʍ":"w",
750 | u"ʎ":"y",
751 | u"ꜩ":"tz",
752 | u"ú":"u",
753 | u"ŭ":"u",
754 | u"ǔ":"u",
755 | u"û":"u",
756 | u"ṷ":"u",
757 | u"ü":"u",
758 | u"ǘ":"u",
759 | u"ǚ":"u",
760 | u"ǜ":"u",
761 | u"ǖ":"u",
762 | u"ṳ":"u",
763 | u"ụ":"u",
764 | u"ű":"u",
765 | u"ȕ":"u",
766 | u"ù":"u",
767 | u"ủ":"u",
768 | u"ư":"u",
769 | u"ứ":"u",
770 | u"ự":"u",
771 | u"ừ":"u",
772 | u"ử":"u",
773 | u"ữ":"u",
774 | u"ȗ":"u",
775 | u"ū":"u",
776 | u"ṻ":"u",
777 | u"ų":"u",
778 | u"ᶙ":"u",
779 | u"ů":"u",
780 | u"ũ":"u",
781 | u"ṹ":"u",
782 | u"ṵ":"u",
783 | u"ᵫ":"ue",
784 | u"ꝸ":"um",
785 | u"ⱴ":"v",
786 | u"ꝟ":"v",
787 | u"ṿ":"v",
788 | u"ʋ":"v",
789 | u"ᶌ":"v",
790 | u"ⱱ":"v",
791 | u"ṽ":"v",
792 | u"ꝡ":"vy",
793 | u"ẃ":"w",
794 | u"ŵ":"w",
795 | u"ẅ":"w",
796 | u"ẇ":"w",
797 | u"ẉ":"w",
798 | u"ẁ":"w",
799 | u"ⱳ":"w",
800 | u"ẘ":"w",
801 | u"ẍ":"x",
802 | u"ẋ":"x",
803 | u"ᶍ":"x",
804 | u"ý":"y",
805 | u"ŷ":"y",
806 | u"ÿ":"y",
807 | u"ẏ":"y",
808 | u"ỵ":"y",
809 | u"ỳ":"y",
810 | u"ƴ":"y",
811 | u"ỷ":"y",
812 | u"ỿ":"y",
813 | u"ȳ":"y",
814 | u"ẙ":"y",
815 | u"ɏ":"y",
816 | u"ỹ":"y",
817 | u"ź":"z",
818 | u"ž":"z",
819 | u"ẑ":"z",
820 | u"ʑ":"z",
821 | u"ⱬ":"z",
822 | u"ż":"z",
823 | u"ẓ":"z",
824 | u"ȥ":"z",
825 | u"ẕ":"z",
826 | u"ᵶ":"z",
827 | u"ᶎ":"z",
828 | u"ʐ":"z",
829 | u"ƶ":"z",
830 | u"ɀ":"z",
831 | u"ff":"ff",
832 | u"ffi":"ffi",
833 | u"ffl":"ffl",
834 | u"fi":"fi",
835 | u"fl":"fl",
836 | u"ij":"ij",
837 | u"œ":"oe",
838 | u"st":"st",
839 | u"ₐ":"a",
840 | u"ₑ":"e",
841 | u"ᵢ":"i",
842 | u"ⱼ":"j",
843 | u"ₒ":"o",
844 | u"ᵣ":"r",
845 | u"ᵤ":"u",
846 | u"ᵥ":"v",
847 | u"ₓ":"x",
848 | }
849 |
850 |
851 |
def latinize(old_str):
    """Lowercase *old_str* and map each accented/Latin-variant character to
    its plain ASCII replacement from latin_map; unmapped characters pass
    through unchanged."""
    lowered = old_str.lower()
    return "".join(str(latin_map.get(ch, ch)) for ch in lowered)
858 |
def str_filter_sub(old_str):
    """Drop bracketed annotations (RE_SUB_STRING), then strip the special
    separator/punctuation characters (RE_SPECIAL_STRING)."""
    without_brackets = re.sub(RE_SUB_STRING, "", old_str)
    return re.sub(RE_SPECIAL_STRING, "", without_brackets)
863 |
def str_filter(old_str):
    """Remove the special separator/punctuation characters and trim
    surrounding whitespace."""
    cleaned = re.sub(RE_SPECIAL_STRING, "", old_str)
    return cleaned.strip()
866 |
def remove_punct(input_str):
    """Replace every ASCII punctuation character in *input_str* with a space
    and collapse whitespace runs to single spaces.

    Falsy input (None, "") is returned unchanged.

    Fix: the original called ``string.maketrans``, which does not exist on
    Python 3 (AttributeError); prefer ``str.maketrans`` and fall back to the
    Python 2 ``string`` module helper.
    """
    if not input_str:
        return input_str
    punct = string.punctuation
    spaces = " " * len(punct)
    try:
        tran_tab = str.maketrans(punct, spaces)       # Python 3
    except AttributeError:
        tran_tab = string.maketrans(punct, spaces)    # Python 2 fallback
    translated = input_str.translate(tran_tab)
    return " ".join(translated.split())
875 |
def str_sub(old_str):
    """Normalize a title for comparison: lowercase, drop bracketed parts,
    cut everything after the first " - " separator, latinize accents, then
    strip punctuation."""
    lowered = old_str.lower()
    trimmed = re.sub(RE_SUB_STRING, "", lowered).strip()
    dash_pos = trimmed.find(" - ")
    if dash_pos != -1:
        trimmed = trimmed[:dash_pos]
    latinized = latinize(trimmed)
    return remove_punct(latinized.strip())
884 |
def str_sim(str1_old, str2_old):
    """Decide whether two strings name the same title.

    Returns (is_similar, detail): detail is "" for an exact or substring
    match after filtering, otherwise the fuzz ratio as a string.
    """
    # NOTE(review): the original docstring warned against str() conversion
    # yet performed it; behavior kept as-is.
    str1 = str(str1_old)
    str2 = str(str2_old)

    a = str_filter(str1.lower().strip())
    b = str_filter(str2.lower().strip())
    if a == b or a.find(b) != -1 or b.find(a) != -1:
        return True, ""

    a = str_filter_sub(str1.lower().strip())
    b = str_filter_sub(str2.lower().strip())
    ratio = fuzz.ratio(a, b)
    is_similar = (ratio >= THREADHOLD
                  or a == b
                  or a.find(b) != -1
                  or b.find(a) != -1)
    return is_similar, str(ratio)
901 |
902 |
--------------------------------------------------------------------------------
/acrcloud_scan_files_python.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding:utf-8 -*-
3 |
4 | import os
5 | import sys
6 | import time
7 | import json
8 | import codecs
9 | import optparse
10 | import logging
11 | import openpyxl
12 | from backports import csv
13 | from openpyxl import Workbook
14 | from acrcloud_logger import AcrcloudLogger
15 | from acrcloud_filter_libary import FilterWorker
16 | from acrcloud.recognizer import ACRCloudRecognizer
17 |
18 | if sys.version_info.major == 2:
19 | reload(sys)
20 | sys.setdefaultencoding("utf8")
21 |
22 |
class ACRCloud_Scan_Files:
    """Scan local audio files against the ACRCloud recognition service.

    Cuts a file into fixed-length fragments, recognizes each fragment
    through ACRCloudRecognizer, and exports the collected results to
    csv / json / xlsx, optionally post-processed by the played-duration
    filter (apply_filter).
    """

    # Single source of truth for the export column order, shared by the
    # csv, json and xlsx exporters (previously triplicated inline).
    HEAD_ROW = ['filename', 'timestamp', 'title', 'artists', 'album', 'acrid',
                'played_duration', 'label', 'isrc', 'upc', 'deezer', 'spotify',
                'itunes', 'youtube', 'custom_files_title', 'audio_id']

    def __init__(self, config_file):
        """Load *config_file* (JSON with host/access_key/access_secret),
        set up logging and the recognizer handle."""
        self.config = {
            'host': '',
            'access_key': '',
            'access_secret': '',
            'debug': False,
            'timeout': 10  # seconds
        }
        # "major.minor" of the installed openpyxl, e.g. "2.6" or "3.0".
        self.openpyxl_version = ".".join(str(openpyxl.__version__).split(".")[:2])
        self.config_file = config_file
        self.init_log()
        self.init_config()

    def init_log(self):
        """Install file and stream log handlers; exit if either fails."""
        self.dlog = AcrcloudLogger('ACRCloud_ScanF', logging.INFO)
        if not self.dlog.addFilehandler(logfile="log_scan_files.log", logdir="./", loglevel=logging.WARN):
            sys.exit(1)
        if not self.dlog.addStreamHandler():
            sys.exit(1)

    def init_config(self):
        """Read credentials from the JSON config file and build the
        ACRCloudRecognizer; exit if a required key is missing."""
        try:
            json_config = None
            with codecs.open(self.config_file, 'r') as f:
                json_config = json.loads(f.read())
            for k in ["host", "access_key", "access_secret"]:
                if k in json_config and json_config[k].strip():
                    self.config[k] = str(json_config[k].strip())
                else:
                    self.dlog.logger.error("init_config.not found {0} from config.json, pls check".format(k))
                    sys.exit(1)

            self.re_handler = ACRCloudRecognizer(self.config)
            if self.re_handler:
                self.dlog.logger.warning("init_config success!")
        except Exception as e:
            self.dlog.logger.error("init_config.error", exc_info=True)

    def read_file(self, infile, jFirst=True):
        """Yield stripped lines of *infile*; skip the first line (header)
        when *jFirst* is True."""
        with open(infile, "rb") as rfile:
            for line in rfile:
                if jFirst:
                    jFirst = False
                    continue
                yield line.strip()

    def write_error(self, file_path, error_time, error_detail):
        """Append one '<path>||<time>||<detail>' line to error_scan.txt."""
        with open('error_scan.txt', 'a', ) as f:
            msg = file_path + '||' + str(error_time) + '||' + str(error_detail) + '\n'
            f.write(msg)

    def empty_error_scan(self):
        """Remove a stale error_scan.txt from a previous run, if any."""
        if os.path.exists('error_scan.txt'):
            os.remove('error_scan.txt')

    def _format_rows(self, result_list):
        """Flatten recognition items with status code 0 into export rows
        ordered like HEAD_ROW (filename, timestamp, then parse_data())."""
        rows = []
        for item in result_list:
            jsoninfo = item["result"]
            if "status" in jsoninfo and jsoninfo["status"]["code"] == 0:
                row = [item["file"], item["timestamp"]] + list(self.parse_data(jsoninfo))
                rows.append(row)
        return rows

    def export_to_csv(self, result_list, export_filename="ACRCloud_ScanFile_Results.csv", export_dir="./"):
        """Write recognition results to a UTF-8 (with BOM) csv file."""
        try:
            results = self._format_rows(result_list)
            export_filepath = os.path.join(export_dir, export_filename)
            with codecs.open(export_filepath, 'w', 'utf-8-sig') as f:
                dw = csv.writer(f)
                dw.writerow(self.HEAD_ROW)
                dw.writerows(results)
            self.dlog.logger.info("export_to_csv.Save Data to csv: {0}".format(export_filepath))
        except Exception as e:
            self.dlog.logger.error("Error export_to_csv", exc_info=True)

    def export_to_json(self, result_list, export_filename="ACRCloud_ScanFile_Results.json", export_dir="./"):
        """Write recognition results as a JSON list of row dicts.

        Fix: the original reused a single dict for every row, so the output
        file contained the last row repeated N times; build one fresh dict
        per row instead.
        """
        try:
            export_filepath = os.path.join(export_dir, export_filename)
            json_results = [dict(zip(self.HEAD_ROW, row))
                            for row in self._format_rows(result_list)]
            with codecs.open(export_filepath, 'w', 'utf-8-sig') as f:
                f.write(json.dumps(json_results))
            self.dlog.logger.info("export_to_json.Save Data to json: {0}".format(export_filepath))
        except Exception as e:
            self.dlog.logger.error("Error export_to_json", exc_info=True)

    def _use_column_letter(self):
        """True when the installed openpyxl exposes cell.column_letter
        (>= 2.6).

        Fix: the original compared version strings lexically, which breaks
        for two-digit majors (e.g. "10.0" < "2.6"); compare numeric tuples.
        """
        try:
            version = tuple(int(x) for x in self.openpyxl_version.split("."))
        except ValueError:
            # Non-numeric version fragment: assume a modern openpyxl.
            return True
        return version >= (2, 6)

    def export_to_xlsx(self, result_list, export_filename="ACRCloud_ScanFile_Results.xlsx", export_dir="./"):
        """Write recognition results to an xlsx workbook, auto-sizing each
        column to its longest value (capped at 100 characters)."""
        try:
            wb = Workbook()
            sheet_channels = wb.active
            sheet_channels.title = "Results"
            sheet_channels.append(self.HEAD_ROW)
            for row in self._format_rows(result_list):
                sheet_channels.append(row)

            export_filepath = os.path.join(export_dir, export_filename)

            use_letter = self._use_column_letter()
            for column_cells in sheet_channels.columns:
                length = max(len(str(cell.value) if cell.value else "") for cell in column_cells)
                # Fix: the original wrote "length == 100" (a comparison, not
                # an assignment), so the cap never took effect.
                if length > 100:
                    length = 100
                if use_letter:
                    sheet_channels.column_dimensions[column_cells[0].column_letter].width = length
                else:
                    # openpyxl < 2.6 kept the letter in cell.column.
                    sheet_channels.column_dimensions[column_cells[0].column].width = length
            wb.save(export_filepath)

            self.dlog.logger.info("export_to_xlsx.Save Data to xlsx: {0}".format(export_filepath))
        except Exception as e:
            self.dlog.logger.error("Error export_to_xlsx", exc_info=True)

    def parse_data(self, jsoninfo):
        """Flatten one recognition response into the export tuple.

        Returns (title, artists, album, acrid, played_duration, label,
        isrc, upc, deezer, spotify, itunes, youtube, custom_files_title,
        audio_id); fields missing from the response stay "".
        """
        metadata = {}
        try:
            title, played_duration, isrc, upc, acrid, label, album = [""] * 7
            artists, deezer, spotify, itunes, youtube, custom_files_title, audio_id = [""] * 7

            metadata = jsoninfo.get('metadata', {})
            played_duration = metadata.get("played_duration", "")
            if "music" in metadata and len(metadata["music"]) > 0:
                item = metadata["music"][0]
                title = item.get("title", "")
                if "external_ids" in item:
                    # isrc/upc may arrive as a scalar or a list; keep the first.
                    if "isrc" in item["external_ids"]:
                        isrc_obj = item["external_ids"]["isrc"]
                        isrc = isrc_obj[0] if type(isrc_obj) == list else isrc_obj
                    if "upc" in item["external_ids"]:
                        upc_obj = item["external_ids"]["upc"]
                        upc = upc_obj[0] if type(upc_obj) == list else upc_obj
                acrid = item.get("acrid", "")
                label = item.get("label", "")
                album = item.get("album", {"name": ""}).get("name", "")
                artists = ",".join([ar["name"] for ar in item.get('artists', [{"name": ""}]) if ar.get("name")])
                if "external_metadata" in item:
                    e_metadata = item["external_metadata"]
                    if "deezer" in e_metadata:
                        deezer_obj = e_metadata["deezer"]
                        deezer = deezer_obj[0]["track"]["id"] if type(deezer_obj) == list else deezer_obj["track"]["id"]
                    if "spotify" in e_metadata:
                        spotify_obj = e_metadata["spotify"]
                        spotify = spotify_obj[0]["track"]["id"] if type(spotify_obj) == list else spotify_obj["track"]["id"]
                    if "youtube" in e_metadata:
                        youtube_obj = e_metadata["youtube"]
                        youtube = youtube_obj[0]["vid"] if type(youtube_obj) == list else youtube_obj["vid"]

            if "custom_files" in metadata and len(metadata["custom_files"]) > 0:
                custom_item = metadata["custom_files"][0]
                custom_files_title = custom_item.get("title", "")
                audio_id = custom_item.get("audio_id", "")
        except Exception as e:
            self.dlog.logger.error("parse_data.error.data:{0}".format(metadata), exc_info=True)

        res = (title, artists, album, acrid, played_duration, label, isrc, upc,
               deezer, spotify, itunes, youtube, custom_files_title, audio_id)
        return res

    def apply_filter(self, results):
        """Run the played-duration FilterWorker over raw fragment results."""
        fworker = FilterWorker()
        return fworker.apply_filter(results)

    def do_recognize(self, filepath, start_time, rec_length):
        """Recognize one fragment of *filepath* starting at *start_time*
        seconds, *rec_length* seconds long.

        Returns (filepath, 'HH:MM:SS' offset, raw response string or None
        on error).  Fix: the offset string is computed before the try so it
        is always defined in the except path.
        """
        current_time = time.strftime('%H:%M:%S', time.gmtime(start_time))
        try:
            res_data = self.re_handler.recognize_by_file(filepath, start_time, rec_length)
            return filepath, current_time, res_data
        except Exception as e:
            self.dlog.logger.error("do_recognize.error.({0}, {1}, {2})".format(filepath, start_time, rec_length),
                                   exc_info=True)
            return filepath, current_time, None

    def recognize_file(self, filepath, start_time, stop_time, step, rec_length, with_duration=0):
        """Scan *filepath* fragment by fragment from start_time to stop_time
        in *step*-second increments.

        Returns a list of {"timestamp", "rec_length", "result", "file"}
        items for fragments with a match (code 0) or an explicit no-result
        (code 1001).  *with_duration* is accepted for interface
        compatibility; duration filtering is done by apply_filter().
        """
        self.dlog.logger.warning("scan_file.start_to_run: {0}".format(filepath))

        result = []
        for offset in range(start_time, stop_time, step):
            filep, current_time, res_data = self.do_recognize(filepath, offset, rec_length)
            try:
                print(res_data)
                jsoninfo = json.loads(res_data)
                code = jsoninfo['status']['code']
                msg = jsoninfo['status']['msg']
                if code == 0:
                    result.append(
                        {"timestamp": current_time, "rec_length": rec_length, "result": jsoninfo, "file": filep})
                    res = self.parse_data(jsoninfo)
                    self.dlog.logger.info(
                        'recognize_file.(time:{0}, title: {1}, custom title: {2})'.format(current_time, res[0],
                                                                                          res[-2]))
                if code == 2005:
                    # Offset past end of file (server-side): stop scanning.
                    self.dlog.logger.warning('recognize_file.(time:{0}, code:{1}, Done!)'.format(current_time, code))
                    break
                elif code == 1001:
                    # Keep no-result fragments so gaps stay visible downstream.
                    result.append(
                        {"timestamp": current_time, "rec_length": rec_length, "result": jsoninfo, "file": filep})
                    self.dlog.logger.info("recognize_file.(time:{0}, code:{1}, No_Result)".format(current_time, code))
                elif code == 3001:
                    self.dlog.logger.error(
                        'recognize_file.(time:{0}, code:{1}, Missing/Invalid Access Key)'.format(current_time, code))
                    break
                elif code == 3003:
                    self.dlog.logger.error(
                        'recognize_file.(time:{0}, code:{1}, Limit exceeded)'.format(current_time, code))
                elif code == 3000:
                    self.dlog.logger.error('recognize_file.(time:{0}, {1}, {2})'.format(current_time, code, msg))
                    self.write_error(filepath, offset, 'NETWORK ERROR')
                # (The original also did a dead "i += step" here; the for
                # loop already advances the offset.)
            except Exception as e:
                self.dlog.logger.error('recognize_file.error', exc_info=True)
                self.write_error(filepath, offset, 'JSON ERROR')
        return result

    def scan_file_main(self, option, start_time, stop_time):
        """Scan one file described by the parsed command-line *option*
        object and export the results in the requested format."""
        try:
            filepath = option.file_path
            step = option.step
            rec_length = option.rec_length
            with_duration = option.with_duration
            out_dir = option.out_dir
            if out_dir and not os.path.exists(out_dir):
                try:
                    os.makedirs(out_dir)
                except Exception as e:
                    self.dlog.logger.error("scan_file_main.create_out_dir_error:{0}, please check it!".format(out_dir),
                                           exc_info=True)
                    return

            file_type = option.file_type
            if start_time == 0 and stop_time == 0:
                # No explicit range given: scan the whole file.
                stop_time = int(ACRCloudRecognizer.get_duration_ms_by_file(filepath) / 1000)
            results = self.recognize_file(filepath, start_time, stop_time, step, rec_length, with_duration)

            basename = os.path.basename(filepath.strip())

            if results:
                if file_type == "csv":
                    self.export_to_csv(results, 'result-' + basename + '.csv', out_dir)
                elif file_type == "json":
                    self.export_to_json(results, 'result-' + basename + '.json', out_dir)
                else:
                    self.export_to_xlsx(results, 'result-' + basename + '.xlsx', out_dir)

            if with_duration == 1:
                new_results = []
                if results:
                    new_results = self.apply_filter(results)

                if file_type == "csv":
                    self.export_to_csv(new_results, 'result-' + basename + '_with_duration.csv', out_dir)
                elif file_type == "json":
                    self.export_to_json(new_results, 'result-' + basename + '_with_duration.json', out_dir)
                else:
                    self.export_to_xlsx(new_results, 'result-' + basename + '_with_duration.xlsx', out_dir)
        except Exception as e:
            self.dlog.logger.error("scan_file_main.error", exc_info=True)
            return

    def scan_folder_main(self, option, start_time, stop_time):
        """Scan every entry of option.folder_path as an individual file."""
        try:
            path = option.folder_path
            for name in os.listdir(path):
                option.file_path = path + '/' + name
                self.scan_file_main(option, start_time, stop_time)
        except Exception as e:
            self.dlog.logger.error("scan_folder_main.error", exc_info=True)
334 |
335 |
if __name__ == '__main__':
    usage = r'''
    _    ____ ____   ____ _                 _
   / \  / ___|  _ \ / ___| | ___  _   _ __| |
  / _ \| |   | |_) | |   | |/ _ \| | | / _` |
 / ___ \ |___|  _ <| |___| | (_) | |_| \ (_| |
/_/   \_\____|_| \_\\____|_|\___/ \____|\____|

    Usage:
        python acrcloud_scan_files_python.py -d folder_path
        python acrcloud_scan_files_python.py -f file_path
    Example:
        python acrcloud_scan_files_python.py -d ~/music
        python acrcloud_scan_files_python.py -f ~/testfiles/test.mp3
    If you want to change scan interval or you want to set recognize range,you can add some params
    Example:
        python acrcloud_scan_files_python.py -f ~/testfiles/test.mp3 -s 30 -r 0-20 -l 10
        python acrcloud_scan_files_python.py -d ~/music -s 30
    '''

    parser = optparse.OptionParser()
    parser.add_option('-f', '--file', dest='file_path', type='string', help='Scan file you want to recognize')
    parser.add_option('-c', '--config', dest='config', type='string', default="config.json", help='config file')
    parser.add_option('-d', '--folder', dest='folder_path', type='string', help='Scan folder you want to recognize')
    parser.add_option('-s', '--step', dest='step', type='int', default=10, help='step')
    parser.add_option('-l', '--rec_length', dest='rec_length', type='int', default=10, help='rec_length')
    parser.add_option('-e', '--error_file', dest='error_file', type='string', help='error scan file')
    # Fix: the -r help text was a copy-paste of -e's ("error scan file").
    parser.add_option('-r', '--range', dest='range', type='string', default='0-0',
                      help='scan range "start-stop" in seconds; 0-0 scans the whole file')
    parser.add_option('-w', '--with_duration', dest="with_duration", type='int', default=0, help='with_duration')
    parser.add_option('-o', '--out_dir', dest="out_dir", type='string', default="./", help='out_dir')
    parser.add_option('-t', '--file_type', dest="file_type", type='string', default="csv", help='file_type')

    (options, args) = parser.parse_args()
    # Robustness: reject a malformed -r value with the usage text instead of
    # a raw ValueError traceback.
    try:
        start, stop = (int(x) for x in options.range.split('-', 1))
    except ValueError:
        print(usage)
        sys.exit(1)

    asf = ACRCloud_Scan_Files(options.config)
    if options.file_path:
        asf.empty_error_scan()
        asf.scan_file_main(options, start, stop)
    elif options.folder_path:
        asf.empty_error_scan()
        asf.scan_folder_main(options, start, stop)
    else:
        print(usage)
381 |
--------------------------------------------------------------------------------
/acrcloud_filter_libary.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
#-*- coding:utf-8 -*-

# Result post-processing ("filter") library for the ACRCloud file scanner:
# de-duplicates consecutive recognition results and computes played durations.

import os
import sys
import json
import copy
import math
import datetime
import traceback
import tools_str_sim    # local helper: fuzzy title similarity / title shortening
import acrcloud_logger  # local helper: logger wrapper (instances passed in as dlog)
from dateutil.relativedelta import *

# Python 2 only: force UTF-8 as the default string encoding.
if sys.version_info.major == 2:
    reload(sys)
    sys.setdefaultencoding("utf8")

# Sentinel title/acrid used when a scan segment produced no recognition.
NORESULT = "noResult"
20 |
21 | class ResultFilter:
22 |
    def __init__(self, dlog):
        # dlog: logger wrapper exposing `.logger`, used for error reporting.
        self._dlog = dlog
        # Real-time music filter state: stream_id -> [recent titles, last emitted data].
        self._real_music = {}
        self._real_music_list_num = 3            # history titles kept per stream (checkSame)
        # Real-time custom-file filter state: stream_id -> [[(title, 'HH:MM:SS')...], last].
        self._real_custom = {}
        self._real_custom_list_num = 3
        self._real_custom_valid_interval = 5*60  # seconds a repeated custom hit is suppressed
        # Delay filter state: stream_id -> queued (raw_title, sim_title, ts, data) tuples.
        self._delay_music = {}
        self._delay_music_last_result = {}
        self._delay_music_interval_threshold = 2*60
        self._delay_custom = {}
        self._delay_custom_played_duration_min = 2
        self._delay_list_max_num = 35            # queue length that triggers the delay filter
        self._delay_list_threshold = 120         # hard cap on the history examined per run
37 |
38 | def get_mutil_result_title(self, data, itype='music', isize = 1):
39 | ret_list = []
40 | index = 0
41 | json_res = data["result"]
42 | if json_res == NORESULT:
43 | return [NORESULT]
44 | try:
45 | if json_res['status']['code'] == 0:
46 | if itype == 'music':
47 | if 'metadata' in json_res and 'music' in json_res['metadata']:
48 | for item in json_res['metadata']['music']:
49 | ret_list.append(item['title'])
50 | index += 1
51 | if index >= isize:
52 | break
53 | elif 'metainfos' in json_res:
54 | for item in json_res['metainfos']:
55 | ret_list.append(item['title'])
56 | index += 1
57 | if index >= isize:
58 | break
59 | elif itype == 'custom':
60 | if 'metadata' in json_res and 'custom_files' in json_res['metadata']:
61 | for item in json_res['metadata']['custom_files']:
62 | ret_list.append(item['title'])
63 | index += 1
64 | if index >= isize:
65 | break
66 | except Exception as e:
67 | self._dlog.logger.error("Error@get_mutil_result_title", exc_info=True)
68 | self._dlog.logger.error("Error_Data: {0}".format(data))
69 | return ret_list if ret_list else [NORESULT]
70 |
71 | def get_mutil_result_acrid(self, data, itype='music', isize = 1):
72 | ret_list = []
73 | index = 0
74 | json_res = data["result"]
75 | if json_res == NORESULT:
76 | return [NORESULT]
77 | try:
78 | if json_res['status']['code'] == 0:
79 | if itype == 'music':
80 | if 'metadata' in json_res and 'music' in json_res['metadata']:
81 | for item in json_res['metadata']['music']:
82 | ret_list.append(item['acrid'])
83 | index += 1
84 | if index >= isize:
85 | break
86 | elif 'metainfos' in json_res:
87 | for item in json_res['metainfos']:
88 | ret_list.append(item['acrid'])
89 | index += 1
90 | if index >= isize:
91 | break
92 | elif itype == 'custom':
93 | if 'metadata' in json_res and 'custom_files' in json_res['metadata']:
94 | for item in json_res['metadata']['custom_files']:
95 | ret_list.append(item['acrid'])
96 | index += 1
97 | if index >= isize:
98 | break
99 | except Exception as e:
100 | self._dlog.logger.error("Error@get_mutil_result_acrid", exc_info=True)
101 | self._dlog.logger.error("Error_Data: {0}".format(json.dumps(result)))
102 | return ret_list if ret_list else [NORESULT]
103 |
104 | def swap_position(self, ret_title, ret_data, itype):
105 | json_res = ret_data["result"]
106 | meta_type = None
107 | music_list = []
108 | if itype == 'music':
109 | if 'metadata' in json_res:
110 | music_list = json_res['metadata']['music']
111 | elif 'metainfos' in json_res:
112 | music_list = json_res['metainfos']
113 | elif itype == 'custom':
114 | music_list = json_res['metadata']['custom_files']
115 |
116 | if music_list:
117 | ret_index = 0
118 | for index, item in enumerate(music_list):
119 | if itype == "music":
120 | if item['title'] == ret_title:
121 | ret_index = index
122 | break
123 | else:
124 | if item['acrid'] == ret_title:
125 | ret_index = index
126 | break
127 | if ret_index > 0:
128 | music_list[0], music_list[ret_index] = music_list[ret_index], music_list[0]
129 |
130 | def custom_result_append(self, ret_data, title, from_data, count, tmp_deal_title_map):
131 | ret_title_set = set()
132 | for item in ret_data['result']['metadata']['custom_files']:
133 | ret_title_set.add(item['acrid'])
134 |
135 | for item in from_data['result']['metadata']['custom_files']:
136 | acrid = item['acrid']
137 | if acrid == title and acrid not in ret_title_set:
138 | item['count'] = count
139 | ret_data['result']['metadata']['custom_files'].append(item)
140 | ret_title_set.add(acrid)
141 |
142 | for item in from_data['result']['metadata']['custom_files']:
143 | acrid = item['acrid']
144 | if acrid not in ret_title_set:
145 | if acrid in tmp_deal_title_map:
146 | item['count'] = tmp_deal_title_map[acrid]['count']
147 | ret_data['result']['metadata']['custom_files'].append(item)
148 |
149 | def get_play_offset(self, data, itype='music'):
150 | try:
151 | play_offset_ms = 0
152 | result = data['result']
153 | if result['status']['code'] == 1001:
154 | return 0
155 | if itype == 'music':
156 | play_offset_ms = result['metadata']['music'][0]['play_offset_ms']
157 | elif itype == 'custom':
158 | play_offset_ms = result['metadata']['custom_files'][0]['play_offset_ms']
159 | except Exception as e:
160 | self._dlog.logger.error("Error@Get_Play_Offset, error_data: {0}, {1}".format(itype, data), exc_info=True)
161 | return play_offset_ms/1000.0
162 |
163 | def get_db_play_offset(self, data, offset_type="begin", itype='music'):
164 | """
165 | itype : music or custom
166 | offset_type : begin or end offset
167 | """
168 | try:
169 | if offset_type not in ['begin', 'end']:
170 | self._dlog.logger.error("Error@Get_DB_Play_Offset.offset_type({0}) error".format(offset_type))
171 | return (None, self.get_play_offset(data, itype)) #if offset_type error, return play_offset_ms
172 |
173 | db_offset_key = "db_{0}_time_offset_ms".format(offset_type)
174 | sample_offset_key = "sample_{0}_time_offset_ms".format(offset_type)
175 |
176 | db_play_offset_ms = 0 #ms
177 | sample_play_offset_ms = 0
178 | result = data['result']
179 | if result['status']['code'] == 1001:
180 | return 0
181 | if itype == 'music':
182 | db_play_offset_ms = result['metadata']['music'][0][db_offset_key]
183 | sample_play_offset_ms = result['metadata']['music'][0][sample_offset_key]
184 | elif itype == 'custom':
185 | db_play_offset_ms = result['metadata']['custom_files'][0][db_offset_key]
186 | sample_play_offset_ms = result['metadata']['custom_files'][0][sample_offset_key]
187 |
188 | return (int(sample_play_offset_ms)/1000.0, int(db_play_offset_ms)/1000.0)
189 | except Exception as e:
190 | self._dlog.logger.error("Error@please contact support@acrcloud.com to add offset config for your access_key")
191 | return (None, None)
192 |
193 | def get_duration(self, end_timestamp, start_timestamp):
194 | end = datetime.datetime.strptime(end_timestamp, '%H:%M:%S')
195 | start = datetime.datetime.strptime(start_timestamp, '%H:%M:%S')
196 | return (end - start).total_seconds()
197 |
198 | def get_duration_accurate(self, end_data, start_data, itype='music'):
199 | monitor_len = end_data.get('rec_length', 10)
200 | end_play_offset = self.get_play_offset(end_data, itype)
201 | start_play_offset = self.get_play_offset(start_data, itype)
202 | pre_seconds = max(20, monitor_len*2)
203 | if int(start_play_offset) < pre_seconds:
204 | start_play_offset = 0
205 | else:
206 | start_play_offset = start_play_offset - (monitor_len/2)
207 | return int(round(end_play_offset - start_play_offset))
208 |
    def get_duration_accurate_use_db_offset(self, end_data, begin_data, isize, itype='music'):
        """Compute duration estimates for a run of ``isize`` detections.

        Returns (sample_len, db_len, mix_len, accurate_begin_timestamp); when
        any offset is unavailable returns (0, 0, 0, original begin timestamp).
        """
        begin_timestamp = datetime.datetime.strptime(begin_data['timestamp'], "%H:%M:%S")

        # Length of one monitored segment in seconds.
        monitor_len = end_data.get('rec_length', 10)

        end_sample_offset, end_db_offset = self.get_db_play_offset(end_data, 'end', itype)
        begin_sample_offset, begin_db_offset = self.get_db_play_offset(begin_data, 'begin', itype)
        # Any None means the offsets could not be read; bail out with zeros.
        for i in [ end_sample_offset, end_db_offset, begin_sample_offset, begin_db_offset]:
            if i is None:
                return 0, 0, 0, begin_data["timestamp"]

        # Shift the wall-clock start by the in-sample begin offset.
        accurate_begin_timestamp = (begin_timestamp + relativedelta(seconds=int(float(begin_sample_offset)))).strftime("%H:%M:%S")

        # db_len: progress inside the reference track; sample_len: progress in
        # the monitored stream across all isize segments.
        db_len = int(round(end_db_offset - begin_db_offset))
        sample_len = int(round(end_sample_offset - begin_sample_offset + (isize-1)*monitor_len))

        mix_len = 0
        if int(begin_sample_offset) == 0 and int(begin_db_offset) == 0:
            mix_len = (isize-1)*monitor_len + end_sample_offset
        elif int(begin_sample_offset) == 0:
            if begin_db_offset <= monitor_len:
                mix_len = (isize-1)*monitor_len + end_sample_offset
            else:
                # NOTE(review): begin_sample_offset is 0 on this branch, so the
                # subtraction below is a no-op; the three remaining branches
                # all compute the same value.  Kept as-is.
                mix_len = (isize-1)*monitor_len + end_sample_offset - begin_sample_offset
        elif int(begin_db_offset) == 0:
            mix_len = (isize-1)*monitor_len + end_sample_offset - begin_sample_offset
        else:
            mix_len = (isize-1)*monitor_len + end_sample_offset - begin_sample_offset
        mix_len = int(round(mix_len))

        return sample_len, db_len, mix_len, accurate_begin_timestamp
240 |
    def judge_zero_item_contain_current_result(self, ret_sim_title, zero_data, itype="music"):
        """
        itype: music => title is track name
        itype: custom => title is acrid

        Check whether ``zero_data`` (the element just before the detected run)
        already contains the current result among its top-5 candidates; if so,
        swap that candidate to the front of zero_data in place and return True.
        """
        try:
            is_contain = False
            if itype == "music":
                zero_title_list = self.get_mutil_result_title(zero_data, 'music', 5)
            elif itype == "custom":
                zero_title_list = self.get_mutil_result_acrid(zero_data, 'custom', 5)
            else:
                return is_contain

            for ztitle in zero_title_list:
                if ztitle == NORESULT:
                    break
                # Music titles are compared in their shortened/normalized form.
                sim_zero_title = self.tryStrSub(ztitle)[0] if itype == "music" else ztitle
                if sim_zero_title == ret_sim_title:
                    is_contain = True
                    self.swap_position(ztitle, zero_data, itype)
                    break
        except Exception as e:
            self._dlog.logger.error("Error@judge_zero_item_contain_current_result", exc_info=True)
        return is_contain
266 |
    def judge_latter_item_contain_current_result(self, ret_sim_title, latter_data, itype="music"):
        """
        itype: music => title is track name
        itype: custom => title is acrid

        Check whether ``latter_data`` (the element just after the detected run)
        contains the current result among its top-5 candidates.  Unlike the
        "zero" variant, the input is not mutated: a deep copy with the matching
        candidate swapped to the front is returned instead.

        Returns (is_contain, swapped_copy_or_None).
        """
        try:
            is_contain = False
            latter_data_swaped = None
            if itype == "music":
                latter_title_list = self.get_mutil_result_title(latter_data, 'music', 5)
            elif itype == "custom":
                latter_title_list = self.get_mutil_result_acrid(latter_data, 'custom', 5)
            else:
                return is_contain, latter_data_swaped

            for ltitle in latter_title_list:
                if ltitle == NORESULT:
                    break
                # Music titles are compared in their shortened/normalized form.
                sim_latter_title = self.tryStrSub(ltitle)[0] if itype == "music" else ltitle
                if sim_latter_title == ret_sim_title:
                    is_contain = True
                    latter_data_swaped = copy.deepcopy(latter_data)
                    self.swap_position(ltitle, latter_data_swaped, itype)
                    break
        except Exception as e:
            self._dlog.logger.error("Error@judge_latter_item_contain_current_result", exc_info=True)
        return is_contain, latter_data_swaped
294 |
    def real_check_title_custom(self, stream_id, title, timestamp_obj):
        """Return True when ``title`` was already seen for this stream within
        the last ``_real_custom_valid_interval`` seconds (real-time de-dup)."""
        now_timestamp = timestamp_obj #datetime.datetime.utcnow()
        # Per-stream state: [[(title, 'HH:MM:SS'), ...], last_data].
        if stream_id not in self._real_custom:
            self._real_custom[stream_id] = [[('','')], '']

        # Cap the history at the newest _real_custom_list_num entries.
        if len(self._real_custom[stream_id][0]) > self._real_custom_list_num:
            self._real_custom[stream_id][0] = self._real_custom[stream_id][0][-self._real_custom_list_num:]
            his_list_num = self._real_custom_list_num
        else:
            his_list_num = len(self._real_custom[stream_id][0])

        # Scan newest-first for a recent occurrence of the same title.
        for i in range(his_list_num-1, -1, -1):
            if self._real_custom[stream_id][0][i][0] == title:
                his_timestamp = self._real_custom[stream_id][0][i][1]
                his_time_obj = datetime.datetime.strptime(his_timestamp, '%H:%M:%S')
                if (now_timestamp - his_time_obj).total_seconds() <= self._real_custom_valid_interval:
                    return True
            # NORESULT is only compared against the newest history entry.
            if title == NORESULT:
                break

        return False
316 |
317 | def checkResultSim(self, idx, curr_title, his_title, stream_id):
318 | if not curr_title or not his_title:
319 | return False
320 | sim, detail = tools_str_sim.str_sim(curr_title, his_title)
321 | if not sim and curr_title != NORESULT and his_title != NORESULT:
322 | pass
323 | return sim
324 |
    def checkSame(self, curr_title, stream_id):
        """Return True when ``curr_title`` fuzzily matches any recent title kept
        for this stream (history capped at ``_real_music_list_num``)."""
        # Per-stream state: [recent titles, last emitted data].
        self._real_music[stream_id] = self._real_music.get(stream_id, [[''], ''])
        if len(self._real_music[stream_id][0]) > self._real_music_list_num:
            self._real_music[stream_id][0] = self._real_music[stream_id][0][-self._real_music_list_num:]
            his_max = self._real_music_list_num
        else:
            his_max = len(self._real_music[stream_id][0])
        # Scan newest-first; NORESULT is only compared against the newest entry.
        for i in range(his_max-1, -1, -1):
            if self.checkResultSim(i, curr_title, self._real_music[stream_id][0][i], stream_id):
                return True
            if curr_title == NORESULT:
                break
        return False
338 |
339 | def updateResultTitle(self, data, new_title):
340 | if new_title == NORESULT:
341 | return
342 | try:
343 | json_res = data["result"]
344 | metainfos = json_res.get("metainfos")
345 | metadata = json_res.get("metadata")
346 | if metainfos:
347 | metainfos[0]['title'] = new_title
348 | else:
349 | if metadata.get('music'):
350 | metadata['music'][0]['title'] = new_title
351 | else:
352 | metadata['custom_files'][0]['title'] = new_title
353 | except Exception as e:
354 | self._dlog.logger.error("Error@updateResultTitle", exc_info=True)
355 |
356 | def tryStrSub(self, try_str):
357 | sub_str = tools_str_sim.str_sub(try_str)
358 | if len(sub_str) > 0 and len(try_str) > len(sub_str):
359 | return sub_str, True
360 | return try_str, False
361 |
362 | def tryUpdateResultTitle(self, data, itype):
363 | if itype == 'custom':
364 | title = self.get_mutil_result_title(data, 'custom', 1)[0]
365 | return title
366 | title = self.get_mutil_result_title(data, 'music', 1)[0]
367 | stream_id = data.get("stream_id")
368 | new_title, try_status = self.tryStrSub(title)
369 | if try_status:
370 | self.updateResultTitle(data, new_title)
371 | return new_title
372 | return title
373 |
    def deal_real_history(self, data):
        """Real-time de-duplication of music results for one stream.

        Returns (result, is_new):
          - a new title (or a fresh no-result gap): (data, True) and the
            history/state are updated;
          - a repeat of the current title: (previously stored data, False);
          - a repeated no-result gap: (None, False).
        """
        is_new = False
        result = None
        curr_title = self.get_mutil_result_title(data, 'music', 1)[0]
        stream_id = data.get("stream_id")
        if not stream_id:
            return result, is_new
        if curr_title == NORESULT:
            # A fresh gap is reported once; consecutive gaps are suppressed.
            if not self.checkSame(curr_title, stream_id):
                self._real_music[stream_id][0].append(curr_title)
                self._real_music[stream_id][1] = data
                result = data
                is_new = True
            else:
                result = None
                is_new = False
        else:
            if self.checkSame(curr_title, stream_id):
                # Same track still playing: re-emit the stored first detection.
                result = self._real_music[stream_id][1]
                is_new = False
            else:
                self._real_music[stream_id][0].append(curr_title)
                self._real_music[stream_id][1] = data
                result = data
                is_new = True

        return result, is_new
401 |
402 | def deal_delay_history(self, data):
403 | stream_id = data.get("stream_id")
404 | timestamp = data.get("timestamp")
405 | raw_title = self.get_mutil_result_title(data, 'music', 1)[0]
406 | sim_title = self.tryStrSub(raw_title)
407 | if stream_id not in self._delay_music:
408 | self._delay_music[stream_id] = [(raw_title, sim_title[0], timestamp, data)]
409 | else:
410 | self._delay_music[stream_id].append((raw_title, sim_title[0], timestamp, data))
411 |
412 | if len(self._delay_music[stream_id]) > self._delay_list_max_num :
413 | return self.runDelayX_for_music_delay2(stream_id)
414 | else:
415 | return None
416 |
    def compute_played_duration(self, history_data, start_index, end_index, judge_zero_or_latter=True, itype="music"):
        """Compute duration estimates for the run [start_index, end_index].

        history_data entry layouts (inferred from the indices used below —
        NOTE(review): confirm against the producers):
          music : (raw_title, sim_title, timestamp, data)
          custom: (title, timestamp, data)
        Returns a dict with duration / duration_accurate / sample_duration /
        db_duration / mix_duration / accurate_timestamp_utc.
        """
        retdata = history_data[start_index][-1]

        if itype == "music":
            ret_title = self.get_mutil_result_title(retdata, 'music', 1)[0]
            ret_sim_title = history_data[start_index][1]
        elif itype == "custom":
            # For custom results the comparison key is the acrid.
            ret_title = self.get_mutil_result_acrid(retdata, 'custom', 1)[0]
            ret_sim_title = ret_title

        # Extend the run backwards over the element just before it when that
        # element already contains the same result among its candidates.
        if judge_zero_or_latter and start_index == 1:
            if self.judge_zero_item_contain_current_result(ret_sim_title, history_data[0][-1], itype):
                start_index = 0

        # Likewise extend forwards over the element just after the run.
        is_contain = False
        latter_data_swaped = None
        if judge_zero_or_latter and (end_index + 1 <= len(history_data) - 1):
            is_contain, latter_data_swaped = self.judge_latter_item_contain_current_result(ret_sim_title, history_data[end_index+1][-1], itype)

        if itype == "music":
            start_timestamp = history_data[start_index][2]
            end_timestamp = history_data[end_index][2]
            start_data = history_data[start_index][3]
            end_data = history_data[end_index][3]
        else:
            start_timestamp = history_data[start_index][1]
            end_timestamp = history_data[end_index][1]
            start_data = history_data[start_index][2]
            end_data = history_data[end_index][2]

        duration = self.get_duration(end_timestamp, start_timestamp)
        duration_accurate = self.get_duration_accurate(end_data, start_data, itype)
        isize = end_index - start_index + 1
        if is_contain:
            # The swapped copy of the latter element becomes the run's end.
            end_data = latter_data_swaped
            isize += 1

        sample_duraion, db_duration, mix_duration, accurate_timestamp_utc = self.get_duration_accurate_use_db_offset(end_data, start_data, isize, itype)

        ret_dict = {
            "duration" : duration,
            "duration_accurate" : duration_accurate,
            "sample_duration" : sample_duraion,
            "db_duration" : db_duration,
            "mix_duration" : mix_duration,
            "accurate_timestamp_utc" : accurate_timestamp_utc,
        }
        return ret_dict
465 |
466 | def get_data_duration_ms(self, data):
467 | try:
468 | duration_ms = -1
469 | json_res = data["result"]
470 | if json_res['status']['code'] == 0:
471 | if 'metadata' in json_res and 'music' in json_res['metadata']:
472 | if len(json_res['metadata']['music']) > 0:
473 | duration_ms = json_res["metadata"]["music"][0]["duration_ms"]
474 | except Exception as e:
475 | self._dlog.logger.error("Error@get_data_duration_ms", exc_info=True)
476 | return (duration_ms/1000.0) if duration_ms != -1 else duration_ms
477 |
478 | def get_time_diff(self, start_timestamp, end_timestamp, tformat="%Y-%m-%d %H:%M:%S"):
479 | try:
480 | diff_sec = 0
481 | start_obj = datetime.datetime.strptime(start_timestamp, tformat)
482 | end_obj = datetime.datetime.strptime(end_timestamp, tformat)
483 | diff_sec = int((end_obj - start_obj).total_seconds())
484 | except Exception as e:
485 | self._dlog.logger.error("Error@get_diff_seconds", exc_info=True)
486 | return diff_sec
487 |
    def remove_next_result_from_now_result_list_for_music_delay2(self, history_data, ret_data, max_index):
        #Just for music delay2 filter
        # Drop from ret_data's candidate list (all but the top entry) any title
        # that also appears in the element right after the run — those belong
        # to the next track.  ret_data is modified in place.
        try:
            if ret_data and len(history_data) >= max_index+2:
                # Entry layout: (raw_title, sim_title, timestamp, data).
                raw_title, sim_title, timestamp, next_data = history_data[max_index + 1]
                if next_data:
                    next_title_list = self.get_mutil_result_title(next_data, 'music', 1)
                    next_title_set = set(next_title_list)
                    new_ret_music = []
                    for index, item in enumerate(ret_data["result"]["metadata"]["music"]):
                        # Always keep the top candidate (index 0).
                        if index == 0 or (item["title"] not in next_title_set):
                            new_ret_music.append(item)
                    ret_data["result"]["metadata"]["music"] = new_ret_music
        except Exception as e:
            self._dlog.logger.error("Error@remove_next_result_from_now_result_list_for_music_delay2", exc_info=True)
503 |
504 | def result_append_for_music_delay2(self, ret_data, title, from_data):
505 | try:
506 | ret_title_set = set()
507 | for item in ret_data['result']['metadata']['music']:
508 | sim_title = self.tryStrSub(item['title'])[0]
509 | ret_title_set.add(sim_title)
510 |
511 | for item in from_data['result']['metadata']['music']:
512 | from_title = item['title']
513 | sim_from_title = self.tryStrSub(from_title)[0]
514 | if sim_from_title == title and sim_from_title not in ret_title_set:
515 | ret_data['result']['metadata']['music'].append(item)
516 | ret_title_set.add(sim_from_title)
517 | except Exception as e:
518 | self._dlog.logger.error("Error@result_append_for_music_delay2", exc_info=True)
519 |
520 | def get_custom_duration_by_title(self, title, ret_data):
521 | try:
522 | duration = 0
523 | db_end_offset = 0
524 | for index, item in enumerate(ret_data["result"]["metadata"]["custom_files"]):
525 | #custom 获取的title是acrid
526 | if title == item["acrid"]:
527 | duration_ms = int(item["duration_ms"])
528 | db_end_offset_ms = int(item["db_end_time_offset_ms"])
529 | if duration_ms >= 0:
530 | duration = int(duration_ms/1000)
531 | if db_end_offset_ms:
532 | db_end_offset = int(db_end_offset_ms/1000)
533 | except Exception as e:
534 | self._dlog.logger.error("Error@get_custom_duration_by_title, error_data:{0}".format(ret_data), exc_info=True)
535 | return duration, db_end_offset
536 |
537 | def get_music_duration_by_title(self, title, ret_data):
538 | try:
539 | duration = 0
540 | db_end_offset = 0
541 | if "metadata" in ret_data["result"] and "music" in ret_data["result"]["metadata"]:
542 | for index, item in enumerate(ret_data["result"]["metadata"]["music"]):
543 | if title == item["title"]:
544 | duration_ms = int(item["duration_ms"])
545 | db_end_offset_ms = int(item["db_end_time_offset_ms"])
546 | if duration_ms >= 0:
547 | duration = int(duration_ms/1000)
548 | if db_end_offset_ms:
549 | db_end_offset = int(db_end_offset_ms/1000)
550 | except Exception as e:
551 | self._dlog.logger.error("Error@get_custom_duration_by_title, error_data:{0}".format(ret_data), exc_info=True)
552 | return duration, db_end_offset
553 |
554 | def delay_dynamic_judge_size(self, deal_title_map, history_data, itype):
555 | try:
556 | judge_size = 5
557 | if itype == "custom":
558 | title = sorted(deal_title_map.items(), key=lambda x:x[1]["score"], reverse=True)[0][0]
559 | else:
560 | title = deal_title_map.keys()[0]
561 |
562 | index = deal_title_map[title]["index_list"][-1]
563 | if itype == "custom":
564 | ret_data = history_data[index][2]
565 | else:
566 | ret_data = history_data[index][3]
567 |
568 | monitor_len = ret_data.get("monitor_seconds", 10)
569 |
570 | if itype == "custom":
571 | duration, db_end_offset = self.get_custom_duration_by_title(title, ret_data)
572 | else:
573 | duration, db_end_offset = self.get_music_duration_by_title(title, ret_data)
574 |
575 | if db_end_offset > 0 and db_end_offset < duration:
576 | judge_size = abs(int(math.ceil(db_end_offset*1.0/monitor_len))) + 1
577 | if judge_size > 10:
578 | judge_size = 10
579 | if judge_size <= 3:
580 | judge_size = 3
581 | if itype == "custom":
582 | judge_size = 1
583 | except Exception as e:
584 | self._dlog.logger.error("Error@delay_dynamic_judge_size", exc_info=True)
585 | return judge_size+1
586 |
    def fill_ret_data_by_acrid_count(self, sorted_title_list, history_data):
        """Build the output result whose music list ranks candidate acrids by
        how often they occurred across the run.

        sorted_title_list: (sim_title, {'count':..,'index_list':[..]}) pairs,
        most frequent first.  The first run entry seen becomes the template for
        the returned data; its music list is rebuilt with up to 3 acrids per
        title and capped at 6 entries overall.  Returns None on error or when
        no template could be initialized.
        """
        try:
            ret_data = None
            init_ret_data = True
            for sitem in sorted_title_list:
                sitem_title, sitem_map = sitem
                sitem_title = self.tryStrSub(sitem_title)[0]
                sitem_count = sitem_map["count"]
                # acrid -> {'count': occurrences, 'info': metadata item}.
                acrid_count_map = {}
                for tindex in sitem_map["index_list"]:
                    tdata = history_data[tindex][3]
                    if init_ret_data:
                        # First entry becomes the template (with an empty music list).
                        ret_data = copy.deepcopy(tdata)
                        ret_data["result"]["metadata"]["music"] = []
                        init_ret_data = False
                    if "metadata" in tdata["result"] and "music" in tdata["result"]["metadata"]:
                        for item in tdata['result']['metadata']['music']:
                            sim_title = self.tryStrSub(item['title'])[0]
                            if sim_title == sitem_title:
                                acrid = item['acrid']
                                if acrid not in acrid_count_map:
                                    acrid_count_map[acrid] = {"count":0, "info":item}
                                acrid_count_map[acrid]["count"] += 1
                if ret_data is None:
                    break

                # Append up to 3 acrids for this title, most frequent first.
                acrid_count_map_sorted = sorted(acrid_count_map.items(), key=lambda x:x[1]["count"], reverse=True)
                for s_index, s_item in enumerate(acrid_count_map_sorted):
                    ret_data["result"]["metadata"]["music"].append(s_item[1]["info"])
                    if s_index >= 2:
                        break
            # Cap the merged candidate list at 6 entries.
            if ret_data is not None and len(ret_data['result']['metadata']['music']) > 6:
                ret_data['result']['metadata']['music'] = ret_data['result']['metadata']['music'][:6]
        except Exception as e:
            self._dlog.logger.error("Error@fill_ret_data_by_acrid_count", exc_info=True)
        return ret_data
623 |
624 | def get_music_data_offset(self, data):
625 | try:
626 | ret = {
627 | "monitor_len":0,
628 | "duration_ms":0,
629 | "s_begin_ms":0,
630 | "s_end_ms":0,
631 | "d_begin_ms":0,
632 | "d_end_ms":0
633 | }
634 | result = data.get("result")
635 | monitor_len = data.get("monitor_seconds", 10)
636 | ret["monitor_len"] = monitor_len
637 | if result and "metadata" in result and "music" in result["metadata"]:
638 | fitem = result["metadata"]["music"][0]
639 | ret["duration_ms"] = int(fitem["duration_ms"])
640 | ret["s_begin_ms"] = int(fitem["sample_begin_time_offset_ms"])
641 | ret["s_end_ms"] = int(fitem["sample_end_time_offset_ms"])
642 | ret["d_begin_ms"] = int(fitem["db_begin_time_offset_ms"])
643 | ret["d_end_ms"] = int(fitem["db_end_time_offset_ms"])
644 | return ret
645 | except Exception as e:
646 | self._dlog.logger.error("Error@get_music_data_offset, error_data:{0}".format(data), exc_info=True)
647 | return None
648 |
    def check_if_is_break(self, index1, index2, data1, data2):
        """Decide whether the gap between two detections of the same title was a
        real interruption (True) or just unrecognized segments of one
        continuous play (False)."""
        try:
            is_break = False
            ret1 = self.get_music_data_offset(data1)
            ret2 = self.get_music_data_offset(data2)
            if ret1 and ret2:
                # Progress inside the reference track across the gap (ms).
                diff_db = ret2["d_end_ms"] - ret1["d_begin_ms"]
                if diff_db <= 0:
                    return is_break
                timestamp1 = datetime.datetime.strptime(data1["timestamp"], "%H:%M:%S")
                timestamp2 = datetime.datetime.strptime(data2["timestamp"], "%H:%M:%S")
                monitor_len = ret1["monitor_len"]
                # A1/A2: wall-clock instants of the matched audio boundaries.
                A1 = timestamp1 + relativedelta(seconds=int(ret1["s_begin_ms"]/1000))
                A2 = timestamp2 + relativedelta(seconds=int(ret2["s_end_ms"]/1000))
                B1 = int((A2 - A1).total_seconds())   # elapsed wall-clock seconds
                # B2: expected elapsed time if the gap was the track playing
                # unrecognized; B3: in-track progress alone.  (NOTE(review):
                # interpretation inferred from the formulas — confirm.)
                B2 = (index2 - index1 - 1)*monitor_len + int(diff_db/1000)
                B3 = int(diff_db/1000)
                if abs(B3 - B1) <= 15:
                    # Track position advanced in step with the clock: continuous.
                    is_break = False
                elif abs(B2 - B1) <= 10:
                    is_break = True
        except Exception as e:
            self._dlog.logger.error("Error@check_if_is_break", exc_info=True)
        return is_break
673 |
674 | def check_if_continuous(self, index1, index2, data1, data2):
675 | try:
676 | is_cont = True
677 | ret1 = self.get_music_data_offset(data1)
678 | ret2 = self.get_music_data_offset(data2)
679 | timestamp1 = datetime.datetime.strptime(data1["timestamp"], "%H:%M:%S")
680 | timestamp2 = datetime.datetime.strptime(data2["timestamp"], "%H:%M:%S")
681 | diff_sec = (timestamp2 - timestamp1).total_seconds()
682 | monitor_len = ret1["monitor_len"]
683 | if ret1 and ret2:
684 | for tmp_ret in [ret1, ret2]:
685 | if (tmp_ret["s_end_ms"] - tmp_ret["s_begin_ms"]) != (tmp_ret["d_end_ms"] - tmp_ret["d_begin_ms"]):
686 | return is_cont
687 | dur1 = ret1["d_end_ms"] - ret1["d_begin_ms"]
688 | dur2 = ret2["d_end_ms"] - ret2["d_begin_ms"]
689 | dur1 = dur1 if dur1 > 0 else 0
690 | dur2 = dur2 if dur2 > 0 else 0
691 | ret1_s_end = ret1["s_end_ms"]
692 | ret2_s_begin = ret2["s_begin_ms"]
693 | if index1+1 == index2 and abs(monitor_len*1000 - ret1_s_end) < 2500 and abs(ret2_s_begin) < 2500 and diff_sec < monitor_len*2:
694 | pass
695 | else:
696 | ifirst, iend = max(ret1["d_begin_ms"], ret2["d_begin_ms"]), min(ret1["d_end_ms"], ret2["d_end_ms"])
697 | inter_dur = iend - ifirst
698 | if inter_dur > 0:
699 | min_dur = min(dur1, dur2) if min(dur1, dur2) > 0 else max(dur1, dur2)
700 | if min_dur > 0:
701 | inter_rate = (inter_dur*1.0/min_dur)
702 | if inter_dur >=2 and inter_rate >=0.8:
703 | is_cont = False
704 | except Exception as e:
705 | self._dlog.logger.error("Error@check_if_continuous", exc_info=True)
706 | return is_cont
707 |
708 | def runDelayX_for_music_delay2(self, stream_id):
709 | history_data = self._delay_music[stream_id]
710 | judge_zero_or_latter = True
711 |
712 | if len(history_data) >= self._delay_list_threshold:
713 | history_data = history_data[-(self._delay_list_threshold-1):]
714 |
715 | history_data_len = len(history_data)
716 | for ii in range((history_data_len-1), 0, -1):
717 | if history_data[-ii][0][0] == NORESULT:
718 | continue
719 | else:
720 | history_data = history_data[-(ii+1):]
721 | break
722 |
723 | first_not_noresult_index = -1
724 | for index, item in enumerate(history_data):
725 | if index == 0:
726 | continue
727 | if item[0] == NORESULT:
728 | first_not_noresult_index = index
729 | else:
730 | break
731 | if first_not_noresult_index != -1:
732 | history_data = history_data[first_not_noresult_index:]
733 | self._delay_music[stream_id] = history_data
734 | return None
735 |
736 | ########## Get Break Index ##########
737 | deal_title_map = {} #key:title, value:{'count':0, 'index_list':[]}
738 | break_index = 0
739 |
740 |
741 | for index, item in enumerate(history_data[1:]):
742 | index += 1
743 | raw_title, sim_title, timestamp, data = item
744 | if index!=1:
745 | flag_first = True
746 | flag_second = True
747 | if sim_title in deal_title_map:
748 | flag_first = False
749 | if flag_first:
750 | tmp_all_len = len(history_data)
751 | tmp_count = 0
752 | tmp_first_break_index = -1
753 | #tmp_judge_size = 2
754 | tmp_judge_size = self.delay_dynamic_judge_size(deal_title_map, history_data, "music")
755 | find_interval = False
756 | find_pre_last_index = index-1
757 | find_next_sim_index = -1
758 | for i in range(index, tmp_all_len):
759 | next_raw_title, next_sim_title, next_timestamp, next_data = history_data[i]
760 | tmp_list_flag = False
761 | if next_sim_title in deal_title_map:
762 | tmp_list_flag = True
763 | tmp_count = 0
764 | tmp_first_break_index = -1
765 | if find_interval == True:
766 | find_interval = False
767 | find_next_sim_index = i
768 | if find_next_sim_index - find_pre_last_index - 1 >= 8:
769 | is_break = self.check_if_is_break(find_pre_last_index, find_next_sim_index, history_data[find_pre_last_index][3], history_data[find_next_sim_index][3])
770 | if is_break:
771 | break_index = find_pre_last_index + 1
772 | break
773 | else:
774 | if find_interval == False:
775 | find_interval = True
776 | find_pre_last_index = i - 1
777 |
778 | if tmp_list_flag:
779 | continue
780 | else:
781 | tmp_count += 1
782 | if tmp_first_break_index == -1:
783 | tmp_first_break_index = i
784 | if tmp_count < tmp_judge_size:
785 | continue
786 | flag_second = True
787 | break_index = tmp_first_break_index if tmp_first_break_index != -1 else i
788 | break
789 |
790 | if flag_first and flag_second and deal_title_map:
791 | if break_index >0:
792 | for iii in range(index, break_index):
793 | tmp_raw_title, tmp_sim_title, tmp_timestamp, tmp_data = history_data[iii]
794 | if tmp_sim_title == NORESULT:
795 | continue
796 | if tmp_sim_title in deal_title_map:
797 | deal_title_map[tmp_sim_title]['count'] += 1
798 | deal_title_map[tmp_sim_title]['index_list'].append(iii)
799 | #**********************************************************
800 | sorted_dtitle = sorted(deal_title_map.items(), key = lambda x:x[1]['count'], reverse = True)
801 | sorted_fitem_title, sorted_fitem_map = sorted_dtitle[0]
802 | sfm_count = sorted_fitem_map["count"]
803 | cfirst_index, csecond_index = sorted(sorted_fitem_map["index_list"])[:2] if sfm_count >=2 else [0, 0]
804 | if sfm_count in [2, 3]: #or ((3 < sfm_count <= 10) and sfm_count < (break_index - index)):
805 | is_cont = self.check_if_continuous(cfirst_index, csecond_index, history_data[cfirst_index][3], history_data[csecond_index][3])
806 | if not is_cont:
807 | judge_zero_or_latter = False
808 | break_index = cfirst_index + 1
809 | deal_title_map = {sorted_fitem_title:{'count':1, 'index_list':[cfirst_index]}}
810 | #**********************************************************
811 | #跳出
812 | break
813 |
814 | if sim_title == NORESULT:
815 | continue
816 | if sim_title not in deal_title_map:
817 | deal_title_map[sim_title] ={'count':0, 'index_list':[]}
818 | deal_title_map[sim_title]['count'] += 1
819 | deal_title_map[sim_title]['index_list'].append(index)
820 |
821 |
822 | ret_data = None
823 | duration_dict = {}
824 | duration = 0
825 | if break_index > 0 and deal_title_map:
826 | sorted_title_list = sorted(deal_title_map.items(), key = lambda x:x[1]['count'], reverse = True)
827 | ret_data = self.fill_ret_data_by_acrid_count(sorted_title_list, history_data)
828 | if ret_data and len(ret_data["result"]["metadata"]["music"]) == 0:
829 | ret_data = None
830 |
831 | index_range = set()
832 | for title in deal_title_map:
833 | index_range |= set(deal_title_map[title]['index_list'])
834 | min_index = min(index_range)
835 | max_index = max(index_range)
836 | duration_dict = self.compute_played_duration(history_data, min_index, max_index, judge_zero_or_latter, "music")
837 |
838 | self.remove_next_result_from_now_result_list_for_music_delay2(history_data, ret_data, max_index)
839 |
840 | if ret_data:
841 | duration = duration_dict["duration"]
842 | duration_accurate = duration_dict["duration_accurate"]
843 | sample_duration = duration_dict["sample_duration"]
844 | db_duration = duration_dict["db_duration"]
845 | mix_duration = duration_dict["mix_duration"]
846 | accurate_timestamp_utc = duration_dict["accurate_timestamp_utc"]
847 | ret_data['result']['metadata']['played_duration'] = abs(mix_duration)
848 | ret_data['result']['metadata']['timestamp_utc'] = accurate_timestamp_utc
849 | ret_data['timestamp'] = accurate_timestamp_utc
850 | if ret_data['result']['metadata']['played_duration'] <= 1:
851 | ret_data = None
852 |
853 | ########### cut history_data #############
854 | if break_index>=0:
855 | cut_index = break_index
856 | for i, item in enumerate(history_data[break_index:]):
857 | if item[0][0] == NORESULT:
858 | cut_index = break_index + i + 1
859 | else:
860 | break
861 | cut_index = cut_index - 1 if cut_index >= 1 else cut_index
862 | history_data = history_data[cut_index:]
863 |
864 | reverse_index = -1
865 | for i, item in enumerate(history_data[::-1]):
866 | if item[0][0] == NORESULT:
867 | reverse_index = i
868 | continue
869 | else:
870 | break
871 |
872 | if reverse_index != -1:
873 | new_cut_index = -1
874 | reverse_index = len(history_data) - reverse_index - 1
875 | if reverse_index in [0, 1]:
876 | history_data = []
877 | else:
878 | pass
879 |
880 | if judge_zero_or_latter == False and len(history_data) > 0:
881 | if history_data[0][0] != NORESULT:
882 | tmp_t, sim_tmp_t, tmp_timestamp, tmp_data = history_data[0]
883 | if tmp_data and "status" in tmp_data["result"]:
884 | tmp_data["result"]["status"]["code"] = 1001
885 | history_data[0] = (NORESULT, NORESULT, tmp_timestamp, tmp_data)
886 | self._delay_music[stream_id] = history_data
887 |
888 | return ret_data
889 |
890 |
891 | def deal_real_custom(self, data):
892 | is_new = False
893 | result = None
894 | curr_title = self.get_mutil_result_acrid(data, 'custom')[0]
895 |
896 | stream_id = data.get("stream_id")
897 | timestamp = data.get("timestamp")
898 | timestamp_obj = datetime.datetime.strptime(timestamp, "%H:%M:%S")
899 | if not stream_id:
900 | return result, is_new
901 | if curr_title == NORESULT:
902 | if not self.real_check_title_custom(stream_id, curr_title, timestamp_obj):
903 | self._real_custom[stream_id][0].append((curr_title, timestamp))
904 | self._real_custom[stream_id][1] = data
905 | result = data
906 | is_new = True
907 | else:
908 | result = None
909 | is_new = False
910 | else:
911 | if self.real_check_title_custom(stream_id, curr_title, timestamp_obj):
912 | result = self._real_custom[stream_id][1]
913 | is_new = False
914 | else:
915 | self._real_custom[stream_id][0].append((curr_title, timestamp))
916 | self._real_custom[stream_id][1] = data
917 | result = data
918 | is_new = True
919 | return result, is_new
920 |
921 | def deal_delay_custom(self, data):
922 | try:
923 | ret_result = None
924 | stream_id = data.get("stream_id")
925 | timestamp = data.get("timestamp")
926 | title_list = self.get_mutil_result_acrid(data, 'custom', 5)
927 | if stream_id not in self._delay_custom:
928 | self._delay_custom[stream_id] = [(title_list, timestamp, data)]
929 | else:
930 | self._delay_custom[stream_id].append((title_list, timestamp, data))
931 |
932 | if len(self._delay_custom[stream_id]) >= self._delay_list_max_num:
933 | ret_result = self.runDelayX_custom(stream_id)
934 | except Exception as e:
935 | self._dlog.logger.error("Error@deal_delay_custom", exc_info=True)
936 | return ret_result
937 |
938 | def remove_next_result_from_now_result_list(self, history_data, ret_data, max_index):
939 | #Just for custom delay filter
940 | try:
941 | if ret_data and len(history_data) >= max_index+2:
942 | acrid_list, timestamp, next_data = history_data[max_index + 1]
943 | if next_data:
944 | #update max size acrid_list to 20
945 | next_acrid_list = self.get_mutil_result_acrid(next_data, 'custom', 20)
946 | next_acrid_set = set(next_acrid_list)
947 | new_ret_custom_files = []
948 | for index, item in enumerate(ret_data["result"]["metadata"]["custom_files"]):
949 | if index == 0 or (item["acrid"] not in next_acrid_set):
950 | new_ret_custom_files.append(item)
951 | ret_data["result"]["metadata"]["custom_files"] = new_ret_custom_files
952 | except Exception as e:
953 | self._dlog.logger.error("Error@remove_next_result_from_now_result_list", exc_info=True)
954 |
955 | def get_custom_duration_by_title(self, title, ret_data):
956 | try:
957 | duration = 0
958 | for index, item in enumerate(ret_data["result"]["metadata"]["custom_files"]):
959 | if title == item["acrid"]:
960 | duration_ms = int(item["duration_ms"])
961 | if duration_ms >= 0:
962 | duration = int(duration_ms/1000)
963 | except Exception as e:
964 | self._dlog.logger.error("Error@get_custom_duration_by_title, error_data:{0}".format(ret_data), exc_info=True)
965 | return duration
966 |
967 | def custom_delay_dynamic_judge_size(self, deal_title_map, history_data):
968 | try:
969 | judge_size = 6
970 | title = list(deal_title_map.keys())[0]
971 | index = deal_title_map[title]["index_list"][-1]
972 | ret_data = history_data[index][2]
973 | duration = self.get_custom_duration_by_title(title, ret_data)
974 | tmp_size = int(duration/10)
975 | if tmp_size <=6:
976 | judge_size = tmp_size if tmp_size > 1 else 2
977 | elif tmp_size >= 18:
978 | judge_size = 18
979 | except Exception as e:
980 | self._dlog.logger.error("Error@custom_delay_dynamic_judge_size", exc_info=True)
981 |
982 | return judge_size if judge_size >= 2 else 2
983 |
    def runDelayX_custom(self, stream_id):
        """Delay filter for buffered custom (bucket) results of one stream.

        Scans the buffered history, decides where the current item ends
        (break_index), builds an aggregated result ordered by hit count,
        computes the played duration, and trims the consumed history.

        Returns the aggregated result dict, or None when no complete item
        can be cut out of the buffer yet.
        """
        history_data = self._delay_custom[stream_id]

        # Cap the buffer so a single stream cannot grow without bound.
        if len(history_data) >= self._delay_list_threshold:
            history_data = history_data[-(self._delay_list_threshold-1):]

        # Drop a leading run of no-results, keeping one entry just before
        # the first real result (history_data[-ii] walks forward from idx 1).
        history_data_len = len(history_data)
        for ii in range((history_data_len-1), 0, -1):
            if history_data[-ii][0][0] == NORESULT:
                continue
            else:
                history_data = history_data[-(ii+1):]
                break

        # If everything after index 0 is still no-result, keep only the tail
        # of that run and wait for more data.
        first_not_noresult_index = -1
        for index, item in enumerate(history_data):
            if index == 0:
                continue
            if len(item[0])>0 and item[0][0] == NORESULT:
                first_not_noresult_index = index
            else:
                break
        if first_not_noresult_index != -1:
            history_data = history_data[first_not_noresult_index:]
            self._delay_custom[stream_id] = history_data
            return None

        deal_title_map = {} #key:title, value:{'count':0, 'index_list':[]}
        tmp_deal_title_map = {}
        break_index = 0

        # Walk the history and find where the current item stops repeating.
        for index, item in enumerate(history_data[1:]):
            index += 1
            title_list, timestamp, data = item
            if index!=1:
                flag_first = True
                flag_second = True
                # flag_first: none of this sample's top-3 titles seen so far.
                for title in title_list[:3]:
                    if title in deal_title_map:
                        flag_first = False
                if flag_first:
                    # flag_second: no known title re-appears within the next
                    # judge_size samples either -> the item really ended here.
                    judge_size = self.custom_delay_dynamic_judge_size(deal_title_map, history_data)
                    for i in range(1,judge_size):
                        if index + i < len(history_data):
                            next_title_list, next_timestamp, next_data = history_data[index + i]
                            for title in next_title_list[:3]:
                                if title in deal_title_map:
                                    flag_second = False
                        else:
                            # Not enough look-ahead data buffered yet.
                            flag_second = False
                if flag_first and flag_second and deal_title_map:
                    break_index = index
                    break

            # Count hits: only rank-0 titles go into deal_title_map; every
            # rank is counted in tmp_deal_title_map.
            for i, title in enumerate(title_list):
                if title == NORESULT:
                    continue
                if i == 0:
                    if title not in deal_title_map:
                        deal_title_map[title] ={'count':0, 'index_list':[]}
                    deal_title_map[title]['count'] += 1
                    deal_title_map[title]['index_list'].append(index)
                if title not in tmp_deal_title_map:
                    tmp_deal_title_map[title] = {'count':0, 'index_list':[]}
                tmp_deal_title_map[title]['count'] += 1
                tmp_deal_title_map[title]['index_list'].append(index)

        ########### New Deal Custom Result Add Count ###########
        ret_data = None
        duration_dict = {}
        duration = 0
        if break_index > 0 and deal_title_map:
            # Group titles by hit count, then emit them ordered by count
            # (descending) and first appearance (ascending).
            tmp_count_map = {}
            sorted_title_list = sorted(deal_title_map.items(), key = lambda x:x[1]['count'], reverse = True)
            for sitem in sorted_title_list:
                sitem_title, sitem_map = sitem
                sitem_count = sitem_map["count"]
                sitem_min_index = min(sitem_map["index_list"])
                if sitem_count not in tmp_count_map:
                    tmp_count_map[sitem_count] = []
                tmp_count_map[sitem_count].append((sitem_title, sitem_min_index))
            first_item_flag = True
            for scount in sorted(tmp_count_map.keys(), reverse=True):
                count_list = sorted(tmp_count_map[scount], key = lambda x:x[1])
                for ditem in count_list:
                    dtitle, dindex = ditem
                    from_data = history_data[dindex][2]
                    if first_item_flag:
                        # The best title's payload becomes the result skeleton.
                        first_item_flag = False
                        ret_data = copy.deepcopy(from_data)
                        ret_data["result"]["metadata"]["custom_files"] = []
                    self.custom_result_append(ret_data, dtitle, from_data, scount, tmp_deal_title_map)

            # The played-duration window spans the first..last sample index
            # of any counted title.
            index_range = set()
            for title in deal_title_map:
                index_range |= set(deal_title_map[title]['index_list'])
            min_index = min(index_range)
            max_index = max(index_range)
            duration_dict = self.compute_played_duration(history_data, min_index, max_index, True, "custom")

            self.remove_next_result_from_now_result_list(history_data, ret_data, max_index)

            if ret_data:
                duration = duration_dict["duration"]
                duration_accurate = duration_dict["duration_accurate"]
                sample_duration = duration_dict["sample_duration"]
                db_duration = duration_dict["db_duration"]
                mix_duration = duration_dict["mix_duration"]
                accurate_timestamp_utc = duration_dict["accurate_timestamp_utc"]
                ret_data['result']['metadata']['played_duration'] = abs(mix_duration)
                ret_data['result']['metadata']['timestamp_utc'] = accurate_timestamp_utc
                ret_data['timestamp'] = accurate_timestamp_utc
                # Too short to count as a real play: drop the result.
                if ret_data['result']['metadata']['played_duration'] <= self._delay_custom_played_duration_min:
                    ret_data = None

        ########### cut history_data #############
        if break_index>=0:
            # Drop everything consumed up to break_index plus the run of
            # no-results right after it, keeping one entry as separator.
            cut_index = break_index
            for i, item in enumerate(history_data[break_index:]):
                if item[0][0] == NORESULT:
                    cut_index = break_index + i + 1
                else:
                    break
            cut_index = cut_index - 1 if cut_index >= 1 else cut_index
            history_data = history_data[cut_index:]

        # If the trailing run of no-results reaches (almost) the start of
        # the remaining buffer, discard the buffer entirely.
        reverse_index = -1
        for i, item in enumerate(history_data[::-1]):
            if item[0][0] == NORESULT:
                reverse_index = i
                continue
            else:
                break

        if reverse_index != -1:
            new_cut_index = -1
            reverse_index = len(history_data) - reverse_index - 1
            if reverse_index in [0, 1]:
                history_data = []
            else:
                pass

        self._delay_custom[stream_id] = history_data
        return ret_data
1128 |
class FilterWorker:
    """Drives ResultFilter over a list of scan results.

    Feeds each raw recognition result through the music and custom delay
    filters and accumulates the filtered outputs in self._result_map.
    """

    def __init__(self):
        self.tmp_no_result = {'status': {'msg': 'No result', 'code': 1001, 'version': '1.0'}, 'metadata': {}}
        self._result_map = []  # filtered results, in emission order
        self.init_logger()
        self._result_filter = ResultFilter(self.dlog)

    def init_logger(self):
        # Stream-only logger, shared with the ResultFilter instance.
        self.dlog = acrcloud_logger.AcrcloudLogger('Filter_Log')
        self.dlog.addStreamHandler()

    def save_one_delay(self, old_data, isCustom=0):
        """Push one sample into the music (isCustom=0) or custom (isCustom=1)
        delay filter; store the emitted result (if any) and return whether
        one was emitted."""
        if isCustom:
            data = self._result_filter.deal_delay_custom(old_data)
        else:
            data = self._result_filter.deal_delay_history(old_data)

        if data is not None:
            # stream_id is an internal routing key, not part of the output.
            del data["stream_id"]
            self._result_map.append(data)
            return True
        return False

    def save_one(self, jsondata):
        """Split one raw result into its music and custom parts and feed
        each part to the matching delay filter.

        Returns True when the last filter call emitted a result, False
        otherwise (including on internal errors).
        """
        # Initialize before the try: otherwise an exception raised before the
        # first filter call (e.g. missing 'timestamp') would make the final
        # `return ret` raise UnboundLocalError instead of returning False.
        ret = False
        try:
            timestamp = jsondata['timestamp']
            if jsondata['result']['status']['code'] != 0:
                jsondata['result']['metadata'] = {'timestamp_utc':timestamp}
            elif 'metadata' in jsondata['result']:
                jsondata['result']['metadata']['timestamp_utc'] = timestamp

            tmp_no_result_json = {'status': {'msg': 'No result', 'code': 1001, 'version': '1.0'}, 'metadata': {'timestamp_utc': timestamp}}

            custom_data = copy.deepcopy(jsondata)
            if jsondata['result']['status']['code'] != 0:
                # No result at all: a no-result sample goes to both filters.
                ret = self.save_one_delay(jsondata, 0)
                ret = self.save_one_delay(custom_data, 1)
            elif 'metadata' in jsondata['result'] and 'custom_files' in jsondata['result']['metadata']:
                if 'music' in jsondata['result']['metadata']:
                    # Both kinds present: split them between the two filters.
                    del custom_data['result']['metadata']['music']
                    del jsondata['result']['metadata']['custom_files']
                    ret = self.save_one_delay(jsondata, 0)
                else:
                    # Custom only: the music filter gets a no-result placeholder.
                    jsondata['result'] = copy.deepcopy(tmp_no_result_json)
                    ret = self.save_one_delay(jsondata, 0)
                ret = self.save_one_delay(custom_data, 1)
            elif 'metadata' in jsondata['result'] and 'music' in jsondata['result']['metadata']:
                # Music only.
                # NOTE(review): custom_data is prepared as a no-result here but
                # never passed to the custom filter — confirm whether a
                # save_one_delay(custom_data, 1) call is missing.
                custom_data['result'] = copy.deepcopy(tmp_no_result_json)
                ret = self.save_one_delay(jsondata, 0)
        except Exception:
            self.dlog.logger.error("Error@save_one", exc_info=True)
        return ret

    def do_filter(self, tmp_id, filepath, result, rec_length, timestamp):
        """Wrap one scan result in the envelope save_one expects and save it."""
        try:
            jsoninfo = {
                "stream_id": tmp_id,
                "file":filepath,
                "rec_length": rec_length,
                "result": result,
                "timestamp": timestamp
            }
            self.save_one(jsoninfo)
        except Exception:
            self.dlog.logger.error("Error@do_filter", exc_info=True)

    def end_filter(self, tmp_id, rec_length, timestamp):
        """Append 59 synthetic no-result samples after the last real one so
        the delay filters reach their flush threshold and emit anything
        still buffered."""
        try:
            tmp_no_result = copy.deepcopy(self.tmp_no_result)
            for i in range(1, 60):
                tmp_timestamp = datetime.datetime.strptime(timestamp, "%H:%M:%S")
                new_timestamp = (tmp_timestamp + relativedelta(seconds=int(i*rec_length))).strftime("%H:%M:%S")
                jsoninfo = {
                    "stream_id": tmp_id,
                    "rec_length": rec_length,
                    "result": tmp_no_result,
                    "timestamp": new_timestamp
                }
                self.save_one(jsoninfo)
        except Exception:
            self.dlog.logger.error("Error@end_filter", exc_info=True)

    def start_filter(self, tmp_id, rec_length, timestamp):
        """Seed the filters with one leading no-result sample (the loop
        below runs exactly once)."""
        try:
            tmp_no_result = copy.deepcopy(self.tmp_no_result)
            for i in range(1, 0, -1):
                new_timestamp = timestamp
                jsoninfo = {
                    "stream_id": tmp_id,
                    "rec_length": rec_length,
                    "result": tmp_no_result,
                    "timestamp": new_timestamp
                }
                self.save_one(jsoninfo)
        except Exception:
            self.dlog.logger.error("Error@start_filter", exc_info=True)

    def apply_filter(self, result_list):
        """Run the whole filter pipeline over result_list and return the
        accumulated filtered results."""
        try:
            # One timestamp-based id per run keeps its stream state isolated.
            appid = datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S")
            rec_length = 10
            timestamp = None
            for index, item in enumerate(result_list):
                filename = item["file"]
                timestamp = item["timestamp"]
                rec_length = item["rec_length"]
                if index == 0:
                    self.start_filter(appid, rec_length, timestamp)
                result = item["result"]
                # Only clean successes (0) and clean no-results (1001) pass.
                if "status" in result and result["status"]["code"] in [0, 1001]:
                    self.do_filter(appid, filename, result, rec_length, timestamp)
            if timestamp is not None:
                self.end_filter(appid, rec_length, timestamp)
        except Exception:
            self.dlog.logger.error("Error@apply_filter", exc_info=True)
        return self._result_map

    def test(self):
        # Smoke test: title extraction/simplification on a canned result.
        a = '{"timestamp": "01 00:17:40", "rec_length": 10, "result": {"status": {"msg": "Success", "code": 0, "version": "1.0"}, "cost_time": 1.2630000114441, "result_type": 0, "metadata": {"timestamp_utc": "2018-08-02 14:44:39", "music": [{"album": {"name": "Solino"}, "play_offset_ms": 85200, "sample_begin_time_offset_ms": 300, "title": "La Bambola", "result_from": 1, "release_date": "2002-10-28", "sample_end_time_offset_ms": 9460, "genres": [{"name": "Pop"}], "label": "Amiga", "db_end_time_offset_ms": 85120, "score": 82, "db_begin_time_offset_ms": 75960, "artists": [{"name": "Patty Pravo"}], "duration_ms": 182200, "external_ids": {"isrc": "ITB006870616", "upc": "743219711328"}, "acrid": "27fef80da4dabc33591a2c08a08edaf0", "external_metadata": {"spotify": {"album": {"name": "Solino", "id": "0I3MXd5FYGAj6X9GOJepMb"}, "track": {"name": "La Bambola", "id": "5YT3WdXo5gBwZ0TlJiB0TE"}, "artists": [{"name": "Patty Pravo", "id": "2Yi5fknmHBqqKjHF6cXQyh"}]}, "deezer": {"album": {"name": "Solino", "id": "112016"}, "track": {"name": "La Bambola", "id": "1017795"}, "artists": [{"name": "Patty Pravo", "id": "58615"}]}, "youtube": {"vid": "UHCgZY-HX6U"}}}]}}, "file": "radioairplay_19/501.2018.06.19.04.00.00.mp3"}'
        data = json.loads(a)
        raw_title = self._result_filter.get_mutil_result_title(data, 'music', 1)[0]
        sim_title = self._result_filter.tryStrSub(raw_title)
        print(raw_title, sim_title)
1255 |
--------------------------------------------------------------------------------