├── effect_images
│   ├── 1.png
│   └── 结构示意图.png
├── service
│   ├── module_tools
│   │   ├── __pycache__
│   │   │   ├── input_data.cpython-37.pyc
│   │   │   ├── save_result.cpython-37.pyc
│   │   │   ├── identify_faultservice.cpython-37.pyc
│   │   │   └── diagnosis_faultservice.cpython-37.pyc
│   │   ├── diagnosis_faultservice.py
│   │   ├── save_result.py
│   │   ├── genarate_solutions.py
│   │   ├── identify_faultservice.py
│   │   └── input_data.py
│   ├── generate_solutions_service.py
│   ├── fault_diagnosis_service.py
│   └── web_service.py
├── config
│   ├── stop.txt
│   └── data_base_sql
├── requirements.txt
├── utils
│   ├── jaccard_api.py
│   ├── data_tools.py
│   ├── graph.py
│   ├── pageRank.py
│   ├── draw_graph_tool.py
│   ├── pcalg.py
│   └── process_aiops2020_data_to_original.py
├── dao
│   ├── es_dao.py
│   ├── neo4j_dao.py
│   └── db_dao.py
├── bean
│   ├── output_model.py
│   ├── input_model.py
│   └── save_model.py
├── .gitignore
├── app.py
├── demo
│   ├── aiops_2020_data_test.py
│   └── hadoop_data_test.py
├── README.md
└── LICENSE

/effect_images/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OS-ABC/AIOsp-Fault-Diagnosis/main/effect_images/1.png
--------------------------------------------------------------------------------
/effect_images/结构示意图.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OS-ABC/AIOsp-Fault-Diagnosis/main/effect_images/结构示意图.png
--------------------------------------------------------------------------------
/service/module_tools/__pycache__/input_data.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OS-ABC/AIOsp-Fault-Diagnosis/main/service/module_tools/__pycache__/input_data.cpython-37.pyc
--------------------------------------------------------------------------------
/service/module_tools/__pycache__/save_result.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OS-ABC/AIOsp-Fault-Diagnosis/main/service/module_tools/__pycache__/save_result.cpython-37.pyc
--------------------------------------------------------------------------------
/service/module_tools/__pycache__/identify_faultservice.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OS-ABC/AIOsp-Fault-Diagnosis/main/service/module_tools/__pycache__/identify_faultservice.cpython-37.pyc
--------------------------------------------------------------------------------
/service/module_tools/__pycache__/diagnosis_faultservice.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OS-ABC/AIOsp-Fault-Diagnosis/main/service/module_tools/__pycache__/diagnosis_faultservice.cpython-37.pyc
--------------------------------------------------------------------------------
/config/stop.txt:
--------------------------------------------------------------------------------
*
<
>
(
)
\n
,
.
-
:
/
[
]
$
\t
=
;
java
@
+
...
{
}
_
&
at
main
s
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
SQLAlchemy==1.3.18
networkx==2.4
pandas==1.0.5
numpy==1.18.5
Flask==1.1.2
pytz==2020.1
scipy==1.5.0
elasticsearch==7.11.0
gsq==0.1.6
jieba==0.42.1
py2neo==2021.0.1
--------------------------------------------------------------------------------
/utils/jaccard_api.py:
--------------------------------------------------------------------------------
import re
from copy import deepcopy

import jieba
import jieba.analyse


def log_preprocess(log, paramregex, eraseRex):
    # Strip noise fragments, then replace parameter-like fragments with the <*> wildcard
    for currentRex in eraseRex:
        log = re.sub(currentRex, '', log)
    for currentRex in paramregex:
        log = re.sub(currentRex, '<*>', log)
    return log


def generate_cidian_jaccard(exception_window_log_, stopkeyword):
    # Build the de-duplicated token dictionary ("cidian" = 词典) used for Jaccard matching
    log_dic = []
    fenci = jieba.cut_for_search(exception_window_log_)
    for fc in fenci:
        if fc not in log_dic and fc not in stopkeyword and not re.search(' +', fc):
            log_dic.append(fc)
    return log_dic
--------------------------------------------------------------------------------
/dao/es_dao.py:
--------------------------------------------------------------------------------
from elasticsearch import Elasticsearch


class ESConnection:
    def __init__(self, hosts):
        self.hosts = hosts
        self.es = Elasticsearch(hosts=hosts)

    def query(self, index, query):
        data = self.es.search(body=query, index=index, params={"scroll": "10m", "size": 10000})
        result = data['hits']['hits']
        total = data['hits']['total']['value']
        scroll_id = data['_scroll_id']

        # The initial search returns at most 10000 hits; page through the
        # remainder with the scroll API ((total - 1) // 10000 extra pages).
        for i in range((total - 1) // 10000):
            query_scroll = self.es.scroll(scroll_id=scroll_id, params={"scroll": "1m"})['hits']['hits']
            result += query_scroll

        return result
--------------------------------------------------------------------------------
/service/generate_solutions_service.py:
--------------------------------------------------------------------------------
from dao.db_dao import DBDao
from service.module_tools.genarate_solutions import GenetateSolutuons
from service.module_tools.save_result import SaveResult


def time_generate_logs_solutions():
    """
    Generate solutions for every root-cause log in the root-cause detail list
    that does not yet have a repair plan.
    :return:
    """
    dbDao = DBDao()
    root_logs = dbDao.get_all_root_logs_noSolution()
    dbDao.db_close()
    result = None
    for root_log in root_logs:
        sorted_solutions = GenetateSolutuons.get_solutions_by_logDetail(root_log.detail)
        result = SaveResult.save_solutions(root_log.fault_id, root_log.causeOfFault, sorted_solutions)
    return result
--------------------------------------------------------------------------------
/bean/output_model.py:
--------------------------------------------------------------------------------
class FaultServiceDetail:
    def __init__(self, faultId: int, serviceName: str, hostName: str, fault_root: str, exception_time: str):
        self.serviceName = serviceName
        self.hostName = hostName
        self.fault_root = fault_root
        self.exception_time = exception_time
        self.faultId = faultId

    def keys(self):
        '''
        Called when dict(obj) is used on an instance; this defines the dictionary
        keys, whose values are then looked up as obj['name']. Plain objects do not
        support that kind of access, so __getitem__ is added below to enable it.
        '''
        return ('serviceName', 'hostName', 'fault_root', 'exception_time', 'faultId')

    def __getitem__(self, item):
        '''
        Built-in hook: called for obj['name'] style access; the value returned
        here is the looked-up result.
        '''
        return getattr(self, item)


if __name__ == '__main__':
    a = FaultServiceDetail(1, '2', '3', '4', '5')
    re = dict(a)
    print(re)
--------------------------------------------------------------------------------
/utils/data_tools.py:
--------------------------------------------------------------------------------
import datetime
import time

import pytz


def is_number(number):
    """
    Check whether this is a number (int, long, float, hex).

    Arguments:
        number: {string}, input string for number check.
    """
    try:
        float(number)  # for int, long, float
    except ValueError:
        try:
            int(number, 16)  # for possible hex
        except ValueError:
            return False

    return True

# Convert a UTC time string (e.g. 2018-07-13T16:00:00Z) to a local timestamp
def utc_to_local(utc_time_str, utc_format='%Y-%m-%dT%H:%M:%S.%fZ'):
    local_tz = pytz.timezone('Asia/Chongqing')  # local time zone
    local_format = "%Y-%m-%d %H:%M:%S"  # local time format

    utc_dt = datetime.datetime.strptime(utc_time_str, utc_format)  # parse the UTC string into a datetime.datetime
    local_dt = utc_dt.replace(tzinfo=pytz.utc).astimezone(local_tz)  # attach the UTC tzinfo, then astimezone switches zones: UTC ==> local
    # time_str = local_dt.strftime(local_format)  # format the datetime as a string
    # return int(time.mktime(time.strptime(time_str, local_format)))  # time.mktime turns the time tuple into a timestamp; time.strptime yields the tuple
    return int(time.mktime(local_dt.timetuple()))
--------------------------------------------------------------------------------
/utils/graph.py:
--------------------------------------------------------------------------------
class ServiceNode:
    def __init__(self, serviceId, serviceName=None, serviceType=None):
        self.serviceId = serviceId
        self.serviceName = serviceName
        self.serviceType = serviceType
        self.hostName = None
        self.containerName = None
        self.hostId = None
        self.containerId = None
        self.isException = 0
        self.childs = []

    def add_childs(self, service_id):
        self.childs.append(service_id)


class ExceptionDataNode:
    def __init__(self, id, nodeType):
        self.id = id
        self.nodeType = nodeType
        self.name = None
        self.detail = None
        self.belongTo = None
        self.exceptionTime = None
        self.units = None
        self.childs = []

    def add_childs(self, id):
        self.childs.append(id)


class Graph:
    def __init__(self, nodes, edges):
        self.nodes = nodes
        self.edges = edges
        self.generate_invoke_graph_construct()

    def generate_invoke_graph_construct(self):
        """
        Build the graph structure needed by the graph-traversal algorithms.
        nodes: {serviceId: ServiceNode, serviceId: ServiceNode}
        edges: [[serviceId, serviceId], [serviceId, serviceId]]
        :return: graph
        """
        graph = {}
        for edge in self.edges:
            for i in edge:
                if i not in graph.keys():
                    node = self.nodes[i]
                    graph[i] = node
            if edge[1] not in graph[edge[0]].childs:
                graph[edge[0]].add_childs(edge[1])
        # Add any isolated nodes that appear in no edge
        for key, node in self.nodes.items():
            if key not in graph:
                graph[key] = node
        self.nodes = graph
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*

# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json

# Runtime data
pids
*.pid
*.seed
*.pid.lock

# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov

# Coverage directory used by tools like istanbul
coverage
*.lcov

# nyc test coverage
.nyc_output

# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt

# Bower dependency directory (https://bower.io/)
bower_components

# node-waf configuration
.lock-wscript

# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release

# Dependency directories
node_modules/
jspm_packages/

# TypeScript v1 declaration files
typings/

# TypeScript cache
*.tsbuildinfo

# Optional npm cache directory
.npm

# Optional eslint cache
.eslintcache

# Microbundle cache
.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/

# Optional REPL history
.node_repl_history

# Output of 'npm pack'
*.tgz

# Yarn Integrity file
.yarn-integrity

# dotenv environment variables file
.env
.env.test

# parcel-bundler cache (https://parceljs.org/)
.cache

# Next.js build output
.next

# Nuxt.js build / generate output
.nuxt
dist

# Gatsby files
.cache/
# Comment in the public line in if your project uses Gatsby and *not* Next.js
# https://nextjs.org/blog/next-9-1#public-directory-support
# public

# vuepress build output
.vuepress/dist

# Serverless directories
.serverless/

# FuseBox cache
.fusebox/

# DynamoDB Local files
.dynamodb/

# TernJS port file
.tern-port

/.idea/
*/__pycache__
--------------------------------------------------------------------------------
/dao/neo4j_dao.py:
--------------------------------------------------------------------------------
import logging

from py2neo import Graph


class GraphDao:

    def __init__(self):
        self.g = Graph(
            host="127.0.0.1",  # IP address of the server hosting neo4j (see ifconfig)
            http_port=7474,    # port the neo4j server listens on
            user="neo4j",      # database user name; "neo4j" unless it was changed
            password="neo4j")
        # self.num_limit = 20

    def execute_sql(self, sql):
        answer = None
        try:
            answer = self.g.run(sql).data()
        except Exception:
            logging.error("execute sql failed, sql: {0}".format(sql))
        return answer

    def get_all__entities(self):
        sql = 'MATCH (n) return n'
        result = self.execute_sql(sql)
        return [i['n'] for i in result]

    ## Fault-repair knowledge graph

    # def get_all_log_entities(self):
    #     sql = 'MATCH (n:log) return n'
    #     result = self.execute_sql(sql)
    #     return [i['n'] for i in result]

    # Fetch all log nodes in the knowledge graph
    def get_all_log_entities(self):
        result = self.g.run("match (n:log) return n").data()
        return result

    # Fetch the list of fault nodes for a given log
    def get_fault_entity_by_log(self, log_name):
        sql = 'MATCH (x:fault)-[r:has_log]->(y:log) where y.name = "{0}" return x'.format(
            log_name)
        result = self.execute_sql(sql)
        return [i['x'] for i in result]

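    # Added note (hedged sketch, not part of the original code): the
    # string-formatted Cypher in these helpers breaks on names containing
    # quotes and is open to injection. py2neo's Graph.run() also accepts
    # query parameters, so an equivalent call could be written as:
    #
    #   self.g.run('MATCH (x:fault)-[r:has_log]->(y:log) '
    #              'WHERE y.name = $name RETURN x', name=log_name).data()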
    # Fetch the list of solutions for a given fault
    def get_solutions_by_fault(self, fault_name):
        sql = 'MATCH (x:fault)-[r:has_solution]->(y:solution) where x.name = "{0}" return y'.format(
            fault_name)
        result = self.execute_sql(sql)
        return [i['y'] for i in result]

    # Fetch the list of reasons for a given fault
    def get_reasons_by_fault(self, fault_name):
        sql = 'MATCH (x:fault)-[r:has_reason]->(y:reason) where x.name = "{0}" return y'.format(
            fault_name)
        result = self.execute_sql(sql)
        return [i['y'] for i in result]

    # Fetch the list of solutions for a given reason
    def get_solutions_by_reason(self, reason_name):
        sql = 'MATCH (x:reason)-[r:has_solution]->(y:solution) where x.name = "{0}" return y'.format(
            reason_name)
        result = self.execute_sql(sql)
        return [i['y'] for i in result]
--------------------------------------------------------------------------------
/bean/input_model.py:
--------------------------------------------------------------------------------
class DeploymentDataEntry:
    def __init__(self, serviceInstanceId: str, serviceName: str, hostId: str = None, hostName: str = None, containerId: str = None, containerName: str = None):
        self.serviceInstanceId = serviceInstanceId
        self.serviceName = serviceName
        self.hostId = hostId
        self.hostName = hostName
        self.containerId = containerId
        self.containerName = containerName

class TraceDataEntry:
    def __init__(self, id: str, pid: str, serviceId: str, traceId: str, serviceName: str = None, serviceType: str = None, startTime: str = None):
        self.id = id
        self.pid = pid
        self.serviceId = serviceId
        self.serviceName = serviceName
        self.serviceType = serviceType
        self.startTime = startTime
        self.traceId = traceId

class OriginalMetricEntry:
    def __init__(self, metricId: str, metricName: str, timestamp: str, value: float, metricBelongTo: str, units: str = None, metricBelongLevel: str = None):
        self.metricId = metricId
        self.metricName = metricName
        self.timestamp = timestamp
        self.value = value
        self.units = units
        self.metricBelongTo = metricBelongTo
        self.metricBelongLevel = metricBelongLevel

class OriginalLogEntry:
    def __init__(self, logId: str, timestamp: str, logMessage: str, logBelongTo: str, logLevel: str = None, logBelongLevel: str = None):
        self.logId = logId
        self.timestamp = timestamp
        self.logMessage = logMessage
        self.logLevel = logLevel
        self.logBelongTo = logBelongTo
        self.logBelongLevel = logBelongLevel

class ExceptionMetricEntry:
    def __init__(self, startTime: str, endTime: str, metricId: str, metricName: str, value: float, metricBelongTo: str, units: str = None, metricBelongLevel: str = None):
        self.startTime = startTime
        self.endTime = endTime
        self.metricId = metricId
        self.metricName = metricName
        self.value = value
        self.metricBelongTo = metricBelongTo
        self.units = units
        self.metricBelongLevel = metricBelongLevel

class ExceptionLogEntry:
    def __init__(self, startTime: str, endTime: str, logId: str, logBelongTo: str, logExceptionSegment: str, logBelongLevel: str = None):
        self.startTime = startTime
        self.endTime = endTime
        self.logId = logId
        self.logExceptionSegment = logExceptionSegment
        self.logBelongTo = logBelongTo
        self.logBelongLevel = logBelongLevel
--------------------------------------------------------------------------------
/utils/pageRank.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
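# Added note: the iteration below implements the standard PageRank update
#   PR(u) = (1 - d) / N + d * sum_{v -> u} PR(v) / outdegree(v)
# with damping factor d = 0.85, iterating until the total absolute change
# across all nodes drops below min_delta or max_iterations is reached.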

# from pygraph.classes.digraph import digraph
import copy

import networkx as nx

class PRIterator:
    __doc__ = '''Compute the PR value of every node in a graph.'''

    def __init__(self, dg):
        self.damping_factor = 0.85  # damping factor, i.e. α
        self.max_iterations = 100  # maximum number of iterations
        self.min_delta = 0.00001  # convergence threshold that ends the iteration, i.e. ϵ
        self.graph = copy.deepcopy(dg)

    def page_rank(self):
        # First give every node without out-links an out-link to every node
        for node in self.graph.nodes():
            if len(list(self.graph.neighbors(node))) == 0:
                for node2 in self.graph.nodes():
                    nx.DiGraph.add_edge(self.graph, node, node2)

        nodes = self.graph.nodes()
        graph_size = len(nodes)

        if graph_size == 0:
            return {}
        page_rank = dict.fromkeys(nodes, 1.0 / graph_size)  # assign every node an initial PR value
        damping_value = (1.0 - self.damping_factor) / graph_size  # the (1−α)/N term of the formula

        flag = False
        for i in range(self.max_iterations):
            change = 0
            for node in nodes:
                rank = 0
                for incident_page in self.graph.predecessors(node):  # iterate over all pages linking into this node
                    rank += self.damping_factor * (page_rank[incident_page] / len(list(self.graph.neighbors(incident_page))))
                rank += damping_value
                change += abs(page_rank[node] - rank)  # absolute change
                page_rank[node] = rank

            # print("This is NO.%s iteration" % (i + 1))
            # print(page_rank)

            if change < self.min_delta:
                flag = True
                break
        # if flag:
        #     print("finished in %s iterations!" % node)
        # else:
        #     print("finished out of 100 iterations!")
        return page_rank


if __name__ == '__main__':
    dg = nx.DiGraph()

    dg.add_nodes_from(["0", "1", "2", "3", "4"])

    dg.add_edge("0", "1")
    dg.add_edge("1", "0")
    dg.add_edge("0", "2")
    dg.add_edge("2", "0")
    dg.add_edge("0", "4")
    dg.add_edge("4", "0")
    dg.add_edge("2", "4")
    dg.add_edge("4", "2")
    # dg.add_edge("A", "C")
    # dg.add_edge("A", "D")
    # dg.add_edge("C", "A")
    # dg.add_edge("C", "B")
    # dg.add_edge("D", "B")
    # dg.add_edge("B", "D")
    # dg.add_edge("E", "A")

    pr = PRIterator(dg)
    page_ranks = pr.page_rank()

    print("The final page rank is\n", page_ranks)

--------------------------------------------------------------------------------
/config/data_base_sql:
--------------------------------------------------------------------------------
create table exception_data_dependency_graph
(
    id          int auto_increment
        primary key,
    fault_id    int                                not null,
    graph_json  text                               not null,
    create_time datetime default CURRENT_TIMESTAMP not null,
    update_time datetime default CURRENT_TIMESTAMP not null
)
    comment 'service exception-data dependency graph storage table';

create table fault_service
(
    id                 int auto_increment
        primary key,
    fault_service_id   varchar(255)                       not null comment 'fault service ID',
    fault_service_name varchar(255)                       null,
    fault_service_type varchar(255)                       null,
    host_name          varchar(255)                       null,
    exception_time     datetime                           null,
    process_state      int      default 0                 not null,
    create_time        datetime default CURRENT_TIMESTAMP not null,
    update_time        datetime default CURRENT_TIMESTAMP not null
)
    comment 'fault service detail table';

create table fault_service_root_detail
(
    id           int auto_increment
        primary key,
    fault_id     int                                not null,
    causeOfFault varchar(255)                       not null comment 'root cause ID',
    causeName    varchar(255)                       not null comment 'root cause name',
    detail       text                               null comment 'root cause content',
    has_solution int      default 0                 not null,
    type         int      default 0                 null comment '0: metric, 1: log',
    `rank`       int                                not null comment 'root cause rank',
    create_time  datetime default CURRENT_TIMESTAMP not null,
    update_time  datetime default CURRENT_TIMESTAMP not null
)
    comment 'fault service root-cause detail table';

create table fault_service_solution
(
    id             int auto_increment
        primary key,
    fault_id       int                                not null,
    root_log_id    varchar(255)                       not null,
    fault_reason   text                               null,
    fault_solution text                               not null,
    `rank`         int                                not null,
    create_time    datetime default CURRENT_TIMESTAMP not null,
    update_time    datetime default CURRENT_TIMESTAMP not null
)
    comment 'fault service repair-solution detail table';

create table service_dependency_graph
(
    id          int auto_increment
        primary key,
    fault_id    int                                not null,
    graph_json  text                               not null,
    create_time datetime default CURRENT_TIMESTAMP not null,
    update_time datetime default CURRENT_TIMESTAMP not null
)
    comment 'service dependency graph storage table';

--------------------------------------------------------------------------------
/bean/save_model.py:
--------------------------------------------------------------------------------
from sqlalchemy import Column, Integer, Text, DateTime, Date, String, text
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()


class FaultService(Base):
    __tablename__ = 'fault_service'
    id = Column(Integer, primary_key=True)
    fault_service_id = Column(String)
    fault_service_name = Column(String)
    fault_service_type = Column(String)
    host_name = Column(String)
    exception_time = Column(DateTime)
    process_state = Column(Integer)
    create_time = Column(DateTime, server_default=text('CURRENT_TIMESTAMP'))
    update_time = Column(DateTime, server_default=text('CURRENT_TIMESTAMP'))
    __mapper_args__ = {
        "order_by": create_time.desc()
    }

    def to_dict(self):
        return {c.name: getattr(self, c.name, None) for c in self.__table__.columns}

class FaultServiceRoot(Base):
    __tablename__ = 'fault_service_root_detail'
    id = Column(Integer, primary_key=True)
    fault_id = Column(Integer)
    causeOfFault = Column(String)
    causeName = Column(String)
    detail = Column(Text)
    has_solution = Column(Integer)
    type = Column(Integer)  # 0: metric, 1: log
    rank = Column(Integer)  # 1 2 3
    create_time = Column(DateTime, server_default=text('CURRENT_TIMESTAMP'))
    update_time = Column(DateTime, server_default=text('CURRENT_TIMESTAMP'))
    __mapper_args__ = {
        "order_by": create_time.desc()
    }

    def to_dict(self):
        return {c.name: getattr(self, c.name, None) for c in self.__table__.columns}

class ServiceDependencyGraph(Base):
    __tablename__ = 'service_dependency_graph'
    id = Column(Integer, primary_key=True)
    fault_id = Column(Integer)
    graph_json = Column(Text)
    create_time = Column(DateTime, server_default=text('CURRENT_TIMESTAMP'))
    update_time = Column(DateTime, server_default=text('CURRENT_TIMESTAMP'))
    __mapper_args__ = {
        "order_by": create_time.desc()
    }

    def to_dict(self):
        return {c.name: getattr(self, c.name, None) for c in self.__table__.columns}

class ExceptionDataDependencyGraph(Base):
    __tablename__ = 'exception_data_dependency_graph'
    id = Column(Integer, primary_key=True)
    fault_id = Column(Integer)
    graph_json = Column(Text)
    create_time = Column(DateTime, server_default=text('CURRENT_TIMESTAMP'))
    update_time = Column(DateTime, server_default=text('CURRENT_TIMESTAMP'))
    __mapper_args__ = {
        "order_by": create_time.desc()
    }

    def to_dict(self):
        return {c.name: getattr(self, c.name, None) for c in self.__table__.columns}

class FaultServiceSolution(Base):
    __tablename__ = 'fault_service_solution'
    id = Column(Integer, primary_key=True)
    fault_id = Column(Integer)
    root_log_id = Column(String)
    fault_reason = Column(Text)
    fault_solution = Column(Text)
    rank = Column(Integer)  # 1 2 3
    create_time = Column(DateTime, server_default=text('CURRENT_TIMESTAMP'))
    update_time = Column(DateTime, server_default=text('CURRENT_TIMESTAMP'))
    __mapper_args__ = {
        "order_by": create_time.desc()
    }

    def to_dict(self):
        return {c.name: getattr(self, c.name, None) for c in self.__table__.columns}
--------------------------------------------------------------------------------
/service/fault_diagnosis_service.py:
--------------------------------------------------------------------------------

from service.module_tools.diagnosis_faultservice import DiagnosisFaultService
from service.module_tools.identify_faultservice import IdentifyFaultService
from service.module_tools.input_data import InputData
from service.module_tools.save_result import SaveResult
from utils.graph import Graph


def fault_diagmosis(deploymentData, traceData, original_metricData, original_logData, exception_metricData,
                    exception_logData):
    # Input data
    data = InputData(deploymentData, traceData, original_metricData, original_logData, exception_metricData,
                     exception_logData)
    # Identify the faulty services
    final_root_services, service_invoke_graph = get_root_services(data)
    # Diagnose each faulty service
    services_diagnisis_results = get_fault_services_roots(data, final_root_services, service_invoke_graph)
    # Persist the diagnosis results
    save_fault_root_cause_diagnosis_result(service_invoke_graph, final_root_services, services_diagnisis_results)

def get_root_services(data):
    """
    Main entry of the faulty-service identification submodule.
    :param data: InputData instance
    :return: faulty-service list ({serviceId: score}) and the service dependency graph
             (Graph instance; nodes: {serviceId: ServiceNode, serviceId: ServiceNode},
             edges: [[serviceId, serviceId], [serviceId, serviceId]])
    """
    nodes, edges, traverse_initial_list = IdentifyFaultService.generate_service_invoke_graph(data.organization_traceObjData_by_traceId())
    nodes = IdentifyFaultService.completion_serviceNode_deploymentData(data.organization_deploymentObjData_by_sviid(), nodes)
    nodes = IdentifyFaultService.set_service_exception_info(nodes, data)
    service_invoke_graph = Graph(nodes, edges)
    # final_root_services = get_fault_services_list(service_invoke_graph, traverse_initial_list)
    final_root_services = IdentifyFaultService.get_fault_services_list_PR(service_invoke_graph, traverse_initial_list)
    print('Faulty service list: {}'.format(final_root_services))
    return final_root_services, service_invoke_graph

def get_fault_services_roots(data, final_root_services, service_invoke_graph):
    """
    Entry point for diagnosing all identified faulty services.
    :param data: InputData instance
    :param final_root_services: {serviceId: score, serviceId: score}
    :param service_invoke_graph: nodes: {}, edges: []
    :return:
    """
    services_diagnisis_results = dict()
    for i in final_root_services:
        serviceNode = service_invoke_graph.nodes[i]
        services_diagnisis_results[serviceNode.serviceId] = dict()
        falut_root_dict, final_exception_data_graph = DiagnosisFaultService.get_servcie_fault_causes(serviceNode, data)
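        # Added note: falut_root_dict maps root-cause node ids to scores, and
        # final_exception_data_graph is the exception-data dependency graph
        # those ids index into; both are persisted per service further below.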
        # Print the results
        falut_root_dict_name = dict()
        for root_id, rootValue in falut_root_dict.items():
            rootNode = final_exception_data_graph.nodes[root_id]
            falut_root_dict_name[rootNode.name] = rootValue
        print('Root causes for service {0}: {1}'.format(serviceNode.serviceName, falut_root_dict_name))

        services_diagnisis_results[serviceNode.serviceId]['falut_root_dict'] = falut_root_dict
        services_diagnisis_results[serviceNode.serviceId]['final_exception_data_graph'] = final_exception_data_graph
    return services_diagnisis_results

def save_fault_root_cause_diagnosis_result(service_invoke_graph, final_root_services, services_diagnisis_results):
    SaveResult.save(service_invoke_graph, final_root_services, services_diagnisis_results)

--------------------------------------------------------------------------------
/service/web_service.py:
--------------------------------------------------------------------------------
from bean.output_model import FaultServiceDetail
from dao.db_dao import DBDao
from service.module_tools.genarate_solutions import GenetateSolutuons
from service.module_tools.save_result import SaveResult


def get_fault_service_list():
    """
    Query all faulty-service records, sorted by time from newest to oldest,
    split into a processed and an unprocessed list.
    :return:
    """
    dbDao = DBDao()
    fault_service_list_unprocess = dbDao.select_all_fault_service_detail_by_processState(0)
    fault_service_list_process = dbDao.select_all_fault_service_detail_by_processState(1)
    fault_service_detail_list_unprocess = list()
    fault_service_detail_list_process = list()
    for fault_service in fault_service_list_unprocess:
        root = dbDao.select_rank1_faultserviceroot_by_faultid(fault_service.id)
        if root:
            faultServiceDetail = FaultServiceDetail(fault_service.id, fault_service.fault_service_name,
                                                    fault_service.host_name, root.causeName,
                                                    fault_service.exception_time)
            fault_service_detail_list_unprocess.append(faultServiceDetail)

    for fault_service in fault_service_list_process:
        root = dbDao.select_rank1_faultserviceroot_by_faultid(fault_service.id)
        if root:
            faultServiceDetail = FaultServiceDetail(fault_service.id, fault_service.fault_service_name,
                                                    fault_service.host_name, root.causeName,
                                                    fault_service.exception_time)
            fault_service_detail_list_process.append(faultServiceDetail)
    dbDao.db_close()
    return [dict(i) for i in fault_service_detail_list_unprocess], [dict(i) for i in fault_service_detail_list_process]


# def get_fault_service_detail(fault_id):
#     """
#     Query the diagnosis detail of one faulty service; the response includes the
#     service dependency graph at diagnosis time and the service's detail records.
#     :param fault_id:
#     :return:
#     """

#     """
#     Query the fault detail by faultId
#     """
#
#
# def get_fault_id(fault_id):
#     fault = None
#     db = get_session()
#     if fault_id:
#         fault = db.query(Fault).filter(Fault.id == fault_id).one()
#     db.close()
#     return fault.to_dict()


def get_service_invoke_graph(fault_id):
    """
    Look up the service dependency graph for a given fault id.
    :param fault_id:
    :return:
    """
    service_invoke_graph_json = None
    dbDao = DBDao()
    if fault_id:
        service_invoke_graph_json = dbDao.select_service_invoke_graph_by_faultid(fault_id)
    dbDao.db_close()
    if service_invoke_graph_json is None:
        return None
    return service_invoke_graph_json.to_dict()

def get_exception_data_dependency_graph(fault_id, service_id):
    """
    Query the service exception-data dependency graph by fault_id.
    :param fault_id:
    :param service_id:
    :return:
    """
    exception_data_dependency_graph_json = None
    dbDao = DBDao()
    if fault_id and service_id:
        exception_data_dependency_graph_json = dbDao.select_exception_data_dependency_graph_by_faultid(fault_id)
    dbDao.db_close()
    if exception_data_dependency_graph_json is None:  # guard: nothing found (or missing parameters)
        return None
    return exception_data_dependency_graph_json.to_dict()


def get_solutions_by_log(fault_id, log_id, log_detail):
    """
    Fetch the solutions for a root-cause log.
    :param fault_id:
    :param log_id:
    :param log_detail:
    :return:
    """
    dbDao = DBDao()
    root_log = dbDao.get_root_log_by_logid_and_faultid(fault_id, log_id)
    if root_log.has_solution == 0:
        sorted_solutions = GenetateSolutuons.get_solutions_by_logDetail(log_detail)
        result = SaveResult.save_solutions(root_log.fault_id, root_log.causeOfFault, sorted_solutions)
    solutions = dbDao.select_solutions_by_logid_and_faultid(fault_id, log_id)
    dbDao.db_close()
    return [i.to_dict() for i in solutions]
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
import logging
from copy import deepcopy

from flask import Flask, request, json, jsonify, make_response

from service.web_service import get_fault_service_list, get_service_invoke_graph, get_exception_data_dependency_graph, \
    get_solutions_by_log

app = Flask(__name__, static_folder='static', template_folder='templates')

# @app.route("/fault_diagnosis", methods=['POST'])  # request method: POST
# def fault_diagnosis():
#     data = request.data
#     j_data = json.loads(data)
#     sys_rca(j_data)

result_response = {'code': 1, 'message': 'success', 'data': None}

@app.route("/fault_service_list", methods=['GET'])  # request method: GET
def fault_service_list():
    response = deepcopy(result_response)
    try:
        fault_list_unprocess, fault_list_process = get_fault_service_list()
        response['data'] = {'fault_list_unprocess': fault_list_unprocess, 'fault_list_process': fault_list_process}
    except Exception as e:
        response['code'] = 0
        response['message'] = str(e)
    res = make_response(jsonify(response))  # build the response body
    # res.status = '200'  # set the status code
    res.headers['Access-Control-Allow-Origin'] = "*"  # allow cross-origin requests
    res.headers['Access-Control-Allow-Methods'] = 'PUT,GET,POST,DELETE'
    return res


@app.route("/fault_service_invoke_graph", methods=['GET'])  # request method: GET
def fault_service_invoke_graph():
    response = deepcopy(result_response)
    try:
        fault_id = request.args['fault_id']
        # input_data = json.loads(data)
        service_invoke_graph_json = get_service_invoke_graph(fault_id)
        # service_invoke_graph = json.loads(service_invoke_graph_json)
        response['data'] = {'service_invoke_graph': service_invoke_graph_json}
    except Exception as e:
        response['code'] = 0
        response['message'] = str(e)
    return jsonify(response)


@app.route("/exception_data_dependency_graph", methods=['GET'])  # request method: GET
def exception_data_dependency_graph():
    response = deepcopy(result_response)
    try:
        fault_id = request.args['fault_id']
        service_id = request.args['service_id']
        # input_data = json.loads(data)
        log_metric_graph_json = get_exception_data_dependency_graph(fault_id, service_id)
        # log_metric_graph = json.loads(log_metric_graph_json)
        response['data'] = {'log_metric_graph': log_metric_graph_json}
    except Exception as e:
        response['code'] = 0
        response['message'] = str(e)
    res = make_response(jsonify(response))  # build the response body
    # res.status = '200'  # set the status code
    res.headers['Access-Control-Allow-Origin'] = "*"  # allow cross-origin requests
    res.headers['Access-Control-Allow-Methods'] = 'PUT,GET,POST,DELETE'
    return res


@app.route("/root_log_solutions", methods=['POST'])  # request method: POST
def fault_logDetail_solutions():
    response = deepcopy(result_response)
    try:
        fault_id = request.form['fault_id']
        log_id = request.form['log_id']
        log_detail = request.form['log_detail']
        solutions = get_solutions_by_log(fault_id, log_id, log_detail)
        response['data'] = {'fault_id': fault_id, 'log_id': log_id, 'log_detail': log_detail, 'solutions': solutions}
    except Exception as e:
        response['code'] = 0
        response['message'] = str(e)
    res = make_response(jsonify(response))  # build the response body
    # res.status = '200'  # set the status code
    res.headers['Access-Control-Allow-Origin'] = "*"  # allow cross-origin requests
    res.headers['Access-Control-Allow-Methods'] = 'PUT,GET,POST,DELETE'
    return res


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000, debug=True)
    # CORS support
    # def after_request(response):
    #     # cross-origin support for the JS front end
    #     response.headers['Cache-Control'] = 'no-cache'
    #     response.headers['Access-Control-Allow-Origin'] = '*'
    #     return response
    #
    # app.after_request(after_request)
--------------------------------------------------------------------------------
/utils/draw_graph_tool.py:
--------------------------------------------------------------------------------
import networkx as nx
import matplotlib.pyplot as plt
try:
    import pygraphviz
    from networkx.drawing.nx_agraph import graphviz_layout
except ImportError:
    try:
        import pydot
        from networkx.drawing.nx_pydot import graphviz_layout
    except ImportError:
        raise ImportError("This example needs Graphviz and either "
                          "PyGraphviz or PyDotPlus")
if __name__ == '__main__':
    dg = nx.DiGraph()
    # dg.add_nodes_from(["0", "1", "2", "3", "4"])
    nodes = ["os_022:osb_001", "docker_001:csf_001", "docker_008:csf_005", "docker_007:csf_004", "docker_008:csf_003", "docker_008:csf_002", "db_003", "db_007", "db_009", "docker_001:fly_remote_001", "os_021:osb_001", "docker_003:csf_001", "docker_005:csf_005", "docker_006:csf_004", "docker_005:csf_003", "docker_006:csf_002", "docker_003:fly_remote_001", "docker_006:csf_003", "docker_005:csf_002", "docker_004:csf_001", "docker_006:csf_005", "docker_004:fly_remote_001", "docker_007:csf_005", "docker_007:csf_003", "docker_008:csf_004", "docker_007:csf_002", "docker_005:csf_004", "docker_002:csf_001", "docker_002:fly_remote_001"]
    edges = [("os_022:osb_001", "docker_001:csf_001"), ("docker_001:csf_001", "docker_008:csf_005"), ("docker_001:csf_001", "docker_007:csf_004"), ("docker_001:csf_001", "docker_008:csf_003"), ("docker_001:csf_001", "docker_008:csf_002"), ("docker_008:csf_005", "db_003"), ("docker_007:csf_004", "db_003"), ("docker_008:csf_003", "db_003"), ("docker_008:csf_002", "db_003"), ("docker_001:csf_001", "db_007"), ("docker_001:csf_001", "db_009"), ("docker_001:csf_001", "docker_001:fly_remote_001"), ("os_021:osb_001", "docker_003:csf_001"), ("docker_003:csf_001", "docker_005:csf_005"), ("docker_003:csf_001", "docker_006:csf_004"), ("docker_003:csf_001", "docker_005:csf_003"), ("docker_003:csf_001", "docker_006:csf_002"),
"docker_006:csf_002"), ("docker_005:csf_005","db_003"), ("docker_006:csf_004","db_003"), ("docker_005:csf_003", "db_003"), ("docker_006:csf_002", "db_003"), ("docker_003:csf_001", "db_007"), ("docker_003:csf_001", "db_009"), ("docker_003:csf_001","docker_003:fly_remote_001"), ("docker_003:csf_001","docker_006:csf_003"),("docker_003:csf_001","docker_005:csf_002"), ("docker_006:csf_003", "db_003"),("docker_005:csf_002","db_003"), ("os_021:osb_001","docker_004:csf_001"), ("docker_004:csf_001","docker_006:csf_005"), ("docker_004:csf_001","docker_006:csf_004"), ("docker_004:csf_001", "docker_006:csf_003"),("docker_004:csf_001","docker_006:csf_002"), ("docker_006:csf_005", "db_003"),("docker_004:csf_001","db_007"),("docker_004:csf_001","db_009"), ("docker_004:csf_001", "docker_004:fly_remote_001"), ("docker_001:csf_001","docker_007:csf_005"), ("docker_001:csf_001", "docker_007:csf_003"), ("docker_007:csf_005", "db_003"),("docker_007:csf_003","db_003"), ("docker_001:csf_001","docker_008:csf_004"), ("docker_001:csf_001", "docker_007:csf_002"),("docker_008:csf_004", "db_003"), ("docker_007:csf_002", "db_003"), ("docker_003:csf_001","docker_005:csf_004"), ("docker_005:csf_004", "db_003"), ("docker_004:csf_001","docker_005:csf_005"), ("os_022:osb_001","docker_002:csf_001"), ("docker_002:csf_001","docker_007:csf_005"), ("docker_002:csf_001","docker_007:csf_004"),("docker_002:csf_001","docker_007:csf_003"), ("docker_002:csf_001", "docker_007:csf_002"), ("docker_002:csf_001","db_007"), ("docker_002:csf_001","db_009"), ("docker_002:csf_001","docker_002:fly_remote_001"), ("docker_004:csf_001", "docker_005:csf_004"), ("docker_004:csf_001","docker_005:csf_002"), ("docker_003:csf_001", "docker_006:csf_005"), ("docker_004:csf_001", "docker_005:csf_003"), ("docker_002:csf_001", "docker_008:csf_005"), ("docker_002:csf_001", "docker_008:csf_002"), ("docker_002:csf_001","docker_008:csf_004"), ("docker_002:csf_001","docker_008:csf_003")] 18 | dg.add_nodes_from(nodes) 19 | dg.add_edges_from(edges) 20 | # dg.add_weighted_edges_from(list) 21 | # pos = nx.graphviz_layout(dg, prog='dot') 22 | 23 | nx.draw(dg, 24 | pos=nx.graphviz_layout(dg, prog='dot'), # pos 指的是布局,主要有spring_layout,random_layout,circle_layout,shell_layout 25 | node_color='g', # node_color指节点颜色,有rbykw,同理edge_color 26 | edge_color='r', 27 | with_labels=True, # with_labels指节点是否显示名字 28 | font_size=18, # font_size表示字体大小,font_color表示字的颜色 29 | node_size=60) # font_size表示字体大小,font_color表示字的颜色 30 | plt.show() 31 | pass -------------------------------------------------------------------------------- /service/module_tools/diagnosis_faultservice.py: -------------------------------------------------------------------------------- 1 | import networkx as nx 2 | 3 | from utils.graph import ExceptionDataNode, Graph 4 | from utils.pageRank import PRIterator 5 | from utils.pcalg import construct_service_dependency_diagram 6 | 7 | 8 | class DiagnosisFaultService: 9 | @staticmethod 10 | def get_servcie_fault_causes(serviceNode,data): 11 | """ 12 | 对某一故障服务进行细粒度诊断 13 | :param serviceNode: 14 | :param data: input_data实例 15 | :return: 16 | """ 17 | # 确定与故障服务相关的异常指标 18 | serviceId = serviceNode.serviceId 19 | hostId = serviceNode.hostId 20 | containerId = serviceNode.containerId 21 | exception_metrics, exception_logs = data.get_fault_service_related_log_metric_data(serviceId,containerId,hostId) 22 | # 处理原始数据,得到PC算法输入格式,原始数据预处理 23 | pc_input = data.get_PC_input_data(exception_metrics, exception_logs) 24 | # 利用PC算法生成图,g的节点为输入数据的Index 25 | g, columns_mapping = 
        # Replace g's index nodes with the corresponding metricId / logId
        g_new = DiagnosisFaultService.get_g_dataId(g, columns_mapping)
        # Identify the root-cause nodes in the graph
        falut_root_dict = DiagnosisFaultService.get_root_cause(g_new)
        # Build the graph structure to return
        final_exception_data_graph = DiagnosisFaultService.geneate_final_return_graph(g_new, exception_metrics, exception_logs)
        return falut_root_dict, final_exception_data_graph

    @staticmethod
    def get_g_dataId(g, columns_mapping):
        g_new = nx.DiGraph()
        for node in g.nodes:
            g_new.add_node(columns_mapping[node])
        for edge in g.edges:
            g_new.add_edge(columns_mapping[edge[0]], columns_mapping[edge[1]])
        return g_new

    @staticmethod
    def get_root_cause(g):
        """
        Derive the list of root causes of a faulty service from the causal graph.
        Args:
            g: causal graph

        Returns: root-cause list

        """
        result = list()
        # Find the node with the highest PR value
        begin_node_id, begin_node_pr = None, 0
        # for node_id in node_ids:
        #     if len(list(g.predecessors(node_id))) > max_pre_size:
        #         max_pre_node = node_id
        #         max_pre_size = len(list(g.predecessors(node_id)))
        pr = PRIterator(g)
        page_ranks = pr.page_rank()
        node_pr_sorted = sorted(page_ranks.items(), key=lambda x: x[1], reverse=True)
        begin_node_id = node_pr_sorted[0][0]
        # Level-order traversal upstream from the start node
        node_filter, node_queue = {begin_node_id}, list([begin_node_id])
        while node_queue:
            node_now = node_queue.pop(0)
            if not list(g.predecessors(node_now)):  # predecessors() returns an iterator, so materialize it before testing
                if node_now not in result:
                    result.append(node_now)
                continue
            is_pre_not_filter = False
            for k in g.predecessors(node_now):
                if k not in node_filter:
                    is_pre_not_filter = True
                    node_filter.add(k)
                    node_queue.append(k)
            # If every upstream node is already in the filter, add the current node
            # to result so that result cannot end up empty
            if not is_pre_not_filter:
                for k in g.predecessors(node_now):
                    if k not in result:
                        result.append(k)
                if node_now not in result:
                    result.append(node_now)

        g_reverse = g.reverse(copy=True)
        pr_reverse = PRIterator(g_reverse)
        page_ranks_reverse = pr_reverse.page_rank()
        for key, value in page_ranks_reverse.items():
            if key in result:
                page_ranks_reverse[key] = value + 0.5  # boost nodes reached by the upstream traversal
        node_pr_reverse_sorted = sorted(page_ranks_reverse.items(), key=lambda x: x[1], reverse=True)
        result_final = {}
        for index, i in enumerate(node_pr_reverse_sorted):
            if index < 3:
                result_final[i[0]] = i[1]
        return result_final

    @staticmethod
    def geneate_final_return_graph(g_new, exception_metrics, exception_logs):
        """
        Build the graph structure to return.
        :param g_new:
        :param exception_metrics: exception metrics related to the service
        :param exception_logs: exception logs related to the service
        :return:
        """
        nodes = {}
        for node_id in g_new.nodes:
            id = node_id
            if id in exception_metrics:
                nodeType = "metric"
                tmpExceptionDataNode = ExceptionDataNode(id, nodeType)
                tmpExceptionDataNode.name = exception_metrics[id][0].metricName
                tmpExceptionDataNode.detail = exception_metrics[id][0].value
                tmpExceptionDataNode.units = exception_metrics[id][0].units
                tmpExceptionDataNode.belongTo = exception_metrics[id][0].metricBelongTo
                tmpExceptionDataNode.exceptionTime = exception_metrics[id][0].startTime
                nodes[id] = tmpExceptionDataNode
            elif id in exception_logs:
                nodeType = "log"
                tmpExceptionDataNode = ExceptionDataNode(id, nodeType)
                tmpExceptionDataNode.belongTo = exception_logs[id][0].logBelongTo
                tmpExceptionDataNode.exceptionTime = exception_logs[id][0].startTime
                tmpExceptionDataNode.detail = exception_logs[id][0].logExceptionSegment
                nodes[id] = tmpExceptionDataNode
            else:
                continue
        edges = g_new.edges()
        final_return_graph = Graph(nodes, edges)
        return final_return_graph
--------------------------------------------------------------------------------
/dao/db_dao.py:
--------------------------------------------------------------------------------
import datetime

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from bean.save_model import FaultServiceRoot, FaultService, ServiceDependencyGraph, ExceptionDataDependencyGraph, \
    FaultServiceSolution

class DBDao:
    def __init__(self):
        self.engine = create_engine('mysql+pymysql://root:root1234@127.0.0.1:3306/fault_result_solution')
        self.session = self.get_session()

    def get_session(self):
        # Create the DBSession class and open a session
        DBSession = sessionmaker(bind=self.engine)
        session = DBSession()
        return session

    def db_close(self):
        self.session.close()

    def db_commit(self):
        self.session.commit()

    def get_all_root_logs_noSolution(self):
        """
        Fetch all root-cause logs that do not yet have a repair solution.
        :return:
        """
        root_logs = self.session.query(FaultServiceRoot).filter(FaultServiceRoot.type == 1,
                                                                FaultServiceRoot.has_solution == 0).all()
        return root_logs

    def update_root_detail_table_has_solutuon(self, fault_id, log_id):
        """
        Set has_solution to 1 for a root-cause log whose repair solutions have been generated.
        :return:
        """
        # update() takes a values mapping; the original passed a boolean expression here
        result = self.session.query(FaultServiceRoot).filter(FaultServiceRoot.fault_id == fault_id,
                                                             FaultServiceRoot.causeOfFault == log_id).update(
            {FaultServiceRoot.has_solution: 1})
        return result

    def insert_fault_service_into_fault_service_table(self, serviceId, serviceName, serviceType, hostName):
        """
        Insert a newly identified faulty service into the fault-service detail table.
        :param serviceId:
        :param serviceName:
        :param serviceType:
        :param hostName:
        :return:
        """
        fault_service = FaultService(fault_service_id=serviceId, fault_service_name=serviceName,
                                     fault_service_type=serviceType, host_name=hostName,
                                     exception_time=datetime.datetime.now(), process_state=0)
        self.session.add(fault_service)
        self.db_commit()
        return fault_service

    def insert_fault_service_root_into_fault_service_root_table(self, fault_id, root_id, name, detail, type, rank):
        service_falut_root = FaultServiceRoot(fault_id=fault_id, causeOfFault=root_id, causeName=name,
                                              detail=detail, has_solution=0, type=type,
                                              rank=rank)
        self.session.add(service_falut_root)
        return service_falut_root

    def insert_service_dependency_graph_into_service_dependency_graph_table(self, fault_id, fault_service_dependency_graph_json):
        service_dependency_graph = ServiceDependencyGraph(fault_id=fault_id,
                                                          graph_json=fault_service_dependency_graph_json)
        self.session.add(service_dependency_graph)
        return service_dependency_graph

    def insert_exception_data_dependency_graphh_into_exception_data_dependency_graph_table(self, fault_id, service_exception_data_dependency_graph_json):
        exception_data_dependency_graph = ExceptionDataDependencyGraph(fault_id=fault_id,
                                                                       graph_json=service_exception_data_dependency_graph_json)
        self.session.add(exception_data_dependency_graph)
        return exception_data_dependency_graph

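    # Added usage sketch (hedged): apart from insert_fault_service_into_fault_service_table,
    # the insert helpers here do not commit; callers are expected to do e.g.
    #
    #   dao = DBDao()
    #   dao.insert_fault_service_root_into_fault_service_root_table(fault_id, root_id, name, detail, 1, 0)
    #   dao.db_commit()
    #   dao.db_close()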
    def insert_fault_service_solution_insert_fault_service_solution_table(self, fault_id, log_id, fault_reason, fault_solution, rank):
        faultServiceSolution = FaultServiceSolution(fault_id=fault_id, root_log_id=log_id, fault_reason=fault_reason,
                                                    fault_solution=fault_solution, rank=rank)
        self.session.add(faultServiceSolution)
        return faultServiceSolution

    def select_all_fault_service_detail_by_processState(self, process_state):
        fault_service_list = self.session.query(FaultService).filter(FaultService.process_state == process_state).all()
        return fault_service_list

    def select_rank1_faultserviceroot_by_faultid(self, fault_id):
        # rank 0 is the top-ranked root cause, so sort ascending (the original sorted descending)
        root = self.session.query(FaultServiceRoot).filter(FaultServiceRoot.fault_id == fault_id).order_by(
            FaultServiceRoot.rank.asc()).first()
        return root

    def select_service_invoke_graph_by_faultid(self, fault_id):
        service_invoke_graph_json = self.session.query(ServiceDependencyGraph).filter(
            ServiceDependencyGraph.fault_id == fault_id).first()
        return service_invoke_graph_json

    def select_exception_data_dependency_graph_by_faultid(self, fault_id):
        exception_data_dependency_graph_json = self.session.query(ExceptionDataDependencyGraph).filter(
            ExceptionDataDependencyGraph.fault_id == fault_id).one()
        return exception_data_dependency_graph_json

    def get_root_log_by_logid_and_faultid(self, fault_id, log_id):
        root_log = self.session.query(FaultServiceRoot).filter(FaultServiceRoot.fault_id == fault_id,
                                                               FaultServiceRoot.causeOfFault == log_id).first()
        return root_log

    def select_solutions_by_logid_and_faultid(self, fault_id, log_id):
        solutions = self.session.query(FaultServiceSolution).filter(FaultServiceSolution.fault_id == fault_id, FaultServiceSolution.root_log_id == log_id).order_by(FaultServiceSolution.rank.asc()).all()
        return solutions
--------------------------------------------------------------------------------
/service/module_tools/save_result.py:
--------------------------------------------------------------------------------
import json

from dao.db_dao import DBDao


class SaveResult:
    @staticmethod
    def save(service_invoke_graph, final_root_services, services_diagnisis_results):
        dbDao = DBDao()
        fault_ids = dict()
        # Persist the identified faulty services into the fault-service table
        for fault_service_id in final_root_services:
            serviceNode = service_invoke_graph.nodes[fault_service_id]
            fault_service = dbDao.insert_fault_service_into_fault_service_table(serviceNode.serviceId, serviceNode.serviceName, serviceNode.serviceType, serviceNode.hostName)
            fault_ids[fault_service.id] = fault_service.fault_service_id
        # Build the service-dependency-graph JSON to store
        fault_service_dependency_graph_json = SaveResult.generate_save_fault_service_dependency_graph_json(service_invoke_graph, fault_ids)
        for fault_id, fault_service_id in fault_ids.items():
            fault_service_roots = services_diagnisis_results[fault_service_id]['falut_root_dict']
            final_exception_data_graph = services_diagnisis_results[fault_service_id]['final_exception_data_graph']
            for index, root_id in enumerate(fault_service_roots.keys()):
                rootNode = final_exception_data_graph.nodes[root_id]
                if rootNode.nodeType == "metric":
                    rootNode.nodeType = 0
                else:
                    rootNode.nodeType = 1
                service_falut_root = dbDao.insert_fault_service_root_into_fault_service_root_table(fault_id, root_id, rootNode.name, rootNode.detail, rootNode.nodeType, index)
            service_dependency_graph = dbDao.insert_service_dependency_graph_into_service_dependency_graph_table(fault_id, fault_service_dependency_graph_json)
            # Build the service exception-data dependency-graph JSON to store
            service_exception_data_dependency_graph_json = SaveResult.generate_save_service_exception_data_dependency_graph_json(final_exception_data_graph, fault_service_roots)
            exception_data_dependency_graph = dbDao.insert_exception_data_dependency_graphh_into_exception_data_dependency_graph_table(fault_id, service_exception_data_dependency_graph_json)
        dbDao.db_commit()
        dbDao.db_close()

    @staticmethod
    def generate_save_fault_service_dependency_graph_json(service_invoke_graph, fault_ids):
        storage_graph_json = dict()
        nodes = list()
        edges = list()
        for node_id, node in service_invoke_graph.nodes.items():
            save_node_dict = {}
            save_node_dict['id'] = node_id
            save_node_dict['label'] = node.serviceName
            save_node_dict['data'] = {}
            save_node_dict['data']['name'] = node.serviceName
            save_node_dict['data']['type'] = node.serviceType
            save_node_dict['data']['hostName'] = node.hostName  # the original assigned this to 'type' again, overwriting the service type
            save_node_dict['data']['fault_id'] = None
            if node_id in fault_ids.values():
                save_node_dict['data']['health_level'] = 2
                for key, value in fault_ids.items():
                    if value == node_id:
                        save_node_dict['data']['fault_id'] = key
            elif node.isException == 1:
                save_node_dict['data']['health_level'] = 1
            else:
                save_node_dict['data']['health_level'] = 0

            nodes.append(save_node_dict)
        for i in service_invoke_graph.edges:
            edge = {}
            edge['source'] = i[0]
            edge['target'] = i[1]
            edges.append(edge)
        storage_graph_json['nodes'] = nodes
        storage_graph_json['edges'] = edges
        graph_json = json.dumps(storage_graph_json)
        # graph_json = storage_graph_json
        return graph_json

    @staticmethod
    def generate_save_service_exception_data_dependency_graph_json(final_exception_data_graph, fault_service_roots):
        storage_graph_json = dict()
        nodes = list()
        edges = list()
        for node_id, node in final_exception_data_graph.nodes.items():
            save_node_dict = {}
            save_node_dict['id'] = node.id
            save_node_dict['label'] = node.name
            save_node_dict['data'] = dict()
            save_node_dict['data']['name'] = node.name
            save_node_dict['data']['type'] = node.nodeType
            save_node_dict['data']['detail'] = node.detail
            save_node_dict['data']['belongTo'] = node.belongTo
            save_node_dict['data']['exceptionTime'] = node.exceptionTime
            save_node_dict['data']['units'] = node.units
            if node_id in fault_service_roots.keys():
                save_node_dict['data']['import'] = 1
            else:
                save_node_dict['data']['import'] = 0
            nodes.append(save_node_dict)
        for i in final_exception_data_graph.edges:
            edge = {}
            edge['source'] = i[0]
            edge['target'] = i[1]
            edges.append(edge)
        storage_graph_json['nodes'] = nodes
        storage_graph_json['edges'] = edges
        graph_json_result = json.dumps(storage_graph_json)
        # graph_json_result = storage_graph_json
        return graph_json_result

    @staticmethod
    def save_solutions(fault_id, log_id, sorted_solutions):
        dbDao = DBDao()
        for index, solution in enumerate(sorted_solutions):
            faultServiceSolution = dbDao.insert_fault_service_solution_insert_fault_service_solution_table(fault_id, log_id, solution['reason'], solution['html_content'], index)
        result = dbDao.update_root_detail_table_has_solutuon(fault_id, log_id)
        if result:
            dbDao.db_commit()
        dbDao.db_close()
        return result
--------------------------------------------------------------------------------
/demo/aiops_2020_data_test.py:
--------------------------------------------------------------------------------
import json

from service.fault_diagnosis_service import fault_diagmosis
from utils.data_tools import is_number


def data_collection_process_json():
    """
    Load the raw trace data, deployment data, raw metric data, raw log data,
    exception metric data and exception log data from JSON files.
    Args:

    Returns: the corresponding data loaded from the JSON files
    """

    # Load the unprocessed exception data
    f = open('../data/aiops_data_2020/2020_04_11/items_exception_result.json', 'r')
    original_exception_metric_data = json.load(f)
    exception_metric_data = original_exception_metric_data
    f.close()

    f = open('../data/aiops_data_2020/2020_04_11/origina_items.json', 'r')
    original_items = json.load(f)
    items = original_items
    f.close()

    f = open('../data/aiops_data_2020/2020_04_11/origina_traces.json', 'r')
    original_traces = json.load(f)
    traces = {}
    for i, (k, v) in enumerate(original_traces.items()):
        if i <= 500:
            traces[k] = v
        else:
            break
    f.close()

    f = open('../data/aiops_data_2020/2020_04_11/original_deployment_data.json', 'r')
    deployment_data = json.load(f)
    f.close()
    return exception_metric_data, items, traces, deployment_data

def get_original_trace_data(traces):
    """
    Convert traces into the target input format: each record contains the fields
    id, pid, serviceId, serviceName, serviceType, startTime and traceId.
    The aiops_2020 preprocessing groups the spans of each trace together.
    :param traces:
    :return: list in the target format
    """
    traceData = list()
    for traceId, trace in traces.items():
        for index, span in enumerate(trace):
            if span['pid'] == 'None':
                span['pid'] = -1
            tmp_dict = {}
            tmp_dict['id'] = span['id']
            tmp_dict['pid'] = span['pid']
            tmp_dict['serviceId'] = span['serviceId']  # serviceCode does not uniquely identify a service instance; needs further discussion
            tmp_dict['serviceName'] = span['serviceName']
            tmp_dict['serviceType'] = span['serviceType']
            tmp_dict['startTime'] = span['startTime']
            tmp_dict['traceId'] = traceId
            traceData.append(tmp_dict)
    return traceData

def get_deployment_data(deployment_data):
    """
    Convert the deployment data into the target input format: each record contains
    serviceInstanceId, serviceName, hostId, hostName, containerId and containerName.
    :param deployment_data:
    :return: list in the target format
    """
    return deployment_data


def get_original_metric_data(items):
    """
    Convert the raw metric data into the target input format: each record contains
    timestamp, metricId, metricName, value, units, metricBelongTo and metricBelongLevel.
    :param items:
    :return: list in the target format
    """
    originalMetricData = list()
    for key, metrics in items.items():
        metricId = metrics['metricId']
        metricName = metrics['metricId']
        units = None
        metricBelongTo = metrics['metricBelongTo']
        metricBelongLevel = metrics['metricBelongLevel']
        stampTimes = metrics['timeStamps']
        values = metrics['values']
        if len(stampTimes) > 0 and len(stampTimes) == len(values):
            for index, value in enumerate(stampTimes):
                tmp_metric = dict()
                tmp_metric['metricId'] = str(metricId)
                tmp_metric['metricName'] = metricName
                tmp_metric['metricBelongTo'] = metricBelongTo
                tmp_metric['metricBelongLevel'] = metricBelongLevel
                tmp_metric['units'] = units
                tmp_metric['timestamp'] = value
                if is_number(values[index]):
                    tmp_metric['value'] = float(values[index])
104 | def get_exception_metric_data(exception_metric_data):
105 |     """
106 |     Convert the exception metric data into the input format: each record contains the startTime, endTime, metricId, metricName, value, units, metricBelongTo and metricBelongLevel fields.
107 |     :param exception_metric_data:
108 |     :return: a List in the target format
109 |     """
110 |     exceptionMetricData = list()
111 | 
112 |     for ex_metric in exception_metric_data:
113 |         metricId = ex_metric['metricId']
114 |         metricName = ex_metric['metricName']
115 |         units = None
116 |         metricBelongTo = ex_metric['belongTo']
117 |         metricBelongLevel = None
118 |         stampTimes = ex_metric['testTime']
119 |         values = ex_metric['value']
120 |         stampTimes_splits = stampTimes.strip().split(',')
121 |         values_splits = values.strip().split(',')
122 |         if len(stampTimes_splits) > 0 and len(stampTimes_splits) == len(values_splits):
123 |             for index, value in enumerate(stampTimes_splits):
124 |                 tmp_metric = dict()
125 |                 tmp_metric['metricId'] = str(metricId)
126 |                 tmp_metric['metricName'] = metricName
127 |                 tmp_metric['metricBelongTo'] = metricBelongTo
128 |                 tmp_metric['metricBelongLevel'] = metricBelongLevel
129 |                 tmp_metric['units'] = units
130 |                 tmp_metric['endTime'] = value
131 |                 tmp_metric['startTime'] = value
132 |                 if is_number(values_splits[index]):
133 |                     tmp_metric['value'] = float(values_splits[index])
134 |                     exceptionMetricData.append(tmp_metric)
135 |         else:
136 |             continue
137 |     return exceptionMetricData
138 | 
139 | if __name__ == '__main__':
140 |     exception_metric_data, items, traces, deployment_data = data_collection_process_json()
141 | 
142 |     deploymentData = get_deployment_data(deployment_data)
143 |     traceData = get_original_trace_data(traces)
144 |     original_metricData = get_original_metric_data(items)
145 |     exception_metricData = get_exception_metric_data(exception_metric_data)
146 |     exception_logData = []
147 |     original_logData = []
148 |     fault_diagmosis(deploymentData, traceData, original_metricData, original_logData, exception_metricData,
149 |                     exception_logData)
150 |     pass
151 | 
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | ---
2 | # AIOsp-Fault-Diagnosis
3 | 
4 | ### About the tool
5 | 
6 | This project designs and implements a **fault diagnosis assistant** that helps operators diagnose and repair faults in microservice systems. Root-cause diagnosis can localize a fault down to the metric or log level, repair solutions are generated from an existing knowledge graph, and a query service is exposed via the Flask framework.
7 | 
8 | Implemented features:
9 | - Service dependency graph construction: reconstructs the system's live service dependencies and further marks abnormal services and fault services in the graph
10 | - Exception data extraction and dependency graph construction: analyses the abnormal services, builds causal relations between exceptions, and marks the root-cause metrics or logs in the graph
11 | - Assisted root-cause analysis for exception logs: based on log matching, suggests likely causes and solutions for an exception log
12 | 
13 | 
14 | The overall localization workflow is shown below:
15 | 
16 | Diagram (see effect_images/结构示意图.png)
17 | 
18 | ### Install
19 | ```
20 | pip install -r requirements.txt
21 | ```
22 | ```
23 | 1. Install the MySQL database
24 | 2. Run config/data_base_sql to create the databases the project needs
25 | ```
26 | 
27 | #### Note
28 | 
29 | - python_version = 3.7.9
30 | - The MySQL database stores the fault root-cause diagnosis results and the repair solutions
31 | - If the storage part is not needed, all SaveResult-related code can be commented out
32 | - When using the project's database storage, remember to set the account and password in the ```dao``` folder
33 | 
34 | ### demo
35 | 
36 | - Data download
37 | ```
38 | Link: https://pan.baidu.com/s/1bMwuqRpJ1hMhKPhxajp4kw  Password: ptmj
39 | ```
40 | ### demoRun
41 | 
42 | Put the data into the ```data``` folder
43 | - hadoop data: ```run demo/hadoop_data_test.py``` to run fault root-cause diagnosis and store the results
44 | - aiops data: ```run demo/aiops_2020_data_test.py``` to run fault root-cause diagnosis and store the results
45 | 
46 | ### Starting the web API
47 | ```
48 | run app.py
49 | ```
50 | ### Generating repair solutions
51 | ```
52 | run time_generate_logs_solutions() in generate_solutions_service.py
53 | ```
54 | #### Note
55 | 
56 | - If the neo4j and MySQL databases are not configured, this feature is unavailable
57 | 
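A minimal way to trigger solution generation from a Python shell rather than an IDE run configuration — a sketch, assuming the MySQL and neo4j databases above are configured and reachable:

```python
from service.generate_solutions_service import time_generate_logs_solutions

# Generates repair solutions for every root-cause log that has none yet
# and persists them to MySQL.
time_generate_logs_solutions()
```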
58 | ### Directory structure
59 | ├── fault_diagnosis_and_repair
60 |     └── bean
61 |         ├── input_model.py              // classes for the input data records
62 |         ├── output_model.py             // output classes for the web API
63 |         └── save_model.py               // classes mapped to the MySQL tables
64 |     ├── config
65 |         ├── data_base_sql               // SQL for creating the MySQL database
66 |         └── stop.txt                    // stop words
67 |     ├── dao
68 |         ├── db_dao.py                   // DAO for MySQL
69 |         ├── es_dao.py                   // DAO for Elasticsearch
70 |         └── neo4j_dao.py                // DAO for neo4j
71 |     ├── data
72 |         ├── aiops_data_2020             // demo data
73 |         └── hadoop_data                 // demo data
74 |     ├── demo
75 |         ├── aiops_2020_data_test.py     // entry point: diagnose and store results for the aiops data
76 |         └── hadoop_data_test.py         // entry point: diagnose and store results for the hadoop data
77 |     ├── service                         // core project code
78 |         ├── module_tools                // tool classes wrapping each functional module
79 |             ├── diagnosis_faultservice.py // further diagnosis of fault services
80 |             ├── genarate_solutions.py     // repair-solution generation
81 |             ├── identify_faultservice.py  // fault-service identification
82 |             ├── input_data.py             // input-data processing
83 |             └── save_result.py            // result persistence
84 |         ├── fault_diagnosis_service       // entry point of the fault root-cause diagnosis service
85 |         ├── generate_solutions_service    // entry point of the repair-solution generation service
86 |         └── web_service                   // entry point of the interactive query service
87 |     ├── utils
88 |         ├── data_tools.py
89 |         ├── graph.py
90 |         ├── jaccard_api.py
91 |         ├── pageRank.py
92 |         ├── pcalg.py
93 |         └── process_aiops2020_data_to_original.py // converts AIOps 2020 data into the format this project needs; data source for aiops_2020_data_test.py
94 |     ├── app.py                          // Flask API startup
95 |     └── requirements.txt
96 | ### Input data formats
97 | The project consumes raw metric data, raw log data, system deployment data, trace data, and the exception metric data and exception log data produced by an anomaly-detection module. The required format of each is described below.
98 | 
99 | - Raw metric data
100 | 
101 | Field | Description | Type
102 | ---- | ----- | ------
103 | timestamp | collection time (s) | second-level timestamp, string
104 | metricId | unique metric identifier | string
105 | metricName | metric name | string
106 | value | collected metric value | float
107 | units | unit | string
108 | metricBelongTo | owner of the metric | string
109 | metricBelongLevel | level the metric belongs to | host/container/service
110 | 
111 | timestamp | metricId | metricName| value | units | metricBelongTo | metricBelongLevel
112 | ---- | ----- | ------ | ------ | ------ | ------ | ------
113 | '1614688922' | '29162' | 'CPU iowait time'| 0.717773 | '%'| 'fuguangbo-0002'| 'host'
114 | 
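In code, each raw-metric record is simply a flat dict mirroring the sample row above:

```python
raw_metric_record = {
    'timestamp': '1614688922',
    'metricId': '29162',
    'metricName': 'CPU iowait time',
    'value': 0.717773,
    'units': '%',
    'metricBelongTo': 'fuguangbo-0002',
    'metricBelongLevel': 'host',
}
```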
115 | - Raw log data
116 | 
117 | Field | Description | Type
118 | ---- | ----- | ------
119 | timestamp | collection time (s) | second-level timestamp, string
120 | logId | unique log identifier | string
121 | logMessage | log entry message | string
122 | logLevel | log level | string
123 | logBelongTo | owner of the log | string
124 | logBelongLevel | level the log belongs to | host/container/service
125 | 
126 | timestamp | logId | logMessage| logLevel | logBelongTo | logBelongLevel
127 | ---- | ----- | ------ | ------ | ------ | ------
128 | 1614688885 | 'hadoop-root-datanode-hadoop-slave2.log' | org.apache.hadoop.hdfs.server... | INFO | 'DataNode'| 'service'
129 | 
130 | - Deployment data
131 | 
132 | Field | Description | Type
133 | ---- | ----- | ------
134 | serviceInstanceId | unique service-instance identifier | string
135 | serviceName | service name | string
136 | hostId | unique host identifier | string
137 | hostName | host name | string
138 | containerId | unique container identifier | string
139 | containerName | container name | string
140 | 
141 | serviceInstanceId | serviceName | hostId| hostName | containerId | containerName
142 | ---- | ----- | ------ | ------ | ------ | ------
143 | 'NameNode' | 'NameNode' | 'hadoop-master'| 'hadoop-master' | 'fuguangbo-0002'| 'fuguangbo-0002'
144 | 
145 | - Trace data
146 | 
147 | Field | Description | Type
148 | ---- | ----- | ------
149 | id | span ID of the current call | string
150 | pid | span ID of the parent call | string
151 | serviceId | unique service-instance identifier | string
152 | serviceName | service name | string
153 | serviceType | service type | string
154 | startTime | call start time | string
155 | traceId | unique trace identifier | string
156 | 
157 | id | pid | serviceId| serviceName | serviceType | startTime| traceId
158 | ---- | ----- | ------ | ------ | ------ | ------ | ------
159 | '136.60.16146924705200712' | -1 | 'DataNode'| 'DataNode' | 'Local'| '1614692470520'|'136.60.16146924705200713'
160 | 
161 | - Exception metric data
162 | 
163 | Field | Description | Type
164 | ---- | ----- | ------
165 | startTime | exception start time | string
166 | endTime | exception end time | string
167 | metricId | unique metric identifier | string
168 | metricName | metric name | string
169 | value | metric value | float
170 | units | unit | string
171 | metricBelongTo | owner of the metric | string
172 | metricBelongLevel | level the metric belongs to | host/container/service
173 | 
174 | startTime | endTime | metricId| metricName | value | units| metricBelongTo|metricBelongLevel
175 | ---- | ----- | ------ | ------ | ------ | ------ | ------ | ------
176 | '2021-03-02 21:31:02'|'2021-03-02 21:31:02' | '29162'| 'CPU iowait time' | 0.912974 | '%'| 'fuguangbo-0002'|'host'
177 | 
178 | - Exception log data
179 | 
180 | Field | Description | Type
181 | ---- | ----- | ------
182 | startTime | exception start time | string
183 | endTime | exception end time | string
184 | logId | unique log identifier | string
185 | logExceptionSegment | exception segment of the log | string
186 | logBelongTo | owner of the log | string
187 | logBelongLevel | level the log belongs to | host/container/service
188 | 
189 | startTime | endTime | logId| logExceptionSegment | logBelongTo | logBelongLevel
190 | ---- | ----- | ------ | ------ | ------ | ------
191 | '2021-03-02T13:15:51.452Z'|'2021-03-02T13:15:51.452Z' |'hadoop-root-datanode-hadoop-slave2.log'| java.net.NoRouteToHostException...| 'DataNode'|'service'
192 | 
193 | ### Sample output
194 | - The fault root-cause diagnosis output has two parts: printed results and stored results.
195 | Printed diagnosis results:
196 | 
197 | ![screenshot](https://github.com/yymgithub/AIOsp-Fault-Diagnosis/blob/main/effect_images/1.png?raw=true)
198 | 
199 | >The fault service list contains the fault services identified among multiple abnormal services; its keys are service IDs
200 | >The root-cause list of a given fault service contains that service's root-cause metrics and logs; its keys are metric or log names
201 | The stored results can be inspected via the database SQL
202 | - The web API returns JSON
203 | - Solution generation writes the generated repair solutions directly to MySQL
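Putting the input formats together, a minimal end-to-end run mirrors the demo scripts. A sketch with single-record lists for brevity — note the entry point is spelled `fault_diagmosis` in this codebase, and the demos pass empty lists for missing data sources:

```python
from service.fault_diagnosis_service import fault_diagmosis

deploymentData = [{'serviceInstanceId': 'NameNode', 'serviceName': 'NameNode',
                   'hostId': 'hadoop-master', 'hostName': 'hadoop-master',
                   'containerId': 'fuguangbo-0002', 'containerName': 'fuguangbo-0002'}]
traceData = [{'id': '136.60.16146924705200712', 'pid': -1, 'serviceId': 'DataNode',
              'serviceName': 'DataNode', 'serviceType': 'Local',
              'startTime': '1614692470520', 'traceId': '136.60.16146924705200713'}]
# remaining inputs follow the tables above
fault_diagmosis(deploymentData, traceData, [], [], [], [])
```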
-------------------------------------------------------------------------------- /service/module_tools/genarate_solutions.py: --------------------------------------------------------------------------------
1 | import re
2 | 
3 | from dao.neo4j_dao import GraphDao
4 | from utils.jaccard_api import log_preprocess, generate_cidian_jaccard
5 | 
6 | paramregex = [r'blk_-?\d+', r'(\d+\.){3}\d+(:\d+)?', r'(\d+\.){3}\d+', r'(/|)([0-9]+\.){3}[0-9]+(:[0-9]+|)(:|)',
7 |               r'(?<=[^A-Za-z0-9])(\-?\+?\d+)(?=[^A-Za-z0-9])|[0-9]+$']
8 | stopkeyword = [line.strip() for line in open('config/stop.txt').readlines()]
9 | eraseRex = [r'(\d+\-){2}\d+\s\d+\:\d+\:\d+\,\d+', r'INFO', r'ERROR', r'DEBUG', r'WARN', r'FATAL']
10 | 
11 | class GenetateSolutuons:
12 |     @staticmethod
13 |     def get_solutions_by_logDetail(log_detail):
14 |         """
15 |         Generate the list of solutions for an exception log segment.
16 |         :param log_detail:
17 |         :return: the sorted list of solutions
18 |         """
19 |         logs_dictionaries = GenetateSolutuons.get_fault_repair_graph_dictionary_log()
20 |         logs_with_jaccard = GenetateSolutuons.logDetail_graph_cache_jaccard(log_detail, logs_dictionaries)
21 | 
22 |         # sort the graph logs by Jaccard similarity, descending
23 |         logs_with_jaccard_sort = sorted(logs_with_jaccard.items(), key=lambda x: x[1]['jaccard'], reverse=True)
24 |         # take the top-5 names
25 |         top_name = [item[1]['name'] for item in logs_with_jaccard_sort[:5]]
26 |         solutions = {}
27 | 
28 |         graph_dao = GraphDao()
29 |         for name in top_name:
30 |             if len(solutions) < 5:
31 |                 solutions_tmp = {}
32 |                 faults = graph_dao.get_fault_entity_by_log(name)
33 |                 for fault in faults:
34 |                     solutions_zhijie = graph_dao.get_solutions_by_fault(fault['name'])
35 |                     reasons = graph_dao.get_reasons_by_fault(fault['name'])
36 |                     for reason in reasons:
37 |                         solutions_jianjie = graph_dao.get_solutions_by_reason(reason['name'])
38 |                         for solution_jianjie in solutions_jianjie:
39 |                             solutions_tmp[solution_jianjie['name']] = {}
40 |                             solutions_tmp[solution_jianjie['name']]['reason'] = reason['content']
41 |                             solutions_tmp[solution_jianjie['name']]['html_content'] = solution_jianjie['html_content']
42 |                             solutions_tmp[solution_jianjie['name']]['json_content'] = solution_jianjie['json_content']
43 |                             solutions_tmp[solution_jianjie['name']]['vote'] = solution_jianjie['vote']
44 |                         pass
45 |                     for solution_zhijie in solutions_zhijie:
46 |                         solutions_tmp[solution_zhijie['name']] = {}
47 |                         solutions_tmp[solution_zhijie['name']]['reason'] = "not available"
48 |                         solutions_tmp[solution_zhijie['name']]['html_content'] = solution_zhijie['html_content']
49 |                         solutions_tmp[solution_zhijie['name']]['json_content'] = solution_zhijie['json_content']
50 |                         solutions_tmp[solution_zhijie['name']]['vote'] = solution_zhijie['vote']
51 |                 # sort by vote count
52 |                 solutions_tmp_sort = sorted(solutions_tmp.items(), key=lambda x: x[1]['vote'], reverse=True)
53 |                 for solution_tmp in solutions_tmp_sort:
54 |                     if len(solutions) < 5:
55 |                         solutions[solution_tmp[0]] = {}
56 |                         solutions[solution_tmp[0]]['reason'] = solution_tmp[1]['reason']
57 |                         solutions[solution_tmp[0]]['html_content'] = solution_tmp[1]['html_content']
58 |                         solutions[solution_tmp[0]]['json_content'] = solution_tmp[1]['json_content']
59 |                         solutions[solution_tmp[0]]['vote'] = solution_tmp[1]['vote']
60 |                         solutions[solution_tmp[0]]['serial_number'] = len(solutions)
61 |                     else:
62 |                         break
63 |         solutions_sort = sorted(solutions.items(), key=lambda x: x[1]['serial_number'])
64 |         result_solutions = []
65 |         for solution_sort in solutions_sort:
66 |             result_solution = {}
67 |             result_solution['reason'] = solution_sort[1]['reason']
68 |             result_solution['html_content'] = solution_sort[1]['html_content']
69 |             result_solution['json_content'] = solution_sort[1]['json_content']
70 |             result_solution['vote'] = solution_sort[1]['vote']
71 |             result_solution['serial_number'] = solution_sort[1]['serial_number']
72 |             result_solutions.append(result_solution)
73 |         return result_solutions
74 | 
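A direct way to exercise this method — a sketch that requires the neo4j fault-repair graph to be populated; the log snippet is just an example taken from the exception-log table in the README:

```python
from service.module_tools.genarate_solutions import GenetateSolutuons

log_detail = "java.net.NoRouteToHostException: No route to host"  # illustrative snippet
solutions = GenetateSolutuons.get_solutions_by_logDetail(log_detail)
for s in solutions:
    print(s['serial_number'], s['vote'], s['reason'])
```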
75 |     @staticmethod
76 |     def logDetail_graph_cache_jaccard(yichanglog, graph_cache):
77 |         """
78 |         Compute the similarity between a root-cause exception log segment and every log in the knowledge graph.
79 |         :param yichanglog: the exception log segment
80 |         :param graph_cache: tokenized logs from the graph
81 |         :return: dict {log name in the graph: similarity info}
82 |         """
83 |         result_dict = {}
84 |         log_ = log_preprocess(yichanglog, paramregex, eraseRex)
85 |         log1_dic = generate_cidian_jaccard(log_, stopkeyword)
86 | 
87 |         for name, log2_dic in graph_cache.items():
88 |             entry = {}
89 |             bingji = list(set(log1_dic).union(set(log2_dic)))
90 |             jiaoji = list(set(log1_dic).intersection(set(log2_dic)))
91 |             jiaquan = 0
92 |             for word in jiaoji:
93 |                 if re.search(r'[a-zA-Z0-9]+\.[a-zA-Z0-9]+\.[a-zA-Z0-9]+Exception', word):
94 |                     jiaquan += 5
95 |             jaccard = (len(jiaoji) + jiaquan) / len(bingji)
96 |             entry['name'] = name
97 |             entry['dict'] = log2_dic
98 |             entry['jaccard'] = jaccard
99 |             result_dict[name] = entry
100 |         return result_dict
101 | 
102 |     @staticmethod
103 |     def get_fault_repair_graph_dictionary_log():
104 |         """
105 |         Fetch the tokenization of every log in the knowledge graph.
106 |         :return:
107 |         """
108 |         graph_dao = GraphDao()
109 |         logs = graph_dao.get_all_log_entities()
110 |         graph_cache_jaccard = GenetateSolutuons.generate_graph_cache_jaccard(logs, paramregex, eraseRex, stopkeyword)
111 |         return graph_cache_jaccard
112 | 
113 |     @staticmethod
114 |     def generate_graph_cache_jaccard(graph_logs, paramregex, eraseRex, stopkeyword):
115 |         """
116 |         Tokenize every log in the knowledge graph.
117 |         :param graph_logs:
118 |         :param paramregex:
119 |         :param eraseRex:
120 |         :param stopkeyword:
121 |         :return:
122 |         """
123 |         graph_cache_jaccard = {}
124 |         for log in graph_logs:
125 |             log_ = log_preprocess(log['n']['content'], paramregex, eraseRex)
126 |             log_dict = generate_cidian_jaccard(log_, stopkeyword)
127 |             graph_cache_jaccard[log['n']['name']] = log_dict
128 |         return graph_cache_jaccard
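The score above is an exception-weighted Jaccard: the intersection size is inflated by 5 for every token that looks like a package-qualified exception class. A worked sketch with invented token sets:

```python
log1 = {'connection', 'refused', 'java.net.ConnectException'}
log2 = {'connection', 'timeout', 'java.net.ConnectException'}

union = log1 | log2            # 4 distinct tokens
intersection = log1 & log2     # {'connection', 'java.net.ConnectException'}
weight = 5                     # one exception-class token in the intersection
score = (len(intersection) + weight) / len(union)
print(score)                   # (2 + 5) / 4 = 1.75
```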
-------------------------------------------------------------------------------- /service/module_tools/identify_faultservice.py: --------------------------------------------------------------------------------
1 | import networkx as nx
2 | 
3 | from utils.graph import ServiceNode
4 | from utils.pageRank import PRIterator
5 | 
6 | 
7 | class IdentifyFaultService:
8 |     @staticmethod
9 |     def generate_service_invoke_graph(traceObjData_by_traceId):
10 |         """
11 |         Build the service invocation graph from the trace data.
12 |         :param traceObjData_by_traceId:
13 |         :return: nodes and edges, one representation of the service invocation graph; traverse_initial_list, all call-initiating nodes
14 |         """
15 |         num_nodes = {}  # {spanId: ServiceNode, spanId: ServiceNode}
16 |         num_edges = []  # [[spanId, spanId], [spanId, spanId]]
17 |         traverse_initial_list = []  # [serviceId, serviceId]
18 |         for key, value in traceObjData_by_traceId.items():
19 |             for i in value:
20 |                 id = i.id
21 |                 pid = i.pid
22 |                 serviceId = i.serviceId
23 |                 serviceName = i.serviceName
24 |                 serviceType = i.serviceType
25 |                 if id not in num_nodes:
26 |                     num_nodes[id] = ServiceNode(serviceId, serviceName, serviceType)
27 |                 if pid and id and pid != -1:
28 |                     if [pid, id] not in num_edges:
29 |                         num_edges.append([pid, id])
30 |                 elif pid == -1:
31 |                     if serviceId not in traverse_initial_list:
32 |                         traverse_initial_list.append(serviceId)
33 | 
34 |         # convert to the real invocation-graph data
35 |         nodes = {}  # {serviceId: ServiceNode, serviceId: ServiceNode}
36 |         edges = []  # [[serviceId, serviceId], [serviceId, serviceId]]
37 |         # replace span-id edges with concrete service IDs
38 |         for num_edge in num_edges:
39 |             if num_edge[0] in num_nodes and num_edge[1] in num_nodes:
40 |                 p_serviceId = num_nodes[num_edge[0]].serviceId
41 |                 c_serviceId = num_nodes[num_edge[1]].serviceId
42 |                 if p_serviceId != c_serviceId and [p_serviceId, c_serviceId] not in edges:
43 |                     edges.append([p_serviceId, c_serviceId])
44 |         # replace span-id nodes with concrete service IDs
45 |         for key, value in num_nodes.items():
46 |             if value.serviceId not in nodes:
47 |                 nodes[value.serviceId] = value
48 |         return nodes, edges, traverse_initial_list
49 | 
50 |     @staticmethod
51 |     def completion_serviceNode_deploymentData(deploymentObjData_by_sviid, nodes):
52 |         """
53 |         Enrich the nodes of the service dependency graph with deployment information.
54 |         :param deploymentObjData_by_sviid: the ingested deployment data, keyed by serviceInstanceId
55 |         :param nodes: dict of graph nodes
56 |         :return: nodes
57 |         """
58 |         for key, value in nodes.items():
59 |             if key in deploymentObjData_by_sviid.keys():
60 |                 value.hostId = deploymentObjData_by_sviid[key].hostId
61 |                 value.hostName = deploymentObjData_by_sviid[key].hostName
62 |                 value.dockerName = deploymentObjData_by_sviid[key].containerName
63 |                 value.dockerId = deploymentObjData_by_sviid[key].containerId
64 |         return nodes
65 | 
66 |     @staticmethod
67 |     def set_service_exception_info(nodes, data):
68 |         """
69 |         Identify the abnormal services and record the exception flag on the nodes.
70 |         :param nodes: service dependency graph node dict of ServiceNode
71 |         :param data: an input_data instance
72 |         :return: nodes, with the exception flag filled in
73 |         """
74 |         if nodes is None:
75 |             return None
76 |         exception_list_metric_belongTo = data.organization_exception_metricObjData_by_metricBelongTo()
77 |         exception_list_log_belongTo = data.organization_exception_logObjData_by_logBelongTo()
78 |         for key, serviceNode in nodes.items():
79 |             if (serviceNode.serviceId in exception_list_metric_belongTo or serviceNode.serviceId in exception_list_log_belongTo
80 |                     or serviceNode.hostId in exception_list_metric_belongTo or serviceNode.hostId in exception_list_log_belongTo
81 |                     or serviceNode.containerId in exception_list_metric_belongTo
82 |                     or serviceNode.containerId in exception_list_log_belongTo):
83 |                 serviceNode.isException = 1
84 |         return nodes
85 | 
86 |     @staticmethod
87 |     def location_root_service(graph, start_service_id, root_services):
88 |         """
89 |         Locate root-cause services, using one node as the initial traversal node.
90 |         :param graph:
91 |         :param start_service_id:
92 |         :param root_services:
93 |         :return: root_services, the root-cause tally after this traversal
94 |         """
95 |         queue = []
96 |         queue.append(start_service_id)
97 |         while len(queue) > 0:
98 |             cur_node_id = queue.pop(0)
99 |             if IdentifyFaultService.is_root_service(graph, cur_node_id):
100 |                 if cur_node_id not in root_services:
101 |                     root_services[cur_node_id] = 1
102 |                 else:
103 |                     root_services[cur_node_id] = root_services[cur_node_id] + 1
104 |             else:
105 |                 for child_id in graph[cur_node_id].childs:
106 |                     if graph[child_id].isException == 1:
107 |                         queue.append(child_id)
108 |         return root_services
109 | 
110 |     @staticmethod
111 |     def is_root_service(graph, service_id):
112 |         is_root = True
113 |         childs = graph[service_id].childs
114 |         for child_id in childs:
115 |             if graph[child_id].isException == 1:
116 |                 is_root = False
117 |         if graph[service_id].isException == 0:
118 |             is_root = False
119 |         return is_root
120 | 
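A toy run of the BFS localization above, using a hypothetical 3-node construct in the `{id: node-with-.childs/.isException}` shape the method expects:

```python
class _Node:
    def __init__(self, childs, isException):
        self.childs = childs
        self.isException = isException

# A -> B -> C; A and B are abnormal, C is healthy.
graph = {'A': _Node(['B'], 1), 'B': _Node(['C'], 1), 'C': _Node([], 0)}
roots = IdentifyFaultService.location_root_service(graph, 'A', {})
print(roots)  # {'B': 1}: B is abnormal and has no abnormal children
```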
121 |     @staticmethod
122 |     def get_fault_services_list_PR(graph, traverse_initial_list):
123 |         """
124 |         Identify the fault service list with the PageRank method.
125 |         :param graph:
126 |         :param traverse_initial_list:
127 |         :return: root_services, the fault service list {serviceId: score, serviceId: score}
128 |         """
129 |         if len(graph.nodes) == 0:
130 |             return None
131 |         dg = nx.DiGraph()
132 |         for key, node in graph.nodes.items():
133 |             dg.add_node(key)
134 |         for edge in graph.edges:
135 |             dg.add_edge(edge[0], edge[1])
136 |         pr = PRIterator(dg)
137 |         page_ranks = pr.page_rank()
138 |         node_pr_sorted = sorted(page_ranks.items(), key=lambda x: x[1], reverse=True)
139 |         root_services = {}
140 |         for index, serviceId in enumerate(node_pr_sorted):
141 |             if index < 3:
142 |                 root_services[serviceId[0]] = serviceId[1]
143 |         return root_services
144 | 
145 |     @staticmethod
146 |     def get_fault_services_list(graph, traverse_initial_list):
147 |         """
148 |         Identify fault services with a breadth-first graph search.
149 |         :param graph:
150 |         :param traverse_initial_list:
151 |         :return: root_services, the fault service list {serviceId: count, serviceId: count}
152 |         """
153 |         construct_graph = graph.generate_invoke_graph_consturct()
154 |         if len(construct_graph) == 0 or len(traverse_initial_list) == 0:
155 |             return None
156 |         root_services = {}  # {serviceId: count, serviceId: count}
157 |         # traverse from every initial entry node of the graph
158 |         for i in traverse_initial_list:
159 |             root_services = IdentifyFaultService.location_root_service(construct_graph, i, root_services)
160 |         return root_services
-------------------------------------------------------------------------------- /LICENSE: --------------------------------------------------------------------------------
1 |                                  Apache License
2 |                            Version 2.0, January 2004
3 |                         http://www.apache.org/licenses/
4 | 
5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 |    1. Definitions.
8 | 
9 |       "License" shall mean the terms and conditions for use, reproduction,
10 |       and distribution as defined by Sections 1 through 9 of this document.
11 | 
12 |       "Licensor" shall mean the copyright owner or entity authorized by
13 |       the copyright owner that is granting the License.
14 | 
15 |       "Legal Entity" shall mean the union of the acting entity and all
16 |       other entities that control, are controlled by, or are under common
17 |       control with that entity. For the purposes of this definition,
18 |       "control" means (i) the power, direct or indirect, to cause the
19 |       direction or management of such entity, whether by contract or
20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 |       outstanding shares, or (iii) beneficial ownership of such entity.
22 | 
23 |       "You" (or "Your") shall mean an individual or Legal Entity
24 |       exercising permissions granted by this License.
25 | 
26 |       "Source" form shall mean the preferred form for making modifications,
27 |       including but not limited to software source code, documentation
28 |       source, and configuration files.
29 | 
30 |       "Object" form shall mean any form resulting from mechanical
31 |       transformation or translation of a Source form, including but
32 |       not limited to compiled object code, generated documentation,
33 |       and conversions to other media types.
34 | 
35 |       "Work" shall mean the work of authorship, whether in Source or
36 |       Object form, made available under the License, as indicated by a
37 |       copyright notice that is included in or attached to the work
38 |       (an example is provided in the Appendix below).
39 | 
40 |       "Derivative Works" shall mean any work, whether in Source or Object
41 |       form, that is based on (or derived from) the Work and for which the
42 |       editorial revisions, annotations, elaborations, or other modifications
43 |       represent, as a whole, an original work of authorship. For the purposes
44 |       of this License, Derivative Works shall not include works that remain
45 |       separable from, or merely link (or bind by name) to the interfaces of,
46 |       the Work and Derivative Works thereof.
47 | 
48 |       "Contribution" shall mean any work of authorship, including
49 |       the original version of the Work and any modifications or additions
50 |       to that Work or Derivative Works thereof, that is intentionally
51 |       submitted to Licensor for inclusion in the Work by the copyright owner
52 |       or by an individual or Legal Entity authorized to submit on behalf of
53 |       the copyright owner. For the purposes of this definition, "submitted"
54 |       means any form of electronic, verbal, or written communication sent
55 |       to the Licensor or its representatives, including but not limited to
56 |       communication on electronic mailing lists, source code control systems,
57 |       and issue tracking systems that are managed by, or on behalf of, the
58 |       Licensor for the purpose of discussing and improving the Work, but
59 |       excluding communication that is conspicuously marked or otherwise
60 |       designated in writing by the copyright owner as "Not a Contribution."
61 | 
62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
63 |       on behalf of whom a Contribution has been received by Licensor and
64 |       subsequently incorporated within the Work.
65 | 
66 |    2. Grant of Copyright License. Subject to the terms and conditions of
67 |       this License, each Contributor hereby grants to You a perpetual,
68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 |       copyright license to reproduce, prepare Derivative Works of,
70 |       publicly display, publicly perform, sublicense, and distribute the
71 |       Work and such Derivative Works in Source or Object form.
72 | 
73 |    3. Grant of Patent License. Subject to the terms and conditions of
74 |       this License, each Contributor hereby grants to You a perpetual,
75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 |       (except as stated in this section) patent license to make, have made,
77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
78 |       where such license applies only to those patent claims licensable
79 |       by such Contributor that are necessarily infringed by their
80 |       Contribution(s) alone or by combination of their Contribution(s)
81 |       with the Work to which such Contribution(s) was submitted. If You
82 |       institute patent litigation against any entity (including a
83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
84 |       or a Contribution incorporated within the Work constitutes direct
85 |       or contributory patent infringement, then any patent licenses
86 |       granted to You under this License for that Work shall terminate
87 |       as of the date such litigation is filed.
88 | 
89 |    4. Redistribution. You may reproduce and distribute copies of the
90 |       Work or Derivative Works thereof in any medium, with or without
91 |       modifications, and in Source or Object form, provided that You
92 |       meet the following conditions:
93 | 
94 |       (a) You must give any other recipients of the Work or
95 |           Derivative Works a copy of this License; and
96 | 
97 |       (b) You must cause any modified files to carry prominent notices
98 |           stating that You changed the files; and
99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 
-------------------------------------------------------------------------------- /demo/hadoop_data_test.py: --------------------------------------------------------------------------------
1 | import json
2 | 
3 | from service.fault_diagnosis_service import fault_diagmosis
4 | from utils.data_tools import is_number, utc_to_local
5 | 
6 | 
7 | def data_collection_process_json():
8 |     """
9 |     Load the raw trace data, deployment data, raw metric data, raw log data, exception metric data and exception log data from JSON files.
10 |     Args:
11 | 
12 |     Returns: the corresponding data loaded from the JSON files
13 |     """
14 | 
15 |     # load the exception metric data
16 |     f = open('../data/hadoop_data/exception_data/result-infi.json', 'r')
17 |     original_exception_metric_data = json.load(f)
18 |     exception_metric_data = original_exception_metric_data['3EUz83cBglWMAhILSC5I']
19 |     # load the exception log data
20 |     f = open('../data/hadoop_data/exception_data/result_log-infi.json', 'r')
21 |     original_exception_log_data = json.load(f)
22 |     exception_log_data = original_exception_log_data['3EUz83cBglWMAhILSC5I']
23 |     # load the raw metric, trace and log data
24 |     f = open('../data/hadoop_data/3EUz83cBglWMAhILSC5I.json', 'r')
25 |     original_data = json.load(f)
26 |     items = original_data[0]['_source']['items']
27 |     traces = original_data[0]['_source']['traces']
28 |     logs = original_data[0]['_source']['logs']
29 |     # load the raw deployment data
30 |     f = open('../data/hadoop_data/deployment_info.json', 'r')
31 |     deployment_data = json.load(f)
32 |     return exception_metric_data, exception_log_data, original_data, items, traces, logs, deployment_data
33 | 
34 | def get_original_trace_data(traces):
35 |     """
36 |     Convert traces into the target input format: each record contains the id, pid, serviceId, serviceName, serviceType, startTime and traceId fields.
37 |     The hadoop traces arrive grouped together.
38 |     :param traces:
39 |     :return: a List in the target format
40 |     """
41 |     traceData = list()
42 |     for trace in traces:
43 |         i_dict = json.loads(trace)
44 |         for index, span in enumerate(i_dict['data']['trace']['spans']):
45 |             if span['parentSpanId'] != -1: continue
46 |             else:
47 |                 if len(span['refs']) == 0:
48 |                     tmp_dict = {}
49 |                     tmp_dict['id'] = span['segmentId']
50 |                     tmp_dict['pid'] = -1
51 |                     tmp_dict['serviceId'] = span['serviceCode']  # serviceCode does not uniquely identify a service instance; needs further discussion
52 |                     tmp_dict['serviceName'] = span['serviceCode']
53 |                     tmp_dict['serviceType'] = span['type']
54 |                     tmp_dict['startTime'] = span['startTime']
55 |                     tmp_dict['traceId'] = span['traceId']
56 |                     traceData.append(tmp_dict)
57 |                 else:
58 |                     for k in span['refs']:
59 |                         tmp_dict = {}
60 |                         tmp_dict['id'] = span['segmentId']
61 |                         tmp_dict['pid'] = k['parentSegmentId']
62 |                         tmp_dict['serviceId'] = span['serviceCode']  # serviceCode does not uniquely identify a service instance; needs further discussion
63 |                         tmp_dict['serviceName'] = span['serviceCode']
64 |                         tmp_dict['serviceType'] = span['type']
65 |                         tmp_dict['startTime'] = span['startTime']
66 |                         tmp_dict['traceId'] = span['traceId']
67 |                         traceData.append(tmp_dict)
68 |     return traceData
69 | 
70 | def get_deployment_data(deployment_data):
71 |     """
72 |     Convert the deployment data into the target input format: each record contains the serviceInstanceId, serviceName, hostId, hostName, containerId and containerName fields.
73 |     :param deployment_data:
74 |     :return: a List in the target format
75 |     """
76 |     return deployment_data
77 | 
78 | def get_original_metric_data(items):
79 |     """
80 |     Convert the raw metric data into the target input format: each record contains timestamp, metricId, metricName, value, units, metricBelongTo and metricBelongLevel.
81 |     :param items:
82 |     :return: a List in the target format
83 |     """
84 |     originalMetricData = list()
85 |     for item_str in items:
86 |         item = json.loads(item_str)
87 |         metricId = item['id']
88 |         metricName = item['name']
89 |         units = item['units']
90 |         applicationName = item['applicationName']
91 |         metricBelongTo = None
92 |         metricBelongLevel = None
93 |         if applicationName == "Zabbix server":
94 |             continue
95 |         else:
96 |             if applicationName is None:
97 |                 continue
98 |             elif applicationName.startswith('Hadoop'):
99 |                 if applicationName == "Hadoop":
100 |                     name_split = metricName.split(':')
101 |                     metricBelongTo = name_split[0]
102 |                     metricBelongLevel = "service"
103 |                 else:
104 |                     applicationName_split = applicationName.split()
105 |                     metricBelongTo = applicationName_split[1]
106 |                     metricBelongLevel = "service"
107 |             elif applicationName.startswith('Docker'):
108 |                 if applicationName == "Docker":
109 |                     continue
110 |                 else:
111 |                     applicationName_split = applicationName.split()
112 |                     metricBelongTo = applicationName_split[2][1:]
113 |                     metricBelongLevel = "docker"
114 |             else:
115 |                 metricBelongTo = item['hostName']
116 |                 metricBelongLevel = "host"
117 |         stampTimes = item['allClock']
118 |         values = item['allValue']
119 |         stampTimes_splits = stampTimes.strip().split(',')
120 |         values_splits = values.strip().split(',')
121 |         if len(stampTimes_splits) > 0 and len(stampTimes_splits) == len(values_splits):
122 |             for index, value in enumerate(stampTimes_splits):
123 |                 tmp_metric = dict()
124 |                 tmp_metric['metricId'] = str(metricId)
125 |                 tmp_metric['metricName'] = metricName
126 |                 tmp_metric['metricBelongTo'] = metricBelongTo
127 |                 tmp_metric['metricBelongLevel'] = metricBelongLevel
128 |                 tmp_metric['units'] = units
129 |                 tmp_metric['timestamp'] = value
130 |                 if is_number(values_splits[index]):
131 |                     tmp_metric['value'] = float(values_splits[index])
132 |                     originalMetricData.append(tmp_metric)
133 |         else:
134 |             continue
135 |     return originalMetricData
136 | 
137 | def get_original_log_data(logs):
138 |     """
139 |     Convert the raw log data into the target input format: each record contains the timestamp, logId, logMessage, logLevel, logBelongTo and logBelongLevel fields.
140 |     :param logs:
141 |     :return: a List in the target format
142 |     """
143 |     originalLogData = list()
144 |     for key, logList in logs.items():
145 |         if len(logList) == 0:
146 |             continue
147 |         logId = key
148 |         for log_str in logList:
149 |             log = json.loads(log_str)
150 |             tmp_log = dict()
151 |             tmp_log['logId'] = logId
152 |             if logId == "stderr":
153 |                 tmp_log['logLevel'] = None
154 |                 tmp_log['logMessage'] = log['message']
155 |                 log_time = log['@timestamp']
156 |                 log_time = utc_to_local(log_time)
157 |                 tmp_log['timestamp'] = log_time
158 |             else:
159 |                 if "level" in log:
160 |                     tmp_log['logLevel'] = log['level']
161 |                 else:
162 |                     tmp_log['logLevel'] = None
163 |                 tmp_log['logMessage'] = log['log_message']
164 |                 log_time = log['log_time']
165 |                 log_time = utc_to_local(log_time)
166 |                 tmp_log['timestamp'] = log_time
167 |             if logId.startswith('hadoop'):
168 |                 logId_splits = logId.strip().split('-')
169 |                 logBelongTo = logId_splits[2]
170 |                 tmp_log['logBelongLevel'] = "service"
171 |                 if logBelongTo == "datanode":
172 |                     tmp_log['logBelongTo'] = "DataNode"
173 |                 elif logBelongTo == "namenode":
174 |                     tmp_log['logBelongTo'] = "NameNode"
175 |                 elif logBelongTo == "nodemanager":
176 |                     tmp_log['logBelongTo'] = "NodeManager"
177 |                 elif logBelongTo == "resourcemanager":
178 |                     tmp_log['logBelongTo'] = "ResourceManager"
179 |                 elif logBelongTo == "secondarynamenode":
180 |                     tmp_log['logBelongTo'] = "SecondaryNameNode"
181 |             else:
182 |                 tmp_log['logBelongTo'] = log['host']['name']
183 |                 tmp_log['logBelongLevel'] = "host"
184 |             originalLogData.append(tmp_log)
185 |     return originalLogData
186 | 
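The logId-to-owner mapping above is purely name-based; for example:

```python
log_id = 'hadoop-root-datanode-hadoop-slave2.log'
component = log_id.strip().split('-')[2]   # 'datanode'
# -> logBelongTo = 'DataNode', logBelongLevel = 'service'
```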
187 | def get_exception_metric_data(exception_metric_data):
188 |     """
189 |     Convert the exception metric data into the input format: each record contains the startTime, endTime, metricId, metricName, value, units, metricBelongTo and metricBelongLevel fields.
190 |     :param exception_metric_data:
191 |     :return: a List in the target format
192 |     """
193 |     exceptionMetricData = list()
194 |     for ex_metric in exception_metric_data:
195 |         tmp_dict = dict()
196 |         metricId = ex_metric['metricId']
197 |         metricName = ex_metric['metricName']
198 |         units = None
199 |         metricBelongTo = None
200 |         metricBelongLevel = None
201 |         applicationName = ex_metric['belongTo']
202 |         if applicationName == "Zabbix server":
203 |             continue
204 |         else:
205 |             if applicationName is None:
206 |                 continue
207 |             elif applicationName.startswith('Hadoop'):
208 |                 if applicationName == "Hadoop":
209 |                     name_split = metricName.split(':')
210 |                     metricBelongTo = name_split[0]
211 |                     metricBelongLevel = "service"
212 |                 else:
213 |                     applicationName_split = applicationName.split()
214 |                     metricBelongTo = applicationName_split[1]
215 |                     metricBelongLevel = "service"
216 |             elif applicationName.startswith('Docker'):
217 |                 if applicationName == "Docker":
218 |                     continue
219 |                 else:
220 |                     applicationName_split = applicationName.split()
221 |                     metricBelongTo = applicationName_split[2][1:]
222 |                     metricBelongLevel = "docker"
223 |             else:
224 |                 # metricBelongTo = ex_metric['hostName']
225 |                 metricBelongTo = "fuguangbo-0002"
226 |                 metricBelongLevel = "host"
227 |         stampTimes = ex_metric['testTime']
228 |         values = ex_metric['value']
229 |         stampTimes_splits = stampTimes.strip().split(',')
230 |         values_splits = values.strip().split(',')
231 |         if len(stampTimes_splits) > 0 and len(stampTimes_splits) == len(values_splits):
232 |             for index, value in enumerate(stampTimes_splits):
233 |                 tmp_metric = dict()
234 |                 tmp_metric['metricId'] = str(metricId)
235 |                 tmp_metric['metricName'] = metricName
236 |                 tmp_metric['metricBelongTo'] = metricBelongTo
237 |                 tmp_metric['metricBelongLevel'] = metricBelongLevel
238 |                 tmp_metric['units'] = units
239 |                 tmp_metric['endTime'] = value
240 |                 tmp_metric['startTime'] = value
241 |                 if is_number(values_splits[index]):
242 |                     tmp_metric['value'] = float(values_splits[index])
243 |                     exceptionMetricData.append(tmp_metric)
244 |         else:
245 |             continue
246 |     return exceptionMetricData
247 | 
248 | def get_exception_log_data(exception_log_data):
249 |     """
250 |     Convert the exception log data into the target input format: each record contains the startTime, endTime, logId, logExceptionSegment, logBelongLevel and logBelongTo fields.
251 |     :param exception_log_data:
252 |     :return: a List in the target format
253 |     """
254 |     exceptionLogData = list()
255 |     for logSegment in exception_log_data:
256 |         tmp_log = dict()
257 |         startTime = logSegment['testTime']
258 |         endTime = logSegment['testTime']
259 |         logId = logSegment['logId'].strip().split(':')[0]
260 |         logExceptionSegment = logSegment['logDetail']
261 |         if logId.startswith('hadoop'):
262 |             logId_splits = logId.strip().split('-')
263 |             logBelongTo = logId_splits[2]
264 |             tmp_log['logBelongLevel'] = "service"
265 |             if logBelongTo == "datanode":
266 |                 tmp_log['logBelongTo'] = "DataNode"
267 |             elif logBelongTo == "namenode":
268 |                 tmp_log['logBelongTo'] = "NameNode"
269 |             elif logBelongTo == "nodemanager":
270 |                 tmp_log['logBelongTo'] = "NodeManager"
271 |             elif logBelongTo == "resourcemanager":
272 |                 tmp_log['logBelongTo'] = "ResourceManager"
273 |             elif logBelongTo == "secondarynamenode":
274 |                 tmp_log['logBelongTo'] = "SecondaryNameNode"
275 |         else:
276 |             tmp_log['logBelongTo'] = logSegment["belongTo"].strip().split(':')[0]
277 |             tmp_log['logBelongLevel'] = "host"
278 |         tmp_log['startTime'] = startTime
279 |         tmp_log['endTime'] = endTime
280 |         tmp_log['logId'] = logId
281 |         tmp_log['logExceptionSegment'] = logExceptionSegment
282 |         exceptionLogData.append(tmp_log)
283 |     return exceptionLogData
284 | 
285 | if __name__ == '__main__':
286 |     exception_metric_data, exception_log_data, original_data, items, traces, logs, deployment_data = data_collection_process_json()
287 |     deploymentData = get_deployment_data(deployment_data)
288 |     traceData = get_original_trace_data(traces)
289 |     original_metricData = get_original_metric_data(items)
290 |     original_logData = get_original_log_data(logs)
291 |     exception_metricData = get_exception_metric_data(exception_metric_data)
292 |     exception_logData = get_exception_log_data(exception_log_data)
293 |     fault_diagmosis(deploymentData, traceData, original_metricData, original_logData, exception_metricData,
294 |                     exception_logData)
295 |     pass
-------------------------------------------------------------------------------- /service/module_tools/input_data.py: --------------------------------------------------------------------------------
1 | import json
2 | from typing import List
3 | 
4 | from bean.input_model import DeploymentDataEntry, TraceDataEntry, OriginalMetricEntry, OriginalLogEntry, \
5 |     ExceptionMetricEntry, ExceptionLogEntry
6 | import pandas as pd
7 | 
8 | 
9 | class InputData:
10 |     def __init__(self, deploymentData: List, traceData: List, original_metricData: List, original_logData: List,
11 |                  exception_metricData: List, exception_logData: List):
12 |         self.deploymentObjData = deploymentData_to_obj(deploymentData)
13 |         self.traceObjData = traceData_to_obj(traceData)
14 |         self.original_metricObjData = originalMetricData_to_obj(original_metricData)
15 |         self.original_logObjData = originalLogData_to_obj(original_logData)
16 |         self.exception_metricObjData = exceptionMetricData_to_obj(exception_metricData)
17 |         self.exception_logObjData = exceptionLogData_to_obj(exception_logData)
18 | 
19 |         # self.traceObjData_by_traceId = None
20 |         # self.deploymentObjData_by_sviid = None
21 |         # self.original_metricObjData_by_metricId = None
22 |         # self.original_logObjData_by_logId = None
23 |         #
24 |         # self.exception_metricObjData_by_metricBelongTo = None
25 |         # self.exception_logObjData_by_logBelongTo = None
26 | 
27 |     def organization_deploymentObjData_by_sviid(self):
28 |         """
29 |         Organize the deployment data into a dict keyed by serviceInstanceId.
30 |         :param deploymentObjData:
31 |         :return: a dict keyed by serviceInstanceId
32 |         """
33 |         # if deploymentObjData_by_sviid is not None: return self.deploymentObjData_by_sviid
34 |         deploymentObjData_by_sviid = {}
35 |         for i in self.deploymentObjData:
36 |             deploymentObjData_by_sviid[i.serviceInstanceId] = i
37 |         return deploymentObjData_by_sviid
38 | 
39 |     def organization_traceObjData_by_traceId(self):
40 |         """
41 |         Organize the trace data into a dict keyed by traceId.
42 |         :param traceObjData:
43 |         :return: a dict keyed by traceId
44 |         """
45 |         # if self.traceObjData_by_traceId is not None: return self.traceObjData_by_traceId
46 |         traceObjData_by_traceId = {}
47 |         for i in self.traceObjData:
48 |             if i.traceId not in traceObjData_by_traceId:
49 |                 traceObjData_by_traceId[i.traceId] = []
50 |                 traceObjData_by_traceId[i.traceId].append(i)
51 |             else:
52 |                 traceObjData_by_traceId[i.traceId].append(i)
53 |         return traceObjData_by_traceId
54 | 
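The organization_* methods below all repeat the same group-by-key pattern; an equivalent formulation with collections.defaultdict — a sketch, not how the class is actually written:

```python
from collections import defaultdict

def group_by(objs, key):
    grouped = defaultdict(list)  # key value -> list of records
    for o in objs:
        grouped[getattr(o, key)].append(o)
    return dict(grouped)

# e.g. group_by(data.traceObjData, 'traceId') matches organization_traceObjData_by_traceId()
```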
55 |     def organization_original_metricObjData_by_metricId(self):
56 |         """
57 |         Organize the raw metric data into a dict keyed by metricId.
58 |         :param original_metricObjData:
59 |         :return: a dict keyed by metricId
60 |         """
61 |         # if self.original_metricObjData_by_metricId is not None: return self.original_metricObjData_by_metricId
62 |         original_metricObjData_by_metricId = {}
63 |         for i in self.original_metricObjData:
64 |             if i.metricId not in original_metricObjData_by_metricId:
65 |                 original_metricObjData_by_metricId[i.metricId] = []
66 |                 original_metricObjData_by_metricId[i.metricId].append(i)
67 |             else:
68 |                 original_metricObjData_by_metricId[i.metricId].append(i)
69 |         return original_metricObjData_by_metricId
70 | 
71 |     def organization_original_logObjData_by_logId(self):
72 |         """
73 |         Organize the raw log data into a dict keyed by logId.
74 |         :param original_logObjData:
75 |         :return: a dict keyed by logId
76 |         """
77 |         # if self.original_logObjData_by_logId is not None: return self.original_logObjData_by_logId
78 |         original_logObjData_by_logId = {}
79 |         for i in self.original_logObjData:
80 |             if i.logId not in original_logObjData_by_logId:
81 |                 original_logObjData_by_logId[i.logId] = []
82 |                 original_logObjData_by_logId[i.logId].append(i)
83 |             else:
84 |                 original_logObjData_by_logId[i.logId].append(i)
85 |         return original_logObjData_by_logId
86 | 
87 |     def get_target_exception_metric_data(self, exception_metricObjData):
88 |         pass
89 | 
90 |     def get_target_exception_log_data(self, exception_logObjData):
91 |         pass
92 | 
93 |     def organization_exception_metricObjData_by_metricBelongTo(self):
94 |         """
95 |         Organize the exception metric data into a dict keyed by metricBelongTo.
96 |         :param exception_metricObjData:
97 |         :return: a dict keyed by metricBelongTo
98 |         """
99 |         # if self.exception_metricObjData_by_metricBelongTo is not None: return self.exception_metricObjData_by_metricBelongTo
100 |         exception_metricObjData_by_metricBelongTo = {}
101 |         for i in self.exception_metricObjData:
102 |             if i.metricBelongTo not in exception_metricObjData_by_metricBelongTo:
103 |                 exception_metricObjData_by_metricBelongTo[i.metricBelongTo] = []
104 |             exception_metricObjData_by_metricBelongTo[i.metricBelongTo].append(i)
105 |         return exception_metricObjData_by_metricBelongTo
106 | 
107 |     def organization_exception_logObjData_by_logBelongTo(self):
108 |         """
109 |         Organize the exception log data into a dict keyed by logBelongTo.
110 |         :param exception_logObjData:
111 |         :return: a dict keyed by logBelongTo
112 |         """
113 |         # if self.exception_logObjData_by_logBelongTo is not None: return self.exception_logObjData_by_logBelongTo
114 |         exception_logObjData_by_logBelongTo = {}
115 |         for i in self.exception_logObjData:
116 |             if i.logBelongTo not in exception_logObjData_by_logBelongTo:
117 |                 exception_logObjData_by_logBelongTo[i.logBelongTo] = []
118 |             exception_logObjData_by_logBelongTo[i.logBelongTo].append(i)
119 |         return exception_logObjData_by_logBelongTo
120 | 
121 |     def get_fault_service_related_log_metric_data(self, serviceId, containerId=None, hostId=None):
122 |         exception_metrics_service_related = dict()
123 |         exception_logs_service_related = dict()
124 |         if serviceId is None: return exception_metrics_service_related, exception_logs_service_related
125 | 
126 |         if serviceId and serviceId in self.organization_exception_metricObjData_by_metricBelongTo().keys():
127 |             service_exception_metrics_list = self.organization_exception_metricObjData_by_metricBelongTo()[serviceId]
128 |             for i in service_exception_metrics_list:
129 |                 if i.metricId not in exception_metrics_service_related:
130 |                     exception_metrics_service_related[i.metricId] = []
131 |                     exception_metrics_service_related[i.metricId].append(i)
132 |                 else:
133 |                     exception_metrics_service_related[i.metricId].append(i)
134 |         if hostId and hostId in self.organization_exception_metricObjData_by_metricBelongTo().keys():
135 |             # ssss = self.organization_exception_metricObjData_by_metricBelongTo()
136 |             host_exception_metrics_list = self.organization_exception_metricObjData_by_metricBelongTo()[hostId]
137 |             for i in host_exception_metrics_list:
138 |                 if i.metricId not in exception_metrics_service_related:
139 |                     exception_metrics_service_related[i.metricId] = []
140 |                     exception_metrics_service_related[i.metricId].append(i)
141 |                 else:
142 |                     exception_metrics_service_related[i.metricId].append(i)
143 |         if containerId and containerId in self.organization_exception_metricObjData_by_metricBelongTo().keys():
144 |             docker_exception_metrics_list = self.organization_exception_metricObjData_by_metricBelongTo()[containerId]
145 |             for i in docker_exception_metrics_list:
146 |                 if i.metricId not in exception_metrics_service_related:
147 |                     exception_metrics_service_related[i.metricId] = []
148 |                     exception_metrics_service_related[i.metricId].append(i)
149 |                 else:
150 |                     exception_metrics_service_related[i.metricId].append(i)
151 |         # collect the related exception log lists
152 |         if serviceId and serviceId in self.organization_exception_logObjData_by_logBelongTo().keys():
153 |             service_exception_logs_list = self.organization_exception_logObjData_by_logBelongTo()[serviceId]
154 |             for i in service_exception_logs_list:
155 |                 if i.logId not in exception_logs_service_related:
156 |                     exception_logs_service_related[i.logId] = []
157 |                     exception_logs_service_related[i.logId].append(i)
158 |                 else:
159 |                     exception_logs_service_related[i.logId].append(i)
160 |         if hostId and hostId in self.organization_exception_logObjData_by_logBelongTo().keys():
161 |             host_exception_logs_list = self.organization_exception_logObjData_by_logBelongTo()[hostId]
162 |             for i in host_exception_logs_list:
163 |                 if i.logId not in exception_logs_service_related:
164 |                     exception_logs_service_related[i.logId] = []
165 |                     exception_logs_service_related[i.logId].append(i)
166 |                 else:
167 |                     exception_logs_service_related[i.logId].append(i)
168 |         if containerId and containerId in self.organization_exception_logObjData_by_logBelongTo().keys():
169 |             docker_exception_logs_list = self.organization_exception_logObjData_by_logBelongTo()[containerId]
170 |             for i in docker_exception_logs_list:
171 |                 if i.logId not in exception_logs_service_related:
172 |                     exception_logs_service_related[i.logId] = []
173 |                     exception_logs_service_related[i.logId].append(i)
174 |                 else:
175 |                     exception_logs_service_related[i.logId].append(i)
176 |         return exception_metrics_service_related, exception_logs_service_related
177 | 
178 |     def get_PC_input_data(self, exception_metrics, exception_logs):
179 |         """
180 |         Preprocess the raw data into the input format expected by the PC algorithm.
181 |         :param exception_metrics:
182 |         :param exception_logs:
183 |         :return:
184 |         """
185 |         metric_input = None
186 |         for key, value in exception_metrics.items():
187 |             metric_data = [i.__dict__ for i in self.organization_original_metricObjData_by_metricId()[key]]
188 |             # df = pd.DataFrame(metric_data)
189 |             df = pd.read_json(json.dumps(metric_data), orient='records')
190 |             if not df.empty:
191 |                 metric_input_tmp = df[['metricId', 'timestamp', 'value']].groupby(
192 |                     ['metricId', 'timestamp']).agg('mean')
193 |                 metric_input_tmp = metric_input_tmp.pivot_table(index='timestamp', columns='metricId', values='value')
194 |                 if metric_input is None:
195 |                     metric_input = metric_input_tmp
196 |                 else:
197 |                     metric_input = pd.concat([metric_input, metric_input_tmp], axis=1, sort=True)
198 |         log_input = None
199 |         for key, value in exception_logs.items():
200 |             # log_data = self.organization_original_logObjData_by_logId()
201 |             # log_data = self.organization_original_logObjData_by_logId()[key]
202 |             log_data = [i.__dict__ for i in self.organization_original_logObjData_by_logId()[key]]
203 |             # df = pd.DataFrame(log_data)
204 |             df = pd.read_json(json.dumps(log_data), orient='records')
205 |             if not df.empty:
206 |                 log_input_tmp = df[['logId', 'timestamp', 'logMessage']].groupby(
207 |                     ['logId', 'timestamp']).agg('count')
208 |                 log_input_tmp = log_input_tmp.pivot_table(index='timestamp', columns='logId', values='logMessage')
209 |                 if log_input is None:
210 |                     log_input = log_input_tmp
211 |                 else:
212 |                     log_input = pd.concat([log_input, log_input_tmp], axis=1)
213 |         pc_input = pd.concat([metric_input, log_input], axis=1)
214 |         pc_input.fillna(method='pad', axis=0, inplace=True)
215 |         pc_input.fillna(method='backfill', axis=0, inplace=True)
216 |         pc_input[pc_input == 0] = 0.00001
217 |         return pc_input
218 | 
219 | 
220 | def deploymentData_to_obj(deploymentData):
221 |     deploymentObjData = list()
222 |     if deploymentData is None: return deploymentObjData
223 |     for data in deploymentData:
224 |         tmp_obj = DeploymentDataEntry(data['serviceInstanceId'], data['serviceName'], data['hostId'], data['hostName'],
225 |                                       data['containerId'], data['containerName'])
226 |         deploymentObjData.append(tmp_obj)
227 |     return deploymentObjData
228 | 
229 | 
230 | def traceData_to_obj(traceData):
231 |     traceObjData = list()
232 |     if traceData is None: return traceObjData
233 |     for data in traceData:
234 |         tmp_obj = TraceDataEntry(data['id'], data['pid'], data['serviceId'], data['traceId'], data['serviceName'],
235 |                                  data['serviceType'], data['startTime'])
236 |         traceObjData.append(tmp_obj)
237 |     return traceObjData
238 | 
239 | 
240 | def originalMetricData_to_obj(original_metricData):
241 |     original_metricObjData = list()
242 |     if original_metricData is None: return original_metricObjData
243 |     for data in original_metricData:
244 |         tmp_obj = OriginalMetricEntry(data['metricId'], data['metricName'], data['timestamp'], data['value'],
245 |                                       data['metricBelongTo'], data['units'], data['metricBelongLevel'])
246 |         original_metricObjData.append(tmp_obj)
247 |     return original_metricObjData
248 | 
249 | 
250 | def originalLogData_to_obj(original_logData):
251 |     original_logObjData = list()
252 |     if original_logData is None: return original_logObjData
253 |     for data in original_logData:
254 |         tmp_obj = OriginalLogEntry(data['logId'], data['timestamp'], data['logMessage'], data['logBelongTo'],
255 |                                    data['logLevel'], data['logBelongLevel'])
256 |         original_logObjData.append(tmp_obj)
257 |     return original_logObjData
258 | 
259 | 
260 | def exceptionMetricData_to_obj(exception_metricData):
261 |     exception_metricObjData = list()
262 |     if exception_metricData is None: return exception_metricObjData
263 |     for data in exception_metricData:
264 |         tmp_obj = ExceptionMetricEntry(data['startTime'], data['endTime'], data['metricId'], data['metricName'],
265 |                                        data['value'], data['metricBelongTo'], data['units'], data['metricBelongLevel'])
266 |         exception_metricObjData.append(tmp_obj)
267 |     return exception_metricObjData
268 | 
269 | 
270 | def exceptionLogData_to_obj(exception_logData):
271 |     exception_logObjData = list()
272 |     if exception_logData is None: return exception_logObjData
273 |     for data in exception_logData:
274 |         tmp_obj = ExceptionLogEntry(data['startTime'], data['endTime'], data['logId'], data['logBelongTo'],
275 |                                     data['logExceptionSegment'], data['logBelongLevel'])
276 |         exception_logObjData.append(tmp_obj)
277 |     return exception_logObjData
278 | 
279 | 
280 | if __name__ == '__main__':
281 |     metric1 = OriginalMetricEntry("1", "1", "1", 1.0, "1", "1", "1")
282 |     metric2 = OriginalMetricEntry("2", "2", "2", 1.0, "2", "2", "2")
283 |     records = [metric1.__dict__, metric2.__dict__]
284 |     a = json.dumps(records)
285 |     df = pd.read_json(a, orient='records')
286 |     pass
287 | 
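get_PC_input_data above ultimately builds a timestamp × series matrix, with one column per metricId or logId. A self-contained sketch of the same pivot — the two records are invented samples of the metric from the README tables:

```python
import pandas as pd

records = [
    {'metricId': '29162', 'timestamp': '1614688922', 'value': 0.717773},
    {'metricId': '29162', 'timestamp': '1614688982', 'value': 0.823511},
]
df = pd.DataFrame(records)
pc_input = (df.groupby(['metricId', 'timestamp'], as_index=False).agg('mean')
              .pivot_table(index='timestamp', columns='metricId', values='value'))
print(pc_input)  # rows: timestamps, columns: metric ids
```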
-------------------------------------------------------------------------------- /utils/pcalg.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | 
4 | """A graph generator based on the PC algorithm [Kalisch2007].
5 | 
6 | [Kalisch2007] Markus Kalisch and Peter Bühlmann. Estimating
7 | high-dimensional directed acyclic graphs with the pc-algorithm. In The
8 | Journal of Machine Learning Research, Vol. 8, pp. 613-636, 2007.
9 | """
10 | from __future__ import print_function
11 | 
12 | import logging
13 | import math
14 | from itertools import combinations, permutations
15 | import pandas as pd
16 | import networkx as nx
17 | import numpy as np
18 | from gsq.ci_tests import ci_test_bin, ci_test_dis
19 | from gsq.gsq_testdata import bin_data, dis_data
20 | # from networkx.drawing.tests.test_pylab import plt
21 | from scipy.stats import norm
22 | import matplotlib.pyplot as plt
23 | 
24 | from utils.pageRank import PRIterator
25 | 
26 | _logger = logging.getLogger(__name__)
27 | 
28 | 
29 | # conditional independence test (Gaussian, partial-correlation based)
30 | def gaussCItest(suffstat, x, y, S):
31 |     S = list(S)
32 |     C = pd.DataFrame(suffstat).astype(float).corr().values
33 |     n = pd.DataFrame(suffstat).values.shape[0]
34 | 
35 |     cut_at = 0.9999999
36 | 
37 |     # partial correlation coefficient
38 |     # S is empty
39 |     if len(S) == 0:
40 |         r = C[x, y]
41 | 
42 |     # S contains a single node: first-order partial correlation
43 |     elif len(S) == 1:
44 |         a = (C[x, y] - C[x, S] * C[y, S])
45 |         try:
46 |             b = math.sqrt((1 - math.pow(C[y, S], 2)) * (1 - math.pow(C[x, S], 2)))
47 |             r = a / b
48 |         except:
49 |             r = C[x, y]
50 |     # not entirely obvious why, but this is exactly how the R pcalg package computes it
51 |     else:
52 |         m = C[np.ix_([x] + [y] + S, [x] + [y] + S)]
53 |         PM = np.linalg.pinv(m)
54 |         r = -1 * PM[0, 1] / math.sqrt(abs(PM[0, 0] * PM[1, 1]))
55 | 
56 |     r = min(cut_at, max(-1 * cut_at, r))
57 |     # Fisher's z-transform
58 |     res = math.sqrt(n - len(S) - 3) * .5 * math.log1p((2 * r) / (1 - r))
59 |     # Φ^{-1}(1-α/2)
60 |     return 2 * (1 - norm.cdf(abs(res)))
61 | 
62 | 
63 | def _create_complete_graph(node_ids):
64 |     """
65 |     Build the graph structure from the node list.
66 |     Create a complete graph from the list of node ids.
67 | 
68 |     Args:
69 |         node_ids: a list of node ids
70 | 
71 |     Returns:
72 |         An undirected graph (as a networkx.Graph)
73 |     """
74 |     g = nx.Graph()
75 |     g.add_nodes_from(node_ids)
76 |     for (i, j) in combinations(node_ids, 2):
77 |         g.add_edge(i, j)
78 |     return g
79 | 
80 | 
63 | def _create_complete_graph(node_ids):
64 |     """
65 |     Build the fully connected undirected graph used as the starting skeleton.
66 |     Create a complete graph from the list of node ids.
67 | 
68 |     Args:
69 |         node_ids: a list of node ids
70 | 
71 |     Returns:
72 |         An undirected graph (as a networkx.Graph)
73 |     """
74 |     g = nx.Graph()
75 |     g.add_nodes_from(node_ids)
76 |     for (i, j) in combinations(node_ids, 2):
77 |         g.add_edge(i, j)
78 |     return g
79 | 
80 | 
81 | def estimate_skeleton(indep_test_func, data_matrix, alpha, **kwargs):
82 |     """
83 |     Estimate the skeleton graph from statistical information:
84 |     1. start from a complete undirected graph over the data columns;
85 |     2. for each pair of adjacent nodes, run a conditional independence test and delete the edge when the p-value exceeds alpha.
86 |     Estimate a skeleton graph from the statistical information.
87 | 
88 |     Args:
89 |         indep_test_func: the conditional independence test function;
90 |             the function name for a conditional independency test.
91 |         data_matrix: data (as a numpy array).
92 |         alpha: the significance level.
93 |         kwargs:
94 |             'max_reach': maximum value of l (see the code). The
95 |                 value depends on the underlying distribution.
96 |             'method': if 'stable' given, use stable-PC algorithm
97 |                 (see [Colombo2014]).
98 |             'init_graph': initial structure of skeleton graph
99 |                 (as a networkx.Graph). If not specified,
100 |                 a complete graph is used.
101 |             other parameters may be passed depending on the
102 |                 indep_test_func()s.
103 |     Returns:
104 |         g: a skeleton graph (as a networkx.Graph).
105 |         sep_set: a separation set (as an 2D-array of set()).
106 | 
107 |     [Colombo2014] Diego Colombo and Marloes H Maathuis. Order-independent
108 |     constraint-based causal structure learning. In The Journal of Machine
109 |     Learning Research, Vol. 15, pp. 3741-3782, 2014.
110 |     """
111 | 
112 |     def method_stable(kwargs):
113 |         return ('method' in kwargs) and kwargs['method'] == "stable"
114 | 
115 |     node_ids = range(data_matrix.shape[1])
116 |     node_size = data_matrix.shape[1]
117 |     sep_set = [[set() for i in range(node_size)] for j in range(node_size)]
118 |     if 'init_graph' in kwargs:
119 |         g = kwargs['init_graph']
120 |         if not isinstance(g, nx.Graph):
121 |             raise ValueError
122 |         elif not g.number_of_nodes() == len(node_ids):
123 |             raise ValueError('init_graph not matching data_matrix shape')
124 |         for (i, j) in combinations(node_ids, 2):
125 |             if not g.has_edge(i, j):
126 |                 sep_set[i][j] = None
127 |                 sep_set[j][i] = None
128 |     else:
129 |         # start from the complete undirected graph
130 |         g = _create_complete_graph(node_ids)
131 | 
132 |     l = 0
133 |     while True:
134 |         cont = False
135 |         remove_edges = []
136 |         # iterate over ordered pairs (permutations), so each undirected edge is tested from both endpoints
137 |         for (i, j) in permutations(node_ids, 2):
138 |             # neighbors of i
139 |             adj_i = list(g.neighbors(i))
140 |             # skip the pair unless j is adjacent to i; exclude j itself from the conditioning candidates
141 |             if j not in adj_i:
142 |                 continue
143 |             else:
144 |                 adj_i.remove(j)
145 |             # The process stops if all neighborhoods in the current graph are smaller than the size of the conditional set.
146 |             if len(adj_i) >= l:
147 |                 # _logger.debug('testing %s and %s' % (i, j))
148 |                 _logger.debug('testing nodes %s and %s' % (i, j))
149 |                 # _logger.debug('neighbors of %s are %s' % (i, str(adj_i)))
150 |                 _logger.debug('neighbors of %s are %s' % (i, str(adj_i)))
151 |                 if len(adj_i) < l:  # redundant with the guard above; kept as in the original
152 |                     continue
153 |                 # if some conditioning set k of size l (a set of nodes, not a single node) makes i and j conditionally independent, delete the edge i-j
154 |                 for k in combinations(adj_i, l):
155 |                     _logger.debug('indep prob of %s and %s with subset %s'
156 |                                   % (i, j, str(k)))
157 |                     # run the conditional independence test (note: gaussCItest is hardcoded below, so indep_test_func is effectively unused)
158 |                     # p_val = indep_test_func(data_matrix, i, j, set(k), **kwargs)
159 |                     p_val = gaussCItest(data_matrix, i, j, set(k))
160 |                     _logger.debug('p-value of the independence test: %s' % str(p_val))
161 |                     # if the p-value exceeds the significance level alpha, remove the edge between i and j
162 |                     if p_val > alpha:
163 |                         if g.has_edge(i, j):
164 |                             _logger.debug('p: removing edge (%s, %s)' % (i, j))
165 |                             if method_stable(kwargs):
166 |                                 remove_edges.append((i, j))
167 |                             else:
168 |                                 g.remove_edge(i, j)
169 |                         # record k in the separation set; direction is ignored at this stage, so both sep_set[i][j] and sep_set[j][i] are updated
170 |                         sep_set[i][j] |= set(k)
171 |                         sep_set[j][i] |= set(k)
172 |                         break
173 |                 cont = True
174 |         l += 1
175 |         if method_stable(kwargs):
176 |             g.remove_edges_from(remove_edges)
177 |         if cont is False:
178 |             break
179 |         if ('max_reach' in kwargs) and (l > kwargs['max_reach']):
180 |             break
181 | 
182 |     return (g, sep_set)
183 | 
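A quick illustration of the skeleton phase: for a chain x -> y -> z, the x-z edge is dropped once y is conditioned on, and y lands in the separation set. (Synthetic data; as noted above, the body currently hardcodes gaussCItest, so the indep_test_func argument is accepted but not used.)

    import numpy as np

    rng = np.random.default_rng(1)
    x = rng.normal(size=3000)
    y = x + rng.normal(scale=0.5, size=3000)
    z = y + rng.normal(scale=0.5, size=3000)
    dm = np.column_stack([x, y, z])

    g, sep_set = estimate_skeleton(indep_test_func=gaussCItest, data_matrix=dm, alpha=0.05)
    print(g.edges())      # expected skeleton: [(0, 1), (1, 2)]
    print(sep_set[0][2])  # expected: {1}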
184 | 
185 | def estimate_cpdag(skel_graph, sep_set):
186 |     """
187 | 
188 |     Estimate a CPDAG from the skeleton graph and separation sets
189 |     returned by the estimate_skeleton() function.
190 | 
191 |     Args:
192 |         skel_graph: A skeleton graph (an undirected networkx.Graph).
193 |         sep_set: An 2D-array of separation set.
194 |             The contents look like something like below.
195 |                 sep_set[i][j] = set([k, l, m])
196 | 
197 |     Returns:
198 |         An estimated DAG.
199 |     """
200 |     # make the skeleton directed: every undirected edge becomes a pair of opposing arcs
201 |     dag = skel_graph.to_directed()
202 |     node_ids = skel_graph.nodes()
203 |     # iterate over all combinations of i and j
204 |     for (i, j) in combinations(node_ids, 2):
205 |         # look for nodes k that form a v-structure i -> k <- j
206 |         adj_i = set(dag.successors(i))
207 |         if j in adj_i:
208 |             continue
209 |         adj_j = set(dag.successors(j))
210 |         if i in adj_j:
211 |             continue
212 |         # robustness guard: None marks non-adjacent pairs that have no separation set
213 |         if sep_set[i][j] is None:
214 |             continue
215 |         # candidate set of k nodes adjacent to both i and j
216 |         common_k = adj_i & adj_j
217 |         for k in common_k:
218 |             # orient a v-structure only when k is NOT in the separation set of (i, j); sep_set is symmetric, so checking one entry suffices
219 |             if k not in sep_set[i][j]:
220 |                 # orient i - k as i -> k by dropping the arc k -> i
221 |                 if dag.has_edge(k, i):
222 |                     _logger.debug('S: removing edge (%s, %s)' % (k, i))
223 |                     dag.remove_edge(k, i)
224 |                 # likewise orient j - k as j -> k
225 |                 if dag.has_edge(k, j):
226 |                     _logger.debug('S: remove edge (%s, %s)' % (k, j))
227 |                     dag.remove_edge(k, j)
228 | 
229 |     def _has_both_edges(dag, i, j):
230 |         return dag.has_edge(i, j) and dag.has_edge(j, i)
231 | 
232 |     def _has_any_edge(dag, i, j):
233 |         return dag.has_edge(i, j) or dag.has_edge(j, i)
234 | 
235 |     def _has_one_edge(dag, i, j):
236 |         return ((dag.has_edge(i, j) and (not dag.has_edge(j, i))) or
237 |                 (not dag.has_edge(i, j)) and dag.has_edge(j, i))
238 | 
239 |     def _has_no_edge(dag, i, j):
240 |         return (not dag.has_edge(i, j)) and (not dag.has_edge(j, i))
241 | 
242 |     # For all the combination of nodes i and j, apply the following
243 |     # rules.
244 |     # apply Meek's three orientation rules until the graph stops changing
245 |     old_dag = dag.copy()
246 |     while True:
247 |         # iterate over all combinations of i and j
248 |         for (i, j) in combinations(node_ids, 2):
249 |             # Rule 1: Orient i-j into i->j whenever there is an arrow k->i
250 |             # such that k and j are nonadjacent.
251 |             #
252 |             # Check if i-j.
253 |             # check whether i - j is still undirected (arcs in both directions)
254 |             if _has_both_edges(dag, i, j):
255 |                 # Look all the predecessors of i.
256 |                 for k in dag.predecessors(i):
257 |                     # Skip if there is an arrow i->k.
258 |                     if dag.has_edge(i, k):
259 |                         continue
260 |                     # Skip if k and j are adjacent.
261 |                     if _has_any_edge(dag, k, j):
262 |                         continue
263 |                     # Make i-j into i->j
264 |                     _logger.debug('R1: remove edge (%s, %s)' % (j, i))
265 |                     dag.remove_edge(j, i)
266 |                     break
267 | 
268 |             # Rule 2: Orient i-j into i->j whenever there is a chain
269 |             # i->k->j.
270 |             #
271 |             # Check if i-j.
272 |             if _has_both_edges(dag, i, j):
273 |                 # Find nodes k where k is i->k.
274 |                 succs_i = set()
275 |                 for k in dag.successors(i):
276 |                     if not dag.has_edge(k, i):
277 |                         succs_i.add(k)
278 |                 # Find nodes j where j is k->j.
279 |                 preds_j = set()
280 |                 for k in dag.predecessors(j):
281 |                     if not dag.has_edge(j, k):
282 |                         preds_j.add(k)
283 |                 # Check if there is any node k where i->k->j.
284 |                 if len(succs_i & preds_j) > 0:
285 |                     # Make i-j into i->j
286 |                     _logger.debug('R2: remove edge (%s, %s)' % (j, i))
287 |                     dag.remove_edge(j, i)
288 | 
289 |             # Rule 3: Orient i-j into i->j whenever there are two chains
290 |             # i-k->j and i-l->j such that k and l are nonadjacent.
291 |             #
292 |             # Check if i-j.
293 |             if _has_both_edges(dag, i, j):
294 |                 # Find nodes k where i-k.
295 |                 adj_i = set()
296 |                 for k in dag.successors(i):
297 |                     if dag.has_edge(k, i):
298 |                         adj_i.add(k)
299 |                 # For all the pairs of nodes in adj_i,
300 |                 for (k, l) in combinations(adj_i, 2):
301 |                     # Skip if k and l are adjacent.
302 |                     if _has_any_edge(dag, k, l):
303 |                         continue
304 |                     # Skip if not k->j.
305 |                     if dag.has_edge(j, k) or (not dag.has_edge(k, j)):
306 |                         continue
307 |                     # Skip if not l->j.
308 |                     if dag.has_edge(j, l) or (not dag.has_edge(l, j)):
309 |                         continue
310 |                     # Make i-j into i->j.
311 |                     _logger.debug('R3: remove edge (%s, %s)' % (j, i))
312 |                     dag.remove_edge(j, i)
313 |                     break
314 | 
315 |             # Rule 4: Orient i-j into i->j whenever there are two chains
316 |             # i-k->l and k->l->j such that k and j are nonadjacent.
317 |             #
318 |             # However, this rule is not necessary when the PC-algorithm
319 |             # is used to estimate a DAG.
320 | 
321 |         if nx.is_isomorphic(dag, old_dag):
322 |             break
323 |         old_dag = dag.copy()
324 | 
325 |     return dag
326 | 
327 | 
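estimate_cpdag first orients v-structures (i -> k <- j whenever i and j are non-adjacent and k is absent from their separation set), then applies the three Meek rules above until the graph stops changing. A collider makes the orientation visible (synthetic data, continuing the sketch style used above):

    import numpy as np

    rng = np.random.default_rng(2)
    x = rng.normal(size=3000)
    y = rng.normal(size=3000)
    z = x + y + rng.normal(scale=0.5, size=3000)
    dm = np.column_stack([x, y, z])

    skel, sep = estimate_skeleton(indep_test_func=gaussCItest, data_matrix=dm, alpha=0.05)
    dag = estimate_cpdag(skel_graph=skel, sep_set=sep)
    print(dag.edges())  # expected v-structure: [(0, 2), (1, 2)]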
328 | def construct_service_dependency_diagram(b):
329 |     data = np.asarray(b)
330 |     columns = list(b.columns)
331 |     columns_mapping = {i: str(column) for i, column in enumerate(columns)}
332 | 
333 |     (g, sep_set) = estimate_skeleton(indep_test_func=ci_test_dis,
334 |                                      data_matrix=data,
335 |                                      alpha=0.05)
336 |     g = estimate_cpdag(skel_graph=g, sep_set=sep_set)
337 |     return g, columns_mapping
338 | 
339 | 
340 | def get_root_cause(g):
341 |     """
342 |     Collect the list of root causes from the causal graph.
343 |     Args:
344 |         g: the causal graph (a networkx.DiGraph)
345 | 
346 |     Returns: the list of root-cause nodes
347 | 
348 |     """
349 |     result = list()
350 |     node_ids = g.nodes()
351 |     # find the node with the largest number of direct causes (predecessors)
352 |     max_pre_node, max_pre_size = None, 0
353 |     for node_id in node_ids:
354 |         if len(list(g.predecessors(node_id))) > max_pre_size:
355 |             max_pre_node = node_id
356 |             max_pre_size = len(list(g.predecessors(node_id)))
357 |     # breadth-first traversal upstream from that node
358 |     node_filter, node_queue = {max_pre_node}, list([max_pre_node])
359 |     while node_queue:
360 |         node_now = node_queue.pop(0)
361 |         if not list(g.predecessors(node_now)):  # materialize the iterator; a bare generator is always truthy
362 |             result.append(node_now)
363 |             continue
364 |         is_pre_not_filter = False
365 |         for k in g.predecessors(node_now):
366 |             if k not in node_filter:
367 |                 is_pre_not_filter = True
368 |                 node_filter.add(k)
369 |                 node_queue.append(k)
370 |         # if every upstream node is already in the filter, add the current node to result so result cannot end up empty
371 |         if not is_pre_not_filter:
372 |             result.append(node_now)
373 |     return result
374 | 
375 | 
376 | if __name__ == '__main__':
377 | 
378 |     # enable debug logging; do not comment this out
379 |     ch = logging.StreamHandler()
380 |     ch.setLevel(logging.DEBUG)
381 |     _logger.setLevel(logging.DEBUG)
382 |     _logger.addHandler(ch)
383 | 
384 |     # mock raw data
385 |     dm = np.array(bin_data).reshape((5000, 5))
386 |     (g, sep_set) = estimate_skeleton(indep_test_func=ci_test_bin,
387 |                                      data_matrix=dm,
388 |                                      alpha=0.01)
389 |     #
390 |     g = estimate_cpdag(skel_graph=g, sep_set=sep_set)
391 |     g_answer = nx.DiGraph()
392 |     g_answer.add_nodes_from([0, 1, 2, 3, 4])
393 |     g_answer.add_edges_from([(0, 1), (2, 3), (3, 2), (3, 1),
394 |                              (2, 4), (4, 2), (4, 1)])
395 |     print('Edges are:', g.edges(), end='')
396 |     if nx.is_isomorphic(g, g_answer):
397 |         print(' => GOOD')
398 |     else:
399 |         print(' => WRONG')
400 |         print('True edges should be:', g_answer.edges())
401 | 
402 |     # a second mock with more data points
403 |     dm = np.array(dis_data).reshape((10000, 5))
404 |     (g, sep_set) = estimate_skeleton(indep_test_func=ci_test_dis,
405 |                                      data_matrix=dm,
406 |                                      alpha=0.01,
407 |                                      levels=[3, 2, 3, 4, 2])
408 |     g = estimate_cpdag(skel_graph=g, sep_set=sep_set)
409 |     nx.draw(g, pos=nx.spring_layout(g),  # layout options include spring_layout, random_layout, circular_layout, shell_layout; prog='dot' belongs to graphviz_layout and is dropped here
410 |             node_color='g',  # node_color takes matplotlib colour codes (r, b, y, k, w); edge_color works the same way
411 |             edge_color='r',
412 |             with_labels=True)
413 |     plt.show()
414 | 
415 |     pr = PRIterator(g)
416 |     page_ranks = pr.page_rank()
417 |     print("The final page rank is\n", page_ranks)
418 | 
419 |     g_answer = nx.DiGraph()
420 |     g_answer.add_nodes_from([0, 1, 2, 3, 4])
421 |     g_answer.add_edges_from([(0, 2), (1, 2), (1, 3), (4, 3)])
422 |     print('Edges are:', g.edges(), end='')
423 |     if nx.is_isomorphic(g, g_answer):
424 |         print(' => GOOD')
425 |     else:
426 |         print(' => WRONG')
427 |         print('True edges should be:', g_answer.edges())
--------------------------------------------------------------------------------
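End to end, this module is used as: build pc_input (input_data.py), learn the CPDAG with construct_service_dependency_diagram, read candidate roots with get_root_cause, and rank nodes with PRIterator. A shape-only sketch under assumed data; the three correlated series and their names are invented for illustration, and real inputs come from the monitoring pipeline:

    import numpy as np
    import pandas as pd

    a = np.random.rand(300)
    b = a + 0.1 * np.random.rand(300)
    c = b + 0.1 * np.random.rand(300)
    pc_input = pd.DataFrame({'svc_a:cpu': a, 'svc_b:cpu': b, 'svc_c:latency': c})

    g, columns_mapping = construct_service_dependency_diagram(pc_input)
    roots = get_root_cause(g)
    print([columns_mapping[r] for r in roots])  # candidate root-cause series

    pr = PRIterator(g)  # PageRank scores as a complementary ranking, as in __main__ above
    print(pr.page_rank())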
/utils/process_aiops2020_data_to_original.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import csv
4 | import json
5 | 
6 | def get_original_trace_data():
7 |     trace_csf_path = '../data/aiops_data_2020/2020_04_11/调用链指标/trace_csf.csv'
8 |     trace_fly_remote_path = '../data/aiops_data_2020/2020_04_11/调用链指标/trace_fly_remote.csv'
9 |     trace_jdbc_path = '../data/aiops_data_2020/2020_04_11/调用链指标/trace_jdbc.csv'
10 |     trace_local_path = '../data/aiops_data_2020/2020_04_11/调用链指标/trace_local.csv'
11 |     trace_osb_path = '../data/aiops_data_2020/2020_04_11/调用链指标/trace_osb.csv'
12 |     trace_remote_process_path = '../data/aiops_data_2020/2020_04_11/调用链指标/trace_remote_process.csv'
13 | 
14 |     result = {}
15 |     with open(trace_csf_path, 'r') as f:
16 |         reader = csv.DictReader(f)
17 |         for row in reader:
18 |             tmp = {}
19 |             traceId = row['traceId']
20 |             id = row['id']
21 |             pid = row['pid']
22 |             # serviceId = row['cmdb_id'] + ':' + row['serviceName']
23 |             serviceType = row['callType']
24 |             tmp['id'] = id
25 |             tmp['pid'] = pid
26 |             tmp['serviceId'] = None
27 |             tmp['cmdb_id'] = row['cmdb_id']
28 |             tmp['serviceType'] = serviceType
29 |             tmp['serviceName'] = row['serviceName']
30 |             tmp['startTime'] = row['startTime']
31 |             if traceId not in result:
32 |                 result[traceId] = []
33 |                 result[traceId].append(tmp)
34 |             else:
35 |                 result[traceId].append(tmp)
36 | 
37 |     with open(trace_jdbc_path, 'r') as f:
38 |         reader = csv.DictReader(f)
39 |         for row in reader:
40 |             tmp = {}
41 |             traceId = row['traceId']
42 |             id = row['id']
43 |             pid = row['pid']
44 |             # serviceId = row['cmdb_id'] + ':' + row['dsName']
45 |             serviceType = row['callType']
46 |             tmp['id'] = id
47 |             tmp['pid'] = pid
48 |             tmp['serviceId'] = row['dsName']
49 |             tmp['cmdb_id'] = row['cmdb_id']
50 |             tmp['serviceType'] = serviceType
51 |             tmp['serviceName'] = row['dsName']
52 |             tmp['startTime'] = row['startTime']
53 |             if traceId not in result:
54 |                 result[traceId] = []
55 |                 result[traceId].append(tmp)
56 |             else:
57 |                 result[traceId].append(tmp)
58 | 
59 |     with open(trace_local_path, 'r') as f:
60 |         reader = csv.DictReader(f)
61 |         for row in reader:
62 |             tmp = {}
63 |             traceId = row['traceId']
64 |             id = row['id']
65 |             pid = row['pid']
66 |             # serviceId = row['cmdb_id'] + ':' + row['dsName']
67 |             serviceType = row['callType']
68 |             tmp['id'] = id
69 |             tmp['pid'] = pid
70 |             tmp['serviceId'] = row['dsName']
71 |             tmp['cmdb_id'] = row['cmdb_id']
72 |             tmp['serviceType'] = serviceType
73 |             tmp['serviceName'] = row['dsName']
74 |             tmp['startTime'] = row['startTime']
75 |             if traceId not in result:
76 |                 result[traceId] = []
77 |                 result[traceId].append(tmp)  # fixed: the first occurrence must append the span (was a stray re-assignment of startTime that silently dropped rows)
78 |             else:
79 |                 result[traceId].append(tmp)
80 | 
81 |     with open(trace_osb_path, 'r') as f:
82 |         reader = csv.DictReader(f)
83 |         for row in reader:
84 |             tmp = {}
85 |             traceId = row['traceId']
86 |             id = row['id']
87 |             pid = row['pid']
88 |             serviceId = row['cmdb_id'] + ':' + row['serviceName']
89 |             serviceType = row['callType']
90 |             tmp['id'] = id
91 |             tmp['pid'] = pid
92 |             tmp['serviceId'] = serviceId
93 |             tmp['cmdb_id'] = row['cmdb_id']
94 |             tmp['serviceType'] = serviceType
95 |             tmp['serviceName'] = row['serviceName']
96 |             tmp['startTime'] = row['startTime']
97 |             if traceId not in result:
98 |                 result[traceId] = []
99 |                 result[traceId].append(tmp)
100 |             else:
101 |                 result[traceId].append(tmp)
102 | 
103 |     with open(trace_fly_remote_path, 'r') as f:
104 |         reader = csv.DictReader(f)
105 |         for row in reader:
106 |             tmp = {}
107 |             traceId = row['traceId']
108 |             id = row['id']
109 |             pid = row['pid']
110 |             serviceId = row['cmdb_id'] + ':' + row['serviceName']
111 |             serviceType = row['callType']
112 |             tmp['id'] = id
113 |             tmp['pid'] = pid
114 |             tmp['serviceId'] = serviceId
115 |             tmp['cmdb_id'] = row['cmdb_id']
116 |             tmp['serviceType'] = serviceType
117 |             tmp['serviceName'] = row['serviceName']
118 |             tmp['startTime'] = row['startTime']
119 |             if traceId not in result:
120 |                 result[traceId] = []
121 |                 result[traceId].append(tmp)
122 |             else:
123 |                 result[traceId].append(tmp)
124 | 
125 |     with open(trace_remote_process_path, 'r') as f:
126 |         reader = csv.DictReader(f)
127 |         for row in reader:
128 |             tmp = {}
129 |             traceId = row['traceId']
130 |             id = row['id']
131 |             pid = row['pid']
132 |             serviceId = row['cmdb_id'] + ':' + row['serviceName']
133 |             serviceType = row['callType']
134 |             tmp['id'] = id
135 |             tmp['pid'] = pid
136 |             tmp['serviceId'] = serviceId
137 |             tmp['cmdb_id'] = row['cmdb_id']
138 |             tmp['serviceType'] = serviceType
139 |             tmp['serviceName'] = row['serviceName']
140 |             tmp['startTime'] = row['startTime']
141 |             if traceId not in result:
142 |                 result[traceId] = []
143 |                 result[traceId].append(tmp)
144 |             else:
145 |                 result[traceId].append(tmp)
146 |     for k, value in result.items():
147 |         tmp_pid_dict = {}
148 |         for i in value:
149 |             if i['pid'] != 'None':
150 |                 tmp_pid_dict[i['pid']] = i['cmdb_id']
151 |         del_values = list()
152 |         for i in value:
153 |             if i['serviceType'] == "CSF":
154 |                 if i['id'] in tmp_pid_dict:
155 |                     cmdb_id = tmp_pid_dict[i['id']]
156 |                     serviceId = cmdb_id + ":" + i['serviceName']
157 |                     i['serviceId'] = serviceId
158 |                     i['cmdb_id'] = cmdb_id
159 |                 else:
160 |                     del_values.append(i)
161 |         for del_i in del_values:
162 |             value.remove(del_i)
163 | 
164 |     save_path = '../data/aiops_data_2020/2020_04_11/origina_traces.json'
165 |     with open(save_path, 'w') as f:
166 |         json.dump(result, f, indent=2, sort_keys=True, ensure_ascii=False)
167 | 
168 | def get_target_deployment_data(original_deployment_data):
169 |     """
170 |     Convert the raw deployment data into the target ingestion format.
171 |     Args:
172 |         original_deployment_data: the raw deployment data
173 | 
174 |     Returns: the raw deployment data keyed for ingestion: { serviceInstanceId: { serviceInstanceId:"", serviceName:"", hostId:"", hostName:"", containerId:"", containerName:"" }, ... }
175 |     """
176 |     # original_deployment_data = [{"serviceInstanceId":"os_021:osb_001","serviceName":"osb_001", "containerId":None, "containerName":None,"hostId":"os_021","hostName":"os_021"},{"serviceInstanceId":"os_022:osb_002","serviceName":"osb_002", "containerId":None, "containerName":None,"hostId":"os_022", "hostName":"os_022"},{"serviceInstanceId":"docker_001:csf_001","serviceName":"csf_001", "containerId":"docker_001", "containerName":"docker_001","hostId":"os_017", "hostName":"os_017"},{"serviceInstanceId":"docker_002:csf_001","serviceName":"csf_001", "containerId":"docker_002", "containerName":"docker_002","hostId":"os_018", "hostName":"os_018"},{"serviceInstanceId":"docker_003:csf_001","serviceName":"csf_001", "containerId":"docker_003", "containerName":"docker_003","hostId":"os_019", "hostName":"os_019"},{"serviceInstanceId":"docker_004:csf_001","serviceName":"csf_001", "containerId":"docker_004", "containerName":"docker_004","hostId":"os_020", "hostName":"os_020"},{"serviceInstanceId":"docker_005:csf_002","serviceName":"csf_002","containerId":"docker_005","containerName":"docker_005","hostId":"os_017","hostName":"os_017"},{"serviceInstanceId":"docker_006:csf_002","serviceName":"csf_002", "containerId":"docker_006",
"containerName":"docker_006","hostId":"os_018","hostName":"os_018"},{"serviceInstanceId":"docker_007:csf_002","serviceName":"csf_002","containerId":"docker_007","containerName":"docker_007","hostId":"os_019","hostName":"os_019"},{"serviceInstanceId":"docker_008:csf_002","serviceName":"csf_002","containerId":"docker_008","containerName":"docker_008","hostId":"os_020","hostName":"os_020"},{"serviceInstanceId":"docker_005:csf_003","serviceName":"csf_003","containerId":"docker_005","containerName":"docker_005","hostId":"os_017","hostName":"os_017"},{"serviceInstanceId":"docker_006:csf_003","serviceName":"csf_003","containerId":"docker_006","containerName":"docker_006","hostId":"os_018","hostName":"os_018"},{"serviceInstanceId":"docker_007:csf_003","serviceName":"csf_003","containerId":"docker_007","containerName":"docker_007","hostId":"os_019","hostName":"os_019"},{"serviceInstanceId":"docker_008:csf_003","serviceName":"csf_003","containerId":"docker_008","containerName":"docker_008","hostId":"os_020","hostName":"os_020"},{"serviceInstanceId":"docker_005:csf_004","serviceName":"csf_004","containerId":"docker_005","containerName":"docker_005","hostId":"os_017","hostName":"os_017"},{"serviceInstanceId":"docker_006:csf_004","serviceName":"csf_004","containerId":"docker_006","containerName":"docker_006","hostId":"os_018","hostName":"os_018"},{"serviceInstanceId":"docker_007:csf_004","serviceName":"csf_004","containerId":"docker_007","containerName":"docker_007","hostId":"os_019","hostName":"os_019"},{"serviceInstanceId":"docker_008:csf_004","serviceName":"csf_004","containerId":"docker_008","containerName":"docker_008","hostId":"os_020","hostName":"os_020"},{"serviceInstanceId":"docker_005:csf_005","serviceName":"csf_005","containerId":"docker_005","containerName":"docker_005","hostId":"os_017","hostName":"os_017"},{"serviceInstanceId":"docker_006:csf_005","serviceName":"csf_005","containerId":"docker_006","containerName":"docker_006","hostId":"os_018","hostName":"os_018"},{"serviceInstanceId":"docker_007:csf_005","serviceName":"csf_005", "containerId":"docker_007", "containerName":"docker_007","hostId":"os_019","hostName":"os_019"},{"serviceInstanceId":"docker_008:csf_005","serviceName":"csf_005","containerId":"docker_008","containerName":"docker_008","hostId":"os_020","hostName":"os_020"},{"serviceInstanceId":"db_001","serviceName":"db_001", "containerId":None, "containerName":None,"hostId":None,"hostName":None},{"serviceInstanceId":"db_002","serviceName":"db_001", "containerId":None, "containerName":None,"hostId":None,"hostName":None},{"serviceInstanceId":"db_003","serviceName":"db_003","containerId":None,"containerName":None,"hostId":None,"hostName":None},{"serviceInstanceId":"db_004","serviceName":"db_004","containerId":None, "containerName":None,"hostId":None,"hostName":None},{"serviceInstanceId":"db_005","serviceName":"db_005","containerId":None,"containerName":None,"hostId":None,"hostName":None},{"serviceInstanceId":"db_006","serviceName":"db_006","containerId":None,"containerName":None,"hostId":None,"hostName":None},{"serviceInstanceId":"db_007","serviceName":"db_007","containerId":None,"containerName":None,"hostId":None,"hostName":None},{"serviceInstanceId":"db_008","serviceName":"db_008","containerId":None,"containerName":None,"hostId":None,"hostName":None},{"serviceInstanceId":"db_009","serviceName":"db_009","containerId":None,"containerName":None,"hostId":None,"hostName":None},{"serviceInstanceId":"db_010","serviceName":"db_010","containerId":None, 
"containerName":None,"hostId":None,"hostName":None},{"serviceInstanceId":"db_011","serviceName":"db_011","containerId":None,"containerName":None,"hostId":None,"hostName":None},{"serviceInstanceId":"db_012","serviceName":"db_012","containerId":None,"containerName":None,"hostId":None,"hostName":None},{"serviceInstanceId":"db_013","serviceName":"db_013","containerId":None,"containerName":None,"hostId":None,"hostName":None},{"serviceInstanceId":":docker_001:fly_remote_001","serviceName":"fly_remote_001","containerId":"docker_001","containerName":"docker_001","hostId":None,"hostName":None},{"serviceInstanceId":"docker_002:fly_remote_001","serviceName":"fly_remote_001","containerId":"docker_002","containerName":"docker_002","hostId":None,"hostName":None},{"serviceInstanceId":"docker_003:fly_remote_001","serviceName":"fly_remote_001","containerId":"docker_003","containerName":"docker_003","hostId":None,"hostName":None},{"serviceInstanceId":"docker_004:fly_remote_001","serviceName":"fly_remote_001","containerId":"docker_004","containerName":"docker_004","hostId":None,"hostName":None}] 177 | # target_deployment_data = {} 178 | # for i in original_deployment_data: 179 | # target_deployment_data[i['serviceInstanceId']]= i 180 | # return target_deployment_data 181 | 182 | 183 | # f = open('../../../data/aiops_data_2020/2020_05_22/original_deployment_data.json', 'r') 184 | # # deployment_data = json.load(f) 185 | # get_original_trace_data() 186 | def get_original_items_data(): 187 | """ 188 | 拆分指标文件 189 | :param : 190 | :return: json 191 | """ 192 | result = {} 193 | db_oracle_11g_path = '../data/aiops_data_2020/2020_04_11/平台指标/db_oracle_11g.csv' 194 | dcos_docker_path = '../data/aiops_data_2020/2020_04_11/平台指标/dcos_docker.csv' 195 | os_linux_path = '../data/aiops_data_2020/2020_04_11/平台指标/os_linux.csv' 196 | with open(db_oracle_11g_path, 'r') as f: 197 | reader = csv.DictReader(f) 198 | for row in reader: 199 | curveId = row['itemid']+":"+row['name']+":"+row['bomc_id'] 200 | if curveId not in result: 201 | result[curveId] = {} 202 | result[curveId]['metricId'] = row['itemid'] 203 | result[curveId]['metricName'] = row['name'] 204 | result[curveId]['metricBelongTo'] = row['cmdb_id'] 205 | result[curveId]['metricBelongLevel'] = "service" 206 | result[curveId]['values'] = [] 207 | result[curveId]['timeStamps'] = [] 208 | result[curveId]['values'].append(row['value']) 209 | result[curveId]['timeStamps'].append(row['timestamp']) 210 | else: 211 | result[curveId]['values'].append(row['value']) 212 | result[curveId]['timeStamps'].append(row['timestamp']) 213 | with open(dcos_docker_path, 'r') as f: 214 | reader = csv.DictReader(f) 215 | for row in reader: 216 | curveId = row['itemid']+":"+row['name']+":"+row['bomc_id'] 217 | if curveId not in result: 218 | result[curveId] = {} 219 | result[curveId]['metricId'] = row['itemid'] 220 | result[curveId]['metricName'] = row['name'] 221 | result[curveId]['metricBelongTo'] = row['cmdb_id'] 222 | result[curveId]['metricBelongLevel'] = "docker" 223 | result[curveId]['values'] = [] 224 | result[curveId]['timeStamps'] = [] 225 | result[curveId]['values'].append(row['value']) 226 | result[curveId]['timeStamps'].append(row['timestamp']) 227 | else: 228 | result[curveId]['values'].append(row['value']) 229 | result[curveId]['timeStamps'].append(row['timestamp']) 230 | 231 | with open(os_linux_path, 'r') as f: 232 | reader = csv.DictReader(f) 233 | for row in reader: 234 | curveId = row['itemid']+":"+row['name']+":"+row['bomc_id'] 235 | if curveId not in result: 236 | 
186 | def get_original_items_data():
187 |     """
188 |     Split the platform metric CSVs into per-curve series,
189 |     keyed by itemid:name:bomc_id.
190 |     :return: None (writes origina_items.json)
191 |     """
192 |     result = {}
193 |     db_oracle_11g_path = '../data/aiops_data_2020/2020_04_11/平台指标/db_oracle_11g.csv'
194 |     dcos_docker_path = '../data/aiops_data_2020/2020_04_11/平台指标/dcos_docker.csv'
195 |     os_linux_path = '../data/aiops_data_2020/2020_04_11/平台指标/os_linux.csv'
196 |     with open(db_oracle_11g_path, 'r') as f:
197 |         reader = csv.DictReader(f)
198 |         for row in reader:
199 |             curveId = row['itemid']+":"+row['name']+":"+row['bomc_id']
200 |             if curveId not in result:
201 |                 result[curveId] = {}
202 |                 result[curveId]['metricId'] = row['itemid']
203 |                 result[curveId]['metricName'] = row['name']
204 |                 result[curveId]['metricBelongTo'] = row['cmdb_id']
205 |                 result[curveId]['metricBelongLevel'] = "service"
206 |                 result[curveId]['values'] = []
207 |                 result[curveId]['timeStamps'] = []
208 |                 result[curveId]['values'].append(row['value'])
209 |                 result[curveId]['timeStamps'].append(row['timestamp'])
210 |             else:
211 |                 result[curveId]['values'].append(row['value'])
212 |                 result[curveId]['timeStamps'].append(row['timestamp'])
213 |     with open(dcos_docker_path, 'r') as f:
214 |         reader = csv.DictReader(f)
215 |         for row in reader:
216 |             curveId = row['itemid']+":"+row['name']+":"+row['bomc_id']
217 |             if curveId not in result:
218 |                 result[curveId] = {}
219 |                 result[curveId]['metricId'] = row['itemid']
220 |                 result[curveId]['metricName'] = row['name']
221 |                 result[curveId]['metricBelongTo'] = row['cmdb_id']
222 |                 result[curveId]['metricBelongLevel'] = "docker"
223 |                 result[curveId]['values'] = []
224 |                 result[curveId]['timeStamps'] = []
225 |                 result[curveId]['values'].append(row['value'])
226 |                 result[curveId]['timeStamps'].append(row['timestamp'])
227 |             else:
228 |                 result[curveId]['values'].append(row['value'])
229 |                 result[curveId]['timeStamps'].append(row['timestamp'])
230 | 
231 |     with open(os_linux_path, 'r') as f:
232 |         reader = csv.DictReader(f)
233 |         for row in reader:
234 |             curveId = row['itemid']+":"+row['name']+":"+row['bomc_id']
235 |             if curveId not in result:
236 |                 result[curveId] = {}
237 |                 result[curveId]['metricId'] = row['itemid']
238 |                 result[curveId]['metricName'] = row['name']
239 |                 result[curveId]['metricBelongTo'] = row['cmdb_id']
240 |                 result[curveId]['metricBelongLevel'] = "host"
241 |                 result[curveId]['values'] = []
242 |                 result[curveId]['timeStamps'] = []
243 |                 result[curveId]['values'].append(row['value'])
244 |                 result[curveId]['timeStamps'].append(row['timestamp'])
245 |             else:
246 |                 result[curveId]['values'].append(row['value'])
247 |                 result[curveId]['timeStamps'].append(row['timestamp'])
248 |     save_path = '../data/aiops_data_2020/2020_04_11/origina_items.json'
249 |     with open(save_path, 'w') as f:
250 |         json.dump(result, f, indent=2, sort_keys=True, ensure_ascii=False)
251 |     pass
252 | 
253 | if __name__ == '__main__':
254 |     # get_original_items_data()
255 |     get_original_trace_data()
--------------------------------------------------------------------------------
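Two notes on this preprocessing script. First, the six per-call-type readers in get_original_trace_data differ only in which column carries the service name and whether serviceId can be built immediately; CSF spans leave serviceId as None, and the final loop resolves it by finding a child span whose pid equals the CSF span's id and adopting that child's cmdb_id (CSF spans with no such child are dropped). Second, the repetition invites a shared loader. A hedged refactoring sketch; load_trace_csv is a hypothetical helper, not part of the repository:

    import csv

    def load_trace_csv(path, result, name_field, prefix_cmdb_id):
        # name_field: 'serviceName' (csf/osb/fly_remote/remote_process) or 'dsName' (jdbc/local)
        # prefix_cmdb_id: True -> serviceId = "cmdb_id:name"; otherwise dsName itself, or None for CSF
        with open(path, 'r') as f:
            for row in csv.DictReader(f):
                tmp = {'id': row['id'], 'pid': row['pid'], 'cmdb_id': row['cmdb_id'],
                       'serviceType': row['callType'], 'serviceName': row[name_field],
                       'startTime': row['startTime']}
                if prefix_cmdb_id:
                    tmp['serviceId'] = row['cmdb_id'] + ':' + row[name_field]
                else:
                    tmp['serviceId'] = row[name_field] if name_field == 'dsName' else None  # CSF: resolved by the pid-matching pass
                result.setdefault(row['traceId'], []).append(tmp)

For example, load_trace_csv(trace_jdbc_path, result, 'dsName', False) would replace the whole jdbc block, and the pid-resolution pass would then run unchanged. The three near-identical metric readers in get_original_items_data could be collapsed the same way, parameterized by path and metricBelongLevel.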