├── effect_images
│   ├── 1.png
│   └── 结构示意图.png
├── service
│   ├── module_tools
│   │   ├── __pycache__
│   │   │   ├── input_data.cpython-37.pyc
│   │   │   ├── save_result.cpython-37.pyc
│   │   │   ├── identify_faultservice.cpython-37.pyc
│   │   │   └── diagnosis_faultservice.cpython-37.pyc
│   │   ├── diagnosis_faultservice.py
│   │   ├── save_result.py
│   │   ├── genarate_solutions.py
│   │   ├── identify_faultservice.py
│   │   └── input_data.py
│   ├── generate_solutions_service.py
│   ├── fault_diagnosis_service.py
│   └── web_service.py
├── config
│   ├── stop.txt
│   └── data_base_sql
├── requirements.txt
├── utils
│   ├── jaccard_api.py
│   ├── data_tools.py
│   ├── graph.py
│   ├── pageRank.py
│   ├── draw_graph_tool.py
│   ├── pcalg.py
│   └── process_aiops2020_data_to_original.py
├── dao
│   ├── es_dao.py
│   ├── neo4j_dao.py
│   └── db_dao.py
├── bean
│   ├── output_model.py
│   ├── input_model.py
│   └── save_model.py
├── .gitignore
├── app.py
├── demo
│   ├── aiops_2020_data_test.py
│   └── hadoop_data_test.py
├── README.md
└── LICENSE
/effect_images/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OS-ABC/AIOsp-Fault-Diagnosis/main/effect_images/1.png
--------------------------------------------------------------------------------
/effect_images/结构示意图.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OS-ABC/AIOsp-Fault-Diagnosis/main/effect_images/结构示意图.png
--------------------------------------------------------------------------------
/service/module_tools/__pycache__/input_data.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OS-ABC/AIOsp-Fault-Diagnosis/main/service/module_tools/__pycache__/input_data.cpython-37.pyc
--------------------------------------------------------------------------------
/service/module_tools/__pycache__/save_result.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OS-ABC/AIOsp-Fault-Diagnosis/main/service/module_tools/__pycache__/save_result.cpython-37.pyc
--------------------------------------------------------------------------------
/service/module_tools/__pycache__/identify_faultservice.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OS-ABC/AIOsp-Fault-Diagnosis/main/service/module_tools/__pycache__/identify_faultservice.cpython-37.pyc
--------------------------------------------------------------------------------
/service/module_tools/__pycache__/diagnosis_faultservice.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OS-ABC/AIOsp-Fault-Diagnosis/main/service/module_tools/__pycache__/diagnosis_faultservice.cpython-37.pyc
--------------------------------------------------------------------------------
/config/stop.txt:
--------------------------------------------------------------------------------
1 | *
2 | <
3 | >
4 | (
5 | )
6 | \n
7 | ,
8 | .
9 | -
10 | :
11 | /
12 | [
13 | ]
14 | $
15 | \t
16 | =
17 | ;
18 | java
19 | @
20 | +
21 | ...
22 | {
23 | }
24 | _
25 | &
26 | at
27 | main
28 | s
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | SQLAlchemy==1.3.18
2 | networkx==2.4
3 | pandas==1.0.5
4 | numpy==1.18.5
5 | Flask==1.1.2
6 | pytz==2020.1
7 | scipy==1.5.0
8 | elasticsearch==7.11.0
9 | gsq==0.1.6
10 | jieba==0.42.1
11 | py2neo==2021.0.1
12 |
--------------------------------------------------------------------------------
/utils/jaccard_api.py:
--------------------------------------------------------------------------------
1 | import re
2 | from copy import deepcopy
3 |
4 | import jieba
5 | import jieba.analyse
6 |
7 |
8 | def log_preprocess(log, paramregex, eraseRex):
9 | for currentRex in eraseRex:
10 | log = re.sub(currentRex, '', log)
11 | for currentRex in paramregex:
12 | log = re.sub(currentRex, '<*>', log)
13 | return log
14 |
15 |
16 | def generate_cidian_jaccard(exception_window_log_, stopkeyword):
17 | log_dic = []
18 | fenci = jieba.cut_for_search(exception_window_log_)
19 | for fc in fenci:
20 | if fc not in log_dic and fc not in stopkeyword and not re.search(' +', fc):
21 | log_dic.append(fc)
22 | return log_dic
23 |
--------------------------------------------------------------------------------
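Usage sketch for utils/jaccard_api.py (illustrative, not part of the repository): it preprocesses two log windows, tokenizes them with generate_cidian_jaccard, and computes their Jaccard similarity. The regex lists and stop words below are invented examples; the real stop words live in config/stop.txt.

    from utils.jaccard_api import log_preprocess, generate_cidian_jaccard

    # hypothetical preprocessing rules and stop words
    param_regex = [r'\d+\.\d+\.\d+\.\d+', r'0x[0-9a-f]+']   # mask IPs and hex addresses with <*>
    erase_regex = [r'\[INFO\]', r'\[ERROR\]']                # strip log-level tags
    stop_keywords = {'*', '<', '>', 'at', 'java'}

    log_a = log_preprocess('[ERROR] connect to 10.0.0.1 failed at java.net.Socket', param_regex, erase_regex)
    log_b = log_preprocess('[ERROR] connect to 10.0.0.2 timeout at java.net.Socket', param_regex, erase_regex)

    tokens_a = set(generate_cidian_jaccard(log_a, stop_keywords))
    tokens_b = set(generate_cidian_jaccard(log_b, stop_keywords))

    # Jaccard similarity = |A ∩ B| / |A ∪ B|
    union = tokens_a | tokens_b
    print(len(tokens_a & tokens_b) / len(union) if union else 0.0)

--------------------------------------------------------------------------------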
/dao/es_dao.py:
--------------------------------------------------------------------------------
1 | from elasticsearch import Elasticsearch
2 |
3 |
4 | class ESConnection:
5 | def __init__(self, hosts):
6 | self.hosts = hosts
7 | self.es = Elasticsearch(hosts=hosts)
8 |
9 | def query(self, index, query):
10 | data = self.es.search(body=query, index=index, params={"scroll": "10m", "size": 10000})
11 | result = data['hits']['hits']
12 | total = data['hits']['total']['value']
13 | scroll_id = data['_scroll_id']
14 |
15 | for i in range(0, int(total / 10000) + 1):
16 | query_scroll = self.es.scroll(scroll_id=scroll_id, params={"scroll": "1m"})['hits']['hits']
17 | result += query_scroll
18 |
19 | return result
--------------------------------------------------------------------------------
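Usage sketch for dao/es_dao.py (illustrative, not part of the repository): the host, index name and query body are placeholders; query() returns the accumulated hits from the initial search plus the follow-up scroll requests.

    from dao.es_dao import ESConnection

    # hypothetical Elasticsearch endpoint and index name
    es = ESConnection(hosts=["http://127.0.0.1:9200"])

    # any standard query DSL dict works; match_all is just an example
    hits = es.query(index="service-logs", query={"query": {"match_all": {}}})
    for hit in hits[:5]:
        print(hit["_source"])

--------------------------------------------------------------------------------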
/service/generate_solutions_service.py:
--------------------------------------------------------------------------------
1 | from dao.db_dao import DBDao
2 | from service.module_tools.genarate_solutions import GenetateSolutuons
3 | from service.module_tools.save_result import SaveResult
4 |
5 |
6 | def time_generate_logs_solutions():
7 | """
 8 |     Generate solutions for every root-cause log in the root-cause detail list that does not yet have a repair solution
9 | :return:
10 | """
11 | dbDao = DBDao()
12 | root_logs = dbDao.get_all_root_logs_noSolution()
13 | dbDao.db_close()
14 | result = None
15 | for root_log in root_logs:
16 | sorted_solutions = GenetateSolutuons.get_solutions_by_logDetail(root_log.detail)
17 | result = SaveResult.save_solutions(root_log.fault_id, root_log.causeOfFault, sorted_solutions)
18 | return result
19 |
20 |
21 |
22 |
--------------------------------------------------------------------------------
/bean/output_model.py:
--------------------------------------------------------------------------------
1 | class FaultServiceDetail:
2 | def __init__(self, faultId: int, serviceName: str, hostName: str, fault_root: str, exception_time: str):
3 | self.serviceName = serviceName
4 | self.hostName = hostName
5 | self.fault_root = fault_root
6 | self.exception_time = exception_time
7 | self.faultId = faultId
8 |
9 | def keys(self):
10 | '''
11 |         Called when dict(obj) is used on an instance; it defines the dictionary keys, whose values are fetched as obj['name'].
12 |         Plain objects do not support that kind of lookup, so __getitem__ is added below to enable it.
13 | '''
14 | return ('serviceName', 'hostName', 'fault_root', 'exception_time', 'faultId')
15 |
16 | def __getitem__(self, item):
17 | '''
18 |         Built-in hook: called when obj['name'] is used; the result returned here is the value.
19 | '''
20 | return getattr(self, item)
21 |
22 |
23 | if __name__ == '__main__':
24 | a = FaultServiceDetail(1, '2', '3', '4', '5')
25 | re = dict(a)
26 | print(re)
27 |
--------------------------------------------------------------------------------
/utils/data_tools.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import time
3 |
4 | import pytz
5 |
6 |
7 | def is_number(number):
8 | """
9 | Check whether this is a number (int, long, float, hex) .
10 |
11 |     Arguments:
12 | number: {string}, input string for number check.
13 | """
14 | try:
15 | float(number) # for int, long, float
16 | except ValueError:
17 | try:
18 | int(number, 16) # for possible hex
19 | except ValueError:
20 | return False
21 |
22 | return True
23 |
24 | # Convert a UTC time string (e.g. 2018-07-13T16:00:00Z) to a local Unix timestamp
25 | def utc_to_local(utc_time_str, utc_format='%Y-%m-%dT%H:%M:%S.%fZ'):
26 |     local_tz = pytz.timezone('Asia/Chongqing') # define the local timezone
27 |     local_format = "%Y-%m-%d %H:%M:%S" # define the local time format
28 |
29 |     utc_dt = datetime.datetime.strptime(utc_time_str, utc_format) # parse the UTC time string into a datetime.datetime
30 |     local_dt = utc_dt.replace(tzinfo=pytz.utc).astimezone(local_tz) # attach the UTC timezone, then switch timezones with astimezone: UTC => local
31 |     #time_str = local_dt.strftime(local_format) # format the datetime as a local-format string
32 |     #return int(time.mktime(time.strptime(time_str, local_format))) # convert the time tuple to a timestamp with mktime; time.strptime() yields the tuple form
33 | return int(time.mktime(local_dt.timetuple()))
--------------------------------------------------------------------------------
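Two illustrative calls for the helpers above (inputs invented, not from the repository):

    from utils.data_tools import is_number, utc_to_local

    print(is_number("3.14"))   # True  (parses as float)
    print(is_number("0x1A"))   # True  (parses as hex)
    print(is_number("ten"))    # False

    # the default format expects fractional seconds; the result is a Unix timestamp
    # computed in the Asia/Chongqing timezone
    print(utc_to_local("2020-04-11T08:00:00.000Z"))

--------------------------------------------------------------------------------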
/utils/graph.py:
--------------------------------------------------------------------------------
1 | class ServiceNode:
2 | def __init__(self,serviceId, serviceName = None,serviceType = None):
3 | self.serviceId = serviceId
4 | self.serviceName = serviceName
5 | self.serviceType = serviceType
6 | self.hostName = None
7 | self.containerName = None
8 | self.hostId = None
9 | self.containerId = None
10 | self.isException = 0
11 | self.childs = []
12 | def add_childs(self,service_id):
13 | self.childs.append(service_id)
14 |
15 | class ExceptionDataNode:
16 | def __init__(self, id, nodeType):
17 | self.id = id
18 | self.nodeType = nodeType
19 | self.name = None
20 | self.detail = None
21 | self.belongTo = None
22 | self.exceptionTime = None
23 | self.units = None
24 | self.childs = []
25 |
26 | def add_childs(self, id):
27 | self.childs.append(id)
28 | class Graph:
29 | def __init__(self, nodes, edges):
30 | self.nodes = nodes
31 | self.edges = edges
32 | self.generate_invoke_graph_consturct()
33 |
34 | def generate_invoke_graph_consturct(self):
35 | """
36 |         Build the graph structure required by the graph-traversal algorithm
37 |         :param nodes: {serviceId: ServiceNode, serviceId: ServiceNode}
38 |         :param edges: [[serviceId, serviceId], [serviceId, serviceId]]
39 |         :return: graph
40 | """
41 | graph = {}
42 | for edge in self.edges:
43 | for i in edge:
44 | if i not in graph.keys():
45 | node = self.nodes[i]
46 | graph[i] = node
47 | if edge[1] not in graph[edge[0]].childs:
48 | graph[edge[0]].add_childs(edge[1])
49 |         # add back isolated nodes that appear in no edge
50 | for key, node in self.nodes.items():
51 | if key not in graph:
52 | graph[key] = node
53 | self.nodes = graph
--------------------------------------------------------------------------------
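A small construction sketch for the Graph class above (the service ids and edge are invented); it follows the nodes/edges shapes described in the generate_invoke_graph_consturct docstring:

    from utils.graph import ServiceNode, Graph

    # nodes maps serviceId -> ServiceNode, edges is a list of [caller, callee] pairs
    nodes = {
        "svc_a": ServiceNode("svc_a", serviceName="frontend"),
        "svc_b": ServiceNode("svc_b", serviceName="order"),
        "svc_c": ServiceNode("svc_c", serviceName="db_proxy"),  # isolated node
    }
    edges = [["svc_a", "svc_b"]]

    g = Graph(nodes, edges)
    print(g.nodes["svc_a"].childs)   # ['svc_b']
    print("svc_c" in g.nodes)        # True, isolated nodes are added back by the completion step

--------------------------------------------------------------------------------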
/.gitignore:
--------------------------------------------------------------------------------
1 | # Logs
2 | logs
3 | *.log
4 | npm-debug.log*
5 | yarn-debug.log*
6 | yarn-error.log*
7 | lerna-debug.log*
8 |
9 | # Diagnostic reports (https://nodejs.org/api/report.html)
10 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
11 |
12 | # Runtime data
13 | pids
14 | *.pid
15 | *.seed
16 | *.pid.lock
17 |
18 | # Directory for instrumented libs generated by jscoverage/JSCover
19 | lib-cov
20 |
21 | # Coverage directory used by tools like istanbul
22 | coverage
23 | *.lcov
24 |
25 | # nyc test coverage
26 | .nyc_output
27 |
28 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
29 | .grunt
30 |
31 | # Bower dependency directory (https://bower.io/)
32 | bower_components
33 |
34 | # node-waf configuration
35 | .lock-wscript
36 |
37 | # Compiled binary addons (https://nodejs.org/api/addons.html)
38 | build/Release
39 |
40 | # Dependency directories
41 | node_modules/
42 | jspm_packages/
43 |
44 | # TypeScript v1 declaration files
45 | typings/
46 |
47 | # TypeScript cache
48 | *.tsbuildinfo
49 |
50 | # Optional npm cache directory
51 | .npm
52 |
53 | # Optional eslint cache
54 | .eslintcache
55 |
56 | # Microbundle cache
57 | .rpt2_cache/
58 | .rts2_cache_cjs/
59 | .rts2_cache_es/
60 | .rts2_cache_umd/
61 |
62 | # Optional REPL history
63 | .node_repl_history
64 |
65 | # Output of 'npm pack'
66 | *.tgz
67 |
68 | # Yarn Integrity file
69 | .yarn-integrity
70 |
71 | # dotenv environment variables file
72 | .env
73 | .env.test
74 |
75 | # parcel-bundler cache (https://parceljs.org/)
76 | .cache
77 |
78 | # Next.js build output
79 | .next
80 |
81 | # Nuxt.js build / generate output
82 | .nuxt
83 | dist
84 |
85 | # Gatsby files
86 | .cache/
87 | # Comment in the public line in if your project uses Gatsby and *not* Next.js
88 | # https://nextjs.org/blog/next-9-1#public-directory-support
89 | # public
90 |
91 | # vuepress build output
92 | .vuepress/dist
93 |
94 | # Serverless directories
95 | .serverless/
96 |
97 | # FuseBox cache
98 | .fusebox/
99 |
100 | # DynamoDB Local files
101 | .dynamodb/
102 |
103 | # TernJS port file
104 | .tern-port
105 |
106 | /.idea/
107 | */__pycache__
108 |
--------------------------------------------------------------------------------
/dao/neo4j_dao.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from py2neo import Graph
4 |
5 |
6 | class GraphDao:
7 |
8 | def __init__(self):
9 | self.g = Graph(
10 |             host="127.0.0.1", # IP address of the server hosting neo4j (obtainable via ifconfig)
11 |             http_port=7474, # port the neo4j server listens on
12 |             user="neo4j", # database user name; defaults to neo4j if it was never changed
13 | password="neo4j")
14 | # self.num_limit = 20
15 |
16 | def execute_sql(self, sql):
17 | answer = None
18 | try:
19 | answer = self.g.run(sql).data()
20 |         except Exception:
21 | logging.error("execute sql failed, sql: {0}".format(sql))
22 | return answer
23 |
24 | def get_all__entities(self):
25 | sql = 'MATCH (n) return n'
26 | result = self.execute_sql(sql)
27 | return [i['n'] for i in result]
28 |
29 |     ## Fault-repair knowledge graph
30 |
31 | # def get_all_log_entities(self):
32 | # sql = 'MATCH (n:log) return n'
33 | # result = self.execute_sql(sql)
34 | # return [i['n'] for i in result]
35 |
36 |     # get all log nodes in the graph
37 | def get_all_log_entities(self):
38 | result = self.g.run("match (n:log) return n").data()
39 | return result
40 |
41 |     # get the list of fault nodes associated with a log
42 | def get_fault_entity_by_log(self, log_name):
43 | sql = 'MATCH (x:fault)-[r:has_log]->(y:log) where y.name = "{0}" return x'.format(
44 | log_name)
45 | result = self.execute_sql(sql)
46 | return [i['x'] for i in result]
47 |     # get the list of solutions for a fault
48 | def get_solutions_by_fault(self, fault_name):
49 | sql = 'MATCH (x:fault)-[r:has_solution]->(y:solution) where x.name = "{0}" return y'.format(
50 | fault_name)
51 | result = self.execute_sql(sql)
52 | return [i['y'] for i in result]
53 |
54 |     # get the list of reasons for a fault
55 | def get_reasons_by_fault(self, fault_name):
56 | sql = 'MATCH (x:fault)-[r:has_reason]->(y:reason) where x.name = "{0}" return y'.format(
57 | fault_name)
58 | result = self.execute_sql(sql)
59 | return [i['y'] for i in result]
60 |
61 |     # get the list of solutions for a reason
62 | def get_solutions_by_reason(self, reason_name):
63 | sql = 'MATCH (x:reason)-[r:has_solution]->(y:solution) where x.name = "{0}" return y'.format(
64 | reason_name)
65 | result = self.execute_sql(sql)
66 | return [i['y'] for i in result]
--------------------------------------------------------------------------------
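An illustrative call sequence for GraphDao (assumes a local neo4j instance already populated with log/fault/solution nodes as referenced by the Cypher queries above; the log name is a placeholder):

    from dao.neo4j_dao import GraphDao

    dao = GraphDao()

    # list the log entities stored in the fault-repair knowledge graph
    print(len(dao.get_all_log_entities()))

    # walk from a hypothetical log name to its faults and their solutions
    for fault in dao.get_fault_entity_by_log("connection refused"):
        for solution in dao.get_solutions_by_fault(fault["name"]):
            print(fault["name"], "->", solution["name"])

--------------------------------------------------------------------------------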
/bean/input_model.py:
--------------------------------------------------------------------------------
1 | class DeploymentDataEntry:
2 | def __init__(self, serviceInstanceId:str, serviceName: str, hostId: str = None,hostName:str = None,containerId:str =None,containerName:str= None):
3 | self.serviceInstanceId = serviceInstanceId
4 | self.serviceName = serviceName
5 | self.hostId = hostId
6 | self.hostName = hostName
7 | self.containerId = containerId
8 | self.containerName = containerName
9 |
10 | class TraceDataEntry:
11 | def __init__(self, id:str, pid: str, serviceId: str,traceId:str,serviceName:str = None,serviceType:str =None,startTime:str= None):
12 | self.id = id
13 | self.pid = pid
14 | self.serviceId = serviceId
15 | self.serviceName = serviceName
16 | self.serviceType = serviceType
17 | self.startTime = startTime
18 | self.traceId = traceId
19 |
20 | class OriginalMetricEntry:
21 | def __init__(self, metricId:str, metricName: str, timestamp:str, value: float, metricBelongTo:str, units:str = None,metricBelongLevel:str= None):
22 | self.metricId = metricId
23 | self.metricName = metricName
24 | self.timestamp = timestamp
25 | self.value = value
26 | self.units = units
27 | self.metricBelongTo = metricBelongTo
28 | self.metricBelongLevel = metricBelongLevel
29 |
30 | class OriginalLogEntry:
31 | def __init__(self, logId:str, timestamp:str, logMessage: str, logBelongTo:str, logLevel: str = None, logBelongLevel:str= None):
32 | self.logId = logId
33 | self.timestamp = timestamp
34 | self.logMessage = logMessage
35 | self.logLevel = logLevel
36 | self.logBelongTo = logBelongTo
37 | self.logBelongLevel = logBelongLevel
38 |
39 | class ExceptionMetricEntry:
40 | def __init__(self, startTime:str, endTime:str, metricId: str, metricName:str, value: float, metricBelongTo:str, units:str = None,metricBelongLevel:str= None):
41 | self.startTime = startTime
42 | self.endTime = endTime
43 | self.metricId = metricId
44 | self.metricName = metricName
45 | self.value = value
46 | self.metricBelongTo = metricBelongTo
47 | self.units = units
48 | self.metricBelongLevel = metricBelongLevel
49 |
50 | class ExceptionLogEntry:
51 | def __init__(self, startTime:str, endTime:str, logId: str, logBelongTo:str, logExceptionSegment: str, logBelongLevel:str= None):
52 | self.startTime = startTime
53 | self.endTime = endTime
54 | self.logId = logId
55 | self.logExceptionSegment = logExceptionSegment
56 | self.logBelongTo = logBelongTo
57 | self.logBelongLevel = logBelongLevel
--------------------------------------------------------------------------------
/utils/pageRank.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # from pygraph.classes.digraph import digraph
4 | import copy
5 |
6 | import networkx as nx
7 |
8 | class PRIterator:
 9 |     __doc__ = '''Compute the PageRank (PR) value of every node in a graph'''
10 |
11 | def __init__(self, dg):
12 |         self.damping_factor = 0.85 # damping factor, i.e. α
13 |         self.max_iterations = 100 # maximum number of iterations
14 |         self.min_delta = 0.00001 # convergence threshold for stopping the iteration, i.e. ϵ
15 | self.graph = copy.deepcopy(dg)
16 |
17 | def page_rank(self):
18 |         # first, give every node without out-links an out-link to all nodes
19 | for node in self.graph.nodes():
20 | if len(list(self.graph.neighbors(node))) == 0:
21 | for node2 in self.graph.nodes():
22 | nx.DiGraph.add_edge(self.graph, node, node2)
23 |
24 | nodes = self.graph.nodes()
25 | graph_size = len(nodes)
26 |
27 | if graph_size == 0:
28 | return {}
29 |         page_rank = dict.fromkeys(nodes, 1.0 / graph_size) # assign every node an initial PR value
30 |         damping_value = (1.0 - self.damping_factor) / graph_size # the (1−α)/N term of the formula
31 |
32 | flag = False
33 | for i in range(self.max_iterations):
34 | change = 0
35 | for node in nodes:
36 | rank = 0
37 |                 for incident_page in self.graph.predecessors(node): # iterate over all pages linking into this node
38 | rank += self.damping_factor * (page_rank[incident_page] / len(list(self.graph.neighbors(incident_page))))
39 | rank += damping_value
40 |                 change += abs(page_rank[node] - rank) # absolute difference
41 | page_rank[node] = rank
42 |
43 | # print("This is NO.%s iteration" % (i + 1))
44 | # print(page_rank)
45 |
46 | if change < self.min_delta:
47 | flag = True
48 | break
49 | # if flag:
50 | # print("finished in %s iterations!" % node)
51 | # else:
52 | # print("finished out of 100 iterations!")
53 | return page_rank
54 |
55 |
56 | if __name__ == '__main__':
57 | dg = nx.DiGraph()
58 |
59 | dg.add_nodes_from(["0", "1", "2", "3","4"])
60 |
61 | dg.add_edge("0", "1")
62 | dg.add_edge("1", "0")
63 | dg.add_edge("0", "2")
64 | dg.add_edge("2", "0")
65 | dg.add_edge("0", "4")
66 | dg.add_edge("4", "0")
67 | dg.add_edge("2", "4")
68 | dg.add_edge("4", "2")
69 | # dg.add_edge("A", "C")
70 | # dg.add_edge("A", "D")
71 | # dg.add_edge("C", "A")
72 | # dg.add_edge("C", "B")
73 | # dg.add_edge("D", "B")
74 | # dg.add_edge("B", "D")
75 | # dg.add_edge("E", "A")
76 |
77 | pr = PRIterator(dg)
78 | page_ranks = pr.page_rank()
79 |
80 | print("The final page rank is\n", page_ranks)
81 |
82 |
--------------------------------------------------------------------------------
/config/data_base_sql:
--------------------------------------------------------------------------------
1 | create table exception_data_dependency_graph
2 | (
3 | id int auto_increment
4 | primary key,
5 | fault_id int not null,
6 | graph_json text not null,
7 | create_time datetime default CURRENT_TIMESTAMP not null,
8 | update_time datetime default CURRENT_TIMESTAMP not null
9 | )
10 | comment 'Service exception data dependency graph storage table';
11 |
12 | create table fault_service
13 | (
14 | id int auto_increment
15 | primary key,
16 |     fault_service_id varchar(255) not null comment 'Fault service ID',
17 | fault_service_name varchar(255) null,
18 | fault_service_type varchar(255) null,
19 | host_name varchar(255) null,
20 | exception_time datetime null,
21 | process_state int default 0 not null,
22 | create_time datetime default CURRENT_TIMESTAMP not null,
23 | update_time datetime default CURRENT_TIMESTAMP not null
24 | )
25 | comment 'Fault service detail table';
26 |
27 | create table fault_service_root_detail
28 | (
29 | id int auto_increment
30 | primary key,
31 | fault_id int not null,
32 |     causeOfFault varchar(255) not null comment 'Root cause ID',
33 |     causeName varchar(255) not null comment 'Root cause name',
34 |     detail text null comment 'Root cause detail',
35 |     has_solution int default 0 not null,
36 |     type int default 0 null comment '0: metric, 1: log',
37 |     `rank` int not null comment 'Root cause rank',
38 | create_time datetime default CURRENT_TIMESTAMP not null,
39 | update_time datetime default CURRENT_TIMESTAMP not null
40 | )
41 | comment 'Fault service root cause detail table';
42 |
43 | create table fault_service_solution
44 | (
45 | id int auto_increment
46 | primary key,
47 | fault_id int not null,
48 | root_log_id varchar(255) not null,
49 | fault_reason text null,
50 | fault_solution text not null,
51 | `rank` int not null,
52 | create_time datetime default CURRENT_TIMESTAMP not null,
53 | update_time datetime default CURRENT_TIMESTAMP not null
54 | )
55 | comment 'Fault service repair solution detail table';
56 |
57 | create table service_dependency_graph
58 | (
59 | id int auto_increment
60 | primary key,
61 | fault_id int not null,
62 | graph_json text not null,
63 | create_time datetime default CURRENT_TIMESTAMP not null,
64 | update_time datetime default CURRENT_TIMESTAMP not null
65 | )
66 | comment 'Service dependency graph storage table';
67 |
68 |
--------------------------------------------------------------------------------
/bean/save_model.py:
--------------------------------------------------------------------------------
1 | from sqlalchemy import Column, Integer, Text, DateTime, Date, String,text
2 | from sqlalchemy.ext.declarative import declarative_base
3 |
4 | Base = declarative_base()
5 |
6 |
7 | class FaultService(Base):
8 | __tablename__ = 'fault_service'
9 | id = Column(Integer, primary_key=True)
10 | fault_service_id = Column(String)
11 | fault_service_name = Column(String)
12 | fault_service_type = Column(String)
13 | host_name = Column(String)
14 | exception_time = Column(DateTime)
15 | process_state = Column(Integer)
16 | create_time = Column(DateTime,server_default=text('CURRENT_TIMESTAMP'))
17 | update_time = Column(DateTime,server_default=text('CURRENT_TIMESTAMP'))
18 | __mapper_args__ = {
19 | "order_by": create_time.desc()
20 | }
21 |
22 | def to_dict(self):
23 | return {c.name: getattr(self, c.name, None) for c in self.__table__.columns}
24 |
25 | class FaultServiceRoot(Base):
26 | __tablename__ = 'fault_service_root_detail'
27 | id = Column(Integer, primary_key=True)
28 | fault_id = Column(Integer)
29 | causeOfFault = Column(String)
30 | causeName = Column(String)
31 | detail = Column(Text)
32 | has_solution = Column(Integer)
33 |     type = Column(Integer) # 0: metric, 1: log
34 | rank = Column(Integer) #1 2 3
35 | create_time = Column(DateTime,server_default=text('CURRENT_TIMESTAMP'))
36 | update_time = Column(DateTime,server_default=text('CURRENT_TIMESTAMP'))
37 | __mapper_args__ = {
38 | "order_by": create_time.desc()
39 | }
40 |
41 | def to_dict(self):
42 | return {c.name: getattr(self, c.name, None) for c in self.__table__.columns}
43 |
44 | class ServiceDependencyGraph(Base):
45 | __tablename__ = 'service_dependency_graph'
46 | id = Column(Integer, primary_key=True)
47 | fault_id = Column(Integer)
48 | graph_json = Column(Text)
49 | create_time = Column(DateTime,server_default=text('CURRENT_TIMESTAMP'))
50 | update_time = Column(DateTime,server_default=text('CURRENT_TIMESTAMP'))
51 | __mapper_args__ = {
52 | "order_by": create_time.desc()
53 | }
54 |
55 | def to_dict(self):
56 | return {c.name: getattr(self, c.name, None) for c in self.__table__.columns}
57 | class ExceptionDataDependencyGraph(Base):
58 | __tablename__ = 'exception_data_dependency_graph'
59 | id = Column(Integer, primary_key=True)
60 | fault_id = Column(Integer)
61 | graph_json = Column(Text)
62 | create_time = Column(DateTime,server_default=text('CURRENT_TIMESTAMP'))
63 | update_time = Column(DateTime,server_default=text('CURRENT_TIMESTAMP'))
64 | __mapper_args__ = {
65 | "order_by": create_time.desc()
66 | }
67 |
68 | def to_dict(self):
69 | return {c.name: getattr(self, c.name, None) for c in self.__table__.columns}
70 |
71 | class FaultServiceSolution(Base):
72 | __tablename__ = 'fault_service_solution'
73 | id = Column(Integer, primary_key=True)
74 | fault_id = Column(Integer)
75 | root_log_id = Column(String)
76 | fault_reason = Column(Text)
77 | fault_solution = Column(Text)
78 | rank = Column(Integer) # 1 2 3
79 | create_time = Column(DateTime,server_default=text('CURRENT_TIMESTAMP'))
80 | update_time = Column(DateTime,server_default=text('CURRENT_TIMESTAMP'))
81 | __mapper_args__ = {
82 | "order_by": create_time.desc()
83 | }
84 |
85 | def to_dict(self):
86 | return {c.name: getattr(self, c.name, None) for c in self.__table__.columns}
87 |
--------------------------------------------------------------------------------
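A minimal sketch for creating these tables from the ORM models instead of the raw DDL in config/data_base_sql (the connection string is taken from dao/db_dao.py and assumes a local MySQL instance; PyMySQL is implied by that string but not pinned in requirements.txt):

    from sqlalchemy import create_engine

    from bean.save_model import Base

    engine = create_engine('mysql+pymysql://root:root1234@127.0.0.1:3306/fault_result_solution')

    # creates fault_service, fault_service_root_detail, service_dependency_graph,
    # exception_data_dependency_graph and fault_service_solution if they do not exist
    Base.metadata.create_all(engine)

--------------------------------------------------------------------------------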
/service/fault_diagnosis_service.py:
--------------------------------------------------------------------------------
1 |
2 | from service.module_tools.diagnosis_faultservice import DiagnosisFaultService
3 | from service.module_tools.identify_faultservice import IdentifyFaultService
4 | from service.module_tools.input_data import InputData
5 | from service.module_tools.save_result import SaveResult
6 | from utils.graph import Graph
7 |
8 |
9 | def fault_diagmosis(deploymentData, traceData, original_metricData, original_logData, exception_metricData,
10 | exception_logData):
11 |     # input data
12 | data = InputData(deploymentData, traceData, original_metricData, original_logData, exception_metricData,
13 | exception_logData)
14 |     # identify the fault services
15 | final_root_services, service_invoke_graph = get_root_services(data)
16 |     # diagnose each fault service
17 | services_diagnisis_results = get_fault_services_roots(data,final_root_services,service_invoke_graph)
18 |     # store the diagnosis results
19 | save_fault_root_cause_diagnosis_result(service_invoke_graph,final_root_services,services_diagnisis_results)
20 | pass
21 |
22 | def get_root_services(data):
23 | """
24 |     Entry point of the fault-service identification sub-module
25 |     :param data: InputData instance
26 |     :return: fault service list ({serviceId: score}) and service dependency graph (Graph instance, nodes: {serviceId: ServiceNode, serviceId: ServiceNode}, edges: [[serviceId, serviceId], [serviceId, serviceId]])
27 | """
28 | nodes, edges, traverse_initial_list = IdentifyFaultService.generate_service_invoke_graph(data.organization_traceObjData_by_traceId())
29 | nodes = IdentifyFaultService.completion_serviceNode_deploymentData(data.organization_deploymentObjData_by_sviid(), nodes)
30 | nodes = IdentifyFaultService.set_service_exception_info(nodes,data)
31 | service_invoke_graph = Graph(nodes, edges)
32 | # final_root_services = get_fault_services_list(service_invoke_graph,traverse_initial_list)
33 | final_root_services = IdentifyFaultService.get_fault_services_list_PR(service_invoke_graph,traverse_initial_list)
34 |     print('Fault service list: {}'.format(final_root_services))
35 | return final_root_services, service_invoke_graph
36 |
37 | def get_fault_services_roots(data,final_root_services,service_invoke_graph):
38 | """
39 |     Entry point for diagnosing all fault services
40 |     :param data: InputData instance
41 |     :param final_root_services: {serviceId: score, serviceId: score}
42 |     :param service_invoke_graph: nodes: {}, edges: []
43 | :return:
44 | """
45 | services_diagnisis_results = dict()
46 | for i in final_root_services:
47 | serviceNode = service_invoke_graph.nodes[i]
48 | services_diagnisis_results[serviceNode.serviceId] = dict()
49 | falut_root_dict, final_exception_data_graph = DiagnosisFaultService.get_servcie_fault_causes(serviceNode, data)
50 |         # print the result
51 | falut_root_dict_name = dict()
52 | for root_id,rootValue in falut_root_dict.items():
53 | rootNode = final_exception_data_graph.nodes[root_id]
54 | falut_root_dict_name[rootNode.name] = rootValue
55 |         print('Root causes of service {0}: {1}'.format(serviceNode.serviceName, falut_root_dict_name))
56 |
57 | services_diagnisis_results[serviceNode.serviceId]['falut_root_dict'] = falut_root_dict
58 | services_diagnisis_results[serviceNode.serviceId]['final_exception_data_graph'] = final_exception_data_graph
59 | return services_diagnisis_results
60 |
61 | def save_fault_root_cause_diagnosis_result(service_invoke_graph,final_root_services,services_diagnisis_results):
62 | SaveResult.save(service_invoke_graph,final_root_services,services_diagnisis_results)
63 |
64 |
65 |
66 |
67 |
--------------------------------------------------------------------------------
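A hedged end-to-end sketch of how fault_diagmosis might be driven (the six lists mirror the entry classes in bean/input_model.py and the dicts built by demo/aiops_2020_data_test.py; all field values below are invented, and a running MySQL instance is required for the save step):

    from service.fault_diagnosis_service import fault_diagmosis

    deployment_data = [{'serviceInstanceId': 'svc_a', 'serviceName': 'frontend', 'hostId': 'os_001',
                        'hostName': 'os_001', 'containerId': None, 'containerName': None}]
    trace_data = [{'id': 'span_1', 'pid': -1, 'serviceId': 'svc_a', 'serviceName': 'frontend',
                   'serviceType': 'OSB', 'startTime': '1586563200000', 'traceId': 'trace_1'}]
    original_metric_data = [{'metricId': 'm_1', 'metricName': 'cpu_used', 'timestamp': '1586563200',
                             'value': 95.0, 'units': '%', 'metricBelongTo': 'os_001',
                             'metricBelongLevel': 'host'}]
    original_log_data = []
    exception_metric_data = [{'startTime': '1586563200', 'endTime': '1586563260', 'metricId': 'm_1',
                              'metricName': 'cpu_used', 'value': 95.0, 'units': '%',
                              'metricBelongTo': 'os_001', 'metricBelongLevel': 'host'}]
    exception_log_data = []

    fault_diagmosis(deployment_data, trace_data, original_metric_data, original_log_data,
                    exception_metric_data, exception_log_data)

--------------------------------------------------------------------------------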
/service/web_service.py:
--------------------------------------------------------------------------------
1 | from bean.output_model import FaultServiceDetail
2 | from dao.db_dao import DBDao
3 | from service.module_tools.genarate_solutions import GenetateSolutuons
4 | from service.module_tools.save_result import SaveResult
5 |
6 |
7 | def get_fault_service_list():
8 | """
 9 |     Query all fault-service records, sorted by time from newest to oldest, split into unprocessed and processed lists
10 | :return:
11 | """
12 | dbDao = DBDao()
13 | fault_service_list_unprocess = dbDao.select_all_fault_service_detail_by_processState(0)
14 | fault_service_list_process = dbDao.select_all_fault_service_detail_by_processState(1)
15 | fault_service_detail_list_unprocess = list()
16 | fault_service_detail_list_process = list()
17 | for fault_service in fault_service_list_unprocess:
18 | root = dbDao.select_rank1_faultserviceroot_by_faultid(fault_service.id)
19 | if root:
20 | faultServiceDetail = FaultServiceDetail(fault_service.id, fault_service.fault_service_name,
21 | fault_service.host_name, root.causeName,
22 | fault_service.exception_time)
23 | fault_service_detail_list_unprocess.append(faultServiceDetail)
24 |
25 | for fault_service in fault_service_list_process:
26 | root = dbDao.select_rank1_faultserviceroot_by_faultid(fault_service.id)
27 | if root:
28 | faultServiceDetail = FaultServiceDetail(fault_service.id, fault_service.fault_service_name,
29 | fault_service.host_name, root.causeName,
30 | fault_service.exception_time)
31 | fault_service_detail_list_process.append(faultServiceDetail)
32 | dbDao.db_close()
33 | return [dict(i) for i in fault_service_detail_list_unprocess], [dict(i) for i in fault_service_detail_list_process]
34 |
35 |
36 | # def get_fault_service_detail(fault_id):
37 | # """
38 | #     Query the diagnosis detail of one fault service; the response contains the service dependency graph at diagnosis time and the detailed information of the fault service
39 | # :param fault_id:
40 | # :return:
41 | # """
42 |
43 | # """
44 | #     Query the fault detail by faultId
45 | # """
46 | #
47 | #
48 | # def get_fault_id(fault_id):
49 | # fault = None
50 | # db = get_session()
51 | # if fault_id:
52 | # fault = db.query(Fault).filter(Fault.id == fault_id).one()
53 | # db.close()
54 | # return fault.to_dict()
55 |
56 |
57 | def get_service_invoke_graph(fault_id):
58 | """
59 |     Query the service dependency graph for a given fault-service id
60 | :param fault_id:
61 | :return:
62 | """
63 | service_invoke_graph_json = None
64 | dbDao = DBDao()
65 | if fault_id:
66 | service_invoke_graph_json = dbDao.select_service_invoke_graph_by_faultid(fault_id)
67 | dbDao.db_close()
68 |     if service_invoke_graph_json is None:
69 | return None
70 | return service_invoke_graph_json.to_dict()
71 |
72 |
73 | def get_exception_data_dependency_graph(fault_id, service_id):
74 | """
75 |     Query the service exception data dependency graph by fault_id
76 | :param fault_id:
77 | :param service_id:
78 | :return:
79 | """
80 | exception_data_dependency_graph_json = None
81 | dbDao = DBDao()
82 | if fault_id and service_id:
83 | exception_data_dependency_graph_json = dbDao.select_exception_data_dependency_graph_by_faultid(fault_id)
84 | dbDao.db_close()
85 | return exception_data_dependency_graph_json.to_dict()
86 |
87 |
88 | def get_solutions_by_log(fault_id, log_id, log_detail):
89 | """
90 |     Get the solutions for a root-cause log
91 | :param fault_id:
92 | :param log_id:
93 |     :param log_detail:
94 | :return:
95 | """
96 | dbDao = DBDao()
97 | root_log = dbDao.get_root_log_by_logid_and_faultid(fault_id,log_id)
98 | if root_log.has_solution == 0:
99 | sorted_solutions = GenetateSolutuons.get_solutions_by_logDetail(log_detail)
100 | result = SaveResult.save_solutions(root_log.fault_id, root_log.causeOfFault, sorted_solutions)
101 | solutions = dbDao.select_solutions_by_logid_and_faultid(fault_id,log_id)
102 | dbDao.db_close()
103 | return [i.to_dict() for i in solutions]
104 |
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from copy import deepcopy
3 |
4 | from flask import Flask, request, json, jsonify,make_response
5 |
6 | from service.web_service import get_fault_service_list, get_service_invoke_graph, get_exception_data_dependency_graph, \
7 | get_solutions_by_log
8 |
9 | app = Flask(__name__, static_folder='static', template_folder='templates')
10 |
11 | # @app.route("/fault_diagnosis", methods=['POST']) # request method is POST
12 | # def fault_diagnosis():
13 | # data = request.data
14 | # j_data = json.loads(data)
15 | # sys_rca(j_data)
16 |
17 | result_response = {'code': 1, 'message': 'success', 'data': None}
18 |
19 | @app.route("/fault_service_list", methods=['GET']) # request method is GET
20 | def fault_service_list():
21 | response = deepcopy(result_response)
22 | try:
23 | fault_list_unprocess, fault_list_process = get_fault_service_list()
24 | response['data'] = {'fault_list_unprocess': fault_list_unprocess, 'fault_list_process': fault_list_process}
25 | except Exception as e:
26 | response['code'] = 0
27 | response['message'] = str(e)
28 |     res = make_response(jsonify(response)) # build the response body
29 |     # res.status = '200' # set the status code
30 |     res.headers['Access-Control-Allow-Origin'] = "*" # allow cross-origin requests
31 | res.headers['Access-Control-Allow-Methods'] = 'PUT,GET,POST,DELETE'
32 | return res
33 |
34 |
35 | @app.route("/fault_service_invoke_graph", methods=['GET']) # request method is GET
36 | def fault_service_invoke_graph():
37 | response = deepcopy(result_response)
38 | try:
39 | fault_id = request.args['fault_id']
40 | # input_data = json.loads(data)
41 | service_invoke_graph_json = get_service_invoke_graph(fault_id)
42 | # service_invoke_graph = json.loads(service_invoke_graph_json)
43 | response['data'] = {'service_invoke_graph': service_invoke_graph_json}
44 | except Exception as e:
45 | response['code'] = 0
46 | response['message'] = str(e)
47 | return jsonify(response)
48 |
49 |
50 | @app.route("/exception_data_dependency_graph", methods=['GET']) # request method is GET
51 | def exception_data_dependency_graph():
52 | response = deepcopy(result_response)
53 | try:
54 | fault_id = request.args['fault_id']
55 | service_id = request.args['service_id']
56 | # input_data = json.loads(data)
57 | log_metric_graph_json = get_exception_data_dependency_graph(fault_id, service_id)
58 | # log_metric_graph = json.loads(log_metric_graph_json)
59 | response['data'] = {'log_metric_graph': log_metric_graph_json}
60 | except Exception as e:
61 | response['code'] = 0
62 | response['message'] = str(e)
63 |     res = make_response(jsonify(response)) # build the response body
64 |     # res.status = '200' # set the status code
65 |     res.headers['Access-Control-Allow-Origin'] = "*" # allow cross-origin requests
66 | res.headers['Access-Control-Allow-Methods'] = 'PUT,GET,POST,DELETE'
67 | return res
68 |
69 |
70 | @app.route("/root_log_solutions", methods=['POST']) # request method is POST
71 | def fault_logDetail_solutions():
72 | response = deepcopy(result_response)
73 | try:
74 | fault_id = request.form['fault_id']
75 | log_id = request.form['log_id']
76 | log_detail = request.form['log_detail']
77 | solutions = get_solutions_by_log(fault_id,log_id,log_detail)
78 | response['data'] = {'fault_id': fault_id, 'log_id': log_id,'log_detail':log_detail,'solutions':solutions}
79 | except Exception as e:
80 | response['code'] = 0
81 | response['message'] = str(e)
82 |     res = make_response(jsonify(response)) # build the response body
83 |     # res.status = '200' # set the status code
84 |     res.headers['Access-Control-Allow-Origin'] = "*" # allow cross-origin requests
85 | res.headers['Access-Control-Allow-Methods'] = 'PUT,GET,POST,DELETE'
86 | return res
87 |
88 |
89 | if __name__ == '__main__':
90 | app.run(host='0.0.0.0', port=5000, debug=True)
91 | # 跨域支持
92 | # def after_request(response):
93 | # # JS前端跨域支持
94 | # response.headers['Cache-Control'] = 'no-cache'
95 | # response.headers['Access-Control-Allow-Origin'] = '*'
96 | # return response
97 | #
98 | # app.after_request(after_request)
99 |
--------------------------------------------------------------------------------
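A small client sketch for the endpoints above, using only the standard library (assumes the Flask app is running locally on port 5000; the fault_id, log_id and log_detail values are placeholders):

    import json
    from urllib import parse, request

    BASE = 'http://127.0.0.1:5000'

    # GET /fault_service_list
    with request.urlopen(BASE + '/fault_service_list') as resp:
        print(json.load(resp))

    # GET /fault_service_invoke_graph?fault_id=1
    with request.urlopen(BASE + '/fault_service_invoke_graph?fault_id=1') as resp:
        print(json.load(resp))

    # POST /root_log_solutions with form-encoded fields
    form = parse.urlencode({'fault_id': '1', 'log_id': 'log_001', 'log_detail': 'connection refused'}).encode()
    with request.urlopen(BASE + '/root_log_solutions', data=form) as resp:
        print(json.load(resp))

--------------------------------------------------------------------------------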
/utils/draw_graph_tool.py:
--------------------------------------------------------------------------------
1 | import networkx as nx
2 | import matplotlib.pyplot as plt
3 | try:
4 | import pygraphviz
5 | from networkx.drawing.nx_agraph import graphviz_layout
6 | except ImportError:
7 | try:
8 | import pydot
9 | from networkx.drawing.nx_pydot import graphviz_layout
10 | except ImportError:
11 | raise ImportError("This example needs Graphviz and either "
12 | "PyGraphviz or PyDotPlus")
13 | if __name__ == '__main__':
14 | dg = nx.DiGraph()
15 | # dg.add_nodes_from(["0", "1", "2", "3", "4"])
16 | nodes = ["os_022:osb_001", "docker_001:csf_001","docker_008:csf_005","docker_007:csf_004","docker_008:csf_003","docker_008:csf_002","db_003","db_007","db_009","docker_001:fly_remote_001","os_021:osb_001","docker_003:csf_001", "docker_005:csf_005","docker_006:csf_004","docker_005:csf_003","docker_006:csf_002","docker_003:fly_remote_001","docker_006:csf_003", "docker_005:csf_002","docker_004:csf_001","docker_006:csf_005","docker_004:fly_remote_001","docker_007:csf_005","docker_007:csf_003","docker_008:csf_004","docker_007:csf_002","docker_005:csf_004","docker_002:csf_001","docker_002:fly_remote_001"]
17 | edges = [("os_022:osb_001","docker_001:csf_001"),("docker_001:csf_001","docker_008:csf_005"), ("docker_001:csf_001","docker_007:csf_004"), ("docker_001:csf_001","docker_008:csf_003"), ("docker_001:csf_001","docker_008:csf_002"), ("docker_008:csf_005", "db_003"), ("docker_007:csf_004", "db_003"), ("docker_008:csf_003", "db_003"), ("docker_008:csf_002", "db_003"), ("docker_001:csf_001", "db_007"), ("docker_001:csf_001", "db_009"), ("docker_001:csf_001", "docker_001:fly_remote_001"),("os_021:osb_001", "docker_003:csf_001"), ("docker_003:csf_001","docker_005:csf_005"), ("docker_003:csf_001", "docker_006:csf_004"), ("docker_003:csf_001","docker_005:csf_003"), ("docker_003:csf_001", "docker_006:csf_002"), ("docker_005:csf_005","db_003"), ("docker_006:csf_004","db_003"), ("docker_005:csf_003", "db_003"), ("docker_006:csf_002", "db_003"), ("docker_003:csf_001", "db_007"), ("docker_003:csf_001", "db_009"), ("docker_003:csf_001","docker_003:fly_remote_001"), ("docker_003:csf_001","docker_006:csf_003"),("docker_003:csf_001","docker_005:csf_002"), ("docker_006:csf_003", "db_003"),("docker_005:csf_002","db_003"), ("os_021:osb_001","docker_004:csf_001"), ("docker_004:csf_001","docker_006:csf_005"), ("docker_004:csf_001","docker_006:csf_004"), ("docker_004:csf_001", "docker_006:csf_003"),("docker_004:csf_001","docker_006:csf_002"), ("docker_006:csf_005", "db_003"),("docker_004:csf_001","db_007"),("docker_004:csf_001","db_009"), ("docker_004:csf_001", "docker_004:fly_remote_001"), ("docker_001:csf_001","docker_007:csf_005"), ("docker_001:csf_001", "docker_007:csf_003"), ("docker_007:csf_005", "db_003"),("docker_007:csf_003","db_003"), ("docker_001:csf_001","docker_008:csf_004"), ("docker_001:csf_001", "docker_007:csf_002"),("docker_008:csf_004", "db_003"), ("docker_007:csf_002", "db_003"), ("docker_003:csf_001","docker_005:csf_004"), ("docker_005:csf_004", "db_003"), ("docker_004:csf_001","docker_005:csf_005"), ("os_022:osb_001","docker_002:csf_001"), ("docker_002:csf_001","docker_007:csf_005"), ("docker_002:csf_001","docker_007:csf_004"),("docker_002:csf_001","docker_007:csf_003"), ("docker_002:csf_001", "docker_007:csf_002"), ("docker_002:csf_001","db_007"), ("docker_002:csf_001","db_009"), ("docker_002:csf_001","docker_002:fly_remote_001"), ("docker_004:csf_001", "docker_005:csf_004"), ("docker_004:csf_001","docker_005:csf_002"), ("docker_003:csf_001", "docker_006:csf_005"), ("docker_004:csf_001", "docker_005:csf_003"), ("docker_002:csf_001", "docker_008:csf_005"), ("docker_002:csf_001", "docker_008:csf_002"), ("docker_002:csf_001","docker_008:csf_004"), ("docker_002:csf_001","docker_008:csf_003")]
18 | dg.add_nodes_from(nodes)
19 | dg.add_edges_from(edges)
20 | # dg.add_weighted_edges_from(list)
21 | # pos = nx.graphviz_layout(dg, prog='dot')
22 |
23 |     nx.draw(dg,
24 |             pos=graphviz_layout(dg, prog='dot'), # use the imported graphviz_layout; alternatives include spring_layout, random_layout, circular_layout, shell_layout
25 |             node_color='g', # node_color sets the node color (r, b, y, k, w); edge_color works the same way
26 |             edge_color='r',
27 |             with_labels=True, # with_labels controls whether node names are shown
28 |             font_size=18, # font_size is the font size, font_color the font color
29 |             node_size=60) # node_size sets the node size
30 | plt.show()
31 | pass
--------------------------------------------------------------------------------
/service/module_tools/diagnosis_faultservice.py:
--------------------------------------------------------------------------------
1 | import networkx as nx
2 |
3 | from utils.graph import ExceptionDataNode, Graph
4 | from utils.pageRank import PRIterator
5 | from utils.pcalg import construct_service_dependency_diagram
6 |
7 |
8 | class DiagnosisFaultService:
9 | @staticmethod
10 | def get_servcie_fault_causes(serviceNode,data):
11 | """
12 |         Perform fine-grained diagnosis on a single fault service
13 |         :param serviceNode:
14 |         :param data: InputData instance
15 | :return:
16 | """
17 |         # identify the anomalous metrics and logs related to the fault service
18 | serviceId = serviceNode.serviceId
19 | hostId = serviceNode.hostId
20 | containerId = serviceNode.containerId
21 | exception_metrics, exception_logs = data.get_fault_service_related_log_metric_data(serviceId,containerId,hostId)
22 |         # preprocess the raw data into the input format required by the PC algorithm
23 | pc_input = data.get_PC_input_data(exception_metrics, exception_logs)
24 |         # build the graph with the PC algorithm; the nodes of g are the column indices of the input data
25 | g, columns_mapping = construct_service_dependency_diagram(pc_input)
26 |         # replace the node indices of g with the corresponding metricId / logId
27 | g_new = DiagnosisFaultService.get_g_dataId(g, columns_mapping)
28 |         # identify the root-cause nodes in the graph
29 | falut_root_dict = DiagnosisFaultService.get_root_cause(g_new)
30 |         # build the graph structure to return
31 | final_exception_data_graph = DiagnosisFaultService.geneate_final_return_graph(g_new,exception_metrics, exception_logs)
32 | return falut_root_dict,final_exception_data_graph
33 |
34 | @staticmethod
35 | def get_g_dataId(g,columns_mapping):
36 | g_new = nx.DiGraph()
37 | for node in g.nodes:
38 | g_new.add_node(columns_mapping[node])
39 | for edge in g.edges:
40 | g_new.add_edge(columns_mapping[edge[0]],columns_mapping[edge[1]])
41 | return g_new
42 |
43 | @staticmethod
44 | def get_root_cause(g):
45 | """
46 |         Get the root-cause list of a fault service from the dependency graph
47 |         Args:
48 |             g: dependency graph
49 |
50 |         Returns: root-cause list
51 |
52 | """
53 | result = list()
54 |         # find the node with the highest PR value
55 | begin_node_id, begin_node_pr = None, 0
56 | # for node_id in node_ids:
57 | # if len(list(g.predecessors(node_id))) > max_pre_size:
58 | # max_pre_node = node_id
59 | # max_pre_size = len(list(g.predecessors(node_id)))
60 | pr = PRIterator(g)
61 | page_ranks = pr.page_rank()
62 | node_pr_sorted = sorted(page_ranks.items(), key=lambda x: x[1], reverse=True)
63 | begin_node_id = node_pr_sorted[0][0]
64 |         # breadth-first (level-order) traversal
65 | node_filter, node_queue = {begin_node_id}, list([begin_node_id])
66 | while node_queue:
67 | node_now = node_queue.pop(0)
68 |             if not list(g.predecessors(node_now)): # predecessors() returns an iterator, so materialize it before the emptiness check
69 | if node_now not in result:
70 | result.append(node_now)
71 | continue
72 | is_pre_not_filter = False
73 | for k in g.predecessors(node_now):
74 | if k not in node_filter:
75 | is_pre_not_filter = True
76 | node_filter.add(k)
77 | node_queue.append(k)
78 |             # if every upstream node is already in the filter, add the current node to result so that result cannot end up empty
79 | if not is_pre_not_filter:
80 | for k in g.predecessors(node_now):
81 | if k not in result:
82 | result.append(k)
83 | if node_now not in result:
84 | result.append(node_now)
85 |
86 | g_reverse = g.reverse(copy=True)
87 | pr_reverse = PRIterator(g_reverse)
88 | page_ranks_reverse = pr_reverse.page_rank()
89 |         for key in page_ranks_reverse:
90 |             if key in result:
91 |                 page_ranks_reverse[key] += 0.5 # boost nodes already selected by the traversal
92 |         node_pr_reverse_sorted = sorted(page_ranks_reverse.items(), key=lambda x: x[1], reverse=True)
93 | result_final = {}
94 | for index, i in enumerate(node_pr_reverse_sorted):
95 | if index < 3:
96 | result_final[i[0]]= i[1]
97 | return result_final
98 |
99 | @staticmethod
100 | def geneate_final_return_graph(g_new,exception_metrics, exception_logs):
101 | """
102 |         Build the graph structure to return
103 |         :param g_new:
104 |         :param data:
105 |         :param exception_metrics: anomalous metrics related to the service
106 |         :param exception_logs: anomalous logs related to the service
107 | :return:
108 | """
109 | nodes = {}
110 | for node_id in g_new.nodes:
111 | id = node_id
112 | if id in exception_metrics:
113 | nodeType = "metric"
114 | tmpExceptionDataNode = ExceptionDataNode(id, nodeType)
115 | tmpExceptionDataNode.name = exception_metrics[id][0].metricName
116 | tmpExceptionDataNode.detail = exception_metrics[id][0].value
117 | tmpExceptionDataNode.units = exception_metrics[id][0].units
118 | tmpExceptionDataNode.belongTo = exception_metrics[id][0].metricBelongTo
119 | tmpExceptionDataNode.exceptionTime = exception_metrics[id][0].startTime
120 | nodes[id] = tmpExceptionDataNode
121 | elif id in exception_logs:
122 | nodeType = "log"
123 | tmpExceptionDataNode = ExceptionDataNode(id, nodeType)
124 | tmpExceptionDataNode.belongTo = exception_logs[id][0].logBelongTo
125 | tmpExceptionDataNode.exceptionTime = exception_logs[id][0].startTime
126 | tmpExceptionDataNode.detail = exception_logs[id][0].logExceptionSegment
127 | nodes[id] = tmpExceptionDataNode
128 | else:
129 | continue
130 | edges = g_new.edges()
131 | final_return_graph = Graph(nodes,edges)
132 | return final_return_graph
133 |
--------------------------------------------------------------------------------
/dao/db_dao.py:
--------------------------------------------------------------------------------
1 | import datetime
2 |
3 | from sqlalchemy import create_engine
4 | from sqlalchemy.orm import sessionmaker
5 |
6 | from bean.save_model import FaultServiceRoot, FaultService, ServiceDependencyGraph, ExceptionDataDependencyGraph, \
7 | FaultServiceSolution
8 |
9 | class DBDao:
10 | def __init__(self):
11 | self.engine = create_engine('mysql+pymysql://root:root1234@127.0.0.1:3306/fault_result_solution')
12 | self.session = self.get_session()
13 |
14 | def get_session(self):
15 |         # create the DBSession class:
16 | DBSession = sessionmaker(bind=self.engine)
17 | session = DBSession()
18 | return session
19 |
20 | def db_close(self):
21 | self.session.close()
22 |
23 | def db_commit(self):
24 | self.session.commit()
25 |
26 | def get_all_root_logs_noSolution(self):
27 | """
28 | 获取未生成修复方案的全部根因日志
29 | :return:
30 | """
31 | root_logs = self.session.query(FaultServiceRoot).filter(FaultServiceRoot.type == 1,
32 | FaultServiceRoot.has_solution == 0).all()
33 | return root_logs
34 |
35 | def update_root_detail_table_has_solutuon(self,fault_id,log_id):
36 | """
37 |         Set the has_solution field to 1 for a root-cause log whose repair solutions have been generated
38 | :return:
39 | """
40 | result = self.session.query(FaultServiceRoot).filter(FaultServiceRoot.fault_id == fault_id,
41 |                                                               FaultServiceRoot.causeOfFault == log_id).update(
42 |             {FaultServiceRoot.has_solution: 1}) # update() expects a dict of column: value, not a boolean expression
43 | return result
44 |
45 | def insert_fault_service_into_fault_service_table(self,serviceId,serviceName,serviceType,hostName):
46 | """
47 |         Insert a new fault service into the fault service detail table
48 | :param serviceId:
49 | :param serviceName:
50 | :param serviceType:
51 | :param hostName:
52 | :return:
53 | """
54 | fault_service = FaultService(fault_service_id = serviceId, fault_service_name = serviceName,
55 | fault_service_type = serviceType, host_name = hostName,
56 | exception_time=datetime.datetime.now(), process_state=0)
57 | self.session.add(fault_service)
58 | self.db_commit()
59 | return fault_service
60 |
61 | def insert_fault_service_root_into_fault_service_root_table(self, fault_id, root_id, name, detail,type,rank):
62 | service_falut_root = FaultServiceRoot(fault_id=fault_id, causeOfFault=root_id, causeName=name,
63 | detail=detail, has_solution=0, type=type,
64 | rank=rank)
65 | self.session.add(service_falut_root)
66 | return service_falut_root
67 |
68 | def insert_service_dependency_graph_into_service_dependency_graph_table(self, fault_id, fault_service_dependency_graph_json):
69 | service_dependency_graph = ServiceDependencyGraph(fault_id=fault_id,
70 | graph_json=fault_service_dependency_graph_json)
71 | self.session.add(service_dependency_graph)
72 | return service_dependency_graph
73 |
74 | def insert_exception_data_dependency_graphh_into_exception_data_dependency_graph_table(self, fault_id, service_exception_data_dependency_graph_json):
75 | exception_data_dependency_graph = ExceptionDataDependencyGraph(fault_id=fault_id,
76 | graph_json=service_exception_data_dependency_graph_json)
77 | self.session.add(exception_data_dependency_graph)
78 | return exception_data_dependency_graph
79 | def insert_fault_service_solution_insert_fault_service_solution_table(self,fault_id,log_id,fault_reason,fault_solution,rank):
80 | faultServiceSolution = FaultServiceSolution(fault_id=fault_id, root_log_id=log_id, fault_reason=fault_reason,
81 | fault_solution=fault_solution, rank=rank)
82 | self.session.add(faultServiceSolution)
83 | return faultServiceSolution
84 |
85 | def select_all_fault_service_detail_by_processState(self,process_state):
86 | fault_service_list = self.session.query(FaultService).filter(FaultService.process_state == process_state).all()
87 | return fault_service_list
88 |
89 | def select_rank1_faultserviceroot_by_faultid(self,fault_id):
90 | root = self.session.query(FaultServiceRoot).filter(FaultServiceRoot.fault_id == fault_id).order_by(
91 | FaultServiceRoot.rank.desc()).first()
92 | return root
93 | def select_service_invoke_graph_by_faultid(self,fault_id):
94 | service_invoke_graph_json = self.session.query(ServiceDependencyGraph).filter(
95 | ServiceDependencyGraph.fault_id == fault_id).first()
96 | return service_invoke_graph_json
97 |
98 | def select_exception_data_dependency_graph_by_faultid(self,fault_id):
99 | exception_data_dependency_graph_json = self.session.query(ExceptionDataDependencyGraph).filter(
100 | ExceptionDataDependencyGraph.fault_id == fault_id).one()
101 | return exception_data_dependency_graph_json
102 | def get_root_log_by_logid_and_faultid(self,fault_id,log_id):
103 | root_log = self.session.query(FaultServiceRoot).filter(FaultServiceRoot.fault_id == fault_id,
104 | FaultServiceRoot.causeOfFault == log_id).first()
105 | return root_log
106 |
107 | def select_solutions_by_logid_and_faultid(self,fault_id,log_id):
108 | solutions = self.session.query(FaultServiceSolution).filter(FaultServiceSolution.fault_id == fault_id,FaultServiceSolution.root_log_id == log_id).order_by(FaultServiceSolution.rank.asc()).all()
109 | return solutions
110 |
--------------------------------------------------------------------------------
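An illustrative query sequence for DBDao (assumes the schema from config/data_base_sql exists in the fault_result_solution database and already contains diagnosis results):

    from dao.db_dao import DBDao

    dao = DBDao()

    # list unprocessed fault services together with their top-ranked root cause
    for fault_service in dao.select_all_fault_service_detail_by_processState(0):
        root = dao.select_rank1_faultserviceroot_by_faultid(fault_service.id)
        if root:
            print(fault_service.fault_service_name, '->', root.causeName)

    dao.db_close()

--------------------------------------------------------------------------------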
/service/module_tools/save_result.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | from dao.db_dao import DBDao
4 |
5 |
6 | class SaveResult:
7 | @staticmethod
8 | def save(service_invoke_graph,final_root_services,services_diagnisis_results):
9 | dbDao = DBDao()
10 | fault_ids = dict()
11 |         # store the identified fault services in the fault service table
12 | for fault_service_id in final_root_services:
13 | serviceNode = service_invoke_graph.nodes[fault_service_id]
14 | fault_service = dbDao.insert_fault_service_into_fault_service_table(serviceNode.serviceId,serviceNode.serviceName,serviceNode.serviceType,serviceNode.hostName)
15 | fault_ids[fault_service.id] = fault_service.fault_service_id
16 |         # build the service dependency graph JSON for storage
17 | fault_service_dependency_graph_json = SaveResult.generate_save_fault_service_dependency_graph_json(service_invoke_graph,fault_ids)
18 | for fault_id, fault_service_id in fault_ids.items():
19 | fault_service_roots = services_diagnisis_results[fault_service_id]['falut_root_dict']
20 | final_exception_data_graph = services_diagnisis_results[fault_service_id]['final_exception_data_graph']
21 | for index, root_id in enumerate(fault_service_roots.keys()):
22 | rootNode = final_exception_data_graph.nodes[root_id]
23 | if rootNode.nodeType == "metric":
24 | rootNode.nodeType = 0
25 | else:
26 | rootNode.nodeType = 1
27 | service_falut_root = dbDao.insert_fault_service_root_into_fault_service_root_table(fault_id,root_id,rootNode.name,rootNode.detail,rootNode.nodeType,index)
28 | service_dependency_graph = dbDao.insert_service_dependency_graph_into_service_dependency_graph_table(fault_id,fault_service_dependency_graph_json)
29 |             # build the service exception data dependency graph JSON for storage
30 | service_exception_data_dependency_graph_json = SaveResult.generate_save_service_exception_data_dependency_graph_json(final_exception_data_graph,fault_service_roots)
31 | exception_data_dependency_graph = dbDao.insert_exception_data_dependency_graphh_into_exception_data_dependency_graph_table(fault_id,service_exception_data_dependency_graph_json)
32 | dbDao.db_commit()
33 | dbDao.db_close()
34 |
35 | @staticmethod
36 | def generate_save_fault_service_dependency_graph_json(service_invoke_graph,fault_ids):
37 | storage_graph_json = dict()
38 | nodes = list()
39 | edges = list()
40 | for node_id, node in service_invoke_graph.nodes.items():
41 | save_node_dict = {}
42 | save_node_dict['id'] = node_id
43 | save_node_dict['label'] = node.serviceName
44 | save_node_dict['data'] = {}
45 | save_node_dict['data']['name'] = node.serviceName
46 | save_node_dict['data']['type'] = node.serviceType
47 |             save_node_dict['data']['host_name'] = node.hostName # assumed key: the original line reassigned 'type', overwriting the serviceType set above
48 | save_node_dict['data']['fault_id'] = None
49 | if node_id in fault_ids.values():
50 | save_node_dict['data']['health_level'] = 2
51 | for key, value in fault_ids.items():
52 | if value == node_id:
53 | save_node_dict['data']['fault_id'] = key
54 | elif node.isException == 1:
55 | save_node_dict['data']['health_level'] = 1
56 | else:
57 | save_node_dict['data']['health_level'] = 0
58 |
59 | nodes.append(save_node_dict)
60 | for i in service_invoke_graph.edges:
61 | edge = {}
62 | edge['source'] = i[0]
63 | edge['target'] = i[1]
64 | edges.append(edge)
65 | storage_graph_json['nodes'] = nodes
66 | storage_graph_json['edges'] = edges
67 | graph_json = json.dumps(storage_graph_json)
68 | # graph_json = storage_graph_json
69 | return graph_json
70 |
71 | @staticmethod
72 | def generate_save_service_exception_data_dependency_graph_json(final_exception_data_graph, fault_service_roots):
73 | storage_graph_json = dict()
74 | nodes = list()
75 | edges = list()
76 | for node_id, node in final_exception_data_graph.nodes.items():
77 | save_node_dict = {}
78 | save_node_dict['id'] = node.id
79 | save_node_dict['label'] = node.name
80 | save_node_dict['data'] = dict()
81 | save_node_dict['data']['name'] = node.name
82 | save_node_dict['data']['type'] = node.nodeType
83 | save_node_dict['data']['detail'] = node.detail
84 | save_node_dict['data']['belongTo'] = node.belongTo
85 | save_node_dict['data']['exceptionTime'] = node.exceptionTime
86 | save_node_dict['data']['units'] = node.units
87 | if node_id in fault_service_roots.keys():
88 | save_node_dict['data']['import'] = 1
89 | else:
90 | save_node_dict['data']['import'] = 0
91 | nodes.append(save_node_dict)
92 | for i in final_exception_data_graph.edges:
93 | edge = {}
94 | edge['source'] = i[0]
95 | edge['target'] = i[1]
96 | edges.append(edge)
97 | storage_graph_json['nodes'] = nodes
98 | storage_graph_json['edges'] = edges
99 | graph_json_result = json.dumps(storage_graph_json)
100 | # graph_json_result = storage_graph_json
101 | return graph_json_result
102 |
103 | @staticmethod
104 | def save_solutions(fault_id,log_id,sorted_solutions):
105 | dbDao = DBDao()
106 | for index,solution in enumerate(sorted_solutions):
107 | faultServiceSolution = dbDao.insert_fault_service_solution_insert_fault_service_solution_table(fault_id,log_id,solution['reason'],solution['html_content'],index)
108 | result = dbDao.update_root_detail_table_has_solutuon(fault_id,log_id)
109 | if result:
110 | dbDao.db_commit()
111 | dbDao.db_close()
112 | return result
--------------------------------------------------------------------------------
/demo/aiops_2020_data_test.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | from service.fault_diagnosis_service import fault_diagmosis
4 | from utils.data_tools import is_number
5 |
6 |
7 | def data_collection_process_json():
8 | """
 9 |     Load the raw trace data, deployment data, raw metric data, raw log data, anomalous metric data and anomalous log data from JSON files
10 |     Args:
11 |
12 |     Returns: the corresponding data loaded from the JSON files
13 | """
14 |
15 |     # load the unprocessed anomaly data
16 | f = open('../data/aiops_data_2020/2020_04_11/items_exception_result.json', 'r')
17 | original_exception_metric_data = json.load(f)
18 | exception_metric_data = original_exception_metric_data
19 | f.close()
20 |
21 | f = open('../data/aiops_data_2020/2020_04_11/origina_items.json', 'r')
22 | original_items = json.load(f)
23 | items = original_items
24 | f.close()
25 |
26 | f = open('../data/aiops_data_2020/2020_04_11/origina_traces.json', 'r')
27 | original_traces = json.load(f)
28 | traces = {}
29 | for i, (k, v) in enumerate(original_traces.items()):
30 | if i <= 500:
31 | traces[k] = v
32 | else:
33 | break
34 | f.close()
35 |
36 | f = open('../data/aiops_data_2020/2020_04_11/original_deployment_data.json', 'r')
37 | deployment_data = json.load(f)
38 | f.close()
39 | return exception_metric_data, items, traces, deployment_data
40 |
41 | def get_original_trace_data(traces):
42 | """
43 |     Convert traces into the target input format: each record contains id, pid, serviceId, serviceName, serviceType, startTime and traceId
44 |     The aiops_2020 traces are already grouped together by trace
45 |     :param traces:
46 |     :return: list in the target format
47 | """
48 | traceData = list()
49 | for traceId, trace in traces.items():
50 | for index, span in enumerate(trace):
51 | if span['pid'] == 'None':
52 | span['pid'] = -1
53 | tmp_dict = {}
54 | tmp_dict['id'] = span['id']
55 | tmp_dict['pid'] = span['pid']
56 |             tmp_dict['serviceId'] = span['serviceId']  # serviceCode does not uniquely identify a service instance; needs further discussion
57 | tmp_dict['serviceName'] = span['serviceName']
58 | tmp_dict['serviceType'] = span['serviceType']
59 | tmp_dict['startTime'] = span['startTime']
60 | tmp_dict['traceId'] = traceId
61 | traceData.append(tmp_dict)
62 | return traceData
63 |
64 | def get_deployment_data(deployment_data):
65 | """
66 |     Convert the deployment data into the target input format: each record contains serviceInstanceId, serviceName, hostId, hostName, containerId and containerName
67 |     :param deployment_data:
68 |     :return: list in the target format
69 | """
70 | return deployment_data
71 |
72 |
73 | def get_original_metric_data(items):
74 | """
75 |     Convert the original metric data into the target input format: each record contains timestamp, metricId, metricName, value, units, metricBelongTo and metricBelongLevel
76 |     :param items:
77 |     :return: list in the target format
78 | """
79 | originalMetricData = list()
80 | for key,metrics in items.items():
81 | metricId = metrics['metricId']
82 | metricName = metrics['metricId']
83 | units = None
84 | metricBelongTo = metrics['metricBelongTo']
85 | metricBelongLevel = metrics['metricBelongLevel']
86 | stampTimes = metrics['timeStamps']
87 | values = metrics['values']
88 | if len(stampTimes) > 0 and len(stampTimes) == len(values):
89 | for index, value in enumerate(stampTimes):
90 | tmp_metric = dict()
91 | tmp_metric['metricId'] = str(metricId)
92 | tmp_metric['metricName'] = metricName
93 | tmp_metric['metricBelongTo'] = metricBelongTo
94 | tmp_metric['metricBelongLevel'] = metricBelongLevel
95 | tmp_metric['units'] = units
96 | tmp_metric['timestamp'] = value
97 | if is_number(values[index]):
98 | tmp_metric['value'] = float(values[index])
99 | originalMetricData.append(tmp_metric)
100 | else:
101 | continue
102 | return originalMetricData
103 |
104 | def get_exception_metric_data(exception_metric_data):
105 | """
106 |     Convert the exception metric data into the input format: each record contains startTime, endTime, metricId, metricName, value, units, metricBelongTo and metricBelongLevel
107 |     :param exception_metric_data:
108 |     :return: list in the target format
109 | """
110 | exceptionMetricData = list()
111 |
112 | for ex_metric in exception_metric_data:
113 | metricId = ex_metric['metricId']
114 | metricName = ex_metric['metricName']
115 | units = None
116 | metricBelongTo = ex_metric['belongTo']
117 | metricBelongLevel = None
118 | stampTimes = ex_metric['testTime']
119 | values = ex_metric['value']
120 | stampTimes_splits = stampTimes.strip().split(',')
121 | values_splits = values.strip().split(',')
122 | if len(stampTimes_splits) > 0 and len(stampTimes_splits) == len(values_splits):
123 | for index, value in enumerate(stampTimes_splits):
124 | tmp_metric = dict()
125 | tmp_metric['metricId'] = str(metricId)
126 | tmp_metric['metricName'] = metricName
127 | tmp_metric['metricBelongTo'] = metricBelongTo
128 | tmp_metric['metricBelongLevel'] = metricBelongLevel
129 | tmp_metric['units'] = units
130 | tmp_metric['endTime'] = value
131 | tmp_metric['startTime'] = value
132 | if is_number(values_splits[index]):
133 | tmp_metric['value'] = float(values_splits[index])
134 | exceptionMetricData.append(tmp_metric)
135 | else:
136 | continue
137 | return exceptionMetricData
138 |
139 | if __name__ == '__main__':
140 | exception_metric_data, items, traces, deployment_data = data_collection_process_json()
141 |
142 | deploymentData = get_deployment_data(deployment_data)
143 | traceData = get_original_trace_data(traces)
144 | original_metricData = get_original_metric_data(items)
145 | exception_metricData = get_exception_metric_data(exception_metric_data)
146 | exception_logData = []
147 | original_logData = []
148 | fault_diagmosis(deploymentData, traceData, original_metricData, original_logData, exception_metricData,
149 | exception_logData)
150 | pass
151 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | # AIOsp-Fault-Diagnosis
3 |
4 | ### Tool Introduction
5 |
6 | This project designs and implements a **fault diagnosis assistant tool** that helps operators diagnose and repair faults in microservice systems. Root cause diagnosis can be narrowed down to the metric or log level, repair solutions are generated from an existing knowledge graph, and a query service is built with the Flask framework.
7 |
8 | Implemented features:
9 | - Service dependency graph construction: reconstruct the system's real-time service dependencies and further mark exceptional and faulty services in the graph
10 | - Exception data extraction and dependency graph construction: analyze the faulty services, build causal relationships between exceptions, and mark the root-cause metrics or logs in the graph
11 | - Assisted root cause analysis for exception logs: based on log matching results, analyze the possible causes of an exception log and suggest solutions
12 |
13 |
14 | The overall localization workflow is shown below:
15 |
16 |
17 |
18 | ### Install
19 | ```
20 | pip install -r requirements.txt
21 | ```
22 | ```
23 | 1. Install the MySQL database
24 | 2. Run config/data_base_sql to create the databases required by the project
25 | ```
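To run step 2, a typical (but environment-dependent) invocation with the MySQL command-line client would be something like `mysql -u <user> -p < config/data_base_sql`; adjust the user, host and client options to your own setup.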
26 |
27 | #### Note
28 |
29 | - python_version = 3.7.9
30 | - The MySQL database stores the root cause diagnosis results and the repair solutions
31 | - If you do not need the storage part, you can comment out all code related to SaveResult
32 | - When using the project's database storage, check the account and password configured in the ```dao``` folder (see the sketch below)
33 |
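As a purely illustrative sketch of what that configuration usually looks like, a SQLAlchemy connection for MySQL is created from a URL of roughly this form; the variable names, driver and database name below are assumptions, not the exact contents of ```dao/db_dao.py```:
```
from sqlalchemy import create_engine

# Hypothetical example; replace user, password, host and database name with
# the values configured for your MySQL instance. The "pymysql" driver is only
# one option and must be installed separately if used.
engine = create_engine("mysql+pymysql://user:password@127.0.0.1:3306/fault_diagnosis")
```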
34 | ### demo
35 |
36 | - Data download
37 | ```
38 | Link: https://pan.baidu.com/s/1bMwuqRpJ1hMhKPhxajp4kw  Password: ptmj
39 | ```
40 | ### demoRun
41 |
42 | Put the downloaded data into the ```data``` folder, then (see the sketch after this list):
43 | - hadoop data: ```run demo/hadoop_data_test.py``` performs root cause diagnosis and stores the results
44 | - aiops data: ```run demo/aiops_2020_data_test.py``` performs root cause diagnosis and stores the results
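Both demo scripts ultimately reduce to a single call into the diagnosis service. The minimal sketch below mirrors ```demo/aiops_2020_data_test.py```, where each argument is a list of dicts in the input formats described in the "Input data format" section below (pass an empty list for data you do not have):
```
from service.fault_diagnosis_service import fault_diagmosis

# deploymentData, traceData, original_metricData, original_logData,
# exception_metricData and exception_logData are lists of dicts built
# from your own monitoring data (see "Input data format" below).
fault_diagmosis(deploymentData, traceData, original_metricData,
                original_logData, exception_metricData, exception_logData)
```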
45 |
46 | ### Starting the web API
47 | ```
48 | run app.py
49 | ```
50 | ### Generating repair solutions
51 | ```
52 | run time_generate_logs_solutions() in generate_solutions_service.py
53 | ```
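A minimal sketch of invoking this from Python, assuming ```time_generate_logs_solutions``` can be imported from ```service/generate_solutions_service.py``` as stated above (the exact import path may differ in your setup):
```
from service.generate_solutions_service import time_generate_logs_solutions

# Requires the neo4j and MySQL databases to be configured (see the note below).
time_generate_logs_solutions()
```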
54 | #### Note
55 |
56 | - If the neo4j and MySQL databases are not configured, this feature is unavailable
57 |
58 | ### Directory structure
59 | ├── fault_diagnosis_and_repair
60 | └── bean
61 | ├── input_model.py //object classes for the input data
62 | ├── output_model.py //object classes for the web API output
63 | └── save_model.py //classes mapped to the MySQL database tables
64 | ├── config
65 | ├── data_base_sql //SQL for creating the MySQL databases
66 | └── stop.txt //stop words
67 | ├── dao
68 | ├── db_dao.py //DAO for MySQL
69 | ├── es_dao.py //DAO for Elasticsearch
70 | └── neo4j_dao.py //DAO for neo4j
71 | ├── data
72 | ├── aiops_data_2020 //demo data
73 | └── hadoop_data //demo data
74 | ├── demo
75 | ├── aiops_2020_data_test.py //entry point for root cause diagnosis and storage on the aiops data
76 | └── hadoop_data_test.py //entry point for root cause diagnosis and storage on the hadoop data
77 | ├── service //core code of the project
78 | ├── module_tools //tool classes implementing each functional module
79 | ├── diagnosis_faultservice.py //tool class for further diagnosis of a faulty service
80 | ├── genarate_solutions.py //tool class for generating repair solutions
81 | ├── identify_faultservice.py //tool class for identifying faulty services
82 | ├── input_data.py //tool class for processing the input data
83 | └── save_result.py //tool class for storing the results
84 | ├── fault_diagnosis_service //entry point of the root cause diagnosis service
85 | ├── generate_solutions_service //entry point of the repair solution generation service
86 | └── web_service //entry point of the interactive query service
87 | ├── utils
88 | ├── data_tools.py
89 | ├── graph.py
90 | ├── jaccard_api.py
91 | ├── pageRank.py
92 | ├── pcalg.py
93 | └── process_aiops2020_data_to_original.py //converts the aiops 2020 data into the format required by this project, used as the data source of aiops_2020_data_test.py
94 | ├── app.py //Flask API startup
95 | └── requirements.txt
96 | ### Input data format
97 | The data required by the project includes original metric data, original log data, system deployment data, trace data, and the exception metric and exception log data produced by the anomaly detection module. The required formats are described below.
98 |
99 | - Original metric data
100 |
101 | Field | Description | Type
102 | ---- | ----- | ------
103 | timestamp|collection time (s)|second-level timestamp, string
104 | metricId|unique identifier of the metric|string
105 | metricName|metric name|string
106 | value|collected metric value|float
107 | units|units|string
108 | metricBelongTo|owner of the metric|string
109 | metricBelongLevel|level the metric belongs to|host/container/service
110 |
111 | timestamp | metricId | metricName| value | units | metricBelongTo | metricBelongLevel
112 | ---- | ----- | ------ | ------ | ------ | ------ | ------
113 | '1614688922' | '29162' | 'CPU iowait time'| 0.717773 | '%'| 'fuguangbo-0002'| 'host'
114 |
115 | - Original log data
116 |
117 | Field | Description | Type
118 | ---- | ----- | ------
119 | timestamp|collection time (s)|second-level timestamp, string
120 | logId|unique identifier of the log|string
121 | logMessage|log entry message|string
122 | logLevel|log level|string
123 | logBelongTo|owner of the log|string
124 | logBelongLevel|level the log belongs to|host/container/service
125 |
126 | timestamp | logId | logMessage| logLevel | logBelongTo | logBelongLevel
127 | ---- | ----- | ------ | ------ | ------ | ------
128 | 1614688885 | 'hadoop-root-datanode-hadoop-slave2.log' | org.apache.hadoop.hdfs.server... | INFO | 'DataNode'| 'service'
129 |
130 | - Deployment data
131 |
132 | Field | Description | Type
133 | ---- | ----- | ------
134 | serviceInstanceId|unique identifier of the service instance|string
135 | serviceName|service name|string
136 | hostId|unique identifier of the host|string
137 | hostName|host name|string
138 | containerId|unique identifier of the container|string
139 | containerName|container name|string
140 |
141 | serviceInstanceId | serviceName | hostId| hostName | containerId | containerName
142 | ---- | ----- | ------ | ------ | ------ | ------
143 | 'NameNode' | 'NameNode' | 'hadoop-master'| 'hadoop-master' | 'fuguangbo-0002'| 'fuguangbo-0002'
144 |
145 | - Trace data
146 |
147 | Field | Description | Type
148 | ---- | ----- | ------
149 | id|SpanId of the current call|string
150 | pid|SpanId of the parent call|string
151 | serviceId|unique identifier of the service instance|string
152 | serviceName|service name|string
153 | serviceType|service type|string
154 | startTime|call start time|string
155 | traceId|unique identifier of the trace|string
156 |
157 | id | pid | serviceId| serviceName | serviceType | startTime| traceId
158 | ---- | ----- | ------ | ------ | ------ | ------ | ------
159 | '136.60.16146924705200712' | -1 | 'DataNode'| 'DataNode' | 'Local'| '1614692470520'|'136.60.16146924705200713'
160 |
161 | - Exception metric data
162 |
163 | Field | Description | Type
164 | ---- | ----- | ------
165 | startTime|exception start time|string
166 | endTime|exception end time|string
167 | metricId|unique identifier of the metric|string
168 | metricName|metric name|string
169 | value|metric value|float
170 | units|units|string
171 | metricBelongTo|owner of the metric|string
172 | metricBelongLevel|level the metric belongs to|host/container/service
173 |
174 | startTime | endTime | metricId| metricName | value | units| metricBelongTo|metricBelongLevel
175 | ---- | ----- | ------ | ------ | ------ | ------ | ------ | ------
176 | '2021-03-02 21:31:02'|'2021-03-02 21:31:02' | '29162'| 'CPU iowait time' | 0.912974 | '%'| 'fuguangbo-0002'|'host'
177 |
178 | - Exception log data
179 |
180 | Field | Description | Type
181 | ---- | ----- | ------
182 | startTime|exception start time|string
183 | endTime|exception end time|string
184 | logId|unique identifier of the log|string
185 | logExceptionSegment|exception log segment|string
186 | logBelongTo|owner of the log|string
187 | logBelongLevel|level the log belongs to|host/container/service
188 |
189 | startTime | endTime | logId| logExceptionSegment | logBelongTo | logBelongLevel
190 | ---- | ----- | ------ | ------ | ------ | ------
191 | '2021-03-02T13:15:51.452Z'|'2021-03-02T13:15:51.452Z' |'hadoop-root-datanode-hadoop-slave2.log'| java.net.NoRouteToHostException...| 'DataNode'|'service'
192 |
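For illustration only, one record of each of the main input types can be written as a plain Python dict matching the tables above (the values are copied from the example rows and are not real monitoring data):
```
original_metric = {'timestamp': '1614688922', 'metricId': '29162',
                   'metricName': 'CPU iowait time', 'value': 0.717773,
                   'units': '%', 'metricBelongTo': 'fuguangbo-0002',
                   'metricBelongLevel': 'host'}

deployment = {'serviceInstanceId': 'NameNode', 'serviceName': 'NameNode',
              'hostId': 'hadoop-master', 'hostName': 'hadoop-master',
              'containerId': 'fuguangbo-0002', 'containerName': 'fuguangbo-0002'}

trace_span = {'id': '136.60.16146924705200712', 'pid': -1,
              'serviceId': 'DataNode', 'serviceName': 'DataNode',
              'serviceType': 'Local', 'startTime': '1614692470520',
              'traceId': '136.60.16146924705200713'}

# Each argument of fault_diagmosis is a list of such dicts,
# e.g. original_metricData = [original_metric, ...]
```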
193 | ### Example output
194 | - The output of root cause diagnosis has two parts: the printed results and the stored results
195 | Printed root cause diagnosis results:
196 |
197 | 
198 |
199 | >The "faulty service list" contains the faulty services identified among the exceptional services; the key is the service Id
200 | >The "root cause list of service XXX" contains the root-cause metrics and logs of that faulty service; the key is the metric or log name
201 | The stored results can be inspected in the database via SQL
202 | - The web API returns JSON
203 | - Repair solution generation writes the generated solutions directly to MySQL
--------------------------------------------------------------------------------
/service/module_tools/genarate_solutions.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | from dao.neo4j_dao import GraphDao
4 | from utils.jaccard_api import log_preprocess, generate_cidian_jaccard
5 |
6 | paramregex = [r'blk_-?\d+', r'(\d+\.){3}\d+(:\d+)?', r'(\d+\.){3}\d+', r'(/|)([0-9]+\.){3}[0-9]+(:[0-9]+|)(:|)',
7 | r'(?<=[^A-Za-z0-9])(\-?\+?\d+)(?=[^A-Za-z0-9])|[0-9]+$']
8 | stopkeyword = [line.strip() for line in open('config/stop.txt').readlines()]
9 | eraseRex = [r'(\d+\-){2}\d+\s\d+\:\d+\:\d+\,\d+', r'INFO', r'ERROR', r'DEBUG', r'WARN', r'FATAL']
10 |
11 | class GenetateSolutuons:
12 | @staticmethod
13 | def get_solutions_by_logDetail(log_detail):
14 | """
15 |         Generate the list of candidate solutions for an exception log segment
16 |         :param log_detail:
17 |         :return: sorted list of solutions
18 | """
19 | logs_dictionaries = GenetateSolutuons.get_fault_repair_graph_dictionary_log()
20 | logs_with_jaccard = GenetateSolutuons.logDetail_graph_cache_jaccard(log_detail, logs_dictionaries)
21 |
22 |         # Sort the logs by Jaccard similarity in descending order
23 |         logs_with_jaccard_sort = sorted(logs_with_jaccard.items(), key=lambda x: x[1]['jaccard'], reverse=True)
24 |         # Take the names of the top 5 most similar logs
25 | top_name = [item[1]['name'] for item in logs_with_jaccard_sort[:5]]
26 | solutions = {}
27 |
28 | graph_dao = GraphDao()
29 | for name in top_name:
30 | if len(solutions) < 5:
31 | solutions_tmp = {}
32 | faults = graph_dao.get_fault_entity_by_log(name)
33 | for fault in faults:
34 | solutions_zhijie = graph_dao.get_solutions_by_fault(fault['name'])
35 | reasons = graph_dao.get_reasons_by_fault(fault['name'])
36 | for reason in reasons:
37 | solutions_jianjie = graph_dao.get_solutions_by_reason(reason['name'])
38 | for solution_jianjie in solutions_jianjie:
39 | solutions_tmp[solution_jianjie['name']] = {}
40 | solutions_tmp[solution_jianjie['name']]['reason'] = reason['content']
41 | solutions_tmp[solution_jianjie['name']]['html_content'] = solution_jianjie['html_content']
42 | solutions_tmp[solution_jianjie['name']]['json_content'] = solution_jianjie['json_content']
43 | solutions_tmp[solution_jianjie['name']]['vote'] = solution_jianjie['vote']
44 | pass
45 | for solution_zhijie in solutions_zhijie:
46 | solutions_tmp[solution_zhijie['name']] = {}
47 | solutions_tmp[solution_zhijie['name']]['reason'] = "暂无"
48 | solutions_tmp[solution_zhijie['name']]['html_content'] = solution_zhijie['html_content']
49 | solutions_tmp[solution_zhijie['name']]['json_content'] = solution_zhijie['json_content']
50 | solutions_tmp[solution_zhijie['name']]['vote'] = solution_zhijie['vote']
51 |             # Sort by vote count
52 | solutions_tmp_sort = sorted(solutions_tmp.items(), key=lambda x: x[1]['vote'], reverse=True)
53 | for solution_tmp in solutions_tmp_sort:
54 | if len(solutions) < 5:
55 | solutions[solution_tmp[0]] = {}
56 | solutions[solution_tmp[0]]['reason'] = solution_tmp[1]['reason']
57 | solutions[solution_tmp[0]]['html_content'] = solution_tmp[1]['html_content']
58 | solutions[solution_tmp[0]]['json_content'] = solution_tmp[1]['json_content']
59 | solutions[solution_tmp[0]]['vote'] = solution_tmp[1]['vote']
60 | solutions[solution_tmp[0]]['serial_number'] = len(solutions)
61 | else:
62 | break
63 | solutions_sort = sorted(solutions.items(), key=lambda x: x[1]['serial_number'])
64 | result_solutions = []
65 | for solution_sort in solutions_sort:
66 | result_solution = {}
67 | result_solution['reason'] = solution_sort[1]['reason']
68 | result_solution['html_content'] = solution_sort[1]['html_content']
69 | result_solution['json_content'] = solution_sort[1]['json_content']
70 | result_solution['vote'] = solution_sort[1]['vote']
71 | result_solution['serial_number'] = solution_sort[1]['serial_number']
72 | result_solutions.append(result_solution)
73 | return result_solutions
74 |
75 | @staticmethod
76 | def logDetail_graph_cache_jaccard(yichanglog, graph_cache):
77 | """
78 |         Compute the similarity between the root-cause exception log segment and every log in the knowledge graph
79 |         :param yichanglog:
80 |         :param graph_cache:
81 |         :return: dict {log name in the graph: similarity value}
82 | """
83 | result_dict = {}
84 | log_ = log_preprocess(yichanglog, paramregex, eraseRex)
85 | log1_dic = generate_cidian_jaccard(log_, stopkeyword)
86 |
87 | for name, log2_dic in graph_cache.items():
88 |             log_info = {}
89 |             bingji = list(set(log1_dic).union(set(log2_dic)))  # union of the two token sets
90 |             jiaoji = list(set(log1_dic).intersection(set(log2_dic)))  # intersection of the two token sets
91 |             jiaquan = 0  # extra weight for exception class names appearing in both logs
92 |             for word in jiaoji:
93 |                 if re.search(r'[a-zA-Z0-9]+.[a-zA-Z0-9]+.[a-zA-Z0-9]+Exception', word):
94 |                     jiaquan += 5
95 |             jaccard = (len(jiaoji) + jiaquan) / len(bingji)  # weighted Jaccard similarity
96 |             log_info['name'] = name
97 |             log_info['dict'] = log2_dic
98 |             log_info['jaccard'] = jaccard
99 |             result_dict[name] = log_info
100 | return result_dict
101 |
102 | @staticmethod
103 | def get_fault_repair_graph_dictionary_log():
104 | """
105 |         Get the tokenized form of every log in the knowledge graph
106 | :return:
107 | """
108 | graph_dao = GraphDao()
109 | logs = graph_dao.get_all_log_entities()
110 | graph_cache_jaccard = GenetateSolutuons.generate_graph_cache_jaccard(logs, paramregex, eraseRex, stopkeyword)
111 | return graph_cache_jaccard
112 |
113 | @staticmethod
114 | def generate_graph_cache_jaccard(graph_logs, paramregex, eraseRex, stopkeyword):
115 | """
116 |         Generate the tokenized form of every log in the knowledge graph
117 | :param graph_logs:
118 | :param paramregex:
119 | :param eraseRex:
120 | :param stopkeyword:
121 | :return:
122 | """
123 | graph_cache_jaccard = {}
124 | for log in graph_logs:
125 | log_ = log_preprocess(log['n']['content'], paramregex, eraseRex)
126 | log_dict = generate_cidian_jaccard(log_, stopkeyword)
127 | graph_cache_jaccard[log['n']['name']] = log_dict
128 | return graph_cache_jaccard
--------------------------------------------------------------------------------
/service/module_tools/identify_faultservice.py:
--------------------------------------------------------------------------------
1 | import networkx as nx
2 |
3 | from utils.graph import ServiceNode
4 | from utils.pageRank import PRIterator
5 |
6 |
7 | class IdentifyFaultService:
8 | @staticmethod
9 | def generate_service_invoke_graph(traceObjData_by_traceId):
10 | """
11 |         Build the service invocation graph from the trace data
12 |         :param traceObjData_by_traceId:
13 |         :return: nodes and edges, a representation of the service invocation graph; traverse_initial_list, all call-initiating nodes
14 | """
15 | num_nodes = {} #{spanId:ServiceNode,spanId:ServiceNode}
16 | num_edges = [] #[[spanId,spanId],[spanId,spanId]]
17 | traverse_initial_list = []#[serviceId,serviceId]
18 | for key, value in traceObjData_by_traceId.items():
19 | for i in value:
20 | id = i.id
21 | pid = i.pid
22 | serviceId = i.serviceId
23 | serviceName = i.serviceName
24 | serviceType = i.serviceType
25 | if id not in num_nodes:
26 | num_nodes[id] = ServiceNode(serviceId,serviceName,serviceType)
27 | if pid and id and pid != -1:
28 | if [pid, id] not in num_edges:
29 | num_edges.append([pid, id])
30 | elif pid == -1:
31 | if serviceId not in traverse_initial_list:
32 | traverse_initial_list.append(serviceId)
33 |
34 |         # Convert to the actual service invocation graph data
35 | nodes = {} #{serviceId:ServiceNode,serviceId:ServiceNode}
36 | edges = [] #[[serviceId,serviceId],[serviceId,serviceId]]
37 |         # Replace the span-id based edges with concrete service IDs
38 | for num_edge in num_edges:
39 | if num_edge[0] in num_nodes and num_edge[1] in num_nodes:
40 | p_servceId = num_nodes[num_edge[0]].serviceId
41 | c_serviceId = num_nodes[num_edge[1]].serviceId
42 | if p_servceId!= c_serviceId and [p_servceId,c_serviceId] not in edges:
43 | edges.append([p_servceId,c_serviceId])
44 |         # Replace the span-id based nodes with concrete service IDs
45 | for key, value in num_nodes.items():
46 | if value.serviceId not in nodes:
47 | nodes[value.serviceId] = value
48 | return nodes,edges,traverse_initial_list
49 |
50 | @staticmethod
51 | def completion_serviceNode_deploymentData(deploymentObjData_by_sviid,nodes):
52 | """
53 |         Enrich the nodes of the service dependency graph with deployment information
54 |         :param deploymentObjData_by_sviid: deployment data keyed by serviceInstanceId
55 |         :param nodes: dict of nodes in the graph
56 |         :return: nodes
57 | """
58 | for key, value in nodes.items():
59 | if key in deploymentObjData_by_sviid.keys():
60 | value.hostId = deploymentObjData_by_sviid[key].hostId
61 | value.hostName = deploymentObjData_by_sviid[key].hostName
62 | value.dockerName = deploymentObjData_by_sviid[key].containerName
63 | value.dockerId = deploymentObjData_by_sviid[key].containerId
64 | return nodes
65 |
66 | @staticmethod
67 | def set_service_exception_info(nodes, data):
68 | """
69 |         Identify exceptional services and add the exception information to the nodes
70 |         :param nodes: dict of ServiceNode objects in the service dependency graph
71 |         :param data: InputData instance
72 |         :return: nodes, enriched with the exception flag
73 | """
74 | if nodes is None:
75 | return None
76 | exception_list_metric_belongTo = data.organization_exception_metricObjData_by_metricBelongTo()
77 | exception_list_log_belongTo = data.organization_exception_logObjData_by_logBelongTo()
78 | for key, serviceNode in nodes.items():
79 |             if (serviceNode.serviceId in exception_list_metric_belongTo or serviceNode.serviceId in exception_list_log_belongTo
80 |                     or serviceNode.hostId in exception_list_metric_belongTo or serviceNode.hostId in exception_list_log_belongTo
81 |                     or serviceNode.containerId in exception_list_metric_belongTo
82 |                     or serviceNode.containerId in exception_list_log_belongTo):
83 | serviceNode.isException = 1
84 | return nodes
85 |
86 | @staticmethod
87 | def location_root_service(graph, start_service_id, root_services):
88 | """
89 |         Locate root cause services, starting a traversal from the given node
90 |         :param graph:
91 |         :param start_service_id:
92 |         :param root_services:
93 |         :return: root_services, the root cause list after this traversal
94 | """
95 | queue = []
96 | queue.append(start_service_id)
97 | while (len(queue) > 0):
98 | cur_node_id = queue.pop(0)
99 | if IdentifyFaultService.is_root_service(graph, cur_node_id):
100 | if cur_node_id not in root_services:
101 | root_services[cur_node_id] = 1
102 | else:
103 | root_services[cur_node_id] = root_services[cur_node_id] + 1
104 | else:
105 | for chirld_id in graph[cur_node_id].childs:
106 | if graph[chirld_id].isException == 1:
107 | queue.append(chirld_id)
108 | return root_services
109 |
110 | @staticmethod
111 | def is_root_service(graph, service_id):
112 | is_root = True
113 | chirlds = graph[service_id].childs
114 | for chirld_id in chirlds:
115 | if graph[chirld_id].isException == 1:
116 | is_root = False
117 | if graph[service_id].isException == 0:
118 | is_root = False
119 | return is_root
120 |
121 | @staticmethod
122 | def get_fault_services_list_PR(graph, traverse_initial_list):
123 | """
124 |         Identify the faulty service list using the PageRank (PR) method
125 |         :param graph:
126 |         :param traverse_initial_list:
127 |         :return: final_root_services, the faulty service list {serviceId: score, serviceId: score}
128 | """
129 | if len(graph.nodes) == 0:
130 | return None
131 | dg = nx.DiGraph()
132 | for key, node in graph.nodes.items():
133 | dg.add_node(key)
134 | for edge in graph.edges:
135 | dg.add_edge(edge[0], edge[1])
136 | pr = PRIterator(dg)
137 | page_ranks = pr.page_rank()
138 | node_pr_sorted = sorted(page_ranks.items(), key=lambda x: x[1], reverse=True)
139 | root_services = {}
140 | for index, serviceId in enumerate(node_pr_sorted):
141 | if index < 3:
142 | root_services[serviceId[0]] = serviceId[1]
143 | return root_services
144 |
145 | @staticmethod
146 | def get_fault_services_list(graph, traverse_initial_list):
147 | """
148 |         Identify faulty services using breadth-first search on the graph
149 |         :param graph:
150 |         :param traverse_initial_list:
151 |         :return: final_root_services, the faulty service list {serviceId: count, serviceId: count}
152 | """
153 | construct_graph = graph.generate_invoke_graph_consturct()
154 | if len(construct_graph) == 0 or len(traverse_initial_list) == 0:
155 | return None
156 |         root_services = {}  # {serviceId: count, serviceId: count}
157 |         # Traverse the graph starting from each node in the initial traversal list
158 | for i in traverse_initial_list:
159 | root_services = IdentifyFaultService.location_root_service(construct_graph, i, root_services)
160 | return root_services
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/demo/hadoop_data_test.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | from service.fault_diagnosis_service import fault_diagmosis
4 | from utils.data_tools import is_number, utc_to_local
5 |
6 |
7 | def data_collection_process_json():
8 | """
9 |     Load the original trace data, deployment data, original metric data, original log data, exception metric data and exception log data from JSON files
10 |     Args:
11 |
12 |     Returns: the corresponding data loaded from the JSON files
13 | """
14 |
15 |     # Load the exception metric data
16 | f = open('../data/hadoop_data/exception_data/result-infi.json', 'r')
17 | original_exception_metric_data = json.load(f)
18 | exception_metric_data = original_exception_metric_data['3EUz83cBglWMAhILSC5I']
19 |     # Load the exception log data
20 | f = open('../data/hadoop_data/exception_data/result_log-infi.json', 'r')
21 | original_exception_log_data = json.load(f)
22 | exception_log_data = original_exception_log_data['3EUz83cBglWMAhILSC5I']
23 |     # Load the original metric, trace and log data
24 | f = open('../data/hadoop_data/3EUz83cBglWMAhILSC5I.json', 'r')
25 | original_data = json.load(f)
26 | items = original_data[0]['_source']['items']
27 | traces = original_data[0]['_source']['traces']
28 | logs = original_data[0]['_source']['logs']
29 |     # Load the original deployment data
30 | f = open('../data/hadoop_data/deployment_info.json', 'r')
31 | deployment_data = json.load(f)
32 | return exception_metric_data,exception_log_data,original_data,items,traces,logs,deployment_data
33 |
34 | def get_original_trace_data(traces):
35 | """
36 |     Convert traces into the target input format: each record contains id, pid, serviceId, serviceName, serviceType, startTime and traceId
37 |     The hadoop traces are grouped together by trace
38 |     :param traces:
39 |     :return: list in the target format
40 | """
41 | traceData = list()
42 | for trace in traces:
43 | i_dict = json.loads(trace)
44 | for index, span in enumerate(i_dict['data']['trace']['spans']):
45 | if span['parentSpanId'] != -1: continue
46 | else:
47 | if len(span['refs'])== 0:
48 | tmp_dict = {}
49 | tmp_dict['id'] = span['segmentId']
50 | tmp_dict['pid'] = -1
51 |                     tmp_dict['serviceId'] = span['serviceCode']  # serviceCode does not uniquely identify a service instance; needs further discussion
52 | tmp_dict['serviceName'] = span['serviceCode']
53 | tmp_dict['serviceType'] = span['type']
54 | tmp_dict['startTime'] = span['startTime']
55 | tmp_dict['traceId'] = span['traceId']
56 | traceData.append(tmp_dict)
57 | else:
58 | for k in span['refs']:
59 | tmp_dict = {}
60 | tmp_dict['id'] = span['segmentId']
61 | tmp_dict['pid'] = k['parentSegmentId']
62 |                     tmp_dict['serviceId'] = span['serviceCode']  # serviceCode does not uniquely identify a service instance; needs further discussion
63 | tmp_dict['serviceName'] = span['serviceCode']
64 | tmp_dict['serviceType'] = span['type']
65 | tmp_dict['startTime'] = span['startTime']
66 | tmp_dict['traceId'] = span['traceId']
67 | traceData.append(tmp_dict)
68 | return traceData
69 |
70 | def get_deployment_data(deployment_data):
71 | """
72 |     Convert the deployment data into the target input format: each record contains serviceInstanceId, serviceName, hostId, hostName, containerId and containerName
73 |     :param deployment_data:
74 |     :return: list in the target format
75 | """
76 | return deployment_data
77 |
78 | def get_original_metric_data(items):
79 | """
80 |     Convert the original metric data into the target input format: each record contains timestamp, metricId, metricName, value, units, metricBelongTo and metricBelongLevel
81 |     :param items:
82 |     :return: list in the target format
83 | """
84 | originalMetricData = list()
85 | for item_str in items:
86 | item = json.loads(item_str)
87 | metricId = item['id']
88 | metricName = item['name']
89 | units = item['units']
90 | applicationName = item['applicationName']
91 | metricBelongTo = None
92 | metricBelongLevel = None
93 | if applicationName == "Zabbix server":
94 | continue
95 | else:
96 | if applicationName is None:
97 | continue
98 | elif applicationName.startswith('Hadoop'):
99 | if applicationName == "Hadoop":
100 | name_split = metricName.split(':')
101 | metricBelongTo = name_split[0]
102 | metricBelongLevel = "service"
103 | else:
104 | applicationName_split = applicationName.split()
105 | metricBelongTo = applicationName_split[1]
106 | metricBelongLevel = "service"
107 | elif applicationName.startswith('Docker'):
108 | if applicationName == "Docker":
109 | continue
110 | else:
111 | applicationName_split = applicationName.split()
112 | metricBelongTo = applicationName_split[2][1:]
113 | metricBelongLevel = "docker"
114 | else:
115 | metricBelongTo = item['hostName']
116 | metricBelongLevel = "host"
117 | stampTimes = item['allClock']
118 | values = item['allValue']
119 | stampTimes_splits = stampTimes.strip().split(',')
120 | values_splits = values.strip().split(',')
121 | if len(stampTimes_splits) > 0 and len(stampTimes_splits) == len(values_splits):
122 | for index,value in enumerate(stampTimes_splits):
123 | tmp_metric = dict()
124 | tmp_metric['metricId'] = str(metricId)
125 | tmp_metric['metricName'] = metricName
126 | tmp_metric['metricBelongTo'] = metricBelongTo
127 | tmp_metric['metricBelongLevel'] = metricBelongLevel
128 | tmp_metric['units'] = units
129 | tmp_metric['timestamp'] = value
130 | if is_number(values_splits[index]):
131 | tmp_metric['value'] = float(values_splits[index])
132 | originalMetricData.append(tmp_metric)
133 | else:
134 | continue
135 | return originalMetricData
136 |
137 | def get_original_log_data(logs):
138 | """
139 |     Convert the original log data into the target input format: each record contains timestamp, logId, logMessage, logLevel, logBelongTo and logBelongLevel
140 |     :param logs:
141 |     :return: list in the target format
142 | """
143 | originalLogData = list()
144 | for key, logList in logs.items():
145 | if len(logList) == 0:
146 | continue
147 | logId = key
148 | for log_str in logList:
149 | log = json.loads(log_str)
150 | tmp_log = dict()
151 | tmp_log['logId'] = logId
152 | if logId == "stderr":
153 | tmp_log['logLevel'] = None
154 | tmp_log['logMessage'] = log['message']
155 | log_time = log['@timestamp']
156 | log_time = utc_to_local(log_time)
157 | tmp_log['timestamp'] = log_time
158 | else:
159 | if "level" in log:
160 | tmp_log['logLevel'] = log['level']
161 | else:
162 | tmp_log['logLevel'] = None
163 | tmp_log['logMessage'] = log['log_message']
164 | log_time = log['log_time']
165 | log_time = utc_to_local(log_time)
166 | tmp_log['timestamp'] = log_time
167 | if logId.startswith('hadoop'):
168 | logId_splits = logId.strip().split('-')
169 | logBelongTo = logId_splits[2]
170 | tmp_log['logBelongLevel'] = "service"
171 | if logBelongTo == "datanode":
172 | tmp_log['logBelongTo'] = "DataNode"
173 | elif logBelongTo == "namenode":
174 | tmp_log['logBelongTo'] = "NameNode"
175 | elif logBelongTo == "nodemanager":
176 | tmp_log['logBelongTo'] = "NodeManager"
177 | elif logBelongTo == "resourcemanager":
178 | tmp_log['logBelongTo'] = "ResourceManager"
179 | elif logBelongTo == "secondarynamenode":
180 | tmp_log['logBelongTo'] = "SecondaryNameNode"
181 | else:
182 | tmp_log['logBelongTo'] = log['host']['name']
183 | tmp_log['logBelongLevel'] = "host"
184 | originalLogData.append(tmp_log)
185 | return originalLogData
186 |
187 | def get_exception_metric_data(exception_metric_data):
188 | """
189 |     Convert the exception metric data into the input format: each record contains startTime, endTime, metricId, metricName, value, units, metricBelongTo and metricBelongLevel
190 |     :param exception_metric_data:
191 |     :return: list in the target format
192 | """
193 | exceptionMetricData = list()
194 | for ex_metric in exception_metric_data:
195 | tmp_dict = dict()
196 | metricId = ex_metric['metricId']
197 | metricName = ex_metric['metricName']
198 | units = None
199 | metricBelongTo = None
200 | metricBelongLevel = None
201 | applicationName = ex_metric['belongTo']
202 | if applicationName == "Zabbix server":
203 | continue
204 | else:
205 | if applicationName is None:
206 | continue
207 | elif applicationName.startswith('Hadoop'):
208 | if applicationName == "Hadoop":
209 | name_split = metricName.split(':')
210 | metricBelongTo = name_split[0]
211 | metricBelongLevel = "service"
212 | else:
213 | applicationName_split = applicationName.split()
214 | metricBelongTo = applicationName_split[1]
215 | metricBelongLevel = "service"
216 | elif applicationName.startswith('Docker'):
217 | if applicationName == "Docker":
218 | continue
219 | else:
220 | applicationName_split = applicationName.split()
221 | metricBelongTo = applicationName_split[2][1:]
222 | metricBelongLevel = "docker"
223 | else:
224 | # metricBelongTo = ex_metric['hostName']
225 | metricBelongTo = "fuguangbo-0002"
226 | metricBelongLevel = "host"
227 | stampTimes = ex_metric['testTime']
228 | values = ex_metric['value']
229 | stampTimes_splits = stampTimes.strip().split(',')
230 | values_splits = values.strip().split(',')
231 | if len(stampTimes_splits) > 0 and len(stampTimes_splits) == len(values_splits):
232 | for index, value in enumerate(stampTimes_splits):
233 | tmp_metric = dict()
234 | tmp_metric['metricId'] = str(metricId)
235 | tmp_metric['metricName'] = metricName
236 | tmp_metric['metricBelongTo'] = metricBelongTo
237 | tmp_metric['metricBelongLevel'] = metricBelongLevel
238 | tmp_metric['units'] = units
239 | tmp_metric['endTime'] = value
240 | tmp_metric['startTime'] = value
241 | if is_number(values_splits[index]):
242 | tmp_metric['value'] = float(values_splits[index])
243 | exceptionMetricData.append(tmp_metric)
244 | else:
245 | continue
246 | return exceptionMetricData
247 |
248 | def get_exception_log_data(exception_log_data):
249 | """
250 |     Convert the exception log data into the target input format: each record contains startTime, endTime, logId, logExceptionSegment, logBelongLevel and logBelongTo
251 |     :param exception_log_data:
252 |     :return: list in the target format
253 | """
254 | exceptionLogData = list()
255 | for logSegment in exception_log_data:
256 | tmp_log = dict()
257 | startTime = logSegment['testTime']
258 | endTime = logSegment['testTime']
259 | logId = logSegment['logId'].strip().split(':')[0]
260 | logExceptionSegment = logSegment['logDetail']
261 | if logId.startswith('hadoop'):
262 | logId_splits = logId.strip().split('-')
263 | logBelongTo = logId_splits[2]
264 | tmp_log['logBelongLevel'] = "service"
265 | if logBelongTo == "datanode":
266 | tmp_log['logBelongTo'] = "DataNode"
267 | elif logBelongTo == "namenode":
268 | tmp_log['logBelongTo'] = "NameNode"
269 | elif logBelongTo == "nodemanager":
270 | tmp_log['logBelongTo'] = "NodeManager"
271 | elif logBelongTo == "resourcemanager":
272 | tmp_log['logBelongTo'] = "ResourceManager"
273 | elif logBelongTo == "secondarynamenode":
274 | tmp_log['logBelongTo'] = "SecondaryNameNode"
275 | else:
276 | tmp_log['logBelongTo'] = logSegment["belongTo"].strip().split(':')[0]
277 | tmp_log['logBelongLevel'] = "host"
278 | tmp_log['startTime'] = startTime
279 | tmp_log['endTime'] = endTime
280 | tmp_log['logId'] = logId
281 | tmp_log['logExceptionSegment'] = logExceptionSegment
282 | exceptionLogData.append(tmp_log)
283 | return exceptionLogData
284 |
285 | if __name__ == '__main__':
286 | exception_metric_data, exception_log_data, original_data, items, traces, logs, deployment_data = data_collection_process_json()
287 | deploymentData = get_deployment_data(deployment_data)
288 | traceData = get_original_trace_data(traces)
289 | original_metricData = get_original_metric_data(items)
290 | original_logData = get_original_log_data(logs)
291 | exception_metricData = get_exception_metric_data(exception_metric_data)
292 | exception_logData = get_exception_log_data(exception_log_data)
293 | fault_diagmosis(deploymentData, traceData, original_metricData, original_logData, exception_metricData,
294 | exception_logData)
295 | pass
--------------------------------------------------------------------------------
/service/module_tools/input_data.py:
--------------------------------------------------------------------------------
1 | import json
2 | from typing import List
3 |
4 | from bean.input_model import DeploymentDataEntry, TraceDataEntry, OriginalMetricEntry, OriginalLogEntry, \
5 | ExceptionMetricEntry, ExceptionLogEntry
6 | import pandas as pd
7 |
8 |
9 | class InputData:
10 | def __init__(self, deploymentData: List, traceData: List, original_metricData: List, original_logData: List,
11 | exception_metricData: List, exception_logData: List):
12 | self.deploymentObjData = deploymentData_to_obj(deploymentData)
13 | self.traceObjData = traceData_to_obj(traceData)
14 | self.original_metricObjData = originalMetricData_to_obj(original_metricData)
15 | self.original_logObjData = originalLogData_to_obj(original_logData)
16 | self.exception_metricObjData = exceptionMetricData_to_obj(exception_metricData)
17 | self.exception_logObjData = exceptionLogData_to_obj(exception_logData)
18 |
19 | # self.traceObjData_by_traceId = None
20 | # self.deploymentObjData_by_sviid = None
21 | # self.original_metricObjData_by_metricId = None
22 | # self.original_logObjData_by_logId = None
23 | #
24 | # self.exception_metricObjData_by_metricBelongTo = None
25 | # self.exception_logObjData_by_logBelongTo = None
26 |
27 | def organization_deploymentObjData_by_sviid(self):
28 | """
29 | 组织部署数据dict,key为serviceInstanceId
30 | :param deploymentObjData:
31 | :return: 以serviceInstanceId为key的dict
32 | """
33 | # if deploymentObjData_by_sviid is not None: return self.deploymentObjData_by_sviid
34 | deploymentObjData_by_sviid = {}
35 | for i in self.deploymentObjData:
36 | deploymentObjData_by_sviid[i.serviceInstanceId] = i
37 | return deploymentObjData_by_sviid
38 |
39 | def organization_traceObjData_by_traceId(self):
40 | """
41 | 组织调用链数据dict,key为straceId
42 | :param traceObjData:
43 | :return: 以traceId为key的dict
44 | """
45 | # if self.traceObjData_by_traceId is not None: return self.traceObjData_by_traceId
46 | traceObjData_by_traceId = {}
47 | for i in self.traceObjData:
48 | if i.traceId not in traceObjData_by_traceId:
49 | traceObjData_by_traceId[i.traceId] = []
50 | traceObjData_by_traceId[i.traceId].append(i)
51 | else:
52 | traceObjData_by_traceId[i.traceId].append(i)
53 | return traceObjData_by_traceId
54 |
55 | def organization_original_metricObjData_by_metricId(self):
56 | """
57 | 组织原始指标数据dict,key为metricId
58 | :param original_metricObjData:
59 | :return:以metricId为key的dict
60 | """
61 | # if self.original_metricObjData_by_metricId is not None: return self.original_metricObjData_by_metricId
62 | original_metricObjData_by_metricId = {}
63 | for i in self.original_metricObjData:
64 | if i.metricId not in original_metricObjData_by_metricId:
65 | original_metricObjData_by_metricId[i.metricId] = []
66 | original_metricObjData_by_metricId[i.metricId].append(i)
67 | else:
68 | original_metricObjData_by_metricId[i.metricId].append(i)
69 | return original_metricObjData_by_metricId
70 |
71 | def organization_original_logObjData_by_logId(self):
72 | """
73 | 组织原始指标数据dict,key为logId
74 | :param original_logObjData:
75 | :return: 以logId为key的dict
76 | """
77 | # if self.original_logObjData_by_logId is not None: return self.original_logObjData_by_logId
78 | original_logObjData_by_logId = {}
79 | for i in self.original_logObjData:
80 | if i.logId not in original_logObjData_by_logId:
81 | original_logObjData_by_logId[i.logId] = []
82 | original_logObjData_by_logId[i.logId].append(i)
83 | else:
84 | original_logObjData_by_logId[i.logId].append(i)
85 | return original_logObjData_by_logId
86 |
87 | def get_target_exception_metric_data(self, exception_metricObjData):
88 | pass
89 |
90 | def get_target_exception_log_data(self, exception_logObjData):
91 | pass
92 |
93 | def organization_exception_metricObjData_by_metricBelongTo(self):
94 | """
95 | 组织异常指标数据dict,key为metricBelongTo
96 | :param exception_metricObjData:
97 | :return: 以metricBelongTo为key的dict
98 | """
99 | # if self.exception_metricObjData_by_metricBelongTo is not None: return self.exception_metricObjData_by_metricBelongTo
100 | exception_metricObjData_by_metricBelongTo = {}
101 | for i in self.exception_metricObjData:
102 | if i.metricBelongTo not in exception_metricObjData_by_metricBelongTo:
103 | exception_metricObjData_by_metricBelongTo[i.metricBelongTo] = []
104 | exception_metricObjData_by_metricBelongTo[i.metricBelongTo].append(i)
105 | return exception_metricObjData_by_metricBelongTo
106 |
107 | def organization_exception_logObjData_by_logBelongTo(self):
108 | """
109 | 组织异常日志数据dict,key为logBelongTo
110 | :param exception_logObjData:
111 | :return: 以logBelongTo为key的dict
112 | """
113 | # if self.exception_logObjData_by_logBelongTo is not None: return self.exception_logObjData_by_logBelongTo
114 | exception_logObjData_by_logBelongTo = {}
115 | for i in self.exception_logObjData:
116 | if i.logBelongTo not in exception_logObjData_by_logBelongTo:
117 | exception_logObjData_by_logBelongTo[i.logBelongTo] = []
118 | exception_logObjData_by_logBelongTo[i.logBelongTo].append(i)
119 | return exception_logObjData_by_logBelongTo
120 |
121 | def get_fault_service_related_log_metric_data(self, serviceId, containerId=None, hostId=None):
122 | exception_metrics_service_related = dict()
123 | exception_logs_service_related = dict()
124 | if serviceId is None: return exception_metrics_service_related, exception_logs_service_related
125 |
126 | if serviceId and serviceId in self.organization_exception_metricObjData_by_metricBelongTo().keys():
127 | service_exception_metrics_list = self.organization_exception_metricObjData_by_metricBelongTo()[serviceId]
128 | for i in service_exception_metrics_list:
129 | if i.metricId not in exception_metrics_service_related:
130 | exception_metrics_service_related[i.metricId] = []
131 | exception_metrics_service_related[i.metricId].append(i)
132 | else:
133 | exception_metrics_service_related[i.metricId].append(i)
134 | if hostId and hostId in self.organization_exception_metricObjData_by_metricBelongTo().keys():
135 | ssss = self.organization_exception_metricObjData_by_metricBelongTo()
136 | host_exception_metrics_list = self.organization_exception_metricObjData_by_metricBelongTo()[hostId]
137 | for i in host_exception_metrics_list:
138 | if i.metricId not in exception_metrics_service_related:
139 | exception_metrics_service_related[i.metricId] = []
140 | exception_metrics_service_related[i.metricId].append(i)
141 | else:
142 | exception_metrics_service_related[i.metricId].append(i)
143 | if containerId and containerId in self.organization_exception_metricObjData_by_metricBelongTo().keys():
144 | docker_exception_metrics_list = self.organization_exception_metricObjData_by_metricBelongTo()[containerId]
145 | for i in docker_exception_metrics_list:
146 | if i.metricId not in exception_metrics_service_related:
147 | exception_metrics_service_related[i.metricId] = []
148 | exception_metrics_service_related[i.metricId].append(i)
149 | else:
150 | exception_metrics_service_related[i.metricId].append(i)
151 |         # Collect the related exception log lists
152 | if serviceId and serviceId in self.organization_exception_logObjData_by_logBelongTo().keys():
153 | service_exception_logs_list = self.organization_exception_logObjData_by_logBelongTo()[serviceId]
154 | for i in service_exception_logs_list:
155 | if i.logId not in exception_logs_service_related:
156 | exception_logs_service_related[i.logId] = []
157 | exception_logs_service_related[i.logId].append(i)
158 | else:
159 | exception_logs_service_related[i.logId].append(i)
160 | if hostId and hostId in self.organization_exception_logObjData_by_logBelongTo().keys():
161 | host_exception_logs_list = self.organization_exception_logObjData_by_logBelongTo()[hostId]
162 | for i in host_exception_logs_list:
163 | if i.logId not in exception_logs_service_related:
164 | exception_logs_service_related[i.logId] = []
165 | exception_logs_service_related[i.logId].append(i)
166 | else:
167 | exception_logs_service_related[i.logId].append(i)
168 | if containerId and containerId in self.organization_exception_logObjData_by_logBelongTo().keys():
169 | docker_exception_logs_list = self.organization_exception_logObjData_by_logBelongTo()[containerId]
170 | for i in docker_exception_logs_list:
171 | if i.logId not in exception_logs_service_related:
172 | exception_logs_service_related[i.logId] = []
173 | exception_logs_service_related[i.logId].append(i)
174 | else:
175 | exception_logs_service_related[i.logId].append(i)
176 | return exception_metrics_service_related, exception_logs_service_related
177 |
178 | def get_PC_input_data(self, exception_metrics, exception_logs):
179 | """
180 | 原始数据预处理,得到PC算法输入格式
181 | :param exception_metrics:
182 | :param exception_logs:
183 | :return:
184 | """
185 | metric_input = None
186 | for key, value in exception_metrics.items():
187 | metric_data = [i.__dict__ for i in self.organization_original_metricObjData_by_metricId()[key]]
188 | # df = pd.DataFrame(metric_data)
189 | df = pd.read_json(json.dumps(metric_data), orient='records')
190 | if df.empty == False:
191 | metric_input_tmp = df[['metricId', 'timestamp', 'value']].groupby(
192 | ['metricId', 'timestamp']).agg('mean')
193 | metric_input_tmp = metric_input_tmp.pivot_table(index='timestamp', columns='metricId', values='value')
194 | if metric_input is None:
195 | metric_input = metric_input_tmp
196 | else:
197 | metric_input = pd.concat([metric_input, metric_input_tmp], axis=1, sort=True)
198 | log_input = None
199 | for key, value in exception_logs.items():
200 | log_data = self.organization_original_logObjData_by_logId()
201 | log_data = self.organization_original_logObjData_by_logId()[key]
202 | log_data = [i.__dict__ for i in self.organization_original_logObjData_by_logId()[key]]
203 | # df = pd.DataFrame(log_data)
204 | df = pd.read_json(json.dumps(log_data), orient='records')
205 | if df.empty == False:
206 | log_input_tmp = df[['logId', 'timestamp', 'logMessage']].groupby(
207 | ['logId', 'timestamp']).agg('count')
208 | log_input_tmp = log_input_tmp.pivot_table(index='timestamp', columns='logId', values='logMessage')
209 | if log_input is None:
210 | log_input = log_input_tmp
211 | else:
212 | log_input = pd.concat([log_input, log_input_tmp], axis=1)
213 | pc_input = pd.concat([metric_input, log_input], axis=1)
214 | pc_input.fillna(method='pad', axis=0, inplace=True)
215 | pc_input.fillna(method='backfill', axis=0, inplace=True)
216 | pc_input[pc_input == 0] = 0.00001
217 | return pc_input
218 |
219 |
220 | def deploymentData_to_obj(deploymentData):
221 | deploymentObjData = list()
222 | if deploymentData is None: return deploymentObjData
223 | for data in deploymentData:
224 | tmp_obj = DeploymentDataEntry(data['serviceInstanceId'], data['serviceName'], data['hostId'], data['hostName'],
225 | data['containerId'], data['containerName'])
226 | deploymentObjData.append(tmp_obj)
227 | return deploymentObjData
228 |
229 |
230 | def traceData_to_obj(traceData):
231 | traceObjData = list()
232 | if traceData is None: return traceObjData
233 | for data in traceData:
234 | tmp_obj = TraceDataEntry(data['id'], data['pid'], data['serviceId'], data['traceId'], data['serviceName'],
235 | data['serviceType'], data['startTime'])
236 | traceObjData.append(tmp_obj)
237 | return traceObjData
238 |
239 |
240 | def originalMetricData_to_obj(original_metricData):
241 | original_metricObjData = list()
242 | if original_metricData is None: return original_metricObjData
243 | for data in original_metricData:
244 | tmp_obj = OriginalMetricEntry(data['metricId'], data['metricName'], data['timestamp'], data['value'],
245 | data['metricBelongTo'], data['units'], data['metricBelongLevel'])
246 | original_metricObjData.append(tmp_obj)
247 | return original_metricObjData
248 |
249 |
250 | def originalLogData_to_obj(original_logData):
251 | original_logObjData = list()
252 | if original_logData is None: return original_logObjData
253 | for data in original_logData:
254 | tmp_obj = OriginalLogEntry(data['logId'], data['timestamp'], data['logMessage'], data['logBelongTo'],
255 | data['logLevel'], data['logBelongLevel'])
256 | original_logObjData.append(tmp_obj)
257 | return original_logObjData
258 |
259 |
260 | def exceptionMetricData_to_obj(exception_metricData):
261 | exception_metricObjData = list()
262 | if exception_metricData is None: return exception_metricObjData
263 | for data in exception_metricData:
264 | tmp_obj = ExceptionMetricEntry(data['startTime'], data['endTime'], data['metricId'], data['metricName'],
265 | data['value'], data['metricBelongTo'], data['units'], data['metricBelongLevel'])
266 | exception_metricObjData.append(tmp_obj)
267 | return exception_metricObjData
268 |
269 |
270 | def exceptionLogData_to_obj(exception_logData):
271 | exception_logObjData = list()
272 | if exception_logData is None: return exception_logObjData
273 | for data in exception_logData:
274 | tmp_obj = ExceptionLogEntry(data['startTime'], data['endTime'], data['logId'], data['logBelongTo'],
275 | data['logExceptionSegment'], data['logBelongLevel'])
276 | exception_logObjData.append(tmp_obj)
277 | return exception_logObjData
278 |
279 |
280 | if __name__ == '__main__':
281 | metric1 = OriginalMetricEntry("1", "1", "1", 1.0, "1", "1", "1")
282 | metric2 = OriginalMetricEntry("2", "2", "2", 1.0, "2", "2", "2")
283 |     metric_list = [metric1.__dict__, metric2.__dict__]
284 |     a = json.dumps(metric_list)
285 | df = pd.read_json(a, orient='records')
286 | pass
287 |
--------------------------------------------------------------------------------
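The preprocessing above pivots exception metrics (mean value per timestamp) and exception logs (count per timestamp) into one wide matrix that the PC algorithm consumes. A minimal sketch of that transformation on synthetic records (field names follow OriginalMetricEntry/OriginalLogEntry; the ids and values are made up, and a plain pivot_table is used in place of the module's groupby + pivot, which has the same effect):

import pandas as pd

metric_records = [
    {"metricId": "cpu_usage", "timestamp": 1, "value": 0.2},
    {"metricId": "cpu_usage", "timestamp": 2, "value": 0.9},
    {"metricId": "mem_usage", "timestamp": 1, "value": 0.5},
]
log_records = [
    {"logId": "log_a", "timestamp": 2, "logMessage": "Exception ..."},
    {"logId": "log_a", "timestamp": 2, "logMessage": "Exception ..."},
]

# one column per metricId, mean value per timestamp
metric_input = pd.DataFrame(metric_records).pivot_table(
    index="timestamp", columns="metricId", values="value", aggfunc="mean")
# one column per logId, number of exception log lines per timestamp
log_input = pd.DataFrame(log_records).pivot_table(
    index="timestamp", columns="logId", values="logMessage", aggfunc="count")

pc_input = pd.concat([metric_input, log_input], axis=1)
pc_input.fillna(method="pad", inplace=True)       # forward-fill gaps along the time axis
pc_input.fillna(method="backfill", inplace=True)  # then backward-fill any leading gap
print(pc_input)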
/utils/pcalg.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | """A graph generator based on the PC algorithm [Kalisch2007].
5 |
6 | [Kalisch2007] Markus Kalisch and Peter Bühlmann. Estimating
7 | high-dimensional directed acyclic graphs with the pc-algorithm. In The
8 | Journal of Machine Learning Research, Vol. 8, pp. 613-636, 2007.
9 | """
10 | from __future__ import print_function
11 |
12 | import logging
13 | import math
14 | from itertools import combinations, permutations
15 | import pandas as pd
16 | import networkx as nx
17 | import numpy as np
18 | from gsq.ci_tests import ci_test_bin, ci_test_dis
19 | from gsq.gsq_testdata import bin_data, dis_data
20 | # from networkx.drawing.tests.test_pylab import plt
21 | from scipy.stats import norm
22 | import matplotlib.pyplot as plt
23 |
24 | from utils.pageRank import PRIterator
25 |
26 | _logger = logging.getLogger(__name__)
27 |
28 |
29 | # Conditional-independence test via partial correlation and Fisher's z-transform
30 | def gaussCItest(suffstat, x, y, S):
31 | S = list(S)
32 | C = pd.DataFrame(suffstat).astype(float).corr().values
33 | n = pd.DataFrame(suffstat).values.shape[0]
34 |
35 | cut_at = 0.9999999
36 |
37 |     # partial correlation coefficient
38 |     # S is empty: use the plain correlation
39 | if len(S) == 0:
40 | r = C[x, y]
41 |
42 |     # S contains a single node: first-order partial correlation
43 | elif len(S) == 1:
44 | a = (C[x, y] - C[x, S] * C[y, S])
45 | try:
46 | b = math.sqrt((1 - math.pow(C[y, S], 2)) * (1 - math.pow(C[x, S], 2)))
47 | r = a / b
48 |         except (ValueError, ZeroDivisionError, TypeError):
49 | r = C[x, y]
50 |     # higher-order case: partial correlation from the pseudo-inverse of the correlation submatrix (as in the R pcalg package)
51 | else:
52 | m = C[np.ix_([x] + [y] + S, [x] + [y] + S)]
53 | PM = np.linalg.pinv(m)
54 | r = -1 * PM[0, 1] / math.sqrt(abs(PM[0, 0] * PM[1, 1]))
55 |
56 | r = min(cut_at, max(-1 * cut_at, r))
57 | # Fisher’s z-transform
58 | res = math.sqrt(n - len(S) - 3) * .5 * math.log1p((2 * r) / (1 - r))
59 |     # two-sided p-value: 2 * (1 - Φ(|z|))
60 | return 2 * (1 - norm.cdf(abs(res)))
61 |
62 |
63 | def _create_complete_graph(node_ids):
64 | """
65 |     Create a complete undirected graph from the list of node ids
66 |     (every pair of nodes is connected by an edge).
67 |
68 | Args:
69 | node_ids: a list of node ids
70 |
71 | Returns:
72 | An undirected graph (as a networkx.Graph)
73 | """
74 | g = nx.Graph()
75 | g.add_nodes_from(node_ids)
76 | for (i, j) in combinations(node_ids, 2):
77 | g.add_edge(i, j)
78 | return g
79 |
80 |
81 | def estimate_skeleton(indep_test_func, data_matrix, alpha, **kwargs):
82 | """
83 |     Estimate the skeleton graph in two steps:
84 |     1. start from a complete undirected graph over all variables;
85 |     2. test each remaining edge for conditional independence and delete it when the p-value exceeds alpha.
86 | Estimate a skeleton graph from the statistical information.
87 |
88 | Args:
89 |         indep_test_func: the conditional-independence test function
90 | the function name for a conditional independency test.
91 | data_matrix: data (as a numpy array).
92 | alpha: the significance level.
93 | kwargs:
94 | 'max_reach': maximum value of l (see the code). The
95 | value depends on the underlying distribution.
96 | 'method': if 'stable' given, use stable-PC algorithm
97 | (see [Colombo2014]).
98 | 'init_graph': initial structure of skeleton graph
99 | (as a networkx.Graph). If not specified,
100 | a complete graph is used.
101 | other parameters may be passed depending on the
102 | indep_test_func()s.
103 | Returns:
104 | g: a skeleton graph (as a networkx.Graph).
105 | sep_set: a separation set (as an 2D-array of set()).
106 |
107 | [Colombo2014] Diego Colombo and Marloes H Maathuis. Order-independent
108 | constraint-based causal structure learning. In The Journal of Machine
109 | Learning Research, Vol. 15, pp. 3741-3782, 2014.
110 | """
111 |
112 | def method_stable(kwargs):
113 | return ('method' in kwargs) and kwargs['method'] == "stable"
114 |
115 | node_ids = range(data_matrix.shape[1])
116 | node_size = data_matrix.shape[1]
117 | sep_set = [[set() for i in range(node_size)] for j in range(node_size)]
118 | if 'init_graph' in kwargs:
119 | g = kwargs['init_graph']
120 | if not isinstance(g, nx.Graph):
121 | raise ValueError
122 | elif not g.number_of_nodes() == len(node_ids):
123 | raise ValueError('init_graph not matching data_matrix shape')
124 | for (i, j) in combinations(node_ids, 2):
125 | if not g.has_edge(i, j):
126 | sep_set[i][j] = None
127 | sep_set[j][i] = None
128 | else:
129 |         # build a complete undirected graph
130 | g = _create_complete_graph(node_ids)
131 |
132 | l = 0
133 | while True:
134 | cont = False
135 | remove_edges = []
136 |         # iterate over ordered pairs (permutations), so each edge i-j is examined from both endpoints' neighborhoods
137 | for (i, j) in permutations(node_ids, 2):
138 |             # neighbors of i in the current graph
139 | adj_i = list(g.neighbors(i))
140 |             # skip if j is not adjacent to i; otherwise drop j from the candidate conditioning nodes
141 | if j not in adj_i:
142 | continue
143 | else:
144 | adj_i.remove(j)
145 | # The process stops if all neighborhoods in the current graph are smaller than the size of the conditional set.
146 | if len(adj_i) >= l:
147 |                 # run conditional-independence tests for the edge i-j
148 |                 _logger.debug('testing %s and %s' % (i, j))
149 |                 # neighborhood of i, with j removed
150 |                 _logger.debug('neighbors of %s are %s' % (i, str(adj_i)))
151 | if len(adj_i) < l:
152 | continue
153 |                 # if some size-l conditioning set k drawn from adj_i makes i and j conditionally independent, remove the edge i-j
154 | for k in combinations(adj_i, l):
155 | _logger.debug('indep prob of %s and %s with subset %s'
156 | % (i, j, str(k)))
157 |                     # p-value of the conditional-independence test (gaussCItest is used here in place of indep_test_func)
158 | # p_val = indep_test_func(data_matrix, i, j, set(k), **kwargs)
159 | p_val = gaussCItest(data_matrix, i, j, set(k))
160 |                     _logger.debug('independence-test p-value: %s' % str(p_val))
161 |                     # if the p-value exceeds alpha, remove the edge between i and j
162 | if p_val > alpha:
163 | if g.has_edge(i, j):
164 |                             _logger.debug('p: remove edge (%s, %s)' % (i, j))
165 | if method_stable(kwargs):
166 | remove_edges.append((i, j))
167 | else:
168 | g.remove_edge(i, j)
169 |                             # record the separating set k for both orderings; edges are still undirected at this stage
170 | sep_set[i][j] |= set(k)
171 | sep_set[j][i] |= set(k)
172 | break
173 | cont = True
174 | l += 1
175 | if method_stable(kwargs):
176 | g.remove_edges_from(remove_edges)
177 | if cont is False:
178 | break
179 | if ('max_reach' in kwargs) and (l > kwargs['max_reach']):
180 | break
181 |
182 | return (g, sep_set)
183 |
184 |
185 | def estimate_cpdag(skel_graph, sep_set):
186 | """
187 |
188 | Estimate a CPDAG from the skeleton graph and separation sets
189 | returned by the estimate_skeleton() function.
190 |
191 | Args:
192 | skel_graph: A skeleton graph (an undirected networkx.Graph).
193 | sep_set: An 2D-array of separation set.
194 | The contents look like something like below.
195 | sep_set[i][j] = set([k, l, m])
196 |
197 | Returns:
198 | An estimated DAG.
199 | """
200 |     # turn the skeleton into a directed graph (each undirected edge becomes a pair of arcs)
201 | dag = skel_graph.to_directed()
202 | node_ids = skel_graph.nodes()
203 |     # iterate over all unordered pairs (i, j)
204 | for (i, j) in combinations(node_ids, 2):
205 |         # look for v-structures i -> k <- j
206 | adj_i = set(dag.successors(i))
207 | if j in adj_i:
208 | continue
209 | adj_j = set(dag.successors(j))
210 | if i in adj_j:
211 | continue
212 |         # skip pairs whose separating set was never computed (no edge in init_graph)
213 | if sep_set[i][j] is None:
214 | continue
215 |         # candidate k nodes adjacent to both i and j
216 | common_k = adj_i & adj_j
217 | for k in common_k:
218 |             # orient a v-structure only when k is NOT in the separating set of (i, j); sep_set is symmetric, so one lookup suffices
219 | if k not in sep_set[i][j]:
220 |                 # drop the arc k -> i so that only i -> k remains
221 | if dag.has_edge(k, i):
222 |                     _logger.debug('S: remove edge (%s, %s)' % (k, i))
223 | dag.remove_edge(k, i)
224 |                 # likewise for the arc k -> j
225 | if dag.has_edge(k, j):
226 | _logger.debug('S: remove edge (%s, %s)' % (k, j))
227 | dag.remove_edge(k, j)
228 |
229 | def _has_both_edges(dag, i, j):
230 | return dag.has_edge(i, j) and dag.has_edge(j, i)
231 |
232 | def _has_any_edge(dag, i, j):
233 | return dag.has_edge(i, j) or dag.has_edge(j, i)
234 |
235 | def _has_one_edge(dag, i, j):
236 | return ((dag.has_edge(i, j) and (not dag.has_edge(j, i))) or
237 | (not dag.has_edge(i, j)) and dag.has_edge(j, i))
238 |
239 | def _has_no_edge(dag, i, j):
240 | return (not dag.has_edge(i, j)) and (not dag.has_edge(j, i))
241 |
242 | # For all the combination of nodes i and j, apply the following
243 | # rules.
244 |     # repeatedly apply the three orientation rules until the graph stops changing
245 | old_dag = dag.copy()
246 | while True:
247 |         # iterate over all unordered pairs (i, j)
248 | for (i, j) in combinations(node_ids, 2):
249 | # Rule 1: Orient i-j into i->j whenever there is an arrow k->i
250 | # such that k and j are nonadjacent.
251 | #
252 | # Check if i-j.
253 |             # check whether the edge i-j is still undirected
254 | if _has_both_edges(dag, i, j):
255 | # Look all the predecessors of i.
256 | for k in dag.predecessors(i):
257 | # Skip if there is an arrow i->k.
258 | if dag.has_edge(i, k):
259 | continue
260 | # Skip if k and j are adjacent.
261 | if _has_any_edge(dag, k, j):
262 | continue
263 | # Make i-j into i->j
264 | _logger.debug('R1: remove edge (%s, %s)' % (j, i))
265 | dag.remove_edge(j, i)
266 | break
267 |
268 | # Rule 2: Orient i-j into i->j whenever there is a chain
269 | # i->k->j.
270 | #
271 | # Check if i-j.
272 | if _has_both_edges(dag, i, j):
273 | # Find nodes k where k is i->k.
274 | succs_i = set()
275 | for k in dag.successors(i):
276 | if not dag.has_edge(k, i):
277 | succs_i.add(k)
278 | # Find nodes j where j is k->j.
279 | preds_j = set()
280 | for k in dag.predecessors(j):
281 | if not dag.has_edge(j, k):
282 | preds_j.add(k)
283 | # Check if there is any node k where i->k->j.
284 | if len(succs_i & preds_j) > 0:
285 | # Make i-j into i->j
286 | _logger.debug('R2: remove edge (%s, %s)' % (j, i))
287 | dag.remove_edge(j, i)
288 |
289 | # Rule 3: Orient i-j into i->j whenever there are two chains
290 | # i-k->j and i-l->j such that k and l are nonadjacent.
291 | #
292 | # Check if i-j.
293 | if _has_both_edges(dag, i, j):
294 | # Find nodes k where i-k.
295 | adj_i = set()
296 | for k in dag.successors(i):
297 | if dag.has_edge(k, i):
298 | adj_i.add(k)
299 | # For all the pairs of nodes in adj_i,
300 | for (k, l) in combinations(adj_i, 2):
301 | # Skip if k and l are adjacent.
302 | if _has_any_edge(dag, k, l):
303 | continue
304 | # Skip if not k->j.
305 | if dag.has_edge(j, k) or (not dag.has_edge(k, j)):
306 | continue
307 | # Skip if not l->j.
308 | if dag.has_edge(j, l) or (not dag.has_edge(l, j)):
309 | continue
310 | # Make i-j into i->j.
311 | _logger.debug('R3: remove edge (%s, %s)' % (j, i))
312 | dag.remove_edge(j, i)
313 | break
314 |
315 | # Rule 4: Orient i-j into i->j whenever there are two chains
316 | # i-k->l and k->l->j such that k and j are nonadjacent.
317 | #
318 | # However, this rule is not necessary when the PC-algorithm
319 | # is used to estimate a DAG.
320 |
321 | if nx.is_isomorphic(dag, old_dag):
322 | break
323 | old_dag = dag.copy()
324 |
325 | return dag
326 |
327 |
328 | def construct_service_dependency_diagram(b):
329 |     data = np.array(b.iloc[:, :])
330 |     columns = list(b.columns)
331 | columns_mapping = {i: str(column) for i, column in enumerate(columns)}
332 |
333 | (g, sep_set) = estimate_skeleton(indep_test_func=ci_test_dis,
334 | data_matrix=data,
335 | alpha=0.05)
336 | g = estimate_cpdag(skel_graph=g, sep_set=sep_set)
337 | return g, columns_mapping
338 |
339 |
340 | def get_root_cause(g):
341 | """
342 |     Derive the list of root-cause candidates from the causal graph.
343 |     Args:
344 |         g: causal graph (edges point from cause to effect)
345 |
346 |     Returns: list of root-cause node ids
347 |
348 | """
349 | result = list()
350 | node_ids = g.nodes()
351 |     # find the node with the most direct causes (predecessors)
352 | max_pre_node, max_pre_size = None, 0
353 | for node_id in node_ids:
354 | if len(list(g.predecessors(node_id))) > max_pre_size:
355 | max_pre_node = node_id
356 | max_pre_size = len(list(g.predecessors(node_id)))
357 |     # breadth-first traversal upstream from that node
358 | node_filter, node_queue = {max_pre_node}, list([max_pre_node])
359 | while node_queue:
360 | node_now = node_queue.pop(0)
361 |         if not list(g.predecessors(node_now)):
362 | result.append(node_now)
363 | continue
364 | is_pre_not_filter = False
365 | for k in g.predecessors(node_now):
366 | if k not in node_filter:
367 | is_pre_not_filter = True
368 | node_filter.add(k)
369 | node_queue.append(k)
370 |         # if every predecessor has already been visited, keep this node so the result is never empty
371 | if not is_pre_not_filter:
372 | result.append(node_now)
373 | return result
374 |
375 |
376 | if __name__ == '__main__':
377 |
378 |     # enable debug logging (do not comment this out)
379 | ch = logging.StreamHandler()
380 | ch.setLevel(logging.DEBUG)
381 | _logger.setLevel(logging.DEBUG)
382 | _logger.addHandler(ch)
383 |
384 |     # mock raw data: binary test data shipped with the gsq package
385 | dm = np.array(bin_data).reshape((5000, 5))
386 | (g, sep_set) = estimate_skeleton(indep_test_func=ci_test_bin,
387 | data_matrix=dm,
388 | alpha=0.01)
389 | #
390 | g = estimate_cpdag(skel_graph=g, sep_set=sep_set)
391 | g_answer = nx.DiGraph()
392 | g_answer.add_nodes_from([0, 1, 2, 3, 4])
393 | g_answer.add_edges_from([(0, 1), (2, 3), (3, 2), (3, 1),
394 | (2, 4), (4, 2), (4, 1)])
395 | print('Edges are:', g.edges(), end='')
396 | if nx.is_isomorphic(g, g_answer):
397 | print(' => GOOD')
398 | else:
399 | print(' => WRONG')
400 | print('True edges should be:', g_answer.edges())
401 |
402 |     # a second mock data set with more levels per variable
403 | dm = np.array(dis_data).reshape((10000, 5))
404 | (g, sep_set) = estimate_skeleton(indep_test_func=ci_test_dis,
405 | data_matrix=dm,
406 | alpha=0.01,
407 | levels=[3, 2, 3, 4, 2])
408 | g = estimate_cpdag(skel_graph=g, sep_set=sep_set)
409 |     nx.draw(g, pos=nx.spring_layout(g),  # layout options include spring_layout, random_layout, circular_layout, shell_layout
410 |             node_color='g',  # node_color sets the node colour (r/g/b/y/k/w); edge_color works the same way
411 | edge_color='r',
412 | with_labels=True)
413 | plt.show()
414 |
415 | pr = PRIterator(g)
416 | page_ranks = pr.page_rank()
417 | print("The final page rank is\n", page_ranks)
418 |
419 | g_answer = nx.DiGraph()
420 | g_answer.add_nodes_from([0, 1, 2, 3, 4])
421 | g_answer.add_edges_from([(0, 2), (1, 2), (1, 3), (4, 3)])
422 | print('Edges are:', g.edges(), end='')
423 | if nx.is_isomorphic(g, g_answer):
424 | print(' => GOOD')
425 | else:
426 | print(' => WRONG')
427 | print('True edges should be:', g_answer.edges())
428 |
--------------------------------------------------------------------------------
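estimate_skeleton and estimate_cpdag produce the causal graph, and get_root_cause then walks it upstream from the node with the most direct causes. A minimal usage sketch on a hand-built graph (the service names are hypothetical; importing utils.pcalg also assumes matplotlib is installed, since the module imports matplotlib.pyplot at the top level even though it is not listed in requirements.txt):

import networkx as nx
from utils.pcalg import get_root_cause

# edges point from cause to effect: db and cache both drive api; api and mq drive frontend
g = nx.DiGraph()
g.add_edges_from([("db", "api"), ("cache", "api"), ("api", "frontend"), ("mq", "frontend")])

# "api" has the most direct causes, so the upstream search starts there
# and returns its predecessor-free ancestors: ['db', 'cache']
print(get_root_cause(g))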
/utils/process_aiops2020_data_to_original.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import csv
4 | import json
5 |
6 | def get_original_trace_data():
7 | trace_csf_path = '../data/aiops_data_2020/2020_04_11/调用链指标/trace_csf.csv'
8 | trace_fly_remote_path = '../data/aiops_data_2020/2020_04_11/调用链指标/trace_fly_remote.csv'
9 | trace_jdbc_path = '../data/aiops_data_2020/2020_04_11/调用链指标/trace_jdbc.csv'
10 | trace_local_path = '../data/aiops_data_2020/2020_04_11/调用链指标/trace_local.csv'
11 | trace_osb_path = '../data/aiops_data_2020/2020_04_11/调用链指标/trace_osb.csv'
12 | trace_remote_process_path = '../data/aiops_data_2020/2020_04_11/调用链指标/trace_remote_process.csv'
13 |
14 | result = {}
15 | with open(trace_csf_path, 'r') as f:
16 | reader = csv.DictReader(f)
17 | for row in reader:
18 | tmp = {}
19 | traceId = row['traceId']
20 | id = row['id']
21 | pid = row['pid']
22 | # serviceId = row['cmdb_id'] + ':' + row['serviceName']
23 | serviceType = row['callType']
24 | tmp['id'] = id
25 | tmp['pid'] = pid
26 | tmp['serviceId'] = None
27 | tmp['cmdb_id'] = row['cmdb_id']
28 | tmp['serviceType'] = serviceType
29 | tmp['serviceName'] = row['serviceName']
30 | tmp['startTime'] = row['startTime']
31 | if traceId not in result:
32 | result[traceId] = []
33 | result[traceId].append(tmp)
34 | else:
35 | result[traceId].append(tmp)
36 |
37 | with open(trace_jdbc_path, 'r') as f:
38 | reader = csv.DictReader(f)
39 | for row in reader:
40 | tmp = {}
41 | traceId = row['traceId']
42 | id = row['id']
43 | pid = row['pid']
44 | # serviceId = row['cmdb_id'] + ':' + row['dsName']
45 | serviceType = row['callType']
46 | tmp['id'] = id
47 | tmp['pid'] = pid
48 | tmp['serviceId'] = row['dsName']
49 | tmp['cmdb_id'] = row['cmdb_id']
50 | tmp['serviceType'] = serviceType
51 | tmp['serviceName'] = row['dsName']
52 | tmp['startTime'] = row['startTime']
53 | if traceId not in result:
54 | result[traceId] = []
55 | result[traceId].append(tmp)
56 | else:
57 | result[traceId].append(tmp)
58 |
59 | with open(trace_local_path, 'r') as f:
60 | reader = csv.DictReader(f)
61 | for row in reader:
62 | tmp = {}
63 | traceId = row['traceId']
64 | id = row['id']
65 | pid = row['pid']
66 | # serviceId = row['cmdb_id'] + ':' + row['dsName']
67 | serviceType = row['callType']
68 | tmp['id'] = id
69 | tmp['pid'] = pid
70 | tmp['serviceId'] = row['dsName']
71 | tmp['cmdb_id'] = row['cmdb_id']
72 | tmp['serviceType'] = serviceType
73 | tmp['serviceName'] = row['dsName']
74 | tmp['startTime'] = row['startTime']
75 | if traceId not in result:
76 | result[traceId] = []
77 |                 result[traceId].append(tmp)
78 | else:
79 | result[traceId].append(tmp)
80 |
81 | with open(trace_osb_path, 'r') as f:
82 | reader = csv.DictReader(f)
83 | for row in reader:
84 | tmp = {}
85 | traceId = row['traceId']
86 | id = row['id']
87 | pid = row['pid']
88 | serviceId = row['cmdb_id'] + ':' + row['serviceName']
89 | serviceType = row['callType']
90 | tmp['id'] = id
91 | tmp['pid'] = pid
92 | tmp['serviceId'] = serviceId
93 | tmp['cmdb_id'] = row['cmdb_id']
94 | tmp['serviceType'] = serviceType
95 | tmp['serviceName'] = row['serviceName']
96 | tmp['startTime'] = row['startTime']
97 | if traceId not in result:
98 | result[traceId] = []
99 | result[traceId].append(tmp)
100 | else:
101 | result[traceId].append(tmp)
102 |
103 | with open(trace_fly_remote_path, 'r') as f:
104 | reader = csv.DictReader(f)
105 | for row in reader:
106 | tmp = {}
107 | traceId = row['traceId']
108 | id = row['id']
109 | pid = row['pid']
110 | serviceId = row['cmdb_id'] + ':' + row['serviceName']
111 | serviceType = row['callType']
112 | tmp['id'] = id
113 | tmp['pid'] = pid
114 | tmp['serviceId'] = serviceId
115 | tmp['cmdb_id'] = row['cmdb_id']
116 | tmp['serviceType'] = serviceType
117 | tmp['serviceName'] = row['serviceName']
118 | tmp['startTime'] = row['startTime']
119 | if traceId not in result:
120 | result[traceId] = []
121 | result[traceId].append(tmp)
122 | else:
123 | result[traceId].append(tmp)
124 |
125 | with open(trace_remote_process_path, 'r') as f:
126 | reader = csv.DictReader(f)
127 | for row in reader:
128 | tmp = {}
129 | traceId = row['traceId']
130 | id = row['id']
131 | pid = row['pid']
132 | serviceId = row['cmdb_id'] + ':' + row['serviceName']
133 | serviceType = row['callType']
134 | tmp['id'] = id
135 | tmp['pid'] = pid
136 | tmp['serviceId'] = serviceId
137 | tmp['cmdb_id'] = row['cmdb_id']
138 | tmp['serviceType'] = serviceType
139 | tmp['serviceName'] = row['serviceName']
140 | tmp['startTime'] = row['startTime']
141 | if traceId not in result:
142 | result[traceId] = []
143 | result[traceId].append(tmp)
144 | else:
145 | result[traceId].append(tmp)
146 | for k,value in result.items():
147 | tmp_pid_dict = {}
148 | for i in value:
149 | if i['pid'] != 'None':
150 | tmp_pid_dict[i['pid']] = i['cmdb_id']
151 | del_values = list()
152 | for i in value:
153 | if i['serviceType'] == "CSF":
154 | if i['id'] in tmp_pid_dict:
155 | cmdb_id = tmp_pid_dict[i['id']]
156 | serviceId = cmdb_id +":"+i['serviceName']
157 | i['serviceId'] = serviceId
158 | i['cmdb_id'] = cmdb_id
159 | else:
160 | del_values.append(i)
161 | for del_i in del_values:
162 | value.remove(del_i)
163 |
164 | save_path = '../data/aiops_data_2020/2020_04_11/origina_traces.json'
165 | with open(save_path, 'w') as f:
166 | json.dump(result, f, indent=2, sort_keys=True, ensure_ascii=False)
167 |
168 | def get_target_deployment_data(original_deployment_data):
169 | """
170 |     Convert the raw deployment data into the target ingestion format.
171 |     Args:
172 |         original_deployment_data: raw deployment data
173 |
174 |     Returns: dict keyed by serviceInstanceId, each value of the form {serviceInstanceId: "", serviceName: "", hostId: "", hostName: "", containerId: "", containerName: ""}
175 | """
176 | # original_deployment_data = [{"serviceInstanceId":"os_021:osb_001","serviceName":"osb_001", "containerId":None, "containerName":None,"hostId":"os_021","hostName":"os_021"},{"serviceInstanceId":"os_022:osb_002","serviceName":"osb_002", "containerId":None, "containerName":None,"hostId":"os_022", "hostName":"os_022"},{"serviceInstanceId":"docker_001:csf_001","serviceName":"csf_001", "containerId":"docker_001", "containerName":"docker_001","hostId":"os_017", "hostName":"os_017"},{"serviceInstanceId":"docker_002:csf_001","serviceName":"csf_001", "containerId":"docker_002", "containerName":"docker_002","hostId":"os_018", "hostName":"os_018"},{"serviceInstanceId":"docker_003:csf_001","serviceName":"csf_001", "containerId":"docker_003", "containerName":"docker_003","hostId":"os_019", "hostName":"os_019"},{"serviceInstanceId":"docker_004:csf_001","serviceName":"csf_001", "containerId":"docker_004", "containerName":"docker_004","hostId":"os_020", "hostName":"os_020"},{"serviceInstanceId":"docker_005:csf_002","serviceName":"csf_002","containerId":"docker_005","containerName":"docker_005","hostId":"os_017","hostName":"os_017"},{"serviceInstanceId":"docker_006:csf_002","serviceName":"csf_002", "containerId":"docker_006", "containerName":"docker_006","hostId":"os_018","hostName":"os_018"},{"serviceInstanceId":"docker_007:csf_002","serviceName":"csf_002","containerId":"docker_007","containerName":"docker_007","hostId":"os_019","hostName":"os_019"},{"serviceInstanceId":"docker_008:csf_002","serviceName":"csf_002","containerId":"docker_008","containerName":"docker_008","hostId":"os_020","hostName":"os_020"},{"serviceInstanceId":"docker_005:csf_003","serviceName":"csf_003","containerId":"docker_005","containerName":"docker_005","hostId":"os_017","hostName":"os_017"},{"serviceInstanceId":"docker_006:csf_003","serviceName":"csf_003","containerId":"docker_006","containerName":"docker_006","hostId":"os_018","hostName":"os_018"},{"serviceInstanceId":"docker_007:csf_003","serviceName":"csf_003","containerId":"docker_007","containerName":"docker_007","hostId":"os_019","hostName":"os_019"},{"serviceInstanceId":"docker_008:csf_003","serviceName":"csf_003","containerId":"docker_008","containerName":"docker_008","hostId":"os_020","hostName":"os_020"},{"serviceInstanceId":"docker_005:csf_004","serviceName":"csf_004","containerId":"docker_005","containerName":"docker_005","hostId":"os_017","hostName":"os_017"},{"serviceInstanceId":"docker_006:csf_004","serviceName":"csf_004","containerId":"docker_006","containerName":"docker_006","hostId":"os_018","hostName":"os_018"},{"serviceInstanceId":"docker_007:csf_004","serviceName":"csf_004","containerId":"docker_007","containerName":"docker_007","hostId":"os_019","hostName":"os_019"},{"serviceInstanceId":"docker_008:csf_004","serviceName":"csf_004","containerId":"docker_008","containerName":"docker_008","hostId":"os_020","hostName":"os_020"},{"serviceInstanceId":"docker_005:csf_005","serviceName":"csf_005","containerId":"docker_005","containerName":"docker_005","hostId":"os_017","hostName":"os_017"},{"serviceInstanceId":"docker_006:csf_005","serviceName":"csf_005","containerId":"docker_006","containerName":"docker_006","hostId":"os_018","hostName":"os_018"},{"serviceInstanceId":"docker_007:csf_005","serviceName":"csf_005", "containerId":"docker_007", 
"containerName":"docker_007","hostId":"os_019","hostName":"os_019"},{"serviceInstanceId":"docker_008:csf_005","serviceName":"csf_005","containerId":"docker_008","containerName":"docker_008","hostId":"os_020","hostName":"os_020"},{"serviceInstanceId":"db_001","serviceName":"db_001", "containerId":None, "containerName":None,"hostId":None,"hostName":None},{"serviceInstanceId":"db_002","serviceName":"db_001", "containerId":None, "containerName":None,"hostId":None,"hostName":None},{"serviceInstanceId":"db_003","serviceName":"db_003","containerId":None,"containerName":None,"hostId":None,"hostName":None},{"serviceInstanceId":"db_004","serviceName":"db_004","containerId":None, "containerName":None,"hostId":None,"hostName":None},{"serviceInstanceId":"db_005","serviceName":"db_005","containerId":None,"containerName":None,"hostId":None,"hostName":None},{"serviceInstanceId":"db_006","serviceName":"db_006","containerId":None,"containerName":None,"hostId":None,"hostName":None},{"serviceInstanceId":"db_007","serviceName":"db_007","containerId":None,"containerName":None,"hostId":None,"hostName":None},{"serviceInstanceId":"db_008","serviceName":"db_008","containerId":None,"containerName":None,"hostId":None,"hostName":None},{"serviceInstanceId":"db_009","serviceName":"db_009","containerId":None,"containerName":None,"hostId":None,"hostName":None},{"serviceInstanceId":"db_010","serviceName":"db_010","containerId":None, "containerName":None,"hostId":None,"hostName":None},{"serviceInstanceId":"db_011","serviceName":"db_011","containerId":None,"containerName":None,"hostId":None,"hostName":None},{"serviceInstanceId":"db_012","serviceName":"db_012","containerId":None,"containerName":None,"hostId":None,"hostName":None},{"serviceInstanceId":"db_013","serviceName":"db_013","containerId":None,"containerName":None,"hostId":None,"hostName":None},{"serviceInstanceId":":docker_001:fly_remote_001","serviceName":"fly_remote_001","containerId":"docker_001","containerName":"docker_001","hostId":None,"hostName":None},{"serviceInstanceId":"docker_002:fly_remote_001","serviceName":"fly_remote_001","containerId":"docker_002","containerName":"docker_002","hostId":None,"hostName":None},{"serviceInstanceId":"docker_003:fly_remote_001","serviceName":"fly_remote_001","containerId":"docker_003","containerName":"docker_003","hostId":None,"hostName":None},{"serviceInstanceId":"docker_004:fly_remote_001","serviceName":"fly_remote_001","containerId":"docker_004","containerName":"docker_004","hostId":None,"hostName":None}]
177 | # target_deployment_data = {}
178 | # for i in original_deployment_data:
179 | # target_deployment_data[i['serviceInstanceId']]= i
180 | # return target_deployment_data
181 |
182 |
183 | # f = open('../../../data/aiops_data_2020/2020_05_22/original_deployment_data.json', 'r')
184 | # # deployment_data = json.load(f)
185 | # get_original_trace_data()
186 | def get_original_items_data():
187 | """
188 |     Split the platform metric CSV files into one time series per curve (itemid:name:bomc_id).
189 |     :param : none
190 |     :return: None; writes the result to origina_items.json
191 | """
192 | result = {}
193 | db_oracle_11g_path = '../data/aiops_data_2020/2020_04_11/平台指标/db_oracle_11g.csv'
194 | dcos_docker_path = '../data/aiops_data_2020/2020_04_11/平台指标/dcos_docker.csv'
195 | os_linux_path = '../data/aiops_data_2020/2020_04_11/平台指标/os_linux.csv'
196 | with open(db_oracle_11g_path, 'r') as f:
197 | reader = csv.DictReader(f)
198 | for row in reader:
199 | curveId = row['itemid']+":"+row['name']+":"+row['bomc_id']
200 | if curveId not in result:
201 | result[curveId] = {}
202 | result[curveId]['metricId'] = row['itemid']
203 | result[curveId]['metricName'] = row['name']
204 | result[curveId]['metricBelongTo'] = row['cmdb_id']
205 | result[curveId]['metricBelongLevel'] = "service"
206 | result[curveId]['values'] = []
207 | result[curveId]['timeStamps'] = []
208 | result[curveId]['values'].append(row['value'])
209 | result[curveId]['timeStamps'].append(row['timestamp'])
210 | else:
211 | result[curveId]['values'].append(row['value'])
212 | result[curveId]['timeStamps'].append(row['timestamp'])
213 | with open(dcos_docker_path, 'r') as f:
214 | reader = csv.DictReader(f)
215 | for row in reader:
216 | curveId = row['itemid']+":"+row['name']+":"+row['bomc_id']
217 | if curveId not in result:
218 | result[curveId] = {}
219 | result[curveId]['metricId'] = row['itemid']
220 | result[curveId]['metricName'] = row['name']
221 | result[curveId]['metricBelongTo'] = row['cmdb_id']
222 | result[curveId]['metricBelongLevel'] = "docker"
223 | result[curveId]['values'] = []
224 | result[curveId]['timeStamps'] = []
225 | result[curveId]['values'].append(row['value'])
226 | result[curveId]['timeStamps'].append(row['timestamp'])
227 | else:
228 | result[curveId]['values'].append(row['value'])
229 | result[curveId]['timeStamps'].append(row['timestamp'])
230 |
231 | with open(os_linux_path, 'r') as f:
232 | reader = csv.DictReader(f)
233 | for row in reader:
234 | curveId = row['itemid']+":"+row['name']+":"+row['bomc_id']
235 | if curveId not in result:
236 | result[curveId] = {}
237 | result[curveId]['metricId'] = row['itemid']
238 | result[curveId]['metricName'] = row['name']
239 | result[curveId]['metricBelongTo'] = row['cmdb_id']
240 | result[curveId]['metricBelongLevel'] = "host"
241 | result[curveId]['values'] = []
242 | result[curveId]['timeStamps'] = []
243 | result[curveId]['values'].append(row['value'])
244 | result[curveId]['timeStamps'].append(row['timestamp'])
245 | else:
246 | result[curveId]['values'].append(row['value'])
247 | result[curveId]['timeStamps'].append(row['timestamp'])
248 | save_path = '../data/aiops_data_2020/2020_04_11/origina_items.json'
249 | with open(save_path, 'w') as f:
250 | json.dump(result, f, indent=2, sort_keys=True, ensure_ascii=False)
251 | pass
252 |
253 | if __name__ == '__main__':
254 | # get_original_items_data()
255 | get_original_trace_data()
--------------------------------------------------------------------------------
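The trickiest step in get_original_trace_data is back-filling cmdb_id and serviceId for CSF spans: a CSF row does not carry a usable location of its own, so the child span whose pid equals the CSF span's id supplies its cmdb_id, and CSF spans with no such child are dropped. A toy sketch of that step (span ids, service names, and cmdb ids are made up):

spans = [
    {"id": "s1", "pid": "None", "serviceType": "OSB", "serviceName": "osb_001",
     "cmdb_id": "os_021", "serviceId": "os_021:osb_001"},
    {"id": "s2", "pid": "s1", "serviceType": "CSF", "serviceName": "csf_001",
     "cmdb_id": "os_021", "serviceId": None},
    {"id": "s3", "pid": "s2", "serviceType": "RemoteProcess", "serviceName": "csf_001",
     "cmdb_id": "docker_001", "serviceId": "docker_001:csf_001"},
]

# child spans reveal where their parent actually ran: pid -> child's cmdb_id
pid_to_cmdb = {s["pid"]: s["cmdb_id"] for s in spans if s["pid"] != "None"}

kept = []
for s in spans:
    if s["serviceType"] == "CSF":
        if s["id"] in pid_to_cmdb:
            s["cmdb_id"] = pid_to_cmdb[s["id"]]                    # taken from the child span
            s["serviceId"] = s["cmdb_id"] + ":" + s["serviceName"]
        else:
            continue                                               # no child span -> drop it, as the loader does
    kept.append(s)

print([(s["id"], s["serviceId"]) for s in kept])
# [('s1', 'os_021:osb_001'), ('s2', 'docker_001:csf_001'), ('s3', 'docker_001:csf_001')]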