├── CVE_annotated_dataset.xlsx ├── LICENSE ├── LR_ATT&CK_model.pkl ├── LR_ATT&CK_model_V2.pkl ├── README.md ├── SMET.py ├── SMET_use_example.py ├── __pycache__ ├── SMET.cpython-38.pyc ├── SMET.cpython-39.pyc ├── funs.cpython-39.pyc ├── nlp_general.cpython-39.pyc ├── parse_class.cpython-38.pyc └── parse_class.cpython-39.pyc ├── funs.py ├── id2ATT&CK.json ├── id2ATT&CK_V2.json ├── id2mitre.json ├── nlp_general.py ├── parse_class.py ├── requirements-frozen.txt ├── requirements.txt └── tram_data_predictions.json /CVE_annotated_dataset.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basel-a/SMET/21a223f710eb6accdb4dcbf435e8b3af9a739261/CVE_annotated_dataset.xlsx -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 basel-a 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /LR_ATT&CK_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basel-a/SMET/21a223f710eb6accdb4dcbf435e8b3af9a739261/LR_ATT&CK_model.pkl -------------------------------------------------------------------------------- /LR_ATT&CK_model_V2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basel-a/SMET/21a223f710eb6accdb4dcbf435e8b3af9a739261/LR_ATT&CK_model_V2.pkl -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SMET (mapping CTI reports and CVE to ATT&CK) 2 | SMET : Semantic Mapping of CVE to ATT&CK and its Application to Cybersecurity
3 | https://link.springer.com/chapter/10.1007/978-3-031-37586-6_15 4 | 5 | SMET is tool that maps text (such as CTI reports or CVE) to ATT&CK techniques
6 | Check SMET_use_example.py to learn how to integrate SMET in your code 7 | 8 | ATT&CK BERT for semantically meaningful cybersecurity text embedding: https://huggingface.co/basel/ATTACK-BERT 9 | 10 | ## Installation 11 | Developed using Python 3.8.18, other versions should work too.
12 | ```bash 13 | pip install -r requirements.txt 14 | python -m spacy download en_core_web_lg 15 | ``` -------------------------------------------------------------------------------- /SMET.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Apr 14 16:24:28 2023 4 | 5 | @author: basel 6 | """ 7 | 8 | 9 | 10 | from sentence_transformers import SentenceTransformer 11 | import pickle 12 | from scipy.special import softmax 13 | from parse_class import Parser 14 | from nlp_general import NLP 15 | # import pandas as pd 16 | import funs 17 | nlp = NLP() 18 | nlp.load_model('dep') 19 | nlp.load_model('sentencizer') 20 | 21 | LR_model = pickle.load(open("LR_ATT&CK_model_V2.pkl", 'rb')) 22 | emb_model = SentenceTransformer("basel/ATTACK-BERT") 23 | id2mitre = funs.read_json_as_dict('id2mitre.json') 24 | id2label = funs.read_json_as_dict('id2ATT&CK_V2.json') 25 | id2label = {int(i):id2label[i] for i in id2label} 26 | 27 | def get_verbs_tag(srl): 28 | verbs_tag= {} 29 | 30 | for v in srl['verbs']: 31 | 32 | try: 33 | v_ind = v['tags'].index("B-V") 34 | except: 35 | 36 | continue 37 | for other_v in srl['verbs']: 38 | if other_v['tags'][v_ind] not in ('B-V','O'): 39 | 40 | try: 41 | other_v_ind = other_v['tags'].index("B-V") 42 | except: 43 | 44 | continue 45 | 46 | if v['id'] not in verbs_tag: 47 | verbs_tag[v['id']] = [(other_v["verb"] , "-".join(other_v['tags'][v_ind].split("-")[1:]),abs(v_ind - other_v_ind),other_v_ind)] 48 | else: 49 | verbs_tag[v['id']] += [(other_v["verb"] , "-".join(other_v['tags'][v_ind].split("-")[1:]),abs(v_ind - other_v_ind),other_v_ind)] 50 | 51 | for i in verbs_tag: 52 | verbs_tag[i] = sorted(verbs_tag[i],key = lambda x:x[2]) 53 | return verbs_tag 54 | 55 | def add_arg0_from_parent(srl,srl_dict): 56 | 57 | verbs_tag = get_verbs_tag(srl) 58 | for v in srl_dict: 59 | if 'V' in srl_dict[v] and srl_dict[v]['V']['text'] in verbs_tag and 'ARG0' not in srl_dict[v] and v in verbs_tag: 60 | parent_verb = verbs_tag[v][0][0] 61 | if parent_verb in srl_dict and 'ARG0' in srl_dict[parent_verb]: 62 | srl_dict[v]['ARG0'] = srl_dict[parent_verb]['ARG0'].copy() 63 | 64 | 65 | def get_AVs(text,CVE = False): 66 | 67 | sents = nlp.seperate_sentences(text) 68 | cve_srl = {} 69 | for sent in sents: 70 | try: 71 | srl = Parser.extract_srl(sent) 72 | Parser.add_v_id_srl(srl) 73 | srl_dict = Parser.srl_to_dict(srl) 74 | add_arg0_from_parent(srl,srl_dict) 75 | cve_srl[sent] = (srl_dict) 76 | except: 77 | print('error') 78 | 79 | if CVE: 80 | arg_constrain = {'ARG0' : lambda x : 'attacker' in x.lower() or 'adversary' in x.lower() or 'user' in x.lower() or 'vulnerability' in x } 81 | vo0 = nlp.extract_VO_from_sents_lambda(cve_srl,arg_constrain) 82 | 83 | arg_constrain = {'ARG1' : lambda x : 'attacker' in x.lower() or 'adversary' in x.lower() or 'user' in x or 'vulnerability' in x.lower() } 84 | vo1 = nlp.extract_VO_from_sents_lambda(cve_srl,arg_constrain) 85 | 86 | arg_constrain = {'V' : lambda x : 'allow' in x.lower() or 'lead' in x.lower() or 'result' in x.lower()} 87 | vo2 = nlp.extract_VO_from_sents_lambda(cve_srl,arg_constrain) #or 'caus' in x.lower() 88 | 89 | cve_vos_filtered = { key:vo0.get(key,[])+vo1.get(key,[])+vo2.get(key,[]) for key in set(list(vo0.keys())+list(vo1.keys())+list(vo2.keys())) } 90 | cve_vos = set([i[0] for j in cve_vos_filtered.values() for i in j ]) 91 | cve_vos.add(text) 92 | 93 | 94 | else: 95 | arg_constrain = {} 96 | vo = nlp.extract_VO_from_sents_lambda(cve_srl,arg_constrain) 97 | cve_vos = set([i[0] for j in vo.values() for i in j ]) 98 | return cve_vos 99 | 100 | 101 | def predict_techniques(emb,clf,id2label): 102 | dec = clf.decision_function([emb]) 103 | out = softmax(dec)[0] 104 | return sorted(list( zip([id2mitre[id2label[i]] for i in range(len(dec[0])) ], out)), key = lambda x:x[1], reverse=True) 105 | 106 | 107 | def predict_per_vo(cve_vos, rank,id2mitre): 108 | 109 | out = [] 110 | for vo in cve_vos: 111 | if vo.strip() == '': 112 | continue 113 | vo = vo 114 | dec = rank(vo) 115 | out.append([(j[0],j[1]) for j in dec]) 116 | 117 | 118 | outa = [j for k in out for j in k] 119 | outa = sorted(outa,key = lambda x:x[1],reverse = True ) 120 | abc = [] 121 | cdf = [] 122 | for k in outa: 123 | if k[0] not in cdf: 124 | abc.append(k) 125 | cdf.append(k[0]) 126 | outa = [i for i in abc if i[0]] 127 | 128 | return outa 129 | 130 | #map text to ATT&CK 131 | def map_text(text,CVE = False): 132 | rank = lambda x:predict_techniques(emb_model.encode(x),LR_model,id2label) 133 | vos = get_AVs(text,CVE = CVE) 134 | return predict_per_vo(vos,rank,id2mitre) 135 | 136 | #map attack vector to ATT&CK 137 | def map_attack_vector(AV): 138 | return predict_techniques(emb_model.encode(AV),LR_model,id2label) 139 | 140 | 141 | -------------------------------------------------------------------------------- /SMET_use_example.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sat Apr 15 04:03:07 2023 4 | 5 | @author: basel 6 | """ 7 | 8 | from SMET import map_text,map_attack_vector 9 | 10 | #Mapping tips: 11 | #When the input is short (e.g., one sentence or attack action) use map_attack_vector() 12 | #For inputs that consist of a few lines, such as a CVE entry of a paragraph from a CTI report use map_text() 13 | #In cases where the input is long, like a full CTI report, segmented the text into multiple paragraphs or sentences and processed each separately 14 | 15 | 16 | #map attack vectors to ATT&CK 17 | AV1 = 'take screenshot' 18 | mapping1 = map_attack_vector(AV1) 19 | 20 | AV2 = 'delete logs' 21 | mapping2 = map_attack_vector(AV2) 22 | 23 | AV3 = 'exfiltrate data to C2 server' 24 | mapping3 = map_attack_vector(AV3) 25 | 26 | 27 | 28 | #map CVE to ATT&CK 29 | cve = "" 30 | mapping = map_text(cve,CVE = True) 31 | 32 | 33 | #map any text to ATT&CK 34 | cve = "" 35 | mapping = map_text(cve,CVE = False) 36 | 37 | 38 | #get embedding using ATT&CK 39 | from sentence_transformers import SentenceTransformer 40 | 41 | text = "" 42 | 43 | emb_model = SentenceTransformer("basel/ATTACK-BERT") 44 | embedding = emb_model.encode(text) 45 | 46 | 47 | ###### 48 | from sentence_transformers import SentenceTransformer 49 | 50 | model = SentenceTransformer('basel/ATTACK-BERT') 51 | 52 | sentences = ["the account has weak password", "attacker gain an initial access to the machine"] 53 | 54 | embeddings = model.encode(sentences) 55 | 56 | from sklearn.metrics.pairwise import cosine_similarity 57 | print(cosine_similarity([embeddings[0]], [embeddings[1]])) 58 | 59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /__pycache__/SMET.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basel-a/SMET/21a223f710eb6accdb4dcbf435e8b3af9a739261/__pycache__/SMET.cpython-38.pyc -------------------------------------------------------------------------------- /__pycache__/SMET.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basel-a/SMET/21a223f710eb6accdb4dcbf435e8b3af9a739261/__pycache__/SMET.cpython-39.pyc -------------------------------------------------------------------------------- /__pycache__/funs.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basel-a/SMET/21a223f710eb6accdb4dcbf435e8b3af9a739261/__pycache__/funs.cpython-39.pyc -------------------------------------------------------------------------------- /__pycache__/nlp_general.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basel-a/SMET/21a223f710eb6accdb4dcbf435e8b3af9a739261/__pycache__/nlp_general.cpython-39.pyc -------------------------------------------------------------------------------- /__pycache__/parse_class.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basel-a/SMET/21a223f710eb6accdb4dcbf435e8b3af9a739261/__pycache__/parse_class.cpython-38.pyc -------------------------------------------------------------------------------- /__pycache__/parse_class.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/basel-a/SMET/21a223f710eb6accdb4dcbf435e8b3af9a739261/__pycache__/parse_class.cpython-39.pyc -------------------------------------------------------------------------------- /funs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Oct 29 12:59:22 2019 4 | 5 | @author: babdeen 6 | """ 7 | import json 8 | from os import listdir 9 | from os.path import isfile, join,isdir 10 | #import gensim 11 | #model = gensim.models.KeyedVectors.load_word2vec_format('C:/basel/GoogleNews-vectors-negative300.bin', binary=True) 12 | 13 | def save_list_to_text(l, dist): 14 | if dist[-4:] != '.txt': 15 | return 'error' 16 | file = open(dist,'w', encoding='utf-8', errors='ignore') 17 | for line in l: 18 | line = line.replace('\n', ' ') 19 | file.write(line) 20 | file.write('\n') 21 | file.close() 22 | return 'done' 23 | 24 | def save_list_to_text_w_sep(l, dist,sep): 25 | if dist[-4:] != '.txt': 26 | return 'error' 27 | file = open(dist,'w', encoding='utf-8', errors='ignore') 28 | for line in l: 29 | line = line.replace('\n', ' ') 30 | file.write(line) 31 | file.write('\n') 32 | file.write(sep) 33 | file.write('\n') 34 | file.close() 35 | 36 | def save_list_to_text_2(l, dist): 37 | if dist[-4:] != '.txt': 38 | return 'error' 39 | file = open(dist,'w',encoding='utf-8', errors='ignore') 40 | for line in l: 41 | file.write(str(line)) 42 | file.write('\n') 43 | file.close() 44 | return 'done' 45 | 46 | def save_list_to_text_w_sep_2(l, dist,sep): 47 | if dist[-4:] != '.txt': 48 | return 'error' 49 | file = open(dist,'w') 50 | for line in l: 51 | file.write(str(line)) 52 | file.write('\n') 53 | file.write(sep) 54 | file.write('\n') 55 | file.close() 56 | return 'done' 57 | 58 | def read_list_from_text(dist, maxi = -1): 59 | file = open(dist,'r', encoding='utf-8', errors='ignore') 60 | l = [] 61 | line = file.readline() 62 | if maxi == -1: 63 | while line != '': 64 | l.append(line) 65 | line = file.readline() 66 | 67 | else: 68 | c = 0 69 | while line != '': 70 | l.append(line) 71 | c += 1 72 | if c > maxi: 73 | break 74 | line = file.readline() 75 | file.close() 76 | return l 77 | 78 | 79 | #def most_sim(word , top = 6): 80 | # return model.most_similar(word, topn=top) 81 | 82 | def save_dict_as_json(dist, d,note = ''): 83 | with open(dist, 'w') as fp: 84 | json.dump(d, fp) 85 | if note != '': 86 | with open(dist[:-5] +'.txt', 'w') as fp: 87 | fp.write(note) 88 | return 'done' 89 | 90 | def save_list_as_json(dist, l): 91 | d = {i:j for i,j in enumerate(l)} 92 | with open(dist, 'w') as fp: 93 | json.dump(d, fp) 94 | return 'done' 95 | 96 | def read_json_as_dict(src): 97 | with open(src) as json_file: 98 | data = json.load(json_file) 99 | return data 100 | 101 | def read_json_as_dict_utf(src): 102 | with open(src,encoding = 'utf-8') as json_file: 103 | data = json.load(json_file) 104 | return data 105 | 106 | 107 | def read_words_from_text(dist, delimeter = ','): 108 | file = open(dist,'r') 109 | l = [] 110 | content = file.read() 111 | words = content.split(delimeter) 112 | file.close() 113 | return words 114 | def get_files_in_folder(path): 115 | return [f for f in listdir(path) if isfile(join(path, f))] 116 | 117 | 118 | def get_folders_in_folder(path): 119 | return [f for f in listdir(path) if isdir(join(path, f))] 120 | 121 | def get_all_files(path): 122 | folders = [path] 123 | out = [] 124 | while folders != []: 125 | folder = folders[0] 126 | folders = folders[1:] 127 | 128 | folders.extend([folder + '/' + i for i in get_folders_in_folder(folder)]) 129 | out.extend([folder + '/' + i for i in get_files_in_folder(folder)]) 130 | return out 131 | -------------------------------------------------------------------------------- /id2ATT&CK.json: -------------------------------------------------------------------------------- 1 | {"0": "T1137", "1": "T1176", "2": "T1110", "3": "T1134", "4": "T1190", "5": "T1495", "6": "T1104", "7": "T1568", "8": "T1098", "9": "T1205", "10": "T1614", "11": "T1030", "12": "T1135", "13": "T1608", "14": "T1595", "15": "T1531", "16": "T1596", "17": "T1220", "18": "T1014", "19": "T1561", "20": "T1485", "21": "T1090", "22": "T1490", "23": "T1219", "24": "T1007", "25": "T1580", "26": "T1033", "27": "T1082", "28": "T1102", "29": "T1087", "30": "T1010", "31": "T1550", "32": "T1136", "33": "T1484", "34": "T1497", "35": "T1570", "36": "T1601", "37": "T1207", "38": "T1556", "39": "T1200", "40": "T1602", "41": "T1591", "42": "T1204", "43": "T1482", "44": "T1083", "45": "T1599", "46": "T1571", "47": "T1597", "48": "T1071", "49": "T1069", "50": "T1025", "51": "T1020", "52": "T1072", "53": "T1535", "54": "T1538", "55": "T1140", "56": "T1127", "57": "T1199", "58": "T1587", "59": "T1037", "60": "T1080", "61": "T1189", "62": "T1593", "63": "T1197", "64": "T1496", "65": "T1021", "66": "T1553", "67": "T1027", "68": "T1113", "69": "T1125", "70": "T1119", "71": "T1040", "72": "T1528", "73": "T1530", "74": "T1534", "75": "T1592", "76": "T1562", "77": "T1543", "78": "T1006", "79": "T1529", "80": "T1598", "81": "T1124", "82": "T1053", "83": "T1567", "84": "T1569", "85": "T1056", "86": "T1547", "87": "T1584", "88": "T1078", "89": "T1590", "90": "T1202", "91": "T1055", "92": "T1047", "93": "T1609", "94": "T1560", "95": "T1554", "96": "T1185", "97": "T1036", "98": "T1526", "99": "T1039", "100": "T1041", "101": "T1498", "102": "T1212", "103": "T1572", "104": "T1537", "105": "T1612", "106": "T1048", "107": "T1221", "108": "T1201", "109": "T1217", "110": "T1132", "111": "T1074", "112": "T1005", "113": "T1564", "114": "T1187", "115": "T1016", "116": "T1211", "117": "T1613", "118": "T1049", "119": "T1052", "120": "T1008", "121": "T1611", "122": "T1586", "123": "T1499", "124": "T1213", "125": "T1525", "126": "T1557", "127": "T1057", "128": "T1542", "129": "T1216", "130": "T1222", "131": "T1114", "132": "T1546", "133": "T1559", "134": "T1120", "135": "T1018", "136": "T1011", "137": "T1486", "138": "T1563", "139": "T1552", "140": "T1518", "141": "T1129", "142": "T1115", "143": "T1001", "144": "T1068", "145": "T1046", "146": "T1583", "147": "T1105", "148": "T1091", "149": "T1012", "150": "T1585", "151": "T1111", "152": "T1610", "153": "T1588", "154": "T1548", "155": "T1573", "156": "T1003", "157": "T1489", "158": "T1574", "159": "T1589", "160": "T1133", "161": "T1555", "162": "T1029", "163": "T1505", "164": "T1218", "165": "T1070", "166": "T1578", "167": "T1210", "168": "T1558", "169": "T1106", "170": "T1566", "171": "T1491", "172": "T1195", "173": "T1095", "174": "T1092", "175": "T1112", "176": "T1539", "177": "T1600", "178": "T1565", "179": "T1594", "180": "T1059", "181": "T1123", "182": "T1203", "183": "T1480", "184": "T1606"} -------------------------------------------------------------------------------- /id2ATT&CK_V2.json: -------------------------------------------------------------------------------- 1 | {"0": "T1547", "1": "T1201", "2": "T1003", "3": "T1578", "4": "T1499", "5": "T1571", "6": "T1129", "7": "T1197", "8": "T1007", "9": "T1584", "10": "T1498", "11": "T1005", "12": "T1102", "13": "T1531", "14": "T1132", "15": "T1030", "16": "T1110", "17": "T1491", "18": "T1069", "19": "T1566", "20": "T1221", "21": "T1599", "22": "T1039", "23": "T1530", "24": "T1137", "25": "T1600", "26": "T1083", "27": "T1550", "28": "T1591", "29": "T1202", "30": "T1553", "31": "T1018", "32": "T1543", "33": "T1115", "34": "T1204", "35": "T1111", "36": "T1125", "37": "T1027", "38": "T1528", "39": "T1537", "40": "T1580", "41": "T1014", "42": "T1025", "43": "T1614", "44": "T1001", "45": "T1091", "46": "T1176", "47": "T1057", "48": "T1199", "49": "T1497", "50": "T1564", "51": "T1120", "52": "T1219", "53": "T1053", "54": "T1588", "55": "T1213", "56": "T1505", "57": "T1546", "58": "T1119", "59": "T1068", "60": "T1496", "61": "T1485", "62": "T1047", "63": "T1611", "64": "T1040", "65": "T1074", "66": "T1114", "67": "T1106", "68": "T1597", "69": "T1220", "70": "T1557", "71": "T1082", "72": "T1195", "73": "T1480", "74": "T1572", "75": "T1592", "76": "T1124", "77": "T1594", "78": "T1140", "79": "T1036", "80": "T1601", "81": "T1078", "82": "T1112", "83": "T1049", "84": "T1548", "85": "T1200", "86": "T1029", "87": "T1589", "88": "T1020", "89": "T1587", "90": "T1048", "91": "T1583", "92": "T1136", "93": "T1037", "94": "T1012", "95": "T1568", "96": "T1559", "97": "T1010", "98": "T1210", "99": "T1016", "100": "T1185", "101": "T1518", "102": "T1072", "103": "T1056", "104": "T1490", "105": "T1133", "106": "T1529", "107": "T1562", "108": "T1567", "109": "T1203", "110": "T1105", "111": "T1558", "112": "T1055", "113": "T1070", "114": "T1052", "115": "T1538", "116": "T1489", "117": "T1552", "118": "T1484", "119": "T1218", "120": "T1596", "121": "T1021", "122": "T1585", "123": "T1606", "124": "T1095", "125": "T1613", "126": "T1565", "127": "T1526", "128": "T1563", "129": "T1560", "130": "T1059", "131": "T1482", "132": "T1190", "133": "T1080", "134": "T1608", "135": "T1011", "136": "T1008", "137": "T1205", "138": "T1087", "139": "T1602", "140": "T1006", "141": "T1135", "142": "T1554", "143": "T1090", "144": "T1212", "145": "T1046", "146": "T1187", "147": "T1098", "148": "T1598", "149": "T1211", "150": "T1071", "151": "T1041", "152": "T1123", "153": "T1586", "154": "T1104", "155": "T1495", "156": "T1033", "157": "T1222", "158": "T1573", "159": "T1570", "160": "T1207", "161": "T1216", "162": "T1189", "163": "T1609", "164": "T1574", "165": "T1134", "166": "T1590", "167": "T1555", "168": "T1113", "169": "T1486", "170": "T1534", "171": "T1539", "172": "T1612", "173": "T1569", "174": "T1561", "175": "T1556", "176": "T1593", "177": "T1542", "178": "T1595", "179": "T1535", "180": "T1092", "181": "T1525", "182": "T1127", "183": "T1610", "184": "T1217"} -------------------------------------------------------------------------------- /id2mitre.json: -------------------------------------------------------------------------------- 1 | {"T1055.011": "Process Injection: Extra Window Memory Injection", "T1053.005": "Scheduled Task/Job: Scheduled Task", "T1205.002": "Traffic Signaling: Socket Filters", "T1560.001": "Archive Collected Data: Archive via Utility", "T1021.005": "Remote Services: VNC", "T1047": "Windows Management Instrumentation", "T1113": "Screen Capture", "T1037": "Boot or Logon Initialization Scripts", "T1557": "Adversary-in-the-Middle", "T1033": "System Owner/User Discovery", "T1583": "Acquire Infrastructure", "T1218.011": "System Binary Proxy Execution: Rundll32", "T1613": "Container and Resource Discovery", "T1583.007": "Acquire Infrastructure: Serverless", "T1132.001": "Data Encoding: Standard Encoding", "T1027.009": "Obfuscated Files or Information: Embedded Payloads", "T1556.003": "Modify Authentication Process: Pluggable Authentication Modules", "T1578.004": "Modify Cloud Compute Infrastructure: Revert Cloud Instance", "T1592": "Gather Victim Host Information", "T1596.003": "Search Open Technical Databases: Digital Certificates", "T1056.001": "Input Capture: Keylogging", "T1222.002": "File and Directory Permissions Modification: Linux and Mac File and Directory Permissions Modification", "T1110.001": "Brute Force: Password Guessing", "T1216.001": "System Script Proxy Execution: PubPrn", "T1597.002": "Search Closed Sources: Purchase Technical Data", "T1003": "OS Credential Dumping", "T1129": "Shared Modules", "T1602": "Data from Configuration Repository", "T1561.002": "Disk Wipe: Disk Structure Wipe", "T1498.001": "Network Denial of Service: Direct Network Flood", "T1574.007": "Hijack Execution Flow: Path Interception by PATH Environment Variable", "T1213.002": "Data from Information Repositories: Sharepoint", "T1006": "Direct Volume Access", "T1564.008": "Hide Artifacts: Email Hiding Rules", "T1491.002": "Defacement: External Defacement", "T1590.005": "Gather Victim Network Information: IP Addresses", "T1499.001": "Endpoint Denial of Service: OS Exhaustion Flood", "T1014": "Rootkit", "T1546.013": "Event Triggered Execution: PowerShell Profile", "T1059.007": "Command and Scripting Interpreter: JavaScript", "T1590.002": "Gather Victim Network Information: DNS", "T1123": "Audio Capture", "T1543": "Create or Modify System Process", "T1133": "External Remote Services", "T1546.006": "Event Triggered Execution: LC_LOAD_DYLIB Addition", "T1539": "Steal Web Session Cookie", "T1053.007": "Scheduled Task/Job: Container Orchestration Job", "T1568.002": "Dynamic Resolution: Domain Generation Algorithms", "T1036.007": "Masquerading: Double File Extension", "T1548.002": "Abuse Elevation Control Mechanism: Bypass User Account Control", "T1016.001": "System Network Configuration Discovery: Internet Connection Discovery", "T1548.003": "Abuse Elevation Control Mechanism: Sudo and Sudo Caching", "T1560.003": "Archive Collected Data: Archive via Custom Method", "T1578": "Modify Cloud Compute Infrastructure", "T1069": "Permission Groups Discovery", "T1114": "Email Collection", "T1003.002": "OS Credential Dumping: Security Account Manager", "T1596.002": "Search Open Technical Databases: WHOIS", "T1542.001": "Pre-OS Boot: System Firmware", "T1594": "Search Victim-Owned Websites", "T1069.003": "Permission Groups Discovery: Cloud Groups", "T1574.011": "Hijack Execution Flow: Services Registry Permissions Weakness", "T1596.001": "Search Open Technical Databases: DNS/Passive DNS", "T1499.003": "Endpoint Denial of Service: Application Exhaustion Flood", "T1195.001": "Supply Chain Compromise: Compromise Software Dependencies and Development Tools", "T1588.004": "Obtain Capabilities: Digital Certificates", "T1583.002": "Acquire Infrastructure: DNS Server", "T1561": "Disk Wipe", "T1071.004": "Application Layer Protocol: DNS", "T1552.005": "Unsecured Credentials: Cloud Instance Metadata API", "T1555.002": "Credentials from Password Stores: Securityd Memory", "T1615": "Group Policy Discovery", "T1542.003": "Pre-OS Boot: Bootkit", "T1025": "Data from Removable Media", "T1218.013": "System Binary Proxy Execution: Mavinject", "T1074.001": "Data Staged: Local Data Staging", "T1036.005": "Masquerading: Match Legitimate Name or Location", "T1587.003": "Develop Capabilities: Digital Certificates", "T1565.001": "Data Manipulation: Stored Data Manipulation", "T1110.002": "Brute Force: Password Cracking", "T1114.001": "Email Collection: Local Email Collection", "T1555.001": "Credentials from Password Stores: Keychain", "T1547": "Boot or Logon Autostart Execution", "T1003.004": "OS Credential Dumping: LSA Secrets", "T1600": "Weaken Encryption", "T1606.002": "Forge Web Credentials: SAML Tokens", "T1489": "Service Stop", "T1587.001": "Develop Capabilities: Malware", "T1087.002": "Account Discovery: Domain Account", "T1547.014": "Boot or Logon Autostart Execution: Active Setup", "T1564": "Hide Artifacts", "T1559.002": "Inter-Process Communication: Dynamic Data Exchange", "T1204.002": "User Execution: Malicious File", "T1591.003": "Gather Victim Org Information: Identify Business Tempo", "T1592.001": "Gather Victim Host Information: Hardware", "T1080": "Taint Shared Content", "T1484.002": "Domain Policy Modification: Domain Trust Modification", "T1573.001": "Encrypted Channel: Symmetric Cryptography", "T1087.001": "Account Discovery: Local Account", "T1586.001": "Compromise Accounts: Social Media Accounts", "T1562.009": "Impair Defenses: Safe Mode Boot", "T1542.005": "Pre-OS Boot: TFTP Boot", "T1543.003": "Create or Modify System Process: Windows Service", "T1568.001": "Dynamic Resolution: Fast Flux DNS", "T1497.001": "Virtualization/Sandbox Evasion: System Checks", "T1053.003": "Scheduled Task/Job: Cron", "T1069.002": "Permission Groups Discovery: Domain Groups", "T1588.006": "Obtain Capabilities: Vulnerabilities", "T1566.002": "Phishing: Spearphishing Link", "T1070.002": "Indicator Removal: Clear Linux or Mac System Logs", "T1499.004": "Endpoint Denial of Service: Application or System Exploitation", "T1137": "Office Application Startup", "T1218.004": "System Binary Proxy Execution: InstallUtil", "T1598.003": "Phishing for Information: Spearphishing Link", "T1021.004": "Remote Services: SSH", "T1098.003": "Account Manipulation: Additional Cloud Roles", "T1547.012": "Boot or Logon Autostart Execution: Print Processors", "T1566.001": "Phishing: Spearphishing Attachment", "T1027.008": "Obfuscated Files or Information: Stripped Payloads", "T1559.001": "Inter-Process Communication: Component Object Model", "T1574.001": "Hijack Execution Flow: DLL Search Order Hijacking", "T1119": "Automated Collection", "T1115": "Clipboard Data", "T1003.007": "OS Credential Dumping: Proc Filesystem", "T1583.005": "Acquire Infrastructure: Botnet", "T1555.005": "Credentials from Password Stores: Password Managers", "T1553.001": "Subvert Trust Controls: Gatekeeper Bypass", "T1608.004": "Stage Capabilities: Drive-by Target", "T1007": "System Service Discovery", "T1040": "Network Sniffing", "T1553.002": "Subvert Trust Controls: Code Signing", "T1530": "Data from Cloud Storage", "T1565.003": "Data Manipulation: Runtime Data Manipulation", "T1552.002": "Unsecured Credentials: Credentials in Registry", "T1135": "Network Share Discovery", "T1120": "Peripheral Device Discovery", "T1590.004": "Gather Victim Network Information: Network Topology", "T1587.002": "Develop Capabilities: Code Signing Certificates", "T1222.001": "File and Directory Permissions Modification: Windows File and Directory Permissions Modification", "T1137.006": "Office Application Startup: Add-ins", "T1505.002": "Server Software Component: Transport Agent", "T1082": "System Information Discovery", "T1071": "Application Layer Protocol", "T1074.002": "Data Staged: Remote Data Staging", "T1053": "Scheduled Task/Job", "T1218.007": "System Binary Proxy Execution: Msiexec", "T1590.003": "Gather Victim Network Information: Network Trust Dependencies", "T1498.002": "Network Denial of Service: Reflection Amplification", "T1556.002": "Modify Authentication Process: Password Filter DLL", "T1505.005": "Server Software Component: Terminal Services DLL", "T1059.002": "Command and Scripting Interpreter: AppleScript", "T1176": "Browser Extensions", "T1499.002": "Endpoint Denial of Service: Service Exhaustion Flood", "T1195.003": "Supply Chain Compromise: Compromise Hardware Supply Chain", "T1106": "Native API", "T1070.007": "Indicator Removal: Clear Network Connection History and Configurations", "T1558.004": "Steal or Forge Kerberos Tickets: AS-REP Roasting", "T1584.003": "Compromise Infrastructure: Virtual Private Server", "T1600.001": "Weaken Encryption: Reduce Key Space", "T1070.003": "Indicator Removal: Clear Command History", "T1202": "Indirect Command Execution", "T1091": "Replication Through Removable Media", "T1005": "Data from Local System", "T1140": "Deobfuscate/Decode Files or Information", "T1137.005": "Office Application Startup: Outlook Rules", "T1562": "Impair Defenses", "T1586.003": "Compromise Accounts: Cloud Accounts", "T1586.002": "Compromise Accounts: Email Accounts", "T1608.001": "Stage Capabilities: Upload Malware", "T1195": "Supply Chain Compromise", "T1190": "Exploit Public-Facing Application", "T1558": "Steal or Forge Kerberos Tickets", "T1555": "Credentials from Password Stores", "T1567": "Exfiltration Over Web Service", "T1219": "Remote Access Software", "T1583.001": "Acquire Infrastructure: Domains", "T1560.002": "Archive Collected Data: Archive via Library", "T1055.003": "Process Injection: Thread Execution Hijacking", "T1036": "Masquerading", "T1546.011": "Event Triggered Execution: Application Shimming", "T1552": "Unsecured Credentials", "T1547.010": "Boot or Logon Autostart Execution: Port Monitors", "T1070.008": "Indicator Removal: Clear Mailbox Data", "T1037.002": "Boot or Logon Initialization Scripts: Login Hook", "T1055": "Process Injection", "T1205": "Traffic Signaling", "T1218": "System Binary Proxy Execution", "T1070.006": "Indicator Removal: Timestomp", "T1620": "Reflective Code Loading", "T1611": "Escape to Host", "T1547.009": "Boot or Logon Autostart Execution: Shortcut Modification", "T1010": "Application Window Discovery", "T1087.003": "Account Discovery: Email Account", "T1497.003": "Virtualization/Sandbox Evasion: Time Based Evasion", "T1218.003": "System Binary Proxy Execution: CMSTP", "T1563.001": "Remote Service Session Hijacking: SSH Hijacking", "T1562.002": "Impair Defenses: Disable Windows Event Logging", "T1029": "Scheduled Transfer", "T1021.002": "Remote Services: SMB/Windows Admin Shares", "T1525": "Implant Internal Image", "T1572": "Protocol Tunneling", "T1218.002": "System Binary Proxy Execution: Control Panel", "T1599.001": "Network Boundary Bridging: Network Address Translation Traversal", "T1608.002": "Stage Capabilities: Upload Tool", "T1547.005": "Boot or Logon Autostart Execution: Security Support Provider", "T1550": "Use Alternate Authentication Material", "T1597.001": "Search Closed Sources: Threat Intel Vendors", "T1011": "Exfiltration Over Other Network Medium", "T1602.002": "Data from Configuration Repository: Network Device Configuration Dump", "T1589": "Gather Victim Identity Information", "T1562.004": "Impair Defenses: Disable or Modify System Firewall", "T1560": "Archive Collected Data", "T1553.003": "Subvert Trust Controls: SIP and Trust Provider Hijacking", "T1185": "Browser Session Hijacking", "T1021": "Remote Services", "T1071.003": "Application Layer Protocol: Mail Protocols", "T1556.007": "Modify Authentication Process: Hybrid Identity", "T1595.002": "Active Scanning: Vulnerability Scanning", "T1596": "Search Open Technical Databases", "T1207": "Rogue Domain Controller", "T1553.006": "Subvert Trust Controls: Code Signing Policy Modification", "T1610": "Deploy Container", "T1112": "Modify Registry", "T1543.004": "Create or Modify System Process: Launch Daemon", "T1580": "Cloud Infrastructure Discovery", "T1555.003": "Credentials from Password Stores: Credentials from Web Browsers", "T1574.008": "Hijack Execution Flow: Path Interception by Search Order Hijacking", "T1491": "Defacement", "T1535": "Unused/Unsupported Cloud Regions", "T1557.003": "Adversary-in-the-Middle: DHCP Spoofing", "T1563": "Remote Service Session Hijacking", "T1027.001": "Obfuscated Files or Information: Binary Padding", "T1505.003": "Server Software Component: Web Shell", "T1484.001": "Domain Policy Modification: Group Policy Modification", "T1217": "Browser Bookmark Discovery", "T1552.004": "Unsecured Credentials: Private Keys", "T1583.004": "Acquire Infrastructure: Server", "T1021.006": "Remote Services: Windows Remote Management", "T1011.001": "Exfiltration Over Other Network Medium: Exfiltration Over Bluetooth", "T1078.001": "Valid Accounts: Default Accounts", "T1547.003": "Boot or Logon Autostart Execution: Time Providers", "T1546.005": "Event Triggered Execution: Trap", "T1574.006": "Hijack Execution Flow: Dynamic Linker Hijacking", "T1136.001": "Create Account: Local Account", "T1092": "Communication Through Removable Media", "T1070.001": "Indicator Removal: Clear Windows Event Logs", "T1585.002": "Establish Accounts: Email Accounts", "T1557.001": "Adversary-in-the-Middle: LLMNR/NBT-NS Poisoning and SMB Relay", "T1222": "File and Directory Permissions Modification", "T1003.001": "OS Credential Dumping: LSASS Memory", "T1595": "Active Scanning", "T1548": "Abuse Elevation Control Mechanism", "T1134.002": "Access Token Manipulation: Create Process with Token", "T1548.001": "Abuse Elevation Control Mechanism: Setuid and Setgid", "T1547.004": "Boot or Logon Autostart Execution: Winlogon Helper DLL", "T1021.003": "Remote Services: Distributed Component Object Model", "T1110.003": "Brute Force: Password Spraying", "T1090.002": "Proxy: External Proxy", "T1056.003": "Input Capture: Web Portal Capture", "T1589.002": "Gather Victim Identity Information: Email Addresses", "T1003.005": "OS Credential Dumping: Cached Domain Credentials", "T1098.004": "Account Manipulation: SSH Authorized Keys", "T1590.006": "Gather Victim Network Information: Network Security Appliances", "T1546.012": "Event Triggered Execution: Image File Execution Options Injection", "T1218.008": "System Binary Proxy Execution: Odbcconf", "T1593.002": "Search Open Websites/Domains: Search Engines", "T1591.002": "Gather Victim Org Information: Business Relationships", "T1125": "Video Capture", "T1055.013": "Process Injection: Process Doppelg\u00e4nging", "T1016": "System Network Configuration Discovery", "T1578.003": "Modify Cloud Compute Infrastructure: Delete Cloud Instance", "T1593.003": "Search Open Websites/Domains: Code Repositories", "T1574.005": "Hijack Execution Flow: Executable Installer File Permissions Weakness", "T1546.008": "Event Triggered Execution: Accessibility Features", "T1087": "Account Discovery", "T1090": "Proxy", "T1059": "Command and Scripting Interpreter", "T1562.006": "Impair Defenses: Indicator Blocking", "T1136.002": "Create Account: Domain Account", "T1589.003": "Gather Victim Identity Information: Employee Names", "T1482": "Domain Trust Discovery", "T1558.001": "Steal or Forge Kerberos Tickets: Golden Ticket", "T1020": "Automated Exfiltration", "T1592.004": "Gather Victim Host Information: Client Configurations", "T1562.007": "Impair Defenses: Disable or Modify Cloud Firewall", "T1036.002": "Masquerading: Right-to-Left Override", "T1588.001": "Obtain Capabilities: Malware", "T1542.002": "Pre-OS Boot: Component Firmware", "T1070": "Indicator Removal", "T1048.001": "Exfiltration Over Alternative Protocol: Exfiltration Over Symmetric Encrypted Non-C2 Protocol", "T1137.001": "Office Application Startup: Office Template Macros", "T1583.003": "Acquire Infrastructure: Virtual Private Server", "T1213.001": "Data from Information Repositories: Confluence", "T1550.003": "Use Alternate Authentication Material: Pass the Ticket", "T1609": "Container Administration Command", "T1083": "File and Directory Discovery", "T1568": "Dynamic Resolution", "T1036.004": "Masquerading: Masquerade Task or Service", "T1055.004": "Process Injection: Asynchronous Procedure Call", "T1020.001": "Automated Exfiltration: Traffic Duplication", "T1647": "Plist File Modification", "T1546.009": "Event Triggered Execution: AppCert DLLs", "T1114.003": "Email Collection: Email Forwarding Rule", "T1074": "Data Staged", "T1649": "Steal or Forge Authentication Certificates", "T1098.005": "Account Manipulation: Device Registration", "T1049": "System Network Connections Discovery", "T1584": "Compromise Infrastructure", "T1553.005": "Subvert Trust Controls: Mark-of-the-Web Bypass", "T1600.002": "Weaken Encryption: Disable Crypto Hardware", "T1542": "Pre-OS Boot", "T1612": "Build Image on Host", "T1055.002": "Process Injection: Portable Executable Injection", "T1218.012": "System Binary Proxy Execution: Verclsid", "T1586": "Compromise Accounts", "T1569.001": "System Services: Launchctl", "T1584.005": "Compromise Infrastructure: Botnet", "T1059.008": "Command and Scripting Interpreter: Network Device CLI", "T1552.003": "Unsecured Credentials: Bash History", "T1562.010": "Impair Defenses: Downgrade Attack", "T1559.003": "Inter-Process Communication: XPC Services", "T1497": "Virtualization/Sandbox Evasion", "T1102": "Web Service", "T1552.001": "Unsecured Credentials: Credentials In Files", "T1568.003": "Dynamic Resolution: DNS Calculation", "T1218.005": "System Binary Proxy Execution: Mshta", "T1547.015": "Boot or Logon Autostart Execution: Login Items", "T1608": "Stage Capabilities", "T1608.005": "Stage Capabilities: Link Target", "T1104": "Multi-Stage Channels", "T1480": "Execution Guardrails", "T1619": "Cloud Storage Object Discovery", "T1606.001": "Forge Web Credentials: Web Cookies", "T1134.001": "Access Token Manipulation: Token Impersonation/Theft", "T1567.001": "Exfiltration Over Web Service: Exfiltration to Code Repository", "T1205.001": "Traffic Signaling: Port Knocking", "T1583.006": "Acquire Infrastructure: Web Services", "T1528": "Steal Application Access Token", "T1598.002": "Phishing for Information: Spearphishing Attachment", "T1098.001": "Account Manipulation: Additional Cloud Credentials", "T1204": "User Execution", "T1491.001": "Defacement: Internal Defacement", "T1564.002": "Hide Artifacts: Hidden Users", "T1134.003": "Access Token Manipulation: Make and Impersonate Token", "T1552.006": "Unsecured Credentials: Group Policy Preferences", "T1048.002": "Exfiltration Over Alternative Protocol: Exfiltration Over Asymmetric Encrypted Non-C2 Protocol", "T1087.004": "Account Discovery: Cloud Account", "T1057": "Process Discovery", "T1562.003": "Impair Defenses: Impair Command History Logging", "T1546.003": "Event Triggered Execution: Windows Management Instrumentation Event Subscription", "T1596.004": "Search Open Technical Databases: CDNs", "T1497.002": "Virtualization/Sandbox Evasion: User Activity Based Checks", "T1585.003": "Establish Accounts: Cloud Accounts", "T1072": "Software Deployment Tools", "T1041": "Exfiltration Over C2 Channel", "T1134.004": "Access Token Manipulation: Parent PID Spoofing", "T1591": "Gather Victim Org Information", "T1606": "Forge Web Credentials", "T1621": "Multi-Factor Authentication Request Generation", "T1554": "Compromise Client Software Binary", "T1059.001": "Command and Scripting Interpreter: PowerShell", "T1546.001": "Event Triggered Execution: Change Default File Association", "T1055.014": "Process Injection: VDSO Hijacking", "T1071.002": "Application Layer Protocol: File Transfer Protocols", "T1212": "Exploitation for Credential Access", "T1546.014": "Event Triggered Execution: Emond", "T1102.003": "Web Service: One-Way Communication", "T1590": "Gather Victim Network Information", "T1210": "Exploitation of Remote Services", "T1534": "Internal Spearphishing", "T1574.010": "Hijack Execution Flow: Services File Permissions Weakness", "T1547.001": "Boot or Logon Autostart Execution: Registry Run Keys / Startup Folder", "T1199": "Trusted Relationship", "T1136.003": "Create Account: Cloud Account", "T1069.001": "Permission Groups Discovery: Local Groups", "T1593": "Search Open Websites/Domains", "T1098": "Account Manipulation", "T1048": "Exfiltration Over Alternative Protocol", "T1547.006": "Boot or Logon Autostart Execution: Kernel Modules and Extensions", "T1056.002": "Input Capture: GUI Input Capture", "T1588.002": "Obtain Capabilities: Tool", "T1052.001": "Exfiltration Over Physical Medium: Exfiltration over USB", "T1574.013": "Hijack Execution Flow: KernelCallbackTable", "T1597": "Search Closed Sources", "T1053.006": "Scheduled Task/Job: Systemd Timers", "T1566": "Phishing", "T1542.004": "Pre-OS Boot: ROMMONkit", "T1218.001": "System Binary Proxy Execution: Compiled HTML File", "T1070.005": "Indicator Removal: Network Share Connection Removal", "T1090.003": "Proxy: Multi-hop Proxy", "T1110": "Brute Force", "T1059.004": "Command and Scripting Interpreter: Unix Shell", "T1137.003": "Office Application Startup: Outlook Forms", "T1562.001": "Impair Defenses: Disable or Modify Tools", "T1565": "Data Manipulation", "T1559": "Inter-Process Communication", "T1001": "Data Obfuscation", "T1039": "Data from Network Shared Drive", "T1584.006": "Compromise Infrastructure: Web Services", "T1601": "Modify System Image", "T1574": "Hijack Execution Flow", "T1027.005": "Obfuscated Files or Information: Indicator Removal from Tools", "T1204.003": "User Execution: Malicious Image", "T1078": "Valid Accounts", "T1571": "Non-Standard Port", "T1585.001": "Establish Accounts: Social Media Accounts", "T1055.012": "Process Injection: Process Hollowing", "T1068": "Exploitation for Privilege Escalation", "T1564.009": "Hide Artifacts: Resource Forking", "T1531": "Account Access Removal", "T1110.004": "Brute Force: Credential Stuffing", "T1027": "Obfuscated Files or Information", "T1556.006": "Modify Authentication Process: Multi-Factor Authentication", "T1114.002": "Email Collection: Remote Email Collection", "T1505.004": "Server Software Component: IIS Components", "T1036.001": "Masquerading: Invalid Code Signature", "T1564.006": "Hide Artifacts: Run Virtual Instance", "T1201": "Password Policy Discovery", "T1546": "Event Triggered Execution", "T1546.004": "Event Triggered Execution: Unix Shell Configuration Modification", "T1187": "Forced Authentication", "T1134.005": "Access Token Manipulation: SID-History Injection", "T1599": "Network Boundary Bridging", "T1486": "Data Encrypted for Impact", "T1553": "Subvert Trust Controls", "T1548.004": "Abuse Elevation Control Mechanism: Elevated Execution with Prompt", "T1592.003": "Gather Victim Host Information: Firmware", "T1573": "Encrypted Channel", "T1547.002": "Boot or Logon Autostart Execution: Authentication Package", "T1218.010": "System Binary Proxy Execution: Regsvr32", "T1592.002": "Gather Victim Host Information: Software", "T1056": "Input Capture", "T1587.004": "Develop Capabilities: Exploits", "T1593.001": "Search Open Websites/Domains: Social Media", "T1546.015": "Event Triggered Execution: Component Object Model Hijacking", "T1589.001": "Gather Victim Identity Information: Credentials", "T1195.002": "Supply Chain Compromise: Compromise Software Supply Chain", "T1036.003": "Masquerading: Rename System Utilities", "T1102.002": "Web Service: Bidirectional Communication", "T1203": "Exploitation for Client Execution", "T1595.003": "Active Scanning: Wordlist Scanning", "T1137.004": "Office Application Startup: Outlook Home Page", "T1573.002": "Encrypted Channel: Asymmetric Cryptography", "T1567.002": "Exfiltration Over Web Service: Exfiltration to Cloud Storage", "T1570": "Lateral Tool Transfer", "T1574.009": "Hijack Execution Flow: Path Interception by Unquoted Path", "T1608.003": "Stage Capabilities: Install Digital Certificate", "T1037.005": "Boot or Logon Initialization Scripts: Startup Items", "T1614.001": "System Location Discovery: System Language Discovery", "T1095": "Non-Application Layer Protocol", "T1027.003": "Obfuscated Files or Information: Steganography", "T1584.002": "Compromise Infrastructure: DNS Server", "T1001.003": "Data Obfuscation: Protocol Impersonation", "T1012": "Query Registry", "T1030": "Data Transfer Size Limits", "T1550.004": "Use Alternate Authentication Material: Web Session Cookie", "T1078.002": "Valid Accounts: Domain Accounts", "T1218.009": "System Binary Proxy Execution: Regsvcs/Regasm", "T1553.004": "Subvert Trust Controls: Install Root Certificate", "T1037.003": "Boot or Logon Initialization Scripts: Network Logon Script", "T1499": "Endpoint Denial of Service", "T1027.004": "Obfuscated Files or Information: Compile After Delivery", "T1614": "System Location Discovery", "T1564.007": "Hide Artifacts: VBA Stomping", "T1197": "BITS Jobs", "T1127.001": "Trusted Developer Utilities Proxy Execution: MSBuild", "T1090.004": "Proxy: Domain Fronting", "T1557.002": "Adversary-in-the-Middle: ARP Cache Poisoning", "T1562.008": "Impair Defenses: Disable Cloud Logs", "T1518.001": "Software Discovery: Security Software Discovery", "T1564.003": "Hide Artifacts: Hidden Window", "T1059.006": "Command and Scripting Interpreter: Python", "T1591.004": "Gather Victim Org Information: Identify Roles", "T1132": "Data Encoding", "T1546.010": "Event Triggered Execution: AppInit DLLs", "T1598": "Phishing for Information", "T1496": "Resource Hijacking", "T1585": "Establish Accounts", "T1588": "Obtain Capabilities", "T1546.002": "Event Triggered Execution: Screensaver", "T1578.002": "Modify Cloud Compute Infrastructure: Create Cloud Instance", "T1213.003": "Data from Information Repositories: Code Repositories", "T1565.002": "Data Manipulation: Transmitted Data Manipulation", "T1003.008": "OS Credential Dumping: /etc/passwd and /etc/shadow", "T1543.001": "Create or Modify System Process: Launch Agent", "T1569": "System Services", "T1059.003": "Command and Scripting Interpreter: Windows Command Shell", "T1055.009": "Process Injection: Proc Memory", "T1601.001": "Modify System Image: Patch System Image", "T1558.002": "Steal or Forge Kerberos Tickets: Silver Ticket", "T1213": "Data from Information Repositories", "T1070.009": "Indicator Removal: Clear Persistence", "T1555.004": "Credentials from Password Stores: Windows Credential Manager", "T1200": "Hardware Additions", "T1505": "Server Software Component", "T1485": "Data Destruction", "T1132.002": "Data Encoding: Non-Standard Encoding", "T1556.001": "Modify Authentication Process: Domain Controller Authentication", "T1537": "Transfer Data to Cloud Account", "T1027.006": "Obfuscated Files or Information: HTML Smuggling", "T1556.005": "Modify Authentication Process: Reversible Encryption", "T1070.004": "Indicator Removal: File Deletion", "T1189": "Drive-by Compromise", "T1498": "Network Denial of Service", "T1546.016": "Event Triggered Execution: Installer Packages", "T1595.001": "Active Scanning: Scanning IP Blocks", "T1221": "Template Injection", "T1037.004": "Boot or Logon Initialization Scripts: RC Scripts", "T1134": "Access Token Manipulation", "T1111": "Multi-Factor Authentication Interception", "T1027.002": "Obfuscated Files or Information: Software Packing", "T1584.007": "Compromise Infrastructure: Serverless", "T1071.001": "Application Layer Protocol: Web Protocols", "T1059.005": "Command and Scripting Interpreter: Visual Basic", "T1564.005": "Hide Artifacts: Hidden File System", "T1543.002": "Create or Modify System Process: Systemd Service", "T1563.002": "Remote Service Session Hijacking: RDP Hijacking", "T1136": "Create Account", "T1547.013": "Boot or Logon Autostart Execution: XDG Autostart Entries", "T1584.004": "Compromise Infrastructure: Server", "T1526": "Cloud Service Discovery", "T1018": "Remote System Discovery", "T1046": "Network Service Discovery", "T1590.001": "Gather Victim Network Information: Domain Properties", "T1518": "Software Discovery", "T1538": "Cloud Service Dashboard", "T1055.005": "Process Injection: Thread Local Storage", "T1622": "Debugger Evasion", "T1036.006": "Masquerading: Space after Filename", "T1547.007": "Boot or Logon Autostart Execution: Re-opened Applications", "T1608.006": "Stage Capabilities: SEO Poisoning", "T1550.002": "Use Alternate Authentication Material: Pass the Hash", "T1052": "Exfiltration Over Physical Medium", "T1574.002": "Hijack Execution Flow: DLL Side-Loading", "T1105": "Ingress Tool Transfer", "T1098.002": "Account Manipulation: Additional Email Delegate Permissions", "T1588.003": "Obtain Capabilities: Code Signing Certificates", "T1648": "Serverless Execution", "T1055.008": "Process Injection: Ptrace System Calls", "T1027.007": "Obfuscated Files or Information: Dynamic API Resolution", "T1021.001": "Remote Services: Remote Desktop Protocol", "T1037.001": "Boot or Logon Initialization Scripts: Logon Script (Windows)", "T1055.015": "Process Injection: ListPlanting", "T1484": "Domain Policy Modification", "T1220": "XSL Script Processing", "T1596.005": "Search Open Technical Databases: Scan Databases", "T1564.001": "Hide Artifacts: Hidden Files and Directories", "T1578.001": "Modify Cloud Compute Infrastructure: Create Snapshot", "T1591.001": "Gather Victim Org Information: Determine Physical Locations", "T1137.002": "Office Application Startup: Office Test", "T1587": "Develop Capabilities", "T1003.003": "OS Credential Dumping: NTDS", "T1602.001": "Data from Configuration Repository: SNMP (MIB Dump)", "T1001.002": "Data Obfuscation: Steganography", "T1204.001": "User Execution: Malicious Link", "T1550.001": "Use Alternate Authentication Material: Application Access Token", "T1547.008": "Boot or Logon Autostart Execution: LSASS Driver", "T1569.002": "System Services: Service Execution", "T1078.004": "Valid Accounts: Cloud Accounts", "T1480.001": "Execution Guardrails: Environmental Keying", "T1008": "Fallback Channels", "T1564.004": "Hide Artifacts: NTFS File Attributes", "T1558.003": "Steal or Forge Kerberos Tickets: Kerberoasting", "T1003.006": "OS Credential Dumping: DCSync", "T1124": "System Time Discovery", "T1053.002": "Scheduled Task/Job: At", "T1055.001": "Process Injection: Dynamic-link Library Injection", "T1588.005": "Obtain Capabilities: Exploits", "T1556": "Modify Authentication Process", "T1056.004": "Input Capture: Credential API Hooking", "T1495": "Firmware Corruption", "T1490": "Inhibit System Recovery", "T1546.007": "Event Triggered Execution: Netsh Helper DLL", "T1566.003": "Phishing: Spearphishing via Service", "T1090.001": "Proxy: Internal Proxy", "T1216": "System Script Proxy Execution", "T1102.001": "Web Service: Dead Drop Resolver", "T1001.001": "Data Obfuscation: Junk Data", "T1598.001": "Phishing for Information: Spearphishing Service", "T1552.007": "Unsecured Credentials: Container API", "T1584.001": "Compromise Infrastructure: Domains", "T1505.001": "Server Software Component: SQL Stored Procedures", "T1556.004": "Modify Authentication Process: Network Device Authentication", "T1561.001": "Disk Wipe: Disk Content Wipe", "T1048.003": "Exfiltration Over Alternative Protocol: Exfiltration Over Unencrypted Non-C2 Protocol", "T1574.004": "Hijack Execution Flow: Dylib Hijacking", "T1601.002": "Modify System Image: Downgrade System Image", "T1078.003": "Valid Accounts: Local Accounts", "T1211": "Exploitation for Defense Evasion", "T1127": "Trusted Developer Utilities Proxy Execution", "T1529": "System Shutdown/Reboot", "T1218.014": "System Binary Proxy Execution: MMC", "T1564.010": "Hide Artifacts: Process Argument Spoofing", "T1574.012": "Hijack Execution Flow: COR_PROFILER"} -------------------------------------------------------------------------------- /nlp_general.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Wed Mar 18 12:15:21 2020 3 | 4 | @author: babdeen 5 | """ 6 | import spacy 7 | 8 | import numpy as np 9 | from spacy.lang.en import English 10 | parser = spacy.load("en_core_web_lg",disable=['ner']) 11 | from nltk.corpus import stopwords 12 | stop_words = set(stopwords.words('english')) 13 | 14 | stop_words = stop_words.union(['may','should','can','could']) 15 | 16 | sw_NN = set(list(stop_words)+['part','kind','different','number','all','many','kinds','parts','whole','certain','various','other','such','both','multiple','some','several']) 17 | import string 18 | from nltk.corpus import wordnet as wn 19 | import re 20 | from collections import Counter,OrderedDict 21 | import pickle 22 | from nltk.stem.wordnet import WordNetLemmatizer 23 | lemmatizer = WordNetLemmatizer() 24 | from sklearn.feature_extraction.text import TfidfVectorizer 25 | from pattern.en import lexeme, pluralize 26 | from numpy.linalg import norm 27 | from numpy import dot 28 | 29 | 30 | class NLP: 31 | def srl_to_dict(srl): 32 | SRLDict = {} 33 | for verb in srl['verbs']: 34 | verb_str = verb['id'] 35 | SRLDict[verb_str] = {} 36 | for ind,tag in enumerate(verb['tags']): 37 | if tag != 'O': 38 | if tag[0] == 'B': 39 | newTag = tag[tag.find('-')+1:] 40 | if newTag not in SRLDict[verb_str]: 41 | SRLDict[verb_str][newTag] = {'text': srl['words'][ind] } 42 | else: 43 | SRLDict[verb_str][newTag]['text'] += ('/ ' + srl['words'][ind]) 44 | if newTag == 'V': 45 | SRLDict[verb_str][newTag]['index'] = ind 46 | else : 47 | newTag = tag[tag.find('-')+1:] 48 | if newTag not in SRLDict[verb_str]: 49 | continue 50 | SRLDict[verb_str][newTag]['text'] += (' ' + srl['words'][ind]) 51 | 52 | return SRLDict 53 | def add_v_id_srl_from_dict(srls): 54 | for srl in srls: 55 | NLP.add_v_id_srl(srls[srl]) 56 | 57 | def srl_to_dict_from_dict(srls): 58 | return {srl:NLP.srl_to_dict(srls[srl]) for srl in srls} 59 | def add_v_id_srl(srl): 60 | verbs = set() 61 | counter = {} 62 | 63 | for v in srl["verbs"]: 64 | if v["verb"] not in verbs: 65 | verbs.add(v["verb"]) 66 | counter[v["verb"]] = 1 67 | v['id'] = v["verb"] 68 | else: 69 | counter[v["verb"]] += 1 70 | v['id'] = v["verb"] + '_' + str(counter[v["verb"]]) 71 | 72 | def get_lemma(word,is_verb): 73 | return lemmatizer.lemmatize(word.lower(), wn.VERB if is_verb else wn.NOUN) 74 | 75 | 76 | def load_model(self,model_name): 77 | if model_name == 'ner': 78 | self.ner_spacy = spacy.load('en_core_web_lg',disable=['parser', 'tagger']) 79 | if model_name == 'pos': 80 | self.pos_tagger = spacy.load("en_core_web_lg",disable=['ner', 'parser']) 81 | if model_name == 'parse': 82 | self.parser = spacy.load("en_core_web_lg",disable=['ner']) 83 | if model_name == 'dep': 84 | self.dep_tagger = spacy.load("en_core_web_lg",disable=['ner']) 85 | if model_name == 'sentencizer': 86 | self.nlp = English() 87 | #self.nlp.add_pipe(self.nlp.create_pipe('sentencizer')) 88 | self.nlp.add_pipe('sentencizer') 89 | 90 | 91 | def seperate_sentences(self,text): 92 | doc = self.nlp(text) 93 | #sentences = [sent.string.strip() for sent in doc.sents] 94 | sentences = [sent.text.strip() for sent in doc.sents] 95 | return sentences 96 | 97 | 98 | def cos_sim(x,y): 99 | a = np.array(x) 100 | b = np.array(y) 101 | if norm(a) == 0 or norm(b) == 0 : 102 | return 0 103 | return abs(dot(a, b)/(norm(a)*norm(b))) 104 | 105 | 106 | def extract_VO_from_docs_lambda(self,doc_srl,arg_constrain = {}, join_args = True, return_args = ['V','ARG1','ARG2','ARG3','ARGM-LOC','ARGM-EXT']): 107 | out = {} 108 | exclude = [] 109 | for doc in doc_srl: 110 | out[doc] = [] 111 | for sent in doc_srl[doc]: 112 | 113 | 114 | for v in doc_srl[doc][sent]: 115 | if 'V' in doc_srl[doc][sent][v] : 116 | 117 | cont = True 118 | for arg in arg_constrain: 119 | if arg=='forbid': 120 | for srl_arg in doc_srl[doc][sent][v]: 121 | if srl_arg in arg_constrain[arg]: 122 | cont = False 123 | break 124 | if cont == False : 125 | break 126 | elif arg not in doc_srl[doc][sent][v] : 127 | cont = False 128 | break 129 | elif arg_constrain[arg](doc_srl[doc][sent][v][arg]) == False: 130 | cont = False 131 | break 132 | if cont: 133 | args = [doc_srl[doc][sent][v][arg]['text'] for arg in doc_srl[doc][sent][v] if arg in return_args] 134 | VO = ' '.join(args) if join_args else args 135 | out[doc].append((VO,sent)) 136 | return out 137 | 138 | def filter_srl_docs_lambda(self,doc_srl,arg_constrain = {},return_args = None): 139 | out = {} 140 | for doc in doc_srl: 141 | out[doc] = {} 142 | for sent in doc_srl[doc]: 143 | out[doc][sent] = {} 144 | for v in doc_srl[doc][sent]: 145 | 146 | cont = True 147 | for arg in arg_constrain: 148 | if arg=='forbid': 149 | for srl_arg in doc_srl[doc][sent][v]: 150 | if srl_arg in arg_constrain[arg]: 151 | cont = False 152 | break 153 | if cont == False : 154 | break 155 | elif arg not in doc_srl[doc][sent][v] : 156 | cont = False 157 | break 158 | elif arg_constrain[arg](doc_srl[doc][sent][v][arg]['text']) == False: 159 | cont = False 160 | break 161 | if cont: 162 | out[doc][sent][v] = {i:doc_srl[doc][sent][v][i] for i in doc_srl[doc][sent][v] if return_args == None or i in return_args } 163 | return out 164 | 165 | def extract_VO_from_sents_lambda(self,sents_srl,arg_constrain = {}, join_args = True, return_args = ['V','ARG1','ARG2','ARG3','ARGM-LOC','ARGM-EXT']): 166 | out = {} 167 | exclude = [] 168 | 169 | for sent in sents_srl: 170 | out[sent] = [] 171 | 172 | for v in sents_srl[sent]: 173 | if 'V' in sents_srl[sent][v] : 174 | 175 | cont = True 176 | for arg in arg_constrain: 177 | if arg=='forbid': 178 | for srl_arg in sents_srl[sent][v]: 179 | if srl_arg in arg_constrain[arg]: 180 | cont = False 181 | break 182 | if cont == False : 183 | break 184 | elif arg not in sents_srl[sent][v] : 185 | cont = False 186 | break 187 | elif arg_constrain[arg](sents_srl[sent][v][arg]['text']) == False: 188 | cont = False 189 | break 190 | if cont: 191 | args = [sents_srl[sent][v][arg]['text'] for arg in sents_srl[sent][v] if arg in return_args] 192 | VO = ' '.join(args) if join_args else args 193 | out[sent].append((VO,sent)) 194 | return out 195 | 196 | def extract_VO_from_sents(self,sents_srl,arg_constrain = {}, join_args = True, return_args = ['V','ARG1','ARG2','ARG3','ARGM-LOC','ARGM-EXT']): 197 | out = {} 198 | for sent in sents_srl: 199 | out[sent] = [] 200 | 201 | 202 | for v in sents_srl[sent]: 203 | if 'V' in sents_srl[sent][v] : 204 | 205 | cont = True 206 | for arg in arg_constrain: 207 | if arg=='forbid': 208 | for srl_arg in sents_srl[sent][v]: 209 | if srl_arg in arg_constrain[arg]: 210 | cont = False 211 | break 212 | if cont == False : 213 | break 214 | elif arg not in sents_srl[sent][v] : 215 | cont = False 216 | break 217 | elif arg_constrain[arg](sents_srl[sent][v][arg]['text']) == False: 218 | cont = False 219 | break 220 | if cont: 221 | args = [sents_srl[sent][v][arg]['text'] for arg in sents_srl[sent][v] if arg in return_args] 222 | VO = ' '.join(args) if join_args else args 223 | out[sent].append((VO,sent)) 224 | return out 225 | def extract_VO_from_srl(srl,sent,arg_constrain = {}, join_args = True, return_args = ['V','ARG1','ARG2','ARG3','ARGM-LOC','ARGM-EXT']): 226 | exclude = [] 227 | 228 | VO = '' 229 | # for v in srl: 230 | if 'V' in srl and srl['V']['text'] not in exclude: 231 | 232 | cont = True 233 | for arg in arg_constrain: 234 | if arg not in srl: 235 | cont = False 236 | break 237 | elif srl[arg]['text'].lower() not in arg_constrain[arg] and arg_constrain[arg] != 'any': 238 | cont = False 239 | break 240 | if cont: 241 | args = [srl[arg]['text'] for arg in srl if arg in return_args] 242 | VO = ' '.join(args) if join_args else args 243 | return VO 244 | -------------------------------------------------------------------------------- /parse_class.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Tue Jul 21 23:21:08 2020 4 | 5 | @author: babdeen 6 | """ 7 | 8 | 9 | # -*- coding: utf-8 -*- 10 | """ 11 | Created on Sun Jul 5 23:01:55 2020 12 | 13 | @author: babdeen 14 | """ 15 | 16 | from allennlp.predictors.predictor import Predictor 17 | from nlp_general import NLP 18 | # import allennlp_models.syntax.srl 19 | # SRL = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/bert-base-srl-2020.11.19.tar.gz") 20 | SRL = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/structured-prediction-srl-bert.2020.12.15.tar.gz") 21 | # DT = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/biaffine-dependency-parser-ptb-2020.04.06.tar.gz") 22 | # CT = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/elmo-constituency-parser-2020.02.10.tar.gz") 23 | 24 | 25 | class Parser: 26 | def extract_srl(text): 27 | srl = SRL.predict(text) 28 | Parser.add_v_id_srl(srl) 29 | # srl_dict = Parser.srl_to_dict(srl) 30 | return srl 31 | 32 | # def extract_dt(text): 33 | # return DT.predict(text) 34 | # def extract_ct(text): 35 | # return CT.predict(text) 36 | def srl_to_dict(srl): 37 | SRLDict = {} 38 | for verb in srl['verbs']: 39 | verb_str = verb['id'] 40 | SRLDict[verb_str] = {} 41 | for ind,tag in enumerate(verb['tags']): 42 | if tag != 'O': 43 | if tag[0] == 'B': 44 | newTag = tag[tag.find('-')+1:] 45 | if newTag not in SRLDict[verb_str]: 46 | SRLDict[verb_str][newTag] = {'text': srl['words'][ind] } 47 | else: 48 | SRLDict[verb_str][newTag]['text'] += ('/ ' + srl['words'][ind]) 49 | # if newTag == 'V': 50 | SRLDict[verb_str][newTag]['index'] = ind 51 | else : 52 | newTag = tag[tag.find('-')+1:] 53 | if newTag not in SRLDict[verb_str]: 54 | continue 55 | SRLDict[verb_str][newTag]['text'] += (' ' + srl['words'][ind]) 56 | 57 | return SRLDict 58 | 59 | 60 | def srl_to_dict_from_dict(srls): 61 | return {srl:Parser.srl_to_dict(srls[srl]) for srl in srls} 62 | 63 | def srl_to_dict_from_list(srls): 64 | return [Parser.srl_to_dict(srls[srl]) for srl in srls] 65 | 66 | def add_v_id_srl(srl): 67 | verbs = set() 68 | counter = {} 69 | 70 | for v in srl["verbs"]: 71 | if v["verb"] not in verbs: 72 | verbs.add(v["verb"]) 73 | counter[v["verb"]] = 1 74 | v['id'] = v["verb"] 75 | else: 76 | counter[v["verb"]] += 1 77 | v['id'] = v["verb"] + '_' + str(counter[v["verb"]]) 78 | 79 | def add_v_id_srl_from_dict(srls): 80 | for srl in srls: 81 | Parser.add_v_id_srl(srls[srl]) 82 | 83 | 84 | def get_words_verb(sent,is_SRL = True,args = ['ARG0','ARG1','ARG2','ARG3'], ): 85 | words_verbs = {} 86 | if is_SRL: 87 | srl = sent 88 | else: 89 | srl = Parser.extract_srl(sent) 90 | srl = Parser.srl_to_dict(srl) 91 | 92 | for v in srl: 93 | if 'V' not in srl[v]: 94 | continue 95 | args_join = ' '.join([srl[v][arg]['text'] if arg in srl[v] else '' for arg in args]) 96 | for w in args_join.split(): 97 | if w in words_verbs: 98 | words_verbs[w].append(srl[v]['V']['text']) 99 | else: 100 | words_verbs[w] = [srl[v]['V']['text']] 101 | return words_verbs 102 | 103 | def get_words_sub(sent, is_SRL = True, args = ['ARG1','ARG2','ARG3']): 104 | words_subs = {} 105 | 106 | if is_SRL: 107 | srl = sent 108 | else: 109 | srl = Parser.extract_srl(sent) 110 | srl = Parser.srl_to_dict(srl) 111 | 112 | for v in srl: 113 | if 'ARG0' in srl[v]: 114 | args_join = ' '.join([srl[v][arg]['text'] if arg in srl[v] else '' for arg in args]) 115 | 116 | for w in args_join.split(): 117 | if w in words_subs: 118 | words_subs[w].append(srl[v]['ARG0']['text']) 119 | else: 120 | words_subs[w] = [srl[v]['ARG0']['text']] 121 | return words_subs 122 | 123 | def extract_VO_from_docs(doc_srl,arg_constrain = {}, join_args = True, return_args = ['V','ARG1','ARG2','ARG3','ARGM-LOC','ARGM-EXT'], exclude_dep = ['amod']): 124 | out = [] 125 | for doc in doc_srl: 126 | for sent in doc_srl[doc]: 127 | 128 | if exclude_dep != []: 129 | exclude = NLP.extract_dep(sent, exclude_dep) 130 | for v in doc_srl[doc][sent]: 131 | if 'V' in doc_srl[doc][sent][v] and doc_srl[doc][sent][v]['V']['text'] not in exclude: 132 | 133 | cont = True 134 | for arg in arg_constrain: 135 | if arg not in doc_srl[doc][sent][v]: 136 | cont = False 137 | break 138 | elif doc_srl[doc][sent][v][arg]['text'].lower() not in arg_constrain[arg] and arg_constrain[arg] != 'any': 139 | cont = False 140 | break 141 | if cont: 142 | args = [doc_srl[doc][sent][v][arg]['text'] for arg in doc_srl[doc][sent][v] if arg in return_args] 143 | VO = ' '.join(args) if join_args else args 144 | out.append((VO,sent)) 145 | return out 146 | 147 | def extract_VO_from_sents(sents_srl,arg_constrain = {}, join_args = True, return_args = ['V','ARG1','ARG2','ARG3','ARGM-LOC','ARGM-EXT'], exclude_dep = ['amod']): 148 | out = [] 149 | exclude = [] 150 | for sent in sents_srl: 151 | 152 | if exclude_dep != []: 153 | exclude = NLP.extract_dep(sent, exclude_dep) 154 | 155 | for v in sents_srl[sent]: 156 | if 'V' in sents_srl[sent][v] and sents_srl[sent][v]['V']['text'] not in exclude: 157 | 158 | cont = True 159 | for arg in arg_constrain: 160 | if arg not in sents_srl[sent][v]: 161 | cont = False 162 | break 163 | elif sents_srl[sent][v][arg]['text'].lower() not in arg_constrain[arg] and arg_constrain[arg] != 'any': 164 | cont = False 165 | break 166 | if cont: 167 | args = [sents_srl[sent][v][arg]['text'] for arg in sents_srl[sent][v] if arg in return_args] 168 | VO = ' '.join(args) if join_args else args 169 | out.append((VO,sent)) 170 | return out 171 | 172 | def extract_VO_from_srl(srl,sent,arg_constrain = {}, join_args = True, return_args = ['V','ARG1','ARG2','ARG3','ARGM-LOC','ARGM-EXT'], exclude_dep = ['amod']): 173 | 174 | if exclude_dep != []: 175 | exclude = NLP.extract_dep(sent, exclude_dep) 176 | 177 | VO = '' 178 | # for v in srl: 179 | if 'V' in srl and srl['V']['text'] not in exclude: 180 | 181 | cont = True 182 | for arg in arg_constrain: 183 | if arg not in srl: 184 | cont = False 185 | break 186 | elif srl[arg]['text'].lower() not in arg_constrain[arg] and arg_constrain[arg] != 'any': 187 | cont = False 188 | break 189 | if cont: 190 | args = [srl[arg]['text'] for arg in srl if arg in return_args] 191 | VO = ' '.join(args) if join_args else args 192 | return VO 193 | 194 | 195 | -------------------------------------------------------------------------------- /requirements-frozen.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.9.1 2 | aiosignal==1.3.1 3 | allennlp==2.10.1 4 | allennlp-models==2.10.1 5 | async-timeout==4.0.3 6 | attrs==23.1.0 7 | autocommand==2.2.2 8 | backports.csv==1.0.7 9 | base58==2.1.1 10 | beautifulsoup4==4.12.2 11 | benepar==0.2.0 12 | blis==0.7.11 13 | boto3==1.34.4 14 | botocore==1.34.4 15 | cached-path==1.1.6 16 | cachetools==5.3.2 17 | catalogue==2.0.10 18 | certifi==2023.11.17 19 | cffi==1.16.0 20 | charset-normalizer==3.1.0 21 | cheroot==10.0.0 22 | CherryPy==18.9.0 23 | click==8.1.7 24 | colorama==0.4.6 25 | commonmark==0.9.1 26 | confection==0.0.4 27 | conllu==4.4.2 28 | cryptography==41.0.7 29 | cymem==2.0.8 30 | datasets==2.10.1 31 | dill==0.3.6 32 | docker-pycreds==0.4.0 33 | en-core-web-lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.3.0/en_core_web_lg-3.3.0-py3-none-any.whl#sha256=6ce19d37dfe5280400f80a5954d41afca10cbc742b97bfcf4b0e452b6eb24273 34 | en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl#sha256=84d7d8059bfbf53c09b39139782f76cd6ac7064851e7799dcc685c06ebf5fd4f 35 | exceptiongroup==1.2.0 36 | fairscale==0.4.6 37 | feedparser==6.0.11 38 | filelock==3.7.1 39 | frozenlist==1.4.1 40 | fsspec==2023.12.2 41 | ftfy==6.1.3 42 | future==0.18.3 43 | gitdb==4.0.11 44 | GitPython==3.1.40 45 | google-api-core==2.15.0 46 | google-auth==2.25.2 47 | google-cloud-core==2.4.1 48 | google-cloud-storage==2.14.0 49 | google-crc32c==1.5.0 50 | google-resumable-media==2.7.0 51 | googleapis-common-protos==1.62.0 52 | h5py==3.10.0 53 | huggingface-hub==0.10.1 54 | idna==3.6 55 | importlib-resources==6.1.1 56 | inflect==6.0.1 57 | iniconfig==2.0.0 58 | jaraco.collections==5.0.0 59 | jaraco.context==4.3.0 60 | jaraco.functools==4.0.0 61 | jaraco.text==3.12.0 62 | Jinja2==3.1.2 63 | jmespath==1.0.1 64 | joblib==1.3.2 65 | langcodes==3.3.0 66 | lmdb==1.3.0 67 | lxml==4.9.4 68 | MarkupSafe==2.1.3 69 | more-itertools==10.1.0 70 | multidict==6.0.4 71 | multiprocess==0.70.14 72 | murmurhash==1.0.10 73 | mysqlclient==2.2.1 74 | neuralcoref==4.0 75 | nltk==3.8.1 76 | numpy==1.24.4 77 | packaging==23.2 78 | pandas==2.0.3 79 | pathtools==0.1.2 80 | pathy==0.10.1 81 | Pattern==3.6 82 | pdfminer.six==20221105 83 | pdfplumber==0.7.4 84 | Pillow==9.2.0 85 | pluggy==1.3.0 86 | portend==3.2.0 87 | preshed==3.0.9 88 | promise==2.3 89 | protobuf==3.20.3 90 | psutil==5.9.7 91 | py-rouge==1.1 92 | pyarrow==14.0.2 93 | pyasn1==0.5.1 94 | pyasn1-modules==0.3.0 95 | pycparser==2.21 96 | pydantic==1.8.2 97 | pydantic_core==2.0.2 98 | Pygments==2.17.2 99 | pytest==7.4.3 100 | python-dateutil==2.8.2 101 | python-docx==1.1.0 102 | pytz==2023.3.post1 103 | pywin32==306 104 | PyYAML==6.0.1 105 | regex==2023.10.3 106 | requests==2.30.0 107 | responses==0.18.0 108 | rich==12.6.0 109 | rsa==4.9 110 | s3transfer==0.9.0 111 | sacremoses==0.1.1 112 | scikit-learn==1.3.0 113 | scipy==1.10.1 114 | sentence-transformers==2.2.2 115 | sentencepiece==0.1.99 116 | sentry-sdk==1.39.1 117 | setproctitle==1.3.3 118 | sgmllib3k==1.0.0 119 | shortuuid==1.0.11 120 | six==1.16.0 121 | smart-open==5.2.1 122 | smmap==5.0.1 123 | soupsieve==2.5 124 | spacy==3.3.3 125 | spacy-legacy==3.0.12 126 | spacy-loggers==1.0.1 127 | srsly==2.4.8 128 | tempora==5.5.0 129 | tensorboardX==2.6.2.2 130 | termcolor==1.1.0 131 | thinc==8.0.17 132 | threadpoolctl==3.2.0 133 | tokenizers==0.12.1 134 | tomli==2.0.1 135 | torch==1.12.1 136 | torch-struct==0.5 137 | torchvision==0.13.1 138 | tqdm==4.65.0 139 | traitlets==5.14.0 140 | transformers==4.20.1 141 | typer==0.4.2 142 | typing_extensions==4.5.0 143 | tzdata==2023.3 144 | urllib3==1.26.18 145 | Wand==0.6.10 146 | wandb==0.12.21 147 | wasabi==0.10.1 148 | wcwidth==0.2.12 149 | word2number==1.1 150 | wrapt==1.12.1 151 | xxhash==3.4.1 152 | yarl==1.9.4 153 | zc.lockfile==3.0.post1 154 | zipp==3.17.0 155 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | allennlp==2.10.1 2 | allennlp-models==2.10.1 3 | nltk==3.8.1 4 | numpy==1.24.4 5 | Pattern==3.6 6 | scikit_learn==1.3.0 7 | scipy==1.10.1 8 | sentence_transformers==2.2.2 9 | spacy==3.3.3 10 | --------------------------------------------------------------------------------