├── reports └── .empty ├── input ├── src ├── fieldhunter │ ├── __init__.py │ ├── utils │ │ ├── __init__.py │ │ ├── eval.py │ │ └── base.py │ └── inference │ │ ├── __init__.py │ │ ├── common.py │ │ ├── fieldtypesRelaxed.py │ │ └── fieldtypes.py ├── fh.py ├── fh_relaxed.py └── trace_statistics.py ├── lib └── nemere ├── requirements.txt ├── .idea ├── .gitignore ├── vcs.xml ├── misc.xml ├── other.xml ├── inspectionProfiles │ └── profiles_settings.xml ├── modules.xml └── fieldhunter.iml ├── .gitmodules ├── .gitignore ├── setup.cfg ├── eval-traces.sh ├── eval-fh.sh ├── eval-fh-relaxed.sh └── README.md /reports/.empty: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /input: -------------------------------------------------------------------------------- 1 | sub/nemere/input/ -------------------------------------------------------------------------------- /src/fieldhunter/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/fieldhunter/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/nemere: -------------------------------------------------------------------------------- 1 | ../sub/nemere/src/nemere/ -------------------------------------------------------------------------------- /src/fieldhunter/inference/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | netzob 2 | pyitlib 3 | openpyxl -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /workspace.xml 3 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "sub/nemere"] 2 | path = sub/nemere 3 | url = git@github.com:vs-uulm/nemesys.git 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | reports/ 3 | workspace.xml 4 | usage.statistics.xml 5 | shelf/ 6 | *.iml 7 | gradle.xml 8 | 9 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [nosetests] 2 | verbosity=2 3 | with-doctest=1 4 | where=src/ 5 | doctest-options=+ELLIPSIS 6 | # tests=../tests/, . 
-------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/other.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 7 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /eval-traces.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | #input=input/*-100.pcap 4 | #input=input/*-1000.pcap 5 | #input="input/*-100.pcap input/*-1000.pcap" 6 | #input=input/maxdiff-filtered/*-1000.pcap 7 | #input=input/maxdiff-fromOrig/*-100.pcap 8 | input="input/maxdiff-fromOrig/*-100*.pcap input/deduped-orig/*-100*.pcap" 9 | #input="input/maxdiff-fromOrig/ntp_SMIA-20111010_maxdiff-100.pcap" 10 | 11 | 12 | #tftnext=$(expr 1 + $(ls -d reports/tft-* | sed "s/^.*tft-\([0-9]*\)-.*$/\1/" | sort | tail -1)) 13 | #tftnpad=$(printf "%03d" ${tftnext}) 14 | #currcomm=$(git log -1 --format="%h") 15 | #report=reports/tft-${tftnpad}-clustering-${currcomm} 16 | #mkdir ${report} 17 | 18 | for fn in ${input} ; do 19 | python src/trace_statistics.py ${fn} 20 | # Give tshark some time to recover 21 | sleep 3 22 | done 23 | 24 | #mv reports/*.csv ${report}/ 25 | #mv reports/*.pdf ${report}/ 26 | 27 | spd-say "Bin fertig!" 28 | -------------------------------------------------------------------------------- /eval-fh.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | #input=input/*-100.pcap 4 | #input=input/*-1000.pcap 5 | #input="input/*-100.pcap input/*-1000.pcap" 6 | #input=input/maxdiff-filtered/*-1000.pcap 7 | # input=input/maxdiff-fromOrig/*-1000.pcap 8 | #input=input/maxdiff-fromOrig/*-100.pcap 9 | input="input/maxdiff-fromOrig/*-100*.pcap input/deduped-orig/*-100*.pcap" 10 | #input="input/maxdiff-fromOrig/ntp_SMIA-20111010_maxdiff-100.pcap" 11 | 12 | 13 | #tftnext=$(expr 1 + $(ls -d reports/tft-* | sed "s/^.*tft-\([0-9]*\)-.*$/\1/" | sort | tail -1)) 14 | #tftnpad=$(printf "%03d" ${tftnext}) 15 | #currcomm=$(git log -1 --format="%h") 16 | #report=reports/tft-${tftnpad}-clustering-${currcomm} 17 | #mkdir ${report} 18 | 19 | for fn in ${input} ; do 20 | python src/fh.py ${fn} 21 | # Give tshark some time to recover 22 | sleep 3 23 | done 24 | 25 | #mv reports/*.csv ${report}/ 26 | #mv reports/*.pdf ${report}/ 27 | 28 | spd-say "Bin fertig!" 
29 | -------------------------------------------------------------------------------- /eval-fh-relaxed.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | #input=input/*-100.pcap 4 | #input=input/*-1000.pcap 5 | #input="input/*-100.pcap input/*-1000.pcap" 6 | #input=input/maxdiff-filtered/*-1000.pcap 7 | # input=input/maxdiff-fromOrig/*-1000.pcap 8 | #input=input/maxdiff-fromOrig/*-100.pcap 9 | input="input/maxdiff-fromOrig/*-100*.pcap input/deduped-orig/*-100*.pcap" 10 | #input="input/maxdiff-fromOrig/ntp_SMIA-20111010_maxdiff-100.pcap" 11 | 12 | 13 | #tftnext=$(expr 1 + $(ls -d reports/tft-* | sed "s/^.*tft-\([0-9]*\)-.*$/\1/" | sort | tail -1)) 14 | #tftnpad=$(printf "%03d" ${tftnext}) 15 | #currcomm=$(git log -1 --format="%h") 16 | #report=reports/tft-${tftnpad}-clustering-${currcomm} 17 | #mkdir ${report} 18 | 19 | for fn in ${input} ; do 20 | python src/fh_relaxed.py ${fn} 21 | # Give tshark some time to recover 22 | sleep 3 23 | done 24 | 25 | #mv reports/*.csv ${report}/ 26 | #mv reports/*.pdf ${report}/ 27 | 28 | spd-say "Bin fertig!" 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FieldHunter 2 | 3 | Re-implementation of parts of the protocol reverse engineering approach FieldHunter (FH) as proposed in 4 | 5 | > Bermudez, Ignacio, Alok Tongaonkar, Marios Iliofotou, Marco Mellia, und Maurizio M. Munafò. 6 | > „Towards Automatic Protocol Field Inference“. Computer Communications 84 (15. Juni 2016). 7 | > https://doi.org/10.1016/j.comcom.2016.02.015. 8 | 9 | Written by Stephan Kleber 10 | who also proposed some improvements for the field heuristics in 11 | `inference/fieldtypesRelaxed.py` 12 | used by 13 | `src/fh_relaxed.py` 14 | for evaluation to be run by 15 | `eval-fh-relaxed.sh`. 16 | 17 | The original FieldHunter heuristics are run via 18 | `eval-fh.sh`. 19 | 20 | It only implements FH's binary message handling using n-grams (not textual using delimiters!) 21 | 22 | 23 | Statistics about traces can be gained by 24 | `eval-traces.sh`. 25 | 26 | Not sure about a licence right now. 27 | 28 | ## Installation 29 | 30 | Clone the repository including the nemere submodule: 31 | ```git clone --recurse-submodules git@github.com:vs-uulm/nemesys.git``` 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /.idea/fieldhunter.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 18 | 19 | 21 | 22 | 24 | 25 | 27 | -------------------------------------------------------------------------------- /src/fh.py: -------------------------------------------------------------------------------- 1 | """ 2 | Only implements FH's binary message handling using n-grams (not textual using delimiters!) 
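
Usage: python src/fh.py <pcapfilename> [-i]
(-i/--interactive opens an IPython prompt after the analysis; see the ArgumentParser below.)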
3 | """ 4 | 5 | from argparse import ArgumentParser 6 | from time import time 7 | 8 | # noinspection PyUnresolvedReferences 9 | from tabulate import tabulate 10 | # noinspection PyUnresolvedReferences 11 | from pprint import pprint 12 | # noinspection PyUnresolvedReferences 13 | import IPython 14 | 15 | from nemere.utils.loader import SpecimenLoader 16 | from nemere.utils.evaluationHelpers import StartupFilecheck 17 | from nemere.utils.reportWriter import writeReport 18 | from nemere.validation.dissectorMatcher import MessageComparator, DissectorMatcher 19 | 20 | from fieldhunter.inference.fieldtypes import * 21 | from fieldhunter.inference.common import segmentedMessagesAndSymbols 22 | from fieldhunter.utils.base import Flows 23 | from fieldhunter.utils.eval import FieldTypeReport 24 | 25 | 26 | 27 | 28 | if __name__ == '__main__': 29 | parser = ArgumentParser( 30 | description='Re-Implementation of FieldHunter.') 31 | parser.add_argument('pcapfilename', help='Filename of the PCAP to load.') 32 | parser.add_argument('-i', '--interactive', help='open ipython prompt after finishing the analysis.', 33 | action="store_true") 34 | # Pointless options: FH requires TCP/UDP over IP (FH, Section 6.6) 35 | # parser.add_argument('-l', '--layer', type=int, default=2, 36 | # help='Protocol layer relative to IP to consider. Default is 2 layers above IP ' 37 | # '(typically the payload of a transport protocol).') 38 | # parser.add_argument('-r', '--relativeToIP', default=True, action='store_false') 39 | args = parser.parse_args() 40 | layer = 2 41 | relativeToIP = True 42 | 43 | filechecker = StartupFilecheck(args.pcapfilename) 44 | 45 | specimens = SpecimenLoader(args.pcapfilename, layer = layer, relativeToIP = relativeToIP) 46 | # noinspection PyTypeChecker 47 | messages = list(specimens.messagePool.keys()) # type: List[L4NetworkMessage] 48 | flows = Flows(messages) 49 | 50 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 51 | print("Hunting fields in", filechecker.pcapstrippedname) 52 | inferenceStart = time() 53 | 54 | # MSG-type 55 | print("Inferring", MSGtype.typelabel) 56 | msgtypefields = MSGtype(flows) 57 | 58 | # MSG-len 59 | print("Inferring", MSGlen.typelabel) 60 | msglenfields = MSGlen(flows) 61 | 62 | # Host-ID 63 | print("Inferring", HostID.typelabel) 64 | hostidfields = HostID(messages) 65 | 66 | # Session-ID (FH, Section 3.2.4) 67 | print("Inferring", SessionID.typelabel) 68 | sessionidfields = SessionID(messages) 69 | 70 | # Trans-ID (FH, Section 3.2.5) 71 | print("Inferring", TransID.typelabel) 72 | transidfields = TransID(flows) 73 | 74 | # Accumulators (FH, Section 3.2.6) 75 | print("Inferring", Accumulator.typelabel) 76 | accumulatorfields = Accumulator(flows) 77 | 78 | # in order of fieldtypes.precedence! 
79 | sortedInferredTypes = sorted( 80 | (msglenfields, msgtypefields, hostidfields, sessionidfields, transidfields, accumulatorfields), 81 | key=lambda l: precedence[l.typelabel] ) 82 | segmentedMessages, symbols = segmentedMessagesAndSymbols(sortedInferredTypes, messages) 83 | 84 | inferenceDuration = time() - inferenceStart 85 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 86 | # statistics for all types 87 | print(tabulate( 88 | [(infield.typelabel, 89 | sum(1 for msgsegs in infield.segments if len(msgsegs) > 0), 90 | max(len(msgsegs) for msgsegs in infield.segments) 91 | if len(infield.segments) > 0 else 0 # prevent empty sequence for max() 92 | ) for infield in sortedInferredTypes], 93 | headers=["typelabel","messages","max inferred per msg"] 94 | )) 95 | 96 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 97 | 98 | nontrivialSymbols = [sym for sym in symbols if len(sym.fields) > 1] 99 | comparator = MessageComparator(specimens, layer=layer, relativeToIP=relativeToIP) 100 | print("Dissection complete.") 101 | comparator.pprintInterleaved(nontrivialSymbols) 102 | print(f"\n + {len(symbols)-len(nontrivialSymbols)} messages without any inferred fields.") 103 | 104 | # calc FMS per message 105 | print("Calculate FMS...") 106 | message2quality = DissectorMatcher.symbolListFMS(comparator, symbols) 107 | # write statistics to csv 108 | writeReport(message2quality, inferenceDuration, comparator, "fieldhunter-literal", 109 | filechecker.reportFullPath) 110 | 111 | # FTR validation: calculate TP/FP/FN ==> P/R per protocol and per type 112 | infieldWorkbook = FieldTypeReport.newWorkbook() 113 | for infields in sortedInferredTypes: 114 | infieldReport = FieldTypeReport(infields, comparator, segmentedMessages) 115 | infieldReport.addXLworksheet(infieldWorkbook, FieldTypeReport.ovTitle) 116 | FieldTypeReport.saveWorkbook(infieldWorkbook, filechecker.pcapstrippedname) 117 | 118 | # interactive 119 | if args.interactive: 120 | IPython.embed() 121 | 122 | -------------------------------------------------------------------------------- /src/fh_relaxed.py: -------------------------------------------------------------------------------- 1 | """ 2 | FieldHunter main script with relaxed assumptions (see fieldhunter.inference.fieldtypesRelaxed) 3 | 4 | Only implements FH's binary message handling using n-grams (not textual using delimiters!) 
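
Usage: python src/fh_relaxed.py <pcapfilename> [-i] [-d]
(-i/--interactive opens an IPython prompt after the analysis, -d/--debug enables debug logging; see the ArgumentParser below.)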
5 | """ 6 | import logging 7 | from argparse import ArgumentParser 8 | from time import time 9 | 10 | # noinspection PyUnresolvedReferences 11 | from tabulate import tabulate 12 | # noinspection PyUnresolvedReferences 13 | from pprint import pprint 14 | # noinspection PyUnresolvedReferences 15 | import IPython 16 | 17 | from nemere.utils.loader import SpecimenLoader 18 | from nemere.utils.evaluationHelpers import StartupFilecheck 19 | from nemere.utils.reportWriter import writeReport 20 | from nemere.validation.dissectorMatcher import MessageComparator, DissectorMatcher 21 | from nemere.inference.segments import TypedSegment 22 | 23 | from fieldhunter.inference.fieldtypesRelaxed import * 24 | from fieldhunter.inference.common import segmentedMessagesAndSymbols 25 | from fieldhunter.utils.base import Flows 26 | from fieldhunter.utils.eval import FieldTypeReport, GroundTruth 27 | 28 | if __name__ == '__main__': 29 | parser = ArgumentParser( 30 | description='Re-Implementation of FieldHunter.') 31 | parser.add_argument('pcapfilename', help='Filename of the PCAP to load.') 32 | parser.add_argument('-i', '--interactive', help='Open ipython prompt after finishing the analysis.', 33 | action="store_true") 34 | parser.add_argument('-d', '--debug', help='Enable debug output.', action="store_true") 35 | args = parser.parse_args() 36 | if args.debug: 37 | print("DEBUG") 38 | logging.basicConfig(level=logging.DEBUG) 39 | logger = logging.getLogger() 40 | logger.setLevel(logging.DEBUG) 41 | else: 42 | print("INFO") 43 | logging.basicConfig(level=logging.INFO) 44 | logger = logging.getLogger() 45 | logger.setLevel(logging.INFO) 46 | 47 | layer = 2 48 | relativeToIP = True 49 | 50 | filechecker = StartupFilecheck(args.pcapfilename) 51 | 52 | specimens = SpecimenLoader(args.pcapfilename, layer = layer, relativeToIP = relativeToIP) 53 | # noinspection PyTypeChecker 54 | messages = list(specimens.messagePool.keys()) # type: List[L4NetworkMessage] 55 | flows = Flows(messages) 56 | 57 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 58 | print("Hunting fields in", filechecker.pcapstrippedname) 59 | inferenceStart = time() 60 | 61 | # MSG-type 62 | print("Inferring", MSGtype.typelabel) 63 | msgtypefields = MSGtype(flows) 64 | 65 | # MSG-len 66 | print("Inferring", MSGlen.typelabel) 67 | msglenfields = MSGlen(flows) 68 | 69 | # Host-ID 70 | print("Inferring", HostID.typelabel) 71 | hostidfields = HostID(messages) 72 | 73 | # Session-ID (FH, Section 3.2.4) 74 | print("Inferring", SessionID.typelabel) 75 | sessionidfields = SessionID(messages) 76 | 77 | # Trans-ID (FH, Section 3.2.5) 78 | print("Inferring", TransID.typelabel) 79 | transidfields = TransID(flows) 80 | 81 | # Accumulators (FH, Section 3.2.6) 82 | print("Inferring", Accumulator.typelabel) 83 | accumulatorfields = Accumulator(flows) 84 | 85 | # in order of fieldtypes.precedence! 
86 | sortedInferredTypes = sorted( 87 | (msglenfields, msgtypefields, hostidfields, sessionidfields, transidfields, accumulatorfields), 88 | key=lambda l: precedence[l.typelabel] ) 89 | segmentedMessages, symbols = segmentedMessagesAndSymbols(sortedInferredTypes, messages) 90 | 91 | inferenceDuration = time() - inferenceStart 92 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 93 | 94 | print(tabulate( 95 | [(infield.typelabel, 96 | sum(1 for msgsegs in infield.segments if len(msgsegs) > 0), 97 | max(len(msgsegs) for msgsegs in infield.segments) 98 | if len(infield.segments) > 0 else 0 # prevent empty sequence for max() 99 | ) for infield in sortedInferredTypes], 100 | headers=["typelabel","messages","max inferred per msg"] 101 | )) 102 | 103 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 104 | 105 | nontrivialSymbols = [sym for sym in symbols if len(sym.fields) > 1] 106 | comparator = MessageComparator(specimens, layer=layer, relativeToIP=relativeToIP) 107 | print("Dissection complete.") 108 | comparator.pprintInterleaved(nontrivialSymbols) 109 | print(f"\n + {len(symbols)-len(nontrivialSymbols)} messages without any inferred fields.") 110 | 111 | # calc FMS per message 112 | print("Calculate FMS...") 113 | message2quality = DissectorMatcher.symbolListFMS(comparator, symbols) 114 | # write statistics to csv 115 | writeReport(message2quality, inferenceDuration, comparator, "fieldhunter-literal", 116 | filechecker.reportFullPath) 117 | 118 | # FTR validation: calculate TP/FP/FN ==> P/R per protocol and per type 119 | infieldWorkbook = FieldTypeReport.newWorkbook() 120 | for infields in sortedInferredTypes: 121 | infieldReport = FieldTypeReport(infields, comparator, segmentedMessages) 122 | infieldReport.addXLworksheet(infieldWorkbook, FieldTypeReport.ovTitle) 123 | FieldTypeReport.saveWorkbook(infieldWorkbook, filechecker.pcapstrippedname) 124 | 125 | # coverage 126 | tpByteSum = sum(sum( 127 | len(seg) for seg in msg 128 | if isinstance(seg, TypedSegment) and comparator.lookupField(seg)[1] in GroundTruth.fieldtypes[seg.fieldtype] 129 | ) for msg in segmentedMessages.values()) 130 | payloadSum = sum(len(msg.data) for msg in segmentedMessages.keys()) 131 | coverage = tpByteSum/payloadSum 132 | print(f"Coverage (ratio of TP bytes): {coverage:.5f}") 133 | # TODO quick and dirty hard coded filename, no checks. 134 | import csv 135 | with open("reports/fh-coverage.csv", "a") as covcsv: 136 | covwriter = csv.writer(covcsv) 137 | covwriter.writerow([filechecker.pcapstrippedname, tpByteSum, payloadSum, coverage]) 138 | 139 | # interactive 140 | if args.interactive: 141 | print(""" 142 | The inference of individual field types can be found in: 143 | msglenfields, msgtypefields, hostidfields, sessionidfields, transidfields, accumulatorfields 144 | 145 | A combination per message is in: 146 | segmentedMessages, symbols 147 | """) 148 | IPython.embed() 149 | 150 | -------------------------------------------------------------------------------- /src/trace_statistics.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script provides statistics about the given PCAP trace that have impact on the FieldHunter inference. 
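
It appends flow statistics and ground-truth field entropies to CSV files in the report folder, saves per-offset
entropy plots as PDF, and for DHCP traces prints the categorical correlation of dhcp.id with the host addresses.
Usage: python src/trace_statistics.py <pcapfilename> [-i]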
3 | """ 4 | # noinspection PyUnresolvedReferences 5 | import IPython, logging 6 | # noinspection PyUnresolvedReferences 7 | from tabulate import tabulate 8 | from argparse import ArgumentParser 9 | from os.path import join 10 | import matplotlib.pyplot as plt 11 | 12 | from nemere.utils.loader import SpecimenLoader 13 | from nemere.utils.evaluationHelpers import StartupFilecheck, reportFolder 14 | from nemere.validation.dissectorMatcher import MessageComparator 15 | 16 | from fieldhunter.inference.fieldtypes import * 17 | from fieldhunter.utils.base import Flows 18 | from fieldhunter.utils.eval import GroundTruth, csvAppend 19 | 20 | logging.basicConfig(level=logging.DEBUG) 21 | logging.getLogger().setLevel(logging.DEBUG) 22 | 23 | if __name__ == '__main__': 24 | parser = ArgumentParser( 25 | description='Statistics about the given PCAP trace that have impact on the FieldHunter inference.') 26 | parser.add_argument('pcapfilename', help='Filename of the PCAP to load.') 27 | parser.add_argument('-i', '--interactive', help='open IPython prompt after finishing the analysis.', 28 | action="store_true") 29 | args = parser.parse_args() 30 | 31 | filechecker = StartupFilecheck(args.pcapfilename) 32 | 33 | # FH always requires the protocol to be inside TCP/UDP over IP (FH, Section 6.6) 34 | specimens = SpecimenLoader(args.pcapfilename, layer=2, relativeToIP=True) 35 | # noinspection PyTypeChecker 36 | messages = list(specimens.messagePool.keys()) # type: List[L4NetworkMessage] 37 | comparator = MessageComparator(specimens, layer=2, relativeToIP=True) 38 | 39 | # # # # # # # # # # # # # # # # # # 40 | # Relevant for MSG-Type 41 | flows = Flows(messages) 42 | # print(tabulate(flows.c2sInConversations().keys())) 43 | # print(tabulate(flows.s2cInConversations().keys())) 44 | print("Conversations:\n") 45 | print(tabulate(flows.conversations().keys())) 46 | mqr = flows.matchQueryResponse() 47 | print("\nNumber of matching queries and responses:", len(mqr), "in", len(flows.flows), "flows") 48 | print("Found in", len(messages), f"messages. Coverage: {(len(mqr)*200)/len(messages):.1f}%") 49 | header = ["trace", "matching", "conversations", "flows", "messages", "coverage"] 50 | # amount/percentage of messages in the trace that are of "singular flows", i. e., a single message without either 51 | # a matching request or reply, is calculated by (100% - coverage). 52 | csvAppend(reportFolder, "flows", header, [[ 53 | filechecker.pcapstrippedname, len(mqr), len(flows.conversations()), len(flows.flows), 54 | len(messages), (len(mqr)*200)/len(messages) ]]) 55 | # TODO 56 | # discern types: broadcasts, c2s/s2c without matching flow 57 | 58 | # # # # # # # # # # # # # # # # # # 59 | # Entropy filter threshold rationale: entropy statistics for ground truth fields 60 | # since the entropyThresh used in MSGtype/MSGlen (NonConstantNonRandomEntropyFieldType) is not given in FH 61 | # using our traces to back our value. 62 | gt = GroundTruth(comparator) 63 | gtTypeAndLengthEntropies = gt.typeAndLenEntropies() 64 | header = ["trace", "field name", "type label", "sample count", "entropy"] 65 | # write/append to a file. 
Columns: trace, field name, type label, sample count, entropy 66 | csvAppend(reportFolder, "typeAndLengthEntropies", header, 67 | ([filechecker.pcapstrippedname, *row] for row in gtTypeAndLengthEntropies if not numpy.isnan(row[-1]))) 68 | # # # # # # # # # # # # # # # # # # 69 | 70 | # # # # # # # # # # # # # # # # # # 71 | # Relevant for MSG-Len 72 | # TODO length of messages, something like: 73 | # keyfunc = lambda m: len(m.data) 74 | # msgbylen = {k: v for k, v in groupby(sorted(direction, key=keyfunc), keyfunc)} 75 | # # # # # # # # # # # # # # # # # # 76 | 77 | # # # # # # # # # # # # # # # # # # 78 | # Entropy plots: Relevant for MSG-Type and Trans-ID 79 | c2s, s2c = flows.splitDirections() 80 | c2sEntropy = pyitNgramEntropy(c2s, 1) 81 | s2cEntropy = pyitNgramEntropy(s2c, 1) 82 | fig: plt.Figure 83 | ax1: plt.Axes 84 | fig, (ax1, ax2) = plt.subplots(2,1,figsize=(6,6)) 85 | for ax, entropy in [(ax1, c2sEntropy), (ax2, s2cEntropy)]: 86 | if len(entropy) > 0: 87 | ax.stem(entropy, use_line_collection=True) 88 | else: 89 | ax.text(1, .5, "no entries") 90 | ax.set_xlim(0, 32) 91 | ax.set_ylim(0.,1.) 92 | ax.grid(which="major", axis="y") 93 | ax.set_xlabel("byte offset") 94 | ax.set_ylabel("normalized entropy") 95 | plt.suptitle("Entropies per byte offset", fontsize="x-large") 96 | ax1.set_title("Client to Server Collection") 97 | ax2.set_title("Server to Client Collection") 98 | fig.tight_layout(rect=[0,0,1,.95]) 99 | fig.savefig(join(reportFolder, filechecker.pcapstrippedname + ".pdf")) 100 | # # # # # # # # # # # # # # # # # # 101 | 102 | # # # # # # # # # # # # # # # # # # 103 | # DHCP "Transaction ID" that is a FH Session-ID 104 | if "dhcp" in specimens.pcapFileName: 105 | sessIDtuples = sorted( ( 106 | (comparator.parsedMessages[specimens.messagePool[msg]].getValuesByName('dhcp.id')[0], 107 | msg.source.rpartition(':')[0], msg.destination.rpartition(':')[0]) for msg in messages), 108 | key = lambda x: x[0] ) 109 | participantsTuples = [(a, *sorted([b, c])) for a, b, c in sessIDtuples] 110 | field2value = [( 111 | intsFromNgrams([bytes.fromhex(a)])[0], 112 | intsFromNgrams([bytes(map(int, b.split(".") + c.split(".")))])[0]) 113 | for a, b, c in participantsTuples] 114 | ngSc = numpy.array(list(zip(*field2value))) 115 | catCorr = drv.information_mutual(ngSc[0], ngSc[1]) / drv.entropy_joint(ngSc) 116 | print(catCorr) 117 | # 0.5073953157493724 118 | # For dhcp_SMIA2011101X_deduped-1000.pcap this is just about .5 which is quite surprising. 119 | ignoreList = {"0.0.0.0", "255.255.255.255"} 120 | field2value = [( 121 | intsFromNgrams([bytes.fromhex(a)])[0], 122 | intsFromNgrams([bytes(map(int, b.split(".") + c.split(".")))])[0]) 123 | for a, b, c in participantsTuples if b not in ignoreList and c not in ignoreList and a != "00000000"] 124 | ngSc = numpy.array(list(zip(*field2value))) 125 | catCorr = drv.information_mutual(ngSc[0], ngSc[1]) / drv.entropy_joint(ngSc) 126 | print(catCorr) 127 | # 0.566225418688138 128 | # Ignoring some trivial cases raises the correlation only marginally. 129 | # # # # # # # # # # # # # # # # # # 130 | 131 | # interactive 132 | if args.interactive: 133 | print() 134 | IPython.embed() 135 | -------------------------------------------------------------------------------- /src/fieldhunter/inference/common.py: -------------------------------------------------------------------------------- 1 | """ 2 | Common handling of inference intermediates or results. 
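
Provides segmentedMessagesAndSymbols(), which merges the segments inferred per field type into one segment list
per message (resolving overlaps by precedence) and into Netzob symbols.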
3 | """ 4 | 5 | from typing import Iterable, List, Tuple, Dict 6 | 7 | from netzob.Model.Vocabulary.Messages.AbstractMessage import AbstractMessage 8 | from netzob.Model.Vocabulary.Symbol import Symbol 9 | 10 | from fieldhunter.inference.fieldtypes import FieldType 11 | from nemere.inference.formatRefinement import isOverlapping 12 | from nemere.inference.segmentHandler import symbolsFromSegments 13 | from nemere.inference.segments import TypedSegment, MessageSegment 14 | from nemere.inference.analyzers import Value 15 | 16 | 17 | def segmentedMessagesAndSymbols(typedFields: Iterable[FieldType], messages: List[AbstractMessage]) \ 18 | -> Tuple[Dict[AbstractMessage, List[MessageSegment]], List[Symbol]]: 19 | # noinspection PyProtectedMember 20 | """ 21 | Consolidate the inferred fields into segmented messages and additionally into symbols. 22 | 23 | >>> from itertools import chain 24 | >>> from tabulate import tabulate 25 | >>> from netzob.Model.Vocabulary.Messages.L4NetworkMessage import L4NetworkMessage 26 | >>> from nemere.visualization.simplePrint import SegmentPrinter 27 | >>> from fieldhunter.inference.common import segmentedMessagesAndSymbols 28 | >>> from fieldhunter.inference.fieldtypes import FieldType 29 | >>> from fieldhunter.utils.base import iterateSelected 30 | >>> # prevent Netzob from producing debug output. 31 | >>> import logging 32 | >>> logging.getLogger().setLevel(30) 33 | >>> 34 | >>> messageList = [ 35 | ... L4NetworkMessage(b"QQQ456789"), L4NetworkMessage(b"RRR567890"), L4NetworkMessage(b"QQQ7890AB"), 36 | ... L4NetworkMessage(b"RRR567890"), L4NetworkMessage(b"QQQ123456789"), L4NetworkMessage(b"RRR890A"), 37 | ... L4NetworkMessage(b"QQQ6789"), L4NetworkMessage(b"RRR890ABCDEFGH") 38 | ... ] 39 | >>> 40 | >>> # normally this would only be performed by a subclass of FieldType internally; here for the sake of testing 41 | >>> segmentsA = FieldType._posLen2segments(messageList, [(0,3),(5,2)]) 42 | >>> del segmentsA[5][1]; del segmentsA[3]; del segmentsA[1][1]; del segmentsA[0][0] 43 | >>> segmentsB = FieldType._posLen2segments(messageList, [(2,2),(5,4)]) 44 | >>> ftA = FieldType() 45 | >>> ftA._segments = segmentsA 46 | >>> ftB = FieldType() 47 | >>> ftB._segments = segmentsB 48 | >>> 49 | >>> sm, sym = segmentedMessagesAndSymbols([ftA, ftB], messageList) 50 | >>> sp = SegmentPrinter(sm.values()) # doctest: +SKIP 51 | >>> sp.toConsole() # doctest: +SKIP 52 | >>> print(tabulate(sm.values())) 53 | --------------------------------------------------------------- ------------------------------------------------------------------- 54 | MessageSegment 2 bytes at (2, 4): 5134 | values: (81, 52) MessageSegment 2 bytes at (5, 7): 3637 | values: (54, 55) 55 | MessageSegment 3 bytes at (0, 3): 525252 | values: (82, 82, 82) MessageSegment 4 bytes at (5, 9): 37383930 | values: (55, 56, 57... 56 | MessageSegment 3 bytes at (0, 3): 515151 | values: (81, 81, 81) MessageSegment 2 bytes at (5, 7): 3930 | values: (57, 48) 57 | MessageSegment 2 bytes at (2, 4): 5235 | values: (82, 53) MessageSegment 4 bytes at (5, 9): 37383930 | values: (55, 56, 57... 
58 | MessageSegment 3 bytes at (0, 3): 515151 | values: (81, 81, 81) MessageSegment 2 bytes at (5, 7): 3334 | values: (51, 52) 59 | MessageSegment 3 bytes at (0, 3): 525252 | values: (82, 82, 82) 60 | MessageSegment 3 bytes at (0, 3): 515151 | values: (81, 81, 81) MessageSegment 2 bytes at (5, 7): 3839 | values: (56, 57) 61 | MessageSegment 3 bytes at (0, 3): 525252 | values: (82, 82, 82) MessageSegment 2 bytes at (5, 7): 3041 | values: (48, 65) 62 | --------------------------------------------------------------- ------------------------------------------------------------------- 63 | >>> for s in sym: 64 | ... print(s.getCells()) # doctest: +NORMALIZE_WHITESPACE 65 | Field | Field | Field | Field | Field 66 | ----- | ----- | ----- | ----- | ----- 67 | 'QQ' | 'Q4' | '5' | '67' | '89' 68 | ----- | ----- | ----- | ----- | ----- 69 | Field | Field | Field 70 | ----- | ----- | ------ 71 | 'RRR' | '56' | '7890' 72 | ----- | ----- | ------ 73 | Field | Field | Field | Field 74 | ----- | ----- | ----- | ----- 75 | 'QQQ' | '78' | '90' | 'AB' 76 | ----- | ----- | ----- | ----- 77 | Field | Field | Field | Field 78 | ----- | ----- | ----- | ------ 79 | 'RR' | 'R5' | '6' | '7890' 80 | ----- | ----- | ----- | ------ 81 | Field | Field | Field | Field 82 | ----- | ----- | ----- | ------- 83 | 'QQQ' | '12' | '34' | '56789' 84 | ----- | ----- | ----- | ------- 85 | Field | Field 86 | ----- | ------ 87 | 'RRR' | '890A' 88 | ----- | ------ 89 | Field | Field | Field 90 | ----- | ----- | ----- 91 | 'QQQ' | '67' | '89' 92 | ----- | ----- | ----- 93 | Field | Field | Field | Field 94 | ----- | ----- | ----- | --------- 95 | 'RRR' | '89' | '0A' | 'BCDEFGH' 96 | ----- | ----- | ----- | --------- 97 | 98 | :param typedFields: The inferred fields of different types in order of their precedence! 99 | E. g., field types with smaller index will remove concurring subsequent ones that overlap. 100 | :param messages: The messages to expect inference for. 101 | :return: tuple of 102 | * dict of the messages and their segment list. 103 | * Netzob symbols representing the inference. 104 | """ 105 | # combine inferred fields per message to facilitate validation 106 | typedSequences = [ 107 | {segs[0].message: segs for segs in fields.segments if len(segs) > 0} 108 | for fields in typedFields 109 | ] 110 | 111 | segmentedMessages = dict() 112 | for msg in messages: 113 | segmsg = list() 114 | # in order of fieldtypes.precedence! 
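        # typedSequences is ordered by the caller-supplied precedence, so a candidate segment that overlaps an
        # already accepted (higher-precedence) segment is skipped below.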
115 | for typedMessages in typedSequences: 116 | if msg in typedMessages: # type: List[TypedSegment] 117 | # segments of a field type for one message 118 | for cand in typedMessages[msg]: 119 | # check overlapping segment 120 | overlapps = False 121 | for seg in segmsg: 122 | if isOverlapping(cand, seg): 123 | overlapps = True 124 | break 125 | # if a segment is already 126 | if overlapps: 127 | continue 128 | segmsg.append(cand) 129 | # symbolsFromSegments fixes gaps, but cannot know anything about the message in an empty list, so we add a dummy 130 | # segment for these cases here 131 | segmentedMessages[msg] = sorted(segmsg, key=lambda s: s.offset) if len(segmsg) > 0 else \ 132 | [ MessageSegment(Value(msg), 0, len(msg.data)) ] 133 | 134 | symbols = symbolsFromSegments(segmentedMessages.values()) 135 | 136 | return segmentedMessages, symbols -------------------------------------------------------------------------------- /src/fieldhunter/utils/eval.py: -------------------------------------------------------------------------------- 1 | import os, csv, logging 2 | from typing import Any 3 | from os.path import join, exists 4 | from time import strftime 5 | 6 | from openpyxl import Workbook, utils 7 | from openpyxl.worksheet.worksheet import Worksheet 8 | 9 | import nemere.utils.evaluationHelpers as eh 10 | from nemere.inference.segments import MessageSegment 11 | from nemere.validation.dissectorMatcher import MessageComparator 12 | 13 | from fieldhunter.inference.fieldtypes import * 14 | 15 | 16 | def csvAppend(reportFolder: str, fileName: str, header: List[str], rows: Iterable[Iterable[Any]]): 17 | csvpath = os.path.join(reportFolder, fileName + '.csv') 18 | csvWriteHead = False if os.path.exists(csvpath) else True 19 | 20 | print('Write statistics to {}...'.format(csvpath)) 21 | with open(csvpath, 'a') as csvfile: 22 | statisticscsv = csv.writer(csvfile) 23 | if csvWriteHead: 24 | statisticscsv.writerow(header) 25 | statisticscsv.writerows(rows) 26 | 27 | 28 | class FieldTypeReport(object): 29 | 30 | headers = ["hexbytes", "segment offset", "segment end", 31 | "overlap ratio", "overlap index", "overlap offset", "overlap end", "overlap value", 32 | "message date", "message type", "field name", "field type", "TP/FP", "isVisible"] 33 | # (column isVisible could also be called: "not hidden by other type") 34 | 35 | overviewHeaders = [ 36 | "field type", "FN", "FP", "TP", "P", "R" 37 | ] 38 | 39 | ovTitle = "Overview" 40 | 41 | def __init__(self, fieldtype: FieldType, comparator: MessageComparator, 42 | segmentedMessages: Dict[AbstractMessage, List[MessageSegment]] = None): 43 | """ 44 | 45 | :param fieldtype: The field type object to generate a report for. 46 | :param comparator: A NEMERE MessageComparator to look up the true fields overlapping our inference. 47 | :param segmentedMessages: Optional Dict of segmented messages to check whether another field type got 48 | precedence for single inference instances. see fieldhunter.inference.fieldtypes#precedence and 49 | fieldhunter.inference.common#segmentedMessagesAndSymbols 50 | """ 51 | self._fieldtype = fieldtype 52 | self._comparator = comparator 53 | self._segmentedMessages = segmentedMessages 54 | 55 | def lookupOverlap(self): 56 | """ 57 | Lookup the overlap with the ground truth for all segments inferred for the given FieldHunter field type. 58 | 59 | :return: table (list of lists) of statistics for each inferred segment from field type, according to the 60 | columns given in FieldTypeReport#headers. 
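
        A segment counts as TP iff the name of the overlapping true field is listed in GroundTruth.fieldtypes for
        this type label; isVisible states whether the segment survived the precedence resolution in
        segmentedMessages (if given).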
61 | """ 62 | tabdata = list() 63 | 64 | for seg in (seg for msgsegs in self._fieldtype.segments for seg in msgsegs if msgsegs): 65 | # field: from ground true; seg(ment): inferred; overlap: intersection of field and segment 66 | overlapRatio, overlapIndex, overlapOffset, overlapEnd = self._comparator.fieldOverlap(seg) 67 | messagetype, fieldname, fieldtype = self._comparator.lookupField(seg) 68 | overlapValue = "'" + seg.message.data[overlapOffset:overlapEnd].hex() + "'" 69 | 70 | # determine what is a TP (value True)/FP (value False) using GroundTruth 71 | tpfp = fieldname in GroundTruth.fieldtypes[self.typelabel] 72 | 73 | # check the precedence of multiple overlapping inferred fields 74 | isVisible = seg in chain.from_iterable(self._segmentedMessages.values())\ 75 | if self._segmentedMessages is not None else "n/a" 76 | 77 | tabdata.append(["'" + seg.bytes.hex() + "'", seg.offset, seg.nextOffset, 78 | overlapRatio, overlapIndex, overlapOffset, overlapEnd, overlapValue, 79 | seg.message.date, messagetype, fieldname, fieldtype, tpfp, isVisible]) 80 | return tabdata 81 | 82 | @property 83 | def typelabel(self): 84 | """The label for the field type this report is generated for.""" 85 | return self._fieldtype.typelabel 86 | 87 | def countTrueOccurrences(self): 88 | counter = 0 89 | for fieldname in GroundTruth.fieldtypes[self.typelabel]: 90 | counter += len(self._comparator.lookupValues4FieldName(fieldname)) 91 | return counter 92 | 93 | def addXLworksheet(self, workbook: Workbook, overview: str=None): 94 | """Add data as worksheet to a openpyxl workbook. The caller needs to take take to write to file afterwards.""" 95 | worksheet = workbook.create_sheet(self.typelabel) 96 | worksheet.append(FieldTypeReport.headers) 97 | for row in self.lookupOverlap(): 98 | worksheet.append(row) 99 | onlyVisible = f",{utils.quote_sheetname(self.typelabel)}!N:N,TRUE()" \ 100 | if self._segmentedMessages is not None else "" 101 | if overview is not None: 102 | try: 103 | ovSheet = workbook[overview] # type: Worksheet 104 | currentRow = ovSheet.max_row + 1 105 | tpCoord = f"{utils.get_column_letter(4)}{currentRow}" 106 | ovSheet.append([ 107 | self.typelabel, 108 | f"={self.countTrueOccurrences()} - {tpCoord}", # "FN" 109 | f"=COUNTIFS({utils.quote_sheetname(self.typelabel)}!M:M,FALSE(){onlyVisible})", # "=FP" 110 | f"=COUNTIFS({utils.quote_sheetname(self.typelabel)}!M:M,TRUE(){onlyVisible})", # "=TP" 111 | f"=D{currentRow}/(D{currentRow}+C{currentRow})", # P 112 | f"=D{currentRow}/(D{currentRow}+B{currentRow})", # R 113 | ]) 114 | except KeyError: 115 | logging.getLogger(__name__).info("Overview sheet with title", overview, "not found. 
" 116 | "Not writing overview.") 117 | return workbook 118 | 119 | @staticmethod 120 | def newWorkbook(): 121 | """Prepare a new workbook to hold worksheets generated by #addXLworksheet().""" 122 | infieldWorkbook = Workbook() 123 | infieldWorkbook.active.title = FieldTypeReport.ovTitle 124 | ovSheet = infieldWorkbook[FieldTypeReport.ovTitle] 125 | ovSheet.append(FieldTypeReport.overviewHeaders) 126 | return infieldWorkbook 127 | 128 | @staticmethod 129 | def saveWorkbook(infieldWorkbook: Workbook, pcapstrippedname: str): 130 | infieldFilename = join(eh.reportFolder, 131 | f"FieldTypeReport_{pcapstrippedname}_{strftime('%Y%m%d-%H%M%S')}.xlsx") 132 | if not exists(infieldFilename): 133 | print("Write field type report to", infieldFilename) 134 | infieldWorkbook.save(infieldFilename) 135 | else: 136 | print("Could not write", infieldFilename, "- File exists") 137 | for worksheet in infieldWorkbook.worksheets: 138 | headers = worksheet.rows[0] 139 | cells = worksheet.rows[1:] 140 | print(f"\nReport for {worksheet.title}:\n" + tabulate(cells, headers=headers)) 141 | 142 | 143 | class GroundTruth(object): 144 | """tshark dissector field names for sample protocols mapped from the FieldHunter field type class.""" 145 | fieldtypes = { 146 | MSGlen.typelabel: ["nbss.length"], 147 | MSGtype.typelabel: ["dhcp.option.dhcp", "ntp.flags", "ntp.stratum", "dns.flags", 148 | "nbns.flags", "smb.cmd", "smb.flags", ], 149 | HostID.typelabel: ["dhcp.ip.client", "dhcp.ip.your", "dhcp.ip.server", "dhcp.hw.mac_addr", "ntp.refid"], 150 | SessionID.typelabel: ["dhcp.id", "smb.pid", "smb.uid", "smb.mid"], 151 | TransID.typelabel: ["dns.id", "nbns.id"], 152 | Accumulator.typelabel: [] 153 | } 154 | 155 | def __init__(self, comparator:MessageComparator, endianness: str = "big"): 156 | self._comparator = comparator 157 | self._endianness = endianness 158 | logging.getLogger(__name__).setLevel(logging.DEBUG) 159 | 160 | def entropyPerField(self, fieldname: str): 161 | """Collect true fields values and calculate their entropy for the current trace.""" 162 | fieldsValues = [bytes.fromhex(hexval) for hexval in self._comparator.lookupValues4FieldName(fieldname)] 163 | if len(fieldsValues) > 0: 164 | fieldLengths = Counter(len(bv) for bv in fieldsValues) 165 | # should normally be a constant value for this kind of fields 166 | mostCommonLen = fieldLengths.most_common(1)[0][0] 167 | logging.getLogger(__name__).debug(f"Field lengths of {fieldname}: {repr(fieldLengths)}") 168 | entropy = drv.entropy(intsFromNgrams(fieldsValues, self._endianness)) / (mostCommonLen * 8) 169 | else: 170 | entropy = numpy.nan 171 | return len(fieldsValues), entropy 172 | 173 | def typeAndLenEntropies(self): 174 | """ 175 | Collect MSGtype/MSGlen true fields according to GroundTruth.fieldtypes[MSGtype.typelabel/MSGlen.typelabel] 176 | 177 | :return: list of lists of "field name", "type label", "sample count", and "entropy" 178 | """ 179 | entropyList = list() 180 | for typelabel in [MSGtype.typelabel, MSGlen.typelabel]: 181 | for fieldname in GroundTruth.fieldtypes[typelabel]: 182 | # for each field name calculate entropy 183 | entropyList.append([ 184 | fieldname, 185 | typelabel, 186 | *self.entropyPerField(fieldname) 187 | ]) 188 | return entropyList 189 | -------------------------------------------------------------------------------- /src/fieldhunter/inference/fieldtypesRelaxed.py: -------------------------------------------------------------------------------- 1 | """ 2 | Infer message field types according to the FieldHunter paper Section 3.2 
3 | but with some relaxed thresholds and assumptions. 4 | 5 | TODO introduce doctests to check critical functions in inference.fieldtypes 6 | """ 7 | from typing import List, Tuple, Dict, Iterable, Union 8 | import logging 9 | from collections import Counter 10 | from abc import ABC 11 | 12 | import numpy 13 | from netzob.Model.Vocabulary.Messages.AbstractMessage import AbstractMessage 14 | from netzob.Model.Vocabulary.Messages.L4NetworkMessage import L4NetworkMessage 15 | 16 | from fieldhunter.utils.base import Flows, intsFromNgrams 17 | from fieldhunter.inference.fieldtypes import NonConstantNonRandomEntropyFieldType, Accumulator 18 | import fieldhunter.inference.fieldtypes as fieldtypes 19 | 20 | 21 | # logging.getLogger(__name__).setLevel(logging.DEBUG) 22 | 23 | 24 | class MSGtype(fieldtypes.MSGtype): 25 | """ 26 | Relaxed version of message type (MSG-Type) inference (FH, Section 3.2.1, Fig. 3 left). 27 | 28 | see .fieldtypes.MSGtype 29 | """ 30 | causalityThresh = 0.7 31 | """ 32 | FH, Sec. 3.2.1 says 0.8, but that leaves no candidates for our NTP traces 33 | Reduces TP and FP for SMB 100. 34 | """ 35 | 36 | 37 | class MSGlen(fieldtypes.MSGlen): 38 | """ 39 | Relaxed version of message length (MSG-Len) inference (FH, Section 3.2.2, Fig. 3 center). 40 | 41 | see ..fieldtypes.MSGlen 42 | """ 43 | 44 | def __init__(self, flows: Flows): 45 | super(NonConstantNonRandomEntropyFieldType, self).__init__() 46 | 47 | # The FH paper per direction wants to handle each direction separately, which is pointless for MSG-Len, 48 | # so we place all messages in one direction object. 49 | self._msgDirection = [type(self).Direction(flows.messages)] 50 | # TODO It might rather be useful to separate message types (distinct formats) in this manner. 51 | # However, this requires combination with some message type classification approach. => Future Work. 52 | 53 | class Direction(fieldtypes.MSGlen.Direction): 54 | 55 | @staticmethod 56 | def _candidateIsAcceptable(solutionAcceptable: Dict[Tuple[AbstractMessage, AbstractMessage], bool], 57 | Xarray: numpy.ndarray): 58 | """ 59 | FH does not require that either the values in X[0]/a or X[1]/b are equal for all X in Xes. 60 | Thus, different values are accepted, although a message length typically is calculated using the same 61 | multiplicator X[0], even if the offset X[1] may change, so X[0] must be a scalar value. 62 | 63 | Otherwise we end up with lots of FPs. Examples: 64 | * In SMB, the 'Msg. Len. Model Parameters' (a,b) == [1., 4.] 65 | of the 4-gram at offset 0, 4 is nbss.length, i. e., a TP! 66 | Offsets 16 and 22 are FP, but with diverging A and B vectors. 67 | * Another example: In DNS, the beginning of the queried name is a FP 68 | (probably due to DNS' subdomain numbered separator scheme). 69 | 70 | Thus, we require that X[0] is the same constant value throughout the majority of checked solutions. 71 | (We use the majority to account for some random error exactly as FH does using the MSGlen.lenhypoThresh) 72 | 73 | :param solutionAcceptable: Dict of which solution is acceptable for which combination of messages. 74 | :return: Whether the given candidate is acceptable. 
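            :param Xarray: The stacked solution vectors X = (a, b), one row per solution; column 0 holds the
                multiplicator a, which this relaxed check additionally requires to be (practically) constant
                across all rows.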
75 | """ 76 | acceptCount = Counter(solutionAcceptable.values()) 77 | mostlyAcceptable = bool(acceptCount[True] / len(acceptCount) > MSGlen.lenhypoThresh) 78 | # noinspection PyTypeChecker 79 | constantMultiplicator = all(numpy.round(Xarray[0,0], 8) == numpy.round(Xarray[1:,0], 8)) 80 | logging.getLogger(__name__).debug(f"Candidate mostlyAcceptable {mostlyAcceptable} " 81 | f"and has constantMultiplicator {constantMultiplicator}.") 82 | return mostlyAcceptable and constantMultiplicator 83 | 84 | 85 | class CategoricalCorrelatedField(fieldtypes.CategoricalCorrelatedField,ABC): 86 | """ 87 | Abstract class for inferring field types using categorical correlation of n-gram values with external values, e. g., 88 | environmental information like addresses from encapsulation. 89 | 90 | Enhancement of fieldtypes.CategoricalCorrelatedField to iteratively check n-grams from size four to one. 91 | """ 92 | @classmethod 93 | def correlate(cls, messages: List[L4NetworkMessage], nMax: int = 4, nMin: int = 1): 94 | """ 95 | Generate n-grams with n.s from large to small 96 | at the same offsets for each message an correlate each n-gram using categorical correlation. 97 | 98 | see fieldtypes.CategoricalCorrelatedField#correlate() 99 | see HostID for the rationale of this enhancement over FH. 100 | 101 | :param messages: Messages to generate n-grams to correlate to. 102 | :param nMax: maximum of n to correlate (decrease from large to small) 103 | :param nMin: minimum of n to correlate 104 | :return: Correlation values for each offset of n-grams generated from the messages. 105 | """ 106 | categoricalCorrelation = None 107 | for n in range(nMax,nMin+1,-1): 108 | # this is one correlation value for each n-gram starting at the offset 109 | corrAtOffset = super().correlate(messages, n) 110 | if categoricalCorrelation is None: # initial fill 111 | categoricalCorrelation = [-1] * (len(corrAtOffset) + n - 1) 112 | if len(corrAtOffset) + n - 1 != len(categoricalCorrelation): # validity check 113 | # this should not occur of #correlate() is correct and called with the same set of messages 114 | raise RuntimeError("Too few values to correlate.") 115 | for offset, corr in enumerate(corrAtOffset): # iterate all n-gram offsets 116 | for nOff in range(offset, offset+n): # check/set the correlation for ALL bytes of this n-gram 117 | if categoricalCorrelation[nOff] < corr: 118 | categoricalCorrelation[nOff] = corr 119 | corRepr = [round(cc,3) for cc in categoricalCorrelation] 120 | logging.getLogger(__name__).debug(f"Correlation of {n}-ngrams: {corRepr}") 121 | return categoricalCorrelation 122 | 123 | @classmethod 124 | def _combineNgrams2Values(cls, ngrams: Iterable[bytes], values: List[int]): 125 | r""" 126 | The correlation is perfect if null values are omitted 127 | 128 | >>> ngrand = [b'\xa2\xe7', b'r\x06', b'\x0f?', b'd\x8a', b'\xa0X', b'\x04\xba', b'\x19r', b'\x17M', b',\xda', 129 | ... 
b'9K', b'<3', b'\xaa\xdf'] 130 | >>> valRnd = [0.601, 0.601, 0.601, 0.601, 0.804, 0.804, 0.804, 0.804, 0.804, 0.792, 0.731, 0.722] 131 | >>> from fieldhunter.inference.fieldtypesRelaxed import CategoricalCorrelatedField 132 | >>> CategoricalCorrelatedField._combineNgrams2Values(ngrand, valRnd) 133 | array([[4.1703e+04, 2.9190e+04, 3.9030e+03, 2.5738e+04, 4.1048e+04, 134 | 1.2100e+03, 6.5140e+03, 5.9650e+03, 1.1482e+04, 1.4667e+04, 135 | 1.5411e+04, 4.3743e+04], 136 | [6.0100e-01, 6.0100e-01, 6.0100e-01, 6.0100e-01, 8.0400e-01, 137 | 8.0400e-01, 8.0400e-01, 8.0400e-01, 8.0400e-01, 7.9200e-01, 138 | 7.3100e-01, 7.2200e-01]]) 139 | """ 140 | nonNull = list(zip(*filter(lambda x: set(x[0]) != {0}, zip(ngrams, values)))) 141 | if len(nonNull) == 0: 142 | nonNull = [[],[]] 143 | return super(CategoricalCorrelatedField, cls)._combineNgrams2Values(*nonNull) 144 | 145 | 146 | class HostID(CategoricalCorrelatedField, fieldtypes.HostID): 147 | """ 148 | Relaxed version of host identifier (Host-ID) inference (FH, Sec. 3.2.3) 149 | Find n-gram that is strongly correlated with IP address of sender. 150 | 151 | see fieldtypes.HostID 152 | 153 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 154 | # # We investigated the low categoricalCorrelation for all but one byte within an address field (see NTP and DHCP). 155 | # # According to NTP offset 12 (REF ID, often DST IP address) and DHCP offsets (12, 17, and) 20 (IPs) 156 | # # this works in principle, but if the n-gram is too short the correlation gets lost for some n-grams. 157 | print(tabulate(zip(*[hostidfields.categoricalCorrelation]), showindex="always")) 158 | from matplotlib import pyplot 159 | pyplot.bar(range(len(hostidfields.categoricalCorrelation)), hostidfields.categoricalCorrelation) 160 | pyplot.show() 161 | # sum([msg.data[20:24] == bytes(map(int, msg.source.rpartition(':')[0].split('.'))) for msg in messages]) 162 | # sum([int.from_bytes(messages[m].data[20:24], "big") == srcs[m] for m in range(len(messages))]) 163 | # # While the whole dhcp.ip.server [20:24] correlates nicely to the IP address, single n-grams don't. 
164 | serverIP = [(int.from_bytes(messages[m].data[20:24], "big"), srcs[m]) for m in range(len(messages))] 165 | serverIP0 = [(messages[m].data[20], srcs[m]) for m in range(len(messages))] 166 | serverIP1 = [(messages[m].data[21], srcs[m]) for m in range(len(messages))] 167 | serverIP2 = [(messages[m].data[22], srcs[m]) for m in range(len(messages))] 168 | serverIP3 = [(messages[m].data[23], srcs[m]) for m in range(len(messages))] 169 | # nsp = numpy.array([sip for sip in serverIP]) 170 | # # The correlation is perfect, if null values are omitted 171 | nsp = numpy.array([sip for sip in serverIP if sip[0] != 0]) # and sip[0] == sip[1] 172 | # nsp0 = numpy.array(serverIP0) 173 | # nsp1 = numpy.array(serverIP1) 174 | # nsp2 = numpy.array(serverIP2) 175 | # nsp3 = numpy.array(serverIP3) 176 | nsp0 = numpy.array([sip for sip in serverIP0 if sip[0] != 0]) 177 | nsp1 = numpy.array([sip for sip in serverIP1 if sip[0] != 0]) 178 | nsp2 = numpy.array([sip for sip in serverIP2 if sip[0] != 0]) 179 | nsp3 = numpy.array([sip for sip in serverIP3 if sip[0] != 0]) 180 | for serverSrcPairs in [nsp, nsp0, nsp1, nsp2, nsp3]: 181 | print(drv.information_mutual(serverSrcPairs[:, 0], serverSrcPairs[:, 1]) / drv.entropy_joint(serverSrcPairs.T)) 182 | # # Thus, this is no implementation error, but raises doubts about the Host-ID description completeness: 183 | # # Probably it does not mention an Entropy filter, direction separation, or - most probably - 184 | # # an iterative n-gram size increase (like for MSGlen). Thus, we implement such an iterative n-gram analysis 185 | # # in this class's relaxed super-class CategoricalCorrelatedField. 186 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # 187 | """ 188 | 189 | 190 | class SessionID(CategoricalCorrelatedField, fieldtypes.SessionID): 191 | r""" 192 | Relaxed version of session identifier (Session-ID) inference (FH, Section 3.2.4) 193 | Find n-gram that is strongly correlated with IP addresses of sender and receiver 194 | using categorical correlation like Host-ID. 195 | 196 | see fieldtypes.SessionID 197 | 198 | >>> from fieldhunter.inference.fieldtypesRelaxed import SessionID 199 | >>> from netzob.Model.Vocabulary.Messages.L4NetworkMessage import L4NetworkMessage 200 | >>> messages = [ 201 | ... L4NetworkMessage(b"session111\x42\x17\x23\x00\x08\x15", 202 | ... l3SourceAddress="1.2.3.100", l3DestinationAddress="1.2.3.1"), 203 | ... L4NetworkMessage(b"session111\xe4\x83\x82\x85\xbf", 204 | ... l3SourceAddress="1.2.3.1", l3DestinationAddress="1.2.3.100"), 205 | ... L4NetworkMessage(b"session111\x23\x17\xf9\x0b\x00b\x12", 206 | ... l3SourceAddress="1.2.3.100", l3DestinationAddress="1.2.3.1"), 207 | ... L4NetworkMessage(b"session222\x42\x17Jk\x8a1e\xb5", 208 | ... l3SourceAddress="1.2.3.2", l3DestinationAddress="1.2.3.100"), 209 | ... L4NetworkMessage(b"session222L\xab\x83\x1a\xef\x13", 210 | ... l3SourceAddress="1.2.3.100", l3DestinationAddress="1.2.3.2"), 211 | ... ] 212 | >>> SessionID.correlate(messages) 213 | [0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.4181656600790516, 0.4181656600790516, 0.4181656600790516, 0.5, 0.5] 214 | 215 | A problem similar to Host-ID's leads to the same bad quality, thus, we apply the same change via the relaxed 216 | super-class CategoricalCorrelatedField. 
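
    The class additionally overrides _filterMessages() to ignore messages involving the null (0.0.0.0) or
    broadcast (255.255.255.255) address.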
217 | """ 218 | @classmethod 219 | def _filterMessages(cls, messages: List[L4NetworkMessage]): 220 | ignoreList = {b"\x00"*4, b"\xff"*4} 221 | logging.getLogger(__name__).debug("Ignoring non-set and broadcast addresses.") 222 | return [messages for messages, srcDst in zip(messages, cls._srcDstBytes(messages)) 223 | if ignoreList.isdisjoint(srcDst)] 224 | 225 | @classmethod 226 | def _values2correlate2(cls, messages: List[L4NetworkMessage]): 227 | """ 228 | Get source AND destination addresses in the same manner as (just) the source for Host-ID. 229 | Recover byte representations of the IPv4 addresses from all Netzob messages and make one int out if each. 230 | 231 | Compared to the original FH paper, treat source and destination IPs as set, 232 | ignoring their role as denoting sender of receiver, but only interpreting them as equal participants. 233 | 234 | :param messages: Messages to generate n-grams to correlate to. 235 | :return: integer representation of source and destination addresses for each message. 236 | """ 237 | participantPairs = [sorted(srcDst) for srcDst in cls._srcDstBytes(messages)] 238 | return intsFromNgrams(a+b for a,b in participantPairs) 239 | 240 | 241 | class TransID(fieldtypes.TransID): 242 | """ 243 | Relaxed version of transaction identifier (Trans-ID) inference (FH, Section 3.2.5, Fig. 3 right) 244 | 245 | see fieldtypes.TransID 246 | """ 247 | entropyThresh = 0.6 248 | """ 249 | This Value not given in FH! We improve the threshold compared to the paper 250 | by using it as factor for relative entropy amongst all entropies in the collection. 251 | """ 252 | 253 | absoluteEntropy = False 254 | 255 | convLenOneThresh = 0.9 256 | 257 | minConversationLength = 2 258 | """ 259 | For the horizontal entropy require conversations longer than this amount of message exchanges to observe that the 260 | ID changes for each request/reply pair and not is Session-ID/Cookie of some sort. 261 | I. e., "Transaction ID" in DHCP would be a FP, since despite its name it is actually a Session-ID) 262 | """ 263 | 264 | # In _verticalAndHorizontalRandomNgrams(self): 265 | # for the _c2sCombinedOffsets 266 | # (TODO alternatively, deviating from FH, use the offset for each query specifically?) 267 | # and _s2cCombinedOffsets 268 | # (TODO alternatively, deviating from FH, use the entry for each response specifically?) 269 | # This would allow offsets for different message types, but would require to compare values using _constantQRvalues 270 | # with the specific offsets per Q/R pair. ==> Future Work 271 | 272 | @classmethod 273 | def _horizontalRandomNgrams(cls, conversions: Dict[tuple, List[AbstractMessage]], 274 | verticalEntropyFiltered: List[int]) -> Dict[Union[Tuple, None], List[int]]: 275 | if len(conversions) > 0: 276 | # With a conversation length of one, no meaningful horizontal entropy can be calculated (see DNS) 277 | convLens = Counter([len(c) for c in conversions.values()]) 278 | lenOneRatio = convLens[1] / sum(convLens.values()) 279 | 280 | # New compared to original FH: 281 | # If most conversations (convLenOneThresh) are just one message long per direction (e. g. 
DNS), 282 | # ignore the horizontal entropy filter 283 | if lenOneRatio > .9: 284 | return {None: verticalEntropyFiltered} 285 | else: 286 | filteredOutput = dict() 287 | # horizontal collections: entropy of n-gram per the same offset in all messages of one flow direction 288 | for key, conv in conversions.items(): 289 | # The horizontal entropy is too low if the number of specimens is low 290 | # -> Enhancing over FH, we use the threshold as a relative to max and ignore short conversations 291 | if len(conv) < cls.minConversationLength: 292 | continue 293 | filteredOutput[key] = cls.entropyFilteredOffsets(conv, cls.absoluteEntropy) 294 | return filteredOutput 295 | else: 296 | return {} 297 | 298 | # Host-ID will always return a subset of Session-ID fields, so Host-ID should get precedence 299 | # MSG-Len would be overwritten by MSG-Type (see SMB: nbss.length), so first use MSG-Len 300 | precedence = {MSGlen.typelabel: 0, MSGtype.typelabel: 1, HostID.typelabel: 2, 301 | SessionID.typelabel: 3, TransID.typelabel: 4, Accumulator.typelabel: 5} 302 | """ 303 | The order in which to map field types to messages. 304 | Lower numbers take precedence over higher numbers, so that the type with the higher number will be ignored 305 | if overlapping at the same offet range in the message. 306 | """ -------------------------------------------------------------------------------- /src/fieldhunter/utils/base.py: -------------------------------------------------------------------------------- 1 | from collections import Iterator 2 | from itertools import chain 3 | from typing import List, Dict, Iterable, Tuple, Union 4 | 5 | import IPython 6 | from numpy import nan 7 | from pyitlib import discrete_random_variable as drv 8 | 9 | from netzob.Model.Vocabulary.Messages.AbstractMessage import AbstractMessage 10 | from netzob.Model.Vocabulary.Messages.L4NetworkMessage import L4NetworkMessage 11 | 12 | from nemere.inference.segments import MessageAnalyzer 13 | 14 | 15 | 16 | class NgramIterator(Iterator): 17 | """ 18 | Iterate over the byte n-grams in message. 19 | 20 | FH, Section 3.1.2 21 | 22 | 23 | >>> from fieldhunter.utils.base import NgramIterator 24 | >>> from netzob.Model.Vocabulary.Messages.L4NetworkMessage import L4NetworkMessage 25 | >>> ngi = NgramIterator(L4NetworkMessage(b"QQQ456789")) 26 | >>> for ngram in ngi: 27 | ... print(ngram, ngi.offset, ngi.exhausted, ngi.lookahead) 28 | b'QQQ' 0 False True 29 | b'QQ4' 1 False True 30 | b'Q45' 2 False True 31 | b'456' 3 False True 32 | b'567' 4 False True 33 | b'678' 5 False True 34 | b'789' 6 False False 35 | >>> print(ngi.exhausted, ngi.lookahead) 36 | True False 37 | """ 38 | 39 | def __init__(self, message: AbstractMessage, n=3): 40 | """ 41 | 42 | :param message: The message of which to iterate the n-grams. 43 | :param n: The n in n-gram (length of chunk in bytes). 44 | """ 45 | if not isinstance(message, AbstractMessage): 46 | raise ValueError("Parameter needs to be a Netzob message object (AbstractMessage).") 47 | self._message = message 48 | self._n = n 49 | self.__offset = -1 50 | 51 | __step = 1 52 | 53 | def __iter__(self): 54 | self.__offset = -1 55 | return self 56 | 57 | def __next__(self) -> bytes: 58 | self.__offset += NgramIterator.__step 59 | if self.exhausted: 60 | raise StopIteration() 61 | return self._message.data[self.__offset:self.__offset+self._n] 62 | 63 | @property 64 | def offset(self): 65 | """ 66 | NgramIterator enumerates the offset of the n-gram its current iteration is taken from. 
67 | 68 | :return: offset of the n-gram in the current iteration. 69 | """ 70 | return self.__offset 71 | 72 | @property 73 | def exhausted(self): 74 | """ 75 | :return: Indicates that the last iteration has occurred. 76 | """ 77 | return self.__offset > len(self._message.data) - self._n 78 | 79 | @property 80 | def lookahead(self): 81 | """ 82 | :return: True indicates that at least one more iteration is contained in this iterator. 83 | """ 84 | return self.__offset + NgramIterator.__step <= len(self._message.data) - self._n 85 | 86 | 87 | class Flows(object): 88 | # noinspection PyUnresolvedReferences 89 | """ 90 | In FH, a flow is defined by the 5-tuple: Layer-4 Protocol, Source IP, Destination IP, Source Port, Destination Port 91 | (FH, Section2, Footnote 1) 92 | 93 | >>> from tabulate import tabulate 94 | >>> from netzob.Model.Vocabulary.Messages.L4NetworkMessage import L4NetworkMessage 95 | >>> from fieldhunter.utils.base import Flows 96 | >>> messages = [ 97 | ... L4NetworkMessage(b"QQQ456789", l4Protocol="dummy", date=1445405280.01, l3SourceAddress="192.168.23.100", l3DestinationAddress="192.168.23.245", l4SourceAddress=10815, l4DestinationAddress=42), 98 | ... L4NetworkMessage(b"RRR567890", l4Protocol="dummy", date=1445405280.03, l3SourceAddress="192.168.23.245", l3DestinationAddress="192.168.23.100", l4SourceAddress=42, l4DestinationAddress=10815), 99 | ... L4NetworkMessage(b"QQQ7890AB", l4Protocol="dummy", date=1445405280.07, l3SourceAddress="192.168.23.100", l3DestinationAddress="192.168.23.245", l4SourceAddress=10815, l4DestinationAddress=42), 100 | ... L4NetworkMessage(b"RRR567890", l4Protocol="dummy", date=1445405280.05, l3SourceAddress="192.168.23.245", l3DestinationAddress="192.168.23.100", l4SourceAddress=42, l4DestinationAddress=10815), 101 | ... L4NetworkMessage(b"QQQ123456789", l4Protocol="dummy", date=1445405280.11, l3SourceAddress="192.168.23.100", l3DestinationAddress="192.168.23.245", l4SourceAddress=1717, l4DestinationAddress=2323), 102 | ... L4NetworkMessage(b"RRR890A", l4Protocol="dummy", date=1445405280.13, l3SourceAddress="192.168.23.1", l3DestinationAddress="192.168.23.100", l4SourceAddress=2323, l4DestinationAddress=1717), 103 | ... L4NetworkMessage(b"QQQ6789", l4Protocol="dummy", date=1445405280.17, l3SourceAddress="192.168.23.1", l3DestinationAddress="192.168.23.245", l4SourceAddress=1717, l4DestinationAddress=2323), 104 | ... L4NetworkMessage(b"RRR890ABCDEFGH", l4Protocol="dummy", date=1445405280.23, l3SourceAddress="192.168.23.245", l3DestinationAddress="192.168.23.100", l4SourceAddress=2323, l4DestinationAddress=1717) 105 | ... ] 106 | >>> # for the sake of the test case, the messages RRR890A and QQQ6789 have src IPs that rules them out as 107 | >>> # valid conversations, they should not be contained in the conversion lists below. 
108 | >>> flows = Flows(messages) 109 | >>> mqr = flows.matchQueryResponse() 110 | >>> print(tabulate([ (q.date, r.date) for q, r in mqr.items() ], floatfmt="")) 111 | ------------- ------------- 112 | 1445405280.01 1445405280.03 113 | 1445405280.11 1445405280.23 114 | ------------- ------------- 115 | >>> print(tabulate( [(list(flowtuple) + [b" ".join(msg.data for msg in msglst)]) for flowtuple, msglst in flows.conversations().items()] )) 116 | ----- -------------- -------------- ----- ---- --------------------------------------- 117 | dummy 192.168.23.100 192.168.23.245 10815 42 QQQ456789 QQQ7890AB RRR567890 RRR567890 118 | dummy 192.168.23.100 192.168.23.245 1717 2323 QQQ123456789 RRR890ABCDEFGH 119 | ----- -------------- -------------- ----- ---- --------------------------------------- 120 | >>> print(tabulate( [(list(flowtuple) + [b" ".join(msg.data for msg in msglst)]) for flowtuple, msglst in flows.c2sInConversations().items()] )) 121 | ----- -------------- -------------- ----- ---- ------------------- 122 | dummy 192.168.23.100 192.168.23.245 10815 42 QQQ456789 QQQ7890AB 123 | dummy 192.168.23.100 192.168.23.245 1717 2323 QQQ123456789 124 | ----- -------------- -------------- ----- ---- ------------------- 125 | >>> print(tabulate( [(list(flowtuple) + [b" ".join(msg.data for msg in msglst)]) for flowtuple, msglst in flows.s2cInConversations().items()] )) 126 | ----- -------------- -------------- ----- ---- ------------------- 127 | dummy 192.168.23.100 192.168.23.245 10815 42 RRR567890 RRR567890 128 | dummy 192.168.23.100 192.168.23.245 1717 2323 RRR890ABCDEFGH 129 | ----- -------------- -------------- ----- ---- ------------------- 130 | """ 131 | 132 | def __init__(self, messages: List[L4NetworkMessage]): 133 | self._messages = messages 134 | self._flows = self._identify() 135 | 136 | @property 137 | def messages(self): 138 | return self._messages 139 | 140 | def _identify(self) -> Dict[Tuple, List[L4NetworkMessage]]: 141 | """ 142 | Identify flows. 143 | 144 | :return A dict mapping the 5-tuple 145 | (Layer-4 Protocol, Source IP, Destination IP, Source Port, Destination Port) 146 | to the list of addresses in the flow denoted by the 5-tuple. 147 | """ 148 | flows = dict() # type: Dict[Tuple[str,str,str,str,str], List[L4NetworkMessage]] 149 | # client is initiator, sort by packet date 150 | for msg in sorted(self._messages, key=lambda m: m.date): # type: L4NetworkMessage 151 | if not isinstance(msg, L4NetworkMessage): 152 | raise TypeError("To identify flows, all messages need to be from a known encapsulation with known " 153 | "network and transport layer protocols. No flow determined for " 154 | f"{type(msg).__name__}:\n{msg}") 155 | src = msg.source.rpartition(':') 156 | dst = msg.destination.rpartition(':') 157 | srcAddress = src[0] 158 | dstAddress = dst[0] 159 | srcPort = src[2] 160 | dstPort = dst[2] 161 | keytuple = (msg.l4Protocol, srcAddress, dstAddress, srcPort, dstPort) 162 | if keytuple not in flows: 163 | flows[keytuple] = list() 164 | flows[keytuple].append(msg) 165 | return flows 166 | 167 | @property 168 | def flows(self): 169 | return self._flows 170 | 171 | def conversations(self) -> Dict[Tuple, List[AbstractMessage]]: 172 | """ 173 | "A conversation is formed of the two flows in opposite direction..." (FH, Section2, Footnote 1) 174 | :return: Dict of conversations with the c2s flow tuple as key. 
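Pairing the two opposite flows of a conversation relies on reversing the flow 5-tuple (see _dialogs() below): a flow key and its reverse differ only in swapped addresses and ports. Illustrative sketch (not a doctest; the values are taken from the class doctest above):

    key  = ("dummy", "192.168.23.100", "192.168.23.245", "10815", "42")  # client-to-server flow
    rkey = (key[0], key[2], key[1], key[4], key[3])                       # matching server-to-client flow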
175 | """ 176 | return {qkey: self._flows[qkey] + self._flows[rkey] 177 | for qkey,rkey in self._dialogs().items() if rkey is not None} 178 | 179 | def c2sInConversations(self) -> Dict[Tuple, List[AbstractMessage]]: 180 | """ 181 | "A conversation is formed of the two flows in opposite direction..." (FH, Section2, Footnote 1) 182 | :return: Dict of c2s messages per conversation with the c2s flow tuple as key. 183 | """ 184 | return {qkey: self._flows[qkey] for qkey,rkey in self._dialogs().items() if rkey is not None} 185 | 186 | def s2cInConversations(self) -> Dict[Tuple, List[AbstractMessage]]: 187 | """ 188 | "A conversation is formed of the two flows in opposite direction..." (FH, Section2, Footnote 1) 189 | :return: Dict of s2c messages per conversation with the c2s flow tuple as key. 190 | """ 191 | return {qkey: self._flows[rkey] for qkey,rkey in self._dialogs().items() if rkey is not None} 192 | 193 | def _dialogs(self) -> Dict[Tuple,Union[Tuple,None]]: 194 | """ 195 | find pairs of flows with src/dst and reversed to each other. 196 | """ 197 | dialogs = dict() 198 | for keytuple in self._flows.keys(): 199 | # exchange src and dst addresses and ports 200 | rkeytuple = (keytuple[0], keytuple[2], keytuple[1], keytuple[4], keytuple[3]) 201 | if rkeytuple in dialogs: 202 | if dialogs[rkeytuple] is not None: 203 | raise Exception("Strange things happened here.") 204 | # identify the flow starting earlier as client (key in dialogs), the other as server (value in dialogs) 205 | if self._flows[rkeytuple][0].date < self._flows[keytuple][0].date: 206 | dialogs[rkeytuple] = keytuple 207 | else: 208 | del dialogs[rkeytuple] 209 | dialogs[keytuple] = rkeytuple 210 | else: 211 | dialogs[keytuple] = None 212 | return dialogs 213 | 214 | def splitDirections(self) -> Tuple[List[AbstractMessage],List[AbstractMessage]]: 215 | """ 216 | Split list of messages into directions S2C and C2S based on flow information. 217 | Ignores all flows that have no reverse direction. 218 | 219 | FH, Section 2, Footnote 1 220 | 221 | :return Lists of messages, the first is client-to-server, the second is server-to-client 222 | """ 223 | # merge all client flows into one and all server flows into another list of messages 224 | c2s = list(chain.from_iterable(self.c2sInConversations().values())) 225 | s2c = list(chain.from_iterable(self.s2cInConversations().values())) 226 | return c2s, s2c 227 | 228 | def matchQueryResponse(self): 229 | """ 230 | Match queries with responses in the flows by identifying 231 | for each client-to-server message (query) the server-to-client message (response) 232 | that has the closest subsequent transmission time. 233 | """ 234 | dialogs = self._dialogs() 235 | qr = dict() 236 | 237 | for keytuple in dialogs.keys(): 238 | if dialogs[keytuple] is None: 239 | continue 240 | qlist = self._flows[keytuple].copy() 241 | rlist = self._flows[dialogs[keytuple]].copy() 242 | 243 | # assume qlist and rlist are sorted by query.date and resp.date 244 | prevquery = None 245 | for query in qlist: 246 | respFound = False 247 | for resp in rlist: 248 | # first response later than query 249 | if query.date < resp.date: 250 | qr[query] = resp 251 | respFound = True 252 | break 253 | if not respFound: 254 | continue 255 | # if the response to query seems to be the same than to the previous query... 256 | if prevquery is not None and qr[query] == qr[prevquery]: 257 | # ... ignore the earlier query since a response message in between seems to have gone missing. 
258 | del qr[prevquery] 259 | prevquery = query 260 | return qr 261 | 262 | 263 | def ngramEntropy(messages: List[AbstractMessage], n=1): 264 | """ 265 | The vertical entropies for each offset of all the n-grams at the same offset throughout all messages. 266 | Own entropy calculation implementation. See #pyitEntropyVertical 267 | 268 | FH, Section 3.2.1 269 | """ 270 | ngIters = [NgramIterator(msg, n) for msg in messages] 271 | vEntropy = list() 272 | 273 | for ngrams in zip(*ngIters): 274 | vEntropy.append(MessageAnalyzer.calcEntropy(ngrams, 256)) 275 | 276 | return vEntropy 277 | 278 | 279 | def intsFromNgrams(ngrams: Iterable[bytes], endianness='big') -> List[int]: 280 | r""" 281 | Convert an iterable of byte n-grams into a single integer per n-gram. 282 | This is useful to simplify working aroung numpy's issue with null-bytes: 283 | Issue #3878 (https://github.com/numpy/numpy/issues/3878) 284 | 285 | >>> from fieldhunter.utils.base import intsFromNgrams 286 | >>> ngramlist = [b"\x00\x00\x00", b"\x00\x11\x00", b"\xab\x00\x00", b"\xab\x11\x23", b"\x08\x15"] 287 | >>> ifn = intsFromNgrams(ngramlist) 288 | >>> # noinspection PyUnresolvedReferences 289 | >>> [hex(val) for val in ifn] 290 | ['0x0', '0x1100', '0xab0000', '0xab1123', '0x815'] 291 | 292 | :param ngrams: Iterable of n-grams, one bytes string per n-gram 293 | :param endianness: The endianness to use to interpret the bytes. 294 | :return: List of integers, one for with the value of each bytes string n-gram. 295 | """ 296 | return [int.from_bytes(b, endianness) for b in ngrams] 297 | 298 | 299 | def pyitNgramEntropy(messages: List[AbstractMessage], n=1, endianness='big'): 300 | """ 301 | The vertical entropies for each offset of all the n-grams at the same offset throughout all messages. 302 | Implementation of entropy calculation from pyitlib. See #entropyVertical 303 | 304 | FH, Section 3.2.1 305 | 306 | >>> from netzob.Model.Vocabulary.Messages.L4NetworkMessage import L4NetworkMessage 307 | >>> from fieldhunter.utils.base import pyitNgramEntropy, ngramEntropy 308 | >>> messageList = [ 309 | ... L4NetworkMessage(b"QQQ456789"), L4NetworkMessage(b"RRR567890"), L4NetworkMessage(b"QQQ7890AB"), 310 | ... L4NetworkMessage(b"RRR567890"), L4NetworkMessage(b"QQQ123456789"), L4NetworkMessage(b"RRR890A"), 311 | ... L4NetworkMessage(b"QQQ6789"), L4NetworkMessage(b"RRR890ABCDEFGH") 312 | ... ] 313 | >>> ngramEntropy(messageList) == pyitNgramEntropy(messageList) 314 | True 315 | 316 | :param messages: The list of messages to get the n-grams from. 317 | :param n: The n in n-gram. 318 | :param endianness: Endianness to interpret the n-grams in. 319 | """ 320 | ngIters = [NgramIterator(msg, n) for msg in messages] 321 | vEntropy = list() 322 | 323 | for ngrams in zip(*ngIters): # type: List[bytes] 324 | # int.from_bytes is necessary because of numpy's issue with null-bytes: #3878 325 | # (https://github.com/numpy/numpy/issues/3878) 326 | vEntropy.append(drv.entropy(intsFromNgrams(ngrams, endianness))/(n*8)) 327 | 328 | return vEntropy 329 | 330 | 331 | def mutualInformationNormalized(qInts: Union[List[List[int]],List[int]], rInts: Union[List[List[int]],List[int]]): 332 | """ 333 | Calculate the Mutual Information between two lists of n-grams, e.g., 334 | one list being queries and the other the according responses, normalized to the queries' entropy. 335 | Mutual information measures the information shared between the two lists. 
(FH, Section 3.2.1) 336 | 337 | >>> from fieldhunter.utils.base import mutualInformationNormalized, intsFromNgrams 338 | >>> queryNgramsConst = [b'QQQ', b'QQQ', b'QQQ', b'QQQ'] 339 | >>> respoNgramsConst = [b'RRR', b'RRR', b'RRR', b'RRR'] 340 | >>> queryNgramsCorr = [b'42', b'40', b'42', b'23', b'17'] 341 | >>> respoNgramsCorr = [b'24', b'04', b'24', b'32', b'71'] 342 | >>> queryNgramsPart = [b'42', b'40', b'42', b'23', b'17'] 343 | >>> respoNgramsPart = [b'04', b'04', b'04', b'32', b'71'] 344 | >>> # query and response n-grams are always constant: This allows no conclusion about any correlation => nan 345 | >>> mutualInformationNormalized(intsFromNgrams(queryNgramsConst), intsFromNgrams(respoNgramsConst)) 346 | nan 347 | >>> # query and response n-grams always have corresponding values => perfectly correlated 348 | >>> mutualInformationNormalized(intsFromNgrams(queryNgramsCorr), intsFromNgrams(respoNgramsCorr)) 349 | 1.0 350 | >>> # some query and response n-grams have corresponding values, multiple other queries have the same responses. 351 | >>> mutualInformationNormalized(intsFromNgrams(queryNgramsPart), intsFromNgrams(respoNgramsPart)) 352 | 0.7133... 353 | 354 | :param qInts: List of (n-grams as int-list) or one int per realization 355 | :param rInts: List of (n-grams as int-list) or one int per realization 356 | :return: 357 | """ 358 | assert len(qInts) > 0, "Entropy requires at least one query realization" 359 | assert len(rInts) > 0, "Entropy requires at least one reply realization" 360 | assert len(qInts) == len(rInts), "Mutual information requires the same amount of query and reply realizations" 361 | qEntropy = drv.entropy(qInts) 362 | if qEntropy != 0: 363 | return drv.information_mutual(qInts, rInts) / qEntropy 364 | else: 365 | return nan 366 | 367 | 368 | def qrAssociationCorrelation(mqr: Dict[L4NetworkMessage, L4NetworkMessage], n=1): 369 | """ 370 | Take the matched query-response pairs (mqr) 371 | and associate n-gram offsets by mutual information as correlation metric. 372 | 373 | >>> from tabulate import tabulate 374 | >>> from netzob.Model.Vocabulary.Messages.L4NetworkMessage import L4NetworkMessage 375 | >>> from fieldhunter.utils.base import qrAssociationCorrelation 376 | >>> matchedqr = { 377 | ... L4NetworkMessage(b"QQQ456789"): L4NetworkMessage(b"RRR567890"), 378 | ... L4NetworkMessage(b"QQQ7890AB"): L4NetworkMessage(b"RRR567890"), 379 | ... L4NetworkMessage(b"QQQ123456789"): L4NetworkMessage(b"RRR890A"), 380 | ... L4NetworkMessage(b"QQQ6789"): L4NetworkMessage(b"RRR890ABCDEFGH"), 381 | ... } 382 | >>> qrAC = qrAssociationCorrelation(matchedqr) 383 | >>> print(tabulate(qrAC.items())) 384 | - ----- 385 | 0 nan 386 | 1 nan 387 | 2 nan 388 | 3 0.5 389 | 4 0.5 390 | 5 0.5 391 | 6 0 392 | - ----- 393 | 394 | For the first 3 bytes the normalized mutual information is undefined: see #mutualInformationNormalized() 395 | 396 | # TODO optimize efficiency by supporting an input filter, i. 
e., 397 | calculate mutual information only for given ngram offsets 398 | 399 | :param mqr: Matched query-response pairs 400 | :param n: The length of the n-grams to use (in bytes) 401 | :returns: Offset => causality value 402 | """ 403 | mutInf = dict() 404 | qIterators, rIterators = list(), list() 405 | for qrPair in mqr.items(): 406 | qIterators.append(NgramIterator(qrPair[0], n)) 407 | rIterators.append(NgramIterator(qrPair[1], n)) 408 | while not all(qiter.exhausted for qiter in qIterators) or all(riter.exhausted for riter in rIterators): 409 | qNgrams = list() 410 | rNgrams = list() 411 | # get two lists of n-grams with the same offset, one for queries, one for responses 412 | for qIter, rIter in zip(qIterators, rIterators): 413 | if not qIter.lookahead or not rIter.lookahead: 414 | # there are no more n-grams for either query or response for this pair of Q/R messages 415 | continue 416 | # fetch the next iteration for both messages in parallel. 417 | # A StopIteration should never occur here, since we check if the iterators are soon being exhausted before. 418 | qNgrams.append(next(qIter)) 419 | rNgrams.append(next(rIter)) 420 | # print("Q offset:", qIter.offset) # should be the same for all iterators in one while loop 421 | # print("R offset:", rIter.offset, "\n") 422 | if len(qNgrams) == 0 or len(rNgrams) == 0: 423 | break 424 | # print(qNgrams) 425 | # print(rNgrams, "\n") 426 | qInts = intsFromNgrams(qNgrams) 427 | rInts = intsFromNgrams(rNgrams) 428 | # qIter and rIter are always set here, otherwise the break on (len(qNgrams) == 0 or len(rNgrams) == 0) 429 | # above would have been triggered 430 | # noinspection PyUnboundLocalVariable 431 | if qIter.offset != rIter.offset: 432 | # NgramIterator remembers the offset of its current iteration. This must be the same in both messages. 433 | raise RuntimeError("The offsets in qrAssociationCorrelation calculation do not match:" 434 | f"{qIter.offset} {rIter.offset}\n{qNgrams}\n{rNgrams}") 435 | mutInf[qIter.offset] = mutualInformationNormalized(qInts, rInts) 436 | return mutInf 437 | 438 | 439 | def verticalByteMerge(mqr: Dict[L4NetworkMessage, L4NetworkMessage], offsets: Iterable[int]): 440 | # noinspection PyUnresolvedReferences 441 | """ 442 | Returns two lists of integer-list representations of byte strings, 443 | one from all queries and one from all responses, 444 | containing the bytes at all offsets given as parameter. 445 | 446 | >>> from fieldhunter.utils.base import verticalByteMerge 447 | >>> messageMap = { 448 | ... L4NetworkMessage(b"QQQ456789"): L4NetworkMessage(b"RRR567890"), 449 | ... L4NetworkMessage(b"QQQ7890AB"): L4NetworkMessage(b"RRR567890"), 450 | ... L4NetworkMessage(b"QQQ123456789"): L4NetworkMessage(b"RRR890A"), 451 | ... L4NetworkMessage(b"QQQ6789"): L4NetworkMessage(b"RRR890ABCDEFGH"), 452 | ... 
} 453 | >>> verticalByteMerge(messageMap, [1]) 454 | ([81, 81, 81, 81], [82, 82, 82, 82]) 455 | >>> verticalByteMerge(messageMap, [1,2,3]) 456 | ([5329204, 5329207, 5329201, 5329206], [5394997, 5394997, 5395000, 5395000]) 457 | >>> qMsgs, rMsgs = verticalByteMerge(messageMap, [3,5,6]) 458 | >>> # ints converted back to the bytes number 3, 5, and 6 from the keys in messageMap 459 | >>> [int.to_bytes(val, 3, 'big') for val in qMsgs] 460 | [b'467', b'790', b'134', b'689'] 461 | >>> # ints converted back to the bytes number 3, 5, and 6 from the values in messageMap 462 | >>> [int.to_bytes(val, 3, 'big') for val in rMsgs] 463 | [b'578', b'578', b'80A', b'80A'] 464 | 465 | :param mqr: Dict that maps one message to another. 466 | :param offsets: List of offsets for which the byte values should be returned. 467 | The offset must exist in all messages. 468 | :return: Two lists of integer representations of the byte values at the given offsets, 469 | one list for the keys and one for the values of the input dict. 470 | :raises IndexError: If an offset does not exist in any message. 471 | """ 472 | sortedOffs = sorted(offsets) 473 | qMerge = list() 474 | rMerge = list() 475 | for query, resp in mqr.items(): 476 | # int.from_bytes is necessary because of numpy's issue with null-bytes: #3878 477 | # (https://github.com/numpy/numpy/issues/3878) 478 | qMerge.append(int.from_bytes(bytes(query.data[o] for o in sortedOffs), 'big')) 479 | rMerge.append(int.from_bytes(bytes(resp.data[o] for o in sortedOffs), 'big')) 480 | return qMerge, rMerge 481 | 482 | 483 | def iterateSelected(toIter: Iterator, selectors: List[int]): 484 | """ 485 | Only return selected iterations from an iterator. 486 | 487 | >>> from fieldhunter.utils.base import iterateSelected 488 | >>> bytesTuple = iter((b'QQQ456789', b'RRR567890', b'QQQ7890AB', b'RRR567890', b'QQQ123456789', b'RRR890A')) 489 | >>> bt2357 = iterateSelected(bytesTuple, [2,3,5,7]) 490 | >>> next(bt2357) 491 | b'QQQ7890AB' 492 | >>> next(bt2357) 493 | b'RRR567890' 494 | >>> next(bt2357) 495 | b'RRR890A' 496 | >>> next(bt2357) # doctest: +IGNORE_EXCEPTION_DETAIL 497 | Traceback (most recent call last): 498 | StopIteration 499 | 500 | :param toIter: The iterator to traverse. 501 | :param selectors: The list of iteration indices to return. 502 | :return: A generator for all iterations in toIter that have an "index" selected by selectors. 503 | """ 504 | return (element for offset, element in enumerate(toIter) if offset in selectors) 505 | 506 | 507 | def list2ranges(offsets: List[int]): 508 | """ 509 | Generate ranges from a list of integer values. The ranges denote the starts and lengths of any subsequence of 510 | adjacent values, e. g. the list [1,2,3,6,7,20] would result in the ranges [(1,3),(6,2),(20,1)] 511 | 512 | >>> from fieldhunter.utils.base import list2ranges 513 | >>> list2ranges([1,2,3,6,7,20]) 514 | [(1, 3), (6, 2), (20, 1)] 515 | >>> list2ranges([2,3,6,11,12,13,23,24,25,26]) 516 | [(2, 2), (6, 1), (11, 3), (23, 4)] 517 | >>> list2ranges([2]) 518 | [(2, 1)] 519 | >>> list2ranges([]) 520 | [] 521 | >>> list2ranges([-2]) # doctest: +IGNORE_EXCEPTION_DETAIL 522 | Traceback (most recent call last): 523 | ValueError: Offsets must be positive numbers. 524 | 525 | :param offsets: list of integers 526 | :return: list of ranges (tuples of offset and length) of consecutive the offsets. 
527 | """ 528 | soffs = sorted(offsets) 529 | ranges = list() # type: List[Tuple[int,int]] 530 | # offsets empty 531 | if len(soffs) == 0: 532 | return ranges 533 | if soffs[0] < 0: 534 | raise ValueError("Offsets must be positive numbers.") 535 | # only one offset 536 | if len(soffs) == 1: 537 | return [(soffs[0],1)] 538 | start = soffs[0] 539 | last = soffs[0] 540 | for offs in soffs[1:]: 541 | if offs > last + 1: 542 | ranges.append((start, last - start + 1)) 543 | # start a new range 544 | start = offs 545 | last = offs 546 | # append dangling start/last 547 | ranges.append((start, last - start + 1)) 548 | 549 | return ranges 550 | 551 | 552 | def ngramIsOverlapping(o0, n0, o1, n1): 553 | """ 554 | Check if two ranges are overlapping. The ranges are defined by offset and length each. 555 | 556 | >>> ngramIsOverlapping(2,2,0,3) 557 | True 558 | >>> ngramIsOverlapping(2,2,0,2) 559 | False 560 | >>> ngramIsOverlapping(2,2,3,2) 561 | True 562 | >>> ngramIsOverlapping(2,2,4,2) 563 | False 564 | 565 | :param o0: Offset of n-gram 0 566 | :param n0: Length (n) of n-gram 0 567 | :param o1: Offset of n-gram 1 568 | :param n1: Length (n) of n-gram 1 569 | :return: True if overlapping, false otherwise 570 | """ 571 | return o1 + n1 - 1 >= o0 and o1 < o0 + n0 572 | 573 | -------------------------------------------------------------------------------- /src/fieldhunter/inference/fieldtypes.py: -------------------------------------------------------------------------------- 1 | """ 2 | Infer message field types exactly according to the FieldHunter paper Section 3.2 3 | 4 | TODO introduce doctests to check critical functions in inference.fieldtypes 5 | """ 6 | from typing import List, Tuple, Dict, Iterable, ItemsView, Union 7 | import random, logging 8 | from itertools import groupby, product, chain, combinations 9 | from collections import Counter 10 | from abc import ABC, abstractmethod 11 | 12 | import numpy 13 | from scipy.stats import pearsonr 14 | from pyitlib import discrete_random_variable as drv 15 | from netzob.Model.Vocabulary.Messages.AbstractMessage import AbstractMessage 16 | from netzob.Model.Vocabulary.Messages.L2NetworkMessage import L2NetworkMessage 17 | from netzob.Model.Vocabulary.Messages.L4NetworkMessage import L4NetworkMessage 18 | 19 | from nemere.inference.analyzers import Value 20 | from nemere.inference.segments import TypedSegment 21 | from tabulate import tabulate 22 | 23 | from fieldhunter.utils.base import qrAssociationCorrelation, verticalByteMerge, mutualInformationNormalized, \ 24 | list2ranges, Flows, NgramIterator, iterateSelected, intsFromNgrams, \ 25 | ngramIsOverlapping, pyitNgramEntropy 26 | 27 | 28 | # logging.getLogger(__name__).setLevel(logging.DEBUG) 29 | 30 | 31 | class FieldType(ABC): 32 | """ 33 | Generic, abstract base class for field types. Holds segments and a type label for the inferred fields. 34 | 35 | For DocTest example see fieldhunter.inference.common#segmentedMessagesAndSymbols() 36 | """ 37 | typelabel = None 38 | 39 | def __init__(self): 40 | self._segments = list() 41 | 42 | @property 43 | def segments(self) -> List[List[TypedSegment]]: 44 | """ 45 | :return: Final result as segments that are of the inferred type. 46 | """ 47 | return self._segments 48 | 49 | @classmethod 50 | def _posLen2segments(cls, messages: List[L2NetworkMessage], 51 | posLen: Union[Iterable[Tuple[int, int]],ItemsView[int, int]]) \ 52 | -> List[List[TypedSegment]]: 53 | """ 54 | Generate Segments from (remaining) field ranges. 
55 | 56 | For DocTest example see fieldhunter.inference.common#segmentedMessagesAndSymbols() 57 | 58 | :param messages: Messages to generate n-grams to correlate to. 59 | :param posLen: List of start-length tuples to create segments for from each message. 60 | :return: Lists of segments per message generated from the posLen parameter. 61 | """ 62 | segments = list() 63 | for message in messages: 64 | mval = Value(message) 65 | segs4msg = list() 66 | for start, length in posLen: 67 | # check if boundaries fit into message 68 | if start + length <= len(mval.values): 69 | segs4msg.append(TypedSegment(mval, start, length, cls.typelabel)) 70 | segments.append(segs4msg) 71 | return segments 72 | 73 | 74 | class NonConstantNonRandomEntropyFieldType(FieldType, ABC): 75 | """ 76 | Abstract class for inferring field types using entropy of n-gram values 77 | where the entropy may neither be 0 (constant n-gram values) 78 | nor equal or greater than a threshold (random n-gram values). 79 | """ 80 | # Value for entropyThresh not given in FH! 81 | # We use a constant entropyThresh of 0.4 determined by own empirics (results of /src/trace_statistics.py in 82 | # nemesys-reports/NEMEFTR/fieldhunter/typeAndLengthEntropies.ods) 83 | entropyThresh = 0.4 84 | 85 | @classmethod 86 | def entropyFilteredOffsets(cls, messages: List[AbstractMessage], n: int): 87 | """ 88 | Find offsets of n-grams (with the same offset in different messages of the list) that are not constant and not 89 | random, i. e., that have an entropy between 0 and cls.entropyThresh. 90 | 91 | FH, Section 3.2.1 92 | 93 | :param messages: Messages to generate n-grams from 94 | :param n: The $n$ in n-gram 95 | :return: A list of offsets that have non-constant and non-random entropy, i. e., between 0 and entropyThresh. 96 | """ 97 | entropy = pyitNgramEntropy(messages, n) 98 | ePo = list(enumerate(entropy)) 99 | logging.getLogger(__name__).debug(f"Entropies per offset:\n{tabulate(ePo)}") 100 | return [offset for offset, e in enumerate(entropy) if 0 < e < cls.entropyThresh] 101 | 102 | 103 | class MSGtype(NonConstantNonRandomEntropyFieldType): 104 | """ 105 | Message type (MSG-Type) inference (FH, Section 3.2.1, Fig. 3 left). 106 | This type heuristic is based on the mutual information shared between n-grams at the same offset in the query and 107 | response messages. This assumes that message type fields are at the same position in query and response. Moreover, 108 | fields that solely denote whether a message is a query or a response yield an undefined mutual information and thus 109 | cannot be detected as fields denoting a message type. 110 | 111 | The properties of this class provide access to intermediate and final results. 
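A minimal usage sketch (illustrative, not a doctest; it assumes messages is a list of L4NetworkMessage objects parsed from a trace):

    from fieldhunter.utils.base import Flows
    from fieldhunter.inference.fieldtypes import MSGtype

    flows = Flows(messages)     # group messages into flows/conversations
    msgtype = MSGtype(flows)    # run the MSG-Type inference
    print(msgtype.offsets)      # byte offsets inferred to hold the message type
    print(msgtype.segments)     # TypedSegments per message for these offsets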
112 | """ 113 | typelabel = "MSG-Type" 114 | causalityThresh = 0.8 115 | 116 | def __init__(self, flows: Flows): 117 | super().__init__() 118 | 119 | logger = logging.getLogger(__name__) 120 | c2s, s2c = flows.splitDirections() # type: List[L4NetworkMessage], List[L4NetworkMessage] 121 | 122 | # discard constant and random offsets 123 | self._c2sEntropyFiltered = type(self).entropyFilteredOffsets(c2s, 1) 124 | self._s2cEntropyFiltered = type(self).entropyFilteredOffsets(s2c, 1) 125 | logger.info(f"c2sEntropyFiltered offsets: {self.c2sEntropyFiltered}") 126 | logger.info(f"s2cEntropyFiltered offsets: {self.s2cEntropyFiltered}") 127 | 128 | # compute Q->R association 129 | mqr = flows.matchQueryResponse() 130 | if len(mqr) < 2: 131 | # not enough query response pairs to continue analysis: create valid empty instance state and return 132 | self._qrCausality = dict() 133 | self._filteredCausality = dict() 134 | self._mergingOffsets = list() 135 | self._mergedCausality = dict() 136 | self._msgtypeRanges = list() 137 | self._segments = list() 138 | return 139 | # Mutual information 140 | self._qrCausality = qrAssociationCorrelation(mqr) 141 | # filter: only if offset is in c2sEntropyFiltered/s2cEntropyFiltered and the causality is greater than the causalityThresh 142 | self._filteredCausality = {offset: self.qrCausality[offset] for offset in 143 | set(self.c2sEntropyFiltered).intersection(self.s2cEntropyFiltered) 144 | if self.qrCausality[offset] > type(self).causalityThresh} 145 | # filteredCausality are offsets of MSG-Type candidate n-grams 146 | logger.info(f"filtered causality: {sorted(self.filteredCausality.items())}") 147 | 148 | # Merge n-grams above causality threshold and check correlation 149 | self._mergingOffsets = list() 150 | for offset in sorted(self.filteredCausality.keys()): 151 | self._mergingOffsets.append(offset) 152 | qMergedField, rMergedField = verticalByteMerge(mqr, self.offsets) 153 | mergedCausality = mutualInformationNormalized(qMergedField, rMergedField) 154 | if mergedCausality <= type(self).causalityThresh: 155 | # Filter problematic n-grams 156 | self._mergingOffsets.pop() 157 | # re-calculate in case the last iteration removed a problematic n-gram 158 | qMergedField, rMergedField = verticalByteMerge(mqr, self.offsets) 159 | self._mergedCausality = mutualInformationNormalized(qMergedField, rMergedField) 160 | logger.info(f"mergedCausality: {self.mergedCausality}") 161 | logger.info(f" mergedOffsets: {self._mergingOffsets}") 162 | logger.info(f" from offsets: {sorted(self.filteredCausality.keys())}") 163 | 164 | # create segments from bytes in mergingOffsets 165 | self._msgtypeRanges = list2ranges(self.offsets) 166 | self._segments = type(self)._posLen2segments(c2s + s2c, self._msgtypeRanges) 167 | 168 | 169 | @property 170 | def s2cEntropyFiltered(self) -> List[int]: 171 | """ 172 | :return: The offsets for which the vertical entropies of all the server to client messages are 173 | greater than zero and less than MSGtype.entropyThresh 174 | """ 175 | return self._s2cEntropyFiltered 176 | 177 | @property 178 | def c2sEntropyFiltered(self) -> List[int]: 179 | """ 180 | :return: The offsets for which the vertical entropies of all the client to server messages are 181 | greater than zero and less than MSGtype.entropyThresh 182 | """ 183 | return self._c2sEntropyFiltered 184 | 185 | @property 186 | def qrCausality(self) -> Dict[int,float]: 187 | return self._qrCausality 188 | 189 | @property 190 | def filteredCausality(self) -> Dict[int,float]: 191 | return 
self._filteredCausality 192 | 193 | @property 194 | def mergedCausality(self) -> List[int]: 195 | return self._mergedCausality 196 | 197 | @property 198 | def offsets(self): 199 | """ 200 | :return: Final result as individual byte offsets of offsets that are MSG-Types 201 | """ 202 | return self._mergingOffsets 203 | 204 | @property 205 | def ranges(self) -> List[Tuple[int, int]]: 206 | """ 207 | :return: Final result as ranges of offsets that are MSG-Types 208 | """ 209 | return self._msgtypeRanges 210 | 211 | 212 | class MSGlen(NonConstantNonRandomEntropyFieldType): 213 | """ 214 | Message length (MSG-Len) inference (FH, Section 3.2.2, Fig. 3 center). 215 | Find values in the message that linearly correlate with the application-layer message size. 216 | 217 | Properties enable access to intermediate and final results. 218 | """ 219 | typelabel = "MSG-Len" 220 | # coefficient threshold 0.6 (FH, Section 3.2.2) 221 | minCorrelation = 0.6 222 | # MSG-Len hypothesis threshold 0.9 (FH, Section 3.2.2) 223 | lenhypoThresh = 0.9 224 | 225 | def __init__(self, flows: Flows): 226 | super().__init__() 227 | 228 | self._msgDirection = list() 229 | c2s, s2c = flows.splitDirections() # type: List[L4NetworkMessage], List[L4NetworkMessage] 230 | # per direction - for MSG-Len this is pointless, but the paper says to do it. 231 | # it might rather be useful to separate message types (distinct formats) in this manner. 232 | for direction in [c2s, s2c]: 233 | self._msgDirection.append(type(self).Direction(direction)) 234 | 235 | @property 236 | def acceptedCandidatesPerDir(self) -> List[Dict[int, int]]: 237 | return [mldir.acceptedCandidates for mldir in self._msgDirection] 238 | 239 | @property 240 | def segments(self) -> List[List[TypedSegment]]: 241 | return list(chain.from_iterable([mldir.segments for mldir in self._msgDirection])) 242 | 243 | class Direction(object): 244 | """ 245 | Encapsulates direction-wise inference of MSGlen fields. 246 | Roughly corresponds to either the S2C-collection or C2S-collection branch 247 | depicted in the flow graph of FH, Fig. 3 center. 248 | 249 | Provides methods to extract different size collections, finding candidates by Pearson correlation coefficient, 250 | and verifying the hypothesis of candidates denoting the length of the message. 251 | """ 252 | # TODO also support little endian (for our test traces, it was irrelevant) 253 | endianness = 'big' 254 | 255 | def __init__(self, direction: List[L4NetworkMessage]): 256 | self._direction = direction 257 | # noinspection PyTypeChecker 258 | self._msgbylen = None # type: Dict[int, List[L4NetworkMessage]] 259 | """Homogeneous Size Collections""" 260 | # noinspection PyTypeChecker 261 | self._msgmixlen = None # type: List[L4NetworkMessage] 262 | # noinspection PyTypeChecker 263 | self._candidateAtNgram = None # type: Dict[int, List[int]] 264 | # noinspection PyTypeChecker 265 | self._acceptedCandidates = None # type: Dict[int, int] 266 | """Associates offset with a field length (n-gram's n) to define a list of unambiguous MSG-Len candidates""" 267 | # noinspection PyTypeChecker 268 | self._acceptedX = None # type: Dict[int, numpy.ndarray] 269 | """Maps offsets to (a,b) that solve the linear equation in #verifyCandidates() 270 | (FH: 'Msg. Len. 
Model Parameters')""" 271 | 272 | self.differentSizeCollections() 273 | self.findCandidates() 274 | self.verifyCandidates() 275 | 276 | # create segments for each accepted candidate 277 | self._segments = MSGlen._posLen2segments(self._direction, self.acceptedCandidates.items()) 278 | 279 | def differentSizeCollections(self): 280 | """ 281 | "stratifying messages by length": extract different size collection -> vector of message lengths 282 | 283 | :return: List of messages that contains an equal amount of messages of each length, 284 | i. e., List of according message lengths 285 | """ 286 | if len(self._direction) == 0: # "No flows in this direction." 287 | self._msgmixlen = list() 288 | return 289 | keyfunc = lambda m: len(m.data) 290 | # Homogeneous Size Collections 291 | self._msgbylen = {k: list(v) for k, v in groupby(sorted(self._direction, key=keyfunc), keyfunc)} 292 | minCollSize = min(len(v) for v in self._msgbylen.values()) 293 | # generate size-heterogeneous collection by random sampling 294 | msgmixlen = list() 295 | for k, v in self._msgbylen.items(): 296 | random.seed(42) 297 | if len(v) > minCollSize: 298 | msgmixlen.extend(random.sample(v, k=minCollSize)) 299 | else: 300 | msgmixlen.extend(v) 301 | self._msgmixlen = msgmixlen 302 | 303 | def findCandidates(self): 304 | """ 305 | Find message-length candidates (in parenthesis: block names from FH, Fig. 3 center): 306 | * filter for message offsets where the n-gram is not constant and not random (Entropy Filter) 307 | * correlate n-grams to message lengths (Pearson Correlation) 308 | 309 | :return: The offsets (dict value: list) where the Pearson Correlation 310 | exceeds the threshold MSGlen.minCorrelation 311 | for different sized n-grams (dict key). 312 | """ 313 | # "Extract Vector of Message Length" 314 | lens4msgmix = [len(m.data) for m in self._msgmixlen] # type: List[int] 315 | candidateAtNgram = dict() 316 | # iterate n-grams' n=32, 24, 16 bits (4, 3, 2 bytes), see 3.1.2 317 | for n in [4, 3, 2]: 318 | # entropy filter for each n-gram offset for "Field Values Matrix" below 319 | offsets = MSGlen.entropyFilteredOffsets(self._msgmixlen, n) 320 | # TODO currently only tested for big endian, see #intsFromNgrams 321 | # TODO for textual protocols decode the n-gram as (ASCII) number (FH, Sec. 3.2.2, second paragraph) 322 | ngIters = (intsFromNgrams( 323 | iterateSelected(NgramIterator(msg, n), offsets), type(self).endianness) for msg in self._msgmixlen) 324 | # "Field Values Matrix" 325 | ngramsAtOffsets = numpy.array(list(ngIters)) 326 | 327 | # correlate columns of ngramsAtOffsets to lens4msgmix 328 | pearsonAtOffset = list() 329 | for ngrams in ngramsAtOffsets.T: 330 | # Pearson correlation coefficient (numeric value of n-gram) -> (len(msg.data)) 331 | pearsonAtOffset.append(pearsonr(ngrams, lens4msgmix)[0]) 332 | candidateAtNgram[n] = [o for pao, o in zip(pearsonAtOffset, offsets) if pao > MSGlen.minCorrelation] 333 | self._candidateAtNgram = candidateAtNgram 334 | 335 | def verifyCandidates(self): 336 | """ 337 | Verify the length-hypothesis for candidates, by solving the linear equation 338 | for values at the candidate n-grams in candidateAtNgram (precedence for larger n, i. e., longer fields): 339 | 340 | MSG_len = a * value + b (a > 0, b \in N) - "Msg. Len. Model Parameters" 341 | lens4msgmix = ngramsAtOffsets[:,candidateAtNgram[n]] * a + 1 * b 342 | 343 | At least a threshold 0.9 of the message pairs with different lengths has to fulfill the hypothesis. 
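Worked example (illustrative values, not from FH): assume two messages of lengths 16 and 32 whose 2-byte candidate n-grams decode to the integers 16 and 32. The resulting system

    16 * a + b = 16
    32 * a + b = 32

is solved by a = 1 and b = 0, which satisfies the acceptance criterion (a > 0 and b an integer).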
344 | """ 345 | acceptedCandidates = dict() # type: Dict[int, int] 346 | acceptedX = dict() 347 | # specifying found acceptable solutions at offset (key) with n (value) for this direction 348 | for n in [4, 3, 2]: 349 | for offset in self._candidateAtNgram[n]: 350 | # check precedence: if a longer, already-accepted n-gram overlaps this offset, ignore it 351 | # noinspection PyTypeChecker 352 | if not MSGlen.checkPrecedence(offset, n, acceptedCandidates.items()): 353 | continue 354 | # MSG-len hypothesis test - for ALL message pairs with different lengths (FH, 3.2.2 last paragraph) 355 | # - for the n-grams from this offset - keep only those offsets where the threshold of pairs holds 356 | solutionAcceptable = dict() # type: Dict[Tuple[AbstractMessage, AbstractMessage], bool] 357 | Xes = list() 358 | for l1, l2 in combinations(self._msgbylen.keys(),2): 359 | for msg0, msg1 in product(self._msgbylen[l1], self._msgbylen[l2]): 360 | ngramPair = [msg0.data[offset:offset + n], msg1.data[offset:offset + n]] 361 | if ngramPair[0] == ngramPair[1]: 362 | solutionAcceptable[(msg0, msg1)] = False 363 | continue 364 | A = numpy.array( [intsFromNgrams(ngramPair), [1, 1]] ).T 365 | B = numpy.array( [len(msg0.data), len(msg1.data)] ) 366 | try: # solve the linear equation 367 | X = numpy.linalg.inv(A).dot(B) 368 | solutionAcceptable[(msg0, msg1)] = X[0] > 0 and X[1].is_integer() 369 | Xes.append(X) 370 | except numpy.linalg.LinAlgError: 371 | logging.getLogger(__name__).warning("LinAlgError occurred. Solution considered as non-acceptable.") 372 | solutionAcceptable[(msg0, msg1)] = False 373 | Xarray = numpy.array(Xes) 374 | logging.getLogger(__name__).debug(f"Checking candidate with n = {n} at offset {offset}.") 375 | if type(self)._candidateIsAcceptable(solutionAcceptable, Xarray): 376 | acceptedCandidates[offset] = n 377 | acceptedX[offset] = Xarray 378 | self._acceptedCandidates = acceptedCandidates 379 | self._acceptedX = acceptedX 380 | 381 | @staticmethod 382 | def _candidateIsAcceptable(solutionAcceptable: Dict[Tuple[AbstractMessage, AbstractMessage], bool], 383 | Xarray: numpy.ndarray): 384 | """ 385 | Count the message pairs for which the solution is acceptable according to the MSG-len hypothesis test. 386 | 387 | :param solutionAcceptable: results of the MSG-len hypothesis test for the Cartesian product of messages. 388 | :return: Whether the fraction of acceptable message pairs exceeds MSGlen.lenhypoThresh. 389 | """ 390 | acceptCount = Counter(solutionAcceptable.values()) 391 | return bool(acceptCount[True] / len(solutionAcceptable) > MSGlen.lenhypoThresh) 392 | 393 | @property 394 | def acceptedCandidates(self) -> Dict[int, int]: 395 | """Associates offset with a field length (n-gram's n) to define a list of unambiguous MSG-Len candidates""" 396 | return self._acceptedCandidates 397 | 398 | @property 399 | def segments(self): 400 | return self._segments 401 | 402 | @staticmethod 403 | def checkPrecedence(offset: int, n: int, ngrams: Iterable[Tuple[int, int]]): 404 | """ 405 | Has the n-gram at offset precedence over all n-grams in ngrams? 406 | 407 | :param offset: Offset of the n-gram to check. 408 | :param n: Length (n) of the n-gram to check. 409 | :param ngrams: Offset and n for a list of already accepted n-grams. 410 | :return: True if the n-gram at offset overlaps none of the given n-grams, False otherwise. 411 | """ 412 | for o1, n1 in ngrams: 413 | if ngramIsOverlapping(offset, n, o1, n1): 414 | return False 415 | return True 416 | 417 | 418 | class CategoricalCorrelatedField(FieldType,ABC): 419 | """ 420 | Abstract class for inferring field types using categorical correlation of n-gram values with external values, e. g., 421 | environmental information like addresses from encapsulation. 
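The correlation metric per n-gram offset is the categorical correlation R(x, y) = I(x: y)/H(x, y). A self-contained sketch of this computation with pyitlib (illustrative values only, not a doctest):

    import numpy
    from pyitlib import discrete_random_variable as drv

    ngramInts = [1, 1, 2, 2]      # n-gram values at one offset, one per message
    extValues = [10, 10, 20, 20]  # per-message external values, e. g., source IPs as integers
    joint = numpy.array([ngramInts, extValues])
    catCorr = drv.information_mutual(joint[0], joint[1]) / drv.entropy_joint(joint)
    # n-gram and external value determine each other completely here, so catCorr == 1.0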
422 | 423 | CategoricalCorrelatedField#correlate() uses #_values2correlate2() to determine what to correlate. 424 | This is different for the subclasses (HostID, SessionID). It iterates the n-grams (n=1) and creates 425 | n-grams-to-source-IP-tuples for Host-ID or (n-grams,(source IP,destination IP))-tuples for Session-ID. 426 | It correlates the n-grams to the respective tuple by calculating the catCorr for the 427 | n-gram and the source/destination tuple. 428 | """ 429 | correlationThresh = 0.9 # 0.9, threshold for correlation between host ID and IP address(es) (FH, Sec. 3.2.3) 430 | minLenThresh = 4 # host ID fields must at least be 4 bytes long (FH, Sec. 3.2.3) 431 | 432 | def __init__(self, messages: List[L4NetworkMessage]): 433 | super().__init__() 434 | self._messages = messages 435 | filteredMessages = type(self)._filterMessages(messages) 436 | # We correlate only the filtered messages... 437 | self._categoricalCorrelation = type(self).correlate(filteredMessages) 438 | self._catCorrPosLen = type(self).catCorrPosLen(self._categoricalCorrelation) 439 | # ... but use the positions to generate segments for all messages. This might not always be wise. 440 | self._segments = type(self)._posLen2segments(self._messages, self._catCorrPosLen) 441 | 442 | @classmethod 443 | def _filterMessages(cls, messages: List[L4NetworkMessage]): 444 | """ 445 | Filter messages used to correlate in the first place. To be overwritten by subclasses. 446 | This basic implementation passes the input list unchanged. 447 | """ 448 | return messages 449 | 450 | @classmethod 451 | @abstractmethod 452 | def _values2correlate2(cls, messages: List[L4NetworkMessage]) -> List[int]: 453 | """ 454 | Implement to determine the external values to correlate the n-grams of messages with. 455 | 456 | :param messages: Messages for which to generate correlation values. 457 | :return: The list of values, one for each message in the given order, to correlate to. 458 | """ 459 | raise NotImplementedError("Implement this abstract class method in a subclass.") 460 | 461 | @classmethod 462 | def correlate(cls, messages: List[L4NetworkMessage], n: int = 1): 463 | # noinspection PyShadowingNames 464 | r""" 465 | Generate n-grams at the same offsets for each message an correlate each n-gram using 466 | categorical correlation: R(x, y) = I(x: y)/H(x, y) \in [0,1] 467 | Uses cls#n to determine the n-gram sizes and cls#_values2correlate2() to obtain tuples of data to correlate. 468 | 469 | >>> from fieldhunter.inference.fieldtypes import SessionID 470 | >>> from netzob.Model.Vocabulary.Messages.L4NetworkMessage import L4NetworkMessage 471 | >>> messages = [ 472 | ... L4NetworkMessage(b"session111\x42\x17\x23\x00\x08\x15", 473 | ... l3SourceAddress="1.2.3.100", l3DestinationAddress="1.2.3.1"), 474 | ... L4NetworkMessage(b"session111xe4\x83\x82\x85\xbf", 475 | ... l3SourceAddress="1.2.3.100", l3DestinationAddress="1.2.3.1"), 476 | ... L4NetworkMessage(b"session111\x42\x17\xf9\x0b\x00b\x12O", 477 | ... l3SourceAddress="1.2.3.100", l3DestinationAddress="1.2.3.1"), 478 | ... L4NetworkMessage(b"session222\x42\x17Jk\x8a1e\xb5", 479 | ... l3SourceAddress="1.2.3.2", l3DestinationAddress="1.2.3.100"), 480 | ... L4NetworkMessage(b"session222L\xab\x83\x1a\xef\x13", 481 | ... l3SourceAddress="1.2.3.2", l3DestinationAddress="1.2.3.100"), 482 | ... 
] 483 | >>> SessionID.correlate(messages) 484 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.21851654863632566, 0.21851654863632566, 0.4181656600790516, 0.4181656600790516, 0.4181656600790516, 0.4181656600790516] 485 | 486 | :param messages: Messages to generate n-grams to correlate to. 487 | :param n: Host-ID uses 8-bit/1-byte n-grams according to FH, Sec. 3.1.2, but this does not work well 488 | (see fieldtypesRelaxed.CategoricalCorrelatedField) 489 | :return: Correlation values for each offset of n-grams generated from the messages. 490 | """ 491 | # ngram at offset and src address 492 | ngramsSrcs = list() 493 | categoricalCorrelation = list() 494 | corrValues = cls._values2correlate2(messages) 495 | # Iterate n-grams of all messages 496 | for ngrams in zip(*(NgramIterator(msg, n=n) for msg in messages)): 497 | ngSc = cls._combineNgrams2Values(ngrams, corrValues) 498 | if ngSc.size > 0: 499 | # categoricalCorrelation: R(x, y) = I(x: y)/H(x, y) \in [0,1] 500 | catCorr = drv.information_mutual(ngSc[0], ngSc[1]) / drv.entropy_joint(ngSc) 501 | else: 502 | catCorr = numpy.nan 503 | ngramsSrcs.append(ngSc) 504 | categoricalCorrelation.append(catCorr) 505 | return categoricalCorrelation 506 | 507 | @classmethod 508 | def catCorrPosLen(cls, categoricalCorrelation: List[float]): 509 | """ 510 | Merge consecutive candidate n-grams with categoricalCorrelation > correlationThresh. 511 | Filters n-gram offsets on defined thresholds (FH, Sec. 3.2.3) by their categorical correlation values to 512 | * correlation between host ID and IP address(es) > correlationThresh 513 | * discard short fields < minHostLenThresh 514 | 515 | :param categoricalCorrelation: Correlation values for each offset of n-grams generated from the messages. 516 | :return: List of start-length tuples with categorical correlation above threshold and not being a short field. 517 | """ 518 | catCorrOffsets = [ offset for offset, catCorr in enumerate(categoricalCorrelation) 519 | if catCorr > cls.correlationThresh ] 520 | catCorrRanges = list2ranges(catCorrOffsets) 521 | # discard short fields < minHostLenThresh 522 | return [ (start, length) for start, length in catCorrRanges if length >= cls.minLenThresh ] 523 | 524 | @property 525 | def categoricalCorrelation(self): 526 | # !! The attribute self._categoricalCorrelation needs to be defined in subclass init !! 527 | # noinspection PyUnresolvedReferences 528 | return self._categoricalCorrelation 529 | 530 | @classmethod 531 | def _combineNgrams2Values(cls, ngrams: Iterable[bytes], values: List[int]) -> numpy.ndarray: 532 | return numpy.array([intsFromNgrams(ngrams), values]) 533 | 534 | @classmethod 535 | def _srcDstBytes(cls, messages: List[L4NetworkMessage]): 536 | return [ ( 537 | bytes(map(int, msg.source.rpartition(':')[0].split('.'))), 538 | bytes(map(int, msg.destination.rpartition(':')[0].split('.'))) 539 | ) for msg in messages] 540 | 541 | 542 | class HostID(CategoricalCorrelatedField): 543 | """ 544 | Host identifier (Host-ID) inference (FH, Sec. 3.2.3) 545 | Find n-gram that is strongly correlated with IP address of sender. 546 | """ 547 | typelabel = 'Host-ID' 548 | 549 | @classmethod 550 | def _values2correlate2(cls, messages: List[L4NetworkMessage]): 551 | """ 552 | Recover byte representations of the IPv4 addresses from all Netzob messages and make one int out if each. 553 | :param messages: Messages to generate n-grams to correlate to. 
554 | :return: 555 | """ 556 | return intsFromNgrams(src for src, dst in cls._srcDstBytes(messages)) 557 | 558 | 559 | class SessionID(CategoricalCorrelatedField): 560 | """ 561 | Session identifier (Session-ID) inference (FH, Section 3.2.4) 562 | Find n-gram that is strongly correlated with IP addresses of sender and receiver 563 | using categorical correlation like Host-ID. 564 | 565 | Most of FH, Section 3.2.4, refers to Host-ID, so we use all missing details from there and reuse the implementation. 566 | The only difference are the values to correlate (see #_values2correlate2()) 567 | """ 568 | typelabel = 'Session-ID' 569 | 570 | @classmethod 571 | def _values2correlate2(cls, messages: List[L4NetworkMessage]): 572 | """ 573 | Get source AND destination addresses in the same manner as (just) the source for Host-ID. 574 | Recover byte representations of the IPv4 addresses from all Netzob messages and make one int out if each. 575 | 576 | :param messages: Messages to generate n-grams to correlate to. 577 | :return: integer representation of source and destination addresses for each message. 578 | """ 579 | return intsFromNgrams(src+dst for src, dst in cls._srcDstBytes(messages)) 580 | 581 | 582 | class TransID(FieldType): 583 | """ 584 | Transaction identifier (Trans-ID) inference (FH, Section 3.2.5, Fig. 3 right) 585 | """ 586 | typelabel = 'Trans-ID' 587 | 588 | transSupportThresh = 0.8 # enough support in conversations (FH, Sec. 3.2.5) 589 | minFieldLength = 2 # merged n-grams must at least be this amount of bytes long 590 | # n-gram size is not explicitly given in FH, but the description (merging, sharp drops in entropy in Fig. 6) 591 | # leads to assuming it should be 1. 592 | n = 1 593 | entropyThresh = 0.6 # Value is not given in FH paper! 594 | """ 595 | entropy in c2s/s2c + flows: threshold for high entropy is not given in FH! 596 | We use a value determined by own empirics: see entropy plots from src/trace_statistics.py 597 | """ 598 | absoluteEntropy = True 599 | 600 | def __init__(self, flows: Flows): 601 | super().__init__() 602 | 603 | # prepare instance attributes 604 | self._flows = flows 605 | self._c2s, self._s2c = self._flows.splitDirections() # type: List[L4NetworkMessage], List[L4NetworkMessage] 606 | self._c2sEntropyFiltered = None 607 | self._s2cEntropyFiltered = None 608 | self._c2sConvsEntropyFiltered = dict() 609 | self._s2cConvsEntropyFiltered = dict() 610 | self._c2sHorizontalOffsets = None 611 | self._s2cHorizontalOffsets = None 612 | self._c2sCombinedOffsets = None 613 | self._s2cCombinedOffsets = None 614 | self._valuematch = dict() 615 | # noinspection PyTypeChecker 616 | self._c2sConsistentRanges = None # type: Iterable[Tuple[int, int]] 617 | # noinspection PyTypeChecker 618 | self._s2cConsistentRanges = None # type: Iterable[Tuple[int, int]] 619 | 620 | # Infer 621 | self._verticalAndHorizontalRandomNgrams() 622 | self._constantQRvalues() 623 | self._consistentCandidates() 624 | # TODO not needed for textual protocols (FH, Sec. 
3.2.5, last sentence) 625 | self._c2sConsistentRanges = type(self)._mergeAndFilter(self._c2sConsistentCandidates) 626 | self._s2cConsistentRanges = type(self)._mergeAndFilter(self._s2cConsistentCandidates) 627 | self._segments = \ 628 | type(self)._posLen2segments(self._c2s, self._c2sConsistentRanges) + \ 629 | type(self)._posLen2segments(self._s2c, self._s2cConsistentRanges) 630 | 631 | @classmethod 632 | def entropyFilteredOffsets(cls, messages: List[AbstractMessage], absolute=True): 633 | """ 634 | Find offsets of n-grams (with the same offset in different messages of the list) that are random, 635 | i. e., that have an entropy greater than the threshold (cls.entropyThresh, applied as absolute value or relative to the maximum entropy). 636 | 637 | FH, Section 3.2.5 638 | 639 | :param messages: Messages to generate n-grams from 640 | :param absolute: Use the absolute constant for the threshold if true, 641 | make it relative to the maximum entropy if False. 642 | :return: A list of offsets that have random entropy, i. e., entropy above entropyThresh. 643 | """ 644 | if len(messages) > 0: 645 | entropy = pyitNgramEntropy(messages, cls.n) 646 | entropyThresh = cls.entropyThresh if absolute else max(entropy) * cls.entropyThresh 647 | return [offset for offset, e in enumerate(entropy) if e > entropyThresh] 648 | else: 649 | return [] 650 | 651 | def _verticalAndHorizontalRandomNgrams(self): 652 | """ 653 | Determine n-grams that are "random across vertical and horizontal collections" (FH, Sec. 3.2.5). 654 | 655 | Output is written to self._c2sCombinedOffsets and self._s2cCombinedOffsets. 656 | Moreover, intermediate results are persisted in instance attributes for evaluation. 657 | """ 658 | logger = logging.getLogger(__name__) 659 | # vertical collections 660 | c2s, s2c = self._flows.splitDirections() # type: List[L4NetworkMessage], List[L4NetworkMessage] 661 | self._c2sEntropyFiltered = type(self).entropyFilteredOffsets(c2s, type(self).absoluteEntropy) 662 | self._s2cEntropyFiltered = type(self).entropyFilteredOffsets(s2c, type(self).absoluteEntropy) 663 | logger.info(f"c2sEntropyFiltered offsets: {self._c2sEntropyFiltered}") 664 | logger.info(f"s2cEntropyFiltered offsets: {self._s2cEntropyFiltered}") 665 | 666 | # # DEBUGGING horizontal collections: intermediate entropy of n-grams 667 | # self._c2sConvsEntropy = dict() 668 | # for key, conv in self._flows.c2sInConversations().items(): 669 | # self._c2sConvsEntropy[key] = pyitNgramEntropy(conv, type(self).n) 670 | # self._s2cConvsEntropy = dict() 671 | # for key, conv in self._flows.s2cInConversations().items(): 672 | # self._s2cConvsEntropy[key] = pyitNgramEntropy(conv, type(self).n) 673 | # print('_c2sConvsEntropy') 674 | # pprint(self._c2sConvsEntropy) 675 | # print('_s2cConvsEntropy') 676 | # pprint(self._s2cConvsEntropy) 677 | # 678 | # horizontal collections: entropy of n-gram per the same offset in all messages of one flow direction 679 | self._c2sConvsEntropyFiltered = type(self)._horizontalRandomNgrams( 680 | self._flows.c2sInConversations(), self._c2sEntropyFiltered) 681 | self._s2cConvsEntropyFiltered = type(self)._horizontalRandomNgrams( 682 | self._flows.s2cInConversations(), self._s2cEntropyFiltered) 683 | logger.info('c2sConvsEntropyFiltered: ' + repr(self._c2sConvsEntropyFiltered.values())) 684 | logger.info('s2cConvsEntropyFiltered: ' + repr(self._s2cConvsEntropyFiltered.values())) 685 | 686 | # intersection of all c2s and s2c filtered offset lists (per flow) 687 | c2sOffsetLists = [set(offsetlist) for offsetlist in 
697 |     @classmethod
698 |     def _horizontalRandomNgrams(cls, conversations: Dict[tuple, List[AbstractMessage]],
699 |                                 verticalEntropyFiltered: List[int]) -> Dict[Union[Tuple, None], List[int]]:
700 |         """
701 |         Keep only those offsets that are random horizontally, i.e., within each single conversation.
702 | 
703 |         :param conversations: Messages of one flow direction, grouped into conversations.
704 |         :param verticalEntropyFiltered: Offsets that passed the vertical entropy filter (not evaluated here).
705 |         :return: Entropy-filtered ("horizontally random") offsets per conversation key.
706 |         """
707 |         filteredOutput = dict()
708 |         # horizontal collections: entropy of the n-grams at the same offset in all messages of one conversation
709 |         for key, conv in conversations.items():
710 |             filteredOutput[key] = cls.entropyFilteredOffsets(conv, cls.absoluteEntropy)
711 |         return filteredOutput
712 | 
713 |     def _constantQRvalues(self):
714 |         """
715 |         Request/Response pairs: search for n-grams with constant values (differing offsets allowed).
716 | 
717 |         Output is placed in self._valuematch.
718 |         """
719 |         # compute the Q->R association
720 |         mqr = self._flows.matchQueryResponse()
721 |         # from the n-gram offsets that passed the entropy filters, determine those that have the same value in mqr pairs
722 |         for query, resp in mqr.items():
723 |             qrmatchlist = self._valuematch[(query, resp)] = list()
724 |             # value in query at any of the offsets in _c2sCombinedOffsets
725 |             for c2sOffset in self._c2sCombinedOffsets:
726 |                 if len(query.data) < c2sOffset + type(self).n:
727 |                     continue
728 |                 qvalue = query.data[c2sOffset:c2sOffset + type(self).n]
729 |                 # matches a value of resp at any of the offsets in _s2cCombinedOffsets
730 |                 for s2cOffset in self._s2cCombinedOffsets:
731 |                     if len(resp.data) < s2cOffset + type(self).n:
732 |                         continue
733 |                     rvalue = resp.data[s2cOffset:s2cOffset + type(self).n]
734 |                     if qvalue == rvalue:
735 |                         qrmatchlist.append((c2sOffset, s2cOffset))
736 | 
737 |     def _consistentCandidates(self):
738 |         """
739 |         Measure consistency: offsets recognized in more than a transSupportThresh fraction of the conversations.
740 | 
741 |         Output is written to self._c2sConsistentCandidates and self._s2cConsistentCandidates.
742 |         """
743 |         c2sCandidateCount = Counter()
744 |         s2cCandidateCount = Counter()
745 |         for offsetlist in self._valuematch.values():  # (query, resp), offsetlist
746 |             if len(offsetlist) < 1:
747 |                 continue
748 |             # transpose to offsets per direction
749 |             c2sOffsets, s2cOffsets = zip(*offsetlist)
750 |             c2sCandidateCount.update(set(c2sOffsets))
751 |             s2cCandidateCount.update(set(s2cOffsets))
752 |         self._c2sConsistentCandidates = [offset for offset, cc in c2sCandidateCount.items() if
753 |                                          cc > type(self).transSupportThresh * len(self._c2s)]
754 |         self._s2cConsistentCandidates = [offset for offset, cc in s2cCandidateCount.items() if
755 |                                          cc > type(self).transSupportThresh * len(self._s2c)]
756 | 
757 |     @classmethod
758 |     def _mergeAndFilter(cls, consistentCandidates):
759 |         """
760 |         Merge consecutive candidate offsets into ranges and filter the ranges by the minimum field length.
761 |         """
762 |         return [ol for ol in list2ranges(consistentCandidates) if ol[1] >= cls.minFieldLength]
763 | 
764 | 
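The merging step in _mergeAndFilter() relies on list2ranges() from fieldhunter.utils.base. The stand-alone sketch below only illustrates the assumed behavior, collapsing consecutive 1-gram offsets into (offset, length) runs and dropping runs shorter than minFieldLength; the helper name merge_and_filter is made up for this example.

    def merge_and_filter(offsets, min_len=2):
        """Collapse sorted offsets into (start, length) runs and drop runs shorter than min_len."""
        ranges, start, prev = [], None, None
        for o in sorted(offsets):
            if start is None:
                start = prev = o
            elif o == prev + 1:      # still consecutive: extend the current run
                prev = o
            else:                    # gap: close the current run and open a new one
                ranges.append((start, prev - start + 1))
                start = prev = o
        if start is not None:
            ranges.append((start, prev - start + 1))
        return [r for r in ranges if r[1] >= min_len]

    # merge_and_filter([4, 5, 6, 12]) == [(4, 3)]  -> the isolated offset 12 is too short and is dropped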
765 | class Accumulator(FieldType):
766 |     """
767 |     Accumulator inference (FH, Section 3.2.6)
768 | 
769 |     "Accumulators are fields that have increasing values over consecutive message within the same conversation."
770 |     (FH, Sec. 3.2.6)
771 |     """
772 |     typelabel = 'Accumulator'
773 | 
774 |     # TODO also support little endian (for our test traces, it was irrelevant)
775 |     endianness = 'big'
776 |     ns = (8, 4, 3, 2)
777 |     deltaEntropyThresh = 0.8  # Not given in FH, own empirics: 0.2
778 | 
779 |     def __init__(self, flows: Flows):
780 |         super(Accumulator, self).__init__()
781 | 
782 |         # c2s and s2c independently
783 |         self._c2sConvs = {key: list(sorted(conv, key=lambda m: m.date))
784 |                           for key, conv in flows.c2sInConversations().items()}
785 |         self._c2sDeltas = type(self).deltas(self._c2sConvs)
786 |         self._c2sDeltaEntropies = type(self).entropies(self._c2sDeltas)
787 | 
788 |         self._s2cConvs = {key: list(sorted(conv, key=lambda m: m.date))
789 |                           for key, conv in flows.s2cInConversations().items()}
790 |         self._s2cDeltas = type(self).deltas(self._s2cConvs)
791 |         self._s2cDeltaEntropies = type(self).entropies(self._s2cDeltas)
792 | 
793 |         # print('c2sDeltaEntropies (n: offset: value)')
794 |         # pprint(c2sDeltaEntropies)
795 |         # print('s2cDeltaEntropies (n: offset: value)')
796 |         # pprint(s2cDeltaEntropies)
797 | 
798 |         c2s, s2c = flows.splitDirections()  # type: List[L4NetworkMessage], List[L4NetworkMessage]
799 |         self._segments = self._posLen2segments(c2s, type(self).filter(self._c2sDeltaEntropies)) + \
800 |                          self._posLen2segments(s2c, type(self).filter(self._s2cDeltaEntropies))
801 | 
802 |     @classmethod
803 |     def deltas(cls, conversations: Dict[tuple, List[AbstractMessage]]) -> Dict[int, Dict[int, List[int]]]:
804 |         """
805 |         Value deltas per offset and n over all message pairs of all conversations.
806 | 
807 |         :param conversations: Conversations need to be sorted in chronological order for the message pairs to produce
808 |             meaningful deltas.
809 |         :return: Pairwise deltas of values per offset and n-gram size.
810 |         """
811 |         deltas = dict()
812 |         for key, conv in conversations.items():
813 |             if len(conv) < 2:  # need at least two messages in a conversation to form a pair
814 |                 continue
815 |             # subsequent messages per direction per conversation
816 |             for msgA, msgB in zip(conv[:-1], conv[1:]):
817 |                 # iterate n-grams' n = 8, 4, 3, 2
818 |                 # combined from Sec. 3.1.2: n=32, 24, 16 bits (4, 3, 2 bytes)
819 |                 # and see Sec. 3.2.6: n=64, 32, 16 bits (8, 4, 2 bytes)
820 |                 for n in cls.ns:
821 |                     if n not in deltas:
822 |                         deltas[n] = dict()
823 |                     for offset, (ngramA, ngramB) in enumerate(zip(NgramIterator(msgA, n), NgramIterator(msgB, n))):
824 |                         # calculate the delta between the n-grams (n and offset identical) of two subsequent messages
825 |                         # TODO test support for little endian (for our test traces, it was irrelevant)
826 |                         delta = int.from_bytes(ngramB, cls.endianness) - int.from_bytes(ngramA, cls.endianness)
827 |                         if offset not in deltas[n]:
828 |                             deltas[n][offset] = list()
829 |                         deltas[n][offset].append(delta)
830 |         return deltas
831 | 
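A toy illustration of the delta computation above, with made-up bytes: a big-endian 4-byte counter at offset 0 in two subsequent messages of one conversation yields a small positive delta, which is exactly the pattern the entropy filter below is looking for.

    msg_a = bytes.fromhex("0000002a1122")   # counter value 42 in the first four bytes
    msg_b = bytes.fromhex("0000002d3344")   # counter value 45 in the next message
    delta = int.from_bytes(msg_b[0:4], 'big') - int.from_bytes(msg_a[0:4], 'big')
    assert delta == 3   # consistently positive, slowly growing values hint at an accumulator field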
837 | 
838 |         :param deltas: Pairwise deltas between values of subsequent messages in conversations
839 |             at the same offset and with the same length (n): Dict[n, Dict[offset, delta] ].
840 |         :return: Entropies of deltas per n-gram length and offset: Dict[n, Dict[offset, entropy] ].
841 |         """
842 |         lndeltas = dict()
843 |         for n, offdel in deltas.items():
844 |             lndeltas[n] = dict()
845 |             for offset, dlts in offdel.items():
846 |                 # require more than 1 value to calculate a meaningful entropy
847 |                 if len(dlts) < 2:
848 |                     continue
849 |                 npdlts = numpy.array(dlts)
850 |                 # require all deltas to be positive
851 |                 if any(npdlts <= 0):
852 |                     continue
853 |                 # compress the deltas by ln
854 |                 lndeltas[n][offset] = numpy.log(npdlts)
855 |         deltaEntropies = {n: {offset: drv.entropy(dlts)/numpy.log(n*8)
856 |                               for offset, dlts in offdel.items()} for n, offdel in lndeltas.items()}
857 |         return deltaEntropies
858 | 
859 |     @classmethod
860 |     def filter(cls, deltaEntropies: Dict[int, Dict[int, float]]) -> List[Tuple[int, int]]:
861 |         """
862 |         Filter the entropies per n-gram size and offset to yield unambiguous candidates for accumulators.
863 |         Filtering criteria are:
864 |         * "fairly constant": relatively low entropy
865 |         * previous filtering left over offsets for an n
866 |         * prefer larger ns and smaller offsets if candidates are overlapping
867 | 
868 |         :param deltaEntropies: Entropies of deltas per n-gram length and offset: Dict[n, Dict[offset, entropy] ].
869 |         :return: List of offsets and lengths that are valid field candidates.
870 |         """
871 |         # "fairly constant": relatively low entropy -> threshold (value not given in FH)
872 |         filteredDE = {n: {offs: entr for offs, entr in offsdelt.items() if entr < cls.deltaEntropyThresh}
873 |                       for n, offsdelt in deltaEntropies.items()}
874 |         candidates = dict()  # type: Dict[int, List[int]]
875 |         for n in reversed(sorted(filteredDE.keys())):
876 |             # no offsets for this n-gram size
877 |             if len(filteredDE[n]) == 0:
878 |                 continue
879 |             for offset in sorted(filteredDE[n].keys()):
880 |                 # precedence for larger ns and smaller offsets: those are the ones we already found and added to candidates
881 |                 overlaps = False
882 |                 for candN, candOffs in candidates.items():
883 |                     for candO in candOffs:
884 |                         if ngramIsOverlapping(offset, n, candO, candN):
885 |                             overlaps = True
886 |                             break
887 |                     if overlaps:
888 |                         break
889 |                 if overlaps:
890 |                     continue
891 |                 if n not in candidates:
892 |                     candidates[n] = list()
893 |                 candidates[n].append(offset)
894 |         posLen = [(o, n) for n, offsets in candidates.items() for o in offsets]
895 |         return posLen
896 | 
897 | 
898 | # Host-ID will always return a subset of the Session-ID fields, so Host-ID should get precedence
899 | # MSG-Len would be overwritten by MSG-Type (see SMB: nbss.length), so use MSG-Len first
900 | precedence = {MSGlen.typelabel: 0, MSGtype.typelabel: 1, HostID.typelabel: 2,
901 |               SessionID.typelabel: 3, TransID.typelabel: 4, Accumulator.typelabel: 5}
902 | """
903 | The order in which to map field types to messages.
904 | Lower numbers take precedence over higher numbers, so the type with the higher number will be ignored
905 | if both overlap at the same offset range in a message.
906 | """
--------------------------------------------------------------------------------
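Editorial sketch of how the precedence map could be applied when inferred segments of different field types overlap. The attribute fieldtype and the helper name resolve_overlap are assumptions for illustration only; the actual mapping of field types to messages happens in the code that consumes this module.

    def resolve_overlap(segment_a, segment_b):
        """Of two overlapping candidate segments, keep the one whose type label has the lower precedence number."""
        # `fieldtype` is assumed to hold the segment's type label, e.g. TransID.typelabel
        return min(segment_a, segment_b, key=lambda seg: precedence[seg.fieldtype])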