├── reports
│ └── .empty
├── input
├── src
│ ├── fieldhunter
│ │ ├── __init__.py
│ │ ├── utils
│ │ │ ├── __init__.py
│ │ │ ├── eval.py
│ │ │ └── base.py
│ │ └── inference
│ │   ├── __init__.py
│ │   ├── common.py
│ │   ├── fieldtypesRelaxed.py
│ │   └── fieldtypes.py
│ ├── fh.py
│ ├── fh_relaxed.py
│ └── trace_statistics.py
├── lib
│ └── nemere
├── requirements.txt
├── .idea
│ ├── .gitignore
│ ├── vcs.xml
│ ├── misc.xml
│ ├── other.xml
│ ├── inspectionProfiles
│ │ └── profiles_settings.xml
│ ├── modules.xml
│ └── fieldhunter.iml
├── .gitmodules
├── .gitignore
├── setup.cfg
├── eval-traces.sh
├── eval-fh.sh
├── eval-fh-relaxed.sh
└── README.md
/reports/.empty:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/input:
--------------------------------------------------------------------------------
1 | sub/nemere/input/
--------------------------------------------------------------------------------
/src/fieldhunter/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/fieldhunter/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/lib/nemere:
--------------------------------------------------------------------------------
1 | ../sub/nemere/src/nemere/
--------------------------------------------------------------------------------
/src/fieldhunter/inference/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | netzob
2 | pyitlib
3 | openpyxl
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /workspace.xml
3 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "sub/nemere"]
2 | path = sub/nemere
3 | url = git@github.com:vs-uulm/nemesys.git
4 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | reports/
3 | workspace.xml
4 | usage.statistics.xml
5 | shelf/
6 | *.iml
7 | gradle.xml
8 |
9 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [nosetests]
2 | verbosity=2
3 | with-doctest=1
4 | where=src/
5 | doctest-options=+ELLIPSIS
6 | # tests=../tests/, .
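7 | # Assumed usage (not stated in the repository): running `nosetests` from the repository root
8 | # should pick up this [nosetests] section and execute the doctests under src/.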
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/.idea/other.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
--------------------------------------------------------------------------------
/eval-traces.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | #input=input/*-100.pcap
4 | #input=input/*-1000.pcap
5 | #input="input/*-100.pcap input/*-1000.pcap"
6 | #input=input/maxdiff-filtered/*-1000.pcap
7 | #input=input/maxdiff-fromOrig/*-100.pcap
8 | input="input/maxdiff-fromOrig/*-100*.pcap input/deduped-orig/*-100*.pcap"
9 | #input="input/maxdiff-fromOrig/ntp_SMIA-20111010_maxdiff-100.pcap"
10 |
11 |
12 | #tftnext=$(expr 1 + $(ls -d reports/tft-* | sed "s/^.*tft-\([0-9]*\)-.*$/\1/" | sort | tail -1))
13 | #tftnpad=$(printf "%03d" ${tftnext})
14 | #currcomm=$(git log -1 --format="%h")
15 | #report=reports/tft-${tftnpad}-clustering-${currcomm}
16 | #mkdir ${report}
17 |
18 | for fn in ${input} ; do
19 | python src/trace_statistics.py ${fn}
20 | # Give tshark some time to recover
21 | sleep 3
22 | done
23 |
24 | #mv reports/*.csv ${report}/
25 | #mv reports/*.pdf ${report}/
26 |
27 | spd-say "Done!"
28 |
--------------------------------------------------------------------------------
/eval-fh.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | #input=input/*-100.pcap
4 | #input=input/*-1000.pcap
5 | #input="input/*-100.pcap input/*-1000.pcap"
6 | #input=input/maxdiff-filtered/*-1000.pcap
7 | # input=input/maxdiff-fromOrig/*-1000.pcap
8 | #input=input/maxdiff-fromOrig/*-100.pcap
9 | input="input/maxdiff-fromOrig/*-100*.pcap input/deduped-orig/*-100*.pcap"
10 | #input="input/maxdiff-fromOrig/ntp_SMIA-20111010_maxdiff-100.pcap"
11 |
12 |
13 | #tftnext=$(expr 1 + $(ls -d reports/tft-* | sed "s/^.*tft-\([0-9]*\)-.*$/\1/" | sort | tail -1))
14 | #tftnpad=$(printf "%03d" ${tftnext})
15 | #currcomm=$(git log -1 --format="%h")
16 | #report=reports/tft-${tftnpad}-clustering-${currcomm}
17 | #mkdir ${report}
18 |
19 | for fn in ${input} ; do
20 | python src/fh.py ${fn}
21 | # Give tshark some time to recover
22 | sleep 3
23 | done
24 |
25 | #mv reports/*.csv ${report}/
26 | #mv reports/*.pdf ${report}/
27 |
28 | spd-say "Done!"
29 |
--------------------------------------------------------------------------------
/eval-fh-relaxed.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | #input=input/*-100.pcap
4 | #input=input/*-1000.pcap
5 | #input="input/*-100.pcap input/*-1000.pcap"
6 | #input=input/maxdiff-filtered/*-1000.pcap
7 | # input=input/maxdiff-fromOrig/*-1000.pcap
8 | #input=input/maxdiff-fromOrig/*-100.pcap
9 | input="input/maxdiff-fromOrig/*-100*.pcap input/deduped-orig/*-100*.pcap"
10 | #input="input/maxdiff-fromOrig/ntp_SMIA-20111010_maxdiff-100.pcap"
11 |
12 |
13 | #tftnext=$(expr 1 + $(ls -d reports/tft-* | sed "s/^.*tft-\([0-9]*\)-.*$/\1/" | sort | tail -1))
14 | #tftnpad=$(printf "%03d" ${tftnext})
15 | #currcomm=$(git log -1 --format="%h")
16 | #report=reports/tft-${tftnpad}-clustering-${currcomm}
17 | #mkdir ${report}
18 |
19 | for fn in ${input} ; do
20 | python src/fh_relaxed.py ${fn}
21 | # Give tshark some time to recover
22 | sleep 3
23 | done
24 |
25 | #mv reports/*.csv ${report}/
26 | #mv reports/*.pdf ${report}/
27 |
28 | spd-say "Done!"
29 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # FieldHunter
2 |
3 | Re-implementation of parts of the protocol reverse engineering approach FieldHunter (FH) as proposed in
4 |
5 | > Bermudez, Ignacio, Alok Tongaonkar, Marios Iliofotou, Marco Mellia, and Maurizio M. Munafò.
6 | > "Towards Automatic Protocol Field Inference". Computer Communications 84 (June 15, 2016).
7 | > https://doi.org/10.1016/j.comcom.2016.02.015.
8 |
9 | Written by Stephan Kleber,
10 | who also proposed some improvements to the field heuristics in
11 | `inference/fieldtypesRelaxed.py`.
12 | These relaxed heuristics are used by
13 | `src/fh_relaxed.py`,
14 | whose evaluation is run by
15 | `eval-fh-relaxed.sh`.
16 |
17 | The original FieldHunter heuristics are run via
18 | `eval-fh.sh`.
19 |
20 | This re-implementation only covers FH's binary message handling using n-grams (not the textual handling using delimiters).
21 |
22 |
23 | Statistics about the traces can be obtained by running
24 | `eval-traces.sh`.
25 |
26 | A license has not been decided on yet.
27 |
28 | ## Installation
29 |
30 | Clone the repository including the nemere submodule:
31 | ```
32 | git clone --recurse-submodules git@github.com:vs-uulm/nemesys.git
33 | ```
34 |
35 |
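36 | Then install the Python dependencies and run the inference on a trace, for example
37 | (a minimal sketch; the PCAP path is a placeholder and the `PYTHONPATH` setting assumes the
38 | `src/` package layout and the `lib/nemere` link of this repository):
39 | ```
40 | pip install -r requirements.txt
41 | export PYTHONPATH=src:lib
42 | python src/fh.py input/some-trace.pcap
43 | ```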
--------------------------------------------------------------------------------
/.idea/fieldhunter.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
--------------------------------------------------------------------------------
/src/fh.py:
--------------------------------------------------------------------------------
1 | """
2 | Only implements FH's binary message handling using n-grams (not textual using delimiters!)
3 | """
4 |
5 | from argparse import ArgumentParser
6 | from time import time
7 |
8 | # noinspection PyUnresolvedReferences
9 | from tabulate import tabulate
10 | # noinspection PyUnresolvedReferences
11 | from pprint import pprint
12 | # noinspection PyUnresolvedReferences
13 | import IPython
14 |
15 | from nemere.utils.loader import SpecimenLoader
16 | from nemere.utils.evaluationHelpers import StartupFilecheck
17 | from nemere.utils.reportWriter import writeReport
18 | from nemere.validation.dissectorMatcher import MessageComparator, DissectorMatcher
19 |
20 | from fieldhunter.inference.fieldtypes import *
21 | from fieldhunter.inference.common import segmentedMessagesAndSymbols
22 | from fieldhunter.utils.base import Flows
23 | from fieldhunter.utils.eval import FieldTypeReport
24 |
25 |
26 |
27 |
28 | if __name__ == '__main__':
29 | parser = ArgumentParser(
30 | description='Re-Implementation of FieldHunter.')
31 | parser.add_argument('pcapfilename', help='Filename of the PCAP to load.')
32 | parser.add_argument('-i', '--interactive', help='open ipython prompt after finishing the analysis.',
33 | action="store_true")
34 | # Pointless options: FH requires TCP/UDP over IP (FH, Section 6.6)
35 | # parser.add_argument('-l', '--layer', type=int, default=2,
36 | # help='Protocol layer relative to IP to consider. Default is 2 layers above IP '
37 | # '(typically the payload of a transport protocol).')
38 | # parser.add_argument('-r', '--relativeToIP', default=True, action='store_false')
39 | args = parser.parse_args()
40 | layer = 2
41 | relativeToIP = True
42 |
43 | filechecker = StartupFilecheck(args.pcapfilename)
44 |
45 | specimens = SpecimenLoader(args.pcapfilename, layer = layer, relativeToIP = relativeToIP)
46 | # noinspection PyTypeChecker
47 | messages = list(specimens.messagePool.keys()) # type: List[L4NetworkMessage]
48 | flows = Flows(messages)
49 |
50 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
51 | print("Hunting fields in", filechecker.pcapstrippedname)
52 | inferenceStart = time()
53 |
54 | # MSG-type
55 | print("Inferring", MSGtype.typelabel)
56 | msgtypefields = MSGtype(flows)
57 |
58 | # MSG-len
59 | print("Inferring", MSGlen.typelabel)
60 | msglenfields = MSGlen(flows)
61 |
62 | # Host-ID
63 | print("Inferring", HostID.typelabel)
64 | hostidfields = HostID(messages)
65 |
66 | # Session-ID (FH, Section 3.2.4)
67 | print("Inferring", SessionID.typelabel)
68 | sessionidfields = SessionID(messages)
69 |
70 | # Trans-ID (FH, Section 3.2.5)
71 | print("Inferring", TransID.typelabel)
72 | transidfields = TransID(flows)
73 |
74 | # Accumulators (FH, Section 3.2.6)
75 | print("Inferring", Accumulator.typelabel)
76 | accumulatorfields = Accumulator(flows)
77 |
78 | # in order of fieldtypes.precedence!
79 | sortedInferredTypes = sorted(
80 | (msglenfields, msgtypefields, hostidfields, sessionidfields, transidfields, accumulatorfields),
81 | key=lambda l: precedence[l.typelabel] )
82 | segmentedMessages, symbols = segmentedMessagesAndSymbols(sortedInferredTypes, messages)
83 |
84 | inferenceDuration = time() - inferenceStart
85 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
86 | # statistics for all types
87 | print(tabulate(
88 | [(infield.typelabel,
89 | sum(1 for msgsegs in infield.segments if len(msgsegs) > 0),
90 | max(len(msgsegs) for msgsegs in infield.segments)
91 | if len(infield.segments) > 0 else 0 # prevent empty sequence for max()
92 | ) for infield in sortedInferredTypes],
93 | headers=["typelabel","messages","max inferred per msg"]
94 | ))
95 |
96 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
97 |
98 | nontrivialSymbols = [sym for sym in symbols if len(sym.fields) > 1]
99 | comparator = MessageComparator(specimens, layer=layer, relativeToIP=relativeToIP)
100 | print("Dissection complete.")
101 | comparator.pprintInterleaved(nontrivialSymbols)
102 | print(f"\n + {len(symbols)-len(nontrivialSymbols)} messages without any inferred fields.")
103 |
104 | # calc FMS per message
105 | print("Calculate FMS...")
106 | message2quality = DissectorMatcher.symbolListFMS(comparator, symbols)
107 | # write statistics to csv
108 | writeReport(message2quality, inferenceDuration, comparator, "fieldhunter-literal",
109 | filechecker.reportFullPath)
110 |
111 | # FTR validation: calculate TP/FP/FN ==> P/R per protocol and per type
112 | infieldWorkbook = FieldTypeReport.newWorkbook()
113 | for infields in sortedInferredTypes:
114 | infieldReport = FieldTypeReport(infields, comparator, segmentedMessages)
115 | infieldReport.addXLworksheet(infieldWorkbook, FieldTypeReport.ovTitle)
116 | FieldTypeReport.saveWorkbook(infieldWorkbook, filechecker.pcapstrippedname)
117 |
118 | # interactive
119 | if args.interactive:
120 | IPython.embed()
121 |
122 |
--------------------------------------------------------------------------------
/src/fh_relaxed.py:
--------------------------------------------------------------------------------
1 | """
2 | FieldHunter main script with relaxed assumptions (see fieldhunter.inference.fieldtypesRelaxed)
3 |
4 | Only implements FH's binary message handling using n-grams (not textual using delimiters!)
5 | """
6 | import logging
7 | from argparse import ArgumentParser
8 | from time import time
9 |
10 | # noinspection PyUnresolvedReferences
11 | from tabulate import tabulate
12 | # noinspection PyUnresolvedReferences
13 | from pprint import pprint
14 | # noinspection PyUnresolvedReferences
15 | import IPython
16 |
17 | from nemere.utils.loader import SpecimenLoader
18 | from nemere.utils.evaluationHelpers import StartupFilecheck
19 | from nemere.utils.reportWriter import writeReport
20 | from nemere.validation.dissectorMatcher import MessageComparator, DissectorMatcher
21 | from nemere.inference.segments import TypedSegment
22 |
23 | from fieldhunter.inference.fieldtypesRelaxed import *
24 | from fieldhunter.inference.common import segmentedMessagesAndSymbols
25 | from fieldhunter.utils.base import Flows
26 | from fieldhunter.utils.eval import FieldTypeReport, GroundTruth
27 |
28 | if __name__ == '__main__':
29 | parser = ArgumentParser(
30 | description='Re-Implementation of FieldHunter.')
31 | parser.add_argument('pcapfilename', help='Filename of the PCAP to load.')
32 | parser.add_argument('-i', '--interactive', help='Open ipython prompt after finishing the analysis.',
33 | action="store_true")
34 | parser.add_argument('-d', '--debug', help='Enable debug output.', action="store_true")
35 | args = parser.parse_args()
36 | if args.debug:
37 | print("DEBUG")
38 | logging.basicConfig(level=logging.DEBUG)
39 | logger = logging.getLogger()
40 | logger.setLevel(logging.DEBUG)
41 | else:
42 | print("INFO")
43 | logging.basicConfig(level=logging.INFO)
44 | logger = logging.getLogger()
45 | logger.setLevel(logging.INFO)
46 |
47 | layer = 2
48 | relativeToIP = True
49 |
50 | filechecker = StartupFilecheck(args.pcapfilename)
51 |
52 | specimens = SpecimenLoader(args.pcapfilename, layer = layer, relativeToIP = relativeToIP)
53 | # noinspection PyTypeChecker
54 | messages = list(specimens.messagePool.keys()) # type: List[L4NetworkMessage]
55 | flows = Flows(messages)
56 |
57 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
58 | print("Hunting fields in", filechecker.pcapstrippedname)
59 | inferenceStart = time()
60 |
61 | # MSG-type
62 | print("Inferring", MSGtype.typelabel)
63 | msgtypefields = MSGtype(flows)
64 |
65 | # MSG-len
66 | print("Inferring", MSGlen.typelabel)
67 | msglenfields = MSGlen(flows)
68 |
69 | # Host-ID
70 | print("Inferring", HostID.typelabel)
71 | hostidfields = HostID(messages)
72 |
73 | # Session-ID (FH, Section 3.2.4)
74 | print("Inferring", SessionID.typelabel)
75 | sessionidfields = SessionID(messages)
76 |
77 | # Trans-ID (FH, Section 3.2.5)
78 | print("Inferring", TransID.typelabel)
79 | transidfields = TransID(flows)
80 |
81 | # Accumulators (FH, Section 3.2.6)
82 | print("Inferring", Accumulator.typelabel)
83 | accumulatorfields = Accumulator(flows)
84 |
85 | # in order of fieldtypes.precedence!
86 | sortedInferredTypes = sorted(
87 | (msglenfields, msgtypefields, hostidfields, sessionidfields, transidfields, accumulatorfields),
88 | key=lambda l: precedence[l.typelabel] )
89 | segmentedMessages, symbols = segmentedMessagesAndSymbols(sortedInferredTypes, messages)
90 |
91 | inferenceDuration = time() - inferenceStart
92 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
93 |
94 | print(tabulate(
95 | [(infield.typelabel,
96 | sum(1 for msgsegs in infield.segments if len(msgsegs) > 0),
97 | max(len(msgsegs) for msgsegs in infield.segments)
98 | if len(infield.segments) > 0 else 0 # prevent empty sequence for max()
99 | ) for infield in sortedInferredTypes],
100 | headers=["typelabel","messages","max inferred per msg"]
101 | ))
102 |
103 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
104 |
105 | nontrivialSymbols = [sym for sym in symbols if len(sym.fields) > 1]
106 | comparator = MessageComparator(specimens, layer=layer, relativeToIP=relativeToIP)
107 | print("Dissection complete.")
108 | comparator.pprintInterleaved(nontrivialSymbols)
109 | print(f"\n + {len(symbols)-len(nontrivialSymbols)} messages without any inferred fields.")
110 |
111 | # calc FMS per message
112 | print("Calculate FMS...")
113 | message2quality = DissectorMatcher.symbolListFMS(comparator, symbols)
114 | # write statistics to csv
115 | writeReport(message2quality, inferenceDuration, comparator, "fieldhunter-literal",
116 | filechecker.reportFullPath)
117 |
118 | # FTR validation: calculate TP/FP/FN ==> P/R per protocol and per type
119 | infieldWorkbook = FieldTypeReport.newWorkbook()
120 | for infields in sortedInferredTypes:
121 | infieldReport = FieldTypeReport(infields, comparator, segmentedMessages)
122 | infieldReport.addXLworksheet(infieldWorkbook, FieldTypeReport.ovTitle)
123 | FieldTypeReport.saveWorkbook(infieldWorkbook, filechecker.pcapstrippedname)
124 |
125 | # coverage
126 | tpByteSum = sum(sum(
127 | len(seg) for seg in msg
128 | if isinstance(seg, TypedSegment) and comparator.lookupField(seg)[1] in GroundTruth.fieldtypes[seg.fieldtype]
129 | ) for msg in segmentedMessages.values())
130 | payloadSum = sum(len(msg.data) for msg in segmentedMessages.keys())
131 | coverage = tpByteSum/payloadSum
132 | print(f"Coverage (ratio of TP bytes): {coverage:.5f}")
133 | # TODO quick and dirty hard coded filename, no checks.
134 | import csv
135 | with open("reports/fh-coverage.csv", "a") as covcsv:
136 | covwriter = csv.writer(covcsv)
137 | covwriter.writerow([filechecker.pcapstrippedname, tpByteSum, payloadSum, coverage])
138 |
139 | # interactive
140 | if args.interactive:
141 | print("""
142 | The inference of individual field types can be found in:
143 | msglenfields, msgtypefields, hostidfields, sessionidfields, transidfields, accumulatorfields
144 |
145 | A combination per message is in:
146 | segmentedMessages, symbols
147 | """)
148 | IPython.embed()
149 |
150 |
--------------------------------------------------------------------------------
/src/trace_statistics.py:
--------------------------------------------------------------------------------
1 | """
2 | This script provides statistics about the given PCAP trace that have impact on the FieldHunter inference.
3 | """
4 | # noinspection PyUnresolvedReferences
5 | import IPython, logging
6 | # noinspection PyUnresolvedReferences
7 | from tabulate import tabulate
8 | from argparse import ArgumentParser
9 | from os.path import join
10 | import matplotlib.pyplot as plt
11 |
12 | from nemere.utils.loader import SpecimenLoader
13 | from nemere.utils.evaluationHelpers import StartupFilecheck, reportFolder
14 | from nemere.validation.dissectorMatcher import MessageComparator
15 |
16 | from fieldhunter.inference.fieldtypes import *
17 | from fieldhunter.utils.base import Flows
18 | from fieldhunter.utils.eval import GroundTruth, csvAppend
19 |
20 | logging.basicConfig(level=logging.DEBUG)
21 | logging.getLogger().setLevel(logging.DEBUG)
22 |
23 | if __name__ == '__main__':
24 | parser = ArgumentParser(
25 | description='Statistics about the given PCAP trace that have impact on the FieldHunter inference.')
26 | parser.add_argument('pcapfilename', help='Filename of the PCAP to load.')
27 | parser.add_argument('-i', '--interactive', help='open IPython prompt after finishing the analysis.',
28 | action="store_true")
29 | args = parser.parse_args()
30 |
31 | filechecker = StartupFilecheck(args.pcapfilename)
32 |
33 | # FH always requires the protocol to be inside TCP/UDP over IP (FH, Section 6.6)
34 | specimens = SpecimenLoader(args.pcapfilename, layer=2, relativeToIP=True)
35 | # noinspection PyTypeChecker
36 | messages = list(specimens.messagePool.keys()) # type: List[L4NetworkMessage]
37 | comparator = MessageComparator(specimens, layer=2, relativeToIP=True)
38 |
39 | # # # # # # # # # # # # # # # # # #
40 | # Relevant for MSG-Type
41 | flows = Flows(messages)
42 | # print(tabulate(flows.c2sInConversations().keys()))
43 | # print(tabulate(flows.s2cInConversations().keys()))
44 | print("Conversations:\n")
45 | print(tabulate(flows.conversations().keys()))
46 | mqr = flows.matchQueryResponse()
47 | print("\nNumber of matching queries and responses:", len(mqr), "in", len(flows.flows), "flows")
48 | print("Found in", len(messages), f"messages. Coverage: {(len(mqr)*200)/len(messages):.1f}%")
49 | header = ["trace", "matching", "conversations", "flows", "messages", "coverage"]
50 | # The share of messages in the trace that belong to "singular flows", i. e., a single message without
51 | # a matching request or reply, is given by (100% - coverage).
52 | csvAppend(reportFolder, "flows", header, [[
53 | filechecker.pcapstrippedname, len(mqr), len(flows.conversations()), len(flows.flows),
54 | len(messages), (len(mqr)*200)/len(messages) ]])
55 | # TODO
56 | # discern types: broadcasts, c2s/s2c without matching flow
57 |
58 | # # # # # # # # # # # # # # # # # #
59 | # Entropy filter threshold rationale: entropy statistics for ground truth fields
60 | # since the entropyThresh used in MSGtype/MSGlen (NonConstantNonRandomEntropyFieldType) is not given in FH,
61 | # we use our traces to back the value we chose.
62 | gt = GroundTruth(comparator)
63 | gtTypeAndLengthEntropies = gt.typeAndLenEntropies()
64 | header = ["trace", "field name", "type label", "sample count", "entropy"]
65 | # write/append to a file. Columns: trace, field name, type label, sample count, entropy
66 | csvAppend(reportFolder, "typeAndLengthEntropies", header,
67 | ([filechecker.pcapstrippedname, *row] for row in gtTypeAndLengthEntropies if not numpy.isnan(row[-1])))
68 | # # # # # # # # # # # # # # # # # #
69 |
70 | # # # # # # # # # # # # # # # # # #
71 | # Relevant for MSG-Len
72 | # TODO length of messages, something like:
73 | # keyfunc = lambda m: len(m.data)
74 | # msgbylen = {k: v for k, v in groupby(sorted(direction, key=keyfunc), keyfunc)}
75 | # # # # # # # # # # # # # # # # # #
76 |
77 | # # # # # # # # # # # # # # # # # #
78 | # Entropy plots: Relevant for MSG-Type and Trans-ID
79 | c2s, s2c = flows.splitDirections()
80 | c2sEntropy = pyitNgramEntropy(c2s, 1)
81 | s2cEntropy = pyitNgramEntropy(s2c, 1)
82 | fig: plt.Figure
83 | ax1: plt.Axes
84 | fig, (ax1, ax2) = plt.subplots(2,1,figsize=(6,6))
85 | for ax, entropy in [(ax1, c2sEntropy), (ax2, s2cEntropy)]:
86 | if len(entropy) > 0:
87 | ax.stem(entropy, use_line_collection=True)
88 | else:
89 | ax.text(1, .5, "no entries")
90 | ax.set_xlim(0, 32)
91 | ax.set_ylim(0.,1.)
92 | ax.grid(which="major", axis="y")
93 | ax.set_xlabel("byte offset")
94 | ax.set_ylabel("normalized entropy")
95 | plt.suptitle("Entropies per byte offset", fontsize="x-large")
96 | ax1.set_title("Client to Server Collection")
97 | ax2.set_title("Server to Client Collection")
98 | fig.tight_layout(rect=[0,0,1,.95])
99 | fig.savefig(join(reportFolder, filechecker.pcapstrippedname + ".pdf"))
100 | # # # # # # # # # # # # # # # # # #
101 |
102 | # # # # # # # # # # # # # # # # # #
103 | # DHCP "Transaction ID" that is a FH Session-ID
104 | if "dhcp" in specimens.pcapFileName:
105 | sessIDtuples = sorted( (
106 | (comparator.parsedMessages[specimens.messagePool[msg]].getValuesByName('dhcp.id')[0],
107 | msg.source.rpartition(':')[0], msg.destination.rpartition(':')[0]) for msg in messages),
108 | key = lambda x: x[0] )
109 | participantsTuples = [(a, *sorted([b, c])) for a, b, c in sessIDtuples]
110 | field2value = [(
111 | intsFromNgrams([bytes.fromhex(a)])[0],
112 | intsFromNgrams([bytes(map(int, b.split(".") + c.split(".")))])[0])
113 | for a, b, c in participantsTuples]
114 | ngSc = numpy.array(list(zip(*field2value)))
115 | catCorr = drv.information_mutual(ngSc[0], ngSc[1]) / drv.entropy_joint(ngSc)
116 | print(catCorr)
117 | # 0.5073953157493724
118 | # For dhcp_SMIA2011101X_deduped-1000.pcap this is just about .5 which is quite surprising.
119 | ignoreList = {"0.0.0.0", "255.255.255.255"}
120 | field2value = [(
121 | intsFromNgrams([bytes.fromhex(a)])[0],
122 | intsFromNgrams([bytes(map(int, b.split(".") + c.split(".")))])[0])
123 | for a, b, c in participantsTuples if b not in ignoreList and c not in ignoreList and a != "00000000"]
124 | ngSc = numpy.array(list(zip(*field2value)))
125 | catCorr = drv.information_mutual(ngSc[0], ngSc[1]) / drv.entropy_joint(ngSc)
126 | print(catCorr)
127 | # 0.566225418688138
128 | # Ignoring some trivial cases raises the correlation only marginally.
129 | # # # # # # # # # # # # # # # # # #
130 |
131 | # interactive
132 | if args.interactive:
133 | print()
134 | IPython.embed()
135 |
--------------------------------------------------------------------------------
/src/fieldhunter/inference/common.py:
--------------------------------------------------------------------------------
1 | """
2 | Common handling of inference intermediates or results.
3 | """
4 |
5 | from typing import Iterable, List, Tuple, Dict
6 |
7 | from netzob.Model.Vocabulary.Messages.AbstractMessage import AbstractMessage
8 | from netzob.Model.Vocabulary.Symbol import Symbol
9 |
10 | from fieldhunter.inference.fieldtypes import FieldType
11 | from nemere.inference.formatRefinement import isOverlapping
12 | from nemere.inference.segmentHandler import symbolsFromSegments
13 | from nemere.inference.segments import TypedSegment, MessageSegment
14 | from nemere.inference.analyzers import Value
15 |
16 |
17 | def segmentedMessagesAndSymbols(typedFields: Iterable[FieldType], messages: List[AbstractMessage]) \
18 | -> Tuple[Dict[AbstractMessage, List[MessageSegment]], List[Symbol]]:
19 | # noinspection PyProtectedMember
20 | """
21 | Consolidate the inferred fields into segmented messages and additionally into symbols.
22 |
23 | >>> from itertools import chain
24 | >>> from tabulate import tabulate
25 | >>> from netzob.Model.Vocabulary.Messages.L4NetworkMessage import L4NetworkMessage
26 | >>> from nemere.visualization.simplePrint import SegmentPrinter
27 | >>> from fieldhunter.inference.common import segmentedMessagesAndSymbols
28 | >>> from fieldhunter.inference.fieldtypes import FieldType
29 | >>> from fieldhunter.utils.base import iterateSelected
30 | >>> # prevent Netzob from producing debug output.
31 | >>> import logging
32 | >>> logging.getLogger().setLevel(30)
33 | >>>
34 | >>> messageList = [
35 | ... L4NetworkMessage(b"QQQ456789"), L4NetworkMessage(b"RRR567890"), L4NetworkMessage(b"QQQ7890AB"),
36 | ... L4NetworkMessage(b"RRR567890"), L4NetworkMessage(b"QQQ123456789"), L4NetworkMessage(b"RRR890A"),
37 | ... L4NetworkMessage(b"QQQ6789"), L4NetworkMessage(b"RRR890ABCDEFGH")
38 | ... ]
39 | >>>
40 | >>> # normally this would only be performed by a subclass of FieldType internally; here for the sake of testing
41 | >>> segmentsA = FieldType._posLen2segments(messageList, [(0,3),(5,2)])
42 | >>> del segmentsA[5][1]; del segmentsA[3]; del segmentsA[1][1]; del segmentsA[0][0]
43 | >>> segmentsB = FieldType._posLen2segments(messageList, [(2,2),(5,4)])
44 | >>> ftA = FieldType()
45 | >>> ftA._segments = segmentsA
46 | >>> ftB = FieldType()
47 | >>> ftB._segments = segmentsB
48 | >>>
49 | >>> sm, sym = segmentedMessagesAndSymbols([ftA, ftB], messageList)
50 | >>> sp = SegmentPrinter(sm.values()) # doctest: +SKIP
51 | >>> sp.toConsole() # doctest: +SKIP
52 | >>> print(tabulate(sm.values()))
53 | --------------------------------------------------------------- -------------------------------------------------------------------
54 | MessageSegment 2 bytes at (2, 4): 5134 | values: (81, 52) MessageSegment 2 bytes at (5, 7): 3637 | values: (54, 55)
55 | MessageSegment 3 bytes at (0, 3): 525252 | values: (82, 82, 82) MessageSegment 4 bytes at (5, 9): 37383930 | values: (55, 56, 57...
56 | MessageSegment 3 bytes at (0, 3): 515151 | values: (81, 81, 81) MessageSegment 2 bytes at (5, 7): 3930 | values: (57, 48)
57 | MessageSegment 2 bytes at (2, 4): 5235 | values: (82, 53) MessageSegment 4 bytes at (5, 9): 37383930 | values: (55, 56, 57...
58 | MessageSegment 3 bytes at (0, 3): 515151 | values: (81, 81, 81) MessageSegment 2 bytes at (5, 7): 3334 | values: (51, 52)
59 | MessageSegment 3 bytes at (0, 3): 525252 | values: (82, 82, 82)
60 | MessageSegment 3 bytes at (0, 3): 515151 | values: (81, 81, 81) MessageSegment 2 bytes at (5, 7): 3839 | values: (56, 57)
61 | MessageSegment 3 bytes at (0, 3): 525252 | values: (82, 82, 82) MessageSegment 2 bytes at (5, 7): 3041 | values: (48, 65)
62 | --------------------------------------------------------------- -------------------------------------------------------------------
63 | >>> for s in sym:
64 | ... print(s.getCells()) # doctest: +NORMALIZE_WHITESPACE
65 | Field | Field | Field | Field | Field
66 | ----- | ----- | ----- | ----- | -----
67 | 'QQ' | 'Q4' | '5' | '67' | '89'
68 | ----- | ----- | ----- | ----- | -----
69 | Field | Field | Field
70 | ----- | ----- | ------
71 | 'RRR' | '56' | '7890'
72 | ----- | ----- | ------
73 | Field | Field | Field | Field
74 | ----- | ----- | ----- | -----
75 | 'QQQ' | '78' | '90' | 'AB'
76 | ----- | ----- | ----- | -----
77 | Field | Field | Field | Field
78 | ----- | ----- | ----- | ------
79 | 'RR' | 'R5' | '6' | '7890'
80 | ----- | ----- | ----- | ------
81 | Field | Field | Field | Field
82 | ----- | ----- | ----- | -------
83 | 'QQQ' | '12' | '34' | '56789'
84 | ----- | ----- | ----- | -------
85 | Field | Field
86 | ----- | ------
87 | 'RRR' | '890A'
88 | ----- | ------
89 | Field | Field | Field
90 | ----- | ----- | -----
91 | 'QQQ' | '67' | '89'
92 | ----- | ----- | -----
93 | Field | Field | Field | Field
94 | ----- | ----- | ----- | ---------
95 | 'RRR' | '89' | '0A' | 'BCDEFGH'
96 | ----- | ----- | ----- | ---------
97 |
98 | :param typedFields: The inferred fields of different types in order of their precedence!
99 | E. g., field types with smaller index will remove concurring subsequent ones that overlap.
100 | :param messages: The messages to expect inference for.
101 | :return: tuple of
102 | * dict of the messages and their segment list.
103 | * Netzob symbols representing the inference.
104 | """
105 | # combine inferred fields per message to facilitate validation
106 | typedSequences = [
107 | {segs[0].message: segs for segs in fields.segments if len(segs) > 0}
108 | for fields in typedFields
109 | ]
110 |
111 | segmentedMessages = dict()
112 | for msg in messages:
113 | segmsg = list()
114 | # in order of fieldtypes.precedence!
115 | for typedMessages in typedSequences:
116 | if msg in typedMessages: # type: List[TypedSegment]
117 | # segments of a field type for one message
118 | for cand in typedMessages[msg]:
119 | # check whether the candidate overlaps any already accepted segment
120 | overlaps = False
121 | for seg in segmsg:
122 | if isOverlapping(cand, seg):
123 | overlaps = True
124 | break
125 | # if the candidate overlaps a segment of a type with higher precedence, skip it
126 | if overlaps:
127 | continue
128 | segmsg.append(cand)
129 | # symbolsFromSegments fixes gaps, but cannot know anything about the message in an empty list, so we add a dummy
130 | # segment for these cases here
131 | segmentedMessages[msg] = sorted(segmsg, key=lambda s: s.offset) if len(segmsg) > 0 else \
132 | [ MessageSegment(Value(msg), 0, len(msg.data)) ]
133 |
134 | symbols = symbolsFromSegments(segmentedMessages.values())
135 |
136 | return segmentedMessages, symbols
--------------------------------------------------------------------------------
/src/fieldhunter/utils/eval.py:
--------------------------------------------------------------------------------
1 | import os, csv, logging
2 | from typing import Any
3 | from os.path import join, exists
4 | from time import strftime
5 |
6 | from openpyxl import Workbook, utils
7 | from openpyxl.worksheet.worksheet import Worksheet
8 |
9 | import nemere.utils.evaluationHelpers as eh
10 | from nemere.inference.segments import MessageSegment
11 | from nemere.validation.dissectorMatcher import MessageComparator
12 |
13 | from fieldhunter.inference.fieldtypes import *
14 |
15 |
16 | def csvAppend(reportFolder: str, fileName: str, header: List[str], rows: Iterable[Iterable[Any]]):
17 | csvpath = os.path.join(reportFolder, fileName + '.csv')
18 | csvWriteHead = False if os.path.exists(csvpath) else True
19 |
20 | print('Write statistics to {}...'.format(csvpath))
21 | with open(csvpath, 'a') as csvfile:
22 | statisticscsv = csv.writer(csvfile)
23 | if csvWriteHead:
24 | statisticscsv.writerow(header)
25 | statisticscsv.writerows(rows)
26 |
27 |
28 | class FieldTypeReport(object):
29 |
30 | headers = ["hexbytes", "segment offset", "segment end",
31 | "overlap ratio", "overlap index", "overlap offset", "overlap end", "overlap value",
32 | "message date", "message type", "field name", "field type", "TP/FP", "isVisible"]
33 | # (column isVisible could also be called: "not hidden by other type")
34 |
35 | overviewHeaders = [
36 | "field type", "FN", "FP", "TP", "P", "R"
37 | ]
38 |
39 | ovTitle = "Overview"
40 |
41 | def __init__(self, fieldtype: FieldType, comparator: MessageComparator,
42 | segmentedMessages: Dict[AbstractMessage, List[MessageSegment]] = None):
43 | """
44 |
45 | :param fieldtype: The field type object to generate a report for.
46 | :param comparator: A NEMERE MessageComparator to look up the true fields overlapping our inference.
47 | :param segmentedMessages: Optional Dict of segmented messages to check whether another field type got
48 | precedence for single inference instances. see fieldhunter.inference.fieldtypes#precedence and
49 | fieldhunter.inference.common#segmentedMessagesAndSymbols
50 | """
51 | self._fieldtype = fieldtype
52 | self._comparator = comparator
53 | self._segmentedMessages = segmentedMessages
54 |
55 | def lookupOverlap(self):
56 | """
57 | Lookup the overlap with the ground truth for all segments inferred for the given FieldHunter field type.
58 |
59 | :return: table (list of lists) of statistics for each inferred segment from field type, according to the
60 | columns given in FieldTypeReport#headers.
61 | """
62 | tabdata = list()
63 |
64 | for seg in (seg for msgsegs in self._fieldtype.segments for seg in msgsegs if msgsegs):
65 | # field: from ground true; seg(ment): inferred; overlap: intersection of field and segment
66 | overlapRatio, overlapIndex, overlapOffset, overlapEnd = self._comparator.fieldOverlap(seg)
67 | messagetype, fieldname, fieldtype = self._comparator.lookupField(seg)
68 | overlapValue = "'" + seg.message.data[overlapOffset:overlapEnd].hex() + "'"
69 |
70 | # determine what is a TP (value True)/FP (value False) using GroundTruth
71 | tpfp = fieldname in GroundTruth.fieldtypes[self.typelabel]
72 |
73 | # check the precedence of multiple overlapping inferred fields
74 | isVisible = seg in chain.from_iterable(self._segmentedMessages.values())\
75 | if self._segmentedMessages is not None else "n/a"
76 |
77 | tabdata.append(["'" + seg.bytes.hex() + "'", seg.offset, seg.nextOffset,
78 | overlapRatio, overlapIndex, overlapOffset, overlapEnd, overlapValue,
79 | seg.message.date, messagetype, fieldname, fieldtype, tpfp, isVisible])
80 | return tabdata
81 |
82 | @property
83 | def typelabel(self):
84 | """The label for the field type this report is generated for."""
85 | return self._fieldtype.typelabel
86 |
87 | def countTrueOccurrences(self):
88 | counter = 0
89 | for fieldname in GroundTruth.fieldtypes[self.typelabel]:
90 | counter += len(self._comparator.lookupValues4FieldName(fieldname))
91 | return counter
92 |
93 | def addXLworksheet(self, workbook: Workbook, overview: str=None):
94 | """Add data as worksheet to a openpyxl workbook. The caller needs to take take to write to file afterwards."""
95 | worksheet = workbook.create_sheet(self.typelabel)
96 | worksheet.append(FieldTypeReport.headers)
97 | for row in self.lookupOverlap():
98 | worksheet.append(row)
99 | onlyVisible = f",{utils.quote_sheetname(self.typelabel)}!N:N,TRUE()" \
100 | if self._segmentedMessages is not None else ""
101 | if overview is not None:
102 | try:
103 | ovSheet = workbook[overview] # type: Worksheet
104 | currentRow = ovSheet.max_row + 1
105 | tpCoord = f"{utils.get_column_letter(4)}{currentRow}"
106 | ovSheet.append([
107 | self.typelabel,
108 | f"={self.countTrueOccurrences()} - {tpCoord}", # "FN"
109 | f"=COUNTIFS({utils.quote_sheetname(self.typelabel)}!M:M,FALSE(){onlyVisible})", # "=FP"
110 | f"=COUNTIFS({utils.quote_sheetname(self.typelabel)}!M:M,TRUE(){onlyVisible})", # "=TP"
111 | f"=D{currentRow}/(D{currentRow}+C{currentRow})", # P
112 | f"=D{currentRow}/(D{currentRow}+B{currentRow})", # R
113 | ])
114 | except KeyError:
115 | logging.getLogger(__name__).info(f"Overview sheet with title {overview} not found. "
116 | "Not writing overview.")
117 | return workbook
118 |
119 | @staticmethod
120 | def newWorkbook():
121 | """Prepare a new workbook to hold worksheets generated by #addXLworksheet()."""
122 | infieldWorkbook = Workbook()
123 | infieldWorkbook.active.title = FieldTypeReport.ovTitle
124 | ovSheet = infieldWorkbook[FieldTypeReport.ovTitle]
125 | ovSheet.append(FieldTypeReport.overviewHeaders)
126 | return infieldWorkbook
127 |
128 | @staticmethod
129 | def saveWorkbook(infieldWorkbook: Workbook, pcapstrippedname: str):
130 | infieldFilename = join(eh.reportFolder,
131 | f"FieldTypeReport_{pcapstrippedname}_{strftime('%Y%m%d-%H%M%S')}.xlsx")
132 | if not exists(infieldFilename):
133 | print("Write field type report to", infieldFilename)
134 | infieldWorkbook.save(infieldFilename)
135 | else:
136 | print("Could not write", infieldFilename, "- File exists")
137 | for worksheet in infieldWorkbook.worksheets:
138 | rows = list(worksheet.values)  # Worksheet.rows is a generator and not subscriptable; .values yields tuples of cell values
139 | headers, cells = rows[0], rows[1:]
140 | print(f"\nReport for {worksheet.title}:\n" + tabulate(cells, headers=headers))
141 |
142 |
143 | class GroundTruth(object):
144 | """tshark dissector field names for sample protocols mapped from the FieldHunter field type class."""
145 | fieldtypes = {
146 | MSGlen.typelabel: ["nbss.length"],
147 | MSGtype.typelabel: ["dhcp.option.dhcp", "ntp.flags", "ntp.stratum", "dns.flags",
148 | "nbns.flags", "smb.cmd", "smb.flags", ],
149 | HostID.typelabel: ["dhcp.ip.client", "dhcp.ip.your", "dhcp.ip.server", "dhcp.hw.mac_addr", "ntp.refid"],
150 | SessionID.typelabel: ["dhcp.id", "smb.pid", "smb.uid", "smb.mid"],
151 | TransID.typelabel: ["dns.id", "nbns.id"],
152 | Accumulator.typelabel: []
153 | }
154 |
155 | def __init__(self, comparator:MessageComparator, endianness: str = "big"):
156 | self._comparator = comparator
157 | self._endianness = endianness
158 | logging.getLogger(__name__).setLevel(logging.DEBUG)
159 |
160 | def entropyPerField(self, fieldname: str):
161 | """Collect true fields values and calculate their entropy for the current trace."""
162 | fieldsValues = [bytes.fromhex(hexval) for hexval in self._comparator.lookupValues4FieldName(fieldname)]
163 | if len(fieldsValues) > 0:
164 | fieldLengths = Counter(len(bv) for bv in fieldsValues)
165 | # should normally be a constant value for this kind of fields
166 | mostCommonLen = fieldLengths.most_common(1)[0][0]
167 | logging.getLogger(__name__).debug(f"Field lengths of {fieldname}: {repr(fieldLengths)}")
168 | entropy = drv.entropy(intsFromNgrams(fieldsValues, self._endianness)) / (mostCommonLen * 8)
169 | else:
170 | entropy = numpy.nan
171 | return len(fieldsValues), entropy
172 |
173 | def typeAndLenEntropies(self):
174 | """
175 | Collect MSGtype/MSGlen true fields according to GroundTruth.fieldtypes[MSGtype.typelabel/MSGlen.typelabel]
176 |
177 | :return: list of lists of "field name", "type label", "sample count", and "entropy"
178 | """
179 | entropyList = list()
180 | for typelabel in [MSGtype.typelabel, MSGlen.typelabel]:
181 | for fieldname in GroundTruth.fieldtypes[typelabel]:
182 | # for each field name calculate entropy
183 | entropyList.append([
184 | fieldname,
185 | typelabel,
186 | *self.entropyPerField(fieldname)
187 | ])
188 | return entropyList
189 |
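190 | # Example (hypothetical variable names, mirroring src/trace_statistics.py): given a MessageComparator
191 | # for a loaded trace, the ground-truth entropies can be collected and appended to a report CSV:
192 | #   gt = GroundTruth(comparator)
193 | #   csvAppend(reportFolder, "typeAndLengthEntropies",
194 | #             ["trace", "field name", "type label", "sample count", "entropy"],
195 | #             ([tracename, *row] for row in gt.typeAndLenEntropies() if not numpy.isnan(row[-1])))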
--------------------------------------------------------------------------------
/src/fieldhunter/inference/fieldtypesRelaxed.py:
--------------------------------------------------------------------------------
1 | """
2 | Infer message field types according to the FieldHunter paper Section 3.2
3 | but with some relaxed thresholds and assumptions.
4 |
5 | TODO introduce doctests to check critical functions in inference.fieldtypes
6 | """
7 | from typing import List, Tuple, Dict, Iterable, Union
8 | import logging
9 | from collections import Counter
10 | from abc import ABC
11 |
12 | import numpy
13 | from netzob.Model.Vocabulary.Messages.AbstractMessage import AbstractMessage
14 | from netzob.Model.Vocabulary.Messages.L4NetworkMessage import L4NetworkMessage
15 |
16 | from fieldhunter.utils.base import Flows, intsFromNgrams
17 | from fieldhunter.inference.fieldtypes import NonConstantNonRandomEntropyFieldType, Accumulator
18 | import fieldhunter.inference.fieldtypes as fieldtypes
19 |
20 |
21 | # logging.getLogger(__name__).setLevel(logging.DEBUG)
22 |
23 |
24 | class MSGtype(fieldtypes.MSGtype):
25 | """
26 | Relaxed version of message type (MSG-Type) inference (FH, Section 3.2.1, Fig. 3 left).
27 |
28 | see .fieldtypes.MSGtype
29 | """
30 | causalityThresh = 0.7
31 | """
32 | FH, Sec. 3.2.1 says 0.8, but that leaves no candidates for our NTP traces
33 | Reduces TP and FP for SMB 100.
34 | """
35 |
36 |
37 | class MSGlen(fieldtypes.MSGlen):
38 | """
39 | Relaxed version of message length (MSG-Len) inference (FH, Section 3.2.2, Fig. 3 center).
40 |
41 | see ..fieldtypes.MSGlen
42 | """
43 |
44 | def __init__(self, flows: Flows):
45 | super(NonConstantNonRandomEntropyFieldType, self).__init__()
46 |
47 | # The FH paper wants to handle each direction separately, which is pointless for MSG-Len,
48 | # so we place all messages in one direction object.
49 | self._msgDirection = [type(self).Direction(flows.messages)]
50 | # TODO It might rather be useful to separate message types (distinct formats) in this manner.
51 | # However, this requires combination with some message type classification approach. => Future Work.
52 |
53 | class Direction(fieldtypes.MSGlen.Direction):
54 |
55 | @staticmethod
56 | def _candidateIsAcceptable(solutionAcceptable: Dict[Tuple[AbstractMessage, AbstractMessage], bool],
57 | Xarray: numpy.ndarray):
58 | """
59 | FH does not require that either the values in X[0]/a or X[1]/b are equal for all X in Xes.
60 | Thus, different values are accepted, although a message length typically is calculated using the same
61 | multiplicator X[0], even if the offset X[1] may change, so X[0] must be a scalar value.
62 |
63 | Otherwise we end up with lots of FPs. Examples:
64 | * In SMB, the 'Msg. Len. Model Parameters' (a,b) == [1., 4.]
65 | of the 4-gram at offset 0, 4 is nbss.length, i. e., a TP!
66 | Offsets 16 and 22 are FP, but with diverging A and B vectors.
67 | * Another example: In DNS, the beginning of the queried name is a FP
68 | (probably due to DNS' subdomain numbered separator scheme).
69 |
70 | Thus, we require that X[0] is the same constant value throughout the majority of checked solutions.
71 | (We use the majority to account for some random error exactly as FH does using the MSGlen.lenhypoThresh)
72 |
73 | :param solutionAcceptable: Dict of which solution is acceptable for which combination of messages.
74 | :return: Whether the given candidate is acceptable.
75 | """
76 | acceptCount = Counter(solutionAcceptable.values())
77 | mostlyAcceptable = bool(acceptCount[True] / len(solutionAcceptable) > MSGlen.lenhypoThresh)
78 | # noinspection PyTypeChecker
79 | constantMultiplicator = all(numpy.round(Xarray[0,0], 8) == numpy.round(Xarray[1:,0], 8))
80 | logging.getLogger(__name__).debug(f"Candidate mostlyAcceptable {mostlyAcceptable} "
81 | f"and has constantMultiplicator {constantMultiplicator}.")
82 | return mostlyAcceptable and constantMultiplicator
83 |
84 |
85 | class CategoricalCorrelatedField(fieldtypes.CategoricalCorrelatedField,ABC):
86 | """
87 | Abstract class for inferring field types using categorical correlation of n-gram values with external values, e. g.,
88 | environmental information like addresses from encapsulation.
89 |
90 | Enhancement of fieldtypes.CategoricalCorrelatedField to iteratively check n-grams from size four to one.
91 | """
92 | @classmethod
93 | def correlate(cls, messages: List[L4NetworkMessage], nMax: int = 4, nMin: int = 1):
94 | """
95 | Generate n-grams, with n ranging from large to small,
96 | at the same offsets for each message and correlate each n-gram using categorical correlation.
97 |
98 | see fieldtypes.CategoricalCorrelatedField#correlate()
99 | see HostID for the rationale of this enhancement over FH.
100 |
101 | :param messages: Messages to generate n-grams to correlate to.
102 | :param nMax: maximum of n to correlate (decrease from large to small)
103 | :param nMin: minimum of n to correlate
104 | :return: Correlation values for each offset of n-grams generated from the messages.
105 | """
106 | categoricalCorrelation = None
107 | for n in range(nMax, nMin - 1, -1):  # from nMax down to nMin inclusive
108 | # this is one correlation value for each n-gram starting at the offset
109 | corrAtOffset = super().correlate(messages, n)
110 | if categoricalCorrelation is None: # initial fill
111 | categoricalCorrelation = [-1] * (len(corrAtOffset) + n - 1)
112 | if len(corrAtOffset) + n - 1 != len(categoricalCorrelation): # validity check
113 | # this should not occur if #correlate() is correct and called with the same set of messages
114 | raise RuntimeError("Too few values to correlate.")
115 | for offset, corr in enumerate(corrAtOffset): # iterate all n-gram offsets
116 | for nOff in range(offset, offset+n): # check/set the correlation for ALL bytes of this n-gram
117 | if categoricalCorrelation[nOff] < corr:
118 | categoricalCorrelation[nOff] = corr
119 | corRepr = [round(cc,3) for cc in categoricalCorrelation]
120 | logging.getLogger(__name__).debug(f"Correlation of {n}-ngrams: {corRepr}")
121 | return categoricalCorrelation
122 |
123 | @classmethod
124 | def _combineNgrams2Values(cls, ngrams: Iterable[bytes], values: List[int]):
125 | r"""
126 | The correlation is perfect if null values are omitted
127 |
128 | >>> ngrand = [b'\xa2\xe7', b'r\x06', b'\x0f?', b'd\x8a', b'\xa0X', b'\x04\xba', b'\x19r', b'\x17M', b',\xda',
129 | ... b'9K', b'<3', b'\xaa\xdf']
130 | >>> valRnd = [0.601, 0.601, 0.601, 0.601, 0.804, 0.804, 0.804, 0.804, 0.804, 0.792, 0.731, 0.722]
131 | >>> from fieldhunter.inference.fieldtypesRelaxed import CategoricalCorrelatedField
132 | >>> CategoricalCorrelatedField._combineNgrams2Values(ngrand, valRnd)
133 | array([[4.1703e+04, 2.9190e+04, 3.9030e+03, 2.5738e+04, 4.1048e+04,
134 | 1.2100e+03, 6.5140e+03, 5.9650e+03, 1.1482e+04, 1.4667e+04,
135 | 1.5411e+04, 4.3743e+04],
136 | [6.0100e-01, 6.0100e-01, 6.0100e-01, 6.0100e-01, 8.0400e-01,
137 | 8.0400e-01, 8.0400e-01, 8.0400e-01, 8.0400e-01, 7.9200e-01,
138 | 7.3100e-01, 7.2200e-01]])
139 | """
140 | nonNull = list(zip(*filter(lambda x: set(x[0]) != {0}, zip(ngrams, values))))
141 | if len(nonNull) == 0:
142 | nonNull = [[],[]]
143 | return super(CategoricalCorrelatedField, cls)._combineNgrams2Values(*nonNull)
144 |
145 |
146 | class HostID(CategoricalCorrelatedField, fieldtypes.HostID):
147 | """
148 | Relaxed version of host identifier (Host-ID) inference (FH, Sec. 3.2.3)
149 | Find n-gram that is strongly correlated with IP address of sender.
150 |
151 | see fieldtypes.HostID
152 |
153 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
154 | # # We investigated the low categoricalCorrelation for all but one byte within an address field (see NTP and DHCP).
155 | # # According to NTP offset 12 (REF ID, often DST IP address) and DHCP offsets (12, 17, and) 20 (IPs)
156 | # # this works in principle, but if the n-gram is too short the correlation gets lost for some n-grams.
157 | print(tabulate(zip(*[hostidfields.categoricalCorrelation]), showindex="always"))
158 | from matplotlib import pyplot
159 | pyplot.bar(range(len(hostidfields.categoricalCorrelation)), hostidfields.categoricalCorrelation)
160 | pyplot.show()
161 | # sum([msg.data[20:24] == bytes(map(int, msg.source.rpartition(':')[0].split('.'))) for msg in messages])
162 | # sum([int.from_bytes(messages[m].data[20:24], "big") == srcs[m] for m in range(len(messages))])
163 | # # While the whole dhcp.ip.server [20:24] correlates nicely to the IP address, single n-grams don't.
164 | serverIP = [(int.from_bytes(messages[m].data[20:24], "big"), srcs[m]) for m in range(len(messages))]
165 | serverIP0 = [(messages[m].data[20], srcs[m]) for m in range(len(messages))]
166 | serverIP1 = [(messages[m].data[21], srcs[m]) for m in range(len(messages))]
167 | serverIP2 = [(messages[m].data[22], srcs[m]) for m in range(len(messages))]
168 | serverIP3 = [(messages[m].data[23], srcs[m]) for m in range(len(messages))]
169 | # nsp = numpy.array([sip for sip in serverIP])
170 | # # The correlation is perfect, if null values are omitted
171 | nsp = numpy.array([sip for sip in serverIP if sip[0] != 0]) # and sip[0] == sip[1]
172 | # nsp0 = numpy.array(serverIP0)
173 | # nsp1 = numpy.array(serverIP1)
174 | # nsp2 = numpy.array(serverIP2)
175 | # nsp3 = numpy.array(serverIP3)
176 | nsp0 = numpy.array([sip for sip in serverIP0 if sip[0] != 0])
177 | nsp1 = numpy.array([sip for sip in serverIP1 if sip[0] != 0])
178 | nsp2 = numpy.array([sip for sip in serverIP2 if sip[0] != 0])
179 | nsp3 = numpy.array([sip for sip in serverIP3 if sip[0] != 0])
180 | for serverSrcPairs in [nsp, nsp0, nsp1, nsp2, nsp3]:
181 | print(drv.information_mutual(serverSrcPairs[:, 0], serverSrcPairs[:, 1]) / drv.entropy_joint(serverSrcPairs.T))
182 | # # Thus, this is no implementation error, but raises doubts about the Host-ID description completeness:
183 | # # Probably it does not mention an Entropy filter, direction separation, or - most probably -
184 | # # an iterative n-gram size increase (like for MSGlen). Thus, we implement such an iterative n-gram analysis
185 | # # in this class's relaxed super-class CategoricalCorrelatedField.
186 | # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
187 | """
188 |
189 |
190 | class SessionID(CategoricalCorrelatedField, fieldtypes.SessionID):
191 | r"""
192 | Relaxed version of session identifier (Session-ID) inference (FH, Section 3.2.4)
193 | Find n-gram that is strongly correlated with IP addresses of sender and receiver
194 | using categorical correlation like Host-ID.
195 |
196 | see fieldtypes.SessionID
197 |
198 | >>> from fieldhunter.inference.fieldtypesRelaxed import SessionID
199 | >>> from netzob.Model.Vocabulary.Messages.L4NetworkMessage import L4NetworkMessage
200 | >>> messages = [
201 | ... L4NetworkMessage(b"session111\x42\x17\x23\x00\x08\x15",
202 | ... l3SourceAddress="1.2.3.100", l3DestinationAddress="1.2.3.1"),
203 | ... L4NetworkMessage(b"session111\xe4\x83\x82\x85\xbf",
204 | ... l3SourceAddress="1.2.3.1", l3DestinationAddress="1.2.3.100"),
205 | ... L4NetworkMessage(b"session111\x23\x17\xf9\x0b\x00b\x12",
206 | ... l3SourceAddress="1.2.3.100", l3DestinationAddress="1.2.3.1"),
207 | ... L4NetworkMessage(b"session222\x42\x17Jk\x8a1e\xb5",
208 | ... l3SourceAddress="1.2.3.2", l3DestinationAddress="1.2.3.100"),
209 | ... L4NetworkMessage(b"session222L\xab\x83\x1a\xef\x13",
210 | ... l3SourceAddress="1.2.3.100", l3DestinationAddress="1.2.3.2"),
211 | ... ]
212 | >>> SessionID.correlate(messages)
213 | [0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.4181656600790516, 0.4181656600790516, 0.4181656600790516, 0.5, 0.5]
214 |
215 | A problem similar to Host-ID's leads to the same bad quality, thus, we apply the same change via the relaxed
216 | super-class CategoricalCorrelatedField.
217 | """
218 | @classmethod
219 | def _filterMessages(cls, messages: List[L4NetworkMessage]):
220 | ignoreList = {b"\x00"*4, b"\xff"*4}
221 | logging.getLogger(__name__).debug("Ignoring non-set and broadcast addresses.")
222 | return [messages for messages, srcDst in zip(messages, cls._srcDstBytes(messages))
223 | if ignoreList.isdisjoint(srcDst)]
224 |
225 | @classmethod
226 | def _values2correlate2(cls, messages: List[L4NetworkMessage]):
227 | """
228 | Get source AND destination addresses in the same manner as (just) the source for Host-ID.
229 | Recover byte representations of the IPv4 addresses from all Netzob messages and make one int out of each.
230 |
231 | Compared to the original FH paper, treat source and destination IPs as a set,
232 | ignoring their role as denoting sender or receiver and only interpreting them as equal participants.
233 |
234 | :param messages: Messages to generate n-grams to correlate to.
235 | :return: integer representation of source and destination addresses for each message.
236 | """
237 | participantPairs = [sorted(srcDst) for srcDst in cls._srcDstBytes(messages)]
238 | return intsFromNgrams(a+b for a,b in participantPairs)
239 |
240 |
241 | class TransID(fieldtypes.TransID):
242 | """
243 | Relaxed version of transaction identifier (Trans-ID) inference (FH, Section 3.2.5, Fig. 3 right)
244 |
245 | see fieldtypes.TransID
246 | """
247 | entropyThresh = 0.6
248 | """
249 | This value is not given in FH! We improve the threshold compared to the paper
250 | by using it as a factor relative to the maximum of all entropies in the collection.
251 | """
252 |
253 | absoluteEntropy = False
254 |
255 | convLenOneThresh = 0.9
256 |
257 | minConversationLength = 2
258 | """
259 | For the horizontal entropy, require conversations longer than this number of message exchanges, to observe that the
260 | ID changes for each request/reply pair and is not a Session-ID/cookie of some sort.
261 | (E. g., the "Transaction ID" in DHCP would be a FP, since despite its name it actually is a Session-ID.)
262 | """
263 |
264 | # In _verticalAndHorizontalRandomNgrams(self):
265 | # for the _c2sCombinedOffsets
266 | # (TODO alternatively, deviating from FH, use the offset for each query specifically?)
267 | # and _s2cCombinedOffsets
268 | # (TODO alternatively, deviating from FH, use the entry for each response specifically?)
269 | # This would allow offsets for different message types, but would require to compare values using _constantQRvalues
270 | # with the specific offsets per Q/R pair. ==> Future Work
271 |
272 | @classmethod
273 | def _horizontalRandomNgrams(cls, conversions: Dict[tuple, List[AbstractMessage]],
274 | verticalEntropyFiltered: List[int]) -> Dict[Union[Tuple, None], List[int]]:
275 | if len(conversions) > 0:
276 | # With a conversation length of one, no meaningful horizontal entropy can be calculated (see DNS)
277 | convLens = Counter([len(c) for c in conversions.values()])
278 | lenOneRatio = convLens[1] / sum(convLens.values())
279 |
280 | # New compared to original FH:
281 | # If most conversations (convLenOneThresh) are just one message long per direction (e. g. DNS),
282 | # ignore the horizontal entropy filter
283 | if lenOneRatio > cls.convLenOneThresh:
284 | return {None: verticalEntropyFiltered}
285 | else:
286 | filteredOutput = dict()
287 | # horizontal collections: entropy of n-gram per the same offset in all messages of one flow direction
288 | for key, conv in conversions.items():
289 | # The horizontal entropy is too low if the number of specimens is low
290 | # -> Enhancing over FH, we use the threshold relative to the maximum entropy and ignore short conversations
291 | if len(conv) < cls.minConversationLength:
292 | continue
293 | filteredOutput[key] = cls.entropyFilteredOffsets(conv, cls.absoluteEntropy)
294 | return filteredOutput
295 | else:
296 | return {}
297 |
298 | # Host-ID will always return a subset of Session-ID fields, so Host-ID should get precedence
299 | # MSG-Len would be overwritten by MSG-Type (see SMB: nbss.length), so first use MSG-Len
300 | precedence = {MSGlen.typelabel: 0, MSGtype.typelabel: 1, HostID.typelabel: 2,
301 | SessionID.typelabel: 3, TransID.typelabel: 4, Accumulator.typelabel: 5}
302 | """
303 | The order in which to map field types to messages.
304 | Lower numbers take precedence over higher numbers, so that the type with the higher number will be ignored
305 | if both overlap at the same offset range in the message.
306 | """
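# Illustrative sketch (editor's example): if a Host-ID candidate and a Session-ID candidate overlap at the
# same offset range in a message, the Host-ID segment (precedence 2) is kept and the Session-ID segment
# (precedence 3) is ignored; likewise MSG-Len (0) wins over an overlapping MSG-Type (1) candidate.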
--------------------------------------------------------------------------------
/src/fieldhunter/utils/base.py:
--------------------------------------------------------------------------------
1 | from collections.abc import Iterator
2 | from itertools import chain
3 | from typing import List, Dict, Iterable, Tuple, Union
4 |
5 | import IPython
6 | from numpy import nan
7 | from pyitlib import discrete_random_variable as drv
8 |
9 | from netzob.Model.Vocabulary.Messages.AbstractMessage import AbstractMessage
10 | from netzob.Model.Vocabulary.Messages.L4NetworkMessage import L4NetworkMessage
11 |
12 | from nemere.inference.segments import MessageAnalyzer
13 |
14 |
15 |
16 | class NgramIterator(Iterator):
17 | """
18 | Iterate over the byte n-grams in message.
19 |
20 | FH, Section 3.1.2
21 |
22 |
23 | >>> from fieldhunter.utils.base import NgramIterator
24 | >>> from netzob.Model.Vocabulary.Messages.L4NetworkMessage import L4NetworkMessage
25 | >>> ngi = NgramIterator(L4NetworkMessage(b"QQQ456789"))
26 | >>> for ngram in ngi:
27 | ... print(ngram, ngi.offset, ngi.exhausted, ngi.lookahead)
28 | b'QQQ' 0 False True
29 | b'QQ4' 1 False True
30 | b'Q45' 2 False True
31 | b'456' 3 False True
32 | b'567' 4 False True
33 | b'678' 5 False True
34 | b'789' 6 False False
35 | >>> print(ngi.exhausted, ngi.lookahead)
36 | True False
37 | """
38 |
39 | def __init__(self, message: AbstractMessage, n=3):
40 | """
41 |
42 | :param message: The message of which to iterate the n-grams.
43 | :param n: The n in n-gram (length of chunk in bytes).
44 | """
45 | if not isinstance(message, AbstractMessage):
46 | raise ValueError("Parameter needs to be a Netzob message object (AbstractMessage).")
47 | self._message = message
48 | self._n = n
49 | self.__offset = -1
50 |
51 | __step = 1
52 |
53 | def __iter__(self):
54 | self.__offset = -1
55 | return self
56 |
57 | def __next__(self) -> bytes:
58 | self.__offset += NgramIterator.__step
59 | if self.exhausted:
60 | raise StopIteration()
61 | return self._message.data[self.__offset:self.__offset+self._n]
62 |
63 | @property
64 | def offset(self):
65 | """
66 | NgramIterator enumerates the offset of the n-gram its current iteration is taken from.
67 |
68 | :return: offset of the n-gram in the current iteration.
69 | """
70 | return self.__offset
71 |
72 | @property
73 | def exhausted(self):
74 | """
75 | :return: Indicates that the last iteration has occurred.
76 | """
77 | return self.__offset > len(self._message.data) - self._n
78 |
79 | @property
80 | def lookahead(self):
81 | """
82 | :return: True indicates that at least one more iteration is contained in this iterator.
83 | """
84 | return self.__offset + NgramIterator.__step <= len(self._message.data) - self._n
85 |
86 |
87 | class Flows(object):
88 | # noinspection PyUnresolvedReferences
89 | """
90 | In FH, a flow is defined by the 5-tuple: Layer-4 Protocol, Source IP, Destination IP, Source Port, Destination Port
91 | (FH, Section 2, Footnote 1)
92 |
93 | >>> from tabulate import tabulate
94 | >>> from netzob.Model.Vocabulary.Messages.L4NetworkMessage import L4NetworkMessage
95 | >>> from fieldhunter.utils.base import Flows
96 | >>> messages = [
97 | ... L4NetworkMessage(b"QQQ456789", l4Protocol="dummy", date=1445405280.01, l3SourceAddress="192.168.23.100", l3DestinationAddress="192.168.23.245", l4SourceAddress=10815, l4DestinationAddress=42),
98 | ... L4NetworkMessage(b"RRR567890", l4Protocol="dummy", date=1445405280.03, l3SourceAddress="192.168.23.245", l3DestinationAddress="192.168.23.100", l4SourceAddress=42, l4DestinationAddress=10815),
99 | ... L4NetworkMessage(b"QQQ7890AB", l4Protocol="dummy", date=1445405280.07, l3SourceAddress="192.168.23.100", l3DestinationAddress="192.168.23.245", l4SourceAddress=10815, l4DestinationAddress=42),
100 | ... L4NetworkMessage(b"RRR567890", l4Protocol="dummy", date=1445405280.05, l3SourceAddress="192.168.23.245", l3DestinationAddress="192.168.23.100", l4SourceAddress=42, l4DestinationAddress=10815),
101 | ... L4NetworkMessage(b"QQQ123456789", l4Protocol="dummy", date=1445405280.11, l3SourceAddress="192.168.23.100", l3DestinationAddress="192.168.23.245", l4SourceAddress=1717, l4DestinationAddress=2323),
102 | ... L4NetworkMessage(b"RRR890A", l4Protocol="dummy", date=1445405280.13, l3SourceAddress="192.168.23.1", l3DestinationAddress="192.168.23.100", l4SourceAddress=2323, l4DestinationAddress=1717),
103 | ... L4NetworkMessage(b"QQQ6789", l4Protocol="dummy", date=1445405280.17, l3SourceAddress="192.168.23.1", l3DestinationAddress="192.168.23.245", l4SourceAddress=1717, l4DestinationAddress=2323),
104 | ... L4NetworkMessage(b"RRR890ABCDEFGH", l4Protocol="dummy", date=1445405280.23, l3SourceAddress="192.168.23.245", l3DestinationAddress="192.168.23.100", l4SourceAddress=2323, l4DestinationAddress=1717)
105 | ... ]
106 | >>> # for the sake of the test case, the messages RRR890A and QQQ6789 have src IPs that rule them out as
107 | >>> # valid conversations; they should not be contained in the conversation lists below.
108 | >>> flows = Flows(messages)
109 | >>> mqr = flows.matchQueryResponse()
110 | >>> print(tabulate([ (q.date, r.date) for q, r in mqr.items() ], floatfmt=""))
111 | ------------- -------------
112 | 1445405280.01 1445405280.03
113 | 1445405280.11 1445405280.23
114 | ------------- -------------
115 | >>> print(tabulate( [(list(flowtuple) + [b" ".join(msg.data for msg in msglst)]) for flowtuple, msglst in flows.conversations().items()] ))
116 | ----- -------------- -------------- ----- ---- ---------------------------------------
117 | dummy 192.168.23.100 192.168.23.245 10815 42 QQQ456789 QQQ7890AB RRR567890 RRR567890
118 | dummy 192.168.23.100 192.168.23.245 1717 2323 QQQ123456789 RRR890ABCDEFGH
119 | ----- -------------- -------------- ----- ---- ---------------------------------------
120 | >>> print(tabulate( [(list(flowtuple) + [b" ".join(msg.data for msg in msglst)]) for flowtuple, msglst in flows.c2sInConversations().items()] ))
121 | ----- -------------- -------------- ----- ---- -------------------
122 | dummy 192.168.23.100 192.168.23.245 10815 42 QQQ456789 QQQ7890AB
123 | dummy 192.168.23.100 192.168.23.245 1717 2323 QQQ123456789
124 | ----- -------------- -------------- ----- ---- -------------------
125 | >>> print(tabulate( [(list(flowtuple) + [b" ".join(msg.data for msg in msglst)]) for flowtuple, msglst in flows.s2cInConversations().items()] ))
126 | ----- -------------- -------------- ----- ---- -------------------
127 | dummy 192.168.23.100 192.168.23.245 10815 42 RRR567890 RRR567890
128 | dummy 192.168.23.100 192.168.23.245 1717 2323 RRR890ABCDEFGH
129 | ----- -------------- -------------- ----- ---- -------------------
130 | """
131 |
132 | def __init__(self, messages: List[L4NetworkMessage]):
133 | self._messages = messages
134 | self._flows = self._identify()
135 |
136 | @property
137 | def messages(self):
138 | return self._messages
139 |
140 | def _identify(self) -> Dict[Tuple, List[L4NetworkMessage]]:
141 | """
142 | Identify flows.
143 |
144 | :return: A dict mapping the 5-tuple
145 | (Layer-4 Protocol, Source IP, Destination IP, Source Port, Destination Port)
146 | to the list of messages in the flow denoted by the 5-tuple.
147 | """
148 | flows = dict() # type: Dict[Tuple[str,str,str,str,str], List[L4NetworkMessage]]
149 | # client is initiator, sort by packet date
150 | for msg in sorted(self._messages, key=lambda m: m.date): # type: L4NetworkMessage
151 | if not isinstance(msg, L4NetworkMessage):
152 | raise TypeError("To identify flows, all messages need to be from a known encapsulation with known "
153 | "network and transport layer protocols. No flow determined for "
154 | f"{type(msg).__name__}:\n{msg}")
155 | src = msg.source.rpartition(':')
156 | dst = msg.destination.rpartition(':')
157 | srcAddress = src[0]
158 | dstAddress = dst[0]
159 | srcPort = src[2]
160 | dstPort = dst[2]
161 | keytuple = (msg.l4Protocol, srcAddress, dstAddress, srcPort, dstPort)
162 | if keytuple not in flows:
163 | flows[keytuple] = list()
164 | flows[keytuple].append(msg)
165 | return flows
166 |
167 | @property
168 | def flows(self):
169 | return self._flows
170 |
171 | def conversations(self) -> Dict[Tuple, List[AbstractMessage]]:
172 | """
173 | "A conversation is formed of the two flows in opposite direction..." (FH, Section 2, Footnote 1)
174 | :return: Dict of conversations with the c2s flow tuple as key.
175 | """
176 | return {qkey: self._flows[qkey] + self._flows[rkey]
177 | for qkey,rkey in self._dialogs().items() if rkey is not None}
178 |
179 | def c2sInConversations(self) -> Dict[Tuple, List[AbstractMessage]]:
180 | """
181 | "A conversation is formed of the two flows in opposite direction..." (FH, Section 2, Footnote 1)
182 | :return: Dict of c2s messages per conversation with the c2s flow tuple as key.
183 | """
184 | return {qkey: self._flows[qkey] for qkey,rkey in self._dialogs().items() if rkey is not None}
185 |
186 | def s2cInConversations(self) -> Dict[Tuple, List[AbstractMessage]]:
187 | """
188 | "A conversation is formed of the two flows in opposite direction..." (FH, Section 2, Footnote 1)
189 | :return: Dict of s2c messages per conversation with the c2s flow tuple as key.
190 | """
191 | return {qkey: self._flows[rkey] for qkey,rkey in self._dialogs().items() if rkey is not None}
192 |
193 | def _dialogs(self) -> Dict[Tuple,Union[Tuple,None]]:
194 | """
195 | Find pairs of flows whose source and destination (addresses and ports) are the reverse of each other.
196 | """
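# Shape of the returned dict (editor's sketch): each conversation is represented as
# {c2sFlowKey: s2cFlowKey}, where the key is the flow that starts earlier (assumed to be the client);
# flows without a matching reverse flow map to None.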
197 | dialogs = dict()
198 | for keytuple in self._flows.keys():
199 | # exchange src and dst addresses and ports
200 | rkeytuple = (keytuple[0], keytuple[2], keytuple[1], keytuple[4], keytuple[3])
201 | if rkeytuple in dialogs:
202 | if dialogs[rkeytuple] is not None:
203 | raise Exception("Strange things happened here.")
204 | # identify the flow starting earlier as client (key in dialogs), the other as server (value in dialogs)
205 | if self._flows[rkeytuple][0].date < self._flows[keytuple][0].date:
206 | dialogs[rkeytuple] = keytuple
207 | else:
208 | del dialogs[rkeytuple]
209 | dialogs[keytuple] = rkeytuple
210 | else:
211 | dialogs[keytuple] = None
212 | return dialogs
213 |
214 | def splitDirections(self) -> Tuple[List[AbstractMessage],List[AbstractMessage]]:
215 | """
216 | Split list of messages into directions S2C and C2S based on flow information.
217 | Ignores all flows that have no reverse direction.
218 |
219 | FH, Section 2, Footnote 1
220 |
221 | :return Lists of messages, the first is client-to-server, the second is server-to-client
222 | """
223 | # merge all client flows into one and all server flows into another list of messages
224 | c2s = list(chain.from_iterable(self.c2sInConversations().values()))
225 | s2c = list(chain.from_iterable(self.s2cInConversations().values()))
226 | return c2s, s2c
227 |
228 | def matchQueryResponse(self):
229 | """
230 | Match queries with responses in the flows by identifying
231 | for each client-to-server message (query) the server-to-client message (response)
232 | that has the closest subsequent transmission time.
233 | """
234 | dialogs = self._dialogs()
235 | qr = dict()
236 |
237 | for keytuple in dialogs.keys():
238 | if dialogs[keytuple] is None:
239 | continue
240 | qlist = self._flows[keytuple].copy()
241 | rlist = self._flows[dialogs[keytuple]].copy()
242 |
243 | # assume qlist and rlist are sorted by query.date and resp.date
244 | prevquery = None
245 | for query in qlist:
246 | respFound = False
247 | for resp in rlist:
248 | # first response later than query
249 | if query.date < resp.date:
250 | qr[query] = resp
251 | respFound = True
252 | break
253 | if not respFound:
254 | continue
255 | # if the response to query seems to be the same as to the previous query...
256 | if prevquery is not None and qr[query] == qr[prevquery]:
257 | # ... ignore the earlier query since a response message in between seems to have gone missing.
258 | del qr[prevquery]
259 | prevquery = query
260 | return qr
261 |
262 |
263 | def ngramEntropy(messages: List[AbstractMessage], n=1):
264 | """
265 | The vertical entropies for each offset of all the n-grams at the same offset throughout all messages.
266 | Own entropy calculation implementation. See #pyitNgramEntropy
267 |
268 | FH, Section 3.2.1
269 | """
270 | ngIters = [NgramIterator(msg, n) for msg in messages]
271 | vEntropy = list()
272 |
273 | for ngrams in zip(*ngIters):
274 | vEntropy.append(MessageAnalyzer.calcEntropy(ngrams, 256))
275 |
276 | return vEntropy
277 |
278 |
279 | def intsFromNgrams(ngrams: Iterable[bytes], endianness='big') -> List[int]:
280 | r"""
281 | Convert an iterable of byte n-grams into a single integer per n-gram.
282 | This is useful to simplify working around numpy's issue with null-bytes:
283 | Issue #3878 (https://github.com/numpy/numpy/issues/3878)
284 |
285 | >>> from fieldhunter.utils.base import intsFromNgrams
286 | >>> ngramlist = [b"\x00\x00\x00", b"\x00\x11\x00", b"\xab\x00\x00", b"\xab\x11\x23", b"\x08\x15"]
287 | >>> ifn = intsFromNgrams(ngramlist)
288 | >>> # noinspection PyUnresolvedReferences
289 | >>> [hex(val) for val in ifn]
290 | ['0x0', '0x1100', '0xab0000', '0xab1123', '0x815']
291 |
292 | :param ngrams: Iterable of n-grams, one bytes string per n-gram
293 | :param endianness: The endianness to use to interpret the bytes.
294 | :return: List of integers, one holding the value of each bytes-string n-gram.
295 | """
296 | return [int.from_bytes(b, endianness) for b in ngrams]
297 |
298 |
299 | def pyitNgramEntropy(messages: List[AbstractMessage], n=1, endianness='big'):
300 | """
301 | The vertical entropies for each offset of all the n-grams at the same offset throughout all messages.
302 | Implementation of entropy calculation from pyitlib. See #ngramEntropy
303 |
304 | FH, Section 3.2.1
305 |
306 | >>> from netzob.Model.Vocabulary.Messages.L4NetworkMessage import L4NetworkMessage
307 | >>> from fieldhunter.utils.base import pyitNgramEntropy, ngramEntropy
308 | >>> messageList = [
309 | ... L4NetworkMessage(b"QQQ456789"), L4NetworkMessage(b"RRR567890"), L4NetworkMessage(b"QQQ7890AB"),
310 | ... L4NetworkMessage(b"RRR567890"), L4NetworkMessage(b"QQQ123456789"), L4NetworkMessage(b"RRR890A"),
311 | ... L4NetworkMessage(b"QQQ6789"), L4NetworkMessage(b"RRR890ABCDEFGH")
312 | ... ]
313 | >>> ngramEntropy(messageList) == pyitNgramEntropy(messageList)
314 | True
315 |
316 | :param messages: The list of messages to get the n-grams from.
317 | :param n: The n in n-gram.
318 | :param endianness: Endianness to interpret the n-grams in.
319 | """
320 | ngIters = [NgramIterator(msg, n) for msg in messages]
321 | vEntropy = list()
322 |
323 | for ngrams in zip(*ngIters): # type: List[bytes]
324 | # int.from_bytes is necessary because of numpy's issue with null-bytes: #3878
325 | # (https://github.com/numpy/numpy/issues/3878)
326 | vEntropy.append(drv.entropy(intsFromNgrams(ngrams, endianness))/(n*8))
327 |
328 | return vEntropy
329 |
330 |
331 | def mutualInformationNormalized(qInts: Union[List[List[int]],List[int]], rInts: Union[List[List[int]],List[int]]):
332 | """
333 | Calculate the Mutual Information between two lists of n-grams, e.g.,
334 | one list being queries and the other the according responses, normalized to the queries' entropy.
335 | Mutual information measures the information shared between the two lists. (FH, Section 3.2.1)
336 |
337 | >>> from fieldhunter.utils.base import mutualInformationNormalized, intsFromNgrams
338 | >>> queryNgramsConst = [b'QQQ', b'QQQ', b'QQQ', b'QQQ']
339 | >>> respoNgramsConst = [b'RRR', b'RRR', b'RRR', b'RRR']
340 | >>> queryNgramsCorr = [b'42', b'40', b'42', b'23', b'17']
341 | >>> respoNgramsCorr = [b'24', b'04', b'24', b'32', b'71']
342 | >>> queryNgramsPart = [b'42', b'40', b'42', b'23', b'17']
343 | >>> respoNgramsPart = [b'04', b'04', b'04', b'32', b'71']
344 | >>> # query and response n-grams are always constant: This allows no conclusion about any correlation => nan
345 | >>> mutualInformationNormalized(intsFromNgrams(queryNgramsConst), intsFromNgrams(respoNgramsConst))
346 | nan
347 | >>> # query and response n-grams always have corresponding values => perfectly correlated
348 | >>> mutualInformationNormalized(intsFromNgrams(queryNgramsCorr), intsFromNgrams(respoNgramsCorr))
349 | 1.0
350 | >>> # some query and response n-grams have corresponding values, multiple other queries have the same responses.
351 | >>> mutualInformationNormalized(intsFromNgrams(queryNgramsPart), intsFromNgrams(respoNgramsPart))
352 | 0.7133...
353 |
354 | :param qInts: List of (n-grams as int-list) or one int per realization
355 | :param rInts: List of (n-grams as int-list) or one int per realization
356 | :return: The mutual information normalized to the queries' entropy, or nan if the queries' entropy is zero.
357 | """
358 | assert len(qInts) > 0, "Entropy requires at least one query realization"
359 | assert len(rInts) > 0, "Entropy requires at least one reply realization"
360 | assert len(qInts) == len(rInts), "Mutual information requires the same amount of query and reply realizations"
361 | qEntropy = drv.entropy(qInts)
362 | if qEntropy != 0:
363 | return drv.information_mutual(qInts, rInts) / qEntropy
364 | else:
365 | return nan
366 |
367 |
368 | def qrAssociationCorrelation(mqr: Dict[L4NetworkMessage, L4NetworkMessage], n=1):
369 | """
370 | Take the matched query-response pairs (mqr)
371 | and associate n-gram offsets by mutual information as correlation metric.
372 |
373 | >>> from tabulate import tabulate
374 | >>> from netzob.Model.Vocabulary.Messages.L4NetworkMessage import L4NetworkMessage
375 | >>> from fieldhunter.utils.base import qrAssociationCorrelation
376 | >>> matchedqr = {
377 | ... L4NetworkMessage(b"QQQ456789"): L4NetworkMessage(b"RRR567890"),
378 | ... L4NetworkMessage(b"QQQ7890AB"): L4NetworkMessage(b"RRR567890"),
379 | ... L4NetworkMessage(b"QQQ123456789"): L4NetworkMessage(b"RRR890A"),
380 | ... L4NetworkMessage(b"QQQ6789"): L4NetworkMessage(b"RRR890ABCDEFGH"),
381 | ... }
382 | >>> qrAC = qrAssociationCorrelation(matchedqr)
383 | >>> print(tabulate(qrAC.items()))
384 | - -----
385 | 0 nan
386 | 1 nan
387 | 2 nan
388 | 3 0.5
389 | 4 0.5
390 | 5 0.5
391 | 6 0
392 | - -----
393 |
394 | For the first 3 bytes the normalized mutual information is undefined: see #mutualInformationNormalized()
395 |
396 | # TODO optimize efficiency by supporting an input filter, i. e.,
397 | calculate mutual information only for given ngram offsets
398 |
399 | :param mqr: Matched query-response pairs
400 | :param n: The length of the n-grams to use (in bytes)
401 | :returns: Offset => causality value
402 | """
403 | mutInf = dict()
404 | qIterators, rIterators = list(), list()
405 | for qrPair in mqr.items():
406 | qIterators.append(NgramIterator(qrPair[0], n))
407 | rIterators.append(NgramIterator(qrPair[1], n))
408 | while not all(qiter.exhausted for qiter in qIterators) or not all(riter.exhausted for riter in rIterators):
409 | qNgrams = list()
410 | rNgrams = list()
411 | # get two lists of n-grams with the same offset, one for queries, one for responses
412 | for qIter, rIter in zip(qIterators, rIterators):
413 | if not qIter.lookahead or not rIter.lookahead:
414 | # there are no more n-grams for either query or response for this pair of Q/R messages
415 | continue
416 | # fetch the next iteration for both messages in parallel.
417 | # A StopIteration should never occur here, since we checked the iterators' lookahead right before.
418 | qNgrams.append(next(qIter))
419 | rNgrams.append(next(rIter))
420 | # print("Q offset:", qIter.offset) # should be the same for all iterators in one while loop
421 | # print("R offset:", rIter.offset, "\n")
422 | if len(qNgrams) == 0 or len(rNgrams) == 0:
423 | break
424 | # print(qNgrams)
425 | # print(rNgrams, "\n")
426 | qInts = intsFromNgrams(qNgrams)
427 | rInts = intsFromNgrams(rNgrams)
428 | # qIter and rIter are always set here, otherwise the break on (len(qNgrams) == 0 or len(rNgrams) == 0)
429 | # above would have been triggered
430 | # noinspection PyUnboundLocalVariable
431 | if qIter.offset != rIter.offset:
432 | # NgramIterator remembers the offset of its current iteration. This must be the same in both messages.
433 | raise RuntimeError("The offsets in qrAssociationCorrelation calculation do not match:"
434 | f"{qIter.offset} {rIter.offset}\n{qNgrams}\n{rNgrams}")
435 | mutInf[qIter.offset] = mutualInformationNormalized(qInts, rInts)
436 | return mutInf
437 |
438 |
439 | def verticalByteMerge(mqr: Dict[L4NetworkMessage, L4NetworkMessage], offsets: Iterable[int]):
440 | # noinspection PyUnresolvedReferences
441 | """
442 | Returns two lists of integer-list representations of byte strings,
443 | one from all queries and one from all responses,
444 | containing the bytes at all offsets given as parameter.
445 |
446 | >>> from fieldhunter.utils.base import verticalByteMerge
447 | >>> messageMap = {
448 | ... L4NetworkMessage(b"QQQ456789"): L4NetworkMessage(b"RRR567890"),
449 | ... L4NetworkMessage(b"QQQ7890AB"): L4NetworkMessage(b"RRR567890"),
450 | ... L4NetworkMessage(b"QQQ123456789"): L4NetworkMessage(b"RRR890A"),
451 | ... L4NetworkMessage(b"QQQ6789"): L4NetworkMessage(b"RRR890ABCDEFGH"),
452 | ... }
453 | >>> verticalByteMerge(messageMap, [1])
454 | ([81, 81, 81, 81], [82, 82, 82, 82])
455 | >>> verticalByteMerge(messageMap, [1,2,3])
456 | ([5329204, 5329207, 5329201, 5329206], [5394997, 5394997, 5395000, 5395000])
457 | >>> qMsgs, rMsgs = verticalByteMerge(messageMap, [3,5,6])
458 | >>> # ints converted back to the bytes number 3, 5, and 6 from the keys in messageMap
459 | >>> [int.to_bytes(val, 3, 'big') for val in qMsgs]
460 | [b'467', b'790', b'134', b'689']
461 | >>> # ints converted back to the bytes number 3, 5, and 6 from the values in messageMap
462 | >>> [int.to_bytes(val, 3, 'big') for val in rMsgs]
463 | [b'578', b'578', b'80A', b'80A']
464 |
465 | :param mqr: Dict that maps one message to another.
466 | :param offsets: List of offsets for which the byte values should be returned.
467 | The offset must exist in all messages.
468 | :return: Two lists of integer representations of the byte values at the given offsets,
469 | one list for the keys and one for the values of the input dict.
470 | :raises IndexError: If an offset does not exist in any message.
471 | """
472 | sortedOffs = sorted(offsets)
473 | qMerge = list()
474 | rMerge = list()
475 | for query, resp in mqr.items():
476 | # int.from_bytes is necessary because of numpy's issue with null-bytes: #3878
477 | # (https://github.com/numpy/numpy/issues/3878)
478 | qMerge.append(int.from_bytes(bytes(query.data[o] for o in sortedOffs), 'big'))
479 | rMerge.append(int.from_bytes(bytes(resp.data[o] for o in sortedOffs), 'big'))
480 | return qMerge, rMerge
481 |
482 |
483 | def iterateSelected(toIter: Iterator, selectors: List[int]):
484 | """
485 | Only return selected iterations from an iterator.
486 |
487 | >>> from fieldhunter.utils.base import iterateSelected
488 | >>> bytesTuple = iter((b'QQQ456789', b'RRR567890', b'QQQ7890AB', b'RRR567890', b'QQQ123456789', b'RRR890A'))
489 | >>> bt2357 = iterateSelected(bytesTuple, [2,3,5,7])
490 | >>> next(bt2357)
491 | b'QQQ7890AB'
492 | >>> next(bt2357)
493 | b'RRR567890'
494 | >>> next(bt2357)
495 | b'RRR890A'
496 | >>> next(bt2357) # doctest: +IGNORE_EXCEPTION_DETAIL
497 | Traceback (most recent call last):
498 | StopIteration
499 |
500 | :param toIter: The iterator to traverse.
501 | :param selectors: The list of iteration indices to return.
502 | :return: A generator for all iterations in toIter that have an "index" selected by selectors.
503 | """
504 | return (element for offset, element in enumerate(toIter) if offset in selectors)
505 |
506 |
507 | def list2ranges(offsets: List[int]):
508 | """
509 | Generate ranges from a list of integer values. The ranges denote the starts and lengths of any subsequence of
510 | adjacent values, e. g. the list [1,2,3,6,7,20] would result in the ranges [(1,3),(6,2),(20,1)]
511 |
512 | >>> from fieldhunter.utils.base import list2ranges
513 | >>> list2ranges([1,2,3,6,7,20])
514 | [(1, 3), (6, 2), (20, 1)]
515 | >>> list2ranges([2,3,6,11,12,13,23,24,25,26])
516 | [(2, 2), (6, 1), (11, 3), (23, 4)]
517 | >>> list2ranges([2])
518 | [(2, 1)]
519 | >>> list2ranges([])
520 | []
521 | >>> list2ranges([-2]) # doctest: +IGNORE_EXCEPTION_DETAIL
522 | Traceback (most recent call last):
523 | ValueError: Offsets must be positive numbers.
524 |
525 | :param offsets: list of integers
526 | :return: list of ranges (tuples of offset and length) of the consecutive offsets.
527 | """
528 | soffs = sorted(offsets)
529 | ranges = list() # type: List[Tuple[int,int]]
530 | # offsets empty
531 | if len(soffs) == 0:
532 | return ranges
533 | if soffs[0] < 0:
534 | raise ValueError("Offsets must be positive numbers.")
535 | # only one offset
536 | if len(soffs) == 1:
537 | return [(soffs[0],1)]
538 | start = soffs[0]
539 | last = soffs[0]
540 | for offs in soffs[1:]:
541 | if offs > last + 1:
542 | ranges.append((start, last - start + 1))
543 | # start a new range
544 | start = offs
545 | last = offs
546 | # append dangling start/last
547 | ranges.append((start, last - start + 1))
548 |
549 | return ranges
550 |
551 |
552 | def ngramIsOverlapping(o0, n0, o1, n1):
553 | """
554 | Check if two ranges are overlapping. The ranges are defined by offset and length each.
555 |
556 | >>> ngramIsOverlapping(2,2,0,3)
557 | True
558 | >>> ngramIsOverlapping(2,2,0,2)
559 | False
560 | >>> ngramIsOverlapping(2,2,3,2)
561 | True
562 | >>> ngramIsOverlapping(2,2,4,2)
563 | False
564 |
565 | :param o0: Offset of n-gram 0
566 | :param n0: Length (n) of n-gram 0
567 | :param o1: Offset of n-gram 1
568 | :param n1: Length (n) of n-gram 1
569 | :return: True if overlapping, false otherwise
570 | """
571 | return o1 + n1 - 1 >= o0 and o1 < o0 + n0
572 |
573 |
--------------------------------------------------------------------------------
/src/fieldhunter/inference/fieldtypes.py:
--------------------------------------------------------------------------------
1 | """
2 | Infer message field types exactly according to the FieldHunter paper Section 3.2
3 |
4 | TODO introduce doctests to check critical functions in inference.fieldtypes
5 | """
6 | from typing import List, Tuple, Dict, Iterable, ItemsView, Union
7 | import random, logging
8 | from itertools import groupby, product, chain, combinations
9 | from collections import Counter
10 | from abc import ABC, abstractmethod
11 |
12 | import numpy
13 | from scipy.stats import pearsonr
14 | from pyitlib import discrete_random_variable as drv
15 | from netzob.Model.Vocabulary.Messages.AbstractMessage import AbstractMessage
16 | from netzob.Model.Vocabulary.Messages.L2NetworkMessage import L2NetworkMessage
17 | from netzob.Model.Vocabulary.Messages.L4NetworkMessage import L4NetworkMessage
18 |
19 | from nemere.inference.analyzers import Value
20 | from nemere.inference.segments import TypedSegment
21 | from tabulate import tabulate
22 |
23 | from fieldhunter.utils.base import qrAssociationCorrelation, verticalByteMerge, mutualInformationNormalized, \
24 | list2ranges, Flows, NgramIterator, iterateSelected, intsFromNgrams, \
25 | ngramIsOverlapping, pyitNgramEntropy
26 |
27 |
28 | # logging.getLogger(__name__).setLevel(logging.DEBUG)
29 |
30 |
31 | class FieldType(ABC):
32 | """
33 | Generic, abstract base class for field types. Holds segments and a type label for the inferred fields.
34 |
35 | For DocTest example see fieldhunter.inference.common#segmentedMessagesAndSymbols()
36 | """
37 | typelabel = None
38 |
39 | def __init__(self):
40 | self._segments = list()
41 |
42 | @property
43 | def segments(self) -> List[List[TypedSegment]]:
44 | """
45 | :return: Final result as segments that are of the inferred type.
46 | """
47 | return self._segments
48 |
49 | @classmethod
50 | def _posLen2segments(cls, messages: List[L2NetworkMessage],
51 | posLen: Union[Iterable[Tuple[int, int]],ItemsView[int, int]]) \
52 | -> List[List[TypedSegment]]:
53 | """
54 | Generate Segments from (remaining) field ranges.
55 |
56 | For DocTest example see fieldhunter.inference.common#segmentedMessagesAndSymbols()
57 |
58 | :param messages: Messages to generate n-grams to correlate to.
59 | :param posLen: List of start-length tuples to create segments for from each message.
60 | :return: Lists of segments per message generated from the posLen parameter.
61 | """
62 | segments = list()
63 | for message in messages:
64 | mval = Value(message)
65 | segs4msg = list()
66 | for start, length in posLen:
67 | # check if boundaries fit into message
68 | if start + length <= len(mval.values):
69 | segs4msg.append(TypedSegment(mval, start, length, cls.typelabel))
70 | segments.append(segs4msg)
71 | return segments
72 |
73 |
74 | class NonConstantNonRandomEntropyFieldType(FieldType, ABC):
75 | """
76 | Abstract class for inferring field types using entropy of n-gram values
77 | where the entropy may neither be 0 (constant n-gram values)
78 | nor equal or greater than a threshold (random n-gram values).
79 | """
80 | # Value for entropyThresh not given in FH!
81 | # We use a constant entropyThresh of 0.4 determined by own empirics (results of /src/trace_statistics.py in
82 | # nemesys-reports/NEMEFTR/fieldhunter/typeAndLengthEntropies.ods)
83 | entropyThresh = 0.4
84 |
85 | @classmethod
86 | def entropyFilteredOffsets(cls, messages: List[AbstractMessage], n: int):
87 | """
88 | Find offsets of n-grams (with the same offset in different messages of the list), that are not constant and not
89 | random, i. e., that have an entropy between 0 and cls.entropyThresh (exclusive).
90 |
91 | FH, Section 3.2.1
92 |
93 | :param messages: Messages to generate n-grams from
94 | :param n: The $n$ in n-gram
95 | :return: Returns a list of offsets that have non-constant and non-random (below entropyThresh) entropy.
96 | """
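# Illustrative sketch (editor's example): if the per-offset 1-gram entropies were [0.0, 0.125, 0.25, 0.9]
# with entropyThresh = 0.4, only offsets 1 and 2 would pass the filter (0 < entropy < entropyThresh).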
97 | entropy = pyitNgramEntropy(messages, n)
98 | ePo = list(enumerate(entropy))
99 | logging.getLogger(__name__).debug(f"Entropies per offset:\n{tabulate(ePo)}")
100 | return [offset for offset, ent in enumerate(entropy) if 0 < ent < cls.entropyThresh]
101 |
102 |
103 | class MSGtype(NonConstantNonRandomEntropyFieldType):
104 | """
105 | Message type (MSG-Type) inference (FH, Section 3.2.1, Fig. 3 left).
106 | This type heuristic is based on the mutual information shared between n-grams at the same offset in the query and
107 | response messages. This assumes that message type fields are at the same position in query and response. Moreover,
108 | fields that solely denote whether a message is a query or a response yield an undefined mutual information and thus
109 | cannot be detected as fields denoting a message type.
110 |
111 | The properties of this class provide access to intermediate and final results.
112 | """
113 | typelabel = "MSG-Type"
114 | causalityThresh = 0.8
115 |
116 | def __init__(self, flows: Flows):
117 | super().__init__()
118 |
119 | logger = logging.getLogger(__name__)
120 | c2s, s2c = flows.splitDirections() # type: List[L4NetworkMessage], List[L4NetworkMessage]
121 |
122 | # discard constant and random offsets
123 | self._c2sEntropyFiltered = type(self).entropyFilteredOffsets(c2s, 1)
124 | self._s2cEntropyFiltered = type(self).entropyFilteredOffsets(s2c, 1)
125 | logger.info(f"c2sEntropyFiltered offsets: {self.c2sEntropyFiltered}")
126 | logger.info(f"s2cEntropyFiltered offsets: {self.s2cEntropyFiltered}")
127 |
128 | # compute Q->R association
129 | mqr = flows.matchQueryResponse()
130 | if len(mqr) < 2:
131 | # not enough query response pairs to continue analysis: create valid empty instance state and return
132 | self._qrCausality = dict()
133 | self._filteredCausality = dict()
134 | self._mergingOffsets = list()
135 | self._mergedCausality = dict()
136 | self._msgtypeRanges = list()
137 | self._segments = list()
138 | return
139 | # Mutual information
140 | self._qrCausality = qrAssociationCorrelation(mqr)
141 | # filter: only if offset is in c2sEntropyFiltered/s2cEntropyFiltered and the causality is greater than the causalityThresh
142 | self._filteredCausality = {offset: self.qrCausality[offset] for offset in
143 | set(self.c2sEntropyFiltered).intersection(self.s2cEntropyFiltered)
144 | if self.qrCausality[offset] > type(self).causalityThresh}
145 | # filteredCausality are offsets of MSG-Type candidate n-grams
146 | logger.info(f"filtered causality: {sorted(self.filteredCausality.items())}")
147 |
148 | # Merge n-grams above causality threshold and check correlation
149 | self._mergingOffsets = list()
150 | for offset in sorted(self.filteredCausality.keys()):
151 | self._mergingOffsets.append(offset)
152 | qMergedField, rMergedField = verticalByteMerge(mqr, self.offsets)
153 | mergedCausality = mutualInformationNormalized(qMergedField, rMergedField)
154 | if mergedCausality <= type(self).causalityThresh:
155 | # Filter problematic n-grams
156 | self._mergingOffsets.pop()
157 | # re-calculate in case the last iteration removed a problematic n-gram
158 | qMergedField, rMergedField = verticalByteMerge(mqr, self.offsets)
159 | self._mergedCausality = mutualInformationNormalized(qMergedField, rMergedField)
160 | logger.info(f"mergedCausality: {self.mergedCausality}")
161 | logger.info(f" mergedOffsets: {self._mergingOffsets}")
162 | logger.info(f" from offsets: {sorted(self.filteredCausality.keys())}")
163 |
164 | # create segments from bytes in mergingOffsets
165 | self._msgtypeRanges = list2ranges(self.offsets)
166 | self._segments = type(self)._posLen2segments(c2s + s2c, self._msgtypeRanges)
167 |
168 |
169 | @property
170 | def s2cEntropyFiltered(self) -> List[int]:
171 | """
172 | :return: The offsets for which the vertical entropies of all the server to client messages are
173 | greater than zero and less than MSGtype.entropyThresh
174 | """
175 | return self._s2cEntropyFiltered
176 |
177 | @property
178 | def c2sEntropyFiltered(self) -> List[int]:
179 | """
180 | :return: The offsets for which the vertical entropies of all the client to server messages are
181 | greater than zero and less than MSGtype.entropyThresh
182 | """
183 | return self._c2sEntropyFiltered
184 |
185 | @property
186 | def qrCausality(self) -> Dict[int,float]:
187 | return self._qrCausality
188 |
189 | @property
190 | def filteredCausality(self) -> Dict[int,float]:
191 | return self._filteredCausality
192 |
193 | @property
194 | def mergedCausality(self) -> List[int]:
195 | return self._mergedCausality
196 |
197 | @property
198 | def offsets(self):
199 | """
200 | :return: Final result as individual byte offsets of offsets that are MSG-Types
201 | """
202 | return self._mergingOffsets
203 |
204 | @property
205 | def ranges(self) -> List[Tuple[int, int]]:
206 | """
207 | :return: Final result as ranges of offsets that are MSG-Types
208 | """
209 | return self._msgtypeRanges
210 |
211 |
212 | class MSGlen(NonConstantNonRandomEntropyFieldType):
213 | """
214 | Message length (MSG-Len) inference (FH, Section 3.2.2, Fig. 3 center).
215 | Find values in the message that linearly correlate with the application-layer message size.
216 |
217 | Properties enable access to intermediate and final results.
218 | """
219 | typelabel = "MSG-Len"
220 | # coefficient threshold 0.6 (FH, Section 3.2.2)
221 | minCorrelation = 0.6
222 | # MSG-Len hypothesis threshold 0.9 (FH, Section 3.2.2)
223 | lenhypoThresh = 0.9
224 |
225 | def __init__(self, flows: Flows):
226 | super().__init__()
227 |
228 | self._msgDirection = list()
229 | c2s, s2c = flows.splitDirections() # type: List[L4NetworkMessage], List[L4NetworkMessage]
230 | # per direction - for MSG-Len this is pointless, but the paper says to do it.
231 | # it might rather be useful to separate message types (distinct formats) in this manner.
232 | for direction in [c2s, s2c]:
233 | self._msgDirection.append(type(self).Direction(direction))
234 |
235 | @property
236 | def acceptedCandidatesPerDir(self) -> List[Dict[int, int]]:
237 | return [mldir.acceptedCandidates for mldir in self._msgDirection]
238 |
239 | @property
240 | def segments(self) -> List[List[TypedSegment]]:
241 | return list(chain.from_iterable([mldir.segments for mldir in self._msgDirection]))
242 |
243 | class Direction(object):
244 | """
245 | Encapsulates direction-wise inference of MSGlen fields.
246 | Roughly corresponds to either the S2C-collection or C2S-collection branch
247 | depicted in the flow graph of FH, Fig. 3 center.
248 |
249 | Provides methods to extract different size collections, finding candidates by Pearson correlation coefficient,
250 | and verifying the hypothesis of candidates denoting the length of the message.
251 | """
252 | # TODO also support little endian (for our test traces, it was irrelevant)
253 | endianness = 'big'
254 |
255 | def __init__(self, direction: List[L4NetworkMessage]):
256 | self._direction = direction
257 | # noinspection PyTypeChecker
258 | self._msgbylen = None # type: Dict[int, List[L4NetworkMessage]]
259 | """Homogeneous Size Collections"""
260 | # noinspection PyTypeChecker
261 | self._msgmixlen = None # type: List[L4NetworkMessage]
262 | # noinspection PyTypeChecker
263 | self._candidateAtNgram = None # type: Dict[int, List[int]]
264 | # noinspection PyTypeChecker
265 | self._acceptedCandidates = None # type: Dict[int, int]
266 | """Associates offset with a field length (n-gram's n) to define a list of unambiguous MSG-Len candidates"""
267 | # noinspection PyTypeChecker
268 | self._acceptedX = None # type: Dict[int, numpy.ndarray]
269 | """Maps offsets to (a,b) that solve the linear equation in #verifyCandidates()
270 | (FH: 'Msg. Len. Model Parameters')"""
271 |
272 | self.differentSizeCollections()
273 | self.findCandidates()
274 | self.verifyCandidates()
275 |
276 | # create segments for each accepted candidate
277 | self._segments = MSGlen._posLen2segments(self._direction, self.acceptedCandidates.items())
278 |
279 | def differentSizeCollections(self):
280 | """
281 | "stratifying messages by length": extract different size collection -> vector of message lengths
282 |
283 | Populates self._msgmixlen with a size-heterogeneous collection that contains an equal amount of messages
284 | of each occurring length (the size of the smallest Homogeneous Size Collection), sampled from self._msgbylen.
285 | """
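# Illustrative sketch (editor's example): with 5 messages of length 20 and 3 messages of length 28,
# minCollSize is 3, so 3 of the length-20 messages are sampled and all 3 length-28 messages are kept,
# yielding a size-heterogeneous collection of 6 messages.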
286 | if len(self._direction) == 0: # "No flows in this direction."
287 | self._msgmixlen = list()
288 | return
289 | keyfunc = lambda m: len(m.data)
290 | # Homogeneous Size Collections
291 | self._msgbylen = {k: list(v) for k, v in groupby(sorted(self._direction, key=keyfunc), keyfunc)}
292 | minCollSize = min(len(v) for v in self._msgbylen.values())
293 | # generate size-heterogeneous collection by random sampling
294 | msgmixlen = list()
295 | for k, v in self._msgbylen.items():
296 | random.seed(42)
297 | if len(v) > minCollSize:
298 | msgmixlen.extend(random.sample(v, k=minCollSize))
299 | else:
300 | msgmixlen.extend(v)
301 | self._msgmixlen = msgmixlen
302 |
303 | def findCandidates(self):
304 | """
305 | Find message-length candidates (in parenthesis: block names from FH, Fig. 3 center):
306 | * filter for message offsets where the n-gram is not constant and not random (Entropy Filter)
307 | * correlate n-grams to message lengths (Pearson Correlation)
308 |
309 | :return: The offsets (dict value: list) where the Pearson Correlation
310 | exceeds the threshold MSGlen.minCorrelation
311 | for different sized n-grams (dict key).
312 | """
313 | # "Extract Vector of Message Length"
314 | lens4msgmix = [len(m.data) for m in self._msgmixlen] # type: List[int]
315 | candidateAtNgram = dict()
316 | # iterate n-grams' n=32, 24, 16 bits (4, 3, 2 bytes), see 3.1.2
317 | for n in [4, 3, 2]:
318 | # entropy filter for each n-gram offset for "Field Values Matrix" below
319 | offsets = MSGlen.entropyFilteredOffsets(self._msgmixlen, n)
320 | # TODO currently only tested for big endian, see #intsFromNgrams
321 | # TODO for textual protocols decode the n-gram as (ASCII) number (FH, Sec. 3.2.2, second paragraph)
322 | ngIters = (intsFromNgrams(
323 | iterateSelected(NgramIterator(msg, n), offsets), type(self).endianness) for msg in self._msgmixlen)
324 | # "Field Values Matrix"
325 | ngramsAtOffsets = numpy.array(list(ngIters))
326 |
327 | # correlate columns of ngramsAtOffsets to lens4msgmix
328 | pearsonAtOffset = list()
329 | for ngrams in ngramsAtOffsets.T:
330 | # Pearson correlation coefficient (numeric value of n-gram) -> (len(msg.data))
331 | pearsonAtOffset.append(pearsonr(ngrams, lens4msgmix)[0])
332 | candidateAtNgram[n] = [o for pao, o in zip(pearsonAtOffset, offsets) if pao > MSGlen.minCorrelation]
333 | self._candidateAtNgram = candidateAtNgram
334 |
335 | def verifyCandidates(self):
336 | """
337 | Verify the length-hypothesis for candidates, by solving the linear equation
338 | for values at the candidate n-grams in candidateAtNgram (precedence for larger n, i. e., longer fields):
339 |
340 | MSG_len = a * value + b (a > 0, b \in N) - "Msg. Len. Model Parameters"
341 | lens4msgmix = ngramsAtOffsets[:,candidateAtNgram[n]] * a + 1 * b
342 |
343 | At least a fraction of lenhypoThresh (0.9) of the message pairs with different lengths has to fulfill the hypothesis.
344 | """
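# Worked example (editor's sketch): a 2-byte candidate holding value 16 in a 20-byte message and value 24
# in a 28-byte message yields A = [[16, 1], [24, 1]], B = [20, 28] and the solution X = (a, b) = (1.0, 4.0),
# i. e. length = value + 4; the pair is acceptable since a > 0 and b is an integer.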
345 | acceptedCandidates = dict() # type: Dict[int, int]
346 | acceptedX = dict()
347 | # specifying found acceptable solutions at offset (key) with n (value) for this direction
348 | for n in [4, 3, 2]:
349 | for offset in self._candidateAtNgram[n]:
350 | # check precedence: if longer already-accepted n-gram overlaps this offset ignore
351 | # noinspection PyTypeChecker
352 | if not MSGlen.checkPrecedence(offset, n, acceptedCandidates.items()):
353 | continue
354 | # MSG-len hypothesis test - for ALL message pairs with different lengths (FH, 3.2.2 last paragraph)
355 | # - for the n-grams from this offset - keep only those offsets, where the threshold of pairs holds
356 | solutionAcceptable = dict() # type: Dict[Tuple[AbstractMessage, AbstractMessage], bool]
357 | Xes = list()
358 | for l1, l2 in combinations(self._msgbylen.keys(),2):
359 | for msg0, msg1 in product(self._msgbylen[l1], self._msgbylen[l2]):
360 | ngramPair = [msg0.data[offset:offset + n], msg1.data[offset:offset + n]]
361 | if ngramPair[0] == ngramPair[1]:
362 | solutionAcceptable[(msg0, msg1)] = False
363 | continue
364 | A = numpy.array( [intsFromNgrams(ngramPair), [1, 1]] ).T
365 | B = numpy.array( [len(msg0.data), len(msg1.data)] )
366 | try: # solve the linear equation
367 | X = numpy.linalg.inv(A).dot(B)
368 | solutionAcceptable[(msg0, msg1)] = X[0] > 0 and X[1].is_integer()
369 | Xes.append(X)
370 | except numpy.linalg.LinAlgError:
371 | print("LinAlgError occurred. Solution considered as non-acceptable.")
372 | solutionAcceptable[(msg0, msg1)] = False
373 | Xarray = numpy.array(Xes)
374 | logging.getLogger(__name__).debug(f"Checking candidate with n = {n} at offset {offset}.")
375 | if type(self)._candidateIsAcceptable(solutionAcceptable, Xarray):
376 | acceptedCandidates[offset] = n
377 | acceptedX[offset] = Xarray
378 | self._acceptedCandidates = acceptedCandidates
379 | self._acceptedX = acceptedX
380 |
381 | @staticmethod
382 | def _candidateIsAcceptable(solutionAcceptable: Dict[Tuple[AbstractMessage, AbstractMessage], bool],
383 | Xarray: numpy.ndarray):
384 | """
385 | Count the message pairs for which the solution is acceptable according to the MSG-len hypothesis test.
386 |
387 | :param solutionAcceptable: results of the MSG-len hypothesis test for the Cartesian product of messages.
388 | :return: Whether this candidate is acceptable using MSGlen.lenhypoThresh.
389 | """
390 | acceptCount = Counter(solutionAcceptable.values())
391 | return bool(acceptCount[True] / sum(acceptCount.values()) > MSGlen.lenhypoThresh)
392 |
393 | @property
394 | def acceptedCandidates(self) -> Dict[int, int]:
395 | """Associates offset with a field length (n-gram's n) to define a list of unambiguous MSG-Len candidates"""
396 | return self._acceptedCandidates
397 |
398 | @property
399 | def segments(self):
400 | return self._segments
401 |
402 | @staticmethod
403 | def checkPrecedence(offset: int, n: int, ngrams: Iterable[Tuple[int, int]]):
404 | """
405 | Has n-gram at offset precedence over all n-grams in ngrams?
406 |
407 | :param offset:
408 | :param n:
409 | :param ngrams: offset and n for a list of n-grams
410 | :return:
411 | """
412 | for o1, n1 in ngrams:
413 | if ngramIsOverlapping(offset, n, o1, n1):
414 | return False
415 | return True
416 |
417 |
418 | class CategoricalCorrelatedField(FieldType,ABC):
419 | """
420 | Abstract class for inferring field types using categorical correlation of n-gram values with external values, e. g.,
421 | environmental information like addresses from encapsulation.
422 |
423 | CategoricalCorrelatedField#correlate() uses #_values2correlate2() to determine what to correlate.
424 | This is different for the subclasses (HostID, SessionID). It iterates the n-grams (n=1) and creates
425 | n-grams-to-source-IP-tuples for Host-ID or (n-grams,(source IP,destination IP))-tuples for Session-ID.
426 | It correlates the n-grams to the respective tuple by calculating the catCorr for the
427 | n-gram and the source/destination tuple.
428 | """
429 | correlationThresh = 0.9 # 0.9, threshold for correlation between host ID and IP address(es) (FH, Sec. 3.2.3)
430 | minLenThresh = 4 # host ID fields must at least be 4 bytes long (FH, Sec. 3.2.3)
431 |
432 | def __init__(self, messages: List[L4NetworkMessage]):
433 | super().__init__()
434 | self._messages = messages
435 | filteredMessages = type(self)._filterMessages(messages)
436 | # We correlate only the filtered messages...
437 | self._categoricalCorrelation = type(self).correlate(filteredMessages)
438 | self._catCorrPosLen = type(self).catCorrPosLen(self._categoricalCorrelation)
439 | # ... but use the positions to generate segments for all messages. This might not always be wise.
440 | self._segments = type(self)._posLen2segments(self._messages, self._catCorrPosLen)
441 |
442 | @classmethod
443 | def _filterMessages(cls, messages: List[L4NetworkMessage]):
444 | """
445 | Filter messages used to correlate in the first place. To be overwritten by subclasses.
446 | This basic implementation passes the input list unchanged.
447 | """
448 | return messages
449 |
450 | @classmethod
451 | @abstractmethod
452 | def _values2correlate2(cls, messages: List[L4NetworkMessage]) -> List[int]:
453 | """
454 | Implement to determine the external values to correlate the n-grams of messages with.
455 |
456 | :param messages: Messages for which to generate correlation values.
457 | :return: The list of values, one for each message in the given order, to correlate to.
458 | """
459 | raise NotImplementedError("Implement this abstract class method in a subclass.")
460 |
461 | @classmethod
462 | def correlate(cls, messages: List[L4NetworkMessage], n: int = 1):
463 | # noinspection PyShadowingNames
464 | r"""
465 | Generate n-grams at the same offsets for each message and correlate each n-gram using
466 | categorical correlation: R(x, y) = I(x; y)/H(x, y) \in [0,1]
467 | Uses cls#n to determine the n-gram sizes and cls#_values2correlate2() to obtain tuples of data to correlate.
468 |
469 | >>> from fieldhunter.inference.fieldtypes import SessionID
470 | >>> from netzob.Model.Vocabulary.Messages.L4NetworkMessage import L4NetworkMessage
471 | >>> messages = [
472 | ... L4NetworkMessage(b"session111\x42\x17\x23\x00\x08\x15",
473 | ... l3SourceAddress="1.2.3.100", l3DestinationAddress="1.2.3.1"),
474 | ... L4NetworkMessage(b"session111xe4\x83\x82\x85\xbf",
475 | ... l3SourceAddress="1.2.3.100", l3DestinationAddress="1.2.3.1"),
476 | ... L4NetworkMessage(b"session111\x42\x17\xf9\x0b\x00b\x12O",
477 | ... l3SourceAddress="1.2.3.100", l3DestinationAddress="1.2.3.1"),
478 | ... L4NetworkMessage(b"session222\x42\x17Jk\x8a1e\xb5",
479 | ... l3SourceAddress="1.2.3.2", l3DestinationAddress="1.2.3.100"),
480 | ... L4NetworkMessage(b"session222L\xab\x83\x1a\xef\x13",
481 | ... l3SourceAddress="1.2.3.2", l3DestinationAddress="1.2.3.100"),
482 | ... ]
483 | >>> SessionID.correlate(messages)
484 | [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.21851654863632566, 0.21851654863632566, 0.4181656600790516, 0.4181656600790516, 0.4181656600790516, 0.4181656600790516]
485 |
486 | :param messages: Messages to generate n-grams to correlate to.
487 | :param n: Host-ID uses 8-bit/1-byte n-grams according to FH, Sec. 3.1.2, but this does not work well
488 | (see fieldtypesRelaxed.CategoricalCorrelatedField)
489 | :return: Correlation values for each offset of n-grams generated from the messages.
490 | """
491 | # ngram at offset and src address
492 | ngramsSrcs = list()
493 | categoricalCorrelation = list()
494 | corrValues = cls._values2correlate2(messages)
495 | # Iterate n-grams of all messages
496 | for ngrams in zip(*(NgramIterator(msg, n=n) for msg in messages)):
497 | ngSc = cls._combineNgrams2Values(ngrams, corrValues)
498 | if ngSc.size > 0:
499 | # categoricalCorrelation: R(x, y) = I(x; y)/H(x, y) \in [0,1]
500 | catCorr = drv.information_mutual(ngSc[0], ngSc[1]) / drv.entropy_joint(ngSc)
501 | else:
502 | catCorr = numpy.nan
503 | ngramsSrcs.append(ngSc)
504 | categoricalCorrelation.append(catCorr)
505 | return categoricalCorrelation
506 |
507 | @classmethod
508 | def catCorrPosLen(cls, categoricalCorrelation: List[float]):
509 | """
510 | Merge consecutive candidate n-grams with categoricalCorrelation > correlationThresh.
511 | Filters n-gram offsets on defined thresholds (FH, Sec. 3.2.3) by their categorical correlation values to
512 | * correlation between host ID and IP address(es) > correlationThresh
513 | * discard short fields < minLenThresh
514 |
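Illustrative example (editor's sketch, not from FH): four consecutive offsets above correlationThresh
form one field that satisfies minLenThresh, while the isolated high value at offset 5 is discarded:

>>> from fieldhunter.inference.fieldtypes import HostID
>>> HostID.catCorrPosLen([0.95, 0.95, 0.95, 0.95, 0.2, 0.95])
[(0, 4)]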
515 | :param categoricalCorrelation: Correlation values for each offset of n-grams generated from the messages.
516 | :return: List of start-length tuples with categorical correlation above threshold and not being a short field.
517 | """
518 | catCorrOffsets = [ offset for offset, catCorr in enumerate(categoricalCorrelation)
519 | if catCorr > cls.correlationThresh ]
520 | catCorrRanges = list2ranges(catCorrOffsets)
521 | # discard short fields < minLenThresh
522 | return [ (start, length) for start, length in catCorrRanges if length >= cls.minLenThresh ]
523 |
524 | @property
525 | def categoricalCorrelation(self):
526 | # !! The attribute self._categoricalCorrelation needs to be defined in subclass init !!
527 | # noinspection PyUnresolvedReferences
528 | return self._categoricalCorrelation
529 |
530 | @classmethod
531 | def _combineNgrams2Values(cls, ngrams: Iterable[bytes], values: List[int]) -> numpy.ndarray:
532 | return numpy.array([intsFromNgrams(ngrams), values])
533 |
534 | @classmethod
535 | def _srcDstBytes(cls, messages: List[L4NetworkMessage]):
536 | return [ (
537 | bytes(map(int, msg.source.rpartition(':')[0].split('.'))),
538 | bytes(map(int, msg.destination.rpartition(':')[0].split('.')))
539 | ) for msg in messages]
540 |
541 |
542 | class HostID(CategoricalCorrelatedField):
543 | """
544 | Host identifier (Host-ID) inference (FH, Sec. 3.2.3)
545 | Find n-gram that is strongly correlated with IP address of sender.
546 | """
547 | typelabel = 'Host-ID'
548 |
549 | @classmethod
550 | def _values2correlate2(cls, messages: List[L4NetworkMessage]):
551 | """
552 | Recover byte representations of the IPv4 addresses from all Netzob messages and make one int out of each.
553 | :param messages: Messages to generate n-grams to correlate to.
554 | :return:
555 | """
556 | return intsFromNgrams(src for src, dst in cls._srcDstBytes(messages))
557 |
558 |
559 | class SessionID(CategoricalCorrelatedField):
560 | """
561 | Session identifier (Session-ID) inference (FH, Section 3.2.4)
562 | Find n-gram that is strongly correlated with IP addresses of sender and receiver
563 | using categorical correlation like Host-ID.
564 |
565 | Most of FH, Section 3.2.4, refers to Host-ID, so we use all missing details from there and reuse the implementation.
566 | The only difference is the values to correlate (see #_values2correlate2()).
567 | """
568 | typelabel = 'Session-ID'
569 |
570 | @classmethod
571 | def _values2correlate2(cls, messages: List[L4NetworkMessage]):
572 | """
573 | Get source AND destination addresses in the same manner as (just) the source for Host-ID.
574 | Recover byte representations of the IPv4 addresses from all Netzob messages and make one int out of each.
575 |
576 | :param messages: Messages to generate n-grams to correlate to.
577 | :return: integer representation of source and destination addresses for each message.
578 | """
579 | return intsFromNgrams(src+dst for src, dst in cls._srcDstBytes(messages))
580 |
581 |
582 | class TransID(FieldType):
583 | """
584 | Transaction identifier (Trans-ID) inference (FH, Section 3.2.5, Fig. 3 right)
585 | """
586 | typelabel = 'Trans-ID'
587 |
588 | transSupportThresh = 0.8 # enough support in conversations (FH, Sec. 3.2.5)
589 | minFieldLength = 2 # merged n-grams must at least be this amount of bytes long
590 | # n-gram size is not explicitly given in FH, but the description (merging, sharp drops in entropy in Fig. 6)
591 | # leads to assuming it should be 1.
592 | n = 1
593 | entropyThresh = 0.6 # Value is not given in FH paper!
594 | """
595 | entropy in c2s/s2c + flows: threshold for high entropy is not given in FH!
596 | We use a value determined by own empirics: see entropy plots from src/trace_statistics.py
597 | """
598 | absoluteEntropy = True
599 |
600 | def __init__(self, flows: Flows):
601 | super().__init__()
602 |
603 | # prepare instance attributes
604 | self._flows = flows
605 | self._c2s, self._s2c = self._flows.splitDirections() # type: List[L4NetworkMessage], List[L4NetworkMessage]
606 | self._c2sEntropyFiltered = None
607 | self._s2cEntropyFiltered = None
608 | self._c2sConvsEntropyFiltered = dict()
609 | self._s2cConvsEntropyFiltered = dict()
610 | self._c2sHorizontalOffsets = None
611 | self._s2cHorizontalOffsets = None
612 | self._c2sCombinedOffsets = None
613 | self._s2cCombinedOffsets = None
614 | self._valuematch = dict()
615 | # noinspection PyTypeChecker
616 | self._c2sConsistentRanges = None # type: Iterable[Tuple[int, int]]
617 | # noinspection PyTypeChecker
618 | self._s2cConsistentRanges = None # type: Iterable[Tuple[int, int]]
619 |
620 | # Infer
621 | self._verticalAndHorizontalRandomNgrams()
622 | self._constantQRvalues()
623 | self._consistentCandidates()
624 | # TODO not needed for textual protocols (FH, Sec. 3.2.5, last sentence)
625 | self._c2sConsistentRanges = type(self)._mergeAndFilter(self._c2sConsistentCandidates)
626 | self._s2cConsistentRanges = type(self)._mergeAndFilter(self._s2cConsistentCandidates)
627 | self._segments = \
628 | type(self)._posLen2segments(self._c2s, self._c2sConsistentRanges) + \
629 | type(self)._posLen2segments(self._s2c, self._s2cConsistentRanges)
630 |
631 | @classmethod
632 | def entropyFilteredOffsets(cls, messages: List[AbstractMessage], absolute=True):
633 | """
634 | Find offsets of n-grams (with the same offset in different messages of the list) that are random,
635 | i. e., that have an entropy greater than entropyThresh (cls.entropyThresh or relative).
636 |
637 | FH, Section 3.2.5
638 |
639 | :param messages: Messages to generate n-grams from
640 | :param absolute: Use the absolute constant for the threshold if true,
641 | make it relative to the maximum entropy if False.
642 | :return: Returns a list of offsets whose entropy exceeds the (absolute or relative) entropyThresh, i. e., random offsets.
643 | """
644 | if len(messages) > 0:
645 | entropy = pyitNgramEntropy(messages, cls.n)
646 | entropyThresh = cls.entropyThresh if absolute else max(entropy) * cls.entropyThresh
647 | return [offset for offset, ngramEntropy in enumerate(entropy) if ngramEntropy > entropyThresh]
648 | else:
649 | return []
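# Illustrative sketch of the thresholding above (hypothetical per-offset entropies, not taken
# from any real trace); with cls.entropyThresh = 0.6 and absolute=True:
#   >>> entropies = [0.05, 0.92, 0.71, 0.30]                # entropy of the 1-grams at offsets 0..3
#   >>> [o for o, e in enumerate(entropies) if e > 0.6]
#   [1, 2]
# With absolute=False the threshold would be max(entropies) * 0.6 = 0.552, which here selects
# the same offsets.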
650 |
651 | def _verticalAndHorizontalRandomNgrams(self):
652 | """
653 | Determine n-grams that are "random across vertical and horizontal collections" (FH, Sec. 3.2.5).
654 |
655 | Output is written to self._c2sCombinedOffsets and self._s2cCombinedOffsets.
656 | Moreover, intermediate results are persisted in instance attributes for evaluation.
657 | """
658 | logger = logging.getLogger(__name__)
659 | # vertical collections
660 | c2s, s2c = self._flows.splitDirections() # type: List[L4NetworkMessage], List[L4NetworkMessage]
661 | self._c2sEntropyFiltered = type(self).entropyFilteredOffsets(c2s, type(self).absoluteEntropy)
662 | self._s2cEntropyFiltered = type(self).entropyFilteredOffsets(s2c, type(self).absoluteEntropy)
663 | logger.info(f"c2sEntropyFiltered offsets: {self._c2sEntropyFiltered}")
664 | logger.info(f"s2cEntropyFiltered offsets: {self._s2cEntropyFiltered}")
665 |
666 | # # DEBUGGING horizontal collections: intermediate entropy of n-grams
667 | # self._c2sConvsEntropy = dict()
668 | # for key, conv in self._flows.c2sInConversations().items():
669 | # self._c2sConvsEntropy[key] = pyitNgramEntropy(conv, type(self).n)
670 | # self._s2cConvsEntropy = dict()
671 | # for key, conv in self._flows.s2cInConversations().items():
672 | # self._s2cConvsEntropy[key] = pyitNgramEntropy(conv, type(self).n)
673 | # print('_c2sConvsEntropy')
674 | # pprint(self._c2sConvsEntropy)
675 | # print('_s2cConvsEntropy')
676 | # pprint(self._s2cConvsEntropy)
677 | #
678 | # horizontal collections: entropy of n-gram per the same offset in all messages of one flow direction
679 | self._c2sConvsEntropyFiltered = type(self)._horizontalRandomNgrams(
680 | self._flows.c2sInConversations(), self._c2sEntropyFiltered)
681 | self._s2cConvsEntropyFiltered = type(self)._horizontalRandomNgrams(
682 | self._flows.s2cInConversations(), self._s2cEntropyFiltered)
683 | logger.info('c2sConvsEntropyFiltered: ' + repr(self._c2sConvsEntropyFiltered.values()))
684 | logger.info('s2cConvsEntropyFiltered: ' + repr(self._s2cConvsEntropyFiltered.values()))
685 |
686 | # intersection of all c2s and s2c filtered offset lists (per flow)
687 | c2sOffsetLists = [set(offsetlist) for offsetlist in self._c2sConvsEntropyFiltered.values()]
688 | self._c2sHorizontalOffsets = set.intersection(*c2sOffsetLists) if len(c2sOffsetLists) > 0 else set()
689 | s2cOffsetLists = [set(offsetlist) for offsetlist in self._s2cConvsEntropyFiltered.values()]
690 | self._s2cHorizontalOffsets = set.intersection(*s2cOffsetLists) if len(s2cOffsetLists) > 0 else set()
691 |
692 | # offsets in _c2sEntropyFiltered where the offset is also in all of the lists of _c2sConvsEntropyFiltered
693 | self._c2sCombinedOffsets = self._c2sHorizontalOffsets.intersection(self._c2sEntropyFiltered)
694 | # offsets in _s2cEntropyFiltered where the offset is also in all of the lists of _s2cConvsEntropyFiltered
695 | self._s2cCombinedOffsets = self._s2cHorizontalOffsets.intersection(self._s2cEntropyFiltered)
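# Illustrative sketch of the offset intersection above (hypothetical offsets and conversation keys):
#   >>> c2sEntropyFiltered = [0, 2, 5, 6]                                      # random across the whole c2s direction
#   >>> c2sConvsEntropyFiltered = {('A', 'B'): [2, 5], ('C', 'D'): [2, 5, 6]}  # random per conversation
#   >>> horizontal = set.intersection(*(set(offs) for offs in c2sConvsEntropyFiltered.values()))
#   >>> sorted(horizontal.intersection(c2sEntropyFiltered))
#   [2, 5]
# Only offsets that are random in every single conversation AND across the whole direction survive.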
696 |
697 | @classmethod
698 | def _horizontalRandomNgrams(cls, conversations: Dict[tuple, List[AbstractMessage]],
699 | verticalEntropyFiltered: List[int]) -> Dict[Union[Tuple, None], List[int]]:
700 | """
701 | Keep only offsets whose n-grams are random within each horizontal collection, i.e., per conversation.
702 |
703 | :param conversations: Messages grouped by conversation (one flow direction each).
704 | :param verticalEntropyFiltered: Offsets that already passed the vertical entropy filter (currently unused here).
705 | :return: For each conversation, the offsets whose n-grams exceed the entropy threshold.
706 | """
707 | filteredOutput = dict()
708 | # horizontal collections: entropy of n-gram per the same offset in all messages of one flow direction
709 | for key, conv in conversations.items():
710 | filteredOutput[key] = cls.entropyFilteredOffsets(conv, cls.absoluteEntropy)
711 | return filteredOutput
712 |
713 | def _constantQRvalues(self):
714 | """
715 | Request/Response pairs: search for n-grams whose values are equal in a query and its matching response (differing offsets allowed).
716 |
717 | Output is placed in self._valuematch.
718 | """
719 | # compute Q->R association
720 | mqr = self._flows.matchQueryResponse()
721 | # from the n-gram offsets that passed the entropy-filters determine those that have the same value in mqr pairs
722 | for query, resp in mqr.items():
723 | qrmatchlist = self._valuematch[(query, resp)] = list()
724 | # value in query at any of the offsets in _c2sCombinedOffsets
725 | for c2sOffset in self._c2sCombinedOffsets:
726 | if len(query.data) < c2sOffset + type(self).n:
727 | continue
728 | qvalue = query.data[c2sOffset:c2sOffset + type(self).n]
729 | # matches a value of resp at any of the offsets in _s2cCombinedOffsets
730 | for s2cOffset in self._s2cCombinedOffsets:
731 | if len(resp.data) < s2cOffset + type(self).n:
732 | continue
733 | rvalue = resp.data[s2cOffset:s2cOffset + type(self).n]
734 | if qvalue == rvalue:
735 | qrmatchlist.append((c2sOffset, s2cOffset))
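# Illustrative sketch of the query/response value matching above (hypothetical messages, n = 1):
# for a query with data b'\x10\xab\x00' and its response with data b'\x00\xab\x10\x01',
# a c2s candidate offset of 1 and s2c candidate offsets {1, 3} yield the match list [(1, 1)],
# since query.data[1:2] == b'\xab' == resp.data[1:2], while resp.data[3:4] == b'\x01' differs.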
736 |
737 | def _consistentCandidates(self):
738 | """
739 | measure consistency: offsets recognized in more than transSupportThresh of conversations
740 |
741 | Output is written to self._c2sConsistentCandidates and self._s2cConsistentCandidates
742 | """
743 | c2sCandidateCount = Counter()
744 | s2cCandidateCount = Counter()
745 | for offsetlist in self._valuematch.values(): # (query, resp), offsetlist
746 | if len(offsetlist) < 1:
747 | continue
748 | # transpose to offsets per direction
749 | c2sOffsets, s2cOffsets = zip(*offsetlist)
750 | c2sCandidateCount.update(set(c2sOffsets))
751 | s2cCandidateCount.update(set(s2cOffsets))
752 | self._c2sConsistentCandidates = [offset for offset, cc in c2sCandidateCount.items() if
753 | cc > type(self).transSupportThresh * len(self._c2s)]
754 | self._s2cConsistentCandidates = [offset for offset, cc in s2cCandidateCount.items() if
755 | cc > type(self).transSupportThresh * len(self._s2c)]
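# Illustrative arithmetic for the support threshold above (hypothetical counts):
# with transSupportThresh = 0.8 and 10 client-to-server messages, a c2s offset must have matched
# in more than 0.8 * 10 = 8 query/response pairs to be kept as a consistent Trans-ID candidate.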
756 |
757 | @classmethod
758 | def _mergeAndFilter(cls, consistentCandidates):
759 | """
760 | Merge consecutive candidate offsets into ranges and filter them by the minimum field length.
761 | """
762 | return [ol for ol in list2ranges(consistentCandidates) if ol[1] >= cls.minFieldLength]
763 |
764 |
765 | class Accumulator(FieldType):
766 | """
767 | Accumulator inference (FH, Section 3.2.6)
768 |
769 | "Accumulators are fields that have increasing values over consecutive message within the same conversation."
770 | (FH, Sec. 3.2.6)
771 | """
772 | typelabel = 'Accumulator'
773 |
774 | # TODO also support little endian (for our test traces, it was irrelevant)
775 | endianness = 'big'
776 | ns = (8, 4, 3, 2)
777 | deltaEntropyThresh = 0.8 # Not given in FH, own empirics: 0.2
778 |
779 | def __init__(self, flows: Flows):
780 | super(Accumulator, self).__init__()
781 |
782 | # c2s and s2c independently
783 | self._c2sConvs = {key: list(sorted(conv, key=lambda m: m.date))
784 | for key, conv in flows.c2sInConversations().items()}
785 | self._c2sDeltas = type(self).deltas(self._c2sConvs)
786 | self._c2sDeltaEntropies = type(self).entropies(self._c2sDeltas)
787 |
788 | self._s2cConvs = {key: list(sorted(conv, key=lambda m: m.date))
789 | for key, conv in flows.s2cInConversations().items()}
790 | self._s2cDeltas = type(self).deltas(self._s2cConvs)
791 | self._s2cDeltaEntropies = type(self).entropies(self._s2cDeltas)
792 |
793 | # print('c2sDeltaEntropies (n: offset: value)')
794 | # pprint(c2sDeltaEntropies)
795 | # print('s2cDeltaEntropies (n: offset: value)')
796 | # pprint(s2cDeltaEntropies)
797 |
798 | c2s, s2c = flows.splitDirections() # type: List[L4NetworkMessage], List[L4NetworkMessage]
799 | self._segments = self._posLen2segments(c2s, type(self).filter(self._c2sDeltaEntropies)) + \
800 | self._posLen2segments(s2c, type(self).filter(self._s2cDeltaEntropies))
801 |
802 | @classmethod
803 | def deltas(cls, conversations: Dict[tuple, List[AbstractMessage]]) -> Dict[int, Dict[int, List[int]]]:
804 | """
805 | Value deltas per offset and n over all message-pairs of all conversations.
806 |
807 | :param conversations: Conversations need to be sorted in chronological order for the message pairs to produce
808 | meaningful deltas.
809 | :return: Pairwise deltas of values per offset and n-gram size.
810 | """
811 | deltas = dict()
812 | for key, conv in conversations.items():
813 | if len(conv) < 2: # need at least two messages per conversation to form a pair
814 | continue
815 | # subsequent messages per direction per conversation
816 | for msgA, msgB in zip(conv[:-1], conv[1:]):
817 | # iterate n-grams' n = 8, 4, 3, 2
818 | # combined from Sec. 3.1.2: n=32, 24, 16 bits (4, 3, 2 bytes)
819 | # and see Sec. 3.2.6: n=64, 32, 16 bits (8, 4, 2 bytes)
820 | for n in cls.ns:
821 | if n not in deltas:
822 | deltas[n] = dict()
823 | for offset, (ngramA, ngramB) in enumerate(zip(NgramIterator(msgA, n), NgramIterator(msgB, n))):
824 | # calculate delta between the n-grams (identical n and offset) of two subsequent messages
825 | # TODO test support little endian (for our test traces, it was irrelevant)
826 | delta = int.from_bytes(ngramB, cls.endianness) - int.from_bytes(ngramA, cls.endianness)
827 | if offset not in deltas[n]:
828 | deltas[n][offset] = list()
829 | deltas[n][offset].append(delta)
830 | return deltas
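# Illustrative sketch of a single delta computation above (hypothetical 2-grams, big endian):
#   >>> ngramA, ngramB = b'\x00\x41', b'\x00\x43'
#   >>> int.from_bytes(ngramB, 'big') - int.from_bytes(ngramA, 'big')
#   2
# An offset whose deltas are consistently positive and of similar magnitude across a conversation
# is a candidate accumulator, e.g., a sequence number that increases by a small step.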
831 |
832 | @classmethod
833 | def entropies(cls, deltas: Dict[int, Dict[int, List[int]]]) -> Dict[int, Dict[int, float]]:
834 | """
835 | For positive delta values with enough samples to calculate a meaningful entropy (>= 2),
836 | calculate the normalized entropies of the "compressed" (ln()) deltas.
837 |
838 | :param deltas: Pairwise deltas between values of subsequent messages in conversations
839 | at the same offset and with the same length (n): Dict[n, Dict[offset, delta] ].
840 | :return: Entropies of deltas per n-gram length and offset: Dict[n, Dict[offset, entropy] ].
841 | """
842 | lndeltas = dict()
843 | for n, offdel in deltas.items():
844 | lndeltas[n] = dict()
845 | for offset, dlts in offdel.items():
846 | # require more than 1 value to calculate a meaningful entropy
847 | if len(dlts) < 2:
848 | continue
849 | npdlts = numpy.array(dlts)
850 | # require all deltas to be positive
851 | if any(npdlts <= 0):
852 | continue
853 | # compress deltas by ln
854 | lndeltas[n][offset] = numpy.log(npdlts)
855 | deltaEntropies = {n: {offset: drv.entropy(dlts)/numpy.log(n*8)
856 | for offset, dlts in offdel.items()} for n, offdel in lndeltas.items()}
857 | return deltaEntropies
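# Illustrative sketch of the compression and normalization above (hypothetical deltas, n = 2):
#   >>> import numpy
#   >>> dlts = numpy.array([2, 2, 4, 2])       # all-positive pairwise deltas at one offset
#   >>> lndlts = numpy.log(dlts)               # "compress" the deltas
# The entropy of lndlts (computed by pyitlib's drv.entropy, as used above) is then divided by
# numpy.log(n*8), here log(16); nearly identical deltas give a low normalized entropy, which is
# what the "fairly constant" filter below selects for.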
858 |
859 | @classmethod
860 | def filter(cls, deltaEntropies: Dict[int, Dict[int, float]]) -> List[Tuple[int, int]]:
861 | """
862 | Filter the entropies per n-gram size and offset to yield unambiguous candidates for accumulators.
863 | Filtering criteria are:
864 | * "fairly constant": relatively low entropy
865 | * the previous filtering left at least one offset for a given n
866 | * prefer larger ns and smaller offsets if candidates are overlapping
867 |
868 | :param deltaEntropies: Entropies of deltas per n-gram length and offset: Dict[n, Dict[offset, entropy] ].
869 | :return: List of offsets and lengths that are valid field candidates.
870 | """
871 | # "fairly constant": relatively low entropy -> threshold (value not given in FH)
872 | filteredDE = {n: {offs: entr for offs, entr in offsdelt.items() if entr < cls.deltaEntropyThresh}
873 | for n, offsdelt in deltaEntropies.items()}
874 | candidates = dict() # type: Dict[int, List[int]]
875 | for n in reversed(sorted(filteredDE.keys())):
876 | # no offsets for this n-gram size
877 | if len(filteredDE[n]) == 0:
878 | continue
879 | for offset in sorted(filteredDE[n].keys()):
880 | # precedence for larger ns and smaller offsets: those are the ones we already found and added to candidates
881 | overlaps = False
882 | for candN, candOffs in candidates.items():
883 | for candO in candOffs:
884 | if ngramIsOverlapping(offset, n, candO, candN):
885 | overlaps = True
886 | break
887 | if overlaps:
888 | break
889 | if overlaps:
890 | continue
891 | if n not in candidates:
892 | candidates[n] = list()
893 | candidates[n].append(offset)
894 | posLen = [(o, n) for n, offsets in candidates.items() for o in offsets]
895 | return posLen
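# Illustrative sketch of the precedence rule above (hypothetical candidates):
# if an 8-gram at offset 4 and a 4-gram at offset 6 both pass the entropy filter, the 8-gram wins
# (larger n, inserted into candidates first) and the overlapping 4-gram is dropped; a non-overlapping
# 2-gram at offset 20 is still kept, so posLen would be [(4, 8), (20, 2)].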
896 |
897 |
898 | # Host-ID will always return a subset of Session-ID fields, so Host-ID should get precedence
899 | # MSG-Len would be overwritten by MSG-Type (see SMB: nbss.length), so first use MSG-Len
900 | precedence = {MSGlen.typelabel: 0, MSGtype.typelabel: 1, HostID.typelabel: 2,
901 | SessionID.typelabel: 3, TransID.typelabel: 4, Accumulator.typelabel: 5}
902 | """
903 | The order in which to map field types to messages.
904 | Lower numbers take precedence over higher numbers, so the type with the higher number is ignored
905 | when both overlap at the same offset range in a message.
906 | """
--------------------------------------------------------------------------------