├── .gitignore ├── setup.py ├── LICENSE.md ├── README.md └── pcap_reassembler.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | 21 | # Installer logs 22 | pip-log.txt 23 | 24 | # Unit test / coverage reports 25 | .coverage 26 | .tox 27 | nosetests.xml 28 | 29 | # Translations 30 | *.mo 31 | 32 | # Mr Developer 33 | .mr.developer.cfg 34 | .project 35 | .pydevproject 36 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup(name="pcap-reassembler", 4 | version='0.1', 5 | description='Reassembles UDP/TCP packets into application layer messages', 6 | author='Fredrik Appelros, Carl Ekerot', 7 | author_email='fredrik.appelros@gmail.com, kalle@implode.se', 8 | url='https://github.com/FredrikAppelros/pcap-reassembler', 9 | py_modules=['pcap_reassembler'], 10 | install_requires=['pylibpcap'] 11 | ) 12 | 13 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013 Fredrik Appelros, Carl Ekerot 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | 9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | pcap-reassembler 2 | ================ 3 | 4 | Reassembles UDP/TCP packets into application layer messages. May also 5 | extract data from different OSI layers. 6 | 7 | Introduction 8 | ------------ 9 | 10 | pcap-reassembler is a tool which helps analyzing application layer protocol 11 | data without having to inspect segmented transport level payloads. 12 | For TCP, application layer messages are reassembled through analysis of 13 | acknowledgement numbers between segments. For UDP, each datagram payload is 14 | interpreted as an application layer message. 15 | 16 | As of now, pcap-reassembler is compatible with Ethernet as link layer protocol, 17 | IPv4 as network layer protocol, and TCP or UDP as transport layer protocols. The 18 | transport layer protocol is automatically detected from the IP header protocol 19 | flag. 20 | 21 | pcap-reassembler is implemented through [pylibpcap](http://pylibpcap.sourceforge.net/). 22 | 23 | Installation 24 | ------------ 25 | 26 | Install pcap-reassembler with ```python setup.py install``` 27 | 28 | Usage 29 | ----- 30 | 31 | ```python 32 | >>> from pcap_reassembler import PcapReassembler, address_to_string 33 | >>> reassembler = PcapReassembler() 34 | >>> messages = reassembler.load_pcap('http.cap') 35 | >>> msg = messages[0] 36 | >>> msg.payload 37 | 'GET /download.html HTTP/1.1\r\nHost: www.ethereal.com\r\nUser-Agent: Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.6) Gecko/20040113\r\nAccept: text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,image/jpeg,image/gif;q=0.2,*/*;q=0.1\r\nAccept-Language: en-us,en;q=0.5\r\nAccept-Encoding: gzip,deflate\r\nAccept-Charset: ISO-8859-1,utf-8;q=0.7,*;q=0.7\r\nKeep-Alive: 300\r\nConnection: keep-alive\r\nReferer: http://www.ethereal.com/development.html\r\n\r\n' 38 | >>> address_to_string(msg.src_addr) 39 | '145.254.160.237' 40 | ``` 41 | 42 | Limitations 43 | ----------- 44 | 45 | pcap-reassembler does not yet support IP fragmentation or IPv6. Also the only 46 | transport layer protocols that are supported are TCP and UDP. 47 | 48 | License 49 | ------- 50 | 51 | Distributed under the MIT license. See the ```LICENSE``` file. 52 | -------------------------------------------------------------------------------- /pcap_reassembler.py: -------------------------------------------------------------------------------- 1 | """pcap reassembler 2 | 3 | Provides a way to reassemble application layer messages from UDP or TCP 4 | packets found in a pcap file. 5 | 6 | """ 7 | 8 | import sys 9 | import pcap 10 | import struct 11 | import time 12 | 13 | # OSI layer constants 14 | PHYSICAL_LAYER = 1 15 | DATA_LINK_LAYER = 2 16 | NETWORK_LAYER = 3 17 | TRANSPORT_LAYER = 4 18 | SESSION_LAYER = 5 19 | PRESENTATION_LAYER = 6 20 | APPLICATION_LAYER = 7 21 | 22 | # TPID from IEEE 802.1Q 23 | _tpid = '\x81\x00' 24 | 25 | # EtherType constants 26 | _ether_type = { 27 | 'IPv4': '\x08\x00', 28 | 'IPv6': '\x86\xdd', 29 | } 30 | 31 | # IP protocol field constants 32 | _ip_protocol = { 33 | 'TCP': '\x06', 34 | 'UDP': '\x11', 35 | } 36 | 37 | FILL_BYTE = "\x00" 38 | 39 | class Message(dict): 40 | """Reassembled message class 41 | 42 | Message attributes are accessible as regular object attributes using 43 | dot-notation. The common available attributes are: 44 | 45 | * number - the message number based on the order of the 46 | first fragment 47 | * fragment_tss - the fragment timestamps 48 | * ts - the message timestamp based on the timestamp of 49 | the first fragment 50 | * data - the raw byte data of the entire message 51 | * payload - the raw byte data of the message payload 52 | 53 | """ 54 | @property 55 | def ts(self): 56 | return self.fragment_tss[0] 57 | 58 | __getattr__ = dict.__getitem__ 59 | __setattr__ = dict.__setitem__ 60 | 61 | class PcapReassembler: 62 | def __init__(self): 63 | # TCP stream buffer 64 | self._tcp_stream = None 65 | # message buffer 66 | self._msgs = None 67 | # packet count 68 | self._count = 1 69 | # OSI layer 70 | self._layer = 4 71 | # strict TCP reassembly policy 72 | self._strict_policy = False 73 | 74 | def load_pcap(self, filename, layer=TRANSPORT_LAYER, strict=False): 75 | """Loads a pcap file and returns a list of Message objects 76 | containing the reassembled messages for the specified OSI 77 | layer. 78 | 79 | Usage: 80 | >>> import pcap_reassembler 81 | >>> reassembler = pcap_reassembler.PcapReassembler() 82 | >>> msgs = reassembler.load_pcap('http.cap') 83 | >>> msgs[0].payload 84 | 'GET /download.html ...' 85 | 86 | """ 87 | if not DATA_LINK_LAYER <= layer <= TRANSPORT_LAYER: 88 | raise ValueError("Specified OSI layer is not supported.") 89 | self._tcp_stream = {} 90 | self._msgs = [] 91 | self._count = 1 92 | self._layer = layer 93 | self._strict_policy = strict 94 | p = pcap.pcapObject() 95 | p.open_offline(filename) 96 | # process all packets 97 | try: 98 | p.dispatch(-1, self._process_eth) 99 | except Exception as e: 100 | print e 101 | 102 | # flush all TCP connections for remaining messages 103 | for socks in self._tcp_stream: 104 | self._tcp_flush(socks) 105 | self._msgs.sort(key=lambda x: x.number) 106 | return self._msgs 107 | 108 | def _process_eth(self, length, data, ts): 109 | """Processes an Ethernet packet (header to checksum; not the full frame). 110 | 111 | May propagate processing to the correct IP version processing function. 112 | 113 | """ 114 | dst_addr = data[0:6] 115 | src_addr = data[6:12] 116 | ieee_8021q = data[12:14] == _tpid 117 | if ieee_8021q: 118 | tci = data[14:16] 119 | eth_type = data[16:18] 120 | pld = data[18:] 121 | else: 122 | eth_type = data[12:14] 123 | pld = data[14:] 124 | if self._layer > 2: 125 | if eth_type == _ether_type['IPv4']: 126 | self._process_ipv4(ts, pld) 127 | else: 128 | pass 129 | else: 130 | msg = Message({ 131 | 'number': self._count, 132 | 'fragment_tss': [ts], 133 | 'data': ''.join(data), 134 | 'src_addr': src_addr, 135 | 'dst_addr': dst_addr, 136 | 'eth_type': eth_type, 137 | 'payload': ''.join(pld), 138 | }) 139 | if ieee_8021q: 140 | msg['tci'] = tci 141 | self._msgs.append(msg) 142 | self._count += 1 143 | 144 | def _process_ipv4(self, ts, data): 145 | """Processes an IPv4 packet. 146 | 147 | Extracts source address, destination address and protocol fields 148 | and may propagate processing to the correct protocol processing 149 | function. 150 | 151 | """ 152 | header_len = 4 * (_decode_byte(data[0]) & 0x0f) 153 | tot_len = _decode_short(data[2:4]) 154 | ip_type = data[9] 155 | src = data[12:16] 156 | dst = data[16:20] 157 | pld = data[header_len:tot_len] 158 | if self._layer > 3: 159 | if ip_type == _ip_protocol['TCP']: 160 | self._process_tcp(ts, src, dst, pld) 161 | elif ip_type == _ip_protocol['UDP']: 162 | self._process_udp(ts, src, dst, pld) 163 | else: 164 | pass 165 | else: 166 | msg = Message({ 167 | 'number': self._count, 168 | 'fragment_tss': [ts], 169 | 'data': ''.join(data[:tot_len]), 170 | 'ip_type': ip_type, 171 | 'src_addr': src, 172 | 'dst_addr': dst, 173 | 'payload': ''.join(pld), 174 | }) 175 | self._msgs.append(msg) 176 | 177 | def _process_tcp(self, ts, src_addr, dst_addr, data): 178 | """Processes a TCP packet. 179 | 180 | Extracts source port, destination port, sequence number and 181 | acknowledgement number and adds the payload to the current message 182 | data. If there is no current message in the buffer one is created 183 | with the attributes of the current packet. When the acknowledgement 184 | number changes the TCP connection buffer associated with the 185 | current source address is flushed. 186 | 187 | """ 188 | # reassemble PDUs by buffering packets and flushing when ack changes 189 | src_port = _decode_short(data[0:2]) 190 | dst_port = _decode_short(data[2:4]) 191 | seq = _decode_word(data[4:8]) 192 | ack = _decode_word(data[8:12]) 193 | offset = (_decode_byte(data[12]) & 0xf0) >> 4 194 | pld = data[4*offset:] 195 | src_socket = (src_addr, src_port) 196 | dst_socket = (dst_addr, dst_port) 197 | sockets = (src_socket, dst_socket) 198 | if pld: 199 | if not sockets in self._tcp_stream: 200 | self._tcp_stream[sockets] = Message({ 201 | 'number': self._count, 202 | 'fragment_tss': [], 203 | 'data': [], 204 | 'ip_proto': 'TCP', 205 | 'src_addr': src_addr, 206 | 'dst_addr': dst_addr, 207 | 'src_port': src_port, 208 | 'dst_port': dst_port, 209 | 'seq': seq, 210 | 'ack': ack, 211 | 'payload': [], 212 | }) 213 | msg = self._tcp_stream[sockets] 214 | msg.fragment_tss.append(ts) 215 | msg.data.append(''.join(data)) 216 | offset = seq - msg.seq 217 | if offset > len(msg.payload): 218 | msg.payload.extend(FILL_BYTE * (offset - len(msg.payload))) 219 | msg.payload[offset:offset+len(pld)] = list(pld) 220 | if self._strict_policy: 221 | # Check the other stream in the connection 222 | sockets = sockets[::-1] 223 | if (sockets in self._tcp_stream and ack == self._tcp_stream[sockets].seq + 224 | len(self._tcp_stream[sockets].payload)): 225 | self._tcp_flush(sockets) 226 | del self._tcp_stream[sockets] 227 | else: 228 | if sockets in self._tcp_stream and ack != self._tcp_stream[sockets].ack: 229 | self._tcp_flush(sockets) 230 | del self._tcp_stream[sockets] 231 | 232 | def _tcp_flush(self, sockets): 233 | """Flushes the specified TCP connection buffer. 234 | 235 | Adds the flushed message to the message buffer. 236 | 237 | """ 238 | msg = self._tcp_stream[sockets] 239 | msg.payload = ''.join(msg.payload) 240 | self._msgs.append(msg) 241 | 242 | def _process_udp(self, ts, src_addr, dst_addr, data): 243 | """Processes an UDP packet. 244 | 245 | Extracts source and destination port and creates a message 246 | from the current packet which is added to the message buffer. 247 | 248 | """ 249 | src_port = _decode_short(data[0:2]) 250 | dst_port = _decode_short(data[2:4]) 251 | msg = Message({ 252 | 'number': self._count, 253 | 'fragment_tss': [ts], 254 | 'data': ''.join(data), 255 | 'ip_proto': 'UDP', 256 | 'src_addr': src_addr, 257 | 'dst_addr': dst_addr, 258 | 'src_port': src_port, 259 | 'dst_port': dst_port, 260 | 'payload': ''.join(data[8:]), 261 | }) 262 | self._msgs.append(msg) 263 | 264 | def _decode_byte(data): 265 | """Decodes one byte of network data into an unsigned char.""" 266 | return struct.unpack('!B', data)[0] 267 | 268 | def _decode_short(data): 269 | """Decodes two bytes of network data into an unsigned short.""" 270 | return struct.unpack('!H', data)[0] 271 | 272 | def _decode_word(data): 273 | """Decodes four bytes of network data into an unsigned int.""" 274 | return struct.unpack('!I', data)[0] 275 | 276 | def validate_tcp_checksum(length, src, dst, data): 277 | """Validates a TCP checksum according to RFC 1071. 278 | 279 | Takes length, source address and destination address for computing 280 | the IP pseudo-header. The data parameter contains the entire TCP 281 | packet. 282 | 283 | """ 284 | # this is currently unused as we simply insert newer data 285 | # over old data without checking the checksum 286 | csum = _decode_short(data[16:18]) 287 | data = list(data) 288 | data[16:18] = '\x00\x00' 289 | if len(data) % 2 != 0: 290 | data.append('\x00') 291 | sum = 0 292 | sum += _decode_short(src[0:2]) 293 | sum += _decode_short(src[2:4]) 294 | sum += _decode_short(dst[0:2]) 295 | sum += _decode_short(dst[2:4]) 296 | sum += 0x0006 297 | sum += length 298 | for i in range(0, len(data), 2): 299 | sum += _decode_short(data[i] + data[i+1]) 300 | while sum >> 16: 301 | sum = (sum & 0xffff) + (sum >> 16) 302 | return ~sum & 0xffff == csum 303 | 304 | def address_to_string(b): 305 | """Converts an IP address to its string representation. 306 | 307 | Takes a 4-byte string representing an IP address, and returns a 308 | dot-separated decimal representation on the form '123.123.123.123'. 309 | 310 | """ 311 | assert len(b) == 4 312 | b = map(lambda x: str(_decode_byte(x)), b) 313 | return '.'.join(b) 314 | --------------------------------------------------------------------------------