├── .DS_Store ├── images ├── halogen.png └── halo_diagram.png ├── tests ├── two_jpg_png.doc ├── test_three_png.doc ├── test_single_jpg.doc └── test_single_jpg.docx ├── TODO.md ├── CHANGELOG.md ├── CONTRIBUTORS.md ├── halogen ├── halogen.py ├── lib │ ├── generator.py │ ├── render.py │ └── parser.py └── mfbot.py ├── LICENSE ├── CODE_OF_CONDUCT.md └── README.md /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/target/halogen/HEAD/.DS_Store -------------------------------------------------------------------------------- /images/halogen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/target/halogen/HEAD/images/halogen.png -------------------------------------------------------------------------------- /tests/two_jpg_png.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/target/halogen/HEAD/tests/two_jpg_png.doc -------------------------------------------------------------------------------- /images/halo_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/target/halogen/HEAD/images/halo_diagram.png -------------------------------------------------------------------------------- /tests/test_three_png.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/target/halogen/HEAD/tests/test_three_png.doc -------------------------------------------------------------------------------- /tests/test_single_jpg.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/target/halogen/HEAD/tests/test_single_jpg.doc -------------------------------------------------------------------------------- /tests/test_single_jpg.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/target/halogen/HEAD/tests/test_single_jpg.docx -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | # To Do: 2 | 3 | 1. look at adding a "save" mode that'll save off the images (probably only PNGs) to a specific directory. For some reason jpgs didn't look to work last time I checked. This will make it easier to keep track of what we've found and what we haven't. 4 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## 8/20/2021 2 | * fixing typos in documentation. 3 | ## 4/22/2021 4 | * Adding in support for jumping over repeated bytes in jpeg files, this should reduce false positives `--jpg-jump` 5 | ## 4/14/2021 6 | * Expanding documentation. 7 | * Updating images. 8 | * Pulling in `--jpg-sof2sos` version. 9 | ## 7/2/2020 10 | * New version. Removed some features we probably don't need and can now cycle through more than one image. 11 | ## 4/23/2019 12 | * Reviewed and created issue to submit for approval -------------------------------------------------------------------------------- /CONTRIBUTORS.md: -------------------------------------------------------------------------------- 1 | # Contributors 2 | 3 | Thank you to [all of our contributors](https://github.com/target/halogen/graphs/contributors). For reviewing per-file contributions, run the following commands: 4 | ``` 5 | git blame 6 | git log -p 7 | ``` 8 | 9 | # Target Team 10 | * Kyle Eaton ([@0xkyle](https://twitter.com/0xkyle)) 11 | * Devin Smith 12 | 13 | 14 | # External Contributors 15 | * Wyatt Roersma ([wyattroersma](https://twitter.com/wyattroersma)) 16 | * Thao Vo ([@ttttv0](https://twitter.com/ttttv0)) -------------------------------------------------------------------------------- /halogen/halogen.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ The mfbot Python3 CLI script """ 3 | from mfbot import MFBot 4 | 5 | 6 | def main() -> None: 7 | """ Main function to start things up for the command line use of mfbot """ 8 | mfbot = MFBot() 9 | mfbot.parse_args() 10 | if mfbot.dir: 11 | yara_rule_output = mfbot.dir_run() 12 | if len(yara_rule_output) > 0: 13 | if mfbot.clam: 14 | mfbot.print_clam_rule(yara_rule_output) 15 | else: 16 | mfbot.print_yara_rule(yara_rule_output) 17 | else: 18 | print("No images found within that directory") 19 | else: 20 | yara_rule_output = mfbot.run() 21 | if len(yara_rule_output) > 0: 22 | if mfbot.clam: 23 | mfbot.print_clam_rule(yara_rule_output) 24 | else: 25 | mfbot.print_yara_rule(yara_rule_output) 26 | else: 27 | print('No image found.') 28 | 29 | 30 | if __name__ == "__main__": 31 | main() 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2020 Target Brands, Inc. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. -------------------------------------------------------------------------------- /halogen/lib/generator.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ The generator parser library to support all generation processes """ 3 | import re 4 | import binascii 5 | import sys 6 | 7 | 8 | def yara_image_rule_maker(self) -> dict: 9 | """ Yara image rule generator 10 | returns: rule_match_dict """ 11 | if self.image_name is None: 12 | find_matches_dict = self.get_file[1] 13 | self.image_name = [] 14 | for values in find_matches_dict: 15 | if len(values[0]) > 0: 16 | if values == "JPG2" and ("JPG" in self.image_name): 17 | pass #skip jpg2 because we matched on a more narrow jpg header value. 18 | else: 19 | self.image_name.append(values) 20 | 21 | rule_match_dict = yara_image_generator(self) 22 | return rule_match_dict 23 | 24 | 25 | def yara_image_generator(self) -> dict: 26 | """ puts the data in a format that we need for later in the process 27 | returns dict_list """ 28 | dict_list = [] 29 | if self.image_name is not None: 30 | for ftype in self.image_name: 31 | for match in self.get_file[1][ftype]: 32 | rule_data = {} 33 | rule_data['format'] = ftype 34 | if type(match) is bytes: 35 | img_hex_value = binascii.hexlify(match) 36 | value = (str(img_hex_value)) 37 | rule_data['hex'] = value[2:-1] 38 | elif type(match) is str: 39 | rule_data['hex'] = match 40 | if rule_data not in dict_list: 41 | dict_list.append(rule_data) 42 | return dict_list 43 | 44 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to make participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, gender identity and expression, level of experience, 9 | nationality, personal appearance, race, religion, or sexual identity and 10 | orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at 59 | [TTS-OpenSource-Office@target.com](mailto:TTS-OpenSource-Office@target.com). All 60 | complaints will be reviewed and investigated and will result in a response that 61 | is deemed necessary and appropriate to the circumstances. The project team is 62 | obligated to maintain confidentiality with regard to the reporter of an incident. 63 | Further details of specific enforcement policies may be posted separately. 64 | 65 | Project maintainers who do not follow or enforce the Code of Conduct in good 66 | faith may face temporary or permanent repercussions as determined by other 67 | members of the project's leadership. 68 | 69 | ## Attribution 70 | 71 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 72 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 73 | 74 | [homepage]: https://www.contributor-covenant.org 75 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Halogen](/images/halogen.png) 2 | **** 3 | Halogen is a tool to automate the creation of yara rules based on the image files embedded within a malicious document. This can assist cyber security professionals in writing detection rules for malicious threats as well as help responders in identifying with particular threat they are dealing with. Currently, Halogen is able to create rules based on JPG and PNG files. 4 | **** 5 | ![Halogen Walkthrough](/images/halo_diagram.png) 6 | 7 | ## Halogen help 8 | ``` 9 | python3 halogen.py -h 10 | usage: halogen.py [-h] [-f FILE] [-d DIR] [-n NAME] [--png-idat] [--jpg-sos] [--jpg-sof2sos] [--jpg-jump] [-c CONTAINER] [--clam] [--rprefix RPREFIX] 11 | 12 | Halogen: Automatically create yara rules based on images embedded in office documents. 13 | 14 | optional arguments: 15 | -h, --help show this help message and exit 16 | -f FILE, --file FILE File to parse 17 | -d DIR, --directory DIR 18 | directory to scan for image files. 19 | -n NAME, --rule-name NAME 20 | specify a custom name for the rule file 21 | --png-idat For PNG matches, instead of starting with the PNG file header, start with the IDAT chunk. 22 | --jpg-sos For JPG matches, skip over the header and look for the Start of Scan marker, and begin the match there. 23 | --jpg-sof2sos for JPG matches, skip over the header and match the SOF all the way to the SOS + 45 bytes of the data within the SOS. 24 | --jpg-jump for JPG matches, skip over the header and identify the sof, the sos and then read the actual image data take that data and look for repeated bytes. Skip those bytes and then 25 | create 45 bytes of raw image data. 26 | -c CONTAINER, --container CONTAINER 27 | specify a clamav container type defaults to CL_TYPE_MSOLE2, CL_TYPE_OOXML_WORD, CL_TYPE_OOXML_XL, CL_TYPE_OOXML_PPT 28 | --clam generate a clam rule instead of a yara rule 29 | --rprefix RPREFIX specify a clamav ruleset prefix 30 | 31 | 32 | ``` 33 | ## Testing it out 34 | We've included some test document files with embedded images for you to test this out with. Running `python3 halogen/halogen.py -d tests/ > /tmp/halogen_test.yara` will produce the test yara file containing all images found within the files inside the `tests/` directory. 35 | From here you can run `yara -d /tmp/halogen_test.yara tests/` and observe which images match which files. 36 | 37 | ### Notes 38 | 1. We use two patterns for JPG matching. One is less strict than the typical JPG file header, and we use this because we've seen some malicious files match this pattern. If Halogen finds both, it'll default to writing out the more strict match. Typically, these have the same matching content, so no detection really gets missed. 39 | 2. For PNG files you can choose to start by default at the file header, or with `--png-idat` you can start at the IDAT chunk found within a PNG file. We also reduced the bytes returned when matching on the IDAT chunk. 40 | 3. Similar to the above, you can start JPG matches at the Start of Scan marker by using the `--jpg-sos` flag. 41 | 4. Because of how the SOS section of the JPG file works, we've also included an optional `--jpg-sof2sos` flag, which reads the Start of Frame (SOF) marker until the SOS is found, and then reads an additional 45 bytes. This is useful if the the stardard `--jpg-sos` is giving you false positives. 42 | 5. In an effort to reduce false positives, we've added in the `--jpg-jump` flag which reads the compressed image data and creates a hex jump in the yara output if it finds repeated image bytes. This allows us to match on the SOF and SOS of the file, as well as some of the more unique data in the image. 43 | 44 | 45 | ### Contributing 46 | Please contribute pull requests in python3, and submit any bugs you find as issues. 47 | -------------------------------------------------------------------------------- /halogen/mfbot.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import os 4 | from lib.parser import get_file 5 | from lib.generator import yara_image_rule_maker 6 | from lib.render import yara_print_rule, clam_print_rule 7 | 8 | 9 | 10 | class MFBot: 11 | """ Malicious File Bot Class """ 12 | def __init__(self) -> None: 13 | args = MFBot.parse_args() 14 | self.yara_base_file = args.file 15 | self.image_name = None 16 | self.idat = args.idat 17 | self.jpgsos = args.jpgsos 18 | self.sof2sos = args.sof2sos 19 | self.dir = args.dir 20 | self.jump = args.jump 21 | self.dirhash = [] 22 | self.name = args.name 23 | self.container = args.container 24 | self.clam = args.clam 25 | self.rprefix = args.rprefix 26 | @staticmethod 27 | def parse_args()-> iter: 28 | """ Parse any options passed to the the script """ 29 | parser_args = argparse.ArgumentParser(description="Halogen: Automatically create yara \ 30 | rules based on images embedded in office documents.") 31 | parser_args.add_argument("-f", "--file", help="File to parse") 32 | parser_args.add_argument("-d", "--directory", dest="dir", help="directory to scan \ 33 | for image files.") 34 | parser_args.add_argument("-n", "--rule-name", dest="name", help="specify a custom \ 35 | name for the rule file") 36 | parser_args.add_argument("--png-idat", dest="idat", help="For PNG matches, instead \ 37 | of starting with the PNG file header, start with the IDAT chunk.", action='store_true') 38 | parser_args.add_argument("--jpg-sos", dest="jpgsos", help="For JPG matches, skip \ 39 | over the header and look for the Start of Scan marker, \ 40 | and begin the match there.", action='store_true') 41 | parser_args.add_argument("--jpg-sof2sos", dest="sof2sos", help="for JPG matches, \ 42 | skip over the header and match the SOF all the way to the SOS + 45 bytes of the \ 43 | data within the SOS.", action='store_true') 44 | parser_args.add_argument("--jpg-jump", dest="jump", help="for JPG matches, \ 45 | skip over the header and identify the sof, the sos and then read the actual image data \ 46 | take that data and look for repeated bytes. Skip those bytes and then create 45 bytes of\ 47 | raw image data.", action='store_true') 48 | parser_args.add_argument("-c", "--container", dest="container", help="specify a clamav container type \ 49 | defaults to CL_TYPE_MSOLE2, CL_TYPE_OOXML_WORD, CL_TYPE_OOXML_XL, CL_TYPE_OOXML_PPT") 50 | parser_args.add_argument("--clam", dest="clam", help="generate a clam rule instead of a yara rule", action="store_true") 51 | parser_args.add_argument("--rprefix", dest="rprefix", help="specify a clamav ruleset prefix") 52 | args = parser_args.parse_args() 53 | if (args.file is None) and (args.dir is None): 54 | parser_args.print_help() 55 | exit(1) 56 | return args 57 | 58 | def run(self): 59 | """mfbot.run() is the core function to call that will return all information 60 | generated by mfbot. 61 | returns: rule_dict - dictionary of rules. """ 62 | self.get_file = get_file(self) 63 | rule_dict = yara_image_rule_maker(self) 64 | if rule_dict is not None: 65 | return rule_dict 66 | 67 | def print_yara_rule(self, rule_list): 68 | """ prints the yara rule by reading in a list of dicts, and iterating over that. 69 | parameter: rule_list - list of rules to print. """ 70 | yara_print_rule(self, rule_list) 71 | def print_clam_rule(self, rule_list): 72 | """ prints the yara rule by reading in a list of dicts, and iterating over that. 73 | parameter: rule_list - list of rules to print. """ 74 | clam_print_rule(self, rule_list) 75 | def dir_run(self): 76 | """ runs through the process with a directory instead of a single file. 77 | returns: combo list. """ 78 | filelist = glob.glob(self.dir + "/*") 79 | combo = [] 80 | for f in filelist: 81 | if os.path.isfile(f): 82 | self.image_name = None 83 | self.yara_base_file = f 84 | self.get_file = get_file(self) 85 | self.dirhash.append(self.get_file[0]) 86 | rule_dict = yara_image_rule_maker(self) 87 | if rule_dict is not None: 88 | for i in rule_dict: 89 | if i not in combo: 90 | combo.append(i) 91 | else: 92 | pass 93 | return combo 94 | -------------------------------------------------------------------------------- /halogen/lib/render.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ The render library to support all output processes """ 3 | import datetime 4 | import re 5 | 6 | def yara_print_rule(self, l): 7 | """ iterate over the list, and print a string for each rule 8 | parameter: l - list of rules""" 9 | if self.name: 10 | rname = str(self.name) 11 | else: 12 | rname = "halogen_generated_{md5_hash}".format(md5_hash=self.get_file[0]) 13 | if self.dirhash and len(self.dirhash) < 20: 14 | md5val = self.dirhash 15 | else: 16 | md5val = self.get_file[0] 17 | if self.dir: 18 | dir_path = self.dir 19 | if "\\" in dir_path: 20 | win_path = dir_path.replace("\\", "\\\\") 21 | fname = "Directory: {0} ".format(win_path) 22 | else: 23 | fname = "Directory: {0} ".format(dir_path) 24 | else: 25 | fname = self.yara_base_file 26 | 27 | rule_string = """\ 28 | rule {rname} : maldoc image 29 | {{ 30 | meta: 31 | tlp = "amber" 32 | author = "Halogen Generated Rule" 33 | date = "{date}" 34 | md5 = "{md5_hash}" 35 | family = "malware family" 36 | filename = "{input_file}" 37 | scope = "['detection', 'collection']" 38 | intel = "['']" 39 | strings: 40 | """.format(rname=rname, md5_hash=md5val, date=str(datetime.date.today()), 41 | input_file=fname) 42 | for i in range(0, len(l)): 43 | rule_dict = l[i] 44 | ftype = rule_dict['format'].lower() 45 | image_hex = rule_dict['hex'] 46 | s = " ${ftype}_img_value_{image_name_string} = {{{image_value_str}}}\n".format( 47 | ftype=ftype, image_name_string=i, image_value_str=image_hex 48 | ) 49 | rule_string += s 50 | 51 | rule_string += """ 52 | condition: 53 | any of them 54 | }""" 55 | print(rule_string) 56 | 57 | def clam_print_rule(self, l): 58 | """ iterate over the list, and print a string for each rule 59 | parameter: l - list of rules""" 60 | rule_arr1 = [] 61 | rule_arr2 = [] 62 | rname = "" 63 | if self.name: 64 | if self.rprefix: 65 | rname = str(self.name) 66 | else: 67 | rname = "HalogenGenerated.{md5_hash}".format(md5_hash=self.get_file[0]) 68 | if self.rprefix: 69 | if self.rprefix.endswith("."): 70 | rname = self.rprefix + rname 71 | else: 72 | rname = self.rprefix + "." + rname 73 | if self.dirhash and len(self.dirhash) < 20: 74 | md5val = self.dirhash 75 | else: 76 | md5val = self.get_file[0] 77 | if self.dir: 78 | dir_path = self.dir 79 | if "\\" in dir_path: 80 | win_path = dir_path.replace("\\", "\\\\") 81 | fname = "Directory: {0} ".format(win_path) 82 | else: 83 | fname = "Directory: {0} ".format(dir_path) 84 | else: 85 | fname = self.yara_base_file 86 | if self.container: 87 | container_list = [self.container] 88 | else: 89 | container_list = [ 90 | "CL_TYPE_MSOLE2", 91 | "CL_TYPE_OOXML_WORD", 92 | "CL_TYPE_OOXML_XL", 93 | "CL_TYPE_OOXML_PPT", 94 | ] 95 | for container in container_list: 96 | ctype = container.split("_")[-1] 97 | if ctype=="MSOLE2": 98 | #Special Handling because apparently clam has issues pulling these out of OLE sometimes 99 | rule_string = """{rname}.{ctype}.{date};Engine:81-255,Target:2;(""".format(rname=rname,ctype=ctype,date=datetime.datetime.now().strftime("%y%m%d"),container=container) 100 | else: 101 | rule_string = """{rname}.{ctype}.{date};Engine:81-255,Container:{container},Target:5;(""".format(rname=rname,ctype=ctype,date=datetime.datetime.now().strftime("%y%m%d"),container=container) 102 | j = len(l) - 1 103 | for i in range(0, len(l)): 104 | rule_string += str(i) 105 | if i < j: 106 | rule_string += "|" 107 | else: 108 | rule_string += ");" 109 | for i in range(0, len(l)): 110 | rule_dict = l[i] 111 | ftype = rule_dict['format'].lower() 112 | image_hex = rule_dict['hex'] 113 | for m in re.finditer(r"(?P \[ (?P\d+) \] )",image_hex): 114 | image_hex = re.sub(re.escape(m.group('jmp')),f"\x7b{m.group('val')}\x7d",image_hex) 115 | s = "{image_value_str}".format(image_value_str=image_hex) 116 | rule_string += s 117 | if i < j: 118 | rule_string += ";" 119 | print(rule_string) 120 | 121 | -------------------------------------------------------------------------------- /halogen/lib/parser.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | """ The parser library to support all input file processing and parsing """ 3 | import hashlib 4 | import re 5 | 6 | def idat(file_map): 7 | """if the idat option has been set (PNG_IDAT), we find the png header, and 8 | then find the IDAT chunk. Grab bytes from the idat chunk onwards. 9 | parameter: file_map - bytes of file 10 | returns: matching bytes. """ 11 | match_list = [] 12 | png_header = re.compile(b'(?s)\x89\x50\x4e\x47') 13 | png_idat = re.compile(b'(?s)(\x49\x44\x41\x54.{80})') 14 | for match in png_header.finditer(file_map): 15 | end = match.end() 16 | match_list.append(png_idat.search(file_map, end).group()) 17 | return match_list 18 | 19 | 20 | def jpg_sos(file_map): 21 | """ if the jpg_sos option has been set (JPG_SOS), we find the jpg header, 22 | and then find the SOS section. Grab bytes from the sos section onwards. 23 | parameter: file_map - bytes of file 24 | returns: matching bytes. """ 25 | match_list = [] 26 | jpg_header = re.compile(b'(?s)\xff\xd8\xff\xe0\x00\x10') 27 | sos = re.compile(b'(?s)(\xff\xda.{100})') 28 | for match in jpg_header.finditer(file_map): 29 | end = match.end() 30 | match_list.append(sos.search(file_map, end).group()) 31 | return match_list 32 | 33 | 34 | def jpg_sof2sos(file_map): 35 | """ if the jpg_sof2sos option has been set, we find the jpg header, 36 | then find the SOF section. Keeping track of that start point, we look for 37 | the SOS header. We match all the bytes between those and then a few additional 38 | bytes of the SOS compressed data. 39 | parameter - file_map - bytes of file 40 | returns matching bytes.""" 41 | match_list = [] 42 | jpg_header = re.compile(b'(?s)\xff\xd8\xff\xe0\x00\x10') 43 | sof = re.compile(b'(?s)(\xff\xc0|\xff\xc2)') 44 | sos = re.compile(b'(?s)(\xff\xda.{45})') 45 | for match in jpg_header.finditer(file_map): 46 | end_header = match.end() 47 | start_sof = sof.search(file_map, end_header).start() 48 | end_sos = sos.search(file_map, start_sof).end() 49 | match_list.append(file_map[start_sof:end_sos]) 50 | return match_list 51 | 52 | 53 | def pattern_id(file_map): 54 | count = 0 55 | pattern = file_map[0:8] 56 | pattern2 = file_map[8:16] 57 | pattern3 = file_map[16:24] 58 | if pattern == pattern2: 59 | l = len(pattern) 60 | for i in range(0, len(file_map), l): 61 | if pattern == file_map[i:i+8]: 62 | count += 8 63 | return count 64 | elif pattern2 == pattern3: 65 | l = len(pattern2) 66 | for i in range(8, len(file_map)-8, l): 67 | if pattern2 == file_map[i:i+8]: 68 | count += 8 69 | return count 70 | else: 71 | return count 72 | 73 | def jpg_jump(file_map): 74 | """ if the jpg_sof2sos_jump is set, we're going to essentially run the jpg_sof2sos function, 75 | but we're trying to identify repeated patterns in the post SOS section... this way we can 76 | jump over them and create the match from there. 77 | parameter - file_map - bytes of the file 78 | returns matching bytes/pattern. 79 | """ 80 | match_list = [] 81 | jpg_header = re.compile(b'(?s)(\xff\xd8\xff\xe0|\xff\xd8\xff\xe1)') 82 | sof = re.compile(b'(?s)(\xff\xc0|\xff\xc2)') 83 | sos = re.compile(b'(?s)\xff\xda') 84 | jpg_footer = re.compile(b'(?s)\xff\xd9') 85 | for match in jpg_header.finditer(file_map): 86 | end_header = match.end() 87 | end_footer = jpg_footer.search(file_map, end_header).end() 88 | start_sof = sof.search(file_map, end_header, end_footer).start() 89 | end_sos_pointer = sos.search(file_map, start_sof, end_footer).end() 90 | number_colors_components = int.from_bytes((file_map[end_sos_pointer+2:end_sos_pointer+3]), byteorder='little') 91 | start_sos_data = end_sos_pointer + 3 + (number_colors_components * 2) 92 | pattern_start_spot = start_sos_data + 5 93 | data = file_map[pattern_start_spot:end_footer] 94 | jump_size = pattern_id(data) 95 | prefix = file_map[start_sof:pattern_start_spot].hex() 96 | unique_bytes = file_map[pattern_start_spot + jump_size: pattern_start_spot + jump_size + 84].hex() 97 | if jump_size == 0: 98 | match_list.append(prefix + unique_bytes) 99 | else: 100 | jump = " [ {} ] ".format(jump_size) 101 | match_list.append(prefix + jump + unique_bytes) 102 | return match_list 103 | 104 | 105 | def get_matches(self, file_map) -> dict: 106 | """get_matches returns all regex matches on a provided file. 107 | Because of how the image is store, RTF is the bytes of the ascii 108 | representation of bytes for the image file. 109 | 110 | parameter: file_map - bytes of file 111 | returns: dictionary of matching bytes per regex pattern. 112 | """ 113 | get_file_dict = {} 114 | match_dict = { 115 | 'GIF': re.findall(b'(?s)(\x47\x49\x46\x38\x39\x61.{80})', file_map), 116 | 'RTF': re.findall(b'(?s)(.{20}\x35\x30\x34\x65\x34\x37\x30.{80}|.{20}\x66\x66\x64\x38\x66\x66.{80}|' 117 | b'.{20}\x66\x66\x64\x38\x66\x66\x65\x30\x30\x30\x31\x30.{80})', file_map), 118 | } 119 | if self.jpgsos: 120 | match_dict['JPG_SOS'] = jpg_sos(file_map) 121 | elif self.sof2sos: 122 | match_dict['JPG_SOF2SOS'] = jpg_sof2sos(file_map) 123 | elif self.jump: 124 | match_dict['JPG_JUMP'] = jpg_jump(file_map) 125 | else: 126 | match_dict['JPG'] = re.findall(b'(?s)(\xff\xd8\xff\xe0\x00\x10.{80})', file_map) 127 | match_dict['JPG2'] = re.findall(b'(?s)(\xff\xd8\xff.{80})', file_map) 128 | if self.idat: 129 | match_dict['PNG_IDAT'] = idat(file_map) 130 | else: 131 | match_dict['PNG'] = re.findall(b'(?s)(\x89\x50\x4e\x47.{82})', file_map) 132 | m = re.match(br'^(?P\x49\x49\x2a\x00[^\x00\x00]{2}.{80})',file_map,re.S) 133 | if m: 134 | match_dict['TIF'] = [m.group('magic_beans')] 135 | for file_type, regex_match in match_dict.items(): 136 | if len(regex_match) > 0: 137 | get_file_dict[file_type] = regex_match 138 | return get_file_dict 139 | 140 | 141 | def get_file(self) -> tuple: 142 | """ Generate md5 for input file to include in the yara meta data and run regex matches 143 | returns: md5hash of file and the file dictionary. """ 144 | hash_md5 = hashlib.md5() 145 | with open(self.yara_base_file, "rb") as f: 146 | file_map = f.read() 147 | get_file_dict = get_matches(self, file_map) 148 | hash_md5.update(file_map) 149 | return hash_md5.hexdigest(), get_file_dict 150 | 151 | --------------------------------------------------------------------------------