├── LICENSE
├── README.md
├── chophound.ps1
├── chophound.py
└── replace.py


/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2022, Arris Huijgen
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | 1. Redistributions of source code must retain the above copyright notice, this
10 |    list of conditions and the following disclaimer.
11 | 
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 |    this list of conditions and the following disclaimer in the documentation
14 |    and/or other materials provided with the distribution.
15 | 
16 | 3. Neither the name of the copyright holder nor the names of its
17 |    contributors may be used to endorse or promote products derived from
18 |    this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # ChopHound
 2 | Scripts for dealing with challenges that may arise when importing (large) JSON datasets into BloodHound. The blog post that discusses these scripts can be found at [https://blog.bitsadmin.com/blog/dealing-with-large-bloodhound-datasets](https://blog.bitsadmin.com/blog/dealing-with-large-bloodhound-datasets).
 3 | 
 4 | ## Scripts
 5 | | Name | Description |
 6 | | ---- | ----------- |
 7 | | chophound.ps1 | PowerShell implementation that reads a large BloodHound JSON file and splits it into smaller chunks. Note that if the file you are trying to split is too large (or your PC's memory is too small), this script will fail with an out-of-memory exception. |
 8 | | chophound.py | Python implementation of the .ps1 script which has support for splitting large JSON files into smaller chunks. |
 9 | | replace.py | Small script that replaces non-ASCII characters in the provided file with a question mark ('?') in order to avoid possible encoding errors. Note that when running this script against a file with a Byte-Order Mark at the beginning, those bytes will also be replaced by question marks, and you will need to remove them manually with a hex editor like [HxD](https://mh-nexus.de/en/hxd/). |
10 | 


--------------------------------------------------------------------------------
/chophound.ps1:
--------------------------------------------------------------------------------
 1 | param ([Parameter(Mandatory)]$File, [int]$ChunkSize=5000)
 2 | 
 3 | # Banner
 4 | "ChopHoundPS v1.0 ( https://github.com/bitsadmin/chophound/ )"
 5 | 
 6 | # Read file into memory
 7 | Write-Warning "Reading file $File"
 8 | $name_no_ext = [System.IO.Path]::GetFileNameWithoutExtension($File)
 9 | $js = Get-Content -Raw $File | ConvertFrom-Json
10 | 
11 | if(-not $?)
12 | {
13 | 	Write-Warning "Error while reading file. Quitting."
14 | 	return
15 | }
16 | 
17 | # Determine data tag name
18 | $tagname = $js.meta.type
19 | if($js.meta.version -gt 3)
20 |     { $tagname = 'data' }
21 | 
22 | # Calculate number of blocks
23 | $numblocks = [Math]::Ceiling($js.data.Count / $ChunkSize)
24 | Write-Warning "Splitting in $numblocks blocks of $ChunkSize elements"
25 | $i = 0
26 | 
27 | # Perform splitting
28 | while($i*$ChunkSize -lt $js.data.Count)
29 | {
30 |     $outname = "{0}_{1:0000}.json" -f $name_no_ext,$i
31 | 
32 |     # meta -> count
33 |     $meta = $js.meta
34 |     $meta.count = $ChunkSize
35 |     if(($i+1)*$ChunkSize -gt $js.data.Count)
36 |         { $meta.count = $js.data.Count - ($i*$ChunkSize)}
37 | 
38 |     Write-Warning "Writing file $outname"
39 |     
40 |     # Meta tag MUST be after the data, otherwise BloodHound won't find it
41 |     '{{"{0}":{1},"meta":{2}}}' -f `
42 |         $tagname, `
43 |         ($js.data[($i*$ChunkSize)..((($i+1)*$ChunkSize)-1)] | ConvertTo-Json -Depth 100 -Compress), `
44 |         ($meta | ConvertTo-Json -Compress) `
45 |         | Out-File $outname -NoNewline -Encoding UTF8BOM
46 |     
47 |     $i++
48 | }


--------------------------------------------------------------------------------
/chophound.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import re
 3 | import argparse
 4 | import json
 5 | try:
 6 |     import ijson.backends.python as ijson
 7 | except (ImportError, ModuleNotFoundError):
 8 |     print('Error: install the Python ijson module first')
 9 | 
10 | VERSION = 1.0
11 | 
def _json_default(obj):
    """Convert values that ``json.dumps`` cannot serialise on its own.

    ijson's pure-Python backend returns ``decimal.Decimal`` for non-integer
    numbers, which the ``json`` module rejects; those are emitted as floats.
    """
    import decimal  # local import: only needed on this fallback path

    if isinstance(obj, decimal.Decimal):
        return float(obj)
    raise TypeError('Object of type %s is not JSON serializable' % type(obj).__name__)


def _read_meta(file, verbose=False):
    """Return the ``{"meta": {...}}`` object parsed from the tail of *file*.

    Only the last 0x100 bytes are inspected, so the meta tag has to be the
    final element of the JSON document.

    Raises:
        ValueError: if no meta tag is found in the tail of the file.
    """
    with open(file, 'rb') as js:
        size = js.seek(0, os.SEEK_END)
        # Clamp to the start so files shorter than 0x100 bytes do not raise
        js.seek(max(size - 0x100, 0))
        # Decode instead of str(bytes): the repr form leaves escape sequences
        # such as \r and \xNN in the text, which json.loads cannot parse
        lastbytes = js.read(0x100).decode('utf-8', errors='replace')
    if verbose:
        print(f"lastbytes: {lastbytes}")

    # Greedy match up to the last '}' (the document's closing brace), so
    # prefixing '{' below yields a complete JSON object
    match = re.search(r'("meta"\s*:\s*\{.*\})', lastbytes, re.IGNORECASE | re.DOTALL)
    if match is None:
        raise ValueError('No "meta" tag found in the last 256 bytes of %s' % file)
    metatagstr = match.group(1)
    if verbose:
        print(metatagstr)
    return json.loads('{' + metatagstr)


def main(args):
    """Split a large BloodHound JSON file into chunk files.

    Items under the top-level ``data`` array are streamed with ijson and
    written to ``<basename>_NNNN.json`` files in the current directory, each
    holding at most ``args.chunksize`` items followed by a copy of the meta
    tag whose ``count`` is set to the number of items in that file.

    Args:
        args: namespace with ``file`` (input path), ``chunksize`` (items per
            output file) and ``verbose`` (print the raw meta tag details).

    Raises:
        ValueError: if the meta tag cannot be located at the end of the file.
    """
    # Collect args
    file = args.file
    chunksize = args.chunksize

    # Fixed variables
    basename = os.path.splitext(os.path.basename(file))[0]
    jsonformat = '{"data":[%s],"meta":%s}'

    # Obtain meta tag from the end of the file
    print('[+] Opening file %s' % file)
    metatag = _read_meta(file, args.verbose)

    # Open in text mode to parse
    with open(file, 'r', encoding='utf-8-sig', errors='replace') as js:
        items = ijson.items(js, 'data.item')

        endoflist = False
        i = 0
        while not endoflist:
            outfile = '%s_%.4d.json' % (basename, i)

            # Get chunk; the for/else marks the iterator as exhausted when
            # the loop ends without hitting the chunk size
            chunks = []
            for item in items:
                chunks.append(json.dumps(item, default=_json_default))
                if len(chunks) == chunksize:
                    break
            else:
                endoflist = True

            # When the item count is an exact multiple of the chunk size the
            # final pass finds nothing; do not emit an empty trailing file.
            # (An input without any items still produces one empty chunk.)
            if not chunks and i > 0:
                break

            # Update meta tag
            metatag['meta']['count'] = len(chunks)

            # Format and store
            print('[+] Writing %s' % outfile)
            with open(outfile, 'w', encoding='utf-8-sig', errors='replace') as jsout:
                jsout.write(jsonformat % (','.join(chunks), json.dumps(metatag['meta'])))

            i += 1
69 | 
def getargs():
    """Parse the command line and return the resulting namespace.

    The namespace carries ``file`` (positional), ``chunksize`` (int, default
    10000) and ``verbose`` (set by ``-v``/``--verbose`` or ``--no-verbose``).
    """
    # (flags, keyword options) for every argument, in registration order
    options = (
        (('file',), {'help': 'JSON file to split'}),
        (
            ('-c', '--chunksize'),
            {
                'default': 10000,
                'type': int,
                'dest': 'chunksize',
                'help': 'Number of items per outputted chunk',
            },
        ),
        (
            ('-v', '--verbose'),
            {
                'action': argparse.BooleanOptionalAction,
                'help': 'Show verbose output',
            },
        ),
    )

    cli = argparse.ArgumentParser(description='Convert large BloodHound json to smaller chunks')
    for flags, settings in options:
        cli.add_argument(*flags, **settings)

    return cli.parse_args()
79 | 
if __name__ == '__main__':
    # Show the banner first, then parse the command line and start splitting.
    banner = 'ChopHound v%.2f ( https://github.com/bitsadmin/chophound/ )' % VERSION
    print(banner)
    cli_args = getargs()
    main(cli_args)
83 | 


--------------------------------------------------------------------------------
/replace.py:
--------------------------------------------------------------------------------
 1 | import mmap
 2 | import argparse
 3 | 
# Version number shown in the start-up banner.
VERSION = 1.0
 5 | 
 6 | def main(args):
 7 |     file = args.file
 8 |     verbose = args.verbose
 9 |     crapbytes = []
10 | 
11 |     print('Locating non-ASCII characters in %s' % file)
12 |     with open(file, 'r+b') as f:
13 |         mem = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
14 |         offset = 0
15 |         for byte in mem:
16 |             obyte = ord(byte)
17 |             if (obyte < 0x20 or obyte > 0x7e) and obyte not in (0x0a, 0x0d):
18 |                 crapbytes.append(offset)
19 |                 if verbose:
20 |                     print("Found non-ASCII character at offset 0x%.8x" % offset)
21 | 
22 |             offset += 1
23 | 
24 |         mem.close()
25 |     print('Found a total of %d non-ASCII characters' % len(crapbytes))
26 | 
27 |     print('Fixing non-ASCII characters in %s' % file)
28 |     with open(file, 'r+b') as f:
29 |         mem = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_WRITE)
30 | 
31 |         # Navigate to offset and write question mark
32 |         for offset in crapbytes:
33 |             if verbose:
34 |                 print("Writing '?' to offset 0x%.8x" % offset)
35 |             mem.seek(offset)
36 |             mem.write_byte(0x3f)
37 | 
38 |         mem.close()
39 |     print('Fixed a total of %d non-ASCII characters' % len(crapbytes))
40 | 
def getargs():
    """Parse the command line and return the resulting namespace.

    The namespace carries ``file`` (positional) and ``verbose`` (False unless
    ``-v``/``--verbose`` is given).
    """
    cli = argparse.ArgumentParser(description='In-place replacement of non-ASCII characters to question marks')

    # Positional: the file that is modified in place
    cli.add_argument('file', help='File to replace non-ASCII characters in')

    # Optional switch for per-offset reporting
    verbose_options = {
        'dest': 'verbose',
        'action': 'store_true',
        'default': False,
        'help': 'Show verbose output',
    }
    cli.add_argument('-v', '--verbose', **verbose_options)

    parsed = cli.parse_args()
    return parsed
49 | 
if __name__ == '__main__':
    # Show the banner first, then parse the command line and run the fix-up.
    banner = 'ReplaceNonASCII v%.2f ( https://github.com/bitsadmin/chophound/ )' % VERSION
    print(banner)
    cli_args = getargs()
    main(cli_args)
53 | 


--------------------------------------------------------------------------------