├── tests ├── __init__.py ├── data.py ├── test_cli.py ├── test_extract.py └── test_utils.py ├── MANIFEST.in ├── .gitignore ├── .editorconfig ├── test ├── LICENSE ├── setup.py ├── README.rst └── har_extractor.py /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include tests/*.py 2 | include build.sh 3 | include test 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /env 2 | /build 3 | /dist 4 | /*.egg-info 5 | *.pyc 6 | __pycache__ 7 | .coverage 8 | *.har 9 | -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | end_of_line = lf 5 | insert_final_newline = true 6 | charset = utf-8 7 | indent_style = space 8 | indent_size = 4 9 | -------------------------------------------------------------------------------- /test: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | if [ -d env ]; then 4 | . env/bin/activate 5 | fi 6 | 7 | if which coverage >/dev/null 2>&1; then 8 | coverage run --include har_extractor.py setup.py test && coverage report -m 9 | else 10 | ./setup.py test 11 | fi 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2017 dead-beef. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | from unittest import TestLoader 5 | from setuptools import setup 6 | 7 | def tests(): 8 | return TestLoader().discover('tests') 9 | 10 | BASE_DIR = os.path.abspath(os.path.dirname(__file__)) 11 | try: 12 | with open(os.path.join(BASE_DIR, 'README.rst')) as fp: 13 | README = fp.read() 14 | except IOError: 15 | README = '' 16 | 17 | setup( 18 | name='har-extractor', 19 | version='1.0.1', 20 | description='HTTP Archive extractor', 21 | long_description=README, 22 | classifiers=[ 23 | 'Development Status :: 5 - Production/Stable', 24 | 'Environment :: Console', 25 | 'Intended Audience :: End Users/Desktop', 26 | 'License :: OSI Approved :: MIT License', 27 | 'Programming Language :: Python :: 3', 28 | 'Programming Language :: Python :: 3.4', 29 | 'Topic :: Internet :: WWW/HTTP', 30 | 'Topic :: System :: Archiving', 31 | 'Topic :: Utilities' 32 | ], 33 | keywords='har http archive extractor', 34 | url='https://github.com/dead-beef/har-extractor', 35 | author='dead-beef', 36 | author_email='contact@dead-beef.tk', 37 | license='MIT', 38 | py_modules=['har_extractor'], 39 | entry_points={ 40 | 'console_scripts': [ 41 | 'har-extractor=har_extractor:main' 42 | ] 43 | }, 44 | test_suite='setup.tests', 45 | install_requires=['ijson'], 46 | extras_require={ 47 | 'dev': [ 48 | 'coverage', 49 | 'twine>=1.8.1', 50 | 'wheel' 51 | ] 52 | }, 53 | python_requires='>=3', 54 | include_package_data=True, 55 | zip_safe=False 56 | ) 57 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | har-extractor 2 | ============= 3 | 4 | .. image:: https://img.shields.io/pypi/v/har-extractor.svg 5 | :target: https://pypi.python.org/pypi/har-extractor 6 | .. image:: https://img.shields.io/pypi/status/har-extractor.svg 7 | :target: https://pypi.python.org/pypi/har-extractor 8 | .. image:: https://img.shields.io/pypi/format/har-extractor.svg 9 | :target: https://pypi.python.org/pypi/har-extractor 10 | .. image:: https://img.shields.io/librariesio/github/dead-beef/har-extractor.svg 11 | :target: https://libraries.io/pypi/har-extractor 12 | .. image:: https://img.shields.io/pypi/pyversions/har-extractor.svg 13 | :target: https://python.org 14 | .. image:: https://img.shields.io/pypi/l/har-extractor.svg 15 | :target: https://github.com/dead-beef/har-extractor/blob/master/LICENSE 16 | 17 | Overview 18 | -------- 19 | 20 | Extractor for 21 | `HAR `__, 22 | HTTP Archive format. 23 | 24 | Requirements 25 | ------------ 26 | 27 | - `Python 3 `__ 28 | 29 | Optional 30 | ~~~~~~~~ 31 | 32 | - `YAJL 2 `__ 33 | - `CFFI `__ 34 | 35 | Installation 36 | ------------ 37 | 38 | .. code:: bash 39 | 40 | pip install har-extractor 41 | 42 | Usage 43 | ----- 44 | 45 | :: 46 | 47 | usage: har-extractor [-h] [-V] [-l] [-o DIRECTORY] [-v] [-nv] [-i] [-ni] 48 | [-s] [-ns] [-d] [-nd] 49 | FILE 50 | 51 | positional arguments: 52 | FILE HAR file 53 | 54 | optional arguments: 55 | -h, --help show this help message and exit 56 | -V, --version show program's version number and exit 57 | -l, --list list the contents of input file 58 | -o DIRECTORY, --output DIRECTORY 59 | set output directory (default: ./.d) 60 | -v, --verbose turn on verbose output (default) 61 | -nv, --no-verbose turn off verbose output 62 | -i, --iterative use iterative json parser 63 | -ni, --no-iterative do not use iterative json parser (default) 64 | -s, --strict exit and delete extracted data after first error 65 | -ns, --no-strict ignore errors (default) 66 | -d, --directories create url directories (default) 67 | -nd, --no-directories 68 | do not create url directories 69 | 70 | Development 71 | ----------- 72 | 73 | Installation 74 | ~~~~~~~~~~~~ 75 | 76 | .. code:: bash 77 | 78 | git clone https://github.com/dead-beef/har-extractor 79 | cd har-extractor 80 | pip install -e .[dev] 81 | 82 | Building 83 | ~~~~~~~~ 84 | 85 | .. code:: bash 86 | 87 | ./build.sh 88 | 89 | Testing 90 | ~~~~~~~ 91 | 92 | .. code:: bash 93 | 94 | ./test 95 | 96 | Licenses 97 | -------- 98 | 99 | - `har-extractor `__ 100 | 101 | -------------------------------------------------------------------------------- /tests/data.py: -------------------------------------------------------------------------------- 1 | TEST_ARCHIVE = { 2 | 'log': { 3 | 'entries': [ 4 | { 5 | 'request': { 6 | 'method': 'GET', 7 | 'url': 'https://127.0.0.1/', 8 | }, 9 | 'response': { 10 | 'status': 200, 11 | 'statusText': 'OK', 12 | 'content': { 13 | 'mimeType': 'text/plain', 14 | 'size': 4, 15 | 'text': 'test' 16 | } 17 | } 18 | }, 19 | { 20 | 'request': { 21 | 'method': 'GET', 22 | 'url': 'https://127.0.0.1/dir/', 23 | }, 24 | 'response': { 25 | 'status': 200, 26 | 'statusText': 'OK', 27 | 'content': { 28 | 'mimeType': "text/plain", 29 | 'size': 8, 30 | 'encoding': 'base64', 31 | 'text': 'dGVzdDIK' 32 | } 33 | } 34 | }, 35 | { 36 | 'request': { 37 | 'method': 'GET', 38 | 'url': 'https://127.0.0.1/404', 39 | }, 40 | 'response': { 41 | 'status': 404, 42 | 'statusText': 'Not Found', 43 | 'content': { 44 | 'mimeType': "text/plain", 45 | 'size': 0, 46 | 'text': '' 47 | } 48 | } 49 | } 50 | ] 51 | } 52 | } 53 | 54 | TEST_ARCHIVE_INVALID = { 55 | 'log': { 56 | 'entries': [ 57 | { 58 | 'response': { 59 | 'status': 200, 60 | 'statusText': 'OK', 61 | 'content': { 62 | 'mimeType': 'text/plain', 63 | 'size': 4, 64 | 'text': 'test' 65 | } 66 | } 67 | }, 68 | {}, 69 | { 70 | 'request': { 71 | 'method': 'GET', 72 | 'url': 'https://127.0.0.1/404', 73 | }, 74 | 'response': { 75 | 'status': 404, 76 | 'statusText': 'Not Found', 77 | 'content': { 78 | 'mimeType': "text/plain", 79 | 'size': 3, 80 | 'text': '404' 81 | } 82 | } 83 | } 84 | ] 85 | } 86 | } 87 | 88 | TEST_ARCHIVE_CONTENTS = [ 89 | ('test', 'index.html'), 90 | (b'test2\n', 'dir') 91 | ] 92 | 93 | TEST_ARCHIVE_VERBOSE = '''GET https://127.0.0.1/ -> 200 OK text/plain 4B 94 | \t----> dir/index.html 95 | GET https://127.0.0.1/dir/ -> 200 OK text/plain 8B 96 | \t----> dir/dir 97 | GET https://127.0.0.1/404 -> 404 Not Found text/plain 0B 98 | \t----> 99 | ''' 100 | 101 | TEST_ARCHIVE_INVALID_VERBOSE = ''' -> 200 OK text/plain 4B 102 | -> 103 | \t----> 104 | GET https://127.0.0.1/404 -> 404 Not Found text/plain 3B 105 | \t----> dir/404 106 | ''' 107 | 108 | TEST_ARCHIVE_LIST = '''GET https://127.0.0.1/ -> 200 OK text/plain 4B 109 | GET https://127.0.0.1/dir/ -> 200 OK text/plain 8B 110 | GET https://127.0.0.1/404 -> 404 Not Found text/plain 0B 111 | ''' 112 | 113 | TEST_ARCHIVE_INVALID_LIST = ''' -> 200 OK text/plain 4B 114 | -> 115 | GET https://127.0.0.1/404 -> 404 Not Found text/plain 3B 116 | ''' 117 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | # pylint:disable=too-many-arguments 2 | 3 | from unittest import TestCase 4 | from unittest.mock import patch, mock_open 5 | from io import StringIO 6 | 7 | from har_extractor import main 8 | 9 | 10 | @patch('sys.stderr', new_callable=StringIO) 11 | @patch('shutil.rmtree') 12 | @patch('builtins.open', new_callable=mock_open) 13 | @patch('har_extractor.get_out_dir', return_value='outdir') 14 | @patch('har_extractor.get_entries', return_value='entries') 15 | @patch('har_extractor.extract') 16 | class TestMain(TestCase): 17 | @patch('sys.exit') 18 | def test_error(self, exit_, extract, *_): 19 | self.assertEqual(main(['-v']), 1) 20 | exit_.assert_called_with(2) 21 | self.assertEqual(extract.call_count, 0) 22 | 23 | def test_default(self, extract, get_entries, 24 | get_out_dir, open_, rmtree, stderr): 25 | handle = open_() 26 | self.assertEqual(main(['file']), 0) 27 | stderr.seek(0) 28 | self.assertEqual(stderr.read(), '') 29 | self.assertEqual(rmtree.call_count, 0) 30 | get_out_dir.assert_called_with(None, 'file.d') 31 | open_.assert_called_with('file', 'rb') 32 | get_entries.assert_called_with(handle, False) 33 | extract.assert_called_with('entries', 'outdir', True, True, False) 34 | 35 | def test_list(self, extract, get_entries, 36 | get_out_dir, open_, rmtree, stderr): 37 | handle = open_() 38 | self.assertEqual(main(['-l', 'file']), 0) 39 | stderr.seek(0) 40 | self.assertEqual(stderr.read(), '') 41 | self.assertEqual(rmtree.call_count, 0) 42 | self.assertEqual(get_out_dir.call_count, 0) 43 | open_.assert_called_with('file', 'rb') 44 | get_entries.assert_called_with(handle, False) 45 | extract.assert_called_with('entries', None, True, True, False) 46 | 47 | def test_args(self, extract, get_entries, 48 | get_out_dir, open_, rmtree, stderr): 49 | handle = open_() 50 | self.assertEqual( 51 | main(['-v', '-i', '-d', '-s', '-o', 'dir', 'file']), 52 | 0 53 | ) 54 | stderr.seek(0) 55 | self.assertEqual(stderr.read(), '') 56 | self.assertEqual(rmtree.call_count, 0) 57 | get_out_dir.assert_called_with('dir', 'file.d') 58 | open_.assert_called_with('file', 'rb') 59 | get_entries.assert_called_with(handle, True) 60 | extract.assert_called_with('entries', 'outdir', True, True, True) 61 | 62 | self.assertEqual( 63 | main(['-nv', '-ni', '-nd', '-ns', '-o', 'dir', 'file']), 64 | 0 65 | ) 66 | stderr.seek(0) 67 | self.assertEqual(stderr.read(), '') 68 | self.assertEqual(rmtree.call_count, 0) 69 | get_out_dir.assert_called_with('dir', 'file.d') 70 | open_.assert_called_with('file', 'rb') 71 | get_entries.assert_called_with(handle, False) 72 | extract.assert_called_with('entries', 'outdir', False, False, False) 73 | 74 | def test_getdir_error(self, extract, get_entries, 75 | get_out_dir, open_, rmtree, stderr): 76 | get_out_dir.side_effect = ValueError('value error') 77 | self.assertEqual(main(['-v', 'file']), 1) 78 | stderr.seek(0) 79 | self.assertEqual(stderr.read(), 'value error\n') 80 | self.assertEqual(rmtree.call_count, 0) 81 | get_out_dir.assert_called_with(None, 'file.d') 82 | self.assertEqual(open_.call_count, 0) 83 | self.assertEqual(get_entries.call_count, 0) 84 | self.assertEqual(extract.call_count, 0) 85 | 86 | def test_extract_error(self, extract, get_entries, 87 | get_out_dir, open_, rmtree, stderr): 88 | handle = open_() 89 | extract.side_effect = IOError('io error') 90 | self.assertEqual(main(['file']), 1) 91 | stderr.seek(0) 92 | self.assertEqual(stderr.read(), 'io error\n') 93 | self.assertEqual(rmtree.call_count, 0) 94 | get_out_dir.assert_called_with(None, 'file.d') 95 | open_.assert_called_with('file', 'rb') 96 | get_entries.assert_called_with(handle, False) 97 | extract.assert_called_with('entries', 'outdir', True, True, False) 98 | 99 | def test_extract_error_strict(self, extract, get_entries, 100 | get_out_dir, open_, rmtree, stderr): 101 | handle = open_() 102 | extract.side_effect = IOError('io error') 103 | self.assertEqual(main(['-s', 'file']), 1) 104 | stderr.seek(0) 105 | self.assertEqual(stderr.read(), 'io error\n') 106 | rmtree.assert_called_with('outdir') 107 | get_out_dir.assert_called_with(None, 'file.d') 108 | open_.assert_called_with('file', 'rb') 109 | get_entries.assert_called_with(handle, False) 110 | extract.assert_called_with('entries', 'outdir', True, True, True) 111 | -------------------------------------------------------------------------------- /tests/test_extract.py: -------------------------------------------------------------------------------- 1 | # pylint:disable=too-many-arguments 2 | 3 | from unittest import TestCase 4 | from unittest.mock import patch, call 5 | from io import StringIO 6 | 7 | import os 8 | 9 | from har_extractor import extract 10 | 11 | from data import ( 12 | TEST_ARCHIVE, TEST_ARCHIVE_LIST, TEST_ARCHIVE_CONTENTS, 13 | TEST_ARCHIVE_VERBOSE, TEST_ARCHIVE_INVALID_VERBOSE, 14 | TEST_ARCHIVE_INVALID, TEST_ARCHIVE_INVALID_LIST 15 | ) 16 | 17 | @patch('os.path.exists', return_value=False) 18 | @patch('os.makedirs') 19 | @patch('har_extractor.make_entry_dirs') 20 | @patch('har_extractor.write') 21 | @patch('sys.stderr', new_callable=StringIO) 22 | @patch('sys.stdout', new_callable=StringIO) 23 | class TestDoExtract(TestCase): 24 | def test_extract(self, stdout, stderr, 25 | write, make_entry_dirs, makedirs, _): 26 | extract(TEST_ARCHIVE['log']['entries'], 'dir/') 27 | stdout.seek(0) 28 | stderr.seek(0) 29 | makedirs.assert_called_with('dir/', exist_ok=True) 30 | write.assert_has_calls([ 31 | call(content, os.path.join('dir', fname)) 32 | for content, fname in TEST_ARCHIVE_CONTENTS 33 | ]) 34 | self.assertEqual(make_entry_dirs.call_count, 0) 35 | self.assertEqual(stdout.read(), '') 36 | self.assertEqual(stderr.read(), '') 37 | 38 | def test_extract_subdirs(self, stdout, stderr, 39 | write, make_entry_dirs, makedirs, _): 40 | extract(TEST_ARCHIVE['log']['entries'], 'dir/', subdirs=True) 41 | stdout.seek(0) 42 | stderr.seek(0) 43 | makedirs.assert_has_calles([ 44 | call('dir/', exist_ok=True), 45 | call('dir/127.0.0.1', exist_ok=True) 46 | ]) 47 | write.assert_has_calls([ 48 | call(content, os.path.join('dir/127.0.0.1', fname)) 49 | for content, fname in TEST_ARCHIVE_CONTENTS 50 | ]) 51 | make_entry_dirs.assert_has_calls([ 52 | call('dir/', os.path.join('dir/127.0.0.1', fname)) 53 | for _, fname in TEST_ARCHIVE_CONTENTS 54 | ]) 55 | self.assertEqual(make_entry_dirs.call_count, 2) 56 | self.assertEqual(stdout.read(), '') 57 | self.assertEqual(stderr.read(), '') 58 | 59 | def test_extract_verbose(self, stdout, stderr, 60 | write, make_entry_dirs, makedirs, _): 61 | extract(TEST_ARCHIVE['log']['entries'], 'dir/', verbose=True) 62 | stdout.seek(0) 63 | stderr.seek(0) 64 | makedirs.assert_called_with('dir/', exist_ok=True) 65 | write.assert_has_calls([ 66 | call(content, os.path.join('dir', fname)) 67 | for content, fname in TEST_ARCHIVE_CONTENTS 68 | ]) 69 | self.assertEqual(make_entry_dirs.call_count, 0) 70 | self.assertEqual(stdout.read(), TEST_ARCHIVE_VERBOSE) 71 | self.assertEqual(stderr.read(), '') 72 | 73 | def test_extract_invalid_all(self, stdout, stderr, 74 | write, make_entry_dirs, makedirs, _): 75 | extract(TEST_ARCHIVE_INVALID['log']['entries'], 76 | 'dir/', verbose=True, exit_on_error=False) 77 | stdout.seek(0) 78 | stderr.seek(0) 79 | makedirs.assert_called_with('dir/', exist_ok=True) 80 | write.assert_called_with('404', 'dir/404') 81 | self.assertEqual(make_entry_dirs.call_count, 0) 82 | self.assertEqual(stdout.read(), TEST_ARCHIVE_INVALID_VERBOSE) 83 | self.assertNotEqual(stderr.read(), '') 84 | 85 | def test_extract_invalid(self, stdout, stderr, 86 | write, make_entry_dirs, makedirs, _): 87 | output = TEST_ARCHIVE_INVALID_LIST.split('\n') 88 | output = output[0] + '\n' 89 | with self.assertRaises(ValueError): 90 | extract(TEST_ARCHIVE_INVALID['log']['entries'], 91 | 'dir/', verbose=True) 92 | stdout.seek(0) 93 | stderr.seek(0) 94 | makedirs.assert_called_with('dir/', exist_ok=True) 95 | self.assertEqual(make_entry_dirs.call_count, 0) 96 | self.assertEqual(write.call_count, 0) 97 | self.assertEqual(stdout.read(), output) 98 | self.assertEqual(stderr.read(), '') 99 | 100 | def test_extract_ioerror(self, stdout, stderr, 101 | write, make_entry_dirs, makedirs, _): 102 | write.side_effect = IOError 103 | 104 | extract(TEST_ARCHIVE['log']['entries'], 105 | 'dir/', verbose=True, exit_on_error=False) 106 | 107 | makedirs.assert_called_with('dir/', exist_ok=True) 108 | stdout.seek(0) 109 | stderr.seek(0) 110 | write.assert_has_calls([ 111 | call(content, os.path.join('dir', fname)) 112 | for content, fname in TEST_ARCHIVE_CONTENTS 113 | ]) 114 | self.assertEqual(make_entry_dirs.call_count, 0) 115 | self.assertEqual(stdout.read(), TEST_ARCHIVE_VERBOSE) 116 | self.assertNotEqual(stderr.read(), '') 117 | 118 | with self.assertRaises(IOError): 119 | extract(TEST_ARCHIVE['log']['entries'], 'dir/', verbose=True) 120 | 121 | def test_extract_list(self, stdout, stderr, 122 | write, make_entry_dirs, makedirs, _): 123 | extract(TEST_ARCHIVE['log']['entries'], None) 124 | stdout.seek(0) 125 | stderr.seek(0) 126 | self.assertEqual(makedirs.call_count, 0) 127 | self.assertEqual(write.call_count, 0) 128 | self.assertEqual(make_entry_dirs.call_count, 0) 129 | self.assertEqual(stdout.read(), TEST_ARCHIVE_LIST) 130 | self.assertEqual(stderr.read(), '') 131 | 132 | def test_extract_list_invalid(self, stdout, stderr, 133 | write, make_entry_dirs, makedirs, _): 134 | extract(TEST_ARCHIVE_INVALID['log']['entries'], None) 135 | stdout.seek(0) 136 | stderr.seek(0) 137 | self.assertEqual(stdout.read(), TEST_ARCHIVE_INVALID_LIST) 138 | self.assertEqual(stderr.read(), '') 139 | self.assertEqual(makedirs.call_count, 0) 140 | self.assertEqual(write.call_count, 0) 141 | self.assertEqual(make_entry_dirs.call_count, 0) 142 | #extract(TEST_ARCHIVE_INVALID['log']['entries'], None, False, False) 143 | -------------------------------------------------------------------------------- /har_extractor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from argparse import ArgumentParser 4 | from urllib.parse import urlparse 5 | from base64 import b64decode 6 | from itertools import count 7 | 8 | import os 9 | import sys 10 | import json 11 | import shutil 12 | 13 | try: 14 | import ijson.backends.yajl2_cffi as ijson 15 | except ImportError: 16 | try: 17 | import ijson.backends.yajl2 as ijson 18 | except ImportError: 19 | try: 20 | import ijson 21 | except ImportError: 22 | ijson = None 23 | 24 | 25 | __appname__ = 'har-extractor' 26 | __version__ = '1.0.1' 27 | 28 | NAME_VERSION = '%s %s' % (__appname__, __version__) 29 | SIZE_UNITS = 'BKMGT' 30 | 31 | 32 | def format_size(size): 33 | if size < 0: 34 | return '' 35 | unit_name = 'B' 36 | unit_value = 1 37 | for unit_name in SIZE_UNITS: 38 | if size < 1024 * unit_value: 39 | break 40 | unit_value *= 1024 41 | if size % unit_value == 0: 42 | return '%d%s' % (size // unit_value, unit_name) 43 | return '%.2f%s' % (size / unit_value, unit_name) 44 | 45 | def get_unused_name(path): 46 | if not os.path.exists(path): 47 | return path 48 | name, ext = os.path.splitext(path) 49 | i = 1 50 | while True: 51 | path = '%s.%d%s' % (name, i, ext) 52 | if not os.path.exists(path): 53 | return path 54 | i += 1 55 | 56 | def write(content, fname): 57 | if isinstance(content, bytes): 58 | mode = 'wb' 59 | else: 60 | mode = 'w' 61 | with open(fname, mode) as fp: 62 | fp.write(content) 63 | 64 | def format_entry(entry): 65 | request = entry.get('request', {}) 66 | response = entry.get('response', {}) 67 | content = response.get('content', {}) 68 | return '%s %s -> %s %s %s %s' % ( 69 | request.get('method', ''), 70 | request.get('url', ''), 71 | response.get('status', ''), 72 | response.get('statusText', ''), 73 | content.get('mimeType', ''), 74 | format_size(content.get('size', -1)) 75 | ) 76 | 77 | def get_entry_content(entry): 78 | try: 79 | content = entry['response']['content'] 80 | except KeyError: 81 | return None 82 | try: 83 | text = content['text'] 84 | if not text: 85 | return None 86 | except KeyError: 87 | return None 88 | 89 | try: 90 | if content['encoding'] == 'base64': 91 | text = b64decode(text) 92 | else: 93 | raise ValueError( 94 | '\tUnknown content encoding: "%s"' % content['encoding'] 95 | ) 96 | except KeyError: 97 | pass 98 | 99 | return text 100 | 101 | def get_entry_path(entry, subdirs=False): 102 | try: 103 | url = urlparse(entry['request']['url']) 104 | except KeyError: 105 | raise ValueError('Invalid entry: missing request URL: %s' % repr(entry)) 106 | 107 | fname = url.path.strip('/') 108 | if fname == '': 109 | fname = 'index.html' 110 | 111 | if subdirs: 112 | return os.path.join(url.netloc, fname) 113 | return os.path.basename(fname) 114 | 115 | def get_entries(fp, iterative=True): 116 | if fp is sys.stdin: 117 | iterative = True 118 | fp = fp.buffer 119 | 120 | if ijson is None or not iterative: 121 | data = fp.read() 122 | if isinstance(data, bytes): 123 | data = data.decode('utf-8') 124 | data = json.loads(data) 125 | return data['log']['entries'] 126 | else: 127 | return ijson.items(fp, 'log.entries.item') 128 | 129 | def get_out_dir(path, default): 130 | if not path: 131 | return default 132 | if os.path.exists(path): 133 | if not os.path.isdir(path): 134 | raise ValueError('"%s" is not a directory' % path) 135 | return os.path.join(path, default) 136 | return path 137 | 138 | def dirnames(entry, root): 139 | path = os.path.relpath(entry, root) 140 | ret = [] 141 | path = os.path.dirname(path) 142 | while path: 143 | ret.append(os.path.join(root, path)) 144 | path = os.path.dirname(path) 145 | return ret 146 | 147 | def move_files_to_dir(path, first): 148 | dirname, name = os.path.split(path) 149 | name, ext = os.path.splitext(name) 150 | shutil.move(first, os.path.join(path, 'index.html')) 151 | for i in count(1): 152 | fpath = os.path.join(dirname, '%s.%d%s' % (name, i, ext)) 153 | if not os.path.exists(fpath): 154 | return 155 | fname = 'index.%d.html' % i 156 | shutil.move(fpath, os.path.join(path, fname)) 157 | 158 | def make_entry_dirs(root, entry): 159 | try: 160 | os.makedirs(os.path.dirname(entry), exist_ok=True) 161 | return 162 | except OSError: 163 | for path in reversed(dirnames(entry, root)): 164 | if not os.path.exists(path): 165 | os.mkdir(path) 166 | elif not os.path.isdir(path): 167 | tmp = get_unused_name(path) 168 | shutil.move(path, tmp) 169 | os.mkdir(path) 170 | move_files_to_dir(path, tmp) 171 | 172 | 173 | def extract(entries, outdir=None, 174 | subdirs=False, verbose=False, exit_on_error=True): 175 | if outdir is not None: 176 | os.makedirs(outdir, exist_ok=True) 177 | 178 | for entry in entries: 179 | try: 180 | if verbose or outdir is None: 181 | print(format_entry(entry)) 182 | 183 | if outdir is None: 184 | continue 185 | 186 | content = get_entry_content(entry) 187 | if content is None: 188 | if verbose: 189 | print('\t----> ') 190 | continue 191 | 192 | fname = get_entry_path(entry, subdirs) 193 | fname = os.path.join(outdir, fname) 194 | fname = get_unused_name(fname) 195 | if verbose: 196 | print('\t---->', fname) 197 | 198 | try: 199 | if subdirs: 200 | make_entry_dirs(outdir, fname) 201 | write(content, fname) 202 | except (OSError, IOError) as err: 203 | msg = 'Could not write "%s": %s' % (fname, repr(err)) 204 | if exit_on_error: 205 | raise IOError(msg) 206 | else: 207 | print(msg, file=sys.stderr) 208 | except (KeyError, ValueError) as err: 209 | msg = 'Invalid entry: %s: %s' % (repr(entry), repr(err)) 210 | if exit_on_error: 211 | raise ValueError(msg) 212 | else: 213 | print(msg, file=sys.stderr) 214 | 215 | 216 | def main(args=None): 217 | parser = ArgumentParser(args) 218 | 219 | parser.add_argument('file', metavar='FILE', help='HAR file') 220 | 221 | parser.add_argument('-V', '--version', 222 | action='version', version=NAME_VERSION) 223 | 224 | parser.add_argument('-l', '--list', action='store_true', 225 | help='list the contents of input file') 226 | 227 | parser.add_argument('-o', '--output', 228 | metavar='DIRECTORY', default=None, 229 | help='set output directory (default: ./.d)') 230 | 231 | parser.add_argument('-v', '--verbose', 232 | dest='verbose', 233 | action='store_true', 234 | help='turn on verbose output (default)') 235 | 236 | parser.add_argument('-nv', '--no-verbose', 237 | dest='verbose', 238 | action='store_false', 239 | help='turn off verbose output') 240 | 241 | parser.add_argument('-i', '--iterative', 242 | dest='iterative', 243 | action='store_true', 244 | help='use iterative json parser') 245 | 246 | parser.add_argument('-ni', '--no-iterative', 247 | dest='iterative', 248 | action='store_false', 249 | help='do not use iterative json parser (default)') 250 | 251 | parser.add_argument('-s', '--strict', 252 | dest='strict', 253 | action='store_true', 254 | help='exit and delete extracted data after first error') 255 | 256 | parser.add_argument('-ns', '--no-strict', 257 | dest='strict', 258 | action='store_false', 259 | help='ignore errors (default)') 260 | 261 | parser.add_argument('-d', '--directories', 262 | dest='directories', 263 | action='store_true', 264 | help='create url directories (default)') 265 | 266 | parser.add_argument('-nd', '--no-directories', 267 | dest='directories', 268 | action='store_false', 269 | help='do not create url directories') 270 | 271 | parser.set_defaults( 272 | iterative=False, 273 | directories=True, 274 | strict=False, 275 | verbose=True 276 | ) 277 | 278 | if args is not None: 279 | args = parser.parse_args(args) 280 | else: 281 | args = parser.parse_args() 282 | 283 | if args.file is None: 284 | return 1 285 | 286 | if args.list: 287 | outdir = None 288 | else: 289 | try: 290 | outdir = get_out_dir(args.output, 291 | os.path.basename(args.file) + '.d') 292 | except ValueError as err: 293 | print(err, file=sys.stderr) 294 | return 1 295 | 296 | try: 297 | with open(args.file, 'rb') as fp: 298 | entries = get_entries(fp, args.iterative) 299 | extract(entries, outdir, 300 | args.directories, args.verbose, args.strict) 301 | except (ValueError, IOError) as err: 302 | if args.strict: 303 | shutil.rmtree(outdir) 304 | print(err, file=sys.stderr) 305 | return 1 306 | 307 | return 0 308 | 309 | 310 | if __name__ == '__main__': 311 | sys.exit(main()) 312 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | # pylint:disable=too-many-arguments 2 | 3 | from unittest import TestCase 4 | from unittest.mock import patch, mock_open, call 5 | 6 | from io import BytesIO 7 | 8 | import json 9 | 10 | from har_extractor import ( 11 | format_size, get_unused_name, write, get_out_dir, 12 | format_entry, get_entry_content, get_entry_path, 13 | get_entries, dirnames, move_files_to_dir, make_entry_dirs 14 | ) 15 | 16 | from data import TEST_ARCHIVE 17 | 18 | 19 | class TestUtils(TestCase): 20 | def test_format_size(self): 21 | self.assertEqual(format_size(-1), '') 22 | self.assertEqual(format_size(0), '0B') 23 | self.assertEqual(format_size(512), '512B') 24 | self.assertEqual(format_size(1024), '1K') 25 | self.assertEqual(format_size(2048), '2K') 26 | self.assertEqual(format_size(1536), '1.50K') 27 | self.assertEqual(format_size(4 * (1024 ** 2)), '4M') 28 | self.assertEqual(format_size(4 * (1024 ** 3) + 1), '4.00G') 29 | self.assertEqual(format_size(1024 ** 4), '1T') 30 | 31 | @staticmethod 32 | @patch('builtins.open', new_callable=mock_open) 33 | def test_write(open_mock): 34 | handle = open_mock() 35 | write('content', 'fname') 36 | open_mock.assert_called_with('fname', 'w') 37 | handle.write.assert_called_with('content') 38 | write(b'\x01\x02\x03', '') 39 | open_mock.assert_called_with('', 'wb') 40 | handle.write.assert_called_with(b'\x01\x02\x03') 41 | 42 | @patch('os.path.exists') 43 | def test_get_unused_name(self, exists): 44 | exists.side_effect = [False] 45 | self.assertEqual(get_unused_name('/dir/name.ext'), '/dir/name.ext') 46 | exists.side_effect = [True, False] 47 | self.assertEqual(get_unused_name('/dir/name.ext'), '/dir/name.1.ext') 48 | exists.side_effect = [True, True, True, False] 49 | self.assertEqual(get_unused_name('/dir/name.ext'), '/dir/name.3.ext') 50 | 51 | @patch('os.path.exists', return_value=False) 52 | @patch('os.path.isdir', return_value=True) 53 | def test_get_out_dir(self, isdir, exists): 54 | self.assertEqual(get_out_dir(None, 'default'), 'default') 55 | self.assertEqual(get_out_dir('', 'default'), 'default') 56 | self.assertEqual(exists.call_count, 0) 57 | self.assertEqual(isdir.call_count, 0) 58 | 59 | self.assertEqual(get_out_dir('path', 'default'), 'path') 60 | exists.assert_called_with('path') 61 | exists.reset_mock() 62 | self.assertEqual(isdir.call_count, 0) 63 | 64 | exists.return_value = True 65 | self.assertEqual(get_out_dir('path', 'default'), 'path/default') 66 | exists.assert_called_with('path') 67 | isdir.assert_called_with('path') 68 | 69 | with self.assertRaises(ValueError): 70 | isdir.return_value = False 71 | get_out_dir('path', 'default') 72 | 73 | @patch('har_extractor.format_size', return_value='') 74 | def test_format_entry(self, format_size_mock): 75 | entry = { 76 | 'request': { 77 | 'method': 'METHOD', 78 | 'url': '/url' 79 | }, 80 | 'response': { 81 | 'status': 123, 82 | 'statusText': 'Status Text', 83 | 'content': { 84 | 'mimeType': 'mime/type', 85 | 'size': 321 86 | } 87 | } 88 | } 89 | formatted = 'METHOD /url -> 123 Status Text mime/type ' 90 | self.assertEqual(format_entry(entry), formatted) 91 | format_size_mock.assert_called_with(321) 92 | 93 | def test_format_entry_invalid(self): 94 | formatted = ' -> ' \ 95 | ' ' 96 | self.assertEqual(format_entry({}), formatted) 97 | 98 | def test_get_entry_path(self): 99 | test = lambda url: get_entry_path({'request': {'url': url}}) 100 | self.assertEqual(test('http://127.0.0.1'), 'index.html') 101 | self.assertEqual(test('http://127.0.0.1/'), 'index.html') 102 | self.assertEqual(test('http://127.0.0.1/dir/'), 'dir') 103 | self.assertEqual(test('http://127.0.0.1/dir'), 'dir') 104 | self.assertEqual(test('http://127.0.0.1/dir/name'), 'name') 105 | self.assertEqual(test('http://127.0.0.1/dir/name/'), 'name') 106 | self.assertEqual(test('http://127.0.0.1/dir/name?arg'), 'name') 107 | self.assertEqual(test('http://127.0.0.1/dir/name/?arg'), 'name') 108 | with self.assertRaises(ValueError): 109 | get_entry_path({}) 110 | 111 | def test_get_entry_path_subdirs(self): 112 | test = lambda url: get_entry_path({'request': {'url': url}}, True) 113 | self.assertEqual(test('http://127.0.0.1'), '127.0.0.1/index.html') 114 | self.assertEqual(test('http://127.0.0.1/'), '127.0.0.1/index.html') 115 | self.assertEqual(test('http://127.0.0.1/dir/'), '127.0.0.1/dir') 116 | self.assertEqual(test('http://127.0.0.1/dir'), '127.0.0.1/dir') 117 | self.assertEqual(test('http://127.0.0.1/dir/name'), '127.0.0.1/dir/name') 118 | self.assertEqual(test('http://127.0.0.1/dir/name/'), '127.0.0.1/dir/name') 119 | self.assertEqual(test('http://127.0.0.1/dir/name?arg'), '127.0.0.1/dir/name') 120 | self.assertEqual(test('http://127.0.0.1/dir/name/?arg'), '127.0.0.1/dir/name') 121 | 122 | def test_get_entry_content(self): 123 | entry = { 124 | 'response': { 125 | 'content': { 126 | } 127 | } 128 | } 129 | self.assertIsNone(get_entry_content({})) 130 | self.assertIsNone(get_entry_content(entry)) 131 | 132 | entry['response']['content']['text'] = '' 133 | self.assertIsNone(get_entry_content(entry)) 134 | 135 | entry['response']['content']['text'] = 'test' 136 | self.assertEqual(get_entry_content(entry), 'test') 137 | 138 | entry['response']['content']['text'] = 'dGVzdA==' 139 | entry['response']['content']['encoding'] = 'base64' 140 | self.assertEqual(get_entry_content(entry), b'test') 141 | 142 | with self.assertRaises(ValueError): 143 | entry['response']['content']['encoding'] = 'encoding' 144 | get_entry_content(entry) 145 | 146 | def test_dirnames(self): 147 | self.assertEqual(dirnames('/x/y/z/', '/x/'), ['/x/y']) 148 | self.assertEqual(dirnames('/x/y/z/', '/x'), ['/x/y']) 149 | self.assertEqual(dirnames('/x/y/z', '/x'), ['/x/y']) 150 | self.assertEqual(dirnames('/x/y/z/u/v', '/x/'), 151 | ['/x/y/z/u', '/x/y/z', '/x/y']) 152 | 153 | @patch('shutil.move') 154 | @patch('os.path.exists', side_effect=[True, True, False]) 155 | def test_move_files_to_dir(self, exists, move): 156 | move_files_to_dir('/a/b/c.d', '/a/b/c.3.d') 157 | exists.assert_has_calls([ 158 | call('/a/b/c.%d.d' % i) for i in range(1, 4) 159 | ]) 160 | self.assertEqual(exists.call_count, 3) 161 | move.assert_has_calls([ 162 | call('/a/b/c.3.d', '/a/b/c.d/index.html'), 163 | call('/a/b/c.1.d', '/a/b/c.d/index.1.html'), 164 | call('/a/b/c.2.d', '/a/b/c.d/index.2.html') 165 | ]) 166 | self.assertEqual(move.call_count, 3) 167 | 168 | @patch('har_extractor.move_files_to_dir') 169 | @patch('shutil.move') 170 | @patch('os.makedirs') 171 | @patch('os.mkdir') 172 | def test_make_entry_dirs(self, mkdir, mkdirs, move, move_files_to_dir_): 173 | make_entry_dirs('/root', '/root/dir/entry') 174 | mkdirs.assert_called_with('/root/dir', exist_ok=True) 175 | self.assertEqual(mkdirs.call_count, 1) 176 | self.assertEqual(mkdir.call_count, 0) 177 | self.assertEqual(move.call_count, 0) 178 | self.assertEqual(move_files_to_dir_.call_count, 0) 179 | 180 | @patch('har_extractor.get_unused_name', return_value='unused') 181 | @patch('har_extractor.move_files_to_dir') 182 | @patch('shutil.move') 183 | @patch('os.makedirs', side_effect=OSError) 184 | @patch('os.mkdir') 185 | @patch('os.path.exists', side_effect=[False, True, True]) 186 | @patch('os.path.isdir', side_effect=[False, True]) 187 | def test_make_entry_dirs_error(self, isdir, exists, 188 | mkdir, mkdirs, move, 189 | move_files_to_dir_, get_unused_name_): 190 | make_entry_dirs('/root/dir', '/root/dir/dir2/dir3/4/entry') 191 | 192 | mkdirs.assert_called_with('/root/dir/dir2/dir3/4', exist_ok=True) 193 | self.assertEqual(mkdirs.call_count, 1) 194 | 195 | exists.assert_has_calls([ 196 | call('/root/dir/dir2'), 197 | call('/root/dir/dir2/dir3'), 198 | call('/root/dir/dir2/dir3/4') 199 | ]) 200 | self.assertEqual(exists.call_count, 3) 201 | 202 | isdir.assert_has_calls([ 203 | call('/root/dir/dir2/dir3'), 204 | call('/root/dir/dir2/dir3/4') 205 | ]) 206 | self.assertEqual(isdir.call_count, 2) 207 | 208 | mkdir.assert_has_calls([ 209 | call('/root/dir/dir2'), 210 | call('/root/dir/dir2/dir3') 211 | ]) 212 | self.assertEqual(mkdir.call_count, 2) 213 | 214 | get_unused_name_.assert_called_with('/root/dir/dir2/dir3') 215 | self.assertEqual(get_unused_name_.call_count, 1) 216 | 217 | move.assert_called_with('/root/dir/dir2/dir3', 'unused') 218 | self.assertEqual(move.call_count, 1) 219 | 220 | move_files_to_dir_.assert_called_with('/root/dir/dir2/dir3', 'unused') 221 | self.assertEqual(move_files_to_dir_.call_count, 1) 222 | 223 | 224 | class TestGetEntries(TestCase): 225 | def test_iterative(self): 226 | fp = BytesIO(json.dumps(TEST_ARCHIVE).encode('utf-8')) 227 | items = get_entries(fp, True) 228 | self.assertNotIsInstance(items, list) 229 | self.assertEqual(list(items), TEST_ARCHIVE['log']['entries']) 230 | 231 | def test_not_iterative(self): 232 | fp = BytesIO(json.dumps(TEST_ARCHIVE).encode('utf-8')) 233 | items = get_entries(fp, False) 234 | self.assertIsInstance(items, list) 235 | self.assertEqual(items, TEST_ARCHIVE['log']['entries']) 236 | 237 | @patch('sys.stdin') 238 | def test_stdin(self, stdin): 239 | stdin.buffer = BytesIO(json.dumps(TEST_ARCHIVE).encode('utf-8')) 240 | items = get_entries(stdin, False) 241 | self.assertNotIsInstance(items, list) 242 | self.assertEqual(list(items), TEST_ARCHIVE['log']['entries']) 243 | 244 | @patch('har_extractor.ijson', new=None) 245 | def test_no_ijson(self): 246 | fp = BytesIO(json.dumps(TEST_ARCHIVE).encode('utf-8')) 247 | items = get_entries(fp, True) 248 | self.assertIsInstance(items, list) 249 | self.assertEqual(items, TEST_ARCHIVE['log']['entries']) 250 | --------------------------------------------------------------------------------