├── .gitignore ├── README.rst ├── examples └── hello │ ├── README.md │ ├── hello-orig.cpp │ ├── hello.cpp │ └── ishello.sh ├── setup.py ├── src └── structureshrink │ ├── __init__.py │ ├── __main__.py │ └── shrinker.py └── tests └── test_shrinking.py /.gitignore: -------------------------------------------------------------------------------- 1 | ### JetBrains template 2 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 3 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 4 | 5 | # User-specific stuff: 6 | .idea/workspace.xml 7 | .idea/tasks.xml 8 | .idea/dictionaries 9 | .idea/vcs.xml 10 | .idea/jsLibraryMappings.xml 11 | 12 | # Sensitive or high-churn files: 13 | .idea/dataSources.ids 14 | .idea/dataSources.xml 15 | .idea/dataSources.local.xml 16 | .idea/sqlDataSources.xml 17 | .idea/dynamic.xml 18 | .idea/uiDesigner.xml 19 | 20 | # Gradle: 21 | .idea/gradle.xml 22 | .idea/libraries 23 | 24 | # Mongo Explorer plugin: 25 | .idea/mongoSettings.xml 26 | 27 | ## File-based project format: 28 | *.iws 29 | 30 | ## Plugin-specific files: 31 | 32 | # IntelliJ 33 | /out/ 34 | 35 | # mpeltonen/sbt-idea plugin 36 | .idea_modules/ 37 | 38 | # JIRA plugin 39 | atlassian-ide-plugin.xml 40 | 41 | # Crashlytics plugin (for Android Studio and IntelliJ) 42 | com_crashlytics_export_strings.xml 43 | crashlytics.properties 44 | crashlytics-build.properties 45 | fabric.properties 46 | ### Python template 47 | # Byte-compiled / optimized / DLL files 48 | __pycache__/ 49 | *.py[cod] 50 | *$py.class 51 | 52 | # C extensions 53 | *.so 54 | 55 | # Distribution / packaging 56 | .Python 57 | env/ 58 | build/ 59 | develop-eggs/ 60 | dist/ 61 | downloads/ 62 | eggs/ 63 | .eggs/ 64 | lib/ 65 | lib64/ 66 | parts/ 67 | sdist/ 68 | var/ 69 | *.egg-info/ 70 | .installed.cfg 71 | *.egg 72 | 73 | # PyInstaller 74 | # Usually these files are written by a python script from a template 75 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 76 | *.manifest 77 | *.spec 78 | 79 | # Installer logs 80 | pip-log.txt 81 | pip-delete-this-directory.txt 82 | 83 | # Unit test / coverage reports 84 | htmlcov/ 85 | .tox/ 86 | .coverage 87 | .coverage.* 88 | .cache 89 | nosetests.xml 90 | coverage.xml 91 | *,cover 92 | .hypothesis/ 93 | 94 | # Translations 95 | *.mo 96 | *.pot 97 | 98 | # Django stuff: 99 | *.log 100 | local_settings.py 101 | 102 | # Flask instance folder 103 | instance/ 104 | 105 | # Scrapy stuff: 106 | .scrapy 107 | 108 | # Sphinx documentation 109 | docs/_build/ 110 | 111 | # PyBuilder 112 | target/ 113 | 114 | # IPython Notebook 115 | .ipynb_checkpoints 116 | 117 | # pyenv 118 | .python-version 119 | 120 | # celery beat schedule file 121 | celerybeat-schedule 122 | 123 | # dotenv 124 | .env 125 | 126 | # virtualenv 127 | venv/ 128 | ENV/ 129 | 130 | # Spyder project settings 131 | .spyderproject 132 | 133 | # Rope project settings 134 | .ropeproject 135 | 136 | # Created by .ignore support plugin (hsz.mobi) 137 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Structured Shrinking 2 | ==================== 3 | 4 | structureshrink is a program and library that attempts to find structure in a 5 | file and uses it to produce smaller examples of the same sort of file. 
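For orientation, here is a minimal sketch of how the library side can be used.
This is inferred from the source and tests later in this repository rather than
from any documentation, so treat the details as illustrative rather than as a
stable API:

.. code-block:: python

    from structureshrink import shrink

    def classify(data):
        # Any hashable value works as a label; here candidates are labelled by
        # whether they still contain the token we care about.
        return b"Hello" in data

    # shrink() returns a mapping from each label seen to the smallest example
    # found for that label.
    best = shrink(b"Hello World!++", classify)
    print(best[True])  # smallest input that still contains b"Hello"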
6 | 
7 | It considers an example smaller if it contains strictly fewer bytes, or if it
8 | has the same number of bytes but is lexicographically smaller when treated as
9 | a sequence of unsigned 8-bit integers (currently structureshrink only does a
10 | small amount of shrinking with regard to the latter).
11 | 
12 | Multishrinking
13 | --------------
14 | 
15 | Rather than shrinking with respect to a single predicate, structureshrink
16 | instead shrinks with respect to a *classification*. It takes a function that
17 | maps data to a label. It then tries to produce the smallest example for *each*
18 | label.
19 | 
20 | For a command line invocation the label should usually be the exit status of
21 | the program. For library usage it can be any hashable Python value.
22 | 
23 | This has a couple of advantages:
24 | 
25 | 1. It allows you to `detect interesting failures that might occur during the
26 |    course of shrinking `_.
27 | 2. Attempts to shrink one label can result in shrinking another label. This
28 |    potentially allows escaping local minima.
29 | 
30 | Core Algorithm
31 | --------------
32 | 
33 | At its heart, structureshrink uses a sequence minimisation algorithm. For long
34 | sequences it uses something that is essentially a light optimisation on
35 | `Delta Debugging <https://en.wikipedia.org/wiki/Delta_debugging>`_. For shorter
36 | sequences it tries slightly harder to delete *all* sub-intervals of the
37 | sequence (it only does this for shorter sequences because this is an
38 | intrinsically O(n^2) operation).
39 | 
40 | So far, so uninteresting.
41 | 
42 | The interesting feature of structureshrink is how this algorithm is applied.
43 | 
44 | structureshrink extracts a sequence of interesting ngrams from the string. An
45 | ngram is interesting if it appears at least max(2, n) times in the string.
46 | 
47 | There's no *particularly* principled reason for this choice, except that it
48 | seems to work and it bounds the number of ngrams. A fixed threshold tends to
49 | get a bit excited on highly repetitive data.
50 | 
51 | We also remember previously useful ngrams and try those even if they don't
52 | satisfy the criterion for the current string. This allows us to infer structure
53 | and then make use of it even when it has become less obvious.
54 | 
55 | Once we have these ngrams we do several things with them:
56 | 
57 | 1. We split the data by the occurrences of each ngram in it. We then try to
58 |    shrink the sequence of splits by the property that joining by the ngram
59 |    satisfies the desired criterion (see the sketch after the ordering list below).
60 | 2. We split the data by the occurrences of each ngram in it. We then try to
61 |    shrink the *ngram* (bytewise) such that joining the splits together by that
62 |    ngram satisfies the criterion.
63 | 
64 | Ngrams are processed in the following order (based on the current best at the
65 | time of calculation; the order is not updated as we go):
66 | 
67 | 1. Longer ngrams are processed first.
68 | 2. Given two ngrams of the same length, process first the one whose smallest
69 |    split element is longest (because this is the smallest amount of data that
70 |    can be deleted on a successful shrink, so doing these first gets us to
71 |    smaller strings faster).
72 | 3. Given two ngrams of the same length and the same smallest split size (this
73 |    usually only happens once the ngrams are length one or two), pick the one
74 |    with the *fewest* splits (because at this stage we're usually more concerned
75 |    with sequence minimizer performance than worst case guarantees).
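To make the first of those two operations concrete, here is a deliberately
simplified sketch. The function name is made up for illustration, and a naive
one-chunk-at-a-time deletion loop stands in for the delta-debugging-style
sequence minimiser described above:

.. code-block:: python

    def shrink_splits(data, ngram, criterion):
        # Split around every occurrence of the ngram, then greedily try to
        # drop whole chunks, keeping a deletion only if rejoining the
        # remaining chunks with the ngram still satisfies the criterion.
        parts = data.split(ngram)
        i = 0
        while i < len(parts):
            candidate = parts[:i] + parts[i + 1:]
            if candidate and criterion(ngram.join(candidate)):
                parts = candidate
            else:
                i += 1
        return ngram.join(parts)

    # For example, deleting newline-separated chunks while two markers survive:
    result = shrink_splits(b"keep\nnoise\nnoise\nkeep\n", b"\n",
                           lambda d: d.count(b"keep") == 2)
    assert result == b"keep\nkeep"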
76 | 
77 | We also have additional, more naive phases:
78 | 
79 | 1. Simply apply the sequence shrinking algorithm bytewise to the data.
80 | 
81 | We go through these phases in sequence. Each time a phase produces useful
82 | changes we go back to the beginning rather than moving on to the next phase.
83 | 
84 | We stop once no phase produces a change.
85 | 
86 | 
87 | Advantages of structureshrink
88 | -----------------------------
89 | 
90 | Because structureshrink detects the features it needs from the file rather than
91 | proceeding linewise, it is able to cope with a much wider range of file
92 | formats, both text and binary.
93 | 
94 | It also produces much smaller examples than simple linewise or spacewise
95 | deletion - for example I've seen it happily rename variables in a C++ program
96 | despite knowing literally nothing about the grammar of C++.
97 | 
98 | This can also be a downside, as aggressively minimized programs are not very
99 | readable. To compensate for that, structureshrink lets you specify a
100 | preprocessor for formatting your data (e.g. clang-format). This runs before
101 | shrinking, and can also have the advantage that it speeds up the shrink by
102 | removing useless shrinks (at least, as long as your preprocessor is faster than
103 | your test program).
104 | 
105 | 
106 | Usage
107 | -----
108 | 
109 | There's a library and a command line tool. Neither is what I would describe
110 | as documented.
111 | 
112 | To use the command line tool run:
113 | 
114 | .. code-block::
115 | 
116 |     python setup.py install
117 | 
118 | This will require Python 3.4+.
119 | 
120 | You can now use the structureshrink command.
121 | 
122 | To use it, pass it a file to be shrunk (the file will have its contents
123 | replaced, with a backup created) and a command to run. The results will be
124 | written to the 'shrinks' directory, one file per exit status seen for the command.
125 | 
126 | Development status
127 | ------------------
128 | 
129 | Somewhere between "Research prototype" and "Usable tool", but much closer to
130 | the first one. It seems to work pretty well, and it's not completely fragile,
131 | but it's definitely rough around the edges. It's certainly not going to
132 | maintain backwards compatibility.
133 | 
134 | It's not particularly well tested right now (by my standards it barely counts
135 | as tested at all), so it's probably broken in amusing ways.
136 | 
--------------------------------------------------------------------------------
/examples/hello/README.md:
--------------------------------------------------------------------------------
1 | # Hello World
2 | 
3 | This is [a rather nasty example from John Regehr](http://blog.regehr.org/archives/1284).
4 | 
5 | It takes a C++ implementation of Hello World after a preprocessor has been run
6 | on it and attempts to reduce it to a minimal program that, when compiled, outputs
7 | "Hello". It also notes anything that triggers an internal compiler error along
8 | the way.
9 | 
10 | Structureshrink does surprisingly well on this example despite not knowing
11 | anything about the structure of C++ programs.
12 | 
13 | In one sense this isn't surprising, because this was the main example I developed
14 | it against! In another sense it's quite surprising, because C++ has a fairly
15 | complicated structure and structureshrink performs a lot of non-trivial
16 | transformations on the shrunk example, such as renaming variables and methods to
17 | have shorter names.
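As a rough illustration of how that renaming falls out of the ngram handling
described in the top-level README: shrinking a repeated ngram bytewise rewrites
every occurrence at once. This is a standalone sketch, not structureshrink's
actual code, and `criterion` is a stand-in for re-running the compile-and-grep
test.

```python
source = b"int total_count = 0; total_count += 1; return total_count;"
ngram = b"total_count"       # repeated, so it gets picked up as an ngram
parts = source.split(ngram)  # every occurrence becomes a join point

def criterion(identifier):
    # Stand-in for "the test script still passes"; any non-empty name will do.
    return len(identifier) > 0

# Greedily drop bytes from the ngram itself; because the pieces are rejoined
# with the shortened ngram, every occurrence is renamed consistently at once.
name = ngram
i = 0
while i < len(name):
    shorter = name[:i] + name[i + 1:]
    if criterion(shorter):
        name = shorter
    else:
        i += 1

print(name.join(parts).decode())  # int t = 0; t += 1; return t;
```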
18 | 
--------------------------------------------------------------------------------
/examples/hello/hello-orig.cpp:
--------------------------------------------------------------------------------
1 | #include <iostream>
2 | 
3 | int main() {
4 |     std::cout << "Hello World!++" << std::endl;
5 | }
6 | 
--------------------------------------------------------------------------------
/examples/hello/ishello.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | set -e -x
4 | 
5 | if
6 |     g++ -O3 -w hello.cpp >compiler.out 2>&1
7 | then
8 |     ulimit -t 1 && ./a.out | grep Hello
9 | else
10 |     if
11 |         grep 'internal compiler error' compiler.out
12 |     then
13 |         exit 101
14 |     else
15 |         exit 1
16 |     fi
17 | fi
18 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import find_packages, setup
2 | import os
3 | 
4 | 
5 | def local_file(name):
6 |     return os.path.relpath(os.path.join(os.path.dirname(__file__), name))
7 | 
8 | SOURCE = local_file("src")
9 | README = local_file("README.rst")
10 | 
11 | 
12 | setup(
13 |     name='structureshrink',
14 |     version="0.0.1",
15 |     author='David R. MacIver',
16 |     author_email='david@drmaciver.com',
17 |     packages=find_packages(SOURCE),
18 |     package_dir={"": SOURCE},
19 |     url='https://github.com/DRMacIver/structureshrink',
20 |     license='AGPL v3',
21 |     description='A library and tool for structured shrinking of files',
22 |     zip_safe=False,
23 |     classifiers=[
24 |         "Programming Language :: Python :: 3.5",
25 |         "Programming Language :: Python :: Implementation :: CPython",
26 |     ],
27 |     install_requires=['click'],
28 |     long_description=open(README).read(),
29 |     entry_points={
30 |         'console_scripts': [
31 |             'structureshrink=structureshrink.__main__:shrinker'
32 |         ]
33 |     }
34 | )
35 | 
--------------------------------------------------------------------------------
/src/structureshrink/__init__.py:
--------------------------------------------------------------------------------
1 | from structureshrink.shrinker import shrink, Volume, Shrinker
2 | 
3 | __all__ = ['shrink', 'Volume', 'Shrinker']
4 | 
--------------------------------------------------------------------------------
/src/structureshrink/__main__.py:
--------------------------------------------------------------------------------
1 | from structureshrink import Shrinker, Volume
2 | import os
3 | from shutil import which
4 | import shlex
5 | import click
6 | import subprocess
7 | import hashlib
8 | import signal
9 | import sys
10 | import time
11 | import random
12 | import traceback
13 | 
14 | 
15 | def validate_command(ctx, param, value):
16 |     if value is None:
17 |         return None
18 |     parts = shlex.split(value)
19 |     command = parts[0]
20 | 
21 |     if os.path.exists(command):
22 |         command = os.path.abspath(command)
23 |     else:
24 |         what = which(command)
25 |         if what is None:
26 |             raise click.BadParameter('%s: command not found' % (command,))
27 |         command = os.path.abspath(what)
28 |     return [command] + parts[1:]
29 | 
30 | 
31 | def signal_group(sp, signal):
32 |     gid = os.getpgid(sp.pid)
33 |     assert gid != os.getgid()
34 |     os.killpg(gid, signal)
35 | 
36 | 
37 | def interrupt_wait_and_kill(sp):
38 |     if sp.returncode is None:
39 |         # In case the subprocess forked. Python might hang if you don't close
40 |         # all pipes.
41 | for pipe in [sp.stdout, sp.stderr, sp.stdin]: 42 | if pipe: 43 | pipe.close() 44 | signal_group(sp, signal.SIGINT) 45 | for _ in range(10): 46 | if sp.poll() is not None: 47 | return 48 | time.sleep(0.1) 49 | signal_group(sp, signal.SIGKILL) 50 | 51 | 52 | @click.command( 53 | help=""" 54 | structureshrink takes a file and a test command and attempts to produce a 55 | minimal example of every distinct status code it sees out of that command. 56 | (Normally you're only interested in one, but sometimes there are other 57 | interesting behaviours that occur while running it). 58 | 59 | Usage is 'structureshrink test filename filenames...'. The file will be 60 | repeatedly overwritten with a smaller version of it, with a backup placed in 61 | the backup file. When the program exits the file will be replaced with the 62 | smallest contents that produce the same exit code as was originally present. 63 | 64 | Additional files will not be replaced but will be used as additional examples, 65 | which may discover other interesting files as well as aiding the shrinking 66 | process. 67 | """.strip() 68 | ) 69 | @click.option('--debug', default=False, is_flag=True, help=( 70 | 'Emit (extremely verbose) debug output while shrinking' 71 | )) 72 | @click.option('--principal', default=False, is_flag=True, help=( 73 | 'When set will only try to shrink examples that classify the same as the ' 74 | 'initial example (other values will still be recorded but it will not make' 75 | ' any deliberate attempts to shrink them).' 76 | )) 77 | @click.option( 78 | '--quiet', default=False, is_flag=True, help=( 79 | 'Emit no output at all while shrinking')) 80 | @click.option( 81 | '--backup', default='', help=( 82 | 'Name of the backup file to create. Defaults to adding .bak to the ' 83 | 'name of the source file')) 84 | @click.option( 85 | '--shrinks', default='shrinks', 86 | type=click.Path(file_okay=False, resolve_path=True)) 87 | @click.option('--seed', default=None) 88 | @click.option( 89 | '--preprocess', default=None, callback=validate_command, 90 | help=( 91 | "Provide a command that 'normalizes' the input before it is tested (" 92 | 'e.g. a code formatter). If this command returns a non-zero exit code ' 93 | 'then the example will be skipped altogether.')) 94 | @click.option( 95 | '--timeout', default=1, type=click.FLOAT, help=( 96 | 'Time out subprocesses after this many seconds. 
If set to <= 0 then ' 97 | 'no timeout will be used.')) 98 | @click.option( 99 | '--pass', '-p', 'passes', multiple=True, 100 | help='Run only a single pass' 101 | ) 102 | @click.option('--classify', default=None, callback=validate_command) 103 | @click.argument('test', callback=validate_command) 104 | @click.argument('filename', type=click.Path( 105 | exists=True, resolve_path=True, dir_okay=False, allow_dash=True 106 | )) 107 | @click.argument('filenames', type=click.Path( 108 | exists=True, resolve_path=True, dir_okay=False, allow_dash=False 109 | ), nargs=-1) 110 | def shrinker( 111 | debug, quiet, backup, filename, test, shrinks, preprocess, timeout, 112 | classify, filenames, seed, principal, passes 113 | ): 114 | if debug and quiet: 115 | raise click.UsageError('Cannot have both debug output and be quiet') 116 | 117 | if debug: 118 | def dump_trace(signum, frame): 119 | traceback.print_stack() 120 | signal.signal(signal.SIGQUIT, dump_trace) 121 | 122 | if seed is not None: 123 | random.seed(seed) 124 | 125 | if not backup: 126 | backup = filename + os.extsep + 'bak' 127 | 128 | history = os.path.join(shrinks, 'history') 129 | 130 | try: 131 | os.mkdir(shrinks) 132 | except OSError: 133 | pass 134 | 135 | try: 136 | os.mkdir(history) 137 | except OSError: 138 | pass 139 | 140 | try: 141 | os.remove(backup) 142 | except FileNotFoundError: 143 | pass 144 | 145 | seen_output = set() 146 | 147 | def classify_data(string): 148 | if filename == '-': 149 | sp = subprocess.Popen( 150 | test, stdin=subprocess.PIPE, 151 | stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, 152 | universal_newlines=False, 153 | preexec_fn=os.setsid, 154 | ) 155 | try: 156 | sp.communicate(string, timeout=timeout) 157 | finally: 158 | interrupt_wait_and_kill(sp) 159 | result = sp.returncode 160 | else: 161 | try: 162 | os.rename(filename, backup) 163 | with open(filename, 'wb') as o: 164 | o.write(string) 165 | sp = subprocess.Popen( 166 | test, stdout=subprocess.DEVNULL, stdin=subprocess.DEVNULL, 167 | stderr=subprocess.DEVNULL, universal_newlines=False, 168 | preexec_fn=os.setsid, 169 | ) 170 | try: 171 | sp.communicate(timeout=timeout) 172 | except subprocess.TimeoutExpired: 173 | return 'timeout' 174 | finally: 175 | interrupt_wait_and_kill(sp) 176 | return sp.returncode 177 | finally: 178 | try: 179 | os.remove(filename) 180 | except FileNotFoundError: 181 | pass 182 | os.rename(backup, filename) 183 | if classify is None or result is None: 184 | return result 185 | else: 186 | try: 187 | classify_output = subprocess.check_output( 188 | classify, timeout=timeout, stdin=subprocess.DEVNULL) 189 | classify_return = 0 190 | except subprocess.CalledProcessError as e: 191 | classify_output = e.output 192 | classify_return = e.returncode 193 | if classify_output and classify_output not in seen_output: 194 | shrinker.debug( 195 | 'New classification: %r' % (classify_output,) 196 | ) 197 | seen_output.add(classify_output) 198 | return ':%d:%d:%s:' % ( 199 | result, classify_return, 200 | hashlib.sha1(classify_output).hexdigest()[:8] 201 | if classify_output else '.' 
202 | ) 203 | 204 | timeout *= 10 205 | if timeout <= 0: 206 | timeout = None 207 | 208 | if preprocess: 209 | def preprocessor(string): 210 | sp = subprocess.Popen( 211 | preprocess, stdin=subprocess.PIPE, 212 | stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, 213 | universal_newlines=False, 214 | preexec_fn=os.setsid, 215 | ) 216 | try: 217 | out, _ = sp.communicate(string, timeout=timeout) 218 | assert isinstance(out, bytes) 219 | return out 220 | except subprocess.TimeoutExpired: 221 | shrinker.debug('Timed out while calling preprocessor') 222 | return None 223 | except subprocess.CalledProcessError: 224 | shrinker.debug('Error while calling preprocessor') 225 | return None 226 | finally: 227 | interrupt_wait_and_kill(sp) 228 | else: 229 | preprocessor = None 230 | 231 | if filename == '-': 232 | initial = sys.stdin.buffer.read() 233 | else: 234 | with open(filename, 'rb') as o: 235 | initial = o.read() 236 | 237 | if debug: 238 | volume = Volume.debug 239 | elif quiet: 240 | volume = Volume.quiet 241 | else: 242 | volume = Volume.normal 243 | 244 | def suffixed_name(status): 245 | if filename == '-': 246 | base = '' 247 | ext = 'example' 248 | else: 249 | *base, ext = os.path.basename(filename).split(os.extsep, 1) 250 | base = os.extsep.join(base) 251 | if base: 252 | return os.path.extsep.join(((base, '%s' % (status,), ext))) 253 | else: 254 | return os.path.extsep.join(((ext, '%s' % (status,)))) 255 | 256 | def shrink_callback(string, status): 257 | with open(os.path.join(shrinks, suffixed_name(status)), 'wb') as o: 258 | o.write(string) 259 | with open( 260 | os.path.join(history, suffixed_name( 261 | '%d-%s' % (len(string), hashlib.sha1(string).hexdigest()[:12]) 262 | )), 'wb' 263 | ) as o: 264 | o.write(string) 265 | shrinker = Shrinker( 266 | initial, classify_data, volume=volume, 267 | shrink_callback=shrink_callback, printer=click.echo, 268 | preprocess=preprocessor, principal_only=principal, 269 | passes=passes or None, 270 | ) 271 | initial_label = shrinker.classify(initial) 272 | # Go through the old shrunk files. This both reintegrates them into our 273 | # current shrink state so we can resume and also lets us clear out old bad 274 | # examples. 
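    # Note: shrinker.classify() records any new smallest example per label as
    # a side effect, so the bare classify() calls below are all that is needed
    # to fold these old files back into the shrinker's state.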
275 | try: 276 | for f in os.listdir(shrinks): 277 | path = os.path.join(shrinks, f) 278 | if not os.path.isfile(path): 279 | continue 280 | with open(path, 'rb') as i: 281 | contents = i.read() 282 | status = shrinker.classify(contents) 283 | if suffixed_name(status) != f: 284 | shrinker.debug('Clearing out defunct %r file' % (f,)) 285 | os.unlink(path) 286 | else: 287 | shrinker.debug( 288 | 'Reusing previous %d byte example for label %r' % ( 289 | len(contents), status 290 | )) 291 | for f in os.listdir(history): 292 | path = os.path.join(history, f) 293 | if not os.path.isfile(path): 294 | continue 295 | with open(path, 'rb') as i: 296 | contents = i.read() 297 | if principal and len(contents) > len(initial): 298 | continue 299 | shrinker.classify(contents) 300 | 301 | for filepath in filenames: 302 | with open(filepath, 'rb') as i: 303 | value = i.read() 304 | shrinker.classify(value) 305 | 306 | if timeout is not None: 307 | timeout //= 10 308 | shrinker.shrink() 309 | finally: 310 | if filename != '-': 311 | os.rename(filename, backup) 312 | with open(filename, 'wb') as o: 313 | o.write(shrinker.best[initial_label]) 314 | else: 315 | sys.stdout.buffer.write(shrinker.best[initial_label]) 316 | 317 | 318 | if __name__ == '__main__': 319 | shrinker() 320 | -------------------------------------------------------------------------------- /src/structureshrink/shrinker.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | from collections import OrderedDict, Counter 3 | from enum import IntEnum 4 | 5 | 6 | class Volume(IntEnum): 7 | quiet = 0 8 | normal = 1 9 | debug = 2 10 | 11 | 12 | def sort_key(s): 13 | return (len(s), s) 14 | 15 | 16 | def cache_key(s): 17 | if len(s) < 20: 18 | return s 19 | return hashlib.sha1(s).digest() 20 | 21 | 22 | ALPHABET = [bytes([b]) for b in range(256)] 23 | 24 | 25 | class Shrinker(object): 26 | 27 | def __init__( 28 | self, 29 | initial, classify, *, 30 | preprocess=None, shrink_callback=None, printer=None, 31 | volume=Volume.quiet, principal_only=False, 32 | passes=None 33 | ): 34 | self.__interesting_ngrams = set() 35 | self.__shrink_callback = shrink_callback or (lambda s, r: None) 36 | self.__printer = printer or (lambda s: None) 37 | self.__inital = initial 38 | self.__classify = classify 39 | self.__preprocess = preprocess or (lambda s: s) 40 | self.__volume = volume 41 | 42 | self.__cache = {} 43 | self.__preprocess_cache = {} 44 | self.__best = OrderedDict() 45 | self.shrinks = 0 46 | preprocessed = self.__preprocess(initial) 47 | if preprocessed is None: 48 | raise ValueError('Initial example is rejected by preprocessing') 49 | label = self.classify(preprocessed) 50 | self.output('Initial example: %s, labelled %r' % (( 51 | '%d bytes ' % (len(initial),) 52 | if initial == preprocessed 53 | else '%d bytes (%d preprocessed)' % ( 54 | len(initial), len(preprocessed))), 55 | label)) 56 | self.__initial_label = label 57 | self.principal_only = principal_only 58 | self.passes = passes 59 | 60 | def pass_enabled(self, pass_name): 61 | if self.passes is None or pass_name in self.passes: 62 | self.output('Running pass %r' % (pass_name,)) 63 | return True 64 | self.debug('Skipping pass %r' % (pass_name,)) 65 | return False 66 | 67 | def output(self, text): 68 | if self.__volume >= Volume.normal: 69 | self.__printer(text) 70 | 71 | def debug(self, text): 72 | if self.__volume >= Volume.debug: 73 | self.__printer(text) 74 | 75 | @property 76 | def best(self): 77 | return self.__best 78 | 79 | def 
classify(self, string): 80 | key = cache_key(string) 81 | try: 82 | return self.__cache[key] 83 | except KeyError: 84 | pass 85 | 86 | keys = [key] 87 | 88 | preprocessed = self.__preprocess(string) 89 | if preprocessed is None: 90 | result = None 91 | else: 92 | string = preprocessed 93 | preprocess_key = cache_key(preprocessed) 94 | keys.append(preprocess_key) 95 | try: 96 | result = self.__cache[preprocess_key] 97 | except KeyError: 98 | result = self.__classify(preprocessed) 99 | if ( 100 | result not in self.best or 101 | sort_key(string) < sort_key(self.best[result]) 102 | ): 103 | self.shrinks += 1 104 | if self.best: 105 | if result not in self.best: 106 | self.output(( 107 | 'Shrink %d: Discovered new label %r' 108 | ' with %d bytes') % ( 109 | self.shrinks, result, len(string))) 110 | else: 111 | deletes = len(self.best[result]) - len(string) 112 | if deletes == 0: 113 | shrink_message = 'lowered %d' % ( 114 | len([1 for u, v in zip( 115 | string, self.best[result]) if u < v]),) 116 | else: 117 | shrink_message = 'deleted %d' % (deletes,) 118 | 119 | self.output( 120 | 'Shrink %d: Label %r now %d bytes (%s)' % ( 121 | self.shrinks, result, len(string), 122 | shrink_message)) 123 | self.__shrink_callback(string, result) 124 | self.__best[result] = string 125 | for k in keys: 126 | self.__cache[k] = result 127 | return result 128 | 129 | def __suitable_ngrams(self, label): 130 | self.debug('Calculating ngrams for %r' % (label,)) 131 | found_ngrams = ngrams(self.best[label]) 132 | self.debug('Found %d ngrams' % len(found_ngrams),) 133 | return found_ngrams 134 | 135 | def bracket_shrink(self, string, criterion, threshold=1.0): 136 | prev = None 137 | while prev != string: 138 | prev = string 139 | for l, r in detect_possible_brackets(string): 140 | intervals = intervals_for_brackets(string, l, r) 141 | if intervals is None: 142 | continue 143 | intervals.sort( 144 | key=lambda x: (x[0] - x[1], x[0])) 145 | self.debug('Shrinking for bracketed pair %r, %r' % ( 146 | bytes([l]), bytes([r]) 147 | )) 148 | changed = True 149 | while changed: 150 | changed = False 151 | i = 0 152 | while i < len(intervals): 153 | u, v = intervals[i] 154 | for t in [ 155 | string[:u] + string[v:], 156 | string[:u + 1] + string[v - 1:], 157 | string[:u] + string[u + 1:v - 1] + string[v:], 158 | ]: 159 | if ( 160 | len(t) < len(string) * threshold and 161 | criterion(t) 162 | ): 163 | string = t 164 | intervals = intervals_for_brackets( 165 | string, l, r) 166 | changed = True 167 | break 168 | else: 169 | i += 1 170 | if intervals is None: 171 | break 172 | return string 173 | 174 | def delete_characters(self, string, criterion): 175 | counts = Counter(string) 176 | for c in sorted(range(256), key=counts.__getitem__, reverse=True): 177 | if c not in string: 178 | continue 179 | c = bytes([c]) 180 | t = string.replace(c, b'') 181 | if criterion(t): 182 | self.debug('Removed %r' % (c,)) 183 | string = t 184 | 185 | def partition_charwise(self, string, criterion): 186 | counts = Counter(string) 187 | alphabet = sorted(counts) 188 | for c in sorted(alphabet, key=lambda s: (counts[s], s)): 189 | if c not in string: 190 | continue 191 | compressed = bytearray() 192 | seen_c = False 193 | for b in string: 194 | if b == c: 195 | if not seen_c: 196 | seen_c = True 197 | compressed.append(b) 198 | else: 199 | seen_c = False 200 | compressed.append(b) 201 | compressed = bytes(compressed) 202 | if compressed != string: 203 | self.debug('Compressing runs of %r' % (bytes([c]),)) 204 | if criterion(compressed): 205 | 
string = compressed 206 | c = bytes([c]) 207 | 208 | partition = string.split(c) 209 | if len(partition) <= 1: 210 | continue 211 | self.debug('Partition by %r into %d parts' % (c, len(partition))) 212 | shrunk = _ddmin(partition, lambda ls: criterion(c.join(ls))) 213 | if len(shrunk) < len(partition): 214 | self.debug('Removed %d parts' % ( 215 | len(partition) - len(shrunk),)) 216 | t = b''.join(shrunk) 217 | if criterion(t): 218 | self.debug('Removed %r entirely' % (c,)) 219 | string = t 220 | else: 221 | smaller = {bytes([d]) for d in alphabet if d < c[0]} 222 | for d in sorted(smaller): 223 | t = d.join(shrunk) 224 | if criterion(t): 225 | self.debug('Replaced %r with %r' % (c, d)) 226 | string = t 227 | break 228 | else: 229 | string = c.join(shrunk) 230 | 231 | return string 232 | 233 | def calculate_partition(self, string, l, r, level): 234 | labels = [] 235 | count = 0 236 | bad = False 237 | for c in string: 238 | if c == l: 239 | count += 1 240 | elif c == r: 241 | count -= 1 242 | if count < 0: 243 | bad = True 244 | break 245 | labels.append(count >= level) 246 | if bad: 247 | return None 248 | if count != 0: 249 | return None 250 | if True not in labels: 251 | return None 252 | assert len(labels) == len(string) 253 | prev_label = None 254 | current = bytearray() 255 | partition = [] 256 | for c, label in zip(string, labels): 257 | if label != prev_label: 258 | if current: 259 | partition.append(bytes(current)) 260 | current.clear() 261 | current.append(c) 262 | prev_label = label 263 | else: 264 | current.append(c) 265 | if current: 266 | partition.append(bytes(current)) 267 | assert b''.join(partition) == string 268 | assert b'' not in partition 269 | return partition 270 | 271 | def bracket_partition(self, string, criterion): 272 | level = 1 273 | while True: 274 | initial = string 275 | brackets = list(detect_possible_brackets(string)) 276 | partitions = [] 277 | for l, r in brackets: 278 | partition = self.calculate_partition(string, l, r, level) 279 | if partition is not None: 280 | partitions.append((l, r, len(partition))) 281 | 282 | partitions.sort(key=lambda x: x[-1]) 283 | any_partitions = False 284 | for l, r, _ in partitions: 285 | partition = self.calculate_partition(string, l, r, level) 286 | if partition is None: 287 | continue 288 | any_partitions = True 289 | self.debug( 290 | 'Partitioning by bracket %r at level %d into %d pieces' % ( 291 | bytes([l, r]), level, len(partition))) 292 | string = b''.join(_ddmin( 293 | partition, lambda ls: criterion(b''.join(ls)) 294 | )) 295 | if not any_partitions: 296 | break 297 | if string == initial: 298 | level += 1 299 | 300 | def shrink(self): 301 | prev = -1 302 | while prev != self.shrinks: 303 | assert self.shrinks > prev 304 | prev = self.shrinks 305 | options = list(self.best.items()) 306 | # Always prefer the label we started with, because that's the one 307 | # the user is most likely to be interested in. Amongst the rest, 308 | # go for the one that is currently most complicated. 
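        # (Two stable sorts: the second sort provides the primary key, so
        # entries for the initial label come first, while each group keeps the
        # descending "most complicated first" order from the first sort.)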
309 | options.sort(key=lambda lr: sort_key(lr[1]), reverse=True) 310 | options.sort(key=lambda lr: lr[0] != self.__initial_label) 311 | for label, current in options: 312 | if not current: 313 | continue 314 | if self.principal_only and self.__initial_label != label: 315 | continue 316 | if self.classify(b'') == label: 317 | continue 318 | self.output( 319 | 'Shrinking for label %r from %d bytes (%d distinct)' % ( 320 | label, len(current), len(set(current)))) 321 | 322 | if self.pass_enabled('split'): 323 | lo = 0 324 | hi = len(current) 325 | while lo + 1 < hi: 326 | mid = (lo + hi) // 2 327 | if self.classify(current[:mid]) == label: 328 | hi = mid 329 | else: 330 | lo = mid 331 | 332 | initial_shrinks = self.shrinks 333 | 334 | def criterion(string): 335 | return self.classify(string) == label 336 | 337 | if self.pass_enabled('brackets'): 338 | self.debug('Minimizing bracketwise') 339 | self.bracket_partition(self.best[label], criterion) 340 | self.bracket_shrink( 341 | self.best[label], lambda c: self.classify(c) == label 342 | ) 343 | 344 | if initial_shrinks != self.shrinks: 345 | continue 346 | 347 | if self.pass_enabled('charwise'): 348 | self.debug('Minimizing by partition') 349 | self.partition_charwise(self.best[label], criterion) 350 | 351 | if initial_shrinks != self.shrinks: 352 | continue 353 | 354 | if self.pass_enabled('bytewise'): 355 | self.debug('Minimizing by bytes') 356 | _bytemin( 357 | self.best[label], lambda b: self.classify(b) == label) 358 | 359 | if initial_shrinks != self.shrinks: 360 | continue 361 | 362 | if self.pass_enabled('ngrams'): 363 | for ngram in self.__suitable_ngrams(label): 364 | if len(ngram) <= 1: 365 | continue 366 | initial = self.best[label].split(ngram) 367 | if len(initial) <= 1: 368 | continue 369 | self.debug('Partitioning by ngram %r' % ( 370 | ngram,)) 371 | shrunk = _ddmin(initial, lambda ls: criterion( 372 | ngram.join(ls))) 373 | if len(shrunk) <= 1: 374 | continue 375 | self.debug('Attempting to minimize ngram %r' % ( 376 | ngram,)) 377 | result = _bytemin( 378 | ngram, lambda ng: criterion(ng.join(shrunk))) 379 | if ngram != result: 380 | self.debug('Minimized ngram %r to %r' % ( 381 | ngram, result)) 382 | 383 | if initial_shrinks != self.shrinks: 384 | continue 385 | 386 | 387 | def ngrams(string): 388 | assert isinstance(string, bytes) 389 | grams_to_indices = {b'': range(len(string))} 390 | ngrams = set() 391 | ngram_counts = Counter() 392 | c = 0 393 | while grams_to_indices: 394 | new_grams_to_indices = {} 395 | for ng, ls in grams_to_indices.items(): 396 | assert len(ng) == c 397 | if len(ls) >= 2: 398 | if ng: 399 | ngrams.add(ng) 400 | ngram_counts[ng] = len(ls) 401 | seen = set() 402 | for i in ls: 403 | g = string[i:i + len(ng) + 1] 404 | seen.add(g) 405 | if len(g) == c + 1: 406 | new_grams_to_indices.setdefault(g, []).append(i) 407 | c += 1 408 | grams_to_indices = new_grams_to_indices 409 | for ngram in sorted(ngrams, key=len, reverse=True): 410 | for t in [ngram[:-1], ngram[1:]]: 411 | if ngram_counts[t] == ngram_counts[ngram]: 412 | ngrams.discard(t) 413 | return sorted(ngrams, key=len, reverse=True) 414 | 415 | 416 | def score(splitter, string): 417 | # Lower is better. 418 | bits = string.split(splitter) 419 | if not bits: 420 | return (0, 0) 421 | else: 422 | return (-min(map(len, bits)), len(bits)) 423 | 424 | 425 | def _smallmin(string, classify): 426 | assert len(string) <= 2 427 | # A bunch of small example optimizations. They're mostly not 428 | # hit but can be a huge time saver when they are. 
429 | if len(string) <= 2: 430 | for a in ALPHABET: 431 | if classify(a): 432 | return a 433 | assert len(string) == 2 434 | for a in ALPHABET: 435 | for b in ALPHABET: 436 | c = a + b 437 | if c >= string: 438 | break 439 | if classify(c): 440 | return c 441 | 442 | 443 | def _bytemin(string, criterion): 444 | return bytes(_ddmin(list(string), lambda ls: criterion(bytes(ls)))) 445 | 446 | SMALL = 2 447 | 448 | 449 | def _ddmin(ls, criterion): 450 | if not criterion(ls): 451 | raise ValueError('Initial example does not satisfy condition') 452 | if criterion([]): 453 | return [] 454 | k = len(ls) // 2 455 | while k > 0: 456 | i = 0 457 | while k < len(ls) and i + k <= len(ls): 458 | s = ls[i:i + k] 459 | assert len(s) < len(ls) 460 | if criterion(s): 461 | ls = s 462 | else: 463 | s = ls[:i] + ls[i + k:] 464 | assert len(s) + k == len(ls) 465 | if criterion(s): 466 | ls = s 467 | else: 468 | if k <= SMALL: 469 | i += 1 470 | else: 471 | i += k 472 | if k <= SMALL: 473 | k -= 1 474 | elif k <= 2 * SMALL: 475 | k = SMALL 476 | else: 477 | k //= 2 478 | return ls 479 | 480 | 481 | def shrink(*args, **kwargs): 482 | """Attempt to find a minimal version of initial that satisfies classify.""" 483 | shrinker = Shrinker(*args, **kwargs) 484 | shrinker.shrink() 485 | return shrinker.best 486 | 487 | 488 | def intervals_for_brackets(string, l, r): 489 | intervals = [] 490 | stack = [] 491 | for i, c in enumerate(string): 492 | if c == l: 493 | stack.append(i) 494 | elif c == r: 495 | if stack: 496 | intervals.append((stack.pop(), i + 1)) 497 | else: 498 | return None 499 | if stack: 500 | return None 501 | return intervals 502 | 503 | 504 | def detect_possible_brackets(string): 505 | counts = Counter(string) 506 | reverse_counts = {} 507 | for v, n in counts.items(): 508 | if n > 1: 509 | reverse_counts.setdefault(n, []).append(v) 510 | return sorted([ 511 | (a, b) 512 | for ls in reverse_counts.values() 513 | for a in ls 514 | for b in ls 515 | if string.index(a) < string.index(b) 516 | ], key=lambda x: counts[x[0]], reverse=True) 517 | -------------------------------------------------------------------------------- /tests/test_shrinking.py: -------------------------------------------------------------------------------- 1 | from structureshrink import shrink 2 | from hypothesis import given, strategies as st 3 | import hashlib 4 | 5 | 6 | @given(st.binary(), st.random_module()) 7 | def test_partition_by_length(b, _): 8 | shrunk = shrink(b, len) 9 | assert len(shrunk) == len(b) + 1 10 | 11 | 12 | @given( 13 | st.lists(st.binary(min_size=1, max_size=4), min_size=1, max_size=5), 14 | st.random_module() 15 | ) 16 | def test_shrink_to_any_substring(ls, _): 17 | shrunk = shrink( 18 | b''.join(ls), lambda x: sum(l in x for l in ls) 19 | ) 20 | assert len(shrunk) >= len(ls) 21 | 22 | 23 | def test_partition_by_last_byte(): 24 | seed = b''.join(bytes([i, j]) for i in range(256) for j in range(256)) 25 | shrunk = shrink( 26 | seed, lambda s: hashlib.sha1(s).digest()[-1] & 127 27 | ) 28 | assert len(shrunk) == 128 29 | --------------------------------------------------------------------------------