├── .gitignore ├── LICENSE ├── Pipfile ├── Pipfile.lock ├── example.txt ├── readme.md ├── skjul.csv └── skjul.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- 
/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Mike Pedersen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | name = "pypi" 3 | url = "https://pypi.org/simple" 4 | verify_ssl = true 5 | 6 | [dev-packages] 7 | 8 | [packages] 9 | numpy = "*" 10 | scikit-learn = "*" 11 | docopt = "*" 12 | schema = "*" 13 | 14 | [requires] 15 | python_version = "3.6" 16 | -------------------------------------------------------------------------------- /Pipfile.lock: -------------------------------------------------------------------------------- 1 | { 2 | "_meta": { 3 | "hash": { 4 | "sha256": "78f76cab2ce6f9a365c12dd5f7daaffc18c531b141e858d2d864e01a0d34247b" 5 | }, 6 | "pipfile-spec": 6, 7 | "requires": { 8 | "python_version": "3.6" 9 | }, 10 | "sources": [ 11 | { 12 | "name": "pypi", 13 | "url": "https://pypi.org/simple", 14 | "verify_ssl": true 15 | } 16 | ] 17 | }, 18 | "default": { 19 | "contextlib2": { 20 | "hashes": [ 21 | "sha256:509f9419ee91cdd00ba34443217d5ca51f5a364a404e1dce9e8979cea969ca48", 22 | "sha256:f5260a6e679d2ff42ec91ec5252f4eeffdcf21053db9113bd0a8e4d953769c00" 23 | ], 24 | "version": "==0.5.5" 25 | }, 26 | "docopt": { 27 | "hashes": [ 28 | "sha256:49b3a825280bd66b3aa83585ef59c4a8c82f2c8a522dbe754a8bc8d08c85c491" 29 | ], 30 | "index": "pypi", 31 | "version": "==0.6.2" 32 | }, 33 | "numpy": { 34 | "hashes": [ 35 | "sha256:1980f8d84548d74921685f68096911585fee393975f53797614b34d4f409b6da", 36 | "sha256:22752cd809272671b273bb86df0f505f505a12368a3a5fc0aa811c7ece4dfd5c", 37 | "sha256:23cc40313036cffd5d1873ef3ce2e949bdee0646c5d6f375bf7ee4f368db2511", 38 | "sha256:2b0b118ff547fecabc247a2668f48f48b3b1f7d63676ebc5be7352a5fd9e85a5", 39 | "sha256:3a0bd1edf64f6a911427b608a894111f9fcdb25284f724016f34a84c9a3a6ea9", 40 | "sha256:3f25f6c7b0d000017e5ac55977a3999b0b1a74491eacb3c1aa716f0e01f6dcd1", 41 | "sha256:4061c79ac2230594a7419151028e808239450e676c39e58302ad296232e3c2e8", 42 | 
"sha256:560ceaa24f971ab37dede7ba030fc5d8fa173305d94365f814d9523ffd5d5916", 43 | "sha256:62be044cd58da2a947b7e7b2252a10b42920df9520fc3d39f5c4c70d5460b8ba", 44 | "sha256:6c692e3879dde0b67a9dc78f9bfb6f61c666b4562fd8619632d7043fb5b691b0", 45 | "sha256:6f65e37b5a331df950ef6ff03bd4136b3c0bbcf44d4b8e99135d68a537711b5a", 46 | "sha256:7a78cc4ddb253a55971115f8320a7ce28fd23a065fc33166d601f51760eecfa9", 47 | "sha256:80a41edf64a3626e729a62df7dd278474fc1726836552b67a8c6396fd7e86760", 48 | "sha256:893f4d75255f25a7b8516feb5766c6b63c54780323b9bd4bc51cdd7efc943c73", 49 | "sha256:972ea92f9c1b54cc1c1a3d8508e326c0114aaf0f34996772a30f3f52b73b942f", 50 | "sha256:9f1d4865436f794accdabadc57a8395bd3faa755449b4f65b88b7df65ae05f89", 51 | "sha256:9f4cd7832b35e736b739be03b55875706c8c3e5fe334a06210f1a61e5c2c8ca5", 52 | "sha256:adab43bf657488300d3aeeb8030d7f024fcc86e3a9b8848741ea2ea903e56610", 53 | "sha256:bd2834d496ba9b1bdda3a6cf3de4dc0d4a0e7be306335940402ec95132ad063d", 54 | "sha256:d20c0360940f30003a23c0adae2fe50a0a04f3e48dc05c298493b51fd6280197", 55 | "sha256:d3b3ed87061d2314ff3659bb73896e622252da52558f2380f12c421fbdee3d89", 56 | "sha256:dc235bf29a406dfda5790d01b998a1c01d7d37f449128c0b1b7d1c89a84fae8b", 57 | "sha256:fb3c83554f39f48f3fa3123b9c24aecf681b1c289f9334f8215c1d3c8e2f6e5b" 58 | ], 59 | "index": "pypi", 60 | "version": "==1.16.2" 61 | }, 62 | "schema": { 63 | "hashes": [ 64 | "sha256:44add3ef9016c85ac4b0291b45286a657d0df309b31528ca8d0a9c6d0aa68186", 65 | "sha256:5b0e0f47923164190513db2e91b9ab1941162b2dc400cc9b1803c2abab579e62" 66 | ], 67 | "index": "pypi", 68 | "version": "==0.7.0" 69 | }, 70 | "scikit-learn": { 71 | "hashes": [ 72 | "sha256:018f470a7e685767d84ce6fac87af59e064e87ec3cea71eaf12646f9538e293d", 73 | "sha256:0ae00d570331b8a5c552f721167818b4739a5c855fbc76b11231ccdea2dd26ab", 74 | "sha256:13079520dd8211967d1871e439b59818d335439672818e9683847091d0e07778", 75 | "sha256:1c133749a526b33af2b6695d94d2cc43ba212c5aa7bd3a45619335556ced7637", 76 | 
"sha256:382e7053567b7b11e862782e3de2940e2141be24e6262aa0b4a9cb7fdd61f85a", 77 | "sha256:384df81fdba12d21063072f2cf472a7a8425a3d4fa3915faef0a88e94e07b332", 78 | "sha256:4705073de7bbcc6b9cd2f24dc9189aa8d3935e8621d3e65546c4b7fee9a042bf", 79 | "sha256:4f829d6c09b997e1d0a998f970cf3ff82cd6796d56148c63c29174367878d490", 80 | "sha256:51a933224b1b11986d4c7c123e5b28eb69602899d0179e6888b7abf2ffc85265", 81 | "sha256:63ad98c6512b52aebde9bd806ec1127e13e2a8d42a00ebdf805153819f7c2cad", 82 | "sha256:67e15514c9df4c5354b3ecc89451f5baa0f1b62c7ed68f4d20febf9c9d9e17a6", 83 | "sha256:75f0e0e93851b30639baabfc1a4433aabc57eef269d55ee4c6f649fb60686218", 84 | "sha256:89609708e819342dd5c94617fd53a36187d7d6a80435ddb282f6a60b058dbe77", 85 | "sha256:8ca274d4e91685e4547af718b6f1e9a9d4912c7a6dcb0c68925de84f81a09d2a", 86 | "sha256:9987f3d31efc427ebf9926f703e5171552cfb3b6935f880e4f0d3a17b7f91540", 87 | "sha256:9f3e08dbd3f2f574913faba9b48d3c24a43fcc0eb14a0e962431005434b9cfe6", 88 | "sha256:a7a403bcea250cac37971058fca0c30b0144737a375f99d3855e5e7a34c43348", 89 | "sha256:ad7e4e823db1271d344e0c3ce0988b2e0fecc49079eec9c818d866c38b2824bd", 90 | "sha256:b1e9037a582e650d866324a50d2741724ea5f6c175200bef0b549d014898035a", 91 | "sha256:b82fbd8843ead2640158b2c0946d354b66f3d49472e6790d70c4ceec35663b3f", 92 | "sha256:b91c82bfd25145d428de99429de97d7a1c2c2658c212689fe2839b29a5251159", 93 | "sha256:ba57b73ec7074f60bb85f953296df437784d560553d0cc04b253c43f1846ccad", 94 | "sha256:c503802a81de18b8b4d40d069f5e363795ee44b1605f38bc104160ca3bfe2c41", 95 | "sha256:d30e8e0dffbc299533f47044fec26c5087473cb29cf51f1995986ac8354c7b4c", 96 | "sha256:d89b810bfb0e16a0de7f18773849bdf83dd7fd0614ae5225e5a9214cdb9be245", 97 | "sha256:e22e1d47def2944ad7a12c09452de085587ec5baad2174683e56a42b6918a76f", 98 | "sha256:f650ddc023c95681fccd5e297820f35de039e008265040c08188be95b3275a0f", 99 | "sha256:f7d4b3885ad1a7a6f07719ab6b1790d9892d6d41d973e8d4543a93bb15226fb4" 100 | ], 101 | "index": "pypi", 102 | "version": "==0.20.3" 103 | }, 104 | "scipy": { 
105 | "hashes": [ 106 | "sha256:014cb900c003b5ac81a53f2403294e8ecf37aedc315b59a6b9370dce0aa7627a", 107 | "sha256:281a34da34a5e0de42d26aed692ab710141cad9d5d218b20643a9cb538ace976", 108 | "sha256:588f9cc4bfab04c45fbd19c1354b5ade377a8124d6151d511c83730a9b6b2338", 109 | "sha256:5a10661accd36b6e2e8855addcf3d675d6222006a15795420a39c040362def66", 110 | "sha256:628f60be272512ca1123524969649a8cb5ae8b31cca349f7c6f8903daf9034d7", 111 | "sha256:6dcc43a88e25b815c2dea1c6fac7339779fc988f5df8396e1de01610604a7c38", 112 | "sha256:70e37cec0ac0fe95c85b74ca4e0620169590fd5d3f44765f3c3a532cedb0e5fd", 113 | "sha256:7274735fb6fb5d67d3789ddec2cd53ed6362539b41aa6cc0d33a06c003aaa390", 114 | "sha256:78e12972e144da47326958ac40c2bd1c1cca908edc8b01c26a36f9ffd3dce466", 115 | "sha256:790cbd3c8d09f3a6d9c47c4558841e25bac34eb7a0864a9def8f26be0b8706af", 116 | "sha256:79792c8fe8e9d06ebc50fe23266522c8c89f20aa94ac8e80472917ecdce1e5ba", 117 | "sha256:865afedf35aaef6df6344bee0de391ee5e99d6e802950a237f9fb9b13e441f91", 118 | "sha256:870fd401ec7b64a895cff8e206ee16569158db00254b2f7157b4c9a5db72c722", 119 | "sha256:963815c226b29b0176d5e3d37fc9de46e2778ce4636a5a7af11a48122ef2577c", 120 | "sha256:9726791484f08e394af0b59eb80489ad94d0a53bbb58ab1837dcad4d58489863", 121 | "sha256:9de84a71bb7979aa8c089c4fb0ea0e2ed3917df3fb2a287a41aaea54bbad7f5d", 122 | "sha256:b2c324ddc5d6dbd3f13680ad16a29425841876a84a1de23a984236d1afff4fa6", 123 | "sha256:b86ae13c597fca087cb8c193870507c8916cefb21e52e1897da320b5a35075e5", 124 | "sha256:ba0488d4dbba2af5bf9596b849873102d612e49a118c512d9d302ceafa36e01a", 125 | "sha256:d78702af4102a3a4e23bb7372cec283e78f32f5573d92091aa6aaba870370fe1", 126 | "sha256:def0e5d681dd3eb562b059d355ae8bebe27f5cc455ab7c2b6655586b63d3a8ea", 127 | "sha256:e085d1babcb419bbe58e2e805ac61924dac4ca45a07c9fa081144739e500aa3c", 128 | "sha256:e2cfcbab37c082a5087aba5ff00209999053260441caadd4f0e8f4c2d6b72088", 129 | "sha256:e742f1f5dcaf222e8471c37ee3d1fd561568a16bb52e031c25674ff1cf9702d5", 130 | 
"sha256:f06819b028b8ef9010281e74c59cb35483933583043091ed6b261bb1540f11cc", 131 | "sha256:f15f2d60a11c306de7700ee9f65df7e9e463848dbea9c8051e293b704038da60", 132 | "sha256:f31338ee269d201abe76083a990905473987371ff6f3fdb76a3f9073a361cf37", 133 | "sha256:f6b88c8d302c3dac8dff7766955e38d670c82e0d79edfc7eae47d6bb2c186594" 134 | ], 135 | "version": "==1.2.1" 136 | } 137 | }, 138 | "develop": {} 139 | } 140 | -------------------------------------------------------------------------------- /example.txt: -------------------------------------------------------------------------------- 1 | ‘The Babel fish,’ said The Hitchhiker’s Guide to the Galaxy quietly, ‘is small, yellow, and leech-like, and probably the oddest thing in the Universe. It feeds on brainwave energy received not from its own carrier but from those around it. It absorbs all unconscious mental frequencies from this brainwave energy to nourish itself with. It then excretes into the mind of its carrier a telepathic matrix formed by combining the conscious thought frequencies with nerve signals picked up from the speech centres of the brain which has supplied them. The practical upshot of all this is that if you stick a Babel fish in your ear you can instantly understand anything said to you in any form of language. 2 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Skjul - Text-based steganography 2 | 3 | *Steganography* is the practice of inconspicuously hiding data (a secret) within 4 | some other data (a carrier). Often this is within images, where the lower bits 5 | can be used to store a secret message. While having few real uses, steganography 6 | can be a fun exercise in information theory. 7 | 8 | Skjul (Danish for *hide*, as in *to hide*), is a text-based steganography 9 | implementation. 
Given a carrier message, Skjul can encode a secret bitstring 10 | into it by slightly changing words - hopefully so little as to be imperceptible 11 | to an uninitiated reader. 12 | 13 | ## Example 14 | 15 | $ cat example.txt 16 | ‘The Babel fish,’ said The Hitchhiker’s Guide to the Galaxy quietly, ‘is 17 | small, yellow, and leech-like, and probably the oddest thing in the 18 | Universe. It feeds on brainwave energy received not from its own carrier but 19 | from those around it. It absorbs all unconscious mental frequencies from 20 | this brainwave energy to nourish itself with. It then excretes into the mind 21 | of its carrier a telepathic matrix formed by combining the conscious thought 22 | frequencies with nerve signals picked up from the speech centres of the 23 | brain which has supplied them. The practical upshot of all this is that if 24 | you stick a Babel fish in your ear you can instantly understand anything 25 | said to you in any form of language. 26 | 27 | $ cat example.txt | ./skjul.py encode '101010' | tee 'encoded.txt' 28 | ‘The Babel fish,’ said The Hitchhiker’s Guide to the Galaxy quietly, ‘is 29 | small, yellow, and leech-like, and possibly the oddest thing in the 30 | Universe. It feeds on brainwave energy recieved not to its own carrier but 31 | to those around it. It absorbs all unconscious mental frequencies from this 32 | brainwave energy to nourish itself with. It then excretes into the mind of 33 | its carrier a telepathic matrix formed by combining the conscious thought 34 | frequencies with nerve signals picked up from the speech centres of the 35 | brain which has supplied them. The practical upshot of all that is that if 36 | you stick a Babel fish in your ear you can instantly comprehend anything 37 | said from you in any form of language. 
38 | 39 | $ wdiff example.txt encoded.txt 40 | ‘The Babel fish,’ said The Hitchhiker’s Guide to the Galaxy quietly, ‘is 41 | small, yellow, and leech-like, and [-probably-] {+possibly+} the oddest 42 | thing in the Universe. It feeds on brainwave energy [-received-] 43 | {+recieved+} not [-from-] {+to+} its own carrier but [-from-] {+to+} those 44 | around it. It absorbs all unconscious mental frequencies from this brainwave 45 | energy to nourish itself with. It then excretes into the mind of its carrier 46 | a telepathic matrix formed by combining the conscious thought frequencies 47 | with nerve signals picked up from the speech centres of the brain which has 48 | supplied them. The practical upshot of all [-this-] {+that+} is that if you 49 | stick a Babel fish in your ear you can instantly [-understand-] 50 | {+comprehend+} anything said [-to-] {+from+} you in any form of language. 51 | 52 | $ cat encoded.txt | ./skjul.py decode 53 | 101010 54 | 55 | The secrets can only be bitstrings. 56 | 57 | ## How it works 58 | 59 | ### Word pairs 60 | 61 | Given a carrier string and a secret bitstring, the basic idea is to assign to each 62 | word in the carrier string a *paired word*. The secret message can then be 63 | encoded in our choice of word. To not have a noticeable difference, the paired 64 | word should be able to "work" in the same context as the original word, i.e. we 65 | wish to select words that are likely to share the same neighboring words. 66 | 67 | Word-vector models are a common way to model these *distributional properties* of 68 | words. In such a model, each word has a vector embedding of e.g. 300 dimensions. 69 | These embeddings are built such that words that tend to have similar contexts 70 | also tend to have similar embeddings. 71 | 72 | Using a word vector model, we pair each embedding with a neighbor using cosine 73 | distance as the metric. Note that these pairings must be exclusive, i.e. 
74 | `[(a,b), (a,c)]` is not valid because `a` participates in both pairs. Instead, 75 | we find the k-nearest neighbors for each word then greedily pair words based on 76 | the distance to the closest non-paired neighbor. This means that words are not 77 | always paired with their closest neighbor and some words are not paired at all. 78 | 79 | This repository includes a precomputed pair list based on 80 | [Facebook's fasttext vectors](https://fasttext.cc/docs/en/crawl-vectors.html). 81 | 82 | For example, the string "this is a test" has 3 words in the pair list: 83 | 84 | | 1 | 0 | Distance | 85 | |------|-------|------------| 86 | | this | that | 0.17533547 | 87 | | was | is | 0.28453428 | 88 | | test | tests | 0.20037645 | 89 | 90 | To encode a *k*-bit message, we simply pick the *k* tokens with lowest distance 91 | to their paired word and swap or not depending on the corresponding bit in the 92 | secret. E.g. to encode a single 1-bit, we change "this is a test" to "that is a 93 | test". 94 | 95 | ### Variable length coding 96 | 97 | The method as outlined above requires the person decoding to know the length of 98 | the secret. This makes it somewhat impractical and it would be better to encode 99 | the length as part of the message itself. To do this, we need a prefix-free 100 | encoding scheme, as we do not know the number of bits for the length beforehand. 101 | 102 | For this, we use 103 | [Elias gamma coding](https://en.wikipedia.org/wiki/Elias_gamma_coding). In gamma 104 | coding, we first encode the bit length of the integer in unary zero bits, followed by 105 | the binary digits of the integer itself. 106 | 107 | A downside of this is that it increases the length of the secret, especially for 108 | small secrets. This is due to how the number of bits in the length itself is 109 | comparatively more significant than it would be for a longer message. 
110 | 111 | ### A pinch of noise 112 | 113 | Lastly, we add XOR encryption to the secret using a pseudorandom number 114 | generator (PRNG). This breaks any predictable patterns that might be in the 115 | secret. For example, a secret of only zeros and a carrier that contains the same 116 | pair often would always pick the same word. This also makes it possible to 117 | specify a key by using the key as seed for the PRNG. 118 | 119 | We also add a small, optional amount of noise to each word pair distance. This 120 | makes the pairs chosen more varied, such that it is not always the minimum word 121 | that is chosen. 122 | -------------------------------------------------------------------------------- /skjul.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Text-based steganography. 3 | 4 | Usage: 5 | skjul.py process [] [] [--neighbors=] [--lines=] 6 | skjul.py encode [] [--key=] [--noise=] 7 | skjul.py decode [] [--key=] [--noise=] 8 | skjul.py --version 9 | 10 | Commands: 11 | process Build a pairs list from a fastText vector file. 12 | encode Encode a secret message. Carrier is read from standard 13 | input and output is written to standard output. 14 | decode Decode a secret messasge from standard input. 15 | 16 | All commands accept a path to a pair-list file. If this is not supplied, then 17 | 'skjul.csv' in the current working directory is used instead. 18 | 19 | Options: 20 | -h --help Show this screen. 21 | --version Show version. 22 | -n --lines= Number of lines to read from file [default: all]. 23 | -k --neighbors= Number of neighbors to find for each word [default: 10]. 24 | -k --key= Key to encode/decode message with [default: 0]. 25 | -x --noise= Noise fraction when selecting words [default: 0.025]. 
26 | """ 27 | 28 | import re 29 | import csv 30 | from enum import Enum 31 | import numpy as np 32 | from sklearn.neighbors import NearestNeighbors 33 | from docopt import docopt 34 | from schema import Schema, And, Or, Use, Regex 35 | import sys 36 | import itertools 37 | 38 | 39 | def _pairing(x, k=10, metric='cosine', stable=False): 40 | """ 41 | Pairs points such that points that are closer wrt. the given metric are 42 | more likely to be paired together. 43 | 44 | Args: 45 | x (array): A n by d matrix representing n vectors of length d. 46 | k (int): Number of neighbors to consider when pairing. 47 | metric (str): Metric to use for nearest neighbor. 48 | stable (bool): Whether to ensure exact output for the given input. 49 | 50 | See sklearn.neighbors.NearestNeighbors for all possible metrics. 51 | 52 | Returns: 53 | low: Lower indices of each pair 54 | hi: Higher indices of each pair 55 | dist: Distance between each point 56 | """ 57 | 58 | x = np.asarray(x) 59 | n = x.shape[0] 60 | 61 | edge_dist, edge_tgt = NearestNeighbors(n_neighbors=k, metric=metric) \ 62 | .fit(x).kneighbors() 63 | 64 | sorting = np.argsort(edge_dist, axis=None, 65 | kind='stable' if stable else None) 66 | 67 | edge_src = np.unravel_index(sorting, edge_dist.shape)[0] 68 | edge_tgt = np.ravel(edge_tgt)[sorting] 69 | edge_dist = np.ravel(edge_dist)[sorting] 70 | 71 | pairing = np.full([n], -1, dtype=np.int32) 72 | pairing_dist = np.zeros([n], dtype=edge_dist.dtype) 73 | 74 | for src, tgt, dist in np.nditer((edge_src, edge_tgt, edge_dist)): 75 | if pairing[src] == -1 and pairing[tgt] == -1: 76 | pairing[src] = tgt 77 | pairing[tgt] = src 78 | pairing_dist[src] = pairing_dist[tgt] = dist 79 | 80 | paired_indices = np.where(pairing != -1)[0] 81 | lo = paired_indices[pairing[paired_indices] > paired_indices] 82 | hi = pairing[lo] 83 | 84 | return lo, hi, pairing_dist[lo] 85 | 86 | 87 | def _gamma_encode(num): 88 | """ 89 | Encodes a positive number using Elias gamma coding. 
class Steganographer:
    """
    A class for hiding secret messages in ordinary text.

    The steganographer holds a table of mutually exclusive word pairs. One
    word of each pair stands for a set bit and the other for a cleared bit,
    so a secret is embedded by choosing, for selected words of the carrier,
    which member of the pair appears in the text.
    """

    # A token is a maximal run of word characters.
    TOKEN_REGEX = re.compile(r'\w+')

    @staticmethod
    def _select_words(words):
        """
        Decides which vocabulary rows are kept when building word pairs.

        A row is kept when the word is a single token and no other kept row
        has the same lowercased spelling. On such a collision a lowercase
        spelling replaces a non-lowercase one; otherwise the first row seen
        is kept.

        Args:
            words (sequence): The vocabulary, one string per row.

        Returns:
            array: A boolean mask with one entry per word.
        """
        valid = np.zeros([len(words)], bool)
        chosen = {}  # lowercased word -> index of the row currently kept

        for i, word in enumerate(words):
            if Steganographer.TOKEN_REGEX.fullmatch(word) is None:
                continue

            lower = word.lower()
            old = chosen.get(lower)

            if old is None:
                valid[i] = True
                chosen[lower] = i
            elif word.islower() and not words[old].islower():
                # Prefer the embedding of the lowercase spelling.
                valid[old] = False
                valid[i] = True
                chosen[lower] = i

        return valid

    @staticmethod
    def _next_mask_bit(rng):
        """
        Draws the next bit of the keyed XOR mask from the given generator.

        randint(2) yields 0 or 1; randint(1) would always yield 0 and leave
        the secret unmasked.
        """
        return bool(rng.randint(2))

    @staticmethod
    def from_embeddings(words, embeddings, k=5, metric='cosine'):
        """
        Creates a new steganographer from a list of words and corresponding
        embeddings.

        Args:
            words (list): A list of words of length n.
            embeddings (array): A n by d matrix of word embeddings.
            k (int): The number of neighbors to consider when pairing words.
            metric (str): The metric to use when pairing words.

        Returns:
            Steganographer: A new steganographer.
        """

        words = np.asarray(words)
        embeddings = np.asarray(embeddings)

        # Filter non-token words and lowercase all words. In case of a
        # collision, prefer embeddings from lowercase words. We assume that
        # these are more common and therefore more representative.
        valid = Steganographer._select_words(words)

        words = np.char.lower(words[valid])
        embeddings = embeddings[valid]

        left, right, dist = _pairing(embeddings, k, metric=metric, stable=True)
        return Steganographer(zip(words[left], words[right], dist))

    @staticmethod
    def load(file):
        """
        Loads a steganographer from a file-like object.

        The file is read as CSV with three columns per row: the two words of
        a pair and their distance. Blank lines are skipped.
        """

        def rows():
            for row in csv.reader(file):
                if not row:
                    continue  # csv.reader yields [] for a blank line
                a, b, dist = row
                yield a, b, float(dist)

        return Steganographer(rows())

    def __init__(self, pairs):
        """
        Builds the lookup table from an iterable of pairs.

        Args:
            pairs (iterable): Tuples of (left, right, dist). The left word
                stands for a set bit and the right word for a cleared bit.
        """
        # word -> (paired word, distance, bit value the word stands for)
        self.map = {}

        for left, right, dist in pairs:
            self.map[left] = (right, dist, True)
            self.map[right] = (left, dist, False)

    def save(self, file):
        """
        Saves the steganographer to a file-like object.

        Each pair is written once as a CSV row (set-bit word, cleared-bit
        word, distance), ordered by ascending distance.
        """

        pairs = ([a, b, dist]
                 for (a, (b, dist, value))
                 in self.map.items()
                 if value)

        csv.writer(file).writerows(sorted(pairs, key=lambda x: x[2]))

    def _tokenize(self, string):
        """
        Splits a string into the tokens that can carry a bit and the text
        between them.

        Returns:
            tokens: The lowercased tokens that occur in the pair table.
            caps: The capitalization of each token in the original string.
            intertokens: The text before each token, followed by the text
                after the last token (one entry more than tokens).
        """
        intertokens = []
        tokens = []
        caps = []

        last_end = 0

        for match in Steganographer.TOKEN_REGEX.finditer(string):
            # Words without a pair stay part of the surrounding text.
            if match.group().lower() in self.map:
                intertokens.append(string[last_end:match.start()])
                tokens.append(match.group().lower())
                caps.append(_Caps.from_word(match.group()))
                last_end = match.end()

        intertokens.append(string[last_end:])

        return tokens, caps, intertokens

    def encode(self, carrier, secret, key=0, noise=0):
        """
        Encodes a secret message into a carrier message.

        The secret is prefixed with its gamma-coded length, masked with a
        bit stream drawn from a generator seeded with the key, and written
        into the tokens in order of ascending (noised) pair distance.

        Note: the mask is drawn with randint(2). Text produced by earlier
        versions, whose mask was always zero, does not decode with this one.

        Args:
            carrier (str): A string to embed a secret message into.
            secret (list): A list of booleans to embed.
            key (int): A key to encode the message with.
            noise (float): The amount of noise to add.

        Returns:
            str: A message with the given secret embedded within it.

        Raises:
            ValueError: If the carrier has too few usable tokens.
        """
        tokens, caps, intertokens = self._tokenize(carrier)

        rng = np.random.RandomState(key)
        token_noise = rng.rand(len(tokens)) * noise

        secret = list(secret)
        payload = _gamma_encode(len(secret)) + secret

        if len(payload) > len(tokens):
            raise ValueError('Insufficient tokens for secret')

        dist = np.array([self.map[token][1] for token in tokens]) + token_noise

        for index, bit in zip(np.argsort(dist), payload):
            paired, _, value = self.map[tokens[index]]
            # Swap when the word's bit value differs from the masked bit.
            if (value != bool(bit)) != Steganographer._next_mask_bit(rng):
                tokens[index] = paired

        result = []
        for token, cap, intertoken in zip(tokens, caps, intertokens):
            result.append(intertoken)
            result.append(cap.apply(token))

        result.append(intertokens[-1])

        return ''.join(result)

    def decode(self, message, key=0, noise=0):
        """
        Extracts a secret message from a string.

        Args:
            message (str): A string from which to extract a secret message.
            key (int): The key the message was encoded with.
            noise (float): The amount of noise the message was encoded with.

        Returns:
            list: The decoded secret list of booleans.
        """
        tokens = self._tokenize(message)[0]

        rng = np.random.RandomState(key)
        token_noise = rng.rand(len(tokens)) * noise

        dist = np.array([self.map[token][1] for token in tokens]) + token_noise

        # Lazy on purpose: one mask bit is drawn per bit actually read, in
        # the same order in which encode drew them.
        bits = (self.map[tokens[index]][2] != Steganographer._next_mask_bit(rng)
                for index in np.argsort(dist))

        secret_len = _gamma_decode(bits)

        return list(itertools.islice(bits, secret_len))
= Steganographer.load(pairs) 376 | 377 | sys.stdout.write(st.encode(sys.stdin.read(), args[''], 378 | args['--key'], args['--noise'])) 379 | elif args['decode']: 380 | with open(args['']) as pairs: 381 | st = Steganographer.load(pairs) 382 | decoded = st.decode(sys.stdin.read(), args['--key'], args['--noise']) 383 | sys.stdout.write(''.join('1' if bit else '0' for bit in decoded )) 384 | 385 | 386 | if __name__ == "__main__": 387 | main() 388 | --------------------------------------------------------------------------------