├── .gitignore
├── LICENSE
├── Pipfile
├── Pipfile.lock
├── example.txt
├── readme.md
├── skjul.csv
└── skjul.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 Mike Pedersen
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Pipfile:
--------------------------------------------------------------------------------
 1 | [[source]]
 2 | name = "pypi"
 3 | url = "https://pypi.org/simple"
 4 | verify_ssl = true
 5 | 
 6 | [dev-packages]
 7 | 
 8 | [packages]
 9 | numpy = "*"
10 | scikit-learn = "*"
11 | docopt = "*"
12 | schema = "*"
13 | 
14 | [requires]
15 | python_version = "3.6"
16 | 


--------------------------------------------------------------------------------
/Pipfile.lock:
--------------------------------------------------------------------------------
  1 | {
  2 |     "_meta": {
  3 |         "hash": {
  4 |             "sha256": "78f76cab2ce6f9a365c12dd5f7daaffc18c531b141e858d2d864e01a0d34247b"
  5 |         },
  6 |         "pipfile-spec": 6,
  7 |         "requires": {
  8 |             "python_version": "3.6"
  9 |         },
 10 |         "sources": [
 11 |             {
 12 |                 "name": "pypi",
 13 |                 "url": "https://pypi.org/simple",
 14 |                 "verify_ssl": true
 15 |             }
 16 |         ]
 17 |     },
 18 |     "default": {
 19 |         "contextlib2": {
 20 |             "hashes": [
 21 |                 "sha256:509f9419ee91cdd00ba34443217d5ca51f5a364a404e1dce9e8979cea969ca48",
 22 |                 "sha256:f5260a6e679d2ff42ec91ec5252f4eeffdcf21053db9113bd0a8e4d953769c00"
 23 |             ],
 24 |             "version": "==0.5.5"
 25 |         },
 26 |         "docopt": {
 27 |             "hashes": [
 28 |                 "sha256:49b3a825280bd66b3aa83585ef59c4a8c82f2c8a522dbe754a8bc8d08c85c491"
 29 |             ],
 30 |             "index": "pypi",
 31 |             "version": "==0.6.2"
 32 |         },
 33 |         "numpy": {
 34 |             "hashes": [
 35 |                 "sha256:1980f8d84548d74921685f68096911585fee393975f53797614b34d4f409b6da",
 36 |                 "sha256:22752cd809272671b273bb86df0f505f505a12368a3a5fc0aa811c7ece4dfd5c",
 37 |                 "sha256:23cc40313036cffd5d1873ef3ce2e949bdee0646c5d6f375bf7ee4f368db2511",
 38 |                 "sha256:2b0b118ff547fecabc247a2668f48f48b3b1f7d63676ebc5be7352a5fd9e85a5",
 39 |                 "sha256:3a0bd1edf64f6a911427b608a894111f9fcdb25284f724016f34a84c9a3a6ea9",
 40 |                 "sha256:3f25f6c7b0d000017e5ac55977a3999b0b1a74491eacb3c1aa716f0e01f6dcd1",
 41 |                 "sha256:4061c79ac2230594a7419151028e808239450e676c39e58302ad296232e3c2e8",
 42 |                 "sha256:560ceaa24f971ab37dede7ba030fc5d8fa173305d94365f814d9523ffd5d5916",
 43 |                 "sha256:62be044cd58da2a947b7e7b2252a10b42920df9520fc3d39f5c4c70d5460b8ba",
 44 |                 "sha256:6c692e3879dde0b67a9dc78f9bfb6f61c666b4562fd8619632d7043fb5b691b0",
 45 |                 "sha256:6f65e37b5a331df950ef6ff03bd4136b3c0bbcf44d4b8e99135d68a537711b5a",
 46 |                 "sha256:7a78cc4ddb253a55971115f8320a7ce28fd23a065fc33166d601f51760eecfa9",
 47 |                 "sha256:80a41edf64a3626e729a62df7dd278474fc1726836552b67a8c6396fd7e86760",
 48 |                 "sha256:893f4d75255f25a7b8516feb5766c6b63c54780323b9bd4bc51cdd7efc943c73",
 49 |                 "sha256:972ea92f9c1b54cc1c1a3d8508e326c0114aaf0f34996772a30f3f52b73b942f",
 50 |                 "sha256:9f1d4865436f794accdabadc57a8395bd3faa755449b4f65b88b7df65ae05f89",
 51 |                 "sha256:9f4cd7832b35e736b739be03b55875706c8c3e5fe334a06210f1a61e5c2c8ca5",
 52 |                 "sha256:adab43bf657488300d3aeeb8030d7f024fcc86e3a9b8848741ea2ea903e56610",
 53 |                 "sha256:bd2834d496ba9b1bdda3a6cf3de4dc0d4a0e7be306335940402ec95132ad063d",
 54 |                 "sha256:d20c0360940f30003a23c0adae2fe50a0a04f3e48dc05c298493b51fd6280197",
 55 |                 "sha256:d3b3ed87061d2314ff3659bb73896e622252da52558f2380f12c421fbdee3d89",
 56 |                 "sha256:dc235bf29a406dfda5790d01b998a1c01d7d37f449128c0b1b7d1c89a84fae8b",
 57 |                 "sha256:fb3c83554f39f48f3fa3123b9c24aecf681b1c289f9334f8215c1d3c8e2f6e5b"
 58 |             ],
 59 |             "index": "pypi",
 60 |             "version": "==1.16.2"
 61 |         },
 62 |         "schema": {
 63 |             "hashes": [
 64 |                 "sha256:44add3ef9016c85ac4b0291b45286a657d0df309b31528ca8d0a9c6d0aa68186",
 65 |                 "sha256:5b0e0f47923164190513db2e91b9ab1941162b2dc400cc9b1803c2abab579e62"
 66 |             ],
 67 |             "index": "pypi",
 68 |             "version": "==0.7.0"
 69 |         },
 70 |         "scikit-learn": {
 71 |             "hashes": [
 72 |                 "sha256:018f470a7e685767d84ce6fac87af59e064e87ec3cea71eaf12646f9538e293d",
 73 |                 "sha256:0ae00d570331b8a5c552f721167818b4739a5c855fbc76b11231ccdea2dd26ab",
 74 |                 "sha256:13079520dd8211967d1871e439b59818d335439672818e9683847091d0e07778",
 75 |                 "sha256:1c133749a526b33af2b6695d94d2cc43ba212c5aa7bd3a45619335556ced7637",
 76 |                 "sha256:382e7053567b7b11e862782e3de2940e2141be24e6262aa0b4a9cb7fdd61f85a",
 77 |                 "sha256:384df81fdba12d21063072f2cf472a7a8425a3d4fa3915faef0a88e94e07b332",
 78 |                 "sha256:4705073de7bbcc6b9cd2f24dc9189aa8d3935e8621d3e65546c4b7fee9a042bf",
 79 |                 "sha256:4f829d6c09b997e1d0a998f970cf3ff82cd6796d56148c63c29174367878d490",
 80 |                 "sha256:51a933224b1b11986d4c7c123e5b28eb69602899d0179e6888b7abf2ffc85265",
 81 |                 "sha256:63ad98c6512b52aebde9bd806ec1127e13e2a8d42a00ebdf805153819f7c2cad",
 82 |                 "sha256:67e15514c9df4c5354b3ecc89451f5baa0f1b62c7ed68f4d20febf9c9d9e17a6",
 83 |                 "sha256:75f0e0e93851b30639baabfc1a4433aabc57eef269d55ee4c6f649fb60686218",
 84 |                 "sha256:89609708e819342dd5c94617fd53a36187d7d6a80435ddb282f6a60b058dbe77",
 85 |                 "sha256:8ca274d4e91685e4547af718b6f1e9a9d4912c7a6dcb0c68925de84f81a09d2a",
 86 |                 "sha256:9987f3d31efc427ebf9926f703e5171552cfb3b6935f880e4f0d3a17b7f91540",
 87 |                 "sha256:9f3e08dbd3f2f574913faba9b48d3c24a43fcc0eb14a0e962431005434b9cfe6",
 88 |                 "sha256:a7a403bcea250cac37971058fca0c30b0144737a375f99d3855e5e7a34c43348",
 89 |                 "sha256:ad7e4e823db1271d344e0c3ce0988b2e0fecc49079eec9c818d866c38b2824bd",
 90 |                 "sha256:b1e9037a582e650d866324a50d2741724ea5f6c175200bef0b549d014898035a",
 91 |                 "sha256:b82fbd8843ead2640158b2c0946d354b66f3d49472e6790d70c4ceec35663b3f",
 92 |                 "sha256:b91c82bfd25145d428de99429de97d7a1c2c2658c212689fe2839b29a5251159",
 93 |                 "sha256:ba57b73ec7074f60bb85f953296df437784d560553d0cc04b253c43f1846ccad",
 94 |                 "sha256:c503802a81de18b8b4d40d069f5e363795ee44b1605f38bc104160ca3bfe2c41",
 95 |                 "sha256:d30e8e0dffbc299533f47044fec26c5087473cb29cf51f1995986ac8354c7b4c",
 96 |                 "sha256:d89b810bfb0e16a0de7f18773849bdf83dd7fd0614ae5225e5a9214cdb9be245",
 97 |                 "sha256:e22e1d47def2944ad7a12c09452de085587ec5baad2174683e56a42b6918a76f",
 98 |                 "sha256:f650ddc023c95681fccd5e297820f35de039e008265040c08188be95b3275a0f",
 99 |                 "sha256:f7d4b3885ad1a7a6f07719ab6b1790d9892d6d41d973e8d4543a93bb15226fb4"
100 |             ],
101 |             "index": "pypi",
102 |             "version": "==0.20.3"
103 |         },
104 |         "scipy": {
105 |             "hashes": [
106 |                 "sha256:014cb900c003b5ac81a53f2403294e8ecf37aedc315b59a6b9370dce0aa7627a",
107 |                 "sha256:281a34da34a5e0de42d26aed692ab710141cad9d5d218b20643a9cb538ace976",
108 |                 "sha256:588f9cc4bfab04c45fbd19c1354b5ade377a8124d6151d511c83730a9b6b2338",
109 |                 "sha256:5a10661accd36b6e2e8855addcf3d675d6222006a15795420a39c040362def66",
110 |                 "sha256:628f60be272512ca1123524969649a8cb5ae8b31cca349f7c6f8903daf9034d7",
111 |                 "sha256:6dcc43a88e25b815c2dea1c6fac7339779fc988f5df8396e1de01610604a7c38",
112 |                 "sha256:70e37cec0ac0fe95c85b74ca4e0620169590fd5d3f44765f3c3a532cedb0e5fd",
113 |                 "sha256:7274735fb6fb5d67d3789ddec2cd53ed6362539b41aa6cc0d33a06c003aaa390",
114 |                 "sha256:78e12972e144da47326958ac40c2bd1c1cca908edc8b01c26a36f9ffd3dce466",
115 |                 "sha256:790cbd3c8d09f3a6d9c47c4558841e25bac34eb7a0864a9def8f26be0b8706af",
116 |                 "sha256:79792c8fe8e9d06ebc50fe23266522c8c89f20aa94ac8e80472917ecdce1e5ba",
117 |                 "sha256:865afedf35aaef6df6344bee0de391ee5e99d6e802950a237f9fb9b13e441f91",
118 |                 "sha256:870fd401ec7b64a895cff8e206ee16569158db00254b2f7157b4c9a5db72c722",
119 |                 "sha256:963815c226b29b0176d5e3d37fc9de46e2778ce4636a5a7af11a48122ef2577c",
120 |                 "sha256:9726791484f08e394af0b59eb80489ad94d0a53bbb58ab1837dcad4d58489863",
121 |                 "sha256:9de84a71bb7979aa8c089c4fb0ea0e2ed3917df3fb2a287a41aaea54bbad7f5d",
122 |                 "sha256:b2c324ddc5d6dbd3f13680ad16a29425841876a84a1de23a984236d1afff4fa6",
123 |                 "sha256:b86ae13c597fca087cb8c193870507c8916cefb21e52e1897da320b5a35075e5",
124 |                 "sha256:ba0488d4dbba2af5bf9596b849873102d612e49a118c512d9d302ceafa36e01a",
125 |                 "sha256:d78702af4102a3a4e23bb7372cec283e78f32f5573d92091aa6aaba870370fe1",
126 |                 "sha256:def0e5d681dd3eb562b059d355ae8bebe27f5cc455ab7c2b6655586b63d3a8ea",
127 |                 "sha256:e085d1babcb419bbe58e2e805ac61924dac4ca45a07c9fa081144739e500aa3c",
128 |                 "sha256:e2cfcbab37c082a5087aba5ff00209999053260441caadd4f0e8f4c2d6b72088",
129 |                 "sha256:e742f1f5dcaf222e8471c37ee3d1fd561568a16bb52e031c25674ff1cf9702d5",
130 |                 "sha256:f06819b028b8ef9010281e74c59cb35483933583043091ed6b261bb1540f11cc",
131 |                 "sha256:f15f2d60a11c306de7700ee9f65df7e9e463848dbea9c8051e293b704038da60",
132 |                 "sha256:f31338ee269d201abe76083a990905473987371ff6f3fdb76a3f9073a361cf37",
133 |                 "sha256:f6b88c8d302c3dac8dff7766955e38d670c82e0d79edfc7eae47d6bb2c186594"
134 |             ],
135 |             "version": "==1.2.1"
136 |         }
137 |     },
138 |     "develop": {}
139 | }
140 | 


--------------------------------------------------------------------------------
/example.txt:
--------------------------------------------------------------------------------
1 | ‘The Babel fish,’ said The Hitchhiker’s Guide to the Galaxy quietly, ‘is small, yellow, and leech-like, and probably the oddest thing in the Universe. It feeds on brainwave energy received not from its own carrier but from those around it. It absorbs all unconscious mental frequencies from this brainwave energy to nourish itself with. It then excretes into the mind of its carrier a telepathic matrix formed by combining the conscious thought frequencies with nerve signals picked up from the speech centres of the brain which has supplied them. The practical upshot of all this is that if you stick a Babel fish in your ear you can instantly understand anything said to you in any form of language.
2 | 


--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
  1 | # Skjul - Text-based steganography
  2 | 
  3 | *Steganography* is the practice of inconspicuously hiding data (a secret) within
  4 | some other data (a carrier). Often this is within images, where the lower bits
  5 | can be used to store a secret message. While having few real uses, steganography
  6 | can be a fun exercise in information theory.
  7 | 
  8 | Skjul (Danish for *hide*, as in *to hide*), is a text-based steganography
  9 | implementation. Given a carrier message, Skjul can encode a secret bitstring
 10 | into it by slightly changing words - hopefully so little as to be imperceptible
 11 | to an uninitiated reader.
 12 | 
 13 | ## Example
 14 | 
 15 |     $ cat example.txt
 16 |     ‘The Babel fish,’ said The Hitchhiker’s Guide to the Galaxy quietly, ‘is
 17 |     small, yellow, and leech-like, and probably the oddest thing in the
 18 |     Universe. It feeds on brainwave energy received not from its own carrier but
 19 |     from those around it. It absorbs all unconscious mental frequencies from
 20 |     this brainwave energy to nourish itself with. It then excretes into the mind
 21 |     of its carrier a telepathic matrix formed by combining the conscious thought
 22 |     frequencies with nerve signals picked up from the speech centres of the
 23 |     brain which has supplied them. The practical upshot of all this is that if
 24 |     you stick a Babel fish in your ear you can instantly understand anything
 25 |     said to you in any form of language.
 26 | 
 27 |     $ cat example.txt | ./skjul.py encode '101010' | tee 'encoded.txt'
 28 |     ‘The Babel fish,’ said The Hitchhiker’s Guide to the Galaxy quietly, ‘is
 29 |     small, yellow, and leech-like, and possibly the oddest thing in the
 30 |     Universe. It feeds on brainwave energy recieved not to its own carrier but
 31 |     to those around it. It absorbs all unconscious mental frequencies from this
 32 |     brainwave energy to nourish itself with. It then excretes into the mind of
 33 |     its carrier a telepathic matrix formed by combining the conscious thought
 34 |     frequencies with nerve signals picked up from the speech centres of the
 35 |     brain which has supplied them. The practical upshot of all that is that if
 36 |     you stick a Babel fish in your ear you can instantly comprehend anything
 37 |     said from you in any form of language.
 38 | 
 39 |     $ wdiff example.txt encoded.txt
 40 |     ‘The Babel fish,’ said The Hitchhiker’s Guide to the Galaxy quietly, ‘is
 41 |     small, yellow, and leech-like, and [-probably-] {+possibly+} the oddest
 42 |     thing in the Universe. It feeds on brainwave energy [-received-]
 43 |     {+recieved+} not [-from-] {+to+} its own carrier but [-from-] {+to+} those
 44 |     around it. It absorbs all unconscious mental frequencies from this brainwave
 45 |     energy to nourish itself with. It then excretes into the mind of its carrier
 46 |     a telepathic matrix formed by combining the conscious thought frequencies
 47 |     with nerve signals picked up from the speech centres of the brain which has
 48 |     supplied them. The practical upshot of all [-this-] {+that+} is that if you
 49 |     stick a Babel fish in your ear you can instantly [-understand-]
 50 |     {+comprehend+} anything said [-to-] {+from+} you in any form of language.
 51 | 
 52 |     $ cat encoded.txt | ./skjul.py decode
 53 |     101010
 54 | 
 55 | The secrets can only be bitstrings.
 56 | 
 57 | ## How it works
 58 | 
 59 | ### Word pairs
 60 | 
 61 | Given a carrier string and a secret bitstring, the basic idea is to assign to each
 62 | word in the carrier string a *paired word*. The secret message can then be
 63 | encoded in our choice of word. To not have a noticeable difference, the paired
 64 | word should be able to "work" in the same context as the original word, i.e. we
 65 | wish to select words that are likely to share the same neighboring words.
 66 | 
 67 | Word-vector models are a common way to model these *distributional properties* of
 68 | words. In such a model, each word has a vector embedding of e.g. 300 dimensions.
 69 | These embeddings are built such that words that tend to have similar contexts
 70 | also tend to have similar embeddings.
 71 | 
 72 | Using a word vector model, we pair each embedding with a neighbor using cosine
 73 | distance as the metric. Note that these pairings must be exclusive, i.e.
 74 | `[(a,b), (a,c)]` is not valid because `a` participates in both pairs. Instead,
 75 | we find the k-nearest neighbors for each word then greedily pair words based on
 76 | the distance to the closest non-paired neighbor. This means that words are not
 77 | always paired with their closest neighbor and some words are not paired at all.
 78 | 
 79 | This repository includes a precomputed pair list based on
 80 | [Facebook's fasttext vectors](https://fasttext.cc/docs/en/crawl-vectors.html).
 81 | 
 82 | For example, the string "this is a test" has 3 words in the pair list:
 83 | 
 84 | | 1    | 0     | Distance   |
 85 | |------|-------|------------|
 86 | | this | that  | 0.17533547 |
 87 | | was  | is    | 0.28453428 |
 88 | | test | tests | 0.20037645 |
 89 | 
 90 | To encode a *k*-bit message, we simply pick the *k* tokens with lowest distance
 91 | to their paired word and swap or not depending on the corresponding bit in the
 92 | secret. E.g., to encode a single 1-bit, we change "this is a test" to "that is a
 93 | test".
 94 | 
 95 | ### Variable length coding
 96 | 
 97 | The method as outlined above requires the person decoding to know the length of
 98 | the secret. This makes it somewhat impractical and it would be better to encode
 99 | the length as part of the message itself. To do this, we need a prefix-free
100 | encoding scheme, as we do not know the amount of bits for the length beforehand.
101 | 
102 | For this, we use
103 | [Elias gamma coding](https://en.wikipedia.org/wiki/Elias_gamma_coding). In gamma
104 | coding, we first encode the length of the integer in unary zero bits followed by
105 | the length integer itself.
106 | 
107 | A downside of this is that it increases the length of the secret, especially for
108 | small secrets. This is due to how the number of bits in the length itself is
109 | comparatively more significant than it would be for a longer message.
110 | 
111 | ### A pinch of noise
112 | 
113 | Lastly, we add XOR encryption to the secret using a pseudorandom number
114 | generator (PRNG). This breaks any predictable patterns that might be in the
115 | secret. For example, a secret of only zeros and a carrier that contains the same
116 | pair often would always pick the same word. This also makes it possible to
117 | specify a key by using the key as seed for the PRNG.
118 | 
119 | We also add a small, optional amount of noise to each word pair distance. This
120 | makes the pairs chosen more varied, such that it is not always the minimum word
121 | that is chosen.
122 | 


--------------------------------------------------------------------------------
/skjul.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """Text-based steganography.
  3 | 
  4 | Usage:
  5 |   skjul.py process [<in>] [<pairs>] [--neighbors=<k>] [--lines=<n>]
  6 |   skjul.py encode <secret> [<pairs>] [--key=<k>] [--noise=<x>]
  7 |   skjul.py decode [<pairs>] [--key=<k>] [--noise=<x>]
  8 |   skjul.py --version
  9 | 
 10 | Commands:
 11 |   process             Build a pairs list from a fastText vector file.
 12 |   encode              Encode a secret message. Carrier is read from standard
 13 |                       input and output is written to standard output.
 14 |   decode              Decode a secret message from standard input.
 15 | 
 16 | All commands accept a path to a pair-list file. If this is not supplied, then
 17 | 'skjul.csv' in the current working directory is used instead.
 18 | 
 19 | Options:
 20 |   -h --help           Show this screen.
 21 |   --version           Show version.
 22 |   -n --lines=<n>      Number of lines to read from file [default: all].
 23 |   -k --neighbors=<k>  Number of neighbors to find for each word [default: 10].
 24 |   -k --key=<key>      Key to encode/decode message with [default: 0].
 25 |   -x --noise=<x>      Noise fraction when selecting words [default: 0.025].
 26 | """
 27 | 
 28 | import re
 29 | import csv
 30 | from enum import Enum
 31 | import numpy as np
 32 | from sklearn.neighbors import NearestNeighbors
 33 | from docopt import docopt
 34 | from schema import Schema, And, Or, Use, Regex
 35 | import sys
 36 | import itertools
 37 | 
 38 | 
 39 | def _pairing(x, k=10, metric='cosine', stable=False):
 40 |     """
 41 |     Pairs points such that points that are closer wrt. the given metric are
 42 |     more likely to be paired together.
 43 | 
 44 |     Args:
 45 |         x (array): A n by d matrix representing n vectors of length d.
 46 |         k (int): Number of neighbors to consider when pairing.
 47 |         metric (str): Metric to use for nearest neighbor.
 48 |         stable (bool): Whether to ensure exact output for the given input.
 49 | 
 50 |     See sklearn.neighbors.NearestNeighbors for all possible metrics.
 51 | 
 52 |     Returns:
 53 |         low: Lower indices of each pair
 54 |         hi: Higher indices of each pair
 55 |         dist: Distance between each point
 56 |     """
 57 | 
 58 |     x = np.asarray(x)
 59 |     n = x.shape[0]
 60 | 
 61 |     edge_dist, edge_tgt = NearestNeighbors(n_neighbors=k, metric=metric) \
 62 |         .fit(x).kneighbors()
 63 | 
 64 |     sorting = np.argsort(edge_dist, axis=None,
 65 |                          kind='stable' if stable else None)
 66 | 
 67 |     edge_src = np.unravel_index(sorting, edge_dist.shape)[0]
 68 |     edge_tgt = np.ravel(edge_tgt)[sorting]
 69 |     edge_dist = np.ravel(edge_dist)[sorting]
 70 | 
 71 |     pairing = np.full([n], -1, dtype=np.int32)
 72 |     pairing_dist = np.zeros([n], dtype=edge_dist.dtype)
 73 | 
 74 |     for src, tgt, dist in np.nditer((edge_src, edge_tgt, edge_dist)):
 75 |         if pairing[src] == -1 and pairing[tgt] == -1:
 76 |             pairing[src] = tgt
 77 |             pairing[tgt] = src
 78 |             pairing_dist[src] = pairing_dist[tgt] = dist
 79 | 
 80 |     paired_indices = np.where(pairing != -1)[0]
 81 |     lo = paired_indices[pairing[paired_indices] > paired_indices]
 82 |     hi = pairing[lo]
 83 | 
 84 |     return lo, hi, pairing_dist[lo]
 85 | 
 86 | 
 87 | def _gamma_encode(num):
 88 |     """
 89 |     Encodes a positive number using Elias gamma coding.
 90 | 
 91 |     Args:
 92 |         num (int): An integer to encode. Must be positive.
 93 | 
 94 |     Returns:
 95 |         list: A list of booleans representing bits of the encoded number.
 96 |     """
 97 |     code = [False] * num.bit_length()
 98 | 
 99 |     for i in range(num.bit_length() - 1, -1, -1):
100 |         code.append((num >> i) & 1 != 0)
101 | 
102 |     return code
103 | 
104 | 
def _gamma_decode(bits):
    """
    Reads one Elias-gamma-style encoded number from a stream of bits.

    The leading run of zero bits gives the number of binary digits that
    follow; those digits are then read most significant first. Only the
    bits belonging to the number are consumed, so an iterator passed in can
    be read further by the caller afterwards.

    Args:
        bits (iterable): An iterable of bits to decode.

    Returns:
        int: The decoded gamma integer. None is returned when the stream
        runs out before a complete number was read.
    """
    stream = iter(bits)

    # Length prefix: count zero bits up to the first one bit.
    remaining = 0
    for bit in stream:
        if bit:
            break
        remaining += 1
    else:
        return None

    # The one bit that ended the prefix is also the leading binary digit.
    value = 1
    remaining -= 1
    if remaining == 0:
        return value

    for bit in stream:
        value = (value << 1) | (1 if bit else 0)
        remaining -= 1

        if remaining == 0:
            return value

    return None
134 | 
135 | 
class _Caps(Enum):
    """Capitalization style of a word, so it can be reapplied to another."""

    UPPER = 1
    TITLE = 2
    LOWER = 3

    @staticmethod
    def from_word(word):
        """Classifies a word as title case, upper case or (otherwise) lower."""
        # Title case is tested first, so a single capital letter is TITLE.
        if word.istitle():
            return _Caps.TITLE

        return _Caps.UPPER if word.isupper() else _Caps.LOWER

    def apply(self, word):
        """Returns the given word rewritten in this capitalization style."""
        if self is _Caps.TITLE:
            return word.title()

        if self is _Caps.UPPER:
            return word.upper()

        return word.lower()
157 | 
158 | 
class Steganographer:
    """
    A class for hiding secret messages in ordinary text.

    The steganographer holds a table of word pairs. Every word of a pair
    maps to its partner, the distance between the two and the bit value the
    word stands for (True for the first word of a pair, False for the
    second). A secret is embedded by choosing, for selected words of the
    carrier, which member of the pair appears in the text.
    """

    TOKEN_REGEX = re.compile(r'\w+')

    @staticmethod
    def _valid_mask(words):
        """
        Selects which entries of a vocabulary to keep.

        An entry is kept when it is a single token and no other kept entry
        has the same lowercase form. In case of a collision the lowercase
        spelling wins; between non-lowercase spellings the first one seen
        is kept. We assume that lowercase words are more common and
        therefore have more representative embeddings.

        Args:
            words (array): A 1d array of strings.

        Returns:
            array: A boolean array, True for each entry to keep.
        """
        # Builtin bool instead of the np.bool alias, which NumPy has removed.
        valid = np.zeros([words.size], bool)
        chosen = {}  # lowercase form -> index of the entry currently kept

        for i, word in enumerate(words):
            if Steganographer.TOKEN_REGEX.fullmatch(word) is None:
                continue

            lower = word.lower()
            previous = chosen.get(lower)

            if previous is not None:
                # Only a lowercase spelling may displace an earlier entry.
                if not word.islower():
                    continue

                valid[previous] = False

            valid[i] = True
            chosen[lower] = i

        return valid

    @staticmethod
    def _key_bit(rng):
        """
        Draws the next bit of the XOR key stream from the given generator.

        The upper bound of randint is exclusive, so the bound must be 2 for
        both False and True to be possible.
        """
        return bool(rng.randint(2))

    @staticmethod
    def from_embeddings(words, embeddings, k=5, metric='cosine'):
        """
        Creates a new steganographer from a list of words and corresponding
        embeddings.

        Non-token words are dropped and all words are lowercased before the
        remaining embeddings are paired up.

        Args:
            words (list): A list of words of length n.
            embeddings (array): A n by d matrix of word embeddings.
            k (int): The number of neighbors to consider when pairing words.
            metric (str): The metric to use when pairing words.

        Returns:
            Steganographer: A new steganographer.
        """

        words = np.asarray(words)
        embeddings = np.asarray(embeddings)

        valid = Steganographer._valid_mask(words)

        words = np.char.lower(words[valid])
        embeddings = embeddings[valid]

        left, right, dist = _pairing(embeddings, k, metric=metric, stable=True)
        return Steganographer(zip(words[left], words[right], dist))

    @staticmethod
    def load(file):
        """
        Loads a steganographer from a file-like object.

        The file is read as CSV with exactly three columns per row: the two
        words of a pair followed by their distance.
        """

        return Steganographer((a, b, float(dist))
                              for [a, b, dist]
                              in csv.reader(file))

    def __init__(self, pairs):
        """
        Args:
            pairs (iterable): Triples of (word, paired word, distance). The
                first word of each triple represents a True bit and the
                second a False bit.
        """
        self.map = {a: (b, dist, value)
                    for left, right, dist in pairs
                    for a, b, dist, value in [(left, right, dist, True),
                                              (right, left, dist, False)]}

    def save(self, file):
        """
        Saves the steganographer to a file-like object.

        One CSV row is written per pair, ordered by increasing distance.
        """

        # Each pair is stored under both of its words; emit it only once.
        pairs = ([a, b, dist]
                 for (a, (b, dist, value))
                 in self.map.items()
                 if value)

        csv.writer(file).writerows(sorted(pairs, key=lambda x: x[2]))

    def _tokenize(self, string):
        """
        Splits a string into the known words and the text between them.

        Only words whose lowercase form is in the pair table count as
        tokens; all other text, unknown words included, ends up in the
        intertokens.

        Returns:
            tokens: The lowercased known words, in order of appearance.
            caps: The capitalization of each token in the original string.
            intertokens: The text before each token, plus the text after the
                last one. Always one element longer than tokens.
        """
        intertokens = []
        tokens = []
        caps = []

        last_end = 0

        for match in Steganographer.TOKEN_REGEX.finditer(string):
            if match.group().lower() in self.map:
                intertokens.append(string[last_end:match.start()])
                tokens.append(match.group().lower())
                caps.append(_Caps.from_word(match.group()))
                last_end = match.end()

        intertokens.append(string[last_end:])

        return tokens, caps, intertokens

    def encode(self, carrier, secret, key=0, noise=0):
        """
        Encodes a secret message into a carrier message.

        The secret is prefixed with its gamma-coded length and XORed with a
        key stream seeded by the key. Its bits are then stored in the tokens
        with the smallest (noise-adjusted) pair distance.

        Args:
            carrier (str): A string to embed a secret message into.
            secret (list): A list of booleans to embed.
            key (int): A key to encode the message with.
            noise (float): The amount of noise to add.

        Returns:
            str: A message with the given secret embedded within it.

        Raises:
            ValueError: If the carrier has fewer usable tokens than the
                length-prefixed secret has bits.
        """
        tokens, caps, intertokens = self._tokenize(carrier)

        # The noise is drawn before any key bit; decode relies on this order.
        rng = np.random.RandomState(key)
        token_noise = rng.rand(len(tokens)) * noise

        secret = list(secret)
        secret = _gamma_encode(len(secret)) + secret

        if len(secret) > len(tokens):
            raise ValueError('Insufficient tokens for secret')

        dist = np.array([self.map[token][1] for token in tokens]) + token_noise

        for index, bit in zip(np.argsort(dist), secret):
            paired, _, value = self.map[tokens[index]]
            # Swap when the word present does not carry the masked bit.
            if (value != bool(bit)) != self._key_bit(rng):
                tokens[index] = paired

        result = []
        for token, cap, intertoken in zip(tokens, caps, intertokens):
            result.append(intertoken)
            result.append(cap.apply(token))

        result.append(intertokens[-1])

        return ''.join(result)

    def decode(self, message, key=0, noise=0):
        """
        Extracts a secret message from a string.

        The key and noise must be the same as those given to encode.

        Args:
            message (str): A string from which to extract a secret message.
            key (int): The key the message was encoded with.
            noise (float): The amount of noise the message was encoded with.

        Returns:
            list: The decoded secret list of booleans.
        """
        tokens = self._tokenize(message)[0]

        rng = np.random.RandomState(key)
        token_noise = rng.rand(len(tokens)) * noise

        dist = np.array([self.map[token][1] for token in tokens]) + token_noise

        # Lazy on purpose: one key bit is drawn per bit consumed, mirroring
        # the order in which encode drew them.
        bits = (self.map[tokens[index]][2] != self._key_bit(rng)
                for index in np.argsort(dist))

        secret_len = _gamma_decode(bits)

        return list(itertools.islice(bits, secret_len))
317 | 
318 | 
319 | def _read_fast(file, nrows=None):
320 |     """
321 |     Reads a facebook fastText formatted vector file into a list of words and a
322 |     2d numpy array of corresponding embeddings.
323 |     """
324 |     [n, d] = [int(s) for s in file.readline().split(' ')]
325 | 
326 |     if nrows is not None:
327 |         n = min(n, nrows)
328 | 
329 |     embeddings = np.zeros([n, d], np.float32)
330 |     words = []
331 | 
332 |     for i, line in enumerate(file):
333 |         if i >= n:
334 |             break
335 | 
336 |         row = line.split(' ')
337 |         words.append(row[0])
338 |         embeddings[i, :] = [float(x) for x in row[1:d + 1]]
339 | 
340 |     return words, embeddings
341 | 
342 | 
def main():
    """
    Command line entry point.

    Parses the arguments described by the module docstring and runs one of
    three commands:

    * ``process``: reads fastText vectors from ``<in>`` (or stdin), builds a
      Steganographer from them and saves its word pairs to ``<pairs>``.
    * ``encode``: loads the word pairs, reads the carrier text from stdin and
      writes the text with ``<secret>`` embedded to stdout.
    * ``decode``: loads the word pairs, reads a message from stdin and writes
      the recovered secret to stdout as a string of ``0`` and ``1``.

    ``<pairs>`` falls back to ``skjul.csv`` when it is not given.
    """
    raw_args = docopt(__doc__, version='skjul 0.1')

    schema = Schema({
        'process': bool,
        'encode': bool,
        'decode': bool,
        '<in>': Or(None, Use(open)),
        '<pairs>': Use(lambda x: x or 'skjul.csv'),
        '<secret>': Or(None, And(Regex(r'^[01]*$'), Use(
            lambda s: [c == '1' for c in s]))),
        '--version': bool,
        '--lines': Or(And('all', Use(lambda x: None)), Use(int)),
        '--neighbors': Use(int),
        '--key': Use(int),
        '--noise': Use(float),
    })

    args = schema.validate(raw_args)

    # The schema already opened '<in>' (Use(open)); keep a handle on it so it
    # is closed again no matter which command runs or whether it fails.
    in_file = args['<in>']

    try:
        if args['process']:
            words, embeddings = _read_fast(in_file or sys.stdin,
                                           nrows=args['--lines'])

            st = Steganographer.from_embeddings(words, embeddings,
                                                k=args['--neighbors'])

            # newline='' is what the csv module asks for on files handed to
            # csv.writer; without it, rows end in '\r\r\n' on Windows.
            # NOTE(review): assumes save() writes through csv.writer.
            with open(args['<pairs>'], 'w', newline='') as pairs:
                st.save(pairs)

        elif args['encode']:
            with open(args['<pairs>']) as pairs:
                st = Steganographer.load(pairs)

            sys.stdout.write(st.encode(sys.stdin.read(), args['<secret>'],
                                       args['--key'], args['--noise']))

        elif args['decode']:
            with open(args['<pairs>']) as pairs:
                st = Steganographer.load(pairs)

            decoded = st.decode(sys.stdin.read(), args['--key'],
                                args['--noise'])
            sys.stdout.write(''.join('1' if bit else '0' for bit in decoded))
    finally:
        if in_file is not None:
            in_file.close()
384 | 
385 | 
# Run the command line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
388 | 


--------------------------------------------------------------------------------