├── .travis.yml ├── phpunit.xml.dist ├── Tests └── TextOrBinarySnifferTest.php ├── composer.json ├── LICENSE ├── README.md └── TextOrBinarySniffer.php /.travis.yml: -------------------------------------------------------------------------------- 1 | language: php 2 | 3 | php: 4 | - 5.3.3 5 | - 5.3 6 | - 5.4 7 | - 5.5 8 | 9 | before_script: 10 | - composer install --prefer-source --dev 11 | 12 | script: phpunit --coverage-text --verbose 13 | -------------------------------------------------------------------------------- /phpunit.xml.dist: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Tests 6 | 7 | 8 | 9 | 10 | 11 | . 12 | 13 | ./Tests/ 14 | ./vendor/ 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /Tests/TextOrBinarySnifferTest.php: -------------------------------------------------------------------------------- 1 | addData($bytes, 0, strlen($bytes)); 17 | 18 | $sniffer = new TextOrBinarySniffer; 19 | 20 | $this->assertTrue($sniffer->isLikelyText($buffer)); 21 | $this->assertTrue($sniffer->looksLikeValidUtf8($buffer)); 22 | $this->assertFalse($sniffer->isLikelyBinary($buffer)); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "dflydev/snort-textorbinary", 3 | "description": "Buffer analysis to determine whether a Buffer is more likely to represent text or binary data.", 4 | "license": "MIT", 5 | "authors": [ 6 | { 7 | "name": "Dragonfly Development Inc.", 8 | "email": "info@dflydev.com", 9 | "homepage": "http://dflydev.com" 10 | }, 11 | { 12 | "name": "Beau Simensen", 13 | "email": "beau@dflydev.com", 14 | "homepage": "http://beausimensen.com" 15 | } 16 | ], 17 | "require": { 18 | "php": ">=5.3.3", 19 | "dflydev/snort-buffer": "self.version" 20 | }, 21 | "autoload": { 22 | "psr-0": { 23 | 
"Dflydev\\Snort\\TextOrBinary": "" 24 | } 25 | }, 26 | "target-dir": "Dflydev/Snort/TextOrBinary", 27 | "extra": { 28 | "branch-alias": { 29 | "dev-master": "1.0-dev" 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013 Dragonfly Development Inc. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is furnished 8 | to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Snort - Text or Binary 2 | ====================== 3 | 4 | Buffer analysis to determine whether a Buffer is more likely to represent text 5 | or binary data. 
6 | 7 | 8 | Requirements 9 | ------------ 10 | 11 | * PHP 5.3.3+ 12 | 13 | 14 | Installation 15 | ------------ 16 | 17 | Through [Composer][1] as [dflydev/snort-textorbinary][2]. 18 | 19 | 20 | Usage 21 | ----- 22 | 23 | ```php 24 | addData($bytes, 0, strlen($bytes)); 33 | 34 | $sniffer = new Dflydev\Snort\TextOrBinary\TextOrBinarySniffer; 35 | 36 | if ($sniffer->isLikelyText($buffer)) { 37 | print "Well, probably text? (as best as we can guess in 512 bytes...)\n"; 38 | } 39 | 40 | if ($sniffer->isLikelyBinary($buffer)) { 41 | print "Well, probably binary? (as best as we can guess in 512 bytes...)\n"; 42 | } 43 | 44 | if ($sniffer->isMostlyAscii($buffer)) { 45 | print "Yup, Mostly ASCII!\n"; 46 | } 47 | 48 | if ($sniffer->looksLikeValidUtf8($buffer)) { 49 | print "Yup, UTF-8!\n"; 50 | } 51 | ``` 52 | 53 | 54 | License 55 | ------- 56 | 57 | MIT, see LICENSE. 58 | 59 | 60 | Community 61 | --------- 62 | 63 | If you have questions or want to help out, join us in the **#dflydev** channel 64 | on **irc.freenode.net** or mention [@dflydev][4] on Twitter. 65 | 66 | 67 | Not Invented Here 68 | ----------------- 69 | 70 | This work was heavily influenced by [Apache Tika][3]. 
71 | 72 | 73 | [1]: http://getcomposer.org 74 | [2]: https://packagist.org/packages/dflydev/snort-textorbinary 75 | [3]: http://tika.apache.org 76 | [4]: https://twitter.com/dflydev 77 | -------------------------------------------------------------------------------- /TextOrBinarySniffer.php: -------------------------------------------------------------------------------- 1 | isMostlyAscii($buffer) || $this->looksLikeValidUtf8($buffer); 12 | }

    // NOTE(review): everything in this file ahead of the fragment above (the
    // opening PHP tag, any namespace/import lines, the class header and the
    // signature of the first method) was lost when this dump was extracted.
    // The fragment is presumably the tail of isLikelyText(), which
    // isLikelyBinary() below calls -- restore it from the real file rather
    // than from this dump.

    /**
     * Report whether the buffer is more likely to hold binary data than text.
     *
     * This is exactly the negation of isLikelyText().
     *
     * @param Buffer $buffer Byte statistics to inspect.
     *
     * @return bool
     */
    public function isLikelyBinary(Buffer $buffer)
    {
        return !$this->isLikelyText($buffer);
    }

    /**
     * Report whether the buffer consists almost entirely of plain ASCII.
     *
     * All three conditions must hold:
     *  - the buffer's total() is greater than zero;
     *  - bytes below 0x20 that are not "safe" control characters make up
     *    less than 2% of the total;
     *  - bytes in the 0x20..0x7F range plus the safe control characters
     *    make up more than 90% of the total.
     *
     * NOTE(review): the adjacent ranges used here suggest that
     * Buffer::countRange($from, $to) excludes $to -- confirm against Buffer.
     *
     * @param Buffer $buffer Byte statistics to inspect.
     *
     * @return bool
     */
    public function isMostlyAscii(Buffer $buffer)
    {
        $total = $buffer->total();

        if ($total <= 0) {
            return false;
        }

        $control = $buffer->countRange(0, 0x20);
        $ascii = $buffer->countRange(0x20, 128);
        $safe = $this->countSafeControl($buffer);

        // Integer cross-multiplication keeps the percentage checks free of
        // floating point arithmetic.
        return ($control - $safe) * 100 < $total * 2
            && ($ascii + $safe) * 100 > $total * 90;
    }

    /**
     * Report whether the byte distribution is consistent with UTF-8 text.
     *
     * Only byte counts are examined, not byte order, so this is a heuristic
     * rather than a validation of the actual sequence. The buffer passes when:
     *  - it contains no byte in 0xF8..0xFF (such bytes never occur in UTF-8);
     *  - it contains at least one ASCII (0x20..0x7F) or multi-byte lead byte;
     *  - the number of continuation bytes (0x80..0xBF) does not exceed what
     *    the lead bytes call for, and falls short of it by at most three;
     *  - unsafe control bytes amount to less than 2% of the ASCII/lead bytes.
     *
     * @param Buffer $buffer Byte statistics to inspect.
     *
     * @return bool
     */
    public function looksLikeValidUtf8(Buffer $buffer)
    {
        // 0xF8..0xFF can never appear in UTF-8; a single one rules it out.
        // The original omitted this check, so such buffers could be
        // misreported as UTF-8 text.
        if ($buffer->countRange(0xf8, 0x100) > 0) {
            return false;
        }

        $control = $buffer->countRange(0, 0x20);
        $utf8 = $buffer->countRange(0x20, 0x80);
        $safe = $this->countSafeControl($buffer);

        // Lead-byte counts, indexed so that entry $i announces ($i + 1)
        // continuation bytes.
        $leading = array(
            $buffer->countRange(0xc0, 0xe0), // lead byte of a 2-byte sequence
            $buffer->countRange(0xe0, 0xf0), // lead byte of a 3-byte sequence
            $buffer->countRange(0xf0, 0xf8), // lead byte of a 4-byte sequence
        );

        $expectedContinuation = 0;
        foreach ($leading as $index => $leadCount) {
            $utf8 += $leadCount;
            $expectedContinuation += ($index + 1) * $leadCount;
        }

        $continuation = $buffer->countRange(0x80, 0xc0);

        // The slack of three presumably tolerates a multi-byte sequence that
        // was cut off at the end of the sampled data -- TODO confirm.
        return $utf8 > 0
            && $continuation <= $expectedContinuation
            && $continuation >= $expectedContinuation - 3
            && ($control - $safe) * 100 < $utf8 * 2;
    }

    /**
     * Count the control bytes (below 0x20) that are not considered safe.
     *
     * @param Buffer $buffer Byte statistics to inspect.
     *
     * @return int All bytes below 0x20 minus the countSafeControl() total.
     */
    public function countControl(Buffer $buffer)
    {
        $allControl = $buffer->countRange(0, 0x20);

        return $allControl - $this->countSafeControl($buffer);
    }

    /**
     * Count the control bytes that routinely appear in text: tab, line feed,
     * carriage return, form feed (0x0C) and escape (0x1B).
     *
     * @param Buffer $buffer Byte statistics to inspect.
     *
     * @return int Sum of the per-byte counts for those five values.
     */
    public function countSafeControl(Buffer $buffer)
    {
        $safeBytes = array(ord("\t"), ord("\n"), ord("\r"), 0x0c, 0x1b);

        $safeCount = 0;
        foreach ($safeBytes as $byte) {
            $safeCount += $buffer->count($byte);
        }

        return $safeCount;
    }
}
--------------------------------------------------------------------------------