├── ChangeLog ├── LICENSE ├── README.txt ├── doc ├── android-notes ├── bat-manual.pdf ├── bat-manual.tex ├── bat_internal_format.txt ├── creating-the-database.txt ├── database-example-files │ ├── DOWNLOADURL │ ├── LIST │ ├── README │ └── SHA256SUM ├── defensivepublications │ └── security-bat.pdf ├── filesystem-observations ├── json.txt ├── kernelsymbolsspec.txt ├── knowledgebase-ideas ├── listoftags.txt ├── pfif │ ├── README │ ├── workreport1 │ ├── workreport2 │ ├── workreport3 │ ├── workreport4 │ └── workreport56 ├── processing-java ├── running-tests.txt └── testsuite │ ├── README │ ├── bat-training.tex │ ├── bat-training2.tex │ ├── bat-training3.tex │ ├── bat-training4.tex │ ├── bat-training5.tex │ ├── bat-training6.tex │ ├── openwrt-configs │ ├── 010-fix_mixed_implicit_and_normal_rules_error.patch │ ├── README │ ├── firmware1-config │ ├── firmware2-config │ └── firmware3-config │ ├── testoutput │ └── README │ ├── training-notes │ ├── training-notes2 │ ├── training-notes3 │ ├── training-notes4 │ ├── training-notes5 │ └── training-notes6 └── src ├── LICENSE ├── MANIFEST.in ├── TODO ├── bat-scan ├── bat-scan.config ├── bat ├── __init__.py ├── batxor.py ├── bruteforcescan.py ├── busybox.py ├── busyboxversion.py ├── checks.py ├── derivekernelconfig.py ├── elfcheck.py ├── ext2.py ├── extractor.py ├── file2package.py ├── findduplicates.py ├── findlibs.py ├── fixduplicates.py ├── fsmagic.py ├── fssearch.py ├── fwunpack.py ├── generatehexdump.py ├── generateimages.py ├── generatejson.py ├── generatereports.py ├── guireport.py ├── identifier.py ├── images.py ├── interfaces.py ├── javacheck.py ├── jffs2.py ├── kernelanalysis.py ├── kernelsymbols.py ├── licenseversion.py ├── piecharts.py ├── prerun.py ├── prunefiles.py ├── renamefiles.py ├── reportcopyright.py ├── security.py └── unpackrpm.py ├── batgui ├── busybox-compare-configs.py ├── busybox-walk.py ├── crawlers ├── README ├── crawling-php ├── gnu-config └── gnucrawler.py ├── debian ├── changelog ├── compat ├── control ├── copyright ├── files ├── pyversions └── rules ├── extractkernelstrings.py ├── knowledgebaseadd.py ├── knowledgebaseaddchipset.py ├── knowledgebaseinit.py ├── maintenance ├── bat-sqlitetopostgresql.py ├── batextensions.py ├── busybox-appletname-extractor.py ├── clonedbinit.py ├── copybatarchives.py ├── createbatarchive.py ├── createdb.config ├── createdb.py ├── createfiledatabasedebian.py ├── createfiledatabasefedora.py ├── createmanifests.py ├── cveparser.py ├── dumplist.py ├── extractrpms.py ├── findclones.py ├── findthirdparty.py ├── generatelist-fdroid.py ├── generatelist.py ├── packagerename.py ├── postgresql-index.sql ├── postgresql-table-drop.sql ├── postgresql-table.sql ├── rewritelist.py ├── scorecache.py ├── storeresults.py ├── updatesha256sum.py ├── verifyarchive.py ├── verifydb.py └── verifylist.py ├── patches ├── README ├── code2html-0.9.1-add-csharp.patch ├── code2html-0.9.1-add-groovyscala.patch ├── code2html-0.9.1-add-qml.patch └── cramfs.patch ├── scripts ├── comparebinaries.py ├── extractcomments.py ├── findxor.py ├── licensecompare.py ├── sourcewalk.py └── verifysourcearchive.py ├── setup.cfg └── setup.py /README.txt: -------------------------------------------------------------------------------- 1 | The Binary Analysis Tool (BAT) is a modular framework to analyse binary files. 2 | 3 | This project is no longer actively maintained. There are a few forks that might suit your needs. 
4 | 5 | If you still want to use it, don't forget to also install "bat-extratools": 6 | 7 | https://github.com/armijnhemel/bat-extratools/ 8 | -------------------------------------------------------------------------------- /doc/android-notes: -------------------------------------------------------------------------------- 1 | Unpacking Android things 2 | 3 | File systems in use: 4 | 5 | * yaffs2 (use unyaffs for this, although this will not always work) 6 | * ubifs (seen one example so far that had broken images) 7 | * ext4 (soon, according to a post Ted Ts'o once made) 8 | * possibly other file systems too 9 | 10 | All kinds of meta info in boot.img files: 11 | 12 | http://android.git.kernel.org/?p=platform/bootloader/legacy.git;a=blob;f=include/boot/bootimg.h;h=44fde9277d65c82eecb8ffeaab7b078e61c6ff3f;hb=HEAD 13 | 14 | Location of standard license texts 15 | 16 | After unpacking a firmware, a copy of the default Google terms of service can often be found in /etc/NOTICE.html.gz. Sometimes additional files with extra license texts can be found. 17 | 18 | 19 | APK 20 | 21 | Applications are in APK format: 22 | 23 | http://en.wikipedia.org/wiki/APK_%28file_format%29 24 | 25 | 26 | Android resource files and XML files 27 | 28 | AXMLPrinter2: http://code.google.com/p/android4me/downloads/list 29 | http://code.google.com/p/android-apktool/ 30 | -------------------------------------------------------------------------------- /doc/bat-manual.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/armijnhemel/binaryanalysis/ea97b6b7617128ccf7cfa19244b91675d9bf66df/doc/bat-manual.pdf -------------------------------------------------------------------------------- /doc/database-example-files/DOWNLOADURL: -------------------------------------------------------------------------------- 1 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.10.tar.bz2 https://www.kernel.org/ 2 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.11.tar.bz2 https://www.kernel.org/ 3 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.12.tar.bz2 https://www.kernel.org/ 4 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.13.tar.bz2 https://www.kernel.org/ 5 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.14.tar.bz2 https://www.kernel.org/ 6 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.15.tar.bz2 https://www.kernel.org/ 7 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.16.tar.bz2 https://www.kernel.org/ 8 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.17.tar.bz2 https://www.kernel.org/ 9 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.18.tar.bz2 https://www.kernel.org/ 10 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.19.tar.bz2 https://www.kernel.org/ 11 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.1.tar.bz2 https://www.kernel.org/ 12 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.20.tar.bz2 https://www.kernel.org/ 13 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.21.tar.bz2 https://www.kernel.org/ 14 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.22.tar.bz2 https://www.kernel.org/ 15 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.23.tar.bz2 https://www.kernel.org/ 16 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.24.tar.bz2 https://www.kernel.org/ 17 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.25.tar.bz2 https://www.kernel.org/ 18 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.26.tar.bz2 https://www.kernel.org/ 19 | 
ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.27.tar.bz2 https://www.kernel.org/ 20 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.28.tar.bz2 https://www.kernel.org/ 21 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.29.tar.bz2 https://www.kernel.org/ 22 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.2.tar.bz2 https://www.kernel.org/ 23 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.30.tar.bz2 https://www.kernel.org/ 24 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.31.tar.bz2 https://www.kernel.org/ 25 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.32.tar.bz2 https://www.kernel.org/ 26 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.33.tar.bz2 https://www.kernel.org/ 27 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.34.tar.bz2 https://www.kernel.org/ 28 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.35.tar.bz2 https://www.kernel.org/ 29 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.36.tar.bz2 https://www.kernel.org/ 30 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.37.tar.bz2 https://www.kernel.org/ 31 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.38.tar.bz2 https://www.kernel.org/ 32 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.39.tar.bz2 https://www.kernel.org/ 33 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.3.tar.bz2 https://www.kernel.org/ 34 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.40.tar.bz2 https://www.kernel.org/ 35 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.4.tar.bz2 https://www.kernel.org/ 36 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.5.tar.bz2 https://www.kernel.org/ 37 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.6.tar.bz2 https://www.kernel.org/ 38 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.7.tar.bz2 https://www.kernel.org/ 39 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.8.tar.bz2 https://www.kernel.org/ 40 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.9.tar.bz2 https://www.kernel.org/ 41 | ftp://ftp.kernel.org/pub/linux/kernel/v2.0/linux-2.0.tar.bz2 https://www.kernel.org/ 42 | -------------------------------------------------------------------------------- /doc/database-example-files/LIST: -------------------------------------------------------------------------------- 1 | linux 2.0.10 linux-2.0.10.tar.bz2 kernel 2 | linux 2.0.11 linux-2.0.11.tar.bz2 kernel 3 | linux 2.0.12 linux-2.0.12.tar.bz2 kernel 4 | linux 2.0.13 linux-2.0.13.tar.bz2 kernel 5 | linux 2.0.14 linux-2.0.14.tar.bz2 kernel 6 | linux 2.0.15 linux-2.0.15.tar.bz2 kernel 7 | linux 2.0.16 linux-2.0.16.tar.bz2 kernel 8 | linux 2.0.17 linux-2.0.17.tar.bz2 kernel 9 | linux 2.0.18 linux-2.0.18.tar.bz2 kernel 10 | linux 2.0.19 linux-2.0.19.tar.bz2 kernel 11 | linux 2.0.1 linux-2.0.1.tar.bz2 kernel 12 | linux 2.0.20 linux-2.0.20.tar.bz2 kernel 13 | linux 2.0.21 linux-2.0.21.tar.bz2 kernel 14 | linux 2.0.22 linux-2.0.22.tar.bz2 kernel 15 | linux 2.0.23 linux-2.0.23.tar.bz2 kernel 16 | linux 2.0.24 linux-2.0.24.tar.bz2 kernel 17 | linux 2.0.25 linux-2.0.25.tar.bz2 kernel 18 | linux 2.0.26 linux-2.0.26.tar.bz2 kernel 19 | linux 2.0.27 linux-2.0.27.tar.bz2 kernel 20 | linux 2.0.28 linux-2.0.28.tar.bz2 kernel 21 | linux 2.0.29 linux-2.0.29.tar.bz2 kernel 22 | linux 2.0.2 linux-2.0.2.tar.bz2 kernel 23 | linux 2.0.30 linux-2.0.30.tar.bz2 kernel 24 | linux 2.0.31 linux-2.0.31.tar.bz2 kernel 25 | linux 2.0.32 linux-2.0.32.tar.bz2 kernel 26 | linux 2.0.33 linux-2.0.33.tar.bz2 kernel 27 | linux 2.0.34 linux-2.0.34.tar.bz2 kernel 
28 | linux 2.0.35 linux-2.0.35.tar.bz2 kernel 29 | linux 2.0.36 linux-2.0.36.tar.bz2 kernel 30 | linux 2.0.37 linux-2.0.37.tar.bz2 kernel 31 | linux 2.0.38 linux-2.0.38.tar.bz2 kernel 32 | linux 2.0.39 linux-2.0.39.tar.bz2 kernel 33 | linux 2.0.3 linux-2.0.3.tar.bz2 kernel 34 | linux 2.0.40 linux-2.0.40.tar.bz2 kernel 35 | linux 2.0.4 linux-2.0.4.tar.bz2 kernel 36 | linux 2.0.5 linux-2.0.5.tar.bz2 kernel 37 | linux 2.0.6 linux-2.0.6.tar.bz2 kernel 38 | linux 2.0.7 linux-2.0.7.tar.bz2 kernel 39 | linux 2.0.8 linux-2.0.8.tar.bz2 kernel 40 | linux 2.0.9 linux-2.0.9.tar.bz2 kernel 41 | linux 2.0 linux-2.0.tar.bz2 kernel 42 | -------------------------------------------------------------------------------- /doc/database-example-files/README: -------------------------------------------------------------------------------- 1 | In this directory there are a few examples of files used during the database 2 | creation process. In this case the files are for the Linux kernel. These files 3 | are just here as an example to explain the structure. 4 | -------------------------------------------------------------------------------- /doc/defensivepublications/security-bat.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/armijnhemel/binaryanalysis/ea97b6b7617128ccf7cfa19244b91675d9bf66df/doc/defensivepublications/security-bat.pdf -------------------------------------------------------------------------------- /doc/filesystem-observations: -------------------------------------------------------------------------------- 1 | Reading contents from file systems and compressed files 2 | 3 | Apart from the kernel a device has one or more file systems. The contents of these file systems can contain all kinds of files: normal files (including more file systems), directories, device nodes, etc. 4 | 5 | http://sourceforge.net/apps/mediawiki/fuse/index.php?title=CompressedFileSystems 6 | 7 | Nullsoft installer 8 | 9 | http://nsis.svn.sourceforge.net/viewvc/nsis/NSIS/trunk/Source/exehead/fileform.h?revision=6101&view=markup 10 | -------------------------------------------------------------------------------- /doc/kernelsymbolsspec.txt: -------------------------------------------------------------------------------- 1 | Visualising symbol relationships in the Linux kernel 2 | 3 | The Linux kernel supports dynamic module loading. Kernel modules sometimes need to invoke functions that live in another piece of kernel code. As with dynamically linked user space programs, a list of symbols that are needed by the module at runtime, as well as a list of symbols that are defined/exported, is recorded at compile time. During runtime these symbols are resolved. 4 | 5 | The Linux kernel developers have marked certain symbols as "for use by GPL licensed code only" to indicate that modules that use these symbols are expected to be GPLv2 (or compatible) licensed. Linux kernel modules that have a license that is not compatible with the GPLv2 license should not use these symbols. Failure to comply can result in legal action by copyright holders who wish to uphold the license requirements. 6 | 7 | The visualisation code presented here tries to visualise dependencies between symbols used and the license. It does so by extracting symbols from the Linux kernel modules and kernel images, resolving symbols, querying a database with symbols extracted from Linux kernel source code and mapping the value of the exported symbol with the declared license of the module. 
This way we can catch if a module that is declared proprietary is actually using a GPL symbol. 8 | 9 | EXPORT_SYMBOL and EXPORT_SYMBOL_GPL 10 | 11 | The mechanism used to export normal kernel symbols is EXPORT_SYMBOL(). The GPL only kernel symbols are exported using EXPORT_SYMBOL_GPL(). If the license of a module is not GPL compatible it will not be able to call GPL only symbols. Sometimes vendors change EXPORT_SYMBOL_GPL() to EXPORT_SYMBOL() to work around these restrictions. The visualisation code can help catch these cases. 12 | 13 | Steps for visualising 14 | 15 | 1. extract symbols from Linux kernel modules 16 | 17 | The symbols in Linux kernel modules can be found in the symbol table of the ELF file. Using the command: 18 | 19 | $ readelf -W --syms 20 | 21 | this information can be extracted and processed. 22 | 23 | 2. extract symbols from the main Linux kernel image. 24 | 25 | If the Linux kernel is an ELF image then symbols can be obtained in a similar way as for Linux kernel modules. If the Linux kernel image is not an ELF image then some extra work has to be done. By looking for a known symbol that can be found in all kernel images (such as "loops_per_jiffy") and searching around it (kernel symbols are separated by NUL characters) a list of symbols can be obtained. 26 | 27 | 3. extract version information from each module and each kernel image. During runtime typically the modules and main kernel image need to have the same version. Although sometimes modules with different versions are "forced" to load into the running kernel this is rare. 28 | 29 | 4. find out for each module where each needed symbol is defined. Some filtering is done based on the versions extracted from step 3. It might turn out that some symbols are not defined anywhere, which is an error that should be reported. 30 | 31 | 5. for each symbol query the database to see what its type is. The result can be one of three things: normal kernel symbol, gpl only kernel symbol, or unknown. The unknown symbols indicate either out of tree kernel code or an omission in the database. The version extracted in step 3 is used because symbols can change over time (usually from normal kernel symbol to gpl only kernel symbol). 32 | 33 | 6. extract the declared license from each module using 34 | 35 | $ modinfo -l /path/to/kernelmodule 36 | 37 | 7. Create a graph, checking the type for each symbol and seeing if there is a mismatch between GPL symbols that are needed and a declared license that is not GPL compatible. 38 | 39 | 40 | Installing the kernel visualisation code in BAT 41 | 42 | The code should be copied into the directory with other BAT modules with the right ownership and permissions (rest as the other files). This can either be done manually, or by rebuilding the BAT binary package (see BAT manual for instructions). 43 | 44 | The following section should be added to the BAT configuration file: 45 | 46 | [findsymbols] 47 | type = aggregate 48 | module = bat.kernelsymbols 49 | method = findsymbols 50 | envvars = BAT_DB=/gpl/master/master.sqlite3:KERNELSYMBOL_SVG=1:KERNELSYMBOL_DEPENDENCIES=1 51 | noscan = text:xml:graphics:pdf:audio:video:mp4 52 | enabled = yes 53 | storetarget = images 54 | storedir = /tmp/images 55 | storetype = -graph.png:-graph.svg 56 | cleanup = yes 57 | priority = 5 58 | 59 | 60 | GraphViz should be installed as a dependency. 
Since there are buggy versions of GraphViz in older versions of Debian and Ubuntu, either a recent version of Fedora (20 being the latest at the time of writing this documentation) or Ubuntu (14.04 LTS being the latest at the time of writing this documentation) should be used. 61 | -------------------------------------------------------------------------------- /doc/knowledgebase-ideas: -------------------------------------------------------------------------------- 1 | Ideas for the knowledgebase 2 | =========================== 3 | 4 | This file describes some ideas regarding the knowledgebase (milestone 4). It is based on the flow of how a firmware might pass through a scanning system. 5 | 6 | Firmware layout 7 | 8 | A firmware can consist of file systems (compressed/uncompressed), bootloaders, kernels (compressed/uncompressed), graphic files (compressed/uncompressed) and so on. 9 | 10 | A file system can be nested inside other file systems, or appended to a kernel image, or prepended in front of a kernel image. In short: we can have nesting. 11 | 12 | |---bootloader 13 | |---kernel 14 | | \-------file system 15 | |---file system 16 | 17 | The parts that can be found in a firmware are independent of each other and can all be extracted and analyzed separately. That means that it is fairly easy to separate information in a database. 18 | 19 | blob -- unique number, index 20 | checksum -- sha256, after unpacking 21 | type -- type of the blob: kernel, type of file system, combined, picture, etc. 22 | compression -- type of compression, if any 23 | offset -- offset in the parent blob (represented as integers) 24 | parent -- parent blob, or 0 if it is top level 25 | firmware -- firmware it is part of (foreign key), although this is actually only relevant for the top level firmware 26 | 27 | +------+----------+----------+-------------+--------+--------+----------+ 28 | | blob | sha256 | type | compression | offset | parent | firmware | 29 | +------+----------+----------+-------------+--------+--------+----------+ 30 | | 500 | 99999999 | firmware | none | 0 | 0 | | 31 | +------+----------+----------+-------------+--------+--------+----------+ 32 | | 2 | abcdefgh | kernel | gzip | 64 | 500 | | 33 | +------+----------+----------+-------------+--------+--------+----------+ 34 | | 3 | fgbfsfff | ext2 | gzip | 8192 | 500 | | 35 | +------+----------+----------+-------------+--------+--------+----------+ 36 | 37 | This would describe a firmware, with a kernel blob (gzip compressed) at hex offset 0x40, followed by an ext2 file system with gzip compression at hex offset 0x2000. 38 | 39 | Every top level firmware could be identified by: 40 | 41 | id -- unique number 42 | checksum -- sha256sum 43 | version -- version number, name, if applicable 44 | product -- id (foreign key) 45 | scandate -- date a device was scanned 46 | scantype -- automatic/by hand (does this make sense? should this be included?) 47 | verified -- (not sure if I would include this information or what it would mean. Verified by hand?) 48 | public -- whether or not a scan report of this firmware is public 49 | sources -- boolean (are there sources for this device) 50 | compliant -- boolean (combine with 'sources'? What if it was fixed eventually? what if we could not scan the firmware) 51 | comments -- include a full report here? Is it necessary to make this searchable? It is kinda unstructured data. 
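As a concrete sketch, the blob table above could be written out as an SQLite schema directly from Python. The table and column names below are just the working names from these notes, not an existing BAT schema:

import sqlite3

conn = sqlite3.connect('knowledgebase.sqlite3')
cursor = conn.cursor()
# one row per unpacked blob; 'parent' is 0 for a top level blob and
# "offset" is quoted because it is also an SQL keyword
cursor.execute('''create table if not exists blob (
    blob integer primary key,
    sha256 text,
    type text,
    compression text,
    "offset" integer,
    parent integer,
    firmware integer)''')
conn.commit()
conn.close()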
52 | 53 | Every device could be identified by: 54 | 55 | id -- unique number for a device (corresponds to "product" in the previous table) 56 | vendor -- vendor name (NETGEAR, ASUS, Cisco, Linksys, etc. etc.) 57 | name -- name of the device (WRT54G, etc.) 58 | type -- subtype of the device (v5, 001, whatever is used) 59 | chipset -- Texas Instruments AR7, Broadcom BCM6851 (with a join we can reduce this to ARM, MIPS, etc.) 60 | upstream vendor -- useful information, which is typically not something that should be made publicly available 61 | 62 | An abstraction for the chipset: 63 | 64 | name -- name of the chipset 65 | vendor -- name of the vendor 66 | chipset family -- generic chip family (MIPS, ARM, etc.) 67 | 68 | for example: 69 | 70 | +---------+-------------------+-------+ 71 | | name | vendor | chip | 72 | +---------+-------------------+-------+ 73 | | AR7 | Texas Instruments | MIPS | 74 | +---------+-------------------+-------+ 75 | | BCM6851 | Broadcom | MIPS | 76 | +---------+-------------------+-------+ 77 | -------------------------------------------------------------------------------- /doc/listoftags.txt: -------------------------------------------------------------------------------- 1 | This is a list that maps tags to files. Each file could have more than one tag. 2 | 3 | empty :: empty files 4 | symlink :: symbolic link 5 | temporary :: temporary file used internally by BAT (should never be exposed to the outside world) 6 | 7 | 8 | The first basic distinction is between 'text' and 'binary': 9 | 10 | text :: files that only contain ASCII characters. 11 | binary :: files that contain other characters than ASCII characters (possibly also including ASCII characters) 12 | 13 | 14 | aiff :: AIFF/AIFF-C files 15 | androidresource :: resource file (Android specific) 16 | androidxml :: Android 'binary' XML 17 | appledouble :: Apple Double files (resource forks, etc.) 
18 | audio :: audio files (generic for all audio files that are tagged) 19 | bflt :: bFLT files (uClinux) 20 | certificate :: certificate files (generic for certificates) 21 | compressed :: various compressed files (UPX, 7z, lzip, lzop, xz, gzip, compress, bzip2, lrzip, zip, lzma) 22 | cursor :: MS Windows cursor file (like ICO) 23 | dalvik :: Android dalvik file (generic, both dex and odex) 24 | dex :: Android Dex (old Android) 25 | font :: font data (woff, otf, ttf) 26 | graphics :: graphics files (generic for all graphics: WebP, BMP, GIF, JPEG, PNG) 27 | ico :: MS Windows ICO file 28 | ihex :: Intel iHEX file 29 | messagecatalog :: GNU message catalogue 30 | mp4 :: MPEG 4 31 | odex :: Android ODEX (optimized DEX) 32 | otf :: OpenType fonts 33 | pak :: Chrome PAK files 34 | resource :: resource files (generic for all resource files: Android resources, fonts, ICS, Chrome PAK, GNU message catalog, ICO/cursor, timezone files, Apple resource forks, certificates, terminfo) 35 | resourcefork :: Apple resource fork 36 | riff :: RIFF container (WebP, WAV) 37 | rsa :: 38 | serializedjava :: serialized Java 39 | sqlite3 :: SQLite3 database file 40 | terminfo :: terminfo file 41 | timezone :: time zone file 42 | ttf :: TrueType fonts 43 | upx :: UPX files 44 | vimswap :: Vim swap file 45 | wav :: WAV file 46 | webp :: WebP graphics file 47 | woff :: WOFF font files 48 | xml :: XML file 49 | -------------------------------------------------------------------------------- /doc/pfif/README: -------------------------------------------------------------------------------- 1 | This directory contains reports for development work done on BAT in 2012 with a grant coming through PFIF (Protocol Freedom Information Foundation). These are included here for the sake of transparency. 2 | -------------------------------------------------------------------------------- /doc/pfif/workreport1: -------------------------------------------------------------------------------- 1 | Work Report milestone 1 ("Official release of BAT") 2 | 3 | The milestone "Official release of BAT" was performed between October 16 2011 and January 30 2012. In this period the following was done: 4 | 5 | * revision 5 of BAT was tagged (October 18) 6 | * configurations for building binary packages of bat-extratools for Fedora 14/15 and Ubuntu 10.10 and Debian 6 were added (October 16-18) 7 | * binary packages of bat-extratools for Fedora 14/15, Ubuntu 10.10 and Debian 6 were made and released (October 18-20) 8 | * configurations for building binary packages of BAT for Fedora 14/15, Ubuntu 10.10 and Debian 6 were added or updated (October 17-18) 9 | * binary packages of BAT for Fedora 14/15, Ubuntu 10.10 and Debian 6 were made and released (October 18-20) 10 | * a much improved user manual was written and released on the binaryanalysis.org website (November 22) 11 | 12 | To fix some packaging mistakes the following was done: 13 | 14 | * revision 6 of BAT was tagged (January 30 2012) 15 | * configurations for building binary packages of bat-extratools-java for Fedora 14/15 and Ubuntu 10.10 and Debian 6 were added (January 13 - 14 2012) 16 | * binary packages of BAT, bat-extratools and bat-extratools-java for Fedora 14/15, Ubuntu 10.10 and Debian 6 were made and released (January 30 2012) 17 | * an updated user manual was released on the binaryanalysis.org website (January 30 2012) 18 | 19 | Additionally many bugs were fixed (massive speed ups, code clean ups) and new functionality was added (November 25 2011 - January 30 2012). 
20 | -------------------------------------------------------------------------------- /doc/pfif/workreport2: -------------------------------------------------------------------------------- 1 | Work Report milestone 2 ("Processing Java class files for better extracting of strings for the ranking module") 2 | 3 | 4 | The milestone "Processing Java class files for better extracting of strings for the ranking module" was performed between October 19 and November 23. In this period the following was done: 5 | 6 | * the data model from the ranking database was changed to take programming languages into account by adding a new field "language" 7 | * a public domain third party tool to process files for Android's Dalvik ("dedexer") was added to the bat-extratools collection 8 | * wrapper code was written to process output from jcf-dump (for regular Java class files) and dedexer (for Dalvik files) 9 | * a database consisting of code from the Apache project was generated and made available for download to PFIF 10 | -------------------------------------------------------------------------------- /doc/pfif/workreport3: -------------------------------------------------------------------------------- 1 | Work Report milestone 5 ("Semi-interactive UI") 2 | 3 | The milestone "Semi-interactive UI" was performed between March 8 2012 and May 18 2012. In this period the following was done: 4 | 5 | * split bruteforce method into a frontend and a backend, to allow for different frontends 6 | * add methods to create pictures of results from ranking method and other methods 7 | * write results of bruteforce method, including program state and generated files, to an archive 8 | * write a graphical user interface that allows viewing generated archives 9 | * added the possibility to interactively launch scans from the interface and save results to a file 10 | * rework code for configuration, to make it easier to enable/disable scans from the graphical user interface 11 | * code for tagging files was expanded, plus display filters for tags were added to the graphical user interface 12 | -------------------------------------------------------------------------------- /doc/pfif/workreport4: -------------------------------------------------------------------------------- 1 | Work Report milestone 4 ("adding support for Minix file system") 2 | 3 | The milestone "adding support for Minix file system" was completed on May 21 4 | 2012. The work on this milestone was done between May 1 2012 and May 21 2012. 5 | In this period the following work was done: 6 | 7 | * made program to extract Minix v1 file systems as used on many Linux based IP 8 | cameras 9 | * tested with many IP camera firmwares containing a Minix file system and 10 | manually verified contents were correct. 11 | * added code to use the program inside the Binary Analysis Tool 12 | -------------------------------------------------------------------------------- /doc/pfif/workreport56: -------------------------------------------------------------------------------- 1 | Work Report milestone 3 ("standardized test set/training materials based on OpenWrt") 2 | 3 | The milestone "standardized test set/training materials based on OpenWrt" was 4 | completed on November 28 2012, when all materials were updated to reflect the 5 | status of the upcoming BAT 10.0 release (before the end of 2012). 6 | 7 | Work was done between November 27 2011 and November 28 2012. A DVD with the 8 | software used in the tests was sent by mail in February 2012. 
The training 9 | materials were released on November 30 2012: 10 | 11 | http://www.binaryanalysis.org/en/content/show/documentation 12 | 13 | Work Report milestone 6 ("Release incorporating milestones 2 - 5") 14 | 15 | The milestone "release incorporating milestones 2 - 5" was completed on May 16 | 22 2012, when a release containing both milestones 4 and 5 was made. Since then 17 | a new release with several bug fixes was made on October 4 and there will be 18 | another release with new features before the end of 2012. 19 | 20 | The training materials and configuration to rebuild the test materials used in 21 | this training were released separately on November 30 2012: 22 | 23 | http://www.binaryanalysis.org/en/content/show/documentation 24 | -------------------------------------------------------------------------------- /doc/processing-java: -------------------------------------------------------------------------------- 1 | Processing Java files in the ranking module 2 | 3 | Separating scanning for programming languages 4 | 5 | We have chosen to treat Java and C executables in a different way. There are good reasons for this: 6 | 7 | * strings that are very common in C programs might be very significant for Java programs or vice versa. If all strings were used for all scans, a string found in lots of C programs but in only one Java program would be considered irrelevant, even though it is very significant for Java. 8 | * although embedding Java in C programs and vice versa does happen it is not the most common situation 9 | 10 | The database in the ranking module has a separate field 'language' where for each string that has been extracted the language of the file is recorded. The language is determined by looking at the extension of the file and using a special lookup table that maps extensions to a programming language. 11 | 12 | Processing binaries and narrowing results 13 | 14 | Java binaries contain quite a bit of data that is not useful for our string based search, such as datatypes, etcetera. Also, when running the command 'strings' on Java class files sometimes additional whitespace (like a tab) is printed in front of the string data we want to use, because the Java compiler has inserted that at some point. 15 | 16 | It is possible to get just string constants out of Java binaries (both .class files and Android's DEX files) and discard all other information. For Java class files this can be done using jcf-dump (part of gcc) and processing the output. For DEX files this can be done by running Dedexer and processing its output. 17 | 18 | Granularity of scans 19 | 20 | Granularity could possibly be an issue when scanning Java class files. Executables that are generated when compiling C programs (like ELF executables or libraries) usually contain many more strings than 21 | Java class files, which are conceptually perhaps closer to object files than to executables. So the number of strings extracted from a Java class file compared to a 'normal' executable is significantly lower. Whether or not this will affect the result is currently unknown. 22 | 23 | Since Dalvik bytecode is always in one archive it is not a problem there. 
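The extension-to-language lookup boils down to a small table. A minimal sketch (the entries below are illustrative, not the actual table BAT uses):

import os.path

# illustrative extension -> language mapping
extension_to_language = {'.c': 'C', '.h': 'C', '.cpp': 'C',
                         '.java': 'Java', '.jsp': 'Java'}

def language_for_file(filename):
    ext = os.path.splitext(filename)[1].lower()
    return extension_to_language.get(ext)

print language_for_file('Foo.java')    # prints: Java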
24 | -------------------------------------------------------------------------------- /doc/running-tests.txt: -------------------------------------------------------------------------------- 1 | Notes for running tests 2 | 3 | (these are some personal notes for testing) 4 | 5 | Smoke test for unpacking 6 | ------------------------ 7 | 8 | To run a smoke test for unpacking with a large collection of firmwares it is 9 | important to do the following: 10 | 11 | * set 'cleanup' to yes so the disk doesn't fill up with unpacking directories 12 | * set 'writeoutputfile' to 'no' so the disk doesn't fill up with result files 13 | * disable ranking so the database is not hit (taking a long time) 14 | -------------------------------------------------------------------------------- /doc/testsuite/README: -------------------------------------------------------------------------------- 1 | This directory will in the future contain: 2 | 3 | * configuration information to build a test suite based on OpenWrt 4 | * documentation, standardized testing scenarios and training materials 5 | -------------------------------------------------------------------------------- /doc/testsuite/bat-training4.tex: -------------------------------------------------------------------------------- 1 | \documentclass[11pt]{beamer} 2 | 3 | \usepackage{url} 4 | \usepackage{tikz} 5 | %\author{Armijn Hemel} 6 | \title{Using the Binary Analysis Tool - part 4} 7 | \date{} 8 | 9 | \begin{document} 10 | 11 | \setlength{\parskip}{4pt} 12 | 13 | \frame{\titlepage} 14 | 15 | \frame{ 16 | \frametitle{Subjects} 17 | In this course you will learn: 18 | 19 | \begin{itemize} 20 | \item to browse results of a scan made with the Binary Analysis Tool 21 | \end{itemize} 22 | } 23 | 24 | \frame{ 25 | \frametitle{Starting the Binary Analysis Tool result viewer} 26 | The Binary Analysis Tool result viewer is a Python program using wxPython. It can be invoked using the command: 27 | 28 | \texttt{batgui} 29 | 30 | which will launch the GUI. 31 | } 32 | 33 | \frame{ 34 | \frametitle{Alternative viewer} 35 | An alternative viewer using Qt can be found at: 36 | 37 | \url{https://github.com/monkeyiq/batgui2} 38 | 39 | The rest of this training will be using the original \texttt{batgui}. 40 | } 41 | 42 | \frame{ 43 | \frametitle{Loading a file in the BAT result viewer} 44 | 45 | Via File $\rightarrow$ Open in the menu a result file can be loaded and displayed. 46 | 47 | On the left there will be a file tree, on the right results for individual files will be displayed. 48 | } 49 | 50 | \frame{ 51 | \frametitle{Filtering results in the BAT result viewer} 52 | Not every file type might be interesting. To unclutter the user interface and the directory tree a display filter is present that will hide certain file types from the directory tree. 53 | 54 | Configuration $\rightarrow$ Filter Configuration will show a list of checkboxes of file types to ignore. 
55 | } 56 | 57 | \frame{ 58 | \frametitle{Interpreting results of a scan} 59 | For each file a few attributes will be shown by default: 60 | 61 | \begin{itemize} 62 | \item name of the binary 63 | \item absolute file path 64 | \item relative file path if it is nested and parent is an unpacked compressed file or file system 65 | \item size 66 | \item SHA256 checksum 67 | \item tags 68 | \end{itemize} 69 | 70 | In addition results of file specific scans might be shown (architecture, shared libraries, etcetera) 71 | } 72 | 73 | \frame{ 74 | \frametitle{Interpreting results of advanced ranking scan} 75 | If the advanced ranking scan is enabled a lot more information becomes available: 76 | 77 | \begin{itemize} 78 | \item function names matching 79 | \item string constants matching 80 | \item version number guess 81 | \item possible licenses guess 82 | \end{itemize} 83 | 84 | This information should be carefully analysed and not blindly trusted. 85 | } 86 | 87 | \frame{ 88 | \frametitle{Interpreting results: function names} 89 | For dynamically linked ELF executables unique function names (if matched) will be displayed. 90 | 91 | Many unique function names are a clear indicator of software reuse. 92 | } 93 | 94 | \frame{ 95 | \frametitle{Interpreting results: string constants (1)} 96 | For a good classification the following things are important: 97 | 98 | \begin{itemize} 99 | \item amount of matched string constants 100 | \item distribution of matched string constants 101 | \end{itemize} 102 | 103 | If there are only a few strings that can be matched, the results are likely to be not very reliable. 104 | 105 | An even distribution of scores, combined with few matched unique strings and non-unique strings, means that nothing was reliably matched. 106 | } 107 | 108 | \frame{ 109 | \frametitle{Interpreting results: string constants (2)} 110 | The advanced ranking scan will create two pie charts. The first pie chart details how the algorithm classified the strings (unique matches, assigned matches, unmatched, and so on); the second pie chart depicts the score for each package. 111 | 112 | The first pie chart determines the fidelity of the second pie chart: if many strings (dozens, hundreds) could be matched and assigned to a package (either a unique or non-unique match), then the second pie chart will have a high fidelity. If just a handful of strings could be matched, the second pie chart has a low fidelity. 113 | } 114 | 115 | \frame{ 116 | \frametitle{Interpreting results: version numbers for unique strings} 117 | Based on unique strings BAT tries to determine version numbers of matched packages. 118 | 119 | Because version number guessing is tied to unique strings, it is not reliable if there are just a few unique strings. 120 | } 121 | 122 | \frame{ 123 | \frametitle{Interpreting results: license guess} 124 | Based on unique strings BAT tries to determine possible licenses for matched packages. 125 | 126 | License guessing is likely to be unreliable if there are just a few unique strings. Versions are not taken into account (yet) when determining the license: all possible licenses are reported, even if the software was relicensed at some version. 127 | } 128 | 129 | \frame{ 130 | \frametitle{Conclusion} 131 | In this course you have learned: 132 | 133 | \begin{itemize} 134 | \item to browse results of a scan made with the Binary Analysis Tool 135 | \end{itemize} 136 | 137 | In the next course we will dig into how the Binary Analysis Tool can be extended. 
138 | } 139 | \end{document} 140 | -------------------------------------------------------------------------------- /doc/testsuite/bat-training6.tex: -------------------------------------------------------------------------------- 1 | \documentclass[11pt]{beamer} 2 | 3 | \usepackage{url} 4 | \usepackage{tikz} 5 | %\author{Armijn Hemel} 6 | \title{Using the Binary Analysis Tool - part 6} 7 | \date{} 8 | 9 | \begin{document} 10 | 11 | \setlength{\parskip}{4pt} 12 | 13 | \frame{\titlepage} 14 | 15 | \frame{ 16 | \frametitle{Subjects} 17 | In this course you will learn: 18 | 19 | \begin{itemize} 20 | \item to generate a database for BAT ranking 21 | \item to configure BAT to use the ranking database 22 | \end{itemize} 23 | } 24 | 25 | \frame{ 26 | \frametitle{Collecting a dataset} 27 | Before you can generate the database you need a dataset. A good dataset can be built from downloads from, for example: 28 | 29 | \begin{itemize} 30 | \item upstream projects 31 | \item distributions 32 | \end{itemize} 33 | 34 | The database works best if there is a wide range of software in the database. If there is too little software in the database there will be mismatches, possibly falsely detecting software. 35 | } 36 | 37 | \frame{ 38 | \frametitle{Generating a file list for database extraction} 39 | The database generating script that is used expects a file listing all files that should be processed. There is a helper script called \texttt{generatelist.py} that helps generate this list. It can be found in the source repository of BAT in the directory \texttt{maintenance}. 40 | 41 | \texttt{python generatelist.py -f /path/to/dir/with/files -o origin | sort > /path/to/dir/with/files/LIST} 42 | 43 | The parameter \texttt{-o} allows you to set an origin of where the source was downloaded, for example \texttt{debian} or \texttt{gnome}. If not set, it will be set to \texttt{unknown}. 44 | } 45 | 46 | \frame{ 47 | \frametitle{Generating the database} 48 | The script to generate the database is called \texttt{createdb.py}. It can be found in the source repository of BAT in the directory \texttt{maintenance}. It can extract: 49 | 50 | \begin{itemize} 51 | \item string constants (\texttt{xgettext}) and function names (\texttt{ctags}) 52 | \item license information (using Ninka and FOSSology) 53 | \item copyright information (using FOSSology) 54 | \item configuration from Linux kernel Makefiles 55 | \end{itemize} 56 | 57 | It can be invoked as follows: 58 | 59 | \texttt{python createdb.py -c /path/to/configurationfile -d /path/to/database -f /path/to/dir/with/files} 60 | } 61 | 62 | \frame{ 63 | \frametitle{Installing Ninka} 64 | The Ninka scanner can be used to extract licensing information from source code files. It can be downloaded from: 65 | 66 | \url{https://github.com/dmgerman/ninka/} 67 | 68 | As of the time of writing the latest version is \texttt{1.1}. This version number is hardcoded a few times in \texttt{createdb.py} and should be changed if the version of Ninka changes. 69 | 70 | Ninka can be installed as follows: 71 | 72 | \begin{enumerate} 73 | \item unpack in \texttt{/tmp} 74 | \item \texttt{cd /tmp/ninka-1.1/} 75 | \item \texttt{cd comments} 76 | \item \texttt{make clean; make} 77 | \end{enumerate} 78 | } 79 | 80 | \frame{ 81 | \frametitle{Installing FOSSology} 82 | Binary packages are available for most major distributions. 
Alternatively, packages can be downloaded from: 83 | 84 | \url{http://www.fossology.org/} 85 | } 86 | 87 | \begin{frame}[fragile] 88 | \frametitle{Creating the configuration file} 89 | The standard BAT distribution comes with an example configuration file for \texttt{createdb.py}: 90 | 91 | \begin{verbatim} 92 | [extractconfig] 93 | configtype = global 94 | database = /tmp/test/master.sqlite3 95 | scanlicense = yes 96 | licensedb = /tmp/test/licenses.sqlite3 97 | ninkacommentsdb = /tmp/test/ninkacomments.sqlite3 98 | scancopyright = yes 99 | cleanup = yes 100 | wipe = no 101 | \end{verbatim} 102 | \end{frame} 103 | 104 | \frame{ 105 | \frametitle{Running the database creation script} 106 | The database extraction script can be run as follows: 107 | 108 | \texttt{python createdb.py -c /path/to/configuration/file -f /path/to/directory/with/sources} 109 | 110 | This will create, depending on the configuration, one to three files: the main database, a licenses database and a temporary database for Ninka, which can be ignored or discarded later. 111 | } 112 | 113 | \frame{ 114 | \frametitle{Caching databases} 115 | The ranking scan uses several caching databases: 116 | 117 | \begin{itemize} 118 | \item strings for each programming language 119 | \item average amount of strings per package for each programming language 120 | \item function names 121 | \end{itemize} 122 | 123 | If caching databases are not found then the ranking code will not work properly. 124 | } 125 | 126 | \frame{ 127 | \frametitle{Configuring the ranking scan} 128 | The functionality for the ranking scan is split in two separate scans: 129 | 130 | \begin{enumerate} 131 | \item identifier extraction (in the \texttt{[identifier]} scan) 132 | \item identifier lookup and scoring (in the \texttt{[versionlicensecopyright]} scan) 133 | \end{enumerate} 134 | } 135 | 136 | \begin{frame}[fragile] 137 | \frametitle{Further ranking configuration} 138 | To enable license scanning and reporting the parameter \texttt{BAT\_RANKING\_LICENSE} should be set to \texttt{1}: 139 | 140 | \begin{verbatim} 141 | [ranking] 142 | ... 143 | envvars = ... 144 | :BAT_RANKING_LICENSE=1 145 | ... 146 | \end{verbatim} 147 | \end{frame} 148 | 149 | \frame{ 150 | \frametitle{Conclusion} 151 | In this course you have learned about: 152 | 153 | \begin{itemize} 154 | \item to generate a database for BAT ranking 155 | \item to configure BAT to use the ranking database 156 | \end{itemize} 157 | 158 | This concludes the Binary Analysis Tool training. 
159 | } 160 | 161 | \end{document} 162 | -------------------------------------------------------------------------------- /doc/testsuite/openwrt-configs/010-fix_mixed_implicit_and_normal_rules_error.patch: -------------------------------------------------------------------------------- 1 | --- a/Makefile 2 | +++ b/Makefile 3 | @@ -428,7 +428,7 @@ ifeq ($(config-targets),1) 4 | -include $(srctree)/arch/$(ARCH)/Makefile 5 | export KBUILD_DEFCONFIG 6 | 7 | -config %config: scripts_basic outputmakefile FORCE 8 | +%config: scripts_basic outputmakefile FORCE 9 | $(Q)mkdir -p include 10 | $(Q)$(MAKE) $(build)=scripts/kconfig $@ 11 | $(Q)$(MAKE) -C $(srctree) KBUILD_SRC= .kernelrelease 12 | @@ -1276,7 +1276,7 @@ endif 13 | $(Q)$(MAKE) $(build)=$(build-dir) $(target-dir)$(notdir $@) 14 | 15 | # Modules 16 | -/ %/: prepare scripts FORCE 17 | +%/: prepare scripts FORCE 18 | $(Q)$(MAKE) KBUILD_MODULES=$(if $(CONFIG_MODULES),1) \ 19 | $(build)=$(build-dir) 20 | %.ko: prepare scripts FORCE 21 | -------------------------------------------------------------------------------- /doc/testsuite/openwrt-configs/README: -------------------------------------------------------------------------------- 1 | 010-fix_mixed_implicit_and_normal_rules_error.patch :: this is a patch for BusyBox to ensure Backfire 10.03 can build on systems with a newer version of GNU make. It should be placed in package/busybox/patches/ 2 | 3 | It was copied from upstream OpenWrt. 4 | -------------------------------------------------------------------------------- /doc/testsuite/testoutput/README: -------------------------------------------------------------------------------- 1 | This directory contains test output from the test firmwares as generated by BAT 9.0 with the default configuration. The output has been slightly modified so that no compiled binaries are distributed and a GPL license violation has been introduced. 2 | 3 | These results can be browsed with the BAT results viewer. 4 | 5 | These results are 6 | 7 | Included are: 8 | 9 | * ... 10 | * ... 11 | -------------------------------------------------------------------------------- /doc/testsuite/training-notes4: -------------------------------------------------------------------------------- 1 | Notes/transcript for presentation. 2 | 3 | File: bat-training4.pdf 4 | 5 | Slide 1: no notes 6 | 7 | Slide 2: The Binary Analysis Tool has a simple viewer that allows you to view 8 | and browse results of a scan (stored as a tar archive) in a simple graphical 9 | interface. 10 | 11 | Slide 3: no notes 12 | 13 | Slide 4: no notes 14 | 15 | Slide 5: Opening a result file in the BAT results viewer is simple: via 16 | File -> Open a file selection menu will be opened, after which a file can be 17 | loaded. The results file will be unpacked. On the left there will be a file 18 | tree that can be browsed, on the right results will be displayed in tabs. 19 | 20 | Slide 6: Because not every file will be interesting it is possible to filter 21 | uninteresting files from the file tree. Especially in larger firmwares, such 22 | as Android images, there can be thousands of files, like graphics files or 23 | text files or Android resource files, that are not directly interesting for 24 | license compliance engineering, but which clutter the interface and make 25 | it easy to get lost in the amount of data. 26 | 27 | The filter can be configured via Configuration -> Filter Configuration. 
28 | 29 | Slide 7: For each file a few attributes (as far as applicable) will be shown, 30 | like size, checksum, tags, file name, and absolute and relative paths. 31 | Depending on the file some other results (like scan results, architecture, list 32 | of dynamically linked libraries) might be shown as well. 33 | 34 | Slide 8: The advanced ranking scan will give access to a lot more information, 35 | like unique function names, or which strings were matched to a database of 36 | strings and possibly a version number guess, if enough data is available. 37 | 38 | This information should be carefully inspected and not blindly trusted. 39 | 40 | Slide 9: no notes 41 | 42 | Slide 10 + 11: Interpreting the string constants takes some care. To make a good 43 | classification of a program it is important to look at how many strings are 44 | matched and how they are distributed over the various packages. An even 45 | distribution of the score, with very few matched strings, is a clear miss and 46 | means that nothing has been recognized. 47 | 48 | If there are only a few strings that can be recognized (whether unique or 49 | non-unique), the results are likely to be not very good either. 50 | In short: the more strings that can be recognized, the better the fidelity. 51 | 52 | Slide 12: Version numbers are determined based on the number of unique strings. 53 | If there are enough unique strings it is possible to reliably guess the version 54 | number of the program because there are slight differences between strings 55 | between versions. 56 | 57 | If just a few unique strings were matched then version guessing will not be 58 | very reliable. 59 | 60 | Slide 13: Similarly to version numbers a guess is made of possible licenses. 61 | The possible licenses are also based upon unique matches only. If there are 62 | few unique matches this will not be reliable. 63 | 64 | Also, currently guessed versions are not taken into account when determining 65 | the license. Since code can have a different license per version this is 66 | important. For now all possible licenses are reported. 67 | 68 | Slide 14: no notes 69 | -------------------------------------------------------------------------------- /doc/testsuite/training-notes5: -------------------------------------------------------------------------------- 1 | Notes/transcript for presentation. 2 | 3 | File: bat-training5.pdf 4 | 5 | Slide 1: no notes 6 | 7 | Slide 2: In this course we look at extending the Binary Analysis Tool by 8 | adding new identifiers and new scans. 9 | 10 | Slide 3: Identifiers of supported compressed files, file systems and media 11 | files are hardcoded in a file in the BAT source tree, namely bat/fsmagic.py. 12 | The structure is a simple Python dictionary called "fsmagic" that can easily be 13 | extended. 14 | 15 | Depending on the file type the identifier might start at the beginning of the 16 | file, the end of the file, or somewhere after the beginning of the file. For 17 | identifiers that start after the beginning of the file a special dictionary 18 | with "correction" offsets should also be modified. 19 | 20 | A new identifier will only be scanned once there is a scan that actually 21 | uses it and declares it in the configuration file in the "magic" configuration 22 | parameter. 23 | 24 | Slide 4: Identifiers can be used in scans by accessing the dictionary "offsets" 25 | that is passed to each of the scans. The keys for the dictionary are the same 26 | keys as in the "fsmagic" dictionary. 
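A minimal sketch of what slides 3 and 4 describe. The 'frobfs' name and its magic bytes are made up for illustration, 'correction' is an assumed name for the dictionary with correction offsets, and the scan signature mirrors the unpack scans in the BAT sources:

# in bat/fsmagic.py identifiers live in a plain dictionary, roughly:
fsmagic = {'frobfs': 'FROB'}    # hypothetical file system magic

# identifiers that start after the beginning of the file also need an
# entry with their correction offset ('correction' is an assumed name)
correction = {'frobfs': 0}

# a scan receives the hits via the 'offsets' dictionary, keyed by the
# same names as in fsmagic
def searchUnpackFrobfs(filename, tempdir=None, blacklist=[], offsets={}, scanenv={}, debug=False):
    for offset in offsets.get('frobfs', []):
        pass    # carve the data at 'offset' and try to unpack it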
27 | 28 | Slide 5: Prerun scans are used to quickly tag files. A prerun scan takes a few parameters, 29 | including the offsets and previous tags that were given to the file, and returns 30 | a list of new tags the scan has found. 31 | 32 | Slide 6: no comments 33 | 34 | Slide 7 + 8: Unpacking scans try to carve a file from a larger file and unpack 35 | its contents. Parameters to methods include a blacklist, which contains tuples 36 | with start/end values of byte ranges in the file that should be ignored, 37 | since these ranges have already been scanned for example by another unpack 38 | scan. 39 | 40 | The return value of an unpack scan should be a list with four values: 41 | 1. a list of tuples containing the names of directories where files have been 42 | unpacked and the byte offset in the original file where the compressed file or 43 | file system or media file can be found 44 | 2. an updated blacklist to which new byte ranges have been added in case of 45 | successful unpacking 46 | 3. a list of tags, the same as are returned by prerun scans 47 | 4. hints for the scanning engine, for example if the type of results is already 48 | known in advance (example: PNG unpacking) 49 | 50 | Slide 9: A new leaf scan is fairly simple and only has a few parameters, namely 51 | the full path of the file, the blacklist with byte ranges that should be 52 | ignored, and environment variables. 53 | 54 | The result can be an arbitrary Python value. Depending on whether XML output is 55 | enabled it might be necessary to write a custom XML pretty printer for return 56 | values that are more complex than basic types (integer, boolean, float, 57 | strings). 58 | 59 | Slide 10: no comments 60 | 61 | Slide 11: Postrun scans do not alter the scan results, but merely process them 62 | to for example create a different representation of the results, like reports 63 | or graphics. 64 | 65 | The parameters are the results of the previous scans, plus some extra meta 66 | information like paths in the file system and environment variables. 67 | 68 | There is no return value for postrun scans. 69 | 70 | Slide 12: no comments 71 | -------------------------------------------------------------------------------- /doc/testsuite/training-notes6: -------------------------------------------------------------------------------- 1 | Notes/transcript for presentation. 2 | 3 | File: bat-training6.pdf 4 | 5 | Slide 1: no notes 6 | 7 | Slide 2: In this final course we will look at how to generate a database for 8 | the advanced ranking scan discussed in previous courses, plus how to configure 9 | it so BAT can use it. 10 | 11 | Slide 3: A good dataset is important for the ranking to work. If there is too 12 | little data the classifications will be strongly biased to what is in the 13 | database and programs will be wrongly identified. 14 | 15 | The database works best if there is a wide range of software in it. Good 16 | sources for software are open source projects and Linux distributions. 17 | 18 | Slide 4: There is a database generation script available in the BAT source code 19 | repository. This script processes files, but it needs to have a list of files 20 | with some metadata (like package name, version, and the origin of the download). 21 | Creating this list is very easy (it is a simple format), but if there are many 22 | source code archives that need to be processed it can be quite a lot of work. 23 | 24 | To help with that work there is a script that can generate these lists. 
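The list format itself is one whitespace-separated line per archive with package, version, file name and origin, as in doc/database-example-files/LIST. A minimal hand-rolled sketch, assuming archives are named package-version.tar.bz2 (a naive assumption, which is exactly why generatelist.py exists):

import os

def generatelist(directory, origin='unknown'):
    for f in sorted(os.listdir(directory)):
        if not f.endswith('.tar.bz2'):
            continue
        # naive: assume everything after the last '-' is the version
        (package, version) = f[:-len('.tar.bz2')].rsplit('-', 1)
        print '%s %s %s %s' % (package, version, f, origin)

generatelist('/path/to/dir/with/files', origin='kernel')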
25 | 26 | Slide 5: The actual script that processes the source code files can be found in 27 | the BAT source code repository. It extracts string constants and function names 28 | (for C/C++ programs) and can be instructed to extract license texts from source 29 | code using the Ninka license scanner. 30 | 31 | Slide 6: The Ninka license scanner can be used to extract license texts. 32 | Installing it is a bit tricky and BAT has, right now, a few hardcoded paths for 33 | Ninka, so it needs to be installed in a (semi-)fixed location before licenses 34 | can be scanned with the database extraction script. 35 | 36 | Slide 7: Similarly, FOSSology can be used for extracting licenses and copyright 37 | statements like e-mail addresses, URLs and more. For this FOSSology needs to 38 | be installed. Installing FOSSology is quite complex, but ready-made 39 | packages are available for most distributions. 40 | 41 | Slide 8: The ranking scan makes use of several caching databases to speed up 42 | scanning. The caching databases contain information that is needed by the 43 | scanning process, such as the average number of strings in a package, string 44 | constants, and function names. 45 | 46 | Slide 9: 47 | 48 | Slide 10: 49 | 50 | Slide 11: no comments 51 | 52 | Slide 12: If licensing information (determined by looking at licenses of files 53 | in which unique strings were found) should be reported as well, another 54 | environment variable called BAT_RANKING_LICENSE should be set to 1. 55 | 56 | Slide 13: no comments 57 | -------------------------------------------------------------------------------- /src/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include bat-scan.config 2 | include bat/* 3 | include README 4 | include LICENSE 5 | -------------------------------------------------------------------------------- /src/bat/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/armijnhemel/binaryanalysis/ea97b6b7617128ccf7cfa19244b91675d9bf66df/src/bat/__init__.py -------------------------------------------------------------------------------- /src/bat/batxor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Binary Analysis Tool 4 | # Copyright 2012-2015 Armijn Hemel for Tjaldur Software Governance Solutions 5 | # Licensed under Apache 2.0, see LICENSE file for details 6 | 7 | ''' 8 | Sometimes files such as firmwares are encrypted. The level of encryption 9 | varies from keys and signature verification at boot time to very simple 10 | "encryption" by simply XORing with a byte string. 11 | 12 | The code here scans binary files for certain known XOR parameters and applies 13 | them, but only if no other scan succeeds. 14 | 15 | For this we need to keep some state and possibly even delete the file only later, 16 | by tagging it as 'temporary' and removing it later on. 17 | ''' 18 | 19 | import sys 20 | import os 21 | import os.path 22 | import tempfile 23 | import mmap 24 | import fwunpack 25 | 26 | # some of the signatures we know about: 27 | # * Splashtop (fast boot environment) 28 | # * Bococom router series (2.6.21, Ralink chipset) 29 | # * Sitecom WL-340 and WL-342 30 | 31 | # Finding new signatures is done by hand. A helper tool (findxor.py) can be 32 | # found in the scripts directory. 33 | 34 | # The signatures of various known XOR "encrypted" firmwares. 
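# Each signature below is a list of single-byte strings that together form
# the XOR key. unpackXOR() recovers the plaintext by XORing byte i of the
# input with key[i % len(key)].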
35 | signatures = { 'splashtop': ['\x51', '\x57', '\x45', '\x52'] 36 | , 'bococom': ['\x3a', '\x93', '\xa2', '\x95', '\xc3', '\x63', '\x48', '\x45', '\x58', '\x09', '\x12', '\x03', '\x08', '\xc8', '\x3c'] 37 | , 'sitecom': ['\x78', '\x3c', '\x9e', '\xcf', '\x67', '\xb3', '\x59', '\xac'] 38 | , 'edimax': ['\x88','\x44','\xa2','\xd1','\x68','\xb4','\x5a','\x2d'] 39 | } 40 | 41 | def unpackXOR(filename, sig, tempdir=None): 42 | tmpdir = fwunpack.unpacksetup(tempdir) 43 | tmpfile = tempfile.mkstemp(dir=tmpdir) 44 | os.fdopen(tmpfile[0]).close() 45 | 46 | fwunpack.unpackFile(filename, 0, tmpfile[1], tmpdir, modify=True) 47 | datafile = open(filename) 48 | datafile.seek(0) 49 | data = datafile.read(1000000) 50 | 51 | # read data, XOR, write data out again 52 | f2 = open(tmpfile[1], 'w') 53 | counter = 0 54 | while data != '': 55 | for i in data: 56 | f2.write(chr(ord(i) ^ ord(signatures[sig][counter]))) 57 | counter = (counter+1)%len(signatures[sig]) 58 | data = datafile.read(1000000) 59 | f2.close() 60 | datafile.close() 61 | return tmpdir 62 | 63 | def searchUnpackXOR(filename, tempdir=None, blacklist=[], offsets={}, scanenv={}, debug=False): 64 | hints = [] 65 | diroffsets = [] 66 | 67 | # If something else already unpacked (parts) of the file we're not 68 | # going to continue. 69 | if 'BAT_UNPACKED' in scanenv: 70 | if scanenv['BAT_UNPACKED'] == 'True': 71 | return (diroffsets, blacklist, [], hints) 72 | 73 | if 'XOR_MINIMUM' in scanenv: 74 | xor_minimum = int(scanenv['XOR_MINIMUM']) 75 | else: 76 | xor_minimum = 0 77 | # only continue if no other scan has succeeded 78 | if blacklist != []: 79 | return (diroffsets, blacklist, [], hints) 80 | counter = 1 81 | 82 | # only continue if we actually have signatures 83 | if signatures == {}: 84 | return (diroffsets, blacklist, [], hints) 85 | 86 | # open the file, so we can search for signatures 87 | # TODO: use the identifier search we have elsewhere. 88 | datafile = os.open(filename, os.O_RDONLY) 89 | datamm = mmap.mmap(datafile, 0, access=mmap.ACCESS_READ) 90 | 91 | tmpdir = fwunpack.dirsetup(tempdir, filename, "xor", counter) 92 | res = None 93 | for s in signatures: 94 | bs = reduce(lambda x, y: x + y, signatures[s]) 95 | # find all instances of the signature. We might want to tweak 96 | # this a bit. 97 | bsres = datamm.find(bs) 98 | if bsres == -1: 99 | continue 100 | siginstances = [bsres] 101 | while bsres != -1: 102 | bsres = datamm.find(bs, bsres +1) 103 | if bsres != -1: 104 | siginstances.append(bsres) 105 | if len(siginstances) > 0: 106 | if len(siginstances) < xor_minimum: 107 | continue 108 | res = unpackXOR(filename, s, tmpdir) 109 | if res != None: 110 | diroffsets.append((res, 0, os.stat(filename).st_size)) 111 | # blacklist the whole file 112 | blacklist.append((0, os.stat(filename).st_size)) 113 | break 114 | datamm.close() 115 | os.close(datafile) 116 | if res == None: 117 | os.rmdir(tmpdir) 118 | return (diroffsets, blacklist, [], hints) 119 | return (diroffsets, blacklist, ['temporary'], hints) 120 | -------------------------------------------------------------------------------- /src/bat/busyboxversion.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Binary Analysis Tool 4 | # Copyright 2009-2015 Armijn Hemel for Tjaldur Software Governance Solutions 5 | # Licensed under Apache 2.0, see LICENSE file for details 6 | 7 | # Stand alone module to determine the version of BusyBox. 
Has a method for being called 8 | # from one of the default scans, but can also be invoked separately. 9 | 10 | import sys 11 | import os 12 | import tempfile 13 | import copy 14 | from optparse import OptionParser 15 | 16 | import busybox 17 | import extractor 18 | 19 | def busybox_version(filename, tags, cursor, conn, filehashes, blacklist=[], scanenv={}, scandebug=False, unpacktempdir=None): 20 | try: 21 | filesize = os.stat(filename).st_size 22 | ## if the whole file is blacklisted, we don't have to scan 23 | if blacklist != []: 24 | if extractor.inblacklist(0, blacklist) == filesize: 25 | return None 26 | ## make a copy and add a bogus value for the last 27 | ## byte to a temporary blacklist to make the loop work 28 | ## well. 29 | blacklist_tmp = copy.deepcopy(blacklist) 30 | blacklist_tmp.append((filesize,filesize)) 31 | datafile = open(filename, 'rb') 32 | lastindex = 0 33 | datafile.seek(lastindex) 34 | for i in blacklist_tmp: 35 | if i[0] == lastindex: 36 | lastindex = i[1] - 1 37 | datafile.seek(lastindex) 38 | continue 39 | if i[0] > lastindex: 40 | ## check if there actually is enough data to do a search first 41 | ## "BusyBox v" has length 9; the version has at least 2 digits and a dot 42 | if (i[0] - lastindex) < 12: 43 | lastindex = i[1] - 1 44 | datafile.seek(lastindex) 45 | continue 46 | data = datafile.read(i[0] - lastindex) 47 | tmpfile = tempfile.mkstemp() 48 | os.write(tmpfile[0], data) 49 | os.fdopen(tmpfile[0]).close() 50 | bbres = busybox.extract_version(tmpfile[1]) 51 | os.unlink(tmpfile[1]) 52 | ## set lastindex to the next 53 | lastindex = i[1] - 1 54 | datafile.seek(lastindex) 55 | if bbres != None: 56 | break 57 | datafile.close() 58 | else: 59 | bbres = busybox.extract_version(filename) 60 | if bbres != None: 61 | return (['busybox'], bbres) 62 | except Exception, e: 63 | return None 64 | 65 | def main(argv): 66 | parser = OptionParser() 67 | parser.add_option("-b", "--binary", dest="bb", help="path to BusyBox binary", metavar="FILE") 68 | (options, args) = parser.parse_args() 69 | if options.bb == None: 70 | parser.error("Path to BusyBox binary needed") 71 | res = busybox_version(options.bb, None, None, {}, []) 72 | # busybox_version() returns None if no version could be determined 73 | if res != None: 74 | print res[1] 75 | else: 76 | print "No BusyBox found" 77 | 78 | if __name__ == "__main__": 79 | main(sys.argv) 80 | -------------------------------------------------------------------------------- /src/bat/ext2.py: -------------------------------------------------------------------------------- 1 | # Binary Analysis Tool 2 | # Copyright 2009-2016 Armijn Hemel for Tjaldur Software Governance Solutions 3 | # Licensed under Apache 2.0, see LICENSE file for details 4 | 5 | import os 6 | import subprocess 7 | import tempfile 8 | 9 | ''' 10 | Module to 'unpack' an ext2 file system. We are taking a shortcut. We're using 11 | e2cp to copy files, but we're recreating the directories in the file system 12 | ourselves. We can get this information from the output of e2ls. 13 | 14 | The second column displays the Ext2/Linux mode flags, which can be found in 15 | the header files from e2fsprogs.
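For example, a (hypothetical) line of 'e2ls -l' output could look like:

   12  100755      0      0    404467  1-Jan-2010 12:00 busybox

where the second column holds the mode: entries starting with 40 (such as
40755) are directories, and entries in the 100-117 range are regular files
(the range above 100 covers suid/sgid/sticky variants). The code below strips
the last three digits and looks at what remains.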
16 | 17 | We are mostly interested in regular files and directories: 18 | 19 | #define LINUX_S_IFREG 0100000 20 | #define LINUX_S_IFDIR 0040000 21 | ''' 22 | 23 | def copyext2fs(source, target=None): 24 | if target == None: 25 | targetdir = tempfile.mkdtemp() 26 | else: 27 | targetdir = target 28 | 29 | # now walk each directory and copy files 30 | scandirs = [""] 31 | unpackfail = False 32 | while len(scandirs) != 0: 33 | newscandirs = set() 34 | for scandir in scandirs: 35 | p = subprocess.Popen(['e2ls', '-l', source + ":" + scandir], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 36 | (stanout, stanerr) = p.communicate() 37 | if p.returncode != 0: 38 | # This could happen is for example the file system is corrupted 39 | # and inodes are damaged 40 | unpackfail = True 41 | break 42 | if stanout.strip() == "No files found!": 43 | continue 44 | for i in stanout.strip().split("\n"): 45 | if i.startswith(">"): 46 | continue 47 | isplits = i.split() 48 | if len(isplits[1]) < 5: 49 | # bogus file system, so continue 50 | return None 51 | modeflag = int(isplits[1][0:-3]) 52 | if len(isplits) < 8: 53 | continue 54 | else: 55 | filename = isplits[7] 56 | if modeflag == 40: 57 | newscandirs.add(scandir + "/" + filename) 58 | os.mkdir(target + "/" + scandir + "/" + filename) 59 | # also take sticky bit, suid, sgid, etc. into account 60 | elif modeflag >= 100 and modeflag < 120: 61 | copypath = source + ":" + scandir + "/" + filename 62 | p = subprocess.Popen(['e2cp', copypath, "-d", os.path.normpath(target + "/" + scandir)], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) 63 | (stanout, stanerr) = p.communicate() 64 | if p.returncode != 0: 65 | continue 66 | scandirs = newscandirs 67 | if unpackfail: 68 | return None 69 | return targetdir 70 | -------------------------------------------------------------------------------- /src/bat/file2package.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Binary Analysis Tool 4 | # Copyright 2012-2015 Armijn Hemel for Tjaldur Software Governance Solutions 5 | # Licensed under Apache 2.0, see LICENSE file for details 6 | 7 | ''' 8 | This is a plugin for the Binary Analysis Tool. Its purpose is to determine the 9 | package a file belongs to based on the name of a package. This information is 10 | mined from distributions like Fedora and Debian. 11 | ''' 12 | 13 | import os 14 | import os.path 15 | import sys 16 | import subprocess 17 | import copy 18 | import Queue 19 | import cPickle 20 | import multiprocessing 21 | from multiprocessing import Process, Lock 22 | from multiprocessing.sharedctypes import Value, Array 23 | 24 | def grabpackage(scanqueue, reportqueue, cursor, query): 25 | # select the packages that are available. 
It would be better to also have the directory 26 | # name available, so we should get rid of 'path' and use something else that is better 27 | # suited 28 | while True: 29 | filename = scanqueue.get(timeout=2592000) 30 | cursor.execute(query, (os.path.basename(filename),)) 31 | res = cursor.fetchall() 32 | if res != []: 33 | returnres = [] 34 | # TODO: filter results, only return files that are not in tons of packages 35 | for r in res: 36 | (package, packageversion, distribution, distroversion) = r 37 | distrores = {} 38 | distrores['package'] = package 39 | distrores['packageversion'] = packageversion 40 | distrores['distribution'] = distribution 41 | distrores['distributionversion'] = distroversion 42 | returnres.append(distrores) 43 | reportqueue.put({filename: returnres}) 44 | scanqueue.task_done() 45 | 46 | def filename2package(unpackreports, scantempdir, topleveldir, processors, scanenv, batcursors, batcons, scandebug=False, unpacktempdir=None): 47 | processtasks = [] 48 | for i in unpackreports: 49 | if not 'checksum' in unpackreports[i]: 50 | continue 51 | processtasks.append(i) 52 | 53 | if processors == None: 54 | processamount = 1 55 | else: 56 | processamount = processors 57 | # create a queue for tasks, with a few threads reading from the queue 58 | # and looking up results and putting them in a result queue 59 | query = "select distinct package, packageversion, source, distroversion from file where filename = %s" 60 | scanmanager = multiprocessing.Manager() 61 | scanqueue = multiprocessing.JoinableQueue(maxsize=0) 62 | reportqueue = scanmanager.Queue(maxsize=0) 63 | processpool = [] 64 | 65 | map(lambda x: scanqueue.put(x), processtasks) 66 | minprocessamount = min(len(processtasks), processamount) 67 | res = [] 68 | 69 | for i in range(0,minprocessamount): 70 | p = multiprocessing.Process(target=grabpackage, args=(scanqueue,reportqueue,batcursors[i],query)) 71 | processpool.append(p) 72 | p.start() 73 | 74 | scanqueue.join() 75 | 76 | while True: 77 | try: 78 | val = reportqueue.get_nowait() 79 | res.append(val) 80 | reportqueue.task_done() 81 | except Queue.Empty, e: 82 | # Queue is empty 83 | break 84 | reportqueue.join() 85 | 86 | for p in processpool: 87 | p.terminate() 88 | 89 | for r in res: 90 | filename = r.keys()[0] 91 | filehash = unpackreports[filename]['checksum'] 92 | 93 | # read pickle file 94 | leaf_file = open(os.path.join(topleveldir, "filereports", "%s-filereport.pickle" % filehash), 'rb') 95 | leafreports = cPickle.load(leaf_file) 96 | leaf_file.close() 97 | 98 | # write pickle file 99 | leafreports['file2package'] = r[filename] 100 | leafreports['tags'].append('file2package') 101 | unpackreports[filename]['tags'].append('file2package') 102 | leaf_file = open(os.path.join(topleveldir, "filereports", "%s-filereport.pickle" % filehash), 'wb') 103 | cPickle.dump(leafreports, leaf_file) 104 | leaf_file.close() 105 | 106 | returnres = res 107 | 108 | def file2packagesetup(scanenv, cursor, conn, debug=False): 109 | if cursor == None: 110 | return (False, {}) 111 | cursor.execute("select table_name from information_schema.tables where table_type='BASE TABLE' and table_schema='public'") 112 | tablenames = map(lambda x: x[0], cursor.fetchall()) 113 | conn.commit() 114 | if not 'file' in tablenames: 115 | return (False, {}) 116 | return (True, scanenv) 117 | -------------------------------------------------------------------------------- /src/bat/findduplicates.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env 
python 2 | 3 | # Binary Analysis Tool 4 | # Copyright 2013-2016 Armijn Hemel for Tjaldur Software Governance Solutions 5 | # Licensed under Apache 2.0, see LICENSE file for details 6 | 7 | import sys 8 | 9 | ''' 10 | This aggregate scan traverses the unpackreports and reports all duplicate 11 | files as a list of lists of identical files. 12 | ''' 13 | 14 | def findduplicates(unpackreports, scantempdir, topleveldir, processors, scanenv, batcursors, batcons, scandebug=False, unpacktempdir=None): 15 | filehashes = {} 16 | for r in unpackreports.keys(): 17 | if 'checksum' in unpackreports[r]: 18 | if unpackreports[r]['checksum'] in filehashes: 19 | filehashes[unpackreports[r]['checksum']].append(r) 20 | else: 21 | filehashes[unpackreports[r]['checksum']] = [r] 22 | duplicates = [] 23 | for h in filehashes: 24 | if len(filehashes[h]) > 1: 25 | duplicates.append(filehashes[h]) 26 | if duplicates != []: 27 | return {'duplicates': duplicates} 28 | -------------------------------------------------------------------------------- /src/bat/fixduplicates.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf-8 -*- 3 | 4 | # Binary Analysis Tool 5 | # Copyright 2014-2016 Armijn Hemel for Tjaldur Software Governance Solutions 6 | # Licensed under Apache 2.0, see LICENSE file for details 7 | 8 | import os 9 | import os.path 10 | import sys 11 | import subprocess 12 | import copy 13 | import cPickle 14 | import elfcheck 15 | 16 | ''' 17 | During scanning BAT tags duplicate files (same checksums) and only processes a 18 | single file later on. Which file is marked as the 'original' and which as the 19 | duplicate depends on the scanning order, which is non-deterministic. 20 | 21 | In some situations there is more information available to make a better choice 22 | about the 'original' and the duplicate. 23 | 24 | This module tries to fix these situations. 25 | 26 | 1. In ELF shared libraries the SONAME and RPATH attributes can be used. 27 | ''' 28 | 29 | def fixduplicates(unpackreports, scantempdir, topleveldir, processors, scanenv, batcursors, batcons, scandebug=False, unpacktempdir=None): 30 | # First deal with ELF files 31 | # store names of all ELF files present in scan archive 32 | elffiles = set() 33 | dupefiles = set() 34 | 35 | seendupe = False 36 | 37 | for i in unpackreports: 38 | if not 'checksum' in unpackreports[i]: 39 | continue 40 | filehash = unpackreports[i]['checksum'] 41 | if not os.path.exists(os.path.join(topleveldir, "filereports", "%s-filereport.pickle" % filehash)): 42 | continue 43 | 44 | if not 'elf' in unpackreports[i]['tags']: 45 | continue 46 | 47 | # This makes no sense for, for example, statically linked libraries, Linux kernel 48 | # images and Linux kernel modules, so skip.
49 | if 'static' in unpackreports[i]['tags']: 50 | continue 51 | if 'linuxkernel' in unpackreports[i]['tags']: 52 | continue 53 | if 'duplicate' in unpackreports[i]['tags']: 54 | seendupe = True 55 | dupefiles.add(i) 56 | else: 57 | elffiles.add(i) 58 | 59 | # only process if there actually are duplicate files 60 | if seendupe: 61 | dupehashes = {} 62 | for i in dupefiles: 63 | filehash = unpackreports[i]['checksum'] 64 | if filehash in dupehashes: 65 | dupehashes[filehash].append(i) 66 | else: 67 | dupehashes[filehash] = [i] 68 | dupekeys = dupehashes.keys() 69 | for i in elffiles: 70 | filehash = unpackreports[i]['checksum'] 71 | if filehash in dupekeys: 72 | realpath = unpackreports[i]['realpath'] 73 | filename = unpackreports[i]['name'] 74 | 75 | elfres = elfcheck.getDynamicLibs(os.path.join(realpath, filename)) 76 | if elfres == {} or elfres == None: 77 | continue 78 | 79 | if not 'sonames' in elfres: 80 | continue 81 | 82 | sonames = elfres['sonames'] 83 | 84 | # there should be only one SONAME 85 | if len(sonames) != 1: 86 | continue 87 | 88 | soname = sonames[0] 89 | if soname == filename: 90 | # no need for fixing 91 | continue 92 | if unpackreports[i]['scans'] != []: 93 | # if any unpack scans were successful then renaming might have 94 | # to be done recursively which needs more thought 95 | continue 96 | unpackreports[i]['tags'].append('duplicate') 97 | for j in dupehashes[filehash]: 98 | if soname == os.path.basename(j): 99 | unpackreports[j]['tags'].remove('duplicate') 100 | break 101 | -------------------------------------------------------------------------------- /src/bat/fsmagic.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Binary Analysis Tool 4 | # Copyright 2009-2016 Armijn Hemel for Tjaldur Software Governance Solutions 5 | # Licensed under Apache 2.0, see LICENSE file for details 6 | 7 | '''This file contains information about how to recognize certain 8 | files, file systems, compression, and so on automatically and which 9 | methods or functions to invoke to unpack these files for further 10 | analysis.''' 11 | 12 | # information from: 13 | # 1. /usr/share/magic 14 | # 2. include/linux/magic.h in the Linux kernel sources 15 | # 3. http://www.squashfs-lzma.org/ 16 | # 4. http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=364260 17 | # 5. various other places 18 | 19 | # This is not the same as the magic database, but just a list of 20 | # identifiers that are used for these file systems, compressed files,etc. 21 | # In BAT a lot more work is done to verify what a file really is, which 22 | # the magic database does not do. 
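# Each entry below maps an identifier name to the raw byte string that is
# searched for in binary data. Some signatures do not sit at the very start of
# the data they mark (for example ext2 and iso9660); for those the 'correction'
# table further down gives the number of bytes to subtract from the position of
# the marker to find the real start.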
23 | 24 | fsmagic = { 25 | 'gzip': '\x1f\x8b\x08', # x08 is the only compression method according to RFC 1952 26 | 'compress': '\x1f\x9d', 27 | 'bz2': 'BZh', 28 | 'rar': 'Rar!\x1a\x07', 29 | 'rarfooter': '\xc4\x3d\x7b\x00\x40\x07\x00', # http://forensicswiki.org/wiki/RAR#Terminator_.28terminator.29 30 | 'zip': '\x50\x4b\x03\04', 31 | 'zipend': '\x50\x4b\x05\06', 32 | 'lrzip': 'LRZI', 33 | 'rzip': 'RZIP', 34 | 'squashfs1': '\x68\x73\x71\x73', # hsqs -- little endian 35 | 'squashfs2': '\x73\x71\x73\x68', # sqsh -- big endian 36 | 'squashfs3': '\x71\x73\x68\x73', # qshs -- little endian 37 | 'squashfs4': '\x73\x68\x73\x71', # shsq -- big endian 38 | 'squashfs5': '\x74\x71\x73\x68', # tqsh - used in DD-WRT 39 | 'squashfs6': '\x68\x73\x71\x74', # hsqt - used in DD-WRT 40 | 'squashfs7': '\x73\x71\x6c\x7a', # sqlz 41 | 'android-sparse': '\x3a\xff\x26\xed', 42 | 'lzma_alone': '\x5d\x00\x00', 43 | 'lzma_alone_alt': '\x6d\x00\x00', # used in OpenWrt 44 | 'lzma_alone_alt2':'\x6c\x00\x00', # seen in some routers, like ZyXEL NBG5615 45 | '7z': '7z\xbc\xaf\x27\x1c', 46 | 'xz': '\xfd\x37\x7a\x58\x5a\x00', 47 | 'xztrailer': '\x59\x5a', 48 | 'lzip': 'LZIP', 49 | 'lzop': '\x89\x4c\x5a\x4f\x00\x0d\x0a\x1a\x0a', 50 | 'lha': '-lh7-', 51 | 'cramfs_le': '\x45\x3d\xcd\x28', 52 | 'cramfs_be': '\x28\xcd\x3d\x45', 53 | 'romfs': '-rom1fs-', 54 | 'jffs2_le': '\x85\x19', 55 | 'jffs2_be': '\x19\x85', 56 | 'ubifs': '\x31\x18\x10\x06', 57 | 'ubi': '\x55\x42\x49\x23', 58 | 'rpm': '\xed\xab\xee\xdb', 59 | 'ext2': '\x53\xef', # little endian 60 | 'minix': '\x8f\x13', # specific version of Minix v1 file system 61 | 'arj': '\x60\xea', 62 | 'cab': 'MSCF\x00\x00\x00\x00', # first four bytes following header are always 0 63 | 'installshield': 'ISc(', 64 | 'pkbac': 'PKBAC', 65 | 'winrar': 'WinRAR', 66 | 'png': '\x89PNG\x0d\x0a\x1a\x0a', 67 | 'pngtrailer': '\x00\x00\x00\x00IEND\xae\x42\x60\x82', # length, chunk type and CRC for PNG trailer are always the same 68 | 'cpiotrailer': 'TRAILER!!!', 69 | 'bmp': 'BM', 70 | 'jpeg': '\xff\xd8', 71 | 'jpegtrailer': '\xff\xd9', 72 | 'jfif': 'JFIF', 73 | 'gif87': 'GIF87a', 74 | 'gif89': 'GIF89a', 75 | 'ico': '\x00\x00\x01\x00', 76 | 'riff': 'RIFF', 77 | 'cpio1': '070701', 78 | 'cpio2': '070702', 79 | 'cpio3': '070707', 80 | 'iso9660': 'CD001', 81 | 'swf': 'CWS', 82 | 'pdf': '%PDF-', 83 | 'pdftrailer': '%%EOF', 84 | 'ar': '!<arch>', 85 | 'tar1': 'ustar\x00', 86 | 'tar2': 'ustar\x20', 87 | 'java_serialized': '\xac\xed\x00', 88 | 'fat12': 'FAT12', 89 | 'fat16': 'FAT16', 90 | 'pe': 'MZ', 91 | 'upx': 'UPX', 92 | 'java': '\xca\xfe\xba\xbe', 93 | 'pack200': '\xca\xfe\xd0\x0d', 94 | 'dex': 'dex\n', # Android Dex 95 | 'odex': 'dey\n', # Android Odex 96 | 'oat': 'oat\n', # Android OAT 97 | 'otf': 'OTTO', 98 | 'ttf': '\x00\x01\x00\x00', 99 | 'id3': 'TAG', 100 | 'id3v2': 'ID3', 101 | 'mp4': 'ftyp', 102 | 'ogg': 'OggS', 103 | 'sqlite3': 'SQLite format 3\x00', 104 | 'u-boot': '\x27\x05\x19\x56', 105 | 'yaffs2': '\x03\x00\x00\x00\x01\x00\x00\x00\xff\xff', # this is not an official signature, just occurring frequently 106 | 'plf': '\x50\x4c\x46\x21', 107 | 'chm': 'ITSF\x03\x00\x00\x00\x60\x00\x00\x00\x01\x00\x00\x00', 108 | 'msi': '\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1', # not sure this is a correct signature 109 | 'windowsassemblyheader': '<assembly', 110 | 'windowsassemblytrailer': '</assembly>', 111 | 'appledouble': '\x00\x05\x16\x07', 112 | 'mswim': 'MSWIM\x00\x00\x00', 113 | 'certificate': '-----BEGIN', 114 | 'androidbackup': 'ANDROID BACKUP\n', 115 | 'aiff': 'FORM', 116 | 'woff': 'wOFF', 117 | 'woff2': 'wOF2', 118 | 'xar': '\x78\x61\x72\x21', 119 | 'icc': 'acsp', 120 |
'elf': '\x7f\x45\x4c\x46', 121 | 'bflt': '\x62\x46\x4c\x54', 122 | } 123 | 124 | # some offsets can be found after a certain number of bytes, but 125 | # the actual file system or file starts earlier 126 | correction = { 127 | 'ext2': 0x438, 128 | 'minix': 0x410, 129 | 'iso9660': 32769, 130 | 'tar1': 0x101, 131 | 'tar2': 0x101, 132 | 'fat12': 54, 133 | 'fat16': 54, 134 | 'lha': 2, 135 | 'icc': 36, 136 | } 137 | 138 | # collection of markers that should be scanned together 139 | squashtypes = ['squashfs1', 'squashfs2', 'squashfs3', 'squashfs4', 'squashfs5', 'squashfs6'] 140 | lzmatypes = ['lzma_alone', 'lzma_alone_alt', 'lzma_alone_alt2'] 141 | cpio = ['cpio1', 'cpio2', 'cpio3'] 142 | gif = ['gif87', 'gif89'] 143 | tar = ['tar1', 'tar2'] 144 | -------------------------------------------------------------------------------- /src/bat/fssearch.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Binary Analysis Tool 4 | # Copyright 2009-2013 Armijn Hemel for Tjaldur Software Governance Solutions 5 | # Licensed under Apache 2.0, see LICENSE file for details 6 | 7 | import sys 8 | import os 9 | import tempfile 10 | import fsmagic 11 | 12 | # Find a squashfs file system, starting at a certain offset. 13 | # Returns the offset of the file system nearest file system. 14 | def findSquashfs(data, offset=0): 15 | marker = -1 16 | squashtype = None 17 | for t in fsmagic.squashtypes: 18 | sqshmarker = findMarker(fsmagic.fsmagic[t], data, offset) 19 | if sqshmarker == -1: 20 | continue 21 | if marker == -1: 22 | marker = sqshmarker 23 | else: 24 | marker = min(marker, sqshmarker) 25 | return marker 26 | 27 | # Find a marker. To more efficiently deal with big files we don't read in 28 | # the entire file at once, but use read() and seek() 29 | def findMarker(marker, datafile, offset=0): 30 | databuffer = [] 31 | datafile.seek(offset) 32 | databuffer = datafile.read(100000) 33 | while databuffer != '': 34 | res = databuffer.find(marker) 35 | if res != -1: 36 | datafile.seek(0) 37 | return offset + res 38 | else: 39 | # move the offset 50 40 | datafile.seek(offset + 99950) 41 | # read 100000 bytes from oldoffset + 50, so there is 50 bytes 42 | # overlap with the previous read 43 | databuffer = datafile.read(100000) 44 | if len(databuffer) >= 50: 45 | offset = offset + 99950 46 | else: 47 | offset = offset + len(databuffer) 48 | datafile.seek(0) 49 | return -1 50 | 51 | def findType(type, data, offset=0): 52 | res = findMarker(fsmagic.fsmagic[type], data, offset) 53 | return res 54 | 55 | def findCpio(data, offset=0): 56 | cpiomarker = -1 57 | for marker in fsmagic.cpio: 58 | res = findMarker(fsmagic.fsmagic[marker], data, offset) 59 | if res != -1 and cpiomarker == -1: 60 | cpiomarker = res 61 | elif res != -1: 62 | cpiomarker = min(cpiomarker, res) 63 | return cpiomarker 64 | 65 | def findXZTrailer(data, offset=0): 66 | return findType('xztrailer', data, offset) 67 | 68 | def findCpioTrailer(data, offset=0): 69 | return findType('cpiotrailer', data, offset) 70 | 71 | def findExt2fs(data, offset=0): 72 | return findType('ext2', data, offset) 73 | 74 | def findISO9660(data, offset=0): 75 | return findType('iso9660', data, offset) 76 | 77 | def findIco(data, offset=0): 78 | return findType('ico', data, offset) 79 | 80 | def findRPM(data, offset=0): 81 | return findType('rpm', data, offset) 82 | 83 | def findGzip(data, offset=0): 84 | return findType('gzip', data, offset) 85 | 86 | def findZip(data, offset=0): 87 | return findType('zip', data, 
offset) 88 | 89 | def findCramfs(data, offset=0): 90 | return findType('cramfs', data, offset) 91 | 92 | def findUbi(data, offset=0): 93 | return findType('ubi', data, offset) 94 | 95 | def findRar(data, offset=0): 96 | return findType('rar', data, offset) 97 | 98 | # not reliable according to comments in /usr/share/magic 99 | def findLZMA(data, offset=0): 100 | return findType('lzma_alone', data, offset) 101 | 102 | def findXZ(data, offset=0): 103 | return findType('xz', data, offset) 104 | 105 | def findLzip(data, offset=0): 106 | return findType('lzip', data, offset) 107 | 108 | def findLzo(data, offset=0): 109 | return findType('lzo', data, offset) 110 | 111 | def findBzip2(data, offset=0): 112 | return findType('bz2', data, offset) 113 | 114 | def findARJ(data, offset=0): 115 | return findType('arj', data, offset) 116 | 117 | def findCab(data, offset=0): 118 | return findType('cab', data, offset) 119 | 120 | def findPNG(data, offset=0): 121 | return findType('png', data, offset) 122 | 123 | # http://www.w3.org/TR/PNG-Chunks.html 124 | def findPNGTrailer(data, offset=0): 125 | return findType('pngtrailer', data, offset) 126 | 127 | def findJFIF(data, offset=0): 128 | jfifmarker = findType('jfif', data, offset) 129 | if jfifmarker < 6: 130 | return -1 131 | else: 132 | return jfifmarker - 6 133 | 134 | def findGIF(data, offset=0): 135 | gifmarker = -1 136 | for marker in fsmagic.gif: 137 | res = findMarker(fsmagic.fsmagic[marker], data, offset) 138 | if res != -1 and gifmarker == -1: 139 | gifmarker = res 140 | elif res != -1: 141 | gifmarker = min(gifmarker, res) 142 | return gifmarker 143 | 144 | def markerSearch(data): 145 | offsets = [] 146 | marker_keys = fsmagic.marker.keys() 147 | for key in marker_keys: 148 | res = data.find(fsmagic.marker[key]) 149 | while res != -1: 150 | offsets.append((res, key)) 151 | res = data.find(fsmagic.marker[key], res+1) 152 | offsets.sort() 153 | for i in offsets: 154 | print hex(i[0]), i[1], i[0]%8 155 | 156 | def bruteForceSearch(data): 157 | offsets = [] 158 | fsmagic_keys = fsmagic.fsmagic.keys() 159 | for key in fsmagic_keys: 160 | res = data.find(fsmagic.fsmagic[key]) 161 | while res != -1: 162 | offsets.append((res, key)) 163 | res = data.find(fsmagic.fsmagic[key], res+1) 164 | offsets.sort() 165 | for i in offsets: 166 | print hex(i[0]), i[1], i[0]%8 167 | -------------------------------------------------------------------------------- /src/bat/generatehexdump.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Binary Analysis Tool 4 | # Copyright 2012-2016 Armijn Hemel for Tjaldur Software Governance Solutions 5 | # Licensed under Apache 2.0, see LICENSE file for details 6 | 7 | ''' 8 | This is a plugin for the Binary Analysis Tool. It takes the output of hexdump -Cv 9 | and writes it to a file with gzip compression. The output is later used in the 10 | graphical user interface. 11 | 12 | Parameters: 13 | 14 | BAT_REPORTDIR :: directory where output should be written to. 
This is useful for caching 15 | BAT_IMAGE_MAXFILESIZE :: maximum size of source file 16 | 17 | This should be run as a postrun scan 18 | ''' 19 | 20 | import os 21 | import os.path 22 | import sys 23 | import subprocess 24 | import gzip 25 | 26 | def generateHexdump(filename, unpackreport, scantempdir, topleveldir, scanenv, cursor, conn, debug=False): 27 | if not 'checksum' in unpackreport: 28 | return 29 | reportdir = scanenv.get('BAT_REPORTDIR', '.') 30 | try: 31 | os.stat(reportdir) 32 | except: 33 | # BAT_REPORTDIR does not exist 34 | try: 35 | os.makedirs(reportdir) 36 | except Exception: 37 | return 38 | 39 | maxsize = int(scanenv.get('BAT_IMAGE_MAXFILESIZE', sys.maxint)) 40 | # override file name, we won't use it much 41 | filename = os.path.join(unpackreport['realpath'], unpackreport['name']) 42 | filesize = os.stat(filename).st_size 43 | if filesize > maxsize: 44 | return 45 | if not os.path.exists("%s/%s-hexdump.gz" % (reportdir, unpackreport['checksum'])): 46 | p = subprocess.Popen(['hexdump', '-Cv', filename], stdout=subprocess.PIPE, 47 | stderr=subprocess.PIPE, close_fds=True) 48 | (stanout, stanerr) = p.communicate() 49 | if stanout != "": 50 | gf = gzip.open("%s/%s-hexdump.gz" % (reportdir, unpackreport['checksum']), 'w') 51 | gf.write(stanout) 52 | gf.close() 53 | -------------------------------------------------------------------------------- /src/bat/images.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Binary Analysis Tool 4 | # Copyright 2012-2016 Armijn Hemel for Tjaldur Software Governance Solutions 5 | # Licensed under Apache 2.0, see LICENSE file for details 6 | 7 | ''' 8 | This is a plugin for the Binary Analysis Tool. It generates images of files, both 9 | full files and thumbnails. The files can be used for informational purposes, such 10 | as detecting roughly where offsets can be found, if data is compressed or encrypted, 11 | etc. 
12 | 13 | This should be run as a postrun scan 14 | 15 | Parameters for configuration file: 16 | 17 | * BAT_IMAGE_MAXFILESIZE :: maximum size of the *source* file, to prevent 18 | ridiculously large files from being turned into even ridiculously larger 19 | pictures 20 | * BAT_IMAGEDIR :: location to where images should be written 21 | ''' 22 | 23 | import os 24 | import os.path 25 | import sys 26 | import subprocess 27 | from PIL import Image 28 | 29 | def generateImages(filename, unpackreport, scantempdir, topleveldir, scanenv, cursor, conn, debug=False): 30 | if not 'checksum' in unpackreport: 31 | return 32 | 33 | imagedir = scanenv.get('BAT_IMAGEDIR', "%s/%s" % (topleveldir, "images")) 34 | try: 35 | os.stat(imagedir) 36 | except: 37 | # BAT_IMAGEDIR does not exist 38 | try: 39 | os.makedirs(imagedir) 40 | except Exception, e: 41 | return 42 | 43 | maxsize = int(scanenv.get('BAT_IMAGE_MAXFILESIZE', sys.maxint)) 44 | filesize = os.stat("%s/%s" % (scantempdir, filename)).st_size 45 | if filesize > maxsize: 46 | return 47 | # this stuff is easily cached 48 | if not os.path.exists("%s/%s.png" % (imagedir, unpackreport['checksum'])): 49 | fwfile = open("%s/%s" % (scantempdir, filename)) 50 | 51 | # this is very inefficient for large files, but we *really* need all the data :-( 52 | fwdata = fwfile.read() 53 | fwfile.close() 54 | 55 | fwlen = len(fwdata) 56 | 57 | if fwlen > 512: 58 | height = 512 59 | else: 60 | height = fwlen 61 | width = fwlen/height 62 | 63 | # we might need to add some bytes so we can create a valid picture 64 | if fwlen%height > 0: 65 | width = width + 1 66 | for i in range(0, height - (fwlen%height)): 67 | fwdata = fwdata + chr(0) 68 | 69 | imgbuffer = buffer(bytearray(fwdata)) 70 | 71 | im = Image.frombuffer("L", (height, width), imgbuffer, "raw", "L", 0, 1) 72 | im.save("%s/%s.png" % (imagedir, unpackreport['checksum'])) 73 | #''' 74 | if width > 100: 75 | imthumb = im.thumbnail((height/4, width/4)) 76 | im.save("%s/%s-thumbnail.png" % (imagedir, unpackreport['checksum'])) 77 | #''' 78 | -------------------------------------------------------------------------------- /src/bat/piecharts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Binary Analysis Tool 4 | # Copyright 2012-2015 Armijn Hemel for Tjaldur Software Governance Solutions 5 | # Licensed under Apache 2.0, see LICENSE file for details 6 | 7 | ''' 8 | This is a plugin for the Binary Analysis Tool. It generates images of results 9 | of the ranking scan, like piecharts and version charts. 
10 | 11 | It is used by generateimages.py 12 | ''' 13 | 14 | import os 15 | import os.path 16 | import cPickle 17 | import matplotlib 18 | matplotlib.use('cairo') 19 | import pylab 20 | 21 | def generateImages(picklefile, pickledir, filehash, imagedir, pietype): 22 | 23 | leaf_file = open(os.path.join(pickledir, picklefile), 'rb') 24 | (piedata, pielabels) = cPickle.load(leaf_file) 25 | leaf_file.close() 26 | 27 | pylab.figure(1, figsize=(6.5,6.5)) 28 | ax = pylab.axes([0.2, 0.15, 0.6, 0.6]) 29 | 30 | pylab.pie(piedata, labels=pielabels) 31 | 32 | pylab.savefig(os.path.join(imagedir, '%s-%s.png' % (filehash, pietype))) 33 | pylab.gcf().clear() 34 | os.unlink(os.path.join(pickledir, picklefile)) 35 | -------------------------------------------------------------------------------- /src/bat/prunefiles.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Binary Analysis Tool 4 | # Copyright 2013-2016 Armijn Hemel for Tjaldur Software Governance Solutions 5 | # Licensed under Apache 2.0, see LICENSE file for details 6 | 7 | import os 8 | import os.path 9 | import sys 10 | 11 | ''' 12 | This method can be used to prune scans, by for example ignoring all graphics files 13 | ''' 14 | 15 | def prunefiles(unpackreports, scantempdir, topleveldir, processors, scanenv, batcursors, batcons, scandebug=False, unpacktempdir=None): 16 | if not "PRUNE_TAGS" in scanenv: 17 | return 18 | prunes = scanenv['PRUNE_TAGS'] 19 | prunetags = set(prunes.split(',')) 20 | 21 | cleanpickles = False 22 | if scanenv.get('PRUNE_FILEREPORT_CLEAN', 0) == '1': 23 | cleanpickles = True 24 | 25 | cleanfiles = set() 26 | for u in unpackreports.keys(): 27 | if set(unpackreports[u]['tags']).intersection(prunetags) != set(): 28 | if cleanpickles: 29 | filehash = unpackreports[u]['checksum'] 30 | cleanfiles.add(filehash) 31 | del unpackreports[u] 32 | 33 | for filehash in cleanfiles: 34 | try: 35 | os.unlink(os.path.join(topleveldir, "filereports", "%s-filereport.pickle" % filehash)) 36 | except Exception, e: 37 | print >>sys.stderr, "error removing", filehash, e 38 | sys.stderr.flush() 39 | -------------------------------------------------------------------------------- /src/bat/renamefiles.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Binary Analysis Tool 4 | # Copyright 2015-2016 Armijn Hemel for Tjaldur Software Governance Solutions 5 | # Licensed under Apache 2.0, see LICENSE file for details 6 | 7 | import shutil 8 | import os.path 9 | import copy 10 | 11 | ''' 12 | This aggregate scan traverses the unpackreports an tries to rename certain files based on properties of 13 | unpacked files. 
For example: 14 | 15 | * if a file is carved out of a larger file that contains a Linux kernel, 16 | rename it to something like "unpacked-linux-kernel" 17 | * if a gzip CPIO archive is extracted from a Linux kernel and contains 18 | files/directories, like /root or /dev it is likely an initramfs 19 | ''' 20 | 21 | def renamefiles(unpackreports, scantempdir, topleveldir, processors, scanenv, batcursors, batcons, scandebug=False, unpacktempdir=None): 22 | # only focus on initramfs that is also compressed for now 23 | kernelfiles = set() 24 | # known compressions for initramfs 25 | initramfscompressions = ['gzip'] 26 | for r in unpackreports.keys(): 27 | if 'checksum' in unpackreports[r]: 28 | if 'linuxkernel' in unpackreports[r]['tags']: 29 | if 'modulekernelversion' in unpackreports[r]['tags']: 30 | continue 31 | if 'duplicate' in unpackreports[r]['tags']: 32 | continue 33 | kernelfiles.add(r) 34 | 35 | if 'TEMPLATE' in scanenv: 36 | template = scanenv['TEMPLATE'] 37 | if template is not None: 38 | templatecutoff = template.find('%') 39 | template = template[:templatecutoff] 40 | 41 | cpiotemplate = "initramfs" 42 | for r in kernelfiles: 43 | if unpackreports[r]['scans'] != []: 44 | counter = 0 45 | for s in unpackreports[r]['scans']: 46 | if len(s['scanreports']) != 1: 47 | counter += 1 48 | continue 49 | renamefiles = set() 50 | origcpio = '' 51 | targetcpio = '' 52 | process = False 53 | if s['scanname'] in initramfscompressions: 54 | unpackfile = s['scanreports'][0] 55 | if unpackreports[unpackfile]['name'].startswith('tmp'): 56 | process = True 57 | else: 58 | if template is not None: 59 | if unpackreports[unpackfile]['name'].startswith(template): 60 | process = True 61 | if not process: 62 | counter += 1 63 | continue 64 | if unpackreports[unpackfile]['scans'] != []: 65 | if len(unpackreports[unpackfile]['scans']) != 1: 66 | counter += 1 67 | continue 68 | if unpackreports[unpackfile]['scans'][0]['scanname'] == 'cpio': 69 | # it is an initramfs, so it is possible to rename the file 70 | # Rename on disk: 71 | # 1. file 72 | # 2. unpacking directory 73 | # Then rename in unpackreports 74 | # 1. original file 75 | # 2. any paths in scanreports (path, realpath) 76 | # 3. 
references in parent file 77 | origname = os.path.join(unpackreports[unpackfile]['realpath'], unpackreports[unpackfile]['name']) 78 | targetname = os.path.join(unpackreports[unpackfile]['realpath'], cpiotemplate) 79 | if not os.path.exists(targetname): 80 | # on disk 81 | shutil.move(origname, targetname) 82 | if not "duplicate" in unpackreports[unpackfile]['tags']: 83 | origcpio = "%s-cpio-1" % origname 84 | targetcpio = "%s-cpio-1" % targetname 85 | shutil.move(origcpio, targetcpio) 86 | # in unpackreports 87 | unpackreports[unpackfile]['name'] = cpiotemplate 88 | newunpackreportsname = os.path.join(os.path.dirname(unpackfile), cpiotemplate) 89 | 90 | unpackreports[r]['scans'][counter]['scanreports'][0] = newunpackreportsname 91 | renamefiles.add(unpackfile) 92 | 93 | while len(renamefiles) != 0: 94 | newrenamefiles = set() 95 | for re in renamefiles: 96 | origcpio = '/%s' % os.path.basename(origcpio) 97 | targetcpio = '/%s' % os.path.basename(targetcpio) 98 | newr = re.replace(origcpio, targetcpio) 99 | 100 | realpath = copy.deepcopy(unpackreports[re]['realpath']) 101 | newrealpath = realpath.replace(origcpio, targetcpio) 102 | unpackreports[re]['realpath'] = newrealpath 103 | # recurse into files, if any 104 | if 'scans' in unpackreports[re]: 105 | for sc in unpackreports[re]['scans']: 106 | if 'scanreports' in sc: 107 | newrenamefiles.update(sc['scanreports']) 108 | newscanreports = [] 109 | for scr in sc['scanreports']: 110 | newscanreports.append(scr.replace(origcpio, targetcpio)) 111 | sc['scanreports'] = newscanreports 112 | 113 | # then rename and delete the old value 114 | unpackreports[newr] = copy.deepcopy(unpackreports[re]) 115 | del unpackreports[re] 116 | renamefiles = newrenamefiles 117 | counter += 1 118 | -------------------------------------------------------------------------------- /src/bat/reportcopyright.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Binary Analysis Tool 4 | # Copyright 2016 Armijn Hemel for Tjaldur Software Governance Solutions 5 | # Licensed under Apache 2.0, see LICENSE file for details 6 | 7 | ''' 8 | This plugin for BAT looks at the extracted identifiers and looks at if there 9 | is some sort of copyright notice in an extracted identifier. This might not 10 | work well in the case of multiline copyright notices. 
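For example, extracted strings such as "Copyright (C) 2009 Example Corp" (a
made up example) or "(c) 2009" would both be flagged: the check simply looks
for the substrings 'copyright' and '(c)' in the lowercased string.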
11 | ''' 12 | 13 | import os 14 | import os.path 15 | import sys 16 | import subprocess 17 | import copy 18 | import cPickle 19 | import multiprocessing 20 | 21 | 22 | def reportcopyright(unpackreports, scantempdir, topleveldir, processors, scanenv, batcursors, batcons, scandebug=False, unpacktempdir=None): 23 | for i in unpackreports: 24 | if not 'checksum' in unpackreports[i]: 25 | continue 26 | filehash = unpackreports[i]['checksum'] 27 | if not os.path.exists(os.path.join(topleveldir, "filereports", "%s-filereport.pickle" % filehash)): 28 | continue 29 | if not 'identifier' in unpackreports[i]['tags']: 30 | continue 31 | 32 | # read pickle file 33 | leaf_file = open(os.path.join(topleveldir, "filereports", "%s-filereport.pickle" % filehash), 'rb') 34 | leafreports = cPickle.load(leaf_file) 35 | leaf_file.close() 36 | 37 | writeback = False 38 | strs = leafreports['identifier']['strings'] 39 | copyrights = [] 40 | for line in strs: 41 | if 'copyright' in line.lower(): 42 | writeback = True 43 | copyrights.append(line) 44 | continue 45 | if '(c)' in line.lower(): 46 | writeback = True 47 | copyrights.append(line) 48 | if writeback: 49 | unpackreports[i]['tags'].append('copyright') 50 | leafreports['tags'].append('copyright') 51 | leafreports['copyrights'] = copyrights 52 | 53 | leaf_file = open(os.path.join(topleveldir, "filereports", "%s-filereport.pickle" % filehash), 'wb') 54 | leafreports = cPickle.dump(leafreports, leaf_file) 55 | leaf_file.close() 56 | -------------------------------------------------------------------------------- /src/bat/unpackrpm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Binary Analysis Tool 4 | # Copyright 2009-2016 Armijn Hemel for Tjaldur Software Governance Solutions 5 | # Licensed under Apache 2.0, see LICENSE file for details 6 | 7 | ''' 8 | This module contains only code specific to RPM unpacking. This is so it can be 9 | disabled on systems that don't have the Python RPM bindings installed. 10 | ''' 11 | 12 | import os 13 | import os.path 14 | import struct 15 | import subprocess 16 | import tempfile 17 | import rpm 18 | import extractor 19 | import fwunpack 20 | 21 | # RPM is basically a header, plus some compressed files, so we might get 22 | # duplicates at the moment. We can defeat this easily by setting the blacklist 23 | # upperbound to the start of compression + 1. This is ugly and should actually 24 | # be fixed. 
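# Rough background (see the format documentation referenced further down in
# this function): an RPM file starts with a 96 byte "lead", followed by a
# signature section, a header section and finally the payload: a cpio archive
# compressed with gzip, xz, bzip2 or (rarely) lzip. This is why the code below
# refuses to do any work if no compressor signatures were found at all.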
25 | def searchUnpackRPM(filename, tempdir=None, blacklist=[], offsets={}, scanenv={}, debug=False): 26 | hints = {} 27 | if 'rpm' not in offsets: 28 | return ([], blacklist, [], hints) 29 | if offsets['rpm'] == []: 30 | return ([], blacklist, [], hints) 31 | 32 | # sanity checks for payload compressors before even trying to process headers 33 | compressorfound = False 34 | compressors = ['gzip', 'xz', 'bz2', 'lzip'] 35 | for compressor in compressors: 36 | if compressor in offsets: 37 | compressorfound = True 38 | break 39 | 40 | if not compressorfound: 41 | return ([], blacklist, [], hints) 42 | 43 | offsetsfound = False 44 | for compressor in compressors: 45 | if offsets[compressor] != []: 46 | offsetsfound = True 47 | break 48 | 49 | if not offsetsfound: 50 | return ([], blacklist, [], hints) 51 | 52 | diroffsets = [] 53 | rpmcounter = 1 54 | for offset in offsets['rpm']: 55 | blacklistoffset = extractor.inblacklist(offset, blacklist) 56 | if blacklistoffset is not None: 57 | continue 58 | rpmfile = open(filename, 'rb') 59 | rpmfile.seek(offset+4) 60 | rpmversionbyte = rpmfile.read(1) 61 | rpmfile.close() 62 | rpmmajorversion = struct.unpack('<B', rpmversionbyte)[0] 63 | if rpmmajorversion > 3 or rpmmajorversion == 0: 64 | continue 65 | 66 | # now first check the header 67 | headervalid = False 68 | tset = rpm.TransactionSet() 69 | tset.setVSFlags(rpm._RPMVSF_NOSIGNATURES) 70 | sizeofheader = 0 71 | # search all compressors, sorted by prevalence 72 | for compressor in ['gzip', 'xz', 'bz2', 'lzip']: 73 | if not compressor in offsets: 74 | continue 75 | for compressoroffset in offsets[compressor]: 76 | if compressoroffset < offset: 77 | continue 78 | try: 79 | tmprpm = tempfile.mkstemp() 80 | rpmfile = open(filename, 'rb') 81 | rpmfile.seek(offset) 82 | rpmdata = rpmfile.read(compressoroffset - offset) 83 | rpmfile.close() 84 | os.write(tmprpm[0], rpmdata) 85 | os.fsync(tmprpm[0]) 86 | os.close(tmprpm[0]) 87 | fdno = os.open(tmprpm[1], os.O_RDONLY) 88 | header = tset.hdrFromFdno(fdno) 89 | os.close(fdno) 90 | os.unlink(tmprpm[1]) 91 | headervalid = True 92 | sizeofheader = compressoroffset - offset 93 | break 94 | except: 95 | if os.path.exists(tmprpm[1]): 96 | os.close(fdno) 97 | os.unlink(tmprpm[1]) 98 | if headervalid: 99 | break 100 | 101 | if not headervalid: 102 | ## no valid header was found so continue with the next RPM file 103 | continue 104 | 105 | # The RPM file format is heavily underdocumented, so scrape bits and pieces 106 | # of docs from various sources. 107 | # http://www.rpm.org/max-rpm/s1-rpm-file-format-rpm-file-format.html 108 | # https://docs.fedoraproject.org/ro/Fedora_Draft_Documentation/0.1/html/RPM_Guide/ch-package-structure.html 109 | 110 | # payload format always has to be cpio 111 | if header[rpm.RPMTAG_PAYLOADFORMAT] != 'cpio': 112 | continue 113 | 114 | # possibly good statistic to have 115 | #compressor = header[rpm.RPMTAG_PAYLOADCOMPRESSOR] 116 | 117 | # the size of the headers and payload, but not of the lead and any signatures 118 | bl = header[rpm.RPMTAG_SIGSIZE] 119 | filesize = os.stat(filename).st_size 120 | 121 | # after the header checks are done carve the possible RPM file from 122 | # the bigger archive (right now just removing all leading bytes) and 123 | # use rpm2cpio to unpack the RPM file.
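# What happens below is roughly what the following (hypothetical) shell
# pipeline would do by hand:
#
#   $ rpm2cpio carved.rpm | cpio -idm
#
# except that the cpio unpacking is done by fwunpack.unpackCpio().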
124 | tmpdir = fwunpack.dirsetup(tempdir, filename, "rpm", rpmcounter) 125 | tmpfile = tempfile.mkstemp(dir=tmpdir) 126 | os.fdopen(tmpfile[0]).close() 127 | 128 | fwunpack.unpackFile(filename, offset, tmpfile[1], tmpdir) 129 | 130 | # first use rpm2cpio to unpack the rpm data 131 | p = subprocess.Popen(['rpm2cpio', tmpfile[1]], stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True) 132 | (stanout, stanerr) = p.communicate() 133 | if len(stanout) != 0: 134 | # cleanup first 135 | os.unlink(tmpfile[1]) 136 | if tempdir is None: 137 | os.rmdir(tmpdir) 138 | # then use unpackCpio() to unpack the RPM 139 | res = fwunpack.unpackCpio(stanout, tmpdir) 140 | else: 141 | os.unlink(tmpfile[1]) 142 | if tempdir is None: 143 | os.rmdir(tmpdir) 144 | res = None # nothing was unpacked 145 | if res is not None: 146 | rpmcounter = rpmcounter + 1 147 | try: 148 | # this header describes the size of headers + 149 | # compressed payload size. It might be a few bytes off 150 | # with the actual size of the file. 151 | bl = header[rpm.RPMTAG_SIGSIZE] 152 | filesize = os.stat(filename).st_size 153 | # sanity check. It should not happen with a properly 154 | # formatted RPM file, but you never know. 155 | if bl > filesize: 156 | bl = offset + sizeofheader + 1 # blacklist up to the start of compression + 1 157 | except: 158 | bl = offset + sizeofheader + 1 159 | diroffsets.append((res, offset, bl)) 160 | blacklist.append((offset, bl)) 161 | else: 162 | # cleanup 163 | os.rmdir(tmpdir) 164 | return (diroffsets, blacklist, [], hints) 165 | -------------------------------------------------------------------------------- /src/busybox-walk.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Binary Analysis Tool 4 | # Copyright 2009-2013 Armijn Hemel for Tjaldur Software Governance Solutions 5 | # Licensed under Apache 2.0, see LICENSE file for details 6 | 7 | ''' 8 | This program can be used to walk a directory tree and report the names 9 | of the applets that symlink to BusyBox. While not accurate (symlinks could 10 | have been removed) it might come in handy as an extra tool. 11 | ''' 12 | 13 | import os 14 | import sys 15 | from optparse import OptionParser 16 | 17 | def busyboxWalk(busyboxdir): 18 | busybox_applets = [] 19 | 20 | osgen = os.walk(busyboxdir) 21 | 22 | try: 23 | while True: 24 | i = osgen.next() 25 | for p in i[2]: 26 | if os.path.basename(os.path.realpath(os.path.join(i[0], p))) == 'busybox': 27 | busybox_applets.append(p) 28 | except StopIteration: 29 | pass 30 | 31 | busybox_applets.sort() 32 | return busybox_applets 33 | 34 | def main(argv): 35 | parser = OptionParser() 36 | parser.add_option("-d", "--directory", dest="bd", help="directory", metavar="DIR") 37 | (options, args) = parser.parse_args() 38 | if options.bd == None: 39 | parser.error("Path to top level directory of unpacked firmware needed") 40 | applets = busyboxWalk(options.bd) 41 | if applets != []: 42 | print "The following applets were found as symlinks:" 43 | for a in applets: 44 | if a != 'busybox': 45 | print "* %s" % a 46 | 47 | if __name__ == "__main__": 48 | main(sys.argv) 49 | -------------------------------------------------------------------------------- /src/crawlers/README: -------------------------------------------------------------------------------- 1 | This directory contains several crawlers for initializing and updating a database.
2 | 3 | We need crawlers for: 4 | 5 | * ftp.gnu.org and mirrors 6 | * ftp.kernel.org and mirrors (for non-kernel tools) 7 | * sourceforge.net 8 | * savannah 9 | * apache (at least the C/C++ based projects) 10 | -------------------------------------------------------------------------------- /src/crawlers/crawling-php: -------------------------------------------------------------------------------- 1 | Crawling PHP's PEAR and PECL repositories 2 | 3 | PHP has an extension mechanism with which packages can be downloaded and 4 | installed, called 'pear'. Various 'channels' (or repositories) can be 5 | configured, like PEAR, PECL and more. 6 | 7 | The packages are downloaded, built (if needed) and installed on the system. 8 | The pear command also has an option to download packages. To download all 9 | packages there is a convenient 'download-all' option. 10 | 11 | For PEAR use: 12 | 13 | $ pear download-all 14 | 15 | For PECL use: 16 | 17 | $ pecl download-all 18 | 19 | By default only packages marked as 'stable' will be downloaded. To download 20 | other packages you can set the preferred_state, for example: 21 | 22 | $ pear config-set preferred_state alpha 23 | $ pear config-set preferred_state beta 24 | $ pear config-set preferred_state devel 25 | 26 | Please note: not all distributions build PHP with the pear command (for example 27 | Fedora does not) because it means that some packages will be installed outside 28 | the control of the system package manager which can lead to a sysadmin 29 | nightmare. You might need to build and install PHP as a normal user somewhere 30 | before being able to run the commands mentioned above. 31 | -------------------------------------------------------------------------------- /src/crawlers/gnu-config: -------------------------------------------------------------------------------- 1 | ## change this to a local mirror! 2 | [hostconfig] 3 | protocol = ftp 4 | url = ftp.nluug.nl/pub/gnu 5 | storedir = /tmp/gpl 6 | -------------------------------------------------------------------------------- /src/debian/compat: -------------------------------------------------------------------------------- 1 | 9 2 | -------------------------------------------------------------------------------- /src/debian/control: -------------------------------------------------------------------------------- 1 | Source: bat 2 | Section: misc 3 | Priority: extra 4 | Maintainer: Armijn Hemel 5 | Build-Depends: debhelper (>= 7.0.50~), python (>= 2.7) 6 | Standards-Version: 3.9.8 7 | Homepage: http://www.binaryanalysis.org/ 8 | 9 | Package: bat 10 | Architecture: all 11 | Depends: python-support (>= 0.90), python (>= 2.7), python-magic, binutils, e2tools, squashfs-tools, xz-utils, zip, unrar, cabextract, unshield, p7zip, p7zip-full, cpio, bzip2, mtd-utils, lzip, lzop, arj, icoutils, gettext, rpm, python-rpm, bat-extratools (>= 27.0), poppler-utils, upx-ucl, libxml2-utils, netpbm, lrzip, ncompress, python-imaging, vorbis-tools, ctags, python-matplotlib, unzip, python-pydot, bsdiff, python-reportlab, fonts-liberation, clamav, john, python-psycopg2, openssl 12 | Description: Modular framework to assist auditing binary files 13 | The Binary Analysis Tool is a modular framework that assists with auditing 14 | the contents of compiled software. It makes it easier and cheaper to look 15 | inside technology, and this helps compliance and due diligence activities. 16 | . 17 | The tool is freely available to everyone. 
The community can use it and 18 | participate in further development, and work together to help reduce errors 19 | when shipping devices or products containing Free and Open Source Software. 20 | 21 | ## these are apparently always provided, so they don't need to be explicitely 22 | ## defined as a dependency. Personally I think this is stupid and all 23 | ## dependencies should be listed, but hey, anything to keep lintian happy! 24 | # Depends: e2fsprogs, coreutils, gzip, tar 25 | -------------------------------------------------------------------------------- /src/debian/copyright: -------------------------------------------------------------------------------- 1 | This work was packaged for Debian by: 2 | 3 | Armijn Hemel on Wed, 12 Mar 2014 19:20:43 +0100 4 | 5 | It was downloaded from: 6 | 7 | 8 | 9 | Upstream Author: 10 | 11 | Armijn Hemel 12 | 13 | Copyright: 14 | 15 | 16 | 17 | License: 18 | 19 | Licensed under the Apache License, Version 2.0 (the "License"); 20 | you may not use this file except in compliance with the License. 21 | You may obtain a copy of the License at 22 | 23 | http://www.apache.org/licenses/LICENSE-2.0 24 | 25 | Unless required by applicable law or agreed to in writing, software 26 | distributed under the License is distributed on an "AS IS" BASIS, 27 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 28 | See the License for the specific language governing permissions and 29 | limitations under the License. 30 | 31 | On Debian systems, the complete text of the Apache version 2.0 license 32 | can be found in "/usr/share/common-licenses/Apache-2.0". 33 | 34 | The Debian packaging is: 35 | 36 | Copyright (C) 2010-2015 Armijn Hemel 37 | -------------------------------------------------------------------------------- /src/debian/files: -------------------------------------------------------------------------------- 1 | bat_5.0-1_i386.deb unknown extra 2 | -------------------------------------------------------------------------------- /src/debian/pyversions: -------------------------------------------------------------------------------- 1 | 2.5- 2 | -------------------------------------------------------------------------------- /src/debian/rules: -------------------------------------------------------------------------------- 1 | #!/usr/bin/make -f 2 | # -*- makefile -*- 3 | # Sample debian/rules that uses debhelper. 4 | # This file was originally written by Joey Hess and Craig Small. 5 | # As a special exception, when this file is copied by dh-make into a 6 | # dh-make output file, you may use that output file without restriction. 7 | # This special exception was added by Craig Small in version 0.37 of dh-make. 8 | 9 | # Uncomment this to turn on verbose mode. 10 | #export DH_VERBOSE=1 11 | 12 | %: 13 | dh $@ 14 | -------------------------------------------------------------------------------- /src/extractkernelstrings.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | ## Binary Analysis Tool 4 | ## Copyright 2010-2013 Armijn Hemel for Tjaldur Software Governance Solutions 5 | ## Licensed under Apache 2.0, see LICENSE file for details 6 | 7 | import sys, os, string, re 8 | from optparse import OptionParser 9 | import sqlite3 10 | from bat import extractor 11 | 12 | ## TODO: replace by generic code from ranking.py 13 | 14 | ## some strings we are interested in can't be extracted using xgettext. 15 | ## We use a few regular expressions for them to extract them. 
Since there 16 | ## macros being introduced (and removed) from the kernel sources regularly 17 | ## we should try and keep this list up to date. 18 | exprs = [] 19 | 20 | bugtrapexpr = re.compile("BUG_TRAP\s*\(([\w\s\.:<>\-+=~!@#$^%&*\[\]{}+?|/,'\(\)\\\]+)\);", re.MULTILINE) 21 | -------------------------------------------------------------------------------- /src/knowledgebaseadd.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | ## Binary Analysis Tool 4 | ## Copyright 2009-2013 Armijn Hemel for Tjaldur Software Governance Solutions 5 | ## Licensed under Apache 2.0, see LICENSE file for details 6 | 7 | ''' 8 | This script can be used to add firmware data to an existing knowledgebase 9 | ''' 10 | 11 | import os, sys, sqlite3, hashlib 12 | from optparse import OptionParser 13 | 14 | def gethash(path): 15 | scanfile = open("%s" % (path,), 'r') 16 | h = hashlib.new('sha256') 17 | h.update(scanfile.read()) 18 | scanfile.close() 19 | return h.hexdigest() 20 | 21 | def main(argv): 22 | parser = OptionParser() 23 | parser.add_option("-b", "--binary", dest="binary", help="path to binary", metavar="FILE") 24 | parser.add_option("-d", "--database", dest="db", help="path to database", metavar="FILE") 25 | parser.add_option("-c", "--chipset", dest="chipset", help="name of chipset", metavar="CHIPSET") 26 | parser.add_option("-f", "--firmwareversion", dest="fwversion", help="firmware version", metavar="FWVERSION") 27 | parser.add_option("-m", "--manufacturer", dest="vendor", help="name of manufacturer", metavar="MANUFACTURER") 28 | parser.add_option("-n", "--name", dest="name", help="name of device", metavar="NAME") 29 | parser.add_option("-u", "--upstream", dest="upstream", help="upstream vendor (optional)", metavar="UPSTREAM") 30 | parser.add_option("-w", "--hardwareversion", dest="hwversion", help="hardware version", metavar="HWVERSION") 31 | (options, args) = parser.parse_args() 32 | 33 | if options.db == None: 34 | parser.error("Path to database file needed") 35 | try: 36 | conn = sqlite3.connect(options.db) 37 | except: 38 | print "Can't open database file" 39 | sys.exit(1) 40 | 41 | if options.chipset == None: 42 | parser.error("Need name of chipset") 43 | if options.fwversion == None: 44 | parser.error("Need firmware version") 45 | if options.hwversion == None: 46 | parser.error("Need hardware version") 47 | if options.vendor == None: 48 | parser.error("Need manufacturer name") 49 | if options.name == None: 50 | parser.error("Need device name") 51 | if options.upstream == None: 52 | options.upstream = '' 53 | if options.binary == None: 54 | parser.error("Need path to binary") 55 | else: 56 | try: 57 | os.stat(options.binary) 58 | except: 59 | print >>sys.stderr, "Can't open binary" 60 | sys.exit(1) 61 | 62 | c = conn.cursor() 63 | 64 | c.execute('''create table if not exists device (id integer primary key autoincrement, vendor text, name text, version text, chipset text, upstream text)''') 65 | c.execute('''create table if not exists binary (id integer primary key autoincrement, sha256 text, deviceid integer)''') 66 | c.execute('''create index if not exists sha256_index on binary (sha256)''') 67 | 68 | t = (options.vendor, options.name, options.hwversion, options.chipset, options.upstream) 69 | c.execute('''insert into device(vendor, name, version,chipset, upstream) values (?, ?, ?, ?, ?)''', t) 70 | conn.commit() 71 | lastrow = c.lastrowid 72 | 73 | fwhash = gethash(options.binary) 74 | c.execute('''insert into binary (sha256, 
deviceid) values (?, ?)''', (fwhash, lastrow)) 75 | conn.commit() 76 | c.close() 77 | 78 | if __name__ == "__main__": 79 | main(sys.argv) 80 | -------------------------------------------------------------------------------- /src/knowledgebaseaddchipset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | ## Binary Analysis Tool 4 | ## Copyright 2009-2013 Armijn Hemel for Tjaldur Software Governance Solutions 5 | ## Licensed under Apache 2.0, see LICENSE file for details 6 | 7 | ''' 8 | This script can be used to add chipset data to an existing knowledgebase 9 | ''' 10 | 11 | import os, sys, sqlite3 12 | from optparse import OptionParser 13 | 14 | def main(argv): 15 | parser = OptionParser() 16 | parser.add_option("-d", "--database", dest="db", help="path to database", metavar="FILE") 17 | parser.add_option("-c", "--chipset", dest="chipset", help="name of chipset", metavar="CHIPSET") 18 | parser.add_option("-a", "--architecture", dest="architecture", help="chipset architecture (MIPS, ARM, etc.)", metavar="ARCHITECTURE") 19 | parser.add_option("-m", "--manufacturer", dest="manufacturer", help="chipset manufacturer", metavar="MANUFACTURER") 20 | (options, args) = parser.parse_args() 21 | 22 | if options.db == None: 23 | parser.error("Path to database file needed") 24 | try: 25 | conn = sqlite3.connect(options.db) 26 | except: 27 | print "Can't open database file" 28 | sys.exit(1) 29 | 30 | if options.chipset == None: 31 | parser.error("Need name of chipset") 32 | if options.manufacturer == None: 33 | parser.error("Need name of manufacturer") 34 | if options.architecture == None: 35 | parser.error("Need name of architecture") 36 | 37 | c = conn.cursor() 38 | 39 | ## insert some test data 40 | ## chipset information from http://wiki.openwrt.org/toh/start 41 | c.execute('''insert into chipset values (?, ?, ?)''', (options.chipset, options.manufacturer, options.architecture)) 42 | #c.execute('''insert into chipset values ('AR7', 'Texas Instruments', 'MIPS')''') 43 | #c.execute('''insert into chipset values ('BCM6851', 'Broadcom', 'MIPS')''') 44 | #c.execute('''insert into chipset values ('BCM4712', 'Broadcom', 'MIPS')''') 45 | conn.commit() 46 | c.close() 47 | 48 | if __name__ == "__main__": 49 | main(sys.argv) 50 | -------------------------------------------------------------------------------- /src/knowledgebaseinit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | ## Binary Analysis Tool 4 | ## Copyright 2009-2013 Armijn Hemel for Tjaldur Software Governance Solutions 5 | ## Licensed under Apache 2.0, see LICENSE file for details 6 | 7 | ''' 8 | This script can be used to initialize an empty knowledgebase and create all tables. 
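
A minimal usage sketch (the database path is only an example):

    python knowledgebaseinit.py -d knowledgebase.sqlite3

This creates the 'chipset', 'device' and 'binary' tables, plus an index on
the SHA256 column of 'binary', if they do not already exist.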
9 | ''' 10 | 11 | import os, sys, sqlite3 12 | from optparse import OptionParser 13 | 14 | def main(argv): 15 | parser = OptionParser() 16 | parser.add_option("-d", "--database", dest="db", help="path to database", metavar="FILE") 17 | (options, args) = parser.parse_args() 18 | if options.db == None: 19 | parser.error("Path to database file needed") 20 | try: 21 | conn = sqlite3.connect(options.db) 22 | except: 23 | print "Can't open database file" 24 | sys.exit(1) 25 | 26 | c = conn.cursor() 27 | 28 | ## create some tables 29 | c.execute('''create table if not exists chipset (name text, vendor text, family text)''') 30 | c.execute('''create table if not exists device (id integer primary key autoincrement, vendor text, name text, version text, chipset text, upstream text)''') 31 | c.execute('''create table if not exists binary (id integer primary key autoincrement, sha256 text, deviceid integer)''') 32 | c.execute('''create index if not exists sha256_index on binary (sha256)''') 33 | 34 | conn.commit() 35 | c.close() 36 | 37 | if __name__ == "__main__": 38 | main(sys.argv) 39 | -------------------------------------------------------------------------------- /src/maintenance/batextensions.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Binary Analysis Tool 5 | # Copyright 2009-2015 Armijn Hemel for Tjaldur Software Governance Solutions 6 | # Licensed under Apache 2.0, see LICENSE file for details 7 | 8 | # list of extensions, plus what language they should be mapped to 9 | # This is not necessarily correct, but right now it suffices. Ideally a parser 10 | # would be run on each file to see what kind of file it is. 11 | extensions = {'.c' : 'C', 12 | '.cc' : 'C', 13 | '.cpp' : 'C', 14 | '.cxx' : 'C', 15 | '.c++' : 'C', 16 | '.h' : 'C', 17 | '.hh' : 'C', 18 | '.hpp' : 'C', 19 | '.hxx' : 'C', 20 | '.l' : 'C', 21 | '.qml' : 'C', 22 | '.s' : 'C', 23 | '.txx' : 'C', 24 | '.y' : 'C', 25 | '.dts' : 'C', # specific to Linux kernel 26 | '.dtsi' : 'C', # specific to Linux kernel 27 | '.cs' : 'C#', 28 | '.groovy' : 'Java', 29 | '.java' : 'Java', 30 | '.jsp' : 'Java', 31 | '.scala' : 'Java', 32 | '.as' : 'ActionScript', 33 | '.js' : 'JavaScript', 34 | '.php' : 'PHP', 35 | '.py' : 'Python', 36 | '.rb' : 'Ruby', 37 | '.patch' : 'patch', 38 | '.diff' : 'patch', 39 | } 40 | -------------------------------------------------------------------------------- /src/maintenance/busybox-appletname-extractor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Binary Analysis Tool 4 | # Copyright 2009-2012 Armijn Hemel for Tjaldur Software Governance Solutions 5 | # Licensed under Apache 2.0, see LICENSE file for details 6 | 7 | # 8 | # Helper script to extract configurations from busybox source code. 9 | # Results are dumped as a pickle file, which can later be used by the 10 | # BusyBox processing scripts to map applet names back to configuration 11 | # directives. This is useful when comparing with a supplied configuration 12 | # file to see if these match. 13 | # 14 | 15 | # For newer BusyBox versions you first need to generate applets.h 16 | # First unpack the archive, go to the root of the unpacked archive and run: 17 | # ./scripts/gen_build_files.sh . . 
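#
# A usage sketch (the version and paths below are only examples): once
# applets.h exists, run
#
# python busybox-appletname-extractor.py -a busybox-1.19.4/include/applets.h -n 1.19.4
#
# which pickles the applet-to-configuration mapping into a file called
# '1.19.4-config' in the current working directory.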
18 | 19 | import sys, os, re, pickle 20 | from optparse import OptionParser 21 | 22 | def extract_major_version(version): 23 | return version.rsplit('.', version.count('.')-1)[0] 24 | 25 | # configs format: 26 | # {symbolic link name: (appletname, config option)} 27 | # Example: 28 | # {'sha1sum': ('md5_sha1_sum', 'SHA1SUM')} 29 | # 30 | def extract_configuration(lines, version): 31 | configs = {} 32 | if version >= "1.1.1": 33 | if extract_major_version(version) >= "1.15": 34 | prefix = "IF_" 35 | else: 36 | prefix = "USE_" 37 | for line in lines: 38 | configname = re.match("%s([\w_]*)\(APPLET_\w+\(([\w\.\-_\[]+),\s*([\w\.\-_]*)" % (prefix,), line.strip()) 39 | if configname != None: 40 | configs[configname.groups()[1]] = (configname.groups()[2], configname.groups()[0]) 41 | else: 42 | configname = re.match("%s([\w_]*)\(APPLET\(([\w\.\-_\[]+)" % (prefix,), line.strip()) 43 | if configname != None: 44 | configs[configname.groups()[1]] = (configname.groups()[1], configname.groups()[0]) 45 | else: 46 | if version < "1.00": 47 | prefix = "BB" 48 | else: 49 | prefix = "CONFIG" 50 | for line in range(0,len(lines) -1): 51 | config = re.match("#ifdef %s\_([\_\w]+)" % (prefix,), lines[line].strip()) 52 | if config == None: 53 | config = re.match("#if ENABLE\_([\_\w]+)", lines[line].strip()) 54 | if config == None: 55 | config = re.match("#if BB\_APPLET\_([\_\w]+)", lines[line].strip()) 56 | if config == None: 57 | config = re.match("#if defined\(%s\_(FEATURE\_[\_\w]+)\)" % (prefix,), lines[line].strip()) 58 | if config == None: 59 | continue 60 | configname = re.match("APPLET\(([\w\.\-\_\[]+), ([\w\_]+),", lines[line+1].strip()) 61 | if configname == None: 62 | configname = re.match("APPLET_(?:NOUSAGE|ODDNAME)\(\"([\w\.\-\_\[]+)\", ([\w\_]+),", lines[line+1].strip()) 63 | if configname != None: 64 | # remove _main from the name of the applet's main function, 65 | # assuming it matches the name of the applet 66 | configs[configname.groups()[0]] = (configname.groups()[1][:-5], config.groups()[0]) 67 | else: 68 | # remove _main from the name of the applet's main function, 69 | # assuming it matches the name of the applet 70 | configs[configname.groups()[0]] = (configname.groups()[1][:-5], config.groups()[0]) 71 | return configs 72 | 73 | def main(argv): 74 | parser = OptionParser() 75 | parser.add_option("-a", "--applets", action="store", dest="applets", help="path to applets.h", metavar="FILE") 76 | parser.add_option("-n", "--busyboxversion", action="store", dest="busyboxversion", help="BusyBox version", metavar="VERSION") 77 | (options, args) = parser.parse_args() 78 | if options.applets == None: 79 | parser.error("Path to applets.h in BusyBox directory needed") 80 | if options.busyboxversion == None: 81 | parser.error("BusyBox version needed") 82 | 83 | busybox_applets = open(options.applets, 'rb') 84 | busybox_lines = busybox_applets.readlines() 85 | version = options.busyboxversion 86 | bb_configuration = extract_configuration(busybox_lines, version) 87 | if bb_configuration != {}: 88 | output = open('%s-config' % (version, ), 'w') 89 | pickle.dump(bb_configuration, output) 90 | output.close() 91 | 92 | if __name__ == "__main__": 93 | main(sys.argv) 94 | -------------------------------------------------------------------------------- /src/maintenance/clonedbinit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # Binary Analysis Tool 4 | # Copyright 2013 Armijn Hemel for Tjaldur Software Governance Solutions 5 | # Licensed under Apache 2.0, see
LICENSE file for details 6 | 7 | ''' 8 | This script can be used to initialize the clone database. The clone database is 9 | used to record information about which packages should be treated as the same 10 | package, or under which alternative names it is known, and so on. 11 | 12 | Cloning of packages happens frequently: 13 | 14 | * renaming: packages are renamed for some reason, like politics 15 | * bundling: one package is copied entirely into another package, for example 16 | as "third party software" 17 | * partial copying: parts of a package have been copied into another package. 18 | Examples are glue code for some Python packages, where a large amount of 19 | packages share just one file 20 | 21 | Apart from verbatim cloning there is also cloning that happens in a more subtle 22 | way. For example, code was copied, then slightly adapted. It might not be the 23 | same when looking at SHA256 checksums of the files, but it might still look 24 | the same when looking at strings or function names. 25 | ''' 26 | 27 | import os 28 | import sys 29 | import sqlite3 30 | from optparse import OptionParser 31 | 32 | def main(argv): 33 | parser = OptionParser() 34 | parser.add_option("-d", "--database", dest="db", help="path to clone database", metavar="FILE") 35 | (options, args) = parser.parse_args() 36 | if options.db == None: 37 | parser.error("Path to clone database file needed") 38 | try: 39 | conn = sqlite3.connect(options.db) 40 | except: 41 | print "Can't open clone database file" 42 | sys.exit(1) 43 | 44 | c = conn.cursor() 45 | 46 | # create table for renamed packages 47 | c.execute('''create table if not exists renames (originalname text, newname text)''') 48 | c.execute('''create index if not exists renames_index_originalname on renames (originalname)''') 49 | c.execute('''create index if not exists renames_index_newname on renames (newname)''') 50 | 51 | # insert some values as examples 52 | c.execute('''insert into renames values ('ethereal', 'wireshark')''') 53 | c.execute('''insert into renames values ('koffice', 'calligra')''') 54 | c.execute('''insert into renames values ('ucd-snmp', 'net-snmp')''') 55 | c.execute('''insert into renames values ('iproute', 'iproute2')''') 56 | c.execute('''insert into renames values ('gaim', 'pidgin')''') 57 | c.execute('''insert into renames values ('kdebase-runtime', 'kde-runtime')''') 58 | c.execute('''insert into renames values ('kdebase-workspace', 'kde-workspace')''') 59 | c.execute('''insert into renames values ('eglibc', 'glibc')''') 60 | c.execute('''insert into renames values ('org.apache.servicemix.bundles.ant', 'apache-ant')''') 61 | c.execute('''insert into renames values ('wengophone', 'qutecom')''') 62 | c.execute('''insert into renames values ('gaim-plugin_pack', 'purple-plugin_pack')''') 63 | 64 | conn.commit() 65 | c.close() 66 | conn.close() 67 | 68 | if __name__ == "__main__": 69 | main(sys.argv) 70 | -------------------------------------------------------------------------------- /src/maintenance/copybatarchives.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Binary Analysis Tool 4 | # Copyright 2014-2015 Armijn Hemel for Tjaldur Software Governance Solutions 5 | # Licensed under Apache 2.0, see LICENSE file for details 6 | 7 | ''' 8 | Script to copy BAT archive files efficiently. Takes three arguments: 9 | 10 | 1. directory with 'original' archives (used to create the BAT archives) 11 | 2. directory with BAT archives 12 | 3. 
target directory to copy the BAT archives to 13 | 14 | It is very important that 1. is the same directory as the one used to generate the BAT archives 15 | ''' 16 | 17 | import sys 18 | import os 19 | import re 20 | import subprocess 21 | import shutil 22 | import stat 23 | from optparse import OptionParser 24 | from multiprocessing import Pool 25 | 26 | def main(argv): 27 | parser = OptionParser() 28 | parser.add_option("-a", "--archivedir", action="store", dest="archivedir", help="path to directory with BAT archives", metavar="DIR") 29 | parser.add_option("-o", "--origdir", action="store", dest="origdir", help="directory with original archives", metavar="DIR") 30 | parser.add_option("-t", "--targetdir", action="store", dest="targetdir", help="target directory", metavar="DIR") 31 | (options, args) = parser.parse_args() 32 | 33 | if options.archivedir == None: 34 | parser.error("specify archivedir") 35 | else: 36 | try: 37 | archivelist = open(os.path.join(options.archivedir,"ARCHIVELIST")).readlines() 38 | except: 39 | parser.error("'ARCHIVELIST' not found in archive dir") 40 | if options.origdir == None: 41 | parser.error("specify origdir") 42 | else: 43 | try: 44 | filelist = open(os.path.join(options.origdir,"LIST")).readlines() 45 | except: 46 | parser.error("'LIST' not found in orig dir") 47 | 48 | archives = os.listdir(options.archivedir) 49 | archivenames = set() 50 | for a in archives: 51 | asplits = a.rsplit('.', 2) 52 | if len(asplits) != 3: 53 | continue 54 | if asplits[2] != 'bz2': 55 | continue 56 | if asplits[1] != 'tar': 57 | continue 58 | if not 'bat' in asplits[0]: 59 | continue 60 | archivenames.add(a) 61 | 62 | if options.targetdir == None: 63 | parser.error("specify targetdir") 64 | else: 65 | if not os.path.exists(options.targetdir): 66 | parser.error("targetdir does not exist") 67 | 68 | copyfromarchives = set() 69 | copyfromorig = set() 70 | archivetometa = {} 71 | 72 | for unpackfile in filelist: 73 | try: 74 | unpacks = unpackfile.strip().split() 75 | if len(unpacks) == 4: 76 | (package, version, filename, origin) = unpacks 77 | if '%s-%s-%s-bat.tar.bz2' % (package, version, origin) in archivenames: 78 | copyfromarchives.add('%s-%s-%s-bat.tar.bz2' % (package, version, origin)) 79 | archivetometa['%s-%s-%s-bat.tar.bz2' % (package, version, origin)] = (version, origin) 80 | else: 81 | copyfromorig.add(filename) 82 | except: 83 | pass 84 | 85 | print "copying %d archives" % len(copyfromarchives) 86 | for i in copyfromarchives: 87 | shutil.copy(os.path.join(options.archivedir, i), options.targetdir) 88 | print "copying %d original files" % len(copyfromorig) 89 | for i in copyfromorig: 90 | shutil.copy(os.path.join(options.origdir, i), options.targetdir) 91 | print "copying manifests" 92 | if os.path.exists(os.path.join(options.origdir, 'MANIFESTS')): 93 | os.mkdir(os.path.join(options.targetdir, 'MANIFESTS')) 94 | manifests = os.listdir(os.path.join(options.origdir, 'MANIFESTS')) 95 | for i in manifests: 96 | shutil.copy(os.path.join(options.origdir, 'MANIFESTS', i), os.path.join(options.targetdir, 'MANIFESTS')) 97 | if os.path.exists(os.path.join(options.archivedir, 'MANIFESTS')): 98 | manifests = os.listdir(os.path.join(options.archivedir, 'MANIFESTS')) 99 | for i in manifests: 100 | shutil.copy(os.path.join(options.archivedir, 'MANIFESTS', i), os.path.join(options.targetdir, 'MANIFESTS')) 101 | print "copying checksums and downloadurl" 102 | if os.path.exists(os.path.join(options.origdir, 'SHA256SUM')): 103 | shutil.copy(os.path.join(options.origdir, 'SHA256SUM'),
options.targetdir) 104 | if os.path.exists(os.path.join(options.origdir, 'DOWNLOADURL')): 105 | shutil.copy(os.path.join(options.origdir, 'DOWNLOADURL'), options.targetdir) 106 | #if os.path.exists(os.path.join(options.origdir, 'SHA256SUM')): 107 | #sha256sums = open(os.path.join(options.origdir, 'SHA256SUM')).readlines() 108 | #if os.path.exists(os.path.join(options.archivedir, 'SHA256SUM')): 109 | #sha256sums = open(os.path.join(options.archivedir, 'SHA256SUM')).readlines() 110 | 111 | print "writing LIST" 112 | newlistfile = open(os.path.join(options.targetdir, "LIST"), 'wb') 113 | # walk the original LIST file and write lines for the files for which there are no archives 114 | for f in filelist: 115 | unpacks = f.strip().split() 116 | filename = unpacks[2] 117 | if filename in copyfromorig: 118 | newlistfile.write(f) 119 | # then walk the list for archives 120 | for f in archivelist: 121 | archivename = f.strip() 122 | if archivename in copyfromarchives: 123 | (version, origin) = archivetometa[archivename] 124 | newlistfile.write("%s\t%s\t%s\t%s\tbatarchive\n" % (archivename[:-12], version, archivename, origin)) 125 | newlistfile.close() 126 | 127 | if __name__ == "__main__": 128 | main(sys.argv) 129 | -------------------------------------------------------------------------------- /src/maintenance/createdb.config: -------------------------------------------------------------------------------- 1 | [extractconfig] 2 | configtype = global 3 | 4 | ## first the database credentials 5 | postgresql_user = bat 6 | postgresql_password = bat 7 | postgresql_db = bat 8 | #postgresql_port = 5432 9 | #postgresql_host = 127.0.0.1 10 | # 11 | ## first the database credentials 12 | #auth_postgresql_user = bat 13 | #auth_postgresql_password = bat 14 | #auth_postgresql_db = bat_old 15 | #auth_postgresql_port = 5432 16 | #auth_postgresql_host = 127.0.0.1 17 | #authcopy = string:function:variable 18 | # 19 | 20 | ## 21 | scanlicense = yes 22 | scancopyright = yes 23 | scansecurity = yes 24 | cleanup = yes 25 | ## should the database be wiped? 
26 | wipe = no 27 | ## directory where to unpack sources 28 | unpackdir = /ramdisk 29 | extrahashes = md5:sha1:crc32:tlsh 30 | nomoschunks = 10 31 | urlcutoff = 1000 32 | maxstringcutoff = 1000 33 | minstringcutoff = 4 34 | ## below are for generatelistrpm.py 35 | patchesdir = /tmp/patches 36 | rpmdb = /tmp/rpmdb.sqlite3 37 | insecurerpm = yes 38 | ## cutoff is 200 MiB 39 | cutoff = 209715200 40 | 41 | ## configuration for CVE parser 42 | [cveconfig] 43 | ## directory where to store patches that are mentioned 44 | ## in a CVE report 45 | patchdir = /home/bat/cve/git 46 | 47 | ## now follows the per-package configuration 48 | 49 | [bash] 50 | configtype = package 51 | extensions = .def:C 52 | 53 | [chromium] 54 | configtype = package 55 | blacklist = icudt46l_dat.S:icudt42l_dat.S:icudtl_dat.S:icudt42l_dat.s 56 | 57 | [qt] 58 | configtype = package 59 | blacklist = icudt46l_dat.S:icudt42l_dat.S:icudtl_dat.S:icudt42l_dat.s 60 | 61 | [freecad] 62 | configtype = package 63 | blacklist = Arch_rc.py 64 | 65 | [linux] 66 | configtype = package 67 | alwaysscan = string:function 68 | -------------------------------------------------------------------------------- /src/maintenance/createfiledatabasedebian.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Binary Analysis Tool 4 | # Copyright 2012-2016 Armijn Hemel for Tjaldur Software Governance Solutions 5 | # Licensed under Apache 2.0, see LICENSE file for details 6 | 7 | ''' 8 | This script mines data from Debian package databases (available on any Debian 9 | mirror as Contents-$ARCH.gz) and puts it in another database. 10 | ''' 11 | 12 | import os 13 | import os.path 14 | import sys 15 | import psycopg2 16 | import gzip 17 | import ConfigParser 18 | from optparse import OptionParser 19 | 20 | def main(argv): 21 | config = ConfigParser.ConfigParser() 22 | parser = OptionParser() 23 | parser.add_option("-c", "--config", action="store", dest="cfg", help="path to configuration file", metavar="FILE") 24 | parser.add_option("-f", "--file", action="store", dest="contentsfile", help="path to file containing contents of Debian packages", metavar="FILE") 25 | 26 | (options, args) = parser.parse_args() 27 | if options.contentsfile == None: 28 | parser.error("Need path to Debian packages file") 29 | 30 | if not os.path.exists(options.contentsfile): 31 | print >>sys.stderr, "Debian packages file does not exist" 32 | sys.stderr.flush() 33 | sys.exit(1) 34 | 35 | if options.cfg == None: 36 | parser.error("Need path to configuration file") 37 | 38 | try: 39 | configfile = open(options.cfg, 'r') 40 | except: 41 | parser.error("Configuration file not readable") 42 | 43 | config.readfp(configfile) 44 | configfile.close() 45 | 46 | section = 'extractconfig' 47 | 48 | try: 49 | postgresql_user = config.get(section, 'postgresql_user') 50 | postgresql_password = config.get(section, 'postgresql_password') 51 | postgresql_db = config.get(section, 'postgresql_db') 52 | 53 | # check to see if a host (IP-address) was supplied either 54 | # as host or hostaddr. hostaddr is not supported on older 55 | # versions of psycopg2, for example CentOS 6.6, so it is not 56 | # used at the moment. 57 | try: 58 | postgresql_host = config.get(section, 'postgresql_host') 59 | except: 60 | postgresql_host = None 61 | try: 62 | postgresql_hostaddr = config.get(section, 'postgresql_hostaddr') 63 | except: 64 | postgresql_hostaddr = None 65 | # check to see if a port was specified.
If not, default to 'None' 66 | try: 67 | postgresql_port = config.get(section, 'postgresql_port') 68 | except Exception, e: 69 | postgresql_port = None 70 | except: 71 | print >>sys.stderr, "Database connection not defined in configuration file. Exiting..." 72 | sys.stderr.flush() 73 | sys.exit(1) 74 | try: 75 | conn = psycopg2.connect(database=postgresql_db, user=postgresql_user, password=postgresql_password, host=postgresql_host, port=postgresql_port) 76 | 77 | cursor = conn.cursor() 78 | except: 79 | print >>sys.stderr, "Can't open database" 80 | sys.exit(1) 81 | 82 | contents = gzip.open(options.contentsfile) 83 | seenstart = False 84 | for i in contents: 85 | if not seenstart: 86 | if i.startswith('FILE'): 87 | seenstart = True 88 | continue 89 | else: 90 | continue 91 | packageversion = '' 92 | (filepath, categorypackage) = i.strip().rsplit(' ', 1) 93 | package = categorypackage.rsplit('/')[-1].strip() 94 | 95 | cursor.execute("insert into file values (%s,%s,%s,%s, 'Debian', %s)", (os.path.basename(filepath.strip()), os.path.dirname(filepath.strip()), package, packageversion, '')) 96 | 97 | contents.close() 98 | conn.commit() 99 | cursor.close() 100 | conn.close() 101 | 102 | if __name__ == "__main__": 103 | main(sys.argv) 104 | -------------------------------------------------------------------------------- /src/maintenance/createfiledatabasefedora.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Binary Analysis Tool 4 | # Copyright 2012-2016 Armijn Hemel for Tjaldur Software Governance Solutions 5 | # Licensed under Apache 2.0, see LICENSE file for details 6 | 7 | ''' 8 | This script mines data from Fedora package databases (available on any Fedora 9 | mirror under os/repodata) and puts it in another database.
10 | 11 | The names of the files that are needed end in "filelists.sqlite.bz2" 12 | (file list database) and "primary.sqlite.bz2" (package database) 13 | 14 | Example: linux/releases/24/Everything/x86_64/os/repodata/ 15 | 16 | The files need to be decompressed first 17 | ''' 18 | 19 | import os 20 | import os.path 21 | import sys 22 | import sqlite3 23 | import psycopg2 24 | from optparse import OptionParser 25 | import ConfigParser 26 | 27 | # select version,name,pkgKey from packages; 28 | # store in {pkgKey: {'name': name, 'version': version}} 29 | # from other database: 30 | # select version,name,pkgKey from packages; 31 | # process all files (not directories) 32 | # store in database 33 | 34 | def main(argv): 35 | config = ConfigParser.ConfigParser() 36 | parser = OptionParser() 37 | parser.add_option("-c", "--config", action="store", dest="cfg", help="path to configuration file", metavar="FILE") 38 | parser.add_option("-f", "--filelistdatabase", action="store", dest="filelistdatabase", help="path to database containing file info (filelists.sqlite)", metavar="FILE") 39 | parser.add_option("-p", "--packagedatabase", action="store", dest="packagedatabase", help="path to database containing package info (primary.sqlite)", metavar="FILE") 40 | parser.add_option("-s", "--fedoraversion", action="store", dest="fedoraversion", help="Fedora version", metavar="VERSION") 41 | 42 | (options, args) = parser.parse_args() 43 | 44 | if options.cfg == None: 45 | parser.error("Need path to configuration file") 46 | 47 | try: 48 | configfile = open(options.cfg, 'r') 49 | except: 50 | parser.error("Configuration file not readable") 51 | config.readfp(configfile) 52 | configfile.close() 53 | 54 | section = 'extractconfig' 55 | 56 | try: 57 | postgresql_user = config.get(section, 'postgresql_user') 58 | postgresql_password = config.get(section, 'postgresql_password') 59 | postgresql_db = config.get(section, 'postgresql_db') 60 | 61 | # check to see if a host (IP-address) was supplied either 62 | # as host or hostaddr. hostaddr is not supported on older 63 | # versions of psycopg2, for example CentOS 6.6, so it is not 64 | # used at the moment. 65 | try: 66 | postgresql_host = config.get(section, 'postgresql_host') 67 | except: 68 | postgresql_host = None 69 | try: 70 | postgresql_hostaddr = config.get(section, 'postgresql_hostaddr') 71 | except: 72 | postgresql_hostaddr = None 73 | # check to see if a port was specified. If not, default to 'None' 74 | try: 75 | postgresql_port = config.get(section, 'postgresql_port') 76 | except Exception, e: 77 | postgresql_port = None 78 | except: 79 | print >>sys.stderr, "Database connection not defined in configuration file. Exiting..." 
80 | sys.stderr.flush() 81 | sys.exit(1) 82 | try: 83 | conn = psycopg2.connect(database=postgresql_db, user=postgresql_user, password=postgresql_password, host=postgresql_host, port=postgresql_port) 84 | 85 | cursor = conn.cursor() 86 | except: 87 | print >>sys.stderr, "Can't open database" 88 | sys.exit(1) 89 | 90 | if options.filelistdatabase == None or options.packagedatabase == None: 91 | parser.error("Provide paths to Fedora databases") 92 | if options.fedoraversion == None: 93 | parser.error("Provide version of Fedora") 94 | 95 | filelistconn = sqlite3.connect(options.filelistdatabase) 96 | filelistcursor = filelistconn.cursor() 97 | 98 | packageconn = sqlite3.connect(options.packagedatabase) 99 | packagecursor = packageconn.cursor() 100 | 101 | pkgnameversion = {} 102 | packagecursor.execute("select pkgKey, name, version from packages") 103 | res = packagecursor.fetchall() 104 | packageconn.commit() 105 | for i in res: 106 | pkgnameversion[i[0]] = {'name': i[1], 'version': i[2]} 107 | packagecursor.close() 108 | packageconn.close() 109 | 110 | for pkg in pkgnameversion.keys(): 111 | filelistcursor.execute("select pkgKey, dirname, filenames, filetypes from filelist where pkgKey=%d" % pkg) 112 | res = filelistcursor.fetchall() 113 | distroversion='' 114 | for r in res: 115 | (pkgKey, dirname, filenames, filetypes) = r 116 | files = filenames.split('/') 117 | # very crude filter to take care of '/' in filenames, which split will 118 | # turn into ['', ''] 119 | if '' in files: 120 | newfiles = [] 121 | empty = False 122 | for i in range(0,len(files)): 123 | if files[i] == '': 124 | if not empty: 125 | empty = True 126 | continue 127 | else: 128 | newfiles.append('/') 129 | empty = False 130 | else: 131 | newfiles.append(files[i]) 132 | empty = False 133 | files = newfiles 134 | for i in range(0,len(files)): 135 | if files[i] == '': 136 | continue 137 | if filetypes[i] == 'd': 138 | continue 139 | cursor.execute("insert into file values (%s,%s,%s,%s, 'Fedora', %s)", (files[i], dirname, pkgnameversion[pkg]['name'], pkgnameversion[pkg]['version'], options.fedoraversion)) 140 | #print dirname, files[i], pkgnameversion[pkg] 141 | filelistcursor.close() 142 | filelistconn.close() 143 | conn.commit() 144 | cursor.close() 145 | conn.close() 146 | 147 | if __name__ == "__main__": 148 | main(sys.argv) 149 | -------------------------------------------------------------------------------- /src/maintenance/dumplist.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Binary Analysis Tool 4 | # Copyright 2012-2016 Armijn Hemel for Tjaldur Software Governance Solutions 5 | # Licensed under Apache 2.0, see LICENSE file for details 6 | 7 | ''' 8 | This script can be used to regenerate a LIST file from a database. This 9 | can be useful in situations like a disk crash (where only the 'processed' table 10 | could be recovered), or in case of errors in the extraction scripts where parts 11 | of the database have to be regenerated. 12 | 13 | By default the script writes data for files from all origins, unless 'origin' 14 | is specified. 15 | 16 | This script needs the same configuration file as the database creation script.
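
A minimal invocation sketch (the file names and origin are only examples):

    python dumplist.py -c createdb.config -l LIST -o gnu

This writes one tab-separated 'package version filename origin' line to LIST
for every matching entry in the 'processed' table.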
17 | ''' 18 | 19 | import os 20 | import os.path 21 | import re 22 | import sys 23 | import fnmatch 24 | import ConfigParser 25 | from optparse import OptionParser 26 | 27 | import psycopg2 28 | 29 | def main(argv): 30 | config = ConfigParser.ConfigParser() 31 | parser = OptionParser() 32 | parser.add_option("-c", "--configuration", action="store", dest="cfg", help="path to configuration file", metavar="FILE") 33 | parser.add_option("-l", "--listfile", action="store", dest="listfile", help="path to LIST file (output)", metavar="FILE") 34 | parser.add_option("-o", "--origin", action="store", dest="origin", help="optional origin filter") 35 | 36 | (options, args) = parser.parse_args() 37 | if options.listfile == None: 38 | parser.error("Need path to LIST file") 39 | if options.cfg == None: 40 | parser.error("Need path to configuration file") 41 | 42 | try: 43 | configfile = open(options.cfg, 'r') 44 | except: 45 | parser.error("Configuration file not readable") 46 | config.readfp(configfile) 47 | configfile.close() 48 | 49 | section = 'extractconfig' 50 | 51 | try: 52 | postgresql_user = config.get(section, 'postgresql_user') 53 | postgresql_password = config.get(section, 'postgresql_password') 54 | postgresql_db = config.get(section, 'postgresql_db') 55 | 56 | # check to see if a host (IP-address) was supplied either 57 | # as host or hostaddr. hostaddr is not supported on older 58 | # versions of psycopg2, for example CentOS 6.6, so it is not 59 | # used at the moment. 60 | try: 61 | postgresql_host = config.get(section, 'postgresql_host') 62 | except: 63 | postgresql_host = None 64 | try: 65 | postgresql_hostaddr = config.get(section, 'postgresql_hostaddr') 66 | except: 67 | postgresql_hostaddr = None 68 | # check to see if a port was specified. If not, default to 'None' 69 | try: 70 | postgresql_port = config.get(section, 'postgresql_port') 71 | except Exception, e: 72 | postgresql_port = None 73 | except: 74 | print >>sys.stderr, "Database connection not defined in configuration file. Exiting..." 75 | sys.stderr.flush() 76 | sys.exit(1) 77 | try: 78 | conn = psycopg2.connect(database=postgresql_db, user=postgresql_user, password=postgresql_password, host=postgresql_host, port=postgresql_port) 79 | 80 | cursor = conn.cursor() 81 | except: 82 | print >>sys.stderr, "Can't open database" 83 | sys.exit(1) 84 | 85 | # TODO: add some sanity checks for 'origin' first 86 | if options.origin != None: 87 | cursor.execute("select package, version, filename, origin from processed where origin=%s", (options.origin,)) 88 | else: 89 | cursor.execute("select package, version, filename, origin from processed") 90 | res = cursor.fetchall() 91 | cursor.close() 92 | conn.close() 93 | 94 | if res != []: 95 | listfile = open(options.listfile, 'w') 96 | for i in res: 97 | (package, version, filename, origin) = i 98 | listfile.write("%s\t%s\t%s\t%s\n" % (package, version, filename, origin)) 99 | listfile.flush() 100 | listfile.close() 101 | 102 | if __name__ == "__main__": 103 | main(sys.argv) 104 | -------------------------------------------------------------------------------- /src/maintenance/findthirdparty.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | 3 | # Binary Analysis Tool 4 | # Copyright 2015-2016 Armijn Hemel for Tjaldur Software Governance Solutions 5 | # Licensed under Apache 2.0, see LICENSE file for details 6 | 7 | ''' 8 | This script finds clones in packages that are very specifically indicated in 9 | the source code tree of a package as "third party" by looking if certain 10 | patterns occur in path names. 11 | ''' 12 | 13 | import sys 14 | import os 15 | import psycopg2 16 | import multiprocessing 17 | from optparse import OptionParser 18 | import ConfigParser 19 | 20 | def main(argv): 21 | config = ConfigParser.ConfigParser() 22 | parser = OptionParser() 23 | parser.add_option("-c", "--config", action="store", dest="cfg", help="path to configuration file", metavar="FILE") 24 | parser.add_option("-t", "--test", action="store_true", dest="dryrun", help="do a test run, only report", metavar="TEST") 25 | (options, args) = parser.parse_args() 26 | 27 | if options.cfg == None: 28 | parser.error("No configuration file found") 29 | 30 | if not os.path.exists(options.cfg): 31 | parser.error("Configuration file does not exist") 32 | try: 33 | configfile = open(options.cfg, 'r') 34 | except: 35 | parser.error("Configuration file not readable") 36 | config.readfp(configfile) 37 | configfile.close() 38 | 39 | if not options.dryrun: 40 | options.dryrun = False 41 | 42 | # search configuration to see if it is correct and/or not malformed 43 | # first search for a section called 'extractconfig' with configtype = global 44 | for section in config.sections(): 45 | if section == "extractconfig": 46 | try: 47 | postgresql_user = config.get(section, 'postgresql_user') 48 | postgresql_password = config.get(section, 'postgresql_password') 49 | postgresql_db = config.get(section, 'postgresql_db') 50 | 51 | # check to see if a host (IP-address) was supplied either 52 | # as host or hostaddr. hostaddr is not supported on older 53 | # versions of psycopg2, for example CentOS 6.6, so it is not 54 | # used at the moment. 55 | try: 56 | postgresql_host = config.get(section, 'postgresql_host') 57 | except: 58 | postgresql_host = None 59 | try: 60 | postgresql_hostaddr = config.get(section, 'postgresql_hostaddr') 61 | except: 62 | postgresql_hostaddr = None 63 | 64 | # check to see if a port was specified. If not, default to 'None' 65 | try: 66 | postgresql_port = config.get(section, 'postgresql_port') 67 | except Exception, e: 68 | postgresql_port = None 69 | except: 70 | print >>sys.stderr, "Database connection not defined in configuration file. Exiting..." 
71 | sys.stderr.flush() 72 | sys.exit(1) 73 | 74 | try: 75 | conn = psycopg2.connect(database=postgresql_db, user=postgresql_user, password=postgresql_password, host=postgresql_host, port=postgresql_port) 76 | cursor = conn.cursor() 77 | packagecursor = conn.cursor() 78 | except: 79 | print >>sys.stderr, "Database not running or misconfigured" 80 | sys.exit(1) 81 | 82 | packages = cursor.execute("select package, version, origin from processed") 83 | packages = cursor.fetchall() 84 | conn.commit() 85 | 86 | ignorepackages = ['linux', 'busybox'] 87 | 88 | packages = map(lambda x: x[:2], packages) 89 | 90 | packages.sort() 91 | 92 | thirdparty = set(['thirdparty', 'third_party', '3rdparty', '3rdpart']) 93 | 94 | seensha256 = set() 95 | for i in packages: 96 | packagecursor.execute("select distinct checksum,thirdparty from processed_file where package=%s and version=%s", i) 97 | while True: 98 | res = packagecursor.fetchmany(50000) 99 | conn.commit() 100 | if len(res) == 0: 101 | break 102 | for s in res: 103 | if s[0] in seensha256: 104 | continue 105 | if s[1] != None: 106 | continue 107 | checksum = s[0] 108 | cursor.execute("select distinct package,pathname,thirdparty from processed_file where checksum=%s", (checksum,)) 109 | packageres = cursor.fetchall() 110 | conn.commit() 111 | packageres = filter(lambda x: x[0] != i[0], packageres) 112 | for p in packageres: 113 | if p[0] in ignorepackages: 114 | continue 115 | if p[2] != None: 116 | continue 117 | # check if specific markers are in the path 118 | if i[0] in os.path.dirname(p[1]): 119 | marked = False 120 | for t in thirdparty: 121 | if t in os.path.dirname(p[1]): 122 | if options.dryrun: 123 | print i[0], i[1], checksum, p[:-1] 124 | else: 125 | cursor.execute("update processed_file set thirdparty=%s where package=%s and pathname=%s and checksum=%s", (True, p[0], p[1], checksum)) 126 | marked = True 127 | break 128 | if 'external' in os.path.dirname(p[1]) and not marked: 129 | if options.dryrun: 130 | print i[0], i[1], checksum, p[:-1] 131 | else: 132 | cursor.execute("update processed_file set thirdparty=%s where package=%s and pathname=%s and checksum=%s", (True, p[0], p[1], checksum)) 133 | else: 134 | if options.dryrun: 135 | pass 136 | #print i[0], i[1], checksum, p[:-1] 137 | else: 138 | pass 139 | conn.commit() 140 | seensha256.add(s[0]) 141 | conn.commit() 142 | packagecursor.close() 143 | cursor.close() 144 | conn.close() 145 | 146 | if __name__ == "__main__": 147 | main(sys.argv) 148 | -------------------------------------------------------------------------------- /src/maintenance/generatelist-fdroid.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Binary Analysis Tool 4 | # Copyright 2011-2016 Armijn Hemel for Tjaldur Software Governance Solutions 5 | # Licensed under Apache 2.0, see LICENSE file for details 6 | 7 | ''' 8 | Helper script to generate the LIST files for the string extraction scripts. 9 | While this script is not foolproof, it will save lots of typing :-) 10 | ''' 11 | 12 | import sys 13 | import os 14 | import os.path 15 | import bz2 16 | import tarfile 17 | import gzip 18 | from optparse import OptionParser 19 | 20 | # translation table for renames. 
None currently for F-Droid 21 | packagerenames = {} 22 | 23 | def generatelist(filedir): 24 | files = os.walk(filedir) 25 | try: 26 | while True: 27 | i = files.next() 28 | for p in i[2]: 29 | if p == "LIST" or p == 'SHA256SUM': 30 | continue 31 | # first determine things like the extension 32 | res = p.rsplit('_src.tar.gz', 1) 33 | if len(res) != 2: 34 | continue 35 | (packageversion, extension) = res 36 | (package, version) = packageversion.rsplit('_', 1) 37 | # f-droid specific package renames go here 38 | if package in packagerenames: 39 | package = packagerenames[package] 40 | print "%s\t%s\t%s\tf-droid" % (package, version, p) 41 | 42 | except Exception, e: 43 | print >>sys.stderr, e 44 | sys.stderr.flush() 45 | 46 | def main(argv): 47 | parser = OptionParser() 48 | parser.add_option("-f", "--filedir", action="store", dest="filedir", help="path to directory containing files to unpack", metavar="DIR") 49 | (options, args) = parser.parse_args() 50 | if options.filedir == None: 51 | print >>sys.stderr, "Specify dir with files" 52 | sys.exit(1) 53 | generatelist(options.filedir) 54 | 55 | if __name__ == "__main__": 56 | main(sys.argv) 57 | -------------------------------------------------------------------------------- /src/maintenance/generatelist.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Binary Analysis Tool 4 | # Copyright 2011-2015 Armijn Hemel for Tjaldur Software Governance Solutions 5 | # Licensed under Apache 2.0, see LICENSE file for details 6 | 7 | ''' 8 | Helper script to generate the LIST files for the string extraction scripts. 9 | While this script is not foolproof, it will save lots of typing :-) 10 | ''' 11 | 12 | import sys 13 | import os 14 | import os.path 15 | from optparse import OptionParser 16 | 17 | # it's either in the form of: 18 | # package-version.extension 19 | # package_version.extension 20 | # where extension is tar.gz, tar.bz2, tar.xz, tgz, zip, tbz2, etc. 
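#
# A sketch of the expected output (hypothetical archive names): scanning a
# directory containing foo-1.2.tar.gz and bar_3.4.tbz2 with origin 'gnu'
# prints, with the four fields separated by tabs:
#
# foo 1.2 foo-1.2.tar.gz gnu
# bar 3.4 bar_3.4.tbz2 gnu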
21 | def generatelist(filedir, origin): 22 | files = os.walk(filedir) 23 | try: 24 | while True: 25 | i = files.next() 26 | for p in i[2]: 27 | if p == "LIST": 28 | continue 29 | if p == "SHA256SUM": 30 | continue 31 | if p == "DOWNLOADURL": 32 | continue 33 | # first determine things like the extension 34 | res = p.rsplit('.', 1) 35 | if len(res) == 1: 36 | print >>sys.stderr, "can't split %s -- add manually" % (p,) 37 | continue 38 | (packageversion, extension) = res 39 | if extension in ["tgz", "tbz2"]: 40 | pass 41 | elif extension in ["jar", "zip"]: 42 | pass 43 | else: 44 | try: 45 | (packageversion, extension, compression) = p.rsplit('.', 2) 46 | except: 47 | continue 48 | if not (extension in ["tar"] and compression in ["gz", "bz2", "xz", "lz", "lzma", "Z"]): 49 | continue 50 | # exceptions go here 51 | if "wireless_tools" in packageversion: 52 | res = packageversion.rsplit(".", 1) 53 | # first try package-version 54 | else: 55 | res = packageversion.rsplit("-", 1) 56 | if len(res) == 1: 57 | # then try package_version 58 | res = packageversion.rsplit("_", 1) 59 | if len(res) == 1: 60 | print >>sys.stderr, "can't split %s -- add manually" % (p,) 61 | continue 62 | # perhaps there is a better split possible 63 | if res[1] in ['src', 'source', 'sources', 'Source', 'CLEAN', 'RHsemiCLEAN', 'RHCLEAN']: 64 | if '-' in res[0]: 65 | pass 66 | (package, version) = res 67 | print "%s\t%s\t%s\t%s" % (package, version, p, origin) 68 | except Exception: 69 | pass 70 | 71 | def main(argv): 72 | parser = OptionParser() 73 | parser.add_option("-f", "--filedir", action="store", dest="filedir", help="path to directory containing files to unpack", metavar="DIR") 74 | parser.add_option("-o", "--origin", action="store", dest="origin", help="origin of packages (default: unknown)", metavar="ORIGIN") 75 | (options, args) = parser.parse_args() 76 | if options.filedir is None: 77 | parser.error("Specify dir with files") 78 | if options.origin is None: 79 | origin = "unknown" 80 | else: 81 | origin = options.origin 82 | generatelist(options.filedir, origin) 83 | 84 | if __name__ == "__main__": 85 | main(sys.argv) 86 | -------------------------------------------------------------------------------- /src/maintenance/packagerename.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Binary Analysis Tool 4 | # Copyright 2012-2013 Armijn Hemel for Tjaldur Software Governance Solutions 5 | # Licensed under Apache 2.0, see LICENSE file for details 6 | 7 | ''' 8 | This script mass renames files in the database. It uses a file with names and 9 | versions of packages, plus the new name and version the package should be 10 | given. Per package one line is used. Each line has four fields, separated by | 11 | 12 | oldname|oldversion|newname|newversion 13 | 14 | Optionally takes extra argument to dump data. This is useful to update the caches 15 | without having to regenerate the complete cache (which can take a looong time). 
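
A usage sketch (the file names and the rename entry are only examples): with
a rename file containing the single line

    gaim|2.0.0|pidgin|2.0.0

running

    python packagerename.py -d master.sqlite3 -r renames.txt -p dump.pickle

moves the matching rows in 'processed' and 'processed_file' from gaim 2.0.0
to pidgin 2.0.0 and dumps the affected strings, function names and variable
names to dump.pickle.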
16 | ''' 17 | 18 | import sys 19 | import os 20 | import sqlite3 21 | import cPickle 22 | from optparse import OptionParser 23 | 24 | def main(argv): 25 | parser = OptionParser() 26 | parser.add_option("-d", "--database", action="store", dest="db", help="path to database file", metavar="FILE") 27 | parser.add_option("-r", "--rename", action="store", dest="removal", help="path to file listing package/version that need to be renamed", metavar="FILE") 28 | parser.add_option("-p", "--dump", action="store", dest="pickle", help="path to dump file", metavar="FILE") 29 | (options, args) = parser.parse_args() 30 | 31 | if options.db == None: 32 | parser.error("No database found") 33 | 34 | if options.removal == None: 35 | parser.error("No rename file found") 36 | 37 | dump = False 38 | if options.pickle != None: 39 | dump = True 40 | #parser.error("No dump file found") 41 | 42 | # store in pickle: 43 | # * package 44 | # * function names 45 | # * strings 46 | # * variable names 47 | pickledumps = [] 48 | 49 | rename = open(options.removal).readlines() 50 | renamefiles = [] 51 | for i in rename: 52 | (oldpackage, oldversion, newpackage, newversion) = i.strip().split('|') 53 | renamefiles.append((oldpackage, oldversion, newpackage, newversion)) 54 | conn = sqlite3.connect(options.db) 55 | cursor = conn.cursor() 56 | for r in renamefiles: 57 | (oldpackage, oldversion, newpackage, newversion) = r 58 | renamesha256 = set() 59 | removesha256 = set() 60 | cursor.execute('select checksum from processed_file where package=? and version=?', ((oldpackage, oldversion))) 61 | sha256s = cursor.fetchall() 62 | # now check for each SHA256 if it already exists with the new version (and the 63 | # old entry only needs to be removed) or if it actually needs to be renamed. 64 | for sha256 in sha256s: 65 | cursor.execute('select distinct package, version from processed_file where checksum=?', sha256) 66 | res = cursor.fetchall() 67 | if (newpackage, newversion) in res: 68 | removesha256.add(sha256) 69 | continue 70 | else: 71 | renamesha256.add(sha256) 72 | if dump: 73 | # first dump all data 74 | programstrings = [] 75 | functionnames = [] 76 | varnames = [] 77 | allsha256 = set() 78 | #allsha256 = removesha256 + renamesha256 79 | allsha256.update(removesha256) 80 | allsha256.update(renamesha256) 81 | for s in allsha256: 82 | res = cursor.execute("select stringidentifier,language from extracted_string where checksum=?", (s[0],)) 83 | if res != None: 84 | programstrings += res 85 | res = cursor.execute("select functionname,language from extracted_function where checksum=?", (s[0],)) 86 | if res != None: 87 | functionnames += res 88 | res = cursor.execute("select name,language,type from extracted_name where checksum=?", (s[0],)) 89 | if res != None: 90 | varnames += res 91 | pickledumps.append({'package': oldpackage, 'programstrings': programstrings, 'functionnames': functionnames, 'varnames': varnames}) 92 | 93 | for s in renamesha256: 94 | cursor.execute("update processed_file set package=?, version=? where checksum=? and package=? and version=?", (r[2], r[3], s[0], r[0], r[1])) 95 | for s in removesha256: 96 | cursor.execute("delete from processed_file where checksum=? and package=? and version=?", (s[0], r[0], r[1])) 97 | conn.commit() 98 | cursor.execute("select * from processed where package=? and version=?", (r[2], r[3])) 99 | res = cursor.fetchall() 100 | # only when doesn't exist in processed yet 101 | if res == []: 102 | cursor.execute("update processed set package=?, version=? where package=? 
and version=?", (r[2], r[3], r[0], r[1])) 103 | else: 104 | cursor.execute("delete from processed where package=? and version=?", (r[0], r[1])) 105 | conn.commit() 106 | conn.close() 107 | 108 | if dump: 109 | dumpfile = open(options.pickle, 'wb') 110 | cPickle.dump(pickledumps, dumpfile) 111 | dumpfile.close() 112 | 113 | if __name__ == "__main__": 114 | main(sys.argv) 115 | -------------------------------------------------------------------------------- /src/maintenance/postgresql-index.sql: -------------------------------------------------------------------------------- 1 | create index processed_index on processed(package, version); 2 | create index processed_checksum on processed(checksum); 3 | create index processed_origin on processed(origin); 4 | create index processed_website on processed(website); 5 | create index processedfile_package_checksum_index on processed_file(checksum, package); 6 | create index processedfile_package_version_index on processed_file(package, version); 7 | create index processedfile_filename_index on processed_file(filename); 8 | create index stringidentifier_index on extracted_string(stringidentifier,language); 9 | create index extracted_hash_index on extracted_string(checksum); 10 | create index extracted_language_index on extracted_string(language); 11 | create index function_index on extracted_function(checksum); 12 | create index functionname_index on extracted_function(functionname); 13 | create index functionname_language on extracted_function(language); 14 | create index name_checksum_index on extracted_name(checksum); 15 | create index name_name_index on extracted_name(name); 16 | create index name_type_index on extracted_name(type); 17 | create index name_language_index on extracted_name(language); 18 | create index kernel_configuration_filename on kernel_configuration(filename); 19 | create index kernelmodule_alias_index on kernelmodule_alias(alias); 20 | create index kernelmodule_author_index on kernelmodule_author(author); 21 | create index kernelmodule_description_index on kernelmodule_description(description); 22 | create index kernelmodule_firmware_index on kernelmodule_firmware(firmware); 23 | create index kernelmodule_license_index on kernelmodule_license(license); 24 | create index kernelmodule_parameter_index on kernelmodule_parameter(paramname); 25 | create index kernelmodule_parameter_description_index on kernelmodule_parameter_description(description); 26 | create index kernelmodule_version_index on kernelmodule_version(version); 27 | create index kernelmodule_alias_checksum_index on kernelmodule_alias(checksum); 28 | create index kernelmodule_author_checksum_index on kernelmodule_author(checksum); 29 | create index kernelmodule_description_checksum_index on kernelmodule_description(checksum); 30 | create index kernelmodule_firmware_checksum_index on kernelmodule_firmware(checksum); 31 | create index kernelmodule_license_checksum_index on kernelmodule_license(checksum); 32 | create index kernelmodule_parameter_checksum_index on kernelmodule_parameter(checksum); 33 | create index kernelmodule_parameter_description_checksum_index on kernelmodule_parameter_description(checksum); 34 | create index kernelmodule_version_checksum_index on kernelmodule_version(checksum); 35 | create index batresult_checksum_index on batresult(checksum); 36 | create index batresult_filename_index on batresult(filename); 37 | create index blacklist_checksum_index on blacklist(checksum); 38 | create index rpm_checksum_index on rpm(checksum); 39 | create index 
rpm_rpmname_index on rpm(rpmname); 40 | create index archivealias_checksum_index on archivealias(checksum); 41 | create index misc_checksum_index on misc(checksum); 42 | create index misc_name_index on misc(name); 43 | create index hashconversion_sha256_index on hashconversion(sha256); 44 | create index hashconversion_md5_index on hashconversion(md5); 45 | create index hashconversion_sha1_index on hashconversion(sha1); 46 | create index hashconversion_crc32_index on hashconversion(crc32); 47 | create index hashconversion_tlsh_index on hashconversion(tlsh); 48 | create index license_index on licenses(checksum); 49 | create index copyright_index on extracted_copyright(checksum); 50 | create index copyright_type_index on extracted_copyright(copyright, type); 51 | create index security_cert_checksum_index on security_cert(checksum); 52 | create index security_cve_checksum_index on security_cve(checksum); 53 | create index security_password_hash_index on security_password(hash); 54 | create index renames_index_originalname on renames (originalname); 55 | create index renames_index_newname on renames (newname); 56 | create index file_index on file(filename, directory); 57 | 58 | create index linuxkernelfunctionname_index on linuxkernelfunctionnamecache(functionname); 59 | create index linuxkernelnamecache_index on linuxkernelnamecache(varname); 60 | create index functionname_c_index on functionnamecache_c(functionname); 61 | create index varnamecache_c_index on varnamecache_c(varname); 62 | create index functionname_java_index on functionnamecache_java(functionname); 63 | create index fieldname_java_cache on fieldcache_java(fieldname); 64 | create index classname_java_cache on classcache_java(classname); 65 | 66 | create index stringidentifier_actionscript_index on stringscache_actionscript(stringidentifier); 67 | create index scores_actionscript_index on scores_actionscript(stringidentifier); 68 | create index package_actionscript_index on avgstringscache_actionscript(package); 69 | 70 | create index stringidentifier_c_index on stringscache_c(stringidentifier); 71 | create index scores_c_index on scores_c(stringidentifier); 72 | create index avgpackage_c_index on avgstringscache_c(package); 73 | 74 | create index stringidentifier_csharp_index on stringscache_csharp(stringidentifier); 75 | create index scores_csharp_index on scores_csharp(stringidentifier); 76 | create index avgpackage_csharp_index on avgstringscache_csharp(package); 77 | 78 | create index stringidentifier_java_index on stringscache_java(stringidentifier); 79 | create index scores_java_index on scores_java(stringidentifier); 80 | create index avgpackage_java_index on avgstringscache_java(package); 81 | 82 | create index stringidentifier_javascript_index on stringscache_javascript(stringidentifier); 83 | create index scores_javascript_index on scores_javascript(stringidentifier); 84 | create index avgpackage_javascript_index on avgstringscache_javascript(package); 85 | 86 | create index stringidentifier_php_index on stringscache_php(stringidentifier); 87 | create index scores_php_index on scores_php(stringidentifier); 88 | create index avgpackage_php_index on avgstringscache_php(package); 89 | 90 | create index stringidentifier_python_index on stringscache_python(stringidentifier); 91 | create index scores_python_index on scores_python(stringidentifier); 92 | create index avgpackage_python_index on avgstringscache_python(package); 93 | 94 | create index stringidentifier_ruby_index on stringscache_ruby(stringidentifier); 95 | 
create index scores_ruby_index on scores_ruby(stringidentifier); 96 | create index avgpackage_ruby_index on avgstringscache_ruby(package); 97 | -------------------------------------------------------------------------------- /src/maintenance/postgresql-table-drop.sql: -------------------------------------------------------------------------------- 1 | drop table processed; 2 | drop table processed_file; 3 | drop table extracted_string; 4 | drop table extracted_function; 5 | drop table extracted_name; 6 | 7 | drop table kernel_configuration; 8 | drop table kernelmodule_alias; 9 | drop table kernelmodule_author; 10 | drop table kernelmodule_description; 11 | drop table kernelmodule_firmware; 12 | drop table kernelmodule_license; 13 | drop table kernelmodule_parameter; 14 | drop table kernelmodule_parameter_description; 15 | drop table kernelmodule_version; 16 | 17 | drop table batresult; 18 | drop table blacklist; 19 | drop table rpm; 20 | drop table archivealias; 21 | drop table misc; 22 | drop table hashconversion; 23 | drop table licenses; 24 | drop table extracted_copyright; 25 | drop table security_cert; 26 | drop table security_cve; 27 | drop table security_password; 28 | drop table renames; 29 | drop table file; 30 | drop table stringscache_actionscript; 31 | drop table scores_actionscript; 32 | drop table avgstringscache_actionscript; 33 | 34 | drop table stringscache_c; 35 | drop table scores_c; 36 | drop table avgstringscache_c; 37 | 38 | drop table stringscache_csharp; 39 | drop table scores_csharp; 40 | drop table avgstringscache_csharp; 41 | 42 | drop table stringscache_java; 43 | drop table scores_java; 44 | drop table avgstringscache_java; 45 | 46 | drop table stringscache_javascript; 47 | drop table scores_javascript; 48 | drop table avgstringscache_javascript; 49 | 50 | drop table stringscache_php; 51 | drop table scores_php; 52 | drop table avgstringscache_php; 53 | 54 | drop table stringscache_python; 55 | drop table scores_python; 56 | drop table avgstringscache_python; 57 | 58 | drop table stringscache_ruby; 59 | drop table scores_ruby; 60 | drop table avgstringscache_ruby; 61 | 62 | drop table varnamecache_c; 63 | drop table linuxkernelnamecache; 64 | drop table functionnamecache_c; 65 | drop table linuxkernelfunctionnamecache; 66 | drop table functionnamecache_java; 67 | drop table fieldcache_java; 68 | drop table classcache_java; 69 | -------------------------------------------------------------------------------- /src/maintenance/postgresql-table.sql: -------------------------------------------------------------------------------- 1 | create table if not exists processed (package text, version text, filename text, origin text, checksum text, downloadurl text, website text); 2 | create table if not exists processed_file (package text, version text, pathname text, checksum text, filename text, thirdparty boolean); 3 | create table if not exists extracted_string (stringidentifier text, checksum text, language text, linenumber int); 4 | create table if not exists extracted_function (checksum text, functionname text, language text, linenumber int); 5 | create table if not exists extracted_name (checksum text, name text, type text, language text, linenumber int); 6 | 7 | create table if not exists kernel_configuration(configstring text, filename text, version text); 8 | create table if not exists kernelmodule_alias(checksum text, modulename text, alias text); 9 | create table if not exists kernelmodule_author(checksum text, modulename text, author text); 10 | create table 
if not exists kernelmodule_description(checksum text, modulename text, description text); 11 | create table if not exists kernelmodule_firmware(checksum text, modulename text, firmware text); 12 | create table if not exists kernelmodule_license(checksum text, modulename text, license text); 13 | create table if not exists kernelmodule_parameter(checksum text, modulename text, paramname text, paramtype text); 14 | create table if not exists kernelmodule_parameter_description(checksum text, modulename text, paramname text, description text); 15 | create table if not exists kernelmodule_version(checksum text, modulename text, version text); 16 | 17 | create table if not exists batresult(checksum text, filename text, tlsh text, pathname text, parentname text, parentchecksum text); 18 | create table if not exists blacklist(checksum text, filename text, origin text); 19 | create table if not exists rpm(rpmname text, checksum text, downloadurl text); 20 | create table if not exists archivealias(checksum text, archivename text, origin text, downloadurl text, website text); 21 | create table if not exists misc(checksum text, name text); 22 | create table if not exists hashconversion (sha256 text, md5 text, sha1 text, crc32 text, tlsh text); 23 | create table if not exists licenses (checksum text, license text, scanner text, version text); 24 | create table if not exists extracted_copyright (checksum text, copyright text, type text, byteoffset int); 25 | create table if not exists security_cert(checksum text, securitybug text, linenumber int, function text, whitelist boolean); 26 | create table if not exists security_cve(checksum text, cve text); 27 | create table if not exists security_password(hash text, password text, origin text); 28 | create table if not exists renames (originalname text, newname text); 29 | create table if not exists file(filename text, directory text, package text, packageversion text, source text, distroversion text); 30 | create table if not exists stringscache_actionscript (stringidentifier text, package text, filename text); 31 | create table if not exists scores_actionscript (stringidentifier text, packages int, score real); 32 | create table if not exists avgstringscache_actionscript (package text, avgstrings real, primary key (package)); 33 | 34 | create table if not exists stringscache_c (stringidentifier text, package text, filename text); 35 | create table if not exists scores_c (stringidentifier text, packages int, score real); 36 | create table if not exists avgstringscache_c (package text, avgstrings real, primary key (package)); 37 | 38 | create table if not exists stringscache_csharp (stringidentifier text, package text, filename text); 39 | create table if not exists scores_csharp (stringidentifier text, packages int, score real); 40 | create table if not exists avgstringscache_csharp (package text, avgstrings real, primary key (package)); 41 | 42 | create table if not exists stringscache_java (stringidentifier text, package text, filename text); 43 | create table if not exists scores_java (stringidentifier text, packages int, score real); 44 | create table if not exists avgstringscache_java (package text, avgstrings real, primary key (package)); 45 | 46 | create table if not exists stringscache_javascript (stringidentifier text, package text, filename text); 47 | create table if not exists scores_javascript (stringidentifier text, packages int, score real); 48 | create table if not exists avgstringscache_javascript (package text, avgstrings real, primary key 
(package)); 49 | 50 | create table if not exists stringscache_php (stringidentifier text, package text, filename text); 51 | create table if not exists scores_php (stringidentifier text, packages int, score real); 52 | create table if not exists avgstringscache_php (package text, avgstrings real, primary key (package)); 53 | 54 | create table if not exists stringscache_python (stringidentifier text, package text, filename text); 55 | create table if not exists scores_python (stringidentifier text, packages int, score real); 56 | create table if not exists avgstringscache_python (package text, avgstrings real, primary key (package)); 57 | 58 | create table if not exists stringscache_ruby (stringidentifier text, package text, filename text); 59 | create table if not exists scores_ruby (stringidentifier text, packages int, score real); 60 | create table if not exists avgstringscache_ruby (package text, avgstrings real, primary key (package)); 61 | 62 | create table if not exists varnamecache_c (varname text, package text); 63 | create table if not exists linuxkernelnamecache (varname text, package text); 64 | create table if not exists functionnamecache_c (functionname text, package text); 65 | create table if not exists linuxkernelfunctionnamecache (functionname text, package text); 66 | create table if not exists functionnamecache_java (functionname text, package text); 67 | create table if not exists fieldcache_java (fieldname text, package text); 68 | create table if not exists classcache_java (classname text, package text); 69 | -------------------------------------------------------------------------------- /src/maintenance/rewritelist.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Binary Analysis Tool 4 | # Copyright 2013 Armijn Hemel for Tjaldur Software Governance Solutions 5 | # Licensed under Apache 2.0, see LICENSE file for details 6 | 7 | ''' 8 | This program can be used to generate a LIST file, like generatelist.py, but 9 | taking two LIST files as input. The first ('correctedlist') is a LIST file that has 10 | corrected input. The second one is a possibly uncorrected list. 11 | 12 | The main use case is when the database has to be regenerated (new license 13 | scanners, better string extraction, and so on), with possibly new input. 14 | Using dumplist.py the (supposedly) corrected list (for old packages) can be 15 | extracted from the database. With generatelist.py a new list can be generated 16 | for the packages. By comparing the two and reusing the corrected results a lot 17 | of effort can be saved. 
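To make the merge concrete, here is a sketch with hypothetical entries. If the
corrected list contains:

    zlib	1.2.8	zlib-1.2.8.tar.gz	zlib.net

and the new list contains the same filename with a different package name or
origin, the corrected entry wins and is written out; filenames that appear
only in the new list are passed through unchanged.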
18 | ''' 19 | 20 | import os 21 | import os.path 22 | import sys 23 | from optparse import OptionParser 24 | 25 | def main(argv): 26 | parser = OptionParser() 27 | parser.add_option("-c", "--correctedlist", action="store", dest="correctedlist", help="path to corrected list", metavar="FILE") 28 | parser.add_option("-n", "--newlist", action="store", dest="newlist", help="path to new list", metavar="FILE") 29 | (options, args) = parser.parse_args() 30 | 31 | if options.correctedlist == None: 32 | parser.error("Need corrected list") 33 | if options.newlist == None: 34 | parser.error("Need new list") 35 | 36 | if not os.path.exists(options.correctedlist): 37 | parser.error("Need corrected list") 38 | if not os.path.exists(options.newlist): 39 | parser.error("Need new list") 40 | 41 | # first suck in the corrected data, filename is key 42 | correctedfiles = {} 43 | correctedfile_list = open(options.correctedlist).readlines() 44 | for c in correctedfile_list: 45 | (package, version, filename, origin) = c.strip().split() 46 | # this should actually not happen 47 | if correctedfiles.has_key(filename): 48 | continue 49 | else: 50 | correctedfiles[filename] = (package, version, origin) 51 | 52 | # then suck in the new data, filename is key 53 | newfiles = {} 54 | newfile_list = open(options.newlist).readlines() 55 | for c in newfile_list: 56 | (package, version, filename, origin) = c.strip().split() 57 | # this should actually not happen 58 | if newfiles.has_key(filename): 59 | continue 60 | else: 61 | newfiles[filename] = (package, version, origin) 62 | listentries = [] 63 | for i in newfiles.keys(): 64 | if correctedfiles.has_key(i): 65 | # entries are not the same! 66 | if newfiles[i] != correctedfiles[i]: 67 | listentries.append("%s\t%s\t%s\t%s" % (correctedfiles[i][0], correctedfiles[i][1], i, correctedfiles[i][2])) 68 | else: 69 | listentries.append("%s\t%s\t%s\t%s" % (newfiles[i][0], newfiles[i][1], i, newfiles[i][2])) 70 | else: 71 | listentries.append("%s\t%s\t%s\t%s" % (newfiles[i][0], newfiles[i][1], i, newfiles[i][2])) 72 | listentries.sort() 73 | for i in listentries: 74 | print i 75 | 76 | if __name__ == "__main__": 77 | main(sys.argv) 78 | -------------------------------------------------------------------------------- /src/maintenance/scorecache.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Binary Analysis Tool 4 | # Copyright 2014-2015 Armijn Hemel for Tjaldur Software Governance Solutions 5 | # Licensed under Apache 2.0, see LICENSE file for details 6 | 7 | import sys 8 | import os 9 | import os.path 10 | import sqlite3 11 | from optparse import OptionParser 12 | 13 | def main(argv): 14 | alpha = 5.0 15 | 16 | parser = OptionParser() 17 | parser.add_option("-d", "--database", action="store", dest="db", help="path to caching database", metavar="FILE") 18 | (options, args) = parser.parse_args() 19 | if options.db == None: 20 | parser.error("Path to caching database") 21 | if not os.path.exists(options.db): 22 | print >>sys.stderr, "Caching database %s does not exist" % options.db 23 | sys.exit(1) 24 | 25 | conn = sqlite3.connect(options.db) 26 | c = conn.cursor() 27 | 28 | c.execute("create table if not exists scores (stringidentifier text, packages int, score real)") 29 | c.execute("create index if not exists scoresindex on scores(stringidentifier)") 30 | conn.commit() 31 | c2 = conn.cursor() 32 | 33 | c.execute("select distinct stringidentifier from stringscache") 34 | programstrings = c.fetchmany(10000) 35 | 
while programstrings != []: 36 | for p in programstrings: 37 | pkgs = {} 38 | filenames = {} 39 | 40 | pfs = c2.execute("select package, filename from stringscache where stringidentifier=?", p).fetchall() 41 | packages = set(map(lambda x: x[0], pfs)) 42 | 43 | if len(packages) == 1: 44 | score = float(len(p[0])) 45 | else: 46 | for pf in pfs: 47 | (package, filename) = pf 48 | if not filenames.has_key(filename): 49 | filenames[filename] = [package] 50 | else: 51 | filenames[filename] = list(set(filenames[filename] + [package])) 52 | try: 53 | score = float(len(p[0])) / pow(alpha, (len(filenames) - 1)) # score decays exponentially with the number of files the string appears in 54 | except Exception, e: # pow() overflowed, so the score is effectively zero 55 | score = float(len(p[0])) / sys.maxint 56 | # cut off scores that are too small to store as a 'real' in for example postgresql 57 | if score < 1e-37: 58 | score = 0.0 59 | c2.execute("insert into scores(stringidentifier, packages, score) values (?,?,?)", (p[0], len(packages), float(score))) 60 | programstrings = c.fetchmany(10000) 61 | conn.commit() 62 | c2.close() 63 | print "vacuuming" 64 | c.execute("vacuum") 65 | conn.commit() 66 | c.close() 67 | conn.close() 68 | 69 | if __name__ == "__main__": 70 | main(sys.argv) 71 | -------------------------------------------------------------------------------- /src/maintenance/updatesha256sum.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Binary Analysis Tool 4 | # Copyright 2014-2015 Armijn Hemel for Tjaldur Software Governance Solutions 5 | # Licensed under Apache 2.0, see LICENSE file for details 6 | 7 | ''' 8 | This script updates the SHA256SUM file in a directory. This file contains sha256 checksums 9 | (and optionally other hashes) for each file in the directory and speeds up database creation. 10 | ''' 11 | 12 | import os 13 | import os.path 14 | import sys 15 | import hashlib 16 | import multiprocessing 17 | import zlib 18 | from optparse import OptionParser 19 | 20 | try: 21 | import tlsh 22 | tlshscan = True 23 | except Exception, e: 24 | tlshscan = False 25 | 26 | def computehash((filedir, filename, extrahashes)): 27 | filehashes = {} 28 | resolved_path = os.path.join(filedir, filename) 29 | scanfile = open(resolved_path, 'r') 30 | filedata = scanfile.read() 31 | scanfile.close() 32 | h = hashlib.new('sha256') 33 | h.update(filedata) 34 | filehashes['sha256'] = h.hexdigest() 35 | 36 | if 'crc32' in extrahashes: 37 | try: 38 | filehashes['crc32'] = zlib.crc32(filedata) & 0xffffffff 39 | except: 40 | return None 41 | 42 | if 'tlsh' in extrahashes: 43 | if os.stat(resolved_path).st_size >= 256: 44 | filehashes['tlsh'] = tlsh.hash(filedata) 45 | else: 46 | filehashes['tlsh'] = None 47 | 48 | # first remove 'crc32' from extrahashes 49 | extrahashesset = set(extrahashes) 50 | try: 51 | extrahashesset.remove('crc32') 52 | except KeyError: 53 | pass 54 | 55 | # then remove 'tlsh' from extrahashes 56 | try: 57 | extrahashesset.remove('tlsh') 58 | except KeyError: 59 | pass 60 | 61 | temphashes = {} 62 | for i in extrahashesset: 63 | temphashes[i] = hashlib.new(i) 64 | for i in extrahashesset: 65 | temphashes[i].update(filedata) 66 | for i in extrahashesset: 67 | filehashes[i] = temphashes[i].hexdigest() 68 | return (filename, filehashes) 69 | 70 | def main(argv): 71 | parser = OptionParser() 72 | parser.add_option("-f", "--filedir", action="store", dest="filedir", help="path to directory with files", metavar="DIR") 73 | (options, args) = parser.parse_args() 74 | if options.filedir == None: 75 | parser.error("No directory defined") 76 | if not os.path.exists(options.filedir): 77 | parser.error("No directory found") 78 | dirlist = 
os.listdir(options.filedir) 79 | dirlist = filter(lambda x: x != 'LIST' and x != 'SHA256SUM', dirlist) 80 | dirlist = filter(lambda x: os.path.isfile(os.path.join(options.filedir, x)), dirlist) 81 | 82 | # no files, so exit 83 | if len(dirlist) == 0: 84 | sys.exit(0) 85 | 86 | extrahashes = ['md5', 'sha1', 'crc32'] 87 | if tlshscan: 88 | extrahashes.append('tlsh') 89 | 90 | filetohash = {} 91 | if os.path.exists(os.path.join(options.filedir, "SHA256SUM")): 92 | sha256file = os.path.join(options.filedir, "SHA256SUM") 93 | sha256lines = open(sha256file, 'r').readlines() 94 | # first line should have the supported hashes 95 | 96 | checksumsused = sha256lines[0].strip().split() 97 | # first line is always a list of supported hashes. 98 | process = True 99 | if set(checksumsused).intersection(set(extrahashes)) != set(extrahashes): 100 | process = False 101 | if process: 102 | for i in sha256lines[1:]: 103 | entries = i.strip().split() 104 | filename = entries[0] 105 | if filename == 'SHA256SUM': 106 | continue 107 | if filename == 'LIST': 108 | continue 109 | if filename == 'DOWNLOADURL': 110 | continue 111 | # sha256 is always the first hash and second entry 112 | hashentry = entries[1] 113 | filetohash[filename] = {} 114 | filetohash[filename]['sha256'] = hashentry 115 | counter = 2 116 | for c in checksumsused[1:]: 117 | # only record results for hashes that are in 'extrahashes' 118 | if c in extrahashes: 119 | filetohash[filename][c] = entries[counter] 120 | counter += 1 121 | 122 | # determine which files need to be scanned 123 | diffset = set(dirlist).difference(set(filetohash)) 124 | if len(diffset) == 0: 125 | sys.exit(0) 126 | 127 | # find hashes in parallel 128 | shatasks = map(lambda x: (options.filedir, x, extrahashes), diffset) 129 | pool = multiprocessing.Pool() 130 | sharesults = filter(lambda x: x != None, pool.map(computehash, shatasks, 1)) 131 | pool.terminate() 132 | 133 | for i in sharesults: 134 | (filename, filehashes) = i 135 | filetohash[filename] = filehashes 136 | 137 | # write results 138 | filenameskeys = filetohash.keys() 139 | filenameskeys.sort() 140 | sha256file = open(os.path.join(options.filedir, "SHA256SUM"), 'w') 141 | # first write a line with the hashes that are supported 142 | if extrahashes == []: 143 | sha256file.write("sha256\n") 144 | else: 145 | hashesstring = "sha256" 146 | for h in extrahashes: 147 | hashesstring += "\t%s" % h 148 | sha256file.write("%s\n" % hashesstring) 149 | for i in filenameskeys: 150 | # first hashes, since file names could contain spaces 151 | hashesstring = filetohash[i]['sha256'] 152 | for h in extrahashes: 153 | hashesstring += "\t%s" % filetohash[i][h] 154 | sha256file.write("%s %s\n" % (i, hashesstring)) 155 | sha256file.close() 156 | 157 | if __name__ == "__main__": 158 | main(sys.argv) 159 | -------------------------------------------------------------------------------- /src/maintenance/verifyarchive.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Binary Analysis Tool 4 | # Copyright 2014 Armijn Hemel for Tjaldur Software Governance Solutions 5 | # Licensed under Apache 2.0, see LICENSE file for details 6 | 7 | ''' 8 | Script to test integrity of archives. 
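An archive is considered intact when its contents can be listed: depending on
the compression type a "tar tf" variant is run on the file (for example
"tar jtf" for bzip2, "tar ztf" for gzip) and a non-zero exit code from tar
marks the archive as corrupt. This is roughly equivalent to running (with a
hypothetical file name):

    tar jtf gcc-4.9.0.tar.bz2
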
TODO: properly handle ZIP archives 9 | ''' 10 | 11 | import sys, os, magic, multiprocessing, subprocess 12 | import tempfile, bz2, tarfile, gzip 13 | from optparse import OptionParser 14 | 15 | tarmagic = ['POSIX tar archive (GNU)' 16 | , 'tar archive' 17 | ] 18 | 19 | ms = magic.open(magic.MAGIC_NONE) 20 | ms.load() 21 | 22 | # unpack the directories to be scanned. 23 | def unpack((directory, filename)): 24 | try: 25 | os.stat(os.path.join(directory, filename)) 26 | except: 27 | print >>sys.stderr, "Can't find %s" % filename 28 | return None 29 | 30 | filemagic = ms.file(os.path.realpath(os.path.join(directory, filename))) 31 | 32 | # Assume if the files are bz2 or gzip compressed they are compressed tar files 33 | if 'bzip2 compressed data' in filemagic: 34 | # for some reason the tar.bz2 unpacking from python doesn't always work, like 35 | # aeneas-1.0.tar.bz2 from GNU, so use a subprocess instead of using the 36 | # Python tar functionality. 37 | p = subprocess.Popen(['tar', 'jtf', os.path.join(directory, filename)], stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True) 38 | (stanout, stanerr) = p.communicate() 39 | elif 'LZMA compressed data, streamed' in filemagic: 40 | p = subprocess.Popen(['tar', 'itf', os.path.join(directory, filename)], stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True) 41 | (stanout, stanerr) = p.communicate() 42 | elif 'XZ compressed data' in filemagic: 43 | p = subprocess.Popen(['tar', 'itf', os.path.join(directory, filename)], stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True) 44 | (stanout, stanerr) = p.communicate() 45 | elif 'gzip compressed data' in filemagic: 46 | p = subprocess.Popen(['tar', 'ztf', os.path.join(directory, filename)], stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True) 47 | (stanout, stanerr) = p.communicate() 48 | elif 'compress\'d data 16 bits' in filemagic: 49 | p = subprocess.Popen(['tar', 'ztf', os.path.join(directory, filename)], stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True) 50 | (stanout, stanerr) = p.communicate() 51 | elif 'Minix filesystem' in filemagic and filename.endswith('.gz'): 52 | # sometimes libmagic gets it wrong 53 | p = subprocess.Popen(['tar', 'ztf', os.path.join(directory, filename)], stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True) 54 | (stanout, stanerr) = p.communicate() 55 | else: 56 | return None 57 | if p.returncode != 0: 58 | return (filename, False) 59 | else: 60 | return (filename, True) 61 | ''' 62 | elif 'Zip archive data' in filemagic: 63 | try: 64 | tmpdir = tempfile.mkdtemp(dir=unpackdir) 65 | p = subprocess.Popen(['unzip', "-B", os.path.join(directory, filename), '-d', tmpdir], stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True) 66 | (stanout, stanerr) = p.communicate() 67 | if p.returncode != 0 and p.returncode != 1: 68 | print >>sys.stderr, "unpacking ZIP failed for", filename, stanerr 69 | shutil.rmtree(tmpdir) 70 | else: 71 | return tmpdir 72 | except Exception, e: 73 | print >>sys.stderr, "unpacking ZIP failed", e 74 | ''' 75 | 76 | def main(argv): 77 | parser = OptionParser() 78 | parser.add_option("-f", "--filedir", action="store", dest="filedir", help="path to directory containing files to unpack", metavar="DIR") 79 | 80 | (options, args) = parser.parse_args() 81 | if options.filedir == None: 82 | parser.error("Specify dir with files") 83 | else: 84 | try: 85 | filelist = open(os.path.join(options.filedir, "LIST")).readlines() 86 | except: 87 | parser.error("'LIST' not found in file dir") 88 | 89 | # 
first process the LIST file 90 | pkgmeta = [] 91 | for unpackfile in filelist: 92 | try: 93 | unpacks = unpackfile.strip().split() 94 | if len(unpacks) == 3: 95 | origin = "unknown" 96 | (package, version, filename) = unpacks 97 | else: 98 | (package, version, filename, origin) = unpacks 99 | pkgmeta.append((options.filedir, filename)) 100 | except Exception, e: 101 | # oops, something went wrong 102 | print >>sys.stderr, e 103 | 104 | pool = multiprocessing.Pool() 105 | unpackresults = pool.map(unpack, pkgmeta, 1) 106 | pool.terminate() 107 | for i in unpackresults: 108 | if i != None: 109 | (filename, result) = i 110 | if not result: 111 | print "corrupt archive: %s" % filename 112 | 113 | if __name__ == "__main__": 114 | main(sys.argv) 115 | -------------------------------------------------------------------------------- /src/maintenance/verifydb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # Binary Analysis Tool 4 | # Copyright 2012-2016 Armijn Hemel for Tjaldur Software Governance Solutions 5 | # Licensed under Apache 2.0, see LICENSE file for details 6 | 7 | ''' 8 | This script verifies that the tables in a database are in sync, which means: 9 | all of the files in the tables "extracted_string" and "extracted_function" can 10 | also be found in "processed_file" 11 | ''' 12 | 13 | import sys 14 | import os 15 | import os.path 16 | import re 17 | import fnmatch 18 | import psycopg2 19 | import ConfigParser 20 | from optparse import OptionParser 21 | 22 | def main(argv): 23 | config = ConfigParser.ConfigParser() 24 | parser = OptionParser() 25 | parser.add_option("-c", "--config", action="store", dest="cfg", help="path to configuration file", metavar="FILE") 26 | 27 | (options, args) = parser.parse_args() 28 | if options.cfg == None: 29 | parser.error("Need path to configuration file") 30 | 31 | try: 32 | configfile = open(options.cfg, 'r') 33 | except: 34 | parser.error("Configuration file not readable") 35 | config.readfp(configfile) 36 | configfile.close() 37 | 38 | section = 'extractconfig' 39 | 40 | try: 41 | postgresql_user = config.get(section, 'postgresql_user') 42 | postgresql_password = config.get(section, 'postgresql_password') 43 | postgresql_db = config.get(section, 'postgresql_db') 44 | 45 | # check to see if a host (IP-address) was supplied either 46 | # as host or hostaddr. hostaddr is not supported on older 47 | # versions of psycopg2, for example CentOS 6.6, so it is not 48 | # used at the moment. 49 | try: 50 | postgresql_host = config.get(section, 'postgresql_host') 51 | except: 52 | postgresql_host = None 53 | try: 54 | postgresql_hostaddr = config.get(section, 'postgresql_hostaddr') 55 | except: 56 | postgresql_hostaddr = None 57 | # check to see if a port was specified. If not, default to 'None' 58 | try: 59 | postgresql_port = config.get(section, 'postgresql_port') 60 | except Exception, e: 61 | postgresql_port = None 62 | except: 63 | print >>sys.stderr, "Database connection not defined in configuration file. Exiting..." 
64 | sys.stderr.flush() 65 | sys.exit(1) 66 | 67 | try: 68 | conn = psycopg2.connect(database=postgresql_db, user=postgresql_user, password=postgresql_password, host=postgresql_host, port=postgresql_port) 69 | 70 | cursor = conn.cursor() 71 | except: 72 | print >>sys.stderr, "Can't open database" 73 | sys.exit(1) 74 | 75 | print "checking processed" 76 | sys.stdout.flush() 77 | cursor.execute("select distinct checksum from processed") 78 | res = cursor.fetchall() 79 | conn.commit() 80 | for r in res: 81 | cursor.execute('select checksum from processed where checksum=%s', r) 82 | processed_results = cursor.fetchall() 83 | conn.commit() 84 | if len(processed_results) != 1: 85 | cursor.execute('select * from processed where checksum=%s', r) 86 | processed_results = cursor.fetchall() 87 | conn.commit() 88 | print "identical:", map(lambda x: "%s %s" % (x[0], x[1]), processed_results) 89 | sys.stdout.flush() 90 | 91 | # create a new cursor 92 | ncursor = conn.cursor() 93 | 94 | cursor.execute("select package,version from processed_file") 95 | res = cursor.fetchmany(40000) 96 | conn.commit() 97 | 98 | totals = 0 99 | print "checking processed_file" 100 | sys.stdout.flush() 101 | while res != []: 102 | totals += len(res) 103 | #print "processing", totals 104 | for r in res: 105 | (package,version) = r 106 | ncursor.execute('select checksum from processed where package=%s and version=%s LIMIT 1', r) 107 | pres = ncursor.fetchall() 108 | conn.commit() 109 | if pres == []: 110 | print "database not in sync", r 111 | sys.stdout.flush() 112 | res = cursor.fetchmany(40000) 113 | conn.commit() 114 | 115 | for i in ["extracted_string", "extracted_function"]: 116 | cursor.execute("select distinct(checksum) from %s" % i) 117 | res = cursor.fetchmany(40000) 118 | conn.commit() 119 | totals = 0 120 | while res != []: 121 | totals += len(res) 122 | print "processing %s" % i, totals 123 | sys.stdout.flush() 124 | for r in res: 125 | ncursor.execute('select checksum from processed_file where checksum=%s LIMIT 1', r) 126 | pres = ncursor.fetchall() 127 | conn.commit() 128 | if pres == []: 129 | print "database %s not in sync" % i, r[0] 130 | sys.stdout.flush() 131 | res = cursor.fetchmany(40000) 132 | conn.commit() 133 | 134 | if __name__ == "__main__": 135 | main(sys.argv) 136 | -------------------------------------------------------------------------------- /src/maintenance/verifylist.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Binary Analysis Tool 4 | # Copyright 2012-2013 Armijn Hemel for Tjaldur Software Governance Solutions 5 | # Licensed under Apache 2.0, see LICENSE file for details 6 | 7 | ''' 8 | Helper script to verify the LIST files generated by generatelist.py. This is useful to see if any typos were made. 
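Every line of a LIST file is expected to contain exactly four whitespace
separated fields (package, version, filename, origin), for example this
hypothetical entry:

    zlib 1.2.8 zlib-1.2.8.tar.gz zlib.net

Lines that deviate from this format, package names containing 'dfsg' and
package names that differ from the previous entry only in case are all
reported on standard error.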
9 | ''' 10 | 11 | import os 12 | import os.path 13 | import sys 14 | from optparse import OptionParser 15 | 16 | def main(argv): 17 | parser = OptionParser() 18 | parser.add_option("-l", "--list", action="store", dest="listfile", help="path to LIST", metavar="FILE") 19 | (options, args) = parser.parse_args() 20 | 21 | try: 22 | filelist = open(options.listfile).readlines() 23 | except: 24 | parser.error("'LIST' not found") 25 | 26 | prev_split = None 27 | for unpackfile in filelist: 28 | # simple format check 29 | try: 30 | unpacks = unpackfile.strip().split() 31 | if len(unpacks) != 4: 32 | print >>sys.stderr, "FORMAT ERROR", unpackfile.strip() 33 | sys.stderr.flush() 34 | continue 35 | except Exception, e: 36 | # oops, something went wrong 37 | print >>sys.stderr, e 38 | # see if dfsg is in the package name, since Debian tends to do this 39 | if 'dfsg' in unpacks[0]: 40 | print >>sys.stderr, "DFSG ERROR", unpackfile.strip() 41 | sys.stderr.flush() 42 | if prev_split == None: 43 | prev_split = unpacks 44 | continue 45 | # see if we have the same package with different case 46 | if unpacks[0] != prev_split[0] and unpacks[0].lower() == prev_split[0].lower(): 47 | print >>sys.stderr, "CASE ERROR", unpackfile.strip() 48 | sys.stderr.flush() 49 | prev_split = unpacks 50 | 51 | if __name__ == "__main__": 52 | main(sys.argv) 53 | -------------------------------------------------------------------------------- /src/patches/README: -------------------------------------------------------------------------------- 1 | This directory contains a few patches that need to be applied to programs before they can be reliably used by BAT 2 | 3 | * cramfs.patch : this patch enables the -x option and removes the unpacking of special inodes, such as device files. Creating these files sometimes requires root privileges. This means that BAT would have to run as root. Since these special files are not inspected anyway there is no need to unpack them. 4 | 5 | * code2html-0.9.1-add-qml.patch : this patch lets code2html also process QML files (more and more frequently used in Qt programs) 6 | 7 | * code2html-0.9.1-add-groovyscala.patch : this patch lets code2html also process Scala and Groovy files as Java 8 | 9 | 10 | * code2html-0.9.1-add-csharp.patch : this patch lets code2html process C# files. 
It is a direct copy of the Java config, with a few minor modifications 11 | -------------------------------------------------------------------------------- /src/patches/code2html-0.9.1-add-groovyscala.patch: -------------------------------------------------------------------------------- 1 | diff -ruN code2html-0.9.1/code2html code2html-0.9.1.new/code2html 2 | --- code2html-0.9.1/code2html 2002-01-12 22:17:02.000000000 +0100 3 | +++ code2html-0.9.1.new/code2html 2012-03-04 17:27:14.207492320 +0100 4 | @@ -2549,7 +2549,7 @@ 5 | # taken from nedit 6 | # modified by PP 7 | $LANGUAGE{'java'} = { 8 | - 'filename' => '\\.java$', 9 | + 'filename' => '\\.(java|groovy|scala)$', 10 | 'regex' => '', 11 | 'patterns' => [ 12 | { 13 | -------------------------------------------------------------------------------- /src/patches/code2html-0.9.1-add-qml.patch: -------------------------------------------------------------------------------- 1 | diff -ruN code2html-0.9.1/code2html code2html-0.9.1.new/code2html 2 | --- code2html-0.9.1/code2html 2002-01-12 22:17:02.000000000 +0100 3 | +++ code2html-0.9.1.new/code2html 2012-03-03 19:28:46.000000000 +0100 4 | @@ -2258,7 +2258,7 @@ 5 | # taken from nedit 6 | # modified by PP 7 | $LANGUAGE{'c++'} = { 8 | - 'filename' => '\\.(c(c|pp|xx)|h(h|pp|xx)|C(C|PP|XX)?|H(H|PP|XX)?|i)$', 9 | + 'filename' => '\\.(c(c|pp|xx)|h(h|pp|xx)|C(C|PP|XX)?|H(H|PP|XX)?|i|qml)$', 10 | 'regex' => '', 11 | 'patterns' => [ 12 | { 13 | -------------------------------------------------------------------------------- /src/patches/cramfs.patch: -------------------------------------------------------------------------------- 1 | diff -ru util-linux-ng-2.18-rc1/disk-utils/fsck.cramfs.c util-linux-ng-2.18-rc1.new/disk-utils/fsck.cramfs.c 2 | --- util-linux-ng-2.18-rc1/disk-utils/fsck.cramfs.c 2010-03-18 23:11:23.000000000 +0100 3 | +++ util-linux-ng-2.18-rc1.new/disk-utils/fsck.cramfs.c 2010-06-09 10:11:29.000000000 +0200 4 | @@ -34,7 +34,7 @@ 5 | */ 6 | 7 | /* compile-time options */ 8 | -//#define INCLUDE_FS_TESTS /* include cramfs checking and extraction */ 9 | +#define INCLUDE_FS_TESTS /* include cramfs checking and extraction */ 10 | 11 | #include 12 | #include 13 | @@ -640,13 +640,14 @@ 14 | if (opt_verbose) { 15 | print_node(type, i, path); 16 | } 17 | - 18 | +/* 19 | if (opt_extract) { 20 | if (mknod(path, i->mode, devtype) < 0) { 21 | die(FSCK_ERROR, 1, _("mknod failed: %s"), path); 22 | } 23 | change_file_status(path, i); 24 | } 25 | +*/ 26 | } 27 | 28 | static void expand_fs(char *path, struct cramfs_inode *inode) 29 | -------------------------------------------------------------------------------- /src/scripts/comparebinaries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Binary Analysis Tool 4 | # Copyright 2013-2015 Armijn Hemel for Tjaldur Software Governance Solutions 5 | # Licensed under Apache 2.0, see LICENSE file for details 6 | 7 | ''' 8 | This program compares two binaries (firmwares, files, etc.) in various ways to 9 | see how close they are. 10 | 11 | There are a few scenarios where this program can be used: 12 | 13 | 1. comparing an old firmware (that is already known and which has been verified) 14 | to a new firmware (update) and see if there are any big differences. 15 | 2. comparing a firmware to a rebuild of a firmware as part of compliance 16 | engineering 17 | 3. comparing two binaries to see if a certain security bug might be present 18 | 19 | A few assumptions are made: 20 | 21 | 1. 
both firmwares were unpacked using the Binary Analysis Tool 22 | 2. files that are in the original firmware, but not in the new firmware, are 23 | not reported (example: removed binaries). This might change in a future version. 24 | 3. files that are in the new firmware but not in the original firmware are 25 | reported, since this would mean additions to the firmware, possibly with 26 | license conditions or security concerns. 27 | 4. files that appear in both firmwares but which are not identical are checked 28 | using bsdiff and, if available, tlsh. 29 | 30 | With just checksums it is easy to find the files that are different. Using bsdiff 31 | and tlsh it becomes easier to see how big the difference really is. 32 | 33 | Low values probably point at changes that are not interesting at all: 34 | * time stamps (BusyBox, Linux kernel, etc. record a time stamp in the binary) 35 | * slightly different compiler settings 36 | 37 | If the diffs get larger there are of course bigger changes. 38 | 39 | This approach will make it easier to make a baseline scan of a firmware, then 40 | find, prioritize and scan only the differences in an update of the firmware. 41 | ''' 42 | 43 | import sys 44 | import os 45 | import os.path 46 | import hashlib 47 | import subprocess 48 | import tempfile 49 | import magic 50 | import multiprocessing 51 | from optparse import OptionParser 52 | try: 53 | import tlsh 54 | tlshscanning = True 55 | except: 56 | tlshscanning = False 57 | 58 | # copied from bruteforce.py: compute the SHA256 checksum of a file, reading it in blocks 59 | def gethash(path, filename): 60 | scanfile = open("%s/%s" % (path, filename), 'r') 61 | h = hashlib.new('sha256') 62 | scanfile.seek(0) 63 | hashdata = scanfile.read(10000000) 64 | while hashdata != '': 65 | h.update(hashdata) 66 | hashdata = scanfile.read(10000000) 67 | scanfile.close() 68 | return h.hexdigest() 69 | 70 | # method to compare binaries. 
Returns the amount of bytes that differ 71 | # according to bsdiff, or 0 if the files are identical 72 | def comparebinaries(path1, path2): 73 | basepath1 = os.path.basename(path1) 74 | dirpath1 = os.path.dirname(path1) 75 | basepath2 = os.path.basename(path2) 76 | dirpath2 = os.path.dirname(path2) 77 | # binaries are identical 78 | if gethash(dirpath1, basepath1) == gethash(dirpath2, basepath2): 79 | return 0 80 | difftmp = tempfile.mkstemp() 81 | os.fdopen(difftmp[0]).close() 82 | p = subprocess.Popen(["bsdiff", path1, path2, difftmp[1]], stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True) 83 | # cleanup 84 | (stanout, stanerr) = p.communicate() 85 | diffsize = os.stat(difftmp[1]).st_size 86 | os.unlink(difftmp[1]) 87 | return diffsize 88 | 89 | def main(argv): 90 | parser = OptionParser() 91 | parser.add_option("-n", "--new", action="store", dest="newdir", help="path to BAT results of new binary", metavar="DIR") 92 | parser.add_option("-o", "--original", action="store", dest="olddir", help="path to BAT results of original binary", metavar="DIR") 93 | (options, args) = parser.parse_args() 94 | if options.olddir == None or options.newdir == None: 95 | parser.error("Supply paths to both directories") 96 | 97 | if not os.path.exists(options.olddir): 98 | parser.error("Directory \"%s\" does not exist" % (options.olddir,)) 99 | 100 | if not os.path.exists(options.newdir): 101 | parser.error("Directory \"%s\" does not exist" % (options.newdir,)) 102 | 103 | ms = magic.open(magic.MAGIC_NONE) 104 | ms.load() 105 | 106 | # The goal is to check the files from the new binary and 107 | # compare them with files from the old binary 108 | # First build a list of files in the original binary 109 | # Then do the same for the new binary and check: 110 | # * does a file with the same name exist in the original binary 111 | # * do the files differ 112 | # and report about it 113 | checkfiles = {} 114 | osgen = os.walk(options.olddir) 115 | try: 116 | while True: 117 | i = osgen.next() 118 | for p in i[2]: 119 | if os.path.islink(os.path.join(i[0], p)): 120 | continue 121 | if not os.path.isfile(os.path.join(i[0], p)): 122 | continue 123 | if not checkfiles.has_key(p): 124 | checkfiles[p] = [os.path.join(i[0], p)] 125 | else: 126 | checkfiles[p].append(os.path.join(i[0],p)) 127 | except StopIteration: 128 | pass 129 | notfoundnewdir = [] 130 | notfoundorigdir = [] 131 | # now loop over the new binary 132 | osgen = os.walk(options.newdir) 133 | try: 134 | while True: 135 | i = osgen.next() 136 | for p in i[2]: 137 | if os.path.islink(os.path.join(i[0], p)): 138 | continue 139 | if not os.path.isfile(os.path.join(i[0], p)): 140 | continue 141 | # name of this file can't be found in old scan tree, so report 142 | if not checkfiles.has_key(p): 143 | notfoundnewdir.append(p) 144 | else: 145 | for j in checkfiles[p]: 146 | diff = comparebinaries(j, os.path.join(i[0], p)) 147 | # bsdiff between two identical files is 143 bytes 148 | if diff <= 143 : 149 | continue 150 | else: 151 | print "* %s and %s differ %d bytes according to bsdiff" % ("%s/%s" % (i[0], p), j, diff) 152 | except StopIteration: 153 | pass 154 | 155 | if notfoundnewdir != []: 156 | print "\nThe following files from the new binary were not found in the original binary:" 157 | for i in notfoundnewdir: 158 | print "* %s" % i 159 | 160 | # TODO: check for files in the original directory as well, although 161 | # removal of files might not be as interesting 162 | if notfoundorigdir != []: 163 | print "\nThe following files from the 
original binary were not found in the new binary:" 164 | for i in notfoundorigdir: 165 | print "* %s" % i 166 | 167 | if __name__ == "__main__": 168 | main(sys.argv) 169 | -------------------------------------------------------------------------------- /src/scripts/findxor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # Binary Analysis Tool 4 | # Copyright 2015 Armijn Hemel for Tjaldur Software Governance Solutions 5 | # Licensed under Apache 2.0, see LICENSE file for details 6 | 7 | ''' 8 | Find XOR key using some very superdumb methods. 9 | 10 | The idea is to exploit the idea that padding is used in firmwares. Usually padding 11 | consists of NUL bytes. When XORing the key with NUL bytes the result will be the key. 12 | Often it is very easy to see the key in plain sight using for example the command 13 | "hexdump -C". 14 | 15 | In this script it is assumed (for now) that the keylength is 16 and that there is just 16 | one single key used. Manual inspection is definitely needed. 17 | ''' 18 | 19 | import collections 20 | import os 21 | import sys 22 | 23 | from optparse import OptionParser 24 | 25 | def findpadding(firmware): 26 | counter = collections.Counter() 27 | fwfile = open(firmware) 28 | firmwarebytes = fwfile.read() 29 | fwfile.close() 30 | fwlen = len(firmwarebytes) 31 | blocks = fwlen/16 32 | byteblocks = [] 33 | for i in xrange(0, blocks): 34 | byteblocks.append(firmwarebytes[i*16:i*16+16]) 35 | counter.update(byteblocks) 36 | rank = 1 37 | reportamount = 10 38 | print "MOST COMMON, TOP %d" % reportamount 39 | for i in counter.most_common(reportamount): 40 | print rank, i[1], map(lambda x: hex(ord(x)), i[0]) 41 | rank += 1 42 | 43 | def main(argv): 44 | parser = OptionParser() 45 | parser.add_option("-f", "--firmware", action="store", dest="firmware", help="path to firmware", metavar="FILE") 46 | (options, args) = parser.parse_args() 47 | if options.firmware == None: 48 | parser.exit("Path to firmware not supplied, exiting") 49 | if os.path.isdir(options.firmware): 50 | print >>sys.stderr, "%s is not a file" % options.firmware 51 | sys.exit(1) 52 | 53 | findpadding(options.firmware) 54 | 55 | if __name__ == "__main__": 56 | main(sys.argv) 57 | -------------------------------------------------------------------------------- /src/scripts/sourcewalk.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Binary Analysis Tool 4 | # Copyright 2013-2015 Armijn Hemel for Tjaldur Software Governance Solutions 5 | # Licensed under Apache 2.0, see LICENSE file for details 6 | 7 | ''' 8 | This program can quickly determine whether or not a file is in known upstream 9 | sources. It uses a pregenerated database containing names and checksums of 10 | files (for example the Linux kernel) and reports whether or not it can be found 11 | in the database. 12 | 13 | The purpose of this script is to find files that differ from upstream files and 14 | reduce the search space. 15 | 16 | This script will *NOT* catch: 17 | 18 | * binary files 19 | * patch/diff files 20 | * anything that does not have an extension from the list 21 | * configuration files 22 | ''' 23 | 24 | import os 25 | import os.path 26 | import sys 27 | import sqlite3 28 | import hashlib 29 | from optparse import OptionParser 30 | 31 | # list of extensions, plus what language they should be mapped to 32 | # This is not necessarily correct, but for now it is good enough. 
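# These groupings are similar to the code2html patches in src/patches:
# Groovy and Scala sources are treated as Java, and QML files are grouped
# with the C family of languages.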
33 | extensions = {'.c' : 'C', 34 | '.cc' : 'C', 35 | '.cpp' : 'C', 36 | '.cxx' : 'C', 37 | '.c++' : 'C', 38 | '.h' : 'C', 39 | '.hh' : 'C', 40 | '.hpp' : 'C', 41 | '.hxx' : 'C', 42 | '.l' : 'C', 43 | '.qml' : 'C', 44 | '.s' : 'C', 45 | '.txx' : 'C', 46 | '.y' : 'C', 47 | '.cs' : 'C#', 48 | '.groovy' : 'Java', 49 | '.java' : 'Java', 50 | '.jsp' : 'Java', 51 | '.scala' : 'Java', 52 | '.as' : 'ActionScript', 53 | '.js' : 'JavaScript', 54 | } 55 | 56 | def sourceWalk(scandir, dbpath): 57 | conn = sqlite3.connect(dbpath, check_same_thread = False) 58 | 59 | cursor = conn.cursor() 60 | osgen = os.walk(scandir) 61 | lenscandir = len(scandir) 62 | notfound = 0 63 | total = 0 64 | 65 | try: 66 | while True: 67 | i = osgen.next() 68 | for p in i[2]: 69 | if os.stat("%s/%s" % (i[0], p)).st_size == 0: 70 | continue 71 | p_nocase = p.lower() 72 | for extension in extensions.keys(): 73 | if (p_nocase.endswith(extension)): 74 | total = total + 1 75 | scanfile = open("%s/%s" % (i[0], p), 'r') 76 | h = hashlib.new('sha256') 77 | h.update(scanfile.read()) 78 | scanfile.close() 79 | filehash = h.hexdigest() 80 | cursor.execute('''select checksum from processed_file where checksum=? limit 1''', (filehash,)) 81 | res = cursor.fetchall() 82 | # there is at least one hit, so ignore 83 | if len(res) != 0: 84 | continue 85 | # no hits, so this is an interesting file 86 | else: 87 | print "%s" % os.path.join(scandir, i[0][lenscandir:],p) 88 | notfound = notfound + 1 89 | pass 90 | except StopIteration: 91 | pass 92 | print "Total files: %d" % total 93 | print "Files not found in database: %d" % notfound 94 | 95 | def main(argv): 96 | parser = OptionParser() 97 | parser.add_option("-d", "--database", action="store", dest="db", help="path to database", metavar="FILE") 98 | parser.add_option("-f", "--filedir", action="store", dest="filedir", help="path to top level directory containing source tree", metavar="DIR") 99 | (options, args) = parser.parse_args() 100 | if options.filedir == None: 101 | parser.error("Specify dir with files") 102 | if options.db == None: 103 | parser.error("Specify path to database") 104 | 105 | sourceWalk(options.filedir, options.db) 106 | 107 | if __name__ == "__main__": 108 | main(sys.argv) 109 | -------------------------------------------------------------------------------- /src/setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_rpm] 2 | release = 1 3 | packager = Armijn Hemel 4 | group = Development/Tools 5 | doc_files = LICENSE 6 | requires = python-magic, binutils, e2fsprogs, e2tools, squashfs-tools, coreutils, xz, xz-lzma-compat, zip, unzip, unrar, cabextract, unshield, p7zip, p7zip-plugins, cpio, tar, bzip2, mtd-utils, lzip, lzop, arj, icoutils, rpm, rpm-python, gettext, bat-extratools >= 27.0, ucl, upx, poppler-utils, netpbm-progs, libxml2, lrzip, ncompress, python-imaging, vorbis-tools, ctags, python-matplotlib, file, pydot, bsdiff, python-reportlab, liberation-sans-fonts, clamav, john, python-psycopg2, openssl 7 | -------------------------------------------------------------------------------- /src/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from distutils.core import setup 4 | import glob 5 | import os.path 6 | 7 | setup(name='bat', 8 | version='37.0', 9 | description='Binary Analysis Tool', 10 | author='Binary Analysis Project', 11 | author_email='info@binaryanalysis.org', 12 | url='http://www.binaryanalysis.org/', 13 | packages=['bat'], 14 | 
license="Apache 2.0", 15 | scripts=['maintenance/busybox-appletname-extractor.py', 'maintenance/clonedbinit.py', 'bat-scan', 'busybox-compare-configs.py'], 16 | data_files=[ ('/etc/bat', ['bat-scan.config']), 17 | ], 18 | long_description="""The Binary Analysis Tool is a modular framework that assists with auditing 19 | the contents of compiled software. It makes it easier and cheaper to look 20 | inside technology, and this helps compliance and due diligence activities. 21 | 22 | The tool is freely available to everyone. The community can use it and 23 | participate in further development, and work together to help reduce errors 24 | when shipping devices or products containing Free and Open Source Software.""" 25 | ) 26 | --------------------------------------------------------------------------------