├── .gitignore ├── .gitmodules ├── LICENCE.md ├── Makefile ├── Makefile.rules ├── README.md ├── TODO.md ├── dtracker.H ├── dtracker.cpp ├── dtracker_debug.H ├── dtracker_debug.cpp ├── hooks ├── hooks.H ├── libdft_tag_bitset │ ├── mmap.cpp │ ├── openclose.cpp │ ├── read.cpp │ └── write.cpp ├── libdft_tag_set_fdoff │ ├── mmap.cpp │ ├── openclose.cpp │ ├── read.cpp │ └── write.cpp └── syscall_args.h ├── osutils.H ├── osutils.cpp ├── provlog.H ├── provlog.cpp ├── raw2dsl.py ├── raw2ttl.py ├── samples ├── .gitignore ├── Makefile ├── README.txt ├── ccombine.c ├── sgrep.c ├── tricky.c └── upcase.c └── support ├── makefile.libdft ├── makefile.pin ├── makefile.provtoolbox └── makefile.vars /.gitignore: -------------------------------------------------------------------------------- 1 | *.sw* 2 | *.pyc 3 | *.out 4 | *.log 5 | obj-ia32 6 | pin 7 | support/pin-* 8 | support/neoclipse-* 9 | support/provToolbox 10 | support/toolbox-* 11 | neoclipse 12 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "support/libdft"] 2 | path = support/libdft 3 | url = https://git.cs.vu.nl/r.vermeulen/libdft.git 4 | -------------------------------------------------------------------------------- /LICENCE.md: -------------------------------------------------------------------------------- 1 | **Copyright (c) 2014, VU University Amsterdam.** 2 | **All rights reserved.** 3 | 4 | This software was developed by Manolis Stamatogiannakis at VU University, Amsterdam, The Netherlands. 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 
10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | * Neither the name of VU University nor the names of its 14 | contributors may be used to endorse or promote products derived from 15 | this software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 18 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 21 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 23 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 24 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 25 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ############################################################## 2 | # 3 | # DO NOT EDIT THIS FILE! 4 | # 5 | ############################################################## 6 | ifdef PIN_ROOT 7 | CONFIG_ROOT := $(PIN_ROOT)/source/tools/Config 8 | else 9 | PIN_ROOT := ./pin 10 | CONFIG_ROOT := $(PIN_ROOT)/source/tools/Config 11 | endif 12 | PIN_VERSION := $(shell head -1 $(PIN_ROOT)/README | tr -dc 0-9 | awk '{if ($$0 < 100) { print 10*$$0 } else { print $$0 }}') 13 | ifeq ($(PIN_VERSION),) 14 | $(error Cannot determine Pin version.) 
15 | else 16 | $(info ## Using Pin v$(PIN_VERSION) from $(PIN_ROOT).) 17 | $(info ## Loading defaults from $(CONFIG_ROOT)/makefile.config.) 18 | include $(CONFIG_ROOT)/makefile.config 19 | endif 20 | 21 | ifneq ($(HOST_ARCH),ia32) 22 | $(info ## Running on $(HOST_ARCH) host. Targetting to ia32.) 23 | TARGET := ia32 24 | $(info ## Reloading defaults from $(CONFIG_ROOT)/makefile.config.) 25 | include $(CONFIG_ROOT)/makefile.config 26 | endif 27 | 28 | include Makefile.rules 29 | 30 | ifeq ($(wildcard $(TOOLS_ROOT)/Config/makefile.default.rules),) 31 | $(warning Cannot include makefile.default.rules. This is caused because Pin assumes our code is located under PIN_ROOT. Fix this by running:) 32 | $(warning sed -i.bak 's/PIN_ROOT :=/PIN_ROOT ?=/' "$(CONFIG_ROOT)/makefile.unix.config") 33 | $(error Aborting) 34 | else 35 | $(info ## Loading default rules for $(TOOLS_ROOT)/Config/makefile.default.rules.) 36 | include $(TOOLS_ROOT)/Config/makefile.default.rules 37 | endif 38 | 39 | ############################################################## 40 | # 41 | # DO NOT EDIT THIS FILE! 42 | # 43 | ############################################################## 44 | -------------------------------------------------------------------------------- /Makefile.rules: -------------------------------------------------------------------------------- 1 | # In this file we define which of our own applications/tools/test should be built. 2 | # For docs see: http://software.intel.com/sites/landingpage/pintool/docs/62141/Pin/html/index.html#AddingTests 3 | # For a complex example see: pin/source/tools/SimpleExamples/makefile.rules 4 | 5 | # DataTracker top dir 6 | DTRACKER_ROOT = $(abspath $(dir $(firstword $(MAKEFILE_LIST)))) 7 | 8 | # Paths for support libraries. 
# Location of the support libraries, relative to and resolved from the
# directory make is invoked in.
# NOTE: $(realpath) expands to nothing when the path does not exist yet
# (e.g. before the libdft submodule has been checked out).
SUPPORT_REL_PATH = ./support
SUPPORT_PATH = $(realpath $(SUPPORT_REL_PATH))
LIBDFT_PATH = $(realpath $(SUPPORT_PATH)/libdft/src)

# Tag type used by libdft. Override from the environment or command line.
LIBDFT_TAG_FLAGS ?= -DLIBDFT_TAG_TYPE=libdft_tag_set_fdoff
#LIBDFT_TAG_FLAGS ?= -DLIBDFT_TAG_TYPE=libdft_tag_bitset -DTAG_BITSET_SIZE=32

# Turns pintool.log output off.
LOGGING_FLAGS = -DNO_PINTOOL_LOG

# Pin Makefile variables.
TOOL_ROOTS := dtracker
APP_ROOTS :=
TEST_ROOTS :=
TOOL_CXXFLAGS += -std=c++11 $(LOGGING_FLAGS) $(LIBDFT_TAG_FLAGS) -I$(DTRACKER_ROOT) -I$(LIBDFT_PATH)

#######################################################################
# Data Tracker.
#######################################################################
# The active hooks directory is named after the value of LIBDFT_TAG_TYPE,
# which is obtained by running the preprocessor over the tag flags.
# These are simply-expanded (:=) so the shell pipeline and the wildcard
# lookup run once at parse time instead of on every reference. All of them
# are already consumed at parse time by the rule headers below, so the
# inputs they need are the same as before.
DTRACKER_HOOKS_ACTIVE := $(shell echo "LIBDFT_TAG_TYPE" | $(CPP) $(LIBDFT_TAG_FLAGS) - | tail -1)
DTRACKER_HOOKS_DIR := hooks/$(DTRACKER_HOOKS_ACTIVE)
DTRACKER_HOOKS_SRC := $(wildcard $(DTRACKER_HOOKS_DIR)/*.cpp)
DTRACKER_HOOKS_OBJS := $(patsubst %.cpp,$(OBJDIR)%$(OBJ_SUFFIX),$(DTRACKER_HOOKS_SRC))

# An empty source list otherwise only surfaces as undefined symbols when
# linking the tool. Warn (do not abort) so unrelated targets keep working.
ifeq ($(strip $(DTRACKER_HOOKS_SRC)),)
$(warning No hook sources found in '$(DTRACKER_HOOKS_DIR)'. Check LIBDFT_TAG_FLAGS ($(LIBDFT_TAG_FLAGS)) and that $$(CPP) works.)
endif

DTRACKER_OBJS = $(OBJDIR)dtracker$(OBJ_SUFFIX)\
				$(DTRACKER_HOOKS_OBJS)\
				$(OBJDIR)provlog$(OBJ_SUFFIX)\
				$(OBJDIR)osutils$(OBJ_SUFFIX)\
				$(OBJDIR)dtracker_debug$(OBJ_SUFFIX)

# The hooks output directory is an order-only prerequisite: it must exist,
# but its timestamp must not trigger rebuilds.
$(OBJDIR)dtracker$(OBJ_SUFFIX): dtracker.cpp | $(OBJDIR)hooks/$(DTRACKER_HOOKS_ACTIVE)
	$(CXX) $(TOOL_CXXFLAGS) $(COMP_OBJ)$@ $<

# The hook objects are written into that directory, so they need it too.
# Without this, a parallel build can compile a hook before the directory
# has been created. This rule has no recipe; the objects are still built
# by the pattern rules.
$(DTRACKER_HOOKS_OBJS): | $(OBJDIR)hooks/$(DTRACKER_HOOKS_ACTIVE)

# Note: $(TOOL_LIBS) must come after -ldft, or you'll get undefined symbols at runtime.
$(OBJDIR)dtracker$(PINTOOL_SUFFIX): $(DTRACKER_OBJS)
	$(LINKER) $(TOOL_LDFLAGS) $(LINK_EXE)$@ $(^:%.h=) $(TOOL_LPATHS) -L$(LIBDFT_PATH) -ldft $(TOOL_LIBS)

#######################################################################
# Directories.
47 | ####################################################################### 48 | $(OBJDIR)%: 49 | mkdir -p $@ 50 | 51 | ####################################################################### 52 | # Generic rules for support libraries. 53 | ####################################################################### 54 | SUPPORT_VARS += CC="$(CC)" CFLAGS="$(TOOL_CFLAGS)" 55 | SUPPORT_VARS += CXX="$(CXX)" CXXFLAGS="$(TOOL_CXXFLAGS)" 56 | 57 | .PHONY: support-% 58 | support-%: 59 | LIBDFT_TAG_FLAGS="$(LIBDFT_TAG_FLAGS)" $(MAKE) -C support -f makefile.$* 60 | 61 | .PHONY: help 62 | help: 63 | $(info Some potentially useful targets:) 64 | $(info - support : Builds libraries to be used for writing/compiling pin tools.) 65 | $(info - support-clean : Remove the built libraries.) 66 | $(info ) 67 | $(info Some potentially useful options:) 68 | $(info - DEBUG=1 : Turns off optimizations and enables debug flags.) 69 | $(info - USE_GLIB=1 : Enable compile/linking flags for glib.) 70 | 71 | # vim: set noet ts=4 sts=4 sw=4 ai ft=make : 72 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | DataTracker 2 | =========== 3 | 4 | About 5 | ------ 6 | DataTracker is a tool for collecting high-fidelity data provenance from unmodified Linux programs. It is based on [Intel Pin][pin] _Dynamic Binary Instrumentation_ framework and [libdft][libdft] _Dynamic Taint Analysis_ library. The taint marks supported by the original libdft are of limited size and cannot provide adequate fidelity for use in provenance tracking. For this, DataTracker uses a [modified version][libdft-mod] of the library developed at [VU University Amsterdam][vu-cs]. 7 | 8 | DataTracker was developed at VU University Amsterdam by Manolis Stamatogiannakis and presented at IPAW14. 9 | You can get a [copy of the paper][dtracker-dare] from VU Digital Archive Repository ([VU-DARE][vu-dare]). 
10 | We also have a [demo on YouTube][ipaw14-demo]. 11 | Presentation slides are available upon request. 12 | 13 | Requirements 14 | ------------- 15 | DataTracker can work with 32bit Linux programs. This limitation is imposed by the current version of libdft. However, the methods used by both DataTracker and libdft are not platform-specific. So, in principle, they can be ported to any platform supported by Intel Pin. The requirements for running DataTracker are: 16 | 17 | * A C++11 compiler and unix build utilities (e.g. GNU Make). 18 | * A recent (>=2.13) version of Intel Pin. The framework must be present in directory ``pin`` inside the DataTracker top directory. 19 | * A suitable version of the [modified libdft][libdft-mod] - typically the latest available. This must be placed in directory ``support/libdft``. 20 | * Python 2.7 for converting raw provenance to [PROV][prov] format in [Turtle][turtle] syntax. 21 | 22 | Installation 23 | ------------- 24 | After cloning DataTracker, follow these steps to compile it. 25 | 26 | **Multiarch setup (intel64 only):** 27 | On intel64 (a.k.a. x86\_64) hosts, DataTracker and libdft need to be cross-compiled to ia32. 28 | For this, you will need a working multiarch setup. 29 | [Google](http://www.google.com) and [serverfault](https://serverfault.com/) are your friends for this. 30 | 31 | **Build environment:** 32 | On Debian/Ubuntu systems, you should install the ``build-essential`` meta-package, which will provide a C++ compiler and GNU Make. On other systems, you should either install some equivalent meta-package or install the tools one by one using trial and error. 33 | 34 | **Intel Pin:** You can [manually download][pin-dl] a suitable Pin version and extract it into the ``pin`` directory. For convenience, a makefile is provided which takes care of this, i.e. it downloads and extracts a suitable Pin version. 
Invoke it using: 35 | 36 | ``` 37 | make -C support -f makefile.pin 38 | ``` 39 | 40 | **libdft:** The modified libdft is packed as a submodule of DataTracker. You need to disable Git's certificate checking to successfully retrieve it. Because libdft does not use [Pin's makefile infrastructure][pin-makefile], you need to set the ``PIN_ROOT`` environment variable before compiling it. E.g.: 41 | 42 | ``` 43 | export PIN_ROOT=$(pwd)/pin 44 | GIT_SSL_NO_VERIFY=true git submodule update --init 45 | make support-libdft 46 | ``` 47 | 48 | **dtracker pin tool**: Finally, compile the pin tool of DataTracker using: 49 | 50 | ``` 51 | make 52 | ``` 53 | 54 | If all the above steps were successful, ``obj-ia32/dtracker.so`` will be created. This is the Pin tool containing all the instrumentation required to capture provenance. 55 | 56 | 57 | Running 58 | --------- 59 | 60 | ### Capturing raw provenance 61 | To capture provenance from a program, launch it from the unix shell using something like this: 62 | 63 | ``` 64 | ./pin/pin.sh -follow_execv -t ./obj-ia32/dtracker.so -- <program> <args> 65 | ``` 66 | 67 | The command runs the program under Pin with the DataTracker tool loaded. 68 | In addition to the standard Pin knobs, DataTracker supports these tool-specific knobs: 69 | 70 | * ```-stdin [1|0]```: Turns tracking of data read from the standard input on or off. Default is off. 71 | * ```-stdout [1|0]```: Turns logging of provenance of data written to standard output on or off. Default is on. 72 | * ```-stderr [1|0]```: Turns logging of provenance of data written to standard error on or off. Default is off. 73 | 74 | Note that launching large programs using the method above takes a lot of time. For such programs, it is suggested to first launch the program and then attach DataTracker to the running process like this: 75 | 76 | ``` 77 | ./pin/pin.sh -follow_execv -pid <pid> -t ./obj-ia32/dtracker.so 78 | ``` 79 | 80 | The raw provenance generated by DataTracker is contained in file ``rawprov.out``. 
Any additional debugging information is written to file ``pintool.log``. 81 | 82 | ### Converting to PROV 83 | The ``raw2ttl.py`` script converts the raw provenance generated by DataTracker to [PROV][prov] format in [Turtle][turtle] syntax. The converter works as a filter. So, a conversion would look like this: 84 | 85 | ``` 86 | python raw2ttl.py < rawprov.out > prov.ttl 87 | ``` 88 | 89 | ### Visualizing provenance 90 | For visualization of the generated provenance, we suggest using [``provconvert``][provconvert] from Luc Moreau's [ProvToolbox][provtoolbox]. It is suggested to use the binary release. 91 | 92 | Of course, any other PROV-compatible tool can be used, either directly or via conversion of the Turtle file to a supported syntax. 93 | If you produce any good-looking provenance graphs, we'd love to incorporate them in these pages. 94 | 95 | Sample programs 96 | ---------------- 97 | This repository also includes a few sample programs we used for evaluating the effectiveness of DataTracker. You can find these programs in the ``samples`` directory. 
To build them, use: 98 | 99 | ``` 100 | make -C samples 101 | ``` 102 | 103 | 125 | 126 | [pin]: http://software.intel.com/en-us/articles/pin-a-dynamic-binary-instrumentation-tool 127 | [pin-dl]: http://software.intel.com/en-us/articles/pintool-downloads 128 | [pin-makefile]: http://software.intel.com/sites/landingpage/pintool/docs/62732/Pin/html/index.html#MAKEFILES 129 | [libdft]: http://www.cs.columbia.edu/~vpk/research/libdft/ 130 | [libdft-mod]: https://git.cs.vu.nl/r.vermeulen/libdft 131 | [vu-cs]: http://www.cs.vu.nl/en/ 132 | [turtle]: http://www.w3.org/TeamSubmission/turtle/ 133 | [prov]: http://www.w3.org/TR/2013/NOTE-prov-overview-20130430/ 134 | [provconvert]: https://github.com/lucmoreau/ProvToolbox/wiki/provconvert 135 | [provtoolbox]: https://github.com/lucmoreau/ProvToolbox/wiki/ProvToolbox-Home 136 | [ipaw14-demo]: https://www.youtube.com/watch?v=BD0h6M5mVoo 137 | [vu-dare]: http://dare.ubvu.vu.nl/ 138 | [dtracker-dare]: http://dare.ubvu.vu.nl/handle/1871/51386 139 | 140 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | # DataTracker to-do/bugs list 2 | 3 | - Need to handle `pread64`/`pwrite64`. 4 | - Need to handle `link`/`unlink`/`rename`. 5 | 6 | -------------------------------------------------------------------------------- /dtracker.H: -------------------------------------------------------------------------------- 1 | #ifndef DTRACKER_H 2 | #define DTRACKER_H 3 | 4 | /**** generic macros and definitions ******************************/ 5 | #define BOOL(x) ((x) ? 
1 : 0) 6 | 7 | /* min/max macros for general use */ 8 | #if !defined(MIN) 9 | #define MIN(a,b) (((a)<(b))?(a):(b)) 10 | #endif 11 | #if !defined(MAX) 12 | #define MAX(a,b) (((a)>(b))?(a):(b)) 13 | #endif 14 | 15 | /* compiler directives for branch prediction */ 16 | #if !defined(likely) 17 | #define likely(x) __builtin_expect((x), 1) 18 | #endif 19 | #if !defined(unlikely) 20 | #define unlikely(x) __builtin_expect((x), 0) 21 | #endif 22 | 23 | /**** debugging macros and inlines ********************************/ 24 | #ifdef NO_PINTOOL_LOG 25 | #ifdef LOG 26 | #undef LOG 27 | #endif 28 | #define LOG(args...) do{} while(0) 29 | #endif 30 | 31 | #endif 32 | 33 | /* vim: set noet ts=4 sts=4 sw=4 ai : */ 34 | -------------------------------------------------------------------------------- /dtracker.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | /* DataTracker includes. */ 11 | #include "provlog.H" 12 | #include "dtracker.H" 13 | #include "hooks/hooks.H" 14 | #include "osutils.H" 15 | 16 | /* libdft includes. */ 17 | #include "syscall_desc.h" 18 | #include "tagmap.h" 19 | 20 | /* Pin includes. */ 21 | #include 22 | 23 | // #define DTRACKER_DEBUG 24 | #include "dtracker_debug.H" 25 | 26 | /* Syscall descriptors, defined in libdft. */ 27 | extern syscall_desc_t syscall_desc[SYSCALL_MAX]; 28 | 29 | /* Pin knob for setting the raw prov output file */ 30 | static KNOB ProvRawKnob(KNOB_MODE_WRITEONCE, "pintool", "o", 31 | "rawprov.out", "The output file for raw prov data" 32 | ); 33 | 34 | /* Pin knobs for tracking stdin/stdout/stderr */ 35 | static KNOB TrackStdin(KNOB_MODE_WRITEONCE, "pintool", "stdin", 36 | "0", "Taint data originating from stdin." 37 | ); 38 | static KNOB TrackStdout(KNOB_MODE_WRITEONCE, "pintool", "stdout", 39 | "1", "Log the taint tag data for stdout." 
40 | ); 41 | static KNOB TrackStderr(KNOB_MODE_WRITEONCE, "pintool", "stderr", 42 | "0", "Log the taint tag data for stderr." 43 | ); 44 | 45 | /* 46 | * Called when a new image is loaded. 47 | * Currently only acts when the main executable is loaded to set exename global. 48 | */ 49 | static void ImageLoad(IMG img, VOID * v) { 50 | if (IMG_IsMainExecutable(img)) { 51 | exename = path_resolve(IMG_Name(img)); 52 | pid = getpid(); 53 | PROVLOG::exec(exename, pid); 54 | 55 | // Add stdin/stdout/stderr to watched file descriptors. 56 | // This should take place while loading the image in order to have 57 | // exename available. 58 | if ( atoi(TrackStdin.Value().c_str()) ) { 59 | PROVLOG::ufd_t ufd = PROVLOG::ufdmap[STDIN_FILENO]; 60 | std::string fdn = fdname(STDIN_FILENO); 61 | fdset.insert(STDIN_FILENO); 62 | LOG( "Watching fd" + decstr(STDIN_FILENO) + " (" + fdn + ").\n"); 63 | PROVLOG::open(ufd, fdn, fcntl(STDIN_FILENO, F_GETFL), 0); 64 | } 65 | if ( atoi(TrackStdout.Value().c_str()) ) { 66 | PROVLOG::ufd_t ufd = PROVLOG::ufdmap[STDOUT_FILENO]; 67 | std::string fdn = fdname(STDOUT_FILENO); 68 | fdset.insert(STDOUT_FILENO); 69 | LOG( "Watching fd" + decstr(STDOUT_FILENO) + " (" + fdn + ").\n"); 70 | PROVLOG::open(ufd, fdn, fcntl(STDOUT_FILENO, F_GETFL), 0); 71 | } 72 | if ( atoi(TrackStderr.Value().c_str()) ) { 73 | PROVLOG::ufd_t ufd = PROVLOG::ufdmap[STDERR_FILENO]; 74 | std::string fdn = fdname(STDERR_FILENO); 75 | fdset.insert(STDERR_FILENO); 76 | LOG( "Watching fd" + decstr(STDERR_FILENO) + " (" + fdn + ").\n"); 77 | PROVLOG::open(ufd, fdn, fcntl(STDERR_FILENO, F_GETFL), 0); 78 | } 79 | // TODO: Do we need to wash taint at this point? 80 | } 81 | } 82 | 83 | /* 84 | * Called before exit. 85 | * Handles any fd's that haven't been closed. 86 | */ 87 | static void OnExit(INT32, void *) { 88 | /* Generate close log entries for remaining ufds. 89 | * Don't you love the c++11 loop syntax? 
90 | */ 91 | for ( auto &fd : fdset ) { 92 | PROVLOG::ufd_t ufd = PROVLOG::ufdmap[fd]; 93 | PROVLOG::ufdmap.del(fd); 94 | PROVLOG::close(ufd); 95 | } 96 | } 97 | 98 | 99 | /* 100 | * Tool used for verifying that libdft propagates taint correctly. 101 | */ 102 | int main(int argc, char **argv) { 103 | /* initialize symbol processing */ 104 | PIN_InitSymbols(); 105 | 106 | if (unlikely(PIN_Init(argc, argv))) 107 | goto err; 108 | 109 | IMG_AddInstrumentFunction(ImageLoad, 0); 110 | PIN_AddFiniFunction(OnExit, 0); 111 | 112 | #ifdef DTRACKER_DEBUG 113 | INS_AddInstrumentFunction(CheckMagicValue, 0); 114 | #endif 115 | 116 | LOG("Initializing libdft.\n"); 117 | if (unlikely(libdft_init() != 0)) 118 | goto err; 119 | 120 | // reset counters 121 | bzero(stdcount, sizeof(stdcount)); 122 | 123 | // Open raw prov file. 124 | // This file is to be post-processed to get the data in a proper format. 125 | PROVLOG::rawProvStream.open(ProvRawKnob.Value().c_str()); 126 | 127 | 128 | /* 129 | * Install taint sources and sinks. 130 | * syscall_set_{pre, post}() set the callbacks in the libdft 131 | * syscall description struct. 132 | * These callbacks are respectively invoked through 133 | * sysenter_save() and sysexit_save() function of libdft. 134 | * In turn, these libdft functions are hooked to run before/after 135 | * every syscall using PIN_AddSyscall{Entry, Exit}Function(). 
136 | */ 137 | 138 | /* dtracker_openclose.cpp: open(2), creat(2), close(2) */ 139 | (void)syscall_set_pre(&syscall_desc[__NR_open], pre_open_hook); 140 | (void)syscall_set_pre(&syscall_desc[__NR_creat], pre_open_hook); 141 | (void)syscall_set_post(&syscall_desc[__NR_open], post_open_hook); 142 | (void)syscall_set_post(&syscall_desc[__NR_creat], post_open_hook); 143 | (void)syscall_set_post(&syscall_desc[__NR_close], post_close_hook); 144 | 145 | /* dtracker_read.cpp: read(2), readv(2) */ 146 | (void)syscall_set_post(&syscall_desc[__NR_read], post_read_hook); 147 | (void)syscall_set_post(&syscall_desc[__NR_readv], post_readv_hook); 148 | 149 | /* dtracker_write.cpp: write(2), writev(2) */ 150 | (void)syscall_set_post(&syscall_desc[__NR_write], post_write_hook); 151 | (void)syscall_set_post(&syscall_desc[__NR_writev], post_writev_hook); 152 | 153 | /* dtracker_mmap.cpp: mmap2(2), munmap(2) */ 154 | (void)syscall_set_post(&syscall_desc[__NR_mmap2], post_mmap2_hook); 155 | (void)syscall_set_post(&syscall_desc[__NR_munmap], post_munmap_hook); 156 | 157 | 158 | /* start the program and return something to make the compiler happy */ 159 | LOG("Starting program.\n"); 160 | PIN_StartProgram(); 161 | return EXIT_SUCCESS; 162 | 163 | err: 164 | /* error handling */ 165 | 166 | /* detach from the process */ 167 | libdft_die(); 168 | 169 | /* return */ 170 | return EXIT_FAILURE; 171 | } 172 | 173 | /* vim: set noet ts=4 sts=4 sw=4 ai : */ 174 | -------------------------------------------------------------------------------- /dtracker_debug.H: -------------------------------------------------------------------------------- 1 | #ifndef DTRACKER_DEBUG_H 2 | #define DTRACKER_DEBUG_H 3 | 4 | #include 5 | #include 6 | #include "tagmap.h" 7 | #include "pin.H" 8 | 9 | #define DT_DBG_MAGIC "Tsakas" 10 | #define DT_DBG_MAGICLEN 6 11 | 12 | inline std::string tag_memrange_sprint(ADDRINT addr, size_t n) { 13 | size_t i; 14 | std::stringstream ss; 15 | 16 | ss << "tags[" << 
StringFromAddrint(addr) << "][0:" << n-1 << "] = ["; 17 | for (i=0; i void pre_open_hook(syscall_ctx_t *ctx); 12 | template void post_open_hook(syscall_ctx_t *ctx); 13 | template void post_close_hook(syscall_ctx_t *ctx); 14 | 15 | /* dtracker_read.cpp */ 16 | template void post_read_hook(syscall_ctx_t *ctx); 17 | template void post_readv_hook(syscall_ctx_t *ctx); 18 | 19 | /* dtracker_write.cpp */ 20 | template void post_write_hook(syscall_ctx_t *ctx); 21 | template void post_writev_hook(syscall_ctx_t *ctx); 22 | 23 | /* dtracker_mmap.cpp */ 24 | template void post_mmap2_hook(syscall_ctx_t *ctx); 25 | template void post_munmap_hook(syscall_ctx_t *ctx); 26 | template void post_mremap_hook(syscall_ctx_t *ctx); 27 | 28 | #endif 29 | 30 | /* vim: set noet ts=4 sts=4 sw=4 ai : */ 31 | -------------------------------------------------------------------------------- /hooks/libdft_tag_bitset/mmap.cpp: -------------------------------------------------------------------------------- 1 | #include "hooks/hooks.H" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "libdft_api.h" 9 | #include "tagmap.h" 10 | #include "pin.H" 11 | 12 | #include "provlog.H" 13 | #include "dtracker.H" 14 | #include "osutils.H" 15 | 16 | /* TODO: Implement tagmap_getb_as_ptr which also allocates tags. 17 | * This would save us from the get/update/assign pattern we use. 18 | * TODO: Hook for mprotect(2) (?). 19 | * TODO: Hooks for munmap(), mremap(). 
20 | */ 21 | 22 | /* 23 | * mmap2(2) handler (taint-source) 24 | * 25 | * Signature: void *mmap2(void *addr, size_t length, int prot, int flags, int fd, off_t pgoffset); 26 | * ARG0 ARG1 ARG2 ARG3 ARG4 ARG5 27 | * 28 | */ 29 | #define DEF_SYSCALL_MMAP2 30 | #include "hooks/syscall_args.h" 31 | template<> 32 | void post_mmap2_hook(syscall_ctx_t *ctx) { 33 | /* not successful; optimized branch */ 34 | if (unlikely(_ADDR == (ADDRINT)-1)) { 35 | LOG("ERROR " _CALL_LOG_STR + " (" + strerror(errno) + ")\n"); 36 | return; 37 | } 38 | 39 | if (_FD >= 0 && fdset.find(_FD) != fdset.end()) { 40 | LOG("OK " _CALL_LOG_STR + "\n"); 41 | 42 | /* set tags on mapped area */ 43 | const PROVLOG::ufd_t ufd = PROVLOG::ufdmap[_FD]; 44 | size_t i = 0; 45 | 46 | while(i<_LENGTH) { 47 | tag_t t = tagmap_getb(_ADDR+i); 48 | t.set(ufd); 49 | tagmap_setb_with_tag(_ADDR+i, t); 50 | 51 | LOG( "mmap:tags[" + StringFromAddrint(_ADDR+i) + "] : " + 52 | tag_sprint(t) + "\n" 53 | ); 54 | i++; 55 | } 56 | } 57 | else { 58 | /* log mapping if it is anonymous */ 59 | if (_FD == -1) LOG("OK " _CALL_LOG_STR + "\n"); 60 | 61 | /* clear tags on mapped area */ 62 | size_t i = 0; 63 | while(i<_LENGTH) { tagmap_clrb(_ADDR+i); i++; } 64 | } 65 | } 66 | #define UNDEF_SYSCALL_MMAP2 67 | #include "hooks/syscall_args.h" 68 | 69 | /* 70 | * munmap(2) handler 71 | * 72 | * Signature: int munmap(void *addr, size_t length); 73 | * 74 | */ 75 | #define DEF_SYSCALL_MUNMAP 76 | #include "hooks/syscall_args.h" 77 | template<> 78 | void post_munmap_hook(syscall_ctx_t *ctx) { 79 | /* not successful; optimized branch */ 80 | if (unlikely(_RET_STATUS < 0)) { 81 | LOG("ERROR " _CALL_LOG_STR + " (" + strerror(errno) + ")\n"); 82 | return; 83 | } 84 | 85 | LOG("OK " _CALL_LOG_STR + "\n"); 86 | for(size_t i=0; i<_LENGTH; i++) tagmap_clrb(_ADDR+i); 87 | } 88 | #define UNDEF_SYSCALL_MUNMAP 89 | #include "hooks/syscall_args.h" 90 | 91 | /* vim: set noet ts=4 sts=4 sw=4 ai : */ 92 | 
-------------------------------------------------------------------------------- /hooks/libdft_tag_bitset/openclose.cpp: -------------------------------------------------------------------------------- 1 | #include "hooks/hooks.H" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "libdft_api.h" 8 | #include "tagmap.h" 9 | #include "pin.H" 10 | 11 | #include "provlog.H" 12 | #include "dtracker.H" 13 | #include "osutils.H" 14 | 15 | /* tracks whether path existed before the execution of syscall */ 16 | static struct { 17 | std::string pathname; 18 | int existed_before_syscall; 19 | } exist_status; 20 | 21 | /* 22 | * open(2)/creat(2) handlers 23 | * 24 | * Signatures: 25 | * int open(const char *pathname, int flags); 26 | * int open(const char *pathname, int flags, mode_t mode); 27 | * int creat(const char *pathname, mode_t mode); 28 | */ 29 | #define DEF_SYSCALL_OPEN 30 | #include "hooks/syscall_args.h" 31 | template<> 32 | void pre_open_hook(syscall_ctx_t *ctx) { 33 | /* Check the status of the pathname we are about to open/create. */ 34 | exist_status.pathname = std::string(_PATHNAME); 35 | exist_status.existed_before_syscall = path_exists(exist_status.pathname); 36 | //std::cerr << exist_status.pathname << std::endl; 37 | //std::cerr << exist_status.existed_before_syscall << std::endl; 38 | } 39 | template<> 40 | void post_open_hook(syscall_ctx_t *ctx) { 41 | /* not successful; optimized branch */ 42 | if (unlikely(_FD < 0)) { 43 | LOG("ERROR " _CALL_LOG_STR + " (" + strerror(errno) + ")\n"); 44 | return; 45 | } 46 | 47 | /* Resolve fd to full pathname. Use this instead of syscall argument. 
*/ 48 | const std::string fdn = fdname(_FD); 49 | 50 | if ( !in_dtracker_whitelist(fdn) && !path_isdir(fdn) ) { 51 | const PROVLOG::ufd_t ufd = PROVLOG::ufdmap[_FD]; 52 | fdset.insert(_FD); 53 | 54 | int created = ( 55 | exist_status.existed_before_syscall != 1 && 56 | (_FLAGS & O_CREAT) && 57 | exist_status.pathname == std::string(_PATHNAME) 58 | ); 59 | 60 | LOG("OK " _CALL_LOG_STR + "\n"); 61 | LOG("INFO mapped fd" + decstr(_FD) + ":ufd" + decstr(ufd) + "\n"); 62 | PROVLOG::open(ufd, fdn, _FLAGS, created); 63 | } 64 | else { 65 | LOG("INFO ignoring fd" + decstr(_FD) + " (" + fdn + ")\n"); 66 | } 67 | 68 | /* reset the exist_status */ 69 | exist_status.existed_before_syscall = 0; 70 | } 71 | #define UNDEF_SYSCALL_OPEN 72 | #include "hooks/syscall_args.h" 73 | 74 | /* 75 | * close(2) handler - updates watched fds 76 | * 77 | * Signature: int close(int fd); 78 | */ 79 | #define DEF_SYSCALL_CLOSE 80 | #include "hooks/syscall_args.h" 81 | template<> 82 | void post_close_hook(syscall_ctx_t *ctx) { 83 | /* not successful; optimized branch */ 84 | if (unlikely(_RET_STATUS < 0)) { 85 | LOG("ERROR " _CALL_LOG_STR + " (" + strerror(errno) + ")\n"); 86 | return; 87 | } 88 | 89 | LOG("OK " _CALL_LOG_STR + "\n"); 90 | 91 | std::set::iterator it = fdset.find(_FD); 92 | if (it == fdset.end()) return; 93 | const PROVLOG::ufd_t ufd = PROVLOG::ufdmap[_FD]; 94 | 95 | 96 | fdset.erase(it); 97 | PROVLOG::ufdmap.del(_FD); 98 | if (IS_STDFD(_FD)) stdcount[_FD] = 0; 99 | 100 | LOG("INFO removed mapping fd" + decstr(_FD) + ":ufd" + decstr(ufd) + "\n"); 101 | PROVLOG::close(ufd); 102 | } 103 | #define UNDEF_SYSCALL_CLOSE 104 | #include "hooks/syscall_args.h" 105 | 106 | /* vim: set noet ts=4 sts=4 sw=4 ai : */ 107 | -------------------------------------------------------------------------------- /hooks/libdft_tag_bitset/read.cpp: -------------------------------------------------------------------------------- 1 | #include "hooks/hooks.H" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | 
#include "libdft_api.h" 8 | #include "tagmap.h" 9 | #include "pin.H" 10 | 11 | #include "provlog.H" 12 | #include "dtracker.H" 13 | #include "osutils.H" 14 | 15 | /* 16 | * read(2) handler (taint-source) 17 | * 18 | * Signature: ssize_t read(int fd, void *buf, size_t count); 19 | */ 20 | template<> 21 | void post_read_hook(syscall_ctx_t *ctx) { 22 | /* not successful; optimized branch; errno message may be incorrect */ 23 | if (unlikely((long)ctx->ret < 0)) { 24 | LOG("Error reading from fd" + decstr(ctx->arg[SYSCALL_ARG0]) + ": " + strerror(errno) + "\n"); 25 | return; 26 | } 27 | 28 | /* define constants for better readability of code */ 29 | const size_t nr = ctx->ret; 30 | const int fd = ctx->arg[SYSCALL_ARG0]; 31 | const LEVEL_BASE::ADDRINT buf = ctx->arg[SYSCALL_ARG1]; 32 | // const size_t count = ctx->arg[SYSCALL_ARG2]; 33 | 34 | if (fdset.find(fd) != fdset.end()) { 35 | /* set tags on read bytes */ 36 | PROVLOG::ufd_t ufd = PROVLOG::ufdmap[fd]; 37 | off_t read_offset_start = 0; 38 | size_t i = 0; 39 | 40 | if (IS_STDFD(fd)) { // counters for stdin/stdout/stderr are manually maintained 41 | read_offset_start = stdcount[fd]; 42 | stdcount[fd] += nr; 43 | } 44 | else { 45 | read_offset_start = lseek(fd, 0, SEEK_CUR); 46 | if ( unlikely(read_offset_start < 0) ){ 47 | LOG("Error on L" + decstr(__LINE__) + " lseek-ing on fd" + decstr(fd) + ": " + strerror(errno) + "\n"); 48 | return; 49 | } 50 | read_offset_start -= nr; 51 | } 52 | 53 | // debug logging. 
54 | LOG("----------------------------\n"); 55 | LOG( "Read " + decstr(nr) + 56 | " bytes from fd" + decstr(fd) + 57 | ":" + decstr((LEVEL_BASE::INT64)read_offset_start) + 58 | " to " + StringFromAddrint(buf) + ".\n" 59 | ); 60 | LOG( "[" + StringFromAddrint(buf) + 61 | " - " + StringFromAddrint(buf+32) + 62 | "] = " + std::string((char *)buf, MIN(nr, 32)) + "\n" 63 | ); 64 | 65 | while(i 87 | void post_readv_hook(syscall_ctx_t *ctx) { 88 | /* iterators */ 89 | int i; 90 | struct iovec *iov; 91 | set::iterator it; 92 | 93 | /* bytes copied in a iovec structure */ 94 | size_t iov_tot; 95 | 96 | /* total bytes copied */ 97 | size_t tot = (size_t)ctx->ret; 98 | 99 | LOG("readv called. ABORT."); 100 | /* readv() was not successful; optimized branch */ 101 | if (unlikely((long)ctx->ret <= 0)) 102 | return; 103 | 104 | /* get the descriptor */ 105 | it = fdset.find((int)ctx->arg[SYSCALL_ARG0]); 106 | 107 | /* iterate the iovec structures */ 108 | for (i = 0; i < (int)ctx->arg[SYSCALL_ARG2] && tot > 0; i++) { 109 | /* get an iovec */ 110 | iov = ((struct iovec *)ctx->arg[SYSCALL_ARG1]) + i; 111 | 112 | /* get the length of the iovec */ 113 | iov_tot = (tot >= (size_t)iov->iov_len) ? 
114 | (size_t)iov->iov_len : tot; 115 | 116 | /* taint interesting data and zero everything else */ 117 | if (it != fdset.end()) 118 | /* set the tag markings */ 119 | tagmap_setn((size_t)iov->iov_base, iov_tot); 120 | else 121 | /* clear the tag markings */ 122 | tagmap_clrn((size_t)iov->iov_base, iov_tot); 123 | 124 | /* housekeeping */ 125 | tot -= iov_tot; 126 | } 127 | } 128 | 129 | /* vim: set noet ts=4 sts=4 sw=4 ai : */ 130 | -------------------------------------------------------------------------------- /hooks/libdft_tag_bitset/write.cpp: -------------------------------------------------------------------------------- 1 | #include "hooks/hooks.H" 2 | 3 | // #include 4 | // #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | #include "libdft_api.h" 12 | #include "tagmap.h" 13 | #include "pin.H" 14 | 15 | #define USE_LIBDFT_TAG_BITSET 16 | #include "provlog.H" 17 | #include "dtracker.H" 18 | #include "osutils.H" 19 | 20 | 21 | #include "pin.H" 22 | 23 | 24 | // #define __DEBUG_SYSCALL_WRITE 25 | #ifdef __DEBUG_SYSCALL_WRITE 26 | static inline std::string __RANGE2STR(const range_map_t & rmap) { 27 | std::string s; 28 | for (auto &r : rmap) { 29 | s += decstr(r.first.first) + ":" + decstr(r.first.second) + 30 | "(" + decstr((int)r.second.type) + "," + decstr(r.second.start) + 31 | "," + decstr(r.second.length) + ") "; 32 | } 33 | return s; 34 | } 35 | #endif 36 | 37 | /* 38 | * write(2) handler (taint-sink) 39 | * 40 | * Signature: ssize_t write(int fd, const void *buf, size_t count); 41 | */ 42 | #define DEF_SYSCALL_WRITE 43 | #include "hooks/syscall_args.h" 44 | template<> 45 | void post_write_hook(syscall_ctx_t *ctx) { 46 | /* ignore write() on not watched fd */ 47 | if (unlikely(fdset.find(_FD) == fdset.end())) 48 | return; 49 | 50 | /* write() was not successful; optimized branch; errno message may be incorrect */ 51 | if (unlikely(_N_WRITTEN < 0)) { 52 | LOG("ERROR " _CALL_LOG_STR + " (" + strerror(errno) + ")\n"); 53 | return; 54 
| } 55 | 56 | LOG("OK " _CALL_LOG_STR + "\n"); 57 | 58 | const PROVLOG::ufd_t ufd = PROVLOG::ufdmap[_FD]; 59 | off_t write_begin; 60 | 61 | /* calculate begining of write */ 62 | if (IS_STDFD(_FD)) { 63 | write_begin = stdcount[_FD]; 64 | stdcount[_FD] += _N_WRITTEN; 65 | } 66 | else { 67 | write_begin = lseek(_FD, 0, SEEK_CUR) - _N_WRITTEN; 68 | if ( unlikely(write_begin < 0) ){ 69 | LOG("Error on L" + decstr(__LINE__) + " lseek-ing on fd" + decstr(_FD) + ": " + strerror(errno) + "\n"); 70 | return; 71 | } 72 | } 73 | 74 | // Range aggregation. Only NONE/REP ranges make sense for libdft_tag_bitset. 75 | // In ranges[j] we keep the offset where tag[j] was first marked present. 76 | // A REP range is dumped the first time when tag[j] is not present again. 77 | INT32 *ranges = new INT32[TAG_BITSET_SIZE]; 78 | std::fill(ranges, ranges+TAG_BITSET_SIZE, -1); 79 | 80 | for(ssize_t i=0; i<_N_WRITTEN; i++) { //loop through memory locations 81 | tag_t tag = tagmap_getb(_BUF+i); 82 | for(unsigned int j=0; j= 0) { 95 | // range end - output data and reset it 96 | PROVLOG::write(j, ufd, ranges[j], write_begin+i-ranges[j]); 97 | ranges[j] = -1; 98 | } 99 | } 100 | } //loop memory locations 101 | for(unsigned int j=0; j= 0) 104 | PROVLOG::write(j, ufd, ranges[j], _N_WRITTEN-ranges[j]); 105 | } 106 | delete ranges; 107 | } 108 | #define UNDEF_SYSCALL_WRITE 109 | #include "hooks/syscall_args.h" 110 | 111 | template<> 112 | void post_writev_hook(syscall_ctx_t *ctx) { 113 | LOG("Writev. 
Not supported yet.\n"); 114 | } 115 | 116 | /* vim: set noet ts=4 sts=4 sw=4 ai : */ 117 | -------------------------------------------------------------------------------- /hooks/libdft_tag_set_fdoff/mmap.cpp: -------------------------------------------------------------------------------- 1 | #include "hooks/hooks.H" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "libdft_api.h" 9 | #include "tagmap.h" 10 | #include "pin.H" 11 | 12 | #include "provlog.H" 13 | #include "dtracker.H" 14 | #include "osutils.H" 15 | 16 | 17 | // TODO: Consider hooking mprotect(2). 18 | 19 | /* 20 | * mmap2(2) handler (taint-source) 21 | * 22 | * Signature: void *mmap2(void *addr, size_t length, int prot, int flags, int fd, off_t pgoffset); 23 | * ARG0 ARG1 ARG2 ARG3 ARG4 ARG5 24 | * 25 | * TODO: Don't forget to also create hooks for munmap(), mremap(). 26 | */ 27 | #define DEF_SYSCALL_MMAP2 28 | #include "hooks/syscall_args.h" 29 | template<> 30 | void post_mmap2_hook(syscall_ctx_t *ctx) { 31 | /* not successful; optimized branch */ 32 | if (unlikely(_ADDR == (ADDRINT)-1)) { 33 | LOG("ERROR " _CALL_LOG_STR + " (" + strerror(errno) + ")\n"); 34 | return; 35 | } 36 | 37 | if (_FD >= 0 && fdset.find(_FD) != fdset.end()) { 38 | LOG("OK " _CALL_LOG_STR + "\n"); 39 | 40 | /* set tags on mapped area */ 41 | const PROVLOG::ufd_t ufd = PROVLOG::ufdmap[_FD]; 42 | size_t i = 0; 43 | 44 | while(i<_LENGTH) { 45 | tag_t ts {{ufd, _FD_OFFSET+i}}; 46 | tag_t ts_prev = tagmap_getb(_ADDR+i); 47 | 48 | tagmap_setb_with_tag(_ADDR+i, ts); 49 | 50 | LOG( "mmap:tags[" + StringFromAddrint(_ADDR+i) + "] : " + 51 | tag_sprint(ts_prev) + " -> " + 52 | tag_sprint(tagmap_getb(_ADDR+i)) + "\n" 53 | ); 54 | i++; 55 | } 56 | } 57 | else { 58 | /* log mapping if it is anonymous */ 59 | if (_FD == -1) LOG("OK " _CALL_LOG_STR + "\n"); 60 | 61 | /* clear tags on mapped area */ 62 | size_t i = 0; 63 | while(i<_LENGTH) { tagmap_clrb(_ADDR+i); i++; } 64 | } 65 | } 66 | #define UNDEF_SYSCALL_MMAP2 67 
| #include "hooks/syscall_args.h" 68 | 69 | /* 70 | * munmap(2) handler 71 | * 72 | * Signature: int munmap(void *addr, size_t length); 73 | * 74 | */ 75 | #define DEF_SYSCALL_MUNMAP 76 | #include "hooks/syscall_args.h" 77 | template<> 78 | void post_munmap_hook(syscall_ctx_t *ctx) { 79 | /* not successful; optimized branch */ 80 | if (unlikely(_RET_STATUS < 0)) { 81 | LOG("ERROR " _CALL_LOG_STR + " (" + strerror(errno) + ")\n"); 82 | return; 83 | } 84 | 85 | LOG("OK " _CALL_LOG_STR + "\n"); 86 | for(size_t i=0; i<_LENGTH; i++) tagmap_clrb(_ADDR+i); 87 | } 88 | #define UNDEF_SYSCALL_MUNMAP 89 | #include "hooks/syscall_args.h" 90 | 91 | /* vim: set noet ts=4 sts=4 sw=4 ai : */ 92 | -------------------------------------------------------------------------------- /hooks/libdft_tag_set_fdoff/openclose.cpp: -------------------------------------------------------------------------------- 1 | #include "hooks/hooks.H" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "libdft_api.h" 8 | #include "tagmap.h" 9 | #include "pin.H" 10 | 11 | #include "provlog.H" 12 | #include "dtracker.H" 13 | #include "osutils.H" 14 | 15 | /* tracks whether path existed before the execution of syscall */ 16 | static struct { 17 | std::string pathname; 18 | int existed_before_syscall; 19 | } exist_status; 20 | 21 | /* 22 | * open(2)/creat(2) handlers 23 | * 24 | * Signatures: 25 | * int open(const char *pathname, int flags); 26 | * int open(const char *pathname, int flags, mode_t mode); 27 | * int creat(const char *pathname, mode_t mode); 28 | */ 29 | #define DEF_SYSCALL_OPEN 30 | #include "hooks/syscall_args.h" 31 | template<> 32 | void pre_open_hook(syscall_ctx_t *ctx) { 33 | /* Check the status of the pathname we are about to open/create. 
*/ 34 | exist_status.pathname = std::string(_PATHNAME); 35 | exist_status.existed_before_syscall = path_exists(exist_status.pathname); 36 | //std::cerr << exist_status.pathname << std::endl; 37 | //std::cerr << exist_status.existed_before_syscall << std::endl; 38 | } 39 | template<> 40 | void post_open_hook(syscall_ctx_t *ctx) { 41 | /* not successful; optimized branch */ 42 | if (unlikely(_FD < 0)) { 43 | LOG("ERROR " _CALL_LOG_STR + " (" + strerror(errno) + ")\n"); 44 | return; 45 | } 46 | 47 | /* Resolve fd to full pathname. Use this instead of syscall argument. */ 48 | const std::string fdn = fdname(_FD); 49 | 50 | if ( !in_dtracker_whitelist(fdn) && !path_isdir(fdn) ) { 51 | const PROVLOG::ufd_t ufd = PROVLOG::ufdmap[_FD]; 52 | fdset.insert(_FD); 53 | 54 | int created = ( 55 | exist_status.existed_before_syscall != 1 && 56 | (_FLAGS & O_CREAT) && 57 | exist_status.pathname == std::string(_PATHNAME) 58 | ); 59 | 60 | LOG("OK " _CALL_LOG_STR + "\n"); 61 | LOG("INFO mapped fd" + decstr(_FD) + ":ufd" + decstr(ufd) + "\n"); 62 | PROVLOG::open(ufd, fdn, _FLAGS, created); 63 | } 64 | else { 65 | LOG("INFO ignoring fd" + decstr(_FD) + " (" + fdn + ")\n"); 66 | } 67 | 68 | /* reset the exist_status */ 69 | exist_status.existed_before_syscall = 0; 70 | } 71 | #define UNDEF_SYSCALL_OPEN 72 | #include "hooks/syscall_args.h" 73 | 74 | /* 75 | * close(2) handler - updates watched fds 76 | * 77 | * Signature: int close(int fd); 78 | */ 79 | #define DEF_SYSCALL_CLOSE 80 | #include "hooks/syscall_args.h" 81 | template<> 82 | void post_close_hook(syscall_ctx_t *ctx) { 83 | /* not successful; optimized branch */ 84 | if (unlikely(_RET_STATUS < 0)) { 85 | LOG("ERROR " _CALL_LOG_STR + " (" + strerror(errno) + ")\n"); 86 | return; 87 | } 88 | 89 | LOG("OK " _CALL_LOG_STR + "\n"); 90 | 91 | std::set::iterator it = fdset.find(_FD); 92 | if (it == fdset.end()) return; 93 | const PROVLOG::ufd_t ufd = PROVLOG::ufdmap[_FD]; 94 | 95 | 96 | fdset.erase(it); 97 | PROVLOG::ufdmap.del(_FD); 
98 | if (IS_STDFD(_FD)) stdcount[_FD] = 0; 99 | 100 | LOG("INFO removed mapping fd" + decstr(_FD) + ":ufd" + decstr(ufd) + "\n"); 101 | PROVLOG::close(ufd); 102 | } 103 | #define UNDEF_SYSCALL_CLOSE 104 | #include "hooks/syscall_args.h" 105 | 106 | /* vim: set noet ts=4 sts=4 sw=4 ai : */ 107 | -------------------------------------------------------------------------------- /hooks/libdft_tag_set_fdoff/read.cpp: -------------------------------------------------------------------------------- 1 | #include "hooks/hooks.H" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "libdft_api.h" 8 | #include "tagmap.h" 9 | #include "pin.H" 10 | 11 | #include "provlog.H" 12 | #include "dtracker.H" 13 | #include "osutils.H" 14 | 15 | 16 | /* 17 | * read(2) handler (taint-source) 18 | * 19 | * Signature: ssize_t read(int fd, void *buf, size_t count); 20 | */ 21 | template<> 22 | void post_read_hook(syscall_ctx_t *ctx) { 23 | /* not successful; optimized branch; errno message may be incorrect */ 24 | if (unlikely((long)ctx->ret < 0)) { 25 | LOG("Error reading from fd" + decstr(ctx->arg[SYSCALL_ARG0]) + ": " + strerror(errno) + "\n"); 26 | return; 27 | } 28 | 29 | /* define constants for better readability of code */ 30 | const size_t nr = ctx->ret; 31 | const int fd = ctx->arg[SYSCALL_ARG0]; 32 | const LEVEL_BASE::ADDRINT buf = ctx->arg[SYSCALL_ARG1]; 33 | // const size_t count = ctx->arg[SYSCALL_ARG2]; 34 | 35 | if (fdset.find(fd) != fdset.end()) { 36 | /* set tags on read bytes */ 37 | const PROVLOG::ufd_t ufd = PROVLOG::ufdmap[fd]; 38 | off_t read_offset_start = 0; 39 | size_t i = 0; 40 | 41 | if (IS_STDFD(fd)) { // counters for stdin/stdout/stderr are manually maintained 42 | read_offset_start = stdcount[fd]; 43 | stdcount[fd] += nr; 44 | } 45 | else { 46 | read_offset_start = lseek(fd, 0, SEEK_CUR); 47 | if ( unlikely(read_offset_start < 0) ){ 48 | LOG("Error on L" + decstr(__LINE__) + " lseek-ing on fd" + decstr(fd) + ": " + strerror(errno) + "\n"); 49 | return; 50 
| } 51 | read_offset_start -= nr; 52 | } 53 | 54 | // debug logging. 55 | LOG("----------------------------\n"); 56 | LOG( "Read " + decstr(nr) + 57 | " bytes from fd" + decstr(fd) + 58 | ":" + decstr((LEVEL_BASE::INT64)read_offset_start) + 59 | " to " + StringFromAddrint(buf) + ".\n" 60 | ); 61 | LOG( "[" + StringFromAddrint(buf) + 62 | " - " + StringFromAddrint(buf+32) + 63 | "] = " + std::string((char *)buf, MIN(nr, 32)) + "\n" 64 | ); 65 | 66 | while(i " + 74 | tag_sprint(tagmap_getb(buf+i)) + "\n" 75 | ); 76 | i++; 77 | } 78 | } 79 | else { 80 | /* clear tags for read bytes */ 81 | size_t i = 0; 82 | while(i 90 | void post_readv_hook(syscall_ctx_t *ctx) { 91 | /* iterators */ 92 | int i; 93 | struct iovec *iov; 94 | set::iterator it; 95 | 96 | /* bytes copied in a iovec structure */ 97 | size_t iov_tot; 98 | 99 | /* total bytes copied */ 100 | size_t tot = (size_t)ctx->ret; 101 | 102 | LOG("readv called. ABORT."); 103 | /* readv() was not successful; optimized branch */ 104 | if (unlikely((long)ctx->ret <= 0)) 105 | return; 106 | 107 | /* get the descriptor */ 108 | it = fdset.find((int)ctx->arg[SYSCALL_ARG0]); 109 | 110 | /* iterate the iovec structures */ 111 | for (i = 0; i < (int)ctx->arg[SYSCALL_ARG2] && tot > 0; i++) { 112 | /* get an iovec */ 113 | iov = ((struct iovec *)ctx->arg[SYSCALL_ARG1]) + i; 114 | 115 | /* get the length of the iovec */ 116 | iov_tot = (tot >= (size_t)iov->iov_len) ? 
117 | (size_t)iov->iov_len : tot; 118 | 119 | /* taint interesting data and zero everything else */ 120 | if (it != fdset.end()) 121 | /* set the tag markings */ 122 | tagmap_setn((size_t)iov->iov_base, iov_tot); 123 | else 124 | /* clear the tag markings */ 125 | tagmap_clrn((size_t)iov->iov_base, iov_tot); 126 | 127 | /* housekeeping */ 128 | tot -= iov_tot; 129 | } 130 | } 131 | 132 | /* vim: set noet ts=4 sts=4 sw=4 ai : */ 133 | -------------------------------------------------------------------------------- /hooks/libdft_tag_set_fdoff/write.cpp: -------------------------------------------------------------------------------- 1 | #include "hooks/hooks.H" 2 | 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include "libdft_api.h" 11 | #include "tagmap.h" 12 | #include "pin.H" 13 | 14 | #define USE_LIBDFT_TAG_SET_FDOFF 15 | #include "provlog.H" 16 | #include "dtracker.H" 17 | #include "osutils.H" 18 | 19 | 20 | #include "pin.H" 21 | 22 | 23 | /* 24 | Output aggregation information 25 | ------------------------------- 26 | 27 | DTracker does two types of aggregation: 28 | a. Range mapping. 29 | I.e. 
for 0<=i 87 | void post_write_hook(syscall_ctx_t *ctx) { 88 | /* ignore write() on not watched fd */ 89 | if (unlikely(fdset.find(_FD) == fdset.end())) 90 | return; 91 | 92 | /* write() was not successful; optimized branch; errno message may be incorrect */ 93 | if (unlikely(_N_WRITTEN < 0)) { 94 | LOG("ERROR " _CALL_LOG_STR + " (" + strerror(errno) + ")\n"); 95 | return; 96 | } 97 | 98 | LOG("OK " _CALL_LOG_STR + "\n"); 99 | 100 | const PROVLOG::ufd_t ufd = PROVLOG::ufdmap[_FD]; 101 | off_t write_begin; 102 | range_map_t ranges; 103 | range_map_t ranges_prev; 104 | 105 | /* calculate begining of write */ 106 | if (IS_STDFD(_FD)) { 107 | write_begin = stdcount[_FD]; 108 | stdcount[_FD] += _N_WRITTEN; 109 | } 110 | else { 111 | write_begin = lseek(_FD, 0, SEEK_CUR) - _N_WRITTEN; 112 | if ( unlikely(write_begin < 0) ){ 113 | LOG("Error on L" + decstr(__LINE__) + " lseek-ing on fd" + decstr(_FD) + ": " + strerror(errno) + "\n"); 114 | return; 115 | } 116 | } 117 | 118 | for(ssize_t i=0; i<_N_WRITTEN; i++) { //loop through memory locations 119 | tag_t tag = tagmap_getb(_BUF+i); 120 | 121 | #ifdef __DEBUG_SYSCALL_WRITE 122 | LOG("---------------------- " + std::string((char *)(_BUF+i), 1) + "\n"); 123 | LOG("RANGES " + __RANGE2STR(ranges) + "\n"); 124 | LOG("RANGES_PREV " + __RANGE2STR(ranges_prev) + "\n"); 125 | #endif 126 | 127 | for (auto & tm : tag) { //loop taint marks for specific location 128 | 129 | // check if a single taint mark from the input is repeated in the output 130 | auto rlookup = tm; 131 | auto range_it = ranges_prev.find(rlookup); 132 | if (range_it != ranges_prev.end()) { 133 | // LOG("C1\n"); 134 | auto range_last = (*range_it).first; 135 | auto range_info = (*range_it).second; 136 | 137 | switch(range_info.type) { 138 | case range_info_t::SEQ: 139 | // adjust SEQ range 140 | range_last.second--; 141 | range_info.length--; 142 | ranges_prev.insert(range_it, std::make_pair(range_last, range_info)); 143 | ranges_prev.erase(range_it); 144 | 145 | // 
add a new REP range to next range set 146 | ranges.insert(std::make_pair(tm, (range_info_t){range_info_t::REP, i-1, 2})); 147 | continue; 148 | 149 | case range_info_t::NONE: 150 | // set range type to REP 151 | range_info.type = range_info_t::REP; 152 | 153 | case range_info_t::REP: 154 | // add info to next range set 155 | range_info.length++; 156 | ranges.insert(std::make_pair(tm, range_info)); 157 | 158 | // remove info from previous range set 159 | ranges_prev.erase(range_it); 160 | continue; 161 | } 162 | } 163 | 164 | // check if a sequence of taint marks from the input also appears in the output 165 | rlookup = tm; 166 | rlookup.second -= 1; 167 | range_it = ranges_prev.find(rlookup); 168 | if (range_it != ranges_prev.end()) { 169 | // LOG("C2\n"); 170 | auto range_info = (*range_it).second; 171 | 172 | switch(range_info.type) { 173 | case range_info_t::REP: 174 | // start a new range - we won't touch the old one 175 | ranges.insert(std::make_pair(tm, (range_info_t){range_info_t::NONE, i, 1})); 176 | continue; 177 | 178 | case range_info_t::NONE: 179 | // set range type to SEQ 180 | range_info.type = range_info_t::SEQ; 181 | 182 | case range_info_t::SEQ: 183 | // add info to next range set 184 | range_info.length++; 185 | ranges.insert(std::make_pair(tm, range_info)); 186 | 187 | // remove info from previous range set 188 | ranges_prev.erase(range_it); 189 | continue; 190 | } 191 | } 192 | 193 | // add taint mark as a new range 194 | // LOG("C3\n"); 195 | rlookup = tm; 196 | ranges.insert(std::make_pair(rlookup, (range_info_t){range_info_t::NONE, i, 1})); 197 | 198 | } //loop taint marks 199 | 200 | #ifdef __DEBUG_SYSCALL_WRITE 201 | LOG("~~~~~~~~~~~~~~\n"); 202 | LOG("RANGES " + __RANGE2STR(ranges) + "\n"); 203 | LOG("RANGES_PREV " + __RANGE2STR(ranges_prev) + "\n"); 204 | #endif 205 | 206 | //dump output for pattern sequences that were broken 207 | for (auto &tm : ranges_prev) 208 | PROVLOG::write_range(ufd, write_begin, tm.first, tm.second); 209 | 210 | 
// swap the two range sets and clear the ranges for next iteration 211 | ranges.swap(ranges_prev); 212 | ranges.clear(); 213 | 214 | } //loop memory locations 215 | 216 | //dump the remaining pattern sequences 217 | for (auto &tm : ranges_prev) 218 | PROVLOG::write_range(ufd, write_begin, tm.first, tm.second); 219 | } 220 | #define UNDEF_SYSCALL_WRITE 221 | #include "hooks/syscall_args.h" 222 | 223 | template<> 224 | void post_writev_hook(syscall_ctx_t *ctx) { 225 | LOG("Writev. Not supported yet.\n"); 226 | } 227 | 228 | /* vim: set noet ts=4 sts=4 sw=4 ai : */ 229 | -------------------------------------------------------------------------------- /hooks/syscall_args.h: -------------------------------------------------------------------------------- 1 | /* This header is meant as a helper to improve readability of the 2 | * syscall hooks code. It defines macros for the value of each 3 | * syscall argument. The file should be included twice each time: 4 | * once to enable the macro definitions for a specific syscall and 5 | * once to disable them. 6 | * 7 | * Make sure you check how each macro is defined before using them. 8 | * Sometimes, some minimal processing is done for convenience. 9 | * E.g. pgoffset argument of mmap2() is converted to bytes offset 10 | * because that is what we use in practice. 11 | * 12 | * Usage example: 13 | * #define DEF_SYSCALL_X 14 | * #include "syscall_args.h" 15 | * void pre_X_hook(syscall_ctx_t *ctx) { ... } 16 | * void post_X_hook(syscall_ctx_t *ctx) { ... } 17 | * #define UNDEF_SYSCALL_X 18 | * #include "syscall_args.h" 19 | * 20 | * In addition to argument macros, we also define the _CALL_LOG_STR 21 | * which merges the arguments in a format suitable to be used with 22 | * Pin's LOG() function. As a convention, non-imporant arguments can 23 | * be rendered as "*". 24 | * Using _CALL_LOG_STR anywhere in the code adds a lot of string 25 | * concatenations. 
To avoid that during measurements, we define it 26 | * to the empty string if NO_PINTOOL_LOG is set. 27 | */ 28 | 29 | #ifdef NO_PINTOOL_LOG 30 | #define _CALL_LOG_STR "" 31 | #endif 32 | 33 | /**** open(2) *****************************************************/ 34 | #ifdef DEF_SYSCALL_OPEN 35 | #define _FD (int)ctx->ret 36 | #define _PATHNAME (char *)ctx->arg[SYSCALL_ARG0] 37 | #define _FLAGS (ctx->nr == __NR_open ? ctx->arg[SYSCALL_ARG1] : O_CREAT|O_WRONLY|O_TRUNC) 38 | #define _MODE (ctx->nr == __NR_creat ? ctx->arg[SYSCALL_ARG1] : ctx->arg[SYSCALL_ARG2]) 39 | #ifndef NO_PINTOOL_LOG 40 | #define _CALL_LOG_STR + std::string(ctx->nr == __NR_creat ? "creat(" : "open(") + _PATHNAME + ", " + decstr(_FLAGS) + ", " + decstr(_MODE) + ") = " + decstr(_FD) 41 | #endif 42 | #undef DEF_SYSCALL_OPEN 43 | #endif 44 | 45 | #ifdef UNDEF_SYSCALL_OPEN 46 | #undef _FD 47 | #undef _PATHNAME 48 | #undef _FLAGS 49 | #undef _MODE 50 | #ifndef NO_PINTOOL_LOG 51 | #undef _CALL_LOG_STR 52 | #endif 53 | #undef UNDEF_SYSCALL_OPEN 54 | #endif 55 | 56 | /**** close(2) *****************************************************/ 57 | #ifdef DEF_SYSCALL_CLOSE 58 | #define _RET_STATUS (int)ctx->ret 59 | #define _FD (int)ctx->arg[SYSCALL_ARG0] 60 | #ifndef NO_PINTOOL_LOG 61 | #define _CALL_LOG_STR "close(" + decstr(_FD) + ") = " + decstr(_RET_STATUS) 62 | #endif 63 | #undef DEF_SYSCALL_CLOSE 64 | #endif 65 | 66 | #ifdef UNDEF_SYSCALL_CLOSE 67 | #undef _RET_STATUS 68 | #undef _FD 69 | #ifndef NO_PINTOOL_LOG 70 | #undef _CALL_LOG_STR 71 | #endif 72 | #undef UNDEF_SYSCALL_CLOSE 73 | #endif 74 | 75 | /**** mmap2(2) ****************************************************/ 76 | #ifdef DEF_SYSCALL_MMAP2 77 | #define _ADDR (ADDRINT)ctx->ret 78 | #define _ADDR_HINT (ADDRINT)ctx->arg[SYSCALL_ARG0] 79 | #define _LENGTH (size_t)ctx->arg[SYSCALL_ARG1] 80 | #define _PROT (int)ctx->arg[SYSCALL_ARG2] 81 | #define _FLAGS (int)ctx->arg[SYSCALL_ARG3] 82 | #define _FD (int)ctx->arg[SYSCALL_ARG4] 83 | #define _FD_OFFSET 
((INT64)ctx->arg[SYSCALL_ARG5]*4096) 84 | #ifndef NO_PINTOOL_LOG 85 | #define _CALL_LOG_STR "mmap2(*, " + decstr(_LENGTH) + ", " + "*, *, " + decstr(_FD) + ", " + std::string(((_FLAGS&(MAP_ANONYMOUS|MAP_ANON)) != 0) ? "*" : hexstr(_FD_OFFSET)) + ") = " + StringFromAddrint(_ADDR) 86 | #endif 87 | #undef DEF_SYSCALL_MMAP2 88 | #endif 89 | 90 | #ifdef UNDEF_SYSCALL_MMAP2 91 | #undef _ADDR 92 | #undef _ADDR_HINT 93 | #undef _LENGTH 94 | #undef _PROT 95 | #undef _FLAGS 96 | #undef _FD 97 | #undef _FD_OFFSET 98 | #ifndef NO_PINTOOL_LOG 99 | #undef _CALL_LOG_STR 100 | #endif 101 | #undef UNDEF_SYSCALL_MMAP2 102 | #endif 103 | 104 | 105 | /**** munmap(2) ***************************************************/ 106 | #ifdef DEF_SYSCALL_MUNMAP 107 | #define _RET_STATUS ctx->ret 108 | #define _ADDR (ADDRINT)ctx->arg[SYSCALL_ARG0] 109 | #define _LENGTH (size_t)ctx->arg[SYSCALL_ARG1] 110 | #ifndef NO_PINTOOL_LOG 111 | #define _CALL_LOG_STR "munmap(" + StringFromAddrint(_ADDR) + ", " + decstr(_LENGTH) + ") = " + decstr(_RET_STATUS) 112 | #endif 113 | #undef DEF_SYSCALL_MUNMAP 114 | #endif 115 | 116 | #ifdef UNDEF_SYSCALL_MUNMAP 117 | #undef _RET_STATUS 118 | #undef _ADDR 119 | #undef _LENGTH 120 | #ifndef NO_PINTOOL_LOG 121 | #undef _CALL_LOG_STR 122 | #endif 123 | #undef UNDEF_SYSCALL_MUNMAP 124 | #endif 125 | 126 | 127 | /**** read(2) *****************************************************/ 128 | #ifdef DEF_SYSCALL_READ 129 | #define _RET_STATUS (int)ctx->ret 130 | #define _FD (int)ctx->arg[SYSCALL_ARG0] 131 | #ifndef NO_PINTOOL_LOG 132 | #define _CALL_LOG_STR "close(" + decstr(_FD) + ") = " + decstr(_RET_STATUS) 133 | #endif 134 | #undef DEF_SYSCALL_READ 135 | #endif 136 | 137 | #ifdef UNDEF_SYSCALL_READ 138 | #undef _RET_STATUS 139 | #undef _FD 140 | #ifndef NO_PINTOOL_LOG 141 | #undef _CALL_LOG_STR 142 | #endif 143 | #undef UNDEF_SYSCALL_READ 144 | #endif 145 | 146 | 147 | /**** write(2) ****************************************************/ 148 | #ifdef DEF_SYSCALL_WRITE 149 | 
#define _N_WRITTEN (ssize_t)ctx->ret 150 | #define _FD (int)ctx->arg[SYSCALL_ARG0] 151 | #define _BUF (ADDRINT)ctx->arg[SYSCALL_ARG1] 152 | #define _COUNT (size_t)ctx->arg[SYSCALL_ARG2] 153 | #ifndef NO_PINTOOL_LOG 154 | #define _CALL_LOG_STR "write(" + decstr(_FD) + ", " + StringFromAddrint(_BUF) + ", " + decstr(_COUNT) + ") = " + decstr(_N_WRITTEN) 155 | #endif 156 | #undef DEF_SYSCALL_WRITE 157 | #endif 158 | 159 | #ifdef UNDEF_SYSCALL_WRITE 160 | #undef _N_WRITTEN 161 | #undef _FD 162 | #undef _BUF 163 | #undef _COUNT 164 | #ifndef NO_PINTOOL_LOG 165 | #undef _CALL_LOG_STR 166 | #endif 167 | #undef UNDEF_SYSCALL_WRITE 168 | #endif 169 | 170 | /* vim: set noet ts=4 sts=4 sw=4 ai : */ 171 | -------------------------------------------------------------------------------- /osutils.H: -------------------------------------------------------------------------------- 1 | /// 2 | /// @file 3 | /// \brief Utility functions with OS-specific implementations. 4 | /// 5 | #ifndef DTRACKER_OSUTILS_H 6 | #define DTRACKER_OSUTILS_H 7 | 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | 21 | #include "pin.H" 22 | #include "libdft_api.h" 23 | 24 | /* Platform specific white-lists. 25 | * Data coming from files matching these patterns are not tainted. 26 | */ 27 | #if defined(TARGET_LINUX) 28 | #define DTRACKER_FILE_WHITELIST_RE "\\.so$|\\.so\\." 29 | #define DTRACKER_PATH_WHITELIST_RE "^/proc/|^/lib/|^/usr/lib/|^/etc/|^/usr/share/" 30 | #elif defined(TARGET_MAC) 31 | #define DTRACKER_FILE_WHITELIST_RE "\\.dylib$" 32 | #define DTRACKER_PATH_WHITELIST_RE NULL 33 | #elif defined(TARGET_WINDOWS) 34 | #define DTRACKER_FILE_WHITELIST_RE "\\.dll$" 35 | #define DTRACKER_PATH_WHITELIST_RE NULL 36 | #endif 37 | 38 | /// 39 | /// @brief Determines if a filename is whitelisted. 40 | /// 41 | /// Whitelisted files are not tainted by dtracker. 
42 | /// Without whitelisting, the slowdown factor because of taint 43 | /// tracking is HUGE. 44 | /// 45 | /// @param fname -- the filename to be checked. 46 | /// @return 1 if the filename is whitelisted. 0 otherwise. 47 | /// 48 | inline int in_dtracker_whitelist(const std::string & fname) { 49 | // Note: basename() and dirname() may modify their arguments. 50 | // For this, we create a duplicate of fname to give them. 51 | // Also their return value should not be freed because it 52 | // is either a pointer into fname or statically allocated. 53 | char *fdup; 54 | 55 | // Check file patterns. 56 | if (DTRACKER_FILE_WHITELIST_RE != NULL && (fdup = strdup(fname.c_str()))) { 57 | int status = -1; 58 | regex_t re; 59 | char *bname = basename(fdup); 60 | 61 | if (regcomp(&re, DTRACKER_FILE_WHITELIST_RE, REG_EXTENDED|REG_NOSUB) == 0) { 62 | status = regexec(&re, bname, (size_t) 0, NULL, 0); 63 | regfree(&re); 64 | } 65 | free(fdup); 66 | if (status == 0) return 1; 67 | } 68 | 69 | // Check dir patterns. 70 | if (DTRACKER_PATH_WHITELIST_RE != NULL && (fdup = strdup(fname.c_str()))) { 71 | int status = -1; 72 | regex_t re; 73 | 74 | // We have to do this crap because dirname() does not append a /. 75 | char *dname_noslash = dirname(fdup); 76 | size_t dname_sz = (strlen(dname_noslash)+2)*sizeof(char); 77 | char *dname = (char *)malloc(dname_sz); 78 | 79 | if (dname != NULL && regcomp(&re, DTRACKER_PATH_WHITELIST_RE, REG_EXTENDED|REG_NOSUB) == 0) { 80 | snprintf(dname, dname_sz, "%s/", dname_noslash); 81 | status = regexec(&re, dname, (size_t) 0, NULL, 0); 82 | regfree(&re); 83 | free(dname); 84 | } 85 | free(fdup); 86 | if (status == 0) return 1; 87 | } 88 | 89 | return 0; 90 | } 91 | 92 | 93 | /// 94 | /// @brief Retrieves the absolute path to a file, resolving any symlinks. 95 | /// 96 | /// Currently only implemented for Linux/MacOS, for which the finction is 97 | /// a simple wrapper over realpath(3). 98 | /// 99 | /// @param path -- a file path to be resolved. 
/// @return A string representing the absolute path to the file, or an
///         empty string if the path could not be resolved.
inline std::string path_resolve(const std::string & path) {
#if defined(TARGET_LINUX) || defined(TARGET_MAC)
	// With a NULL second argument, realpath() allocates the result
	// buffer itself; we own it and must free() it.
	char *crval = realpath(path.c_str(), NULL);
	if (crval == NULL) {
		// Returning NULL here would construct a std::string from a null
		// pointer, which is undefined behaviour. Signal the failure with
		// an empty string instead.
		return std::string();
	}

	std::string rval(crval);
	free(crval);
	return rval;
#elif defined(TARGET_WINDOWS)
	// Not implemented on Windows.
	assert(0);
	return std::string();
#endif
}

///
/// @brief Checks whether a path refers to a directory.
///
/// @param path -- the path to be checked.
/// @return Non-zero if stat(2) succeeds on the path and reports a
///         directory. 0 otherwise (including when stat(2) fails).
///         Not implemented on Windows.
inline int path_isdir(const std::string & path) {
#if defined(TARGET_LINUX) || defined(TARGET_MAC)
	struct stat stats;
	return (stat(path.c_str(), &stats) == 0 && S_ISDIR(stats.st_mode));
#elif defined(TARGET_WINDOWS)
	assert(0);
	return -1;
#endif
}

///
/// @brief Checks whether a path exists.
///
/// @param path -- the path to be checked.
/// @return 1 if access(2) with F_OK succeeds on the path. 0 otherwise.
///         Not implemented on Windows.
inline int path_exists(const std::string & path) {
#if defined(TARGET_LINUX) || defined(TARGET_MAC)
	return (access(path.c_str(), F_OK) == 0);
#elif defined(TARGET_WINDOWS)
	assert(0);
	return -1;
#endif
}

///
/// @brief Resolves an open file descriptor to a filename.
///
/// Any symbolic links in the path are resolved. If an error occurs,
/// the respective error message is returned instead of the file path.
/// Because the function uses a static buffer, the file path may be
/// returned truncated ending with "...".
///
/// @param fd -- the file descriptor to be resolved.
/// @return A string representing the full path to the file.
147 | std::string fdname(int fd); 148 | #endif 149 | 150 | /* vim: set noet ts=4 sts=4 sw=4 ai : */ 151 | -------------------------------------------------------------------------------- /osutils.cpp: -------------------------------------------------------------------------------- 1 | #include "osutils.H" 2 | #include 3 | 4 | #if defined(TARGET_LINUX) 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #define __PROC_SELF_FD "/proc/self/fd" 13 | 14 | std::string fdname(int fd) { 15 | char ppath[PATH_MAX]; 16 | char fpath[PATH_MAX]; 17 | int w; 18 | 19 | /* create string for fd link path in /proc */ 20 | w = snprintf(ppath, PATH_MAX*sizeof(char), "%s/%d", __PROC_SELF_FD, fd); 21 | assert(w < (int)(PATH_MAX*sizeof(char))); 22 | 23 | /* read link and return results */ 24 | w = readlink(ppath, fpath, PATH_MAX*sizeof(char)); 25 | if (w < 0) { 26 | return std::string(strerror(errno)); 27 | } 28 | else if (w >= PATH_MAX) { 29 | /* terminate string and return */ 30 | fpath[PATH_MAX-1] = '\0'; 31 | return std::string(fpath)+std::string("..."); 32 | } 33 | else { 34 | /* terminate string */ 35 | fpath[w] = '\0'; 36 | return std::string(fpath); 37 | } 38 | 39 | /* return something to make compiler happy */ 40 | return NULL; 41 | } 42 | 43 | #elif defined(TARGET_MAC) || defined(TARGET_WINDOWS) 44 | 45 | std::string fdname(int fd) { 46 | // Not implemented yet. 47 | // See: http://stackoverflow.com/a/13544447/277172 (Mac) 48 | // http://stackoverflow.com/a/1188803/277172 (Windows) 49 | assert(0); 50 | return std::string("N/A"); 51 | } 52 | 53 | #endif 54 | 55 | /* vim: set noet ts=4 sts=4 sw=4 ai : */ 56 | -------------------------------------------------------------------------------- /provlog.H: -------------------------------------------------------------------------------- 1 | #ifndef DTRACKER_PROVLOG_H 2 | #define DTRACKER_PROVLOG_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include // ? 
10 | 11 | #include 12 | #include 13 | #include 14 | #include "dtracker.H" 15 | 16 | /* Maximum open files per process. */ 17 | #define MAX_OPEN_FILES 1024 18 | 19 | /* macros related to stdin/stdout/stderr */ 20 | #define STDFD_MAX ( MAX( MAX(STDIN_FILENO, STDOUT_FILENO), STDERR_FILENO ) + 1 ) 21 | #define IS_STDFD(fd) ( (fd == STDOUT_FILENO) || (fd == STDIN_FILENO) || (fd == STDERR_FILENO) ) 22 | 23 | 24 | /**** data types and externals ************************************/ 25 | namespace PROVLOG { 26 | 27 | typedef UINT32 ufd_t; 28 | 29 | /* 30 | * UFDMap maps program fds (as used by the program) to ufds. 31 | * Normal fds are not suitable for use as taint marks in taint flow 32 | * analysis becaƒuse are "recycled" by the os. This will eventually 33 | * lead to misattribution of some data. 34 | * 35 | * Unlike fds which are recycled by the OS, ufds increase monotonically. 36 | * This makes ufds suitable for use in taint flow analysis. 37 | */ 38 | class UFDMap { 39 | public: 40 | ufd_t operator[](int fd) { 41 | if (this->map[fd] == 0) 42 | this->map[fd] = this->next++; 43 | return this->map[fd]; 44 | } 45 | ufd_t del(int fd) { 46 | ufd_t ufd = map[fd]; 47 | map[fd] = 0; 48 | return ufd; 49 | } 50 | private: 51 | ufd_t next = 1; 52 | std::array map; 53 | }; 54 | 55 | extern UFDMap ufdmap; 56 | } // namespace PROVLOG 57 | 58 | /* Set of watched fds. */ 59 | extern std::set fdset; 60 | 61 | /* Counters for stdin/stdout/stderr. */ 62 | extern off_t stdcount[STDFD_MAX]; 63 | 64 | /* Current executable name and pid. 
*/ 65 | extern std::string exename; 66 | extern pid_t pid; 67 | 68 | /**** output macros and inlines ***********************************/ 69 | typedef struct { 70 | enum {NONE, SEQ, REP} type; 71 | ssize_t start; 72 | ssize_t length; 73 | } range_info_t; 74 | static const char * range_type_strings[] __attribute__((unused)) = { "NONE", "SEQ", "REP" }; 75 | #ifdef USE_LIBDFT_TAG_SET_FDOFF 76 | typedef std::map::inner_type, range_info_t> range_map_t; 77 | #endif 78 | 79 | namespace PROVLOG { 80 | 81 | /* Raw provenance output stream. */ 82 | extern std::ofstream rawProvStream; 83 | 84 | /* inline functions for raw provenance logging */ 85 | static inline void open(const ufd_t ufd, const std::string & fdname, const int flags, const int created) { 86 | rawProvStream << "o:ufd" << ufd << ":" << fdname << std::endl; 87 | 88 | // Unless the the O_WRONLY flag is on, the file descriptor can be read. 89 | if (! (flags&O_WRONLY) ) 90 | rawProvStream << "u:" << exename << ":" << fdname << std::endl; 91 | 92 | // Emit a generated line if needed. 93 | if (flags & (O_WRONLY|O_RDWR)) { 94 | if (created) { 95 | rawProvStream << "#g:created" << std::endl; 96 | rawProvStream << "g:c:" << exename << ":" << fdname << std::endl; 97 | } 98 | else if (flags & O_TRUNC) { 99 | rawProvStream << "#g:truncated" << std::endl; 100 | rawProvStream << "g:t:" << exename << ":" << fdname << std::endl; 101 | } 102 | else { 103 | // Updated means that it is opened for writing. 104 | // TODO: Currently this is translated to a wasGeneratedBy edge only 105 | // if some tainted bytes are written. 
106 | rawProvStream << "#g:updated" << std::endl; 107 | rawProvStream << "g:u:" << exename << ":" << fdname << std::endl; 108 | } 109 | } 110 | 111 | // TODO: (low urgency) emit a truncation line if O_TRUNC is included in the flags 112 | } 113 | static inline void close(const ufd_t ufd) { 114 | rawProvStream << "c:ufd" << ufd << std::endl; 115 | } 116 | static inline void exec(const std::string & exename, pid_t pid) { 117 | rawProvStream << "x:" << pid << ":" << exename << std::endl; 118 | } 119 | 120 | #ifdef USE_LIBDFT_TAG_BITSET 121 | // used for DLIBDFT_TAG_TYPE=libdft_tag_bitset 122 | // cpp doesn't support string comparison in conditionals 123 | // define this guard macro at the location of use 124 | static inline void write(const ufd_t ufd_origin, const ufd_t ufd_dest, const off_t write_begin, const off_t length) { 125 | const char *range_type = length > 1 ? range_type_strings[range_info_t::REP] : range_type_strings[range_info_t::NONE]; 126 | rawProvStream << "w:" << range_type << 127 | ":ufd" << ufd_dest << ":" << write_begin << 128 | ":ufd" << ufd_origin << ":" << 0 << 129 | ":" << length << std::endl; 130 | } 131 | #endif 132 | 133 | #ifdef USE_LIBDFT_TAG_SET_FDOFF 134 | // used for DLIBDFT_TAG_TYPE=libdft_tag_set_fdoff 135 | // cpp doesn't support string comparison in conditionals 136 | // define this guard macro at the location of use 137 | static inline void write_range(const ufd_t ofd, const off_t write_begin, const range_map_t::key_type last, const range_map_t::mapped_type & info) { 138 | switch(info.type) { 139 | case range_info_t::SEQ: 140 | rawProvStream << "w:" << range_type_strings[info.type] << 141 | ":ufd" << ofd << ":" << (write_begin+info.start) << 142 | ":ufd" << last.first << ":" << (last.second-(info.length-1)) << 143 | ":" << info.length << std::endl; 144 | break; 145 | case range_info_t::NONE: 146 | case range_info_t::REP: 147 | rawProvStream << "w:" << range_type_strings[info.type] << 148 | ":ufd" << ofd << ":" << 
(write_begin+info.start) << 149 | ":ufd" << last.first << ":" << last.second << 150 | ":" << info.length << std::endl; 151 | break; 152 | } 153 | } 154 | #endif 155 | 156 | } // namespace PROVLOG 157 | 158 | #endif 159 | 160 | /* vim: set noet ts=4 sts=4 sw=4 ai : */ 161 | -------------------------------------------------------------------------------- /provlog.cpp: -------------------------------------------------------------------------------- 1 | #include "provlog.H" 2 | 3 | /* Array that maps fds to ufds. Unlike fds which are recycled, ufds 4 | * increase monotonically. This makes them suitable for use as taint 5 | * marks. 6 | */ 7 | PROVLOG::UFDMap PROVLOG::ufdmap; 8 | 9 | /* Set of watched fds - maybe change this to bitset? */ 10 | std::set fdset; 11 | 12 | /* Counters for stdin/stdout/stderr. 13 | * TODO: Maybe this should be generalized. I.e. maintain counters for 14 | * all fds where isatty(fd) returns true. 15 | */ 16 | off_t stdcount[STDFD_MAX]; 17 | 18 | /* Raw provenance output stream. */ 19 | std::ofstream PROVLOG::rawProvStream; 20 | 21 | /* Current executable name and pid. 22 | * XXX: Check if this works correctly while following execv(). 23 | */ 24 | std::string exename("N/A"); 25 | pid_t pid; 26 | 27 | /* vim: set noet ts=4 sts=4 sw=4 ai : */ 28 | -------------------------------------------------------------------------------- /raw2dsl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Converter from our custom raw provenance output to SPADE DSL format. 4 | # See: https://code.google.com/p/data-provenance/wiki/Pipe 5 | 6 | from raw2ttl import RawConverter 7 | from raw2ttl import Error, UnknownUFDError 8 | import argparse 9 | import fileinput 10 | from operator import itemgetter 11 | from textwrap import dedent 12 | from time import gmtime, strftime 13 | 14 | # from pprint import pprint 15 | 16 | # type: id: : ... : 17 | # type: from: to: : ... 
: 18 | 19 | ########################################################################################## 20 | # XXX: after a fork() provenance of two processes may be interleaved in the raw output. 21 | # for this, pid probably has to be printed with every line in the raw output. 22 | # also, locking may be required when dumping to the raw output. 23 | ########################################################################################## 24 | 25 | #### Exceptions ##################################################### 26 | class NoVertexIDError(Error): 27 | """Raised when there's no vid for an artifact.""" 28 | def __init__(self, artifact): 29 | self.artifact = artifact 30 | def __str__(self): 31 | return "No vertex id found for artifact %s." % (self.artifact) 32 | 33 | 34 | 35 | #### SPADE DSL converter class ################################### 36 | class RawDSLConverter(RawConverter): 37 | formats = { 38 | 'program_vertex': dedent(''' 39 | type:Process id:{proc_vid} program:{program} pid:{pid} 40 | ''').strip(), 41 | 'file_vertex': dedent(''' 42 | type:Artifact id:{file_vid} filename:{filename} label:"{label}" 43 | ''').strip(), 44 | 'range_vertex': dedent(''' 45 | type:Artifact id:{range_vid} file:{rangename} memberof:{file_vid} 46 | ''').strip(), 47 | 48 | 'used': dedent(''' 49 | type:Used from:{proc_vid} to:{file_vid} 50 | ''').strip(), 51 | 'derived': dedent(''' 52 | type:WasDerivedFrom from:{file_vid1} to:{file_vid2} 53 | ''').strip(), 54 | 'generated': dedent(''' 55 | type:WasGeneratedBy from:{file_vid} to:{proc_vid} 56 | ''').strip(), 57 | 58 | 'range': '{filename}[{start},{end}]', 59 | 60 | # not used for now 61 | 'derived_range': dedent(''' 62 | <{filename1}{range1}> prov:wasDerivedFrom <{filename2}{range2}> . 
63 | ''').strip(), 64 | 65 | } 66 | 67 | def __init__(self, keepcomments=True, keepbad=False, minrange=0): 68 | super(RawDSLConverter, self).__init__(keepcomments, keepbad, minrange) 69 | 70 | # Compute the base for the unique vertex ids created by this session. 71 | # This will produce something like this: 201504141812190000. 72 | # The base can then be incremented to get (hopefully) unique ids. 73 | self.vid_base = int(strftime("%Y%m%d%H%M%S", gmtime())) * (10**4) 74 | self.vid_next = self.vid_base 75 | self.vid_files = {} 76 | self.vid_procs = {} 77 | 78 | def get_file_vid(self, filename, makenew=True): 79 | """ Returns the vertex id for the specified file. 80 | """ 81 | if filename not in self.vid_files: 82 | if not makenew: 83 | raise NoVertexIDError(filename) 84 | else: 85 | self.vid_files[filename] = self.vid_next 86 | self.vid_next+=1 87 | return self.vid_files[filename] 88 | 89 | def get_proc_vid(self, program=None, pid=None, makenew=True): 90 | """ Returns the vertex id for the specified process. 
91 | """ 92 | makenew = False if (program is None or pid is None) else makenew 93 | program = self.exe if (program is None) else program 94 | pid = self.pid if (pid is None) else pid 95 | 96 | k = '%s[%s]' % (program, pid) 97 | if k not in self.vid_procs: 98 | if not makenew: 99 | raise NoVertexIDError(k) 100 | else: 101 | self.vid_procs[k] = self.vid_next 102 | self.vid_next+=1 103 | return self.vid_procs[k] 104 | 105 | def handle_c(self, data): 106 | ufd = itemgetter('ufd')(data) 107 | filename1 = self.ufdmap[ufd] 108 | 109 | # print triples 110 | if ufd in self.derived: 111 | for filename2 in self.derived[ufd]: 112 | print self.format('derived', 113 | file_vid1 = self.get_file_vid(filename1, False), 114 | file_vid2 = self.get_file_vid(filename2, False), 115 | ) 116 | del self.derived[ufd] 117 | 118 | # cleanup generated 119 | if filename1 in self.generated: self.generated.remove(filename1) 120 | 121 | def handle_g(self, data): 122 | mode, exe, filename = itemgetter('mode', 'program', 'file')(data) 123 | assert self.exe == exe, "Unexpected change to executable name. Expected %s. Got %s." % (self.exe, exe) 124 | 125 | if mode == 't' or mode == 'g': 126 | print self.format('generated', 127 | proc_vid = self.get_proc_vid(), 128 | file_vid = self.get_file_vid(filename, False), 129 | ) 130 | else: 131 | #do not generate triple yet - it will be generated on first write 132 | self.generated.add(filename); 133 | 134 | def handle_o(self, data): 135 | ufd, filename = itemgetter('ufd', 'file')(data) 136 | self.ufdmap[ufd] = filename 137 | 138 | print self.format('file_vertex', 139 | file_vid = self.get_file_vid(filename), 140 | filename = self.__class__.quote_file(filename), 141 | label = filename 142 | ) 143 | 144 | def handle_u(self, data): 145 | exe, filename = itemgetter('program', 'file')(data) 146 | assert self.exe == exe, "Unexpected change to executable name. Expected %s. Got %s." 
% (self.exe, exe) 147 | print self.format('used', 148 | proc_vid = self.get_proc_vid(), 149 | file_vid = self.get_file_vid(filename, False), 150 | ) 151 | 152 | def handle_w(self, data): 153 | rtype, ufd, offset, origin_ufd, origin_offset, length = itemgetter( 154 | 'range_type', 'out_ufd', 'out_offset', 'origin_ufd', 'origin_offset', 'length' 155 | )(data) 156 | 157 | if ufd not in self.ufdmap: 158 | raise UnknownUFDError(ufd) 159 | if origin_ufd not in self.ufdmap: 160 | raise UnknownUFDError(origin_ufd) 161 | 162 | filename = self.ufdmap[ufd] 163 | filename_origin = self.ufdmap[origin_ufd] 164 | offset = int(offset) 165 | origin_offset = int(origin_offset) 166 | length = int(length) 167 | 168 | # emit generated triple if needed 169 | if filename in self.generated: 170 | print self.format('generated', 171 | proc_vid = self.get_proc_vid(), 172 | file_vid = self.get_file_vid(filename, False), 173 | ) 174 | self.generated.remove(filename) 175 | 176 | # simple file provenance 177 | if ufd in self.derived: 178 | self.derived[ufd].add(filename_origin) 179 | else: 180 | self.derived[ufd] = set([filename_origin]) 181 | 182 | # output ranges 183 | if self.minrange > 0 and length >= self.minrange: 184 | if rtype == 'SEQ': 185 | range_origin = self.format('range', 186 | filename = self.__class__.quote_file(filename_origin), 187 | start = origin_offset, 188 | end = origin_offset+length-1 189 | ) 190 | range_dest = self.format('range', 191 | filename = self.__class__.quote_file(filename), 192 | start = offset, 193 | end = offset+length-1 194 | ) 195 | elif rtype == 'REP': 196 | range_origin = self.format('range', 197 | filename = self.__class__.quote_file(filename_origin), 198 | start = origin_offset, 199 | end = origin_offset 200 | ) 201 | range_dest = self.format('range', 202 | filename = self.__class__.quote_file(filename), 203 | start = offset, 204 | end = offset+length-1 205 | ) 206 | else: 207 | # temp fix for NONE ranges 208 | # the script will run, but they will be 
missed 209 | return 210 | 211 | print self.format('range_vertex', 212 | range_vid = self.get_file_vid(range_origin), 213 | rangename = range_origin, 214 | file_vid = self.get_file_vid(filename_origin, False) 215 | ) 216 | print self.format('range_vertex', 217 | range_vid = self.get_file_vid(range_dest), 218 | rangename = range_dest, 219 | file_vid = self.get_file_vid(filename, False) 220 | ) 221 | print self.format('derived', 222 | file_vid1 = self.get_file_vid(range_origin, False), 223 | file_vid2 = self.get_file_vid(range_dest, False), 224 | ) 225 | 226 | 227 | def handle_x(self, data): 228 | self.pid, self.exe = itemgetter('pid', 'program')(data) 229 | self.generated.clear() 230 | 231 | print self.format('program_vertex', 232 | proc_vid = self.get_proc_vid(self.exe, self.pid), 233 | pid = self.pid, 234 | program = self.__class__.quote_file(self.exe), 235 | ) 236 | 237 | 238 | #### main ########################################################### 239 | if __name__ == "__main__": 240 | tag_range = {} 241 | 242 | parser = argparse.ArgumentParser(description='Convert DataTracker raw format to input for the SPADE DSL Reporter.') 243 | parser.add_argument('-minrange', type=int, default=0, help='the minimum range size to be included in the output') 244 | # parser.add_argument('dsl-pipe', metavar='pipe', nargs='*', help='location of the SPADE DSL pipe') 245 | parser.add_argument('files', metavar='file', nargs='*', help='specify input files') 246 | args = parser.parse_args() 247 | 248 | converter = RawDSLConverter(keepcomments=False, keepbad=False, minrange=getattr(args, 'minrange', 0)) 249 | 250 | for line in fileinput.input(args.files): 251 | converter.process_line(line) 252 | -------------------------------------------------------------------------------- /raw2ttl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Converter from our custom raw provenance output to turtle output. 
4 | # See: http://www.w3.org/TeamSubmission/turtle/ 5 | # 6 | # The RawConverter class contained here also serves as a base for 7 | # building converters for other formats. 8 | 9 | from abc import ABCMeta 10 | 11 | import argparse 12 | import sys 13 | import fileinput 14 | import string 15 | import urllib 16 | import inspect 17 | from operator import itemgetter 18 | from textwrap import dedent 19 | from pprint import pprint 20 | 21 | 22 | 23 | #### Exceptions ##################################################### 24 | class Error(Exception): 25 | """Base class for exceptions in this module.""" 26 | pass 27 | 28 | class UnknownUFDError(Error): 29 | """Raised when there's no mapping for an ufd.""" 30 | def __init__(self, ufd): 31 | self.ufd = ufd 32 | def __str__(self): 33 | return "No active mapping for %s." % (self.ufd) 34 | 35 | class TagFormatError(Error): 36 | """Raised when tags cannot be parsed.""" 37 | def __init__(self, tagspec): 38 | self.tagspec = tagspec 39 | def __str__(self): 40 | return "Cannot parse '%s' into tags." 
% (self.tagspec) 41 | 42 | class RangeError(Error): 43 | pass 44 | 45 | 46 | 47 | #### Classes for used data types #################################### 48 | class Range: 49 | start = 0 50 | end = 0 51 | 52 | def __init__(self, start, end=None): 53 | self.start = start 54 | self.end = self.start if end == None else end 55 | if self.end < self.start: 56 | self.start = self.end 57 | self.end = start 58 | 59 | def expand(self, n=1): 60 | self.end += n 61 | 62 | def lexpand(self, n=1): 63 | self.start -= n 64 | 65 | def length(self): 66 | return self.start-self.end 67 | 68 | def join(self, range2): 69 | if not self.is_adjacent(range2): 70 | raise RangeError("Attempting to join not adjacent ranges.") 71 | 72 | def is_adjacent(self, range2): 73 | if isinstance(range2, self.__class__): 74 | if range2.end == self.start-1 or range2.start == self.end+1: 75 | return True 76 | return False 77 | elif isinstance(range2, int): 78 | if range2 == self.start-1 or range2 == self.end+1: 79 | return True 80 | return False 81 | else: 82 | raise RangeError("Unsupported argument type.") 83 | 84 | def is_overlapping(self, range2): 85 | if range2.start <= self.start and range2.end <= self.start: 86 | return False 87 | if range2.start >= self.end and range2.end >= self.end: 88 | return False 89 | return True 90 | 91 | def __str__(self): 92 | return "%d-%d" % (self.start, self.end) 93 | 94 | 95 | 96 | 97 | #### Base converter class ########################################### 98 | class RawConverter: 99 | """ Base class for raw provenance converters. 100 | 101 | The class encapsulates the processing of the input lines. 102 | Lines are parsed into a dict by process_line() and then passed 103 | to the appropriate handler which should be implemented by the 104 | subclass. 105 | The expected format of each line is defined in the input_formats 106 | dict. 
107 | """ 108 | __metaclass__ = ABCMeta 109 | formats = {} 110 | input_formats = { 111 | 'c': (['ufd'], 0), 112 | 'g': (['mode', 'program', 'file'], 2), 113 | 'o': (['ufd', 'file'], 1), 114 | 'u': (['program', 'file'], 1), 115 | 'w': (['range_type', 'out_ufd', 'out_offset', 'origin_ufd', 'origin_offset', 'length'], 5), 116 | 'x': (['pid', 'program'], 1), 117 | } 118 | exe = None 119 | pid = -1 120 | ufdmap = {} 121 | derived = {} 122 | generated = set() 123 | 124 | def __init__(self, keepcomments=True, keepbad=False, minrange=0): 125 | self.keepcomments = keepcomments 126 | self.keepbad = keepbad 127 | self.minrange = minrange 128 | self.output_static('header') 129 | self.handlers = dict(filter( 130 | lambda t: t[0].startswith('handle_'), 131 | inspect.getmembers(self, predicate=inspect.ismethod) 132 | )) 133 | 134 | def format(self, fmt, **kwargs): 135 | return self.formats[fmt].format(**kwargs) 136 | 137 | def output_static(self, what): 138 | if what in self.formats: 139 | print self.formats[what] 140 | 141 | def output_format(self, fmt, **kwargs): 142 | print self.formats(fmt).format(**kwargs) 143 | 144 | def process_line(self, line): 145 | line = line.strip() 146 | 147 | if line.startswith('#'): 148 | if self.keepcomments: 149 | print line 150 | else: 151 | op, data = line.strip().split(':', 1) 152 | 153 | try: 154 | # Combine line format and data into a dict. 155 | keys, nsplits = self.input_formats[op] 156 | data_dict = dict(zip(keys, data.split(':', nsplits))) 157 | 158 | # Call the handler with the data in the dict. 
159 | self.handlers['handle_'+op](data_dict) 160 | except KeyError: 161 | # Keep bad lines as comments 162 | if self.keepbad: 163 | print '#BAD '+line 164 | else: 165 | raise 166 | 167 | @classmethod 168 | def quote_file(cls, filename, asURL=False): 169 | if asURL: 170 | return 'file://'+urllib.pathname2url(filename) 171 | else: 172 | return '"%s"' % (filename) 173 | 174 | 175 | 176 | #### Turtle converter class ###################################### 177 | class RawTTLConverter(RawConverter): 178 | formats = { 179 | 'header': dedent(''' 180 | @prefix prov: . 181 | @prefix rdfs: . 182 | ''').strip(), 183 | 'derived': dedent(''' 184 | <{filename1}> prov:wasDerivedFrom <{filename2}> . 185 | ''').strip(), 186 | 'derived_range': dedent(''' 187 | <{filename1}{range1}> prov:wasDerivedFrom <{filename2}{range2}> . 188 | ''').strip(), 189 | 'exec': dedent(''' 190 | <{url_program}> a prov:Activity . 191 | ''').strip(), 192 | 'generated': dedent(''' 193 | <{filename}> prov:wasGeneratedBy <{url_program}> . 194 | ''').strip(), 195 | 'member': dedent(''' 196 | <{filename}> prov:hadMember <{filename}{range}> . 197 | ''').strip(), 198 | 'open': dedent(''' 199 | <{filename}> a prov:Entity . 200 | <{filename}> rdfs:label "{label}" . 201 | ''').strip(), 202 | 'used': dedent(''' 203 | <{url_program}> prov:used <{filename}> . 
204 | ''').strip(), 205 | 'file_range': '#%d-%d', 206 | } 207 | 208 | @classmethod 209 | def quote_file(cls, filename, asURL=True): 210 | return super(RawTTLConverter, cls).quote_file(filename, asURL) 211 | 212 | def handle_c(self, data): 213 | ufd = itemgetter('ufd')(data) 214 | filename1 = self.ufdmap[ufd] 215 | 216 | if ufd in self.derived: 217 | for filename2 in self.derived[ufd]: 218 | print self.format('derived', 219 | filename1 = self.__class__.quote_file(filename1), 220 | filename2 = self.__class__.quote_file(filename2), 221 | ) 222 | del self.derived[ufd] 223 | 224 | # cleanup generated 225 | if filename1 in self.generated: self.generated.remove(filename1) 226 | 227 | def handle_g(self, data): 228 | mode, exe, filename = itemgetter('mode', 'program', 'file')(data) 229 | assert self.exe == exe, "Unexpected change to executable name. Expected %s. Got %s." % (self.exe, exe) 230 | 231 | if mode == 't' or mode == 'g': 232 | print self.format('generated', 233 | url_program = self.__class__.quote_file(self.exe), 234 | filename = self.__class__.quote_file(filename), 235 | ) 236 | else: 237 | #do not generate triple yet - it will be generated on first write 238 | self.generated.add(filename); 239 | 240 | def handle_o(self, data): 241 | ufd, filename = itemgetter('ufd', 'file')(data) 242 | self.ufdmap[ufd] = filename 243 | 244 | print self.format('open', 245 | filename = self.__class__.quote_file(filename), 246 | label = filename 247 | ) 248 | 249 | def handle_u(self, data): 250 | exe, filename = itemgetter('program', 'file')(data) 251 | assert self.exe == exe, "Unexpected change to executable name. Expected %s. Got %s." 
% (self.exe, exe) 252 | 253 | print self.format('used', 254 | url_program = self.__class__.quote_file(exe), 255 | filename = self.__class__.quote_file(filename), 256 | ) 257 | 258 | def handle_w(self, data): 259 | rtype, ufd, offset, origin_ufd, origin_offset, length = itemgetter( 260 | 'range_type', 'out_ufd', 'out_offset', 'origin_ufd', 'origin_offset', 'length' 261 | )(data) 262 | 263 | if ufd not in self.ufdmap: 264 | raise UnknownUFDError(ufd) 265 | if origin_ufd not in self.ufdmap: 266 | raise UnknownUFDError(origin_ufd) 267 | 268 | filename = self.ufdmap[ufd] 269 | filename_origin = self.ufdmap[origin_ufd] 270 | offset = int(offset) 271 | origin_offset = int(origin_offset) 272 | length = int(length) 273 | 274 | # emit generated triple if needed 275 | if filename in self.generated: 276 | print self.format('generated', 277 | url_program = self.__class__.quote_file(self.exe), 278 | filename = self.__class__.quote_file(filename), 279 | ) 280 | self.generated.remove(filename) 281 | 282 | # simple file provenance 283 | if ufd in self.derived: 284 | self.derived[ufd].add(filename_origin) 285 | else: 286 | self.derived[ufd] = set([filename_origin]) 287 | 288 | # output ranges 289 | if self.minrange > 0 and length >= self.minrange: 290 | if rtype == 'SEQ': 291 | print self.format('member', 292 | filename = self.__class__.quote_file(filename), 293 | range = file_range_fmt % (offset, offset+length-1) 294 | ) 295 | print self.format('member', 296 | filename = self.__class__.quote_file(filename_origin), 297 | range = file_range_fmt % (origin_offset, origin_offset+length-1) 298 | ) 299 | print self.format('derived_range', 300 | filename1 = self.__class__.quote_file(filename), 301 | range1 = file_range_fmt % (offset, offset+length-1), 302 | filename2 = self.__class__.quote_file(filename_origin), 303 | range2 = file_range_fmt % (origin_offset, origin_offset+length-1) 304 | ) 305 | elif rtype == 'REP': 306 | print self.format('member', 307 | filename = 
self.__class__.quote_file(filename), 308 | range = file_range_fmt % (offset, offset+length-1) 309 | ) 310 | print self.format('member', 311 | filename = self.__class__.quote_file(filename_origin), 312 | range = file_range_fmt % (origin_offset, origin_offset) 313 | ) 314 | print self.format('derived_range', 315 | filename1 = self.__class__.quote_file(filename), 316 | range1 = file_range_fmt % (offset, offset+length-1), 317 | filename2 = self.__class__.quote_file(filename_origin), 318 | range2 = file_range_fmt % (origin_offset, origin_offset) 319 | ) 320 | 321 | # TODO: Aggregation per written buffer is done inside dtracker. 322 | # Additional aggregation may be done here. 323 | 324 | def handle_x(self, data): 325 | pid, self.exe = itemgetter('pid', 'program')(data) 326 | self.generated.clear() 327 | 328 | print self.format('exec', 329 | url_program = self.__class__.quote_file(self.exe), 330 | ) 331 | 332 | 333 | #### main ########################################################### 334 | if __name__ == "__main__": 335 | tag_range = {} 336 | 337 | parser = argparse.ArgumentParser(description='Convert DataTracker raw format to PROV/Turtle format.') 338 | parser.add_argument('-minrange', type=int, default=0, help='the minimum range size to be included in the output') 339 | parser.add_argument('files', metavar='file', nargs='*', help='specify input files') 340 | args = parser.parse_args() 341 | 342 | converter = RawTTLConverter(minrange=args.minrange) 343 | 344 | for line in fileinput.input(args.files): 345 | converter.process_line(line) 346 | -------------------------------------------------------------------------------- /samples/.gitignore: -------------------------------------------------------------------------------- 1 | _* 2 | -------------------------------------------------------------------------------- /samples/Makefile: -------------------------------------------------------------------------------- 1 | CC=gcc 2 | CFLAGS=-O0 -DTARGET_IA32 -DHOST_IA32 
-DTARGET_LINUX -m32 -Wall 3 | 4 | ALL_TESTS=#test_aggr test_atoi test_printf 5 | ALL_TARGETS=$(ALL_TESTS) ccombine sgrep upcase tricky #cplow cprefix pickshort charn tricky sum xor 6 | DEBUG_OUT=*.id* *.nam 7 | 8 | .PHONY: all 9 | all: $(ALL_TARGETS) 10 | 11 | test_atoi: test_atoi.c 12 | $(CC) $(CFLAGS) -DBSD_ATOI $^ -o $@ 13 | 14 | clean: 15 | rm -f $(ALL_TARGETS) $(DEBUG_OUT) 16 | -------------------------------------------------------------------------------- /samples/README.txt: -------------------------------------------------------------------------------- 1 | This dir contains sample programs to be used for debugging/testing our pin tools. 2 | -------------------------------------------------------------------------------- /samples/ccombine.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | /* ccombine: Combines two input files to generate the output. 8 | * The combination process does not have any usability other than demonstrating 9 | * the propagation of taint marks within DataTracker. 10 | */ 11 | 12 | int main(int argc, char** argv) { 13 | int in1, in2, out; 14 | char c1, c2, c; 15 | if (argc != 4) goto err; 16 | 17 | in1 = open(argv[1], O_RDONLY); 18 | in2 = open(argv[2], O_RDONLY); 19 | out = open(argv[3], O_WRONLY|O_CREAT|O_TRUNC|O_SYNC, S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP); 20 | if (! 
(in1 && in2 && out)) goto err; 21 | 22 | while (read(in1, &c1, sizeof(char)) && read(in2, &c2, sizeof(char))) { 23 | if (c1 == '\n' || c2 == '\n') 24 | /* implicit value - preserve newlines from either inputs */ 25 | c = '\n'; 26 | else if (c1 == ' ' || c2 == ' ') 27 | /* implicit value - preserve spaces from either inputs - implicit value */ 28 | c = ' '; 29 | else if (c1<'a' || c1>'z' || c2<'a' || c2>'z') 30 | /* implicit value - use a tilde if either input is not a lowercase letter */ 31 | c = '~'; 32 | else if (c1 > c2) 33 | /* combined value - use values from both inputs to generate output */ 34 | c = 'a' + c1 - c2; 35 | else 36 | /* copied value - use the second input */ 37 | c = c2; 38 | write(out, &c, sizeof(char)); 39 | } 40 | 41 | close(in1); 42 | close(in2); 43 | close(out); 44 | return 0; 45 | 46 | err: 47 | return 1; 48 | } 49 | 50 | /* 51 | vim: ai:ts=4:sw=4:noet:ft=c 52 | */ 53 | -------------------------------------------------------------------------------- /samples/sgrep.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | /* A simple grep-like utility. 8 | * Searches for the specified word in the input files and prints the 9 | * lines that match. The search is case-insensitive. 10 | */ 11 | 12 | int main(int argc, char **argv) { 13 | int i; 14 | FILE *f; 15 | char *word; 16 | char line[2048]; 17 | char *lline; 18 | char *c; 19 | 20 | if (argc<2) { 21 | fprintf(stderr, "Usage: %s ...\n", argv[0]); 22 | return -1; 23 | } 24 | 25 | /* convert word to lowercase */ 26 | for (word=argv[1]; *word != '\0'; word++) *word = tolower(*word); 27 | word = argv[1]; 28 | 29 | /* loop through input files */ 30 | for (i=2; i 2 | 3 | #define BUFLEN 128 4 | 5 | /* Dummy program. 6 | * Reads data from input file but does not use them to produce 7 | * the output. 
8 | */ 9 | 10 | int main(int argc, char** argv) { 11 | FILE *f1, *f2; 12 | char b[BUFLEN]; 13 | if (argc < 3) return 1; 14 | 15 | f1 = fopen(argv[1], "r"); 16 | f2 = fopen(argv[2], "w"); 17 | if (f1 == NULL || f2 == NULL) return 1; 18 | 19 | fgets(b, BUFLEN, f1); 20 | fprintf(f2, "http://bit.ly/ipaw2014\n"); 21 | fclose(f1); 22 | fclose(f2); 23 | 24 | return 0; 25 | } 26 | -------------------------------------------------------------------------------- /samples/upcase.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | /* upcase: Converts input files to upper case. 6 | * One output file is generated per input file. 7 | */ 8 | 9 | int main(int argc, char** argv) { 10 | FILE *f1, *f2; 11 | char *f2n; 12 | int n, c, offset; 13 | 14 | if (argc<2) return 1; 15 | for (n=1; n='a' && c<='z') ? 'Z'-'z' : 0; 30 | fputc(c+offset, f2); 31 | } 32 | fclose(f1); 33 | fclose(f2); 34 | } 35 | return 0; 36 | } 37 | -------------------------------------------------------------------------------- /support/makefile.libdft: -------------------------------------------------------------------------------- 1 | include makefile.vars 2 | 3 | LIBDFT_SRC = libdft/src 4 | LIBDFT_TAG_FLAGS ?= -DLIBDFT_TAG_TYPE=libdft_tag_set_fdoff # fdset offset tags 5 | #LIBDFT_TAG_FLAGS ?= -DLIBDFT_TAG_TYPE=libdft_tag_bitset -DTAG_BITSET_SIZE=32 # bitset tags 6 | CPPFLAGS += $(LIBDFT_TAG_FLAGS) 7 | 8 | .PHONY: all 9 | all: libdft 10 | 11 | .PHONY: libdft 12 | libdft: $(LIBDFT_SRC) 13 | cd $< && CPPFLAGS="$(CPPFLAGS)" make 14 | 15 | .PHONY: clean 16 | clean: $(LIBDFT_SRC) 17 | cd $< && make clean 18 | 19 | # vim: ai:ts=4:sw=4:et!:ft=make 20 | -------------------------------------------------------------------------------- /support/makefile.pin: -------------------------------------------------------------------------------- 1 | include makefile.vars 2 | 3 | PIN_TARBALL=$(notdir $(PIN_TARBALL_URL)) 4 | PIN_BASEDIR=$(basename 
$(basename $(PIN_TARBALL))) 5 | 6 | ifdef PIN_ROOT 7 | $(info ## $$PIN_ROOT is set to $(PIN_ROOT). Skipping Pin installation.) 8 | ALL_TARGETS= 9 | else 10 | ALL_TARGETS=../pin 11 | endif 12 | 13 | .PHONY: all 14 | 15 | all: $(ALL_TARGETS) 16 | # -n replaces an existing ../pin symlink instead of creating a link inside the directory it points to. 17 | ../pin: $(PIN_BASEDIR) 18 | $(LN) -sfn support/$(PIN_BASEDIR) ../pin 19 | 20 | $(PIN_BASEDIR): $(PIN_TARBALL) 21 | @[ -d $@ ] || $(TAR) -zxvf $(PIN_TARBALL) 22 | 23 | $(PIN_TARBALL): 24 | $(WGET) $(PIN_TARBALL_URL) -O $@ 25 | # Inverted test: clean exits 0 when ../pin is absent or is not a symlink. 26 | .PHONY: clean 27 | clean: 28 | $(RM) -rf $(PIN_TARBALL) $(PIN_BASEDIR) 29 | [ ! -L ../pin ] || $(RM) -f ../pin 30 | 31 | # vim: ai:ts=4:sw=4:et!:ft=make 32 | -------------------------------------------------------------------------------- /support/makefile.provtoolbox: -------------------------------------------------------------------------------- 1 | include makefile.vars 2 | 3 | PROVTOOLBOX_TARBALL=$(notdir $(PROVTOOLBOX_TARBALL_URL)) 4 | PROVTOOLBOX_BASEDIR=provToolbox 5 | 6 | .PHONY: all 7 | all: $(PROVTOOLBOX_BASEDIR) 8 | 9 | $(PROVTOOLBOX_BASEDIR): $(PROVTOOLBOX_TARBALL) 10 | @[ -d $@ ] || $(UNZIP) $(PROVTOOLBOX_TARBALL) && chmod -R 'go-w' $@ 11 | # The URL contains '?', so quote it to keep the shell from treating it as a glob pattern. 12 | $(PROVTOOLBOX_TARBALL): 13 | $(WGET) '$(PROVTOOLBOX_TARBALL_URL)' -O $@ 14 | 15 | .PHONY: clean 16 | clean: 17 | $(RM) -rf $(PROVTOOLBOX_TARBALL) $(PROVTOOLBOX_BASEDIR) 18 | 19 | # vim: ai:ts=4:sw=4:et!:ft=make 20 | -------------------------------------------------------------------------------- /support/makefile.vars: -------------------------------------------------------------------------------- 1 | CC=clang 2 | CXX=clang++ 3 | GIT=git 4 | INSTALL=install 5 | LN=ln 6 | PYTHON=python 7 | RM=rm 8 | SED=sed 9 | SVN=svn 10 | TAR=tar 11 | UNZIP=unzip 12 | WGET=wget 13 | 14 | BUILD_DIR=build 15 | 16 | CONFIG_FLAGS_COMMON = --prefix=$(realpath $(BUILD_DIR)/..)
CPPFLAGS=-I$(realpath $(BUILD_DIR)/../include) LDFLAGS=-L$(realpath $(BUILD_DIR)/../lib) PKG_CONFIG_PATH=$(realpath $(BUILD_DIR)/../lib/pkgconfig) PATH=$(realpath $(BUILD_DIR)/../bin):$(PATH) 17 | 18 | PIN_TARBALL_URL = http://software.intel.com/sites/landingpage/pintool/downloads/pin-2.13-62732-gcc.4.4.7-linux.tar.gz 19 | PROVTOOLBOX_TARBALL_URL = http://search.maven.org/remotecontent?filepath=org/openprovenance/prov/toolbox/0.6.1/toolbox-0.6.1-release.zip 20 | 21 | --------------------------------------------------------------------------------