├── .gitignore ├── Makefile ├── README.md ├── include ├── common.h ├── file.h ├── fileIndexer.h ├── indexSet.h ├── loserTree.hpp ├── nGramBase.h ├── nGramIndex.h ├── nGramSearch.h ├── smFile.h ├── snugglefish.h └── utils.h ├── python ├── README.md ├── pysnugglefish.cpp ├── setup.py └── snuggle.py └── src ├── file.cpp ├── fileIndexer.cpp ├── indexSet.cpp ├── nGramBase.cpp ├── nGramIndex.cpp ├── nGramSearch.cpp ├── smFile.cpp └── snugglefish.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | snugglefish 2 | *.o 3 | *.swp 4 | python/build 5 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | #Approved for Public Release; Distribution Unlimited: 13-1937 2 | 3 | # Copyright (c) 2014 The MITRE Corporation. All rights reserved. 4 | # 5 | # Redistribution and use in source and binary forms, with or without 6 | # modification, are permitted provided that the following conditions 7 | # are met: 8 | # 1. Redistributions of source code must retain the above copyright 9 | # notice, this list of conditions and the following disclaimer. 10 | # 2. Redistributions in binary form must reproduce the above copyright 11 | # notice, this list of conditions and the following disclaimer in the 12 | # documentation and/or other materials provided with the distribution. 13 | # 14 | # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 | # ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 | # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 | # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 | # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 | # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 | # SUCH DAMAGE. 25 | 26 | CC=g++ 27 | 28 | all: snugglefish 29 | # Link step: file.o, indexSet.o and smFile.o are produced by the 'files' rule further down. 30 | snugglefish: snugglefish.o nGramBase.o nGramSearch.o nGramIndex.o fileIndexer.o files 31 | ${CC} -rdynamic -pthread snugglefish.o nGramBase.o nGramSearch.o nGramIndex.o fileIndexer.o file.o indexSet.o smFile.o -o snugglefish 32 | 33 | snugglefish.o: src/snugglefish.cpp 34 | ${CC} -Iinclude -g -c src/snugglefish.cpp 35 | 36 | nGramBase.o: src/nGramBase.cpp 37 | ${CC} -Iinclude -g -c src/nGramBase.cpp 38 | 39 | nGramIndex.o: src/nGramIndex.cpp src/nGramBase.cpp 40 | ${CC} -Iinclude -g -c src/nGramIndex.cpp 41 | 42 | nGramSearch.o: src/nGramSearch.cpp src/nGramBase.cpp 43 | ${CC} -Iinclude -g -c src/nGramSearch.cpp 44 | 45 | fileIndexer.o: src/fileIndexer.cpp 46 | ${CC} -Iinclude -g -c src/fileIndexer.cpp 47 | # Compiles file.o, indexSet.o and smFile.o. NOTE(review): no file named 'files' is ever created, so make re-runs this rule and relinks snugglefish on every invocation; 'all', 'files' and 'clean' are also not declared .PHONY. 48 | files: src/file.cpp src/indexSet.cpp src/smFile.cpp 49 | ${CC} -Iinclude -g -c src/file.cpp 50 | ${CC} -Iinclude -g -c src/indexSet.cpp 51 | ${CC} -Iinclude -g -c src/smFile.cpp 52 | 53 | clean: 54 | rm -rf *.o snugglefish 55 | 56 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | NOTICE: This proof-of-concept is no longer being maintained. See [BigGrep](https://github.com/cmu-sei/BigGrep) and [UrsaDB](https://github.com/CERT-Polska/ursadb) for similar capabilities. 
2 | 3 | Snugglefish 4 | ======== 5 | 6 | Simple N-Gram Fast Indexer & Searcher (SNGFSH) 7 | 8 | Description 9 | =========== 10 | 11 | Got lots of malware and want to be able to quickly limit your search for an 12 | arbitrary binary string to a much smaller quantity of files? Then snugglefish 13 | is for you! 14 | 15 | Check out this post which explains it: 16 | http://www.mitre.org/capabilities/cybersecurity/overview/cybersecurity-blog/snugglefish-provides-quick-pattern-matching 17 | 18 | This work is based upon a paper published by CMU CERT entitled "A Scalable 19 | Search Index for Binary Files" which we highly recommend reading. It contains 20 | some optimizations we have not yet implemented. 21 | -------------------------------------------------------------------------------- /include/common.h: -------------------------------------------------------------------------------- 1 | /* 2 | Approved for Public Release; Distribution Unlimited: 13-1937 3 | 4 | Copyright (c) 2014 The MITRE Corporation. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions 8 | are met: 9 | 1. Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | 2. Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 | SUCH DAMAGE. 26 | 27 | */ 28 | 29 | #ifndef SNGCOMMON_H 30 | #define SNGCOMMON_H 31 | 32 | #include 33 | #include 34 | /* Bundle of a file-list pointer, an opaque ngram index pointer, the ngram size and two pthread mutexes (one named for the file list, one for the index). Presumably the state handed to worker threads -- confirm against the callers. */ 35 | typedef struct _mi_data { 36 | std::vector * fileList; 37 | uint32_t queue; 38 | void* ngramindex; 39 | 40 | uint32_t ngramSize; 41 | 42 | pthread_mutex_t filesMutex; 43 | pthread_mutex_t nGramIndexMutex; 44 | } mi_data; 45 | 46 | #define ngram_t_endian uint32_t 47 | #define ngram_t_version uint8_t 48 | #define ngram_t_size uint8_t 49 | #define ngram_t_fnlength uint16_t 50 | #define ngram_t_indexcount uint32_t 51 | #define ngram_t_fidtype uint32_t 52 | 53 | #define ngram_t_indexfcount uint32_t 54 | #define ngram_t_offset uint64_t 55 | #define ngram_t_numfiles uint32_t 56 | 57 | 58 | #define ENDIAN_CHECK 0x01234567 59 | #define VERSION 0x01 60 | 61 | //File Header Fields in bytes 62 | //Shared 63 | #define ENDIAN_CHECK_FIELD sizeof(ngram_t_endian) 64 | #define VERSION_FIELD sizeof(ngram_t_version) 65 | #define NGRAM_SIZE_FIELD sizeof(ngram_t_size) 66 | //fileid file (FILEID_FILE_EXTENSION) only 67 | #define MAX_FILENAME_LENGTH_FIELD sizeof(ngram_t_fnlength) 68 | //#define MAX_FILES_PER_NGRAM_FIELD 4 //TODO delete 69 | #define NUM_INDEX_FILES_FIELD sizeof(ngram_t_indexcount) 70 | #define NUM_FILES_FIELD sizeof(ngram_t_fidtype) //Number of files in catalog 71 | 72 | #define INDEX_HEADER_NUM_FILES_FIELD sizeof(ngram_t_indexfcount)//number of files in an index 73 | 74 | //index file only 75 | #define OFFSET_FIELD sizeof(ngram_t_offset) //64-bit offset into 
ngram file 76 | #define INDEX_NUM_FILES_FIELD sizeof(ngram_t_numfiles) //how many files in that ngram 77 | //NOTE(review): the composite size/offset macros below expand to bare sums with no enclosing parentheses; parenthesize them at the use site when combining with other operators (e.g. multiplication) 78 | #define FILID_HEADER_SIZE ENDIAN_CHECK_FIELD + \ 79 | VERSION_FIELD+ \ 80 | NGRAM_SIZE_FIELD+ \ 81 | MAX_FILENAME_LENGTH_FIELD+ \ 82 | NUM_INDEX_FILES_FIELD + \ 83 | NUM_FILES_FIELD 84 | 85 | #define FILID_NUM_INDEX_OFFSET ENDIAN_CHECK_FIELD + \ 86 | VERSION_FIELD + \ 87 | NGRAM_SIZE_FIELD + \ 88 | MAX_FILENAME_LENGTH_FIELD 89 | 90 | #define FILID_NUM_FILES_OFFSET ENDIAN_CHECK_FIELD + \ 91 | VERSION_FIELD + \ 92 | NGRAM_SIZE_FIELD + \ 93 | MAX_FILENAME_LENGTH_FIELD + \ 94 | NUM_INDEX_FILES_FIELD 95 | 96 | #define INDEX_HEADER_SIZE ENDIAN_CHECK_FIELD + \ 97 | VERSION_FIELD + \ 98 | NGRAM_SIZE_FIELD + \ 99 | INDEX_HEADER_NUM_FILES_FIELD 100 | 101 | #define INDEX_ENTRY_SIZE OFFSET_FIELD + \ 102 | INDEX_NUM_FILES_FIELD 103 | //Ngram File Constants 104 | #define NGRAM_FILE_EXTENSION ".ngram" 105 | #define INDEX_FILE_EXTENSION ".index" 106 | #define FILEID_FILE_EXTENSION ".sngfs" 107 | 108 | // These defines govern the file number at the end of the file 109 | // The string governs the amount of 0's that are used 110 | // The buffer size, indicates the size of the string buffer 111 | // the string number (08 by default) should be less than 112 | // the buffer size + 1 113 | #define FILE_NUM_SPRINTF_STRING "%08u" 114 | #define FILE_NUM_BUFFER_SIZE 30 115 | #define FILE_MODE (mode_t)0775 116 | 117 | #endif 118 | -------------------------------------------------------------------------------- /include/file.h: -------------------------------------------------------------------------------- 1 | /* 2 | Approved for Public Release; Distribution Unlimited: 13-1937 3 | 4 | Copyright (c) 2014 The MITRE Corporation. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions 8 | are met: 9 | 1. 
Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | 2. Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 | SUCH DAMAGE. 
26 | 27 | */ 28 | 29 | #ifndef SNGFILE_H 30 | #define SNGFILE_H 31 | 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | 38 | namespace snugglefish { 39 | 40 | //Base class that keeps track of files 41 | //Abstraction from pure C method of writing files 42 | //Probably lighter-weight than C++ stream classes 43 | //It's re-inventing the wheel a bit, but I wanted to keep it 44 | // as light weight as possible without using pure C 45 | class file 46 | { 47 | 48 | public: 49 | //Constructor; buffersize defaults to 1024 * 1024 * 8 * 32 (268435456) 50 | file(const char* fileName, size_t buffersize = (1024 * 1024 * 8 * 32)); 51 | 52 | //Destructor 53 | //Calls close, then frees up malloc'd elements 54 | ~file(); 55 | 56 | //Opens a file with flags O_RDWR | O_CREAT 57 | //Allocates a buffer for writing 58 | bool create(mode_t filemode = (mode_t)0755); 59 | 60 | //Opens a file either O_RDONLY or O_RDWR based on readwrite value 61 | //Allocates a buffer if in write mode 62 | bool open(char readwrite); 63 | 64 | 65 | //Mmaps the file read-only 66 | uint8_t* mmap(); 67 | 68 | //Closes a file, flushes buffer first 69 | //frees buffer if allocated in open or create 70 | //Also closes mmap if opened 71 | bool close(); 72 | 73 | 74 | //Read, just front-ends the read syscall 75 | void read(uint8_t* destination, size_t length); 76 | 77 | //Read at specific locations (offset from SEEK_SET); location is an int32_t, so offsets above 2^31-1 cannot be passed 78 | //Using read_at and read together should be done carefully 79 | void read_at(int32_t location, uint8_t* destination, size_t length); 80 | 81 | //Buffered writer 82 | bool write(uint8_t* data, size_t length); 83 | 84 | //Non-Buffered write to specific locations (offset from SEEK_SET); location is an int32_t, so offsets above 2^31-1 cannot be passed 85 | bool write_at(int32_t location, uint8_t* data, size_t length); 86 | 87 | //Flush anything buffered 88 | bool flush(); 89 | 90 | //Returns the size of the file using stat (NOTE(review): the leading const here and on exists() qualifies the by-value return, not the member function) 91 | const size_t get_size(); 92 | 93 | //Does file exist 94 | const bool exists(); 95 | 96 | 97 | protected: 98 | char* filename; 99 | bool readonly; 100 | 101 | private: 102 
| int32_t fd; //File Descriptor 103 | uint8_t* mmapFile; 104 | size_t size; //Size of File -- used for mmap purposes 105 | char* buffer; 106 | size_t buffersize; 107 | size_t bufferused; 108 | size_t bufferparam; 109 | 110 | //Write helper that takes the descriptor explicitly; presumably the raw write behind write/flush -- confirm in file.cpp 111 | bool real_write(int fd, uint8_t* data, size_t length); 112 | 113 | }; 114 | 115 | } 116 | 117 | #endif 118 | -------------------------------------------------------------------------------- /include/fileIndexer.h: -------------------------------------------------------------------------------- 1 | /* 2 | Approved for Public Release; Distribution Unlimited: 13-1937 3 | 4 | Copyright (c) 2014 The MITRE Corporation. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions 8 | are met: 9 | 1. Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | 2. Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 | SUCH DAMAGE. 
26 | 27 | */ 28 | 29 | 30 | #ifndef FILEINDEXER_H 31 | #define FILEINDEXER_H 32 | 33 | #include 34 | #include 35 | 36 | 37 | namespace snugglefish { 38 | //Class that processes a single file into its nGrams (see processFile) 39 | class fileIndexer 40 | { 41 | public: 42 | 43 | //Constructor -- only takes ngramLength 44 | fileIndexer(uint8_t ngramLength); 45 | 46 | //Processes the nGrams from a file -- returns a pointer to an allocated std::vector (not an array of bools; only the private processNgrams overload takes a bool[]) 47 | //Calling function must cleanup 48 | std::vector* processFile(const char* fileName); 49 | 50 | private: 51 | std::vector* processNgrams(unsigned char *buf, uint64_t fileSize); 52 | void processNgrams(unsigned char *buf, uint64_t fileSize, bool ngramList[]); 53 | 54 | uint64_t filesProcessed; 55 | uint32_t ngramLength; 56 | uint64_t maxNgram; 57 | uint32_t pagesize; 58 | 59 | 60 | 61 | }; 62 | } 63 | #endif 64 | -------------------------------------------------------------------------------- /include/indexSet.h: -------------------------------------------------------------------------------- 1 | /* 2 | Approved for Public Release; Distribution Unlimited: 13-1937 3 | 4 | Copyright (c) 2014 The MITRE Corporation. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions 8 | are met: 9 | 1. Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | 2. Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 | SUCH DAMAGE. 26 | 27 | */ 28 | 29 | 30 | #ifndef SNGINDEXSET_H 31 | #define SNGINDEXSET_H 32 | 33 | 34 | #include "file.h" 35 | #include "common.h" 36 | #include 37 | #include 38 | #include 39 | 40 | namespace snugglefish { 41 | 42 | //Storage container for index entries 43 | //Due to padding we should typecast on the fly 44 | //when we want to use this (the fields total 12 bytes -- a uint64_t plus a uint32_t -- but sizeof(index_entry) may be larger because of alignment padding) 45 | struct index_entry{ 46 | uint64_t offset; 47 | uint32_t num_files; 48 | }; 49 | 50 | class indexSet 51 | { 52 | 53 | public: 54 | indexSet(const char* fileBase, uint32_t count, uint8_t nGramSize); 55 | ~indexSet(); 56 | //NOTE(review): addNGrams takes the ngram as uint32_t while getNGramCount/getNGrams take it as uint64_t 57 | void create(ngram_t_numfiles nFiles = 0); 58 | void addNGrams(uint32_t ngram, std::list *files); 59 | void updateNumFiles(ngram_t_numfiles count); 60 | 61 | //Opens and mmaps both the Index and NGram File 62 | void open(); 63 | 64 | void close(); 65 | 66 | //Get number of files with given ngram 67 | size_t getNGramCount(uint64_t ngram); 68 | 69 | //Returns mmap'd location of given ngram; count is an out-parameter (presumably the number of file ids stored there -- confirm in indexSet.cpp) 70 | ngram_t_fidtype* getNGrams(uint64_t ngram, size_t* count); 71 | 72 | private: 73 | void addIndexData(uint64_t offset, uint32_t nFiles); 74 | 75 | 76 | file* indexFile; 77 | file* nGramFile; 78 | 79 | uint8_t* indexMap; 80 | uint8_t* nGramMap; 81 | uint8_t* indexEntries; 82 | 83 | std::string fileBase; 84 | 85 | bool writable; 86 | 87 | //Index File header elements 88 | ngram_t_endian endian_check; 89 | ngram_t_version version; 90 | ngram_t_size ngramLength; 91 | ngram_t_numfiles 
numFiles; 92 | 93 | uint32_t count; 94 | uint64_t offset; 95 | 96 | }; 97 | 98 | } 99 | #endif 100 | -------------------------------------------------------------------------------- /include/loserTree.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | Approved for Public Release; Distribution Unlimited: 13-1937 3 | 4 | 5 | Copyright (c) 2014 The MITRE Corporation. All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions 9 | are met: 10 | 1. Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | 2. Redistributions in binary form must reproduce the above copyright 13 | notice, this list of conditions and the following disclaimer in the 14 | documentation and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 17 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 | ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 22 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 24 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 25 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 26 | SUCH DAMAGE. 
27 | 28 | */ 29 | 30 | #ifndef LOSERTREE 31 | #define LOSERTREE 32 | 33 | #include 34 | #include 35 | #include 36 | 37 | namespace snugglefish { 38 | 39 | using namespace std; 40 | 41 | //Class Declarations 42 | //Loser tree over several lists (presumably each sorted ascending -- confirm with callers): every match node stores the loser of its match and the smaller value moves up as the winner 43 | template 44 | class playerNode; 45 | template 46 | class matchNode; 47 | 48 | template 49 | struct treeElement { 50 | comparable value; 51 | playerNode* pNodePtr; //pointer to the player node where this should reside 52 | 53 | treeElement(comparable value, playerNode* pNodePtr) { 54 | this-> value = value; this->pNodePtr = pNodePtr; 55 | } 56 | 57 | treeElement(){;} 58 | }; 59 | 60 | template 61 | class treeNode{ 62 | public: 63 | virtual comparable getValue() = 0; //pure virtual function (NOTE(review): treeNode declares no virtual destructor, yet loserTree::destroyChildren deletes children through treeNode pointers) 64 | virtual treeElement getTreeElement() = 0; 65 | virtual uint8_t isPlayer() = 0; 66 | }; 67 | 68 | template 69 | class playerNode: public treeNode { 70 | public: 71 | playerNode(matchNode* parent, std::vector* playerList, uint32_t playerListSize, uint32_t playerId, comparable sentinel); 72 | comparable getValue(); 73 | 74 | treeElement getTreeElement() {return treeElement(this->getValue(), this);} 75 | void advancePlayer() { index_id++;} 76 | uint8_t isPlayer() { return 1; } 77 | const uint32_t getId() { return this->position; } 78 | 79 | matchNode* parent; 80 | 81 | private: 82 | std::vector* player; //player for this player node 83 | uint32_t position; //where in the playerlist did you get the above node from? 
84 | uint32_t index_id; //where in the list are you 85 | comparable sentinel; 86 | }; 87 | 88 | //Internal node: element holds the loser of the match played between its two children 89 | template 90 | class matchNode: public treeNode { 91 | public: 92 | comparable getValue() {return element.value;} 93 | treeElement getTreeElement() { return this->element; } 94 | uint8_t isPlayer() {return 0;} 95 | 96 | matchNode* parent; 97 | treeNode* left; 98 | treeNode* right; 99 | treeElement element; 100 | uint32_t depth; 101 | }; 102 | 103 | template 104 | class loserTree{ 105 | public: 106 | loserTree(std::vector* playerList, uint32_t listSize, comparable sentinel); 107 | //TODO getWinner renamed not public 108 | comparable getWinnerValue(); 109 | uint32_t getWinnerId(); 110 | void playNextMatch(); 111 | ~loserTree(); 112 | 113 | //NOTE(review): removeAndPlay, buildTree and getMatchWinner are declared below but no definitions for them appear in this header 114 | private: 115 | void removeAndPlay(treeElement* winner); 116 | void buildTree(); 117 | void destroyChildren(matchNode* node); 118 | void playUp(matchNode* node, treeElement winner, matchNode* winnerNode); 119 | treeElement getMatchWinner(treeElement & loser); 120 | treeElement buildChildren(matchNode* node); 121 | treeElement getWinner(); 122 | 123 | std::vector* playerList; //pointer to an array of vectors, one per player (indexed as playerList[playerId]) 124 | uint32_t playerListSize; 125 | uint32_t playerId; //to keep track of which vectors we've assigned 126 | 127 | uint32_t match_depth; 128 | treeElement winner; 129 | matchNode* root; 130 | comparable sentinel; 131 | }; 132 | 133 | //DEFINITIONS 134 | 135 | template 136 | loserTree::loserTree(vector* playerList, uint32_t listSize, comparable sentinel){ 137 | //Match depth, not including players is log base 2 (listSize) 138 | this->sentinel = sentinel; 139 | this->playerList = playerList; 140 | this->playerListSize = listSize; 141 | this->playerId = 0; 142 | this->match_depth = ceil(log10((double) listSize) / log10((double) 2)); 143 | //i.e. match_depth = ceil(log2(listSize)), computed as a ratio of base-10 logarithms 144 | 145 | this->root = new matchNode; 146 | this->root->parent = 0; 147 | this->root->depth = 1; 148 | 149 | this-> winner = this->buildChildren(this->root); 150 | 151 | } 152 | 153 | 154 | 
template 155 | loserTree::~loserTree(){ 156 | //Recursively destroy tree (NOTE(review): destroyChildren deletes a node's children, and nothing in this header deletes this->root itself) 157 | this->destroyChildren(this->root); 158 | } 159 | 160 | 161 | template 162 | treeElement loserTree::getWinner(){ 163 | return this->winner; 164 | } 165 | //NOTE(review): dereferences winner.pNodePtr unchecked, whereas playNextMatch guards against it being 0 166 | template 167 | uint32_t loserTree::getWinnerId(){ 168 | return this->winner.pNodePtr->getId(); 169 | } 170 | 171 | template 172 | comparable loserTree::getWinnerValue(){ 173 | return this->winner.value; 174 | } 175 | //Advances the current winner's list by one entry and replays its matches up to the root 176 | template 177 | void loserTree::playNextMatch(){ 178 | if (this->winner.pNodePtr != 0){ 179 | this->winner.pNodePtr->advancePlayer(); 180 | this->playUp(this->winner.pNodePtr->parent, this->winner.pNodePtr->getTreeElement(), (matchNode*)this->winner.pNodePtr); 181 | } 182 | } 183 | 184 | //Starting at the deepest match node, go upwards; on equal values the entry arriving from the right child is treated as the loser 185 | template 186 | void loserTree::playUp(matchNode* node, treeElement winner, matchNode* winnerNode){ 187 | if(node->depth == 1){ //root node 188 | if(winner.value > node->element.value || 189 | (winner.value == node->element.value && 190 | node->right == winnerNode)){ 191 | this->winner = node->element; 192 | node->element = winner; 193 | }else{ 194 | this->winner = winner; 195 | } 196 | 197 | }else{ 198 | //Compare winner received with stored loser, recurse upwards 199 | if(winner.value > node->element.value || 200 | (winner.value == node->element.value && 201 | node->right == winnerNode)){//new loser 202 | this->playUp(node->parent, node->element, node); 203 | node->element = winner; 204 | }else{ 205 | this->playUp(node->parent, winner, node); 206 | } 207 | } 208 | } 209 | 210 | template 211 | void loserTree::destroyChildren(matchNode* node){ 212 | if (node->depth == this->match_depth){ 213 | //clean up player list? (NOTE(review): buildChildren can also attach player nodes above match_depth; those reach this function cast to matchNode*, so the depth read above happens on a playerNode before isPlayer() is consulted) 
214 | }else{ 215 | if(!node->isPlayer()){ 216 | destroyChildren((matchNode*)node->left); 217 | if (node->right) 218 | destroyChildren((matchNode*)node->right); 219 | } 220 | } 221 | 222 | if(!node->isPlayer()){ 223 | delete node->left; 224 | if (node->right) 225 | delete node->right; 226 | } 227 | } 228 | 229 | template 230 | treeElement loserTree::buildChildren(matchNode* node){ 231 | treeElement right,left; 232 | 233 | if (node->depth == this->match_depth || 234 | this->playerId >= (this->playerListSize - 1)) { //Children are player nodes (either at the deepest match level or when at most one list remains unassigned) 235 | node->left = new playerNode(node, this->playerList, 236 | this->playerListSize, this->playerId++, this->sentinel); 237 | node->right = new playerNode(node, this->playerList, 238 | this->playerListSize, this->playerId++, this->sentinel); 239 | 240 | left.value = node->left->getValue(); 241 | left.pNodePtr = (playerNode*)node->left; 242 | 243 | right.value = node->right->getValue(); 244 | right.pNodePtr = (playerNode*)node->right; 245 | 246 | }else{//Regular match node where children are matches 247 | node->left = (treeNode*) new matchNode; 248 | matchNode* leftMatch = (matchNode*) node->left; 249 | leftMatch->parent = node; 250 | leftMatch->depth = node->depth + 1; 251 | 252 | left = this->buildChildren((matchNode*) node->left); 253 | 254 | if(this->playerId >= this->playerListSize){ 255 | //there are no more lists to sort, seed this instead 256 | node->right = 0; 257 | right = treeElement(this->sentinel, 0); 258 | }else{ 259 | node->right = (treeNode*) new matchNode; 260 | matchNode* rightMatch = (matchNode*) node->right; 261 | rightMatch->parent = node; 262 | rightMatch->depth = node->depth + 1; 263 | right = this->buildChildren((matchNode*) node->right); 264 | } 265 | 266 | } 267 | 268 | if (right.value < left.value) { //right is the winner 269 | node->element = left; //store the loser 270 | return right; //return the winner 271 | }else{//if right >= left it loses 272 | node->element = right; 273 | return left; 274 | } 
275 | } 276 | 277 | 278 | 279 | 280 | template 281 | playerNode::playerNode(matchNode* parent, vector* playerList, uint32_t playerListSize, uint32_t playerId, comparable sentinel){ 282 | this->parent = parent; 283 | this->sentinel = sentinel; 284 | if(playerId >= playerListSize){//uneven: no list for this slot (position is not assigned on this path) 285 | this->player = 0; 286 | }else{ 287 | this->player = & (playerList[playerId]); 288 | this->position = playerId; 289 | } 290 | this->index_id = 0; 291 | } 292 | 293 | //Current entry of this player's list, or the sentinel once the list is exhausted or absent (NOTE(review): the by-value casts of *player to std::vector look like they copy the vector on each call -- confirm) 294 | template 295 | comparable playerNode::getValue(){ 296 | if(player && index_id < ((std::vector)(*player)).size()) 297 | return ((std::vector)(*player))[index_id]; 298 | else 299 | return this->sentinel; 300 | } 301 | 302 | 303 | 304 | 305 | 306 | 307 | } 308 | 309 | 310 | #endif 311 | -------------------------------------------------------------------------------- /include/nGramBase.h: -------------------------------------------------------------------------------- 1 | /* 2 | Approved for Public Release; Distribution Unlimited: 13-1937 3 | 4 | Copyright (c) 2014 The MITRE Corporation. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions 8 | are met: 9 | 1. Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | 2. Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 | SUCH DAMAGE. 26 | 27 | */ 28 | 29 | 30 | #ifndef NGRAMBASE_H 31 | #define NGRAMBASE_H 32 | 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include "common.h" 39 | #include "file.h" 40 | #include "indexSet.h" 41 | #include "smFile.h" 42 | 43 | namespace snugglefish { 44 | 45 | 46 | //Base class for the nGram index classes (nGramIndex derives from it); holds the base file name, ngram length and related limits 47 | class nGramBase 48 | { 49 | public: 50 | //ngramLength is taken as uint32_t here but stored in the uint8_t member below 51 | nGramBase( uint32_t ngramLength, std::string indexFileName); 52 | 53 | protected: 54 | uint16_t maxFileNameLength; 55 | std::string baseFileName; 56 | uint8_t ngramLength; // the ngram size we're using 57 | uint64_t maxNgram; 58 | 59 | 60 | private: 61 | 62 | 63 | }; 64 | 65 | } 66 | #endif 67 | -------------------------------------------------------------------------------- /include/nGramIndex.h: -------------------------------------------------------------------------------- 1 | /* 2 | Approved for Public Release; Distribution Unlimited: 13-1937 3 | 4 | Copyright (c) 2014 The MITRE Corporation. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions 8 | are met: 9 | 1. Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | 2. 
Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 | SUCH DAMAGE. 
26 | 27 | */ 28 | 29 | #ifndef NGRAMINDEX_H 30 | #define NGRAMINDEX_H 31 | 32 | #include "nGramBase.h" 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include "smFile.h" 39 | #include "indexSet.h" 40 | 41 | #define BUFFER_NODE_SIZE sizeof(ngram_t_fidtype) + 24 //24 is upper bound per node 42 | 43 | //Index File Constants 44 | #define DEFAULT_MAX_FILENAME_SIZE 65 //64 characters + null terminator 45 | #define TWO_GB 2147483648 //1024 * 1024 * 1204 * 2 46 | #define FOUR_GB TWO_GB * 2 47 | #define MAX_BUFFER_SIZE FOUR_GB 48 | 49 | //4MB = 4194304 50 | //8MB = 8388608 51 | #define WRITE_BUFFER_SIZE 1024 * 1024 * 8 * 32 //256MB 52 | 53 | namespace snugglefish { 54 | 55 | //Storage container for output buffer 56 | struct buffer_element{ 57 | uint64_t elements_size; //how many elements in list 58 | //stl list size() will iterate everytime, easier to keep 59 | //static counter 60 | std::list* elements; 61 | }; 62 | 63 | 64 | //Class to keep track of nGram Index files 65 | class nGramIndex: public nGramBase 66 | { 67 | public: 68 | 69 | nGramIndex( uint32_t ngramLength, std::string indexFileName); 70 | ~nGramIndex(); 71 | 72 | //Accessors 73 | const uint32_t getmaxFileNameLength(){return this->maxFileNameLength;} 74 | const uint64_t getmaxBufferSize(){return this->bufferMax;} 75 | 76 | //Setters 77 | void setmaxFileNameLength(uint32_t length){ this->maxFileNameLength = length; } 78 | void setmaxBufferSize(uint64_t size){this->bufferMax = size;} 79 | 80 | //Write Mode 81 | void addNGrams(std::vector* nGramList, std::string filename); 82 | //void addNGrams(bool nGramList[], std::string filename, int flag); 83 | 84 | void getStats(uint64_t & totalFiles, uint64_t& sessionFiles, uint64_t& indexFiles, bool& flushing); 85 | 86 | private: 87 | //Write Mode Functions 88 | void flushAll(); 89 | void flushMaster(); 90 | void flushIndex(ngram_t_indexfcount num_files ); 91 | 92 | 93 | //Write Mode Variables 94 | uint64_t bufferMax; 95 | buffer_element* 
output_buffer; 96 | uint64_t buffer_memory_usage; //how many bytes is the buffer storing (only file ids) 97 | std::vector< std::string > fileNameList; 98 | bool flush; 99 | bool flushing; 100 | 101 | 102 | smFile* masterFile; 103 | 104 | uint64_t numFilesProcessed; 105 | uint64_t numSessionFilesProcessed; //How many files have been processed this session 106 | 107 | 108 | }; 109 | 110 | } 111 | #endif 112 | -------------------------------------------------------------------------------- /include/nGramSearch.h: -------------------------------------------------------------------------------- 1 | /* 2 | Approved for Public Release; Distribution Unlimited: 13-1937 3 | 4 | Copyright (c) 2014 The MITRE Corporation. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions 8 | are met: 9 | 1. Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | 2. Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 | SUCH DAMAGE. 26 | 27 | */ 28 | 29 | #ifndef NGRAMSEARCH_H 30 | #define NGRAMSEARCH_H 31 | 32 | #include "nGramBase.h" 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include "smFile.h" 42 | #include "indexSet.h" 43 | 44 | namespace snugglefish { 45 | 46 | typedef struct _thread_data{ 47 | uint32_t queue; 48 | uint32_t maximumIndex; 49 | uint32_t ngramLength; 50 | 51 | smFile* masterFile; 52 | std::vector* matchedFiles; 53 | std::string* baseFileName; 54 | std::vector* nGramQuery; 55 | 56 | pthread_mutex_t queueMutex; 57 | pthread_mutex_t smFileMutex; 58 | pthread_mutex_t mfMutex; 59 | } thread_data; 60 | 61 | 62 | //Class to keep track of nGram Index files 63 | class nGramSearch: public nGramBase 64 | { 65 | public: 66 | 67 | nGramSearch( uint32_t ngramLength, std::string indexFileName); 68 | nGramSearch( uint32_t ngramLength, std::string indexFileName, uint32_t threads); 69 | ~nGramSearch(); 70 | 71 | 72 | //Read Mode 73 | std::vector* searchNGrams(std::vector nGramQuery); 74 | std::vector* stringToNGrams(std::string searchString); 75 | 76 | protected: 77 | 78 | private: 79 | //FUNCTIONS 80 | static std::list< std::pair > orderNGrams(indexSet* index, const std::vector& nGramQuery); 81 | //Alpha is just a placeholder name for this search type 82 | //I envision there will be multiple search types 83 | static std::list searchAlpha(indexSet* index, std::list< std::pair > & 
queryList); 84 | static void* searchNGramThread(void* input); 85 | 86 | 87 | uint32_t numThreads; 88 | 89 | 90 | //Read Mode Variables 91 | smFile* masterFile; 92 | uint32_t numIndexFiles; 93 | uint32_t numFiles; 94 | 95 | 96 | 97 | 98 | }; 99 | 100 | } 101 | #endif 102 | -------------------------------------------------------------------------------- /include/smFile.h: -------------------------------------------------------------------------------- 1 | /* 2 | Approved for Public Release; Distribution Unlimited: 13-1937 3 | 4 | Copyright (c) 2014 The MITRE Corporation. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions 8 | are met: 9 | 1. Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | 2. Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 | SUCH DAMAGE. 
26 | 27 | */ 28 | 29 | #ifndef SNGMASTERFILE_H 30 | #define SNGMASTERFILE_H 31 | 32 | 33 | #include "file.h" 34 | #include "common.h" 35 | #include 36 | 37 | namespace snugglefish { 38 | 39 | class smFile: public file 40 | { 41 | 42 | public: 43 | smFile(std::string fileBase, uint8_t nGramSize); 44 | ~smFile(); 45 | 46 | void create(ngram_t_fnlength maxfnLength); 47 | void open(char readwrite); 48 | 49 | bool flush(); 50 | 51 | void addFileId(const char* fileName); 52 | void updateIndexFileCount(ngram_t_indexcount count); 53 | 54 | const ngram_t_indexcount getNumIndexFiles() { return numIndexFiles; } 55 | const ngram_t_fidtype getNumFiles() { return numFiles; } 56 | 57 | const char* getFilebyId(uint64_t id); 58 | 59 | private: 60 | //Index File header elements 61 | ngram_t_endian endian_check; 62 | ngram_t_version version; 63 | ngram_t_size ngramLength; 64 | ngram_t_fnlength maxFileNameLength; 65 | ngram_t_indexcount numIndexFiles; 66 | ngram_t_fidtype numFiles; 67 | 68 | 69 | char* fileBuffer; 70 | 71 | }; 72 | 73 | } 74 | #endif 75 | -------------------------------------------------------------------------------- /include/snugglefish.h: -------------------------------------------------------------------------------- 1 | /* 2 | Approved for Public Release; Distribution Unlimited: 13-1937 3 | 4 | Copyright (c) 2014 The MITRE Corporation. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions 8 | are met: 9 | 1. Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | 2. Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 
#ifndef SNUGGLEFISH_H
#define SNUGGLEFISH_H

// NOTE(review): the targets of the four system #include lines were lost when
// this file was extracted; these three are what the declarations need.
#include <stdint.h>
#include <string>
#include <vector>

#ifdef __cplusplus
extern "C" {
#endif

// Builds an index named indexFileName from fileNames.
// NOTE(review): the parameters are C++ types, so this header cannot be
// consumed from C despite the extern "C" guard. The linkage is left as it
// was so the existing definition still matches - confirm whether C linkage
// is actually wanted.
// NOTE(review): the element type of fileNames was stripped by the
// extraction; std::string is presumed.
void make_index(std::string indexFileName, std::vector<std::string> fileNames, uint32_t ngramSize, uint32_t max_files, uint64_t max_buffer, uint32_t threads);

#ifdef __cplusplus
}
#endif

// Searches the index named indexFileName for searchString and returns a
// pointer to the result vector.
// NOTE(review): element type presumed std::string (stripped by extraction);
// ownership of the returned vector is not visible from this header.
std::vector<std::string>* search(std::string indexFileName, std::string searchString, uint32_t ngramSize, uint32_t threads);

#endif
Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 | SUCH DAMAGE. 
#ifndef SNUGGLE_UTILS_H
#define SNUGGLE_UTILS_H


// NOTE(review): the original include targets were lost in extraction; this
// list is what the code below uses.
#include <stdint.h>
#include <stddef.h>
#include <list>
#include <vector>
#include <algorithm>    // std::sort

//TODO REMOVE
#include <bitset>
#include <iostream>


namespace snugglefish{

    // Every function here is defined in a header, so each one is `inline`:
    // without it, including this file from two translation units breaks the
    // one-definition rule at link time.

    // Variable-byte encodes one number: 7 payload bits per byte, most
    // significant group first, with the high bit set on the LAST byte only.
    inline std::list<uint8_t> vbencode_number(uint32_t number){
        std::list<uint8_t> bytes;
        while (true){
            bytes.push_front(static_cast<uint8_t>(number % 128));
            if (number < 128){
                break;
            }
            number /= 128;
        }
        bytes.back() += 128; // terminator flag
        return bytes;
    }


    // Variable-byte encodes every number and concatenates the results in
    // input order.
    inline std::list<uint8_t> vbencode_numbers(const std::vector<uint32_t>& numbers){
        std::list<uint8_t> output;
        for (size_t i = 0; i < numbers.size(); i++){
            std::list<uint8_t> encoded = vbencode_number(numbers[i]);
            output.splice(output.end(), encoded);
        }

        return output;
    }

    // Inverse of vbencode_numbers. Bytes after the last terminator byte
    // (an incomplete trailing number) are ignored.
    inline std::vector<uint32_t> vbdecode_numbers(const std::list<uint8_t>& numbers){
        std::vector<uint32_t> output;
        uint32_t n = 0;

        for (std::list<uint8_t>::const_iterator it = numbers.begin(); it != numbers.end(); ++it){
            if (*it < 128){
                n = 128 * n + *it;
            }else{
                n = 128 * n + (*it - 128);
                output.push_back(n);
                n = 0;
            }
        }

        return output;
    }


    // Finds the longest run of consecutive elements of v whose spread
    // (last - first) is below 2^b.
    //   v          must be sorted ascending (pfordelta_bitsize sorts first)
    //   b          bit width; values below 0 are treated as 0, 32 and above
    //              accept any spread
    //   length     out: number of elements in the best run
    //   exceptions out: number of elements outside the best run
    // Returns the index at which the best run starts (0 for empty input).
    inline uint32_t pfor_analyze_bits(const std::vector<uint32_t>& v, int b, uint32_t& length, uint32_t& exceptions){
        // 64-bit so that b == 31 or 32 does not shift past the type width.
        uint64_t range = 1;
        if (b >= 32){
            range = static_cast<uint64_t>(1) << 32;
        }else if (b > 0){
            range = static_cast<uint64_t>(1) << b;
        }

        const uint32_t count = static_cast<uint32_t>(v.size());
        uint32_t best_start = 0;
        uint32_t best_len = 0;
        uint32_t lo = 0;

        for (uint32_t hi = 0; hi < count; hi++){
            if (static_cast<uint64_t>(v[hi] - v[lo]) >= range){
                // v[hi] no longer fits: the run that just ended is [lo, hi).
                if (hi - lo > best_len){
                    best_start = lo;
                    best_len = hi - lo;
                }
                while (static_cast<uint64_t>(v[hi] - v[lo]) >= range) lo++;
            }
        }

        // The run still open when the input ends has to be considered too;
        // otherwise an input that fits entirely is reported as all exceptions.
        if (count - lo > best_len){
            best_start = lo;
            best_len = count - lo;
        }

        exceptions = count - best_len;
        length = best_len;
        return best_start;
    }


    // Splits number into its low `bits` bits (returned) and whatever lies
    // above them (overflow). A width of 32 or more keeps the whole value.
    inline uint32_t bit_encode(uint32_t number, uint32_t bits, uint32_t& overflow){
        if (bits >= 32){ // shifting a 32-bit value by >= 32 is undefined
            overflow = 0;
            return number;
        }
        overflow = number >> bits;
        return number & ((static_cast<uint32_t>(1) << bits) - 1);
    }


    // Picks a bit width in [3, 15] for the given deltas by minimising
    //   width + exceptions * 8 * count
    // where `exceptions` comes from pfor_analyze_bits on the sorted deltas.
    //   exceptions     out: exception count for the chosen width
    //   max_exceptions if the chosen width needs more exceptions than this,
    //                  0 is returned
    // Returns the width, or 0 when no width qualifies or the input is empty.
    inline uint32_t pfordelta_bitsize(const std::vector<uint32_t>& numbers, uint32_t& exceptions, uint32_t max_exceptions = 16){
        exceptions = 0;
        if (numbers.empty()){
            return 0; // nothing to size; also avoids dividing by zero
        }

        //Sort the array
        std::vector<uint32_t> sorted_deltas(numbers);
        std::sort(sorted_deltas.begin(), sorted_deltas.end());

        uint64_t low_cost = ~static_cast<uint64_t>(0);
        uint32_t bits = 0;
        for (uint32_t width = 3; width < 16; width++){//Assume 1 or 2 are too small TODO figure out best lower bound
            uint32_t length = 0;
            uint32_t excep = 0;
            pfor_analyze_bits(sorted_deltas, static_cast<int>(width), length, excep);
            // 64-bit so a large input cannot wrap the estimate.
            const uint64_t cost = width + static_cast<uint64_t>(excep) * 8 * sorted_deltas.size();
            if (cost < low_cost){
                exceptions = excep;
                low_cost = cost;
                bits = width;
            }
        }

        if (exceptions > max_exceptions){
            bits = 0;
        }

        return bits;
    }

    // Packs the low `packsize` bits of every number back to back, most
    // significant bit first, starting at the top bit of byte 0.
    //   packsize  1..32; any other value yields a null pointer
    // Returns a new[]-allocated, zero-padded buffer of
    // ceil(numbers.size() * packsize / 8) bytes; the caller must delete[] it.
    inline uint8_t* packer(const std::vector<uint32_t>& numbers, uint32_t packsize){
        if (packsize == 0 || packsize > 32){
            return 0;
        }

        // Round up so a trailing partial byte is part of the buffer.
        const size_t total_bits = numbers.size() * static_cast<size_t>(packsize);
        const size_t total_size = (total_bits + 7) / 8;
        uint8_t* output = new uint8_t[total_size](); //the () initializes the values to 0

        size_t bit_pos = 0; // index of the next free bit in the output stream
        for (size_t i = 0; i < numbers.size(); i++){
            for (uint32_t bit = packsize; bit-- > 0; ){
                if ((numbers[i] >> bit) & 1u){
                    output[bit_pos >> 3] |= static_cast<uint8_t>(0x80u >> (bit_pos & 7));
                }
                ++bit_pos;
            }
        }

        return output;
    }

    /*
       This code written based on information gleaned from:

       Hao Yan, Shuai Ding, and Torsten Suel. "Inverted index compression and
       query processing with optimized document ordering."
       In Proceedings of the 18th international conference on World wide web (WWW '09).
       ACM, New York, NY, USA, 401-410
     */

    // Prototype encoder: delta-encodes `numbers`, chooses a bit width, packs
    // the low bits and collects the overflowing deltas as exceptions. It
    // returns nothing; the leader byte, packed bytes and exception lists are
    // only printed to std::cout as bit strings.
    // `numbers` is assumed to be sorted ascending - deltas are computed with
    // unsigned subtraction and would wrap otherwise.
    inline void pfordelta_encode(const std::vector<uint32_t>& numbers, uint32_t max_exceptions = 16){

        std::vector<uint32_t> deltas;
        deltas.reserve(numbers.size());

        //Create Delta Array
        for (size_t i = 0; i < numbers.size(); i++){
            deltas.push_back(i == 0 ? numbers[i] : numbers[i] - numbers[i - 1]);
        }

        uint32_t exceptions = 0;
        const uint32_t bits = pfordelta_bitsize(deltas, exceptions, max_exceptions);


        if (!bits){//wasn't able to find a number that matches constraints
            //TODO
            return; // a width of 0 cannot be packed
        }

        //At this point we should have the bitsize and the number of exceptions

        std::vector<uint32_t> compressed;
        std::vector<uint32_t> exception_index;
        std::vector<uint32_t> exception_values;
        for (uint32_t i = 0; i < deltas.size(); i++){
            uint32_t overflow = 0;
            compressed.push_back(bit_encode(deltas[i], bits, overflow));
            if (overflow){
                exception_index.push_back(i);
                exception_values.push_back(overflow);
            }
            std::bitset<6> foo;
            foo = compressed[i];
            std::cout << foo << std::endl;
        }

        std::bitset<8> bit_val;
        //Bitpack
        // NOTE(review): `exceptions` may be as large as max_exceptions
        // (default 16), which does not fit the low nibble - confirm the
        // intended leader layout.
        uint8_t leader = bits << 4 | exceptions;
        uint8_t* packed = packer(compressed, bits);
        std::list<uint8_t> e_index = vbencode_numbers(exception_index);
        std::list<uint8_t> e_vals = vbencode_numbers(exception_values);



        bit_val = leader;
        std::cout << bit_val << std::endl;
        const size_t packed_bytes = (compressed.size() * bits + 7) / 8;
        for (size_t i = 0; packed != 0 && i < packed_bytes; i++){
            bit_val = packed[i];
            std::cout << bit_val << std::endl;
        }
        delete[] packed; // packer hands over ownership

        std::cout<< "Exception Indexes: " << std::endl;

        for (std::list<uint8_t>::iterator iter = e_index.begin(); iter != e_index.end(); iter++){
            bit_val = *iter;
            std::cout << bit_val << std::endl;
        }


        std::cout<< "Exception Values: " << std::endl;
        for (std::list<uint8_t>::iterator iter = e_vals.begin(); iter != e_vals.end(); iter++){
            bit_val = *iter;
            std::cout << bit_val << std::endl;
        }

    }


}

#endif
23 | 24 | ngram_sz = 3 25 | obj = pysnugglefish.init("/path/to/indexfile", ngram_sz) 26 | 27 | Ngram size must be either 3 or 4. 28 | 29 | Indexing with PySnugglefish 30 | =========================== 31 | 32 | To index, simply feed the pysnugglefish instance with configuration options, then run the indexing function. 33 | 34 | obj = pysnugglefish.init("/path/to/indexfile") 35 | obj.file_list = ["/path/to/file1", "/path/to/file2"] 36 | obj.ngram_size = 3 # defaults to 3 37 | obj.max_buffer = 9001 # defaults to no maximum (0) 38 | obj.max_files = 100000 # defaults to no maximum (0) 39 | obj.make_index() # create the index file 40 | 41 | If you are indexing lots of files this will be very memory and CPU intensive, so be patient. 42 | 43 | Searching with PySnugglefish 44 | ============================ 45 | The module facilitates searching a specified index. 46 | Again, provide configuration, then execute your search. 47 | 48 | obj = pysnugglefish.init("/path/to/indexfile") 49 | obj.ngram_size = 3 # better equal the ngram_size used to generate the index! 50 | bitstring = "\x41\x42\x43" 51 | files_found = obj.search(bitstring) 52 | 53 | The search function returns an array containing the filenames of each file in the index which the snugglefish code matched to the input search string. 54 | 55 | Caveats 56 | ======= 57 | Python doesn't interpret tildes in paths automatically. This pysnugglefish module does not do any special processing on the files provided to the index function. So, expect issues with paths such as ~/Documents/file.bin 58 | -------------------------------------------------------------------------------- /python/pysnugglefish.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2014 The MITRE Corporation. All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions 6 | are met: 7 | 1. 
Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright 10 | notice, this list of conditions and the following disclaimer in the 11 | documentation and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16 | ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 17 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 18 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 19 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 20 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23 | SUCH DAMAGE. 24 | */ 25 | 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include "nGramSearch.h" 32 | #include "nGramIndex.h" 33 | #include "fileIndexer.h" 34 | #include "common.h" 35 | 36 | using namespace std; 37 | 38 | /* Module's error object. */ 39 | static PyObject *SnuggleError; 40 | 41 | /* 42 | * Define the pysnugglefish object. 43 | * Represents data used in the snugglefish features. 44 | */ 45 | typedef struct { 46 | PyObject_HEAD 47 | PyObject *index; 48 | PyObject *file_list; 49 | int ngram_size; 50 | int max_buffer; 51 | int max_files; 52 | } pysnugglefish; 53 | 54 | /* Facilitate destruction of pysnugglefish objects. 
*/ 55 | static void pysnugglefish_dealloc(pysnugglefish *self) { 56 | Py_XDECREF(self->index); 57 | Py_XDECREF(self->file_list); 58 | self->ob_type->tp_free((PyObject*)self); 59 | } 60 | 61 | /* Construct a new pysnugglefish object. */ 62 | static PyObject *pysnugglefish_new(PyTypeObject *type, PyObject *args, PyObject *kwds) { 63 | pysnugglefish *self; 64 | self = (pysnugglefish *)type->tp_alloc(type, 0); 65 | if (self != NULL) { // if object created, init fields to defaults 66 | self->index = PyString_FromString(""); 67 | if (self->index == NULL) { 68 | Py_DECREF(self); 69 | return NULL; 70 | } 71 | self->file_list = PyList_New(0); 72 | self->max_buffer = 0; 73 | self->ngram_size = 3; 74 | self->max_files = 0; 75 | } 76 | 77 | return (PyObject *)self; 78 | } 79 | 80 | /* Initialize a pysnugglefish object from command line arguments. */ 81 | static int pysnugglefish_init(pysnugglefish *self, PyObject *args, PyObject *kwds) { 82 | PyObject *index=NULL, *tmp; 83 | int ngrams = 3; 84 | 85 | static char *kwlist[] = {(char *) "index", (char *) "ngram_size", NULL}; 86 | 87 | // ngram size optional 88 | if (!PyArg_ParseTupleAndKeywords(args, kwds, "S|i", kwlist, &index, &ngrams)) { 89 | return -1; 90 | } 91 | 92 | if (ngrams != 3 && ngrams != 4) { 93 | PyErr_SetString(PyExc_TypeError, "N-Gram size must be set to 3 or 4."); 94 | return -1; 95 | } 96 | if (index) { // if index was provided, manage references and set 97 | tmp = self->index; 98 | Py_INCREF(index); 99 | self->index = index; 100 | Py_XDECREF(tmp); 101 | } 102 | self->ngram_size = ngrams; 103 | 104 | return 0; 105 | } 106 | 107 | /* Define members of pysnugglefish object. 
*/ 108 | static PyMemberDef pysnugglefish_members[] = { 109 | {(char *) "index", T_OBJECT_EX, offsetof(pysnugglefish, index), 0, 110 | (char *) "index name"}, 111 | {(char *) "file_list", T_OBJECT_EX, offsetof(pysnugglefish, file_list), 0, 112 | (char *) "list of file names"}, 113 | {(char *) "ngram_size", T_INT, offsetof(pysnugglefish, ngram_size), 0, 114 | (char *) "n-gram size"}, 115 | {(char *) "max_buffer", T_INT, offsetof(pysnugglefish, max_buffer), 0, 116 | (char *) "max buffer size"}, 117 | {(char *) "max_files", T_INT, offsetof(pysnugglefish, max_files), 0, 118 | (char *) "max files to use"}, 119 | { NULL } /* Sentinel */ 120 | }; 121 | 122 | /* 123 | * Search a snugglefish index for a given search input. 124 | * Args: searchString (string) 125 | */ 126 | static PyObject *pysnugglefish_search(pysnugglefish *self, PyObject *args) { 127 | char *searchString; 128 | vector *found; 129 | long procs; 130 | 131 | //This only works on some *nixes 132 | // TODO figure out which systems don't support this call 133 | procs = sysconf(_SC_NPROCESSORS_ONLN); 134 | if (procs < 1) { 135 | procs = 1; 136 | } 137 | 138 | // Threads optional 139 | if (!PyArg_ParseTuple(args, "s|i", &searchString, &procs)) { 140 | return NULL; 141 | } 142 | 143 | if (procs <= 0) { 144 | PyErr_SetString(SnuggleError, (char *) "Invalid threads"); 145 | return NULL; 146 | } 147 | 148 | try { 149 | snugglefish::nGramSearch searcher(self->ngram_size, PyString_AsString(self->index), (uint32_t) procs); 150 | vector *ngrams = searcher.stringToNGrams(searchString); 151 | found = searcher.searchNGrams(*ngrams); 152 | } catch (exception &e) { 153 | PyErr_SetString(SnuggleError, e.what()); 154 | return NULL; 155 | } 156 | 157 | PyObject *ret = PyList_New(found->size()); 158 | for (size_t i = 0; i < found->size(); i++) { 159 | PyList_SetItem(ret, i, Py_BuildValue("s", (*found)[i].c_str())); 160 | } 161 | 162 | delete found; 163 | return ret; 164 | } 165 | 166 | void *indexerThread(void *input) { 167 | 
mi_data *midata = (mi_data *) input; 168 | snugglefish::nGramIndex *ngramindex = (snugglefish::nGramIndex *) midata->ngramindex; 169 | snugglefish::fileIndexer indexer(midata->ngramSize); 170 | 171 | while(1) { 172 | pthread_mutex_lock(&midata->filesMutex); 173 | if (midata->queue >= midata->fileList->size()) { 174 | pthread_mutex_unlock(&midata->filesMutex); 175 | break; 176 | } 177 | 178 | uint32_t i = midata->queue++; 179 | pthread_mutex_unlock(&midata->filesMutex); 180 | 181 | try { 182 | vector *processedFile = indexer.processFile((*(midata->fileList))[i].c_str()); 183 | if (processedFile != 0) { 184 | pthread_mutex_lock(&midata->nGramIndexMutex); 185 | ngramindex->addNGrams(processedFile, (*(midata->fileList))[i]); 186 | pthread_mutex_unlock(&midata->nGramIndexMutex); 187 | } 188 | } catch (exception &e) { 189 | return (void *) e.what(); 190 | } 191 | } 192 | 193 | return 0; 194 | } 195 | 196 | /* 197 | * Index all of the files specified in the file_list member. 198 | * Output the index at the path specified in the pysnugglefish index member. 199 | */ 200 | static PyObject *pysnugglefish_index(pysnugglefish *self, PyObject *args) { 201 | vector files; 202 | long procs; 203 | int i; 204 | pthread_t *indexers; 205 | mi_data *midata; 206 | void *status; 207 | Py_ssize_t ct; 208 | 209 | //This only works on some *nixes 210 | // TODO figure out which systems don't support this call 211 | procs = sysconf(_SC_NPROCESSORS_ONLN); 212 | if (procs < 1) { 213 | procs = 1; 214 | } 215 | 216 | // Threads optional 217 | if (!PyArg_ParseTuple(args, "|i", &procs)) { 218 | return NULL; 219 | } 220 | 221 | if (procs <= 0) { 222 | PyErr_SetString(SnuggleError, (char *) "Invalid threads"); 223 | return NULL; 224 | } 225 | 226 | // No files to index. 
227 | ct = PyList_Size(self->file_list); 228 | if (ct == 0) { 229 | Py_RETURN_NONE; 230 | } 231 | 232 | for (i = 0; i < ct; i++) { 233 | files.push_back(PyString_AsString(PyList_GetItem(self->file_list, i))); 234 | } 235 | 236 | midata = new mi_data; 237 | indexers = (pthread_t *) malloc(procs * sizeof(pthread_t)); 238 | 239 | pthread_attr_t attr; 240 | pthread_attr_init(&attr); 241 | pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); 242 | 243 | pthread_mutex_init(&(midata->filesMutex), NULL); 244 | pthread_mutex_init(&(midata->nGramIndexMutex), NULL); 245 | 246 | midata->fileList = &files; 247 | midata->ngramSize = self->ngram_size; 248 | midata->queue = 0; 249 | 250 | try { 251 | snugglefish::nGramIndex ngramindex(self->ngram_size, PyString_AsString(self->index)); 252 | if (self->max_buffer > 0) { 253 | ngramindex.setmaxBufferSize(self->max_buffer); 254 | } 255 | 256 | midata->ngramindex = &ngramindex; 257 | 258 | for (uint32_t i = 0; i < procs; i++) { 259 | pthread_create(&indexers[i], &attr, indexerThread, (void *) midata); 260 | } 261 | 262 | while (1) { 263 | //Usage of mutex shouldn't matter 264 | if (midata->queue >= files.size()) { 265 | break; 266 | } 267 | sleep(1); 268 | } 269 | 270 | for (uint32_t i = 0; i < procs; i++) { 271 | pthread_join(indexers[i], &status); 272 | if (status) { 273 | PyErr_SetString(SnuggleError, (char *) status); 274 | return NULL; 275 | } 276 | } 277 | 278 | } catch (exception &e) { 279 | PyErr_SetString(SnuggleError, e.what()); 280 | return NULL; 281 | } 282 | 283 | 284 | pthread_mutex_destroy(&(midata->filesMutex)); 285 | pthread_mutex_destroy(&(midata->nGramIndexMutex)); 286 | pthread_attr_destroy(&attr); 287 | 288 | delete midata; 289 | free(indexers); 290 | 291 | Py_RETURN_NONE; 292 | } 293 | 294 | /* Define the set of methods callable from a pysnugglefish object. 
*/ 295 | static PyMethodDef pysnugglefish_methods[] = { 296 | {"search", (PyCFunction) pysnugglefish_search, METH_VARARGS, "Search the current index for an input."}, 297 | {"make_index", (PyCFunction) pysnugglefish_index, METH_VARARGS, "Make an index out of the current files list."}, 298 | { NULL } /* Sentinel */ 299 | }; 300 | 301 | /* Define getter for data attributes of pysnugglefish objects. */ 302 | static PyObject *pysnugglefish_getattr(pysnugglefish *self, char *attrname) { 303 | if (strcmp(attrname, "index") == 0) { 304 | return self->index; 305 | } else if (strcmp(attrname, "file_list") == 0) { 306 | Py_INCREF(self->file_list); 307 | return self->file_list; 308 | } else if (strcmp(attrname, "max_files") == 0) { 309 | return Py_BuildValue("i", self->max_files); 310 | } else if (strcmp(attrname, "max_buffer") == 0) { 311 | return Py_BuildValue("i", self->max_buffer); 312 | } else if (strcmp(attrname, "ngram_size") == 0) { 313 | return Py_BuildValue("i", self->ngram_size); 314 | } else if (strcmp(attrname, "search") == 0) { 315 | return PyObject_GenericGetAttr((PyObject *)self, Py_BuildValue("s", attrname)); 316 | } else if (strcmp(attrname, "make_index") == 0) { 317 | return PyObject_GenericGetAttr((PyObject *)self, Py_BuildValue("s", attrname)); 318 | } else { 319 | PyErr_SetString(PyExc_AttributeError, attrname); 320 | return NULL; 321 | } 322 | } 323 | 324 | /* Define setter for data attributes of pysnugglefish objects. 
*/
/*
 * Returns 0 on success; on failure returns -1 with a Python exception set:
 *   - "index" is read-only                      -> SnuggleError
 *   - unknown name, or attribute deletion       -> AttributeError
 *   - file_list given something that is no list -> TypeError
 *   - value not convertible to a C int          -> raised by PyArg_Parse
 *   - max_files / max_buffer not > 0, or
 *     ngram_size other than 3 or 4              -> ValueError
 */
static int pysnugglefish_setattr(pysnugglefish *self, char *name, PyObject *value) {
    int newval = 0;
    int is_file_list;
    int is_max_files;
    int is_max_buffer;
    int is_ngram_size;

    if (strcmp(name, "index") == 0) {
        PyErr_SetString(SnuggleError, "Index is read-only after init.");
        return -1;
    }

    is_file_list = (strcmp(name, "file_list") == 0);
    is_max_files = (strcmp(name, "max_files") == 0);
    is_max_buffer = (strcmp(name, "max_buffer") == 0);
    is_ngram_size = (strcmp(name, "ngram_size") == 0);

    /*
     * value == NULL means "del obj.name". It must be rejected before any
     * type check touches the pointer (PyList_Check dereferences it).
     */
    if (value == NULL || !(is_file_list || is_max_files || is_max_buffer || is_ngram_size)) {
        PyErr_SetString(PyExc_AttributeError, name);
        return -1;
    }

    if (is_file_list) {
        PyObject *previous = self->file_list;

        if (!PyList_Check(value)) {
            PyErr_SetString(PyExc_TypeError, "file_list must be a list.");
            return -1;
        }
        /* Take the new reference before dropping the old one. */
        Py_INCREF(value);
        self->file_list = value;
        Py_XDECREF(previous);
        return 0;
    }

    /* The remaining attributes are all plain integers. */
    if (!PyArg_Parse(value, "i", &newval)) {
        return -1;
    }

    if (is_ngram_size) {
        if (newval != 3 && newval != 4) {
            PyErr_SetString(PyExc_ValueError, "ngram_size must be 3 or 4.");
            return -1;
        }
        self->ngram_size = newval;
        return 0;
    }

    if (newval <= 0) {
        PyErr_SetString(PyExc_ValueError, "Value must be a positive integer.");
        return -1;
    }
    if (is_max_files) {
        self->max_files = newval;
    } else {
        self->max_buffer = newval;
    }
    return 0;
}

/*
 * Define the Python type for pysnugglefish objects.
 * Configures pysnugglefish with its getters, setters, destructors, etc.
372 | */ 373 | PyTypeObject pysnugglefish_Type = { 374 | PyObject_HEAD_INIT(NULL) 375 | 0, /* ob_size */ 376 | "pysnugglefish", /* tp_name */ 377 | sizeof(pysnugglefish), /* tp_basicsize */ 378 | 0, /* tp_itemsize */ 379 | (destructor)pysnugglefish_dealloc, /* tp_dealloc */ 380 | 0, /* tp_print */ 381 | (getattrfunc)pysnugglefish_getattr, /* tp_getattr */ 382 | (setattrfunc)pysnugglefish_setattr, /* tp_setattr */ 383 | 0, /* tp_compare */ 384 | 0, /* tp_repr */ 385 | 0, /* tp_as_number */ 386 | 0, /* tp_as_sequence */ 387 | 0, /* tp_as_mapping */ 388 | 0, /* tp_hash */ 389 | 0, /* tp_call */ 390 | 0, /* tp_str */ 391 | 0, /* tp_getattro */ 392 | 0, /* tp_setattro */ 393 | 0, /* tp_as_buffer */ 394 | Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */ 395 | "pysnugglefish objects", /* tp_doc */ 396 | 0, /* tp_traverse */ 397 | 0, /* tp_clear */ 398 | 0, /* tp_richcompare */ 399 | 0, /* tp_weaklistoffset */ 400 | 0, /* tp_iter */ 401 | 0, /* tp_iternext */ 402 | pysnugglefish_methods, /* tp_methods */ 403 | pysnugglefish_members, /* tp_members */ 404 | 0, /* tp_getset */ 405 | 0, /* tp_base */ 406 | 0, /* tp_dict */ 407 | 0, /* tp_descr_get */ 408 | 0, /* tp_descr_set */ 409 | 0, /* tp_dictoffset */ 410 | (initproc)pysnugglefish_init, /* tp_init */ 411 | 0, /* tp_alloc */ 412 | pysnugglefish_new, /* tp_new */ 413 | }; 414 | 415 | /* Define static module methods (none). */ 416 | static PyMethodDef module_methods[] = { 417 | {NULL} /* Sentinel */ 418 | }; 419 | 420 | /* 421 | * Initialize the module. 422 | * Ensures that the pysnugglefish type is ready, makes methods available to 423 | * objects, and prepares error object for any future issues with pysnugglefish. 
424 | */ 425 | PyMODINIT_FUNC initpysnugglefish(void) { 426 | PyObject *m; 427 | 428 | if (PyType_Ready(&pysnugglefish_Type) < 0) { 429 | return; 430 | } 431 | m = Py_InitModule3("pysnugglefish", module_methods, "Module that exposes snugglefish methods."); 432 | if (m == NULL) { 433 | return; 434 | } 435 | 436 | SnuggleError = PyErr_NewException((char *) "pysnugglefish.error", NULL, NULL); 437 | Py_INCREF(SnuggleError); 438 | PyModule_AddObject(m, "error", SnuggleError); 439 | Py_INCREF(&pysnugglefish_Type); 440 | PyModule_AddObject(m, "init", (PyObject *) &pysnugglefish_Type); 441 | } 442 | -------------------------------------------------------------------------------- /python/setup.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2014 The MITRE Corporation. All rights reserved. 3 | # 4 | # Redistribution and use in source and binary forms, with or without 5 | # modification, are permitted provided that the following conditions 6 | # are met: 7 | # 1. Redistributions of source code must retain the above copyright 8 | # notice, this list of conditions and the following disclaimer. 9 | # 2. Redistributions in binary form must reproduce the above copyright 10 | # notice, this list of conditions and the following disclaimer in the 11 | # documentation and/or other materials provided with the distribution. 12 | # 13 | # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 14 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16 | # ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 17 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 18 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 19 | # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 20 | # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21 | # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22 | # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23 | # SUCH DAMAGE. 24 | # 25 | 26 | from distutils.core import setup, Extension 27 | 28 | INCLUDE_DIRS = [ '/usr/local/include', 29 | '/opt/local/include', 30 | '/usr/include', 31 | '../include' ] 32 | LIBRARY_DIRS = [ '/usr/lib', 33 | '/usr/local/lib' ] 34 | 35 | # the c++ extension module 36 | extension_mod = Extension("pysnugglefish", 37 | sources=[ "../src/fileIndexer.cpp", 38 | "../src/nGramBase.cpp", 39 | "../src/nGramIndex.cpp", 40 | "../src/nGramSearch.cpp", 41 | "../src/file.cpp", 42 | "../src/indexSet.cpp", 43 | "../src/smFile.cpp", 44 | "pysnugglefish.cpp" ], 45 | include_dirs = INCLUDE_DIRS, 46 | library_dirs = LIBRARY_DIRS 47 | ) 48 | 49 | 50 | setup (# Distribution meta-data 51 | name = "pysnugglefish", 52 | version = "0.2", 53 | description = "python bindings for snugglefish", 54 | author = "Wesley Shields", 55 | author_email = "wshields@mitre.org", 56 | license = "BSD", 57 | long_description = "Python bindings for snugglefish", 58 | ext_modules = [ extension_mod ] 59 | ) 60 | -------------------------------------------------------------------------------- /python/snuggle.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | import pysnugglefish 5 | import multiprocessing 6 | 7 | from optparse import OptionParser 8 | 9 | def main(): 10 | cpu_count = multiprocessing.cpu_count() 11 | 12 | parser = OptionParser() 13 | parser.add_option("-i", 
"--index", action="store_true", dest="index", 14 | default=None, help="Index operation.") 15 | parser.add_option("-s", "--search", action="store_true", dest="search", 16 | default=None, help="Search operation.") 17 | parser.add_option("-I", "--indexfile", action="store", dest="indexfile", 18 | default="", help="Index file to search or create.") 19 | parser.add_option("-t", "--threads", action="store", dest="threads", 20 | default=cpu_count, 21 | help="Number of threads to spawn (default is #cpus).") 22 | 23 | (opts, searchstring) = parser.parse_args() 24 | 25 | if (not opts.index and not opts.search) or (opts.index and opts.search): 26 | print "[!] Must specify one of index or search." 27 | return 28 | 29 | if not opts.indexfile: 30 | print "[!] Must specify output location" 31 | return 32 | 33 | s = pysnugglefish.init(opts.indexfile) 34 | 35 | try: 36 | threads = int(opts.threads) 37 | except Exception as e: 38 | print "[!] Invalid threads: %s" % e.message 39 | return 40 | 41 | if threads <= 0: 42 | print "[!] Invalid threads. Defaulting to %i." % cpu_count 43 | threads = cpu_count 44 | 45 | if opts.index: 46 | s.file_list = [ line.rstrip('\n') for line in sys.stdin.readlines() ] 47 | msg = "[+] Indexing %i files with %i threads." % (len(s.file_list), 48 | threads) 49 | print "[+] This might take a while... ;)" 50 | try: 51 | s.make_index(threads) 52 | except Exception as e: 53 | print "[!] Exception while indexing: %s" % e.message 54 | elif opts.search: 55 | searchstring = ' '.join(searchstring) 56 | if not searchstring: 57 | searchstring = raw_input("Search string: ") 58 | 59 | if not searchstring: 60 | print "[!] Must enter a search string." 61 | return 62 | 63 | print "[+] Searching for %s with %i threads" % (searchstring, threads) 64 | try: 65 | results = s.search(searchstring, threads) 66 | for result in results: 67 | print result 68 | except Exception as e: 69 | print "[!] 
Exception while searching: %s" % e.message 70 | 71 | if __name__ == '__main__': 72 | main() 73 | -------------------------------------------------------------------------------- /src/file.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Approved for Public Release; Distribution Unlimited: 13-1937 3 | 4 | Copyright (c) 2014 The MITRE Corporation. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions 8 | are met: 9 | 1. Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | 2. Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 | SUCH DAMAGE. 
26 | 27 | */ 28 | 29 | #include "file.h" 30 | #include 31 | #include 32 | #include 33 | #include 34 | 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | 41 | using namespace snugglefish; 42 | using namespace std; 43 | 44 | file::file(const char* fileName, size_t buffersize){ 45 | this->bufferparam = buffersize; 46 | 47 | this->buffer = NULL; 48 | this->buffersize = 0; 49 | this->bufferused = 0; 50 | 51 | this->fd = 0; 52 | this->mmapFile = 0; 53 | this->size = 0; 54 | 55 | this->filename = (char*) malloc(sizeof(char) * (strlen(fileName) + 1)); 56 | strncpy(this->filename, fileName, strlen(fileName) + 1); 57 | } 58 | 59 | 60 | file::~file(){ 61 | //File Descriptor is not zero 62 | if(this->fd){ 63 | this->close(); 64 | } 65 | 66 | if (this->filename != NULL){ 67 | free(this->filename); 68 | } 69 | 70 | if (this->buffer){ 71 | free(this->buffer); 72 | this->buffer = NULL; 73 | this->buffersize = this->bufferused = 0; 74 | } 75 | } 76 | 77 | bool file::create(mode_t filemode){ 78 | 79 | if (this->bufferparam > 0){ 80 | this->buffersize = this->bufferparam; 81 | this->buffer = (char*) malloc(buffersize); 82 | } 83 | this->fd = ::open(this->filename, O_RDWR | O_CREAT, filemode); 84 | this->readonly = false; 85 | 86 | if (this->fd <= 0){ 87 | cerr << "Unable to Create File: " << filename << " -- Error: " << strerror(errno) << endl; 88 | throw runtime_error("Creating File"); 89 | } 90 | 91 | return true; 92 | } 93 | 94 | bool file::open(char readwrite){ 95 | switch (readwrite){ 96 | case 'r': 97 | { 98 | this->fd = ::open(this->filename, O_RDONLY); 99 | this->size = this->get_size(); 100 | this->readonly = true; 101 | break; 102 | } 103 | case 'w': 104 | { 105 | if (this->bufferparam > 0){ 106 | this->buffersize = this->bufferparam; 107 | this->buffer = (char*) malloc(buffersize); 108 | } 109 | this->fd = ::open(this->filename, O_RDWR); 110 | this->readonly = false; 111 | break; 112 | } 113 | default: 114 | throw runtime_error("Unrecognized 
read/write mode"); 115 | break; 116 | } 117 | 118 | if (this->fd <= 0){ 119 | cerr << "Error Opening File: " << this->filename << " -- Error: " << strerror(errno) << endl; 120 | throw runtime_error("Opening File"); 121 | } 122 | 123 | return true; 124 | } 125 | 126 | uint8_t* file::mmap(){ 127 | //Open the file first 128 | if (this->fd == 0) 129 | this->open('r'); 130 | 131 | if (!this->size) 132 | return NULL; 133 | 134 | this->mmapFile = (uint8_t*) ::mmap(NULL, this->size, PROT_READ, MAP_SHARED, this->fd, 0); 135 | 136 | if(this->mmapFile == MAP_FAILED){ 137 | cerr << "Error Loading Map for File : " << this->filename<< " -- Error: " << strerror(errno) << endl; 138 | throw runtime_error("Loading Map"); 139 | } 140 | 141 | return this->mmapFile; 142 | 143 | } 144 | 145 | bool file::close(){ 146 | if (!this->fd){ // already closed? 147 | return true; 148 | } 149 | 150 | this->flush(); 151 | 152 | //Close the mmap if opened 153 | if (this->mmapFile){ 154 | munmap(this->mmapFile, this->size); 155 | this->mmapFile = 0; 156 | this->size = 0; 157 | } 158 | 159 | //Free the write buffer if allocated 160 | if (this->buffer){ 161 | free(this->buffer); 162 | this->buffer = NULL; 163 | this->buffersize = this->bufferused = 0; 164 | } 165 | 166 | int32_t retval = ::close(this->fd); 167 | if(retval){//non zero 168 | cerr << "Error Closing File: " << this->filename << " -- Error: " << strerror(errno) << endl; 169 | throw runtime_error("Closing File"); 170 | } 171 | 172 | this->fd = 0; 173 | 174 | return true; 175 | 176 | } 177 | 178 | bool file::real_write(int fd, uint8_t* data, size_t length){ 179 | ssize_t written = ::write(fd, data, length); 180 | if (written == -1){//TODO what if partial write? 
181 | cerr << "Unable to write to file: " << this->filename << " -- Error: " << strerror(errno) << endl; 182 | throw runtime_error("Write Error"); 183 | } 184 | 185 | return true; 186 | } 187 | 188 | bool file::flush(){ 189 | if(this->readonly){ 190 | return true; 191 | } 192 | 193 | if (this->bufferused){ 194 | lseek(this->fd, 0, SEEK_END); 195 | this->real_write(this->fd, (uint8_t*) this->buffer, this->bufferused * sizeof(char)); 196 | this->bufferused = 0; 197 | } 198 | 199 | return true; 200 | } 201 | 202 | 203 | bool file::write(uint8_t * data, size_t length){ 204 | if (this->readonly){ 205 | throw runtime_error("Write command on read-only file"); 206 | } 207 | 208 | if((this->bufferused + (length)) > this->buffersize) { 209 | this->flush(); 210 | 211 | if (length < this->buffersize){ 212 | memcpy(this->buffer, data, length); 213 | this->bufferused = length; 214 | }else{ 215 | //Data being passed in is larger than buffer, write out directly 216 | //Seek to the end of the file first 217 | lseek(this->fd, 0, SEEK_END); 218 | this->real_write(this->fd, data, length); 219 | } 220 | }else{ // just buffer it up 221 | memcpy(this->buffer + this->bufferused, data, length); 222 | this->bufferused += length; 223 | } 224 | 225 | return true; 226 | } 227 | 228 | 229 | bool file::write_at(int32_t location, uint8_t * data, size_t length){ 230 | if (this->readonly){ 231 | throw runtime_error("Write command on reaodnly file"); 232 | } 233 | 234 | //Flush before writing at locations 235 | flush(); 236 | lseek(this->fd, location, SEEK_SET); 237 | this->real_write(this->fd, data, length * sizeof(uint8_t)); 238 | 239 | return true; 240 | 241 | } 242 | 243 | void file::read(uint8_t* dest, size_t length){ 244 | ::read(this->fd, dest, length); 245 | } 246 | 247 | void file::read_at(int32_t location, uint8_t* dest, size_t length){ 248 | lseek(this->fd, location, SEEK_SET); 249 | ::read (this->fd, dest, length); 250 | } 251 | 252 | const size_t file::get_size(){ 253 | struct stat st; 
254 | size_t filesize = 0; 255 | if (stat(this->filename, &st) == 0){ 256 | filesize = st.st_size; 257 | }else{ 258 | throw runtime_error("Statting File"); 259 | } 260 | 261 | return filesize; 262 | } 263 | 264 | const bool file::exists(){ 265 | struct stat st; 266 | if(stat(this->filename, &st) == 0){ 267 | return true; 268 | }else{ 269 | return false; 270 | } 271 | } 272 | -------------------------------------------------------------------------------- /src/fileIndexer.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Approved for Public Release; Distribution Unlimited: 13-1937 3 | 4 | Copyright (c) 2014 The MITRE Corporation. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions 8 | are met: 9 | 1. Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | 2. Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 | SUCH DAMAGE. 26 | 27 | */ 28 | 29 | 30 | #include "fileIndexer.h" 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | 40 | #include 41 | #include 42 | 43 | 44 | #include "file.h" 45 | 46 | using namespace snugglefish; 47 | using namespace std; 48 | 49 | fileIndexer::fileIndexer(uint8_t ngramLength): 50 | filesProcessed(0) 51 | { 52 | 53 | if(ngramLength == 3 || ngramLength == 4){ 54 | this->ngramLength = ngramLength; 55 | this->maxNgram = ((uint64_t) (1) << (8*ngramLength)); 56 | this->pagesize = getpagesize(); 57 | } 58 | else { 59 | throw std::runtime_error("Ngram len must be either 3 or 4"); 60 | } 61 | 62 | } 63 | 64 | // Takes in a file name, mmaps it, and creates a vector of ngrams 65 | // return: pointer to vector of ngrams 66 | vector* fileIndexer::processFile(const char* fileName){ 67 | 68 | vector* ngramList = 0; 69 | file* inputFile = new file(fileName); 70 | size_t fileSize = inputFile->get_size(); 71 | uint8_t* inputFileMap = inputFile->mmap(); 72 | 73 | if (!fileSize){ 74 | delete inputFile; 75 | return NULL; 76 | } 77 | 78 | 79 | this->filesProcessed++; 80 | 81 | //It is much faster to just use an array that holds a boolean 82 | //for every element, this requires only ~64MB per file for 3-byte Ngrams 83 | //but 16GB for 4 byte ngrams, so only do this for 3-byte ngrams 84 | //and do a map instead for 4-byte 85 | if(this->ngramLength == 3){ 86 | ngramList = new vector; 87 
| bool* bngramList = new bool[this->maxNgram](); 88 | 89 | try{ 90 | processNgrams(inputFileMap, fileSize, bngramList); 91 | 92 | for(uint32_t k = 0; k < this->maxNgram; k++){ 93 | if(bngramList[k]){ 94 | ngramList->push_back(k); 95 | } 96 | } 97 | 98 | delete[] bngramList; 99 | 100 | } catch(exception &e){ 101 | cout << "Error processing ngrams: "<< e.what() << endl; 102 | } 103 | 104 | }else{ //ngramLength == 4 105 | try { 106 | ngramList = processNgrams(inputFileMap, fileSize); 107 | } catch(exception &e){ 108 | cout << "Error processing ngrams: " << e.what() << endl; 109 | } 110 | } 111 | 112 | inputFile->close(); 113 | delete inputFile; 114 | 115 | return ngramList; 116 | } 117 | 118 | 119 | //Creates vector of ngrams in a given file 120 | //Uses an STL map which should use a RED/BLACK tree 121 | //Insertion/Search should be O(log(n)) time where n is the 122 | //number of nodes already in the tree 123 | vector* fileIndexer::processNgrams(unsigned char* buf, uint64_t fileSize){ 124 | map ngram_map; 125 | 126 | uint32_t nGram = 0; 127 | uint32_t i, j; 128 | 129 | for(i = 0; ((i + this->ngramLength) - 1) < fileSize; i++){ 130 | nGram = 0; 131 | for(j = 0; j < this->ngramLength; j++){ 132 | nGram += (unsigned char)buf[i+j] * (1 << (8*j)); 133 | } 134 | if(nGram >= this->maxNgram){ 135 | throw std::runtime_error("Ngram greater than maxNgram"); 136 | } 137 | ngram_map.insert(pair(nGram, true)); 138 | } 139 | 140 | //We should now have a map of ngrams, turn into a sorted vector 141 | //TODO pre-allocate vector to do this faster 142 | vector* ngramVector = new vector; 143 | 144 | for(map::iterator it = ngram_map.begin(); it != ngram_map.end() ; it++){ 145 | ngramVector->push_back(it->first); 146 | } 147 | 148 | return ngramVector; 149 | } 150 | 151 | // Takes the file and creates ngrams from it 152 | void fileIndexer::processNgrams(unsigned char* buf, uint64_t fileSize, bool ngramList[]){ 153 | //TODO update with byte array 154 | uint64_t nGram = 0; 155 | uint64_t i, 
j; 156 | for(i = 0; ((i + this->ngramLength) - 1) < fileSize; i++){ 157 | nGram = 0; 158 | // creates an ngram using the formula buf[i] + buf[i+1]*256 159 | // + buf[i+2]*256*256 160 | for(j = 0; j < this->ngramLength; j++){ 161 | // (1 <<(8*j)) is equivalent to pow(256,j) 162 | nGram += (unsigned char)buf[i+j] * (1 << (8*j)); 163 | } 164 | if(nGram >= this->maxNgram){ 165 | throw std::runtime_error("Ngram greater than maxNgram"); 166 | } 167 | ngramList[nGram] = 1; 168 | } 169 | 170 | return; 171 | } 172 | -------------------------------------------------------------------------------- /src/indexSet.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Approved for Public Release; Distribution Unlimited: 13-1937 3 | 4 | Copyright (c) 2014 The MITRE Corporation. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions 8 | are met: 9 | 1. Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | 2. Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 | SUCH DAMAGE. 26 | 27 | */ 28 | 29 | 30 | #include "indexSet.h" 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | 38 | using namespace snugglefish; 39 | using namespace std; 40 | 41 | 42 | indexSet::indexSet(const char* fileBase, uint32_t count, uint8_t nGramSize) 43 | :indexFile(0), nGramFile(0), fileBase(fileBase), ngramLength(nGramSize), 44 | writable(false), indexMap(0), nGramMap(0), count(count) 45 | { 46 | } 47 | 48 | indexSet::~indexSet(){ 49 | if (this->indexFile){ 50 | this->indexFile->close(); 51 | delete this->indexFile; 52 | } 53 | 54 | if (this->nGramFile){ 55 | this->nGramFile->close(); 56 | delete this->nGramFile; 57 | } 58 | } 59 | 60 | void indexSet::close(){ 61 | if (this->indexFile){ 62 | this->indexFile->close(); 63 | delete this->indexFile; 64 | this->indexFile = NULL; 65 | } 66 | if (this->nGramFile){ 67 | this->nGramFile->close(); 68 | delete this->nGramFile; 69 | this->nGramFile = NULL; 70 | } 71 | 72 | } 73 | 74 | void indexSet::create(ngram_t_numfiles nFiles){ 75 | if (indexFile || nGramFile){//Already opened? 
76 | throw runtime_error("Error opening index or ngram file"); 77 | } 78 | 79 | //Create the Files 80 | //First create the filenames 81 | string indexFileName, nGramFileName; 82 | indexFileName = nGramFileName = this->fileBase; 83 | char number_string[FILE_NUM_BUFFER_SIZE]; 84 | snprintf(number_string, FILE_NUM_BUFFER_SIZE, FILE_NUM_SPRINTF_STRING, this->count); 85 | 86 | indexFileName.append(INDEX_FILE_EXTENSION).append(number_string); 87 | nGramFileName.append(NGRAM_FILE_EXTENSION).append(number_string); 88 | 89 | this->indexFile = new file(indexFileName.c_str()); 90 | this->nGramFile = new file(nGramFileName.c_str()); 91 | 92 | if(indexFile->exists() || nGramFile->exists()){ //Already exist? 93 | throw runtime_error("index file or ngram file already exists"); 94 | } 95 | 96 | indexFile->create(); 97 | nGramFile->create(); 98 | 99 | 100 | this->endian_check = ENDIAN_CHECK; 101 | this->version = VERSION; 102 | 103 | //Write standard header 104 | indexFile->write((uint8_t*) (&(this->endian_check)), ENDIAN_CHECK_FIELD); 105 | indexFile->write((uint8_t*) (&(this->version)), VERSION_FIELD); 106 | indexFile->write((uint8_t*) (&(this->ngramLength)), NGRAM_SIZE_FIELD); 107 | indexFile->write((uint8_t*) (&nFiles), INDEX_HEADER_NUM_FILES_FIELD); 108 | 109 | offset = 0; //Offset in the ngramfile 110 | 111 | writable = true; 112 | 113 | } 114 | 115 | void indexSet::open(){ 116 | string indexFileName, nGramFileName; 117 | indexFileName = nGramFileName = this->fileBase; 118 | char number_string[FILE_NUM_BUFFER_SIZE]; 119 | snprintf(number_string, FILE_NUM_BUFFER_SIZE, FILE_NUM_SPRINTF_STRING, this->count); 120 | 121 | indexFileName.append(INDEX_FILE_EXTENSION).append(number_string); 122 | nGramFileName.append(NGRAM_FILE_EXTENSION).append(number_string); 123 | 124 | indexFile = new file(indexFileName.c_str()); 125 | 126 | if(!indexFile->exists()){ 127 | throw runtime_error("index file does not exist"); 128 | } 129 | 130 | nGramFile = new file(nGramFileName.c_str()); 131 | 132 | 
if (!nGramFile->exists()){ 133 | throw runtime_error("index file does not exist"); 134 | } 135 | 136 | indexFile->open('r'); 137 | nGramFile->open('r'); 138 | 139 | indexMap = indexFile->mmap(); 140 | nGramMap = nGramFile->mmap(); 141 | 142 | indexEntries = indexMap + INDEX_HEADER_SIZE; 143 | } 144 | 145 | void indexSet::addIndexData(uint64_t offset, uint32_t nFiles){ 146 | indexFile->write((uint8_t*) &offset, sizeof(uint64_t)); 147 | indexFile->write((uint8_t*) &nFiles, sizeof(uint32_t)); 148 | } 149 | 150 | void indexSet::addNGrams(uint32_t ngram, list* files){ 151 | if (!writable || !nGramFile){ 152 | //TODO 153 | } 154 | 155 | uint32_t size = files->size(); 156 | uint64_t off = offset; 157 | uint32_t difference = sizeof(ngram_t_fidtype) * size; 158 | 159 | if (!size){ 160 | off = 0; 161 | } 162 | 163 | //Pre-emptively add the index information, i.e, offset, numFiles 164 | addIndexData(off, size); 165 | 166 | while(size > 0){ 167 | nGramFile->write((uint8_t*) (&(files->front())), sizeof(ngram_t_fidtype)); 168 | files->pop_front(); 169 | size--; 170 | } 171 | 172 | offset = offset + difference; 173 | 174 | } 175 | 176 | void indexSet::updateNumFiles(ngram_t_numfiles count){ 177 | if (!writable || !indexFile){ 178 | //TODO 179 | } 180 | //TODO cleanup offset 181 | indexFile->write_at(INDEX_HEADER_SIZE - INDEX_HEADER_NUM_FILES_FIELD, (uint8_t*) (&count), INDEX_HEADER_NUM_FILES_FIELD); 182 | } 183 | 184 | size_t indexSet::getNGramCount(uint64_t ngram){ 185 | index_entry* index_table = (index_entry*) (indexEntries + (ngram * (INDEX_ENTRY_SIZE))); 186 | return index_table->num_files; 187 | } 188 | 189 | ngram_t_fidtype* indexSet::getNGrams(uint64_t ngram, size_t* count){ 190 | index_entry* index_table = (index_entry*) (indexEntries + (ngram * (INDEX_ENTRY_SIZE))); 191 | *count = index_table->num_files; 192 | ngram_t_fidtype* ptr = (ngram_t_fidtype*) (nGramMap + index_table->offset); 193 | 194 | return ptr; 195 | } 196 | 
-------------------------------------------------------------------------------- /src/nGramBase.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Approved for Public Release; Distribution Unlimited: 13-1937 3 | 4 | Copyright (c) 2014 The MITRE Corporation. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions 8 | are met: 9 | 1. Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | 2. Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 | SUCH DAMAGE. 
26 | 27 | */ 28 | 29 | 30 | #include "nGramBase.h" 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | 42 | #include //for dirname and basename() 43 | 44 | using namespace snugglefish; 45 | using namespace std; 46 | 47 | 48 | nGramBase::nGramBase( uint32_t ngramLength, string indexFileName) 49 | { 50 | 51 | if(ngramLength == 3 || ngramLength == 4){ 52 | this->ngramLength = ngramLength; 53 | this->maxNgram = (uint64_t) (1) << (8*ngramLength); 54 | } 55 | else { 56 | throw std::runtime_error("Ngram len must be 3 or 4"); 57 | } 58 | 59 | 60 | 61 | size_t pos; 62 | string baseFileName = indexFileName; 63 | // Check to see if the index file ends with any of the extentions we use 64 | // And if so, remove them to get the base filename 65 | pos = baseFileName.rfind( NGRAM_FILE_EXTENSION ); 66 | // Make sure that it is at the end of the string 67 | if(pos == (baseFileName.size() - 6)){ 68 | baseFileName = baseFileName.substr(0, pos); 69 | } else{ 70 | pos = baseFileName.rfind( INDEX_FILE_EXTENSION ); 71 | if(pos == (baseFileName.size() - 6)){ 72 | baseFileName = baseFileName.substr(0, pos); 73 | } else{ 74 | pos = baseFileName.rfind( FILEID_FILE_EXTENSION ); 75 | if(pos == (baseFileName.size() - 6)){ 76 | baseFileName = baseFileName.substr(0, pos); 77 | } 78 | } 79 | } 80 | 81 | 82 | this->baseFileName = baseFileName; 83 | } 84 | 85 | -------------------------------------------------------------------------------- /src/nGramIndex.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Approved for Public Release; Distribution Unlimited: 13-1937 3 | 4 | Copyright (c) 2014 The MITRE Corporation. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions 8 | are met: 9 | 1. 
Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | 2. Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 | SUCH DAMAGE. 
26 | 27 | */ 28 | 29 | 30 | #include "nGramIndex.h" 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | 41 | using namespace snugglefish; 42 | using namespace std; 43 | 44 | 45 | nGramIndex::nGramIndex( uint32_t ngramLength, string indexFileName) 46 | :nGramBase(ngramLength, indexFileName), bufferMax(MAX_BUFFER_SIZE), 47 | buffer_memory_usage(0), flush(false), flushing(false) { 48 | 49 | 50 | this->maxFileNameLength = DEFAULT_MAX_FILENAME_SIZE; 51 | 52 | //allocate output buffer 53 | this->output_buffer = new buffer_element[maxNgram]; 54 | for(uint64_t i = 0; i < maxNgram; i++){ 55 | this->output_buffer[i].elements_size = 0; 56 | this->output_buffer[i].elements = new list; 57 | } 58 | 59 | masterFile = new smFile(baseFileName, ngramLength); 60 | 61 | if (masterFile->exists()){ 62 | masterFile->open('w'); 63 | }else{ //Must create it 64 | masterFile->create(maxFileNameLength); 65 | } 66 | 67 | numFilesProcessed = masterFile->getNumFiles(); 68 | numSessionFilesProcessed = 0; 69 | } 70 | 71 | nGramIndex::~nGramIndex(){ 72 | 73 | this->flushAll(); 74 | for(uint64_t i = 0; i < maxNgram; i++){ 75 | delete this->output_buffer[i].elements; 76 | } 77 | delete[] output_buffer; 78 | 79 | delete masterFile; 80 | 81 | } 82 | 83 | 84 | /* NGram Related Functions */ 85 | void nGramIndex::addNGrams(vector* nGramList, string filename){ 86 | //POSIX basename may modify argument so create a copy 87 | char* temp_filename = new char[filename.length() + 1]; 88 | strncpy(temp_filename, filename.c_str(), filename.length() + 1); 89 | filename = basename(temp_filename); 90 | delete[] temp_filename; 91 | 92 | ngram_t_fidtype file_id = numFilesProcessed++; 93 | fileNameList.push_back(filename); 94 | 95 | // Insert ngram into the list, add the new node to the memory usage variable, 96 | // and check if the maximum memory has been used, and if so, indicate that the 97 | // nGrams should be flushed to disk 98 | 
for(uint32_t i = 0; i < nGramList->size(); i++){ 99 | uint32_t nGram = (*nGramList)[i]; 100 | output_buffer[nGram].elements_size++; 101 | output_buffer[nGram].elements->push_back(file_id); 102 | 103 | buffer_memory_usage += BUFFER_NODE_SIZE; //Add size of node 104 | 105 | if(buffer_memory_usage >= bufferMax){ 106 | flush = true; 107 | } 108 | } 109 | 110 | //We cleanup the memory 111 | delete nGramList; 112 | 113 | if (flush){ 114 | flushAll(); 115 | flush = false; 116 | } 117 | 118 | 119 | } 120 | 121 | void nGramIndex::flushAll(){ 122 | flushing = true; 123 | if(fileNameList.size()){ 124 | ngram_t_indexfcount num_files = fileNameList.size(); 125 | //By updating the master file last, this set can be queried 126 | //while creating a new index set 127 | flushIndex(num_files); 128 | flushMaster(); 129 | 130 | numSessionFilesProcessed += num_files; 131 | } 132 | flushing = false; 133 | } 134 | 135 | // Flush the file names and update the number of index files 136 | void nGramIndex::flushMaster(){ 137 | //Write FileNames to File ID file 138 | ngram_t_indexfcount num_files = (ngram_t_indexfcount) fileNameList.size(); 139 | 140 | if (!num_files){ 141 | return; 142 | } 143 | 144 | for(unsigned long i = 0; i < num_files; i++){ 145 | masterFile->addFileId(fileNameList[i].c_str()); 146 | } 147 | 148 | //Clear the vector 149 | fileNameList.clear(); 150 | 151 | //Update filid with new value 152 | masterFile->updateIndexFileCount(masterFile->getNumIndexFiles() + 1); 153 | } 154 | 155 | // Flush the ngrams to the index files 156 | void nGramIndex::flushIndex(ngram_t_indexfcount num_files){ 157 | //Create the files 158 | indexSet* tIndex = new indexSet(baseFileName.c_str(), masterFile->getNumIndexFiles(), ngramLength); 159 | tIndex->create(num_files); 160 | 161 | uint64_t bytes_flushed = 0; 162 | 163 | for(uint32_t i = 0; i < maxNgram; i++){ //iterate through every ngram 164 | bytes_flushed += output_buffer[i].elements_size * sizeof(ngram_t_fidtype); 165 | tIndex->addNGrams(i, 
this->output_buffer[i].elements); 166 | output_buffer[i].elements_size = 0; 167 | 168 | if (output_buffer[i].elements->size() != 0) 169 | cout << "Not Zero" << endl; 170 | } 171 | 172 | buffer_memory_usage = 0; 173 | 174 | tIndex->close(); 175 | delete tIndex; 176 | } 177 | 178 | 179 | void nGramIndex::getStats(uint64_t& totalFiles, uint64_t& sessionFiles, uint64_t& indexFiles, bool& flushing){ 180 | totalFiles = masterFile->getNumFiles(); 181 | sessionFiles = numSessionFilesProcessed; 182 | indexFiles = masterFile->getNumIndexFiles(); 183 | flushing = this->flushing; 184 | } 185 | -------------------------------------------------------------------------------- /src/nGramSearch.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Approved for Public Release; Distribution Unlimited: 13-1937 3 | 4 | Copyright (c) 2014 The MITRE Corporation. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions 8 | are met: 9 | 1. Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | 2. Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 | SUCH DAMAGE. 26 | 27 | */ 28 | 29 | 30 | #include "nGramSearch.h" 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include "indexSet.h" 43 | 44 | using namespace snugglefish; 45 | using namespace std; 46 | 47 | 48 | nGramSearch::nGramSearch( uint32_t ngramLength, string indexFileName) 49 | :nGramBase(ngramLength, indexFileName),numThreads(1) { 50 | 51 | masterFile = new smFile(baseFileName, ngramLength); 52 | 53 | if (!masterFile->exists()){ 54 | //some error 55 | }else{ 56 | masterFile->open('r'); 57 | } 58 | 59 | 60 | numIndexFiles = masterFile->getNumIndexFiles(); 61 | numFiles = masterFile->getNumFiles(); 62 | } 63 | 64 | nGramSearch::nGramSearch( uint32_t ngramLength, string indexFileName, uint32_t threads) 65 | :nGramBase(ngramLength, indexFileName),numThreads(threads) { 66 | 67 | masterFile = new smFile(baseFileName, ngramLength); 68 | 69 | if (!masterFile->exists()){ 70 | //some error 71 | }else{ 72 | masterFile->open('r'); 73 | } 74 | 75 | 76 | numIndexFiles = masterFile->getNumIndexFiles(); 77 | numFiles = masterFile->getNumFiles(); 78 | 79 | } 80 | 81 | nGramSearch::~nGramSearch(){ 82 | delete masterFile; 83 | } 84 | 85 | 86 | vector* nGramSearch::stringToNGrams(string searchString){ 87 | uint64_t nGram; 88 | vector * ngrams = new vector; 89 | 90 | 91 | for(size_t i = 0; i + ngramLength - 1 < searchString.length(); i++){ 92 | 
nGram = 0; 93 | for(size_t j = 0; j < ngramLength; j++){ 94 | // (1 << (8*j)) is equivalent to pow(256,j) 95 | nGram += (unsigned char)searchString[i+j] * (1 << (8*j)); 96 | } 97 | ngrams->push_back(nGram); 98 | } 99 | 100 | return ngrams; 101 | } 102 | 103 | 104 | vector* nGramSearch::searchNGrams(vector nGramQuery){ 105 | vector* matchedFiles = new vector; 106 | pthread_t * threads; 107 | thread_data* tdata; 108 | void* status; 109 | 110 | tdata = new thread_data(); 111 | 112 | pthread_attr_t attr; 113 | pthread_attr_init(&attr); 114 | pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); 115 | 116 | pthread_mutex_init(&(tdata->queueMutex), NULL); 117 | pthread_mutex_init(&(tdata->smFileMutex), NULL); 118 | pthread_mutex_init(&(tdata->mfMutex), NULL); 119 | 120 | tdata->ngramLength = ngramLength; 121 | tdata->queue = 0; 122 | tdata->maximumIndex = numIndexFiles; 123 | 124 | tdata->masterFile = (this->masterFile); 125 | tdata->matchedFiles = matchedFiles; 126 | tdata->baseFileName = &baseFileName; 127 | tdata->nGramQuery = &(nGramQuery); 128 | 129 | threads = (pthread_t*) malloc(numThreads * sizeof(pthread_t)); 130 | 131 | //Create the threads 132 | for(uint32_t i = 0; i < numThreads; i++){ 133 | pthread_create(& threads[i], &attr, searchNGramThread, (void*) (tdata)); 134 | } 135 | 136 | //Join on them 137 | for(uint32_t i = 0; i < numThreads; i++){ 138 | pthread_join(threads[i], &status); 139 | } 140 | 141 | pthread_mutex_destroy(&(tdata->queueMutex)); 142 | pthread_mutex_destroy(&(tdata->smFileMutex)); 143 | pthread_mutex_destroy(&(tdata->mfMutex)); 144 | pthread_attr_destroy(&attr); 145 | 146 | free(threads); 147 | delete tdata; 148 | masterFile->close(); 149 | return matchedFiles; 150 | 151 | } 152 | 153 | void* nGramSearch::searchNGramThread(void* input){ 154 | 155 | 156 | thread_data* tdata = (thread_data*) input; 157 | 158 | while(1){ 159 | pthread_mutex_lock(& tdata->queueMutex); 160 | if (tdata->queue >= tdata->maximumIndex){ 161 | 
pthread_mutex_unlock(& tdata->queueMutex); 162 | break; 163 | } 164 | 165 | uint32_t i = tdata->queue++; 166 | pthread_mutex_unlock(& tdata->queueMutex); 167 | 168 | indexSet* tIndex = new indexSet(tdata->baseFileName->c_str(), i, tdata->ngramLength); 169 | tIndex->open(); 170 | 171 | //Get ordered list of NGrams 172 | // In ascending order by number of files that contain that ngram 173 | //list queryList = orderNGrams(nGramQuery); 174 | list< pair > queryList = orderNGrams(tIndex, *(tdata->nGramQuery)); 175 | 176 | //Get list of File Ids that match NGrams 177 | list matchedIds = searchAlpha((indexSet*) tIndex, queryList); 178 | 179 | //Convert File IDs to filenames 180 | list::iterator ft = matchedIds.begin(); 181 | while(ft != matchedIds.end()){ 182 | pthread_mutex_lock(& tdata->smFileMutex); 183 | string matched_filename = tdata->masterFile->getFilebyId(*ft); 184 | pthread_mutex_unlock(& tdata->smFileMutex); 185 | 186 | pthread_mutex_lock(& tdata->mfMutex); 187 | tdata->matchedFiles->push_back(matched_filename); 188 | pthread_mutex_unlock(& tdata->mfMutex); 189 | ft++; 190 | } 191 | 192 | tIndex->close(); 193 | delete tIndex; 194 | } 195 | 196 | return NULL; 197 | } 198 | 199 | list< pair > nGramSearch::orderNGrams(indexSet* index, const vector & nGramQuery){ 200 | bool nomatch = false; 201 | list< pair > queryList; 202 | for(uint32_t j = 0; j < nGramQuery.size(); j++){ 203 | size_t numfiles = index->getNGramCount(nGramQuery[j]); 204 | //index_entry* index_table = (index_entry*) (indexEntries + (nGramQuery[j] * (INDEX_ENTRY_SIZE))); 205 | 206 | if(numfiles == 0){ 207 | nomatch = true; 208 | break; 209 | //No Files Match 210 | } 211 | 212 | if(queryList.empty()){ 213 | queryList.push_back(pair(nGramQuery[j], numfiles)); 214 | }else{ 215 | bool placed = false; 216 | for (list< pair >::iterator it = queryList.begin(); 217 | it != queryList.end(); it++ ){ 218 | if((*it).second > numfiles){ 219 | queryList.insert(it, pair(nGramQuery[j], numfiles)); 220 | placed = 
true; 221 | break; 222 | } 223 | } 224 | if (!placed){ 225 | queryList.push_back(pair(nGramQuery[j], numfiles)); 226 | } 227 | } 228 | } 229 | 230 | if (nomatch){ 231 | queryList.clear(); 232 | } 233 | 234 | return queryList; 235 | 236 | } 237 | 238 | 239 | // Takes the list of sorted fids, and puts the common fids into matchedIds 240 | list nGramSearch::searchAlpha(indexSet* index, list< pair > &queryList){ 241 | list matchedIds; 242 | 243 | if(queryList.size() == 0){ 244 | return matchedIds; 245 | } 246 | // this gets the ngram list for the first file, and places them 247 | // into the matchedIds list 248 | ngram_t_numfiles count = 0; 249 | ngram_t_fidtype* ngrams = index->getNGrams((uint64_t) queryList.front().first, (size_t*) &count); 250 | for (ngram_t_numfiles j = 0; j < count; j++){ 251 | matchedIds.push_back(ngrams[j]); 252 | } 253 | 254 | //Dont need the front element anymore so get rid of it 255 | queryList.pop_front(); 256 | 257 | //For every id see if it's in the remaining Ngrams 258 | //Now for every subsequent NGram whittle down the list 259 | for(list< pair >::iterator it = queryList.begin(); 260 | it != queryList.end(); it++){ 261 | 262 | ngram_t_numfiles ngram_elements = 0; 263 | ngram_t_fidtype* ngrams = index->getNGrams((uint64_t) (*it).first, (size_t*) &ngram_elements); 264 | uint32_t ngram_index = 0; 265 | 266 | list::iterator ft = matchedIds.begin(); 267 | while(ft != matchedIds.end()){ 268 | bool found = false; 269 | // Checks each element of the next fid array for 270 | // the current fid in matchedIds 271 | while(ngram_index < ngram_elements){ 272 | if (ngrams[ngram_index] == (*ft)){ 273 | found = true; 274 | break; 275 | }else if(ngrams[ngram_index] > (*ft)){ 276 | //update ngram location 277 | // If the fid is too large, it is 278 | // necessary to recheck the same index on the 279 | // next pass, as to make sure it isn't missed 280 | break; 281 | } 282 | ngram_index++; 283 | } 284 | 285 | if(!found){ 286 | ft = matchedIds.erase(ft); 287 
| }else{ 288 | // ft++ is only needed if erase isn't called as 289 | // erase moves the iterator ahead 290 | ft++; 291 | } 292 | } 293 | } 294 | 295 | return matchedIds; 296 | } 297 | -------------------------------------------------------------------------------- /src/smFile.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Approved for Public Release; Distribution Unlimited: 13-1937 3 | 4 | Copyright (c) 2014 The MITRE Corporation. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions 8 | are met: 9 | 1. Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | 2. Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 | SUCH DAMAGE. 
26 | 27 | */ 28 | 29 | 30 | #include "smFile.h" 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | 39 | 40 | using namespace snugglefish; 41 | using namespace std; 42 | 43 | 44 | smFile::smFile(string fileBase, uint8_t nGramSize) 45 | :file(fileBase.append(FILEID_FILE_EXTENSION).c_str()), ngramLength((ngram_t_size) nGramSize) 46 | { 47 | 48 | } 49 | 50 | smFile::~smFile(){ 51 | flush(); 52 | } 53 | 54 | bool smFile::flush(){ 55 | if (this->readonly) 56 | return true; 57 | 58 | file::flush(); 59 | write_at((int32_t) FILID_NUM_FILES_OFFSET, (uint8_t*) &numFiles, (size_t) NUM_FILES_FIELD); 60 | 61 | return true; 62 | 63 | } 64 | 65 | void smFile::create(ngram_t_fnlength maxfnLength){ 66 | if (this->exists()){ 67 | cerr << "Unable to create file: " << this->filename << " -- Already Exists" << endl; 68 | throw runtime_error("Creating File"); 69 | } 70 | 71 | //Create the File 72 | file::create(); 73 | 74 | this->endian_check = ENDIAN_CHECK; 75 | this->version = VERSION; 76 | this->maxFileNameLength = maxfnLength; 77 | 78 | ngram_t_indexcount four_byte_zero = this->numIndexFiles = 0; 79 | ngram_t_fidtype eight_byte_zero = this->numFiles = 0; 80 | 81 | //Write standard header 82 | write((uint8_t*) (&(this->endian_check)), ENDIAN_CHECK_FIELD); 83 | write((uint8_t*) (&(this->version)), VERSION_FIELD); 84 | write((uint8_t*) (&(this->ngramLength)), NGRAM_SIZE_FIELD); 85 | write((uint8_t*) (&(this->maxFileNameLength)), MAX_FILENAME_LENGTH_FIELD); 86 | write((uint8_t*) &four_byte_zero, NUM_INDEX_FILES_FIELD); 87 | write((uint8_t*) &eight_byte_zero, NUM_FILES_FIELD); 88 | 89 | fileBuffer = (char*) malloc((maxFileNameLength + 1)* sizeof(char)); 90 | 91 | } 92 | 93 | void smFile::open(char readwrite){ 94 | file::open(readwrite); 95 | 96 | ngram_t_size ngramL; 97 | 98 | read((uint8_t*) (&(this->endian_check)), ENDIAN_CHECK_FIELD); 99 | 100 | if (this->endian_check != ENDIAN_CHECK){ 101 | throw runtime_error("Endian Mismatch"); 102 
| } 103 | 104 | read((uint8_t*) (&(this->version)), VERSION_FIELD); 105 | read((uint8_t*) (&ngramL), NGRAM_SIZE_FIELD); 106 | 107 | if (this->ngramLength != ngramL){ 108 | throw runtime_error("N Gram Length Mismatch"); 109 | } 110 | 111 | read((uint8_t*) (&(this->maxFileNameLength)), MAX_FILENAME_LENGTH_FIELD); 112 | read((uint8_t*) (&(this->numIndexFiles)), NUM_INDEX_FILES_FIELD); 113 | read((uint8_t*) (&(this->numFiles)), NUM_FILES_FIELD); 114 | 115 | fileBuffer = (char*) malloc((maxFileNameLength + 1)* sizeof(char)); 116 | 117 | } 118 | 119 | 120 | void smFile::addFileId(const char * fileName){ 121 | strncpy(fileBuffer, fileName, maxFileNameLength); 122 | write((uint8_t*) fileBuffer, maxFileNameLength * sizeof(char)); 123 | 124 | numFiles++; 125 | //Writing of the numFiles to the file is delayed until a flush 126 | } 127 | 128 | const char* smFile::getFilebyId(uint64_t id){ 129 | read_at(FILID_HEADER_SIZE + (id * maxFileNameLength * sizeof(char)), (uint8_t*) fileBuffer, maxFileNameLength * sizeof(char)); 130 | return fileBuffer; 131 | } 132 | 133 | void smFile::updateIndexFileCount(ngram_t_indexcount count){ 134 | write_at((int32_t) FILID_NUM_INDEX_OFFSET, (uint8_t*) &count, (size_t) NUM_INDEX_FILES_FIELD); 135 | numIndexFiles = count; 136 | } 137 | -------------------------------------------------------------------------------- /src/snugglefish.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Approved for Public Release; Distribution Unlimited: 13-1937 3 | 4 | Copyright (c) 2014 The MITRE Corporation. All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions 8 | are met: 9 | 1. Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | 2. 
Redistributions in binary form must reproduce the above copyright 12 | notice, this list of conditions and the following disclaimer in the 13 | documentation and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 | ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 | SUCH DAMAGE. 26 | 27 | */ 28 | 29 | 30 | // Snugglefish.cpp 31 | // Sample NGram Fast Indexer and Search (SNGFIS) 32 | // Allows for the indexing and searching of a large amount of samples in a short 33 | // period of time. 
34 | 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | 46 | #include 47 | #include 48 | #include 49 | 50 | #include 51 | #include 52 | 53 | #include "common.h" 54 | 55 | using namespace std; 56 | using namespace snugglefish; 57 | 58 | #define NGRAM_SIZE_OPTION_LONG "ngramsize" 59 | #define NGRAM_SIZE_OPTION_SHORT 'n' 60 | #define HELP_OPTION_LONG "help" 61 | #define HELP_OPTION_SHORT 'h' 62 | #define INDEX_OPTION_SHORT 'i' 63 | #define INDEX_OPTION_LONG "index" 64 | #define SEARCH_OPTION_SHORT 's' 65 | #define SEARCH_OPTION_LONG "search" 66 | #define OUTPUT_OPTION_SHORT 'o' 67 | #define OUTPUT_OPTION_LONG "output" 68 | #define FILE_OPTION_SHORT 'f' 69 | #define FILE_OPTION_LONG "file" 70 | #define NODE_BUFFER_OPTION_LONG "node_bound" 71 | #define NODE_BUFFER_OPTION_SHORT 'b' 72 | #define THREADS_OPTION_LONG "threads" 73 | #define THREADS_OPTION_SHORT 't' 74 | #define SHORT_OPTIONS_STRING "n:hsio:f:b:t:" 75 | 76 | 77 | uint32_t cpu_count(){ 78 | long procs = -1; 79 | 80 | //This only works on some *nixes TODO figure out which systems don't support this call 81 | procs = sysconf(_SC_NPROCESSORS_ONLN); 82 | if (procs < 1){ 83 | cerr << "Unable to get CPU count, defaulting to 1 -- Error: " 84 | << strerror(errno) << endl; 85 | return 1; 86 | } 87 | return (uint32_t) procs; 88 | } 89 | 90 | 91 | 92 | void handler(int sig) { 93 | void *array[10]; 94 | int size; 95 | 96 | // get void*'s for all entries on the stack 97 | size = backtrace(array, 10); 98 | 99 | // print out all the frames to stderr 100 | fprintf(stderr, "Error: signal %d:\n", sig); 101 | backtrace_symbols_fd(array, size, 2); 102 | exit(1); 103 | } 104 | 105 | void printHelp(); 106 | 107 | 108 | int main(int argc, char *argv[]){ 109 | int c, option_index = 0; 110 | uint32_t ngramSize = 3; 111 | uint32_t max_files = 0; 112 | uint64_t max_buffer = 0; 113 | uint32_t threads = 0; 114 | 115 | signal(SIGSEGV, handler); 
116 | 117 | string commandString, indexFileName = "", searchString; 118 | vector fileList; 119 | static struct option long_options[] ={ 120 | // Ngramsize defaults to 3 if not present 121 | {INDEX_OPTION_LONG, no_argument, 0, INDEX_OPTION_SHORT}, 122 | {SEARCH_OPTION_LONG, no_argument, 0, SEARCH_OPTION_SHORT}, 123 | {FILE_OPTION_LONG, required_argument, 0, FILE_OPTION_SHORT}, 124 | {OUTPUT_OPTION_LONG, required_argument, 0, OUTPUT_OPTION_SHORT}, 125 | {NGRAM_SIZE_OPTION_LONG, required_argument, 0, NGRAM_SIZE_OPTION_SHORT}, 126 | {HELP_OPTION_LONG, no_argument, 0, HELP_OPTION_SHORT}, 127 | {THREADS_OPTION_LONG, required_argument, 0, THREADS_OPTION_SHORT}, 128 | {0,0,0,0} 129 | }; 130 | bool indexFlag = false, searchFlag = false; 131 | 132 | // Default threads to the number of cpus 133 | threads = cpu_count(); 134 | 135 | // loop over all of the options 136 | while ((c = getopt_long(argc, argv, SHORT_OPTIONS_STRING, long_options, &option_index)) != -1){ 137 | // check to see if a single character or long option came through 138 | switch (c){ 139 | // short option 't' 140 | case THREADS_OPTION_SHORT: 141 | { 142 | istringstream tss(optarg); 143 | uint32_t thr = 0; 144 | 145 | if(!(tss >> thr)){ 146 | cout << "Invalid number of threads, please enter an integer" << endl; 147 | return 0; 148 | }else{ 149 | threads = thr; 150 | } 151 | 152 | break; 153 | } 154 | case SEARCH_OPTION_SHORT: 155 | searchFlag = true; 156 | break; 157 | case INDEX_OPTION_SHORT: 158 | indexFlag = true; 159 | break; 160 | // Outpur and file options are the same 161 | case OUTPUT_OPTION_SHORT: 162 | case FILE_OPTION_SHORT: 163 | indexFileName = optarg; 164 | break; 165 | case HELP_OPTION_SHORT: 166 | printHelp(); 167 | return 0; 168 | // short option 'a' 169 | case NGRAM_SIZE_OPTION_SHORT: 170 | { 171 | //string ngramSizeString(optarg); 172 | istringstream ss(optarg); 173 | //stringstream ss; 174 | //ss << ngramSizeString; 175 | if(!(ss >> ngramSize)){ 176 | cout << "Invalid ngram size, please 
enter an integer" << endl; 177 | return 0; 178 | } 179 | if(ngramSize < 3 || ngramSize > 4){ 180 | cout << "Ngram size must be 3 or 4. Ngram size is " << ngramSize << endl; 181 | return 0; 182 | } 183 | break; 184 | } 185 | 186 | case NODE_BUFFER_OPTION_SHORT: 187 | { 188 | istringstream ss(optarg); 189 | uint64_t tmax_buffer = 0; 190 | if(!(ss >> tmax_buffer)){ 191 | cout << "Invalid maxmimum node buffer, please enter an integer" << endl; 192 | return 0; 193 | } 194 | 195 | if(tmax_buffer < 4){ 196 | cout << "Node buffer size must be at least 4GB in size" << endl; 197 | return 0; 198 | } 199 | 200 | max_buffer = tmax_buffer * 1073741824; //in Gigabytes 201 | break; 202 | } 203 | 204 | } 205 | } 206 | /* the rest of the command line arguments 207 | * each option needs at least an aditional parameter*/ 208 | if((!indexFlag && !searchFlag)){ 209 | cout << "Must specify -i for index or -s for search" << endl; 210 | return 0; 211 | } 212 | 213 | if((indexFlag && searchFlag)){ 214 | cout << "Must specify only one of -i or -s" << endl; 215 | return 0; 216 | 217 | } 218 | if(indexFileName.empty()){ 219 | cout << "Index file necessary" << endl; 220 | return 0; 221 | } 222 | 223 | 224 | if(indexFlag){ 225 | // get the list of files to add 226 | 227 | for(; optind < argc; optind++){ 228 | fileList.push_back(argv[optind]); 229 | } 230 | // if fileList is empty, it means we need to get the 231 | // filenames from stdin 232 | if(fileList.empty()){ 233 | string fileName; 234 | while(cin){ 235 | getline(cin,fileName); 236 | if(fileName.empty()) 237 | break; 238 | fileList.push_back(fileName); 239 | } 240 | } 241 | //Eventually options should be sent as a structure 242 | make_index(indexFileName, fileList, ngramSize, max_files, max_buffer, threads); 243 | } else if(searchFlag) { 244 | // Get the string to search for 245 | if (optind < argc) { 246 | searchString = argv[optind]; 247 | } else { 248 | cin >> searchString; 249 | } 250 | if (searchString.size() < ngramSize){ 251 | cout 
<< "Search string size is smaller than Ngram size, the search string must be greater than or equal to the ngram size" << endl; 252 | } else { 253 | vector* found = search(indexFileName, searchString, ngramSize, threads); 254 | for(uint32_t i = 0; i < found->size(); i++){ 255 | cout << (*found)[i] << endl; 256 | } 257 | delete found; 258 | } 259 | } else{ 260 | printHelp(); 261 | } 262 | 263 | return 0; 264 | } 265 | 266 | void printHelp(){ 267 | cout << "Usage: snugglefish [OPTIONS]" << endl; 268 | cout << "Index files based on Ngrams then Search for a string" << endl; 269 | cout << "-i, --index Index Operation, requires -o, and files to index" << endl; 270 | cout << "-s, --search Search Operation, requires -f, and search string" << endl; 271 | cout << "-o, --output Specifies the output file for indexing, equivalent to -f" << endl; 272 | cout << "-f, --file Index file to search, equivalent to -o" << endl; 273 | cout << "-b, --node_bound Maximum node buffer memory size before flushing" << endl; 274 | cout << "-n, --ngramsize The size of Ngram to use (default is 3)" << endl; 275 | cout << "-h, --help This help screen" << endl; 276 | cout << "-t, --threads Number of search threads to spawn (default is #cpus)" << endl; 277 | cout << "Examples:" << endl; 278 | cout << "Index: snugglefish [-n ngramsize] -i -o " << endl; 279 | cout << "If no file names are given on the commandline, the stdin will be used" << endl; 280 | cout << "Search: snugglefish [-n ngramsize] -s -f " << endl; 281 | cout << "If no string is given, it can be entered on the command line" << endl; 282 | 283 | 284 | } 285 | 286 | 287 | void printStats(nGramIndex* indexer, uint64_t processed, uint64_t listsize){ 288 | uint64_t total; 289 | uint64_t session; 290 | uint64_t indexes; 291 | bool flushing; 292 | indexer->getStats(total, session, indexes, flushing); 293 | 294 | uint64_t percent = ((double) processed / listsize) * 100; 295 | 296 | cerr << "\r"; 297 | cerr << "Processed: " << percent << "% -- " << 
processed << "/" << listsize; 298 | 299 | if(flushing){ 300 | cerr << " (Flushing ... )"; 301 | }else{ 302 | cerr << " "; 303 | } 304 | } 305 | 306 | void* indexerThread(void* input){ 307 | mi_data* midata = (mi_data*) input; 308 | nGramIndex* ngramindex = (nGramIndex*) midata->ngramindex; 309 | fileIndexer indexer(midata->ngramSize); 310 | 311 | while(1){ 312 | pthread_mutex_lock(& midata->filesMutex); 313 | if (midata->queue >= midata->fileList->size()){ 314 | pthread_mutex_unlock(& midata->filesMutex); 315 | break; 316 | } 317 | uint32_t i = midata->queue++; 318 | pthread_mutex_unlock(& midata->filesMutex); 319 | 320 | try{ 321 | vector* processedFile = indexer.processFile((*(midata->fileList))[i].c_str()); 322 | if(processedFile != 0){ 323 | pthread_mutex_lock(& midata->nGramIndexMutex); 324 | ngramindex->addNGrams(processedFile, (*(midata->fileList))[i]); 325 | pthread_mutex_unlock(& midata->nGramIndexMutex); 326 | } 327 | }catch(exception& e){ 328 | cout << "Error in thread:" << e.what() << endl; 329 | handler(SIGSEGV); 330 | } 331 | } 332 | return 0; 333 | } 334 | 335 | void make_index(string indexFileName, vector fileNames, uint32_t ngramSize, uint32_t max_files, uint64_t max_buffer, uint32_t threads){ 336 | 337 | pthread_t* indexers; 338 | mi_data* midata; 339 | void* status; 340 | 341 | 342 | midata = new mi_data; 343 | indexers = (pthread_t*) malloc(threads * sizeof(pthread_t)); 344 | 345 | 346 | pthread_attr_t attr; 347 | pthread_attr_init(&attr); 348 | pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); 349 | 350 | pthread_mutex_init(& (midata->filesMutex), NULL); 351 | pthread_mutex_init(& (midata->nGramIndexMutex), NULL); 352 | 353 | midata->fileList = & fileNames; 354 | midata->ngramSize = ngramSize; 355 | midata->queue = 0; 356 | 357 | try{ 358 | nGramIndex ngramindex(ngramSize, indexFileName); 359 | if (max_buffer > 0){ 360 | ngramindex.setmaxBufferSize(max_buffer); 361 | } 362 | 363 | midata->ngramindex = &ngramindex; 364 | 365 | 
for(uint32_t i = 0; i < threads; i++){ 366 | pthread_create(& indexers[i], & attr, indexerThread, (void*) midata); 367 | } 368 | 369 | while(1){ 370 | //Usage of mutex shouldn't matter 371 | if (midata->queue >= fileNames.size()){ 372 | break; 373 | } 374 | 375 | printStats(&ngramindex, midata->queue, fileNames.size()); 376 | sleep(1); 377 | } 378 | 379 | 380 | for(uint32_t i = 0; i < threads; i++){ 381 | pthread_join(indexers[i], &status); 382 | } 383 | 384 | //Print some final stats 385 | printStats(&ngramindex, midata->queue, fileNames.size()); 386 | cerr << endl; 387 | 388 | } catch (exception& e){ 389 | cout << "Error:" << e.what() << endl; 390 | handler(SIGSEGV); 391 | } 392 | 393 | 394 | pthread_mutex_destroy(&(midata->filesMutex)); 395 | pthread_mutex_destroy(&(midata->nGramIndexMutex)); 396 | pthread_attr_destroy(&attr); 397 | 398 | delete midata; 399 | free(indexers); 400 | } 401 | 402 | 403 | vector* search(string indexFileName, string searchString, uint32_t ngramSize, uint32_t threads){ 404 | vector* ret; 405 | nGramSearch searcher(ngramSize, indexFileName, threads); 406 | vector* ngrams = searcher.stringToNGrams(searchString); 407 | ret = searcher.searchNGrams(*ngrams); 408 | 409 | delete ngrams; 410 | return ret; 411 | } 412 | --------------------------------------------------------------------------------