├── .gitignore
├── Makefile
├── README.md
├── include
    ├── common.h
    ├── file.h
    ├── fileIndexer.h
    ├── indexSet.h
    ├── loserTree.hpp
    ├── nGramBase.h
    ├── nGramIndex.h
    ├── nGramSearch.h
    ├── smFile.h
    ├── snugglefish.h
    └── utils.h
├── python
    ├── README.md
    ├── pysnugglefish.cpp
    ├── setup.py
    └── snuggle.py
└── src
    ├── file.cpp
    ├── fileIndexer.cpp
    ├── indexSet.cpp
    ├── nGramBase.cpp
    ├── nGramIndex.cpp
    ├── nGramSearch.cpp
    ├── smFile.cpp
    └── snugglefish.cpp


/.gitignore:
--------------------------------------------------------------------------------
1 | snugglefish
2 | *.o
3 | *.swp
4 | python/build
5 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | #Approved for Public Release; Distribution Unlimited: 13-1937
 2 | 
 3 | # Copyright (c) 2014 The MITRE Corporation. All rights reserved.
 4 | #
 5 | # Redistribution and use in source and binary forms, with or without
 6 | # modification, are permitted provided that the following conditions
 7 | # are met:
 8 | # 1. Redistributions of source code must retain the above copyright
 9 | #    notice, this list of conditions and the following disclaimer.
10 | # 2. Redistributions in binary form must reproduce the above copyright
11 | #    notice, this list of conditions and the following disclaimer in the
12 | #    documentation and/or other materials provided with the distribution.
13 | #
14 | # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 | # ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 | # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 | # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 | # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 | # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 | # SUCH DAMAGE.
25 | 
26 | CC=g++
27 | OBJS=snugglefish.o nGramBase.o nGramSearch.o nGramIndex.o fileIndexer.o file.o indexSet.o smFile.o
28 | 
29 | # all, files and clean name actions, not files on disk
30 | .PHONY: all files clean
31 | all: snugglefish
32 | 
33 | # Link step: every object on the command line is also a prerequisite
34 | snugglefish: ${OBJS}
35 | 	${CC} -rdynamic -pthread ${OBJS} -o snugglefish
36 | 
37 | snugglefish.o: src/snugglefish.cpp
38 | 	${CC} -Iinclude -g -c src/snugglefish.cpp
39 | nGramBase.o: src/nGramBase.cpp
40 | 	${CC} -Iinclude -g -c src/nGramBase.cpp
41 | nGramIndex.o: src/nGramIndex.cpp src/nGramBase.cpp
42 | 	${CC} -Iinclude -g -c src/nGramIndex.cpp
43 | nGramSearch.o: src/nGramSearch.cpp src/nGramBase.cpp
44 | 	${CC} -Iinclude -g -c src/nGramSearch.cpp
45 | fileIndexer.o: src/fileIndexer.cpp
46 | 	${CC} -Iinclude -g -c src/fileIndexer.cpp
47 | file.o: src/file.cpp
48 | 	${CC} -Iinclude -g -c src/file.cpp
49 | indexSet.o: src/indexSet.cpp
50 | 	${CC} -Iinclude -g -c src/indexSet.cpp
51 | smFile.o: src/smFile.cpp
52 | 	${CC} -Iinclude -g -c src/smFile.cpp
53 | # Kept so that "make files" still builds the file-handling objects
54 | files: file.o indexSet.o smFile.o
55 | clean:
56 | 	rm -rf *.o snugglefish


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | NOTICE: This proof-of-concept is no longer being maintained.  See [BigGrep](https://github.com/cmu-sei/BigGrep) and [UrsaDB](https://github.com/CERT-Polska/ursadb) for similar capabilities.
 2 | 
 3 | Snugglefish
 4 | ========
 5 | 
 6 | Simple N-Gram Fast Indexer & Searcher (SNGFSH)
 7 | 
 8 | Description
 9 | ===========
10 | 
11 | Got lots of malware and want to be able to quickly limit your search for an
12 | arbitrary binary string to a much smaller quantity of files? Then snugglefish
13 | is for you!
14 | 
15 | Check out this post which explains it:
16 | http://www.mitre.org/capabilities/cybersecurity/overview/cybersecurity-blog/snugglefish-provides-quick-pattern-matching
17 | 
18 | This work is based upon a paper published by CMU CERT entitled "A Scalable
19 | Search Index for Binary Files" which we highly recommend reading. It contains
20 | some optimizations we have not yet implemented.
21 | 


--------------------------------------------------------------------------------
/include/common.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Approved for Public Release; Distribution Unlimited: 13-1937
  3 | 
  4 | Copyright (c) 2014 The MITRE Corporation. All rights reserved.
  5 | 
  6 | Redistribution and use in source and binary forms, with or without
  7 | modification, are permitted provided that the following conditions
  8 | are met:
  9 | 1. Redistributions of source code must retain the above copyright
 10 |    notice, this list of conditions and the following disclaimer.
 11 | 2. Redistributions in binary form must reproduce the above copyright
 12 |    notice, this list of conditions and the following disclaimer in the
 13 |    documentation and/or other materials provided with the distribution.
 14 | 
 15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 18 | ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 21 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 22 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 23 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 24 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 25 | SUCH DAMAGE.
 26 | 
 27 | */
 28 | 
 29 | #ifndef SNGCOMMON_H
 30 | #define SNGCOMMON_H
 31 | 
 32 | #include <stdint.h>   //uint32_t, used by mi_data and the ngram_t_* macros
 33 | #include <pthread.h>  //pthread_mutex_t, used by mi_data
 34 | #include <vector>
 35 | #include <string>
 36 | typedef struct _mi_data { //Bundle of a file list, an index pointer and two mutexes
 37 |     std::vector <std::string>* fileList;
 38 |     uint32_t queue;
 39 |     void* ngramindex; //untyped here; NOTE(review): presumably an nGram index object -- confirm
 40 |     uint32_t ngramSize;
 41 |     //NOTE(review): presumably guard fileList and ngramindex respectively -- confirm against users
 42 |     pthread_mutex_t filesMutex;
 43 |     pthread_mutex_t nGramIndexMutex;
 44 | } mi_data;
 45 | 
 46 | #define ngram_t_endian              uint32_t
 47 | #define ngram_t_version             uint8_t
 48 | #define ngram_t_size                uint8_t
 49 | #define ngram_t_fnlength            uint16_t
 50 | #define ngram_t_indexcount          uint32_t
 51 | #define ngram_t_fidtype             uint32_t
 52 | 
 53 | #define ngram_t_indexfcount         uint32_t
 54 | #define ngram_t_offset              uint64_t
 55 | #define ngram_t_numfiles            uint32_t
 56 | 
 57 | 
 58 | #define ENDIAN_CHECK                            0x01234567
 59 | #define VERSION                                 0x01
 60 | 
 61 | //File Header Fields in bytes
 62 | //Shared
 63 | #define ENDIAN_CHECK_FIELD                      sizeof(ngram_t_endian)
 64 | #define VERSION_FIELD                           sizeof(ngram_t_version)
 65 | #define NGRAM_SIZE_FIELD                        sizeof(ngram_t_size)
 66 | //fileid file only
 67 | #define MAX_FILENAME_LENGTH_FIELD               sizeof(ngram_t_fnlength)
 68 | //#define MAX_FILES_PER_NGRAM_FIELD               4 //TODO delete
 69 | #define NUM_INDEX_FILES_FIELD                   sizeof(ngram_t_indexcount)
 70 | #define NUM_FILES_FIELD                         sizeof(ngram_t_fidtype) //Number of files in catalog
 71 | 
 72 | #define INDEX_HEADER_NUM_FILES_FIELD            sizeof(ngram_t_indexfcount)//number of files in an index
 73 | 
 74 | //index file only
 75 | #define OFFSET_FIELD                            sizeof(ngram_t_offset) //64-bit offset into ngram file
 76 | #define INDEX_NUM_FILES_FIELD                   sizeof(ngram_t_numfiles) //how many files in that ngram
 77 | //The sums below are parenthesized so they stay intact inside larger expressions
 78 | #define FILID_HEADER_SIZE   (ENDIAN_CHECK_FIELD + \
 79 |                             VERSION_FIELD + \
 80 |                             NGRAM_SIZE_FIELD + \
 81 |                             MAX_FILENAME_LENGTH_FIELD + \
 82 |                             NUM_INDEX_FILES_FIELD + \
 83 |                             NUM_FILES_FIELD)
 84 | 
 85 | #define FILID_NUM_INDEX_OFFSET  (ENDIAN_CHECK_FIELD + \
 86 |                                 VERSION_FIELD +  \
 87 |                                 NGRAM_SIZE_FIELD +  \
 88 |                                 MAX_FILENAME_LENGTH_FIELD)
 89 | 
 90 | #define FILID_NUM_FILES_OFFSET  (ENDIAN_CHECK_FIELD + \
 91 |                                 VERSION_FIELD + \
 92 |                                 NGRAM_SIZE_FIELD + \
 93 |                                 MAX_FILENAME_LENGTH_FIELD + \
 94 |                                 NUM_INDEX_FILES_FIELD)
 95 | 
 96 | #define INDEX_HEADER_SIZE   (ENDIAN_CHECK_FIELD + \
 97 |                             VERSION_FIELD + \
 98 |                             NGRAM_SIZE_FIELD + \
 99 |                             INDEX_HEADER_NUM_FILES_FIELD)
100 | 
101 | #define INDEX_ENTRY_SIZE    (OFFSET_FIELD + \
102 |                             INDEX_NUM_FILES_FIELD)
103 | //Ngram File Constants
104 | #define NGRAM_FILE_EXTENSION    ".ngram"
105 | #define INDEX_FILE_EXTENSION    ".index"
106 | #define FILEID_FILE_EXTENSION   ".sngfs"
107 | 
108 | // These defines govern the file number at the end of the file name.
109 | // The format string sets how many zero-padded digits are used (8 by default).
110 | // The buffer size is the size of the string buffer the number is
111 | // printed into; the digit count must be smaller than the buffer
112 | // size so that the terminating NUL still fits
113 | #define FILE_NUM_SPRINTF_STRING "%08u"
114 | #define FILE_NUM_BUFFER_SIZE 30
115 | #define FILE_MODE ((mode_t)0775)
116 | 
117 | #endif
118 | 


--------------------------------------------------------------------------------
/include/file.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Approved for Public Release; Distribution Unlimited: 13-1937
  3 | 
  4 | Copyright (c) 2014 The MITRE Corporation. All rights reserved.
  5 | 
  6 | Redistribution and use in source and binary forms, with or without
  7 | modification, are permitted provided that the following conditions
  8 | are met:
  9 | 1. Redistributions of source code must retain the above copyright
 10 |    notice, this list of conditions and the following disclaimer.
 11 | 2. Redistributions in binary form must reproduce the above copyright
 12 |    notice, this list of conditions and the following disclaimer in the
 13 |    documentation and/or other materials provided with the distribution.
 14 | 
 15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 18 | ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 21 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 22 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 23 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 24 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 25 | SUCH DAMAGE.
 26 | 
 27 | */
 28 | 
 29 | #ifndef SNGFILE_H
 30 | #define SNGFILE_H
 31 | 
 32 | #include <stdint.h>
 33 | #include <cstddef>
 34 | #include <cstdlib>
 35 | #include <sys/types.h>
 36 | #include <sys/stat.h>
 37 | 
 38 | namespace snugglefish {
 39 |    
 40 |     //Base class that keeps track of a single file, identified by name.
 41 |     //Abstraction over the plain C way of reading and writing files,
 42 |     //probably lighter-weight than the C++ stream classes.
 43 |     //It's re-inventing the wheel a bit, but the aim was to keep it
 44 |     //as light weight as possible without using pure C
 45 |     class file
 46 |     {
 47 | 
 48 |         public:
 49 |             //Constructor -- buffersize defaults to 1024*1024*8*32 = 268435456 (presumably bytes)
 50 |             file(const char* fileName, size_t buffersize = (1024 * 1024 * 8 * 32));
 51 | 
 52 |             //Destructor
 53 |             //Calls close, then frees up malloc'd elements
 54 |             ~file();
 55 | 
 56 |             //Opens a file with flags O_RDWR | O_CREAT; filemode defaults to 0755
 57 |             //Allocates a buffer for writing
 58 |             bool create(mode_t filemode = (mode_t)0755);
 59 | 
 60 |             //Opens a file either O_RDONLY or O_RDWR based on readwrite value
 61 |             //Allocates a buffer if in write mode
 62 |             bool open(char readwrite);
 63 | 
 64 | 
 65 |             //Mmaps the file read-only
 66 |             uint8_t* mmap();
 67 | 
 68 |             //Closes a file, flushes buffer first
 69 |             //frees buffer if allocated in open or create
 70 |             //Also closes mmap if opened
 71 |             bool close();
 72 | 
 73 |             
 74 |             //Read, just front-ends the read syscall
 75 |             void read(uint8_t* destination, size_t length);
 76 | 
 77 |             //Read at specific locations (offset from SEEK_SET). location is an int32_t, so
 78 |             //offsets above 2^31-1 cannot be expressed. Mixing read_at and read needs care
 79 |             void read_at(int32_t location, uint8_t* destination, size_t length);
 80 | 
 81 |             //Buffered writer
 82 |             bool write(uint8_t* data, size_t length);
 83 | 
 84 |             //Non-Buffered write to specific locations (offset from SEEK_SET, int32_t range only)
 85 |             bool write_at(int32_t location, uint8_t* data, size_t length);
 86 | 
 87 |             //Flush anything buffered
 88 |             bool flush();
 89 | 
 90 |             //Returns the size of the file using stat
 91 |             const size_t get_size();
 92 | 
 93 |             //Does file exist
 94 |             const bool exists();
 95 |             
 96 |     
 97 |         protected:
 98 |             char* filename;
 99 |             bool readonly;
100 | 
101 |         private:
102 |             int32_t fd;  //File Descriptor
103 |             uint8_t* mmapFile; //NOTE(review): presumably the mapping returned by mmap() -- confirm in file.cpp
104 |             size_t  size; //Size of File -- used for mmap purposes
105 |             char* buffer; //buffer used by the buffered writer (see create/open/write above)
106 |             size_t buffersize; //NOTE(review): presumably the allocated size of buffer -- confirm
107 |             size_t bufferused; //NOTE(review): presumably the part of buffer not yet flushed -- confirm
108 |             size_t bufferparam; //NOTE(review): presumably the constructor's buffersize argument -- confirm
109 | 
110 |             //Write helper taking an explicit descriptor; its definition is not in this header
111 |             bool real_write(int fd, uint8_t* data, size_t length);
112 | 
113 |     };
114 |     
115 | }
116 | 
117 | #endif
118 | 


--------------------------------------------------------------------------------
/include/fileIndexer.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Approved for Public Release; Distribution Unlimited: 13-1937
 3 | 
 4 | Copyright (c) 2014 The MITRE Corporation. All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions
 8 | are met:
 9 | 1. Redistributions of source code must retain the above copyright
10 |    notice, this list of conditions and the following disclaimer.
11 | 2. Redistributions in binary form must reproduce the above copyright
12 |    notice, this list of conditions and the following disclaimer in the
13 |    documentation and/or other materials provided with the distribution.
14 | 
15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 | ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 | SUCH DAMAGE.
26 | 
27 | */
28 | 
29 | 
30 | #ifndef FILEINDEXER_H
31 | #define FILEINDEXER_H
32 | 
33 | #include <stdint.h>
34 | #include <vector>
35 | 
36 | 
37 | namespace snugglefish {
38 |     //Class that turns the contents of individual files into nGrams (see processFile)
39 |     class fileIndexer
40 |     {
41 |         public:
42 |             
43 |             //Constructor -- only takes ngramLength
44 |             fileIndexer(uint8_t ngramLength);
45 | 
46 |             //Processes the nGrams from a file -- returns a pointer to a vector of uint32_t
47 |             //(presumably the nGrams that were found). Calling function must cleanup
48 |             std::vector<uint32_t>* processFile(const char* fileName);
49 |             //Two processNgrams overloads: one returns a vector, the other takes a bool array
50 |         private:
51 |             std::vector<uint32_t>* processNgrams(unsigned char *buf, uint64_t fileSize);
52 |             void processNgrams(unsigned char *buf, uint64_t fileSize, bool ngramList[]);
53 | 
54 |             uint64_t filesProcessed;
55 |             uint32_t ngramLength; //stored as uint32_t although the constructor takes a uint8_t
56 |             uint64_t maxNgram;
57 |             uint32_t pagesize;
58 | 
59 | 
60 |             
61 |     };
62 | }
63 | #endif
64 | 


--------------------------------------------------------------------------------
/include/indexSet.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Approved for Public Release; Distribution Unlimited: 13-1937
  3 | 
  4 | Copyright (c) 2014 The MITRE Corporation. All rights reserved.
  5 | 
  6 | Redistribution and use in source and binary forms, with or without
  7 | modification, are permitted provided that the following conditions
  8 | are met:
  9 | 1. Redistributions of source code must retain the above copyright
 10 |    notice, this list of conditions and the following disclaimer.
 11 | 2. Redistributions in binary form must reproduce the above copyright
 12 |    notice, this list of conditions and the following disclaimer in the
 13 |    documentation and/or other materials provided with the distribution.
 14 | 
 15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 18 | ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 21 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 22 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 23 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 24 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 25 | SUCH DAMAGE.
 26 | 
 27 | */
 28 | 
 29 | 
 30 | #ifndef SNGINDEXSET_H
 31 | #define SNGINDEXSET_H
 32 | 
 33 | 
 34 | #include "file.h"
 35 | #include "common.h"
 36 | #include <string>
 37 | #include <vector>
 38 | #include <list>
 39 | 
 40 | namespace snugglefish {
 41 | 
 42 |     //Storage container for index entries (an 8-byte offset plus a 4-byte file count).
 43 |     //Due to padding the struct may be larger than those 12 bytes, so we should
 44 |     //typecast on the fly when we want to use this
 45 |     struct index_entry{
 46 |         uint64_t offset;
 47 |         uint32_t num_files;
 48 |     };
 49 | 
 50 |     class indexSet 
 51 |     {
 52 |         //Pairs one index file with one nGram file (indexFile / nGramFile below)
 53 |         public:
 54 |             indexSet(const char* fileBase, uint32_t count, uint8_t nGramSize);
 55 |             ~indexSet();
 56 | 
 57 |             void create(ngram_t_numfiles nFiles = 0);
 58 |             void addNGrams(uint32_t ngram, std::list<ngram_t_fidtype> *files);
 59 |             void updateNumFiles(ngram_t_numfiles count);
 60 | 
 61 |             //Opens and mmaps both the Index and NGram File
 62 |             void open();
 63 | 
 64 |             void close();
 65 | 
 66 |             //Get number of files with given ngram
 67 |             size_t getNGramCount(uint64_t ngram);
 68 | 
 69 |             //Returns mmap'd location of given ngram; count is an out-parameter (presumably the number of ids there)
 70 |             ngram_t_fidtype* getNGrams(uint64_t ngram, size_t* count);
 71 | 
 72 |         private:
 73 |             void addIndexData(uint64_t offset, uint32_t nFiles);
 74 | 
 75 |             //The two files of this set
 76 |             file* indexFile;
 77 |             file* nGramFile;
 78 |             //NOTE(review): presumably the mmap'd views of the two files, set by open() -- confirm
 79 |             uint8_t* indexMap;
 80 |             uint8_t* nGramMap;
 81 |             uint8_t* indexEntries;
 82 | 
 83 |             std::string fileBase;
 84 | 
 85 |             bool writable;
 86 | 
 87 |             //Index File header elements
 88 |             ngram_t_endian      endian_check;
 89 |             ngram_t_version     version;
 90 |             ngram_t_size        ngramLength;
 91 |             ngram_t_numfiles    numFiles;
 92 | 
 93 |             uint32_t count;
 94 |             uint64_t offset;
 95 | 
 96 |     };
 97 | 
 98 | }
 99 | #endif
100 | 


--------------------------------------------------------------------------------
/include/loserTree.hpp:
--------------------------------------------------------------------------------
  1 | /*
  2 | Approved for Public Release; Distribution Unlimited: 13-1937
  3 | 
  4 | 
  5 | Copyright (c) 2014 The MITRE Corporation. All rights reserved.
  6 | 
  7 | Redistribution and use in source and binary forms, with or without
  8 | modification, are permitted provided that the following conditions
  9 | are met:
 10 | 1. Redistributions of source code must retain the above copyright
 11 |    notice, this list of conditions and the following disclaimer.
 12 | 2. Redistributions in binary form must reproduce the above copyright
 13 |    notice, this list of conditions and the following disclaimer in the
 14 |    documentation and/or other materials provided with the distribution.
 15 | 
 16 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 17 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 19 | ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 22 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 23 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 24 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 25 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 26 | SUCH DAMAGE.
 27 | 
 28 | */
 29 | 
 30 | #ifndef LOSERTREE
 31 | #define LOSERTREE
 32 | 
 33 | #include <stdint.h>
 34 | #include <vector>
 35 | #include <cmath>
 36 | 
 37 | namespace snugglefish {
 38 | 
 39 |     using namespace std;
 40 |     //NOTE(review): the directive above applies inside namespace snugglefish in every file including this header
 41 |     //Forward declarations: treeElement and playerNode refer to these before they are defined
 42 | 
 43 |     template<class comparable>
 44 |     class playerNode;
 45 |     template<class comparable>
 46 |     class matchNode;
 47 | 
 48 |     template<class comparable>
 49 |     struct treeElement { //A value together with the player node it belongs to
 50 |         comparable value;
 51 |         playerNode<comparable>*   pNodePtr; //pointer to the player node where this should reside
 52 | 
 53 |         treeElement(comparable value, playerNode<comparable>* pNodePtr)
 54 |             : value(value), pNodePtr(pNodePtr) {}
 55 | 
 56 |         //Default: value-initialized value and a null player pointer instead of garbage
 57 |         treeElement() : value(), pNodePtr(0) {}
 58 |     };
 59 | 
 60 |     template<class comparable>
 61 |     class treeNode{ //Interface shared by player nodes and match nodes
 62 |         public:
 63 |             virtual ~treeNode() {} //nodes are deleted through treeNode pointers
 64 |             virtual comparable getValue() = 0; //pure virtual function
 65 |             virtual treeElement<comparable> getTreeElement() = 0;
 66 |             virtual uint8_t isPlayer() = 0; //1 for player nodes, 0 for match nodes
 67 |     };
 68 |     //Player node: sits below a match node and is bound to one of the player lists
 69 |     template<class comparable>
 70 |     class playerNode: public treeNode<comparable> {
 71 |         public:
 72 |             playerNode(matchNode<comparable>* parent, std::vector<comparable>* playerList, uint32_t playerListSize, uint32_t playerId, comparable sentinel);
 73 |             comparable getValue();
 74 |             treeElement<comparable> getTreeElement() {
 75 |                 treeElement<comparable> current(this->getValue(), this);
 76 |                 return current;
 77 |             }
 78 |             void advancePlayer() { ++index_id; }
 79 |             uint8_t isPlayer() { return 1; }
 80 |             const uint32_t getId() { return position; }
 81 |             matchNode<comparable>* parent; //match node this player was created under
 82 |         private:
 83 |             std::vector<comparable>* player; //player for this player node
 84 |             uint32_t position; //index of that player within the player list
 85 |             uint32_t index_id; //where in the list are you
 86 |             comparable sentinel;
 87 |     };
 88 | 
 89 |     //Match node: keeps the loser of the match played between its two children
 90 |     template<class comparable>
 91 |     class matchNode: public treeNode<comparable> {
 92 |         public:
 93 |             comparable getValue() { return this->element.value; }
 94 |             treeElement<comparable> getTreeElement() { return element; }
 95 |             uint8_t isPlayer() { return 0; }
 96 |             matchNode<comparable>* parent; //0 for the root
 97 |             treeNode<comparable>* left;
 98 |             treeNode<comparable>* right; //0 when there is no right subtree
 99 |             treeElement<comparable> element; //loser stored at this match
100 |             uint32_t depth; //the root has depth 1
101 |     };
102 | 
103 |     template<class comparable>
104 |     class loserTree{ //Tournament over playerListSize vectors; the smaller value wins a match
105 |         public:
106 |             loserTree(std::vector<comparable>* playerList, uint32_t listSize, comparable sentinel);
107 |             //TODO getWinner renamed not public
108 |             comparable getWinnerValue(); //value of the current overall winner
109 |             uint32_t getWinnerId(); //id of the player node holding the current winner
110 |             void playNextMatch(); //advances the winning player and replays its matches up to the root
111 |             ~loserTree();
112 | 
113 |         //NOTE(review): removeAndPlay, buildTree and getMatchWinner have no definition in the visible part of this file
114 |         private:
115 |             void removeAndPlay(treeElement<comparable>* winner);
116 |             void buildTree();
117 |             void destroyChildren(matchNode<comparable>* node);
118 |             void playUp(matchNode<comparable>* node, treeElement<comparable> winner, matchNode<comparable>* winnerNode);
119 |             treeElement<comparable> getMatchWinner(treeElement<comparable> & loser);
120 |             treeElement<comparable> buildChildren(matchNode<comparable>* node);
121 |             treeElement<comparable> getWinner();
122 | 
123 |             std::vector<comparable>* playerList; //pointer to the first of playerListSize vectors, indexed as playerList[id]
124 |             uint32_t playerListSize;
125 |             uint32_t playerId; //to keep track of which vectors we've assigned
126 | 
127 |             uint32_t match_depth; //depth at which match nodes get player nodes as children
128 |             treeElement<comparable> winner; //current overall winner
129 |             matchNode<comparable>* root;
130 |             comparable sentinel;
131 |     };
132 | 
133 | //DEFINITIONS
134 | 
135 |     //Builds the whole tree for listSize player lists and records the first overall winner
136 |     template<class comparable>
137 |     loserTree<comparable>::loserTree(std::vector<comparable>* playerList, uint32_t listSize, comparable sentinel){
138 |         this->sentinel = sentinel;
139 |         this->playerList = playerList;
140 |         this->playerListSize = listSize;
141 |         this->playerId = 0;
142 | 
143 |         //Match depth, not including players, is ceil(log2(listSize)). Counted with integers:
144 |         //a log10 ratio can be off by one through rounding and is undefined for listSize == 0
145 |         this->match_depth = 0;
146 |         for (uint64_t capacity = 1; capacity < listSize; capacity <<= 1) this->match_depth++;
147 | 
148 |         this->root = new matchNode<comparable>;
149 |         this->root->parent = 0;
150 |         this->root->depth = 1;
151 |         this->winner = this->buildChildren(this->root);
152 |     }
153 | 
154 |     template<class comparable>
155 |     loserTree<comparable>::~loserTree(){
156 |         //Recursively destroy everything below the root, then the root itself
157 |         this->destroyChildren(this->root);
158 |         delete this->root; //destroyChildren never frees the node it is called on
159 |     }
160 | 
161 |     //Hands back the current overall winner (value plus its player node pointer)
162 |     template<class comparable>
163 |     treeElement<comparable> loserTree<comparable>::getWinner()
164 |     { return winner; }
165 | 
166 |     //Id of the player node holding the current winner; winner.pNodePtr must not be 0
167 |     template<class comparable>
168 |     uint32_t loserTree<comparable>::getWinnerId()
169 |     { return winner.pNodePtr->getId(); }
170 | 
171 |     //Value of the current overall winner
172 |     template<class comparable>
173 |     comparable loserTree<comparable>::getWinnerValue()
174 |     { return winner.value; }
175 | 
176 |     //Advances the player that holds the current winner and replays its matches up to the root
177 |     template<class comparable>
178 |     void loserTree<comparable>::playNextMatch(){
179 |         playerNode<comparable>* leaf = this->winner.pNodePtr;
180 |         if (leaf == 0) return; //winner without a player node: nothing to advance
181 |         leaf->advancePlayer();
182 |         this->playUp(leaf->parent, leaf->getTreeElement(), (matchNode<comparable>*)leaf);
183 |     }
184 |     //Starting at the deepest match node, go upwards: replays every match between
185 |     //node and the root. winner is the element coming up from below, winnerNode the
186 |     //child of node it came up through
187 |     template<class comparable>
188 |     void loserTree<comparable>::playUp(matchNode<comparable>* node, treeElement<comparable> winner, matchNode<comparable>* winnerNode){
189 |         //The incoming element loses against the stored one when it is larger, or
190 |         //when both are equal and it came up through the right child
191 |         const bool incomingLoses =
192 |             winner.value > node->element.value ||
193 |             (winner.value == node->element.value &&
194 |                 node->right == winnerNode);
195 | 
196 |         //The loser stays in this node, the other element moves on
197 |         treeElement<comparable> advancing = winner;
198 |         if (incomingLoses){
199 |             advancing = node->element;
200 |             node->element = winner;
201 |         }
202 | 
203 |         if (node->depth == 1){ //root node: nothing above, so this is the overall winner
204 |             this->winner = advancing;
205 |         }else{
206 |             this->playUp(node->parent, advancing, node);
207 |         }
208 |     }
209 | 
210 |     //Frees every node below node; node itself is left to the caller
211 |     template<class comparable>
212 |     void loserTree<comparable>::destroyChildren(matchNode<comparable>* node){
213 |         //Go by node type, not depth: player nodes have no depth field and can
214 |         //also hang off matches above match_depth (see buildChildren)
215 |         if (node->isPlayer()) return;
216 | 
217 |         treeNode<comparable>* children[2] = { node->left, node->right };
218 |         for (int i = 0; i < 2; i++){
219 |             if (children[i] == 0) continue; //a missing right subtree is stored as 0
220 |             //Delete through the real type so the matching destructor runs
221 |             if (children[i]->isPlayer()){
222 |                 delete static_cast<playerNode<comparable>*>(children[i]);
223 |             }else{
224 |                 this->destroyChildren(static_cast<matchNode<comparable>*>(children[i]));
225 |                 delete static_cast<matchNode<comparable>*>(children[i]);
226 |             }
227 |         }
228 |     }
229 |     //Builds everything below node, handing out player lists from left to right.
230 |     //Stores the loser of node's match in node and returns the winner
231 |     template<class comparable>
232 |     treeElement<comparable> loserTree<comparable>::buildChildren(matchNode<comparable>* node){
233 |         treeElement<comparable> right,left;
234 | 
235 |         //playerId + 1 rather than playerListSize - 1: the subtraction wraps around for
236 |         //an empty player list and the tree would then grow without bound
237 |         if (node->depth == this->match_depth ||
238 |                 this->playerId + 1 >= this->playerListSize) { //Children are player nodes
239 |             playerNode<comparable>* leftPlayer = new playerNode<comparable>(node, this->playerList,
240 |                     this->playerListSize, this->playerId++, this->sentinel);
241 |             playerNode<comparable>* rightPlayer = new playerNode<comparable>(node, this->playerList,
242 |                     this->playerListSize, this->playerId++, this->sentinel);
243 |             node->left = leftPlayer;
244 |             node->right = rightPlayer;
245 | 
246 |             left = leftPlayer->getTreeElement();
247 |             right = rightPlayer->getTreeElement();
248 |         }else{//Regular match node where children are matches
249 |             matchNode<comparable>* leftMatch = new matchNode<comparable>;
250 |             leftMatch->parent = node;
251 |             leftMatch->depth = node->depth + 1;
252 |             node->left = leftMatch;
253 |             left = this->buildChildren(leftMatch);
254 | 
255 |             if(this->playerId >= this->playerListSize){
256 |                 //there are no more lists to sort, seed this instead
257 |                 node->right = 0;
258 |                 right = treeElement<comparable>(this->sentinel, 0);
259 |             }else{
260 |                 matchNode<comparable>* rightMatch = new matchNode<comparable>;
261 |                 rightMatch->parent = node;
262 |                 rightMatch->depth = node->depth + 1;
263 |                 node->right = rightMatch;
264 |                 right = this->buildChildren(rightMatch);
265 |             }
266 |         }
267 | 
268 |         if (right.value < left.value) { //right is the winner
269 |             node->element = left; //store the loser
270 |             return right; //return the winner
271 |         }else{//right >= left: left wins, so ties go to the left
272 |             node->element = right;
273 |             return left;
274 |         }
275 |     }
276 | 
277 | 
278 | 
279 | 
280 |     template<class comparable>
281 |     playerNode<comparable>::playerNode(matchNode<comparable>* parent, vector<comparable>* playerList, uint32_t playerListSize,  uint32_t playerId, comparable sentinel){
282 |         this->parent = parent;
283 |         this->sentinel = sentinel;
284 |         if(playerId >= playerListSize){//uneven 
285 |             this->player = 0;    
286 |         }else{
287 |             this->player = & (playerList[playerId]);
288 |             this->position = playerId;
289 |         }
290 |         this->index_id = 0;
291 |     }
292 | 
293 | 
294 |     template<class comparable>
295 |     comparable playerNode<comparable>::getValue(){
296 |         if(player && index_id < ((std::vector<comparable>)(*player)).size()) 
297 |             return ((std::vector<comparable>)(*player))[index_id];
298 |         else
299 |              return this->sentinel;
300 |     }
301 | 
302 | 
303 | 
304 | 
305 | 
306 | 
307 | }
308 | 
309 | 
310 | #endif
311 | 


--------------------------------------------------------------------------------
/include/nGramBase.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Approved for Public Release; Distribution Unlimited: 13-1937
 3 | 
 4 | Copyright (c) 2014 The MITRE Corporation. All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions
 8 | are met:
 9 | 1. Redistributions of source code must retain the above copyright
10 |    notice, this list of conditions and the following disclaimer.
11 | 2. Redistributions in binary form must reproduce the above copyright
12 |    notice, this list of conditions and the following disclaimer in the
13 |    documentation and/or other materials provided with the distribution.
14 | 
15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 | ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 | SUCH DAMAGE.
26 | 
27 | */
28 | 
29 | 
30 | #ifndef NGRAMBASE_H
31 | #define NGRAMBASE_H
32 | 
33 | #include <list>
34 | #include <vector>
35 | #include <string>
36 | #include <fstream>
37 | #include <stdint.h>
38 | #include "common.h"
39 | #include "file.h"
40 | #include "indexSet.h"
41 | #include "smFile.h"
42 | 
43 | namespace snugglefish {
44 | 
45 | 
46 |     //Class to keep track of nGram Index files
47 |     class nGramBase
48 |     {
49 |         public:
50 |             
51 |             nGramBase( uint32_t ngramLength, std::string indexFileName);
52 |             
53 |         protected:
54 |             uint16_t maxFileNameLength;
55 |             std::string baseFileName;
56 |             uint8_t ngramLength; // the ngram size we're using
57 |             uint64_t maxNgram; 
58 |             
59 | 
60 |         private:
61 |             
62 | 
63 |     };
64 | 
65 | }
66 | #endif
67 | 


--------------------------------------------------------------------------------
/include/nGramIndex.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Approved for Public Release; Distribution Unlimited: 13-1937
  3 | 
  4 | Copyright (c) 2014 The MITRE Corporation. All rights reserved.
  5 | 
  6 | Redistribution and use in source and binary forms, with or without
  7 | modification, are permitted provided that the following conditions
  8 | are met:
  9 | 1. Redistributions of source code must retain the above copyright
 10 |    notice, this list of conditions and the following disclaimer.
 11 | 2. Redistributions in binary form must reproduce the above copyright
 12 |    notice, this list of conditions and the following disclaimer in the
 13 |    documentation and/or other materials provided with the distribution.
 14 | 
 15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 18 | ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 21 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 22 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 23 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 24 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 25 | SUCH DAMAGE.
 26 | 
 27 | */
 28 | 
 29 | #ifndef NGRAMINDEX_H
 30 | #define NGRAMINDEX_H
 31 | 
 32 | #include "nGramBase.h"
 33 | #include <list>
 34 | #include <vector>
 35 | #include <string>
 36 | #include <fstream>
 37 | #include <stdint.h>
 38 | #include "smFile.h"
 39 | #include "indexSet.h"
 40 | 
 41 | #define BUFFER_NODE_SIZE        sizeof(ngram_t_fidtype) + 24 //24 is upper bound per node
 42 | 
 43 | //Index File Constants
 44 | #define DEFAULT_MAX_FILENAME_SIZE                   65 //64 characters + null terminator
 45 | #define TWO_GB                                      2147483648 //1024 * 1024 * 1204 * 2 
 46 | #define FOUR_GB                                     TWO_GB * 2
 47 | #define MAX_BUFFER_SIZE                             FOUR_GB 
 48 | 
 49 |     //4MB = 4194304 
 50 |     //8MB = 8388608 
 51 | #define WRITE_BUFFER_SIZE                           1024 * 1024 * 8 * 32 //256MB
 52 | 
 53 | namespace snugglefish {
 54 | 
 55 |     //Storage container for output buffer
 56 |     struct buffer_element{
 57 |         uint64_t    elements_size; //how many elements in list
 58 |                                    //stl list size() will iterate everytime, easier to keep
 59 |                                    //static counter
 60 |         std::list<ngram_t_fidtype>* elements;
 61 |     };
 62 | 
 63 | 
 64 |     //Class to keep track of nGram Index files
 65 |     class nGramIndex: public nGramBase
 66 |     {
 67 |         public:
 68 |             
 69 |             nGramIndex( uint32_t ngramLength, std::string indexFileName);
 70 |             ~nGramIndex();
 71 | 
 72 |             //Accessors
 73 |             const uint32_t getmaxFileNameLength(){return this->maxFileNameLength;}   
 74 |             const uint64_t getmaxBufferSize(){return this->bufferMax;}
 75 | 
 76 |             //Setters
 77 |             void setmaxFileNameLength(uint32_t length){ this->maxFileNameLength = length; }
 78 |             void setmaxBufferSize(uint64_t size){this->bufferMax = size;}
 79 |             
 80 |             //Write Mode
 81 |             void addNGrams(std::vector<uint32_t>* nGramList, std::string filename);
 82 |             //void addNGrams(bool nGramList[], std::string filename, int flag);
 83 | 
 84 |             void getStats(uint64_t & totalFiles, uint64_t& sessionFiles, uint64_t& indexFiles, bool& flushing);
 85 |     
 86 |         private:
 87 |             //Write Mode Functions
 88 |             void flushAll();
 89 |             void flushMaster();
 90 |             void flushIndex(ngram_t_indexfcount num_files );
 91 | 
 92 |            
 93 |             //Write Mode Variables
 94 |             uint64_t bufferMax;
 95 |             buffer_element* output_buffer;
 96 |             uint64_t buffer_memory_usage; //how many bytes is the buffer storing (only file ids)
 97 |             std::vector< std::string > fileNameList;
 98 |             bool flush;
 99 |             bool flushing;
100 | 
101 | 
102 |             smFile* masterFile;
103 | 
104 |             uint64_t numFilesProcessed;
105 |             uint64_t numSessionFilesProcessed; //How many files have been processed this session
106 |  
107 | 
108 |     };
109 | 
110 | }
111 | #endif
112 | 


--------------------------------------------------------------------------------
/include/nGramSearch.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Approved for Public Release; Distribution Unlimited: 13-1937
  3 | 
  4 | Copyright (c) 2014 The MITRE Corporation. All rights reserved.
  5 | 
  6 | Redistribution and use in source and binary forms, with or without
  7 | modification, are permitted provided that the following conditions
  8 | are met:
  9 | 1. Redistributions of source code must retain the above copyright
 10 |    notice, this list of conditions and the following disclaimer.
 11 | 2. Redistributions in binary form must reproduce the above copyright
 12 |    notice, this list of conditions and the following disclaimer in the
 13 |    documentation and/or other materials provided with the distribution.
 14 | 
 15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 18 | ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 21 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 22 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 23 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 24 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 25 | SUCH DAMAGE.
 26 | 
 27 | */
 28 | 
 29 | #ifndef NGRAMSEARCH_H
 30 | #define NGRAMSEARCH_H
 31 | 
 32 | #include "nGramBase.h"
 33 | #include <list>
 34 | #include <vector>
 35 | #include <string>
 36 | #include <fstream>
 37 | #include <stdint.h>
 38 | #include <utility>
 39 | #include <pthread.h>
 40 | #include <queue>
 41 | #include "smFile.h"
 42 | #include "indexSet.h"
 43 | 
 44 | namespace snugglefish {
 45 | 
 46 |     typedef struct _thread_data{
 47 |         uint32_t queue;
 48 |         uint32_t maximumIndex;
 49 |         uint32_t ngramLength;
 50 | 
 51 |         smFile* masterFile;
 52 |         std::vector<std::string>* matchedFiles;
 53 |         std::string* baseFileName;
 54 |         std::vector<uint64_t>* nGramQuery;
 55 | 
 56 |         pthread_mutex_t queueMutex;
 57 |         pthread_mutex_t smFileMutex; 
 58 |         pthread_mutex_t mfMutex;
 59 |     } thread_data;
 60 | 
 61 | 
 62 |     //Class to keep track of nGram Index files
 63 |     class nGramSearch: public nGramBase
 64 |     {
 65 |         public:
 66 |             
 67 |             nGramSearch( uint32_t ngramLength, std::string indexFileName);
 68 |             nGramSearch( uint32_t ngramLength, std::string indexFileName, uint32_t threads);
 69 |             ~nGramSearch();
 70 | 
 71 | 
 72 |             //Read Mode
 73 |             std::vector<std::string>* searchNGrams(std::vector<uint64_t> nGramQuery);
 74 |             std::vector<uint64_t>* stringToNGrams(std::string searchString);
 75 | 
 76 |         protected:
 77 | 
 78 |         private:
 79 |             //FUNCTIONS
 80 |             static std::list< std::pair<uint64_t, size_t> > orderNGrams(indexSet* index, const std::vector<uint64_t>& nGramQuery);
 81 |             //Alpha is just a placeholder name for this search type
 82 |             //I envision there will be multiple search types
 83 |             static std::list<ngram_t_fidtype> searchAlpha(indexSet* index, std::list< std::pair<uint64_t,size_t> > & queryList);
 84 |             static void* searchNGramThread(void* input);
 85 | 
 86 | 
 87 |             uint32_t numThreads;
 88 | 
 89 | 
 90 |             //Read Mode Variables
 91 |             smFile* masterFile;
 92 |             uint32_t numIndexFiles;
 93 |             uint32_t numFiles;
 94 | 
 95 |             
 96 |  
 97 | 
 98 |     };
 99 | 
100 | }
101 | #endif
102 | 


--------------------------------------------------------------------------------
/include/smFile.h:
--------------------------------------------------------------------------------
 1 | /*
 2 | Approved for Public Release; Distribution Unlimited: 13-1937
 3 | 
 4 | Copyright (c) 2014 The MITRE Corporation. All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions
 8 | are met:
 9 | 1. Redistributions of source code must retain the above copyright
10 |    notice, this list of conditions and the following disclaimer.
11 | 2. Redistributions in binary form must reproduce the above copyright
12 |    notice, this list of conditions and the following disclaimer in the
13 |    documentation and/or other materials provided with the distribution.
14 | 
15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 | ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 | SUCH DAMAGE.
26 | 
27 | */
28 | 
29 | #ifndef SNGMASTERFILE_H
30 | #define SNGMASTERFILE_H
31 | 
32 | 
33 | #include "file.h"
34 | #include "common.h"
35 | #include <string>
36 | 
37 | namespace snugglefish {
38 | 
39 |     class smFile: public file
40 |     {
41 | 
42 |         public:
43 |             smFile(std::string fileBase, uint8_t nGramSize);
44 |             ~smFile();
45 | 
46 |             void create(ngram_t_fnlength maxfnLength);
47 |             void open(char readwrite);
48 | 
49 |             bool flush();
50 | 
51 |             void addFileId(const char* fileName);
52 |             void updateIndexFileCount(ngram_t_indexcount count);
53 | 
54 |             const ngram_t_indexcount getNumIndexFiles() { return numIndexFiles; }
55 |             const ngram_t_fidtype getNumFiles() { return numFiles; }
56 | 
57 |             const char* getFilebyId(uint64_t id);
58 | 
59 |         private:
60 |             //Index File header elements
61 |             ngram_t_endian      endian_check;
62 |             ngram_t_version     version;
63 |             ngram_t_size        ngramLength;
64 |             ngram_t_fnlength    maxFileNameLength;
65 |             ngram_t_indexcount  numIndexFiles;
66 |             ngram_t_fidtype     numFiles;
67 | 
68 | 
69 |             char*   fileBuffer;
70 | 
71 |     };
72 | 
73 | }
74 | #endif
75 | 


--------------------------------------------------------------------------------
/include/snugglefish.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  Approved for Public Release; Distribution Unlimited: 13-1937
 3 |  
 4 |  Copyright (c) 2014 The MITRE Corporation. All rights reserved.
 5 |  
 6 |  Redistribution and use in source and binary forms, with or without
 7 |  modification, are permitted provided that the following conditions
 8 |  are met:
 9 |  1. Redistributions of source code must retain the above copyright
10 |  notice, this list of conditions and the following disclaimer.
11 |  2. Redistributions in binary form must reproduce the above copyright
12 |  notice, this list of conditions and the following disclaimer in the
13 |  documentation and/or other materials provided with the distribution.
14 |  
15 |  THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 |  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 |  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 |  ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 |  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 |  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 |  OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 |  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 |  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 |  OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 |  SUCH DAMAGE.
26 |  
27 |  */
28 | 
29 | #ifndef SNUGGLEFISH_H
30 | #define SNUGGLEFISH_H
31 | 
32 | #include <string>
33 | #include <vector>
34 | #include <stdint.h>
35 | #include <pthread.h>
36 | 
// Public entry points of snugglefish.
// NOTE(review): both declarations use C++ standard library types, so this
// header can only be consumed from C++ even though make_index is given C
// linkage below.
#ifdef __cplusplus
extern "C" {
#endif


    // Entry point for building an index from a list of files.
    //   indexFileName - name of the index to write
    //   fileNames     - files to add to the index
    //   ngramSize     - ngram length to index with
    //   max_files     - limit on files; presumably 0 means no limit, as in the Python bindings -- confirm
    //   max_buffer    - limit on buffer size; presumably 0 means no limit -- confirm
    //   threads       - number of threads to use
    void make_index(std::string indexFileName, std::vector <std::string> fileNames, uint32_t ngramSize, uint32_t max_files, uint64_t max_buffer, uint32_t threads);

#ifdef __cplusplus
}
#endif

// Entry point for searching an index for searchString. Returns a pointer to a
// vector of strings (presumably the matching file names; ownership of the
// vector is not visible from this header -- TODO confirm who frees it).
std::vector<std::string>* search(std::string indexFileName, std::string searchString, uint32_t ngramSize, uint32_t threads);
49 |     
50 | #endif
51 | 


--------------------------------------------------------------------------------
/include/utils.h:
--------------------------------------------------------------------------------
  1 | /*
  2 | Approved for Public Release; Distribution Unlimited: 13-1937
  3 | 
  4 | Copyright (c) 2014 The MITRE Corporation. All rights reserved.
  5 | 
  6 | Redistribution and use in source and binary forms, with or without
  7 | modification, are permitted provided that the following conditions
  8 | are met:
  9 | 1. Redistributions of source code must retain the above copyright
 10 |    notice, this list of conditions and the following disclaimer.
 11 | 2. Redistributions in binary form must reproduce the above copyright
 12 |    notice, this list of conditions and the following disclaimer in the
 13 |    documentation and/or other materials provided with the distribution.
 14 | 
 15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 18 | ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 21 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 22 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 23 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 24 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 25 | SUCH DAMAGE.
 26 | 
 27 | */
 28 | 
 29 | 
 30 | #ifndef SNUGGLE_UTILS_H
 31 | #define SNUGGLE_UTILS_H
 32 | 
 33 | 
 34 | #include <stdint.h>
 35 | #include <list>
 36 | #include <vector>
 37 | #include <algorithm> // std::sort
 38 | 
 39 | //TODO REMOVE
 40 | #include <iostream>
 41 | #include <bitset>
 42 | 
 43 | 
 44 | namespace snugglefish{
 45 | 
 46 |    
 47 |     std::list<uint8_t> vbencode_number(uint32_t number){
 48 |         std::list<uint8_t> bytes; 
 49 |         while (true){
 50 |             bytes.push_front(number % 128);
 51 |             if (number < 128){
 52 |                 break;
 53 |             }
 54 |             number /= 128;
 55 |         }
 56 |         bytes.back() += 128;
 57 |         return bytes;
 58 |     } 
 59 | 
 60 | 
 61 |     std::list<uint8_t> vbencode_numbers(const std::vector<uint32_t>& numbers){
 62 |         std::list<uint8_t> output;
 63 |         for(int32_t i = 0; i < numbers.size(); i++){
 64 |             std::list<uint8_t> out = vbencode_number(numbers[i]);
 65 |             if (i == 0){
 66 |                 output = out;
 67 |             }else{
 68 |                 output.splice(output.end(), out);
 69 |             }   
 70 |         }
 71 | 
 72 |         return output;
 73 |     }
 74 | 
 75 |     std::vector<uint32_t> vbdecode_numbers(std::list<uint8_t> numbers){
 76 |         std::vector<uint32_t> output;
 77 |         uint32_t n = 0;
 78 | 
 79 |         for (std::list<uint8_t>::const_iterator iterator = numbers.begin(); iterator != numbers.end(); iterator++){
 80 |             if (*iterator < 128){
 81 |                 n = 128 * n + *iterator;
 82 |             }else{
 83 |                 n = 128 * n + (*iterator - 128);
 84 |                 output.push_back(n);
 85 |                 n = 0;
 86 |             }
 87 |         } 
 88 | 
 89 |         return output;
 90 |     }
 91 | 
 92 | 
 93 |     uint32_t pfor_analyze_bits(std::vector<uint32_t> v, int b, uint32_t& length, uint32_t& exceptions){
 94 |         uint32_t len = 0, min = 0, range = 1 << b;
 95 | 
 96 |         for(uint32_t lo = 0, hi = 0; hi < v.size(); hi++){
 97 |             if(v[hi] - v[lo] >= range){
 98 |                 if(hi - lo > len){
 99 |                     min = lo; 
100 |                     len = hi - lo;
101 |                 }     
102 |                 while(v[hi] - v[lo] >= range) lo++;
103 |             }
104 |         }
105 |         exceptions = v.size() - len;
106 |         length = len + 1;
107 |         return min;
108 |     }
109 | 
110 | 
111 |     uint32_t bit_encode(uint32_t number, uint32_t bits, uint32_t& overflow){
112 |         overflow = number >> bits;
113 |         return number & (( 1 << bits) - 1);
114 |     }
115 | 
116 | 
117 |     uint32_t pfordelta_bitsize(const std::vector<uint32_t>& numbers, uint32_t& exceptions, uint32_t max_exceptions = 16){
118 |         //Sort the array
119 |         std::vector<uint32_t> sorted_deltas(numbers);
120 |         std::sort(sorted_deltas.begin(), sorted_deltas.end());
121 | 
122 |         uint32_t low_ratio = 0xFFFFFFFF;
123 |         uint8_t bits = 0;
124 |         for(uint32_t i = 3; i < 16; i++){//Assume 1 or 2 are too small TODO figure out best lower bound
125 |             uint32_t length;
126 |             uint32_t excep;
127 |             pfor_analyze_bits(sorted_deltas, i, length, excep);
128 |             uint32_t compression_ratio = i + (excep / (bool) sorted_deltas.size()) * (8 * sorted_deltas.size());
129 |             //std::cout << i << " = " << exceptions << "  " << compression_ratio<< std::endl;           
130 |             if (compression_ratio < low_ratio){
131 |                 exceptions = excep;
132 |                 low_ratio = compression_ratio;
133 |                 bits = i;
134 |             }else{ // >=
135 |                 //break;
136 |             }
137 |         }
138 | 
139 |         if(exceptions > max_exceptions){
140 |             bits = 0;
141 |         }
142 | 
143 |         return bits;
144 |     } 
145 | 
146 |     uint8_t* packer(const std::vector<uint32_t>& numbers, uint32_t packsize){
147 |         uint32_t total_size = (numbers.size() * packsize) / 8; //TODO account for float
148 |         uint8_t* output = new uint8_t[total_size](); //the () initializes the values to 0
149 | 
150 | 
151 |         if (packsize <= 3){ //Less than half a byte
152 |             int left = 0;//how many bits are left in the output byte
153 |             int shift = 0;//how many bits have we used in the input byte
154 | 
155 |             for(int i = 0, j =0; i < numbers.size(); i++){
156 |                 if(left){
157 |                     if (left >= packsize){
158 |                         output[j] |= numbers[i] << (left - packsize);
159 |                         left -= packsize;
160 |                         if(left){
161 |                             continue;
162 |                         }
163 |                         else{
164 |                             j++;
165 |                             continue;
166 |                         }
167 |                     }else{
168 |                         output[j] |= numbers[i] >> (packsize - left);
169 |                         shift = left;
170 |                         left = 0;
171 |                         j++;
172 |                     }
173 |     
174 |                 }
175 | 
176 |                 output[j] |= (numbers[i] << (8 - packsize)) << shift;
177 |                 left = 8 - packsize + shift;
178 |                 shift = 0;
179 |             }
180 |          
181 |         }else if (packsize <= 8){ //1 byte or less
182 |             //Iterate through every number
183 |             int left = 0;
184 |             int shift = 0;
185 |             for(int i = 0, j = 0; i < numbers.size(); i++){
186 |                 if (left){
187 |                     output[j++] |= numbers[i] >> (packsize - left);
188 |                     if (left == packsize){
189 |                         left = 0;
190 |                         continue;
191 |                     }
192 |                     shift = left;
193 |                     left = 0;
194 |                 }
195 |     
196 |                 output[j] |= (numbers[i] & (((1 << packsize) - 1) >> shift))  << (8 - (packsize - shift));
197 |                 left = 8 - (packsize - shift); // how many bits are left in this byte
198 |                 shift = 0;
199 |             }
200 | 
201 |             return output;
202 |         
203 | 
204 |         }else if (packsize <= 16) { //2 bytes or less
205 |         
206 |         }else {
207 |             //error
208 | 
209 |         }
210 |     }
211 | 
212 |     /*
213 |         This code written based on information gleaned from:
214 | 
215 |         Hao Yan, Shuai Ding, and Torsten Suel. “Inverted index compression and
216 |         query processing with optimized document ordering.”
217 |         In Proceedings of the 18th international conference on World wide web (WWW ’09).
218 |         ACM, New York, NY, USA, 401-410
219 |     */    
220 | 
221 |     void pfordelta_encode(const std::vector<uint32_t>& numbers, uint32_t max_exceptions = 16){
222 | 
223 |         std::vector<uint32_t> deltas;
224 | 
225 |         //Create Delta Array
226 |         for(uint32_t i = 0; i < numbers.size(); i++){
227 |             if (i == 0){
228 |                 deltas.push_back(numbers[i]);
229 |             }else{
230 |                 deltas.push_back(numbers[i] - numbers[i - 1]);
231 |             }
232 |         }
233 |         
234 |         uint32_t exceptions = 0;
235 |         uint32_t bits = pfordelta_bitsize(deltas, exceptions);
236 | 
237 | 
238 |         if(!bits){//wasn't able to find a number that matches constraints
239 |             //TODO
240 |         }
241 | 
242 |         //At this point we should have the bitsize and the number of exceptions
243 | 
244 |         std::vector<uint32_t> compressed;
245 |         std::vector<uint32_t> exception_index;
246 |         std::vector<uint32_t> exception_values;
247 |         for (uint32_t i = 0; i < deltas.size(); i++){
248 |             uint32_t overflow;
249 |             compressed.push_back(bit_encode(deltas[i], bits, overflow));
250 |             if(overflow){
251 |                 exception_index.push_back(i);
252 |                 exception_values.push_back(overflow); 
253 |             }
254 |             std::bitset<6> foo;
255 |             foo = compressed[i];
256 |             std::cout << foo << std::endl;
257 |         }
258 | 
259 |         std::bitset<8> bit_val;
260 |         //Bitpack
261 |         uint8_t leader = bits << 4 | exceptions;
262 |         uint8_t* packed = packer(compressed, bits);
263 |         std::list<uint8_t> e_index = vbencode_numbers(exception_index);
264 |         std::list<uint8_t> e_vals = vbencode_numbers(exception_values);
265 | 
266 | 
267 | 
268 |         bit_val = leader;
269 |         std::cout << bit_val << std::endl;
270 |         for(int i = 0; i < (compressed.size() * bits) / 8 ; i++){
271 |             bit_val = packed[i];
272 |             std::cout << bit_val << std::endl;
273 |         }
274 | 
275 |         std::cout<< "Exception Indexes: " << std::endl;
276 | 
277 |         for(std::list<uint8_t>::iterator iter = e_index.begin(); iter != e_index.end(); iter++){
278 |             bit_val = *iter;
279 |             std::cout << bit_val << std::endl;
280 |         }
281 | 
282 | 
283 |         std::cout<< "Exception Values: " << std::endl;
284 |         for(std::list<uint8_t>::iterator iter = e_vals.begin(); iter != e_vals.end(); iter++){
285 |             bit_val = *iter;
286 |             std::cout << bit_val << std::endl;
287 |         }
288 | 
289 |     }
290 |     
291 | 
292 | }
293 | 
294 | #endif
295 | 


--------------------------------------------------------------------------------
/python/README.md:
--------------------------------------------------------------------------------
 1 | PySnugglefish
 2 | =======================
 3 | 
 4 | Python bindings for Snugglefish
 5 | 
 6 | Description
 7 | ===========
 8 | PySnugglefish is a Python wrapper around the snugglefish C++ codebase. It exposes the search and index functions to Python modules.
 9 | 
10 | PySnugglefish Instances
11 | =======================
12 | The pysnugglefish module is simple to use.
13 | Start by importing the module:
14 | 
15 | 		import pysnugglefish
16 | 
17 | To execute snugglefish functions, create an interface through pysnugglefish:
18 | 
19 | 		obj = pysnugglefish.init("/path/to/indexfile")
20 | 		
21 | 
22 | Or, you can specify the index file and the ngram size as init arguments.
23 | 		
24 | 		ngram_sz = 3
25 | 		obj = pysnugglefish.init("/path/to/indexfile", ngram_sz)
26 | 
27 | Ngram size must be either 3 or 4.
28 | 
29 | Indexing with PySnugglefish
30 | ===========================
31 | 
32 | To index, simply feed the pysnugglefish instance with configuration options, then run the indexing function. 
33 | 
34 | 		obj = pysnugglefish.init("/path/to/indexfile")
35 | 		obj.file_list = ["/path/to/file1", "/path/to/file2"]
36 | 		obj.ngram_size = 3 # defaults to 3
37 | 		obj.max_buffer = 9001 # defaults to no maximum (0)
38 | 		obj.max_files = 100000 # defaults to no maximum (0)
39 | 		obj.make_index() # create the index file
40 | 
Indexing a large number of files is very memory- and CPU-intensive, so be patient.
42 | 
43 | Searching with PySnugglefish
44 | ============================
45 | The module facilitates searching a specified index.
46 | Again, provide configuration, then execute your search.
47 | 
48 | 		obj = pysnugglefish.init("/path/to/indexfile")
49 | 		obj.ngram_size = 3 # better equal the ngram_size used to generate the index!
50 | 		bitstring = "\x41\x42\x43"
51 | 		files_found = obj.search(bitstring)
52 | 		
The search function returns a list containing the filename of every file in the index that the snugglefish code matched to the input search string.
54 | 
55 | Caveats
56 | =======
Python does not expand tildes in paths automatically, and the pysnugglefish module does no special processing of the file paths passed to the indexing function. Expect problems with paths such as `~/Documents/file.bin`; expand them first, for example with `os.path.expanduser`.
58 | 


--------------------------------------------------------------------------------
/python/pysnugglefish.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 | Copyright (c) 2014 The MITRE Corporation. All rights reserved.
  3 | 
  4 | Redistribution and use in source and binary forms, with or without
  5 | modification, are permitted provided that the following conditions
  6 | are met:
  7 | 1. Redistributions of source code must retain the above copyright
  8 |    notice, this list of conditions and the following disclaimer.
  9 | 2. Redistributions in binary form must reproduce the above copyright
 10 |    notice, this list of conditions and the following disclaimer in the
 11 |    documentation and/or other materials provided with the distribution.
 12 | 
 13 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 15 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 16 | ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 17 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 18 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 19 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 20 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 21 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 22 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 23 | SUCH DAMAGE.
 24 | */
 25 | 
 26 | #include <Python.h>
 27 | #include <structmember.h>
 28 | #include <stdio.h>
 29 | #include <string>
 30 | #include <vector>
 31 | #include "nGramSearch.h"
 32 | #include "nGramIndex.h"
 33 | #include "fileIndexer.h"
 34 | #include "common.h"
 35 | 
 36 | using namespace std;
 37 | 
 38 | /* Module's error object. */
 39 | static PyObject *SnuggleError;
 40 | 
/*
 * Define the pysnugglefish object.
 * Represents data used in the snugglefish features.
 *
 * `index` and `file_list` are owned references: they are released with
 * Py_XDECREF in pysnugglefish_dealloc.
 */
typedef struct {
	PyObject_HEAD
	PyObject *index;      /* index name; passed via PyString_AsString to the nGramIndex / nGramSearch constructors */
	PyObject *file_list;  /* Python list of file names consumed by make_index */
	int ngram_size;       /* n-gram width; init and the setter only accept 3 or 4 */
	int max_buffer;       /* forwarded to nGramIndex::setmaxBufferSize when > 0 */
	int max_files;        /* stored and exposed, but not read by any code in this file */
} pysnugglefish;
 53 | 
 54 | /* Facilitate destruction of pysnugglefish objects. */
 55 | static void pysnugglefish_dealloc(pysnugglefish *self) {
 56 | 	Py_XDECREF(self->index);
 57 | 	Py_XDECREF(self->file_list);
 58 | 	self->ob_type->tp_free((PyObject*)self);
 59 | }
 60 | 
 61 | /* Construct a new pysnugglefish object. */
 62 | static PyObject *pysnugglefish_new(PyTypeObject *type, PyObject *args, PyObject *kwds) {
 63 | 	pysnugglefish *self;
 64 | 	self = (pysnugglefish *)type->tp_alloc(type, 0);
 65 | 	if (self != NULL) { // if object created, init fields to defaults
 66 | 		self->index = PyString_FromString("");
 67 | 		if (self->index == NULL) {
 68 | 			Py_DECREF(self);
 69 | 			return NULL;
 70 | 		}
 71 | 		self->file_list = PyList_New(0);
 72 | 		self->max_buffer = 0;
 73 | 		self->ngram_size = 3;
 74 | 		self->max_files = 0;
 75 | 	}
 76 | 
 77 | 	return (PyObject *)self;
 78 | }
 79 | 
 80 | /* Initialize a pysnugglefish object from command line arguments. */
 81 | static int pysnugglefish_init(pysnugglefish *self, PyObject *args, PyObject *kwds) {
 82 | 	PyObject *index=NULL, *tmp;
 83 | 	int ngrams = 3;
 84 | 
 85 | 	static char *kwlist[] = {(char *) "index", (char *) "ngram_size", NULL};
 86 | 
 87 | 	// ngram size optional
 88 | 	if (!PyArg_ParseTupleAndKeywords(args, kwds, "S|i", kwlist, &index, &ngrams)) {
 89 | 		return -1;
 90 | 	}
 91 | 
 92 | 	if (ngrams != 3 && ngrams != 4) {
 93 | 		PyErr_SetString(PyExc_TypeError, "N-Gram size must be set to 3 or 4.");
 94 | 		return -1;
 95 | 	}
 96 | 	if (index) { // if index was provided, manage references and set
 97 | 		tmp = self->index;
 98 | 		Py_INCREF(index);
 99 | 		self->index = index;
100 | 		Py_XDECREF(tmp);
101 | 	}
102 | 	self->ngram_size = ngrams;
103 | 
104 | 	return 0;
105 | }
106 | 
/*
 * Define members of pysnugglefish object.
 * Each entry maps a Python attribute name to a field of the pysnugglefish
 * struct (type code, byte offset, flags, docstring); the table ends with a
 * zeroed sentinel entry.
 * NOTE(review): pysnugglefish_Type also installs custom tp_getattr/tp_setattr
 * handlers for the same names - confirm which path actually serves attribute
 * access.
 */
static PyMemberDef pysnugglefish_members[] = {
	{(char *) "index", T_OBJECT_EX, offsetof(pysnugglefish, index), 0,
	 (char *) "index name"},
	{(char *) "file_list", T_OBJECT_EX, offsetof(pysnugglefish, file_list), 0,
	 (char *) "list of file names"},
	{(char *) "ngram_size", T_INT, offsetof(pysnugglefish, ngram_size), 0,
	 (char *) "n-gram size"},
	{(char *) "max_buffer", T_INT, offsetof(pysnugglefish, max_buffer), 0,
	 (char *) "max buffer size"},
	{(char *) "max_files", T_INT, offsetof(pysnugglefish, max_files), 0,
	 (char *) "max files to use"},
	{ NULL }  /* Sentinel */
};
121 | 
122 | /*
123 |  * Search a snugglefish index for a given search input.
124 |  * Args: searchString (string)
125 |  */
126 | static PyObject *pysnugglefish_search(pysnugglefish *self, PyObject *args) {
127 | 	char *searchString;
128 | 	vector<string> *found;
129 | 	long procs;
130 | 
131 | 	//This only works on some *nixes
132 | 	// TODO figure out which systems don't support this call
133 | 	procs = sysconf(_SC_NPROCESSORS_ONLN);
134 | 	if (procs < 1) {
135 | 		procs = 1;
136 | 	}
137 | 
138 | 	// Threads optional
139 | 	if (!PyArg_ParseTuple(args, "s|i", &searchString, &procs)) {
140 | 		return NULL;
141 | 	}
142 | 
143 | 	if (procs <= 0) {
144 | 		PyErr_SetString(SnuggleError, (char *) "Invalid threads");
145 | 		return NULL;
146 | 	}
147 | 
148 | 	try {
149 | 		snugglefish::nGramSearch searcher(self->ngram_size, PyString_AsString(self->index), (uint32_t) procs);
150 | 		vector<uint64_t> *ngrams = searcher.stringToNGrams(searchString);
151 | 		found = searcher.searchNGrams(*ngrams);
152 | 	} catch (exception &e) {
153 | 		PyErr_SetString(SnuggleError, e.what());
154 | 		return NULL;
155 | 	}
156 | 
157 | 	PyObject *ret = PyList_New(found->size());
158 | 	for (size_t i = 0; i < found->size(); i++) {
159 | 		PyList_SetItem(ret, i, Py_BuildValue("s", (*found)[i].c_str()));
160 | 	}
161 | 
162 | 	delete found;
163 | 	return ret;
164 | }
165 | 
166 | void *indexerThread(void *input) {
167 | 	mi_data *midata = (mi_data *) input;
168 | 	snugglefish::nGramIndex *ngramindex = (snugglefish::nGramIndex *) midata->ngramindex;
169 | 	snugglefish::fileIndexer indexer(midata->ngramSize);
170 | 
171 | 	while(1) {
172 | 		pthread_mutex_lock(&midata->filesMutex);
173 | 		if (midata->queue >= midata->fileList->size()) {
174 | 			pthread_mutex_unlock(&midata->filesMutex);
175 | 			break;
176 | 		}
177 | 
178 | 		uint32_t i = midata->queue++;
179 | 		pthread_mutex_unlock(&midata->filesMutex);
180 | 
181 | 		try {
182 | 			vector<uint32_t> *processedFile = indexer.processFile((*(midata->fileList))[i].c_str());
183 | 			if (processedFile != 0) {
184 | 				pthread_mutex_lock(&midata->nGramIndexMutex);
185 | 				ngramindex->addNGrams(processedFile, (*(midata->fileList))[i]);
186 | 				pthread_mutex_unlock(&midata->nGramIndexMutex);
187 | 			}
188 | 		} catch (exception &e) {
189 | 			return (void *) e.what();
190 | 		}
191 | 	}
192 | 
193 | 	return 0;
194 | }
195 | 
196 | /*
197 |  * Index all of the files specified in the file_list member.
198 |  * Output the index at the path specified in the pysnugglefish index member.
199 |  */
200 | static PyObject *pysnugglefish_index(pysnugglefish *self, PyObject *args) {
201 | 	vector<string> files;
202 | 	long procs;
203 | 	int i;
204 | 	pthread_t *indexers;
205 | 	mi_data *midata;
206 | 	void *status;
207 | 	Py_ssize_t ct;
208 | 
209 | 	//This only works on some *nixes
210 | 	// TODO figure out which systems don't support this call
211 | 	procs = sysconf(_SC_NPROCESSORS_ONLN);
212 | 	if (procs < 1) {
213 | 		procs = 1;
214 | 	}
215 | 
216 | 	// Threads optional
217 | 	if (!PyArg_ParseTuple(args, "|i", &procs)) {
218 | 		return NULL;
219 | 	}
220 | 
221 | 	if (procs <= 0) {
222 | 		PyErr_SetString(SnuggleError, (char *) "Invalid threads");
223 | 		return NULL;
224 | 	}
225 | 
226 | 	// No files to index.
227 | 	ct = PyList_Size(self->file_list);
228 | 	if (ct == 0) {
229 | 		Py_RETURN_NONE;
230 | 	}
231 | 
232 | 	for (i = 0; i < ct; i++) {
233 | 		files.push_back(PyString_AsString(PyList_GetItem(self->file_list, i)));
234 | 	}
235 | 
236 | 	midata = new mi_data;
237 | 	indexers = (pthread_t *) malloc(procs * sizeof(pthread_t));
238 | 
239 | 	pthread_attr_t attr;
240 | 	pthread_attr_init(&attr);
241 | 	pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
242 | 
243 | 	pthread_mutex_init(&(midata->filesMutex), NULL);
244 | 	pthread_mutex_init(&(midata->nGramIndexMutex), NULL);
245 | 
246 | 	midata->fileList = &files;
247 | 	midata->ngramSize = self->ngram_size;
248 | 	midata->queue = 0;
249 | 
250 | 	try {
251 | 		snugglefish::nGramIndex ngramindex(self->ngram_size, PyString_AsString(self->index));
252 | 		if (self->max_buffer > 0) {
253 | 			ngramindex.setmaxBufferSize(self->max_buffer);
254 | 		}
255 | 
256 | 		midata->ngramindex = &ngramindex;
257 | 
258 | 		for (uint32_t i = 0; i < procs; i++) {
259 | 			pthread_create(&indexers[i], &attr, indexerThread, (void *) midata);
260 | 		}
261 | 
262 | 		while (1) {
263 | 			//Usage of mutex shouldn't matter
264 | 			if (midata->queue >= files.size()) {
265 | 				break;
266 | 			}
267 | 			sleep(1);
268 | 		}
269 | 
270 | 		for (uint32_t i = 0; i < procs; i++) {
271 | 			pthread_join(indexers[i], &status);
272 | 			if (status) {
273 | 				PyErr_SetString(SnuggleError, (char *) status);
274 | 				return NULL;
275 | 			}
276 | 		}
277 | 
278 | 		} catch (exception &e) {
279 | 			PyErr_SetString(SnuggleError, e.what());
280 | 			return NULL;
281 | 		}
282 | 
283 | 
284 | 		pthread_mutex_destroy(&(midata->filesMutex));
285 | 		pthread_mutex_destroy(&(midata->nGramIndexMutex));
286 | 		pthread_attr_destroy(&attr);
287 | 
288 | 		delete midata;
289 | 		free(indexers);
290 | 
291 | 		Py_RETURN_NONE;
292 | }
293 | 
/*
 * Define the set of methods callable from a pysnugglefish object.
 * Both entries use METH_VARARGS, so each handler receives its positional
 * arguments as a tuple. The Python-visible name "make_index" is implemented
 * by pysnugglefish_index.
 */
static PyMethodDef pysnugglefish_methods[] = {
	{"search", (PyCFunction) pysnugglefish_search, METH_VARARGS, "Search the current index for an input."},
	{"make_index", (PyCFunction) pysnugglefish_index, METH_VARARGS, "Make an index out of the current files list."},
	{ NULL }  /* Sentinel */
};
300 | 
301 | /* Define getter for data attributes of pysnugglefish objects. */
302 | static PyObject *pysnugglefish_getattr(pysnugglefish *self, char *attrname) {
303 | 	if (strcmp(attrname, "index") == 0) {
304 | 		return self->index;
305 | 	} else if (strcmp(attrname, "file_list") == 0) {
306 | 		Py_INCREF(self->file_list);
307 | 		return self->file_list;
308 | 	} else if (strcmp(attrname, "max_files") == 0) {
309 | 		return Py_BuildValue("i", self->max_files);
310 | 	} else if (strcmp(attrname, "max_buffer") == 0) {
311 | 		return Py_BuildValue("i", self->max_buffer);
312 | 	} else if (strcmp(attrname, "ngram_size") == 0) {
313 | 		return Py_BuildValue("i", self->ngram_size);
314 | 	} else if (strcmp(attrname, "search") == 0) {
315 | 		return PyObject_GenericGetAttr((PyObject *)self, Py_BuildValue("s", attrname));
316 | 	} else if (strcmp(attrname, "make_index") == 0) {
317 | 		return PyObject_GenericGetAttr((PyObject *)self, Py_BuildValue("s", attrname));
318 | 	} else {
319 | 		PyErr_SetString(PyExc_AttributeError, attrname);
320 | 		return NULL;
321 | 	}
322 | }
323 | 
324 | /* Define setter for data attributes of pysnugglefish objects. */
325 | static int pysnugglefish_setattr(pysnugglefish *self, char *name, PyObject *value) {
326 | 	int result = -1;
327 | 	if (strcmp(name, "index") == 0) {
328 | 		PyErr_SetString(SnuggleError, "Index is read-only after init.");
329 | 	} else if (strcmp(name, "file_list") == 0) {
330 | 		result = 0;
331 | 		if (PyList_Check(value) && value != NULL) {
332 | 			Py_XDECREF(self->file_list);
333 | 			Py_INCREF(value);
334 | 			self->file_list = value;
335 | 		} else {
336 | 			result = -1;
337 | 		}
338 | 	} else if (strcmp(name, "max_files") == 0 && value != NULL) {
339 | 		int newval = 0;
340 | 		if (PyArg_Parse(value, "i", &newval)) {
341 | 			if (newval > 0) {
342 | 				self->max_files = newval;
343 | 				result = 0;
344 | 			}
345 | 		}
346 | 	} else if (strcmp(name, "max_buffer") == 0 && value != NULL) {
347 | 		int newval = 0;
348 | 		if (PyArg_Parse(value, "i", &newval)) {
349 | 			if (newval > 0) {
350 | 				self->max_buffer = newval;
351 | 				result = 0;
352 | 			}
353 | 		}
354 | 	} else if (strcmp(name, "ngram_size") == 0 && value != NULL) {
355 | 		int newval = 0;
356 | 		if (PyArg_Parse(value, "i", &newval)) {
357 | 			if (newval == 3 || newval == 4) {
358 | 				self->ngram_size = newval;
359 | 				result = 0;
360 | 			}
361 | 		}
362 | 	} else {
363 | 		PyErr_SetString(PyExc_AttributeError, name);
364 | 		result = -1;
365 | 	}
366 | 	return result;
367 | }
368 | 
/*
 * Define the Python type for pysnugglefish objects.
 * Configures pysnugglefish with its getters, setters, destructors, etc.
 *
 * The initializer is positional: every slot must stay in the order of the
 * Python 2 PyTypeObject layout, so unused slots are spelled out as 0.
 * Attribute access goes through the name-based tp_getattr / tp_setattr
 * handlers; tp_getattro / tp_setattro are left empty.
 */
PyTypeObject pysnugglefish_Type = {
	PyObject_HEAD_INIT(NULL)
	0,                                        /* ob_size */
	"pysnugglefish",                          /* tp_name */
	sizeof(pysnugglefish),                    /* tp_basicsize */
	0,                                        /* tp_itemsize */
	(destructor)pysnugglefish_dealloc,        /* tp_dealloc */
	0,                                        /* tp_print */
	(getattrfunc)pysnugglefish_getattr,       /* tp_getattr */
	(setattrfunc)pysnugglefish_setattr,       /* tp_setattr */
	0,                                        /* tp_compare */
	0,                                        /* tp_repr */
	0,                                        /* tp_as_number */
	0,                                        /* tp_as_sequence */
	0,                                        /* tp_as_mapping */
	0,                                        /* tp_hash */
	0,                                        /* tp_call */
	0,                                        /* tp_str */
	0,                                        /* tp_getattro */
	0,                                        /* tp_setattro */
	0,                                        /* tp_as_buffer */
	Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
	"pysnugglefish objects",                  /* tp_doc */
	0,                                        /* tp_traverse */
	0,                                        /* tp_clear */
	0,                                        /* tp_richcompare */
	0,                                        /* tp_weaklistoffset */
	0,                                        /* tp_iter */
	0,                                        /* tp_iternext */
	pysnugglefish_methods,                    /* tp_methods */
	pysnugglefish_members,                    /* tp_members */
	0,                                        /* tp_getset */
	0,                                        /* tp_base */
	0,                                        /* tp_dict */
	0,                                        /* tp_descr_get */
	0,                                        /* tp_descr_set */
	0,                                        /* tp_dictoffset */
	(initproc)pysnugglefish_init,             /* tp_init */
	0,                                        /* tp_alloc */
	pysnugglefish_new,                        /* tp_new */
};
/*
 * Define static module methods (none).
 * The table holds only the terminating sentinel; all functionality is exposed
 * through the pysnugglefish type registered in initpysnugglefish.
 */
static PyMethodDef module_methods[] = {
	{NULL}  /* Sentinel */
};
419 | 
420 | /*
421 |  * Initialize the module.
422 |  * Ensures that the pysnugglefish type is ready, makes methods available to
423 |  * objects, and prepares error object for any future issues with pysnugglefish.
424 |  */
425 | PyMODINIT_FUNC initpysnugglefish(void) {
426 | 	PyObject *m;
427 | 
428 | 	if (PyType_Ready(&pysnugglefish_Type) < 0) {
429 | 		return;
430 | 	}
431 | 	m = Py_InitModule3("pysnugglefish", module_methods, "Module that exposes snugglefish methods.");
432 | 	if (m == NULL) {
433 | 		return;
434 | 	}
435 | 
436 | 	SnuggleError = PyErr_NewException((char *) "pysnugglefish.error", NULL, NULL);
437 | 	Py_INCREF(SnuggleError);
438 | 	PyModule_AddObject(m, "error", SnuggleError);
439 | 	Py_INCREF(&pysnugglefish_Type);
440 | 	PyModule_AddObject(m, "init", (PyObject *) &pysnugglefish_Type);
441 | }
442 | 


--------------------------------------------------------------------------------
/python/setup.py:
--------------------------------------------------------------------------------
 1 | #
 2 | # Copyright (c) 2014 The MITRE Corporation. All rights reserved.
 3 | #
 4 | # Redistribution and use in source and binary forms, with or without
 5 | # modification, are permitted provided that the following conditions
 6 | # are met:
 7 | # 1. Redistributions of source code must retain the above copyright
 8 | #    notice, this list of conditions and the following disclaimer.
 9 | # 2. Redistributions in binary form must reproduce the above copyright
10 | #    notice, this list of conditions and the following disclaimer in the
11 | #    documentation and/or other materials provided with the distribution.
12 | #
13 | # THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 | # ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 | # OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 | # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 | # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 | # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 | # SUCH DAMAGE.
24 | #
25 | 
26 | from distutils.core import setup, Extension
27 | 
# Header and library search paths handed to the compiler and linker.
INCLUDE_DIRS = ['/usr/local/include', '/opt/local/include', '/usr/include', '../include']
LIBRARY_DIRS = ['/usr/lib', '/usr/local/lib']

# Core snugglefish translation units (from ../src), followed by the binding.
_CORE_UNITS = ["fileIndexer", "nGramBase", "nGramIndex", "nGramSearch",
               "file", "indexSet", "smFile"]
_SOURCES = ["../src/%s.cpp" % unit for unit in _CORE_UNITS] + ["pysnugglefish.cpp"]

# the c++ extension module
extension_mod = Extension(
    "pysnugglefish",
    sources=_SOURCES,
    include_dirs=INCLUDE_DIRS,
    library_dirs=LIBRARY_DIRS,
)

# Distribution meta-data
setup(
    name="pysnugglefish",
    version="0.2",
    description="python bindings for snugglefish",
    author="Wesley Shields",
    author_email="wshields@mitre.org",
    license="BSD",
    long_description="Python bindings for snugglefish",
    ext_modules=[extension_mod],
)
60 | 


--------------------------------------------------------------------------------
/python/snuggle.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import sys
 4 | import pysnugglefish
 5 | import multiprocessing
 6 | 
 7 | from optparse import OptionParser
 8 | 
 9 | def main():
10 |     cpu_count = multiprocessing.cpu_count()
11 | 
12 |     parser = OptionParser()
13 |     parser.add_option("-i", "--index", action="store_true", dest="index",
14 |                       default=None, help="Index operation.")
15 |     parser.add_option("-s", "--search", action="store_true", dest="search",
16 |                       default=None, help="Search operation.")
17 |     parser.add_option("-I", "--indexfile", action="store", dest="indexfile",
18 |                       default="", help="Index file to search or create.")
19 |     parser.add_option("-t", "--threads", action="store", dest="threads",
20 |                       default=cpu_count,
21 |                       help="Number of threads to spawn (default is #cpus).")
22 | 
23 |     (opts, searchstring) = parser.parse_args()
24 | 
25 |     if (not opts.index and not opts.search) or (opts.index and opts.search):
26 |         print "[!] Must specify one of index or search."
27 |         return
28 | 
29 |     if not opts.indexfile:
30 |         print "[!] Must specify output location"
31 |         return
32 | 
33 |     s = pysnugglefish.init(opts.indexfile)
34 | 
35 |     try:
36 |         threads = int(opts.threads)
37 |     except Exception as e:
38 |         print "[!] Invalid threads: %s" % e.message
39 |         return
40 | 
41 |     if threads <= 0:
42 |         print "[!] Invalid threads. Defaulting to %i." % cpu_count
43 |         threads = cpu_count
44 | 
45 |     if opts.index:
46 |         s.file_list = [ line.rstrip('\n') for line in sys.stdin.readlines() ]
47 |         msg = "[+] Indexing %i files with %i threads." % (len(s.file_list),
48 |                                                           threads)
49 |         print "[+] This might take a while... ;)"
50 |         try:
51 |             s.make_index(threads)
52 |         except Exception as e:
53 |             print "[!] Exception while indexing: %s" % e.message
54 |     elif opts.search:
55 |         searchstring = ' '.join(searchstring)
56 |         if not searchstring:
57 |             searchstring = raw_input("Search string: ")
58 | 
59 |         if not searchstring:
60 |             print "[!] Must enter a search string."
61 |             return
62 | 
63 |         print "[+] Searching for %s with %i threads" % (searchstring, threads)
64 |         try:
65 |             results = s.search(searchstring, threads)
66 |             for result in results:
67 |                 print result
68 |         except Exception as e:
69 |             print "[!] Exception while searching: %s" % e.message
70 | 
# Run the command-line interface only when executed as a script, not on import.
if __name__ == '__main__':
    main()
73 | 


--------------------------------------------------------------------------------
/src/file.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 | Approved for Public Release; Distribution Unlimited: 13-1937
  3 | 
  4 | Copyright (c) 2014 The MITRE Corporation. All rights reserved.
  5 | 
  6 | Redistribution and use in source and binary forms, with or without
  7 | modification, are permitted provided that the following conditions
  8 | are met:
  9 | 1. Redistributions of source code must retain the above copyright
 10 |    notice, this list of conditions and the following disclaimer.
 11 | 2. Redistributions in binary form must reproduce the above copyright
 12 |    notice, this list of conditions and the following disclaimer in the
 13 |    documentation and/or other materials provided with the distribution.
 14 | 
 15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 18 | ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 21 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 22 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 23 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 24 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 25 | SUCH DAMAGE.
 26 | 
 27 | */
 28 | 
 29 | #include "file.h"
 30 | #include <cstring>
 31 | #include <cstdlib>
 32 | #include <cerrno>
 33 | #include <stdexcept>
 34 | 
 35 | #include <unistd.h>
 36 | #include <fcntl.h>
 37 | #include <sys/mman.h>
 38 | #include <iostream>
 39 | #include <errno.h>
 40 | 
 41 | using namespace snugglefish;
 42 | using namespace std;
 43 | 
 44 | file::file(const char* fileName, size_t buffersize){
 45 |     this->bufferparam = buffersize;
 46 | 
 47 |     this->buffer = NULL;
 48 |     this->buffersize = 0;
 49 |     this->bufferused = 0;
 50 | 
 51 |     this->fd = 0;
 52 |     this->mmapFile = 0;
 53 |     this->size = 0;
 54 | 
 55 |     this->filename = (char*) malloc(sizeof(char) * (strlen(fileName) + 1));
 56 |     strncpy(this->filename, fileName, strlen(fileName) + 1);
 57 | }
 58 | 
 59 | 
 60 | file::~file(){
 61 |     //File Descriptor is not zero
 62 |     if(this->fd){
 63 |         this->close();
 64 |     }
 65 |     
 66 |     if (this->filename != NULL){
 67 |         free(this->filename);
 68 |     }
 69 | 
 70 |     if (this->buffer){
 71 |         free(this->buffer);
 72 |         this->buffer = NULL;
 73 |         this->buffersize = this->bufferused = 0;
 74 |     }
 75 | }
 76 | 
 77 | bool file::create(mode_t filemode){
 78 | 
 79 |     if (this->bufferparam > 0){
 80 |         this->buffersize = this->bufferparam;
 81 |         this->buffer = (char*) malloc(buffersize);
 82 |     }
 83 |     this->fd = ::open(this->filename, O_RDWR | O_CREAT, filemode);
 84 |     this->readonly = false;
 85 | 
 86 |     if (this->fd <= 0){
 87 |         cerr << "Unable to Create File: " << filename << " -- Error: " << strerror(errno) << endl;
 88 |         throw runtime_error("Creating File");
 89 |     }
 90 | 
 91 |     return true;
 92 | }
 93 | 
 94 | bool file::open(char readwrite){
 95 |     switch (readwrite){
 96 |         case 'r':
 97 |         {    
 98 |             this->fd = ::open(this->filename, O_RDONLY);
 99 |             this->size = this->get_size();
100 |             this->readonly = true;
101 |             break;
102 |         }
103 |         case 'w':
104 |         {
105 |             if (this->bufferparam > 0){
106 |                 this->buffersize = this->bufferparam;
107 |                 this->buffer = (char*) malloc(buffersize);
108 |             }
109 |             this->fd = ::open(this->filename, O_RDWR);
110 |             this->readonly = false;
111 |             break;
112 |         }
113 |         default:
114 |             throw runtime_error("Unrecognized read/write mode");
115 |             break;
116 |     }
117 | 
118 |     if (this->fd <= 0){
119 |         cerr << "Error Opening File: " << this->filename << " -- Error: " << strerror(errno) << endl;
120 |         throw runtime_error("Opening File");
121 |     }
122 | 
123 |     return true;
124 | }
125 | 
126 | uint8_t* file::mmap(){
127 |     //Open the file first 
128 |     if (this->fd == 0)
129 |         this->open('r');
130 | 
131 |     if (!this->size)
132 |         return NULL;
133 | 
134 |     this->mmapFile = (uint8_t*) ::mmap(NULL, this->size, PROT_READ, MAP_SHARED, this->fd, 0);
135 | 
136 |     if(this->mmapFile == MAP_FAILED){
137 |         cerr << "Error Loading Map for File : " << this->filename<< " -- Error: " << strerror(errno) << endl;
138 |         throw runtime_error("Loading Map");
139 |     }
140 | 
141 |     return this->mmapFile;
142 | 
143 | }
144 | 
145 | bool file::close(){
146 |     if (!this->fd){ // already closed?
147 |         return true;
148 |     }
149 | 
150 |     this->flush(); 
151 | 
152 |     //Close the mmap if opened
153 |     if (this->mmapFile){
154 |         munmap(this->mmapFile, this->size); 
155 |         this->mmapFile = 0;
156 |         this->size = 0;
157 |     }
158 | 
159 |     //Free the write buffer if allocated
160 |     if (this->buffer){
161 |         free(this->buffer);
162 |         this->buffer = NULL;
163 |         this->buffersize = this->bufferused = 0;
164 |     }
165 | 
166 |     int32_t retval = ::close(this->fd);
167 |     if(retval){//non zero
168 |         cerr << "Error Closing File: " << this->filename << " -- Error: " << strerror(errno) << endl;
169 |         throw runtime_error("Closing File");
170 |     }
171 | 
172 |     this->fd = 0;
173 | 
174 |     return true;
175 | 
176 | }
177 | 
178 | bool file::real_write(int fd, uint8_t* data, size_t length){
179 |         ssize_t written = ::write(fd, data, length); 
180 |         if (written == -1){//TODO what if partial write?
181 |             cerr << "Unable to write to file: " << this->filename << " -- Error: " << strerror(errno) << endl;
182 |             throw runtime_error("Write Error");
183 |         }
184 | 
185 |     return true;
186 | }
187 | 
188 | bool file::flush(){
189 |     if(this->readonly){
190 |         return true;
191 |     }
192 | 
193 |     if (this->bufferused){
194 |         lseek(this->fd, 0, SEEK_END);
195 |         this->real_write(this->fd, (uint8_t*) this->buffer, this->bufferused * sizeof(char));
196 |         this->bufferused = 0;
197 |     }
198 | 
199 |     return true;
200 | }
201 | 
202 | 
203 | bool file::write(uint8_t * data, size_t length){
204 |     if (this->readonly){
205 |         throw runtime_error("Write command on read-only file");
206 |     }
207 | 
208 |     if((this->bufferused + (length)) > this->buffersize) {
209 |         this->flush();
210 | 
211 |         if (length < this->buffersize){
212 |             memcpy(this->buffer, data, length);
213 |             this->bufferused = length;
214 |         }else{  
215 |             //Data being passed in is larger than buffer, write out directly
216 |             //Seek to the end of the file first
217 |             lseek(this->fd, 0, SEEK_END);
218 |             this->real_write(this->fd, data, length);
219 |         }
220 |     }else{ // just buffer it up
221 |         memcpy(this->buffer + this->bufferused, data, length);
222 |         this->bufferused += length;
223 |     }
224 | 
225 |     return true;
226 | }
227 | 
228 | 
229 | bool file::write_at(int32_t location, uint8_t * data, size_t length){
230 |     if (this->readonly){
231 |         throw runtime_error("Write command on reaodnly file");
232 |     }
233 | 
234 |     //Flush before writing at locations
235 |     flush();
236 |     lseek(this->fd, location, SEEK_SET);
237 |     this->real_write(this->fd, data, length * sizeof(uint8_t));
238 | 
239 |     return true;
240 | 
241 | }
242 | 
243 | void file::read(uint8_t* dest, size_t length){
244 |    ::read(this->fd, dest, length); 
245 | }
246 | 
247 | void file::read_at(int32_t location, uint8_t* dest, size_t length){
248 |     lseek(this->fd, location, SEEK_SET);
249 |     ::read (this->fd, dest, length);
250 | }
251 | 
252 | const size_t file::get_size(){
253 |     struct stat st;
254 |     size_t filesize = 0;
255 |     if (stat(this->filename, &st) == 0){
256 |        filesize = st.st_size;
257 |     }else{
258 |         throw runtime_error("Statting File");
259 |     } 
260 | 
261 |     return filesize;
262 | }
263 | 
264 | const bool file::exists(){
265 |     struct stat st;
266 |     if(stat(this->filename, &st) == 0){
267 |         return true;
268 |     }else{
269 |         return false;
270 |     }
271 | }
272 | 


--------------------------------------------------------------------------------
/src/fileIndexer.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 | Approved for Public Release; Distribution Unlimited: 13-1937
  3 | 
  4 | Copyright (c) 2014 The MITRE Corporation. All rights reserved.
  5 | 
  6 | Redistribution and use in source and binary forms, with or without
  7 | modification, are permitted provided that the following conditions
  8 | are met:
  9 | 1. Redistributions of source code must retain the above copyright
 10 |    notice, this list of conditions and the following disclaimer.
 11 | 2. Redistributions in binary form must reproduce the above copyright
 12 |    notice, this list of conditions and the following disclaimer in the
 13 |    documentation and/or other materials provided with the distribution.
 14 | 
 15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 18 | ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 21 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 22 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 23 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 24 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 25 | SUCH DAMAGE.
 26 | 
 27 | */
 28 | 
 29 | 
 30 | #include "fileIndexer.h"
 31 | #include <iostream>
 32 | #include <stdexcept>
 33 | #include <sys/stat.h>
 34 | #include <fcntl.h>
 35 | #include <unistd.h>
 36 | #include <sys/mman.h>
 37 | #include <errno.h>
 38 | #include <string.h>
 39 | 
 40 | #include <vector>
 41 | #include <map>
 42 | 
 43 | 
 44 | #include "file.h"
 45 | 
 46 | using namespace snugglefish;
 47 | using namespace std;
 48 | 
 49 | fileIndexer::fileIndexer(uint8_t ngramLength):
 50 | filesProcessed(0)
 51 | {
 52 | 
 53 |     if(ngramLength == 3  || ngramLength == 4){
 54 |         this->ngramLength = ngramLength;
 55 |         this->maxNgram = ((uint64_t) (1) << (8*ngramLength));
 56 |         this->pagesize = getpagesize();
 57 |     }
 58 |     else {
 59 |         throw std::runtime_error("Ngram len must be either 3 or 4");
 60 |     }
 61 | 
 62 | }
 63 | 
 64 | // Takes in a file name, mmaps it, and creates a vector of ngrams
 65 | // return: pointer to vector of ngrams
 66 | vector<uint32_t>* fileIndexer::processFile(const char* fileName){
 67 | 
 68 |     vector<uint32_t>* ngramList = 0;
 69 |     file* inputFile = new file(fileName);
 70 |     size_t fileSize = inputFile->get_size();
 71 |     uint8_t* inputFileMap = inputFile->mmap();
 72 | 
 73 |     if (!fileSize){
 74 |         delete inputFile;
 75 |         return NULL;
 76 |     } 
 77 | 
 78 | 
 79 |     this->filesProcessed++;
 80 | 
 81 |     //It is much faster to just use an array that holds a boolean
 82 |     //for every element, this requires only ~64MB per file for 3-byte Ngrams
 83 |     //but 16GB for 4 byte ngrams, so only do this for 3-byte ngrams
 84 |     //and do a map instead for 4-byte
 85 |     if(this->ngramLength == 3){
 86 |         ngramList = new vector<uint32_t>;
 87 |         bool* bngramList = new bool[this->maxNgram]();
 88 | 
 89 |         try{
 90 |             processNgrams(inputFileMap, fileSize, bngramList);
 91 | 
 92 |             for(uint32_t k = 0; k < this->maxNgram; k++){
 93 |                 if(bngramList[k]){
 94 |                     ngramList->push_back(k);
 95 |                 }
 96 |             }
 97 |             
 98 |             delete[] bngramList;
 99 | 
100 |         } catch(exception &e){
101 |             cout << "Error processing ngrams: "<< e.what() << endl;
102 |         }
103 | 
104 |     }else{ //ngramLength == 4
105 |         try {
106 |             ngramList = processNgrams(inputFileMap, fileSize);
107 |         } catch(exception &e){
108 |             cout << "Error processing ngrams: " << e.what() << endl;
109 |         }
110 |     }
111 | 
112 |     inputFile->close();
113 |     delete inputFile;
114 | 
115 |     return ngramList;
116 | }
117 | 
118 | 
119 | //Creates vector of ngrams in a given file
120 | //Uses an STL map which should use a RED/BLACK tree
121 | //Insertion/Search should be O(log(n)) time where n is the 
122 | //number of nodes already in the tree
123 | vector<uint32_t>* fileIndexer::processNgrams(unsigned char* buf, uint64_t fileSize){
124 |     map<uint32_t, bool> ngram_map;
125 | 
126 |     uint32_t nGram = 0;
127 |     uint32_t i, j;
128 | 
129 |     for(i = 0; ((i + this->ngramLength) - 1) < fileSize; i++){
130 |         nGram = 0;
131 |         for(j = 0; j < this->ngramLength; j++){
132 |             nGram += (unsigned char)buf[i+j] * (1 << (8*j));
133 |         }
134 |         if(nGram >= this->maxNgram){
135 |             throw std::runtime_error("Ngram greater than maxNgram");
136 |         }
137 |         ngram_map.insert(pair<uint32_t,bool>(nGram, true));
138 |     }
139 | 
140 |     //We should now have a map of ngrams, turn into a sorted vector
141 |     //TODO pre-allocate vector to do this faster
142 |     vector<uint32_t>* ngramVector = new vector<uint32_t>;
143 |     
144 |     for(map<uint32_t,bool>::iterator it = ngram_map.begin(); it != ngram_map.end() ; it++){
145 |         ngramVector->push_back(it->first);
146 |     }
147 | 
148 |     return ngramVector;
149 | }
150 | 
151 | // Takes the file and creates ngrams from it
152 | void fileIndexer::processNgrams(unsigned char* buf, uint64_t fileSize, bool ngramList[]){
153 |     //TODO update with byte array
154 |     uint64_t nGram = 0;
155 |     uint64_t i, j;
156 |     for(i = 0; ((i + this->ngramLength) - 1) < fileSize; i++){
157 |         nGram = 0;
158 |        // creates an ngram using the formula buf[i] + buf[i+1]*256 
159 |        // + buf[i+2]*256*256
160 |         for(j = 0; j < this->ngramLength; j++){
161 |             // (1 <<(8*j)) is equivalent to pow(256,j)
162 |            nGram += (unsigned char)buf[i+j] * (1 << (8*j));
163 |         }
164 |         if(nGram >= this->maxNgram){
165 |             throw std::runtime_error("Ngram greater than maxNgram");
166 |         }
167 |         ngramList[nGram] = 1;
168 |     }
169 | 
170 |     return;
171 | }
172 | 


--------------------------------------------------------------------------------
/src/indexSet.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 | Approved for Public Release; Distribution Unlimited: 13-1937
  3 | 
  4 | Copyright (c) 2014 The MITRE Corporation. All rights reserved.
  5 | 
  6 | Redistribution and use in source and binary forms, with or without
  7 | modification, are permitted provided that the following conditions
  8 | are met:
  9 | 1. Redistributions of source code must retain the above copyright
 10 |    notice, this list of conditions and the following disclaimer.
 11 | 2. Redistributions in binary form must reproduce the above copyright
 12 |    notice, this list of conditions and the following disclaimer in the
 13 |    documentation and/or other materials provided with the distribution.
 14 | 
 15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 18 | ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 21 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 22 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 23 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 24 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 25 | SUCH DAMAGE.
 26 | 
 27 | */
 28 | 
 29 | 
 30 | #include "indexSet.h"
 31 | #include <string>
 32 | #include <list>
 33 | #include <cstdio>
 34 | #include <cerrno>
 35 | #include <unistd.h>
 36 | #include <stdexcept>
 37 | 
 38 | using namespace snugglefish;
 39 | using namespace std;
 40 | 
 41 | 
 42 | indexSet::indexSet(const char* fileBase, uint32_t count, uint8_t nGramSize)
 43 | :indexFile(0), nGramFile(0), fileBase(fileBase), ngramLength(nGramSize),
 44 | writable(false), indexMap(0), nGramMap(0), count(count)
 45 | {
 46 | }
 47 | 
 48 | indexSet::~indexSet(){
 49 |     if (this->indexFile){
 50 |         this->indexFile->close();
 51 |         delete this->indexFile;
 52 |     }
 53 | 
 54 |     if (this->nGramFile){
 55 |         this->nGramFile->close();
 56 |         delete this->nGramFile;
 57 |     }
 58 | }
 59 | 
 60 | void indexSet::close(){
 61 |     if (this->indexFile){
 62 |         this->indexFile->close();
 63 |         delete this->indexFile;
 64 |         this->indexFile = NULL;
 65 |     }
 66 |     if (this->nGramFile){
 67 |         this->nGramFile->close();
 68 |         delete this->nGramFile;
 69 |         this->nGramFile = NULL;
 70 |     }
 71 | 
 72 | }
 73 | 
 74 | void indexSet::create(ngram_t_numfiles nFiles){
 75 |     if (indexFile || nGramFile){//Already opened?
 76 |         throw runtime_error("Error opening index or ngram file");
 77 |     }
 78 | 
 79 |     //Create the Files
 80 |     //First create the filenames
 81 |     string indexFileName, nGramFileName;
 82 |     indexFileName = nGramFileName = this->fileBase;
 83 |     char number_string[FILE_NUM_BUFFER_SIZE];
 84 |     snprintf(number_string, FILE_NUM_BUFFER_SIZE, FILE_NUM_SPRINTF_STRING, this->count);
 85 | 
 86 |     indexFileName.append(INDEX_FILE_EXTENSION).append(number_string);
 87 |     nGramFileName.append(NGRAM_FILE_EXTENSION).append(number_string);
 88 | 
 89 |     this->indexFile = new file(indexFileName.c_str());
 90 |     this->nGramFile = new file(nGramFileName.c_str());
 91 | 
 92 |     if(indexFile->exists() || nGramFile->exists()){ //Already exist?
 93 |        throw runtime_error("index file or ngram file already exists"); 
 94 |     }
 95 | 
 96 |     indexFile->create();
 97 |     nGramFile->create();
 98 | 
 99 |     
100 |     this->endian_check = ENDIAN_CHECK;
101 |     this->version = VERSION;
102 | 
103 |     //Write standard header
104 |     indexFile->write((uint8_t*) (&(this->endian_check)), ENDIAN_CHECK_FIELD);
105 |     indexFile->write((uint8_t*) (&(this->version)), VERSION_FIELD);
106 |     indexFile->write((uint8_t*) (&(this->ngramLength)), NGRAM_SIZE_FIELD);
107 |     indexFile->write((uint8_t*) (&nFiles), INDEX_HEADER_NUM_FILES_FIELD);
108 | 
109 |     offset = 0; //Offset in the ngramfile
110 | 
111 |     writable = true;
112 | 
113 | }
114 | 
115 | void indexSet::open(){
116 |     string indexFileName, nGramFileName;
117 |     indexFileName = nGramFileName = this->fileBase;
118 |     char number_string[FILE_NUM_BUFFER_SIZE];
119 |     snprintf(number_string, FILE_NUM_BUFFER_SIZE, FILE_NUM_SPRINTF_STRING, this->count);
120 | 
121 |     indexFileName.append(INDEX_FILE_EXTENSION).append(number_string);
122 |     nGramFileName.append(NGRAM_FILE_EXTENSION).append(number_string);
123 | 
124 |     indexFile = new file(indexFileName.c_str());
125 | 
126 |     if(!indexFile->exists()){
127 |         throw runtime_error("index file does not exist");
128 |     }
129 | 
130 |     nGramFile = new file(nGramFileName.c_str());
131 | 
132 |     if (!nGramFile->exists()){
133 |         throw runtime_error("index file does not exist");
134 |     }
135 | 
136 |     indexFile->open('r');
137 |     nGramFile->open('r');
138 | 
139 |     indexMap = indexFile->mmap();
140 |     nGramMap = nGramFile->mmap();
141 | 
142 |     indexEntries = indexMap + INDEX_HEADER_SIZE;
143 | }
144 | 
145 | void indexSet::addIndexData(uint64_t offset, uint32_t nFiles){
146 |     indexFile->write((uint8_t*) &offset, sizeof(uint64_t));
147 |     indexFile->write((uint8_t*) &nFiles, sizeof(uint32_t));
148 | }
149 | 
150 | void indexSet::addNGrams(uint32_t ngram, list<ngram_t_fidtype>* files){
151 |     if (!writable || !nGramFile){
152 |         //TODO
153 |     }
154 | 
155 |     uint32_t size = files->size();
156 |     uint64_t off =  offset;
157 |     uint32_t difference = sizeof(ngram_t_fidtype) * size;
158 | 
159 |     if (!size){
160 |         off = 0;
161 |     }
162 | 
163 |     //Pre-emptively add the index information, i.e, offset, numFiles
164 |     addIndexData(off, size);
165 | 
166 |     while(size > 0){
167 |         nGramFile->write((uint8_t*) (&(files->front())), sizeof(ngram_t_fidtype));
168 |         files->pop_front();
169 |         size--;
170 |     } 
171 | 
172 |     offset =  offset + difference;
173 | 
174 | }
175 | 
176 | void indexSet::updateNumFiles(ngram_t_numfiles count){
177 |     if (!writable || !indexFile){
178 |         //TODO
179 |     }
180 |     //TODO cleanup offset
181 |     indexFile->write_at(INDEX_HEADER_SIZE - INDEX_HEADER_NUM_FILES_FIELD, (uint8_t*) (&count), INDEX_HEADER_NUM_FILES_FIELD);
182 | }
183 | 
184 | size_t indexSet::getNGramCount(uint64_t ngram){
185 |     index_entry* index_table = (index_entry*) (indexEntries + (ngram * (INDEX_ENTRY_SIZE)));
186 |     return index_table->num_files;
187 | }
188 | 
189 | ngram_t_fidtype* indexSet::getNGrams(uint64_t ngram, size_t* count){
190 |     index_entry* index_table = (index_entry*) (indexEntries + (ngram * (INDEX_ENTRY_SIZE)));
191 |     *count = index_table->num_files;
192 |     ngram_t_fidtype* ptr = (ngram_t_fidtype*) (nGramMap + index_table->offset);
193 | 
194 |     return ptr;
195 | }
196 | 


--------------------------------------------------------------------------------
/src/nGramBase.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 | Approved for Public Release; Distribution Unlimited: 13-1937
 3 | 
 4 | Copyright (c) 2014 The MITRE Corporation. All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions
 8 | are met:
 9 | 1. Redistributions of source code must retain the above copyright
10 |    notice, this list of conditions and the following disclaimer.
11 | 2. Redistributions in binary form must reproduce the above copyright
12 |    notice, this list of conditions and the following disclaimer in the
13 |    documentation and/or other materials provided with the distribution.
14 | 
15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 | ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25 | SUCH DAMAGE.
26 | 
27 | */
28 | 
29 | 
30 | #include "nGramBase.h"
31 | #include <iostream>
32 | #include <stdexcept>
33 | #include <sys/stat.h>
34 | #include <sys/types.h>
35 | #include <sys/uio.h>
36 | #include <unistd.h>
37 | #include <fcntl.h>
38 | #include <sys/mman.h>
39 | #include <errno.h>
40 | #include <string.h>
41 | 
42 | #include <libgen.h> //for dirname and basename()
43 | 
44 | using namespace snugglefish;
45 | using namespace std;
46 | 
47 | 
48 | nGramBase::nGramBase( uint32_t ngramLength, string indexFileName)
49 |     {
50 | 
51 |     if(ngramLength == 3 || ngramLength == 4){
52 |         this->ngramLength = ngramLength;
53 |         this->maxNgram = (uint64_t) (1) << (8*ngramLength);
54 |     }
55 |     else {
56 |         throw std::runtime_error("Ngram len must be 3 or 4");
57 |     }
58 | 
59 | 
60 | 
61 |     size_t pos;
62 |     string baseFileName = indexFileName;
63 |     // Check to see if the index file ends with any of the extentions we use
64 |     // And if so, remove them to get the base filename
65 |     pos = baseFileName.rfind( NGRAM_FILE_EXTENSION );
66 |     // Make sure that it is at the end of the string
67 |     if(pos == (baseFileName.size() - 6)){
68 |         baseFileName = baseFileName.substr(0, pos);
69 |     } else{
70 |         pos = baseFileName.rfind( INDEX_FILE_EXTENSION );
71 |         if(pos == (baseFileName.size() - 6)){
72 |             baseFileName = baseFileName.substr(0, pos);
73 |         } else{
74 |             pos = baseFileName.rfind( FILEID_FILE_EXTENSION );
75 |             if(pos == (baseFileName.size() - 6)){
76 |             baseFileName = baseFileName.substr(0, pos);
77 |             }
78 |         }
79 |     }
80 | 
81 | 
82 |     this->baseFileName = baseFileName;
83 | }
84 | 
85 | 


--------------------------------------------------------------------------------
/src/nGramIndex.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 | Approved for Public Release; Distribution Unlimited: 13-1937
  3 | 
  4 | Copyright (c) 2014 The MITRE Corporation. All rights reserved.
  5 | 
  6 | Redistribution and use in source and binary forms, with or without
  7 | modification, are permitted provided that the following conditions
  8 | are met:
  9 | 1. Redistributions of source code must retain the above copyright
 10 |    notice, this list of conditions and the following disclaimer.
 11 | 2. Redistributions in binary form must reproduce the above copyright
 12 |    notice, this list of conditions and the following disclaimer in the
 13 |    documentation and/or other materials provided with the distribution.
 14 | 
 15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 18 | ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 21 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 22 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 23 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 24 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 25 | SUCH DAMAGE.
 26 | 
 27 | */
 28 | 
 29 | 
 30 | #include "nGramIndex.h"
 31 | #include <iostream>
 32 | #include <stdexcept>
 33 | #include <sys/stat.h>
 34 | #include <fcntl.h>
 35 | #include <unistd.h>
 36 | #include <sys/mman.h>
 37 | #include <errno.h>
 38 | #include <string.h>
 39 | #include <libgen.h>
 40 | 
 41 | using namespace snugglefish;
 42 | using namespace std;
 43 | 
 44 | 
 45 | nGramIndex::nGramIndex( uint32_t ngramLength, string indexFileName)
 46 |     :nGramBase(ngramLength, indexFileName), bufferMax(MAX_BUFFER_SIZE),
 47 |     buffer_memory_usage(0), flush(false), flushing(false) {
 48 | 
 49 | 
 50 |     this->maxFileNameLength = DEFAULT_MAX_FILENAME_SIZE;
 51 |     
 52 | 	//allocate output buffer
 53 | 	this->output_buffer = new buffer_element[maxNgram];
 54 | 	for(uint64_t i = 0; i < maxNgram; i++){
 55 | 		this->output_buffer[i].elements_size = 0;
 56 | 		this->output_buffer[i].elements = new list<ngram_t_fidtype>;
 57 | 	}
 58 | 
 59 |     masterFile = new smFile(baseFileName, ngramLength);
 60 | 
 61 |     if (masterFile->exists()){
 62 |         masterFile->open('w');
 63 |     }else{ //Must create it
 64 |         masterFile->create(maxFileNameLength);
 65 |     }
 66 | 
 67 |     numFilesProcessed = masterFile->getNumFiles();
 68 |     numSessionFilesProcessed = 0;
 69 | }
 70 | 
 71 | nGramIndex::~nGramIndex(){
 72 |     
 73 | 	this->flushAll();
 74 | 	for(uint64_t i = 0; i < maxNgram; i++){
 75 | 		delete this->output_buffer[i].elements;
 76 | 	}
 77 | 	delete[] output_buffer;    
 78 | 
 79 |     delete masterFile;
 80 | 
 81 | }
 82 | 
 83 | 
 84 | /*  NGram Related Functions */
 85 | void nGramIndex::addNGrams(vector<uint32_t>* nGramList, string filename){
 86 |     //POSIX basename may modify argument so create a copy
 87 |     char* temp_filename = new char[filename.length() + 1];
 88 |     strncpy(temp_filename, filename.c_str(), filename.length() + 1);
 89 |     filename = basename(temp_filename);
 90 |     delete[] temp_filename;
 91 | 
 92 |     ngram_t_fidtype file_id = numFilesProcessed++;
 93 |     fileNameList.push_back(filename);
 94 | 
 95 |     // Insert ngram into the list, add the new node to the memory usage variable,
 96 |     // and check if the maximum memory has been used, and if so, indicate that the
 97 |     // nGrams should be flushed to disk
 98 |     for(uint32_t i = 0; i < nGramList->size(); i++){
 99 |         uint32_t nGram = (*nGramList)[i];
100 |         output_buffer[nGram].elements_size++;
101 |         output_buffer[nGram].elements->push_back(file_id);
102 | 
103 |         buffer_memory_usage += BUFFER_NODE_SIZE; //Add size of node
104 | 
105 |         if(buffer_memory_usage >= bufferMax){ 
106 |             flush = true;
107 |         }
108 |     }
109 | 
110 |     //We cleanup the memory
111 |     delete nGramList;
112 | 
113 |     if (flush){
114 |         flushAll();
115 |         flush = false;
116 |     }
117 | 
118 | 
119 | }
120 | 
121 | void nGramIndex::flushAll(){
122 |     flushing = true;
123 |     if(fileNameList.size()){
124 |         ngram_t_indexfcount num_files = fileNameList.size();
125 |         //By updating the master file last, this set can be queried
126 |         //while creating a new index set
127 |         flushIndex(num_files);
128 |         flushMaster();
129 | 
130 |         numSessionFilesProcessed += num_files;
131 |     }
132 |     flushing = false;
133 | }
134 | 
135 | // Flush the file names and update the number of index files
136 | void nGramIndex::flushMaster(){
137 |     //Write FileNames to File ID file
138 |     ngram_t_indexfcount num_files = (ngram_t_indexfcount) fileNameList.size();
139 | 
140 |     if (!num_files){
141 |         return;
142 |     }
143 | 
144 |     for(unsigned long i = 0; i < num_files; i++){
145 |         masterFile->addFileId(fileNameList[i].c_str());
146 |     }
147 | 
148 |     //Clear the vector
149 |     fileNameList.clear();
150 | 
151 |     //Update filid with new value
152 |     masterFile->updateIndexFileCount(masterFile->getNumIndexFiles() + 1);
153 | }
154 | 
155 | // Flush the ngrams to the index files
156 | void nGramIndex::flushIndex(ngram_t_indexfcount num_files){
157 |     //Create the files
158 |     indexSet* tIndex = new indexSet(baseFileName.c_str(), masterFile->getNumIndexFiles(), ngramLength);
159 |     tIndex->create(num_files);
160 | 
161 |     uint64_t bytes_flushed = 0;
162 | 
163 |     for(uint32_t i = 0; i < maxNgram; i++){ //iterate through every ngram
164 |         bytes_flushed += output_buffer[i].elements_size * sizeof(ngram_t_fidtype);
165 |         tIndex->addNGrams(i, this->output_buffer[i].elements);
166 |         output_buffer[i].elements_size = 0;
167 | 
168 |         if (output_buffer[i].elements->size() != 0)
169 |             cout << "Not Zero" << endl;
170 |     }
171 | 
172 |     buffer_memory_usage = 0;
173 | 
174 |     tIndex->close();
175 |     delete tIndex;
176 | }
177 | 
178 | 
179 | void nGramIndex::getStats(uint64_t& totalFiles, uint64_t& sessionFiles, uint64_t& indexFiles, bool& flushing){
180 |     totalFiles = masterFile->getNumFiles();
181 |     sessionFiles = numSessionFilesProcessed;
182 |     indexFiles = masterFile->getNumIndexFiles();
183 |     flushing = this->flushing;
184 | }
185 | 


--------------------------------------------------------------------------------
/src/nGramSearch.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 | Approved for Public Release; Distribution Unlimited: 13-1937
  3 | 
  4 | Copyright (c) 2014 The MITRE Corporation. All rights reserved.
  5 | 
  6 | Redistribution and use in source and binary forms, with or without
  7 | modification, are permitted provided that the following conditions
  8 | are met:
  9 | 1. Redistributions of source code must retain the above copyright
 10 |    notice, this list of conditions and the following disclaimer.
 11 | 2. Redistributions in binary form must reproduce the above copyright
 12 |    notice, this list of conditions and the following disclaimer in the
 13 |    documentation and/or other materials provided with the distribution.
 14 | 
 15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 18 | ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 21 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 22 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 23 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 24 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 25 | SUCH DAMAGE.
 26 | 
 27 | */
 28 | 
 29 | 
 30 | #include "nGramSearch.h"
 31 | #include <iostream>
 32 | #include <queue>
 33 | #include <stdexcept>
 34 | #include <sys/stat.h>
 35 | #include <fcntl.h>
 36 | #include <sys/mman.h>
 37 | #include <unistd.h>
 38 | #include <errno.h>
 39 | #include <string.h>
 40 | #include <utility>
 41 | #include <pthread.h>
 42 | #include "indexSet.h"
 43 | 
 44 | using namespace snugglefish;
 45 | using namespace std;
 46 | 
 47 | 
 48 | nGramSearch::nGramSearch( uint32_t ngramLength, string indexFileName)
 49 |     :nGramBase(ngramLength, indexFileName),numThreads(1) {
 50 | 
 51 |     masterFile = new smFile(baseFileName, ngramLength);
 52 | 
 53 |     if (!masterFile->exists()){
 54 |         //some error
 55 |     }else{
 56 |         masterFile->open('r');
 57 |     }
 58 | 
 59 | 
 60 |     numIndexFiles = masterFile->getNumIndexFiles();
 61 |     numFiles = masterFile->getNumFiles();
 62 | }
 63 | 
 64 | nGramSearch::nGramSearch( uint32_t ngramLength, string indexFileName, uint32_t threads)
 65 |     :nGramBase(ngramLength, indexFileName),numThreads(threads) {
 66 | 
 67 |     masterFile = new smFile(baseFileName, ngramLength);
 68 | 
 69 |     if (!masterFile->exists()){
 70 |         //some error
 71 |     }else{
 72 |         masterFile->open('r');
 73 |     }
 74 | 
 75 | 
 76 |     numIndexFiles = masterFile->getNumIndexFiles();
 77 |     numFiles = masterFile->getNumFiles();
 78 |     
 79 | }
 80 | 
// Releases the master file object allocated by the constructor.
nGramSearch::~nGramSearch(){
   delete masterFile; 
}
 84 | 
 85 | 
 86 | vector<uint64_t>* nGramSearch::stringToNGrams(string searchString){
 87 |     uint64_t nGram;
 88 |     vector <uint64_t>* ngrams = new vector<uint64_t>;
 89 | 
 90 | 
 91 |     for(size_t i = 0; i + ngramLength - 1 < searchString.length(); i++){
 92 |         nGram = 0;
 93 |         for(size_t j = 0; j < ngramLength; j++){
 94 | 	    // (1 << (8*j)) is equivalent to pow(256,j)
 95 |             nGram += (unsigned char)searchString[i+j] * (1 << (8*j));
 96 |         }
 97 |         ngrams->push_back(nGram);
 98 |     }
 99 | 
100 |     return ngrams;
101 | }
102 | 
103 | 
104 | vector<string>* nGramSearch::searchNGrams(vector<uint64_t> nGramQuery){
105 |     vector<string>* matchedFiles = new vector<string>;
106 |     pthread_t * threads;
107 |     thread_data* tdata;
108 |     void* status;
109 | 
110 |     tdata = new thread_data();
111 | 
112 |     pthread_attr_t attr;
113 |     pthread_attr_init(&attr);
114 |     pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
115 | 
116 |     pthread_mutex_init(&(tdata->queueMutex), NULL);
117 |     pthread_mutex_init(&(tdata->smFileMutex), NULL);
118 |     pthread_mutex_init(&(tdata->mfMutex), NULL);
119 | 
120 |     tdata->ngramLength = ngramLength;
121 |     tdata->queue = 0;
122 |     tdata->maximumIndex = numIndexFiles;
123 | 
124 |     tdata->masterFile = (this->masterFile);
125 |     tdata->matchedFiles = matchedFiles;
126 |     tdata->baseFileName = &baseFileName;
127 |     tdata->nGramQuery = &(nGramQuery);
128 | 
129 |     threads = (pthread_t*) malloc(numThreads * sizeof(pthread_t));
130 | 
131 |     //Create the threads
132 |     for(uint32_t i = 0; i < numThreads; i++){
133 |         pthread_create(& threads[i], &attr, searchNGramThread, (void*) (tdata));
134 |     }
135 |     
136 |     //Join on them
137 |     for(uint32_t i = 0; i < numThreads; i++){
138 |         pthread_join(threads[i], &status);
139 |     }
140 | 
141 |     pthread_mutex_destroy(&(tdata->queueMutex));
142 |     pthread_mutex_destroy(&(tdata->smFileMutex));
143 |     pthread_mutex_destroy(&(tdata->mfMutex));
144 |     pthread_attr_destroy(&attr);
145 | 
146 |     free(threads);
147 |     delete tdata;
148 |     masterFile->close();
149 |     return matchedFiles;
150 | 
151 | }
152 | 
153 | // Thread start routine of the search workers; input is the shared thread_data*.
154 | // Each worker claims index numbers from the shared queue counter until none is
155 | // left and appends the names of the files matching the query to matchedFiles.
156 | void* nGramSearch::searchNGramThread(void* input){
157 |     thread_data* shared = (thread_data*) input;
158 | 
159 |     for(;;){
160 |         // Claim the next index number; stop once every index has been taken
161 |         pthread_mutex_lock(& shared->queueMutex);
162 |         const bool exhausted = (shared->queue >= shared->maximumIndex);
163 |         uint32_t indexNumber = 0;
164 |         if (!exhausted){
165 |             indexNumber = shared->queue++;
166 |         }
167 |         pthread_mutex_unlock(& shared->queueMutex);
168 |         if (exhausted){
169 |             break;
170 |         }
171 | 
172 |         indexSet* current = new indexSet(shared->baseFileName->c_str(), indexNumber, shared->ngramLength);
173 |         current->open();
174 | 
175 |         // Query n-grams in ascending order of the number of files containing them
176 |         list< pair<uint64_t,size_t> > ordered = orderNGrams(current, *(shared->nGramQuery));
177 | 
178 |         // File ids that contain every n-gram of the query
179 |         list<ngram_t_fidtype> hits = searchAlpha((indexSet*) current, ordered);
180 | 
181 |         // Translate each file id to its file name and publish it
182 |         for(list<ngram_t_fidtype>::iterator hit = hits.begin(); hit != hits.end(); hit++){
183 |             pthread_mutex_lock(& shared->smFileMutex);
184 |             string name = shared->masterFile->getFilebyId(*hit);
185 |             pthread_mutex_unlock(& shared->smFileMutex);
186 | 
187 |             pthread_mutex_lock(& shared->mfMutex);
188 |             shared->matchedFiles->push_back(name);
189 |             pthread_mutex_unlock(& shared->mfMutex);
190 |         }
191 | 
192 |         current->close();
193 |         delete current;
194 |     }
195 | 
196 |     return NULL;
197 | }
198 | 
199 | // Pairs every query n-gram with the number of files of this index set that
200 | // contain it and returns the pairs sorted ascending by that count.
201 | //   index      - index set asked (via getNGramCount) for the per-n-gram counts
202 | //   nGramQuery - n-grams to look up, in query order
203 | // N-grams with equal counts stay in query order. If any n-gram has a count of
204 | // zero the result is an empty list.
205 | list< pair<uint64_t, size_t> > nGramSearch::orderNGrams(indexSet* index, const vector<uint64_t> & nGramQuery){
206 |     typedef pair<uint64_t, size_t> entry_t;
207 |     typedef list<entry_t>::iterator entry_iter;
208 | 
209 |     list<entry_t> ordered;
210 | 
211 |     for(vector<uint64_t>::const_iterator gram = nGramQuery.begin(); gram != nGramQuery.end(); ++gram){
212 |         const size_t fileCount = index->getNGramCount(*gram);
213 | 
214 |         if(fileCount == 0){
215 |             // No file contains this n-gram, so the query as a whole has no
216 |             // match here: hand back an empty list
217 |             ordered.clear();
218 |             return ordered;
219 |         }
220 | 
221 |         // Find the first entry with a strictly larger count. Stopping there
222 |         // keeps equal counts in query order; reaching end() turns the insert
223 |         // below into an append (this also covers the empty list).
224 |         entry_iter slot = ordered.begin();
225 |         for(; slot != ordered.end(); ++slot){
226 |             if(slot->second > fileCount){
227 |                 break;
228 |             }
229 |         }
230 | 
231 |         ordered.insert(slot, entry_t(*gram, fileCount));
232 |     }
233 | 
234 |     return ordered;
235 | 
236 | }
237 | 
238 | 
239 | // Intersects the file-id lists of all n-grams in queryList and returns the
240 | // ids present in every one of them.
241 | //   index     - index set the per-n-gram id lists are read from
242 | //   queryList - n-grams to intersect; its front element is removed here
243 | // Returns an empty list when queryList is empty.
244 | // NOTE(review): the merge assumes every list from getNGrams is sorted
245 | // ascending -- not verifiable from this file, confirm against indexSet.
246 | // NOTE(review): ownership of the arrays returned by getNGrams is unknown here.
247 | list<ngram_t_fidtype> nGramSearch::searchAlpha(indexSet* index, list< pair<uint64_t,size_t> > &queryList){
248 |     list<ngram_t_fidtype> matchedIds;
249 | 
250 |     if(queryList.empty()){
251 |         return matchedIds;
252 |     }
253 | 
254 |     // Seed the candidates with every file id of the first n-gram.
255 |     // The counts are real size_t objects: getNGrams is handed a size_t*, and
256 |     // aiming that at an ngram_t_numfiles through a cast lets it write past
257 |     // the variable whenever that type is narrower than size_t.
258 |     size_t seedCount = 0;
259 |     ngram_t_fidtype* seedIds = index->getNGrams((uint64_t) queryList.front().first, &seedCount);
260 |     for (size_t j = 0; j < seedCount; j++){
261 |         matchedIds.push_back(seedIds[j]);
262 |     }
263 | 
264 |     //Dont need the front element anymore so get rid of it
265 |     queryList.pop_front();
266 | 
267 |     // Whittle the candidates down with every remaining n-gram; once no
268 |     // candidate is left the other lists cannot add any, so stop early
269 |     for(list< pair<uint64_t, size_t> >::iterator it = queryList.begin();
270 |             it != queryList.end() && !matchedIds.empty(); ++it){
271 | 
272 |         size_t idCount = 0;
273 |         ngram_t_fidtype* ids = index->getNGrams((uint64_t) it->first, &idCount);
274 |         size_t cursor = 0;
275 | 
276 |         list<ngram_t_fidtype>::iterator ft = matchedIds.begin();
277 |         while(ft != matchedIds.end()){
278 |             // Skip ids smaller than the candidate. The cursor never moves
279 |             // back: an id larger than the candidate stays where it is so
280 |             // it is compared again with the next candidate.
281 |             while(cursor < idCount && ids[cursor] < (*ft)){
282 |                 cursor++;
283 |             }
284 | 
285 |             if(cursor < idCount && ids[cursor] == (*ft)){
286 |                 // present in this list as well: keep it
287 |                 ++ft;
288 |             }else{
289 |                 // erase() hands back the element after the removed one
290 |                 ft = matchedIds.erase(ft);
291 |             }
292 |         }
293 |     }
294 | 
295 |     return matchedIds;
296 | }
297 | 


--------------------------------------------------------------------------------
/src/smFile.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 | Approved for Public Release; Distribution Unlimited: 13-1937
  3 | 
  4 | Copyright (c) 2014 The MITRE Corporation. All rights reserved.
  5 | 
  6 | Redistribution and use in source and binary forms, with or without
  7 | modification, are permitted provided that the following conditions
  8 | are met:
  9 | 1. Redistributions of source code must retain the above copyright
 10 |    notice, this list of conditions and the following disclaimer.
 11 | 2. Redistributions in binary form must reproduce the above copyright
 12 |    notice, this list of conditions and the following disclaimer in the
 13 |    documentation and/or other materials provided with the distribution.
 14 | 
 15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 18 | ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 21 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 22 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 23 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 24 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 25 | SUCH DAMAGE.
 26 | 
 27 | */
 28 | 
 29 | 
 30 | #include "smFile.h"
 31 | #include <cstring>
 32 | #include <string>
 33 | #include <cstdlib>
 34 | #include <cerrno>
 35 | #include <stdexcept>
 36 | #include <iostream>
 37 | #include <errno.h>
 38 | 
 39 | 
 40 | using namespace snugglefish;
 41 | using namespace std;
 42 | 
 43 | 
 44 | // Hands fileBase + FILEID_FILE_EXTENSION to the file base class and stores the n-gram size.
 45 | smFile::smFile(string fileBase, uint8_t nGramSize)
 46 | :file(fileBase.append(FILEID_FILE_EXTENSION).c_str()),
 47 |  ngramLength(static_cast<ngram_t_size>(nGramSize))
 48 | {}
 49 | 
 50 | smFile::~smFile(){
 51 |     this->flush(); // returns immediately for read-only files
 52 | }
 53 | 
 54 | // Flushes the underlying file, then rewrites the file-count header field with
 55 | // the current numFiles. Does nothing for read-only files. Always returns true.
 56 | bool smFile::flush(){
 57 |     if (!this->readonly){
 58 |         file::flush();
 59 |         write_at((int32_t) FILID_NUM_FILES_OFFSET, (uint8_t*) &numFiles, (size_t) NUM_FILES_FIELD);
 60 |     }
 61 | 
 62 |     return true;
 63 | }
 64 | 
 65 | // Creates the file-id file, writes its header with zeroed index-file and file
 66 | // counts and allocates fileBuffer. Throws runtime_error if the file exists.
 67 | void smFile::create(ngram_t_fnlength maxfnLength){
 68 |     if (this->exists()){
 69 |         cerr << "Unable to create file: " << this->filename << " -- Already Exists" << endl;
 70 |         throw runtime_error("Creating File");
 71 |     }
 72 | 
 73 |     file::create();
 74 | 
 75 |     endian_check = ENDIAN_CHECK;
 76 |     version = VERSION;
 77 |     maxFileNameLength = maxfnLength;
 78 |     numIndexFiles = 0;
 79 |     numFiles = 0;
 80 | 
 81 |     // Header fields in on-disk order; both counts start out as zero
 82 |     ngram_t_indexcount noIndexFiles = 0;
 83 |     ngram_t_fidtype noFiles = 0;
 84 |     write((uint8_t*) &endian_check, ENDIAN_CHECK_FIELD);
 85 |     write((uint8_t*) &version, VERSION_FIELD);
 86 |     write((uint8_t*) &ngramLength, NGRAM_SIZE_FIELD);
 87 |     write((uint8_t*) &maxFileNameLength, MAX_FILENAME_LENGTH_FIELD);
 88 |     write((uint8_t*) &noIndexFiles, NUM_INDEX_FILES_FIELD);
 89 |     write((uint8_t*) &noFiles, NUM_FILES_FIELD);
 90 |     fileBuffer = (char*) malloc((maxFileNameLength + 1)* sizeof(char));
 91 | }
 92 | 
 93 | // Opens the file-id file in the given mode, loads its header and allocates
 94 | // fileBuffer. Throws runtime_error on an endian or n-gram length mismatch.
 95 | void smFile::open(char readwrite){
 96 |     file::open(readwrite);
 97 | 
 98 |     read((uint8_t*) &endian_check, ENDIAN_CHECK_FIELD);
 99 |     if (ENDIAN_CHECK != this->endian_check){
100 |        throw runtime_error("Endian Mismatch"); 
101 |     }
102 | 
103 |     read((uint8_t*) &version, VERSION_FIELD);
104 | 
105 |     // The stored n-gram length must equal the one this object was built with
106 |     ngram_t_size storedLength;
107 |     read((uint8_t*) &storedLength, NGRAM_SIZE_FIELD);
108 |     if (storedLength != this->ngramLength){
109 |        throw runtime_error("N Gram Length Mismatch"); 
110 |     }
111 | 
112 |     read((uint8_t*) &maxFileNameLength, MAX_FILENAME_LENGTH_FIELD);
113 |     read((uint8_t*) &numIndexFiles, NUM_INDEX_FILES_FIELD);
114 |     read((uint8_t*) &numFiles, NUM_FILES_FIELD);
115 | 
116 |     fileBuffer = (char*) malloc((maxFileNameLength + 1)* sizeof(char));
117 | }
118 | 
119 | 
120 | // Appends one fixed-width file name record of maxFileNameLength chars and counts the file.
121 | void smFile::addFileId(const char * fileName){
122 |     const size_t recordBytes = maxFileNameLength * sizeof(char);
123 |     strncpy(fileBuffer, fileName, maxFileNameLength);
124 |     write((uint8_t*) fileBuffer, recordBytes);
125 |     numFiles++; // the count in the file header is only written by flush()
126 | }
127 | // Returns the name recorded for file id `id`; it lives in the shared fileBuffer and is overwritten by the next call.
128 | const char* smFile::getFilebyId(uint64_t id){
129 |     read_at(FILID_HEADER_SIZE + (id * maxFileNameLength * sizeof(char)), (uint8_t*) fileBuffer, maxFileNameLength * sizeof(char)); 
130 |     fileBuffer[maxFileNameLength] = '\0'; // the spare byte of the malloc'd buffer is never written elsewhere; terminate full-length names
131 |     return fileBuffer;
132 | }
133 | void smFile::updateIndexFileCount(ngram_t_indexcount count){
134 |     write_at(static_cast<int32_t>(FILID_NUM_INDEX_OFFSET), reinterpret_cast<uint8_t*>(&count), static_cast<size_t>(NUM_INDEX_FILES_FIELD));
135 |     numIndexFiles = count; // cached copy of the header field written above
136 | }
137 | 


--------------------------------------------------------------------------------
/src/snugglefish.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 | Approved for Public Release; Distribution Unlimited: 13-1937
  3 | 
  4 | Copyright (c) 2014 The MITRE Corporation. All rights reserved.
  5 | 
  6 | Redistribution and use in source and binary forms, with or without
  7 | modification, are permitted provided that the following conditions
  8 | are met:
  9 | 1. Redistributions of source code must retain the above copyright
 10 |    notice, this list of conditions and the following disclaimer.
 11 | 2. Redistributions in binary form must reproduce the above copyright
 12 |    notice, this list of conditions and the following disclaimer in the
 13 |    documentation and/or other materials provided with the distribution.
 14 | 
 15 | THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 16 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 18 | ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 21 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 22 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 23 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 24 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 25 | SUCH DAMAGE.
 26 | 
 27 | */
 28 | 
 29 | 
 30 | // Snugglefish.cpp
 31 | // Sample NGram Fast Indexer and Search (SNGFIS)
 32 | // Allows for the indexing and searching of a large amount of samples in a short 
 33 | // period of time.
 34 | 
 35 | #include <nGramSearch.h>
 36 | #include <nGramIndex.h>
 37 | #include <fileIndexer.h>
 38 | #include <snugglefish.h>
 39 | #include <iostream>
 40 | #include <sstream>
 41 | #include <getopt.h>
 42 | #include <execinfo.h>
 43 | #include <signal.h>
 44 | #include <stdlib.h>
 45 | 
 46 | #include <cerrno>
 47 | #include <unistd.h>
 48 | #include <cstring>
 49 | 
 50 | #include <utility>
 51 | #include <pthread.h>
 52 | 
 53 | #include "common.h"
 54 | 
 55 | using namespace std;
 56 | using namespace snugglefish;
 57 | 
 58 | #define NGRAM_SIZE_OPTION_LONG "ngramsize"
 59 | #define NGRAM_SIZE_OPTION_SHORT 'n'
 60 | #define HELP_OPTION_LONG "help"
 61 | #define HELP_OPTION_SHORT 'h'
 62 | #define INDEX_OPTION_SHORT 'i'
 63 | #define INDEX_OPTION_LONG "index"
 64 | #define SEARCH_OPTION_SHORT 's'
 65 | #define SEARCH_OPTION_LONG "search"
 66 | #define OUTPUT_OPTION_SHORT 'o'
 67 | #define OUTPUT_OPTION_LONG "output"
 68 | #define FILE_OPTION_SHORT 'f'
 69 | #define FILE_OPTION_LONG "file"
 70 | #define NODE_BUFFER_OPTION_LONG "node_bound"
 71 | #define NODE_BUFFER_OPTION_SHORT 'b'
 72 | #define THREADS_OPTION_LONG "threads"
 73 | #define THREADS_OPTION_SHORT 't'
 74 | #define SHORT_OPTIONS_STRING "n:hsio:f:b:t:"
 75 | 
 76 | 
 77 | // Number of processors currently online according to sysconf; reports the
 78 | // failure on stderr and falls back to 1 when the query yields less than 1.
 79 | uint32_t cpu_count(){
 80 |     //This only works on some *nixes TODO figure out which systems don't support this call
 81 |     const long online = sysconf(_SC_NPROCESSORS_ONLN);
 82 |     if (online >= 1){
 83 |         return (uint32_t) online;
 84 |     }
 85 |     cerr << "Unable to get CPU count, defaulting to 1 -- Error: " 
 86 |          << strerror(errno) << endl;
 87 |     return 1;
 88 | }
 89 | 
 90 | 
 91 | 
 92 | // Prints the signal number and up to 10 stack frames to stderr, then exits
 93 | // with status 1.
 94 | void handler(int sig) {
 95 |   enum { MAX_FRAMES = 10 };
 96 |   void *frames[MAX_FRAMES];
 97 |   // collect the return addresses of the current call stack
 98 |   const int depth = backtrace(frames, MAX_FRAMES);
 99 | 
100 |   fprintf(stderr, "Error: signal %d:\n", sig);
101 |   backtrace_symbols_fd(frames, depth, 2); // 2 is the stderr file descriptor
102 |   exit(1);
103 | }
104 | 
105 | void printHelp();
106 | 
107 | 
108 | // Command line entry point: indexes (-i) the files given as arguments or on stdin,
109 | // or searches (-s) an index for a string. Always returns 0, even on usage errors.
110 | int main(int argc, char *argv[]){
111 |     int c, option_index = 0;
112 |     uint32_t ngramSize = 3;
113 |     uint32_t max_files = 0;
114 |     uint64_t max_buffer = 0;
115 |     uint32_t threads = 0;
116 | 
117 |     signal(SIGSEGV, handler); 
118 | 
119 |     string indexFileName = "", searchString;
120 |     vector <string> fileList;
121 |     static struct option long_options[] ={
122 |         // Ngramsize defaults to 3 if not present
123 |         {INDEX_OPTION_LONG, no_argument, 0, INDEX_OPTION_SHORT},
124 |         {SEARCH_OPTION_LONG, no_argument, 0, SEARCH_OPTION_SHORT},
125 |         {FILE_OPTION_LONG, required_argument, 0, FILE_OPTION_SHORT},
126 |         {OUTPUT_OPTION_LONG, required_argument, 0, OUTPUT_OPTION_SHORT},
127 |         {NGRAM_SIZE_OPTION_LONG, required_argument, 0, NGRAM_SIZE_OPTION_SHORT},
128 |         {HELP_OPTION_LONG, no_argument, 0, HELP_OPTION_SHORT},
129 |         {THREADS_OPTION_LONG, required_argument, 0, THREADS_OPTION_SHORT},
130 |         // -b is in SHORT_OPTIONS_STRING, so its long form has to be listed as well
131 |         {NODE_BUFFER_OPTION_LONG, required_argument, 0, NODE_BUFFER_OPTION_SHORT},
132 |         {0,0,0,0}
133 |     };
134 |     bool indexFlag = false, searchFlag = false;
135 | 
136 |     // Default threads to the number of cpus
137 |     threads = cpu_count();
138 | 
139 |     // loop over all of the options
140 |     while ((c = getopt_long(argc, argv, SHORT_OPTIONS_STRING, long_options, &option_index)) != -1){
141 |         // long options are mapped onto their short option character
142 |         switch (c){
143 |             // short option 't'
144 |             case THREADS_OPTION_SHORT:
145 |                 {
146 |                     istringstream tss(optarg);
147 |                     uint32_t thr = 0;
148 | 
149 |                     if(!(tss >> thr)){
150 |                         cout << "Invalid number of threads, please enter an integer" << endl;
151 |                         return 0;
152 |                     }
153 |                     // Without workers the index/search queues are never drained
154 |                     if(thr == 0){
155 |                         cout << "Number of threads must be at least 1" << endl;
156 |                         return 0;
157 |                     }
158 |                     threads = thr;
159 |                     break;
160 |                 }
161 |             case SEARCH_OPTION_SHORT:
162 |                 searchFlag = true;
163 |                 break;
164 |             case INDEX_OPTION_SHORT:
165 |                 indexFlag = true;
166 |                 break;
167 |                 // Output and file options are the same
168 |             case OUTPUT_OPTION_SHORT:
169 |             case FILE_OPTION_SHORT:
170 |                 indexFileName = optarg;
171 |                 break;
172 |             case HELP_OPTION_SHORT:
173 |                 printHelp();
174 |                 return 0;
175 |                 // short option 'n'
176 |             case NGRAM_SIZE_OPTION_SHORT:
177 |                 {
178 |                     istringstream ss(optarg);
179 |                     if(!(ss >> ngramSize)){
180 |                         cout << "Invalid ngram size, please enter an integer" << endl;
181 |                         return 0;
182 |                     }
183 |                     if(ngramSize < 3 || ngramSize > 4){
184 |                         cout << "Ngram size must be 3 or 4. Ngram size is " << ngramSize << endl;
185 |                         return 0;
186 |                     }
187 |                     break;
188 |                 }
189 |             // short option 'b', the size is given in gigabytes
190 |             case NODE_BUFFER_OPTION_SHORT:
191 |                 {
192 |                     istringstream ss(optarg);
193 |                     uint64_t tmax_buffer = 0;
194 |                     if(!(ss >> tmax_buffer)){
195 |                         cout << "Invalid maxmimum node buffer, please enter an integer" << endl;
196 |                         return 0;
197 |                     }
198 | 
199 |                     if(tmax_buffer < 4){
200 |                         cout << "Node buffer size must be at least 4GB in size" << endl;
201 |                         return 0;
202 |                     }
203 | 
204 |                     max_buffer = tmax_buffer * 1073741824; //in Gigabytes 
205 |                     break;
206 |                 }
207 |             // unknown options have already been reported by getopt_long
208 |             default:
209 |                 break;
210 |         }
211 |     }
212 |     /* exactly one of -i / -s and an index file name are required */
213 |     if((!indexFlag && !searchFlag)){
214 |         cout << "Must specify -i for index or -s for search" << endl;
215 |         return 0;
216 |     }
217 | 
218 |     if((indexFlag && searchFlag)){
219 |         cout << "Must specify only one of -i or -s" << endl;
220 |         return 0;
221 |     }
222 |     if(indexFileName.empty()){
223 |         cout << "Index file necessary" << endl;
224 |         return 0;
225 |     }
226 | 
227 |     if(indexFlag){
228 |         // the remaining arguments are the files to add
229 |         for(; optind < argc; optind++){
230 |             fileList.push_back(argv[optind]);
231 |         }
232 |         // if fileList is empty, it means we need to get the
233 |         // filenames from stdin (one per line, up to the first empty line)
234 |         if(fileList.empty()){
235 |             string fileName;
236 |             while(cin){
237 |                 getline(cin,fileName);
238 |                 if(fileName.empty())
239 |                     break;
240 |                 fileList.push_back(fileName);
241 |             }
242 |         }
243 |         //Eventually options should be sent as a structure
244 |         make_index(indexFileName, fileList, ngramSize, max_files, max_buffer, threads);
245 |     } else {
246 |         // searchFlag is set here: get the string to search for
247 |         if (optind < argc) {
248 |             searchString = argv[optind];
249 |         } else {
250 |             cin >> searchString;
251 |         }
252 |         if (searchString.size() < ngramSize){
253 |             cout << "Search string size is smaller than Ngram size, the search string must be greater than or equal to the ngram size" << endl;
254 |         } else {
255 |             vector<string>* found = search(indexFileName, searchString, ngramSize, threads);
256 |             for(uint32_t i = 0; i < found->size(); i++){
257 |                 cout << (*found)[i] << endl;
258 |             }
259 |             delete found;
260 |         }
261 |     }
262 | 
263 |     return 0;
264 | }
265 | 
266 | // Writes the usage summary, the option list and two example invocations to
267 | // stdout. Every line is terminated with endl.
268 | void printHelp(){
269 |     cout << "Usage: snugglefish [OPTIONS]" << endl
270 |          << "Index files based on Ngrams then Search for a string" << endl
271 |          << "-i, --index            Index Operation, requires -o, and files to index" << endl
272 |          << "-s, --search           Search Operation, requires -f, and search string" << endl
273 |          << "-o, --output           Specifies the output file for indexing, equivalent to -f" << endl
274 |          << "-f, --file             Index file to search, equivalent to -o" << endl
275 |          << "-b, --node_bound       Maximum node buffer memory size before flushing" << endl
276 |          << "-n, --ngramsize        The size of Ngram to use (default is 3)" << endl
277 |          << "-h, --help             This help screen" << endl
278 |          << "-t, --threads          Number of search threads to spawn (default is #cpus)" << endl
279 |          << "Examples:" << endl
280 |          << "Index: snugglefish [-n ngramsize]  -i -o <index filename> <files to index>" << endl
281 |          << "If no file names are given on the commandline, the stdin will be used" << endl
282 |          << "Search: snugglefish [-n ngramsize] -s -f <index filename> <search string>" << endl
283 |          << "If no string is given, it can be entered on the command line" << endl;
284 | }
285 | 
286 | 
287 | // Rewrites the progress line on stderr: percentage, processed/listsize and a
288 | // "(Flushing ... )" marker while the indexer reports that it is flushing.
289 | void printStats(nGramIndex* indexer, uint64_t processed, uint64_t listsize){
290 |     uint64_t total = 0, session = 0, indexes = 0;
291 |     bool flushing = false;
292 |     indexer->getStats(total, session, indexes, flushing);
293 | 
294 |     // An empty file list would compute 0/0; converting the resulting NaN to
295 |     // an integer is undefined, so report 0% in that case
296 |     uint64_t percent = 0;
297 |     if (listsize != 0){
298 |         percent = ((double) processed / listsize) * 100;
299 |     }
300 | 
301 |     cerr << "\r";
302 |     cerr << "Processed: " << percent << "% -- " << processed << "/" << listsize;
303 |     cerr << (flushing ? " (Flushing ... )" : "                "); // blanks overwrite a stale marker
304 | }
305 | 
306 | // Thread start routine of the indexing workers; input is the shared mi_data*. Returns 0.
307 | void* indexerThread(void* input){
308 |     mi_data* shared = (mi_data*) input;
309 |     nGramIndex* sink = (nGramIndex*) shared->ngramindex;
310 |     fileIndexer indexer(shared->ngramSize);
311 | 
312 |     for(;;){
313 |         pthread_mutex_lock(& shared->filesMutex);
314 |         if (shared->fileList->size() <= shared->queue){
315 |             pthread_mutex_unlock(& shared->filesMutex);
316 |             break;
317 |         }
318 |         const uint32_t slot = shared->queue++; // this worker's position in the file list
319 |         pthread_mutex_unlock(& shared->filesMutex); 
320 |         try{
321 |             vector<uint32_t>* grams = indexer.processFile((*(shared->fileList))[slot].c_str());
322 |             if(grams != 0){
323 |                 pthread_mutex_lock(& shared->nGramIndexMutex);
324 |                 sink->addNGrams(grams, (*(shared->fileList))[slot]);
325 |                 pthread_mutex_unlock(& shared->nGramIndexMutex);
326 |             }
327 |         }catch(exception& e){
328 |             cout << "Error in thread:" << e.what() << endl;
329 |             handler(SIGSEGV);
330 |         }
331 |     }
332 |     return 0;
333 | }
334 | 
335 | // Indexes fileNames into the index indexFileName with `threads` workers (0 is
336 | // treated as 1). A non-zero max_buffer is passed to setmaxBufferSize; max_files
337 | // is unused. Exceptions are reported on stdout and end the process via handler().
338 | void make_index(string indexFileName, vector <string> fileNames, uint32_t ngramSize, uint32_t max_files, uint64_t max_buffer, uint32_t threads){
339 |     // Without workers the queue is never drained and the progress loop never ends
340 |     if (threads == 0){
341 |         threads = 1;
342 |     }
343 | 
344 |     void* status;
345 |     mi_data* midata = new mi_data;
346 |     vector<pthread_t> indexers(threads);
347 |     pthread_attr_t attr;
348 |     pthread_attr_init(&attr);
349 |     pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
350 | 
351 |     pthread_mutex_init(& (midata->filesMutex), NULL);
352 |     pthread_mutex_init(& (midata->nGramIndexMutex), NULL);
353 |     midata->fileList = & fileNames;
354 |     midata->ngramSize = ngramSize;
355 |     midata->queue = 0;
356 | 
357 |     try{
358 |         nGramIndex ngramindex(ngramSize, indexFileName);
359 |         if (max_buffer > 0){
360 |             ngramindex.setmaxBufferSize(max_buffer);
361 |         }
362 |         midata->ngramindex = &ngramindex;
363 |         // Handles are stored densely: only threads that really started are joined
364 |         uint32_t started = 0;
365 |         for(uint32_t i = 0; i < threads; i++){
366 |             if (pthread_create(& indexers[started], & attr, indexerThread, (void*) midata) == 0){
367 |                 started++;
368 |             }
369 |         }
370 |         if (started == 0 && !fileNames.empty()){
371 |             cout << "Error:" << "Unable to start any indexer thread" << endl;
372 |             handler(SIGSEGV);
373 |         }
374 | 
375 |         while(1){
376 |             //Usage of mutex shouldn't matter
377 |             if (midata->queue >= fileNames.size()){
378 |                 break;
379 |             }
380 |             printStats(&ngramindex, midata->queue, fileNames.size());
381 |             sleep(1);
382 |         }
383 | 
384 |         for(uint32_t i = 0; i < started; i++){
385 |             pthread_join(indexers[i], &status);
386 |         }
387 | 
388 |         //Print some final stats
389 |         printStats(&ngramindex, midata->queue, fileNames.size());
390 |         cerr << endl;
391 |     } catch (exception& e){
392 |         cout << "Error:" << e.what() << endl;
393 |         handler(SIGSEGV);
394 |     }
395 | 
396 |     pthread_mutex_destroy(&(midata->filesMutex));
397 |     pthread_mutex_destroy(&(midata->nGramIndexMutex));
398 |     pthread_attr_destroy(&attr);
399 |     delete midata;
400 | }
401 | 
402 | 
403 | // Searches the index indexFileName for searchString using `threads` workers.
404 | // Returns the matching file names; the caller deletes the returned vector.
405 | vector<string>* search(string indexFileName, string searchString, uint32_t ngramSize, uint32_t threads){
406 |     nGramSearch searcher(ngramSize, indexFileName, threads);
407 |     vector<uint64_t>* queryGrams = searcher.stringToNGrams(searchString);
408 |     vector<string>* matches = searcher.searchNGrams(*queryGrams);
409 |     delete queryGrams;
410 |     return matches;
411 | }
412 | 


--------------------------------------------------------------------------------