├── .gitignore
├── LICENSE
├── README.md
├── attic
│   ├── Vagrantfile
│   ├── bn_plot.py
│   ├── old-config.txt
│   ├── old_graphlab_ref_code
│   │   └── bcnlp_tm.py
│   └── provision
│       └── bootstrap.sh
├── bcnlp_fxtract.py
├── bcnlp_listfiles.py
├── bcnlp_tm.py
├── bn_filextract.py
├── config.txt
├── disk_images
│   ├── fourpartusb1.E01
│   └── govdocs45sampler.E01
├── externals
│   ├── README.md
│   ├── libewf-20140608.tar.gz
│   └── libuna-alpha-20150927.tar.gz
└── setup.sh

/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other info into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *.cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 | 
60 | # Scrapy stuff:
61 | .scrapy
62 | 
63 | # Sphinx documentation
64 | docs/_build/
65 | 
66 | # PyBuilder
67 | target/
68 | 
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 | 
72 | # pyenv
73 | .python-version
74 | 
75 | # celery beat schedule file
76 | celerybeat-schedule
77 | 
78 | # dotenv
79 | .env
80 | 
81 | # virtualenv
82 | venv/
83 | ENV/
84 | 
85 | # Spyder project settings
86 | .spyderproject
87 | 
88 | # Rope project settings
89 | .ropeproject
90 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | GNU LESSER GENERAL PUBLIC LICENSE
 2 | Version 3, 29 June 2007
 3 | 
 4 | Copyright (C) 2007 Free Software Foundation, Inc.
 5 | Everyone is permitted to copy and distribute verbatim copies
 6 | of this license document, but changing it is not allowed.
 7 | 
 8 | 
 9 | This version of the GNU Lesser General Public License incorporates
10 | the terms and conditions of version 3 of the GNU General Public
11 | License, supplemented by the additional permissions listed below.
12 | 
13 | 0. Additional Definitions.
14 | 
15 | As used herein, "this License" refers to version 3 of the GNU Lesser
16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU
17 | General Public License.
18 | 
19 | "The Library" refers to a covered work governed by this License,
20 | other than an Application or a Combined Work as defined below.
21 | 
22 | An "Application" is any work that makes use of an interface provided
23 | by the Library, but which is not otherwise based on the Library.
24 | Defining a subclass of a class defined by the Library is deemed a mode
25 | of using an interface provided by the Library.
26 | 
27 | A "Combined Work" is a work produced by combining or linking an
28 | Application with the Library. The particular version of the Library
29 | with which the Combined Work was made is also called the "Linked
30 | Version".
31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 
98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 
160 | 
161 | If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
166 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ![Logo](https://github.com/BitCurator/bitcurator.github.io/blob/main/logos/BitCurator-Basic-400px.png)
 2 | 
 3 | # bitcurator-nlp-gentm
 4 | 
 5 | [![GitHub issues](https://img.shields.io/github/issues/bitcurator/bitcurator-nlp-gentm.svg)](https://github.com/bitcurator/bitcurator-nlp-gentm/issues)
 6 | [![GitHub forks](https://img.shields.io/github/forks/bitcurator/bitcurator-nlp-gentm.svg)](https://github.com/bitcurator/bitcurator-nlp-gentm/network)
 7 | [![Twitter Follow](https://img.shields.io/twitter/follow/bitcurator.svg?style=social&label=Follow)](https://twitter.com/bitcurator)
 8 | 
 9 | # END-OF-LIFE (EOL) NOTICE
10 | 
11 | This research software has reached end-of-life. The code in this repository is no longer actively maintained or supported.
12 | 
13 | ## About
14 | 
15 | Generate topic models using open text automatically extracted from various file formats in disk images. This project uses The Sleuth Kit (https://github.com/sleuthkit/sleuthkit) to parse file systems in disk images, textract (https://textract.readthedocs.io/en/stable/) to extract text from common file formats, gensim (https://radimrehurek.com/gensim/) to generate topic models, and pyLDAvis (https://github.com/bmabey/pyLDAvis) for visualization.
16 | 
17 | ## Setup and Installation
18 | 
19 | The topic model generation tool depends on a number of external natural language processing and digital forensics libraries. For convenience, we have included a script that will install all the required dependencies on Ubuntu 18.04 LTS. This script installs certain tools (TSK, libewf, and several others) by compiling and installing them from source.
20 | 
21 | On an Ubuntu host or in a clean virtual machine, first make sure you have git installed:
22 | 
23 | * Open a terminal and install git using apt:
24 | ```shell
25 | $ sudo apt-get install git
26 | ```
27 | 
28 | Next, follow these steps:
29 | 
30 | * Clone this repository:
31 | ```shell
32 | $ git clone https://github.com/bitcurator/bitcurator-nlp-gentm
33 | ```
34 | 
35 | * Change directory into the repository:
36 | ```shell
37 | $ cd bitcurator-nlp-gentm
38 | ```
39 | 
40 | * Run the setup shell script to install and configure the required software (various dependencies, TSK, textract, and gensim). Note that this may take some time (**typically 10-15 minutes**).
41 | ```shell
42 | $ sudo ./setup.sh
43 | ```
44 | 
45 | ## Disk Image Selection and Configuration
46 | 
47 | This repository includes a sample Expert Witness Format disk image (**govdocs45sampler.E01**) in the **disk_images** directory. If you do not make any changes to the configuration file, the topic modeler and visualization tool will be run on text extracted from files discovered in this image.
48 | 
49 | To run the tool against other disk images (EWF or raw), simply copy those images into the **disk_images** directory and edit the **[image_section]** of the configuration file (**config.txt**) to include the relevant files.
For example, if you had two images named **testimage1.E01** and **testimage2.dd**, the section would be modified as follows:
50 | 
51 | ```shell
52 | # Disk images to process (the default location can be changed in the following section)
53 | [image_section]
54 | testimage1.E01 = 1
55 | testimage2.dd = 1
56 | ```
57 | 
58 | ## Running the Tool
59 | 
60 | Run the following command to extract text from the configured file types, start the topic modeling tool, and load the results into a browser window.
61 | 
62 | ```shell
63 | $ python bcnlp_tm.py
64 | ```
65 | 
66 | * Depending on the size of your corpus, this may take some time. You will see a range of log output and (possibly) deprecation warnings related to the operation of gensim and other tools. The tool is operating normally unless it drops back to a terminal prompt with an error.
67 | 
68 | * The results, based on the text extracted from your specified file types and processed using pyLDAvis, will appear automatically in a browser window. When finished viewing, you can terminate the server in the existing terminal by typing "Ctrl-X" followed by "Ctrl-C".
69 | 
70 | Additional adjustments can be performed with command-line flags:
71 | 
72 | * --topics: number of topics (default 10)
73 | * --tm: topic modeling tool (default gensim; the graphlab option is disabled due to licensing restrictions)
74 | * --infile: file source; if the --infile option is not used, the disk image(s) listed in the configuration
75 | file will be extracted. Use --infile to specify a directory instead.
76 | * --config: configuration file (default **config.txt** in the main directory); specify a file path to use an alternate configuration file
77 | 
78 | ```shell
79 | Usage: python bcnlp_tm.py [--topics <10>] [--tm <gensim>] [--infile <directory>] [--config <config-file>]
80 | ```
81 | 
82 | ## Documentation
83 | 
84 | Additional project information can be found on the BitCurator NLP wiki at https://github.com/BitCurator/bitcurator-nlp/wiki.
85 | 
86 | ## License(s)
87 | 
88 | The BitCurator logo, BitCurator project documentation, and other non-software products of the BitCurator team are subject to the Creative Commons Attribution 4.0 International license (CC BY 4.0).
89 | 
90 | Unless otherwise indicated, software items in this repository are distributed under the terms of the GNU Lesser General Public License, Version 3. See the text file "COPYING" for further details about the terms of this license.
91 | 
92 | In addition to software produced by the BitCurator team, BitCurator packages and modifies open source software produced by other developers. Licenses and attributions are retained here where applicable.
93 | 
94 | ## Additional Notes
95 | 
96 | If your Ubuntu VM does not already have a desktop (graphical UI), you will need to install one in order to view the results in a browser:
97 | 
98 | ```shell
99 | $ sudo apt-get update
100 | $ sudo apt-get install ubuntu-desktop
101 | ```
102 | 
--------------------------------------------------------------------------------
/attic/Vagrantfile:
--------------------------------------------------------------------------------
 1 | # -*- mode: ruby -*-
 2 | # vi: set ft=ruby :
 3 | 
 4 | # Vagrantfile API/syntax version. Don't touch unless you know what you're doing!
 5 | VAGRANTFILE_API_VERSION = "2"
 6 | 
 7 | Vagrant.configure(VAGRANTFILE_API_VERSION) do |config|
 8 | 
 9 |   # Note: Build is tested with Ubuntu 17.04, but should also work with Ubuntu 16.04
10 |   config.vm.box = "bento/ubuntu-17.04"
11 | 
12 |   # Run the provisioning script
13 |   config.vm.provision :shell, :path => "./provision/bootstrap.sh"
14 | 
15 |   # Configure synced folder - uncomment the following line to enable
16 |   # config.vm.synced_folder "", "/vagrant"
17 | 
18 |   # Forward guest HTTP (80) to host port 8080
19 |   # Forward port 8888 (used by the gensim/pyLDAvis visualization server)
20 |   config.vm.network :forwarded_port, :host => 8080, :guest => 80
21 |   config.vm.network :forwarded_port, :host => 8888, :guest => 8888
22 | 
23 |   # Use VirtualBox as the provider. Default specs are 4GB RAM, 2 procs
24 |   # Increase vb.memory and vb.cpus for better performance
25 |   config.vm.provider :virtualbox do |vb|
26 |     vb.name = "nlp-webtools-0.0.1"
27 |     vb.memory = 4096
28 |     vb.cpus = 2
29 |   end
30 | end
31 | 
--------------------------------------------------------------------------------
/attic/bn_plot.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | # coding=UTF-8
  3 | #
  4 | # BitCurator NLP (Disk Image Access for the Web)
  5 | # Copyright (C) 2016 - 2018
  6 | # All rights reserved.
  7 | #
  8 | # This code is distributed under the terms of the GNU General Public
  9 | # License, Version 3. See the text file "COPYING" for further details
 10 | # about the terms of this license.
 11 | #
 12 | # This file contains the main BitCurator NLP application for 3D plotting.
 13 | 
 14 | from mpl_toolkits.mplot3d import Axes3D
 15 | import matplotlib.pyplot as plt
 16 | import numpy as np
 17 | import spacy
 18 | #import spacy.en
 19 | #from spacy.en import English
 20 | import textract
 21 | 
 22 | dict_ent = {}
 23 | dict_time = {}
 24 | dict_org = {}
 25 | dict_person = {}
 26 | dict_gpe = {}
 27 | dict_event = {}
 28 | dict_date = {}
 29 | dict_languages = {}
 30 | dict_facility = {}
 31 | dict_work_of_art = {}
 32 | dict_norp = {}
 33 | dict_loc = {}
 34 | 
 35 | def get_dict(ent_type):
 36 |     if ent_type == 'time':
 37 |         return "time", dict_time
 38 |     elif ent_type == 'org':
 39 |         return "org", dict_org
 40 |     elif ent_type == 'person':
 41 |         return "person", dict_person
 42 |     elif ent_type == 'gpe':
 43 |         return "gpe", dict_gpe
 44 |     elif ent_type == 'date':
 45 |         return "date", dict_date
 46 |     elif ent_type == 'languages':
 47 |         return "languages", dict_languages
 48 |     elif ent_type == 'facility':
 49 |         return "facility", dict_facility
 50 |     elif ent_type == 'work_of_art':
 51 |         return "work_of_art", dict_work_of_art
 52 |     elif ent_type == 'norp':
 53 |         return 'norp', dict_norp
 54 |     elif ent_type == 'loc':
 55 |         return "loc", dict_loc
 56 |     else:
 57 |         return None, None
 58 | 
 59 | 
 60 | from bn_filextract import *
 61 | 
 62 | from configobj import ConfigObj
 63 | # Dict holding the number of partitions in each image
 64 | partition_in = dict()
 65 | config_file = "config.txt" # FIXME: Remove the globalization
 66 | logging.basicConfig(filename='bcnlp.log', level=logging.DEBUG)
 67 | 
 68 | img_list = []
 69 | doc_list = []
 70 | entities_list = []
 71 | 
 72 | class ParseForEnts():
 73 |     """ Parses the given file(s) into entities and generates the span
 74 |         Input: text, entity_list
 75 |         Output: Span file(s)
 76 |         entity_list can be configured in the file bcnlp_config.txt.
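        Illustrative sketch of the expected section (the key names here
        are hypothetical examples, not copied from a shipped file):

            [entity_list_section]
            person = 1
            org = 1
            time = 0

        Keys set to 1 are enabled; see bnParseConfigFileForEnts() below.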
77 | """ 78 | def __init__(self): 79 | self.spans = [] 80 | fig = plt.figure() 81 | self.ax = fig.add_subplot(111, projection='3d') 82 | self.ax.set_xlabel('Image') 83 | self.ax.set_ylabel('Document') 84 | self.ax.set_zlabel('Entity') 85 | 86 | def getIdsForPlot(self, img, doc, entity): 87 | if img not in img_list: 88 | logging.info("Appending img to img_list : %s", img) 89 | img_list.append(img) 90 | # return the key as it already exists 91 | img_id = img_list.index(img) 92 | 93 | #logging.info("[V]ParseForEnts: getIdsForPlot: doc_list: %s ", doc_list) 94 | if doc not in doc_list: 95 | logging.info("Appending DOC to doc_list : %s", doc) 96 | doc_list.append(doc) 97 | else: 98 | logging.info("Doc %s already exists ",doc) 99 | 100 | # return the key as it already exists 101 | doc_id = doc_list.index(doc) 102 | 103 | if entity not in entities_list: 104 | entities_list.append(entity) 105 | # return the key as it already exists 106 | entity_id = entities_list.index(entity) 107 | 108 | logging.info("getIdsForPlot:ret img_id %d doc_id %d entity_id: %d ",\ 109 | img_id, doc_id, entity_id) 110 | 111 | return img_id, doc_id, entity_id 112 | 113 | def tagEnts(self, text, entity_list, nlp, img, doc): 114 | self.spacy_doc = nlp(text) 115 | logging.info("Spacy_doc Entities: \n") 116 | 117 | ''' 118 | for ent in self.spacy_doc.ents: 119 | logging.info("%s, %s, %s", ent.text, ent.label, ent.label_) 120 | ''' 121 | 122 | for j in entity_list: 123 | dict_ent[j] = 0 124 | 125 | logging.info("tagEnts: Entity list: %s", entity_list) 126 | for word in self.spacy_doc[:-1]: 127 | #logging.info("[V]Word: %s ent_type: %s ", \ 128 | #word, str(word.ent_type_)) 129 | 130 | start = word.i 131 | end = word.i + 1 132 | while end < len(self.spacy_doc) and self.spacy_doc[end].is_punct: 133 | end += 1 134 | self.span = self.spacy_doc[start : end] 135 | if word.ent_type_ in entity_list or \ 136 | (word.ent_type_).lower() in entity_list: 137 | #logging.info("tagEnts:Img:%s Doc:%s Entity: %s ent_type:%s ", \ 138 | #img, doc, word, word.ent_type_) 139 | 140 | x, y, z = self.getIdsForPlot(img, doc, word) 141 | self.plot3d(x, y, z) 142 | logging.info("[D]tagEnts: ent_type %s is in entity_list ", \ 143 | word.ent_type_) 144 | end_char = "end: "+str(self.span.end_char) 145 | start_char = "start: "+str(self.span.start_char) 146 | ent_type = "type: "+word.ent_type_ 147 | self.spans.append((end_char, start_char, ent_type)) 148 | logging.debug("[D]tagEnts: Appended %s, New SPANS: %s ", \ 149 | word, self.spans) 150 | 151 | # For generating histogram, a new dictionary is created for 152 | # each entity. First time the value is initialized to 1. 153 | # It is appended for subsequent words 154 | edict_name, edict = get_dict(word.ent_type_.lower()) 155 | 156 | if edict != None: 157 | if str(word) in edict: 158 | edict[str(word)] += 1 159 | else: 160 | edict[str(word)] = 1 161 | 162 | dict_ent[str(word.ent_type_.lower())] += 1 163 | 164 | ''' 165 | # Note: This is commented out to reduce noice in the log file. 166 | else: 167 | logging.debug("ent_type %s for word %s is NOT in entity_list", 168 | word.ent_type_, word) 169 | ''' 170 | 171 | return self.spans, dict_ent 172 | def plot3d(self, x, y, z): 173 | self.ax.scatter(x, y, z, c='r', marker='.') 174 | 175 | 176 | def extractContents(self, infile): 177 | """ If infile is not in text format, it uses textract api to extract 178 | text out of the given file. 
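        A minimal sketch of the conversion done below (the path is a
        hypothetical example): text = textract.process('/tmp/report.pdf')
        returns the extracted text as a byte string, which this method
        then decodes with unicode(text, 'utf-8') before spaCy parses it.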
179 |         """
180 |         if infile.endswith('.span'):
181 |             return None
182 |         if not infile.endswith('.txt'):
183 |             print("infile {} doesn't end with .txt. So textracting".format(infile))
184 | 
185 |             '''
186 |             # Note: This is just in case we want to see the conversion
187 |             # copied to a file
188 |             filename, file_ext = os.path.splitext(infile)
189 |             print("Filename: {}, ext: {}".format(filename, file_ext))
190 | 
191 |             new_infile = replace_suffix(infile, file_ext, 'txt')
192 |             print "new_infile: ", new_infile
193 | 
194 |             f = codecs.open(new_infile, "r", "utf-8")
195 |             input_file_contents = f.read()
196 | 
197 |             '''
198 |             filename, file_ext = os.path.splitext(infile)
199 |             try:
200 |                 text = textract.process(infile)
201 |             except:
202 |                 print("Textract probably does not support extension ", file_ext)
203 |                 return None
204 | 
205 |             # nlp expects a unicode text string.
206 |             input_file_contents = unicode(text, 'utf-8')
207 | 
208 |         else:
209 |             print "Extracting Contents of file", infile
210 |             f = codecs.open(infile, "r", "utf-8")
211 |             try:
212 |                 input_file_contents = f.read()
213 |             except:
214 |                 print "Error reading file ", infile
215 |                 return None
216 | 
217 |         return input_file_contents
218 | 
219 |     def bnParseConfigFileForEnts(self, filename):
220 |         """ Parses the configuration file bcnlp_config.txt to
221 |             extract the configured entity list.
222 |         """
223 |         config = ConfigObj(filename)
224 |         entity_list_section = config['entity_list_section']
225 |         cfg_entity_list = []
226 |         for key in entity_list_section:
227 |             #logging.debug("Cfg: Key: %s %s ", key, entity_list_section[key])
228 |             flag = int(entity_list_section[key])
229 |             if flag == 1:
230 |                 #logging.debug("Cfg: bnParseConfigFile: Appending key %s: ", key)
231 |                 cfg_entity_list.append(key)
232 |         return cfg_entity_list
233 | 
234 |     def bcnlpProcessDir(self, infile, nlp, bg):
235 |         """ Recursively calls itself till it finds a file which is not a
236 |             directory, to process the file contents. The nlp model is
237 |             threaded through so tagEnts can parse the text.
238 |         """
239 |         for f in os.listdir(infile):
240 |             f_path = infile + '/' + f
241 |             print "\n>> Processing file ", f_path
242 |             logging.debug("bcnlpProcessDir: Processing file %s ", f_path)
243 |             if os.path.isdir(f_path):
244 |                 self.bcnlpProcessDir(f_path, nlp, bg)
245 |             else:
246 |                 # It is a file
247 |                 logging.debug(">>>> Processing single file %s ", f_path)
248 |                 self.bcnlpProcessSingleFile(f_path, nlp, bg)
249 | 
250 |     def bcnlpProcessText(self, img, doc, text, entity_list, parse_en, bg=False):
251 |         logging.info("ProcessText: img: %s doc: %s", img, doc)
252 |         spans, dict_ents = self.tagEnts(text, entity_list, parse_en, img, doc)
253 |         #logging.debug("const ents = %s", entity_list)
254 | 
255 | 
256 |     def bcnlpProcessSingleFile(self, infile, nlp, bg=False):
257 |         """ Given a file, it extracts the contents and calls tagEnts to
258 |             create the spans for the entities given in the config file.
259 |         """
260 |         outfile = infile + '.span'
261 | 
262 |         # Get the entity list from the config file:
263 |         entity_list = self.bnParseConfigFileForEnts("bcnlp_config.txt")
264 |         logging.debug("infile:{}, outfile:{}".format(infile, outfile))
265 |         logging.debug("Entity List:%s: ", str(entity_list))
266 | 
267 |         text = self.extractContents(infile)
268 | 
269 |         if text is None:
270 |             print("textract returned None for file ", infile)
271 |             return
272 |         spans, dict_ents = self.tagEnts(text, entity_list, nlp, img=None, doc=None)
273 |         '''
274 |         # NOTE: just for debugging purposes. Produces a lot of log output.
275 |         logging.debug("const text = %s", text)
276 |         logging.debug("const spans = %s", str(spans))
277 |         logging.debug("const ents = %s", entity_list)
278 |         '''
279 | 
280 |         if not os.path.exists(outfile):
281 |             logging.debug('writing spans to outfile %s ', outfile)
282 |             with open(outfile, "w") as of:
283 |                 text_line = ("const text = '" + text + "'")
284 |                 try:
285 |                     of.write(text_line.encode('utf8'))
286 |                 except UnicodeEncodeError as e:
287 |                     print "Unicode Error({0}) ".format(e)
288 |                     print (" ### Error in writing: ", infile)
289 |                     return
290 |                 span_line = str(spans).replace('(', '{')
291 |                 span_line = span_line.replace(')', '}')
292 |                 span_line = unicode("const spans = " + span_line, 'utf-8')
293 |                 of.write("%s\n" % span_line)
294 |                 ent_line = unicode("const ents = " + str(entity_list), 'utf-8')
295 |                 of.write("%s\n" % ent_line)
296 |         else:
297 |             print("Outfile {} exists. So skipping".format(outfile))
298 | 
299 |         print("\n")
300 |         print ">> Wrote span info to output file ", outfile
301 | 
302 | cfg_image = {}
303 | def bn_parse_config_file(config_file, section_name):
304 |     print "bn_parse_config_file: Section: ", section_name, config_file
305 |     config = ConfigObj(config_file)
306 |     section = config[section_name]
307 |     i = 0
308 |     cfg_entity_list = []
309 |     for key in section:
310 |         #if key == cfg_string:
311 |             # found the string
312 |             #return section[key]
313 |         print "key: ", key
314 |         if section_name == "image_section":
315 |             print (key, section[key])
316 |             cfg_image[i] = key
317 |             i += 1
318 |         elif section_name == "entity_list_section":
319 |             flag = int(section[key])
320 |             if flag == 1:
321 |                 cfg_entity_list.append(key)
322 |     if section_name == "entity_list_section":
323 |         return cfg_entity_list
324 |     #print "IMAGES: ", cfg_image
325 | 
326 | 
327 | if __name__ == "__main__":
328 | 
329 |     #parse_en = English()
330 |     nlp = spacy.load('en')
331 |     config_file = "config.txt"
332 |     #bn_parse_config_file(config_file)
333 |     bn_parse_config_file(config_file, "image_section")
334 |     #bn = bn_filextract.bcnlp()
335 |     bn = BnFilextract()
336 | 
337 |     # for each image, extract the files and convert the convertible
338 |     # formats to text format
339 |     i = 0  # image index
340 |     ent = ParseForEnts()
341 | 
342 |     # Find the excluded formats from config file.
343 |     bn.exc_fmt_list = bn.bnGetExFmtsFromConfigFile(config_file)
344 |     print("Excluded formats in config file: ", bn.exc_fmt_list)
345 | 
346 |     for img in cfg_image:
347 |         print "Extracting files from image ", cfg_image[img]
348 |         bn.bnExtractFiles(ent, cfg_image[img], i, nlp, config_file)
349 |         i += 1
350 | 
351 |     #entity_list = ent.bnParseConfigFileForEnts("bn_config.txt")
352 |     # Now traverse the directory and generate entities, etc.
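    # (bnExtractFiles above stages each image's files under
    # <file_staging_directory>/<image index>; the loop below joins the
    # staging directory from the config file with the same running index.)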
353 |     file_extract_dir = bn.bnGetConfigInfo(config_file, \
354 |                             "confset_section", "file_staging_directory")
355 | 
356 |     i = 0
357 |     for img in cfg_image:
358 |         new_file_extract_dir = os.path.join(file_extract_dir, str(i))
359 |         bn.bnTraverseDirForPlot(img, new_file_extract_dir, \
360 |                             ent, nlp, config_file)
361 |         i += 1
362 | 
363 |     print(">> Plotting the results ")
364 | 
365 |     plt.show()
366 | 
367 |     '''
368 |     fig = plt.figure()
369 |     ax = fig.add_subplot(111, projection='3d')
370 | 
371 |     xs =
372 |     ys =
373 |     zz = randrange(n, 0, 100)
374 |     '''
375 | 
376 | 
377 | 
--------------------------------------------------------------------------------
/attic/old-config.txt:
--------------------------------------------------------------------------------
 1 | #
 2 | # bitcurator-nlp-gentm config file
 3 | #
 4 | 
 5 | # Disk images to process (the default location can be changed in the following section)
 6 | [image_section]
 7 | govdocs45sampler.E01 = 1
 8 | 
 9 | # Configuration settings. Where to find disk images and store intermediary files.
10 | [confset_section]
11 | disk_image_dir = "disk-images"
12 | file_staging_directory = "filextract_dir"
13 | nlp_dir = "bcnlp"
14 | spacy_outfile = "spacy_outfile"
15 | entity_info = "No"
16 | num_iterations = 200
17 | exclude_words = "false", "true", "other", "new", "old", 'can', 'will', 'may', 'also', 'shall', 'even'
18 | 
19 | # Formats to exclude when extracting text using textract
20 | [exclude_format_section]
21 | .jpg=1
22 | .JPG=1
23 | .mp3=1
24 | .wav=1
--------------------------------------------------------------------------------
/attic/old_graphlab_ref_code/bcnlp_tm.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | # coding=UTF-8
  3 | #
  4 | # BitCurator NLP (Disk Image Access for the Web)
  5 | # Copyright (C) 2014 - 2016
  6 | # All rights reserved.
  7 | #
  8 | # This code is distributed under the terms of the GNU General Public
  9 | # License, Version 3. See the text file "COPYING" for further details
 10 | # about the terms of this license.
 11 | #
 12 | # This file contains the main BitCurator NLP application for topic modeling
 13 | 
 14 | # Usage: python bcnlp_tm.py [--topics <10>] [--tm <gensim|graphlab>]
 15 | # Default num_topics = 10, tm=graphlab
 16 | 
 17 | import os
 18 | import logging
 19 | import pyLDAvis
 20 | import pyLDAvis.gensim
 21 | import pyLDAvis.graphlab
 22 | import graphlab as gl
 23 | from gensim import corpora, models, similarities
 24 | import gensim
 25 | import textract
 26 | from bn_filextract import *
 27 | from configobj import ConfigObj
 28 | from stop_words import get_stop_words
 29 | 
 30 | try:
 31 |     from argparse import ArgumentParser
 32 | except ImportError:
 33 |     raise ImportError("This script requires ArgumentParser, which is in Python 2.7 or Python 3.0")
 34 | 
 35 | #logging.basicConfig(filename= 'bcnlp_tm.log', level=logging.DEBUG)
 36 | logging.basicConfig(filename='bcnlp_tm_info.log', level=logging.INFO)  # NOTE: only this first call takes effect; the two below are no-ops
 37 | logging.basicConfig(filename='bcnlp_tm_debug.log', level=logging.DEBUG)
 38 | logging.basicConfig(filename='bcnlp_tm_warning.log', level=logging.WARNING)
 39 | 
 40 | 
 41 | cfg_image = {}
 42 | #documents = []
 43 | 
 44 | class BnTopicModel():
 45 | 
 46 |     def tm_generate_gensim(self, infile, num_topics, config_file):
 47 |         ''' Using the APIs provided by gensim, the pyLDAvis GUI is invoked.
 48 |             NOTE: This is not yet well tested.
 49 |         '''
 50 |         documents = []
 51 |         documents = bn.bnTraverseInfileDir(infile, documents, config_file)
 52 |         if documents == []:
 53 |             print("Documents are empty")
 54 | 
 55 |         # remove common words and tokenize
 56 |         '''
 57 |         stoplist = set('a an the of to for s from is and this \
 58 |                         was were are , - | @ . '.split())
 59 |         texts = [[word for word in document.lower().split() \
 60 |                     if word not in stoplist] \
 61 |                     for document in documents]
 62 |         '''
 63 | 
 64 |         en_stop = get_stop_words('en')
 65 |         logging.info("Stop-words list: %s ", en_stop)
 66 |         texts = [[word for word in document.lower().split() \
 67 |                     if word not in en_stop] \
 68 |                     for document in documents]
 69 | 
 70 | 
 71 |         # remove words that appear only once
 72 |         from collections import defaultdict
 73 |         frequency = defaultdict(int)
 74 |         for text in texts:
 75 |             for token in text:
 76 |                 frequency[token] += 1
 77 | 
 78 |         texts = [[token for token in text if frequency[token] > 1]
 79 |                     for text in texts]
 80 | 
 81 |         texts = [[token for token in text if len(token) > 2]
 82 |                     for text in texts]
 83 | 
 84 |         # NOTE: lemmatize not working
 85 |         ###texts = gensim.utils.lemmatize(texts)
 86 | 
 87 |         dictionary = corpora.Dictionary(texts)
 88 | 
 89 |         ##logging.info("[V]: token:id: %s", dictionary.token2id)
 90 | 
 91 |         ## dictionary.compactify()
 92 |         dictionary.save('/tmp/saved_dict.dict')
 93 | 
 94 |         # Now convert tokenized documents to vectors:
 95 |         corpus = [dictionary.doc2bow(text) for text in texts]
 96 | 
 97 |         ## logging.info("[V] Corpus: %s ", corpus)
 98 | 
 99 |         # store to disk, for later use
100 |         corpora.MmCorpus.serialize('/tmp/saved_dict.mm', corpus)
101 | 
102 |         ## Creating Transformations
103 |         ## The transformations are standard Python objects, typically
104 |         ## initialized (trained) by means of a training corpus.
105 |         ## First, let's use tf-idf for training: it simply involves
106 |         ## going through the supplied corpus once and computing document
107 |         ## frequencies of all its features.
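        ## Illustration (hypothetical values): doc2bow produces sparse
        ## (token_id, count) pairs such as [(0, 2), (3, 1)]; indexing the
        ## trained model below maps them to weights such as
        ## [(0, 0.84), (3, 0.54)], down-weighting corpus-wide common tokens.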
108 | 109 | tfidf = models.TfidfModel(corpus) # step 1 -- initialize a model 110 | 111 | corpus_tfidf = tfidf[corpus] 112 | corpora.MmCorpus.serialize('/tmp/saved_corpus_tfidf.mm', corpus_tfidf) 113 | 114 | ''' 115 | # LSI model is commented out for now 116 | print "Printing TFIDF of given corpus \n" 117 | for doc in corpus_tfidf: 118 | print (doc) 119 | 120 | # Now Initialize an LSI transformation: num_topics set to 2 to make 121 | # it 2D lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, 122 | # num_topics=3) 123 | 124 | # create a double wrapper over the original corpus: 125 | # bow->tfidf->fold-in-lsi 126 | corpus_lsi = lsi[corpus_tfidf] 127 | 128 | print "Printing LSI topics" 129 | lsi.print_topics(4) 130 | 131 | for doc in corpus_lsi: 132 | print (doc) 133 | ''' 134 | 135 | # Create an LDA model 136 | ''' 137 | lda_model = models.LdaModel(corpus_tfidf, \ 138 | id2word=dictionary, \ 139 | num_topics=5) 140 | ''' 141 | lda_model = models.ldamodel.LdaModel(corpus=corpus, \ 142 | id2word=dictionary, \ 143 | num_topics=num_topics) 144 | corpus_lda = lda_model[corpus] 145 | 146 | corpus_lda_tfidf = lda_model[corpus_tfidf] 147 | 148 | # The following will print the topics in the logfile 149 | logging.info("Printing %s topics into log file: ", str(num_topics)) 150 | lda_model.print_topics(num_topics) 151 | 152 | # Generate data for the pyLDAvis interface from the lda_model above 153 | vis_data = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary) 154 | ##vis_data = pyLDAvis.gensim.prepare(lda_model, corpus_lda, dictionary) 155 | 156 | #pyLDAvis.display(vis_data) 157 | pyLDAvis.show(vis_data) 158 | 159 | def tm_generate_graphlab(self, indir, num_topics, config_file): 160 | ''' Generate the LDA model for documents in indir, using graphlab 161 | ''' 162 | indir_path = os.path.join(os.getcwd(), indir) 163 | print(">> Graphlab: Creating SArray for files in ", indir) 164 | sa = self.bnGenerateSArray(indir, config_file) 165 | 166 | sa_docs = gl.text_analytics.count_words(sa) 167 | sa_docs_nsw = sa_docs.dict_trim_by_keys(gl.text_analytics.stopwords(), \ 168 | True) 169 | 170 | num_iterations = bn.bnGetConfigInfo(config_file, \ 171 | "confset_section", "num_iterations") 172 | 173 | print(">> Graphlab: Creating topic model with {} topics: ".\ 174 | format(num_topics)) 175 | topic_model = gl.topic_model.create(sa_docs_nsw, \ 176 | num_topics=int(num_topics), \ 177 | num_iterations=int(num_iterations)) 178 | 179 | print(">> Graphlab: Preparing data: ") 180 | vis_data = pyLDAvis.graphlab.prepare(topic_model, sa_docs_nsw) 181 | 182 | print(">> Graphlab: Launching graphics ") 183 | pyLDAvis.show(vis_data) 184 | 185 | def remove_punctuation(self, text): 186 | import string 187 | return text.translate(None, string.punctuation) 188 | 189 | def remove_digits(self, text): 190 | import string 191 | return text.translate(None, string.digits) 192 | 193 | def bnGenerateSArray(self, filextract_dir, config_file): 194 | ''' Traverse through the files in a directory and create sArrays 195 | and append them into one single sArray. 196 | ''' 197 | fname = sys._getframe().f_code.co_name 198 | num_docs = 0 199 | sa_g = gl.SArray(dtype = str) 200 | sw_list = ['a', 'an', 'the', 'of', 'to', 'for','as', 'from', 'is', \ 201 | 'was', 'were', 'are', ',', '-', '|', '@', '.' 
]
202 |         for root, dirs, files in os.walk(filextract_dir):
203 |             path = root.split(os.sep)
204 | 
205 |             '''
206 |             print "path: ", path, len(path)
207 |             print "dirs: ", dirs
208 |             print "files: ", files
209 |             print((len(path) - 1) * '---', os.path.basename(root))
210 |             '''
211 | 
212 |             # if no files, continue to next level
213 |             if files == []:
214 |                 continue
215 | 
216 |             for filename in files:
217 |                 file_path = '/'.join(path) + '/' + filename
218 | 
219 |                 bn = BnFilextract()
220 |                 if os.stat(file_path).st_size == 0:
221 |                     logging.info(">>>> File %s is empty. Skip it ", file_path)
222 |                     continue
223 | 
224 |                 if bn.isFileTextractable(filename, config_file):
225 |                     try:
226 |                         input_file_contents = textract.process(file_path)
227 |                         logging.info("Textracted %s ", file_path)
228 |                         if len(input_file_contents) == 0:
229 |                             logging.info(">>>> File %s is empty. Skip it ", file_path)
230 |                             continue
231 |                     except:
232 |                         logging.info("Textract failed for file %s ", filename)
233 |                         continue
234 | 
235 |                     input_file_contents = self.remove_punctuation(input_file_contents)
236 |                     input_file_contents = self.remove_digits(input_file_contents)
237 |                     file_path = os.path.splitext(file_path)[0] + '.txt'
238 |                     logging.info("%s: writing contents to outfile:%s ",
239 |                                  fname, file_path)
240 |                 else:
241 |                     logging.info("File %s is NOT textractable ", filename)
242 |                     continue
243 | 
244 |                 with open(file_path, "w") as text_file:
245 |                     text_file.write(input_file_contents)
246 | 
247 |                 logging.info(">>> Getting SArray for file %s ", file_path)
248 |                 sa_sub = gl.SArray(file_path)
249 |                 sa_sub = gl.text_analytics.trim_rare_words(sa_sub, \
250 |                                 threshold=2, stopwords=sw_list)  # trim_rare_words returns a new SArray
251 |                 # Now append the sub-sarray to the main one.
252 |                 if num_docs == 0:
253 |                     sa_g = sa_sub
254 |                 else:
255 |                     sa_g = sa_g.append(sa_sub)
256 |                 num_docs += 1
257 | 
258 |         logging.info("%s: Total num docs: %d ", fname, num_docs)
259 |         return sa_g
260 | 
261 |     def bnRemoveEmptyFiles(self, path):
262 |         ''' Traverses the directory and recursively removes empty files.
263 |         '''
264 |         files = os.listdir(path)
265 |         if len(files):
266 |             for fl in files:
267 |                 fullpath = os.path.join(path, fl)
268 |                 if os.path.isdir(fullpath):
269 |                     self.bnRemoveEmptyFiles(fullpath)
270 |                 if os.stat(fullpath).st_size == 0:
271 |                     logging.info("Removing file %s ", fullpath)
272 |                     os.remove(fullpath)
273 | 
274 | def bn_parse_config_file(config_file, section_name):
275 |     ''' Parses the config file to extract the image names and entity list.
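        Illustrative entries for the two sections read here (the image
        name is a hypothetical example):
            [image_section]
            myimage.E01 = 1
            [entity_list_section]
            person = 1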
276 |     '''
277 |     logging.info("bn_parse_config_file: Section: %s ", section_name)
278 |     config = ConfigObj(config_file)
279 |     section = config[section_name]
280 |     i = 0
281 |     cfg_entity_list = []
282 |     for key in section:
283 |         #if key == cfg_string:
284 |             # found the string
285 |             #return section[key]
286 |         if section_name == "image_section":
287 |             logging.info("parse_config: key: %s, section: %s", \
288 |                          key, section[key])
289 |             cfg_image[i] = key
290 |             i += 1
291 |         elif section_name == "entity_list_section":
292 |             flag = int(section[key])
293 |             if flag == 1:
294 |                 cfg_entity_list.append(key)
295 |     if section_name == "entity_list_section":
296 |         return cfg_entity_list
297 | 
298 | if __name__ == "__main__":
299 |     parser = ArgumentParser(prog='bcnlp_tm.py', description='Topic modeling')
300 |     parser.add_argument('--config', action='store', \
301 |                         help="Config file [bntm_config.txt] ")
302 |     parser.add_argument('--infile', action='store', help="input directory ")
303 |     parser.add_argument('--tm', action='store', \
304 |                         help="topic modeling: gensim/graphlab ")
305 |     parser.add_argument('--topics', action='store', help="number of topics ")
306 | 
307 |     args = parser.parse_args()
308 | 
309 |     # Infile specifies the directory of files to run the topic modeling on.
310 |     # If no argument is specified, it will assume there are disk images specified
311 |     # in the config file bntm_config.txt.
312 | 
313 |     infile = args.infile
314 |     tm = args.tm    # Topic modeling type: gensim/graphlab
315 |     config_file = args.config
316 |     is_disk_image = False
317 | 
318 |     num_topics = 10
319 |     if args.topics:
320 |         num_topics = int(args.topics)  # LdaModel expects an integer topic count
321 | 
322 |     # default it to graphlab
323 |     if tm is None:
324 |         tm = 'graphlab'
325 | 
326 |     if config_file is None:
327 |         config_file = "bntm_config.txt"
328 | 
329 |     bn = BnFilextract()
330 |     if infile is None:
331 |         is_disk_image = True
332 | 
333 |         bn_parse_config_file(config_file, "image_section")
334 |         print(">> Images in the config file: ", cfg_image)
335 | 
336 |         infile = bn.bnGetConfigInfo(config_file, \
337 |                         "confset_section", "file_staging_directory")
338 | 
339 |         i = 0
340 |         for img in cfg_image:
341 |             print(">> Extracting files from image {}...".format(cfg_image[img]))
342 |             bn.bnExtractFiles(None, cfg_image[img], i, None, config_file)
343 |             i += 1
344 |         print(">> ... Done ")
Done ") 345 | 346 | else: 347 | print(">> Extracting files from ", infile) 348 | bn.bnTraverseInfileDir(infile, documents, config_file) 349 | 350 | tmc = BnTopicModel() 351 | if tm == 'gensim': 352 | tmc.tm_generate_gensim(infile, num_topics, config_file) 353 | elif tm == 'graphlab': 354 | if is_disk_image: 355 | indir = bn.bnGetOutDirFromConfig(config_file) 356 | print(">> Generating graphlab for images in disk image") 357 | logging.info(">> Generating graphlab for images in disk image") 358 | logging.info("File-extracted directory: %s ", indir) 359 | tmc.tm_generate_graphlab(indir, num_topics, config_file) 360 | else: 361 | print(">> Generating graphlab for files in ", infile) 362 | logging.info(">> Generating graphlab for files in %s", infile) 363 | tmc.tm_generate_graphlab(infile, num_topics, config_file) 364 | 365 | 366 | -------------------------------------------------------------------------------- /attic/provision/bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # bootstrap.sh: Build and configuration script for nlp-webtools in Vagrant 4 | # -------------------------------------------------------------------------------------- 5 | # This script is only the *first time* you issue the command: 6 | # 7 | # vagrant up 8 | # 9 | # Or, following the commands: 10 | # 11 | # (vagrant halt) 12 | # vagrant destroy 13 | # vagrant up 14 | # 15 | #=============================================================================== 16 | # vim: softtabstop=4 shiftwidth=4 expandtab fenc=utf-8 spell spelllang=en cc=81 17 | #=============================================================================== 18 | # 19 | # Script Version 20 | __ScriptVersion="0.1" 21 | # Base directory for build log 22 | LOG_BASE=/var/log 23 | WWW_ROOT=/var/www 24 | 25 | #--- FUNCTION ---------------------------------------------------------------- 26 | # NAME: __function_defined 27 | # DESCRIPTION: Checks if a function is defined within this scripts scope 28 | # PARAMETERS: function name 29 | # RETURNS: 0 or 1 as in defined or not defined 30 | #------------------------------------------------------------------------------- 31 | __function_defined() { 32 | FUNC_NAME=$1 33 | if [ "$(command -v $FUNC_NAME)x" != "x" ]; then 34 | echoinfo "Found function $FUNC_NAME" 35 | return 0 36 | fi 37 | 38 | echodebug "$FUNC_NAME not found...." 39 | return 1 40 | } 41 | 42 | #--- FUNCTION ---------------------------------------------------------------- 43 | # NAME: __strip_duplicates 44 | # DESCRIPTION: Strip duplicate strings 45 | #------------------------------------------------------------------------------- 46 | __strip_duplicates() { 47 | echo "$@" | tr -s '[:space:]' '\n' | awk '!x[$0]++' 48 | } 49 | 50 | #--- FUNCTION ---------------------------------------------------------------- 51 | # NAME: echoerr 52 | # DESCRIPTION: Echo errors to stderr. 53 | #------------------------------------------------------------------------------- 54 | echoerror() { 55 | printf "%s * ERROR%s: %s\n" "${RC}" "${EC}" "$@" 1>&2; 56 | } 57 | 58 | #--- FUNCTION ---------------------------------------------------------------- 59 | # NAME: echoinfo 60 | # DESCRIPTION: Echo information to stdout. 
 61 | #-------------------------------------------------------------------------------
 62 | echoinfo() {
 63 |     printf "%s * STATUS%s: %s\n" "${GC}" "${EC}" "$@";
 64 | }
 65 | 
 66 | #--- FUNCTION ----------------------------------------------------------------
 67 | # NAME: echowarn
 68 | # DESCRIPTION: Echo warning information to stdout.
 69 | #-------------------------------------------------------------------------------
 70 | echowarn() {
 71 |     printf "%s * WARN%s: %s\n" "${YC}" "${EC}" "$@";
 72 | }
 73 | 
 74 | #--- FUNCTION ----------------------------------------------------------------
 75 | # NAME: echodebug
 76 | # DESCRIPTION: Echo debug information to stdout.
 77 | #-------------------------------------------------------------------------------
 78 | echodebug() {
 79 |     if [ $_ECHO_DEBUG -eq $BS_TRUE ]; then
 80 |         printf "${BC} * DEBUG${EC}: %s\n" "$@";
 81 |     fi
 82 | }
 83 | #--- FUNCTION ----------------------------------------------------------------
 84 | # NAME: __apt_get_install_noinput
 85 | # DESCRIPTION: (DRY) apt-get install with noinput options
 86 | #-------------------------------------------------------------------------------
 87 | __apt_get_install_noinput() {
 88 |     apt-get install -y -o DPkg::Options::=--force-confold "$@"; return $?
 89 | }
 90 | 
 91 | #--- FUNCTION ----------------------------------------------------------------
 92 | # NAME: __apt_get_upgrade_noinput
 93 | # DESCRIPTION: (DRY) apt-get upgrade with noinput options
 94 | #-------------------------------------------------------------------------------
 95 | __apt_get_upgrade_noinput() {
 96 |     apt-get upgrade -y -o DPkg::Options::=--force-confold; return $?
 97 | }
 98 | 
 99 | #--- FUNCTION ----------------------------------------------------------------
100 | # NAME: __pip_install_noinput
101 | # DESCRIPTION: (DRY) pip3 install with upgrade
102 | #-------------------------------------------------------------------------------
103 | __pip_install_noinput() {
104 |     #pip install --upgrade "$@"; return $?
105 |     # Python 3 variant (pip3):
106 |     pip3 install --upgrade $@; return $?
107 | }
108 | 
109 | #--- FUNCTION ----------------------------------------------------------------
110 | # NAME: __pip_pre_install_noinput
111 | # DESCRIPTION: (DRY) pip3 install allowing pre-release versions
112 | #-------------------------------------------------------------------------------
113 | __pip_pre_install_noinput() {
114 |     #pip install --pre --upgrade "$@"; return $?
115 |     # Python 3 variant (pip3):
116 |     pip3 install --pre --upgrade $@; return $?
117 | }
118 | 
119 | 
120 | #--- FUNCTION ----------------------------------------------------------------
121 | # NAME: __check_apt_lock
122 | # DESCRIPTION: (DRY) Check whether the dpkg lock is held
123 | #-------------------------------------------------------------------------------
124 | __check_apt_lock() {
125 |     lsof /var/lib/dpkg/lock > /dev/null 2>&1
126 |     RES=`echo $?`
127 |     return $RES
128 | }
129 | 
130 | __enable_universe_repository() {
131 |     if [ "x$(grep -R universe /etc/apt/sources.list /etc/apt/sources.list.d/ | grep -v '#')" != "x" ]; then
132 |         # The universe repository is already enabled
133 |         return 0
134 |     fi
135 |     echodebug "Enabling the universe repository"
136 | 
137 |     # Ubuntu versions higher than 12.04 do not live in the old repositories
138 |     if [ $DISTRO_MAJOR_VERSION -gt 12 ] || ([ $DISTRO_MAJOR_VERSION -eq 12 ] && [ $DISTRO_MINOR_VERSION -gt 04 ]); then
139 |         add-apt-repository -y "deb http://archive.ubuntu.com/ubuntu $(lsb_release -sc) universe" || return 1
140 |     elif [ $DISTRO_MAJOR_VERSION -lt 11 ] && [ $DISTRO_MINOR_VERSION -lt 10 ]; then
141 |         # Below Ubuntu 11.10, the -y flag to add-apt-repository is not supported
142 |         add-apt-repository "deb http://old-releases.ubuntu.com/ubuntu $(lsb_release -sc) universe" || return 1
143 |     fi
144 | 
145 |     add-apt-repository -y "deb http://old-releases.ubuntu.com/ubuntu $(lsb_release -sc) universe" || return 1
146 | 
147 |     return 0
148 | }
149 | 
150 | __check_unparsed_options() {
151 |     shellopts="$1"
152 |     # grep alternative for SunOS
153 |     if [ -f /usr/xpg4/bin/grep ]; then
154 |         grep='/usr/xpg4/bin/grep'
155 |     else
156 |         grep='grep'
157 |     fi
158 |     unparsed_options=$( echo "$shellopts" | ${grep} -E '(^|[[:space:]])[-]+[[:alnum:]]' )
159 |     if [ "x$unparsed_options" != "x" ]; then
160 |         usage
161 |         echo
162 |         echoerror "options are only allowed before install arguments"
163 |         echo
164 |         exit 1
165 |     fi
166 | }
167 | 
168 | configure_cpan() {
169 |     (echo y;echo o conf prerequisites_policy follow;echo o conf commit)|cpan > /dev/null
170 | }
171 | 
172 | usage() {
173 |     echo "usage"
174 |     exit 1
175 | }
176 | 
177 | install_ubuntu_17.04_deps() {
178 | 
179 |     echoinfo "Updating your APT Repositories ... "
180 |     apt-get update >> $LOG_BASE/nlp-install.log 2>&1 || return 1
181 | 
182 |     echoinfo "Installing Python Software Properties ... "
183 |     __apt_get_install_noinput software-properties-common >> $LOG_BASE/nlp-install.log 2>&1 || return 1
184 | 
185 |     echoinfo "Enabling Universe Repository ... "
186 |     __enable_universe_repository >> $LOG_BASE/nlp-install.log 2>&1 || return 1
187 | 
188 |     echoinfo "Updating Repository Package List ..."
189 |     apt-get update >> $LOG_BASE/nlp-install.log 2>&1 || return 1
190 | 
191 |     echoinfo "Upgrading all packages to latest version ..."
192 |     __apt_get_upgrade_noinput >> $LOG_BASE/nlp-install.log 2>&1 || return 1
193 | 
194 |     return 0
195 | }
196 | 
197 | install_ubuntu_17.04_packages() {
198 |     packages="antiword
199 |         automake
200 |         dkms
201 |         ffmpeg
202 |         flac
203 |         g++-5
204 |         gcc-5
205 |         lame
206 |         libffi-dev
207 |         libjpeg-dev
208 |         libmad0
209 |         libpulse-dev
210 |         libsox-fmt-mp3
211 |         libtool
212 |         libxml2-dev
213 |         libxslt1-dev
214 |         poppler-utils
215 |         pstotext
216 |         python
217 |         python-dev
218 |         python-pip
219 |         python3-dev
220 |         python3-pip
221 |         sox
222 |         swig
223 |         swig3.0
224 |         tesseract-ocr
225 |         unrtf
226 |         virtualbox-guest-utils
227 |         virtualenv
228 |         virtualenvwrapper
229 |         zlib1g-dev"
230 | 
231 | 
232 |     if [ "$@" = "dev" ]; then
233 |         packages="$packages"
234 |     elif [ "$@" = "stable" ]; then
235 |         packages="$packages"
236 |     fi
237 | 
238 |     for PACKAGE in $packages; do
239 |         __apt_get_install_noinput $PACKAGE >> $LOG_BASE/nlp-install.log 2>&1
240 |         ERROR=$?
241 |         if [ $ERROR -ne 0 ]; then
242 |             echoerror "Install Failure: $PACKAGE (Error Code: $ERROR)"
243 |         else
244 |             echoinfo "Installed Package: $PACKAGE"
245 |         fi
246 |     done
247 | 
248 |     return 0
249 | }
250 | 
251 | install_ubuntu_17.04_pip_packages() {
252 | 
253 |     pip_packages="textract
254 |         gensim
255 |         pyLDAvis
256 |         configobj"
257 |     pip_special_packages="textacy"
258 | 
259 |     if [ "$@" = "dev" ]; then
260 |         pip_packages="$pip_packages"
261 |     elif [ "$@" = "stable" ]; then
262 |         pip_packages="$pip_packages"
263 |     fi
264 | 
265 |     ERROR=0
266 | 
267 |     for PACKAGE in $pip_packages; do
268 |         CURRENT_ERROR=0
269 |         echoinfo "Installing Python Package: $PACKAGE"
270 |         __pip_install_noinput $PACKAGE >> $LOG_BASE/nlp-install.log 2>&1 || (let ERROR=ERROR+1 && let CURRENT_ERROR=1)
271 |         if [ $CURRENT_ERROR -eq 1 ]; then
272 |             echoerror "Python Package Install Failure: $PACKAGE"
273 |         fi
274 |     done
275 | 
276 |     # Prep environment for special packages, install cld2-cffi
277 |     env CC=/usr/bin/gcc-5 pip3 install -U cld2-cffi
278 | 
279 |     for PACKAGE in $pip_special_packages; do
280 |         CURRENT_ERROR=0
281 |         echoinfo "Installing Python (special setup) Package: $PACKAGE"
282 |         __pip_pre_install_noinput $PACKAGE >> $LOG_BASE/nlp-install.log 2>&1 || (let ERROR=ERROR+1 && let CURRENT_ERROR=1)
283 |         if [ $CURRENT_ERROR -eq 1 ]; then
284 |             echoerror "Python Package Install Failure: $PACKAGE"
285 |         fi
286 |     done
287 | 
288 |     if [ $ERROR -ne 0 ]; then
289 |         echoerror
290 |         return 1
291 |     fi
292 | 
293 |     return 0
294 | }
295 | 
296 | 
297 | install_source_packages() {
298 | 
299 |     #echoinfo "nlp-webtools: Nothing to be installed currently. Continuing..."
300 |     # Install libuna from specific release
301 |     echoinfo "nlp-webtools: Building and installing libuna"
302 |     CDIR=$(pwd)
303 | 
304 |     # Newer versions break a lot of stuff. Keep 20170112 for now.
305 |     cd /tmp
306 |     wget -q https://github.com/libyal/libuna/releases/download/20170112/libuna-alpha-20170112.tar.gz
307 |     tar zxf libuna-alpha-20170112.tar.gz >> $HOME/nlp-install.log 2>&1
308 |     cd libuna-20170112
309 |     ./configure >> $HOME/nlp-install.log 2>&1
310 |     make -s >> $HOME/nlp-install.log 2>&1
311 |     make install >> $HOME/nlp-install.log 2>&1
312 |     ldconfig >> $HOME/nlp-install.log 2>&1
313 | 
314 |     # Now clean up
315 |     cd /tmp
316 |     rm -rf libuna-20170112
317 |     rm libuna-alpha-20170112.tar.gz
318 | 
319 |     # Install libewf from current sources
320 |     echoinfo "nlp-webtools: Building and installing libewf"
321 |     CDIR=$(pwd)
322 | 
323 |     # Newer versions break a lot of stuff. Keep 20140608 for now.
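    # (Note, an assumption rather than part of the original script: the
    # --enable-v1-api flag used in the configure step below builds libewf's
    # legacy version-1 API, which pytsk/dfvfs-era tooling generally expects.)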
324 | cd /tmp 325 | cp /vagrant/externals/libewf-20140608.tar.gz . 326 | tar zxf libewf-20140608.tar.gz >> $HOME/nlp-install.log 2>&1 327 | cd libewf-20140608 328 | ./configure --enable-python --enable-v1-api >> $HOME/nlp-install.log 2>&1 329 | make -s >> $HOME/nlp-install.log 2>&1 330 | make install >> $HOME/nlp-install.log 2>&1 331 | ldconfig >> $HOME/nlp-install.log 2>&1 332 | 333 | # Now clean up 334 | cd /tmp 335 | rm -rf libewf-20140608 336 | rm libewf-20140608.tar.gz 337 | 338 | echoinfo "nlp-webtools: Adding DFXML tools and libraries" 339 | CDIR=$(pwd) 340 | git clone https://github.com/simsong/dfxml /usr/share/dfxml >> $HOME/nlp-install.log 2>&1 341 | # No cleanup needed 342 | cd /tmp 343 | 344 | # Install The Sleuth Kit (TSK) from current sources 345 | echoinfo "nlp-webtools: Building and installing The Sleuth Kit" 346 | CDIR=$(pwd) 347 | git clone --recursive https://github.com/sleuthkit/sleuthkit /usr/share/sleuthkit >> $HOME/nlp-install.log 2>&1 348 | cd /usr/share/sleuthkit 349 | git fetch 350 | git checkout master >> $HOME/nlp-install.log 2>&1 351 | ./bootstrap >> $HOME/nlp-install.log 2>&1 352 | ./configure >> $HOME/nlp-install.log 2>&1 353 | make -s >> $HOME/nlp-install.log 2>&1 354 | make install >> $HOME/nlp-install.log 2>&1 355 | ldconfig >> $HOME/nlp-install.log 2>&1 356 | 357 | # Install PyTSK 358 | echoinfo "nlp-webtools: Building and installing PyTSK (Python bindings for TSK)" 359 | echoinfo " -- Please be patient. This may take several minutes..." 360 | CDIR=$(pwd) 361 | cd /tmp 362 | git clone https://github.com/py4n6/pytsk 363 | cd pytsk 364 | python setup.py update >> $HOME/nlp-install.log 2>&1 365 | python setup.py build >> $HOME/nlp-install.log 2>&1 366 | python setup.py install >> $HOME/nlp-install.log 2>&1 367 | # Now clean up 368 | cd /tmp 369 | #rm -rf pytsk3-20170508 370 | rm -rf pytsk 371 | 372 | } 373 | 374 | complete_message() { 375 | echo 376 | echo "Installation Complete!" 377 | echo 378 | } 379 | 380 | OS=$(lsb_release -si) 381 | ARCH=$(uname -m | sed 's/x86_//;s/i[3-6]86/32/') 382 | VER=$(lsb_release -sr) 383 | 384 | if [ $OS != "Ubuntu" ]; then 385 | echo "nlp-webtools is only installable on Ubuntu operating systems at this time." 386 | exit 1 387 | fi 388 | 389 | if [ $VER != "17.04" ]; then 390 | echo "nlp-webtools is only installable on Ubuntu 17.04 at this time." 391 | exit 3 392 | fi 393 | 394 | if [ "`whoami`" != "root" ]; then 395 | echoerror "The nlp-webtools bootstrap script must run as root." 396 | echoinfo "Preferred Usage: sudo bootstrap.sh (options)" 397 | echo "" 398 | exit 3 399 | fi 400 | 401 | if [ "$SUDO_USER" = "" ]; then 402 | echo "The SUDO_USER variable doesn't seem to be set" 403 | exit 4 404 | fi 405 | 406 | # while getopts ":hvcsiyu" opt 407 | while getopts ":hv" opt 408 | do 409 | case "${opt}" in 410 | h ) usage; exit 0 ;; 411 | v ) echo "$0 -- Version $__ScriptVersion"; exit 0 ;; 412 | \?) echo 413 | echoerror "Option does not exist: $OPTARG" 414 | usage 415 | exit 1 416 | ;; 417 | esac 418 | done 419 | 420 | shift $(($OPTIND-1)) 421 | 422 | if [ "$#" -eq 0 ]; then 423 | ITYPE="stable" 424 | else 425 | __check_unparsed_options "$*" 426 | ITYPE=$1 427 | shift 428 | fi 429 | 430 | # Check installation type 431 | if [ "$(echo $ITYPE | egrep '(dev|stable)')x" = "x" ]; then 432 | echoerror "Installation type \"$ITYPE\" is not known..." 
433 | exit 1 434 | fi 435 | 436 | echoinfo "****************************************************************" 437 | echoinfo "The nlp-webtools provisioning script will now configure your system." 438 | echoinfo "****************************************************************" 439 | echoinfo "" 440 | 441 | #if [ "$YESTOALL" -eq 1 ]; then 442 | # echoinfo "You supplied the -y option, this script will not exit for any reason" 443 | #fi 444 | 445 | echoinfo "OS: $OS" 446 | echoinfo "Arch: $ARCH" 447 | echoinfo "Version: $VER" 448 | echoinfo "The current user is: $SUDO_USER" 449 | 450 | export DEBIAN_FRONTEND=noninteractive 451 | install_ubuntu_${VER}_deps $ITYPE 452 | install_ubuntu_${VER}_packages $ITYPE 453 | install_ubuntu_${VER}_pip_packages $ITYPE 454 | install_source_packages 455 | 456 | complete_message 457 | -------------------------------------------------------------------------------- /bcnlp_fxtract.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # coding=UTF-8 3 | # 4 | # BitCurator NLP 5 | # Copyright (C) 2016-2018 6 | # All rights reserved. 7 | # 8 | # This code is distributed under the terms of the GNU General Public 9 | # License, Version 3. See the text file "COPYING" for further details 10 | # about the terms of this license. 11 | # 12 | # This file contains the File Extraction routines for BitCurator NLP. 13 | # 14 | 15 | import multiprocessing 16 | import os 17 | import errno 18 | import re 19 | import pytsk3 20 | import logging 21 | from dfvfs.resolver import resolver 22 | from dfvfs.path import path_spec 23 | from dfvfs.resolver import context 24 | from bcnlp_listfiles import FileEntryLister 25 | from dfvfs.path import factory as path_spec_factory 26 | from dfvfs.lib import definitions 27 | 28 | import argparse 29 | try: 30 | from argparse import ArgumentParser 31 | except ImportError: 32 | raise ImportError("This script requires ArgumentParser which is in Python 2.7 or Python 3.0") 33 | 34 | 35 | class FileExtractor(multiprocessing.Process): 36 | def __init__(self,fs_path_spec,output_path): 37 | super(FileExtractor, self).__init__() 38 | self._READ_BUFFER_SIZE = 32768 39 | 40 | self.file_queue = multiprocessing.Queue() 41 | 42 | self.fs_path_spec = fs_path_spec 43 | self.output_path = output_path 44 | 45 | def run(self): 46 | p = os.getpid() 47 | print(u"Running File Extractor: (PID {})".format(p)) 48 | 49 | # We have to open the filesystem from within the process, cannot 50 | # be passed 51 | resolver_context = context.Context() 52 | file_system = resolver.Resolver.OpenFileSystem( 53 | self.fs_path_spec, 54 | resolver_context=resolver_context 55 | ) 56 | 57 | # Read from the queue # 58 | while True: 59 | file_item = self.file_queue.get() 60 | if isinstance(file_item,unicode): 61 | if (file_item == u'TERMINATE'): 62 | break 63 | elif isinstance(file_item,ExtractionInfo): 64 | # Get dfvfs file entry from our path_spec # 65 | outpath_stack = list(os.path.split( 66 | self.output_path 67 | )) 68 | 69 | # Get dfvfs entry # 70 | file_entry = file_system.GetFileEntryByPathSpec( 71 | file_item.path_spec 72 | ) 73 | 74 | ads_name = self._GetStreamName( 75 | file_item.full_path 76 | ) 77 | 78 | # Export files based off of file_entry # 79 | self._ExportFiles( 80 | file_system, 81 | file_entry, 82 | outpath_stack, 83 | specified_ads_name=ads_name 84 | ) 85 | else: 86 | print(u"Item type unhandled for type: {}; {}".format( 87 | unicode(type(file_item)), 88 | unicode(file_item) 89 | )) 90 | 91 | file_system.Close() 92 | 
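        # Note: the loop above drains the queue until the u'TERMINATE'
        # sentinel queued by Finish() arrives; the file system handle is
        # closed only after every queued ExtractionInfo has been exported.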
93 |         print(u"Ending File Extractor: (PID {})".format(
94 |             os.getpid()
95 |         ))
96 | 
97 |     def _GetStreamName(self,full_path):
98 |         ads_name = None
99 | 
100 |         if ':' in full_path:
101 |             ads_name = full_path.split(':')[1]
102 | 
103 |         return ads_name
104 | 
105 |     def _GetExportFilename(self, outpath_stack, filename, ads_name=None):
106 |         """Create an export filename"""
107 |         export_filename = u''
108 |         # os.path.sep (not os.pathsep) is the directory separator to use here
109 |         export_path = os.path.sep.join(outpath_stack)
110 | 
111 |         name = os.path.basename(filename)
112 |         if ads_name:
113 |             name = u'{}.{}'.format(name,ads_name)
114 |         export_filename = os.path.join(export_path, name)
115 | 
116 |         return export_filename
117 | 
118 |     def _ExportFiles(self, file_system, file_entry, outpath_stack, specified_ads_name=None):
119 |         # Export if file_entry is a file #
120 |         if file_entry.IsFile():
121 |             for ads in file_entry._GetDataStreams():
122 |                 full_path = file_entry.path_spec.location
123 | 
124 |                 if specified_ads_name:
125 |                     # Only extract out this ads because it was specified #
126 |                     if specified_ads_name != ads.name:
127 |                         continue
128 | 
129 |                     # (the stream name is appended to full_path below)
130 | 
131 |                 if len(ads.name) > 0:
132 |                     ads_name = ads.name
133 |                     full_path = u'{}:{}'.format(full_path, ads.name)
134 |                 else:
135 |                     ads_name = None
136 | 
137 |                 export_name = self._GetExportFilename(
138 |                     outpath_stack,
139 |                     file_entry.name,
140 |                     ads_name=ads_name
141 |                 )
142 | 
143 |                 result = self._ExportFile(
144 |                     file_entry,
145 |                     export_name,
146 |                     ads_name
147 |                 )
148 |                 if result:
149 |                     #print(u"Exported {} to {}".format(full_path, export_name))
150 |                     logging.info(u"Exported %s to %s",full_path, export_name)
151 |                 else:
152 |                     print(u"{} Not Exported to {}".format(full_path, export_name))
153 |         elif file_entry.IsDirectory():
154 |             for sub_file_entry in file_entry.sub_file_entries:
155 |                 outpath_stack.append(file_entry.name)
156 |                 self._ExportFiles(
157 |                     file_system,
158 |                     sub_file_entry,
159 |                     outpath_stack
160 |                 )
161 |                 outpath_stack.pop()
162 | 
163 |     def _ExportFile(self, file_entry, export_filename, ads_name):
164 |         """Export a file"""
165 |         _offset = None
166 | 
167 |         # Outfile #
168 |         # Check that path exists #
169 |         export_path = os.path.dirname(export_filename)
170 |         if not os.path.isdir(export_path):
171 |             try:
172 |                 os.makedirs(export_path)
173 |             except OSError as oserror:
174 |                 if oserror.errno != errno.EEXIST:
175 |                     raise
176 | 
177 |         outfile = open(
178 |             export_filename,
179 |             'wb'
180 |         )
181 | 
182 |         file_name = file_entry.name
183 | 
184 |         tsk_file = file_entry._tsk_file
185 |         use_attribute = None
186 | 
187 |         if ads_name:
188 |             data_stream_name = ads_name
189 |             for attribute in tsk_file:
190 |                 if attribute.info.name == data_stream_name:
191 |                     use_attribute = attribute
192 |                     if data_stream_name == u'$J' and int(attribute.info.flags) & pytsk3.TSK_FS_ATTR_SPARSE:
193 |                         # If USN Journal, start at end of sparse data run #
194 |                         for run in attribute:
195 |                             print(" Blocks %s to %s (%s blocks) [flags: %s] - [offset: %d]" % (
196 |                                 run.addr, run.addr + run.len, run.len, str(run.flags), run.offset
197 |                             ))
198 |                             if run.flags != pytsk3.TSK_FS_ATTR_RUN_FLAG_SPARSE:
199 |                                 _offset = run.offset * tsk_file.info.fs_info.block_size
200 |                                 break
201 |                     break
202 | 
203 |         if _offset is None:
204 |             _offset = 0
205 | 
206 |         if use_attribute is not None:
207 |             _filesize = use_attribute.info.size
208 |         else:
209 |             _filesize = tsk_file.info.meta.size
210 | 
211 |         while _offset < _filesize:
212 |             available_to_read = min(self._READ_BUFFER_SIZE, _filesize - _offset)
213 | 
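            # Reading a named (alternate) data stream means addressing its
            # specific TSK attribute via read_random(offset, size,
            # attr_type, attr_id); the plain two-argument read_random below
            # covers only the default $DATA stream.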
214 |             if use_attribute is not None:
215 |                 data = tsk_file.read_random(
216 |                     _offset,
217 |                     available_to_read,
218 |                     use_attribute.info.type,
219 |                     use_attribute.info.id
220 |                 )
221 |             else:
222 |                 data = tsk_file.read_random(
223 |                     _offset,
224 |                     available_to_read
225 |                 )
226 | 
227 |             if not data:
228 |                 break
229 | 
230 |             _offset += len(data)
231 | 
232 |             outfile.write(data)
233 | 
234 |         outfile.close()
235 |         return True
236 | 
237 |     #def AddFileToQueue(self,tsk_file_entry):
238 |     def AddFileToQueue(self,tsk_file_entry, full_path):
239 |         ''' ORIG
240 |         einfo = ExtractionInfo(
241 |             tsk_file_entry.path_spec,
242 |             tsk_file_entry.full_path
243 |         )
244 |         '''
245 |         einfo = ExtractionInfo(
246 |             tsk_file_entry.path_spec,
247 |             full_path
248 |         )
249 |         self.file_queue.put(einfo)
250 | 
251 |     def Finish(self):
252 |         self.file_queue.put(u'TERMINATE')
253 | 
254 | class ExtractionInfo():
255 |     def __init__(self,path_spec,full_path):
256 |         self.path_spec = path_spec
257 |         self.full_path = full_path
258 | 
259 | 
260 | if __name__ == "__main__":
261 |     parser = ArgumentParser(prog='bcnlp_fxtract.py', description='File Extraction')
262 |     parser.add_argument('--config', action='store', help="Config file [config.txt]")
263 |     parser.add_argument('--image', action='store', help="Image")
264 | 
265 |     args = parser.parse_args()
266 | 
267 |     config_file = args.config
268 |     if config_file is None:
269 |         config_file = "config.txt"
270 | 
271 |     source_path = args.image
272 |     if source_path is None:
273 |         print("Image not specified")
274 |         raise SystemExit(1)
275 | 
276 | 
277 |     '''
278 |     # Get the basePathSpec of the given file/dir
279 |     #spath_basename = os.path.basename(spath)
280 |     stat_info = os.stat(source_path)
281 | 
282 |     path_spec = path_spec_factory.Factory.NewPathSpec(
283 |         definitions.TYPE_INDICATOR_OS, location=source_path)
284 | 
285 |     #bl_fle = bcnlp_listfiles.FileEntryLister()
286 |     bl_fle = FileEntryLister()
287 |     base_path_spec = bl_fle.GetBasePathSpec(source_path)
288 | 
289 |     #num_partitions = file_entry_lister.ListAllFiles(source_path)
290 | 
291 |     ####file_system = resolver.Resolver.OpenFileSystem(base_path_spec)
292 |     ####file_entry = resolver.Resolver.OpenFileEntry(base_path_spec)
293 | 
294 |     output_path = os.path.join(os.getcwd(), "extracted_files" )
295 |     #fe = FileExtractor(base_path_spec)
296 |     fe = FileExtractor(base_path_spec, output_path)
297 |     file_entry = resolver.Resolver.OpenFileEntry(base_path_spec)
298 |     '''
299 | 
300 |     bl_fle = FileEntryLister()
301 |     base_path_spec = bl_fle.GetBasePathSpec(source_path)
302 | 
303 |     output_path = os.path.join(os.getcwd(), "new_extracted_files" )
304 |     file_location = os.path.join(os.getcwd(), "new_extracted_files" )
305 |     #image_location = os.path.join(os.getcwd(), "disk_images")
306 |     os_path_spec = path_spec_factory.Factory.NewPathSpec(definitions.TYPE_INDICATOR_OS, location=source_path)
307 |     ewf_path_spec = path_spec_factory.Factory.NewPathSpec(definitions.TYPE_INDICATOR_EWF, parent=os_path_spec)
308 | 
309 |     tsk_partition_path_spec = path_spec_factory.Factory.NewPathSpec(definitions.TYPE_INDICATOR_TSK_PARTITION, location=u'/p1', parent=ewf_path_spec)
310 |     tsk_path_spec = path_spec_factory.Factory.NewPathSpec(definitions.TYPE_INDICATOR_TSK, location=file_location, parent=tsk_partition_path_spec)
311 | 
312 |     #file_entry = resolver.Resolver.OpenFileEntry(tsk_path_spec)
313 |     file_entry = resolver.Resolver.OpenFileEntry(base_path_spec)
314 |     print("file_entry: ", file_entry)
315 | 
316 |     #fe = FileExtractor(tsk_path_spec, output_path)
317 |     fe = FileExtractor(base_path_spec,
output_path) 318 | fe.start() 319 | #fe.AddFileToQueue(file_entry) 320 | fe.AddFileToQueue(file_entry, source_path) 321 | print ("Added files to queue") 322 | 323 | fe.Finish() 324 | 325 | -------------------------------------------------------------------------------- /bcnlp_listfiles.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | # BitCurator NLP 5 | # Copyright (C) 2016 - 2018 6 | # All rights reserved. 7 | # 8 | # This code is distributed under the terms of the GNU General Public 9 | # License, Version 3. See the text file "COPYING" for further details 10 | # about the terms of this license. 11 | # 12 | # This file contains the File Extraction routines for BitCurator NLP. 13 | # 14 | 15 | """Script to list file entries. 16 | Extended from dfvfs example code list_file_entries.py 17 | """ 18 | 19 | from __future__ import print_function 20 | import argparse 21 | import logging 22 | import os 23 | import stat 24 | import sys 25 | 26 | from dfvfs.analyzer import analyzer 27 | from dfvfs.lib import definitions 28 | from dfvfs.lib import raw 29 | from dfvfs.path import factory as path_spec_factory 30 | from dfvfs.resolver import resolver 31 | from dfvfs.volume import tsk_volume_system 32 | 33 | class FileEntryLister(object): 34 | """Class that lists file entries.""" 35 | 36 | # Class constant that defines the default read buffer size. 37 | _READ_BUFFER_SIZE = 32768 38 | 39 | # For context see: http://en.wikipedia.org/wiki/Byte 40 | _UNITS_1000 = [u'B', u'kB', u'MB', u'GB', u'TB', u'EB', u'ZB', u'YB'] 41 | _UNITS_1024 = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'EiB', u'ZiB', u'YiB'] 42 | 43 | def _GetHumanReadableSize(self, size): 44 | """Retrieves a human readable string of the size. 45 | 46 | Args: 47 | size: The size in bytes. 48 | 49 | Returns: 50 | A human readable string of the size. 51 | """ 52 | magnitude_1000 = 0 53 | size_1000 = float(size) 54 | while size_1000 >= 1000: 55 | size_1000 /= 1000 56 | magnitude_1000 += 1 57 | 58 | magnitude_1024 = 0 59 | size_1024 = float(size) 60 | while size_1024 >= 1024: 61 | size_1024 /= 1024 62 | magnitude_1024 += 1 63 | 64 | size_string_1000 = None 65 | if magnitude_1000 > 0 and magnitude_1000 <= 7: 66 | size_string_1000 = u'{0:.1f}{1:s}'.format( 67 | size_1000, self._UNITS_1000[magnitude_1000]) 68 | 69 | size_string_1024 = None 70 | if magnitude_1024 > 0 and magnitude_1024 <= 7: 71 | size_string_1024 = u'{0:.1f}{1:s}'.format( 72 | size_1024, self._UNITS_1024[magnitude_1024]) 73 | 74 | if not size_string_1000 or not size_string_1024: 75 | return u'{0:d} B'.format(size) 76 | 77 | return u'{0:s} / {1:s} ({2:d} B)'.format( 78 | size_string_1024, size_string_1000, size) 79 | 80 | def _GetNextLevelTSKPartionVolumeSystemPathSpec(self, source_path_spec, is_single_part): 81 | """Determines the next level volume system path specification. 82 | 83 | Args: 84 | source_path_spec: the source path specification (instance of 85 | dfvfs.PathSpec). 86 | 87 | Returns: 88 | The next level volume system path specification (instance of 89 | dfvfs.PathSpec). 90 | 91 | Raises: 92 | RuntimeError: if the format of or within the source is not supported. 
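    Note: when the image holds a single partition, or is_single_part is
    True, the '/p1' TSK partition location is returned without prompting;
    otherwise the user is asked on stdin to pick a partition identifier.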
93 | """ 94 | volume_system_path_spec = path_spec_factory.Factory.NewPathSpec( 95 | definitions.TYPE_INDICATOR_TSK_PARTITION, location=u'/', 96 | parent=source_path_spec) 97 | 98 | volume_system = tsk_volume_system.TSKVolumeSystem() 99 | volume_system.Open(volume_system_path_spec) 100 | 101 | volume_identifiers = [] 102 | for volume in volume_system.volumes: 103 | volume_identifier = getattr(volume, 'identifier', None) 104 | if volume_identifier: 105 | volume_identifiers.append(volume_identifier) 106 | 107 | if not volume_identifiers: 108 | logging.warning(u'No supported partitions found.') 109 | return source_path_spec 110 | 111 | if (len(volume_identifiers) == 1) or (is_single_part == True): 112 | return path_spec_factory.Factory.NewPathSpec( 113 | definitions.TYPE_INDICATOR_TSK_PARTITION, location=u'/p1', 114 | parent=source_path_spec) 115 | 116 | print(u'The following partitions were found:') 117 | print(u'Identifier\tOffset\t\t\tSize') 118 | 119 | for volume_identifier in sorted(volume_identifiers): 120 | volume = volume_system.GetVolumeByIdentifier(volume_identifier) 121 | if not volume: 122 | raise RuntimeError( 123 | u'Volume missing for identifier: {0:s}.'.format(volume_identifier)) 124 | 125 | volume_extent = volume.extents[0] 126 | print( 127 | u'{0:s}\t\t{1:d} (0x{1:08x})\t{2:s}'.format( 128 | volume.identifier, volume_extent.offset, 129 | self._GetHumanReadableSize(volume_extent.size))) 130 | 131 | print(u'') 132 | 133 | while True: 134 | print( 135 | u'Please specify the identifier of the partition that should ' 136 | u'be processed:') 137 | 138 | selected_volume_identifier = sys.stdin.readline() 139 | selected_volume_identifier = selected_volume_identifier.strip() 140 | 141 | if selected_volume_identifier in volume_identifiers: 142 | break 143 | 144 | print(u'') 145 | print( 146 | u'Unsupported partition identifier, please try again or abort ' 147 | u'with Ctrl^C.') 148 | print(u'') 149 | 150 | location = u'/{0:s}'.format(selected_volume_identifier) 151 | 152 | return path_spec_factory.Factory.NewPathSpec( 153 | definitions.TYPE_INDICATOR_TSK_PARTITION, location=location, 154 | parent=source_path_spec) 155 | 156 | def _GetNextLevelTSKPartionVolumeSystemPathSpecForBcnlp(self, \ 157 | source_path_spec,\ 158 | spath): 159 | """Determines the next level volume system path specification. 160 | and calls ListFileEntry to output the file-list from every 161 | partition into the specified output file. 162 | 163 | Args: 164 | source_path_spec: the source path specification (instance of 165 | dfvfs.PathSpec). 166 | spath: source path 167 | 168 | Returns: 169 | number of Partitions 170 | 171 | Raises: 172 | RuntimeError: if the format of or within the source is not supported. 
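    Note: one list file is written per partition, named
    <spath>_filelist_<identifier>, alongside the source path.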
173 | """ 174 | fname = sys._getframe().f_code.co_name 175 | spath_basename = os.path.basename(spath) 176 | volume_system_path_spec = path_spec_factory.Factory.NewPathSpec( 177 | definitions.TYPE_INDICATOR_TSK_PARTITION, location=u'/', 178 | parent=source_path_spec) 179 | 180 | volume_system = tsk_volume_system.TSKVolumeSystem() 181 | volume_system.Open(volume_system_path_spec) 182 | 183 | volume_identifiers = [] 184 | for volume in volume_system.volumes: 185 | volume_identifier = getattr(volume, 'identifier', None) 186 | if volume_identifier: 187 | volume_identifiers.append(volume_identifier) 188 | 189 | if not volume_identifiers: 190 | logging.warning(u'No supported partitions found.') 191 | return source_path_spec 192 | 193 | print(u'The following partitions were found:') 194 | print(u'Identifier\tOffset\t\t\tSize') 195 | 196 | for volume_identifier in sorted(volume_identifiers): 197 | volume = volume_system.GetVolumeByIdentifier(volume_identifier) 198 | if not volume: 199 | raise RuntimeError( 200 | u'Volume missing for identifier: {0:s}.'.format(volume_identifier)) 201 | 202 | volume_extent = volume.extents[0] 203 | print( 204 | u'{0:s}\t\t{1:d} (0x{1:08x})\t{2:s}'.format( 205 | volume.identifier, volume_extent.offset, 206 | self._GetHumanReadableSize(volume_extent.size))) 207 | 208 | print(u'') 209 | 210 | for volume_identifier in sorted(volume_identifiers): 211 | volume = volume_system.GetVolumeByIdentifier(volume_identifier) 212 | location = u'/{0:s}'.format(volume_identifier) 213 | base_path_spec = path_spec_factory.Factory.NewPathSpec( \ 214 | definitions.TYPE_INDICATOR_TSK_PARTITION, location=location, \ 215 | parent=source_path_spec) 216 | 217 | logging.info("%s: Listing files for partition: %s ", fname, \ 218 | volume_identifier) 219 | 220 | base_path_spec = path_spec_factory.Factory.NewPathSpec( 221 | definitions.TYPE_INDICATOR_TSK, location=u'/', 222 | parent=base_path_spec) 223 | 224 | outfile = spath + "_filelist_" + str(volume_identifier) 225 | of = open(outfile, "a") 226 | logging.info("%s:Calling ListFileEntries to write list to file %s ",\ 227 | fname, outfile) 228 | self.ListFileEntries(base_path_spec, of, volume_identifier) 229 | of.close() 230 | return len(volume_identifiers) 231 | 232 | def _GetNextLevelVshadowVolumeSystemPathSpec(self, source_path_spec): 233 | """Determines the next level volume system path specification. 234 | 235 | Args: 236 | source_path_spec: the source path specification (instance of 237 | dfvfs.PathSpec). 238 | 239 | Returns: 240 | The next level volume system path specification (instance of 241 | dfvfs.PathSpec). 242 | 243 | Raises: 244 | RuntimeError: if the format of or within the source is not supported. 245 | """ 246 | # TODO: implement. 247 | return source_path_spec 248 | 249 | def _GetUpperLevelVolumeSystemPathSpec(self, source_path_spec, is_single_part): 250 | """Determines the upper level volume system path specification. 251 | 252 | Args: 253 | source_path_spec: the source path specification (instance of 254 | dfvfs.PathSpec). 255 | 256 | Returns: 257 | The upper level volume system path specification (instance of 258 | dfvfs.PathSpec). 259 | 260 | Raises: 261 | RuntimeError: if the format of or within the source is not supported. 262 | """ 263 | type_indicators = analyzer.Analyzer.GetVolumeSystemTypeIndicators( 264 | source_path_spec) 265 | 266 | if not type_indicators: 267 | # No supported volume system found, we are at the upper level. 
268 | return source_path_spec 269 | 270 | if len(type_indicators) > 1: 271 | raise RuntimeError( 272 | u'Unsupported source found more than one volume system types.') 273 | 274 | if type_indicators[0] == definitions.TYPE_INDICATOR_TSK_PARTITION: 275 | path_spec = self._GetNextLevelTSKPartionVolumeSystemPathSpec( 276 | source_path_spec, is_single_part) 277 | 278 | elif type_indicators[0] == definitions.TYPE_INDICATOR_VSHADOW: 279 | path_spec = self._GetNextLevelVshadowVolumeSystemPathSpec( 280 | source_path_spec) 281 | 282 | else: 283 | raise RuntimeError(( 284 | u'Unsupported source found unsupported volume system ' 285 | u'type: {0:s}.').format(type_indicators[0])) 286 | 287 | return path_spec 288 | 289 | def _GetUpperLevelVolumeSystemPathSpecForBcnlp(self, source_path_spec, spath): 290 | """Determines the upper level volume system path specification, 291 | then calls methods to lsit files from all partitions into the 292 | specified file. 293 | 294 | Args: 295 | source_path_spec: the source path specification (instance of 296 | dfvfs.PathSpec). 297 | of: Output file descriptor 298 | 299 | Returns: number partitions 300 | 301 | Raises: 302 | RuntimeError: if the format of or within the source is not supported. 303 | """ 304 | fname = sys._getframe().f_code.co_name 305 | type_indicators = analyzer.Analyzer.GetVolumeSystemTypeIndicators( 306 | source_path_spec) 307 | 308 | if not type_indicators: 309 | # No supported volume system found, we are at the upper level. 310 | return source_path_spec 311 | 312 | if len(type_indicators) > 1: 313 | raise RuntimeError( 314 | u'Unsupported source found more than one volume system types.') 315 | 316 | if type_indicators[0] == definitions.TYPE_INDICATOR_TSK_PARTITION: 317 | partitions = self._GetNextLevelTSKPartionVolumeSystemPathSpecForBcnlp( 318 | source_path_spec, spath) 319 | 320 | elif type_indicators[0] == definitions.TYPE_INDICATOR_VSHADOW: 321 | path_spec = self._GetNextLevelVshadowVolumeSystemPathSpec( 322 | source_path_spec) 323 | 324 | else: 325 | raise RuntimeError(( 326 | u'Unsupported source found unsupported volume system ' 327 | u'type: {0:s}.').format(type_indicators[0])) 328 | 329 | return partitions 330 | 331 | def _ListFileEntry( 332 | self, file_system, file_entry, parent_full_path, output_writer, p): 333 | """Lists a file entry. 334 | #self, file_system, file_entry, parent_full_path, output_writer, p): 335 | 336 | Args: 337 | file_system: the file system (instance of dfvfs.FileSystem). 338 | file_entry: the file entry (instance of dfvfs.FileEntry). 339 | parent_full_path: the full path of the parent file entry. 340 | output_writer: the output writer (instance of StdoutWriter). 341 | """ 342 | # Since every file system implementation can have their own path 343 | # segment separator we are using JoinPath to be platform and file system 344 | # type independent. 345 | full_path = file_system.JoinPath([parent_full_path, file_entry.name]) 346 | if file_entry.IsFile(): 347 | output_writer.write(full_path) 348 | output_writer.write("\n") 349 | 350 | for sub_file_entry in file_entry.sub_file_entries: 351 | self._ListFileEntry(file_system, sub_file_entry, full_path, output_writer, p) 352 | 353 | def ListFileEntries(self, base_path_spec, output_writer, partition): 354 | """Lists file entries in the base path specification. 355 | 356 | Args: 357 | base_path_spec: the base path specification (instance of dfvfs.PathSpec). 358 | output_writer: the output writer (instance of StdoutWriter). 
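      partition: the partition identifier (currently just passed through
        to _ListFileEntry).

    Example (a minimal sketch; any object with a write() method can act
    as the output writer):
      >>> lister = FileEntryLister()
      >>> spec = lister.GetBasePathSpec(
      ...     u'disk_images/govdocs45sampler.E01', is_single_part=True)
      >>> with open(u'filelist.txt', 'w') as out:
      ...     lister.ListFileEntries(spec, out, u'p1')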
359 |     """
360 | 
361 |     file_system = resolver.Resolver.OpenFileSystem(base_path_spec)
362 |     file_entry = resolver.Resolver.OpenFileEntry(base_path_spec)
363 |     if file_entry is None:
364 |       print(
365 |           u'Unable to open base path specification:\n{0:s}'.format(
366 |               base_path_spec.comparable))
367 |       logging.warning(
368 |           u'Unable to open base path specification:\n{0:s}'.format(
369 |               base_path_spec.comparable))
370 |       return
371 | 
372 |     self._ListFileEntry(file_system, file_entry, u'', output_writer, partition)
373 | 
374 |   def GetInodeForFile(self, image_path, file_path):
375 |     """Returns the Inode of the given file in the given image.
376 | 
377 |     Args:
378 |       image_path: Path to the image
379 |       file_path: Path to the given file.
380 |     """
381 |     logging.info("GetInode: image_path: %s ", image_path)
382 |     logging.info("GetInode: file_path:%s ", file_path)
383 |     os_path_spec = path_spec_factory.Factory.NewPathSpec(\
384 |         definitions.TYPE_INDICATOR_OS, location=image_path)
385 |     ewf_path_spec = path_spec_factory.Factory.NewPathSpec(\
386 |         definitions.TYPE_INDICATOR_EWF, parent=os_path_spec)
387 |     tsk_partition_path_spec = path_spec_factory.Factory.NewPathSpec(\
388 |         definitions.TYPE_INDICATOR_TSK_PARTITION, \
389 |         location=u'/p1', parent=ewf_path_spec)
390 |     tsk_path_spec = path_spec_factory.Factory.NewPathSpec(\
391 |         definitions.TYPE_INDICATOR_TSK, location=file_path, \
392 |         parent=tsk_partition_path_spec)
393 |     file_entry = resolver.Resolver.OpenFileEntry(tsk_path_spec)
394 | 
395 |     if file_entry is None:
396 |       return -1
397 | 
398 |     stat_object = file_entry.GetStat()
399 | 
400 |     logging.info("Inode: for file %s = %s ",file_path, stat_object.ino)
401 |     return stat_object.ino
402 | 
403 |   def GetBasePathSpec(self, source_path, is_single_part=False):
404 |     """Determines the base path specification.
405 |     (If is_single_part is True (when this is called per partition),
406 |     it doesn't get into checking the individual partitions).
407 | 
408 |     Args:
409 |       source_path: the source path.
410 | 
411 |     Returns:
412 |       The base path specification (instance of dfvfs.PathSpec).
413 | 
414 |     Raises:
415 |       RuntimeError: if the source path does not exist, or if the source path
416 |                     is not a file or directory, or if the format of or within
417 |                     the source file is not supported.
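    Note: detection cascades from storage-media-image type, through a RAW
    glob fallback, to the volume system and finally the file system; only
    TSK-supported file systems are accepted, otherwise the source falls
    back to a plain OS path specification.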
418 | """ 419 | if not os.path.exists(source_path): 420 | raise RuntimeError(u'No such source: {0:s}.'.format(source_path)) 421 | 422 | stat_info = os.stat(source_path) 423 | 424 | if (not stat.S_ISDIR(stat_info.st_mode) and 425 | not stat.S_ISREG(stat_info.st_mode)): 426 | raise RuntimeError( 427 | u'Unsupported source: {0:s} not a file or directory.'.format( 428 | source_path)) 429 | 430 | if stat.S_ISDIR(stat_info.st_mode): 431 | path_spec = path_spec_factory.Factory.NewPathSpec( 432 | definitions.TYPE_INDICATOR_OS, location=source_path) 433 | 434 | elif stat.S_ISREG(stat_info.st_mode): 435 | path_spec = path_spec_factory.Factory.NewPathSpec( 436 | definitions.TYPE_INDICATOR_OS, location=source_path) 437 | 438 | type_indicators = analyzer.Analyzer.GetStorageMediaImageTypeIndicators( 439 | path_spec) 440 | 441 | if len(type_indicators) > 1: 442 | raise RuntimeError(( 443 | u'Unsupported source: {0:s} found more than one storage media ' 444 | u'image types.').format(source_path)) 445 | 446 | if len(type_indicators) == 1: 447 | path_spec = path_spec_factory.Factory.NewPathSpec( 448 | type_indicators[0], parent=path_spec) 449 | 450 | if not type_indicators: 451 | # The RAW storage media image type cannot be detected based on 452 | # a signature so we try to detect it based on common file naming 453 | # schemas. 454 | file_system = resolver.Resolver.OpenFileSystem(path_spec) 455 | raw_path_spec = path_spec_factory.Factory.NewPathSpec( 456 | definitions.TYPE_INDICATOR_RAW, parent=path_spec) 457 | 458 | glob_results = raw.RawGlobPathSpec(file_system, raw_path_spec) 459 | if glob_results: 460 | path_spec = raw_path_spec 461 | 462 | # In case we did not find a storage media image type we keep looking 463 | # since not all RAW storage media image naming schemas are known and 464 | # its type can only detected by its content. 465 | 466 | path_spec = self._GetUpperLevelVolumeSystemPathSpec(path_spec, is_single_part) 467 | 468 | # In case we did not find a volume system type we keep looking 469 | # since we could be dealing with a store media image that contains 470 | # a single volume. 471 | 472 | type_indicators = analyzer.Analyzer.GetFileSystemTypeIndicators( 473 | path_spec) 474 | 475 | if len(type_indicators) > 1: 476 | raise RuntimeError(( 477 | u'Unsupported source: {0:s} found more than one file system ' 478 | u'types.').format(source_path)) 479 | 480 | if not type_indicators: 481 | logging.warning(u'Unable to find a supported file system.') 482 | path_spec = path_spec_factory.Factory.NewPathSpec( 483 | definitions.TYPE_INDICATOR_OS, location=source_path) 484 | 485 | elif type_indicators[0] != definitions.TYPE_INDICATOR_TSK: 486 | raise RuntimeError(( 487 | u'Unsupported source: {0:s} found unsupported file system ' 488 | u'type: {1:s}.').format(source_path, type_indicators[0])) 489 | 490 | else: 491 | path_spec = path_spec_factory.Factory.NewPathSpec( 492 | definitions.TYPE_INDICATOR_TSK, location=u'/', 493 | parent=path_spec) 494 | 495 | return path_spec 496 | 497 | def ListAllFiles(self, source_path): 498 | """Determines the base path specification and lists all the files 499 | per partition, in the given disk image file, into text files 500 | _filelist_ 501 | Note: Modified routine from the original GetBasePathSpec method 502 | 503 | Args: 504 | source_path: the source path. 505 | 506 | Returns: 507 | The base path specification (instance of dfvfs.PathSpec). 
508 | 509 | Raises: 510 | RuntimeError: if the source path does not exist, or if the source path 511 | is not a file or directory, or if the format of or within 512 | the source file is not supported. 513 | """ 514 | 515 | fname = sys._getframe().f_code.co_name 516 | logging.debug("%s: Listing files for %s", fname, source_path) 517 | if not os.path.exists(source_path): 518 | raise RuntimeError(u'No such source: {0:s}.'.format(source_path)) 519 | 520 | stat_info = os.stat(source_path) 521 | 522 | if (not stat.S_ISDIR(stat_info.st_mode) and 523 | not stat.S_ISREG(stat_info.st_mode)): 524 | raise RuntimeError( 525 | u'Unsupported source: {0:s} not a file or directory.'.format( 526 | source_path)) 527 | if stat.S_ISDIR(stat_info.st_mode): 528 | path_spec = path_spec_factory.Factory.NewPathSpec( 529 | definitions.TYPE_INDICATOR_OS, location=source_path) 530 | 531 | elif stat.S_ISREG(stat_info.st_mode): 532 | path_spec = path_spec_factory.Factory.NewPathSpec( 533 | definitions.TYPE_INDICATOR_OS, location=source_path) 534 | 535 | type_indicators = analyzer.Analyzer.GetStorageMediaImageTypeIndicators( 536 | path_spec) 537 | 538 | if len(type_indicators) > 1: 539 | raise RuntimeError(( 540 | u'Unsupported source: {0:s} found more than one storage media ' 541 | u'image types.').format(source_path)) 542 | 543 | if len(type_indicators) == 1: 544 | path_spec = path_spec_factory.Factory.NewPathSpec( 545 | type_indicators[0], parent=path_spec) 546 | 547 | if not type_indicators: 548 | # The RAW storage media image type cannot be detected based on 549 | # a signature so we try to detect it based on common file naming 550 | # schemas. 551 | file_system = resolver.Resolver.OpenFileSystem(path_spec) 552 | raw_path_spec = path_spec_factory.Factory.NewPathSpec( 553 | definitions.TYPE_INDICATOR_RAW, parent=path_spec) 554 | 555 | glob_results = raw.RawGlobPathSpec(file_system, raw_path_spec) 556 | if glob_results: 557 | path_spec = raw_path_spec 558 | 559 | # In case we did not find a storage media image type we keep looking 560 | # since not all RAW storage media image naming schemas are known and 561 | # its type can only detected by its content. 562 | 563 | partitions = self._GetUpperLevelVolumeSystemPathSpecForBcnlp(path_spec,\ 564 | source_path) 565 | 566 | return partitions 567 | def GetFileEntry(self, base_path_spec): 568 | return resolver.Resolver.OpenFileEntry(base_path_spec) 569 | 570 | 571 | class StdoutWriter(object): 572 | """Class that defines a stdout output writer.""" 573 | 574 | def Open(self): 575 | """Opens the output writer object. 576 | 577 | Returns: 578 | A boolean containing True if successful or False if not. 579 | """ 580 | return True 581 | 582 | def Close(self): 583 | """Closes the output writer object.""" 584 | pass 585 | 586 | def WriteFileEntry(self, path): 587 | """Writes the file path to stdout. 588 | 589 | Args: 590 | path: the path of the file. 591 | """ 592 | print(u'{0:s}'.format(path)) 593 | 594 | -------------------------------------------------------------------------------- /bcnlp_tm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # coding=UTF-8 3 | # 4 | # BitCurator NLP (Disk Image Access for the Web) 5 | # Copyright (C) 2014 - 2016 6 | # All rights reserved. 7 | # 8 | # This code is distributed under the terms of the GNU General Public 9 | # License, Version 3. See the text file "COPYING" for further details 10 | # about the terms of this license. 
11 | # 12 | # This file contains the main BitCurator NLP application for Topic modeling 13 | 14 | # Usage: python bcnlp_tm.py [--topics <10>] [--tm ] 15 | # Default num_topics = 10, tm=graphlab 16 | 17 | import os 18 | import logging 19 | import pyLDAvis 20 | import pyLDAvis.gensim 21 | from gensim import corpora, models, similarities 22 | import gensim 23 | import textract 24 | from bn_filextract import * 25 | from configobj import ConfigObj 26 | from stop_words import get_stop_words 27 | 28 | try: 29 | from argparse import ArgumentParser 30 | except ImportError: 31 | raise ImportError("This script requires ArgumentParser which is in Python 2.7 or Python 3.0") 32 | 33 | #logging.basicConfig(filename= 'bcnlp_tm.log', level=logging.DEBUG) 34 | logging.basicConfig(filename= 'bcnlp_tm_info.log', level=logging.INFO) 35 | logging.basicConfig(filename= 'bcnlp_tm_debug.log', level=logging.DEBUG) 36 | logging.basicConfig(filename= 'bcnlp_tm_warning.log', level=logging.WARNING) 37 | 38 | 39 | cfg_image = {} 40 | #documents = [] 41 | 42 | class BnTopicModel(): 43 | 44 | def tm_generate_gensim(self, infile, num_topics, config_file): 45 | ''' Using the APIs provided by gensim, LDAvis gui is invoked. 46 | NOTE: This is not yet tested well. 47 | ''' 48 | documents = [] 49 | documents = bn.bnTraverseInfileDir(infile, documents, config_file) 50 | if documents == []: 51 | print("Documents are empty") 52 | 53 | ''' #Debug 54 | i = 0 55 | for document in documents: 56 | logging.info("Document[%d] = %s ", i, document) 57 | i+=1 58 | ''' 59 | 60 | # remove common words and tokenize 61 | ''' 62 | stoplist = set('a an the of to for s from is and this \ 63 | was were are , - | @ . '.split()) 64 | texts = [[word for word in document.lower().split() \ 65 | if word not in stoplist] \ 66 | for document in documents] 67 | ''' 68 | 69 | # Remove stop words - both from known stopword list and from 70 | # configuration file. 71 | # NOTE: Gensim's preprocessing to remove stop words is commented out. 72 | # This seems to be doing better. Test with more dataset before 73 | # deciding on which one to keep. 74 | exc_list = bn.bnGetConfigInfo(config_file, \ 75 | "confset_section", "exclude_words") 76 | en_stop = get_stop_words('en') 77 | en_stop = en_stop + exc_list 78 | logging.info("Stop-words list: %s ", en_stop) 79 | texts = [[word for word in document.lower().split() \ 80 | if word not in en_stop] \ 81 | for document in documents] 82 | 83 | ## from pprint import pprint # pretty-printer 84 | ## pprint(texts) 85 | 86 | # remove words that appear only once 87 | from collections import defaultdict 88 | frequency = defaultdict(int) 89 | for text in texts: 90 | ''' 91 | # NOTE: Commenting for now. With the preprocessing in 92 | # filextract.py, we won't need this. Remove after testing. 93 | i = 0 94 | for word in text: 95 | # NOTE: Some text files need this conversion. See if this can 96 | # be done for the whole document at one time. 
97 | text[i] = unicode(word, errors='ignore') 98 | i+=1 99 | ''' 100 | for token in text: 101 | frequency[token] += 1 102 | 103 | texts = [[token for token in text if frequency[token] > 1] 104 | for text in texts] 105 | 106 | texts = [[token for token in text if len(token) > 2] 107 | for text in texts] 108 | 109 | # NOTE: lemmatize not working 110 | ###texts = gensim.utils.lemmatize(texts) 111 | 112 | dictionary = corpora.Dictionary(texts) 113 | 114 | ##logging.info("[V]: token:id: %s", dictionary.token2id) 115 | 116 | ## dictionary.compactify() 117 | dictionary.save('/tmp/saved_dict.dict') 118 | 119 | # Now convert tokenized documents to vectors: 120 | corpus = [dictionary.doc2bow(text) for text in texts] 121 | 122 | ## logging.info("[V] Corpus: %s ", corpus) 123 | 124 | # store to disk, for later use 125 | corpora.MmCorpus.serialize('/tmp/saved_dict.mm', corpus) 126 | 127 | ## Creating Transformations 128 | ## The transformations are standard Python objects, typically 129 | ## initialized (trained) by means of a training corpus: 130 | ## First, let's use tfidf for training: It just involves simply 131 | ## going thru the supplied corpus once and computing document 132 | ## frequencies of all its featuers. 133 | 134 | tfidf = models.TfidfModel(corpus) # step 1 -- initialize a model 135 | 136 | corpus_tfidf = tfidf[corpus] 137 | corpora.MmCorpus.serialize('/tmp/saved_corpus_tfidf.mm', corpus_tfidf) 138 | 139 | ''' 140 | # LSI model is commented out for now 141 | print "Printing TFIDF of given corpus \n" 142 | for doc in corpus_tfidf: 143 | print (doc) 144 | 145 | # Now Initialize an LSI transformation: num_topics set to 2 to make 146 | # it 2D lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, 147 | # num_topics=3) 148 | 149 | # create a double wrapper over the original corpus: 150 | # bow->tfidf->fold-in-lsi 151 | corpus_lsi = lsi[corpus_tfidf] 152 | 153 | print "Printing LSI topics" 154 | lsi.print_topics(4) 155 | 156 | for doc in corpus_lsi: 157 | print (doc) 158 | ''' 159 | 160 | # Create an LDA model 161 | ''' 162 | lda_model = models.LdaModel(corpus_tfidf, \ 163 | id2word=dictionary, \ 164 | num_topics=5) 165 | ''' 166 | lda_model = models.ldamodel.LdaModel(corpus=corpus, \ 167 | id2word=dictionary, \ 168 | num_topics=num_topics) 169 | corpus_lda = lda_model[corpus] 170 | 171 | corpus_lda_tfidf = lda_model[corpus_tfidf] 172 | 173 | # The following will print the topics in the logfile 174 | logging.info("Printing %s topics into log file: ", str(num_topics)) 175 | lda_model.print_topics(num_topics) 176 | 177 | # Generate data for the pyLDAvis interface from the lda_model above 178 | vis_data = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary) 179 | ###vis_data = pyLDAvis.gensim.prepare(lda_model, corpus_tfidf, dictionary) 180 | ##vis_data = pyLDAvis.gensim.prepare(lda_model, corpus_lda, dictionary) 181 | 182 | #pyLDAvis.display(vis_data) 183 | pyLDAvis.show(vis_data) 184 | 185 | def remove_punctuation(self, text): 186 | import string 187 | return text.translate(None, string.punctuation) 188 | 189 | def remove_digits(self, text): 190 | import string 191 | return text.translate(None, string.digits) 192 | 193 | def bnRemoveEmptyFiles(self, path): 194 | ''' Traverses the directory and recursively removes empty files. 
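        Only zero-byte regular files are deleted; directories are recursed
        into but otherwise left in place.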
195 |         '''
196 |         files = os.listdir(path)
197 |         if len(files):
198 |             for fl in files:
199 |                 fullpath = os.path.join(path, fl)
200 |                 if os.path.isdir(fullpath):
201 |                     self.bnRemoveEmptyFiles(fullpath)
202 |                 elif os.path.isfile(fullpath) and os.stat(fullpath).st_size == 0:
203 |                     logging.info("Removing file %s ", fullpath)
204 |                     os.remove(fullpath)
205 | 
206 | def bn_parse_config_file(config_file, section_name):
207 |     ''' Parses the config file to extract the image names and entity list.
208 |     '''
209 |     logging.info("bn_parse_config_file: Section: %s ", section_name)
210 |     config = ConfigObj(config_file)
211 |     section = config[section_name]
212 |     i = 0
213 |     cfg_entity_list = []
214 |     for key in section:
215 |         #if key == cfg_string:
216 |             # found the string
217 |             #return section[key]
218 |         if section_name == "image_section":
219 |             logging.info("parse_config: key: %s, section: %s", \
220 |                 key, section[key])
221 |             cfg_image[i] = key
222 |             i+=1
223 |         elif section_name == "entity_list_section":
224 |             flag = int(section[key])
225 |             if flag == 1:
226 |                 cfg_entity_list.append(key)
227 |     if section_name == "entity_list_section":
228 |         return cfg_entity_list
229 | 
230 | if __name__ == "__main__":
231 |     parser = ArgumentParser(prog='bcnlp_tm.py', description='Topic modeling')
232 |     parser.add_argument('--config', action='store', \
233 |         help="Config file[config.txt] ")
234 |     parser.add_argument('--infile', action='store', help="input directory ")
235 |     parser.add_argument('--tm', action='store', \
236 |         help="topic modeling :gensim/graphlab ")
237 |     parser.add_argument('--topics', action='store', help="number of topics ")
238 | 
239 |     args = parser.parse_args()
240 | 
241 |     # Infile specifies the directory of files to run the topic modeling on.
242 |     # If no argument specified, it will assume there are disk_images specified
243 |     # in the config file config.txt.
244 | 
245 |     infile = args.infile
246 |     tm = args.tm # Topic modeling type: gensim/graphlab
247 |     config_file = args.config
248 |     is_disk_image = False
249 | 
250 |     num_topics = 10
251 |     if args.topics:
252 |         num_topics = int(args.topics)  # gensim expects an integer topic count
253 | 
254 |     # default it to gensim
255 |     if tm is None:
256 |         tm = 'gensim'
257 | 
258 |     if config_file is None:
259 |         config_file = "config.txt"
260 | 
261 |     bn = BnFilextract()
262 |     if infile is None:
263 |         is_disk_image = True
264 | 
265 |         bn_parse_config_file(config_file, "image_section")
266 |         print(">> Images in the config file: ", cfg_image)
267 | 
268 |         infile = bn.bnGetConfigInfo(config_file, \
269 |             "confset_section", "file_staging_directory")
270 | 
271 |         i = 0
272 |         for img in cfg_image:
273 |             print(">> Extracting files from image {}...".format(cfg_image[img]))
274 |             bn.bnExtractFiles(None, cfg_image[img], i, None, config_file)
275 |             i += 1
276 |         print(">> ... Done ")
277 | 
278 |     '''
279 |     # NOTE: We needed this for Graphlab as we didn't do it in graphlab
280 |     # routine.
If that code is put back we need to make sure we call 281 | # bnTraverseInfileDir fron tm_generate_graphlab 282 | else: 283 | documents = [] 284 | print(">> Extracting files from ", infile) 285 | bn.bnTraverseInfileDir(infile, documents, config_file) 286 | ''' 287 | 288 | tmc = BnTopicModel() 289 | tmc.tm_generate_gensim(infile, num_topics, config_file) 290 | 291 | 292 | -------------------------------------------------------------------------------- /bn_filextract.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # coding=UTF-8 3 | # 4 | # BitCurator NLP 5 | # Copyright (C) 2016 - 2018 6 | # All rights reserved. 7 | # 8 | # This code is distributed under the terms of the GNU General Public 9 | # License, Version 3. See the text file "COPYING" for further details 10 | # about the terms of this license. 11 | # 12 | # This file contains the File Extraction routines for BitCurator NLP. 13 | # 14 | 15 | import pytsk3 16 | import pyewf 17 | import os 18 | import sys 19 | import string 20 | import time 21 | import re 22 | import logging 23 | from configobj import ConfigObj 24 | import subprocess 25 | from subprocess import Popen,PIPE 26 | #import xml.etree.ElementTree as ET 27 | import textract 28 | import logging 29 | from bcnlp_listfiles import FileEntryLister 30 | from bcnlp_fxtract import FileExtractor 31 | 32 | from gensim.parsing.preprocessing import remove_stopwords 33 | from gensim.parsing.preprocessing import preprocess_documents 34 | from gensim.parsing.preprocessing import stem_text 35 | from gensim.parsing.preprocessing import strip_numeric 36 | from gensim.parsing.preprocessing import strip_punctuation 37 | 38 | class ewf_Img_Info(pytsk3.Img_Info): 39 | 40 | def __init__(self, ewf_handle): 41 | self._ewf_handle = ewf_handle 42 | super(ewf_Img_Info, self).__init__( 43 | url="", type=pytsk3.TSK_IMG_TYPE_EXTERNAL) 44 | 45 | def close(self): 46 | self._ewf_handle.close() 47 | 48 | def read(self, offset, size): 49 | self._ewf_handle.seek(offset) 50 | return self._ewf_handle.read(size) 51 | 52 | def get_size(self): 53 | return self._ewf_handle.get_media_size() 54 | 55 | def bn_getimginfo(image_path): 56 | logging.info("bn_getimginfo: Image Info for image %s: ", image_path) 57 | filenames = pyewf.glob(image_path) 58 | ewf_handle = pyewf.handle() 59 | ewf_handle.open(filenames) 60 | 61 | img = ewf_Img_Info(ewf_handle) 62 | return img 63 | 64 | # Dict to number of partitions in each image 65 | partition_in = dict() 66 | 67 | logging.basicConfig(filename= 'bcnlp_tm_info.log', level=logging.INFO) 68 | logging.basicConfig(filename= 'bcnlp_tm_debug.log', level=logging.DEBUG) 69 | 70 | class BnFilextract: 71 | """ This class contains the file extraction methods from 72 | disk images. 
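    Typical flow: bnExtractFiles() maps an image's partitions with
    bnlpGetPartInfoForImage(), then queues each partition for dfvfs-based
    extraction through bnQueueFileForExtraction().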
73 |     """
74 |     num_partitions = 0
75 |     part_array = ["image_path", "addr", "slot_num", "start_offset", "desc"]
76 |     partDictList = []
77 |     num_partitions_ofimg = dict()
78 | 
79 |     def bnlpGetFsForImage(self, image_path, image_index, partition_num):
80 |         """ Gets the filesystem info for an image and partition,
81 |             using Pytsk3 method
82 | 
83 |         Args:
84 |             image_path: Path to disk image
85 |             image_index: Internally maintained serial number of the image
86 |             partition_num: Partition within the volume
87 | 
88 |         Returns:
89 |             Filesystem descriptor
90 |         """
91 |         logging.info('bnlpGetFsForImage: image_path: %s', image_path)
92 |         logging.info('bnlpGetFsForImage: index: %s', image_index)
93 |         logging.info('bnlpGetFsForImage: part: %s', partition_num)
94 |         img = bn_getimginfo(image_path)
95 | 
96 |         part_start = \
97 |             self.partDictList[int(image_index)][partition_num]['start_offset']
98 | 
99 |         fs = pytsk3.FS_Info(img, offset=(part_start * 512))
100 |         return fs
101 | 
102 |     def bnGetExFmtsFromConfigFile(self, config_file):
103 |         """Extract the list of excluded format types from the given
104 |         config file.
105 | 
106 |         Args:
107 |             config_file: Configuration file
108 | 
109 |         Returns:
110 |             list of the format types to be excluded.
111 |         """
112 |         exc_fmt_list = []
113 |         config = ConfigObj(config_file)
114 |         section = config["exclude_format_section"]
115 |         for key in section:
116 |             if section[key]:
117 |                 exc_fmt_list.append(key)
118 | 
119 |         return exc_fmt_list
120 | 
121 |     def bnGetFileContents(self, filename, config_file):
122 |         """Extract the contents of a file while doing nlp on a local
123 |         directory of files as opposed to a disk image.
124 | 
125 |         Args:
126 |             filename: Given file
127 |             config_file: Configuration file
128 |         """
129 |         file_extract_dir = self.bnGetConfigInfo(config_file, \
130 |             "confset_section", "file_staging_directory")  # NOTE: currently unused here
131 |         if filename.endswith('.txt') or filename.endswith('.TXT'):
132 |             with open(filename, 'r') as tempfile:
133 |                 #input_file_contents = tempfile.read().replace('\n', '')
134 |                 input_file_contents = tempfile.read()
135 | 
136 |         else:
137 |             # Eliminate the files that are configured to be excluded
138 |             fn, filetype = os.path.splitext(filename)
139 |             exc_fmt_list = self.bnGetExFmtsFromConfigFile(config_file)
140 |             if filetype in exc_fmt_list:
141 |                 logging.info("File type %s excluded: %s", filetype, fn)
142 |                 return None
143 |             logging.info("Filename %s is not a txt file. So textracting", \
144 |                 filename)
145 | 
146 |             try:
147 |                 input_file_contents = textract.process(filename)
148 |                 logging.info(">>> Textract PASSED for file %s ", filename)
149 |                 #logging.info("bcnlp:: File contents of %s %s ",\
150 |                 #    filename, input_file_contents)
151 |             except:
152 |                 logging.info("\n >>> Textract failed for doc %s ", filename)
153 |                 return None
154 | 
155 |         return input_file_contents
156 | 
157 |     def bnTraverseInfileDir(self, extracted_files, documents, config_file):
158 |         ''' This routine traverses the given directory to extract the
159 |         files and adds the contents to the global documents list.
160 | 
161 |         Args:
162 |             extracted_files: Directory whose files need to be extracted.
163 |             documents: Where the contents of the files will go.
164 |             config_file: Configuration file.
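        Returns:
            The documents list, with one preprocessed (punctuation- and
            digit-stripped) text entry appended per readable file.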
165 | ''' 166 | 167 | print("bnTraverseInfileDir: extracted_files: ", extracted_files) 168 | num_docs = 0 169 | for root, dirs, files in os.walk(extracted_files): 170 | path = root.split(os.sep) 171 | ''' 172 | logging.info("traverse: path: %s, length: %s ", path, len(path)) 173 | logging.info("traverse: dirs: %s ", dirs) 174 | logging.info("traverse: files: %s ", files) 175 | ''' 176 | for filename in files: 177 | file_path = '/'.join(path) + '/' + filename 178 | doc = self.bnGetFileContents(file_path, config_file) 179 | if doc == None: 180 | logging.info(">> Filename %s is empty. Skipping ", 181 | file_path) 182 | continue 183 | doc = unicode(doc, errors='ignore') 184 | 185 | logging.info("[V]: traverse: Appending doc %s \ 186 | to documents list ", filename) 187 | ##logging.info("[VV]Document %s before preprocessing: %s",\ 188 | ## filename, doc) 189 | #doc = remove_stopwords(doc) 190 | doc = strip_punctuation(doc) 191 | doc = strip_numeric(doc) 192 | logging.info("Preprocessing done on DOC %s ", filename) 193 | ## logging.info("[VV]Preprocessing done on %s : %s", \ 194 | ## filename, doc) 195 | if doc != None: 196 | documents.append(doc) 197 | num_docs += 1 198 | logging.info("[D]traverse: Total num docs: %d", num_docs) 199 | return documents 200 | 201 | def bnTraverseDirForPlot(self, img, extracted_files, ent, parse_en, config_file): 202 | ''' This routine traverses the given directory to extract the 203 | files and invokes a routine to process the text for plotting 204 | purposes. 205 | 206 | Args: 207 | img: Image index 208 | extracted_files: Directory whose files need to be extracted. 209 | ent: Handle to ParseForEnts class 210 | parse_en: Spacy handle 211 | config_file: Configuration file. 212 | ''' 213 | 214 | num_docs = 0 215 | for root, dirs, files in os.walk(extracted_files): 216 | path = root.split(os.sep) 217 | 218 | entity_list = ent.bnParseConfigFileForEnts("config.txt") 219 | 220 | for filename in files: 221 | file_path = '/'.join(path) + '/' + filename 222 | file_contents = self.bnGetFileContents(file_path, config_file) 223 | logging.info("D2: traverse: getting contents from %s ",\ 224 | file_path) 225 | logging.info("bcnlpProcesstext for file:%s", file_path) 226 | 227 | try: 228 | ent.bcnlpProcessText(img, filename, unicode(file_contents,\ 229 | "utf-8"), entity_list, parse_en, bg=False) 230 | except: 231 | logging.info("bcnlpProcessText failed for img:%s, file:%s",\ 232 | str(img), filename) 233 | continue 234 | 235 | num_docs += 1 236 | #logging.info("[V] traverse: Total num docs: %d ", num_docs) 237 | 238 | def bnDfvfsGenerateFileList(self, image_path): 239 | """ Using Dfvfs methods, file-list is geenrated from the given 240 | disk image in an output file with the name: 241 | _filelist 242 | 243 | Args: 244 | image_path: Path to specified image 245 | 246 | """ 247 | num_partitions = 1 248 | 249 | file_entry_lister = FileEntryLister() 250 | try: 251 | num_partitions = file_entry_lister.ListAllFiles(image_path) 252 | 253 | except: 254 | print "file_entry_lister failed" 255 | return(0) 256 | 257 | return num_partitions 258 | 259 | def bnCreateDirsInPath(self, file_extract_dir, filepath): 260 | """ Looking at the path of the file, directories are created 261 | if they don't yet exist. 262 | 263 | Args: 264 | file_extract_dir: Directory where files are to be extracted. 
265 | filepath: Path to the file 266 | """ 267 | filename = os.path.basename(filepath) 268 | dir_name = os.path.dirname(filepath) 269 | 270 | current_dir = os.path.join(os.getcwd(), file_extract_dir) 271 | file_list = filepath.split('/') 272 | logging.info("bnCreateDirsInPath: file_list: %s ", file_list) 273 | 274 | listlen = len(file_list) 275 | 276 | newdir = os.path.join(current_dir, file_list[0]) 277 | for i in range (0, listlen-1): 278 | #logging.info("i:%s file_list[i]: %s", i, file_list[i]) 279 | if os.path.exists(newdir): 280 | newdir = os.path.join(newdir, file_list[i+1]) 281 | else: 282 | logging.info("bnCreateDirsInPath: Creating dir: %s", newdir) 283 | os.mkdir(newdir) 284 | newdir = os.path.join(newdir, file_list[i+1]) 285 | 286 | def bnQueueFileForExtraction(self,\ 287 | base_path_spec,\ 288 | image_path, output_path, jobs): 289 | """ This routine pushes the file_entry corresponding to the 290 | given path_spec into the queue for file-extraction using 291 | dfvfs/FileExtractor APIs. 292 | Args: 293 | base_path_spec 294 | image_path: path to disk image 295 | output_path: Where the extracted files will go. 296 | jobs: Jobs queue for extraction task. 297 | """ 298 | fname = sys._getframe().f_code.co_name 299 | file_entry_lister = FileEntryLister() 300 | 301 | file_entry = file_entry_lister.GetFileEntry(base_path_spec) 302 | 303 | fe = FileExtractor(base_path_spec, output_path) 304 | jobs.append(fe) 305 | logging.info("[%s]: Jobs before adding to queue: %s ", fname, jobs) 306 | fe.start() 307 | fe.AddFileToQueue(file_entry, image_path) 308 | 309 | fe.Finish() 310 | logging.info("[%s]: Jobs after adding to the queue: %s" ,fname, jobs) 311 | 312 | for job in jobs: 313 | job.join() 314 | 315 | def bnExtractFiles(self, ent, image, image_index, parse_en, config_file): 316 | """ Generate file-list from the disk image and extract the 317 | files into a specified directory. 318 | 319 | Args: 320 | ent: Placeholder 321 | image: disk image 322 | image_index: Internally maintained index for the image 323 | parse_en: Placeholder 324 | config_file: Name of the configuration file. 
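        Note: extracted files land under
        <file_staging_directory>/<image_index>/, and each partition found
        by bnlpGetPartInfoForImage is queued as a separate FileExtractor
        job via bnQueueFileForExtraction.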
325 | """ 326 | fname = sys._getframe().f_code.co_name 327 | logging.info("%s: Extracting files for img: %s, with config_file: %s ",\ 328 | fname, image, config_file) 329 | jobs = [] 330 | 331 | config = ConfigObj(config_file) 332 | exc_fmt_list = self.bnGetExFmtsFromConfigFile(config_file) 333 | 334 | file_extract_dir = self.bnGetConfigInfo(config_file, \ 335 | "confset_section", "file_staging_directory") 336 | 337 | 338 | disk_image_dir = self.bnGetConfigInfo(config_file, \ 339 | "confset_section", "disk_image_dir") 340 | 341 | image_path = os.getcwd() + "/" + disk_image_dir + "/" + image 342 | 343 | file_extract_dir_path = os.getcwd() + '/'+ file_extract_dir 344 | logging.info("%s: File Extracxtion directory: %s ", \ 345 | fname, file_extract_dir_path) 346 | 347 | print "\n>> Files will be extracted in ", file_extract_dir_path 348 | 349 | cmd = "mkdir " + file_extract_dir 350 | if not os.path.exists(file_extract_dir): 351 | subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT) 352 | file_extract_dir_per_image = file_extract_dir + '/' + str(image_index) 353 | cmd = "mkdir " + file_extract_dir_per_image 354 | if not os.path.exists(file_extract_dir_per_image): 355 | subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT) 356 | 357 | self.num_partitions = \ 358 | self.bnlpGetPartInfoForImage(image_path, image_index) 359 | 360 | ''' 361 | # Call Dfvfs method to generate the file-list in the image 362 | self.num_partitions = self.bnDfvfsGenerateFileList(image_path) 363 | partition_in[image] = self.num_partitions 364 | ''' 365 | 366 | logging.info("%s: # partitions:%s Generating filelist ", fname, \ 367 | self.num_partitions) 368 | 369 | logging.info("%s: Generated filelist. Extract contents", fname) 370 | 371 | file_entry_lister = FileEntryLister() 372 | output_path = file_extract_dir_per_image 373 | 374 | for p in range(0, self.num_partitions): 375 | base_path_spec = file_entry_lister.GetBasePathSpec(image_path, True) 376 | logging.info("%s: Extracting contents from part p = %s", fname, p) 377 | self.bnQueueFileForExtraction(base_path_spec, image_path, output_path, jobs) 378 | 379 | def isFileTextractable(self, filename, config_file): 380 | """ Not all files are extractable as text file. Before extracting 381 | a file, it should pass this test. 382 | Args: 383 | filename: Input file 384 | config_file: Name of the config file 385 | """ 386 | logging.info("isTextratable: filename: %s ", filename) 387 | 388 | if (filename.endswith('.txt') or filename.endswith('.TXT') or \ 389 | filename.endswith('.pdf') or filename.endswith('.PDF') or \ 390 | filename.endswith('.xml') or \ 391 | filename.endswith('.XML') or \ 392 | filename.endswith('.doc') or filename.endswith('.DOC') or \ 393 | filename.endswith('.htm') or filename.endswith('.HTM;1') or \ 394 | filename.endswith('.html') or filename.endswith('.HTML') or \ 395 | filename.endswith('.jpg') or filename.endswith('.JPG') ): 396 | 397 | # if any of the above types are configured to be exluded, 398 | # filter them out. 399 | fn, fe = os.path.splitext(filename) 400 | exc_fmt_list = self.bnGetExFmtsFromConfigFile(config_file) 401 | logging.info("isFileTextratable:file:%s, exc_fmt_list: %s", \ 402 | filename, exc_fmt_list) 403 | if fe in exc_fmt_list: 404 | logging.info("isTextraxtable:File %s configured \ 405 | to be excluded",filename) 406 | return False 407 | return True 408 | else: 409 | return False 410 | 411 | def bnIsEntityInfoSetInConfig(self, config_file): 412 | """ Filter for some legacy code. 
            FIXME: Will be removed from here eventually
        """
        entity_info = self.bnGetConfigInfo(\
                      config_file, "confset_section", "entity_info")
        if entity_info == "Yes":
            return True
        else:
            return False

    def bnlpDnldFile(self, inode, fs, filepath):
        """ Extracts the contents of a given file.
            Args:
                inode: Inode of the given file
                fs: Filesystem info
                filepath: Destination path the contents are written to
        """
        logging.info("bnlpDnldFile: file_path:%s, inode:%d", filepath, inode)
        try:
            f = fs.open_meta(inode=inode)
        except:
            logging.info("fs.open_meta failed for file %s ", filepath)
            return

        # Read data and store it in a string
        offset = 0
        size = f.info.meta.size
        BUFF_SIZE = 1024 * 1024

        total_data = ""
        while offset < size:
            available_to_read = min(BUFF_SIZE, size - offset)
            data = f.read_random(offset, available_to_read)
            if not data:
                # print("Done with reading")
                break

            offset += len(data)
            total_data += data
            logging.info("bnlpDnldFile: D2: length of total_data: %s ", \
                         str(len(total_data)))

        logging.info("bnlpDnldFile: D2: Dumping the contents to filepath %s ",\
                     filepath)

        try:
            with open(filepath, "w") as text_file:
                text_file.write(total_data)
        except IOError, e:
            print("Opening the file {} failed with error {}".format(filepath, e))
            return

        ## print ("D2: Time to index the file ", filepath)
        basepath = os.path.dirname(filepath)

    def bnGetConfigInfo(self, config_file, section_name, cfg_string):
        """ Given the key, extract info from the config file

            Args:
                config_file: Configuration filename
                section_name: Name of the section within the config file
                cfg_string: What we are looking for - the key
        """
        config = ConfigObj(config_file)
        section = config[section_name]
        for key in section:
            if key == cfg_string:
                # found the string
                return section[key]
        else:
            print "bnGetConfigInfo: Key not found in section ", section_name

    def bnGetOutDirFromConfig(self, config_file):
        config = ConfigObj(config_file)

        file_extract_dir = "file_staging_directory"
        config_section = config['confset_section']
        for key in config_section:
            #print (key, config_section[key])
            if key == "file_staging_directory":
                file_extract_dir = config_section[key]
                return file_extract_dir
        else:
            print("file_staging_directory not in config file - using default\n")
            return None

    def bnlpGetNumPartsForImage(self, image_path, image_index):
        img = bn_getimginfo(image_path)

        # pytsk3.Volume_Info works only with file systems which have a
        # partition table defined. For file systems like FAT12, with no
        # partition info, we handle that case via an exception.
        try:
            volume = pytsk3.Volume_Info(img)
        except:
            logging.info(">> Volume Info failed. Could be FAT12 ")
            self.num_partitions = 1
            return (self.num_partitions)

        # Reset the count before scanning the volume for partitions.
        self.num_partitions = 0
        for part in volume:
            if part.slot_num >= 0:
                try:
                    fs = pytsk3.FS_Info(img, offset=(part.start * 512))
                except:
                    logging.info(">> Exception in pytsk3.FS_Info in prtn:%s ",
                                 self.num_partitions)
                    continue
                self.num_partitions += 1
        return (self.num_partitions)

    def bnlpGetPartInfoForImage(self, image_path, image_index):
        img = bn_getimginfo(image_path)
        is_partition_info = False

        # pytsk3.Volume_Info works only with file systems which have a
        # partition table defined. For file systems like FAT12, with no
        # partition info, we handle that case via an exception.
        try:
            volume = pytsk3.Volume_Info(img)
            is_partition_info = True
        except:
            ## print "bnlpGetPartInfoForImage: Volume Info failed.
            ## Could be FAT12 "
            self.num_partitions = 1
            is_partition_info = False
            fs = pytsk3.FS_Info(img, offset=0)

            ## print "D: File System Type Detected ", fs.info.ftype
            if fs.info.ftype == pytsk3.TSK_FS_TYPE_FAT12:
                fs_desc = "FAT12 file system"
            elif fs.info.ftype == pytsk3.TSK_FS_TYPE_ISO9660_DETECT:
                fs_desc = "ISO file system"
            else:
                fs_desc = "Unknown file system"

            self.partDictList.append([])
            # First level files and directories off the root
            # returns file_list for the root directory
            file_list_root = self.bnlpListFiles(fs, "/", image_index, 0)
            image_name = os.path.basename(image_path)
            self.num_partitions_ofimg[image_name] = self.num_partitions

            # Populate the partDictList for the image.
            self.partDictList[image_index].append({self.part_array[0]:image_path, \
                                                   self.part_array[1]:0, \
                                                   self.part_array[2]:0, \
                                                   self.part_array[3]:0, \
                                                   self.part_array[4]:fs_desc })
            return self.num_partitions

        # For images with partition_info, we continue here.
        self.partDictList.append([])

        self.num_partitions = 0
        for part in volume:
            # The slot_num field of volume object has a value of -1
            # for non-partition entries - like Unallocated partition
            # and Primary and extended tables. So we will look for this
            # field to be >= 0 to count partitions with valid file systems
            if part.slot_num >= 0:
                # Add the entry to the list of dictionaries, partDictList.
                # The list will have one dictionary per partition. The image
                # name is added as the first element of each partition to
                # avoid a two-dimensional list.
                ## print "D: image_path: ", image_path
                ## print "D: part_addr: ", part.addr
                ## print "D: part_slot_num: ", part.slot_num
                ## print "D: part_start_offset: ", part.start
                ## print "D: part_description: ", part.desc
                # Open the file system for this image at the extracted
                # start_offset.
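                # part.start is expressed in 512-byte sectors, so the byte
                # offset passed to pytsk3.FS_Info below is (part.start * 512).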
                try:
                    fs = pytsk3.FS_Info(img, offset=(part.start * 512))
                except:
                    logging.info("Exception in pytsk3.FS_Info for prtn:%s",
                                 self.num_partitions)
                    continue

                self.partDictList[image_index].append({self.part_array[0]:image_path, \
                                                       self.part_array[1]:part.addr, \
                                                       self.part_array[2]:part.slot_num, \
                                                       self.part_array[3]:part.start, \
                                                       self.part_array[4]:part.desc })

                self.num_partitions += 1

                # First level files and directories off the root
                # returns file_list for the root directory
                file_list_root = self.bnlpListFiles(fs, "/", image_index, part.slot_num)
                ## print(file_list_root)

        image_name = os.path.basename(image_path)
        self.num_partitions_ofimg[image_name] = self.num_partitions
        logging.info("Number of Partitions for image %s = %s",
                     image_name, self.num_partitions)
        return (self.num_partitions)

    bnlpFileInfo = ['name', 'size', 'mode', 'inode', 'p_inode', 'mtime', \
                    'atime', 'ctime', 'isdir', 'deleted', 'name_slug']

    def bnlpListFiles(self, fs, path, image_index, partition_num):
        file_list = []
        try:
            directory = fs.open_dir(path=path)
        except:
            print "Error in opening file path {} ".format(path)
            return None

        i = 0
        for f in directory:
            is_dir = False
            '''
            print("Func:bnlpListFiles:root_path:{} size: {} inode: {} \
                  par inode: {} mode: {} type: {} ".format(f.info.name.name,\
                  f.info.meta.size, f.info.meta.addr, f.info.name.meta_addr,\
                  f.info.name.par_addr, f.info.meta.mode, f.info.meta.type))
            '''
            # Some files may not have the metadata information. So
            # access it only if it exists.
            if f.info.meta is not None:
                # A meta type of 2 (pytsk3.TSK_FS_META_TYPE_DIR) marks a
                # directory entry.
                if f.info.meta.type == 2:
                    is_dir = True

                # Since we are displaying the modified time for the file,
                # convert the mtime to isoformat to be passed in file_list.
                ## d = date.fromtimestamp(f.info.meta.mtime)
                ## mtime = d.isoformat()
                mtime = time.strftime("%FT%TZ", time.gmtime(f.info.meta.mtime))

                # Bit 0 (pytsk3.TSK_FS_META_FLAG_ALLOC) unset means the entry
                # is no longer allocated, i.e. the file was deleted.
                if (int(f.info.meta.flags) & 0x01 == 0):
                    deleted = "Yes"
                else:
                    deleted = "No"

                # NOTE: A new item "name_slug" is added to those file names which
                # have a space. The space is replaced by %20 and saved as name_slug.
                # This is used later when a file with a "non-None" name_slug shows
                # up at the route. It is recognized as a filename with spaces and
                # using the inode comparison, its real name is extracted before
                # downloading the file.
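                # For example, "my file.txt" would get the name_slug
                # "my%20file.txt", while names without spaces keep the
                # default name_slug of "None".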
                name_slug = "None"
                if " " in f.info.name.name:
                    name_slug = f.info.name.name.replace(" ", "%20")
                file_list.append({self.bnlpFileInfo[0]:f.info.name.name, \
                                  self.bnlpFileInfo[1]:f.info.meta.size, \
                                  self.bnlpFileInfo[2]:f.info.meta.mode, \
                                  self.bnlpFileInfo[3]:f.info.meta.addr, \
                                  self.bnlpFileInfo[4]:f.info.name.par_addr, \
                                  self.bnlpFileInfo[5]:mtime, \
                                  self.bnlpFileInfo[6]:f.info.meta.atime, \
                                  self.bnlpFileInfo[7]:f.info.meta.ctime, \
                                  self.bnlpFileInfo[8]:is_dir, \
                                  self.bnlpFileInfo[9]:deleted, \
                                  self.bnlpFileInfo[10]:name_slug })

        ##print("Func:bnlpListFiles: Listing Directory for PATH: ", path)
        ##print file_list
        ##print "\n\n"
        return file_list
--------------------------------------------------------------------------------
/config.txt:
--------------------------------------------------------------------------------
#
# bitcurator-nlp-gentm config file
#

# Disk images to process (the default location can be changed in the following section)
[image_section]
govdocs45sampler.E01 = 1

# Configuration settings. Where to find disk images and store intermediary files.
[confset_section]
disk_image_dir = "disk_images"
file_staging_directory = "extracted_files"
nlp_dir = "bcnlp"
spacy_outfile = "spacy_outfile"
entity_info = "No"
num_iterations = 200
exclude_words = "excludeme", "andme"

# Formats to exclude when extracting text using textract
[exclude_format_section]
.jpg=1
.JPG=1
.mp3=1
.wav=1
--------------------------------------------------------------------------------
/disk_images/fourpartusb1.E01:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BitCurator/bitcurator-nlp-gentm/f75b4908862b5280949b783d409d43dd59034e49/disk_images/fourpartusb1.E01
--------------------------------------------------------------------------------
/disk_images/govdocs45sampler.E01:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BitCurator/bitcurator-nlp-gentm/f75b4908862b5280949b783d409d43dd59034e49/disk_images/govdocs45sampler.E01
--------------------------------------------------------------------------------
/externals/README.md:
--------------------------------------------------------------------------------
# Support libraries

This project uses libewf-20140608 to maintain compatibility with The Sleuth Kit. The source is included here as the upstream developer(s) keep moving it around.

## Documentation

Additional project information can be found on the BitCurator NLP wiki at https://github.com/BitCurator/bitcurator-nlp/wiki.

## License(s)

The BitCurator logo, BitCurator project documentation, and other non-software products of the BitCurator team are subject to the Creative Commons Attribution 4.0 International license (CC BY 4.0).

Unless otherwise indicated, software items in this repository are distributed under the terms of the GNU Lesser General Public License, Version 3. See the text file "COPYING" for further details about the terms of this license.

In addition to software produced by the BitCurator team, BitCurator packages and modifies open source software produced by other developers. Licenses and attributions are retained here where applicable.
--------------------------------------------------------------------------------
/externals/libewf-20140608.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BitCurator/bitcurator-nlp-gentm/f75b4908862b5280949b783d409d43dd59034e49/externals/libewf-20140608.tar.gz
--------------------------------------------------------------------------------
/externals/libuna-alpha-20150927.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BitCurator/bitcurator-nlp-gentm/f75b4908862b5280949b783d409d43dd59034e49/externals/libuna-alpha-20150927.tar.gz
--------------------------------------------------------------------------------
/setup.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash

#
# setup.sh: Build and configuration script for nlp-webtools
#
# This script sets up a correctly configured environment for the topic modeling tool.
# It should only be run once prior to running "python bcnlp_tm.py" for the first
# time.
#

LOG_BASE=/tmp

# Output colors used by the echo helpers below. NOTE: these variables were
# not defined in the original script; tput is assumed to be available.
GC=$(tput setaf 2)   # green, for status messages
RC=$(tput setaf 1)   # red, for error messages
EC=$(tput sgr0)      # reset

#--- FUNCTION -----------------------------------------------------------------
# NAME: echoinfo
# DESCRIPTION: Echo information to stdout.
#------------------------------------------------------------------------------
echoinfo() {
    printf "%s * STATUS%s: %s\n" "${GC}" "${EC}" "$@";
}

#--- FUNCTION -----------------------------------------------------------------
# NAME: echoerror
# DESCRIPTION: Echo errors to stderr.
#------------------------------------------------------------------------------
echoerror() {
    printf "%s * ERROR%s: %s\n" "${RC}" "${EC}" "$@" 1>&2;
}

#--- FUNCTION -----------------------------------------------------------------
# NAME: __apt_get_install_noinput
# DESCRIPTION: (DRY) apt-get install with noinput options
#------------------------------------------------------------------------------
__apt_get_install_noinput() {
    apt-get install -y -o DPkg::Options::=--force-confold "$@"; return $?
    #yes | aptdcon --hide-terminal --install "$@"; return $?
}

#--- FUNCTION -----------------------------------------------------------------
# NAME: __apt_get_upgrade_noinput
# DESCRIPTION: (DRY) apt-get upgrade with noinput options. NOTE: this helper
#              is called below but was never defined in the original script;
#              this is an assumed implementation.
#------------------------------------------------------------------------------
__apt_get_upgrade_noinput() {
    apt-get upgrade -y -o DPkg::Options::=--force-confold; return $?
}

#--- FUNCTION -----------------------------------------------------------------
# NAME: __enable_universe_repository
# DESCRIPTION: Enable the Ubuntu universe repository. NOTE: this helper is
#              called below but was never defined in the original script;
#              this is an assumed implementation.
#------------------------------------------------------------------------------
__enable_universe_repository() {
    add-apt-repository -y universe; return $?
}

#--- FUNCTION -----------------------------------------------------------------
# NAME: __pip_install_noinput
# DESCRIPTION: (DRY) pip install/upgrade without prompting
#------------------------------------------------------------------------------
__pip_install_noinput() {
    # Uncomment for Python 3
    #pip3 install --upgrade "$@"; return $?
    pip2 install --upgrade "$@"; return $?
}

#--- FUNCTION -----------------------------------------------------------------
# NAME: __pip_pre_install_noinput
# DESCRIPTION: (DRY) pip install/upgrade including pre-release versions
#------------------------------------------------------------------------------
__pip_pre_install_noinput() {
    # Uncomment for Python 3
    #pip3 install --pre --upgrade "$@"; return $?
    pip2 install --pre --upgrade "$@"; return $?
}

install_ubuntu_deps() {

    echoinfo "Updating your APT Repositories ... "
    apt-get update >> $LOG_BASE/nlp-install.log 2>&1 || return 1

    echoinfo "Installing Python Software Properties ... "
    __apt_get_install_noinput software-properties-common >> $LOG_BASE/nlp-install.log 2>&1 || return 1

    echoinfo "Enabling Universe Repository ... "
" 68 | __enable_universe_repository >> $LOG_BASE/nlp-install.log 2>&1 || return 1 69 | 70 | echoinfo "Updating Repository Package List ..." 71 | apt-get update >> $LOG_BASE/nlp-install.log 2>&1 || return 1 72 | 73 | echoinfo "Upgrading all packages to latest version ..." 74 | __apt_get_upgrade_noinput >> $LOG_BASE/nlp-install.log 2>&1 || return 1 75 | 76 | return 0 77 | } 78 | 79 | install_ubuntu_packages() { 80 | packages="antiword 81 | automake 82 | curl 83 | dkms 84 | ffmpeg 85 | flac 86 | g++-5 87 | gcc-5 88 | lame 89 | libffi-dev 90 | libjpeg-dev 91 | liblzma-dev 92 | libmad0 93 | libpulse-dev 94 | libsox-fmt-mp3 95 | libtool 96 | libxml2-dev 97 | libxslt1-dev 98 | lzma 99 | poppler-utils 100 | pstotext 101 | python 102 | python-dev 103 | python-pip 104 | python3-dev 105 | python3-pip 106 | sox 107 | swig 108 | swig3.0 109 | tesseract-ocr 110 | unrtf 111 | virtualbox-guest-utils 112 | virtualenv 113 | virtualenvwrapper 114 | zlib1g-dev" 115 | 116 | if [ "$@" = "dev" ]; then 117 | packages="$packages" 118 | elif [ "$@" = "stable" ]; then 119 | packages="$packages" 120 | fi 121 | 122 | for PACKAGE in $packages; do 123 | __apt_get_install_noinput $PACKAGE >> $LOG_BASE/nlp-install.log 2>&1 124 | ERROR=$? 125 | if [ $ERROR -ne 0 ]; then 126 | echoerror "Install Failure: $PACKAGE (Error Code: $ERROR)" 127 | else 128 | echoinfo "Installed Package: $PACKAGE" 129 | fi 130 | done 131 | 132 | return 0 133 | } 134 | 135 | install_ubuntu_pip_packages() { 136 | 137 | pip_packages="textract 138 | gensim 139 | pyLDAvis 140 | stop_words 141 | configobj" 142 | pip_special_packages="textacy" 143 | 144 | if [ "$@" = "dev" ]; then 145 | pip_packages="$pip_packages" 146 | elif [ "$@" = "stable" ]; then 147 | pip_packages="$pip_packages" 148 | fi 149 | 150 | ERROR=0 151 | 152 | for PACKAGE in $pip_packages; do 153 | CURRENT_ERROR=0 154 | echoinfo "Installed Python Package: $PACKAGE" 155 | __pip_install_noinput $PACKAGE >> $LOG_BASE/nlp-install.log 2>&1 || (let ERROR=ERROR+1 && let CURRENT_ERROR=1) 156 | if [ $CURRENT_ERROR -eq 1 ]; then 157 | echoerror "Python Package Install Failure: $PACKAGE" 158 | fi 159 | done 160 | 161 | # Prep environment for special packages, install cld2-cffi 162 | #env CC=/usr/bin/gcc-5 pip3 install -U cld2-cffi 163 | env CC=/usr/bin/gcc-5 pip install -U cld2-cffi 164 | 165 | for PACKAGE in $pip_special_packages; do 166 | CURRENT_ERROR=0 167 | echoinfo "Installed Python (special setup) Package: $PACKAGE" 168 | __pip_pre_install_noinput $PACKAGE >> $LOG_BASE/nlp-install.log 2>&1 || (let ERROR=ERROR+1 && let CURRENT_ERROR=1) 169 | if [ $CURRENT_ERROR -eq 1 ]; then 170 | echoerror "Python Package Install Failure: $PACKAGE" 171 | fi 172 | done 173 | 174 | if [ $ERROR -ne 0 ]; then 175 | echoerror 176 | return 1 177 | fi 178 | 179 | return 0 180 | } 181 | 182 | install_source_packages() { 183 | 184 | # Install libuna from specific release 185 | echoinfo "nlp-webtools: Building and installing libuna" 186 | CDIR=$(pwd) 187 | # Newer versions break a lot of stuff. Keep 20150927 for now. 188 | # wget -q https://github.com/libyal/libuna/releases/download/20170112/libuna-alpha-20170112.tar.gz 189 | cd /tmp 190 | cp /$HOME/bitcurator-nlp-gentm/externals/libuna-alpha-20150927.tar.gz . 
    tar zxf libuna-alpha-20150927.tar.gz >> $HOME/nlp-install.log 2>&1
    cd libuna-20150927
    ./configure >> $HOME/nlp-install.log 2>&1
    make -s >> $HOME/nlp-install.log 2>&1
    make install >> $HOME/nlp-install.log 2>&1
    ldconfig >> $HOME/nlp-install.log 2>&1

    # Now clean up
    cd /tmp
    rm -rf libuna-20150927
    rm libuna-alpha-20150927.tar.gz

    # Install libewf from current sources
    echoinfo "nlp-webtools: Building and installing libewf"
    CDIR=$(pwd)

    # Newer versions break a lot of stuff. Keep 20140608 for now.
    cd /tmp
    cp $HOME/bitcurator-nlp-gentm/externals/libewf-20140608.tar.gz .
    tar zxf libewf-20140608.tar.gz >> $HOME/nlp-install.log 2>&1
    cd libewf-20140608
    ./configure --enable-python --enable-v1-api >> $HOME/nlp-install.log 2>&1
    make -s >> $HOME/nlp-install.log 2>&1
    make install >> $HOME/nlp-install.log 2>&1
    ldconfig >> $HOME/nlp-install.log 2>&1

    # Now clean up
    cd /tmp
    rm -rf libewf-20140608
    rm libewf-20140608.tar.gz

    echoinfo "nlp-webtools: Adding DFXML tools and libraries"
    CDIR=$(pwd)
    git clone https://github.com/simsong/dfxml /usr/share/dfxml >> $HOME/nlp-install.log 2>&1
    # No cleanup needed
    cd /tmp

    # Install The Sleuth Kit (TSK) from current sources
    echoinfo "nlp-webtools: Building and installing The Sleuth Kit"
    CDIR=$(pwd)
    git clone --recursive https://github.com/sleuthkit/sleuthkit /usr/share/sleuthkit >> $HOME/nlp-install.log 2>&1
    cd /usr/share/sleuthkit
    git fetch
    git checkout master >> $HOME/nlp-install.log 2>&1
    ./bootstrap >> $HOME/nlp-install.log 2>&1
    ./configure >> $HOME/nlp-install.log 2>&1
    make -s >> $HOME/nlp-install.log 2>&1
    make install >> $HOME/nlp-install.log 2>&1
    ldconfig >> $HOME/nlp-install.log 2>&1

    # Install PyTSK
    echoinfo "nlp-webtools: Building and installing PyTSK (Python bindings for TSK)"
    echoinfo " -- Please be patient. This may take several minutes..."
    CDIR=$(pwd)
    cd /tmp
    git clone https://github.com/py4n6/pytsk
    cd pytsk
    python setup.py update >> $HOME/nlp-install.log 2>&1
    python setup.py build >> $HOME/nlp-install.log 2>&1
    python setup.py install >> $HOME/nlp-install.log 2>&1
    # Now clean up
    cd /tmp
    #rm -rf pytsk3-20170508
    rm -rf pytsk

}

complete_message() {
    echo
    echo "Installation Complete!"
    echo
}

echo "Installing core dependencies...."
install_ubuntu_deps

echo "Installing Ubuntu packages...."
install_ubuntu_packages stable

echo "Installing pip packages...."
install_ubuntu_pip_packages stable

echo "Installing source packages...."
install_source_packages

# echo "current directory1: ${PWD} "
echo "Installing textract support packages..."
sudo apt-get install -y libxml2-dev libxslt1-dev antiword unrtf poppler-utils pstotext tesseract-ocr flac ffmpeg lame libmad0 libsox-fmt-mp3 libpulse-dev sox swig swig3.0 libjpeg-dev zlib1g-dev


# echo "current directory2: ${PWD} "
echo "Installing textract..."
sudo pip install textract

# No longer using graphlab
#echo "Installing graphlab..."
#sudo pip install --upgrade --no-cache-dir https://get.graphlab.com/GraphLab-Create/2.1/[user_email]/[license_key]/GraphLab-Create-License.tar.gz

echo "Installing configobj..."
pip install configobj

echo "Installing gensim..."
pip install gensim

echo "Installing pyLDAvis..."
pip install pyLDAvis

# The following are needed for bn_plot
pip install matplotlib
pip install spacy
python -m spacy download en

echo "Installing dfvfs..."
curl -O https://raw.githubusercontent.com/log2timeline/dfvfs/master/requirements.txt
pip install -r requirements.txt
pip install dfvfs

complete_message
--------------------------------------------------------------------------------