├── .gitignore
├── LICENSE
├── README.md
├── attic
│   ├── Vagrantfile
│   ├── bn_plot.py
│   ├── old-config.txt
│   ├── old_graphlab_ref_code
│   │   └── bcnlp_tm.py
│   └── provision
│       └── bootstrap.sh
├── bcnlp_fxtract.py
├── bcnlp_listfiles.py
├── bcnlp_tm.py
├── bn_filextract.py
├── config.txt
├── disk_images
│   ├── fourpartusb1.E01
│   └── govdocs45sampler.E01
├── externals
│   ├── README.md
│   ├── libewf-20140608.tar.gz
│   └── libuna-alpha-20150927.tar.gz
└── setup.sh
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 |
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 |
60 | # Scrapy stuff:
61 | .scrapy
62 |
63 | # Sphinx documentation
64 | docs/_build/
65 |
66 | # PyBuilder
67 | target/
68 |
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 |
72 | # pyenv
73 | .python-version
74 |
75 | # celery beat schedule file
76 | celerybeat-schedule
77 |
78 | # dotenv
79 | .env
80 |
81 | # virtualenv
82 | venv/
83 | ENV/
84 |
85 | # Spyder project settings
86 | .spyderproject
87 |
88 | # Rope project settings
89 | .ropeproject
90 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU LESSER GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 |  Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 |
9 | This version of the GNU Lesser General Public License incorporates
10 | the terms and conditions of version 3 of the GNU General Public
11 | License, supplemented by the additional permissions listed below.
12 |
13 | 0. Additional Definitions.
14 |
15 | As used herein, "this License" refers to version 3 of the GNU Lesser
16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU
17 | General Public License.
18 |
19 | "The Library" refers to a covered work governed by this License,
20 | other than an Application or a Combined Work as defined below.
21 |
22 | An "Application" is any work that makes use of an interface provided
23 | by the Library, but which is not otherwise based on the Library.
24 | Defining a subclass of a class defined by the Library is deemed a mode
25 | of using an interface provided by the Library.
26 |
27 | A "Combined Work" is a work produced by combining or linking an
28 | Application with the Library. The particular version of the Library
29 | with which the Combined Work was made is also called the "Linked
30 | Version".
31 |
32 | The "Minimal Corresponding Source" for a Combined Work means the
33 | Corresponding Source for the Combined Work, excluding any source code
34 | for portions of the Combined Work that, considered in isolation, are
35 | based on the Application, and not on the Linked Version.
36 |
37 | The "Corresponding Application Code" for a Combined Work means the
38 | object code and/or source code for the Application, including any data
39 | and utility programs needed for reproducing the Combined Work from the
40 | Application, but excluding the System Libraries of the Combined Work.
41 |
42 | 1. Exception to Section 3 of the GNU GPL.
43 |
44 | You may convey a covered work under sections 3 and 4 of this License
45 | without being bound by section 3 of the GNU GPL.
46 |
47 | 2. Conveying Modified Versions.
48 |
49 | If you modify a copy of the Library, and, in your modifications, a
50 | facility refers to a function or data to be supplied by an Application
51 | that uses the facility (other than as an argument passed when the
52 | facility is invoked), then you may convey a copy of the modified
53 | version:
54 |
55 | a) under this License, provided that you make a good faith effort to
56 | ensure that, in the event an Application does not supply the
57 | function or data, the facility still operates, and performs
58 | whatever part of its purpose remains meaningful, or
59 |
60 | b) under the GNU GPL, with none of the additional permissions of
61 | this License applicable to that copy.
62 |
63 | 3. Object Code Incorporating Material from Library Header Files.
64 |
65 | The object code form of an Application may incorporate material from
66 | a header file that is part of the Library. You may convey such object
67 | code under terms of your choice, provided that, if the incorporated
68 | material is not limited to numerical parameters, data structure
69 | layouts and accessors, or small macros, inline functions and templates
70 | (ten or fewer lines in length), you do both of the following:
71 |
72 | a) Give prominent notice with each copy of the object code that the
73 | Library is used in it and that the Library and its use are
74 | covered by this License.
75 |
76 | b) Accompany the object code with a copy of the GNU GPL and this license
77 | document.
78 |
79 | 4. Combined Works.
80 |
81 | You may convey a Combined Work under terms of your choice that,
82 | taken together, effectively do not restrict modification of the
83 | portions of the Library contained in the Combined Work and reverse
84 | engineering for debugging such modifications, if you also do each of
85 | the following:
86 |
87 | a) Give prominent notice with each copy of the Combined Work that
88 | the Library is used in it and that the Library and its use are
89 | covered by this License.
90 |
91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license
92 | document.
93 |
94 | c) For a Combined Work that displays copyright notices during
95 | execution, include the copyright notice for the Library among
96 | these notices, as well as a reference directing the user to the
97 | copies of the GNU GPL and this license document.
98 |
99 | d) Do one of the following:
100 |
101 | 0) Convey the Minimal Corresponding Source under the terms of this
102 | License, and the Corresponding Application Code in a form
103 | suitable for, and under terms that permit, the user to
104 | recombine or relink the Application with a modified version of
105 | the Linked Version to produce a modified Combined Work, in the
106 | manner specified by section 6 of the GNU GPL for conveying
107 | Corresponding Source.
108 |
109 | 1) Use a suitable shared library mechanism for linking with the
110 | Library. A suitable mechanism is one that (a) uses at run time
111 | a copy of the Library already present on the user's computer
112 | system, and (b) will operate properly with a modified version
113 | of the Library that is interface-compatible with the Linked
114 | Version.
115 |
116 | e) Provide Installation Information, but only if you would otherwise
117 | be required to provide such information under section 6 of the
118 | GNU GPL, and only to the extent that such information is
119 | necessary to install and execute a modified version of the
120 | Combined Work produced by recombining or relinking the
121 | Application with a modified version of the Linked Version. (If
122 | you use option 4d0, the Installation Information must accompany
123 | the Minimal Corresponding Source and Corresponding Application
124 | Code. If you use option 4d1, you must provide the Installation
125 | Information in the manner specified by section 6 of the GNU GPL
126 | for conveying Corresponding Source.)
127 |
128 | 5. Combined Libraries.
129 |
130 | You may place library facilities that are a work based on the
131 | Library side by side in a single library together with other library
132 | facilities that are not Applications and are not covered by this
133 | License, and convey such a combined library under terms of your
134 | choice, if you do both of the following:
135 |
136 | a) Accompany the combined library with a copy of the same work based
137 | on the Library, uncombined with any other library facilities,
138 | conveyed under the terms of this License.
139 |
140 | b) Give prominent notice with the combined library that part of it
141 | is a work based on the Library, and explaining where to find the
142 | accompanying uncombined form of the same work.
143 |
144 | 6. Revised Versions of the GNU Lesser General Public License.
145 |
146 | The Free Software Foundation may publish revised and/or new versions
147 | of the GNU Lesser General Public License from time to time. Such new
148 | versions will be similar in spirit to the present version, but may
149 | differ in detail to address new problems or concerns.
150 |
151 | Each version is given a distinguishing version number. If the
152 | Library as you received it specifies that a certain numbered version
153 | of the GNU Lesser General Public License "or any later version"
154 | applies to it, you have the option of following the terms and
155 | conditions either of that published version or of any later version
156 | published by the Free Software Foundation. If the Library as you
157 | received it does not specify a version number of the GNU Lesser
158 | General Public License, you may choose any version of the GNU Lesser
159 | General Public License ever published by the Free Software Foundation.
160 |
161 | If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
166 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 |
3 | # bitcurator-nlp-gentm
4 |
5 | [](https://github.com/bitcurator/bitcurator-nlp-gentm/issues)
6 | [](https://github.com/bitcurator/bitcurator-nlp-gentm/network)
7 | [](https://twitter.com/bitcurator)
8 |
9 | # END-OF-LIFE (EOL) NOTICE
10 |
11 | This research software has reached end-of-life. The code in this repository is no longer actively maintained or supported.
12 |
13 | ## About
14 |
15 | Generate topic models using open text automatically extracted from various file formats in disk images. This project uses The Sleuth Kit (https://github.com/sleuthkit/sleuthkit) to parse file systems in disk images, textract (https://textract.readthedocs.io/en/stable/) to extract text from common file formats, gensim (https://radimrehurek.com/gensim/) to generate topic models, and pyLDAvis (https://github.com/bmabey/pyLDAvis) for visualization, as sketched below.
16 |
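For orientation, the modeling and visualization stage of this pipeline boils down to a few gensim and pyLDAvis calls. The following is a minimal sketch, not the project's actual code (see **bcnlp_tm.py** for that); the toy documents stand in for text that the real tool extracts from disk images with TSK and textract.

```python
# Minimal sketch of the gensim + pyLDAvis stage of the pipeline.
# Assumes: pip install gensim pyldavis
from gensim import corpora, models
import pyLDAvis
import pyLDAvis.gensim_models  # named pyLDAvis.gensim in older releases

# Placeholder documents; the real tool feeds in text extracted from disk images.
documents = ["disk image forensics and file systems",
             "topic models group words that occur together",
             "file systems store files on disk images"]

# Tokenize and build a bag-of-words corpus.
texts = [doc.lower().split() for doc in documents]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Train a small LDA model and open the interactive visualization.
lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=2)
vis = pyLDAvis.gensim_models.prepare(lda, corpus, dictionary)
pyLDAvis.show(vis)  # serves the visualization and opens a browser window
```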
17 | ## Setup and Installation
18 |
19 | The topic model generation tool depends on a number of external natural language processing and digital forensics libraries. For convenience, we have included a script that will install all of the required dependencies on Ubuntu 18.04 LTS. This script will install certain tools (TSK, libewf, and several others) by compiling and installing them from source.
20 |
21 | On an Ubuntu host or in a clean virtual machine, first make sure you have git installed:
22 |
23 | * Open a terminal and install git using apt:
24 | ```shell
25 | $ sudo apt-get install git
26 | ```
27 |
28 | Next, follow these steps:
29 |
30 | * Clone this repository:
31 | ```shell
32 | $ git clone https://github.com/bitcurator/bitcurator-nlp-gentm
33 | ```
34 |
35 | * Change directory into the repository:
36 | ```shell
37 | $ cd bitcurator-nlp-gentm
38 | ```
39 |
40 | * Run the setup shell script to install and configure the required software (various dependencies, TSK, textract, and gensim). Note that this may take some time (**typically 10-15 minutes**).
41 | ```shell
42 | $ sudo ./setup.sh
43 | ```
44 |
45 | ## Disk Image Selection and Configuration
46 |
47 | This repository includes a sample Expert Witness Format disk image (**govdocs45sampler.E01**) in the **disk_images** directory. If you do not make any changes to the configuration file, the topic modeler and visualization tool will be run on text extracted from files discovered in this image.
48 |
49 | To run the tool against other disk images (EWF or raw), simply copy those images into the **disk_images** directory and edit the **[image_section]** of the configuration file (**config.txt**) to include the relevant files. For example, if you had two images named **testimage1.E01** and **testimage2.dd**, the section would be modified as follows:
50 |
51 | ```shell
52 | # Disk images to process (the default location can be changed in the following section)
53 | [image_section]
54 | testimage1.E01 = 1
55 | testimage2.dd = 1
56 | ```
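
The comment above notes that the default image location can be changed; that is done in the **[confset_section]** of the same configuration file. A minimal sketch, assuming keys matching those in the archived **attic/old-config.txt** (verify against your **config.txt** before relying on them):

```shell
[confset_section]
disk_image_dir = "disk_images"
file_staging_directory = "filextract_dir"
```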
57 |
58 | ## Running the Tool
59 |
60 | Run the following command to extract text from the configured file types, start the topic modeling tool, and load the results into a browser window.
61 |
62 | ```shell
63 | $ python bcnlp_tm.py
64 | ```
65 |
66 | * Depending on the size of your corpus, this may take some time. You will see a range of log output and (possibly) deprecation warnings related to the operation of gensim and other tools. The tool is operating normally unless it drops back to a terminal prompt with an error.
67 |
68 | * The results based on the text extracted from your specified file types and processed using pyLDAvis will appear automatically in a browser window. When finished viewing, you can terminate the server in the existing terminal by typing "Ctrl-C".
69 |
70 | Additional adjustments can be performed with command-line flags.
71 |
72 | * --topics: number of topics (default 10)
73 | * --tm: topic modeling tool (default gensim; the Graphlab option is disabled due to licensing restrictions)
74 | * --infile: file source. If the --infile option is not used, the disk image(s) listed in the configuration
75 |   file will be extracted. Use --infile to specify a directory of files instead.
76 | * --config: configuration file (default **config.txt** in the main directory). Specify a file path to use an alternate configuration file. (See the example below the usage line.)
77 |
78 | ```shell
79 | Usage: python bcnlp_tm.py [--topics <10>] [--tm <gensim>] [--infile <directory>] [--config <config-file>]
80 | ```
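
For example, to model 15 topics from a directory of already-extracted text files (hypothetical paths shown):

```shell
$ python bcnlp_tm.py --topics 15 --infile /path/to/text_files --config config.txt
```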
81 |
82 | ## Documentation
83 |
84 | Additional project information can be found on the BitCurator NLP wiki at https://github.com/BitCurator/bitcurator-nlp/wiki.
85 |
86 | ## License(s)
87 |
88 | The BitCurator logo, BitCurator project documentation, and other non-software products of the BitCurator team are subject to the Creative Commons Attribution 4.0 International license (CC BY 4.0).
89 |
90 | Unless otherwise indicated, software items in this repository are distributed under the terms of the GNU Lesser General Public License, Version 3. See the text file "LICENSE" for further details about the terms of this license.
91 |
92 | In addition to software produced by the BitCurator team, BitCurator packages and modifies open source software produced by other developers. Licenses and attributions are retained here where applicable.
93 |
94 | ## Additional Notes
95 |
96 | If your Ubuntu VM does not already have a desktop (graphic UI), you will need to install one in order to view the results in a browser:
97 |
98 | ```shell
99 | $ sudo apt-get update
100 | $ sudo apt-get install ubuntu-desktop
101 | ```
102 |
--------------------------------------------------------------------------------
/attic/Vagrantfile:
--------------------------------------------------------------------------------
1 | # -*- mode: ruby -*-
2 | # vi: set ft=ruby :
3 |
4 | # Vagrantfile API/syntax version. Don't touch unless you know what you're doing!
5 | VAGRANTFILE_API_VERSION = "2"
6 |
7 | Vagrant.configure(VAGRANTFILE_API_VERSION) do |config|
8 |
9 |   # Note: Build is tested with Ubuntu 17.04, but should also work with Ubuntu 16.04
10 | config.vm.box = "bento/ubuntu-17.04"
11 |
12 | # Run the provisioning script
13 | config.vm.provision :shell, :path => "./provision/bootstrap.sh"
14 |
15 | # Configure synced folder - uncomment the following line to enable
16 | # config.vm.synced_folder "", "/vagrant"
17 |
18 |   # Port forward guest HTTP (80) to host 8080
19 |   # Port forward 8888 (required for gensim)
20 | config.vm.network :forwarded_port, :host => 8080, :guest => 80
21 | config.vm.network :forwarded_port, :host => 8888, :guest => 8888
22 |
23 | # Use VirtualBox as the provider. Default specs are 4GB RAM, 2 procs
24 | # Increase vb.memory and vb.cpus for better performance
25 | config.vm.provider :virtualbox do |vb|
26 | vb.name = "nlp-webtools-0.0.1"
27 | vb.memory = 4096
28 | vb.cpus = 2
29 | end
30 | end
31 |
--------------------------------------------------------------------------------
/attic/bn_plot.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # coding=UTF-8
3 | #
4 | # BitCurator NLP (Disk Image Access for the Web)
5 | # Copyright (C) 2016 - 2018
6 | # All rights reserved.
7 | #
8 | # This code is distributed under the terms of the GNU General Public
9 | # License, Version 3. See the text file "COPYING" for further details
10 | # about the terms of this license.
11 | #
12 | # This file contains the main BitCurator NLP application for 3D plot.
13 |
14 | from mpl_toolkits.mplot3d import Axes3D
15 | import matplotlib.pyplot as plt
16 | import numpy as np
17 | import spacy
18 | #import spacy.en
19 | #from spacy.en import English
20 | import textract
21 |
22 | dict_ent = {}
23 | dict_time = {}
24 | dict_org = {}
25 | dict_person = {}
26 | dict_gpe = {}
27 | dict_event = {}
28 | dict_date = {}
29 | dict_languages = {}
30 | dict_facility = {}
31 | dict_work_of_art = {}
32 | dict_norp = {}
33 | dict_loc = {}
34 |
35 | def get_dict(ent_type):
36 | if ent_type == 'time':
37 | return "time", dict_time
38 | elif ent_type == 'org':
39 | return "org", dict_org
40 | elif ent_type == 'person':
41 | return "person", dict_person
42 | elif ent_type == 'gpe':
43 | return "gpe", dict_gpe
44 | elif ent_type == 'date':
45 | return "date", dict_date
46 | elif ent_type == 'languages':
47 | return "languages", dict_languages
48 | elif ent_type == 'facility':
49 | return "facility", dict_facility
50 | elif ent_type == 'work_of_art':
51 | return "work_of_art", dict_work_of_art
52 | elif ent_type == 'norp':
53 | return 'norp', dict_norp
54 | elif ent_type == 'loc':
55 | return "loc", dict_loc
56 | else:
57 | return None, None
58 |
59 |
60 | from bn_filextract import *
61 |
62 | from configobj import ConfigObj
63 | # Dict to number of partitions in each image
64 | partition_in = dict()
65 | config_file = "config.txt" # FIXME: Remove the globalization
66 | logging.basicConfig(filename= 'bcnlp.log', level=logging.DEBUG)
67 |
68 | img_list = []
69 | doc_list = []
70 | entities_list = []
71 |
72 | class ParseForEnts():
73 | """ Parses the given file(s) into entities and generates the span
74 | Input: text, entity_list
75 | Output: Span file(s)
76 | entity_list can be configured in the file bcnlp_config.txt.
77 | """
78 | def __init__(self):
79 | self.spans = []
80 | fig = plt.figure()
81 | self.ax = fig.add_subplot(111, projection='3d')
82 | self.ax.set_xlabel('Image')
83 | self.ax.set_ylabel('Document')
84 | self.ax.set_zlabel('Entity')
85 |
86 | def getIdsForPlot(self, img, doc, entity):
87 | if img not in img_list:
88 | logging.info("Appending img to img_list : %s", img)
89 | img_list.append(img)
90 | # return the key as it already exists
91 | img_id = img_list.index(img)
92 |
93 | #logging.info("[V]ParseForEnts: getIdsForPlot: doc_list: %s ", doc_list)
94 | if doc not in doc_list:
95 | logging.info("Appending DOC to doc_list : %s", doc)
96 | doc_list.append(doc)
97 | else:
98 | logging.info("Doc %s already exists ",doc)
99 |
100 | # return the key as it already exists
101 | doc_id = doc_list.index(doc)
102 |
103 | if entity not in entities_list:
104 | entities_list.append(entity)
105 | # return the key as it already exists
106 | entity_id = entities_list.index(entity)
107 |
108 | logging.info("getIdsForPlot:ret img_id %d doc_id %d entity_id: %d ",\
109 | img_id, doc_id, entity_id)
110 |
111 | return img_id, doc_id, entity_id
112 |
113 | def tagEnts(self, text, entity_list, nlp, img, doc):
114 | self.spacy_doc = nlp(text)
115 | logging.info("Spacy_doc Entities: \n")
116 |
117 | '''
118 | for ent in self.spacy_doc.ents:
119 | logging.info("%s, %s, %s", ent.text, ent.label, ent.label_)
120 | '''
121 |
122 | for j in entity_list:
123 | dict_ent[j] = 0
124 |
125 | logging.info("tagEnts: Entity list: %s", entity_list)
126 | for word in self.spacy_doc[:-1]:
127 | #logging.info("[V]Word: %s ent_type: %s ", \
128 | #word, str(word.ent_type_))
129 |
130 | start = word.i
131 | end = word.i + 1
132 | while end < len(self.spacy_doc) and self.spacy_doc[end].is_punct:
133 | end += 1
134 | self.span = self.spacy_doc[start : end]
135 | if word.ent_type_ in entity_list or \
136 | (word.ent_type_).lower() in entity_list:
137 | #logging.info("tagEnts:Img:%s Doc:%s Entity: %s ent_type:%s ", \
138 | #img, doc, word, word.ent_type_)
139 |
140 | x, y, z = self.getIdsForPlot(img, doc, word)
141 | self.plot3d(x, y, z)
142 | logging.info("[D]tagEnts: ent_type %s is in entity_list ", \
143 | word.ent_type_)
144 | end_char = "end: "+str(self.span.end_char)
145 | start_char = "start: "+str(self.span.start_char)
146 | ent_type = "type: "+word.ent_type_
147 | self.spans.append((end_char, start_char, ent_type))
148 | logging.debug("[D]tagEnts: Appended %s, New SPANS: %s ", \
149 | word, self.spans)
150 |
151 | # For generating histogram, a new dictionary is created for
152 | # each entity. First time the value is initialized to 1.
153 | # It is appended for subsequent words
154 | edict_name, edict = get_dict(word.ent_type_.lower())
155 |
156 | if edict != None:
157 | if str(word) in edict:
158 | edict[str(word)] += 1
159 | else:
160 | edict[str(word)] = 1
161 |
162 | dict_ent[str(word.ent_type_.lower())] += 1
163 |
164 | '''
165 |         # Note: This is commented out to reduce noise in the log file.
166 | else:
167 | logging.debug("ent_type %s for word %s is NOT in entity_list",
168 | word.ent_type_, word)
169 | '''
170 |
171 | return self.spans, dict_ent
172 | def plot3d(self, x, y, z):
173 | self.ax.scatter(x, y, z, c='r', marker='.')
174 |
175 |
176 | def extractContents(self, infile):
177 | """ If infile is not in text format, it uses textract api to extract
178 | text out of the given file.
179 | """
180 | if infile.endswith('.span'):
181 | return None
182 | if not infile.endswith('.txt'):
183 |             print("infile {} doesn't end with .txt, so textracting".format(infile))
184 |
185 | '''
186 | # Note: This is just in case we want to see the conversion
187 | # copied to a file
188 | filename, file_ext = os.path.splitext(infile)
189 | print("Filename: {}, ext: {}".format(filename, file_ext))
190 |
191 | new_infile = replace_suffix(infile,file_ext, 'txt')
192 | print "new_infile: ", new_infile
193 |
194 | f = codecs.open(new_infile, "r", "utf-8")
195 | input_file_contents = f.read()
196 |
197 | '''
198 | filename, file_ext = os.path.splitext(infile)
199 | try:
200 | text = textract.process(infile)
201 | except:
202 | print("Textract probably does not support extension ", file_ext)
203 | return None
204 |
205 | #nlp expects a unicode text string.
206 | input_file_contents = unicode(text,'utf-8')
207 |
208 | else:
209 | print "Extracting Contents of file", infile
210 | f = codecs.open(infile, "r", "utf-8")
211 | try:
212 | input_file_contents = f.read()
213 | except:
214 | print "Error reading file ", infile
215 | return None
216 |
217 | return input_file_contents
218 |
219 | def bnParseConfigFileForEnts(self, filename):
220 | """ Parses the configuration file plot_config.txt to
221 | extract FIXME
222 | """
223 | config = ConfigObj(filename)
224 | entity_list_section = config['entity_list_section']
225 | cfg_entity_list = []
226 | for key in entity_list_section:
227 | #logging.debug("Cfg: Key: %s %s ", key, entity_list_section[key])
228 | flag = int(entity_list_section[key])
229 | if flag == 1:
230 | #logging.debug("Cfg: bnParseConfigFile: Appending key %s: ", key)
231 | cfg_entity_list.append(key)
232 | return cfg_entity_list
233 |
234 | def bcnlpProcessDir(self, infile, bg):
235 | """ Recursively calls itself till it finds a file which is not a
236 | directory, to process the file contents.
237 | """
238 | for f in os.listdir(infile):
239 | f_path = infile + '/' + f
240 | print "\n>> Processing file ", f_path
241 | logging.debug("bcnlpProcessDir: Processing file %s ",f_path)
242 | if os.path.isdir(f_path):
243 | self.bcnlpProcessDir(f_path, bg)
244 | else:
245 | # It is a file
246 | logging.debug(">>>> Processing single file %s ", f_path)
247 | self.bcnlpProcessSingleFile(f_path, bg)
248 |
249 |
250 | def bcnlpProcessText(self, img, doc, text, entity_list, parse_en, bg=False):
251 | logging.info("ProcessText: img: %s doc: %s",img, doc)
252 | spans, dict_ents = self.tagEnts(text, entity_list, parse_en, img, doc)
253 | #logging.debug("const ents = %s", entity_list)
254 |
255 |
256 | def bcnlpProcessSingleFile(self, infile, bg = False):
257 | """ Given a file, it extracts the contents and calls tagEnts to
258 | create the spans for the entities given in the config file.
259 | """
260 | outfile = infile+'.span'
261 |
262 | # Get the entity list from the config file:
263 |         entity_list = self.bnParseConfigFileForEnts("bcnlp_config.txt")
264 | logging.debug("infile:{}, outfile:{}".format(infile, outfile))
265 | logging.debug("Entity List:%s: ", str(entity_list))
266 |
267 | text = self.extractContents(infile)
268 |
269 | if text == None:
270 | print("textract returned None for file ", infile)
271 | return
272 |         spans, dict_ents = self.tagEnts(text, entity_list, nlp, img=None, doc=None)  # nlp is the module-level spaCy model loaded in __main__
273 | '''
274 | # NOTE: just for debugging purpose. Produces a lot of log
275 | logging.debug("const text = %s", text)
276 | logging.debug("const spans = %s", str(spans))
277 | logging.debug("const ents = %s", entity_list)
278 | '''
279 |
280 | if not os.path.exists(outfile):
281 | logging.debug('writing spans to outfile %s ', outfile)
282 | with open(outfile, "w") as of:
283 | text_line = ("const text = '"+ text + "'")
284 | try:
285 | of.write(text_line.encode('utf8'))
286 | except UnicodeEncodeError as e:
287 | print "Unicode Error({0}) ".format(e)
288 | print (" ### Error in writing: ", infile)
289 | return
290 | span_line = str(spans).replace('(','{')
291 | span_line = span_line.replace(')','}')
292 | span_line = unicode("const spans = "+ span_line, 'utf-8')
293 | of.write("%s\n" % span_line)
294 | ent_line = unicode("const ents = " + str(entity_list), 'utf-8')
295 | of.write("%s\n" % ent_line)
296 | else:
297 | print("Outfile {} exists. So skipping".format(outfile))
298 |
299 | print("\n")
300 | print ">> Wrote span info to output file ", outfile
301 |
302 | cfg_image = {}
303 | def bn_parse_config_file(config_file, section_name):
304 | print "bn_parse_config_file: Section: ", section_name, config_file
305 | config = ConfigObj(config_file)
306 | section = config[section_name]
307 | i = 0
308 | cfg_entity_list = []
309 | for key in section:
310 | #if key == cfg_string:
311 | # found the string
312 | #return section[key]
313 | print "key: ", key
314 | if section_name == "image_section":
315 | print (key, section[key])
316 | cfg_image[i] = key
317 | i+=1
318 | elif section_name == "entity_list_section":
319 |             flag = int(section[key])
320 | if flag == 1:
321 | cfg_entity_list.append(key)
322 | if section_name == "entity_list_section":
323 | return cfg_entity_list
324 | #print "IMAGES: ", cfg_image
325 |
326 |
327 | if __name__ == "__main__":
328 |
329 | #parse_en = English()
330 | nlp = spacy.load('en')
331 | config_file = "config.txt"
332 | #bn_parse_config_file(config_file)
333 | bn_parse_config_file(config_file, "image_section")
334 | #bn = bn_filextract.bcnlp()
335 | bn = BnFilextract()
336 |
337 | # for each image extract the files and convert the convertable
338 | # formats to text format
339 | i = 0 # image index
340 | ent = ParseForEnts()
341 |
342 | # Find the excluded formats from config file.
343 | bn.exc_fmt_list = bn.bnGetExFmtsFromConfigFile(config_file)
344 | print("Excluded formats in config file: ", bn.exc_fmt_list)
345 |
346 | for img in cfg_image:
347 | print "Extracting files from image ", cfg_image[img]
348 | bn.bnExtractFiles(ent, cfg_image[img], i, nlp, config_file)
349 | i += 1
350 |
351 | #entity_list = ent.bnParseConfigFileForEnts("bn_config.txt")
352 | # Now traverse the directory and generate entities, etc.
353 | file_extract_dir = bn.bnGetConfigInfo(config_file, \
354 | "confset_section", "file_staging_directory")
355 |
356 | i = 0
357 | for img in cfg_image:
358 | new_file_extract_dir = os.path.join(file_extract_dir, str(i))
359 | bn.bnTraverseDirForPlot(img, new_file_extract_dir, \
360 | ent, nlp, config_file)
361 | i += 1
362 |
363 | print(">> Plotting the results ")
364 |
365 | plt.show()
366 |
367 | '''
368 | fig = plt.figure()
369 | ax = fig.add_subplot(111, projection='3d')
370 |
371 | xs =
372 | ys =
373 | zz = randrange(n, 0, 100)
374 | '''
375 |
376 |
377 |
--------------------------------------------------------------------------------
/attic/old-config.txt:
--------------------------------------------------------------------------------
1 | #
2 | # bitcurator-nlp-gentm config file
3 | #
4 |
5 | # Disk images to process (the default location can be changed in the following section)
6 | [image_section]
7 | govdocs45sampler.E01 = 1
8 |
9 | # Configuration settings. Where to find disk images and store intermediary files.
10 | [confset_section]
11 | disk_image_dir = "disk-images"
12 | file_staging_directory = "filextract_dir"
13 | nlp_dir = "bcnlp"
14 | spacy_outfile = "spacy_outfile"
15 | entity_info = "No"
16 | num_iterations = 200
17 | exclude_words = "false", "true", "other", "new", "old", "can", "will", "may", "also", "shall", "even"
18 |
19 | # Formats to exclude when extracting text using textract
20 | [exclude_format_section]
21 | .jpg=1
22 | .JPG=1
23 | .mp3=1
24 | .wav=1
25 |
--------------------------------------------------------------------------------
/attic/old_graphlab_ref_code/bcnlp_tm.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # coding=UTF-8
3 | #
4 | # BitCurator NLP (Disk Image Access for the Web)
5 | # Copyright (C) 2014 - 2016
6 | # All rights reserved.
7 | #
8 | # This code is distributed under the terms of the GNU General Public
9 | # License, Version 3. See the text file "COPYING" for further details
10 | # about the terms of this license.
11 | #
12 | # This file contains the main BitCurator NLP application for Topic modeling
13 |
14 | # Usage: python bcnlp_tm.py [--topics <10>] [--tm <graphlab|gensim>]
15 | # Default num_topics = 10, tm=graphlab
16 |
17 | import os
18 | import logging
19 | import pyLDAvis
20 | import pyLDAvis.gensim
21 | import pyLDAvis.graphlab
22 | import graphlab as gl
23 | from gensim import corpora, models, similarities
24 | import gensim
25 | import textract
26 | from bn_filextract import *
27 | from configobj import ConfigObj
28 | from stop_words import get_stop_words
29 |
30 | try:
31 | from argparse import ArgumentParser
32 | except ImportError:
33 | raise ImportError("This script requires ArgumentParser which is in Python 2.7 or Python 3.0")
34 |
35 | #logging.basicConfig(filename= 'bcnlp_tm.log', level=logging.DEBUG)
36 | logging.basicConfig(filename= 'bcnlp_tm_info.log', level=logging.INFO)
37 | logging.basicConfig(filename= 'bcnlp_tm_debug.log', level=logging.DEBUG)
38 | logging.basicConfig(filename= 'bcnlp_tm_warning.log', level=logging.WARNING)
39 |
40 |
41 | cfg_image = {}
42 | #documents = []
43 |
44 | class BnTopicModel():
45 |
46 | def tm_generate_gensim(self, infile, num_topics, config_file):
47 | ''' Using the APIs provided by gensim, LDAvis gui is invoked.
48 | NOTE: This is not yet tested well.
49 | '''
50 | documents = []
51 | documents = bn.bnTraverseInfileDir(infile, documents, config_file)
52 | if documents == []:
53 | print("Documents are empty")
54 |
55 | # remove common words and tokenize
56 | '''
57 | stoplist = set('a an the of to for s from is and this \
58 | was were are , - | @ . '.split())
59 | texts = [[word for word in document.lower().split() \
60 | if word not in stoplist] \
61 | for document in documents]
62 | '''
63 |
64 | en_stop = get_stop_words('en')
65 | logging.info("Stop-words list: %s ", en_stop)
66 | texts = [[word for word in document.lower().split() \
67 | if word not in en_stop] \
68 | for document in documents]
69 |
70 |
71 | # remove words that appear only once
72 | from collections import defaultdict
73 | frequency = defaultdict(int)
74 | for text in texts:
75 | for token in text:
76 | frequency[token] += 1
77 |
78 | texts = [[token for token in text if frequency[token] > 1]
79 | for text in texts]
80 |
81 | texts = [[token for token in text if len(token) > 2]
82 | for text in texts]
83 |
84 | # NOTE: lemmatize not working
85 | ###texts = gensim.utils.lemmatize(texts)
86 |
87 | dictionary = corpora.Dictionary(texts)
88 |
89 | ##logging.info("[V]: token:id: %s", dictionary.token2id)
90 |
91 | ## dictionary.compactify()
92 | dictionary.save('/tmp/saved_dict.dict')
93 |
94 | # Now convert tokenized documents to vectors:
95 | corpus = [dictionary.doc2bow(text) for text in texts]
96 |
97 | ## logging.info("[V] Corpus: %s ", corpus)
98 |
99 | # store to disk, for later use
100 | corpora.MmCorpus.serialize('/tmp/saved_dict.mm', corpus)
101 |
102 | ## Creating Transformations
103 | ## The transformations are standard Python objects, typically
104 | ## initialized (trained) by means of a training corpus:
105 |         ## First, let's use tf-idf for training: it simply involves
106 |         ## going through the supplied corpus once and computing the
107 |         ## document frequencies of all its features.
108 |
109 | tfidf = models.TfidfModel(corpus) # step 1 -- initialize a model
110 |
111 | corpus_tfidf = tfidf[corpus]
112 | corpora.MmCorpus.serialize('/tmp/saved_corpus_tfidf.mm', corpus_tfidf)
113 |
114 | '''
115 | # LSI model is commented out for now
116 | print "Printing TFIDF of given corpus \n"
117 | for doc in corpus_tfidf:
118 | print (doc)
119 |
120 | # Now Initialize an LSI transformation: num_topics set to 2 to make
121 | # it 2D lsi = models.LsiModel(corpus_tfidf, id2word=dictionary,
122 | # num_topics=3)
123 |
124 | # create a double wrapper over the original corpus:
125 | # bow->tfidf->fold-in-lsi
126 | corpus_lsi = lsi[corpus_tfidf]
127 |
128 | print "Printing LSI topics"
129 | lsi.print_topics(4)
130 |
131 | for doc in corpus_lsi:
132 | print (doc)
133 | '''
134 |
135 | # Create an LDA model
136 | '''
137 | lda_model = models.LdaModel(corpus_tfidf, \
138 | id2word=dictionary, \
139 | num_topics=5)
140 | '''
141 | lda_model = models.ldamodel.LdaModel(corpus=corpus, \
142 | id2word=dictionary, \
143 | num_topics=num_topics)
144 | corpus_lda = lda_model[corpus]
145 |
146 | corpus_lda_tfidf = lda_model[corpus_tfidf]
147 |
148 | # The following will print the topics in the logfile
149 | logging.info("Printing %s topics into log file: ", str(num_topics))
150 | lda_model.print_topics(num_topics)
151 |
152 | # Generate data for the pyLDAvis interface from the lda_model above
153 | vis_data = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
154 | ##vis_data = pyLDAvis.gensim.prepare(lda_model, corpus_lda, dictionary)
155 |
156 | #pyLDAvis.display(vis_data)
157 | pyLDAvis.show(vis_data)
158 |
159 | def tm_generate_graphlab(self, indir, num_topics, config_file):
160 | ''' Generate the LDA model for documents in indir, using graphlab
161 | '''
162 | indir_path = os.path.join(os.getcwd(), indir)
163 | print(">> Graphlab: Creating SArray for files in ", indir)
164 | sa = self.bnGenerateSArray(indir, config_file)
165 |
166 | sa_docs = gl.text_analytics.count_words(sa)
167 | sa_docs_nsw = sa_docs.dict_trim_by_keys(gl.text_analytics.stopwords(), \
168 | True)
169 |
170 | num_iterations = bn.bnGetConfigInfo(config_file, \
171 | "confset_section", "num_iterations")
172 |
173 | print(">> Graphlab: Creating topic model with {} topics: ".\
174 | format(num_topics))
175 | topic_model = gl.topic_model.create(sa_docs_nsw, \
176 | num_topics=int(num_topics), \
177 | num_iterations=int(num_iterations))
178 |
179 | print(">> Graphlab: Preparing data: ")
180 | vis_data = pyLDAvis.graphlab.prepare(topic_model, sa_docs_nsw)
181 |
182 | print(">> Graphlab: Launching graphics ")
183 | pyLDAvis.show(vis_data)
184 |
185 | def remove_punctuation(self, text):
186 | import string
187 | return text.translate(None, string.punctuation)
188 |
189 | def remove_digits(self, text):
190 | import string
191 | return text.translate(None, string.digits)
192 |
193 | def bnGenerateSArray(self, filextract_dir, config_file):
194 | ''' Traverse through the files in a directory and create sArrays
195 | and append them into one single sArray.
196 | '''
197 | fname = sys._getframe().f_code.co_name
198 | num_docs = 0
199 | sa_g = gl.SArray(dtype = str)
200 | sw_list = ['a', 'an', 'the', 'of', 'to', 'for','as', 'from', 'is', \
201 | 'was', 'were', 'are', ',', '-', '|', '@', '.' ]
202 | for root, dirs, files in os.walk(filextract_dir):
203 | path = root.split(os.sep)
204 |
205 | '''
206 | print "path: ", path, len(path)
207 | print "dirs: ", dirs
208 | print "files: ", files
209 | print((len(path) - 1) * '---', os.path.basename(root))
210 | '''
211 |
212 | # if no files continue to next level
213 | if files == []:
214 | continue
215 |
216 | for filename in files:
217 | file_path = '/'.join(path) + '/' + filename
218 |
219 | bn = BnFilextract()
220 | if os.stat(file_path).st_size == 0:
221 | logging.info(">>>> File %s is empty. Skip it ", file_path)
222 | continue
223 |
224 | if bn.isFileTextractable(filename, config_file):
225 | try:
226 | input_file_contents = textract.process(file_path)
227 | logging.info("Textracted %s ", file_path)
228 | if len(input_file_contents) == 0:
229 | logging.info(">>>> File %s is empty. Skip it ", file_path)
230 | continue
231 | except:
232 | logging.info("Textract failed for file %s ", filename)
233 | continue
234 |
235 | input_file_contents = self.remove_punctuation(input_file_contents)
236 | input_file_contents = self.remove_digits(input_file_contents)
237 | file_path = os.path.splitext(file_path)[0]+'.txt'
238 | logging.info("%s: writing contents to outfile:%s ",
239 | fname, file_path)
240 | else:
241 | logging.info("File %s is NOT textractable ",filename)
242 | continue
243 |
244 | with open(file_path, "w") as text_file:
245 | text_file.write(input_file_contents)
246 |
247 | logging.info(">>> Getting SArray for file %s ", file_path)
248 | sa_sub = gl.SArray(file_path)
249 | gl.text_analytics.trim_rare_words(sa_sub, \
250 | threshold=2, stopwords=sw_list )
251 | # Now append the sub-sarray to the main one.
252 | if num_docs == 0:
253 | sa_g = sa_sub
254 | else:
255 | sa_g = sa_g.append(sa_sub)
256 | num_docs += 1
257 |
258 | logging.info("%s: Total num docs: %d ", fname, num_docs)
259 | return sa_g
260 |
261 | def bnRemoveEmptyFiles(self, path):
262 | ''' Traverses the directory and recursively removes empty files.
263 | '''
264 | files = os.listdir(path)
265 | if len(files):
266 | for fl in files:
267 | fullpath = os.path.join(path, fl)
268 | if os.path.isdir(fullpath):
269 | self.bnRemoveEmptyFiles(fullpath)
270 | if os.stat(fullpath).st_size == 0:
271 | logging.info("Removing file %s ", fullpath)
272 | os.remove(fullpath)
273 |
274 | def bn_parse_config_file(config_file, section_name):
275 | ''' Parses the config file to extract the image names and entity list.
276 | '''
277 | logging.info("bn_parse_config_file: Section: %s ", section_name)
278 | config = ConfigObj(config_file)
279 | section = config[section_name]
280 | i = 0
281 | cfg_entity_list = []
282 | for key in section:
283 | #if key == cfg_string:
284 | # found the string
285 | #return section[key]
286 | if section_name == "image_section":
287 | logging.info("parse_config: key: %s, section: %s", \
288 | key, section[key])
289 | cfg_image[i] = key
290 | i+=1
291 | elif section_name == "entity_list_section":
292 |             flag = int(section[key])
293 | if flag == 1:
294 | cfg_entity_list.append(key)
295 | if section_name == "entity_list_section":
296 | return cfg_entity_list
297 |
298 | if __name__ == "__main__":
299 | parser = ArgumentParser(prog='bcnlp_tm.py', description='Topic modeling')
300 | parser.add_argument('--config', action='store', \
301 | help="Config file[bntm_config.txt] ")
302 | parser.add_argument('--infile', action='store', help="input directory ")
303 | parser.add_argument('--tm', action='store', \
304 | help="topic modeling :gensim/graphlab ")
305 | parser.add_argument('--topics', action='store', help="number of topics ")
306 |
307 | args = parser.parse_args()
308 |
309 | # Infile specifies the directory of files to run the topic modeling on.
310 | # If no argument specified, it will assume there are disk-images specified
311 | # in the config file bntm_config.txt.
312 |
313 | infile = args.infile
314 | tm = args.tm # Topic modeling type: gensim/graphlab
315 | config_file = args.config
316 | is_disk_image = False
317 |
318 | num_topics = 10
319 | if args.topics:
320 |         num_topics = int(args.topics)
321 |
322 | # default it to Graphlab
323 | if tm == None:
324 | tm = 'graphlab'
325 |
326 | if config_file == None:
327 | config_file = "bntm_config.txt"
328 |
329 | bn = BnFilextract()
330 | if infile == None:
331 | is_disk_image = True
332 |
333 | bn_parse_config_file(config_file, "image_section")
334 | print(">> Images in the config file: ", cfg_image)
335 |
336 | infile = bn.bnGetConfigInfo(config_file, \
337 | "confset_section", "file_staging_directory")
338 |
339 | i = 0
340 | for img in cfg_image:
341 | print(">> Extracting files from image {}...".format(cfg_image[img]))
342 | bn.bnExtractFiles(None, cfg_image[img], i, None, config_file)
343 | i += 1
344 | print(">> ... Done ")
345 |
346 | else:
347 | print(">> Extracting files from ", infile)
348 |         bn.bnTraverseInfileDir(infile, [], config_file)  # pass a fresh list; no module-level documents variable is defined here
349 |
350 | tmc = BnTopicModel()
351 | if tm == 'gensim':
352 | tmc.tm_generate_gensim(infile, num_topics, config_file)
353 | elif tm == 'graphlab':
354 | if is_disk_image:
355 | indir = bn.bnGetOutDirFromConfig(config_file)
356 | print(">> Generating graphlab for images in disk image")
357 | logging.info(">> Generating graphlab for images in disk image")
358 | logging.info("File-extracted directory: %s ", indir)
359 | tmc.tm_generate_graphlab(indir, num_topics, config_file)
360 | else:
361 | print(">> Generating graphlab for files in ", infile)
362 | logging.info(">> Generating graphlab for files in %s", infile)
363 | tmc.tm_generate_graphlab(infile, num_topics, config_file)
364 |
365 |
366 |
--------------------------------------------------------------------------------
/attic/provision/bootstrap.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #
3 | # bootstrap.sh: Build and configuration script for nlp-webtools in Vagrant
4 | # --------------------------------------------------------------------------------------
5 | # This script is run only the *first time* you issue the command:
6 | #
7 | # vagrant up
8 | #
9 | # Or, following the commands:
10 | #
11 | # (vagrant halt)
12 | # vagrant destroy
13 | # vagrant up
14 | #
15 | #===============================================================================
16 | # vim: softtabstop=4 shiftwidth=4 expandtab fenc=utf-8 spell spelllang=en cc=81
17 | #===============================================================================
18 | #
19 | # Script Version
20 | __ScriptVersion="0.1"
21 | # Base directory for build log
22 | LOG_BASE=/var/log
23 | WWW_ROOT=/var/www
24 |
25 | #--- FUNCTION ----------------------------------------------------------------
26 | # NAME: __function_defined
27 | # DESCRIPTION: Checks if a function is defined within this script's scope
28 | # PARAMETERS: function name
29 | # RETURNS: 0 or 1 as in defined or not defined
30 | #-------------------------------------------------------------------------------
31 | __function_defined() {
32 | FUNC_NAME=$1
33 | if [ "$(command -v $FUNC_NAME)x" != "x" ]; then
34 | echoinfo "Found function $FUNC_NAME"
35 | return 0
36 | fi
37 |
38 | echodebug "$FUNC_NAME not found...."
39 | return 1
40 | }
41 |
42 | #--- FUNCTION ----------------------------------------------------------------
43 | # NAME: __strip_duplicates
44 | # DESCRIPTION: Strip duplicate strings
45 | #-------------------------------------------------------------------------------
46 | __strip_duplicates() {
47 | echo "$@" | tr -s '[:space:]' '\n' | awk '!x[$0]++'
48 | }
49 |
50 | #--- FUNCTION ----------------------------------------------------------------
51 | # NAME: echoerr
52 | # DESCRIPTION: Echo errors to stderr.
53 | #-------------------------------------------------------------------------------
54 | echoerror() {
55 | printf "%s * ERROR%s: %s\n" "${RC}" "${EC}" "$@" 1>&2;
56 | }
57 |
58 | #--- FUNCTION ----------------------------------------------------------------
59 | # NAME: echoinfo
60 | # DESCRIPTION: Echo information to stdout.
61 | #-------------------------------------------------------------------------------
62 | echoinfo() {
63 | printf "%s * STATUS%s: %s\n" "${GC}" "${EC}" "$@";
64 | }
65 |
66 | #--- FUNCTION ----------------------------------------------------------------
67 | # NAME: echowarn
68 | # DESCRIPTION: Echo warning information to stdout.
69 | #-------------------------------------------------------------------------------
70 | echowarn() {
71 | printf "%s * WARN%s: %s\n" "${YC}" "${EC}" "$@";
72 | }
73 |
74 | #--- FUNCTION ----------------------------------------------------------------
75 | # NAME: echodebug
76 | # DESCRIPTION: Echo debug information to stdout.
77 | #-------------------------------------------------------------------------------
78 | echodebug() {
79 | if [ $_ECHO_DEBUG -eq $BS_TRUE ]; then
80 | printf "${BC} * DEBUG${EC}: %s\n" "$@";
81 | fi
82 | }
83 | #--- FUNCTION ----------------------------------------------------------------
84 | # NAME: __apt_get_install_noinput
85 | # DESCRIPTION: (DRY) apt-get install with noinput options
86 | #-------------------------------------------------------------------------------
87 | __apt_get_install_noinput() {
88 | apt-get install -y -o DPkg::Options::=--force-confold "$@"; return $?
89 | }
90 |
91 | #--- FUNCTION ----------------------------------------------------------------
92 | # NAME: __apt_get_upgrade_noinput
93 | # DESCRIPTION: (DRY) apt-get upgrade with noinput options
94 | #-------------------------------------------------------------------------------
95 | __apt_get_upgrade_noinput() {
96 | apt-get upgrade -y -o DPkg::Options::=--force-confold; return $?
97 | }
98 |
99 | #--- FUNCTION ----------------------------------------------------------------
100 | # NAME: __pip_install_noinput
101 | # DESCRIPTION: (DRY)
102 | #-------------------------------------------------------------------------------
103 | __pip_install_noinput() {
104 | #pip install --upgrade "$@"; return $?
105 | # Python 3
106 | pip3 install --upgrade "$@"; return $?
107 | }
108 |
109 | #--- FUNCTION ----------------------------------------------------------------
110 | # NAME: __pip_pre_install_noinput
111 | # DESCRIPTION: (DRY)
112 | #-------------------------------------------------------------------------------
113 | __pip_pre_install_noinput() {
114 | #pip install --pre --upgrade "$@"; return $?
115 | # Python 3
116 | pip3 install --pre --upgrade "$@"; return $?
117 | }
118 |
119 |
120 | #--- FUNCTION ----------------------------------------------------------------
121 | # NAME: __check_apt_lock
122 | # DESCRIPTION: (DRY)
123 | #-------------------------------------------------------------------------------
124 | __check_apt_lock() {
125 | lsof /var/lib/dpkg/lock > /dev/null 2>&1
126 | RES=`echo $?`
127 | return $RES
128 | }
129 |
130 | __enable_universe_repository() {
131 | if [ "x$(grep -R universe /etc/apt/sources.list /etc/apt/sources.list.d/ | grep -v '#')" != "x" ]; then
132 | # The universe repository is already enabled
133 | return 0
134 | fi
135 | echodebug "Enabling the universe repository"
136 |
137 | # Ubuntu versions higher than 12.04 do not live in the old repositories
138 | if [ $DISTRO_MAJOR_VERSION -gt 12 ] || ([ $DISTRO_MAJOR_VERSION -eq 12 ] && [ $DISTRO_MINOR_VERSION -gt 04 ]); then
139 | add-apt-repository -y "deb http://archive.ubuntu.com/ubuntu $(lsb_release -sc) universe" || return 1
140 | elif [ $DISTRO_MAJOR_VERSION -lt 11 ] && [ $DISTRO_MINOR_VERSION -lt 10 ]; then
141 | # Below Ubuntu 11.10, the -y flag to add-apt-repository is not supported
142 | add-apt-repository "deb http://old-releases.ubuntu.com/ubuntu $(lsb_release -sc) universe" || return 1
143 | fi
144 |
145 | add-apt-repository -y "deb http://old-releases.ubuntu.com/ubuntu $(lsb_release -sc) universe" || return 1
146 |
147 | return 0
148 | }
149 |
150 | __check_unparsed_options() {
151 | shellopts="$1"
152 | # grep alternative for SunOS
153 | if [ -f /usr/xpg4/bin/grep ]; then
154 | grep='/usr/xpg4/bin/grep'
155 | else
156 | grep='grep'
157 | fi
158 | unparsed_options=$( echo "$shellopts" | ${grep} -E '(^|[[:space:]])[-]+[[:alnum:]]' )
159 | if [ "x$unparsed_options" != "x" ]; then
160 | usage
161 | echo
162 | echoerror "options are only allowed before install arguments"
163 | echo
164 | exit 1
165 | fi
166 | }
167 |
168 | configure_cpan() {
169 | (echo y;echo o conf prerequisites_policy follow;echo o conf commit)|cpan > /dev/null
170 | }
171 |
172 | usage() {
173 | echo "usage"
174 | exit 1
175 | }
176 |
177 | install_ubuntu_17.04_deps() {
178 |
179 | echoinfo "Updating your APT Repositories ... "
180 | apt-get update >> $LOG_BASE/nlp-install.log 2>&1 || return 1
181 |
182 | echoinfo "Installing Python Software Properties ... "
183 | __apt_get_install_noinput software-properties-common >> $LOG_BASE/nlp-install.log 2>&1 || return 1
184 |
185 | echoinfo "Enabling Universe Repository ... "
186 | __enable_universe_repository >> $LOG_BASE/nlp-install.log 2>&1 || return 1
187 |
188 | echoinfo "Updating Repository Package List ..."
189 | apt-get update >> $LOG_BASE/nlp-install.log 2>&1 || return 1
190 |
191 | echoinfo "Upgrading all packages to latest version ..."
192 | __apt_get_upgrade_noinput >> $LOG_BASE/nlp-install.log 2>&1 || return 1
193 |
194 | return 0
195 | }
196 |
197 | install_ubuntu_17.04_packages() {
198 | packages="antiword
199 | automake
200 | dkms
201 | ffmpeg
202 | flac
203 | g++-5
204 | gcc-5
205 | lame
206 | libffi-dev
207 | libjpeg-dev
208 | libmad0
209 | libpulse-dev
210 | libsox-fmt-mp3
211 | libtool
212 | libxml2-dev
213 | libxslt1-dev
214 | poppler-utils
215 | pstotext
216 | python
217 | python-dev
218 | python-pip
219 | python3-dev
220 | python3-pip
221 | sox
222 | swig
223 | swig3.0
224 | tesseract-ocr
225 | unrtf
226 | virtualbox-guest-utils
227 | virtualenv
228 | virtualenvwrapper
229 | zlib1g-dev"
230 |
231 |
232 | if [ "$@" = "dev" ]; then
233 | packages="$packages"
234 | elif [ "$@" = "stable" ]; then
235 | packages="$packages"
236 | fi
237 |
238 | for PACKAGE in $packages; do
239 | __apt_get_install_noinput $PACKAGE >> $LOG_BASE/nlp-install.log 2>&1
240 | ERROR=$?
241 | if [ $ERROR -ne 0 ]; then
242 | echoerror "Install Failure: $PACKAGE (Error Code: $ERROR)"
243 | else
244 | echoinfo "Installed Package: $PACKAGE"
245 | fi
246 | done
247 |
248 | return 0
249 | }
250 |
251 | install_ubuntu_17.04_pip_packages() {
252 |
253 | pip_packages="textract
254 | gensim
255 | pyLDAvis
256 | configobj"
257 | pip_special_packages="textacy"
258 |
259 | if [ "$@" = "dev" ]; then
260 | pip_packages="$pip_packages"
261 | elif [ "$@" = "stable" ]; then
262 | pip_packages="$pip_packages"
263 | fi
264 |
265 | ERROR=0
266 |
267 | for PACKAGE in $pip_packages; do
268 | CURRENT_ERROR=0
269 | echoinfo "Installing Python Package: $PACKAGE"
270 | __pip_install_noinput $PACKAGE >> $LOG_BASE/nlp-install.log 2>&1 || (let ERROR=ERROR+1 && let CURRENT_ERROR=1)
271 | if [ $CURRENT_ERROR -eq 1 ]; then
272 | echoerror "Python Package Install Failure: $PACKAGE"
273 | fi
274 | done
275 |
276 | # Prep environment for special packages, install cld2-cffi
277 | env CC=/usr/bin/gcc-5 pip3 install -U cld2-cffi
278 |
279 | for PACKAGE in $pip_special_packages; do
280 | CURRENT_ERROR=0
281 | echoinfo "Installing Python (special setup) Package: $PACKAGE"
282 | __pip_pre_install_noinput $PACKAGE >> $LOG_BASE/nlp-install.log 2>&1 || (let ERROR=ERROR+1 && let CURRENT_ERROR=1)
283 | if [ $CURRENT_ERROR -eq 1 ]; then
284 | echoerror "Python Package Install Failure: $PACKAGE"
285 | fi
286 | done
287 |
288 | if [ $ERROR -ne 0 ]; then
289 | echoerror
290 | return 1
291 | fi
292 |
293 | return 0
294 | }
295 |
296 |
297 | install_source_packages() {
298 |
299 | #echoinfo "nlp-webtools: Nothing to be installed currently. Continuing..."
300 | # Install libuna from specific release
301 | echoinfo "nlp-webtools: Building and installing libuna"
302 | CDIR=$(pwd)
303 |
304 | # Newer versions break a lot of stuff. Keep 20170112 for now.
305 | cd /tmp
306 | wget -q https://github.com/libyal/libuna/releases/download/20170112/libuna-alpha-20170112.tar.gz
307 | tar zxf libuna-alpha-20170112.tar.gz >> $HOME/nlp-install.log 2>&1
308 | cd libuna-20170112
309 | ./configure >> $HOME/nlp-install.log 2>&1
310 | make -s >> $HOME/nlp-install.log 2>&1
311 | make install >> $HOME/nlp-install.log 2>&1
312 | ldconfig >> $HOME/nlp-install.log 2>&1
313 |
314 | # Now clean up
315 | cd /tmp
316 | rm -rf libuna-20170112
317 | rm libuna-alpha-20170112.tar.gz
318 |
319 | # Install libewf from current sources
320 | echoinfo "nlp-webtools: Building and installing libewf"
321 | CDIR=$(pwd)
322 |
323 | # Newer versions break a lot of stuff. Keep 20140608 for now.
324 | cd /tmp
325 | cp /vagrant/externals/libewf-20140608.tar.gz .
326 | tar zxf libewf-20140608.tar.gz >> $HOME/nlp-install.log 2>&1
327 | cd libewf-20140608
328 | ./configure --enable-python --enable-v1-api >> $HOME/nlp-install.log 2>&1
329 | make -s >> $HOME/nlp-install.log 2>&1
330 | make install >> $HOME/nlp-install.log 2>&1
331 | ldconfig >> $HOME/nlp-install.log 2>&1
332 |
333 | # Now clean up
334 | cd /tmp
335 | rm -rf libewf-20140608
336 | rm libewf-20140608.tar.gz
337 |
338 | echoinfo "nlp-webtools: Adding DFXML tools and libraries"
339 | CDIR=$(pwd)
340 | git clone https://github.com/simsong/dfxml /usr/share/dfxml >> $HOME/nlp-install.log 2>&1
341 | # No cleanup needed
342 | cd /tmp
343 |
344 | # Install The Sleuth Kit (TSK) from current sources
345 | echoinfo "nlp-webtools: Building and installing The Sleuth Kit"
346 | CDIR=$(pwd)
347 | git clone --recursive https://github.com/sleuthkit/sleuthkit /usr/share/sleuthkit >> $HOME/nlp-install.log 2>&1
348 | cd /usr/share/sleuthkit
349 | git fetch
350 | git checkout master >> $HOME/nlp-install.log 2>&1
351 | ./bootstrap >> $HOME/nlp-install.log 2>&1
352 | ./configure >> $HOME/nlp-install.log 2>&1
353 | make -s >> $HOME/nlp-install.log 2>&1
354 | make install >> $HOME/nlp-install.log 2>&1
355 | ldconfig >> $HOME/nlp-install.log 2>&1
356 |
357 | # Install PyTSK
358 | echoinfo "nlp-webtools: Building and installing PyTSK (Python bindings for TSK)"
359 | echoinfo " -- Please be patient. This may take several minutes..."
360 | CDIR=$(pwd)
361 | cd /tmp
362 | git clone https://github.com/py4n6/pytsk
363 | cd pytsk
364 | python setup.py update >> $HOME/nlp-install.log 2>&1
365 | python setup.py build >> $HOME/nlp-install.log 2>&1
366 | python setup.py install >> $HOME/nlp-install.log 2>&1
367 | # Now clean up
368 | cd /tmp
369 | #rm -rf pytsk3-20170508
370 | rm -rf pytsk
371 |
372 | }
373 |
374 | complete_message() {
375 | echo
376 | echo "Installation Complete!"
377 | echo
378 | }
379 |
380 | OS=$(lsb_release -si)
381 | ARCH=$(uname -m | sed 's/x86_//;s/i[3-6]86/32/')
382 | VER=$(lsb_release -sr)
383 |
384 | if [ $OS != "Ubuntu" ]; then
385 | echo "nlp-webtools is only installable on Ubuntu operating systems at this time."
386 | exit 1
387 | fi
388 |
389 | if [ $VER != "17.04" ]; then
390 | echo "nlp-webtools is only installable on Ubuntu 17.04 at this time."
391 | exit 3
392 | fi
393 |
394 | if [ "`whoami`" != "root" ]; then
395 | echoerror "The nlp-webtools bootstrap script must run as root."
396 | echoinfo "Preferred Usage: sudo bootstrap.sh (options)"
397 | echo ""
398 | exit 3
399 | fi
400 |
401 | if [ "$SUDO_USER" = "" ]; then
402 | echo "The SUDO_USER variable doesn't seem to be set"
403 | exit 4
404 | fi
405 |
406 | # while getopts ":hvcsiyu" opt
407 | while getopts ":hv" opt
408 | do
409 | case "${opt}" in
410 | h ) usage; exit 0 ;;
411 | v ) echo "$0 -- Version $__ScriptVersion"; exit 0 ;;
412 | \?) echo
413 | echoerror "Option does not exist: $OPTARG"
414 | usage
415 | exit 1
416 | ;;
417 | esac
418 | done
419 |
420 | shift $(($OPTIND-1))
421 |
422 | if [ "$#" -eq 0 ]; then
423 | ITYPE="stable"
424 | else
425 | __check_unparsed_options "$*"
426 | ITYPE=$1
427 | shift
428 | fi
429 |
430 | # Check installation type
431 | if [ "$(echo $ITYPE | egrep '(dev|stable)')x" = "x" ]; then
432 | echoerror "Installation type \"$ITYPE\" is not known..."
433 | exit 1
434 | fi
435 |
436 | echoinfo "****************************************************************"
437 | echoinfo "The nlp-webtools provisioning script will now configure your system."
438 | echoinfo "****************************************************************"
439 | echoinfo ""
440 |
441 | #if [ "$YESTOALL" -eq 1 ]; then
442 | # echoinfo "You supplied the -y option, this script will not exit for any reason"
443 | #fi
444 |
445 | echoinfo "OS: $OS"
446 | echoinfo "Arch: $ARCH"
447 | echoinfo "Version: $VER"
448 | echoinfo "The current user is: $SUDO_USER"
449 |
450 | export DEBIAN_FRONTEND=noninteractive
451 | install_ubuntu_${VER}_deps $ITYPE
452 | install_ubuntu_${VER}_packages $ITYPE
453 | install_ubuntu_${VER}_pip_packages $ITYPE
454 | install_source_packages
455 |
456 | complete_message
457 |
--------------------------------------------------------------------------------
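Once bootstrap.sh has run, a quick way to confirm that the source-built libraries above actually landed is to import their Python bindings. A minimal smoke test (a sketch; it assumes the provisioned environment and checks only the modules the scripts below rely on):

    #!/usr/bin/python
    # Import each binding that bootstrap.sh builds or pip-installs.
    # An ImportError points at the corresponding build step; details
    # are in $HOME/nlp-install.log.
    for name in ("pytsk3", "pyewf", "dfvfs", "gensim", "pyLDAvis", "textract"):
        try:
            __import__(name)
            print("OK   %s" % name)
        except ImportError as err:
            print("FAIL %s (%s)" % (name, err))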
/bcnlp_fxtract.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # coding=UTF-8
3 | #
4 | # BitCurator NLP
5 | # Copyright (C) 2016-2018
6 | # All rights reserved.
7 | #
8 | # This code is distributed under the terms of the GNU General Public
9 | # License, Version 3. See the text file "COPYING" for further details
10 | # about the terms of this license.
11 | #
12 | # This file contains the File Extraction routines for BitCurator NLP.
13 | #
14 |
15 | import multiprocessing
16 | import os
17 | import errno
18 | import re
19 | import pytsk3
20 | import logging
21 | from dfvfs.resolver import resolver
22 | from dfvfs.path import path_spec
23 | from dfvfs.resolver import context
24 | from bcnlp_listfiles import FileEntryLister
25 | from dfvfs.path import factory as path_spec_factory
26 | from dfvfs.lib import definitions
27 |
28 | import argparse
29 | try:
30 | from argparse import ArgumentParser
31 | except ImportError:
32 |     raise ImportError("This script requires argparse, which is available in Python 2.7+ and 3.2+")
33 |
34 |
35 | class FileExtractor(multiprocessing.Process):
36 | def __init__(self,fs_path_spec,output_path):
37 | super(FileExtractor, self).__init__()
38 | self._READ_BUFFER_SIZE = 32768
39 |
40 | self.file_queue = multiprocessing.Queue()
41 |
42 | self.fs_path_spec = fs_path_spec
43 | self.output_path = output_path
44 |
45 | def run(self):
46 | p = os.getpid()
47 | print(u"Running File Extractor: (PID {})".format(p))
48 |
49 |         # The filesystem has to be opened from within the child process;
50 |         # it cannot be passed across the process boundary
51 | resolver_context = context.Context()
52 | file_system = resolver.Resolver.OpenFileSystem(
53 | self.fs_path_spec,
54 | resolver_context=resolver_context
55 | )
56 |
57 | # Read from the queue #
58 | while True:
59 | file_item = self.file_queue.get()
60 | if isinstance(file_item,unicode):
61 | if (file_item == u'TERMINATE'):
62 | break
63 | elif isinstance(file_item,ExtractionInfo):
64 | # Get dfvfs file entry from our path_spec #
65 | outpath_stack = list(os.path.split(
66 | self.output_path
67 | ))
68 |
69 | # Get dfvfs entry #
70 | file_entry = file_system.GetFileEntryByPathSpec(
71 | file_item.path_spec
72 | )
73 |
74 | ads_name = self._GetStreamName(
75 | file_item.full_path
76 | )
77 |
78 | # Export files based off of file_entry #
79 | self._ExportFiles(
80 | file_system,
81 | file_entry,
82 | outpath_stack,
83 | specified_ads_name=ads_name
84 | )
85 | else:
86 | print(u"Item type unhandled for type: {}; {}".format(
87 | unicode(type(file_item)),
88 | unicode(file_item)
89 | ))
90 |
91 | file_system.Close()
92 |
93 | print(u"Ending File Extractor: (PID {})".format(
94 | os.getpid()
95 | ))
96 |
97 | def _GetStreamName(self,full_path):
98 | ads_name = None
99 |
100 | if ':' in full_path:
101 | ads_name = full_path.split(':')[1]
102 |
103 | return ads_name
104 |
105 | def _GetExportFilename(self, outpath_stack, filename, ads_name=None):
106 |         """Create an export filename"""
107 | export_filename = u''
108 |
109 | export_path = os.path.sep.join(outpath_stack)
110 |
111 | name = os.path.basename(filename)
112 | if ads_name:
113 | name = u'{}.{}'.format(name,ads_name)
114 | export_filename = os.path.join(export_path, name)
115 |
116 | return export_filename
117 |
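    # Worked example (illustrative): with outpath_stack ['out', 'p1'] and
    # filename '/docs/report.txt', this returns 'out/p1/report.txt'; with
    # ads_name 'meta' it returns 'out/p1/report.txt.meta', so a named
    # alternate data stream never collides with the default stream on disk.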
118 | def _ExportFiles(self, file_system, file_entry, outpath_stack, specified_ads_name=None):
119 | # Export if file_entry is a file #
120 | if file_entry.IsFile():
121 | for ads in file_entry._GetDataStreams():
122 | full_path = file_entry.path_spec.location
123 |
124 |                 if specified_ads_name:
125 |                     # Only extract this ADS because it was explicitly specified #
126 |                     if specified_ads_name != ads.name:
127 |                         continue
128 | 
129 |                     # (the ':<name>' suffix is appended once, below, for any named stream)
130 |
131 | if len(ads.name) > 0:
132 | ads_name = ads.name
133 | full_path = u'{}:{}'.format(full_path, ads.name)
134 | else:
135 | ads_name = None
136 |
137 | export_name = self._GetExportFilename(
138 | outpath_stack,
139 | file_entry.name,
140 | ads_name=ads_name
141 | )
142 |
143 | result = self._ExportFile(
144 | file_entry,
145 | export_name,
146 | ads_name
147 | )
148 | if result:
149 | #print(u"Exported {} to {}".format(full_path, export_name))
150 | logging.info(u"Exported %s to %s",full_path, export_name)
151 | else:
152 | print(u"{} Not Exported to {}".format(full_path, export_name))
153 | elif file_entry.IsDirectory():
154 | for sub_file_entry in file_entry.sub_file_entries:
155 | outpath_stack.append(file_entry.name)
156 | self._ExportFiles(
157 | file_system,
158 | sub_file_entry,
159 | outpath_stack
160 | )
161 | outpath_stack.pop()
162 |
163 | def _ExportFile(self, file_entry, export_filename, ads_name):
164 | """Export a file"""
165 | _offset = None
166 |
167 | # Outfile #
168 | # Check that path exists #
169 | export_path = os.path.dirname(export_filename)
170 |         if not os.path.isdir(export_path):
171 | try:
172 | os.makedirs(export_path)
173 | except OSError as oserror:
174 | if oserror.errno != errno.EEXIST:
175 | raise
176 |
177 | outfile = open(
178 | export_filename,
179 | 'wb'
180 | )
181 |
182 | file_name = file_entry.name
183 |
184 | tsk_file = file_entry._tsk_file
185 | use_attribute = None
186 |
187 | if ads_name:
188 | data_stream_name = ads_name
189 | for attribute in tsk_file:
190 | if attribute.info.name == data_stream_name:
191 | use_attribute = attribute
192 | if data_stream_name == u'$J' and int(attribute.info.flags) & pytsk3.TSK_FS_ATTR_SPARSE:
193 | # If USN Journal, start at end of sparse data run #
194 | for run in attribute:
195 |                         print(" Blocks %s to %s (%s blocks) [flags: %s] - [offset: %d]" % (
196 |                             run.addr, run.addr + run.len, run.len, str(run.flags), run.offset
197 |                         ))
198 | if run.flags != pytsk3.TSK_FS_ATTR_RUN_FLAG_SPARSE:
199 | _offset = run.offset * tsk_file.info.fs_info.block_size
200 | break
201 | break
202 |
203 | if _offset is None:
204 | _offset = 0
205 |
206 |         if use_attribute is not None:
207 | _filesize = use_attribute.info.size
208 | else:
209 | _filesize = tsk_file.info.meta.size
210 |
211 | while _offset < _filesize:
212 | available_to_read = min(self._READ_BUFFER_SIZE, _filesize - _offset)
213 |
214 |             if use_attribute is not None:
215 | data = tsk_file.read_random(
216 | _offset,
217 | available_to_read,
218 | use_attribute.info.type,
219 | use_attribute.info.id
220 | )
221 | else:
222 | data = tsk_file.read_random(
223 | _offset,
224 | available_to_read
225 | )
226 |
227 | if not data:
228 | break
229 |
230 | _offset += len(data)
231 |
232 | outfile.write(data)
233 |
234 | outfile.close()
235 | return True
236 |
237 | #def AddFileToQueue(self,tsk_file_entry):
238 | def AddFileToQueue(self,tsk_file_entry, full_path):
239 | ''' ORIG
240 | einfo = ExtractionInfo(
241 | tsk_file_entry.path_spec,
242 | tsk_file_entry.full_path
243 | )
244 | '''
245 | einfo = ExtractionInfo(
246 | tsk_file_entry.path_spec,
247 | full_path
248 | )
249 | self.file_queue.put(einfo)
250 |
251 | def Finish(self):
252 | self.file_queue.put(u'TERMINATE')
253 |
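    # Typical driver sequence (a sketch; compare the __main__ block below and
    # bnQueueFileForExtraction in bn_filextract.py): the extractor runs in a
    # child process and drains its queue until the u'TERMINATE' sentinel
    # enqueued by Finish() arrives.
    #
    #   fe = FileExtractor(base_path_spec, output_path)
    #   fe.start()                           # spawn the worker process
    #   fe.AddFileToQueue(file_entry, path)  # enqueue ExtractionInfo items
    #   fe.Finish()                          # enqueue the sentinel
    #   fe.join()                            # wait for the worker to finish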
254 | class ExtractionInfo():
255 | def __init__(self,path_spec,full_path):
256 | self.path_spec = path_spec
257 | self.full_path = full_path
258 |
259 |
260 | if __name__ == "__main__":
261 | parser = ArgumentParser(prog='bcnlp_fxtract.py', description='File Extraction')
262 |     parser.add_argument('--config', action='store', help="Config file [config.txt]")
263 |     parser.add_argument('--image', action='store', help="Disk image to extract files from")
264 |
265 | args = parser.parse_args()
266 |
267 |     config_file = args.config
268 |     if config_file is None:
269 |         config_file = "config.txt"
270 | 
271 |     source_path = args.image
272 |     if source_path is None:
273 |         print("Image not specified")
274 |         raise SystemExit(1)
275 |
276 |
277 | '''
278 | # Get the basePathSpec of the given file/dir
279 | #spath_basename = os.path.basename(spath)
280 | stat_info = os.stat(source_path)
281 |
282 | path_spec = path_spec_factory.Factory.NewPathSpec(
283 | definitions.TYPE_INDICATOR_OS, location=source_path)
284 |
285 | #bl_fle = bcnlp_listfiles.FileEntryLister()
286 | bl_fle = FileEntryLister()
287 | base_path_spec = bl_fle.GetBasePathSpec(source_path)
288 |
289 | #num_partitions = file_entry_lister.ListAllFiles(source_path)
290 |
291 | ####file_system = resolver.Resolver.OpenFileSystem(base_path_spec)
292 | ####file_entry = resolver.Resolver.OpenFileEntry(base_path_spec)
293 |
294 | output_path = os.path.join(os.getcwd(), "extracted_files" )
295 | #fe = FileExtractor(base_path_spec)
296 | fe = FileExtractor(base_path_spec, output_path)
297 | file_entry = resolver.Resolver.OpenFileEntry(base_path_spec)
298 | '''
299 |
300 | bl_fle = FileEntryLister()
301 | base_path_spec = bl_fle.GetBasePathSpec(source_path)
302 |
303 | output_path = os.path.join(os.getcwd(), "new_extracted_files" )
304 | file_location = os.path.join(os.getcwd(), "new_extracted_files" )
305 | #image_location = os.path.join(os.getcwd(), "disk_images")
306 | os_path_spec = path_spec_factory.Factory.NewPathSpec(definitions.TYPE_INDICATOR_OS, location=source_path)
307 | ewf_path_spec = path_spec_factory.Factory.NewPathSpec(definitions.TYPE_INDICATOR_EWF, parent=os_path_spec)
308 |
309 | tsk_partition_path_spec = path_spec_factory.Factory.NewPathSpec(definitions.TYPE_INDICATOR_TSK_PARTITION, location=u'/p1', parent=ewf_path_spec)
310 | tsk_path_spec = path_spec_factory.Factory.NewPathSpec(definitions.TYPE_INDICATOR_TSK, location=file_location, parent=tsk_partition_path_spec)
311 |
312 | #file_entry = resolver.Resolver.OpenFileEntry(tsk_path_spec)
313 | file_entry = resolver.Resolver.OpenFileEntry(base_path_spec)
314 | print("file_entry: ", file_entry)
315 |
316 | #fe = FileExtractor(tsk_path_spec, output_path)
317 | fe = FileExtractor(base_path_spec, output_path)
318 | fe.start()
319 | #fe.AddFileToQueue(file_entry)
320 | fe.AddFileToQueue(file_entry, source_path)
321 | print ("Added files to queue")
322 |
323 | fe.Finish()
324 |
325 |
--------------------------------------------------------------------------------
/bcnlp_listfiles.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # -*- coding: utf-8 -*-
3 |
4 | # BitCurator NLP
5 | # Copyright (C) 2016 - 2018
6 | # All rights reserved.
7 | #
8 | # This code is distributed under the terms of the GNU General Public
9 | # License, Version 3. See the text file "COPYING" for further details
10 | # about the terms of this license.
11 | #
12 | # This file contains the File Extraction routines for BitCurator NLP.
13 | #
14 |
15 | """Script to list file entries.
16 | Extended from dfvfs example code list_file_entries.py
17 | """
18 |
19 | from __future__ import print_function
20 | import argparse
21 | import logging
22 | import os
23 | import stat
24 | import sys
25 |
26 | from dfvfs.analyzer import analyzer
27 | from dfvfs.lib import definitions
28 | from dfvfs.lib import raw
29 | from dfvfs.path import factory as path_spec_factory
30 | from dfvfs.resolver import resolver
31 | from dfvfs.volume import tsk_volume_system
32 |
33 | class FileEntryLister(object):
34 | """Class that lists file entries."""
35 |
36 | # Class constant that defines the default read buffer size.
37 | _READ_BUFFER_SIZE = 32768
38 |
39 | # For context see: http://en.wikipedia.org/wiki/Byte
40 |   _UNITS_1000 = [u'B', u'kB', u'MB', u'GB', u'TB', u'PB', u'EB', u'ZB', u'YB']
41 |   _UNITS_1024 = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
42 |
43 | def _GetHumanReadableSize(self, size):
44 | """Retrieves a human readable string of the size.
45 |
46 | Args:
47 | size: The size in bytes.
48 |
49 | Returns:
50 | A human readable string of the size.
51 | """
52 | magnitude_1000 = 0
53 | size_1000 = float(size)
54 | while size_1000 >= 1000:
55 | size_1000 /= 1000
56 | magnitude_1000 += 1
57 |
58 | magnitude_1024 = 0
59 | size_1024 = float(size)
60 | while size_1024 >= 1024:
61 | size_1024 /= 1024
62 | magnitude_1024 += 1
63 |
64 | size_string_1000 = None
65 |     if magnitude_1000 > 0 and magnitude_1000 <= 8:
66 | size_string_1000 = u'{0:.1f}{1:s}'.format(
67 | size_1000, self._UNITS_1000[magnitude_1000])
68 |
69 | size_string_1024 = None
70 |     if magnitude_1024 > 0 and magnitude_1024 <= 8:
71 | size_string_1024 = u'{0:.1f}{1:s}'.format(
72 | size_1024, self._UNITS_1024[magnitude_1024])
73 |
74 | if not size_string_1000 or not size_string_1024:
75 | return u'{0:d} B'.format(size)
76 |
77 | return u'{0:s} / {1:s} ({2:d} B)'.format(
78 | size_string_1024, size_string_1000, size)
79 |
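  # Worked example from the formatting above: for size = 1536000, the
  # 1000-based loop yields 1.536 -> u'1.5MB' and the 1024-based loop yields
  # ~1.465 -> u'1.5MiB', so the method returns u'1.5MiB / 1.5MB (1536000 B)'.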
80 | def _GetNextLevelTSKPartionVolumeSystemPathSpec(self, source_path_spec, is_single_part):
81 | """Determines the next level volume system path specification.
82 |
83 | Args:
84 | source_path_spec: the source path specification (instance of
85 | dfvfs.PathSpec).
86 |
87 | Returns:
88 | The next level volume system path specification (instance of
89 | dfvfs.PathSpec).
90 |
91 | Raises:
92 | RuntimeError: if the format of or within the source is not supported.
93 | """
94 | volume_system_path_spec = path_spec_factory.Factory.NewPathSpec(
95 | definitions.TYPE_INDICATOR_TSK_PARTITION, location=u'/',
96 | parent=source_path_spec)
97 |
98 | volume_system = tsk_volume_system.TSKVolumeSystem()
99 | volume_system.Open(volume_system_path_spec)
100 |
101 | volume_identifiers = []
102 | for volume in volume_system.volumes:
103 | volume_identifier = getattr(volume, 'identifier', None)
104 | if volume_identifier:
105 | volume_identifiers.append(volume_identifier)
106 |
107 | if not volume_identifiers:
108 | logging.warning(u'No supported partitions found.')
109 | return source_path_spec
110 |
111 |     if (len(volume_identifiers) == 1) or is_single_part:
112 | return path_spec_factory.Factory.NewPathSpec(
113 | definitions.TYPE_INDICATOR_TSK_PARTITION, location=u'/p1',
114 | parent=source_path_spec)
115 |
116 | print(u'The following partitions were found:')
117 | print(u'Identifier\tOffset\t\t\tSize')
118 |
119 | for volume_identifier in sorted(volume_identifiers):
120 | volume = volume_system.GetVolumeByIdentifier(volume_identifier)
121 | if not volume:
122 | raise RuntimeError(
123 | u'Volume missing for identifier: {0:s}.'.format(volume_identifier))
124 |
125 | volume_extent = volume.extents[0]
126 | print(
127 | u'{0:s}\t\t{1:d} (0x{1:08x})\t{2:s}'.format(
128 | volume.identifier, volume_extent.offset,
129 | self._GetHumanReadableSize(volume_extent.size)))
130 |
131 | print(u'')
132 |
133 | while True:
134 | print(
135 | u'Please specify the identifier of the partition that should '
136 | u'be processed:')
137 |
138 | selected_volume_identifier = sys.stdin.readline()
139 | selected_volume_identifier = selected_volume_identifier.strip()
140 |
141 | if selected_volume_identifier in volume_identifiers:
142 | break
143 |
144 | print(u'')
145 | print(
146 | u'Unsupported partition identifier, please try again or abort '
147 |           u'with Ctrl+C.')
148 | print(u'')
149 |
150 | location = u'/{0:s}'.format(selected_volume_identifier)
151 |
152 | return path_spec_factory.Factory.NewPathSpec(
153 | definitions.TYPE_INDICATOR_TSK_PARTITION, location=location,
154 | parent=source_path_spec)
155 |
156 | def _GetNextLevelTSKPartionVolumeSystemPathSpecForBcnlp(self, \
157 | source_path_spec,\
158 | spath):
159 | """Determines the next level volume system path specification.
160 | and calls ListFileEntry to output the file-list from every
161 | partition into the specified output file.
162 |
163 | Args:
164 | source_path_spec: the source path specification (instance of
165 | dfvfs.PathSpec).
166 | spath: source path
167 |
168 | Returns:
169 | number of Partitions
170 |
171 | Raises:
172 | RuntimeError: if the format of or within the source is not supported.
173 | """
174 | fname = sys._getframe().f_code.co_name
175 | spath_basename = os.path.basename(spath)
176 | volume_system_path_spec = path_spec_factory.Factory.NewPathSpec(
177 | definitions.TYPE_INDICATOR_TSK_PARTITION, location=u'/',
178 | parent=source_path_spec)
179 |
180 | volume_system = tsk_volume_system.TSKVolumeSystem()
181 | volume_system.Open(volume_system_path_spec)
182 |
183 | volume_identifiers = []
184 | for volume in volume_system.volumes:
185 | volume_identifier = getattr(volume, 'identifier', None)
186 | if volume_identifier:
187 | volume_identifiers.append(volume_identifier)
188 |
189 | if not volume_identifiers:
190 | logging.warning(u'No supported partitions found.')
191 | return source_path_spec
192 |
193 | print(u'The following partitions were found:')
194 | print(u'Identifier\tOffset\t\t\tSize')
195 |
196 | for volume_identifier in sorted(volume_identifiers):
197 | volume = volume_system.GetVolumeByIdentifier(volume_identifier)
198 | if not volume:
199 | raise RuntimeError(
200 | u'Volume missing for identifier: {0:s}.'.format(volume_identifier))
201 |
202 | volume_extent = volume.extents[0]
203 | print(
204 | u'{0:s}\t\t{1:d} (0x{1:08x})\t{2:s}'.format(
205 | volume.identifier, volume_extent.offset,
206 | self._GetHumanReadableSize(volume_extent.size)))
207 |
208 | print(u'')
209 |
210 | for volume_identifier in sorted(volume_identifiers):
211 | volume = volume_system.GetVolumeByIdentifier(volume_identifier)
212 | location = u'/{0:s}'.format(volume_identifier)
213 | base_path_spec = path_spec_factory.Factory.NewPathSpec( \
214 | definitions.TYPE_INDICATOR_TSK_PARTITION, location=location, \
215 | parent=source_path_spec)
216 |
217 | logging.info("%s: Listing files for partition: %s ", fname, \
218 | volume_identifier)
219 |
220 | base_path_spec = path_spec_factory.Factory.NewPathSpec(
221 | definitions.TYPE_INDICATOR_TSK, location=u'/',
222 | parent=base_path_spec)
223 |
224 | outfile = spath + "_filelist_" + str(volume_identifier)
225 | of = open(outfile, "a")
226 | logging.info("%s:Calling ListFileEntries to write list to file %s ",\
227 | fname, outfile)
228 | self.ListFileEntries(base_path_spec, of, volume_identifier)
229 | of.close()
230 | return len(volume_identifiers)
231 |
232 | def _GetNextLevelVshadowVolumeSystemPathSpec(self, source_path_spec):
233 | """Determines the next level volume system path specification.
234 |
235 | Args:
236 | source_path_spec: the source path specification (instance of
237 | dfvfs.PathSpec).
238 |
239 | Returns:
240 | The next level volume system path specification (instance of
241 | dfvfs.PathSpec).
242 |
243 | Raises:
244 | RuntimeError: if the format of or within the source is not supported.
245 | """
246 | # TODO: implement.
247 | return source_path_spec
248 |
249 | def _GetUpperLevelVolumeSystemPathSpec(self, source_path_spec, is_single_part):
250 | """Determines the upper level volume system path specification.
251 |
252 | Args:
253 | source_path_spec: the source path specification (instance of
254 | dfvfs.PathSpec).
255 |
256 | Returns:
257 | The upper level volume system path specification (instance of
258 | dfvfs.PathSpec).
259 |
260 | Raises:
261 | RuntimeError: if the format of or within the source is not supported.
262 | """
263 | type_indicators = analyzer.Analyzer.GetVolumeSystemTypeIndicators(
264 | source_path_spec)
265 |
266 | if not type_indicators:
267 | # No supported volume system found, we are at the upper level.
268 | return source_path_spec
269 |
270 | if len(type_indicators) > 1:
271 | raise RuntimeError(
272 | u'Unsupported source found more than one volume system types.')
273 |
274 | if type_indicators[0] == definitions.TYPE_INDICATOR_TSK_PARTITION:
275 | path_spec = self._GetNextLevelTSKPartionVolumeSystemPathSpec(
276 | source_path_spec, is_single_part)
277 |
278 | elif type_indicators[0] == definitions.TYPE_INDICATOR_VSHADOW:
279 | path_spec = self._GetNextLevelVshadowVolumeSystemPathSpec(
280 | source_path_spec)
281 |
282 | else:
283 | raise RuntimeError((
284 | u'Unsupported source found unsupported volume system '
285 | u'type: {0:s}.').format(type_indicators[0]))
286 |
287 | return path_spec
288 |
289 | def _GetUpperLevelVolumeSystemPathSpecForBcnlp(self, source_path_spec, spath):
290 | """Determines the upper level volume system path specification,
291 |     then calls methods to list files from all partitions into the
292 | specified file.
293 |
294 | Args:
295 | source_path_spec: the source path specification (instance of
296 | dfvfs.PathSpec).
297 |       spath: the source path, used to name the per-partition output files
298 |
299 |     Returns: the number of partitions
300 |
301 | Raises:
302 | RuntimeError: if the format of or within the source is not supported.
303 | """
304 | fname = sys._getframe().f_code.co_name
305 | type_indicators = analyzer.Analyzer.GetVolumeSystemTypeIndicators(
306 | source_path_spec)
307 |
308 | if not type_indicators:
309 | # No supported volume system found, we are at the upper level.
310 | return source_path_spec
311 |
312 | if len(type_indicators) > 1:
313 | raise RuntimeError(
314 | u'Unsupported source found more than one volume system types.')
315 |
316 | if type_indicators[0] == definitions.TYPE_INDICATOR_TSK_PARTITION:
317 | partitions = self._GetNextLevelTSKPartionVolumeSystemPathSpecForBcnlp(
318 | source_path_spec, spath)
319 |
320 | elif type_indicators[0] == definitions.TYPE_INDICATOR_VSHADOW:
321 |       partitions = self._GetNextLevelVshadowVolumeSystemPathSpec(  # stub; keeps the return below bound
322 | source_path_spec)
323 |
324 | else:
325 | raise RuntimeError((
326 | u'Unsupported source found unsupported volume system '
327 | u'type: {0:s}.').format(type_indicators[0]))
328 |
329 | return partitions
330 |
331 | def _ListFileEntry(
332 | self, file_system, file_entry, parent_full_path, output_writer, p):
333 | """Lists a file entry.
334 |
335 |
336 | Args:
337 | file_system: the file system (instance of dfvfs.FileSystem).
338 | file_entry: the file entry (instance of dfvfs.FileEntry).
339 | parent_full_path: the full path of the parent file entry.
340 | output_writer: the output writer (instance of StdoutWriter).
341 | """
342 | # Since every file system implementation can have their own path
343 | # segment separator we are using JoinPath to be platform and file system
344 | # type independent.
345 | full_path = file_system.JoinPath([parent_full_path, file_entry.name])
346 | if file_entry.IsFile():
347 | output_writer.write(full_path)
348 | output_writer.write("\n")
349 |
350 | for sub_file_entry in file_entry.sub_file_entries:
351 | self._ListFileEntry(file_system, sub_file_entry, full_path, output_writer, p)
352 |
353 | def ListFileEntries(self, base_path_spec, output_writer, partition):
354 | """Lists file entries in the base path specification.
355 |
356 | Args:
357 | base_path_spec: the base path specification (instance of dfvfs.PathSpec).
358 | output_writer: the output writer (instance of StdoutWriter).
359 | """
360 |
361 | file_system = resolver.Resolver.OpenFileSystem(base_path_spec)
362 | file_entry = resolver.Resolver.OpenFileEntry(base_path_spec)
363 | if file_entry is None:
364 | print(
365 | u'Unable to open base path specification:\n{0:s}'.format(
366 | base_path_spec.comparable))
367 | logging.warning(
368 | u'Unable to open base path specification:\n{0:s}'.format(
369 | base_path_spec.comparable))
370 | return
371 |
372 | self._ListFileEntry(file_system, file_entry, u'', output_writer, partition)
373 |
374 | def GetInodeForFile(self, image_path, file_path):
375 | """Returns the Inode of the given file in the given image.
376 |
377 | Args:
378 | image_path: Path to the image
379 |       file_path: Path to the given file.
380 | """
381 | logging.info("GetInode: image_path: %s ", image_path)
382 | logging.info("GetInode: file_path:%s ", file_path)
383 | os_path_spec = path_spec_factory.Factory.NewPathSpec(\
384 | definitions.TYPE_INDICATOR_OS, location=image_path)
385 | ewf_path_spec = path_spec_factory.Factory.NewPathSpec(\
386 | definitions.TYPE_INDICATOR_EWF, parent=os_path_spec)
387 | tsk_partition_path_spec = path_spec_factory.Factory.NewPathSpec(\
388 | definitions.TYPE_INDICATOR_TSK_PARTITION, \
389 | location=u'/p1', parent=ewf_path_spec)
390 | tsk_path_spec = path_spec_factory.Factory.NewPathSpec(\
391 | definitions.TYPE_INDICATOR_TSK, location=file_path, \
392 | parent=tsk_partition_path_spec)
393 | file_entry = resolver.Resolver.OpenFileEntry(tsk_path_spec)
394 |
395 |     if file_entry is None:
396 | return -1
397 |
398 | stat_object = file_entry.GetStat()
399 |
400 | logging.info("Inode: for file %s = %s ",file_path, stat_object.ino)
401 | return(stat_object.ino)
402 |
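  # Usage sketch (the file path is hypothetical; the image ships in
  # disk_images/). Note the spec chain above is pinned to partition /p1.
  #
  #   lister = FileEntryLister()
  #   inode = lister.GetInodeForFile(u'disk_images/fourpartusb1.E01',
  #                                  u'/docs/report.txt')
  #   if inode == -1:
  #       print(u'File not found in /p1')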
403 | def GetBasePathSpec(self, source_path, is_single_part=False):
404 | """Determines the base path specification.
405 |     (If is_single_part is True (when this is called per partition),
406 |     it doesn't descend into checking the individual partitions).
407 |
408 | Args:
409 | source_path: the source path.
410 |
411 | Returns:
412 | The base path specification (instance of dfvfs.PathSpec).
413 |
414 | Raises:
415 |       RuntimeError: if the source path does not exist, or if the source path
416 | is not a file or directory, or if the format of or within
417 | the source file is not supported.
418 | """
419 | if not os.path.exists(source_path):
420 | raise RuntimeError(u'No such source: {0:s}.'.format(source_path))
421 |
422 | stat_info = os.stat(source_path)
423 |
424 | if (not stat.S_ISDIR(stat_info.st_mode) and
425 | not stat.S_ISREG(stat_info.st_mode)):
426 | raise RuntimeError(
427 | u'Unsupported source: {0:s} not a file or directory.'.format(
428 | source_path))
429 |
430 | if stat.S_ISDIR(stat_info.st_mode):
431 | path_spec = path_spec_factory.Factory.NewPathSpec(
432 | definitions.TYPE_INDICATOR_OS, location=source_path)
433 |
434 | elif stat.S_ISREG(stat_info.st_mode):
435 | path_spec = path_spec_factory.Factory.NewPathSpec(
436 | definitions.TYPE_INDICATOR_OS, location=source_path)
437 |
438 | type_indicators = analyzer.Analyzer.GetStorageMediaImageTypeIndicators(
439 | path_spec)
440 |
441 | if len(type_indicators) > 1:
442 | raise RuntimeError((
443 | u'Unsupported source: {0:s} found more than one storage media '
444 | u'image types.').format(source_path))
445 |
446 | if len(type_indicators) == 1:
447 | path_spec = path_spec_factory.Factory.NewPathSpec(
448 | type_indicators[0], parent=path_spec)
449 |
450 | if not type_indicators:
451 | # The RAW storage media image type cannot be detected based on
452 | # a signature so we try to detect it based on common file naming
453 | # schemas.
454 | file_system = resolver.Resolver.OpenFileSystem(path_spec)
455 | raw_path_spec = path_spec_factory.Factory.NewPathSpec(
456 | definitions.TYPE_INDICATOR_RAW, parent=path_spec)
457 |
458 | glob_results = raw.RawGlobPathSpec(file_system, raw_path_spec)
459 | if glob_results:
460 | path_spec = raw_path_spec
461 |
462 | # In case we did not find a storage media image type we keep looking
463 | # since not all RAW storage media image naming schemas are known and
464 |       # its type can only be detected by its content.
465 |
466 | path_spec = self._GetUpperLevelVolumeSystemPathSpec(path_spec, is_single_part)
467 |
468 | # In case we did not find a volume system type we keep looking
469 |     # since we could be dealing with a storage media image that contains
470 | # a single volume.
471 |
472 | type_indicators = analyzer.Analyzer.GetFileSystemTypeIndicators(
473 | path_spec)
474 |
475 | if len(type_indicators) > 1:
476 | raise RuntimeError((
477 | u'Unsupported source: {0:s} found more than one file system '
478 | u'types.').format(source_path))
479 |
480 | if not type_indicators:
481 | logging.warning(u'Unable to find a supported file system.')
482 | path_spec = path_spec_factory.Factory.NewPathSpec(
483 | definitions.TYPE_INDICATOR_OS, location=source_path)
484 |
485 | elif type_indicators[0] != definitions.TYPE_INDICATOR_TSK:
486 | raise RuntimeError((
487 | u'Unsupported source: {0:s} found unsupported file system '
488 | u'type: {1:s}.').format(source_path, type_indicators[0]))
489 |
490 | else:
491 | path_spec = path_spec_factory.Factory.NewPathSpec(
492 | definitions.TYPE_INDICATOR_TSK, location=u'/',
493 | parent=path_spec)
494 |
495 | return path_spec
496 |
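  # The resolution above layers path specs from the outside in:
  # OS (the image file itself) -> storage media type (e.g. EWF for .E01,
  # or RAW detected by filename glob) -> volume system (TSK_PARTITION,
  # /pN) -> filesystem (TSK, rooted at u'/'). Each NewPathSpec call wraps
  # the previous spec as its parent.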
497 | def ListAllFiles(self, source_path):
498 |     """Determines the base path specification and lists all the files,
499 |         per partition, in the given disk image file, into text files
500 |         named <source_path>_filelist_<partition>
501 | Note: Modified routine from the original GetBasePathSpec method
502 |
503 | Args:
504 | source_path: the source path.
505 |
506 | Returns:
507 | The base path specification (instance of dfvfs.PathSpec).
508 |
509 | Raises:
510 | RuntimeError: if the source path does not exist, or if the source path
511 | is not a file or directory, or if the format of or within
512 | the source file is not supported.
513 | """
514 |
515 | fname = sys._getframe().f_code.co_name
516 | logging.debug("%s: Listing files for %s", fname, source_path)
517 | if not os.path.exists(source_path):
518 | raise RuntimeError(u'No such source: {0:s}.'.format(source_path))
519 |
520 | stat_info = os.stat(source_path)
521 |
522 | if (not stat.S_ISDIR(stat_info.st_mode) and
523 | not stat.S_ISREG(stat_info.st_mode)):
524 | raise RuntimeError(
525 | u'Unsupported source: {0:s} not a file or directory.'.format(
526 | source_path))
527 | if stat.S_ISDIR(stat_info.st_mode):
528 | path_spec = path_spec_factory.Factory.NewPathSpec(
529 | definitions.TYPE_INDICATOR_OS, location=source_path)
530 |
531 | elif stat.S_ISREG(stat_info.st_mode):
532 | path_spec = path_spec_factory.Factory.NewPathSpec(
533 | definitions.TYPE_INDICATOR_OS, location=source_path)
534 |
535 | type_indicators = analyzer.Analyzer.GetStorageMediaImageTypeIndicators(
536 | path_spec)
537 |
538 | if len(type_indicators) > 1:
539 | raise RuntimeError((
540 | u'Unsupported source: {0:s} found more than one storage media '
541 | u'image types.').format(source_path))
542 |
543 | if len(type_indicators) == 1:
544 | path_spec = path_spec_factory.Factory.NewPathSpec(
545 | type_indicators[0], parent=path_spec)
546 |
547 | if not type_indicators:
548 | # The RAW storage media image type cannot be detected based on
549 | # a signature so we try to detect it based on common file naming
550 | # schemas.
551 | file_system = resolver.Resolver.OpenFileSystem(path_spec)
552 | raw_path_spec = path_spec_factory.Factory.NewPathSpec(
553 | definitions.TYPE_INDICATOR_RAW, parent=path_spec)
554 |
555 | glob_results = raw.RawGlobPathSpec(file_system, raw_path_spec)
556 | if glob_results:
557 | path_spec = raw_path_spec
558 |
559 | # In case we did not find a storage media image type we keep looking
560 | # since not all RAW storage media image naming schemas are known and
561 |       # its type can only be detected by its content.
562 |
563 | partitions = self._GetUpperLevelVolumeSystemPathSpecForBcnlp(path_spec,\
564 | source_path)
565 |
566 | return partitions
567 | def GetFileEntry(self, base_path_spec):
568 | return resolver.Resolver.OpenFileEntry(base_path_spec)
569 |
570 |
571 | class StdoutWriter(object):
572 | """Class that defines a stdout output writer."""
573 |
574 | def Open(self):
575 | """Opens the output writer object.
576 |
577 | Returns:
578 | A boolean containing True if successful or False if not.
579 | """
580 | return True
581 |
582 | def Close(self):
583 | """Closes the output writer object."""
584 | pass
585 |
586 | def WriteFileEntry(self, path):
587 | """Writes the file path to stdout.
588 |
589 | Args:
590 | path: the path of the file.
591 | """
592 | print(u'{0:s}'.format(path))
593 |
594 |
--------------------------------------------------------------------------------
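As an end-to-end check of FileEntryLister, the base path spec for one of the sample images in disk_images/ can be resolved and its file paths streamed to stdout; sys.stdout works as the output writer here because _ListFileEntry only calls write(). A sketch (assumes the image is present and dfvfs is installed; GetBasePathSpec may prompt for a partition if the image has several):

    import sys
    from bcnlp_listfiles import FileEntryLister

    lister = FileEntryLister()
    # Resolves image -> EWF -> partition -> TSK filesystem in one call.
    spec = lister.GetBasePathSpec('disk_images/govdocs45sampler.E01')
    # Walks the filesystem and writes one file path per line.
    lister.ListFileEntries(spec, sys.stdout, 1)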
/bcnlp_tm.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # coding=UTF-8
3 | #
4 | # BitCurator NLP (Disk Image Access for the Web)
5 | # Copyright (C) 2014 - 2016
6 | # All rights reserved.
7 | #
8 | # This code is distributed under the terms of the GNU General Public
9 | # License, Version 3. See the text file "COPYING" for further details
10 | # about the terms of this license.
11 | #
12 | # This file contains the main BitCurator NLP application for Topic modeling
13 |
14 | # Usage: python bcnlp_tm.py [--config <config.txt>] [--infile <dir>] [--topics <10>] [--tm <gensim|graphlab>]
15 | # Defaults: num_topics = 10, tm = gensim
16 |
17 | import os
18 | import logging
19 | import pyLDAvis
20 | import pyLDAvis.gensim
21 | from gensim import corpora, models, similarities
22 | import gensim
23 | import textract
24 | from bn_filextract import *
25 | from configobj import ConfigObj
26 | from stop_words import get_stop_words
27 |
28 | try:
29 | from argparse import ArgumentParser
30 | except ImportError:
31 |     raise ImportError("This script requires argparse, which is available in Python 2.7+ and 3.2+")
32 |
33 | #logging.basicConfig(filename= 'bcnlp_tm.log', level=logging.DEBUG)
34 | # Note: logging.basicConfig only takes effect on the first call, so the
35 | # INFO configuration below is the one that applies; later calls are no-ops.
36 | logging.basicConfig(filename='bcnlp_tm_info.log', level=logging.INFO)
37 |
38 |
39 | cfg_image = {}
40 | #documents = []
41 |
42 | class BnTopicModel():
43 |
44 | def tm_generate_gensim(self, infile, num_topics, config_file):
45 |         ''' Builds an LDA topic model with the gensim APIs and invokes
46 |             the pyLDAvis GUI. NOTE: This is not yet well tested.
47 | '''
48 | documents = []
49 | documents = bn.bnTraverseInfileDir(infile, documents, config_file)
50 |         if not documents:
51 | print("Documents are empty")
52 |
53 | ''' #Debug
54 | i = 0
55 | for document in documents:
56 | logging.info("Document[%d] = %s ", i, document)
57 | i+=1
58 | '''
59 |
60 | # remove common words and tokenize
61 | '''
62 | stoplist = set('a an the of to for s from is and this \
63 | was were are , - | @ . '.split())
64 | texts = [[word for word in document.lower().split() \
65 | if word not in stoplist] \
66 | for document in documents]
67 | '''
68 |
69 | # Remove stop words - both from known stopword list and from
70 | # configuration file.
71 | # NOTE: Gensim's preprocessing to remove stop words is commented out.
72 |         # This approach seems to do better. Test with more datasets before
73 |         # deciding which one to keep.
74 | exc_list = bn.bnGetConfigInfo(config_file, \
75 | "confset_section", "exclude_words")
76 | en_stop = get_stop_words('en')
77 | en_stop = en_stop + exc_list
78 | logging.info("Stop-words list: %s ", en_stop)
79 | texts = [[word for word in document.lower().split() \
80 | if word not in en_stop] \
81 | for document in documents]
82 |
83 | ## from pprint import pprint # pretty-printer
84 | ## pprint(texts)
85 |
86 | # remove words that appear only once
87 | from collections import defaultdict
88 | frequency = defaultdict(int)
89 | for text in texts:
90 | '''
91 | # NOTE: Commenting for now. With the preprocessing in
92 | # filextract.py, we won't need this. Remove after testing.
93 | i = 0
94 | for word in text:
95 | # NOTE: Some text files need this conversion. See if this can
96 | # be done for the whole document at one time.
97 | text[i] = unicode(word, errors='ignore')
98 | i+=1
99 | '''
100 | for token in text:
101 | frequency[token] += 1
102 |
103 | texts = [[token for token in text if frequency[token] > 1]
104 | for text in texts]
105 |
106 | texts = [[token for token in text if len(token) > 2]
107 | for text in texts]
108 |
109 | # NOTE: lemmatize not working
110 | ###texts = gensim.utils.lemmatize(texts)
111 |
112 | dictionary = corpora.Dictionary(texts)
113 |
114 | ##logging.info("[V]: token:id: %s", dictionary.token2id)
115 |
116 | ## dictionary.compactify()
117 | dictionary.save('/tmp/saved_dict.dict')
118 |
119 | # Now convert tokenized documents to vectors:
120 | corpus = [dictionary.doc2bow(text) for text in texts]
121 |
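        # Example (standard gensim behavior): doc2bow maps each token to a
        # (token_id, count) pair, so a text like [u'disk', u'image', u'disk']
        # becomes [(0, 2), (1, 1)] when the dictionary assigned
        # {u'disk': 0, u'image': 1}.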
122 | ## logging.info("[V] Corpus: %s ", corpus)
123 |
124 | # store to disk, for later use
125 | corpora.MmCorpus.serialize('/tmp/saved_dict.mm', corpus)
126 |
127 | ## Creating Transformations
128 | ## The transformations are standard Python objects, typically
129 | ## initialized (trained) by means of a training corpus:
130 |         ## First, let's use tfidf for training: it simply involves going
131 |         ## through the supplied corpus once and computing the document
132 |         ## frequencies of all its features.
133 |
134 | tfidf = models.TfidfModel(corpus) # step 1 -- initialize a model
135 |
136 | corpus_tfidf = tfidf[corpus]
137 | corpora.MmCorpus.serialize('/tmp/saved_corpus_tfidf.mm', corpus_tfidf)
138 |
139 | '''
140 | # LSI model is commented out for now
141 | print "Printing TFIDF of given corpus \n"
142 | for doc in corpus_tfidf:
143 | print (doc)
144 |
145 | # Now Initialize an LSI transformation: num_topics set to 2 to make
146 | # it 2D lsi = models.LsiModel(corpus_tfidf, id2word=dictionary,
147 | # num_topics=3)
148 |
149 | # create a double wrapper over the original corpus:
150 | # bow->tfidf->fold-in-lsi
151 | corpus_lsi = lsi[corpus_tfidf]
152 |
153 | print "Printing LSI topics"
154 | lsi.print_topics(4)
155 |
156 | for doc in corpus_lsi:
157 | print (doc)
158 | '''
159 |
160 | # Create an LDA model
161 | '''
162 | lda_model = models.LdaModel(corpus_tfidf, \
163 | id2word=dictionary, \
164 | num_topics=5)
165 | '''
166 | lda_model = models.ldamodel.LdaModel(corpus=corpus, \
167 | id2word=dictionary, \
168 | num_topics=num_topics)
169 | corpus_lda = lda_model[corpus]
170 |
171 | corpus_lda_tfidf = lda_model[corpus_tfidf]
172 |
173 | # The following will print the topics in the logfile
174 | logging.info("Printing %s topics into log file: ", str(num_topics))
175 | lda_model.print_topics(num_topics)
176 |
177 | # Generate data for the pyLDAvis interface from the lda_model above
178 | vis_data = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)
179 | ###vis_data = pyLDAvis.gensim.prepare(lda_model, corpus_tfidf, dictionary)
180 | ##vis_data = pyLDAvis.gensim.prepare(lda_model, corpus_lda, dictionary)
181 |
182 | #pyLDAvis.display(vis_data)
183 | pyLDAvis.show(vis_data)
184 |
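        # Note: pyLDAvis.show() serves the visualization from a local web
        # server and blocks until interrupted; for a non-interactive run,
        # pyLDAvis.save_html(vis_data, 'topics.html') writes a standalone
        # page instead (both are stock pyLDAvis calls).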
185 | def remove_punctuation(self, text):
186 | import string
187 | return text.translate(None, string.punctuation)
188 |
189 | def remove_digits(self, text):
190 | import string
191 | return text.translate(None, string.digits)
192 |
193 | def bnRemoveEmptyFiles(self, path):
194 | ''' Traverses the directory and recursively removes empty files.
195 | '''
196 | files = os.listdir(path)
197 | if len(files):
198 | for fl in files:
199 | fullpath = os.path.join(path, fl)
200 | if os.path.isdir(fullpath):
201 | self.bnRemoveEmptyFiles(fullpath)
202 |                 elif os.stat(fullpath).st_size == 0:
203 | logging.info("Removing file %s ", fullpath)
204 | os.remove(fullpath)
205 |
206 | def bn_parse_config_file(config_file, section_name):
207 | ''' Parses the config file to extract the image names and entity list.
208 | '''
209 | logging.info("bn_parse_config_file: Section: %s ", section_name)
210 | config = ConfigObj(config_file)
211 | section = config[section_name]
212 | i = 0
213 | cfg_entity_list = []
214 | for key in section:
215 | #if key == cfg_string:
216 | # found the string
217 | #return section[key]
218 | if section_name == "image_section":
219 | logging.info("parse_config: key: %s, section: %s", \
220 | key, section[key])
221 | cfg_image[i] = key
222 | i+=1
223 | elif section_name == "entity_list_section":
224 |             flag = int(section[key])
225 | if flag == 1:
226 | cfg_entity_list.append(key)
227 | if section_name == "entity_list_section":
228 | return cfg_entity_list
229 |
230 | if __name__ == "__main__":
231 | parser = ArgumentParser(prog='bcnlp_tm.py', description='Topic modeling')
232 | parser.add_argument('--config', action='store', \
233 | help="Config file[config.txt] ")
234 | parser.add_argument('--infile', action='store', help="input directory ")
235 | parser.add_argument('--tm', action='store', \
236 | help="topic modeling :gensim/graphlab ")
237 | parser.add_argument('--topics', action='store', help="number of topics ")
238 |
239 | args = parser.parse_args()
240 |
241 | # Infile specifies the directory of files to run the topic modeling on.
242 | # If no argument specified, it will assume there are disk_images specified
243 | # in the config file config.txt.
244 |
245 | infile = args.infile
246 | tm = args.tm # Topic modeling type: gensim/graphlab
247 | config_file = args.config
248 | is_disk_image = False
249 |
250 | num_topics = 10
251 | if args.topics:
252 |         num_topics = int(args.topics)
253 |
254 | # default it to gensim
255 |     if tm is None:
256 | tm = 'gensim'
257 |
258 |     if config_file is None:
259 | config_file = "config.txt"
260 |
261 | bn = BnFilextract()
262 |     if infile is None:
263 | is_disk_image = True
264 |
265 | bn_parse_config_file(config_file, "image_section")
266 | print(">> Images in the config file: ", cfg_image)
267 |
268 | infile = bn.bnGetConfigInfo(config_file, \
269 | "confset_section", "file_staging_directory")
270 |
271 | i = 0
272 | for img in cfg_image:
273 | print(">> Extracting files from image {}...".format(cfg_image[img]))
274 | bn.bnExtractFiles(None, cfg_image[img], i, None, config_file)
275 | i += 1
276 | print(">> ... Done ")
277 |
278 | '''
279 | # NOTE: We needed this for Graphlab as we didn't do it in graphlab
280 | # routine. If that code is put back we need to make sure we call
281 |     # bnTraverseInfileDir from tm_generate_graphlab
282 | else:
283 | documents = []
284 | print(">> Extracting files from ", infile)
285 | bn.bnTraverseInfileDir(infile, documents, config_file)
286 | '''
287 |
288 | tmc = BnTopicModel()
289 | tmc.tm_generate_gensim(infile, num_topics, config_file)
290 |
291 |
292 |
--------------------------------------------------------------------------------
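For reference, the config.txt sections that bcnlp_tm.py and bn_filextract.py read look roughly like this. The section and key names follow the parsing code; the values are illustrative only:

    [image_section]
    govdocs45sampler.E01 = 1

    [confset_section]
    file_staging_directory = extracted_files
    disk_image_dir = disk_images
    exclude_words = foo, bar
    entity_info = No

    [exclude_format_section]
    .jpg = 1

    [entity_list_section]
    person = 1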
/bn_filextract.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # coding=UTF-8
3 | #
4 | # BitCurator NLP
5 | # Copyright (C) 2016 - 2018
6 | # All rights reserved.
7 | #
8 | # This code is distributed under the terms of the GNU General Public
9 | # License, Version 3. See the text file "COPYING" for further details
10 | # about the terms of this license.
11 | #
12 | # This file contains the File Extraction routines for BitCurator NLP.
13 | #
14 |
15 | import pytsk3
16 | import pyewf
17 | import os
18 | import sys
19 | import string
20 | import time
21 | import re
22 | import logging
23 | from configobj import ConfigObj
24 | import subprocess
25 | from subprocess import Popen,PIPE
26 | #import xml.etree.ElementTree as ET
27 | import textract
28 |
29 | from bcnlp_listfiles import FileEntryLister
30 | from bcnlp_fxtract import FileExtractor
31 |
32 | from gensim.parsing.preprocessing import remove_stopwords
33 | from gensim.parsing.preprocessing import preprocess_documents
34 | from gensim.parsing.preprocessing import stem_text
35 | from gensim.parsing.preprocessing import strip_numeric
36 | from gensim.parsing.preprocessing import strip_punctuation
37 |
38 | class ewf_Img_Info(pytsk3.Img_Info):
39 |
40 | def __init__(self, ewf_handle):
41 | self._ewf_handle = ewf_handle
42 | super(ewf_Img_Info, self).__init__(
43 | url="", type=pytsk3.TSK_IMG_TYPE_EXTERNAL)
44 |
45 | def close(self):
46 | self._ewf_handle.close()
47 |
48 | def read(self, offset, size):
49 | self._ewf_handle.seek(offset)
50 | return self._ewf_handle.read(size)
51 |
52 | def get_size(self):
53 | return self._ewf_handle.get_media_size()
54 |
55 | def bn_getimginfo(image_path):
56 | logging.info("bn_getimginfo: Image Info for image %s: ", image_path)
57 | filenames = pyewf.glob(image_path)
58 | ewf_handle = pyewf.handle()
59 | ewf_handle.open(filenames)
60 |
61 | img = ewf_Img_Info(ewf_handle)
62 | return img
63 |
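# Usage sketch: pyewf.glob expands split EWF segments (.E01, .E02, ...) and
# the wrapper above satisfies the pytsk3.Img_Info interface, so the handle
# can be handed straight to pytsk3 (the offset here is illustrative):
#
#   img = bn_getimginfo("disk_images/fourpartusb1.E01")
#   fs = pytsk3.FS_Info(img, offset=63 * 512)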
64 | # Dict to number of partitions in each image
65 | partition_in = dict()
66 |
67 | logging.basicConfig(filename= 'bcnlp_tm_info.log', level=logging.INFO)
68 | logging.basicConfig(filename= 'bcnlp_tm_debug.log', level=logging.DEBUG)
69 |
70 | class BnFilextract:
71 | """ This class contains the file extraction methods from
72 | disk images.
73 | """
74 | num_partitions = 0
75 | part_array = ["image_path", "addr", "slot_num", "start_offset", "desc"]
76 | partDictList = []
77 | num_partitions_ofimg = dict()
78 |
79 | def bnlpGetFsForImage(self, image_path, image_index, partition_num):
80 | """ Gets the filesystem info for an image and partition,
81 | using Pytsk3 method
82 |
83 | Args:
84 | image_path: Path to disk image
85 |             image_index: Internally maintained serial number of the image
86 | partition_num: Partition within the volume
87 |
88 | Returns:
89 | Filesystem descriptor
90 | """
91 | logging.info('bnlpGetFsForImage: image_path: %s', image_path)
92 | logging.info('bnlpGetFsForImage: index: %s', image_index)
93 | logging.info('bnlpGetFsForImage: part: %s', partition_num)
94 | img = bn_getimginfo(image_path)
95 |
96 | part_start = \
97 | self.partDictList[int(image_index)][partition_num]['start_offset']
98 |
99 | fs = pytsk3.FS_Info(img, offset=(part_start * 512))
100 | return fs
101 |
102 | def bnGetExFmtsFromConfigFile(self, config_file):
103 | """Extract the list of excluded format types from the given
104 | config file.
105 |
106 | Args:
107 | config_file: Configuration file
108 |
109 | Returns:
110 | list of the format types to be excluded.
111 | """
112 | exc_fmt_list = []
113 | config = ConfigObj(config_file)
114 | section = config["exclude_format_section"]
115 | for key in section:
116 | if section[key]:
117 | exc_fmt_list.append(key)
118 |
119 | return exc_fmt_list
120 |
121 |     def bnGetFileContents(self, filename, config_file):
122 |         """Extract the contents of a file while doing nlp on a local
123 |         directory of files as opposed to a disk image.
124 | 
125 |         Args:
126 |             filename: Given file
127 |             config_file: Configuration file
128 |         """
129 |         file_extract_dir = self.bnGetConfigInfo(config_file, \
130 |             "confset_section", "file_staging_directory")
131 | if filename.endswith('.txt') or filename.endswith('.TXT'):
132 | with open(filename, 'r') as tempfile:
133 | #input_file_contents = tempfile.read().replace('\n', '')
134 | input_file_contents = tempfile.read()
135 |
136 | else:
137 | # Eliminate the files that are configured to be excluded
138 | fn, filetype = os.path.splitext(filename)
139 | exc_fmt_list = self.bnGetExFmtsFromConfigFile(config_file)
140 | if filetype in exc_fmt_list:
141 | logging.info("File type %s excluded: %s", filetype, fn)
142 | return None
143 | logging.info("Filename %s is not a txt file. So textracting", \
144 | filename)
145 |
146 | try:
147 | input_file_contents = textract.process(filename)
148 | logging.info(">>> Textract PASSED for file %s ", filename)
149 | #logging.info("bcnlp:: File contents of %s %s ",\
150 | # filename, input_file_contents)
151 | except:
152 | logging.info("\n >>> Textract failed for doc %s ", filename)
153 | return None
154 |
155 | return input_file_contents
156 |
157 | def bnTraverseInfileDir(self, extracted_files, documents, config_file):
158 | ''' This routine traverses the given directory to extract the
159 | files and adds the contents to the global documents list.
160 |
161 | Args:
162 | extracted_files: Directory whose files need to be extracted.
163 |             documents: Where the contents of the files will go.
164 | config_file: Configuration file.
165 | '''
166 |
167 | print("bnTraverseInfileDir: extracted_files: ", extracted_files)
168 | num_docs = 0
169 | for root, dirs, files in os.walk(extracted_files):
170 | path = root.split(os.sep)
171 | '''
172 | logging.info("traverse: path: %s, length: %s ", path, len(path))
173 | logging.info("traverse: dirs: %s ", dirs)
174 | logging.info("traverse: files: %s ", files)
175 | '''
176 | for filename in files:
177 | file_path = '/'.join(path) + '/' + filename
178 | doc = self.bnGetFileContents(file_path, config_file)
179 |                 if doc is None:
180 |                     logging.info(">> No text extracted from %s. Skipping ",
181 |                         file_path)
182 | continue
183 | doc = unicode(doc, errors='ignore')
184 |
185 | logging.info("[V]: traverse: Appending doc %s \
186 | to documents list ", filename)
187 | ##logging.info("[VV]Document %s before preprocessing: %s",\
188 | ## filename, doc)
189 | #doc = remove_stopwords(doc)
190 | doc = strip_punctuation(doc)
191 | doc = strip_numeric(doc)
192 | logging.info("Preprocessing done on DOC %s ", filename)
193 | ## logging.info("[VV]Preprocessing done on %s : %s", \
194 | ## filename, doc)
195 | if doc != None:
196 | documents.append(doc)
197 | num_docs += 1
198 | logging.info("[D]traverse: Total num docs: %d", num_docs)
199 | return documents
200 |
201 | def bnTraverseDirForPlot(self, img, extracted_files, ent, parse_en, config_file):
202 | ''' This routine traverses the given directory to extract the
203 | files and invokes a routine to process the text for plotting
204 | purposes.
205 |
206 | Args:
207 | img: Image index
208 | extracted_files: Directory whose files need to be extracted.
209 | ent: Handle to ParseForEnts class
210 | parse_en: Spacy handle
211 | config_file: Configuration file.
212 | '''
213 |
214 | num_docs = 0
215 | for root, dirs, files in os.walk(extracted_files):
216 | path = root.split(os.sep)
217 |
218 | entity_list = ent.bnParseConfigFileForEnts("config.txt")
219 |
220 | for filename in files:
221 | file_path = '/'.join(path) + '/' + filename
222 | file_contents = self.bnGetFileContents(file_path, config_file)
223 | logging.info("D2: traverse: getting contents from %s ",\
224 | file_path)
225 | logging.info("bcnlpProcesstext for file:%s", file_path)
226 |
227 | try:
228 | ent.bcnlpProcessText(img, filename, unicode(file_contents,\
229 | "utf-8"), entity_list, parse_en, bg=False)
230 | except:
231 | logging.info("bcnlpProcessText failed for img:%s, file:%s",\
232 | str(img), filename)
233 | continue
234 |
235 | num_docs += 1
236 | #logging.info("[V] traverse: Total num docs: %d ", num_docs)
237 |
238 | def bnDfvfsGenerateFileList(self, image_path):
239 |         """ Using dfvfs methods, a file-list is generated from the given
240 |             disk image into an output file with the name:
241 |             <image>_filelist_<partition>
242 |
243 | Args:
244 | image_path: Path to specified image
245 |
246 | """
247 | num_partitions = 1
248 |
249 | file_entry_lister = FileEntryLister()
250 | try:
251 | num_partitions = file_entry_lister.ListAllFiles(image_path)
252 |
253 | except:
254 | print "file_entry_lister failed"
255 | return(0)
256 |
257 | return num_partitions
258 |
259 | def bnCreateDirsInPath(self, file_extract_dir, filepath):
260 | """ Looking at the path of the file, directories are created
261 | if they don't yet exist.
262 |
263 | Args:
264 | file_extract_dir: Directory where files are to be extracted.
265 | filepath: Path to the file
266 | """
267 | filename = os.path.basename(filepath)
268 | dir_name = os.path.dirname(filepath)
269 |
270 | current_dir = os.path.join(os.getcwd(), file_extract_dir)
271 | file_list = filepath.split('/')
272 | logging.info("bnCreateDirsInPath: file_list: %s ", file_list)
273 |
274 | listlen = len(file_list)
275 |
276 | newdir = os.path.join(current_dir, file_list[0])
277 | for i in range (0, listlen-1):
278 | #logging.info("i:%s file_list[i]: %s", i, file_list[i])
279 | if os.path.exists(newdir):
280 | newdir = os.path.join(newdir, file_list[i+1])
281 | else:
282 | logging.info("bnCreateDirsInPath: Creating dir: %s", newdir)
283 | os.mkdir(newdir)
284 | newdir = os.path.join(newdir, file_list[i+1])
285 |
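    # Note: on Python 3.2+ the loop above collapses to
    #   os.makedirs(os.path.join(current_dir, os.path.dirname(filepath)),
    #               exist_ok=True)
    # It is spelled out here because this codebase still targets Python 2.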
286 | def bnQueueFileForExtraction(self,\
287 | base_path_spec,\
288 | image_path, output_path, jobs):
289 | """ This routine pushes the file_entry corresponding to the
290 | given path_spec into the queue for file-extraction using
291 | dfvfs/FileExtractor APIs.
292 | Args:
293 | base_path_spec
294 | image_path: path to disk image
295 | output_path: Where the extracted files will go.
296 | jobs: Jobs queue for extraction task.
297 | """
298 | fname = sys._getframe().f_code.co_name
299 | file_entry_lister = FileEntryLister()
300 |
301 | file_entry = file_entry_lister.GetFileEntry(base_path_spec)
302 |
303 | fe = FileExtractor(base_path_spec, output_path)
304 | jobs.append(fe)
305 | logging.info("[%s]: Jobs before adding to queue: %s ", fname, jobs)
306 | fe.start()
307 | fe.AddFileToQueue(file_entry, image_path)
308 |
309 | fe.Finish()
310 | logging.info("[%s]: Jobs after adding to the queue: %s" ,fname, jobs)
311 |
312 | for job in jobs:
313 | job.join()
314 |
315 | def bnExtractFiles(self, ent, image, image_index, parse_en, config_file):
316 | """ Generate file-list from the disk image and extract the
317 | files into a specified directory.
318 |
319 | Args:
320 | ent: Placeholder
321 | image: disk image
322 | image_index: Internally maintained index for the image
323 | parse_en: Placeholder
324 | config_file: Name of the configuration file.
325 | """
326 | fname = sys._getframe().f_code.co_name
327 | logging.info("%s: Extracting files for img: %s, with config_file: %s ",\
328 | fname, image, config_file)
329 | jobs = []
330 |
331 | config = ConfigObj(config_file)
332 | exc_fmt_list = self.bnGetExFmtsFromConfigFile(config_file)
333 |
334 | file_extract_dir = self.bnGetConfigInfo(config_file, \
335 | "confset_section", "file_staging_directory")
336 |
337 |
338 | disk_image_dir = self.bnGetConfigInfo(config_file, \
339 | "confset_section", "disk_image_dir")
340 |
341 | image_path = os.getcwd() + "/" + disk_image_dir + "/" + image
342 |
343 | file_extract_dir_path = os.getcwd() + '/'+ file_extract_dir
344 |         logging.info("%s: File Extraction directory: %s ", \
345 | fname, file_extract_dir_path)
346 |
347 | print "\n>> Files will be extracted in ", file_extract_dir_path
348 |
349 |         # Create the staging directories directly; no shell needed
350 |         if not os.path.exists(file_extract_dir):
351 |             os.mkdir(file_extract_dir)
352 |         file_extract_dir_per_image = file_extract_dir + '/' + str(image_index)
353 |         if not os.path.exists(file_extract_dir_per_image):
354 |             os.mkdir(file_extract_dir_per_image)
355 | 
356 |
357 | self.num_partitions = \
358 | self.bnlpGetPartInfoForImage(image_path, image_index)
359 |
360 | '''
361 | # Call Dfvfs method to generate the file-list in the image
362 | self.num_partitions = self.bnDfvfsGenerateFileList(image_path)
363 | partition_in[image] = self.num_partitions
364 | '''
365 |
366 | logging.info("%s: # partitions:%s Generating filelist ", fname, \
367 | self.num_partitions)
368 |
369 | logging.info("%s: Generated filelist. Extract contents", fname)
370 |
371 | file_entry_lister = FileEntryLister()
372 | output_path = file_extract_dir_per_image
373 |
374 | for p in range(0, self.num_partitions):
375 | base_path_spec = file_entry_lister.GetBasePathSpec(image_path, True)
376 | logging.info("%s: Extracting contents from part p = %s", fname, p)
377 | self.bnQueueFileForExtraction(base_path_spec, image_path, output_path, jobs)
378 |
379 | def isFileTextractable(self, filename, config_file):
380 | """ Not all files are extractable as text file. Before extracting
381 | a file, it should pass this test.
382 | Args:
383 | filename: Input file
384 | config_file: Name of the config file
385 | """
386 |         logging.info("isFileTextractable: filename: %s ", filename)
387 |
388 |         textractable_suffixes = ('.txt', '.TXT', '.pdf', '.PDF',
389 |                                  '.xml', '.XML', '.doc', '.DOC',
390 |                                  '.htm', '.HTM;1', '.html', '.HTML',
391 |                                  '.jpg', '.JPG')
392 |         if filename.endswith(textractable_suffixes):
396 |
397 |             # if any of the above types are configured to be excluded,
398 | # filter them out.
399 | fn, fe = os.path.splitext(filename)
400 | exc_fmt_list = self.bnGetExFmtsFromConfigFile(config_file)
401 |             logging.info("isFileTextractable: file: %s, exc_fmt_list: %s", \
402 | filename, exc_fmt_list)
403 | if fe in exc_fmt_list:
404 |                 logging.info("isFileTextractable: File %s configured "
405 |                              "to be excluded", filename)
406 | return False
407 | return True
408 | else:
409 | return False
410 |
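411 |     # A compact, illustrative variant of the test above -- a sketch, not
412 |     # used elsewhere in this module; the method name and the lowercase
413 |     # normalization are assumptions made for illustration:
414 |     def isFileTextractableCompact(self, filename, config_file):
415 |         """Sketch: same check as isFileTextractable, via set lookups."""
416 |         textractable = {'.txt', '.pdf', '.xml', '.doc', '.htm', '.html',
417 |                         '.jpg'}
418 |         # ISO9660 file names may carry a ";1" version suffix (".HTM;1")
419 |         ext = os.path.splitext(filename)[1].split(';')[0].lower()
420 |         excluded = [e.lower() for e in
421 |                     self.bnGetExFmtsFromConfigFile(config_file)]
422 |         return ext in textractable and ext not in excluded
423 |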
411 | def bnIsEntityInfoSetInConfig(self, config_file):
412 | """ Filter for some legacy code.
413 | FIXME: Will be removed from here eventually
414 | """
415 | entity_info = self.bnGetConfigInfo(\
416 | config_file, "confset_section", "entity_info")
417 |         return entity_info == "Yes"
421 |
422 | def bnlpDnldFile(self, inode, fs, filepath):
423 | """ Extracts the contents of a given file.
424 | Args:
425 | inode: Inode of the given file
426 |             fs: Filesystem info
427 |             filepath: Destination path for the extracted contents
428 |         """
428 | logging.info("bnlpDnldFile: file_path:%s, inode:%d",filepath, inode)
429 | try:
430 | f = fs.open_meta(inode=inode)
431 | except:
432 | logging.info("fs.open_meta failed for file %s ", filepath)
433 | return
434 |
435 | # Read data and store it in a string
436 | offset = 0
437 | size = f.info.meta.size
438 | BUFF_SIZE = 1024 * 1024
439 |
440 | total_data = ""
441 | while offset < size:
442 | available_to_read = min(BUFF_SIZE, size - offset)
443 | data = f.read_random(offset, available_to_read)
444 | if not data:
445 | # print("Done with reading")
446 | break
447 |
448 | offset += len(data)
449 |             total_data += data
450 | logging.info("bnlpDnldFile: D2: Length OF TOTAL DATA: %s ", \
451 | str(len(total_data)))
452 |
453 | logging.info("bnlpDnldFile: D2: Dumping the contents to filepath %s ",\
454 | filepath)
455 |
456 | try:
457 | with open(filepath, "w") as text_file:
458 | text_file.write(total_data)
459 | except IOError, e:
460 |             print("Opening the file {} failed with error {}".format(filepath, e))
461 | return
462 |
463 | ## print ("D2: Time to index the file ", filepath)
464 | basepath = os.path.dirname(filepath)
465 |
466 | def bnGetConfigInfo(self, config_file, section_name, cfg_string):
467 | """Given the key, extract info from the config file
468 |
469 | Args:
470 | config_file: Configuration filename
471 | section_name: Name of the section within the config file
472 |             cfg_string: What we are looking for - the key
473 | """
474 | config = ConfigObj(config_file)
475 | section = config[section_name]
476 |         if cfg_string in section:
477 |             # found the key
478 |             return section[cfg_string]
479 |         print "bnGetConfigInfo: Key %s not found in section %s" % \
480 |               (cfg_string, section_name)
482 |
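483 |     # Usage sketch (illustrative; the values follow config.txt in this
484 |     # repository):
485 |     #
486 |     #   staging = self.bnGetConfigInfo("config.txt", "confset_section",
487 |     #                                  "file_staging_directory")
488 |     #   # staging == "extracted_files"
489 |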
483 | def bnGetOutDirFromConfig(self, config_file):
484 | config = ConfigObj(config_file)
485 |
486 |         config_section = config['confset_section']
487 |         if "file_staging_directory" in config_section:
488 |             return config_section["file_staging_directory"]
489 |         print("file_staging_directory not in config file")
490 |         return None
497 |
498 | def bnlpGetNumPartsForImage(self, image_path, image_index):
499 | img = bn_getimginfo(image_path)
500 |
501 |         # pytsk3.Volume_Info works only with disk images that have a
502 |         # partition table defined. For file systems like FAT12, with no
503 |         # partition info, we handle the failure as an exception.
504 | try:
505 | volume = pytsk3.Volume_Info(img)
506 | except:
507 | logging.info(">> Volume Info failed. Could be FAT12 ")
508 | self.num_partitions = 1
509 | return (self.num_partitions)
510 |
511 |         self.num_partitions = 0
512 |         for part in volume:
512 | if part.slot_num >= 0:
513 | try:
514 | fs = pytsk3.FS_Info(img, offset=(part.start * 512))
515 | except:
516 | logging.info(">> Exception in pytsk3.FS_Info in prtn:%s ",
517 | self.num_partitions )
518 | continue
519 | self.num_partitions += 1
520 | return (self.num_partitions)
521 |
522 |
523 | def bnlpGetPartInfoForImage(self, image_path, image_index):
524 | img = bn_getimginfo(image_path)
525 | is_partition_info = False
526 |
527 |         # pytsk3.Volume_Info works only with disk images that have a
528 |         # partition table defined. For file systems like FAT12, with no
529 |         # partition info, we handle the failure as an exception.
530 | try:
531 | volume = pytsk3.Volume_Info(img)
532 | is_partition_info = True
533 | except:
534 |             ## print "bnlpGetPartInfoForImage: Volume Info failed.
535 |             ##        Could be FAT12"
536 | self.num_partitions = 1
537 | is_partition_info = False
538 | fs = pytsk3.FS_Info(img, offset=0)
539 |
540 | ## print "D: File System Type Detected ", fs.info.ftype
541 | if fs.info.ftype == pytsk3.TSK_FS_TYPE_FAT12:
542 | fs_desc = "FAT12 file system"
543 | elif fs.info.ftype == pytsk3.TSK_FS_TYPE_ISO9660_DETECT:
544 | fs_desc = "ISO file system"
545 | else:
546 | fs_desc = "Unknown file system"
547 |
548 | self.partDictList.append([])
549 | # First level files and directories off the root
550 | # returns file_list for the root directory
551 | file_list_root = self.bnlpListFiles(fs, "/", image_index, 0)
552 | image_name = os.path.basename(image_path)
553 | self.num_partitions_ofimg[image_name] = self.num_partitions
554 |
555 | # Populate the partDictList for the image.
556 | self.partDictList[image_index].append({self.part_array[0]:image_path, \
557 | self.part_array[1]:0, \
558 | self.part_array[2]:0, \
559 | self.part_array[3]:0, \
560 | self.part_array[4]:fs_desc })
561 | return self.num_partitions
562 |
563 | # For images with partition_info, we continue here.
564 | self.partDictList.append([])
565 |
566 | self.num_partitions = 0
567 | for part in volume:
568 | # The slot_num field of volume object has a value of -1
569 | # for non-partition entries - like Unallocated partition
570 | # and Primary and extended tables. So we will look for this
571 | # field to be >=0 to count partitions with valid file systems
572 | if part.slot_num >= 0:
573 | # Add the entry to the List of dictionaries, partDictList.
574 | # The list will have one dictionary per partition. The image
575 | # name is added as the first element of each partition to
576 |                 # avoid a two-dimensional list.
577 | ## print "D: image_path: ", image_path
578 | ## print "D: part_addr: ", part.addr
579 | ## print "D: part_slot_num: ", part.slot_num
580 | ## print "D: part_start_offset: ", part.start
581 | ## print "D: part_description: ", part.desc
582 | # Open the file system for this image at the extracted
583 | # start_offset.
584 | try:
585 | fs = pytsk3.FS_Info(img, offset=(part.start * 512))
586 | except:
587 | logging.info("Exception in pytsk3.FS_Info for prtn:%s",
588 | self.num_partitions)
589 | continue
590 |
591 | self.partDictList[image_index].append({self.part_array[0]:image_path, \
592 | self.part_array[1]:part.addr, \
593 | self.part_array[2]:part.slot_num, \
594 | self.part_array[3]:part.start, \
595 | self.part_array[4]:part.desc })
596 |
597 | self.num_partitions += 1
598 |
599 |
601 | # First level files and directories off the root
602 | # returns file_list for the root directory
603 | file_list_root = self.bnlpListFiles(fs, "/", image_index, part.slot_num)
604 | ## print(file_list_root)
605 |
606 | image_name = os.path.basename(image_path)
607 | self.num_partitions_ofimg[image_name] = self.num_partitions
608 | logging.info("Number of Partitions for image %s = %s",
609 | image_name, self.num_partitions)
610 | return (self.num_partitions)
611 |
612 | bnlpFileInfo = ['name', 'size', 'mode', 'inode', 'p_inode', 'mtime', \
613 | 'atime', 'ctime', 'isdir', 'deleted', 'name_slug']
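614 |     # Each element of the file_list built below is a dict keyed by the
615 |     # bnlpFileInfo names above. Illustrative entry (values hypothetical):
616 |     #
617 |     #   {'name': 'report.doc', 'size': 10240, 'mode': 493, 'inode': 42,
618 |     #    'p_inode': 5, 'mtime': '2008-04-02T13:00:00Z', 'atime': 1207141200,
619 |     #    'ctime': 1207141200, 'isdir': False, 'deleted': 'No',
620 |     #    'name_slug': 'None'}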
614 | def bnlpListFiles(self, fs, path, image_index, partition_num):
615 | file_list = []
616 | try:
617 | directory = fs.open_dir(path=path)
618 | except:
619 |             print "Error opening file path {}".format(path)
620 | return None
621 |
623 | for f in directory:
624 | is_dir = False
625 | '''
626 | print("Func:bnlpListFiles:root_path:{} size: {} inode: {} \
627 | par inode: {} mode: {} type: {} ".format(f.info.name.name,\
628 | f.info.meta.size, f.info.meta.addr, f.info.name.meta_addr,\
629 | f.info.name.par_addr, f.info.meta.mode, f.info.meta.type))
630 | '''
631 |             # Some files may not have the metadata information, so
632 |             # access it only if it exists.
633 |             if f.info.meta is not None:
634 | if f.info.meta.type == 2:
635 | is_dir = True
636 |
637 |             # Since we are displaying the modified time for the file,
638 |             # convert the mtime to ISO format to be passed in file_list.
639 | ## d = date.fromtimestamp(f.info.meta.mtime)
640 | ## mtime = d.isoformat()
641 | mtime = time.strftime("%FT%TZ",time.gmtime(f.info.meta.mtime))
642 |
643 |
644 |             # TSK meta flag 0x01 is TSK_FS_META_FLAG_ALLOC; if the bit is
645 |             # clear, the entry has been deleted.
646 |             if (int(f.info.meta.flags) & 0x01) == 0:
645 | deleted = "Yes"
646 | else:
647 | deleted = "No"
648 |
649 | # NOTE: A new item "name_slug" is added to those file names which
650 | # have a space. The space is replaced by %20 and saved as name_slug.
651 | # This is used later when a file with a "non-None" name_slug shows
652 | # up at the route. It is recognized as a filename with spaces and
653 | # using the inode comparison, its real name is extracted before
654 | # downloading the file.
655 | name_slug = "None"
656 | if " " in f.info.name.name:
657 | name_slug = f.info.name.name.replace(" ", "%20")
658 | file_list.append({self.bnlpFileInfo[0]:f.info.name.name, \
659 | self.bnlpFileInfo[1]:f.info.meta.size, \
660 | self.bnlpFileInfo[2]:f.info.meta.mode, \
661 | self.bnlpFileInfo[3]:f.info.meta.addr, \
662 | self.bnlpFileInfo[4]:f.info.name.par_addr, \
663 | self.bnlpFileInfo[5]:mtime, \
664 | self.bnlpFileInfo[6]:f.info.meta.atime, \
665 | self.bnlpFileInfo[7]:f.info.meta.ctime, \
666 | self.bnlpFileInfo[8]:is_dir, \
667 | self.bnlpFileInfo[9]:deleted, \
668 | self.bnlpFileInfo[10]:name_slug })
669 |
670 | ##print("Func:bnlpListFiles: Listing Directory for PATH: ", path)
671 | ##print file_list
672 | ##print "\n\n"
673 | return file_list
674 |
--------------------------------------------------------------------------------
/config.txt:
--------------------------------------------------------------------------------
1 | #
2 | # bitcurator-nlp-gentm config file
3 | #
4 |
5 | # Disk images to process (the default location can be changed in the following section)
6 | [image_section]
7 | govdocs45sampler.E01 = 1
8 |
9 | # Configuration settings. Where to find disk images and store intermediary files.
10 | [confset_section]
11 | disk_image_dir = "disk_images"
12 | file_staging_directory = "extracted_files"
13 | nlp_dir = "bcnlp"
14 | spacy_outfile = "spacy_outfile"
15 | entity_info = "No"
16 | num_iterations = 200
17 | exclude_words = "excludeme", "andme"
18 |
19 | # Formats to exclude when extracting text using textract
20 | [exclude_format_section]
21 | .jpg=1
22 | .JPG=1
23 | .mp3=1
24 | .wav=1
25 |
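26 |
27 | # Example (illustrative): to also process the other image bundled under
28 | # disk_images, add it to [image_section] above:
29 | #   fourpartusb1.E01 = 1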
--------------------------------------------------------------------------------
/disk_images/fourpartusb1.E01:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BitCurator/bitcurator-nlp-gentm/f75b4908862b5280949b783d409d43dd59034e49/disk_images/fourpartusb1.E01
--------------------------------------------------------------------------------
/disk_images/govdocs45sampler.E01:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BitCurator/bitcurator-nlp-gentm/f75b4908862b5280949b783d409d43dd59034e49/disk_images/govdocs45sampler.E01
--------------------------------------------------------------------------------
/externals/README.md:
--------------------------------------------------------------------------------
1 | # Support libraries
2 |
3 | This project uses libewf-20140608 to maintain compatibility with The Sleuth Kit. The source is included here as the upstream developer(s) keep moving it around.
4 |
5 | ## Documentation
6 |
7 | Additional project information can be found on the BitCurator NLP wiki at https://github.com/BitCurator/bitcurator-nlp/wiki.
8 |
9 | ## License(s)
10 |
11 | The BitCurator logo, BitCurator project documentation, and other non-software products of the BitCurator team are subject to the Creative Commons Attribution 4.0 International license (CC BY 4.0).
12 |
13 | Unless otherwise indicated, software items in this repository are distributed under the terms of the GNU Lesser General Public License, Version 3. See the text file "COPYING" for further details about the terms of this license.
14 |
15 | In addition to software produced by the BitCurator team, BitCurator packages and modifies open source software produced by other developers. Licenses and attributions are retained here where applicable.
16 |
17 |
--------------------------------------------------------------------------------
/externals/libewf-20140608.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BitCurator/bitcurator-nlp-gentm/f75b4908862b5280949b783d409d43dd59034e49/externals/libewf-20140608.tar.gz
--------------------------------------------------------------------------------
/externals/libuna-alpha-20150927.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BitCurator/bitcurator-nlp-gentm/f75b4908862b5280949b783d409d43dd59034e49/externals/libuna-alpha-20150927.tar.gz
--------------------------------------------------------------------------------
/setup.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | #
4 | # setup.sh: Build and configuration script for nlp-webtools
5 | #
6 | # This script sets up a correctly configured environment for the topic modeling tool.
7 | # It should only be run once prior to running "python bcnlp_tm.py" for the first
8 | # time.
9 | #
10 |
11 | LOG_BASE=/tmp
12 |
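13 | # Terminal colors used by the echo helpers below. These are assumed
14 | # definitions: GC, RC, and EC are referenced in this script but not set
15 | # anywhere in it.
16 | GC=$'\033[1;32m'   # green for status messages
17 | RC=$'\033[1;31m'   # red for error messages
18 | EC=$'\033[0m'      # reset
19 |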
13 | #--- FUNCTION -----------------------------------------------------------------
14 | # NAME: echoinfo
15 | # DESCRIPTION: Echo information to stdout.
16 | #------------------------------------------------------------------------------
17 | echoinfo() {
18 | printf "%s * STATUS%s: %s\n" "${GC}" "${EC}" "$@";
19 | }
20 |
21 | #--- FUNCTION -----------------------------------------------------------------
22 | # NAME: echoerror
23 | # DESCRIPTION: Echo errors to stderr.
24 | #------------------------------------------------------------------------------
25 | echoerror() {
26 | printf "%s * ERROR%s: %s\n" "${RC}" "${EC}" "$@" 1>&2;
27 | }
28 |
29 | #--- FUNCTION -----------------------------------------------------------------
30 | # NAME: __apt_get_install_noinput
31 | # DESCRIPTION: (DRY) apt-get install with noinput options
32 | #------------------------------------------------------------------------------
33 | __apt_get_install_noinput() {
34 | apt-get install -y -o DPkg::Options::=--force-confold "$@"; return $?
35 | #yes | aptdcon --hide-terminal --install "$@"; return $?
36 | }
37 |
38 | #--- FUNCTION -----------------------------------------------------------------
39 | # NAME: __pip_install_noinput
40 | # DESCRIPTION: (DRY) pip install with upgrade flags
41 | #------------------------------------------------------------------------------
42 |
43 | __pip_install_noinput() {
44 | # Uncomment for Python 3
45 | #pip3 install --upgrade $@; return $?
46 | pip2 install --upgrade $@; return $?
47 | }
48 |
49 | #--- FUNCTION -----------------------------------------------------------------
50 | # NAME: __pip_pre_install_noinput
51 | # DESCRIPTION: (DRY) pip install with pre-release and upgrade flags
52 | #------------------------------------------------------------------------------
53 | __pip_pre_install_noinput() {
54 | # Uncomment for Python 3
55 | #pip3 install --pre --upgrade $@; return $?
56 | pip2 install --pre --upgrade $@; return $?
57 | }
58 |
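59 | #--- FUNCTION -----------------------------------------------------------------
60 | # NAME: __enable_universe_repository / __apt_get_upgrade_noinput
61 | # DESCRIPTION: Minimal sketches of two helpers that are called below but
62 | # not defined in this script; the bodies are assumptions.
63 | #------------------------------------------------------------------------------
64 | __enable_universe_repository() {
65 |     add-apt-repository -y "deb http://archive.ubuntu.com/ubuntu $(lsb_release -sc) universe"; return $?
66 | }
67 |
68 | __apt_get_upgrade_noinput() {
69 |     apt-get upgrade -y -o DPkg::Options::=--force-confold; return $?
70 | }
71 |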
59 | install_ubuntu_deps() {
60 |
61 | echoinfo "Updating your APT Repositories ... "
62 | apt-get update >> $LOG_BASE/nlp-install.log 2>&1 || return 1
63 |
64 | echoinfo "Installing Python Software Properties ... "
65 | __apt_get_install_noinput software-properties-common >> $LOG_BASE/nlp-install.log 2>&1 || return 1
66 |
67 | echoinfo "Enabling Universal Repository ... "
68 | __enable_universe_repository >> $LOG_BASE/nlp-install.log 2>&1 || return 1
69 |
70 | echoinfo "Updating Repository Package List ..."
71 | apt-get update >> $LOG_BASE/nlp-install.log 2>&1 || return 1
72 |
73 | echoinfo "Upgrading all packages to latest version ..."
74 | __apt_get_upgrade_noinput >> $LOG_BASE/nlp-install.log 2>&1 || return 1
75 |
76 | return 0
77 | }
78 |
79 | install_ubuntu_packages() {
80 | packages="antiword
81 | automake
82 | curl
83 | dkms
84 | ffmpeg
85 | flac
86 | g++-5
87 | gcc-5
88 | lame
89 | libffi-dev
90 | libjpeg-dev
91 | liblzma-dev
92 | libmad0
93 | libpulse-dev
94 | libsox-fmt-mp3
95 | libtool
96 | libxml2-dev
97 | libxslt1-dev
98 | lzma
99 | poppler-utils
100 | pstotext
101 | python
102 | python-dev
103 | python-pip
104 | python3-dev
105 | python3-pip
106 | sox
107 | swig
108 | swig3.0
109 | tesseract-ocr
110 | unrtf
111 | virtualbox-guest-utils
112 | virtualenv
113 | virtualenvwrapper
114 | zlib1g-dev"
115 |
116 |     if [ "$1" = "dev" ]; then
117 |         packages="$packages"
118 |     elif [ "$1" = "stable" ]; then
119 | packages="$packages"
120 | fi
121 |
122 | for PACKAGE in $packages; do
123 | __apt_get_install_noinput $PACKAGE >> $LOG_BASE/nlp-install.log 2>&1
124 | ERROR=$?
125 | if [ $ERROR -ne 0 ]; then
126 | echoerror "Install Failure: $PACKAGE (Error Code: $ERROR)"
127 | else
128 | echoinfo "Installed Package: $PACKAGE"
129 | fi
130 | done
131 |
132 | return 0
133 | }
134 |
135 | install_ubuntu_pip_packages() {
136 |
137 | pip_packages="textract
138 | gensim
139 | pyLDAvis
140 | stop_words
141 | configobj"
142 | pip_special_packages="textacy"
143 |
144 |     if [ "$1" = "dev" ]; then
145 |         pip_packages="$pip_packages"
146 |     elif [ "$1" = "stable" ]; then
147 | pip_packages="$pip_packages"
148 | fi
149 |
150 | ERROR=0
151 |
152 | for PACKAGE in $pip_packages; do
153 | CURRENT_ERROR=0
154 |         echoinfo "Installing Python Package: $PACKAGE"
155 | __pip_install_noinput $PACKAGE >> $LOG_BASE/nlp-install.log 2>&1 || (let ERROR=ERROR+1 && let CURRENT_ERROR=1)
156 | if [ $CURRENT_ERROR -eq 1 ]; then
157 | echoerror "Python Package Install Failure: $PACKAGE"
158 | fi
159 | done
160 |
161 | # Prep environment for special packages, install cld2-cffi
162 | #env CC=/usr/bin/gcc-5 pip3 install -U cld2-cffi
163 | env CC=/usr/bin/gcc-5 pip install -U cld2-cffi
164 |
165 | for PACKAGE in $pip_special_packages; do
166 | CURRENT_ERROR=0
167 |         echoinfo "Installing Python (special setup) Package: $PACKAGE"
168 | __pip_pre_install_noinput $PACKAGE >> $LOG_BASE/nlp-install.log 2>&1 || (let ERROR=ERROR+1 && let CURRENT_ERROR=1)
169 | if [ $CURRENT_ERROR -eq 1 ]; then
170 | echoerror "Python Package Install Failure: $PACKAGE"
171 | fi
172 | done
173 |
174 | if [ $ERROR -ne 0 ]; then
175 |         echoerror "One or more Python packages failed to install (see $LOG_BASE/nlp-install.log)"
176 | return 1
177 | fi
178 |
179 | return 0
180 | }
181 |
182 | install_source_packages() {
183 |
184 | # Install libuna from specific release
185 | echoinfo "nlp-webtools: Building and installing libuna"
186 | CDIR=$(pwd)
187 | # Newer versions break a lot of stuff. Keep 20150927 for now.
188 | # wget -q https://github.com/libyal/libuna/releases/download/20170112/libuna-alpha-20170112.tar.gz
189 | cd /tmp
190 | cp $HOME/bitcurator-nlp-gentm/externals/libuna-alpha-20150927.tar.gz .
191 | tar zxf libuna-alpha-20150927.tar.gz >> $HOME/nlp-install.log 2>&1
192 | cd libuna-20150927
193 | ./configure >> $HOME/nlp-install.log 2>&1
194 | make -s >> $HOME/nlp-install.log 2>&1
195 | make install >> $HOME/nlp-install.log 2>&1
196 | ldconfig >> $HOME/nlp-install.log 2>&1
197 |
198 | # Now clean up
199 | cd /tmp
200 | rm -rf libuna-20150927
201 | rm libuna-alpha-20150927.tar.gz
202 |
203 | # Install libewf from current sources
204 | echoinfo "nlp-webtools: Building and installing libewf"
205 | CDIR=$(pwd)
206 |
207 | # Newer versions break a lot of stuff. Keep 20140608 for now.
208 | cd /tmp
209 | cp $HOME/bitcurator-nlp-gentm/externals/libewf-20140608.tar.gz .
210 | tar zxf libewf-20140608.tar.gz >> $HOME/nlp-install.log 2>&1
211 | cd libewf-20140608
212 | ./configure --enable-python --enable-v1-api >> $HOME/nlp-install.log 2>&1
213 | make -s >> $HOME/nlp-install.log 2>&1
214 | make install >> $HOME/nlp-install.log 2>&1
215 | ldconfig >> $HOME/nlp-install.log 2>&1
216 |
217 | # Now clean up
218 | cd /tmp
219 | rm -rf libewf-20140608
220 | rm libewf-20140608.tar.gz
221 |
222 | echoinfo "nlp-webtools: Adding DFXML tools and libraries"
223 | CDIR=$(pwd)
224 | git clone https://github.com/simsong/dfxml /usr/share/dfxml >> $HOME/nlp-install.log 2>&1
225 | # No cleanup needed
226 | cd /tmp
227 |
228 | # Install The Sleuth Kit (TSK) from current sources
229 | echoinfo "nlp-webtools: Building and installing The Sleuth Kit"
230 | CDIR=$(pwd)
231 | git clone --recursive https://github.com/sleuthkit/sleuthkit /usr/share/sleuthkit >> $HOME/nlp-install.log 2>&1
232 | cd /usr/share/sleuthkit
233 | git fetch
234 | git checkout master >> $HOME/nlp-install.log 2>&1
235 | ./bootstrap >> $HOME/nlp-install.log 2>&1
236 | ./configure >> $HOME/nlp-install.log 2>&1
237 | make -s >> $HOME/nlp-install.log 2>&1
238 | make install >> $HOME/nlp-install.log 2>&1
239 | ldconfig >> $HOME/nlp-install.log 2>&1
240 |
241 | # Install PyTSK
242 | echoinfo "nlp-webtools: Building and installing PyTSK (Python bindings for TSK)"
243 | echoinfo " -- Please be patient. This may take several minutes..."
244 | CDIR=$(pwd)
245 | cd /tmp
246 | git clone https://github.com/py4n6/pytsk
247 | cd pytsk
248 | python setup.py update >> $HOME/nlp-install.log 2>&1
249 | python setup.py build >> $HOME/nlp-install.log 2>&1
250 | python setup.py install >> $HOME/nlp-install.log 2>&1
251 | # Now clean up
252 | cd /tmp
253 | #rm -rf pytsk3-20170508
254 | rm -rf pytsk
255 |
256 | }
257 |
258 | complete_message() {
259 | echo
260 | echo "Installation Complete!"
261 | echo
262 | }
263 |
264 | echo "Installing core dependencies...."
265 | install_ubuntu_deps
266 |
267 | echo "Installing Ubuntu packages...."
268 | install_ubuntu_packages stable
269 |
270 | echo "Installing pip packages...."
271 | install_ubuntu_pip_packages stable
272 |
273 | echo "Installing source packages...."
274 | install_source_packages
275 |
276 | # echo "current directory1: ${PWD} "
277 | echo "Installing textract support packages..."
278 | sudo apt-get install -y libxml2-dev libxslt1-dev antiword unrtf poppler-utils pstotext tesseract-ocr flac ffmpeg lame libmad0 libsox-fmt-mp3 libpulse-dev sox swig swig3.0 libjpeg-dev zlib1g-dev
279 |
280 |
281 | # echo "current directory2: ${PWD} "
282 | echo "Installing textract..."
283 | sudo pip install textract
284 |
285 | # No longer using graphlab
286 | #echo "Installing graphlab..."
287 | #sudo pip install --upgrade --no-cache-dir https://get.graphlab.com/GraphLab-Create/2.1/[user_email]/[license_key]/GraphLab-Create-License.tar.gz
288 |
289 | echo "Installing configObj..."
290 | pip install configobj
291 |
292 | echo "Installing gensim..."
293 | pip install gensim
294 |
295 | echo "Installing pyLDAvis..."
296 | pip install pyLDAvis
297 |
298 | # The following are needed for bn_plot
299 | pip install matplotlib
300 | pip install spacy
301 | python -m spacy download en
302 |
303 | echo "Installing dfvfs..."
304 | curl -O https://raw.githubusercontent.com/log2timeline/dfvfs/master/requirements.txt
305 | pip install -r requirements.txt
306 | pip install dfvfs
307 |
308 | complete_message
309 |
--------------------------------------------------------------------------------