├── .gitignore ├── LICENSE.txt ├── MANIFEST.in ├── README.md ├── dataset └── README.md ├── nerlogparser ├── data │ ├── chars.txt │ ├── glove.6B.300d.trimmed.npz │ ├── label.txt │ ├── tags.txt │ ├── test.txt │ └── words.txt ├── dataformat │ ├── __init__.py │ └── toconll.py ├── grammar │ ├── __init__.py │ ├── authlog.py │ ├── bluegenelog.py │ ├── csvlog.py │ ├── daemonlog.py │ ├── debuglog.py │ ├── dmesglog.py │ ├── grammar_utility.py │ ├── kernellog.py │ ├── kippolog.py │ ├── messageslog.py │ ├── proxifierlog.py │ ├── weblog.py │ └── zookeeperlog.py ├── model │ ├── __init__.py │ ├── base_model.py │ ├── build_data.py │ ├── config.py │ ├── data_utils.py │ ├── evaluate.py │ ├── general_utils.py │ ├── ner_model.py │ └── train.py ├── nerlogparser.py ├── output │ ├── __init__.py │ └── to_json.py ├── preprocessing │ ├── Preprocessing.py │ ├── Splitting.py │ ├── __init__.py │ └── config │ │ └── datasets.conf ├── results │ └── test │ │ ├── events.out.tfevents.1533503273.seitpc80 │ │ ├── log.txt │ │ └── model.weights │ │ ├── .data-00000-of-00001 │ │ ├── .index │ │ ├── .meta │ │ └── checkpoint └── shell │ └── nerlogparser_shell.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *sh 2 | *pyc 3 | *.DS_Store 4 | __pycache__/ 5 | .idea 6 | *.egg-info/ 7 | build 8 | dist 9 | results -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | Copyright 2017 Guillaume Genthial 179 | 180 | Licensed under the Apache License, Version 2.0 (the "License"); 181 | you may not use this file except in compliance with the License. 182 | You may obtain a copy of the License at 183 | 184 | http://www.apache.org/licenses/LICENSE-2.0 185 | 186 | Unless required by applicable law or agreed to in writing, software 187 | distributed under the License is distributed on an "AS IS" BASIS, 188 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 189 | See the License for the specific language governing permissions and 190 | limitations under the License. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | graft dataset 2 | graft nerlogparser 3 | include LICENSE.txt 4 | include README.md -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # nerlogparser: An automatic log parser 2 | 3 | This is source code implementation of our paper entitled ["Automatic log parser to support forensic analysis"](http://researchrepository.murdoch.edu.au/id/eprint/42841/) published in the 16th Australian Digital Forensics Conference, pp. 1-10, 2018. We name the tool as `nerlogparser` because it uses named entity recognition (NER) technique to parse log files. This repository is a fork from [sequence_tagging](https://github.com/guillaumegenthial/sequence_tagging) by Guillaume Genthial. 4 | 5 | ## Requirements 6 | 1. Python 3.5 7 | 2. TensorFlow 1.4.1 8 | 3. nltk 3.4 9 | 10 | ## How to install 11 | 1. 
Create a new directory for `nerlogparser` in your home directory 12 | 13 | `mkdir $HOME/nerlogparser` 14 | 15 | 2. Create a virtual environment in the newly created directory with the required Python version (3.5) 16 | 17 | `virtualenv $HOME/nerlogparser -p /usr/bin/python3.5` 18 | 19 | 3. Activate the virtual environment 20 | 21 | `source $HOME/nerlogparser/bin/activate` 22 | 23 | 4. Install `nerlogparser` 24 | 25 | `pip install nerlogparser` 26 | 27 | ## How to run 28 | 1. Make sure you are still in the virtual environment 29 | 2. For example, run `nerlogparser` to parse the authentication log file `/var/log/auth.log` and print the output to the screen 30 | 31 | `nerlogparser -i /var/log/auth.log` 32 | 33 | 3. You can save the parsing results to an output file such as `parsed-auth.json`. At the moment, the only supported output format is JSON. 34 | 35 | `nerlogparser -i /var/log/auth.log -o parsed-auth.json` 36 | 37 | 4. Show the `nerlogparser` help 38 | 39 | `nerlogparser -h` 40 | 41 | ## Import from your Python script 42 | 43 | ```python 44 | import pprint 45 | from nerlogparser.nerlogparser import Nerlogparser 46 | 47 | parser = Nerlogparser() 48 | parsed_logs = parser.parse_logs('/var/log/auth.log') 49 | 50 | for line_id, parsed in parsed_logs.items(): 51 | print('Line:', line_id) 52 | pprint.pprint(parsed) 53 | print() 54 | ``` 55 | 56 | ## License 57 | Apache License 2.0. Please check [LICENSE](https://github.com/studiawan/nerlogparser/blob/master/LICENSE.txt). 58 | -------------------------------------------------------------------------------- /dataset/README.md: -------------------------------------------------------------------------------- 1 | # Download datasets 2 | Please download all datasets used to train `nerlogparser` from this link: https://mega.nz/#F!mNhWXTDS. 3 | The decryption key is !Rc-u3XLUywDYLIQWDkn80Q 4 | 5 | After the download has finished, copy the log files from the various sources into the directory `nerlogparser/datasets`. 6 | -------------------------------------------------------------------------------- /nerlogparser/data/chars.txt: -------------------------------------------------------------------------------- 1 | 9 2 | n 3 | B 4 | [ 5 | - 6 | | 7 | N 8 | a 9 | > 10 | T 11 | ! 12 | U 13 | Z 14 | # 15 | , 16 | l 17 | 0 18 | ; 19 | m 20 | i 21 | q 22 | f 23 | g 24 | O 25 | 7 26 | H 27 | { 28 | ^ 29 | F 30 | 1 31 | ) 32 | 8 33 | & 34 | 6 35 | ' 36 | K 37 | G 38 | b 39 | o 40 | M 41 | z 42 | \ 43 | + 44 | d 45 | V 46 | * 47 | 5 48 | D 49 | 4 50 | c 51 | u 52 | j 53 | J 54 | C 55 | ` 56 | 3 57 | @ 58 | ] 59 | 2 60 | : 61 | L 62 | w 63 | ( 64 | s 65 | k 66 | X 67 | v 68 | = 69 | y 70 | x 71 | _ 72 | " 73 | R 74 | $ 75 | Q 76 | A 77 | % 78 | Y 79 | t 80 | P 81 | e 82 | h 83 | / 84 | ? 85 | E 86 | p 87 | ~ 88 | W 89 | S 90 | . 
91 | < 92 | r 93 | I 94 | } -------------------------------------------------------------------------------- /nerlogparser/data/glove.6B.300d.trimmed.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/studiawan/nerlogparser/4dc3d955f735ea5496557ee76378a38b5746e425/nerlogparser/data/glove.6B.300d.trimmed.npz -------------------------------------------------------------------------------- /nerlogparser/data/label.txt: -------------------------------------------------------------------------------- 1 | TIM timestamp 2 | SEQ sequence_number 3 | LEV level 4 | HOS hostname 5 | SER service 6 | SUB subservice 7 | UTIM unix_time 8 | O message 9 | SOC sock 10 | NUM number 11 | COR core 12 | SOU source 13 | ARC arch 14 | DOM domain_or_ip 15 | STA status 16 | IPA ip_address 17 | DAS dash 18 | AUT auth 19 | COM command 20 | STC status_code 21 | BYT num_bytes 22 | REF referrer 23 | CLI client_agent 24 | JOB job -------------------------------------------------------------------------------- /nerlogparser/data/tags.txt: -------------------------------------------------------------------------------- 1 | I-COM 2 | I-SOU 3 | I-CLI 4 | I-JOB 5 | I-HOS 6 | B-CLI 7 | I-COR 8 | I-REF 9 | B-TIM 10 | O 11 | B-UTIM 12 | I-BYT 13 | I-TIM 14 | B-JOB 15 | I-IPA 16 | I-SEQ 17 | I-STC 18 | I-AUT 19 | I-STA 20 | I-UTIM 21 | B-SER 22 | B-DOM 23 | I-SOC 24 | I-SUB 25 | B-SUB 26 | B-COM 27 | I-SER 28 | I-NUM 29 | B-STA 30 | B-REF 31 | I-DAS 32 | I-LEV 33 | I-DOM 34 | I-ARC -------------------------------------------------------------------------------- /nerlogparser/data/test.txt: -------------------------------------------------------------------------------- 1 | Jean B-PER 2 | Pierre I-PER 3 | lives O 4 | in O 5 | New B-LOC 6 | York I-LOC 7 | . O 8 | 9 | The O 10 | European B-ORG 11 | Union I-ORG 12 | is O 13 | a O 14 | political O 15 | and O 16 | economic O 17 | union O 18 | 19 | A O 20 | French B-MISC 21 | American I-MISC 22 | actor O 23 | won O 24 | an O 25 | oscar O 26 | 27 | Jean B-PER 28 | Pierre I-PER 29 | lives O 30 | in O 31 | New B-LOC 32 | York I-LOC 33 | . O 34 | 35 | The O 36 | European B-ORG 37 | Union I-ORG 38 | is O 39 | a O 40 | political O 41 | and O 42 | economic O 43 | union O 44 | 45 | A O 46 | French B-MISC 47 | American I-MISC 48 | actor O 49 | won O 50 | an O 51 | oscar O 52 | 53 | Jean B-PER 54 | Pierre I-PER 55 | lives O 56 | in O 57 | New B-LOC 58 | York I-LOC 59 | . O 60 | 61 | The O 62 | European B-ORG 63 | Union I-ORG 64 | is O 65 | a O 66 | political O 67 | and O 68 | economic O 69 | union O 70 | 71 | A O 72 | French B-MISC 73 | American I-MISC 74 | actor O 75 | won O 76 | an O 77 | oscar O 78 | 79 | Jean B-PER 80 | Pierre I-PER 81 | lives O 82 | in O 83 | New B-LOC 84 | York I-LOC 85 | . 
O 86 | 87 | The O 88 | European B-ORG 89 | Union I-ORG 90 | is O 91 | a O 92 | political O 93 | and O 94 | economic O 95 | union O 96 | 97 | A O 98 | French B-MISC 99 | American I-MISC 100 | actor O 101 | won O 102 | an O 103 | oscar O 104 | -------------------------------------------------------------------------------- /nerlogparser/dataformat/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/studiawan/nerlogparser/4dc3d955f735ea5496557ee76378a38b5746e425/nerlogparser/dataformat/__init__.py -------------------------------------------------------------------------------- /nerlogparser/dataformat/toconll.py: -------------------------------------------------------------------------------- 1 | from nltk import pos_tag, conlltags2tree 2 | 3 | 4 | class ToConll(object): 5 | def __init__(self): 6 | self.entity_names = { 7 | 'timestamp': 'I-TIM', 8 | 'timestamp2': 'B-TIM', 9 | 'sequence_number': 'I-SEQ', 10 | 'level': 'I-LEV', 11 | 'hostname': 'I-HOS', 12 | 'service2': 'B-SER', 13 | 'service': 'I-SER', 14 | 'subservice2': 'B-SUB', 15 | 'subservice': 'I-SUB', 16 | 'unix_time2': 'B-UTIM', 17 | 'unix_time': 'I-UTIM', 18 | 'message': 'O', 19 | 'sock': 'I-SOC', 20 | 'number': 'I-NUM', 21 | 'core1': 'I-COR', 22 | 'core2': 'I-COR', 23 | 'timestamp_bgl': 'I-TIM', 24 | 'source': 'I-SOU', 25 | 'arch': 'I-ARC', 26 | 'domain_or_ip': 'I-DOM', 27 | 'domain_or_ip2': 'B-DOM', 28 | 'status': 'I-STA', 29 | 'status2': 'B-STA', 30 | 'ip_address': 'I-IPA', 31 | 'dash': 'I-DAS', 32 | 'auth': 'I-AUT', 33 | 'command': 'I-COM', 34 | 'command2': 'B-COM', 35 | 'status_code': 'I-STC', 36 | 'num_bytes': 'I-BYT', 37 | 'referrer': 'I-REF', 38 | 'referrer2': 'B-REF', 39 | 'client_agent': 'I-CLI', 40 | 'client_agent2': 'B-CLI', 41 | 'job': 'I-JOB', 42 | 'job2': 'B-JOB' 43 | } 44 | self.classes = ['I-TIM', 'B-TIM', 'I-SEQ', 'B-SEQ' 'I-LEV', 'B-LEV' 'I-HOS', 'B-HOS', 'I-SER', 'B-SER', 45 | 'B-SUB', 'I-SUB', 'B-UTIM', 'I-UTIM', 'O', 'B-SOC', 'I-SOC', 'B-NUM', 'I-NUM', 'I-COR', 'B-COR', 46 | 'B-SOU', 'I-SOU', 'B-ARC', 'I-ARC', 'I-DOM', 'B-DOM', 'I-STA', 'B-STA', 'B-IPA', 'I-IPA', 47 | 'I-DAS', 'B-DAS', 'B-AUT', 'I-AUT', 'B-COM', 'I-COM', 'B-STC', 'I-STC', 'B-BYT', 'I-BYT', 48 | 'I-REF', 'B-REF', 'I-CLI', 'B-CLI', 'I-JOB', 'B-JOB'] 49 | 50 | def __get_conll_format(self, value_split, value_split_len, entity, stanford=False): 51 | if stanford is True: 52 | underscore = ' ' 53 | else: 54 | underscore = ' _ _ ' 55 | 56 | conll_format = '' 57 | if entity != 'message': 58 | if value_split_len == 1: 59 | conll_format += value_split[0] + underscore + self.entity_names[entity] + '\n' 60 | else: 61 | for index, value_name in enumerate(value_split): 62 | if index == 0: 63 | conll_format += value_name + underscore + self.entity_names[entity + '2'] + '\n' 64 | else: 65 | conll_format += value_name + underscore + self.entity_names[entity] + '\n' 66 | else: 67 | for value_name in value_split: 68 | conll_format += value_name + underscore + self.entity_names[entity] + '\n' 69 | 70 | return conll_format 71 | 72 | def __get_conll_pos(self, value_pos, value_split_len, entity): 73 | conll_format = '' 74 | if entity != 'message': 75 | if value_split_len == 1: 76 | conll_format += value_pos[0][0] + ' ' + value_pos[0][1] + ' ' + self.entity_names[entity] + '\n' 77 | else: 78 | for index, value_name in enumerate(value_pos): 79 | if index == 0: 80 | conll_format += value_name[0] + ' ' + value_name[1] + ' ' + \ 81 | self.entity_names[entity + '2'] + '\n' 82 | else: 83 | conll_format 
+= value_name[0] + ' ' + value_name[1] + ' ' + self.entity_names[entity] + '\n' 84 | else: 85 | for value_name in value_pos: 86 | conll_format += value_name[0] + ' ' + value_name[1] + ' ' + self.entity_names[entity] + '\n' 87 | 88 | return conll_format 89 | 90 | def __get_csv(self, value_pos, value_split_len, entity): 91 | csv_string = '' 92 | if entity != 'message': 93 | if value_split_len == 1: 94 | csv_string += '\t' + value_pos[0][0] + '\t' + value_pos[0][1] + '\t' + self.entity_names[entity] + '\n' 95 | else: 96 | for index, value_name in enumerate(value_pos): 97 | if index == 0: 98 | csv_string += '\t' + value_name[0] + '\t' + value_name[1] + '\t' + \ 99 | self.entity_names[entity + '2'] + '\n' 100 | else: 101 | csv_string += '\t' + value_name[0] + '\t' + value_name[1] + '\t' + \ 102 | self.entity_names[entity] + '\n' 103 | else: 104 | for value_name in value_pos: 105 | csv_string += '\t' + value_name[0] + '\t' + value_name[1] + '\t' + self.entity_names[entity] + '\n' 106 | 107 | return csv_string 108 | 109 | def __get_nltk_tree(self, value_pos, value_split_len, entity): 110 | iob_list = [] 111 | if entity != 'message': 112 | if value_split_len == 1: 113 | iob_tuple = (value_pos[0][0], value_pos[0][1], self.entity_names[entity]) 114 | iob_list.append(iob_tuple) 115 | else: 116 | for index, value_name in enumerate(value_pos): 117 | if index == 0: 118 | iob_tuple = (value_name[0], value_name[1], self.entity_names[entity + '2']) 119 | iob_list.append(iob_tuple) 120 | else: 121 | iob_tuple = (value_name[0], value_name[1], self.entity_names[entity]) 122 | iob_list.append(iob_tuple) 123 | else: 124 | for value_name in value_pos: 125 | iob_tuple = (value_name[0], value_name[1], self.entity_names[entity]) 126 | iob_list.append(iob_tuple) 127 | 128 | return iob_list 129 | 130 | def convert(self, parsed, stanford=False, ispos=False, csv=False, csv_line_id=0, iobtree=False): 131 | if csv: 132 | conll_format = 'Sentence: ' + str(csv_line_id) 133 | elif iobtree: 134 | conll_format = None 135 | else: 136 | conll_format = '' 137 | 138 | conll = [] 139 | for entity, value in parsed.items(): 140 | value_split = value.split() 141 | value_split_len = len(value_split) 142 | 143 | if value != '': 144 | if ispos: 145 | value_pos = pos_tag(value_split) # pos = part of speech 146 | conll_format += self.__get_conll_pos(value_pos, value_split_len, entity) 147 | elif csv: 148 | value_pos = pos_tag(value_split) 149 | conll_format += self.__get_csv(value_pos, value_split_len, entity) 150 | elif iobtree: 151 | value_pos = pos_tag(value_split) 152 | iob = self.__get_nltk_tree(value_pos, value_split_len, entity) 153 | conll = conll + iob 154 | else: 155 | conll_format += self.__get_conll_format(value_split, value_split_len, entity, stanford) 156 | 157 | if csv is False and iobtree is False: 158 | conll_format += '\n' 159 | 160 | if iobtree: 161 | conll_format = conlltags2tree(conll) 162 | 163 | return conll_format 164 | -------------------------------------------------------------------------------- /nerlogparser/grammar/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/studiawan/nerlogparser/4dc3d955f735ea5496557ee76378a38b5746e425/nerlogparser/grammar/__init__.py -------------------------------------------------------------------------------- /nerlogparser/grammar/authlog.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | from pyparsing import Word, alphas, Combine, 
nums, string, Optional, Regex 4 | from collections import OrderedDict 5 | 6 | 7 | class AuthLog(object): 8 | def __init__(self, dataset): 9 | """Constructor for class AuthLog. 10 | 11 | Parameters 12 | ---------- 13 | dataset : str 14 | Dataset name. 15 | """ 16 | self.dataset = dataset 17 | self.authlog_grammar = self.__get_authlog_grammar() 18 | 19 | @staticmethod 20 | def __get_authlog_grammar(): 21 | """The definition of auth.log grammar. Supported dataset: 22 | casper-rw 23 | dfrws-2009 24 | honeynet-challenge5 25 | honeynet-challenge7 26 | 27 | Returns 28 | ------- 29 | authlog_grammar : 30 | Grammar for auth.log 31 | """ 32 | ints = Word(nums) 33 | 34 | # timestamp 35 | month = Word(string.ascii_uppercase, string.ascii_lowercase, exact=3) 36 | day = ints 37 | hour = Combine(ints + ':' + ints + ':' + ints) 38 | timestamp = month + day + hour 39 | 40 | # hostname, service name, message 41 | hostname = Word(alphas + nums + '_' + '-' + '.') 42 | service = Word(alphas + nums + '/' + '-' + '_' + '.' + '[' + ']' + ':') 43 | subservice = Optional(Word(alphas + ':' + '-' + '_' + '(' + ')')) 44 | subservice_two_words = Optional(Word(alphas + ':' + '-' + '_' + '(' + ')')) + \ 45 | Optional(Word(alphas + ':' + '-' + '_' + '(' + ')')) 46 | message = Regex('.*') 47 | 48 | # auth log grammar 49 | authlog_grammar = timestamp + hostname + service + subservice + subservice_two_words + message 50 | return authlog_grammar 51 | 52 | def parse_log(self, log_line): 53 | """Parse auth.log based on defined grammar. 54 | 55 | Parameters 56 | ---------- 57 | log_line : str 58 | A log line to be parsed. 59 | 60 | Returns 61 | ------- 62 | parsed : dict[str, str] 63 | A parsed auth.log containing these elements: timestamp, hostname, service, pid, subservice and message. 
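
        Examples
        --------
        Illustrative sketch only; the auth.log line below is hypothetical, and
        the exact field split follows the grammar defined above.

        >>> from nerlogparser.grammar.authlog import AuthLog
        >>> log_line = 'Nov  5 12:00:01 ubuntu sshd[2264]: Accepted password for root'  # hypothetical line
        >>> parsed = AuthLog('').parse_log(log_line)
        >>> parsed['service'], parsed['hostname']
        ('sshd[2264]:', 'ubuntu')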
64 | """ 65 | parsed_authlog = self.authlog_grammar.parseString(log_line) 66 | 67 | # get parsed auth.log 68 | parsed = OrderedDict() 69 | parsed['timestamp'] = parsed_authlog[0] + ' ' + parsed_authlog[1] + ' ' + parsed_authlog[2] 70 | parsed['hostname'] = parsed_authlog[3] 71 | parsed['service'] = parsed_authlog[4] 72 | 73 | if len(parsed_authlog) == 6: 74 | parsed['subservice'] = '' 75 | parsed['message'] = parsed_authlog[5] 76 | 77 | elif len(parsed_authlog) == 8: 78 | if not parsed_authlog[6].endswith(':'): 79 | parsed['subservice'] = '' 80 | parsed['message'] = ' '.join(parsed_authlog[5:]) 81 | 82 | if parsed_authlog[5].endswith(':'): 83 | parsed['subservice'] = parsed_authlog[5] 84 | parsed['message'] = ' '.join(parsed_authlog[6:]) 85 | 86 | else: 87 | if parsed_authlog[6].endswith(':'): 88 | parsed['subservice'] = parsed_authlog[5] + ' ' + parsed_authlog[6] 89 | parsed['message'] = ' '.join(parsed_authlog[7:]) 90 | 91 | else: 92 | parsed['subservice'] = parsed_authlog[5] 93 | parsed['message'] = ' '.join(parsed_authlog[6:]) 94 | 95 | if not parsed['subservice'].endswith(':'): 96 | parsed['message'] = parsed['subservice'] + ' ' + parsed['message'] 97 | parsed['subservice'] = '' 98 | 99 | return parsed 100 | 101 | 102 | if __name__ == '__main__': 103 | # get auth.log datasets 104 | dataset_path = '/home/hudan/Git/prlogparser/datasets/' 105 | filenames = [ 106 | 'casper-rw/auth.log', 107 | 'dfrws-2009-jhuisi/auth.log', 108 | 'dfrws-2009-jhuisi/auth.log.0', 109 | 'dfrws-2009-jhuisi/auth.log.1', 110 | 'dfrws-2009-nssal/auth.log', 111 | 'dfrws-2009-nssal/auth.log.0', 112 | 'dfrws-2009-nssal/auth.log.1', 113 | 'dfrws-2009-nssal/auth.log.2', 114 | 'dfrws-2009-nssal/auth.log.3', 115 | 'dfrws-2009-nssal/auth.log.4', 116 | 'honeynet-challenge5/auth.log', 117 | 'honeynet-challenge7/auth.log' 118 | ] 119 | 120 | # setup test csv file to save results 121 | test_file = '/home/hudan/Git/prlogparser/groundtruth/auth-test.csv' 122 | f = open(test_file, 'w', newline='') 123 | writer = csv.writer(f) 124 | 125 | # parse auth.log 126 | dl = AuthLog('') 127 | for filename in filenames: 128 | filename = os.path.join(dataset_path, filename) 129 | with open(filename, 'r') as f: 130 | for line in f: 131 | # get parsed line and print 132 | parsed_line = dl.parse_log(line) 133 | print(parsed_line) 134 | 135 | # write to csv 136 | row = list(parsed_line.values()) 137 | writer.writerow(row) 138 | f.close() 139 | -------------------------------------------------------------------------------- /nerlogparser/grammar/bluegenelog.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | from pyparsing import Word, alphas, Combine, nums, Regex, ParseException 4 | from collections import OrderedDict 5 | 6 | 7 | class BlueGeneLog(object): 8 | def __init__(self, dataset): 9 | self.dataset = dataset 10 | self.bluegenelog_grammar = self.__get_bluegenelog_grammar() 11 | 12 | @staticmethod 13 | def __get_bluegenelog_grammar(): 14 | """The definition of BlueGene/L grammar. 15 | 16 | The BlueGene/L logs can be downloaded from [Usenix2006a]_ and 17 | this data was used in [Stearley2008]_. 18 | 19 | Returns 20 | ------- 21 | bluegene_grammar : 22 | Grammar for BlueGene/L supercomputer logs. 23 | 24 | References 25 | ---------- 26 | .. [Usenix2006a] The HPC4 data. URL: https://www.usenix.org/cfdr-data#hpc4 27 | .. [Stearley2008] Stearley, J., & Oliner, A. J. Bad words: Finding faults in Spirit's syslogs. 
28 | In 8th IEEE International Symposium on Cluster Computing and the Grid, pp. 765-770. 29 | """ 30 | ints = Word(nums) 31 | 32 | sock = Word(alphas + '-' + '_') 33 | number = ints 34 | date = Combine(ints + '.' + ints + '.' + ints) 35 | core1 = Word(alphas + nums + '-' + ':' + '_') 36 | datetime = Combine(ints + '-' + ints + '-' + ints + '-' + ints + '.' + ints + '.' + ints + '.' + ints) 37 | core2 = Word(alphas + nums + '-' + ':' + '_') 38 | source = Word(alphas) 39 | service = Word(alphas) 40 | info_type = Word(alphas) 41 | message = Regex('.*') 42 | 43 | # blue gene log grammar 44 | bluegene_grammar = sock + number + date + core1 + datetime + core2 + source + service + info_type + message 45 | return bluegene_grammar 46 | 47 | def parse_log(self, log_line): 48 | """Parse the BlueGene/L logs based on defined grammar. 49 | 50 | Parameters 51 | ---------- 52 | log_line : str 53 | A log line to be parsed 54 | 55 | Returns 56 | ------- 57 | parsed : dict[str, str] 58 | A parsed BlueGene/L log. 59 | """ 60 | parsed = OrderedDict() 61 | try: 62 | parsed_bluegenelog = self.bluegenelog_grammar.parseString(log_line) 63 | parsed['sock'] = parsed_bluegenelog[0] 64 | parsed['number'] = parsed_bluegenelog[1] 65 | parsed['timestamp'] = parsed_bluegenelog[2] 66 | parsed['core1'] = parsed_bluegenelog[3] 67 | parsed['timestamp_bgl'] = parsed_bluegenelog[4] 68 | parsed['core2'] = parsed_bluegenelog[5] 69 | parsed['source'] = parsed_bluegenelog[6] 70 | parsed['service'] = parsed_bluegenelog[7] 71 | parsed['level'] = parsed_bluegenelog[8] 72 | parsed['message'] = parsed_bluegenelog[9] 73 | 74 | except ParseException: 75 | print(log_line) 76 | 77 | return parsed 78 | 79 | 80 | if __name__ == '__main__': 81 | dataset_path = '/home/hudan/Git/prlogparser/datasets/' 82 | filenames = ['bgl2/bgl2'] 83 | 84 | test_file = '/home/hudan/Git/prlogparser/groundtruth/test-results/bgl-test.csv' 85 | f = open(test_file, 'w', newline='') 86 | writer = csv.writer(f) 87 | 88 | bl = BlueGeneLog('') 89 | for filename in filenames: 90 | filename = os.path.join(dataset_path, filename) 91 | with open(filename, 'r') as f: 92 | for line in f: 93 | parsed_line = bl.parse_log(line) 94 | print(parsed_line['timestamp']) 95 | 96 | row = list(parsed_line.values()) 97 | writer.writerow(row) 98 | f.close() 99 | -------------------------------------------------------------------------------- /nerlogparser/grammar/csvlog.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | import csv 3 | 4 | 5 | class CSVLog(object): 6 | def __init__(self, dataset): 7 | self.dataset = dataset 8 | if self.dataset == 'dfrws-2016': 9 | self.log_file = '/home/hudan/Git/prlogparser/datasets/dfrws-2016/csv.csv' 10 | 11 | def parse_log(self): 12 | parsed_logs = [] 13 | with open(self.log_file, 'r') as f: 14 | reader = csv.reader(f) 15 | for row in reader: 16 | parsed = OrderedDict() 17 | parsed['timestamp'] = row[0] 18 | parsed['sequence_number'] = row[1] 19 | parsed['service'] = row[2] 20 | parsed['level'] = row[3] 21 | parsed['message'] = row[4] 22 | parsed_logs.append(parsed) 23 | 24 | return parsed_logs 25 | 26 | 27 | if __name__ == '__main__': 28 | csvlog = CSVLog('dfrws-2016') 29 | results = csvlog.parse_log() 30 | for result in results: 31 | print(result) 32 | -------------------------------------------------------------------------------- /nerlogparser/grammar/daemonlog.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 
3 | from pyparsing import Word, alphas, Combine, nums, string, Optional, Regex 4 | from collections import OrderedDict 5 | 6 | 7 | class DaemonLog(object): 8 | def __init__(self, dataset): 9 | """Constructor for class DaemonLog. 10 | 11 | Parameters 12 | ---------- 13 | dataset : str 14 | Dataset name. 15 | """ 16 | self.dataset = dataset 17 | self.daemonlog_grammar = self.__get_daemonlog_grammar() 18 | 19 | @staticmethod 20 | def __get_daemonlog_grammar(): 21 | """The definition of daemon.log grammar. Supported dataset: 22 | casper-rw 23 | dfrws-2009 24 | honeynet-challenge5 25 | honeynet-challenge7 26 | 27 | Returns 28 | ------- 29 | daemonlog_grammar : 30 | Grammar for daemon.log 31 | """ 32 | ints = Word(nums) 33 | 34 | # timestamp 35 | month = Word(string.ascii_uppercase, string.ascii_lowercase, exact=3) 36 | day = ints 37 | hour = Combine(ints + ':' + ints + ':' + ints) 38 | timestamp = month + day + hour 39 | 40 | # hostname, service name, message 41 | hostname = Word(alphas + nums + '_' + '-' + '.') 42 | service = Word(alphas + nums + '/' + '-' + '_' + '.' + '[' + ']' + '(' + ')' + ':') 43 | subservice = Optional(Word(alphas + nums + ':' + '-' + '_' + '<' + '>' + '.' + "'" + ',')) 44 | subservice_two_words = Optional(Word(alphas + nums + ':' + '-' + '_' + '(' + ')' + '.' + ',')) 45 | message = Regex('.*') 46 | 47 | # daemon log grammar 48 | daemon_grammar = timestamp + hostname + service + subservice + subservice_two_words + message 49 | return daemon_grammar 50 | 51 | def parse_log(self, log_line): 52 | """Parse auth.log based on defined grammar. 53 | 54 | Parameters) 55 | ---------- 56 | log_line : str 57 | A log line to be parsed. 58 | 59 | Returns 60 | ------- 61 | parsed : dict[str, str] 62 | A parsed auth.log containing these elements: timestamp, hostname, service, subservice and message. 
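
        Examples
        --------
        Illustrative sketch only; the daemon.log line below is hypothetical, and
        the resulting fields follow the grammar and branching logic defined above.

        >>> from nerlogparser.grammar.daemonlog import DaemonLog
        >>> log_line = 'Mar  2 09:01:12 ubuntu NetworkManager[695]: <info> device state change'  # hypothetical line
        >>> parsed = DaemonLog('').parse_log(log_line)
        >>> parsed['service'], parsed['subservice']
        ('NetworkManager[695]:', '<info>')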
63 | """ 64 | parsed_daemonlog = self.daemonlog_grammar.parseString(log_line) 65 | 66 | # remove empty string 67 | parsed_daemonlog = list(filter(None, parsed_daemonlog)) 68 | 69 | # get parsed auth.log 70 | parsed = OrderedDict() 71 | parsed['timestamp'] = parsed_daemonlog[0] + ' ' + parsed_daemonlog[1] + ' ' + parsed_daemonlog[2] 72 | parsed['hostname'] = parsed_daemonlog[3] 73 | parsed['service'] = parsed_daemonlog[4] 74 | 75 | if len(parsed_daemonlog) == 5: 76 | parsed['subservice'] = '' 77 | parsed['message'] = '' 78 | 79 | elif len(parsed_daemonlog) == 6: 80 | parsed['subservice'] = '' 81 | parsed['message'] = parsed_daemonlog[5] 82 | 83 | else: 84 | # subservice one word 85 | if parsed_daemonlog[5].endswith(':'): 86 | parsed['subservice'] = parsed_daemonlog[5] 87 | parsed['message'] = ' '.join(parsed_daemonlog[6:]) 88 | 89 | # subservice two words 90 | elif not parsed_daemonlog[5].endswith(':') and parsed_daemonlog[6].endswith(':'): 91 | parsed['subservice'] = parsed_daemonlog[5] + ' ' + parsed_daemonlog[6] 92 | parsed['message'] = ' '.join(parsed_daemonlog[7:]) 93 | 94 | # subservice two words 95 | elif parsed_daemonlog[5].endswith('>') and parsed_daemonlog[6].endswith(':'): 96 | parsed['subservice'] = parsed_daemonlog[5] + ' ' + parsed_daemonlog[6] 97 | parsed['message'] = ' '.join(parsed_daemonlog[7:]) 98 | 99 | # subservice one word 100 | elif parsed_daemonlog[5].endswith('>') and not parsed_daemonlog[6].endswith(':'): 101 | parsed['subservice'] = parsed_daemonlog[5] 102 | parsed['message'] = ' '.join(parsed_daemonlog[6:]) 103 | 104 | else: 105 | parsed['subservice'] = '' 106 | parsed['message'] = parsed_daemonlog[5] + ' ' + ' '.join(parsed_daemonlog[6:]) 107 | 108 | if not parsed['service'].endswith(':'): 109 | parsed['message'] = parsed['service'] + parsed['message'] 110 | parsed['service'] = '' 111 | 112 | return parsed 113 | 114 | 115 | if __name__ == '__main__': 116 | # get daemon.log datasets 117 | dataset_path = '/home/hudan/Git/prlogparser/datasets/' 118 | filenames = [ 119 | 'casper-rw/daemon.log', 120 | 'dfrws-2009-jhuisi/daemon.log', 121 | 'dfrws-2009-jhuisi/daemon.log.0', 122 | 'dfrws-2009-jhuisi/daemon.log.1', 123 | 'dfrws-2009-nssal/daemon.log', 124 | 'dfrws-2009-nssal/daemon.log.0', 125 | 'dfrws-2009-nssal/daemon.log.1', 126 | 'dfrws-2009-nssal/daemon.log.2', 127 | 'dfrws-2009-nssal/daemon.log.3', 128 | 'honeynet-challenge5/daemon.log', 129 | 'honeynet-challenge7/daemon.log' 130 | ] 131 | 132 | # setup test csv file to save results 133 | test_file = '/home/hudan/Git/prlogparser/groundtruth/daemon-test.csv' 134 | f = open(test_file, 'w', newline='') 135 | writer = csv.writer(f) 136 | 137 | # parse daemon.log 138 | dl = DaemonLog('') 139 | for filename in filenames: 140 | filename = os.path.join(dataset_path, filename) 141 | with open(filename, 'r') as f: 142 | for line in f: 143 | # get parsed line and print 144 | parsed_line = dl.parse_log(line) 145 | print(parsed_line) 146 | 147 | # write to csv 148 | row = list(parsed_line.values()) 149 | writer.writerow(row) 150 | 151 | f.close() 152 | -------------------------------------------------------------------------------- /nerlogparser/grammar/debuglog.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | from pyparsing import Word, alphas, Combine, nums, string, Optional, Regex 4 | from collections import OrderedDict 5 | from nerlogparser.grammar.grammar_utility import GrammarUtility 6 | 7 | 8 | class DebugLog(object): 9 | def __init__(self, dataset): 10 
| self.dataset = dataset 11 | self.groups = { 12 | 'group1': ['casper-rw'], 13 | 'group2': ['dfrws-2009-jhuisi', 'dfrws-2009-nssal', 'honeynet-challenge7'], 14 | 'group3': ['honeynet-challenge5'] 15 | } 16 | 17 | def get_grammar(self): 18 | dl = None 19 | if self.dataset in self.groups['group1']: 20 | dl = DebugLog1(self.dataset) 21 | 22 | elif self.dataset in self.groups['group2']: 23 | dl = DebugLog2(self.dataset) 24 | 25 | elif self.dataset in self.groups['group3']: 26 | dl = DebugLog3(self.dataset) 27 | 28 | return dl 29 | 30 | 31 | class DebugLog1(object): 32 | def __init__(self, dataset): 33 | """Constructor for class DebugLog. 34 | 35 | Parameters 36 | ---------- 37 | dataset : str 38 | Dataset name. 39 | """ 40 | self.dataset = dataset 41 | self.debuglog_grammar = self.__get_debuglog_grammar() 42 | 43 | @staticmethod 44 | def __get_debuglog_grammar(): 45 | """The definition of debug log grammar. Supported dataset: 46 | casper-rw 47 | 48 | Returns 49 | ------- 50 | debuglog_grammar : 51 | Grammar for debug log 52 | """ 53 | ints = Word(nums) 54 | 55 | # timestamp 56 | month = Word(string.ascii_uppercase, string.ascii_lowercase, exact=3) 57 | day = ints 58 | hour = Combine(ints + ':' + ints + ':' + ints) 59 | timestamp = month + day + hour 60 | 61 | # hostname, service name, message 62 | hostname = Word(alphas + nums + '_' + '-' + '.') 63 | service = Word(alphas + nums + '/' + '-' + '_' + '.' + '[' + ']' + ':') 64 | 65 | # unix time 66 | unix_time = '[' + Word(nums + '.' + ']') 67 | subservice = Optional(Word(alphas + nums + '_' + '-' + ':')) 68 | subservice_two_words = Optional(Word(alphas + nums + '_' + '-' + ':' + '[' + ']' + ',' + '.' + '=' + '/')) 69 | message = Regex('.*') 70 | 71 | # debug log grammar 72 | debuglog_grammar = timestamp + hostname + service + unix_time + subservice + subservice_two_words + message 73 | return debuglog_grammar 74 | 75 | def parse_log(self, log_line): 76 | """Parse debug log based on defined grammar. 77 | 78 | Parameters 79 | ---------- 80 | log_line : str 81 | A log line to be parsed. 82 | 83 | Returns 84 | ------- 85 | parsed : dict[str, str] 86 | A parsed debug log containing these elements: timestamp, hostname, service, unix_time, 87 | subservice and message. 
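
        Examples
        --------
        Illustrative sketch only; the debug line below is hypothetical, but it
        follows the layout this grammar expects (syslog header plus a bracketed
        uptime value).

        >>> from nerlogparser.grammar.debuglog import DebugLog1
        >>> log_line = 'Mar  2 09:01:12 ubuntu kernel: [   12.345678] ACPI: Interpreter enabled'  # hypothetical line
        >>> parsed = DebugLog1('casper-rw').parse_log(log_line)
        >>> sorted(parsed.keys())
        ['hostname', 'message', 'service', 'subservice', 'timestamp', 'unix_time']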
88 | """ 89 | parsed_debuglog = self.debuglog_grammar.parseString(log_line) 90 | 91 | # get parsed debug log 92 | parsed = OrderedDict() 93 | parsed['timestamp'] = parsed_debuglog[0] + ' ' + parsed_debuglog[1] + ' ' + parsed_debuglog[2] 94 | parsed['hostname'] = parsed_debuglog[3] 95 | parsed['service'] = parsed_debuglog[4] 96 | parsed['unix_time'] = GrammarUtility.get_unix_timestamp(parsed_debuglog[5], parsed_debuglog[6]) 97 | 98 | if len(parsed_debuglog) == 8: 99 | parsed['subservice'] = '' 100 | parsed['message'] = parsed_debuglog[7] 101 | 102 | else: 103 | # subservice one word 104 | if parsed_debuglog[7].endswith(':') and not parsed_debuglog[8].endswith(':'): 105 | parsed['subservice'] = parsed_debuglog[7] 106 | parsed['message'] = ' '.join(parsed_debuglog[8:]) 107 | 108 | # subservice two words 109 | elif not parsed_debuglog[7].endswith(':') and parsed_debuglog[8].endswith(':'): 110 | parsed['subservice'] = parsed_debuglog[7] + ' ' + parsed_debuglog[8] 111 | parsed['message'] = ' '.join(parsed_debuglog[9:]) 112 | 113 | # subservice two words 114 | elif parsed_debuglog[7].endswith(':') and parsed_debuglog[8].endswith(':'): 115 | parsed['subservice'] = parsed_debuglog[7] + ' ' + parsed_debuglog[8] 116 | parsed['message'] = ' '.join(parsed_debuglog[9:]) 117 | 118 | else: 119 | parsed['subservice'] = '' 120 | parsed['message'] = ' '.join(parsed_debuglog[7:]) 121 | 122 | if not parsed['subservice'].endswith(':'): 123 | parsed['message'] = parsed['subservice'] + ' ' + parsed['message'] 124 | parsed['subservice'] = '' 125 | 126 | return parsed 127 | 128 | 129 | class DebugLog2(object): 130 | def __init__(self, dataset): 131 | """Constructor for class MessagesLog. This parser also supports syslog. 132 | 133 | Parameters 134 | ---------- 135 | dataset : str 136 | Dataset name. 137 | """ 138 | self.dataset = dataset 139 | self.messageslog_grammar = self.__get_messageslog_grammar() 140 | 141 | @staticmethod 142 | def __get_messageslog_grammar(): 143 | """The definition of messages log grammar. Supported dataset: 144 | dfrws-2009 145 | honeynet-challenge7 146 | 147 | Returns 148 | ------- 149 | messageslog_grammar : 150 | Grammar for messages log 151 | """ 152 | ints = Word(nums) 153 | 154 | # timestamp 155 | month = Word(string.ascii_uppercase, string.ascii_lowercase, exact=3) 156 | day = ints 157 | hour = Combine(ints + ':' + ints + ':' + ints) 158 | timestamp = month + day + hour 159 | 160 | # hostname, service name, message 161 | hostname = Word(alphas + nums + '_' + '-' + '.') 162 | service = Word(alphas + nums + '/' + '-' + '_' + '.' + '[' + ']' + ':') 163 | subservice_two_words = Optional(Word(alphas + nums + '_' + '-' + ':' + '[' + ']' + ',' + '.' + '=' + '/')) 164 | 165 | # unix time 166 | unix_time = Optional('[' + Word(nums + '.' + ']')) 167 | subservice = Optional(Word(alphas + nums + '_' + '-' + ':')) 168 | message = Regex('.*') 169 | 170 | # messages log grammar 171 | messageslog_grammar = timestamp + hostname + service + unix_time + subservice + subservice_two_words + message 172 | return messageslog_grammar 173 | 174 | def parse_log(self, log_line): 175 | """Parse messages log based on defined grammar. 176 | 177 | Parameters 178 | ---------- 179 | log_line : str 180 | A log line to be parsed. 181 | 182 | Returns 183 | ------- 184 | parsed : dict[str, str] 185 | A parsed messages log containing these elements: timestamp, hostname, service, unix_time, 186 | subservice and message. 
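
        Examples
        --------
        Illustrative sketch only; the messages-style line below is hypothetical,
        and the field values shown simply follow the branching logic below.

        >>> from nerlogparser.grammar.debuglog import DebugLog2
        >>> log_line = 'Nov  5 12:00:01 ubuntu kernel: [  123.456789] eth0: link up'  # hypothetical line
        >>> parsed = DebugLog2('dfrws-2009-nssal').parse_log(log_line)
        >>> parsed['subservice'], parsed['message']
        ('eth0:', 'link up')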
187 | """ 188 | parsed_messageslog = self.messageslog_grammar.parseString(log_line) 189 | 190 | # get parsed kernel log 191 | parsed = OrderedDict() 192 | parsed['timestamp'] = parsed_messageslog[0] + ' ' + parsed_messageslog[1] + ' ' + parsed_messageslog[2] 193 | parsed['hostname'] = parsed_messageslog[3] 194 | parsed['service'] = parsed_messageslog[4] 195 | 196 | if len(parsed_messageslog) == 6: 197 | parsed['unix_time'] = '' 198 | parsed['subservice'] = '' 199 | if not parsed['service'].endswith(':'): 200 | parsed['message'] = parsed['service'] + ' ' + parsed_messageslog[5] 201 | parsed['service'] = '' 202 | 203 | else: 204 | parsed['message'] = parsed_messageslog[5] 205 | 206 | elif len(parsed_messageslog) == 7: 207 | parsed['unix_time'] = '' 208 | parsed['subservice'] = '' 209 | if not parsed_messageslog[5].endswith(':'): 210 | parsed['message'] = parsed_messageslog[5] + ' ' + parsed_messageslog[6] 211 | 212 | if not parsed['service'].endswith(':'): 213 | parsed['message'] = parsed['service'] + ' ' + parsed_messageslog[5] + ' ' + parsed_messageslog[6] 214 | parsed['service'] = '' 215 | 216 | else: 217 | if parsed_messageslog[5].startswith('[') and parsed_messageslog[6].endswith(']'): 218 | parsed['unix_time'] = GrammarUtility.get_unix_timestamp(parsed_messageslog[5], parsed_messageslog[6]) 219 | 220 | if len(parsed_messageslog) > 8: 221 | # subservice one word 222 | if parsed_messageslog[7].endswith(':') and not parsed_messageslog[8].endswith(':'): 223 | parsed['subservice'] = parsed_messageslog[7] 224 | parsed['message'] = ' '.join(parsed_messageslog[8:]) 225 | 226 | # subservice two words 227 | elif not parsed_messageslog[7].endswith(':') and parsed_messageslog[8].endswith(':'): 228 | parsed['subservice'] = parsed_messageslog[7] + ' ' + parsed_messageslog[8] 229 | parsed['message'] = ' '.join(parsed_messageslog[9:]) 230 | 231 | # subservice two words 232 | elif parsed_messageslog[7].endswith(':') and parsed_messageslog[8].endswith(':'): 233 | parsed['subservice'] = parsed_messageslog[7] + ' ' + parsed_messageslog[8] 234 | parsed['message'] = ' '.join(parsed_messageslog[9:]) 235 | 236 | else: 237 | parsed['subservice'] = '' 238 | parsed['message'] = ' '.join(parsed_messageslog[7:]) 239 | 240 | else: 241 | parsed['subservice'] = '' 242 | parsed['message'] = ' '.join(parsed_messageslog[7:]) 243 | 244 | else: 245 | parsed['unix_time'] = '' 246 | 247 | # subservice one word 248 | if parsed_messageslog[5].endswith(':') and not parsed_messageslog[6].endswith(':'): 249 | parsed['subservice'] = parsed_messageslog[5] 250 | parsed['message'] = ' '.join(parsed_messageslog[6:]) 251 | 252 | # subservice two words 253 | elif not parsed_messageslog[5].endswith(':') and parsed_messageslog[6].endswith(':'): 254 | parsed['subservice'] = parsed_messageslog[5] + ' ' + parsed_messageslog[6] 255 | parsed['message'] = ' '.join(parsed_messageslog[7:]) 256 | 257 | # subservice two words 258 | elif parsed_messageslog[5].endswith(':') and parsed_messageslog[6].endswith(':'): 259 | parsed['subservice'] = parsed_messageslog[5] + ' ' + parsed_messageslog[6] 260 | parsed['message'] = ' '.join(parsed_messageslog[7:]) 261 | 262 | else: 263 | parsed['subservice'] = '' 264 | parsed['message'] = ' '.join(parsed_messageslog[5:]) 265 | 266 | if not parsed['subservice'].endswith(':'): 267 | parsed['message'] = parsed['subservice'] + ' ' + parsed['message'] 268 | parsed['subservice'] = '' 269 | 270 | return parsed 271 | 272 | 273 | class DebugLog3(object): 274 | def __init__(self, dataset): 275 | """Constructor for class 
MessagesLog. This parser also supports syslog. 276 | 277 | Parameters 278 | ---------- 279 | dataset : str 280 | Dataset name. 281 | """ 282 | self.dataset = dataset 283 | self.messageslog_grammar = self.__get_messageslog_grammar() 284 | 285 | @staticmethod 286 | def __get_messageslog_grammar(): 287 | """The definition of messages log grammar. Supported dataset: 288 | honeynet-challenge5 289 | 290 | Returns 291 | ------- 292 | messageslog_grammar : 293 | Grammar for messages log 294 | """ 295 | ints = Word(nums) 296 | 297 | # timestamp 298 | month = Word(string.ascii_uppercase, string.ascii_lowercase, exact=3) 299 | day = ints 300 | hour = Combine(ints + ':' + ints + ':' + ints) 301 | timestamp = month + day + hour 302 | 303 | # hostname, service name, message 304 | hostname = Word(alphas + nums + '_' + '-' + '.') 305 | service = Word(alphas + nums + '/' + '-' + '_' + '.' + '[' + ']' + ':') + Optional(':') 306 | 307 | # unix time 308 | unix_time = Optional('[' + Word(nums + '.' + ']')) 309 | subservice = Optional(Word(alphas + nums + '_' + '-' + ':')) 310 | subservice_two_words = Optional(Word(alphas + nums + '_' + '-' + ':' + '[' + ']' + ',' + '.' + '=' + '/')) 311 | message = Regex('.*') 312 | 313 | # messages log grammar 314 | messageslog_grammar = timestamp + hostname + service + unix_time + subservice + subservice_two_words + message 315 | return messageslog_grammar 316 | 317 | def parse_log(self, log_line): 318 | """Parse messages log based on defined grammar. 319 | 320 | Parameters 321 | ---------- 322 | log_line : str 323 | A log line to be parsed. 324 | 325 | Returns 326 | ------- 327 | parsed : dict[str, str] 328 | A parsed messages log containing these elements: timestamp, hostname, service, unix_time, 329 | subservice and message. 330 | """ 331 | parsed_messageslog = self.messageslog_grammar.parseString(log_line) 332 | 333 | # get parsed kernel log 334 | parsed = OrderedDict() 335 | parsed['timestamp'] = parsed_messageslog[0] + ' ' + parsed_messageslog[1] + ' ' + parsed_messageslog[2] 336 | parsed['hostname'] = parsed_messageslog[3] 337 | parsed['service'] = parsed_messageslog[4] 338 | 339 | if len(parsed_messageslog) == 6: 340 | parsed['unix_time'] = '' 341 | parsed['subservice'] = '' 342 | if not parsed['service'].endswith(':'): 343 | parsed['message'] = parsed['service'] + ' ' + parsed_messageslog[5] 344 | parsed['service'] = '' 345 | 346 | elif len(parsed_messageslog) == 7: 347 | parsed['unix_time'] = '' 348 | parsed['subservice'] = '' 349 | if not parsed_messageslog[5].endswith(':'): 350 | parsed['message'] = parsed_messageslog[5] + ' ' + parsed_messageslog[6] 351 | 352 | if not parsed['service'].endswith(':'): 353 | parsed['message'] = parsed['service'] + ' ' + parsed_messageslog[5] + ' ' + parsed_messageslog[6] 354 | parsed['service'] = '' 355 | 356 | elif len(parsed_messageslog) == 8: 357 | if parsed_messageslog[5].startswith('[') and parsed_messageslog[6].endswith(']'): 358 | parsed['unix_time'] = GrammarUtility.get_unix_timestamp(parsed_messageslog[5], parsed_messageslog[6]) 359 | parsed['subservice'] = '' 360 | parsed['message'] = parsed_messageslog[7] 361 | 362 | else: 363 | parsed['unix_time'] = '' 364 | parsed['subservice'] = parsed_messageslog[5] 365 | parsed['message'] = ' '.join(parsed_messageslog[6:]) 366 | 367 | else: 368 | if parsed_messageslog[5] == ':': 369 | parsed['service'] = parsed['service'] + ' ' + parsed_messageslog[5] 370 | parsed['unix_time'] = GrammarUtility.get_unix_timestamp(parsed_messageslog[6], parsed_messageslog[7]) 371 | 372 | # subservice 
one word 373 | if parsed_messageslog[8].endswith(':') and not parsed_messageslog[9].endswith(':'): 374 | parsed['subservice'] = parsed_messageslog[8] 375 | parsed['message'] = ' '.join(parsed_messageslog[9:]) 376 | 377 | # subservice two words 378 | elif not parsed_messageslog[8].endswith(':') and parsed_messageslog[9].endswith(':'): 379 | parsed['subservice'] = parsed_messageslog[8] + ' ' + parsed_messageslog[9] 380 | parsed['message'] = ' '.join(parsed_messageslog[10:]) 381 | 382 | # subservice two words 383 | elif parsed_messageslog[8].endswith(':') and parsed_messageslog[9].endswith(':'): 384 | parsed['subservice'] = parsed_messageslog[8] + ' ' + parsed_messageslog[9] 385 | parsed['message'] = ' '.join(parsed_messageslog[10:]) 386 | 387 | # no subservice, only message 388 | else: 389 | parsed['subservice'] = '' 390 | parsed['message'] = ' '.join(parsed_messageslog[8:]) 391 | 392 | else: 393 | parsed['unix_time'] = GrammarUtility.get_unix_timestamp(parsed_messageslog[5], parsed_messageslog[6]) 394 | 395 | # subservice one word 396 | if parsed_messageslog[7].endswith(':') and not parsed_messageslog[8].endswith(':'): 397 | parsed['subservice'] = parsed_messageslog[7] 398 | parsed['message'] = ' '.join(parsed_messageslog[8:]) 399 | 400 | # subservice two words 401 | elif not parsed_messageslog[7].endswith(':') and parsed_messageslog[8].endswith(':'): 402 | parsed['subservice'] = parsed_messageslog[7] + ' ' + parsed_messageslog[8] 403 | parsed['message'] = ' '.join(parsed_messageslog[9:]) 404 | 405 | # subservice two words 406 | elif parsed_messageslog[7].endswith(':') and parsed_messageslog[8].endswith(':'): 407 | parsed['subservice'] = parsed_messageslog[7] + ' ' + parsed_messageslog[8] 408 | parsed['message'] = ' '.join(parsed_messageslog[9:]) 409 | 410 | # no subservice, only message 411 | else: 412 | parsed['subservice'] = '' 413 | parsed['message'] = ' '.join(parsed_messageslog[7:]) 414 | 415 | if not parsed['subservice'].endswith(':'): 416 | parsed['message'] = parsed['subservice'] + ' ' + parsed['message'] 417 | parsed['subservice'] = '' 418 | 419 | return parsed 420 | 421 | 422 | class Main(object): 423 | def __init__(self, datasets): 424 | self.datasets = datasets 425 | self.dataset_path = '/home/hudan/Git/prlogparser/datasets/' 426 | self.groups = { 427 | 'group1': ['casper-rw'], 428 | 'group2': ['dfrws-2009-jhuisi', 'dfrws-2009-nssal', 'honeynet-challenge7'], 429 | 'group3': ['honeynet-challenge5'] 430 | } 431 | 432 | def run(self): 433 | # parse messages.log 434 | for group_name, group in self.groups.items(): 435 | # setup test csv file to save results 436 | base_name = '/home/hudan/Git/prlogparser/groundtruth/debug-' 437 | test_file = base_name + group_name + '.csv' 438 | f = open(test_file, 'w', newline='') 439 | writer = csv.writer(f) 440 | 441 | for dataset in group: 442 | # get grammar 443 | dl = None 444 | if group_name == 'group1': 445 | dl = DebugLog1(dataset) 446 | 447 | elif group_name == 'group2': 448 | dl = DebugLog2(dataset) 449 | 450 | elif group_name == 'group3': 451 | dl = DebugLog3(dataset) 452 | 453 | # start parsing 454 | for filename in self.datasets[dataset]: 455 | filename = os.path.join(self.dataset_path, filename) 456 | with open(filename, 'r') as f: 457 | for line in f: 458 | # get parsed line and print 459 | parsed_line = dl.parse_log(line) 460 | print(parsed_line) 461 | 462 | # write to csv 463 | row = list(parsed_line.values()) 464 | writer.writerow(row) 465 | 466 | f.close() 467 | 468 | if __name__ == '__main__': 469 | datasets_files = { 470 | 
'casper-rw': ['casper-rw/debug'], 471 | 'dfrws-2009-jhuisi': [ 472 | 'dfrws-2009-jhuisi/debug', 473 | 'dfrws-2009-jhuisi/debug.0', 474 | 'dfrws-2009-jhuisi/debug.1' 475 | ], 476 | 'dfrws-2009-nssal': [ 477 | 'dfrws-2009-nssal/debug', 478 | 'dfrws-2009-nssal/debug.0', 479 | 'dfrws-2009-nssal/debug.1', 480 | 'dfrws-2009-nssal/debug.2', 481 | 'dfrws-2009-nssal/debug.3' 482 | ], 483 | 'honeynet-challenge5': ['honeynet-challenge5/debug'], 484 | 'honeynet-challenge7': ['honeynet-challenge7/debug'] 485 | } 486 | 487 | main = Main(datasets_files) 488 | main.run() 489 | -------------------------------------------------------------------------------- /nerlogparser/grammar/dmesglog.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | from pyparsing import Word, alphas, nums, Optional, Regex 4 | from collections import OrderedDict 5 | from nerlogparser.grammar.grammar_utility import GrammarUtility 6 | 7 | 8 | class DmesgLog(object): 9 | def __init__(self, dataset): 10 | self.dataset = dataset 11 | self.groups = { 12 | 'group1': ['casper-rw', 'dfrws-2009-jhuisi', 'honeynet-challenge5', 'honeynet-challenge7'], 13 | 'group2': ['dfrws-2009-nssal'] 14 | } 15 | 16 | def get_grammar(self): 17 | dl = None 18 | if self.dataset in self.groups['group1']: 19 | dl = DmesgLog1(self.dataset) 20 | 21 | elif self.dataset in self.groups['group2']: 22 | dl = DmesgLog2(self.dataset) 23 | 24 | return dl 25 | 26 | 27 | class DmesgLog1(object): 28 | def __init__(self, dataset): 29 | """Constructor for class DmesgLog. 30 | 31 | Parameters 32 | ---------- 33 | dataset : str 34 | Dataset name. 35 | """ 36 | self.dataset = dataset 37 | self.dmesglog_grammar = self.__get_dmesglog_grammar() 38 | 39 | @staticmethod 40 | def __get_dmesglog_grammar(): 41 | """The definition of dmesg log grammar. Supported dataset: 42 | casper-rw 43 | dfrws-2009-jhuisi 44 | honeynet-challenge5 45 | honeynet-challenge7 46 | 47 | Returns 48 | ------- 49 | dmesglog_grammar : pyparsing.And 50 | Grammar for dmesg log 51 | """ 52 | # unix time 53 | unix_time = '[' + Word(nums + '.' + ']') 54 | subservice = Optional(Word(alphas + nums + '_' + '-' + ':' + '[' + ']' + '.' + '=')) 55 | subservice_two_words = Optional(Word(alphas + nums + '_' + '-' + ':' + '[' + ']' + ',' + '.' + '=')) 56 | message = Regex('.*') 57 | 58 | # dmesg log grammar 59 | dmesglog_grammar = unix_time + subservice + subservice_two_words + message 60 | return dmesglog_grammar 61 | 62 | def parse_log(self, log_line): 63 | """Parse dmesg log based on defined grammar. 64 | 65 | Parameters 66 | ---------- 67 | log_line : str 68 | A log line to be parsed. 69 | 70 | Returns 71 | ------- 72 | parsed : dict[str, str] 73 | A parsed dmesg log containing these elements: unix_time, subservice and message. 
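
        Examples
        --------
        Illustrative sketch only; the dmesg line below is hypothetical, and the
        keys produced are those listed above.

        >>> from nerlogparser.grammar.dmesglog import DmesgLog1
        >>> log_line = '[   12.345678] usb 1-1: new high-speed USB device number 2 using ehci_hcd'  # hypothetical line
        >>> parsed = DmesgLog1('casper-rw').parse_log(log_line)
        >>> parsed['subservice'], parsed['message']
        ('usb 1-1:', 'new high-speed USB device number 2 using ehci_hcd')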
74 | """ 75 | parsed_dmesglog = self.dmesglog_grammar.parseString(log_line) 76 | 77 | # get parsed dmesg log 78 | parsed = OrderedDict() 79 | parsed['unix_time'] = GrammarUtility.get_unix_timestamp(parsed_dmesglog[0], parsed_dmesglog[1]) 80 | 81 | if len(parsed_dmesglog) == 3: 82 | parsed['subservice'] = '' 83 | parsed['message'] = parsed_dmesglog[2] 84 | 85 | else: 86 | # subservice one word 87 | if parsed_dmesglog[2].endswith(':') and not parsed_dmesglog[3].endswith(':'): 88 | parsed['subservice'] = parsed_dmesglog[2] 89 | parsed['message'] = ' '.join(parsed_dmesglog[3:]) 90 | 91 | # subservice two words 92 | elif not parsed_dmesglog[2].endswith(':') and parsed_dmesglog[3].endswith(':'): 93 | parsed['subservice'] = parsed_dmesglog[2] + ' ' + parsed_dmesglog[3] 94 | parsed['message'] = ' '.join(parsed_dmesglog[4:]) 95 | 96 | # subservice two words 97 | elif parsed_dmesglog[2].endswith(':') and parsed_dmesglog[3].endswith(':'): 98 | parsed['subservice'] = parsed_dmesglog[2] + ' ' + parsed_dmesglog[3] 99 | parsed['message'] = ' '.join(parsed_dmesglog[4:]) 100 | 101 | else: 102 | parsed['subservice'] = '' 103 | parsed['message'] = ' '.join(parsed_dmesglog[2:]) 104 | 105 | if not parsed['subservice'].endswith(':') and parsed['subservice']: 106 | parsed['message'] = parsed['subservice'] + ' ' + parsed['message'] 107 | parsed['subservice'] = '' 108 | 109 | return parsed 110 | 111 | 112 | class DmesgLog2(object): 113 | # this class is written for dfrws-2009-jhuisi/nssal/dmesg* 114 | def __init__(self, dataset): 115 | self.dataset = dataset 116 | self.dmesglog_grammar = self.__get_dmesglog_grammar() 117 | 118 | @staticmethod 119 | def __get_dmesglog_grammar(): 120 | subservice = Optional(Word(alphas + nums + '_' + '-' + ':' + '[' + ']' + '.' + '=')) 121 | subservice_two_words = Optional(Word(alphas + nums + '_' + '-' + ':' + '[' + ']' + ',' + '.' 
+ '=')) 122 | message = Regex('.*') 123 | 124 | # dmesg log grammar 125 | dmesglog_grammar = subservice + subservice_two_words + message 126 | return dmesglog_grammar 127 | 128 | def parse_log(self, log_line): 129 | # get grammar 130 | parsed_dmesglog = self.dmesglog_grammar.parseString(log_line) 131 | 132 | # get parsed dmesg log 133 | parsed = OrderedDict() 134 | if len(parsed_dmesglog) == 1: 135 | if parsed_dmesglog[0].endswith(':'): 136 | parsed['subservice'] = parsed_dmesglog[0] 137 | parsed['message'] = '' 138 | else: 139 | parsed['subservice'] = '' 140 | parsed['message'] = parsed_dmesglog[0] 141 | 142 | elif len(parsed_dmesglog) == 2: 143 | if parsed_dmesglog[0].endswith(':'): 144 | parsed['subservice'] = parsed_dmesglog[0] 145 | parsed['message'] = parsed_dmesglog[1] 146 | else: 147 | parsed['subservice'] = '' 148 | parsed['message'] = ' '.join(parsed_dmesglog[0:]) 149 | 150 | else: 151 | # subservice one word 152 | if parsed_dmesglog[0].endswith(':') and not parsed_dmesglog[1].endswith(':'): 153 | parsed['subservice'] = parsed_dmesglog[0] 154 | parsed['message'] = ' '.join(parsed_dmesglog[1:]) 155 | 156 | # subservice two words 157 | elif not parsed_dmesglog[0].endswith(':') and parsed_dmesglog[1].endswith(':'): 158 | parsed['subservice'] = parsed_dmesglog[0] + ' ' + parsed_dmesglog[1] 159 | parsed['message'] = ' '.join(parsed_dmesglog[2:]) 160 | 161 | # subservice two words 162 | elif parsed_dmesglog[0].endswith(':') and parsed_dmesglog[1].endswith(':'): 163 | parsed['subservice'] = parsed_dmesglog[0] + ' ' + parsed_dmesglog[1] 164 | parsed['message'] = ' '.join(parsed_dmesglog[2:]) 165 | 166 | else: 167 | parsed['subservice'] = '' 168 | parsed['message'] = ' '.join(parsed_dmesglog[:]) 169 | 170 | if not parsed['subservice'].endswith(':') and parsed['subservice']: 171 | parsed['message'] = parsed['subservice'] + ' ' + parsed['message'] 172 | parsed['subservice'] = '' 173 | 174 | return parsed 175 | 176 | 177 | class Main(object): 178 | def __init__(self, datasets): 179 | self.datasets = datasets 180 | self.dataset_path = '/home/hudan/Git/prlogparser/datasets/' 181 | self.groups = { 182 | 'group1': ['casper-rw', 'dfrws-2009-jhuisi', 'honeynet-challenge5', 'honeynet-challenge7'], 183 | 'group2': ['dfrws-2009-nssal'] 184 | } 185 | 186 | def run(self): 187 | # parse dmesg log 188 | for group_name, group in self.groups.items(): 189 | # setup test csv file to save results 190 | base_name = '/home/hudan/Git/prlogparser/groundtruth/dmesg-' 191 | test_file = base_name + group_name + '.csv' 192 | f = open(test_file, 'w', newline='') 193 | writer = csv.writer(f) 194 | 195 | for dataset in group: 196 | # get grammar 197 | dl = None 198 | if group_name == 'group1': 199 | dl = DmesgLog1(dataset) 200 | 201 | elif group_name == 'group2': 202 | dl = DmesgLog2(dataset) 203 | 204 | # start parsing 205 | for filename in self.datasets[dataset]: 206 | filename = os.path.join(self.dataset_path, filename) 207 | with open(filename, 'r') as f: 208 | for line in f: 209 | # get parsed line and print 210 | parsed_line = dl.parse_log(line) 211 | print(parsed_line) 212 | 213 | # write to csv 214 | row = list(parsed_line.values()) 215 | writer.writerow(row) 216 | 217 | f.close() 218 | 219 | 220 | if __name__ == '__main__': 221 | # get dmesg log datasets 222 | datasets_files = { 223 | 'casper-rw': [ 224 | 'casper-rw/dmesg', 225 | 'casper-rw/dmesg.0', 226 | 'casper-rw/dmesg.1', 227 | 'casper-rw/dmesg.2', 228 | 'casper-rw/dmesg.3' 229 | ], 230 | 'dfrws-2009-jhuisi': [ 231 | 'dfrws-2009-jhuisi/dmesg', 232 | 
'dfrws-2009-jhuisi/dmesg.0', 233 | 'dfrws-2009-jhuisi/dmesg.1', 234 | 'dfrws-2009-jhuisi/dmesg.2', 235 | 'dfrws-2009-jhuisi/dmesg.3', 236 | 'dfrws-2009-jhuisi/dmesg.4' 237 | ], 238 | 'dfrws-2009-nssal': [ 239 | 'dfrws-2009-nssal/dmesg', 240 | 'dfrws-2009-nssal/dmesg.0', 241 | 'dfrws-2009-nssal/dmesg.1', 242 | 'dfrws-2009-nssal/dmesg.2', 243 | 'dfrws-2009-nssal/dmesg.3', 244 | 'dfrws-2009-nssal/dmesg.4' 245 | ], 246 | 'honeynet-challenge5': [ 247 | 'honeynet-challenge5/dmesg', 248 | 'honeynet-challenge5/dmesg.0' 249 | ], 250 | 'honeynet-challenge7': [ 251 | 'honeynet-challenge7/dmesg', 252 | 'honeynet-challenge7/dmesg.0', 253 | 'honeynet-challenge7/dmesg.1', 254 | 'honeynet-challenge7/dmesg.2', 255 | 'honeynet-challenge7/dmesg.3', 256 | 'honeynet-challenge7/dmesg.4' 257 | ] 258 | } 259 | 260 | main = Main(datasets_files) 261 | main.run() 262 | -------------------------------------------------------------------------------- /nerlogparser/grammar/grammar_utility.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class GrammarUtility(object): 4 | @staticmethod 5 | def get_unix_timestamp(square_bracket, timestamp): 6 | main_digit = timestamp.split('.')[0] 7 | space = '' 8 | 9 | if len(main_digit) == 1: 10 | space = ' ' 11 | elif len(main_digit) == 2: 12 | space = ' ' 13 | elif len(main_digit) == 3: 14 | space = ' ' 15 | elif len(main_digit) == 4: 16 | space = ' ' 17 | elif len(main_digit) == 5: 18 | space = '' 19 | 20 | new_timestamp = square_bracket + space + timestamp 21 | return new_timestamp 22 | -------------------------------------------------------------------------------- /nerlogparser/grammar/kernellog.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | from pyparsing import Word, alphas, Combine, nums, string, Optional, Regex 4 | from collections import OrderedDict 5 | from nerlogparser.grammar.grammar_utility import GrammarUtility 6 | 7 | 8 | class KernelLog(object): 9 | def __init__(self, dataset): 10 | self.dataset = dataset 11 | self.groups = { 12 | 'group1': ['casper-rw', 'dfrws-2009-jhuisi', 'dfrws-2009-nssal', 'honeynet-challenge7'], 13 | 'group2': ['honeynet-challenge5'] 14 | } 15 | 16 | def get_grammar(self): 17 | dl = None 18 | if self.dataset in self.groups['group1']: 19 | dl = KernelLog1(self.dataset) 20 | 21 | elif self.dataset in self.groups['group2']: 22 | dl = KernelLog2(self.dataset) 23 | 24 | return dl 25 | 26 | 27 | class KernelLog1(object): 28 | def __init__(self, dataset): 29 | """Constructor for class KernelLog. 30 | 31 | Parameters 32 | ---------- 33 | dataset : str 34 | Dataset name. 35 | """ 36 | self.dataset = dataset 37 | 38 | @staticmethod 39 | def __get_kernellog_grammar(): 40 | """The definition of kernel log grammar. Supported dataset: 41 | casper-rw 42 | dfrws-2009 43 | honeynet-challenge7 44 | 45 | Returns 46 | ------- 47 | kernellog_grammar : 48 | Grammar for kernel log 49 | """ 50 | ints = Word(nums) 51 | 52 | # timestamp 53 | month = Word(string.ascii_uppercase, string.ascii_lowercase, exact=3) 54 | day = ints 55 | hour = Combine(ints + ':' + ints + ':' + ints) 56 | timestamp = month + day + hour 57 | 58 | # hostname, service name, message 59 | hostname = Word(alphas + nums + '_' + '-' + '.') 60 | service = Word(alphas + nums + '/' + '-' + '_' + '.' + '[' + ']' + ':') 61 | 62 | # unix time 63 | unix_time = Optional('[' + Word(nums + '.' + ']')) 64 | subservice = Optional(Word(alphas + nums + '_' + '-' + ':' + '[' + ']' + '.' 
+ '=' + '(' + ')' + '*' + 65 | '<' + '>')) 66 | subservice_two_words = Optional(Word(alphas + nums + '_' + '-' + ':' + '[' + ']' + ',' + '.' + '=' + '/' + 67 | '(' + ')' + '*')) 68 | message = Optional(Regex('.*')) 69 | 70 | # kernel log grammar 71 | debuglog_grammar = timestamp + hostname + service + unix_time + subservice + subservice_two_words + message 72 | return debuglog_grammar 73 | 74 | def parse_log(self, log_line): 75 | """Parse kernel log based on defined grammar. 76 | 77 | Parameters 78 | ---------- 79 | log_line : str 80 | A log line to be parsed. 81 | 82 | Returns 83 | ------- 84 | parsed : dict[str, str] 85 | A parsed kernel log containing these elements: timestamp, hostname, service, unix_time, 86 | subservice and message. 87 | """ 88 | kernellog_grammar = self.__get_kernellog_grammar() 89 | parsed_kernellog = kernellog_grammar.parseString(log_line) 90 | 91 | # get parsed kernel log 92 | parsed = OrderedDict() 93 | parsed['timestamp'] = parsed_kernellog[0] + ' ' + parsed_kernellog[1] + ' ' + parsed_kernellog[2] 94 | parsed['hostname'] = parsed_kernellog[3] 95 | parsed['service'] = parsed_kernellog[4] 96 | 97 | if len(parsed_kernellog) == 6: 98 | parsed['unix_time'] = '' 99 | parsed['subservice'] = '' 100 | parsed['message'] = parsed_kernellog[5] 101 | 102 | elif len(parsed_kernellog) == 7: 103 | parsed['unix_time'] = '' 104 | parsed['subservice'] = '' 105 | if not parsed_kernellog[5].endswith(':'): 106 | parsed['message'] = parsed_kernellog[5] + ' ' + parsed_kernellog[6] 107 | 108 | elif len(parsed_kernellog) == 8: 109 | parsed['unix_time'] = '' 110 | parsed['subservice'] = '' 111 | 112 | # no message 113 | if parsed_kernellog[7] == '' and parsed_kernellog[5].startswith('[') and parsed_kernellog[6].endswith(']'): 114 | parsed['unix_time'] = GrammarUtility.get_unix_timestamp(parsed_kernellog[5], parsed_kernellog[6]) 115 | parsed['message'] = '' 116 | 117 | # message exists 118 | elif parsed_kernellog[7] != '' and parsed_kernellog[5].startswith('[') and \ 119 | parsed_kernellog[6].endswith(']'): 120 | parsed['unix_time'] = GrammarUtility.get_unix_timestamp(parsed_kernellog[5], parsed_kernellog[6]) 121 | parsed['message'] = parsed_kernellog[7] 122 | 123 | # subservice one word 124 | elif parsed_kernellog[5].endswith(':') and not parsed_kernellog[6].endswith(':'): 125 | parsed['subservice'] = parsed_kernellog[5] 126 | parsed['message'] = ' '.join(parsed_kernellog[6:]) 127 | 128 | # subservice two words 129 | elif not parsed_kernellog[5].endswith(':') and parsed_kernellog[6].endswith(':'): 130 | parsed['subservice'] = parsed_kernellog[5] + ' ' + parsed_kernellog[6] 131 | parsed['message'] = parsed_kernellog[7] 132 | 133 | # subservice two words 134 | elif parsed_kernellog[5].endswith(':') and parsed_kernellog[6].endswith(':'): 135 | parsed['subservice'] = parsed_kernellog[5] + ' ' + parsed_kernellog[6] 136 | parsed['message'] = parsed_kernellog[7] 137 | 138 | # no timestamp, no subservice, just message 139 | else: 140 | parsed['message'] = ' '.join(parsed_kernellog[5:]) 141 | 142 | else: 143 | # if timestamp exists 144 | parsed['unix_time'] = GrammarUtility.get_unix_timestamp(parsed_kernellog[5], parsed_kernellog[6]) 145 | 146 | # subservice one word 147 | if parsed_kernellog[7].endswith(':') and not parsed_kernellog[8].endswith(':'): 148 | parsed['subservice'] = parsed_kernellog[7] 149 | parsed['message'] = ' '.join(parsed_kernellog[8:]) 150 | 151 | # subservice two words 152 | elif not parsed_kernellog[7].endswith(':') and parsed_kernellog[8].endswith(':'): 153 | 
parsed['subservice'] = parsed_kernellog[7] + ' ' + parsed_kernellog[8] 154 | parsed['message'] = ' '.join(parsed_kernellog[9:]) 155 | 156 | # subservice two words 157 | elif parsed_kernellog[7].endswith(':') and parsed_kernellog[8].endswith(':'): 158 | parsed['subservice'] = parsed_kernellog[7] + ' ' + parsed_kernellog[8] 159 | parsed['message'] = ' '.join(parsed_kernellog[9:]) 160 | 161 | else: 162 | parsed['subservice'] = '' 163 | parsed['message'] = ' '.join(parsed_kernellog[7:]) 164 | 165 | if not parsed['subservice'].endswith(':'): 166 | parsed['message'] = parsed['subservice'] + ' ' + parsed['message'] 167 | parsed['subservice'] = '' 168 | 169 | return parsed 170 | 171 | 172 | class KernelLog2(object): 173 | def __init__(self, dataset): 174 | """Constructor for class KernelLog. 175 | 176 | Parameters 177 | ---------- 178 | dataset : str 179 | Dataset name. 180 | """ 181 | self.dataset = dataset 182 | 183 | @staticmethod 184 | def __get_kernellog_grammar(): 185 | """The definition of kernel log grammar. Supported dataset: 186 | honeynet-challenge5 187 | 188 | Returns 189 | ------- 190 | kernellog_grammar : 191 | Grammar for kernel log 192 | """ 193 | ints = Word(nums) 194 | 195 | # timestamp 196 | month = Word(string.ascii_uppercase, string.ascii_lowercase, exact=3) 197 | day = ints 198 | hour = Combine(ints + ':' + ints + ':' + ints) 199 | timestamp = month + day + hour 200 | 201 | # hostname, service name, message 202 | # there is Optional(':') 203 | hostname = Word(alphas + nums + '_' + '-' + '.') 204 | service = Word(alphas + nums + '/' + '-' + '_' + '.' + '[' + ']' + ':') + Optional(':') 205 | 206 | # unix time 207 | unix_time = Optional('[' + Word(nums + '.' + ']')) 208 | subservice = Optional(Word(alphas + nums + '_' + '-' + ':' + '[' + ']' + '.' + '=' + '(' + ')' + '*' + 209 | '<' + '>')) 210 | subservice_two_words = Optional(Word(alphas + nums + '_' + '-' + ':' + '[' + ']' + ',' + '.' + '=' + '/' + 211 | '(' + ')' + '*' + "'")) 212 | message = Optional(Regex('.*')) 213 | 214 | # kernel log grammar 215 | debuglog_grammar = timestamp + hostname + service + unix_time + subservice + subservice_two_words + message 216 | return debuglog_grammar 217 | 218 | def parse_log(self, log_line): 219 | """Parse kernel log based on defined grammar. 220 | 221 | Parameters 222 | ---------- 223 | log_line : str 224 | A log line to be parsed. 225 | 226 | Returns 227 | ------- 228 | parsed : dict[str, str] 229 | A parsed kernel log containing these elements: timestamp, hostname, service, unix_time, 230 | subservice and message. 
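        Examples
        --------
        Hypothetical honeynet-challenge5 style line (illustrative only; the
        exact split depends on which length branch the parse falls into):

        >>> parser = KernelLog2('honeynet-challenge5')
        >>> parsed = parser.parse_log('Apr 22 07:00:01 ubuntu kernel: [ 1234.567890] eth0: link up')
        >>> parsed['hostname'], parsed['service']
        ('ubuntu', 'kernel:')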
231 | """ 232 | kernellog_grammar = self.__get_kernellog_grammar() 233 | parsed_kernellog = kernellog_grammar.parseString(log_line) 234 | 235 | # get parsed kernel log 236 | parsed = OrderedDict() 237 | parsed['timestamp'] = parsed_kernellog[0] + ' ' + parsed_kernellog[1] + ' ' + parsed_kernellog[2] 238 | parsed['hostname'] = parsed_kernellog[3] 239 | parsed['service'] = parsed_kernellog[4] 240 | 241 | if len(parsed_kernellog) == 6: 242 | parsed['unix_time'] = '' 243 | parsed['subservice'] = '' 244 | parsed['message'] = parsed_kernellog[5] 245 | 246 | elif len(parsed_kernellog) == 7: 247 | parsed['unix_time'] = '' 248 | parsed['subservice'] = '' 249 | if not parsed_kernellog[5].endswith(':'): 250 | parsed['message'] = parsed_kernellog[5] + ' ' + parsed_kernellog[6] 251 | 252 | elif len(parsed_kernellog) == 8: 253 | parsed['unix_time'] = '' 254 | parsed['subservice'] = '' 255 | 256 | # timestamp exists 257 | if parsed_kernellog[5].startswith('[') and parsed_kernellog[5].endswith(']'): 258 | parsed['unix_time'] = GrammarUtility.get_unix_timestamp(parsed_kernellog[5], parsed_kernellog[6]) 259 | parsed['message'] = parsed_kernellog[7] 260 | 261 | # no timestamp, no subservice, just message 262 | else: 263 | parsed['message'] = ' '.join(parsed_kernellog[5:]) 264 | 265 | else: 266 | if parsed_kernellog[5] == ':': 267 | parsed['service'] = parsed['service'] + ' ' + parsed_kernellog[5] 268 | parsed['unix_time'] = GrammarUtility.get_unix_timestamp(parsed_kernellog[6], parsed_kernellog[7]) 269 | 270 | # subservice one word 271 | if parsed_kernellog[8].endswith(':') and not parsed_kernellog[9].endswith(':'): 272 | parsed['subservice'] = parsed_kernellog[8] 273 | parsed['message'] = ' '.join(parsed_kernellog[9:]) 274 | 275 | # subservice two words 276 | elif not parsed_kernellog[8].endswith(':') and parsed_kernellog[9].endswith(':'): 277 | parsed['subservice'] = parsed_kernellog[8] + ' ' + parsed_kernellog[9] 278 | parsed['message'] = ' '.join(parsed_kernellog[10:]) 279 | 280 | # subservice two words 281 | elif parsed_kernellog[8].endswith(':') and parsed_kernellog[9].endswith(':'): 282 | parsed['subservice'] = parsed_kernellog[8] + ' ' + parsed_kernellog[9] 283 | parsed['message'] = ' '.join(parsed_kernellog[10:]) 284 | 285 | # no subservice, only message 286 | else: 287 | parsed['subservice'] = '' 288 | parsed['message'] = ' '.join(parsed_kernellog[8:]) 289 | 290 | else: 291 | parsed['unix_time'] = GrammarUtility.get_unix_timestamp(parsed_kernellog[5], parsed_kernellog[6]) 292 | 293 | # subservice one word 294 | if parsed_kernellog[7].endswith(':') and not parsed_kernellog[8].endswith(':'): 295 | parsed['subservice'] = parsed_kernellog[7] 296 | parsed['message'] = ' '.join(parsed_kernellog[8:]) 297 | 298 | # subservice two words 299 | elif not parsed_kernellog[7].endswith(':') and parsed_kernellog[8].endswith(':'): 300 | parsed['subservice'] = parsed_kernellog[7] + ' ' + parsed_kernellog[8] 301 | parsed['message'] = ' '.join(parsed_kernellog[9:]) 302 | 303 | # subservice two words 304 | elif parsed_kernellog[7].endswith(':') and parsed_kernellog[8].endswith(':'): 305 | parsed['subservice'] = parsed_kernellog[7] + ' ' + parsed_kernellog[8] 306 | parsed['message'] = ' '.join(parsed_kernellog[9:]) 307 | 308 | # no subservice, only message 309 | else: 310 | parsed['subservice'] = '' 311 | parsed['message'] = ' '.join(parsed_kernellog[7:]) 312 | 313 | if not parsed['subservice'].endswith(':'): 314 | parsed['message'] = parsed['subservice'] + ' ' + parsed['message'] 315 | parsed['subservice'] = '' 316 | 
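        # note: the guard above folds any captured subservice that does not end
        # with ':' back into the message, so callers only ever see a
        # colon-terminated subservice or an empty string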
317 | return parsed 318 | 319 | 320 | class Main(object): 321 | def __init__(self, datasets): 322 | self.datasets = datasets 323 | self.dataset_path = '/home/hudan/Git/prlogparser/datasets/' 324 | self.groups = { 325 | 'group1': ['casper-rw', 'dfrws-2009-jhuisi', 'dfrws-2009-nssal', 'honeynet-challenge7'], 326 | 'group2': ['honeynet-challenge5'] 327 | } 328 | 329 | def run(self): 330 | # parse kernel.log 331 | for group_name, group in self.groups.items(): 332 | # setup test csv file to save results 333 | base_name = '/home/hudan/Git/prlogparser/groundtruth/kernel-' 334 | test_file = base_name + group_name + '.csv' 335 | f = open(test_file, 'w', newline='') 336 | writer = csv.writer(f) 337 | 338 | for dataset in group: 339 | # get grammar 340 | dl = None 341 | if group_name == 'group1': 342 | dl = KernelLog1(dataset) 343 | 344 | elif group_name == 'group2': 345 | dl = KernelLog2(dataset) 346 | 347 | # start parsing 348 | for filename in self.datasets[dataset]: 349 | filename = os.path.join(self.dataset_path, filename) 350 | with open(filename, 'r') as f: 351 | for line in f: 352 | # get parsed line and print 353 | parsed_line = dl.parse_log(line) 354 | print(parsed_line) 355 | 356 | # write to csv 357 | row = list(parsed_line.values()) 358 | writer.writerow(row) 359 | 360 | f.close() 361 | 362 | 363 | if __name__ == '__main__': 364 | datasets_files = { 365 | 'casper-rw': ['casper-rw/kern.log'], 366 | 'dfrws-2009-jhuisi': [ 367 | 'dfrws-2009-jhuisi/kern.log', 368 | 'dfrws-2009-jhuisi/kern.log.0', 369 | 'dfrws-2009-jhuisi/kern.log.1', 370 | ], 371 | 'dfrws-2009-nssal': [ 372 | 'dfrws-2009-nssal/kern.log', 373 | 'dfrws-2009-nssal/kern.log.0', 374 | 'dfrws-2009-nssal/kern.log.1', 375 | 'dfrws-2009-nssal/kern.log.2', 376 | 'dfrws-2009-nssal/kern.log.3' 377 | ], 378 | 'honeynet-challenge5': ['honeynet-challenge5/kern.log'], 379 | 'honeynet-challenge7': ['honeynet-challenge7/kern.log'] 380 | } 381 | 382 | main = Main(datasets_files) 383 | main.run() 384 | -------------------------------------------------------------------------------- /nerlogparser/grammar/kippolog.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | from pyparsing import Word, Combine, nums, Regex, nestedExpr 4 | from collections import OrderedDict 5 | 6 | 7 | class KippoLog(object): 8 | def __init__(self, dataset): 9 | self.dataset = dataset 10 | self.kippolog_grammar = self.__get_kippolog_grammar() 11 | 12 | @staticmethod 13 | def __get_kippolog_grammar(): 14 | """The definition of Kippo honeypot log grammar. 15 | 16 | Returns 17 | ------- 18 | kippolog_grammar : 19 | Grammar for Kippo log. 
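        Notes
        -----
        Hypothetical shape of a matching line (for illustration only): a
        timestamp such as "2017-02-14 04:11:02+0000", a bracketed service such
        as "[SSHService ssh-userauth on HoneyPotTransport,1,10.0.0.1]", and the
        remaining free text as the message.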
20 | """ 21 | ints = Word(nums) 22 | 23 | date = Combine(ints + '-' + ints + '-' + ints) 24 | time = Combine(ints + ':' + ints + ':' + ints + '+0000') 25 | timestamp = date + time 26 | service = nestedExpr(opener='[', closer=']') 27 | message = Regex(".*") 28 | 29 | # kippo honeypot log grammar 30 | kippolog_grammar = timestamp('timestamp') + service('service') + message('message') 31 | return kippolog_grammar 32 | 33 | def parse_log(self, log_line): 34 | parsed_kippolog = self.kippolog_grammar.parseString(log_line) 35 | 36 | parsed = OrderedDict() 37 | parsed['timestamp'] = ' '.join(parsed_kippolog.timestamp) 38 | if len(parsed_kippolog.service[0]) > 1: 39 | parsed['service'] = '[' + ' '.join(parsed_kippolog.service[0]) + ']' 40 | else: 41 | parsed['service'] = '[' + parsed_kippolog.service[0][0] + ']' 42 | parsed['message'] = parsed_kippolog.message 43 | 44 | return parsed 45 | 46 | 47 | if __name__ == '__main__': 48 | dataset_path = '/home/hudan/Git/prlogparser/datasets/kippo/' 49 | filenames = [ 50 | 'kippo.2017-02-14.log', 51 | 'kippo.2017-02-15.log', 52 | 'kippo.2017-02-16.log', 53 | 'kippo.2017-02-17.log', 54 | 'kippo.2017-02-18.log', 55 | 'kippo.2017-02-19.log', 56 | 'kippo.2017-02-20.log' 57 | ] 58 | 59 | test_file = '/home/hudan/Git/prlogparser/groundtruth/test-results/kippo-test.csv' 60 | f = open(test_file, 'w', newline='') 61 | writer = csv.writer(f) 62 | 63 | kl = KippoLog('') 64 | for filename in filenames: 65 | filename = os.path.join(dataset_path, filename) 66 | with open(filename, 'r') as f: 67 | for line in f: 68 | parsed_line = kl.parse_log(line) 69 | print(parsed_line) 70 | 71 | row = list(parsed_line.values()) 72 | writer.writerow(row) 73 | 74 | f.close() 75 | -------------------------------------------------------------------------------- /nerlogparser/grammar/messageslog.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | from pyparsing import Word, alphas, Combine, nums, string, Optional, Regex 4 | from collections import OrderedDict 5 | from nerlogparser.grammar.grammar_utility import GrammarUtility 6 | 7 | 8 | class MessagesLog(object): 9 | def __init__(self, dataset): 10 | self.dataset = dataset 11 | self.groups = { 12 | 'group1': ['casper-rw', 'dfrws-2009-jhuisi', 'dfrws-2009-nssal', 'honeynet-challenge7'], 13 | 'group2': ['honeynet-challenge5'] 14 | } 15 | 16 | def get_grammar(self): 17 | dl = None 18 | if self.dataset in self.groups['group1']: 19 | dl = MessagesLog1(self.dataset) 20 | 21 | elif self.dataset in self.groups['group2']: 22 | dl = MessagesLog2(self.dataset) 23 | 24 | return dl 25 | 26 | 27 | class MessagesLog1(object): 28 | def __init__(self, dataset): 29 | """Constructor for class MessagesLog. This parser also supports syslog. 30 | 31 | Parameters 32 | ---------- 33 | dataset : str 34 | Dataset name. 35 | """ 36 | self.dataset = dataset 37 | self.messageslog_grammar = self.__get_messageslog_grammar() 38 | 39 | @staticmethod 40 | def __get_messageslog_grammar(): 41 | """The definition of messages log grammar. 
Supported dataset: 42 | casper-rw 43 | dfrws-2009 44 | honeynet-challenge7 45 | 46 | Returns 47 | ------- 48 | messageslog_grammar : 49 | Grammar for messages log 50 | """ 51 | ints = Word(nums) 52 | 53 | # timestamp 54 | month = Word(string.ascii_uppercase, string.ascii_lowercase, exact=3) 55 | day = ints 56 | hour = Combine(ints + ':' + ints + ':' + ints) 57 | timestamp = month + day + hour 58 | 59 | # hostname, service name, message 60 | hostname = Word(alphas + nums + '_' + '-' + '.') 61 | service = Word(alphas + nums + '/' + '-' + '_' + '.' + '[' + ']' + ':') 62 | 63 | # unix time 64 | unix_time = Optional('[' + Word(nums + '.' + ']')) 65 | subservice = Optional(Word(alphas + nums + '_' + '-' + ':' + '[' + ']' + '.' + '=' + '(' + ')' + '*' + 66 | '<' + '>' + ',')) 67 | subservice_two_words = Optional(Word(alphas + nums + '_' + '-' + ':' + '[' + ']' + ',' + '.' + '=' + '/' + 68 | '(' + ')' + '*')) 69 | message = Optional(Regex('.*')) 70 | 71 | # messages log grammar 72 | messageslog_grammar = timestamp + hostname + service + unix_time + subservice + subservice_two_words + message 73 | return messageslog_grammar 74 | 75 | def parse_log(self, log_line): 76 | """Parse messages log based on defined grammar. 77 | 78 | Parameters 79 | ---------- 80 | log_line : str 81 | A log line to be parsed. 82 | 83 | Returns 84 | ------- 85 | parsed : dict[str, str] 86 | A parsed messages log containing these elements: timestamp, hostname, service, unix_time, 87 | subservice and message. 88 | """ 89 | parsed_messageslog = self.messageslog_grammar.parseString(log_line) 90 | 91 | # get parsed kernel log 92 | parsed = OrderedDict() 93 | parsed['timestamp'] = parsed_messageslog[0] + ' ' + parsed_messageslog[1] + ' ' + parsed_messageslog[2] 94 | parsed['hostname'] = parsed_messageslog[3] 95 | parsed['service'] = parsed_messageslog[4] 96 | 97 | if len(parsed_messageslog) == 6: 98 | parsed['unix_time'] = '' 99 | parsed['subservice'] = '' 100 | if not parsed['service'].endswith(':'): 101 | parsed['message'] = parsed['service'] + ' ' + parsed_messageslog[5] 102 | parsed['service'] = '' 103 | else: 104 | parsed['message'] = '' 105 | 106 | elif len(parsed_messageslog) == 7: 107 | parsed['unix_time'] = '' 108 | parsed['subservice'] = '' 109 | if not parsed_messageslog[5].endswith(':'): 110 | parsed['message'] = parsed_messageslog[5] + ' ' + parsed_messageslog[6] 111 | 112 | if not parsed['service'].endswith(':'): 113 | parsed['message'] = parsed['service'] + ' ' + parsed_messageslog[5] + ' ' + parsed_messageslog[6] 114 | parsed['service'] = '' 115 | 116 | elif len(parsed_messageslog) == 8: 117 | parsed['unix_time'] = '' 118 | parsed['subservice'] = '' 119 | 120 | # no message 121 | if parsed_messageslog[7] == '' and parsed_messageslog[5].startswith('[') and \ 122 | parsed_messageslog[6].endswith(']'): 123 | parsed['unix_time'] = GrammarUtility.get_unix_timestamp(parsed_messageslog[5], parsed_messageslog[6]) 124 | parsed['message'] = '' 125 | 126 | # message exists 127 | elif parsed_messageslog[7] != '' and parsed_messageslog[5].startswith('[') and \ 128 | parsed_messageslog[6].endswith(']'): 129 | parsed['unix_time'] = GrammarUtility.get_unix_timestamp(parsed_messageslog[5], parsed_messageslog[6]) 130 | parsed['message'] = parsed_messageslog[7] 131 | 132 | # subservice one word 133 | elif parsed_messageslog[5].endswith(':') and not parsed_messageslog[6].endswith(':'): 134 | parsed['subservice'] = parsed_messageslog[5] 135 | parsed['message'] = ' '.join(parsed_messageslog[6:]) 136 | 137 | # subservice two words 138 | 
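            # (first token without a trailing ':', second token ending with ':')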
elif not parsed_messageslog[5].endswith(':') and parsed_messageslog[6].endswith(':'): 139 | parsed['subservice'] = parsed_messageslog[5] + ' ' + parsed_messageslog[6] 140 | parsed['message'] = parsed_messageslog[7] 141 | 142 | # subservice two words 143 | elif parsed_messageslog[5].endswith(':') and parsed_messageslog[6].endswith(':'): 144 | parsed['subservice'] = parsed_messageslog[5] + ' ' + parsed_messageslog[6] 145 | parsed['message'] = parsed_messageslog[7] 146 | 147 | # no timestamp, no subservice, just message 148 | else: 149 | parsed['message'] = ' '.join(parsed_messageslog[5:]) 150 | 151 | if not parsed['subservice'].endswith(':'): 152 | parsed['message'] = parsed['subservice'] + ' ' + parsed['message'] 153 | parsed['subservice'] = '' 154 | 155 | if not parsed['service'].endswith(':'): 156 | parsed['message'] = parsed['service'] + ' ' + parsed['message'] 157 | parsed['service'] = '' 158 | 159 | else: 160 | parsed['unix_time'] = GrammarUtility.get_unix_timestamp(parsed_messageslog[5], parsed_messageslog[6]) 161 | 162 | # subservice one word 163 | if parsed_messageslog[7].endswith(':') and not parsed_messageslog[8].endswith(':'): 164 | parsed['subservice'] = parsed_messageslog[7] 165 | parsed['message'] = ' '.join(parsed_messageslog[8:]) 166 | 167 | # subservice two words 168 | elif not parsed_messageslog[7].endswith(':') and parsed_messageslog[8].endswith(':'): 169 | parsed['subservice'] = parsed_messageslog[7] + ' ' + parsed_messageslog[8] 170 | parsed['message'] = ' '.join(parsed_messageslog[9:]) 171 | 172 | # subservice two words 173 | elif parsed_messageslog[7].endswith(':') and parsed_messageslog[8].endswith(':'): 174 | parsed['subservice'] = parsed_messageslog[7] + ' ' + parsed_messageslog[8] 175 | parsed['message'] = ' '.join(parsed_messageslog[9:]) 176 | 177 | else: 178 | parsed['subservice'] = '' 179 | parsed['message'] = ' '.join(parsed_messageslog[7:]) 180 | 181 | if not parsed['subservice'].endswith(':'): 182 | parsed['message'] = parsed['subservice'] + ' ' + parsed['message'] 183 | parsed['subservice'] = '' 184 | 185 | return parsed 186 | 187 | 188 | class MessagesLog2(object): 189 | def __init__(self, dataset): 190 | """Constructor for class MessagesLog. This parser also supports syslog. 191 | 192 | Parameters 193 | ---------- 194 | dataset : str 195 | Dataset name. 196 | """ 197 | self.dataset = dataset 198 | self.messageslog_grammar = self.__get_messageslog_grammar() 199 | 200 | @staticmethod 201 | def __get_messageslog_grammar(): 202 | """The definition of messages log grammar. Supported dataset: 203 | honeynet-challenge5 204 | 205 | Returns 206 | ------- 207 | messageslog_grammar : 208 | Grammar for messages log 209 | """ 210 | ints = Word(nums) 211 | 212 | # timestamp 213 | month = Word(string.ascii_uppercase, string.ascii_lowercase, exact=3) 214 | day = ints 215 | hour = Combine(ints + ':' + ints + ':' + ints) 216 | timestamp = month + day + hour 217 | 218 | # hostname, service name, message 219 | hostname = Word(alphas + nums + '_' + '-' + '.') 220 | service = Word(alphas + nums + '/' + '-' + '_' + '.' + '[' + ']' + ':') + Optional(':') 221 | 222 | # unix time 223 | unix_time = Optional('[' + Word(nums + '.' + ']')) 224 | subservice = Optional(Word(alphas + nums + '_' + '-' + ':')) 225 | subservice_two_words = Optional(Word(alphas + nums + '_' + '-' + ':' + '[' + ']' + ',' + '.' 
+ '=' + '/' + 226 | '(' + ')' + '*')) 227 | message = Regex('.*') 228 | 229 | # messages log grammar 230 | messageslog_grammar = timestamp + hostname + service + unix_time + subservice + subservice_two_words + message 231 | return messageslog_grammar 232 | 233 | def parse_log(self, log_line): 234 | """Parse messages log based on defined grammar. 235 | 236 | Parameters 237 | ---------- 238 | log_line : str 239 | A log line to be parsed. 240 | 241 | Returns 242 | ------- 243 | parsed : dict[str, str] 244 | A parsed messages log containing these elements: timestamp, hostname, service, unix_time, 245 | subservice and message. 246 | """ 247 | parsed_messageslog = self.messageslog_grammar.parseString(log_line) 248 | 249 | # get parsed kernel log 250 | parsed = OrderedDict() 251 | parsed['timestamp'] = parsed_messageslog[0] + ' ' + parsed_messageslog[1] + ' ' + parsed_messageslog[2] 252 | parsed['hostname'] = parsed_messageslog[3] 253 | parsed['service'] = parsed_messageslog[4] 254 | 255 | if len(parsed_messageslog) == 6: 256 | parsed['unix_time'] = '' 257 | parsed['subservice'] = '' 258 | if not parsed['service'].endswith(':'): 259 | parsed['message'] = parsed['service'] + ' ' + parsed_messageslog[5] 260 | parsed['service'] = '' 261 | else: 262 | parsed['message'] = '' 263 | 264 | elif len(parsed_messageslog) == 7: 265 | parsed['unix_time'] = '' 266 | parsed['subservice'] = '' 267 | if not parsed_messageslog[5].endswith(':'): 268 | parsed['message'] = parsed_messageslog[5] + ' ' + parsed_messageslog[6] 269 | 270 | if not parsed['service'].endswith(':'): 271 | parsed['message'] = parsed['service'] + ' ' + parsed_messageslog[5] + ' ' + parsed_messageslog[6] 272 | parsed['service'] = '' 273 | 274 | elif len(parsed_messageslog) == 8: 275 | parsed['unix_time'] = '' 276 | parsed['subservice'] = '' 277 | 278 | # timestamp exists 279 | if parsed_messageslog[5].startswith('[') and parsed_messageslog[5].endswith(']'): 280 | parsed['unix_time'] = GrammarUtility.get_unix_timestamp(parsed_messageslog[5], parsed_messageslog[6]) 281 | parsed['message'] = parsed_messageslog[7] 282 | 283 | # no timestamp, no subservice, just message 284 | else: 285 | parsed['message'] = ' '.join(parsed_messageslog[5:]) 286 | 287 | else: 288 | if parsed_messageslog[5] == ':': 289 | parsed['service'] = parsed['service'] + ' ' + parsed_messageslog[5] 290 | parsed['unix_time'] = GrammarUtility.get_unix_timestamp(parsed_messageslog[6], parsed_messageslog[7]) 291 | 292 | # subservice one word 293 | if parsed_messageslog[8].endswith(':') and not parsed_messageslog[9].endswith(':'): 294 | parsed['subservice'] = parsed_messageslog[8] 295 | parsed['message'] = ' '.join(parsed_messageslog[9:]) 296 | 297 | # subservice two words 298 | elif not parsed_messageslog[8].endswith(':') and parsed_messageslog[9].endswith(':'): 299 | parsed['subservice'] = parsed_messageslog[8] + ' ' + parsed_messageslog[9] 300 | parsed['message'] = ' '.join(parsed_messageslog[10:]) 301 | 302 | # subservice two words 303 | elif parsed_messageslog[8].endswith(':') and parsed_messageslog[9].endswith(':'): 304 | parsed['subservice'] = parsed_messageslog[8] + ' ' + parsed_messageslog[9] 305 | parsed['message'] = ' '.join(parsed_messageslog[10:]) 306 | 307 | # no subservice, only message 308 | else: 309 | parsed['subservice'] = '' 310 | parsed['message'] = ' '.join(parsed_messageslog[8:]) 311 | 312 | else: 313 | parsed['unix_time'] = GrammarUtility.get_unix_timestamp(parsed_messageslog[5], parsed_messageslog[6]) 314 | 315 | # subservice one word 316 | if 
parsed_messageslog[7].endswith(':') and not parsed_messageslog[8].endswith(':'): 317 | parsed['subservice'] = parsed_messageslog[7] 318 | parsed['message'] = ' '.join(parsed_messageslog[8:]) 319 | 320 | # subservice two words 321 | elif not parsed_messageslog[7].endswith(':') and parsed_messageslog[8].endswith(':'): 322 | parsed['subservice'] = parsed_messageslog[7] + ' ' + parsed_messageslog[8] 323 | parsed['message'] = ' '.join(parsed_messageslog[9:]) 324 | 325 | # subservice two words 326 | elif parsed_messageslog[7].endswith(':') and parsed_messageslog[8].endswith(':'): 327 | parsed['subservice'] = parsed_messageslog[7] + ' ' + parsed_messageslog[8] 328 | parsed['message'] = ' '.join(parsed_messageslog[9:]) 329 | 330 | # no subservice, only message 331 | else: 332 | parsed['subservice'] = '' 333 | parsed['message'] = ' '.join(parsed_messageslog[7:]) 334 | 335 | if not parsed['subservice'].endswith(':'): 336 | parsed['message'] = parsed['subservice'] + ' ' + parsed['message'] 337 | parsed['subservice'] = '' 338 | 339 | return parsed 340 | 341 | 342 | class Main(object): 343 | def __init__(self, datasets): 344 | self.datasets = datasets 345 | self.dataset_path = '/home/hudan/Git/prlogparser/datasets/' 346 | self.groups = { 347 | 'group1': ['casper-rw', 'dfrws-2009-jhuisi', 'dfrws-2009-nssal', 'honeynet-challenge7'], 348 | 'group2': ['honeynet-challenge5'] 349 | } 350 | 351 | def run(self): 352 | # parse messages.log 353 | for group_name, group in self.groups.items(): 354 | # setup test csv file to save results 355 | base_name = '/home/hudan/Git/prlogparser/groundtruth/messages-' 356 | test_file = base_name + group_name + '.csv' 357 | f = open(test_file, 'w', newline='') 358 | writer = csv.writer(f) 359 | 360 | for dataset in group: 361 | # get grammar 362 | dl = None 363 | if group_name == 'group1': 364 | dl = MessagesLog1(dataset) 365 | 366 | elif group_name == 'group2': 367 | dl = MessagesLog2(dataset) 368 | 369 | # start parsing 370 | for filename in self.datasets[dataset]: 371 | filename = os.path.join(self.dataset_path, filename) 372 | with open(filename, 'r') as f: 373 | for line in f: 374 | # get parsed line and print 375 | parsed_line = dl.parse_log(line) 376 | print(parsed_line) 377 | 378 | # write to csv 379 | row = list(parsed_line.values()) 380 | writer.writerow(row) 381 | 382 | f.close() 383 | 384 | 385 | if __name__ == '__main__': 386 | datasets_files = { 387 | 'casper-rw': [ 388 | 'casper-rw/messages', 389 | 'casper-rw/syslog', 390 | 'casper-rw/syslog.0', 391 | 'casper-rw/syslog.1', 392 | 'casper-rw/syslog.2', 393 | 'casper-rw/syslog.3' 394 | ], 395 | 'dfrws-2009-jhuisi': [ 396 | 'dfrws-2009-jhuisi/messages', 397 | 'dfrws-2009-jhuisi/messages.0', 398 | 'dfrws-2009-jhuisi/messages.1', 399 | 'dfrws-2009-jhuisi/syslog', 400 | 'dfrws-2009-jhuisi/syslog.0', 401 | 'dfrws-2009-jhuisi/syslog.1', 402 | 'dfrws-2009-jhuisi/syslog.2' 403 | ], 404 | 'dfrws-2009-nssal': [ 405 | 'dfrws-2009-nssal/messages', 406 | 'dfrws-2009-nssal/messages.0', 407 | 'dfrws-2009-nssal/messages.1', 408 | 'dfrws-2009-nssal/messages.2', 409 | 'dfrws-2009-nssal/messages.3', 410 | 'dfrws-2009-nssal/syslog', 411 | 'dfrws-2009-nssal/syslog.0', 412 | 'dfrws-2009-nssal/syslog.1', 413 | 'dfrws-2009-nssal/syslog.2', 414 | 'dfrws-2009-nssal/syslog.3', 415 | 'dfrws-2009-nssal/syslog.4', 416 | 'dfrws-2009-nssal/syslog.5', 417 | 'dfrws-2009-nssal/syslog.6' 418 | ], 419 | 'honeynet-challenge5': ['honeynet-challenge5/messages'], 420 | 'honeynet-challenge7': [ 421 | 'honeynet-challenge7/messages', 422 | 
'honeynet-challenge7/syslog' 423 | ] 424 | } 425 | 426 | main = Main(datasets_files) 427 | main.run() 428 | -------------------------------------------------------------------------------- /nerlogparser/grammar/proxifierlog.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | from pyparsing import Word, Combine, nums, alphas, Optional, Regex 4 | from collections import OrderedDict 5 | 6 | 7 | class ProxifierLog(object): 8 | def __init__(self, dataset): 9 | self.dataset = dataset 10 | self.proxifierlog_grammar = self.__get_proxifierlog_grammar() 11 | 12 | @staticmethod 13 | def __get_proxifierlog_grammar(): 14 | # get proxifier grammar 15 | ints = Word(nums) 16 | 17 | date = Combine('[' + ints + '.' + ints) 18 | time = Combine(ints + ':' + ints + ':' + ints + ']') 19 | timestamp = date + time 20 | 21 | service = Word(alphas + nums + '.' + '-' + '_') 22 | arch = Optional(Word('*' + nums)) 23 | domain_or_ip = Optional(Word('-')) + Word(alphas + nums + '.' + ':' + '-') 24 | status = Optional(Word(alphas + ',')) + Optional(':') 25 | message = Regex('.*') 26 | 27 | proxifierlog_grammar = timestamp + service + arch + domain_or_ip + status + message 28 | return proxifierlog_grammar 29 | 30 | def parse_log(self, log_line): 31 | # parse proxifier log entries 32 | parsed_proxifierlog = self.proxifierlog_grammar.parseString(log_line) 33 | 34 | parsed = OrderedDict() 35 | parsed['timestamp'] = parsed_proxifierlog[0] + ' ' + parsed_proxifierlog[1] 36 | parsed['service'] = parsed_proxifierlog[2] 37 | 38 | if len(parsed_proxifierlog) == 6: 39 | parsed['service'] = parsed_proxifierlog[2] + ' ' + parsed_proxifierlog[3] 40 | parsed['arch'] = '' 41 | parsed['domain_or_ip'] = '' 42 | parsed['status'] = '' 43 | parsed['message'] = ' '.join(parsed_proxifierlog[4:]) 44 | 45 | elif len(parsed_proxifierlog) == 7: 46 | parsed['arch'] = '' 47 | parsed['domain_or_ip'] = parsed_proxifierlog[3] + ' ' + parsed_proxifierlog[4] 48 | 49 | if parsed_proxifierlog[5].endswith(','): 50 | parsed['status'] = parsed_proxifierlog[5] 51 | parsed['message'] = parsed_proxifierlog[6] 52 | else: 53 | parsed['status'] = '' 54 | parsed['message'] = ' '.join(parsed_proxifierlog[5:]) 55 | 56 | elif len(parsed_proxifierlog) == 8: 57 | if parsed_proxifierlog[3].startswith('*'): 58 | parsed['arch'] = parsed_proxifierlog[3] 59 | parsed['domain_or_ip'] = parsed_proxifierlog[4] + ' ' + parsed_proxifierlog[5] 60 | 61 | if parsed_proxifierlog[6].endswith(','): 62 | parsed['status'] = parsed_proxifierlog[6] 63 | parsed['message'] = parsed_proxifierlog[7] 64 | else: 65 | parsed['status'] = '' 66 | parsed['message'] = ' '.join(parsed_proxifierlog[6:]) 67 | 68 | else: 69 | parsed['arch'] = '' 70 | parsed['domain_or_ip'] = parsed_proxifierlog[3] + ' ' + parsed_proxifierlog[4] 71 | 72 | if parsed_proxifierlog[6] == ':': 73 | parsed['status'] = parsed_proxifierlog[5] + ' ' + parsed_proxifierlog[6] 74 | parsed['message'] = parsed_proxifierlog[7] 75 | else: 76 | parsed['status'] = '' 77 | parsed['message'] = ' '.join(parsed_proxifierlog[5:]) 78 | 79 | elif len(parsed_proxifierlog) == 9: 80 | parsed['arch'] = parsed_proxifierlog[3] 81 | parsed['domain_or_ip'] = parsed_proxifierlog[4] + ' ' + parsed_proxifierlog[5] 82 | parsed['status'] = parsed_proxifierlog[6] + ' ' + parsed_proxifierlog[7] 83 | parsed['message'] = parsed_proxifierlog[8] 84 | 85 | return parsed 86 | 87 | 88 | if __name__ == '__main__': 89 | dataset_path = '/home/hudan/Git/prlogparser/datasets/proxifier/' 90 | filenames = 
['proxifier.log'] 91 | 92 | test_file = '/home/hudan/Git/prlogparser/groundtruth/test-results/proxifier-test.csv' 93 | f = open(test_file, 'w', newline='') 94 | writer = csv.writer(f) 95 | 96 | pl = ProxifierLog('') 97 | for filename in filenames: 98 | filename = os.path.join(dataset_path, filename) 99 | with open(filename, 'r') as f: 100 | for line in f: 101 | parsed_line = pl.parse_log(line) 102 | print(parsed_line) 103 | 104 | row = list(parsed_line.values()) 105 | writer.writerow(row) 106 | 107 | f.close() 108 | -------------------------------------------------------------------------------- /nerlogparser/grammar/weblog.py: -------------------------------------------------------------------------------- 1 | # httpServerLogParser.py 2 | # 3 | # Copyright (c) 2016, Paul McGuire 4 | # 5 | 6 | import os 7 | import csv 8 | import string 9 | from pyparsing import alphas, nums, dblQuotedString, Combine, Word, Group, delimitedList 10 | from collections import OrderedDict 11 | 12 | 13 | class WebLog(object): 14 | """This class is based on httpServerLogParser.py by Paul McGuire. 15 | http://pyparsing.wikispaces.com/file/detail/httpServerLogParser.py 16 | 17 | """ 18 | def __init__(self, dataset): 19 | self.dataset = dataset 20 | self.weblog_grammar = self.__get_weblog_grammar() 21 | 22 | @staticmethod 23 | def __get_weblog_grammar(): 24 | integer = Word(nums) 25 | ip_address = delimitedList(integer, ".", combine=True) 26 | time_zone_offset = Word("+-", nums) 27 | month = Word(string.ascii_uppercase, string.ascii_lowercase, exact=3) 28 | server_date_time = Group(Combine("[" + integer + "/" + month + "/" + integer + 29 | ":" + integer + ":" + integer + ":" + integer) + 30 | Combine(time_zone_offset + "]")) 31 | 32 | weblog_grammar = (ip_address.setResultsName("ip_address") + 33 | Word("-").setResultsName("dash") + 34 | ("-" | Word(alphas + nums + "@._")).setResultsName("auth") + 35 | server_date_time.setResultsName("timestamp") + 36 | dblQuotedString.setResultsName("command") + 37 | (integer | "-").setResultsName("status_code") + 38 | (integer | "-").setResultsName("num_bytes") + 39 | dblQuotedString.setResultsName("referrer") + 40 | dblQuotedString.setResultsName("client_agent")) 41 | 42 | return weblog_grammar 43 | 44 | def parse_log(self, log_line): 45 | parsed_weblog = self.weblog_grammar.parseString(log_line) 46 | 47 | parsed = OrderedDict() 48 | parsed['ip_address'] = parsed_weblog.ip_address 49 | parsed['dash'] = parsed_weblog.dash 50 | parsed['auth'] = parsed_weblog.auth 51 | parsed['timestamp'] = ' '.join(parsed_weblog.timestamp[0:2]) 52 | parsed['command'] = parsed_weblog.command 53 | parsed['status_code'] = parsed_weblog.status_code 54 | parsed['num_bytes'] = parsed_weblog.num_bytes 55 | parsed['referrer'] = parsed_weblog.referrer 56 | parsed['client_agent'] = parsed_weblog.client_agent 57 | 58 | return parsed 59 | 60 | @staticmethod 61 | def __get_filename(base_filename, month, day): 62 | # example: 63 | # access.log.2018-01-02 64 | # access.log.2018-01-01 65 | 66 | # check day format 67 | if day < 10: 68 | day = '0' + str(day) 69 | else: 70 | day = str(day) 71 | 72 | fn = base_filename + '0' + str(month) + '-' + day 73 | return fn 74 | 75 | def get_all_filenames(self): 76 | # setup variables 77 | base_filename = 'access.log.2018-' 78 | months = range(3, 6) 79 | day_odd = range(1, 32) 80 | day_even = range(1, 31) 81 | 82 | # get all filenames 83 | filenames = [] 84 | for month in months: 85 | if month % 2 == 0: 86 | for day in day_even: 87 | fn = self.__get_filename(base_filename, 
month, day) 88 | filenames.append(fn) 89 | else: 90 | for day in day_odd: 91 | fn = self.__get_filename(base_filename, month, day) 92 | filenames.append(fn) 93 | 94 | return filenames 95 | 96 | 97 | if __name__ == '__main__': 98 | wl = WebLog('') 99 | dataset_path = '/home/hudan/Git/prlogparser/datasets/secrepo-accesslog/' 100 | filenames_list = wl.get_all_filenames() 101 | 102 | test_file = '/home/hudan/Git/prlogparser/groundtruth/test-results/weblog-test.csv' 103 | f = open(test_file, 'w', newline='') 104 | writer = csv.writer(f) 105 | 106 | for filename in filenames_list: 107 | filename = os.path.join(dataset_path, filename) 108 | with open(filename, 'r') as f: 109 | for line in f: 110 | parsed_line = wl.parse_log(line) 111 | print(parsed_line) 112 | 113 | row = list(parsed_line.values()) 114 | writer.writerow(row) 115 | 116 | f.close() 117 | -------------------------------------------------------------------------------- /nerlogparser/grammar/zookeeperlog.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | from pyparsing import Word, Combine, nums, alphas, Regex, Optional 4 | from collections import OrderedDict 5 | 6 | 7 | class ZookeeperLog(object): 8 | def __init__(self, dataset): 9 | self.dataset = dataset 10 | self.zookeeperlog_grammar = self.__get_zookeeperlog_grammar() 11 | 12 | @staticmethod 13 | def __get_zookeeperlog_grammar(): 14 | ints = Word(nums) 15 | 16 | date = Combine(ints + '-' + ints + '-' + ints) 17 | time = Combine(ints + ':' + ints + ':' + ints + ',' + ints) 18 | timestamp = date + time 19 | 20 | dash = Word('-') 21 | status = Word(alphas) 22 | job = Word(alphas + nums + '[]:@=/.$()-') + Optional(Word(alphas + nums + '[]:@=/.$()-')) + Optional(Word('-')) 23 | message = Regex('.*') 24 | 25 | zookeperlog_grammar = timestamp('timestamp') + dash('dash') + status('status') + job('job') + message('message') 26 | return zookeperlog_grammar 27 | 28 | def parse_log(self, log_line): 29 | parsed_zookeeperlog = self.zookeeperlog_grammar.parseString(log_line) 30 | 31 | parsed = OrderedDict() 32 | parsed['timestamp'] = ' '.join(parsed_zookeeperlog.timestamp) 33 | parsed['dash'] = parsed_zookeeperlog.dash 34 | parsed['status'] = parsed_zookeeperlog.status 35 | parsed['job'] = ' '.join(parsed_zookeeperlog.job) 36 | parsed['message'] = parsed_zookeeperlog.message 37 | 38 | return parsed 39 | 40 | 41 | if __name__ == '__main__': 42 | dataset_path = '/home/hudan/Git/prlogparser/datasets/zookeeper/' 43 | filenames = ['zookeeper.log'] 44 | 45 | test_file = '/home/hudan/Git/prlogparser/groundtruth/test-results/zookeeper-test.csv' 46 | f = open(test_file, 'w', newline='') 47 | writer = csv.writer(f) 48 | 49 | zl = ZookeeperLog('') 50 | for filename in filenames: 51 | filename = os.path.join(dataset_path, filename) 52 | with open(filename, 'r') as f: 53 | for line in f: 54 | parsed_line = zl.parse_log(line) 55 | print(parsed_line) 56 | 57 | row = list(parsed_line.values()) 58 | writer.writerow(row) 59 | 60 | f.close() 61 | -------------------------------------------------------------------------------- /nerlogparser/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/studiawan/nerlogparser/4dc3d955f735ea5496557ee76378a38b5746e425/nerlogparser/model/__init__.py -------------------------------------------------------------------------------- /nerlogparser/model/base_model.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import tensorflow as tf 3 | 4 | 5 | class BaseModel(object): 6 | """Generic class for general methods that are not specific to NER""" 7 | 8 | def __init__(self, config): 9 | """Defines self.config and self.logger 10 | 11 | Args: 12 | config: (Config instance) class with hyper parameters, 13 | vocab and embeddings 14 | 15 | """ 16 | self.config = config 17 | self.logger = config.logger 18 | self.sess = None 19 | self.saver = None 20 | 21 | def reinitialize_weights(self, scope_name): 22 | """Reinitializes the weights of a given layer""" 23 | variables = tf.contrib.framework.get_variables(scope_name) 24 | init = tf.variables_initializer(variables) 25 | self.sess.run(init) 26 | 27 | def add_train_op(self, lr_method, lr, loss, clip=-1): 28 | """Defines self.train_op that performs an update on a batch 29 | 30 | Args: 31 | lr_method: (string) sgd method, for example "adam" 32 | lr: (tf.placeholder) tf.float32, learning rate 33 | loss: (tensor) tf.float32 loss to minimize 34 | clip: (python float) clipping of gradient. If < 0, no clipping 35 | 36 | """ 37 | _lr_m = lr_method.lower() # lower to make sure 38 | 39 | with tf.variable_scope("train_step"): 40 | if _lr_m == 'adam': # sgd method 41 | optimizer = tf.train.AdamOptimizer(lr) 42 | elif _lr_m == 'adagrad': 43 | optimizer = tf.train.AdagradOptimizer(lr) 44 | elif _lr_m == 'sgd': 45 | optimizer = tf.train.GradientDescentOptimizer(lr) 46 | elif _lr_m == 'rmsprop': 47 | optimizer = tf.train.RMSPropOptimizer(lr) 48 | else: 49 | raise NotImplementedError("Unknown method {}".format(_lr_m)) 50 | 51 | if clip > 0: # gradient clipping if clip is positive 52 | grads, vs = zip(*optimizer.compute_gradients(loss)) 53 | grads, gnorm = tf.clip_by_global_norm(grads, clip) 54 | self.train_op = optimizer.apply_gradients(zip(grads, vs)) 55 | else: 56 | self.train_op = optimizer.minimize(loss) 57 | 58 | def initialize_session(self): 59 | """Defines self.sess and initialize the variables""" 60 | self.logger.info("Initializing tf session") 61 | self.sess = tf.Session() 62 | self.sess.run(tf.global_variables_initializer()) 63 | self.saver = tf.train.Saver() 64 | 65 | def restore_session(self, dir_model): 66 | """Reload weights into session 67 | 68 | Args: 69 | sess: tf.Session() 70 | dir_model: dir with weights 71 | 72 | """ 73 | self.logger.info("Reloading the latest trained model...") 74 | self.saver.restore(self.sess, dir_model) 75 | 76 | def save_session(self): 77 | """Saves session = weights""" 78 | if not os.path.exists(self.config.dir_model): 79 | os.makedirs(self.config.dir_model) 80 | self.saver.save(self.sess, self.config.dir_model) 81 | 82 | def close_session(self): 83 | """Closes the session""" 84 | self.sess.close() 85 | 86 | def add_summary(self): 87 | """Defines variables for Tensorboard 88 | 89 | Args: 90 | dir_output: (string) where the results are written 91 | 92 | """ 93 | self.merged = tf.summary.merge_all() 94 | self.file_writer = tf.summary.FileWriter(self.config.dir_output, 95 | self.sess.graph) 96 | 97 | def train(self, train, dev): 98 | """Performs training with early stopping and lr exponential decay 99 | 100 | Args: 101 | train: dataset that yields tuple of (sentences, tags) 102 | dev: dataset 103 | 104 | """ 105 | best_score = 0 106 | nepoch_no_imprv = 0 # for early stopping 107 | self.add_summary() # tensorboard 108 | 109 | for epoch in range(self.config.nepochs): 110 | self.logger.info("Epoch {:} out of {:}".format(epoch + 1, 111 | 
self.config.nepochs)) 112 | 113 | score = self.run_epoch(train, dev, epoch) 114 | self.config.lr *= self.config.lr_decay # decay learning rate 115 | 116 | # early stopping and saving best parameters 117 | if score >= best_score: 118 | nepoch_no_imprv = 0 119 | self.save_session() 120 | best_score = score 121 | self.logger.info("- new best score!") 122 | else: 123 | nepoch_no_imprv += 1 124 | if nepoch_no_imprv >= self.config.nepoch_no_imprv: 125 | self.logger.info("- early stopping {} epochs without "\ 126 | "improvement".format(nepoch_no_imprv)) 127 | break 128 | 129 | def evaluate(self, test): 130 | """Evaluate model on test set 131 | 132 | Args: 133 | test: instance of class Dataset 134 | 135 | """ 136 | self.logger.info("Testing model over test set") 137 | metrics = self.run_evaluate(test) 138 | msg = " - ".join(["{} {:04.2f}".format(k, v) 139 | for k, v in metrics.items()]) 140 | self.logger.info(msg) 141 | -------------------------------------------------------------------------------- /nerlogparser/model/build_data.py: -------------------------------------------------------------------------------- 1 | from nerlogparser.model.config import Config 2 | from nerlogparser.model.data_utils import CoNLLDataset, get_vocabs, UNK, NUM, \ 3 | get_glove_vocab, write_vocab, load_vocab, get_char_vocab, \ 4 | export_trimmed_glove_vectors, get_processing_word 5 | 6 | 7 | def main(): 8 | """Procedure to build data 9 | 10 | You MUST RUN this procedure. It iterates over the whole dataset (train, 11 | dev and test) and extract the vocabularies in terms of words, tags, and 12 | characters. Having built the vocabularies it writes them in a file. The 13 | writing of vocabulary in a file assigns an id (the line #) to each word. 14 | It then extract the relevant GloVe vectors and stores them in a np array 15 | such that the i-th entry corresponds to the i-th word in the vocabulary. 16 | 17 | 18 | Args: 19 | config: (instance of Config) has attributes like hyper-params... 
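    Typical invocation (illustrative; assumes the package is installed and the
    GloVe file referenced in Config is present):

        python -m nerlogparser.model.build_data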
20 | 21 | """ 22 | # get config and processing of words 23 | config = Config(load=False) 24 | processing_word = get_processing_word(lowercase=True) 25 | 26 | # Generators 27 | dev = CoNLLDataset(config.filename_dev, processing_word) 28 | test = CoNLLDataset(config.filename_test, processing_word) 29 | train = CoNLLDataset(config.filename_train, processing_word) 30 | 31 | # Build Word and Tag vocab 32 | vocab_words, vocab_tags = get_vocabs([train, dev, test]) 33 | vocab_glove = get_glove_vocab(config.filename_glove) 34 | 35 | vocab = vocab_words & vocab_glove 36 | vocab.add(UNK) 37 | vocab.add(NUM) 38 | 39 | # Save vocab 40 | write_vocab(vocab, config.filename_words) 41 | write_vocab(vocab_tags, config.filename_tags) 42 | 43 | # Trim GloVe Vectors 44 | vocab = load_vocab(config.filename_words) 45 | export_trimmed_glove_vectors(vocab, config.filename_glove, 46 | config.filename_trimmed, config.dim_word) 47 | 48 | # Build and save char vocab 49 | train = CoNLLDataset(config.filename_train) 50 | vocab_chars = get_char_vocab(train) 51 | write_vocab(vocab_chars, config.filename_chars) 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /nerlogparser/model/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | from nerlogparser.model.general_utils import get_logger 5 | from nerlogparser.model.data_utils import get_trimmed_glove_vectors, load_vocab, \ 6 | get_processing_word 7 | 8 | 9 | class Config(): 10 | def __init__(self, load=True): 11 | """Initialize hyperparameters and load vocabs 12 | 13 | Args: 14 | load_embeddings: (bool) if True, load embeddings into 15 | np array, else None 16 | 17 | """ 18 | # directory for training outputs 19 | if not os.path.exists(self.dir_output): 20 | os.makedirs(self.dir_output) 21 | 22 | # create instance of logger 23 | self.logger = get_logger(self.path_log) 24 | 25 | # load if requested (default) 26 | if load: 27 | self.load() 28 | 29 | 30 | def load(self): 31 | """Loads vocabulary, processing functions and embeddings 32 | 33 | Supposes that build_data.py has been run successfully and that 34 | the corresponding files have been created (vocab and trimmed GloVe 35 | vectors) 36 | 37 | """ 38 | # 1. vocabulary 39 | self.vocab_words = load_vocab(self.filename_words) 40 | self.vocab_tags = load_vocab(self.filename_tags) 41 | self.vocab_chars = load_vocab(self.filename_chars) 42 | 43 | self.nwords = len(self.vocab_words) 44 | self.nchars = len(self.vocab_chars) 45 | self.ntags = len(self.vocab_tags) 46 | 47 | # 2. get processing functions that map str -> id 48 | self.processing_word = get_processing_word(self.vocab_words, 49 | self.vocab_chars, lowercase=True, chars=self.use_chars) 50 | self.processing_tag = get_processing_word(self.vocab_tags, 51 | lowercase=False, allow_unk=False) 52 | 53 | # 3. 
get pre-trained embeddings 54 | self.embeddings = (get_trimmed_glove_vectors(self.filename_trimmed) 55 | if self.use_pretrained else None) 56 | 57 | 58 | # general config 59 | file_path = os.path.dirname(os.path.realpath(__file__)) 60 | dir_output = os.path.join(file_path, '..', "results/test/") 61 | dir_model = dir_output + "model.weights/" 62 | path_log = dir_output + "log.txt" 63 | 64 | # embeddings 65 | dim_word = 300 66 | dim_char = 100 67 | 68 | # glove files 69 | filename_glove = "data/glove.6B/glove.6B.{}d.txt".format(dim_word) 70 | # trimmed embeddings (created from glove_filename with build_data.py) 71 | filename_trimmed = os.path.join(file_path, '..', "data/glove.6B.{}d.trimmed.npz".format(dim_word)) 72 | use_pretrained = True 73 | 74 | # dataset 75 | filename_dev = "data/conll/conll.dev.txt" 76 | filename_test = "data/conll/conll.test.txt" 77 | filename_train = "data/conll/conll.train.txt" 78 | 79 | # filename_dev = filename_test = filename_train = "data/test.txt" # test 80 | 81 | max_iter = None # if not None, max number of examples in Dataset 82 | 83 | # vocab (created from dataset with build_data.py) 84 | filename_words = os.path.join(file_path, '..', "data/words.txt") 85 | filename_tags = os.path.join(file_path, '..', "data/tags.txt") 86 | filename_chars = os.path.join(file_path, '..', "data/chars.txt") 87 | 88 | # training 89 | train_embeddings = False 90 | nepochs = 15 91 | dropout = 0.5 92 | batch_size = 20 93 | lr_method = "adam" 94 | lr = 0.001 95 | lr_decay = 0.9 96 | clip = -1 # if negative, no clipping 97 | nepoch_no_imprv = 3 98 | 99 | # model hyperparameters 100 | hidden_size_char = 100 # lstm on chars 101 | hidden_size_lstm = 300 # lstm on word embeddings 102 | 103 | # NOTE: if both chars and crf, only 1.6x slower on GPU 104 | use_crf = False # if crf, training is 1.7x slower on CPU 105 | use_chars = True # if char embedding, training is 3.5x slower on CPU 106 | 107 | label_file = os.path.join(file_path, '..', "data/label.txt") 108 | -------------------------------------------------------------------------------- /nerlogparser/model/data_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | 4 | 5 | # shared global variables to be imported from model also 6 | UNK = "$UNK$" 7 | NUM = "$NUM$" 8 | NONE = "O" 9 | 10 | 11 | # special error message 12 | class MyIOError(Exception): 13 | def __init__(self, filename): 14 | # custom error message 15 | message = """ 16 | ERROR: Unable to locate file {}. 17 | 18 | FIX: Have you tried running python build_data.py first? 19 | This will build vocab file from your train, test and dev sets and 20 | trimm your word vectors. 
21 | """.format(filename) 22 | super(MyIOError, self).__init__(message) 23 | 24 | 25 | class CoNLLDataset(object): 26 | """Class that iterates over CoNLL Dataset 27 | 28 | __iter__ method yields a tuple (words, tags) 29 | words: list of raw words 30 | tags: list of raw tags 31 | 32 | If processing_word and processing_tag are not None, 33 | optional preprocessing is appplied 34 | 35 | Example: 36 | ```python 37 | data = CoNLLDataset(filename) 38 | for sentence, tags in data: 39 | pass 40 | ``` 41 | 42 | """ 43 | def __init__(self, filename, processing_word=None, processing_tag=None, 44 | max_iter=None): 45 | """ 46 | Args: 47 | filename: path to the file 48 | processing_words: (optional) function that takes a word as input 49 | processing_tags: (optional) function that takes a tag as input 50 | max_iter: (optional) max number of sentences to yield 51 | 52 | """ 53 | self.filename = filename 54 | self.processing_word = processing_word 55 | self.processing_tag = processing_tag 56 | self.max_iter = max_iter 57 | self.length = None 58 | 59 | 60 | def __iter__(self): 61 | niter = 0 62 | with open(self.filename) as f: 63 | words, tags = [], [] 64 | for line in f: 65 | line = line.strip() 66 | if (len(line) == 0 or line.startswith("-DOCSTART-")): 67 | if len(words) != 0: 68 | niter += 1 69 | if self.max_iter is not None and niter > self.max_iter: 70 | break 71 | yield words, tags 72 | words, tags = [], [] 73 | else: 74 | ls = line.split(' ') 75 | word, tag = ls[0],ls[1] 76 | if self.processing_word is not None: 77 | word = self.processing_word(word) 78 | if self.processing_tag is not None: 79 | tag = self.processing_tag(tag) 80 | words += [word] 81 | tags += [tag] 82 | 83 | 84 | def __len__(self): 85 | """Iterates once over the corpus to set and store length""" 86 | if self.length is None: 87 | self.length = 0 88 | for _ in self: 89 | self.length += 1 90 | 91 | return self.length 92 | 93 | 94 | def get_vocabs(datasets): 95 | """Build vocabulary from an iterable of datasets objects 96 | 97 | Args: 98 | datasets: a list of dataset objects 99 | 100 | Returns: 101 | a set of all the words in the dataset 102 | 103 | """ 104 | print("Building vocab...") 105 | vocab_words = set() 106 | vocab_tags = set() 107 | for dataset in datasets: 108 | for words, tags in dataset: 109 | vocab_words.update(words) 110 | vocab_tags.update(tags) 111 | print("- done. {} tokens".format(len(vocab_words))) 112 | return vocab_words, vocab_tags 113 | 114 | 115 | def get_char_vocab(dataset): 116 | """Build char vocabulary from an iterable of datasets objects 117 | 118 | Args: 119 | dataset: a iterator yielding tuples (sentence, tags) 120 | 121 | Returns: 122 | a set of all the characters in the dataset 123 | 124 | """ 125 | vocab_char = set() 126 | for words, _ in dataset: 127 | for word in words: 128 | vocab_char.update(word) 129 | 130 | return vocab_char 131 | 132 | 133 | def get_glove_vocab(filename): 134 | """Load vocab from file 135 | 136 | Args: 137 | filename: path to the glove vectors 138 | 139 | Returns: 140 | vocab: set() of strings 141 | """ 142 | print("Building vocab...") 143 | vocab = set() 144 | with open(filename) as f: 145 | for line in f: 146 | word = line.strip().split(' ')[0] 147 | vocab.add(word) 148 | print("- done. {} tokens".format(len(vocab))) 149 | return vocab 150 | 151 | 152 | def write_vocab(vocab, filename): 153 | """Writes a vocab to a file 154 | 155 | Writes one word per line. 
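    Illustrative usage (the exact file contents depend on the iteration order
    of the vocab set):

        write_vocab({"the", "$UNK$", "$NUM$"}, "data/words.txt")
        # data/words.txt then holds one token per line; load_vocab() later
        # maps each token back to its line index.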
156 | 157 | Args: 158 | vocab: iterable that yields word 159 | filename: path to vocab file 160 | 161 | Returns: 162 | write a word per line 163 | 164 | """ 165 | print("Writing vocab...") 166 | with open(filename, "w") as f: 167 | for i, word in enumerate(vocab): 168 | if i != len(vocab) - 1: 169 | f.write("{}\n".format(word)) 170 | else: 171 | f.write(word) 172 | print("- done. {} tokens".format(len(vocab))) 173 | 174 | 175 | def load_vocab(filename): 176 | """Loads vocab from a file 177 | 178 | Args: 179 | filename: (string) the format of the file must be one word per line. 180 | 181 | Returns: 182 | d: dict[word] = index 183 | 184 | """ 185 | try: 186 | d = dict() 187 | with open(filename) as f: 188 | for idx, word in enumerate(f): 189 | word = word.strip() 190 | d[word] = idx 191 | 192 | except IOError: 193 | raise MyIOError(filename) 194 | return d 195 | 196 | 197 | def export_trimmed_glove_vectors(vocab, glove_filename, trimmed_filename, dim): 198 | """Saves glove vectors in numpy array 199 | 200 | Args: 201 | vocab: dictionary vocab[word] = index 202 | glove_filename: a path to a glove file 203 | trimmed_filename: a path where to store a matrix in npy 204 | dim: (int) dimension of embeddings 205 | 206 | """ 207 | embeddings = np.zeros([len(vocab), dim]) 208 | with open(glove_filename) as f: 209 | for line in f: 210 | line = line.strip().split(' ') 211 | word = line[0] 212 | embedding = [float(x) for x in line[1:]] 213 | if word in vocab: 214 | word_idx = vocab[word] 215 | embeddings[word_idx] = np.asarray(embedding) 216 | 217 | np.savez_compressed(trimmed_filename, embeddings=embeddings) 218 | 219 | 220 | def get_trimmed_glove_vectors(filename): 221 | """ 222 | Args: 223 | filename: path to the npz file 224 | 225 | Returns: 226 | matrix of embeddings (np array) 227 | 228 | """ 229 | try: 230 | with np.load(filename) as data: 231 | return data["embeddings"] 232 | 233 | except IOError: 234 | raise MyIOError(filename) 235 | 236 | 237 | def get_processing_word(vocab_words=None, vocab_chars=None, 238 | lowercase=False, chars=False, allow_unk=True): 239 | """Return lambda function that transform a word (string) into list, 240 | or tuple of (list, id) of int corresponding to the ids of the word and 241 | its corresponding characters. 242 | 243 | Args: 244 | vocab: dict[word] = idx 245 | 246 | Returns: 247 | f("cat") = ([12, 4, 32], 12345) 248 | = (list of char ids, word id) 249 | 250 | """ 251 | def f(word): 252 | # 0. get chars of words 253 | if vocab_chars is not None and chars == True: 254 | char_ids = [] 255 | for char in word: 256 | # ignore chars out of vocabulary 257 | if char in vocab_chars: 258 | char_ids += [vocab_chars[char]] 259 | 260 | # 1. preprocess word 261 | if lowercase: 262 | word = word.lower() 263 | if word.isdigit(): 264 | word = NUM 265 | 266 | # 2. get id of word 267 | if vocab_words is not None: 268 | if word in vocab_words: 269 | word = vocab_words[word] 270 | else: 271 | if allow_unk: 272 | word = vocab_words[UNK] 273 | else: 274 | raise Exception("Unknow key is not allowed. Check that "\ 275 | "your vocab (tags?) is correct") 276 | 277 | # 3. 
return tuple char ids, word id 278 | if vocab_chars is not None and chars == True: 279 | return char_ids, word 280 | else: 281 | return word 282 | 283 | return f 284 | 285 | 286 | def _pad_sequences(sequences, pad_tok, max_length): 287 | """ 288 | Args: 289 | sequences: a generator of list or tuple 290 | pad_tok: the char to pad with 291 | 292 | Returns: 293 | a list of list where each sublist has same length 294 | """ 295 | sequence_padded, sequence_length = [], [] 296 | 297 | for seq in sequences: 298 | seq = list(seq) 299 | seq_ = seq[:max_length] + [pad_tok]*max(max_length - len(seq), 0) 300 | sequence_padded += [seq_] 301 | sequence_length += [min(len(seq), max_length)] 302 | 303 | return sequence_padded, sequence_length 304 | 305 | 306 | def pad_sequences(sequences, pad_tok, nlevels=1): 307 | """ 308 | Args: 309 | sequences: a generator of list or tuple 310 | pad_tok: the char to pad with 311 | nlevels: "depth" of padding, for the case where we have characters ids 312 | 313 | Returns: 314 | a list of list where each sublist has same length 315 | 316 | """ 317 | if nlevels == 1: 318 | max_length = max(map(lambda x : len(x), sequences)) 319 | sequence_padded, sequence_length = _pad_sequences(sequences, 320 | pad_tok, max_length) 321 | 322 | elif nlevels == 2: 323 | max_length_word = max([max(map(lambda x: len(x), seq)) 324 | for seq in sequences]) 325 | sequence_padded, sequence_length = [], [] 326 | for seq in sequences: 327 | # all words are same length now 328 | sp, sl = _pad_sequences(seq, pad_tok, max_length_word) 329 | sequence_padded += [sp] 330 | sequence_length += [sl] 331 | 332 | max_length_sentence = max(map(lambda x : len(x), sequences)) 333 | sequence_padded, _ = _pad_sequences(sequence_padded, 334 | [pad_tok]*max_length_word, max_length_sentence) 335 | sequence_length, _ = _pad_sequences(sequence_length, 0, 336 | max_length_sentence) 337 | 338 | return sequence_padded, sequence_length 339 | 340 | 341 | def minibatches(data, minibatch_size): 342 | """ 343 | Args: 344 | data: generator of (sentence, tags) tuples 345 | minibatch_size: (int) 346 | 347 | Yields: 348 | list of tuples 349 | 350 | """ 351 | x_batch, y_batch = [], [] 352 | for (x, y) in data: 353 | if len(x_batch) == minibatch_size: 354 | yield x_batch, y_batch 355 | x_batch, y_batch = [], [] 356 | 357 | if type(x[0]) == tuple: 358 | x = zip(*x) 359 | x_batch += [x] 360 | y_batch += [y] 361 | 362 | if len(x_batch) != 0: 363 | yield x_batch, y_batch 364 | 365 | 366 | def get_chunk_type(tok, idx_to_tag): 367 | """ 368 | Args: 369 | tok: id of token, ex 4 370 | idx_to_tag: dictionary {4: "B-PER", ...} 371 | 372 | Returns: 373 | tuple: "B", "PER" 374 | 375 | """ 376 | tag_name = idx_to_tag[tok] 377 | tag_class = tag_name.split('-')[0] 378 | tag_type = tag_name.split('-')[-1] 379 | return tag_class, tag_type 380 | 381 | 382 | def get_chunks(seq, tags): 383 | """Given a sequence of tags, group entities and their position 384 | 385 | Args: 386 | seq: [4, 4, 0, 0, ...] sequence of labels 387 | tags: dict["O"] = 4 388 | 389 | Returns: 390 | list of (chunk_type, chunk_start, chunk_end) 391 | 392 | Example: 393 | seq = [4, 5, 0, 3] 394 | tags = {"B-PER": 4, "I-PER": 5, "B-LOC": 3} 395 | result = [("PER", 0, 2), ("LOC", 3, 4)] 396 | 397 | """ 398 | default = tags[NONE] 399 | idx_to_tag = {idx: tag for tag, idx in tags.items()} 400 | chunks = [] 401 | chunk_type, chunk_start = None, None 402 | for i, tok in enumerate(seq): 403 | # End of a chunk 1 404 | if tok == default and chunk_type is not None: 405 | # Add a chunk. 
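            # e.g. for the docstring example seq = [4, 5, 0, 3]: at i == 2 the
            # token is the default "O" id while a PER chunk is still open, so
            # ("PER", 0, 2) is closed and appended here.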
406 | chunk = (chunk_type, chunk_start, i) 407 | chunks.append(chunk) 408 | chunk_type, chunk_start = None, None 409 | 410 | # End of a chunk + start of a chunk! 411 | elif tok != default: 412 | tok_chunk_class, tok_chunk_type = get_chunk_type(tok, idx_to_tag) 413 | if chunk_type is None: 414 | chunk_type, chunk_start = tok_chunk_type, i 415 | elif tok_chunk_type != chunk_type or tok_chunk_class == "B": 416 | chunk = (chunk_type, chunk_start, i) 417 | chunks.append(chunk) 418 | chunk_type, chunk_start = tok_chunk_type, i 419 | else: 420 | pass 421 | 422 | # end condition 423 | if chunk_type is not None: 424 | chunk = (chunk_type, chunk_start, len(seq)) 425 | chunks.append(chunk) 426 | 427 | return chunks 428 | -------------------------------------------------------------------------------- /nerlogparser/model/evaluate.py: -------------------------------------------------------------------------------- 1 | from nerlogparser.model.data_utils import CoNLLDataset 2 | from nerlogparser.model.ner_model import NERModel 3 | from nerlogparser.model.config import Config 4 | 5 | 6 | def align_data(data): 7 | """Given dict with lists, creates aligned strings 8 | 9 | Adapted from Assignment 3 of CS224N 10 | 11 | Args: 12 | data: (dict) data["x"] = ["I", "love", "you"] 13 | (dict) data["y"] = ["O", "O", "O"] 14 | 15 | Returns: 16 | data_aligned: (dict) data_align["x"] = "I love you" 17 | data_align["y"] = "O O O " 18 | 19 | """ 20 | spacings = [max([len(seq[i]) for seq in data.values()]) 21 | for i in range(len(data[list(data.keys())[0]]))] 22 | data_aligned = dict() 23 | 24 | # for each entry, create aligned string 25 | for key, seq in data.items(): 26 | str_aligned = "" 27 | for token, spacing in zip(seq, spacings): 28 | str_aligned += token + " " * (spacing - len(token) + 1) 29 | 30 | data_aligned[key] = str_aligned 31 | 32 | return data_aligned 33 | 34 | 35 | def interactive_shell(model): 36 | """Creates interactive shell to play with model 37 | 38 | Args: 39 | model: instance of NERModel 40 | 41 | """ 42 | model.logger.info(""" 43 | This is an interactive mode. 44 | To exit, enter 'exit'. 
45 | You can enter a sentence like 46 | input> I love Paris""") 47 | 48 | while True: 49 | sentence = input("input> ") 50 | words_raw = sentence.strip().split(" ") 51 | 52 | if words_raw == ["exit"]: 53 | break 54 | 55 | preds = model.predict(words_raw) 56 | to_print = align_data({"input": words_raw, "output": preds}) 57 | 58 | for key, seq in to_print.items(): 59 | model.logger.info(seq) 60 | 61 | 62 | def main(): 63 | # create instance of config 64 | config = Config() 65 | 66 | # build model 67 | model = NERModel(config) 68 | model.build() 69 | model.restore_session(config.dir_model) 70 | 71 | # create dataset 72 | test = CoNLLDataset(config.filename_test, config.processing_word, 73 | config.processing_tag, config.max_iter) 74 | 75 | # evaluate and interact 76 | model.evaluate(test) 77 | interactive_shell(model) 78 | 79 | 80 | if __name__ == "__main__": 81 | main() 82 | -------------------------------------------------------------------------------- /nerlogparser/model/general_utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | import sys 3 | import logging 4 | import numpy as np 5 | 6 | 7 | def get_logger(filename): 8 | """Return a logger instance that writes in filename 9 | 10 | Args: 11 | filename: (string) path to log.txt 12 | 13 | Returns: 14 | logger: (instance of logger) 15 | 16 | """ 17 | logger = logging.getLogger('logger') 18 | logger.setLevel(logging.DEBUG) 19 | logging.basicConfig(format='%(message)s', level=logging.DEBUG) 20 | handler = logging.FileHandler(filename) 21 | handler.setLevel(logging.DEBUG) 22 | handler.setFormatter(logging.Formatter( 23 | '%(asctime)s:%(levelname)s: %(message)s')) 24 | logging.getLogger().addHandler(handler) 25 | 26 | return logger 27 | 28 | 29 | class Progbar(object): 30 | """Progbar class copied from keras (https://github.com/fchollet/keras/) 31 | 32 | Displays a progress bar. 33 | Small edit : added strict arg to update 34 | # Arguments 35 | target: Total number of steps expected. 36 | interval: Minimum visual progress update interval (in seconds). 37 | """ 38 | 39 | def __init__(self, target, width=30, verbose=1): 40 | self.width = width 41 | self.target = target 42 | self.sum_values = {} 43 | self.unique_values = [] 44 | self.start = time.time() 45 | self.total_width = 0 46 | self.seen_so_far = 0 47 | self.verbose = verbose 48 | 49 | def update(self, current, values=[], exact=[], strict=[]): 50 | """ 51 | Updates the progress bar. 52 | # Arguments 53 | current: Index of current step. 54 | values: List of tuples (name, value_for_last_step). 55 | The progress bar will display averages for these values. 56 | exact: List of tuples (name, value_for_last_step). 57 | The progress bar will display these values directly. 
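            strict: List of tuples (name, value).
                The progress bar stores and displays these values as-is,
                without averaging.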
58 | """ 59 | 60 | for k, v in values: 61 | if k not in self.sum_values: 62 | self.sum_values[k] = [v * (current - self.seen_so_far), 63 | current - self.seen_so_far] 64 | self.unique_values.append(k) 65 | else: 66 | self.sum_values[k][0] += v * (current - self.seen_so_far) 67 | self.sum_values[k][1] += (current - self.seen_so_far) 68 | for k, v in exact: 69 | if k not in self.sum_values: 70 | self.unique_values.append(k) 71 | self.sum_values[k] = [v, 1] 72 | 73 | for k, v in strict: 74 | if k not in self.sum_values: 75 | self.unique_values.append(k) 76 | self.sum_values[k] = v 77 | 78 | self.seen_so_far = current 79 | 80 | now = time.time() 81 | if self.verbose == 1: 82 | prev_total_width = self.total_width 83 | sys.stdout.write("\b" * prev_total_width) 84 | sys.stdout.write("\r") 85 | 86 | numdigits = int(np.floor(np.log10(self.target))) + 1 87 | barstr = '%%%dd/%%%dd [' % (numdigits, numdigits) 88 | bar = barstr % (current, self.target) 89 | prog = float(current)/self.target 90 | prog_width = int(self.width*prog) 91 | if prog_width > 0: 92 | bar += ('='*(prog_width-1)) 93 | if current < self.target: 94 | bar += '>' 95 | else: 96 | bar += '=' 97 | bar += ('.'*(self.width-prog_width)) 98 | bar += ']' 99 | sys.stdout.write(bar) 100 | self.total_width = len(bar) 101 | 102 | if current: 103 | time_per_unit = (now - self.start) / current 104 | else: 105 | time_per_unit = 0 106 | eta = time_per_unit*(self.target - current) 107 | info = '' 108 | if current < self.target: 109 | info += ' - ETA: %ds' % eta 110 | else: 111 | info += ' - %ds' % (now - self.start) 112 | for k in self.unique_values: 113 | if type(self.sum_values[k]) is list: 114 | info += ' - %s: %.4f' % (k, 115 | self.sum_values[k][0] / max(1, self.sum_values[k][1])) 116 | else: 117 | info += ' - %s: %s' % (k, self.sum_values[k]) 118 | 119 | self.total_width += len(info) 120 | if prev_total_width > self.total_width: 121 | info += ((prev_total_width-self.total_width) * " ") 122 | 123 | sys.stdout.write(info) 124 | sys.stdout.flush() 125 | 126 | if current >= self.target: 127 | sys.stdout.write("\n") 128 | 129 | if self.verbose == 2: 130 | if current >= self.target: 131 | info = '%ds' % (now - self.start) 132 | for k in self.unique_values: 133 | info += ' - %s: %.4f' % (k, self.sum_values[k][0] / max(1, self.sum_values[k][1])) 134 | sys.stdout.write(info + "\n") 135 | 136 | def add(self, n, values=[]): 137 | self.update(self.seen_so_far+n, values) 138 | -------------------------------------------------------------------------------- /nerlogparser/model/ner_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | 5 | from nerlogparser.model.data_utils import minibatches, pad_sequences, get_chunks 6 | from nerlogparser.model.general_utils import Progbar 7 | from nerlogparser.model.base_model import BaseModel 8 | 9 | 10 | class NERModel(BaseModel): 11 | """Specialized class of Model for NER""" 12 | 13 | def __init__(self, config): 14 | super(NERModel, self).__init__(config) 15 | self.idx_to_tag = {idx: tag for tag, idx in 16 | self.config.vocab_tags.items()} 17 | 18 | def add_placeholders(self): 19 | """Define placeholders = entries to computational graph""" 20 | # shape = (batch size, max length of sentence in batch) 21 | self.word_ids = tf.placeholder(tf.int32, shape=[None, None], 22 | name="word_ids") 23 | 24 | # shape = (batch size) 25 | self.sequence_lengths = tf.placeholder(tf.int32, shape=[None], 26 | name="sequence_lengths") 27 | 28 | # 
shape = (batch size, max length of sentence, max length of word) 29 | self.char_ids = tf.placeholder(tf.int32, shape=[None, None, None], 30 | name="char_ids") 31 | 32 | # shape = (batch_size, max_length of sentence) 33 | self.word_lengths = tf.placeholder(tf.int32, shape=[None, None], 34 | name="word_lengths") 35 | 36 | # shape = (batch size, max length of sentence in batch) 37 | self.labels = tf.placeholder(tf.int32, shape=[None, None], 38 | name="labels") 39 | 40 | # hyper parameters 41 | self.dropout = tf.placeholder(dtype=tf.float32, shape=[], 42 | name="dropout") 43 | self.lr = tf.placeholder(dtype=tf.float32, shape=[], 44 | name="lr") 45 | 46 | def get_feed_dict(self, words, labels=None, lr=None, dropout=None): 47 | """Given some data, pad it and build a feed dictionary 48 | 49 | Args: 50 | words: list of sentences. A sentence is a list of ids of a list of 51 | words. A word is a list of ids 52 | labels: list of ids 53 | lr: (float) learning rate 54 | dropout: (float) keep prob 55 | 56 | Returns: 57 | dict {placeholder: value} 58 | 59 | """ 60 | # perform padding of the given data 61 | if self.config.use_chars: 62 | char_ids, word_ids = zip(*words) 63 | word_ids, sequence_lengths = pad_sequences(word_ids, 0) 64 | char_ids, word_lengths = pad_sequences(char_ids, pad_tok=0, nlevels=2) 65 | else: 66 | word_ids, sequence_lengths = pad_sequences(words, 0) 67 | 68 | # build feed dictionary 69 | feed = { 70 | self.word_ids: word_ids, 71 | self.sequence_lengths: sequence_lengths 72 | } 73 | 74 | if self.config.use_chars: 75 | feed[self.char_ids] = char_ids 76 | feed[self.word_lengths] = word_lengths 77 | 78 | if labels is not None: 79 | labels, _ = pad_sequences(labels, 0) 80 | feed[self.labels] = labels 81 | 82 | if lr is not None: 83 | feed[self.lr] = lr 84 | 85 | if dropout is not None: 86 | feed[self.dropout] = dropout 87 | 88 | return feed, sequence_lengths 89 | 90 | def add_word_embeddings_op(self): 91 | """Defines self.word_embeddings 92 | 93 | If self.config.embeddings is not None and is a np array initialized 94 | with pre-trained word vectors, the word embeddings is just a look-up 95 | and we don't train the vectors. Otherwise, a random matrix with 96 | the correct shape is initialized. 
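        The resulting self.word_embeddings has shape
        (batch size, max sentence length, dim_word), with an extra
        2 * hidden_size_char dimensions per word appended when use_chars is
        True (the char bi-LSTM outputs are concatenated to the word vectors).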
97 | """ 98 | with tf.variable_scope("words"): 99 | if self.config.embeddings is None: 100 | self.logger.info("WARNING: randomly initializing word vectors") 101 | _word_embeddings = tf.get_variable( 102 | name="_word_embeddings", 103 | dtype=tf.float32, 104 | shape=[self.config.nwords, self.config.dim_word]) 105 | else: 106 | _word_embeddings = tf.Variable( 107 | self.config.embeddings, 108 | name="_word_embeddings", 109 | dtype=tf.float32, 110 | trainable=self.config.train_embeddings) 111 | 112 | word_embeddings = tf.nn.embedding_lookup(_word_embeddings, self.word_ids, name="word_embeddings") 113 | 114 | with tf.variable_scope("chars"): 115 | if self.config.use_chars: 116 | # get char embeddings matrix 117 | _char_embeddings = tf.get_variable( 118 | name="_char_embeddings", 119 | dtype=tf.float32, 120 | shape=[self.config.nchars, self.config.dim_char]) 121 | char_embeddings = tf.nn.embedding_lookup(_char_embeddings, 122 | self.char_ids, name="char_embeddings") 123 | 124 | # put the time dimension on axis=1 125 | s = tf.shape(char_embeddings) 126 | char_embeddings = tf.reshape(char_embeddings, 127 | shape=[s[0]*s[1], s[-2], self.config.dim_char]) 128 | word_lengths = tf.reshape(self.word_lengths, shape=[s[0]*s[1]]) 129 | 130 | # bi lstm on chars 131 | cell_fw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_char, 132 | state_is_tuple=True) 133 | cell_bw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_char, 134 | state_is_tuple=True) 135 | _output = tf.nn.bidirectional_dynamic_rnn( 136 | cell_fw, cell_bw, char_embeddings, 137 | sequence_length=word_lengths, dtype=tf.float32) 138 | 139 | # read and concat output 140 | _, ((_, output_fw), (_, output_bw)) = _output 141 | output = tf.concat([output_fw, output_bw], axis=-1) 142 | 143 | # shape = (batch size, max sentence length, char hidden size) 144 | output = tf.reshape(output, 145 | shape=[s[0], s[1], 2*self.config.hidden_size_char]) 146 | word_embeddings = tf.concat([word_embeddings, output], axis=-1) 147 | 148 | self.word_embeddings = tf.nn.dropout(word_embeddings, self.dropout) 149 | 150 | def add_logits_op(self): 151 | """Defines self.logits 152 | 153 | For each word in each sentence of the batch, it corresponds to a vector 154 | of scores, of dimension equal to the number of tags. 155 | """ 156 | with tf.variable_scope("bi-lstm"): 157 | cell_fw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_lstm) 158 | cell_bw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_lstm) 159 | (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn( 160 | cell_fw, cell_bw, self.word_embeddings, 161 | sequence_length=self.sequence_lengths, dtype=tf.float32) 162 | output = tf.concat([output_fw, output_bw], axis=-1) 163 | output = tf.nn.dropout(output, self.dropout) 164 | 165 | with tf.variable_scope("proj"): 166 | W = tf.get_variable("W", dtype=tf.float32, 167 | shape=[2*self.config.hidden_size_lstm, self.config.ntags]) 168 | 169 | b = tf.get_variable("b", shape=[self.config.ntags], 170 | dtype=tf.float32, initializer=tf.zeros_initializer()) 171 | 172 | nsteps = tf.shape(output)[1] 173 | output = tf.reshape(output, [-1, 2*self.config.hidden_size_lstm]) 174 | pred = tf.matmul(output, W) + b 175 | self.logits = tf.reshape(pred, [-1, nsteps, self.config.ntags]) 176 | 177 | def add_pred_op(self): 178 | """Defines self.labels_pred 179 | 180 | This op is defined only in the case where we don't use a CRF since in 181 | that case we can make the prediction "in the graph" (thanks to tf 182 | functions in other words). 
With the CRF, as the inference is coded 183 | in Python and not in pure TensorFlow, we have to make the prediction 184 | outside the graph. 185 | """ 186 | if not self.config.use_crf: 187 | self.labels_pred = tf.cast(tf.argmax(self.logits, axis=-1), 188 | tf.int32) 189 | 190 | def add_loss_op(self): 191 | """Defines the loss""" 192 | if self.config.use_crf: 193 | log_likelihood, trans_params = tf.contrib.crf.crf_log_likelihood( 194 | self.logits, self.labels, self.sequence_lengths) 195 | self.trans_params = trans_params # need to evaluate it for decoding 196 | self.loss = tf.reduce_mean(-log_likelihood) 197 | else: 198 | losses = tf.nn.sparse_softmax_cross_entropy_with_logits( 199 | logits=self.logits, labels=self.labels) 200 | mask = tf.sequence_mask(self.sequence_lengths) 201 | losses = tf.boolean_mask(losses, mask) 202 | self.loss = tf.reduce_mean(losses) 203 | 204 | # for tensorboard 205 | tf.summary.scalar("loss", self.loss) 206 | 207 | def build(self): 208 | # NER specific functions 209 | self.add_placeholders() 210 | self.add_word_embeddings_op() 211 | self.add_logits_op() 212 | self.add_pred_op() 213 | self.add_loss_op() 214 | 215 | # Generic functions that add training op and initialize session 216 | self.add_train_op(self.config.lr_method, self.lr, self.loss, 217 | self.config.clip) 218 | self.initialize_session() # now self.sess is defined and vars are init 219 | 220 | def predict_batch(self, words): 221 | """ 222 | Args: 223 | words: list of sentences 224 | 225 | Returns: 226 | labels_pred: list of labels for each sentence 227 | sequence_length 228 | 229 | """ 230 | fd, sequence_lengths = self.get_feed_dict(words, dropout=1.0) 231 | 232 | if self.config.use_crf: 233 | # get tag scores and transition params of CRF 234 | viterbi_sequences = [] 235 | logits, trans_params = self.sess.run( 236 | [self.logits, self.trans_params], feed_dict=fd) 237 | 238 | # iterate over the sentences because no batching in viterbi_decode 239 | for logit, sequence_length in zip(logits, sequence_lengths): 240 | logit = logit[:sequence_length] # keep only the valid steps 241 | viterbi_seq, viterbi_score = tf.contrib.crf.viterbi_decode( 242 | logit, trans_params) 243 | viterbi_sequences += [viterbi_seq] 244 | 245 | return viterbi_sequences, sequence_lengths 246 | 247 | else: 248 | labels_pred = self.sess.run(self.labels_pred, feed_dict=fd) 249 | 250 | return labels_pred, sequence_lengths 251 | 252 | def run_epoch(self, train, dev, epoch): 253 | """Performs one complete pass over the train set and evaluates on dev 254 | 255 | Args: 256 | train: dataset that yields tuple of sentences, tags 257 | dev: dataset 258 | epoch: (int) index of the current epoch 259 | 260 | Returns: 261 | f1: (python float), score to select model on, higher is better 262 | 263 | """ 264 | # progbar stuff for logging 265 | batch_size = self.config.batch_size 266 | nbatches = (len(train) + batch_size - 1) // batch_size 267 | prog = Progbar(target=nbatches) 268 | 269 | # iterate over dataset 270 | for i, (words, labels) in enumerate(minibatches(train, batch_size)): 271 | fd, _ = self.get_feed_dict(words, labels, self.config.lr, 272 | self.config.dropout) 273 | 274 | _, train_loss, summary = self.sess.run( 275 | [self.train_op, self.loss, self.merged], feed_dict=fd) 276 | 277 | prog.update(i + 1, [("train loss", train_loss)]) 278 | 279 | # tensorboard 280 | if i % 10 == 0: 281 | self.file_writer.add_summary(summary, epoch*nbatches + i) 282 | 283 | metrics = self.run_evaluate(dev) 284 | msg = " - ".join(["{} {:04.2f}".format(k, v) 285 | 
for k, v in metrics.items()]) 286 | self.logger.info(msg) 287 | 288 | return metrics["f1"] 289 | 290 | def run_evaluate(self, test): 291 | """Evaluates performance on test set 292 | 293 | Args: 294 | test: dataset that yields tuple of (sentences, tags) 295 | 296 | Returns: 297 | metrics: (dict) metrics["acc"] = 98.4, ... 298 | 299 | """ 300 | accs = [] 301 | correct_preds, total_correct, total_preds = 0., 0., 0. 302 | for words, labels in minibatches(test, self.config.batch_size): 303 | labels_pred, sequence_lengths = self.predict_batch(words) 304 | 305 | for lab, lab_pred, length in zip(labels, labels_pred, 306 | sequence_lengths): 307 | lab = lab[:length] 308 | lab_pred = lab_pred[:length] 309 | accs += [a==b for (a, b) in zip(lab, lab_pred)] 310 | 311 | lab_chunks = set(get_chunks(lab, self.config.vocab_tags)) 312 | lab_pred_chunks = set(get_chunks(lab_pred, 313 | self.config.vocab_tags)) 314 | 315 | correct_preds += len(lab_chunks & lab_pred_chunks) 316 | total_preds += len(lab_pred_chunks) 317 | total_correct += len(lab_chunks) 318 | 319 | p = correct_preds / total_preds if correct_preds > 0 else 0 320 | r = correct_preds / total_correct if correct_preds > 0 else 0 321 | f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0 322 | acc = np.mean(accs) 323 | 324 | return {"acc": 100*acc, "f1": 100*f1} 325 | 326 | def predict(self, words_raw): 327 | """Returns list of tags 328 | 329 | Args: 330 | words_raw: list of words (string), just one sentence (no batch) 331 | 332 | Returns: 333 | preds: list of tags (string), one for each word in the sentence 334 | 335 | """ 336 | words = [self.config.processing_word(w) for w in words_raw] 337 | if type(words[0]) == tuple: 338 | words = zip(*words) 339 | pred_ids, _ = self.predict_batch([words]) 340 | preds = [self.idx_to_tag[idx] for idx in list(pred_ids[0])] 341 | 342 | return preds 343 | -------------------------------------------------------------------------------- /nerlogparser/model/train.py: -------------------------------------------------------------------------------- 1 | from nerlogparser.model.data_utils import CoNLLDataset 2 | from nerlogparser.model.ner_model import NERModel 3 | from nerlogparser.model.config import Config 4 | 5 | 6 | def main(): 7 | # create instance of config 8 | config = Config() 9 | 10 | # build model 11 | model = NERModel(config) 12 | model.build() 13 | # model.restore_session("results/crf/model.weights/") # optional, restore weights 14 | # model.reinitialize_weights("proj") 15 | 16 | # create datasets 17 | dev = CoNLLDataset(config.filename_dev, config.processing_word, 18 | config.processing_tag, config.max_iter) 19 | train = CoNLLDataset(config.filename_train, config.processing_word, 20 | config.processing_tag, config.max_iter) 21 | 22 | # train model 23 | model.train(train, dev) 24 | 25 | if __name__ == "__main__": 26 | main() 27 | -------------------------------------------------------------------------------- /nerlogparser/nerlogparser.py: -------------------------------------------------------------------------------- 1 | import pprint 2 | from optparse import OptionParser 3 | from collections import OrderedDict 4 | from nerlogparser.model.ner_model import NERModel 5 | from nerlogparser.model.config import Config 6 | from nerlogparser.output.to_json import ToJson 7 | 8 | 9 | class Nerlogparser(object): 10 | def __init__(self): 11 | self.model = None 12 | self.config = None 13 | self.master_label = {} 14 | 15 | self.__load_pretrained_model() 16 | self.__load_label() 17 | 18 | def 
__load_pretrained_model(self): 19 | # create instance of config 20 | self.config = Config() 21 | 22 | # load pretrained model 23 | self.model = NERModel(self.config) 24 | self.model.build() 25 | self.model.restore_session(self.config.dir_model) 26 | 27 | def __load_label(self): 28 | # load NER label and its corresponding human-readable field label 29 | with open(self.config.label_file, 'r') as f: 30 | label = f.readlines() 31 | 32 | labels = {} 33 | for line in label: 34 | line_split = line.split(' ') 35 | ner_label, final_label = line_split[0], line_split[1] 36 | labels[ner_label] = final_label.rstrip() 37 | 38 | self.master_label = labels 39 | 40 | def __get_per_entity(self, words_raw, ner_label): 41 | # one entity can contain one or more words 42 | entity = OrderedDict() 43 | for index, label in enumerate(ner_label): 44 | if '-' in label: 45 | main_label = label.split('-')[1] 46 | else: 47 | main_label = label 48 | 49 | if main_label not in entity.keys(): 50 | entity[main_label] = [] 51 | 52 | entity[main_label].append(words_raw[index]) 53 | 54 | # one entity is now one sentence 55 | final_entity = OrderedDict() 56 | for main_label, words in entity.items(): 57 | final_label = self.master_label[main_label] 58 | final_entity[final_label] = ' '.join(words) 59 | 60 | if 'message' not in final_entity.keys(): 61 | final_entity['message'] = '' 62 | 63 | return final_entity 64 | 65 | def parse_logs(self, log_file): 66 | # parse log files using pretrained model 67 | raw_logs = {} 68 | parsed_logs = OrderedDict() 69 | parsed_log_index = 0 70 | with open(log_file) as f: 71 | for line_index, line in enumerate(f): 72 | if line not in ['\n', '\r\n']: 73 | raw_logs[parsed_log_index] = line 74 | words_raw = line.strip().split() 75 | 76 | ner_label = self.model.predict(words_raw) 77 | parsed = self.__get_per_entity(words_raw, ner_label) 78 | parsed_logs[parsed_log_index] = parsed 79 | parsed_log_index += 1 80 | 81 | return parsed_logs 82 | 83 | 84 | def main(): 85 | parser = OptionParser(usage='usage: nerlogparser [options]') 86 | parser.add_option('-i', '--input', 87 | action='store', 88 | dest='input_file', 89 | help='Input log file.') 90 | parser.add_option('-o', '--output', 91 | action='store', 92 | dest='output_file', 93 | help='Parsed log file.') 94 | 95 | # get options 96 | (options, args) = parser.parse_args() 97 | input_file = options.input_file 98 | output_file = options.output_file 99 | 100 | if options.input_file: 101 | # parse log file 102 | nerlogparser = Nerlogparser() 103 | parsed_results = nerlogparser.parse_logs(input_file) 104 | 105 | if options.output_file: 106 | print('Write results to', output_file) 107 | ToJson.write_to_json(parsed_results, output_file) 108 | 109 | else: 110 | print('No output file. 
Print parsing results on terminal.') 111 | for line_id, parsed in parsed_results.items(): 112 | print('Line:', line_id) 113 | pprint.pprint(parsed) 114 | print() 115 | 116 | else: 117 | print('Please see help: nerlogparser -h') 118 | 119 | 120 | if __name__ == "__main__": 121 | main() 122 | -------------------------------------------------------------------------------- /nerlogparser/output/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/studiawan/nerlogparser/4dc3d955f735ea5496557ee76378a38b5746e425/nerlogparser/output/__init__.py -------------------------------------------------------------------------------- /nerlogparser/output/to_json.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class ToJson(object): 5 | 6 | @staticmethod 7 | def write_to_json(parsed_logs, output_file): 8 | # write a dictionary to json file 9 | with open(output_file, 'w') as f: 10 | json.dump(parsed_logs, f) 11 | -------------------------------------------------------------------------------- /nerlogparser/preprocessing/Preprocessing.py: -------------------------------------------------------------------------------- 1 | import os 2 | import errno 3 | import pickle 4 | from configparser import ConfigParser 5 | from nerlogparser.grammar.authlog import AuthLog 6 | from nerlogparser.grammar.daemonlog import DaemonLog 7 | from nerlogparser.grammar.debuglog import DebugLog 8 | from nerlogparser.grammar.dmesglog import DmesgLog 9 | from nerlogparser.grammar.kernellog import KernelLog 10 | from nerlogparser.grammar.messageslog import MessagesLog 11 | from nerlogparser.grammar.csvlog import CSVLog 12 | from nerlogparser.grammar.bluegenelog import BlueGeneLog 13 | from nerlogparser.grammar.kippolog import KippoLog 14 | from nerlogparser.grammar.proxifierlog import ProxifierLog 15 | from nerlogparser.grammar.weblog import WebLog 16 | from nerlogparser.grammar.zookeeperlog import ZookeeperLog 17 | 18 | 19 | class Preprocessing(object): 20 | """This class does four main tasks: 21 | 1. Parse log entries based on developer-defined grammar 22 | 2. Put punctuation between parsed entities 23 | 3. Save the punctuation results to files 24 | 4. Save parsed log entries to pickle files to be used in Splitting.py. 25 | 26 | """ 27 | def __init__(self, data): 28 | self.dataset = data 29 | self.dataset_conf = {} 30 | self.files = {} 31 | 32 | @staticmethod 33 | def __check_path(path): 34 | """Check whether a path exists. If it does not exist, create it. 35 | 36 | Parameters 37 | ---------- 38 | path : str 39 | Path of a directory to be checked. 
40 | """ 41 | try: 42 | os.makedirs(path) 43 | except OSError as exception: 44 | if exception.errno != errno.EEXIST: 45 | raise 46 | 47 | def __get_dataset(self): 48 | # get dataset configuration 49 | current_path = os.path.dirname(os.path.realpath(__file__)) 50 | dataset_config_path = os.path.join(current_path, 'config', 'datasets.conf') 51 | 52 | # read dataset path from .conf file 53 | parser = ConfigParser() 54 | parser.read(dataset_config_path) 55 | for section_name in parser.sections(): 56 | options = {} 57 | for name, value in parser.items(section_name): 58 | options[name] = value 59 | self.dataset_conf[section_name] = options 60 | 61 | # get dataset and groundtruth path 62 | dataset_path = os.path.join(self.dataset_conf['main']['dataset_path'], self.dataset) 63 | groundtruth_path = os.path.join(self.dataset_conf['main']['groundtruth_path'], self.dataset) 64 | groundtruth_pickle_path = os.path.join(self.dataset_conf['main']['groundtruth_pickle'], self.dataset) 65 | self.__check_path(groundtruth_path) 66 | self.__check_path(groundtruth_pickle_path) 67 | filenames = os.listdir(dataset_path) 68 | 69 | # get full path of each filename 70 | for filename in filenames: 71 | self.files[filename] = { 72 | 'log_path': os.path.join(dataset_path, filename), 73 | 'groundtruth_path': os.path.join(groundtruth_path, filename), 74 | 'groundtruth_pickle': os.path.join(groundtruth_pickle_path, filename), 75 | 'type': filename.split('.')[0] 76 | } 77 | 78 | def __get_grammar(self, file_type): 79 | if file_type == 'auth': 80 | authlog_grammar = AuthLog(self.dataset) 81 | return authlog_grammar 82 | 83 | elif file_type == 'daemon': 84 | daemonlog_grammar = DaemonLog(self.dataset) 85 | return daemonlog_grammar 86 | 87 | elif file_type == 'debug': 88 | debuglog_grammar = DebugLog(self.dataset) 89 | return debuglog_grammar.get_grammar() 90 | 91 | elif file_type == 'dmesg': 92 | dmesglog_grammar = DmesgLog(self.dataset) 93 | return dmesglog_grammar.get_grammar() 94 | 95 | elif file_type == 'kern': 96 | kernellog_grammar = KernelLog(self.dataset) 97 | return kernellog_grammar.get_grammar() 98 | 99 | elif file_type == 'messages' or file_type == 'syslog': 100 | messageslog_grammar = MessagesLog(self.dataset) 101 | return messageslog_grammar.get_grammar() 102 | 103 | elif file_type == 'csv': 104 | messageslog_grammar = CSVLog(self.dataset) 105 | return messageslog_grammar 106 | 107 | elif file_type == 'bgl2': 108 | bluegenelog_grammar = BlueGeneLog(self.dataset) 109 | return bluegenelog_grammar 110 | 111 | elif file_type == 'kippo': 112 | kippolog_grammar = KippoLog(self.dataset) 113 | return kippolog_grammar 114 | 115 | elif file_type == 'proxifier': 116 | proxifierlog_grammar = ProxifierLog(self.dataset) 117 | return proxifierlog_grammar 118 | 119 | elif file_type == 'access': 120 | weblog_grammar = WebLog(self.dataset) 121 | return weblog_grammar 122 | 123 | elif file_type == 'zookeeper': 124 | zookeeperlog_grammar = ZookeeperLog(self.dataset) 125 | return zookeeperlog_grammar 126 | 127 | @staticmethod 128 | def __set_punctuation(parsed_line): 129 | # set punctuation from parsed line 130 | punctuated = '' 131 | for field_name, field_value in parsed_line.items(): 132 | if field_value != '' and field_value != ' ': 133 | if field_name == 'message' or field_name == 'client_agent': 134 | # if there is no period, then add one 135 | field_value = field_value.rstrip() 136 | if field_value[-1] != '.': 137 | punctuated += field_value + ' .PERIOD\n' 138 | else: 139 | punctuated += field_value + ' .PERIOD\n' 140 | else: 
141 | punctuated += field_value + ' ,COMMA ' 142 | else: 143 | if field_name == 'message': 144 | punctuated += '\n' 145 | elif field_name == 'client_agent': 146 | punctuated += ' .PERIOD\n' 147 | 148 | return punctuated 149 | 150 | def punctuate(self): 151 | # get dataset 152 | self.__get_dataset() 153 | 154 | # punctuate log entries 155 | for filename, properties in self.files.items(): 156 | print(self.dataset, filename) 157 | 158 | # get grammar based on file type 159 | file_type = properties['type'] 160 | grammar = self.__get_grammar(file_type) 161 | 162 | # prepare output files 163 | f_groundtruth = open(properties['groundtruth_path'], 'w') 164 | 165 | # parse log entries 166 | parsed_list = [] 167 | if file_type != 'csv': 168 | with open(properties['log_path'], 'r') as f: 169 | for line in f: 170 | parsed = grammar.parse_log(line) 171 | parsed_list.append(parsed) 172 | 173 | # set punctuation 174 | punctuated_line = self.__set_punctuation(parsed) 175 | f_groundtruth.write(punctuated_line) 176 | 177 | else: 178 | parsed_lines = grammar.parse_log() 179 | for parsed_line in parsed_lines: 180 | parsed_list.append(parsed_line) 181 | punctuated_line = self.__set_punctuation(parsed_line) 182 | f_groundtruth.write(punctuated_line) 183 | 184 | f_groundtruth.close() 185 | 186 | # save parsed list to a pickle file 187 | with open(properties['groundtruth_pickle'], 'wb') as f_pickle: 188 | pickle.dump(parsed_list, f_pickle, protocol=pickle.HIGHEST_PROTOCOL) 189 | 190 | 191 | if __name__ == '__main__': 192 | # put punctuation (comma and period) to all datasets 193 | datasets = [ 194 | 'casper-rw', 195 | 'dfrws-2009-jhuisi', 196 | 'dfrws-2009-nssal', 197 | 'dfrws-2016', 198 | 'honeynet-challenge5', 199 | 'honeynet-challenge7', 200 | 'bgl2', 201 | 'kippo', 202 | 'proxifier', 203 | 'secrepo-accesslog', 204 | 'zookeeper' 205 | ] 206 | 207 | for dataset in datasets: 208 | preprocess = Preprocessing(dataset) 209 | preprocess.punctuate() 210 | -------------------------------------------------------------------------------- /nerlogparser/preprocessing/Splitting.py: -------------------------------------------------------------------------------- 1 | import os 2 | import errno 3 | import shutil 4 | import pickle 5 | from configparser import ConfigParser 6 | from math import floor 7 | from nerlogparser.dataformat.toconll import ToConll 8 | 9 | 10 | class Splitting(object): 11 | # split each file to three parts: train, dev, and test 12 | # compositition: train: 60, dev: 20, test: 20 13 | def __init__(self, data): 14 | self.dataset = data 15 | self.dataset_conf = {} 16 | self.files = {} 17 | self.punctuation_path = '' 18 | self.conll_path = '' 19 | self.train_file = '' 20 | self.dev_file = '' 21 | self.test_file = '' 22 | self.train_file_conll = '' 23 | self.dev_file_conll = '' 24 | self.test_file_conll = '' 25 | self.conll_stanford_path = '' 26 | self.train_file_conll_stanford = '' 27 | self.dev_file_conll_stanford = '' 28 | self.test_file_conll_stanford = '' 29 | self.conll_pos_path = '' 30 | self.train_file_conll_pos = '' 31 | self.dev_file_conll_pos = '' 32 | self.test_file_conll_pos = '' 33 | self.csv_path = '' 34 | self.file_csv = '' 35 | self.nltk_tree_path = '' 36 | self.train_file_nltk_tree = '' 37 | self.test_file_nltk_tree = '' 38 | 39 | @staticmethod 40 | def __check_path(path): 41 | # check a path is exist or not. 
if not exist, then create it 42 | try: 43 | os.makedirs(path) 44 | except OSError as exception: 45 | if exception.errno != errno.EEXIST: 46 | raise 47 | 48 | def __set_datapath(self): 49 | # set data path for output files 50 | self.__check_path(self.punctuation_path) 51 | self.train_file = os.path.join(self.punctuation_path, 'ep.train.txt') 52 | self.dev_file = os.path.join(self.punctuation_path, 'ep.dev.txt') 53 | self.test_file = os.path.join(self.punctuation_path, 'ep.test.txt') 54 | 55 | def __set_datapath_conll(self): 56 | # set data path for output files 57 | self.__check_path(self.conll_path) 58 | self.train_file_conll = os.path.join(self.conll_path, 'conll.train.txt') 59 | self.dev_file_conll = os.path.join(self.conll_path, 'conll.dev.txt') 60 | self.test_file_conll = os.path.join(self.conll_path, 'conll.test.txt') 61 | 62 | def __set_datapath_conll_stanford(self): 63 | # set data path for output files 64 | self.__check_path(self.conll_stanford_path) 65 | self.train_file_conll_stanford = os.path.join(self.conll_stanford_path, 'conll.stanford.train.txt') 66 | self.dev_file_conll_stanford = os.path.join(self.conll_stanford_path, 'conll.stanford.dev.txt') 67 | self.test_file_conll_stanford = os.path.join(self.conll_stanford_path, 'conll.stanford.test.txt') 68 | 69 | def __set_datapath_conll_pos(self): 70 | # set data path for output files 71 | self.__check_path(self.conll_pos_path) 72 | self.train_file_conll_pos = os.path.join(self.conll_pos_path, 'conll.pos.train.txt') 73 | self.dev_file_conll_pos = os.path.join(self.conll_pos_path, 'conll.pos.dev.txt') 74 | self.test_file_conll_pos = os.path.join(self.conll_pos_path, 'conll.pos.test.txt') 75 | 76 | def __set_datapath_nltk_tree(self): 77 | # set data path for output files 78 | self.__check_path(self.nltk_tree_path) 79 | self.train_file_nltk_tree = os.path.join(self.nltk_tree_path, 'nltk.tree.train.txt') 80 | self.test_file_nltk_tree = os.path.join(self.nltk_tree_path, 'nltk.tree.test.txt') 81 | 82 | def __set_datapath_csv(self): 83 | self.__check_path(self.csv_path) 84 | self.file_csv = os.path.join(self.csv_path, 'csv.all.txt') 85 | 86 | def __get_dataset(self): 87 | # get dataset configuration 88 | current_path = os.path.dirname(os.path.realpath(__file__)) 89 | dataset_config_path = os.path.join(current_path, 'config', 'datasets.conf') 90 | 91 | # read dataset path from .conf file 92 | parser = ConfigParser() 93 | parser.read(dataset_config_path) 94 | for section_name in parser.sections(): 95 | options = {} 96 | for name, value in parser.items(section_name): 97 | options[name] = value 98 | self.dataset_conf[section_name] = options 99 | 100 | # set output path 101 | self.punctuation_path = self.dataset_conf['main']['punctuation_path'] 102 | self.conll_path = self.dataset_conf['main']['conll_path'] 103 | self.conll_stanford_path = self.dataset_conf['main']['conll_stanford_path'] 104 | self.conll_pos_path = self.dataset_conf['main']['conll_pos_path'] 105 | self.csv_path = self.dataset_conf['main']['csv_path'] 106 | self.nltk_tree_path = self.dataset_conf['main']['nltk_tree_path'] 107 | 108 | # get dataset and groundtruth path 109 | dataset_path = os.path.join(self.dataset_conf['main']['dataset_path'], self.dataset) 110 | groundtruth_path = os.path.join(self.dataset_conf['main']['groundtruth_path'], self.dataset) 111 | groundtruth_pickle = os.path.join(self.dataset_conf['main']['groundtruth_pickle'], self.dataset) 112 | filenames = os.listdir(dataset_path) 113 | 114 | # get full path of each filename 115 | for filename in filenames: 
116 | self.files[filename] = { 117 | 'log_path': os.path.join(dataset_path, filename), 118 | 'groundtruth_path': os.path.join(groundtruth_path, filename), 119 | 'groundtruth_pickle_path': os.path.join(groundtruth_pickle, filename) 120 | } 121 | 122 | def split(self): 123 | # get dataset and output file path 124 | self.__get_dataset() 125 | self.__set_datapath() 126 | 127 | # open files for train, dev, and test 128 | f_train = open(self.train_file, 'a') 129 | f_dev = open(self.dev_file, 'a') 130 | f_test = open(self.test_file, 'a') 131 | 132 | for filename, properties in self.files.items(): 133 | print('punctuation file:', self.dataset, properties['log_path']) 134 | 135 | with open(properties['groundtruth_path'], 'r') as f: 136 | # read lines and get various length 137 | lines = f.readlines() 138 | 139 | # note: test_length = dev_length 140 | lines_length = len(lines) 141 | train_length = floor(0.6 * lines_length) 142 | dev_length = floor(0.2 * lines_length) 143 | dev_end_index = train_length + dev_length 144 | 145 | # get training, dev, and test data 146 | for line in lines[:train_length]: 147 | f_train.write(line) 148 | 149 | for line in lines[train_length:dev_end_index]: 150 | f_dev.write(line) 151 | 152 | for line in lines[dev_end_index:]: 153 | f_test.write(line) 154 | 155 | # close files 156 | f_train.close() 157 | f_dev.close() 158 | f_test.close() 159 | 160 | def split_conll(self): 161 | # set conll output files and create conll-format instance 162 | self.__set_datapath_conll() 163 | conll = ToConll() 164 | 165 | # open files for train, dev, and test 166 | f_train = open(self.train_file_conll, 'a') 167 | f_dev = open(self.dev_file_conll, 'a') 168 | f_test = open(self.test_file_conll, 'a') 169 | 170 | for filename, properties in self.files.items(): 171 | print('pickle file :', self.dataset, properties['log_path']) 172 | 173 | # parsed_list is list of dictionaries containing parsed log entries 174 | with open(properties['groundtruth_pickle_path'], 'rb') as f_pickle: 175 | parsed_list = pickle.load(f_pickle) 176 | 177 | # note: test_length = dev_length 178 | lines_length = len(parsed_list) 179 | train_length = floor(0.6 * lines_length) 180 | dev_length = floor(0.2 * lines_length) 181 | dev_end_index = train_length + dev_length 182 | 183 | # get training, dev, and test data 184 | f_train.write('-DOCSTART- -X- O O\n\n') 185 | for line in parsed_list[:train_length]: 186 | line = conll.convert(line) 187 | f_train.write(line) 188 | 189 | f_dev.write('-DOCSTART- -X- O O\n\n') 190 | for line in parsed_list[train_length:dev_end_index]: 191 | line = conll.convert(line) 192 | f_dev.write(line) 193 | 194 | f_test.write('-DOCSTART- -X- O O\n\n') 195 | for line in parsed_list[dev_end_index:]: 196 | line = conll.convert(line) 197 | f_test.write(line) 198 | 199 | # close files 200 | f_train.close() 201 | f_dev.close() 202 | f_test.close() 203 | 204 | def split_conll_stanford(self): 205 | # set conll output files and create conll-format instance 206 | self.__set_datapath_conll_stanford() 207 | conll = ToConll() 208 | 209 | # open files for train, dev, and test 210 | f_train = open(self.train_file_conll_stanford, 'a') 211 | f_dev = open(self.dev_file_conll_stanford, 'a') 212 | f_test = open(self.test_file_conll_stanford, 'a') 213 | 214 | for filename, properties in self.files.items(): 215 | print('pickle file :', self.dataset, properties['log_path']) 216 | 217 | # parsed_list is list of dictionaries containing parsed log entries 218 | with open(properties['groundtruth_pickle_path'], 'rb') as f_pickle: 
219 | parsed_list = pickle.load(f_pickle) 220 | 221 | # note: test_length = dev_length 222 | lines_length = len(parsed_list) 223 | train_length = floor(0.6 * lines_length) 224 | dev_length = floor(0.2 * lines_length) 225 | dev_end_index = train_length + dev_length 226 | 227 | # get training, dev, and test data 228 | for line in parsed_list[:train_length]: 229 | line = conll.convert(line, stanford=True) 230 | f_train.write(line) 231 | 232 | for line in parsed_list[train_length:dev_end_index]: 233 | line = conll.convert(line, stanford=True) 234 | f_dev.write(line) 235 | 236 | for line in parsed_list[dev_end_index:]: 237 | line = conll.convert(line, stanford=True) 238 | f_test.write(line) 239 | 240 | # close files 241 | f_train.close() 242 | f_dev.close() 243 | f_test.close() 244 | 245 | def split_conll_pos(self): 246 | # set conll output files and create conll-format instance 247 | self.__set_datapath_conll_pos() 248 | conll = ToConll() 249 | 250 | # open files for train, dev, and test 251 | f_train = open(self.train_file_conll_pos, 'a') 252 | f_dev = open(self.dev_file_conll_pos, 'a') 253 | f_test = open(self.test_file_conll_pos, 'a') 254 | 255 | for filename, properties in self.files.items(): 256 | print('pickle file :', self.dataset, properties['log_path']) 257 | 258 | # parsed_list is list of dictionaries containing parsed log entries 259 | with open(properties['groundtruth_pickle_path'], 'rb') as f_pickle: 260 | parsed_list = pickle.load(f_pickle) 261 | 262 | # note: test_length = dev_length 263 | lines_length = len(parsed_list) 264 | train_length = floor(0.6 * lines_length) 265 | dev_length = floor(0.2 * lines_length) 266 | dev_end_index = train_length + dev_length 267 | 268 | # get training, dev, and test data 269 | for line in parsed_list[:train_length]: 270 | line = conll.convert(line, ispos=True) 271 | f_train.write(line) 272 | 273 | for line in parsed_list[train_length:dev_end_index]: 274 | line = conll.convert(line, ispos=True) 275 | f_dev.write(line) 276 | 277 | for line in parsed_list[dev_end_index:]: 278 | line = conll.convert(line, ispos=True) 279 | f_test.write(line) 280 | 281 | # close files 282 | f_train.close() 283 | f_dev.close() 284 | f_test.close() 285 | 286 | def split_csv(self, f): 287 | # create conll-format instance 288 | conll = ToConll() 289 | 290 | for filename, properties in self.files.items(): 291 | print('pickle file :', self.dataset, properties['log_path']) 292 | 293 | # parsed_list is list of dictionaries containing parsed log entries 294 | with open(properties['groundtruth_pickle_path'], 'rb') as f_pickle: 295 | parsed_list = pickle.load(f_pickle) 296 | 297 | # get training, dev, and test data 298 | for line_id, line in enumerate(parsed_list): 299 | line_id += 1 300 | line = conll.convert(line, csv=True, csv_line_id=line_id) 301 | f.write(line) 302 | 303 | def split_nltk_tree(self): 304 | # set conll output files and create conll-format instance 305 | self.__set_datapath_nltk_tree() 306 | conll = ToConll() 307 | 308 | # open files for train and test 309 | f_train = open(self.train_file_nltk_tree, 'ab') 310 | f_test = open(self.test_file_nltk_tree, 'ab') 311 | 312 | for filename, properties in self.files.items(): 313 | print('pickle file :', self.dataset, properties['log_path']) 314 | 315 | # parsed_list is list of dictionaries containing parsed log entries 316 | with open(properties['groundtruth_pickle_path'], 'rb') as f_pickle: 317 | parsed_list = pickle.load(f_pickle) 318 | 319 | lines_length = len(parsed_list) 320 | train_length = floor(0.8 * 
lines_length) 321 | 322 | # get training and test data 323 | for line in parsed_list[:train_length]: 324 | line = conll.convert(line, iobtree=True) 325 | pickle.dump(line, f_train) 326 | 327 | for line in parsed_list[train_length:]: 328 | line = conll.convert(line, iobtree=True) 329 | pickle.dump(line, f_test) 330 | 331 | # close files 332 | f_train.close() 333 | f_test.close() 334 | 335 | 336 | def remove_directories(): 337 | # remove output directories 338 | punctuation_path = '/home/hudan/Git/prlogparser/data/punctuation/' 339 | if os.path.isdir(punctuation_path): 340 | shutil.rmtree(punctuation_path) 341 | 342 | conll_path = '/home/hudan/Git/prlogparser/data/conll/' 343 | if os.path.isdir(conll_path): 344 | shutil.rmtree(conll_path) 345 | 346 | conll_path_stanford = '/home/hudan/Git/prlogparser/data/conll-stanford/' 347 | if os.path.isdir(conll_path_stanford): 348 | shutil.rmtree(conll_path_stanford) 349 | 350 | conll_path_pos = '/home/hudan/Git/prlogparser/data/conll-pos/' 351 | if os.path.isdir(conll_path_pos): 352 | shutil.rmtree(conll_path_pos) 353 | 354 | csv_path = '/home/hudan/Git/prlogparser/data/csv/' 355 | if os.path.isdir(csv_path): 356 | shutil.rmtree(csv_path) 357 | 358 | nltk_tree_path = '/home/hudan/Git/prlogparser/data/nltk-tree/' 359 | if os.path.isdir(nltk_tree_path): 360 | shutil.rmtree(nltk_tree_path) 361 | 362 | 363 | def check_path(path): 364 | # check whether the path exists; if it does not, create it 365 | try: 366 | os.makedirs(path) 367 | except OSError as exception: 368 | if exception.errno != errno.EEXIST: 369 | raise 370 | 371 | 372 | def open_csv_file(): 373 | # set csv output files 374 | csv_path = '/home/hudan/Git/prlogparser/data/csv/' 375 | check_path(csv_path) 376 | csv_file = os.path.join(csv_path, 'csv.all.txt') 377 | f_csv = open(csv_file, 'a') 378 | f_csv.write('Sentence #\tWord\tPOS\tTag\n') 379 | 380 | return f_csv 381 | 382 | 383 | if __name__ == '__main__': 384 | datasets = [ 385 | 'casper-rw', 386 | 'dfrws-2009-jhuisi', 387 | 'dfrws-2009-nssal', 388 | 'dfrws-2016', 389 | 'honeynet-challenge5', 390 | 'honeynet-challenge7', 391 | 'bgl2', 392 | 'kippo', 393 | 'proxifier', 394 | 'secrepo-accesslog', 395 | 'zookeeper' 396 | ] 397 | 398 | # clean the output directories first because output files are opened in append mode, 399 | # then run splitting for all datasets 400 | remove_directories() 401 | file_csv = open_csv_file() 402 | for dataset in datasets: 403 | s = Splitting(dataset) 404 | s.split() 405 | s.split_conll() 406 | s.split_conll_pos() 407 | s.split_csv(file_csv) 408 | # s.split_conll_stanford() 409 | # s.split_nltk_tree() 410 | 411 | file_csv.close() 412 | -------------------------------------------------------------------------------- /nerlogparser/preprocessing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/studiawan/nerlogparser/4dc3d955f735ea5496557ee76378a38b5746e425/nerlogparser/preprocessing/__init__.py -------------------------------------------------------------------------------- /nerlogparser/preprocessing/config/datasets.conf: -------------------------------------------------------------------------------- 1 | [main] 2 | dataset_path = /home/hudan/Git/nerlogparser/datasets/ 3 | groundtruth_path = /home/hudan/Git/nerlogparser/groundtruth/ 4 | groundtruth_pickle = /home/hudan/Git/nerlogparser/groundtruth-pickle/ 5 | conll_path = /home/hudan/Git/nerlogparser/data/conll/ 6 | conll_stanford_path = /home/hudan/Git/nerlogparser/data/conll-stanford/ 7
| conll_pos_path = /home/hudan/Git/nerlogparser/data/conll-pos/ 8 | csv_path = /home/hudan/Git/nerlogparser/data/csv/ 9 | nltk_tree_path = /home/hudan/Git/nerlogparser/data/nltk-tree/ -------------------------------------------------------------------------------- /nerlogparser/results/test/events.out.tfevents.1533503273.seitpc80: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/studiawan/nerlogparser/4dc3d955f735ea5496557ee76378a38b5746e425/nerlogparser/results/test/events.out.tfevents.1533503273.seitpc80 -------------------------------------------------------------------------------- /nerlogparser/results/test/model.weights/.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/studiawan/nerlogparser/4dc3d955f735ea5496557ee76378a38b5746e425/nerlogparser/results/test/model.weights/.data-00000-of-00001 -------------------------------------------------------------------------------- /nerlogparser/results/test/model.weights/.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/studiawan/nerlogparser/4dc3d955f735ea5496557ee76378a38b5746e425/nerlogparser/results/test/model.weights/.index -------------------------------------------------------------------------------- /nerlogparser/results/test/model.weights/.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/studiawan/nerlogparser/4dc3d955f735ea5496557ee76378a38b5746e425/nerlogparser/results/test/model.weights/.meta -------------------------------------------------------------------------------- /nerlogparser/results/test/model.weights/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "." 2 | all_model_checkpoint_paths: "." 3 | -------------------------------------------------------------------------------- /nerlogparser/shell/nerlogparser_shell.py: -------------------------------------------------------------------------------- 1 | from nerlogparser.model.ner_model import NERModel 2 | from nerlogparser.model.config import Config 3 | 4 | 5 | def align_data(data): 6 | """Given dict with lists, creates aligned strings 7 | 8 | Adapted from Assignment 3 of CS224N 9 | 10 | Args: 11 | data: (dict) data["x"] = ["I", "love", "you"] 12 | (dict) data["y"] = ["O", "O", "O"] 13 | 14 | Returns: 15 | data_aligned: (dict) data_align["x"] = "I love you" 16 | data_align["y"] = "O O O " 17 | 18 | """ 19 | spacings = [max([len(seq[i]) for seq in data.values()]) 20 | for i in range(len(data[list(data.keys())[0]]))] 21 | data_aligned = dict() 22 | 23 | # for each entry, create aligned string 24 | for key, seq in data.items(): 25 | str_aligned = "" 26 | for token, spacing in zip(seq, spacings): 27 | str_aligned += token + " " * (spacing - len(token) + 1) 28 | 29 | data_aligned[key] = str_aligned 30 | 31 | return data_aligned 32 | 33 | 34 | def interactive_shell(model): 35 | """Creates interactive shell to play with model 36 | 37 | Args: 38 | model: instance of NERModel 39 | 40 | """ 41 | model.logger.info(""" 42 | This is an interactive mode. 43 | To exit, enter 'exit'. 
44 | You can enter a sentence like 45 | input> I love Paris""") 46 | 47 | while True: 48 | sentence = input("input> ") 49 | 50 | words_raw = sentence.strip().split(" ") 51 | 52 | if words_raw == ["exit"]: 53 | break 54 | 55 | preds = model.predict(words_raw) 56 | to_print = align_data({"input": words_raw, "output": preds}) 57 | 58 | for key, seq in to_print.items(): 59 | model.logger.info(seq) 60 | 61 | 62 | def main(): 63 | # create instance of config 64 | config = Config() 65 | 66 | # build model 67 | model = NERModel(config) 68 | model.build() 69 | model.restore_session(config.dir_model) 70 | 71 | # interactive shell 72 | interactive_shell(model) 73 | 74 | 75 | if __name__ == "__main__": 76 | main() 77 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup(name='nerlogparser', 4 | version='0.0.1', 5 | description='Automatic log parser', 6 | long_description='Automatic log parser using named entity recognition in Python', 7 | classifiers=[ 8 | 'Development Status :: 2 - Pre-Alpha', 9 | 'License :: OSI Approved :: Apache Software License', 10 | 'Programming Language :: Python :: 3.5', 11 | ], 12 | keywords='named entity recognition, log parser, log forensics', 13 | url='http://github.com/studiawan/nerlogparser/', 14 | author='Guillaume Genthial, Hudan Studiawan', 15 | author_email='studiawan@gmail.com', 16 | license='Apache', 17 | packages=['nerlogparser'], 18 | entry_points={ 19 | 'console_scripts': [ 20 | 'nerlogparser = nerlogparser.nerlogparser:main' 21 | ], 22 | }, 23 | install_requires=[ 24 | 'tensorflow==1.4.1', 25 | 'nltk' 26 | ], 27 | include_package_data=True, 28 | zip_safe=False) 29 | --------------------------------------------------------------------------------
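Usage sketch: the snippet below is a minimal, illustrative way to drive the trained parser programmatically, assembled from the pattern in nerlogparser/shell/nerlogparser_shell.py and the console entry point declared in setup.py. It assumes the package is installed together with the pretrained weights under nerlogparser/results/test/ and that Config() resolves dir_model to those weights; the log line used as input is a made-up example, not taken from the bundled datasets.

    from nerlogparser.model.config import Config
    from nerlogparser.model.ner_model import NERModel

    # build the model and restore the pretrained session, as in main() of the shell module
    config = Config()
    model = NERModel(config)
    model.build()
    model.restore_session(config.dir_model)

    # label one space-tokenized log line (the line below is a hypothetical example)
    words = 'Jan 12 06:25:01 combo sshd[1234]: session opened for user root'.split(' ')
    labels = model.predict(words)
    for word, label in zip(words, labels):
        print(word, label)

After installation, the same functionality is also exposed through the nerlogparser console script defined in the entry_points section of setup.py.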