├── .gitignore ├── LICENSE.txt ├── MANIFEST.in ├── README.md ├── dataset └── README.md ├── nerlogparser ├── data │ ├── chars.txt │ ├── glove.6B.300d.trimmed.npz │ ├── label.txt │ ├── tags.txt │ ├── test.txt │ └── words.txt ├── dataformat │ ├── __init__.py │ └── toconll.py ├── grammar │ ├── __init__.py │ ├── authlog.py │ ├── bluegenelog.py │ ├── csvlog.py │ ├── daemonlog.py │ ├── debuglog.py │ ├── dmesglog.py │ ├── grammar_utility.py │ ├── kernellog.py │ ├── kippolog.py │ ├── messageslog.py │ ├── proxifierlog.py │ ├── weblog.py │ └── zookeeperlog.py ├── model │ ├── __init__.py │ ├── base_model.py │ ├── build_data.py │ ├── config.py │ ├── data_utils.py │ ├── evaluate.py │ ├── general_utils.py │ ├── ner_model.py │ └── train.py ├── nerlogparser.py ├── output │ ├── __init__.py │ └── to_json.py ├── preprocessing │ ├── Preprocessing.py │ ├── Splitting.py │ ├── __init__.py │ └── config │ │ └── datasets.conf ├── results │ └── test │ │ ├── events.out.tfevents.1533503273.seitpc80 │ │ ├── log.txt │ │ └── model.weights │ │ ├── .data-00000-of-00001 │ │ ├── .index │ │ ├── .meta │ │ └── checkpoint └── shell │ └── nerlogparser_shell.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *sh 2 | *pyc 3 | *.DS_Store 4 | __pycache__/ 5 | .idea 6 | *.egg-info/ 7 | build 8 | dist 9 | results -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | Copyright 2017 Guillaume Genthial 179 | 180 | Licensed under the Apache License, Version 2.0 (the "License"); 181 | you may not use this file except in compliance with the License. 182 | You may obtain a copy of the License at 183 | 184 | http://www.apache.org/licenses/LICENSE-2.0 185 | 186 | Unless required by applicable law or agreed to in writing, software 187 | distributed under the License is distributed on an "AS IS" BASIS, 188 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 189 | See the License for the specific language governing permissions and 190 | limitations under the License. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | graft dataset 2 | graft nerlogparser 3 | include LICENSE.txt 4 | include README.md -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # nerlogparser: An automatic log parser 2 | 3 | This is source code implementation of our paper entitled ["Automatic log parser to support forensic analysis"](http://researchrepository.murdoch.edu.au/id/eprint/42841/) published in the 16th Australian Digital Forensics Conference, pp. 1-10, 2018. We name the tool as `nerlogparser` because it uses named entity recognition (NER) technique to parse log files. This repository is a fork from [sequence_tagging](https://github.com/guillaumegenthial/sequence_tagging) by Guillaume Genthial. 4 | 5 | ## Requirements 6 | 1. Python 3.5 7 | 2. TensorFlow 1.4.1 8 | 3. nltk 3.4 9 | 10 | ## How to install 11 | 1. 
Create a new directory for `nerlogparser` in your home directory 12 | 13 | `mkdir $HOME/nerlogparser` 14 | 15 | 2. Create a virtual environment in the newly created directory with the required Python version (3.5) 16 | 17 | `virtualenv $HOME/nerlogparser -p /usr/bin/python3.5` 18 | 19 | 3. Activate the virtual environment 20 | 21 | `source $HOME/nerlogparser/bin/activate` 22 | 23 | 4. Install `nerlogparser` 24 | 25 | `pip install nerlogparser` 26 | 27 | ## How to run 28 | 1. Make sure you are still in the virtual environment 29 | 2. For example, run `nerlogparser` to parse the authentication log file `/var/log/auth.log` and print the output to the screen 30 | 31 | `nerlogparser -i /var/log/auth.log` 32 | 33 | 3. You can save the parsing results to an output file such as `parsed-auth.json`. At the moment, the only supported output format is JSON. 34 | 35 | `nerlogparser -i /var/log/auth.log -o parsed-auth.json` 36 | 37 | 4. Show the `nerlogparser` help 38 | 39 | `nerlogparser -h` 40 | 41 | ## Import from your Python script 42 | 43 | ```python 44 | import pprint 45 | from nerlogparser.nerlogparser import Nerlogparser 46 | 47 | parser = Nerlogparser() 48 | parsed_logs = parser.parse_logs('/var/log/auth.log') 49 | 50 | for line_id, parsed in parsed_logs.items(): 51 | print('Line:', line_id) 52 | pprint.pprint(parsed) 53 | print() 54 | ``` 55 | 56 | ## License 57 | Apache License 2.0. Please check [LICENSE](https://github.com/studiawan/nerlogparser/blob/master/LICENSE.txt). 58 | -------------------------------------------------------------------------------- /dataset/README.md: -------------------------------------------------------------------------------- 1 | # Download datasets 2 | Please download all datasets used to train `nerlogparser` from this link: https://mega.nz/#F!mNhWXTDS. 3 | The decryption key is !Rc-u3XLUywDYLIQWDkn80Q 4 | 5 | After the download has finished, copy the log files from the various sources into the directory `nerlogparser/datasets`. 6 | -------------------------------------------------------------------------------- /nerlogparser/data/chars.txt: -------------------------------------------------------------------------------- 1 | 9 2 | n 3 | B 4 | [ 5 | - 6 | | 7 | N 8 | a 9 | > 10 | T 11 | ! 12 | U 13 | Z 14 | # 15 | , 16 | l 17 | 0 18 | ; 19 | m 20 | i 21 | q 22 | f 23 | g 24 | O 25 | 7 26 | H 27 | { 28 | ^ 29 | F 30 | 1 31 | ) 32 | 8 33 | & 34 | 6 35 | ' 36 | K 37 | G 38 | b 39 | o 40 | M 41 | z 42 | \ 43 | + 44 | d 45 | V 46 | * 47 | 5 48 | D 49 | 4 50 | c 51 | u 52 | j 53 | J 54 | C 55 | ` 56 | 3 57 | @ 58 | ] 59 | 2 60 | : 61 | L 62 | w 63 | ( 64 | s 65 | k 66 | X 67 | v 68 | = 69 | y 70 | x 71 | _ 72 | " 73 | R 74 | $ 75 | Q 76 | A 77 | % 78 | Y 79 | t 80 | P 81 | e 82 | h 83 | / 84 | ? 85 | E 86 | p 87 | ~ 88 | W 89 | S 90 | . 
91 | < 92 | r 93 | I 94 | } -------------------------------------------------------------------------------- /nerlogparser/data/glove.6B.300d.trimmed.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/studiawan/nerlogparser/4dc3d955f735ea5496557ee76378a38b5746e425/nerlogparser/data/glove.6B.300d.trimmed.npz -------------------------------------------------------------------------------- /nerlogparser/data/label.txt: -------------------------------------------------------------------------------- 1 | TIM timestamp 2 | SEQ sequence_number 3 | LEV level 4 | HOS hostname 5 | SER service 6 | SUB subservice 7 | UTIM unix_time 8 | O message 9 | SOC sock 10 | NUM number 11 | COR core 12 | SOU source 13 | ARC arch 14 | DOM domain_or_ip 15 | STA status 16 | IPA ip_address 17 | DAS dash 18 | AUT auth 19 | COM command 20 | STC status_code 21 | BYT num_bytes 22 | REF referrer 23 | CLI client_agent 24 | JOB job -------------------------------------------------------------------------------- /nerlogparser/data/tags.txt: -------------------------------------------------------------------------------- 1 | I-COM 2 | I-SOU 3 | I-CLI 4 | I-JOB 5 | I-HOS 6 | B-CLI 7 | I-COR 8 | I-REF 9 | B-TIM 10 | O 11 | B-UTIM 12 | I-BYT 13 | I-TIM 14 | B-JOB 15 | I-IPA 16 | I-SEQ 17 | I-STC 18 | I-AUT 19 | I-STA 20 | I-UTIM 21 | B-SER 22 | B-DOM 23 | I-SOC 24 | I-SUB 25 | B-SUB 26 | B-COM 27 | I-SER 28 | I-NUM 29 | B-STA 30 | B-REF 31 | I-DAS 32 | I-LEV 33 | I-DOM 34 | I-ARC -------------------------------------------------------------------------------- /nerlogparser/data/test.txt: -------------------------------------------------------------------------------- 1 | Jean B-PER 2 | Pierre I-PER 3 | lives O 4 | in O 5 | New B-LOC 6 | York I-LOC 7 | . O 8 | 9 | The O 10 | European B-ORG 11 | Union I-ORG 12 | is O 13 | a O 14 | political O 15 | and O 16 | economic O 17 | union O 18 | 19 | A O 20 | French B-MISC 21 | American I-MISC 22 | actor O 23 | won O 24 | an O 25 | oscar O 26 | 27 | Jean B-PER 28 | Pierre I-PER 29 | lives O 30 | in O 31 | New B-LOC 32 | York I-LOC 33 | . O 34 | 35 | The O 36 | European B-ORG 37 | Union I-ORG 38 | is O 39 | a O 40 | political O 41 | and O 42 | economic O 43 | union O 44 | 45 | A O 46 | French B-MISC 47 | American I-MISC 48 | actor O 49 | won O 50 | an O 51 | oscar O 52 | 53 | Jean B-PER 54 | Pierre I-PER 55 | lives O 56 | in O 57 | New B-LOC 58 | York I-LOC 59 | . O 60 | 61 | The O 62 | European B-ORG 63 | Union I-ORG 64 | is O 65 | a O 66 | political O 67 | and O 68 | economic O 69 | union O 70 | 71 | A O 72 | French B-MISC 73 | American I-MISC 74 | actor O 75 | won O 76 | an O 77 | oscar O 78 | 79 | Jean B-PER 80 | Pierre I-PER 81 | lives O 82 | in O 83 | New B-LOC 84 | York I-LOC 85 | . 
O 86 | 87 | The O 88 | European B-ORG 89 | Union I-ORG 90 | is O 91 | a O 92 | political O 93 | and O 94 | economic O 95 | union O 96 | 97 | A O 98 | French B-MISC 99 | American I-MISC 100 | actor O 101 | won O 102 | an O 103 | oscar O 104 | -------------------------------------------------------------------------------- /nerlogparser/dataformat/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/studiawan/nerlogparser/4dc3d955f735ea5496557ee76378a38b5746e425/nerlogparser/dataformat/__init__.py -------------------------------------------------------------------------------- /nerlogparser/dataformat/toconll.py: -------------------------------------------------------------------------------- 1 | from nltk import pos_tag, conlltags2tree 2 | 3 | 4 | class ToConll(object): 5 | def __init__(self): 6 | self.entity_names = { 7 | 'timestamp': 'I-TIM', 8 | 'timestamp2': 'B-TIM', 9 | 'sequence_number': 'I-SEQ', 10 | 'level': 'I-LEV', 11 | 'hostname': 'I-HOS', 12 | 'service2': 'B-SER', 13 | 'service': 'I-SER', 14 | 'subservice2': 'B-SUB', 15 | 'subservice': 'I-SUB', 16 | 'unix_time2': 'B-UTIM', 17 | 'unix_time': 'I-UTIM', 18 | 'message': 'O', 19 | 'sock': 'I-SOC', 20 | 'number': 'I-NUM', 21 | 'core1': 'I-COR', 22 | 'core2': 'I-COR', 23 | 'timestamp_bgl': 'I-TIM', 24 | 'source': 'I-SOU', 25 | 'arch': 'I-ARC', 26 | 'domain_or_ip': 'I-DOM', 27 | 'domain_or_ip2': 'B-DOM', 28 | 'status': 'I-STA', 29 | 'status2': 'B-STA', 30 | 'ip_address': 'I-IPA', 31 | 'dash': 'I-DAS', 32 | 'auth': 'I-AUT', 33 | 'command': 'I-COM', 34 | 'command2': 'B-COM', 35 | 'status_code': 'I-STC', 36 | 'num_bytes': 'I-BYT', 37 | 'referrer': 'I-REF', 38 | 'referrer2': 'B-REF', 39 | 'client_agent': 'I-CLI', 40 | 'client_agent2': 'B-CLI', 41 | 'job': 'I-JOB', 42 | 'job2': 'B-JOB' 43 | } 44 | self.classes = ['I-TIM', 'B-TIM', 'I-SEQ', 'B-SEQ' 'I-LEV', 'B-LEV' 'I-HOS', 'B-HOS', 'I-SER', 'B-SER', 45 | 'B-SUB', 'I-SUB', 'B-UTIM', 'I-UTIM', 'O', 'B-SOC', 'I-SOC', 'B-NUM', 'I-NUM', 'I-COR', 'B-COR', 46 | 'B-SOU', 'I-SOU', 'B-ARC', 'I-ARC', 'I-DOM', 'B-DOM', 'I-STA', 'B-STA', 'B-IPA', 'I-IPA', 47 | 'I-DAS', 'B-DAS', 'B-AUT', 'I-AUT', 'B-COM', 'I-COM', 'B-STC', 'I-STC', 'B-BYT', 'I-BYT', 48 | 'I-REF', 'B-REF', 'I-CLI', 'B-CLI', 'I-JOB', 'B-JOB'] 49 | 50 | def __get_conll_format(self, value_split, value_split_len, entity, stanford=False): 51 | if stanford is True: 52 | underscore = ' ' 53 | else: 54 | underscore = ' _ _ ' 55 | 56 | conll_format = '' 57 | if entity != 'message': 58 | if value_split_len == 1: 59 | conll_format += value_split[0] + underscore + self.entity_names[entity] + '\n' 60 | else: 61 | for index, value_name in enumerate(value_split): 62 | if index == 0: 63 | conll_format += value_name + underscore + self.entity_names[entity + '2'] + '\n' 64 | else: 65 | conll_format += value_name + underscore + self.entity_names[entity] + '\n' 66 | else: 67 | for value_name in value_split: 68 | conll_format += value_name + underscore + self.entity_names[entity] + '\n' 69 | 70 | return conll_format 71 | 72 | def __get_conll_pos(self, value_pos, value_split_len, entity): 73 | conll_format = '' 74 | if entity != 'message': 75 | if value_split_len == 1: 76 | conll_format += value_pos[0][0] + ' ' + value_pos[0][1] + ' ' + self.entity_names[entity] + '\n' 77 | else: 78 | for index, value_name in enumerate(value_pos): 79 | if index == 0: 80 | conll_format += value_name[0] + ' ' + value_name[1] + ' ' + \ 81 | self.entity_names[entity + '2'] + '\n' 82 | else: 83 | conll_format 
+= value_name[0] + ' ' + value_name[1] + ' ' + self.entity_names[entity] + '\n' 84 | else: 85 | for value_name in value_pos: 86 | conll_format += value_name[0] + ' ' + value_name[1] + ' ' + self.entity_names[entity] + '\n' 87 | 88 | return conll_format 89 | 90 | def __get_csv(self, value_pos, value_split_len, entity): 91 | csv_string = '' 92 | if entity != 'message': 93 | if value_split_len == 1: 94 | csv_string += '\t' + value_pos[0][0] + '\t' + value_pos[0][1] + '\t' + self.entity_names[entity] + '\n' 95 | else: 96 | for index, value_name in enumerate(value_pos): 97 | if index == 0: 98 | csv_string += '\t' + value_name[0] + '\t' + value_name[1] + '\t' + \ 99 | self.entity_names[entity + '2'] + '\n' 100 | else: 101 | csv_string += '\t' + value_name[0] + '\t' + value_name[1] + '\t' + \ 102 | self.entity_names[entity] + '\n' 103 | else: 104 | for value_name in value_pos: 105 | csv_string += '\t' + value_name[0] + '\t' + value_name[1] + '\t' + self.entity_names[entity] + '\n' 106 | 107 | return csv_string 108 | 109 | def __get_nltk_tree(self, value_pos, value_split_len, entity): 110 | iob_list = [] 111 | if entity != 'message': 112 | if value_split_len == 1: 113 | iob_tuple = (value_pos[0][0], value_pos[0][1], self.entity_names[entity]) 114 | iob_list.append(iob_tuple) 115 | else: 116 | for index, value_name in enumerate(value_pos): 117 | if index == 0: 118 | iob_tuple = (value_name[0], value_name[1], self.entity_names[entity + '2']) 119 | iob_list.append(iob_tuple) 120 | else: 121 | iob_tuple = (value_name[0], value_name[1], self.entity_names[entity]) 122 | iob_list.append(iob_tuple) 123 | else: 124 | for value_name in value_pos: 125 | iob_tuple = (value_name[0], value_name[1], self.entity_names[entity]) 126 | iob_list.append(iob_tuple) 127 | 128 | return iob_list 129 | 130 | def convert(self, parsed, stanford=False, ispos=False, csv=False, csv_line_id=0, iobtree=False): 131 | if csv: 132 | conll_format = 'Sentence: ' + str(csv_line_id) 133 | elif iobtree: 134 | conll_format = None 135 | else: 136 | conll_format = '' 137 | 138 | conll = [] 139 | for entity, value in parsed.items(): 140 | value_split = value.split() 141 | value_split_len = len(value_split) 142 | 143 | if value != '': 144 | if ispos: 145 | value_pos = pos_tag(value_split) # pos = part of speech 146 | conll_format += self.__get_conll_pos(value_pos, value_split_len, entity) 147 | elif csv: 148 | value_pos = pos_tag(value_split) 149 | conll_format += self.__get_csv(value_pos, value_split_len, entity) 150 | elif iobtree: 151 | value_pos = pos_tag(value_split) 152 | iob = self.__get_nltk_tree(value_pos, value_split_len, entity) 153 | conll = conll + iob 154 | else: 155 | conll_format += self.__get_conll_format(value_split, value_split_len, entity, stanford) 156 | 157 | if csv is False and iobtree is False: 158 | conll_format += '\n' 159 | 160 | if iobtree: 161 | conll_format = conlltags2tree(conll) 162 | 163 | return conll_format 164 | -------------------------------------------------------------------------------- /nerlogparser/grammar/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/studiawan/nerlogparser/4dc3d955f735ea5496557ee76378a38b5746e425/nerlogparser/grammar/__init__.py -------------------------------------------------------------------------------- /nerlogparser/grammar/authlog.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | from pyparsing import Word, alphas, Combine, 
nums, string, Optional, Regex 4 | from collections import OrderedDict 5 | 6 | 7 | class AuthLog(object): 8 | def __init__(self, dataset): 9 | """Constructor for class AuthLog. 10 | 11 | Parameters 12 | ---------- 13 | dataset : str 14 | Dataset name. 15 | """ 16 | self.dataset = dataset 17 | self.authlog_grammar = self.__get_authlog_grammar() 18 | 19 | @staticmethod 20 | def __get_authlog_grammar(): 21 | """The definition of auth.log grammar. Supported dataset: 22 | casper-rw 23 | dfrws-2009 24 | honeynet-challenge5 25 | honeynet-challenge7 26 | 27 | Returns 28 | ------- 29 | authlog_grammar : 30 | Grammar for auth.log 31 | """ 32 | ints = Word(nums) 33 | 34 | # timestamp 35 | month = Word(string.ascii_uppercase, string.ascii_lowercase, exact=3) 36 | day = ints 37 | hour = Combine(ints + ':' + ints + ':' + ints) 38 | timestamp = month + day + hour 39 | 40 | # hostname, service name, message 41 | hostname = Word(alphas + nums + '_' + '-' + '.') 42 | service = Word(alphas + nums + '/' + '-' + '_' + '.' + '[' + ']' + ':') 43 | subservice = Optional(Word(alphas + ':' + '-' + '_' + '(' + ')')) 44 | subservice_two_words = Optional(Word(alphas + ':' + '-' + '_' + '(' + ')')) + \ 45 | Optional(Word(alphas + ':' + '-' + '_' + '(' + ')')) 46 | message = Regex('.*') 47 | 48 | # auth log grammar 49 | authlog_grammar = timestamp + hostname + service + subservice + subservice_two_words + message 50 | return authlog_grammar 51 | 52 | def parse_log(self, log_line): 53 | """Parse auth.log based on defined grammar. 54 | 55 | Parameters 56 | ---------- 57 | log_line : str 58 | A log line to be parsed. 59 | 60 | Returns 61 | ------- 62 | parsed : dict[str, str] 63 | A parsed auth.log containing these elements: timestamp, hostname, service, pid, subservice and message. 
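
        Examples
        --------
        Illustrative sketch only; the auth.log line below is hypothetical, and
        the exact field split follows the grammar defined above.

        >>> from nerlogparser.grammar.authlog import AuthLog
        >>> log_line = 'Nov  5 12:00:01 ubuntu sshd[2264]: Accepted password for root'  # hypothetical line
        >>> parsed = AuthLog('').parse_log(log_line)
        >>> parsed['service'], parsed['hostname']
        ('sshd[2264]:', 'ubuntu')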
64 | """ 65 | parsed_authlog = self.authlog_grammar.parseString(log_line) 66 | 67 | # get parsed auth.log 68 | parsed = OrderedDict() 69 | parsed['timestamp'] = parsed_authlog[0] + ' ' + parsed_authlog[1] + ' ' + parsed_authlog[2] 70 | parsed['hostname'] = parsed_authlog[3] 71 | parsed['service'] = parsed_authlog[4] 72 | 73 | if len(parsed_authlog) == 6: 74 | parsed['subservice'] = '' 75 | parsed['message'] = parsed_authlog[5] 76 | 77 | elif len(parsed_authlog) == 8: 78 | if not parsed_authlog[6].endswith(':'): 79 | parsed['subservice'] = '' 80 | parsed['message'] = ' '.join(parsed_authlog[5:]) 81 | 82 | if parsed_authlog[5].endswith(':'): 83 | parsed['subservice'] = parsed_authlog[5] 84 | parsed['message'] = ' '.join(parsed_authlog[6:]) 85 | 86 | else: 87 | if parsed_authlog[6].endswith(':'): 88 | parsed['subservice'] = parsed_authlog[5] + ' ' + parsed_authlog[6] 89 | parsed['message'] = ' '.join(parsed_authlog[7:]) 90 | 91 | else: 92 | parsed['subservice'] = parsed_authlog[5] 93 | parsed['message'] = ' '.join(parsed_authlog[6:]) 94 | 95 | if not parsed['subservice'].endswith(':'): 96 | parsed['message'] = parsed['subservice'] + ' ' + parsed['message'] 97 | parsed['subservice'] = '' 98 | 99 | return parsed 100 | 101 | 102 | if __name__ == '__main__': 103 | # get auth.log datasets 104 | dataset_path = '/home/hudan/Git/prlogparser/datasets/' 105 | filenames = [ 106 | 'casper-rw/auth.log', 107 | 'dfrws-2009-jhuisi/auth.log', 108 | 'dfrws-2009-jhuisi/auth.log.0', 109 | 'dfrws-2009-jhuisi/auth.log.1', 110 | 'dfrws-2009-nssal/auth.log', 111 | 'dfrws-2009-nssal/auth.log.0', 112 | 'dfrws-2009-nssal/auth.log.1', 113 | 'dfrws-2009-nssal/auth.log.2', 114 | 'dfrws-2009-nssal/auth.log.3', 115 | 'dfrws-2009-nssal/auth.log.4', 116 | 'honeynet-challenge5/auth.log', 117 | 'honeynet-challenge7/auth.log' 118 | ] 119 | 120 | # setup test csv file to save results 121 | test_file = '/home/hudan/Git/prlogparser/groundtruth/auth-test.csv' 122 | f = open(test_file, 'w', newline='') 123 | writer = csv.writer(f) 124 | 125 | # parse auth.log 126 | dl = AuthLog('') 127 | for filename in filenames: 128 | filename = os.path.join(dataset_path, filename) 129 | with open(filename, 'r') as f: 130 | for line in f: 131 | # get parsed line and print 132 | parsed_line = dl.parse_log(line) 133 | print(parsed_line) 134 | 135 | # write to csv 136 | row = list(parsed_line.values()) 137 | writer.writerow(row) 138 | f.close() 139 | -------------------------------------------------------------------------------- /nerlogparser/grammar/bluegenelog.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | from pyparsing import Word, alphas, Combine, nums, Regex, ParseException 4 | from collections import OrderedDict 5 | 6 | 7 | class BlueGeneLog(object): 8 | def __init__(self, dataset): 9 | self.dataset = dataset 10 | self.bluegenelog_grammar = self.__get_bluegenelog_grammar() 11 | 12 | @staticmethod 13 | def __get_bluegenelog_grammar(): 14 | """The definition of BlueGene/L grammar. 15 | 16 | The BlueGene/L logs can be downloaded from [Usenix2006a]_ and 17 | this data was used in [Stearley2008]_. 18 | 19 | Returns 20 | ------- 21 | bluegene_grammar : 22 | Grammar for BlueGene/L supercomputer logs. 23 | 24 | References 25 | ---------- 26 | .. [Usenix2006a] The HPC4 data. URL: https://www.usenix.org/cfdr-data#hpc4 27 | .. [Stearley2008] Stearley, J., & Oliner, A. J. Bad words: Finding faults in Spirit's syslogs. 
28 | In 8th IEEE International Symposium on Cluster Computing and the Grid, pp. 765-770. 29 | """ 30 | ints = Word(nums) 31 | 32 | sock = Word(alphas + '-' + '_') 33 | number = ints 34 | date = Combine(ints + '.' + ints + '.' + ints) 35 | core1 = Word(alphas + nums + '-' + ':' + '_') 36 | datetime = Combine(ints + '-' + ints + '-' + ints + '-' + ints + '.' + ints + '.' + ints + '.' + ints) 37 | core2 = Word(alphas + nums + '-' + ':' + '_') 38 | source = Word(alphas) 39 | service = Word(alphas) 40 | info_type = Word(alphas) 41 | message = Regex('.*') 42 | 43 | # blue gene log grammar 44 | bluegene_grammar = sock + number + date + core1 + datetime + core2 + source + service + info_type + message 45 | return bluegene_grammar 46 | 47 | def parse_log(self, log_line): 48 | """Parse the BlueGene/L logs based on defined grammar. 49 | 50 | Parameters 51 | ---------- 52 | log_line : str 53 | A log line to be parsed 54 | 55 | Returns 56 | ------- 57 | parsed : dict[str, str] 58 | A parsed BlueGene/L log. 59 | """ 60 | parsed = OrderedDict() 61 | try: 62 | parsed_bluegenelog = self.bluegenelog_grammar.parseString(log_line) 63 | parsed['sock'] = parsed_bluegenelog[0] 64 | parsed['number'] = parsed_bluegenelog[1] 65 | parsed['timestamp'] = parsed_bluegenelog[2] 66 | parsed['core1'] = parsed_bluegenelog[3] 67 | parsed['timestamp_bgl'] = parsed_bluegenelog[4] 68 | parsed['core2'] = parsed_bluegenelog[5] 69 | parsed['source'] = parsed_bluegenelog[6] 70 | parsed['service'] = parsed_bluegenelog[7] 71 | parsed['level'] = parsed_bluegenelog[8] 72 | parsed['message'] = parsed_bluegenelog[9] 73 | 74 | except ParseException: 75 | print(log_line) 76 | 77 | return parsed 78 | 79 | 80 | if __name__ == '__main__': 81 | dataset_path = '/home/hudan/Git/prlogparser/datasets/' 82 | filenames = ['bgl2/bgl2'] 83 | 84 | test_file = '/home/hudan/Git/prlogparser/groundtruth/test-results/bgl-test.csv' 85 | f = open(test_file, 'w', newline='') 86 | writer = csv.writer(f) 87 | 88 | bl = BlueGeneLog('') 89 | for filename in filenames: 90 | filename = os.path.join(dataset_path, filename) 91 | with open(filename, 'r') as f: 92 | for line in f: 93 | parsed_line = bl.parse_log(line) 94 | print(parsed_line['timestamp']) 95 | 96 | row = list(parsed_line.values()) 97 | writer.writerow(row) 98 | f.close() 99 | -------------------------------------------------------------------------------- /nerlogparser/grammar/csvlog.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | import csv 3 | 4 | 5 | class CSVLog(object): 6 | def __init__(self, dataset): 7 | self.dataset = dataset 8 | if self.dataset == 'dfrws-2016': 9 | self.log_file = '/home/hudan/Git/prlogparser/datasets/dfrws-2016/csv.csv' 10 | 11 | def parse_log(self): 12 | parsed_logs = [] 13 | with open(self.log_file, 'r') as f: 14 | reader = csv.reader(f) 15 | for row in reader: 16 | parsed = OrderedDict() 17 | parsed['timestamp'] = row[0] 18 | parsed['sequence_number'] = row[1] 19 | parsed['service'] = row[2] 20 | parsed['level'] = row[3] 21 | parsed['message'] = row[4] 22 | parsed_logs.append(parsed) 23 | 24 | return parsed_logs 25 | 26 | 27 | if __name__ == '__main__': 28 | csvlog = CSVLog('dfrws-2016') 29 | results = csvlog.parse_log() 30 | for result in results: 31 | print(result) 32 | -------------------------------------------------------------------------------- /nerlogparser/grammar/daemonlog.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 
3 | from pyparsing import Word, alphas, Combine, nums, string, Optional, Regex 4 | from collections import OrderedDict 5 | 6 | 7 | class DaemonLog(object): 8 | def __init__(self, dataset): 9 | """Constructor for class DaemonLog. 10 | 11 | Parameters 12 | ---------- 13 | dataset : str 14 | Dataset name. 15 | """ 16 | self.dataset = dataset 17 | self.daemonlog_grammar = self.__get_daemonlog_grammar() 18 | 19 | @staticmethod 20 | def __get_daemonlog_grammar(): 21 | """The definition of daemon.log grammar. Supported dataset: 22 | casper-rw 23 | dfrws-2009 24 | honeynet-challenge5 25 | honeynet-challenge7 26 | 27 | Returns 28 | ------- 29 | daemonlog_grammar : 30 | Grammar for daemon.log 31 | """ 32 | ints = Word(nums) 33 | 34 | # timestamp 35 | month = Word(string.ascii_uppercase, string.ascii_lowercase, exact=3) 36 | day = ints 37 | hour = Combine(ints + ':' + ints + ':' + ints) 38 | timestamp = month + day + hour 39 | 40 | # hostname, service name, message 41 | hostname = Word(alphas + nums + '_' + '-' + '.') 42 | service = Word(alphas + nums + '/' + '-' + '_' + '.' + '[' + ']' + '(' + ')' + ':') 43 | subservice = Optional(Word(alphas + nums + ':' + '-' + '_' + '<' + '>' + '.' + "'" + ',')) 44 | subservice_two_words = Optional(Word(alphas + nums + ':' + '-' + '_' + '(' + ')' + '.' + ',')) 45 | message = Regex('.*') 46 | 47 | # daemon log grammar 48 | daemon_grammar = timestamp + hostname + service + subservice + subservice_two_words + message 49 | return daemon_grammar 50 | 51 | def parse_log(self, log_line): 52 | """Parse auth.log based on defined grammar. 53 | 54 | Parameters) 55 | ---------- 56 | log_line : str 57 | A log line to be parsed. 58 | 59 | Returns 60 | ------- 61 | parsed : dict[str, str] 62 | A parsed auth.log containing these elements: timestamp, hostname, service, subservice and message. 
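
        Examples
        --------
        Illustrative sketch only; the daemon.log line below is hypothetical, and
        the resulting fields follow the grammar and branching logic defined above.

        >>> from nerlogparser.grammar.daemonlog import DaemonLog
        >>> log_line = 'Mar  2 09:01:12 ubuntu NetworkManager[695]: <info> device state change'  # hypothetical line
        >>> parsed = DaemonLog('').parse_log(log_line)
        >>> parsed['service'], parsed['subservice']
        ('NetworkManager[695]:', '<info>')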
63 | """ 64 | parsed_daemonlog = self.daemonlog_grammar.parseString(log_line) 65 | 66 | # remove empty string 67 | parsed_daemonlog = list(filter(None, parsed_daemonlog)) 68 | 69 | # get parsed auth.log 70 | parsed = OrderedDict() 71 | parsed['timestamp'] = parsed_daemonlog[0] + ' ' + parsed_daemonlog[1] + ' ' + parsed_daemonlog[2] 72 | parsed['hostname'] = parsed_daemonlog[3] 73 | parsed['service'] = parsed_daemonlog[4] 74 | 75 | if len(parsed_daemonlog) == 5: 76 | parsed['subservice'] = '' 77 | parsed['message'] = '' 78 | 79 | elif len(parsed_daemonlog) == 6: 80 | parsed['subservice'] = '' 81 | parsed['message'] = parsed_daemonlog[5] 82 | 83 | else: 84 | # subservice one word 85 | if parsed_daemonlog[5].endswith(':'): 86 | parsed['subservice'] = parsed_daemonlog[5] 87 | parsed['message'] = ' '.join(parsed_daemonlog[6:]) 88 | 89 | # subservice two words 90 | elif not parsed_daemonlog[5].endswith(':') and parsed_daemonlog[6].endswith(':'): 91 | parsed['subservice'] = parsed_daemonlog[5] + ' ' + parsed_daemonlog[6] 92 | parsed['message'] = ' '.join(parsed_daemonlog[7:]) 93 | 94 | # subservice two words 95 | elif parsed_daemonlog[5].endswith('>') and parsed_daemonlog[6].endswith(':'): 96 | parsed['subservice'] = parsed_daemonlog[5] + ' ' + parsed_daemonlog[6] 97 | parsed['message'] = ' '.join(parsed_daemonlog[7:]) 98 | 99 | # subservice one word 100 | elif parsed_daemonlog[5].endswith('>') and not parsed_daemonlog[6].endswith(':'): 101 | parsed['subservice'] = parsed_daemonlog[5] 102 | parsed['message'] = ' '.join(parsed_daemonlog[6:]) 103 | 104 | else: 105 | parsed['subservice'] = '' 106 | parsed['message'] = parsed_daemonlog[5] + ' ' + ' '.join(parsed_daemonlog[6:]) 107 | 108 | if not parsed['service'].endswith(':'): 109 | parsed['message'] = parsed['service'] + parsed['message'] 110 | parsed['service'] = '' 111 | 112 | return parsed 113 | 114 | 115 | if __name__ == '__main__': 116 | # get daemon.log datasets 117 | dataset_path = '/home/hudan/Git/prlogparser/datasets/' 118 | filenames = [ 119 | 'casper-rw/daemon.log', 120 | 'dfrws-2009-jhuisi/daemon.log', 121 | 'dfrws-2009-jhuisi/daemon.log.0', 122 | 'dfrws-2009-jhuisi/daemon.log.1', 123 | 'dfrws-2009-nssal/daemon.log', 124 | 'dfrws-2009-nssal/daemon.log.0', 125 | 'dfrws-2009-nssal/daemon.log.1', 126 | 'dfrws-2009-nssal/daemon.log.2', 127 | 'dfrws-2009-nssal/daemon.log.3', 128 | 'honeynet-challenge5/daemon.log', 129 | 'honeynet-challenge7/daemon.log' 130 | ] 131 | 132 | # setup test csv file to save results 133 | test_file = '/home/hudan/Git/prlogparser/groundtruth/daemon-test.csv' 134 | f = open(test_file, 'w', newline='') 135 | writer = csv.writer(f) 136 | 137 | # parse daemon.log 138 | dl = DaemonLog('') 139 | for filename in filenames: 140 | filename = os.path.join(dataset_path, filename) 141 | with open(filename, 'r') as f: 142 | for line in f: 143 | # get parsed line and print 144 | parsed_line = dl.parse_log(line) 145 | print(parsed_line) 146 | 147 | # write to csv 148 | row = list(parsed_line.values()) 149 | writer.writerow(row) 150 | 151 | f.close() 152 | -------------------------------------------------------------------------------- /nerlogparser/grammar/debuglog.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | from pyparsing import Word, alphas, Combine, nums, string, Optional, Regex 4 | from collections import OrderedDict 5 | from nerlogparser.grammar.grammar_utility import GrammarUtility 6 | 7 | 8 | class DebugLog(object): 9 | def __init__(self, dataset): 10 
| self.dataset = dataset 11 | self.groups = { 12 | 'group1': ['casper-rw'], 13 | 'group2': ['dfrws-2009-jhuisi', 'dfrws-2009-nssal', 'honeynet-challenge7'], 14 | 'group3': ['honeynet-challenge5'] 15 | } 16 | 17 | def get_grammar(self): 18 | dl = None 19 | if self.dataset in self.groups['group1']: 20 | dl = DebugLog1(self.dataset) 21 | 22 | elif self.dataset in self.groups['group2']: 23 | dl = DebugLog2(self.dataset) 24 | 25 | elif self.dataset in self.groups['group3']: 26 | dl = DebugLog3(self.dataset) 27 | 28 | return dl 29 | 30 | 31 | class DebugLog1(object): 32 | def __init__(self, dataset): 33 | """Constructor for class DebugLog. 34 | 35 | Parameters 36 | ---------- 37 | dataset : str 38 | Dataset name. 39 | """ 40 | self.dataset = dataset 41 | self.debuglog_grammar = self.__get_debuglog_grammar() 42 | 43 | @staticmethod 44 | def __get_debuglog_grammar(): 45 | """The definition of debug log grammar. Supported dataset: 46 | casper-rw 47 | 48 | Returns 49 | ------- 50 | debuglog_grammar : 51 | Grammar for debug log 52 | """ 53 | ints = Word(nums) 54 | 55 | # timestamp 56 | month = Word(string.ascii_uppercase, string.ascii_lowercase, exact=3) 57 | day = ints 58 | hour = Combine(ints + ':' + ints + ':' + ints) 59 | timestamp = month + day + hour 60 | 61 | # hostname, service name, message 62 | hostname = Word(alphas + nums + '_' + '-' + '.') 63 | service = Word(alphas + nums + '/' + '-' + '_' + '.' + '[' + ']' + ':') 64 | 65 | # unix time 66 | unix_time = '[' + Word(nums + '.' + ']') 67 | subservice = Optional(Word(alphas + nums + '_' + '-' + ':')) 68 | subservice_two_words = Optional(Word(alphas + nums + '_' + '-' + ':' + '[' + ']' + ',' + '.' + '=' + '/')) 69 | message = Regex('.*') 70 | 71 | # debug log grammar 72 | debuglog_grammar = timestamp + hostname + service + unix_time + subservice + subservice_two_words + message 73 | return debuglog_grammar 74 | 75 | def parse_log(self, log_line): 76 | """Parse debug log based on defined grammar. 77 | 78 | Parameters 79 | ---------- 80 | log_line : str 81 | A log line to be parsed. 82 | 83 | Returns 84 | ------- 85 | parsed : dict[str, str] 86 | A parsed debug log containing these elements: timestamp, hostname, service, unix_time, 87 | subservice and message. 
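
        Examples
        --------
        Illustrative sketch only; the debug line below is hypothetical, but it
        follows the layout this grammar expects (syslog header plus a bracketed
        uptime value).

        >>> from nerlogparser.grammar.debuglog import DebugLog1
        >>> log_line = 'Mar  2 09:01:12 ubuntu kernel: [   12.345678] ACPI: Interpreter enabled'  # hypothetical line
        >>> parsed = DebugLog1('casper-rw').parse_log(log_line)
        >>> sorted(parsed.keys())
        ['hostname', 'message', 'service', 'subservice', 'timestamp', 'unix_time']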
88 | """ 89 | parsed_debuglog = self.debuglog_grammar.parseString(log_line) 90 | 91 | # get parsed debug log 92 | parsed = OrderedDict() 93 | parsed['timestamp'] = parsed_debuglog[0] + ' ' + parsed_debuglog[1] + ' ' + parsed_debuglog[2] 94 | parsed['hostname'] = parsed_debuglog[3] 95 | parsed['service'] = parsed_debuglog[4] 96 | parsed['unix_time'] = GrammarUtility.get_unix_timestamp(parsed_debuglog[5], parsed_debuglog[6]) 97 | 98 | if len(parsed_debuglog) == 8: 99 | parsed['subservice'] = '' 100 | parsed['message'] = parsed_debuglog[7] 101 | 102 | else: 103 | # subservice one word 104 | if parsed_debuglog[7].endswith(':') and not parsed_debuglog[8].endswith(':'): 105 | parsed['subservice'] = parsed_debuglog[7] 106 | parsed['message'] = ' '.join(parsed_debuglog[8:]) 107 | 108 | # subservice two words 109 | elif not parsed_debuglog[7].endswith(':') and parsed_debuglog[8].endswith(':'): 110 | parsed['subservice'] = parsed_debuglog[7] + ' ' + parsed_debuglog[8] 111 | parsed['message'] = ' '.join(parsed_debuglog[9:]) 112 | 113 | # subservice two words 114 | elif parsed_debuglog[7].endswith(':') and parsed_debuglog[8].endswith(':'): 115 | parsed['subservice'] = parsed_debuglog[7] + ' ' + parsed_debuglog[8] 116 | parsed['message'] = ' '.join(parsed_debuglog[9:]) 117 | 118 | else: 119 | parsed['subservice'] = '' 120 | parsed['message'] = ' '.join(parsed_debuglog[7:]) 121 | 122 | if not parsed['subservice'].endswith(':'): 123 | parsed['message'] = parsed['subservice'] + ' ' + parsed['message'] 124 | parsed['subservice'] = '' 125 | 126 | return parsed 127 | 128 | 129 | class DebugLog2(object): 130 | def __init__(self, dataset): 131 | """Constructor for class MessagesLog. This parser also supports syslog. 132 | 133 | Parameters 134 | ---------- 135 | dataset : str 136 | Dataset name. 137 | """ 138 | self.dataset = dataset 139 | self.messageslog_grammar = self.__get_messageslog_grammar() 140 | 141 | @staticmethod 142 | def __get_messageslog_grammar(): 143 | """The definition of messages log grammar. Supported dataset: 144 | dfrws-2009 145 | honeynet-challenge7 146 | 147 | Returns 148 | ------- 149 | messageslog_grammar : 150 | Grammar for messages log 151 | """ 152 | ints = Word(nums) 153 | 154 | # timestamp 155 | month = Word(string.ascii_uppercase, string.ascii_lowercase, exact=3) 156 | day = ints 157 | hour = Combine(ints + ':' + ints + ':' + ints) 158 | timestamp = month + day + hour 159 | 160 | # hostname, service name, message 161 | hostname = Word(alphas + nums + '_' + '-' + '.') 162 | service = Word(alphas + nums + '/' + '-' + '_' + '.' + '[' + ']' + ':') 163 | subservice_two_words = Optional(Word(alphas + nums + '_' + '-' + ':' + '[' + ']' + ',' + '.' + '=' + '/')) 164 | 165 | # unix time 166 | unix_time = Optional('[' + Word(nums + '.' + ']')) 167 | subservice = Optional(Word(alphas + nums + '_' + '-' + ':')) 168 | message = Regex('.*') 169 | 170 | # messages log grammar 171 | messageslog_grammar = timestamp + hostname + service + unix_time + subservice + subservice_two_words + message 172 | return messageslog_grammar 173 | 174 | def parse_log(self, log_line): 175 | """Parse messages log based on defined grammar. 176 | 177 | Parameters 178 | ---------- 179 | log_line : str 180 | A log line to be parsed. 181 | 182 | Returns 183 | ------- 184 | parsed : dict[str, str] 185 | A parsed messages log containing these elements: timestamp, hostname, service, unix_time, 186 | subservice and message. 
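
        Examples
        --------
        Illustrative sketch only; the messages-style line below is hypothetical,
        and the field values shown simply follow the branching logic below.

        >>> from nerlogparser.grammar.debuglog import DebugLog2
        >>> log_line = 'Nov  5 12:00:01 ubuntu kernel: [  123.456789] eth0: link up'  # hypothetical line
        >>> parsed = DebugLog2('dfrws-2009-nssal').parse_log(log_line)
        >>> parsed['subservice'], parsed['message']
        ('eth0:', 'link up')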
187 | """ 188 | parsed_messageslog = self.messageslog_grammar.parseString(log_line) 189 | 190 | # get parsed kernel log 191 | parsed = OrderedDict() 192 | parsed['timestamp'] = parsed_messageslog[0] + ' ' + parsed_messageslog[1] + ' ' + parsed_messageslog[2] 193 | parsed['hostname'] = parsed_messageslog[3] 194 | parsed['service'] = parsed_messageslog[4] 195 | 196 | if len(parsed_messageslog) == 6: 197 | parsed['unix_time'] = '' 198 | parsed['subservice'] = '' 199 | if not parsed['service'].endswith(':'): 200 | parsed['message'] = parsed['service'] + ' ' + parsed_messageslog[5] 201 | parsed['service'] = '' 202 | 203 | else: 204 | parsed['message'] = parsed_messageslog[5] 205 | 206 | elif len(parsed_messageslog) == 7: 207 | parsed['unix_time'] = '' 208 | parsed['subservice'] = '' 209 | if not parsed_messageslog[5].endswith(':'): 210 | parsed['message'] = parsed_messageslog[5] + ' ' + parsed_messageslog[6] 211 | 212 | if not parsed['service'].endswith(':'): 213 | parsed['message'] = parsed['service'] + ' ' + parsed_messageslog[5] + ' ' + parsed_messageslog[6] 214 | parsed['service'] = '' 215 | 216 | else: 217 | if parsed_messageslog[5].startswith('[') and parsed_messageslog[6].endswith(']'): 218 | parsed['unix_time'] = GrammarUtility.get_unix_timestamp(parsed_messageslog[5], parsed_messageslog[6]) 219 | 220 | if len(parsed_messageslog) > 8: 221 | # subservice one word 222 | if parsed_messageslog[7].endswith(':') and not parsed_messageslog[8].endswith(':'): 223 | parsed['subservice'] = parsed_messageslog[7] 224 | parsed['message'] = ' '.join(parsed_messageslog[8:]) 225 | 226 | # subservice two words 227 | elif not parsed_messageslog[7].endswith(':') and parsed_messageslog[8].endswith(':'): 228 | parsed['subservice'] = parsed_messageslog[7] + ' ' + parsed_messageslog[8] 229 | parsed['message'] = ' '.join(parsed_messageslog[9:]) 230 | 231 | # subservice two words 232 | elif parsed_messageslog[7].endswith(':') and parsed_messageslog[8].endswith(':'): 233 | parsed['subservice'] = parsed_messageslog[7] + ' ' + parsed_messageslog[8] 234 | parsed['message'] = ' '.join(parsed_messageslog[9:]) 235 | 236 | else: 237 | parsed['subservice'] = '' 238 | parsed['message'] = ' '.join(parsed_messageslog[7:]) 239 | 240 | else: 241 | parsed['subservice'] = '' 242 | parsed['message'] = ' '.join(parsed_messageslog[7:]) 243 | 244 | else: 245 | parsed['unix_time'] = '' 246 | 247 | # subservice one word 248 | if parsed_messageslog[5].endswith(':') and not parsed_messageslog[6].endswith(':'): 249 | parsed['subservice'] = parsed_messageslog[5] 250 | parsed['message'] = ' '.join(parsed_messageslog[6:]) 251 | 252 | # subservice two words 253 | elif not parsed_messageslog[5].endswith(':') and parsed_messageslog[6].endswith(':'): 254 | parsed['subservice'] = parsed_messageslog[5] + ' ' + parsed_messageslog[6] 255 | parsed['message'] = ' '.join(parsed_messageslog[7:]) 256 | 257 | # subservice two words 258 | elif parsed_messageslog[5].endswith(':') and parsed_messageslog[6].endswith(':'): 259 | parsed['subservice'] = parsed_messageslog[5] + ' ' + parsed_messageslog[6] 260 | parsed['message'] = ' '.join(parsed_messageslog[7:]) 261 | 262 | else: 263 | parsed['subservice'] = '' 264 | parsed['message'] = ' '.join(parsed_messageslog[5:]) 265 | 266 | if not parsed['subservice'].endswith(':'): 267 | parsed['message'] = parsed['subservice'] + ' ' + parsed['message'] 268 | parsed['subservice'] = '' 269 | 270 | return parsed 271 | 272 | 273 | class DebugLog3(object): 274 | def __init__(self, dataset): 275 | """Constructor for class 
MessagesLog. This parser also supports syslog. 276 | 277 | Parameters 278 | ---------- 279 | dataset : str 280 | Dataset name. 281 | """ 282 | self.dataset = dataset 283 | self.messageslog_grammar = self.__get_messageslog_grammar() 284 | 285 | @staticmethod 286 | def __get_messageslog_grammar(): 287 | """The definition of messages log grammar. Supported dataset: 288 | honeynet-challenge5 289 | 290 | Returns 291 | ------- 292 | messageslog_grammar : 293 | Grammar for messages log 294 | """ 295 | ints = Word(nums) 296 | 297 | # timestamp 298 | month = Word(string.ascii_uppercase, string.ascii_lowercase, exact=3) 299 | day = ints 300 | hour = Combine(ints + ':' + ints + ':' + ints) 301 | timestamp = month + day + hour 302 | 303 | # hostname, service name, message 304 | hostname = Word(alphas + nums + '_' + '-' + '.') 305 | service = Word(alphas + nums + '/' + '-' + '_' + '.' + '[' + ']' + ':') + Optional(':') 306 | 307 | # unix time 308 | unix_time = Optional('[' + Word(nums + '.' + ']')) 309 | subservice = Optional(Word(alphas + nums + '_' + '-' + ':')) 310 | subservice_two_words = Optional(Word(alphas + nums + '_' + '-' + ':' + '[' + ']' + ',' + '.' + '=' + '/')) 311 | message = Regex('.*') 312 | 313 | # messages log grammar 314 | messageslog_grammar = timestamp + hostname + service + unix_time + subservice + subservice_two_words + message 315 | return messageslog_grammar 316 | 317 | def parse_log(self, log_line): 318 | """Parse messages log based on defined grammar. 319 | 320 | Parameters 321 | ---------- 322 | log_line : str 323 | A log line to be parsed. 324 | 325 | Returns 326 | ------- 327 | parsed : dict[str, str] 328 | A parsed messages log containing these elements: timestamp, hostname, service, unix_time, 329 | subservice and message. 330 | """ 331 | parsed_messageslog = self.messageslog_grammar.parseString(log_line) 332 | 333 | # get parsed kernel log 334 | parsed = OrderedDict() 335 | parsed['timestamp'] = parsed_messageslog[0] + ' ' + parsed_messageslog[1] + ' ' + parsed_messageslog[2] 336 | parsed['hostname'] = parsed_messageslog[3] 337 | parsed['service'] = parsed_messageslog[4] 338 | 339 | if len(parsed_messageslog) == 6: 340 | parsed['unix_time'] = '' 341 | parsed['subservice'] = '' 342 | if not parsed['service'].endswith(':'): 343 | parsed['message'] = parsed['service'] + ' ' + parsed_messageslog[5] 344 | parsed['service'] = '' 345 | 346 | elif len(parsed_messageslog) == 7: 347 | parsed['unix_time'] = '' 348 | parsed['subservice'] = '' 349 | if not parsed_messageslog[5].endswith(':'): 350 | parsed['message'] = parsed_messageslog[5] + ' ' + parsed_messageslog[6] 351 | 352 | if not parsed['service'].endswith(':'): 353 | parsed['message'] = parsed['service'] + ' ' + parsed_messageslog[5] + ' ' + parsed_messageslog[6] 354 | parsed['service'] = '' 355 | 356 | elif len(parsed_messageslog) == 8: 357 | if parsed_messageslog[5].startswith('[') and parsed_messageslog[6].endswith(']'): 358 | parsed['unix_time'] = GrammarUtility.get_unix_timestamp(parsed_messageslog[5], parsed_messageslog[6]) 359 | parsed['subservice'] = '' 360 | parsed['message'] = parsed_messageslog[7] 361 | 362 | else: 363 | parsed['unix_time'] = '' 364 | parsed['subservice'] = parsed_messageslog[5] 365 | parsed['message'] = ' '.join(parsed_messageslog[6:]) 366 | 367 | else: 368 | if parsed_messageslog[5] == ':': 369 | parsed['service'] = parsed['service'] + ' ' + parsed_messageslog[5] 370 | parsed['unix_time'] = GrammarUtility.get_unix_timestamp(parsed_messageslog[6], parsed_messageslog[7]) 371 | 372 | # subservice 
one word 373 | if parsed_messageslog[8].endswith(':') and not parsed_messageslog[9].endswith(':'): 374 | parsed['subservice'] = parsed_messageslog[8] 375 | parsed['message'] = ' '.join(parsed_messageslog[9:]) 376 | 377 | # subservice two words 378 | elif not parsed_messageslog[8].endswith(':') and parsed_messageslog[9].endswith(':'): 379 | parsed['subservice'] = parsed_messageslog[8] + ' ' + parsed_messageslog[9] 380 | parsed['message'] = ' '.join(parsed_messageslog[10:]) 381 | 382 | # subservice two words 383 | elif parsed_messageslog[8].endswith(':') and parsed_messageslog[9].endswith(':'): 384 | parsed['subservice'] = parsed_messageslog[8] + ' ' + parsed_messageslog[9] 385 | parsed['message'] = ' '.join(parsed_messageslog[10:]) 386 | 387 | # no subservice, only message 388 | else: 389 | parsed['subservice'] = '' 390 | parsed['message'] = ' '.join(parsed_messageslog[8:]) 391 | 392 | else: 393 | parsed['unix_time'] = GrammarUtility.get_unix_timestamp(parsed_messageslog[5], parsed_messageslog[6]) 394 | 395 | # subservice one word 396 | if parsed_messageslog[7].endswith(':') and not parsed_messageslog[8].endswith(':'): 397 | parsed['subservice'] = parsed_messageslog[7] 398 | parsed['message'] = ' '.join(parsed_messageslog[8:]) 399 | 400 | # subservice two words 401 | elif not parsed_messageslog[7].endswith(':') and parsed_messageslog[8].endswith(':'): 402 | parsed['subservice'] = parsed_messageslog[7] + ' ' + parsed_messageslog[8] 403 | parsed['message'] = ' '.join(parsed_messageslog[9:]) 404 | 405 | # subservice two words 406 | elif parsed_messageslog[7].endswith(':') and parsed_messageslog[8].endswith(':'): 407 | parsed['subservice'] = parsed_messageslog[7] + ' ' + parsed_messageslog[8] 408 | parsed['message'] = ' '.join(parsed_messageslog[9:]) 409 | 410 | # no subservice, only message 411 | else: 412 | parsed['subservice'] = '' 413 | parsed['message'] = ' '.join(parsed_messageslog[7:]) 414 | 415 | if not parsed['subservice'].endswith(':'): 416 | parsed['message'] = parsed['subservice'] + ' ' + parsed['message'] 417 | parsed['subservice'] = '' 418 | 419 | return parsed 420 | 421 | 422 | class Main(object): 423 | def __init__(self, datasets): 424 | self.datasets = datasets 425 | self.dataset_path = '/home/hudan/Git/prlogparser/datasets/' 426 | self.groups = { 427 | 'group1': ['casper-rw'], 428 | 'group2': ['dfrws-2009-jhuisi', 'dfrws-2009-nssal', 'honeynet-challenge7'], 429 | 'group3': ['honeynet-challenge5'] 430 | } 431 | 432 | def run(self): 433 | # parse messages.log 434 | for group_name, group in self.groups.items(): 435 | # setup test csv file to save results 436 | base_name = '/home/hudan/Git/prlogparser/groundtruth/debug-' 437 | test_file = base_name + group_name + '.csv' 438 | f = open(test_file, 'w', newline='') 439 | writer = csv.writer(f) 440 | 441 | for dataset in group: 442 | # get grammar 443 | dl = None 444 | if group_name == 'group1': 445 | dl = DebugLog1(dataset) 446 | 447 | elif group_name == 'group2': 448 | dl = DebugLog2(dataset) 449 | 450 | elif group_name == 'group3': 451 | dl = DebugLog3(dataset) 452 | 453 | # start parsing 454 | for filename in self.datasets[dataset]: 455 | filename = os.path.join(self.dataset_path, filename) 456 | with open(filename, 'r') as f: 457 | for line in f: 458 | # get parsed line and print 459 | parsed_line = dl.parse_log(line) 460 | print(parsed_line) 461 | 462 | # write to csv 463 | row = list(parsed_line.values()) 464 | writer.writerow(row) 465 | 466 | f.close() 467 | 468 | if __name__ == '__main__': 469 | datasets_files = { 470 | 
'casper-rw': ['casper-rw/debug'], 471 | 'dfrws-2009-jhuisi': [ 472 | 'dfrws-2009-jhuisi/debug', 473 | 'dfrws-2009-jhuisi/debug.0', 474 | 'dfrws-2009-jhuisi/debug.1' 475 | ], 476 | 'dfrws-2009-nssal': [ 477 | 'dfrws-2009-nssal/debug', 478 | 'dfrws-2009-nssal/debug.0', 479 | 'dfrws-2009-nssal/debug.1', 480 | 'dfrws-2009-nssal/debug.2', 481 | 'dfrws-2009-nssal/debug.3' 482 | ], 483 | 'honeynet-challenge5': ['honeynet-challenge5/debug'], 484 | 'honeynet-challenge7': ['honeynet-challenge7/debug'] 485 | } 486 | 487 | main = Main(datasets_files) 488 | main.run() 489 | -------------------------------------------------------------------------------- /nerlogparser/grammar/dmesglog.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | from pyparsing import Word, alphas, nums, Optional, Regex 4 | from collections import OrderedDict 5 | from nerlogparser.grammar.grammar_utility import GrammarUtility 6 | 7 | 8 | class DmesgLog(object): 9 | def __init__(self, dataset): 10 | self.dataset = dataset 11 | self.groups = { 12 | 'group1': ['casper-rw', 'dfrws-2009-jhuisi', 'honeynet-challenge5', 'honeynet-challenge7'], 13 | 'group2': ['dfrws-2009-nssal'] 14 | } 15 | 16 | def get_grammar(self): 17 | dl = None 18 | if self.dataset in self.groups['group1']: 19 | dl = DmesgLog1(self.dataset) 20 | 21 | elif self.dataset in self.groups['group2']: 22 | dl = DmesgLog2(self.dataset) 23 | 24 | return dl 25 | 26 | 27 | class DmesgLog1(object): 28 | def __init__(self, dataset): 29 | """Constructor for class DmesgLog. 30 | 31 | Parameters 32 | ---------- 33 | dataset : str 34 | Dataset name. 35 | """ 36 | self.dataset = dataset 37 | self.dmesglog_grammar = self.__get_dmesglog_grammar() 38 | 39 | @staticmethod 40 | def __get_dmesglog_grammar(): 41 | """The definition of dmesg log grammar. Supported dataset: 42 | casper-rw 43 | dfrws-2009-jhuisi 44 | honeynet-challenge5 45 | honeynet-challenge7 46 | 47 | Returns 48 | ------- 49 | dmesglog_grammar : pyparsing.And 50 | Grammar for dmesg log 51 | """ 52 | # unix time 53 | unix_time = '[' + Word(nums + '.' + ']') 54 | subservice = Optional(Word(alphas + nums + '_' + '-' + ':' + '[' + ']' + '.' + '=')) 55 | subservice_two_words = Optional(Word(alphas + nums + '_' + '-' + ':' + '[' + ']' + ',' + '.' + '=')) 56 | message = Regex('.*') 57 | 58 | # dmesg log grammar 59 | dmesglog_grammar = unix_time + subservice + subservice_two_words + message 60 | return dmesglog_grammar 61 | 62 | def parse_log(self, log_line): 63 | """Parse dmesg log based on defined grammar. 64 | 65 | Parameters 66 | ---------- 67 | log_line : str 68 | A log line to be parsed. 69 | 70 | Returns 71 | ------- 72 | parsed : dict[str, str] 73 | A parsed dmesg log containing these elements: unix_time, subservice and message. 
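
        Examples
        --------
        Illustrative sketch only; the dmesg line below is hypothetical, and the
        keys produced are those listed above.

        >>> from nerlogparser.grammar.dmesglog import DmesgLog1
        >>> log_line = '[   12.345678] usb 1-1: new high-speed USB device number 2 using ehci_hcd'  # hypothetical line
        >>> parsed = DmesgLog1('casper-rw').parse_log(log_line)
        >>> parsed['subservice'], parsed['message']
        ('usb 1-1:', 'new high-speed USB device number 2 using ehci_hcd')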
74 | """ 75 | parsed_dmesglog = self.dmesglog_grammar.parseString(log_line) 76 | 77 | # get parsed dmesg log 78 | parsed = OrderedDict() 79 | parsed['unix_time'] = GrammarUtility.get_unix_timestamp(parsed_dmesglog[0], parsed_dmesglog[1]) 80 | 81 | if len(parsed_dmesglog) == 3: 82 | parsed['subservice'] = '' 83 | parsed['message'] = parsed_dmesglog[2] 84 | 85 | else: 86 | # subservice one word 87 | if parsed_dmesglog[2].endswith(':') and not parsed_dmesglog[3].endswith(':'): 88 | parsed['subservice'] = parsed_dmesglog[2] 89 | parsed['message'] = ' '.join(parsed_dmesglog[3:]) 90 | 91 | # subservice two words 92 | elif not parsed_dmesglog[2].endswith(':') and parsed_dmesglog[3].endswith(':'): 93 | parsed['subservice'] = parsed_dmesglog[2] + ' ' + parsed_dmesglog[3] 94 | parsed['message'] = ' '.join(parsed_dmesglog[4:]) 95 | 96 | # subservice two words 97 | elif parsed_dmesglog[2].endswith(':') and parsed_dmesglog[3].endswith(':'): 98 | parsed['subservice'] = parsed_dmesglog[2] + ' ' + parsed_dmesglog[3] 99 | parsed['message'] = ' '.join(parsed_dmesglog[4:]) 100 | 101 | else: 102 | parsed['subservice'] = '' 103 | parsed['message'] = ' '.join(parsed_dmesglog[2:]) 104 | 105 | if not parsed['subservice'].endswith(':') and parsed['subservice']: 106 | parsed['message'] = parsed['subservice'] + ' ' + parsed['message'] 107 | parsed['subservice'] = '' 108 | 109 | return parsed 110 | 111 | 112 | class DmesgLog2(object): 113 | # this class is written for dfrws-2009-jhuisi/nssal/dmesg* 114 | def __init__(self, dataset): 115 | self.dataset = dataset 116 | self.dmesglog_grammar = self.__get_dmesglog_grammar() 117 | 118 | @staticmethod 119 | def __get_dmesglog_grammar(): 120 | subservice = Optional(Word(alphas + nums + '_' + '-' + ':' + '[' + ']' + '.' + '=')) 121 | subservice_two_words = Optional(Word(alphas + nums + '_' + '-' + ':' + '[' + ']' + ',' + '.' 
+ '=')) 122 | message = Regex('.*') 123 | 124 | # dmesg log grammar 125 | dmesglog_grammar = subservice + subservice_two_words + message 126 | return dmesglog_grammar 127 | 128 | def parse_log(self, log_line): 129 | # get grammar 130 | parsed_dmesglog = self.dmesglog_grammar.parseString(log_line) 131 | 132 | # get parsed dmesg log 133 | parsed = OrderedDict() 134 | if len(parsed_dmesglog) == 1: 135 | if parsed_dmesglog[0].endswith(':'): 136 | parsed['subservice'] = parsed_dmesglog[0] 137 | parsed['message'] = '' 138 | else: 139 | parsed['subservice'] = '' 140 | parsed['message'] = parsed_dmesglog[0] 141 | 142 | elif len(parsed_dmesglog) == 2: 143 | if parsed_dmesglog[0].endswith(':'): 144 | parsed['subservice'] = parsed_dmesglog[0] 145 | parsed['message'] = parsed_dmesglog[1] 146 | else: 147 | parsed['subservice'] = '' 148 | parsed['message'] = ' '.join(parsed_dmesglog[0:]) 149 | 150 | else: 151 | # subservice one word 152 | if parsed_dmesglog[0].endswith(':') and not parsed_dmesglog[1].endswith(':'): 153 | parsed['subservice'] = parsed_dmesglog[0] 154 | parsed['message'] = ' '.join(parsed_dmesglog[1:]) 155 | 156 | # subservice two words 157 | elif not parsed_dmesglog[0].endswith(':') and parsed_dmesglog[1].endswith(':'): 158 | parsed['subservice'] = parsed_dmesglog[0] + ' ' + parsed_dmesglog[1] 159 | parsed['message'] = ' '.join(parsed_dmesglog[2:]) 160 | 161 | # subservice two words 162 | elif parsed_dmesglog[0].endswith(':') and parsed_dmesglog[1].endswith(':'): 163 | parsed['subservice'] = parsed_dmesglog[0] + ' ' + parsed_dmesglog[1] 164 | parsed['message'] = ' '.join(parsed_dmesglog[2:]) 165 | 166 | else: 167 | parsed['subservice'] = '' 168 | parsed['message'] = ' '.join(parsed_dmesglog[:]) 169 | 170 | if not parsed['subservice'].endswith(':') and parsed['subservice']: 171 | parsed['message'] = parsed['subservice'] + ' ' + parsed['message'] 172 | parsed['subservice'] = '' 173 | 174 | return parsed 175 | 176 | 177 | class Main(object): 178 | def __init__(self, datasets): 179 | self.datasets = datasets 180 | self.dataset_path = '/home/hudan/Git/prlogparser/datasets/' 181 | self.groups = { 182 | 'group1': ['casper-rw', 'dfrws-2009-jhuisi', 'honeynet-challenge5', 'honeynet-challenge7'], 183 | 'group2': ['dfrws-2009-nssal'] 184 | } 185 | 186 | def run(self): 187 | # parse dmesg log 188 | for group_name, group in self.groups.items(): 189 | # setup test csv file to save results 190 | base_name = '/home/hudan/Git/prlogparser/groundtruth/dmesg-' 191 | test_file = base_name + group_name + '.csv' 192 | f = open(test_file, 'w', newline='') 193 | writer = csv.writer(f) 194 | 195 | for dataset in group: 196 | # get grammar 197 | dl = None 198 | if group_name == 'group1': 199 | dl = DmesgLog1(dataset) 200 | 201 | elif group_name == 'group2': 202 | dl = DmesgLog2(dataset) 203 | 204 | # start parsing 205 | for filename in self.datasets[dataset]: 206 | filename = os.path.join(self.dataset_path, filename) 207 | with open(filename, 'r') as f: 208 | for line in f: 209 | # get parsed line and print 210 | parsed_line = dl.parse_log(line) 211 | print(parsed_line) 212 | 213 | # write to csv 214 | row = list(parsed_line.values()) 215 | writer.writerow(row) 216 | 217 | f.close() 218 | 219 | 220 | if __name__ == '__main__': 221 | # get dmesg log datasets 222 | datasets_files = { 223 | 'casper-rw': [ 224 | 'casper-rw/dmesg', 225 | 'casper-rw/dmesg.0', 226 | 'casper-rw/dmesg.1', 227 | 'casper-rw/dmesg.2', 228 | 'casper-rw/dmesg.3' 229 | ], 230 | 'dfrws-2009-jhuisi': [ 231 | 'dfrws-2009-jhuisi/dmesg', 232 | 
'dfrws-2009-jhuisi/dmesg.0', 233 | 'dfrws-2009-jhuisi/dmesg.1', 234 | 'dfrws-2009-jhuisi/dmesg.2', 235 | 'dfrws-2009-jhuisi/dmesg.3', 236 | 'dfrws-2009-jhuisi/dmesg.4' 237 | ], 238 | 'dfrws-2009-nssal': [ 239 | 'dfrws-2009-nssal/dmesg', 240 | 'dfrws-2009-nssal/dmesg.0', 241 | 'dfrws-2009-nssal/dmesg.1', 242 | 'dfrws-2009-nssal/dmesg.2', 243 | 'dfrws-2009-nssal/dmesg.3', 244 | 'dfrws-2009-nssal/dmesg.4' 245 | ], 246 | 'honeynet-challenge5': [ 247 | 'honeynet-challenge5/dmesg', 248 | 'honeynet-challenge5/dmesg.0' 249 | ], 250 | 'honeynet-challenge7': [ 251 | 'honeynet-challenge7/dmesg', 252 | 'honeynet-challenge7/dmesg.0', 253 | 'honeynet-challenge7/dmesg.1', 254 | 'honeynet-challenge7/dmesg.2', 255 | 'honeynet-challenge7/dmesg.3', 256 | 'honeynet-challenge7/dmesg.4' 257 | ] 258 | } 259 | 260 | main = Main(datasets_files) 261 | main.run() 262 | -------------------------------------------------------------------------------- /nerlogparser/grammar/grammar_utility.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class GrammarUtility(object): 4 | @staticmethod 5 | def get_unix_timestamp(square_bracket, timestamp): 6 | main_digit = timestamp.split('.')[0] 7 | space = '' 8 | 9 | if len(main_digit) == 1: 10 | space = ' ' 11 | elif len(main_digit) == 2: 12 | space = ' ' 13 | elif len(main_digit) == 3: 14 | space = ' ' 15 | elif len(main_digit) == 4: 16 | space = ' ' 17 | elif len(main_digit) == 5: 18 | space = '' 19 | 20 | new_timestamp = square_bracket + space + timestamp 21 | return new_timestamp 22 | -------------------------------------------------------------------------------- /nerlogparser/grammar/kernellog.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | from pyparsing import Word, alphas, Combine, nums, string, Optional, Regex 4 | from collections import OrderedDict 5 | from nerlogparser.grammar.grammar_utility import GrammarUtility 6 | 7 | 8 | class KernelLog(object): 9 | def __init__(self, dataset): 10 | self.dataset = dataset 11 | self.groups = { 12 | 'group1': ['casper-rw', 'dfrws-2009-jhuisi', 'dfrws-2009-nssal', 'honeynet-challenge7'], 13 | 'group2': ['honeynet-challenge5'] 14 | } 15 | 16 | def get_grammar(self): 17 | dl = None 18 | if self.dataset in self.groups['group1']: 19 | dl = KernelLog1(self.dataset) 20 | 21 | elif self.dataset in self.groups['group2']: 22 | dl = KernelLog2(self.dataset) 23 | 24 | return dl 25 | 26 | 27 | class KernelLog1(object): 28 | def __init__(self, dataset): 29 | """Constructor for class KernelLog. 30 | 31 | Parameters 32 | ---------- 33 | dataset : str 34 | Dataset name. 35 | """ 36 | self.dataset = dataset 37 | 38 | @staticmethod 39 | def __get_kernellog_grammar(): 40 | """The definition of kernel log grammar. Supported dataset: 41 | casper-rw 42 | dfrws-2009 43 | honeynet-challenge7 44 | 45 | Returns 46 | ------- 47 | kernellog_grammar : 48 | Grammar for kernel log 49 | """ 50 | ints = Word(nums) 51 | 52 | # timestamp 53 | month = Word(string.ascii_uppercase, string.ascii_lowercase, exact=3) 54 | day = ints 55 | hour = Combine(ints + ':' + ints + ':' + ints) 56 | timestamp = month + day + hour 57 | 58 | # hostname, service name, message 59 | hostname = Word(alphas + nums + '_' + '-' + '.') 60 | service = Word(alphas + nums + '/' + '-' + '_' + '.' + '[' + ']' + ':') 61 | 62 | # unix time 63 | unix_time = Optional('[' + Word(nums + '.' + ']')) 64 | subservice = Optional(Word(alphas + nums + '_' + '-' + ':' + '[' + ']' + '.' 
+ '=' + '(' + ')' + '*' + 65 | '<' + '>')) 66 | subservice_two_words = Optional(Word(alphas + nums + '_' + '-' + ':' + '[' + ']' + ',' + '.' + '=' + '/' + 67 | '(' + ')' + '*')) 68 | message = Optional(Regex('.*')) 69 | 70 | # kernel log grammar 71 | debuglog_grammar = timestamp + hostname + service + unix_time + subservice + subservice_two_words + message 72 | return debuglog_grammar 73 | 74 | def parse_log(self, log_line): 75 | """Parse kernel log based on defined grammar. 76 | 77 | Parameters 78 | ---------- 79 | log_line : str 80 | A log line to be parsed. 81 | 82 | Returns 83 | ------- 84 | parsed : dict[str, str] 85 | A parsed kernel log containing these elements: timestamp, hostname, service, unix_time, 86 | subservice and message. 87 | """ 88 | kernellog_grammar = self.__get_kernellog_grammar() 89 | parsed_kernellog = kernellog_grammar.parseString(log_line) 90 | 91 | # get parsed kernel log 92 | parsed = OrderedDict() 93 | parsed['timestamp'] = parsed_kernellog[0] + ' ' + parsed_kernellog[1] + ' ' + parsed_kernellog[2] 94 | parsed['hostname'] = parsed_kernellog[3] 95 | parsed['service'] = parsed_kernellog[4] 96 | 97 | if len(parsed_kernellog) == 6: 98 | parsed['unix_time'] = '' 99 | parsed['subservice'] = '' 100 | parsed['message'] = parsed_kernellog[5] 101 | 102 | elif len(parsed_kernellog) == 7: 103 | parsed['unix_time'] = '' 104 | parsed['subservice'] = '' 105 | if not parsed_kernellog[5].endswith(':'): 106 | parsed['message'] = parsed_kernellog[5] + ' ' + parsed_kernellog[6] 107 | 108 | elif len(parsed_kernellog) == 8: 109 | parsed['unix_time'] = '' 110 | parsed['subservice'] = '' 111 | 112 | # no message 113 | if parsed_kernellog[7] == '' and parsed_kernellog[5].startswith('[') and parsed_kernellog[6].endswith(']'): 114 | parsed['unix_time'] = GrammarUtility.get_unix_timestamp(parsed_kernellog[5], parsed_kernellog[6]) 115 | parsed['message'] = '' 116 | 117 | # message exists 118 | elif parsed_kernellog[7] != '' and parsed_kernellog[5].startswith('[') and \ 119 | parsed_kernellog[6].endswith(']'): 120 | parsed['unix_time'] = GrammarUtility.get_unix_timestamp(parsed_kernellog[5], parsed_kernellog[6]) 121 | parsed['message'] = parsed_kernellog[7] 122 | 123 | # subservice one word 124 | elif parsed_kernellog[5].endswith(':') and not parsed_kernellog[6].endswith(':'): 125 | parsed['subservice'] = parsed_kernellog[5] 126 | parsed['message'] = ' '.join(parsed_kernellog[6:]) 127 | 128 | # subservice two words 129 | elif not parsed_kernellog[5].endswith(':') and parsed_kernellog[6].endswith(':'): 130 | parsed['subservice'] = parsed_kernellog[5] + ' ' + parsed_kernellog[6] 131 | parsed['message'] = parsed_kernellog[7] 132 | 133 | # subservice two words 134 | elif parsed_kernellog[5].endswith(':') and parsed_kernellog[6].endswith(':'): 135 | parsed['subservice'] = parsed_kernellog[5] + ' ' + parsed_kernellog[6] 136 | parsed['message'] = parsed_kernellog[7] 137 | 138 | # no timestamp, no subservice, just message 139 | else: 140 | parsed['message'] = ' '.join(parsed_kernellog[5:]) 141 | 142 | else: 143 | # if timestamp exists 144 | parsed['unix_time'] = GrammarUtility.get_unix_timestamp(parsed_kernellog[5], parsed_kernellog[6]) 145 | 146 | # subservice one word 147 | if parsed_kernellog[7].endswith(':') and not parsed_kernellog[8].endswith(':'): 148 | parsed['subservice'] = parsed_kernellog[7] 149 | parsed['message'] = ' '.join(parsed_kernellog[8:]) 150 | 151 | # subservice two words 152 | elif not parsed_kernellog[7].endswith(':') and parsed_kernellog[8].endswith(':'): 153 | 
parsed['subservice'] = parsed_kernellog[7] + ' ' + parsed_kernellog[8] 154 | parsed['message'] = ' '.join(parsed_kernellog[9:]) 155 | 156 | # subservice two words 157 | elif parsed_kernellog[7].endswith(':') and parsed_kernellog[8].endswith(':'): 158 | parsed['subservice'] = parsed_kernellog[7] + ' ' + parsed_kernellog[8] 159 | parsed['message'] = ' '.join(parsed_kernellog[9:]) 160 | 161 | else: 162 | parsed['subservice'] = '' 163 | parsed['message'] = ' '.join(parsed_kernellog[7:]) 164 | 165 | if not parsed['subservice'].endswith(':'): 166 | parsed['message'] = parsed['subservice'] + ' ' + parsed['message'] 167 | parsed['subservice'] = '' 168 | 169 | return parsed 170 | 171 | 172 | class KernelLog2(object): 173 | def __init__(self, dataset): 174 | """Constructor for class KernelLog. 175 | 176 | Parameters 177 | ---------- 178 | dataset : str 179 | Dataset name. 180 | """ 181 | self.dataset = dataset 182 | 183 | @staticmethod 184 | def __get_kernellog_grammar(): 185 | """The definition of kernel log grammar. Supported dataset: 186 | honeynet-challenge5 187 | 188 | Returns 189 | ------- 190 | kernellog_grammar : 191 | Grammar for kernel log 192 | """ 193 | ints = Word(nums) 194 | 195 | # timestamp 196 | month = Word(string.ascii_uppercase, string.ascii_lowercase, exact=3) 197 | day = ints 198 | hour = Combine(ints + ':' + ints + ':' + ints) 199 | timestamp = month + day + hour 200 | 201 | # hostname, service name, message 202 | # there is Optional(':') 203 | hostname = Word(alphas + nums + '_' + '-' + '.') 204 | service = Word(alphas + nums + '/' + '-' + '_' + '.' + '[' + ']' + ':') + Optional(':') 205 | 206 | # unix time 207 | unix_time = Optional('[' + Word(nums + '.' + ']')) 208 | subservice = Optional(Word(alphas + nums + '_' + '-' + ':' + '[' + ']' + '.' + '=' + '(' + ')' + '*' + 209 | '<' + '>')) 210 | subservice_two_words = Optional(Word(alphas + nums + '_' + '-' + ':' + '[' + ']' + ',' + '.' + '=' + '/' + 211 | '(' + ')' + '*' + "'")) 212 | message = Optional(Regex('.*')) 213 | 214 | # kernel log grammar 215 | debuglog_grammar = timestamp + hostname + service + unix_time + subservice + subservice_two_words + message 216 | return debuglog_grammar 217 | 218 | def parse_log(self, log_line): 219 | """Parse kernel log based on defined grammar. 220 | 221 | Parameters 222 | ---------- 223 | log_line : str 224 | A log line to be parsed. 225 | 226 | Returns 227 | ------- 228 | parsed : dict[str, str] 229 | A parsed kernel log containing these elements: timestamp, hostname, service, unix_time, 230 | subservice and message. 
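        Examples
        --------
        Hypothetical honeynet-challenge5 style line (illustrative only; the
        exact split depends on which length branch the parse falls into):

        >>> parser = KernelLog2('honeynet-challenge5')
        >>> parsed = parser.parse_log('Apr 22 07:00:01 ubuntu kernel: [ 1234.567890] eth0: link up')
        >>> parsed['hostname'], parsed['service']
        ('ubuntu', 'kernel:')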
231 | """ 232 | kernellog_grammar = self.__get_kernellog_grammar() 233 | parsed_kernellog = kernellog_grammar.parseString(log_line) 234 | 235 | # get parsed kernel log 236 | parsed = OrderedDict() 237 | parsed['timestamp'] = parsed_kernellog[0] + ' ' + parsed_kernellog[1] + ' ' + parsed_kernellog[2] 238 | parsed['hostname'] = parsed_kernellog[3] 239 | parsed['service'] = parsed_kernellog[4] 240 | 241 | if len(parsed_kernellog) == 6: 242 | parsed['unix_time'] = '' 243 | parsed['subservice'] = '' 244 | parsed['message'] = parsed_kernellog[5] 245 | 246 | elif len(parsed_kernellog) == 7: 247 | parsed['unix_time'] = '' 248 | parsed['subservice'] = '' 249 | if not parsed_kernellog[5].endswith(':'): 250 | parsed['message'] = parsed_kernellog[5] + ' ' + parsed_kernellog[6] 251 | 252 | elif len(parsed_kernellog) == 8: 253 | parsed['unix_time'] = '' 254 | parsed['subservice'] = '' 255 | 256 | # timestamp exists 257 | if parsed_kernellog[5].startswith('[') and parsed_kernellog[5].endswith(']'): 258 | parsed['unix_time'] = GrammarUtility.get_unix_timestamp(parsed_kernellog[5], parsed_kernellog[6]) 259 | parsed['message'] = parsed_kernellog[7] 260 | 261 | # no timestamp, no subservice, just message 262 | else: 263 | parsed['message'] = ' '.join(parsed_kernellog[5:]) 264 | 265 | else: 266 | if parsed_kernellog[5] == ':': 267 | parsed['service'] = parsed['service'] + ' ' + parsed_kernellog[5] 268 | parsed['unix_time'] = GrammarUtility.get_unix_timestamp(parsed_kernellog[6], parsed_kernellog[7]) 269 | 270 | # subservice one word 271 | if parsed_kernellog[8].endswith(':') and not parsed_kernellog[9].endswith(':'): 272 | parsed['subservice'] = parsed_kernellog[8] 273 | parsed['message'] = ' '.join(parsed_kernellog[9:]) 274 | 275 | # subservice two words 276 | elif not parsed_kernellog[8].endswith(':') and parsed_kernellog[9].endswith(':'): 277 | parsed['subservice'] = parsed_kernellog[8] + ' ' + parsed_kernellog[9] 278 | parsed['message'] = ' '.join(parsed_kernellog[10:]) 279 | 280 | # subservice two words 281 | elif parsed_kernellog[8].endswith(':') and parsed_kernellog[9].endswith(':'): 282 | parsed['subservice'] = parsed_kernellog[8] + ' ' + parsed_kernellog[9] 283 | parsed['message'] = ' '.join(parsed_kernellog[10:]) 284 | 285 | # no subservice, only message 286 | else: 287 | parsed['subservice'] = '' 288 | parsed['message'] = ' '.join(parsed_kernellog[8:]) 289 | 290 | else: 291 | parsed['unix_time'] = GrammarUtility.get_unix_timestamp(parsed_kernellog[5], parsed_kernellog[6]) 292 | 293 | # subservice one word 294 | if parsed_kernellog[7].endswith(':') and not parsed_kernellog[8].endswith(':'): 295 | parsed['subservice'] = parsed_kernellog[7] 296 | parsed['message'] = ' '.join(parsed_kernellog[8:]) 297 | 298 | # subservice two words 299 | elif not parsed_kernellog[7].endswith(':') and parsed_kernellog[8].endswith(':'): 300 | parsed['subservice'] = parsed_kernellog[7] + ' ' + parsed_kernellog[8] 301 | parsed['message'] = ' '.join(parsed_kernellog[9:]) 302 | 303 | # subservice two words 304 | elif parsed_kernellog[7].endswith(':') and parsed_kernellog[8].endswith(':'): 305 | parsed['subservice'] = parsed_kernellog[7] + ' ' + parsed_kernellog[8] 306 | parsed['message'] = ' '.join(parsed_kernellog[9:]) 307 | 308 | # no subservice, only message 309 | else: 310 | parsed['subservice'] = '' 311 | parsed['message'] = ' '.join(parsed_kernellog[7:]) 312 | 313 | if not parsed['subservice'].endswith(':'): 314 | parsed['message'] = parsed['subservice'] + ' ' + parsed['message'] 315 | parsed['subservice'] = '' 316 | 
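        # note: the guard above folds any captured subservice that does not end
        # with ':' back into the message, so callers only ever see a
        # colon-terminated subservice or an empty string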
317 | return parsed 318 | 319 | 320 | class Main(object): 321 | def __init__(self, datasets): 322 | self.datasets = datasets 323 | self.dataset_path = '/home/hudan/Git/prlogparser/datasets/' 324 | self.groups = { 325 | 'group1': ['casper-rw', 'dfrws-2009-jhuisi', 'dfrws-2009-nssal', 'honeynet-challenge7'], 326 | 'group2': ['honeynet-challenge5'] 327 | } 328 | 329 | def run(self): 330 | # parse kernel.log 331 | for group_name, group in self.groups.items(): 332 | # setup test csv file to save results 333 | base_name = '/home/hudan/Git/prlogparser/groundtruth/kernel-' 334 | test_file = base_name + group_name + '.csv' 335 | f = open(test_file, 'w', newline='') 336 | writer = csv.writer(f) 337 | 338 | for dataset in group: 339 | # get grammar 340 | dl = None 341 | if group_name == 'group1': 342 | dl = KernelLog1(dataset) 343 | 344 | elif group_name == 'group2': 345 | dl = KernelLog2(dataset) 346 | 347 | # start parsing 348 | for filename in self.datasets[dataset]: 349 | filename = os.path.join(self.dataset_path, filename) 350 | with open(filename, 'r') as f: 351 | for line in f: 352 | # get parsed line and print 353 | parsed_line = dl.parse_log(line) 354 | print(parsed_line) 355 | 356 | # write to csv 357 | row = list(parsed_line.values()) 358 | writer.writerow(row) 359 | 360 | f.close() 361 | 362 | 363 | if __name__ == '__main__': 364 | datasets_files = { 365 | 'casper-rw': ['casper-rw/kern.log'], 366 | 'dfrws-2009-jhuisi': [ 367 | 'dfrws-2009-jhuisi/kern.log', 368 | 'dfrws-2009-jhuisi/kern.log.0', 369 | 'dfrws-2009-jhuisi/kern.log.1', 370 | ], 371 | 'dfrws-2009-nssal': [ 372 | 'dfrws-2009-nssal/kern.log', 373 | 'dfrws-2009-nssal/kern.log.0', 374 | 'dfrws-2009-nssal/kern.log.1', 375 | 'dfrws-2009-nssal/kern.log.2', 376 | 'dfrws-2009-nssal/kern.log.3' 377 | ], 378 | 'honeynet-challenge5': ['honeynet-challenge5/kern.log'], 379 | 'honeynet-challenge7': ['honeynet-challenge7/kern.log'] 380 | } 381 | 382 | main = Main(datasets_files) 383 | main.run() 384 | -------------------------------------------------------------------------------- /nerlogparser/grammar/kippolog.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | from pyparsing import Word, Combine, nums, Regex, nestedExpr 4 | from collections import OrderedDict 5 | 6 | 7 | class KippoLog(object): 8 | def __init__(self, dataset): 9 | self.dataset = dataset 10 | self.kippolog_grammar = self.__get_kippolog_grammar() 11 | 12 | @staticmethod 13 | def __get_kippolog_grammar(): 14 | """The definition of Kippo honeypot log grammar. 15 | 16 | Returns 17 | ------- 18 | kippolog_grammar : 19 | Grammar for Kippo log. 
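        Notes
        -----
        Hypothetical shape of a matching line (for illustration only): a
        timestamp such as "2017-02-14 04:11:02+0000", a bracketed service such
        as "[SSHService ssh-userauth on HoneyPotTransport,1,10.0.0.1]", and the
        remaining free text as the message.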
20 | """ 21 | ints = Word(nums) 22 | 23 | date = Combine(ints + '-' + ints + '-' + ints) 24 | time = Combine(ints + ':' + ints + ':' + ints + '+0000') 25 | timestamp = date + time 26 | service = nestedExpr(opener='[', closer=']') 27 | message = Regex(".*") 28 | 29 | # kippo honeypot log grammar 30 | kippolog_grammar = timestamp('timestamp') + service('service') + message('message') 31 | return kippolog_grammar 32 | 33 | def parse_log(self, log_line): 34 | parsed_kippolog = self.kippolog_grammar.parseString(log_line) 35 | 36 | parsed = OrderedDict() 37 | parsed['timestamp'] = ' '.join(parsed_kippolog.timestamp) 38 | if len(parsed_kippolog.service[0]) > 1: 39 | parsed['service'] = '[' + ' '.join(parsed_kippolog.service[0]) + ']' 40 | else: 41 | parsed['service'] = '[' + parsed_kippolog.service[0][0] + ']' 42 | parsed['message'] = parsed_kippolog.message 43 | 44 | return parsed 45 | 46 | 47 | if __name__ == '__main__': 48 | dataset_path = '/home/hudan/Git/prlogparser/datasets/kippo/' 49 | filenames = [ 50 | 'kippo.2017-02-14.log', 51 | 'kippo.2017-02-15.log', 52 | 'kippo.2017-02-16.log', 53 | 'kippo.2017-02-17.log', 54 | 'kippo.2017-02-18.log', 55 | 'kippo.2017-02-19.log', 56 | 'kippo.2017-02-20.log' 57 | ] 58 | 59 | test_file = '/home/hudan/Git/prlogparser/groundtruth/test-results/kippo-test.csv' 60 | f = open(test_file, 'w', newline='') 61 | writer = csv.writer(f) 62 | 63 | kl = KippoLog('') 64 | for filename in filenames: 65 | filename = os.path.join(dataset_path, filename) 66 | with open(filename, 'r') as f: 67 | for line in f: 68 | parsed_line = kl.parse_log(line) 69 | print(parsed_line) 70 | 71 | row = list(parsed_line.values()) 72 | writer.writerow(row) 73 | 74 | f.close() 75 | -------------------------------------------------------------------------------- /nerlogparser/grammar/messageslog.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | from pyparsing import Word, alphas, Combine, nums, string, Optional, Regex 4 | from collections import OrderedDict 5 | from nerlogparser.grammar.grammar_utility import GrammarUtility 6 | 7 | 8 | class MessagesLog(object): 9 | def __init__(self, dataset): 10 | self.dataset = dataset 11 | self.groups = { 12 | 'group1': ['casper-rw', 'dfrws-2009-jhuisi', 'dfrws-2009-nssal', 'honeynet-challenge7'], 13 | 'group2': ['honeynet-challenge5'] 14 | } 15 | 16 | def get_grammar(self): 17 | dl = None 18 | if self.dataset in self.groups['group1']: 19 | dl = MessagesLog1(self.dataset) 20 | 21 | elif self.dataset in self.groups['group2']: 22 | dl = MessagesLog2(self.dataset) 23 | 24 | return dl 25 | 26 | 27 | class MessagesLog1(object): 28 | def __init__(self, dataset): 29 | """Constructor for class MessagesLog. This parser also supports syslog. 30 | 31 | Parameters 32 | ---------- 33 | dataset : str 34 | Dataset name. 35 | """ 36 | self.dataset = dataset 37 | self.messageslog_grammar = self.__get_messageslog_grammar() 38 | 39 | @staticmethod 40 | def __get_messageslog_grammar(): 41 | """The definition of messages log grammar. 
Supported dataset: 42 | casper-rw 43 | dfrws-2009 44 | honeynet-challenge7 45 | 46 | Returns 47 | ------- 48 | messageslog_grammar : 49 | Grammar for messages log 50 | """ 51 | ints = Word(nums) 52 | 53 | # timestamp 54 | month = Word(string.ascii_uppercase, string.ascii_lowercase, exact=3) 55 | day = ints 56 | hour = Combine(ints + ':' + ints + ':' + ints) 57 | timestamp = month + day + hour 58 | 59 | # hostname, service name, message 60 | hostname = Word(alphas + nums + '_' + '-' + '.') 61 | service = Word(alphas + nums + '/' + '-' + '_' + '.' + '[' + ']' + ':') 62 | 63 | # unix time 64 | unix_time = Optional('[' + Word(nums + '.' + ']')) 65 | subservice = Optional(Word(alphas + nums + '_' + '-' + ':' + '[' + ']' + '.' + '=' + '(' + ')' + '*' + 66 | '<' + '>' + ',')) 67 | subservice_two_words = Optional(Word(alphas + nums + '_' + '-' + ':' + '[' + ']' + ',' + '.' + '=' + '/' + 68 | '(' + ')' + '*')) 69 | message = Optional(Regex('.*')) 70 | 71 | # messages log grammar 72 | messageslog_grammar = timestamp + hostname + service + unix_time + subservice + subservice_two_words + message 73 | return messageslog_grammar 74 | 75 | def parse_log(self, log_line): 76 | """Parse messages log based on defined grammar. 77 | 78 | Parameters 79 | ---------- 80 | log_line : str 81 | A log line to be parsed. 82 | 83 | Returns 84 | ------- 85 | parsed : dict[str, str] 86 | A parsed messages log containing these elements: timestamp, hostname, service, unix_time, 87 | subservice and message. 88 | """ 89 | parsed_messageslog = self.messageslog_grammar.parseString(log_line) 90 | 91 | # get parsed kernel log 92 | parsed = OrderedDict() 93 | parsed['timestamp'] = parsed_messageslog[0] + ' ' + parsed_messageslog[1] + ' ' + parsed_messageslog[2] 94 | parsed['hostname'] = parsed_messageslog[3] 95 | parsed['service'] = parsed_messageslog[4] 96 | 97 | if len(parsed_messageslog) == 6: 98 | parsed['unix_time'] = '' 99 | parsed['subservice'] = '' 100 | if not parsed['service'].endswith(':'): 101 | parsed['message'] = parsed['service'] + ' ' + parsed_messageslog[5] 102 | parsed['service'] = '' 103 | else: 104 | parsed['message'] = '' 105 | 106 | elif len(parsed_messageslog) == 7: 107 | parsed['unix_time'] = '' 108 | parsed['subservice'] = '' 109 | if not parsed_messageslog[5].endswith(':'): 110 | parsed['message'] = parsed_messageslog[5] + ' ' + parsed_messageslog[6] 111 | 112 | if not parsed['service'].endswith(':'): 113 | parsed['message'] = parsed['service'] + ' ' + parsed_messageslog[5] + ' ' + parsed_messageslog[6] 114 | parsed['service'] = '' 115 | 116 | elif len(parsed_messageslog) == 8: 117 | parsed['unix_time'] = '' 118 | parsed['subservice'] = '' 119 | 120 | # no message 121 | if parsed_messageslog[7] == '' and parsed_messageslog[5].startswith('[') and \ 122 | parsed_messageslog[6].endswith(']'): 123 | parsed['unix_time'] = GrammarUtility.get_unix_timestamp(parsed_messageslog[5], parsed_messageslog[6]) 124 | parsed['message'] = '' 125 | 126 | # message exists 127 | elif parsed_messageslog[7] != '' and parsed_messageslog[5].startswith('[') and \ 128 | parsed_messageslog[6].endswith(']'): 129 | parsed['unix_time'] = GrammarUtility.get_unix_timestamp(parsed_messageslog[5], parsed_messageslog[6]) 130 | parsed['message'] = parsed_messageslog[7] 131 | 132 | # subservice one word 133 | elif parsed_messageslog[5].endswith(':') and not parsed_messageslog[6].endswith(':'): 134 | parsed['subservice'] = parsed_messageslog[5] 135 | parsed['message'] = ' '.join(parsed_messageslog[6:]) 136 | 137 | # subservice two words 138 | 
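            # (first token without a trailing ':', second token ending with ':')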
elif not parsed_messageslog[5].endswith(':') and parsed_messageslog[6].endswith(':'): 139 | parsed['subservice'] = parsed_messageslog[5] + ' ' + parsed_messageslog[6] 140 | parsed['message'] = parsed_messageslog[7] 141 | 142 | # subservice two words 143 | elif parsed_messageslog[5].endswith(':') and parsed_messageslog[6].endswith(':'): 144 | parsed['subservice'] = parsed_messageslog[5] + ' ' + parsed_messageslog[6] 145 | parsed['message'] = parsed_messageslog[7] 146 | 147 | # no timestamp, no subservice, just message 148 | else: 149 | parsed['message'] = ' '.join(parsed_messageslog[5:]) 150 | 151 | if not parsed['subservice'].endswith(':'): 152 | parsed['message'] = parsed['subservice'] + ' ' + parsed['message'] 153 | parsed['subservice'] = '' 154 | 155 | if not parsed['service'].endswith(':'): 156 | parsed['message'] = parsed['service'] + ' ' + parsed['message'] 157 | parsed['service'] = '' 158 | 159 | else: 160 | parsed['unix_time'] = GrammarUtility.get_unix_timestamp(parsed_messageslog[5], parsed_messageslog[6]) 161 | 162 | # subservice one word 163 | if parsed_messageslog[7].endswith(':') and not parsed_messageslog[8].endswith(':'): 164 | parsed['subservice'] = parsed_messageslog[7] 165 | parsed['message'] = ' '.join(parsed_messageslog[8:]) 166 | 167 | # subservice two words 168 | elif not parsed_messageslog[7].endswith(':') and parsed_messageslog[8].endswith(':'): 169 | parsed['subservice'] = parsed_messageslog[7] + ' ' + parsed_messageslog[8] 170 | parsed['message'] = ' '.join(parsed_messageslog[9:]) 171 | 172 | # subservice two words 173 | elif parsed_messageslog[7].endswith(':') and parsed_messageslog[8].endswith(':'): 174 | parsed['subservice'] = parsed_messageslog[7] + ' ' + parsed_messageslog[8] 175 | parsed['message'] = ' '.join(parsed_messageslog[9:]) 176 | 177 | else: 178 | parsed['subservice'] = '' 179 | parsed['message'] = ' '.join(parsed_messageslog[7:]) 180 | 181 | if not parsed['subservice'].endswith(':'): 182 | parsed['message'] = parsed['subservice'] + ' ' + parsed['message'] 183 | parsed['subservice'] = '' 184 | 185 | return parsed 186 | 187 | 188 | class MessagesLog2(object): 189 | def __init__(self, dataset): 190 | """Constructor for class MessagesLog. This parser also supports syslog. 191 | 192 | Parameters 193 | ---------- 194 | dataset : str 195 | Dataset name. 196 | """ 197 | self.dataset = dataset 198 | self.messageslog_grammar = self.__get_messageslog_grammar() 199 | 200 | @staticmethod 201 | def __get_messageslog_grammar(): 202 | """The definition of messages log grammar. Supported dataset: 203 | honeynet-challenge5 204 | 205 | Returns 206 | ------- 207 | messageslog_grammar : 208 | Grammar for messages log 209 | """ 210 | ints = Word(nums) 211 | 212 | # timestamp 213 | month = Word(string.ascii_uppercase, string.ascii_lowercase, exact=3) 214 | day = ints 215 | hour = Combine(ints + ':' + ints + ':' + ints) 216 | timestamp = month + day + hour 217 | 218 | # hostname, service name, message 219 | hostname = Word(alphas + nums + '_' + '-' + '.') 220 | service = Word(alphas + nums + '/' + '-' + '_' + '.' + '[' + ']' + ':') + Optional(':') 221 | 222 | # unix time 223 | unix_time = Optional('[' + Word(nums + '.' + ']')) 224 | subservice = Optional(Word(alphas + nums + '_' + '-' + ':')) 225 | subservice_two_words = Optional(Word(alphas + nums + '_' + '-' + ':' + '[' + ']' + ',' + '.' 
+ '=' + '/' + 226 | '(' + ')' + '*')) 227 | message = Regex('.*') 228 | 229 | # messages log grammar 230 | messageslog_grammar = timestamp + hostname + service + unix_time + subservice + subservice_two_words + message 231 | return messageslog_grammar 232 | 233 | def parse_log(self, log_line): 234 | """Parse messages log based on defined grammar. 235 | 236 | Parameters 237 | ---------- 238 | log_line : str 239 | A log line to be parsed. 240 | 241 | Returns 242 | ------- 243 | parsed : dict[str, str] 244 | A parsed messages log containing these elements: timestamp, hostname, service, unix_time, 245 | subservice and message. 246 | """ 247 | parsed_messageslog = self.messageslog_grammar.parseString(log_line) 248 | 249 | # get parsed kernel log 250 | parsed = OrderedDict() 251 | parsed['timestamp'] = parsed_messageslog[0] + ' ' + parsed_messageslog[1] + ' ' + parsed_messageslog[2] 252 | parsed['hostname'] = parsed_messageslog[3] 253 | parsed['service'] = parsed_messageslog[4] 254 | 255 | if len(parsed_messageslog) == 6: 256 | parsed['unix_time'] = '' 257 | parsed['subservice'] = '' 258 | if not parsed['service'].endswith(':'): 259 | parsed['message'] = parsed['service'] + ' ' + parsed_messageslog[5] 260 | parsed['service'] = '' 261 | else: 262 | parsed['message'] = '' 263 | 264 | elif len(parsed_messageslog) == 7: 265 | parsed['unix_time'] = '' 266 | parsed['subservice'] = '' 267 | if not parsed_messageslog[5].endswith(':'): 268 | parsed['message'] = parsed_messageslog[5] + ' ' + parsed_messageslog[6] 269 | 270 | if not parsed['service'].endswith(':'): 271 | parsed['message'] = parsed['service'] + ' ' + parsed_messageslog[5] + ' ' + parsed_messageslog[6] 272 | parsed['service'] = '' 273 | 274 | elif len(parsed_messageslog) == 8: 275 | parsed['unix_time'] = '' 276 | parsed['subservice'] = '' 277 | 278 | # timestamp exists 279 | if parsed_messageslog[5].startswith('[') and parsed_messageslog[5].endswith(']'): 280 | parsed['unix_time'] = GrammarUtility.get_unix_timestamp(parsed_messageslog[5], parsed_messageslog[6]) 281 | parsed['message'] = parsed_messageslog[7] 282 | 283 | # no timestamp, no subservice, just message 284 | else: 285 | parsed['message'] = ' '.join(parsed_messageslog[5:]) 286 | 287 | else: 288 | if parsed_messageslog[5] == ':': 289 | parsed['service'] = parsed['service'] + ' ' + parsed_messageslog[5] 290 | parsed['unix_time'] = GrammarUtility.get_unix_timestamp(parsed_messageslog[6], parsed_messageslog[7]) 291 | 292 | # subservice one word 293 | if parsed_messageslog[8].endswith(':') and not parsed_messageslog[9].endswith(':'): 294 | parsed['subservice'] = parsed_messageslog[8] 295 | parsed['message'] = ' '.join(parsed_messageslog[9:]) 296 | 297 | # subservice two words 298 | elif not parsed_messageslog[8].endswith(':') and parsed_messageslog[9].endswith(':'): 299 | parsed['subservice'] = parsed_messageslog[8] + ' ' + parsed_messageslog[9] 300 | parsed['message'] = ' '.join(parsed_messageslog[10:]) 301 | 302 | # subservice two words 303 | elif parsed_messageslog[8].endswith(':') and parsed_messageslog[9].endswith(':'): 304 | parsed['subservice'] = parsed_messageslog[8] + ' ' + parsed_messageslog[9] 305 | parsed['message'] = ' '.join(parsed_messageslog[10:]) 306 | 307 | # no subservice, only message 308 | else: 309 | parsed['subservice'] = '' 310 | parsed['message'] = ' '.join(parsed_messageslog[8:]) 311 | 312 | else: 313 | parsed['unix_time'] = GrammarUtility.get_unix_timestamp(parsed_messageslog[5], parsed_messageslog[6]) 314 | 315 | # subservice one word 316 | if 
parsed_messageslog[7].endswith(':') and not parsed_messageslog[8].endswith(':'): 317 | parsed['subservice'] = parsed_messageslog[7] 318 | parsed['message'] = ' '.join(parsed_messageslog[8:]) 319 | 320 | # subservice two words 321 | elif not parsed_messageslog[7].endswith(':') and parsed_messageslog[8].endswith(':'): 322 | parsed['subservice'] = parsed_messageslog[7] + ' ' + parsed_messageslog[8] 323 | parsed['message'] = ' '.join(parsed_messageslog[9:]) 324 | 325 | # subservice two words 326 | elif parsed_messageslog[7].endswith(':') and parsed_messageslog[8].endswith(':'): 327 | parsed['subservice'] = parsed_messageslog[7] + ' ' + parsed_messageslog[8] 328 | parsed['message'] = ' '.join(parsed_messageslog[9:]) 329 | 330 | # no subservice, only message 331 | else: 332 | parsed['subservice'] = '' 333 | parsed['message'] = ' '.join(parsed_messageslog[7:]) 334 | 335 | if not parsed['subservice'].endswith(':'): 336 | parsed['message'] = parsed['subservice'] + ' ' + parsed['message'] 337 | parsed['subservice'] = '' 338 | 339 | return parsed 340 | 341 | 342 | class Main(object): 343 | def __init__(self, datasets): 344 | self.datasets = datasets 345 | self.dataset_path = '/home/hudan/Git/prlogparser/datasets/' 346 | self.groups = { 347 | 'group1': ['casper-rw', 'dfrws-2009-jhuisi', 'dfrws-2009-nssal', 'honeynet-challenge7'], 348 | 'group2': ['honeynet-challenge5'] 349 | } 350 | 351 | def run(self): 352 | # parse messages.log 353 | for group_name, group in self.groups.items(): 354 | # setup test csv file to save results 355 | base_name = '/home/hudan/Git/prlogparser/groundtruth/messages-' 356 | test_file = base_name + group_name + '.csv' 357 | f = open(test_file, 'w', newline='') 358 | writer = csv.writer(f) 359 | 360 | for dataset in group: 361 | # get grammar 362 | dl = None 363 | if group_name == 'group1': 364 | dl = MessagesLog1(dataset) 365 | 366 | elif group_name == 'group2': 367 | dl = MessagesLog2(dataset) 368 | 369 | # start parsing 370 | for filename in self.datasets[dataset]: 371 | filename = os.path.join(self.dataset_path, filename) 372 | with open(filename, 'r') as f: 373 | for line in f: 374 | # get parsed line and print 375 | parsed_line = dl.parse_log(line) 376 | print(parsed_line) 377 | 378 | # write to csv 379 | row = list(parsed_line.values()) 380 | writer.writerow(row) 381 | 382 | f.close() 383 | 384 | 385 | if __name__ == '__main__': 386 | datasets_files = { 387 | 'casper-rw': [ 388 | 'casper-rw/messages', 389 | 'casper-rw/syslog', 390 | 'casper-rw/syslog.0', 391 | 'casper-rw/syslog.1', 392 | 'casper-rw/syslog.2', 393 | 'casper-rw/syslog.3' 394 | ], 395 | 'dfrws-2009-jhuisi': [ 396 | 'dfrws-2009-jhuisi/messages', 397 | 'dfrws-2009-jhuisi/messages.0', 398 | 'dfrws-2009-jhuisi/messages.1', 399 | 'dfrws-2009-jhuisi/syslog', 400 | 'dfrws-2009-jhuisi/syslog.0', 401 | 'dfrws-2009-jhuisi/syslog.1', 402 | 'dfrws-2009-jhuisi/syslog.2' 403 | ], 404 | 'dfrws-2009-nssal': [ 405 | 'dfrws-2009-nssal/messages', 406 | 'dfrws-2009-nssal/messages.0', 407 | 'dfrws-2009-nssal/messages.1', 408 | 'dfrws-2009-nssal/messages.2', 409 | 'dfrws-2009-nssal/messages.3', 410 | 'dfrws-2009-nssal/syslog', 411 | 'dfrws-2009-nssal/syslog.0', 412 | 'dfrws-2009-nssal/syslog.1', 413 | 'dfrws-2009-nssal/syslog.2', 414 | 'dfrws-2009-nssal/syslog.3', 415 | 'dfrws-2009-nssal/syslog.4', 416 | 'dfrws-2009-nssal/syslog.5', 417 | 'dfrws-2009-nssal/syslog.6' 418 | ], 419 | 'honeynet-challenge5': ['honeynet-challenge5/messages'], 420 | 'honeynet-challenge7': [ 421 | 'honeynet-challenge7/messages', 422 | 
'honeynet-challenge7/syslog' 423 | ] 424 | } 425 | 426 | main = Main(datasets_files) 427 | main.run() 428 | -------------------------------------------------------------------------------- /nerlogparser/grammar/proxifierlog.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | from pyparsing import Word, Combine, nums, alphas, Optional, Regex 4 | from collections import OrderedDict 5 | 6 | 7 | class ProxifierLog(object): 8 | def __init__(self, dataset): 9 | self.dataset = dataset 10 | self.proxifierlog_grammar = self.__get_proxifierlog_grammar() 11 | 12 | @staticmethod 13 | def __get_proxifierlog_grammar(): 14 | # get proxifier grammar 15 | ints = Word(nums) 16 | 17 | date = Combine('[' + ints + '.' + ints) 18 | time = Combine(ints + ':' + ints + ':' + ints + ']') 19 | timestamp = date + time 20 | 21 | service = Word(alphas + nums + '.' + '-' + '_') 22 | arch = Optional(Word('*' + nums)) 23 | domain_or_ip = Optional(Word('-')) + Word(alphas + nums + '.' + ':' + '-') 24 | status = Optional(Word(alphas + ',')) + Optional(':') 25 | message = Regex('.*') 26 | 27 | proxifierlog_grammar = timestamp + service + arch + domain_or_ip + status + message 28 | return proxifierlog_grammar 29 | 30 | def parse_log(self, log_line): 31 | # parse proxifier log entries 32 | parsed_proxifierlog = self.proxifierlog_grammar.parseString(log_line) 33 | 34 | parsed = OrderedDict() 35 | parsed['timestamp'] = parsed_proxifierlog[0] + ' ' + parsed_proxifierlog[1] 36 | parsed['service'] = parsed_proxifierlog[2] 37 | 38 | if len(parsed_proxifierlog) == 6: 39 | parsed['service'] = parsed_proxifierlog[2] + ' ' + parsed_proxifierlog[3] 40 | parsed['arch'] = '' 41 | parsed['domain_or_ip'] = '' 42 | parsed['status'] = '' 43 | parsed['message'] = ' '.join(parsed_proxifierlog[4:]) 44 | 45 | elif len(parsed_proxifierlog) == 7: 46 | parsed['arch'] = '' 47 | parsed['domain_or_ip'] = parsed_proxifierlog[3] + ' ' + parsed_proxifierlog[4] 48 | 49 | if parsed_proxifierlog[5].endswith(','): 50 | parsed['status'] = parsed_proxifierlog[5] 51 | parsed['message'] = parsed_proxifierlog[6] 52 | else: 53 | parsed['status'] = '' 54 | parsed['message'] = ' '.join(parsed_proxifierlog[5:]) 55 | 56 | elif len(parsed_proxifierlog) == 8: 57 | if parsed_proxifierlog[3].startswith('*'): 58 | parsed['arch'] = parsed_proxifierlog[3] 59 | parsed['domain_or_ip'] = parsed_proxifierlog[4] + ' ' + parsed_proxifierlog[5] 60 | 61 | if parsed_proxifierlog[6].endswith(','): 62 | parsed['status'] = parsed_proxifierlog[6] 63 | parsed['message'] = parsed_proxifierlog[7] 64 | else: 65 | parsed['status'] = '' 66 | parsed['message'] = ' '.join(parsed_proxifierlog[6:]) 67 | 68 | else: 69 | parsed['arch'] = '' 70 | parsed['domain_or_ip'] = parsed_proxifierlog[3] + ' ' + parsed_proxifierlog[4] 71 | 72 | if parsed_proxifierlog[6] == ':': 73 | parsed['status'] = parsed_proxifierlog[5] + ' ' + parsed_proxifierlog[6] 74 | parsed['message'] = parsed_proxifierlog[7] 75 | else: 76 | parsed['status'] = '' 77 | parsed['message'] = ' '.join(parsed_proxifierlog[5:]) 78 | 79 | elif len(parsed_proxifierlog) == 9: 80 | parsed['arch'] = parsed_proxifierlog[3] 81 | parsed['domain_or_ip'] = parsed_proxifierlog[4] + ' ' + parsed_proxifierlog[5] 82 | parsed['status'] = parsed_proxifierlog[6] + ' ' + parsed_proxifierlog[7] 83 | parsed['message'] = parsed_proxifierlog[8] 84 | 85 | return parsed 86 | 87 | 88 | if __name__ == '__main__': 89 | dataset_path = '/home/hudan/Git/prlogparser/datasets/proxifier/' 90 | filenames = 
['proxifier.log'] 91 | 92 | test_file = '/home/hudan/Git/prlogparser/groundtruth/test-results/proxifier-test.csv' 93 | f = open(test_file, 'w', newline='') 94 | writer = csv.writer(f) 95 | 96 | pl = ProxifierLog('') 97 | for filename in filenames: 98 | filename = os.path.join(dataset_path, filename) 99 | with open(filename, 'r') as f: 100 | for line in f: 101 | parsed_line = pl.parse_log(line) 102 | print(parsed_line) 103 | 104 | row = list(parsed_line.values()) 105 | writer.writerow(row) 106 | 107 | f.close() 108 | -------------------------------------------------------------------------------- /nerlogparser/grammar/weblog.py: -------------------------------------------------------------------------------- 1 | # httpServerLogParser.py 2 | # 3 | # Copyright (c) 2016, Paul McGuire 4 | # 5 | 6 | import os 7 | import csv 8 | import string 9 | from pyparsing import alphas, nums, dblQuotedString, Combine, Word, Group, delimitedList 10 | from collections import OrderedDict 11 | 12 | 13 | class WebLog(object): 14 | """This class is based on httpServerLogParser.py by Paul McGuire. 15 | http://pyparsing.wikispaces.com/file/detail/httpServerLogParser.py 16 | 17 | """ 18 | def __init__(self, dataset): 19 | self.dataset = dataset 20 | self.weblog_grammar = self.__get_weblog_grammar() 21 | 22 | @staticmethod 23 | def __get_weblog_grammar(): 24 | integer = Word(nums) 25 | ip_address = delimitedList(integer, ".", combine=True) 26 | time_zone_offset = Word("+-", nums) 27 | month = Word(string.ascii_uppercase, string.ascii_lowercase, exact=3) 28 | server_date_time = Group(Combine("[" + integer + "/" + month + "/" + integer + 29 | ":" + integer + ":" + integer + ":" + integer) + 30 | Combine(time_zone_offset + "]")) 31 | 32 | weblog_grammar = (ip_address.setResultsName("ip_address") + 33 | Word("-").setResultsName("dash") + 34 | ("-" | Word(alphas + nums + "@._")).setResultsName("auth") + 35 | server_date_time.setResultsName("timestamp") + 36 | dblQuotedString.setResultsName("command") + 37 | (integer | "-").setResultsName("status_code") + 38 | (integer | "-").setResultsName("num_bytes") + 39 | dblQuotedString.setResultsName("referrer") + 40 | dblQuotedString.setResultsName("client_agent")) 41 | 42 | return weblog_grammar 43 | 44 | def parse_log(self, log_line): 45 | parsed_weblog = self.weblog_grammar.parseString(log_line) 46 | 47 | parsed = OrderedDict() 48 | parsed['ip_address'] = parsed_weblog.ip_address 49 | parsed['dash'] = parsed_weblog.dash 50 | parsed['auth'] = parsed_weblog.auth 51 | parsed['timestamp'] = ' '.join(parsed_weblog.timestamp[0:2]) 52 | parsed['command'] = parsed_weblog.command 53 | parsed['status_code'] = parsed_weblog.status_code 54 | parsed['num_bytes'] = parsed_weblog.num_bytes 55 | parsed['referrer'] = parsed_weblog.referrer 56 | parsed['client_agent'] = parsed_weblog.client_agent 57 | 58 | return parsed 59 | 60 | @staticmethod 61 | def __get_filename(base_filename, month, day): 62 | # example: 63 | # access.log.2018-01-02 64 | # access.log.2018-01-01 65 | 66 | # check day format 67 | if day < 10: 68 | day = '0' + str(day) 69 | else: 70 | day = str(day) 71 | 72 | fn = base_filename + '0' + str(month) + '-' + day 73 | return fn 74 | 75 | def get_all_filenames(self): 76 | # setup variables 77 | base_filename = 'access.log.2018-' 78 | months = range(3, 6) 79 | day_odd = range(1, 32) 80 | day_even = range(1, 31) 81 | 82 | # get all filenames 83 | filenames = [] 84 | for month in months: 85 | if month % 2 == 0: 86 | for day in day_even: 87 | fn = self.__get_filename(base_filename, 
month, day) 88 | filenames.append(fn) 89 | else: 90 | for day in day_odd: 91 | fn = self.__get_filename(base_filename, month, day) 92 | filenames.append(fn) 93 | 94 | return filenames 95 | 96 | 97 | if __name__ == '__main__': 98 | wl = WebLog('') 99 | dataset_path = '/home/hudan/Git/prlogparser/datasets/secrepo-accesslog/' 100 | filenames_list = wl.get_all_filenames() 101 | 102 | test_file = '/home/hudan/Git/prlogparser/groundtruth/test-results/weblog-test.csv' 103 | f = open(test_file, 'w', newline='') 104 | writer = csv.writer(f) 105 | 106 | for filename in filenames_list: 107 | filename = os.path.join(dataset_path, filename) 108 | with open(filename, 'r') as f: 109 | for line in f: 110 | parsed_line = wl.parse_log(line) 111 | print(parsed_line) 112 | 113 | row = list(parsed_line.values()) 114 | writer.writerow(row) 115 | 116 | f.close() 117 | -------------------------------------------------------------------------------- /nerlogparser/grammar/zookeeperlog.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | from pyparsing import Word, Combine, nums, alphas, Regex, Optional 4 | from collections import OrderedDict 5 | 6 | 7 | class ZookeeperLog(object): 8 | def __init__(self, dataset): 9 | self.dataset = dataset 10 | self.zookeeperlog_grammar = self.__get_zookeeperlog_grammar() 11 | 12 | @staticmethod 13 | def __get_zookeeperlog_grammar(): 14 | ints = Word(nums) 15 | 16 | date = Combine(ints + '-' + ints + '-' + ints) 17 | time = Combine(ints + ':' + ints + ':' + ints + ',' + ints) 18 | timestamp = date + time 19 | 20 | dash = Word('-') 21 | status = Word(alphas) 22 | job = Word(alphas + nums + '[]:@=/.$()-') + Optional(Word(alphas + nums + '[]:@=/.$()-')) + Optional(Word('-')) 23 | message = Regex('.*') 24 | 25 | zookeperlog_grammar = timestamp('timestamp') + dash('dash') + status('status') + job('job') + message('message') 26 | return zookeperlog_grammar 27 | 28 | def parse_log(self, log_line): 29 | parsed_zookeeperlog = self.zookeeperlog_grammar.parseString(log_line) 30 | 31 | parsed = OrderedDict() 32 | parsed['timestamp'] = ' '.join(parsed_zookeeperlog.timestamp) 33 | parsed['dash'] = parsed_zookeeperlog.dash 34 | parsed['status'] = parsed_zookeeperlog.status 35 | parsed['job'] = ' '.join(parsed_zookeeperlog.job) 36 | parsed['message'] = parsed_zookeeperlog.message 37 | 38 | return parsed 39 | 40 | 41 | if __name__ == '__main__': 42 | dataset_path = '/home/hudan/Git/prlogparser/datasets/zookeeper/' 43 | filenames = ['zookeeper.log'] 44 | 45 | test_file = '/home/hudan/Git/prlogparser/groundtruth/test-results/zookeeper-test.csv' 46 | f = open(test_file, 'w', newline='') 47 | writer = csv.writer(f) 48 | 49 | zl = ZookeeperLog('') 50 | for filename in filenames: 51 | filename = os.path.join(dataset_path, filename) 52 | with open(filename, 'r') as f: 53 | for line in f: 54 | parsed_line = zl.parse_log(line) 55 | print(parsed_line) 56 | 57 | row = list(parsed_line.values()) 58 | writer.writerow(row) 59 | 60 | f.close() 61 | -------------------------------------------------------------------------------- /nerlogparser/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/studiawan/nerlogparser/4dc3d955f735ea5496557ee76378a38b5746e425/nerlogparser/model/__init__.py -------------------------------------------------------------------------------- /nerlogparser/model/base_model.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import tensorflow as tf 3 | 4 | 5 | class BaseModel(object): 6 | """Generic class for general methods that are not specific to NER""" 7 | 8 | def __init__(self, config): 9 | """Defines self.config and self.logger 10 | 11 | Args: 12 | config: (Config instance) class with hyper parameters, 13 | vocab and embeddings 14 | 15 | """ 16 | self.config = config 17 | self.logger = config.logger 18 | self.sess = None 19 | self.saver = None 20 | 21 | def reinitialize_weights(self, scope_name): 22 | """Reinitializes the weights of a given layer""" 23 | variables = tf.contrib.framework.get_variables(scope_name) 24 | init = tf.variables_initializer(variables) 25 | self.sess.run(init) 26 | 27 | def add_train_op(self, lr_method, lr, loss, clip=-1): 28 | """Defines self.train_op that performs an update on a batch 29 | 30 | Args: 31 | lr_method: (string) sgd method, for example "adam" 32 | lr: (tf.placeholder) tf.float32, learning rate 33 | loss: (tensor) tf.float32 loss to minimize 34 | clip: (python float) clipping of gradient. If < 0, no clipping 35 | 36 | """ 37 | _lr_m = lr_method.lower() # lower to make sure 38 | 39 | with tf.variable_scope("train_step"): 40 | if _lr_m == 'adam': # sgd method 41 | optimizer = tf.train.AdamOptimizer(lr) 42 | elif _lr_m == 'adagrad': 43 | optimizer = tf.train.AdagradOptimizer(lr) 44 | elif _lr_m == 'sgd': 45 | optimizer = tf.train.GradientDescentOptimizer(lr) 46 | elif _lr_m == 'rmsprop': 47 | optimizer = tf.train.RMSPropOptimizer(lr) 48 | else: 49 | raise NotImplementedError("Unknown method {}".format(_lr_m)) 50 | 51 | if clip > 0: # gradient clipping if clip is positive 52 | grads, vs = zip(*optimizer.compute_gradients(loss)) 53 | grads, gnorm = tf.clip_by_global_norm(grads, clip) 54 | self.train_op = optimizer.apply_gradients(zip(grads, vs)) 55 | else: 56 | self.train_op = optimizer.minimize(loss) 57 | 58 | def initialize_session(self): 59 | """Defines self.sess and initialize the variables""" 60 | self.logger.info("Initializing tf session") 61 | self.sess = tf.Session() 62 | self.sess.run(tf.global_variables_initializer()) 63 | self.saver = tf.train.Saver() 64 | 65 | def restore_session(self, dir_model): 66 | """Reload weights into session 67 | 68 | Args: 69 | sess: tf.Session() 70 | dir_model: dir with weights 71 | 72 | """ 73 | self.logger.info("Reloading the latest trained model...") 74 | self.saver.restore(self.sess, dir_model) 75 | 76 | def save_session(self): 77 | """Saves session = weights""" 78 | if not os.path.exists(self.config.dir_model): 79 | os.makedirs(self.config.dir_model) 80 | self.saver.save(self.sess, self.config.dir_model) 81 | 82 | def close_session(self): 83 | """Closes the session""" 84 | self.sess.close() 85 | 86 | def add_summary(self): 87 | """Defines variables for Tensorboard 88 | 89 | Args: 90 | dir_output: (string) where the results are written 91 | 92 | """ 93 | self.merged = tf.summary.merge_all() 94 | self.file_writer = tf.summary.FileWriter(self.config.dir_output, 95 | self.sess.graph) 96 | 97 | def train(self, train, dev): 98 | """Performs training with early stopping and lr exponential decay 99 | 100 | Args: 101 | train: dataset that yields tuple of (sentences, tags) 102 | dev: dataset 103 | 104 | """ 105 | best_score = 0 106 | nepoch_no_imprv = 0 # for early stopping 107 | self.add_summary() # tensorboard 108 | 109 | for epoch in range(self.config.nepochs): 110 | self.logger.info("Epoch {:} out of {:}".format(epoch + 1, 111 | 
self.config.nepochs)) 112 | 113 | score = self.run_epoch(train, dev, epoch) 114 | self.config.lr *= self.config.lr_decay # decay learning rate 115 | 116 | # early stopping and saving best parameters 117 | if score >= best_score: 118 | nepoch_no_imprv = 0 119 | self.save_session() 120 | best_score = score 121 | self.logger.info("- new best score!") 122 | else: 123 | nepoch_no_imprv += 1 124 | if nepoch_no_imprv >= self.config.nepoch_no_imprv: 125 | self.logger.info("- early stopping {} epochs without "\ 126 | "improvement".format(nepoch_no_imprv)) 127 | break 128 | 129 | def evaluate(self, test): 130 | """Evaluate model on test set 131 | 132 | Args: 133 | test: instance of class Dataset 134 | 135 | """ 136 | self.logger.info("Testing model over test set") 137 | metrics = self.run_evaluate(test) 138 | msg = " - ".join(["{} {:04.2f}".format(k, v) 139 | for k, v in metrics.items()]) 140 | self.logger.info(msg) 141 | -------------------------------------------------------------------------------- /nerlogparser/model/build_data.py: -------------------------------------------------------------------------------- 1 | from nerlogparser.model.config import Config 2 | from nerlogparser.model.data_utils import CoNLLDataset, get_vocabs, UNK, NUM, \ 3 | get_glove_vocab, write_vocab, load_vocab, get_char_vocab, \ 4 | export_trimmed_glove_vectors, get_processing_word 5 | 6 | 7 | def main(): 8 | """Procedure to build data 9 | 10 | You MUST RUN this procedure. It iterates over the whole dataset (train, 11 | dev and test) and extract the vocabularies in terms of words, tags, and 12 | characters. Having built the vocabularies it writes them in a file. The 13 | writing of vocabulary in a file assigns an id (the line #) to each word. 14 | It then extract the relevant GloVe vectors and stores them in a np array 15 | such that the i-th entry corresponds to the i-th word in the vocabulary. 16 | 17 | 18 | Args: 19 | config: (instance of Config) has attributes like hyper-params... 
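    Typical invocation (illustrative; assumes the package is installed and the
    GloVe file referenced in Config is present):

        python -m nerlogparser.model.build_data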
20 | 21 | """ 22 | # get config and processing of words 23 | config = Config(load=False) 24 | processing_word = get_processing_word(lowercase=True) 25 | 26 | # Generators 27 | dev = CoNLLDataset(config.filename_dev, processing_word) 28 | test = CoNLLDataset(config.filename_test, processing_word) 29 | train = CoNLLDataset(config.filename_train, processing_word) 30 | 31 | # Build Word and Tag vocab 32 | vocab_words, vocab_tags = get_vocabs([train, dev, test]) 33 | vocab_glove = get_glove_vocab(config.filename_glove) 34 | 35 | vocab = vocab_words & vocab_glove 36 | vocab.add(UNK) 37 | vocab.add(NUM) 38 | 39 | # Save vocab 40 | write_vocab(vocab, config.filename_words) 41 | write_vocab(vocab_tags, config.filename_tags) 42 | 43 | # Trim GloVe Vectors 44 | vocab = load_vocab(config.filename_words) 45 | export_trimmed_glove_vectors(vocab, config.filename_glove, 46 | config.filename_trimmed, config.dim_word) 47 | 48 | # Build and save char vocab 49 | train = CoNLLDataset(config.filename_train) 50 | vocab_chars = get_char_vocab(train) 51 | write_vocab(vocab_chars, config.filename_chars) 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /nerlogparser/model/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | from nerlogparser.model.general_utils import get_logger 5 | from nerlogparser.model.data_utils import get_trimmed_glove_vectors, load_vocab, \ 6 | get_processing_word 7 | 8 | 9 | class Config(): 10 | def __init__(self, load=True): 11 | """Initialize hyperparameters and load vocabs 12 | 13 | Args: 14 | load_embeddings: (bool) if True, load embeddings into 15 | np array, else None 16 | 17 | """ 18 | # directory for training outputs 19 | if not os.path.exists(self.dir_output): 20 | os.makedirs(self.dir_output) 21 | 22 | # create instance of logger 23 | self.logger = get_logger(self.path_log) 24 | 25 | # load if requested (default) 26 | if load: 27 | self.load() 28 | 29 | 30 | def load(self): 31 | """Loads vocabulary, processing functions and embeddings 32 | 33 | Supposes that build_data.py has been run successfully and that 34 | the corresponding files have been created (vocab and trimmed GloVe 35 | vectors) 36 | 37 | """ 38 | # 1. vocabulary 39 | self.vocab_words = load_vocab(self.filename_words) 40 | self.vocab_tags = load_vocab(self.filename_tags) 41 | self.vocab_chars = load_vocab(self.filename_chars) 42 | 43 | self.nwords = len(self.vocab_words) 44 | self.nchars = len(self.vocab_chars) 45 | self.ntags = len(self.vocab_tags) 46 | 47 | # 2. get processing functions that map str -> id 48 | self.processing_word = get_processing_word(self.vocab_words, 49 | self.vocab_chars, lowercase=True, chars=self.use_chars) 50 | self.processing_tag = get_processing_word(self.vocab_tags, 51 | lowercase=False, allow_unk=False) 52 | 53 | # 3. 
get pre-trained embeddings 54 | self.embeddings = (get_trimmed_glove_vectors(self.filename_trimmed) 55 | if self.use_pretrained else None) 56 | 57 | 58 | # general config 59 | file_path = os.path.dirname(os.path.realpath(__file__)) 60 | dir_output = os.path.join(file_path, '..', "results/test/") 61 | dir_model = dir_output + "model.weights/" 62 | path_log = dir_output + "log.txt" 63 | 64 | # embeddings 65 | dim_word = 300 66 | dim_char = 100 67 | 68 | # glove files 69 | filename_glove = "data/glove.6B/glove.6B.{}d.txt".format(dim_word) 70 | # trimmed embeddings (created from glove_filename with build_data.py) 71 | filename_trimmed = os.path.join(file_path, '..', "data/glove.6B.{}d.trimmed.npz".format(dim_word)) 72 | use_pretrained = True 73 | 74 | # dataset 75 | filename_dev = "data/conll/conll.dev.txt" 76 | filename_test = "data/conll/conll.test.txt" 77 | filename_train = "data/conll/conll.train.txt" 78 | 79 | # filename_dev = filename_test = filename_train = "data/test.txt" # test 80 | 81 | max_iter = None # if not None, max number of examples in Dataset 82 | 83 | # vocab (created from dataset with build_data.py) 84 | filename_words = os.path.join(file_path, '..', "data/words.txt") 85 | filename_tags = os.path.join(file_path, '..', "data/tags.txt") 86 | filename_chars = os.path.join(file_path, '..', "data/chars.txt") 87 | 88 | # training 89 | train_embeddings = False 90 | nepochs = 15 91 | dropout = 0.5 92 | batch_size = 20 93 | lr_method = "adam" 94 | lr = 0.001 95 | lr_decay = 0.9 96 | clip = -1 # if negative, no clipping 97 | nepoch_no_imprv = 3 98 | 99 | # model hyperparameters 100 | hidden_size_char = 100 # lstm on chars 101 | hidden_size_lstm = 300 # lstm on word embeddings 102 | 103 | # NOTE: if both chars and crf, only 1.6x slower on GPU 104 | use_crf = False # if crf, training is 1.7x slower on CPU 105 | use_chars = True # if char embedding, training is 3.5x slower on CPU 106 | 107 | label_file = os.path.join(file_path, '..', "data/label.txt") 108 | -------------------------------------------------------------------------------- /nerlogparser/model/data_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | 4 | 5 | # shared global variables to be imported from model also 6 | UNK = "$UNK$" 7 | NUM = "$NUM$" 8 | NONE = "O" 9 | 10 | 11 | # special error message 12 | class MyIOError(Exception): 13 | def __init__(self, filename): 14 | # custom error message 15 | message = """ 16 | ERROR: Unable to locate file {}. 17 | 18 | FIX: Have you tried running python build_data.py first? 19 | This will build vocab file from your train, test and dev sets and 20 | trimm your word vectors. 
21 | """.format(filename) 22 | super(MyIOError, self).__init__(message) 23 | 24 | 25 | class CoNLLDataset(object): 26 | """Class that iterates over CoNLL Dataset 27 | 28 | __iter__ method yields a tuple (words, tags) 29 | words: list of raw words 30 | tags: list of raw tags 31 | 32 | If processing_word and processing_tag are not None, 33 | optional preprocessing is appplied 34 | 35 | Example: 36 | ```python 37 | data = CoNLLDataset(filename) 38 | for sentence, tags in data: 39 | pass 40 | ``` 41 | 42 | """ 43 | def __init__(self, filename, processing_word=None, processing_tag=None, 44 | max_iter=None): 45 | """ 46 | Args: 47 | filename: path to the file 48 | processing_words: (optional) function that takes a word as input 49 | processing_tags: (optional) function that takes a tag as input 50 | max_iter: (optional) max number of sentences to yield 51 | 52 | """ 53 | self.filename = filename 54 | self.processing_word = processing_word 55 | self.processing_tag = processing_tag 56 | self.max_iter = max_iter 57 | self.length = None 58 | 59 | 60 | def __iter__(self): 61 | niter = 0 62 | with open(self.filename) as f: 63 | words, tags = [], [] 64 | for line in f: 65 | line = line.strip() 66 | if (len(line) == 0 or line.startswith("-DOCSTART-")): 67 | if len(words) != 0: 68 | niter += 1 69 | if self.max_iter is not None and niter > self.max_iter: 70 | break 71 | yield words, tags 72 | words, tags = [], [] 73 | else: 74 | ls = line.split(' ') 75 | word, tag = ls[0],ls[1] 76 | if self.processing_word is not None: 77 | word = self.processing_word(word) 78 | if self.processing_tag is not None: 79 | tag = self.processing_tag(tag) 80 | words += [word] 81 | tags += [tag] 82 | 83 | 84 | def __len__(self): 85 | """Iterates once over the corpus to set and store length""" 86 | if self.length is None: 87 | self.length = 0 88 | for _ in self: 89 | self.length += 1 90 | 91 | return self.length 92 | 93 | 94 | def get_vocabs(datasets): 95 | """Build vocabulary from an iterable of datasets objects 96 | 97 | Args: 98 | datasets: a list of dataset objects 99 | 100 | Returns: 101 | a set of all the words in the dataset 102 | 103 | """ 104 | print("Building vocab...") 105 | vocab_words = set() 106 | vocab_tags = set() 107 | for dataset in datasets: 108 | for words, tags in dataset: 109 | vocab_words.update(words) 110 | vocab_tags.update(tags) 111 | print("- done. {} tokens".format(len(vocab_words))) 112 | return vocab_words, vocab_tags 113 | 114 | 115 | def get_char_vocab(dataset): 116 | """Build char vocabulary from an iterable of datasets objects 117 | 118 | Args: 119 | dataset: a iterator yielding tuples (sentence, tags) 120 | 121 | Returns: 122 | a set of all the characters in the dataset 123 | 124 | """ 125 | vocab_char = set() 126 | for words, _ in dataset: 127 | for word in words: 128 | vocab_char.update(word) 129 | 130 | return vocab_char 131 | 132 | 133 | def get_glove_vocab(filename): 134 | """Load vocab from file 135 | 136 | Args: 137 | filename: path to the glove vectors 138 | 139 | Returns: 140 | vocab: set() of strings 141 | """ 142 | print("Building vocab...") 143 | vocab = set() 144 | with open(filename) as f: 145 | for line in f: 146 | word = line.strip().split(' ')[0] 147 | vocab.add(word) 148 | print("- done. {} tokens".format(len(vocab))) 149 | return vocab 150 | 151 | 152 | def write_vocab(vocab, filename): 153 | """Writes a vocab to a file 154 | 155 | Writes one word per line. 
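    Illustrative usage (the exact file contents depend on the iteration order
    of the vocab set):

        write_vocab({"the", "$UNK$", "$NUM$"}, "data/words.txt")
        # data/words.txt then holds one token per line; load_vocab() later
        # maps each token back to its line index.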
156 | 157 | Args: 158 | vocab: iterable that yields word 159 | filename: path to vocab file 160 | 161 | Returns: 162 | write a word per line 163 | 164 | """ 165 | print("Writing vocab...") 166 | with open(filename, "w") as f: 167 | for i, word in enumerate(vocab): 168 | if i != len(vocab) - 1: 169 | f.write("{}\n".format(word)) 170 | else: 171 | f.write(word) 172 | print("- done. {} tokens".format(len(vocab))) 173 | 174 | 175 | def load_vocab(filename): 176 | """Loads vocab from a file 177 | 178 | Args: 179 | filename: (string) the format of the file must be one word per line. 180 | 181 | Returns: 182 | d: dict[word] = index 183 | 184 | """ 185 | try: 186 | d = dict() 187 | with open(filename) as f: 188 | for idx, word in enumerate(f): 189 | word = word.strip() 190 | d[word] = idx 191 | 192 | except IOError: 193 | raise MyIOError(filename) 194 | return d 195 | 196 | 197 | def export_trimmed_glove_vectors(vocab, glove_filename, trimmed_filename, dim): 198 | """Saves glove vectors in numpy array 199 | 200 | Args: 201 | vocab: dictionary vocab[word] = index 202 | glove_filename: a path to a glove file 203 | trimmed_filename: a path where to store a matrix in npy 204 | dim: (int) dimension of embeddings 205 | 206 | """ 207 | embeddings = np.zeros([len(vocab), dim]) 208 | with open(glove_filename) as f: 209 | for line in f: 210 | line = line.strip().split(' ') 211 | word = line[0] 212 | embedding = [float(x) for x in line[1:]] 213 | if word in vocab: 214 | word_idx = vocab[word] 215 | embeddings[word_idx] = np.asarray(embedding) 216 | 217 | np.savez_compressed(trimmed_filename, embeddings=embeddings) 218 | 219 | 220 | def get_trimmed_glove_vectors(filename): 221 | """ 222 | Args: 223 | filename: path to the npz file 224 | 225 | Returns: 226 | matrix of embeddings (np array) 227 | 228 | """ 229 | try: 230 | with np.load(filename) as data: 231 | return data["embeddings"] 232 | 233 | except IOError: 234 | raise MyIOError(filename) 235 | 236 | 237 | def get_processing_word(vocab_words=None, vocab_chars=None, 238 | lowercase=False, chars=False, allow_unk=True): 239 | """Return lambda function that transform a word (string) into list, 240 | or tuple of (list, id) of int corresponding to the ids of the word and 241 | its corresponding characters. 242 | 243 | Args: 244 | vocab: dict[word] = idx 245 | 246 | Returns: 247 | f("cat") = ([12, 4, 32], 12345) 248 | = (list of char ids, word id) 249 | 250 | """ 251 | def f(word): 252 | # 0. get chars of words 253 | if vocab_chars is not None and chars == True: 254 | char_ids = [] 255 | for char in word: 256 | # ignore chars out of vocabulary 257 | if char in vocab_chars: 258 | char_ids += [vocab_chars[char]] 259 | 260 | # 1. preprocess word 261 | if lowercase: 262 | word = word.lower() 263 | if word.isdigit(): 264 | word = NUM 265 | 266 | # 2. get id of word 267 | if vocab_words is not None: 268 | if word in vocab_words: 269 | word = vocab_words[word] 270 | else: 271 | if allow_unk: 272 | word = vocab_words[UNK] 273 | else: 274 | raise Exception("Unknow key is not allowed. Check that "\ 275 | "your vocab (tags?) is correct") 276 | 277 | # 3. 
return tuple char ids, word id 278 | if vocab_chars is not None and chars == True: 279 | return char_ids, word 280 | else: 281 | return word 282 | 283 | return f 284 | 285 | 286 | def _pad_sequences(sequences, pad_tok, max_length): 287 | """ 288 | Args: 289 | sequences: a generator of list or tuple 290 | pad_tok: the char to pad with 291 | 292 | Returns: 293 | a list of list where each sublist has same length 294 | """ 295 | sequence_padded, sequence_length = [], [] 296 | 297 | for seq in sequences: 298 | seq = list(seq) 299 | seq_ = seq[:max_length] + [pad_tok]*max(max_length - len(seq), 0) 300 | sequence_padded += [seq_] 301 | sequence_length += [min(len(seq), max_length)] 302 | 303 | return sequence_padded, sequence_length 304 | 305 | 306 | def pad_sequences(sequences, pad_tok, nlevels=1): 307 | """ 308 | Args: 309 | sequences: a generator of list or tuple 310 | pad_tok: the char to pad with 311 | nlevels: "depth" of padding, for the case where we have characters ids 312 | 313 | Returns: 314 | a list of list where each sublist has same length 315 | 316 | """ 317 | if nlevels == 1: 318 | max_length = max(map(lambda x : len(x), sequences)) 319 | sequence_padded, sequence_length = _pad_sequences(sequences, 320 | pad_tok, max_length) 321 | 322 | elif nlevels == 2: 323 | max_length_word = max([max(map(lambda x: len(x), seq)) 324 | for seq in sequences]) 325 | sequence_padded, sequence_length = [], [] 326 | for seq in sequences: 327 | # all words are same length now 328 | sp, sl = _pad_sequences(seq, pad_tok, max_length_word) 329 | sequence_padded += [sp] 330 | sequence_length += [sl] 331 | 332 | max_length_sentence = max(map(lambda x : len(x), sequences)) 333 | sequence_padded, _ = _pad_sequences(sequence_padded, 334 | [pad_tok]*max_length_word, max_length_sentence) 335 | sequence_length, _ = _pad_sequences(sequence_length, 0, 336 | max_length_sentence) 337 | 338 | return sequence_padded, sequence_length 339 | 340 | 341 | def minibatches(data, minibatch_size): 342 | """ 343 | Args: 344 | data: generator of (sentence, tags) tuples 345 | minibatch_size: (int) 346 | 347 | Yields: 348 | list of tuples 349 | 350 | """ 351 | x_batch, y_batch = [], [] 352 | for (x, y) in data: 353 | if len(x_batch) == minibatch_size: 354 | yield x_batch, y_batch 355 | x_batch, y_batch = [], [] 356 | 357 | if type(x[0]) == tuple: 358 | x = zip(*x) 359 | x_batch += [x] 360 | y_batch += [y] 361 | 362 | if len(x_batch) != 0: 363 | yield x_batch, y_batch 364 | 365 | 366 | def get_chunk_type(tok, idx_to_tag): 367 | """ 368 | Args: 369 | tok: id of token, ex 4 370 | idx_to_tag: dictionary {4: "B-PER", ...} 371 | 372 | Returns: 373 | tuple: "B", "PER" 374 | 375 | """ 376 | tag_name = idx_to_tag[tok] 377 | tag_class = tag_name.split('-')[0] 378 | tag_type = tag_name.split('-')[-1] 379 | return tag_class, tag_type 380 | 381 | 382 | def get_chunks(seq, tags): 383 | """Given a sequence of tags, group entities and their position 384 | 385 | Args: 386 | seq: [4, 4, 0, 0, ...] sequence of labels 387 | tags: dict["O"] = 4 388 | 389 | Returns: 390 | list of (chunk_type, chunk_start, chunk_end) 391 | 392 | Example: 393 | seq = [4, 5, 0, 3] 394 | tags = {"B-PER": 4, "I-PER": 5, "B-LOC": 3} 395 | result = [("PER", 0, 2), ("LOC", 3, 4)] 396 | 397 | """ 398 | default = tags[NONE] 399 | idx_to_tag = {idx: tag for tag, idx in tags.items()} 400 | chunks = [] 401 | chunk_type, chunk_start = None, None 402 | for i, tok in enumerate(seq): 403 | # End of a chunk 1 404 | if tok == default and chunk_type is not None: 405 | # Add a chunk. 
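            # e.g. for the docstring example seq = [4, 5, 0, 3]: at i == 2 the
            # token is the default "O" id while a PER chunk is still open, so
            # ("PER", 0, 2) is closed and appended here.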
406 | chunk = (chunk_type, chunk_start, i) 407 | chunks.append(chunk) 408 | chunk_type, chunk_start = None, None 409 | 410 | # End of a chunk + start of a chunk! 411 | elif tok != default: 412 | tok_chunk_class, tok_chunk_type = get_chunk_type(tok, idx_to_tag) 413 | if chunk_type is None: 414 | chunk_type, chunk_start = tok_chunk_type, i 415 | elif tok_chunk_type != chunk_type or tok_chunk_class == "B": 416 | chunk = (chunk_type, chunk_start, i) 417 | chunks.append(chunk) 418 | chunk_type, chunk_start = tok_chunk_type, i 419 | else: 420 | pass 421 | 422 | # end condition 423 | if chunk_type is not None: 424 | chunk = (chunk_type, chunk_start, len(seq)) 425 | chunks.append(chunk) 426 | 427 | return chunks 428 | -------------------------------------------------------------------------------- /nerlogparser/model/evaluate.py: -------------------------------------------------------------------------------- 1 | from nerlogparser.model.data_utils import CoNLLDataset 2 | from nerlogparser.model.ner_model import NERModel 3 | from nerlogparser.model.config import Config 4 | 5 | 6 | def align_data(data): 7 | """Given dict with lists, creates aligned strings 8 | 9 | Adapted from Assignment 3 of CS224N 10 | 11 | Args: 12 | data: (dict) data["x"] = ["I", "love", "you"] 13 | (dict) data["y"] = ["O", "O", "O"] 14 | 15 | Returns: 16 | data_aligned: (dict) data_align["x"] = "I love you" 17 | data_align["y"] = "O O O " 18 | 19 | """ 20 | spacings = [max([len(seq[i]) for seq in data.values()]) 21 | for i in range(len(data[list(data.keys())[0]]))] 22 | data_aligned = dict() 23 | 24 | # for each entry, create aligned string 25 | for key, seq in data.items(): 26 | str_aligned = "" 27 | for token, spacing in zip(seq, spacings): 28 | str_aligned += token + " " * (spacing - len(token) + 1) 29 | 30 | data_aligned[key] = str_aligned 31 | 32 | return data_aligned 33 | 34 | 35 | def interactive_shell(model): 36 | """Creates interactive shell to play with model 37 | 38 | Args: 39 | model: instance of NERModel 40 | 41 | """ 42 | model.logger.info(""" 43 | This is an interactive mode. 44 | To exit, enter 'exit'. 
45 | You can enter a sentence like 46 | input> I love Paris""") 47 | 48 | while True: 49 | sentence = input("input> ") 50 | words_raw = sentence.strip().split(" ") 51 | 52 | if words_raw == ["exit"]: 53 | break 54 | 55 | preds = model.predict(words_raw) 56 | to_print = align_data({"input": words_raw, "output": preds}) 57 | 58 | for key, seq in to_print.items(): 59 | model.logger.info(seq) 60 | 61 | 62 | def main(): 63 | # create instance of config 64 | config = Config() 65 | 66 | # build model 67 | model = NERModel(config) 68 | model.build() 69 | model.restore_session(config.dir_model) 70 | 71 | # create dataset 72 | test = CoNLLDataset(config.filename_test, config.processing_word, 73 | config.processing_tag, config.max_iter) 74 | 75 | # evaluate and interact 76 | model.evaluate(test) 77 | interactive_shell(model) 78 | 79 | 80 | if __name__ == "__main__": 81 | main() 82 | -------------------------------------------------------------------------------- /nerlogparser/model/general_utils.py: -------------------------------------------------------------------------------- 1 | import time 2 | import sys 3 | import logging 4 | import numpy as np 5 | 6 | 7 | def get_logger(filename): 8 | """Return a logger instance that writes in filename 9 | 10 | Args: 11 | filename: (string) path to log.txt 12 | 13 | Returns: 14 | logger: (instance of logger) 15 | 16 | """ 17 | logger = logging.getLogger('logger') 18 | logger.setLevel(logging.DEBUG) 19 | logging.basicConfig(format='%(message)s', level=logging.DEBUG) 20 | handler = logging.FileHandler(filename) 21 | handler.setLevel(logging.DEBUG) 22 | handler.setFormatter(logging.Formatter( 23 | '%(asctime)s:%(levelname)s: %(message)s')) 24 | logging.getLogger().addHandler(handler) 25 | 26 | return logger 27 | 28 | 29 | class Progbar(object): 30 | """Progbar class copied from keras (https://github.com/fchollet/keras/) 31 | 32 | Displays a progress bar. 33 | Small edit : added strict arg to update 34 | # Arguments 35 | target: Total number of steps expected. 36 | interval: Minimum visual progress update interval (in seconds). 37 | """ 38 | 39 | def __init__(self, target, width=30, verbose=1): 40 | self.width = width 41 | self.target = target 42 | self.sum_values = {} 43 | self.unique_values = [] 44 | self.start = time.time() 45 | self.total_width = 0 46 | self.seen_so_far = 0 47 | self.verbose = verbose 48 | 49 | def update(self, current, values=[], exact=[], strict=[]): 50 | """ 51 | Updates the progress bar. 52 | # Arguments 53 | current: Index of current step. 54 | values: List of tuples (name, value_for_last_step). 55 | The progress bar will display averages for these values. 56 | exact: List of tuples (name, value_for_last_step). 57 | The progress bar will display these values directly. 
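            strict: List of tuples (name, value).
                The progress bar stores and displays these values as-is,
                without averaging.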
58 | """ 59 | 60 | for k, v in values: 61 | if k not in self.sum_values: 62 | self.sum_values[k] = [v * (current - self.seen_so_far), 63 | current - self.seen_so_far] 64 | self.unique_values.append(k) 65 | else: 66 | self.sum_values[k][0] += v * (current - self.seen_so_far) 67 | self.sum_values[k][1] += (current - self.seen_so_far) 68 | for k, v in exact: 69 | if k not in self.sum_values: 70 | self.unique_values.append(k) 71 | self.sum_values[k] = [v, 1] 72 | 73 | for k, v in strict: 74 | if k not in self.sum_values: 75 | self.unique_values.append(k) 76 | self.sum_values[k] = v 77 | 78 | self.seen_so_far = current 79 | 80 | now = time.time() 81 | if self.verbose == 1: 82 | prev_total_width = self.total_width 83 | sys.stdout.write("\b" * prev_total_width) 84 | sys.stdout.write("\r") 85 | 86 | numdigits = int(np.floor(np.log10(self.target))) + 1 87 | barstr = '%%%dd/%%%dd [' % (numdigits, numdigits) 88 | bar = barstr % (current, self.target) 89 | prog = float(current)/self.target 90 | prog_width = int(self.width*prog) 91 | if prog_width > 0: 92 | bar += ('='*(prog_width-1)) 93 | if current < self.target: 94 | bar += '>' 95 | else: 96 | bar += '=' 97 | bar += ('.'*(self.width-prog_width)) 98 | bar += ']' 99 | sys.stdout.write(bar) 100 | self.total_width = len(bar) 101 | 102 | if current: 103 | time_per_unit = (now - self.start) / current 104 | else: 105 | time_per_unit = 0 106 | eta = time_per_unit*(self.target - current) 107 | info = '' 108 | if current < self.target: 109 | info += ' - ETA: %ds' % eta 110 | else: 111 | info += ' - %ds' % (now - self.start) 112 | for k in self.unique_values: 113 | if type(self.sum_values[k]) is list: 114 | info += ' - %s: %.4f' % (k, 115 | self.sum_values[k][0] / max(1, self.sum_values[k][1])) 116 | else: 117 | info += ' - %s: %s' % (k, self.sum_values[k]) 118 | 119 | self.total_width += len(info) 120 | if prev_total_width > self.total_width: 121 | info += ((prev_total_width-self.total_width) * " ") 122 | 123 | sys.stdout.write(info) 124 | sys.stdout.flush() 125 | 126 | if current >= self.target: 127 | sys.stdout.write("\n") 128 | 129 | if self.verbose == 2: 130 | if current >= self.target: 131 | info = '%ds' % (now - self.start) 132 | for k in self.unique_values: 133 | info += ' - %s: %.4f' % (k, self.sum_values[k][0] / max(1, self.sum_values[k][1])) 134 | sys.stdout.write(info + "\n") 135 | 136 | def add(self, n, values=[]): 137 | self.update(self.seen_so_far+n, values) 138 | -------------------------------------------------------------------------------- /nerlogparser/model/ner_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | 5 | from nerlogparser.model.data_utils import minibatches, pad_sequences, get_chunks 6 | from nerlogparser.model.general_utils import Progbar 7 | from nerlogparser.model.base_model import BaseModel 8 | 9 | 10 | class NERModel(BaseModel): 11 | """Specialized class of Model for NER""" 12 | 13 | def __init__(self, config): 14 | super(NERModel, self).__init__(config) 15 | self.idx_to_tag = {idx: tag for tag, idx in 16 | self.config.vocab_tags.items()} 17 | 18 | def add_placeholders(self): 19 | """Define placeholders = entries to computational graph""" 20 | # shape = (batch size, max length of sentence in batch) 21 | self.word_ids = tf.placeholder(tf.int32, shape=[None, None], 22 | name="word_ids") 23 | 24 | # shape = (batch size) 25 | self.sequence_lengths = tf.placeholder(tf.int32, shape=[None], 26 | name="sequence_lengths") 27 | 28 | # 
shape = (batch size, max length of sentence, max length of word) 29 | self.char_ids = tf.placeholder(tf.int32, shape=[None, None, None], 30 | name="char_ids") 31 | 32 | # shape = (batch_size, max_length of sentence) 33 | self.word_lengths = tf.placeholder(tf.int32, shape=[None, None], 34 | name="word_lengths") 35 | 36 | # shape = (batch size, max length of sentence in batch) 37 | self.labels = tf.placeholder(tf.int32, shape=[None, None], 38 | name="labels") 39 | 40 | # hyper parameters 41 | self.dropout = tf.placeholder(dtype=tf.float32, shape=[], 42 | name="dropout") 43 | self.lr = tf.placeholder(dtype=tf.float32, shape=[], 44 | name="lr") 45 | 46 | def get_feed_dict(self, words, labels=None, lr=None, dropout=None): 47 | """Given some data, pad it and build a feed dictionary 48 | 49 | Args: 50 | words: list of sentences. A sentence is a list of ids of a list of 51 | words. A word is a list of ids 52 | labels: list of ids 53 | lr: (float) learning rate 54 | dropout: (float) keep prob 55 | 56 | Returns: 57 | dict {placeholder: value} 58 | 59 | """ 60 | # perform padding of the given data 61 | if self.config.use_chars: 62 | char_ids, word_ids = zip(*words) 63 | word_ids, sequence_lengths = pad_sequences(word_ids, 0) 64 | char_ids, word_lengths = pad_sequences(char_ids, pad_tok=0, nlevels=2) 65 | else: 66 | word_ids, sequence_lengths = pad_sequences(words, 0) 67 | 68 | # build feed dictionary 69 | feed = { 70 | self.word_ids: word_ids, 71 | self.sequence_lengths: sequence_lengths 72 | } 73 | 74 | if self.config.use_chars: 75 | feed[self.char_ids] = char_ids 76 | feed[self.word_lengths] = word_lengths 77 | 78 | if labels is not None: 79 | labels, _ = pad_sequences(labels, 0) 80 | feed[self.labels] = labels 81 | 82 | if lr is not None: 83 | feed[self.lr] = lr 84 | 85 | if dropout is not None: 86 | feed[self.dropout] = dropout 87 | 88 | return feed, sequence_lengths 89 | 90 | def add_word_embeddings_op(self): 91 | """Defines self.word_embeddings 92 | 93 | If self.config.embeddings is not None and is a np array initialized 94 | with pre-trained word vectors, the word embeddings is just a look-up 95 | and we don't train the vectors. Otherwise, a random matrix with 96 | the correct shape is initialized. 
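        The resulting self.word_embeddings has shape
        (batch size, max sentence length, dim_word), with an extra
        2 * hidden_size_char dimensions per word appended when use_chars is
        True (the char bi-LSTM outputs are concatenated to the word vectors).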
97 | """ 98 | with tf.variable_scope("words"): 99 | if self.config.embeddings is None: 100 | self.logger.info("WARNING: randomly initializing word vectors") 101 | _word_embeddings = tf.get_variable( 102 | name="_word_embeddings", 103 | dtype=tf.float32, 104 | shape=[self.config.nwords, self.config.dim_word]) 105 | else: 106 | _word_embeddings = tf.Variable( 107 | self.config.embeddings, 108 | name="_word_embeddings", 109 | dtype=tf.float32, 110 | trainable=self.config.train_embeddings) 111 | 112 | word_embeddings = tf.nn.embedding_lookup(_word_embeddings, self.word_ids, name="word_embeddings") 113 | 114 | with tf.variable_scope("chars"): 115 | if self.config.use_chars: 116 | # get char embeddings matrix 117 | _char_embeddings = tf.get_variable( 118 | name="_char_embeddings", 119 | dtype=tf.float32, 120 | shape=[self.config.nchars, self.config.dim_char]) 121 | char_embeddings = tf.nn.embedding_lookup(_char_embeddings, 122 | self.char_ids, name="char_embeddings") 123 | 124 | # put the time dimension on axis=1 125 | s = tf.shape(char_embeddings) 126 | char_embeddings = tf.reshape(char_embeddings, 127 | shape=[s[0]*s[1], s[-2], self.config.dim_char]) 128 | word_lengths = tf.reshape(self.word_lengths, shape=[s[0]*s[1]]) 129 | 130 | # bi lstm on chars 131 | cell_fw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_char, 132 | state_is_tuple=True) 133 | cell_bw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_char, 134 | state_is_tuple=True) 135 | _output = tf.nn.bidirectional_dynamic_rnn( 136 | cell_fw, cell_bw, char_embeddings, 137 | sequence_length=word_lengths, dtype=tf.float32) 138 | 139 | # read and concat output 140 | _, ((_, output_fw), (_, output_bw)) = _output 141 | output = tf.concat([output_fw, output_bw], axis=-1) 142 | 143 | # shape = (batch size, max sentence length, char hidden size) 144 | output = tf.reshape(output, 145 | shape=[s[0], s[1], 2*self.config.hidden_size_char]) 146 | word_embeddings = tf.concat([word_embeddings, output], axis=-1) 147 | 148 | self.word_embeddings = tf.nn.dropout(word_embeddings, self.dropout) 149 | 150 | def add_logits_op(self): 151 | """Defines self.logits 152 | 153 | For each word in each sentence of the batch, it corresponds to a vector 154 | of scores, of dimension equal to the number of tags. 155 | """ 156 | with tf.variable_scope("bi-lstm"): 157 | cell_fw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_lstm) 158 | cell_bw = tf.contrib.rnn.LSTMCell(self.config.hidden_size_lstm) 159 | (output_fw, output_bw), _ = tf.nn.bidirectional_dynamic_rnn( 160 | cell_fw, cell_bw, self.word_embeddings, 161 | sequence_length=self.sequence_lengths, dtype=tf.float32) 162 | output = tf.concat([output_fw, output_bw], axis=-1) 163 | output = tf.nn.dropout(output, self.dropout) 164 | 165 | with tf.variable_scope("proj"): 166 | W = tf.get_variable("W", dtype=tf.float32, 167 | shape=[2*self.config.hidden_size_lstm, self.config.ntags]) 168 | 169 | b = tf.get_variable("b", shape=[self.config.ntags], 170 | dtype=tf.float32, initializer=tf.zeros_initializer()) 171 | 172 | nsteps = tf.shape(output)[1] 173 | output = tf.reshape(output, [-1, 2*self.config.hidden_size_lstm]) 174 | pred = tf.matmul(output, W) + b 175 | self.logits = tf.reshape(pred, [-1, nsteps, self.config.ntags]) 176 | 177 | def add_pred_op(self): 178 | """Defines self.labels_pred 179 | 180 | This op is defined only in the case where we don't use a CRF since in 181 | that case we can make the prediction "in the graph" (thanks to tf 182 | functions in other words). 
With the CRF, as the inference is coded 183 | in Python and not in pure TensorFlow, we have to make the prediction 184 | outside the graph. 185 | """ 186 | if not self.config.use_crf: 187 | self.labels_pred = tf.cast(tf.argmax(self.logits, axis=-1), 188 | tf.int32) 189 | 190 | def add_loss_op(self): 191 | """Defines the loss""" 192 | if self.config.use_crf: 193 | log_likelihood, trans_params = tf.contrib.crf.crf_log_likelihood( 194 | self.logits, self.labels, self.sequence_lengths) 195 | self.trans_params = trans_params # need to evaluate it for decoding 196 | self.loss = tf.reduce_mean(-log_likelihood) 197 | else: 198 | losses = tf.nn.sparse_softmax_cross_entropy_with_logits( 199 | logits=self.logits, labels=self.labels) 200 | mask = tf.sequence_mask(self.sequence_lengths) 201 | losses = tf.boolean_mask(losses, mask) 202 | self.loss = tf.reduce_mean(losses) 203 | 204 | # for tensorboard 205 | tf.summary.scalar("loss", self.loss) 206 | 207 | def build(self): 208 | # NER specific functions 209 | self.add_placeholders() 210 | self.add_word_embeddings_op() 211 | self.add_logits_op() 212 | self.add_pred_op() 213 | self.add_loss_op() 214 | 215 | # Generic functions that add training op and initialize session 216 | self.add_train_op(self.config.lr_method, self.lr, self.loss, 217 | self.config.clip) 218 | self.initialize_session() # now self.sess is defined and vars are init 219 | 220 | def predict_batch(self, words): 221 | """ 222 | Args: 223 | words: list of sentences 224 | 225 | Returns: 226 | labels_pred: list of labels for each sentence 227 | sequence_length 228 | 229 | """ 230 | fd, sequence_lengths = self.get_feed_dict(words, dropout=1.0) 231 | 232 | if self.config.use_crf: 233 | # get tag scores and transition params of CRF 234 | viterbi_sequences = [] 235 | logits, trans_params = self.sess.run( 236 | [self.logits, self.trans_params], feed_dict=fd) 237 | 238 | # iterate over the sentences because no batching in viterbi_decode 239 | for logit, sequence_length in zip(logits, sequence_lengths): 240 | logit = logit[:sequence_length] # keep only the valid steps 241 | viterbi_seq, viterbi_score = tf.contrib.crf.viterbi_decode( 242 | logit, trans_params) 243 | viterbi_sequences += [viterbi_seq] 244 | 245 | return viterbi_sequences, sequence_lengths 246 | 247 | else: 248 | labels_pred = self.sess.run(self.labels_pred, feed_dict=fd) 249 | 250 | return labels_pred, sequence_lengths 251 | 252 | def run_epoch(self, train, dev, epoch): 253 | """Performs one complete pass over the train set and evaluates on dev 254 | 255 | Args: 256 | train: dataset that yields tuple of sentences, tags 257 | dev: dataset 258 | epoch: (int) index of the current epoch 259 | 260 | Returns: 261 | f1: (python float), score to select model on, higher is better 262 | 263 | """ 264 | # progbar stuff for logging 265 | batch_size = self.config.batch_size 266 | nbatches = (len(train) + batch_size - 1) // batch_size 267 | prog = Progbar(target=nbatches) 268 | 269 | # iterate over dataset 270 | for i, (words, labels) in enumerate(minibatches(train, batch_size)): 271 | fd, _ = self.get_feed_dict(words, labels, self.config.lr, 272 | self.config.dropout) 273 | 274 | _, train_loss, summary = self.sess.run( 275 | [self.train_op, self.loss, self.merged], feed_dict=fd) 276 | 277 | prog.update(i + 1, [("train loss", train_loss)]) 278 | 279 | # tensorboard 280 | if i % 10 == 0: 281 | self.file_writer.add_summary(summary, epoch*nbatches + i) 282 | 283 | metrics = self.run_evaluate(dev) 284 | msg = " - ".join(["{} {:04.2f}".format(k, v) 285 | 
for k, v in metrics.items()]) 286 | self.logger.info(msg) 287 | 288 | return metrics["f1"] 289 | 290 | def run_evaluate(self, test): 291 | """Evaluates performance on test set 292 | 293 | Args: 294 | test: dataset that yields tuple of (sentences, tags) 295 | 296 | Returns: 297 | metrics: (dict) metrics["acc"] = 98.4, ... 298 | 299 | """ 300 | accs = [] 301 | correct_preds, total_correct, total_preds = 0., 0., 0. 302 | for words, labels in minibatches(test, self.config.batch_size): 303 | labels_pred, sequence_lengths = self.predict_batch(words) 304 | 305 | for lab, lab_pred, length in zip(labels, labels_pred, 306 | sequence_lengths): 307 | lab = lab[:length] 308 | lab_pred = lab_pred[:length] 309 | accs += [a==b for (a, b) in zip(lab, lab_pred)] 310 | 311 | lab_chunks = set(get_chunks(lab, self.config.vocab_tags)) 312 | lab_pred_chunks = set(get_chunks(lab_pred, 313 | self.config.vocab_tags)) 314 | 315 | correct_preds += len(lab_chunks & lab_pred_chunks) 316 | total_preds += len(lab_pred_chunks) 317 | total_correct += len(lab_chunks) 318 | 319 | p = correct_preds / total_preds if correct_preds > 0 else 0 320 | r = correct_preds / total_correct if correct_preds > 0 else 0 321 | f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0 322 | acc = np.mean(accs) 323 | 324 | return {"acc": 100*acc, "f1": 100*f1} 325 | 326 | def predict(self, words_raw): 327 | """Returns list of tags 328 | 329 | Args: 330 | words_raw: list of words (string), just one sentence (no batch) 331 | 332 | Returns: 333 | preds: list of tags (string), one for each word in the sentence 334 | 335 | """ 336 | words = [self.config.processing_word(w) for w in words_raw] 337 | if type(words[0]) == tuple: 338 | words = zip(*words) 339 | pred_ids, _ = self.predict_batch([words]) 340 | preds = [self.idx_to_tag[idx] for idx in list(pred_ids[0])] 341 | 342 | return preds 343 | -------------------------------------------------------------------------------- /nerlogparser/model/train.py: -------------------------------------------------------------------------------- 1 | from nerlogparser.model.data_utils import CoNLLDataset 2 | from nerlogparser.model.ner_model import NERModel 3 | from nerlogparser.model.config import Config 4 | 5 | 6 | def main(): 7 | # create instance of config 8 | config = Config() 9 | 10 | # build model 11 | model = NERModel(config) 12 | model.build() 13 | # model.restore_session("results/crf/model.weights/") # optional, restore weights 14 | # model.reinitialize_weights("proj") 15 | 16 | # create datasets 17 | dev = CoNLLDataset(config.filename_dev, config.processing_word, 18 | config.processing_tag, config.max_iter) 19 | train = CoNLLDataset(config.filename_train, config.processing_word, 20 | config.processing_tag, config.max_iter) 21 | 22 | # train model 23 | model.train(train, dev) 24 | 25 | if __name__ == "__main__": 26 | main() 27 | -------------------------------------------------------------------------------- /nerlogparser/nerlogparser.py: -------------------------------------------------------------------------------- 1 | import pprint 2 | from optparse import OptionParser 3 | from collections import OrderedDict 4 | from nerlogparser.model.ner_model import NERModel 5 | from nerlogparser.model.config import Config 6 | from nerlogparser.output.to_json import ToJson 7 | 8 | 9 | class Nerlogparser(object): 10 | def __init__(self): 11 | self.model = None 12 | self.config = None 13 | self.master_label = {} 14 | 15 | self.__load_pretrained_model() 16 | self.__load_label() 17 | 18 | def 
__load_pretrained_model(self): 19 | # create instance of config 20 | self.config = Config() 21 | 22 | # load pretrained model 23 | self.model = NERModel(self.config) 24 | self.model.build() 25 | self.model.restore_session(self.config.dir_model) 26 | 27 | def __load_label(self): 28 | # load NER label and its corresponding human-readable field label 29 | with open(self.config.label_file, 'r') as f: 30 | label = f.readlines() 31 | 32 | labels = {} 33 | for line in label: 34 | line_split = line.split(' ') 35 | ner_label, final_label = line_split[0], line_split[1] 36 | labels[ner_label] = final_label.rstrip() 37 | 38 | self.master_label = labels 39 | 40 | def __get_per_entity(self, words_raw, ner_label): 41 | # one entity can contain one or more words 42 | entity = OrderedDict() 43 | for index, label in enumerate(ner_label): 44 | if '-' in label: 45 | main_label = label.split('-')[1] 46 | else: 47 | main_label = label 48 | 49 | if main_label not in entity.keys(): 50 | entity[main_label] = [] 51 | 52 | entity[main_label].append(words_raw[index]) 53 | 54 | # one entity is now one sentence 55 | final_entity = OrderedDict() 56 | for main_label, words in entity.items(): 57 | final_label = self.master_label[main_label] 58 | final_entity[final_label] = ' '.join(words) 59 | 60 | if 'message' not in final_entity.keys(): 61 | final_entity['message'] = '' 62 | 63 | return final_entity 64 | 65 | def parse_logs(self, log_file): 66 | # parse log files using pretrained model 67 | raw_logs = {} 68 | parsed_logs = OrderedDict() 69 | parsed_log_index = 0 70 | with open(log_file) as f: 71 | for line_index, line in enumerate(f): 72 | if line not in ['\n', '\r\n']: 73 | raw_logs[parsed_log_index] = line 74 | words_raw = line.strip().split() 75 | 76 | ner_label = self.model.predict(words_raw) 77 | parsed = self.__get_per_entity(words_raw, ner_label) 78 | parsed_logs[parsed_log_index] = parsed 79 | parsed_log_index += 1 80 | 81 | return parsed_logs 82 | 83 | 84 | def main(): 85 | parser = OptionParser(usage='usage: nerlogparser [options]') 86 | parser.add_option('-i', '--input', 87 | action='store', 88 | dest='input_file', 89 | help='Input log file.') 90 | parser.add_option('-o', '--output', 91 | action='store', 92 | dest='output_file', 93 | help='Parsed log file.') 94 | 95 | # get options 96 | (options, args) = parser.parse_args() 97 | input_file = options.input_file 98 | output_file = options.output_file 99 | 100 | if options.input_file: 101 | # parse log file 102 | nerlogparser = Nerlogparser() 103 | parsed_results = nerlogparser.parse_logs(input_file) 104 | 105 | if options.output_file: 106 | print('Write results to', output_file) 107 | ToJson.write_to_json(parsed_results, output_file) 108 | 109 | else: 110 | print('No output file. 
Print parsing results on terminal.') 111 | for line_id, parsed in parsed_results.items(): 112 | print('Line:', line_id) 113 | pprint.pprint(parsed) 114 | print() 115 | 116 | else: 117 | print('Please see help: nerlogparser -h') 118 | 119 | 120 | if __name__ == "__main__": 121 | main() 122 | -------------------------------------------------------------------------------- /nerlogparser/output/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/studiawan/nerlogparser/4dc3d955f735ea5496557ee76378a38b5746e425/nerlogparser/output/__init__.py -------------------------------------------------------------------------------- /nerlogparser/output/to_json.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | class ToJson(object): 5 | 6 | @staticmethod 7 | def write_to_json(parsed_logs, output_file): 8 | # write a dictionary to json file 9 | with open(output_file, 'w') as f: 10 | json.dump(parsed_logs, f) 11 | -------------------------------------------------------------------------------- /nerlogparser/preprocessing/Preprocessing.py: -------------------------------------------------------------------------------- 1 | import os 2 | import errno 3 | import pickle 4 | from configparser import ConfigParser 5 | from nerlogparser.grammar.authlog import AuthLog 6 | from nerlogparser.grammar.daemonlog import DaemonLog 7 | from nerlogparser.grammar.debuglog import DebugLog 8 | from nerlogparser.grammar.dmesglog import DmesgLog 9 | from nerlogparser.grammar.kernellog import KernelLog 10 | from nerlogparser.grammar.messageslog import MessagesLog 11 | from nerlogparser.grammar.csvlog import CSVLog 12 | from nerlogparser.grammar.bluegenelog import BlueGeneLog 13 | from nerlogparser.grammar.kippolog import KippoLog 14 | from nerlogparser.grammar.proxifierlog import ProxifierLog 15 | from nerlogparser.grammar.weblog import WebLog 16 | from nerlogparser.grammar.zookeeperlog import ZookeeperLog 17 | 18 | 19 | class Preprocessing(object): 20 | """This class does four main tasks: 21 | 1. Parse log entries based on developer-defined grammar 22 | 2. Put punctuation between parsed entities 23 | 3. Save the punctuation results to files 24 | 4. Save parsed log entries to pickle files to be used in Splitting.py. 25 | 26 | """ 27 | def __init__(self, data): 28 | self.dataset = data 29 | self.dataset_conf = {} 30 | self.files = {} 31 | 32 | @staticmethod 33 | def __check_path(path): 34 | """Check whether a path exists. If it does not exist, create it. 35 | 36 | Parameters 37 | ---------- 38 | path : str 39 | Path of a directory to be checked. 
40 | """ 41 | try: 42 | os.makedirs(path) 43 | except OSError as exception: 44 | if exception.errno != errno.EEXIST: 45 | raise 46 | 47 | def __get_dataset(self): 48 | # get dataset configuration 49 | current_path = os.path.dirname(os.path.realpath(__file__)) 50 | dataset_config_path = os.path.join(current_path, 'config', 'datasets.conf') 51 | 52 | # read dataset path from .conf file 53 | parser = ConfigParser() 54 | parser.read(dataset_config_path) 55 | for section_name in parser.sections(): 56 | options = {} 57 | for name, value in parser.items(section_name): 58 | options[name] = value 59 | self.dataset_conf[section_name] = options 60 | 61 | # get dataset and groundtruth path 62 | dataset_path = os.path.join(self.dataset_conf['main']['dataset_path'], self.dataset) 63 | groundtruth_path = os.path.join(self.dataset_conf['main']['groundtruth_path'], self.dataset) 64 | groundtruth_pickle_path = os.path.join(self.dataset_conf['main']['groundtruth_pickle'], self.dataset) 65 | self.__check_path(groundtruth_path) 66 | self.__check_path(groundtruth_pickle_path) 67 | filenames = os.listdir(dataset_path) 68 | 69 | # get full path of each filename 70 | for filename in filenames: 71 | self.files[filename] = { 72 | 'log_path': os.path.join(dataset_path, filename), 73 | 'groundtruth_path': os.path.join(groundtruth_path, filename), 74 | 'groundtruth_pickle': os.path.join(groundtruth_pickle_path, filename), 75 | 'type': filename.split('.')[0] 76 | } 77 | 78 | def __get_grammar(self, file_type): 79 | if file_type == 'auth': 80 | authlog_grammar = AuthLog(self.dataset) 81 | return authlog_grammar 82 | 83 | elif file_type == 'daemon': 84 | daemonlog_grammar = DaemonLog(self.dataset) 85 | return daemonlog_grammar 86 | 87 | elif file_type == 'debug': 88 | debuglog_grammar = DebugLog(self.dataset) 89 | return debuglog_grammar.get_grammar() 90 | 91 | elif file_type == 'dmesg': 92 | dmesglog_grammar = DmesgLog(self.dataset) 93 | return dmesglog_grammar.get_grammar() 94 | 95 | elif file_type == 'kern': 96 | kernellog_grammar = KernelLog(self.dataset) 97 | return kernellog_grammar.get_grammar() 98 | 99 | elif file_type == 'messages' or file_type == 'syslog': 100 | messageslog_grammar = MessagesLog(self.dataset) 101 | return messageslog_grammar.get_grammar() 102 | 103 | elif file_type == 'csv': 104 | messageslog_grammar = CSVLog(self.dataset) 105 | return messageslog_grammar 106 | 107 | elif file_type == 'bgl2': 108 | bluegenelog_grammar = BlueGeneLog(self.dataset) 109 | return bluegenelog_grammar 110 | 111 | elif file_type == 'kippo': 112 | kippolog_grammar = KippoLog(self.dataset) 113 | return kippolog_grammar 114 | 115 | elif file_type == 'proxifier': 116 | proxifierlog_grammar = ProxifierLog(self.dataset) 117 | return proxifierlog_grammar 118 | 119 | elif file_type == 'access': 120 | weblog_grammar = WebLog(self.dataset) 121 | return weblog_grammar 122 | 123 | elif file_type == 'zookeeper': 124 | zookeeperlog_grammar = ZookeeperLog(self.dataset) 125 | return zookeeperlog_grammar 126 | 127 | @staticmethod 128 | def __set_punctuation(parsed_line): 129 | # set punctuation from parsed line 130 | punctuated = '' 131 | for field_name, field_value in parsed_line.items(): 132 | if field_value != '' and field_value != ' ': 133 | if field_name == 'message' or field_name == 'client_agent': 134 | # if there is no period, then add one 135 | field_value = field_value.rstrip() 136 | if field_value[-1] != '.': 137 | punctuated += field_value + ' .PERIOD\n' 138 | else: 139 | punctuated += field_value + ' .PERIOD\n' 140 | else: 
141 | punctuated += field_value + ' ,COMMA ' 142 | else: 143 | if field_name == 'message': 144 | punctuated += '\n' 145 | elif field_name == 'client_agent': 146 | punctuated += ' .PERIOD\n' 147 | 148 | return punctuated 149 | 150 | def punctuate(self): 151 | # get dataset 152 | self.__get_dataset() 153 | 154 | # punctuate log entries 155 | for filename, properties in self.files.items(): 156 | print(self.dataset, filename) 157 | 158 | # get grammar based on file type 159 | file_type = properties['type'] 160 | grammar = self.__get_grammar(file_type) 161 | 162 | # prepare output files 163 | f_groundtruth = open(properties['groundtruth_path'], 'w') 164 | 165 | # parse log entries 166 | parsed_list = [] 167 | if file_type != 'csv': 168 | with open(properties['log_path'], 'r') as f: 169 | for line in f: 170 | parsed = grammar.parse_log(line) 171 | parsed_list.append(parsed) 172 | 173 | # set punctuation 174 | punctuated_line = self.__set_punctuation(parsed) 175 | f_groundtruth.write(punctuated_line) 176 | 177 | else: 178 | parsed_lines = grammar.parse_log() 179 | for parsed_line in parsed_lines: 180 | parsed_list.append(parsed_line) 181 | punctuated_line = self.__set_punctuation(parsed_line) 182 | f_groundtruth.write(punctuated_line) 183 | 184 | f_groundtruth.close() 185 | 186 | # save parsed list to a pickle file 187 | with open(properties['groundtruth_pickle'], 'wb') as f_pickle: 188 | pickle.dump(parsed_list, f_pickle, protocol=pickle.HIGHEST_PROTOCOL) 189 | 190 | 191 | if __name__ == '__main__': 192 | # put punctuation (comma and period) to all datasets 193 | datasets = [ 194 | 'casper-rw', 195 | 'dfrws-2009-jhuisi', 196 | 'dfrws-2009-nssal', 197 | 'dfrws-2016', 198 | 'honeynet-challenge5', 199 | 'honeynet-challenge7', 200 | 'bgl2', 201 | 'kippo', 202 | 'proxifier', 203 | 'secrepo-accesslog', 204 | 'zookeeper' 205 | ] 206 | 207 | for dataset in datasets: 208 | preprocess = Preprocessing(dataset) 209 | preprocess.punctuate() 210 | -------------------------------------------------------------------------------- /nerlogparser/preprocessing/Splitting.py: -------------------------------------------------------------------------------- 1 | import os 2 | import errno 3 | import shutil 4 | import pickle 5 | from configparser import ConfigParser 6 | from math import floor 7 | from nerlogparser.dataformat.toconll import ToConll 8 | 9 | 10 | class Splitting(object): 11 | # split each file to three parts: train, dev, and test 12 | # compositition: train: 60, dev: 20, test: 20 13 | def __init__(self, data): 14 | self.dataset = data 15 | self.dataset_conf = {} 16 | self.files = {} 17 | self.punctuation_path = '' 18 | self.conll_path = '' 19 | self.train_file = '' 20 | self.dev_file = '' 21 | self.test_file = '' 22 | self.train_file_conll = '' 23 | self.dev_file_conll = '' 24 | self.test_file_conll = '' 25 | self.conll_stanford_path = '' 26 | self.train_file_conll_stanford = '' 27 | self.dev_file_conll_stanford = '' 28 | self.test_file_conll_stanford = '' 29 | self.conll_pos_path = '' 30 | self.train_file_conll_pos = '' 31 | self.dev_file_conll_pos = '' 32 | self.test_file_conll_pos = '' 33 | self.csv_path = '' 34 | self.file_csv = '' 35 | self.nltk_tree_path = '' 36 | self.train_file_nltk_tree = '' 37 | self.test_file_nltk_tree = '' 38 | 39 | @staticmethod 40 | def __check_path(path): 41 | # check a path is exist or not. 
if not exist, then create it 42 | try: 43 | os.makedirs(path) 44 | except OSError as exception: 45 | if exception.errno != errno.EEXIST: 46 | raise 47 | 48 | def __set_datapath(self): 49 | # set data path for output files 50 | self.__check_path(self.punctuation_path) 51 | self.train_file = os.path.join(self.punctuation_path, 'ep.train.txt') 52 | self.dev_file = os.path.join(self.punctuation_path, 'ep.dev.txt') 53 | self.test_file = os.path.join(self.punctuation_path, 'ep.test.txt') 54 | 55 | def __set_datapath_conll(self): 56 | # set data path for output files 57 | self.__check_path(self.conll_path) 58 | self.train_file_conll = os.path.join(self.conll_path, 'conll.train.txt') 59 | self.dev_file_conll = os.path.join(self.conll_path, 'conll.dev.txt') 60 | self.test_file_conll = os.path.join(self.conll_path, 'conll.test.txt') 61 | 62 | def __set_datapath_conll_stanford(self): 63 | # set data path for output files 64 | self.__check_path(self.conll_stanford_path) 65 | self.train_file_conll_stanford = os.path.join(self.conll_stanford_path, 'conll.stanford.train.txt') 66 | self.dev_file_conll_stanford = os.path.join(self.conll_stanford_path, 'conll.stanford.dev.txt') 67 | self.test_file_conll_stanford = os.path.join(self.conll_stanford_path, 'conll.stanford.test.txt') 68 | 69 | def __set_datapath_conll_pos(self): 70 | # set data path for output files 71 | self.__check_path(self.conll_pos_path) 72 | self.train_file_conll_pos = os.path.join(self.conll_pos_path, 'conll.pos.train.txt') 73 | self.dev_file_conll_pos = os.path.join(self.conll_pos_path, 'conll.pos.dev.txt') 74 | self.test_file_conll_pos = os.path.join(self.conll_pos_path, 'conll.pos.test.txt') 75 | 76 | def __set_datapath_nltk_tree(self): 77 | # set data path for output files 78 | self.__check_path(self.nltk_tree_path) 79 | self.train_file_nltk_tree = os.path.join(self.nltk_tree_path, 'nltk.tree.train.txt') 80 | self.test_file_nltk_tree = os.path.join(self.nltk_tree_path, 'nltk.tree.test.txt') 81 | 82 | def __set_datapath_csv(self): 83 | self.__check_path(self.csv_path) 84 | self.file_csv = os.path.join(self.csv_path, 'csv.all.txt') 85 | 86 | def __get_dataset(self): 87 | # get dataset configuration 88 | current_path = os.path.dirname(os.path.realpath(__file__)) 89 | dataset_config_path = os.path.join(current_path, 'config', 'datasets.conf') 90 | 91 | # read dataset path from .conf file 92 | parser = ConfigParser() 93 | parser.read(dataset_config_path) 94 | for section_name in parser.sections(): 95 | options = {} 96 | for name, value in parser.items(section_name): 97 | options[name] = value 98 | self.dataset_conf[section_name] = options 99 | 100 | # set output path 101 | self.punctuation_path = self.dataset_conf['main']['punctuation_path'] 102 | self.conll_path = self.dataset_conf['main']['conll_path'] 103 | self.conll_stanford_path = self.dataset_conf['main']['conll_stanford_path'] 104 | self.conll_pos_path = self.dataset_conf['main']['conll_pos_path'] 105 | self.csv_path = self.dataset_conf['main']['csv_path'] 106 | self.nltk_tree_path = self.dataset_conf['main']['nltk_tree_path'] 107 | 108 | # get dataset and groundtruth path 109 | dataset_path = os.path.join(self.dataset_conf['main']['dataset_path'], self.dataset) 110 | groundtruth_path = os.path.join(self.dataset_conf['main']['groundtruth_path'], self.dataset) 111 | groundtruth_pickle = os.path.join(self.dataset_conf['main']['groundtruth_pickle'], self.dataset) 112 | filenames = os.listdir(dataset_path) 113 | 114 | # get full path of each filename 115 | for filename in filenames: 
116 | self.files[filename] = { 117 | 'log_path': os.path.join(dataset_path, filename), 118 | 'groundtruth_path': os.path.join(groundtruth_path, filename), 119 | 'groundtruth_pickle_path': os.path.join(groundtruth_pickle, filename) 120 | } 121 | 122 | def split(self): 123 | # get dataset and output file path 124 | self.__get_dataset() 125 | self.__set_datapath() 126 | 127 | # open files for train, dev, and test 128 | f_train = open(self.train_file, 'a') 129 | f_dev = open(self.dev_file, 'a') 130 | f_test = open(self.test_file, 'a') 131 | 132 | for filename, properties in self.files.items(): 133 | print('punctuation file:', self.dataset, properties['log_path']) 134 | 135 | with open(properties['groundtruth_path'], 'r') as f: 136 | # read lines and get various length 137 | lines = f.readlines() 138 | 139 | # note: test_length = dev_length 140 | lines_length = len(lines) 141 | train_length = floor(0.6 * lines_length) 142 | dev_length = floor(0.2 * lines_length) 143 | dev_end_index = train_length + dev_length 144 | 145 | # get training, dev, and test data 146 | for line in lines[:train_length]: 147 | f_train.write(line) 148 | 149 | for line in lines[train_length:dev_end_index]: 150 | f_dev.write(line) 151 | 152 | for line in lines[dev_end_index:]: 153 | f_test.write(line) 154 | 155 | # close files 156 | f_train.close() 157 | f_dev.close() 158 | f_test.close() 159 | 160 | def split_conll(self): 161 | # set conll output files and create conll-format instance 162 | self.__set_datapath_conll() 163 | conll = ToConll() 164 | 165 | # open files for train, dev, and test 166 | f_train = open(self.train_file_conll, 'a') 167 | f_dev = open(self.dev_file_conll, 'a') 168 | f_test = open(self.test_file_conll, 'a') 169 | 170 | for filename, properties in self.files.items(): 171 | print('pickle file :', self.dataset, properties['log_path']) 172 | 173 | # parsed_list is list of dictionaries containing parsed log entries 174 | with open(properties['groundtruth_pickle_path'], 'rb') as f_pickle: 175 | parsed_list = pickle.load(f_pickle) 176 | 177 | # note: test_length = dev_length 178 | lines_length = len(parsed_list) 179 | train_length = floor(0.6 * lines_length) 180 | dev_length = floor(0.2 * lines_length) 181 | dev_end_index = train_length + dev_length 182 | 183 | # get training, dev, and test data 184 | f_train.write('-DOCSTART- -X- O O\n\n') 185 | for line in parsed_list[:train_length]: 186 | line = conll.convert(line) 187 | f_train.write(line) 188 | 189 | f_dev.write('-DOCSTART- -X- O O\n\n') 190 | for line in parsed_list[train_length:dev_end_index]: 191 | line = conll.convert(line) 192 | f_dev.write(line) 193 | 194 | f_test.write('-DOCSTART- -X- O O\n\n') 195 | for line in parsed_list[dev_end_index:]: 196 | line = conll.convert(line) 197 | f_test.write(line) 198 | 199 | # close files 200 | f_train.close() 201 | f_dev.close() 202 | f_test.close() 203 | 204 | def split_conll_stanford(self): 205 | # set conll output files and create conll-format instance 206 | self.__set_datapath_conll_stanford() 207 | conll = ToConll() 208 | 209 | # open files for train, dev, and test 210 | f_train = open(self.train_file_conll_stanford, 'a') 211 | f_dev = open(self.dev_file_conll_stanford, 'a') 212 | f_test = open(self.test_file_conll_stanford, 'a') 213 | 214 | for filename, properties in self.files.items(): 215 | print('pickle file :', self.dataset, properties['log_path']) 216 | 217 | # parsed_list is list of dictionaries containing parsed log entries 218 | with open(properties['groundtruth_pickle_path'], 'rb') as f_pickle: 
219 | parsed_list = pickle.load(f_pickle) 220 | 221 | # note: test_length = dev_length 222 | lines_length = len(parsed_list) 223 | train_length = floor(0.6 * lines_length) 224 | dev_length = floor(0.2 * lines_length) 225 | dev_end_index = train_length + dev_length 226 | 227 | # get training, dev, and test data 228 | for line in parsed_list[:train_length]: 229 | line = conll.convert(line, stanford=True) 230 | f_train.write(line) 231 | 232 | for line in parsed_list[train_length:dev_end_index]: 233 | line = conll.convert(line, stanford=True) 234 | f_dev.write(line) 235 | 236 | for line in parsed_list[dev_end_index:]: 237 | line = conll.convert(line, stanford=True) 238 | f_test.write(line) 239 | 240 | # close files 241 | f_train.close() 242 | f_dev.close() 243 | f_test.close() 244 | 245 | def split_conll_pos(self): 246 | # set conll output files and create conll-format instance 247 | self.__set_datapath_conll_pos() 248 | conll = ToConll() 249 | 250 | # open files for train, dev, and test 251 | f_train = open(self.train_file_conll_pos, 'a') 252 | f_dev = open(self.dev_file_conll_pos, 'a') 253 | f_test = open(self.test_file_conll_pos, 'a') 254 | 255 | for filename, properties in self.files.items(): 256 | print('pickle file :', self.dataset, properties['log_path']) 257 | 258 | # parsed_list is list of dictionaries containing parsed log entries 259 | with open(properties['groundtruth_pickle_path'], 'rb') as f_pickle: 260 | parsed_list = pickle.load(f_pickle) 261 | 262 | # note: test_length = dev_length 263 | lines_length = len(parsed_list) 264 | train_length = floor(0.6 * lines_length) 265 | dev_length = floor(0.2 * lines_length) 266 | dev_end_index = train_length + dev_length 267 | 268 | # get training, dev, and test data 269 | for line in parsed_list[:train_length]: 270 | line = conll.convert(line, ispos=True) 271 | f_train.write(line) 272 | 273 | for line in parsed_list[train_length:dev_end_index]: 274 | line = conll.convert(line, ispos=True) 275 | f_dev.write(line) 276 | 277 | for line in parsed_list[dev_end_index:]: 278 | line = conll.convert(line, ispos=True) 279 | f_test.write(line) 280 | 281 | # close files 282 | f_train.close() 283 | f_dev.close() 284 | f_test.close() 285 | 286 | def split_csv(self, f): 287 | # create conll-format instance 288 | conll = ToConll() 289 | 290 | for filename, properties in self.files.items(): 291 | print('pickle file :', self.dataset, properties['log_path']) 292 | 293 | # parsed_list is list of dictionaries containing parsed log entries 294 | with open(properties['groundtruth_pickle_path'], 'rb') as f_pickle: 295 | parsed_list = pickle.load(f_pickle) 296 | 297 | # get training, dev, and test data 298 | for line_id, line in enumerate(parsed_list): 299 | line_id += 1 300 | line = conll.convert(line, csv=True, csv_line_id=line_id) 301 | f.write(line) 302 | 303 | def split_nltk_tree(self): 304 | # set conll output files and create conll-format instance 305 | self.__set_datapath_nltk_tree() 306 | conll = ToConll() 307 | 308 | # open files for train and test 309 | f_train = open(self.train_file_nltk_tree, 'ab') 310 | f_test = open(self.test_file_nltk_tree, 'ab') 311 | 312 | for filename, properties in self.files.items(): 313 | print('pickle file :', self.dataset, properties['log_path']) 314 | 315 | # parsed_list is list of dictionaries containing parsed log entries 316 | with open(properties['groundtruth_pickle_path'], 'rb') as f_pickle: 317 | parsed_list = pickle.load(f_pickle) 318 | 319 | lines_length = len(parsed_list) 320 | train_length = floor(0.8 * 
lines_length) 321 | 322 | # get training and test data 323 | for line in parsed_list[:train_length]: 324 | line = conll.convert(line, iobtree=True) 325 | pickle.dump(line, f_train) 326 | 327 | for line in parsed_list[train_length:]: 328 | line = conll.convert(line, iobtree=True) 329 | pickle.dump(line, f_test) 330 | 331 | # close files 332 | f_train.close() 333 | f_test.close() 334 | 335 | 336 | def remove_directories(): 337 | # remove output directories 338 | punctuation_path = '/home/hudan/Git/prlogparser/data/punctuation/' 339 | if os.path.isdir(punctuation_path): 340 | shutil.rmtree(punctuation_path) 341 | 342 | conll_path = '/home/hudan/Git/prlogparser/data/conll/' 343 | if os.path.isdir(conll_path): 344 | shutil.rmtree(conll_path) 345 | 346 | conll_path_stanford = '/home/hudan/Git/prlogparser/data/conll-stanford/' 347 | if os.path.isdir(conll_path_stanford): 348 | shutil.rmtree(conll_path_stanford) 349 | 350 | conll_path_pos = '/home/hudan/Git/prlogparser/data/conll-pos/' 351 | if os.path.isdir(conll_path_pos): 352 | shutil.rmtree(conll_path_pos) 353 | 354 | csv_path = '/home/hudan/Git/prlogparser/data/csv/' 355 | if os.path.isdir(csv_path): 356 | shutil.rmtree(csv_path) 357 | 358 | nltk_tree_path = '/home/hudan/Git/prlogparser/data/nltk-tree/' 359 | if os.path.isdir(nltk_tree_path): 360 | shutil.rmtree(nltk_tree_path) 361 | 362 | 363 | def check_path(path): 364 | # check whether the path exists; if it does not, create it 365 | try: 366 | os.makedirs(path) 367 | except OSError as exception: 368 | if exception.errno != errno.EEXIST: 369 | raise 370 | 371 | 372 | def open_csv_file(): 373 | # set csv output files 374 | csv_path = '/home/hudan/Git/prlogparser/data/csv/' 375 | check_path(csv_path) 376 | csv_file = os.path.join(csv_path, 'csv.all.txt') 377 | f_csv = open(csv_file, 'a') 378 | f_csv.write('Sentence #\tWord\tPOS\tTag\n') 379 | 380 | return f_csv 381 | 382 | 383 | if __name__ == '__main__': 384 | datasets = [ 385 | 'casper-rw', 386 | 'dfrws-2009-jhuisi', 387 | 'dfrws-2009-nssal', 388 | 'dfrws-2016', 389 | 'honeynet-challenge5', 390 | 'honeynet-challenge7', 391 | 'bgl2', 392 | 'kippo', 393 | 'proxifier', 394 | 'secrepo-accesslog', 395 | 'zookeeper' 396 | ] 397 | 398 | # clean the output directories first because output files are opened in append mode, 399 | # then run splitting for all datasets 400 | remove_directories() 401 | file_csv = open_csv_file() 402 | for dataset in datasets: 403 | s = Splitting(dataset) 404 | s.split() 405 | s.split_conll() 406 | s.split_conll_pos() 407 | s.split_csv(file_csv) 408 | # s.split_conll_stanford() 409 | # s.split_nltk_tree() 410 | 411 | file_csv.close() 412 | -------------------------------------------------------------------------------- /nerlogparser/preprocessing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/studiawan/nerlogparser/4dc3d955f735ea5496557ee76378a38b5746e425/nerlogparser/preprocessing/__init__.py -------------------------------------------------------------------------------- /nerlogparser/preprocessing/config/datasets.conf: -------------------------------------------------------------------------------- 1 | [main] 2 | dataset_path = /home/hudan/Git/nerlogparser/datasets/ 3 | groundtruth_path = /home/hudan/Git/nerlogparser/groundtruth/ 4 | groundtruth_pickle = /home/hudan/Git/nerlogparser/groundtruth-pickle/ 5 | conll_path = /home/hudan/Git/nerlogparser/data/conll/ 6 | conll_stanford_path = /home/hudan/Git/nerlogparser/data/conll-stanford/ 7
| conll_pos_path = /home/hudan/Git/nerlogparser/data/conll-pos/ 8 | csv_path = /home/hudan/Git/nerlogparser/data/csv/ 9 | nltk_tree_path = /home/hudan/Git/nerlogparser/data/nltk-tree/ -------------------------------------------------------------------------------- /nerlogparser/results/test/events.out.tfevents.1533503273.seitpc80: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/studiawan/nerlogparser/4dc3d955f735ea5496557ee76378a38b5746e425/nerlogparser/results/test/events.out.tfevents.1533503273.seitpc80 -------------------------------------------------------------------------------- /nerlogparser/results/test/model.weights/.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/studiawan/nerlogparser/4dc3d955f735ea5496557ee76378a38b5746e425/nerlogparser/results/test/model.weights/.data-00000-of-00001 -------------------------------------------------------------------------------- /nerlogparser/results/test/model.weights/.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/studiawan/nerlogparser/4dc3d955f735ea5496557ee76378a38b5746e425/nerlogparser/results/test/model.weights/.index -------------------------------------------------------------------------------- /nerlogparser/results/test/model.weights/.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/studiawan/nerlogparser/4dc3d955f735ea5496557ee76378a38b5746e425/nerlogparser/results/test/model.weights/.meta -------------------------------------------------------------------------------- /nerlogparser/results/test/model.weights/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "." 2 | all_model_checkpoint_paths: "." 3 | -------------------------------------------------------------------------------- /nerlogparser/shell/nerlogparser_shell.py: -------------------------------------------------------------------------------- 1 | from nerlogparser.model.ner_model import NERModel 2 | from nerlogparser.model.config import Config 3 | 4 | 5 | def align_data(data): 6 | """Given dict with lists, creates aligned strings 7 | 8 | Adapted from Assignment 3 of CS224N 9 | 10 | Args: 11 | data: (dict) data["x"] = ["I", "love", "you"] 12 | (dict) data["y"] = ["O", "O", "O"] 13 | 14 | Returns: 15 | data_aligned: (dict) data_align["x"] = "I love you" 16 | data_align["y"] = "O O O " 17 | 18 | """ 19 | spacings = [max([len(seq[i]) for seq in data.values()]) 20 | for i in range(len(data[list(data.keys())[0]]))] 21 | data_aligned = dict() 22 | 23 | # for each entry, create aligned string 24 | for key, seq in data.items(): 25 | str_aligned = "" 26 | for token, spacing in zip(seq, spacings): 27 | str_aligned += token + " " * (spacing - len(token) + 1) 28 | 29 | data_aligned[key] = str_aligned 30 | 31 | return data_aligned 32 | 33 | 34 | def interactive_shell(model): 35 | """Creates interactive shell to play with model 36 | 37 | Args: 38 | model: instance of NERModel 39 | 40 | """ 41 | model.logger.info(""" 42 | This is an interactive mode. 43 | To exit, enter 'exit'. 
44 | You can enter a sentence like 45 | input> I love Paris""") 46 | 47 | while True: 48 | sentence = input("input> ") 49 | 50 | words_raw = sentence.strip().split(" ") 51 | 52 | if words_raw == ["exit"]: 53 | break 54 | 55 | preds = model.predict(words_raw) 56 | to_print = align_data({"input": words_raw, "output": preds}) 57 | 58 | for key, seq in to_print.items(): 59 | model.logger.info(seq) 60 | 61 | 62 | def main(): 63 | # create instance of config 64 | config = Config() 65 | 66 | # build model 67 | model = NERModel(config) 68 | model.build() 69 | model.restore_session(config.dir_model) 70 | 71 | # interactive shell 72 | interactive_shell(model) 73 | 74 | 75 | if __name__ == "__main__": 76 | main() 77 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup(name='nerlogparser', 4 | version='0.0.1', 5 | description='Automatic log parser', 6 | long_description='Automatic log parser using named entity recognition in Python', 7 | classifiers=[ 8 | 'Development Status :: 2 - Pre-Alpha', 9 | 'License :: OSI Approved :: Apache Software License', 10 | 'Programming Language :: Python :: 3.5', 11 | ], 12 | keywords='named entity recognition, log parser, log forensics', 13 | url='http://github.com/studiawan/nerlogparser/', 14 | author='Guillaume Genthial, Hudan Studiawan', 15 | author_email='studiawan@gmail.com', 16 | license='Apache', 17 | packages=['nerlogparser'], 18 | entry_points={ 19 | 'console_scripts': [ 20 | 'nerlogparser = nerlogparser.nerlogparser:main' 21 | ], 22 | }, 23 | install_requires=[ 24 | 'tensorflow==1.4.1', 25 | 'nltk' 26 | ], 27 | include_package_data=True, 28 | zip_safe=False) 29 | --------------------------------------------------------------------------------
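Usage sketch: the snippet below is a minimal, illustrative way to drive the trained parser programmatically, assembled from the pattern in nerlogparser/shell/nerlogparser_shell.py and the console entry point declared in setup.py. It assumes the package is installed together with the pretrained weights under nerlogparser/results/test/ and that Config() resolves dir_model to those weights; the log line used as input is a made-up example, not taken from the bundled datasets.

    from nerlogparser.model.config import Config
    from nerlogparser.model.ner_model import NERModel

    # build the model and restore the pretrained session, as in main() of the shell module
    config = Config()
    model = NERModel(config)
    model.build()
    model.restore_session(config.dir_model)

    # label one space-tokenized log line (the line below is a hypothetical example)
    words = 'Jan 12 06:25:01 combo sshd[1234]: session opened for user root'.split(' ')
    labels = model.predict(words)
    for word, label in zip(words, labels):
        print(word, label)

After installation, the same functionality is also exposed through the nerlogparser console script defined in the entry_points section of setup.py.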