├── LICENSE ├── LogBench-O ├── LogBench-O_prefix_1point.zip ├── LogBench-O_prefix_1point_file_level.zip └── LogBench-O_prefix_1point_wo_comments.zip ├── LogBench-T ├── LogBench-T_prefix_1point.zip └── LogBench-T_prefix_1point_file_level.zip ├── README.md ├── build └── code-transformer.jar ├── cases └── generated_cases.csv ├── img ├── empirical_overview.jpg ├── empirical_overview.pdf └── empirical_overview.png └── src ├── Baselines ├── ChatGPT │ └── chatgpt.py ├── Davinci │ └── davinci.py ├── DeepLV │ ├── Helper.py │ ├── Metrics.py │ ├── block_level_LSTM.py │ ├── block_processing │ │ └── block_processing.py │ └── deepLV_cleaner.py ├── Incoder │ └── incoder.py ├── LoGenText-Plus │ ├── README.md │ ├── requirements.txt │ └── results │ │ └── 1 │ │ ├── activemq │ │ ├── translation.context.test │ │ ├── translation.context.test.log │ │ └── translation.context.test.unsort │ │ ├── ambari │ │ ├── translation.context.test │ │ ├── translation.context.test.log │ │ └── translation.context.test.unsort │ │ ├── brooklyn │ │ ├── translation.context.test │ │ ├── translation.context.test.log │ │ └── translation.context.test.unsort │ │ ├── camel │ │ ├── translation.context.test │ │ ├── translation.context.test.log │ │ └── translation.context.test.unsort │ │ ├── cloudstack │ │ ├── translation.context.test │ │ ├── translation.context.test.log │ │ └── translation.context.test.unsort │ │ ├── hadoop │ │ ├── translation.context.test │ │ ├── translation.context.test.log │ │ └── translation.context.test.unsort │ │ ├── hbase │ │ ├── translation.context.test │ │ ├── translation.context.test.log │ │ └── translation.context.test.unsort │ │ ├── hive │ │ ├── translation.context.test │ │ ├── translation.context.test.log │ │ └── translation.context.test.unsort │ │ ├── ignite │ │ ├── translation.context.test │ │ ├── translation.context.test.log │ │ └── translation.context.test.unsort │ │ └── synapse │ │ ├── translation.context.test │ │ ├── translation.context.test.log │ │ └── translation.context.test.unsort ├── README.md ├── StarCoder │ └── starcoder.py ├── WhichVar │ ├── analysis.ipynb │ ├── cleaner.ipynb │ ├── data.json │ ├── model.py │ ├── output.json │ ├── test.json │ └── train.json └── lance │ ├── README.md │ └── lance.py ├── CodeTransformer └── README.md ├── DataCollector ├── check_pom.py ├── download.py ├── filter_pom.py └── get_java.py ├── eva_sem.ipynb └── unixcoder.py /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 
--------------------------------------------------------------------------------
/LogBench-O/LogBench-O_prefix_1point.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/logpai/LogBench/364357338014a6d76d313fc3d8549c02938b6f2e/LogBench-O/LogBench-O_prefix_1point.zip
--------------------------------------------------------------------------------
/LogBench-O/LogBench-O_prefix_1point_file_level.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/logpai/LogBench/364357338014a6d76d313fc3d8549c02938b6f2e/LogBench-O/LogBench-O_prefix_1point_file_level.zip
--------------------------------------------------------------------------------
/LogBench-O/LogBench-O_prefix_1point_wo_comments.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/logpai/LogBench/364357338014a6d76d313fc3d8549c02938b6f2e/LogBench-O/LogBench-O_prefix_1point_wo_comments.zip
--------------------------------------------------------------------------------
/LogBench-T/LogBench-T_prefix_1point.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/logpai/LogBench/364357338014a6d76d313fc3d8549c02938b6f2e/LogBench-T/LogBench-T_prefix_1point.zip
--------------------------------------------------------------------------------
/LogBench-T/LogBench-T_prefix_1point_file_level.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/logpai/LogBench/364357338014a6d76d313fc3d8549c02938b6f2e/LogBench-T/LogBench-T_prefix_1point_file_level.zip
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # LogBench
2 | 
3 | **LogBench is a benchmark for evaluating logging statement generation.**
4 | 
5 | Logging statements are imperative in modern software. They play an important role in reflecting developers' intentions, recording system behavior, and guiding failure diagnosis. LogBench provides a benchmark and toolkit that let you measure your own models and conveniently compare them with existing baseline models.
6 | 
7 | 
8 | If you find our work beneficial to your research, please kindly cite the following paper:
9 | 
10 | + Yichen Li, Yintong Huo, Zhihan Jiang, Renyi Zhong, Pinjia He, Yuxin Su, Lionel C. Briand, and Michael R. Lyu. [Exploring the Effectiveness of LLMs in Automated Logging Generation: An Empirical Study](https://arxiv.org/abs/2307.05950), IEEE Transactions on Software Engineering (TSE), 2024.
11 | 
12 | ## Study overview
13 | ![overview](img/empirical_overview.jpg)
14 | 
15 | The study is fully described in this [paper](https://arxiv.org/abs/2307.05950). LogBench comprises two subsets for evaluating a model's *effectiveness* and *generalizability*, respectively:
16 | 
17 | 1. Effectiveness: **LogBench-O** contains a collection of high-quality logging statements and their associated code contexts.
18 | 2. Generalizability: **LogBench-T** is an unseen code dataset obtained by applying semantics-preserving code transformations to LogBench-O (a loading sketch is shown below).
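Both subsets are distributed as zip archives of Java files, which the baseline scripts under `src/Baselines` read directly as input. Below is a minimal, hedged sketch (not part of the released toolkit) of how one might unpack a split and collect its Java files; the extraction directory name is an illustrative assumption.

```python
# Minimal sketch (assumption: the archive unpacks into plain .java files that the
# baseline scripts under src/Baselines consume). Not part of the released toolkit.
import glob
import os
import zipfile

archive = "LogBench-O/LogBench-O_prefix_1point.zip"
extract_dir = "LogBench-O_prefix_1point"  # illustrative output directory

# Unpack the split and gather the Java files to feed into a baseline script.
with zipfile.ZipFile(archive) as zf:
    zf.extractall(extract_dir)

java_files = glob.glob(os.path.join(extract_dir, "**", "*.java"), recursive=True)
print(f"{len(java_files)} Java files ready to be fed to the baseline scripts")
```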
19 | 
20 | Additionally, LogBench offers various variants to support different settings in logging statement generation, including:
21 | 
22 | * Method-level
23 | * File-level
24 | * Comment-included
25 | * Comment-free
26 | 
27 | ## Repository organization
28 | We currently provide part of the code in the folder `/src`. We will release the full source code after the paper has been accepted.
29 | 
30 | * LogBench-O: The `/LogBench-O` folder contains the files for LogBench-O.
31 | * LogBench-T: The `/LogBench-T` folder contains the files for LogBench-T.
32 | * Cases: Please refer to the `cases` folder for the generated cases.
33 | 
34 | The repository is organized as follows:
35 | 
36 | ```
37 | ├── LICENSE
38 | ├── LogBench-O
39 | │   ├── LogBench-O_prefix_1point.zip
40 | │   ├── LogBench-O_prefix_1point_file_level.zip
41 | │   └── LogBench-O_prefix_1point_wo_comments.zip
42 | ├── LogBench-T
43 | │   ├── LogBench-T_prefix_1point.zip
44 | │   └── LogBench-T_prefix_1point_file_level.zip
45 | ├── README.md
46 | ├── build
47 | │   └── code-transformer.jar
48 | ├── cases
49 | │   └── generated_cases.csv
50 | ├── img
51 | │   ├── empirical_overview.pdf
52 | │   └── empirical_overview.png
53 | └── src
54 |     ├── Baselines
55 |     │   ├── DeepLV
56 |     │   ├── WhichVar
57 |     │   ├── LoGenText-Plus
58 |     │   ├── StarCoder
59 |     │   ├── lance
60 |     │   ├── InCoder
61 |     │   └── ...
62 |     ├── CodeTransformer
63 |     │   └── README.md
64 |     └── DataCollector
65 |         └── ...
66 | ```
67 | 
68 | 
69 | ## Study subjects
70 | | 11 LLMs | Access | Paper reference |
71 | | ------------ | ------ | ---- |
72 | | Davinci | API | [Project](https://platform.openai.com/docs/models) |
73 | | ChatGPT | API | [Project](https://platform.openai.com/docs/models) |
74 | | LANCE | Model | [ICSE'22] [Using deep learning to generate complete log statements](https://dl.acm.org/doi/abs/10.1145/3510003.3511561) |
75 | | InCoder | Model | [ICLR'23] [InCoder: A Generative Model for Code Infilling and Synthesis](https://openreview.net/forum?id=hQwb-lbM6EL) |
76 | | Llama2 | Model | [Llama 2: Open Foundation and Fine-Tuned Chat Models](https://arxiv.org/abs/2307.09288) |
77 | | StarCoder | Model | [StarCoder: may the source be with you!](https://arxiv.org/abs/2305.06161) |
78 | | CodeLlama | Model | [Code Llama: Open Foundation Models for Code](https://arxiv.org/abs/2308.12950) |
79 | | CodeGeeX | Plugin | [CodeGeeX: A Pre-Trained Model for Code Generation with Multilingual Evaluations on HumanEval-X](https://arxiv.org/abs/2303.17568) |
80 | | TabNine | Plugin | - |
81 | | Copilot | Plugin | - |
82 | | CodeWhisperer | Plugin | - |
83 | | **Non-LLMs** | | |
84 | | DeepLV | Model | [ICSE'21] [DeepLV: Suggesting Log Levels Using Ordinal Based Neural Networks](https://ieeexplore.ieee.org/abstract/document/9402068) |
85 | | WhichVar | Model | [TSE'21] [Which Variables Should I Log?](https://ieeexplore.ieee.org/document/8840982) |
86 | | LoGenText-Plus | Model | [TOSEM'23] [LoGenText-Plus: Improving Neural Machine Translation Based Logging Texts Generation with Syntactic Templates](https://dl.acm.org/doi/10.1145/3624740) |
87 | 
88 | If you use any of these baselines, please make sure to cite the corresponding paper.
89 | 
90 | 
91 | 
92 | ## Download the original crawled logging dataset
93 | For further logging-related research, since GitHub cannot host large datasets, you can download the **whole** collected logging dataset (full size) [here](https://drive.google.com/file/d/13EV-rIFEwVrLGnpNIcpF3u9NSOh_gCNM/view?usp=sharing)
94 | (zip: 252 MB; unzipped: 786 MB).
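If you prefer scripting the download, the snippet below is one hedged way to fetch and unpack the archive with the third-party `gdown` package (`pip install gdown`); only the Drive file ID is taken from the link above, and the local file and folder names are illustrative assumptions.

```python
# Hedged sketch: fetch the full crawled logging dataset from the Google Drive link
# above via the third-party gdown package, then unpack it. Local paths are assumptions.
import zipfile

import gdown  # pip install gdown

url = "https://drive.google.com/uc?id=13EV-rIFEwVrLGnpNIcpF3u9NSOh_gCNM"  # ID from the link above
archive = "logging_dataset_full.zip"

gdown.download(url, archive, quiet=False)
with zipfile.ZipFile(archive) as zf:
    zf.extractall("logging_dataset_full")
print("Dataset extracted to ./logging_dataset_full")
```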
95 | 
96 | 
97 | ## Code transformation tool
98 | 
99 | The folder `/build` contains the built transformation tool. It conducts the code transformation automatically with its eight code transformers.
100 | - To conduct the code transformation in batch:
101 | ```
102 | java -jar code-transformer.jar -f ./javafiles/
103 | ```
104 | 
--------------------------------------------------------------------------------
/build/code-transformer.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/logpai/LogBench/364357338014a6d76d313fc3d8549c02938b6f2e/build/code-transformer.jar
--------------------------------------------------------------------------------
/img/empirical_overview.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/logpai/LogBench/364357338014a6d76d313fc3d8549c02938b6f2e/img/empirical_overview.jpg
--------------------------------------------------------------------------------
/img/empirical_overview.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/logpai/LogBench/364357338014a6d76d313fc3d8549c02938b6f2e/img/empirical_overview.pdf
--------------------------------------------------------------------------------
/img/empirical_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/logpai/LogBench/364357338014a6d76d313fc3d8549c02938b6f2e/img/empirical_overview.png
--------------------------------------------------------------------------------
/src/Baselines/ChatGPT/chatgpt.py:
--------------------------------------------------------------------------------
1 | from revChatGPT.V3 import Chatbot
2 | import os
3 | import glob
4 | import time
5 | import random
6 | 
7 | def read_input_file(input_file):
8 |     with open(input_file, 'r') as file:
9 |         input_text = file.read()
10 |     return input_text
11 | 
12 | def write_output_file(output_file, content):
13 |     with open(output_file, 'w') as file:
14 |         file.write(content)
15 | 
16 | def main():
17 |     input_folder = ""
18 |     output_folder = ""
19 |     java_files_pattern = os.path.join(input_folder, "*.java")
20 |     input_files = glob.glob(java_files_pattern)
21 |     random.shuffle(input_files)
22 |     output_files = [os.path.join(output_folder, os.path.splitext(os.path.basename(f))[0] + "_output.java") for f in input_files]
23 |     os.makedirs(output_folder, exist_ok=True)
24 | 
25 |     for i, input_file in enumerate(input_files):
26 | 
27 |         chatbot = Chatbot(api_key="")
28 |         print(f"Processing {input_file}...")
29 |         input_text = read_input_file(input_file)
30 |         input_text = "Please complete the incomplete logging statement at the logging point. Please just reply me one line of code, don't reply me other text.:\n" + input_text
31 |         try:
32 |             if os.path.exists(output_files[i]):
33 |                 print("Output file already exists.
Skipping...") 34 | continue 35 | result = chatbot.ask(input_text) 36 | time.sleep(2) 37 | output_file = output_files[i] 38 | write_output_file(output_file, result) 39 | print(f"Code saved to {output_file}") 40 | except Exception as e: 41 | print(f"Error processing {input_file}: {str(e)}") 42 | 43 | 44 | if __name__ == "__main__": 45 | main() -------------------------------------------------------------------------------- /src/Baselines/Davinci/davinci.py: -------------------------------------------------------------------------------- 1 | import openai 2 | import os 3 | import glob 4 | import time 5 | import random 6 | from tqdm import tqdm 7 | 8 | def read_input_file(input_file): 9 | with open(input_file, 'r') as file: 10 | input_text = file.read() 11 | return input_text 12 | 13 | def write_output_file(output_file, content): 14 | with open(output_file, 'w') as file: 15 | file.write(content) 16 | 17 | openai.api_key = "" 18 | 19 | def generate_text(prompt, model="text-davinci-003", tokens=1024, temperature=1, top_p=1): 20 | response = openai.Completion.create( 21 | engine=model, 22 | prompt=prompt, 23 | max_tokens=tokens, 24 | n=1, 25 | stop=None, 26 | temperature=temperature, 27 | top_p=top_p 28 | ) 29 | 30 | return response.choices[0].text.strip() 31 | 32 | def main(): 33 | input_folder = "" 34 | output_folder = "" 35 | java_files_pattern = os.path.join(input_folder, "*.java") 36 | input_files = glob.glob(java_files_pattern) 37 | random.shuffle(input_files) 38 | output_files = [os.path.join(output_folder, os.path.splitext(os.path.basename(f))[0] + "_output.java") for f in input_files] 39 | os.makedirs(output_folder, exist_ok=True) 40 | 41 | for i, input_file in enumerate(tqdm(input_files, desc="Processing files")): 42 | print(f"Processing {input_file}...") 43 | input_text = read_input_file(input_file) 44 | input_text = "Please complete the incomplete logging statement at the logging point. Please just reply me one line of code, don't reply me other text.:\n" + input_text 45 | try: 46 | if os.path.exists(output_files[i]): 47 | print("Output file already exists. 
Skipping...") 48 | continue 49 | result = generate_text(input_text) 50 | time.sleep(2) 51 | output_file = output_files[i] 52 | write_output_file(output_file, result) 53 | print(f"Code saved to {output_file}") 54 | except Exception as e: 55 | print(f"Error processing {input_file}: {str(e)}") 56 | 57 | if __name__ == "__main__": 58 | main() 59 | -------------------------------------------------------------------------------- /src/Baselines/DeepLV/Helper.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import re as re 4 | import string 5 | import numpy as np 6 | import csv 7 | import pandas as pd 8 | 9 | from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, accuracy_score, precision_recall_fscore_support 10 | from sklearn.utils import resample 11 | from sklearn.preprocessing import LabelEncoder 12 | 13 | # for ordinal 14 | trace_label = [1, 0, 0, 0, 0] 15 | debug_label = [1, 1, 0, 0, 0] 16 | info_label = [1, 1, 1, 0, 0] 17 | warn_label = [1, 1 ,1, 1, 0] 18 | error_label = [1, 1, 1, 1, 1] 19 | 20 | # for normal 21 | #trace_label = [1, 0, 0, 0, 0] 22 | #debug_label = [0, 1, 0, 0, 0] 23 | #info_label = [0, 0, 1, 0, 0] 24 | #warn_label = [0, 0 ,0, 1, 0] 25 | #error_label = [0, 0, 0, 0, 1] 26 | 27 | 28 | 29 | 30 | def ordinal_encoder(classes): 31 | y = [] 32 | for c in classes: 33 | if c == 'trace': 34 | y.append(trace_label) 35 | elif c == 'debug': 36 | y.append(debug_label) 37 | elif c == 'info': 38 | y.append(info_label) 39 | elif c == 'warn': 40 | y.append(warn_label) 41 | else: 42 | y.append(error_label) 43 | y = np.array(y) 44 | return y 45 | 46 | 47 | def predict_prob_encoder(predict_prob): 48 | label_predicted = [] 49 | for column_prob in predict_prob: 50 | column_label = [] 51 | for p in column_prob: 52 | if p > 0.5: 53 | column_label.append(1) 54 | else: 55 | column_label.append(0) 56 | label_predicted.append(column_label) 57 | label_predicted = np.array(label_predicted) 58 | return label_predicted 59 | 60 | 61 | def predicted_label_encoder(y_list): 62 | target_list = [] 63 | 64 | target_trace_label = [1, 0, 0, 0, 0] 65 | target_debug_label = [0, 1, 0, 0, 0] 66 | target_info_label = [0, 0, 1 ,0, 0] 67 | target_warn_label = [0, 0, 0, 1, 0] 68 | target_error_label = [0, 0, 0, 0, 1] 69 | target_exception_label = [0, 0, 0, 0, 0] 70 | for y in y_list: 71 | if np.array_equal(np.array(y), np.array(trace_label)): 72 | target_list.append(target_trace_label) 73 | elif np.array_equal(np.array(y), np.array(debug_label)): 74 | target_list.append(target_debug_label) 75 | elif np.array_equal(np.array(y), np.array(info_label)): 76 | target_list.append(target_info_label) 77 | elif np.array_equal(np.array(y), np.array(warn_label)): 78 | target_list.append(target_warn_label) 79 | elif np.array_equal(np.array(y), np.array(error_label)): 80 | target_list.append(target_error_label) 81 | else: 82 | print("Something wrong happend in predicted_label_encoder.", y) 83 | target_list.append(target_warn_label) 84 | return np.array(target_list) 85 | 86 | 87 | 88 | 89 | def pd_encoder(y_list): #0:trace, 1:debug, 2:info, 3:warn, 4: error 90 | target_list = [] 91 | for y in y_list: 92 | if np.array_equal(np.array(y), np.array(trace_label)): 93 | target_list.append(0) 94 | elif np.array_equal(np.array(y), np.array(debug_label)): 95 | target_list.append(1) 96 | elif np.array_equal(np.array(y), np.array(info_label)): 97 | target_list.append(2) 98 | elif np.array_equal(np.array(y), np.array(warn_label)): 99 | 
target_list.append(3) 100 | elif np.array_equal(np.array(y), np.array(error_label)): 101 | target_list.append(4) 102 | else: 103 | print("Something wrong happend in pd_encoder.", y) 104 | target_list.append(3) 105 | return target_list 106 | 107 | 108 | 109 | 110 | def class_accuracy(y_test, y_predicted): 111 | trace_test_list = [] 112 | debug_test_list = [] 113 | info_test_list = [] 114 | warn_test_list = [] 115 | error_test_list = [] 116 | 117 | trace_predicted_list = [] 118 | debug_predicted_list = [] 119 | info_predicted_list = [] 120 | warn_predicted_list = [] 121 | error_predicted_list = [] 122 | 123 | for yt, yp in zip(y_test, y_predicted): 124 | if np.array_equal(np.array(yt), np.array(trace_label)): 125 | trace_test_list.append(trace_label) 126 | trace_predicted_list.append(yp) 127 | elif np.array_equal(np.array(yt), np.array(debug_label)): 128 | debug_test_list.append(debug_label) 129 | debug_predicted_list.append(yp) 130 | elif np.array_equal(np.array(yt), np.array(info_label)): 131 | info_test_list.append(info_label) 132 | info_predicted_list.append(yp) 133 | elif np.array_equal(np.array(yt), np.array(warn_label)): 134 | warn_test_list.append(warn_label) 135 | warn_predicted_list.append(yp) 136 | elif np.array_equal(np.array(yt), np.array(error_label)): 137 | error_test_list.append(error_label) 138 | error_predicted_list.append(yp) 139 | else: 140 | print("something wrong happened in class_accuracy", yt, yp) 141 | acc_trace = accuracy_score(np.array(trace_test_list), np.array(trace_predicted_list)) 142 | acc_debug = accuracy_score(np.array(debug_test_list), np.array(debug_predicted_list)) 143 | acc_info = accuracy_score(np.array(info_test_list), np.array(info_predicted_list)) 144 | acc_warn = accuracy_score(np.array(warn_test_list), np.array(warn_predicted_list)) 145 | acc_error = accuracy_score(np.array(error_test_list), np.array(error_predicted_list)) 146 | print ('Trace Accuracy: ', acc_trace) 147 | print ('Debug Accuracy: ', acc_debug) 148 | print ('Info Accuracy: ', acc_info) 149 | print ('Warn Accuracy: ', acc_warn) 150 | print ('Error Accuracy: ', acc_error) 151 | 152 | #This is for the case combining debug and trace together 153 | def upsampling(x_train, y_train, seed_value): 154 | 155 | #code below is for upsampling the data 156 | 157 | df=pd.DataFrame() 158 | df['x_train'] = x_train 159 | df['y_train'] = pd_encoder(y_train) 160 | 161 | data_td = df.loc[df['y_train'] == 0] 162 | data_info = df.loc[df['y_train'] == 1] 163 | data_warn = df.loc[df['y_train'] == 2] 164 | data_error = df.loc[df['y_train'] == 3] 165 | data_len = np.array([len(data_td), len(data_info), len(data_warn), len(data_error)]) 166 | max_num = np.max(data_len) 167 | 168 | td_upsampled = resample(data_td, replace=True, n_samples=max_num, random_state=seed_value) 169 | info_upsampled = resample(data_info, replace=True, n_samples=max_num, random_state=seed_value) 170 | warn_upsampled = resample(data_warn, replace=True, n_samples=max_num, random_state=seed_value) 171 | error_upsampled = resample(data_error, replace=True, n_samples=max_num, random_state=seed_value) 172 | 173 | td_upsampled=td_upsampled.drop(columns=['y_train']).to_numpy() 174 | info_upsampled=info_upsampled.drop(columns=['y_train']).to_numpy() 175 | warn_upsampled=warn_upsampled.drop(columns=['y_train']).to_numpy() 176 | error_upsampled=error_upsampled.drop(columns=['y_train']).to_numpy() 177 | 178 | x_train = np.concatenate((td_upsampled, info_upsampled, warn_upsampled, error_upsampled)) 179 | temp_y_train = [] 180 | for i in range(0, 
max_num): 181 | temp_y_train.append([1, 0, 0, 0]) 182 | for i in range(0, max_num): 183 | temp_y_train.append([1, 1, 0, 0]) 184 | for i in range(0, max_num): 185 | temp_y_train.append([1, 1, 1, 0]) 186 | for i in range(0, max_num): 187 | temp_y_train.append([1, 1, 1, 1]) 188 | 189 | y_train = np.array(temp_y_train) 190 | return x_train, y_train 191 | 192 | 193 | def ordinal_accuracy(y_test, y_predicted): 194 | print(len(y_test), len(y_predicted)) 195 | left_boundary = 0.0 196 | right_boundary = 4.0 197 | value_cumulation = 0.0 198 | for yt, yp in zip(y_test, y_predicted): 199 | lb_distance = float(yt) - left_boundary 200 | rb_distance = right_boundary - float(yt) 201 | max_distance = np.max(np.array([lb_distance, rb_distance])) 202 | value = 1.0 - abs(float(yp) - float(yt))/max_distance 203 | value_cumulation = value_cumulation + value 204 | return value_cumulation/float(len(y_test)) 205 | 206 | -------------------------------------------------------------------------------- /src/Baselines/DeepLV/Metrics.py: -------------------------------------------------------------------------------- 1 | from keras.callbacks import Callback 2 | import numpy as np 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.model_selection import StratifiedKFold 5 | from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, balanced_accuracy_score, accuracy_score 6 | from sklearn.utils import resample 7 | 8 | class Metrics(Callback): 9 | def on_train_begin(self, logs={}): 10 | self.val_f1s = [] 11 | self.val_recalls = [] 12 | self.val_precisions = [] 13 | self.val_auc = [] 14 | 15 | def on_epoch_end(self, epoch, logs={}): 16 | val_predict = (np.asarray(self.model.predict( 17 | self.validation_data[0]))).round() 18 | val_targ = self.validation_data[1] 19 | pos_label=1 20 | _val_f1 = f1_score(val_targ, val_predict, labels=[pos_label],pos_label=1, average ='binary') 21 | _val_recall = recall_score(val_targ, val_predict, labels=[pos_label],pos_label=1, average ='binary') 22 | _val_precision = precision_score(val_targ, val_predict, labels=[pos_label],pos_label=1, average ='binary') 23 | _val_auc = roc_auc_score(val_targ, val_predict) 24 | self.val_f1s.append(_val_f1) 25 | self.val_recalls.append(_val_recall) 26 | self.val_precisions.append(_val_precision) 27 | self.val_auc.append(_val_auc) 28 | return 29 | 30 | -------------------------------------------------------------------------------- /src/Baselines/DeepLV/block_level_LSTM.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import os 3 | import sys 4 | import re as re 5 | import string 6 | 7 | import multiprocessing 8 | import numpy as np 9 | from gensim.models.word2vec import Word2Vec 10 | from gensim.corpora.dictionary import Dictionary 11 | from gensim.parsing.porter import PorterStemmer 12 | 13 | import random as rn 14 | seed_value = 17020 15 | seed_window = 1500 16 | import pandas as pd 17 | import csv 18 | from sklearn.model_selection import train_test_split 19 | from sklearn.model_selection import StratifiedKFold 20 | from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, balanced_accuracy_score, accuracy_score, precision_recall_fscore_support 21 | from sklearn.utils import resample 22 | from sklearn.preprocessing import LabelEncoder 23 | 24 | import matplotlib.pyplot as plt 25 | 26 | import tensorflow as tf 27 | import Metrics 28 | from keras import backend as K 29 | from keras.preprocessing import sequence 30 | from 
keras.models import Sequential 31 | from keras.layers import Dense, Flatten, Dropout, Embedding, LSTM, Bidirectional, Activation, LeakyReLU 32 | from keras.models import model_from_yaml 33 | from keras.utils import np_utils 34 | from keras_self_attention import SeqSelfAttention 35 | 36 | 37 | import Helper 38 | 39 | 40 | config = tf.ConfigProto( device_count = {'GPU': 1 , 'CPU': 16} ) 41 | sess = tf.Session(config=config) 42 | K.set_session(sess) 43 | 44 | 45 | csv.field_size_limit(100000000) 46 | sys.setrecursionlimit(1000000) 47 | # set parameters: 48 | n_iterations = 1 49 | embedding_iterations = 1 50 | n_epoch = 50 51 | 52 | vocab_dim = 100 53 | maxlen = 100 54 | n_exposures = 10 55 | window_size = 7 56 | batch_size = 24 57 | input_length = 100 58 | cpu_count = multiprocessing.cpu_count() 59 | 60 | test_list = [] 61 | neg_full = [] 62 | pos_full = [] 63 | syntactic_list = [] 64 | 65 | 66 | 67 | model_location = 'model_block' +'/lstm_'+ sys.argv[1] 68 | embedding_location = 'embedding_block' + '/Word2vec_model_' + sys.argv[1] + '.pkl' 69 | 70 | 71 | def loadfile(): 72 | 73 | data_full=pd.read_csv('block_processing/blocks/logged_syn' + '_' + sys.argv[1] + '.csv', usecols=[1,2,3,4], engine='python') 74 | 75 | dataset = data_full.values 76 | classes = dataset[:, 2] 77 | data=data_full['Values'].values.tolist() 78 | combined = data 79 | combined_full = data_full.values.tolist() 80 | 81 | encoder = LabelEncoder() 82 | encoder.fit(classes) 83 | encoded_Y = encoder.transform(classes) 84 | y = Helper.ordinal_encoder(classes) 85 | 86 | 87 | 88 | x_train, x_test, y_train, y_test = train_test_split(combined_full, y, test_size=0.2, train_size=0.8, random_state=seed_value, stratify=y) 89 | x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, train_size=0.75, random_state=seed_value, stratify=y_train) 90 | test_block_list = [] 91 | train_block_list = [] 92 | for x in x_test: 93 | test_list.append(x[0]) 94 | test_block_list.append(x[1]) 95 | x_test = np.array(test_block_list) 96 | for x in x_train: 97 | train_block_list.append(x[1]) 98 | x_train = train_block_list 99 | 100 | return combined,y, x_train, x_val, x_test, y_train, y_val, y_test 101 | 102 | 103 | 104 | def word_splitter(word, docText): 105 | splitted = re.sub('([A-Z][a-z]+)', r' \1', re.sub('([A-Z]+)', r' \1', word)).split() 106 | for word in splitted: 107 | docText.append(word.lower()) 108 | 109 | 110 | 111 | 112 | def tokenizer(text): 113 | newText = [] 114 | for doc in text: 115 | docText = [] 116 | #for word in str(doc).replace("['", "").replace("']", "").replace(",", "").replace("'", "").split(' '): 117 | for word in str(doc).replace("'", "").replace("[", "").replace("]", "").replace(",", "").replace('"', "").split(' '): 118 | docText.append(word) 119 | 120 | newText.append(docText) 121 | #print (newText) 122 | return newText 123 | 124 | 125 | 126 | def input_transform(words): 127 | model=Word2Vec.load(embedding_location) 128 | _, _,dictionaries=create_dictionaries(model,words) 129 | return dictionaries 130 | 131 | 132 | 133 | 134 | 135 | 136 | def create_dictionaries(model=None, 137 | combined=None): 138 | 139 | from keras.preprocessing import sequence 140 | 141 | if (combined is not None) and (model is not None): 142 | gensim_dict = Dictionary() 143 | gensim_dict.doc2bow(model.wv.vocab.keys(), 144 | allow_update=True) 145 | w2indx = {v: k+1 for k, v in gensim_dict.items()} 146 | w2vec = {word: model.wv[word] for word in w2indx.keys()} 147 | 148 | def parse_dataset(combined): 149 | data=[] 150 | for 
sentence in combined: 151 | new_txt = [] 152 | for word in sentence: 153 | try: 154 | new_txt.append(w2indx[word]) 155 | except: 156 | new_txt.append(0) 157 | data.append(new_txt) 158 | return data 159 | combined=parse_dataset(combined) 160 | combined= sequence.pad_sequences(combined, maxlen=maxlen) 161 | return w2indx, w2vec,combined 162 | 163 | 164 | def word2vec_train(combined): 165 | model = Word2Vec(size=vocab_dim, #dimension of word embedding vectors 166 | min_count=n_exposures, 167 | window=window_size, 168 | workers=cpu_count, sg=1, 169 | iter=embedding_iterations) 170 | model.build_vocab(combined) 171 | model.save(embedding_location) 172 | index_dict, word_vectors,combined = create_dictionaries(model=model,combined=combined) 173 | return index_dict, word_vectors,combined 174 | 175 | 176 | def get_data(index_dict,word_vectors,combined): 177 | 178 | n_symbols = len(index_dict) + 1 179 | embedding_weights = np.zeros((n_symbols, vocab_dim)) 180 | for word, index in index_dict.items(): 181 | embedding_weights[index, :] = word_vectors[word] 182 | 183 | 184 | return n_symbols,embedding_weights 185 | 186 | 187 | def train_lstm(n_symbols,embedding_weights,x_train,y_train,x_test,y_test, x_val, y_val): 188 | 189 | tf.set_random_seed(seed_value) 190 | 191 | 192 | 193 | 194 | model = Sequential() 195 | model.add(Embedding(output_dim=vocab_dim, 196 | input_dim=n_symbols, 197 | mask_zero=True, 198 | weights=[embedding_weights], 199 | input_length=input_length)) 200 | model.add(Bidirectional(LSTM(output_dim=128,activation='sigmoid'))) 201 | model.add(Dropout(0.2)) 202 | model.add(Dense(5, activation='sigmoid')) 203 | 204 | 205 | print ('Compiling the Model..') 206 | model.compile(loss='binary_crossentropy', 207 | optimizer='adam',metrics=['accuracy']) 208 | 209 | print ("Train...") 210 | metrics = Metrics.Metrics() 211 | history = model.fit(x_train, y_train, batch_size=batch_size, epochs=n_epoch,verbose=1, validation_data=(x_val, y_val)) 212 | 213 | base_min = optimal_epoch(history) 214 | print ("Evaluate...") 215 | score = model.evaluate(x_test, y_test, 216 | batch_size=batch_size) 217 | yaml_string = model.to_yaml() 218 | with open(model_location +'.yml', 'w') as outfile: 219 | outfile.write( yaml.dump(yaml_string, default_flow_style=True) ) 220 | model.save_weights(model_location + sys.argv[1] + '.h5') 221 | np.set_printoptions(threshold=sys.maxsize) 222 | 223 | prob_predicted = model.predict(x_test, verbose=1) 224 | label_predicted = Helper.predict_prob_encoder(prob_predicted) 225 | num_y_test = Helper.pd_encoder(y_test) 226 | num_y_predicted = Helper.pd_encoder(label_predicted) 227 | 228 | val_accuracy = accuracy_score(y_test, label_predicted) 229 | print ('Accuracy: ', val_accuracy) 230 | Helper.class_accuracy(y_test, label_predicted) 231 | 232 | with open(model_location + '_target.txt', 'wt') as f: 233 | for y in y_test: 234 | f.write(str(y)+ '\n') 235 | with open(model_location + '_predicted.txt', 'wt') as f: 236 | for y in label_predicted: 237 | f.write(str(y)+ '\n') 238 | return [val_accuracy] 239 | 240 | 241 | 242 | 243 | 244 | def get_FP_FN(label_predicted, label_target): 245 | FP_id_list = [] 246 | FN_id_list = [] 247 | for i in range(0, len(label_predicted)): 248 | if int(label_predicted[i]) == 1 and int(label_target[i]) == 0: 249 | FP_id_list.append(i) 250 | elif int(label_predicted[i]) == 0 and int(label_target[i]) == 1: 251 | FN_id_list.append(i) 252 | #print (FP_id_list) 253 | #print (FN_id_list) 254 | with open('model_block' +'/labels/list/lstm_FP_' + sys.argv[1] + '.txt', 
'wt') as f: 255 | for fp in FP_id_list: 256 | f.write(str(test_list[int(fp)])+ '\n') 257 | with open('model_block' +'/labels/list/lstm_FN_' + sys.argv[1] + '.txt', 'wt') as f: 258 | for fn in FN_id_list: 259 | f.write(str(test_list[int(fn)])+ '\n') 260 | 261 | 262 | def train(): 263 | os.environ['PYTHONHASHSEED']=str(seed_value) 264 | np.random.seed(seed_value) 265 | rn.seed(seed_value) 266 | print ('Loading Data...') 267 | combined,y,x_train, x_val, x_test, y_train, y_val, y_test=loadfile() 268 | print ('Tokenizing...') 269 | combined = tokenizer(combined) 270 | x_train = tokenizer (x_train) 271 | x_test = tokenizer (x_test) 272 | x_val = tokenizer (x_val) 273 | print ('Training a Word2vec model...') 274 | index_dict, word_vectors,combined=word2vec_train(combined) 275 | x_train = input_transform(x_train) 276 | x_test = input_transform(x_test) 277 | x_val = input_transform(x_val) 278 | print ('Setting up Arrays for Keras Embedding Layer...') 279 | n_symbols,embedding_weights=get_data(index_dict, word_vectors,combined) 280 | #print (x_train.shape,y_train.shape) 281 | result = train_lstm(n_symbols,embedding_weights,x_train,y_train, x_val , y_val , x_test,y_test) 282 | return result 283 | 284 | 285 | def pipeline_train(iterations): 286 | seed_and_result = {} 287 | if iterations == 1: 288 | train() 289 | else: 290 | for i in range(0, iterations): 291 | print('Iteration: ', i) 292 | global seed_value 293 | result = train() 294 | seed_and_result[seed_value] = result 295 | seed_value = seed_value + seed_window 296 | i = i + 1 297 | return seed_and_result 298 | 299 | def eval_metric(model, history, metric_name): 300 | metric = history.history[metric_name] 301 | val_metric = history.history['val_' + metric_name] 302 | e = range(1, n_epoch + 1) 303 | plt.plot(e, metric, 'bo', label='Train ' + metric_name) 304 | plt.plot(e, val_metric, 'b', label='Validation ' + metric_name) 305 | plt.xlabel('Epoch number') 306 | plt.ylabel(metric_name) 307 | plt.title('Comparing training and validation ' + metric_name + ' for ' + model.name) 308 | plt.legend() 309 | plt.show() 310 | 311 | 312 | def optimal_epoch(model_hist): 313 | min_epoch = np.argmin(model_hist.history['val_loss']) + 1 314 | print("Minimum validation loss reached in epoch {}".format(min_epoch)) 315 | return min_epoch 316 | 317 | 318 | 319 | 320 | if __name__=='__main__': 321 | result_dict = pipeline_train(n_iterations) 322 | print (sys.argv[1]) 323 | -------------------------------------------------------------------------------- /src/Baselines/DeepLV/block_processing/block_processing.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import multiprocessing 3 | import numpy as np 4 | import pandas as pd 5 | import csv 6 | import re 7 | 8 | block_set = {"DoStatement", "WhileStatement", "SynchronizedStatement", "IfStatement", "SwitchStatement", "TryStatement", "EnhancedForStatement", "ForStatement", "MethodDeclaration", "CatchClause", "Block" , "SwitchCase"} 9 | syntactic_filter_set = {"Block", "SimpleName", "SimpleType", "QualifiedName", "ParameterizedType", "PrimitiveType", "SingleVariableDeclaration", "ArrayType", "TypeLiteral"} 10 | block_dict = {} 11 | target_dict = {} 12 | methods_dict = {} 13 | methods_lines = {} 14 | target_dict_logged = {} 15 | level_dict_logged = {} 16 | message_dict_logged = {} 17 | target_dict_nonlogged = {} 18 | 19 | 20 | def read_logs(filename): 21 | f = open('original_logs/logs-' + filename+ '.txt') 22 | lines = f.readlines() 23 | f.close() 24 | return lines 25 | 26 
| 27 | def get_classname(method): 28 | fullpath = method.split('.') 29 | class_name = fullpath[-3] + '.' + fullpath[-2]+'.java' 30 | return class_name 31 | 32 | 33 | def read_AST_file(filename): 34 | f = open('AST/AST-'+filename+'.txt') 35 | lines = f.readlines() 36 | f.close() 37 | 38 | return lines 39 | 40 | 41 | def parse_ASTlines(ASTlines): 42 | lines = [] 43 | #parse ASTlines by regex 44 | for astline in ASTlines: 45 | 46 | astType = re.findall(r'([^<]+)', astline)[0] 47 | location = re.findall(r'([^<]+)', astline)[0] 48 | begin = re.findall(r'([^<]+)', astline)[0] 49 | end = re.findall(r'([^<]+)', astline)[0] 50 | #content = re.findall(r'([^<]+)', astline)[0] 51 | content = re.findall(r'(.*?)', astline)[0] 52 | lines.append([astType, location, begin, end, content]) 53 | #for every AST line, 0: type, 1: location, 2: beginline, 3: endline, 4: content 54 | return lines 55 | 56 | 57 | 58 | def parse_Loglines(Loglines): 59 | loglines = [] 60 | #parse ASTlines by regex 61 | for logline in Loglines: 62 | callsite = re.findall(r'([^<]+)', logline)[0] 63 | level = re.findall(r'([^<]+)', logline)[0] 64 | line = re.findall(r'([^<]+)', logline)[0] 65 | if(re.findall(r'([^<]+)', logline)): 66 | content = re.findall(r'([^<]+)', logline)[0] 67 | loglines.append([level, line, content, callsite]) 68 | else: 69 | loglines.append([level, line, 'No message', callsite]) 70 | #0: level, 1: line number, 2: content, 3: callsite 71 | 72 | return loglines 73 | 74 | 75 | def if_log_line(ast, loglines): 76 | for log in loglines: 77 | #print (get_classname(log[3]), get_classname(astlist[1])) 78 | #print (log[1], astlist[2]) 79 | if(get_classname(log[3]) == get_classname(astlist[1]) and int(log[1]) == int(astlist[2])): 80 | #print ('1') 81 | return True 82 | return False 83 | 84 | 85 | 86 | def if_diff_levels(value_list): 87 | if len(value_list) > 1: 88 | for i in range (0, len(value_list)-1): 89 | for j in range (i+1, len(value_list)): 90 | if value_list[i][0] != value_list[j][0]: 91 | return 2 92 | else: 93 | return 0 94 | return 1 95 | 96 | def not_level_guard(string): 97 | if "enabled" in string: 98 | if "info" in string or "debug" in string or "trace" in string: 99 | return False 100 | return True 101 | 102 | #0: <= 1 log in the block, 1: multiple logs at the same level, 2: multiple logs at different levels 103 | 104 | 105 | def get_level_id(log, current_level): 106 | log_level = re.findall(r'([^<]+)', log)[0] 107 | message = '-' 108 | if(re.findall(r'([^<]+)', log)): 109 | message = re.findall(r'([^<]+)', log)[0] 110 | if log_level == 'trace': 111 | level_id = 0 112 | elif log_level == 'debug': 113 | level_id = 1 114 | elif log_level == 'info': 115 | level_id = 2 116 | elif log_level == 'warn': 117 | level_id = 3 118 | elif log_level == 'error': 119 | level_id = 4 120 | else: 121 | level_id = 5 122 | if level_id > current_level: 123 | return level_id, message 124 | else: 125 | return current_level, message 126 | 127 | 128 | def get_level_name(level_id): 129 | if level_id == 0: 130 | return "trace" 131 | elif level_id == 1: 132 | return "debug" 133 | elif level_id == 2: 134 | return "info" 135 | elif level_id == 3: 136 | return "warn" 137 | elif level_id == 4: 138 | return "error" 139 | elif level_id == 5: 140 | return "fatal" 141 | else: 142 | return "unknown" 143 | 144 | def label_blocks(target_dict, loglines): 145 | for key, value in target_dict.items(): 146 | logged_flag = False 147 | #level id: 0 - trace, 1 - debug, 2 - info, 3 - warn, 4 - error, 5 - fatal 148 | level_id = 0 149 | message = '-' 150 | 
for log in loglines: 151 | log_class = get_classname(re.findall(r'([^<]+)', log)[0]) 152 | log_line = int(re.findall(r'([^<]+)', log)[0]) 153 | key_class = re.findall(r'([^<]+)', key)[0] 154 | key_start = int(re.findall(r'([^<]+)', key)[0]) 155 | key_end = int(re.findall(r'([^<]+)', key)[0]) 156 | if log_line >= key_start and log_line <= key_end and log_class == key_class: 157 | level_id, message = get_level_id(log, level_id) 158 | logged_flag = True 159 | if logged_flag == True: 160 | target_dict_logged[key] = value 161 | level_dict_logged[key]=get_level_name(level_id) 162 | message_dict_logged[key]= message 163 | else: 164 | target_dict_nonlogged[key] = value 165 | 166 | 167 | def get_methods_dict (node): # set the startline of the first node of a method as it's startline 168 | if node[1] in methods_dict: 169 | if int(methods_dict[node[1]]) > int(node[2]): 170 | methods_dict[node[1]] = node[2] 171 | else: 172 | methods_dict[node[1]] = node[2] 173 | 174 | 175 | def get_methods_lines (methods_dict): 176 | for key, value in methods_dict.items(): 177 | class_name = get_classname(key) 178 | if class_name in methods_lines: 179 | methods_lines[class_name].append(int(value)) 180 | else: 181 | methods_lines[class_name] = [] 182 | 183 | for key, value in methods_lines.items(): 184 | value.sort() 185 | #print (key) 186 | #print (value) 187 | 188 | 189 | def get_method_start_line_for_AST (class_name, start_line): 190 | method_start_line = int(start_line) 191 | memory_line = 1 192 | if methods_lines[class_name]: 193 | for v in methods_lines[class_name]: 194 | if int(v) >= int(start_line): 195 | #print (memory_line) 196 | return int(memory_line) 197 | else: 198 | memory_line = int(v) 199 | else: 200 | return int(method_start_line) 201 | 202 | 203 | if __name__=='__main__': 204 | 205 | ASTlines = read_AST_file(sys.argv[1]) 206 | loglines = read_logs(sys.argv[1]) 207 | 208 | ASTlists = parse_ASTlines(ASTlines) 209 | loglists = parse_Loglines(loglines) 210 | 211 | for astlist in ASTlists: 212 | get_methods_dict(astlist) 213 | #filter level-guard if statements 214 | ast_content = astlist[4].lower()[0:40] 215 | #for every AST line, 0: type, 1: location, 2: beginline, 3: endline, 4: content 216 | if astlist[0] in block_set and not_level_guard(ast_content): 217 | if astlist[1] in block_dict: 218 | if (astlist[2]) not in block_dict[astlist[1]]: 219 | block_dict[astlist[1]].append(int(astlist[2])) 220 | if (astlist[3]) not in block_dict[astlist[1]]: 221 | block_dict[astlist[1]].append(int(astlist[3])) 222 | 223 | else: 224 | block_dict[astlist[1]] = [] 225 | get_methods_lines(methods_dict) 226 | 227 | for key, value in block_dict.items(): 228 | value.sort() 229 | 230 | 231 | 232 | 233 | for key, value in block_dict.items(): 234 | for i in range (0, len(value)-1): 235 | dict_key = '' + get_classname(key) + '' + '' + str(value[i]) + '' + '' + str((value[i+1])-1) + '' 236 | target_dict[dict_key] = [] 237 | 238 | 239 | m_start_line = 0 240 | for key, value in target_dict.items(): 241 | class_name = re.findall(r'([^<]+)', key)[0] 242 | start_line = re.findall(r'([^<]+)', key)[0] 243 | m_start_line = get_method_start_line_for_AST(class_name, start_line) 244 | if m_start_line is not None: 245 | if int(m_start_line) == 1: 246 | m_start_line = start_line 247 | else: 248 | m_start_line = start_line 249 | 250 | end_line = re.findall(r'([^<]+)', key)[0] 251 | #print (key) 252 | for astlist in ASTlists: 253 | if astlist[0] not in syntactic_filter_set and int(astlist[2]) <= int(end_line) and int(astlist[2]) >= 
int(m_start_line) and class_name == get_classname(astlist[1]): 254 | if(if_log_line(astlist, loglists)==False): 255 | value.append(astlist[0]) 256 | 257 | 258 | 259 | label_blocks(target_dict, loglines) 260 | result_list_logged = [] 261 | for key, value in target_dict_logged.items(): 262 | result_list_logged.append([key, value, level_dict_logged[key], message_dict_logged[key]]) 263 | 264 | result_list_nonlogged = [] 265 | for key, value in target_dict_nonlogged.items(): 266 | result_list_nonlogged.append([key, value]) 267 | 268 | 269 | 270 | 271 | header_logged = ['Key', 'Values', 'Level', 'Message'] 272 | logged_dict_to_write=pd.DataFrame(columns=header_logged,data=result_list_logged) 273 | logged_dict_to_write.to_csv('blocks/logged_syn_' + sys.argv[1] + '.csv') 274 | 275 | 276 | -------------------------------------------------------------------------------- /src/Baselines/DeepLV/deepLV_cleaner.py: -------------------------------------------------------------------------------- 1 | import pandas 2 | import numpy as np 3 | import os 4 | import javalang 5 | from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction 6 | from rouge import Rouge 7 | import re 8 | import numpy as np 9 | from sklearn import metrics 10 | import pandas as pd 11 | 12 | def level_acc(classification_pred, classification_label) -> float: 13 | level_map = {'trace':0., 'debug':1., 'info':2., 'warn':3., 'error':4.} 14 | new_pred = [] 15 | new_label = [] 16 | length = len(classification_pred) 17 | for idx in range(length): 18 | predict = classification_pred[idx] 19 | label = classification_label[idx] 20 | if predict in level_map.keys() and label in level_map.keys(): 21 | pred_sum = level_map[predict] 22 | label_sum = level_map[label] 23 | new_pred.append(pred_sum) 24 | new_label.append(label_sum) 25 | matches = sum(x == y for x, y in zip(new_pred, new_label)) 26 | total_elements = len(new_pred) 27 | accuracy = matches / total_elements 28 | return accuracy 29 | 30 | def query_level(level: float) -> str: 31 | if level == 1.: 32 | return 'trace' 33 | elif level == 2.: 34 | return 'debug' 35 | elif level == 3.: 36 | return 'info' 37 | elif level == 4.: 38 | return 'warn' 39 | elif level == 5.: 40 | return 'error' 41 | else: 42 | return '' 43 | 44 | def aod(classification_pred, classification_label) -> float: 45 | level_map = {'trace':1., 'debug':2., 'info':3., 'warn':4., 'error':5.} 46 | max_distance = {'trace':4., 'debug':3., 'info':2., 'warn':3., 'error':4.} 47 | 48 | distance_sum = 0. 49 | noise = 0. 
50 | length = len(classification_pred) 51 | 52 | for idx in range(length): 53 | try: 54 | predict = classification_pred[idx] 55 | label = classification_label[idx] 56 | pred_sum = level_map[predict] 57 | label_sum = level_map[label] 58 | level = query_level(label_sum) 59 | _distance = abs(label_sum - pred_sum) 60 | distance_sum = distance_sum + (1 - _distance / max_distance[level]) 61 | except Exception as e: 62 | noise = noise+1 63 | aod = distance_sum / (length-noise) 64 | return aod 65 | 66 | def extract_quoted_strings(s): 67 | quoted_strings = re.findall(r'"([^"]*)"', s) 68 | " ".join(quoted_strings) 69 | remaining = re.sub(r'"[^"]*"', '', s) 70 | char_to_remove = ['+', ','] 71 | for char in char_to_remove: 72 | remaining = remaining.replace(char, '') 73 | var_list_origin = remaining.split(' ') 74 | var_list = [item for item in var_list_origin if (not item == ' ')] 75 | var_list = [item for item in var_list if item] 76 | return quoted_strings, var_list 77 | 78 | def extract_outer_brackets(s): 79 | stack = [] 80 | result = [] 81 | 82 | for m in re.finditer(r"[()]", s): 83 | char, pos = m.group(0), m.start(0) 84 | if char == "(": 85 | stack.append(pos) 86 | elif char == ")": 87 | if len(stack) == 1: 88 | result.append(s[stack.pop() + 1:pos]) 89 | else: 90 | stack.pop() 91 | return result 92 | 93 | def extract_level(statement): 94 | parts = statement.split('.') 95 | for part in parts: 96 | if '(' in part: 97 | level = part.split('(')[0] 98 | return level.strip() 99 | return '' 100 | 101 | 102 | 103 | def extract_text(statement): 104 | bracket_contents = extract_outer_brackets(statement) 105 | if bracket_contents: # Check if the list is not empty 106 | # Pass the first item (contents of the first set of brackets) to extract_quoted_strings 107 | quoted_strings, remaining = extract_quoted_strings(bracket_contents[0]) 108 | quoted_strings_combined = ' '.join(quoted_strings) 109 | return quoted_strings_combined 110 | else: 111 | return '' # Return an empty string if no brackets are found 112 | 113 | df = pd.read_csv('logbench.csv') 114 | df = df[df['Statement'].apply(lambda x: len(x.splitlines()) == 1)] 115 | df['level'] = df['Statement'].apply(extract_level) 116 | df['text'] = df['Statement'].apply(extract_text) 117 | 118 | 119 | df.to_csv('logbench_cleaned.csv', index=False) -------------------------------------------------------------------------------- /src/Baselines/Incoder/incoder.py: -------------------------------------------------------------------------------- 1 | import os 2 | import javalang 3 | import re 4 | from typing import List 5 | import torch 6 | import tokenizers 7 | from transformers import AutoModelForCausalLM, AutoTokenizer 8 | import json 9 | 10 | path = '' 11 | ground_truth_folder = '' 12 | 13 | def insert_text_to_java_file(file_name, line_number): 14 | with open(file_name, 'r') as file: 15 | lines = file.readlines() 16 | 17 | if line_number > len(lines): 18 | print("out of range") 19 | 20 | lines[line_number - 1] = lines[line_number - 1].rstrip() + '\n' 21 | 22 | with open(file_name, 'w') as file: 23 | file.writelines(lines) 24 | 25 | 26 | def extract_numbers(s): 27 | return re.findall(r'\d+', s) 28 | 29 | 30 | def parse_directory(dir_path, ground_truth_folder): 31 | for filename in os.listdir(dir_path): 32 | file_path = os.path.join(dir_path, filename) 33 | if os.path.isfile(file_path) and file_path.endswith('.java'): 34 | ground_truth_path = ground_truth_folder + file_path.split('/')[-1][:-5] + '_config.txt' 35 | try: 36 | with open(ground_truth_path) as f: 37 | 
lines = f.readlines() 38 | if len(lines) >= 1: 39 | line_number = int(extract_numbers(lines[0].strip(' ')[:-1])[0]) 40 | insert_text_to_java_file(file_path, line_number) 41 | except FileNotFoundError: 42 | pass 43 | elif os.path.isdir(file_path): 44 | parse_directory(file_path, ground_truth_folder) 45 | 46 | parse_directory(path,ground_truth_folder) 47 | # Data procession done. 48 | 49 | 50 | tokenizers_version = tuple(int(n) for n in tokenizers.__version__.split('.')) 51 | if tokenizers_version < (0, 12, 1): 52 | print("warning: Your tokenizers version looks old and you will likely have formatting issues. We recommend installing tokenizers >= 0.12.1") 53 | 54 | # set BIG_MODEL to use the 6.7B parameter model 55 | BIG_MODEL = True 56 | 57 | # use a GPU 58 | CUDA = True 59 | 60 | # print intermediate outputs of infilling 61 | VERBOSE = False 62 | 63 | if BIG_MODEL: 64 | model_name = "facebook/incoder-6B" 65 | if CUDA: 66 | kwargs = dict( 67 | revision="float16", 68 | torch_dtype=torch.float16, 69 | low_cpu_mem_usage=False, 70 | ) 71 | else: 72 | kwargs = dict( 73 | low_cpu_mem_usage=False, 74 | ) 75 | else: 76 | model_name = "facebook/incoder-1B" 77 | kwargs = {} 78 | 79 | print("loading model") 80 | model = AutoModelForCausalLM.from_pretrained(model_name, **kwargs) 81 | print("loading tokenizer") 82 | tokenizer = AutoTokenizer.from_pretrained(model_name) 83 | print("loading complete") 84 | 85 | if CUDA: 86 | # if you plan to fine-tune the model, you should not use half precision. 87 | model = model.half().cuda() 88 | 89 | # signals the start of a document 90 | BOS = "<|endoftext|>" 91 | # signals the end of a generated infill 92 | EOM = "<|endofmask|>" 93 | 94 | def make_sentinel(i): 95 | # signals (1) a location to insert an infill and (2) the start of the infill generation 96 | return f"<|mask:{i}|>" 97 | 98 | def generate(input: str, max_to_generate: int=128, temperature: float=0.2): 99 | """ 100 | Do standard left-to-right completion of the prefix `input` by sampling from the model 101 | """ 102 | input_ids = tokenizer(input, return_tensors="pt").input_ids 103 | if CUDA: 104 | input_ids = input_ids.cuda() 105 | max_length = max_to_generate + input_ids.flatten().size(0) 106 | if max_length > 2048: 107 | print("warning: max_length {} is greater than the context window {}".format(max_length, 2048)) 108 | with torch.no_grad(): 109 | output = model.generate(input_ids=input_ids, do_sample=True, top_p=0.95, temperature=temperature, max_length=max_length) 110 | # pass clean_up_tokenization_spaces=False to avoid removing spaces before punctuation, e.g. "from ." -> "from." 111 | detok_hypo_str = tokenizer.decode(output.flatten(), clean_up_tokenization_spaces=False) 112 | if detok_hypo_str.startswith(BOS): 113 | detok_hypo_str = detok_hypo_str[len(BOS):] 114 | return detok_hypo_str 115 | 116 | def infill(parts: List[str], max_to_generate: int=50, temperature: float=0.2, extra_sentinel: bool=True, max_retries: int=1): 117 | """ 118 | Generate infills to complete a partial document, e.g. 119 | [A C E] -> [A B C D E], where B and D are infills that have been generated. 120 | 121 | parts: List[str]. list of parts of the document. One string will be 122 | inserted in between each element, i.e. infilling N-1 locations for a list 123 | of length N. 124 | max_to_generate: int. maximum number of tokens to generate. Keep in mind 125 | that the model context size is 2048. 126 | temperature: float. temperature parameter for sampling. 127 | extra_sentinel: bool. 
we recommend setting this to True, as it makes it 128 | easier for the model to end generated infills. See the footnote in 129 | section 2.2 of our paper for details. 130 | max_retries: int. if > 1, use rejection sampling to keep sampling infills until 131 | all infills sample a completion token. 132 | 133 | returns a dictionary containing the following: 134 | text: str, the completed document (with infills inserted) 135 | parts: List[str], length N. Same as passed to the method 136 | infills: List[str], length N-1. The list of infills generated 137 | retries_attempted: number of retries used (if max_retries > 1) 138 | """ 139 | assert isinstance(parts, list) 140 | retries_attempted = 0 141 | done = False 142 | 143 | while (not done) and (retries_attempted < max_retries): 144 | retries_attempted += 1 145 | 146 | if VERBOSE: 147 | print(f"retry {retries_attempted}") 148 | 149 | ## (1) build the prompt 150 | if len(parts) == 1: 151 | prompt = parts[0] 152 | else: 153 | prompt = "" 154 | # encode parts separated by sentinel 155 | for sentinel_ix, part in enumerate(parts): 156 | prompt += part 157 | if extra_sentinel or (sentinel_ix < len(parts) - 1): 158 | prompt += make_sentinel(sentinel_ix) 159 | 160 | infills = [] 161 | complete = [] 162 | 163 | done = True 164 | 165 | ## (2) generate infills 166 | for sentinel_ix, part in enumerate(parts[:-1]): 167 | complete.append(part) 168 | prompt += make_sentinel(sentinel_ix) 169 | # TODO: this is inefficient as it requires re-encoding prefixes repeatedly 170 | completion = generate(prompt, max_to_generate, temperature) 171 | completion = completion[len(prompt):] 172 | if EOM not in completion: 173 | if VERBOSE: 174 | print(f"warning: {EOM} not found") 175 | completion += EOM 176 | done = False 177 | completion = completion[:completion.index(EOM) + len(EOM)] 178 | infilled = completion[:-len(EOM)] 179 | infills.append(infilled) 180 | complete.append(infilled) 181 | prompt += completion 182 | complete.append(parts[-1]) 183 | text = ''.join(complete) 184 | 185 | if VERBOSE: 186 | print("generated text:") 187 | print(prompt) 188 | print() 189 | print("parts:") 190 | print(parts) 191 | print() 192 | print("infills:") 193 | print(infills) 194 | print() 195 | print("restitched text:") 196 | print(text) 197 | print() 198 | 199 | return { 200 | 'text': text, # str, the completed document (with infills inserted) 201 | 'parts': parts, # List[str], length N. Same as passed to the method 202 | 'infills': infills, # List[str], length N-1. 
The list of infills generated 203 | 'retries_attempted': retries_attempted, # number of retries used (if max_retries > 1) 204 | } 205 | 206 | def docstring_to_code(code, max_to_generate=50, temperature=0.2): 207 | 208 | parts = code.split("<insert>") # assumes the infill location in the prepared file is marked with "<insert>" 209 | result = infill(parts, max_to_generate=max_to_generate, temperature=temperature) 210 | return result 211 | 212 | input_path = path 213 | output_path = '' 214 | 215 | if not os.path.exists(output_path): 216 | os.makedirs(output_path) 217 | 218 | for filename in os.listdir(input_path): 219 | if filename.endswith(".java"): 220 | print(filename) 221 | input_file_path = os.path.join(input_path, filename) 222 | 223 | with open(input_file_path, 'r', encoding='utf-8') as file: 224 | file_content = file.read() 225 | example = f"'''\\\n{file_content}\n'''" 226 | 227 | processed_content = docstring_to_code(example) 228 | 229 | output_file_path = os.path.join(output_path, filename) 230 | with open(output_file_path, 'w', encoding='utf-8') as output_file: 231 | for item in processed_content['infills']: 232 | output_file.write(f"{item}\n") 233 | -------------------------------------------------------------------------------- /src/Baselines/LoGenText-Plus/README.md: -------------------------------------------------------------------------------- 1 | # LoGenText-Plus 2 | 3 | The implementation of "LoGenText-Plus: Improving Neural Machine Translation-based Logging Texts Generation with Syntactic Templates" 4 | 5 | > This code and dataset are based on [Context-Aware Model on Fairseq](https://github.com/libeineu/Context-Aware) and [LoGenText](https://github.com/conf-202x/experimental-result). 6 | 7 | ## Requirements and Installation 8 | 9 | * Pytorch >= 1.5.1 10 | * Python version >= 3.6 11 | 12 | 1. `conda create --name <env> --file requirements.txt` 13 | 14 | ## Stage 1: template generation 15 | 16 | 17 | Note: `<root>` is the path to the replication package. 18 | 19 | ### Train and inference for templates 20 | 21 | > 1. Run the following command to start the pre-training: 22 | ``` 23 | cd <root>/code/template-gen/pre-train 24 | bash runs/pre-train.sh 25 | ``` 26 | 27 | 28 | > 2. Run the following command to train a basic model: 29 | ``` 30 | cd <root>/code/template-gen/basic-train 31 | bash runs/basic-train.sh <project> 32 | ``` 33 | `<project>` is the project name in lowercase, which can be activemq, ambari, etc. 34 | 35 | > 3. Run the following command to train and generate the templates for a certain `<project>`: 36 | ``` 37 | cd <root>/code/template-gen/ast-temp 38 | bash runs/temp-gen.sh <project> 39 | ``` 40 | `<project>` should be the same as the project in step 2, and the generated templates can be found in `saved_checkpoints/pre-ast-templete/<project>`. 41 | 42 | 43 | ## Stage 2: template-based logging text generation 44 | 45 | Note: `<root>` is the path to the replication package. 46 | 47 | ### Train and inference for logging texts 48 | 49 | > 1. Run the following command to start the pre-training: 50 | ``` 51 | cd <root>/code/logging-gen/pre-train 52 | bash runs/pre-train.sh 53 | ``` 54 | 55 | > 2. Run the following command to train a basic model: 56 | ``` 57 | cd <root>/code/logging-gen/basic-train 58 | bash runs/basic-train.sh <project> 59 | ``` 60 | `<project>` is the project name in lowercase, which can be activemq, ambari, etc. 61 | 62 | > 3. Run the following command to train and generate the logging texts for a certain `<project>`: 63 | ``` 64 | cd <root>/code/logging-gen/ast-temp 65 | bash runs/log-gen.sh <project> 66 | ``` 67 | `<project>` should be the same as the project in step 2, and the generated logging texts can be found in `translations/1/<project>`. 
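Putting the two stages together, the sketch below is a minimal, illustrative driver that runs both stages for every studied project. It assumes the project-specific `runs/*.sh` scripts accept the project name as their first command-line argument and that `<root>` is the path to the replication package; the actual scripts may be parameterized differently, so check them before relying on this.
```
# Hypothetical end-to-end driver for the two LoGenText-Plus stages.
# Assumption (not guaranteed by the replication package): each project-specific
# runs/*.sh script takes the project name as its first argument.
import subprocess

# Projects with results under results/1/ (extend with the remaining studied projects).
PROJECTS = ["activemq", "ambari", "brooklyn", "camel", "cloudstack", "hadoop", "hbase"]

# (working directory relative to <root>, script, whether it takes a <project> argument)
STEPS = [
    ("code/template-gen/pre-train",   "runs/pre-train.sh",   False),
    ("code/template-gen/basic-train", "runs/basic-train.sh", True),
    ("code/template-gen/ast-temp",    "runs/temp-gen.sh",    True),
    ("code/logging-gen/pre-train",    "runs/pre-train.sh",   False),
    ("code/logging-gen/basic-train",  "runs/basic-train.sh", True),
    ("code/logging-gen/ast-temp",     "runs/log-gen.sh",     True),
]

def run_pipeline(root):
    """Run template generation (stage 1) then logging text generation (stage 2)."""
    for cwd, script, per_project in STEPS:
        projects = PROJECTS if per_project else [None]
        for project in projects:
            cmd = ["bash", script] + ([project] if project else [])
            subprocess.run(cmd, cwd=f"{root}/{cwd}", check=True)

if __name__ == "__main__":
    run_pipeline("/path/to/replication-package")  # hypothetical <root>
```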
68 | 69 | ## Results 70 | 71 | The results can be found in the `results` folder, which is organized by project. 72 | 73 | ## Data 74 | 75 | The dataset can be found in the `dataset` folder, which is organized by project. It has the following structure: 76 | ``` 77 | dataset 78 | ├── <project> 79 | │   ├── dev.code.1.templete 80 | │   ├── dev.log 81 | │   ├── dev.log.1.templete 82 | │   ├── dev.pre-ast 83 | │   ├── test.code.1.templete 84 | │   ├── test.code.gen.ast.similar.1.templete 85 | │   ├── test.log 86 | │   ├── test.log.1.templete 87 | │   ├── test.pre-ast 88 | │   ├── train.code.1.templete 89 | │   ├── train.log 90 | │   ├── train.log.1.templete 91 | │   └── train.pre-ast 92 | ``` 93 | - `<project>` is one of the studied projects, such as `activemq`. 94 | - `train/dev/test.log` are the files containing the extracted `logging texts` (the target sequences). 95 | - `train/dev/test.pre-ast` are the files containing the `ASTs` context. 96 | - `train/dev/test.code.1.templete` are the files containing `pre-log code + template from logging text in similar code`. 97 | - `train/dev/test.log.1.templete` are the files containing the template extracted from the `logging text`. 98 | - `test.code.gen.ast.similar.1.templete` is the file containing the `pre-log code + predicted template`. -------------------------------------------------------------------------------- /src/Baselines/LoGenText-Plus/requirements.txt: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | # $ conda create --name <env> --file <this file> 3 | # platform: linux-64 4 | _libgcc_mutex=0.1=main 5 | blas=1.0=mkl 6 | brotlipy=0.7.0=py38h27cfd23_1003 7 | ca-certificates=2022.4.26=h06a4308_0 8 | certifi=2022.5.18.1=py38h06a4308_0 9 | cffi=1.14.4=pypi_0 10 | charset-normalizer=2.0.4=pyhd3eb1b0_0 11 | click=7.1.2=pypi_0 12 | cryptography=37.0.1=py38h9ce1e76_0 13 | cudatoolkit=10.1.243=h6bb024c_0 14 | cycler=0.10.0=pypi_0 15 | freetype=2.10.4=h5ab3b9f_0 16 | idna=3.3=pyhd3eb1b0_0 17 | intel-openmp=2020.2=254 18 | joblib=1.0.0=pypi_0 19 | jpeg=9b=h024ee3a_2 20 | kiwisolver=1.3.1=pypi_0 21 | lcms2=2.11=h396b838_0 22 | ld_impl_linux-64=2.33.1=h53a641e_7 23 | libedit=3.1.20191231=h14c3975_1 24 | libffi=3.3=he6710b0_2 25 | libgcc-ng=9.1.0=hdf63c60_0 26 | libpng=1.6.37=hbc83047_0 27 | libprotobuf=3.19.1=h4ff587b_0 28 | libstdcxx-ng=9.1.0=hdf63c60_0 29 | libtiff=4.1.0=h2733197_1 30 | lz4-c=1.9.3=h2531618_0 31 | matplotlib=3.3.4=pypi_0 32 | mkl=2020.2=256 33 | mkl-service=2.3.0=py38he904b0f_0 34 | mkl_fft=1.2.0=py38h23d657b_0 35 | mkl_random=1.1.1=py38h0573a6f_0 36 | ncurses=6.2=he6710b0_1 37 | ninja=1.10.2=py38hff7bd54_0 38 | numpy=1.19.2=py38h54aff64_0 39 | numpy-base=1.19.2=py38hfa32c7d_0 40 | olefile=0.46=py_0 41 | openssl=1.1.1o=h7f8727e_0 42 | pillow=8.1.0=py38he98fc37_0 43 | pip=20.3.3=py38h06a4308_0 44 | portalocker=2.2.0=pypi_0 45 | protobuf=3.19.1=py38h295c915_0 46 | pycparser=2.20=pypi_0 47 | pyopenssl=22.0.0=pyhd3eb1b0_0 48 | pyparsing=2.4.7=pypi_0 49 | pysocks=1.7.1=py38h06a4308_0 50 | python=3.8.5=h7579374_1 51 | python-dateutil=2.8.1=pypi_0 52 | pytorch=1.5.1=py3.8_cuda10.1.243_cudnn7.6.3_0 53 | readline=8.1=h27cfd23_0 54 | regex=2020.11.13=pypi_0 55 | requests=2.27.1=pyhd3eb1b0_0 56 | sacrebleu=1.5.0=pypi_0 57 | sacremoses=0.0.43=pypi_0 58 | setuptools=52.0.0=py38h06a4308_0 59 | six=1.15.0=py38h06a4308_0 60 | sqlite=3.33.0=h62c20be_0 61 | subword-nmt=0.3.7=pypi_0 62 | tensorboardx=2.2=pyhd3eb1b0_0 63 | tk=8.6.10=hbc83047_0 64 | torchvision=0.6.1=py38_cu101 65 | 
tqdm=4.56.0=pypi_0 66 | urllib3=1.26.9=py38h06a4308_0 67 | wheel=0.36.2=pyhd3eb1b0_0 68 | xz=5.2.5=h7b6447c_0 69 | zlib=1.2.11=h7b6447c_3 70 | zstd=1.4.5=h9ceee32_0 71 | -------------------------------------------------------------------------------- /src/Baselines/LoGenText-Plus/results/1/activemq/translation.context.test: -------------------------------------------------------------------------------- 1 | expiring connection to zookeeper vid 2 | unsubscribing next messages 3 | async dispose interrupted 4 | stopping async topic tasks 5 | error with selector vid 6 | network connection between vid and vid has been established 7 | unknown command vid 8 | vid failed to resetting to batch 9 | virtual consumer added vid for virtual destination vid 10 | shutdown of topic traffic generator failed 11 | store commit failed 12 | failed to create persistence adapter vid 13 | destination is full vid 14 | setting optimized out of vid to vid 15 | async read check was rejected from the executor 16 | tempdest vid 17 | redelivery of vid stopped 18 | exception on forwarding to non existent temp dest 19 | vid ignoring sub vid on vid from vid is no longer active 20 | reconnected to vid 21 | detected missing corrupt journal files dropped vid messages from the index in vid seconds 22 | attempting to acquire the exclusive lock to become the master broker 23 | broker plugin vid started 24 | checkpoint failed 25 | vid flushtodisk done vid ms vid 26 | failed to fire bridge for vid 27 | stopping broker vid 28 | sasl vid handshake complete 29 | error receiving message vid this exception is ignored 30 | sendreq vid 31 | endpoint vid will not receive any messages due to broker zero error vid 32 | xa resource manager vid 33 | do not know how to process activemq command vid 34 | sending to vid messages from vid to vid 35 | vid message sent to vid 36 | zero length partial vid 37 | message not found in sequence id index vid 38 | unknown datastruction vid 39 | message received since last read check resetting flag vid 40 | sendreq vid 41 | vid could not find the object rename for vid 42 | failed to unregister mbean vid 43 | periodic checkpoint failed 44 | error unsubscribing vid from vid vid 45 | creating producer vid message vid 46 | network bridge could not be registered in jmx vid 47 | slow kahadb access journal append took vid ms index 48 | waiting vid ms before attempting to reconnect 49 | prepare of vid failed because it was marked rollback only 50 | received an exception but connection is ignored vid 51 | could not correlate the connection vid 52 | vid ms before attempting to reconnect to vid 53 | failed to get durable subscription vid 54 | unable to read persisted selector cache it will be ignored 55 | get destinations returned empty list 56 | caught an exception trying to determine if there is no flag 57 | vid usage manager memory limit reached vid producers will be throttled to the rate at vid 58 | setting durable subscriber to vid 59 | work rejected vid 60 | reusing an active session vid 61 | thread does not hold the context lock on close of vid 62 | creating producer to vid 63 | vid elapsed time in second vid s 64 | recovery mode trying to reconnect to zero 65 | could not apply query parameters vid to vid 66 | producer vid with non persistent delivery 67 | failed to call after delivery 68 | failed to register mbean vid 69 | master lock retry sleep interrupted 70 | vid ms elapsed since last write check 71 | vid remove request on vid from vid vid matching sub vid 72 | vid attempting to acquire exclusive lease to 
become the master 73 | async start of vid 74 | vid no set batch from sequence id set vid 75 | connector removed with uri vid 76 | corrupt journal record unexpected exception on journal replay of location vid 77 | apache activemq vid vid vid 78 | auto transport newconnectionexecutor didn t cleanly 79 | assigned vid to consumer vid 80 | setting topic vid to vid 81 | no queue named vid 82 | could not connect to local uri vid vid 83 | closed socket vid 84 | locker keepalive resulted in 85 | failure reason 86 | notified failover transport vid of interruption completion 87 | failed to initialize local connection for the jmsconnector 88 | timeout waiting for echo service shutdown 89 | trace entry vid 90 | failed to remove consumer on connection vid 91 | xa transaction rollback vid 92 | bridge was disposed before the first vid 93 | interrupted while redelivery 94 | unsubscribing durable journal 95 | sending to vid messages to vid 96 | removing consumer vid 97 | attempting to acquire the exclusive lock to become the master broker 98 | not adding to dlq vid to vid 99 | trying to build a pooledconnectionfactory 100 | sampler interrupted 101 | vid received message vid 102 | failed to close connection vid 103 | failed to accept accept for vid 104 | rolled back vid messages from the index in vid seconds 105 | error occured while processing vid 106 | unexpected local exception vid 107 | vid end of vid with vid 108 | master lock retry sleep interrupted 109 | message not found in sequence id index vid 110 | failed to deliver remove command for destination vid 111 | vid removed from scheduler vid 112 | installing discarding dead letter queue broker plugin dropall vid dropall vid 113 | failed to create object name to unregister vid 114 | vid vid ms elapsed since last write check 115 | failed to send mqtt subscription vid 116 | connector vid started 117 | session vid has more work to do b c of unconsumed 118 | could not transfer the template file to journal transferfile vid 119 | exception occurred for client vid vid processing vid 120 | async error occurred vid 121 | executing sql vid 122 | msg vid id vid destinationname vid 123 | failed to unregister mbean vid 124 | forcing shutdown of executorservice vid 125 | failed to prepare xaresource vid 126 | the remote exception was vid 127 | committing user vid 128 | amqp header arrived invalid version vid 129 | message expired vid 130 | error on queueing the ack compactor 131 | failed to load vid 132 | failed to unregister mbean vid 133 | vid recovered prepared vid 134 | vid ignoring destination vid restricted to vid network hops only 135 | journalled transacted acknowledge for vid at vid 136 | async exception with no exception listener vid 137 | could not preallocate journal file with zeros 138 | unable to unregister subscription vid 139 | attempting to acquire vid 140 | failed to remove scheduler vid 141 | starting a network connection between vid ms 142 | could not create transportlogger reason vid 143 | mqtt client vid connected version vid 144 | get peer broker index vid 145 | vid performance vid to vid 146 | vid ignoring destination vid restricted to vid network hops only 147 | transportloggerfactory could not be started reason vid 148 | received null command from url vid 149 | sending message to vid client vid 150 | last update vid full gc candidates set vid 151 | failed to call getplatformmbeanserver due to 152 | can t use property vid which is of type vid value 153 | policy not applied error processing object addition for addition of vid 154 | executing 
sql vid 155 | failed to write to scheduler vid 156 | rollback processing error 157 | cleanup removing the data 158 | could not connect to local uri vid vid 159 | starting network connection between vid and vid has been established 160 | failed to lookup the broker from vid 161 | vid ms elapsed and vid consumers subscribed starting dispatch 162 | waiting for outstanding responses to be properly 163 | thread using classloader vid 164 | unknown command vid 165 | stopped recover next messages 166 | vid failed to lease sleeping for vid milli s before trying again 167 | recovery replayed vid operations from the journal 168 | scope vid 169 | failed to register mbean vid 170 | exception occurred for client vid vid processing vid vid 171 | removed scheduled job vid 172 | shutting down test echo service 173 | connector not registered for uuid vid 174 | failed to send command vid 175 | connector stopped stopping proxy 176 | exception on dispatch to browser vid 177 | add exception was raised while executing the run command for oncomplete 178 | start failure exception 179 | the type vid should end with to be a valid discovery type 180 | continuation vid expired vid 181 | suppressing duplicate message send vid 182 | opening new cause 183 | no log writer available for vid 184 | starting to synchronously receive vid messages 185 | vid matching remote vid 186 | failed to unregister mbean vid 187 | load of vid 188 | running clientid vid 189 | failed to aquire lock 190 | adding destination vid 191 | restore consumer vid in pull mode pending recovery overriding prefetch vid 192 | rar vid stopped or undeployed recovery 193 | job scheduler store checkpoint complete 194 | connected to zookeeper 195 | endpoint vid failed to process message reason 196 | the type vid should end with to be a discovery type 197 | invoking start on vid 198 | policy not applied user vid does not have name attribute vid under entry vid 199 | master lock retry sleep interrupted 200 | forwarding of acks in journal file vid 201 | creating temporary file vid 202 | received_exception vid 203 | shutdown of executorservice vid is shutdown vid and terminated vid took vid 204 | async connection timeout task was rejected from the executor 205 | mqtt client vid established heart beat of vid ms vid ms grace period 206 | caught exception in mainloop 207 | exceeded redelivery with count vid ack vid 208 | ignoring consumerinfo vid from vid vid 209 | no connection attempt made in time for vid throwing inactivityioexception 210 | -------------------------------------------------------------------------------- /src/Baselines/LoGenText-Plus/results/1/activemq/translation.context.test.log: -------------------------------------------------------------------------------- 1 | Namespace(beam=8, cpu=False, data=['data-bin/context'], diverse_beam_groups=-1, diverse_beam_strength=0.5, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, gen_subset='test', lazy_load=False, left_pad_context='True', left_pad_source='True', left_pad_target='False', lenpen=2.5, log_format=None, log_interval=1000, match_source_len=False, max_context_positions=1024, max_len_a=0, max_len_b=100, max_sentences=32, max_source_positions=1024, max_target_positions=1024, max_tokens=None, memory_efficient_fp16=False, min_len=3.0, min_loss_scale=0.0001, model_overrides='{}', nbest=1, no_beamable_mm=False, no_early_stop=False, no_progress_bar=False, no_repeat_ngram_size=0, num_shards=1, num_workers=0, output='translations/1/activemq/translation.context.test.unsort', 
path='/home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/activemq/checkpoint_last.pt', prefix_size=0, print_alignment=False, quiet=True, raw_text=False, remove_bpe='@@ ', replace_unk=None, required_batch_size_multiple=8, results_path=None, sacrebleu=True, sampling=False, sampling_temperature=1, sampling_topk=-1, score_reference=False, seed=1, shard_id=0, skip_invalid_size_inputs_valid_test=True, source_lang=None, target_lang=None, task='translation_context', tensorboard_logdir='', threshold_loss_scale=None, unkpen=0, unnormalized=False, upsample_primary=1, user_dir=None) 2 | | [code] dictionary: 1080 types 3 | | [log] dictionary: 1080 types 4 | | data-bin/context test 209 examples 5 | | ['data-bin/context'] test 209 examples 6 | | loading model(s) from /home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/activemq/checkpoint_last.pt 7 | | Translated 209 sentences (2201 tokens) in 3.2s (64.63 sentences/s, 680.59 tokens/s) 8 | | Generate test with beam=8: BLEU = 26.58 46.1/28.3/21.8/18.7 (BP = 0.985 ratio = 0.985 hyp_len = 1299 ref_len = 1319) 9 | -------------------------------------------------------------------------------- /src/Baselines/LoGenText-Plus/results/1/activemq/translation.context.test.unsort: -------------------------------------------------------------------------------- 1 | 51 vid ms before attempting to reconnect to vid 2 | 144 vid performance vid to vid 3 | 128 message expired vid 4 | 105 unexpected local exception vid 5 | 31 xa resource manager vid 6 | 187 running clientid vid 7 | 79 setting topic vid to vid 8 | 92 interrupted while redelivery 9 | 138 attempting to acquire vid 10 | 4 error with selector vid 11 | 109 failed to deliver remove command for destination vid 12 | 5 network connection between vid and vid has been established 13 | 97 not adding to dlq vid to vid 14 | 26 stopping broker vid 15 | 86 failed to initialize local connection for the jmsconnector 16 | 179 continuation vid expired vid 17 | 90 xa transaction rollback vid 18 | 161 waiting for outstanding responses to be properly 19 | 201 received_exception vid 20 | 39 sendreq vid 21 | 102 failed to accept accept for vid 22 | 156 cleanup removing the data 23 | 58 work rejected vid 24 | 22 broker plugin vid started 25 | 29 sendreq vid 26 | 72 async start of vid 27 | 21 attempting to acquire the exclusive lock to become the master broker 28 | 200 creating temporary file vid 29 | 158 starting network connection between vid and vid has been established 30 | 96 attempting to acquire the exclusive lock to become the master broker 31 | 181 opening new cause 32 | 140 starting a network connection between vid ms 33 | 2 async dispose interrupted 34 | 41 failed to unregister mbean vid 35 | 59 reusing an active session vid 36 | 205 caught exception in mainloop 37 | 16 redelivery of vid stopped 38 | 111 installing discarding dead letter queue broker plugin dropall vid dropall vid 39 | 193 connected to zookeeper 40 | 185 failed to unregister mbean vid 41 | 154 failed to write to scheduler vid 42 | 114 failed to send mqtt subscription vid 43 | 54 get destinations returned empty list 44 | 99 sampler interrupted 45 | 183 starting to synchronously receive vid messages 46 | 91 bridge was disposed before the first vid 47 | 142 mqtt client vid connected version vid 48 | 13 setting optimized out of vid to vid 49 | 195 the type vid should end with to be a discovery type 50 | 153 executing sql vid 51 | 208 
no connection attempt made in time for vid throwing inactivityioexception 52 | 139 failed to remove scheduler vid 53 | 189 adding destination vid 54 | 47 waiting vid ms before attempting to reconnect 55 | 95 removing consumer vid 56 | 115 connector vid started 57 | 24 vid flushtodisk done vid ms vid 58 | 124 failed to prepare xaresource vid 59 | 71 vid attempting to acquire exclusive lease to become the master 60 | 67 failed to register mbean vid 61 | 172 connector not registered for uuid vid 62 | 57 setting durable subscriber to vid 63 | 196 invoking start on vid 64 | 113 vid vid ms elapsed since last write check 65 | 7 vid failed to resetting to batch 66 | 10 store commit failed 67 | 49 received an exception but connection is ignored vid 68 | 170 removed scheduled job vid 69 | 45 network bridge could not be registered in jmx vid 70 | 70 vid remove request on vid from vid vid matching sub vid 71 | 178 the type vid should end with to be a valid discovery type 72 | 80 no queue named vid 73 | 122 failed to unregister mbean vid 74 | 52 failed to get durable subscription vid 75 | 83 locker keepalive resulted in 76 | 42 periodic checkpoint failed 77 | 120 executing sql vid 78 | 186 load of vid 79 | 44 creating producer vid message vid 80 | 87 timeout waiting for echo service shutdown 81 | 148 sending message to vid client vid 82 | 112 failed to create object name to unregister vid 83 | 85 notified failover transport vid of interruption completion 84 | 135 async exception with no exception listener vid 85 | 174 connector stopped stopping proxy 86 | 162 thread using classloader vid 87 | 46 slow kahadb access journal append took vid ms index 88 | 125 the remote exception was vid 89 | 63 recovery mode trying to reconnect to zero 90 | 53 unable to read persisted selector cache it will be ignored 91 | 188 failed to aquire lock 92 | 101 failed to close connection vid 93 | 76 apache activemq vid vid vid 94 | 15 tempdest vid 95 | 61 creating producer to vid 96 | 12 destination is full vid 97 | 0 expiring connection to zookeeper vid 98 | 203 async connection timeout task was rejected from the executor 99 | 130 failed to load vid 100 | 146 transportloggerfactory could not be started reason vid 101 | 25 failed to fire bridge for vid 102 | 137 unable to unregister subscription vid 103 | 194 endpoint vid failed to process message reason 104 | 11 failed to create persistence adapter vid 105 | 74 connector removed with uri vid 106 | 77 auto transport newconnectionexecutor didn t cleanly 107 | 199 forwarding of acks in journal file vid 108 | 127 amqp header arrived invalid version vid 109 | 104 error occured while processing vid 110 | 147 received null command from url vid 111 | 143 get peer broker index vid 112 | 100 vid received message vid 113 | 136 could not preallocate journal file with zeros 114 | 171 shutting down test echo service 115 | 93 unsubscribing durable journal 116 | 117 could not transfer the template file to journal transferfile vid 117 | 191 rar vid stopped or undeployed recovery 118 | 176 add exception was raised while executing the run command for oncomplete 119 | 126 committing user vid 120 | 19 reconnected to vid 121 | 159 failed to lookup the broker from vid 122 | 38 message received since last read check resetting flag vid 123 | 106 vid end of vid with vid 124 | 65 producer vid with non persistent delivery 125 | 37 unknown datastruction vid 126 | 155 rollback processing error 127 | 64 could not apply query parameters vid to vid 128 | 60 thread does not hold the context lock on close of 
vid 129 | 27 sasl vid handshake complete 130 | 184 vid matching remote vid 131 | 75 corrupt journal record unexpected exception on journal replay of location vid 132 | 207 ignoring consumerinfo vid from vid vid 133 | 206 exceeded redelivery with count vid ack vid 134 | 197 policy not applied user vid does not have name attribute vid under entry vid 135 | 40 vid could not find the object rename for vid 136 | 89 failed to remove consumer on connection vid 137 | 14 async read check was rejected from the executor 138 | 182 no log writer available for vid 139 | 163 unknown command vid 140 | 150 failed to call getplatformmbeanserver due to 141 | 118 exception occurred for client vid vid processing vid 142 | 73 vid no set batch from sequence id set vid 143 | 6 unknown command vid 144 | 141 could not create transportlogger reason vid 145 | 134 journalled transacted acknowledge for vid at vid 146 | 18 vid ignoring sub vid on vid from vid is no longer active 147 | 84 failure reason 148 | 48 prepare of vid failed because it was marked rollback only 149 | 110 vid removed from scheduler vid 150 | 123 forcing shutdown of executorservice vid 151 | 55 caught an exception trying to determine if there is no flag 152 | 132 vid recovered prepared vid 153 | 168 failed to register mbean vid 154 | 173 failed to send command vid 155 | 28 error receiving message vid this exception is ignored 156 | 9 shutdown of topic traffic generator failed 157 | 169 exception occurred for client vid vid processing vid vid 158 | 34 vid message sent to vid 159 | 152 policy not applied error processing object addition for addition of vid 160 | 1 unsubscribing next messages 161 | 202 shutdown of executorservice vid is shutdown vid and terminated vid took vid 162 | 108 message not found in sequence id index vid 163 | 167 scope vid 164 | 68 master lock retry sleep interrupted 165 | 204 mqtt client vid established heart beat of vid ms vid ms grace period 166 | 198 master lock retry sleep interrupted 167 | 32 do not know how to process activemq command vid 168 | 66 failed to call after delivery 169 | 3 stopping async topic tasks 170 | 160 vid ms elapsed and vid consumers subscribed starting dispatch 171 | 157 could not connect to local uri vid vid 172 | 36 message not found in sequence id index vid 173 | 56 vid usage manager memory limit reached vid producers will be throttled to the rate at vid 174 | 107 master lock retry sleep interrupted 175 | 81 could not connect to local uri vid vid 176 | 151 can t use property vid which is of type vid value 177 | 131 failed to unregister mbean vid 178 | 166 recovery replayed vid operations from the journal 179 | 165 vid failed to lease sleeping for vid milli s before trying again 180 | 116 session vid has more work to do b c of unconsumed 181 | 149 last update vid full gc candidates set vid 182 | 78 assigned vid to consumer vid 183 | 164 stopped recover next messages 184 | 180 suppressing duplicate message send vid 185 | 69 vid ms elapsed since last write check 186 | 119 async error occurred vid 187 | 17 exception on forwarding to non existent temp dest 188 | 98 trying to build a pooledconnectionfactory 189 | 175 exception on dispatch to browser vid 190 | 82 closed socket vid 191 | 62 vid elapsed time in second vid s 192 | 190 restore consumer vid in pull mode pending recovery overriding prefetch vid 193 | 129 error on queueing the ack compactor 194 | 20 detected missing corrupt journal files dropped vid messages from the index in vid seconds 195 | 121 msg vid id vid destinationname vid 196 | 88 
trace entry vid 197 | 103 rolled back vid messages from the index in vid seconds 198 | 35 zero length partial vid 199 | 43 error unsubscribing vid from vid vid 200 | 23 checkpoint failed 201 | 145 vid ignoring destination vid restricted to vid network hops only 202 | 177 start failure exception 203 | 33 sending to vid messages from vid to vid 204 | 50 could not correlate the connection vid 205 | 94 sending to vid messages to vid 206 | 133 vid ignoring destination vid restricted to vid network hops only 207 | 192 job scheduler store checkpoint complete 208 | 30 endpoint vid will not receive any messages due to broker zero error vid 209 | 8 virtual consumer added vid for virtual destination vid 210 | -------------------------------------------------------------------------------- /src/Baselines/LoGenText-Plus/results/1/ambari/translation.context.test.log: -------------------------------------------------------------------------------- 1 | Namespace(beam=8, cpu=False, data=['data-bin/context'], diverse_beam_groups=-1, diverse_beam_strength=0.5, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, gen_subset='test', lazy_load=False, left_pad_context='True', left_pad_source='True', left_pad_target='False', lenpen=2.5, log_format=None, log_interval=1000, match_source_len=False, max_context_positions=1024, max_len_a=0, max_len_b=100, max_sentences=32, max_source_positions=1024, max_target_positions=1024, max_tokens=None, memory_efficient_fp16=False, min_len=3.0, min_loss_scale=0.0001, model_overrides='{}', nbest=1, no_beamable_mm=False, no_early_stop=False, no_progress_bar=False, no_repeat_ngram_size=0, num_shards=1, num_workers=0, output='translations/1/ambari/translation.context.test.unsort', path='/home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/ambari/checkpoint_last.pt', prefix_size=0, print_alignment=False, quiet=True, raw_text=False, remove_bpe='@@ ', replace_unk=None, required_batch_size_multiple=8, results_path=None, sacrebleu=True, sampling=False, sampling_temperature=1, sampling_topk=-1, score_reference=False, seed=1, shard_id=0, skip_invalid_size_inputs_valid_test=True, source_lang=None, target_lang=None, task='translation_context', tensorboard_logdir='', threshold_loss_scale=None, unkpen=0, unnormalized=False, upsample_primary=1, user_dir=None) 2 | | [code] dictionary: 1080 types 3 | | [log] dictionary: 1080 types 4 | | data-bin/context test 365 examples 5 | | ['data-bin/context'] test 365 examples 6 | | loading model(s) from /home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/ambari/checkpoint_last.pt 7 | | Translated 365 sentences (4566 tokens) in 6.8s (53.74 sentences/s, 672.23 tokens/s) 8 | | Generate test with beam=8: BLEU = 25.50 46.4/28.2/20.9/16.7 (BP = 0.982 ratio = 0.982 hyp_len = 2683 ref_len = 2733) 9 | -------------------------------------------------------------------------------- /src/Baselines/LoGenText-Plus/results/1/brooklyn/translation.context.test: -------------------------------------------------------------------------------- 1 | problem persisting but no longer running ignoring 2 | unable to create policy spec for vid vid 3 | management node vid in vid seconds waiting for persistence to write all data continuing 4 | unable to instantiate vid vid rethrowing vid 5 | vid invoking sensor vid on vid with vid 6 | vid no port available for vid empty range vid 7 | vid rethrowing 8 | user vid not authorized to 
see sensor vid of entity vid excluding from current state results 9 | determined reachability of sockets vid vid 10 | vid health check for vid component continuing recovering vid 11 | unable again to find details of location vid in rest call to list ignoring location vid 12 | multiple ambiguous definitions for config key vid on vid with vid 13 | authentication successful vid 14 | retrieving java url vid from vid 15 | publishing management node health vid 16 | first server up in vid is vid 17 | using first reachable address vid for node vid in vid 18 | unable to instantiate vid vid 19 | cancelled vid tasks for vid vid 20 | can t calculate percentage value for entity vid as total from producer vid is zero 21 | destroying app vid mgmt is vid 22 | cannot set key vid on vid from flag vid containing class is not configurable 23 | error resizing but no longer running vid 24 | failed to resolve aws hostname of vid rethrowing 25 | parsing values for vid at vid vid 26 | invoking effector vid on vid with args vid 27 | vid recording removal of container vid 28 | vid redundant call to start vid skipping vid 29 | for vid considering members vid 30 | getrequiredopenports detected at vid vid 31 | credentials have no effect in builder unless uri for host is specified 32 | missing icon data for vid expected at vid already logged warn and error details 33 | checkpointing delta of memento with references updating vid entities vid locations vid policies vid enrichers vid catalog items vid bundles removing vid 34 | vid n plan being added is n vid n plan already present is n vid 35 | configuration error vid 36 | primary node vid is deprecated use vid instead use vid 37 | deprecated use of managementcontext for unmanaged vid ignoring 38 | loaded java type vid for vid vid but had errors vid 39 | installing vid from vid on vid 40 | missing catalog item for vid vid inferring as vid because that is able to load the item 41 | reconfiguring vid config file for vid because vid is not on vid 42 | conflicting value for key vid from deprecated name vid using earlier deprecated name vid 43 | deserializing the non static class vid with multiple outer class fields vid when changing compilers it s possible that the instance won t be able to be deserialized due to changed outer class field names in those cases deserialization could fail with field not found exception or class cast exception following this log line 44 | item vid cannot be moved skipping 45 | launched brooklyn vid 46 | replacing in vid member vid with old address vid new address vid 47 | adding to vid vid appears identical to existing vid may get removed on rebind underlying addition should be modified so it is not added twice 48 | error adding brooklyn properties for vid vid 49 | error copying customize resources 50 | problem polling for async script on vid for vid continuing 51 | resize vid from vid to vid 52 | tmpdirfinder candidate tmp dir vid cannot have files created inside it vid 53 | loaded rebind raw data took vid vid entities vid locations vid policies vid enrichers vid feeds vid catalog items vid bundles from vid 54 | changing hostname recorded against public ip vid from vid 55 | executing vid failed with class vid 56 | using ssh tool vid of type vid props 57 | subsequent error during termination vid 58 | failed rest deployment launching vid vid 59 | rebindriver for vid is not transforming machine location so not generating machine vid vid 60 | management node vid in vid new plane unable to promote to vid currently vid see log for vid 61 | error 
destroying vid ignoring vid 62 | detail on failure to deploy webapp vid 63 | policy vid balancing finished at cold node vid workrate number no way to improve it 64 | problem persisting change delta rethrowing 65 | group vid got new member vid 66 | vid recording metric update for item vid 67 | brooklynsecurityproviderfilterjavax dofilter caught vid 68 | problem in ha poller but no longer running vid 69 | cannot request read only mode for vid when already running vid ignoring 70 | rebindmanager instantiate vid rethrowing vid 71 | vid check for vid continuing failing vid 72 | cannot get hostname bug with string vid for vid ignoring 73 | failed to set permissions to vid for file vid 74 | fallback super realclass vid attempt failed orig class vid vid 75 | location vid added to vid 76 | success following serialized for vid vid 77 | running shell command at vid vid 78 | discouraged use of brooklyn properties deprecated use vid instead use vid 79 | error calculating and setting combination for enricher vid 80 | cassandra nics inferred ip vid for vid 81 | policy vid balancing finished at cold node vid workrate number no way to improve it 82 | jclouds using template vid options vid to provision machine in vid 83 | initiating replica set with vid 84 | deprecated use of brooklyn custom brooklyn properties for vid 85 | vid publishing failed state vid currentfailurestarttime vid now vid 86 | skipping configuration of non ec2 computeservice vid 87 | rebinding entity vid even though actual state is vid expected state is vid 88 | starting entity vid at vid 89 | no portforwardmanager using legacy vid 90 | geodns inferred geoinfo vid from hostname vid 91 | deprecated use of scanjavaannotations instead use of vid version syntax in future versions to load vid 92 | error rebinding brooklyn web console rebinding 93 | seeds considered stable for cluster vid node vid 94 | expected to find two security groups on node vid in app vid one shared one unique found vid vid 95 | queued task vid rethrowing vid 96 | error forcing brooklyn gc usage now vid 97 | vid adding children to vid n vid 98 | item vid cannot be moved skipping 99 | unable to create from archive returning vid 100 | resolution of vid failed swallowing and returning vid 101 | queued task vid of vid no longer running vid 102 | disconnecting sshjtool vid vid 103 | brooklyn geo info lookup failed for vid 104 | cors brooklyn fee disabled 105 | context entity found by looking at target vid entity tag not context entity 106 | multiple definitions for effector vid on vid ignoring vid 107 | network facing enricher not transforming vid uri vid because no port in target vid for vid 108 | copying chunk vid to vid on vid 109 | bundle vid containing bom is not managed by brooklyn using legacy item installation 110 | deprecated use of name key to define vid version should be specified within id key or with version key not this tag 111 | vid ports not applicable or not yet applicable because has multiple locations vid ignoring 112 | invoking vid on vid in vid 113 | can t infer catalog item id from the following plan n vid 114 | uninstalling bundle vid from brooklyn ui module bundle location vid 115 | members of vid checking vid eliminating because not member 116 | vid added to machine vid of location vid vid 117 | error stopping child continuing and will rethrow if no other errors 118 | multiple definitions for config key vid on vid from vid and vid preferring lower vid value vid 119 | cancelling vid mode vid on vid 120 | uninstalling bundle vid from brooklyn managed 
bundle vid n vid 121 | failed to unmanage entity vid and its descendants after failure to initialise rethrowing original exception 122 | could not determine canonical name of file vid returning original file 123 | no maven resource file vid available 124 | vid clearing ssh for vid 125 | scheduling item for persistence addition vid 126 | error computing geo info for vid internet issues or too many requests to free servers for vid subsequent errors for vid 127 | network facing enricher not transforming vid uri vid because no port mapping for vid 128 | failed to set permissions to vid for file vid expected behaviour on windows vid subsequent failures on any file will be logged at trace 129 | trace for quarantine group vid failed to start entity vid removing vid 130 | osgi could not find bundle vid in search after installing it from vid 131 | two masters detected probably a handover just occured vid 132 | launching vid members of vid now vid 133 | installing image regex to vid for vid 134 | flagutils for vid setting field vid val vid newval vid key vid 135 | vid undeploying vid vid on vid 136 | running command at vid vid 137 | vid recording addition of container vid 138 | brooklynsecurityproviderfilterjavax start 139 | theoretical best primary at vid vid maybe others not available using next best vid 140 | formula configured vid 141 | error creating uri for vid rethrowing vid 142 | validation done in vid 143 | vid scheduling but no longer running vid 144 | members of vid checking vid eliminating because not up 145 | resource vid type vid deployed to vid 146 | cannot notifyofinitialvalue for subscription with value vid 147 | creating customizing vid for vid 148 | create shell command at vid 149 | no reachable address vid feed from vid to vid 150 | activating local management for vid on start 151 | sethostnamecustomizer ignoring machine vid in vid 152 | while starting vid obtained new location instance vid 153 | managing vid in mode vid doing this recursively because a child is preregistered 154 | problem setting application lifecycle usage event vid vid 155 | vid closing pool for vid 156 | autodeployment in parent s management context triggered for vid vid will not be supported in future explicit manage call required 157 | child spec vid is already set with parent vid how did this happen 158 | found existing shared security group in vid for app vid vid 159 | found namespace vid returning it 160 | skipping ssh check for vid vid due to config waitforconnectable vid 161 | failed transfer vid to vid retryable error attempt vid vid vid 162 | error stopping brooklynweb console rethrowing 163 | starting entity vid at vid 164 | could not register external ui module vid vid 165 | service vid could not be parsed at vid vid 166 | discouraged deprecated use of static annotated effector method vid defined in vid 167 | unable again to find details of location vid in rest call to list ignoring location vid 168 | vid pre start management of entity vid mode vid 169 | releasing machine vid in vid instance id vid 170 | problem releasing machine vid propagating after vid vid 171 | this management node vid supposed to be master but reportedly unhealthy no op as expect other node to fix self vid 172 | rebind entity vid no longer running vid 173 | fallback loadclass vid attempt failed orig class vid vid 174 | configuring brooklynnode entity startup 175 | no location has been set on vid cannot configure security groups in context vid 176 | sequence for vid incremented to vid 177 | updating brooklyn properties from vid 
178 | jmx jar for vid is not a valid jmx on vid because no jmx 179 | deprecated automatic coercion of object to timeduration set breakpoint in typecoercions to inspect convert to duration 180 | referenced task for vid vid 181 | adding auto generated user vid vid in vid 182 | suspending machine vid in vid instance id vid 183 | forcing catalog load on access of catalog items 184 | misconfiguration for vid sslconfig vid but no https_port on vid 185 | vid detected item removal on change of vid 186 | rescheduling addition of shard vid because add failed via router vid 187 | destroyed and unmanaged vid mgmt now vid managed vid 188 | problem deleting temporary files of async script on vid ignoring 189 | stopped read only vid mgmt vid 190 | vault response code vid vid 191 | problem terminiating management node state listeners continuing 192 | removing from vid member vid with old address vid because inferred address is now null 193 | machine details for vid missing from jclouds using ssh test instead name vid version vid arch vid ram vid cpus vid 194 | formula configured vid 195 | started brooklyn rest server at vid vid 196 | geodns vid refreshing vid 197 | policy vid detected vid should be on vid but can t move it vid 198 | unable to instantiate vid rethrowing vid 199 | system bundles are vid 200 | error in enricher vid but no longer running vid 201 | creating brooklyn local copy of bundle file vid 202 | vid resizing vid from vid to vid vid 203 | discouraged deprecated use of brooklynproperties for vid instead vid 204 | brooklyn gc deleted vid tasks as was over global limit now have vid 205 | custom password rebind for vid vid 206 | error launching brooklyn items from node vid ignoring vid 207 | ignoring failed execution of task callback hook vid because executor is shutdown 208 | failed to resolve aws hostname of vid vid 209 | installing vid with exit code vid 210 | isfirstnodeset but no cluster members found to add vid 211 | cannot store location lifecycle usage event for vid state vid because storage not available 212 | looking up vid in osgi 213 | standard location resolvers not installed location resolution will fail shortly 214 | restarting entity vid in vid machine vid 215 | done vid checkentity vid 216 | unable to delete one or more paths vid on shutdown vid 217 | launching vid with role vid and source of attempt to vid with role vid and vid but no unmanaged 218 | parent not found discarding following original ring for vid 219 | loading initial catalog from vid 220 | vid invoking effector on vid effector vid parameters vid 221 | queueing update needed task for vid update will occur shortly 222 | adding startup script to enable winrm for windows vm on vid 223 | brooklyn thought it was already managing bundle vid but it s not installed to framework 224 | vid invoking effector vid on vid with vid which is the target vid 225 | error running mongodb script vid at vid 226 | creating zookeeper using custom spec for vid 227 | repeating problem vid but no longer active ignoring 228 | releasing machine vid in vid instance id vid ignoring and continuing vid vid 229 | deleting temporary token for vid with version vid 230 | invalid item in catalog when converting rest catalog item type vid 231 | deletion of orphan state found unusually referenced feeds keeping vid 232 | looking up external classpath for vid 233 | vid calculated desired pool size vid from vid to vid 234 | error launching brooklyn vid 235 | unable to re connect to jmx url vid vid 236 | problem notifying listener vid of vid 237 | vm vid 
connection succeeded after vid on vid 238 | tmpdirfinder candidate tmp dir vid cannot have files created inside it vid 239 | error recording monitor info vid 240 | ignoring flag open_iptables on non ssh location vid 241 | task vid was modified but modification was never used 242 | long poll retrieving status directly received exit status will retry on vid for vid 243 | vid picking up vid as the tracker already set often due to rebind 244 | multiple definitions for effector vid on vid preferring lower vid to vid 245 | deprecated use of entities startmanagement application managementcontext for vid ignoring vid 246 | vid set on vid but pollforfirstreachableaddress vid 247 | use of groovy lang closure is deprecated in basicsubscriptioncontext subscribe 248 | restarting brooklyn machine in vid instance id vid 249 | theoretical best primary at vid vid maybe others not available at vid 250 | ignoring deprecated flag open_iptables on windows location vid 251 | error polling for vid command vid 252 | knifeportuseknifedefault specified to vid when already told to use vid explicitly overriding previous see subsequent warning for more details 253 | vid recording pool size vid for vid 254 | use of groovy lang closure is deprecated in type vid 255 | catalog does not contain item for type vid loaded class directly instead 256 | for vid considering membership of vid which is in locations vid 257 | looking for vid in revised location vid 258 | delaying vid vid allowed vid elapsed then rechecking for vid ms 259 | rest request running as vid threw vid 260 | mysampleimpl init with config vid 261 | localhost obtainport vid returning vid 262 | fabric vid updating seeds chosen vid potential vid 263 | geodns including vid even though vid is a private subnet homeless ential vid 264 | ignoring userdatastring vid in vm creation because not supported for cloud type vid 265 | management node vid detected master change required newmaster vid oldmaster vid plane vid heartbeattimeout vid 266 | vid can t configure resolver at vid no sshmachines 267 | brooklyn management context for vid vid 268 | rebinding addition of memento vid vid 269 | starting entity vid at vid 270 | cancelled vid tasks for vid with vid remaining of vid vid 271 | enricher vid transforming vid to vid 272 | had to wait vid for vid vid to be true before setting vid 273 | resizing vid to vid proxy vid of vid 274 | change handler should be hidden by event handler trace for unexpected mongo node handler 275 | bundle vid matches metadata of managed bundle vid but not osgi bundle location vid and matches already installed osgi bundle is no op 276 | ignoring mode vid in favour of port for management candidates of vid vid 277 | unexpected structure for state module vid skipping vid vid 278 | queued task vid at context vid no hierarchy 279 | effector vid defined on vid has no body invoking caller supplied vid instead 280 | ambiguous spec supertypes vid for target vid it is recommended that any registered type constraint for a spec be compatible with the sions 281 | restart of vid requested be applied at machine level 282 | -------------------------------------------------------------------------------- /src/Baselines/LoGenText-Plus/results/1/brooklyn/translation.context.test.log: -------------------------------------------------------------------------------- 1 | Namespace(beam=8, cpu=False, data=['data-bin/context'], diverse_beam_groups=-1, diverse_beam_strength=0.5, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, 
gen_subset='test', lazy_load=False, left_pad_context='True', left_pad_source='True', left_pad_target='False', lenpen=2.5, log_format=None, log_interval=1000, match_source_len=False, max_context_positions=1024, max_len_a=0, max_len_b=100, max_sentences=32, max_source_positions=1024, max_target_positions=1024, max_tokens=None, memory_efficient_fp16=False, min_len=3.0, min_loss_scale=0.0001, model_overrides='{}', nbest=1, no_beamable_mm=False, no_early_stop=False, no_progress_bar=False, no_repeat_ngram_size=0, num_shards=1, num_workers=0, output='translations/1/brooklyn/translation.context.test.unsort', path='/home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/brooklyn/checkpoint_last.pt', prefix_size=0, print_alignment=False, quiet=True, raw_text=False, remove_bpe='@@ ', replace_unk=None, required_batch_size_multiple=8, results_path=None, sacrebleu=True, sampling=False, sampling_temperature=1, sampling_topk=-1, score_reference=False, seed=1, shard_id=0, skip_invalid_size_inputs_valid_test=True, source_lang=None, target_lang=None, task='translation_context', tensorboard_logdir='', threshold_loss_scale=None, unkpen=0, unnormalized=False, upsample_primary=1, user_dir=None) 2 | | [code] dictionary: 1080 types 3 | | [log] dictionary: 1080 types 4 | | data-bin/context test 281 examples 5 | | ['data-bin/context'] test 281 examples 6 | | loading model(s) from /home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/brooklyn/checkpoint_last.pt 7 | | Translated 281 sentences (4829 tokens) in 7.5s (37.59 sentences/s, 646.06 tokens/s) 8 | | Generate test with beam=8: BLEU = 31.22 51.6/32.5/25.6/22.1 (BP = 1.000 ratio = 1.014 hyp_len = 2680 ref_len = 2642) 9 | -------------------------------------------------------------------------------- /src/Baselines/LoGenText-Plus/results/1/camel/translation.context.test.log: -------------------------------------------------------------------------------- 1 | Namespace(beam=8, cpu=False, data=['data-bin/context'], diverse_beam_groups=-1, diverse_beam_strength=0.5, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, gen_subset='test', lazy_load=False, left_pad_context='True', left_pad_source='True', left_pad_target='False', lenpen=2.5, log_format=None, log_interval=1000, match_source_len=False, max_context_positions=1024, max_len_a=0, max_len_b=100, max_sentences=32, max_source_positions=1024, max_target_positions=1024, max_tokens=None, memory_efficient_fp16=False, min_len=3.0, min_loss_scale=0.0001, model_overrides='{}', nbest=1, no_beamable_mm=False, no_early_stop=False, no_progress_bar=False, no_repeat_ngram_size=0, num_shards=1, num_workers=0, output='translations/1/camel/translation.context.test.unsort', path='/home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/camel/checkpoint_last.pt', prefix_size=0, print_alignment=False, quiet=True, raw_text=False, remove_bpe='@@ ', replace_unk=None, required_batch_size_multiple=8, results_path=None, sacrebleu=True, sampling=False, sampling_temperature=1, sampling_topk=-1, score_reference=False, seed=1, shard_id=0, skip_invalid_size_inputs_valid_test=True, source_lang=None, target_lang=None, task='translation_context', tensorboard_logdir='', threshold_loss_scale=None, unkpen=0, unnormalized=False, upsample_primary=1, user_dir=None) 2 | | [code] dictionary: 1080 types 3 | | [log] dictionary: 1080 types 4 | | 
data-bin/context test 637 examples 5 | | ['data-bin/context'] test 637 examples 6 | | loading model(s) from /home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/camel/checkpoint_last.pt 7 | | Translated 637 sentences (7331 tokens) in 9.5s (67.14 sentences/s, 772.64 tokens/s) 8 | | Generate test with beam=8: BLEU = 40.05 59.9/45.1/39.6/37.3 (BP = 0.896 ratio = 0.901 hyp_len = 4093 ref_len = 4543) 9 | -------------------------------------------------------------------------------- /src/Baselines/LoGenText-Plus/results/1/cloudstack/translation.context.test.log: -------------------------------------------------------------------------------- 1 | Namespace(beam=8, cpu=False, data=['data-bin/context'], diverse_beam_groups=-1, diverse_beam_strength=0.5, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, gen_subset='test', lazy_load=False, left_pad_context='True', left_pad_source='True', left_pad_target='False', lenpen=2.5, log_format=None, log_interval=1000, match_source_len=False, max_context_positions=1024, max_len_a=0, max_len_b=100, max_sentences=32, max_source_positions=1024, max_target_positions=1024, max_tokens=None, memory_efficient_fp16=False, min_len=3.0, min_loss_scale=0.0001, model_overrides='{}', nbest=1, no_beamable_mm=False, no_early_stop=False, no_progress_bar=False, no_repeat_ngram_size=0, num_shards=1, num_workers=0, output='translations/1/cloudstack/translation.context.test.unsort', path='/home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/cloudstack/checkpoint_last.pt', prefix_size=0, print_alignment=False, quiet=True, raw_text=False, remove_bpe='@@ ', replace_unk=None, required_batch_size_multiple=8, results_path=None, sacrebleu=True, sampling=False, sampling_temperature=1, sampling_topk=-1, score_reference=False, seed=1, shard_id=0, skip_invalid_size_inputs_valid_test=True, source_lang=None, target_lang=None, task='translation_context', tensorboard_logdir='', threshold_loss_scale=None, unkpen=0, unnormalized=False, upsample_primary=1, user_dir=None) 2 | | [code] dictionary: 1080 types 3 | | [log] dictionary: 1080 types 4 | | data-bin/context test 1061 examples 5 | | ['data-bin/context'] test 1061 examples 6 | | loading model(s) from /home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/cloudstack/checkpoint_last.pt 7 | | Translated 1061 sentences (13432 tokens) in 20.5s (51.64 sentences/s, 653.81 tokens/s) 8 | | Generate test with beam=8: BLEU = 34.95 53.6/38.9/31.9/27.8 (BP = 0.948 ratio = 0.949 hyp_len = 8344 ref_len = 8789) 9 | -------------------------------------------------------------------------------- /src/Baselines/LoGenText-Plus/results/1/hadoop/translation.context.test.log: -------------------------------------------------------------------------------- 1 | Namespace(beam=8, cpu=False, data=['data-bin/context'], diverse_beam_groups=-1, diverse_beam_strength=0.5, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, gen_subset='test', lazy_load=False, left_pad_context='True', left_pad_source='True', left_pad_target='False', lenpen=2.5, log_format=None, log_interval=1000, match_source_len=False, max_context_positions=1024, max_len_a=0, max_len_b=100, max_sentences=32, max_source_positions=1024, max_target_positions=1024, max_tokens=None, memory_efficient_fp16=False, min_len=3.0, min_loss_scale=0.0001, 
model_overrides='{}', nbest=1, no_beamable_mm=False, no_early_stop=False, no_progress_bar=False, no_repeat_ngram_size=0, num_shards=1, num_workers=0, output='translations/1/hadoop/translation.context.test.unsort', path='/home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/hadoop/checkpoint_last.pt', prefix_size=0, print_alignment=False, quiet=True, raw_text=False, remove_bpe='@@ ', replace_unk=None, required_batch_size_multiple=8, results_path=None, sacrebleu=True, sampling=False, sampling_temperature=1, sampling_topk=-1, score_reference=False, seed=1, shard_id=0, skip_invalid_size_inputs_valid_test=True, source_lang=None, target_lang=None, task='translation_context', tensorboard_logdir='', threshold_loss_scale=None, unkpen=0, unnormalized=False, upsample_primary=1, user_dir=None) 2 | | [code] dictionary: 1080 types 3 | | [log] dictionary: 1080 types 4 | | data-bin/context test 1127 examples 5 | | ['data-bin/context'] test 1127 examples 6 | | loading model(s) from /home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/hadoop/checkpoint_last.pt 7 | | Translated 1127 sentences (13134 tokens) in 19.2s (58.77 sentences/s, 684.85 tokens/s) 8 | | Generate test with beam=8: BLEU = 23.79 46.1/28.0/22.0/19.1 (BP = 0.877 ratio = 0.884 hyp_len = 7660 ref_len = 8664) 9 | -------------------------------------------------------------------------------- /src/Baselines/LoGenText-Plus/results/1/hbase/translation.context.test.log: -------------------------------------------------------------------------------- 1 | Namespace(beam=8, cpu=False, data=['data-bin/context'], diverse_beam_groups=-1, diverse_beam_strength=0.5, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, gen_subset='test', lazy_load=False, left_pad_context='True', left_pad_source='True', left_pad_target='False', lenpen=2.5, log_format=None, log_interval=1000, match_source_len=False, max_context_positions=1024, max_len_a=0, max_len_b=100, max_sentences=32, max_source_positions=1024, max_target_positions=1024, max_tokens=None, memory_efficient_fp16=False, min_len=3.0, min_loss_scale=0.0001, model_overrides='{}', nbest=1, no_beamable_mm=False, no_early_stop=False, no_progress_bar=False, no_repeat_ngram_size=0, num_shards=1, num_workers=0, output='translations/1/hbase/translation.context.test.unsort', path='/home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/hbase/checkpoint_last.pt', prefix_size=0, print_alignment=False, quiet=True, raw_text=False, remove_bpe='@@ ', replace_unk=None, required_batch_size_multiple=8, results_path=None, sacrebleu=True, sampling=False, sampling_temperature=1, sampling_topk=-1, score_reference=False, seed=1, shard_id=0, skip_invalid_size_inputs_valid_test=True, source_lang=None, target_lang=None, task='translation_context', tensorboard_logdir='', threshold_loss_scale=None, unkpen=0, unnormalized=False, upsample_primary=1, user_dir=None) 2 | | [code] dictionary: 1080 types 3 | | [log] dictionary: 1080 types 4 | | data-bin/context test 507 examples 5 | | ['data-bin/context'] test 507 examples 6 | | loading model(s) from /home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/hbase/checkpoint_last.pt 7 | | Translated 507 sentences (5988 tokens) in 9.1s (55.81 sentences/s, 659.12 tokens/s) 8 | | Generate test with beam=8: BLEU = 23.73 
45.2/27.9/21.7/17.7 (BP = 0.899 ratio = 0.904 hyp_len = 3583 ref_len = 3964) 9 | -------------------------------------------------------------------------------- /src/Baselines/LoGenText-Plus/results/1/hive/translation.context.test.log: -------------------------------------------------------------------------------- 1 | Namespace(beam=8, cpu=False, data=['data-bin/context'], diverse_beam_groups=-1, diverse_beam_strength=0.5, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, gen_subset='test', lazy_load=False, left_pad_context='True', left_pad_source='True', left_pad_target='False', lenpen=2.5, log_format=None, log_interval=1000, match_source_len=False, max_context_positions=1024, max_len_a=0, max_len_b=100, max_sentences=32, max_source_positions=1024, max_target_positions=1024, max_tokens=None, memory_efficient_fp16=False, min_len=3.0, min_loss_scale=0.0001, model_overrides='{}', nbest=1, no_beamable_mm=False, no_early_stop=False, no_progress_bar=False, no_repeat_ngram_size=0, num_shards=1, num_workers=0, output='translations/1/hive/translation.context.test.unsort', path='/home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/hive/checkpoint_last.pt', prefix_size=0, print_alignment=False, quiet=True, raw_text=False, remove_bpe='@@ ', replace_unk=None, required_batch_size_multiple=8, results_path=None, sacrebleu=True, sampling=False, sampling_temperature=1, sampling_topk=-1, score_reference=False, seed=1, shard_id=0, skip_invalid_size_inputs_valid_test=True, source_lang=None, target_lang=None, task='translation_context', tensorboard_logdir='', threshold_loss_scale=None, unkpen=0, unnormalized=False, upsample_primary=1, user_dir=None) 2 | | [code] dictionary: 1080 types 3 | | [log] dictionary: 1080 types 4 | | data-bin/context test 629 examples 5 | | ['data-bin/context'] test 629 examples 6 | | loading model(s) from /home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/hive/checkpoint_last.pt 7 | | Translated 629 sentences (6861 tokens) in 10.8s (58.47 sentences/s, 637.76 tokens/s) 8 | | Generate test with beam=8: BLEU = 30.25 48.9/33.0/27.9/24.4 (BP = 0.934 ratio = 0.936 hyp_len = 3898 ref_len = 4163) 9 | -------------------------------------------------------------------------------- /src/Baselines/LoGenText-Plus/results/1/ignite/translation.context.test: -------------------------------------------------------------------------------- 1 | received session attribute request nodeid vid msg vid 2 | stopping spi vid 3 | overriding partition map in full update map exchid vid curpart vid newpart vid 4 | received session attribute request message msg vid nodeid vid 5 | ipfinder 6 | cassandra session refreshed 7 | will move session to less loaded worker ses vid from vid to vid 8 | baseline won t be changed cause the lost partitions were detected 9 | partition changed state grp vid p vid prev vid to vid 10 | got removed entry in lockasync method will retry vid 11 | node connection is idle but there are unacknowledged messages will wait vid 12 | refresh partitions due to topology update 13 | starting loading model by the path vid 14 | preserving deployment without node participants vid 15 | xa resource start xid vid xid vid 16 | obsolete version was not set because lock was explicit vid 17 | failed to connect to ignite update server vid 18 | generated node_joined bulk event nodecnt vid evtnode vid 19 | mvcc coordinator issued topology version 
for service vid fut vid 20 | processing node departure vid 21 | refresh partitions due to mapping was changed 22 | received job execution request while stopping will ignore vid 23 | failed to add entry err vid entry vid 24 | sent peer class loading response to node node does not exist vid 25 | attempt to execute cassandra batch vid operation to process rest vid of vid elements 26 | failed to find future for dht finish response txid vid node vid res vid 27 | acquired deployment class from local cache vid 28 | updating full partition map grp vid exchver vid fullmap vid 29 | communication problem resolver detected job cancelled nodeid vid 30 | failed to get future result 31 | got removed entry in transaction getallasync will retry vid 32 | restored near prepare from node vid 33 | async response resp vid 34 | skipped discovery notification node vid type vid topver vid 35 | ignore communication error resolve message resolve process already started sndnode vid 36 | ignore communication error resolver forced nodes stop reqid vid locnode vid 37 | failed to add candidates because entry was removed will renew 38 | maxconntimeout 39 | cleared invalid entry from remote transaction will skip entry vid tx vid 40 | vid xid version uuid vid 41 | store remove key vid tx vid 42 | received dht finish response txid vid dhttxid vid node vid 43 | removed mapping for node nodeid vid tx vid 44 | offheap remove key vid 45 | cassandra session refreshed 46 | committed from tm locnodeid vid tx vid 47 | partition map after beforeexchange grp vid exchid vid fullmap vid 48 | merge exchange future exchfut vid mergedfut vid evt vid evtnode vid evtnodeclient vid 49 | successfully locked persistence storage folder 50 | received near lock response for unknown future txid vid node vid 51 | failed to find future for get response sender vid res vid 52 | failed to find client message worker clientnode vid 53 | shmemport 54 | coordinator failed node is new coordinator ver vid 55 | found unacknowledged batch for left node nodeid vid fut vid 56 | restored partition state from wal grp vid p vid state vid updcntr vid 57 | deactivate page store manager id vid topver vid 58 | updated cache entry val vid old vid entry vid 59 | failed to send initial demand request to node 60 | deployment cannot be reused random class could not be loaded from sender node dep vid meta vid 61 | get affinity from cache vid key vid val vid 62 | failed to unswap entry 63 | scanner processor started 64 | boot class path vid 65 | failed to unlock key all partition nodes left the grid 66 | vid view caches 67 | failed to find class probably due to task job cancellation name vid err vid 68 | vid used cache groups id to name vid 69 | vid has been interrupted 70 | node is stopped or lock is broken in non failover safe mode aborting transaction 71 | injecting cache store session 72 | vid view information in a cluster 73 | message has been sent to next node msg vid next vid 74 | got removed entry while updating near value will retry vid 75 | awscredentials 76 | stopped closure processor 77 | message has been sent to node nodeid vid msg vid 78 | started moving ses vid 79 | failed to find class protocol vid 80 | injected task resources continuous query vid 81 | use vid option to disable it 82 | i am modified job_1 vid on vid 83 | cassandra table vid cause appropriate keyspace doesn t exist 84 | delete entries from db cache vid keytype vid cnt vid 85 | can t initialize query string vid 86 | finished range check range vid pos vid 87 | vid truststore_type vid 88 | cleaner 
has been cancelled 89 | received remove lock request for removed entry will retry entry vid req vid 90 | failed to send partition update to node left the grid 91 | received metrics update message from unknown node vid 92 | after vid release vid 93 | unregistering mbean vid 94 | error when polling event queue 95 | received duplicate continuous query message vid 96 | received schema propose discovery message but cache is statically configured and vid flag is set will report error opid vid msg vid 97 | sent peer class loading request node vid req vid 98 | discarding node add finished message join process is not finished vid 99 | removed message set due to node leaving grid vid 100 | encrypted data status vid handshakestaus vid ses vid 101 | received near prepare from node that left txid vid node vid 102 | partition has been scheduled for eviction this node is oldest non affinity node grp vid p vid prevstate vid 103 | failed to send tx update response node left msg vid node vid 104 | failed to send dht finish response node left txid vid dhttxid vid node vid 105 | discarding node added message with empty topology vid 106 | failed to send message to node msg vid err vid 107 | waiting for handshake buffer vid 108 | coordinator received single message ver vid node vid allreceived vid 109 | closing connection locnodeid vid rmtaddr vid rmtport vid 110 | finished executing job processor onkernalstop callback 111 | baseline won t be changed in topology 112 | failed to notify exchange future callback for exchange future vid 113 | rolling back ignite transaction vid 114 | opened input stream path vid delegate vid 115 | failed to cancel service ignoring name vid execid vid 116 | new resources vid 117 | i am modified job_1 vid on vid 118 | discarding reconnect message reconnect is completed vid 119 | failed to acquire lock with negative node vid 120 | flushing shuffle messages before sending task completion notification taskinfo vid state vid err vid 121 | skipping global authentication for node security credentials not found probably due to coordinator has older version nodeid vid addrs vid 122 | stealing job to a new node newnode vid oldnode vid sesid vid job vid jobctx vid task vid 123 | partition states after afterexchange grp vid exchver vid 124 | failed to find count down latch with worker vid 125 | cleared invalid entry from remote transaction will skip entry vid tx vid 126 | injecting cache store session vid 127 | abandoning re map because future is done vid 128 | partition map before afterexchange exchid vid fullmap vid 129 | unexpected response to join request vid 130 | sent cache message msg vid node vid 131 | received unexpected response to join request vid 132 | jdbc drivers folder has no files returning empty list 133 | transaction was not found in nodes 134 | vid label vid 135 | message is ignored as it came for the closed topic vid 136 | invalid transaction state for rollback state vid tx vid 137 | completed fragmentizer coordinator remote node vid 138 | finished running ssl engine tasks handshakestatus vid 139 | duplicate initialize process request received will ignore vid 140 | closing socket to next not sent vid 141 | ipc io stopping as unused vid 142 | non loopback local ips vid 143 | failed to restore closed connection reconnect networktimeout vid jointimeout vid 144 | discarding metrics update message issued by node node is no more coordinator vid 145 | ignoring backup element row vid cachemode vid incbackups vid primary vid 146 | tuple id vid from storm vid 147 | skipping own 
directory vid 148 | received near prepare response txid vid node vid 149 | failed during partition counters delivery to remote node left cluster will ignore futid vid node vid 150 | got removed entry while updating will retry vid 151 | mqtt grid vid 152 | i am modified job_1 vid on vid 153 | received communication error resolve request nodeid vid req vid 154 | application vid is vid 155 | failed to send multicast address request will retry in ms vid 156 | runtime error caught during initial demand request sending 157 | added new daemon node to topology vid 158 | failed to send checkpoint message to node msg vid err vid 159 | discarding killed join vid 160 | closing zookeeper ip finder 161 | failed to send verified node left message to node msg vid 162 | node left topology vid 163 | unknown connection detected is some other software connecting to this ignite port vid connection vid rmtaddr vid 164 | removing left node from full map update grp vid nodeid vid partmap vid 165 | sent job request client disconnected node vid taskname vid 166 | dht lock fut failed to send request txid vid dhttxid vid intx vid node vid 167 | prepared statement cluster error detected another thread already first 168 | ignite node is in invalid state due to a critical failure 169 | partition map after beforeexchange grp vid exchid vid fullmap vid 170 | skipping deployment check as remote node does not have required class vid 171 | timed out waiting for lock response vid 172 | vid node id vid 173 | received job cancel stopped callback 174 | received onundeploy request ldriver vid 175 | initialized alive zookeeper ip finder vid 176 | added invalid partition to future invalidparts vid 177 | load cache vid key vid val vid 178 | caught malformed url exception vid 179 | entry clear key vid entry vid val vid 180 | write entries to db cache vid keytype vid cnt vid 181 | return lastinitializedfut for topology ready future ver vid fut vid 182 | got removed entry when adding lock will retry vid 183 | received shuffle ack desc vid msg vid 184 | total number of jobs to be stolen vid 185 | will move session to less loaded worker ses vid msg vid 186 | found duplicate future in futures map will not add vid 187 | offer not sufficient for slave request vid 188 | gc worker has been started 189 | skipping rebalancing partition state is not moving vid p vid 190 | waiting for coordinator initialization will retry vid 191 | sent near finish response for completed tx txid vid dhttxid vid node vid 192 | failed to send partition update to node because it left grid will ignore node vid msg vid 193 | jobs to reject count jobstoreject vid jobs vid 194 | received dht lock response txid vid dhttxid vid node vid 195 | received user finish request jobid vid ses vid 196 | handshake response from local node vid 197 | starvationinc 198 | updating full partition map grp vid exchver vid fullmap vid 199 | unregistered spi mbean vid 200 | control utility has completed execution at vid 201 | put from load cache vid key vid val vid 202 | completing topology ready future right away head vid topver vid 203 | one model training time was vid 204 | stopped port processor 205 | command vid finished with code vid 206 | received data load request vid 207 | message has been sent to address msg vid locnodeid vid 208 | ignoring entry for partition that does not belong key true val false 209 | failed to stop distributed node vid 210 | bytes sockch vid cnt vid 211 | starting loading model by the path vid 212 | initializing cache store 213 | check before retry node 
already created vid 214 | discarding node left message join process is not finished vid 215 | partitions have been scheduled to resend reason node vid 216 | ignore affinity change message lastaffver vid exchver vid msgver vid 217 | file has been concurrently deleted vid 218 | ignoring entry for partition that does not belong key true val true err false 219 | got removed entry in lockasync method will retry vid 220 | node version to set vid 221 | got removed entry in lockasync method will retry vid 222 | failed to communication error resolve diagnostic with additional information vid 223 | sent near finish response txid vid dhttxid vid node vid 224 | attempted to remove lock on removed entry will retry rmvver vid entry vid 225 | opened igfs output stream for file append igfsname vid path vid streamid vid ses vid 226 | undeployed class loader as there are no participating nodes vid 227 | partition states after afterexchange grp vid exchver vid states vid 228 | mbean for metric registry vid can t be created 229 | other nodes not found 230 | got removed entry while processing get response will not retry 231 | vid view information in a cluster 232 | failed to get entry version msg vid 233 | vid mapping type vid 234 | creating db table with index 235 | grid load balancing spi vid 236 | failed to read classpath resource vid 237 | vid used cache groups id to name vid 238 | received data load response vid nodeid vid res vid 239 | metric registry not found registry vid 240 | initialized connection with remote vid node nodeid vid rmtaddr vid 241 | received data load response vid 242 | received dht finish response txid vid dhttxid vid node vid 243 | write dump file vid 244 | started range vid pos vid 245 | started services deployment future init localnode vid 246 | discarding node failed message sent from node which is about to fail vid 247 | new coordinator sends request ver vid node vid 248 | failed to perform operation 249 | sending partition update to node because it left grid will ignore node vid msg vid 250 | failed to find node added message node vid 251 | idle_verify is still running processed vid of vid local partitions 252 | synchronization aftercompletion status_status vid 253 | skipping dump page history due to can not reserve wal segments vid 254 | skipping alive node vid 255 | completing future vid 256 | received incoming connection when already connected to this node 257 | localportrange addr vid rmtport vid 258 | finished restoring partition state for local groups groupsprocessed vid time vid ms 259 | created new meta with updated participants vid 260 | default values 261 | updated metadata on server node holder vid changedschemas vid 262 | acquired deployment class after verifying other class 263 | delay alive nodes change process max event threshold reached newevts vid totalevts vid 264 | client creation failed addr vid err vid 265 | put after update cache vid key vid val vid success vid 266 | coordinator received single message ver vid node vid allreceived vid 267 | failed to add candidates because entry was removed will renew 268 | vid addresses vid 269 | dfltpri 270 | undeployed class loader as there are no participating nodes vid 271 | check failed message has been ignored msg vid spistate vid 272 | failed to wait for metadata update typeid vid schemaid vid 273 | sending cache message msg vid node vid 274 | completing topology ready future right away head vid topver vid 275 | handling topology req vid 276 | configured session factory using file vid 277 | 
ignite_hostname_constraint has invalid pattern it will be ignore 278 | waiting for handshake rmtnode vid 279 | received handshake message rmtnode vid rcvcnt vid 280 | external collision notification to vid 281 | received shuffle ack desc vid msg vid 282 | failed to close incoming file vid 283 | ignoring response since task is already reducing or finishing res vid 284 | got removed entry in transaction getallasync method will retry vid 285 | notifying exchange future about to remote node 286 | store put key true val true tx false 287 | field not found vid 288 | failed to find future for get response sender vid res vid 289 | starting spi implementation vid 290 | exchange timings 291 | failed waiting while initialization is completed 292 | vid ping_interval vid 293 | failed to send global state response node left nodeid vid nodeid vid 294 | vid the subcommands that take vid as an arguments 295 | interrupted while waiting for consumer threads to shut down exiting uncleanly 296 | daemon node failed vid 297 | received incoming connection when already connected to this node rejecting locnode vid rmtnode vid 298 | skipping partition on recovery no page store or wal state grp vid p vid 299 | before acquiring transaction lock for put on keys vid 300 | ignore affinity for cache vid key vid val vid 301 | failed to get future result fut vid 302 | demo tcpserver stared 303 | failed to send unauthenticated message to node node vid err vid 304 | successfully bound shared memory communication to tcp port port vid lochost vid 305 | unregistered mbean vid 306 | -------------------------------------------------------------------------------- /src/Baselines/LoGenText-Plus/results/1/ignite/translation.context.test.log: -------------------------------------------------------------------------------- 1 | Namespace(beam=8, cpu=False, data=['data-bin/context'], diverse_beam_groups=-1, diverse_beam_strength=0.5, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, gen_subset='test', lazy_load=False, left_pad_context='True', left_pad_source='True', left_pad_target='False', lenpen=2.5, log_format=None, log_interval=1000, match_source_len=False, max_context_positions=1024, max_len_a=0, max_len_b=100, max_sentences=32, max_source_positions=1024, max_target_positions=1024, max_tokens=None, memory_efficient_fp16=False, min_len=3.0, min_loss_scale=0.0001, model_overrides='{}', nbest=1, no_beamable_mm=False, no_early_stop=False, no_progress_bar=False, no_repeat_ngram_size=0, num_shards=1, num_workers=0, output='translations/1/ignite/translation.context.test.unsort', path='/home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/ignite/checkpoint_last.pt', prefix_size=0, print_alignment=False, quiet=True, raw_text=False, remove_bpe='@@ ', replace_unk=None, required_batch_size_multiple=8, results_path=None, sacrebleu=True, sampling=False, sampling_temperature=1, sampling_topk=-1, score_reference=False, seed=1, shard_id=0, skip_invalid_size_inputs_valid_test=True, source_lang=None, target_lang=None, task='translation_context', tensorboard_logdir='', threshold_loss_scale=None, unkpen=0, unnormalized=False, upsample_primary=1, user_dir=None) 2 | | [code] dictionary: 1080 types 3 | | [log] dictionary: 1080 types 4 | | data-bin/context test 305 examples 5 | | ['data-bin/context'] test 305 examples 6 | | loading model(s) from 
/home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/ignite/checkpoint_last.pt 7 | | Translated 305 sentences (4151 tokens) in 5.9s (52.01 sentences/s, 707.82 tokens/s) 8 | | Generate test with beam=8: BLEU = 28.81 50.7/32.9/25.8/20.4 (BP = 0.942 ratio = 0.943 hyp_len = 2482 ref_len = 2631) 9 | -------------------------------------------------------------------------------- /src/Baselines/LoGenText-Plus/results/1/synapse/translation.context.test: -------------------------------------------------------------------------------- 1 | priorityexecutor undeployment of the entry named vid started 2 | connection without a pool something wrong need to fix 3 | priorityexecutor with name vid does not exist 4 | http connection vid output 5 | vid message for the vid dropped in the pre mediation state by the mandatory sequence n vid 6 | user id cannot be found 7 | connection closed by the client end while writing the response vid 8 | event source vid was removed from the synapse configuration successfully 9 | no resource is defined for location vid 10 | received to vid 11 | cannot find a datasource with name vid either in in in memory or jndi datasource repositories 12 | can not open a connection to the url with a path vid 13 | restoring the messageprocessor with name vid completed 14 | initializing child mediators of mediator vid 15 | synapse library import named vid has been deployed from file vid 16 | sequence vid has already been undeployed 17 | the file vid is not a valid soap11 18 | fail to create the condition in the given directory vid 19 | there is no secret for alias vid returning itself 20 | endpoint vid has been updated from the file vid 21 | http connection vid response vid 22 | deleting a job with name vid group vid 23 | endpoint vid has been deployed from file vid 24 | error opening key store vid 25 | loading trust keystore from vid 26 | synapse received a response message without a message id 27 | error while pipe vid shutting down listener 28 | directory vid is not writable 29 | registered mediator serializer vid for vid 30 | http connection vid closed 31 | message request received for the request message id vid 32 | synapse encountered an exception no error handlers sending fault 33 | received to vid 34 | created a error log vid 35 | http protocol error vid 36 | system may be unstable ioreactor encountered a checked exception vid 37 | error while closing the temporary file vid 38 | error occurred while shutting down jvm 39 | priorityexecutor vid has already been undeployed 40 | all transport threads and tasks are idle and no pending callbacks 41 | using http tuning parameter vid vid 42 | matching cher for the provided character sequence and the pattern vid 43 | localentry update from file vid has started 44 | can t send the out message sequence vid does not exist 45 | couldn t get the lock for processing the file vid 46 | initializing transport listener for request 47 | undeploying proxy service vid 48 | undeployment of the endpoint vid 49 | added mediators for vid 50 | configuring transport sender started 51 | sequence vid has been built from the file vid 52 | proxyservice named vid has been built from the 53 | initializing xar metadata 54 | setting a statistics stack on the message context 55 | start writing the hessian message to outputstream 56 | no secret repositories have been configured 57 | error resolving directory to move after processing vid 58 | initializing synapse in an already existing axis2 server instance 
59 | sequence deployment from file vid completed 60 | pass through vid sender started 61 | message store deployment from file vid completed 62 | eventsource named vid has been built from the file vid 63 | received a continue response 64 | error in closing the input stream 65 | error while releasing the file vid 66 | connection closed by the target host while receiving request 67 | creating a secret repositories for given configuration 68 | priorityexecutor undeployment of the entry named vid started 69 | synapse timed out for the request with message id vid 70 | no beanstalk definitions found for initialization 71 | soapaction vid 72 | loading endpoints from vid 73 | server certificate validation trust has been disabled do not use 74 | amqp transport polling task started listen for service vid 75 | keep alive connection was closed by the client vid 76 | did not schedule the job vid job count is zero 77 | thread was interrupted while waiting to be destroying 78 | hot deployment has been suspended ignoring 79 | sequence vid has been updated from the file vid 80 | can t send the out message sequence vid does not exist 81 | messagestore named vid has been restored 82 | there is no private key in the given configuration 83 | graceful stop request completed in milliseconds 84 | error pausing transport sender 85 | localentry update from file vid has started 86 | one or more required fields are not found in the mgiven vid 87 | api named vid has been deployed from file vid 88 | endpoint deployment from file vid completed 89 | error opening key store vid 90 | deleting temporary file vid 91 | connection time out while writing the response vid 92 | template vid has been updated from the file vid 93 | memory cache is full unable to initialize the cache value 94 | unable to create ssl context with the given configuration 95 | initiating a file based secret repository 96 | the reconnection attempt number vid failed next re try will be after vid seconds 97 | keep alive connection was closed 98 | taskdescription cannot be found for name vid returning null 99 | registered mediator for extension vid 100 | unexpected exception encountered in targethandler 101 | cannot create a urlconnection for given url vid 102 | deployment of the synapse artifact from file vid started 103 | message processor deployment from file vid started 104 | destroying the synapsecallbackreceiver 105 | getting a datasource with name vid from the given configuration 106 | synapsesubscription failed sending fault response 107 | template deployment from file vid completed 108 | proxyservice named vid has been built from the file vid 109 | base64 decoding on input 110 | loading a file vid from classpath 111 | cannot open vid 112 | creating new taskderepositories 113 | startuptask named vid has been undeployed 114 | session with id vid is still live 115 | synapsesubscription failed sending fault response 116 | interrupted while building message for rest_url request 117 | the property vid with key vid target vid 118 | template task vid has already been undeployed 119 | priorityexecutor named vid has been deployed from file vid 120 | added mediator serializer vid for vid 121 | starting apache synapse 122 | vid listener started on vid port vid 123 | outgoing request counter rolled over for the session vid from vid 124 | encountered an i o error vid 125 | template vid has been built from the file vid 126 | restoring the messagestore with name vid started 127 | there are no statistics to be cleaned 128 | removing the session with the session id 
vid 129 | proxyservice deployment from proxy service vid started 130 | start replicating the property with key vid 131 | removed taskdescription vid 132 | setting the store type vid to vid 133 | you are using a persistent message queue you will be loosing messages which are on the queue 134 | restoring the messagestore with name vid completed 135 | cookies string vid 136 | loading a file vid from classpath 137 | initializing mediators of mediator vid 138 | priorityexecutor vid has been updated from the file vid 139 | retrieving task was interrupted 140 | loading synapse properties from the file vid 141 | synapse has decided to abort the message n vid 142 | creating session information for given session id vid 143 | expiring message id vid dropping message after global statistics 144 | crl taken from cache 145 | could not determine host name 146 | error while destroying the task vid 147 | loading trust keystore from vid 148 | endpoint vid has been updated from the file vid 149 | initializing synapsecallbackreceiver 150 | destroying pass through vid listener 151 | starting apache synapse 152 | -------------------------------------------------------------------------------- /src/Baselines/LoGenText-Plus/results/1/synapse/translation.context.test.log: -------------------------------------------------------------------------------- 1 | Namespace(beam=8, cpu=False, data=['data-bin/context'], diverse_beam_groups=-1, diverse_beam_strength=0.5, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, gen_subset='test', lazy_load=False, left_pad_context='True', left_pad_source='True', left_pad_target='False', lenpen=2.5, log_format=None, log_interval=1000, match_source_len=False, max_context_positions=1024, max_len_a=0, max_len_b=100, max_sentences=32, max_source_positions=1024, max_target_positions=1024, max_tokens=None, memory_efficient_fp16=False, min_len=3.0, min_loss_scale=0.0001, model_overrides='{}', nbest=1, no_beamable_mm=False, no_early_stop=False, no_progress_bar=False, no_repeat_ngram_size=0, num_shards=1, num_workers=0, output='translations/1/synapse/translation.context.test.unsort', path='/home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/synapse/checkpoint_last.pt', prefix_size=0, print_alignment=False, quiet=True, raw_text=False, remove_bpe='@@ ', replace_unk=None, required_batch_size_multiple=8, results_path=None, sacrebleu=True, sampling=False, sampling_temperature=1, sampling_topk=-1, score_reference=False, seed=1, shard_id=0, skip_invalid_size_inputs_valid_test=True, source_lang=None, target_lang=None, task='translation_context', tensorboard_logdir='', threshold_loss_scale=None, unkpen=0, unnormalized=False, upsample_primary=1, user_dir=None) 2 | | [code] dictionary: 1080 types 3 | | [log] dictionary: 1080 types 4 | | data-bin/context test 151 examples 5 | | ['data-bin/context'] test 151 examples 6 | | loading model(s) from /home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/synapse/checkpoint_last.pt 7 | | Translated 151 sentences (1749 tokens) in 2.5s (59.55 sentences/s, 689.74 tokens/s) 8 | | Generate test with beam=8: BLEU = 37.85 55.9/41.1/34.0/29.0 (BP = 0.976 ratio = 0.976 hyp_len = 1078 ref_len = 1104) 9 | -------------------------------------------------------------------------------- /src/Baselines/LoGenText-Plus/results/1/synapse/translation.context.test.unsort: 
-------------------------------------------------------------------------------- 1 | 99 unexpected exception encountered in targethandler 2 | 129 start replicating the property with key vid 3 | 109 loading a file vid from classpath 4 | 57 initializing synapse in an already existing axis2 server instance 5 | 102 message processor deployment from file vid started 6 | 135 loading a file vid from classpath 7 | 20 http connection vid response vid 8 | 66 creating a secret repositories for given configuration 9 | 89 deleting temporary file vid 10 | 125 restoring the messagestore with name vid started 11 | 0 priorityexecutor undeployment of the entry named vid started 12 | 67 priorityexecutor undeployment of the entry named vid started 13 | 3 http connection vid output 14 | 13 initializing child mediators of mediator vid 15 | 131 setting the store type vid to vid 16 | 29 http connection vid closed 17 | 30 message request received for the request message id vid 18 | 100 cannot create a urlconnection for given url vid 19 | 127 removing the session with the session id vid 20 | 7 event source vid was removed from the synapse configuration successfully 21 | 96 keep alive connection was closed 22 | 136 initializing mediators of mediator vid 23 | 34 http protocol error vid 24 | 46 undeploying proxy service vid 25 | 103 destroying the synapsecallbackreceiver 26 | 98 registered mediator for extension vid 27 | 47 undeployment of the endpoint vid 28 | 41 matching cher for the provided character sequence and the pattern vid 29 | 81 there is no private key in the given configuration 30 | 149 destroying pass through vid listener 31 | 35 system may be unstable ioreactor encountered a checked exception vid 32 | 150 starting apache synapse 33 | 62 received a continue response 34 | 77 hot deployment has been suspended ignoring 35 | 1 connection without a pool something wrong need to fix 36 | 16 the file vid is not a valid soap11 37 | 144 could not determine host name 38 | 110 cannot open vid 39 | 139 loading synapse properties from the file vid 40 | 52 initializing xar metadata 41 | 21 deleting a job with name vid group vid 42 | 97 taskdescription cannot be found for name vid returning null 43 | 130 removed taskdescription vid 44 | 76 thread was interrupted while waiting to be destroying 45 | 71 loading endpoints from vid 46 | 74 keep alive connection was closed by the client vid 47 | 8 no resource is defined for location vid 48 | 27 directory vid is not writable 49 | 145 error while destroying the task vid 50 | 128 proxyservice deployment from proxy service vid started 51 | 101 deployment of the synapse artifact from file vid started 52 | 54 start writing the hessian message to outputstream 53 | 141 creating session information for given session id vid 54 | 104 getting a datasource with name vid from the given configuration 55 | 148 initializing synapsecallbackreceiver 56 | 123 encountered an i o error vid 57 | 84 localentry update from file vid has started 58 | 73 amqp transport polling task started listen for service vid 59 | 42 localentry update from file vid has started 60 | 40 using http tuning parameter vid vid 61 | 37 error occurred while shutting down jvm 62 | 126 there are no statistics to be cleaned 63 | 49 configuring transport sender started 64 | 45 initializing transport listener for request 65 | 133 restoring the messagestore with name vid completed 66 | 112 startuptask named vid has been undeployed 67 | 82 graceful stop request completed in milliseconds 68 | 10 cannot find a datasource with name vid 
either in in in memory or jndi datasource repositories 69 | 87 endpoint deployment from file vid completed 70 | 44 couldn t get the lock for processing the file vid 71 | 6 connection closed by the client end while writing the response vid 72 | 60 message store deployment from file vid completed 73 | 53 setting a statistics stack on the message context 74 | 83 error pausing transport sender 75 | 18 there is no secret for alias vid returning itself 76 | 85 one or more required fields are not found in the mgiven vid 77 | 120 starting apache synapse 78 | 38 priorityexecutor vid has already been undeployed 79 | 55 no secret repositories have been configured 80 | 134 cookies string vid 81 | 143 crl taken from cache 82 | 108 base64 decoding on input 83 | 69 no beanstalk definitions found for initialization 84 | 26 error while pipe vid shutting down listener 85 | 119 added mediator serializer vid for vid 86 | 107 proxyservice named vid has been built from the file vid 87 | 132 you are using a persistent message queue you will be loosing messages which are on the queue 88 | 4 vid message for the vid dropped in the pre mediation state by the mandatory sequence n vid 89 | 48 added mediators for vid 90 | 17 fail to create the condition in the given directory vid 91 | 51 proxyservice named vid has been built from the 92 | 11 can not open a connection to the url with a path vid 93 | 61 eventsource named vid has been built from the file vid 94 | 70 soapaction vid 95 | 116 the property vid with key vid target vid 96 | 111 creating new taskderepositories 97 | 115 interrupted while building message for rest_url request 98 | 114 synapsesubscription failed sending fault response 99 | 23 error opening key store vid 100 | 56 error resolving directory to move after processing vid 101 | 75 did not schedule the job vid job count is zero 102 | 64 error while releasing the file vid 103 | 137 priorityexecutor vid has been updated from the file vid 104 | 28 registered mediator serializer vid for vid 105 | 147 endpoint vid has been updated from the file vid 106 | 142 expiring message id vid dropping message after global statistics 107 | 122 outgoing request counter rolled over for the session vid from vid 108 | 91 template vid has been updated from the file vid 109 | 146 loading trust keystore from vid 110 | 19 endpoint vid has been updated from the file vid 111 | 15 sequence vid has already been undeployed 112 | 5 user id cannot be found 113 | 105 synapsesubscription failed sending fault response 114 | 36 error while closing the temporary file vid 115 | 65 connection closed by the target host while receiving request 116 | 31 synapse encountered an exception no error handlers sending fault 117 | 95 the reconnection attempt number vid failed next re try will be after vid seconds 118 | 80 messagestore named vid has been restored 119 | 90 connection time out while writing the response vid 120 | 12 restoring the messageprocessor with name vid completed 121 | 118 priorityexecutor named vid has been deployed from file vid 122 | 22 endpoint vid has been deployed from file vid 123 | 86 api named vid has been deployed from file vid 124 | 124 template vid has been built from the file vid 125 | 138 retrieving task was interrupted 126 | 68 synapse timed out for the request with message id vid 127 | 58 sequence deployment from file vid completed 128 | 33 created a error log vid 129 | 79 can t send the out message sequence vid does not exist 130 | 32 received to vid 131 | 9 received to vid 132 | 94 initiating a file based secret 
repository 133 | 92 memory cache is full unable to initialize the cache value 134 | 43 can t send the out message sequence vid does not exist 135 | 93 unable to create ssl context with the given configuration 136 | 59 pass through vid sender started 137 | 88 error opening key store vid 138 | 72 server certificate validation trust has been disabled do not use 139 | 121 vid listener started on vid port vid 140 | 25 synapse received a response message without a message id 141 | 140 synapse has decided to abort the message n vid 142 | 24 loading trust keystore from vid 143 | 63 error in closing the input stream 144 | 113 session with id vid is still live 145 | 2 priorityexecutor with name vid does not exist 146 | 39 all transport threads and tasks are idle and no pending callbacks 147 | 50 sequence vid has been built from the file vid 148 | 117 template task vid has already been undeployed 149 | 78 sequence vid has been updated from the file vid 150 | 14 synapse library import named vid has been deployed from file vid 151 | 106 template deployment from file vid completed 152 | -------------------------------------------------------------------------------- /src/Baselines/README.md: -------------------------------------------------------------------------------- 1 | # README 2 | We have open-sourced the baselines we use: for baselines with released models, we provide the model code; for API-based baselines, we provide the invocation scripts. Commercial plugins can only be invoked manually because of their usage restrictions. 3 | 4 | If you use any of these baselines, please cite the corresponding paper. 5 | -------------------------------------------------------------------------------- /src/Baselines/StarCoder/starcoder.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForCausalLM, AutoTokenizer 2 | import re 3 | import os 4 | import tqdm 5 | 6 | path = './LogBench-O_prefix_1point' 7 | ground_truth_folder = './LogBench-O_prefix_1point' 8 | output_path= './StarCoder_LogBench-O_prefix_1point' 9 | FIM_INDICATOR = "<FILL_HERE>"  # assumed placeholder marking the position to infill 10 | FIM_PREFIX = "<fim_prefix>"  # StarCoder fill-in-the-middle (FIM) special tokens 11 | FIM_MIDDLE = "<fim_middle>" 12 | FIM_SUFFIX = "<fim_suffix>" 13 | 14 | checkpoint = "bigcode/starcoder" 15 | device = "cuda" 16 | auth_token = "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" 17 | 18 | # Check if output_path exists, if not, create it 19 | if not os.path.exists(output_path): 20 | os.makedirs(output_path) 21 | 22 | 23 | def insert_text_to_java_file(file_name, line_number): 24 | with open(file_name, 'r', encoding='utf-8') as file: 25 | lines = file.readlines() 26 | if line_number > len(lines): 27 | print("out of range"); return  # target line does not exist, skip this file 28 | lines[line_number - 1] = lines[line_number - 1].rstrip() + FIM_INDICATOR +'\n' 29 | with open(file_name, 'w', encoding='utf-8') as file: 30 | file.writelines(lines) 31 | 32 | 33 | def extract_numbers(s): 34 | return re.findall(r'\d+', s) 35 | 36 | def parse_directory(dir_path, ground_truth_folder): 37 | for filename in os.listdir(dir_path): 38 | file_path = os.path.join(dir_path, filename) 39 | if os.path.isfile(file_path) and file_path.endswith('.java'): 40 | ground_truth_path = os.path.join(ground_truth_folder, file_path.split('/')[-1][:-5] + '_config.txt') 41 | try: 42 | with open(ground_truth_path, 'r', encoding='utf-8') as f: 43 | lines = f.readlines() 44 | if len(lines) >= 1: 45 | line_number = int(extract_numbers(lines[0].strip(' ')[:-1])[0]) 46 | insert_text_to_java_file(file_path, line_number) 47 | except FileNotFoundError: 48 | pass 49 | elif
os.path.isdir(file_path): 50 | parse_directory(file_path, ground_truth_folder) 51 | 52 | parse_directory(path,ground_truth_folder) 53 | 54 | tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_auth_token=auth_token) 55 | model = AutoModelForCausalLM.from_pretrained(checkpoint, use_auth_token=auth_token).to(device) 56 | 57 | def generate(input_text): 58 | if FIM_INDICATOR in input_text: 59 | try: 60 | prefix, suffix = input_text.split(FIM_INDICATOR) 61 | except: 62 | raise ValueError(f"Only one {FIM_INDICATOR} allowed in prompt!") 63 | input_text = f"{FIM_PREFIX}{prefix}{FIM_SUFFIX}{suffix}{FIM_MIDDLE}" 64 | 65 | 66 | inputs = tokenizer(input_text, return_tensors="pt") 67 | inputs = {k: v.to(device) for k, v in inputs.items()} 68 | outputs = model.generate( 69 | input_ids=inputs["input_ids"], 70 | attention_mask=inputs["attention_mask"], 71 | max_length=1024, 72 | do_sample=True, 73 | pad_token_id=tokenizer.eos_token_id, # Set pad_token_id 74 | ) 75 | return (tokenizer.decode(outputs[0])) 76 | 77 | for filename in os.listdir(path): 78 | if filename.endswith(".java"): 79 | print(filename) 80 | input_file_path = os.path.join(path, filename) 81 | 82 | try: 83 | with open(input_file_path, 'r', encoding='utf-8') as file: 84 | file_content = file.read() 85 | example = f"'''\\\n{file_content}\n'''" 86 | processed_content = generate(example) 87 | output_file_path = os.path.join(output_path, filename) 88 | with open(output_file_path, 'w', encoding='utf-8') as output_file: 89 | output_file.write(f"{processed_content}\n") 90 | except Exception as e: 91 | print(f"Error processing file {filename}: {e}") 92 | -------------------------------------------------------------------------------- /src/Baselines/WhichVar/analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 23, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import json\n", 10 | "import numpy as np" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 9, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "with open(\"output.json\", \"r\") as f:\n", 20 | " data_list = json.load(f)" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 10, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stdout", 30 | "output_type": "stream", 31 | "text": [ 32 | "{'code': 'private void handleAdjustPublishRate(Context ctx) throws Exception {\\n Double publishRate = mapper.readValue(ctx.body(), Double.class);', 'pred_variables': ['mapper', 'publishRate', 'body'], 'label_variables': ['publishRate']}\n" 33 | ] 34 | } 35 | ], 36 | "source": [ 37 | "print(data_list[0])" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 20, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "def precision_recall_f1(labels, predictions):\n", 47 | " true_positives = len(set(labels) & set(predictions))\n", 48 | " false_positives = len(set(predictions) - set(labels))\n", 49 | " false_negatives = len(set(labels) - set(predictions))\n", 50 | "\n", 51 | " precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) != 0.0 else 0.0\n", 52 | " recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) != 0.0 else 0.0\n", 53 | " f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0.0 else 0.0\n", 54 | "\n", 55 | " return precision, recall, f1" 56 | ] 57 | }, 58 | { 
59 | "cell_type": "code", 60 | "execution_count": 22, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "precs, recs, f1s = [], [], []\n", 65 | "for idx, data in enumerate(data_list):\n", 66 | " labels = data['label_variables']\n", 67 | " predcits = data['pred_variables']\n", 68 | " \n", 69 | " # print(predcits, labels)\n", 70 | " precision, recall, f1 = precision_recall_f1(labels, predcits)\n", 71 | " precs.append(precision)\n", 72 | " recs.append(recall)\n", 73 | " f1s.append(f1)" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 25, 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "name": "stdout", 83 | "output_type": "stream", 84 | "text": [ 85 | "0.5030762324986151\n", 86 | "0.6346379386090578\n", 87 | "0.5348833543779392\n" 88 | ] 89 | } 90 | ], 91 | "source": [ 92 | "print(np.mean(precs))\n", 93 | "print(np.mean(recs))\n", 94 | "print(np.mean(f1s))" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [] 103 | } 104 | ], 105 | "metadata": { 106 | "kernelspec": { 107 | "display_name": "myenv", 108 | "language": "python", 109 | "name": "python3" 110 | }, 111 | "language_info": { 112 | "codemirror_mode": { 113 | "name": "ipython", 114 | "version": 3 115 | }, 116 | "file_extension": ".py", 117 | "mimetype": "text/x-python", 118 | "name": "python", 119 | "nbconvert_exporter": "python", 120 | "pygments_lexer": "ipython3", 121 | "version": "3.7.13" 122 | } 123 | }, 124 | "nbformat": 4, 125 | "nbformat_minor": 2 126 | } 127 | -------------------------------------------------------------------------------- /src/Baselines/WhichVar/cleaner.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import json\n", 11 | "import re\n", 12 | "from collections import Counter\n", 13 | "\n", 14 | "regex = r\"(?i)(?:log(?:ger)?\\w*)\\s*\\.\\s*(?:log|error|info|warn|fatal|debug|trace|off|all)\\s*\\([^;]*\\)\"\n", 15 | "\n", 16 | "def process_directory(directory):\n", 17 | " for filename in os.listdir(directory):\n", 18 | " filepath = os.path.join(directory, filename)\n", 19 | " if os.path.isdir(filepath):\n", 20 | " process_directory(filepath)\n", 21 | " elif filename.endswith('.json'):\n", 22 | " process_file(filepath)\n", 23 | "\n", 24 | "def process_file(filepath):\n", 25 | " with open(filepath, 'r') as f:\n", 26 | " data = json.load(f)\n", 27 | " method_code = data.get('methodCode', '')\n", 28 | " log_variables = data.get('logVariables', [])\n", 29 | " \n", 30 | " for match in re.finditer(regex, method_code):\n", 31 | " logging_statement = match.group(0)\n", 32 | " \n", 33 | " if all(var in logging_statement for var in log_variables):\n", 34 | " start_index = match.start()\n", 35 | " line_count = Counter(method_code[:start_index])['\\n']\n", 36 | " start_line = max(0, line_count - 15)\n", 37 | " preceding_lines = method_code.split('\\n')[:start_line]\n", 38 | " start_index = len('\\n'.join(preceding_lines)) + 1 if preceding_lines else 0\n", 39 | " data['methodCode'] = method_code[start_index:match.end()]\n", 40 | " \n", 41 | " with open(filepath, 'w') as f:\n", 42 | " json.dump(data, f)\n", 43 | " break\n", 44 | "# ...\n", 45 | "\n", 46 | "process_directory('/Users/liyichen/data/')\n" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | 
"outputs": [], 54 | "source": [] 55 | } 56 | ], 57 | "metadata": { 58 | "kernelspec": { 59 | "display_name": "Python 3", 60 | "language": "python", 61 | "name": "python3" 62 | }, 63 | "language_info": { 64 | "codemirror_mode": { 65 | "name": "ipython", 66 | "version": 3 67 | }, 68 | "file_extension": ".py", 69 | "mimetype": "text/x-python", 70 | "name": "python", 71 | "nbconvert_exporter": "python", 72 | "pygments_lexer": "ipython3", 73 | "version": "3.9.7" 74 | } 75 | }, 76 | "nbformat": 4, 77 | "nbformat_minor": 2 78 | } 79 | -------------------------------------------------------------------------------- /src/Baselines/WhichVar/model.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence 4 | from torchtext.vocab import GloVe 5 | import json 6 | from torch.utils.data import DataLoader, Dataset, random_split 7 | import numpy as np 8 | from sklearn.metrics import precision_score, recall_score, f1_score 9 | import re 10 | import random 11 | 12 | def check_and_split_camel_case(s): 13 | if re.match(r'^[a-z]+([A-Z][a-z]*)*$', s): 14 | words = re.findall('[a-z]+|[A-Z][a-z]*', s) 15 | return "yes", words 16 | else: 17 | return "no", s 18 | 19 | 20 | def setup_seed(seed): 21 | if seed == -1: 22 | seed = random.randint(0, 1000) 23 | torch.manual_seed(seed) 24 | torch.cuda.manual_seed_all(seed) 25 | np.random.seed(seed) 26 | random.seed(seed) 27 | torch.backends.cudnn.deterministic = True 28 | torch.backends.cudnn.benchmark = False 29 | return seed 30 | 31 | 32 | class Model(nn.Module): 33 | def __init__(self, weight): 34 | super(Model, self).__init__() 35 | vocab_size = weight.shape[0] 36 | self.word_embed = nn.Embedding(num_embeddings=vocab_size+1, embedding_dim=weight.shape[-1]) 37 | self.word_embed.weight.data[:vocab_size] = weight 38 | self.word_embed.weight.data[vocab_size] = torch.zeros(weight.shape[-1]) 39 | self.word_embed.weight.requires_grad = False 40 | 41 | self.rnn = nn.LSTM(100, 128, num_layers=2, bidirectional=True, batch_first=True) 42 | self.num_heads = 4 43 | self.attention = nn.MultiheadAttention(embed_dim=256, num_heads=self.num_heads, batch_first=True) 44 | 45 | self.cls_layer = nn.Linear(256, 1, bias=False) 46 | 47 | 48 | def forward(self, sentences, lens): 49 | 50 | embeds = self.word_embed(sentences) 51 | outputs, _ = self.rnn(embeds) 52 | attn_mask=torch.zeros((sentences.size(0) * 4, sentences.size(1), sentences.size(1)), device=sentences.device).bool() 53 | for i, l in enumerate(lens): 54 | for j in range(1, self.num_heads+1): 55 | attn_mask[i*j][:l][:l] = True 56 | 57 | attention_embeds, _ = self.attention(outputs, outputs, outputs, attn_mask=None) 58 | logits = self.cls_layer(attention_embeds).squeeze(dim=-1) 59 | 60 | return logits 61 | 62 | class SensDataSet(Dataset): 63 | def __init__(self, data, label): 64 | self.data = data 65 | self.label = label 66 | 67 | def __len__(self): 68 | return len(self.data) 69 | 70 | def __getitem__(self, idx): 71 | tuple_ = (self.data[idx], self.label[idx]) 72 | return tuple_ 73 | 74 | 75 | def collate_fn(data_tuple): 76 | # data_tuple.sort(key=lambda x: len(x[0]), reverse=True) 77 | data = [torch.LongTensor(sq[0]) for sq in data_tuple] 78 | label = [torch.Tensor(sq[1]) for sq in data_tuple] 79 | data_length = [len(sq) for sq in data] 80 | data = pad_sequence(data, batch_first=True) 81 | label = pad_sequence(label, batch_first=True) 82 | return data, label, data_length 83 | 84 | 85 | # def 
evaluate(model, test_dataloader, device): 86 | # acc = 0 87 | # n = 0 88 | # model.eval() 89 | # total_pred = [] 90 | # total_label =[] 91 | # for batch_x, batch_y, batch_x_len in test_dataloader: 92 | # batch_x = batch_x.to(device) 93 | # batch_y = batch_y.to(device) 94 | # out = model(batch_x, batch_x_len) 95 | # predicts = (out > 0) + 0 96 | # for predict, label, length in zip(predicts, batch_y, batch_x_len): 97 | # total_pred.append(predict[:length]) 98 | # total_label.append(label[:length]) 99 | # total_pred = torch.cat(total_pred).cpu().numpy() 100 | # total_label = torch.cat(total_label).cpu().numpy() 101 | 102 | # precision = precision_score(total_label, total_pred) 103 | # recall = recall_score(total_label, total_pred) 104 | # f1 = f1_score(total_label, total_pred) 105 | # return {"precision" : precision, "recall" : recall, "f1" : f1} 106 | 107 | 108 | def evaluate(model, test_dataloader, device): 109 | model.eval() 110 | precision_list = [] 111 | recall_list = [] 112 | f1_list = [] 113 | predicts_list = [] 114 | for batch_x, batch_y, batch_x_len in test_dataloader: 115 | batch_x = batch_x.to(device) 116 | batch_y = batch_y.to(device) 117 | out = model(batch_x, batch_x_len) 118 | predicts = (out > 0) + 0 119 | batch_x = batch_x.cpu().numpy() 120 | batch_y = batch_y.cpu().numpy() 121 | predicts = predicts.cpu().numpy() 122 | for x, predict, label, length in zip(batch_x, predicts, batch_y, batch_x_len): 123 | # print(len(x), len(predict), len(label), length) 124 | x, predict, label = x[:length], predict[:length], label[:length] 125 | 126 | pred_1_set = set(x[predict == 1]) 127 | pred_0_set = set(x) - pred_1_set 128 | label_1_set = set(x[label == 1]) 129 | label_0_set = set(x) - label_1_set 130 | TP = len(label_1_set.intersection(pred_1_set)) 131 | FN = len(label_1_set.intersection(pred_0_set)) 132 | FP = len(label_0_set.intersection(pred_1_set)) 133 | TN = len(label_0_set.intersection(pred_0_set)) 134 | precision = TP / (TP + FP) if (TP + FP) != 0 else 0 135 | recall = TP / (TP + FN) if (TP + FN) != 0 else 0 136 | f1 = 2 * precision * recall / (precision + recall) if (precision + recall) != 0 else 0 137 | precision_list.append(precision) 138 | recall_list.append(recall) 139 | f1_list.append(f1) 140 | predicts_list.append(predict) 141 | 142 | precision = np.mean(precision_list) 143 | recall = np.mean(recall_list) 144 | f1 = np.mean(f1_list) 145 | # print(len(f1_list)) 146 | 147 | return {"precision" : precision, "recall" : recall, "f1" : f1}, predicts_list 148 | 149 | 150 | 151 | 152 | if __name__ == '__main__': 153 | setup_seed(111) 154 | device = 'cuda:0' if torch.cuda.is_available() else 'cpu' 155 | print(device) 156 | epochs = 0 157 | glove = GloVe(name='6B', dim=100) 158 | vocab_size = len(glove) 159 | with open('./train.json', 'r') as file: 160 | train_data = json.load(file) 161 | with open('./test.json', 'r') as file: 162 | test_data = json.load(file) 163 | 164 | train_sentences = train_data['input'] 165 | train_sentences = [[glove.stoi[word] if word in glove.stoi.keys() else vocab_size for word in sentence] for sentence in train_sentences] 166 | train_labels = train_data['label'] 167 | 168 | test_sentences = test_data['input'] 169 | test_sentences = [[glove.stoi[word] if word in glove.stoi.keys() else vocab_size for word in sentence] for sentence in test_sentences] 170 | test_labels = test_data['label'] 171 | # print(len(test_sentences)) 172 | # train_size = int(0.8 * len(sentences)) 173 | # train_sentences, test_sentences = sentences[:train_size], sentences[train_size:] 174 
| # train_labels, test_labels = labels[:train_size], labels[train_size:] 175 | 176 | train_dataset = SensDataSet(data=train_sentences, label=train_labels) 177 | test_dataset = SensDataSet(data=test_sentences, label=test_labels) 178 | 179 | # train_dataset, test_dataset = random_split(dataset, [0.8, 0.2]) 180 | 181 | train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn) 182 | test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn) 183 | 184 | model = Model(weight=glove.vectors).to(device) 185 | optimizer = torch.optim.Adam(model.parameters(), lr=1e-03) 186 | loss_fun = nn.BCEWithLogitsLoss(reduction='none') 187 | 188 | for epoch in range(1, epochs + 1): 189 | model.train() 190 | total_loss = [] 191 | for batch_id, (batch_x, batch_y, batch_x_len) in enumerate(train_loader): 192 | batch_x = batch_x.to(device) 193 | batch_y = batch_y.to(device) 194 | out = model(batch_x, batch_x_len) 195 | pos_mask=torch.zeros((batch_x.size(0), batch_x.size(1)), device=device).bool() 196 | for i, l in enumerate(batch_x_len): 197 | pos_mask[i][:l] = True 198 | loss = loss_fun(out, batch_y)[pos_mask].mean() 199 | optimizer.zero_grad() 200 | loss.backward() 201 | optimizer.step() 202 | total_loss.append(loss.item()) 203 | print("epoch: {}/{}, loss={}".format(epoch, epochs, np.mean(total_loss))) 204 | result1, predicts_list1 = evaluate(model, train_loader, device) 205 | result2, predicts_list2 = evaluate(model, test_loader, device) 206 | print('result on train set: {}'.format(result1)) 207 | print('result on test set: {}'.format(result2)) 208 | 209 | torch.save(model.state_dict(), 'model/model.pth') 210 | 211 | model.load_state_dict(torch.load('model/model.pth')) 212 | 213 | result, predicts_list = evaluate(model, test_loader, device) 214 | print(len(predicts_list)) 215 | test_cases = [] 216 | for i in range(len(predicts_list)): 217 | codes = test_data['codes'][i] 218 | predict = predicts_list[i] 219 | # print(len(test_data['input'][i]), len(test_sentences[i]), len(predict)) 220 | variables = list(set([test_data['input'][i][j] for j, v in enumerate(predict) if v == 1])) 221 | label_variables = test_data['variables'][i] 222 | output_data = { 223 | 'code': codes, 224 | 'pred_variables': variables, 225 | 'label_variables': label_variables 226 | } 227 | test_cases.append(output_data) 228 | json.dump(test_cases, open('output.json', 'w'), indent=4) 229 | 230 | -------------------------------------------------------------------------------- /src/Baselines/lance/README.md: -------------------------------------------------------------------------------- 1 | # Using Deep Learning To Support Logging Activities 2 | 3 | We present LANCE (Log stAtemeNt reCommEnder), a DL-based approach for supporting the task of log statement generation and injection in the context of Java. LANCE is built on the recently proposed Text-To-Text Transfer Transformer (T5) architecture. 4 | 5 | 6 | #### How to experiment with LANCE 7 | 8 | 9 | * ##### How to train a new SentencePiece Model 10 | 11 | Before training the [T5 small](https://github.com/google-research/text-to-text-transfer-transformer), namely the core of LANCE, it is important to also train a new tokenizer (sentencepiece model) to accommodate the expanded vocabulary given by the Java programming language.
To this end, we used the raw pre-training instances (Java corpus) plus English sentences from the well-known C4 dataset. 12 | 13 | *Pythonic way* 14 | 15 | ``` 16 | pip install sentencepiece==0.1.96 17 | import sentencepiece as spm 18 | spm.SentencePieceTrainer.train('--input=all_sp.txt --model_prefix=LOG_SP --vocab_size=32000 --bos_id=-1 --eos_id=1 --unk_id=2 --pad_id=0 --shuffle_input_sentence=true --character_coverage=1.0 --user_defined_symbols=“”') 19 | ``` 20 | 21 | We also provide our trained tokenizer under this path: https://github.com/lance-log/lance/tree/main/Code 22 | 23 | * ##### Setup a Google Cloud Storage (GCS) Bucket 24 | To set up a new GCS Bucket for training and fine-tuning a T5 model, please follow the official guide provided by Google: https://cloud.google.com/storage/docs/quickstart-console 25 | 26 | 27 | * ##### Datasets 28 | 29 | The datasets for pre-training, fine-tuning, validating, and finally testing LANCE can be found at this link: https://drive.google.com/drive/folders/1D12y-CIJTYLxMeSmGQjxEXjTEzQImgaH?usp=sharing 30 | 31 | * ##### Pre-training/Fine-tuning 32 | 33 | To pre-train and then fine-tune LANCE, please use the following: 34 | - Pre-Training 35 | - Fine-Tuning 36 | 37 | 38 | 39 | * ##### Models 40 | * Pre-trained on the tasks mixture (Multi-Task) 41 | * Pre-trained on LogSTMT only Task 42 | * Pre-trained on Denoise only Task 43 | * No Pre-training 44 | 45 | * ##### Results: :open_file_folder: 46 | * Multi-Task 47 | * LogSTMT only Task 48 | * Denoising only Task 49 | * No Pre-training 50 | 51 | 52 | * ##### Additional: 53 | Under Miscellaneous, you can find the additional script used for the data analysis and the exact hyper-parameter configuration we employed in the study. 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /src/Baselines/lance/lance.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import os 3 | import gin 4 | import tensorflow.compat.v1 as tf 5 | import tensorflow_datasets as tfds 6 | from contextlib import contextmanager 7 | import logging as py_logging 8 | import t5 9 | from t5.data import postprocessors as t5_postprocessors 10 | from t5.seqio import Feature,SentencePieceVocabulary 11 | from mesh_tensorflow.transformer.learning_rate_schedules import slanted_triangular 12 | from mesh_tensorflow.transformer.learning_rate_schedules import truncated_rsqrt 13 | from tensorflow.keras.optimizers.schedules import PolynomialDecay 14 | from t5 import models 15 | 16 | BASE_DIR = "gs://xxxx" #@param { type: "string" } 17 | TPU_TOPOLOGY = "2x2" 18 | tpu = tf.distribute.cluster_resolver.TPUClusterResolver("grpc://xx.xx.xx.xx") # TPU detection 19 | TPU_ADDRESS = tpu.get_master() 20 | tf.disable_v2_behavior() 21 | tf.get_logger().propagate = False 22 | py_logging.root.setLevel('INFO') 23 | 24 | @contextmanager 25 | def tf_verbosity_level(level): 26 | og_level = tf.logging.get_verbosity() 27 | tf.logging.set_verbosity(level) 28 | yield 29 | tf.logging.set_verbosity(og_level) 30 | 31 | path_finetuning = BASE_DIR + '/datasets/Fine-tuning/train.tsv' #@param { type: "string" } 32 | path_eval = BASE_DIR + '/datasets/Fine-tuning/eval.tsv' #@param { type: "string" } 33 | path_test = BASE_DIR + '/datasets/Fine-tuning/test.tsv' #@param { type: "string" } 34 | 35 | nq_tsv_path = { 36 | "train": path_finetuning, 37 | "validation": path_test 38 | } 39 | 40 | num_nq_examples = dict(train=106382, validation=12020) 41 | 42 |
vocab_model_path = BASE_DIR + '/Code/SP_LOG.model' #@param { type: "string" } 43 | vocab_path = BASE_DIR + '/Code/SP_LOG.vocab' #@param { type: "string" } 44 | 45 | 46 | TaskRegistry = t5.data.TaskRegistry 47 | TfdsTask = t5.data.TfdsTask 48 | 49 | 50 | def get_default_vocabulary(): 51 | return SentencePieceVocabulary(vocab_model_path, 100) 52 | 53 | DEFAULT_OUTPUT_FEATURES = { 54 | "inputs": Feature( 55 | vocabulary=get_default_vocabulary(), add_eos=True, required=False), 56 | 57 | "targets": Feature( 58 | vocabulary=get_default_vocabulary(), add_eos=True) 59 | } 60 | 61 | def nq_dataset_task(split, shuffle_files=True): 62 | # We only have one file for each split. 63 | del shuffle_files 64 | 65 | # Load lines from the text file as examples. 66 | 67 | ds = tf.data.TextLineDataset(nq_tsv_path[split]) 68 | ds = ds.map( 69 | functools.partial(tf.io.decode_csv, record_defaults=["string","string"], 70 | field_delim="\t", use_quote_delim=True), 71 | num_parallel_calls=tf.data.experimental.AUTOTUNE) 72 | 73 | ds = ds.map(lambda *ex: dict(zip(["input", "output"], ex))) 74 | return ds 75 | 76 | print("A few raw train examples...") 77 | for ex in tfds.as_numpy(nq_dataset_task("train").take(5)): 78 | print(ex) 79 | 80 | def preprocessing(ds): 81 | 82 | def to_inputs_and_targets(ex): 83 | x_input = tf.strings.strip(ex['input']) 84 | y_label = tf.strings.strip(ex['output']) 85 | inputs = tf.strings.join([x_input], separator=' ') 86 | class_label = tf.strings.join([y_label], separator=' ') 87 | return {'inputs': inputs, 'targets': class_label} 88 | 89 | return ds.map(to_inputs_and_targets, 90 | num_parallel_calls=tf.data.experimental.AUTOTUNE) 91 | 92 | t5.data.TaskRegistry.remove('log_injection') 93 | t5.data.TaskRegistry.add( 94 | "log_injection", 95 | dataset_fn=nq_dataset_task, 96 | splits=["train","validation"], 97 | text_preprocessor=[preprocessing], 98 | output_features = DEFAULT_OUTPUT_FEATURES, 99 | metric_fns=[t5.evaluation.metrics.accuracy], 100 | num_input_examples=num_nq_examples 101 | ) 102 | 103 | nq_task = t5.data.TaskRegistry.get("log_injection") 104 | ds = nq_task.get_dataset(split="train", sequence_length={"inputs": 512, "targets": 512}) 105 | print("A few preprocessed training examples...") 106 | for ex in tfds.as_numpy(ds.take(5)): 107 | print(ex) 108 | 109 | starter_learning_rate = 0.01 110 | end_learning_rate = 0.001 111 | decay_steps = 10000 112 | 113 | learning_rate_fn = PolynomialDecay( 114 | starter_learning_rate, 115 | decay_steps, 116 | end_learning_rate, 117 | power=0.5) 118 | 119 | MODEL_SIZE = "small" 120 | 121 | MODEL_DIR = BASE_DIR + '/modeltest/'#@param { type: "string" } 122 | 123 | PRETRAINED_DIR=BASE_DIR + '/denoising_task_model/'#@param { type: "string" } 124 | 125 | 126 | model_parallelism, train_batch_size, keep_checkpoint_max = { 127 | "small": (1, 128, 16), 128 | "base": (2, 128, 8), 129 | "large": (8, 64, 4), 130 | "3B": (8, 16, 1), 131 | "11B": (8, 16, 1)}[MODEL_SIZE] 132 | 133 | tf.io.gfile.makedirs(MODEL_DIR) 134 | 135 | model = t5.models.MtfModel( 136 | model_dir=PRETRAINED_DIR, 137 | tpu=TPU_ADDRESS, 138 | #tpu_job_name="node-1", 139 | #tpu_zone="us-central1-f", 140 | #gcp_project="lance", 141 | tpu_topology=TPU_TOPOLOGY, 142 | model_parallelism=model_parallelism, 143 | batch_size=train_batch_size, 144 | learning_rate_schedule = learning_rate_fn, #pick the correct scheduler, according to the model you want to train 145 | sequence_length={"inputs": 512, "targets": 512}, 146 | save_checkpoints_steps=5000, 147 | keep_checkpoint_max=keep_checkpoint_max, 148 | 
iterations_per_loop=100, 149 | ) 150 | 151 | PATH_GIN_FILE_NO_PT = BASE_DIR + '/Configs/no_pretraining_operative_config.gin' 152 | PATH_GIN_FILE_MT = BASE_DIR + '/Configs/multi-task_operative_config.gin' 153 | PATH_GIN_FILE_DENOISE = BASE_DIR + '/Configs/denoise_only_operative_config.gin' 154 | PATH_GIN_FILE_LOG_STMT = BASE_DIR + '/Configs/log_stmt_only_operative_config.gin' 155 | 156 | #with gin.unlock_config(): 157 | # gin.parse_config_file(PATH_GIN_FILE_DENOISE) 158 | # #RUN FINE-TUNING 159 | # TRAIN_STEPS = 200000 160 | # model.finetune(mixture_or_task_name="log_injection", 161 | # finetune_steps=TRAIN_STEPS, 162 | # pretrained_model_dir=PRETRAINED_DIR) 163 | 164 | # If the no-pretraining experiment is the one you want to run, then, uncomment the following and comment model.finetune 165 | # Also, make sure to upload the slanted_operative.gin 166 | #model.train("log_injection", TRAIN_STEPS) 167 | #model.bach_size=32 168 | #model.eval( 169 | #mixture_or_task_name="log_injection", 170 | #checkpoint_steps=-1 171 | #) 172 | #dataset_list = ["cassandra","elasticsearch","flink","hbase","wicket","zookeeper"] 173 | dataset_list = ['logstudy'] 174 | for item in dataset_list: 175 | model.batch_size = 256 176 | input_file = BASE_DIR + f'/datasets/logr_input/lance_function_transformed.txt'#@param { type: "string" } 177 | output_file = BASE_DIR+ f'/datasets/logr_input/lance_function_transformed_result.txt'#@param { type: "string" } 178 | model.predict(input_file, output_file, checkpoint_steps=-1, vocabulary=get_default_vocabulary()) -------------------------------------------------------------------------------- /src/CodeTransformer/README.md: -------------------------------------------------------------------------------- 1 | # CodeTransformer 2 | 3 | We provide only the compiled version for use throughout the review process. We will make the source code available after the paper has been accepted. 
4 | -------------------------------------------------------------------------------- /src/DataCollector/check_pom.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import re 3 | import xml.etree.ElementTree as ET 4 | import os 5 | import base64 6 | import shutil 7 | from github import Github 8 | from github import GithubException 9 | 10 | def check_string_in_file(file_path, search_str="log4j"): 11 | with open(file_path, 'r') as file: 12 | content = file.read() 13 | 14 | if "log4j" in content.lower() or "slf4j" in content.lower(): 15 | return True 16 | else: 17 | return False 18 | 19 | def check_log4j(pom_file_path): 20 | # Parse the POM file as XML 21 | try: 22 | # Parse XML file 23 | tree = ET.parse(pom_file_path) 24 | root = tree.getroot() 25 | 26 | # Define the Log4j dependency artifact details 27 | group_id = 'org.apache.logging.log4j' 28 | artifact_id = 'log4j-core' 29 | 30 | # Iterate over the dependency elements in the POM file and check for the Log4j dependency 31 | for dependency in root.findall('.//{http://maven.apache.org/POM/4.0.0}dependency'): 32 | # Retrieve the group ID and artifact ID of the dependency 33 | dep_group_id = dependency.find('.//{http://maven.apache.org/POM/4.0.0}groupId') 34 | dep_artifact_id = dependency.find('.//{http://maven.apache.org/POM/4.0.0}artifactId') 35 | if dep_group_id is not None and dep_artifact_id is not None: 36 | dep_group_id, dep_artifact_id = dep_group_id.text, dep_artifact_id.text 37 | # Check if the dependency is the Log4j dependency 38 | if dep_group_id == group_id and dep_artifact_id == artifact_id: 39 | print(f'The POM file {pom_file_path} features the Log4j dependency') 40 | return True 41 | 42 | except ET.ParseError as e: 43 | # Handle XML parsing exception 44 | print('Error parsing XML file:', e) 45 | 46 | print(f'The POM file {pom_file_path} does not feature the Log4j dependency') 47 | return False 48 | 49 | 50 | def get_sha_for_tag(repository, tag): 51 | """ 52 | Returns a commit PyGithub object for the specified repository and tag. 
53 | """ 54 | branches = repository.get_branches() 55 | matched_branches = [match for match in branches if match.name == tag] 56 | if matched_branches: 57 | return matched_branches[0].commit.sha 58 | 59 | tags = repository.get_tags() 60 | matched_tags = [match for match in tags if match.name == tag] 61 | if not matched_tags: 62 | print("No Tag or Branch exists with that name") 63 | return None 64 | return matched_tags[0].commit.sha 65 | 66 | 67 | def download_file(git, sha, repo, path): 68 | try: 69 | file_content = git.get_contents(path, ref=sha) 70 | file_data = base64.b64decode(file_content.content) 71 | directory_path, _ = os.path.split(path) 72 | if not os.path.exists(f"repos/{repo}/{directory_path}"): 73 | os.makedirs(f"repos/{repo}/{directory_path}", exist_ok=True) 74 | file_out = open(f"repos/{repo}/{path}", "wb") 75 | file_out.write(file_data) 76 | file_out.close() 77 | except (GithubException, IOError) as exc: 78 | print('Error processing %s: %s', path, exc) 79 | 80 | def check_repo(owner, repo, branch="master"): 81 | # Define the Github Tree API endpoint and repository details 82 | api_url = 'https://api.github.com/repos/{owner}/{repo}/git/trees/{branch}?recursive=1' 83 | # Make an HTTP GET request to the Github Tree API endpoint 84 | access_token = "" 85 | headers = {'Authorization': f'token {access_token}'} 86 | if not os.path.exists(f"repos/{repo}/"): 87 | os.makedirs(f"repos/{repo}/") 88 | #print(f"./{repo}/ created") 89 | 90 | git = Github("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") 91 | git_repo = git.get_repo(f"{owner}/{repo}") 92 | sha = get_sha_for_tag(git_repo, branch) 93 | # Parse the response data as JSON 94 | response = requests.get(api_url.format(owner=owner, repo=repo, branch=branch), headers=headers) 95 | data = response.json() 96 | contain_pom = False 97 | if sha is not None: 98 | for item in data['tree']: 99 | if re.search("pom.xml", item['path'], re.IGNORECASE): 100 | download_file(git_repo, sha, repo, item['path']) 101 | if check_log4j(f"repos/{repo}/{item['path']}"): 102 | contain_pom = True 103 | break 104 | else: 105 | os.remove(f"repos/{repo}/{item['path']}") 106 | print(f"{owner}/{repo} pom checking result: ", contain_pom) 107 | shutil.rmtree(f"repos/{repo}/") 108 | return contain_pom 109 | # # Iterate over the file and directory objects in the response 110 | # for item in data['tree']: 111 | # # Retrieve the file path and type 112 | # path, type = item['path'], item['type'] 113 | 114 | # # If the item is a file, retrieve the raw content using the 'url' property 115 | # if type == 'blob': 116 | # file_url = item['url'] 117 | # file_response = requests.get(file_url) 118 | # file_data = file_response.content 119 | 120 | # # Process the file content as needed 121 | # print(f'File: {path}') 122 | # #print(file_data) 123 | # else: 124 | # # Process directories or other items as needed 125 | # print(f'Directory: {path}') 126 | # github.com/davidb/scala-maven-plugin 127 | 128 | def check_repo_root(owner, repo, access_token, branch="master"): 129 | # Define the Github Tree API endpoint and repository details 130 | #api_url = 'https://api.github.com/repos/{owner}/{repo}/git/trees/{branch}?recursive=1' 131 | # Make an HTTP GET request to the Github Tree API endpoint 132 | #headers = {'Authorization': f'token {access_token}'} 133 | if not os.path.exists(f"repos/{repo}/"): 134 | os.makedirs(f"repos/{repo}/") 135 | #print(f"./{repo}/ created") 136 | 137 | git = Github(access_token) 138 | try: 139 | git_repo = git.get_repo(f"{owner}/{repo}") 140 | except 
GithubException as e: 141 | if e.status == 404: 142 | print("non") 143 | else: 144 | print("error") 145 | shutil.rmtree(f"repos/{repo}/") 146 | return False 147 | 148 | sha = get_sha_for_tag(git_repo, branch) 149 | # Parse the response data as JSON 150 | contain_pom = False 151 | if sha is not None: 152 | contents = git_repo.get_dir_contents(".", ref=sha) 153 | for content in contents: 154 | if content.type == "file" and content.path == "pom.xml": 155 | download_file(git_repo, sha, repo, content.path) 156 | if check_log4j(f"repos/{repo}/{content.path}") or check_string_in_file(f"repos/{repo}/{content.path}"): 157 | contain_pom = True 158 | break 159 | 160 | shutil.rmtree(f"repos/{repo}/") 161 | print(f"{owner}/{repo} pom checking result: ", contain_pom) 162 | return contain_pom 163 | 164 | #check_repo("davidb", "scala-maven-plugin") -------------------------------------------------------------------------------- /src/DataCollector/download.py: -------------------------------------------------------------------------------- 1 | from get_java import download_java 2 | 3 | key = "" 4 | 5 | with open("1.txt", "r") as file: 6 | for line in file: 7 | repo_list = line.split() 8 | owner, repo, branch = repo_list[1], repo_list[2], repo_list[3] 9 | print(f"{repo_list[0]} repo: {owner} {repo} {branch}") 10 | Done = False 11 | with open("result1.txt", "r") as f: 12 | content = f.read() 13 | if owner in content and repo in content: 14 | Done = True 15 | if Done: 16 | print("Already Done!") 17 | continue 18 | cnt1, cnt2 = download_java(owner, repo, key, branch) 19 | with open("result1.txt", "a") as f: 20 | f.write(f"{repo_list[0]} {owner}/{repo} downloaded: {cnt1}/{cnt1+cnt2} files\n") -------------------------------------------------------------------------------- /src/DataCollector/filter_pom.py: -------------------------------------------------------------------------------- 1 | import json 2 | import subprocess 3 | from check_pom import check_repo_root 4 | from tqdm import tqdm 5 | 6 | with open("results.json", encoding='latin1') as rf: 7 | repos = json.load(rf) 8 | 9 | 10 | key = "" 11 | 12 | repos = repos['items'] 13 | #check_repo_root("nysenate", "openlegislation", "dev") 14 | end_point = len(repos) 15 | 16 | with open("result.txt", "a") as f: 17 | for i in range(17, end_point): 18 | repo_item = repos[i] 19 | branch = repo_item['defaultBranch'] 20 | owner, repo = repo_item['name'].split('/') 21 | print(f"\n{i}-{end_point}/{len(repos)} repo: {owner} {repo} {branch}\n") 22 | if check_repo_root(owner, repo, key, branch): 23 | f.write(f"{i} {owner} {repo} {branch}\n") 24 | f.flush() 25 | -------------------------------------------------------------------------------- /src/DataCollector/get_java.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import re 3 | import xml.etree.ElementTree as ET 4 | import os 5 | import base64 6 | import shutil 7 | from tqdm import tqdm 8 | from github import Github 9 | from github import GithubException 10 | import subprocess 11 | import time 12 | 13 | pattern = r"(?im)log.*\.(log|error|info|warn|fatal|debug|trace|off|all)\(.*\)" 14 | regex = re.compile(pattern, re.DOTALL) 15 | 16 | 17 | def git_clone(owner, repo): 18 | max_attempts = 5 19 | retry_wait_time = 5 # in seconds 20 | 21 | git_url = f"https://github.com/{owner}/{repo}.git" 22 | local_path = f"./temp/{repo}" 23 | cmd = ["git", "clone", git_url, local_path] 24 | 25 | for i in range(max_attempts): 26 | try: 27 | subprocess.check_call(cmd) 28 | print("Git
clone successful!") 29 | break 30 | except subprocess.CalledProcessError as e: 31 | print(f"Git clone attempt {i + 1} failed with error code {e.returncode}. Retrying in {retry_wait_time} seconds...") 32 | time.sleep(retry_wait_time) 33 | else: 34 | print(f"Git clone failed after {max_attempts} attempts.") 35 | 36 | 37 | def get_sha_for_tag(repository, tag): 38 | """ 39 | Returns a commit PyGithub object for the specified repository and tag. 40 | """ 41 | branches = repository.get_branches() 42 | matched_branches = [match for match in branches if match.name == tag] 43 | if matched_branches: 44 | return matched_branches[0].commit.sha 45 | 46 | tags = repository.get_tags() 47 | matched_tags = [match for match in tags if match.name == tag] 48 | if not matched_tags: 49 | print("No Tag or Branch exists with that name") 50 | return None 51 | return matched_tags[0].commit.sha 52 | 53 | 54 | def check_java(path): 55 | try: 56 | with open(path, 'r') as file: 57 | content = file.read() 58 | words = content.split() 59 | if len(words) > 300: 60 | return False 61 | lines = content.split('\n') 62 | if len(lines) > 300: 63 | return False 64 | match = regex.search(content) 65 | if match: 66 | return True 67 | except UnicodeDecodeError as e: 68 | print(f"Error: {e} and Path: {path}") 69 | return False 70 | return False 71 | 72 | 73 | def download_java_file(git, sha, repo, path): 74 | try: 75 | file_content = git.get_contents(path, ref=sha) 76 | _, file_name = os.path.split(path) 77 | file_data = base64.b64decode(file_content.content) 78 | file_out = open(f"repos/{repo}/{file_name}", "wb") 79 | file_out.write(file_data) 80 | file_out.close() 81 | if check_java(f"repos/{repo}/{file_name}") == False: 82 | os.remove(f"repos/{repo}/{file_name}") 83 | return 0 84 | return 1 85 | except (GithubException, IOError) as exc: 86 | print('Error processing %s: %s', path, exc) 87 | return 0 88 | 89 | 90 | def download_java(owner, repo, access_token, branch="master"): 91 | if not os.path.exists(f"repos/{repo}/"): 92 | os.makedirs(f"repos/{repo}/") 93 | 94 | git = Github(access_token) 95 | try: 96 | git_repo = git.get_repo(f"{owner}/{repo}") 97 | except GithubException as e: 98 | if e.status == 404: 99 | print("Non") 100 | else: 101 | print("Error") 102 | shutil.rmtree(f"repos/{repo}/") 103 | return False 104 | sha = get_sha_for_tag(git_repo, branch) 105 | 106 | # Define the Github Tree API endpoint and repository details 107 | api_url = 'https://api.github.com/repos/{owner}/{repo}/git/trees/{branch}?recursive=1' 108 | # Make an HTTP GET request to the Github Tree API endpoint 109 | headers = {'Authorization': f'token {access_token}'} 110 | response = requests.get(api_url.format(owner=owner, repo=repo, branch=branch), headers=headers) 111 | data = response.json() 112 | cnt1, cnt2 = 0, 0 113 | print(git_repo.size) 114 | if git_repo.size < 500000000: 115 | git_clone(owner, repo) 116 | for subdir, dirs, files in os.walk(f"./temp/{repo}"): 117 | for file in tqdm(files): 118 | if not file.endswith(".java"): 119 | continue 120 | file_path = os.path.join(subdir, file) 121 | if os.path.getsize(file_path) < 15 * 1024: 122 | cnt2 += 1 123 | if check_java(file_path): 124 | cnt1 += 1 125 | shutil.copy2(file_path, f"repos/{repo}/{file}") 126 | shutil.rmtree(f"./temp/{repo}") 127 | else: 128 | print("File is too large!") 129 | if sha is not None: 130 | tree = data['tree'] 131 | leng = len(tree) 132 | for file in tqdm(tree): 133 | #for item in tqdm(tree): 134 | if file['type'] != "tree" and file['size'] < 15 * 1024 and 
file['path'].endswith(".java"): 135 | cnt1 += 1 136 | cnt1 += download_java_file(git_repo, sha, repo, file['path']) 137 | cnt2 += 1 138 | if cnt1 == 0: 139 | shutil.rmtree(f"repos/{repo}/") 140 | print(f"{owner}/{repo} downloaded: {cnt1}/{cnt1+cnt2} files") 141 | return cnt1, cnt2 142 | -------------------------------------------------------------------------------- /src/unixcoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | import torch 5 | import torch.nn as nn 6 | from transformers import RobertaTokenizer, RobertaModel, RobertaConfig 7 | 8 | class UniXcoder(nn.Module): 9 | def __init__(self, model_name): 10 | """ 11 | Build UniXcoder. 12 | 13 | Parameters: 14 | 15 | * `model_name`- huggingface model card name. e.g. microsoft/unixcoder-base 16 | """ 17 | super(UniXcoder, self).__init__() 18 | self.tokenizer = RobertaTokenizer.from_pretrained(model_name) 19 | self.config = RobertaConfig.from_pretrained(model_name) 20 | self.config.is_decoder = True 21 | self.model = RobertaModel.from_pretrained(model_name, config=self.config) 22 | 23 | self.register_buffer("bias", torch.tril(torch.ones((1024, 1024), dtype=torch.uint8)).view(1,1024, 1024)) 24 | self.lm_head = nn.Linear(self.config.hidden_size, self.config.vocab_size, bias=False) 25 | self.lm_head.weight = self.model.embeddings.word_embeddings.weight 26 | self.lsm = nn.LogSoftmax(dim=-1) 27 | 28 | self.tokenizer.add_tokens(["<mask0>"],special_tokens=True) 29 | 30 | def tokenize(self, inputs, mode="<encoder-only>", max_length=512, padding=False): 31 | """ 32 | Convert string to token ids 33 | 34 | Parameters: 35 | 36 | * `inputs`- list of input strings. 37 | * `max_length`- The maximum total source sequence length after tokenization. 38 | * `padding`- whether to pad source sequence length to max_length. 39 | * `mode`- which mode the sequence will use. i.e. <encoder-only>
, <decoder-only>, <encoder-decoder> 40 | """ 41 | assert mode in ["<encoder-only>", "<decoder-only>", "<encoder-decoder>"] 42 | assert max_length < 1024 43 | 44 | tokenizer = self.tokenizer 45 | 46 | tokens_ids = [] 47 | for x in inputs: 48 | tokens = tokenizer.tokenize(x) 49 | if mode == "<encoder-only>": 50 | tokens = tokens[:max_length-4] 51 | tokens = [tokenizer.cls_token,mode,tokenizer.sep_token] + tokens + [tokenizer.sep_token] 52 | elif mode == "<decoder-only>": 53 | tokens = tokens[-(max_length-3):] 54 | tokens = [tokenizer.cls_token,mode,tokenizer.sep_token] + tokens 55 | else: 56 | tokens = tokens[:max_length-5] 57 | tokens = [tokenizer.cls_token,mode,tokenizer.sep_token] + tokens + [tokenizer.sep_token] 58 | 59 | tokens_id = tokenizer.convert_tokens_to_ids(tokens) 60 | if padding: 61 | tokens_id = tokens_id + [self.config.pad_token_id] * (max_length-len(tokens_id)) 62 | tokens_ids.append(tokens_id) 63 | return tokens_ids 64 | 65 | def decode(self, source_ids): 66 | """ Convert token ids to string """ 67 | predictions = [] 68 | for x in source_ids: 69 | prediction = [] 70 | for y in x: 71 | t = y.cpu().numpy() 72 | t = list(t) 73 | if 0 in t: 74 | t = t[:t.index(0)] 75 | text = self.tokenizer.decode(t,clean_up_tokenization_spaces=False) 76 | prediction.append(text) 77 | predictions.append(prediction) 78 | return predictions 79 | 80 | def forward(self, source_ids): 81 | """ Obtain token embeddings and sentence embeddings """ 82 | mask = source_ids.ne(self.config.pad_token_id) 83 | token_embeddings = self.model(source_ids,attention_mask = mask.unsqueeze(1) * mask.unsqueeze(2))[0] 84 | sentence_embeddings = (token_embeddings * mask.unsqueeze(-1)).sum(1) / mask.sum(-1).unsqueeze(-1) 85 | return token_embeddings, sentence_embeddings 86 | 87 | def generate(self, source_ids, decoder_only = True, eos_id = None, beam_size = 5, max_length = 64): 88 | """ Generate sequence given context (source_ids) """ 89 | 90 | # Set encoder mask attention matrix: bidirectional for <encoder-decoder>, unidirectional for <decoder-only> 91 | if decoder_only: 92 | mask = self.bias[:,:source_ids.size(-1),:source_ids.size(-1)] 93 | else: 94 | mask = source_ids.ne(self.config.pad_token_id) 95 | mask = mask.unsqueeze(1) * mask.unsqueeze(2) 96 | 97 | if eos_id is None: 98 | eos_id = self.config.eos_token_id 99 | 100 | device = source_ids.device 101 | 102 | # Decoding using beam search 103 | preds = [] 104 | zero = torch.LongTensor(1).fill_(0).to(device) 105 | source_len = list(source_ids.ne(1).sum(-1).cpu().numpy()) 106 | length = source_ids.size(-1) 107 | encoder_output = self.model(source_ids,attention_mask=mask) 108 | for i in range(source_ids.shape[0]): 109 | context = [[x[i:i+1,:,:source_len[i]].repeat(beam_size,1,1,1) for x in y] 110 | for y in encoder_output.past_key_values] 111 | beam = Beam(beam_size,eos_id,device) 112 | input_ids = beam.getCurrentState().clone() 113 | context_ids = source_ids[i:i+1,:source_len[i]].repeat(beam_size,1) 114 | out = encoder_output.last_hidden_state[i:i+1,:source_len[i]].repeat(beam_size,1,1) 115 | for _ in range(max_length): 116 | if beam.done(): 117 | break 118 | if _ == 0: 119 | hidden_states = out[:,-1,:] 120 | out = self.lsm(self.lm_head(hidden_states)).data 121 | beam.advance(out) 122 | input_ids.data.copy_(input_ids.data.index_select(0, beam.getCurrentOrigin())) 123 | input_ids = beam.getCurrentState().clone() 124 | else: 125 | length = context_ids.size(-1)+input_ids.size(-1) 126 | out = self.model(input_ids,attention_mask=self.bias[:,context_ids.size(-1):length,:length], 127 | past_key_values=context).last_hidden_state 128 | hidden_states = out[:,-1,:] 129 | out =
self.lsm(self.lm_head(hidden_states)).data 130 | beam.advance(out) 131 | input_ids.data.copy_(input_ids.data.index_select(0, beam.getCurrentOrigin())) 132 | input_ids = torch.cat((input_ids,beam.getCurrentState().clone()),-1) 133 | hyp = beam.getHyp(beam.getFinal()) 134 | pred = beam.buildTargetTokens(hyp)[:beam_size] 135 | pred = [torch.cat([x.view(-1) for x in p]+[zero]*(max_length-len(p))).view(1,-1) for p in pred] 136 | preds.append(torch.cat(pred,0).unsqueeze(0)) 137 | 138 | preds = torch.cat(preds,0) 139 | 140 | return preds 141 | 142 | 143 | 144 | class Beam(object): 145 | def __init__(self, size, eos, device): 146 | self.size = size 147 | self.device = device 148 | # The score for each translation on the beam. 149 | self.scores = torch.FloatTensor(size).zero_().to(device) 150 | # The backpointers at each time-step. 151 | self.prevKs = [] 152 | # The outputs at each time-step. 153 | self.nextYs = [torch.LongTensor(size).fill_(0).to(device)] 154 | # Has EOS topped the beam yet. 155 | self._eos = eos 156 | self.eosTop = False 157 | # Time and k pair for finished. 158 | self.finished = [] 159 | 160 | def getCurrentState(self): 161 | "Get the outputs for the current timestep." 162 | batch = self.nextYs[-1].view(-1, 1) 163 | return batch 164 | 165 | def getCurrentOrigin(self): 166 | "Get the backpointers for the current timestep." 167 | return self.prevKs[-1] 168 | 169 | def advance(self, wordLk): 170 | """ 171 | Given prob over words for every last beam `wordLk` and attention 172 | `attnOut`: Compute and update the beam search. 173 | 174 | Parameters: 175 | 176 | * `wordLk`- probs of advancing from the last step (K x words) 177 | * `attnOut`- attention at the last step 178 | 179 | Returns: True if beam search is complete. 180 | """ 181 | numWords = wordLk.size(1) 182 | 183 | # Sum the previous scores. 184 | if len(self.prevKs) > 0: 185 | beamLk = wordLk + self.scores.unsqueeze(1).expand_as(wordLk) 186 | 187 | # Don't let EOS have children. 188 | for i in range(self.nextYs[-1].size(0)): 189 | if self.nextYs[-1][i] == self._eos: 190 | beamLk[i] = -1e20 191 | else: 192 | beamLk = wordLk[0] 193 | flatBeamLk = beamLk.view(-1) 194 | bestScores, bestScoresId = flatBeamLk.topk(self.size, 0, True, True) 195 | 196 | self.scores = bestScores 197 | 198 | # bestScoresId is flattened beam x word array, so calculate which 199 | # word and beam each score came from 200 | prevK = torch.div(bestScoresId, numWords, rounding_mode="floor") 201 | self.prevKs.append(prevK) 202 | self.nextYs.append((bestScoresId - prevK * numWords)) 203 | 204 | 205 | for i in range(self.nextYs[-1].size(0)): 206 | if self.nextYs[-1][i] == self._eos: 207 | s = self.scores[i] 208 | self.finished.append((s, len(self.nextYs) - 1, i)) 209 | 210 | # End condition is when top-of-beam is EOS and no global score. 
211 | if self.nextYs[-1][0] == self._eos: 212 | self.eosTop = True 213 | 214 | def done(self): 215 | return self.eosTop and len(self.finished) >= self.size 216 | 217 | def getFinal(self): 218 | if len(self.finished) == 0: 219 | self.finished.append((self.scores[0], len(self.nextYs) - 1, 0)) 220 | self.finished.sort(key=lambda a: -a[0]) 221 | if len(self.finished) != self.size: 222 | unfinished=[] 223 | for i in range(self.nextYs[-1].size(0)): 224 | if self.nextYs[-1][i] != self._eos: 225 | s = self.scores[i] 226 | unfinished.append((s, len(self.nextYs) - 1, i)) 227 | unfinished.sort(key=lambda a: -a[0]) 228 | self.finished+=unfinished[:self.size-len(self.finished)] 229 | return self.finished[:self.size] 230 | 231 | def getHyp(self, beam_res): 232 | """ 233 | Walk back to construct the full hypothesis. 234 | """ 235 | hyps=[] 236 | for _,timestep, k in beam_res: 237 | hyp = [] 238 | for j in range(len(self.prevKs[:timestep]) - 1, -1, -1): 239 | hyp.append(self.nextYs[j+1][k]) 240 | k = self.prevKs[j][k] 241 | hyps.append(hyp[::-1]) 242 | return hyps 243 | 244 | def buildTargetTokens(self, preds): 245 | sentence=[] 246 | for pred in preds: 247 | tokens = [] 248 | for tok in pred: 249 | if tok==self._eos: 250 | break 251 | tokens.append(tok) 252 | sentence.append(tokens) 253 | return sentence 254 | 255 | --------------------------------------------------------------------------------
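
Note: the snippet below is an illustrative usage sketch, not a file from the repository. It shows how the `tokenize`, `generate`, and `decode` methods of the `UniXcoder` wrapper in `src/unixcoder.py` above fit together for decoder-only completion of a code context, for example to draft a log statement. The checkpoint name `microsoft/unixcoder-base` is taken from the class docstring; the Java context string and all variable names are hypothetical.

```python
import torch
from unixcoder import UniXcoder  # the class defined in src/unixcoder.py above

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = UniXcoder("microsoft/unixcoder-base").to(device)

# Hypothetical Java context that stops right where a logging call could be completed.
context = (
    "public void connect(String host) {\n"
    "    try {\n"
    "        client.connect(host);\n"
    "    } catch (IOException e) {\n"
    "        logger."
)

# <decoder-only> mode keeps the rightmost tokens of the context for left-to-right generation.
token_ids = model.tokenize([context], max_length=512, mode="<decoder-only>")
source_ids = torch.tensor(token_ids).to(device)

# Beam search over the continuation; returns ids of shape (batch, beam_size, max_length).
prediction_ids = model.generate(source_ids, decoder_only=True, beam_size=3, max_length=64)
completions = model.decode(prediction_ids)
print(completions[0])  # beam_size candidate continuations for the single input
```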