├── LICENSE
├── LogBench-O
│   ├── LogBench-O_prefix_1point.zip
│   ├── LogBench-O_prefix_1point_file_level.zip
│   └── LogBench-O_prefix_1point_wo_comments.zip
├── LogBench-T
│   ├── LogBench-T_prefix_1point.zip
│   └── LogBench-T_prefix_1point_file_level.zip
├── README.md
├── build
│   └── code-transformer.jar
├── cases
│   └── generated_cases.csv
├── img
│   ├── empirical_overview.jpg
│   ├── empirical_overview.pdf
│   └── empirical_overview.png
└── src
    ├── Baselines
    │   ├── ChatGPT
    │   │   └── chatgpt.py
    │   ├── Davinci
    │   │   └── davinci.py
    │   ├── DeepLV
    │   │   ├── Helper.py
    │   │   ├── Metrics.py
    │   │   ├── block_level_LSTM.py
    │   │   ├── block_processing
    │   │   │   └── block_processing.py
    │   │   └── deepLV_cleaner.py
    │   ├── Incoder
    │   │   └── incoder.py
    │   ├── LoGenText-Plus
    │   │   ├── README.md
    │   │   ├── requirements.txt
    │   │   └── results
    │   │       └── 1
    │   │           ├── activemq
    │   │           │   ├── translation.context.test
    │   │           │   ├── translation.context.test.log
    │   │           │   └── translation.context.test.unsort
    │   │           ├── ambari
    │   │           │   ├── translation.context.test
    │   │           │   ├── translation.context.test.log
    │   │           │   └── translation.context.test.unsort
    │   │           ├── brooklyn
    │   │           │   ├── translation.context.test
    │   │           │   ├── translation.context.test.log
    │   │           │   └── translation.context.test.unsort
    │   │           ├── camel
    │   │           │   ├── translation.context.test
    │   │           │   ├── translation.context.test.log
    │   │           │   └── translation.context.test.unsort
    │   │           ├── cloudstack
    │   │           │   ├── translation.context.test
    │   │           │   ├── translation.context.test.log
    │   │           │   └── translation.context.test.unsort
    │   │           ├── hadoop
    │   │           │   ├── translation.context.test
    │   │           │   ├── translation.context.test.log
    │   │           │   └── translation.context.test.unsort
    │   │           ├── hbase
    │   │           │   ├── translation.context.test
    │   │           │   ├── translation.context.test.log
    │   │           │   └── translation.context.test.unsort
    │   │           ├── hive
    │   │           │   ├── translation.context.test
    │   │           │   ├── translation.context.test.log
    │   │           │   └── translation.context.test.unsort
    │   │           ├── ignite
    │   │           │   ├── translation.context.test
    │   │           │   ├── translation.context.test.log
    │   │           │   └── translation.context.test.unsort
    │   │           └── synapse
    │   │               ├── translation.context.test
    │   │               ├── translation.context.test.log
    │   │               └── translation.context.test.unsort
    │   ├── README.md
    │   ├── StarCoder
    │   │   └── starcoder.py
    │   ├── WhichVar
    │   │   ├── analysis.ipynb
    │   │   ├── cleaner.ipynb
    │   │   ├── data.json
    │   │   ├── model.py
    │   │   ├── output.json
    │   │   ├── test.json
    │   │   └── train.json
    │   └── lance
    │       ├── README.md
    │       └── lance.py
    ├── CodeTransformer
    │   └── README.md
    ├── DataCollector
    │   ├── check_pom.py
    │   ├── download.py
    │   ├── filter_pom.py
    │   └── get_java.py
    ├── eva_sem.ipynb
    └── unixcoder.py
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/LogBench-O/LogBench-O_prefix_1point.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/logpai/LogBench/364357338014a6d76d313fc3d8549c02938b6f2e/LogBench-O/LogBench-O_prefix_1point.zip
--------------------------------------------------------------------------------
/LogBench-O/LogBench-O_prefix_1point_file_level.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/logpai/LogBench/364357338014a6d76d313fc3d8549c02938b6f2e/LogBench-O/LogBench-O_prefix_1point_file_level.zip
--------------------------------------------------------------------------------
/LogBench-O/LogBench-O_prefix_1point_wo_comments.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/logpai/LogBench/364357338014a6d76d313fc3d8549c02938b6f2e/LogBench-O/LogBench-O_prefix_1point_wo_comments.zip
--------------------------------------------------------------------------------
/LogBench-T/LogBench-T_prefix_1point.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/logpai/LogBench/364357338014a6d76d313fc3d8549c02938b6f2e/LogBench-T/LogBench-T_prefix_1point.zip
--------------------------------------------------------------------------------
/LogBench-T/LogBench-T_prefix_1point_file_level.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/logpai/LogBench/364357338014a6d76d313fc3d8549c02938b6f2e/LogBench-T/LogBench-T_prefix_1point_file_level.zip
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # LogBench
2 |
3 | **LogBench is a benchmark for evaluating logging statement generation.**
4 |
5 | Logging statements are imperative in modern software. They play an important role in reflecting developers' intentions, recording system behavior, and guiding failure diagnosis. LogBench provides a benchmark and toolkit that allow you to measure your own models and conveniently compare them with existing baseline models.
6 |
7 |
8 | If you find our work beneficial to your research, please kindly cite the following paper:
9 |
10 | + Yichen Li, Yintong Huo, Zhihan Jiang, Renyi Zhong, Pinjia He, Yuxin Su, Lionel C. Briand, and Michael R. Lyu. [Exploring the Effectiveness of LLMs in Automated Logging Generation: An Empirical Study](https://arxiv.org/abs/2307.05950), IEEE Transactions on Software Engineering (TSE), 2024.
11 |
12 | ## Study overview
13 | ![Study overview](img/empirical_overview.png)
14 |
15 | The study is fully described in this [paper](https://arxiv.org/abs/2307.05950). LogBench comprises two subsets for evaluating the model's *effectiveness* and *generalizability*, respectively:
16 |
17 | 1. Effectiveness: **LogBench-O** contains a collection of high-quality logging statements and their associated code contexts.
18 | 2. Generalizability: **LogBench-T** is an unseen code dataset obtained by applying semantically-equivalent code transformations to LogBench-O.
19 |
20 | Additionally, LogBench offers several variants to support different settings in logging statement generation (a sketch of what a sample looks like follows the list below), including:
21 |
22 | * Method-level
23 | * File-level
24 | * Comment-included
25 | * Comment-free
26 |
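To make the variants concrete, here is a minimal, purely hypothetical sketch of a method-level, comment-included sample: the model is given a Java method in which the statement at the marked logging point is left incomplete and must generate it from the surrounding code and comments. The class, logger, and message below are invented for illustration and are not taken from the released archives.

```java
import java.io.IOException;
import java.net.Socket;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// Hypothetical sample for illustration only; identifiers and the log message
// are invented and do not come from LogBench-O/LogBench-T.
public class ConnectionManager {
    private static final Logger log = LoggerFactory.getLogger(ConnectionManager.class);

    /** Opens a connection to the given host and port. */
    public void connect(String host, int port) throws IOException {
        try (Socket socket = new Socket(host, port)) {
            // <logging point> -- the expected completion (ground truth) would be:
            log.info("Connected to {}:{}", host, port);
        } catch (IOException e) {
            log.error("Failed to connect to {}:{}", host, port, e);
            throw e;
        }
    }
}
```

Judging from the archive names, the file-level variant provides the whole file as context rather than a single method, and the `wo_comments` variant removes code comments from the input.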
27 | ## Repository organization
28 | We currently provide part of the code in the folder `/src`. We will release the full source code after the paper has been accepted.
29 |
30 | * LogBench-O: The `/LogBench-O` folder contains the files for LogBench-O.
31 | * LogBench-T: The `/LogBench-T` folder contains the files for LogBench-T.
32 | * Cases: Please refer to the `cases` folder for the generated cases.
33 |
34 | #
35 |
36 | ```
37 | ├── LICENSE
38 | ├── LogBench-O
39 | │   ├── LogBench-O_prefix_1point.zip
40 | │   ├── LogBench-O_prefix_1point_file_level.zip
41 | │   └── LogBench-O_prefix_1point_wo_comments.zip
42 | ├── LogBench-T
43 | │   ├── LogBench-T_prefix_1point.zip
44 | │   └── LogBench-T_prefix_1point_file_level.zip
45 | ├── README.md
46 | ├── build
47 | │   └── code-transformer.jar
48 | ├── cases
49 | │   └── generated_cases.csv
50 | ├── img
51 | │   ├── overview.pdf
52 | │   └── overview.png
53 | └── src
54 |     ├── Baselines
55 |     │   ├── DeepLV
56 |     │   ├── WhichVar
57 |     │   ├── LoGenText-Plus
58 |     │   ├── StarCoder
59 |     │   ├── Lance
60 |     │   ├── InCoder
61 |     │   └── ...
62 |     ├── CodeTransformer
63 |     │   └── README.md
64 |     └── DataCollector
65 |         └── ...
66 | ```
67 |
68 |
69 | ## Study subjects
70 | | 11 LLMs | Access | Paper reference |
71 | | ------------ | ------ | ---- |
72 | | Davinci | API | [Project](https://platform.openai.com/docs/models) |
73 | | ChatGPT | API | [Project](https://platform.openai.com/docs/models) |
74 | | LANCE | Model | [ICSE'22] [Using deep learning to generate complete log statements](https://dl.acm.org/doi/abs/10.1145/3510003.3511561) |
75 | | InCoder | Model | [ICLR'23] [InCoder: A Generative Model for Code Infilling and Synthesis](https://openreview.net/forum?id=hQwb-lbM6EL) |
76 | | Llama2 | Model | [Llama 2: Open Foundation and Fine-Tuned Chat Models](https://arxiv.org/abs/2307.09288) |
77 | | StarCoder | Model | [StarCoder: may the source be with you!](https://arxiv.org/abs/2305.06161) |
78 | | CodeLlama | Model | [Code Llama: Open Foundation Models for Code](https://arxiv.org/abs/2308.12950) |
79 | | CodeGeeX | Plugin | [CodeGeeX: A Pre-Trained Model for Code Generation with Multilingual Evaluations on HumanEval-X](https://arxiv.org/abs/2303.17568) |
80 | | TabNine | Plugin | - |
81 | | Copilot | Plugin | - |
82 | | CodeWhisperer | Plugin | - |
83 | | **Non-LLMs** | | |
84 | | DeepLV | Model | [ICSE'21] [DeepLV: Suggesting Log Levels Using Ordinal Based Neural Networks](https://ieeexplore.ieee.org/abstract/document/9402068) |
85 | | WhichVar | Model | [TSE'21] [Which Variables Should I Log?](https://ieeexplore.ieee.org/document/8840982) |
86 | | LoGenText-Plus | Model | [TOSEM'23] [LoGenText-Plus: Improving Neural Machine Translation Based Logging Texts Generation with Syntactic Templates](https://dl.acm.org/doi/10.1145/3624740) |
87 |
88 | For each baseline, please make sure to cite the corresponding paper when using its code.
89 |
90 |
91 |
92 | ## Download the original crawled logging dataset
93 | For further logging-related research, since GitHub does not host large datasets, you can download the **whole** collected logging dataset (full size) [here](https://drive.google.com/file/d/13EV-rIFEwVrLGnpNIcpF3u9NSOh_gCNM/view?usp=sharing)
94 | (zip: 252 MB; unzipped: 786 MB).
95 |
96 |
97 | ## Code transformation tool
98 |
99 | The folder `/build` contains the built transformation tool, which performs the code transformation automatically with its eight code transformers (an illustrative before/after sketch follows the command below).
100 | - To conduct the code transformation in batch:
101 | ```
102 | java -jar code-transformer.jar -f ./javafiles/
103 | ```
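For intuition, the sketch below shows the kind of semantics-preserving rewrite such a transformer may apply. It is a hypothetical example (the eight concrete transformations are implemented inside `code-transformer.jar` and are not enumerated here); it only illustrates that the transformed code behaves identically while its surface form, and hence the context around any logging point, looks unfamiliar to a model.

```java
// Hypothetical illustration of a semantics-preserving transformation:
// a counted for loop is rewritten as an equivalent while loop.
public class TransformExample {

    // Original form, as it might appear in LogBench-O.
    static int sumOriginal(int[] items) {
        int total = 0;
        for (int i = 0; i < items.length; i++) {
            total += items[i];
        }
        return total;
    }

    // Transformed form, as a loop-rewriting transformer could emit for LogBench-T.
    static int sumTransformed(int[] items) {
        int total = 0;
        int i = 0;
        while (i < items.length) {
            total += items[i];
            i++;
        }
        return total;
    }
}
```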
104 |
--------------------------------------------------------------------------------
/build/code-transformer.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/logpai/LogBench/364357338014a6d76d313fc3d8549c02938b6f2e/build/code-transformer.jar
--------------------------------------------------------------------------------
/img/empirical_overview.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/logpai/LogBench/364357338014a6d76d313fc3d8549c02938b6f2e/img/empirical_overview.jpg
--------------------------------------------------------------------------------
/img/empirical_overview.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/logpai/LogBench/364357338014a6d76d313fc3d8549c02938b6f2e/img/empirical_overview.pdf
--------------------------------------------------------------------------------
/img/empirical_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/logpai/LogBench/364357338014a6d76d313fc3d8549c02938b6f2e/img/empirical_overview.png
--------------------------------------------------------------------------------
/src/Baselines/ChatGPT/chatgpt.py:
--------------------------------------------------------------------------------
1 | from revChatGPT.V3 import Chatbot
2 | import os
3 | import glob
4 | import time
5 | import random
6 |
7 | def read_input_file(input_file):
8 | with open(input_file, 'r') as file:
9 | input_text = file.read()
10 | return input_text
11 |
12 | def write_output_file(output_file, content):
13 | with open(output_file, 'w') as file:
14 | file.write(content)
15 |
16 | def main():
17 | input_folder = ""
18 | output_folder = ""
19 | java_files_pattern = os.path.join(input_folder, "*.java")
20 | input_files = glob.glob(java_files_pattern)
21 | random.shuffle(input_files)
22 | output_files = [os.path.join(output_folder, os.path.splitext(os.path.basename(f))[0] + "_output.java") for f in input_files]
23 | os.makedirs(output_folder, exist_ok=True)
24 |
25 | for i, input_file in enumerate(input_files):
26 |
27 | chatbot = Chatbot(api_key="")
28 | print(f"Processing {input_file}...")
29 | input_text = read_input_file(input_file)
30 | input_text = "Please complete the incomplete logging statement at the logging point. Please just reply me one line of code, don't reply me other text.:\n" + input_text
31 | try:
32 | if os.path.exists(output_files[i]):
33 | print("Output file already exists. Skipping...")
34 | continue
35 | result = chatbot.ask(input_text)
36 | time.sleep(2)
37 | output_file = output_files[i]
38 | write_output_file(output_file, result)
39 | print(f"Code saved to {output_file}")
40 | except Exception as e:
41 | print(f"Error processing {input_file}: {str(e)}")
42 |
43 |
44 | if __name__ == "__main__":
45 | main()
--------------------------------------------------------------------------------
/src/Baselines/Davinci/davinci.py:
--------------------------------------------------------------------------------
1 | import openai
2 | import os
3 | import glob
4 | import time
5 | import random
6 | from tqdm import tqdm
7 |
8 | def read_input_file(input_file):
9 | with open(input_file, 'r') as file:
10 | input_text = file.read()
11 | return input_text
12 |
13 | def write_output_file(output_file, content):
14 | with open(output_file, 'w') as file:
15 | file.write(content)
16 |
17 | openai.api_key = ""
18 |
19 | def generate_text(prompt, model="text-davinci-003", tokens=1024, temperature=1, top_p=1):
20 | response = openai.Completion.create(
21 | engine=model,
22 | prompt=prompt,
23 | max_tokens=tokens,
24 | n=1,
25 | stop=None,
26 | temperature=temperature,
27 | top_p=top_p
28 | )
29 |
30 | return response.choices[0].text.strip()
31 |
32 | def main():
33 | input_folder = ""
34 | output_folder = ""
35 | java_files_pattern = os.path.join(input_folder, "*.java")
36 | input_files = glob.glob(java_files_pattern)
37 | random.shuffle(input_files)
38 | output_files = [os.path.join(output_folder, os.path.splitext(os.path.basename(f))[0] + "_output.java") for f in input_files]
39 | os.makedirs(output_folder, exist_ok=True)
40 |
41 | for i, input_file in enumerate(tqdm(input_files, desc="Processing files")):
42 | print(f"Processing {input_file}...")
43 | input_text = read_input_file(input_file)
44 | input_text = "Please complete the incomplete logging statement at the logging point. Please just reply me one line of code, don't reply me other text.:\n" + input_text
45 | try:
46 | if os.path.exists(output_files[i]):
47 | print("Output file already exists. Skipping...")
48 | continue
49 | result = generate_text(input_text)
50 | time.sleep(2)
51 | output_file = output_files[i]
52 | write_output_file(output_file, result)
53 | print(f"Code saved to {output_file}")
54 | except Exception as e:
55 | print(f"Error processing {input_file}: {str(e)}")
56 |
57 | if __name__ == "__main__":
58 | main()
59 |
--------------------------------------------------------------------------------
/src/Baselines/DeepLV/Helper.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import re as re
4 | import string
5 | import numpy as np
6 | import csv
7 | import pandas as pd
8 |
9 | from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, accuracy_score, precision_recall_fscore_support
10 | from sklearn.utils import resample
11 | from sklearn.preprocessing import LabelEncoder
12 |
13 | # for ordinal
14 | trace_label = [1, 0, 0, 0, 0]
15 | debug_label = [1, 1, 0, 0, 0]
16 | info_label = [1, 1, 1, 0, 0]
17 | warn_label = [1, 1 ,1, 1, 0]
18 | error_label = [1, 1, 1, 1, 1]
19 |
20 | # for normal
21 | #trace_label = [1, 0, 0, 0, 0]
22 | #debug_label = [0, 1, 0, 0, 0]
23 | #info_label = [0, 0, 1, 0, 0]
24 | #warn_label = [0, 0 ,0, 1, 0]
25 | #error_label = [0, 0, 0, 0, 1]
26 |
27 |
28 |
29 |
30 | def ordinal_encoder(classes):
31 | y = []
32 | for c in classes:
33 | if c == 'trace':
34 | y.append(trace_label)
35 | elif c == 'debug':
36 | y.append(debug_label)
37 | elif c == 'info':
38 | y.append(info_label)
39 | elif c == 'warn':
40 | y.append(warn_label)
41 | else:
42 | y.append(error_label)
43 | y = np.array(y)
44 | return y
45 |
46 |
47 | def predict_prob_encoder(predict_prob):
48 | label_predicted = []
49 | for column_prob in predict_prob:
50 | column_label = []
51 | for p in column_prob:
52 | if p > 0.5:
53 | column_label.append(1)
54 | else:
55 | column_label.append(0)
56 | label_predicted.append(column_label)
57 | label_predicted = np.array(label_predicted)
58 | return label_predicted
59 |
60 |
61 | def predicted_label_encoder(y_list):
62 | target_list = []
63 |
64 | target_trace_label = [1, 0, 0, 0, 0]
65 | target_debug_label = [0, 1, 0, 0, 0]
66 | target_info_label = [0, 0, 1 ,0, 0]
67 | target_warn_label = [0, 0, 0, 1, 0]
68 | target_error_label = [0, 0, 0, 0, 1]
69 | target_exception_label = [0, 0, 0, 0, 0]
70 | for y in y_list:
71 | if np.array_equal(np.array(y), np.array(trace_label)):
72 | target_list.append(target_trace_label)
73 | elif np.array_equal(np.array(y), np.array(debug_label)):
74 | target_list.append(target_debug_label)
75 | elif np.array_equal(np.array(y), np.array(info_label)):
76 | target_list.append(target_info_label)
77 | elif np.array_equal(np.array(y), np.array(warn_label)):
78 | target_list.append(target_warn_label)
79 | elif np.array_equal(np.array(y), np.array(error_label)):
80 | target_list.append(target_error_label)
81 | else:
82 | print("Something wrong happend in predicted_label_encoder.", y)
83 | target_list.append(target_warn_label)
84 | return np.array(target_list)
85 |
86 |
87 |
88 |
89 | def pd_encoder(y_list): #0:trace, 1:debug, 2:info, 3:warn, 4: error
90 | target_list = []
91 | for y in y_list:
92 | if np.array_equal(np.array(y), np.array(trace_label)):
93 | target_list.append(0)
94 | elif np.array_equal(np.array(y), np.array(debug_label)):
95 | target_list.append(1)
96 | elif np.array_equal(np.array(y), np.array(info_label)):
97 | target_list.append(2)
98 | elif np.array_equal(np.array(y), np.array(warn_label)):
99 | target_list.append(3)
100 | elif np.array_equal(np.array(y), np.array(error_label)):
101 | target_list.append(4)
102 | else:
103 | print("Something wrong happend in pd_encoder.", y)
104 | target_list.append(3)
105 | return target_list
106 |
107 |
108 |
109 |
110 | def class_accuracy(y_test, y_predicted):
111 | trace_test_list = []
112 | debug_test_list = []
113 | info_test_list = []
114 | warn_test_list = []
115 | error_test_list = []
116 |
117 | trace_predicted_list = []
118 | debug_predicted_list = []
119 | info_predicted_list = []
120 | warn_predicted_list = []
121 | error_predicted_list = []
122 |
123 | for yt, yp in zip(y_test, y_predicted):
124 | if np.array_equal(np.array(yt), np.array(trace_label)):
125 | trace_test_list.append(trace_label)
126 | trace_predicted_list.append(yp)
127 | elif np.array_equal(np.array(yt), np.array(debug_label)):
128 | debug_test_list.append(debug_label)
129 | debug_predicted_list.append(yp)
130 | elif np.array_equal(np.array(yt), np.array(info_label)):
131 | info_test_list.append(info_label)
132 | info_predicted_list.append(yp)
133 | elif np.array_equal(np.array(yt), np.array(warn_label)):
134 | warn_test_list.append(warn_label)
135 | warn_predicted_list.append(yp)
136 | elif np.array_equal(np.array(yt), np.array(error_label)):
137 | error_test_list.append(error_label)
138 | error_predicted_list.append(yp)
139 | else:
140 | print("something wrong happened in class_accuracy", yt, yp)
141 | acc_trace = accuracy_score(np.array(trace_test_list), np.array(trace_predicted_list))
142 | acc_debug = accuracy_score(np.array(debug_test_list), np.array(debug_predicted_list))
143 | acc_info = accuracy_score(np.array(info_test_list), np.array(info_predicted_list))
144 | acc_warn = accuracy_score(np.array(warn_test_list), np.array(warn_predicted_list))
145 | acc_error = accuracy_score(np.array(error_test_list), np.array(error_predicted_list))
146 | print ('Trace Accuracy: ', acc_trace)
147 | print ('Debug Accuracy: ', acc_debug)
148 | print ('Info Accuracy: ', acc_info)
149 | print ('Warn Accuracy: ', acc_warn)
150 | print ('Error Accuracy: ', acc_error)
151 |
152 | #This is for the case combining debug and trace together
153 | def upsampling(x_train, y_train, seed_value):
154 |
155 | #code below is for upsampling the data
156 |
157 | df=pd.DataFrame()
158 | df['x_train'] = x_train
159 | df['y_train'] = pd_encoder(y_train)
160 |
161 | data_td = df.loc[df['y_train'] == 0]
162 | data_info = df.loc[df['y_train'] == 1]
163 | data_warn = df.loc[df['y_train'] == 2]
164 | data_error = df.loc[df['y_train'] == 3]
165 | data_len = np.array([len(data_td), len(data_info), len(data_warn), len(data_error)])
166 | max_num = np.max(data_len)
167 |
168 | td_upsampled = resample(data_td, replace=True, n_samples=max_num, random_state=seed_value)
169 | info_upsampled = resample(data_info, replace=True, n_samples=max_num, random_state=seed_value)
170 | warn_upsampled = resample(data_warn, replace=True, n_samples=max_num, random_state=seed_value)
171 | error_upsampled = resample(data_error, replace=True, n_samples=max_num, random_state=seed_value)
172 |
173 | td_upsampled=td_upsampled.drop(columns=['y_train']).to_numpy()
174 | info_upsampled=info_upsampled.drop(columns=['y_train']).to_numpy()
175 | warn_upsampled=warn_upsampled.drop(columns=['y_train']).to_numpy()
176 | error_upsampled=error_upsampled.drop(columns=['y_train']).to_numpy()
177 |
178 | x_train = np.concatenate((td_upsampled, info_upsampled, warn_upsampled, error_upsampled))
179 | temp_y_train = []
180 | for i in range(0, max_num):
181 | temp_y_train.append([1, 0, 0, 0])
182 | for i in range(0, max_num):
183 | temp_y_train.append([1, 1, 0, 0])
184 | for i in range(0, max_num):
185 | temp_y_train.append([1, 1, 1, 0])
186 | for i in range(0, max_num):
187 | temp_y_train.append([1, 1, 1, 1])
188 |
189 | y_train = np.array(temp_y_train)
190 | return x_train, y_train
191 |
192 |
193 | def ordinal_accuracy(y_test, y_predicted):
194 | print(len(y_test), len(y_predicted))
195 | left_boundary = 0.0
196 | right_boundary = 4.0
197 | value_cumulation = 0.0
198 | for yt, yp in zip(y_test, y_predicted):
199 | lb_distance = float(yt) - left_boundary
200 | rb_distance = right_boundary - float(yt)
201 | max_distance = np.max(np.array([lb_distance, rb_distance]))
202 | value = 1.0 - abs(float(yp) - float(yt))/max_distance
203 | value_cumulation = value_cumulation + value
204 | return value_cumulation/float(len(y_test))
205 |
206 |
--------------------------------------------------------------------------------
/src/Baselines/DeepLV/Metrics.py:
--------------------------------------------------------------------------------
1 | from keras.callbacks import Callback
2 | import numpy as np
3 | from sklearn.model_selection import train_test_split
4 | from sklearn.model_selection import StratifiedKFold
5 | from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, balanced_accuracy_score, accuracy_score
6 | from sklearn.utils import resample
7 |
8 | class Metrics(Callback):
9 | def on_train_begin(self, logs={}):
10 | self.val_f1s = []
11 | self.val_recalls = []
12 | self.val_precisions = []
13 | self.val_auc = []
14 |
15 | def on_epoch_end(self, epoch, logs={}):
16 | val_predict = (np.asarray(self.model.predict(
17 | self.validation_data[0]))).round()
18 | val_targ = self.validation_data[1]
19 | pos_label=1
20 | _val_f1 = f1_score(val_targ, val_predict, labels=[pos_label],pos_label=1, average ='binary')
21 | _val_recall = recall_score(val_targ, val_predict, labels=[pos_label],pos_label=1, average ='binary')
22 | _val_precision = precision_score(val_targ, val_predict, labels=[pos_label],pos_label=1, average ='binary')
23 | _val_auc = roc_auc_score(val_targ, val_predict)
24 | self.val_f1s.append(_val_f1)
25 | self.val_recalls.append(_val_recall)
26 | self.val_precisions.append(_val_precision)
27 | self.val_auc.append(_val_auc)
28 | return
29 |
30 |
--------------------------------------------------------------------------------
/src/Baselines/DeepLV/block_level_LSTM.py:
--------------------------------------------------------------------------------
1 | import yaml
2 | import os
3 | import sys
4 | import re as re
5 | import string
6 |
7 | import multiprocessing
8 | import numpy as np
9 | from gensim.models.word2vec import Word2Vec
10 | from gensim.corpora.dictionary import Dictionary
11 | from gensim.parsing.porter import PorterStemmer
12 |
13 | import random as rn
14 | seed_value = 17020
15 | seed_window = 1500
16 | import pandas as pd
17 | import csv
18 | from sklearn.model_selection import train_test_split
19 | from sklearn.model_selection import StratifiedKFold
20 | from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, balanced_accuracy_score, accuracy_score, precision_recall_fscore_support
21 | from sklearn.utils import resample
22 | from sklearn.preprocessing import LabelEncoder
23 |
24 | import matplotlib.pyplot as plt
25 |
26 | import tensorflow as tf
27 | import Metrics
28 | from keras import backend as K
29 | from keras.preprocessing import sequence
30 | from keras.models import Sequential
31 | from keras.layers import Dense, Flatten, Dropout, Embedding, LSTM, Bidirectional, Activation, LeakyReLU
32 | from keras.models import model_from_yaml
33 | from keras.utils import np_utils
34 | from keras_self_attention import SeqSelfAttention
35 |
36 |
37 | import Helper
38 |
39 |
40 | config = tf.ConfigProto( device_count = {'GPU': 1 , 'CPU': 16} )
41 | sess = tf.Session(config=config)
42 | K.set_session(sess)
43 |
44 |
45 | csv.field_size_limit(100000000)
46 | sys.setrecursionlimit(1000000)
47 | # set parameters:
48 | n_iterations = 1
49 | embedding_iterations = 1
50 | n_epoch = 50
51 |
52 | vocab_dim = 100
53 | maxlen = 100
54 | n_exposures = 10
55 | window_size = 7
56 | batch_size = 24
57 | input_length = 100
58 | cpu_count = multiprocessing.cpu_count()
59 |
60 | test_list = []
61 | neg_full = []
62 | pos_full = []
63 | syntactic_list = []
64 |
65 |
66 |
67 | model_location = 'model_block' +'/lstm_'+ sys.argv[1]
68 | embedding_location = 'embedding_block' + '/Word2vec_model_' + sys.argv[1] + '.pkl'
69 |
70 |
71 | def loadfile():
72 |
73 | data_full=pd.read_csv('block_processing/blocks/logged_syn' + '_' + sys.argv[1] + '.csv', usecols=[1,2,3,4], engine='python')
74 |
75 | dataset = data_full.values
76 | classes = dataset[:, 2]
77 | data=data_full['Values'].values.tolist()
78 | combined = data
79 | combined_full = data_full.values.tolist()
80 |
81 | encoder = LabelEncoder()
82 | encoder.fit(classes)
83 | encoded_Y = encoder.transform(classes)
84 | y = Helper.ordinal_encoder(classes)
85 |
86 |
87 |
88 | x_train, x_test, y_train, y_test = train_test_split(combined_full, y, test_size=0.2, train_size=0.8, random_state=seed_value, stratify=y)
89 | x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, train_size=0.75, random_state=seed_value, stratify=y_train)
90 | test_block_list = []
91 | train_block_list = []
92 | for x in x_test:
93 | test_list.append(x[0])
94 | test_block_list.append(x[1])
95 | x_test = np.array(test_block_list)
96 | for x in x_train:
97 | train_block_list.append(x[1])
98 | x_train = train_block_list
99 |
100 | return combined,y, x_train, x_val, x_test, y_train, y_val, y_test
101 |
102 |
103 |
104 | def word_splitter(word, docText):
105 | splitted = re.sub('([A-Z][a-z]+)', r' \1', re.sub('([A-Z]+)', r' \1', word)).split()
106 | for word in splitted:
107 | docText.append(word.lower())
108 |
109 |
110 |
111 |
112 | def tokenizer(text):
113 | newText = []
114 | for doc in text:
115 | docText = []
116 | #for word in str(doc).replace("['", "").replace("']", "").replace(",", "").replace("'", "").split(' '):
117 | for word in str(doc).replace("'", "").replace("[", "").replace("]", "").replace(",", "").replace('"', "").split(' '):
118 | docText.append(word)
119 |
120 | newText.append(docText)
121 | #print (newText)
122 | return newText
123 |
124 |
125 |
126 | def input_transform(words):
127 | model=Word2Vec.load(embedding_location)
128 | _, _,dictionaries=create_dictionaries(model,words)
129 | return dictionaries
130 |
131 |
132 |
133 |
134 |
135 |
136 | def create_dictionaries(model=None,
137 | combined=None):
138 |
139 | from keras.preprocessing import sequence
140 |
141 | if (combined is not None) and (model is not None):
142 | gensim_dict = Dictionary()
143 | gensim_dict.doc2bow(model.wv.vocab.keys(),
144 | allow_update=True)
145 | w2indx = {v: k+1 for k, v in gensim_dict.items()}
146 | w2vec = {word: model.wv[word] for word in w2indx.keys()}
147 |
148 | def parse_dataset(combined):
149 | data=[]
150 | for sentence in combined:
151 | new_txt = []
152 | for word in sentence:
153 | try:
154 | new_txt.append(w2indx[word])
155 | except:
156 | new_txt.append(0)
157 | data.append(new_txt)
158 | return data
159 | combined=parse_dataset(combined)
160 | combined= sequence.pad_sequences(combined, maxlen=maxlen)
161 | return w2indx, w2vec,combined
162 |
163 |
164 | def word2vec_train(combined):
165 | model = Word2Vec(size=vocab_dim, #dimension of word embedding vectors
166 | min_count=n_exposures,
167 | window=window_size,
168 | workers=cpu_count, sg=1,
169 | iter=embedding_iterations)
170 | model.build_vocab(combined)
171 | model.save(embedding_location)
172 | index_dict, word_vectors,combined = create_dictionaries(model=model,combined=combined)
173 | return index_dict, word_vectors,combined
174 |
175 |
176 | def get_data(index_dict,word_vectors,combined):
177 |
178 | n_symbols = len(index_dict) + 1
179 | embedding_weights = np.zeros((n_symbols, vocab_dim))
180 | for word, index in index_dict.items():
181 | embedding_weights[index, :] = word_vectors[word]
182 |
183 |
184 | return n_symbols,embedding_weights
185 |
186 |
187 | def train_lstm(n_symbols,embedding_weights,x_train,y_train,x_test,y_test, x_val, y_val):
188 |
189 | tf.set_random_seed(seed_value)
190 |
191 |
192 |
193 |
194 | model = Sequential()
195 | model.add(Embedding(output_dim=vocab_dim,
196 | input_dim=n_symbols,
197 | mask_zero=True,
198 | weights=[embedding_weights],
199 | input_length=input_length))
200 | model.add(Bidirectional(LSTM(output_dim=128,activation='sigmoid')))
201 | model.add(Dropout(0.2))
202 | model.add(Dense(5, activation='sigmoid'))
203 |
204 |
205 | print ('Compiling the Model..')
206 | model.compile(loss='binary_crossentropy',
207 | optimizer='adam',metrics=['accuracy'])
208 |
209 | print ("Train...")
210 | metrics = Metrics.Metrics()
211 | history = model.fit(x_train, y_train, batch_size=batch_size, epochs=n_epoch,verbose=1, validation_data=(x_val, y_val))
212 |
213 | base_min = optimal_epoch(history)
214 | print ("Evaluate...")
215 | score = model.evaluate(x_test, y_test,
216 | batch_size=batch_size)
217 | yaml_string = model.to_yaml()
218 | with open(model_location +'.yml', 'w') as outfile:
219 | outfile.write( yaml.dump(yaml_string, default_flow_style=True) )
220 | model.save_weights(model_location + sys.argv[1] + '.h5')
221 | np.set_printoptions(threshold=sys.maxsize)
222 |
223 | prob_predicted = model.predict(x_test, verbose=1)
224 | label_predicted = Helper.predict_prob_encoder(prob_predicted)
225 | num_y_test = Helper.pd_encoder(y_test)
226 | num_y_predicted = Helper.pd_encoder(label_predicted)
227 |
228 | val_accuracy = accuracy_score(y_test, label_predicted)
229 | print ('Accuracy: ', val_accuracy)
230 | Helper.class_accuracy(y_test, label_predicted)
231 |
232 | with open(model_location + '_target.txt', 'wt') as f:
233 | for y in y_test:
234 | f.write(str(y)+ '\n')
235 | with open(model_location + '_predicted.txt', 'wt') as f:
236 | for y in label_predicted:
237 | f.write(str(y)+ '\n')
238 | return [val_accuracy]
239 |
240 |
241 |
242 |
243 |
244 | def get_FP_FN(label_predicted, label_target):
245 | FP_id_list = []
246 | FN_id_list = []
247 | for i in range(0, len(label_predicted)):
248 | if int(label_predicted[i]) == 1 and int(label_target[i]) == 0:
249 | FP_id_list.append(i)
250 | elif int(label_predicted[i]) == 0 and int(label_target[i]) == 1:
251 | FN_id_list.append(i)
252 | #print (FP_id_list)
253 | #print (FN_id_list)
254 | with open('model_block' +'/labels/list/lstm_FP_' + sys.argv[1] + '.txt', 'wt') as f:
255 | for fp in FP_id_list:
256 | f.write(str(test_list[int(fp)])+ '\n')
257 | with open('model_block' +'/labels/list/lstm_FN_' + sys.argv[1] + '.txt', 'wt') as f:
258 | for fn in FN_id_list:
259 | f.write(str(test_list[int(fn)])+ '\n')
260 |
261 |
262 | def train():
263 | os.environ['PYTHONHASHSEED']=str(seed_value)
264 | np.random.seed(seed_value)
265 | rn.seed(seed_value)
266 | print ('Loading Data...')
267 | combined,y,x_train, x_val, x_test, y_train, y_val, y_test=loadfile()
268 | print ('Tokenizing...')
269 | combined = tokenizer(combined)
270 | x_train = tokenizer (x_train)
271 | x_test = tokenizer (x_test)
272 | x_val = tokenizer (x_val)
273 | print ('Training a Word2vec model...')
274 | index_dict, word_vectors,combined=word2vec_train(combined)
275 | x_train = input_transform(x_train)
276 | x_test = input_transform(x_test)
277 | x_val = input_transform(x_val)
278 | print ('Setting up Arrays for Keras Embedding Layer...')
279 | n_symbols,embedding_weights=get_data(index_dict, word_vectors,combined)
280 | #print (x_train.shape,y_train.shape)
281 |     result = train_lstm(n_symbols, embedding_weights, x_train, y_train, x_test, y_test, x_val, y_val)
282 | return result
283 |
284 |
285 | def pipeline_train(iterations):
286 | seed_and_result = {}
287 | if iterations == 1:
288 | train()
289 | else:
290 | for i in range(0, iterations):
291 | print('Iteration: ', i)
292 | global seed_value
293 | result = train()
294 | seed_and_result[seed_value] = result
295 | seed_value = seed_value + seed_window
296 | i = i + 1
297 | return seed_and_result
298 |
299 | def eval_metric(model, history, metric_name):
300 | metric = history.history[metric_name]
301 | val_metric = history.history['val_' + metric_name]
302 | e = range(1, n_epoch + 1)
303 | plt.plot(e, metric, 'bo', label='Train ' + metric_name)
304 | plt.plot(e, val_metric, 'b', label='Validation ' + metric_name)
305 | plt.xlabel('Epoch number')
306 | plt.ylabel(metric_name)
307 | plt.title('Comparing training and validation ' + metric_name + ' for ' + model.name)
308 | plt.legend()
309 | plt.show()
310 |
311 |
312 | def optimal_epoch(model_hist):
313 | min_epoch = np.argmin(model_hist.history['val_loss']) + 1
314 | print("Minimum validation loss reached in epoch {}".format(min_epoch))
315 | return min_epoch
316 |
317 |
318 |
319 |
320 | if __name__=='__main__':
321 | result_dict = pipeline_train(n_iterations)
322 | print (sys.argv[1])
323 |
--------------------------------------------------------------------------------
/src/Baselines/DeepLV/block_processing/block_processing.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import multiprocessing
3 | import numpy as np
4 | import pandas as pd
5 | import csv
6 | import re
7 |
8 | block_set = {"DoStatement", "WhileStatement", "SynchronizedStatement", "IfStatement", "SwitchStatement", "TryStatement", "EnhancedForStatement", "ForStatement", "MethodDeclaration", "CatchClause", "Block" , "SwitchCase"}
9 | syntactic_filter_set = {"Block", "SimpleName", "SimpleType", "QualifiedName", "ParameterizedType", "PrimitiveType", "SingleVariableDeclaration", "ArrayType", "TypeLiteral"}
10 | block_dict = {}
11 | target_dict = {}
12 | methods_dict = {}
13 | methods_lines = {}
14 | target_dict_logged = {}
15 | level_dict_logged = {}
16 | message_dict_logged = {}
17 | target_dict_nonlogged = {}
18 |
19 |
20 | def read_logs(filename):
21 | f = open('original_logs/logs-' + filename+ '.txt')
22 | lines = f.readlines()
23 | f.close()
24 | return lines
25 |
26 |
27 | def get_classname(method):
28 | fullpath = method.split('.')
29 | class_name = fullpath[-3] + '.' + fullpath[-2]+'.java'
30 | return class_name
31 |
32 |
33 | def read_AST_file(filename):
34 | f = open('AST/AST-'+filename+'.txt')
35 | lines = f.readlines()
36 | f.close()
37 |
38 | return lines
39 |
40 |
41 | def parse_ASTlines(ASTlines):
42 | lines = []
43 | #parse ASTlines by regex
44 | for astline in ASTlines:
45 |
46 | astType = re.findall(r'([^<]+)', astline)[0]
47 | location = re.findall(r'([^<]+)', astline)[0]
48 | begin = re.findall(r'([^<]+)', astline)[0]
49 | end = re.findall(r'([^<]+)', astline)[0]
50 | #content = re.findall(r'([^<]+)', astline)[0]
51 | content = re.findall(r'(.*?)', astline)[0]
52 | lines.append([astType, location, begin, end, content])
53 | #for every AST line, 0: type, 1: location, 2: beginline, 3: endline, 4: content
54 | return lines
55 |
56 |
57 |
58 | def parse_Loglines(Loglines):
59 | loglines = []
60 | #parse ASTlines by regex
61 | for logline in Loglines:
62 | callsite = re.findall(r'([^<]+)', logline)[0]
63 | level = re.findall(r'([^<]+)', logline)[0]
64 | line = re.findall(r'([^<]+)', logline)[0]
65 | if(re.findall(r'([^<]+)', logline)):
66 | content = re.findall(r'([^<]+)', logline)[0]
67 | loglines.append([level, line, content, callsite])
68 | else:
69 | loglines.append([level, line, 'No message', callsite])
70 | #0: level, 1: line number, 2: content, 3: callsite
71 |
72 | return loglines
73 |
74 |
75 | def if_log_line(ast, loglines):
76 | for log in loglines:
77 | #print (get_classname(log[3]), get_classname(astlist[1]))
78 | #print (log[1], astlist[2])
79 |         if(get_classname(log[3]) == get_classname(ast[1]) and int(log[1]) == int(ast[2])):
80 | #print ('1')
81 | return True
82 | return False
83 |
84 |
85 |
86 | def if_diff_levels(value_list):
87 | if len(value_list) > 1:
88 | for i in range (0, len(value_list)-1):
89 | for j in range (i+1, len(value_list)):
90 | if value_list[i][0] != value_list[j][0]:
91 | return 2
92 | else:
93 | return 0
94 | return 1
95 |
96 | def not_level_guard(string):
97 | if "enabled" in string:
98 | if "info" in string or "debug" in string or "trace" in string:
99 | return False
100 | return True
101 |
102 | #0: <= 1 log in the block, 1: multiple logs at the same level, 2: multiple logs at different levels
103 |
104 |
105 | def get_level_id(log, current_level):
106 | log_level = re.findall(r'([^<]+)', log)[0]
107 | message = '-'
108 | if(re.findall(r'([^<]+)', log)):
109 | message = re.findall(r'([^<]+)', log)[0]
110 | if log_level == 'trace':
111 | level_id = 0
112 | elif log_level == 'debug':
113 | level_id = 1
114 | elif log_level == 'info':
115 | level_id = 2
116 | elif log_level == 'warn':
117 | level_id = 3
118 | elif log_level == 'error':
119 | level_id = 4
120 | else:
121 | level_id = 5
122 | if level_id > current_level:
123 | return level_id, message
124 | else:
125 | return current_level, message
126 |
127 |
128 | def get_level_name(level_id):
129 | if level_id == 0:
130 | return "trace"
131 | elif level_id == 1:
132 | return "debug"
133 | elif level_id == 2:
134 | return "info"
135 | elif level_id == 3:
136 | return "warn"
137 | elif level_id == 4:
138 | return "error"
139 | elif level_id == 5:
140 | return "fatal"
141 | else:
142 | return "unknown"
143 |
144 | def label_blocks(target_dict, loglines):
145 | for key, value in target_dict.items():
146 | logged_flag = False
147 | #level id: 0 - trace, 1 - debug, 2 - info, 3 - warn, 4 - error, 5 - fatal
148 | level_id = 0
149 | message = '-'
150 | for log in loglines:
151 | log_class = get_classname(re.findall(r'([^<]+)', log)[0])
152 | log_line = int(re.findall(r'([^<]+)', log)[0])
153 | key_class = re.findall(r'([^<]+)', key)[0]
154 | key_start = int(re.findall(r'([^<]+)', key)[0])
155 | key_end = int(re.findall(r'([^<]+)', key)[0])
156 | if log_line >= key_start and log_line <= key_end and log_class == key_class:
157 | level_id, message = get_level_id(log, level_id)
158 | logged_flag = True
159 | if logged_flag == True:
160 | target_dict_logged[key] = value
161 | level_dict_logged[key]=get_level_name(level_id)
162 | message_dict_logged[key]= message
163 | else:
164 | target_dict_nonlogged[key] = value
165 |
166 |
167 | def get_methods_dict (node): # set the startline of the first node of a method as it's startline
168 | if node[1] in methods_dict:
169 | if int(methods_dict[node[1]]) > int(node[2]):
170 | methods_dict[node[1]] = node[2]
171 | else:
172 | methods_dict[node[1]] = node[2]
173 |
174 |
175 | def get_methods_lines (methods_dict):
176 | for key, value in methods_dict.items():
177 | class_name = get_classname(key)
178 | if class_name in methods_lines:
179 | methods_lines[class_name].append(int(value))
180 | else:
181 | methods_lines[class_name] = []
182 |
183 | for key, value in methods_lines.items():
184 | value.sort()
185 | #print (key)
186 | #print (value)
187 |
188 |
189 | def get_method_start_line_for_AST (class_name, start_line):
190 | method_start_line = int(start_line)
191 | memory_line = 1
192 | if methods_lines[class_name]:
193 | for v in methods_lines[class_name]:
194 | if int(v) >= int(start_line):
195 | #print (memory_line)
196 | return int(memory_line)
197 | else:
198 | memory_line = int(v)
199 | else:
200 | return int(method_start_line)
201 |
202 |
203 | if __name__=='__main__':
204 |
205 | ASTlines = read_AST_file(sys.argv[1])
206 | loglines = read_logs(sys.argv[1])
207 |
208 | ASTlists = parse_ASTlines(ASTlines)
209 | loglists = parse_Loglines(loglines)
210 |
211 | for astlist in ASTlists:
212 | get_methods_dict(astlist)
213 | #filter level-guard if statements
214 | ast_content = astlist[4].lower()[0:40]
215 | #for every AST line, 0: type, 1: location, 2: beginline, 3: endline, 4: content
216 | if astlist[0] in block_set and not_level_guard(ast_content):
217 | if astlist[1] in block_dict:
218 | if (astlist[2]) not in block_dict[astlist[1]]:
219 | block_dict[astlist[1]].append(int(astlist[2]))
220 | if (astlist[3]) not in block_dict[astlist[1]]:
221 | block_dict[astlist[1]].append(int(astlist[3]))
222 |
223 | else:
224 | block_dict[astlist[1]] = []
225 | get_methods_lines(methods_dict)
226 |
227 | for key, value in block_dict.items():
228 | value.sort()
229 |
230 |
231 |
232 |
233 | for key, value in block_dict.items():
234 | for i in range (0, len(value)-1):
235 | dict_key = '' + get_classname(key) + '' + '' + str(value[i]) + '' + '' + str((value[i+1])-1) + ''
236 | target_dict[dict_key] = []
237 |
238 |
239 | m_start_line = 0
240 | for key, value in target_dict.items():
241 | class_name = re.findall(r'([^<]+)', key)[0]
242 | start_line = re.findall(r'([^<]+)', key)[0]
243 | m_start_line = get_method_start_line_for_AST(class_name, start_line)
244 | if m_start_line is not None:
245 | if int(m_start_line) == 1:
246 | m_start_line = start_line
247 | else:
248 | m_start_line = start_line
249 |
250 | end_line = re.findall(r'([^<]+)', key)[0]
251 | #print (key)
252 | for astlist in ASTlists:
253 | if astlist[0] not in syntactic_filter_set and int(astlist[2]) <= int(end_line) and int(astlist[2]) >= int(m_start_line) and class_name == get_classname(astlist[1]):
254 | if(if_log_line(astlist, loglists)==False):
255 | value.append(astlist[0])
256 |
257 |
258 |
259 | label_blocks(target_dict, loglines)
260 | result_list_logged = []
261 | for key, value in target_dict_logged.items():
262 | result_list_logged.append([key, value, level_dict_logged[key], message_dict_logged[key]])
263 |
264 | result_list_nonlogged = []
265 | for key, value in target_dict_nonlogged.items():
266 | result_list_nonlogged.append([key, value])
267 |
268 |
269 |
270 |
271 | header_logged = ['Key', 'Values', 'Level', 'Message']
272 | logged_dict_to_write=pd.DataFrame(columns=header_logged,data=result_list_logged)
273 | logged_dict_to_write.to_csv('blocks/logged_syn_' + sys.argv[1] + '.csv')
274 |
275 |
276 |
--------------------------------------------------------------------------------
/src/Baselines/DeepLV/deepLV_cleaner.py:
--------------------------------------------------------------------------------
1 | import pandas
2 | import numpy as np
3 | import os
4 | import javalang
5 | from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
6 | from rouge import Rouge
7 | import re
8 | import numpy as np
9 | from sklearn import metrics
10 | import pandas as pd
11 |
12 | def level_acc(classification_pred, classification_label) -> float:
13 | level_map = {'trace':0., 'debug':1., 'info':2., 'warn':3., 'error':4.}
14 | new_pred = []
15 | new_label = []
16 | length = len(classification_pred)
17 | for idx in range(length):
18 | predict = classification_pred[idx]
19 | label = classification_label[idx]
20 | if predict in level_map.keys() and label in level_map.keys():
21 | pred_sum = level_map[predict]
22 | label_sum = level_map[label]
23 | new_pred.append(pred_sum)
24 | new_label.append(label_sum)
25 | matches = sum(x == y for x, y in zip(new_pred, new_label))
26 | total_elements = len(new_pred)
27 | accuracy = matches / total_elements
28 | return accuracy
29 |
30 | def query_level(level: float) -> str:
31 | if level == 1.:
32 | return 'trace'
33 | elif level == 2.:
34 | return 'debug'
35 | elif level == 3.:
36 | return 'info'
37 | elif level == 4.:
38 | return 'warn'
39 | elif level == 5.:
40 | return 'error'
41 | else:
42 | return ''
43 |
44 | def aod(classification_pred, classification_label) -> float:
45 | level_map = {'trace':1., 'debug':2., 'info':3., 'warn':4., 'error':5.}
46 | max_distance = {'trace':4., 'debug':3., 'info':2., 'warn':3., 'error':4.}
47 |
48 | distance_sum = 0.
49 | noise = 0.
50 | length = len(classification_pred)
51 |
52 | for idx in range(length):
53 | try:
54 | predict = classification_pred[idx]
55 | label = classification_label[idx]
56 | pred_sum = level_map[predict]
57 | label_sum = level_map[label]
58 | level = query_level(label_sum)
59 | _distance = abs(label_sum - pred_sum)
60 | distance_sum = distance_sum + (1 - _distance / max_distance[level])
61 | except Exception as e:
62 | noise = noise+1
63 | aod = distance_sum / (length-noise)
64 | return aod
65 |
66 | def extract_quoted_strings(s):
67 | quoted_strings = re.findall(r'"([^"]*)"', s)
68 | " ".join(quoted_strings)
69 | remaining = re.sub(r'"[^"]*"', '', s)
70 | char_to_remove = ['+', ',']
71 | for char in char_to_remove:
72 | remaining = remaining.replace(char, '')
73 | var_list_origin = remaining.split(' ')
74 | var_list = [item for item in var_list_origin if (not item == ' ')]
75 | var_list = [item for item in var_list if item]
76 | return quoted_strings, var_list
77 |
78 | def extract_outer_brackets(s):
79 | stack = []
80 | result = []
81 |
82 | for m in re.finditer(r"[()]", s):
83 | char, pos = m.group(0), m.start(0)
84 | if char == "(":
85 | stack.append(pos)
86 | elif char == ")":
87 | if len(stack) == 1:
88 | result.append(s[stack.pop() + 1:pos])
89 | else:
90 | stack.pop()
91 | return result
92 |
93 | def extract_level(statement):
94 | parts = statement.split('.')
95 | for part in parts:
96 | if '(' in part:
97 | level = part.split('(')[0]
98 | return level.strip()
99 | return ''
100 |
101 |
102 |
103 | def extract_text(statement):
104 | bracket_contents = extract_outer_brackets(statement)
105 | if bracket_contents: # Check if the list is not empty
106 | # Pass the first item (contents of the first set of brackets) to extract_quoted_strings
107 | quoted_strings, remaining = extract_quoted_strings(bracket_contents[0])
108 | quoted_strings_combined = ' '.join(quoted_strings)
109 | return quoted_strings_combined
110 | else:
111 | return '' # Return an empty string if no brackets are found
112 |
113 | df = pd.read_csv('logbench.csv')
114 | df = df[df['Statement'].apply(lambda x: len(x.splitlines()) == 1)]
115 | df['level'] = df['Statement'].apply(extract_level)
116 | df['text'] = df['Statement'].apply(extract_text)
117 |
118 |
119 | df.to_csv('logbench_cleaned.csv', index=False)
--------------------------------------------------------------------------------
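For readers unfamiliar with the two metrics implemented in `deepLV_cleaner.py` above, the following is a minimal sketch (not part of the repository) of what `level_acc` and `aod` return on a toy prediction/label pair; the example values are made up purely for illustration and assume the two functions above are in scope.

```python
# Toy example (hypothetical data) for the metrics defined in deepLV_cleaner.py above.
preds  = ['info', 'warn',  'error', 'debug']
labels = ['info', 'error', 'error', 'trace']

# level_acc: exact-match accuracy over the five ordered levels
# -> 2 exact matches out of 4 = 0.5
print(level_acc(preds, labels))

# aod: average ordinal distance; each item scores 1 - |pred - label| / max_distance[label],
# so near misses (warn vs. error) are penalised less than distant ones.
# -> (1 + (1 - 1/4) + 1 + (1 - 1/4)) / 4 = 0.875
print(aod(preds, labels))
```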
/src/Baselines/Incoder/incoder.py:
--------------------------------------------------------------------------------
1 | import os
2 | import javalang
3 | import re
4 | from typing import List
5 | import torch
6 | import tokenizers
7 | from transformers import AutoModelForCausalLM, AutoTokenizer
8 | import json
9 |
10 | path = ''
11 | ground_truth_folder = ''
12 |
13 | def insert_text_to_java_file(file_name, line_number):
14 | with open(file_name, 'r') as file:
15 | lines = file.readlines()
16 |
17 | if line_number > len(lines):
18 | print("out of range")
19 |         return
20 |     lines[line_number - 1] = lines[line_number - 1].rstrip() + '<insert>' + '\n'  # mark the infill location with a sentinel (assumed '<insert>'; must match the split in docstring_to_code)
21 |
22 | with open(file_name, 'w') as file:
23 | file.writelines(lines)
24 |
25 |
26 | def extract_numbers(s):
27 | return re.findall(r'\d+', s)
28 |
29 |
30 | def parse_directory(dir_path, ground_truth_folder):
31 | for filename in os.listdir(dir_path):
32 | file_path = os.path.join(dir_path, filename)
33 | if os.path.isfile(file_path) and file_path.endswith('.java'):
34 | ground_truth_path = ground_truth_folder + file_path.split('/')[-1][:-5] + '_config.txt'
35 | try:
36 | with open(ground_truth_path) as f:
37 | lines = f.readlines()
38 | if len(lines) >= 1:
39 | line_number = int(extract_numbers(lines[0].strip(' ')[:-1])[0])
40 | insert_text_to_java_file(file_path, line_number)
41 | except FileNotFoundError:
42 | pass
43 | elif os.path.isdir(file_path):
44 | parse_directory(file_path, ground_truth_folder)
45 |
46 | parse_directory(path,ground_truth_folder)
47 | # Data preprocessing done.
48 |
49 |
50 | tokenizers_version = tuple(int(n) for n in tokenizers.__version__.split('.'))
51 | if tokenizers_version < (0, 12, 1):
52 | print("warning: Your tokenizers version looks old and you will likely have formatting issues. We recommend installing tokenizers >= 0.12.1")
53 |
54 | # set BIG_MODEL to use the 6.7B parameter model
55 | BIG_MODEL = True
56 |
57 | # use a GPU
58 | CUDA = True
59 |
60 | # print intermediate outputs of infilling
61 | VERBOSE = False
62 |
63 | if BIG_MODEL:
64 | model_name = "facebook/incoder-6B"
65 | if CUDA:
66 | kwargs = dict(
67 | revision="float16",
68 | torch_dtype=torch.float16,
69 | low_cpu_mem_usage=False,
70 | )
71 | else:
72 | kwargs = dict(
73 | low_cpu_mem_usage=False,
74 | )
75 | else:
76 | model_name = "facebook/incoder-1B"
77 | kwargs = {}
78 |
79 | print("loading model")
80 | model = AutoModelForCausalLM.from_pretrained(model_name, **kwargs)
81 | print("loading tokenizer")
82 | tokenizer = AutoTokenizer.from_pretrained(model_name)
83 | print("loading complete")
84 |
85 | if CUDA:
86 | # if you plan to fine-tune the model, you should not use half precision.
87 | model = model.half().cuda()
88 |
89 | # signals the start of a document
90 | BOS = "<|endoftext|>"
91 | # signals the end of a generated infill
92 | EOM = "<|endofmask|>"
93 |
94 | def make_sentinel(i):
95 | # signals (1) a location to insert an infill and (2) the start of the infill generation
96 | return f"<|mask:{i}|>"
97 |
98 | def generate(input: str, max_to_generate: int=128, temperature: float=0.2):
99 | """
100 | Do standard left-to-right completion of the prefix `input` by sampling from the model
101 | """
102 | input_ids = tokenizer(input, return_tensors="pt").input_ids
103 | if CUDA:
104 | input_ids = input_ids.cuda()
105 | max_length = max_to_generate + input_ids.flatten().size(0)
106 | if max_length > 2048:
107 | print("warning: max_length {} is greater than the context window {}".format(max_length, 2048))
108 | with torch.no_grad():
109 | output = model.generate(input_ids=input_ids, do_sample=True, top_p=0.95, temperature=temperature, max_length=max_length)
110 | # pass clean_up_tokenization_spaces=False to avoid removing spaces before punctuation, e.g. "from ." -> "from."
111 | detok_hypo_str = tokenizer.decode(output.flatten(), clean_up_tokenization_spaces=False)
112 | if detok_hypo_str.startswith(BOS):
113 | detok_hypo_str = detok_hypo_str[len(BOS):]
114 | return detok_hypo_str
115 |
116 | def infill(parts: List[str], max_to_generate: int=50, temperature: float=0.2, extra_sentinel: bool=True, max_retries: int=1):
117 | """
118 | Generate infills to complete a partial document, e.g.
119 | [A C E] -> [A B C D E], where B and D are infills that have been generated.
120 |
121 | parts: List[str]. list of parts of the document. One string will be
122 | inserted in between each element, i.e. infilling N-1 locations for a list
123 | of length N.
124 | max_to_generate: int. maximum number of tokens to generate. Keep in mind
125 | that the model context size is 2048.
126 | temperature: float. temperature parameter for sampling.
127 | extra_sentinel: bool. we recommend setting this to True, as it makes it
128 | easier for the model to end generated infills. See the footnote in
129 | section 2.2 of our paper for details.
130 | max_retries: int. if > 1, use rejection sampling to keep sampling infills until
131 | all infills sample a completion token.
132 |
133 | returns a dictionary containing the following:
134 | text: str, the completed document (with infills inserted)
135 | parts: List[str], length N. Same as passed to the method
136 | infills: List[str], length N-1. The list of infills generated
137 | retries_attempted: number of retries used (if max_retries > 1)
138 | """
139 | assert isinstance(parts, list)
140 | retries_attempted = 0
141 | done = False
142 |
143 | while (not done) and (retries_attempted < max_retries):
144 | retries_attempted += 1
145 |
146 | if VERBOSE:
147 | print(f"retry {retries_attempted}")
148 |
149 | ## (1) build the prompt
150 | if len(parts) == 1:
151 | prompt = parts[0]
152 | else:
153 | prompt = ""
154 | # encode parts separated by sentinel
155 | for sentinel_ix, part in enumerate(parts):
156 | prompt += part
157 | if extra_sentinel or (sentinel_ix < len(parts) - 1):
158 | prompt += make_sentinel(sentinel_ix)
159 |
160 | infills = []
161 | complete = []
162 |
163 | done = True
164 |
165 | ## (2) generate infills
166 | for sentinel_ix, part in enumerate(parts[:-1]):
167 | complete.append(part)
168 | prompt += make_sentinel(sentinel_ix)
169 | # TODO: this is inefficient as it requires re-encoding prefixes repeatedly
170 | completion = generate(prompt, max_to_generate, temperature)
171 | completion = completion[len(prompt):]
172 | if EOM not in completion:
173 | if VERBOSE:
174 | print(f"warning: {EOM} not found")
175 | completion += EOM
176 | done = False
177 | completion = completion[:completion.index(EOM) + len(EOM)]
178 | infilled = completion[:-len(EOM)]
179 | infills.append(infilled)
180 | complete.append(infilled)
181 | prompt += completion
182 | complete.append(parts[-1])
183 | text = ''.join(complete)
184 |
185 | if VERBOSE:
186 | print("generated text:")
187 | print(prompt)
188 | print()
189 | print("parts:")
190 | print(parts)
191 | print()
192 | print("infills:")
193 | print(infills)
194 | print()
195 | print("restitched text:")
196 | print(text)
197 | print()
198 |
199 | return {
200 | 'text': text, # str, the completed document (with infills inserted)
201 | 'parts': parts, # List[str], length N. Same as passed to the method
202 | 'infills': infills, # List[str], length N-1. The list of infills generated
203 | 'retries_attempted': retries_attempted, # number of retries used (if max_retries > 1)
204 | }
205 |
206 | def docstring_to_code(code, max_to_generate=50, temperature=0.2):
207 |
208 |     parts = code.split("<insert>")  # split at the sentinel written by insert_text_to_java_file (assumed '<insert>')
209 | result = infill(parts, max_to_generate=max_to_generate, temperature=temperature)
210 | return result
211 |
212 | input_path = path
213 | output_path= ''
214 |
215 | if not os.path.exists(output_path):
216 | os.makedirs(output_path)
217 |
218 | for filename in os.listdir(input_path):
219 | if filename.endswith(".java"):
220 | print(filename)
221 | input_file_path = os.path.join(input_path, filename)
222 |
223 | with open(input_file_path, 'r', encoding='utf-8') as file:
224 | file_content = file.read()
225 | example = f"'''\\\n{file_content}\n'''"
226 |
227 | processed_content = docstring_to_code(example)
228 |
229 | output_file_path = os.path.join(output_path, filename)
230 | with open(output_file_path, 'w', encoding='utf-8') as output_file:
231 | for item in processed_content['infills']:
232 | output_file.write(f"{item}\n")
233 |
--------------------------------------------------------------------------------
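As a rough illustration of how the pieces in `incoder.py` above fit together, the sketch below marks one infill location in a made-up Java snippet with the `<insert>` sentinel assumed by `insert_text_to_java_file` and `docstring_to_code`, then hands the two resulting parts to `infill`. It presumes the model and tokenizer have already been loaded as in the script; the snippet and the sentinel string are illustrative assumptions, not repository code.

```python
# Hypothetical usage sketch; `infill` is the function defined in incoder.py above.
java_with_marker = (
    "try {\n"
    "    conn.close();\n"
    "} catch (IOException e) {\n"
    "    <insert>\n"
    "}\n"
)
parts = java_with_marker.split("<insert>")      # 2 parts -> 1 infill location
result = infill(parts, max_to_generate=50, temperature=0.2)
print(result['infills'][0])                     # the generated statement, e.g. a log call
print(result['text'])                           # the snippet with the infill spliced back in
```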
/src/Baselines/LoGenText-Plus/README.md:
--------------------------------------------------------------------------------
1 | # LoGenText-Plus
2 |
3 | The implementation of "LoGenText-Plus: Improving Neural Machine Translation-based Logging Texts Generation with Syntactic Templates"
4 |
5 | > This code and dataset are based on [Context-Aware Model on Fairseq](https://github.com/libeineu/Context-Aware) and [LoGenText](https://github.com/conf-202x/experimental-result).
6 |
7 | ## Requirements and Installation
8 |
9 | * PyTorch >= 1.5.1
10 | * Python version >= 3.6
11 |
12 | 1. `conda create --name <env> --file requirements.txt`
13 |
14 | ## Stage 1: template generation
15 |
16 |
17 | Note: `<root>` is the path to the replication package.
18 |
19 | ### Train and inference for templates
20 |
21 | > 1. Run the following command to start the pre-training:
22 | ```
23 | cd <root>/code/template-gen/pre-train
24 | bash runs/pre-train.sh
25 | ```
26 |
27 |
28 | > 2. Run the following command to train a basic model:
29 | ```
30 | cd <root>/code/template-gen/basic-train
31 | bash runs/basic-train.sh
32 | ```
33 | `<project>` is the project name in lowercase, which can be activemq, ambari, etc.
34 |
35 | > 3. Run the following command to train and generate the templates for a certain `<project>`:
36 | ```
37 | cd <root>/code/template-gen/ast-temp
38 | bash runs/temp-gen.sh
39 | ```
40 | `<project>` should be the same as the project in step 2, and the generated templates can be found in `saved_checkpoints/pre-ast-templete/<project>`.
41 |
42 |
43 | ## Stage 2: template-based logging text generation
44 |
45 | Note: `<root>` is the path to the replication package.
46 |
47 | ### Train and inference for logging texts
48 |
49 | > 1. Run the following command to start the pre-training:
50 | ```
51 | cd <root>/code/logging-gen/pre-train
52 | bash runs/pre-train.sh
53 | ```
54 |
55 | > 2. Run the following command to train a basic model:
56 | ```
57 | cd <root>/code/logging-gen/basic-train
58 | bash runs/basic-train.sh
59 | ```
60 | `<project>` is the project name in lowercase, which can be activemq, ambari, etc.
61 |
62 | > 3. Run the following command to train and generate the logging texts for a certain `<project>`:
63 | ```
64 | cd <root>/code/logging-gen/ast-temp
65 | bash runs/log-gen.sh
66 | ```
67 | `<project>` should be the same as the project in step 2, and the generated logging texts can be found in `translations/1/<project>`.
68 |
69 | ## Results
70 |
71 | The results can be found in the `results` folder, which is organized by project.
72 |
73 | ## Data
74 |
75 | The dataset can be found in the `dataset` folder, which is organized by project. It has the following structure:
76 | ```
77 | dataset
78 | ├── <project>
79 | │ ├── dev.code.1.templete
80 | │ ├── dev.log
81 | │ ├── dev.log.1.templete
82 | │ ├── dev.pre-ast
83 | │ ├── test.code.1.templete
84 | │ ├── test.code.gen.ast.similar.1.templete
85 | │ ├── test.log
86 | │ ├── test.log.1.templete
87 | │ ├── test.pre-ast
88 | │ ├── train.code.1.templete
89 | │ ├── train.log
90 | │ ├── train.log.1.templete
91 | │ └── train.pre-ast
92 | ```
93 | - `<project>` is one of the studied projects, such as `activemq`.
94 | - `train/dev/test.log` are the files containing the extracted `logging texts` (the target sequences).
95 | - `train/dev/test.pre-ast` are the files containing the `ASTs` context.
96 | - `train/dev/test.code.1.templete` are the files containing the `pre-log code + template from logging text in similar code`.
97 | - `train/dev/test.log.1.templete` are the files containing the template extracted from the `logging text`.
98 | - `test.code.gen.ast.similar.1.templete` is the file containing the `pre-log code + predicted template`.
--------------------------------------------------------------------------------
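Since the `dataset` folder described in the README above follows the usual line-aligned parallel-corpus layout of NMT toolkits (an assumption based on the file descriptions, not repository code), a minimal sketch of pairing source and target files for one project could look like this; the paths are hypothetical.

```python
from pathlib import Path

# Hypothetical paths; assumes the line-aligned parallel layout described above.
project_dir = Path('dataset/activemq')
code_lines = (project_dir / 'test.code.1.templete').read_text().splitlines()
log_lines = (project_dir / 'test.log').read_text().splitlines()

for code, log in zip(code_lines, log_lines):
    print('SOURCE:', code)
    print('TARGET:', log)
    print()
```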
/src/Baselines/LoGenText-Plus/requirements.txt:
--------------------------------------------------------------------------------
1 | # This file may be used to create an environment using:
2 | # $ conda create --name <env> --file <this file>
3 | # platform: linux-64
4 | _libgcc_mutex=0.1=main
5 | blas=1.0=mkl
6 | brotlipy=0.7.0=py38h27cfd23_1003
7 | ca-certificates=2022.4.26=h06a4308_0
8 | certifi=2022.5.18.1=py38h06a4308_0
9 | cffi=1.14.4=pypi_0
10 | charset-normalizer=2.0.4=pyhd3eb1b0_0
11 | click=7.1.2=pypi_0
12 | cryptography=37.0.1=py38h9ce1e76_0
13 | cudatoolkit=10.1.243=h6bb024c_0
14 | cycler=0.10.0=pypi_0
15 | freetype=2.10.4=h5ab3b9f_0
16 | idna=3.3=pyhd3eb1b0_0
17 | intel-openmp=2020.2=254
18 | joblib=1.0.0=pypi_0
19 | jpeg=9b=h024ee3a_2
20 | kiwisolver=1.3.1=pypi_0
21 | lcms2=2.11=h396b838_0
22 | ld_impl_linux-64=2.33.1=h53a641e_7
23 | libedit=3.1.20191231=h14c3975_1
24 | libffi=3.3=he6710b0_2
25 | libgcc-ng=9.1.0=hdf63c60_0
26 | libpng=1.6.37=hbc83047_0
27 | libprotobuf=3.19.1=h4ff587b_0
28 | libstdcxx-ng=9.1.0=hdf63c60_0
29 | libtiff=4.1.0=h2733197_1
30 | lz4-c=1.9.3=h2531618_0
31 | matplotlib=3.3.4=pypi_0
32 | mkl=2020.2=256
33 | mkl-service=2.3.0=py38he904b0f_0
34 | mkl_fft=1.2.0=py38h23d657b_0
35 | mkl_random=1.1.1=py38h0573a6f_0
36 | ncurses=6.2=he6710b0_1
37 | ninja=1.10.2=py38hff7bd54_0
38 | numpy=1.19.2=py38h54aff64_0
39 | numpy-base=1.19.2=py38hfa32c7d_0
40 | olefile=0.46=py_0
41 | openssl=1.1.1o=h7f8727e_0
42 | pillow=8.1.0=py38he98fc37_0
43 | pip=20.3.3=py38h06a4308_0
44 | portalocker=2.2.0=pypi_0
45 | protobuf=3.19.1=py38h295c915_0
46 | pycparser=2.20=pypi_0
47 | pyopenssl=22.0.0=pyhd3eb1b0_0
48 | pyparsing=2.4.7=pypi_0
49 | pysocks=1.7.1=py38h06a4308_0
50 | python=3.8.5=h7579374_1
51 | python-dateutil=2.8.1=pypi_0
52 | pytorch=1.5.1=py3.8_cuda10.1.243_cudnn7.6.3_0
53 | readline=8.1=h27cfd23_0
54 | regex=2020.11.13=pypi_0
55 | requests=2.27.1=pyhd3eb1b0_0
56 | sacrebleu=1.5.0=pypi_0
57 | sacremoses=0.0.43=pypi_0
58 | setuptools=52.0.0=py38h06a4308_0
59 | six=1.15.0=py38h06a4308_0
60 | sqlite=3.33.0=h62c20be_0
61 | subword-nmt=0.3.7=pypi_0
62 | tensorboardx=2.2=pyhd3eb1b0_0
63 | tk=8.6.10=hbc83047_0
64 | torchvision=0.6.1=py38_cu101
65 | tqdm=4.56.0=pypi_0
66 | urllib3=1.26.9=py38h06a4308_0
67 | wheel=0.36.2=pyhd3eb1b0_0
68 | xz=5.2.5=h7b6447c_0
69 | zlib=1.2.11=h7b6447c_3
70 | zstd=1.4.5=h9ceee32_0
71 |
--------------------------------------------------------------------------------
/src/Baselines/LoGenText-Plus/results/1/activemq/translation.context.test:
--------------------------------------------------------------------------------
1 | expiring connection to zookeeper vid
2 | unsubscribing next messages
3 | async dispose interrupted
4 | stopping async topic tasks
5 | error with selector vid
6 | network connection between vid and vid has been established
7 | unknown command vid
8 | vid failed to resetting to batch
9 | virtual consumer added vid for virtual destination vid
10 | shutdown of topic traffic generator failed
11 | store commit failed
12 | failed to create persistence adapter vid
13 | destination is full vid
14 | setting optimized out of vid to vid
15 | async read check was rejected from the executor
16 | tempdest vid
17 | redelivery of vid stopped
18 | exception on forwarding to non existent temp dest
19 | vid ignoring sub vid on vid from vid is no longer active
20 | reconnected to vid
21 | detected missing corrupt journal files dropped vid messages from the index in vid seconds
22 | attempting to acquire the exclusive lock to become the master broker
23 | broker plugin vid started
24 | checkpoint failed
25 | vid flushtodisk done vid ms vid
26 | failed to fire bridge for vid
27 | stopping broker vid
28 | sasl vid handshake complete
29 | error receiving message vid this exception is ignored
30 | sendreq vid
31 | endpoint vid will not receive any messages due to broker zero error vid
32 | xa resource manager vid
33 | do not know how to process activemq command vid
34 | sending to vid messages from vid to vid
35 | vid message sent to vid
36 | zero length partial vid
37 | message not found in sequence id index vid
38 | unknown datastruction vid
39 | message received since last read check resetting flag vid
40 | sendreq vid
41 | vid could not find the object rename for vid
42 | failed to unregister mbean vid
43 | periodic checkpoint failed
44 | error unsubscribing vid from vid vid
45 | creating producer vid message vid
46 | network bridge could not be registered in jmx vid
47 | slow kahadb access journal append took vid ms index
48 | waiting vid ms before attempting to reconnect
49 | prepare of vid failed because it was marked rollback only
50 | received an exception but connection is ignored vid
51 | could not correlate the connection vid
52 | vid ms before attempting to reconnect to vid
53 | failed to get durable subscription vid
54 | unable to read persisted selector cache it will be ignored
55 | get destinations returned empty list
56 | caught an exception trying to determine if there is no flag
57 | vid usage manager memory limit reached vid producers will be throttled to the rate at vid
58 | setting durable subscriber to vid
59 | work rejected vid
60 | reusing an active session vid
61 | thread does not hold the context lock on close of vid
62 | creating producer to vid
63 | vid elapsed time in second vid s
64 | recovery mode trying to reconnect to zero
65 | could not apply query parameters vid to vid
66 | producer vid with non persistent delivery
67 | failed to call after delivery
68 | failed to register mbean vid
69 | master lock retry sleep interrupted
70 | vid ms elapsed since last write check
71 | vid remove request on vid from vid vid matching sub vid
72 | vid attempting to acquire exclusive lease to become the master
73 | async start of vid
74 | vid no set batch from sequence id set vid
75 | connector removed with uri vid
76 | corrupt journal record unexpected exception on journal replay of location vid
77 | apache activemq vid vid vid
78 | auto transport newconnectionexecutor didn t cleanly
79 | assigned vid to consumer vid
80 | setting topic vid to vid
81 | no queue named vid
82 | could not connect to local uri vid vid
83 | closed socket vid
84 | locker keepalive resulted in
85 | failure reason
86 | notified failover transport vid of interruption completion
87 | failed to initialize local connection for the jmsconnector
88 | timeout waiting for echo service shutdown
89 | trace entry vid
90 | failed to remove consumer on connection vid
91 | xa transaction rollback vid
92 | bridge was disposed before the first vid
93 | interrupted while redelivery
94 | unsubscribing durable journal
95 | sending to vid messages to vid
96 | removing consumer vid
97 | attempting to acquire the exclusive lock to become the master broker
98 | not adding to dlq vid to vid
99 | trying to build a pooledconnectionfactory
100 | sampler interrupted
101 | vid received message vid
102 | failed to close connection vid
103 | failed to accept accept for vid
104 | rolled back vid messages from the index in vid seconds
105 | error occured while processing vid
106 | unexpected local exception vid
107 | vid end of vid with vid
108 | master lock retry sleep interrupted
109 | message not found in sequence id index vid
110 | failed to deliver remove command for destination vid
111 | vid removed from scheduler vid
112 | installing discarding dead letter queue broker plugin dropall vid dropall vid
113 | failed to create object name to unregister vid
114 | vid vid ms elapsed since last write check
115 | failed to send mqtt subscription vid
116 | connector vid started
117 | session vid has more work to do b c of unconsumed
118 | could not transfer the template file to journal transferfile vid
119 | exception occurred for client vid vid processing vid
120 | async error occurred vid
121 | executing sql vid
122 | msg vid id vid destinationname vid
123 | failed to unregister mbean vid
124 | forcing shutdown of executorservice vid
125 | failed to prepare xaresource vid
126 | the remote exception was vid
127 | committing user vid
128 | amqp header arrived invalid version vid
129 | message expired vid
130 | error on queueing the ack compactor
131 | failed to load vid
132 | failed to unregister mbean vid
133 | vid recovered prepared vid
134 | vid ignoring destination vid restricted to vid network hops only
135 | journalled transacted acknowledge for vid at vid
136 | async exception with no exception listener vid
137 | could not preallocate journal file with zeros
138 | unable to unregister subscription vid
139 | attempting to acquire vid
140 | failed to remove scheduler vid
141 | starting a network connection between vid ms
142 | could not create transportlogger reason vid
143 | mqtt client vid connected version vid
144 | get peer broker index vid
145 | vid performance vid to vid
146 | vid ignoring destination vid restricted to vid network hops only
147 | transportloggerfactory could not be started reason vid
148 | received null command from url vid
149 | sending message to vid client vid
150 | last update vid full gc candidates set vid
151 | failed to call getplatformmbeanserver due to
152 | can t use property vid which is of type vid value
153 | policy not applied error processing object addition for addition of vid
154 | executing sql vid
155 | failed to write to scheduler vid
156 | rollback processing error
157 | cleanup removing the data
158 | could not connect to local uri vid vid
159 | starting network connection between vid and vid has been established
160 | failed to lookup the broker from vid
161 | vid ms elapsed and vid consumers subscribed starting dispatch
162 | waiting for outstanding responses to be properly
163 | thread using classloader vid
164 | unknown command vid
165 | stopped recover next messages
166 | vid failed to lease sleeping for vid milli s before trying again
167 | recovery replayed vid operations from the journal
168 | scope vid
169 | failed to register mbean vid
170 | exception occurred for client vid vid processing vid vid
171 | removed scheduled job vid
172 | shutting down test echo service
173 | connector not registered for uuid vid
174 | failed to send command vid
175 | connector stopped stopping proxy
176 | exception on dispatch to browser vid
177 | add exception was raised while executing the run command for oncomplete
178 | start failure exception
179 | the type vid should end with to be a valid discovery type
180 | continuation vid expired vid
181 | suppressing duplicate message send vid
182 | opening new cause
183 | no log writer available for vid
184 | starting to synchronously receive vid messages
185 | vid matching remote vid
186 | failed to unregister mbean vid
187 | load of vid
188 | running clientid vid
189 | failed to aquire lock
190 | adding destination vid
191 | restore consumer vid in pull mode pending recovery overriding prefetch vid
192 | rar vid stopped or undeployed recovery
193 | job scheduler store checkpoint complete
194 | connected to zookeeper
195 | endpoint vid failed to process message reason
196 | the type vid should end with to be a discovery type
197 | invoking start on vid
198 | policy not applied user vid does not have name attribute vid under entry vid
199 | master lock retry sleep interrupted
200 | forwarding of acks in journal file vid
201 | creating temporary file vid
202 | received_exception vid
203 | shutdown of executorservice vid is shutdown vid and terminated vid took vid
204 | async connection timeout task was rejected from the executor
205 | mqtt client vid established heart beat of vid ms vid ms grace period
206 | caught exception in mainloop
207 | exceeded redelivery with count vid ack vid
208 | ignoring consumerinfo vid from vid vid
209 | no connection attempt made in time for vid throwing inactivityioexception
210 |
--------------------------------------------------------------------------------
/src/Baselines/LoGenText-Plus/results/1/activemq/translation.context.test.log:
--------------------------------------------------------------------------------
1 | Namespace(beam=8, cpu=False, data=['data-bin/context'], diverse_beam_groups=-1, diverse_beam_strength=0.5, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, gen_subset='test', lazy_load=False, left_pad_context='True', left_pad_source='True', left_pad_target='False', lenpen=2.5, log_format=None, log_interval=1000, match_source_len=False, max_context_positions=1024, max_len_a=0, max_len_b=100, max_sentences=32, max_source_positions=1024, max_target_positions=1024, max_tokens=None, memory_efficient_fp16=False, min_len=3.0, min_loss_scale=0.0001, model_overrides='{}', nbest=1, no_beamable_mm=False, no_early_stop=False, no_progress_bar=False, no_repeat_ngram_size=0, num_shards=1, num_workers=0, output='translations/1/activemq/translation.context.test.unsort', path='/home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/activemq/checkpoint_last.pt', prefix_size=0, print_alignment=False, quiet=True, raw_text=False, remove_bpe='@@ ', replace_unk=None, required_batch_size_multiple=8, results_path=None, sacrebleu=True, sampling=False, sampling_temperature=1, sampling_topk=-1, score_reference=False, seed=1, shard_id=0, skip_invalid_size_inputs_valid_test=True, source_lang=None, target_lang=None, task='translation_context', tensorboard_logdir='', threshold_loss_scale=None, unkpen=0, unnormalized=False, upsample_primary=1, user_dir=None)
2 | | [code] dictionary: 1080 types
3 | | [log] dictionary: 1080 types
4 | | data-bin/context test 209 examples
5 | | ['data-bin/context'] test 209 examples
6 | | loading model(s) from /home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/activemq/checkpoint_last.pt
7 | | Translated 209 sentences (2201 tokens) in 3.2s (64.63 sentences/s, 680.59 tokens/s)
8 | | Generate test with beam=8: BLEU = 26.58 46.1/28.3/21.8/18.7 (BP = 0.985 ratio = 0.985 hyp_len = 1299 ref_len = 1319)
9 |
--------------------------------------------------------------------------------
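The last line of each `translation.context.test.log` above reports the corpus-level BLEU for the corresponding project (e.g. `BLEU = 26.58` for activemq). A small sketch, not part of the repository, for collecting those scores; the `results/1` root is taken from the folder layout shown in this dump.

```python
import re
from pathlib import Path

# Scan results/1/<project>/translation.context.test.log and pull out the
# "BLEU = xx.xx" figure reported in each generation log.
results_root = Path('results/1')
for log_file in sorted(results_root.glob('*/translation.context.test.log')):
    match = re.search(r'BLEU = ([0-9.]+)', log_file.read_text())
    if match:
        print(f"{log_file.parent.name}\tBLEU {match.group(1)}")
```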
/src/Baselines/LoGenText-Plus/results/1/activemq/translation.context.test.unsort:
--------------------------------------------------------------------------------
1 | 51 vid ms before attempting to reconnect to vid
2 | 144 vid performance vid to vid
3 | 128 message expired vid
4 | 105 unexpected local exception vid
5 | 31 xa resource manager vid
6 | 187 running clientid vid
7 | 79 setting topic vid to vid
8 | 92 interrupted while redelivery
9 | 138 attempting to acquire vid
10 | 4 error with selector vid
11 | 109 failed to deliver remove command for destination vid
12 | 5 network connection between vid and vid has been established
13 | 97 not adding to dlq vid to vid
14 | 26 stopping broker vid
15 | 86 failed to initialize local connection for the jmsconnector
16 | 179 continuation vid expired vid
17 | 90 xa transaction rollback vid
18 | 161 waiting for outstanding responses to be properly
19 | 201 received_exception vid
20 | 39 sendreq vid
21 | 102 failed to accept accept for vid
22 | 156 cleanup removing the data
23 | 58 work rejected vid
24 | 22 broker plugin vid started
25 | 29 sendreq vid
26 | 72 async start of vid
27 | 21 attempting to acquire the exclusive lock to become the master broker
28 | 200 creating temporary file vid
29 | 158 starting network connection between vid and vid has been established
30 | 96 attempting to acquire the exclusive lock to become the master broker
31 | 181 opening new cause
32 | 140 starting a network connection between vid ms
33 | 2 async dispose interrupted
34 | 41 failed to unregister mbean vid
35 | 59 reusing an active session vid
36 | 205 caught exception in mainloop
37 | 16 redelivery of vid stopped
38 | 111 installing discarding dead letter queue broker plugin dropall vid dropall vid
39 | 193 connected to zookeeper
40 | 185 failed to unregister mbean vid
41 | 154 failed to write to scheduler vid
42 | 114 failed to send mqtt subscription vid
43 | 54 get destinations returned empty list
44 | 99 sampler interrupted
45 | 183 starting to synchronously receive vid messages
46 | 91 bridge was disposed before the first vid
47 | 142 mqtt client vid connected version vid
48 | 13 setting optimized out of vid to vid
49 | 195 the type vid should end with to be a discovery type
50 | 153 executing sql vid
51 | 208 no connection attempt made in time for vid throwing inactivityioexception
52 | 139 failed to remove scheduler vid
53 | 189 adding destination vid
54 | 47 waiting vid ms before attempting to reconnect
55 | 95 removing consumer vid
56 | 115 connector vid started
57 | 24 vid flushtodisk done vid ms vid
58 | 124 failed to prepare xaresource vid
59 | 71 vid attempting to acquire exclusive lease to become the master
60 | 67 failed to register mbean vid
61 | 172 connector not registered for uuid vid
62 | 57 setting durable subscriber to vid
63 | 196 invoking start on vid
64 | 113 vid vid ms elapsed since last write check
65 | 7 vid failed to resetting to batch
66 | 10 store commit failed
67 | 49 received an exception but connection is ignored vid
68 | 170 removed scheduled job vid
69 | 45 network bridge could not be registered in jmx vid
70 | 70 vid remove request on vid from vid vid matching sub vid
71 | 178 the type vid should end with to be a valid discovery type
72 | 80 no queue named vid
73 | 122 failed to unregister mbean vid
74 | 52 failed to get durable subscription vid
75 | 83 locker keepalive resulted in
76 | 42 periodic checkpoint failed
77 | 120 executing sql vid
78 | 186 load of vid
79 | 44 creating producer vid message vid
80 | 87 timeout waiting for echo service shutdown
81 | 148 sending message to vid client vid
82 | 112 failed to create object name to unregister vid
83 | 85 notified failover transport vid of interruption completion
84 | 135 async exception with no exception listener vid
85 | 174 connector stopped stopping proxy
86 | 162 thread using classloader vid
87 | 46 slow kahadb access journal append took vid ms index
88 | 125 the remote exception was vid
89 | 63 recovery mode trying to reconnect to zero
90 | 53 unable to read persisted selector cache it will be ignored
91 | 188 failed to aquire lock
92 | 101 failed to close connection vid
93 | 76 apache activemq vid vid vid
94 | 15 tempdest vid
95 | 61 creating producer to vid
96 | 12 destination is full vid
97 | 0 expiring connection to zookeeper vid
98 | 203 async connection timeout task was rejected from the executor
99 | 130 failed to load vid
100 | 146 transportloggerfactory could not be started reason vid
101 | 25 failed to fire bridge for vid
102 | 137 unable to unregister subscription vid
103 | 194 endpoint vid failed to process message reason
104 | 11 failed to create persistence adapter vid
105 | 74 connector removed with uri vid
106 | 77 auto transport newconnectionexecutor didn t cleanly
107 | 199 forwarding of acks in journal file vid
108 | 127 amqp header arrived invalid version vid
109 | 104 error occured while processing vid
110 | 147 received null command from url vid
111 | 143 get peer broker index vid
112 | 100 vid received message vid
113 | 136 could not preallocate journal file with zeros
114 | 171 shutting down test echo service
115 | 93 unsubscribing durable journal
116 | 117 could not transfer the template file to journal transferfile vid
117 | 191 rar vid stopped or undeployed recovery
118 | 176 add exception was raised while executing the run command for oncomplete
119 | 126 committing user vid
120 | 19 reconnected to vid
121 | 159 failed to lookup the broker from vid
122 | 38 message received since last read check resetting flag vid
123 | 106 vid end of vid with vid
124 | 65 producer vid with non persistent delivery
125 | 37 unknown datastruction vid
126 | 155 rollback processing error
127 | 64 could not apply query parameters vid to vid
128 | 60 thread does not hold the context lock on close of vid
129 | 27 sasl vid handshake complete
130 | 184 vid matching remote vid
131 | 75 corrupt journal record unexpected exception on journal replay of location vid
132 | 207 ignoring consumerinfo vid from vid vid
133 | 206 exceeded redelivery with count vid ack vid
134 | 197 policy not applied user vid does not have name attribute vid under entry vid
135 | 40 vid could not find the object rename for vid
136 | 89 failed to remove consumer on connection vid
137 | 14 async read check was rejected from the executor
138 | 182 no log writer available for vid
139 | 163 unknown command vid
140 | 150 failed to call getplatformmbeanserver due to
141 | 118 exception occurred for client vid vid processing vid
142 | 73 vid no set batch from sequence id set vid
143 | 6 unknown command vid
144 | 141 could not create transportlogger reason vid
145 | 134 journalled transacted acknowledge for vid at vid
146 | 18 vid ignoring sub vid on vid from vid is no longer active
147 | 84 failure reason
148 | 48 prepare of vid failed because it was marked rollback only
149 | 110 vid removed from scheduler vid
150 | 123 forcing shutdown of executorservice vid
151 | 55 caught an exception trying to determine if there is no flag
152 | 132 vid recovered prepared vid
153 | 168 failed to register mbean vid
154 | 173 failed to send command vid
155 | 28 error receiving message vid this exception is ignored
156 | 9 shutdown of topic traffic generator failed
157 | 169 exception occurred for client vid vid processing vid vid
158 | 34 vid message sent to vid
159 | 152 policy not applied error processing object addition for addition of vid
160 | 1 unsubscribing next messages
161 | 202 shutdown of executorservice vid is shutdown vid and terminated vid took vid
162 | 108 message not found in sequence id index vid
163 | 167 scope vid
164 | 68 master lock retry sleep interrupted
165 | 204 mqtt client vid established heart beat of vid ms vid ms grace period
166 | 198 master lock retry sleep interrupted
167 | 32 do not know how to process activemq command vid
168 | 66 failed to call after delivery
169 | 3 stopping async topic tasks
170 | 160 vid ms elapsed and vid consumers subscribed starting dispatch
171 | 157 could not connect to local uri vid vid
172 | 36 message not found in sequence id index vid
173 | 56 vid usage manager memory limit reached vid producers will be throttled to the rate at vid
174 | 107 master lock retry sleep interrupted
175 | 81 could not connect to local uri vid vid
176 | 151 can t use property vid which is of type vid value
177 | 131 failed to unregister mbean vid
178 | 166 recovery replayed vid operations from the journal
179 | 165 vid failed to lease sleeping for vid milli s before trying again
180 | 116 session vid has more work to do b c of unconsumed
181 | 149 last update vid full gc candidates set vid
182 | 78 assigned vid to consumer vid
183 | 164 stopped recover next messages
184 | 180 suppressing duplicate message send vid
185 | 69 vid ms elapsed since last write check
186 | 119 async error occurred vid
187 | 17 exception on forwarding to non existent temp dest
188 | 98 trying to build a pooledconnectionfactory
189 | 175 exception on dispatch to browser vid
190 | 82 closed socket vid
191 | 62 vid elapsed time in second vid s
192 | 190 restore consumer vid in pull mode pending recovery overriding prefetch vid
193 | 129 error on queueing the ack compactor
194 | 20 detected missing corrupt journal files dropped vid messages from the index in vid seconds
195 | 121 msg vid id vid destinationname vid
196 | 88 trace entry vid
197 | 103 rolled back vid messages from the index in vid seconds
198 | 35 zero length partial vid
199 | 43 error unsubscribing vid from vid vid
200 | 23 checkpoint failed
201 | 145 vid ignoring destination vid restricted to vid network hops only
202 | 177 start failure exception
203 | 33 sending to vid messages from vid to vid
204 | 50 could not correlate the connection vid
205 | 94 sending to vid messages to vid
206 | 133 vid ignoring destination vid restricted to vid network hops only
207 | 192 job scheduler store checkpoint complete
208 | 30 endpoint vid will not receive any messages due to broker zero error vid
209 | 8 virtual consumer added vid for virtual destination vid
210 |
--------------------------------------------------------------------------------
/src/Baselines/LoGenText-Plus/results/1/ambari/translation.context.test.log:
--------------------------------------------------------------------------------
1 | Namespace(beam=8, cpu=False, data=['data-bin/context'], diverse_beam_groups=-1, diverse_beam_strength=0.5, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, gen_subset='test', lazy_load=False, left_pad_context='True', left_pad_source='True', left_pad_target='False', lenpen=2.5, log_format=None, log_interval=1000, match_source_len=False, max_context_positions=1024, max_len_a=0, max_len_b=100, max_sentences=32, max_source_positions=1024, max_target_positions=1024, max_tokens=None, memory_efficient_fp16=False, min_len=3.0, min_loss_scale=0.0001, model_overrides='{}', nbest=1, no_beamable_mm=False, no_early_stop=False, no_progress_bar=False, no_repeat_ngram_size=0, num_shards=1, num_workers=0, output='translations/1/ambari/translation.context.test.unsort', path='/home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/ambari/checkpoint_last.pt', prefix_size=0, print_alignment=False, quiet=True, raw_text=False, remove_bpe='@@ ', replace_unk=None, required_batch_size_multiple=8, results_path=None, sacrebleu=True, sampling=False, sampling_temperature=1, sampling_topk=-1, score_reference=False, seed=1, shard_id=0, skip_invalid_size_inputs_valid_test=True, source_lang=None, target_lang=None, task='translation_context', tensorboard_logdir='', threshold_loss_scale=None, unkpen=0, unnormalized=False, upsample_primary=1, user_dir=None)
2 | | [code] dictionary: 1080 types
3 | | [log] dictionary: 1080 types
4 | | data-bin/context test 365 examples
5 | | ['data-bin/context'] test 365 examples
6 | | loading model(s) from /home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/ambari/checkpoint_last.pt
7 | | Translated 365 sentences (4566 tokens) in 6.8s (53.74 sentences/s, 672.23 tokens/s)
8 | | Generate test with beam=8: BLEU = 25.50 46.4/28.2/20.9/16.7 (BP = 0.982 ratio = 0.982 hyp_len = 2683 ref_len = 2733)
9 |
--------------------------------------------------------------------------------
/src/Baselines/LoGenText-Plus/results/1/brooklyn/translation.context.test:
--------------------------------------------------------------------------------
1 | problem persisting but no longer running ignoring
2 | unable to create policy spec for vid vid
3 | management node vid in vid seconds waiting for persistence to write all data continuing
4 | unable to instantiate vid vid rethrowing vid
5 | vid invoking sensor vid on vid with vid
6 | vid no port available for vid empty range vid
7 | vid rethrowing
8 | user vid not authorized to see sensor vid of entity vid excluding from current state results
9 | determined reachability of sockets vid vid
10 | vid health check for vid component continuing recovering vid
11 | unable again to find details of location vid in rest call to list ignoring location vid
12 | multiple ambiguous definitions for config key vid on vid with vid
13 | authentication successful vid
14 | retrieving java url vid from vid
15 | publishing management node health vid
16 | first server up in vid is vid
17 | using first reachable address vid for node vid in vid
18 | unable to instantiate vid vid
19 | cancelled vid tasks for vid vid
20 | can t calculate percentage value for entity vid as total from producer vid is zero
21 | destroying app vid mgmt is vid
22 | cannot set key vid on vid from flag vid containing class is not configurable
23 | error resizing but no longer running vid
24 | failed to resolve aws hostname of vid rethrowing
25 | parsing values for vid at vid vid
26 | invoking effector vid on vid with args vid
27 | vid recording removal of container vid
28 | vid redundant call to start vid skipping vid
29 | for vid considering members vid
30 | getrequiredopenports detected at vid vid
31 | credentials have no effect in builder unless uri for host is specified
32 | missing icon data for vid expected at vid already logged warn and error details
33 | checkpointing delta of memento with references updating vid entities vid locations vid policies vid enrichers vid catalog items vid bundles removing vid
34 | vid n plan being added is n vid n plan already present is n vid
35 | configuration error vid
36 | primary node vid is deprecated use vid instead use vid
37 | deprecated use of managementcontext for unmanaged vid ignoring
38 | loaded java type vid for vid vid but had errors vid
39 | installing vid from vid on vid
40 | missing catalog item for vid vid inferring as vid because that is able to load the item
41 | reconfiguring vid config file for vid because vid is not on vid
42 | conflicting value for key vid from deprecated name vid using earlier deprecated name vid
43 | deserializing the non static class vid with multiple outer class fields vid when changing compilers it s possible that the instance won t be able to be deserialized due to changed outer class field names in those cases deserialization could fail with field not found exception or class cast exception following this log line
44 | item vid cannot be moved skipping
45 | launched brooklyn vid
46 | replacing in vid member vid with old address vid new address vid
47 | adding to vid vid appears identical to existing vid may get removed on rebind underlying addition should be modified so it is not added twice
48 | error adding brooklyn properties for vid vid
49 | error copying customize resources
50 | problem polling for async script on vid for vid continuing
51 | resize vid from vid to vid
52 | tmpdirfinder candidate tmp dir vid cannot have files created inside it vid
53 | loaded rebind raw data took vid vid entities vid locations vid policies vid enrichers vid feeds vid catalog items vid bundles from vid
54 | changing hostname recorded against public ip vid from vid
55 | executing vid failed with class vid
56 | using ssh tool vid of type vid props
57 | subsequent error during termination vid
58 | failed rest deployment launching vid vid
59 | rebindriver for vid is not transforming machine location so not generating machine vid vid
60 | management node vid in vid new plane unable to promote to vid currently vid see log for vid
61 | error destroying vid ignoring vid
62 | detail on failure to deploy webapp vid
63 | policy vid balancing finished at cold node vid workrate number no way to improve it
64 | problem persisting change delta rethrowing
65 | group vid got new member vid
66 | vid recording metric update for item vid
67 | brooklynsecurityproviderfilterjavax dofilter caught vid
68 | problem in ha poller but no longer running vid
69 | cannot request read only mode for vid when already running vid ignoring
70 | rebindmanager instantiate vid rethrowing vid
71 | vid check for vid continuing failing vid
72 | cannot get hostname bug with string vid for vid ignoring
73 | failed to set permissions to vid for file vid
74 | fallback super realclass vid attempt failed orig class vid vid
75 | location vid added to vid
76 | success following serialized for vid vid
77 | running shell command at vid vid
78 | discouraged use of brooklyn properties deprecated use vid instead use vid
79 | error calculating and setting combination for enricher vid
80 | cassandra nics inferred ip vid for vid
81 | policy vid balancing finished at cold node vid workrate number no way to improve it
82 | jclouds using template vid options vid to provision machine in vid
83 | initiating replica set with vid
84 | deprecated use of brooklyn custom brooklyn properties for vid
85 | vid publishing failed state vid currentfailurestarttime vid now vid
86 | skipping configuration of non ec2 computeservice vid
87 | rebinding entity vid even though actual state is vid expected state is vid
88 | starting entity vid at vid
89 | no portforwardmanager using legacy vid
90 | geodns inferred geoinfo vid from hostname vid
91 | deprecated use of scanjavaannotations instead use of vid version syntax in future versions to load vid
92 | error rebinding brooklyn web console rebinding
93 | seeds considered stable for cluster vid node vid
94 | expected to find two security groups on node vid in app vid one shared one unique found vid vid
95 | queued task vid rethrowing vid
96 | error forcing brooklyn gc usage now vid
97 | vid adding children to vid n vid
98 | item vid cannot be moved skipping
99 | unable to create from archive returning vid
100 | resolution of vid failed swallowing and returning vid
101 | queued task vid of vid no longer running vid
102 | disconnecting sshjtool vid vid
103 | brooklyn geo info lookup failed for vid
104 | cors brooklyn fee disabled
105 | context entity found by looking at target vid entity tag not context entity
106 | multiple definitions for effector vid on vid ignoring vid
107 | network facing enricher not transforming vid uri vid because no port in target vid for vid
108 | copying chunk vid to vid on vid
109 | bundle vid containing bom is not managed by brooklyn using legacy item installation
110 | deprecated use of name key to define vid version should be specified within id key or with version key not this tag
111 | vid ports not applicable or not yet applicable because has multiple locations vid ignoring
112 | invoking vid on vid in vid
113 | can t infer catalog item id from the following plan n vid
114 | uninstalling bundle vid from brooklyn ui module bundle location vid
115 | members of vid checking vid eliminating because not member
116 | vid added to machine vid of location vid vid
117 | error stopping child continuing and will rethrow if no other errors
118 | multiple definitions for config key vid on vid from vid and vid preferring lower vid value vid
119 | cancelling vid mode vid on vid
120 | uninstalling bundle vid from brooklyn managed bundle vid n vid
121 | failed to unmanage entity vid and its descendants after failure to initialise rethrowing original exception
122 | could not determine canonical name of file vid returning original file
123 | no maven resource file vid available
124 | vid clearing ssh for vid
125 | scheduling item for persistence addition vid
126 | error computing geo info for vid internet issues or too many requests to free servers for vid subsequent errors for vid
127 | network facing enricher not transforming vid uri vid because no port mapping for vid
128 | failed to set permissions to vid for file vid expected behaviour on windows vid subsequent failures on any file will be logged at trace
129 | trace for quarantine group vid failed to start entity vid removing vid
130 | osgi could not find bundle vid in search after installing it from vid
131 | two masters detected probably a handover just occured vid
132 | launching vid members of vid now vid
133 | installing image regex to vid for vid
134 | flagutils for vid setting field vid val vid newval vid key vid
135 | vid undeploying vid vid on vid
136 | running command at vid vid
137 | vid recording addition of container vid
138 | brooklynsecurityproviderfilterjavax start
139 | theoretical best primary at vid vid maybe others not available using next best vid
140 | formula configured vid
141 | error creating uri for vid rethrowing vid
142 | validation done in vid
143 | vid scheduling but no longer running vid
144 | members of vid checking vid eliminating because not up
145 | resource vid type vid deployed to vid
146 | cannot notifyofinitialvalue for subscription with value vid
147 | creating customizing vid for vid
148 | create shell command at vid
149 | no reachable address vid feed from vid to vid
150 | activating local management for vid on start
151 | sethostnamecustomizer ignoring machine vid in vid
152 | while starting vid obtained new location instance vid
153 | managing vid in mode vid doing this recursively because a child is preregistered
154 | problem setting application lifecycle usage event vid vid
155 | vid closing pool for vid
156 | autodeployment in parent s management context triggered for vid vid will not be supported in future explicit manage call required
157 | child spec vid is already set with parent vid how did this happen
158 | found existing shared security group in vid for app vid vid
159 | found namespace vid returning it
160 | skipping ssh check for vid vid due to config waitforconnectable vid
161 | failed transfer vid to vid retryable error attempt vid vid vid
162 | error stopping brooklynweb console rethrowing
163 | starting entity vid at vid
164 | could not register external ui module vid vid
165 | service vid could not be parsed at vid vid
166 | discouraged deprecated use of static annotated effector method vid defined in vid
167 | unable again to find details of location vid in rest call to list ignoring location vid
168 | vid pre start management of entity vid mode vid
169 | releasing machine vid in vid instance id vid
170 | problem releasing machine vid propagating after vid vid
171 | this management node vid supposed to be master but reportedly unhealthy no op as expect other node to fix self vid
172 | rebind entity vid no longer running vid
173 | fallback loadclass vid attempt failed orig class vid vid
174 | configuring brooklynnode entity startup
175 | no location has been set on vid cannot configure security groups in context vid
176 | sequence for vid incremented to vid
177 | updating brooklyn properties from vid
178 | jmx jar for vid is not a valid jmx on vid because no jmx
179 | deprecated automatic coercion of object to timeduration set breakpoint in typecoercions to inspect convert to duration
180 | referenced task for vid vid
181 | adding auto generated user vid vid in vid
182 | suspending machine vid in vid instance id vid
183 | forcing catalog load on access of catalog items
184 | misconfiguration for vid sslconfig vid but no https_port on vid
185 | vid detected item removal on change of vid
186 | rescheduling addition of shard vid because add failed via router vid
187 | destroyed and unmanaged vid mgmt now vid managed vid
188 | problem deleting temporary files of async script on vid ignoring
189 | stopped read only vid mgmt vid
190 | vault response code vid vid
191 | problem terminiating management node state listeners continuing
192 | removing from vid member vid with old address vid because inferred address is now null
193 | machine details for vid missing from jclouds using ssh test instead name vid version vid arch vid ram vid cpus vid
194 | formula configured vid
195 | started brooklyn rest server at vid vid
196 | geodns vid refreshing vid
197 | policy vid detected vid should be on vid but can t move it vid
198 | unable to instantiate vid rethrowing vid
199 | system bundles are vid
200 | error in enricher vid but no longer running vid
201 | creating brooklyn local copy of bundle file vid
202 | vid resizing vid from vid to vid vid
203 | discouraged deprecated use of brooklynproperties for vid instead vid
204 | brooklyn gc deleted vid tasks as was over global limit now have vid
205 | custom password rebind for vid vid
206 | error launching brooklyn items from node vid ignoring vid
207 | ignoring failed execution of task callback hook vid because executor is shutdown
208 | failed to resolve aws hostname of vid vid
209 | installing vid with exit code vid
210 | isfirstnodeset but no cluster members found to add vid
211 | cannot store location lifecycle usage event for vid state vid because storage not available
212 | looking up vid in osgi
213 | standard location resolvers not installed location resolution will fail shortly
214 | restarting entity vid in vid machine vid
215 | done vid checkentity vid
216 | unable to delete one or more paths vid on shutdown vid
217 | launching vid with role vid and source of attempt to vid with role vid and vid but no unmanaged
218 | parent not found discarding following original ring for vid
219 | loading initial catalog from vid
220 | vid invoking effector on vid effector vid parameters vid
221 | queueing update needed task for vid update will occur shortly
222 | adding startup script to enable winrm for windows vm on vid
223 | brooklyn thought it was already managing bundle vid but it s not installed to framework
224 | vid invoking effector vid on vid with vid which is the target vid
225 | error running mongodb script vid at vid
226 | creating zookeeper using custom spec for vid
227 | repeating problem vid but no longer active ignoring
228 | releasing machine vid in vid instance id vid ignoring and continuing vid vid
229 | deleting temporary token for vid with version vid
230 | invalid item in catalog when converting rest catalog item type vid
231 | deletion of orphan state found unusually referenced feeds keeping vid
232 | looking up external classpath for vid
233 | vid calculated desired pool size vid from vid to vid
234 | error launching brooklyn vid
235 | unable to re connect to jmx url vid vid
236 | problem notifying listener vid of vid
237 | vm vid connection succeeded after vid on vid
238 | tmpdirfinder candidate tmp dir vid cannot have files created inside it vid
239 | error recording monitor info vid
240 | ignoring flag open_iptables on non ssh location vid
241 | task vid was modified but modification was never used
242 | long poll retrieving status directly received exit status will retry on vid for vid
243 | vid picking up vid as the tracker already set often due to rebind
244 | multiple definitions for effector vid on vid preferring lower vid to vid
245 | deprecated use of entities startmanagement application managementcontext for vid ignoring vid
246 | vid set on vid but pollforfirstreachableaddress vid
247 | use of groovy lang closure is deprecated in basicsubscriptioncontext subscribe
248 | restarting brooklyn machine in vid instance id vid
249 | theoretical best primary at vid vid maybe others not available at vid
250 | ignoring deprecated flag open_iptables on windows location vid
251 | error polling for vid command vid
252 | knifeportuseknifedefault specified to vid when already told to use vid explicitly overriding previous see subsequent warning for more details
253 | vid recording pool size vid for vid
254 | use of groovy lang closure is deprecated in type vid
255 | catalog does not contain item for type vid loaded class directly instead
256 | for vid considering membership of vid which is in locations vid
257 | looking for vid in revised location vid
258 | delaying vid vid allowed vid elapsed then rechecking for vid ms
259 | rest request running as vid threw vid
260 | mysampleimpl init with config vid
261 | localhost obtainport vid returning vid
262 | fabric vid updating seeds chosen vid potential vid
263 | geodns including vid even though vid is a private subnet homeless ential vid
264 | ignoring userdatastring vid in vm creation because not supported for cloud type vid
265 | management node vid detected master change required newmaster vid oldmaster vid plane vid heartbeattimeout vid
266 | vid can t configure resolver at vid no sshmachines
267 | brooklyn management context for vid vid
268 | rebinding addition of memento vid vid
269 | starting entity vid at vid
270 | cancelled vid tasks for vid with vid remaining of vid vid
271 | enricher vid transforming vid to vid
272 | had to wait vid for vid vid to be true before setting vid
273 | resizing vid to vid proxy vid of vid
274 | change handler should be hidden by event handler trace for unexpected mongo node handler
275 | bundle vid matches metadata of managed bundle vid but not osgi bundle location vid and matches already installed osgi bundle is no op
276 | ignoring mode vid in favour of port for management candidates of vid vid
277 | unexpected structure for state module vid skipping vid vid
278 | queued task vid at context vid no hierarchy
279 | effector vid defined on vid has no body invoking caller supplied vid instead
280 | ambiguous spec supertypes vid for target vid it is recommended that any registered type constraint for a spec be compatible with the sions
281 | restart of vid requested be applied at machine level
282 |
--------------------------------------------------------------------------------
/src/Baselines/LoGenText-Plus/results/1/brooklyn/translation.context.test.log:
--------------------------------------------------------------------------------
1 | Namespace(beam=8, cpu=False, data=['data-bin/context'], diverse_beam_groups=-1, diverse_beam_strength=0.5, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, gen_subset='test', lazy_load=False, left_pad_context='True', left_pad_source='True', left_pad_target='False', lenpen=2.5, log_format=None, log_interval=1000, match_source_len=False, max_context_positions=1024, max_len_a=0, max_len_b=100, max_sentences=32, max_source_positions=1024, max_target_positions=1024, max_tokens=None, memory_efficient_fp16=False, min_len=3.0, min_loss_scale=0.0001, model_overrides='{}', nbest=1, no_beamable_mm=False, no_early_stop=False, no_progress_bar=False, no_repeat_ngram_size=0, num_shards=1, num_workers=0, output='translations/1/brooklyn/translation.context.test.unsort', path='/home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/brooklyn/checkpoint_last.pt', prefix_size=0, print_alignment=False, quiet=True, raw_text=False, remove_bpe='@@ ', replace_unk=None, required_batch_size_multiple=8, results_path=None, sacrebleu=True, sampling=False, sampling_temperature=1, sampling_topk=-1, score_reference=False, seed=1, shard_id=0, skip_invalid_size_inputs_valid_test=True, source_lang=None, target_lang=None, task='translation_context', tensorboard_logdir='', threshold_loss_scale=None, unkpen=0, unnormalized=False, upsample_primary=1, user_dir=None)
2 | | [code] dictionary: 1080 types
3 | | [log] dictionary: 1080 types
4 | | data-bin/context test 281 examples
5 | | ['data-bin/context'] test 281 examples
6 | | loading model(s) from /home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/brooklyn/checkpoint_last.pt
7 | | Translated 281 sentences (4829 tokens) in 7.5s (37.59 sentences/s, 646.06 tokens/s)
8 | | Generate test with beam=8: BLEU = 31.22 51.6/32.5/25.6/22.1 (BP = 1.000 ratio = 1.014 hyp_len = 2680 ref_len = 2642)
9 |
--------------------------------------------------------------------------------
/src/Baselines/LoGenText-Plus/results/1/camel/translation.context.test.log:
--------------------------------------------------------------------------------
1 | Namespace(beam=8, cpu=False, data=['data-bin/context'], diverse_beam_groups=-1, diverse_beam_strength=0.5, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, gen_subset='test', lazy_load=False, left_pad_context='True', left_pad_source='True', left_pad_target='False', lenpen=2.5, log_format=None, log_interval=1000, match_source_len=False, max_context_positions=1024, max_len_a=0, max_len_b=100, max_sentences=32, max_source_positions=1024, max_target_positions=1024, max_tokens=None, memory_efficient_fp16=False, min_len=3.0, min_loss_scale=0.0001, model_overrides='{}', nbest=1, no_beamable_mm=False, no_early_stop=False, no_progress_bar=False, no_repeat_ngram_size=0, num_shards=1, num_workers=0, output='translations/1/camel/translation.context.test.unsort', path='/home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/camel/checkpoint_last.pt', prefix_size=0, print_alignment=False, quiet=True, raw_text=False, remove_bpe='@@ ', replace_unk=None, required_batch_size_multiple=8, results_path=None, sacrebleu=True, sampling=False, sampling_temperature=1, sampling_topk=-1, score_reference=False, seed=1, shard_id=0, skip_invalid_size_inputs_valid_test=True, source_lang=None, target_lang=None, task='translation_context', tensorboard_logdir='', threshold_loss_scale=None, unkpen=0, unnormalized=False, upsample_primary=1, user_dir=None)
2 | | [code] dictionary: 1080 types
3 | | [log] dictionary: 1080 types
4 | | data-bin/context test 637 examples
5 | | ['data-bin/context'] test 637 examples
6 | | loading model(s) from /home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/camel/checkpoint_last.pt
7 | | Translated 637 sentences (7331 tokens) in 9.5s (67.14 sentences/s, 772.64 tokens/s)
8 | | Generate test with beam=8: BLEU = 40.05 59.9/45.1/39.6/37.3 (BP = 0.896 ratio = 0.901 hyp_len = 4093 ref_len = 4543)
9 |
--------------------------------------------------------------------------------
/src/Baselines/LoGenText-Plus/results/1/cloudstack/translation.context.test.log:
--------------------------------------------------------------------------------
1 | Namespace(beam=8, cpu=False, data=['data-bin/context'], diverse_beam_groups=-1, diverse_beam_strength=0.5, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, gen_subset='test', lazy_load=False, left_pad_context='True', left_pad_source='True', left_pad_target='False', lenpen=2.5, log_format=None, log_interval=1000, match_source_len=False, max_context_positions=1024, max_len_a=0, max_len_b=100, max_sentences=32, max_source_positions=1024, max_target_positions=1024, max_tokens=None, memory_efficient_fp16=False, min_len=3.0, min_loss_scale=0.0001, model_overrides='{}', nbest=1, no_beamable_mm=False, no_early_stop=False, no_progress_bar=False, no_repeat_ngram_size=0, num_shards=1, num_workers=0, output='translations/1/cloudstack/translation.context.test.unsort', path='/home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/cloudstack/checkpoint_last.pt', prefix_size=0, print_alignment=False, quiet=True, raw_text=False, remove_bpe='@@ ', replace_unk=None, required_batch_size_multiple=8, results_path=None, sacrebleu=True, sampling=False, sampling_temperature=1, sampling_topk=-1, score_reference=False, seed=1, shard_id=0, skip_invalid_size_inputs_valid_test=True, source_lang=None, target_lang=None, task='translation_context', tensorboard_logdir='', threshold_loss_scale=None, unkpen=0, unnormalized=False, upsample_primary=1, user_dir=None)
2 | | [code] dictionary: 1080 types
3 | | [log] dictionary: 1080 types
4 | | data-bin/context test 1061 examples
5 | | ['data-bin/context'] test 1061 examples
6 | | loading model(s) from /home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/cloudstack/checkpoint_last.pt
7 | | Translated 1061 sentences (13432 tokens) in 20.5s (51.64 sentences/s, 653.81 tokens/s)
8 | | Generate test with beam=8: BLEU = 34.95 53.6/38.9/31.9/27.8 (BP = 0.948 ratio = 0.949 hyp_len = 8344 ref_len = 8789)
9 |
--------------------------------------------------------------------------------
/src/Baselines/LoGenText-Plus/results/1/hadoop/translation.context.test.log:
--------------------------------------------------------------------------------
1 | Namespace(beam=8, cpu=False, data=['data-bin/context'], diverse_beam_groups=-1, diverse_beam_strength=0.5, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, gen_subset='test', lazy_load=False, left_pad_context='True', left_pad_source='True', left_pad_target='False', lenpen=2.5, log_format=None, log_interval=1000, match_source_len=False, max_context_positions=1024, max_len_a=0, max_len_b=100, max_sentences=32, max_source_positions=1024, max_target_positions=1024, max_tokens=None, memory_efficient_fp16=False, min_len=3.0, min_loss_scale=0.0001, model_overrides='{}', nbest=1, no_beamable_mm=False, no_early_stop=False, no_progress_bar=False, no_repeat_ngram_size=0, num_shards=1, num_workers=0, output='translations/1/hadoop/translation.context.test.unsort', path='/home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/hadoop/checkpoint_last.pt', prefix_size=0, print_alignment=False, quiet=True, raw_text=False, remove_bpe='@@ ', replace_unk=None, required_batch_size_multiple=8, results_path=None, sacrebleu=True, sampling=False, sampling_temperature=1, sampling_topk=-1, score_reference=False, seed=1, shard_id=0, skip_invalid_size_inputs_valid_test=True, source_lang=None, target_lang=None, task='translation_context', tensorboard_logdir='', threshold_loss_scale=None, unkpen=0, unnormalized=False, upsample_primary=1, user_dir=None)
2 | | [code] dictionary: 1080 types
3 | | [log] dictionary: 1080 types
4 | | data-bin/context test 1127 examples
5 | | ['data-bin/context'] test 1127 examples
6 | | loading model(s) from /home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/hadoop/checkpoint_last.pt
7 | | Translated 1127 sentences (13134 tokens) in 19.2s (58.77 sentences/s, 684.85 tokens/s)
8 | | Generate test with beam=8: BLEU = 23.79 46.1/28.0/22.0/19.1 (BP = 0.877 ratio = 0.884 hyp_len = 7660 ref_len = 8664)
9 |
--------------------------------------------------------------------------------
/src/Baselines/LoGenText-Plus/results/1/hbase/translation.context.test.log:
--------------------------------------------------------------------------------
1 | Namespace(beam=8, cpu=False, data=['data-bin/context'], diverse_beam_groups=-1, diverse_beam_strength=0.5, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, gen_subset='test', lazy_load=False, left_pad_context='True', left_pad_source='True', left_pad_target='False', lenpen=2.5, log_format=None, log_interval=1000, match_source_len=False, max_context_positions=1024, max_len_a=0, max_len_b=100, max_sentences=32, max_source_positions=1024, max_target_positions=1024, max_tokens=None, memory_efficient_fp16=False, min_len=3.0, min_loss_scale=0.0001, model_overrides='{}', nbest=1, no_beamable_mm=False, no_early_stop=False, no_progress_bar=False, no_repeat_ngram_size=0, num_shards=1, num_workers=0, output='translations/1/hbase/translation.context.test.unsort', path='/home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/hbase/checkpoint_last.pt', prefix_size=0, print_alignment=False, quiet=True, raw_text=False, remove_bpe='@@ ', replace_unk=None, required_batch_size_multiple=8, results_path=None, sacrebleu=True, sampling=False, sampling_temperature=1, sampling_topk=-1, score_reference=False, seed=1, shard_id=0, skip_invalid_size_inputs_valid_test=True, source_lang=None, target_lang=None, task='translation_context', tensorboard_logdir='', threshold_loss_scale=None, unkpen=0, unnormalized=False, upsample_primary=1, user_dir=None)
2 | | [code] dictionary: 1080 types
3 | | [log] dictionary: 1080 types
4 | | data-bin/context test 507 examples
5 | | ['data-bin/context'] test 507 examples
6 | | loading model(s) from /home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/hbase/checkpoint_last.pt
7 | | Translated 507 sentences (5988 tokens) in 9.1s (55.81 sentences/s, 659.12 tokens/s)
8 | | Generate test with beam=8: BLEU = 23.73 45.2/27.9/21.7/17.7 (BP = 0.899 ratio = 0.904 hyp_len = 3583 ref_len = 3964)
9 |
--------------------------------------------------------------------------------
/src/Baselines/LoGenText-Plus/results/1/hive/translation.context.test.log:
--------------------------------------------------------------------------------
1 | Namespace(beam=8, cpu=False, data=['data-bin/context'], diverse_beam_groups=-1, diverse_beam_strength=0.5, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, gen_subset='test', lazy_load=False, left_pad_context='True', left_pad_source='True', left_pad_target='False', lenpen=2.5, log_format=None, log_interval=1000, match_source_len=False, max_context_positions=1024, max_len_a=0, max_len_b=100, max_sentences=32, max_source_positions=1024, max_target_positions=1024, max_tokens=None, memory_efficient_fp16=False, min_len=3.0, min_loss_scale=0.0001, model_overrides='{}', nbest=1, no_beamable_mm=False, no_early_stop=False, no_progress_bar=False, no_repeat_ngram_size=0, num_shards=1, num_workers=0, output='translations/1/hive/translation.context.test.unsort', path='/home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/hive/checkpoint_last.pt', prefix_size=0, print_alignment=False, quiet=True, raw_text=False, remove_bpe='@@ ', replace_unk=None, required_batch_size_multiple=8, results_path=None, sacrebleu=True, sampling=False, sampling_temperature=1, sampling_topk=-1, score_reference=False, seed=1, shard_id=0, skip_invalid_size_inputs_valid_test=True, source_lang=None, target_lang=None, task='translation_context', tensorboard_logdir='', threshold_loss_scale=None, unkpen=0, unnormalized=False, upsample_primary=1, user_dir=None)
2 | | [code] dictionary: 1080 types
3 | | [log] dictionary: 1080 types
4 | | data-bin/context test 629 examples
5 | | ['data-bin/context'] test 629 examples
6 | | loading model(s) from /home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/hive/checkpoint_last.pt
7 | | Translated 629 sentences (6861 tokens) in 10.8s (58.47 sentences/s, 637.76 tokens/s)
8 | | Generate test with beam=8: BLEU = 30.25 48.9/33.0/27.9/24.4 (BP = 0.934 ratio = 0.936 hyp_len = 3898 ref_len = 4163)
9 |
--------------------------------------------------------------------------------
/src/Baselines/LoGenText-Plus/results/1/ignite/translation.context.test:
--------------------------------------------------------------------------------
1 | received session attribute request nodeid vid msg vid
2 | stopping spi vid
3 | overriding partition map in full update map exchid vid curpart vid newpart vid
4 | received session attribute request message msg vid nodeid vid
5 | ipfinder
6 | cassandra session refreshed
7 | will move session to less loaded worker ses vid from vid to vid
8 | baseline won t be changed cause the lost partitions were detected
9 | partition changed state grp vid p vid prev vid to vid
10 | got removed entry in lockasync method will retry vid
11 | node connection is idle but there are unacknowledged messages will wait vid
12 | refresh partitions due to topology update
13 | starting loading model by the path vid
14 | preserving deployment without node participants vid
15 | xa resource start xid vid xid vid
16 | obsolete version was not set because lock was explicit vid
17 | failed to connect to ignite update server vid
18 | generated node_joined bulk event nodecnt vid evtnode vid
19 | mvcc coordinator issued topology version for service vid fut vid
20 | processing node departure vid
21 | refresh partitions due to mapping was changed
22 | received job execution request while stopping will ignore vid
23 | failed to add entry err vid entry vid
24 | sent peer class loading response to node node does not exist vid
25 | attempt to execute cassandra batch vid operation to process rest vid of vid elements
26 | failed to find future for dht finish response txid vid node vid res vid
27 | acquired deployment class from local cache vid
28 | updating full partition map grp vid exchver vid fullmap vid
29 | communication problem resolver detected job cancelled nodeid vid
30 | failed to get future result
31 | got removed entry in transaction getallasync will retry vid
32 | restored near prepare from node vid
33 | async response resp vid
34 | skipped discovery notification node vid type vid topver vid
35 | ignore communication error resolve message resolve process already started sndnode vid
36 | ignore communication error resolver forced nodes stop reqid vid locnode vid
37 | failed to add candidates because entry was removed will renew
38 | maxconntimeout
39 | cleared invalid entry from remote transaction will skip entry vid tx vid
40 | vid xid version uuid vid
41 | store remove key vid tx vid
42 | received dht finish response txid vid dhttxid vid node vid
43 | removed mapping for node nodeid vid tx vid
44 | offheap remove key vid
45 | cassandra session refreshed
46 | committed from tm locnodeid vid tx vid
47 | partition map after beforeexchange grp vid exchid vid fullmap vid
48 | merge exchange future exchfut vid mergedfut vid evt vid evtnode vid evtnodeclient vid
49 | successfully locked persistence storage folder
50 | received near lock response for unknown future txid vid node vid
51 | failed to find future for get response sender vid res vid
52 | failed to find client message worker clientnode vid
53 | shmemport
54 | coordinator failed node is new coordinator ver vid
55 | found unacknowledged batch for left node nodeid vid fut vid
56 | restored partition state from wal grp vid p vid state vid updcntr vid
57 | deactivate page store manager id vid topver vid
58 | updated cache entry val vid old vid entry vid
59 | failed to send initial demand request to node
60 | deployment cannot be reused random class could not be loaded from sender node dep vid meta vid
61 | get affinity from cache vid key vid val vid
62 | failed to unswap entry
63 | scanner processor started
64 | boot class path vid
65 | failed to unlock key all partition nodes left the grid
66 | vid view caches
67 | failed to find class probably due to task job cancellation name vid err vid
68 | vid used cache groups id to name vid
69 | vid has been interrupted
70 | node is stopped or lock is broken in non failover safe mode aborting transaction
71 | injecting cache store session
72 | vid view information in a cluster
73 | message has been sent to next node msg vid next vid
74 | got removed entry while updating near value will retry vid
75 | awscredentials
76 | stopped closure processor
77 | message has been sent to node nodeid vid msg vid
78 | started moving ses vid
79 | failed to find class protocol vid
80 | injected task resources continuous query vid
81 | use vid option to disable it
82 | i am modified job_1 vid on vid
83 | cassandra table vid cause appropriate keyspace doesn t exist
84 | delete entries from db cache vid keytype vid cnt vid
85 | can t initialize query string vid
86 | finished range check range vid pos vid
87 | vid truststore_type vid
88 | cleaner has been cancelled
89 | received remove lock request for removed entry will retry entry vid req vid
90 | failed to send partition update to node left the grid
91 | received metrics update message from unknown node vid
92 | after vid release vid
93 | unregistering mbean vid
94 | error when polling event queue
95 | received duplicate continuous query message vid
96 | received schema propose discovery message but cache is statically configured and vid flag is set will report error opid vid msg vid
97 | sent peer class loading request node vid req vid
98 | discarding node add finished message join process is not finished vid
99 | removed message set due to node leaving grid vid
100 | encrypted data status vid handshakestaus vid ses vid
101 | received near prepare from node that left txid vid node vid
102 | partition has been scheduled for eviction this node is oldest non affinity node grp vid p vid prevstate vid
103 | failed to send tx update response node left msg vid node vid
104 | failed to send dht finish response node left txid vid dhttxid vid node vid
105 | discarding node added message with empty topology vid
106 | failed to send message to node msg vid err vid
107 | waiting for handshake buffer vid
108 | coordinator received single message ver vid node vid allreceived vid
109 | closing connection locnodeid vid rmtaddr vid rmtport vid
110 | finished executing job processor onkernalstop callback
111 | baseline won t be changed in topology
112 | failed to notify exchange future callback for exchange future vid
113 | rolling back ignite transaction vid
114 | opened input stream path vid delegate vid
115 | failed to cancel service ignoring name vid execid vid
116 | new resources vid
117 | i am modified job_1 vid on vid
118 | discarding reconnect message reconnect is completed vid
119 | failed to acquire lock with negative node vid
120 | flushing shuffle messages before sending task completion notification taskinfo vid state vid err vid
121 | skipping global authentication for node security credentials not found probably due to coordinator has older version nodeid vid addrs vid
122 | stealing job to a new node newnode vid oldnode vid sesid vid job vid jobctx vid task vid
123 | partition states after afterexchange grp vid exchver vid
124 | failed to find count down latch with worker vid
125 | cleared invalid entry from remote transaction will skip entry vid tx vid
126 | injecting cache store session vid
127 | abandoning re map because future is done vid
128 | partition map before afterexchange exchid vid fullmap vid
129 | unexpected response to join request vid
130 | sent cache message msg vid node vid
131 | received unexpected response to join request vid
132 | jdbc drivers folder has no files returning empty list
133 | transaction was not found in nodes
134 | vid label vid
135 | message is ignored as it came for the closed topic vid
136 | invalid transaction state for rollback state vid tx vid
137 | completed fragmentizer coordinator remote node vid
138 | finished running ssl engine tasks handshakestatus vid
139 | duplicate initialize process request received will ignore vid
140 | closing socket to next not sent vid
141 | ipc io stopping as unused vid
142 | non loopback local ips vid
143 | failed to restore closed connection reconnect networktimeout vid jointimeout vid
144 | discarding metrics update message issued by node node is no more coordinator vid
145 | ignoring backup element row vid cachemode vid incbackups vid primary vid
146 | tuple id vid from storm vid
147 | skipping own directory vid
148 | received near prepare response txid vid node vid
149 | failed during partition counters delivery to remote node left cluster will ignore futid vid node vid
150 | got removed entry while updating will retry vid
151 | mqtt grid vid
152 | i am modified job_1 vid on vid
153 | received communication error resolve request nodeid vid req vid
154 | application vid is vid
155 | failed to send multicast address request will retry in ms vid
156 | runtime error caught during initial demand request sending
157 | added new daemon node to topology vid
158 | failed to send checkpoint message to node msg vid err vid
159 | discarding killed join vid
160 | closing zookeeper ip finder
161 | failed to send verified node left message to node msg vid
162 | node left topology vid
163 | unknown connection detected is some other software connecting to this ignite port vid connection vid rmtaddr vid
164 | removing left node from full map update grp vid nodeid vid partmap vid
165 | sent job request client disconnected node vid taskname vid
166 | dht lock fut failed to send request txid vid dhttxid vid intx vid node vid
167 | prepared statement cluster error detected another thread already first
168 | ignite node is in invalid state due to a critical failure
169 | partition map after beforeexchange grp vid exchid vid fullmap vid
170 | skipping deployment check as remote node does not have required class vid
171 | timed out waiting for lock response vid
172 | vid node id vid
173 | received job cancel stopped callback
174 | received onundeploy request ldriver vid
175 | initialized alive zookeeper ip finder vid
176 | added invalid partition to future invalidparts vid
177 | load cache vid key vid val vid
178 | caught malformed url exception vid
179 | entry clear key vid entry vid val vid
180 | write entries to db cache vid keytype vid cnt vid
181 | return lastinitializedfut for topology ready future ver vid fut vid
182 | got removed entry when adding lock will retry vid
183 | received shuffle ack desc vid msg vid
184 | total number of jobs to be stolen vid
185 | will move session to less loaded worker ses vid msg vid
186 | found duplicate future in futures map will not add vid
187 | offer not sufficient for slave request vid
188 | gc worker has been started
189 | skipping rebalancing partition state is not moving vid p vid
190 | waiting for coordinator initialization will retry vid
191 | sent near finish response for completed tx txid vid dhttxid vid node vid
192 | failed to send partition update to node because it left grid will ignore node vid msg vid
193 | jobs to reject count jobstoreject vid jobs vid
194 | received dht lock response txid vid dhttxid vid node vid
195 | received user finish request jobid vid ses vid
196 | handshake response from local node vid
197 | starvationinc
198 | updating full partition map grp vid exchver vid fullmap vid
199 | unregistered spi mbean vid
200 | control utility has completed execution at vid
201 | put from load cache vid key vid val vid
202 | completing topology ready future right away head vid topver vid
203 | one model training time was vid
204 | stopped port processor
205 | command vid finished with code vid
206 | received data load request vid
207 | message has been sent to address msg vid locnodeid vid
208 | ignoring entry for partition that does not belong key true val false
209 | failed to stop distributed node vid
210 | bytes sockch vid cnt vid
211 | starting loading model by the path vid
212 | initializing cache store
213 | check before retry node already created vid
214 | discarding node left message join process is not finished vid
215 | partitions have been scheduled to resend reason node vid
216 | ignore affinity change message lastaffver vid exchver vid msgver vid
217 | file has been concurrently deleted vid
218 | ignoring entry for partition that does not belong key true val true err false
219 | got removed entry in lockasync method will retry vid
220 | node version to set vid
221 | got removed entry in lockasync method will retry vid
222 | failed to communication error resolve diagnostic with additional information vid
223 | sent near finish response txid vid dhttxid vid node vid
224 | attempted to remove lock on removed entry will retry rmvver vid entry vid
225 | opened igfs output stream for file append igfsname vid path vid streamid vid ses vid
226 | undeployed class loader as there are no participating nodes vid
227 | partition states after afterexchange grp vid exchver vid states vid
228 | mbean for metric registry vid can t be created
229 | other nodes not found
230 | got removed entry while processing get response will not retry
231 | vid view information in a cluster
232 | failed to get entry version msg vid
233 | vid mapping type vid
234 | creating db table with index
235 | grid load balancing spi vid
236 | failed to read classpath resource vid
237 | vid used cache groups id to name vid
238 | received data load response vid nodeid vid res vid
239 | metric registry not found registry vid
240 | initialized connection with remote vid node nodeid vid rmtaddr vid
241 | received data load response vid
242 | received dht finish response txid vid dhttxid vid node vid
243 | write dump file vid
244 | started range vid pos vid
245 | started services deployment future init localnode vid
246 | discarding node failed message sent from node which is about to fail vid
247 | new coordinator sends request ver vid node vid
248 | failed to perform operation
249 | sending partition update to node because it left grid will ignore node vid msg vid
250 | failed to find node added message node vid
251 | idle_verify is still running processed vid of vid local partitions
252 | synchronization aftercompletion status_status vid
253 | skipping dump page history due to can not reserve wal segments vid
254 | skipping alive node vid
255 | completing future vid
256 | received incoming connection when already connected to this node
257 | localportrange addr vid rmtport vid
258 | finished restoring partition state for local groups groupsprocessed vid time vid ms
259 | created new meta with updated participants vid
260 | default values
261 | updated metadata on server node holder vid changedschemas vid
262 | acquired deployment class after verifying other class
263 | delay alive nodes change process max event threshold reached newevts vid totalevts vid
264 | client creation failed addr vid err vid
265 | put after update cache vid key vid val vid success vid
266 | coordinator received single message ver vid node vid allreceived vid
267 | failed to add candidates because entry was removed will renew
268 | vid addresses vid
269 | dfltpri
270 | undeployed class loader as there are no participating nodes vid
271 | check failed message has been ignored msg vid spistate vid
272 | failed to wait for metadata update typeid vid schemaid vid
273 | sending cache message msg vid node vid
274 | completing topology ready future right away head vid topver vid
275 | handling topology req vid
276 | configured session factory using file vid
277 | ignite_hostname_constraint has invalid pattern it will be ignore
278 | waiting for handshake rmtnode vid
279 | received handshake message rmtnode vid rcvcnt vid
280 | external collision notification to vid
281 | received shuffle ack desc vid msg vid
282 | failed to close incoming file vid
283 | ignoring response since task is already reducing or finishing res vid
284 | got removed entry in transaction getallasync method will retry vid
285 | notifying exchange future about to remote node
286 | store put key true val true tx false
287 | field not found vid
288 | failed to find future for get response sender vid res vid
289 | starting spi implementation vid
290 | exchange timings
291 | failed waiting while initialization is completed
292 | vid ping_interval vid
293 | failed to send global state response node left nodeid vid nodeid vid
294 | vid the subcommands that take vid as an arguments
295 | interrupted while waiting for consumer threads to shut down exiting uncleanly
296 | daemon node failed vid
297 | received incoming connection when already connected to this node rejecting locnode vid rmtnode vid
298 | skipping partition on recovery no page store or wal state grp vid p vid
299 | before acquiring transaction lock for put on keys vid
300 | ignore affinity for cache vid key vid val vid
301 | failed to get future result fut vid
302 | demo tcpserver stared
303 | failed to send unauthenticated message to node node vid err vid
304 | successfully bound shared memory communication to tcp port port vid lochost vid
305 | unregistered mbean vid
306 |
--------------------------------------------------------------------------------
/src/Baselines/LoGenText-Plus/results/1/ignite/translation.context.test.log:
--------------------------------------------------------------------------------
1 | Namespace(beam=8, cpu=False, data=['data-bin/context'], diverse_beam_groups=-1, diverse_beam_strength=0.5, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, gen_subset='test', lazy_load=False, left_pad_context='True', left_pad_source='True', left_pad_target='False', lenpen=2.5, log_format=None, log_interval=1000, match_source_len=False, max_context_positions=1024, max_len_a=0, max_len_b=100, max_sentences=32, max_source_positions=1024, max_target_positions=1024, max_tokens=None, memory_efficient_fp16=False, min_len=3.0, min_loss_scale=0.0001, model_overrides='{}', nbest=1, no_beamable_mm=False, no_early_stop=False, no_progress_bar=False, no_repeat_ngram_size=0, num_shards=1, num_workers=0, output='translations/1/ignite/translation.context.test.unsort', path='/home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/ignite/checkpoint_last.pt', prefix_size=0, print_alignment=False, quiet=True, raw_text=False, remove_bpe='@@ ', replace_unk=None, required_batch_size_multiple=8, results_path=None, sacrebleu=True, sampling=False, sampling_temperature=1, sampling_topk=-1, score_reference=False, seed=1, shard_id=0, skip_invalid_size_inputs_valid_test=True, source_lang=None, target_lang=None, task='translation_context', tensorboard_logdir='', threshold_loss_scale=None, unkpen=0, unnormalized=False, upsample_primary=1, user_dir=None)
2 | | [code] dictionary: 1080 types
3 | | [log] dictionary: 1080 types
4 | | data-bin/context test 305 examples
5 | | ['data-bin/context'] test 305 examples
6 | | loading model(s) from /home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/ignite/checkpoint_last.pt
7 | | Translated 305 sentences (4151 tokens) in 5.9s (52.01 sentences/s, 707.82 tokens/s)
8 | | Generate test with beam=8: BLEU = 28.81 50.7/32.9/25.8/20.4 (BP = 0.942 ratio = 0.943 hyp_len = 2482 ref_len = 2631)
9 |
--------------------------------------------------------------------------------
/src/Baselines/LoGenText-Plus/results/1/synapse/translation.context.test:
--------------------------------------------------------------------------------
1 | priorityexecutor undeployment of the entry named vid started
2 | connection without a pool something wrong need to fix
3 | priorityexecutor with name vid does not exist
4 | http connection vid output
5 | vid message for the vid dropped in the pre mediation state by the mandatory sequence n vid
6 | user id cannot be found
7 | connection closed by the client end while writing the response vid
8 | event source vid was removed from the synapse configuration successfully
9 | no resource is defined for location vid
10 | received to vid
11 | cannot find a datasource with name vid either in in in memory or jndi datasource repositories
12 | can not open a connection to the url with a path vid
13 | restoring the messageprocessor with name vid completed
14 | initializing child mediators of mediator vid
15 | synapse library import named vid has been deployed from file vid
16 | sequence vid has already been undeployed
17 | the file vid is not a valid soap11
18 | fail to create the condition in the given directory vid
19 | there is no secret for alias vid returning itself
20 | endpoint vid has been updated from the file vid
21 | http connection vid response vid
22 | deleting a job with name vid group vid
23 | endpoint vid has been deployed from file vid
24 | error opening key store vid
25 | loading trust keystore from vid
26 | synapse received a response message without a message id
27 | error while pipe vid shutting down listener
28 | directory vid is not writable
29 | registered mediator serializer vid for vid
30 | http connection vid closed
31 | message request received for the request message id vid
32 | synapse encountered an exception no error handlers sending fault
33 | received to vid
34 | created a error log vid
35 | http protocol error vid
36 | system may be unstable ioreactor encountered a checked exception vid
37 | error while closing the temporary file vid
38 | error occurred while shutting down jvm
39 | priorityexecutor vid has already been undeployed
40 | all transport threads and tasks are idle and no pending callbacks
41 | using http tuning parameter vid vid
42 | matching cher for the provided character sequence and the pattern vid
43 | localentry update from file vid has started
44 | can t send the out message sequence vid does not exist
45 | couldn t get the lock for processing the file vid
46 | initializing transport listener for request
47 | undeploying proxy service vid
48 | undeployment of the endpoint vid
49 | added mediators for vid
50 | configuring transport sender started
51 | sequence vid has been built from the file vid
52 | proxyservice named vid has been built from the
53 | initializing xar metadata
54 | setting a statistics stack on the message context
55 | start writing the hessian message to outputstream
56 | no secret repositories have been configured
57 | error resolving directory to move after processing vid
58 | initializing synapse in an already existing axis2 server instance
59 | sequence deployment from file vid completed
60 | pass through vid sender started
61 | message store deployment from file vid completed
62 | eventsource named vid has been built from the file vid
63 | received a continue response
64 | error in closing the input stream
65 | error while releasing the file vid
66 | connection closed by the target host while receiving request
67 | creating a secret repositories for given configuration
68 | priorityexecutor undeployment of the entry named vid started
69 | synapse timed out for the request with message id vid
70 | no beanstalk definitions found for initialization
71 | soapaction vid
72 | loading endpoints from vid
73 | server certificate validation trust has been disabled do not use
74 | amqp transport polling task started listen for service vid
75 | keep alive connection was closed by the client vid
76 | did not schedule the job vid job count is zero
77 | thread was interrupted while waiting to be destroying
78 | hot deployment has been suspended ignoring
79 | sequence vid has been updated from the file vid
80 | can t send the out message sequence vid does not exist
81 | messagestore named vid has been restored
82 | there is no private key in the given configuration
83 | graceful stop request completed in milliseconds
84 | error pausing transport sender
85 | localentry update from file vid has started
86 | one or more required fields are not found in the mgiven vid
87 | api named vid has been deployed from file vid
88 | endpoint deployment from file vid completed
89 | error opening key store vid
90 | deleting temporary file vid
91 | connection time out while writing the response vid
92 | template vid has been updated from the file vid
93 | memory cache is full unable to initialize the cache value
94 | unable to create ssl context with the given configuration
95 | initiating a file based secret repository
96 | the reconnection attempt number vid failed next re try will be after vid seconds
97 | keep alive connection was closed
98 | taskdescription cannot be found for name vid returning null
99 | registered mediator for extension vid
100 | unexpected exception encountered in targethandler
101 | cannot create a urlconnection for given url vid
102 | deployment of the synapse artifact from file vid started
103 | message processor deployment from file vid started
104 | destroying the synapsecallbackreceiver
105 | getting a datasource with name vid from the given configuration
106 | synapsesubscription failed sending fault response
107 | template deployment from file vid completed
108 | proxyservice named vid has been built from the file vid
109 | base64 decoding on input
110 | loading a file vid from classpath
111 | cannot open vid
112 | creating new taskderepositories
113 | startuptask named vid has been undeployed
114 | session with id vid is still live
115 | synapsesubscription failed sending fault response
116 | interrupted while building message for rest_url request
117 | the property vid with key vid target vid
118 | template task vid has already been undeployed
119 | priorityexecutor named vid has been deployed from file vid
120 | added mediator serializer vid for vid
121 | starting apache synapse
122 | vid listener started on vid port vid
123 | outgoing request counter rolled over for the session vid from vid
124 | encountered an i o error vid
125 | template vid has been built from the file vid
126 | restoring the messagestore with name vid started
127 | there are no statistics to be cleaned
128 | removing the session with the session id vid
129 | proxyservice deployment from proxy service vid started
130 | start replicating the property with key vid
131 | removed taskdescription vid
132 | setting the store type vid to vid
133 | you are using a persistent message queue you will be loosing messages which are on the queue
134 | restoring the messagestore with name vid completed
135 | cookies string vid
136 | loading a file vid from classpath
137 | initializing mediators of mediator vid
138 | priorityexecutor vid has been updated from the file vid
139 | retrieving task was interrupted
140 | loading synapse properties from the file vid
141 | synapse has decided to abort the message n vid
142 | creating session information for given session id vid
143 | expiring message id vid dropping message after global statistics
144 | crl taken from cache
145 | could not determine host name
146 | error while destroying the task vid
147 | loading trust keystore from vid
148 | endpoint vid has been updated from the file vid
149 | initializing synapsecallbackreceiver
150 | destroying pass through vid listener
151 | starting apache synapse
152 |
--------------------------------------------------------------------------------
/src/Baselines/LoGenText-Plus/results/1/synapse/translation.context.test.log:
--------------------------------------------------------------------------------
1 | Namespace(beam=8, cpu=False, data=['data-bin/context'], diverse_beam_groups=-1, diverse_beam_strength=0.5, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, gen_subset='test', lazy_load=False, left_pad_context='True', left_pad_source='True', left_pad_target='False', lenpen=2.5, log_format=None, log_interval=1000, match_source_len=False, max_context_positions=1024, max_len_a=0, max_len_b=100, max_sentences=32, max_source_positions=1024, max_target_positions=1024, max_tokens=None, memory_efficient_fp16=False, min_len=3.0, min_loss_scale=0.0001, model_overrides='{}', nbest=1, no_beamable_mm=False, no_early_stop=False, no_progress_bar=False, no_repeat_ngram_size=0, num_shards=1, num_workers=0, output='translations/1/synapse/translation.context.test.unsort', path='/home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/synapse/checkpoint_last.pt', prefix_size=0, print_alignment=False, quiet=True, raw_text=False, remove_bpe='@@ ', replace_unk=None, required_batch_size_multiple=8, results_path=None, sacrebleu=True, sampling=False, sampling_temperature=1, sampling_topk=-1, score_reference=False, seed=1, shard_id=0, skip_invalid_size_inputs_valid_test=True, source_lang=None, target_lang=None, task='translation_context', tensorboard_logdir='', threshold_loss_scale=None, unkpen=0, unnormalized=False, upsample_primary=1, user_dir=None)
2 | | [code] dictionary: 1080 types
3 | | [log] dictionary: 1080 types
4 | | data-bin/context test 151 examples
5 | | ['data-bin/context'] test 151 examples
6 | | loading model(s) from /home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/synapse/checkpoint_last.pt
7 | | Translated 151 sentences (1749 tokens) in 2.5s (59.55 sentences/s, 689.74 tokens/s)
8 | | Generate test with beam=8: BLEU = 37.85 55.9/41.1/34.0/29.0 (BP = 0.976 ratio = 0.976 hyp_len = 1078 ref_len = 1104)
9 |
--------------------------------------------------------------------------------
/src/Baselines/LoGenText-Plus/results/1/synapse/translation.context.test.unsort:
--------------------------------------------------------------------------------
1 | 99 unexpected exception encountered in targethandler
2 | 129 start replicating the property with key vid
3 | 109 loading a file vid from classpath
4 | 57 initializing synapse in an already existing axis2 server instance
5 | 102 message processor deployment from file vid started
6 | 135 loading a file vid from classpath
7 | 20 http connection vid response vid
8 | 66 creating a secret repositories for given configuration
9 | 89 deleting temporary file vid
10 | 125 restoring the messagestore with name vid started
11 | 0 priorityexecutor undeployment of the entry named vid started
12 | 67 priorityexecutor undeployment of the entry named vid started
13 | 3 http connection vid output
14 | 13 initializing child mediators of mediator vid
15 | 131 setting the store type vid to vid
16 | 29 http connection vid closed
17 | 30 message request received for the request message id vid
18 | 100 cannot create a urlconnection for given url vid
19 | 127 removing the session with the session id vid
20 | 7 event source vid was removed from the synapse configuration successfully
21 | 96 keep alive connection was closed
22 | 136 initializing mediators of mediator vid
23 | 34 http protocol error vid
24 | 46 undeploying proxy service vid
25 | 103 destroying the synapsecallbackreceiver
26 | 98 registered mediator for extension vid
27 | 47 undeployment of the endpoint vid
28 | 41 matching cher for the provided character sequence and the pattern vid
29 | 81 there is no private key in the given configuration
30 | 149 destroying pass through vid listener
31 | 35 system may be unstable ioreactor encountered a checked exception vid
32 | 150 starting apache synapse
33 | 62 received a continue response
34 | 77 hot deployment has been suspended ignoring
35 | 1 connection without a pool something wrong need to fix
36 | 16 the file vid is not a valid soap11
37 | 144 could not determine host name
38 | 110 cannot open vid
39 | 139 loading synapse properties from the file vid
40 | 52 initializing xar metadata
41 | 21 deleting a job with name vid group vid
42 | 97 taskdescription cannot be found for name vid returning null
43 | 130 removed taskdescription vid
44 | 76 thread was interrupted while waiting to be destroying
45 | 71 loading endpoints from vid
46 | 74 keep alive connection was closed by the client vid
47 | 8 no resource is defined for location vid
48 | 27 directory vid is not writable
49 | 145 error while destroying the task vid
50 | 128 proxyservice deployment from proxy service vid started
51 | 101 deployment of the synapse artifact from file vid started
52 | 54 start writing the hessian message to outputstream
53 | 141 creating session information for given session id vid
54 | 104 getting a datasource with name vid from the given configuration
55 | 148 initializing synapsecallbackreceiver
56 | 123 encountered an i o error vid
57 | 84 localentry update from file vid has started
58 | 73 amqp transport polling task started listen for service vid
59 | 42 localentry update from file vid has started
60 | 40 using http tuning parameter vid vid
61 | 37 error occurred while shutting down jvm
62 | 126 there are no statistics to be cleaned
63 | 49 configuring transport sender started
64 | 45 initializing transport listener for request
65 | 133 restoring the messagestore with name vid completed
66 | 112 startuptask named vid has been undeployed
67 | 82 graceful stop request completed in milliseconds
68 | 10 cannot find a datasource with name vid either in in in memory or jndi datasource repositories
69 | 87 endpoint deployment from file vid completed
70 | 44 couldn t get the lock for processing the file vid
71 | 6 connection closed by the client end while writing the response vid
72 | 60 message store deployment from file vid completed
73 | 53 setting a statistics stack on the message context
74 | 83 error pausing transport sender
75 | 18 there is no secret for alias vid returning itself
76 | 85 one or more required fields are not found in the mgiven vid
77 | 120 starting apache synapse
78 | 38 priorityexecutor vid has already been undeployed
79 | 55 no secret repositories have been configured
80 | 134 cookies string vid
81 | 143 crl taken from cache
82 | 108 base64 decoding on input
83 | 69 no beanstalk definitions found for initialization
84 | 26 error while pipe vid shutting down listener
85 | 119 added mediator serializer vid for vid
86 | 107 proxyservice named vid has been built from the file vid
87 | 132 you are using a persistent message queue you will be loosing messages which are on the queue
88 | 4 vid message for the vid dropped in the pre mediation state by the mandatory sequence n vid
89 | 48 added mediators for vid
90 | 17 fail to create the condition in the given directory vid
91 | 51 proxyservice named vid has been built from the
92 | 11 can not open a connection to the url with a path vid
93 | 61 eventsource named vid has been built from the file vid
94 | 70 soapaction vid
95 | 116 the property vid with key vid target vid
96 | 111 creating new taskderepositories
97 | 115 interrupted while building message for rest_url request
98 | 114 synapsesubscription failed sending fault response
99 | 23 error opening key store vid
100 | 56 error resolving directory to move after processing vid
101 | 75 did not schedule the job vid job count is zero
102 | 64 error while releasing the file vid
103 | 137 priorityexecutor vid has been updated from the file vid
104 | 28 registered mediator serializer vid for vid
105 | 147 endpoint vid has been updated from the file vid
106 | 142 expiring message id vid dropping message after global statistics
107 | 122 outgoing request counter rolled over for the session vid from vid
108 | 91 template vid has been updated from the file vid
109 | 146 loading trust keystore from vid
110 | 19 endpoint vid has been updated from the file vid
111 | 15 sequence vid has already been undeployed
112 | 5 user id cannot be found
113 | 105 synapsesubscription failed sending fault response
114 | 36 error while closing the temporary file vid
115 | 65 connection closed by the target host while receiving request
116 | 31 synapse encountered an exception no error handlers sending fault
117 | 95 the reconnection attempt number vid failed next re try will be after vid seconds
118 | 80 messagestore named vid has been restored
119 | 90 connection time out while writing the response vid
120 | 12 restoring the messageprocessor with name vid completed
121 | 118 priorityexecutor named vid has been deployed from file vid
122 | 22 endpoint vid has been deployed from file vid
123 | 86 api named vid has been deployed from file vid
124 | 124 template vid has been built from the file vid
125 | 138 retrieving task was interrupted
126 | 68 synapse timed out for the request with message id vid
127 | 58 sequence deployment from file vid completed
128 | 33 created a error log vid
129 | 79 can t send the out message sequence vid does not exist
130 | 32 received to vid
131 | 9 received to vid
132 | 94 initiating a file based secret repository
133 | 92 memory cache is full unable to initialize the cache value
134 | 43 can t send the out message sequence vid does not exist
135 | 93 unable to create ssl context with the given configuration
136 | 59 pass through vid sender started
137 | 88 error opening key store vid
138 | 72 server certificate validation trust has been disabled do not use
139 | 121 vid listener started on vid port vid
140 | 25 synapse received a response message without a message id
141 | 140 synapse has decided to abort the message n vid
142 | 24 loading trust keystore from vid
143 | 63 error in closing the input stream
144 | 113 session with id vid is still live
145 | 2 priorityexecutor with name vid does not exist
146 | 39 all transport threads and tasks are idle and no pending callbacks
147 | 50 sequence vid has been built from the file vid
148 | 117 template task vid has already been undeployed
149 | 78 sequence vid has been updated from the file vid
150 | 14 synapse library import named vid has been deployed from file vid
151 | 106 template deployment from file vid completed
152 |
--------------------------------------------------------------------------------
/src/Baselines/README.md:
--------------------------------------------------------------------------------
1 | # README
2 | We have open-sourced the code for the baselines we use: for publicly released models, we provide the code itself, and for API-based baselines, we provide the invocation scripts. Commercial plugins can only be invoked manually due to their usage restrictions.
3 |
4 | For each baseline you use, please make sure to cite the relevant paper.
5 |
--------------------------------------------------------------------------------
/src/Baselines/StarCoder/starcoder.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoModelForCausalLM, AutoTokenizer
2 | import re
3 | import os
4 | import tqdm
5 |
6 | path = './LogBench-O_prefix_1point'
7 | ground_truth_folder = './LogBench-O_prefix_1point'
8 | output_path= './StarCoder_LogBench-O_prefix_1point'
9 | FIM_INDICATOR = "<FILL_HERE>"
10 | FIM_PREFIX = "<fim_prefix>"
11 | FIM_MIDDLE = "<fim_middle>"
12 | FIM_SUFFIX = "<fim_suffix>"
13 |
14 | checkpoint = "bigcode/starcoder"
15 | device = "cuda"
16 | auth_token = os.environ.get("HF_TOKEN")  # Hugging Face access token, read from the HF_TOKEN environment variable rather than hard-coded
17 |
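# For each .java file, the script reads the target line number from its matching
# *_config.txt, marks that line with the FIM indicator, then asks StarCoder to
# fill in the missing logging statement and writes the completion to output_path.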
18 | # Check if output_path exists, if not, create it
19 | if not os.path.exists(output_path):
20 | os.makedirs(output_path)
21 |
22 |
23 | def insert_text_to_java_file(file_name, line_number):
24 | with open(file_name, 'r', encoding='utf-8') as file:
25 | lines = file.readlines()
26 | if line_number > len(lines):
27 |         print("out of range"); return  # bail out instead of indexing past the end of the file
28 |     lines[line_number - 1] = lines[line_number - 1].rstrip() + FIM_INDICATOR + '\n'
29 | with open(file_name, 'w', encoding='utf-8') as file:
30 | file.writelines(lines)
31 |
32 |
33 | def extract_numbers(s):
34 | return re.findall(r'\d+', s)
35 |
36 | def parse_directory(dir_path, ground_truth_folder):
37 | for filename in os.listdir(dir_path):
38 | file_path = os.path.join(dir_path, filename)
39 | if os.path.isfile(file_path) and file_path.endswith('.java'):
40 | ground_truth_path = os.path.join(ground_truth_folder, file_path.split('/')[-1][:-5] + '_config.txt')
41 | try:
42 | with open(ground_truth_path, 'r', encoding='utf-8') as f:
43 | lines = f.readlines()
44 | if len(lines) >= 1:
45 | line_number = int(extract_numbers(lines[0].strip(' ')[:-1])[0])
46 | insert_text_to_java_file(file_path, line_number)
47 | except FileNotFoundError:
48 | pass
49 | elif os.path.isdir(file_path):
50 | parse_directory(file_path, ground_truth_folder)
51 |
52 | parse_directory(path, ground_truth_folder)
53 |
54 | tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_auth_token=auth_token)
55 | model = AutoModelForCausalLM.from_pretrained(checkpoint, use_auth_token=auth_token).to(device)
56 |
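# Assemble a fill-in-the-middle prompt: the code before and after the indicator
# becomes the prefix and suffix, and the model generates the middle, i.e. the
# logging statement at the marked position.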
57 | def generate(input_text):
58 | if FIM_INDICATOR in input_text:
59 | try:
60 | prefix, suffix = input_text.split(FIM_INDICATOR)
61 |         except ValueError:  # raised if the indicator appears more than once
62 | raise ValueError(f"Only one {FIM_INDICATOR} allowed in prompt!")
63 | input_text = f"{FIM_PREFIX}{prefix}{FIM_SUFFIX}{suffix}{FIM_MIDDLE}"
64 |
65 |
66 | inputs = tokenizer(input_text, return_tensors="pt")
67 | inputs = {k: v.to(device) for k, v in inputs.items()}
68 | outputs = model.generate(
69 | input_ids=inputs["input_ids"],
70 | attention_mask=inputs["attention_mask"],
71 | max_length=1024,
72 | do_sample=True,
73 | pad_token_id=tokenizer.eos_token_id, # Set pad_token_id
74 | )
75 |     return tokenizer.decode(outputs[0])
76 |
77 | for filename in os.listdir(path):
78 | if filename.endswith(".java"):
79 | print(filename)
80 | input_file_path = os.path.join(path, filename)
81 |
82 | try:
83 | with open(input_file_path, 'r', encoding='utf-8') as file:
84 | file_content = file.read()
85 | example = f"'''\\\n{file_content}\n'''"
86 | processed_content = generate(example)
87 | output_file_path = os.path.join(output_path, filename)
88 | with open(output_file_path, 'w', encoding='utf-8') as output_file:
89 | output_file.write(f"{processed_content}\n")
90 | except Exception as e:
91 | print(f"Error processing file {filename}: {e}")
92 |
--------------------------------------------------------------------------------
/src/Baselines/WhichVar/analysis.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 23,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import json\n",
10 | "import numpy as np"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 9,
16 | "metadata": {},
17 | "outputs": [],
18 | "source": [
19 | "with open(\"output.json\", \"r\") as f:\n",
20 | " data_list = json.load(f)"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 10,
26 | "metadata": {},
27 | "outputs": [
28 | {
29 | "name": "stdout",
30 | "output_type": "stream",
31 | "text": [
32 | "{'code': 'private void handleAdjustPublishRate(Context ctx) throws Exception {\\n Double publishRate = mapper.readValue(ctx.body(), Double.class);', 'pred_variables': ['mapper', 'publishRate', 'body'], 'label_variables': ['publishRate']}\n"
33 | ]
34 | }
35 | ],
36 | "source": [
37 | "print(data_list[0])"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 20,
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "def precision_recall_f1(labels, predictions):\n",
47 | " true_positives = len(set(labels) & set(predictions))\n",
48 | " false_positives = len(set(predictions) - set(labels))\n",
49 | " false_negatives = len(set(labels) - set(predictions))\n",
50 | "\n",
51 | " precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) != 0.0 else 0.0\n",
52 | " recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) != 0.0 else 0.0\n",
53 | " f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0.0 else 0.0\n",
54 | "\n",
55 | " return precision, recall, f1"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": 22,
61 | "metadata": {},
62 | "outputs": [],
63 | "source": [
64 | "precs, recs, f1s = [], [], []\n",
65 | "for idx, data in enumerate(data_list):\n",
66 | " labels = data['label_variables']\n",
67 | "    predicts = data['pred_variables']\n",
68 | "    \n",
69 | "    # print(predicts, labels)\n",
70 | "    precision, recall, f1 = precision_recall_f1(labels, predicts)\n",
71 | " precs.append(precision)\n",
72 | " recs.append(recall)\n",
73 | " f1s.append(f1)"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": 25,
79 | "metadata": {},
80 | "outputs": [
81 | {
82 | "name": "stdout",
83 | "output_type": "stream",
84 | "text": [
85 | "0.5030762324986151\n",
86 | "0.6346379386090578\n",
87 | "0.5348833543779392\n"
88 | ]
89 | }
90 | ],
91 | "source": [
92 | "print(np.mean(precs))\n",
93 | "print(np.mean(recs))\n",
94 | "print(np.mean(f1s))"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": null,
100 | "metadata": {},
101 | "outputs": [],
102 | "source": []
103 | }
104 | ],
105 | "metadata": {
106 | "kernelspec": {
107 | "display_name": "myenv",
108 | "language": "python",
109 | "name": "python3"
110 | },
111 | "language_info": {
112 | "codemirror_mode": {
113 | "name": "ipython",
114 | "version": 3
115 | },
116 | "file_extension": ".py",
117 | "mimetype": "text/x-python",
118 | "name": "python",
119 | "nbconvert_exporter": "python",
120 | "pygments_lexer": "ipython3",
121 | "version": "3.7.13"
122 | }
123 | },
124 | "nbformat": 4,
125 | "nbformat_minor": 2
126 | }
127 |
--------------------------------------------------------------------------------
/src/Baselines/WhichVar/cleaner.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 4,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import os\n",
10 | "import json\n",
11 | "import re\n",
12 | "from collections import Counter\n",
13 | "\n",
14 | "regex = r\"(?i)(?:log(?:ger)?\\w*)\\s*\\.\\s*(?:log|error|info|warn|fatal|debug|trace|off|all)\\s*\\([^;]*\\)\"\n",
15 | "\n",
16 | "def process_directory(directory):\n",
17 | " for filename in os.listdir(directory):\n",
18 | " filepath = os.path.join(directory, filename)\n",
19 | " if os.path.isdir(filepath):\n",
20 | " process_directory(filepath)\n",
21 | " elif filename.endswith('.json'):\n",
22 | " process_file(filepath)\n",
23 | "\n",
24 | "def process_file(filepath):\n",
25 | " with open(filepath, 'r') as f:\n",
26 | " data = json.load(f)\n",
27 | " method_code = data.get('methodCode', '')\n",
28 | " log_variables = data.get('logVariables', [])\n",
29 | " \n",
30 | " for match in re.finditer(regex, method_code):\n",
31 | " logging_statement = match.group(0)\n",
32 | " \n",
33 | " if all(var in logging_statement for var in log_variables):\n",
34 | " start_index = match.start()\n",
35 | " line_count = Counter(method_code[:start_index])['\\n']\n",
36 | " start_line = max(0, line_count - 15)\n",
37 | " preceding_lines = method_code.split('\\n')[:start_line]\n",
38 | " start_index = len('\\n'.join(preceding_lines)) + 1 if preceding_lines else 0\n",
39 | " data['methodCode'] = method_code[start_index:match.end()]\n",
40 | " \n",
41 | " with open(filepath, 'w') as f:\n",
42 | " json.dump(data, f)\n",
43 | " break\n",
44 | "# ...\n",
45 | "\n",
46 | "process_directory('/Users/liyichen/data/')\n"
47 | ]
48 | },
49 | {
50 | "cell_type": "code",
51 | "execution_count": null,
52 | "metadata": {},
53 | "outputs": [],
54 | "source": []
55 | }
56 | ],
57 | "metadata": {
58 | "kernelspec": {
59 | "display_name": "Python 3",
60 | "language": "python",
61 | "name": "python3"
62 | },
63 | "language_info": {
64 | "codemirror_mode": {
65 | "name": "ipython",
66 | "version": 3
67 | },
68 | "file_extension": ".py",
69 | "mimetype": "text/x-python",
70 | "name": "python",
71 | "nbconvert_exporter": "python",
72 | "pygments_lexer": "ipython3",
73 | "version": "3.9.7"
74 | }
75 | },
76 | "nbformat": 4,
77 | "nbformat_minor": 2
78 | }
79 |
--------------------------------------------------------------------------------
/src/Baselines/WhichVar/model.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | import torch
3 | from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
4 | from torchtext.vocab import GloVe
5 | import json
6 | from torch.utils.data import DataLoader, Dataset, random_split
7 | import numpy as np
8 | from sklearn.metrics import precision_score, recall_score, f1_score
9 | import re
10 | import random
11 |
12 | def check_and_split_camel_case(s):
13 | if re.match(r'^[a-z]+([A-Z][a-z]*)*$', s):
14 | words = re.findall('[a-z]+|[A-Z][a-z]*', s)
15 | return "yes", words
16 | else:
17 | return "no", s
18 |
19 |
20 | def setup_seed(seed):
21 | if seed == -1:
22 | seed = random.randint(0, 1000)
23 | torch.manual_seed(seed)
24 | torch.cuda.manual_seed_all(seed)
25 | np.random.seed(seed)
26 | random.seed(seed)
27 | torch.backends.cudnn.deterministic = True
28 | torch.backends.cudnn.benchmark = False
29 | return seed
30 |
31 |
32 | class Model(nn.Module):
33 | def __init__(self, weight):
34 | super(Model, self).__init__()
35 | vocab_size = weight.shape[0]
36 | self.word_embed = nn.Embedding(num_embeddings=vocab_size+1, embedding_dim=weight.shape[-1])
37 | self.word_embed.weight.data[:vocab_size] = weight
38 | self.word_embed.weight.data[vocab_size] = torch.zeros(weight.shape[-1])
39 | self.word_embed.weight.requires_grad = False
40 |
41 | self.rnn = nn.LSTM(100, 128, num_layers=2, bidirectional=True, batch_first=True)
42 | self.num_heads = 4
43 | self.attention = nn.MultiheadAttention(embed_dim=256, num_heads=self.num_heads, batch_first=True)
44 |
45 | self.cls_layer = nn.Linear(256, 1, bias=False)
46 |
47 |
48 | def forward(self, sentences, lens):
49 |
50 | embeds = self.word_embed(sentences)
51 | outputs, _ = self.rnn(embeds)
52 |         attn_mask = torch.zeros((sentences.size(0) * self.num_heads, sentences.size(1), sentences.size(1)), device=sentences.device).bool()
53 |         for i, l in enumerate(lens):
54 |             for j in range(self.num_heads):
55 |                 attn_mask[i * self.num_heads + j, :l, :l] = True
56 | 
57 |         attention_embeds, _ = self.attention(outputs, outputs, outputs, attn_mask=None)  # note: the padding mask built above is not passed to the attention layer
58 | logits = self.cls_layer(attention_embeds).squeeze(dim=-1)
59 |
60 | return logits
61 |
62 | class SensDataSet(Dataset):
63 | def __init__(self, data, label):
64 | self.data = data
65 | self.label = label
66 |
67 | def __len__(self):
68 | return len(self.data)
69 |
70 | def __getitem__(self, idx):
71 | tuple_ = (self.data[idx], self.label[idx])
72 | return tuple_
73 |
74 |
75 | def collate_fn(data_tuple):
76 | # data_tuple.sort(key=lambda x: len(x[0]), reverse=True)
77 | data = [torch.LongTensor(sq[0]) for sq in data_tuple]
78 | label = [torch.Tensor(sq[1]) for sq in data_tuple]
79 | data_length = [len(sq) for sq in data]
80 | data = pad_sequence(data, batch_first=True)
81 | label = pad_sequence(label, batch_first=True)
82 | return data, label, data_length
83 |
84 |
85 | # def evaluate(model, test_dataloader, device):
86 | # acc = 0
87 | # n = 0
88 | # model.eval()
89 | # total_pred = []
90 | # total_label =[]
91 | # for batch_x, batch_y, batch_x_len in test_dataloader:
92 | # batch_x = batch_x.to(device)
93 | # batch_y = batch_y.to(device)
94 | # out = model(batch_x, batch_x_len)
95 | # predicts = (out > 0) + 0
96 | # for predict, label, length in zip(predicts, batch_y, batch_x_len):
97 | # total_pred.append(predict[:length])
98 | # total_label.append(label[:length])
99 | # total_pred = torch.cat(total_pred).cpu().numpy()
100 | # total_label = torch.cat(total_label).cpu().numpy()
101 |
102 | # precision = precision_score(total_label, total_pred)
103 | # recall = recall_score(total_label, total_pred)
104 | # f1 = f1_score(total_label, total_pred)
105 | # return {"precision" : precision, "recall" : recall, "f1" : f1}
106 |
107 |
108 | def evaluate(model, test_dataloader, device):
109 | model.eval()
110 | precision_list = []
111 | recall_list = []
112 | f1_list = []
113 | predicts_list = []
114 | for batch_x, batch_y, batch_x_len in test_dataloader:
115 | batch_x = batch_x.to(device)
116 | batch_y = batch_y.to(device)
117 | out = model(batch_x, batch_x_len)
118 | predicts = (out > 0) + 0
119 | batch_x = batch_x.cpu().numpy()
120 | batch_y = batch_y.cpu().numpy()
121 | predicts = predicts.cpu().numpy()
122 | for x, predict, label, length in zip(batch_x, predicts, batch_y, batch_x_len):
123 | # print(len(x), len(predict), len(label), length)
124 | x, predict, label = x[:length], predict[:length], label[:length]
125 |
126 | pred_1_set = set(x[predict == 1])
127 | pred_0_set = set(x) - pred_1_set
128 | label_1_set = set(x[label == 1])
129 | label_0_set = set(x) - label_1_set
130 | TP = len(label_1_set.intersection(pred_1_set))
131 | FN = len(label_1_set.intersection(pred_0_set))
132 | FP = len(label_0_set.intersection(pred_1_set))
133 | TN = len(label_0_set.intersection(pred_0_set))
134 | precision = TP / (TP + FP) if (TP + FP) != 0 else 0
135 | recall = TP / (TP + FN) if (TP + FN) != 0 else 0
136 | f1 = 2 * precision * recall / (precision + recall) if (precision + recall) != 0 else 0
137 | precision_list.append(precision)
138 | recall_list.append(recall)
139 | f1_list.append(f1)
140 | predicts_list.append(predict)
141 |
142 | precision = np.mean(precision_list)
143 | recall = np.mean(recall_list)
144 | f1 = np.mean(f1_list)
145 | # print(len(f1_list))
146 |
147 | return {"precision" : precision, "recall" : recall, "f1" : f1}, predicts_list
148 |
149 |
150 |
151 |
152 | if __name__ == '__main__':
153 | setup_seed(111)
154 | device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
155 | print(device)
156 | epochs = 0
157 | glove = GloVe(name='6B', dim=100)
158 | vocab_size = len(glove)
159 | with open('./train.json', 'r') as file:
160 | train_data = json.load(file)
161 | with open('./test.json', 'r') as file:
162 | test_data = json.load(file)
163 |
164 | train_sentences = train_data['input']
165 | train_sentences = [[glove.stoi[word] if word in glove.stoi.keys() else vocab_size for word in sentence] for sentence in train_sentences]
166 | train_labels = train_data['label']
167 |
168 | test_sentences = test_data['input']
169 | test_sentences = [[glove.stoi[word] if word in glove.stoi.keys() else vocab_size for word in sentence] for sentence in test_sentences]
170 | test_labels = test_data['label']
171 | # print(len(test_sentences))
172 | # train_size = int(0.8 * len(sentences))
173 | # train_sentences, test_sentences = sentences[:train_size], sentences[train_size:]
174 | # train_labels, test_labels = labels[:train_size], labels[train_size:]
175 |
176 | train_dataset = SensDataSet(data=train_sentences, label=train_labels)
177 | test_dataset = SensDataSet(data=test_sentences, label=test_labels)
178 |
179 | # train_dataset, test_dataset = random_split(dataset, [0.8, 0.2])
180 |
181 | train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
182 | test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)
183 |
184 | model = Model(weight=glove.vectors).to(device)
185 | optimizer = torch.optim.Adam(model.parameters(), lr=1e-03)
186 | loss_fun = nn.BCEWithLogitsLoss(reduction='none')
187 |
188 | for epoch in range(1, epochs + 1):
189 | model.train()
190 | total_loss = []
191 | for batch_id, (batch_x, batch_y, batch_x_len) in enumerate(train_loader):
192 | batch_x = batch_x.to(device)
193 | batch_y = batch_y.to(device)
194 | out = model(batch_x, batch_x_len)
195 | pos_mask=torch.zeros((batch_x.size(0), batch_x.size(1)), device=device).bool()
196 | for i, l in enumerate(batch_x_len):
197 | pos_mask[i][:l] = True
198 | loss = loss_fun(out, batch_y)[pos_mask].mean()
199 | optimizer.zero_grad()
200 | loss.backward()
201 | optimizer.step()
202 | total_loss.append(loss.item())
203 | print("epoch: {}/{}, loss={}".format(epoch, epochs, np.mean(total_loss)))
204 | result1, predicts_list1 = evaluate(model, train_loader, device)
205 | result2, predicts_list2 = evaluate(model, test_loader, device)
206 | print('result on train set: {}'.format(result1))
207 | print('result on test set: {}'.format(result2))
208 |
209 | torch.save(model.state_dict(), 'model/model.pth')
210 |
211 | model.load_state_dict(torch.load('model/model.pth'))
212 |
213 | result, predicts_list = evaluate(model, test_loader, device)
214 | print(len(predicts_list))
215 | test_cases = []
216 | for i in range(len(predicts_list)):
217 | codes = test_data['codes'][i]
218 | predict = predicts_list[i]
219 | # print(len(test_data['input'][i]), len(test_sentences[i]), len(predict))
220 | variables = list(set([test_data['input'][i][j] for j, v in enumerate(predict) if v == 1]))
221 | label_variables = test_data['variables'][i]
222 | output_data = {
223 | 'code': codes,
224 | 'pred_variables': variables,
225 | 'label_variables': label_variables
226 | }
227 | test_cases.append(output_data)
228 | json.dump(test_cases, open('output.json', 'w'), indent=4)
229 |
230 |
--------------------------------------------------------------------------------
/src/Baselines/lance/README.md:
--------------------------------------------------------------------------------
1 | # Using Deep Learning To Support Logging Activities
2 |
3 | We present LANCE (Log stAtemeNt reCommEnder), a DL-based approach for supporting the task of log statement generation and injection in Java. LANCE is built on the recently proposed Text-To-Text Transfer Transformer (T5) architecture.
4 |
5 |
6 | #### How to experiment with LANCE
7 |
8 |
9 | * ##### How to train a new SentencePiece Model
10 |
11 | Before training the [T5 small](https://github.com/google-research/text-to-text-transfer-transformer) model, namely the core of LANCE, it is important to also train a new tokenizer (SentencePiece model) to accommodate the expanded vocabulary introduced by the Java programming language. To do so, we used the raw pre-training instances (Java corpus) plus English sentences from the well-known C4 dataset.
12 |
13 | *Pythonic way*
14 |
15 | ```
16 | pip install sentencepiece==0.1.96
17 | import sentencepiece as spm
18 | spm.SentencePieceTrainer.train('--input=all_sp.txt --model_prefix=LOG_SP --vocab_size=32000 --bos_id=-1 --eos_id=1 --unk_id=2 --pad_id=0 --shuffle_input_sentence=true --character_coverage=1.0 --user_defined_symbols=“”')
19 | ```
20 |
21 | We also provide our trained tokenizer at: https://github.com/lance-log/lance/tree/main/Code
22 |
23 | * ##### Setup a Google Cloud Storage (GCS) Bucket
24 | To set up a new GCS bucket for training and fine-tuning a T5 model, please follow the original guide provided by Google: https://cloud.google.com/storage/docs/quickstart-console
25 |
26 |
27 | * ##### Datasets
28 |
29 | The datasets for pre-training, fine-tuning, validating and finally testing LANCE can be found at this link: https://drive.google.com/drive/folders/1D12y-CIJTYLxMeSmGQjxEXjTEzQImgaH?usp=sharing
30 |
31 | * ##### Pre-training/Fine-tuning
32 |
33 | To pre-train and then fine-tune LANCE, please use the following (a minimal fine-tuning sketch is also given at the end of this README):
34 | - Pre-Training
35 | - Fine-Tuning
36 |
37 |
38 |
39 | * ##### Models
40 | * Pre-trained on the tasks mixture (Multi-Task)
41 | * Pre-trained on LogSTMT only Task
42 | * Pre-trained on Denoise only Task
43 | * No Pre-trained
44 |
45 | * ##### Results: :open_file_folder:
46 | * Multi-Task
47 | * LogSTMT only Task
48 | * Denoising only Task
49 | * No Pre-trained
50 |
51 |
52 | * ##### Additional:
53 | Under Miscellaneous, you can find the additional script used for the data analysis and the exact hyper-parameter configuration we employed in the study.
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
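62 | * ##### Fine-tuning sketch (illustrative)
63 | 
64 | As a rough orientation only, and not the exact configuration used in the study, the snippet below shows what a fine-tuning run looks like with the same `t5.models.MtfModel` setup used in `src/Baselines/lance/lance.py`. It assumes the `log_injection` task has already been registered as in that script; the bucket paths, TPU address, and step count are placeholders.
65 | 
66 | ```
67 | import tensorflow.compat.v1 as tf
68 | import t5
69 | 
70 | # Resolve the TPU as in lance.py (placeholder address).
71 | tpu = tf.distribute.cluster_resolver.TPUClusterResolver("grpc://xx.xx.xx.xx")
72 | TPU_ADDRESS = tpu.get_master()
73 | 
74 | BASE_DIR = "gs://your-bucket"                     # placeholder GCS bucket
75 | PRETRAINED_DIR = BASE_DIR + "/multi-task_model"   # directory with the pre-trained checkpoint
76 | MODEL_DIR = BASE_DIR + "/finetuned_model"         # where fine-tuned checkpoints are written
77 | 
78 | model = t5.models.MtfModel(
79 |     model_dir=MODEL_DIR,
80 |     tpu=TPU_ADDRESS,
81 |     tpu_topology="2x2",
82 |     model_parallelism=1,
83 |     batch_size=128,
84 |     sequence_length={"inputs": 512, "targets": 512},
85 |     learning_rate_schedule=0.003,                 # constant LR; swap in another schedule if needed
86 |     save_checkpoints_steps=5000,
87 |     keep_checkpoint_max=16,
88 |     iterations_per_loop=100,
89 | )
90 | 
91 | # Fine-tune the registered "log_injection" task starting from the pre-trained checkpoint.
92 | model.finetune(
93 |     mixture_or_task_name="log_injection",
94 |     finetune_steps=200000,
95 |     pretrained_model_dir=PRETRAINED_DIR,
96 | )
97 | ```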
--------------------------------------------------------------------------------
/src/Baselines/lance/lance.py:
--------------------------------------------------------------------------------
1 | import functools
2 | import os
3 | import gin
4 | import tensorflow.compat.v1 as tf
5 | import tensorflow_datasets as tfds
6 | from contextlib import contextmanager
7 | import logging as py_logging
8 | import t5
9 | from t5.data import postprocessors as t5_postprocessors
10 | from t5.seqio import Feature,SentencePieceVocabulary
11 | from mesh_tensorflow.transformer.learning_rate_schedules import slanted_triangular
12 | from mesh_tensorflow.transformer.learning_rate_schedules import truncated_rsqrt
13 | from tensorflow.keras.optimizers.schedules import PolynomialDecay
14 | from t5 import models
15 |
16 | BASE_DIR = "gs://xxxx" #@param { type: "string" }
17 | TPU_TOPOLOGY = "2x2"
18 | tpu = tf.distribute.cluster_resolver.TPUClusterResolver("grpc://xx.xx.xx.xx") # TPU detection
19 | TPU_ADDRESS = tpu.get_master()
20 | tf.disable_v2_behavior()
21 | tf.get_logger().propagate = False
22 | py_logging.root.setLevel('INFO')
23 |
24 | @contextmanager
25 | def tf_verbosity_level(level):
26 | og_level = tf.logging.get_verbosity()
27 | tf.logging.set_verbosity(level)
28 | yield
29 | tf.logging.set_verbosity(og_level)
30 |
31 | path_finetuning = BASE_DIR + '/datasets/Fine-tuning/train.tsv' #@param { type: "string" }
32 | path_eval = BASE_DIR + '/datasets/Fine-tuning/eval.tsv' #@param { type: "string" }
33 | path_test = BASE_DIR + '/datasets/Fine-tuning/test.tsv' #@param { type: "string" }
34 |
35 | nq_tsv_path = {
36 | "train": path_finetuning,
37 | "validation": path_test
38 | }
39 |
40 | num_nq_examples = dict(train=106382, validation=12020)
41 |
42 | vocab_model_path = BASE_DIR + '/Code/SP_LOG.model' #@param { type: "string" }
43 | vocab_path = BASE_DIR + '/Code/SP_LOG.vocab' #@param { type: "string" }
44 |
45 |
46 | TaskRegistry = t5.data.TaskRegistry
47 | TfdsTask = t5.data.TfdsTask
48 |
49 |
50 | def get_default_vocabulary():
51 | return SentencePieceVocabulary(vocab_model_path, 100)
52 |
53 | DEFAULT_OUTPUT_FEATURES = {
54 | "inputs": Feature(
55 | vocabulary=get_default_vocabulary(), add_eos=True, required=False),
56 |
57 | "targets": Feature(
58 | vocabulary=get_default_vocabulary(), add_eos=True)
59 | }
60 |
61 | def nq_dataset_task(split, shuffle_files=True):
62 | # We only have one file for each split.
63 | del shuffle_files
64 |
65 | # Load lines from the text file as examples.
66 |
67 | ds = tf.data.TextLineDataset(nq_tsv_path[split])
68 | ds = ds.map(
69 | functools.partial(tf.io.decode_csv, record_defaults=["string","string"],
70 | field_delim="\t", use_quote_delim=True),
71 | num_parallel_calls=tf.data.experimental.AUTOTUNE)
72 |
73 | ds = ds.map(lambda *ex: dict(zip(["input", "output"], ex)))
74 | return ds
75 |
76 | print("A few raw train examples...")
77 | for ex in tfds.as_numpy(nq_dataset_task("train").take(5)):
78 | print(ex)
79 |
80 | def preprocessing(ds):
81 |
82 | def to_inputs_and_targets(ex):
83 | x_input = tf.strings.strip(ex['input'])
84 | y_label = tf.strings.strip(ex['output'])
85 | inputs = tf.strings.join([x_input], separator=' ')
86 | class_label = tf.strings.join([y_label], separator=' ')
87 | return {'inputs': inputs, 'targets': class_label}
88 |
89 | return ds.map(to_inputs_and_targets,
90 | num_parallel_calls=tf.data.experimental.AUTOTUNE)
91 |
92 | t5.data.TaskRegistry.remove('log_injection')
93 | t5.data.TaskRegistry.add(
94 | "log_injection",
95 | dataset_fn=nq_dataset_task,
96 | splits=["train","validation"],
97 | text_preprocessor=[preprocessing],
98 | output_features = DEFAULT_OUTPUT_FEATURES,
99 | metric_fns=[t5.evaluation.metrics.accuracy],
100 | num_input_examples=num_nq_examples
101 | )
102 |
103 | nq_task = t5.data.TaskRegistry.get("log_injection")
104 | ds = nq_task.get_dataset(split="train", sequence_length={"inputs": 512, "targets": 512})
105 | print("A few preprocessed training examples...")
106 | for ex in tfds.as_numpy(ds.take(5)):
107 | print(ex)
108 |
109 | starter_learning_rate = 0.01
110 | end_learning_rate = 0.001
111 | decay_steps = 10000
112 |
113 | learning_rate_fn = PolynomialDecay(
114 | starter_learning_rate,
115 | decay_steps,
116 | end_learning_rate,
117 | power=0.5)
118 |
119 | MODEL_SIZE = "small"
120 |
121 | MODEL_DIR = BASE_DIR + '/modeltest/'#@param { type: "string" }
122 |
123 | PRETRAINED_DIR=BASE_DIR + '/denoising_task_model/'#@param { type: "string" }
124 |
125 |
126 | model_parallelism, train_batch_size, keep_checkpoint_max = {
127 | "small": (1, 128, 16),
128 | "base": (2, 128, 8),
129 | "large": (8, 64, 4),
130 | "3B": (8, 16, 1),
131 | "11B": (8, 16, 1)}[MODEL_SIZE]
132 |
133 | tf.io.gfile.makedirs(MODEL_DIR)
134 |
135 | model = t5.models.MtfModel(
136 | model_dir=PRETRAINED_DIR,
137 | tpu=TPU_ADDRESS,
138 | #tpu_job_name="node-1",
139 | #tpu_zone="us-central1-f",
140 | #gcp_project="lance",
141 | tpu_topology=TPU_TOPOLOGY,
142 | model_parallelism=model_parallelism,
143 | batch_size=train_batch_size,
144 | learning_rate_schedule = learning_rate_fn, #pick the correct scheduler, according to the model you want to train
145 | sequence_length={"inputs": 512, "targets": 512},
146 | save_checkpoints_steps=5000,
147 | keep_checkpoint_max=keep_checkpoint_max,
148 | iterations_per_loop=100,
149 | )
150 |
151 | PATH_GIN_FILE_NO_PT = BASE_DIR + '/Configs/no_pretraining_operative_config.gin'
152 | PATH_GIN_FILE_MT = BASE_DIR + '/Configs/multi-task_operative_config.gin'
153 | PATH_GIN_FILE_DENOISE = BASE_DIR + '/Configs/denoise_only_operative_config.gin'
154 | PATH_GIN_FILE_LOG_STMT = BASE_DIR + '/Configs/log_stmt_only_operative_config.gin'
155 |
156 | #with gin.unlock_config():
157 | # gin.parse_config_file(PATH_GIN_FILE_DENOISE)
158 | # #RUN FINE-TUNING
159 | # TRAIN_STEPS = 200000
160 | # model.finetune(mixture_or_task_name="log_injection",
161 | # finetune_steps=TRAIN_STEPS,
162 | # pretrained_model_dir=PRETRAINED_DIR)
163 |
164 | # If the no-pretraining experiment is the one you want to run, then, uncomment the following and comment model.finetune
165 | # Also, make sure to upload the slanted_operative.gin
166 | #model.train("log_injection", TRAIN_STEPS)
167 | #model.batch_size=32
168 | #model.eval(
169 | #mixture_or_task_name="log_injection",
170 | #checkpoint_steps=-1
171 | #)
172 | #dataset_list = ["cassandra","elasticsearch","flink","hbase","wicket","zookeeper"]
173 | dataset_list = ['logstudy']
174 | for item in dataset_list:
175 | model.batch_size = 256
176 | input_file = BASE_DIR + f'/datasets/logr_input/lance_function_transformed.txt'#@param { type: "string" }
177 | output_file = BASE_DIR+ f'/datasets/logr_input/lance_function_transformed_result.txt'#@param { type: "string" }
178 | model.predict(input_file, output_file, checkpoint_steps=-1, vocabulary=get_default_vocabulary())
--------------------------------------------------------------------------------
/src/CodeTransformer/README.md:
--------------------------------------------------------------------------------
1 | # CodeTransformer
2 |
3 | We provide only the compiled version for use throughout the review process. We will make the source code available after the paper has been accepted.
4 |
--------------------------------------------------------------------------------
/src/DataCollector/check_pom.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import re
3 | import xml.etree.ElementTree as ET
4 | import os
5 | import base64
6 | import shutil
7 | from github import Github
8 | from github import GithubException
9 |
10 | def check_string_in_file(file_path, search_str="log4j"):
11 | with open(file_path, 'r') as file:
12 | content = file.read()
13 |
14 | if "log4j" in content.lower() or "slf4j" in content.lower():
15 | return True
16 | else:
17 | return False
18 |
19 | def check_log4j(pom_file_path):
20 | # Parse the POM file as XML
21 | try:
22 | # Parse XML file
23 | tree = ET.parse(pom_file_path)
24 | root = tree.getroot()
25 |
26 | # Define the Log4j dependency artifact details
27 | group_id = 'org.apache.logging.log4j'
28 | artifact_id = 'log4j-core'
29 |
30 | # Iterate over the dependency elements in the POM file and check for the Log4j dependency
31 | for dependency in root.findall('.//{http://maven.apache.org/POM/4.0.0}dependency'):
32 | # Retrieve the group ID and artifact ID of the dependency
33 | dep_group_id = dependency.find('.//{http://maven.apache.org/POM/4.0.0}groupId')
34 | dep_artifact_id = dependency.find('.//{http://maven.apache.org/POM/4.0.0}artifactId')
35 | if dep_group_id is not None and dep_artifact_id is not None:
36 | dep_group_id, dep_artifact_id = dep_group_id.text, dep_artifact_id.text
37 | # Check if the dependency is the Log4j dependency
38 | if dep_group_id == group_id and dep_artifact_id == artifact_id:
39 | print(f'The POM file {pom_file_path} features the Log4j dependency')
40 | return True
41 |
42 | except ET.ParseError as e:
43 | # Handle XML parsing exception
44 | print('Error parsing XML file:', e)
45 |
46 | print(f'The POM file {pom_file_path} does not feature the Log4j dependency')
47 | return False
48 |
49 |
50 | def get_sha_for_tag(repository, tag):
51 | """
52 | Returns a commit PyGithub object for the specified repository and tag.
53 | """
54 | branches = repository.get_branches()
55 | matched_branches = [match for match in branches if match.name == tag]
56 | if matched_branches:
57 | return matched_branches[0].commit.sha
58 |
59 | tags = repository.get_tags()
60 | matched_tags = [match for match in tags if match.name == tag]
61 | if not matched_tags:
62 | print("No Tag or Branch exists with that name")
63 | return None
64 | return matched_tags[0].commit.sha
65 |
66 |
67 | def download_file(git, sha, repo, path):
68 | try:
69 | file_content = git.get_contents(path, ref=sha)
70 | file_data = base64.b64decode(file_content.content)
71 | directory_path, _ = os.path.split(path)
72 | if not os.path.exists(f"repos/{repo}/{directory_path}"):
73 | os.makedirs(f"repos/{repo}/{directory_path}", exist_ok=True)
74 | file_out = open(f"repos/{repo}/{path}", "wb")
75 | file_out.write(file_data)
76 | file_out.close()
77 | except (GithubException, IOError) as exc:
78 |         print('Error processing %s: %s' % (path, exc))
79 |
80 | def check_repo(owner, repo, branch="master"):
81 | # Define the Github Tree API endpoint and repository details
82 | api_url = 'https://api.github.com/repos/{owner}/{repo}/git/trees/{branch}?recursive=1'
83 | # Make an HTTP GET request to the Github Tree API endpoint
84 | access_token = ""
85 | headers = {'Authorization': f'token {access_token}'}
86 | if not os.path.exists(f"repos/{repo}/"):
87 | os.makedirs(f"repos/{repo}/")
88 | #print(f"./{repo}/ created")
89 |
90 |     git = Github(access_token)  # use the access token defined above
91 | git_repo = git.get_repo(f"{owner}/{repo}")
92 | sha = get_sha_for_tag(git_repo, branch)
93 | # Parse the response data as JSON
94 | response = requests.get(api_url.format(owner=owner, repo=repo, branch=branch), headers=headers)
95 | data = response.json()
96 | contain_pom = False
97 | if sha is not None:
98 | for item in data['tree']:
99 | if re.search("pom.xml", item['path'], re.IGNORECASE):
100 | download_file(git_repo, sha, repo, item['path'])
101 | if check_log4j(f"repos/{repo}/{item['path']}"):
102 | contain_pom = True
103 | break
104 | else:
105 | os.remove(f"repos/{repo}/{item['path']}")
106 | print(f"{owner}/{repo} pom checking result: ", contain_pom)
107 | shutil.rmtree(f"repos/{repo}/")
108 | return contain_pom
109 | # # Iterate over the file and directory objects in the response
110 | # for item in data['tree']:
111 | # # Retrieve the file path and type
112 | # path, type = item['path'], item['type']
113 |
114 | # # If the item is a file, retrieve the raw content using the 'url' property
115 | # if type == 'blob':
116 | # file_url = item['url']
117 | # file_response = requests.get(file_url)
118 | # file_data = file_response.content
119 |
120 | # # Process the file content as needed
121 | # print(f'File: {path}')
122 | # #print(file_data)
123 | # else:
124 | # # Process directories or other items as needed
125 | # print(f'Directory: {path}')
126 | # github.com/davidb/scala-maven-plugin
127 |
128 | def check_repo_root(owner, repo, access_token, branch="master"):
129 | # Define the Github Tree API endpoint and repository details
130 | #api_url = 'https://api.github.com/repos/{owner}/{repo}/git/trees/{branch}?recursive=1'
131 | # Make an HTTP GET request to the Github Tree API endpoint
132 | #headers = {'Authorization': f'token {access_token}'}
133 | if not os.path.exists(f"repos/{repo}/"):
134 | os.makedirs(f"repos/{repo}/")
135 | #print(f"./{repo}/ created")
136 |
137 | git = Github(access_token)
138 | try:
139 | git_repo = git.get_repo(f"{owner}/{repo}")
140 | except GithubException as e:
141 | if e.status == 404:
142 | print("non")
143 | else:
144 | print("error")
145 | shutil.rmtree(f"repos/{repo}/")
146 | return False
147 |
148 | sha = get_sha_for_tag(git_repo, branch)
149 | # Parse the response data as JSON
150 | contain_pom = False
151 | if sha is not None:
152 | contents = git_repo.get_dir_contents(".", ref=sha)
153 | for content in contents:
154 | if content.type == "file" and content.path == "pom.xml":
155 | download_file(git_repo, sha, repo, content.path)
156 | if check_log4j(f"repos/{repo}/{content.path}") or check_string_in_file(f"repos/{repo}/{content.path}"):
157 | contain_pom = True
158 | break
159 |
160 | shutil.rmtree(f"repos/{repo}/")
161 | print(f"{owner}/{repo} pom checking result: ", contain_pom)
162 | return contain_pom
163 |
164 | #check_repo("davidb", "scala-maven-plugin")
--------------------------------------------------------------------------------
/src/DataCollector/download.py:
--------------------------------------------------------------------------------
1 | from get_java import download_java
2 |
3 | key = ""  # GitHub access token
4 |
5 | with open("1.txt", "r") as file:
6 | for line in file:
7 | repo_list = line.split()
8 | owner, repo, branch = repo_list[1], repo_list[2], repo_list[3]
9 | print(f"{repo_list[0]} repo: {owner} {repo} {branch}")
10 | Done = False
11 | with open("result1.txt", "r") as f:
12 | content = f.read()
13 | if owner in content and repo in content:
14 | Done = True
15 | if Done:
16 | print("Already Done!")
17 | continue
18 | cnt1, cnt2 = download_java(owner, repo, key, branch)
19 | with open("result1.txt", "a") as f:
20 | f.write(f"{repo_list[0]} {owner}/{repo} downloaded: {cnt1}/{cnt1+cnt2} files\n")
--------------------------------------------------------------------------------
/src/DataCollector/filter_pom.py:
--------------------------------------------------------------------------------
1 | import json
2 | import subprocess
3 | from check_pom import check_repo_root
4 | from tqdm import tqdm
5 |
6 | with open("results.json", encoding='latin1') as rf:
7 | repos = json.load(rf)
8 |
9 |
10 | key = ""  # GitHub access token
11 |
12 | repos = repos['items']
13 | #check_repo_root("nysenate", "openlegislation", "dev")
14 | end_point = len(repos)
15 |
16 | with open("result.txt", "a") as f:
17 | for i in range(17, end_point):
18 | repo_item = repos[i]
19 | branch = repo_item['defaultBranch']
20 | owner, repo = repo_item['name'].split('/')
21 | print(f"\n{i}-{end_point}/{len(repos)} repo: {owner} {repo} {branch}\n")
22 | if check_repo_root(owner, repo, key, branch):
23 | f.write(f"{i} {owner} {repo} {branch}\n")
24 | f.flush()
25 |
--------------------------------------------------------------------------------
/src/DataCollector/get_java.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import re
3 | import xml.etree.ElementTree as ET
4 | import os
5 | import base64
6 | import shutil
7 | from tqdm import tqdm
8 | from github import Github
9 | from github import GithubException
10 | import subprocess
11 | import time
12 |
13 | pattern = r"(?im)log.*\.(log|error|info|warn|fatal|debug|trace|off|all)\(.*\)"
14 | regex = re.compile(pattern, re.DOTALL)
15 |
16 |
17 | def git_clone(owner, repo):
18 | max_attempts = 5
19 | retry_wait_time = 5 # in seconds
20 |
21 | git_url = f"https://github.com/{owner}/{repo}.git"
22 | local_path = f"./temp/{repo}"
23 | cmd = ["git", "clone", git_url, local_path]
24 |
25 | for i in range(max_attempts):
26 | try:
27 | subprocess.check_call(cmd)
28 | print("Git clone successful!")
29 | break
30 | except subprocess.CalledProcessError as e:
31 | print(f"Git clone attempt {i + 1} failed with error code {e.returncode}. Retrying in {retry_wait_time} seconds...")
32 | time.sleep(retry_wait_time)
33 | else:
34 | print(f"Git clone failed after {max_attempts} attempts.")
35 |
36 |
37 | def get_sha_for_tag(repository, tag):
38 | """
39 | Returns a commit PyGithub object for the specified repository and tag.
40 | """
41 | branches = repository.get_branches()
42 | matched_branches = [match for match in branches if match.name == tag]
43 | if matched_branches:
44 | return matched_branches[0].commit.sha
45 |
46 | tags = repository.get_tags()
47 | matched_tags = [match for match in tags if match.name == tag]
48 | if not matched_tags:
49 | print("No Tag or Branch exists with that name")
50 | return None
51 | return matched_tags[0].commit.sha
52 |
53 |
54 | def check_java(path):
55 | try:
56 | with open(path, 'r') as file:
57 | content = file.read()
58 | words = content.split()
59 | if len(words) > 300:
60 | return False
61 | lines = content.split('\n')
62 | if len(lines) > 300:
63 | return False
64 | match = regex.search(content)
65 | if match:
66 | return True
67 | except UnicodeDecodeError as e:
68 | print(f"Error: {e} and Path: {path}")
69 | return False
70 | return False
71 |
72 |
73 | def download_java_file(git, sha, repo, path):
74 | try:
75 | file_content = git.get_contents(path, ref=sha)
76 | _, file_name = os.path.split(path)
77 | file_data = base64.b64decode(file_content.content)
78 | file_out = open(f"repos/{repo}/{file_name}", "wb")
79 | file_out.write(file_data)
80 | file_out.close()
81 | if check_java(f"repos/{repo}/{file_name}") == False:
82 | os.remove(f"repos/{repo}/{file_name}")
83 | return 0
84 | return 1
85 | except (GithubException, IOError) as exc:
86 |         print('Error processing %s: %s' % (path, exc))
87 | return 0
88 |
89 |
90 | def download_java(owner, repo, access_token, branch="master"):
91 | if not os.path.exists(f"repos/{repo}/"):
92 | os.makedirs(f"repos/{repo}/")
93 |
94 | git = Github(access_token)
95 | try:
96 | git_repo = git.get_repo(f"{owner}/{repo}")
97 | except GithubException as e:
98 | if e.status == 404:
99 | print("Non")
100 | else:
101 | print("Error")
102 | shutil.rmtree(f"repos/{repo}/")
103 | return False
104 | sha = get_sha_for_tag(git_repo, branch)
105 |
106 | # Define the Github Tree API endpoint and repository details
107 | api_url = 'https://api.github.com/repos/{owner}/{repo}/git/trees/{branch}?recursive=1'
108 | # Make an HTTP GET request to the Github Tree API endpoint
109 | headers = {'Authorization': f'token {access_token}'}
110 | response = requests.get(api_url.format(owner=owner, repo=repo, branch=branch), headers=headers)
111 | data = response.json()
112 | cnt1, cnt2 = 0, 0
113 | print(git_repo.size)
114 | if git_repo.size < 500000000:
115 | git_clone(owner, repo)
116 | for subdir, dirs, files in os.walk(f"./temp/{repo}"):
117 | for file in tqdm(files):
118 | if not file.endswith(".java"):
119 | continue
120 | file_path = os.path.join(subdir, file)
121 | if os.path.getsize(file_path) < 15 * 1024:
122 | cnt2 += 1
123 | if check_java(file_path):
124 | cnt1 += 1
125 | shutil.copy2(file_path, f"repos/{repo}/{file}")
126 | shutil.rmtree(f"./temp/{repo}")
127 | else:
128 |         print("Repository is too large; falling back to per-file download via the GitHub API")
129 | if sha is not None:
130 | tree = data['tree']
131 | leng = len(tree)
132 | for file in tqdm(tree):
133 | #for item in tqdm(tree):
134 |                 if file['type'] != "tree" and file['size'] < 15 * 1024 and file['path'].endswith(".java"):
135 |                     # cnt1 counts files actually kept; cnt2 counts all candidate .java files
136 |                     cnt1 += download_java_file(git_repo, sha, repo, file['path'])
137 |                     cnt2 += 1
138 | if cnt1 == 0:
139 | shutil.rmtree(f"repos/{repo}/")
140 | print(f"{owner}/{repo} downloaded: {cnt1}/{cnt1+cnt2} files")
141 | return cnt1, cnt2
142 |
--------------------------------------------------------------------------------
/src/unixcoder.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Microsoft Corporation.
2 | # Licensed under the MIT license.
3 |
4 | import torch
5 | import torch.nn as nn
6 | from transformers import RobertaTokenizer, RobertaModel, RobertaConfig
7 |
8 | class UniXcoder(nn.Module):
9 | def __init__(self, model_name):
10 | """
11 | Build UniXcoder.
12 |
13 | Parameters:
14 |
15 | * `model_name`- huggingface model card name. e.g. microsoft/unixcoder-base
16 | """
17 | super(UniXcoder, self).__init__()
18 | self.tokenizer = RobertaTokenizer.from_pretrained(model_name)
19 | self.config = RobertaConfig.from_pretrained(model_name)
20 | self.config.is_decoder = True
21 | self.model = RobertaModel.from_pretrained(model_name, config=self.config)
22 |
23 | self.register_buffer("bias", torch.tril(torch.ones((1024, 1024), dtype=torch.uint8)).view(1,1024, 1024))
24 | self.lm_head = nn.Linear(self.config.hidden_size, self.config.vocab_size, bias=False)
25 | self.lm_head.weight = self.model.embeddings.word_embeddings.weight
26 | self.lsm = nn.LogSoftmax(dim=-1)
27 |
28 |         self.tokenizer.add_tokens(["<mask0>"],special_tokens=True)
29 |
30 |     def tokenize(self, inputs, mode="<encoder-only>", max_length=512, padding=False):
31 | """
32 | Convert string to token ids
33 |
34 | Parameters:
35 |
36 | * `inputs`- list of input strings.
37 | * `max_length`- The maximum total source sequence length after tokenization.
38 | * `padding`- whether to pad source sequence length to max_length.
39 |         * `mode`- which mode the sequence will use. i.e. <encoder-only>, <decoder-only>, <encoder-decoder>
40 |         """
41 |         assert mode in ["<encoder-only>", "<decoder-only>", "<encoder-decoder>"]
42 | assert max_length < 1024
43 |
44 | tokenizer = self.tokenizer
45 |
46 | tokens_ids = []
47 | for x in inputs:
48 | tokens = tokenizer.tokenize(x)
49 |             if mode == "<encoder-only>":
50 | tokens = tokens[:max_length-4]
51 | tokens = [tokenizer.cls_token,mode,tokenizer.sep_token] + tokens + [tokenizer.sep_token]
52 |             elif mode == "<decoder-only>":
53 | tokens = tokens[-(max_length-3):]
54 | tokens = [tokenizer.cls_token,mode,tokenizer.sep_token] + tokens
55 | else:
56 | tokens = tokens[:max_length-5]
57 | tokens = [tokenizer.cls_token,mode,tokenizer.sep_token] + tokens + [tokenizer.sep_token]
58 |
59 | tokens_id = tokenizer.convert_tokens_to_ids(tokens)
60 | if padding:
61 | tokens_id = tokens_id + [self.config.pad_token_id] * (max_length-len(tokens_id))
62 | tokens_ids.append(tokens_id)
63 | return tokens_ids
64 |
65 | def decode(self, source_ids):
66 | """ Convert token ids to string """
67 | predictions = []
68 | for x in source_ids:
69 | prediction = []
70 | for y in x:
71 | t = y.cpu().numpy()
72 | t = list(t)
73 | if 0 in t:
74 | t = t[:t.index(0)]
75 | text = self.tokenizer.decode(t,clean_up_tokenization_spaces=False)
76 | prediction.append(text)
77 | predictions.append(prediction)
78 | return predictions
79 |
80 | def forward(self, source_ids):
81 | """ Obtain token embeddings and sentence embeddings """
82 | mask = source_ids.ne(self.config.pad_token_id)
83 | token_embeddings = self.model(source_ids,attention_mask = mask.unsqueeze(1) * mask.unsqueeze(2))[0]
84 | sentence_embeddings = (token_embeddings * mask.unsqueeze(-1)).sum(1) / mask.sum(-1).unsqueeze(-1)
85 | return token_embeddings, sentence_embeddings
86 |
87 | def generate(self, source_ids, decoder_only = True, eos_id = None, beam_size = 5, max_length = 64):
88 | """ Generate sequence given context (source_ids) """
89 |
90 |         # Set encoder mask attention matrix: bidirectional for <encoder-decoder>, unidirectional for <decoder-only>
91 | if decoder_only:
92 | mask = self.bias[:,:source_ids.size(-1),:source_ids.size(-1)]
93 | else:
94 | mask = source_ids.ne(self.config.pad_token_id)
95 | mask = mask.unsqueeze(1) * mask.unsqueeze(2)
96 |
97 | if eos_id is None:
98 | eos_id = self.config.eos_token_id
99 |
100 | device = source_ids.device
101 |
102 | # Decoding using beam search
103 | preds = []
104 | zero = torch.LongTensor(1).fill_(0).to(device)
105 | source_len = list(source_ids.ne(1).sum(-1).cpu().numpy())
106 | length = source_ids.size(-1)
107 | encoder_output = self.model(source_ids,attention_mask=mask)
108 | for i in range(source_ids.shape[0]):
109 | context = [[x[i:i+1,:,:source_len[i]].repeat(beam_size,1,1,1) for x in y]
110 | for y in encoder_output.past_key_values]
111 | beam = Beam(beam_size,eos_id,device)
112 | input_ids = beam.getCurrentState().clone()
113 | context_ids = source_ids[i:i+1,:source_len[i]].repeat(beam_size,1)
114 | out = encoder_output.last_hidden_state[i:i+1,:source_len[i]].repeat(beam_size,1,1)
115 | for _ in range(max_length):
116 | if beam.done():
117 | break
118 | if _ == 0:
119 | hidden_states = out[:,-1,:]
120 | out = self.lsm(self.lm_head(hidden_states)).data
121 | beam.advance(out)
122 | input_ids.data.copy_(input_ids.data.index_select(0, beam.getCurrentOrigin()))
123 | input_ids = beam.getCurrentState().clone()
124 | else:
125 | length = context_ids.size(-1)+input_ids.size(-1)
126 | out = self.model(input_ids,attention_mask=self.bias[:,context_ids.size(-1):length,:length],
127 | past_key_values=context).last_hidden_state
128 | hidden_states = out[:,-1,:]
129 | out = self.lsm(self.lm_head(hidden_states)).data
130 | beam.advance(out)
131 | input_ids.data.copy_(input_ids.data.index_select(0, beam.getCurrentOrigin()))
132 | input_ids = torch.cat((input_ids,beam.getCurrentState().clone()),-1)
133 | hyp = beam.getHyp(beam.getFinal())
134 | pred = beam.buildTargetTokens(hyp)[:beam_size]
135 | pred = [torch.cat([x.view(-1) for x in p]+[zero]*(max_length-len(p))).view(1,-1) for p in pred]
136 | preds.append(torch.cat(pred,0).unsqueeze(0))
137 |
138 | preds = torch.cat(preds,0)
139 |
140 | return preds
141 |
142 |
143 |
144 | class Beam(object):
145 | def __init__(self, size, eos, device):
146 | self.size = size
147 | self.device = device
148 | # The score for each translation on the beam.
149 | self.scores = torch.FloatTensor(size).zero_().to(device)
150 | # The backpointers at each time-step.
151 | self.prevKs = []
152 | # The outputs at each time-step.
153 | self.nextYs = [torch.LongTensor(size).fill_(0).to(device)]
154 | # Has EOS topped the beam yet.
155 | self._eos = eos
156 | self.eosTop = False
157 | # Time and k pair for finished.
158 | self.finished = []
159 |
160 | def getCurrentState(self):
161 | "Get the outputs for the current timestep."
162 | batch = self.nextYs[-1].view(-1, 1)
163 | return batch
164 |
165 | def getCurrentOrigin(self):
166 | "Get the backpointers for the current timestep."
167 | return self.prevKs[-1]
168 |
169 | def advance(self, wordLk):
170 | """
171 | Given prob over words for every last beam `wordLk` and attention
172 | `attnOut`: Compute and update the beam search.
173 |
174 | Parameters:
175 |
176 | * `wordLk`- probs of advancing from the last step (K x words)
177 | * `attnOut`- attention at the last step
178 |
179 | Returns: True if beam search is complete.
180 | """
181 | numWords = wordLk.size(1)
182 |
183 | # Sum the previous scores.
184 | if len(self.prevKs) > 0:
185 | beamLk = wordLk + self.scores.unsqueeze(1).expand_as(wordLk)
186 |
187 | # Don't let EOS have children.
188 | for i in range(self.nextYs[-1].size(0)):
189 | if self.nextYs[-1][i] == self._eos:
190 | beamLk[i] = -1e20
191 | else:
192 | beamLk = wordLk[0]
193 | flatBeamLk = beamLk.view(-1)
194 | bestScores, bestScoresId = flatBeamLk.topk(self.size, 0, True, True)
195 |
196 | self.scores = bestScores
197 |
198 | # bestScoresId is flattened beam x word array, so calculate which
199 | # word and beam each score came from
200 | prevK = torch.div(bestScoresId, numWords, rounding_mode="floor")
201 | self.prevKs.append(prevK)
202 | self.nextYs.append((bestScoresId - prevK * numWords))
203 |
204 |
205 | for i in range(self.nextYs[-1].size(0)):
206 | if self.nextYs[-1][i] == self._eos:
207 | s = self.scores[i]
208 | self.finished.append((s, len(self.nextYs) - 1, i))
209 |
210 | # End condition is when top-of-beam is EOS and no global score.
211 | if self.nextYs[-1][0] == self._eos:
212 | self.eosTop = True
213 |
214 | def done(self):
215 | return self.eosTop and len(self.finished) >= self.size
216 |
217 | def getFinal(self):
218 | if len(self.finished) == 0:
219 | self.finished.append((self.scores[0], len(self.nextYs) - 1, 0))
220 | self.finished.sort(key=lambda a: -a[0])
221 | if len(self.finished) != self.size:
222 | unfinished=[]
223 | for i in range(self.nextYs[-1].size(0)):
224 | if self.nextYs[-1][i] != self._eos:
225 | s = self.scores[i]
226 | unfinished.append((s, len(self.nextYs) - 1, i))
227 | unfinished.sort(key=lambda a: -a[0])
228 | self.finished+=unfinished[:self.size-len(self.finished)]
229 | return self.finished[:self.size]
230 |
231 | def getHyp(self, beam_res):
232 | """
233 | Walk back to construct the full hypothesis.
234 | """
235 | hyps=[]
236 | for _,timestep, k in beam_res:
237 | hyp = []
238 | for j in range(len(self.prevKs[:timestep]) - 1, -1, -1):
239 | hyp.append(self.nextYs[j+1][k])
240 | k = self.prevKs[j][k]
241 | hyps.append(hyp[::-1])
242 | return hyps
243 |
244 | def buildTargetTokens(self, preds):
245 | sentence=[]
246 | for pred in preds:
247 | tokens = []
248 | for tok in pred:
249 | if tok==self._eos:
250 | break
251 | tokens.append(tok)
252 | sentence.append(tokens)
253 | return sentence
254 |
255 |
--------------------------------------------------------------------------------