├── LICENSE ├── LogBench-O ├── LogBench-O_prefix_1point.zip ├── LogBench-O_prefix_1point_file_level.zip └── LogBench-O_prefix_1point_wo_comments.zip ├── LogBench-T ├── LogBench-T_prefix_1point.zip └── LogBench-T_prefix_1point_file_level.zip ├── README.md ├── build └── code-transformer.jar ├── cases └── generated_cases.csv ├── img ├── empirical_overview.jpg ├── empirical_overview.pdf └── empirical_overview.png └── src ├── Baselines ├── ChatGPT │ └── chatgpt.py ├── Davinci │ └── davinci.py ├── DeepLV │ ├── Helper.py │ ├── Metrics.py │ ├── block_level_LSTM.py │ ├── block_processing │ │ └── block_processing.py │ └── deepLV_cleaner.py ├── Incoder │ └── incoder.py ├── LoGenText-Plus │ ├── README.md │ ├── requirements.txt │ └── results │ │ └── 1 │ │ ├── activemq │ │ ├── translation.context.test │ │ ├── translation.context.test.log │ │ └── translation.context.test.unsort │ │ ├── ambari │ │ ├── translation.context.test │ │ ├── translation.context.test.log │ │ └── translation.context.test.unsort │ │ ├── brooklyn │ │ ├── translation.context.test │ │ ├── translation.context.test.log │ │ └── translation.context.test.unsort │ │ ├── camel │ │ ├── translation.context.test │ │ ├── translation.context.test.log │ │ └── translation.context.test.unsort │ │ ├── cloudstack │ │ ├── translation.context.test │ │ ├── translation.context.test.log │ │ └── translation.context.test.unsort │ │ ├── hadoop │ │ ├── translation.context.test │ │ ├── translation.context.test.log │ │ └── translation.context.test.unsort │ │ ├── hbase │ │ ├── translation.context.test │ │ ├── translation.context.test.log │ │ └── translation.context.test.unsort │ │ ├── hive │ │ ├── translation.context.test │ │ ├── translation.context.test.log │ │ └── translation.context.test.unsort │ │ ├── ignite │ │ ├── translation.context.test │ │ ├── translation.context.test.log │ │ └── translation.context.test.unsort │ │ └── synapse │ │ ├── translation.context.test │ │ ├── translation.context.test.log │ │ └── translation.context.test.unsort ├── README.md ├── StarCoder │ └── starcoder.py ├── WhichVar │ ├── analysis.ipynb │ ├── cleaner.ipynb │ ├── data.json │ ├── model.py │ ├── output.json │ ├── test.json │ └── train.json └── lance │ ├── README.md │ └── lance.py ├── CodeTransformer └── README.md ├── DataCollector ├── check_pom.py ├── download.py ├── filter_pom.py └── get_java.py ├── eva_sem.ipynb └── unixcoder.py /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 
--------------------------------------------------------------------------------
/LogBench-O/LogBench-O_prefix_1point.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/logpai/LogBench/364357338014a6d76d313fc3d8549c02938b6f2e/LogBench-O/LogBench-O_prefix_1point.zip
--------------------------------------------------------------------------------
/LogBench-O/LogBench-O_prefix_1point_file_level.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/logpai/LogBench/364357338014a6d76d313fc3d8549c02938b6f2e/LogBench-O/LogBench-O_prefix_1point_file_level.zip
--------------------------------------------------------------------------------
/LogBench-O/LogBench-O_prefix_1point_wo_comments.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/logpai/LogBench/364357338014a6d76d313fc3d8549c02938b6f2e/LogBench-O/LogBench-O_prefix_1point_wo_comments.zip
--------------------------------------------------------------------------------
/LogBench-T/LogBench-T_prefix_1point.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/logpai/LogBench/364357338014a6d76d313fc3d8549c02938b6f2e/LogBench-T/LogBench-T_prefix_1point.zip
--------------------------------------------------------------------------------
/LogBench-T/LogBench-T_prefix_1point_file_level.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/logpai/LogBench/364357338014a6d76d313fc3d8549c02938b6f2e/LogBench-T/LogBench-T_prefix_1point_file_level.zip
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # LogBench
2 | 
3 | **LogBench is a benchmark for evaluating logging statement generation.**
4 | 
5 | Logging statements are imperative in modern software. They play an important role in reflecting developers' intentions, recording system behavior, and guiding failure diagnosis. LogBench provides a benchmark and toolkit that let you measure your own models and conveniently compare them with existing baseline models.
6 | 
7 | 
8 | If you find our work beneficial to your research, please kindly cite the following paper:
9 | 
10 | + Yichen Li, Yintong Huo, Zhihan Jiang, Renyi Zhong, Pinjia He, Yuxin Su, Lionel C. Briand, and Michael R. Lyu. [Exploring the Effectiveness of LLMs in Automated Logging Generation: An Empirical Study](https://arxiv.org/abs/2307.05950), IEEE Transactions on Software Engineering (TSE), 2024.
11 | 
12 | ## Study overview
13 | ![overview](img/empirical_overview.jpg)
14 | 
15 | The study is fully described in this [paper](https://arxiv.org/abs/2307.05950). LogBench comprises two subsets for evaluating a model's *effectiveness* and *generalizability*, respectively:
16 | 
17 | 1. Effectiveness: **LogBench-O** contains a collection of high-quality logging statements and their associated code contexts.
18 | 2. Generalizability: **LogBench-T** is an unseen code dataset obtained by applying semantics-preserving code transformations to LogBench-O (a loading sketch is shown below).
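Both subsets are distributed as zip archives of Java files, which the baseline scripts under `src/Baselines` read directly as input. Below is a minimal, hedged sketch (not part of the released toolkit) of how one might unpack a split and collect its Java files; the extraction directory name is an illustrative assumption.

```python
# Minimal sketch (assumption: the archive unpacks into plain .java files that the
# baseline scripts under src/Baselines consume). Not part of the released toolkit.
import glob
import os
import zipfile

archive = "LogBench-O/LogBench-O_prefix_1point.zip"
extract_dir = "LogBench-O_prefix_1point"  # illustrative output directory

# Unpack the split and gather the Java files to feed into a baseline script.
with zipfile.ZipFile(archive) as zf:
    zf.extractall(extract_dir)

java_files = glob.glob(os.path.join(extract_dir, "**", "*.java"), recursive=True)
print(f"{len(java_files)} Java files ready to be fed to the baseline scripts")
```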
19 | 
20 | Additionally, LogBench offers various variants to support different settings in logging statement generation, including:
21 | 
22 | * Method-level
23 | * File-level
24 | * Comment-included
25 | * Comment-free
26 | 
27 | ## Repository organization
28 | We currently provide part of the code in the folder `/src`. We will release the full source code after the paper has been accepted.
29 | 
30 | * LogBench-O: The `/LogBench-O` folder contains the files for LogBench-O.
31 | * LogBench-T: The `/LogBench-T` folder contains the files for LogBench-T.
32 | * Cases: Please refer to the `cases` folder for the generated cases.
33 | 
34 | The repository is organized as follows:
35 | 
36 | ```
37 | ├── LICENSE
38 | ├── LogBench-O
39 | │   ├── LogBench-O_prefix_1point.zip
40 | │   ├── LogBench-O_prefix_1point_file_level.zip
41 | │   └── LogBench-O_prefix_1point_wo_comments.zip
42 | ├── LogBench-T
43 | │   ├── LogBench-T_prefix_1point.zip
44 | │   └── LogBench-T_prefix_1point_file_level.zip
45 | ├── README.md
46 | ├── build
47 | │   └── code-transformer.jar
48 | ├── cases
49 | │   └── generated_cases.csv
50 | ├── img
51 | │   ├── empirical_overview.pdf
52 | │   └── empirical_overview.png
53 | └── src
54 |     ├── Baselines
55 |     │   ├── DeepLV
56 |     │   ├── WhichVar
57 |     │   ├── LoGenText-Plus
58 |     │   ├── StarCoder
59 |     │   ├── lance
60 |     │   ├── InCoder
61 |     │   └── ...
62 |     ├── CodeTransformer
63 |     │   └── README.md
64 |     └── DataCollector
65 |         └── ...
66 | ```
67 | 
68 | 
69 | ## Study subjects
70 | | 11 LLMs | Access | Paper reference |
71 | | ------------ | ------ | ---- |
72 | | Davinci | API | [Project](https://platform.openai.com/docs/models) |
73 | | ChatGPT | API | [Project](https://platform.openai.com/docs/models) |
74 | | LANCE | Model | [ICSE'22] [Using deep learning to generate complete log statements](https://dl.acm.org/doi/abs/10.1145/3510003.3511561) |
75 | | InCoder | Model | [ICLR'23] [InCoder: A Generative Model for Code Infilling and Synthesis](https://openreview.net/forum?id=hQwb-lbM6EL) |
76 | | Llama2 | Model | [Llama 2: Open Foundation and Fine-Tuned Chat Models](https://arxiv.org/abs/2307.09288) |
77 | | StarCoder | Model | [StarCoder: may the source be with you!](https://arxiv.org/abs/2305.06161) |
78 | | CodeLlama | Model | [Code Llama: Open Foundation Models for Code](https://arxiv.org/abs/2308.12950) |
79 | | CodeGeeX | Plugin | [CodeGeeX: A Pre-Trained Model for Code Generation with Multilingual Evaluations on HumanEval-X](https://arxiv.org/abs/2303.17568) |
80 | | TabNine | Plugin | - |
81 | | Copilot | Plugin | - |
82 | | CodeWhisperer | Plugin | - |
83 | | **Non-LLMs** | | |
84 | | DeepLV | Model | [ICSE'21] [DeepLV: Suggesting Log Levels Using Ordinal Based Neural Networks](https://ieeexplore.ieee.org/abstract/document/9402068) |
85 | | WhichVar | Model | [TSE'21] [Which Variables Should I Log?](https://ieeexplore.ieee.org/document/8840982) |
86 | | LoGenText-Plus | Model | [TOSEM'23] [LoGenText-Plus: Improving Neural Machine Translation Based Logging Texts Generation with Syntactic Templates](https://dl.acm.org/doi/10.1145/3624740) |
87 | 
88 | If you use any of these baselines, please make sure to cite the corresponding paper.
89 | 
90 | 
91 | 
92 | ## Download the original crawled logging dataset
93 | For further logging-related research, since GitHub cannot host large datasets, you can download the **whole** collected logging dataset (full size) [here](https://drive.google.com/file/d/13EV-rIFEwVrLGnpNIcpF3u9NSOh_gCNM/view?usp=sharing)
94 | (zip: 252 MB; unzipped: 786 MB).
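If you prefer scripting the download, the snippet below is one hedged way to fetch and unpack the archive with the third-party `gdown` package (`pip install gdown`); only the Drive file ID is taken from the link above, and the local file and folder names are illustrative assumptions.

```python
# Hedged sketch: fetch the full crawled logging dataset from the Google Drive link
# above via the third-party gdown package, then unpack it. Local paths are assumptions.
import zipfile

import gdown  # pip install gdown

url = "https://drive.google.com/uc?id=13EV-rIFEwVrLGnpNIcpF3u9NSOh_gCNM"  # ID from the link above
archive = "logging_dataset_full.zip"

gdown.download(url, archive, quiet=False)
with zipfile.ZipFile(archive) as zf:
    zf.extractall("logging_dataset_full")
print("Dataset extracted to ./logging_dataset_full")
```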
95 | 
96 | 
97 | ## Code transformation tool
98 | 
99 | The folder `/build` contains the built transformation tool. It conducts the code transformation automatically with its eight code transformers.
100 | - To conduct the code transformation in batch:
101 | ```
102 | java -jar code-transformer.jar -f ./javafiles/
103 | ```
104 | 
--------------------------------------------------------------------------------
/build/code-transformer.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/logpai/LogBench/364357338014a6d76d313fc3d8549c02938b6f2e/build/code-transformer.jar
--------------------------------------------------------------------------------
/img/empirical_overview.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/logpai/LogBench/364357338014a6d76d313fc3d8549c02938b6f2e/img/empirical_overview.jpg
--------------------------------------------------------------------------------
/img/empirical_overview.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/logpai/LogBench/364357338014a6d76d313fc3d8549c02938b6f2e/img/empirical_overview.pdf
--------------------------------------------------------------------------------
/img/empirical_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/logpai/LogBench/364357338014a6d76d313fc3d8549c02938b6f2e/img/empirical_overview.png
--------------------------------------------------------------------------------
/src/Baselines/ChatGPT/chatgpt.py:
--------------------------------------------------------------------------------
1 | from revChatGPT.V3 import Chatbot
2 | import os
3 | import glob
4 | import time
5 | import random
6 | 
7 | def read_input_file(input_file):
8 |     with open(input_file, 'r') as file:
9 |         input_text = file.read()
10 |     return input_text
11 | 
12 | def write_output_file(output_file, content):
13 |     with open(output_file, 'w') as file:
14 |         file.write(content)
15 | 
16 | def main():
17 |     input_folder = ""
18 |     output_folder = ""
19 |     java_files_pattern = os.path.join(input_folder, "*.java")
20 |     input_files = glob.glob(java_files_pattern)
21 |     random.shuffle(input_files)
22 |     output_files = [os.path.join(output_folder, os.path.splitext(os.path.basename(f))[0] + "_output.java") for f in input_files]
23 |     os.makedirs(output_folder, exist_ok=True)
24 | 
25 |     for i, input_file in enumerate(input_files):
26 | 
27 |         chatbot = Chatbot(api_key="")
28 |         print(f"Processing {input_file}...")
29 |         input_text = read_input_file(input_file)
30 |         input_text = "Please complete the incomplete logging statement at the logging point. Please just reply me one line of code, don't reply me other text.:\n" + input_text
31 |         try:
32 |             if os.path.exists(output_files[i]):
33 |                 print("Output file already exists.
Skipping...") 34 | continue 35 | result = chatbot.ask(input_text) 36 | time.sleep(2) 37 | output_file = output_files[i] 38 | write_output_file(output_file, result) 39 | print(f"Code saved to {output_file}") 40 | except Exception as e: 41 | print(f"Error processing {input_file}: {str(e)}") 42 | 43 | 44 | if __name__ == "__main__": 45 | main() -------------------------------------------------------------------------------- /src/Baselines/Davinci/davinci.py: -------------------------------------------------------------------------------- 1 | import openai 2 | import os 3 | import glob 4 | import time 5 | import random 6 | from tqdm import tqdm 7 | 8 | def read_input_file(input_file): 9 | with open(input_file, 'r') as file: 10 | input_text = file.read() 11 | return input_text 12 | 13 | def write_output_file(output_file, content): 14 | with open(output_file, 'w') as file: 15 | file.write(content) 16 | 17 | openai.api_key = "" 18 | 19 | def generate_text(prompt, model="text-davinci-003", tokens=1024, temperature=1, top_p=1): 20 | response = openai.Completion.create( 21 | engine=model, 22 | prompt=prompt, 23 | max_tokens=tokens, 24 | n=1, 25 | stop=None, 26 | temperature=temperature, 27 | top_p=top_p 28 | ) 29 | 30 | return response.choices[0].text.strip() 31 | 32 | def main(): 33 | input_folder = "" 34 | output_folder = "" 35 | java_files_pattern = os.path.join(input_folder, "*.java") 36 | input_files = glob.glob(java_files_pattern) 37 | random.shuffle(input_files) 38 | output_files = [os.path.join(output_folder, os.path.splitext(os.path.basename(f))[0] + "_output.java") for f in input_files] 39 | os.makedirs(output_folder, exist_ok=True) 40 | 41 | for i, input_file in enumerate(tqdm(input_files, desc="Processing files")): 42 | print(f"Processing {input_file}...") 43 | input_text = read_input_file(input_file) 44 | input_text = "Please complete the incomplete logging statement at the logging point. Please just reply me one line of code, don't reply me other text.:\n" + input_text 45 | try: 46 | if os.path.exists(output_files[i]): 47 | print("Output file already exists. 
Skipping...") 48 | continue 49 | result = generate_text(input_text) 50 | time.sleep(2) 51 | output_file = output_files[i] 52 | write_output_file(output_file, result) 53 | print(f"Code saved to {output_file}") 54 | except Exception as e: 55 | print(f"Error processing {input_file}: {str(e)}") 56 | 57 | if __name__ == "__main__": 58 | main() 59 | -------------------------------------------------------------------------------- /src/Baselines/DeepLV/Helper.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import re as re 4 | import string 5 | import numpy as np 6 | import csv 7 | import pandas as pd 8 | 9 | from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, accuracy_score, precision_recall_fscore_support 10 | from sklearn.utils import resample 11 | from sklearn.preprocessing import LabelEncoder 12 | 13 | # for ordinal 14 | trace_label = [1, 0, 0, 0, 0] 15 | debug_label = [1, 1, 0, 0, 0] 16 | info_label = [1, 1, 1, 0, 0] 17 | warn_label = [1, 1 ,1, 1, 0] 18 | error_label = [1, 1, 1, 1, 1] 19 | 20 | # for normal 21 | #trace_label = [1, 0, 0, 0, 0] 22 | #debug_label = [0, 1, 0, 0, 0] 23 | #info_label = [0, 0, 1, 0, 0] 24 | #warn_label = [0, 0 ,0, 1, 0] 25 | #error_label = [0, 0, 0, 0, 1] 26 | 27 | 28 | 29 | 30 | def ordinal_encoder(classes): 31 | y = [] 32 | for c in classes: 33 | if c == 'trace': 34 | y.append(trace_label) 35 | elif c == 'debug': 36 | y.append(debug_label) 37 | elif c == 'info': 38 | y.append(info_label) 39 | elif c == 'warn': 40 | y.append(warn_label) 41 | else: 42 | y.append(error_label) 43 | y = np.array(y) 44 | return y 45 | 46 | 47 | def predict_prob_encoder(predict_prob): 48 | label_predicted = [] 49 | for column_prob in predict_prob: 50 | column_label = [] 51 | for p in column_prob: 52 | if p > 0.5: 53 | column_label.append(1) 54 | else: 55 | column_label.append(0) 56 | label_predicted.append(column_label) 57 | label_predicted = np.array(label_predicted) 58 | return label_predicted 59 | 60 | 61 | def predicted_label_encoder(y_list): 62 | target_list = [] 63 | 64 | target_trace_label = [1, 0, 0, 0, 0] 65 | target_debug_label = [0, 1, 0, 0, 0] 66 | target_info_label = [0, 0, 1 ,0, 0] 67 | target_warn_label = [0, 0, 0, 1, 0] 68 | target_error_label = [0, 0, 0, 0, 1] 69 | target_exception_label = [0, 0, 0, 0, 0] 70 | for y in y_list: 71 | if np.array_equal(np.array(y), np.array(trace_label)): 72 | target_list.append(target_trace_label) 73 | elif np.array_equal(np.array(y), np.array(debug_label)): 74 | target_list.append(target_debug_label) 75 | elif np.array_equal(np.array(y), np.array(info_label)): 76 | target_list.append(target_info_label) 77 | elif np.array_equal(np.array(y), np.array(warn_label)): 78 | target_list.append(target_warn_label) 79 | elif np.array_equal(np.array(y), np.array(error_label)): 80 | target_list.append(target_error_label) 81 | else: 82 | print("Something wrong happend in predicted_label_encoder.", y) 83 | target_list.append(target_warn_label) 84 | return np.array(target_list) 85 | 86 | 87 | 88 | 89 | def pd_encoder(y_list): #0:trace, 1:debug, 2:info, 3:warn, 4: error 90 | target_list = [] 91 | for y in y_list: 92 | if np.array_equal(np.array(y), np.array(trace_label)): 93 | target_list.append(0) 94 | elif np.array_equal(np.array(y), np.array(debug_label)): 95 | target_list.append(1) 96 | elif np.array_equal(np.array(y), np.array(info_label)): 97 | target_list.append(2) 98 | elif np.array_equal(np.array(y), np.array(warn_label)): 99 | 
target_list.append(3) 100 | elif np.array_equal(np.array(y), np.array(error_label)): 101 | target_list.append(4) 102 | else: 103 | print("Something wrong happend in pd_encoder.", y) 104 | target_list.append(3) 105 | return target_list 106 | 107 | 108 | 109 | 110 | def class_accuracy(y_test, y_predicted): 111 | trace_test_list = [] 112 | debug_test_list = [] 113 | info_test_list = [] 114 | warn_test_list = [] 115 | error_test_list = [] 116 | 117 | trace_predicted_list = [] 118 | debug_predicted_list = [] 119 | info_predicted_list = [] 120 | warn_predicted_list = [] 121 | error_predicted_list = [] 122 | 123 | for yt, yp in zip(y_test, y_predicted): 124 | if np.array_equal(np.array(yt), np.array(trace_label)): 125 | trace_test_list.append(trace_label) 126 | trace_predicted_list.append(yp) 127 | elif np.array_equal(np.array(yt), np.array(debug_label)): 128 | debug_test_list.append(debug_label) 129 | debug_predicted_list.append(yp) 130 | elif np.array_equal(np.array(yt), np.array(info_label)): 131 | info_test_list.append(info_label) 132 | info_predicted_list.append(yp) 133 | elif np.array_equal(np.array(yt), np.array(warn_label)): 134 | warn_test_list.append(warn_label) 135 | warn_predicted_list.append(yp) 136 | elif np.array_equal(np.array(yt), np.array(error_label)): 137 | error_test_list.append(error_label) 138 | error_predicted_list.append(yp) 139 | else: 140 | print("something wrong happened in class_accuracy", yt, yp) 141 | acc_trace = accuracy_score(np.array(trace_test_list), np.array(trace_predicted_list)) 142 | acc_debug = accuracy_score(np.array(debug_test_list), np.array(debug_predicted_list)) 143 | acc_info = accuracy_score(np.array(info_test_list), np.array(info_predicted_list)) 144 | acc_warn = accuracy_score(np.array(warn_test_list), np.array(warn_predicted_list)) 145 | acc_error = accuracy_score(np.array(error_test_list), np.array(error_predicted_list)) 146 | print ('Trace Accuracy: ', acc_trace) 147 | print ('Debug Accuracy: ', acc_debug) 148 | print ('Info Accuracy: ', acc_info) 149 | print ('Warn Accuracy: ', acc_warn) 150 | print ('Error Accuracy: ', acc_error) 151 | 152 | #This is for the case combining debug and trace together 153 | def upsampling(x_train, y_train, seed_value): 154 | 155 | #code below is for upsampling the data 156 | 157 | df=pd.DataFrame() 158 | df['x_train'] = x_train 159 | df['y_train'] = pd_encoder(y_train) 160 | 161 | data_td = df.loc[df['y_train'] == 0] 162 | data_info = df.loc[df['y_train'] == 1] 163 | data_warn = df.loc[df['y_train'] == 2] 164 | data_error = df.loc[df['y_train'] == 3] 165 | data_len = np.array([len(data_td), len(data_info), len(data_warn), len(data_error)]) 166 | max_num = np.max(data_len) 167 | 168 | td_upsampled = resample(data_td, replace=True, n_samples=max_num, random_state=seed_value) 169 | info_upsampled = resample(data_info, replace=True, n_samples=max_num, random_state=seed_value) 170 | warn_upsampled = resample(data_warn, replace=True, n_samples=max_num, random_state=seed_value) 171 | error_upsampled = resample(data_error, replace=True, n_samples=max_num, random_state=seed_value) 172 | 173 | td_upsampled=td_upsampled.drop(columns=['y_train']).to_numpy() 174 | info_upsampled=info_upsampled.drop(columns=['y_train']).to_numpy() 175 | warn_upsampled=warn_upsampled.drop(columns=['y_train']).to_numpy() 176 | error_upsampled=error_upsampled.drop(columns=['y_train']).to_numpy() 177 | 178 | x_train = np.concatenate((td_upsampled, info_upsampled, warn_upsampled, error_upsampled)) 179 | temp_y_train = [] 180 | for i in range(0, 
max_num): 181 | temp_y_train.append([1, 0, 0, 0]) 182 | for i in range(0, max_num): 183 | temp_y_train.append([1, 1, 0, 0]) 184 | for i in range(0, max_num): 185 | temp_y_train.append([1, 1, 1, 0]) 186 | for i in range(0, max_num): 187 | temp_y_train.append([1, 1, 1, 1]) 188 | 189 | y_train = np.array(temp_y_train) 190 | return x_train, y_train 191 | 192 | 193 | def ordinal_accuracy(y_test, y_predicted): 194 | print(len(y_test), len(y_predicted)) 195 | left_boundary = 0.0 196 | right_boundary = 4.0 197 | value_cumulation = 0.0 198 | for yt, yp in zip(y_test, y_predicted): 199 | lb_distance = float(yt) - left_boundary 200 | rb_distance = right_boundary - float(yt) 201 | max_distance = np.max(np.array([lb_distance, rb_distance])) 202 | value = 1.0 - abs(float(yp) - float(yt))/max_distance 203 | value_cumulation = value_cumulation + value 204 | return value_cumulation/float(len(y_test)) 205 | 206 | -------------------------------------------------------------------------------- /src/Baselines/DeepLV/Metrics.py: -------------------------------------------------------------------------------- 1 | from keras.callbacks import Callback 2 | import numpy as np 3 | from sklearn.model_selection import train_test_split 4 | from sklearn.model_selection import StratifiedKFold 5 | from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, balanced_accuracy_score, accuracy_score 6 | from sklearn.utils import resample 7 | 8 | class Metrics(Callback): 9 | def on_train_begin(self, logs={}): 10 | self.val_f1s = [] 11 | self.val_recalls = [] 12 | self.val_precisions = [] 13 | self.val_auc = [] 14 | 15 | def on_epoch_end(self, epoch, logs={}): 16 | val_predict = (np.asarray(self.model.predict( 17 | self.validation_data[0]))).round() 18 | val_targ = self.validation_data[1] 19 | pos_label=1 20 | _val_f1 = f1_score(val_targ, val_predict, labels=[pos_label],pos_label=1, average ='binary') 21 | _val_recall = recall_score(val_targ, val_predict, labels=[pos_label],pos_label=1, average ='binary') 22 | _val_precision = precision_score(val_targ, val_predict, labels=[pos_label],pos_label=1, average ='binary') 23 | _val_auc = roc_auc_score(val_targ, val_predict) 24 | self.val_f1s.append(_val_f1) 25 | self.val_recalls.append(_val_recall) 26 | self.val_precisions.append(_val_precision) 27 | self.val_auc.append(_val_auc) 28 | return 29 | 30 | -------------------------------------------------------------------------------- /src/Baselines/DeepLV/block_level_LSTM.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import os 3 | import sys 4 | import re as re 5 | import string 6 | 7 | import multiprocessing 8 | import numpy as np 9 | from gensim.models.word2vec import Word2Vec 10 | from gensim.corpora.dictionary import Dictionary 11 | from gensim.parsing.porter import PorterStemmer 12 | 13 | import random as rn 14 | seed_value = 17020 15 | seed_window = 1500 16 | import pandas as pd 17 | import csv 18 | from sklearn.model_selection import train_test_split 19 | from sklearn.model_selection import StratifiedKFold 20 | from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, balanced_accuracy_score, accuracy_score, precision_recall_fscore_support 21 | from sklearn.utils import resample 22 | from sklearn.preprocessing import LabelEncoder 23 | 24 | import matplotlib.pyplot as plt 25 | 26 | import tensorflow as tf 27 | import Metrics 28 | from keras import backend as K 29 | from keras.preprocessing import sequence 30 | from 
keras.models import Sequential 31 | from keras.layers import Dense, Flatten, Dropout, Embedding, LSTM, Bidirectional, Activation, LeakyReLU 32 | from keras.models import model_from_yaml 33 | from keras.utils import np_utils 34 | from keras_self_attention import SeqSelfAttention 35 | 36 | 37 | import Helper 38 | 39 | 40 | config = tf.ConfigProto( device_count = {'GPU': 1 , 'CPU': 16} ) 41 | sess = tf.Session(config=config) 42 | K.set_session(sess) 43 | 44 | 45 | csv.field_size_limit(100000000) 46 | sys.setrecursionlimit(1000000) 47 | # set parameters: 48 | n_iterations = 1 49 | embedding_iterations = 1 50 | n_epoch = 50 51 | 52 | vocab_dim = 100 53 | maxlen = 100 54 | n_exposures = 10 55 | window_size = 7 56 | batch_size = 24 57 | input_length = 100 58 | cpu_count = multiprocessing.cpu_count() 59 | 60 | test_list = [] 61 | neg_full = [] 62 | pos_full = [] 63 | syntactic_list = [] 64 | 65 | 66 | 67 | model_location = 'model_block' +'/lstm_'+ sys.argv[1] 68 | embedding_location = 'embedding_block' + '/Word2vec_model_' + sys.argv[1] + '.pkl' 69 | 70 | 71 | def loadfile(): 72 | 73 | data_full=pd.read_csv('block_processing/blocks/logged_syn' + '_' + sys.argv[1] + '.csv', usecols=[1,2,3,4], engine='python') 74 | 75 | dataset = data_full.values 76 | classes = dataset[:, 2] 77 | data=data_full['Values'].values.tolist() 78 | combined = data 79 | combined_full = data_full.values.tolist() 80 | 81 | encoder = LabelEncoder() 82 | encoder.fit(classes) 83 | encoded_Y = encoder.transform(classes) 84 | y = Helper.ordinal_encoder(classes) 85 | 86 | 87 | 88 | x_train, x_test, y_train, y_test = train_test_split(combined_full, y, test_size=0.2, train_size=0.8, random_state=seed_value, stratify=y) 89 | x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, train_size=0.75, random_state=seed_value, stratify=y_train) 90 | test_block_list = [] 91 | train_block_list = [] 92 | for x in x_test: 93 | test_list.append(x[0]) 94 | test_block_list.append(x[1]) 95 | x_test = np.array(test_block_list) 96 | for x in x_train: 97 | train_block_list.append(x[1]) 98 | x_train = train_block_list 99 | 100 | return combined,y, x_train, x_val, x_test, y_train, y_val, y_test 101 | 102 | 103 | 104 | def word_splitter(word, docText): 105 | splitted = re.sub('([A-Z][a-z]+)', r' \1', re.sub('([A-Z]+)', r' \1', word)).split() 106 | for word in splitted: 107 | docText.append(word.lower()) 108 | 109 | 110 | 111 | 112 | def tokenizer(text): 113 | newText = [] 114 | for doc in text: 115 | docText = [] 116 | #for word in str(doc).replace("['", "").replace("']", "").replace(",", "").replace("'", "").split(' '): 117 | for word in str(doc).replace("'", "").replace("[", "").replace("]", "").replace(",", "").replace('"', "").split(' '): 118 | docText.append(word) 119 | 120 | newText.append(docText) 121 | #print (newText) 122 | return newText 123 | 124 | 125 | 126 | def input_transform(words): 127 | model=Word2Vec.load(embedding_location) 128 | _, _,dictionaries=create_dictionaries(model,words) 129 | return dictionaries 130 | 131 | 132 | 133 | 134 | 135 | 136 | def create_dictionaries(model=None, 137 | combined=None): 138 | 139 | from keras.preprocessing import sequence 140 | 141 | if (combined is not None) and (model is not None): 142 | gensim_dict = Dictionary() 143 | gensim_dict.doc2bow(model.wv.vocab.keys(), 144 | allow_update=True) 145 | w2indx = {v: k+1 for k, v in gensim_dict.items()} 146 | w2vec = {word: model.wv[word] for word in w2indx.keys()} 147 | 148 | def parse_dataset(combined): 149 | data=[] 150 | for 
sentence in combined: 151 | new_txt = [] 152 | for word in sentence: 153 | try: 154 | new_txt.append(w2indx[word]) 155 | except: 156 | new_txt.append(0) 157 | data.append(new_txt) 158 | return data 159 | combined=parse_dataset(combined) 160 | combined= sequence.pad_sequences(combined, maxlen=maxlen) 161 | return w2indx, w2vec,combined 162 | 163 | 164 | def word2vec_train(combined): 165 | model = Word2Vec(size=vocab_dim, #dimension of word embedding vectors 166 | min_count=n_exposures, 167 | window=window_size, 168 | workers=cpu_count, sg=1, 169 | iter=embedding_iterations) 170 | model.build_vocab(combined) 171 | model.save(embedding_location) 172 | index_dict, word_vectors,combined = create_dictionaries(model=model,combined=combined) 173 | return index_dict, word_vectors,combined 174 | 175 | 176 | def get_data(index_dict,word_vectors,combined): 177 | 178 | n_symbols = len(index_dict) + 1 179 | embedding_weights = np.zeros((n_symbols, vocab_dim)) 180 | for word, index in index_dict.items(): 181 | embedding_weights[index, :] = word_vectors[word] 182 | 183 | 184 | return n_symbols,embedding_weights 185 | 186 | 187 | def train_lstm(n_symbols,embedding_weights,x_train,y_train,x_test,y_test, x_val, y_val): 188 | 189 | tf.set_random_seed(seed_value) 190 | 191 | 192 | 193 | 194 | model = Sequential() 195 | model.add(Embedding(output_dim=vocab_dim, 196 | input_dim=n_symbols, 197 | mask_zero=True, 198 | weights=[embedding_weights], 199 | input_length=input_length)) 200 | model.add(Bidirectional(LSTM(output_dim=128,activation='sigmoid'))) 201 | model.add(Dropout(0.2)) 202 | model.add(Dense(5, activation='sigmoid')) 203 | 204 | 205 | print ('Compiling the Model..') 206 | model.compile(loss='binary_crossentropy', 207 | optimizer='adam',metrics=['accuracy']) 208 | 209 | print ("Train...") 210 | metrics = Metrics.Metrics() 211 | history = model.fit(x_train, y_train, batch_size=batch_size, epochs=n_epoch,verbose=1, validation_data=(x_val, y_val)) 212 | 213 | base_min = optimal_epoch(history) 214 | print ("Evaluate...") 215 | score = model.evaluate(x_test, y_test, 216 | batch_size=batch_size) 217 | yaml_string = model.to_yaml() 218 | with open(model_location +'.yml', 'w') as outfile: 219 | outfile.write( yaml.dump(yaml_string, default_flow_style=True) ) 220 | model.save_weights(model_location + sys.argv[1] + '.h5') 221 | np.set_printoptions(threshold=sys.maxsize) 222 | 223 | prob_predicted = model.predict(x_test, verbose=1) 224 | label_predicted = Helper.predict_prob_encoder(prob_predicted) 225 | num_y_test = Helper.pd_encoder(y_test) 226 | num_y_predicted = Helper.pd_encoder(label_predicted) 227 | 228 | val_accuracy = accuracy_score(y_test, label_predicted) 229 | print ('Accuracy: ', val_accuracy) 230 | Helper.class_accuracy(y_test, label_predicted) 231 | 232 | with open(model_location + '_target.txt', 'wt') as f: 233 | for y in y_test: 234 | f.write(str(y)+ '\n') 235 | with open(model_location + '_predicted.txt', 'wt') as f: 236 | for y in label_predicted: 237 | f.write(str(y)+ '\n') 238 | return [val_accuracy] 239 | 240 | 241 | 242 | 243 | 244 | def get_FP_FN(label_predicted, label_target): 245 | FP_id_list = [] 246 | FN_id_list = [] 247 | for i in range(0, len(label_predicted)): 248 | if int(label_predicted[i]) == 1 and int(label_target[i]) == 0: 249 | FP_id_list.append(i) 250 | elif int(label_predicted[i]) == 0 and int(label_target[i]) == 1: 251 | FN_id_list.append(i) 252 | #print (FP_id_list) 253 | #print (FN_id_list) 254 | with open('model_block' +'/labels/list/lstm_FP_' + sys.argv[1] + '.txt', 
'wt') as f: 255 | for fp in FP_id_list: 256 | f.write(str(test_list[int(fp)])+ '\n') 257 | with open('model_block' +'/labels/list/lstm_FN_' + sys.argv[1] + '.txt', 'wt') as f: 258 | for fn in FN_id_list: 259 | f.write(str(test_list[int(fn)])+ '\n') 260 | 261 | 262 | def train(): 263 | os.environ['PYTHONHASHSEED']=str(seed_value) 264 | np.random.seed(seed_value) 265 | rn.seed(seed_value) 266 | print ('Loading Data...') 267 | combined,y,x_train, x_val, x_test, y_train, y_val, y_test=loadfile() 268 | print ('Tokenizing...') 269 | combined = tokenizer(combined) 270 | x_train = tokenizer (x_train) 271 | x_test = tokenizer (x_test) 272 | x_val = tokenizer (x_val) 273 | print ('Training a Word2vec model...') 274 | index_dict, word_vectors,combined=word2vec_train(combined) 275 | x_train = input_transform(x_train) 276 | x_test = input_transform(x_test) 277 | x_val = input_transform(x_val) 278 | print ('Setting up Arrays for Keras Embedding Layer...') 279 | n_symbols,embedding_weights=get_data(index_dict, word_vectors,combined) 280 | #print (x_train.shape,y_train.shape) 281 | result = train_lstm(n_symbols,embedding_weights,x_train,y_train, x_val , y_val , x_test,y_test) 282 | return result 283 | 284 | 285 | def pipeline_train(iterations): 286 | seed_and_result = {} 287 | if iterations == 1: 288 | train() 289 | else: 290 | for i in range(0, iterations): 291 | print('Iteration: ', i) 292 | global seed_value 293 | result = train() 294 | seed_and_result[seed_value] = result 295 | seed_value = seed_value + seed_window 296 | i = i + 1 297 | return seed_and_result 298 | 299 | def eval_metric(model, history, metric_name): 300 | metric = history.history[metric_name] 301 | val_metric = history.history['val_' + metric_name] 302 | e = range(1, n_epoch + 1) 303 | plt.plot(e, metric, 'bo', label='Train ' + metric_name) 304 | plt.plot(e, val_metric, 'b', label='Validation ' + metric_name) 305 | plt.xlabel('Epoch number') 306 | plt.ylabel(metric_name) 307 | plt.title('Comparing training and validation ' + metric_name + ' for ' + model.name) 308 | plt.legend() 309 | plt.show() 310 | 311 | 312 | def optimal_epoch(model_hist): 313 | min_epoch = np.argmin(model_hist.history['val_loss']) + 1 314 | print("Minimum validation loss reached in epoch {}".format(min_epoch)) 315 | return min_epoch 316 | 317 | 318 | 319 | 320 | if __name__=='__main__': 321 | result_dict = pipeline_train(n_iterations) 322 | print (sys.argv[1]) 323 | -------------------------------------------------------------------------------- /src/Baselines/DeepLV/block_processing/block_processing.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import multiprocessing 3 | import numpy as np 4 | import pandas as pd 5 | import csv 6 | import re 7 | 8 | block_set = {"DoStatement", "WhileStatement", "SynchronizedStatement", "IfStatement", "SwitchStatement", "TryStatement", "EnhancedForStatement", "ForStatement", "MethodDeclaration", "CatchClause", "Block" , "SwitchCase"} 9 | syntactic_filter_set = {"Block", "SimpleName", "SimpleType", "QualifiedName", "ParameterizedType", "PrimitiveType", "SingleVariableDeclaration", "ArrayType", "TypeLiteral"} 10 | block_dict = {} 11 | target_dict = {} 12 | methods_dict = {} 13 | methods_lines = {} 14 | target_dict_logged = {} 15 | level_dict_logged = {} 16 | message_dict_logged = {} 17 | target_dict_nonlogged = {} 18 | 19 | 20 | def read_logs(filename): 21 | f = open('original_logs/logs-' + filename+ '.txt') 22 | lines = f.readlines() 23 | f.close() 24 | return lines 25 | 26 
| 27 | def get_classname(method): 28 | fullpath = method.split('.') 29 | class_name = fullpath[-3] + '.' + fullpath[-2]+'.java' 30 | return class_name 31 | 32 | 33 | def read_AST_file(filename): 34 | f = open('AST/AST-'+filename+'.txt') 35 | lines = f.readlines() 36 | f.close() 37 | 38 | return lines 39 | 40 | 41 | def parse_ASTlines(ASTlines): 42 | lines = [] 43 | #parse ASTlines by regex 44 | for astline in ASTlines: 45 | 46 | astType = re.findall(r'([^<]+)', astline)[0] 47 | location = re.findall(r'([^<]+)', astline)[0] 48 | begin = re.findall(r'([^<]+)', astline)[0] 49 | end = re.findall(r'([^<]+)', astline)[0] 50 | #content = re.findall(r'([^<]+)', astline)[0] 51 | content = re.findall(r'(.*?)', astline)[0] 52 | lines.append([astType, location, begin, end, content]) 53 | #for every AST line, 0: type, 1: location, 2: beginline, 3: endline, 4: content 54 | return lines 55 | 56 | 57 | 58 | def parse_Loglines(Loglines): 59 | loglines = [] 60 | #parse ASTlines by regex 61 | for logline in Loglines: 62 | callsite = re.findall(r'([^<]+)', logline)[0] 63 | level = re.findall(r'([^<]+)', logline)[0] 64 | line = re.findall(r'([^<]+)', logline)[0] 65 | if(re.findall(r'([^<]+)', logline)): 66 | content = re.findall(r'([^<]+)', logline)[0] 67 | loglines.append([level, line, content, callsite]) 68 | else: 69 | loglines.append([level, line, 'No message', callsite]) 70 | #0: level, 1: line number, 2: content, 3: callsite 71 | 72 | return loglines 73 | 74 | 75 | def if_log_line(ast, loglines): 76 | for log in loglines: 77 | #print (get_classname(log[3]), get_classname(astlist[1])) 78 | #print (log[1], astlist[2]) 79 | if(get_classname(log[3]) == get_classname(astlist[1]) and int(log[1]) == int(astlist[2])): 80 | #print ('1') 81 | return True 82 | return False 83 | 84 | 85 | 86 | def if_diff_levels(value_list): 87 | if len(value_list) > 1: 88 | for i in range (0, len(value_list)-1): 89 | for j in range (i+1, len(value_list)): 90 | if value_list[i][0] != value_list[j][0]: 91 | return 2 92 | else: 93 | return 0 94 | return 1 95 | 96 | def not_level_guard(string): 97 | if "enabled" in string: 98 | if "info" in string or "debug" in string or "trace" in string: 99 | return False 100 | return True 101 | 102 | #0: <= 1 log in the block, 1: multiple logs at the same level, 2: multiple logs at different levels 103 | 104 | 105 | def get_level_id(log, current_level): 106 | log_level = re.findall(r'([^<]+)', log)[0] 107 | message = '-' 108 | if(re.findall(r'([^<]+)', log)): 109 | message = re.findall(r'([^<]+)', log)[0] 110 | if log_level == 'trace': 111 | level_id = 0 112 | elif log_level == 'debug': 113 | level_id = 1 114 | elif log_level == 'info': 115 | level_id = 2 116 | elif log_level == 'warn': 117 | level_id = 3 118 | elif log_level == 'error': 119 | level_id = 4 120 | else: 121 | level_id = 5 122 | if level_id > current_level: 123 | return level_id, message 124 | else: 125 | return current_level, message 126 | 127 | 128 | def get_level_name(level_id): 129 | if level_id == 0: 130 | return "trace" 131 | elif level_id == 1: 132 | return "debug" 133 | elif level_id == 2: 134 | return "info" 135 | elif level_id == 3: 136 | return "warn" 137 | elif level_id == 4: 138 | return "error" 139 | elif level_id == 5: 140 | return "fatal" 141 | else: 142 | return "unknown" 143 | 144 | def label_blocks(target_dict, loglines): 145 | for key, value in target_dict.items(): 146 | logged_flag = False 147 | #level id: 0 - trace, 1 - debug, 2 - info, 3 - warn, 4 - error, 5 - fatal 148 | level_id = 0 149 | message = '-' 150 | 
for log in loglines: 151 | log_class = get_classname(re.findall(r'([^<]+)', log)[0]) 152 | log_line = int(re.findall(r'([^<]+)', log)[0]) 153 | key_class = re.findall(r'([^<]+)', key)[0] 154 | key_start = int(re.findall(r'([^<]+)', key)[0]) 155 | key_end = int(re.findall(r'([^<]+)', key)[0]) 156 | if log_line >= key_start and log_line <= key_end and log_class == key_class: 157 | level_id, message = get_level_id(log, level_id) 158 | logged_flag = True 159 | if logged_flag == True: 160 | target_dict_logged[key] = value 161 | level_dict_logged[key]=get_level_name(level_id) 162 | message_dict_logged[key]= message 163 | else: 164 | target_dict_nonlogged[key] = value 165 | 166 | 167 | def get_methods_dict (node): # set the startline of the first node of a method as it's startline 168 | if node[1] in methods_dict: 169 | if int(methods_dict[node[1]]) > int(node[2]): 170 | methods_dict[node[1]] = node[2] 171 | else: 172 | methods_dict[node[1]] = node[2] 173 | 174 | 175 | def get_methods_lines (methods_dict): 176 | for key, value in methods_dict.items(): 177 | class_name = get_classname(key) 178 | if class_name in methods_lines: 179 | methods_lines[class_name].append(int(value)) 180 | else: 181 | methods_lines[class_name] = [] 182 | 183 | for key, value in methods_lines.items(): 184 | value.sort() 185 | #print (key) 186 | #print (value) 187 | 188 | 189 | def get_method_start_line_for_AST (class_name, start_line): 190 | method_start_line = int(start_line) 191 | memory_line = 1 192 | if methods_lines[class_name]: 193 | for v in methods_lines[class_name]: 194 | if int(v) >= int(start_line): 195 | #print (memory_line) 196 | return int(memory_line) 197 | else: 198 | memory_line = int(v) 199 | else: 200 | return int(method_start_line) 201 | 202 | 203 | if __name__=='__main__': 204 | 205 | ASTlines = read_AST_file(sys.argv[1]) 206 | loglines = read_logs(sys.argv[1]) 207 | 208 | ASTlists = parse_ASTlines(ASTlines) 209 | loglists = parse_Loglines(loglines) 210 | 211 | for astlist in ASTlists: 212 | get_methods_dict(astlist) 213 | #filter level-guard if statements 214 | ast_content = astlist[4].lower()[0:40] 215 | #for every AST line, 0: type, 1: location, 2: beginline, 3: endline, 4: content 216 | if astlist[0] in block_set and not_level_guard(ast_content): 217 | if astlist[1] in block_dict: 218 | if (astlist[2]) not in block_dict[astlist[1]]: 219 | block_dict[astlist[1]].append(int(astlist[2])) 220 | if (astlist[3]) not in block_dict[astlist[1]]: 221 | block_dict[astlist[1]].append(int(astlist[3])) 222 | 223 | else: 224 | block_dict[astlist[1]] = [] 225 | get_methods_lines(methods_dict) 226 | 227 | for key, value in block_dict.items(): 228 | value.sort() 229 | 230 | 231 | 232 | 233 | for key, value in block_dict.items(): 234 | for i in range (0, len(value)-1): 235 | dict_key = '' + get_classname(key) + '' + '' + str(value[i]) + '' + '' + str((value[i+1])-1) + '' 236 | target_dict[dict_key] = [] 237 | 238 | 239 | m_start_line = 0 240 | for key, value in target_dict.items(): 241 | class_name = re.findall(r'([^<]+)', key)[0] 242 | start_line = re.findall(r'([^<]+)', key)[0] 243 | m_start_line = get_method_start_line_for_AST(class_name, start_line) 244 | if m_start_line is not None: 245 | if int(m_start_line) == 1: 246 | m_start_line = start_line 247 | else: 248 | m_start_line = start_line 249 | 250 | end_line = re.findall(r'([^<]+)', key)[0] 251 | #print (key) 252 | for astlist in ASTlists: 253 | if astlist[0] not in syntactic_filter_set and int(astlist[2]) <= int(end_line) and int(astlist[2]) >= 
int(m_start_line) and class_name == get_classname(astlist[1]): 254 | if(if_log_line(astlist, loglists)==False): 255 | value.append(astlist[0]) 256 | 257 | 258 | 259 | label_blocks(target_dict, loglines) 260 | result_list_logged = [] 261 | for key, value in target_dict_logged.items(): 262 | result_list_logged.append([key, value, level_dict_logged[key], message_dict_logged[key]]) 263 | 264 | result_list_nonlogged = [] 265 | for key, value in target_dict_nonlogged.items(): 266 | result_list_nonlogged.append([key, value]) 267 | 268 | 269 | 270 | 271 | header_logged = ['Key', 'Values', 'Level', 'Message'] 272 | logged_dict_to_write=pd.DataFrame(columns=header_logged,data=result_list_logged) 273 | logged_dict_to_write.to_csv('blocks/logged_syn_' + sys.argv[1] + '.csv') 274 | 275 | 276 | -------------------------------------------------------------------------------- /src/Baselines/DeepLV/deepLV_cleaner.py: -------------------------------------------------------------------------------- 1 | import pandas 2 | import numpy as np 3 | import os 4 | import javalang 5 | from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction 6 | from rouge import Rouge 7 | import re 8 | import numpy as np 9 | from sklearn import metrics 10 | import pandas as pd 11 | 12 | def level_acc(classification_pred, classification_label) -> float: 13 | level_map = {'trace':0., 'debug':1., 'info':2., 'warn':3., 'error':4.} 14 | new_pred = [] 15 | new_label = [] 16 | length = len(classification_pred) 17 | for idx in range(length): 18 | predict = classification_pred[idx] 19 | label = classification_label[idx] 20 | if predict in level_map.keys() and label in level_map.keys(): 21 | pred_sum = level_map[predict] 22 | label_sum = level_map[label] 23 | new_pred.append(pred_sum) 24 | new_label.append(label_sum) 25 | matches = sum(x == y for x, y in zip(new_pred, new_label)) 26 | total_elements = len(new_pred) 27 | accuracy = matches / total_elements 28 | return accuracy 29 | 30 | def query_level(level: float) -> str: 31 | if level == 1.: 32 | return 'trace' 33 | elif level == 2.: 34 | return 'debug' 35 | elif level == 3.: 36 | return 'info' 37 | elif level == 4.: 38 | return 'warn' 39 | elif level == 5.: 40 | return 'error' 41 | else: 42 | return '' 43 | 44 | def aod(classification_pred, classification_label) -> float: 45 | level_map = {'trace':1., 'debug':2., 'info':3., 'warn':4., 'error':5.} 46 | max_distance = {'trace':4., 'debug':3., 'info':2., 'warn':3., 'error':4.} 47 | 48 | distance_sum = 0. 49 | noise = 0. 
50 | length = len(classification_pred) 51 | 52 | for idx in range(length): 53 | try: 54 | predict = classification_pred[idx] 55 | label = classification_label[idx] 56 | pred_sum = level_map[predict] 57 | label_sum = level_map[label] 58 | level = query_level(label_sum) 59 | _distance = abs(label_sum - pred_sum) 60 | distance_sum = distance_sum + (1 - _distance / max_distance[level]) 61 | except Exception as e: 62 | noise = noise+1 63 | aod = distance_sum / (length-noise) 64 | return aod 65 | 66 | def extract_quoted_strings(s): 67 | quoted_strings = re.findall(r'"([^"]*)"', s) 68 | " ".join(quoted_strings) 69 | remaining = re.sub(r'"[^"]*"', '', s) 70 | char_to_remove = ['+', ','] 71 | for char in char_to_remove: 72 | remaining = remaining.replace(char, '') 73 | var_list_origin = remaining.split(' ') 74 | var_list = [item for item in var_list_origin if (not item == ' ')] 75 | var_list = [item for item in var_list if item] 76 | return quoted_strings, var_list 77 | 78 | def extract_outer_brackets(s): 79 | stack = [] 80 | result = [] 81 | 82 | for m in re.finditer(r"[()]", s): 83 | char, pos = m.group(0), m.start(0) 84 | if char == "(": 85 | stack.append(pos) 86 | elif char == ")": 87 | if len(stack) == 1: 88 | result.append(s[stack.pop() + 1:pos]) 89 | else: 90 | stack.pop() 91 | return result 92 | 93 | def extract_level(statement): 94 | parts = statement.split('.') 95 | for part in parts: 96 | if '(' in part: 97 | level = part.split('(')[0] 98 | return level.strip() 99 | return '' 100 | 101 | 102 | 103 | def extract_text(statement): 104 | bracket_contents = extract_outer_brackets(statement) 105 | if bracket_contents: # Check if the list is not empty 106 | # Pass the first item (contents of the first set of brackets) to extract_quoted_strings 107 | quoted_strings, remaining = extract_quoted_strings(bracket_contents[0]) 108 | quoted_strings_combined = ' '.join(quoted_strings) 109 | return quoted_strings_combined 110 | else: 111 | return '' # Return an empty string if no brackets are found 112 | 113 | df = pd.read_csv('logbench.csv') 114 | df = df[df['Statement'].apply(lambda x: len(x.splitlines()) == 1)] 115 | df['level'] = df['Statement'].apply(extract_level) 116 | df['text'] = df['Statement'].apply(extract_text) 117 | 118 | 119 | df.to_csv('logbench_cleaned.csv', index=False) -------------------------------------------------------------------------------- /src/Baselines/Incoder/incoder.py: -------------------------------------------------------------------------------- 1 | import os 2 | import javalang 3 | import re 4 | from typing import List 5 | import torch 6 | import tokenizers 7 | from transformers import AutoModelForCausalLM, AutoTokenizer 8 | import json 9 | 10 | path = '' 11 | ground_truth_folder = '' 12 | 13 | def insert_text_to_java_file(file_name, line_number): 14 | with open(file_name, 'r') as file: 15 | lines = file.readlines() 16 | 17 | if line_number > len(lines): 18 | print("out of range") 19 | 20 | lines[line_number - 1] = lines[line_number - 1].rstrip() + '\n' 21 | 22 | with open(file_name, 'w') as file: 23 | file.writelines(lines) 24 | 25 | 26 | def extract_numbers(s): 27 | return re.findall(r'\d+', s) 28 | 29 | 30 | def parse_directory(dir_path, ground_truth_folder): 31 | for filename in os.listdir(dir_path): 32 | file_path = os.path.join(dir_path, filename) 33 | if os.path.isfile(file_path) and file_path.endswith('.java'): 34 | ground_truth_path = ground_truth_folder + file_path.split('/')[-1][:-5] + '_config.txt' 35 | try: 36 | with open(ground_truth_path) as f: 37 | 
lines = f.readlines() 38 | if len(lines) >= 1: 39 | line_number = int(extract_numbers(lines[0].strip(' ')[:-1])[0]) 40 | insert_text_to_java_file(file_path, line_number) 41 | except FileNotFoundError: 42 | pass 43 | elif os.path.isdir(file_path): 44 | parse_directory(file_path, ground_truth_folder) 45 | 46 | parse_directory(path,ground_truth_folder) 47 | # Data procession done. 48 | 49 | 50 | tokenizers_version = tuple(int(n) for n in tokenizers.__version__.split('.')) 51 | if tokenizers_version < (0, 12, 1): 52 | print("warning: Your tokenizers version looks old and you will likely have formatting issues. We recommend installing tokenizers >= 0.12.1") 53 | 54 | # set BIG_MODEL to use the 6.7B parameter model 55 | BIG_MODEL = True 56 | 57 | # use a GPU 58 | CUDA = True 59 | 60 | # print intermediate outputs of infilling 61 | VERBOSE = False 62 | 63 | if BIG_MODEL: 64 | model_name = "facebook/incoder-6B" 65 | if CUDA: 66 | kwargs = dict( 67 | revision="float16", 68 | torch_dtype=torch.float16, 69 | low_cpu_mem_usage=False, 70 | ) 71 | else: 72 | kwargs = dict( 73 | low_cpu_mem_usage=False, 74 | ) 75 | else: 76 | model_name = "facebook/incoder-1B" 77 | kwargs = {} 78 | 79 | print("loading model") 80 | model = AutoModelForCausalLM.from_pretrained(model_name, **kwargs) 81 | print("loading tokenizer") 82 | tokenizer = AutoTokenizer.from_pretrained(model_name) 83 | print("loading complete") 84 | 85 | if CUDA: 86 | # if you plan to fine-tune the model, you should not use half precision. 87 | model = model.half().cuda() 88 | 89 | # signals the start of a document 90 | BOS = "<|endoftext|>" 91 | # signals the end of a generated infill 92 | EOM = "<|endofmask|>" 93 | 94 | def make_sentinel(i): 95 | # signals (1) a location to insert an infill and (2) the start of the infill generation 96 | return f"<|mask:{i}|>" 97 | 98 | def generate(input: str, max_to_generate: int=128, temperature: float=0.2): 99 | """ 100 | Do standard left-to-right completion of the prefix `input` by sampling from the model 101 | """ 102 | input_ids = tokenizer(input, return_tensors="pt").input_ids 103 | if CUDA: 104 | input_ids = input_ids.cuda() 105 | max_length = max_to_generate + input_ids.flatten().size(0) 106 | if max_length > 2048: 107 | print("warning: max_length {} is greater than the context window {}".format(max_length, 2048)) 108 | with torch.no_grad(): 109 | output = model.generate(input_ids=input_ids, do_sample=True, top_p=0.95, temperature=temperature, max_length=max_length) 110 | # pass clean_up_tokenization_spaces=False to avoid removing spaces before punctuation, e.g. "from ." -> "from." 111 | detok_hypo_str = tokenizer.decode(output.flatten(), clean_up_tokenization_spaces=False) 112 | if detok_hypo_str.startswith(BOS): 113 | detok_hypo_str = detok_hypo_str[len(BOS):] 114 | return detok_hypo_str 115 | 116 | def infill(parts: List[str], max_to_generate: int=50, temperature: float=0.2, extra_sentinel: bool=True, max_retries: int=1): 117 | """ 118 | Generate infills to complete a partial document, e.g. 119 | [A C E] -> [A B C D E], where B and D are infills that have been generated. 120 | 121 | parts: List[str]. list of parts of the document. One string will be 122 | inserted in between each element, i.e. infilling N-1 locations for a list 123 | of length N. 124 | max_to_generate: int. maximum number of tokens to generate. Keep in mind 125 | that the model context size is 2048. 126 | temperature: float. temperature parameter for sampling. 127 | extra_sentinel: bool. 
we recommend setting this to True, as it makes it 128 | easier for the model to end generated infills. See the footnote in 129 | section 2.2 of our paper for details. 130 | max_retries: int. if > 1, use rejection sampling to keep sampling infills until 131 | all infills sample a completion token. 132 | 133 | returns a dictionary containing the following: 134 | text: str, the completed document (with infills inserted) 135 | parts: List[str], length N. Same as passed to the method 136 | infills: List[str], length N-1. The list of infills generated 137 | retries_attempted: number of retries used (if max_retries > 1) 138 | """ 139 | assert isinstance(parts, list) 140 | retries_attempted = 0 141 | done = False 142 | 143 | while (not done) and (retries_attempted < max_retries): 144 | retries_attempted += 1 145 | 146 | if VERBOSE: 147 | print(f"retry {retries_attempted}") 148 | 149 | ## (1) build the prompt 150 | if len(parts) == 1: 151 | prompt = parts[0] 152 | else: 153 | prompt = "" 154 | # encode parts separated by sentinel 155 | for sentinel_ix, part in enumerate(parts): 156 | prompt += part 157 | if extra_sentinel or (sentinel_ix < len(parts) - 1): 158 | prompt += make_sentinel(sentinel_ix) 159 | 160 | infills = [] 161 | complete = [] 162 | 163 | done = True 164 | 165 | ## (2) generate infills 166 | for sentinel_ix, part in enumerate(parts[:-1]): 167 | complete.append(part) 168 | prompt += make_sentinel(sentinel_ix) 169 | # TODO: this is inefficient as it requires re-encoding prefixes repeatedly 170 | completion = generate(prompt, max_to_generate, temperature) 171 | completion = completion[len(prompt):] 172 | if EOM not in completion: 173 | if VERBOSE: 174 | print(f"warning: {EOM} not found") 175 | completion += EOM 176 | done = False 177 | completion = completion[:completion.index(EOM) + len(EOM)] 178 | infilled = completion[:-len(EOM)] 179 | infills.append(infilled) 180 | complete.append(infilled) 181 | prompt += completion 182 | complete.append(parts[-1]) 183 | text = ''.join(complete) 184 | 185 | if VERBOSE: 186 | print("generated text:") 187 | print(prompt) 188 | print() 189 | print("parts:") 190 | print(parts) 191 | print() 192 | print("infills:") 193 | print(infills) 194 | print() 195 | print("restitched text:") 196 | print(text) 197 | print() 198 | 199 | return { 200 | 'text': text, # str, the completed document (with infills inserted) 201 | 'parts': parts, # List[str], length N. Same as passed to the method 202 | 'infills': infills, # List[str], length N-1. 
The list of infills generated 203 | 'retries_attempted': retries_attempted, # number of retries used (if max_retries > 1) 204 | } 205 | 206 | def docstring_to_code(code, max_to_generate=50, temperature=0.2): 207 | 208 | parts = code.split("<insert>") # assumes the infill location in the prepared file is marked with "<insert>" 209 | result = infill(parts, max_to_generate=max_to_generate, temperature=temperature) 210 | return result 211 | 212 | input_path = path 213 | output_path = '' 214 | 215 | if not os.path.exists(output_path): 216 | os.makedirs(output_path) 217 | 218 | for filename in os.listdir(input_path): 219 | if filename.endswith(".java"): 220 | print(filename) 221 | input_file_path = os.path.join(input_path, filename) 222 | 223 | with open(input_file_path, 'r', encoding='utf-8') as file: 224 | file_content = file.read() 225 | example = f"'''\\\n{file_content}\n'''" 226 | 227 | processed_content = docstring_to_code(example) 228 | 229 | output_file_path = os.path.join(output_path, filename) 230 | with open(output_file_path, 'w', encoding='utf-8') as output_file: 231 | for item in processed_content['infills']: 232 | output_file.write(f"{item}\n") 233 | -------------------------------------------------------------------------------- /src/Baselines/LoGenText-Plus/README.md: -------------------------------------------------------------------------------- 1 | # LoGenText-Plus 2 | 3 | The implementation of "LoGenText-Plus: Improving Neural Machine Translation-based Logging Texts Generation with Syntactic Templates" 4 | 5 | > This code and dataset are based on [Context-Aware Model on Fairseq](https://github.com/libeineu/Context-Aware) and [LoGenText](https://github.com/conf-202x/experimental-result). 6 | 7 | ## Requirements and Installation 8 | 9 | * Pytorch >= 1.5.1 10 | * Python version >= 3.6 11 | 12 | 1. `conda create --name <env> --file requirements.txt` 13 | 14 | ## Stage 1: template generation 15 | 16 | 17 | Note: `<root>` is the path to the replication package. 18 | 19 | ### Train and inference for templates 20 | 21 | > 1. Run the following command to start the pre-training: 22 | ``` 23 | cd <root>/code/template-gen/pre-train 24 | bash runs/pre-train.sh 25 | ``` 26 | 27 | 28 | > 2. Run the following command to train a basic model: 29 | ``` 30 | cd <root>/code/template-gen/basic-train 31 | bash runs/basic-train.sh <project> 32 | ``` 33 | `<project>` is the project name in lowercase, which can be activemq, ambari, etc. 34 | 35 | > 3. Run the following command to train and generate the templates for a certain `<project>`: 36 | ``` 37 | cd <root>/code/template-gen/ast-temp 38 | bash runs/temp-gen.sh <project> 39 | ``` 40 | `<project>` should be the same as the project in step 2, and the generated templates can be found in `saved_checkpoints/pre-ast-templete/<project>`. 41 | 42 | 43 | ## Stage 2: template-based logging text generation 44 | 45 | Note: `<root>` is the path to the replication package. 46 | 47 | ### Train and inference for logging texts 48 | 49 | > 1. Run the following command to start the pre-training: 50 | ``` 51 | cd <root>/code/logging-gen/pre-train 52 | bash runs/pre-train.sh 53 | ``` 54 | 55 | > 2. Run the following command to train a basic model: 56 | ``` 57 | cd <root>/code/logging-gen/basic-train 58 | bash runs/basic-train.sh <project> 59 | ``` 60 | `<project>` is the project name in lowercase, which can be activemq, ambari, etc. 61 | 62 | > 3. Run the following command to train and generate the logging texts for a certain `<project>`: 63 | ``` 64 | cd <root>/code/logging-gen/ast-temp 65 | bash runs/log-gen.sh <project> 66 | ``` 67 | `<project>` should be the same as the project in step 2, and the generated logging texts can be found in `translations/1/<project>`. 
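Putting the two stages together, the sketch below is a minimal, illustrative driver that runs both stages for every studied project. It assumes the project-specific `runs/*.sh` scripts accept the project name as their first command-line argument and that `<root>` is the path to the replication package; the actual scripts may be parameterized differently, so check them before relying on this.
```
# Hypothetical end-to-end driver for the two LoGenText-Plus stages.
# Assumption (not guaranteed by the replication package): each project-specific
# runs/*.sh script takes the project name as its first argument.
import subprocess

# Projects with results under results/1/ (extend with the remaining studied projects).
PROJECTS = ["activemq", "ambari", "brooklyn", "camel", "cloudstack", "hadoop", "hbase"]

# (working directory relative to <root>, script, whether it takes a <project> argument)
STEPS = [
    ("code/template-gen/pre-train",   "runs/pre-train.sh",   False),
    ("code/template-gen/basic-train", "runs/basic-train.sh", True),
    ("code/template-gen/ast-temp",    "runs/temp-gen.sh",    True),
    ("code/logging-gen/pre-train",    "runs/pre-train.sh",   False),
    ("code/logging-gen/basic-train",  "runs/basic-train.sh", True),
    ("code/logging-gen/ast-temp",     "runs/log-gen.sh",     True),
]

def run_pipeline(root):
    """Run template generation (stage 1) then logging text generation (stage 2)."""
    for cwd, script, per_project in STEPS:
        projects = PROJECTS if per_project else [None]
        for project in projects:
            cmd = ["bash", script] + ([project] if project else [])
            subprocess.run(cmd, cwd=f"{root}/{cwd}", check=True)

if __name__ == "__main__":
    run_pipeline("/path/to/replication-package")  # hypothetical <root>
```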
68 | 69 | ## Results 70 | 71 | The results can be found in the `results` folder, which is organized by project. 72 | 73 | ## Data 74 | 75 | The dataset can be found in the `dataset` folder, which is organized by project. It has the following structure: 76 | ``` 77 | dataset 78 | ├── <project> 79 | │   ├── dev.code.1.templete 80 | │   ├── dev.log 81 | │   ├── dev.log.1.templete 82 | │   ├── dev.pre-ast 83 | │   ├── test.code.1.templete 84 | │   ├── test.code.gen.ast.similar.1.templete 85 | │   ├── test.log 86 | │   ├── test.log.1.templete 87 | │   ├── test.pre-ast 88 | │   ├── train.code.1.templete 89 | │   ├── train.log 90 | │   ├── train.log.1.templete 91 | │   └── train.pre-ast 92 | ``` 93 | - `<project>` is one of the studied projects, such as `activemq`. 94 | - `train/dev/test.log` are the files containing the extracted `logging texts` (the target sequences). 95 | - `train/dev/test.pre-ast` are the files containing the `ASTs` context. 96 | - `train/dev/test.code.1.templete` are the files containing `pre-log code + template from logging text in similar code`. 97 | - `train/dev/test.log.1.templete` are the files containing the template extracted from the `logging text`. 98 | - `test.code.gen.ast.similar.1.templete` is the file containing the `pre-log code + predicted template`. -------------------------------------------------------------------------------- /src/Baselines/LoGenText-Plus/requirements.txt: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | # $ conda create --name <env> --file <this file> 3 | # platform: linux-64 4 | _libgcc_mutex=0.1=main 5 | blas=1.0=mkl 6 | brotlipy=0.7.0=py38h27cfd23_1003 7 | ca-certificates=2022.4.26=h06a4308_0 8 | certifi=2022.5.18.1=py38h06a4308_0 9 | cffi=1.14.4=pypi_0 10 | charset-normalizer=2.0.4=pyhd3eb1b0_0 11 | click=7.1.2=pypi_0 12 | cryptography=37.0.1=py38h9ce1e76_0 13 | cudatoolkit=10.1.243=h6bb024c_0 14 | cycler=0.10.0=pypi_0 15 | freetype=2.10.4=h5ab3b9f_0 16 | idna=3.3=pyhd3eb1b0_0 17 | intel-openmp=2020.2=254 18 | joblib=1.0.0=pypi_0 19 | jpeg=9b=h024ee3a_2 20 | kiwisolver=1.3.1=pypi_0 21 | lcms2=2.11=h396b838_0 22 | ld_impl_linux-64=2.33.1=h53a641e_7 23 | libedit=3.1.20191231=h14c3975_1 24 | libffi=3.3=he6710b0_2 25 | libgcc-ng=9.1.0=hdf63c60_0 26 | libpng=1.6.37=hbc83047_0 27 | libprotobuf=3.19.1=h4ff587b_0 28 | libstdcxx-ng=9.1.0=hdf63c60_0 29 | libtiff=4.1.0=h2733197_1 30 | lz4-c=1.9.3=h2531618_0 31 | matplotlib=3.3.4=pypi_0 32 | mkl=2020.2=256 33 | mkl-service=2.3.0=py38he904b0f_0 34 | mkl_fft=1.2.0=py38h23d657b_0 35 | mkl_random=1.1.1=py38h0573a6f_0 36 | ncurses=6.2=he6710b0_1 37 | ninja=1.10.2=py38hff7bd54_0 38 | numpy=1.19.2=py38h54aff64_0 39 | numpy-base=1.19.2=py38hfa32c7d_0 40 | olefile=0.46=py_0 41 | openssl=1.1.1o=h7f8727e_0 42 | pillow=8.1.0=py38he98fc37_0 43 | pip=20.3.3=py38h06a4308_0 44 | portalocker=2.2.0=pypi_0 45 | protobuf=3.19.1=py38h295c915_0 46 | pycparser=2.20=pypi_0 47 | pyopenssl=22.0.0=pyhd3eb1b0_0 48 | pyparsing=2.4.7=pypi_0 49 | pysocks=1.7.1=py38h06a4308_0 50 | python=3.8.5=h7579374_1 51 | python-dateutil=2.8.1=pypi_0 52 | pytorch=1.5.1=py3.8_cuda10.1.243_cudnn7.6.3_0 53 | readline=8.1=h27cfd23_0 54 | regex=2020.11.13=pypi_0 55 | requests=2.27.1=pyhd3eb1b0_0 56 | sacrebleu=1.5.0=pypi_0 57 | sacremoses=0.0.43=pypi_0 58 | setuptools=52.0.0=py38h06a4308_0 59 | six=1.15.0=py38h06a4308_0 60 | sqlite=3.33.0=h62c20be_0 61 | subword-nmt=0.3.7=pypi_0 62 | tensorboardx=2.2=pyhd3eb1b0_0 63 | tk=8.6.10=hbc83047_0 64 | torchvision=0.6.1=py38_cu101 65 | 
tqdm=4.56.0=pypi_0 66 | urllib3=1.26.9=py38h06a4308_0 67 | wheel=0.36.2=pyhd3eb1b0_0 68 | xz=5.2.5=h7b6447c_0 69 | zlib=1.2.11=h7b6447c_3 70 | zstd=1.4.5=h9ceee32_0 71 | -------------------------------------------------------------------------------- /src/Baselines/LoGenText-Plus/results/1/activemq/translation.context.test: -------------------------------------------------------------------------------- 1 | expiring connection to zookeeper vid 2 | unsubscribing next messages 3 | async dispose interrupted 4 | stopping async topic tasks 5 | error with selector vid 6 | network connection between vid and vid has been established 7 | unknown command vid 8 | vid failed to resetting to batch 9 | virtual consumer added vid for virtual destination vid 10 | shutdown of topic traffic generator failed 11 | store commit failed 12 | failed to create persistence adapter vid 13 | destination is full vid 14 | setting optimized out of vid to vid 15 | async read check was rejected from the executor 16 | tempdest vid 17 | redelivery of vid stopped 18 | exception on forwarding to non existent temp dest 19 | vid ignoring sub vid on vid from vid is no longer active 20 | reconnected to vid 21 | detected missing corrupt journal files dropped vid messages from the index in vid seconds 22 | attempting to acquire the exclusive lock to become the master broker 23 | broker plugin vid started 24 | checkpoint failed 25 | vid flushtodisk done vid ms vid 26 | failed to fire bridge for vid 27 | stopping broker vid 28 | sasl vid handshake complete 29 | error receiving message vid this exception is ignored 30 | sendreq vid 31 | endpoint vid will not receive any messages due to broker zero error vid 32 | xa resource manager vid 33 | do not know how to process activemq command vid 34 | sending to vid messages from vid to vid 35 | vid message sent to vid 36 | zero length partial vid 37 | message not found in sequence id index vid 38 | unknown datastruction vid 39 | message received since last read check resetting flag vid 40 | sendreq vid 41 | vid could not find the object rename for vid 42 | failed to unregister mbean vid 43 | periodic checkpoint failed 44 | error unsubscribing vid from vid vid 45 | creating producer vid message vid 46 | network bridge could not be registered in jmx vid 47 | slow kahadb access journal append took vid ms index 48 | waiting vid ms before attempting to reconnect 49 | prepare of vid failed because it was marked rollback only 50 | received an exception but connection is ignored vid 51 | could not correlate the connection vid 52 | vid ms before attempting to reconnect to vid 53 | failed to get durable subscription vid 54 | unable to read persisted selector cache it will be ignored 55 | get destinations returned empty list 56 | caught an exception trying to determine if there is no flag 57 | vid usage manager memory limit reached vid producers will be throttled to the rate at vid 58 | setting durable subscriber to vid 59 | work rejected vid 60 | reusing an active session vid 61 | thread does not hold the context lock on close of vid 62 | creating producer to vid 63 | vid elapsed time in second vid s 64 | recovery mode trying to reconnect to zero 65 | could not apply query parameters vid to vid 66 | producer vid with non persistent delivery 67 | failed to call after delivery 68 | failed to register mbean vid 69 | master lock retry sleep interrupted 70 | vid ms elapsed since last write check 71 | vid remove request on vid from vid vid matching sub vid 72 | vid attempting to acquire exclusive lease to 
become the master 73 | async start of vid 74 | vid no set batch from sequence id set vid 75 | connector removed with uri vid 76 | corrupt journal record unexpected exception on journal replay of location vid 77 | apache activemq vid vid vid 78 | auto transport newconnectionexecutor didn t cleanly 79 | assigned vid to consumer vid 80 | setting topic vid to vid 81 | no queue named vid 82 | could not connect to local uri vid vid 83 | closed socket vid 84 | locker keepalive resulted in 85 | failure reason 86 | notified failover transport vid of interruption completion 87 | failed to initialize local connection for the jmsconnector 88 | timeout waiting for echo service shutdown 89 | trace entry vid 90 | failed to remove consumer on connection vid 91 | xa transaction rollback vid 92 | bridge was disposed before the first vid 93 | interrupted while redelivery 94 | unsubscribing durable journal 95 | sending to vid messages to vid 96 | removing consumer vid 97 | attempting to acquire the exclusive lock to become the master broker 98 | not adding to dlq vid to vid 99 | trying to build a pooledconnectionfactory 100 | sampler interrupted 101 | vid received message vid 102 | failed to close connection vid 103 | failed to accept accept for vid 104 | rolled back vid messages from the index in vid seconds 105 | error occured while processing vid 106 | unexpected local exception vid 107 | vid end of vid with vid 108 | master lock retry sleep interrupted 109 | message not found in sequence id index vid 110 | failed to deliver remove command for destination vid 111 | vid removed from scheduler vid 112 | installing discarding dead letter queue broker plugin dropall vid dropall vid 113 | failed to create object name to unregister vid 114 | vid vid ms elapsed since last write check 115 | failed to send mqtt subscription vid 116 | connector vid started 117 | session vid has more work to do b c of unconsumed 118 | could not transfer the template file to journal transferfile vid 119 | exception occurred for client vid vid processing vid 120 | async error occurred vid 121 | executing sql vid 122 | msg vid id vid destinationname vid 123 | failed to unregister mbean vid 124 | forcing shutdown of executorservice vid 125 | failed to prepare xaresource vid 126 | the remote exception was vid 127 | committing user vid 128 | amqp header arrived invalid version vid 129 | message expired vid 130 | error on queueing the ack compactor 131 | failed to load vid 132 | failed to unregister mbean vid 133 | vid recovered prepared vid 134 | vid ignoring destination vid restricted to vid network hops only 135 | journalled transacted acknowledge for vid at vid 136 | async exception with no exception listener vid 137 | could not preallocate journal file with zeros 138 | unable to unregister subscription vid 139 | attempting to acquire vid 140 | failed to remove scheduler vid 141 | starting a network connection between vid ms 142 | could not create transportlogger reason vid 143 | mqtt client vid connected version vid 144 | get peer broker index vid 145 | vid performance vid to vid 146 | vid ignoring destination vid restricted to vid network hops only 147 | transportloggerfactory could not be started reason vid 148 | received null command from url vid 149 | sending message to vid client vid 150 | last update vid full gc candidates set vid 151 | failed to call getplatformmbeanserver due to 152 | can t use property vid which is of type vid value 153 | policy not applied error processing object addition for addition of vid 154 | executing 
sql vid 155 | failed to write to scheduler vid 156 | rollback processing error 157 | cleanup removing the data 158 | could not connect to local uri vid vid 159 | starting network connection between vid and vid has been established 160 | failed to lookup the broker from vid 161 | vid ms elapsed and vid consumers subscribed starting dispatch 162 | waiting for outstanding responses to be properly 163 | thread using classloader vid 164 | unknown command vid 165 | stopped recover next messages 166 | vid failed to lease sleeping for vid milli s before trying again 167 | recovery replayed vid operations from the journal 168 | scope vid 169 | failed to register mbean vid 170 | exception occurred for client vid vid processing vid vid 171 | removed scheduled job vid 172 | shutting down test echo service 173 | connector not registered for uuid vid 174 | failed to send command vid 175 | connector stopped stopping proxy 176 | exception on dispatch to browser vid 177 | add exception was raised while executing the run command for oncomplete 178 | start failure exception 179 | the type vid should end with to be a valid discovery type 180 | continuation vid expired vid 181 | suppressing duplicate message send vid 182 | opening new cause 183 | no log writer available for vid 184 | starting to synchronously receive vid messages 185 | vid matching remote vid 186 | failed to unregister mbean vid 187 | load of vid 188 | running clientid vid 189 | failed to aquire lock 190 | adding destination vid 191 | restore consumer vid in pull mode pending recovery overriding prefetch vid 192 | rar vid stopped or undeployed recovery 193 | job scheduler store checkpoint complete 194 | connected to zookeeper 195 | endpoint vid failed to process message reason 196 | the type vid should end with to be a discovery type 197 | invoking start on vid 198 | policy not applied user vid does not have name attribute vid under entry vid 199 | master lock retry sleep interrupted 200 | forwarding of acks in journal file vid 201 | creating temporary file vid 202 | received_exception vid 203 | shutdown of executorservice vid is shutdown vid and terminated vid took vid 204 | async connection timeout task was rejected from the executor 205 | mqtt client vid established heart beat of vid ms vid ms grace period 206 | caught exception in mainloop 207 | exceeded redelivery with count vid ack vid 208 | ignoring consumerinfo vid from vid vid 209 | no connection attempt made in time for vid throwing inactivityioexception 210 | -------------------------------------------------------------------------------- /src/Baselines/LoGenText-Plus/results/1/activemq/translation.context.test.log: -------------------------------------------------------------------------------- 1 | Namespace(beam=8, cpu=False, data=['data-bin/context'], diverse_beam_groups=-1, diverse_beam_strength=0.5, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, gen_subset='test', lazy_load=False, left_pad_context='True', left_pad_source='True', left_pad_target='False', lenpen=2.5, log_format=None, log_interval=1000, match_source_len=False, max_context_positions=1024, max_len_a=0, max_len_b=100, max_sentences=32, max_source_positions=1024, max_target_positions=1024, max_tokens=None, memory_efficient_fp16=False, min_len=3.0, min_loss_scale=0.0001, model_overrides='{}', nbest=1, no_beamable_mm=False, no_early_stop=False, no_progress_bar=False, no_repeat_ngram_size=0, num_shards=1, num_workers=0, output='translations/1/activemq/translation.context.test.unsort', 
path='/home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/activemq/checkpoint_last.pt', prefix_size=0, print_alignment=False, quiet=True, raw_text=False, remove_bpe='@@ ', replace_unk=None, required_batch_size_multiple=8, results_path=None, sacrebleu=True, sampling=False, sampling_temperature=1, sampling_topk=-1, score_reference=False, seed=1, shard_id=0, skip_invalid_size_inputs_valid_test=True, source_lang=None, target_lang=None, task='translation_context', tensorboard_logdir='', threshold_loss_scale=None, unkpen=0, unnormalized=False, upsample_primary=1, user_dir=None) 2 | | [code] dictionary: 1080 types 3 | | [log] dictionary: 1080 types 4 | | data-bin/context test 209 examples 5 | | ['data-bin/context'] test 209 examples 6 | | loading model(s) from /home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/activemq/checkpoint_last.pt 7 | | Translated 209 sentences (2201 tokens) in 3.2s (64.63 sentences/s, 680.59 tokens/s) 8 | | Generate test with beam=8: BLEU = 26.58 46.1/28.3/21.8/18.7 (BP = 0.985 ratio = 0.985 hyp_len = 1299 ref_len = 1319) 9 | -------------------------------------------------------------------------------- /src/Baselines/LoGenText-Plus/results/1/activemq/translation.context.test.unsort: -------------------------------------------------------------------------------- 1 | 51 vid ms before attempting to reconnect to vid 2 | 144 vid performance vid to vid 3 | 128 message expired vid 4 | 105 unexpected local exception vid 5 | 31 xa resource manager vid 6 | 187 running clientid vid 7 | 79 setting topic vid to vid 8 | 92 interrupted while redelivery 9 | 138 attempting to acquire vid 10 | 4 error with selector vid 11 | 109 failed to deliver remove command for destination vid 12 | 5 network connection between vid and vid has been established 13 | 97 not adding to dlq vid to vid 14 | 26 stopping broker vid 15 | 86 failed to initialize local connection for the jmsconnector 16 | 179 continuation vid expired vid 17 | 90 xa transaction rollback vid 18 | 161 waiting for outstanding responses to be properly 19 | 201 received_exception vid 20 | 39 sendreq vid 21 | 102 failed to accept accept for vid 22 | 156 cleanup removing the data 23 | 58 work rejected vid 24 | 22 broker plugin vid started 25 | 29 sendreq vid 26 | 72 async start of vid 27 | 21 attempting to acquire the exclusive lock to become the master broker 28 | 200 creating temporary file vid 29 | 158 starting network connection between vid and vid has been established 30 | 96 attempting to acquire the exclusive lock to become the master broker 31 | 181 opening new cause 32 | 140 starting a network connection between vid ms 33 | 2 async dispose interrupted 34 | 41 failed to unregister mbean vid 35 | 59 reusing an active session vid 36 | 205 caught exception in mainloop 37 | 16 redelivery of vid stopped 38 | 111 installing discarding dead letter queue broker plugin dropall vid dropall vid 39 | 193 connected to zookeeper 40 | 185 failed to unregister mbean vid 41 | 154 failed to write to scheduler vid 42 | 114 failed to send mqtt subscription vid 43 | 54 get destinations returned empty list 44 | 99 sampler interrupted 45 | 183 starting to synchronously receive vid messages 46 | 91 bridge was disposed before the first vid 47 | 142 mqtt client vid connected version vid 48 | 13 setting optimized out of vid to vid 49 | 195 the type vid should end with to be a discovery type 50 | 153 executing sql vid 51 | 208 
no connection attempt made in time for vid throwing inactivityioexception 52 | 139 failed to remove scheduler vid 53 | 189 adding destination vid 54 | 47 waiting vid ms before attempting to reconnect 55 | 95 removing consumer vid 56 | 115 connector vid started 57 | 24 vid flushtodisk done vid ms vid 58 | 124 failed to prepare xaresource vid 59 | 71 vid attempting to acquire exclusive lease to become the master 60 | 67 failed to register mbean vid 61 | 172 connector not registered for uuid vid 62 | 57 setting durable subscriber to vid 63 | 196 invoking start on vid 64 | 113 vid vid ms elapsed since last write check 65 | 7 vid failed to resetting to batch 66 | 10 store commit failed 67 | 49 received an exception but connection is ignored vid 68 | 170 removed scheduled job vid 69 | 45 network bridge could not be registered in jmx vid 70 | 70 vid remove request on vid from vid vid matching sub vid 71 | 178 the type vid should end with to be a valid discovery type 72 | 80 no queue named vid 73 | 122 failed to unregister mbean vid 74 | 52 failed to get durable subscription vid 75 | 83 locker keepalive resulted in 76 | 42 periodic checkpoint failed 77 | 120 executing sql vid 78 | 186 load of vid 79 | 44 creating producer vid message vid 80 | 87 timeout waiting for echo service shutdown 81 | 148 sending message to vid client vid 82 | 112 failed to create object name to unregister vid 83 | 85 notified failover transport vid of interruption completion 84 | 135 async exception with no exception listener vid 85 | 174 connector stopped stopping proxy 86 | 162 thread using classloader vid 87 | 46 slow kahadb access journal append took vid ms index 88 | 125 the remote exception was vid 89 | 63 recovery mode trying to reconnect to zero 90 | 53 unable to read persisted selector cache it will be ignored 91 | 188 failed to aquire lock 92 | 101 failed to close connection vid 93 | 76 apache activemq vid vid vid 94 | 15 tempdest vid 95 | 61 creating producer to vid 96 | 12 destination is full vid 97 | 0 expiring connection to zookeeper vid 98 | 203 async connection timeout task was rejected from the executor 99 | 130 failed to load vid 100 | 146 transportloggerfactory could not be started reason vid 101 | 25 failed to fire bridge for vid 102 | 137 unable to unregister subscription vid 103 | 194 endpoint vid failed to process message reason 104 | 11 failed to create persistence adapter vid 105 | 74 connector removed with uri vid 106 | 77 auto transport newconnectionexecutor didn t cleanly 107 | 199 forwarding of acks in journal file vid 108 | 127 amqp header arrived invalid version vid 109 | 104 error occured while processing vid 110 | 147 received null command from url vid 111 | 143 get peer broker index vid 112 | 100 vid received message vid 113 | 136 could not preallocate journal file with zeros 114 | 171 shutting down test echo service 115 | 93 unsubscribing durable journal 116 | 117 could not transfer the template file to journal transferfile vid 117 | 191 rar vid stopped or undeployed recovery 118 | 176 add exception was raised while executing the run command for oncomplete 119 | 126 committing user vid 120 | 19 reconnected to vid 121 | 159 failed to lookup the broker from vid 122 | 38 message received since last read check resetting flag vid 123 | 106 vid end of vid with vid 124 | 65 producer vid with non persistent delivery 125 | 37 unknown datastruction vid 126 | 155 rollback processing error 127 | 64 could not apply query parameters vid to vid 128 | 60 thread does not hold the context lock on close of 
vid 129 | 27 sasl vid handshake complete 130 | 184 vid matching remote vid 131 | 75 corrupt journal record unexpected exception on journal replay of location vid 132 | 207 ignoring consumerinfo vid from vid vid 133 | 206 exceeded redelivery with count vid ack vid 134 | 197 policy not applied user vid does not have name attribute vid under entry vid 135 | 40 vid could not find the object rename for vid 136 | 89 failed to remove consumer on connection vid 137 | 14 async read check was rejected from the executor 138 | 182 no log writer available for vid 139 | 163 unknown command vid 140 | 150 failed to call getplatformmbeanserver due to 141 | 118 exception occurred for client vid vid processing vid 142 | 73 vid no set batch from sequence id set vid 143 | 6 unknown command vid 144 | 141 could not create transportlogger reason vid 145 | 134 journalled transacted acknowledge for vid at vid 146 | 18 vid ignoring sub vid on vid from vid is no longer active 147 | 84 failure reason 148 | 48 prepare of vid failed because it was marked rollback only 149 | 110 vid removed from scheduler vid 150 | 123 forcing shutdown of executorservice vid 151 | 55 caught an exception trying to determine if there is no flag 152 | 132 vid recovered prepared vid 153 | 168 failed to register mbean vid 154 | 173 failed to send command vid 155 | 28 error receiving message vid this exception is ignored 156 | 9 shutdown of topic traffic generator failed 157 | 169 exception occurred for client vid vid processing vid vid 158 | 34 vid message sent to vid 159 | 152 policy not applied error processing object addition for addition of vid 160 | 1 unsubscribing next messages 161 | 202 shutdown of executorservice vid is shutdown vid and terminated vid took vid 162 | 108 message not found in sequence id index vid 163 | 167 scope vid 164 | 68 master lock retry sleep interrupted 165 | 204 mqtt client vid established heart beat of vid ms vid ms grace period 166 | 198 master lock retry sleep interrupted 167 | 32 do not know how to process activemq command vid 168 | 66 failed to call after delivery 169 | 3 stopping async topic tasks 170 | 160 vid ms elapsed and vid consumers subscribed starting dispatch 171 | 157 could not connect to local uri vid vid 172 | 36 message not found in sequence id index vid 173 | 56 vid usage manager memory limit reached vid producers will be throttled to the rate at vid 174 | 107 master lock retry sleep interrupted 175 | 81 could not connect to local uri vid vid 176 | 151 can t use property vid which is of type vid value 177 | 131 failed to unregister mbean vid 178 | 166 recovery replayed vid operations from the journal 179 | 165 vid failed to lease sleeping for vid milli s before trying again 180 | 116 session vid has more work to do b c of unconsumed 181 | 149 last update vid full gc candidates set vid 182 | 78 assigned vid to consumer vid 183 | 164 stopped recover next messages 184 | 180 suppressing duplicate message send vid 185 | 69 vid ms elapsed since last write check 186 | 119 async error occurred vid 187 | 17 exception on forwarding to non existent temp dest 188 | 98 trying to build a pooledconnectionfactory 189 | 175 exception on dispatch to browser vid 190 | 82 closed socket vid 191 | 62 vid elapsed time in second vid s 192 | 190 restore consumer vid in pull mode pending recovery overriding prefetch vid 193 | 129 error on queueing the ack compactor 194 | 20 detected missing corrupt journal files dropped vid messages from the index in vid seconds 195 | 121 msg vid id vid destinationname vid 196 | 88 
trace entry vid 197 | 103 rolled back vid messages from the index in vid seconds 198 | 35 zero length partial vid 199 | 43 error unsubscribing vid from vid vid 200 | 23 checkpoint failed 201 | 145 vid ignoring destination vid restricted to vid network hops only 202 | 177 start failure exception 203 | 33 sending to vid messages from vid to vid 204 | 50 could not correlate the connection vid 205 | 94 sending to vid messages to vid 206 | 133 vid ignoring destination vid restricted to vid network hops only 207 | 192 job scheduler store checkpoint complete 208 | 30 endpoint vid will not receive any messages due to broker zero error vid 209 | 8 virtual consumer added vid for virtual destination vid 210 | -------------------------------------------------------------------------------- /src/Baselines/LoGenText-Plus/results/1/ambari/translation.context.test.log: -------------------------------------------------------------------------------- 1 | Namespace(beam=8, cpu=False, data=['data-bin/context'], diverse_beam_groups=-1, diverse_beam_strength=0.5, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, gen_subset='test', lazy_load=False, left_pad_context='True', left_pad_source='True', left_pad_target='False', lenpen=2.5, log_format=None, log_interval=1000, match_source_len=False, max_context_positions=1024, max_len_a=0, max_len_b=100, max_sentences=32, max_source_positions=1024, max_target_positions=1024, max_tokens=None, memory_efficient_fp16=False, min_len=3.0, min_loss_scale=0.0001, model_overrides='{}', nbest=1, no_beamable_mm=False, no_early_stop=False, no_progress_bar=False, no_repeat_ngram_size=0, num_shards=1, num_workers=0, output='translations/1/ambari/translation.context.test.unsort', path='/home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/ambari/checkpoint_last.pt', prefix_size=0, print_alignment=False, quiet=True, raw_text=False, remove_bpe='@@ ', replace_unk=None, required_batch_size_multiple=8, results_path=None, sacrebleu=True, sampling=False, sampling_temperature=1, sampling_topk=-1, score_reference=False, seed=1, shard_id=0, skip_invalid_size_inputs_valid_test=True, source_lang=None, target_lang=None, task='translation_context', tensorboard_logdir='', threshold_loss_scale=None, unkpen=0, unnormalized=False, upsample_primary=1, user_dir=None) 2 | | [code] dictionary: 1080 types 3 | | [log] dictionary: 1080 types 4 | | data-bin/context test 365 examples 5 | | ['data-bin/context'] test 365 examples 6 | | loading model(s) from /home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/ambari/checkpoint_last.pt 7 | | Translated 365 sentences (4566 tokens) in 6.8s (53.74 sentences/s, 672.23 tokens/s) 8 | | Generate test with beam=8: BLEU = 25.50 46.4/28.2/20.9/16.7 (BP = 0.982 ratio = 0.982 hyp_len = 2683 ref_len = 2733) 9 | -------------------------------------------------------------------------------- /src/Baselines/LoGenText-Plus/results/1/brooklyn/translation.context.test: -------------------------------------------------------------------------------- 1 | problem persisting but no longer running ignoring 2 | unable to create policy spec for vid vid 3 | management node vid in vid seconds waiting for persistence to write all data continuing 4 | unable to instantiate vid vid rethrowing vid 5 | vid invoking sensor vid on vid with vid 6 | vid no port available for vid empty range vid 7 | vid rethrowing 8 | user vid not authorized to 
see sensor vid of entity vid excluding from current state results 9 | determined reachability of sockets vid vid 10 | vid health check for vid component continuing recovering vid 11 | unable again to find details of location vid in rest call to list ignoring location vid 12 | multiple ambiguous definitions for config key vid on vid with vid 13 | authentication successful vid 14 | retrieving java url vid from vid 15 | publishing management node health vid 16 | first server up in vid is vid 17 | using first reachable address vid for node vid in vid 18 | unable to instantiate vid vid 19 | cancelled vid tasks for vid vid 20 | can t calculate percentage value for entity vid as total from producer vid is zero 21 | destroying app vid mgmt is vid 22 | cannot set key vid on vid from flag vid containing class is not configurable 23 | error resizing but no longer running vid 24 | failed to resolve aws hostname of vid rethrowing 25 | parsing values for vid at vid vid 26 | invoking effector vid on vid with args vid 27 | vid recording removal of container vid 28 | vid redundant call to start vid skipping vid 29 | for vid considering members vid 30 | getrequiredopenports detected at vid vid 31 | credentials have no effect in builder unless uri for host is specified 32 | missing icon data for vid expected at vid already logged warn and error details 33 | checkpointing delta of memento with references updating vid entities vid locations vid policies vid enrichers vid catalog items vid bundles removing vid 34 | vid n plan being added is n vid n plan already present is n vid 35 | configuration error vid 36 | primary node vid is deprecated use vid instead use vid 37 | deprecated use of managementcontext for unmanaged vid ignoring 38 | loaded java type vid for vid vid but had errors vid 39 | installing vid from vid on vid 40 | missing catalog item for vid vid inferring as vid because that is able to load the item 41 | reconfiguring vid config file for vid because vid is not on vid 42 | conflicting value for key vid from deprecated name vid using earlier deprecated name vid 43 | deserializing the non static class vid with multiple outer class fields vid when changing compilers it s possible that the instance won t be able to be deserialized due to changed outer class field names in those cases deserialization could fail with field not found exception or class cast exception following this log line 44 | item vid cannot be moved skipping 45 | launched brooklyn vid 46 | replacing in vid member vid with old address vid new address vid 47 | adding to vid vid appears identical to existing vid may get removed on rebind underlying addition should be modified so it is not added twice 48 | error adding brooklyn properties for vid vid 49 | error copying customize resources 50 | problem polling for async script on vid for vid continuing 51 | resize vid from vid to vid 52 | tmpdirfinder candidate tmp dir vid cannot have files created inside it vid 53 | loaded rebind raw data took vid vid entities vid locations vid policies vid enrichers vid feeds vid catalog items vid bundles from vid 54 | changing hostname recorded against public ip vid from vid 55 | executing vid failed with class vid 56 | using ssh tool vid of type vid props 57 | subsequent error during termination vid 58 | failed rest deployment launching vid vid 59 | rebindriver for vid is not transforming machine location so not generating machine vid vid 60 | management node vid in vid new plane unable to promote to vid currently vid see log for vid 61 | error 
destroying vid ignoring vid 62 | detail on failure to deploy webapp vid 63 | policy vid balancing finished at cold node vid workrate number no way to improve it 64 | problem persisting change delta rethrowing 65 | group vid got new member vid 66 | vid recording metric update for item vid 67 | brooklynsecurityproviderfilterjavax dofilter caught vid 68 | problem in ha poller but no longer running vid 69 | cannot request read only mode for vid when already running vid ignoring 70 | rebindmanager instantiate vid rethrowing vid 71 | vid check for vid continuing failing vid 72 | cannot get hostname bug with string vid for vid ignoring 73 | failed to set permissions to vid for file vid 74 | fallback super realclass vid attempt failed orig class vid vid 75 | location vid added to vid 76 | success following serialized for vid vid 77 | running shell command at vid vid 78 | discouraged use of brooklyn properties deprecated use vid instead use vid 79 | error calculating and setting combination for enricher vid 80 | cassandra nics inferred ip vid for vid 81 | policy vid balancing finished at cold node vid workrate number no way to improve it 82 | jclouds using template vid options vid to provision machine in vid 83 | initiating replica set with vid 84 | deprecated use of brooklyn custom brooklyn properties for vid 85 | vid publishing failed state vid currentfailurestarttime vid now vid 86 | skipping configuration of non ec2 computeservice vid 87 | rebinding entity vid even though actual state is vid expected state is vid 88 | starting entity vid at vid 89 | no portforwardmanager using legacy vid 90 | geodns inferred geoinfo vid from hostname vid 91 | deprecated use of scanjavaannotations instead use of vid version syntax in future versions to load vid 92 | error rebinding brooklyn web console rebinding 93 | seeds considered stable for cluster vid node vid 94 | expected to find two security groups on node vid in app vid one shared one unique found vid vid 95 | queued task vid rethrowing vid 96 | error forcing brooklyn gc usage now vid 97 | vid adding children to vid n vid 98 | item vid cannot be moved skipping 99 | unable to create from archive returning vid 100 | resolution of vid failed swallowing and returning vid 101 | queued task vid of vid no longer running vid 102 | disconnecting sshjtool vid vid 103 | brooklyn geo info lookup failed for vid 104 | cors brooklyn fee disabled 105 | context entity found by looking at target vid entity tag not context entity 106 | multiple definitions for effector vid on vid ignoring vid 107 | network facing enricher not transforming vid uri vid because no port in target vid for vid 108 | copying chunk vid to vid on vid 109 | bundle vid containing bom is not managed by brooklyn using legacy item installation 110 | deprecated use of name key to define vid version should be specified within id key or with version key not this tag 111 | vid ports not applicable or not yet applicable because has multiple locations vid ignoring 112 | invoking vid on vid in vid 113 | can t infer catalog item id from the following plan n vid 114 | uninstalling bundle vid from brooklyn ui module bundle location vid 115 | members of vid checking vid eliminating because not member 116 | vid added to machine vid of location vid vid 117 | error stopping child continuing and will rethrow if no other errors 118 | multiple definitions for config key vid on vid from vid and vid preferring lower vid value vid 119 | cancelling vid mode vid on vid 120 | uninstalling bundle vid from brooklyn managed 
bundle vid n vid 121 | failed to unmanage entity vid and its descendants after failure to initialise rethrowing original exception 122 | could not determine canonical name of file vid returning original file 123 | no maven resource file vid available 124 | vid clearing ssh for vid 125 | scheduling item for persistence addition vid 126 | error computing geo info for vid internet issues or too many requests to free servers for vid subsequent errors for vid 127 | network facing enricher not transforming vid uri vid because no port mapping for vid 128 | failed to set permissions to vid for file vid expected behaviour on windows vid subsequent failures on any file will be logged at trace 129 | trace for quarantine group vid failed to start entity vid removing vid 130 | osgi could not find bundle vid in search after installing it from vid 131 | two masters detected probably a handover just occured vid 132 | launching vid members of vid now vid 133 | installing image regex to vid for vid 134 | flagutils for vid setting field vid val vid newval vid key vid 135 | vid undeploying vid vid on vid 136 | running command at vid vid 137 | vid recording addition of container vid 138 | brooklynsecurityproviderfilterjavax start 139 | theoretical best primary at vid vid maybe others not available using next best vid 140 | formula configured vid 141 | error creating uri for vid rethrowing vid 142 | validation done in vid 143 | vid scheduling but no longer running vid 144 | members of vid checking vid eliminating because not up 145 | resource vid type vid deployed to vid 146 | cannot notifyofinitialvalue for subscription with value vid 147 | creating customizing vid for vid 148 | create shell command at vid 149 | no reachable address vid feed from vid to vid 150 | activating local management for vid on start 151 | sethostnamecustomizer ignoring machine vid in vid 152 | while starting vid obtained new location instance vid 153 | managing vid in mode vid doing this recursively because a child is preregistered 154 | problem setting application lifecycle usage event vid vid 155 | vid closing pool for vid 156 | autodeployment in parent s management context triggered for vid vid will not be supported in future explicit manage call required 157 | child spec vid is already set with parent vid how did this happen 158 | found existing shared security group in vid for app vid vid 159 | found namespace vid returning it 160 | skipping ssh check for vid vid due to config waitforconnectable vid 161 | failed transfer vid to vid retryable error attempt vid vid vid 162 | error stopping brooklynweb console rethrowing 163 | starting entity vid at vid 164 | could not register external ui module vid vid 165 | service vid could not be parsed at vid vid 166 | discouraged deprecated use of static annotated effector method vid defined in vid 167 | unable again to find details of location vid in rest call to list ignoring location vid 168 | vid pre start management of entity vid mode vid 169 | releasing machine vid in vid instance id vid 170 | problem releasing machine vid propagating after vid vid 171 | this management node vid supposed to be master but reportedly unhealthy no op as expect other node to fix self vid 172 | rebind entity vid no longer running vid 173 | fallback loadclass vid attempt failed orig class vid vid 174 | configuring brooklynnode entity startup 175 | no location has been set on vid cannot configure security groups in context vid 176 | sequence for vid incremented to vid 177 | updating brooklyn properties from vid 
178 | jmx jar for vid is not a valid jmx on vid because no jmx 179 | deprecated automatic coercion of object to timeduration set breakpoint in typecoercions to inspect convert to duration 180 | referenced task for vid vid 181 | adding auto generated user vid vid in vid 182 | suspending machine vid in vid instance id vid 183 | forcing catalog load on access of catalog items 184 | misconfiguration for vid sslconfig vid but no https_port on vid 185 | vid detected item removal on change of vid 186 | rescheduling addition of shard vid because add failed via router vid 187 | destroyed and unmanaged vid mgmt now vid managed vid 188 | problem deleting temporary files of async script on vid ignoring 189 | stopped read only vid mgmt vid 190 | vault response code vid vid 191 | problem terminiating management node state listeners continuing 192 | removing from vid member vid with old address vid because inferred address is now null 193 | machine details for vid missing from jclouds using ssh test instead name vid version vid arch vid ram vid cpus vid 194 | formula configured vid 195 | started brooklyn rest server at vid vid 196 | geodns vid refreshing vid 197 | policy vid detected vid should be on vid but can t move it vid 198 | unable to instantiate vid rethrowing vid 199 | system bundles are vid 200 | error in enricher vid but no longer running vid 201 | creating brooklyn local copy of bundle file vid 202 | vid resizing vid from vid to vid vid 203 | discouraged deprecated use of brooklynproperties for vid instead vid 204 | brooklyn gc deleted vid tasks as was over global limit now have vid 205 | custom password rebind for vid vid 206 | error launching brooklyn items from node vid ignoring vid 207 | ignoring failed execution of task callback hook vid because executor is shutdown 208 | failed to resolve aws hostname of vid vid 209 | installing vid with exit code vid 210 | isfirstnodeset but no cluster members found to add vid 211 | cannot store location lifecycle usage event for vid state vid because storage not available 212 | looking up vid in osgi 213 | standard location resolvers not installed location resolution will fail shortly 214 | restarting entity vid in vid machine vid 215 | done vid checkentity vid 216 | unable to delete one or more paths vid on shutdown vid 217 | launching vid with role vid and source of attempt to vid with role vid and vid but no unmanaged 218 | parent not found discarding following original ring for vid 219 | loading initial catalog from vid 220 | vid invoking effector on vid effector vid parameters vid 221 | queueing update needed task for vid update will occur shortly 222 | adding startup script to enable winrm for windows vm on vid 223 | brooklyn thought it was already managing bundle vid but it s not installed to framework 224 | vid invoking effector vid on vid with vid which is the target vid 225 | error running mongodb script vid at vid 226 | creating zookeeper using custom spec for vid 227 | repeating problem vid but no longer active ignoring 228 | releasing machine vid in vid instance id vid ignoring and continuing vid vid 229 | deleting temporary token for vid with version vid 230 | invalid item in catalog when converting rest catalog item type vid 231 | deletion of orphan state found unusually referenced feeds keeping vid 232 | looking up external classpath for vid 233 | vid calculated desired pool size vid from vid to vid 234 | error launching brooklyn vid 235 | unable to re connect to jmx url vid vid 236 | problem notifying listener vid of vid 237 | vm vid 
connection succeeded after vid on vid 238 | tmpdirfinder candidate tmp dir vid cannot have files created inside it vid 239 | error recording monitor info vid 240 | ignoring flag open_iptables on non ssh location vid 241 | task vid was modified but modification was never used 242 | long poll retrieving status directly received exit status will retry on vid for vid 243 | vid picking up vid as the tracker already set often due to rebind 244 | multiple definitions for effector vid on vid preferring lower vid to vid 245 | deprecated use of entities startmanagement application managementcontext for vid ignoring vid 246 | vid set on vid but pollforfirstreachableaddress vid 247 | use of groovy lang closure is deprecated in basicsubscriptioncontext subscribe 248 | restarting brooklyn machine in vid instance id vid 249 | theoretical best primary at vid vid maybe others not available at vid 250 | ignoring deprecated flag open_iptables on windows location vid 251 | error polling for vid command vid 252 | knifeportuseknifedefault specified to vid when already told to use vid explicitly overriding previous see subsequent warning for more details 253 | vid recording pool size vid for vid 254 | use of groovy lang closure is deprecated in type vid 255 | catalog does not contain item for type vid loaded class directly instead 256 | for vid considering membership of vid which is in locations vid 257 | looking for vid in revised location vid 258 | delaying vid vid allowed vid elapsed then rechecking for vid ms 259 | rest request running as vid threw vid 260 | mysampleimpl init with config vid 261 | localhost obtainport vid returning vid 262 | fabric vid updating seeds chosen vid potential vid 263 | geodns including vid even though vid is a private subnet homeless ential vid 264 | ignoring userdatastring vid in vm creation because not supported for cloud type vid 265 | management node vid detected master change required newmaster vid oldmaster vid plane vid heartbeattimeout vid 266 | vid can t configure resolver at vid no sshmachines 267 | brooklyn management context for vid vid 268 | rebinding addition of memento vid vid 269 | starting entity vid at vid 270 | cancelled vid tasks for vid with vid remaining of vid vid 271 | enricher vid transforming vid to vid 272 | had to wait vid for vid vid to be true before setting vid 273 | resizing vid to vid proxy vid of vid 274 | change handler should be hidden by event handler trace for unexpected mongo node handler 275 | bundle vid matches metadata of managed bundle vid but not osgi bundle location vid and matches already installed osgi bundle is no op 276 | ignoring mode vid in favour of port for management candidates of vid vid 277 | unexpected structure for state module vid skipping vid vid 278 | queued task vid at context vid no hierarchy 279 | effector vid defined on vid has no body invoking caller supplied vid instead 280 | ambiguous spec supertypes vid for target vid it is recommended that any registered type constraint for a spec be compatible with the sions 281 | restart of vid requested be applied at machine level 282 | -------------------------------------------------------------------------------- /src/Baselines/LoGenText-Plus/results/1/brooklyn/translation.context.test.log: -------------------------------------------------------------------------------- 1 | Namespace(beam=8, cpu=False, data=['data-bin/context'], diverse_beam_groups=-1, diverse_beam_strength=0.5, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, 
gen_subset='test', lazy_load=False, left_pad_context='True', left_pad_source='True', left_pad_target='False', lenpen=2.5, log_format=None, log_interval=1000, match_source_len=False, max_context_positions=1024, max_len_a=0, max_len_b=100, max_sentences=32, max_source_positions=1024, max_target_positions=1024, max_tokens=None, memory_efficient_fp16=False, min_len=3.0, min_loss_scale=0.0001, model_overrides='{}', nbest=1, no_beamable_mm=False, no_early_stop=False, no_progress_bar=False, no_repeat_ngram_size=0, num_shards=1, num_workers=0, output='translations/1/brooklyn/translation.context.test.unsort', path='/home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/brooklyn/checkpoint_last.pt', prefix_size=0, print_alignment=False, quiet=True, raw_text=False, remove_bpe='@@ ', replace_unk=None, required_batch_size_multiple=8, results_path=None, sacrebleu=True, sampling=False, sampling_temperature=1, sampling_topk=-1, score_reference=False, seed=1, shard_id=0, skip_invalid_size_inputs_valid_test=True, source_lang=None, target_lang=None, task='translation_context', tensorboard_logdir='', threshold_loss_scale=None, unkpen=0, unnormalized=False, upsample_primary=1, user_dir=None) 2 | | [code] dictionary: 1080 types 3 | | [log] dictionary: 1080 types 4 | | data-bin/context test 281 examples 5 | | ['data-bin/context'] test 281 examples 6 | | loading model(s) from /home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/brooklyn/checkpoint_last.pt 7 | | Translated 281 sentences (4829 tokens) in 7.5s (37.59 sentences/s, 646.06 tokens/s) 8 | | Generate test with beam=8: BLEU = 31.22 51.6/32.5/25.6/22.1 (BP = 1.000 ratio = 1.014 hyp_len = 2680 ref_len = 2642) 9 | -------------------------------------------------------------------------------- /src/Baselines/LoGenText-Plus/results/1/camel/translation.context.test.log: -------------------------------------------------------------------------------- 1 | Namespace(beam=8, cpu=False, data=['data-bin/context'], diverse_beam_groups=-1, diverse_beam_strength=0.5, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, gen_subset='test', lazy_load=False, left_pad_context='True', left_pad_source='True', left_pad_target='False', lenpen=2.5, log_format=None, log_interval=1000, match_source_len=False, max_context_positions=1024, max_len_a=0, max_len_b=100, max_sentences=32, max_source_positions=1024, max_target_positions=1024, max_tokens=None, memory_efficient_fp16=False, min_len=3.0, min_loss_scale=0.0001, model_overrides='{}', nbest=1, no_beamable_mm=False, no_early_stop=False, no_progress_bar=False, no_repeat_ngram_size=0, num_shards=1, num_workers=0, output='translations/1/camel/translation.context.test.unsort', path='/home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/camel/checkpoint_last.pt', prefix_size=0, print_alignment=False, quiet=True, raw_text=False, remove_bpe='@@ ', replace_unk=None, required_batch_size_multiple=8, results_path=None, sacrebleu=True, sampling=False, sampling_temperature=1, sampling_topk=-1, score_reference=False, seed=1, shard_id=0, skip_invalid_size_inputs_valid_test=True, source_lang=None, target_lang=None, task='translation_context', tensorboard_logdir='', threshold_loss_scale=None, unkpen=0, unnormalized=False, upsample_primary=1, user_dir=None) 2 | | [code] dictionary: 1080 types 3 | | [log] dictionary: 1080 types 4 | | 
data-bin/context test 637 examples 5 | | ['data-bin/context'] test 637 examples 6 | | loading model(s) from /home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/camel/checkpoint_last.pt 7 | | Translated 637 sentences (7331 tokens) in 9.5s (67.14 sentences/s, 772.64 tokens/s) 8 | | Generate test with beam=8: BLEU = 40.05 59.9/45.1/39.6/37.3 (BP = 0.896 ratio = 0.901 hyp_len = 4093 ref_len = 4543) 9 | -------------------------------------------------------------------------------- /src/Baselines/LoGenText-Plus/results/1/cloudstack/translation.context.test.log: -------------------------------------------------------------------------------- 1 | Namespace(beam=8, cpu=False, data=['data-bin/context'], diverse_beam_groups=-1, diverse_beam_strength=0.5, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, gen_subset='test', lazy_load=False, left_pad_context='True', left_pad_source='True', left_pad_target='False', lenpen=2.5, log_format=None, log_interval=1000, match_source_len=False, max_context_positions=1024, max_len_a=0, max_len_b=100, max_sentences=32, max_source_positions=1024, max_target_positions=1024, max_tokens=None, memory_efficient_fp16=False, min_len=3.0, min_loss_scale=0.0001, model_overrides='{}', nbest=1, no_beamable_mm=False, no_early_stop=False, no_progress_bar=False, no_repeat_ngram_size=0, num_shards=1, num_workers=0, output='translations/1/cloudstack/translation.context.test.unsort', path='/home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/cloudstack/checkpoint_last.pt', prefix_size=0, print_alignment=False, quiet=True, raw_text=False, remove_bpe='@@ ', replace_unk=None, required_batch_size_multiple=8, results_path=None, sacrebleu=True, sampling=False, sampling_temperature=1, sampling_topk=-1, score_reference=False, seed=1, shard_id=0, skip_invalid_size_inputs_valid_test=True, source_lang=None, target_lang=None, task='translation_context', tensorboard_logdir='', threshold_loss_scale=None, unkpen=0, unnormalized=False, upsample_primary=1, user_dir=None) 2 | | [code] dictionary: 1080 types 3 | | [log] dictionary: 1080 types 4 | | data-bin/context test 1061 examples 5 | | ['data-bin/context'] test 1061 examples 6 | | loading model(s) from /home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/cloudstack/checkpoint_last.pt 7 | | Translated 1061 sentences (13432 tokens) in 20.5s (51.64 sentences/s, 653.81 tokens/s) 8 | | Generate test with beam=8: BLEU = 34.95 53.6/38.9/31.9/27.8 (BP = 0.948 ratio = 0.949 hyp_len = 8344 ref_len = 8789) 9 | -------------------------------------------------------------------------------- /src/Baselines/LoGenText-Plus/results/1/hadoop/translation.context.test.log: -------------------------------------------------------------------------------- 1 | Namespace(beam=8, cpu=False, data=['data-bin/context'], diverse_beam_groups=-1, diverse_beam_strength=0.5, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, gen_subset='test', lazy_load=False, left_pad_context='True', left_pad_source='True', left_pad_target='False', lenpen=2.5, log_format=None, log_interval=1000, match_source_len=False, max_context_positions=1024, max_len_a=0, max_len_b=100, max_sentences=32, max_source_positions=1024, max_target_positions=1024, max_tokens=None, memory_efficient_fp16=False, min_len=3.0, min_loss_scale=0.0001, 
model_overrides='{}', nbest=1, no_beamable_mm=False, no_early_stop=False, no_progress_bar=False, no_repeat_ngram_size=0, num_shards=1, num_workers=0, output='translations/1/hadoop/translation.context.test.unsort', path='/home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/hadoop/checkpoint_last.pt', prefix_size=0, print_alignment=False, quiet=True, raw_text=False, remove_bpe='@@ ', replace_unk=None, required_batch_size_multiple=8, results_path=None, sacrebleu=True, sampling=False, sampling_temperature=1, sampling_topk=-1, score_reference=False, seed=1, shard_id=0, skip_invalid_size_inputs_valid_test=True, source_lang=None, target_lang=None, task='translation_context', tensorboard_logdir='', threshold_loss_scale=None, unkpen=0, unnormalized=False, upsample_primary=1, user_dir=None) 2 | | [code] dictionary: 1080 types 3 | | [log] dictionary: 1080 types 4 | | data-bin/context test 1127 examples 5 | | ['data-bin/context'] test 1127 examples 6 | | loading model(s) from /home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/hadoop/checkpoint_last.pt 7 | | Translated 1127 sentences (13134 tokens) in 19.2s (58.77 sentences/s, 684.85 tokens/s) 8 | | Generate test with beam=8: BLEU = 23.79 46.1/28.0/22.0/19.1 (BP = 0.877 ratio = 0.884 hyp_len = 7660 ref_len = 8664) 9 | -------------------------------------------------------------------------------- /src/Baselines/LoGenText-Plus/results/1/hbase/translation.context.test.log: -------------------------------------------------------------------------------- 1 | Namespace(beam=8, cpu=False, data=['data-bin/context'], diverse_beam_groups=-1, diverse_beam_strength=0.5, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, gen_subset='test', lazy_load=False, left_pad_context='True', left_pad_source='True', left_pad_target='False', lenpen=2.5, log_format=None, log_interval=1000, match_source_len=False, max_context_positions=1024, max_len_a=0, max_len_b=100, max_sentences=32, max_source_positions=1024, max_target_positions=1024, max_tokens=None, memory_efficient_fp16=False, min_len=3.0, min_loss_scale=0.0001, model_overrides='{}', nbest=1, no_beamable_mm=False, no_early_stop=False, no_progress_bar=False, no_repeat_ngram_size=0, num_shards=1, num_workers=0, output='translations/1/hbase/translation.context.test.unsort', path='/home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/hbase/checkpoint_last.pt', prefix_size=0, print_alignment=False, quiet=True, raw_text=False, remove_bpe='@@ ', replace_unk=None, required_batch_size_multiple=8, results_path=None, sacrebleu=True, sampling=False, sampling_temperature=1, sampling_topk=-1, score_reference=False, seed=1, shard_id=0, skip_invalid_size_inputs_valid_test=True, source_lang=None, target_lang=None, task='translation_context', tensorboard_logdir='', threshold_loss_scale=None, unkpen=0, unnormalized=False, upsample_primary=1, user_dir=None) 2 | | [code] dictionary: 1080 types 3 | | [log] dictionary: 1080 types 4 | | data-bin/context test 507 examples 5 | | ['data-bin/context'] test 507 examples 6 | | loading model(s) from /home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/hbase/checkpoint_last.pt 7 | | Translated 507 sentences (5988 tokens) in 9.1s (55.81 sentences/s, 659.12 tokens/s) 8 | | Generate test with beam=8: BLEU = 23.73 
45.2/27.9/21.7/17.7 (BP = 0.899 ratio = 0.904 hyp_len = 3583 ref_len = 3964) 9 | -------------------------------------------------------------------------------- /src/Baselines/LoGenText-Plus/results/1/hive/translation.context.test.log: -------------------------------------------------------------------------------- 1 | Namespace(beam=8, cpu=False, data=['data-bin/context'], diverse_beam_groups=-1, diverse_beam_strength=0.5, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, gen_subset='test', lazy_load=False, left_pad_context='True', left_pad_source='True', left_pad_target='False', lenpen=2.5, log_format=None, log_interval=1000, match_source_len=False, max_context_positions=1024, max_len_a=0, max_len_b=100, max_sentences=32, max_source_positions=1024, max_target_positions=1024, max_tokens=None, memory_efficient_fp16=False, min_len=3.0, min_loss_scale=0.0001, model_overrides='{}', nbest=1, no_beamable_mm=False, no_early_stop=False, no_progress_bar=False, no_repeat_ngram_size=0, num_shards=1, num_workers=0, output='translations/1/hive/translation.context.test.unsort', path='/home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/hive/checkpoint_last.pt', prefix_size=0, print_alignment=False, quiet=True, raw_text=False, remove_bpe='@@ ', replace_unk=None, required_batch_size_multiple=8, results_path=None, sacrebleu=True, sampling=False, sampling_temperature=1, sampling_topk=-1, score_reference=False, seed=1, shard_id=0, skip_invalid_size_inputs_valid_test=True, source_lang=None, target_lang=None, task='translation_context', tensorboard_logdir='', threshold_loss_scale=None, unkpen=0, unnormalized=False, upsample_primary=1, user_dir=None) 2 | | [code] dictionary: 1080 types 3 | | [log] dictionary: 1080 types 4 | | data-bin/context test 629 examples 5 | | ['data-bin/context'] test 629 examples 6 | | loading model(s) from /home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/hive/checkpoint_last.pt 7 | | Translated 629 sentences (6861 tokens) in 10.8s (58.47 sentences/s, 637.76 tokens/s) 8 | | Generate test with beam=8: BLEU = 30.25 48.9/33.0/27.9/24.4 (BP = 0.934 ratio = 0.936 hyp_len = 3898 ref_len = 4163) 9 | -------------------------------------------------------------------------------- /src/Baselines/LoGenText-Plus/results/1/ignite/translation.context.test: -------------------------------------------------------------------------------- 1 | received session attribute request nodeid vid msg vid 2 | stopping spi vid 3 | overriding partition map in full update map exchid vid curpart vid newpart vid 4 | received session attribute request message msg vid nodeid vid 5 | ipfinder 6 | cassandra session refreshed 7 | will move session to less loaded worker ses vid from vid to vid 8 | baseline won t be changed cause the lost partitions were detected 9 | partition changed state grp vid p vid prev vid to vid 10 | got removed entry in lockasync method will retry vid 11 | node connection is idle but there are unacknowledged messages will wait vid 12 | refresh partitions due to topology update 13 | starting loading model by the path vid 14 | preserving deployment without node participants vid 15 | xa resource start xid vid xid vid 16 | obsolete version was not set because lock was explicit vid 17 | failed to connect to ignite update server vid 18 | generated node_joined bulk event nodecnt vid evtnode vid 19 | mvcc coordinator issued topology version 
for service vid fut vid 20 | processing node departure vid 21 | refresh partitions due to mapping was changed 22 | received job execution request while stopping will ignore vid 23 | failed to add entry err vid entry vid 24 | sent peer class loading response to node node does not exist vid 25 | attempt to execute cassandra batch vid operation to process rest vid of vid elements 26 | failed to find future for dht finish response txid vid node vid res vid 27 | acquired deployment class from local cache vid 28 | updating full partition map grp vid exchver vid fullmap vid 29 | communication problem resolver detected job cancelled nodeid vid 30 | failed to get future result 31 | got removed entry in transaction getallasync will retry vid 32 | restored near prepare from node vid 33 | async response resp vid 34 | skipped discovery notification node vid type vid topver vid 35 | ignore communication error resolve message resolve process already started sndnode vid 36 | ignore communication error resolver forced nodes stop reqid vid locnode vid 37 | failed to add candidates because entry was removed will renew 38 | maxconntimeout 39 | cleared invalid entry from remote transaction will skip entry vid tx vid 40 | vid xid version uuid vid 41 | store remove key vid tx vid 42 | received dht finish response txid vid dhttxid vid node vid 43 | removed mapping for node nodeid vid tx vid 44 | offheap remove key vid 45 | cassandra session refreshed 46 | committed from tm locnodeid vid tx vid 47 | partition map after beforeexchange grp vid exchid vid fullmap vid 48 | merge exchange future exchfut vid mergedfut vid evt vid evtnode vid evtnodeclient vid 49 | successfully locked persistence storage folder 50 | received near lock response for unknown future txid vid node vid 51 | failed to find future for get response sender vid res vid 52 | failed to find client message worker clientnode vid 53 | shmemport 54 | coordinator failed node is new coordinator ver vid 55 | found unacknowledged batch for left node nodeid vid fut vid 56 | restored partition state from wal grp vid p vid state vid updcntr vid 57 | deactivate page store manager id vid topver vid 58 | updated cache entry val vid old vid entry vid 59 | failed to send initial demand request to node 60 | deployment cannot be reused random class could not be loaded from sender node dep vid meta vid 61 | get affinity from cache vid key vid val vid 62 | failed to unswap entry 63 | scanner processor started 64 | boot class path vid 65 | failed to unlock key all partition nodes left the grid 66 | vid view caches 67 | failed to find class probably due to task job cancellation name vid err vid 68 | vid used cache groups id to name vid 69 | vid has been interrupted 70 | node is stopped or lock is broken in non failover safe mode aborting transaction 71 | injecting cache store session 72 | vid view information in a cluster 73 | message has been sent to next node msg vid next vid 74 | got removed entry while updating near value will retry vid 75 | awscredentials 76 | stopped closure processor 77 | message has been sent to node nodeid vid msg vid 78 | started moving ses vid 79 | failed to find class protocol vid 80 | injected task resources continuous query vid 81 | use vid option to disable it 82 | i am modified job_1 vid on vid 83 | cassandra table vid cause appropriate keyspace doesn t exist 84 | delete entries from db cache vid keytype vid cnt vid 85 | can t initialize query string vid 86 | finished range check range vid pos vid 87 | vid truststore_type vid 88 | cleaner 
has been cancelled 89 | received remove lock request for removed entry will retry entry vid req vid 90 | failed to send partition update to node left the grid 91 | received metrics update message from unknown node vid 92 | after vid release vid 93 | unregistering mbean vid 94 | error when polling event queue 95 | received duplicate continuous query message vid 96 | received schema propose discovery message but cache is statically configured and vid flag is set will report error opid vid msg vid 97 | sent peer class loading request node vid req vid 98 | discarding node add finished message join process is not finished vid 99 | removed message set due to node leaving grid vid 100 | encrypted data status vid handshakestaus vid ses vid 101 | received near prepare from node that left txid vid node vid 102 | partition has been scheduled for eviction this node is oldest non affinity node grp vid p vid prevstate vid 103 | failed to send tx update response node left msg vid node vid 104 | failed to send dht finish response node left txid vid dhttxid vid node vid 105 | discarding node added message with empty topology vid 106 | failed to send message to node msg vid err vid 107 | waiting for handshake buffer vid 108 | coordinator received single message ver vid node vid allreceived vid 109 | closing connection locnodeid vid rmtaddr vid rmtport vid 110 | finished executing job processor onkernalstop callback 111 | baseline won t be changed in topology 112 | failed to notify exchange future callback for exchange future vid 113 | rolling back ignite transaction vid 114 | opened input stream path vid delegate vid 115 | failed to cancel service ignoring name vid execid vid 116 | new resources vid 117 | i am modified job_1 vid on vid 118 | discarding reconnect message reconnect is completed vid 119 | failed to acquire lock with negative node vid 120 | flushing shuffle messages before sending task completion notification taskinfo vid state vid err vid 121 | skipping global authentication for node security credentials not found probably due to coordinator has older version nodeid vid addrs vid 122 | stealing job to a new node newnode vid oldnode vid sesid vid job vid jobctx vid task vid 123 | partition states after afterexchange grp vid exchver vid 124 | failed to find count down latch with worker vid 125 | cleared invalid entry from remote transaction will skip entry vid tx vid 126 | injecting cache store session vid 127 | abandoning re map because future is done vid 128 | partition map before afterexchange exchid vid fullmap vid 129 | unexpected response to join request vid 130 | sent cache message msg vid node vid 131 | received unexpected response to join request vid 132 | jdbc drivers folder has no files returning empty list 133 | transaction was not found in nodes 134 | vid label vid 135 | message is ignored as it came for the closed topic vid 136 | invalid transaction state for rollback state vid tx vid 137 | completed fragmentizer coordinator remote node vid 138 | finished running ssl engine tasks handshakestatus vid 139 | duplicate initialize process request received will ignore vid 140 | closing socket to next not sent vid 141 | ipc io stopping as unused vid 142 | non loopback local ips vid 143 | failed to restore closed connection reconnect networktimeout vid jointimeout vid 144 | discarding metrics update message issued by node node is no more coordinator vid 145 | ignoring backup element row vid cachemode vid incbackups vid primary vid 146 | tuple id vid from storm vid 147 | skipping own 
directory vid 148 | received near prepare response txid vid node vid 149 | failed during partition counters delivery to remote node left cluster will ignore futid vid node vid 150 | got removed entry while updating will retry vid 151 | mqtt grid vid 152 | i am modified job_1 vid on vid 153 | received communication error resolve request nodeid vid req vid 154 | application vid is vid 155 | failed to send multicast address request will retry in ms vid 156 | runtime error caught during initial demand request sending 157 | added new daemon node to topology vid 158 | failed to send checkpoint message to node msg vid err vid 159 | discarding killed join vid 160 | closing zookeeper ip finder 161 | failed to send verified node left message to node msg vid 162 | node left topology vid 163 | unknown connection detected is some other software connecting to this ignite port vid connection vid rmtaddr vid 164 | removing left node from full map update grp vid nodeid vid partmap vid 165 | sent job request client disconnected node vid taskname vid 166 | dht lock fut failed to send request txid vid dhttxid vid intx vid node vid 167 | prepared statement cluster error detected another thread already first 168 | ignite node is in invalid state due to a critical failure 169 | partition map after beforeexchange grp vid exchid vid fullmap vid 170 | skipping deployment check as remote node does not have required class vid 171 | timed out waiting for lock response vid 172 | vid node id vid 173 | received job cancel stopped callback 174 | received onundeploy request ldriver vid 175 | initialized alive zookeeper ip finder vid 176 | added invalid partition to future invalidparts vid 177 | load cache vid key vid val vid 178 | caught malformed url exception vid 179 | entry clear key vid entry vid val vid 180 | write entries to db cache vid keytype vid cnt vid 181 | return lastinitializedfut for topology ready future ver vid fut vid 182 | got removed entry when adding lock will retry vid 183 | received shuffle ack desc vid msg vid 184 | total number of jobs to be stolen vid 185 | will move session to less loaded worker ses vid msg vid 186 | found duplicate future in futures map will not add vid 187 | offer not sufficient for slave request vid 188 | gc worker has been started 189 | skipping rebalancing partition state is not moving vid p vid 190 | waiting for coordinator initialization will retry vid 191 | sent near finish response for completed tx txid vid dhttxid vid node vid 192 | failed to send partition update to node because it left grid will ignore node vid msg vid 193 | jobs to reject count jobstoreject vid jobs vid 194 | received dht lock response txid vid dhttxid vid node vid 195 | received user finish request jobid vid ses vid 196 | handshake response from local node vid 197 | starvationinc 198 | updating full partition map grp vid exchver vid fullmap vid 199 | unregistered spi mbean vid 200 | control utility has completed execution at vid 201 | put from load cache vid key vid val vid 202 | completing topology ready future right away head vid topver vid 203 | one model training time was vid 204 | stopped port processor 205 | command vid finished with code vid 206 | received data load request vid 207 | message has been sent to address msg vid locnodeid vid 208 | ignoring entry for partition that does not belong key true val false 209 | failed to stop distributed node vid 210 | bytes sockch vid cnt vid 211 | starting loading model by the path vid 212 | initializing cache store 213 | check before retry node 
already created vid 214 | discarding node left message join process is not finished vid 215 | partitions have been scheduled to resend reason node vid 216 | ignore affinity change message lastaffver vid exchver vid msgver vid 217 | file has been concurrently deleted vid 218 | ignoring entry for partition that does not belong key true val true err false 219 | got removed entry in lockasync method will retry vid 220 | node version to set vid 221 | got removed entry in lockasync method will retry vid 222 | failed to communication error resolve diagnostic with additional information vid 223 | sent near finish response txid vid dhttxid vid node vid 224 | attempted to remove lock on removed entry will retry rmvver vid entry vid 225 | opened igfs output stream for file append igfsname vid path vid streamid vid ses vid 226 | undeployed class loader as there are no participating nodes vid 227 | partition states after afterexchange grp vid exchver vid states vid 228 | mbean for metric registry vid can t be created 229 | other nodes not found 230 | got removed entry while processing get response will not retry 231 | vid view information in a cluster 232 | failed to get entry version msg vid 233 | vid mapping type vid 234 | creating db table with index 235 | grid load balancing spi vid 236 | failed to read classpath resource vid 237 | vid used cache groups id to name vid 238 | received data load response vid nodeid vid res vid 239 | metric registry not found registry vid 240 | initialized connection with remote vid node nodeid vid rmtaddr vid 241 | received data load response vid 242 | received dht finish response txid vid dhttxid vid node vid 243 | write dump file vid 244 | started range vid pos vid 245 | started services deployment future init localnode vid 246 | discarding node failed message sent from node which is about to fail vid 247 | new coordinator sends request ver vid node vid 248 | failed to perform operation 249 | sending partition update to node because it left grid will ignore node vid msg vid 250 | failed to find node added message node vid 251 | idle_verify is still running processed vid of vid local partitions 252 | synchronization aftercompletion status_status vid 253 | skipping dump page history due to can not reserve wal segments vid 254 | skipping alive node vid 255 | completing future vid 256 | received incoming connection when already connected to this node 257 | localportrange addr vid rmtport vid 258 | finished restoring partition state for local groups groupsprocessed vid time vid ms 259 | created new meta with updated participants vid 260 | default values 261 | updated metadata on server node holder vid changedschemas vid 262 | acquired deployment class after verifying other class 263 | delay alive nodes change process max event threshold reached newevts vid totalevts vid 264 | client creation failed addr vid err vid 265 | put after update cache vid key vid val vid success vid 266 | coordinator received single message ver vid node vid allreceived vid 267 | failed to add candidates because entry was removed will renew 268 | vid addresses vid 269 | dfltpri 270 | undeployed class loader as there are no participating nodes vid 271 | check failed message has been ignored msg vid spistate vid 272 | failed to wait for metadata update typeid vid schemaid vid 273 | sending cache message msg vid node vid 274 | completing topology ready future right away head vid topver vid 275 | handling topology req vid 276 | configured session factory using file vid 277 | 
ignite_hostname_constraint has invalid pattern it will be ignore 278 | waiting for handshake rmtnode vid 279 | received handshake message rmtnode vid rcvcnt vid 280 | external collision notification to vid 281 | received shuffle ack desc vid msg vid 282 | failed to close incoming file vid 283 | ignoring response since task is already reducing or finishing res vid 284 | got removed entry in transaction getallasync method will retry vid 285 | notifying exchange future about to remote node 286 | store put key true val true tx false 287 | field not found vid 288 | failed to find future for get response sender vid res vid 289 | starting spi implementation vid 290 | exchange timings 291 | failed waiting while initialization is completed 292 | vid ping_interval vid 293 | failed to send global state response node left nodeid vid nodeid vid 294 | vid the subcommands that take vid as an arguments 295 | interrupted while waiting for consumer threads to shut down exiting uncleanly 296 | daemon node failed vid 297 | received incoming connection when already connected to this node rejecting locnode vid rmtnode vid 298 | skipping partition on recovery no page store or wal state grp vid p vid 299 | before acquiring transaction lock for put on keys vid 300 | ignore affinity for cache vid key vid val vid 301 | failed to get future result fut vid 302 | demo tcpserver stared 303 | failed to send unauthenticated message to node node vid err vid 304 | successfully bound shared memory communication to tcp port port vid lochost vid 305 | unregistered mbean vid 306 | -------------------------------------------------------------------------------- /src/Baselines/LoGenText-Plus/results/1/ignite/translation.context.test.log: -------------------------------------------------------------------------------- 1 | Namespace(beam=8, cpu=False, data=['data-bin/context'], diverse_beam_groups=-1, diverse_beam_strength=0.5, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, gen_subset='test', lazy_load=False, left_pad_context='True', left_pad_source='True', left_pad_target='False', lenpen=2.5, log_format=None, log_interval=1000, match_source_len=False, max_context_positions=1024, max_len_a=0, max_len_b=100, max_sentences=32, max_source_positions=1024, max_target_positions=1024, max_tokens=None, memory_efficient_fp16=False, min_len=3.0, min_loss_scale=0.0001, model_overrides='{}', nbest=1, no_beamable_mm=False, no_early_stop=False, no_progress_bar=False, no_repeat_ngram_size=0, num_shards=1, num_workers=0, output='translations/1/ignite/translation.context.test.unsort', path='/home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/ignite/checkpoint_last.pt', prefix_size=0, print_alignment=False, quiet=True, raw_text=False, remove_bpe='@@ ', replace_unk=None, required_batch_size_multiple=8, results_path=None, sacrebleu=True, sampling=False, sampling_temperature=1, sampling_topk=-1, score_reference=False, seed=1, shard_id=0, skip_invalid_size_inputs_valid_test=True, source_lang=None, target_lang=None, task='translation_context', tensorboard_logdir='', threshold_loss_scale=None, unkpen=0, unnormalized=False, upsample_primary=1, user_dir=None) 2 | | [code] dictionary: 1080 types 3 | | [log] dictionary: 1080 types 4 | | data-bin/context test 305 examples 5 | | ['data-bin/context'] test 305 examples 6 | | loading model(s) from 
/home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/ignite/checkpoint_last.pt 7 | | Translated 305 sentences (4151 tokens) in 5.9s (52.01 sentences/s, 707.82 tokens/s) 8 | | Generate test with beam=8: BLEU = 28.81 50.7/32.9/25.8/20.4 (BP = 0.942 ratio = 0.943 hyp_len = 2482 ref_len = 2631) 9 | -------------------------------------------------------------------------------- /src/Baselines/LoGenText-Plus/results/1/synapse/translation.context.test: -------------------------------------------------------------------------------- 1 | priorityexecutor undeployment of the entry named vid started 2 | connection without a pool something wrong need to fix 3 | priorityexecutor with name vid does not exist 4 | http connection vid output 5 | vid message for the vid dropped in the pre mediation state by the mandatory sequence n vid 6 | user id cannot be found 7 | connection closed by the client end while writing the response vid 8 | event source vid was removed from the synapse configuration successfully 9 | no resource is defined for location vid 10 | received to vid 11 | cannot find a datasource with name vid either in in in memory or jndi datasource repositories 12 | can not open a connection to the url with a path vid 13 | restoring the messageprocessor with name vid completed 14 | initializing child mediators of mediator vid 15 | synapse library import named vid has been deployed from file vid 16 | sequence vid has already been undeployed 17 | the file vid is not a valid soap11 18 | fail to create the condition in the given directory vid 19 | there is no secret for alias vid returning itself 20 | endpoint vid has been updated from the file vid 21 | http connection vid response vid 22 | deleting a job with name vid group vid 23 | endpoint vid has been deployed from file vid 24 | error opening key store vid 25 | loading trust keystore from vid 26 | synapse received a response message without a message id 27 | error while pipe vid shutting down listener 28 | directory vid is not writable 29 | registered mediator serializer vid for vid 30 | http connection vid closed 31 | message request received for the request message id vid 32 | synapse encountered an exception no error handlers sending fault 33 | received to vid 34 | created a error log vid 35 | http protocol error vid 36 | system may be unstable ioreactor encountered a checked exception vid 37 | error while closing the temporary file vid 38 | error occurred while shutting down jvm 39 | priorityexecutor vid has already been undeployed 40 | all transport threads and tasks are idle and no pending callbacks 41 | using http tuning parameter vid vid 42 | matching cher for the provided character sequence and the pattern vid 43 | localentry update from file vid has started 44 | can t send the out message sequence vid does not exist 45 | couldn t get the lock for processing the file vid 46 | initializing transport listener for request 47 | undeploying proxy service vid 48 | undeployment of the endpoint vid 49 | added mediators for vid 50 | configuring transport sender started 51 | sequence vid has been built from the file vid 52 | proxyservice named vid has been built from the 53 | initializing xar metadata 54 | setting a statistics stack on the message context 55 | start writing the hessian message to outputstream 56 | no secret repositories have been configured 57 | error resolving directory to move after processing vid 58 | initializing synapse in an already existing axis2 server instance 
59 | sequence deployment from file vid completed 60 | pass through vid sender started 61 | message store deployment from file vid completed 62 | eventsource named vid has been built from the file vid 63 | received a continue response 64 | error in closing the input stream 65 | error while releasing the file vid 66 | connection closed by the target host while receiving request 67 | creating a secret repositories for given configuration 68 | priorityexecutor undeployment of the entry named vid started 69 | synapse timed out for the request with message id vid 70 | no beanstalk definitions found for initialization 71 | soapaction vid 72 | loading endpoints from vid 73 | server certificate validation trust has been disabled do not use 74 | amqp transport polling task started listen for service vid 75 | keep alive connection was closed by the client vid 76 | did not schedule the job vid job count is zero 77 | thread was interrupted while waiting to be destroying 78 | hot deployment has been suspended ignoring 79 | sequence vid has been updated from the file vid 80 | can t send the out message sequence vid does not exist 81 | messagestore named vid has been restored 82 | there is no private key in the given configuration 83 | graceful stop request completed in milliseconds 84 | error pausing transport sender 85 | localentry update from file vid has started 86 | one or more required fields are not found in the mgiven vid 87 | api named vid has been deployed from file vid 88 | endpoint deployment from file vid completed 89 | error opening key store vid 90 | deleting temporary file vid 91 | connection time out while writing the response vid 92 | template vid has been updated from the file vid 93 | memory cache is full unable to initialize the cache value 94 | unable to create ssl context with the given configuration 95 | initiating a file based secret repository 96 | the reconnection attempt number vid failed next re try will be after vid seconds 97 | keep alive connection was closed 98 | taskdescription cannot be found for name vid returning null 99 | registered mediator for extension vid 100 | unexpected exception encountered in targethandler 101 | cannot create a urlconnection for given url vid 102 | deployment of the synapse artifact from file vid started 103 | message processor deployment from file vid started 104 | destroying the synapsecallbackreceiver 105 | getting a datasource with name vid from the given configuration 106 | synapsesubscription failed sending fault response 107 | template deployment from file vid completed 108 | proxyservice named vid has been built from the file vid 109 | base64 decoding on input 110 | loading a file vid from classpath 111 | cannot open vid 112 | creating new taskderepositories 113 | startuptask named vid has been undeployed 114 | session with id vid is still live 115 | synapsesubscription failed sending fault response 116 | interrupted while building message for rest_url request 117 | the property vid with key vid target vid 118 | template task vid has already been undeployed 119 | priorityexecutor named vid has been deployed from file vid 120 | added mediator serializer vid for vid 121 | starting apache synapse 122 | vid listener started on vid port vid 123 | outgoing request counter rolled over for the session vid from vid 124 | encountered an i o error vid 125 | template vid has been built from the file vid 126 | restoring the messagestore with name vid started 127 | there are no statistics to be cleaned 128 | removing the session with the session id 
vid 129 | proxyservice deployment from proxy service vid started 130 | start replicating the property with key vid 131 | removed taskdescription vid 132 | setting the store type vid to vid 133 | you are using a persistent message queue you will be loosing messages which are on the queue 134 | restoring the messagestore with name vid completed 135 | cookies string vid 136 | loading a file vid from classpath 137 | initializing mediators of mediator vid 138 | priorityexecutor vid has been updated from the file vid 139 | retrieving task was interrupted 140 | loading synapse properties from the file vid 141 | synapse has decided to abort the message n vid 142 | creating session information for given session id vid 143 | expiring message id vid dropping message after global statistics 144 | crl taken from cache 145 | could not determine host name 146 | error while destroying the task vid 147 | loading trust keystore from vid 148 | endpoint vid has been updated from the file vid 149 | initializing synapsecallbackreceiver 150 | destroying pass through vid listener 151 | starting apache synapse 152 | -------------------------------------------------------------------------------- /src/Baselines/LoGenText-Plus/results/1/synapse/translation.context.test.log: -------------------------------------------------------------------------------- 1 | Namespace(beam=8, cpu=False, data=['data-bin/context'], diverse_beam_groups=-1, diverse_beam_strength=0.5, fp16=False, fp16_init_scale=128, fp16_scale_tolerance=0.0, fp16_scale_window=None, gen_subset='test', lazy_load=False, left_pad_context='True', left_pad_source='True', left_pad_target='False', lenpen=2.5, log_format=None, log_interval=1000, match_source_len=False, max_context_positions=1024, max_len_a=0, max_len_b=100, max_sentences=32, max_source_positions=1024, max_target_positions=1024, max_tokens=None, memory_efficient_fp16=False, min_len=3.0, min_loss_scale=0.0001, model_overrides='{}', nbest=1, no_beamable_mm=False, no_early_stop=False, no_progress_bar=False, no_repeat_ngram_size=0, num_shards=1, num_workers=0, output='translations/1/synapse/translation.context.test.unsort', path='/home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/synapse/checkpoint_last.pt', prefix_size=0, print_alignment=False, quiet=True, raw_text=False, remove_bpe='@@ ', replace_unk=None, required_batch_size_multiple=8, results_path=None, sacrebleu=True, sampling=False, sampling_temperature=1, sampling_topk=-1, score_reference=False, seed=1, shard_id=0, skip_invalid_size_inputs_valid_test=True, source_lang=None, target_lang=None, task='translation_context', tensorboard_logdir='', threshold_loss_scale=None, unkpen=0, unnormalized=False, upsample_primary=1, user_dir=None) 2 | | [code] dictionary: 1080 types 3 | | [log] dictionary: 1080 types 4 | | data-bin/context test 151 examples 5 | | ['data-bin/context'] test 151 examples 6 | | loading model(s) from /home/zi_ding/log-generation-ext/similar-log-template-concat-batch/pre-ast-temp-1/saved_checkpoints/pre-ast-templete/synapse/checkpoint_last.pt 7 | | Translated 151 sentences (1749 tokens) in 2.5s (59.55 sentences/s, 689.74 tokens/s) 8 | | Generate test with beam=8: BLEU = 37.85 55.9/41.1/34.0/29.0 (BP = 0.976 ratio = 0.976 hyp_len = 1078 ref_len = 1104) 9 | -------------------------------------------------------------------------------- /src/Baselines/LoGenText-Plus/results/1/synapse/translation.context.test.unsort: 
-------------------------------------------------------------------------------- 1 | 99 unexpected exception encountered in targethandler 2 | 129 start replicating the property with key vid 3 | 109 loading a file vid from classpath 4 | 57 initializing synapse in an already existing axis2 server instance 5 | 102 message processor deployment from file vid started 6 | 135 loading a file vid from classpath 7 | 20 http connection vid response vid 8 | 66 creating a secret repositories for given configuration 9 | 89 deleting temporary file vid 10 | 125 restoring the messagestore with name vid started 11 | 0 priorityexecutor undeployment of the entry named vid started 12 | 67 priorityexecutor undeployment of the entry named vid started 13 | 3 http connection vid output 14 | 13 initializing child mediators of mediator vid 15 | 131 setting the store type vid to vid 16 | 29 http connection vid closed 17 | 30 message request received for the request message id vid 18 | 100 cannot create a urlconnection for given url vid 19 | 127 removing the session with the session id vid 20 | 7 event source vid was removed from the synapse configuration successfully 21 | 96 keep alive connection was closed 22 | 136 initializing mediators of mediator vid 23 | 34 http protocol error vid 24 | 46 undeploying proxy service vid 25 | 103 destroying the synapsecallbackreceiver 26 | 98 registered mediator for extension vid 27 | 47 undeployment of the endpoint vid 28 | 41 matching cher for the provided character sequence and the pattern vid 29 | 81 there is no private key in the given configuration 30 | 149 destroying pass through vid listener 31 | 35 system may be unstable ioreactor encountered a checked exception vid 32 | 150 starting apache synapse 33 | 62 received a continue response 34 | 77 hot deployment has been suspended ignoring 35 | 1 connection without a pool something wrong need to fix 36 | 16 the file vid is not a valid soap11 37 | 144 could not determine host name 38 | 110 cannot open vid 39 | 139 loading synapse properties from the file vid 40 | 52 initializing xar metadata 41 | 21 deleting a job with name vid group vid 42 | 97 taskdescription cannot be found for name vid returning null 43 | 130 removed taskdescription vid 44 | 76 thread was interrupted while waiting to be destroying 45 | 71 loading endpoints from vid 46 | 74 keep alive connection was closed by the client vid 47 | 8 no resource is defined for location vid 48 | 27 directory vid is not writable 49 | 145 error while destroying the task vid 50 | 128 proxyservice deployment from proxy service vid started 51 | 101 deployment of the synapse artifact from file vid started 52 | 54 start writing the hessian message to outputstream 53 | 141 creating session information for given session id vid 54 | 104 getting a datasource with name vid from the given configuration 55 | 148 initializing synapsecallbackreceiver 56 | 123 encountered an i o error vid 57 | 84 localentry update from file vid has started 58 | 73 amqp transport polling task started listen for service vid 59 | 42 localentry update from file vid has started 60 | 40 using http tuning parameter vid vid 61 | 37 error occurred while shutting down jvm 62 | 126 there are no statistics to be cleaned 63 | 49 configuring transport sender started 64 | 45 initializing transport listener for request 65 | 133 restoring the messagestore with name vid completed 66 | 112 startuptask named vid has been undeployed 67 | 82 graceful stop request completed in milliseconds 68 | 10 cannot find a datasource with name vid 
either in in in memory or jndi datasource repositories 69 | 87 endpoint deployment from file vid completed 70 | 44 couldn t get the lock for processing the file vid 71 | 6 connection closed by the client end while writing the response vid 72 | 60 message store deployment from file vid completed 73 | 53 setting a statistics stack on the message context 74 | 83 error pausing transport sender 75 | 18 there is no secret for alias vid returning itself 76 | 85 one or more required fields are not found in the mgiven vid 77 | 120 starting apache synapse 78 | 38 priorityexecutor vid has already been undeployed 79 | 55 no secret repositories have been configured 80 | 134 cookies string vid 81 | 143 crl taken from cache 82 | 108 base64 decoding on input 83 | 69 no beanstalk definitions found for initialization 84 | 26 error while pipe vid shutting down listener 85 | 119 added mediator serializer vid for vid 86 | 107 proxyservice named vid has been built from the file vid 87 | 132 you are using a persistent message queue you will be loosing messages which are on the queue 88 | 4 vid message for the vid dropped in the pre mediation state by the mandatory sequence n vid 89 | 48 added mediators for vid 90 | 17 fail to create the condition in the given directory vid 91 | 51 proxyservice named vid has been built from the 92 | 11 can not open a connection to the url with a path vid 93 | 61 eventsource named vid has been built from the file vid 94 | 70 soapaction vid 95 | 116 the property vid with key vid target vid 96 | 111 creating new taskderepositories 97 | 115 interrupted while building message for rest_url request 98 | 114 synapsesubscription failed sending fault response 99 | 23 error opening key store vid 100 | 56 error resolving directory to move after processing vid 101 | 75 did not schedule the job vid job count is zero 102 | 64 error while releasing the file vid 103 | 137 priorityexecutor vid has been updated from the file vid 104 | 28 registered mediator serializer vid for vid 105 | 147 endpoint vid has been updated from the file vid 106 | 142 expiring message id vid dropping message after global statistics 107 | 122 outgoing request counter rolled over for the session vid from vid 108 | 91 template vid has been updated from the file vid 109 | 146 loading trust keystore from vid 110 | 19 endpoint vid has been updated from the file vid 111 | 15 sequence vid has already been undeployed 112 | 5 user id cannot be found 113 | 105 synapsesubscription failed sending fault response 114 | 36 error while closing the temporary file vid 115 | 65 connection closed by the target host while receiving request 116 | 31 synapse encountered an exception no error handlers sending fault 117 | 95 the reconnection attempt number vid failed next re try will be after vid seconds 118 | 80 messagestore named vid has been restored 119 | 90 connection time out while writing the response vid 120 | 12 restoring the messageprocessor with name vid completed 121 | 118 priorityexecutor named vid has been deployed from file vid 122 | 22 endpoint vid has been deployed from file vid 123 | 86 api named vid has been deployed from file vid 124 | 124 template vid has been built from the file vid 125 | 138 retrieving task was interrupted 126 | 68 synapse timed out for the request with message id vid 127 | 58 sequence deployment from file vid completed 128 | 33 created a error log vid 129 | 79 can t send the out message sequence vid does not exist 130 | 32 received to vid 131 | 9 received to vid 132 | 94 initiating a file based secret 
repository 133 | 92 memory cache is full unable to initialize the cache value 134 | 43 can t send the out message sequence vid does not exist 135 | 93 unable to create ssl context with the given configuration 136 | 59 pass through vid sender started 137 | 88 error opening key store vid 138 | 72 server certificate validation trust has been disabled do not use 139 | 121 vid listener started on vid port vid 140 | 25 synapse received a response message without a message id 141 | 140 synapse has decided to abort the message n vid 142 | 24 loading trust keystore from vid 143 | 63 error in closing the input stream 144 | 113 session with id vid is still live 145 | 2 priorityexecutor with name vid does not exist 146 | 39 all transport threads and tasks are idle and no pending callbacks 147 | 50 sequence vid has been built from the file vid 148 | 117 template task vid has already been undeployed 149 | 78 sequence vid has been updated from the file vid 150 | 14 synapse library import named vid has been deployed from file vid 151 | 106 template deployment from file vid completed 152 | -------------------------------------------------------------------------------- /src/Baselines/README.md: -------------------------------------------------------------------------------- 1 | # README 2 | We have open-sourced the baselines we use: for baselines with released models, we provide the model code; for API-based baselines, we provide the invocation scripts. Commercial plugins can only be invoked manually because of their usage restrictions. 3 | 4 | If you use any of these baselines, please cite the corresponding paper. 5 | -------------------------------------------------------------------------------- /src/Baselines/StarCoder/starcoder.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForCausalLM, AutoTokenizer 2 | import re 3 | import os 4 | import tqdm 5 | 6 | path = './LogBench-O_prefix_1point' 7 | ground_truth_folder = './LogBench-O_prefix_1point' 8 | output_path= './StarCoder_LogBench-O_prefix_1point' 9 | FIM_INDICATOR = "<FILL_HERE>"  # assumed placeholder marking the position to infill 10 | FIM_PREFIX = "<fim_prefix>"  # StarCoder fill-in-the-middle (FIM) special tokens 11 | FIM_MIDDLE = "<fim_middle>" 12 | FIM_SUFFIX = "<fim_suffix>" 13 | 14 | checkpoint = "bigcode/starcoder" 15 | device = "cuda" 16 | auth_token = "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" 17 | 18 | # Check if output_path exists, if not, create it 19 | if not os.path.exists(output_path): 20 | os.makedirs(output_path) 21 | 22 | 23 | def insert_text_to_java_file(file_name, line_number): 24 | with open(file_name, 'r', encoding='utf-8') as file: 25 | lines = file.readlines() 26 | if line_number > len(lines): 27 | print("out of range"); return  # target line does not exist, skip this file 28 | lines[line_number - 1] = lines[line_number - 1].rstrip() + FIM_INDICATOR +'\n' 29 | with open(file_name, 'w', encoding='utf-8') as file: 30 | file.writelines(lines) 31 | 32 | 33 | def extract_numbers(s): 34 | return re.findall(r'\d+', s) 35 | 36 | def parse_directory(dir_path, ground_truth_folder): 37 | for filename in os.listdir(dir_path): 38 | file_path = os.path.join(dir_path, filename) 39 | if os.path.isfile(file_path) and file_path.endswith('.java'): 40 | ground_truth_path = os.path.join(ground_truth_folder, file_path.split('/')[-1][:-5] + '_config.txt') 41 | try: 42 | with open(ground_truth_path, 'r', encoding='utf-8') as f: 43 | lines = f.readlines() 44 | if len(lines) >= 1: 45 | line_number = int(extract_numbers(lines[0].strip(' ')[:-1])[0]) 46 | insert_text_to_java_file(file_path, line_number) 47 | except FileNotFoundError: 48 | pass 49 | elif
os.path.isdir(file_path): 50 | parse_directory(file_path, ground_truth_folder) 51 | 52 | parse_directory(path,ground_truth_folder) 53 | 54 | tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_auth_token=auth_token) 55 | model = AutoModelForCausalLM.from_pretrained(checkpoint, use_auth_token=auth_token).to(device) 56 | 57 | def generate(input_text): 58 | if FIM_INDICATOR in input_text: 59 | try: 60 | prefix, suffix = input_text.split(FIM_INDICATOR) 61 | except: 62 | raise ValueError(f"Only one {FIM_INDICATOR} allowed in prompt!") 63 | input_text = f"{FIM_PREFIX}{prefix}{FIM_SUFFIX}{suffix}{FIM_MIDDLE}" 64 | 65 | 66 | inputs = tokenizer(input_text, return_tensors="pt") 67 | inputs = {k: v.to(device) for k, v in inputs.items()} 68 | outputs = model.generate( 69 | input_ids=inputs["input_ids"], 70 | attention_mask=inputs["attention_mask"], 71 | max_length=1024, 72 | do_sample=True, 73 | pad_token_id=tokenizer.eos_token_id, # Set pad_token_id 74 | ) 75 | return (tokenizer.decode(outputs[0])) 76 | 77 | for filename in os.listdir(path): 78 | if filename.endswith(".java"): 79 | print(filename) 80 | input_file_path = os.path.join(path, filename) 81 | 82 | try: 83 | with open(input_file_path, 'r', encoding='utf-8') as file: 84 | file_content = file.read() 85 | example = f"'''\\\n{file_content}\n'''" 86 | processed_content = generate(example) 87 | output_file_path = os.path.join(output_path, filename) 88 | with open(output_file_path, 'w', encoding='utf-8') as output_file: 89 | output_file.write(f"{processed_content}\n") 90 | except Exception as e: 91 | print(f"Error processing file {filename}: {e}") 92 | -------------------------------------------------------------------------------- /src/Baselines/WhichVar/analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 23, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import json\n", 10 | "import numpy as np" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 9, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "with open(\"output.json\", \"r\") as f:\n", 20 | " data_list = json.load(f)" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 10, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stdout", 30 | "output_type": "stream", 31 | "text": [ 32 | "{'code': 'private void handleAdjustPublishRate(Context ctx) throws Exception {\\n Double publishRate = mapper.readValue(ctx.body(), Double.class);', 'pred_variables': ['mapper', 'publishRate', 'body'], 'label_variables': ['publishRate']}\n" 33 | ] 34 | } 35 | ], 36 | "source": [ 37 | "print(data_list[0])" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 20, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "def precision_recall_f1(labels, predictions):\n", 47 | " true_positives = len(set(labels) & set(predictions))\n", 48 | " false_positives = len(set(predictions) - set(labels))\n", 49 | " false_negatives = len(set(labels) - set(predictions))\n", 50 | "\n", 51 | " precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) != 0.0 else 0.0\n", 52 | " recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) != 0.0 else 0.0\n", 53 | " f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0.0 else 0.0\n", 54 | "\n", 55 | " return precision, recall, f1" 56 | ] 57 | }, 58 | { 
59 | "cell_type": "code", 60 | "execution_count": 22, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "precs, recs, f1s = [], [], []\n", 65 | "for idx, data in enumerate(data_list):\n", 66 | " labels = data['label_variables']\n", 67 | " predcits = data['pred_variables']\n", 68 | " \n", 69 | " # print(predcits, labels)\n", 70 | " precision, recall, f1 = precision_recall_f1(labels, predcits)\n", 71 | " precs.append(precision)\n", 72 | " recs.append(recall)\n", 73 | " f1s.append(f1)" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 25, 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "name": "stdout", 83 | "output_type": "stream", 84 | "text": [ 85 | "0.5030762324986151\n", 86 | "0.6346379386090578\n", 87 | "0.5348833543779392\n" 88 | ] 89 | } 90 | ], 91 | "source": [ 92 | "print(np.mean(precs))\n", 93 | "print(np.mean(recs))\n", 94 | "print(np.mean(f1s))" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [] 103 | } 104 | ], 105 | "metadata": { 106 | "kernelspec": { 107 | "display_name": "myenv", 108 | "language": "python", 109 | "name": "python3" 110 | }, 111 | "language_info": { 112 | "codemirror_mode": { 113 | "name": "ipython", 114 | "version": 3 115 | }, 116 | "file_extension": ".py", 117 | "mimetype": "text/x-python", 118 | "name": "python", 119 | "nbconvert_exporter": "python", 120 | "pygments_lexer": "ipython3", 121 | "version": "3.7.13" 122 | } 123 | }, 124 | "nbformat": 4, 125 | "nbformat_minor": 2 126 | } 127 | -------------------------------------------------------------------------------- /src/Baselines/WhichVar/cleaner.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import json\n", 11 | "import re\n", 12 | "from collections import Counter\n", 13 | "\n", 14 | "regex = r\"(?i)(?:log(?:ger)?\\w*)\\s*\\.\\s*(?:log|error|info|warn|fatal|debug|trace|off|all)\\s*\\([^;]*\\)\"\n", 15 | "\n", 16 | "def process_directory(directory):\n", 17 | " for filename in os.listdir(directory):\n", 18 | " filepath = os.path.join(directory, filename)\n", 19 | " if os.path.isdir(filepath):\n", 20 | " process_directory(filepath)\n", 21 | " elif filename.endswith('.json'):\n", 22 | " process_file(filepath)\n", 23 | "\n", 24 | "def process_file(filepath):\n", 25 | " with open(filepath, 'r') as f:\n", 26 | " data = json.load(f)\n", 27 | " method_code = data.get('methodCode', '')\n", 28 | " log_variables = data.get('logVariables', [])\n", 29 | " \n", 30 | " for match in re.finditer(regex, method_code):\n", 31 | " logging_statement = match.group(0)\n", 32 | " \n", 33 | " if all(var in logging_statement for var in log_variables):\n", 34 | " start_index = match.start()\n", 35 | " line_count = Counter(method_code[:start_index])['\\n']\n", 36 | " start_line = max(0, line_count - 15)\n", 37 | " preceding_lines = method_code.split('\\n')[:start_line]\n", 38 | " start_index = len('\\n'.join(preceding_lines)) + 1 if preceding_lines else 0\n", 39 | " data['methodCode'] = method_code[start_index:match.end()]\n", 40 | " \n", 41 | " with open(filepath, 'w') as f:\n", 42 | " json.dump(data, f)\n", 43 | " break\n", 44 | "# ...\n", 45 | "\n", 46 | "process_directory('/Users/liyichen/data/')\n" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | 
"outputs": [], 54 | "source": [] 55 | } 56 | ], 57 | "metadata": { 58 | "kernelspec": { 59 | "display_name": "Python 3", 60 | "language": "python", 61 | "name": "python3" 62 | }, 63 | "language_info": { 64 | "codemirror_mode": { 65 | "name": "ipython", 66 | "version": 3 67 | }, 68 | "file_extension": ".py", 69 | "mimetype": "text/x-python", 70 | "name": "python", 71 | "nbconvert_exporter": "python", 72 | "pygments_lexer": "ipython3", 73 | "version": "3.9.7" 74 | } 75 | }, 76 | "nbformat": 4, 77 | "nbformat_minor": 2 78 | } 79 | -------------------------------------------------------------------------------- /src/Baselines/WhichVar/model.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence 4 | from torchtext.vocab import GloVe 5 | import json 6 | from torch.utils.data import DataLoader, Dataset, random_split 7 | import numpy as np 8 | from sklearn.metrics import precision_score, recall_score, f1_score 9 | import re 10 | import random 11 | 12 | def check_and_split_camel_case(s): 13 | if re.match(r'^[a-z]+([A-Z][a-z]*)*$', s): 14 | words = re.findall('[a-z]+|[A-Z][a-z]*', s) 15 | return "yes", words 16 | else: 17 | return "no", s 18 | 19 | 20 | def setup_seed(seed): 21 | if seed == -1: 22 | seed = random.randint(0, 1000) 23 | torch.manual_seed(seed) 24 | torch.cuda.manual_seed_all(seed) 25 | np.random.seed(seed) 26 | random.seed(seed) 27 | torch.backends.cudnn.deterministic = True 28 | torch.backends.cudnn.benchmark = False 29 | return seed 30 | 31 | 32 | class Model(nn.Module): 33 | def __init__(self, weight): 34 | super(Model, self).__init__() 35 | vocab_size = weight.shape[0] 36 | self.word_embed = nn.Embedding(num_embeddings=vocab_size+1, embedding_dim=weight.shape[-1]) 37 | self.word_embed.weight.data[:vocab_size] = weight 38 | self.word_embed.weight.data[vocab_size] = torch.zeros(weight.shape[-1]) 39 | self.word_embed.weight.requires_grad = False 40 | 41 | self.rnn = nn.LSTM(100, 128, num_layers=2, bidirectional=True, batch_first=True) 42 | self.num_heads = 4 43 | self.attention = nn.MultiheadAttention(embed_dim=256, num_heads=self.num_heads, batch_first=True) 44 | 45 | self.cls_layer = nn.Linear(256, 1, bias=False) 46 | 47 | 48 | def forward(self, sentences, lens): 49 | 50 | embeds = self.word_embed(sentences) 51 | outputs, _ = self.rnn(embeds) 52 | attn_mask=torch.zeros((sentences.size(0) * 4, sentences.size(1), sentences.size(1)), device=sentences.device).bool() 53 | for i, l in enumerate(lens): 54 | for j in range(1, self.num_heads+1): 55 | attn_mask[i*j][:l][:l] = True 56 | 57 | attention_embeds, _ = self.attention(outputs, outputs, outputs, attn_mask=None) 58 | logits = self.cls_layer(attention_embeds).squeeze(dim=-1) 59 | 60 | return logits 61 | 62 | class SensDataSet(Dataset): 63 | def __init__(self, data, label): 64 | self.data = data 65 | self.label = label 66 | 67 | def __len__(self): 68 | return len(self.data) 69 | 70 | def __getitem__(self, idx): 71 | tuple_ = (self.data[idx], self.label[idx]) 72 | return tuple_ 73 | 74 | 75 | def collate_fn(data_tuple): 76 | # data_tuple.sort(key=lambda x: len(x[0]), reverse=True) 77 | data = [torch.LongTensor(sq[0]) for sq in data_tuple] 78 | label = [torch.Tensor(sq[1]) for sq in data_tuple] 79 | data_length = [len(sq) for sq in data] 80 | data = pad_sequence(data, batch_first=True) 81 | label = pad_sequence(label, batch_first=True) 82 | return data, label, data_length 83 | 84 | 85 | # def 
evaluate(model, test_dataloader, device): 86 | # acc = 0 87 | # n = 0 88 | # model.eval() 89 | # total_pred = [] 90 | # total_label =[] 91 | # for batch_x, batch_y, batch_x_len in test_dataloader: 92 | # batch_x = batch_x.to(device) 93 | # batch_y = batch_y.to(device) 94 | # out = model(batch_x, batch_x_len) 95 | # predicts = (out > 0) + 0 96 | # for predict, label, length in zip(predicts, batch_y, batch_x_len): 97 | # total_pred.append(predict[:length]) 98 | # total_label.append(label[:length]) 99 | # total_pred = torch.cat(total_pred).cpu().numpy() 100 | # total_label = torch.cat(total_label).cpu().numpy() 101 | 102 | # precision = precision_score(total_label, total_pred) 103 | # recall = recall_score(total_label, total_pred) 104 | # f1 = f1_score(total_label, total_pred) 105 | # return {"precision" : precision, "recall" : recall, "f1" : f1} 106 | 107 | 108 | def evaluate(model, test_dataloader, device): 109 | model.eval() 110 | precision_list = [] 111 | recall_list = [] 112 | f1_list = [] 113 | predicts_list = [] 114 | for batch_x, batch_y, batch_x_len in test_dataloader: 115 | batch_x = batch_x.to(device) 116 | batch_y = batch_y.to(device) 117 | out = model(batch_x, batch_x_len) 118 | predicts = (out > 0) + 0 119 | batch_x = batch_x.cpu().numpy() 120 | batch_y = batch_y.cpu().numpy() 121 | predicts = predicts.cpu().numpy() 122 | for x, predict, label, length in zip(batch_x, predicts, batch_y, batch_x_len): 123 | # print(len(x), len(predict), len(label), length) 124 | x, predict, label = x[:length], predict[:length], label[:length] 125 | 126 | pred_1_set = set(x[predict == 1]) 127 | pred_0_set = set(x) - pred_1_set 128 | label_1_set = set(x[label == 1]) 129 | label_0_set = set(x) - label_1_set 130 | TP = len(label_1_set.intersection(pred_1_set)) 131 | FN = len(label_1_set.intersection(pred_0_set)) 132 | FP = len(label_0_set.intersection(pred_1_set)) 133 | TN = len(label_0_set.intersection(pred_0_set)) 134 | precision = TP / (TP + FP) if (TP + FP) != 0 else 0 135 | recall = TP / (TP + FN) if (TP + FN) != 0 else 0 136 | f1 = 2 * precision * recall / (precision + recall) if (precision + recall) != 0 else 0 137 | precision_list.append(precision) 138 | recall_list.append(recall) 139 | f1_list.append(f1) 140 | predicts_list.append(predict) 141 | 142 | precision = np.mean(precision_list) 143 | recall = np.mean(recall_list) 144 | f1 = np.mean(f1_list) 145 | # print(len(f1_list)) 146 | 147 | return {"precision" : precision, "recall" : recall, "f1" : f1}, predicts_list 148 | 149 | 150 | 151 | 152 | if __name__ == '__main__': 153 | setup_seed(111) 154 | device = 'cuda:0' if torch.cuda.is_available() else 'cpu' 155 | print(device) 156 | epochs = 0 157 | glove = GloVe(name='6B', dim=100) 158 | vocab_size = len(glove) 159 | with open('./train.json', 'r') as file: 160 | train_data = json.load(file) 161 | with open('./test.json', 'r') as file: 162 | test_data = json.load(file) 163 | 164 | train_sentences = train_data['input'] 165 | train_sentences = [[glove.stoi[word] if word in glove.stoi.keys() else vocab_size for word in sentence] for sentence in train_sentences] 166 | train_labels = train_data['label'] 167 | 168 | test_sentences = test_data['input'] 169 | test_sentences = [[glove.stoi[word] if word in glove.stoi.keys() else vocab_size for word in sentence] for sentence in test_sentences] 170 | test_labels = test_data['label'] 171 | # print(len(test_sentences)) 172 | # train_size = int(0.8 * len(sentences)) 173 | # train_sentences, test_sentences = sentences[:train_size], sentences[train_size:] 174 
| # train_labels, test_labels = labels[:train_size], labels[train_size:] 175 | 176 | train_dataset = SensDataSet(data=train_sentences, label=train_labels) 177 | test_dataset = SensDataSet(data=test_sentences, label=test_labels) 178 | 179 | # train_dataset, test_dataset = random_split(dataset, [0.8, 0.2]) 180 | 181 | train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn) 182 | test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn) 183 | 184 | model = Model(weight=glove.vectors).to(device) 185 | optimizer = torch.optim.Adam(model.parameters(), lr=1e-03) 186 | loss_fun = nn.BCEWithLogitsLoss(reduction='none') 187 | 188 | for epoch in range(1, epochs + 1): 189 | model.train() 190 | total_loss = [] 191 | for batch_id, (batch_x, batch_y, batch_x_len) in enumerate(train_loader): 192 | batch_x = batch_x.to(device) 193 | batch_y = batch_y.to(device) 194 | out = model(batch_x, batch_x_len) 195 | pos_mask=torch.zeros((batch_x.size(0), batch_x.size(1)), device=device).bool() 196 | for i, l in enumerate(batch_x_len): 197 | pos_mask[i][:l] = True 198 | loss = loss_fun(out, batch_y)[pos_mask].mean() 199 | optimizer.zero_grad() 200 | loss.backward() 201 | optimizer.step() 202 | total_loss.append(loss.item()) 203 | print("epoch: {}/{}, loss={}".format(epoch, epochs, np.mean(total_loss))) 204 | result1, predicts_list1 = evaluate(model, train_loader, device) 205 | result2, predicts_list2 = evaluate(model, test_loader, device) 206 | print('result on train set: {}'.format(result1)) 207 | print('result on test set: {}'.format(result2)) 208 | 209 | torch.save(model.state_dict(), 'model/model.pth') 210 | 211 | model.load_state_dict(torch.load('model/model.pth')) 212 | 213 | result, predicts_list = evaluate(model, test_loader, device) 214 | print(len(predicts_list)) 215 | test_cases = [] 216 | for i in range(len(predicts_list)): 217 | codes = test_data['codes'][i] 218 | predict = predicts_list[i] 219 | # print(len(test_data['input'][i]), len(test_sentences[i]), len(predict)) 220 | variables = list(set([test_data['input'][i][j] for j, v in enumerate(predict) if v == 1])) 221 | label_variables = test_data['variables'][i] 222 | output_data = { 223 | 'code': codes, 224 | 'pred_variables': variables, 225 | 'label_variables': label_variables 226 | } 227 | test_cases.append(output_data) 228 | json.dump(test_cases, open('output.json', 'w'), indent=4) 229 | 230 | -------------------------------------------------------------------------------- /src/Baselines/lance/README.md: -------------------------------------------------------------------------------- 1 | # Using Deep Learning To Support Logging Activities 2 | 3 | We present LANCE (Log stAtemeNt reCommEnder), a DL-based approach for supporting the task of log statement generation and injection in the context of Java. LANCE is built on the recently proposed Text-To-Text Transfer Transformer (T5) architecture. 4 | 5 | 6 | #### How to experiment with LANCE 7 | 8 | 9 | * ##### How to train a new SentencePiece Model 10 | 11 | Before training the [T5 small](https://github.com/google-research/text-to-text-transfer-transformer), namely the core of LANCE, it is important to also train a new tokenizer (sentencepiece model) to accommodate the expanded vocabulary given by the Java programming language.
To this end, we used the raw pre-training instances (Java corpus) plus English sentences from the well-known C4 dataset. 12 | 13 | *Pythonic way* 14 | 15 | ``` 16 | pip install sentencepiece==0.1.96 17 | import sentencepiece as spm 18 | spm.SentencePieceTrainer.train('--input=all_sp.txt --model_prefix=LOG_SP --vocab_size=32000 --bos_id=-1 --eos_id=1 --unk_id=2 --pad_id=0 --shuffle_input_sentence=true --character_coverage=1.0 --user_defined_symbols=“”') 19 | ``` 20 | 21 | We also provide our trained tokenizer under this path: https://github.com/lance-log/lance/tree/main/Code 22 | 23 | * ##### Setup a Google Cloud Storage (GCS) Bucket 24 | To set up a new GCS Bucket for training and fine-tuning a T5 model, please follow the official guide provided by Google: https://cloud.google.com/storage/docs/quickstart-console 25 | 26 | 27 | * ##### Datasets 28 | 29 | The datasets for pre-training, fine-tuning, validating, and finally testing LANCE can be found at this link: https://drive.google.com/drive/folders/1D12y-CIJTYLxMeSmGQjxEXjTEzQImgaH?usp=sharing 30 | 31 | * ##### Pre-training/Fine-tuning 32 | 33 | To pre-train and then fine-tune LANCE, please use the following: 34 | - Pre-Training 35 | - Fine-Tuning 36 | 37 | 38 | 39 | * ##### Models 40 | * Pre-trained on the tasks mixture (Multi-Task) 41 | * Pre-trained on LogSTMT only Task 42 | * Pre-trained on Denoise only Task 43 | * No Pre-training 44 | 45 | * ##### Results: :open_file_folder: 46 | * Multi-Task 47 | * LogSTMT only Task 48 | * Denoising only Task 49 | * No Pre-training 50 | 51 | 52 | * ##### Additional: 53 | Under Miscellaneous, you can find the additional script used for the data analysis and the exact hyper-parameter configuration we employed in the study. 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /src/Baselines/lance/lance.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import os 3 | import gin 4 | import tensorflow.compat.v1 as tf 5 | import tensorflow_datasets as tfds 6 | from contextlib import contextmanager 7 | import logging as py_logging 8 | import t5 9 | from t5.data import postprocessors as t5_postprocessors 10 | from t5.seqio import Feature,SentencePieceVocabulary 11 | from mesh_tensorflow.transformer.learning_rate_schedules import slanted_triangular 12 | from mesh_tensorflow.transformer.learning_rate_schedules import truncated_rsqrt 13 | from tensorflow.keras.optimizers.schedules import PolynomialDecay 14 | from t5 import models 15 | 16 | BASE_DIR = "gs://xxxx" #@param { type: "string" } 17 | TPU_TOPOLOGY = "2x2" 18 | tpu = tf.distribute.cluster_resolver.TPUClusterResolver("grpc://xx.xx.xx.xx") # TPU detection 19 | TPU_ADDRESS = tpu.get_master() 20 | tf.disable_v2_behavior() 21 | tf.get_logger().propagate = False 22 | py_logging.root.setLevel('INFO') 23 | 24 | @contextmanager 25 | def tf_verbosity_level(level): 26 | og_level = tf.logging.get_verbosity() 27 | tf.logging.set_verbosity(level) 28 | yield 29 | tf.logging.set_verbosity(og_level) 30 | 31 | path_finetuning = BASE_DIR + '/datasets/Fine-tuning/train.tsv' #@param { type: "string" } 32 | path_eval = BASE_DIR + '/datasets/Fine-tuning/eval.tsv' #@param { type: "string" } 33 | path_test = BASE_DIR + '/datasets/Fine-tuning/test.tsv' #@param { type: "string" } 34 | 35 | nq_tsv_path = { 36 | "train": path_finetuning, 37 | "validation": path_test 38 | } 39 | 40 | num_nq_examples = dict(train=106382, validation=12020) 41 | 42 |
vocab_model_path = BASE_DIR + '/Code/SP_LOG.model' #@param { type: "string" } 43 | vocab_path = BASE_DIR + '/Code/SP_LOG.vocab' #@param { type: "string" } 44 | 45 | 46 | TaskRegistry = t5.data.TaskRegistry 47 | TfdsTask = t5.data.TfdsTask 48 | 49 | 50 | def get_default_vocabulary(): 51 | return SentencePieceVocabulary(vocab_model_path, 100) 52 | 53 | DEFAULT_OUTPUT_FEATURES = { 54 | "inputs": Feature( 55 | vocabulary=get_default_vocabulary(), add_eos=True, required=False), 56 | 57 | "targets": Feature( 58 | vocabulary=get_default_vocabulary(), add_eos=True) 59 | } 60 | 61 | def nq_dataset_task(split, shuffle_files=True): 62 | # We only have one file for each split. 63 | del shuffle_files 64 | 65 | # Load lines from the text file as examples. 66 | 67 | ds = tf.data.TextLineDataset(nq_tsv_path[split]) 68 | ds = ds.map( 69 | functools.partial(tf.io.decode_csv, record_defaults=["string","string"], 70 | field_delim="\t", use_quote_delim=True), 71 | num_parallel_calls=tf.data.experimental.AUTOTUNE) 72 | 73 | ds = ds.map(lambda *ex: dict(zip(["input", "output"], ex))) 74 | return ds 75 | 76 | print("A few raw train examples...") 77 | for ex in tfds.as_numpy(nq_dataset_task("train").take(5)): 78 | print(ex) 79 | 80 | def preprocessing(ds): 81 | 82 | def to_inputs_and_targets(ex): 83 | x_input = tf.strings.strip(ex['input']) 84 | y_label = tf.strings.strip(ex['output']) 85 | inputs = tf.strings.join([x_input], separator=' ') 86 | class_label = tf.strings.join([y_label], separator=' ') 87 | return {'inputs': inputs, 'targets': class_label} 88 | 89 | return ds.map(to_inputs_and_targets, 90 | num_parallel_calls=tf.data.experimental.AUTOTUNE) 91 | 92 | t5.data.TaskRegistry.remove('log_injection') 93 | t5.data.TaskRegistry.add( 94 | "log_injection", 95 | dataset_fn=nq_dataset_task, 96 | splits=["train","validation"], 97 | text_preprocessor=[preprocessing], 98 | output_features = DEFAULT_OUTPUT_FEATURES, 99 | metric_fns=[t5.evaluation.metrics.accuracy], 100 | num_input_examples=num_nq_examples 101 | ) 102 | 103 | nq_task = t5.data.TaskRegistry.get("log_injection") 104 | ds = nq_task.get_dataset(split="train", sequence_length={"inputs": 512, "targets": 512}) 105 | print("A few preprocessed training examples...") 106 | for ex in tfds.as_numpy(ds.take(5)): 107 | print(ex) 108 | 109 | starter_learning_rate = 0.01 110 | end_learning_rate = 0.001 111 | decay_steps = 10000 112 | 113 | learning_rate_fn = PolynomialDecay( 114 | starter_learning_rate, 115 | decay_steps, 116 | end_learning_rate, 117 | power=0.5) 118 | 119 | MODEL_SIZE = "small" 120 | 121 | MODEL_DIR = BASE_DIR + '/modeltest/'#@param { type: "string" } 122 | 123 | PRETRAINED_DIR=BASE_DIR + '/denoising_task_model/'#@param { type: "string" } 124 | 125 | 126 | model_parallelism, train_batch_size, keep_checkpoint_max = { 127 | "small": (1, 128, 16), 128 | "base": (2, 128, 8), 129 | "large": (8, 64, 4), 130 | "3B": (8, 16, 1), 131 | "11B": (8, 16, 1)}[MODEL_SIZE] 132 | 133 | tf.io.gfile.makedirs(MODEL_DIR) 134 | 135 | model = t5.models.MtfModel( 136 | model_dir=PRETRAINED_DIR, 137 | tpu=TPU_ADDRESS, 138 | #tpu_job_name="node-1", 139 | #tpu_zone="us-central1-f", 140 | #gcp_project="lance", 141 | tpu_topology=TPU_TOPOLOGY, 142 | model_parallelism=model_parallelism, 143 | batch_size=train_batch_size, 144 | learning_rate_schedule = learning_rate_fn, #pick the correct scheduler, according to the model you want to train 145 | sequence_length={"inputs": 512, "targets": 512}, 146 | save_checkpoints_steps=5000, 147 | keep_checkpoint_max=keep_checkpoint_max, 148 | 
iterations_per_loop=100, 149 | ) 150 | 151 | PATH_GIN_FILE_NO_PT = BASE_DIR + '/Configs/no_pretraining_operative_config.gin' 152 | PATH_GIN_FILE_MT = BASE_DIR + '/Configs/multi-task_operative_config.gin' 153 | PATH_GIN_FILE_DENOISE = BASE_DIR + '/Configs/denoise_only_operative_config.gin' 154 | PATH_GIN_FILE_LOG_STMT = BASE_DIR + '/Configs/log_stmt_only_operative_config.gin' 155 | 156 | #with gin.unlock_config(): 157 | # gin.parse_config_file(PATH_GIN_FILE_DENOISE) 158 | # #RUN FINE-TUNING 159 | # TRAIN_STEPS = 200000 160 | # model.finetune(mixture_or_task_name="log_injection", 161 | # finetune_steps=TRAIN_STEPS, 162 | # pretrained_model_dir=PRETRAINED_DIR) 163 | 164 | # If the no-pretraining experiment is the one you want to run, then, uncomment the following and comment model.finetune 165 | # Also, make sure to upload the slanted_operative.gin 166 | #model.train("log_injection", TRAIN_STEPS) 167 | #model.bach_size=32 168 | #model.eval( 169 | #mixture_or_task_name="log_injection", 170 | #checkpoint_steps=-1 171 | #) 172 | #dataset_list = ["cassandra","elasticsearch","flink","hbase","wicket","zookeeper"] 173 | dataset_list = ['logstudy'] 174 | for item in dataset_list: 175 | model.batch_size = 256 176 | input_file = BASE_DIR + f'/datasets/logr_input/lance_function_transformed.txt'#@param { type: "string" } 177 | output_file = BASE_DIR+ f'/datasets/logr_input/lance_function_transformed_result.txt'#@param { type: "string" } 178 | model.predict(input_file, output_file, checkpoint_steps=-1, vocabulary=get_default_vocabulary()) -------------------------------------------------------------------------------- /src/CodeTransformer/README.md: -------------------------------------------------------------------------------- 1 | # CodeTransformer 2 | 3 | We provide only the compiled version for use throughout the review process. We will make the source code available after the paper has been accepted. 
4 | -------------------------------------------------------------------------------- /src/DataCollector/check_pom.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import re 3 | import xml.etree.ElementTree as ET 4 | import os 5 | import base64 6 | import shutil 7 | from github import Github 8 | from github import GithubException 9 | 10 | def check_string_in_file(file_path, search_str="log4j"): 11 | with open(file_path, 'r') as file: 12 | content = file.read() 13 | 14 | if "log4j" in content.lower() or "slf4j" in content.lower(): 15 | return True 16 | else: 17 | return False 18 | 19 | def check_log4j(pom_file_path): 20 | # Parse the POM file as XML 21 | try: 22 | # Parse XML file 23 | tree = ET.parse(pom_file_path) 24 | root = tree.getroot() 25 | 26 | # Define the Log4j dependency artifact details 27 | group_id = 'org.apache.logging.log4j' 28 | artifact_id = 'log4j-core' 29 | 30 | # Iterate over the dependency elements in the POM file and check for the Log4j dependency 31 | for dependency in root.findall('.//{http://maven.apache.org/POM/4.0.0}dependency'): 32 | # Retrieve the group ID and artifact ID of the dependency 33 | dep_group_id = dependency.find('.//{http://maven.apache.org/POM/4.0.0}groupId') 34 | dep_artifact_id = dependency.find('.//{http://maven.apache.org/POM/4.0.0}artifactId') 35 | if dep_group_id is not None and dep_artifact_id is not None: 36 | dep_group_id, dep_artifact_id = dep_group_id.text, dep_artifact_id.text 37 | # Check if the dependency is the Log4j dependency 38 | if dep_group_id == group_id and dep_artifact_id == artifact_id: 39 | print(f'The POM file {pom_file_path} features the Log4j dependency') 40 | return True 41 | 42 | except ET.ParseError as e: 43 | # Handle XML parsing exception 44 | print('Error parsing XML file:', e) 45 | 46 | print(f'The POM file {pom_file_path} does not feature the Log4j dependency') 47 | return False 48 | 49 | 50 | def get_sha_for_tag(repository, tag): 51 | """ 52 | Returns a commit PyGithub object for the specified repository and tag. 
53 | """ 54 | branches = repository.get_branches() 55 | matched_branches = [match for match in branches if match.name == tag] 56 | if matched_branches: 57 | return matched_branches[0].commit.sha 58 | 59 | tags = repository.get_tags() 60 | matched_tags = [match for match in tags if match.name == tag] 61 | if not matched_tags: 62 | print("No Tag or Branch exists with that name") 63 | return None 64 | return matched_tags[0].commit.sha 65 | 66 | 67 | def download_file(git, sha, repo, path): 68 | try: 69 | file_content = git.get_contents(path, ref=sha) 70 | file_data = base64.b64decode(file_content.content) 71 | directory_path, _ = os.path.split(path) 72 | if not os.path.exists(f"repos/{repo}/{directory_path}"): 73 | os.makedirs(f"repos/{repo}/{directory_path}", exist_ok=True) 74 | file_out = open(f"repos/{repo}/{path}", "wb") 75 | file_out.write(file_data) 76 | file_out.close() 77 | except (GithubException, IOError) as exc: 78 | print('Error processing %s: %s', path, exc) 79 | 80 | def check_repo(owner, repo, branch="master"): 81 | # Define the Github Tree API endpoint and repository details 82 | api_url = 'https://api.github.com/repos/{owner}/{repo}/git/trees/{branch}?recursive=1' 83 | # Make an HTTP GET request to the Github Tree API endpoint 84 | access_token = "" 85 | headers = {'Authorization': f'token {access_token}'} 86 | if not os.path.exists(f"repos/{repo}/"): 87 | os.makedirs(f"repos/{repo}/") 88 | #print(f"./{repo}/ created") 89 | 90 | git = Github("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") 91 | git_repo = git.get_repo(f"{owner}/{repo}") 92 | sha = get_sha_for_tag(git_repo, branch) 93 | # Parse the response data as JSON 94 | response = requests.get(api_url.format(owner=owner, repo=repo, branch=branch), headers=headers) 95 | data = response.json() 96 | contain_pom = False 97 | if sha is not None: 98 | for item in data['tree']: 99 | if re.search("pom.xml", item['path'], re.IGNORECASE): 100 | download_file(git_repo, sha, repo, item['path']) 101 | if check_log4j(f"repos/{repo}/{item['path']}"): 102 | contain_pom = True 103 | break 104 | else: 105 | os.remove(f"repos/{repo}/{item['path']}") 106 | print(f"{owner}/{repo} pom checking result: ", contain_pom) 107 | shutil.rmtree(f"repos/{repo}/") 108 | return contain_pom 109 | # # Iterate over the file and directory objects in the response 110 | # for item in data['tree']: 111 | # # Retrieve the file path and type 112 | # path, type = item['path'], item['type'] 113 | 114 | # # If the item is a file, retrieve the raw content using the 'url' property 115 | # if type == 'blob': 116 | # file_url = item['url'] 117 | # file_response = requests.get(file_url) 118 | # file_data = file_response.content 119 | 120 | # # Process the file content as needed 121 | # print(f'File: {path}') 122 | # #print(file_data) 123 | # else: 124 | # # Process directories or other items as needed 125 | # print(f'Directory: {path}') 126 | # github.com/davidb/scala-maven-plugin 127 | 128 | def check_repo_root(owner, repo, access_token, branch="master"): 129 | # Define the Github Tree API endpoint and repository details 130 | #api_url = 'https://api.github.com/repos/{owner}/{repo}/git/trees/{branch}?recursive=1' 131 | # Make an HTTP GET request to the Github Tree API endpoint 132 | #headers = {'Authorization': f'token {access_token}'} 133 | if not os.path.exists(f"repos/{repo}/"): 134 | os.makedirs(f"repos/{repo}/") 135 | #print(f"./{repo}/ created") 136 | 137 | git = Github(access_token) 138 | try: 139 | git_repo = git.get_repo(f"{owner}/{repo}") 140 | except 
GithubException as e: 141 | if e.status == 404: 142 | print("non") 143 | else: 144 | print("error") 145 | shutil.rmtree(f"repos/{repo}/") 146 | return False 147 | 148 | sha = get_sha_for_tag(git_repo, branch) 149 | # Parse the response data as JSON 150 | contain_pom = False 151 | if sha is not None: 152 | contents = git_repo.get_dir_contents(".", ref=sha) 153 | for content in contents: 154 | if content.type == "file" and content.path == "pom.xml": 155 | download_file(git_repo, sha, repo, content.path) 156 | if check_log4j(f"repos/{repo}/{content.path}") or check_string_in_file(f"repos/{repo}/{content.path}"): 157 | contain_pom = True 158 | break 159 | 160 | shutil.rmtree(f"repos/{repo}/") 161 | print(f"{owner}/{repo} pom checking result: ", contain_pom) 162 | return contain_pom 163 | 164 | #check_repo("davidb", "scala-maven-plugin") -------------------------------------------------------------------------------- /src/DataCollector/download.py: -------------------------------------------------------------------------------- 1 | from get_java import download_java 2 | 3 | key = "" 4 | 5 | with open("1.txt", "r") as file: 6 | for line in file: 7 | repo_list = line.split() 8 | owner, repo, branch = repo_list[1], repo_list[2], repo_list[3] 9 | print(f"{repo_list[0]} repo: {owner} {repo} {branch}") 10 | Done = False 11 | with open("result1.txt", "r") as f: 12 | content = f.read() 13 | if owner in content and repo in content: 14 | Done = True 15 | if Done: 16 | print("Already Done!") 17 | continue 18 | cnt1, cnt2 = download_java(owner, repo, key, branch) 19 | with open("result1.txt", "a") as f: 20 | f.write(f"{repo_list[0]} {owner}/{repo} downloaded: {cnt1}/{cnt1+cnt2} files\n") -------------------------------------------------------------------------------- /src/DataCollector/filter_pom.py: -------------------------------------------------------------------------------- 1 | import json 2 | import subprocess 3 | from check_pom import check_repo_root 4 | from tqdm import tqdm 5 | 6 | with open("results.json", encoding='latin1') as rf: 7 | repos = json.load(rf) 8 | 9 | 10 | key = "" 11 | 12 | repos = repos['items'] 13 | #check_repo_root("nysenate", "openlegislation", "dev") 14 | end_point = len(repos) 15 | 16 | with open("result.txt", "a") as f: 17 | for i in range(17, end_point): 18 | repo_item = repos[i] 19 | branch = repo_item['defaultBranch'] 20 | owner, repo = repo_item['name'].split('/') 21 | print(f"\n{i}-{end_point}/{len(repos)} repo: {owner} {repo} {branch}\n") 22 | if check_repo_root(owner, repo, key, branch): 23 | f.write(f"{i} {owner} {repo} {branch}\n") 24 | f.flush() 25 | -------------------------------------------------------------------------------- /src/DataCollector/get_java.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import re 3 | import xml.etree.ElementTree as ET 4 | import os 5 | import base64 6 | import shutil 7 | from tqdm import tqdm 8 | from github import Github 9 | from github import GithubException 10 | import subprocess 11 | import time 12 | 13 | pattern = r"(?im)log.*\.(log|error|info|warn|fatal|debug|trace|off|all)\(.*\)" 14 | regex = re.compile(pattern, re.DOTALL) 15 | 16 | 17 | def git_clone(owner, repo): 18 | max_attempts = 5 19 | retry_wait_time = 5 # in seconds 20 | 21 | git_url = f"https://github.com/{owner}/{repo}.git" 22 | local_path = f"./temp/{repo}" 23 | cmd = ["git", "clone", git_url, local_path] 24 | 25 | for i in range(max_attempts): 26 | try: 27 | subprocess.check_call(cmd) 28 | print("Git
clone successful!") 29 | break 30 | except subprocess.CalledProcessError as e: 31 | print(f"Git clone attempt {i + 1} failed with error code {e.returncode}. Retrying in {retry_wait_time} seconds...") 32 | time.sleep(retry_wait_time) 33 | else: 34 | print(f"Git clone failed after {max_attempts} attempts.") 35 | 36 | 37 | def get_sha_for_tag(repository, tag): 38 | """ 39 | Returns a commit PyGithub object for the specified repository and tag. 40 | """ 41 | branches = repository.get_branches() 42 | matched_branches = [match for match in branches if match.name == tag] 43 | if matched_branches: 44 | return matched_branches[0].commit.sha 45 | 46 | tags = repository.get_tags() 47 | matched_tags = [match for match in tags if match.name == tag] 48 | if not matched_tags: 49 | print("No Tag or Branch exists with that name") 50 | return None 51 | return matched_tags[0].commit.sha 52 | 53 | 54 | def check_java(path): 55 | try: 56 | with open(path, 'r') as file: 57 | content = file.read() 58 | words = content.split() 59 | if len(words) > 300: 60 | return False 61 | lines = content.split('\n') 62 | if len(lines) > 300: 63 | return False 64 | match = regex.search(content) 65 | if match: 66 | return True 67 | except UnicodeDecodeError as e: 68 | print(f"Error: {e} and Path: {path}") 69 | return False 70 | return False 71 | 72 | 73 | def download_java_file(git, sha, repo, path): 74 | try: 75 | file_content = git.get_contents(path, ref=sha) 76 | _, file_name = os.path.split(path) 77 | file_data = base64.b64decode(file_content.content) 78 | file_out = open(f"repos/{repo}/{file_name}", "wb") 79 | file_out.write(file_data) 80 | file_out.close() 81 | if check_java(f"repos/{repo}/{file_name}") == False: 82 | os.remove(f"repos/{repo}/{file_name}") 83 | return 0 84 | return 1 85 | except (GithubException, IOError) as exc: 86 | print('Error processing %s: %s', path, exc) 87 | return 0 88 | 89 | 90 | def download_java(owner, repo, access_token, branch="master"): 91 | if not os.path.exists(f"repos/{repo}/"): 92 | os.makedirs(f"repos/{repo}/") 93 | 94 | git = Github(access_token) 95 | try: 96 | git_repo = git.get_repo(f"{owner}/{repo}") 97 | except GithubException as e: 98 | if e.status == 404: 99 | print("Non") 100 | else: 101 | print("Error") 102 | shutil.rmtree(f"repos/{repo}/") 103 | return False 104 | sha = get_sha_for_tag(git_repo, branch) 105 | 106 | # Define the Github Tree API endpoint and repository details 107 | api_url = 'https://api.github.com/repos/{owner}/{repo}/git/trees/{branch}?recursive=1' 108 | # Make an HTTP GET request to the Github Tree API endpoint 109 | headers = {'Authorization': f'token {access_token}'} 110 | response = requests.get(api_url.format(owner=owner, repo=repo, branch=branch), headers=headers) 111 | data = response.json() 112 | cnt1, cnt2 = 0, 0 113 | print(git_repo.size) 114 | if git_repo.size < 500000000: 115 | git_clone(owner, repo) 116 | for subdir, dirs, files in os.walk(f"./temp/{repo}"): 117 | for file in tqdm(files): 118 | if not file.endswith(".java"): 119 | continue 120 | file_path = os.path.join(subdir, file) 121 | if os.path.getsize(file_path) < 15 * 1024: 122 | cnt2 += 1 123 | if check_java(file_path): 124 | cnt1 += 1 125 | shutil.copy2(file_path, f"repos/{repo}/{file}") 126 | shutil.rmtree(f"./temp/{repo}") 127 | else: 128 | print("File is too large!") 129 | if sha is not None: 130 | tree = data['tree'] 131 | leng = len(tree) 132 | for file in tqdm(tree): 133 | #for item in tqdm(tree): 134 | if file['type'] != "tree" and file['size'] < 15 * 1024 and 
file['path'].endswith(".java"): 135 | cnt1 += 1 136 | cnt1 += download_java_file(git_repo, sha, repo, file['path']) 137 | cnt2 += 1 138 | if cnt1 == 0: 139 | shutil.rmtree(f"repos/{repo}/") 140 | print(f"{owner}/{repo} downloaded: {cnt1}/{cnt1+cnt2} files") 141 | return cnt1, cnt2 142 | -------------------------------------------------------------------------------- /src/unixcoder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Microsoft Corporation. 2 | # Licensed under the MIT license. 3 | 4 | import torch 5 | import torch.nn as nn 6 | from transformers import RobertaTokenizer, RobertaModel, RobertaConfig 7 | 8 | class UniXcoder(nn.Module): 9 | def __init__(self, model_name): 10 | """ 11 | Build UniXcoder. 12 | 13 | Parameters: 14 | 15 | * `model_name`- huggingface model card name. e.g. microsoft/unixcoder-base 16 | """ 17 | super(UniXcoder, self).__init__() 18 | self.tokenizer = RobertaTokenizer.from_pretrained(model_name) 19 | self.config = RobertaConfig.from_pretrained(model_name) 20 | self.config.is_decoder = True 21 | self.model = RobertaModel.from_pretrained(model_name, config=self.config) 22 | 23 | self.register_buffer("bias", torch.tril(torch.ones((1024, 1024), dtype=torch.uint8)).view(1,1024, 1024)) 24 | self.lm_head = nn.Linear(self.config.hidden_size, self.config.vocab_size, bias=False) 25 | self.lm_head.weight = self.model.embeddings.word_embeddings.weight 26 | self.lsm = nn.LogSoftmax(dim=-1) 27 | 28 | self.tokenizer.add_tokens(["<mask0>"],special_tokens=True) 29 | 30 | def tokenize(self, inputs, mode="<encoder-only>", max_length=512, padding=False): 31 | """ 32 | Convert string to token ids 33 | 34 | Parameters: 35 | 36 | * `inputs`- list of input strings. 37 | * `max_length`- The maximum total source sequence length after tokenization. 38 | * `padding`- whether to pad source sequence length to max_length. 39 | * `mode`- which mode the sequence will use. i.e. <encoder-only>
, <decoder-only>, <encoder-decoder> 40 | """ 41 | assert mode in ["<encoder-only>", "<decoder-only>", "<encoder-decoder>"] 42 | assert max_length < 1024 43 | 44 | tokenizer = self.tokenizer 45 | 46 | tokens_ids = [] 47 | for x in inputs: 48 | tokens = tokenizer.tokenize(x) 49 | if mode == "<encoder-only>": 50 | tokens = tokens[:max_length-4] 51 | tokens = [tokenizer.cls_token,mode,tokenizer.sep_token] + tokens + [tokenizer.sep_token] 52 | elif mode == "<decoder-only>": 53 | tokens = tokens[-(max_length-3):] 54 | tokens = [tokenizer.cls_token,mode,tokenizer.sep_token] + tokens 55 | else: 56 | tokens = tokens[:max_length-5] 57 | tokens = [tokenizer.cls_token,mode,tokenizer.sep_token] + tokens + [tokenizer.sep_token] 58 | 59 | tokens_id = tokenizer.convert_tokens_to_ids(tokens) 60 | if padding: 61 | tokens_id = tokens_id + [self.config.pad_token_id] * (max_length-len(tokens_id)) 62 | tokens_ids.append(tokens_id) 63 | return tokens_ids 64 | 65 | def decode(self, source_ids): 66 | """ Convert token ids to string """ 67 | predictions = [] 68 | for x in source_ids: 69 | prediction = [] 70 | for y in x: 71 | t = y.cpu().numpy() 72 | t = list(t) 73 | if 0 in t: 74 | t = t[:t.index(0)] 75 | text = self.tokenizer.decode(t,clean_up_tokenization_spaces=False) 76 | prediction.append(text) 77 | predictions.append(prediction) 78 | return predictions 79 | 80 | def forward(self, source_ids): 81 | """ Obtain token embeddings and sentence embeddings """ 82 | mask = source_ids.ne(self.config.pad_token_id) 83 | token_embeddings = self.model(source_ids,attention_mask = mask.unsqueeze(1) * mask.unsqueeze(2))[0] 84 | sentence_embeddings = (token_embeddings * mask.unsqueeze(-1)).sum(1) / mask.sum(-1).unsqueeze(-1) 85 | return token_embeddings, sentence_embeddings 86 | 87 | def generate(self, source_ids, decoder_only = True, eos_id = None, beam_size = 5, max_length = 64): 88 | """ Generate sequence given context (source_ids) """ 89 | 90 | # Set encoder mask attention matrix: bidirectional for <encoder-decoder>, unidirectional for <decoder-only> 91 | if decoder_only: 92 | mask = self.bias[:,:source_ids.size(-1),:source_ids.size(-1)] 93 | else: 94 | mask = source_ids.ne(self.config.pad_token_id) 95 | mask = mask.unsqueeze(1) * mask.unsqueeze(2) 96 | 97 | if eos_id is None: 98 | eos_id = self.config.eos_token_id 99 | 100 | device = source_ids.device 101 | 102 | # Decoding using beam search 103 | preds = [] 104 | zero = torch.LongTensor(1).fill_(0).to(device) 105 | source_len = list(source_ids.ne(1).sum(-1).cpu().numpy()) 106 | length = source_ids.size(-1) 107 | encoder_output = self.model(source_ids,attention_mask=mask) 108 | for i in range(source_ids.shape[0]): 109 | context = [[x[i:i+1,:,:source_len[i]].repeat(beam_size,1,1,1) for x in y] 110 | for y in encoder_output.past_key_values] 111 | beam = Beam(beam_size,eos_id,device) 112 | input_ids = beam.getCurrentState().clone() 113 | context_ids = source_ids[i:i+1,:source_len[i]].repeat(beam_size,1) 114 | out = encoder_output.last_hidden_state[i:i+1,:source_len[i]].repeat(beam_size,1,1) 115 | for _ in range(max_length): 116 | if beam.done(): 117 | break 118 | if _ == 0: 119 | hidden_states = out[:,-1,:] 120 | out = self.lsm(self.lm_head(hidden_states)).data 121 | beam.advance(out) 122 | input_ids.data.copy_(input_ids.data.index_select(0, beam.getCurrentOrigin())) 123 | input_ids = beam.getCurrentState().clone() 124 | else: 125 | length = context_ids.size(-1)+input_ids.size(-1) 126 | out = self.model(input_ids,attention_mask=self.bias[:,context_ids.size(-1):length,:length], 127 | past_key_values=context).last_hidden_state 128 | hidden_states = out[:,-1,:] 129 | out =
self.lsm(self.lm_head(hidden_states)).data 130 | beam.advance(out) 131 | input_ids.data.copy_(input_ids.data.index_select(0, beam.getCurrentOrigin())) 132 | input_ids = torch.cat((input_ids,beam.getCurrentState().clone()),-1) 133 | hyp = beam.getHyp(beam.getFinal()) 134 | pred = beam.buildTargetTokens(hyp)[:beam_size] 135 | pred = [torch.cat([x.view(-1) for x in p]+[zero]*(max_length-len(p))).view(1,-1) for p in pred] 136 | preds.append(torch.cat(pred,0).unsqueeze(0)) 137 | 138 | preds = torch.cat(preds,0) 139 | 140 | return preds 141 | 142 | 143 | 144 | class Beam(object): 145 | def __init__(self, size, eos, device): 146 | self.size = size 147 | self.device = device 148 | # The score for each translation on the beam. 149 | self.scores = torch.FloatTensor(size).zero_().to(device) 150 | # The backpointers at each time-step. 151 | self.prevKs = [] 152 | # The outputs at each time-step. 153 | self.nextYs = [torch.LongTensor(size).fill_(0).to(device)] 154 | # Has EOS topped the beam yet. 155 | self._eos = eos 156 | self.eosTop = False 157 | # Time and k pair for finished. 158 | self.finished = [] 159 | 160 | def getCurrentState(self): 161 | "Get the outputs for the current timestep." 162 | batch = self.nextYs[-1].view(-1, 1) 163 | return batch 164 | 165 | def getCurrentOrigin(self): 166 | "Get the backpointers for the current timestep." 167 | return self.prevKs[-1] 168 | 169 | def advance(self, wordLk): 170 | """ 171 | Given prob over words for every last beam `wordLk` and attention 172 | `attnOut`: Compute and update the beam search. 173 | 174 | Parameters: 175 | 176 | * `wordLk`- probs of advancing from the last step (K x words) 177 | * `attnOut`- attention at the last step 178 | 179 | Returns: True if beam search is complete. 180 | """ 181 | numWords = wordLk.size(1) 182 | 183 | # Sum the previous scores. 184 | if len(self.prevKs) > 0: 185 | beamLk = wordLk + self.scores.unsqueeze(1).expand_as(wordLk) 186 | 187 | # Don't let EOS have children. 188 | for i in range(self.nextYs[-1].size(0)): 189 | if self.nextYs[-1][i] == self._eos: 190 | beamLk[i] = -1e20 191 | else: 192 | beamLk = wordLk[0] 193 | flatBeamLk = beamLk.view(-1) 194 | bestScores, bestScoresId = flatBeamLk.topk(self.size, 0, True, True) 195 | 196 | self.scores = bestScores 197 | 198 | # bestScoresId is flattened beam x word array, so calculate which 199 | # word and beam each score came from 200 | prevK = torch.div(bestScoresId, numWords, rounding_mode="floor") 201 | self.prevKs.append(prevK) 202 | self.nextYs.append((bestScoresId - prevK * numWords)) 203 | 204 | 205 | for i in range(self.nextYs[-1].size(0)): 206 | if self.nextYs[-1][i] == self._eos: 207 | s = self.scores[i] 208 | self.finished.append((s, len(self.nextYs) - 1, i)) 209 | 210 | # End condition is when top-of-beam is EOS and no global score. 
211 | if self.nextYs[-1][0] == self._eos: 212 | self.eosTop = True 213 | 214 | def done(self): 215 | return self.eosTop and len(self.finished) >= self.size 216 | 217 | def getFinal(self): 218 | if len(self.finished) == 0: 219 | self.finished.append((self.scores[0], len(self.nextYs) - 1, 0)) 220 | self.finished.sort(key=lambda a: -a[0]) 221 | if len(self.finished) != self.size: 222 | unfinished=[] 223 | for i in range(self.nextYs[-1].size(0)): 224 | if self.nextYs[-1][i] != self._eos: 225 | s = self.scores[i] 226 | unfinished.append((s, len(self.nextYs) - 1, i)) 227 | unfinished.sort(key=lambda a: -a[0]) 228 | self.finished+=unfinished[:self.size-len(self.finished)] 229 | return self.finished[:self.size] 230 | 231 | def getHyp(self, beam_res): 232 | """ 233 | Walk back to construct the full hypothesis. 234 | """ 235 | hyps=[] 236 | for _,timestep, k in beam_res: 237 | hyp = [] 238 | for j in range(len(self.prevKs[:timestep]) - 1, -1, -1): 239 | hyp.append(self.nextYs[j+1][k]) 240 | k = self.prevKs[j][k] 241 | hyps.append(hyp[::-1]) 242 | return hyps 243 | 244 | def buildTargetTokens(self, preds): 245 | sentence=[] 246 | for pred in preds: 247 | tokens = [] 248 | for tok in pred: 249 | if tok==self._eos: 250 | break 251 | tokens.append(tok) 252 | sentence.append(tokens) 253 | return sentence 254 | 255 | --------------------------------------------------------------------------------
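
Note: the snippet below is an illustrative usage sketch, not a file from the repository. It shows how the `tokenize`, `generate`, and `decode` methods of the `UniXcoder` wrapper in `src/unixcoder.py` above fit together for decoder-only completion of a code context, for example to draft a log statement. The checkpoint name `microsoft/unixcoder-base` is taken from the class docstring; the Java context string and all variable names are hypothetical.

```python
import torch
from unixcoder import UniXcoder  # the class defined in src/unixcoder.py above

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = UniXcoder("microsoft/unixcoder-base").to(device)

# Hypothetical Java context that stops right where a logging call could be completed.
context = (
    "public void connect(String host) {\n"
    "    try {\n"
    "        client.connect(host);\n"
    "    } catch (IOException e) {\n"
    "        logger."
)

# <decoder-only> mode keeps the rightmost tokens of the context for left-to-right generation.
token_ids = model.tokenize([context], max_length=512, mode="<decoder-only>")
source_ids = torch.tensor(token_ids).to(device)

# Beam search over the continuation; returns ids of shape (batch, beam_size, max_length).
prediction_ids = model.generate(source_ids, decoder_only=True, beam_size=3, max_length=64)
completions = model.decode(prediction_ids)
print(completions[0])  # beam_size candidate continuations for the single input
```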