├── .gitignore ├── .vscode └── settings.json ├── AST_GEN.md ├── AutoComment_ResearchPaper.pdf ├── JavaExtractor ├── JPredict │ ├── .classpath │ ├── .gitignore │ ├── .project │ ├── .settings │ │ ├── org.eclipse.core.resources.prefs │ │ ├── org.eclipse.jdt.apt.core.prefs │ │ └── org.eclipse.jdt.core.prefs │ ├── JavaExtractor (1).iml │ ├── JavaExtractor.iml │ ├── dependency-reduced-pom.xml │ ├── error_log.txt │ ├── pom.xml │ └── src │ │ └── main │ │ └── java │ │ ├── JavaExtractor │ │ ├── App.java │ │ ├── Common │ │ │ ├── CommandLineValues.java │ │ │ ├── Common.java │ │ │ └── MethodContent.java │ │ ├── ExtractFeaturesTask.java │ │ ├── FeatureExtractor.java │ │ ├── FeaturesEntities │ │ │ ├── ProgramFeatures.java │ │ │ ├── ProgramRelation.java │ │ │ └── Property.java │ │ ├── Main.java │ │ └── Visitors │ │ │ ├── FunctionVisitor.java │ │ │ └── LeavesCollectorVisitor.java │ │ └── Test.java └── extract.py ├── README.md ├── bleu_score.py ├── code2seq_master ├── .gitignore ├── CSharpExtractor │ ├── .gitattributes │ ├── .gitignore │ ├── CSharpExtractor │ │ ├── .nuget │ │ │ └── packages.config │ │ ├── CSharpExtractor.sln │ │ └── Extractor │ │ │ ├── Extractor.cs │ │ │ ├── Extractor.csproj │ │ │ ├── PathFinder.cs │ │ │ ├── Program.cs │ │ │ ├── Properties │ │ │ └── launchSettings.json │ │ │ ├── Temp.cs │ │ │ ├── Tree │ │ │ └── Tree.cs │ │ │ ├── Utilities.cs │ │ │ └── Variable.cs │ └── extract.py ├── Input.java ├── JavaExtractor │ ├── JPredict │ │ ├── .classpath │ │ ├── .gitignore │ │ └── src │ │ │ └── main │ │ │ └── java │ │ │ ├── JavaExtractor │ │ │ ├── App.java │ │ │ ├── Common │ │ │ │ ├── CommandLineValues.java │ │ │ │ ├── Common.java │ │ │ │ └── MethodContent.java │ │ │ ├── ExtractFeaturesTask.java │ │ │ ├── FeatureExtractor.java │ │ │ ├── FeaturesEntities │ │ │ │ ├── ProgramFeatures.java │ │ │ │ ├── ProgramRelation.java │ │ │ │ └── Property.java │ │ │ └── Visitors │ │ │ │ ├── FunctionVisitor.java │ │ │ │ └── LeavesCollectorVisitor.java │ │ │ └── Test.java │ └── extract.py ├── LICENSE ├── README.md ├── __init__.py ├── baseline_tokenization │ ├── input_example.txt │ ├── javalang │ │ ├── __init__.py │ │ ├── ast.py │ │ ├── javadoc.py │ │ ├── parse.py │ │ ├── parser.py │ │ ├── test │ │ │ ├── __init__.py │ │ │ ├── source │ │ │ │ └── package-info │ │ │ │ │ ├── AnnotationJavadoc.java │ │ │ │ │ ├── AnnotationOnly.java │ │ │ │ │ ├── JavadocAnnotation.java │ │ │ │ │ ├── JavadocOnly.java │ │ │ │ │ └── NoAnnotationNoJavadoc.java │ │ │ ├── test_java_8_syntax.py │ │ │ ├── test_javadoc.py │ │ │ ├── test_package_declaration.py │ │ │ └── test_util.py │ │ ├── tokenizer.py │ │ ├── tree.py │ │ └── util.py │ └── subtokenize_nmt_baseline.py ├── code2seq.py ├── code2seq_ast_extractor.py ├── common.py ├── config.py ├── extract_ast.py ├── extractor.py ├── images │ └── network.png ├── init.py ├── interactive_predict.py ├── java_files_creator.py ├── model.py ├── preprocess.py ├── preprocess.sh ├── preprocess_csharp.sh ├── preprocess_custom.sh ├── reader.py ├── test_extracted_ast.py └── train.sh ├── data ├── .gitignore └── data.7z ├── images ├── network_architecture.png └── pipeline.png ├── poster ├── ML4SE_Poster_Group_3.pdf └── source_code │ ├── example.java │ ├── img │ ├── Embedding.png │ ├── TU_P1_full-color.png │ ├── distr.png │ ├── link_to_github.png │ ├── results_table.png │ └── zoomedInLength.png │ ├── poster.tex │ └── tudelftposter.cls ├── preproc ├── __init__.py ├── common.py ├── feature_extractor.py ├── java_files_creator.py ├── preprocess.py └── preprocess.sh ├── presentation ├── AutoComments_Presentation-Group3.pdf 
└── link_to_presentation.txt ├── report ├── ML4SE_group_3_report.pdf └── latex_code │ ├── BasicEncoderDecoder.png │ ├── BiLSTM.png │ ├── Embedding.png │ ├── Encoder(1).png │ ├── ExampleAST.png │ ├── LSTM.png │ ├── blueprints.tex │ ├── distr.png │ ├── example.java │ ├── main.tex │ ├── reference.bib │ ├── source-code │ ├── 1.java │ ├── 2.java │ ├── 3.java │ ├── 4.java │ ├── 5.java │ ├── 6.java │ ├── 7.java │ ├── 8.java │ └── 9.java │ └── zoomedInLength.png └── scripts └── multi-bleu.perl /.gitignore: -------------------------------------------------------------------------------- 1 | /data/*.json 2 | code2vec_base 3 | .idea 4 | code2vec_model 5 | tmp 6 | code2seq-master/java_code_valid 7 | code2seq-master/java_code_train 8 | code2seq-master/java_code_test 9 | data/auto_comment_dataset 10 | code2seq-master 11 | apnews_dbow -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "java.configuration.updateBuildConfiguration": "automatic", 3 | "files.exclude": { 4 | "**/.classpath": true, 5 | "**/.project": true, 6 | "**/.settings": true, 7 | "**/.factorypath": true 8 | } 9 | } -------------------------------------------------------------------------------- /AST_GEN.md: -------------------------------------------------------------------------------- 1 | # AutoComments 2 | 3 | ## Generation of AST - Steps: 4 | 5 | Before anything else, extract 'data.7z' in the data folder. 6 | 7 | STEP 1 - Run: python preproc/java_files_creator.py 8 | 9 | STEP 2 - Run: bash preproc/preprocess_custom.sh 10 | 11 | RESULTS: 12 | The ASTs for the test, train, and valid splits are written to the folder data/auto_comment_dataset. 13 | 14 | NOTE: 15 | 1. Run all the above steps from the project's root directory. 16 | 2. Toggle the boolean "get_ast_full_file" to extract ASTs for the full dataset or only the first 100 code snippets: 17 | i. True -> Runs for the full dataset 18 | ii. 
False -> Runs for the first 100 code snippets 19 | -------------------------------------------------------------------------------- /AutoComment_ResearchPaper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/AutoComment_ResearchPaper.pdf -------------------------------------------------------------------------------- /JavaExtractor/JPredict/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/.gitignore: -------------------------------------------------------------------------------- 1 | /target/ -------------------------------------------------------------------------------- /JavaExtractor/JPredict/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | JavaExtractor 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | org.eclipse.m2e.core.maven2Builder 15 | 16 | 17 | 18 | 19 | 20 | org.eclipse.jdt.core.javanature 21 | org.eclipse.m2e.core.maven2Nature 22 | 23 | 24 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/.settings/org.eclipse.core.resources.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | encoding//src/main/java=UTF-8 3 | encoding/=UTF-8 4 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/.settings/org.eclipse.jdt.apt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.apt.aptEnabled=false 3 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 3 | org.eclipse.jdt.core.compiler.compliance=1.8 4 | org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled 5 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning 6 | org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=ignore 7 | org.eclipse.jdt.core.compiler.processAnnotations=disabled 8 | org.eclipse.jdt.core.compiler.release=disabled 9 | org.eclipse.jdt.core.compiler.source=1.8 10 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/JavaExtractor (1).iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/JavaExtractor.iml: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/dependency-reduced-pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | JavaExtractor 5 | JavaExtractor 6 | JPredict 7 | 
0.0.1-SNAPSHOT 8 | http://maven.apache.org 9 | 10 | 11 | 12 | maven-compiler-plugin 13 | 3.2 14 | 15 | 1.8 16 | 1.8 17 | 18 | Test.java 19 | 20 | 21 | 22 | 23 | maven-shade-plugin 24 | 2.1 25 | 26 | 27 | package 28 | 29 | shade 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | UTF-8 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/error_log.txt: -------------------------------------------------------------------------------- 1 | /usr/local/Cellar/python/3.7.2_2/Frameworks/Python.framework/Versions/3.7/Resources/Python.app/Contents/MacOS/Python: can't open file 'JavaExtractor/extract.py': [Errno 2] No such file or directory 2 | /usr/local/Cellar/python/3.7.2_2/Frameworks/Python.framework/Versions/3.7/Resources/Python.app/Contents/MacOS/Python: can't open file 'JavaExtractor/extract.py': [Errno 2] No such file or directory 3 | /usr/local/Cellar/python/3.7.2_2/Frameworks/Python.framework/Versions/3.7/Resources/Python.app/Contents/MacOS/Python: can't open file 'JavaExtractor/extract.py': [Errno 2] No such file or directory 4 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | JavaExtractor 6 | JavaExtractor 7 | JPredict 8 | 0.0.1-SNAPSHOT 9 | http://maven.apache.org 10 | 11 | 12 | 13 | maven-compiler-plugin 14 | 3.2 15 | 16 | 1.8 17 | 1.8 18 | 19 | Test.java 20 | 21 | 22 | 23 | 24 | maven-shade-plugin 25 | 2.1 26 | 27 | 28 | package 29 | 30 | shade 31 | 32 | 33 | 34 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | com.github.javaparser 47 | javaparser-core 48 | 3.0.0-alpha.4 49 | 50 | 51 | commons-io 52 | commons-io 53 | 1.3.2 54 | compile 55 | 56 | 57 | com.fasterxml.jackson.core 58 | jackson-databind 59 | 2.9.8 60 | 61 | 62 | args4j 63 | args4j 64 | 2.33 65 | 66 | 67 | org.apache.commons 68 | commons-lang3 69 | 3.5 70 | 71 | 72 | 73 | UTF-8 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/src/main/java/JavaExtractor/App.java: -------------------------------------------------------------------------------- 1 | package JavaExtractor; 2 | 3 | import JavaExtractor.Common.CommandLineValues; 4 | import org.kohsuke.args4j.CmdLineException; 5 | 6 | import java.io.IOException; 7 | import java.nio.file.Files; 8 | import java.nio.file.Paths; 9 | import java.util.LinkedList; 10 | import java.util.List; 11 | import java.util.concurrent.ExecutionException; 12 | import java.util.concurrent.Executors; 13 | import java.util.concurrent.Future; 14 | import java.util.concurrent.ThreadPoolExecutor; 15 | 16 | public class App { 17 | private static CommandLineValues s_CommandLineValues; 18 | 19 | public static void main(String[] args) { 20 | try { 21 | s_CommandLineValues = new CommandLineValues(args); 22 | } catch (CmdLineException e) { 23 | e.printStackTrace(); 24 | return; 25 | } 26 | 27 | if (s_CommandLineValues.File != null) { 28 | ExtractFeaturesTask extractFeaturesTask = new ExtractFeaturesTask(s_CommandLineValues, 29 | s_CommandLineValues.File.toPath()); 30 | extractFeaturesTask.processFile(); 31 | } else if (s_CommandLineValues.Dir != null) { 32 | extractDir(); 33 | } 34 | } 35 | 36 | private static void extractDir() { 37 | ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(s_CommandLineValues.NumThreads); 38 | LinkedList tasks = new 
LinkedList<>(); 39 | try { 40 | Files.walk(Paths.get(s_CommandLineValues.Dir)).filter(Files::isRegularFile) 41 | .filter(p -> p.toString().toLowerCase().endsWith(".java")).forEach(f -> { 42 | ExtractFeaturesTask task = new ExtractFeaturesTask(s_CommandLineValues, f); 43 | tasks.add(task); 44 | }); 45 | } catch (IOException e) { 46 | e.printStackTrace(); 47 | return; 48 | } 49 | List<Future<Void>> tasksResults = null; 50 | try { 51 | tasksResults = executor.invokeAll(tasks); 52 | } catch (InterruptedException e) { 53 | e.printStackTrace(); 54 | } finally { 55 | executor.shutdown(); 56 | } 57 | tasksResults.forEach(f -> { 58 | try { 59 | f.get(); 60 | } catch (InterruptedException | ExecutionException e) { 61 | e.printStackTrace(); 62 | } 63 | }); 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/CommandLineValues.java: -------------------------------------------------------------------------------- 1 | package JavaExtractor.Common; 2 | 3 | import org.kohsuke.args4j.CmdLineException; 4 | import org.kohsuke.args4j.CmdLineParser; 5 | import org.kohsuke.args4j.Option; 6 | 7 | import java.io.File; 8 | 9 | /** 10 | * This class handles the program's arguments. 11 | */ 12 | public class CommandLineValues { 13 | @Option(name = "--file", required = false) 14 | public File File = null; 15 | 16 | @Option(name = "--dir", required = false, forbids = "--file") 17 | public String Dir = null; 18 | 19 | @Option(name = "--max_path_length", required = true) 20 | public int MaxPathLength; 21 | 22 | @Option(name = "--max_path_width", required = true) 23 | public int MaxPathWidth; 24 | 25 | @Option(name = "--num_threads", required = false) 26 | public int NumThreads = 64; 27 | 28 | @Option(name = "--min_code_len", required = false) 29 | public int MinCodeLength = 1; 30 | 31 | @Option(name = "--max_code_len", required = false) 32 | public int MaxCodeLength = -1; 33 | 34 | @Option(name = "--max_file_len", required = false) 35 | public int MaxFileLength = -1; 36 | 37 | @Option(name = "--pretty_print", required = false) 38 | public boolean PrettyPrint = false; 39 | 40 | @Option(name = "--max_child_id", required = false) 41 | public int MaxChildId = 3; 42 | 43 | public CommandLineValues(String... 
args) throws CmdLineException { 44 | CmdLineParser parser = new CmdLineParser(this); 45 | try { 46 | parser.parseArgument(args); 47 | } catch (CmdLineException e) { 48 | System.err.println(e.getMessage()); 49 | parser.printUsage(System.err); 50 | throw e; 51 | } 52 | } 53 | 54 | public CommandLineValues() { 55 | 56 | } 57 | } -------------------------------------------------------------------------------- /JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java: -------------------------------------------------------------------------------- 1 | package JavaExtractor.Common; 2 | 3 | import JavaExtractor.FeaturesEntities.Property; 4 | import com.github.javaparser.ast.Node; 5 | import com.github.javaparser.ast.UserDataKey; 6 | 7 | import java.util.ArrayList; 8 | import java.util.stream.Collectors; 9 | import java.util.stream.Stream; 10 | 11 | public final class Common { 12 | public static final UserDataKey<Property> PropertyKey = new UserDataKey<Property>() { 13 | }; 14 | public static final UserDataKey<Integer> ChildId = new UserDataKey<Integer>() { 15 | }; 16 | public static final String EmptyString = ""; 17 | 18 | public static final String MethodDeclaration = "MethodDeclaration"; 19 | public static final String NameExpr = "NameExpr"; 20 | public static final String BlankWord = "BLANK"; 21 | 22 | public static final int c_MaxLabelLength = 50; 23 | public static final String methodName = "METHOD_NAME"; 24 | public static final String internalSeparator = "|"; 25 | 26 | public static String normalizeName(String original, String defaultString) { 27 | original = original.toLowerCase().replaceAll("\\\\n", "") // escaped new 28 | // lines 29 | .replaceAll("\\s+", "") // whitespaces 30 | .replaceAll("[\"',]", "") // quotes, apostrophes, commas 31 | .replaceAll("\\P{Print}", ""); // non-printable unicode characters 32 | String stripped = original.replaceAll("[^A-Za-z]", ""); 33 | if (stripped.length() == 0) { 34 | String carefulStripped = original.replaceAll(" ", "_"); 35 | if (carefulStripped.length() == 0) { 36 | return defaultString; 37 | } else { 38 | return carefulStripped; 39 | } 40 | } else { 41 | return stripped; 42 | } 43 | } 44 | 45 | public static boolean isMethod(Node node, String type) { 46 | Property parentProperty = node.getParentNode().getUserData(Common.PropertyKey); 47 | if (parentProperty == null) { 48 | return false; 49 | } 50 | 51 | String parentType = parentProperty.getType(); 52 | return Common.NameExpr.equals(type) && Common.MethodDeclaration.equals(parentType); 53 | } 54 | 55 | public static ArrayList<String> splitToSubtokens(String str1) { 56 | String str2 = str1.replace("|", " "); 57 | String str3 = str2.trim(); 58 | return Stream.of(str3.split("(?<=[a-z])(?=[A-Z])|_|[0-9]|(?<=[A-Z])(?=[A-Z][a-z])|\\s+")) 59 | .filter(s -> s.length() > 0).map(s -> Common.normalizeName(s, Common.EmptyString)) 60 | .filter(s -> s.length() > 0).collect(Collectors.toCollection(ArrayList::new)); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/MethodContent.java: -------------------------------------------------------------------------------- 1 | package JavaExtractor.Common; 2 | 3 | import com.github.javaparser.ast.Node; 4 | 5 | import java.util.ArrayList; 6 | 7 | public class MethodContent { 8 | private final ArrayList<Node> leaves; 9 | private final String name; 10 | 11 | public MethodContent(ArrayList<Node> leaves, String name) { 12 | this.leaves = leaves; 13 | this.name = name; 14 | } 15 | 16 | public ArrayList<Node> getLeaves() { 
return leaves; 18 | } 19 | 20 | public String getName() { 21 | return name; 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/src/main/java/JavaExtractor/ExtractFeaturesTask.java: -------------------------------------------------------------------------------- 1 | package JavaExtractor; 2 | 3 | import JavaExtractor.Common.CommandLineValues; 4 | import JavaExtractor.Common.Common; 5 | import JavaExtractor.FeaturesEntities.ProgramFeatures; 6 | import org.apache.commons.lang3.StringUtils; 7 | 8 | import java.io.IOException; 9 | import java.nio.charset.Charset; 10 | import java.nio.file.Files; 11 | import java.nio.file.Path; 12 | import java.nio.file.Paths; 13 | import java.util.ArrayList; 14 | import java.util.List; 15 | import java.util.concurrent.Callable; 16 | 17 | class ExtractFeaturesTask implements Callable<Void> { 18 | private final CommandLineValues m_CommandLineValues; 19 | private final Path filePath; 20 | 21 | public ExtractFeaturesTask(CommandLineValues commandLineValues, Path path) { 22 | m_CommandLineValues = commandLineValues; 23 | this.filePath = path; 24 | } 25 | 26 | @Override 27 | public Void call() { 28 | processFile(); 29 | return null; 30 | } 31 | 32 | public void processFile() { 33 | ArrayList<ProgramFeatures> features; 34 | try { 35 | features = extractSingleFile(); 36 | } catch (IOException e) { 37 | e.printStackTrace(); 38 | return; 39 | } 40 | if (features == null) { 41 | return; 42 | } 43 | // Find out how to iterate over programFeatures 44 | String toPrint = featuresToString(features); 45 | if (toPrint.length() > 0) { 46 | System.out.println(toPrint); 47 | } 48 | } 49 | 50 | private ArrayList<ProgramFeatures> extractSingleFile() throws IOException { 51 | String code; 52 | String comment; 53 | 54 | if (m_CommandLineValues.MaxFileLength > 0 && 55 | Files.lines(filePath, Charset.defaultCharset()).count() > m_CommandLineValues.MaxFileLength) { 56 | return new ArrayList<>(); 57 | } 58 | try { 59 | code = new String(Files.readAllBytes(filePath)); 60 | } catch (IOException e) { 61 | e.printStackTrace(); 62 | code = Common.EmptyString; 63 | } 64 | 65 | if (code.isEmpty()) { 66 | comment = Common.EmptyString; 67 | } 68 | else { 69 | String commentPath = filePath.toString(); 70 | commentPath = commentPath.replace('\\', '/'); 71 | int lst = commentPath.lastIndexOf("/"); 72 | commentPath = commentPath.replace(commentPath.substring(lst + 1), "comment.txt"); 73 | Path pathToComment = Paths.get(commentPath); 74 | comment = new String(Files.readAllBytes(pathToComment)); 75 | } 76 | FeatureExtractor featureExtractor = new FeatureExtractor(m_CommandLineValues); 77 | 78 | return featureExtractor.extractFeatures(code, comment); 79 | } 80 | 81 | public String featuresToString(ArrayList<ProgramFeatures> features) { 82 | if (features == null || features.isEmpty()) { 83 | return Common.EmptyString; 84 | } 85 | 86 | List<String> methodsOutputs = new ArrayList<>(); 87 | 88 | for (ProgramFeatures singleMethodFeatures : features) { 89 | StringBuilder builder = new StringBuilder(); 90 | String toPrint = singleMethodFeatures.toString(); 91 | if (m_CommandLineValues.PrettyPrint) { 92 | toPrint = toPrint.replace(" ", "\n\t"); 93 | } 94 | builder.append(toPrint); 95 | methodsOutputs.add(builder.toString()); 96 | 97 | } 98 | return StringUtils.join(methodsOutputs, "\n"); 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/ProgramFeatures.java: 
-------------------------------------------------------------------------------- 1 | package JavaExtractor.FeaturesEntities; 2 | 3 | import com.fasterxml.jackson.annotation.JsonIgnore; 4 | 5 | import java.util.ArrayList; 6 | import java.util.stream.Collectors; 7 | 8 | public class ProgramFeatures { 9 | private final String name; 10 | 11 | private final ArrayList features = new ArrayList<>(); 12 | 13 | public ProgramFeatures(String name) { 14 | this.name = name; 15 | } 16 | 17 | @SuppressWarnings("StringBufferReplaceableByString") 18 | @Override 19 | public String toString() { 20 | StringBuilder stringBuilder = new StringBuilder(); 21 | stringBuilder.append(name).append(" "); 22 | stringBuilder.append(features.stream().map(ProgramRelation::toString).collect(Collectors.joining(" "))); 23 | 24 | return stringBuilder.toString(); 25 | } 26 | 27 | public void addFeature(Property source, String path, Property target) { 28 | ProgramRelation newRelation = new ProgramRelation(source, target, path); 29 | features.add(newRelation); 30 | } 31 | 32 | @JsonIgnore 33 | public boolean isEmpty() { 34 | return features.isEmpty(); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/ProgramRelation.java: -------------------------------------------------------------------------------- 1 | package JavaExtractor.FeaturesEntities; 2 | 3 | public class ProgramRelation { 4 | private final Property m_Source; 5 | private final Property m_Target; 6 | private final String m_Path; 7 | 8 | public ProgramRelation(Property sourceName, Property targetName, String path) { 9 | m_Source = sourceName; 10 | m_Target = targetName; 11 | m_Path = path; 12 | } 13 | 14 | public String toString() { 15 | return String.format("%s,%s,%s", m_Source.getName(), m_Path, 16 | m_Target.getName()); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/src/main/java/JavaExtractor/Main.java: -------------------------------------------------------------------------------- 1 | package JavaExtractor; 2 | import java.util.List; 3 | 4 | import com.github.javaparser.JavaParser; 5 | import com.github.javaparser.ast.CompilationUnit; 6 | import com.github.javaparser.ast.Node; 7 | 8 | 9 | public class Main { 10 | public static void main(String args[]) throws Exception { 11 | System.out.println("runs"); 12 | String code = "public class Class{\nprivate void assign(HashMap labelMap,String label,DBIDRef id){\nif (labelMap.containsKey(label)) {\nDBIDs exist=labelMap.get(label);\nif (exist instanceof DBID) {\n ModifiableDBIDs n=DBIDUtil.newHashSet();\n n.add((DBID)exist);\nn.add(id);lnlabelMap.put(label,n);\n }\n else {\n assert (exist instanceof HashSetModifiableDBIDs);\n assert (exist.size() > 1);\n ((ModifiableDBIDs)exist).add(id);\n }\n }\n else {\n labelMap.put(label,DBIDUtil.deref(id));\n }\n}\n}"; 13 | 14 | // CompilationUnit parsed = JavaParser.parse(code); 15 | 16 | 17 | System.out.printf("%-28s %-12s %s%n", "Node.class.simpleName", "Identifier", "Node.toString()"); 18 | System.out.printf("%-28s %-12s %s%n", "=====================", "==========", "==============="); 19 | CompilationUnit parsed = JavaParser.parse(code); 20 | // parsed.walk(node -> { 21 | // String identifier = ""; 22 | // if (node instanceof NodeWithIdentifier) 23 | // identifier = ((NodeWithIdentifier) node).getIdentifier(); 24 | // System.out.printf("%-28s %-12s %s%n", 25 | // 
node.getClass().getSimpleName(), 26 | // identifier, 27 | // node.toString().replaceFirst("(?s)\\R.*", "...")); 28 | // }); 29 | 30 | System.out.println(parsed); 31 | 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/src/main/java/JavaExtractor/Visitors/FunctionVisitor.java: -------------------------------------------------------------------------------- 1 | package JavaExtractor.Visitors; 2 | 3 | import JavaExtractor.Common.CommandLineValues; 4 | import JavaExtractor.Common.Common; 5 | import JavaExtractor.Common.MethodContent; 6 | import com.github.javaparser.ast.Node; 7 | import com.github.javaparser.ast.body.MethodDeclaration; 8 | import com.github.javaparser.ast.visitor.VoidVisitorAdapter; 9 | 10 | import java.util.ArrayList; 11 | import java.util.Arrays; 12 | import java.util.regex.Pattern; 13 | 14 | @SuppressWarnings("StringEquality") 15 | public class FunctionVisitor extends VoidVisitorAdapter<Object> { 16 | private final ArrayList<MethodContent> m_Methods = new ArrayList<>(); 17 | private final CommandLineValues m_CommandLineValues; 18 | 19 | public FunctionVisitor(CommandLineValues commandLineValues) { 20 | this.m_CommandLineValues = commandLineValues; 21 | } 22 | 23 | @Override 24 | public void visit(MethodDeclaration node, Object arg) { 25 | visitMethod(node, arg.toString()); 26 | 27 | super.visit(node, arg); 28 | } 29 | 30 | private void visitMethod(MethodDeclaration node, String comment) { 31 | LeavesCollectorVisitor leavesCollectorVisitor = new LeavesCollectorVisitor(); 32 | leavesCollectorVisitor.visitDepthFirst(node); 33 | ArrayList<Node> leaves = leavesCollectorVisitor.getLeaves(); 34 | String[] parts = comment.split(Pattern.quote(".")); 35 | comment = parts[0]; 36 | String normalizedMethodName = Common.normalizeName(comment, Common.BlankWord); 37 | // String normalizedMethodName = Common.normalizeName(node.getName(), Common.BlankWord); 38 | ArrayList<String> splitNameParts = Common.splitToSubtokens(comment); 39 | String splitName = normalizedMethodName; 40 | if (splitNameParts.size() > 0) { 41 | splitName = String.join(Common.internalSeparator, splitNameParts); 42 | } 43 | 44 | if (node.getBody() != null) { 45 | long methodLength = getMethodLength(node.getBody().toString()); 46 | if (m_CommandLineValues.MaxCodeLength > 0) { 47 | if (methodLength >= m_CommandLineValues.MinCodeLength && methodLength <= m_CommandLineValues.MaxCodeLength) { 48 | m_Methods.add(new MethodContent(leaves, splitName)); 49 | } 50 | } else { 51 | m_Methods.add(new MethodContent(leaves, splitName)); 52 | } 53 | } 54 | } 55 | 56 | private long getMethodLength(String code) { 57 | String cleanCode = code.replaceAll("\r\n", "\n").replaceAll("\t", " "); 58 | if (cleanCode.startsWith("{\n")) 59 | cleanCode = cleanCode.substring(3).trim(); 60 | if (cleanCode.endsWith("\n}")) 61 | cleanCode = cleanCode.substring(0, cleanCode.length() - 2).trim(); 62 | if (cleanCode.length() == 0) { 63 | return 0; 64 | } 65 | return Arrays.stream(cleanCode.split("\n")) 66 | .filter(line -> (!line.trim().equals("{") && !line.trim().equals("}") && !line.trim().isEmpty())) 67 | .filter(line -> !line.trim().startsWith("/") && !line.trim().startsWith("*")).count(); 68 | } 69 | 70 | public ArrayList<MethodContent> getMethodContents() { 71 | return m_Methods; 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/src/main/java/JavaExtractor/Visitors/LeavesCollectorVisitor.java: -------------------------------------------------------------------------------- 1 
| package JavaExtractor.Visitors; 2 | 3 | import JavaExtractor.Common.Common; 4 | import JavaExtractor.FeaturesEntities.Property; 5 | import com.github.javaparser.ast.Node; 6 | import com.github.javaparser.ast.comments.Comment; 7 | import com.github.javaparser.ast.expr.NullLiteralExpr; 8 | import com.github.javaparser.ast.stmt.Statement; 9 | import com.github.javaparser.ast.type.ClassOrInterfaceType; 10 | import com.github.javaparser.ast.visitor.TreeVisitor; 11 | 12 | import java.util.ArrayList; 13 | import java.util.List; 14 | 15 | public class LeavesCollectorVisitor extends TreeVisitor { 16 | private final ArrayList<Node> m_Leaves = new ArrayList<>(); 17 | 18 | @Override 19 | public void process(Node node) { 20 | 21 | if (node instanceof Comment) { 22 | return; 23 | } 24 | boolean isLeaf = false; 25 | boolean isGenericParent = isGenericParent(node); 26 | if (hasNoChildren(node) && isNotComment(node)) { 27 | if (!node.toString().isEmpty() && (!"null".equals(node.toString()) || (node instanceof NullLiteralExpr))) { 28 | m_Leaves.add(node); 29 | isLeaf = true; 30 | } 31 | } 32 | 33 | int childId = getChildId(node); 34 | node.setUserData(Common.ChildId, childId); 35 | Property property = new Property(node, isLeaf, isGenericParent); 36 | node.setUserData(Common.PropertyKey, property); 37 | } 38 | 39 | private boolean isGenericParent(Node node) { 40 | return (node instanceof ClassOrInterfaceType) 41 | && ((ClassOrInterfaceType) node).getTypeArguments() != null 42 | && ((ClassOrInterfaceType) node).getTypeArguments().size() > 0; 43 | } 44 | 45 | private boolean hasNoChildren(Node node) { 46 | return node.getChildrenNodes().size() == 0; 47 | } 48 | 49 | private boolean isNotComment(Node node) { 50 | return !(node instanceof Comment) && !(node instanceof Statement); 51 | } 52 | 53 | public ArrayList<Node> getLeaves() { 54 | return m_Leaves; 55 | } 56 | 57 | private int getChildId(Node node) { 58 | Node parent = node.getParentNode(); 59 | List<Node> parentsChildren = parent.getChildrenNodes(); 60 | int childId = 0; 61 | for (Node child : parentsChildren) { 62 | if (child.getRange().equals(node.getRange())) { 63 | return childId; 64 | } 65 | childId++; 66 | } 67 | return childId; 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/src/main/java/Test.java: -------------------------------------------------------------------------------- 1 | class Test { 2 | void fooBar() { 3 | System.out.println("http://github.com"); 4 | } 5 | } -------------------------------------------------------------------------------- /JavaExtractor/extract.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import itertools 4 | import multiprocessing 5 | import os 6 | import shutil 7 | import subprocess 8 | import sys 9 | from argparse import ArgumentParser 10 | from threading import Timer 11 | 12 | 13 | def get_immediate_subdirectories(a_dir): 14 | return [(os.path.join(a_dir, name)) for name in os.listdir(a_dir) 15 | if os.path.isdir(os.path.join(a_dir, name))] 16 | 17 | 18 | TMP_DIR = "" 19 | 20 | 21 | def ParallelExtractDir(args, dir): 22 | ExtractFeaturesForDir(args, dir, "") 23 | 24 | 25 | def ExtractFeaturesForDir(args, dir, prefix): 26 | command = ['java', '-Xmx100g', '-XX:MaxNewSize=60g', '-cp', args.jar, 'JavaExtractor.App', 27 | '--max_path_length', str(args.max_path_length), '--max_path_width', str(args.max_path_width), 28 | '--dir', dir, '--num_threads', str(args.num_threads)] 29 | 30 | # print 
command 31 | # os.system(command) 32 | kill = lambda process: process.kill() 33 | outputFileName = TMP_DIR + prefix + dir.split('/')[-1] 34 | failed = False 35 | with open(outputFileName, 'a') as outputFile: 36 | sleeper = subprocess.Popen(command, stdout=outputFile, stderr=subprocess.PIPE) 37 | timer = Timer(60 * 60, kill, [sleeper]) 38 | 39 | try: 40 | timer.start() 41 | stdout, stderr = sleeper.communicate() 42 | finally: 43 | timer.cancel() 44 | 45 | if sleeper.poll() == 0: 46 | if len(stderr) > 0: 47 | print(stderr, file=sys.stderr) 48 | else: 49 | print('dir: ' + str(dir) + ' was not completed in time', file=sys.stderr) 50 | failed = True 51 | subdirs = get_immediate_subdirectories(dir) 52 | for subdir in subdirs: 53 | ExtractFeaturesForDir(args, subdir, prefix + dir.split('/')[-1] + '_') 54 | if failed: 55 | if os.path.exists(outputFileName): 56 | os.remove(outputFileName) 57 | 58 | 59 | def ExtractFeaturesForDirsList(args, dirs): 60 | global TMP_DIR 61 | TMP_DIR = "./tmp/feature_extractor%d/" % (os.getpid()) 62 | if os.path.exists(TMP_DIR): 63 | shutil.rmtree(TMP_DIR, ignore_errors=True) 64 | os.makedirs(TMP_DIR) 65 | try: 66 | p = multiprocessing.Pool(6) 67 | p.starmap(ParallelExtractDir, zip(itertools.repeat(args), dirs)) 68 | # for dir in dirs: 69 | # ExtractFeaturesForDir(args, dir, '') 70 | output_files = os.listdir(TMP_DIR) 71 | for f in output_files: 72 | os.system("cat %s/%s" % (TMP_DIR, f)) 73 | finally: 74 | shutil.rmtree(TMP_DIR, ignore_errors=True) 75 | 76 | 77 | if __name__ == '__main__': 78 | parser = ArgumentParser() 79 | parser.add_argument("-maxlen", "--max_path_length", dest="max_path_length", required=False, default=8) 80 | parser.add_argument("-maxwidth", "--max_path_width", dest="max_path_width", required=False, default=2) 81 | parser.add_argument("-threads", "--num_threads", dest="num_threads", required=False, default=64) 82 | parser.add_argument("-j", "--jar", dest="jar", required=True) 83 | parser.add_argument("-dir", "--dir", dest="dir", required=False) 84 | parser.add_argument("-file", "--file", dest="file", required=False) 85 | args = parser.parse_args() 86 | 87 | if args.file is not None: 88 | command = 'java -cp ' + args.jar + ' JavaExtractor.App --max_path_length ' + \ 89 | str(args.max_path_length) + ' --max_path_width ' + str(args.max_path_width) + ' --file ' + args.file 90 | os.system(command) 91 | elif args.dir is not None: 92 | subdirs = get_immediate_subdirectories(args.dir) 93 | # print("Sub Directories") 94 | # print(subdirs) 95 | if len(subdirs) == 0: 96 | subdirs = [args.dir] 97 | ExtractFeaturesForDirsList(args, subdirs) 98 | 99 | 100 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AutoComments 2 | 3 | ## :pencil2: Description: 4 | ### Motivation 5 | We want to create a deep Neural Network that can automatically generate comments for code snippets passed to it. 6 | The motivation behind this is that in software development and maintenance, developers spend around 59% of their time on program comprehension activities. Having comments that are generated automatically will hopefully cut this time down. 7 | In order to do this we will combine the recent paper, 8 | [Code2Vec: Learning Distributed Representations of Code](https://openreview.net/pdf?id=H1gKYo09tX) by Alon et al. with the paper [Deep Code Comment Generation](https://ink.library.smu.edu.sg/cgi/viewcontent.cgi?article=5295&context=sis_research) by X. 
Hu et al., so as to build a better-performing model using the newer Code2Vec encoding that was not used in the Deep Code Comment Generation paper. 9 | 10 | ### Work done 11 | In this project, two experiments were conducted. In the first, we used the Code2Seq code to create a model that generates comments for Java code snippets (functions) instead of their function names. In the second, we repeated the procedure of the first experiment with modified ASTs. In particular, we added the specific name of each variable to the AST in order to make the generated comments more descriptive. The experiments were evaluated in terms of BLEU-4 score. 12 | 13 | The performance of the first experiment was poor (BLEU-4 score 6.08), while the novelty introduced in the second experiment yielded a notable improvement (BLEU-4 score 10.08). However, the performance was still much worse than that achieved by X. Hu (BLEU-4 score 38.17) in the Deep Code Comment Generation paper, because our model was not able to produce long comments. Nevertheless, it successfully predicted the shorter comments, as well as parts of the longer ones. The reason for this behavior is that Code2Seq was built to produce function names, which are short, rather than long sequences. 14 | 15 | All in all, the main conclusion regarding our best model is that, with the variable names added to the AST, it is capable of capturing the syntactic and semantic meaning of Java code for automatic comment generation; however, it suffers from an inability to generate longer, complete comments. 16 | 17 | ## :page_facing_up: Dataset: 18 | 19 | The dataset that we used is the same dataset used by the Deep Code Comment Generation paper: more than 500,000 code snippets with their comments. 20 | This also gave us a baseline against which to compare. 21 | The dataset can be found [here](https://github.com/xing-hu/DeepCom). 22 | 23 | ## :scroll: System Overview 24 | The pipeline of the system is: 25 | 1. Extract the ASTs from the code snippet-comment pairs. 26 | 2. Use the extracted ASTs to train the model. 27 | 3. Test the trained model on the test data. 28 | 29 | The high-level pipeline is shown in the following image, and a minimal command sketch follows it: 30 |
[image: images/pipeline.png — high-level pipeline of the system]
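A minimal sketch of these three steps, assuming the commands documented in `AST_GEN.md` and the scripts present in this repository (run from the project root; the exact preprocessing script name varies between `preproc/preprocess.sh` and `preprocess_custom.sh` across checkouts, so treat this as an outline rather than the canonical invocation):

```bash
# 1. Split the snippet-comment dataset into one .java file per code snippet
python preproc/java_files_creator.py

# 2. Extract the ASTs (path contexts) for the train, valid, and test splits
bash preproc/preprocess.sh

# 3. Train the code2seq-based model on the extracted contexts
bash code2seq_master/train.sh
```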
33 | 34 | ## :triangular_ruler: Network Architecture 35 | The Encoder-Decoder architecture of this project is shown in the image below and is influenced by the [work](https://openreview.net/pdf?id=H1gKYo09tX) of U. Alon et al. 36 | 37 |
[image: images/network_architecture.png — encoder-decoder network architecture]
40 | 41 | ## :bar_chart: Results 42 | 43 | The BLEU-4 scores achieved on the test dataset are presented below: 44 | 45 | | Approaches | BLEU-4 | 46 | | ------- | -------------- | 47 | | DeepCom | 38.17 | 48 | | Method-1 | 6.08 | 49 | | Method-2 | 10.02 | 50 | 51 | 52 | For more information about the results and a detailed description of the two methods used, please feel free to take a look at our project [report](https://github.com/LRNavin/AutoComments/tree/master/report/ML4SE_group_3_report.pdf) that is included in this repository. 53 | 54 | 55 | ## :office: Project Structure 56 | The structure of the project is: 57 | 58 | * [`JavaExtractor`](https://github.com/LRNavin/AutoComments/tree/master/JavaExtractor) This directory contains the code necessary for extracting the ASTs from the dataset. 59 | * [`code2seq_master`](https://github.com/LRNavin/AutoComments/tree/master/code2seq_master) This directory contains the original Code2Seq code. 60 | * [`data`](https://github.com/LRNavin/AutoComments/tree/master/data) Here you can find a small portion of the data we used. We couldn't upload the whole dataset because of its size. 61 | * [`preproc`](https://github.com/LRNavin/AutoComments/tree/master/preproc) Contains all the necessary Python files and scripts for preprocessing and for running the AST extraction. 62 | * [`report`](https://github.com/LRNavin/AutoComments/tree/master/report) Contains the report for this project and its LaTeX code. 63 | * [`scripts`](https://github.com/LRNavin/AutoComments/tree/master/scripts) Contains all the extra scripts used, like the Perl script for BLEU score computation. 64 | * [`bleu_score.py`](https://github.com/LRNavin/AutoComments/tree/master/bleu_score.py) Computes the BLEU-4 score for a reference file and a prediction file; see the usage sketch below. 
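For reference, the scoring works by piping the predictions file into the bundled Moses `multi-bleu.perl` script with the references file as its argument — exactly what `bleu_score.py` (shown later in this dump) does via `subprocess`. A minimal usage sketch, assuming the `outputs/1st_try/test/` paths hard-coded in that script (one comment per line in each file):

```bash
# Direct invocation of the Perl scorer
perl scripts/multi-bleu.perl outputs/1st_try/test/ref.txt < outputs/1st_try/test/pred.txt

# Or via the Python wrapper, which runs the same command
python bleu_score.py
```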
65 | 66 | ## Papers: 67 | 68 | [Code2Vec: Learning Distributed Representations of Code](https://openreview.net/pdf?id=H1gKYo09tX) 69 | 70 | [Deep Code Comment Generation](https://ink.library.smu.edu.sg/cgi/viewcontent.cgi?article=5295&context=sis_research) 71 | 72 | ## :busts_in_silhouette: Group 3 Team Members 73 | 74 | [Rafail Skoulos](https://github.com/RafailSkoulos17) 75 | 76 | [Navin Raj Prabhu](https://github.com/LRNavin) 77 | 78 | [Thomas Pfann](https://github.com/ThomasPf) 79 | 80 | [Jonathan Katzy](https://github.com/jkatzy) 81 | 82 | 83 | -------------------------------------------------------------------------------- /bleu_score.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | 4 | 5 | def compute_bleu(ref_file_name, predicted_file_name): 6 | with open(predicted_file_name) as predicted_file: 7 | pipe = subprocess.Popen(["perl", "scripts/multi-bleu.perl", ref_file_name], stdin=predicted_file, 8 | stdout=sys.stdout, stderr=sys.stderr) 9 | pipe.communicate() 10 | 11 | 12 | ref_file = 'outputs/1st_try/test/ref.txt' 13 | pred_file = 'outputs/1st_try/test/pred.txt' 14 | 15 | compute_bleu(ref_file, pred_file) 16 | -------------------------------------------------------------------------------- /code2seq_master/.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.lst 3 | .idea/* 4 | *.iml 5 | *.xml 6 | *.pyc 7 | 8 | -------------------------------------------------------------------------------- /code2seq_master/CSharpExtractor/.gitattributes: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Set default behavior to automatically normalize line endings. 3 | ############################################################################### 4 | * text=auto 5 | 6 | ############################################################################### 7 | # Set default behavior for command prompt diff. 8 | # 9 | # This is need for earlier builds of msysgit that does not have it on by 10 | # default for csharp files. 11 | # Note: This is only used by command line 12 | ############################################################################### 13 | #*.cs diff=csharp 14 | 15 | ############################################################################### 16 | # Set the merge driver for project and solution files 17 | # 18 | # Merging from the command prompt will add diff markers to the files if there 19 | # are conflicts (Merging from VS is not affected by the settings below, in VS 20 | # the diff markers are never inserted). Diff markers may cause the following 21 | # file extensions to fail to load in VS. An alternative would be to treat 22 | # these files as binary and thus will always conflict and require user 23 | # intervention with every merge. 
To do so, just uncomment the entries below 24 | ############################################################################### 25 | #*.sln merge=binary 26 | #*.csproj merge=binary 27 | #*.vbproj merge=binary 28 | #*.vcxproj merge=binary 29 | #*.vcproj merge=binary 30 | #*.dbproj merge=binary 31 | #*.fsproj merge=binary 32 | #*.lsproj merge=binary 33 | #*.wixproj merge=binary 34 | #*.modelproj merge=binary 35 | #*.sqlproj merge=binary 36 | #*.wwaproj merge=binary 37 | 38 | ############################################################################### 39 | # behavior for image files 40 | # 41 | # image files are treated as binary by default. 42 | ############################################################################### 43 | #*.jpg binary 44 | #*.png binary 45 | #*.gif binary 46 | 47 | ############################################################################### 48 | # diff behavior for common document formats 49 | # 50 | # Convert binary document formats to text before diffing them. This feature 51 | # is only available from the command line. Turn it on by uncommenting the 52 | # entries below. 53 | ############################################################################### 54 | #*.doc diff=astextplain 55 | #*.DOC diff=astextplain 56 | #*.docx diff=astextplain 57 | #*.DOCX diff=astextplain 58 | #*.dot diff=astextplain 59 | #*.DOT diff=astextplain 60 | #*.pdf diff=astextplain 61 | #*.PDF diff=astextplain 62 | #*.rtf diff=astextplain 63 | #*.RTF diff=astextplain 64 | -------------------------------------------------------------------------------- /code2seq_master/CSharpExtractor/.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 
3 | 4 | # User-specific files 5 | *.suo 6 | *.user 7 | *.userosscache 8 | *.sln.docstates 9 | 10 | # User-specific files (MonoDevelop/Xamarin Studio) 11 | *.userprefs 12 | 13 | # Build results 14 | [Dd]ebug/ 15 | [Dd]ebugPublic/ 16 | [Rr]elease/ 17 | [Rr]eleases/ 18 | x64/ 19 | x86/ 20 | bld/ 21 | [Bb]in/ 22 | [Oo]bj/ 23 | [Ll]og/ 24 | 25 | # Visual Studio 2015 cache/options directory 26 | .vs/ 27 | # Uncomment if you have tasks that create the project's static files in wwwroot 28 | #wwwroot/ 29 | 30 | # MSTest test Results 31 | [Tt]est[Rr]esult*/ 32 | [Bb]uild[Ll]og.* 33 | 34 | # NUNIT 35 | *.VisualState.xml 36 | TestResult.xml 37 | 38 | # Build Results of an ATL Project 39 | [Dd]ebugPS/ 40 | [Rr]eleasePS/ 41 | dlldata.c 42 | 43 | # DNX 44 | project.lock.json 45 | artifacts/ 46 | 47 | *_i.c 48 | *_p.c 49 | *_i.h 50 | *.ilk 51 | *.meta 52 | *.obj 53 | *.pch 54 | *.pdb 55 | *.pgc 56 | *.pgd 57 | *.rsp 58 | *.sbr 59 | *.tlb 60 | *.tli 61 | *.tlh 62 | *.tmp 63 | *.tmp_proj 64 | *.log 65 | *.vspscc 66 | *.vssscc 67 | .builds 68 | *.pidb 69 | *.svclog 70 | *.scc 71 | 72 | # Chutzpah Test files 73 | _Chutzpah* 74 | 75 | # Visual C++ cache files 76 | ipch/ 77 | *.aps 78 | *.ncb 79 | *.opendb 80 | *.opensdf 81 | *.sdf 82 | *.cachefile 83 | *.VC.db 84 | *.VC.VC.opendb 85 | 86 | # Visual Studio profiler 87 | *.psess 88 | *.vsp 89 | *.vspx 90 | *.sap 91 | 92 | # TFS 2012 Local Workspace 93 | $tf/ 94 | 95 | # Guidance Automation Toolkit 96 | *.gpState 97 | 98 | # ReSharper is a .NET coding add-in 99 | _ReSharper*/ 100 | *.[Rr]e[Ss]harper 101 | *.DotSettings.user 102 | 103 | # JustCode is a .NET coding add-in 104 | .JustCode 105 | 106 | # TeamCity is a build add-in 107 | _TeamCity* 108 | 109 | # DotCover is a Code Coverage Tool 110 | *.dotCover 111 | 112 | # NCrunch 113 | _NCrunch_* 114 | .*crunch*.local.xml 115 | nCrunchTemp_* 116 | 117 | # MightyMoose 118 | *.mm.* 119 | AutoTest.Net/ 120 | 121 | # Web workbench (sass) 122 | .sass-cache/ 123 | 124 | # Installshield output folder 125 | [Ee]xpress/ 126 | 127 | # DocProject is a documentation generator add-in 128 | DocProject/buildhelp/ 129 | DocProject/Help/*.HxT 130 | DocProject/Help/*.HxC 131 | DocProject/Help/*.hhc 132 | DocProject/Help/*.hhk 133 | DocProject/Help/*.hhp 134 | DocProject/Help/Html2 135 | DocProject/Help/html 136 | 137 | # Click-Once directory 138 | publish/ 139 | 140 | # Publish Web Output 141 | *.[Pp]ublish.xml 142 | *.azurePubxml 143 | # TODO: Comment the next line if you want to checkin your web deploy settings 144 | # but database connection strings (with potential passwords) will be unencrypted 145 | *.pubxml 146 | *.publishproj 147 | 148 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 149 | # checkin your Azure Web App publish settings, but sensitive information contained 150 | # in these scripts will be unencrypted 151 | PublishScripts/ 152 | 153 | # NuGet Packages 154 | *.nupkg 155 | # The packages folder can be ignored because of Package Restore 156 | **/packages/* 157 | # except build/, which is used as an MSBuild target. 
158 | !**/packages/build/ 159 | # Uncomment if necessary however generally it will be regenerated when needed 160 | #!**/packages/repositories.config 161 | # NuGet v3's project.json files produces more ignoreable files 162 | *.nuget.props 163 | *.nuget.targets 164 | 165 | # Microsoft Azure Build Output 166 | csx/ 167 | *.build.csdef 168 | 169 | # Microsoft Azure Emulator 170 | ecf/ 171 | rcf/ 172 | 173 | # Windows Store app package directories and files 174 | AppPackages/ 175 | BundleArtifacts/ 176 | Package.StoreAssociation.xml 177 | _pkginfo.txt 178 | 179 | # Visual Studio cache files 180 | # files ending in .cache can be ignored 181 | *.[Cc]ache 182 | # but keep track of directories ending in .cache 183 | !*.[Cc]ache/ 184 | 185 | # Others 186 | ClientBin/ 187 | ~$* 188 | *~ 189 | *.dbmdl 190 | *.dbproj.schemaview 191 | *.pfx 192 | *.publishsettings 193 | node_modules/ 194 | orleans.codegen.cs 195 | 196 | # Since there are multiple workflows, uncomment next line to ignore bower_components 197 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 198 | #bower_components/ 199 | 200 | # RIA/Silverlight projects 201 | Generated_Code/ 202 | 203 | # Backup & report files from converting an old project file 204 | # to a newer Visual Studio version. Backup files are not needed, 205 | # because we have git ;-) 206 | _UpgradeReport_Files/ 207 | Backup*/ 208 | UpgradeLog*.XML 209 | UpgradeLog*.htm 210 | 211 | # SQL Server files 212 | *.mdf 213 | *.ldf 214 | 215 | # Business Intelligence projects 216 | *.rdl.data 217 | *.bim.layout 218 | *.bim_*.settings 219 | 220 | # Microsoft Fakes 221 | FakesAssemblies/ 222 | 223 | # GhostDoc plugin setting file 224 | *.GhostDoc.xml 225 | 226 | # Node.js Tools for Visual Studio 227 | .ntvs_analysis.dat 228 | 229 | # Visual Studio 6 build log 230 | *.plg 231 | 232 | # Visual Studio 6 workspace options file 233 | *.opt 234 | 235 | # Visual Studio LightSwitch build output 236 | **/*.HTMLClient/GeneratedArtifacts 237 | **/*.DesktopClient/GeneratedArtifacts 238 | **/*.DesktopClient/ModelManifest.xml 239 | **/*.Server/GeneratedArtifacts 240 | **/*.Server/ModelManifest.xml 241 | _Pvt_Extensions 242 | 243 | # Paket dependency manager 244 | .paket/paket.exe 245 | paket-files/ 246 | 247 | # FAKE - F# Make 248 | .fake/ 249 | 250 | # JetBrains Rider 251 | .idea/ 252 | *.sln.iml 253 | 254 | # no data 255 | data/* 256 | backupdata/* 257 | -------------------------------------------------------------------------------- /code2seq_master/CSharpExtractor/CSharpExtractor/.nuget/packages.config: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /code2seq_master/CSharpExtractor/CSharpExtractor/CSharpExtractor.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 15 4 | VisualStudioVersion = 15.0.28307.136 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Extractor", "Extractor\Extractor.csproj", "{481EDE3F-0ED1-4CB9-814A-63A821022552}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|Any CPU = Debug|Any CPU 11 | Debug|x64 = Debug|x64 12 | Debug|x86 = Debug|x86 13 | Release|Any CPU = Release|Any CPU 14 | Release|x64 = Release|x64 15 | Release|x86 = Release|x86 16 | Release20|Any CPU = 
Release20|Any CPU 17 | Release20|x64 = Release20|x64 18 | Release20|x86 = Release20|x86 19 | EndGlobalSection 20 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 21 | {481EDE3F-0ED1-4CB9-814A-63A821022552}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 22 | {481EDE3F-0ED1-4CB9-814A-63A821022552}.Debug|Any CPU.Build.0 = Debug|Any CPU 23 | {481EDE3F-0ED1-4CB9-814A-63A821022552}.Debug|x64.ActiveCfg = Debug|Any CPU 24 | {481EDE3F-0ED1-4CB9-814A-63A821022552}.Debug|x64.Build.0 = Debug|Any CPU 25 | {481EDE3F-0ED1-4CB9-814A-63A821022552}.Debug|x86.ActiveCfg = Debug|Any CPU 26 | {481EDE3F-0ED1-4CB9-814A-63A821022552}.Debug|x86.Build.0 = Debug|Any CPU 27 | {481EDE3F-0ED1-4CB9-814A-63A821022552}.Release|Any CPU.ActiveCfg = Release|Any CPU 28 | {481EDE3F-0ED1-4CB9-814A-63A821022552}.Release|Any CPU.Build.0 = Release|Any CPU 29 | {481EDE3F-0ED1-4CB9-814A-63A821022552}.Release|x64.ActiveCfg = Release|Any CPU 30 | {481EDE3F-0ED1-4CB9-814A-63A821022552}.Release|x64.Build.0 = Release|Any CPU 31 | {481EDE3F-0ED1-4CB9-814A-63A821022552}.Release|x86.ActiveCfg = Release|Any CPU 32 | {481EDE3F-0ED1-4CB9-814A-63A821022552}.Release|x86.Build.0 = Release|Any CPU 33 | {481EDE3F-0ED1-4CB9-814A-63A821022552}.Release20|Any CPU.ActiveCfg = Release|Any CPU 34 | {481EDE3F-0ED1-4CB9-814A-63A821022552}.Release20|Any CPU.Build.0 = Release|Any CPU 35 | {481EDE3F-0ED1-4CB9-814A-63A821022552}.Release20|x64.ActiveCfg = Release|Any CPU 36 | {481EDE3F-0ED1-4CB9-814A-63A821022552}.Release20|x64.Build.0 = Release|Any CPU 37 | {481EDE3F-0ED1-4CB9-814A-63A821022552}.Release20|x86.ActiveCfg = Release|Any CPU 38 | {481EDE3F-0ED1-4CB9-814A-63A821022552}.Release20|x86.Build.0 = Release|Any CPU 39 | EndGlobalSection 40 | GlobalSection(SolutionProperties) = preSolution 41 | HideSolutionNode = FALSE 42 | EndGlobalSection 43 | GlobalSection(ExtensibilityGlobals) = postSolution 44 | SolutionGuid = {13A0DA89-D5D9-4E75-850E-70B9FBE88FF8} 45 | EndGlobalSection 46 | EndGlobal 47 | -------------------------------------------------------------------------------- /code2seq_master/CSharpExtractor/CSharpExtractor/Extractor/Extractor.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Exe 5 | netcoreapp2.2 6 | Extractor.Program 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /code2seq_master/CSharpExtractor/CSharpExtractor/Extractor/PathFinder.cs: -------------------------------------------------------------------------------- 1 | using Microsoft.CodeAnalysis; 2 | using Microsoft.CodeAnalysis.CSharp.Syntax; 3 | using System; 4 | using System.Collections.Generic; 5 | using System.Linq; 6 | 7 | namespace Extractor 8 | { 9 | 10 | internal class PathFinder 11 | { 12 | internal class Path 13 | { 14 | public SyntaxToken Left { get; } 15 | public List LeftSide { get; } 16 | public SyntaxNode Ancesstor { get; } 17 | public List RightSide { get; } 18 | public SyntaxToken Right { get; } 19 | 20 | public Path(SyntaxToken left, IEnumerable leftSide, SyntaxNode ancesstor, 21 | IEnumerable rightSide, SyntaxToken right) 22 | { 23 | this.Left = left; 24 | this.LeftSide = leftSide.ToList(); 25 | this.Ancesstor = ancesstor; 26 | this.RightSide = rightSide.ToList(); 27 | this.Right = right; 28 | } 29 | } 30 | 31 | public int Length { get; } 32 | public int Width { get; } 33 | 34 | Tree tree; 35 | 36 | public PathFinder(Tree tree, int length = 7, int width = 4) 37 | { 38 | if (length < 1 || width < 
1) 39 | throw new ArgumentException("Width and Length params must be positive."); 40 | 41 | Length = length; 42 | Width = width; 43 | this.tree = tree; 44 | } 45 | 46 | private int GetDepth(SyntaxNode n) 47 | { 48 | int depth = 0; 49 | while(n.Parent != null) 50 | { 51 | n = n.Parent; 52 | depth++; 53 | } 54 | return depth; 55 | } 56 | 57 | public SyntaxNode FirstAncestor(SyntaxNode l, SyntaxNode r) 58 | { 59 | if (l.Equals(r)) 60 | return l; 61 | 62 | if (GetDepth(l) >= GetDepth(r)) 63 | { 64 | l = l.Parent; 65 | } 66 | else 67 | { 68 | r = r.Parent; 69 | } 70 | return FirstAncestor(l, r); 71 | } 72 | 73 | private IEnumerable CollectPathToParent(SyntaxNode start, SyntaxNode parent) 74 | { 75 | while (!start.Equals(parent)) 76 | { 77 | yield return start; 78 | start = start.Parent; 79 | } 80 | } 81 | 82 | internal Path FindPath(SyntaxToken l, SyntaxToken r, bool limited = true) 83 | { 84 | SyntaxNode p = FirstAncestor(l.Parent, r.Parent); 85 | 86 | // + 2 for the distance of the leafs themselves 87 | if (GetDepth(r.Parent) + GetDepth(l.Parent) - 2 * GetDepth(p) + 2 > Length) 88 | { 89 | return null; 90 | } 91 | 92 | var leftSide = CollectPathToParent(l.Parent, p); 93 | var rightSide = CollectPathToParent(r.Parent, p); 94 | rightSide = rightSide.Reverse(); 95 | 96 | List widthCheck = p.ChildNodes().ToList(); 97 | if (limited && leftSide.Count() != 0 98 | && rightSide.Count() != 0) 99 | { 100 | int indexOfLeft = widthCheck.IndexOf(leftSide.Last()); 101 | int indexOfRight = widthCheck.IndexOf(rightSide.First()); 102 | if (Math.Abs(indexOfLeft - indexOfRight) >= Width) 103 | { 104 | return null; 105 | } 106 | } 107 | 108 | return new Path(l, leftSide, p, rightSide, r); 109 | } 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /code2seq_master/CSharpExtractor/CSharpExtractor/Extractor/Program.cs: -------------------------------------------------------------------------------- 1 | using CommandLine; 2 | using CommandLine.Text; 3 | using System; 4 | using System.Collections.Generic; 5 | using System.IO; 6 | using System.Linq; 7 | 8 | namespace Extractor 9 | { 10 | class Program 11 | { 12 | static List ExtractSingleFile(string filename, Options opts) 13 | { 14 | string data = File.ReadAllText(filename); 15 | var extractor = new Extractor(data, opts); 16 | List result = extractor.Extract(); 17 | 18 | return result; 19 | } 20 | 21 | static void Main(string[] args) 22 | { 23 | Options options = new Options(); 24 | Parser.Default.ParseArguments(args) 25 | .WithParsed(opt => options = opt) 26 | .WithNotParsed(errors => 27 | { 28 | Console.WriteLine(errors); 29 | return; 30 | }); 31 | 32 | string path = options.Path; 33 | string[] files; 34 | if (Directory.Exists(path)) 35 | { 36 | files = Directory.GetFiles(path, "*.cs", SearchOption.AllDirectories); 37 | } 38 | else 39 | { 40 | files = new string[] { path }; 41 | } 42 | 43 | IEnumerable results = null; 44 | 45 | results = files.AsParallel().WithDegreeOfParallelism(options.Threads).SelectMany(filename => ExtractSingleFile(filename, options)); 46 | 47 | using (StreamWriter sw = new StreamWriter(options.OFileName, append: true)) 48 | { 49 | foreach (var res in results) 50 | { 51 | sw.WriteLine(res); 52 | } 53 | } 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /code2seq_master/CSharpExtractor/CSharpExtractor/Extractor/Properties/launchSettings.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"profiles": { 3 | "Extractor": { 4 | "commandName": "Project", 5 | "commandLineArgs": "--path C:\\Users\\urial\\Source\\Repos\\CSharpExtractor\\CSharpExtractor\\Extractor\\bin\\ --no_hash" 6 | } 7 | } 8 | } -------------------------------------------------------------------------------- /code2seq_master/CSharpExtractor/CSharpExtractor/Extractor/Temp.cs: -------------------------------------------------------------------------------- 1 | namespace Extractor 2 | { 3 | class Temp 4 | { 5 | class NestedClass 6 | { 7 | void fooBar() 8 | { 9 | a.b = c; 10 | } 11 | } 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /code2seq_master/CSharpExtractor/CSharpExtractor/Extractor/Utilities.cs: -------------------------------------------------------------------------------- 1 | using CommandLine; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Linq; 5 | using System.Text; 6 | using System.Diagnostics; 7 | using System.Text.RegularExpressions; 8 | 9 | namespace Extractor 10 | { 11 | public class Options 12 | { 13 | [Option('t', "threads", Default = 1, HelpText = "How many threads to use <1>")] 14 | public int Threads { get; set; } 15 | 16 | [Option('p', "path", Default = "./data/", HelpText = "Where to find code files. <.>")] 17 | public string Path { get; set; } 18 | 19 | [Option('l', "max_length", Default = 9, HelpText = "Max path length")] 20 | public int MaxLength { get; set; } 21 | 22 | [Option('l', "max_width", Default = 2, HelpText = "Max path length")] 23 | public int MaxWidth { get; set; } 24 | 25 | [Option('o', "ofile_name", Default = "test.txt", HelpText = "Output file name")] 26 | public String OFileName { get; set; } 27 | 28 | [Option('h', "no_hash", Default = false, HelpText = "When enabled, prints the whole path strings (not hashed)")] 29 | public Boolean NoHash { get; set; } 30 | 31 | [Option('l', "max_contexts", Default = 30000, HelpText = "Max number of path contexts to sample. Affects only very large snippets")] 32 | public int MaxContexts { get; set; } 33 | } 34 | 35 | public static class Utilities 36 | { 37 | public static String[] NumbericLiteralsToKeep = new String[] { "0", "1", "2", "3", "4", "5", "10" }; 38 | public static IEnumerable> Choose2(IEnumerable enumerable) 39 | { 40 | int index = 0; 41 | 42 | foreach (var e in enumerable) 43 | { 44 | ++index; 45 | foreach (var t in enumerable.Skip(index)) 46 | yield return Tuple.Create(e, t); 47 | } 48 | } 49 | 50 | /// 51 | /// Sample uniform randomly numSamples from an enumerable, using reservoir sampling. 
50 |         /// <summary>
51 |         /// Sample uniform randomly numSamples from an enumerable, using reservoir sampling.
52 |         /// See https://en.wikipedia.org/wiki/Reservoir_sampling
53 |         /// </summary>
54 |         /// <typeparam name="T"></typeparam>
55 |         /// <param name="input"></param>
56 |         /// <param name="numSamples"></param>
57 |         /// <returns></returns>
58 |         public static IEnumerable<T> ReservoirSample<T>(this IEnumerable<T> input, int numSamples)
59 |         {
60 |             var rng = new Random();
61 |             var sampledElements = new List<T>(numSamples);
62 |             int seenElementCount = 0;
63 |             foreach (var element in input)
64 |             {
65 |                 seenElementCount++;
66 |                 if (sampledElements.Count < numSamples)
67 |                 {
68 |                     sampledElements.Add(element);
69 |                 }
70 |                 else
71 |                 {
72 |                     int position = rng.Next(seenElementCount);
73 |                     if (position < numSamples)
74 |                     {
75 |                         sampledElements[position] = element;
76 |                     }
77 |                 }
78 |             }
79 |             Debug.Assert(sampledElements.Count <= numSamples);
80 |             return sampledElements;
81 |         }
82 |
83 |
84 |         public static IEnumerable<T> WeakConcat<T>(IEnumerable<T> enumerable1, IEnumerable<T> enumerable2)
85 |         {
86 |             foreach (T t in enumerable1)
87 |                 yield return t;
88 |             foreach (T t in enumerable2)
89 |                 yield return t;
90 |         }
91 |
92 |         public static IEnumerable<string> SplitToSubtokens(String name)
93 |         {
94 |             return Regex.Split(name.Trim(), "(?<=[a-z])(?=[A-Z])|_|[0-9]|(?<=[A-Z])(?=[A-Z][a-z])|\\s+")
95 |                 .Where(s => s.Length > 0)
96 |                 .Select(s => NormalizeName(s))
97 |                 .Where(s => s.Length > 0);
98 |         }
99 |
100 |         private static Regex Whitespaces = new Regex(@"\s");
101 |         private static Regex NonAlphabetic = new Regex("[^A-Za-z]");
102 |
103 |         public static String NormalizeName(string s)
104 |         {
105 |             String partiallyNormalized = s.ToLowerInvariant()
106 |                 .Replace("\\\\n", String.Empty)
107 |                 .Replace("[\"',]", String.Empty);
108 |
109 |             partiallyNormalized = Whitespaces.Replace(partiallyNormalized, "");
110 |             partiallyNormalized = Encoding.ASCII.GetString(
111 |                 Encoding.Convert(
112 |                     Encoding.UTF8,
113 |                     Encoding.GetEncoding(
114 |                         Encoding.ASCII.EncodingName,
115 |                         new EncoderReplacementFallback(string.Empty),
116 |                         new DecoderExceptionFallback()
117 |                     ),
118 |                     Encoding.UTF8.GetBytes(partiallyNormalized)
119 |                 )
120 |             );
121 |
122 |             if (partiallyNormalized.Contains('\n'))
123 |             {
124 |                 partiallyNormalized = partiallyNormalized.Replace('\n', 'N');
125 |             }
126 |             if (partiallyNormalized.Contains('\r'))
127 |             {
128 |                 partiallyNormalized = partiallyNormalized.Replace('\r', 'R');
129 |             }
130 |             if (partiallyNormalized.Contains(','))
131 |             {
132 |                 partiallyNormalized = partiallyNormalized.Replace(',', 'C');
133 |             }
134 |
135 |             String completelyNormalized = NonAlphabetic.Replace(partiallyNormalized, String.Empty);
136 |             if (completelyNormalized.Length == 0)
137 |             {
138 |                 if (Regex.IsMatch(partiallyNormalized, @"^\d+$"))
139 |                 {
140 |                     if (NumbericLiteralsToKeep.Contains(partiallyNormalized))
141 |                     {
142 |                         return partiallyNormalized;
143 |                     }
144 |                     else
145 |                     {
146 |                         return "NUM";
147 |                     }
148 |                 }
149 |
150 |                 return String.Empty;
151 |             }
152 |             return completelyNormalized;
153 |
154 |         }
155 |     }
156 | }
157 |
-------------------------------------------------------------------------------- /code2seq_master/CSharpExtractor/CSharpExtractor/Extractor/Variable.cs: --------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Linq;
4 | using Microsoft.CodeAnalysis;
5 | using Microsoft.CodeAnalysis.CSharp.Syntax;
6 |
7 | namespace Extractor
8 | {
9 |     namespace Semantics
10 |     {
11 |         public class Variable
12 |         {
13 |             Tree tree;
14 |
15 |             public string Name { get; }
16 |             private HashSet<SyntaxToken> leaves;
17 |             public HashSet<SyntaxToken> Leaves
18 |             {
19 |                 get
20 |                 {
21 |                     return leaves;
22 |                 }
23 |             }
24 |
25 |             private Nullable<bool> constant;
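            // NOTE: A variable is treated as constant only if every one of its leaf tokens
            // is marked IsConst in the tree (see the constructor below); Const simply reads
            // that cached flag.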
26 |             public bool Const
27 |             {
28 |                 get
29 |                 {
30 |                     return constant.Value;
31 |                 }
32 |             }
33 |
34 |
35 |             private Variable(string name, SyntaxToken[] leaves, Tree tree)
36 |             {
37 |                 this.tree = tree;
38 |                 this.Name = name;
39 |                 this.leaves = new HashSet<SyntaxToken>(leaves);
40 |
41 |
42 |                 constant = true;
43 |                 foreach (var leaf in leaves)
44 |                 {
45 |                     if (!tree.leaves[leaf].IsConst)
46 |                     {
47 |                         constant = false;
48 |                         // If not constant then it is a declaration token
49 |                         break;
50 |                     }
51 |                 }
52 |             }
53 |
54 |             public override int GetHashCode()
55 |             {
56 |                 return this.Name.GetHashCode();
57 |             }
58 |
59 |             public bool IsLiteral()
60 |             {
61 |                 return Tree.literals.Contains(tree.leaves[Leaves.First()].Kind);
62 |             }
63 |
64 |             internal static Boolean isMethodName(SyntaxToken token)
65 |             {
66 |                 return token.Parent.IsKind(Microsoft.CodeAnalysis.CSharp.SyntaxKind.MethodDeclaration)
67 |                     && token.IsKind(Microsoft.CodeAnalysis.CSharp.SyntaxKind.IdentifierToken);
68 |             }
69 |
70 |             // Create a variable for each variable in scope from tokens while splitting identically named but differently scoped vars.
71 |             internal static IEnumerable<Variable> CreateFromMethod(Tree methodTree)
72 |             {
73 |                 var root = methodTree.nodes[methodTree.GetRoot()];
74 |                 var leaves = root.Leaves.ToArray();
75 |                 Dictionary<SyntaxToken, string> tokenToName = new Dictionary<SyntaxToken, string>();
76 |                 Dictionary<string, List<SyntaxToken>> nameToTokens = new Dictionary<string, List<SyntaxToken>>();
77 |                 foreach (SyntaxToken token in root.Leaves)
78 |                 {
79 |                     string name = methodTree.leaves[token].VariableName;
80 |                     if (isMethodName(token))
81 |                     {
82 |                         name = Extractor.MethodNameConst;
83 |                     }
84 |                     tokenToName[token] = name;
85 |                     if (!nameToTokens.ContainsKey(name))
86 |                         nameToTokens[name] = new List<SyntaxToken>();
87 |                     nameToTokens[name].Add(token);
88 |                 }
89 |
90 |                 List<Variable> results = new List<Variable>();
91 |
92 |                 foreach (SyntaxToken leaf in leaves)
93 |                 {
94 |                     string name = tokenToName[leaf];
95 |                     SyntaxToken[] syntaxTokens = nameToTokens[name].ToArray();
96 |                     var v = new Variable(name, syntaxTokens, methodTree);
97 |
98 |                     //check if exists
99 |                     var matches = results.Where(p => p.Name == name).ToList();
100 |                     bool alreadyExists = (matches.Count != 0);
101 |                     if (!alreadyExists)
102 |                     {
103 |                         results.Add(v);
104 |                     }
105 |                 }
106 |
107 |                 return results;
108 |             }
109 |         }
110 |     }
111 | }
112 |
-------------------------------------------------------------------------------- /code2seq_master/CSharpExtractor/extract.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | import itertools
4 | import multiprocessing
5 | import os
6 | import sys
7 | import shutil
8 | import subprocess
9 | from threading import Timer
10 | import sys
11 | from argparse import ArgumentParser
12 | from subprocess import Popen, PIPE, STDOUT, call
13 |
14 |
15 |
16 | def get_immediate_subdirectories(a_dir):
17 |     return [(os.path.join(a_dir, name)) for name in os.listdir(a_dir)
18 |             if os.path.isdir(os.path.join(a_dir, name))]
19 |
20 |
21 | TMP_DIR = ""
22 |
23 | def ParallelExtractDir(args, dir):
24 |     ExtractFeaturesForDir(args, dir, "")
25 |
26 |
27 | def ExtractFeaturesForDir(args, dir, prefix):
28 |     command = ['dotnet', 'run', '--project', args.csproj,
29 |                '--max_length', str(args.max_path_length), '--max_width', str(args.max_path_width),
30 |                '--path', dir, '--threads', str(args.num_threads), '--ofile_name', str(args.ofile_name)]
31 |
32 |
33 |     # print command
34 |     # os.system(command)
35 |     kill = lambda process: process.kill()
36 |     sleeper = subprocess.Popen(command, stderr=subprocess.PIPE)
37 |     timer = Timer(600000, kill, [sleeper])
38 |     failed = False
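    # NOTE: `failed` is initialized above because the success branch below would otherwise
    # read it before assignment; the JavaExtractor variant of this script initializes it
    # the same way. Also note the timeout: this script allows 600000 seconds before killing
    # the extractor, while the Java variant uses 60 * 60 (one hour) — likely a leftover value.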
39 |     try:
40 |         timer.start()
41 |         _, stderr = sleeper.communicate()
42 |     finally:
43 |         timer.cancel()
44 |
45 |     if sleeper.poll() == 0:
46 |         if len(stderr) > 0:
47 |             print(stderr, file=sys.stderr)
48 |     else:
49 |         print('dir: ' + str(dir) + ' was not completed in time', file=sys.stderr)
50 |         failed = True
51 |         subdirs = get_immediate_subdirectories(dir)
52 |         for subdir in subdirs:
53 |             ExtractFeaturesForDir(args, subdir, prefix + dir.split('/')[-1] + '_')
54 |     if failed:
55 |         if os.path.exists(str(args.ofile_name)):
56 |             os.remove(str(args.ofile_name))
57 |
58 | def ExtractFeaturesForDirsList(args, dirs):
59 |     global TMP_DIR
60 |     TMP_DIR = "./tmp/feature_extractor%d/" % (os.getpid())
61 |     if os.path.exists(TMP_DIR):
62 |         shutil.rmtree(TMP_DIR, ignore_errors=True)
63 |     os.makedirs(TMP_DIR)
64 |     try:
65 |         p = multiprocessing.Pool(4)
66 |         p.starmap(ParallelExtractDir, zip(itertools.repeat(args), dirs))
67 |         #for dir in dirs:
68 |         #    ExtractFeaturesForDir(args, dir, '')
69 |         output_files = os.listdir(TMP_DIR)
70 |         for f in output_files:
71 |             os.system("cat %s/%s" % (TMP_DIR, f))
72 |     finally:
73 |         shutil.rmtree(TMP_DIR, ignore_errors=True)
74 |
75 |
76 | if __name__ == '__main__':
77 |
78 |     parser = ArgumentParser()
79 |     parser.add_argument("-maxlen", "--max_path_length", dest="max_path_length", required=False, default=8)
80 |     parser.add_argument("-maxwidth", "--max_path_width", dest="max_path_width", required=False, default=2)
81 |     parser.add_argument("-threads", "--num_threads", dest="num_threads", required=False, default=64)
82 |     parser.add_argument("--csproj", dest="csproj", required=True)
83 |     parser.add_argument("-dir", "--dir", dest="dir", required=False)
84 |     parser.add_argument("-ofile_name", "--ofile_name", dest="ofile_name", required=True)
85 |     args = parser.parse_args()
86 |
87 |     if args.dir is not None:
88 |         subdirs = get_immediate_subdirectories(args.dir)
89 |         to_extract = subdirs
90 |         if len(subdirs) == 0:
91 |             to_extract = [args.dir.rstrip('/')]
92 |         ExtractFeaturesForDirsList(args, to_extract)
93 |
-------------------------------------------------------------------------------- /code2seq_master/Input.java: --------------------------------------------------------------------------------
1 | boolean f(Set<String> set, String value) {
2 |     for (String entry : set) {
3 |         if (entry.equalsIgnoreCase(value)) {
4 |             return true;
5 |         }
6 |     }
7 |     return false;
8 | }
-------------------------------------------------------------------------------- /code2seq_master/JavaExtractor/JPredict/.classpath: --------------------------------------------------------------------------------
(28-line Eclipse .classpath file; its XML entries were stripped during extraction and are not recoverable)
-------------------------------------------------------------------------------- /code2seq_master/JavaExtractor/JPredict/.gitignore: --------------------------------------------------------------------------------
1 | /target/
-------------------------------------------------------------------------------- /code2seq_master/JavaExtractor/JPredict/src/main/java/JavaExtractor/App.java: --------------------------------------------------------------------------------
1 | package JavaExtractor;
2 |
3 | import JavaExtractor.Common.CommandLineValues;
4 | import org.kohsuke.args4j.CmdLineException;
5 |
6 | import java.io.IOException;
7 | import java.nio.file.Files;
8 | import java.nio.file.Paths;
9 | import java.util.LinkedList;
10 | import java.util.List;
11 | import java.util.concurrent.ExecutionException;
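// NOTE: App is the entry point that the JavaExtractor extract.py (further below) launches,
// roughly as (jar name illustrative):
//   java -cp JavaExtractor.jar JavaExtractor.App --max_path_length 8 --max_path_width 2 --dir ./java_files --num_threads 64
// A --file argument can be passed instead of --dir to process a single file.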
12 | import java.util.concurrent.Executors;
13 | import java.util.concurrent.Future;
14 | import java.util.concurrent.ThreadPoolExecutor;
15 |
16 | public class App {
17 |     private static CommandLineValues s_CommandLineValues;
18 |
19 |     public static void main(String[] args) {
20 |         try {
21 |             s_CommandLineValues = new CommandLineValues(args);
22 |         } catch (CmdLineException e) {
23 |             e.printStackTrace();
24 |             return;
25 |         }
26 |
27 |         if (s_CommandLineValues.File != null) {
28 |             ExtractFeaturesTask extractFeaturesTask = new ExtractFeaturesTask(s_CommandLineValues,
29 |                     s_CommandLineValues.File.toPath());
30 |             extractFeaturesTask.processFile();
31 |         } else if (s_CommandLineValues.Dir != null) {
32 |             extractDir();
33 |         }
34 |     }
35 |
36 |     private static void extractDir() {
37 |         ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(s_CommandLineValues.NumThreads);
38 |         LinkedList<ExtractFeaturesTask> tasks = new LinkedList<>();
39 |         try {
40 |             Files.walk(Paths.get(s_CommandLineValues.Dir)).filter(Files::isRegularFile)
41 |                     .filter(p -> p.toString().toLowerCase().endsWith(".java")).forEach(f -> {
42 |                 ExtractFeaturesTask task = new ExtractFeaturesTask(s_CommandLineValues, f);
43 |                 tasks.add(task);
44 |             });
45 |         } catch (IOException e) {
46 |             e.printStackTrace();
47 |             return;
48 |         }
49 |         List<Future<Void>> tasksResults = null;
50 |         try {
51 |             tasksResults = executor.invokeAll(tasks);
52 |         } catch (InterruptedException e) {
53 |             e.printStackTrace();
54 |         } finally {
55 |             executor.shutdown();
56 |         }
57 |         tasksResults.forEach(f -> {
58 |             try {
59 |                 f.get();
60 |             } catch (InterruptedException | ExecutionException e) {
61 |                 e.printStackTrace();
62 |             }
63 |         });
64 |     }
65 | }
66 |
-------------------------------------------------------------------------------- /code2seq_master/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/CommandLineValues.java: --------------------------------------------------------------------------------
1 | package JavaExtractor.Common;
2 |
3 | import org.kohsuke.args4j.CmdLineException;
4 | import org.kohsuke.args4j.CmdLineParser;
5 | import org.kohsuke.args4j.Option;
6 |
7 | import java.io.File;
8 |
9 | /**
10 |  * This class handles the program's arguments.
11 |  */
12 | public class CommandLineValues {
13 |     @Option(name = "--file", required = false)
14 |     public File File = null;
15 |
16 |     @Option(name = "--dir", required = false, forbids = "--file")
17 |     public String Dir = null;
18 |
19 |     @Option(name = "--max_path_length", required = true)
20 |     public int MaxPathLength;
21 |
22 |     @Option(name = "--max_path_width", required = true)
23 |     public int MaxPathWidth;
24 |
25 |     @Option(name = "--num_threads", required = false)
26 |     public int NumThreads = 64;
27 |
28 |     @Option(name = "--min_code_len", required = false)
29 |     public int MinCodeLength = 1;
30 |
31 |     @Option(name = "--max_code_len", required = false)
32 |     public int MaxCodeLength = -1;
33 |
34 |     @Option(name = "--max_file_len", required = false)
35 |     public int MaxFileLength = -1;
36 |
37 |     @Option(name = "--pretty_print", required = false)
38 |     public boolean PrettyPrint = false;
39 |
40 |     @Option(name = "--max_child_id", required = false)
41 |     public int MaxChildId = 3;
42 |
43 |     public CommandLineValues(String... args) throws CmdLineException {
44 |         CmdLineParser parser = new CmdLineParser(this);
45 |         try {
46 |             parser.parseArgument(args);
47 |         } catch (CmdLineException e) {
48 |             System.err.println(e.getMessage());
49 |             parser.printUsage(System.err);
50 |             throw e;
51 |         }
52 |     }
53 |
54 |     public CommandLineValues() {
55 |
56 |     }
57 | }
-------------------------------------------------------------------------------- /code2seq_master/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java: --------------------------------------------------------------------------------
1 | package JavaExtractor.Common;
2 |
3 | import JavaExtractor.FeaturesEntities.Property;
4 | import com.github.javaparser.ast.Node;
5 | import com.github.javaparser.ast.UserDataKey;
6 |
7 | import java.util.ArrayList;
8 | import java.util.stream.Collectors;
9 | import java.util.stream.Stream;
10 |
11 | public final class Common {
12 |     public static final UserDataKey<Property> PropertyKey = new UserDataKey<Property>() {
13 |     };
14 |     public static final UserDataKey<Integer> ChildId = new UserDataKey<Integer>() {
15 |     };
16 |     public static final String EmptyString = "";
17 |
18 |     public static final String MethodDeclaration = "MethodDeclaration";
19 |     public static final String NameExpr = "NameExpr";
20 |     public static final String BlankWord = "BLANK";
21 |
22 |     public static final int c_MaxLabelLength = 50;
23 |     public static final String methodName = "METHOD_NAME";
24 |     public static final String internalSeparator = "|";
25 |
26 |     public static String normalizeName(String original, String defaultString) {
27 |         original = original.toLowerCase().replaceAll("\\\\n", "") // escaped new
28 |                 // lines
29 |                 .replaceAll("\\s+", "") // whitespaces
30 |                 .replaceAll("[\"',]", "") // quotes, apostrophes, commas
31 |                 .replaceAll("\\P{Print}", ""); // unicode weird characters
32 |         String stripped = original.replaceAll("[^A-Za-z]", "");
33 |         if (stripped.length() == 0) {
34 |             String carefulStripped = original.replaceAll(" ", "_");
35 |             if (carefulStripped.length() == 0) {
36 |                 return defaultString;
37 |             } else {
38 |                 return carefulStripped;
39 |             }
40 |         } else {
41 |             return stripped;
42 |         }
43 |     }
44 |
45 |     public static boolean isMethod(Node node, String type) {
46 |         Property parentProperty = node.getParentNode().getUserData(Common.PropertyKey);
47 |         if (parentProperty == null) {
48 |             return false;
49 |         }
50 |
51 |         String parentType = parentProperty.getType();
52 |         return Common.NameExpr.equals(type) && Common.MethodDeclaration.equals(parentType);
53 |     }
54 |
55 |     public static ArrayList<String> splitToSubtokens(String str1) {
56 |         String str2 = str1.replace("|", " ");
57 |         String str3 = str2.trim();
58 |         return Stream.of(str3.split("(?<=[a-z])(?=[A-Z])|_|[0-9]|(?<=[A-Z])(?=[A-Z][a-z])|\\s+"))
59 |                 .filter(s -> s.length() > 0).map(s -> Common.normalizeName(s, Common.EmptyString))
60 |                 .filter(s -> s.length() > 0).collect(Collectors.toCollection(ArrayList::new));
61 |     }
62 | }
-------------------------------------------------------------------------------- /code2seq_master/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/MethodContent.java: --------------------------------------------------------------------------------
1 | package JavaExtractor.Common;
2 |
3 | import com.github.javaparser.ast.Node;
4 |
5 | import java.util.ArrayList;
6 |
7 | public class MethodContent {
8 |     private final ArrayList<Node> leaves;
9 |     private final String name;
10 |
11 |     public MethodContent(ArrayList<Node> leaves, String name) {
12 |         this.leaves = leaves;
13 |         this.name = name;
14 |     }
15 |
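    // NOTE: A MethodContent pairs a method's AST leaf nodes with its subtokenized name,
    // which becomes the prediction label: FunctionVisitor joins the subtokens with '|'
    // (Common.internalSeparator), so a method named setMaxLength is stored as "set|max|length".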
16 |     public ArrayList<Node> getLeaves() {
17 |         return leaves;
18 |     }
19 |
20 |     public String getName() {
21 |         return name;
22 |     }
23 | }
24 |
-------------------------------------------------------------------------------- /code2seq_master/JavaExtractor/JPredict/src/main/java/JavaExtractor/ExtractFeaturesTask.java: --------------------------------------------------------------------------------
1 | package JavaExtractor;
2 |
3 | import JavaExtractor.Common.CommandLineValues;
4 | import JavaExtractor.Common.Common;
5 | import JavaExtractor.FeaturesEntities.ProgramFeatures;
6 | import org.apache.commons.lang3.StringUtils;
7 |
8 | import java.io.IOException;
9 | import java.nio.charset.Charset;
10 | import java.nio.file.Files;
11 | import java.nio.file.Path;
12 | import java.util.ArrayList;
13 | import java.util.List;
14 | import java.util.concurrent.Callable;
15 |
16 | class ExtractFeaturesTask implements Callable<Void> {
17 |     private final CommandLineValues m_CommandLineValues;
18 |     private final Path filePath;
19 |
20 |     public ExtractFeaturesTask(CommandLineValues commandLineValues, Path path) {
21 |         m_CommandLineValues = commandLineValues;
22 |         this.filePath = path;
23 |     }
24 |
25 |     @Override
26 |     public Void call() {
27 |         processFile();
28 |         return null;
29 |     }
30 |
31 |     public void processFile() {
32 |         ArrayList<ProgramFeatures> features;
33 |         try {
34 |             features = extractSingleFile();
35 |         } catch (IOException e) {
36 |             e.printStackTrace();
37 |             return;
38 |         }
39 |         if (features == null) {
40 |             return;
41 |         }
42 |
43 |         String toPrint = featuresToString(features);
44 |         if (toPrint.length() > 0) {
45 |             System.out.println(toPrint);
46 |         }
47 |     }
48 |
49 |     private ArrayList<ProgramFeatures> extractSingleFile() throws IOException {
50 |         String code;
51 |
52 |         if (m_CommandLineValues.MaxFileLength > 0 &&
53 |                 Files.lines(filePath, Charset.defaultCharset()).count() > m_CommandLineValues.MaxFileLength) {
54 |             return new ArrayList<>();
55 |         }
56 |         try {
57 |             code = new String(Files.readAllBytes(filePath));
58 |         } catch (IOException e) {
59 |             e.printStackTrace();
60 |             code = Common.EmptyString;
61 |         }
62 |         FeatureExtractor featureExtractor = new FeatureExtractor(m_CommandLineValues);
63 |
64 |         return featureExtractor.extractFeatures(code);
65 |     }
66 |
67 |     public String featuresToString(ArrayList<ProgramFeatures> features) {
68 |         if (features == null || features.isEmpty()) {
69 |             return Common.EmptyString;
70 |         }
71 |
72 |         List<String> methodsOutputs = new ArrayList<>();
73 |
74 |         for (ProgramFeatures singleMethodFeatures : features) {
75 |             StringBuilder builder = new StringBuilder();
76 |
77 |             String toPrint = singleMethodFeatures.toString();
78 |             if (m_CommandLineValues.PrettyPrint) {
79 |                 toPrint = toPrint.replace(" ", "\n\t");
80 |             }
81 |             builder.append(toPrint);
82 |
83 |
84 |             methodsOutputs.add(builder.toString());
85 |
86 |         }
87 |         return StringUtils.join(methodsOutputs, "\n");
88 |     }
89 | }
90 |
-------------------------------------------------------------------------------- /code2seq_master/JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/ProgramFeatures.java: --------------------------------------------------------------------------------
1 | package JavaExtractor.FeaturesEntities;
2 |
3 | import com.fasterxml.jackson.annotation.JsonIgnore;
4 |
5 | import java.util.ArrayList;
6 | import java.util.stream.Collectors;
7 |
8 | public class ProgramFeatures {
9 |     private final String name;
10 |
11 |     private final ArrayList<ProgramRelation> features = new ArrayList<>();
12 |
13 |     public ProgramFeatures(String name) {
14 |         this.name = name;
15 |     }
16 |
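    // NOTE: toString below renders each method as one whitespace-separated line: the label
    // followed by its path contexts, each formatted by ProgramRelation.toString as
    // "source,path,target" (illustrative shape: set|name leaf1,path,leaf2 leaf1,path,leaf3 ...).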
17 |     @SuppressWarnings("StringBufferReplaceableByString")
18 |     @Override
19 |     public String toString() {
20 |         StringBuilder stringBuilder = new StringBuilder();
21 |         stringBuilder.append(name).append(" ");
22 |         stringBuilder.append(features.stream().map(ProgramRelation::toString).collect(Collectors.joining(" ")));
23 |
24 |         return stringBuilder.toString();
25 |     }
26 |
27 |     public void addFeature(Property source, String path, Property target) {
28 |         ProgramRelation newRelation = new ProgramRelation(source, target, path);
29 |         features.add(newRelation);
30 |     }
31 |
32 |     @JsonIgnore
33 |     public boolean isEmpty() {
34 |         return features.isEmpty();
35 |     }
36 | }
37 |
-------------------------------------------------------------------------------- /code2seq_master/JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/ProgramRelation.java: --------------------------------------------------------------------------------
1 | package JavaExtractor.FeaturesEntities;
2 |
3 | public class ProgramRelation {
4 |     private final Property m_Source;
5 |     private final Property m_Target;
6 |     private final String m_Path;
7 |
8 |     public ProgramRelation(Property sourceName, Property targetName, String path) {
9 |         m_Source = sourceName;
10 |         m_Target = targetName;
11 |         m_Path = path;
12 |     }
13 |
14 |     public String toString() {
15 |         return String.format("%s,%s,%s", m_Source.getName(), m_Path,
16 |                 m_Target.getName());
17 |     }
18 | }
-------------------------------------------------------------------------------- /code2seq_master/JavaExtractor/JPredict/src/main/java/JavaExtractor/Visitors/FunctionVisitor.java: --------------------------------------------------------------------------------
1 | package JavaExtractor.Visitors;
2 |
3 | import JavaExtractor.Common.CommandLineValues;
4 | import JavaExtractor.Common.Common;
5 | import JavaExtractor.Common.MethodContent;
6 | import com.github.javaparser.ast.Node;
7 | import com.github.javaparser.ast.body.MethodDeclaration;
8 | import com.github.javaparser.ast.visitor.VoidVisitorAdapter;
9 |
10 | import java.util.ArrayList;
11 | import java.util.Arrays;
12 |
13 | @SuppressWarnings("StringEquality")
14 | public class FunctionVisitor extends VoidVisitorAdapter<Object> {
15 |     private final ArrayList<MethodContent> m_Methods = new ArrayList<>();
16 |     private final CommandLineValues m_CommandLineValues;
17 |
18 |     public FunctionVisitor(CommandLineValues commandLineValues) {
19 |         this.m_CommandLineValues = commandLineValues;
20 |     }
21 |
22 |     @Override
23 |     public void visit(MethodDeclaration node, Object arg) {
24 |         visitMethod(node);
25 |
26 |         super.visit(node, arg);
27 |     }
28 |
29 |     private void visitMethod(MethodDeclaration node) {
30 |         LeavesCollectorVisitor leavesCollectorVisitor = new LeavesCollectorVisitor();
31 |         leavesCollectorVisitor.visitDepthFirst(node);
32 |         ArrayList<Node> leaves = leavesCollectorVisitor.getLeaves();
33 |
34 |         String normalizedMethodName = Common.normalizeName(node.getName(), Common.BlankWord);
35 |         ArrayList<String> splitNameParts = Common.splitToSubtokens(node.getName());
36 |         String splitName = normalizedMethodName;
37 |         if (splitNameParts.size() > 0) {
38 |             splitName = String.join(Common.internalSeparator, splitNameParts);
39 |         }
40 |
41 |         if (node.getBody() != null) {
42 |             long methodLength = getMethodLength(node.getBody().toString());
43 |             if (m_CommandLineValues.MaxCodeLength > 0) {
44 |                 if (methodLength >= m_CommandLineValues.MinCodeLength && methodLength <= m_CommandLineValues.MaxCodeLength) {
45 |                     m_Methods.add(new MethodContent(leaves, splitName));
46 |                 }
47 |             } else {
48 |                 m_Methods.add(new MethodContent(leaves, splitName));
49 |             }
50 |         }
51 |     }
52 |
53 |     private long getMethodLength(String code) {
54 |         String cleanCode = code.replaceAll("\r\n", "\n").replaceAll("\t", " ");
55 |         if (cleanCode.startsWith("{\n"))
56 |             cleanCode = cleanCode.substring(3).trim();
57 |         if (cleanCode.endsWith("\n}"))
58 |             cleanCode = cleanCode.substring(0, cleanCode.length() - 2).trim();
59 |         if (cleanCode.length() == 0) {
60 |             return 0;
61 |         }
62 |         return Arrays.stream(cleanCode.split("\n"))
63 |                 .filter(line -> (line.trim() != "{" && line.trim() != "}" && line.trim() != ""))
64 |                 .filter(line -> !line.trim().startsWith("/") && !line.trim().startsWith("*")).count();
65 |     }
66 |
67 |     public ArrayList<MethodContent> getMethodContents() {
68 |         return m_Methods;
69 |     }
70 | }
71 |
-------------------------------------------------------------------------------- /code2seq_master/JavaExtractor/JPredict/src/main/java/JavaExtractor/Visitors/LeavesCollectorVisitor.java: --------------------------------------------------------------------------------
1 | package JavaExtractor.Visitors;
2 |
3 | import JavaExtractor.Common.Common;
4 | import JavaExtractor.FeaturesEntities.Property;
5 | import com.github.javaparser.ast.Node;
6 | import com.github.javaparser.ast.comments.Comment;
7 | import com.github.javaparser.ast.expr.NullLiteralExpr;
8 | import com.github.javaparser.ast.stmt.Statement;
9 | import com.github.javaparser.ast.type.ClassOrInterfaceType;
10 | import com.github.javaparser.ast.visitor.TreeVisitor;
11 |
12 | import java.util.ArrayList;
13 | import java.util.List;
14 |
15 | public class LeavesCollectorVisitor extends TreeVisitor {
16 |     private final ArrayList<Node> m_Leaves = new ArrayList<>();
17 |
18 |     @Override
19 |     public void process(Node node) {
20 |         if (node instanceof Comment) {
21 |             return;
22 |         }
23 |         boolean isLeaf = false;
24 |         boolean isGenericParent = isGenericParent(node);
25 |         if (hasNoChildren(node) && isNotComment(node)) {
26 |             if (!node.toString().isEmpty() && (!"null".equals(node.toString()) || (node instanceof NullLiteralExpr))) {
27 |                 m_Leaves.add(node);
28 |                 isLeaf = true;
29 |             }
30 |         }
31 |
32 |         int childId = getChildId(node);
33 |         node.setUserData(Common.ChildId, childId);
34 |         Property property = new Property(node, isLeaf, isGenericParent);
35 |         node.setUserData(Common.PropertyKey, property);
36 |     }
37 |
38 |     private boolean isGenericParent(Node node) {
39 |         return (node instanceof ClassOrInterfaceType)
40 |                 && ((ClassOrInterfaceType) node).getTypeArguments() != null
41 |                 && ((ClassOrInterfaceType) node).getTypeArguments().size() > 0;
42 |     }
43 |
44 |     private boolean hasNoChildren(Node node) {
45 |         return node.getChildrenNodes().size() == 0;
46 |     }
47 |
48 |     private boolean isNotComment(Node node) {
49 |         return !(node instanceof Comment) && !(node instanceof Statement);
50 |     }
51 |
52 |     public ArrayList<Node> getLeaves() {
53 |         return m_Leaves;
54 |     }
55 |
56 |     private int getChildId(Node node) {
57 |         Node parent = node.getParentNode();
58 |         List<Node> parentsChildren = parent.getChildrenNodes();
59 |         int childId = 0;
60 |         for (Node child : parentsChildren) {
61 |             if (child.getRange().equals(node.getRange())) {
62 |                 return childId;
63 |             }
64 |             childId++;
65 |         }
66 |         return childId;
67 |     }
68 | }
69 |
-------------------------------------------------------------------------------- /code2seq_master/JavaExtractor/JPredict/src/main/java/Test.java: --------------------------------------------------------------------------------
1 | class Test {
2 |     void fooBar() {
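        // NOTE: Minimal smoke-test input for the extractor; the URL literal exercises
        // Common.normalizeName, which lowercases and strips non-alphabetic characters
        // (so this literal would normalize to "httpgithubcom").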
System.out.println("http://github.com"); 4 | } 5 | } -------------------------------------------------------------------------------- /code2seq_master/JavaExtractor/extract.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import itertools 4 | import multiprocessing 5 | import os 6 | import shutil 7 | import subprocess 8 | import sys 9 | from argparse import ArgumentParser 10 | from threading import Timer 11 | 12 | 13 | def get_immediate_subdirectories(a_dir): 14 | return [(os.path.join(a_dir, name)) for name in os.listdir(a_dir) 15 | if os.path.isdir(os.path.join(a_dir, name))] 16 | 17 | 18 | TMP_DIR = "" 19 | 20 | 21 | def ParallelExtractDir(args, dir): 22 | ExtractFeaturesForDir(args, dir, "") 23 | 24 | 25 | def ExtractFeaturesForDir(args, dir, prefix): 26 | command = ['java', '-Xmx100g', '-XX:MaxNewSize=60g', '-cp', args.jar, 'JavaExtractor.App', 27 | '--max_path_length', str(args.max_path_length), '--max_path_width', str(args.max_path_width), 28 | '--dir', dir, '--num_threads', str(args.num_threads)] 29 | 30 | # print command 31 | # os.system(command) 32 | kill = lambda process: process.kill() 33 | outputFileName = TMP_DIR + prefix + dir.split('/')[-1] 34 | failed = False 35 | with open(outputFileName, 'a') as outputFile: 36 | sleeper = subprocess.Popen(command, stdout=outputFile, stderr=subprocess.PIPE) 37 | timer = Timer(60 * 60, kill, [sleeper]) 38 | 39 | try: 40 | timer.start() 41 | stdout, stderr = sleeper.communicate() 42 | finally: 43 | timer.cancel() 44 | 45 | if sleeper.poll() == 0: 46 | if len(stderr) > 0: 47 | print(stderr, file=sys.stderr) 48 | else: 49 | print('dir: ' + str(dir) + ' was not completed in time', file=sys.stderr) 50 | failed = True 51 | subdirs = get_immediate_subdirectories(dir) 52 | for subdir in subdirs: 53 | ExtractFeaturesForDir(args, subdir, prefix + dir.split('/')[-1] + '_') 54 | if failed: 55 | if os.path.exists(outputFileName): 56 | os.remove(outputFileName) 57 | 58 | 59 | def ExtractFeaturesForDirsList(args, dirs): 60 | global TMP_DIR 61 | TMP_DIR = "./tmp/feature_extractor%d/" % (os.getpid()) 62 | if os.path.exists(TMP_DIR): 63 | shutil.rmtree(TMP_DIR, ignore_errors=True) 64 | os.makedirs(TMP_DIR) 65 | try: 66 | p = multiprocessing.Pool(6) 67 | p.starmap(ParallelExtractDir, zip(itertools.repeat(args), dirs)) 68 | # for dir in dirs: 69 | # ExtractFeaturesForDir(args, dir, '') 70 | output_files = os.listdir(TMP_DIR) 71 | for f in output_files: 72 | os.system("cat %s/%s" % (TMP_DIR, f)) 73 | finally: 74 | shutil.rmtree(TMP_DIR, ignore_errors=True) 75 | 76 | 77 | if __name__ == '__main__': 78 | parser = ArgumentParser() 79 | parser.add_argument("-maxlen", "--max_path_length", dest="max_path_length", required=False, default=8) 80 | parser.add_argument("-maxwidth", "--max_path_width", dest="max_path_width", required=False, default=2) 81 | parser.add_argument("-threads", "--num_threads", dest="num_threads", required=False, default=64) 82 | parser.add_argument("-j", "--jar", dest="jar", required=True) 83 | parser.add_argument("-dir", "--dir", dest="dir", required=False) 84 | parser.add_argument("-file", "--file", dest="file", required=False) 85 | args = parser.parse_args() 86 | 87 | if args.file is not None: 88 | command = 'java -cp ' + args.jar + ' JavaExtractor.App --max_path_length ' + \ 89 | str(args.max_path_length) + ' --max_path_width ' + str(args.max_path_width) + ' --file ' + args.file 90 | os.system(command) 91 | elif args.dir is not None: 92 | subdirs = 
get_immediate_subdirectories(args.dir) 93 | # print("Sub Directories") 94 | # print(subdirs) 95 | if len(subdirs) == 0: 96 | subdirs = [args.dir] 97 | ExtractFeaturesForDirsList(args, subdirs) 98 | -------------------------------------------------------------------------------- /code2seq_master/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Technion 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /code2seq_master/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/code2seq_master/__init__.py -------------------------------------------------------------------------------- /code2seq_master/baseline_tokenization/input_example.txt: -------------------------------------------------------------------------------- 1 | requires landscape|boolean (){ return false; } 2 | get parent key|Object (){ return new ContactsUiKey(); } 3 | get parent key|Object (){ return new ContactsUiKey(); } 4 | get layout id|int (){ return R.layout.loose_screen; } 5 | get parent key|Object (){ return new EditContactKey(contactId); } 6 | to contact|Contact (){ return new Contact(id, name, email); } 7 | to string|String (){ return "Welcome!\nClick to continue."; } 8 | get parent key|Object (){ return new EditContactKey(contactId); } 9 | tear down services|void (@NonNull Services services){ } 10 | get layout id|int (){ return R.layout.landscape_screen; } 11 | -------------------------------------------------------------------------------- /code2seq_master/baseline_tokenization/javalang/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from . import parser 3 | from . import parse 4 | from . import tokenizer 5 | from . 
import javadoc 6 | 7 | 8 | __version__ = "0.10.1" 9 | -------------------------------------------------------------------------------- /code2seq_master/baseline_tokenization/javalang/ast.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | import six 4 | 5 | 6 | class MetaNode(type): 7 | def __new__(mcs, name, bases, dict): 8 | attrs = list(dict['attrs']) 9 | dict['attrs'] = list() 10 | 11 | for base in bases: 12 | if hasattr(base, 'attrs'): 13 | dict['attrs'].extend(base.attrs) 14 | 15 | dict['attrs'].extend(attrs) 16 | 17 | return type.__new__(mcs, name, bases, dict) 18 | 19 | 20 | @six.add_metaclass(MetaNode) 21 | class Node(object): 22 | attrs = () 23 | 24 | def __init__(self, **kwargs): 25 | values = kwargs.copy() 26 | 27 | for attr_name in self.attrs: 28 | value = values.pop(attr_name, None) 29 | setattr(self, attr_name, value) 30 | 31 | if values: 32 | raise ValueError('Extraneous arguments') 33 | 34 | def __equals__(self, other): 35 | if type(other) is not type(self): 36 | return False 37 | 38 | for attr in self.attrs: 39 | if getattr(other, attr) != getattr(self, attr): 40 | return False 41 | 42 | return True 43 | 44 | def __repr__(self): 45 | return type(self).__name__ 46 | 47 | def __iter__(self): 48 | return walk_tree(self) 49 | 50 | def filter(self, pattern): 51 | for path, node in self: 52 | if ((isinstance(pattern, type) and isinstance(node, pattern)) or 53 | (node == pattern)): 54 | yield path, node 55 | 56 | @property 57 | def children(self): 58 | return [getattr(self, attr_name) for attr_name in self.attrs] 59 | 60 | def walk_tree(root): 61 | children = None 62 | 63 | if isinstance(root, Node): 64 | yield (), root 65 | children = root.children 66 | else: 67 | children = root 68 | 69 | for child in children: 70 | if isinstance(child, (Node, list, tuple)): 71 | for path, node in walk_tree(child): 72 | yield (root,) + path, node 73 | 74 | def dump(ast, file): 75 | pickle.dump(ast, file) 76 | 77 | def load(file): 78 | return pickle.load(file) 79 | -------------------------------------------------------------------------------- /code2seq_master/baseline_tokenization/javalang/javadoc.py: -------------------------------------------------------------------------------- 1 | 2 | import re 3 | 4 | def join(s): 5 | return ' '.join(l.strip() for l in s.split('\n')) 6 | 7 | class DocBlock(object): 8 | def __init__(self): 9 | self.description = '' 10 | self.return_doc = None 11 | self.params = [] 12 | 13 | self.authors = [] 14 | self.deprecated = False 15 | 16 | # @exception and @throw are equivalent 17 | self.throws = {} 18 | self.exceptions = self.throws 19 | 20 | self.tags = {} 21 | 22 | def add_block(self, name, value): 23 | value = value.strip() 24 | 25 | if name == 'param': 26 | try: 27 | param, description = value.split(None, 1) 28 | except ValueError: 29 | param, description = value, '' 30 | self.params.append((param, join(description))) 31 | 32 | elif name in ('throws', 'exception'): 33 | try: 34 | ex, description = value.split(None, 1) 35 | except ValueError: 36 | ex, description = value, '' 37 | self.throws[ex] = join(description) 38 | 39 | elif name == 'return': 40 | self.return_doc = value 41 | 42 | elif name == 'author': 43 | self.authors.append(value) 44 | 45 | elif name == 'deprecated': 46 | self.deprecated = True 47 | 48 | self.tags.setdefault(name, []).append(value) 49 | 50 | blocks_re = re.compile('(^@)', re.MULTILINE) 51 | leading_space_re = re.compile(r'^\s*\*', re.MULTILINE) 52 | blocks_justify_re = 
re.compile(r'^\s*@', re.MULTILINE) 53 | 54 | def _sanitize(s): 55 | s = s.strip() 56 | 57 | if not (s[:3] == '/**' and s[-2:] == '*/'): 58 | raise ValueError('not a valid Javadoc comment') 59 | 60 | s = s.replace('\t', ' ') 61 | 62 | return s 63 | 64 | def _uncomment(s): 65 | # Remove /** and */ 66 | s = s[3:-2].strip() 67 | 68 | return leading_space_re.sub('', s) 69 | 70 | def _get_indent_level(s): 71 | return len(s) - len(s.lstrip()) 72 | 73 | def _left_justify(s): 74 | lines = s.rstrip().splitlines() 75 | 76 | if not lines: 77 | return '' 78 | 79 | indent_levels = [] 80 | for line in lines: 81 | if line.strip(): 82 | indent_levels.append(_get_indent_level(line)) 83 | indent_levels.sort() 84 | 85 | common_indent = indent_levels[0] 86 | if common_indent == 0: 87 | return s 88 | else: 89 | lines = [line[common_indent:] for line in lines] 90 | return '\n'.join(lines) 91 | 92 | def _force_blocks_left(s): 93 | return blocks_justify_re.sub('@', s) 94 | 95 | def parse(raw): 96 | sanitized = _sanitize(raw) 97 | uncommented = _uncomment(sanitized) 98 | justified = _left_justify(uncommented) 99 | justified_fixed = _force_blocks_left(justified) 100 | prepared = justified_fixed 101 | 102 | blocks = blocks_re.split(prepared) 103 | 104 | doc = DocBlock() 105 | 106 | if blocks[0] != '@': 107 | doc.description = blocks[0].strip() 108 | blocks = blocks[2::2] 109 | else: 110 | blocks = blocks[1::2] 111 | 112 | for block in blocks: 113 | try: 114 | tag, value = block.split(None, 1) 115 | except ValueError: 116 | tag, value = block, '' 117 | 118 | doc.add_block(tag, value) 119 | 120 | return doc 121 | -------------------------------------------------------------------------------- /code2seq_master/baseline_tokenization/javalang/parse.py: -------------------------------------------------------------------------------- 1 | 2 | from .parser import Parser 3 | from .tokenizer import tokenize 4 | 5 | def parse_expression(exp): 6 | if not exp.endswith(';'): 7 | exp = exp + ';' 8 | 9 | tokens = tokenize(exp) 10 | parser = Parser(tokens) 11 | 12 | return parser.parse_expression() 13 | 14 | def parse_member_signature(sig): 15 | if not sig.endswith(';'): 16 | sig = sig + ';' 17 | 18 | tokens = tokenize(sig) 19 | parser = Parser(tokens) 20 | 21 | return parser.parse_member_declaration() 22 | 23 | def parse_constructor_signature(sig): 24 | # Add an empty body to the signature, replacing a ; if necessary 25 | if sig.endswith(';'): 26 | sig = sig[:-1] 27 | sig = sig + '{ }' 28 | 29 | tokens = tokenize(sig) 30 | parser = Parser(tokens) 31 | 32 | return parser.parse_member_declaration() 33 | 34 | def parse_type(s): 35 | tokens = tokenize(s) 36 | parser = Parser(tokens) 37 | 38 | return parser.parse_type() 39 | 40 | def parse_type_signature(sig): 41 | if sig.endswith(';'): 42 | sig = sig[:-1] 43 | sig = sig + '{ }' 44 | 45 | tokens = tokenize(sig) 46 | parser = Parser(tokens) 47 | 48 | return parser.parse_class_or_interface_declaration() 49 | 50 | def parse(s): 51 | tokens = tokenize(s) 52 | parser = Parser(tokens) 53 | return parser.parse() 54 | -------------------------------------------------------------------------------- /code2seq_master/baseline_tokenization/javalang/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/code2seq_master/baseline_tokenization/javalang/test/__init__.py -------------------------------------------------------------------------------- 
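# NOTE: A minimal sketch of how the bundled javalang package is driven (snippet and
# inputs illustrative): parse.parse builds a syntax tree from Java source, and
# javadoc.parse turns a doc comment into a DocBlock with params/return_doc fields.
#
#   import javalang
#   tree = javalang.parse.parse("class A { int f(int x) { return x; } }")
#   doc = javalang.javadoc.parse("/** Adds.\n@param x value\n@return sum */")
#   print(doc.params, doc.return_doc)   # [('x', 'value')] 'sum'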
/code2seq_master/baseline_tokenization/javalang/test/source/package-info/AnnotationJavadoc.java: -------------------------------------------------------------------------------- 1 | @Package 2 | /** 3 | Test that includes java doc first but no annotation 4 | */ 5 | package org.javalang.test; -------------------------------------------------------------------------------- /code2seq_master/baseline_tokenization/javalang/test/source/package-info/AnnotationOnly.java: -------------------------------------------------------------------------------- 1 | @Package 2 | package org.javalang.test; -------------------------------------------------------------------------------- /code2seq_master/baseline_tokenization/javalang/test/source/package-info/JavadocAnnotation.java: -------------------------------------------------------------------------------- 1 | /** 2 | Test that includes java doc first but no annotation 3 | */ 4 | @Package 5 | package org.javalang.test; -------------------------------------------------------------------------------- /code2seq_master/baseline_tokenization/javalang/test/source/package-info/JavadocOnly.java: -------------------------------------------------------------------------------- 1 | /** 2 | Test that includes java doc first but no annotation 3 | */ 4 | package org.javalang.test; -------------------------------------------------------------------------------- /code2seq_master/baseline_tokenization/javalang/test/source/package-info/NoAnnotationNoJavadoc.java: -------------------------------------------------------------------------------- 1 | package org.javalang.test; -------------------------------------------------------------------------------- /code2seq_master/baseline_tokenization/javalang/test/test_javadoc.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from .. import javadoc 4 | 5 | 6 | class TestJavadoc(unittest.TestCase): 7 | def test_empty_comment(self): 8 | javadoc.parse('/** */') 9 | javadoc.parse('/***/') 10 | javadoc.parse('/**\n *\n */') 11 | javadoc.parse('/**\n *\n *\n */') 12 | 13 | if __name__ == "__main__": 14 | unittest.main() 15 | -------------------------------------------------------------------------------- /code2seq_master/baseline_tokenization/javalang/test/test_package_declaration.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from pkg_resources import resource_string 4 | from .. 
import parse 5 | 6 | 7 | # From my reading of the spec (http://docs.oracle.com/javase/specs/jls/se7/html/jls-7.html) the 8 | # allowed order is javadoc, optional annotation, package declaration 9 | class PackageInfo(unittest.TestCase): 10 | def testPackageDeclarationOnly(self): 11 | source_file = "source/package-info/NoAnnotationNoJavadoc.java" 12 | ast = self.get_ast(source_file) 13 | 14 | self.failUnless(ast.package.name == "org.javalang.test") 15 | self.failIf(ast.package.annotations) 16 | self.failIf(ast.package.documentation) 17 | 18 | def testAnnotationOnly(self): 19 | source_file = "source/package-info/AnnotationOnly.java" 20 | ast = self.get_ast(source_file) 21 | 22 | self.failUnless(ast.package.name == "org.javalang.test") 23 | self.failUnless(ast.package.annotations) 24 | self.failIf(ast.package.documentation) 25 | 26 | def testJavadocOnly(self): 27 | source_file = "source/package-info/JavadocOnly.java" 28 | ast = self.get_ast(source_file) 29 | 30 | self.failUnless(ast.package.name == "org.javalang.test") 31 | self.failIf(ast.package.annotations) 32 | self.failUnless(ast.package.documentation) 33 | 34 | def testAnnotationThenJavadoc(self): 35 | source_file = "source/package-info/AnnotationJavadoc.java" 36 | ast = self.get_ast(source_file) 37 | 38 | self.failUnless(ast.package.name == "org.javalang.test") 39 | self.failUnless(ast.package.annotations) 40 | self.failIf(ast.package.documentation) 41 | 42 | def testJavadocThenAnnotation(self): 43 | source_file = "source/package-info/JavadocAnnotation.java" 44 | ast = self.get_ast(source_file) 45 | 46 | self.failUnless(ast.package.name == "org.javalang.test") 47 | self.failUnless(ast.package.annotations) 48 | self.failUnless(ast.package.documentation) 49 | 50 | def get_ast(self, filename): 51 | source = resource_string(__name__, filename) 52 | ast = parse.parse(source) 53 | 54 | return ast 55 | 56 | 57 | def main(): 58 | unittest.main() 59 | 60 | if __name__ == '__main__': 61 | main() 62 | -------------------------------------------------------------------------------- /code2seq_master/baseline_tokenization/javalang/test/test_util.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from ..util import LookAheadIterator 4 | 5 | 6 | class TestLookAheadIterator(unittest.TestCase): 7 | def test_usage(self): 8 | i = LookAheadIterator(list(range(0, 10000))) 9 | 10 | self.assertEqual(next(i), 0) 11 | self.assertEqual(next(i), 1) 12 | self.assertEqual(next(i), 2) 13 | 14 | self.assertEqual(i.last(), 2) 15 | 16 | self.assertEqual(i.look(), 3) 17 | self.assertEqual(i.last(), 3) 18 | 19 | self.assertEqual(i.look(1), 4) 20 | self.assertEqual(i.look(2), 5) 21 | self.assertEqual(i.look(3), 6) 22 | self.assertEqual(i.look(4), 7) 23 | 24 | self.assertEqual(i.last(), 7) 25 | 26 | i.push_marker() 27 | self.assertEqual(next(i), 3) 28 | self.assertEqual(next(i), 4) 29 | self.assertEqual(next(i), 5) 30 | i.pop_marker(True) # reset 31 | 32 | self.assertEqual(i.look(), 3) 33 | self.assertEqual(next(i), 3) 34 | 35 | i.push_marker() #1 36 | self.assertEqual(next(i), 4) 37 | self.assertEqual(next(i), 5) 38 | i.push_marker() #2 39 | self.assertEqual(next(i), 6) 40 | self.assertEqual(next(i), 7) 41 | i.push_marker() #3 42 | self.assertEqual(next(i), 8) 43 | self.assertEqual(next(i), 9) 44 | i.pop_marker(False) #3 45 | self.assertEqual(next(i), 10) 46 | i.pop_marker(True) #2 47 | self.assertEqual(next(i), 6) 48 | self.assertEqual(next(i), 7) 49 | self.assertEqual(next(i), 8) 50 | i.pop_marker(False) #1 51 
| self.assertEqual(next(i), 9) 52 | 53 | try: 54 | with i: 55 | self.assertEqual(next(i), 10) 56 | self.assertEqual(next(i), 11) 57 | raise Exception() 58 | except: 59 | self.assertEqual(next(i), 10) 60 | self.assertEqual(next(i), 11) 61 | 62 | with i: 63 | self.assertEqual(next(i), 12) 64 | self.assertEqual(next(i), 13) 65 | self.assertEqual(next(i), 14) 66 | 67 | 68 | if __name__=="__main__": 69 | unittest.main() 70 | -------------------------------------------------------------------------------- /code2seq_master/baseline_tokenization/javalang/tree.py: -------------------------------------------------------------------------------- 1 | 2 | from .ast import Node 3 | 4 | # ------------------------------------------------------------------------------ 5 | 6 | class CompilationUnit(Node): 7 | attrs = ("package", "imports", "types") 8 | 9 | class Import(Node): 10 | attrs = ("path", "static", "wildcard") 11 | 12 | class Documented(Node): 13 | attrs = ("documentation",) 14 | 15 | class Declaration(Node): 16 | attrs = ("modifiers", "annotations") 17 | 18 | class TypeDeclaration(Declaration, Documented): 19 | attrs = ("name", "body") 20 | 21 | @property 22 | def fields(self): 23 | return [decl for decl in self.body if isinstance(decl, FieldDeclaration)] 24 | 25 | @property 26 | def methods(self): 27 | return [decl for decl in self.body if isinstance(decl, MethodDeclaration)] 28 | 29 | @property 30 | def constructors(self): 31 | return [decl for decl in self.body if isinstance(decl, ConstructorDeclaration)] 32 | 33 | class PackageDeclaration(Declaration, Documented): 34 | attrs = ("name",) 35 | 36 | class ClassDeclaration(TypeDeclaration): 37 | attrs = ("type_parameters", "extends", "implements") 38 | 39 | class EnumDeclaration(TypeDeclaration): 40 | attrs = ("implements",) 41 | 42 | class InterfaceDeclaration(TypeDeclaration): 43 | attrs = ("type_parameters", "extends",) 44 | 45 | class AnnotationDeclaration(TypeDeclaration): 46 | attrs = () 47 | 48 | # ------------------------------------------------------------------------------ 49 | 50 | class Type(Node): 51 | attrs = ("name", "dimensions",) 52 | 53 | class BasicType(Type): 54 | attrs = () 55 | 56 | class ReferenceType(Type): 57 | attrs = ("arguments", "sub_type") 58 | 59 | class TypeArgument(Node): 60 | attrs = ("type", "pattern_type") 61 | 62 | # ------------------------------------------------------------------------------ 63 | 64 | class TypeParameter(Node): 65 | attrs = ("name", "extends") 66 | 67 | # ------------------------------------------------------------------------------ 68 | 69 | class Annotation(Node): 70 | attrs = ("name", "element") 71 | 72 | class ElementValuePair(Node): 73 | attrs = ("name", "value") 74 | 75 | class ElementArrayValue(Node): 76 | attrs = ("values",) 77 | 78 | # ------------------------------------------------------------------------------ 79 | 80 | class Member(Documented): 81 | attrs = () 82 | 83 | class MethodDeclaration(Member, Declaration): 84 | attrs = ("type_parameters", "return_type", "name", "parameters", "throws", "body") 85 | 86 | class FieldDeclaration(Member, Declaration): 87 | attrs = ("type", "declarators") 88 | 89 | class ConstructorDeclaration(Declaration, Documented): 90 | attrs = ("type_parameters", "name", "parameters", "throws", "body") 91 | 92 | # ------------------------------------------------------------------------------ 93 | 94 | class ConstantDeclaration(FieldDeclaration): 95 | attrs = () 96 | 97 | class ArrayInitializer(Node): 98 | attrs = ("initializers",) 99 | 100 | class 
VariableDeclaration(Declaration): 101 | attrs = ("type", "declarators") 102 | 103 | class LocalVariableDeclaration(VariableDeclaration): 104 | attrs = () 105 | 106 | class VariableDeclarator(Node): 107 | attrs = ("name", "dimensions", "initializer") 108 | 109 | class FormalParameter(Declaration): 110 | attrs = ("type", "name", "varargs") 111 | 112 | class InferredFormalParameter(Node): 113 | attrs = ('name',) 114 | 115 | # ------------------------------------------------------------------------------ 116 | 117 | class Statement(Node): 118 | attrs = ("label",) 119 | 120 | class IfStatement(Statement): 121 | attrs = ("condition", "then_statement", "else_statement") 122 | 123 | class WhileStatement(Statement): 124 | attrs = ("condition", "body") 125 | 126 | class DoStatement(Statement): 127 | attrs = ("condition", "body") 128 | 129 | class ForStatement(Statement): 130 | attrs = ("control", "body") 131 | 132 | class AssertStatement(Statement): 133 | attrs = ("condition", "value") 134 | 135 | class BreakStatement(Statement): 136 | attrs = ("goto",) 137 | 138 | class ContinueStatement(Statement): 139 | attrs = ("goto",) 140 | 141 | class ReturnStatement(Statement): 142 | attrs = ("expression",) 143 | 144 | class ThrowStatement(Statement): 145 | attrs = ("expression",) 146 | 147 | class SynchronizedStatement(Statement): 148 | attrs = ("lock", "block") 149 | 150 | class TryStatement(Statement): 151 | attrs = ("resources", "block", "catches", "finally_block") 152 | 153 | class SwitchStatement(Statement): 154 | attrs = ("expression", "cases") 155 | 156 | class BlockStatement(Statement): 157 | attrs = ("statements",) 158 | 159 | class StatementExpression(Statement): 160 | attrs = ("expression",) 161 | 162 | # ------------------------------------------------------------------------------ 163 | 164 | class TryResource(Declaration): 165 | attrs = ("type", "name", "value") 166 | 167 | class CatchClause(Statement): 168 | attrs = ("parameter", "block") 169 | 170 | class CatchClauseParameter(Declaration): 171 | attrs = ("types", "name") 172 | 173 | # ------------------------------------------------------------------------------ 174 | 175 | class SwitchStatementCase(Node): 176 | attrs = ("case", "statements") 177 | 178 | class ForControl(Node): 179 | attrs = ("init", "condition", "update") 180 | 181 | class EnhancedForControl(Node): 182 | attrs = ("var", "iterable") 183 | 184 | # ------------------------------------------------------------------------------ 185 | 186 | class Expression(Node): 187 | attrs = () 188 | 189 | class Assignment(Expression): 190 | attrs = ("expressionl", "value", "type") 191 | 192 | class TernaryExpression(Expression): 193 | attrs = ("condition", "if_true", "if_false") 194 | 195 | class BinaryOperation(Expression): 196 | attrs = ("operator", "operandl", "operandr") 197 | 198 | class Cast(Expression): 199 | attrs = ("type", "expression") 200 | 201 | class MethodReference(Expression): 202 | attrs = ("expression", "method", "type_arguments") 203 | 204 | class LambdaExpression(Expression): 205 | attrs = ('parameters', 'body') 206 | 207 | # ------------------------------------------------------------------------------ 208 | 209 | class Primary(Expression): 210 | attrs = ("prefix_operators", "postfix_operators", "qualifier", "selectors") 211 | 212 | class Literal(Primary): 213 | attrs = ("value",) 214 | 215 | class This(Primary): 216 | attrs = () 217 | 218 | class MemberReference(Primary): 219 | attrs = ("member",) 220 | 221 | class Invocation(Primary): 222 | attrs = ("type_arguments", 
"arguments") 223 | 224 | class ExplicitConstructorInvocation(Invocation): 225 | attrs = () 226 | 227 | class SuperConstructorInvocation(Invocation): 228 | attrs = () 229 | 230 | class MethodInvocation(Invocation): 231 | attrs = ("member",) 232 | 233 | class SuperMethodInvocation(Invocation): 234 | attrs = ("member",) 235 | 236 | class SuperMemberReference(Primary): 237 | attrs = ("member",) 238 | 239 | class ArraySelector(Expression): 240 | attrs = ("index",) 241 | 242 | class ClassReference(Primary): 243 | attrs = ("type",) 244 | 245 | class VoidClassReference(ClassReference): 246 | attrs = () 247 | 248 | # ------------------------------------------------------------------------------ 249 | 250 | class Creator(Primary): 251 | attrs = ("type",) 252 | 253 | class ArrayCreator(Creator): 254 | attrs = ("dimensions", "initializer") 255 | 256 | class ClassCreator(Creator): 257 | attrs = ("constructor_type_arguments", "arguments", "body") 258 | 259 | class InnerClassCreator(Creator): 260 | attrs = ("constructor_type_arguments", "arguments", "body") 261 | 262 | # ------------------------------------------------------------------------------ 263 | 264 | class EnumBody(Node): 265 | attrs = ("constants", "declarations") 266 | 267 | class EnumConstantDeclaration(Declaration, Documented): 268 | attrs = ("name", "arguments", "body") 269 | 270 | class AnnotationMethod(Declaration): 271 | attrs = ("name", "return_type", "dimensions", "default") 272 | 273 | -------------------------------------------------------------------------------- /code2seq_master/baseline_tokenization/javalang/util.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class LookAheadIterator(object): 4 | def __init__(self, iterable): 5 | self.iterable = iter(iterable) 6 | self.look_ahead = list() 7 | self.markers = list() 8 | self.default = None 9 | self.value = None 10 | 11 | def __iter__(self): 12 | return self 13 | 14 | def set_default(self, value): 15 | self.default = value 16 | 17 | def next(self): 18 | return self.__next__() 19 | 20 | def __next__(self): 21 | if self.look_ahead: 22 | self.value = self.look_ahead.pop(0) 23 | else: 24 | self.value = next(self.iterable) 25 | 26 | if self.markers: 27 | self.markers[-1].append(self.value) 28 | 29 | return self.value 30 | 31 | def look(self, i=0): 32 | """ Look ahead of the iterable by some number of values with advancing 33 | past them. 34 | 35 | If the requested look ahead is past the end of the iterable then None is 36 | returned. 37 | 38 | """ 39 | 40 | length = len(self.look_ahead) 41 | 42 | if length <= i: 43 | try: 44 | self.look_ahead.extend([next(self.iterable) 45 | for _ in range(length, i + 1)]) 46 | except StopIteration: 47 | return self.default 48 | 49 | self.value = self.look_ahead[i] 50 | return self.value 51 | 52 | def last(self): 53 | return self.value 54 | 55 | def __enter__(self): 56 | self.push_marker() 57 | return self 58 | 59 | def __exit__(self, exc_type, exc_val, exc_tb): 60 | # Reset the iterator if there was an error 61 | if exc_type or exc_val or exc_tb: 62 | self.pop_marker(True) 63 | else: 64 | self.pop_marker(False) 65 | 66 | def push_marker(self): 67 | """ Push a marker on to the marker stack """ 68 | self.markers.append(list()) 69 | 70 | def pop_marker(self, reset): 71 | """ Pop a marker off of the marker stack. If reset is True then the 72 | iterator will be returned to the state it was in before the 73 | corresponding call to push_marker(). 
74 | 75 | """ 76 | 77 | marker = self.markers.pop() 78 | 79 | if reset: 80 | # Make the values available to be read again 81 | marker.extend(self.look_ahead) 82 | self.look_ahead = marker 83 | elif self.markers: 84 | # Otherwise, reassign the values to the top marker 85 | self.markers[-1].extend(marker) 86 | else: 87 | # If there are not more markers in the stack then discard the values 88 | pass 89 | 90 | class LookAheadListIterator(object): 91 | def __init__(self, iterable): 92 | self.list = list(iterable) 93 | 94 | self.marker = 0 95 | self.saved_markers = [] 96 | 97 | self.default = None 98 | self.value = None 99 | 100 | def __iter__(self): 101 | return self 102 | 103 | def set_default(self, value): 104 | self.default = value 105 | 106 | def next(self): 107 | return self.__next__() 108 | 109 | def __next__(self): 110 | try: 111 | self.value = self.list[self.marker] 112 | self.marker += 1 113 | except IndexError: 114 | raise StopIteration() 115 | 116 | return self.value 117 | 118 | def look(self, i=0): 119 | """ Look ahead of the iterable by some number of values with advancing 120 | past them. 121 | 122 | If the requested look ahead is past the end of the iterable then None is 123 | returned. 124 | 125 | """ 126 | 127 | try: 128 | self.value = self.list[self.marker + i] 129 | except IndexError: 130 | return self.default 131 | 132 | return self.value 133 | 134 | def last(self): 135 | return self.value 136 | 137 | def __enter__(self): 138 | self.push_marker() 139 | return self 140 | 141 | def __exit__(self, exc_type, exc_val, exc_tb): 142 | # Reset the iterator if there was an error 143 | if exc_type or exc_val or exc_tb: 144 | self.pop_marker(True) 145 | else: 146 | self.pop_marker(False) 147 | 148 | def push_marker(self): 149 | """ Push a marker on to the marker stack """ 150 | self.saved_markers.append(self.marker) 151 | 152 | def pop_marker(self, reset): 153 | """ Pop a marker off of the marker stack. If reset is True then the 154 | iterator will be returned to the state it was in before the 155 | corresponding call to push_marker(). 156 | 157 | """ 158 | 159 | saved = self.saved_markers.pop() 160 | 161 | if reset: 162 | self.marker = saved 163 | elif self.saved_markers: 164 | self.saved_markers[-1] = saved 165 | 166 | -------------------------------------------------------------------------------- /code2seq_master/baseline_tokenization/subtokenize_nmt_baseline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import javalang 4 | import sys 5 | import re 6 | 7 | 8 | modifiers = ['public', 'private', 'protected', 'static'] 9 | 10 | RE_WORDS = re.compile(r''' 11 | # Find words in a string. Order matters! 
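# e.g. splits 'getURLPath2' into ['get', 'URL', 'Path', '2'] (illustrative example)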
12 | [A-Z]+(?=[A-Z][a-z]) | # All upper case before a capitalized word 13 | [A-Z]?[a-z]+ | # Capitalized words / all lower case 14 | [A-Z]+ | # All upper case 15 | \d+ | # Numbers 16 | .+ 17 | ''', re.VERBOSE) 18 | 19 | def split_subtokens(s): 20 | return [subtok for subtok in RE_WORDS.findall(s) if subtok != '_'] 21 | 22 | def tokenizeFile(file_path): 23 | lines = 0 24 | with open(file_path, 'r', encoding="utf-8") as file: 25 | with open(file_path + 'method_names.txt', 'w') as method_names_file: 26 | with open(file_path + 'method_subtokens_content.txt', 'w') as method_contents_file: 27 | for line in file: 28 | lines += 1 29 | line = line.rstrip() 30 | parts = line.split('|', 1) 31 | method_name = parts[0] 32 | method_content = parts[1] 33 | try: 34 | tokens = list(javalang.tokenizer.tokenize(method_content)) 35 | except Exception: 36 | print('ERROR in tokenizing: ' + method_content) 37 | tokens = []  # keep 'tokens' defined so the length check below reports the failure 38 | if len(method_name) > 0 and len(tokens) > 0: 39 | method_names_file.write(method_name + '\n') 40 | method_contents_file.write(' '.join([' '.join(split_subtokens(i.value)) for i in tokens if i.value not in modifiers]) + '\n') 41 | else: 42 | print('ERROR in len of: ' + method_name + ', tokens: ' + str(tokens)) 43 | print(str(lines)) 44 | 45 | 46 | if __name__ == '__main__': 47 | file = sys.argv[1] 48 | tokenizeFile(file) 49 | 50 | 51 | -------------------------------------------------------------------------------- /code2seq_master/code2seq.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | 3 | from config import Config 4 | from interactive_predict import InteractivePredictor 5 | from model import Model 6 | 7 | if __name__ == '__main__': 8 | parser = ArgumentParser() 9 | parser.add_argument("-d", "--data", dest="data_path", 10 | help="path to preprocessed dataset", required=False) 11 | parser.add_argument("-te", "--test", dest="test_path", 12 | help="path to test file", metavar="FILE", required=False) 13 | 14 | parser.add_argument("-s", "--save_prefix", dest="save_path_prefix", 15 | help="path to save file", metavar="FILE", required=False) 16 | parser.add_argument("-l", "--load", dest="load_path", 17 | help="path to saved file", metavar="FILE", required=False) 18 | parser.add_argument('--release', action='store_true', 19 | help='if specified and loading a trained model, release the loaded model for a smaller model ' 20 | 'size.') 21 | parser.add_argument('--predict', action='store_true') 22 | parser.add_argument('--debug', action='store_true') 23 | args = parser.parse_args() 24 | 25 | if args.debug: 26 | config = Config.get_debug_config(args) 27 | else: 28 | config = Config.get_default_config(args) 29 | 30 | model = Model(config) 31 | print('Created model') 32 | if config.TRAIN_PATH: 33 | model.train() 34 | if config.TEST_PATH and not args.data_path: 35 | results, precision, recall, f1 = model.evaluate() 36 | print('Accuracy: ' + str(results)) 37 | print('Precision: ' + str(precision) + ', recall: ' + str(recall) + ', F1: ' + str(f1)) 38 | if args.predict: 39 | print("Under prediction process...") 40 | predictor = InteractivePredictor(config, model) 41 | predictor.predict() 42 | if args.release and args.load_path: 43 | model.evaluate(release=True) 44 | model.close_session() 45 | -------------------------------------------------------------------------------- /code2seq_master/code2seq_ast_extractor.py: -------------------------------------------------------------------------------- 1 | from 
argparse import ArgumentParser 2 | 3 | from config import Config 4 | from interactive_predict import InteractivePredictor 5 | from model import Model 6 | 7 | extract_AST = True 8 | 9 | if __name__ == '__main__': 10 | parser = ArgumentParser() 11 | parser.add_argument("-d", "--data", dest="data_path", 12 | help="path to preprocessed dataset", required=False) 13 | parser.add_argument("-te", "--test", dest="test_path", 14 | help="path to test file", metavar="FILE", required=False) 15 | 16 | parser.add_argument("-s", "--save_prefix", dest="save_path_prefix", 17 | help="path to save file", metavar="FILE", required=False) 18 | parser.add_argument("-l", "--load", dest="load_path", 19 | help="path to saved file", metavar="FILE", required=False) 20 | parser.add_argument('--release', action='store_true', 21 | help='if specified and loading a trained model, release the loaded model for a smaller model ' 22 | 'size.') 23 | parser.add_argument('--predict', action='store_true') 24 | parser.add_argument('--debug', action='store_true') 25 | args = parser.parse_args() 26 | 27 | if args.debug: 28 | config = Config.get_debug_config(args) 29 | else: 30 | config = Config.get_default_config(args) 31 | 32 | model = Model(config) 33 | print('Created model') 34 | if config.TRAIN_PATH: 35 | model.train() 36 | if config.TEST_PATH and not args.data_path: 37 | results, precision, recall, f1 = model.evaluate() 38 | print('Accuracy: ' + str(results)) 39 | print('Precision: ' + str(precision) + ', recall: ' + str(recall) + ', F1: ' + str(f1)) 40 | if args.predict: 41 | print("Under prediction process...") 42 | predictor = InteractivePredictor(config, model) 43 | if extract_AST: 44 | ast_feature_list = predictor.get_ast_paths_for_file() 45 | print(f"AST features for {len(ast_feature_list)} snippets") 46 | print(ast_feature_list) 47 | else: 48 | predictor.predict() 49 | 50 | if args.release and args.load_path: 51 | model.evaluate(release=True) 52 | model.close_session() 53 | -------------------------------------------------------------------------------- /code2seq_master/common.py: -------------------------------------------------------------------------------- 1 | import re 2 | import subprocess 3 | import sys 4 | 5 | 6 | class Common: 7 | internal_delimiter = '|' 8 | SOS = '<S>' 9 | EOS = '</S>' 10 | PAD = '<PAD>' 11 | UNK = '<UNK>' 12 | 13 | @staticmethod 14 | def normalize_word(word): 15 | stripped = re.sub(r'[^a-zA-Z]', '', word) 16 | if len(stripped) == 0: 17 | return word.lower() 18 | else: 19 | return stripped.lower() 20 | 21 | @staticmethod 22 | def load_histogram(path, max_size=None): 23 | histogram = {} 24 | with open(path, 'r') as file: 25 | for line in file.readlines(): 26 | parts = line.split(' ') 27 | if not len(parts) == 2: 28 | continue 29 | histogram[parts[0]] = int(parts[1]) 30 | sorted_histogram = [(k, histogram[k]) for k in sorted(histogram, key=histogram.get, reverse=True)] 31 | return dict(sorted_histogram[:max_size]) 32 | 33 | @staticmethod 34 | def load_vocab_from_dict(word_to_count, add_values=[], max_size=None): 35 | word_to_index, index_to_word = {}, {} 36 | current_index = 0 37 | for value in add_values: 38 | word_to_index[value] = current_index 39 | index_to_word[current_index] = value 40 | current_index += 1 41 | sorted_counts = [(k, word_to_count[k]) for k in sorted(word_to_count, key=word_to_count.get, reverse=True)] 42 | limited_sorted = dict(sorted_counts[:max_size]) 43 | for word, count in limited_sorted.items(): 44 | word_to_index[word] = current_index 45 | index_to_word[current_index] = word 46 | 
current_index += 1 47 | return word_to_index, index_to_word, current_index 48 | 49 | @staticmethod 50 | def binary_to_string(binary_string): 51 | return binary_string.decode("utf-8") 52 | 53 | @staticmethod 54 | def binary_to_string_list(binary_string_list): 55 | return [Common.binary_to_string(w) for w in binary_string_list] 56 | 57 | @staticmethod 58 | def binary_to_string_matrix(binary_string_matrix): 59 | return [Common.binary_to_string_list(l) for l in binary_string_matrix] 60 | 61 | @staticmethod 62 | def binary_to_string_3d(binary_string_tensor): 63 | return [Common.binary_to_string_matrix(l) for l in binary_string_tensor] 64 | 65 | @staticmethod 66 | def legal_method_names_checker(name): 67 | return not name in [Common.UNK, Common.PAD, Common.EOS] 68 | 69 | @staticmethod 70 | def filter_impossible_names(top_words): 71 | result = list(filter(Common.legal_method_names_checker, top_words)) 72 | return result 73 | 74 | @staticmethod 75 | def unique(sequence): 76 | unique = [] 77 | [unique.append(item) for item in sequence if item not in unique] 78 | return unique 79 | 80 | @staticmethod 81 | def parse_results(result, pc_info_dict, topk=5): 82 | prediction_results = {} 83 | results_counter = 0 84 | for single_method in result: 85 | original_name, top_suggestions, top_scores, attention_per_context = list(single_method) 86 | current_method_prediction_results = PredictionResults(original_name) 87 | if attention_per_context is not None: 88 | word_attention_pairs = [(word, attention) for word, attention in 89 | zip(top_suggestions, attention_per_context) if 90 | Common.legal_method_names_checker(word)] 91 | for predicted_word, attention_timestep in word_attention_pairs: 92 | current_timestep_paths = [] 93 | for context, attention in [(key, attention_timestep[key]) for key in 94 | sorted(attention_timestep, key=attention_timestep.get, reverse=True)][ 95 | :topk]: 96 | if context in pc_info_dict: 97 | pc_info = pc_info_dict[context] 98 | current_timestep_paths.append((attention.item(), pc_info)) 99 | 100 | current_method_prediction_results.append_prediction(predicted_word, current_timestep_paths) 101 | else: 102 | for predicted_seq in top_suggestions: 103 | filtered_seq = [word for word in predicted_seq if Common.legal_method_names_checker(word)] 104 | current_method_prediction_results.append_prediction(filtered_seq, None) 105 | 106 | prediction_results[results_counter] = current_method_prediction_results 107 | results_counter += 1 108 | return prediction_results 109 | 110 | @staticmethod 111 | def compute_bleu(ref_file_name, predicted_file_name): 112 | with open(predicted_file_name) as predicted_file: 113 | pipe = subprocess.Popen(["perl", "scripts/multi-bleu.perl", ref_file_name], stdin=predicted_file, 114 | stdout=sys.stdout, stderr=sys.stderr) 115 | 116 | 117 | class PredictionResults: 118 | def __init__(self, original_name): 119 | self.original_name = original_name 120 | self.predictions = list() 121 | 122 | def append_prediction(self, name, current_timestep_paths): 123 | self.predictions.append(SingleTimeStepPrediction(name, current_timestep_paths)) 124 | 125 | class SingleTimeStepPrediction: 126 | def __init__(self, prediction, attention_paths): 127 | self.prediction = prediction 128 | if attention_paths is not None: 129 | paths_with_scores = [] 130 | for attention_score, pc_info in attention_paths: 131 | path_context_dict = {'score': attention_score, 132 | 'path': pc_info.longPath, 133 | 'token1': pc_info.token1, 134 | 'token2': pc_info.token2} 135 | 
paths_with_scores.append(path_context_dict) 136 | self.attention_paths = paths_with_scores 137 | 138 | 139 | class PathContextInformation: 140 | def __init__(self, context): 141 | self.token1 = context['name1'] 142 | self.longPath = context['path'] 143 | self.shortPath = context['shortPath'] 144 | self.token2 = context['name2'] 145 | 146 | def __str__(self): 147 | return '%s,%s,%s' % (self.token1, self.shortPath, self.token2) -------------------------------------------------------------------------------- /code2seq_master/config.py: -------------------------------------------------------------------------------- 1 | class Config: 2 | @staticmethod 3 | def get_default_config(args): 4 | config = Config(args) 5 | config.NUM_EPOCHS = 3000 6 | config.SAVE_EVERY_EPOCHS = 1 7 | config.PATIENCE = 10 8 | config.BATCH_SIZE = 64 #512 9 | config.TEST_BATCH_SIZE = 64 10 | config.READER_NUM_PARALLEL_BATCHES = 1 11 | config.SHUFFLE_BUFFER_SIZE = 10000 12 | config.CSV_BUFFER_SIZE = 100 * 1024 * 1024 # 100 MB 13 | config.MAX_CONTEXTS = 200 14 | config.SUBTOKENS_VOCAB_MAX_SIZE = 190000 15 | config.TARGET_VOCAB_MAX_SIZE = 27000 16 | config.EMBEDDINGS_SIZE = 128 17 | config.RNN_SIZE = 128 * 2 # Two LSTMs to embed paths, each of size 128 18 | config.DECODER_SIZE = 320 19 | config.NUM_DECODER_LAYERS = 1 20 | config.MAX_PATH_LENGTH = 8 + 1 21 | config.MAX_NAME_PARTS = 5 22 | config.MAX_TARGET_PARTS = 6 23 | config.EMBEDDINGS_DROPOUT_KEEP_PROB = 0.75 24 | config.RNN_DROPOUT_KEEP_PROB = 0.5 25 | config.BIRNN = True 26 | config.RANDOM_CONTEXTS = True 27 | config.BEAM_WIDTH = 0 28 | config.USE_MOMENTUM = True 29 | return config 30 | 31 | def take_model_hyperparams_from(self, otherConfig): 32 | self.EMBEDDINGS_SIZE = otherConfig.EMBEDDINGS_SIZE 33 | self.RNN_SIZE = otherConfig.RNN_SIZE 34 | self.DECODER_SIZE = otherConfig.DECODER_SIZE 35 | self.NUM_DECODER_LAYERS = otherConfig.NUM_DECODER_LAYERS 36 | self.BIRNN = otherConfig.BIRNN 37 | if self.DATA_NUM_CONTEXTS <= 0: 38 | self.DATA_NUM_CONTEXTS = otherConfig.DATA_NUM_CONTEXTS 39 | 40 | def __init__(self, args): 41 | self.NUM_EPOCHS = 0 42 | self.SAVE_EVERY_EPOCHS = 0 43 | self.PATIENCE = 0 44 | self.BATCH_SIZE = 0 45 | self.TEST_BATCH_SIZE = 0 46 | self.READER_NUM_PARALLEL_BATCHES = 0 47 | self.SHUFFLE_BUFFER_SIZE = 0 48 | self.CSV_BUFFER_SIZE = None 49 | self.TRAIN_PATH = args.data_path 50 | self.TEST_PATH = args.test_path if args.test_path is not None else '' 51 | self.DATA_NUM_CONTEXTS = 0 52 | self.MAX_CONTEXTS = 0 53 | self.SUBTOKENS_VOCAB_MAX_SIZE = 0 54 | self.TARGET_VOCAB_MAX_SIZE = 0 55 | self.EMBEDDINGS_SIZE = 0 56 | self.RNN_SIZE = 0 57 | self.DECODER_SIZE = 0 58 | self.NUM_DECODER_LAYERS = 0 59 | self.SAVE_PATH = args.save_path_prefix 60 | self.LOAD_PATH = args.load_path 61 | self.MAX_PATH_LENGTH = 0 62 | self.MAX_NAME_PARTS = 0 63 | self.MAX_TARGET_PARTS = 0 64 | self.EMBEDDINGS_DROPOUT_KEEP_PROB = 0 65 | self.RNN_DROPOUT_KEEP_PROB = 0 66 | self.BIRNN = False 67 | self.RANDOM_CONTEXTS = True 68 | self.BEAM_WIDTH = 1 69 | self.USE_MOMENTUM = True 70 | self.RELEASE = args.release 71 | 72 | @staticmethod 73 | def get_debug_config(args): 74 | config = Config(args) 75 | config.NUM_EPOCHS = 3000 76 | config.SAVE_EVERY_EPOCHS = 100 77 | config.PATIENCE = 200 78 | config.BATCH_SIZE = 7 79 | config.TEST_BATCH_SIZE = 7 80 | config.READER_NUM_PARALLEL_BATCHES = 1 81 | config.SHUFFLE_BUFFER_SIZE = 10 82 | config.CSV_BUFFER_SIZE = None 83 | config.MAX_CONTEXTS = 5 84 | config.SUBTOKENS_VOCAB_MAX_SIZE = 190000 85 | config.TARGET_VOCAB_MAX_SIZE = 27000 86 | 
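# Deliberately tiny embedding/RNN/decoder sizes below so a debug run builds and iterates in seconds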
config.EMBEDDINGS_SIZE = 19 87 | config.RNN_SIZE = 10 88 | config.DECODER_SIZE = 11 89 | config.NUM_DECODER_LAYERS = 1 90 | config.MAX_PATH_LENGTH = 8 + 1 91 | config.MAX_NAME_PARTS = 5 92 | config.MAX_TARGET_PARTS = 6 93 | config.EMBEDDINGS_DROPOUT_KEEP_PROB = 1 94 | config.RNN_DROPOUT_KEEP_PROB = 1 95 | config.BIRNN = True 96 | config.RANDOM_CONTEXTS = True 97 | config.BEAM_WIDTH = 0 98 | config.USE_MOMENTUM = False 99 | return config 100 | -------------------------------------------------------------------------------- /code2seq_master/extract_ast.py: -------------------------------------------------------------------------------- 1 | from config import Config 2 | from extractor import Extractor 3 | from argparse import ArgumentParser 4 | 5 | EXTRACTION_API = 'https://po3g2dx2qa.execute-api.us-east-1.amazonaws.com/production/extractmethods' 6 | 7 | def read_file(input_filename): 8 | with open(input_filename, 'r') as file: 9 | return file.readlines() 10 | 11 | 12 | if __name__ == '__main__': 13 | parser = ArgumentParser() 14 | parser.add_argument("-d", "--data", dest="data_path", 15 | help="path to preprocessed dataset", required=False) 16 | parser.add_argument("-te", "--test", dest="test_path", 17 | help="path to test file", metavar="FILE", required=False) 18 | 19 | parser.add_argument("-s", "--save_prefix", dest="save_path_prefix", 20 | help="path to save file", metavar="FILE", required=False) 21 | parser.add_argument("-l", "--load", dest="load_path", 22 | help="path to saved file", metavar="FILE", required=False) 23 | parser.add_argument('--release', action='store_true', 24 | help='if specified and loading a trained model, release the loaded model for a smaller model ' 25 | 'size.') 26 | parser.add_argument('--predict', action='store_true') 27 | parser.add_argument('--debug', action='store_true') 28 | args = parser.parse_args() 29 | 30 | print(f"Args - {args}") 31 | 32 | config = Config.get_default_config(args) 33 | 34 | print(f"Config - {config}") 35 | 36 | path_extractor = Extractor(config, EXTRACTION_API, config.MAX_PATH_LENGTH, max_path_width=2) 37 | 38 | input_filename = 'Input.java' 39 | print('Extraction Begin - AST') 40 | user_input = ' '.join(read_file(input_filename)) 41 | predict_lines, pc_info_dict = path_extractor.extract_paths(user_input) 42 | 43 | 44 | print("*************************** EXTRACTED AST ***************************") 45 | print(predict_lines) 46 | print(pc_info_dict) 47 | 48 | # NOTE: this standalone script has no model instance; to predict on the extracted paths, load a trained model as in code2seq.py / interactive_predict.py. -------------------------------------------------------------------------------- /code2seq_master/extractor.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import requests 4 | 5 | from common import PathContextInformation 6 | 7 | 8 | class Extractor: 9 | def __init__(self, config, extractor_api_url, max_path_length, max_path_width): 10 | self.config = config 11 | self.max_path_length = max_path_length 12 | self.max_path_width = max_path_width 13 | self.extractor_api_url = extractor_api_url 14 | self.bad_characters_table = str.maketrans('', '', '\t\r\n') 15 | 16 | @staticmethod 17 | def post_request(url, code_string): 18 | return requests.post(url, data=json.dumps({"code": code_string, "decompose": True}, separators=(',', ':'))) 19 | 20 | def extract_paths(self, code_string): 21 | # print("Here is the code snippet ---------- ") 22 | # print(code_string) 23 | response = self.post_request(self.extractor_api_url, code_string) 24 | response_array = json.loads(response.text) 25 | 
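# The extraction API returns either a list of extracted methods or an error object; surface errors to the caller: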
if 'errorType' in response_array: 26 | raise ValueError(response.text) 27 | if 'errorMessage' in response_array: 28 | raise TimeoutError(response.text) 29 | pc_info_dict = {} 30 | result = [] 31 | for single_method in response_array: 32 | method_name = single_method['target'] 33 | current_result_line_parts = [method_name] 34 | contexts = single_method['paths'] 35 | # print(f"Number of response paths - {len(contexts)}, and taken till - {self.config.DATA_NUM_CONTEXTS} ") 36 | for context in contexts[:self.config.DATA_NUM_CONTEXTS]: 37 | pc_info = PathContextInformation(context) 38 | current_result_line_parts += [str(pc_info)] 39 | pc_info_dict[(pc_info.token1, pc_info.shortPath, pc_info.token2)] = pc_info 40 | space_padding = ' ' * (self.config.DATA_NUM_CONTEXTS - len(contexts)) 41 | result_line = ' '.join(current_result_line_parts) + space_padding 42 | result.append(result_line) 43 | return result, pc_info_dict 44 | -------------------------------------------------------------------------------- /code2seq_master/images/network.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/code2seq_master/images/network.png -------------------------------------------------------------------------------- /code2seq_master/init.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/code2seq_master/init.py -------------------------------------------------------------------------------- /code2seq_master/interactive_predict.py: -------------------------------------------------------------------------------- 1 | from common import Common 2 | from extractor import Extractor 3 | import json 4 | import time 5 | 6 | SHOW_TOP_CONTEXTS = 10 7 | MAX_PATH_LENGTH = 8 8 | MAX_PATH_WIDTH = 2 9 | EXTRACTION_API = 'https://po3g2dx2qa.execute-api.us-east-1.amazonaws.com/production/extractmethods' 10 | 11 | 12 | class InteractivePredictor: 13 | exit_keywords = ['exit', 'quit', 'q'] 14 | test_extractor = True 15 | 16 | def __init__(self, config, model): 17 | model.predict([]) 18 | self.model = model 19 | self.config = config 20 | self.path_extractor = Extractor(config, EXTRACTION_API, self.config.MAX_PATH_LENGTH, max_path_width=2) 21 | 22 | @staticmethod 23 | def read_file(input_filename): 24 | with open(input_filename, 'r') as file: 25 | return file.readlines() 26 | 27 | 28 | def read_raw_code_data(self, data_file, take=1): 29 | raw_data_snippets = [] 30 | lines = self.read_file(data_file) 31 | for line in lines[:take]: 32 | raw_data_snippets.append(json.loads(line)["code"]) 33 | return raw_data_snippets 34 | 35 | def get_ast_paths_for_file(self): 36 | data_file = '/Users/navinLR/Desktop/ML_for_SE/AutoComments/data/test.json' 37 | print("Begin Extraction") 38 | ast_feature_list = [] 39 | raw_data_snippets = self.read_raw_code_data(data_file) 40 | for ind, snippet in enumerate(raw_data_snippets): 41 | predict_lines = self.get_ast_path_for_snippet(snippet) 42 | # print(f"Extracted AST for Snippet - {ind}") 43 | # print(f"AST Size - {len(predict_lines)}") 44 | ast_feature_list.extend(predict_lines) 45 | return ast_feature_list 46 | 47 | def get_ast_path_for_snippet(self, snippet): 48 | time.sleep(2) 49 | predict_lines, pc_info_dict = self.path_extractor.extract_paths(snippet) 50 | return predict_lines 51 | 52 | def predict(self): 53 | input_filename = 'Input.java' 54 | 
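# Interactive loop: re-read Input.java after each keypress, extract its AST paths, and print the model's predictions.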
print('Serving') 55 | while True: 56 | print('Modify the file: "' + input_filename + '" and press any key when ready, or "q" / "exit" to exit') 57 | user_input = input() 58 | if user_input.lower() in self.exit_keywords: 59 | print('Exiting...') 60 | return 61 | user_input = ' '.join(self.read_file(input_filename)) 62 | try: 63 | predict_lines, pc_info_dict = self.path_extractor.extract_paths(user_input) 64 | except ValueError: 65 | continue 66 | 67 | # Navin's modification: test with pre-extracted ASTs instead of the live extractor 68 | if self.test_extractor: 69 | print("Testing Extracted ASTs") 70 | predict_lines = [self.read_file("../data/auto_comment_dataset/auto_comment_dataset.test.c2s")[1].replace("\n","")] # Take the first line 71 | 72 | # print(f"Path Extractor o/p - \n {predict_lines}") 73 | 74 | model_results = self.model.predict(predict_lines) 75 | 76 | # print(f"Model results -------- \n{model_results}") 77 | 78 | prediction_results = Common.parse_results(model_results, pc_info_dict, topk=SHOW_TOP_CONTEXTS) 79 | for index, method_prediction in prediction_results.items(): 80 | print('Original name:\t' + method_prediction.original_name) 81 | if self.config.BEAM_WIDTH == 0: 82 | print('Predicted:\t%s' % [step.prediction for step in method_prediction.predictions]) 83 | for timestep, single_timestep_prediction in enumerate(method_prediction.predictions): 84 | print('Attention:') 85 | print('TIMESTEP: %d\t: %s' % (timestep, single_timestep_prediction.prediction)) 86 | for attention_obj in single_timestep_prediction.attention_paths: 87 | print('%f\tcontext: %s,%s,%s' % ( 88 | attention_obj['score'], attention_obj['token1'], attention_obj['path'], 89 | attention_obj['token2'])) 90 | else: 91 | print('Predicted:') 92 | for predicted_seq in method_prediction.predictions: 93 | print('\t%s' % predicted_seq.prediction) 94 | -------------------------------------------------------------------------------- /code2seq_master/java_files_creator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | 4 | base_folder = "java_code_data/" 5 | sub_folder = "data" 6 | 7 | dataset = "data/test.csv" 8 | 9 | def read_file(input_filename): 10 | with open(input_filename, 'r') as file: 11 | return file.readlines() 12 | 13 | with open(dataset) as csv_file: 14 | csv_reader = csv.reader(csv_file, delimiter=',') 15 | for index, row in enumerate(csv_reader): 16 | print(f"Writing Java snippet no. {index}") 17 | to_write_path = base_folder+sub_folder+str(index) 18 | os.mkdir(to_write_path) 19 | code = row[0] 20 | f = open(to_write_path+"/Input.java", "w") 21 | f.write(code) 22 | f.close() 23 | -------------------------------------------------------------------------------- /code2seq_master/preprocess.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from argparse import ArgumentParser 3 | 4 | import numpy as np 5 | 6 | import common 7 | 8 | ''' 9 | This script preprocesses the data from MethodPaths. It truncates methods with too many contexts, 10 | and pads methods that have fewer paths with spaces. 
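For example, with max_data_contexts = 1000, a method with 1,200 contexts is randomly downsampled to 1,000, while a method with 800 contexts is right-padded with spaces up to the fixed width.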
11 | ''' 12 | 13 | 14 | def save_dictionaries(dataset_name, subtoken_to_count, node_to_count, target_to_count, max_contexts, num_examples): 15 | save_dict_file_path = '{}.dict.c2s'.format(dataset_name) 16 | with open(save_dict_file_path, 'wb') as file: 17 | pickle.dump(subtoken_to_count, file) 18 | pickle.dump(node_to_count, file) 19 | pickle.dump(target_to_count, file) 20 | pickle.dump(max_contexts, file) 21 | pickle.dump(num_examples, file) 22 | print('Dictionaries saved to: {}'.format(save_dict_file_path)) 23 | 24 | 25 | def process_file(file_path, data_file_role, dataset_name, max_contexts, max_data_contexts): 26 | sum_total = 0 27 | sum_sampled = 0 28 | total = 0 29 | max_unfiltered = 0 30 | max_contexts_to_sample = max_data_contexts if data_file_role == 'train' else max_contexts 31 | output_path = '{}.{}.c2s'.format(dataset_name, data_file_role) 32 | with open(output_path, 'w') as outfile: 33 | with open(file_path, 'r') as file: 34 | for line in file: 35 | parts = line.rstrip('\n').split(' ') 36 | target_name = parts[0] 37 | contexts = parts[1:] 38 | 39 | if len(contexts) > max_unfiltered: 40 | max_unfiltered = len(contexts) 41 | 42 | sum_total += len(contexts) 43 | if len(contexts) > max_contexts_to_sample: 44 | contexts = np.random.choice(contexts, max_contexts_to_sample, replace=False) 45 | 46 | sum_sampled += len(contexts) 47 | 48 | csv_padding = " " * (max_data_contexts - len(contexts)) 49 | total += 1 50 | outfile.write(target_name + ' ' + " ".join(contexts) + csv_padding + '\n') 51 | 52 | print('File: ' + file_path) 53 | print('Average total contexts: ' + str(float(sum_total) / total)) 54 | print('Average final (after sampling) contexts: ' + str(float(sum_sampled) / total)) 55 | print('Total examples: ' + str(total)) 56 | print('Max number of contexts per word: ' + str(max_unfiltered)) 57 | return total 58 | 59 | 60 | def context_full_found(context_parts, word_to_count, path_to_count): 61 | return context_parts[0] in word_to_count \ 62 | and context_parts[1] in path_to_count and context_parts[2] in word_to_count 63 | 64 | 65 | def context_partial_found(context_parts, word_to_count, path_to_count): 66 | return context_parts[0] in word_to_count \ 67 | or context_parts[1] in path_to_count or context_parts[2] in word_to_count 68 | 69 | 70 | if __name__ == '__main__': 71 | parser = ArgumentParser() 72 | parser.add_argument("-trd", "--train_data", dest="train_data_path", 73 | help="path to training data file", required=True) 74 | parser.add_argument("-ted", "--test_data", dest="test_data_path", 75 | help="path to test data file", required=True) 76 | parser.add_argument("-vd", "--val_data", dest="val_data_path", 77 | help="path to validation data file", required=True) 78 | parser.add_argument("-mc", "--max_contexts", dest="max_contexts", default=200, 79 | help="number of max contexts to keep in test+validation", required=False) 80 | parser.add_argument("-mdc", "--max_data_contexts", dest="max_data_contexts", default=1000, 81 | help="number of max contexts to keep in the dataset", required=False) 82 | parser.add_argument("-svs", "--subtoken_vocab_size", dest="subtoken_vocab_size", default=186277, 83 | help="Max number of source subtokens to keep in the vocabulary", required=False) 84 | parser.add_argument("-tvs", "--target_vocab_size", dest="target_vocab_size", default=26347, 85 | help="Max number of target words to keep in the vocabulary", required=False) 86 | parser.add_argument("-sh", "--subtoken_histogram", dest="subtoken_histogram", 87 | help="subtoken histogram file", 
metavar="FILE", required=True) 88 | parser.add_argument("-nh", "--node_histogram", dest="node_histogram", 89 | help="node_histogram file", metavar="FILE", required=True) 90 | parser.add_argument("-th", "--target_histogram", dest="target_histogram", 91 | help="target histogram file", metavar="FILE", required=True) 92 | parser.add_argument("-o", "--output_name", dest="output_name", 93 | help="output name - the base name for the created dataset", required=True, default='data') 94 | args = parser.parse_args() 95 | 96 | train_data_path = args.train_data_path 97 | test_data_path = args.test_data_path 98 | val_data_path = args.val_data_path 99 | subtoken_histogram_path = args.subtoken_histogram 100 | node_histogram_path = args.node_histogram 101 | 102 | subtoken_to_count = common.Common.load_histogram(subtoken_histogram_path, 103 | max_size=int(args.subtoken_vocab_size)) 104 | node_to_count = common.Common.load_histogram(node_histogram_path, 105 | max_size=None) 106 | target_to_count = common.Common.load_histogram(args.target_histogram, 107 | max_size=int(args.target_vocab_size)) 108 | print('subtoken vocab size: ', len(subtoken_to_count)) 109 | print('node vocab size: ', len(node_to_count)) 110 | print('target vocab size: ', len(target_to_count)) 111 | 112 | num_training_examples = 0 113 | for data_file_path, data_role in zip([test_data_path, val_data_path, train_data_path], ['test', 'val', 'train']): 114 | num_examples = process_file(file_path=data_file_path, data_file_role=data_role, dataset_name=args.output_name, 115 | max_contexts=int(args.max_contexts), max_data_contexts=int(args.max_data_contexts)) 116 | if data_role == 'train': 117 | num_training_examples = num_examples 118 | 119 | save_dictionaries(dataset_name=args.output_name, subtoken_to_count=subtoken_to_count, 120 | node_to_count=node_to_count, target_to_count=target_to_count, 121 | max_contexts=int(args.max_data_contexts), num_examples=num_training_examples) 122 | -------------------------------------------------------------------------------- /code2seq_master/preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | ########################################################### 3 | # Change the following values to preprocess a new dataset. 4 | # TRAIN_DIR, VAL_DIR and TEST_DIR should be paths to 5 | # directories containing sub-directories with .java files 6 | # DATASET_NAME is just a name for the currently extracted 7 | # dataset. 8 | # MAX_DATA_CONTEXTS is the number of contexts to keep in the dataset for each 9 | # method (by default 1000). At training time, these contexts 10 | # will be downsampled dynamically to MAX_CONTEXTS. 11 | # MAX_CONTEXTS - the number of actual contexts (by default 200) 12 | # that are taken into consideration (out of MAX_DATA_CONTEXTS) 13 | # every training iteration. To avoid randomness at test time, 14 | # for the test and validation sets only MAX_CONTEXTS contexts are kept 15 | # (while for training, MAX_DATA_CONTEXTS are kept and MAX_CONTEXTS are 16 | # selected dynamically during training). 17 | # SUBTOKEN_VOCAB_SIZE, TARGET_VOCAB_SIZE - 18 | # - the number of subtokens and target words to keep 19 | # in the vocabulary (the top occurring words and paths will be kept). 20 | # NUM_THREADS - the number of parallel threads to use. It is 21 | # recommended to use a multi-core machine for the preprocessing 22 | # step and set this value to the number of cores. 23 | # PYTHON - python3 interpreter alias. 
24 | TRAIN_DIR=my_training_dir 25 | VAL_DIR=my_val_dir 26 | TEST_DIR=my_test_dir 27 | DATASET_NAME=my_dataset 28 | MAX_DATA_CONTEXTS=1000 29 | MAX_CONTEXTS=200 30 | SUBTOKEN_VOCAB_SIZE=186277 31 | TARGET_VOCAB_SIZE=26347 32 | NUM_THREADS=64 33 | PYTHON=python3 34 | ########################################################### 35 | 36 | TRAIN_DATA_FILE=${DATASET_NAME}.train.raw.txt 37 | VAL_DATA_FILE=${DATASET_NAME}.val.raw.txt 38 | TEST_DATA_FILE=${DATASET_NAME}.test.raw.txt 39 | EXTRACTOR_JAR=JavaExtractor/JPredict/target/JavaExtractor-0.0.1-SNAPSHOT.jar 40 | 41 | mkdir -p data 42 | mkdir -p data/${DATASET_NAME} 43 | 44 | echo "Extracting paths from validation set..." 45 | ${PYTHON} JavaExtractor/extract.py --dir ${VAL_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --jar ${EXTRACTOR_JAR} > ${VAL_DATA_FILE} 2>> error_log.txt 46 | echo "Finished extracting paths from validation set" 47 | echo "Extracting paths from test set..." 48 | ${PYTHON} JavaExtractor/extract.py --dir ${TEST_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --jar ${EXTRACTOR_JAR} > ${TEST_DATA_FILE} 2>> error_log.txt 49 | echo "Finished extracting paths from test set" 50 | echo "Extracting paths from training set..." 51 | ${PYTHON} JavaExtractor/extract.py --dir ${TRAIN_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --jar ${EXTRACTOR_JAR} | shuf > ${TRAIN_DATA_FILE} 2>> error_log.txt 52 | echo "Finished extracting paths from training set" 53 | 54 | TARGET_HISTOGRAM_FILE=data/${DATASET_NAME}/${DATASET_NAME}.histo.tgt.c2s 55 | SOURCE_SUBTOKEN_HISTOGRAM=data/${DATASET_NAME}/${DATASET_NAME}.histo.ori.c2s 56 | NODE_HISTOGRAM_FILE=data/${DATASET_NAME}/${DATASET_NAME}.histo.node.c2s 57 | 58 | echo "Creating histograms from the training data" 59 | cat ${TRAIN_DATA_FILE} | cut -d' ' -f1 | tr '|' '\n' | awk '{n[$0]++} END {for (i in n) print i,n[i]}' > ${TARGET_HISTOGRAM_FILE} 60 | cat ${TRAIN_DATA_FILE} | cut -d' ' -f2- | tr ' ' '\n' | cut -d',' -f1,3 | tr ',|' '\n' | awk '{n[$0]++} END {for (i in n) print i,n[i]}' > ${SOURCE_SUBTOKEN_HISTOGRAM} 61 | cat ${TRAIN_DATA_FILE} | cut -d' ' -f2- | tr ' ' '\n' | cut -d',' -f2 | tr '|' '\n' | awk '{n[$0]++} END {for (i in n) print i,n[i]}' > ${NODE_HISTOGRAM_FILE} 62 | 63 | ${PYTHON} preprocess.py --train_data ${TRAIN_DATA_FILE} --test_data ${TEST_DATA_FILE} --val_data ${VAL_DATA_FILE} \ 64 | --max_contexts ${MAX_CONTEXTS} --max_data_contexts ${MAX_DATA_CONTEXTS} --subtoken_vocab_size ${SUBTOKEN_VOCAB_SIZE} \ 65 | --target_vocab_size ${TARGET_VOCAB_SIZE} --subtoken_histogram ${SOURCE_SUBTOKEN_HISTOGRAM} \ 66 | --node_histogram ${NODE_HISTOGRAM_FILE} --target_histogram ${TARGET_HISTOGRAM_FILE} --output_name data/${DATASET_NAME}/${DATASET_NAME} 67 | 68 | # If all went well, the raw data files can be deleted, because preprocess.py creates new files 69 | # with truncated and padded number of paths for each example. 70 | rm ${TRAIN_DATA_FILE} ${VAL_DATA_FILE} ${TEST_DATA_FILE} ${TARGET_HISTOGRAM_FILE} ${SOURCE_SUBTOKEN_HISTOGRAM} \ 71 | ${NODE_HISTOGRAM_FILE} 72 | 73 | -------------------------------------------------------------------------------- /code2seq_master/preprocess_csharp.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | ########################################################### 3 | # Change the following values to preprocess a new dataset. 
4 | # TRAIN_DIR, VAL_DIR and TEST_DIR should be paths to 5 | # directories containing sub-directories with .cs files 6 | # DATASET_NAME is just a name for the currently extracted 7 | # dataset. 8 | # MAX_DATA_CONTEXTS is the number of contexts to keep in the dataset for each 9 | # method (by default 1000). At training time, these contexts 10 | # will be downsampled dynamically to MAX_CONTEXTS. 11 | # MAX_CONTEXTS - the number of actual contexts (by default 200) 12 | # that are taken into consideration (out of MAX_DATA_CONTEXTS) 13 | # every training iteration. To avoid randomness at test time, 14 | # for the test and validation sets only MAX_CONTEXTS contexts are kept 15 | # (while for training, MAX_DATA_CONTEXTS are kept and MAX_CONTEXTS are 16 | # selected dynamically during training). 17 | # SUBTOKEN_VOCAB_SIZE, TARGET_VOCAB_SIZE - 18 | # - the number of subtokens and target words to keep 19 | # in the vocabulary (the top occurring words and paths will be kept). 20 | # NUM_THREADS - the number of parallel threads to use. It is 21 | # recommended to use a multi-core machine for the preprocessing 22 | # step and set this value to the number of cores. 23 | # PYTHON - python3 interpreter alias. 24 | 25 | #TRAIN_DIR=JavaExtractor/JPredict/src/main/java/JavaExtractor/Common 26 | #VAL_DIR=JavaExtractor/JPredict/src/main/java/JavaExtractor/Common 27 | #TEST_DIR=JavaExtractor/JPredict/src/main/java/JavaExtractor/Common 28 | 29 | TRAIN_DIR=../java_code_train 30 | VAL_DIR=../java_code_valid 31 | TEST_DIR=../java_code_test 32 | 33 | DATASET_NAME=my_dataset 34 | MAX_DATA_CONTEXTS=1000 35 | MAX_CONTEXTS=200 36 | SUBTOKEN_VOCAB_SIZE=186277 37 | TARGET_VOCAB_SIZE=26347 38 | NUM_THREADS=64 39 | PYTHON=python3 40 | ########################################################### 41 | 42 | TRAIN_DATA_FILE=${DATASET_NAME}.train.raw.txt 43 | VAL_DATA_FILE=${DATASET_NAME}.val.raw.txt 44 | TEST_DATA_FILE=${DATASET_NAME}.test.raw.txt 45 | EXTRACTOR_JAR=CSharpExtractor/CSharpExtractor/Extractor/Extractor.csproj 46 | 47 | #EXTRACTOR_JAR=JavaExtractor/JPredict/target/JavaExtractor-0.0.1-SNAPSHOT.jar 48 | 49 | mkdir -p data 50 | mkdir -p data/${DATASET_NAME} 51 | 52 | echo "Extracting paths from validation set..." 53 | ${PYTHON} CSharpExtractor/extract.py --dir ${VAL_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --csproj ${EXTRACTOR_JAR} --ofile_name ${VAL_DATA_FILE} 2>> error_log.txt 54 | echo "Finished extracting paths from validation set" 55 | echo "Extracting paths from test set..." 56 | ${PYTHON} CSharpExtractor/extract.py --dir ${TEST_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --csproj ${EXTRACTOR_JAR} --ofile_name ${TEST_DATA_FILE} 2>> error_log.txt 57 | echo "Finished extracting paths from test set" 58 | echo "Extracting paths from training set..." 
59 | ${PYTHON} CSharpExtractor/extract.py --dir ${TRAIN_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --csproj ${EXTRACTOR_JAR} --ofile_name ${TRAIN_DATA_FILE}_unshuf 2>> error_log.txt 60 | echo "Finished extracting paths from training set" 61 | echo "Shuffling training data" 62 | cat ${TRAIN_DATA_FILE}_unshuf | shuf > ${TRAIN_DATA_FILE} 63 | rm ${TRAIN_DATA_FILE}_unshuf 64 | 65 | TARGET_HISTOGRAM_FILE=data/${DATASET_NAME}/${DATASET_NAME}.histo.tgt.c2s 66 | SOURCE_SUBTOKEN_HISTOGRAM=data/${DATASET_NAME}/${DATASET_NAME}.histo.ori.c2s 67 | NODE_HISTOGRAM_FILE=data/${DATASET_NAME}/${DATASET_NAME}.histo.node.c2s 68 | 69 | echo "Creating histograms from the training data" 70 | cat ${TRAIN_DATA_FILE} | cut -d' ' -f1 | tr '|' '\n' | awk '{n[$0]++} END {for (i in n) print i,n[i]}' > ${TARGET_HISTOGRAM_FILE} 71 | cat ${TRAIN_DATA_FILE} | cut -d' ' -f2- | tr ' ' '\n' | cut -d',' -f1,3 | tr ',|' '\n' | awk '{n[$0]++} END {for (i in n) print i,n[i]}' > ${SOURCE_SUBTOKEN_HISTOGRAM} 72 | cat ${TRAIN_DATA_FILE} | cut -d' ' -f2- | tr ' ' '\n' | cut -d',' -f2 | tr '|' '\n' | awk '{n[$0]++} END {for (i in n) print i,n[i]}' > ${NODE_HISTOGRAM_FILE} 73 | 74 | ${PYTHON} preprocess.py --train_data ${TRAIN_DATA_FILE} --test_data ${TEST_DATA_FILE} --val_data ${VAL_DATA_FILE} \ 75 | --max_contexts ${MAX_CONTEXTS} --max_data_contexts ${MAX_DATA_CONTEXTS} --subtoken_vocab_size ${SUBTOKEN_VOCAB_SIZE} \ 76 | --target_vocab_size ${TARGET_VOCAB_SIZE} --subtoken_histogram ${SOURCE_SUBTOKEN_HISTOGRAM} \ 77 | --node_histogram ${NODE_HISTOGRAM_FILE} --target_histogram ${TARGET_HISTOGRAM_FILE} --output_name data/${DATASET_NAME}/${DATASET_NAME} 78 | 79 | # If all went well, the raw data files can be deleted, because preprocess.py creates new files 80 | # with truncated and padded number of paths for each example. 81 | rm ${TRAIN_DATA_FILE} ${VAL_DATA_FILE} ${TEST_DATA_FILE} ${TARGET_HISTOGRAM_FILE} ${SOURCE_SUBTOKEN_HISTOGRAM} \ 82 | ${NODE_HISTOGRAM_FILE} 83 | 84 | 85 | -------------------------------------------------------------------------------- /code2seq_master/preprocess_custom.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | ########################################################### 3 | # Change the following values to preprocess a new dataset. 4 | # TRAIN_DIR, VAL_DIR and TEST_DIR should be paths to 5 | # directories containing sub-directories with .java files 6 | # DATASET_NAME is just a name for the currently extracted 7 | # dataset. 8 | # MAX_DATA_CONTEXTS is the number of contexts to keep in the dataset for each 9 | # method (by default 1000). At training time, these contexts 10 | # will be downsampled dynamically to MAX_CONTEXTS. 11 | # MAX_CONTEXTS - the number of actual contexts (by default 200) 12 | # that are taken into consideration (out of MAX_DATA_CONTEXTS) 13 | # every training iteration. To avoid randomness at test time, 14 | # for the test and validation sets only MAX_CONTEXTS contexts are kept 15 | # (while for training, MAX_DATA_CONTEXTS are kept and MAX_CONTEXTS are 16 | # selected dynamically during training). 17 | # SUBTOKEN_VOCAB_SIZE, TARGET_VOCAB_SIZE - 18 | # - the number of subtokens and target words to keep 19 | # in the vocabulary (the top occurring words and paths will be kept). 20 | # NUM_THREADS - the number of parallel threads to use. It is 21 | # recommended to use a multi-core machine for the preprocessing 22 | # step and set this value to the number of cores. 
23 | # PYTHON - python3 interpreter alias. 24 | #TRAIN_DIR=my_training_dir 25 | #VAL_DIR=my_val_dir 26 | MAIN_DIR="java_code_data" 27 | DATASET_NAME="sample_set" 28 | MAX_DATA_CONTEXTS=1000 29 | MAX_CONTEXTS=200 30 | SUBTOKEN_VOCAB_SIZE=186277 31 | TARGET_VOCAB_SIZE=26347 32 | NUM_THREADS=64 33 | PYTHON=python3 34 | ########################################################### 35 | 36 | #TRAIN_DATA_FILE=${DATASET_NAME}.train.raw.txt 37 | #VAL_DATA_FILE=${DATASET_NAME}.val.raw.txt 38 | MAIN_DATA_FILE=${DATASET_NAME}.test.raw.txt 39 | EXTRACTOR_JAR=JavaExtractor/JPredict/target/JavaExtractor-0.0.1-SNAPSHOT.jar 40 | 41 | mkdir -p data 42 | mkdir -p data/${DATASET_NAME} 43 | 44 | echo "Extracting paths from set..." 45 | ${PYTHON} JavaExtractor/extract.py --dir ${MAIN_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --jar ${EXTRACTOR_JAR} > ${MAIN_DATA_FILE} 2>> error_log.txt 46 | echo "Finished extracting paths from set" 47 | #echo "Extracting paths from test set..." 48 | #${PYTHON} JavaExtractor/extract.py --dir ${TEST_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --jar ${EXTRACTOR_JAR} > ${TEST_DATA_FILE} 2>> error_log.txt 49 | #echo "Finished extracting paths from test set" 50 | #echo "Extracting paths from training set..." 51 | #${PYTHON} JavaExtractor/extract.py --dir ${TRAIN_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --jar ${EXTRACTOR_JAR} | shuf > ${TRAIN_DATA_FILE} 2>> error_log.txt 52 | #echo "Finished extracting paths from training set" 53 | 54 | TARGET_HISTOGRAM_FILE=data/${DATASET_NAME}/${DATASET_NAME}.histo.tgt.c2s 55 | SOURCE_SUBTOKEN_HISTOGRAM=data/${DATASET_NAME}/${DATASET_NAME}.histo.ori.c2s 56 | NODE_HISTOGRAM_FILE=data/${DATASET_NAME}/${DATASET_NAME}.histo.node.c2s 57 | # 58 | echo "Creating histograms from the training data" 59 | cat ${MAIN_DATA_FILE} | cut -d' ' -f1 | tr '|' '\n' | awk '{n[$0]++} END {for (i in n) print i,n[i]}' > ${TARGET_HISTOGRAM_FILE} 60 | cat ${MAIN_DATA_FILE} | cut -d' ' -f2- | tr ' ' '\n' | cut -d',' -f1,3 | tr ',|' '\n' | awk '{n[$0]++} END {for (i in n) print i,n[i]}' > ${SOURCE_SUBTOKEN_HISTOGRAM} 61 | cat ${MAIN_DATA_FILE} | cut -d' ' -f2- | tr ' ' '\n' | cut -d',' -f2 | tr '|' '\n' | awk '{n[$0]++} END {for (i in n) print i,n[i]}' > ${NODE_HISTOGRAM_FILE} 62 | # 63 | ${PYTHON} preprocess.py --train_data ${MAIN_DATA_FILE} --test_data ${MAIN_DATA_FILE} --val_data ${MAIN_DATA_FILE} \ 64 | --max_contexts ${MAX_CONTEXTS} --max_data_contexts ${MAX_DATA_CONTEXTS} --subtoken_vocab_size ${SUBTOKEN_VOCAB_SIZE} \ 65 | --target_vocab_size ${TARGET_VOCAB_SIZE} --subtoken_histogram ${SOURCE_SUBTOKEN_HISTOGRAM} \ 66 | --node_histogram ${NODE_HISTOGRAM_FILE} --target_histogram ${TARGET_HISTOGRAM_FILE} --output_name data/${DATASET_NAME}/${DATASET_NAME} 67 | # 68 | ## If all went well, the raw data files can be deleted, because preprocess.py creates new files 69 | ## with truncated and padded number of paths for each example. 
70 | rm ${MAIN_DATA_FILE} ${TARGET_HISTOGRAM_FILE} ${SOURCE_SUBTOKEN_HISTOGRAM} ${NODE_HISTOGRAM_FILE} 71 | -------------------------------------------------------------------------------- /code2seq_master/test_extracted_ast.py: -------------------------------------------------------------------------------- 1 | def read_file(input_filename): 2 | with open(input_filename, 'r') as file: 3 | return file.readlines() 4 | 5 | ast_file = "data/sample_set/sample_set.train.c2s" 6 | 7 | 8 | ast_of_snippets = read_file(ast_file) 9 | print(len(ast_of_snippets)) 10 | print(ast_of_snippets[0]) 11 | 12 | # for snippet in ast_of_snippets: 13 | # print(snippet) 14 | 15 | -------------------------------------------------------------------------------- /code2seq_master/train.sh: -------------------------------------------------------------------------------- 1 | ########################################################### 2 | # Change the following values to train a new model. 3 | # type: the name of the new model, only affects the saved file name. 4 | # dataset: the name of the dataset, as was preprocessed using preprocess.sh 5 | # test_data: by default, points to the validation set, since this is the set that 6 | # will be evaluated after each training iteration. If you wish to test 7 | # on the final (held-out) test set, change 'val' to 'test'. 8 | type=java-trial-model 9 | dataset_name=auto_comment_dataset 10 | data_dir=../data/auto_comment_dataset 11 | data=${data_dir}/${dataset_name} 12 | test_data=${data_dir}/${dataset_name}.val.c2s 13 | model_dir=models/${type} 14 | 15 | mkdir -p ${model_dir} 16 | set -e 17 | python3 -u code2seq.py --data ${data} --test ${test_data} --save_prefix ${model_dir}/model 18 | -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | auto_comment_dataset 2 | *.json 3 | 4 | -------------------------------------------------------------------------------- /data/data.7z: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/data/data.7z -------------------------------------------------------------------------------- /images/network_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/images/network_architecture.png -------------------------------------------------------------------------------- /images/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/images/pipeline.png -------------------------------------------------------------------------------- /poster/ML4SE_Poster_Group_3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/poster/ML4SE_Poster_Group_3.pdf -------------------------------------------------------------------------------- /poster/source_code/example.java: -------------------------------------------------------------------------------- 1 | public static int add(int VAR0, int VAR1) { 2 | return VAR0 + VAR1; 3 | } 4 | -------------------------------------------------------------------------------- 
/poster/source_code/img/Embedding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/poster/source_code/img/Embedding.png -------------------------------------------------------------------------------- /poster/source_code/img/TU_P1_full-color.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/poster/source_code/img/TU_P1_full-color.png -------------------------------------------------------------------------------- /poster/source_code/img/distr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/poster/source_code/img/distr.png -------------------------------------------------------------------------------- /poster/source_code/img/link_to_github.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/poster/source_code/img/link_to_github.png -------------------------------------------------------------------------------- /poster/source_code/img/results_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/poster/source_code/img/results_table.png -------------------------------------------------------------------------------- /poster/source_code/img/zoomedInLength.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/poster/source_code/img/zoomedInLength.png -------------------------------------------------------------------------------- /poster/source_code/poster.tex: -------------------------------------------------------------------------------- 1 | %By% TODO: 2 | % - cifar10 table 3 | % - eig plots 4 | % - bullet points 5 | % - research question explicit 6 | %Copyright (c) 2013 Joost van Zwieten 7 | % 8 | % Permission is hereby granted, free of charge, to any person obtaining a copy 9 | % of this software and associated documentation files (the "Software"), to deal 10 | % in the Software without restriction, including without limitation the rights 11 | % to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | % copies of the Software, and to permit persons to whom the Software is 13 | % furnished to do so, subject to the following conditions: 14 | % 15 | % The above copyright notice and this permission notice shall be included in 16 | % all copies or substantial portions of the Software. 17 | % 18 | % THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | % IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | % FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | % AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | % LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | % OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 24 | % THE SOFTWARE. 
25 | % 26 | \documentclass{tudelftposter} 27 | 28 | % optional, makes QR code clickable 29 | \usepackage[hidelinks,implicit=false,bookmarks=false]{hyperref} 30 | \usepackage{booktabs} 31 | \usepackage{listings} 32 | \usepackage{xcolor} 33 | \usepackage{mathtools} 34 | % subfigure conflicts with its successor subfig, so only subfig is loaded 35 | \usepackage{subfig} 36 | 37 | \definecolor{light-gray}{gray}{0.97} %the shade of grey that stack exchange uses 38 | \definecolor{codegray}{rgb}{0.5,0.5,0.5} 39 | \lstdefinestyle{mystyle}{ 40 | language = Java, 41 | numberstyle=\tiny\color{codegray}, 42 | basicstyle=\ttfamily\footnotesize, 43 | breakatwhitespace=false, 44 | breaklines=true, 45 | captionpos=b, 46 | keepspaces=true, 47 | numbers=left, 48 | numbersep=2pt, 49 | showspaces=false, 50 | showstringspaces=false, 51 | showtabs=false, 52 | tabsize=2 53 | } 54 | 55 | \lstset{style=mystyle} 56 | 57 | 58 | \title{Auto Comments: Generating Java code comments} 59 | 60 | \addauthornote{diam}{Delft Institute of Computer Science, TU Delft} 61 | 62 | \addauthor[diam]{R. Navin} 63 | \addauthor[diam]{J. Katzy} 64 | \addauthor[diam]{R. Skoulos} 65 | \addauthor[diam]{T. Pfann} 66 | 67 | \addfootimage(c:right column.center)[Delft Institute of Computer Science]{img/TU_P1_full-color.png} 68 | \addfootqrcode(l:left column.left)[project repository]{https://github.com/LRNavin/AutoComments} 69 | 70 | \begin{document} 71 | 72 | \section{Motivation \& Goal} 73 | \begin{itemize} 74 | \item In software development and maintenance, developers spend around 59\% of their time on program comprehension activities. 75 | \item Our goal: automatically generate human-readable comments for code snippets. 76 | \item With DeepCom as our baseline, we propose: 77 | \begin{itemize} 78 | \item Method-1: A replication of code2seq, extended to generate natural-language comments. 79 | \item Method-2: Learning on modified ASTs, to address Out-of-Vocabulary problems. 80 | \end{itemize} 81 | \end{itemize} 82 | 83 | \section{Experiment Setup} 84 | Java methods are parsed into ASTs, which are encoded and passed to an encoder-decoder sequence-to-sequence neural network based on bidirectional LSTMs (a code2seq-based architecture). 
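% For intuition, a sketch of the path-context encoding along the lines of the
% code2seq paper (the notation below is assumed from that paper, not taken
% from this repository): an AST path v_1 .. v_k between terminal tokens x and
% y is encoded by a bidirectional LSTM as
%   h_p = [\overrightarrow{LSTM}(v_1..v_k) ; \overleftarrow{LSTM}(v_k..v_1)],
% the terminals as sums of learned subtoken embeddings, and the combined
% context vector as
%   z = \tanh(W_{in} [h_p ; \sum_{s \in x} E^{subtokens}_s ; \sum_{s \in y} E^{subtokens}_s]).
% The decoder then attends over the set of context vectors z while emitting
% the comment one token at a time.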
85 | 86 | % \paragraph{Dataset} 87 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 88 | \lstinputlisting[language=Java, caption=Java example, frame=tb, backgroundcolor = \color{light-gray}]{example.java} 89 | \label{code:examplefunction} 90 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 91 | \begin{figure}[H] 92 | \centering 93 | \includegraphics[width=0.4\linewidth]{img/Embedding.png} 94 | \caption{Example AST of Function \ref{code:examplefunction}; the example path is superimposed with thick arrows.} 95 | \label{fig:exampleAST} 96 | \end{figure} 97 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 98 | \textbf{Dataset} 99 | % \paragraph{Dataset} 100 | \begin{table}[H] 101 | \centering 102 | \resizebox{\linewidth}{!}{ 103 | \begin{tabular}{c c c c c} 104 | \# Methods & \# All tokens & \# All identifiers & \# Unique tokens & \# Unique identifiers\\ 105 | \toprule 106 | 588,108 & 44,378,497 & 13,779,297 & 794,711 & 794,621 107 | \end{tabular}} 108 | \caption{Statistics for code snippets in the DeepCom dataset} 109 | \label{tab:dataset-statistics} 110 | \end{table} 111 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 112 | \begin{figure}[H] 113 | \centering 114 | \subfloat[Full distribution]{\includegraphics{img/distr.png}} 115 | \subfloat[$<$40 words in comments]{\includegraphics{img/zoomedInLength.png}} 116 | \caption{Dataset distribution of target comment lengths.} 117 | \label{fig:data_dist} 118 | \end{figure} 119 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 120 | 121 | % \paragraph{Encoding} 122 | % \begin{figure}[H] 123 | % \centering 124 | % \includegraphics[width =0.8\linewidth]{Encoder(1).png} 125 | % \caption{Graphic representation of Encoder, $Encode(x) = \sum_{s\in x} E^{\text{subtokens}}_s$} 126 | % \label{fig:encoder} 127 | % \end{figure}{} 128 | % \paragraph{Decoding} 129 | 130 | 131 | 132 | % \paragraph{Dataset} 133 | 134 | 135 | \paragraph{Training} 136 | \begin{itemize} 137 | \item Setup: 138 | \begin{itemize} 139 | % \item Cross-entropy loss with a Nesterov momentum of 0.95. 140 | \item Learning rate 0.01 with a decay of 0.05 every epoch. 141 | \item Embedding size: 128, Encoder size: 256, Decoder size: 640, Batch size: 128. 142 | \item Trained for up to 100 epochs, with early stopping after 10 epochs without improvement. 143 | \end{itemize} 144 | \item Method-1: Code2seq model with comments as the target sequence. 145 | \item Method-2: Same as Method-1, but with normalized variable names (VAR0, VAR1, ...) in the ASTs. 146 | \item Evaluation: BLEU-4 score. 147 | \end{itemize} 148 | \section{Results} 149 | %%%%%%%%%%%%%%%%%%%%%%%%%% 150 | \begin{table}[H] 151 | \centering 152 | \begin{tabular}{cc} 153 | \hline 154 | Approach & BLEU-4 score \\ \hline 155 | DeepCom & 38.17 \\ 156 | Method-1 & 6.08 \\ 157 | Method-2 & 10.02 \\ \hline 158 | \end{tabular} 159 | \caption{Evaluation results on Java methods} 160 | \label{tab:bleu-table} 161 | \end{table} 162 | %%%%%%%%%%%%%%%%%%%%%%%%%% 163 | 164 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 165 | \begin{figure}[H] 166 | \centering 167 | \includegraphics[width =\linewidth]{img/results_table.png} 168 | \caption{Comments generated by the models.} 169 | \label{fig:comments_gen} 170 | \end{figure} 171 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 172 | 173 | \section{Discussion} 174 | \begin{itemize} 175 | \item Probable reasons for the poor BLEU scores (Table \ref{tab:bleu-table}): 176 | \begin{itemize} 177 | \item The imbalanced distribution of target comment lengths in the dataset (Figure \ref{fig:data_dist}). 178 | \item The code2seq architecture was built to predict function names. 
179 | \end{itemize} 180 | \item The performance of Method-2 shows that normalized ASTs are a good solution to Out-of-Vocabulary problems. 181 | \item The model learnt syntactic and semantic concepts from the code (Figure \ref{fig:comments_gen}). 182 | \item However, it is incapable of generating longer comments ($>$6 words). 183 | \end{itemize} 184 | 185 | \section{Conclusion} 186 | \begin{itemize} 187 | \item Contributions: a code2seq-based AutoComments model, and an AST extraction scheme that addresses Out-of-Vocabulary problems. 188 | \item Future research: 189 | \begin{itemize} 190 | \item A dataset balanced w.r.t. target comment lengths. 191 | \item More experiments with the decoder, to generate better comments from the learnt code semantics and syntax. 192 | \end{itemize} 193 | \end{itemize} 194 | 195 | \end{document} 196 | -------------------------------------------------------------------------------- /preproc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/preproc/__init__.py -------------------------------------------------------------------------------- /preproc/common.py: -------------------------------------------------------------------------------- 1 | import re 2 | import subprocess 3 | import sys 4 | 5 | 6 | class Common: 7 | internal_delimiter = '|' 8 | SOS = '<S>' 9 | EOS = '</S>' 10 | PAD = '<PAD>' 11 | UNK = '<UNK>' 12 | 13 | @staticmethod 14 | def normalize_word(word): 15 | stripped = re.sub(r'[^a-zA-Z]', '', word) 16 | if len(stripped) == 0: 17 | return word.lower() 18 | else: 19 | return stripped.lower() 20 | 21 | @staticmethod 22 | def load_histogram(path, max_size=None): 23 | histogram = {} 24 | with open(path, 'r') as file: 25 | for line in file.readlines(): 26 | parts = line.split(' ') 27 | if len(parts) != 2: 28 | continue 29 | histogram[parts[0]] = int(parts[1]) 30 | sorted_histogram = [(k, histogram[k]) for k in sorted(histogram, key=histogram.get, reverse=True)] 31 | return dict(sorted_histogram[:max_size]) 32 | 33 | @staticmethod 34 | def load_vocab_from_dict(word_to_count, add_values=[], max_size=None): 35 | word_to_index, index_to_word = {}, {} 36 | current_index = 0 37 | for value in add_values: 38 | word_to_index[value] = current_index 39 | index_to_word[current_index] = value 40 | current_index += 1 41 | sorted_counts = [(k, word_to_count[k]) for k in sorted(word_to_count, key=word_to_count.get, reverse=True)] 42 | limited_sorted = dict(sorted_counts[:max_size]) 43 | for word, count in limited_sorted.items(): 44 | word_to_index[word] = current_index 45 | index_to_word[current_index] = word 46 | current_index += 1 47 | return word_to_index, index_to_word, current_index 48 | 49 | @staticmethod 50 | def binary_to_string(binary_string): 51 | return binary_string.decode("utf-8") 52 | 53 | @staticmethod 54 | def binary_to_string_list(binary_string_list): 55 | return [Common.binary_to_string(w) for w in binary_string_list] 56 | 57 | @staticmethod 58 | def binary_to_string_matrix(binary_string_matrix): 59 | return [Common.binary_to_string_list(l) for l in binary_string_matrix] 60 | 61 | @staticmethod 62 | def binary_to_string_3d(binary_string_tensor): 63 | return [Common.binary_to_string_matrix(l) for l in binary_string_tensor] 64 | 65 | @staticmethod 66 | def legal_method_names_checker(name): 67 | return name not in [Common.UNK, Common.PAD, Common.EOS] 68 | 69 | @staticmethod 70 | def filter_impossible_names(top_words): 71 | result = list(filter(Common.legal_method_names_checker, top_words)) 72 | 
return result 73 | 74 | @staticmethod 75 | def unique(sequence): 76 | unique = [] 77 | [unique.append(item) for item in sequence if item not in unique] 78 | return unique 79 | 80 | @staticmethod 81 | def parse_results(result, pc_info_dict, topk=5): 82 | prediction_results = {} 83 | results_counter = 0 84 | for single_method in result: 85 | original_name, top_suggestions, top_scores, attention_per_context = list(single_method) 86 | current_method_prediction_results = PredictionResults(original_name) 87 | if attention_per_context is not None: 88 | word_attention_pairs = [(word, attention) for word, attention in 89 | zip(top_suggestions, attention_per_context) if 90 | Common.legal_method_names_checker(word)] 91 | for predicted_word, attention_timestep in word_attention_pairs: 92 | current_timestep_paths = [] 93 | for context, attention in [(key, attention_timestep[key]) for key in 94 | sorted(attention_timestep, key=attention_timestep.get, reverse=True)][ 95 | :topk]: 96 | if context in pc_info_dict: 97 | pc_info = pc_info_dict[context] 98 | current_timestep_paths.append((attention.item(), pc_info)) 99 | 100 | current_method_prediction_results.append_prediction(predicted_word, current_timestep_paths) 101 | else: 102 | for predicted_seq in top_suggestions: 103 | filtered_seq = [word for word in predicted_seq if Common.legal_method_names_checker(word)] 104 | current_method_prediction_results.append_prediction(filtered_seq, None) 105 | 106 | prediction_results[results_counter] = current_method_prediction_results 107 | results_counter += 1 108 | return prediction_results 109 | 110 | @staticmethod 111 | def compute_bleu(ref_file_name, predicted_file_name): 112 | with open(predicted_file_name) as predicted_file: 113 | pipe = subprocess.Popen(["perl", "scripts/multi-bleu.perl", ref_file_name], stdin=predicted_file, 114 | stdout=sys.stdout, stderr=sys.stderr) 115 | 116 | 117 | class PredictionResults: 118 | def __init__(self, original_name): 119 | self.original_name = original_name 120 | self.predictions = list() 121 | 122 | def append_prediction(self, name, current_timestep_paths): 123 | self.predictions.append(SingleTimeStepPrediction(name, current_timestep_paths)) 124 | 125 | class SingleTimeStepPrediction: 126 | def __init__(self, prediction, attention_paths): 127 | self.prediction = prediction 128 | if attention_paths is not None: 129 | paths_with_scores = [] 130 | for attention_score, pc_info in attention_paths: 131 | path_context_dict = {'score': attention_score, 132 | 'path': pc_info.longPath, 133 | 'token1': pc_info.token1, 134 | 'token2': pc_info.token2} 135 | paths_with_scores.append(path_context_dict) 136 | self.attention_paths = paths_with_scores 137 | 138 | 139 | class PathContextInformation: 140 | def __init__(self, context): 141 | self.token1 = context['name1'] 142 | self.longPath = context['path'] 143 | self.shortPath = context['shortPath'] 144 | self.token2 = context['name2'] 145 | 146 | def __str__(self): 147 | return '%s,%s,%s' % (self.token1, self.shortPath, self.token2) -------------------------------------------------------------------------------- /preproc/feature_extractor.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import pandas as pd 4 | 5 | data_folder = "../data/" 6 | data_files = ["train", "test", "valid"] 7 | 8 | def save_dataset(feature, label, filename): 9 | print(f"Saving File - {filename}") 10 | dataset = np.array([feature, label]) 11 | dataset = np.transpose(dataset) 12 | 
pd.DataFrame(dataset).to_csv(data_folder + filename, index=False, header=False) 13 | 14 | for file in data_files: 15 | print(f"Extracting File - {file}") 16 | code_feat = [] 17 | nl_label = [] 18 | file_path = data_folder + file + ".json" 19 | with open(file_path, 'r') as f: 20 | for line in f: 21 | record = json.loads(line) # parse each JSON line only once 22 | code_feat.append(record["code"]) 23 | nl_label.append(record["nl"]) 24 | if len(code_feat) == 100: # keep only the first 100 snippets per split 25 | break 26 | save_dataset(code_feat, nl_label, file + ".csv") 27 | -------------------------------------------------------------------------------- /preproc/java_files_creator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import json 4 | import re 5 | import pickle 6 | 7 | # Raw Data - Folders & Files 8 | raw_data_files = ["train", "test", "valid"] 9 | raw_data_folder = "data/" 10 | 11 | # Processed java file locations 12 | base_folder = "java_code_" 13 | sub_folder = "data" 14 | 15 | # Get AST - Only First 100 code - boolean 16 | get_ast_full_file = False 17 | 18 | def save_code_in_javafile(to_write_path, code): 19 | f = open(to_write_path + "/Input.java", "w") 20 | f.write(code) 21 | f.close() 22 | 23 | def save_comment_in_txtfile(to_write_path, comment): 24 | f = open(to_write_path + "/comment.txt", "w") 25 | f.write(comment) 26 | f.close() 27 | 28 | def extract_replacements(to_write_path, code, comment): # rename parameters to VAR0, VAR1, ... and save the encode/decode maps 29 | varEncDict = {} 30 | varDecDict = {} 31 | codecopy = re.sub(r"\([A-Za-z]+<.*>", "(type ", code) # [A-Za-z] instead of [A-z], which also matches [ \ ] ^ _ ` 32 | codecopy = re.sub(r",[A-Za-z]+<.*>", ",type ", codecopy) 33 | codecopy = re.sub(r"@+.*(public|private|static)\s", "declaration", codecopy) 34 | decl = codecopy.split("\n")[0] 35 | varDecl = re.findall(r"\((.*?)\)", decl)[0] 36 | varList = varDecl.split(",") 37 | if (varList[0] == "" and len(varList) == 1): 38 | save_comment_in_txtfile(to_write_path, comment) 39 | save_code_in_javafile(to_write_path, code) 40 | return 41 | else: 42 | i = 0 43 | print(varList) 44 | for v in varList: 45 | name = v.split(" ")[-1] 46 | varEncDict[name] = "VAR" + str(i) 47 | varDecDict["VAR" + str(i)] = name 48 | i = i+1 49 | for name in varEncDict: 50 | code = code.replace(name, varEncDict[name]) 51 | comment = comment.replace(name, varEncDict[name]) 52 | 53 | save_comment_in_txtfile(to_write_path, comment) 54 | save_code_in_javafile(to_write_path, code) 55 | 56 | fEnc = open(to_write_path + "/encodeDict" , "wb") 57 | pickle.dump(varEncDict, fEnc) 58 | fEnc.close() 59 | fDec = open(to_write_path + "/decodeDict", "wb") 60 | pickle.dump(varDecDict, fDec) 61 | fDec.close() 62 | for file in raw_data_files: 63 | curr_base_folder = base_folder + file 64 | os.mkdir(curr_base_folder) 65 | print(f"Extracting File - {file}") 66 | file_path = raw_data_folder + file + ".json" 67 | with open(file_path, 'r') as f: 68 | for index, line in enumerate(f): 69 | if not get_ast_full_file and index == 100: 70 | break 71 | print(f"Writing Java Snippet No:{index}") 72 | to_write_path = curr_base_folder + '/' + sub_folder + str(index) 73 | os.mkdir(to_write_path) 74 | code = json.loads(line)["code"] 75 | comment = json.loads(line)["nl"] 76 | extract_replacements(to_write_path, code, comment) 77 | f.close() 78 | -------------------------------------------------------------------------------- /preproc/preprocess.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import pickle 3 | from argparse import ArgumentParser 4 | 5 | import numpy as np 6 | 7 | import common 8 | ''' 9 | This script preprocesses the 
data from MethodPaths. It truncates methods with too many contexts, 10 | and pads methods that have fewer paths with spaces. 11 | ''' 12 | 13 | 14 | def save_dictionaries(dataset_name, subtoken_to_count, node_to_count, target_to_count, max_contexts, num_examples): 15 | save_dict_file_path = '{}.dict.c2s'.format(dataset_name) 16 | with open(save_dict_file_path, 'wb') as file: 17 | pickle.dump(subtoken_to_count, file) 18 | pickle.dump(node_to_count, file) 19 | pickle.dump(target_to_count, file) 20 | pickle.dump(max_contexts, file) 21 | pickle.dump(num_examples, file) 22 | print('Dictionaries saved to: {}'.format(save_dict_file_path)) 23 | 24 | 25 | def process_file(file_path, data_file_role, dataset_name, max_contexts, max_data_contexts): 26 | sum_total = 0 27 | sum_sampled = 0 28 | total = 0 29 | max_unfiltered = 0 30 | max_contexts_to_sample = max_data_contexts if data_file_role == 'train' else max_contexts 31 | output_path = '{}.{}.c2s'.format(dataset_name, data_file_role) 32 | with open(output_path, 'w') as outfile: 33 | with open(file_path, 'r') as file: 34 | for line in file: 35 | print(line) # debug: echo each raw input line 36 | parts = line.rstrip('\n').split(' ') 37 | target_name = parts[0] 38 | contexts = parts[1:] 39 | 40 | if len(contexts) > max_unfiltered: 41 | max_unfiltered = len(contexts) 42 | 43 | sum_total += len(contexts) 44 | if len(contexts) > max_contexts_to_sample: 45 | contexts = np.random.choice(contexts, max_contexts_to_sample, replace=False) 46 | 47 | sum_sampled += len(contexts) 48 | 49 | csv_padding = " " * (max_data_contexts - len(contexts)) 50 | total += 1 51 | outfile.write(target_name + ' ' + " ".join(contexts) + csv_padding + '\n') 52 | 53 | print('File: ' + file_path) 54 | print('Average total contexts: ' + str(float(sum_total) / total)) 55 | print('Average final (after sampling) contexts: ' + str(float(sum_sampled) / total)) 56 | print('Total examples: ' + str(total)) 57 | print('Max number of contexts per word: ' + str(max_unfiltered)) 58 | return total 59 | 60 | 61 | def context_full_found(context_parts, word_to_count, path_to_count): 62 | return context_parts[0] in word_to_count \ 63 | and context_parts[1] in path_to_count and context_parts[2] in word_to_count 64 | 65 | 66 | def context_partial_found(context_parts, word_to_count, path_to_count): 67 | return context_parts[0] in word_to_count \ 68 | or context_parts[1] in path_to_count or context_parts[2] in word_to_count 69 | 70 | 71 | if __name__ == '__main__': 72 | parser = ArgumentParser() 73 | parser.add_argument("-trd", "--train_data", dest="train_data_path", 74 | help="path to training data file", required=True) 75 | parser.add_argument("-ted", "--test_data", dest="test_data_path", 76 | help="path to test data file", required=True) 77 | parser.add_argument("-vd", "--val_data", dest="val_data_path", 78 | help="path to validation data file", required=True) 79 | parser.add_argument("-mc", "--max_contexts", dest="max_contexts", default=200, 80 | help="number of max contexts to keep in test+validation", required=False) 81 | parser.add_argument("-mdc", "--max_data_contexts", dest="max_data_contexts", default=1000, 82 | help="number of max contexts to keep in the dataset", required=False) 83 | parser.add_argument("-svs", "--subtoken_vocab_size", dest="subtoken_vocab_size", default=186277, 84 | help="Max number of source subtokens to keep in the vocabulary", required=False) 85 | parser.add_argument("-tvs", "--target_vocab_size", dest="target_vocab_size", default=26347, 86 | help="Max number of target words to keep in the vocabulary", 
required=False) 87 | parser.add_argument("-sh", "--subtoken_histogram", dest="subtoken_histogram", 88 | help="subtoken histogram file", metavar="FILE", required=True) 89 | parser.add_argument("-nh", "--node_histogram", dest="node_histogram", 90 | help="node_histogram file", metavar="FILE", required=True) 91 | parser.add_argument("-th", "--target_histogram", dest="target_histogram", 92 | help="target histogram file", metavar="FILE", required=True) 93 | parser.add_argument("-o", "--output_name", dest="output_name", 94 | help="output name - the base name for the created dataset", required=True, default='data') 95 | args = parser.parse_args() 96 | 97 | train_data_path = args.train_data_path 98 | test_data_path = args.test_data_path 99 | val_data_path = args.val_data_path 100 | subtoken_histogram_path = args.subtoken_histogram 101 | node_histogram_path = args.node_histogram 102 | 103 | subtoken_to_count = common.Common.load_histogram(subtoken_histogram_path, 104 | max_size=int(args.subtoken_vocab_size)) 105 | node_to_count = common.Common.load_histogram(node_histogram_path, 106 | max_size=None) 107 | target_to_count = common.Common.load_histogram(args.target_histogram, 108 | max_size=int(args.target_vocab_size)) 109 | print('subtoken vocab size: ', len(subtoken_to_count)) 110 | print('node vocab size: ', len(node_to_count)) 111 | print('target vocab size: ', len(target_to_count)) 112 | 113 | num_training_examples = 0 114 | for data_file_path, data_role in zip([test_data_path, val_data_path, train_data_path], ['test', 'val', 'train']): 115 | num_examples = process_file(file_path=data_file_path, data_file_role=data_role, dataset_name=args.output_name, 116 | max_contexts=int(args.max_contexts), max_data_contexts=int(args.max_data_contexts)) 117 | if data_role == 'train': 118 | num_training_examples = num_examples 119 | 120 | save_dictionaries(dataset_name=args.output_name, subtoken_to_count=subtoken_to_count, 121 | node_to_count=node_to_count, target_to_count=target_to_count, 122 | max_contexts=int(args.max_data_contexts), num_examples=num_training_examples) 123 | -------------------------------------------------------------------------------- /preproc/preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | ########################################################### 3 | # Change the following values to preproc a new dataset. 4 | # TRAIN_DIR, VAL_DIR and TEST_DIR should be paths to 5 | # directories containing sub-directories with .java files 6 | # DATASET_NAME is just a name for the currently extracted 7 | # dataset. 8 | # MAX_DATA_CONTEXTS is the number of contexts to keep in the dataset for each 9 | # method (by default 1000). At training time, these contexts 10 | # will be downsampled dynamically to MAX_CONTEXTS. 11 | # MAX_CONTEXTS - the number of actual contexts (by default 200) 12 | # that are taken into consideration (out of MAX_DATA_CONTEXTS) 13 | # every training iteration. To avoid randomness at test time, 14 | # for the test and validation sets only MAX_CONTEXTS contexts are kept 15 | # (while for training, MAX_DATA_CONTEXTS are kept and MAX_CONTEXTS are 16 | # selected dynamically during training). 17 | # SUBTOKEN_VOCAB_SIZE, TARGET_VOCAB_SIZE - 18 | # - the number of subtokens and target words to keep 19 | # in the vocabulary (the top occurring words and paths will be kept). 20 | # NUM_THREADS - the number of parallel threads to use. 
It is 21 | # recommended to use a multi-core machine for the preprocessing 22 | # step and set this value to the number of cores. 23 | # PYTHON - python3 interpreter alias. 24 | TRAIN_DIR=java_code_train 25 | VAL_DIR=java_code_valid 26 | TEST_DIR=java_code_test 27 | 28 | DATASET_NAME=auto_comment_dataset 29 | 30 | MAX_DATA_CONTEXTS=1000 31 | MAX_CONTEXTS=200 32 | SUBTOKEN_VOCAB_SIZE=186277 33 | TARGET_VOCAB_SIZE=26347 34 | NUM_THREADS=32 35 | PYTHON=python3 36 | ########################################################### 37 | 38 | TRAIN_DATA_FILE=${DATASET_NAME}.train.raw.txt 39 | VAL_DATA_FILE=${DATASET_NAME}.val.raw.txt 40 | TEST_DATA_FILE=${DATASET_NAME}.test.raw.txt 41 | EXTRACTOR_JAR=JavaExtractor/JPredict/target/JavaExtractor-0.0.1-SNAPSHOT.jar 42 | #EXTRACTOR_JAR=JavaExtractor/JPredict/target/JavaExtractor.jar 43 | 44 | mkdir -p data 45 | mkdir -p data/${DATASET_NAME} 46 | 47 | echo "Extracting paths from validation set..." 48 | ${PYTHON} JavaExtractor/extract.py --dir ${VAL_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --jar ${EXTRACTOR_JAR} > ${VAL_DATA_FILE} 2>> error_log.txt 49 | echo "Finished extracting paths from validation set" 50 | echo "Extracting paths from test set..." 51 | ${PYTHON} JavaExtractor/extract.py --dir ${TEST_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --jar ${EXTRACTOR_JAR} > ${TEST_DATA_FILE} 2>> error_log.txt 52 | echo "Finished extracting paths from test set" 53 | echo "Extracting paths from training set..." 54 | ${PYTHON} JavaExtractor/extract.py --dir ${TRAIN_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --jar ${EXTRACTOR_JAR} > ${TRAIN_DATA_FILE} 2>> error_log.txt 55 | echo "Finished extracting paths from training set" 56 | 57 | TARGET_HISTOGRAM_FILE=data/${DATASET_NAME}/${DATASET_NAME}.histo.tgt.c2s 58 | SOURCE_SUBTOKEN_HISTOGRAM=data/${DATASET_NAME}/${DATASET_NAME}.histo.ori.c2s 59 | NODE_HISTOGRAM_FILE=data/${DATASET_NAME}/${DATASET_NAME}.histo.node.c2s 60 | 61 | echo "Creating histograms from the training data" 62 | cat ${TRAIN_DATA_FILE} | cut -d' ' -f1 | tr '|' '\n' | awk '{n[$0]++} END {for (i in n) print i,n[i]}' > ${TARGET_HISTOGRAM_FILE} 63 | cat ${TRAIN_DATA_FILE} | cut -d' ' -f2- | tr ' ' '\n' | cut -d',' -f1,3 | tr ',|' '\n' | awk '{n[$0]++} END {for (i in n) print i,n[i]}' > ${SOURCE_SUBTOKEN_HISTOGRAM} 64 | cat ${TRAIN_DATA_FILE} | cut -d' ' -f2- | tr ' ' '\n' | cut -d',' -f2 | tr '|' '\n' | awk '{n[$0]++} END {for (i in n) print i,n[i]}' > ${NODE_HISTOGRAM_FILE} 65 | 66 | ${PYTHON} preproc/preprocess.py --train_data ${TRAIN_DATA_FILE} --test_data ${TEST_DATA_FILE} --val_data ${VAL_DATA_FILE} \ 67 | --max_contexts ${MAX_CONTEXTS} --max_data_contexts ${MAX_DATA_CONTEXTS} --subtoken_vocab_size ${SUBTOKEN_VOCAB_SIZE} \ 68 | --target_vocab_size ${TARGET_VOCAB_SIZE} --subtoken_histogram ${SOURCE_SUBTOKEN_HISTOGRAM} \ 69 | --node_histogram ${NODE_HISTOGRAM_FILE} --target_histogram ${TARGET_HISTOGRAM_FILE} --output_name data/${DATASET_NAME}/${DATASET_NAME} 70 | 71 | # If all went well, the raw data files can be deleted, because preproc.py creates new files 72 | # with truncated and padded number of paths for each example. 
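# For orientation: since --output_name above is data/${DATASET_NAME}/${DATASET_NAME}, this step leaves
# ${DATASET_NAME}.train.c2s, ${DATASET_NAME}.val.c2s, ${DATASET_NAME}.test.c2s and the pickled
# ${DATASET_NAME}.dict.c2s inside data/${DATASET_NAME}/ -- exactly the files that
# code2seq_master/train.sh expects under its --data prefix.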
73 | rm ${TRAIN_DATA_FILE} ${VAL_DATA_FILE} ${TEST_DATA_FILE} ${TARGET_HISTOGRAM_FILE} ${SOURCE_SUBTOKEN_HISTOGRAM} \ 74 | ${NODE_HISTOGRAM_FILE} 75 | -------------------------------------------------------------------------------- /presentation/AutoComments_Presentation-Group3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/presentation/AutoComments_Presentation-Group3.pdf -------------------------------------------------------------------------------- /presentation/link_to_presentation.txt: -------------------------------------------------------------------------------- 1 | The link to the presentation is: https://docs.google.com/presentation/d/1cNpiHHCmrLX-c2bckLLt2Ko3dJJmXe8CtRjA_8o7R-c/edit?usp=sharing -------------------------------------------------------------------------------- /report/ML4SE_group_3_report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/report/ML4SE_group_3_report.pdf -------------------------------------------------------------------------------- /report/latex_code/BasicEncoderDecoder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/report/latex_code/BasicEncoderDecoder.png -------------------------------------------------------------------------------- /report/latex_code/BiLSTM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/report/latex_code/BiLSTM.png -------------------------------------------------------------------------------- /report/latex_code/Embedding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/report/latex_code/Embedding.png -------------------------------------------------------------------------------- /report/latex_code/Encoder(1).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/report/latex_code/Encoder(1).png -------------------------------------------------------------------------------- /report/latex_code/ExampleAST.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/report/latex_code/ExampleAST.png -------------------------------------------------------------------------------- /report/latex_code/LSTM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/report/latex_code/LSTM.png -------------------------------------------------------------------------------- /report/latex_code/blueprints.tex: -------------------------------------------------------------------------------- 1 | Table: 2 | 3 | \begin{table}[H] 4 | \centering 5 | \begin{tabular}{lc} 6 | \noalign{\smallskip} \hline \hline \noalign{\smallskip} 7 | Parameter & Value \\ \hline 8 | Dropout 1 & 0.71 \\ 9 | Dropout 2 & 0.15 \\ 10 | Receptive 
field 1 & 3 \\ 11 | Receptive field 2 & 2 \\ 12 | Stride size 1 & 2 \\ 13 | Stride size 2 & 1 \\ 14 | Dense & 50 \\ \hline 15 | \end{tabular} 16 | \caption{Hyperparameters Convolutional Recurrent model} 17 | \label{Table:hyperconvrec} 18 | \end{table} 19 | 20 | 21 | Image: 22 | 23 | % 24 | \begin{figure}[H] 25 | \centering 26 | \includegraphics[width=\linewidth]{graphs/imbalance.png} 27 | \caption{High class imbalance} 28 | \label{fig:imb} 29 | \end{figure} 30 | % -------------------------------------------------------------------------------- /report/latex_code/distr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/report/latex_code/distr.png -------------------------------------------------------------------------------- /report/latex_code/example.java: -------------------------------------------------------------------------------- 1 | public static int add(int VAR0, int VAR1) { 2 | return VAR0 + VAR1; 3 | } 4 | -------------------------------------------------------------------------------- /report/latex_code/source-code/1.java: -------------------------------------------------------------------------------- 1 | public static byte[] bitmapToByte(Bitmap b){ 2 | ByteArrayOutputStream o = new ByteArrayOutputStream(); 3 | b.compress(Bitmap.CompressFormat.PNG,100,o); 4 | return o.toByteArray(); 5 | } -------------------------------------------------------------------------------- /report/latex_code/source-code/2.java: -------------------------------------------------------------------------------- 1 | private static void addDefaultProfile(App app ,Simple source){ 2 | if(!source.containsProperty("spring.profiles.active") 3 | && !System.getenv().containsKey("ACTIVE")){ 4 | app.setAdditionalProfiles(Constants.DEVELOPMENT); 5 | } 6 | } -------------------------------------------------------------------------------- /report/latex_code/source-code/3.java: -------------------------------------------------------------------------------- 1 | protected void createItemsLayout(){ 2 | if (mItemsLayout == null){ 3 | mItemsLayout=new LinearLayout(getContext()); 4 | mItemsLayout.setOrientation(LinearLayout.VERTICAL); 5 | } 6 | } -------------------------------------------------------------------------------- /report/latex_code/source-code/4.java: -------------------------------------------------------------------------------- 1 | public FactoryConfigurationError(Exception e){ 2 | super(e.toString()); 3 | this.exception=e; 4 | } -------------------------------------------------------------------------------- /report/latex_code/source-code/5.java: -------------------------------------------------------------------------------- 1 | public static void sort(Comparable[] a){ 2 | int n=a.length; 3 | for (int i=1; i < n; i++){ 4 | Comparable v=a[i]; 5 | int lo=0, hi=i; 6 | while (lo < hi){ ... } 7 | ... 
8 | } 9 | assert isSorted(a); 10 | } -------------------------------------------------------------------------------- /report/latex_code/source-code/6.java: -------------------------------------------------------------------------------- 1 | public boolean isEmpty(){ 2 | return root == null; 3 | } -------------------------------------------------------------------------------- /report/latex_code/source-code/7.java: -------------------------------------------------------------------------------- 1 | public boolean contains(int key){ 2 | return rank(key) != -1; 3 | } -------------------------------------------------------------------------------- /report/latex_code/source-code/8.java: -------------------------------------------------------------------------------- 1 | public void tag(String inputFileName,String outputFileName){ 2 | List<String> sentences=jsc.textFile(inputFileName).collect(); 3 | tag(sentences,outputFileName); 4 | } -------------------------------------------------------------------------------- /report/latex_code/source-code/9.java: -------------------------------------------------------------------------------- 1 | public void unlisten(String pattern){ 2 | UtilListener listener=listeners.get(pattern); 3 | if(listener!=null){ 4 | listener.destroy(); 5 | listeners.remove(pattern); 6 | }else{ 7 | client.onError(Topic.RECORD, Event.NOT_LISTENING,pattern); 8 | } 9 | } -------------------------------------------------------------------------------- /report/latex_code/zoomedInLength.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/report/latex_code/zoomedInLength.png -------------------------------------------------------------------------------- /scripts/multi-bleu.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 
5 | 6 | # $Id$ 7 | use warnings; 8 | use strict; 9 | 10 | my $lowercase = 0; 11 | if ($ARGV[0] eq "-lc") { 12 | $lowercase = 1; 13 | shift; 14 | } 15 | 16 | my $stem = $ARGV[0]; 17 | if (!defined $stem) { 18 | print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n"; 19 | print STDERR "Reads the references from reference or reference0, reference1, ...\n"; 20 | exit(1); 21 | } 22 | 23 | $stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0"; 24 | 25 | my @REF; 26 | my $ref=0; 27 | while(-e "$stem$ref") { 28 | &add_to_ref("$stem$ref",\@REF); 29 | $ref++; 30 | } 31 | &add_to_ref($stem,\@REF) if -e $stem; 32 | die("ERROR: could not find reference file $stem") unless scalar @REF; 33 | 34 | # add additional references explicitly specified on the command line 35 | shift; 36 | foreach my $stem (@ARGV) { 37 | &add_to_ref($stem,\@REF) if -e $stem; 38 | } 39 | 40 | 41 | 42 | sub add_to_ref { 43 | my ($file,$REF) = @_; 44 | my $s=0; 45 | if ($file =~ /.gz$/) { 46 | open(REF,"gzip -dc $file|") or die "Can't read $file"; 47 | } else { 48 | open(REF,$file) or die "Can't read $file"; 49 | } 50 | while(<REF>) { 51 | chomp; 52 | push @{$$REF[$s++]}, $_; 53 | } 54 | close(REF); 55 | } 56 | 57 | my(@CORRECT,@TOTAL,$length_translation,$length_reference); 58 | my $s=0; 59 | while(<STDIN>) { 60 | chomp; 61 | $_ = lc if $lowercase; 62 | my @WORD = split; 63 | my %REF_NGRAM = (); 64 | my $length_translation_this_sentence = scalar(@WORD); 65 | my ($closest_diff,$closest_length) = (9999,9999); 66 | foreach my $reference (@{$REF[$s]}) { 67 | # print "$s $_ <=> $reference\n"; 68 | $reference = lc($reference) if $lowercase; 69 | my @WORD = split(' ',$reference); 70 | my $length = scalar(@WORD); 71 | my $diff = abs($length_translation_this_sentence-$length); 72 | if ($diff < $closest_diff) { 73 | $closest_diff = $diff; 74 | $closest_length = $length; 75 | # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n"; 76 | } elsif ($diff == $closest_diff) { 77 | $closest_length = $length if $length < $closest_length; 78 | # from two references with the same closeness to me 79 | # take the *shorter* into account, not the "first" one. 80 | } 81 | for(my $n=1;$n<=4;$n++) { 82 | my %REF_NGRAM_N = (); 83 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 84 | my $ngram = "$n"; 85 | for(my $w=0;$w<$n;$w++) { 86 | $ngram .= " ".$WORD[$start+$w]; 87 | } 88 | $REF_NGRAM_N{$ngram}++; 89 | } 90 | foreach my $ngram (keys %REF_NGRAM_N) { 91 | if (!defined($REF_NGRAM{$ngram}) || 92 | $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) { 93 | $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram}; 94 | # print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}<br>\n"; 95 | } 96 | } 97 | } 98 | } 99 | $length_translation += $length_translation_this_sentence; 100 | $length_reference += $closest_length; 101 | for(my $n=1;$n<=4;$n++) { 102 | my %T_NGRAM = (); 103 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 104 | my $ngram = "$n"; 105 | for(my $w=0;$w<$n;$w++) { 106 | $ngram .= " ".$WORD[$start+$w]; 107 | } 108 | $T_NGRAM{$ngram}++; 109 | } 110 | foreach my $ngram (keys %T_NGRAM) { 111 | $ngram =~ /^(\d+) /; 112 | my $n = $1; 113 | # my $corr = 0; 114 | # print "$i e $ngram $T_NGRAM{$ngram}<br>\n"; 115 | $TOTAL[$n] += $T_NGRAM{$ngram}; 116 | if (defined($REF_NGRAM{$ngram})) { 117 | if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) { 118 | $CORRECT[$n] += $T_NGRAM{$ngram}; 119 | # $corr = $T_NGRAM{$ngram}; 120 | # print "$i e correct1 $T_NGRAM{$ngram}<br>\n"; 121 | } 122 | else { 123 | $CORRECT[$n] += $REF_NGRAM{$ngram}; 124 | # $corr = $REF_NGRAM{$ngram}; 125 | # print "$i e correct2 $REF_NGRAM{$ngram}<br>\n"; 126 | } 127 | } 128 | # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram}; 129 | # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n" 130 | } 131 | } 132 | $s++; 133 | } 134 | my $brevity_penalty = 1; 135 | my $bleu = 0; 136 | 137 | my @bleu=(); 138 | 139 | for(my $n=1;$n<=4;$n++) { 140 | if (defined ($TOTAL[$n])){ 141 | $bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0; 142 | # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n"; 143 | }else{ 144 | $bleu[$n]=0; 145 | } 146 | } 147 | 148 | if ($length_reference==0){ 149 | printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n"; 150 | exit(1); 151 | } 152 | 153 | if ($length_translation<$length_reference) { 154 | $brevity_penalty = exp(1-$length_reference/$length_translation); 155 | } 156 | $bleu = $brevity_penalty * exp((my_log( $bleu[1] ) + 157 | my_log( $bleu[2] ) + 158 | my_log( $bleu[3] ) + 159 | my_log( $bleu[4] ) ) / 4) ; 160 | printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n", 161 | 100*$bleu, 162 | 100*$bleu[1], 163 | 100*$bleu[2], 164 | 100*$bleu[3], 165 | 100*$bleu[4], 166 | $brevity_penalty, 167 | $length_translation / $length_reference, 168 | $length_translation, 169 | $length_reference; 170 | 171 | 172 | print STDERR "It is not advisable to publish scores from multi-bleu.perl. The scores depend on your tokenizer, which is unlikely to be reproducible from your paper or consistent across research groups. Instead you should detokenize then use mteval-v14.pl, which has a standard tokenization. Scores from multi-bleu.perl can still be used for internal purposes when you have a consistent tokenizer.\n"; 173 | 174 | sub my_log { 175 | return -9999999999 unless $_[0]; 176 | return log($_[0]); 177 | } --------------------------------------------------------------------------------