├── .gitignore ├── .vscode └── settings.json ├── AST_GEN.md ├── AutoComment_ResearchPaper.pdf ├── JavaExtractor ├── JPredict │ ├── .classpath │ ├── .gitignore │ ├── .project │ ├── .settings │ │ ├── org.eclipse.core.resources.prefs │ │ ├── org.eclipse.jdt.apt.core.prefs │ │ └── org.eclipse.jdt.core.prefs │ ├── JavaExtractor (1).iml │ ├── JavaExtractor.iml │ ├── dependency-reduced-pom.xml │ ├── error_log.txt │ ├── pom.xml │ └── src │ │ └── main │ │ └── java │ │ ├── JavaExtractor │ │ ├── App.java │ │ ├── Common │ │ │ ├── CommandLineValues.java │ │ │ ├── Common.java │ │ │ └── MethodContent.java │ │ ├── ExtractFeaturesTask.java │ │ ├── FeatureExtractor.java │ │ ├── FeaturesEntities │ │ │ ├── ProgramFeatures.java │ │ │ ├── ProgramRelation.java │ │ │ └── Property.java │ │ ├── Main.java │ │ └── Visitors │ │ │ ├── FunctionVisitor.java │ │ │ └── LeavesCollectorVisitor.java │ │ └── Test.java └── extract.py ├── README.md ├── bleu_score.py ├── code2seq_master ├── .gitignore ├── CSharpExtractor │ ├── .gitattributes │ ├── .gitignore │ ├── CSharpExtractor │ │ ├── .nuget │ │ │ └── packages.config │ │ ├── CSharpExtractor.sln │ │ └── Extractor │ │ │ ├── Extractor.cs │ │ │ ├── Extractor.csproj │ │ │ ├── PathFinder.cs │ │ │ ├── Program.cs │ │ │ ├── Properties │ │ │ └── launchSettings.json │ │ │ ├── Temp.cs │ │ │ ├── Tree │ │ │ └── Tree.cs │ │ │ ├── Utilities.cs │ │ │ └── Variable.cs │ └── extract.py ├── Input.java ├── JavaExtractor │ ├── JPredict │ │ ├── .classpath │ │ ├── .gitignore │ │ └── src │ │ │ └── main │ │ │ └── java │ │ │ ├── JavaExtractor │ │ │ ├── App.java │ │ │ ├── Common │ │ │ │ ├── CommandLineValues.java │ │ │ │ ├── Common.java │ │ │ │ └── MethodContent.java │ │ │ ├── ExtractFeaturesTask.java │ │ │ ├── FeatureExtractor.java │ │ │ ├── FeaturesEntities │ │ │ │ ├── ProgramFeatures.java │ │ │ │ ├── ProgramRelation.java │ │ │ │ └── Property.java │ │ │ └── Visitors │ │ │ │ ├── FunctionVisitor.java │ │ │ │ └── LeavesCollectorVisitor.java │ │ │ └── Test.java │ └── extract.py ├── LICENSE ├── README.md ├── __init__.py ├── baseline_tokenization │ ├── input_example.txt │ ├── javalang │ │ ├── __init__.py │ │ ├── ast.py │ │ ├── javadoc.py │ │ ├── parse.py │ │ ├── parser.py │ │ ├── test │ │ │ ├── __init__.py │ │ │ ├── source │ │ │ │ └── package-info │ │ │ │ │ ├── AnnotationJavadoc.java │ │ │ │ │ ├── AnnotationOnly.java │ │ │ │ │ ├── JavadocAnnotation.java │ │ │ │ │ ├── JavadocOnly.java │ │ │ │ │ └── NoAnnotationNoJavadoc.java │ │ │ ├── test_java_8_syntax.py │ │ │ ├── test_javadoc.py │ │ │ ├── test_package_declaration.py │ │ │ └── test_util.py │ │ ├── tokenizer.py │ │ ├── tree.py │ │ └── util.py │ └── subtokenize_nmt_baseline.py ├── code2seq.py ├── code2seq_ast_extractor.py ├── common.py ├── config.py ├── extract_ast.py ├── extractor.py ├── images │ └── network.png ├── init.py ├── interactive_predict.py ├── java_files_creator.py ├── model.py ├── preprocess.py ├── preprocess.sh ├── preprocess_csharp.sh ├── preprocess_custom.sh ├── reader.py ├── test_extracted_ast.py └── train.sh ├── data ├── .gitignore └── data.7z ├── images ├── network_architecture.png └── pipeline.png ├── poster ├── ML4SE_Poster_Group_3.pdf └── source_code │ ├── example.java │ ├── img │ ├── Embedding.png │ ├── TU_P1_full-color.png │ ├── distr.png │ ├── link_to_github.png │ ├── results_table.png │ └── zoomedInLength.png │ ├── poster.tex │ └── tudelftposter.cls ├── preproc ├── __init__.py ├── common.py ├── feature_extractor.py ├── java_files_creator.py ├── preprocess.py └── preprocess.sh ├── presentation ├── AutoComments_Presentation-Group3.pdf 
└── link_to_presentation.txt ├── report ├── ML4SE_group_3_report.pdf └── latex_code │ ├── BasicEncoderDecoder.png │ ├── BiLSTM.png │ ├── Embedding.png │ ├── Encoder(1).png │ ├── ExampleAST.png │ ├── LSTM.png │ ├── blueprints.tex │ ├── distr.png │ ├── example.java │ ├── main.tex │ ├── reference.bib │ ├── source-code │ ├── 1.java │ ├── 2.java │ ├── 3.java │ ├── 4.java │ ├── 5.java │ ├── 6.java │ ├── 7.java │ ├── 8.java │ └── 9.java │ └── zoomedInLength.png └── scripts └── multi-bleu.perl /.gitignore: -------------------------------------------------------------------------------- 1 | /data/*.json 2 | code2vec_base 3 | .idea 4 | code2vec_model 5 | tmp 6 | code2seq-master/java_code_valid 7 | code2seq-master/java_code_train 8 | code2seq-master/java_code_test 9 | data/auto_comment_dataset 10 | code2seq-master 11 | apnews_dbow -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "java.configuration.updateBuildConfiguration": "automatic", 3 | "files.exclude": { 4 | "**/.classpath": true, 5 | "**/.project": true, 6 | "**/.settings": true, 7 | "**/.factorypath": true 8 | } 9 | } -------------------------------------------------------------------------------- /AST_GEN.md: -------------------------------------------------------------------------------- 1 | # AutoComments 2 | 3 | ## Generation of AST - Steps: 4 | 5 | Before anything else, extract 'data.7z' in the data folder. 6 | 7 | STEP 1 - Run: python preproc/java_files_creator.py 8 | 9 | STEP 2 - Run: bash preproc/preprocess_custom.sh 10 | 11 | RESULTS: 12 | The ASTs for the test, train, and valid splits are written to the folder data/auto_comment_dataset. 13 | 14 | NOTE: 15 | 1. Run all the above steps from the project's root directory. 16 | 2. Toggle the boolean "get_ast_full_file" to extract ASTs for the full dataset or only the first 100 code snippets: 17 | i. True -> Runs for the full dataset 18 | ii. 
False -> Runs for the first 100 code snippets 19 | -------------------------------------------------------------------------------- /AutoComment_ResearchPaper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/AutoComment_ResearchPaper.pdf -------------------------------------------------------------------------------- /JavaExtractor/JPredict/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/.gitignore: -------------------------------------------------------------------------------- 1 | /target/ -------------------------------------------------------------------------------- /JavaExtractor/JPredict/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | JavaExtractor 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | org.eclipse.m2e.core.maven2Builder 15 | 16 | 17 | 18 | 19 | 20 | org.eclipse.jdt.core.javanature 21 | org.eclipse.m2e.core.maven2Nature 22 | 23 | 24 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/.settings/org.eclipse.core.resources.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | encoding//src/main/java=UTF-8 3 | encoding/=UTF-8 4 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/.settings/org.eclipse.jdt.apt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.apt.aptEnabled=false 3 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 3 | org.eclipse.jdt.core.compiler.compliance=1.8 4 | org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled 5 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning 6 | org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=ignore 7 | org.eclipse.jdt.core.compiler.processAnnotations=disabled 8 | org.eclipse.jdt.core.compiler.release=disabled 9 | org.eclipse.jdt.core.compiler.source=1.8 10 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/JavaExtractor (1).iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/JavaExtractor.iml: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/dependency-reduced-pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | JavaExtractor 5 | JavaExtractor 6 | JPredict 7 | 
0.0.1-SNAPSHOT 8 | http://maven.apache.org 9 | 10 | 11 | 12 | maven-compiler-plugin 13 | 3.2 14 | 15 | 1.8 16 | 1.8 17 | 18 | Test.java 19 | 20 | 21 | 22 | 23 | maven-shade-plugin 24 | 2.1 25 | 26 | 27 | package 28 | 29 | shade 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | UTF-8 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/error_log.txt: -------------------------------------------------------------------------------- 1 | /usr/local/Cellar/python/3.7.2_2/Frameworks/Python.framework/Versions/3.7/Resources/Python.app/Contents/MacOS/Python: can't open file 'JavaExtractor/extract.py': [Errno 2] No such file or directory 2 | /usr/local/Cellar/python/3.7.2_2/Frameworks/Python.framework/Versions/3.7/Resources/Python.app/Contents/MacOS/Python: can't open file 'JavaExtractor/extract.py': [Errno 2] No such file or directory 3 | /usr/local/Cellar/python/3.7.2_2/Frameworks/Python.framework/Versions/3.7/Resources/Python.app/Contents/MacOS/Python: can't open file 'JavaExtractor/extract.py': [Errno 2] No such file or directory 4 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 4.0.0 5 | JavaExtractor 6 | JavaExtractor 7 | JPredict 8 | 0.0.1-SNAPSHOT 9 | http://maven.apache.org 10 | 11 | 12 | 13 | maven-compiler-plugin 14 | 3.2 15 | 16 | 1.8 17 | 1.8 18 | 19 | Test.java 20 | 21 | 22 | 23 | 24 | maven-shade-plugin 25 | 2.1 26 | 27 | 28 | package 29 | 30 | shade 31 | 32 | 33 | 34 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | com.github.javaparser 47 | javaparser-core 48 | 3.0.0-alpha.4 49 | 50 | 51 | commons-io 52 | commons-io 53 | 1.3.2 54 | compile 55 | 56 | 57 | com.fasterxml.jackson.core 58 | jackson-databind 59 | 2.9.8 60 | 61 | 62 | args4j 63 | args4j 64 | 2.33 65 | 66 | 67 | org.apache.commons 68 | commons-lang3 69 | 3.5 70 | 71 | 72 | 73 | UTF-8 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/src/main/java/JavaExtractor/App.java: -------------------------------------------------------------------------------- 1 | package JavaExtractor; 2 | 3 | import JavaExtractor.Common.CommandLineValues; 4 | import org.kohsuke.args4j.CmdLineException; 5 | 6 | import java.io.IOException; 7 | import java.nio.file.Files; 8 | import java.nio.file.Paths; 9 | import java.util.LinkedList; 10 | import java.util.List; 11 | import java.util.concurrent.ExecutionException; 12 | import java.util.concurrent.Executors; 13 | import java.util.concurrent.Future; 14 | import java.util.concurrent.ThreadPoolExecutor; 15 | 16 | public class App { 17 | private static CommandLineValues s_CommandLineValues; 18 | 19 | public static void main(String[] args) { 20 | try { 21 | s_CommandLineValues = new CommandLineValues(args); 22 | } catch (CmdLineException e) { 23 | e.printStackTrace(); 24 | return; 25 | } 26 | 27 | if (s_CommandLineValues.File != null) { 28 | ExtractFeaturesTask extractFeaturesTask = new ExtractFeaturesTask(s_CommandLineValues, 29 | s_CommandLineValues.File.toPath()); 30 | extractFeaturesTask.processFile(); 31 | } else if (s_CommandLineValues.Dir != null) { 32 | extractDir(); 33 | } 34 | } 35 | 36 | private static void extractDir() { 37 | ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(s_CommandLineValues.NumThreads); 38 | LinkedList tasks = new 
LinkedList<>(); 39 | try { 40 | Files.walk(Paths.get(s_CommandLineValues.Dir)).filter(Files::isRegularFile) 41 | .filter(p -> p.toString().toLowerCase().endsWith(".java")).forEach(f -> { 42 | ExtractFeaturesTask task = new ExtractFeaturesTask(s_CommandLineValues, f); 43 | tasks.add(task); 44 | }); 45 | } catch (IOException e) { 46 | e.printStackTrace(); 47 | return; 48 | } 49 | List<Future<Void>> tasksResults = null; 50 | try { 51 | tasksResults = executor.invokeAll(tasks); 52 | } catch (InterruptedException e) { 53 | e.printStackTrace(); 54 | } finally { 55 | executor.shutdown(); 56 | } 57 | tasksResults.forEach(f -> { 58 | try { 59 | f.get(); 60 | } catch (InterruptedException | ExecutionException e) { 61 | e.printStackTrace(); 62 | } 63 | }); 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/CommandLineValues.java: -------------------------------------------------------------------------------- 1 | package JavaExtractor.Common; 2 | 3 | import org.kohsuke.args4j.CmdLineException; 4 | import org.kohsuke.args4j.CmdLineParser; 5 | import org.kohsuke.args4j.Option; 6 | 7 | import java.io.File; 8 | 9 | /** 10 | * This class handles the program's arguments. 11 | */ 12 | public class CommandLineValues { 13 | @Option(name = "--file", required = false) 14 | public File File = null; 15 | 16 | @Option(name = "--dir", required = false, forbids = "--file") 17 | public String Dir = null; 18 | 19 | @Option(name = "--max_path_length", required = true) 20 | public int MaxPathLength; 21 | 22 | @Option(name = "--max_path_width", required = true) 23 | public int MaxPathWidth; 24 | 25 | @Option(name = "--num_threads", required = false) 26 | public int NumThreads = 64; 27 | 28 | @Option(name = "--min_code_len", required = false) 29 | public int MinCodeLength = 1; 30 | 31 | @Option(name = "--max_code_len", required = false) 32 | public int MaxCodeLength = -1; 33 | 34 | @Option(name = "--max_file_len", required = false) 35 | public int MaxFileLength = -1; 36 | 37 | @Option(name = "--pretty_print", required = false) 38 | public boolean PrettyPrint = false; 39 | 40 | @Option(name = "--max_child_id", required = false) 41 | public int MaxChildId = 3; 42 | 43 | public CommandLineValues(String... 
args) throws CmdLineException { 44 | CmdLineParser parser = new CmdLineParser(this); 45 | try { 46 | parser.parseArgument(args); 47 | } catch (CmdLineException e) { 48 | System.err.println(e.getMessage()); 49 | parser.printUsage(System.err); 50 | throw e; 51 | } 52 | } 53 | 54 | public CommandLineValues() { 55 | 56 | } 57 | } -------------------------------------------------------------------------------- /JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java: -------------------------------------------------------------------------------- 1 | package JavaExtractor.Common; 2 | 3 | import JavaExtractor.FeaturesEntities.Property; 4 | import com.github.javaparser.ast.Node; 5 | import com.github.javaparser.ast.UserDataKey; 6 | 7 | import java.util.ArrayList; 8 | import java.util.stream.Collectors; 9 | import java.util.stream.Stream; 10 | 11 | public final class Common { 12 | public static final UserDataKey<Property> PropertyKey = new UserDataKey<Property>() { 13 | }; 14 | public static final UserDataKey<Integer> ChildId = new UserDataKey<Integer>() { 15 | }; 16 | public static final String EmptyString = ""; 17 | 18 | public static final String MethodDeclaration = "MethodDeclaration"; 19 | public static final String NameExpr = "NameExpr"; 20 | public static final String BlankWord = "BLANK"; 21 | 22 | public static final int c_MaxLabelLength = 50; 23 | public static final String methodName = "METHOD_NAME"; 24 | public static final String internalSeparator = "|"; 25 | 26 | public static String normalizeName(String original, String defaultString) { 27 | original = original.toLowerCase().replaceAll("\\\\n", "") // escaped new 28 | // lines 29 | .replaceAll("\\s+", "") // whitespaces 30 | .replaceAll("[\"',]", "") // quotes, apostrophes, commas 31 | .replaceAll("\\P{Print}", ""); // non-printable unicode characters 32 | String stripped = original.replaceAll("[^A-Za-z]", ""); 33 | if (stripped.length() == 0) { 34 | String carefulStripped = original.replaceAll(" ", "_"); 35 | if (carefulStripped.length() == 0) { 36 | return defaultString; 37 | } else { 38 | return carefulStripped; 39 | } 40 | } else { 41 | return stripped; 42 | } 43 | } 44 | 45 | public static boolean isMethod(Node node, String type) { 46 | Property parentProperty = node.getParentNode().getUserData(Common.PropertyKey); 47 | if (parentProperty == null) { 48 | return false; 49 | } 50 | 51 | String parentType = parentProperty.getType(); 52 | return Common.NameExpr.equals(type) && Common.MethodDeclaration.equals(parentType); 53 | } 54 | 55 | public static ArrayList<String> splitToSubtokens(String str1) { 56 | String str2 = str1.replace("|", " "); 57 | String str3 = str2.trim(); 58 | return Stream.of(str3.split("(?<=[a-z])(?=[A-Z])|_|[0-9]|(?<=[A-Z])(?=[A-Z][a-z])|\\s+")) 59 | .filter(s -> s.length() > 0).map(s -> Common.normalizeName(s, Common.EmptyString)) 60 | .filter(s -> s.length() > 0).collect(Collectors.toCollection(ArrayList::new)); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/MethodContent.java: -------------------------------------------------------------------------------- 1 | package JavaExtractor.Common; 2 | 3 | import com.github.javaparser.ast.Node; 4 | 5 | import java.util.ArrayList; 6 | 7 | public class MethodContent { 8 | private final ArrayList<Node> leaves; 9 | private final String name; 10 | 11 | public MethodContent(ArrayList<Node> leaves, String name) { 12 | this.leaves = leaves; 13 | this.name = name; 14 | } 15 | 16 | public ArrayList<Node> getLeaves() { 
return leaves; 18 | } 19 | 20 | public String getName() { 21 | return name; 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/src/main/java/JavaExtractor/ExtractFeaturesTask.java: -------------------------------------------------------------------------------- 1 | package JavaExtractor; 2 | 3 | import JavaExtractor.Common.CommandLineValues; 4 | import JavaExtractor.Common.Common; 5 | import JavaExtractor.FeaturesEntities.ProgramFeatures; 6 | import org.apache.commons.lang3.StringUtils; 7 | 8 | import java.io.IOException; 9 | import java.nio.charset.Charset; 10 | import java.nio.file.Files; 11 | import java.nio.file.Path; 12 | import java.nio.file.Paths; 13 | import java.util.ArrayList; 14 | import java.util.List; 15 | import java.util.concurrent.Callable; 16 | 17 | class ExtractFeaturesTask implements Callable<Void> { 18 | private final CommandLineValues m_CommandLineValues; 19 | private final Path filePath; 20 | 21 | public ExtractFeaturesTask(CommandLineValues commandLineValues, Path path) { 22 | m_CommandLineValues = commandLineValues; 23 | this.filePath = path; 24 | } 25 | 26 | @Override 27 | public Void call() { 28 | processFile(); 29 | return null; 30 | } 31 | 32 | public void processFile() { 33 | ArrayList<ProgramFeatures> features; 34 | try { 35 | features = extractSingleFile(); 36 | } catch (IOException e) { 37 | e.printStackTrace(); 38 | return; 39 | } 40 | if (features == null) { 41 | return; 42 | } 43 | // Find out how to iterate over programFeatures 44 | String toPrint = featuresToString(features); 45 | if (toPrint.length() > 0) { 46 | System.out.println(toPrint); 47 | } 48 | } 49 | 50 | private ArrayList<ProgramFeatures> extractSingleFile() throws IOException { 51 | String code; 52 | String comment; 53 | 54 | if (m_CommandLineValues.MaxFileLength > 0 && 55 | Files.lines(filePath, Charset.defaultCharset()).count() > m_CommandLineValues.MaxFileLength) { 56 | return new ArrayList<>(); 57 | } 58 | try { 59 | code = new String(Files.readAllBytes(filePath)); 60 | } catch (IOException e) { 61 | e.printStackTrace(); 62 | code = Common.EmptyString; 63 | } 64 | 65 | if (code.isEmpty()) { 66 | comment = Common.EmptyString; 67 | } 68 | else { 69 | String commentPath = filePath.toString(); 70 | commentPath = commentPath.replace('\\', '/'); 71 | int lst = commentPath.lastIndexOf("/"); 72 | commentPath = commentPath.replace(commentPath.substring(lst + 1), "comment.txt"); 73 | Path pathToComment = Paths.get(commentPath); 74 | comment = new String(Files.readAllBytes(pathToComment)); 75 | } 76 | FeatureExtractor featureExtractor = new FeatureExtractor(m_CommandLineValues); 77 | 78 | return featureExtractor.extractFeatures(code, comment); 79 | } 80 | 81 | public String featuresToString(ArrayList<ProgramFeatures> features) { 82 | if (features == null || features.isEmpty()) { 83 | return Common.EmptyString; 84 | } 85 | 86 | List<String> methodsOutputs = new ArrayList<>(); 87 | 88 | for (ProgramFeatures singleMethodFeatures : features) { 89 | StringBuilder builder = new StringBuilder(); 90 | String toPrint = singleMethodFeatures.toString(); 91 | if (m_CommandLineValues.PrettyPrint) { 92 | toPrint = toPrint.replace(" ", "\n\t"); 93 | } 94 | builder.append(toPrint); 95 | methodsOutputs.add(builder.toString()); 96 | 97 | } 98 | return StringUtils.join(methodsOutputs, "\n"); 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/ProgramFeatures.java: 
-------------------------------------------------------------------------------- 1 | package JavaExtractor.FeaturesEntities; 2 | 3 | import com.fasterxml.jackson.annotation.JsonIgnore; 4 | 5 | import java.util.ArrayList; 6 | import java.util.stream.Collectors; 7 | 8 | public class ProgramFeatures { 9 | private final String name; 10 | 11 | private final ArrayList features = new ArrayList<>(); 12 | 13 | public ProgramFeatures(String name) { 14 | this.name = name; 15 | } 16 | 17 | @SuppressWarnings("StringBufferReplaceableByString") 18 | @Override 19 | public String toString() { 20 | StringBuilder stringBuilder = new StringBuilder(); 21 | stringBuilder.append(name).append(" "); 22 | stringBuilder.append(features.stream().map(ProgramRelation::toString).collect(Collectors.joining(" "))); 23 | 24 | return stringBuilder.toString(); 25 | } 26 | 27 | public void addFeature(Property source, String path, Property target) { 28 | ProgramRelation newRelation = new ProgramRelation(source, target, path); 29 | features.add(newRelation); 30 | } 31 | 32 | @JsonIgnore 33 | public boolean isEmpty() { 34 | return features.isEmpty(); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/ProgramRelation.java: -------------------------------------------------------------------------------- 1 | package JavaExtractor.FeaturesEntities; 2 | 3 | public class ProgramRelation { 4 | private final Property m_Source; 5 | private final Property m_Target; 6 | private final String m_Path; 7 | 8 | public ProgramRelation(Property sourceName, Property targetName, String path) { 9 | m_Source = sourceName; 10 | m_Target = targetName; 11 | m_Path = path; 12 | } 13 | 14 | public String toString() { 15 | return String.format("%s,%s,%s", m_Source.getName(), m_Path, 16 | m_Target.getName()); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/src/main/java/JavaExtractor/Main.java: -------------------------------------------------------------------------------- 1 | package JavaExtractor; 2 | import java.util.List; 3 | 4 | import com.github.javaparser.JavaParser; 5 | import com.github.javaparser.ast.CompilationUnit; 6 | import com.github.javaparser.ast.Node; 7 | 8 | 9 | public class Main { 10 | public static void main(String args[]) throws Exception { 11 | System.out.println("runs"); 12 | String code = "public class Class{\nprivate void assign(HashMap labelMap,String label,DBIDRef id){\nif (labelMap.containsKey(label)) {\nDBIDs exist=labelMap.get(label);\nif (exist instanceof DBID) {\n ModifiableDBIDs n=DBIDUtil.newHashSet();\n n.add((DBID)exist);\nn.add(id);lnlabelMap.put(label,n);\n }\n else {\n assert (exist instanceof HashSetModifiableDBIDs);\n assert (exist.size() > 1);\n ((ModifiableDBIDs)exist).add(id);\n }\n }\n else {\n labelMap.put(label,DBIDUtil.deref(id));\n }\n}\n}"; 13 | 14 | // CompilationUnit parsed = JavaParser.parse(code); 15 | 16 | 17 | System.out.printf("%-28s %-12s %s%n", "Node.class.simpleName", "Identifier", "Node.toString()"); 18 | System.out.printf("%-28s %-12s %s%n", "=====================", "==========", "==============="); 19 | CompilationUnit parsed = JavaParser.parse(code); 20 | // parsed.walk(node -> { 21 | // String identifier = ""; 22 | // if (node instanceof NodeWithIdentifier) 23 | // identifier = ((NodeWithIdentifier) node).getIdentifier(); 24 | // System.out.printf("%-28s %-12s %s%n", 25 | // 
node.getClass().getSimpleName(), 26 | // identifier, 27 | // node.toString().replaceFirst("(?s)\\R.*", "...")); 28 | // }); 29 | 30 | System.out.println(parsed); 31 | 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/src/main/java/JavaExtractor/Visitors/FunctionVisitor.java: -------------------------------------------------------------------------------- 1 | package JavaExtractor.Visitors; 2 | 3 | import JavaExtractor.Common.CommandLineValues; 4 | import JavaExtractor.Common.Common; 5 | import JavaExtractor.Common.MethodContent; 6 | import com.github.javaparser.ast.Node; 7 | import com.github.javaparser.ast.body.MethodDeclaration; 8 | import com.github.javaparser.ast.visitor.VoidVisitorAdapter; 9 | 10 | import java.util.ArrayList; 11 | import java.util.Arrays; 12 | import java.util.regex.Pattern; 13 | 14 | @SuppressWarnings("StringEquality") 15 | public class FunctionVisitor extends VoidVisitorAdapter<Object> { 16 | private final ArrayList<MethodContent> m_Methods = new ArrayList<>(); 17 | private final CommandLineValues m_CommandLineValues; 18 | 19 | public FunctionVisitor(CommandLineValues commandLineValues) { 20 | this.m_CommandLineValues = commandLineValues; 21 | } 22 | 23 | @Override 24 | public void visit(MethodDeclaration node, Object arg) { 25 | visitMethod(node, arg.toString()); 26 | 27 | super.visit(node, arg); 28 | } 29 | 30 | private void visitMethod(MethodDeclaration node, String comment) { 31 | LeavesCollectorVisitor leavesCollectorVisitor = new LeavesCollectorVisitor(); 32 | leavesCollectorVisitor.visitDepthFirst(node); 33 | ArrayList<Node> leaves = leavesCollectorVisitor.getLeaves(); 34 | String[] parts = comment.split(Pattern.quote(".")); 35 | comment = parts[0]; 36 | String normalizedMethodName = Common.normalizeName(comment, Common.BlankWord); 37 | // String normalizedMethodName = Common.normalizeName(node.getName(), Common.BlankWord); 38 | ArrayList<String> splitNameParts = Common.splitToSubtokens(comment); 39 | String splitName = normalizedMethodName; 40 | if (splitNameParts.size() > 0) { 41 | splitName = String.join(Common.internalSeparator, splitNameParts); 42 | } 43 | 44 | if (node.getBody() != null) { 45 | long methodLength = getMethodLength(node.getBody().toString()); 46 | if (m_CommandLineValues.MaxCodeLength > 0) { 47 | if (methodLength >= m_CommandLineValues.MinCodeLength && methodLength <= m_CommandLineValues.MaxCodeLength) { 48 | m_Methods.add(new MethodContent(leaves, splitName)); 49 | } 50 | } else { 51 | m_Methods.add(new MethodContent(leaves, splitName)); 52 | } 53 | } 54 | } 55 | 56 | private long getMethodLength(String code) { 57 | String cleanCode = code.replaceAll("\r\n", "\n").replaceAll("\t", " "); 58 | if (cleanCode.startsWith("{\n")) 59 | cleanCode = cleanCode.substring(3).trim(); 60 | if (cleanCode.endsWith("\n}")) 61 | cleanCode = cleanCode.substring(0, cleanCode.length() - 2).trim(); 62 | if (cleanCode.length() == 0) { 63 | return 0; 64 | } 65 | return Arrays.stream(cleanCode.split("\n")) 66 | .filter(line -> (!line.trim().equals("{") && !line.trim().equals("}") && !line.trim().isEmpty())) 67 | .filter(line -> !line.trim().startsWith("/") && !line.trim().startsWith("*")).count(); 68 | } 69 | 70 | public ArrayList<MethodContent> getMethodContents() { 71 | return m_Methods; 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/src/main/java/JavaExtractor/Visitors/LeavesCollectorVisitor.java: -------------------------------------------------------------------------------- 1 
| package JavaExtractor.Visitors; 2 | 3 | import JavaExtractor.Common.Common; 4 | import JavaExtractor.FeaturesEntities.Property; 5 | import com.github.javaparser.ast.Node; 6 | import com.github.javaparser.ast.comments.Comment; 7 | import com.github.javaparser.ast.expr.NullLiteralExpr; 8 | import com.github.javaparser.ast.stmt.Statement; 9 | import com.github.javaparser.ast.type.ClassOrInterfaceType; 10 | import com.github.javaparser.ast.visitor.TreeVisitor; 11 | 12 | import java.util.ArrayList; 13 | import java.util.List; 14 | 15 | public class LeavesCollectorVisitor extends TreeVisitor { 16 | private final ArrayList<Node> m_Leaves = new ArrayList<>(); 17 | 18 | @Override 19 | public void process(Node node) { 20 | 21 | if (node instanceof Comment) { 22 | return; 23 | } 24 | boolean isLeaf = false; 25 | boolean isGenericParent = isGenericParent(node); 26 | if (hasNoChildren(node) && isNotComment(node)) { 27 | if (!node.toString().isEmpty() && (!"null".equals(node.toString()) || (node instanceof NullLiteralExpr))) { 28 | m_Leaves.add(node); 29 | isLeaf = true; 30 | } 31 | } 32 | 33 | int childId = getChildId(node); 34 | node.setUserData(Common.ChildId, childId); 35 | Property property = new Property(node, isLeaf, isGenericParent); 36 | node.setUserData(Common.PropertyKey, property); 37 | } 38 | 39 | private boolean isGenericParent(Node node) { 40 | return (node instanceof ClassOrInterfaceType) 41 | && ((ClassOrInterfaceType) node).getTypeArguments() != null 42 | && ((ClassOrInterfaceType) node).getTypeArguments().size() > 0; 43 | } 44 | 45 | private boolean hasNoChildren(Node node) { 46 | return node.getChildrenNodes().size() == 0; 47 | } 48 | 49 | private boolean isNotComment(Node node) { 50 | return !(node instanceof Comment) && !(node instanceof Statement); 51 | } 52 | 53 | public ArrayList<Node> getLeaves() { 54 | return m_Leaves; 55 | } 56 | 57 | private int getChildId(Node node) { 58 | Node parent = node.getParentNode(); 59 | List<Node> parentsChildren = parent.getChildrenNodes(); 60 | int childId = 0; 61 | for (Node child : parentsChildren) { 62 | if (child.getRange().equals(node.getRange())) { 63 | return childId; 64 | } 65 | childId++; 66 | } 67 | return childId; 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /JavaExtractor/JPredict/src/main/java/Test.java: -------------------------------------------------------------------------------- 1 | class Test { 2 | void fooBar() { 3 | System.out.println("http://github.com"); 4 | } 5 | } -------------------------------------------------------------------------------- /JavaExtractor/extract.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import itertools 4 | import multiprocessing 5 | import os 6 | import shutil 7 | import subprocess 8 | import sys 9 | from argparse import ArgumentParser 10 | from threading import Timer 11 | 12 | 13 | def get_immediate_subdirectories(a_dir): 14 | return [(os.path.join(a_dir, name)) for name in os.listdir(a_dir) 15 | if os.path.isdir(os.path.join(a_dir, name))] 16 | 17 | 18 | TMP_DIR = "" 19 | 20 | 21 | def ParallelExtractDir(args, dir): 22 | ExtractFeaturesForDir(args, dir, "") 23 | 24 | 25 | def ExtractFeaturesForDir(args, dir, prefix): 26 | command = ['java', '-Xmx100g', '-XX:MaxNewSize=60g', '-cp', args.jar, 'JavaExtractor.App', 27 | '--max_path_length', str(args.max_path_length), '--max_path_width', str(args.max_path_width), 28 | '--dir', dir, '--num_threads', str(args.num_threads)] 29 | 30 | # print 
command 31 | # os.system(command) 32 | kill = lambda process: process.kill() 33 | outputFileName = TMP_DIR + prefix + dir.split('/')[-1] 34 | failed = False 35 | with open(outputFileName, 'a') as outputFile: 36 | sleeper = subprocess.Popen(command, stdout=outputFile, stderr=subprocess.PIPE) 37 | timer = Timer(60 * 60, kill, [sleeper]) 38 | 39 | try: 40 | timer.start() 41 | stdout, stderr = sleeper.communicate() 42 | finally: 43 | timer.cancel() 44 | 45 | if sleeper.poll() == 0: 46 | if len(stderr) > 0: 47 | print(stderr, file=sys.stderr) 48 | else: 49 | print('dir: ' + str(dir) + ' was not completed in time', file=sys.stderr) 50 | failed = True 51 | subdirs = get_immediate_subdirectories(dir) 52 | for subdir in subdirs: 53 | ExtractFeaturesForDir(args, subdir, prefix + dir.split('/')[-1] + '_') 54 | if failed: 55 | if os.path.exists(outputFileName): 56 | os.remove(outputFileName) 57 | 58 | 59 | def ExtractFeaturesForDirsList(args, dirs): 60 | global TMP_DIR 61 | TMP_DIR = "./tmp/feature_extractor%d/" % (os.getpid()) 62 | if os.path.exists(TMP_DIR): 63 | shutil.rmtree(TMP_DIR, ignore_errors=True) 64 | os.makedirs(TMP_DIR) 65 | try: 66 | p = multiprocessing.Pool(6) 67 | p.starmap(ParallelExtractDir, zip(itertools.repeat(args), dirs)) 68 | # for dir in dirs: 69 | # ExtractFeaturesForDir(args, dir, '') 70 | output_files = os.listdir(TMP_DIR) 71 | for f in output_files: 72 | os.system("cat %s/%s" % (TMP_DIR, f)) 73 | finally: 74 | shutil.rmtree(TMP_DIR, ignore_errors=True) 75 | 76 | 77 | if __name__ == '__main__': 78 | parser = ArgumentParser() 79 | parser.add_argument("-maxlen", "--max_path_length", dest="max_path_length", required=False, default=8) 80 | parser.add_argument("-maxwidth", "--max_path_width", dest="max_path_width", required=False, default=2) 81 | parser.add_argument("-threads", "--num_threads", dest="num_threads", required=False, default=64) 82 | parser.add_argument("-j", "--jar", dest="jar", required=True) 83 | parser.add_argument("-dir", "--dir", dest="dir", required=False) 84 | parser.add_argument("-file", "--file", dest="file", required=False) 85 | args = parser.parse_args() 86 | 87 | if args.file is not None: 88 | command = 'java -cp ' + args.jar + ' JavaExtractor.App --max_path_length ' + \ 89 | str(args.max_path_length) + ' --max_path_width ' + str(args.max_path_width) + ' --file ' + args.file 90 | os.system(command) 91 | elif args.dir is not None: 92 | subdirs = get_immediate_subdirectories(args.dir) 93 | # print("Sub Directories") 94 | # print(subdirs) 95 | if len(subdirs) == 0: 96 | subdirs = [args.dir] 97 | ExtractFeaturesForDirsList(args, subdirs) 98 | 99 | 100 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AutoComments 2 | 3 | ## :pencil2: Description: 4 | ### Motivation 5 | We want to create a deep Neural Network that can automatically generate comments for code snippets passed to it. 6 | The motivation behind this is that in software development and maintenance, developers spend around 59% of their time on program comprehension activities. Having comments that are generated automatically will hopefully cut this time down. 7 | In order to do this we will combine the recent paper, 8 | [Code2Vec: Learning Distributed Representations of Code](https://openreview.net/pdf?id=H1gKYo09tX) by Alon et al. with the paper [Deep Code Comment Generation](https://ink.library.smu.edu.sg/cgi/viewcontent.cgi?article=5295&context=sis_research) by X. 
Hu et al., so as to build a better-performing model using the newer Code2Vec encoding that was not used in the Deep Code Comment Generation paper. 9 | 10 | ### Work done 11 | In this project, two experiments were conducted. In the first, we used the Code2Seq code to create a model that generates comments for Java code snippets (functions) instead of their function names. In the second, we repeated the procedure of the first experiment with modified ASTs. In particular, we added the specific name of each variable to the AST in order to make the generated comments more descriptive. The experiments were evaluated in terms of BLEU-4 score. 12 | 13 | The performance of the first experiment was poor (BLEU-4 score 6.08), while the novelty introduced in the second experiment yielded a notable improvement (BLEU-4 score 10.08). However, the performance was still much worse than that achieved by X. Hu (BLEU-4 score 38.17) in the Deep Code Comment Generation paper, because our model was not able to produce long comments. Nevertheless, it successfully predicted the shorter comments, as well as parts of the longer ones. The reason for this behavior is that Code2Seq was built to produce function names, which are short, rather than long sequences. 14 | 15 | All in all, the main conclusion regarding our best model is that, with the variable names added to the AST, it is capable of capturing the syntactic and semantic meaning of Java code for automatic comment generation; however, it suffers from an inability to generate longer, complete comments. 16 | 17 | ## :page_facing_up: Dataset: 18 | 19 | The dataset that we used is the same dataset used by the Deep Code Comment Generation paper: more than 500,000 code snippets with their comments. 20 | This also gave us a baseline against which to compare. 21 | The dataset can be found [here](https://github.com/xing-hu/DeepCom). 22 | 23 | ## :scroll: System Overview 24 | The pipeline of the system is: 25 | 1. Extract the ASTs from the code snippet-comment pairs. 26 | 2. Use the extracted ASTs to train the model. 27 | 3. Test the trained model on the test data. 28 | 29 | The high-level pipeline is shown in the following image, and a minimal command sketch follows it: 30 |
[image: images/pipeline.png — high-level pipeline of the system]
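A minimal sketch of these three steps, assuming the commands documented in `AST_GEN.md` and the scripts present in this repository (run from the project root; the exact preprocessing script name varies between `preproc/preprocess.sh` and `preprocess_custom.sh` across checkouts, so treat this as an outline rather than the canonical invocation):

```bash
# 1. Split the snippet-comment dataset into one .java file per code snippet
python preproc/java_files_creator.py

# 2. Extract the ASTs (path contexts) for the train, valid, and test splits
bash preproc/preprocess.sh

# 3. Train the code2seq-based model on the extracted contexts
bash code2seq_master/train.sh
```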
33 | 34 | ## :triangular_ruler: Network Architecture 35 | The Encoder-Decoder architecture of this project is shown in the image below and is influenced by the [work](https://openreview.net/pdf?id=H1gKYo09tX) of U. Alon et al. 36 | 37 |
[image: images/network_architecture.png — encoder-decoder network architecture]
40 | 41 | ## :bar_chart: Results 42 | 43 | The BLEU-4 scores achieved on the test dataset are presented below: 44 | 45 | | Approaches | BLEU-4 | 46 | | ------- | -------------- | 47 | | DeepCom | 38.17 | 48 | | Method-1 | 6.08 | 49 | | Method-2 | 10.02 | 50 | 51 | 52 | For more information about the results and a detailed description of the two methods used, please feel free to take a look at our project [report](https://github.com/LRNavin/AutoComments/tree/master/report/ML4SE_group_3_report.pdf) that is included in this repository. 53 | 54 | 55 | ## :office: Project Structure 56 | The structure of the project is: 57 | 58 | * [`JavaExtractor`](https://github.com/LRNavin/AutoComments/tree/master/JavaExtractor) This directory contains the code necessary for extracting the ASTs from the dataset. 59 | * [`code2seq_master`](https://github.com/LRNavin/AutoComments/tree/master/code2seq_master) This directory contains the original Code2Seq code. 60 | * [`data`](https://github.com/LRNavin/AutoComments/tree/master/data) Here you can find a small portion of the data we used. We couldn't upload the whole dataset because of its size. 61 | * [`preproc`](https://github.com/LRNavin/AutoComments/tree/master/preproc) Contains all the necessary Python files and scripts for preprocessing and for running the AST extraction. 62 | * [`report`](https://github.com/LRNavin/AutoComments/tree/master/report) Contains the report for this project and its LaTeX code. 63 | * [`scripts`](https://github.com/LRNavin/AutoComments/tree/master/scripts) Contains all the extra scripts used, like the Perl script for BLEU score computation. 64 | * [`bleu_score.py`](https://github.com/LRNavin/AutoComments/tree/master/bleu_score.py) Computes the BLEU-4 score for a reference file and a prediction file; see the usage sketch below. 
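For reference, the scoring works by piping the predictions file into the bundled Moses `multi-bleu.perl` script with the references file as its argument — exactly what `bleu_score.py` (shown later in this dump) does via `subprocess`. A minimal usage sketch, assuming the `outputs/1st_try/test/` paths hard-coded in that script (one comment per line in each file):

```bash
# Direct invocation of the Perl scorer
perl scripts/multi-bleu.perl outputs/1st_try/test/ref.txt < outputs/1st_try/test/pred.txt

# Or via the Python wrapper, which runs the same command
python bleu_score.py
```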
65 | 66 | ## Papers: 67 | 68 | [Code2Vec: Learning Distributed Representations of Code](https://openreview.net/pdf?id=H1gKYo09tX) 69 | 70 | [Deep Code Comment Generation](https://ink.library.smu.edu.sg/cgi/viewcontent.cgi?article=5295&context=sis_research) 71 | 72 | ## :busts_in_silhouette: Group 3 Team Members 73 | 74 | [Rafail Skoulos](https://github.com/RafailSkoulos17) 75 | 76 | [Navin Raj Prabhu](https://github.com/LRNavin) 77 | 78 | [Thomas Pfann](https://github.com/ThomasPf) 79 | 80 | [Jonathan Katzy](https://github.com/jkatzy) 81 | 82 | 83 | -------------------------------------------------------------------------------- /bleu_score.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | 4 | 5 | def compute_bleu(ref_file_name, predicted_file_name): 6 | with open(predicted_file_name) as predicted_file: 7 | pipe = subprocess.Popen(["perl", "scripts/multi-bleu.perl", ref_file_name], stdin=predicted_file, 8 | stdout=sys.stdout, stderr=sys.stderr) 9 | pipe.communicate() 10 | 11 | 12 | ref_file = 'outputs/1st_try/test/ref.txt' 13 | pred_file = 'outputs/1st_try/test/pred.txt' 14 | 15 | compute_bleu(ref_file, pred_file) 16 | -------------------------------------------------------------------------------- /code2seq_master/.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.lst 3 | .idea/* 4 | *.iml 5 | *.xml 6 | *.pyc 7 | 8 | -------------------------------------------------------------------------------- /code2seq_master/CSharpExtractor/.gitattributes: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Set default behavior to automatically normalize line endings. 3 | ############################################################################### 4 | * text=auto 5 | 6 | ############################################################################### 7 | # Set default behavior for command prompt diff. 8 | # 9 | # This is need for earlier builds of msysgit that does not have it on by 10 | # default for csharp files. 11 | # Note: This is only used by command line 12 | ############################################################################### 13 | #*.cs diff=csharp 14 | 15 | ############################################################################### 16 | # Set the merge driver for project and solution files 17 | # 18 | # Merging from the command prompt will add diff markers to the files if there 19 | # are conflicts (Merging from VS is not affected by the settings below, in VS 20 | # the diff markers are never inserted). Diff markers may cause the following 21 | # file extensions to fail to load in VS. An alternative would be to treat 22 | # these files as binary and thus will always conflict and require user 23 | # intervention with every merge. 
To do so, just uncomment the entries below 24 | ############################################################################### 25 | #*.sln merge=binary 26 | #*.csproj merge=binary 27 | #*.vbproj merge=binary 28 | #*.vcxproj merge=binary 29 | #*.vcproj merge=binary 30 | #*.dbproj merge=binary 31 | #*.fsproj merge=binary 32 | #*.lsproj merge=binary 33 | #*.wixproj merge=binary 34 | #*.modelproj merge=binary 35 | #*.sqlproj merge=binary 36 | #*.wwaproj merge=binary 37 | 38 | ############################################################################### 39 | # behavior for image files 40 | # 41 | # image files are treated as binary by default. 42 | ############################################################################### 43 | #*.jpg binary 44 | #*.png binary 45 | #*.gif binary 46 | 47 | ############################################################################### 48 | # diff behavior for common document formats 49 | # 50 | # Convert binary document formats to text before diffing them. This feature 51 | # is only available from the command line. Turn it on by uncommenting the 52 | # entries below. 53 | ############################################################################### 54 | #*.doc diff=astextplain 55 | #*.DOC diff=astextplain 56 | #*.docx diff=astextplain 57 | #*.DOCX diff=astextplain 58 | #*.dot diff=astextplain 59 | #*.DOT diff=astextplain 60 | #*.pdf diff=astextplain 61 | #*.PDF diff=astextplain 62 | #*.rtf diff=astextplain 63 | #*.RTF diff=astextplain 64 | -------------------------------------------------------------------------------- /code2seq_master/CSharpExtractor/.gitignore: -------------------------------------------------------------------------------- 1 | ## Ignore Visual Studio temporary files, build results, and 2 | ## files generated by popular Visual Studio add-ons. 
3 | 4 | # User-specific files 5 | *.suo 6 | *.user 7 | *.userosscache 8 | *.sln.docstates 9 | 10 | # User-specific files (MonoDevelop/Xamarin Studio) 11 | *.userprefs 12 | 13 | # Build results 14 | [Dd]ebug/ 15 | [Dd]ebugPublic/ 16 | [Rr]elease/ 17 | [Rr]eleases/ 18 | x64/ 19 | x86/ 20 | bld/ 21 | [Bb]in/ 22 | [Oo]bj/ 23 | [Ll]og/ 24 | 25 | # Visual Studio 2015 cache/options directory 26 | .vs/ 27 | # Uncomment if you have tasks that create the project's static files in wwwroot 28 | #wwwroot/ 29 | 30 | # MSTest test Results 31 | [Tt]est[Rr]esult*/ 32 | [Bb]uild[Ll]og.* 33 | 34 | # NUNIT 35 | *.VisualState.xml 36 | TestResult.xml 37 | 38 | # Build Results of an ATL Project 39 | [Dd]ebugPS/ 40 | [Rr]eleasePS/ 41 | dlldata.c 42 | 43 | # DNX 44 | project.lock.json 45 | artifacts/ 46 | 47 | *_i.c 48 | *_p.c 49 | *_i.h 50 | *.ilk 51 | *.meta 52 | *.obj 53 | *.pch 54 | *.pdb 55 | *.pgc 56 | *.pgd 57 | *.rsp 58 | *.sbr 59 | *.tlb 60 | *.tli 61 | *.tlh 62 | *.tmp 63 | *.tmp_proj 64 | *.log 65 | *.vspscc 66 | *.vssscc 67 | .builds 68 | *.pidb 69 | *.svclog 70 | *.scc 71 | 72 | # Chutzpah Test files 73 | _Chutzpah* 74 | 75 | # Visual C++ cache files 76 | ipch/ 77 | *.aps 78 | *.ncb 79 | *.opendb 80 | *.opensdf 81 | *.sdf 82 | *.cachefile 83 | *.VC.db 84 | *.VC.VC.opendb 85 | 86 | # Visual Studio profiler 87 | *.psess 88 | *.vsp 89 | *.vspx 90 | *.sap 91 | 92 | # TFS 2012 Local Workspace 93 | $tf/ 94 | 95 | # Guidance Automation Toolkit 96 | *.gpState 97 | 98 | # ReSharper is a .NET coding add-in 99 | _ReSharper*/ 100 | *.[Rr]e[Ss]harper 101 | *.DotSettings.user 102 | 103 | # JustCode is a .NET coding add-in 104 | .JustCode 105 | 106 | # TeamCity is a build add-in 107 | _TeamCity* 108 | 109 | # DotCover is a Code Coverage Tool 110 | *.dotCover 111 | 112 | # NCrunch 113 | _NCrunch_* 114 | .*crunch*.local.xml 115 | nCrunchTemp_* 116 | 117 | # MightyMoose 118 | *.mm.* 119 | AutoTest.Net/ 120 | 121 | # Web workbench (sass) 122 | .sass-cache/ 123 | 124 | # Installshield output folder 125 | [Ee]xpress/ 126 | 127 | # DocProject is a documentation generator add-in 128 | DocProject/buildhelp/ 129 | DocProject/Help/*.HxT 130 | DocProject/Help/*.HxC 131 | DocProject/Help/*.hhc 132 | DocProject/Help/*.hhk 133 | DocProject/Help/*.hhp 134 | DocProject/Help/Html2 135 | DocProject/Help/html 136 | 137 | # Click-Once directory 138 | publish/ 139 | 140 | # Publish Web Output 141 | *.[Pp]ublish.xml 142 | *.azurePubxml 143 | # TODO: Comment the next line if you want to checkin your web deploy settings 144 | # but database connection strings (with potential passwords) will be unencrypted 145 | *.pubxml 146 | *.publishproj 147 | 148 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 149 | # checkin your Azure Web App publish settings, but sensitive information contained 150 | # in these scripts will be unencrypted 151 | PublishScripts/ 152 | 153 | # NuGet Packages 154 | *.nupkg 155 | # The packages folder can be ignored because of Package Restore 156 | **/packages/* 157 | # except build/, which is used as an MSBuild target. 
158 | !**/packages/build/ 159 | # Uncomment if necessary however generally it will be regenerated when needed 160 | #!**/packages/repositories.config 161 | # NuGet v3's project.json files produces more ignoreable files 162 | *.nuget.props 163 | *.nuget.targets 164 | 165 | # Microsoft Azure Build Output 166 | csx/ 167 | *.build.csdef 168 | 169 | # Microsoft Azure Emulator 170 | ecf/ 171 | rcf/ 172 | 173 | # Windows Store app package directories and files 174 | AppPackages/ 175 | BundleArtifacts/ 176 | Package.StoreAssociation.xml 177 | _pkginfo.txt 178 | 179 | # Visual Studio cache files 180 | # files ending in .cache can be ignored 181 | *.[Cc]ache 182 | # but keep track of directories ending in .cache 183 | !*.[Cc]ache/ 184 | 185 | # Others 186 | ClientBin/ 187 | ~$* 188 | *~ 189 | *.dbmdl 190 | *.dbproj.schemaview 191 | *.pfx 192 | *.publishsettings 193 | node_modules/ 194 | orleans.codegen.cs 195 | 196 | # Since there are multiple workflows, uncomment next line to ignore bower_components 197 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 198 | #bower_components/ 199 | 200 | # RIA/Silverlight projects 201 | Generated_Code/ 202 | 203 | # Backup & report files from converting an old project file 204 | # to a newer Visual Studio version. Backup files are not needed, 205 | # because we have git ;-) 206 | _UpgradeReport_Files/ 207 | Backup*/ 208 | UpgradeLog*.XML 209 | UpgradeLog*.htm 210 | 211 | # SQL Server files 212 | *.mdf 213 | *.ldf 214 | 215 | # Business Intelligence projects 216 | *.rdl.data 217 | *.bim.layout 218 | *.bim_*.settings 219 | 220 | # Microsoft Fakes 221 | FakesAssemblies/ 222 | 223 | # GhostDoc plugin setting file 224 | *.GhostDoc.xml 225 | 226 | # Node.js Tools for Visual Studio 227 | .ntvs_analysis.dat 228 | 229 | # Visual Studio 6 build log 230 | *.plg 231 | 232 | # Visual Studio 6 workspace options file 233 | *.opt 234 | 235 | # Visual Studio LightSwitch build output 236 | **/*.HTMLClient/GeneratedArtifacts 237 | **/*.DesktopClient/GeneratedArtifacts 238 | **/*.DesktopClient/ModelManifest.xml 239 | **/*.Server/GeneratedArtifacts 240 | **/*.Server/ModelManifest.xml 241 | _Pvt_Extensions 242 | 243 | # Paket dependency manager 244 | .paket/paket.exe 245 | paket-files/ 246 | 247 | # FAKE - F# Make 248 | .fake/ 249 | 250 | # JetBrains Rider 251 | .idea/ 252 | *.sln.iml 253 | 254 | # no data 255 | data/* 256 | backupdata/* 257 | -------------------------------------------------------------------------------- /code2seq_master/CSharpExtractor/CSharpExtractor/.nuget/packages.config: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /code2seq_master/CSharpExtractor/CSharpExtractor/CSharpExtractor.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 15 4 | VisualStudioVersion = 15.0.28307.136 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Extractor", "Extractor\Extractor.csproj", "{481EDE3F-0ED1-4CB9-814A-63A821022552}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|Any CPU = Debug|Any CPU 11 | Debug|x64 = Debug|x64 12 | Debug|x86 = Debug|x86 13 | Release|Any CPU = Release|Any CPU 14 | Release|x64 = Release|x64 15 | Release|x86 = Release|x86 16 | Release20|Any CPU = 
Release20|Any CPU 17 | Release20|x64 = Release20|x64 18 | Release20|x86 = Release20|x86 19 | EndGlobalSection 20 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 21 | {481EDE3F-0ED1-4CB9-814A-63A821022552}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 22 | {481EDE3F-0ED1-4CB9-814A-63A821022552}.Debug|Any CPU.Build.0 = Debug|Any CPU 23 | {481EDE3F-0ED1-4CB9-814A-63A821022552}.Debug|x64.ActiveCfg = Debug|Any CPU 24 | {481EDE3F-0ED1-4CB9-814A-63A821022552}.Debug|x64.Build.0 = Debug|Any CPU 25 | {481EDE3F-0ED1-4CB9-814A-63A821022552}.Debug|x86.ActiveCfg = Debug|Any CPU 26 | {481EDE3F-0ED1-4CB9-814A-63A821022552}.Debug|x86.Build.0 = Debug|Any CPU 27 | {481EDE3F-0ED1-4CB9-814A-63A821022552}.Release|Any CPU.ActiveCfg = Release|Any CPU 28 | {481EDE3F-0ED1-4CB9-814A-63A821022552}.Release|Any CPU.Build.0 = Release|Any CPU 29 | {481EDE3F-0ED1-4CB9-814A-63A821022552}.Release|x64.ActiveCfg = Release|Any CPU 30 | {481EDE3F-0ED1-4CB9-814A-63A821022552}.Release|x64.Build.0 = Release|Any CPU 31 | {481EDE3F-0ED1-4CB9-814A-63A821022552}.Release|x86.ActiveCfg = Release|Any CPU 32 | {481EDE3F-0ED1-4CB9-814A-63A821022552}.Release|x86.Build.0 = Release|Any CPU 33 | {481EDE3F-0ED1-4CB9-814A-63A821022552}.Release20|Any CPU.ActiveCfg = Release|Any CPU 34 | {481EDE3F-0ED1-4CB9-814A-63A821022552}.Release20|Any CPU.Build.0 = Release|Any CPU 35 | {481EDE3F-0ED1-4CB9-814A-63A821022552}.Release20|x64.ActiveCfg = Release|Any CPU 36 | {481EDE3F-0ED1-4CB9-814A-63A821022552}.Release20|x64.Build.0 = Release|Any CPU 37 | {481EDE3F-0ED1-4CB9-814A-63A821022552}.Release20|x86.ActiveCfg = Release|Any CPU 38 | {481EDE3F-0ED1-4CB9-814A-63A821022552}.Release20|x86.Build.0 = Release|Any CPU 39 | EndGlobalSection 40 | GlobalSection(SolutionProperties) = preSolution 41 | HideSolutionNode = FALSE 42 | EndGlobalSection 43 | GlobalSection(ExtensibilityGlobals) = postSolution 44 | SolutionGuid = {13A0DA89-D5D9-4E75-850E-70B9FBE88FF8} 45 | EndGlobalSection 46 | EndGlobal 47 | -------------------------------------------------------------------------------- /code2seq_master/CSharpExtractor/CSharpExtractor/Extractor/Extractor.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Exe 5 | netcoreapp2.2 6 | Extractor.Program 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /code2seq_master/CSharpExtractor/CSharpExtractor/Extractor/PathFinder.cs: -------------------------------------------------------------------------------- 1 | using Microsoft.CodeAnalysis; 2 | using Microsoft.CodeAnalysis.CSharp.Syntax; 3 | using System; 4 | using System.Collections.Generic; 5 | using System.Linq; 6 | 7 | namespace Extractor 8 | { 9 | 10 | internal class PathFinder 11 | { 12 | internal class Path 13 | { 14 | public SyntaxToken Left { get; } 15 | public List LeftSide { get; } 16 | public SyntaxNode Ancesstor { get; } 17 | public List RightSide { get; } 18 | public SyntaxToken Right { get; } 19 | 20 | public Path(SyntaxToken left, IEnumerable leftSide, SyntaxNode ancesstor, 21 | IEnumerable rightSide, SyntaxToken right) 22 | { 23 | this.Left = left; 24 | this.LeftSide = leftSide.ToList(); 25 | this.Ancesstor = ancesstor; 26 | this.RightSide = rightSide.ToList(); 27 | this.Right = right; 28 | } 29 | } 30 | 31 | public int Length { get; } 32 | public int Width { get; } 33 | 34 | Tree tree; 35 | 36 | public PathFinder(Tree tree, int length = 7, int width = 4) 37 | { 38 | if (length < 1 || width < 
1) 39 | throw new ArgumentException("Width and Length params must be positive."); 40 | 41 | Length = length; 42 | Width = width; 43 | this.tree = tree; 44 | } 45 | 46 | private int GetDepth(SyntaxNode n) 47 | { 48 | int depth = 0; 49 | while(n.Parent != null) 50 | { 51 | n = n.Parent; 52 | depth++; 53 | } 54 | return depth; 55 | } 56 | 57 | public SyntaxNode FirstAncestor(SyntaxNode l, SyntaxNode r) 58 | { 59 | if (l.Equals(r)) 60 | return l; 61 | 62 | if (GetDepth(l) >= GetDepth(r)) 63 | { 64 | l = l.Parent; 65 | } 66 | else 67 | { 68 | r = r.Parent; 69 | } 70 | return FirstAncestor(l, r); 71 | } 72 | 73 | private IEnumerable CollectPathToParent(SyntaxNode start, SyntaxNode parent) 74 | { 75 | while (!start.Equals(parent)) 76 | { 77 | yield return start; 78 | start = start.Parent; 79 | } 80 | } 81 | 82 | internal Path FindPath(SyntaxToken l, SyntaxToken r, bool limited = true) 83 | { 84 | SyntaxNode p = FirstAncestor(l.Parent, r.Parent); 85 | 86 | // + 2 for the distance of the leafs themselves 87 | if (GetDepth(r.Parent) + GetDepth(l.Parent) - 2 * GetDepth(p) + 2 > Length) 88 | { 89 | return null; 90 | } 91 | 92 | var leftSide = CollectPathToParent(l.Parent, p); 93 | var rightSide = CollectPathToParent(r.Parent, p); 94 | rightSide = rightSide.Reverse(); 95 | 96 | List widthCheck = p.ChildNodes().ToList(); 97 | if (limited && leftSide.Count() != 0 98 | && rightSide.Count() != 0) 99 | { 100 | int indexOfLeft = widthCheck.IndexOf(leftSide.Last()); 101 | int indexOfRight = widthCheck.IndexOf(rightSide.First()); 102 | if (Math.Abs(indexOfLeft - indexOfRight) >= Width) 103 | { 104 | return null; 105 | } 106 | } 107 | 108 | return new Path(l, leftSide, p, rightSide, r); 109 | } 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /code2seq_master/CSharpExtractor/CSharpExtractor/Extractor/Program.cs: -------------------------------------------------------------------------------- 1 | using CommandLine; 2 | using CommandLine.Text; 3 | using System; 4 | using System.Collections.Generic; 5 | using System.IO; 6 | using System.Linq; 7 | 8 | namespace Extractor 9 | { 10 | class Program 11 | { 12 | static List ExtractSingleFile(string filename, Options opts) 13 | { 14 | string data = File.ReadAllText(filename); 15 | var extractor = new Extractor(data, opts); 16 | List result = extractor.Extract(); 17 | 18 | return result; 19 | } 20 | 21 | static void Main(string[] args) 22 | { 23 | Options options = new Options(); 24 | Parser.Default.ParseArguments(args) 25 | .WithParsed(opt => options = opt) 26 | .WithNotParsed(errors => 27 | { 28 | Console.WriteLine(errors); 29 | return; 30 | }); 31 | 32 | string path = options.Path; 33 | string[] files; 34 | if (Directory.Exists(path)) 35 | { 36 | files = Directory.GetFiles(path, "*.cs", SearchOption.AllDirectories); 37 | } 38 | else 39 | { 40 | files = new string[] { path }; 41 | } 42 | 43 | IEnumerable results = null; 44 | 45 | results = files.AsParallel().WithDegreeOfParallelism(options.Threads).SelectMany(filename => ExtractSingleFile(filename, options)); 46 | 47 | using (StreamWriter sw = new StreamWriter(options.OFileName, append: true)) 48 | { 49 | foreach (var res in results) 50 | { 51 | sw.WriteLine(res); 52 | } 53 | } 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /code2seq_master/CSharpExtractor/CSharpExtractor/Extractor/Properties/launchSettings.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"profiles": { 3 | "Extractor": { 4 | "commandName": "Project", 5 | "commandLineArgs": "--path C:\\Users\\urial\\Source\\Repos\\CSharpExtractor\\CSharpExtractor\\Extractor\\bin\\ --no_hash" 6 | } 7 | } 8 | } -------------------------------------------------------------------------------- /code2seq_master/CSharpExtractor/CSharpExtractor/Extractor/Temp.cs: -------------------------------------------------------------------------------- 1 | namespace Extractor 2 | { 3 | class Temp 4 | { 5 | class NestedClass 6 | { 7 | void fooBar() 8 | { 9 | a.b = c; 10 | } 11 | } 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /code2seq_master/CSharpExtractor/CSharpExtractor/Extractor/Utilities.cs: -------------------------------------------------------------------------------- 1 | using CommandLine; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Linq; 5 | using System.Text; 6 | using System.Diagnostics; 7 | using System.Text.RegularExpressions; 8 | 9 | namespace Extractor 10 | { 11 | public class Options 12 | { 13 | [Option('t', "threads", Default = 1, HelpText = "How many threads to use <1>")] 14 | public int Threads { get; set; } 15 | 16 | [Option('p', "path", Default = "./data/", HelpText = "Where to find code files. <.>")] 17 | public string Path { get; set; } 18 | 19 | [Option('l', "max_length", Default = 9, HelpText = "Max path length")] 20 | public int MaxLength { get; set; } 21 | 22 | [Option('l', "max_width", Default = 2, HelpText = "Max path length")] 23 | public int MaxWidth { get; set; } 24 | 25 | [Option('o', "ofile_name", Default = "test.txt", HelpText = "Output file name")] 26 | public String OFileName { get; set; } 27 | 28 | [Option('h', "no_hash", Default = false, HelpText = "When enabled, prints the whole path strings (not hashed)")] 29 | public Boolean NoHash { get; set; } 30 | 31 | [Option('l', "max_contexts", Default = 30000, HelpText = "Max number of path contexts to sample. Affects only very large snippets")] 32 | public int MaxContexts { get; set; } 33 | } 34 | 35 | public static class Utilities 36 | { 37 | public static String[] NumbericLiteralsToKeep = new String[] { "0", "1", "2", "3", "4", "5", "10" }; 38 | public static IEnumerable> Choose2(IEnumerable enumerable) 39 | { 40 | int index = 0; 41 | 42 | foreach (var e in enumerable) 43 | { 44 | ++index; 45 | foreach (var t in enumerable.Skip(index)) 46 | yield return Tuple.Create(e, t); 47 | } 48 | } 49 | 50 | /// 51 | /// Sample uniform randomly numSamples from an enumerable, using reservoir sampling. 
50 |         /// <summary>
51 |         /// Sample uniform randomly numSamples from an enumerable, using reservoir sampling.
52 |         /// See https://en.wikipedia.org/wiki/Reservoir_sampling
53 |         /// </summary>
54 |         /// <typeparam name="T"></typeparam>
55 |         /// <param name="input"></param>
56 |         /// <param name="numSamples"></param>
57 |         /// <returns></returns>
58 |         public static IEnumerable<T> ReservoirSample<T>(this IEnumerable<T> input, int numSamples)
59 |         {
60 |             var rng = new Random();
61 |             var sampledElements = new List<T>(numSamples);
62 |             int seenElementCount = 0;
63 |             foreach (var element in input)
64 |             {
65 |                 seenElementCount++;
66 |                 if (sampledElements.Count < numSamples)
67 |                 {
68 |                     sampledElements.Add(element);
69 |                 }
70 |                 else
71 |                 {
72 |                     int position = rng.Next(seenElementCount);
73 |                     if (position < numSamples)
74 |                     {
75 |                         sampledElements[position] = element;
76 |                     }
77 |                 }
78 |             }
79 |             Debug.Assert(sampledElements.Count <= numSamples);
80 |             return sampledElements;
81 |         }
82 |
83 |
84 |         public static IEnumerable<T> WeakConcat<T>(IEnumerable<T> enumerable1, IEnumerable<T> enumerable2)
85 |         {
86 |             foreach (T t in enumerable1)
87 |                 yield return t;
88 |             foreach (T t in enumerable2)
89 |                 yield return t;
90 |         }
91 |
92 |         public static IEnumerable<string> SplitToSubtokens(String name)
93 |         {
94 |             return Regex.Split(name.Trim(), "(?<=[a-z])(?=[A-Z])|_|[0-9]|(?<=[A-Z])(?=[A-Z][a-z])|\\s+")
95 |                 .Where(s => s.Length > 0)
96 |                 .Select(s => NormalizeName(s))
97 |                 .Where(s => s.Length > 0);
98 |         }
99 |
100 |         private static Regex Whitespaces = new Regex(@"\s");
101 |         private static Regex NonAlphabetic = new Regex("[^A-Za-z]");
102 |
103 |         public static String NormalizeName(string s)
104 |         {
105 |             String partiallyNormalized = s.ToLowerInvariant()
106 |                 .Replace("\\\\n", String.Empty)
107 |                 .Replace("[\"',]", String.Empty);
108 |
109 |             partiallyNormalized = Whitespaces.Replace(partiallyNormalized, "");
110 |             partiallyNormalized = Encoding.ASCII.GetString(
111 |                 Encoding.Convert(
112 |                     Encoding.UTF8,
113 |                     Encoding.GetEncoding(
114 |                         Encoding.ASCII.EncodingName,
115 |                         new EncoderReplacementFallback(string.Empty),
116 |                         new DecoderExceptionFallback()
117 |                     ),
118 |                     Encoding.UTF8.GetBytes(partiallyNormalized)
119 |                 )
120 |             );
121 |
122 |             if (partiallyNormalized.Contains('\n'))
123 |             {
124 |                 partiallyNormalized = partiallyNormalized.Replace('\n', 'N');
125 |             }
126 |             if (partiallyNormalized.Contains('\r'))
127 |             {
128 |                 partiallyNormalized = partiallyNormalized.Replace('\r', 'R');
129 |             }
130 |             if (partiallyNormalized.Contains(','))
131 |             {
132 |                 partiallyNormalized = partiallyNormalized.Replace(',', 'C');
133 |             }
134 |
135 |             String completelyNormalized = NonAlphabetic.Replace(partiallyNormalized, String.Empty);
136 |             if (completelyNormalized.Length == 0)
137 |             {
138 |                 if (Regex.IsMatch(partiallyNormalized, @"^\d+$"))
139 |                 {
140 |                     if (NumbericLiteralsToKeep.Contains(partiallyNormalized))
141 |                     {
142 |                         return partiallyNormalized;
143 |                     }
144 |                     else
145 |                     {
146 |                         return "NUM";
147 |                     }
148 |                 }
149 |
150 |                 return String.Empty;
151 |             }
152 |             return completelyNormalized;
153 |
154 |         }
155 |     }
156 | }
157 |
-------------------------------------------------------------------------------- /code2seq_master/CSharpExtractor/CSharpExtractor/Extractor/Variable.cs: --------------------------------------------------------------------------------
1 | using System;
2 | using System.Collections.Generic;
3 | using System.Linq;
4 | using Microsoft.CodeAnalysis;
5 | using Microsoft.CodeAnalysis.CSharp.Syntax;
6 |
7 | namespace Extractor
8 | {
9 |     namespace Semantics
10 |     {
11 |         public class Variable
12 |         {
13 |             Tree tree;
14 |
15 |             public string Name { get; }
16 |             private HashSet<SyntaxToken> leaves;
17 |             public HashSet<SyntaxToken> Leaves
18 |             {
19 |                 get
20 |                 {
21 |                     return leaves;
22 |                 }
23 |             }
24 |
25 |             private Nullable<bool> constant;
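            // NOTE: A variable is treated as constant only if every one of its leaf tokens
            // is marked IsConst in the tree (see the constructor below); Const simply reads
            // that cached flag.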
26 |             public bool Const
27 |             {
28 |                 get
29 |                 {
30 |                     return constant.Value;
31 |                 }
32 |             }
33 |
34 |
35 |             private Variable(string name, SyntaxToken[] leaves, Tree tree)
36 |             {
37 |                 this.tree = tree;
38 |                 this.Name = name;
39 |                 this.leaves = new HashSet<SyntaxToken>(leaves);
40 |
41 |
42 |                 constant = true;
43 |                 foreach (var leaf in leaves)
44 |                 {
45 |                     if (!tree.leaves[leaf].IsConst)
46 |                     {
47 |                         constant = false;
48 |                         // If not constant then it is a declaration token
49 |                         break;
50 |                     }
51 |                 }
52 |             }
53 |
54 |             public override int GetHashCode()
55 |             {
56 |                 return this.Name.GetHashCode();
57 |             }
58 |
59 |             public bool IsLiteral()
60 |             {
61 |                 return Tree.literals.Contains(tree.leaves[Leaves.First()].Kind);
62 |             }
63 |
64 |             internal static Boolean isMethodName(SyntaxToken token)
65 |             {
66 |                 return token.Parent.IsKind(Microsoft.CodeAnalysis.CSharp.SyntaxKind.MethodDeclaration)
67 |                     && token.IsKind(Microsoft.CodeAnalysis.CSharp.SyntaxKind.IdentifierToken);
68 |             }
69 |
70 |             // Create a variable for each variable in scope from tokens while splitting identically named but differently scoped vars.
71 |             internal static IEnumerable<Variable> CreateFromMethod(Tree methodTree)
72 |             {
73 |                 var root = methodTree.nodes[methodTree.GetRoot()];
74 |                 var leaves = root.Leaves.ToArray();
75 |                 Dictionary<SyntaxToken, string> tokenToName = new Dictionary<SyntaxToken, string>();
76 |                 Dictionary<string, List<SyntaxToken>> nameToTokens = new Dictionary<string, List<SyntaxToken>>();
77 |                 foreach (SyntaxToken token in root.Leaves)
78 |                 {
79 |                     string name = methodTree.leaves[token].VariableName;
80 |                     if (isMethodName(token))
81 |                     {
82 |                         name = Extractor.MethodNameConst;
83 |                     }
84 |                     tokenToName[token] = name;
85 |                     if (!nameToTokens.ContainsKey(name))
86 |                         nameToTokens[name] = new List<SyntaxToken>();
87 |                     nameToTokens[name].Add(token);
88 |                 }
89 |
90 |                 List<Variable> results = new List<Variable>();
91 |
92 |                 foreach (SyntaxToken leaf in leaves)
93 |                 {
94 |                     string name = tokenToName[leaf];
95 |                     SyntaxToken[] syntaxTokens = nameToTokens[name].ToArray();
96 |                     var v = new Variable(name, syntaxTokens, methodTree);
97 |
98 |                     //check if exists
99 |                     var matches = results.Where(p => p.Name == name).ToList();
100 |                     bool alreadyExists = (matches.Count != 0);
101 |                     if (!alreadyExists)
102 |                     {
103 |                         results.Add(v);
104 |                     }
105 |                 }
106 |
107 |                 return results;
108 |             }
109 |         }
110 |     }
111 | }
112 |
-------------------------------------------------------------------------------- /code2seq_master/CSharpExtractor/extract.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | import itertools
4 | import multiprocessing
5 | import os
6 | import sys
7 | import shutil
8 | import subprocess
9 | from threading import Timer
10 | import sys
11 | from argparse import ArgumentParser
12 | from subprocess import Popen, PIPE, STDOUT, call
13 |
14 |
15 |
16 | def get_immediate_subdirectories(a_dir):
17 |     return [(os.path.join(a_dir, name)) for name in os.listdir(a_dir)
18 |             if os.path.isdir(os.path.join(a_dir, name))]
19 |
20 |
21 | TMP_DIR = ""
22 |
23 | def ParallelExtractDir(args, dir):
24 |     ExtractFeaturesForDir(args, dir, "")
25 |
26 |
27 | def ExtractFeaturesForDir(args, dir, prefix):
28 |     command = ['dotnet', 'run', '--project', args.csproj,
29 |                '--max_length', str(args.max_path_length), '--max_width', str(args.max_path_width),
30 |                '--path', dir, '--threads', str(args.num_threads), '--ofile_name', str(args.ofile_name)]
31 |
32 |
33 |     # print command
34 |     # os.system(command)
35 |     kill = lambda process: process.kill()
36 |     sleeper = subprocess.Popen(command, stderr=subprocess.PIPE)
37 |     timer = Timer(600000, kill, [sleeper])
38 |     failed = False
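    # NOTE: `failed` is initialized above because the success branch below would otherwise
    # read it before assignment; the JavaExtractor variant of this script initializes it
    # the same way. Also note the timeout: this script allows 600000 seconds before killing
    # the extractor, while the Java variant uses 60 * 60 (one hour) — likely a leftover value.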
39 |     try:
40 |         timer.start()
41 |         _, stderr = sleeper.communicate()
42 |     finally:
43 |         timer.cancel()
44 |
45 |     if sleeper.poll() == 0:
46 |         if len(stderr) > 0:
47 |             print(stderr, file=sys.stderr)
48 |     else:
49 |         print('dir: ' + str(dir) + ' was not completed in time', file=sys.stderr)
50 |         failed = True
51 |         subdirs = get_immediate_subdirectories(dir)
52 |         for subdir in subdirs:
53 |             ExtractFeaturesForDir(args, subdir, prefix + dir.split('/')[-1] + '_')
54 |     if failed:
55 |         if os.path.exists(str(args.ofile_name)):
56 |             os.remove(str(args.ofile_name))
57 |
58 | def ExtractFeaturesForDirsList(args, dirs):
59 |     global TMP_DIR
60 |     TMP_DIR = "./tmp/feature_extractor%d/" % (os.getpid())
61 |     if os.path.exists(TMP_DIR):
62 |         shutil.rmtree(TMP_DIR, ignore_errors=True)
63 |     os.makedirs(TMP_DIR)
64 |     try:
65 |         p = multiprocessing.Pool(4)
66 |         p.starmap(ParallelExtractDir, zip(itertools.repeat(args), dirs))
67 |         #for dir in dirs:
68 |         #    ExtractFeaturesForDir(args, dir, '')
69 |         output_files = os.listdir(TMP_DIR)
70 |         for f in output_files:
71 |             os.system("cat %s/%s" % (TMP_DIR, f))
72 |     finally:
73 |         shutil.rmtree(TMP_DIR, ignore_errors=True)
74 |
75 |
76 | if __name__ == '__main__':
77 |
78 |     parser = ArgumentParser()
79 |     parser.add_argument("-maxlen", "--max_path_length", dest="max_path_length", required=False, default=8)
80 |     parser.add_argument("-maxwidth", "--max_path_width", dest="max_path_width", required=False, default=2)
81 |     parser.add_argument("-threads", "--num_threads", dest="num_threads", required=False, default=64)
82 |     parser.add_argument("--csproj", dest="csproj", required=True)
83 |     parser.add_argument("-dir", "--dir", dest="dir", required=False)
84 |     parser.add_argument("-ofile_name", "--ofile_name", dest="ofile_name", required=True)
85 |     args = parser.parse_args()
86 |
87 |     if args.dir is not None:
88 |         subdirs = get_immediate_subdirectories(args.dir)
89 |         to_extract = subdirs
90 |         if len(subdirs) == 0:
91 |             to_extract = [args.dir.rstrip('/')]
92 |         ExtractFeaturesForDirsList(args, to_extract)
93 |
-------------------------------------------------------------------------------- /code2seq_master/Input.java: --------------------------------------------------------------------------------
1 | boolean f(Set<String> set, String value) {
2 |     for (String entry : set) {
3 |         if (entry.equalsIgnoreCase(value)) {
4 |             return true;
5 |         }
6 |     }
7 |     return false;
8 | }
-------------------------------------------------------------------------------- /code2seq_master/JavaExtractor/JPredict/.classpath: --------------------------------------------------------------------------------
(28-line Eclipse .classpath file; its XML entries were stripped during extraction and are not recoverable)
-------------------------------------------------------------------------------- /code2seq_master/JavaExtractor/JPredict/.gitignore: --------------------------------------------------------------------------------
1 | /target/
-------------------------------------------------------------------------------- /code2seq_master/JavaExtractor/JPredict/src/main/java/JavaExtractor/App.java: --------------------------------------------------------------------------------
1 | package JavaExtractor;
2 |
3 | import JavaExtractor.Common.CommandLineValues;
4 | import org.kohsuke.args4j.CmdLineException;
5 |
6 | import java.io.IOException;
7 | import java.nio.file.Files;
8 | import java.nio.file.Paths;
9 | import java.util.LinkedList;
10 | import java.util.List;
11 | import java.util.concurrent.ExecutionException;
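// NOTE: App is the entry point that the JavaExtractor extract.py (further below) launches,
// roughly as (jar name illustrative):
//   java -cp JavaExtractor.jar JavaExtractor.App --max_path_length 8 --max_path_width 2 --dir ./java_files --num_threads 64
// A --file argument can be passed instead of --dir to process a single file.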
12 | import java.util.concurrent.Executors;
13 | import java.util.concurrent.Future;
14 | import java.util.concurrent.ThreadPoolExecutor;
15 |
16 | public class App {
17 |     private static CommandLineValues s_CommandLineValues;
18 |
19 |     public static void main(String[] args) {
20 |         try {
21 |             s_CommandLineValues = new CommandLineValues(args);
22 |         } catch (CmdLineException e) {
23 |             e.printStackTrace();
24 |             return;
25 |         }
26 |
27 |         if (s_CommandLineValues.File != null) {
28 |             ExtractFeaturesTask extractFeaturesTask = new ExtractFeaturesTask(s_CommandLineValues,
29 |                     s_CommandLineValues.File.toPath());
30 |             extractFeaturesTask.processFile();
31 |         } else if (s_CommandLineValues.Dir != null) {
32 |             extractDir();
33 |         }
34 |     }
35 |
36 |     private static void extractDir() {
37 |         ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(s_CommandLineValues.NumThreads);
38 |         LinkedList<ExtractFeaturesTask> tasks = new LinkedList<>();
39 |         try {
40 |             Files.walk(Paths.get(s_CommandLineValues.Dir)).filter(Files::isRegularFile)
41 |                     .filter(p -> p.toString().toLowerCase().endsWith(".java")).forEach(f -> {
42 |                 ExtractFeaturesTask task = new ExtractFeaturesTask(s_CommandLineValues, f);
43 |                 tasks.add(task);
44 |             });
45 |         } catch (IOException e) {
46 |             e.printStackTrace();
47 |             return;
48 |         }
49 |         List<Future<Void>> tasksResults = null;
50 |         try {
51 |             tasksResults = executor.invokeAll(tasks);
52 |         } catch (InterruptedException e) {
53 |             e.printStackTrace();
54 |         } finally {
55 |             executor.shutdown();
56 |         }
57 |         tasksResults.forEach(f -> {
58 |             try {
59 |                 f.get();
60 |             } catch (InterruptedException | ExecutionException e) {
61 |                 e.printStackTrace();
62 |             }
63 |         });
64 |     }
65 | }
66 |
-------------------------------------------------------------------------------- /code2seq_master/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/CommandLineValues.java: --------------------------------------------------------------------------------
1 | package JavaExtractor.Common;
2 |
3 | import org.kohsuke.args4j.CmdLineException;
4 | import org.kohsuke.args4j.CmdLineParser;
5 | import org.kohsuke.args4j.Option;
6 |
7 | import java.io.File;
8 |
9 | /**
10 |  * This class handles the program's arguments.
11 |  */
12 | public class CommandLineValues {
13 |     @Option(name = "--file", required = false)
14 |     public File File = null;
15 |
16 |     @Option(name = "--dir", required = false, forbids = "--file")
17 |     public String Dir = null;
18 |
19 |     @Option(name = "--max_path_length", required = true)
20 |     public int MaxPathLength;
21 |
22 |     @Option(name = "--max_path_width", required = true)
23 |     public int MaxPathWidth;
24 |
25 |     @Option(name = "--num_threads", required = false)
26 |     public int NumThreads = 64;
27 |
28 |     @Option(name = "--min_code_len", required = false)
29 |     public int MinCodeLength = 1;
30 |
31 |     @Option(name = "--max_code_len", required = false)
32 |     public int MaxCodeLength = -1;
33 |
34 |     @Option(name = "--max_file_len", required = false)
35 |     public int MaxFileLength = -1;
36 |
37 |     @Option(name = "--pretty_print", required = false)
38 |     public boolean PrettyPrint = false;
39 |
40 |     @Option(name = "--max_child_id", required = false)
41 |     public int MaxChildId = 3;
42 |
43 |     public CommandLineValues(String... args) throws CmdLineException {
44 |         CmdLineParser parser = new CmdLineParser(this);
45 |         try {
46 |             parser.parseArgument(args);
47 |         } catch (CmdLineException e) {
48 |             System.err.println(e.getMessage());
49 |             parser.printUsage(System.err);
50 |             throw e;
51 |         }
52 |     }
53 |
54 |     public CommandLineValues() {
55 |
56 |     }
57 | }
-------------------------------------------------------------------------------- /code2seq_master/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/Common.java: --------------------------------------------------------------------------------
1 | package JavaExtractor.Common;
2 |
3 | import JavaExtractor.FeaturesEntities.Property;
4 | import com.github.javaparser.ast.Node;
5 | import com.github.javaparser.ast.UserDataKey;
6 |
7 | import java.util.ArrayList;
8 | import java.util.stream.Collectors;
9 | import java.util.stream.Stream;
10 |
11 | public final class Common {
12 |     public static final UserDataKey<Property> PropertyKey = new UserDataKey<Property>() {
13 |     };
14 |     public static final UserDataKey<Integer> ChildId = new UserDataKey<Integer>() {
15 |     };
16 |     public static final String EmptyString = "";
17 |
18 |     public static final String MethodDeclaration = "MethodDeclaration";
19 |     public static final String NameExpr = "NameExpr";
20 |     public static final String BlankWord = "BLANK";
21 |
22 |     public static final int c_MaxLabelLength = 50;
23 |     public static final String methodName = "METHOD_NAME";
24 |     public static final String internalSeparator = "|";
25 |
26 |     public static String normalizeName(String original, String defaultString) {
27 |         original = original.toLowerCase().replaceAll("\\\\n", "") // escaped new
28 |                 // lines
29 |                 .replaceAll("\\s+", "") // whitespaces
30 |                 .replaceAll("[\"',]", "") // quotes, apostrophes, commas
31 |                 .replaceAll("\\P{Print}", ""); // unicode weird characters
32 |         String stripped = original.replaceAll("[^A-Za-z]", "");
33 |         if (stripped.length() == 0) {
34 |             String carefulStripped = original.replaceAll(" ", "_");
35 |             if (carefulStripped.length() == 0) {
36 |                 return defaultString;
37 |             } else {
38 |                 return carefulStripped;
39 |             }
40 |         } else {
41 |             return stripped;
42 |         }
43 |     }
44 |
45 |     public static boolean isMethod(Node node, String type) {
46 |         Property parentProperty = node.getParentNode().getUserData(Common.PropertyKey);
47 |         if (parentProperty == null) {
48 |             return false;
49 |         }
50 |
51 |         String parentType = parentProperty.getType();
52 |         return Common.NameExpr.equals(type) && Common.MethodDeclaration.equals(parentType);
53 |     }
54 |
55 |     public static ArrayList<String> splitToSubtokens(String str1) {
56 |         String str2 = str1.replace("|", " ");
57 |         String str3 = str2.trim();
58 |         return Stream.of(str3.split("(?<=[a-z])(?=[A-Z])|_|[0-9]|(?<=[A-Z])(?=[A-Z][a-z])|\\s+"))
59 |                 .filter(s -> s.length() > 0).map(s -> Common.normalizeName(s, Common.EmptyString))
60 |                 .filter(s -> s.length() > 0).collect(Collectors.toCollection(ArrayList::new));
61 |     }
62 | }
-------------------------------------------------------------------------------- /code2seq_master/JavaExtractor/JPredict/src/main/java/JavaExtractor/Common/MethodContent.java: --------------------------------------------------------------------------------
1 | package JavaExtractor.Common;
2 |
3 | import com.github.javaparser.ast.Node;
4 |
5 | import java.util.ArrayList;
6 |
7 | public class MethodContent {
8 |     private final ArrayList<Node> leaves;
9 |     private final String name;
10 |
11 |     public MethodContent(ArrayList<Node> leaves, String name) {
12 |         this.leaves = leaves;
13 |         this.name = name;
14 |     }
15 |
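    // NOTE: A MethodContent pairs a method's AST leaf nodes with its subtokenized name,
    // which becomes the prediction label: FunctionVisitor joins the subtokens with '|'
    // (Common.internalSeparator), so a method named setMaxLength is stored as "set|max|length".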
16 |     public ArrayList<Node> getLeaves() {
17 |         return leaves;
18 |     }
19 |
20 |     public String getName() {
21 |         return name;
22 |     }
23 | }
24 |
-------------------------------------------------------------------------------- /code2seq_master/JavaExtractor/JPredict/src/main/java/JavaExtractor/ExtractFeaturesTask.java: --------------------------------------------------------------------------------
1 | package JavaExtractor;
2 |
3 | import JavaExtractor.Common.CommandLineValues;
4 | import JavaExtractor.Common.Common;
5 | import JavaExtractor.FeaturesEntities.ProgramFeatures;
6 | import org.apache.commons.lang3.StringUtils;
7 |
8 | import java.io.IOException;
9 | import java.nio.charset.Charset;
10 | import java.nio.file.Files;
11 | import java.nio.file.Path;
12 | import java.util.ArrayList;
13 | import java.util.List;
14 | import java.util.concurrent.Callable;
15 |
16 | class ExtractFeaturesTask implements Callable<Void> {
17 |     private final CommandLineValues m_CommandLineValues;
18 |     private final Path filePath;
19 |
20 |     public ExtractFeaturesTask(CommandLineValues commandLineValues, Path path) {
21 |         m_CommandLineValues = commandLineValues;
22 |         this.filePath = path;
23 |     }
24 |
25 |     @Override
26 |     public Void call() {
27 |         processFile();
28 |         return null;
29 |     }
30 |
31 |     public void processFile() {
32 |         ArrayList<ProgramFeatures> features;
33 |         try {
34 |             features = extractSingleFile();
35 |         } catch (IOException e) {
36 |             e.printStackTrace();
37 |             return;
38 |         }
39 |         if (features == null) {
40 |             return;
41 |         }
42 |
43 |         String toPrint = featuresToString(features);
44 |         if (toPrint.length() > 0) {
45 |             System.out.println(toPrint);
46 |         }
47 |     }
48 |
49 |     private ArrayList<ProgramFeatures> extractSingleFile() throws IOException {
50 |         String code;
51 |
52 |         if (m_CommandLineValues.MaxFileLength > 0 &&
53 |                 Files.lines(filePath, Charset.defaultCharset()).count() > m_CommandLineValues.MaxFileLength) {
54 |             return new ArrayList<>();
55 |         }
56 |         try {
57 |             code = new String(Files.readAllBytes(filePath));
58 |         } catch (IOException e) {
59 |             e.printStackTrace();
60 |             code = Common.EmptyString;
61 |         }
62 |         FeatureExtractor featureExtractor = new FeatureExtractor(m_CommandLineValues);
63 |
64 |         return featureExtractor.extractFeatures(code);
65 |     }
66 |
67 |     public String featuresToString(ArrayList<ProgramFeatures> features) {
68 |         if (features == null || features.isEmpty()) {
69 |             return Common.EmptyString;
70 |         }
71 |
72 |         List<String> methodsOutputs = new ArrayList<>();
73 |
74 |         for (ProgramFeatures singleMethodFeatures : features) {
75 |             StringBuilder builder = new StringBuilder();
76 |
77 |             String toPrint = singleMethodFeatures.toString();
78 |             if (m_CommandLineValues.PrettyPrint) {
79 |                 toPrint = toPrint.replace(" ", "\n\t");
80 |             }
81 |             builder.append(toPrint);
82 |
83 |
84 |             methodsOutputs.add(builder.toString());
85 |
86 |         }
87 |         return StringUtils.join(methodsOutputs, "\n");
88 |     }
89 | }
90 |
-------------------------------------------------------------------------------- /code2seq_master/JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/ProgramFeatures.java: --------------------------------------------------------------------------------
1 | package JavaExtractor.FeaturesEntities;
2 |
3 | import com.fasterxml.jackson.annotation.JsonIgnore;
4 |
5 | import java.util.ArrayList;
6 | import java.util.stream.Collectors;
7 |
8 | public class ProgramFeatures {
9 |     private final String name;
10 |
11 |     private final ArrayList<ProgramRelation> features = new ArrayList<>();
12 |
13 |     public ProgramFeatures(String name) {
14 |         this.name = name;
15 |     }
16 |
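    // NOTE: toString below renders each method as one whitespace-separated line: the label
    // followed by its path contexts, each formatted by ProgramRelation.toString as
    // "source,path,target" (illustrative shape: set|name leaf1,path,leaf2 leaf1,path,leaf3 ...).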
17 |     @SuppressWarnings("StringBufferReplaceableByString")
18 |     @Override
19 |     public String toString() {
20 |         StringBuilder stringBuilder = new StringBuilder();
21 |         stringBuilder.append(name).append(" ");
22 |         stringBuilder.append(features.stream().map(ProgramRelation::toString).collect(Collectors.joining(" ")));
23 |
24 |         return stringBuilder.toString();
25 |     }
26 |
27 |     public void addFeature(Property source, String path, Property target) {
28 |         ProgramRelation newRelation = new ProgramRelation(source, target, path);
29 |         features.add(newRelation);
30 |     }
31 |
32 |     @JsonIgnore
33 |     public boolean isEmpty() {
34 |         return features.isEmpty();
35 |     }
36 | }
37 |
-------------------------------------------------------------------------------- /code2seq_master/JavaExtractor/JPredict/src/main/java/JavaExtractor/FeaturesEntities/ProgramRelation.java: --------------------------------------------------------------------------------
1 | package JavaExtractor.FeaturesEntities;
2 |
3 | public class ProgramRelation {
4 |     private final Property m_Source;
5 |     private final Property m_Target;
6 |     private final String m_Path;
7 |
8 |     public ProgramRelation(Property sourceName, Property targetName, String path) {
9 |         m_Source = sourceName;
10 |         m_Target = targetName;
11 |         m_Path = path;
12 |     }
13 |
14 |     public String toString() {
15 |         return String.format("%s,%s,%s", m_Source.getName(), m_Path,
16 |                 m_Target.getName());
17 |     }
18 | }
-------------------------------------------------------------------------------- /code2seq_master/JavaExtractor/JPredict/src/main/java/JavaExtractor/Visitors/FunctionVisitor.java: --------------------------------------------------------------------------------
1 | package JavaExtractor.Visitors;
2 |
3 | import JavaExtractor.Common.CommandLineValues;
4 | import JavaExtractor.Common.Common;
5 | import JavaExtractor.Common.MethodContent;
6 | import com.github.javaparser.ast.Node;
7 | import com.github.javaparser.ast.body.MethodDeclaration;
8 | import com.github.javaparser.ast.visitor.VoidVisitorAdapter;
9 |
10 | import java.util.ArrayList;
11 | import java.util.Arrays;
12 |
13 | @SuppressWarnings("StringEquality")
14 | public class FunctionVisitor extends VoidVisitorAdapter<Object> {
15 |     private final ArrayList<MethodContent> m_Methods = new ArrayList<>();
16 |     private final CommandLineValues m_CommandLineValues;
17 |
18 |     public FunctionVisitor(CommandLineValues commandLineValues) {
19 |         this.m_CommandLineValues = commandLineValues;
20 |     }
21 |
22 |     @Override
23 |     public void visit(MethodDeclaration node, Object arg) {
24 |         visitMethod(node);
25 |
26 |         super.visit(node, arg);
27 |     }
28 |
29 |     private void visitMethod(MethodDeclaration node) {
30 |         LeavesCollectorVisitor leavesCollectorVisitor = new LeavesCollectorVisitor();
31 |         leavesCollectorVisitor.visitDepthFirst(node);
32 |         ArrayList<Node> leaves = leavesCollectorVisitor.getLeaves();
33 |
34 |         String normalizedMethodName = Common.normalizeName(node.getName(), Common.BlankWord);
35 |         ArrayList<String> splitNameParts = Common.splitToSubtokens(node.getName());
36 |         String splitName = normalizedMethodName;
37 |         if (splitNameParts.size() > 0) {
38 |             splitName = String.join(Common.internalSeparator, splitNameParts);
39 |         }
40 |
41 |         if (node.getBody() != null) {
42 |             long methodLength = getMethodLength(node.getBody().toString());
43 |             if (m_CommandLineValues.MaxCodeLength > 0) {
44 |                 if (methodLength >= m_CommandLineValues.MinCodeLength && methodLength <= m_CommandLineValues.MaxCodeLength) {
45 |                     m_Methods.add(new MethodContent(leaves, splitName));
46 |                 }
47 |             } else {
48 |                 m_Methods.add(new MethodContent(leaves, splitName));
49 |             }
50 |         }
51 |     }
52 |
53 |     private long getMethodLength(String code) {
54 |         String cleanCode = code.replaceAll("\r\n", "\n").replaceAll("\t", " ");
55 |         if (cleanCode.startsWith("{\n"))
56 |             cleanCode = cleanCode.substring(3).trim();
57 |         if (cleanCode.endsWith("\n}"))
58 |             cleanCode = cleanCode.substring(0, cleanCode.length() - 2).trim();
59 |         if (cleanCode.length() == 0) {
60 |             return 0;
61 |         }
62 |         return Arrays.stream(cleanCode.split("\n"))
63 |                 .filter(line -> (line.trim() != "{" && line.trim() != "}" && line.trim() != ""))
64 |                 .filter(line -> !line.trim().startsWith("/") && !line.trim().startsWith("*")).count();
65 |     }
66 |
67 |     public ArrayList<MethodContent> getMethodContents() {
68 |         return m_Methods;
69 |     }
70 | }
71 |
-------------------------------------------------------------------------------- /code2seq_master/JavaExtractor/JPredict/src/main/java/JavaExtractor/Visitors/LeavesCollectorVisitor.java: --------------------------------------------------------------------------------
1 | package JavaExtractor.Visitors;
2 |
3 | import JavaExtractor.Common.Common;
4 | import JavaExtractor.FeaturesEntities.Property;
5 | import com.github.javaparser.ast.Node;
6 | import com.github.javaparser.ast.comments.Comment;
7 | import com.github.javaparser.ast.expr.NullLiteralExpr;
8 | import com.github.javaparser.ast.stmt.Statement;
9 | import com.github.javaparser.ast.type.ClassOrInterfaceType;
10 | import com.github.javaparser.ast.visitor.TreeVisitor;
11 |
12 | import java.util.ArrayList;
13 | import java.util.List;
14 |
15 | public class LeavesCollectorVisitor extends TreeVisitor {
16 |     private final ArrayList<Node> m_Leaves = new ArrayList<>();
17 |
18 |     @Override
19 |     public void process(Node node) {
20 |         if (node instanceof Comment) {
21 |             return;
22 |         }
23 |         boolean isLeaf = false;
24 |         boolean isGenericParent = isGenericParent(node);
25 |         if (hasNoChildren(node) && isNotComment(node)) {
26 |             if (!node.toString().isEmpty() && (!"null".equals(node.toString()) || (node instanceof NullLiteralExpr))) {
27 |                 m_Leaves.add(node);
28 |                 isLeaf = true;
29 |             }
30 |         }
31 |
32 |         int childId = getChildId(node);
33 |         node.setUserData(Common.ChildId, childId);
34 |         Property property = new Property(node, isLeaf, isGenericParent);
35 |         node.setUserData(Common.PropertyKey, property);
36 |     }
37 |
38 |     private boolean isGenericParent(Node node) {
39 |         return (node instanceof ClassOrInterfaceType)
40 |                 && ((ClassOrInterfaceType) node).getTypeArguments() != null
41 |                 && ((ClassOrInterfaceType) node).getTypeArguments().size() > 0;
42 |     }
43 |
44 |     private boolean hasNoChildren(Node node) {
45 |         return node.getChildrenNodes().size() == 0;
46 |     }
47 |
48 |     private boolean isNotComment(Node node) {
49 |         return !(node instanceof Comment) && !(node instanceof Statement);
50 |     }
51 |
52 |     public ArrayList<Node> getLeaves() {
53 |         return m_Leaves;
54 |     }
55 |
56 |     private int getChildId(Node node) {
57 |         Node parent = node.getParentNode();
58 |         List<Node> parentsChildren = parent.getChildrenNodes();
59 |         int childId = 0;
60 |         for (Node child : parentsChildren) {
61 |             if (child.getRange().equals(node.getRange())) {
62 |                 return childId;
63 |             }
64 |             childId++;
65 |         }
66 |         return childId;
67 |     }
68 | }
69 |
-------------------------------------------------------------------------------- /code2seq_master/JavaExtractor/JPredict/src/main/java/Test.java: --------------------------------------------------------------------------------
1 | class Test {
2 |     void fooBar() {
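        // NOTE: Minimal smoke-test input for the extractor; the URL literal exercises
        // Common.normalizeName, which lowercases and strips non-alphabetic characters
        // (so this literal would normalize to "httpgithubcom").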
System.out.println("http://github.com"); 4 | } 5 | } -------------------------------------------------------------------------------- /code2seq_master/JavaExtractor/extract.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import itertools 4 | import multiprocessing 5 | import os 6 | import shutil 7 | import subprocess 8 | import sys 9 | from argparse import ArgumentParser 10 | from threading import Timer 11 | 12 | 13 | def get_immediate_subdirectories(a_dir): 14 | return [(os.path.join(a_dir, name)) for name in os.listdir(a_dir) 15 | if os.path.isdir(os.path.join(a_dir, name))] 16 | 17 | 18 | TMP_DIR = "" 19 | 20 | 21 | def ParallelExtractDir(args, dir): 22 | ExtractFeaturesForDir(args, dir, "") 23 | 24 | 25 | def ExtractFeaturesForDir(args, dir, prefix): 26 | command = ['java', '-Xmx100g', '-XX:MaxNewSize=60g', '-cp', args.jar, 'JavaExtractor.App', 27 | '--max_path_length', str(args.max_path_length), '--max_path_width', str(args.max_path_width), 28 | '--dir', dir, '--num_threads', str(args.num_threads)] 29 | 30 | # print command 31 | # os.system(command) 32 | kill = lambda process: process.kill() 33 | outputFileName = TMP_DIR + prefix + dir.split('/')[-1] 34 | failed = False 35 | with open(outputFileName, 'a') as outputFile: 36 | sleeper = subprocess.Popen(command, stdout=outputFile, stderr=subprocess.PIPE) 37 | timer = Timer(60 * 60, kill, [sleeper]) 38 | 39 | try: 40 | timer.start() 41 | stdout, stderr = sleeper.communicate() 42 | finally: 43 | timer.cancel() 44 | 45 | if sleeper.poll() == 0: 46 | if len(stderr) > 0: 47 | print(stderr, file=sys.stderr) 48 | else: 49 | print('dir: ' + str(dir) + ' was not completed in time', file=sys.stderr) 50 | failed = True 51 | subdirs = get_immediate_subdirectories(dir) 52 | for subdir in subdirs: 53 | ExtractFeaturesForDir(args, subdir, prefix + dir.split('/')[-1] + '_') 54 | if failed: 55 | if os.path.exists(outputFileName): 56 | os.remove(outputFileName) 57 | 58 | 59 | def ExtractFeaturesForDirsList(args, dirs): 60 | global TMP_DIR 61 | TMP_DIR = "./tmp/feature_extractor%d/" % (os.getpid()) 62 | if os.path.exists(TMP_DIR): 63 | shutil.rmtree(TMP_DIR, ignore_errors=True) 64 | os.makedirs(TMP_DIR) 65 | try: 66 | p = multiprocessing.Pool(6) 67 | p.starmap(ParallelExtractDir, zip(itertools.repeat(args), dirs)) 68 | # for dir in dirs: 69 | # ExtractFeaturesForDir(args, dir, '') 70 | output_files = os.listdir(TMP_DIR) 71 | for f in output_files: 72 | os.system("cat %s/%s" % (TMP_DIR, f)) 73 | finally: 74 | shutil.rmtree(TMP_DIR, ignore_errors=True) 75 | 76 | 77 | if __name__ == '__main__': 78 | parser = ArgumentParser() 79 | parser.add_argument("-maxlen", "--max_path_length", dest="max_path_length", required=False, default=8) 80 | parser.add_argument("-maxwidth", "--max_path_width", dest="max_path_width", required=False, default=2) 81 | parser.add_argument("-threads", "--num_threads", dest="num_threads", required=False, default=64) 82 | parser.add_argument("-j", "--jar", dest="jar", required=True) 83 | parser.add_argument("-dir", "--dir", dest="dir", required=False) 84 | parser.add_argument("-file", "--file", dest="file", required=False) 85 | args = parser.parse_args() 86 | 87 | if args.file is not None: 88 | command = 'java -cp ' + args.jar + ' JavaExtractor.App --max_path_length ' + \ 89 | str(args.max_path_length) + ' --max_path_width ' + str(args.max_path_width) + ' --file ' + args.file 90 | os.system(command) 91 | elif args.dir is not None: 92 | subdirs = 
get_immediate_subdirectories(args.dir) 93 | # print("Sub Directories") 94 | # print(subdirs) 95 | if len(subdirs) == 0: 96 | subdirs = [args.dir] 97 | ExtractFeaturesForDirsList(args, subdirs) 98 | -------------------------------------------------------------------------------- /code2seq_master/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Technion 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /code2seq_master/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/code2seq_master/__init__.py -------------------------------------------------------------------------------- /code2seq_master/baseline_tokenization/input_example.txt: -------------------------------------------------------------------------------- 1 | requires landscape|boolean (){ return false; } 2 | get parent key|Object (){ return new ContactsUiKey(); } 3 | get parent key|Object (){ return new ContactsUiKey(); } 4 | get layout id|int (){ return R.layout.loose_screen; } 5 | get parent key|Object (){ return new EditContactKey(contactId); } 6 | to contact|Contact (){ return new Contact(id, name, email); } 7 | to string|String (){ return "Welcome!\nClick to continue."; } 8 | get parent key|Object (){ return new EditContactKey(contactId); } 9 | tear down services|void (@NonNull Services services){ } 10 | get layout id|int (){ return R.layout.landscape_screen; } 11 | -------------------------------------------------------------------------------- /code2seq_master/baseline_tokenization/javalang/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from . import parser 3 | from . import parse 4 | from . import tokenizer 5 | from . 
import javadoc 6 | 7 | 8 | __version__ = "0.10.1" 9 | -------------------------------------------------------------------------------- /code2seq_master/baseline_tokenization/javalang/ast.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | import six 4 | 5 | 6 | class MetaNode(type): 7 | def __new__(mcs, name, bases, dict): 8 | attrs = list(dict['attrs']) 9 | dict['attrs'] = list() 10 | 11 | for base in bases: 12 | if hasattr(base, 'attrs'): 13 | dict['attrs'].extend(base.attrs) 14 | 15 | dict['attrs'].extend(attrs) 16 | 17 | return type.__new__(mcs, name, bases, dict) 18 | 19 | 20 | @six.add_metaclass(MetaNode) 21 | class Node(object): 22 | attrs = () 23 | 24 | def __init__(self, **kwargs): 25 | values = kwargs.copy() 26 | 27 | for attr_name in self.attrs: 28 | value = values.pop(attr_name, None) 29 | setattr(self, attr_name, value) 30 | 31 | if values: 32 | raise ValueError('Extraneous arguments') 33 | 34 | def __equals__(self, other): 35 | if type(other) is not type(self): 36 | return False 37 | 38 | for attr in self.attrs: 39 | if getattr(other, attr) != getattr(self, attr): 40 | return False 41 | 42 | return True 43 | 44 | def __repr__(self): 45 | return type(self).__name__ 46 | 47 | def __iter__(self): 48 | return walk_tree(self) 49 | 50 | def filter(self, pattern): 51 | for path, node in self: 52 | if ((isinstance(pattern, type) and isinstance(node, pattern)) or 53 | (node == pattern)): 54 | yield path, node 55 | 56 | @property 57 | def children(self): 58 | return [getattr(self, attr_name) for attr_name in self.attrs] 59 | 60 | def walk_tree(root): 61 | children = None 62 | 63 | if isinstance(root, Node): 64 | yield (), root 65 | children = root.children 66 | else: 67 | children = root 68 | 69 | for child in children: 70 | if isinstance(child, (Node, list, tuple)): 71 | for path, node in walk_tree(child): 72 | yield (root,) + path, node 73 | 74 | def dump(ast, file): 75 | pickle.dump(ast, file) 76 | 77 | def load(file): 78 | return pickle.load(file) 79 | -------------------------------------------------------------------------------- /code2seq_master/baseline_tokenization/javalang/javadoc.py: -------------------------------------------------------------------------------- 1 | 2 | import re 3 | 4 | def join(s): 5 | return ' '.join(l.strip() for l in s.split('\n')) 6 | 7 | class DocBlock(object): 8 | def __init__(self): 9 | self.description = '' 10 | self.return_doc = None 11 | self.params = [] 12 | 13 | self.authors = [] 14 | self.deprecated = False 15 | 16 | # @exception and @throw are equivalent 17 | self.throws = {} 18 | self.exceptions = self.throws 19 | 20 | self.tags = {} 21 | 22 | def add_block(self, name, value): 23 | value = value.strip() 24 | 25 | if name == 'param': 26 | try: 27 | param, description = value.split(None, 1) 28 | except ValueError: 29 | param, description = value, '' 30 | self.params.append((param, join(description))) 31 | 32 | elif name in ('throws', 'exception'): 33 | try: 34 | ex, description = value.split(None, 1) 35 | except ValueError: 36 | ex, description = value, '' 37 | self.throws[ex] = join(description) 38 | 39 | elif name == 'return': 40 | self.return_doc = value 41 | 42 | elif name == 'author': 43 | self.authors.append(value) 44 | 45 | elif name == 'deprecated': 46 | self.deprecated = True 47 | 48 | self.tags.setdefault(name, []).append(value) 49 | 50 | blocks_re = re.compile('(^@)', re.MULTILINE) 51 | leading_space_re = re.compile(r'^\s*\*', re.MULTILINE) 52 | blocks_justify_re = 
re.compile(r'^\s*@', re.MULTILINE) 53 | 54 | def _sanitize(s): 55 | s = s.strip() 56 | 57 | if not (s[:3] == '/**' and s[-2:] == '*/'): 58 | raise ValueError('not a valid Javadoc comment') 59 | 60 | s = s.replace('\t', ' ') 61 | 62 | return s 63 | 64 | def _uncomment(s): 65 | # Remove /** and */ 66 | s = s[3:-2].strip() 67 | 68 | return leading_space_re.sub('', s) 69 | 70 | def _get_indent_level(s): 71 | return len(s) - len(s.lstrip()) 72 | 73 | def _left_justify(s): 74 | lines = s.rstrip().splitlines() 75 | 76 | if not lines: 77 | return '' 78 | 79 | indent_levels = [] 80 | for line in lines: 81 | if line.strip(): 82 | indent_levels.append(_get_indent_level(line)) 83 | indent_levels.sort() 84 | 85 | common_indent = indent_levels[0] 86 | if common_indent == 0: 87 | return s 88 | else: 89 | lines = [line[common_indent:] for line in lines] 90 | return '\n'.join(lines) 91 | 92 | def _force_blocks_left(s): 93 | return blocks_justify_re.sub('@', s) 94 | 95 | def parse(raw): 96 | sanitized = _sanitize(raw) 97 | uncommented = _uncomment(sanitized) 98 | justified = _left_justify(uncommented) 99 | justified_fixed = _force_blocks_left(justified) 100 | prepared = justified_fixed 101 | 102 | blocks = blocks_re.split(prepared) 103 | 104 | doc = DocBlock() 105 | 106 | if blocks[0] != '@': 107 | doc.description = blocks[0].strip() 108 | blocks = blocks[2::2] 109 | else: 110 | blocks = blocks[1::2] 111 | 112 | for block in blocks: 113 | try: 114 | tag, value = block.split(None, 1) 115 | except ValueError: 116 | tag, value = block, '' 117 | 118 | doc.add_block(tag, value) 119 | 120 | return doc 121 | -------------------------------------------------------------------------------- /code2seq_master/baseline_tokenization/javalang/parse.py: -------------------------------------------------------------------------------- 1 | 2 | from .parser import Parser 3 | from .tokenizer import tokenize 4 | 5 | def parse_expression(exp): 6 | if not exp.endswith(';'): 7 | exp = exp + ';' 8 | 9 | tokens = tokenize(exp) 10 | parser = Parser(tokens) 11 | 12 | return parser.parse_expression() 13 | 14 | def parse_member_signature(sig): 15 | if not sig.endswith(';'): 16 | sig = sig + ';' 17 | 18 | tokens = tokenize(sig) 19 | parser = Parser(tokens) 20 | 21 | return parser.parse_member_declaration() 22 | 23 | def parse_constructor_signature(sig): 24 | # Add an empty body to the signature, replacing a ; if necessary 25 | if sig.endswith(';'): 26 | sig = sig[:-1] 27 | sig = sig + '{ }' 28 | 29 | tokens = tokenize(sig) 30 | parser = Parser(tokens) 31 | 32 | return parser.parse_member_declaration() 33 | 34 | def parse_type(s): 35 | tokens = tokenize(s) 36 | parser = Parser(tokens) 37 | 38 | return parser.parse_type() 39 | 40 | def parse_type_signature(sig): 41 | if sig.endswith(';'): 42 | sig = sig[:-1] 43 | sig = sig + '{ }' 44 | 45 | tokens = tokenize(sig) 46 | parser = Parser(tokens) 47 | 48 | return parser.parse_class_or_interface_declaration() 49 | 50 | def parse(s): 51 | tokens = tokenize(s) 52 | parser = Parser(tokens) 53 | return parser.parse() 54 | -------------------------------------------------------------------------------- /code2seq_master/baseline_tokenization/javalang/test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/code2seq_master/baseline_tokenization/javalang/test/__init__.py -------------------------------------------------------------------------------- 
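# NOTE: A minimal sketch of how the bundled javalang package is driven (snippet and
# inputs illustrative): parse.parse builds a syntax tree from Java source, and
# javadoc.parse turns a doc comment into a DocBlock with params/return_doc fields.
#
#   import javalang
#   tree = javalang.parse.parse("class A { int f(int x) { return x; } }")
#   doc = javalang.javadoc.parse("/** Adds.\n@param x value\n@return sum */")
#   print(doc.params, doc.return_doc)   # [('x', 'value')] 'sum'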
/code2seq_master/baseline_tokenization/javalang/test/source/package-info/AnnotationJavadoc.java: -------------------------------------------------------------------------------- 1 | @Package 2 | /** 3 | Test that includes java doc first but no annotation 4 | */ 5 | package org.javalang.test; -------------------------------------------------------------------------------- /code2seq_master/baseline_tokenization/javalang/test/source/package-info/AnnotationOnly.java: -------------------------------------------------------------------------------- 1 | @Package 2 | package org.javalang.test; -------------------------------------------------------------------------------- /code2seq_master/baseline_tokenization/javalang/test/source/package-info/JavadocAnnotation.java: -------------------------------------------------------------------------------- 1 | /** 2 | Test that includes java doc first but no annotation 3 | */ 4 | @Package 5 | package org.javalang.test; -------------------------------------------------------------------------------- /code2seq_master/baseline_tokenization/javalang/test/source/package-info/JavadocOnly.java: -------------------------------------------------------------------------------- 1 | /** 2 | Test that includes java doc first but no annotation 3 | */ 4 | package org.javalang.test; -------------------------------------------------------------------------------- /code2seq_master/baseline_tokenization/javalang/test/source/package-info/NoAnnotationNoJavadoc.java: -------------------------------------------------------------------------------- 1 | package org.javalang.test; -------------------------------------------------------------------------------- /code2seq_master/baseline_tokenization/javalang/test/test_javadoc.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from .. import javadoc 4 | 5 | 6 | class TestJavadoc(unittest.TestCase): 7 | def test_empty_comment(self): 8 | javadoc.parse('/** */') 9 | javadoc.parse('/***/') 10 | javadoc.parse('/**\n *\n */') 11 | javadoc.parse('/**\n *\n *\n */') 12 | 13 | if __name__ == "__main__": 14 | unittest.main() 15 | -------------------------------------------------------------------------------- /code2seq_master/baseline_tokenization/javalang/test/test_package_declaration.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from pkg_resources import resource_string 4 | from .. 
import parse 5 | 6 | 7 | # From my reading of the spec (http://docs.oracle.com/javase/specs/jls/se7/html/jls-7.html) the 8 | # allowed order is javadoc, optional annotation, package declaration 9 | class PackageInfo(unittest.TestCase): 10 | def testPackageDeclarationOnly(self): 11 | source_file = "source/package-info/NoAnnotationNoJavadoc.java" 12 | ast = self.get_ast(source_file) 13 | 14 | self.failUnless(ast.package.name == "org.javalang.test") 15 | self.failIf(ast.package.annotations) 16 | self.failIf(ast.package.documentation) 17 | 18 | def testAnnotationOnly(self): 19 | source_file = "source/package-info/AnnotationOnly.java" 20 | ast = self.get_ast(source_file) 21 | 22 | self.failUnless(ast.package.name == "org.javalang.test") 23 | self.failUnless(ast.package.annotations) 24 | self.failIf(ast.package.documentation) 25 | 26 | def testJavadocOnly(self): 27 | source_file = "source/package-info/JavadocOnly.java" 28 | ast = self.get_ast(source_file) 29 | 30 | self.failUnless(ast.package.name == "org.javalang.test") 31 | self.failIf(ast.package.annotations) 32 | self.failUnless(ast.package.documentation) 33 | 34 | def testAnnotationThenJavadoc(self): 35 | source_file = "source/package-info/AnnotationJavadoc.java" 36 | ast = self.get_ast(source_file) 37 | 38 | self.failUnless(ast.package.name == "org.javalang.test") 39 | self.failUnless(ast.package.annotations) 40 | self.failIf(ast.package.documentation) 41 | 42 | def testJavadocThenAnnotation(self): 43 | source_file = "source/package-info/JavadocAnnotation.java" 44 | ast = self.get_ast(source_file) 45 | 46 | self.failUnless(ast.package.name == "org.javalang.test") 47 | self.failUnless(ast.package.annotations) 48 | self.failUnless(ast.package.documentation) 49 | 50 | def get_ast(self, filename): 51 | source = resource_string(__name__, filename) 52 | ast = parse.parse(source) 53 | 54 | return ast 55 | 56 | 57 | def main(): 58 | unittest.main() 59 | 60 | if __name__ == '__main__': 61 | main() 62 | -------------------------------------------------------------------------------- /code2seq_master/baseline_tokenization/javalang/test/test_util.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from ..util import LookAheadIterator 4 | 5 | 6 | class TestLookAheadIterator(unittest.TestCase): 7 | def test_usage(self): 8 | i = LookAheadIterator(list(range(0, 10000))) 9 | 10 | self.assertEqual(next(i), 0) 11 | self.assertEqual(next(i), 1) 12 | self.assertEqual(next(i), 2) 13 | 14 | self.assertEqual(i.last(), 2) 15 | 16 | self.assertEqual(i.look(), 3) 17 | self.assertEqual(i.last(), 3) 18 | 19 | self.assertEqual(i.look(1), 4) 20 | self.assertEqual(i.look(2), 5) 21 | self.assertEqual(i.look(3), 6) 22 | self.assertEqual(i.look(4), 7) 23 | 24 | self.assertEqual(i.last(), 7) 25 | 26 | i.push_marker() 27 | self.assertEqual(next(i), 3) 28 | self.assertEqual(next(i), 4) 29 | self.assertEqual(next(i), 5) 30 | i.pop_marker(True) # reset 31 | 32 | self.assertEqual(i.look(), 3) 33 | self.assertEqual(next(i), 3) 34 | 35 | i.push_marker() #1 36 | self.assertEqual(next(i), 4) 37 | self.assertEqual(next(i), 5) 38 | i.push_marker() #2 39 | self.assertEqual(next(i), 6) 40 | self.assertEqual(next(i), 7) 41 | i.push_marker() #3 42 | self.assertEqual(next(i), 8) 43 | self.assertEqual(next(i), 9) 44 | i.pop_marker(False) #3 45 | self.assertEqual(next(i), 10) 46 | i.pop_marker(True) #2 47 | self.assertEqual(next(i), 6) 48 | self.assertEqual(next(i), 7) 49 | self.assertEqual(next(i), 8) 50 | i.pop_marker(False) #1 51 
| self.assertEqual(next(i), 9) 52 | 53 | try: 54 | with i: 55 | self.assertEqual(next(i), 10) 56 | self.assertEqual(next(i), 11) 57 | raise Exception() 58 | except: 59 | self.assertEqual(next(i), 10) 60 | self.assertEqual(next(i), 11) 61 | 62 | with i: 63 | self.assertEqual(next(i), 12) 64 | self.assertEqual(next(i), 13) 65 | self.assertEqual(next(i), 14) 66 | 67 | 68 | if __name__=="__main__": 69 | unittest.main() 70 | -------------------------------------------------------------------------------- /code2seq_master/baseline_tokenization/javalang/tree.py: -------------------------------------------------------------------------------- 1 | 2 | from .ast import Node 3 | 4 | # ------------------------------------------------------------------------------ 5 | 6 | class CompilationUnit(Node): 7 | attrs = ("package", "imports", "types") 8 | 9 | class Import(Node): 10 | attrs = ("path", "static", "wildcard") 11 | 12 | class Documented(Node): 13 | attrs = ("documentation",) 14 | 15 | class Declaration(Node): 16 | attrs = ("modifiers", "annotations") 17 | 18 | class TypeDeclaration(Declaration, Documented): 19 | attrs = ("name", "body") 20 | 21 | @property 22 | def fields(self): 23 | return [decl for decl in self.body if isinstance(decl, FieldDeclaration)] 24 | 25 | @property 26 | def methods(self): 27 | return [decl for decl in self.body if isinstance(decl, MethodDeclaration)] 28 | 29 | @property 30 | def constructors(self): 31 | return [decl for decl in self.body if isinstance(decl, ConstructorDeclaration)] 32 | 33 | class PackageDeclaration(Declaration, Documented): 34 | attrs = ("name",) 35 | 36 | class ClassDeclaration(TypeDeclaration): 37 | attrs = ("type_parameters", "extends", "implements") 38 | 39 | class EnumDeclaration(TypeDeclaration): 40 | attrs = ("implements",) 41 | 42 | class InterfaceDeclaration(TypeDeclaration): 43 | attrs = ("type_parameters", "extends",) 44 | 45 | class AnnotationDeclaration(TypeDeclaration): 46 | attrs = () 47 | 48 | # ------------------------------------------------------------------------------ 49 | 50 | class Type(Node): 51 | attrs = ("name", "dimensions",) 52 | 53 | class BasicType(Type): 54 | attrs = () 55 | 56 | class ReferenceType(Type): 57 | attrs = ("arguments", "sub_type") 58 | 59 | class TypeArgument(Node): 60 | attrs = ("type", "pattern_type") 61 | 62 | # ------------------------------------------------------------------------------ 63 | 64 | class TypeParameter(Node): 65 | attrs = ("name", "extends") 66 | 67 | # ------------------------------------------------------------------------------ 68 | 69 | class Annotation(Node): 70 | attrs = ("name", "element") 71 | 72 | class ElementValuePair(Node): 73 | attrs = ("name", "value") 74 | 75 | class ElementArrayValue(Node): 76 | attrs = ("values",) 77 | 78 | # ------------------------------------------------------------------------------ 79 | 80 | class Member(Documented): 81 | attrs = () 82 | 83 | class MethodDeclaration(Member, Declaration): 84 | attrs = ("type_parameters", "return_type", "name", "parameters", "throws", "body") 85 | 86 | class FieldDeclaration(Member, Declaration): 87 | attrs = ("type", "declarators") 88 | 89 | class ConstructorDeclaration(Declaration, Documented): 90 | attrs = ("type_parameters", "name", "parameters", "throws", "body") 91 | 92 | # ------------------------------------------------------------------------------ 93 | 94 | class ConstantDeclaration(FieldDeclaration): 95 | attrs = () 96 | 97 | class ArrayInitializer(Node): 98 | attrs = ("initializers",) 99 | 100 | class 
VariableDeclaration(Declaration): 101 | attrs = ("type", "declarators") 102 | 103 | class LocalVariableDeclaration(VariableDeclaration): 104 | attrs = () 105 | 106 | class VariableDeclarator(Node): 107 | attrs = ("name", "dimensions", "initializer") 108 | 109 | class FormalParameter(Declaration): 110 | attrs = ("type", "name", "varargs") 111 | 112 | class InferredFormalParameter(Node): 113 | attrs = ('name',) 114 | 115 | # ------------------------------------------------------------------------------ 116 | 117 | class Statement(Node): 118 | attrs = ("label",) 119 | 120 | class IfStatement(Statement): 121 | attrs = ("condition", "then_statement", "else_statement") 122 | 123 | class WhileStatement(Statement): 124 | attrs = ("condition", "body") 125 | 126 | class DoStatement(Statement): 127 | attrs = ("condition", "body") 128 | 129 | class ForStatement(Statement): 130 | attrs = ("control", "body") 131 | 132 | class AssertStatement(Statement): 133 | attrs = ("condition", "value") 134 | 135 | class BreakStatement(Statement): 136 | attrs = ("goto",) 137 | 138 | class ContinueStatement(Statement): 139 | attrs = ("goto",) 140 | 141 | class ReturnStatement(Statement): 142 | attrs = ("expression",) 143 | 144 | class ThrowStatement(Statement): 145 | attrs = ("expression",) 146 | 147 | class SynchronizedStatement(Statement): 148 | attrs = ("lock", "block") 149 | 150 | class TryStatement(Statement): 151 | attrs = ("resources", "block", "catches", "finally_block") 152 | 153 | class SwitchStatement(Statement): 154 | attrs = ("expression", "cases") 155 | 156 | class BlockStatement(Statement): 157 | attrs = ("statements",) 158 | 159 | class StatementExpression(Statement): 160 | attrs = ("expression",) 161 | 162 | # ------------------------------------------------------------------------------ 163 | 164 | class TryResource(Declaration): 165 | attrs = ("type", "name", "value") 166 | 167 | class CatchClause(Statement): 168 | attrs = ("parameter", "block") 169 | 170 | class CatchClauseParameter(Declaration): 171 | attrs = ("types", "name") 172 | 173 | # ------------------------------------------------------------------------------ 174 | 175 | class SwitchStatementCase(Node): 176 | attrs = ("case", "statements") 177 | 178 | class ForControl(Node): 179 | attrs = ("init", "condition", "update") 180 | 181 | class EnhancedForControl(Node): 182 | attrs = ("var", "iterable") 183 | 184 | # ------------------------------------------------------------------------------ 185 | 186 | class Expression(Node): 187 | attrs = () 188 | 189 | class Assignment(Expression): 190 | attrs = ("expressionl", "value", "type") 191 | 192 | class TernaryExpression(Expression): 193 | attrs = ("condition", "if_true", "if_false") 194 | 195 | class BinaryOperation(Expression): 196 | attrs = ("operator", "operandl", "operandr") 197 | 198 | class Cast(Expression): 199 | attrs = ("type", "expression") 200 | 201 | class MethodReference(Expression): 202 | attrs = ("expression", "method", "type_arguments") 203 | 204 | class LambdaExpression(Expression): 205 | attrs = ('parameters', 'body') 206 | 207 | # ------------------------------------------------------------------------------ 208 | 209 | class Primary(Expression): 210 | attrs = ("prefix_operators", "postfix_operators", "qualifier", "selectors") 211 | 212 | class Literal(Primary): 213 | attrs = ("value",) 214 | 215 | class This(Primary): 216 | attrs = () 217 | 218 | class MemberReference(Primary): 219 | attrs = ("member",) 220 | 221 | class Invocation(Primary): 222 | attrs = ("type_arguments", 
"arguments") 223 | 224 | class ExplicitConstructorInvocation(Invocation): 225 | attrs = () 226 | 227 | class SuperConstructorInvocation(Invocation): 228 | attrs = () 229 | 230 | class MethodInvocation(Invocation): 231 | attrs = ("member",) 232 | 233 | class SuperMethodInvocation(Invocation): 234 | attrs = ("member",) 235 | 236 | class SuperMemberReference(Primary): 237 | attrs = ("member",) 238 | 239 | class ArraySelector(Expression): 240 | attrs = ("index",) 241 | 242 | class ClassReference(Primary): 243 | attrs = ("type",) 244 | 245 | class VoidClassReference(ClassReference): 246 | attrs = () 247 | 248 | # ------------------------------------------------------------------------------ 249 | 250 | class Creator(Primary): 251 | attrs = ("type",) 252 | 253 | class ArrayCreator(Creator): 254 | attrs = ("dimensions", "initializer") 255 | 256 | class ClassCreator(Creator): 257 | attrs = ("constructor_type_arguments", "arguments", "body") 258 | 259 | class InnerClassCreator(Creator): 260 | attrs = ("constructor_type_arguments", "arguments", "body") 261 | 262 | # ------------------------------------------------------------------------------ 263 | 264 | class EnumBody(Node): 265 | attrs = ("constants", "declarations") 266 | 267 | class EnumConstantDeclaration(Declaration, Documented): 268 | attrs = ("name", "arguments", "body") 269 | 270 | class AnnotationMethod(Declaration): 271 | attrs = ("name", "return_type", "dimensions", "default") 272 | 273 | -------------------------------------------------------------------------------- /code2seq_master/baseline_tokenization/javalang/util.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class LookAheadIterator(object): 4 | def __init__(self, iterable): 5 | self.iterable = iter(iterable) 6 | self.look_ahead = list() 7 | self.markers = list() 8 | self.default = None 9 | self.value = None 10 | 11 | def __iter__(self): 12 | return self 13 | 14 | def set_default(self, value): 15 | self.default = value 16 | 17 | def next(self): 18 | return self.__next__() 19 | 20 | def __next__(self): 21 | if self.look_ahead: 22 | self.value = self.look_ahead.pop(0) 23 | else: 24 | self.value = next(self.iterable) 25 | 26 | if self.markers: 27 | self.markers[-1].append(self.value) 28 | 29 | return self.value 30 | 31 | def look(self, i=0): 32 | """ Look ahead of the iterable by some number of values with advancing 33 | past them. 34 | 35 | If the requested look ahead is past the end of the iterable then None is 36 | returned. 37 | 38 | """ 39 | 40 | length = len(self.look_ahead) 41 | 42 | if length <= i: 43 | try: 44 | self.look_ahead.extend([next(self.iterable) 45 | for _ in range(length, i + 1)]) 46 | except StopIteration: 47 | return self.default 48 | 49 | self.value = self.look_ahead[i] 50 | return self.value 51 | 52 | def last(self): 53 | return self.value 54 | 55 | def __enter__(self): 56 | self.push_marker() 57 | return self 58 | 59 | def __exit__(self, exc_type, exc_val, exc_tb): 60 | # Reset the iterator if there was an error 61 | if exc_type or exc_val or exc_tb: 62 | self.pop_marker(True) 63 | else: 64 | self.pop_marker(False) 65 | 66 | def push_marker(self): 67 | """ Push a marker on to the marker stack """ 68 | self.markers.append(list()) 69 | 70 | def pop_marker(self, reset): 71 | """ Pop a marker off of the marker stack. If reset is True then the 72 | iterator will be returned to the state it was in before the 73 | corresponding call to push_marker(). 
74 | 75 | """ 76 | 77 | marker = self.markers.pop() 78 | 79 | if reset: 80 | # Make the values available to be read again 81 | marker.extend(self.look_ahead) 82 | self.look_ahead = marker 83 | elif self.markers: 84 | # Otherwise, reassign the values to the top marker 85 | self.markers[-1].extend(marker) 86 | else: 87 | # If there are not more markers in the stack then discard the values 88 | pass 89 | 90 | class LookAheadListIterator(object): 91 | def __init__(self, iterable): 92 | self.list = list(iterable) 93 | 94 | self.marker = 0 95 | self.saved_markers = [] 96 | 97 | self.default = None 98 | self.value = None 99 | 100 | def __iter__(self): 101 | return self 102 | 103 | def set_default(self, value): 104 | self.default = value 105 | 106 | def next(self): 107 | return self.__next__() 108 | 109 | def __next__(self): 110 | try: 111 | self.value = self.list[self.marker] 112 | self.marker += 1 113 | except IndexError: 114 | raise StopIteration() 115 | 116 | return self.value 117 | 118 | def look(self, i=0): 119 | """ Look ahead of the iterable by some number of values with advancing 120 | past them. 121 | 122 | If the requested look ahead is past the end of the iterable then None is 123 | returned. 124 | 125 | """ 126 | 127 | try: 128 | self.value = self.list[self.marker + i] 129 | except IndexError: 130 | return self.default 131 | 132 | return self.value 133 | 134 | def last(self): 135 | return self.value 136 | 137 | def __enter__(self): 138 | self.push_marker() 139 | return self 140 | 141 | def __exit__(self, exc_type, exc_val, exc_tb): 142 | # Reset the iterator if there was an error 143 | if exc_type or exc_val or exc_tb: 144 | self.pop_marker(True) 145 | else: 146 | self.pop_marker(False) 147 | 148 | def push_marker(self): 149 | """ Push a marker on to the marker stack """ 150 | self.saved_markers.append(self.marker) 151 | 152 | def pop_marker(self, reset): 153 | """ Pop a marker off of the marker stack. If reset is True then the 154 | iterator will be returned to the state it was in before the 155 | corresponding call to push_marker(). 156 | 157 | """ 158 | 159 | saved = self.saved_markers.pop() 160 | 161 | if reset: 162 | self.marker = saved 163 | elif self.saved_markers: 164 | self.saved_markers[-1] = saved 165 | 166 | -------------------------------------------------------------------------------- /code2seq_master/baseline_tokenization/subtokenize_nmt_baseline.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import javalang 4 | import sys 5 | import re 6 | 7 | 8 | modifiers = ['public', 'private', 'protected', 'static'] 9 | 10 | RE_WORDS = re.compile(r''' 11 | # Find words in a string. Order matters! 
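# e.g. splits 'getURLPath2' into ['get', 'URL', 'Path', '2'] (illustrative example)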
12 | [A-Z]+(?=[A-Z][a-z]) | # All upper case before a capitalized word 13 | [A-Z]?[a-z]+ | # Capitalized words / all lower case 14 | [A-Z]+ | # All upper case 15 | \d+ | # Numbers 16 | .+ 17 | ''', re.VERBOSE) 18 | 19 | def split_subtokens(s): 20 | return [subtok for subtok in RE_WORDS.findall(s) if subtok != '_'] 21 | 22 | def tokenizeFile(file_path): 23 | lines = 0 24 | with open(file_path, 'r', encoding="utf-8") as file: 25 | with open(file_path + 'method_names.txt', 'w') as method_names_file: 26 | with open(file_path + 'method_subtokens_content.txt', 'w') as method_contents_file: 27 | for line in file: 28 | lines += 1 29 | line = line.rstrip() 30 | parts = line.split('|', 1) 31 | method_name = parts[0] 32 | method_content = parts[1] 33 | try: 34 | tokens = list(javalang.tokenizer.tokenize(method_content)) 35 | except Exception: 36 | print('ERROR in tokenizing: ' + method_content) 37 | tokens = []  # keep 'tokens' defined so the length check below reports the failure 38 | if len(method_name) > 0 and len(tokens) > 0: 39 | method_names_file.write(method_name + '\n') 40 | method_contents_file.write(' '.join([' '.join(split_subtokens(i.value)) for i in tokens if i.value not in modifiers]) + '\n') 41 | else: 42 | print('ERROR in len of: ' + method_name + ', tokens: ' + str(tokens)) 43 | print(str(lines)) 44 | 45 | 46 | if __name__ == '__main__': 47 | file = sys.argv[1] 48 | tokenizeFile(file) 49 | 50 | 51 | -------------------------------------------------------------------------------- /code2seq_master/code2seq.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | 3 | from config import Config 4 | from interactive_predict import InteractivePredictor 5 | from model import Model 6 | 7 | if __name__ == '__main__': 8 | parser = ArgumentParser() 9 | parser.add_argument("-d", "--data", dest="data_path", 10 | help="path to preprocessed dataset", required=False) 11 | parser.add_argument("-te", "--test", dest="test_path", 12 | help="path to test file", metavar="FILE", required=False) 13 | 14 | parser.add_argument("-s", "--save_prefix", dest="save_path_prefix", 15 | help="path to save file", metavar="FILE", required=False) 16 | parser.add_argument("-l", "--load", dest="load_path", 17 | help="path to saved file", metavar="FILE", required=False) 18 | parser.add_argument('--release', action='store_true', 19 | help='if specified and loading a trained model, release the loaded model for a smaller model ' 20 | 'size.') 21 | parser.add_argument('--predict', action='store_true') 22 | parser.add_argument('--debug', action='store_true') 23 | args = parser.parse_args() 24 | 25 | if args.debug: 26 | config = Config.get_debug_config(args) 27 | else: 28 | config = Config.get_default_config(args) 29 | 30 | model = Model(config) 31 | print('Created model') 32 | if config.TRAIN_PATH: 33 | model.train() 34 | if config.TEST_PATH and not args.data_path: 35 | results, precision, recall, f1 = model.evaluate() 36 | print('Accuracy: ' + str(results)) 37 | print('Precision: ' + str(precision) + ', recall: ' + str(recall) + ', F1: ' + str(f1)) 38 | if args.predict: 39 | print("Under prediction process...") 40 | predictor = InteractivePredictor(config, model) 41 | predictor.predict() 42 | if args.release and args.load_path: 43 | model.evaluate(release=True) 44 | model.close_session() 45 | -------------------------------------------------------------------------------- /code2seq_master/code2seq_ast_extractor.py: -------------------------------------------------------------------------------- 1 | from 
argparse import ArgumentParser 2 | 3 | from config import Config 4 | from interactive_predict import InteractivePredictor 5 | from model import Model 6 | 7 | extract_AST = True 8 | 9 | if __name__ == '__main__': 10 | parser = ArgumentParser() 11 | parser.add_argument("-d", "--data", dest="data_path", 12 | help="path to preprocessed dataset", required=False) 13 | parser.add_argument("-te", "--test", dest="test_path", 14 | help="path to test file", metavar="FILE", required=False) 15 | 16 | parser.add_argument("-s", "--save_prefix", dest="save_path_prefix", 17 | help="path to save file", metavar="FILE", required=False) 18 | parser.add_argument("-l", "--load", dest="load_path", 19 | help="path to saved file", metavar="FILE", required=False) 20 | parser.add_argument('--release', action='store_true', 21 | help='if specified and loading a trained model, release the loaded model for a smaller model ' 22 | 'size.') 23 | parser.add_argument('--predict', action='store_true') 24 | parser.add_argument('--debug', action='store_true') 25 | args = parser.parse_args() 26 | 27 | if args.debug: 28 | config = Config.get_debug_config(args) 29 | else: 30 | config = Config.get_default_config(args) 31 | 32 | model = Model(config) 33 | print('Created model') 34 | if config.TRAIN_PATH: 35 | model.train() 36 | if config.TEST_PATH and not args.data_path: 37 | results, precision, recall, f1 = model.evaluate() 38 | print('Accuracy: ' + str(results)) 39 | print('Precision: ' + str(precision) + ', recall: ' + str(recall) + ', F1: ' + str(f1)) 40 | if args.predict: 41 | print("Under prediction process...") 42 | predictor = InteractivePredictor(config, model) 43 | if extract_AST: 44 | ast_feature_list = predictor.get_ast_paths_for_file() 45 | print(f"AST features for {len(ast_feature_list)} snippets") 46 | print(ast_feature_list) 47 | else: 48 | predictor.predict() 49 | 50 | if args.release and args.load_path: 51 | model.evaluate(release=True) 52 | model.close_session() 53 | -------------------------------------------------------------------------------- /code2seq_master/common.py: -------------------------------------------------------------------------------- 1 | import re 2 | import subprocess 3 | import sys 4 | 5 | 6 | class Common: 7 | internal_delimiter = '|' 8 | SOS = '<S>' 9 | EOS = '</S>' 10 | PAD = '<PAD>' 11 | UNK = '<UNK>' 12 | 13 | @staticmethod 14 | def normalize_word(word): 15 | stripped = re.sub(r'[^a-zA-Z]', '', word) 16 | if len(stripped) == 0: 17 | return word.lower() 18 | else: 19 | return stripped.lower() 20 | 21 | @staticmethod 22 | def load_histogram(path, max_size=None): 23 | histogram = {} 24 | with open(path, 'r') as file: 25 | for line in file.readlines(): 26 | parts = line.split(' ') 27 | if not len(parts) == 2: 28 | continue 29 | histogram[parts[0]] = int(parts[1]) 30 | sorted_histogram = [(k, histogram[k]) for k in sorted(histogram, key=histogram.get, reverse=True)] 31 | return dict(sorted_histogram[:max_size]) 32 | 33 | @staticmethod 34 | def load_vocab_from_dict(word_to_count, add_values=[], max_size=None): 35 | word_to_index, index_to_word = {}, {} 36 | current_index = 0 37 | for value in add_values: 38 | word_to_index[value] = current_index 39 | index_to_word[current_index] = value 40 | current_index += 1 41 | sorted_counts = [(k, word_to_count[k]) for k in sorted(word_to_count, key=word_to_count.get, reverse=True)] 42 | limited_sorted = dict(sorted_counts[:max_size]) 43 | for word, count in limited_sorted.items(): 44 | word_to_index[word] = current_index 45 | index_to_word[current_index] = word 46 | 
current_index += 1 47 | return word_to_index, index_to_word, current_index 48 | 49 | @staticmethod 50 | def binary_to_string(binary_string): 51 | return binary_string.decode("utf-8") 52 | 53 | @staticmethod 54 | def binary_to_string_list(binary_string_list): 55 | return [Common.binary_to_string(w) for w in binary_string_list] 56 | 57 | @staticmethod 58 | def binary_to_string_matrix(binary_string_matrix): 59 | return [Common.binary_to_string_list(l) for l in binary_string_matrix] 60 | 61 | @staticmethod 62 | def binary_to_string_3d(binary_string_tensor): 63 | return [Common.binary_to_string_matrix(l) for l in binary_string_tensor] 64 | 65 | @staticmethod 66 | def legal_method_names_checker(name): 67 | return not name in [Common.UNK, Common.PAD, Common.EOS] 68 | 69 | @staticmethod 70 | def filter_impossible_names(top_words): 71 | result = list(filter(Common.legal_method_names_checker, top_words)) 72 | return result 73 | 74 | @staticmethod 75 | def unique(sequence): 76 | unique = [] 77 | [unique.append(item) for item in sequence if item not in unique] 78 | return unique 79 | 80 | @staticmethod 81 | def parse_results(result, pc_info_dict, topk=5): 82 | prediction_results = {} 83 | results_counter = 0 84 | for single_method in result: 85 | original_name, top_suggestions, top_scores, attention_per_context = list(single_method) 86 | current_method_prediction_results = PredictionResults(original_name) 87 | if attention_per_context is not None: 88 | word_attention_pairs = [(word, attention) for word, attention in 89 | zip(top_suggestions, attention_per_context) if 90 | Common.legal_method_names_checker(word)] 91 | for predicted_word, attention_timestep in word_attention_pairs: 92 | current_timestep_paths = [] 93 | for context, attention in [(key, attention_timestep[key]) for key in 94 | sorted(attention_timestep, key=attention_timestep.get, reverse=True)][ 95 | :topk]: 96 | if context in pc_info_dict: 97 | pc_info = pc_info_dict[context] 98 | current_timestep_paths.append((attention.item(), pc_info)) 99 | 100 | current_method_prediction_results.append_prediction(predicted_word, current_timestep_paths) 101 | else: 102 | for predicted_seq in top_suggestions: 103 | filtered_seq = [word for word in predicted_seq if Common.legal_method_names_checker(word)] 104 | current_method_prediction_results.append_prediction(filtered_seq, None) 105 | 106 | prediction_results[results_counter] = current_method_prediction_results 107 | results_counter += 1 108 | return prediction_results 109 | 110 | @staticmethod 111 | def compute_bleu(ref_file_name, predicted_file_name): 112 | with open(predicted_file_name) as predicted_file: 113 | pipe = subprocess.Popen(["perl", "scripts/multi-bleu.perl", ref_file_name], stdin=predicted_file, 114 | stdout=sys.stdout, stderr=sys.stderr) 115 | 116 | 117 | class PredictionResults: 118 | def __init__(self, original_name): 119 | self.original_name = original_name 120 | self.predictions = list() 121 | 122 | def append_prediction(self, name, current_timestep_paths): 123 | self.predictions.append(SingleTimeStepPrediction(name, current_timestep_paths)) 124 | 125 | class SingleTimeStepPrediction: 126 | def __init__(self, prediction, attention_paths): 127 | self.prediction = prediction 128 | if attention_paths is not None: 129 | paths_with_scores = [] 130 | for attention_score, pc_info in attention_paths: 131 | path_context_dict = {'score': attention_score, 132 | 'path': pc_info.longPath, 133 | 'token1': pc_info.token1, 134 | 'token2': pc_info.token2} 135 | 
paths_with_scores.append(path_context_dict) 136 | self.attention_paths = paths_with_scores 137 | 138 | 139 | class PathContextInformation: 140 | def __init__(self, context): 141 | self.token1 = context['name1'] 142 | self.longPath = context['path'] 143 | self.shortPath = context['shortPath'] 144 | self.token2 = context['name2'] 145 | 146 | def __str__(self): 147 | return '%s,%s,%s' % (self.token1, self.shortPath, self.token2) -------------------------------------------------------------------------------- /code2seq_master/config.py: -------------------------------------------------------------------------------- 1 | class Config: 2 | @staticmethod 3 | def get_default_config(args): 4 | config = Config(args) 5 | config.NUM_EPOCHS = 3000 6 | config.SAVE_EVERY_EPOCHS = 1 7 | config.PATIENCE = 10 8 | config.BATCH_SIZE = 64 #512 9 | config.TEST_BATCH_SIZE = 64 10 | config.READER_NUM_PARALLEL_BATCHES = 1 11 | config.SHUFFLE_BUFFER_SIZE = 10000 12 | config.CSV_BUFFER_SIZE = 100 * 1024 * 1024 # 100 MB 13 | config.MAX_CONTEXTS = 200 14 | config.SUBTOKENS_VOCAB_MAX_SIZE = 190000 15 | config.TARGET_VOCAB_MAX_SIZE = 27000 16 | config.EMBEDDINGS_SIZE = 128 17 | config.RNN_SIZE = 128 * 2 # Two LSTMs to embed paths, each of size 128 18 | config.DECODER_SIZE = 320 19 | config.NUM_DECODER_LAYERS = 1 20 | config.MAX_PATH_LENGTH = 8 + 1 21 | config.MAX_NAME_PARTS = 5 22 | config.MAX_TARGET_PARTS = 6 23 | config.EMBEDDINGS_DROPOUT_KEEP_PROB = 0.75 24 | config.RNN_DROPOUT_KEEP_PROB = 0.5 25 | config.BIRNN = True 26 | config.RANDOM_CONTEXTS = True 27 | config.BEAM_WIDTH = 0 28 | config.USE_MOMENTUM = True 29 | return config 30 | 31 | def take_model_hyperparams_from(self, otherConfig): 32 | self.EMBEDDINGS_SIZE = otherConfig.EMBEDDINGS_SIZE 33 | self.RNN_SIZE = otherConfig.RNN_SIZE 34 | self.DECODER_SIZE = otherConfig.DECODER_SIZE 35 | self.NUM_DECODER_LAYERS = otherConfig.NUM_DECODER_LAYERS 36 | self.BIRNN = otherConfig.BIRNN 37 | if self.DATA_NUM_CONTEXTS <= 0: 38 | self.DATA_NUM_CONTEXTS = otherConfig.DATA_NUM_CONTEXTS 39 | 40 | def __init__(self, args): 41 | self.NUM_EPOCHS = 0 42 | self.SAVE_EVERY_EPOCHS = 0 43 | self.PATIENCE = 0 44 | self.BATCH_SIZE = 0 45 | self.TEST_BATCH_SIZE = 0 46 | self.READER_NUM_PARALLEL_BATCHES = 0 47 | self.SHUFFLE_BUFFER_SIZE = 0 48 | self.CSV_BUFFER_SIZE = None 49 | self.TRAIN_PATH = args.data_path 50 | self.TEST_PATH = args.test_path if args.test_path is not None else '' 51 | self.DATA_NUM_CONTEXTS = 0 52 | self.MAX_CONTEXTS = 0 53 | self.SUBTOKENS_VOCAB_MAX_SIZE = 0 54 | self.TARGET_VOCAB_MAX_SIZE = 0 55 | self.EMBEDDINGS_SIZE = 0 56 | self.RNN_SIZE = 0 57 | self.DECODER_SIZE = 0 58 | self.NUM_DECODER_LAYERS = 0 59 | self.SAVE_PATH = args.save_path_prefix 60 | self.LOAD_PATH = args.load_path 61 | self.MAX_PATH_LENGTH = 0 62 | self.MAX_NAME_PARTS = 0 63 | self.MAX_TARGET_PARTS = 0 64 | self.EMBEDDINGS_DROPOUT_KEEP_PROB = 0 65 | self.RNN_DROPOUT_KEEP_PROB = 0 66 | self.BIRNN = False 67 | self.RANDOM_CONTEXTS = True 68 | self.BEAM_WIDTH = 1 69 | self.USE_MOMENTUM = True 70 | self.RELEASE = args.release 71 | 72 | @staticmethod 73 | def get_debug_config(args): 74 | config = Config(args) 75 | config.NUM_EPOCHS = 3000 76 | config.SAVE_EVERY_EPOCHS = 100 77 | config.PATIENCE = 200 78 | config.BATCH_SIZE = 7 79 | config.TEST_BATCH_SIZE = 7 80 | config.READER_NUM_PARALLEL_BATCHES = 1 81 | config.SHUFFLE_BUFFER_SIZE = 10 82 | config.CSV_BUFFER_SIZE = None 83 | config.MAX_CONTEXTS = 5 84 | config.SUBTOKENS_VOCAB_MAX_SIZE = 190000 85 | config.TARGET_VOCAB_MAX_SIZE = 27000 86 | 
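# Deliberately tiny embedding/RNN/decoder sizes below so a debug run builds and iterates in seconds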
config.EMBEDDINGS_SIZE = 19 87 | config.RNN_SIZE = 10 88 | config.DECODER_SIZE = 11 89 | config.NUM_DECODER_LAYERS = 1 90 | config.MAX_PATH_LENGTH = 8 + 1 91 | config.MAX_NAME_PARTS = 5 92 | config.MAX_TARGET_PARTS = 6 93 | config.EMBEDDINGS_DROPOUT_KEEP_PROB = 1 94 | config.RNN_DROPOUT_KEEP_PROB = 1 95 | config.BIRNN = True 96 | config.RANDOM_CONTEXTS = True 97 | config.BEAM_WIDTH = 0 98 | config.USE_MOMENTUM = False 99 | return config 100 | -------------------------------------------------------------------------------- /code2seq_master/extract_ast.py: -------------------------------------------------------------------------------- 1 | from config import Config 2 | from extractor import Extractor 3 | from argparse import ArgumentParser 4 | 5 | EXTRACTION_API = 'https://po3g2dx2qa.execute-api.us-east-1.amazonaws.com/production/extractmethods' 6 | 7 | def read_file(input_filename): 8 | with open(input_filename, 'r') as file: 9 | return file.readlines() 10 | 11 | 12 | if __name__ == '__main__': 13 | parser = ArgumentParser() 14 | parser.add_argument("-d", "--data", dest="data_path", 15 | help="path to preprocessed dataset", required=False) 16 | parser.add_argument("-te", "--test", dest="test_path", 17 | help="path to test file", metavar="FILE", required=False) 18 | 19 | parser.add_argument("-s", "--save_prefix", dest="save_path_prefix", 20 | help="path to save file", metavar="FILE", required=False) 21 | parser.add_argument("-l", "--load", dest="load_path", 22 | help="path to saved file", metavar="FILE", required=False) 23 | parser.add_argument('--release', action='store_true', 24 | help='if specified and loading a trained model, release the loaded model for a smaller model ' 25 | 'size.') 26 | parser.add_argument('--predict', action='store_true') 27 | parser.add_argument('--debug', action='store_true') 28 | args = parser.parse_args() 29 | 30 | print(f"Args - {args}") 31 | 32 | config = Config.get_default_config(args) 33 | 34 | print(f"Config - {config}") 35 | 36 | path_extractor = Extractor(config, EXTRACTION_API, config.MAX_PATH_LENGTH, max_path_width=2) 37 | 38 | input_filename = 'Input.java' 39 | print('Extraction Begin - AST') 40 | user_input = ' '.join(read_file(input_filename)) 41 | predict_lines, pc_info_dict = path_extractor.extract_paths(user_input) 42 | 43 | 44 | print("*************************** EXTRACTED AST ***************************") 45 | print(predict_lines) 46 | print(pc_info_dict) 47 | 48 | # NOTE: this standalone script has no model instance; to predict on the extracted paths, load a trained model as in code2seq.py / interactive_predict.py. -------------------------------------------------------------------------------- /code2seq_master/extractor.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import requests 4 | 5 | from common import PathContextInformation 6 | 7 | 8 | class Extractor: 9 | def __init__(self, config, extractor_api_url, max_path_length, max_path_width): 10 | self.config = config 11 | self.max_path_length = max_path_length 12 | self.max_path_width = max_path_width 13 | self.extractor_api_url = extractor_api_url 14 | self.bad_characters_table = str.maketrans('', '', '\t\r\n') 15 | 16 | @staticmethod 17 | def post_request(url, code_string): 18 | return requests.post(url, data=json.dumps({"code": code_string, "decompose": True}, separators=(',', ':'))) 19 | 20 | def extract_paths(self, code_string): 21 | # print("Here is the code snippet ---------- ") 22 | # print(code_string) 23 | response = self.post_request(self.extractor_api_url, code_string) 24 | response_array = json.loads(response.text) 25 | 
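# The extraction API returns either a list of extracted methods or an error object; surface errors to the caller: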
if 'errorType' in response_array: 26 | raise ValueError(response.text) 27 | if 'errorMessage' in response_array: 28 | raise TimeoutError(response.text) 29 | pc_info_dict = {} 30 | result = [] 31 | for single_method in response_array: 32 | method_name = single_method['target'] 33 | current_result_line_parts = [method_name] 34 | contexts = single_method['paths'] 35 | # print(f"Number of response paths - {len(contexts)}, and taken till - {self.config.DATA_NUM_CONTEXTS} ") 36 | for context in contexts[:self.config.DATA_NUM_CONTEXTS]: 37 | pc_info = PathContextInformation(context) 38 | current_result_line_parts += [str(pc_info)] 39 | pc_info_dict[(pc_info.token1, pc_info.shortPath, pc_info.token2)] = pc_info 40 | space_padding = ' ' * (self.config.DATA_NUM_CONTEXTS - len(contexts)) 41 | result_line = ' '.join(current_result_line_parts) + space_padding 42 | result.append(result_line) 43 | return result, pc_info_dict 44 | -------------------------------------------------------------------------------- /code2seq_master/images/network.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/code2seq_master/images/network.png -------------------------------------------------------------------------------- /code2seq_master/init.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/code2seq_master/init.py -------------------------------------------------------------------------------- /code2seq_master/interactive_predict.py: -------------------------------------------------------------------------------- 1 | from common import Common 2 | from extractor import Extractor 3 | import json 4 | import time 5 | 6 | SHOW_TOP_CONTEXTS = 10 7 | MAX_PATH_LENGTH = 8 8 | MAX_PATH_WIDTH = 2 9 | EXTRACTION_API = 'https://po3g2dx2qa.execute-api.us-east-1.amazonaws.com/production/extractmethods' 10 | 11 | 12 | class InteractivePredictor: 13 | exit_keywords = ['exit', 'quit', 'q'] 14 | test_extractor = True 15 | 16 | def __init__(self, config, model): 17 | model.predict([]) 18 | self.model = model 19 | self.config = config 20 | self.path_extractor = Extractor(config, EXTRACTION_API, self.config.MAX_PATH_LENGTH, max_path_width=2) 21 | 22 | @staticmethod 23 | def read_file(input_filename): 24 | with open(input_filename, 'r') as file: 25 | return file.readlines() 26 | 27 | 28 | def read_raw_code_data(self, data_file, take=1): 29 | raw_data_snippets = [] 30 | lines = self.read_file(data_file) 31 | for line in lines[:take]: 32 | raw_data_snippets.append(json.loads(line)["code"]) 33 | return raw_data_snippets 34 | 35 | def get_ast_paths_for_file(self): 36 | data_file = '/Users/navinLR/Desktop/ML_for_SE/AutoComments/data/test.json' 37 | print("Begin Extraction") 38 | ast_feature_list = [] 39 | raw_data_snippets = self.read_raw_code_data(data_file) 40 | for ind, snippet in enumerate(raw_data_snippets): 41 | predict_lines = self.get_ast_path_for_snippet(snippet) 42 | # print(f"Extracted AST for Snippet - {ind}") 43 | # print(f"AST Size - {len(predict_lines)}") 44 | ast_feature_list.extend(predict_lines) 45 | return ast_feature_list 46 | 47 | def get_ast_path_for_snippet(self, snippet): 48 | time.sleep(2) 49 | predict_lines, pc_info_dict = self.path_extractor.extract_paths(snippet) 50 | return predict_lines 51 | 52 | def predict(self): 53 | input_filename = 'Input.java' 54 | 
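# Interactive loop: re-read Input.java after each keypress, extract its AST paths, and print the model's predictions.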
print('Serving') 55 | while True: 56 | print('Modify the file: "' + input_filename + '" and press any key when ready, or "q" / "exit" to exit') 57 | user_input = input() 58 | if user_input.lower() in self.exit_keywords: 59 | print('Exiting...') 60 | return 61 | user_input = ' '.join(self.read_file(input_filename)) 62 | try: 63 | predict_lines, pc_info_dict = self.path_extractor.extract_paths(user_input) 64 | except ValueError: 65 | continue 66 | 67 | # Navin's modification: test with pre-extracted ASTs instead of the live extractor 68 | if self.test_extractor: 69 | print("Testing Extracted ASTs") 70 | predict_lines = [self.read_file("../data/auto_comment_dataset/auto_comment_dataset.test.c2s")[1].replace("\n","")] # Take the first line 71 | 72 | # print(f"Path Extractor o/p - \n {predict_lines}") 73 | 74 | model_results = self.model.predict(predict_lines) 75 | 76 | # print(f"Model results -------- \n{model_results}") 77 | 78 | prediction_results = Common.parse_results(model_results, pc_info_dict, topk=SHOW_TOP_CONTEXTS) 79 | for index, method_prediction in prediction_results.items(): 80 | print('Original name:\t' + method_prediction.original_name) 81 | if self.config.BEAM_WIDTH == 0: 82 | print('Predicted:\t%s' % [step.prediction for step in method_prediction.predictions]) 83 | for timestep, single_timestep_prediction in enumerate(method_prediction.predictions): 84 | print('Attention:') 85 | print('TIMESTEP: %d\t: %s' % (timestep, single_timestep_prediction.prediction)) 86 | for attention_obj in single_timestep_prediction.attention_paths: 87 | print('%f\tcontext: %s,%s,%s' % ( 88 | attention_obj['score'], attention_obj['token1'], attention_obj['path'], 89 | attention_obj['token2'])) 90 | else: 91 | print('Predicted:') 92 | for predicted_seq in method_prediction.predictions: 93 | print('\t%s' % predicted_seq.prediction) 94 | -------------------------------------------------------------------------------- /code2seq_master/java_files_creator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | 4 | base_folder = "java_code_data/" 5 | sub_folder = "data" 6 | 7 | dataset = "data/test.csv" 8 | 9 | def read_file(input_filename): 10 | with open(input_filename, 'r') as file: 11 | return file.readlines() 12 | 13 | with open(dataset) as csv_file: 14 | csv_reader = csv.reader(csv_file, delimiter=',') 15 | for index, row in enumerate(csv_reader): 16 | print(f"Writing Java snippet no. {index}") 17 | to_write_path = base_folder+sub_folder+str(index) 18 | os.mkdir(to_write_path) 19 | code = row[0] 20 | f = open(to_write_path+"/Input.java", "w") 21 | f.write(code) 22 | f.close() 23 | -------------------------------------------------------------------------------- /code2seq_master/preprocess.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from argparse import ArgumentParser 3 | 4 | import numpy as np 5 | 6 | import common 7 | 8 | ''' 9 | This script preprocesses the data from MethodPaths. It truncates methods with too many contexts, 10 | and pads methods that have fewer paths with spaces. 
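For example, with max_data_contexts = 1000, a method with 1,200 contexts is randomly downsampled to 1,000, while a method with 800 contexts is right-padded with spaces up to the fixed width.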
11 | ''' 12 | 13 | 14 | def save_dictionaries(dataset_name, subtoken_to_count, node_to_count, target_to_count, max_contexts, num_examples): 15 | save_dict_file_path = '{}.dict.c2s'.format(dataset_name) 16 | with open(save_dict_file_path, 'wb') as file: 17 | pickle.dump(subtoken_to_count, file) 18 | pickle.dump(node_to_count, file) 19 | pickle.dump(target_to_count, file) 20 | pickle.dump(max_contexts, file) 21 | pickle.dump(num_examples, file) 22 | print('Dictionaries saved to: {}'.format(save_dict_file_path)) 23 | 24 | 25 | def process_file(file_path, data_file_role, dataset_name, max_contexts, max_data_contexts): 26 | sum_total = 0 27 | sum_sampled = 0 28 | total = 0 29 | max_unfiltered = 0 30 | max_contexts_to_sample = max_data_contexts if data_file_role == 'train' else max_contexts 31 | output_path = '{}.{}.c2s'.format(dataset_name, data_file_role) 32 | with open(output_path, 'w') as outfile: 33 | with open(file_path, 'r') as file: 34 | for line in file: 35 | parts = line.rstrip('\n').split(' ') 36 | target_name = parts[0] 37 | contexts = parts[1:] 38 | 39 | if len(contexts) > max_unfiltered: 40 | max_unfiltered = len(contexts) 41 | 42 | sum_total += len(contexts) 43 | if len(contexts) > max_contexts_to_sample: 44 | contexts = np.random.choice(contexts, max_contexts_to_sample, replace=False) 45 | 46 | sum_sampled += len(contexts) 47 | 48 | csv_padding = " " * (max_data_contexts - len(contexts)) 49 | total += 1 50 | outfile.write(target_name + ' ' + " ".join(contexts) + csv_padding + '\n') 51 | 52 | print('File: ' + file_path) 53 | print('Average total contexts: ' + str(float(sum_total) / total)) 54 | print('Average final (after sampling) contexts: ' + str(float(sum_sampled) / total)) 55 | print('Total examples: ' + str(total)) 56 | print('Max number of contexts per word: ' + str(max_unfiltered)) 57 | return total 58 | 59 | 60 | def context_full_found(context_parts, word_to_count, path_to_count): 61 | return context_parts[0] in word_to_count \ 62 | and context_parts[1] in path_to_count and context_parts[2] in word_to_count 63 | 64 | 65 | def context_partial_found(context_parts, word_to_count, path_to_count): 66 | return context_parts[0] in word_to_count \ 67 | or context_parts[1] in path_to_count or context_parts[2] in word_to_count 68 | 69 | 70 | if __name__ == '__main__': 71 | parser = ArgumentParser() 72 | parser.add_argument("-trd", "--train_data", dest="train_data_path", 73 | help="path to training data file", required=True) 74 | parser.add_argument("-ted", "--test_data", dest="test_data_path", 75 | help="path to test data file", required=True) 76 | parser.add_argument("-vd", "--val_data", dest="val_data_path", 77 | help="path to validation data file", required=True) 78 | parser.add_argument("-mc", "--max_contexts", dest="max_contexts", default=200, 79 | help="number of max contexts to keep in test+validation", required=False) 80 | parser.add_argument("-mdc", "--max_data_contexts", dest="max_data_contexts", default=1000, 81 | help="number of max contexts to keep in the dataset", required=False) 82 | parser.add_argument("-svs", "--subtoken_vocab_size", dest="subtoken_vocab_size", default=186277, 83 | help="Max number of source subtokens to keep in the vocabulary", required=False) 84 | parser.add_argument("-tvs", "--target_vocab_size", dest="target_vocab_size", default=26347, 85 | help="Max number of target words to keep in the vocabulary", required=False) 86 | parser.add_argument("-sh", "--subtoken_histogram", dest="subtoken_histogram", 87 | help="subtoken histogram file", 
metavar="FILE", required=True) 88 | parser.add_argument("-nh", "--node_histogram", dest="node_histogram", 89 | help="node_histogram file", metavar="FILE", required=True) 90 | parser.add_argument("-th", "--target_histogram", dest="target_histogram", 91 | help="target histogram file", metavar="FILE", required=True) 92 | parser.add_argument("-o", "--output_name", dest="output_name", 93 | help="output name - the base name for the created dataset", required=True, default='data') 94 | args = parser.parse_args() 95 | 96 | train_data_path = args.train_data_path 97 | test_data_path = args.test_data_path 98 | val_data_path = args.val_data_path 99 | subtoken_histogram_path = args.subtoken_histogram 100 | node_histogram_path = args.node_histogram 101 | 102 | subtoken_to_count = common.Common.load_histogram(subtoken_histogram_path, 103 | max_size=int(args.subtoken_vocab_size)) 104 | node_to_count = common.Common.load_histogram(node_histogram_path, 105 | max_size=None) 106 | target_to_count = common.Common.load_histogram(args.target_histogram, 107 | max_size=int(args.target_vocab_size)) 108 | print('subtoken vocab size: ', len(subtoken_to_count)) 109 | print('node vocab size: ', len(node_to_count)) 110 | print('target vocab size: ', len(target_to_count)) 111 | 112 | num_training_examples = 0 113 | for data_file_path, data_role in zip([test_data_path, val_data_path, train_data_path], ['test', 'val', 'train']): 114 | num_examples = process_file(file_path=data_file_path, data_file_role=data_role, dataset_name=args.output_name, 115 | max_contexts=int(args.max_contexts), max_data_contexts=int(args.max_data_contexts)) 116 | if data_role == 'train': 117 | num_training_examples = num_examples 118 | 119 | save_dictionaries(dataset_name=args.output_name, subtoken_to_count=subtoken_to_count, 120 | node_to_count=node_to_count, target_to_count=target_to_count, 121 | max_contexts=int(args.max_data_contexts), num_examples=num_training_examples) 122 | -------------------------------------------------------------------------------- /code2seq_master/preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | ########################################################### 3 | # Change the following values to preprocess a new dataset. 4 | # TRAIN_DIR, VAL_DIR and TEST_DIR should be paths to 5 | # directories containing sub-directories with .java files 6 | # DATASET_NAME is just a name for the currently extracted 7 | # dataset. 8 | # MAX_DATA_CONTEXTS is the number of contexts to keep in the dataset for each 9 | # method (by default 1000). At training time, these contexts 10 | # will be downsampled dynamically to MAX_CONTEXTS. 11 | # MAX_CONTEXTS - the number of actual contexts (by default 200) 12 | # that are taken into consideration (out of MAX_DATA_CONTEXTS) 13 | # every training iteration. To avoid randomness at test time, 14 | # for the test and validation sets only MAX_CONTEXTS contexts are kept 15 | # (while for training, MAX_DATA_CONTEXTS are kept and MAX_CONTEXTS are 16 | # selected dynamically during training). 17 | # SUBTOKEN_VOCAB_SIZE, TARGET_VOCAB_SIZE - 18 | # - the number of subtokens and target words to keep 19 | # in the vocabulary (the top occurring words and paths will be kept). 20 | # NUM_THREADS - the number of parallel threads to use. It is 21 | # recommended to use a multi-core machine for the preprocessing 22 | # step and set this value to the number of cores. 23 | # PYTHON - python3 interpreter alias. 
24 | TRAIN_DIR=my_training_dir 25 | VAL_DIR=my_val_dir 26 | TEST_DIR=my_test_dir 27 | DATASET_NAME=my_dataset 28 | MAX_DATA_CONTEXTS=1000 29 | MAX_CONTEXTS=200 30 | SUBTOKEN_VOCAB_SIZE=186277 31 | TARGET_VOCAB_SIZE=26347 32 | NUM_THREADS=64 33 | PYTHON=python3 34 | ########################################################### 35 | 36 | TRAIN_DATA_FILE=${DATASET_NAME}.train.raw.txt 37 | VAL_DATA_FILE=${DATASET_NAME}.val.raw.txt 38 | TEST_DATA_FILE=${DATASET_NAME}.test.raw.txt 39 | EXTRACTOR_JAR=JavaExtractor/JPredict/target/JavaExtractor-0.0.1-SNAPSHOT.jar 40 | 41 | mkdir -p data 42 | mkdir -p data/${DATASET_NAME} 43 | 44 | echo "Extracting paths from validation set..." 45 | ${PYTHON} JavaExtractor/extract.py --dir ${VAL_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --jar ${EXTRACTOR_JAR} > ${VAL_DATA_FILE} 2>> error_log.txt 46 | echo "Finished extracting paths from validation set" 47 | echo "Extracting paths from test set..." 48 | ${PYTHON} JavaExtractor/extract.py --dir ${TEST_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --jar ${EXTRACTOR_JAR} > ${TEST_DATA_FILE} 2>> error_log.txt 49 | echo "Finished extracting paths from test set" 50 | echo "Extracting paths from training set..." 51 | ${PYTHON} JavaExtractor/extract.py --dir ${TRAIN_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --jar ${EXTRACTOR_JAR} | shuf > ${TRAIN_DATA_FILE} 2>> error_log.txt 52 | echo "Finished extracting paths from training set" 53 | 54 | TARGET_HISTOGRAM_FILE=data/${DATASET_NAME}/${DATASET_NAME}.histo.tgt.c2s 55 | SOURCE_SUBTOKEN_HISTOGRAM=data/${DATASET_NAME}/${DATASET_NAME}.histo.ori.c2s 56 | NODE_HISTOGRAM_FILE=data/${DATASET_NAME}/${DATASET_NAME}.histo.node.c2s 57 | 58 | echo "Creating histograms from the training data" 59 | cat ${TRAIN_DATA_FILE} | cut -d' ' -f1 | tr '|' '\n' | awk '{n[$0]++} END {for (i in n) print i,n[i]}' > ${TARGET_HISTOGRAM_FILE} 60 | cat ${TRAIN_DATA_FILE} | cut -d' ' -f2- | tr ' ' '\n' | cut -d',' -f1,3 | tr ',|' '\n' | awk '{n[$0]++} END {for (i in n) print i,n[i]}' > ${SOURCE_SUBTOKEN_HISTOGRAM} 61 | cat ${TRAIN_DATA_FILE} | cut -d' ' -f2- | tr ' ' '\n' | cut -d',' -f2 | tr '|' '\n' | awk '{n[$0]++} END {for (i in n) print i,n[i]}' > ${NODE_HISTOGRAM_FILE} 62 | 63 | ${PYTHON} preprocess.py --train_data ${TRAIN_DATA_FILE} --test_data ${TEST_DATA_FILE} --val_data ${VAL_DATA_FILE} \ 64 | --max_contexts ${MAX_CONTEXTS} --max_data_contexts ${MAX_DATA_CONTEXTS} --subtoken_vocab_size ${SUBTOKEN_VOCAB_SIZE} \ 65 | --target_vocab_size ${TARGET_VOCAB_SIZE} --subtoken_histogram ${SOURCE_SUBTOKEN_HISTOGRAM} \ 66 | --node_histogram ${NODE_HISTOGRAM_FILE} --target_histogram ${TARGET_HISTOGRAM_FILE} --output_name data/${DATASET_NAME}/${DATASET_NAME} 67 | 68 | # If all went well, the raw data files can be deleted, because preprocess.py creates new files 69 | # with truncated and padded number of paths for each example. 70 | rm ${TRAIN_DATA_FILE} ${VAL_DATA_FILE} ${TEST_DATA_FILE} ${TARGET_HISTOGRAM_FILE} ${SOURCE_SUBTOKEN_HISTOGRAM} \ 71 | ${NODE_HISTOGRAM_FILE} 72 | 73 | -------------------------------------------------------------------------------- /code2seq_master/preprocess_csharp.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | ########################################################### 3 | # Change the following values to preprocess a new dataset. 
4 | # TRAIN_DIR, VAL_DIR and TEST_DIR should be paths to 5 | # directories containing sub-directories with .cs files 6 | # DATASET_NAME is just a name for the currently extracted 7 | # dataset. 8 | # MAX_DATA_CONTEXTS is the number of contexts to keep in the dataset for each 9 | # method (by default 1000). At training time, these contexts 10 | # will be downsampled dynamically to MAX_CONTEXTS. 11 | # MAX_CONTEXTS - the number of actual contexts (by default 200) 12 | # that are taken into consideration (out of MAX_DATA_CONTEXTS) 13 | # every training iteration. To avoid randomness at test time, 14 | # for the test and validation sets only MAX_CONTEXTS contexts are kept 15 | # (while for training, MAX_DATA_CONTEXTS are kept and MAX_CONTEXTS are 16 | # selected dynamically during training). 17 | # SUBTOKEN_VOCAB_SIZE, TARGET_VOCAB_SIZE - 18 | # - the number of subtokens and target words to keep 19 | # in the vocabulary (the top occurring words and paths will be kept). 20 | # NUM_THREADS - the number of parallel threads to use. It is 21 | # recommended to use a multi-core machine for the preprocessing 22 | # step and set this value to the number of cores. 23 | # PYTHON - python3 interpreter alias. 24 | 25 | #TRAIN_DIR=JavaExtractor/JPredict/src/main/java/JavaExtractor/Common 26 | #VAL_DIR=JavaExtractor/JPredict/src/main/java/JavaExtractor/Common 27 | #TEST_DIR=JavaExtractor/JPredict/src/main/java/JavaExtractor/Common 28 | 29 | TRAIN_DIR=../java_code_train 30 | VAL_DIR=../java_code_valid 31 | TEST_DIR=../java_code_test 32 | 33 | DATASET_NAME=my_dataset 34 | MAX_DATA_CONTEXTS=1000 35 | MAX_CONTEXTS=200 36 | SUBTOKEN_VOCAB_SIZE=186277 37 | TARGET_VOCAB_SIZE=26347 38 | NUM_THREADS=64 39 | PYTHON=python3 40 | ########################################################### 41 | 42 | TRAIN_DATA_FILE=${DATASET_NAME}.train.raw.txt 43 | VAL_DATA_FILE=${DATASET_NAME}.val.raw.txt 44 | TEST_DATA_FILE=${DATASET_NAME}.test.raw.txt 45 | EXTRACTOR_JAR=CSharpExtractor/CSharpExtractor/Extractor/Extractor.csproj 46 | 47 | #EXTRACTOR_JAR=JavaExtractor/JPredict/target/JavaExtractor-0.0.1-SNAPSHOT.jar 48 | 49 | mkdir -p data 50 | mkdir -p data/${DATASET_NAME} 51 | 52 | echo "Extracting paths from validation set..." 53 | ${PYTHON} CSharpExtractor/extract.py --dir ${VAL_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --csproj ${EXTRACTOR_JAR} --ofile_name ${VAL_DATA_FILE} 2>> error_log.txt 54 | echo "Finished extracting paths from validation set" 55 | echo "Extracting paths from test set..." 56 | ${PYTHON} CSharpExtractor/extract.py --dir ${TEST_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --csproj ${EXTRACTOR_JAR} --ofile_name ${TEST_DATA_FILE} 2>> error_log.txt 57 | echo "Finished extracting paths from test set" 58 | echo "Extracting paths from training set..." 
59 | ${PYTHON} CSharpExtractor/extract.py --dir ${TRAIN_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --csproj ${EXTRACTOR_JAR} --ofile_name ${TRAIN_DATA_FILE}_unshuf 2>> error_log.txt 60 | echo "Finished extracting paths from training set" 61 | echo "Shuffling training data" 62 | cat ${TRAIN_DATA_FILE}_unshuf | shuf > ${TRAIN_DATA_FILE} 63 | rm ${TRAIN_DATA_FILE}_unshuf 64 | 65 | TARGET_HISTOGRAM_FILE=data/${DATASET_NAME}/${DATASET_NAME}.histo.tgt.c2s 66 | SOURCE_SUBTOKEN_HISTOGRAM=data/${DATASET_NAME}/${DATASET_NAME}.histo.ori.c2s 67 | NODE_HISTOGRAM_FILE=data/${DATASET_NAME}/${DATASET_NAME}.histo.node.c2s 68 | 69 | echo "Creating histograms from the training data" 70 | cat ${TRAIN_DATA_FILE} | cut -d' ' -f1 | tr '|' '\n' | awk '{n[$0]++} END {for (i in n) print i,n[i]}' > ${TARGET_HISTOGRAM_FILE} 71 | cat ${TRAIN_DATA_FILE} | cut -d' ' -f2- | tr ' ' '\n' | cut -d',' -f1,3 | tr ',|' '\n' | awk '{n[$0]++} END {for (i in n) print i,n[i]}' > ${SOURCE_SUBTOKEN_HISTOGRAM} 72 | cat ${TRAIN_DATA_FILE} | cut -d' ' -f2- | tr ' ' '\n' | cut -d',' -f2 | tr '|' '\n' | awk '{n[$0]++} END {for (i in n) print i,n[i]}' > ${NODE_HISTOGRAM_FILE} 73 | 74 | ${PYTHON} preprocess.py --train_data ${TRAIN_DATA_FILE} --test_data ${TEST_DATA_FILE} --val_data ${VAL_DATA_FILE} \ 75 | --max_contexts ${MAX_CONTEXTS} --max_data_contexts ${MAX_DATA_CONTEXTS} --subtoken_vocab_size ${SUBTOKEN_VOCAB_SIZE} \ 76 | --target_vocab_size ${TARGET_VOCAB_SIZE} --subtoken_histogram ${SOURCE_SUBTOKEN_HISTOGRAM} \ 77 | --node_histogram ${NODE_HISTOGRAM_FILE} --target_histogram ${TARGET_HISTOGRAM_FILE} --output_name data/${DATASET_NAME}/${DATASET_NAME} 78 | 79 | # If all went well, the raw data files can be deleted, because preprocess.py creates new files 80 | # with truncated and padded number of paths for each example. 81 | rm ${TRAIN_DATA_FILE} ${VAL_DATA_FILE} ${TEST_DATA_FILE} ${TARGET_HISTOGRAM_FILE} ${SOURCE_SUBTOKEN_HISTOGRAM} \ 82 | ${NODE_HISTOGRAM_FILE} 83 | 84 | 85 | -------------------------------------------------------------------------------- /code2seq_master/preprocess_custom.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | ########################################################### 3 | # Change the following values to preprocess a new dataset. 4 | # TRAIN_DIR, VAL_DIR and TEST_DIR should be paths to 5 | # directories containing sub-directories with .java files 6 | # DATASET_NAME is just a name for the currently extracted 7 | # dataset. 8 | # MAX_DATA_CONTEXTS is the number of contexts to keep in the dataset for each 9 | # method (by default 1000). At training time, these contexts 10 | # will be downsampled dynamically to MAX_CONTEXTS. 11 | # MAX_CONTEXTS - the number of actual contexts (by default 200) 12 | # that are taken into consideration (out of MAX_DATA_CONTEXTS) 13 | # every training iteration. To avoid randomness at test time, 14 | # for the test and validation sets only MAX_CONTEXTS contexts are kept 15 | # (while for training, MAX_DATA_CONTEXTS are kept and MAX_CONTEXTS are 16 | # selected dynamically during training). 17 | # SUBTOKEN_VOCAB_SIZE, TARGET_VOCAB_SIZE - 18 | # - the number of subtokens and target words to keep 19 | # in the vocabulary (the top occurring words and paths will be kept). 20 | # NUM_THREADS - the number of parallel threads to use. It is 21 | # recommended to use a multi-core machine for the preprocessing 22 | # step and set this value to the number of cores. 
23 | # PYTHON - python3 interpreter alias. 24 | #TRAIN_DIR=my_training_dir 25 | #VAL_DIR=my_val_dir 26 | MAIN_DIR="java_code_data" 27 | DATASET_NAME="sample_set" 28 | MAX_DATA_CONTEXTS=1000 29 | MAX_CONTEXTS=200 30 | SUBTOKEN_VOCAB_SIZE=186277 31 | TARGET_VOCAB_SIZE=26347 32 | NUM_THREADS=64 33 | PYTHON=python3 34 | ########################################################### 35 | 36 | #TRAIN_DATA_FILE=${DATASET_NAME}.train.raw.txt 37 | #VAL_DATA_FILE=${DATASET_NAME}.val.raw.txt 38 | MAIN_DATA_FILE=${DATASET_NAME}.test.raw.txt 39 | EXTRACTOR_JAR=JavaExtractor/JPredict/target/JavaExtractor-0.0.1-SNAPSHOT.jar 40 | 41 | mkdir -p data 42 | mkdir -p data/${DATASET_NAME} 43 | 44 | echo "Extracting paths from set..." 45 | ${PYTHON} JavaExtractor/extract.py --dir ${MAIN_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --jar ${EXTRACTOR_JAR} > ${MAIN_DATA_FILE} 2>> error_log.txt 46 | echo "Finished extracting paths from set" 47 | #echo "Extracting paths from test set..." 48 | #${PYTHON} JavaExtractor/extract.py --dir ${TEST_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --jar ${EXTRACTOR_JAR} > ${TEST_DATA_FILE} 2>> error_log.txt 49 | #echo "Finished extracting paths from test set" 50 | #echo "Extracting paths from training set..." 51 | #${PYTHON} JavaExtractor/extract.py --dir ${TRAIN_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --jar ${EXTRACTOR_JAR} | shuf > ${TRAIN_DATA_FILE} 2>> error_log.txt 52 | #echo "Finished extracting paths from training set" 53 | 54 | TARGET_HISTOGRAM_FILE=data/${DATASET_NAME}/${DATASET_NAME}.histo.tgt.c2s 55 | SOURCE_SUBTOKEN_HISTOGRAM=data/${DATASET_NAME}/${DATASET_NAME}.histo.ori.c2s 56 | NODE_HISTOGRAM_FILE=data/${DATASET_NAME}/${DATASET_NAME}.histo.node.c2s 57 | # 58 | echo "Creating histograms from the training data" 59 | cat ${MAIN_DATA_FILE} | cut -d' ' -f1 | tr '|' '\n' | awk '{n[$0]++} END {for (i in n) print i,n[i]}' > ${TARGET_HISTOGRAM_FILE} 60 | cat ${MAIN_DATA_FILE} | cut -d' ' -f2- | tr ' ' '\n' | cut -d',' -f1,3 | tr ',|' '\n' | awk '{n[$0]++} END {for (i in n) print i,n[i]}' > ${SOURCE_SUBTOKEN_HISTOGRAM} 61 | cat ${MAIN_DATA_FILE} | cut -d' ' -f2- | tr ' ' '\n' | cut -d',' -f2 | tr '|' '\n' | awk '{n[$0]++} END {for (i in n) print i,n[i]}' > ${NODE_HISTOGRAM_FILE} 62 | # 63 | ${PYTHON} preprocess.py --train_data ${MAIN_DATA_FILE} --test_data ${MAIN_DATA_FILE} --val_data ${MAIN_DATA_FILE} \ 64 | --max_contexts ${MAX_CONTEXTS} --max_data_contexts ${MAX_DATA_CONTEXTS} --subtoken_vocab_size ${SUBTOKEN_VOCAB_SIZE} \ 65 | --target_vocab_size ${TARGET_VOCAB_SIZE} --subtoken_histogram ${SOURCE_SUBTOKEN_HISTOGRAM} \ 66 | --node_histogram ${NODE_HISTOGRAM_FILE} --target_histogram ${TARGET_HISTOGRAM_FILE} --output_name data/${DATASET_NAME}/${DATASET_NAME} 67 | # 68 | ## If all went well, the raw data files can be deleted, because preprocess.py creates new files 69 | ## with truncated and padded number of paths for each example. 
70 | rm ${MAIN_DATA_FILE} ${TARGET_HISTOGRAM_FILE} ${SOURCE_SUBTOKEN_HISTOGRAM} ${NODE_HISTOGRAM_FILE} 71 | -------------------------------------------------------------------------------- /code2seq_master/test_extracted_ast.py: -------------------------------------------------------------------------------- 1 | def read_file(input_filename): 2 | with open(input_filename, 'r') as file: 3 | return file.readlines() 4 | 5 | ast_file = "data/sample_set/sample_set.train.c2s" 6 | 7 | 8 | ast_of_snippets = read_file(ast_file) 9 | print(len(ast_of_snippets)) 10 | print(ast_of_snippets[0]) 11 | 12 | # for snippet in ast_of_snippets: 13 | # print(snippet) 14 | 15 | -------------------------------------------------------------------------------- /code2seq_master/train.sh: -------------------------------------------------------------------------------- 1 | ########################################################### 2 | # Change the following values to train a new model. 3 | # type: the name of the new model, only affects the saved file name. 4 | # dataset: the name of the dataset, as was preprocessed using preprocess.sh 5 | # test_data: by default, points to the validation set, since this is the set that 6 | # will be evaluated after each training iteration. If you wish to test 7 | # on the final (held-out) test set, change 'val' to 'test'. 8 | type=java-trial-model 9 | dataset_name=auto_comment_dataset 10 | data_dir=../data/auto_comment_dataset 11 | data=${data_dir}/${dataset_name} 12 | test_data=${data_dir}/${dataset_name}.val.c2s 13 | model_dir=models/${type} 14 | 15 | mkdir -p ${model_dir} 16 | set -e 17 | python3 -u code2seq.py --data ${data} --test ${test_data} --save_prefix ${model_dir}/model 18 | -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | auto_comment_dataset 2 | *.json 3 | 4 | -------------------------------------------------------------------------------- /data/data.7z: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/data/data.7z -------------------------------------------------------------------------------- /images/network_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/images/network_architecture.png -------------------------------------------------------------------------------- /images/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/images/pipeline.png -------------------------------------------------------------------------------- /poster/ML4SE_Poster_Group_3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/poster/ML4SE_Poster_Group_3.pdf -------------------------------------------------------------------------------- /poster/source_code/example.java: -------------------------------------------------------------------------------- 1 | public static int add(int VAR0, int VAR1) { 2 | return VAR0 + VAR1; 3 | } 4 | -------------------------------------------------------------------------------- 
/poster/source_code/img/Embedding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/poster/source_code/img/Embedding.png -------------------------------------------------------------------------------- /poster/source_code/img/TU_P1_full-color.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/poster/source_code/img/TU_P1_full-color.png -------------------------------------------------------------------------------- /poster/source_code/img/distr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/poster/source_code/img/distr.png -------------------------------------------------------------------------------- /poster/source_code/img/link_to_github.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/poster/source_code/img/link_to_github.png -------------------------------------------------------------------------------- /poster/source_code/img/results_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/poster/source_code/img/results_table.png -------------------------------------------------------------------------------- /poster/source_code/img/zoomedInLength.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/poster/source_code/img/zoomedInLength.png -------------------------------------------------------------------------------- /poster/source_code/poster.tex: -------------------------------------------------------------------------------- 1 | %By% TODO: 2 | % - cifar10 table 3 | % - eig plots 4 | % - bullet points 5 | % - research question explicit 6 | %Copyright (c) 2013 Joost van Zwieten 7 | % 8 | % Permission is hereby granted, free of charge, to any person obtaining a copy 9 | % of this software and associated documentation files (the "Software"), to deal 10 | % in the Software without restriction, including without limitation the rights 11 | % to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | % copies of the Software, and to permit persons to whom the Software is 13 | % furnished to do so, subject to the following conditions: 14 | % 15 | % The above copyright notice and this permission notice shall be included in 16 | % all copies or substantial portions of the Software. 17 | % 18 | % THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | % IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | % FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | % AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | % LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | % OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 24 | % THE SOFTWARE. 
25 | % 26 | \documentclass{tudelftposter} 27 | 28 | % optional, makes QR code clickable 29 | \usepackage[hidelinks,implicit=false,bookmarks=false]{hyperref} 30 | \usepackage{booktabs} 31 | \usepackage{listings} 32 | \usepackage{xcolor} 33 | \usepackage{mathtools} 34 | % subfigure conflicts with its successor subfig, so only subfig is loaded 35 | \usepackage{subfig} 36 | 37 | \definecolor{light-gray}{gray}{0.97} %the shade of grey that stack exchange uses 38 | \definecolor{codegray}{rgb}{0.5,0.5,0.5} 39 | \lstdefinestyle{mystyle}{ 40 | language = Java, 41 | numberstyle=\tiny\color{codegray}, 42 | basicstyle=\ttfamily\footnotesize, 43 | breakatwhitespace=false, 44 | breaklines=true, 45 | captionpos=b, 46 | keepspaces=true, 47 | numbers=left, 48 | numbersep=2pt, 49 | showspaces=false, 50 | showstringspaces=false, 51 | showtabs=false, 52 | tabsize=2 53 | } 54 | 55 | \lstset{style=mystyle} 56 | 57 | 58 | \title{Auto Comments: Generating Java code comments} 59 | 60 | \addauthornote{diam}{Delft Institute of Computer Science, TU Delft} 61 | 62 | \addauthor[diam]{R. Navin} 63 | \addauthor[diam]{J. Katzy} 64 | \addauthor[diam]{R. Skoulos} 65 | \addauthor[diam]{T. Pfann} 66 | 67 | \addfootimage(c:right column.center)[Delft Institute of Computer Science]{img/TU_P1_full-color.png} 68 | \addfootqrcode(l:left column.left)[project repository]{https://github.com/LRNavin/AutoComments} 69 | 70 | \begin{document} 71 | 72 | \section{Motivation \& Goal} 73 | \begin{itemize} 74 | \item In software development and maintenance, developers spend around 59\% of their time on program comprehension activities. 75 | \item Our goal: automatically generate human-readable comments for code snippets. 76 | \item With DeepCom as our baseline, we propose: 77 | \begin{itemize} 78 | \item Method-1: A replication of code2seq, extended to generate natural-language comments. 79 | \item Method-2: Learning on modified ASTs, to address Out-of-Vocabulary problems. 80 | \end{itemize} 81 | \end{itemize} 82 | 83 | \section{Experiment Setup} 84 | Java methods are parsed into ASTs, which are encoded and passed to an encoder-decoder sequence-to-sequence neural network based on bidirectional LSTMs (a code2seq-based architecture). 
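% For intuition, a sketch of the path-context encoding along the lines of the
% code2seq paper (the notation below is assumed from that paper, not taken
% from this repository): an AST path v_1 .. v_k between terminal tokens x and
% y is encoded by a bidirectional LSTM as
%   h_p = [\overrightarrow{LSTM}(v_1..v_k) ; \overleftarrow{LSTM}(v_k..v_1)],
% the terminals as sums of learned subtoken embeddings, and the combined
% context vector as
%   z = \tanh(W_{in} [h_p ; \sum_{s \in x} E^{subtokens}_s ; \sum_{s \in y} E^{subtokens}_s]).
% The decoder then attends over the set of context vectors z while emitting
% the comment one token at a time.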
85 | 86 | % \paragraph{Dataset} 87 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 88 | \lstinputlisting[language=Java, caption=Java example, frame=tb, backgroundcolor = \color{light-gray}]{example.java} 89 | \label{code:examplefunction} 90 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 91 | \begin{figure}[H] 92 | \centering 93 | \includegraphics[width=0.4\linewidth]{img/Embedding.png} 94 | \caption{Example AST of Function \ref{code:examplefunction}; the example path is superimposed with thick arrows.} 95 | \label{fig:exampleAST} 96 | \end{figure} 97 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 98 | \textbf{Dataset} 99 | % \paragraph{Dataset} 100 | \begin{table}[H] 101 | \centering 102 | \resizebox{\linewidth}{!}{ 103 | \begin{tabular}{c c c c c} 104 | \# Methods & \# All tokens & \# All identifiers & \# Unique tokens & \# Unique identifiers\\ 105 | \toprule 106 | 588,108 & 44,378,497 & 13,779,297 & 794,711 & 794,621 107 | \end{tabular}} 108 | \caption{Statistics for code snippets in the DeepCom dataset} 109 | \label{tab:dataset-statistics} 110 | \end{table} 111 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 112 | \begin{figure}[H] 113 | \centering 114 | \subfloat[Full distribution]{\includegraphics{img/distr.png}} 115 | \subfloat[$<$40 words in comments]{\includegraphics{img/zoomedInLength.png}} 116 | \caption{Dataset distribution of target comment lengths.} 117 | \label{fig:data_dist} 118 | \end{figure} 119 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 120 | 121 | % \paragraph{Encoding} 122 | % \begin{figure}[H] 123 | % \centering 124 | % \includegraphics[width =0.8\linewidth]{Encoder(1).png} 125 | % \caption{Graphic representation of Encoder, $Encode(x) = \sum_{s\in x} E^{\text{subtokens}}_s$} 126 | % \label{fig:encoder} 127 | % \end{figure}{} 128 | % \paragraph{Decoding} 129 | 130 | 131 | 132 | % \paragraph{Dataset} 133 | 134 | 135 | \paragraph{Training} 136 | \begin{itemize} 137 | \item Setup: 138 | \begin{itemize} 139 | % \item Cross-entropy loss with a Nesterov momentum of 0.95. 140 | \item Learning rate 0.01 with a decay of 0.05 every epoch. 141 | \item Embedding size: 128, Encoder size: 256, Decoder size: 640, Batch size: 128. 142 | \item Trained for up to 100 epochs, with early stopping after 10 epochs without improvement. 143 | \end{itemize} 144 | \item Method-1: Code2seq model with comments as the target sequence. 145 | \item Method-2: Same as Method-1, but with normalized variable names (VAR0, VAR1, ...) in the ASTs. 146 | \item Evaluation: BLEU-4 score. 147 | \end{itemize} 148 | \section{Results} 149 | %%%%%%%%%%%%%%%%%%%%%%%%%% 150 | \begin{table}[H] 151 | \centering 152 | \begin{tabular}{cc} 153 | \hline 154 | Approach & BLEU-4 score \\ \hline 155 | DeepCom & 38.17 \\ 156 | Method-1 & 6.08 \\ 157 | Method-2 & 10.02 \\ \hline 158 | \end{tabular} 159 | \caption{Evaluation results on Java methods} 160 | \label{tab:bleu-table} 161 | \end{table} 162 | %%%%%%%%%%%%%%%%%%%%%%%%%% 163 | 164 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 165 | \begin{figure}[H] 166 | \centering 167 | \includegraphics[width =\linewidth]{img/results_table.png} 168 | \caption{Comments generated by the models.} 169 | \label{fig:comments_gen} 170 | \end{figure} 171 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% 172 | 173 | \section{Discussion} 174 | \begin{itemize} 175 | \item Probable reasons for the poor BLEU scores (Table \ref{tab:bleu-table}): 176 | \begin{itemize} 177 | \item The imbalanced distribution of target comment lengths in the dataset (Figure \ref{fig:data_dist}). 178 | \item The code2seq architecture was built to predict function names. 
179 | \end{itemize} 180 | \item The performance of Method-2 shows that normalized ASTs are a good solution to Out-of-Vocabulary problems. 181 | \item The model learnt syntactic and semantic concepts from the code (Figure \ref{fig:comments_gen}). 182 | \item However, it is incapable of generating longer comments ($>$6 words). 183 | \end{itemize} 184 | 185 | \section{Conclusion} 186 | \begin{itemize} 187 | \item Contributions: a code2seq-based AutoComments model, and an AST extraction scheme that addresses Out-of-Vocabulary problems. 188 | \item Future research: 189 | \begin{itemize} 190 | \item A dataset balanced w.r.t. target comment lengths. 191 | \item More experiments with the decoder, to generate better comments from the learnt code semantics and syntax. 192 | \end{itemize} 193 | \end{itemize} 194 | 195 | \end{document} 196 | -------------------------------------------------------------------------------- /preproc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/preproc/__init__.py -------------------------------------------------------------------------------- /preproc/common.py: -------------------------------------------------------------------------------- 1 | import re 2 | import subprocess 3 | import sys 4 | 5 | 6 | class Common: 7 | internal_delimiter = '|' 8 | SOS = '<S>' 9 | EOS = '</S>' 10 | PAD = '<PAD>' 11 | UNK = '<UNK>' 12 | 13 | @staticmethod 14 | def normalize_word(word): 15 | stripped = re.sub(r'[^a-zA-Z]', '', word) 16 | if len(stripped) == 0: 17 | return word.lower() 18 | else: 19 | return stripped.lower() 20 | 21 | @staticmethod 22 | def load_histogram(path, max_size=None): 23 | histogram = {} 24 | with open(path, 'r') as file: 25 | for line in file.readlines(): 26 | parts = line.split(' ') 27 | if len(parts) != 2: 28 | continue 29 | histogram[parts[0]] = int(parts[1]) 30 | sorted_histogram = [(k, histogram[k]) for k in sorted(histogram, key=histogram.get, reverse=True)] 31 | return dict(sorted_histogram[:max_size]) 32 | 33 | @staticmethod 34 | def load_vocab_from_dict(word_to_count, add_values=[], max_size=None): 35 | word_to_index, index_to_word = {}, {} 36 | current_index = 0 37 | for value in add_values: 38 | word_to_index[value] = current_index 39 | index_to_word[current_index] = value 40 | current_index += 1 41 | sorted_counts = [(k, word_to_count[k]) for k in sorted(word_to_count, key=word_to_count.get, reverse=True)] 42 | limited_sorted = dict(sorted_counts[:max_size]) 43 | for word, count in limited_sorted.items(): 44 | word_to_index[word] = current_index 45 | index_to_word[current_index] = word 46 | current_index += 1 47 | return word_to_index, index_to_word, current_index 48 | 49 | @staticmethod 50 | def binary_to_string(binary_string): 51 | return binary_string.decode("utf-8") 52 | 53 | @staticmethod 54 | def binary_to_string_list(binary_string_list): 55 | return [Common.binary_to_string(w) for w in binary_string_list] 56 | 57 | @staticmethod 58 | def binary_to_string_matrix(binary_string_matrix): 59 | return [Common.binary_to_string_list(l) for l in binary_string_matrix] 60 | 61 | @staticmethod 62 | def binary_to_string_3d(binary_string_tensor): 63 | return [Common.binary_to_string_matrix(l) for l in binary_string_tensor] 64 | 65 | @staticmethod 66 | def legal_method_names_checker(name): 67 | return name not in [Common.UNK, Common.PAD, Common.EOS] 68 | 69 | @staticmethod 70 | def filter_impossible_names(top_words): 71 | result = list(filter(Common.legal_method_names_checker, top_words)) 72 | 
return result 73 | 74 | @staticmethod 75 | def unique(sequence): 76 | unique = [] 77 | [unique.append(item) for item in sequence if item not in unique] 78 | return unique 79 | 80 | @staticmethod 81 | def parse_results(result, pc_info_dict, topk=5): 82 | prediction_results = {} 83 | results_counter = 0 84 | for single_method in result: 85 | original_name, top_suggestions, top_scores, attention_per_context = list(single_method) 86 | current_method_prediction_results = PredictionResults(original_name) 87 | if attention_per_context is not None: 88 | word_attention_pairs = [(word, attention) for word, attention in 89 | zip(top_suggestions, attention_per_context) if 90 | Common.legal_method_names_checker(word)] 91 | for predicted_word, attention_timestep in word_attention_pairs: 92 | current_timestep_paths = [] 93 | for context, attention in [(key, attention_timestep[key]) for key in 94 | sorted(attention_timestep, key=attention_timestep.get, reverse=True)][ 95 | :topk]: 96 | if context in pc_info_dict: 97 | pc_info = pc_info_dict[context] 98 | current_timestep_paths.append((attention.item(), pc_info)) 99 | 100 | current_method_prediction_results.append_prediction(predicted_word, current_timestep_paths) 101 | else: 102 | for predicted_seq in top_suggestions: 103 | filtered_seq = [word for word in predicted_seq if Common.legal_method_names_checker(word)] 104 | current_method_prediction_results.append_prediction(filtered_seq, None) 105 | 106 | prediction_results[results_counter] = current_method_prediction_results 107 | results_counter += 1 108 | return prediction_results 109 | 110 | @staticmethod 111 | def compute_bleu(ref_file_name, predicted_file_name): 112 | with open(predicted_file_name) as predicted_file: 113 | pipe = subprocess.Popen(["perl", "scripts/multi-bleu.perl", ref_file_name], stdin=predicted_file, 114 | stdout=sys.stdout, stderr=sys.stderr) 115 | 116 | 117 | class PredictionResults: 118 | def __init__(self, original_name): 119 | self.original_name = original_name 120 | self.predictions = list() 121 | 122 | def append_prediction(self, name, current_timestep_paths): 123 | self.predictions.append(SingleTimeStepPrediction(name, current_timestep_paths)) 124 | 125 | class SingleTimeStepPrediction: 126 | def __init__(self, prediction, attention_paths): 127 | self.prediction = prediction 128 | if attention_paths is not None: 129 | paths_with_scores = [] 130 | for attention_score, pc_info in attention_paths: 131 | path_context_dict = {'score': attention_score, 132 | 'path': pc_info.longPath, 133 | 'token1': pc_info.token1, 134 | 'token2': pc_info.token2} 135 | paths_with_scores.append(path_context_dict) 136 | self.attention_paths = paths_with_scores 137 | 138 | 139 | class PathContextInformation: 140 | def __init__(self, context): 141 | self.token1 = context['name1'] 142 | self.longPath = context['path'] 143 | self.shortPath = context['shortPath'] 144 | self.token2 = context['name2'] 145 | 146 | def __str__(self): 147 | return '%s,%s,%s' % (self.token1, self.shortPath, self.token2) -------------------------------------------------------------------------------- /preproc/feature_extractor.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import pandas as pd 4 | 5 | data_folder = "../data/" 6 | data_files = ["train", "test", "valid"] 7 | 8 | def save_dataset(feature, label, filename): 9 | print(f"Saving File - {filename}") 10 | dataset = np.array([feature, label]) 11 | dataset = np.transpose(dataset) 12 | 
pd.DataFrame(dataset).to_csv(data_folder + filename, index=False, header=False) 13 | 14 | for file in data_files: 15 | print(f"Extracting File - {file}") 16 | code_feat = [] 17 | nl_label = [] 18 | file_path = data_folder + file + ".json" 19 | with open(file_path, 'r') as f: 20 | for line in f: 21 | record = json.loads(line) # parse each JSON line only once 22 | code_feat.append(record["code"]) 23 | nl_label.append(record["nl"]) 24 | if len(code_feat) == 100: # keep only the first 100 snippets per split 25 | break 26 | save_dataset(code_feat, nl_label, file + ".csv") 27 | -------------------------------------------------------------------------------- /preproc/java_files_creator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import csv 3 | import json 4 | import re 5 | import pickle 6 | 7 | # Raw Data - Folders & Files 8 | raw_data_files = ["train", "test", "valid"] 9 | raw_data_folder = "data/" 10 | 11 | # Processed java file locations 12 | base_folder = "java_code_" 13 | sub_folder = "data" 14 | 15 | # Get AST - Only First 100 code - boolean 16 | get_ast_full_file = False 17 | 18 | def save_code_in_javafile(to_write_path, code): 19 | f = open(to_write_path + "/Input.java", "w") 20 | f.write(code) 21 | f.close() 22 | 23 | def save_comment_in_txtfile(to_write_path, comment): 24 | f = open(to_write_path + "/comment.txt", "w") 25 | f.write(comment) 26 | f.close() 27 | 28 | def extract_replacements(to_write_path, code, comment): # rename parameters to VAR0, VAR1, ... and save the encode/decode maps 29 | varEncDict = {} 30 | varDecDict = {} 31 | codecopy = re.sub(r"\([A-Za-z]+<.*>", "(type ", code) # [A-Za-z] instead of [A-z], which also matches [ \ ] ^ _ ` 32 | codecopy = re.sub(r",[A-Za-z]+<.*>", ",type ", codecopy) 33 | codecopy = re.sub(r"@+.*(public|private|static)\s", "declaration", codecopy) 34 | decl = codecopy.split("\n")[0] 35 | varDecl = re.findall(r"\((.*?)\)", decl)[0] 36 | varList = varDecl.split(",") 37 | if (varList[0] == "" and len(varList) == 1): 38 | save_comment_in_txtfile(to_write_path, comment) 39 | save_code_in_javafile(to_write_path, code) 40 | return 41 | else: 42 | i = 0 43 | print(varList) 44 | for v in varList: 45 | name = v.split(" ")[-1] 46 | varEncDict[name] = "VAR" + str(i) 47 | varDecDict["VAR" + str(i)] = name 48 | i = i+1 49 | for name in varEncDict: 50 | code = code.replace(name, varEncDict[name]) 51 | comment = comment.replace(name, varEncDict[name]) 52 | 53 | save_comment_in_txtfile(to_write_path, comment) 54 | save_code_in_javafile(to_write_path, code) 55 | 56 | fEnc = open(to_write_path + "/encodeDict" , "wb") 57 | pickle.dump(varEncDict, fEnc) 58 | fEnc.close() 59 | fDec = open(to_write_path + "/decodeDict", "wb") 60 | pickle.dump(varDecDict, fDec) 61 | fDec.close() 62 | for file in raw_data_files: 63 | curr_base_folder = base_folder + file 64 | os.mkdir(curr_base_folder) 65 | print(f"Extracting File - {file}") 66 | file_path = raw_data_folder + file + ".json" 67 | with open(file_path, 'r') as f: 68 | for index, line in enumerate(f): 69 | if not get_ast_full_file and index == 100: 70 | break 71 | print(f"Writing Java Snippet No:{index}") 72 | to_write_path = curr_base_folder + '/' + sub_folder + str(index) 73 | os.mkdir(to_write_path) 74 | code = json.loads(line)["code"] 75 | comment = json.loads(line)["nl"] 76 | extract_replacements(to_write_path, code, comment) 77 | f.close() 78 | -------------------------------------------------------------------------------- /preproc/preprocess.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import pickle 3 | from argparse import ArgumentParser 4 | 5 | import numpy as np 6 | 7 | import common 8 | ''' 9 | This script preprocesses the 
data from MethodPaths. It truncates methods with too many contexts, 10 | and pads methods that have fewer paths with spaces. 11 | ''' 12 | 13 | 14 | def save_dictionaries(dataset_name, subtoken_to_count, node_to_count, target_to_count, max_contexts, num_examples): 15 | save_dict_file_path = '{}.dict.c2s'.format(dataset_name) 16 | with open(save_dict_file_path, 'wb') as file: 17 | pickle.dump(subtoken_to_count, file) 18 | pickle.dump(node_to_count, file) 19 | pickle.dump(target_to_count, file) 20 | pickle.dump(max_contexts, file) 21 | pickle.dump(num_examples, file) 22 | print('Dictionaries saved to: {}'.format(save_dict_file_path)) 23 | 24 | 25 | def process_file(file_path, data_file_role, dataset_name, max_contexts, max_data_contexts): 26 | sum_total = 0 27 | sum_sampled = 0 28 | total = 0 29 | max_unfiltered = 0 30 | max_contexts_to_sample = max_data_contexts if data_file_role == 'train' else max_contexts 31 | output_path = '{}.{}.c2s'.format(dataset_name, data_file_role) 32 | with open(output_path, 'w') as outfile: 33 | with open(file_path, 'r') as file: 34 | for line in file: 35 | print(line) # debug: echo each raw input line 36 | parts = line.rstrip('\n').split(' ') 37 | target_name = parts[0] 38 | contexts = parts[1:] 39 | 40 | if len(contexts) > max_unfiltered: 41 | max_unfiltered = len(contexts) 42 | 43 | sum_total += len(contexts) 44 | if len(contexts) > max_contexts_to_sample: 45 | contexts = np.random.choice(contexts, max_contexts_to_sample, replace=False) 46 | 47 | sum_sampled += len(contexts) 48 | 49 | csv_padding = " " * (max_data_contexts - len(contexts)) 50 | total += 1 51 | outfile.write(target_name + ' ' + " ".join(contexts) + csv_padding + '\n') 52 | 53 | print('File: ' + file_path) 54 | print('Average total contexts: ' + str(float(sum_total) / total)) 55 | print('Average final (after sampling) contexts: ' + str(float(sum_sampled) / total)) 56 | print('Total examples: ' + str(total)) 57 | print('Max number of contexts per word: ' + str(max_unfiltered)) 58 | return total 59 | 60 | 61 | def context_full_found(context_parts, word_to_count, path_to_count): 62 | return context_parts[0] in word_to_count \ 63 | and context_parts[1] in path_to_count and context_parts[2] in word_to_count 64 | 65 | 66 | def context_partial_found(context_parts, word_to_count, path_to_count): 67 | return context_parts[0] in word_to_count \ 68 | or context_parts[1] in path_to_count or context_parts[2] in word_to_count 69 | 70 | 71 | if __name__ == '__main__': 72 | parser = ArgumentParser() 73 | parser.add_argument("-trd", "--train_data", dest="train_data_path", 74 | help="path to training data file", required=True) 75 | parser.add_argument("-ted", "--test_data", dest="test_data_path", 76 | help="path to test data file", required=True) 77 | parser.add_argument("-vd", "--val_data", dest="val_data_path", 78 | help="path to validation data file", required=True) 79 | parser.add_argument("-mc", "--max_contexts", dest="max_contexts", default=200, 80 | help="number of max contexts to keep in test+validation", required=False) 81 | parser.add_argument("-mdc", "--max_data_contexts", dest="max_data_contexts", default=1000, 82 | help="number of max contexts to keep in the dataset", required=False) 83 | parser.add_argument("-svs", "--subtoken_vocab_size", dest="subtoken_vocab_size", default=186277, 84 | help="Max number of source subtokens to keep in the vocabulary", required=False) 85 | parser.add_argument("-tvs", "--target_vocab_size", dest="target_vocab_size", default=26347, 86 | help="Max number of target words to keep in the vocabulary", 
required=False) 87 | parser.add_argument("-sh", "--subtoken_histogram", dest="subtoken_histogram", 88 | help="subtoken histogram file", metavar="FILE", required=True) 89 | parser.add_argument("-nh", "--node_histogram", dest="node_histogram", 90 | help="node_histogram file", metavar="FILE", required=True) 91 | parser.add_argument("-th", "--target_histogram", dest="target_histogram", 92 | help="target histogram file", metavar="FILE", required=True) 93 | parser.add_argument("-o", "--output_name", dest="output_name", 94 | help="output name - the base name for the created dataset", required=True, default='data') 95 | args = parser.parse_args() 96 | 97 | train_data_path = args.train_data_path 98 | test_data_path = args.test_data_path 99 | val_data_path = args.val_data_path 100 | subtoken_histogram_path = args.subtoken_histogram 101 | node_histogram_path = args.node_histogram 102 | 103 | subtoken_to_count = common.Common.load_histogram(subtoken_histogram_path, 104 | max_size=int(args.subtoken_vocab_size)) 105 | node_to_count = common.Common.load_histogram(node_histogram_path, 106 | max_size=None) 107 | target_to_count = common.Common.load_histogram(args.target_histogram, 108 | max_size=int(args.target_vocab_size)) 109 | print('subtoken vocab size: ', len(subtoken_to_count)) 110 | print('node vocab size: ', len(node_to_count)) 111 | print('target vocab size: ', len(target_to_count)) 112 | 113 | num_training_examples = 0 114 | for data_file_path, data_role in zip([test_data_path, val_data_path, train_data_path], ['test', 'val', 'train']): 115 | num_examples = process_file(file_path=data_file_path, data_file_role=data_role, dataset_name=args.output_name, 116 | max_contexts=int(args.max_contexts), max_data_contexts=int(args.max_data_contexts)) 117 | if data_role == 'train': 118 | num_training_examples = num_examples 119 | 120 | save_dictionaries(dataset_name=args.output_name, subtoken_to_count=subtoken_to_count, 121 | node_to_count=node_to_count, target_to_count=target_to_count, 122 | max_contexts=int(args.max_data_contexts), num_examples=num_training_examples) 123 | -------------------------------------------------------------------------------- /preproc/preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | ########################################################### 3 | # Change the following values to preproc a new dataset. 4 | # TRAIN_DIR, VAL_DIR and TEST_DIR should be paths to 5 | # directories containing sub-directories with .java files 6 | # DATASET_NAME is just a name for the currently extracted 7 | # dataset. 8 | # MAX_DATA_CONTEXTS is the number of contexts to keep in the dataset for each 9 | # method (by default 1000). At training time, these contexts 10 | # will be downsampled dynamically to MAX_CONTEXTS. 11 | # MAX_CONTEXTS - the number of actual contexts (by default 200) 12 | # that are taken into consideration (out of MAX_DATA_CONTEXTS) 13 | # every training iteration. To avoid randomness at test time, 14 | # for the test and validation sets only MAX_CONTEXTS contexts are kept 15 | # (while for training, MAX_DATA_CONTEXTS are kept and MAX_CONTEXTS are 16 | # selected dynamically during training). 17 | # SUBTOKEN_VOCAB_SIZE, TARGET_VOCAB_SIZE - 18 | # - the number of subtokens and target words to keep 19 | # in the vocabulary (the top occurring words and paths will be kept). 20 | # NUM_THREADS - the number of parallel threads to use. 
It is 21 | # recommended to use a multi-core machine for the preprocessing 22 | # step and set this value to the number of cores. 23 | # PYTHON - python3 interpreter alias. 24 | TRAIN_DIR=java_code_train 25 | VAL_DIR=java_code_valid 26 | TEST_DIR=java_code_test 27 | 28 | DATASET_NAME=auto_comment_dataset 29 | 30 | MAX_DATA_CONTEXTS=1000 31 | MAX_CONTEXTS=200 32 | SUBTOKEN_VOCAB_SIZE=186277 33 | TARGET_VOCAB_SIZE=26347 34 | NUM_THREADS=32 35 | PYTHON=python3 36 | ########################################################### 37 | 38 | TRAIN_DATA_FILE=${DATASET_NAME}.train.raw.txt 39 | VAL_DATA_FILE=${DATASET_NAME}.val.raw.txt 40 | TEST_DATA_FILE=${DATASET_NAME}.test.raw.txt 41 | EXTRACTOR_JAR=JavaExtractor/JPredict/target/JavaExtractor-0.0.1-SNAPSHOT.jar 42 | #EXTRACTOR_JAR=JavaExtractor/JPredict/target/JavaExtractor.jar 43 | 44 | mkdir -p data 45 | mkdir -p data/${DATASET_NAME} 46 | 47 | echo "Extracting paths from validation set..." 48 | ${PYTHON} JavaExtractor/extract.py --dir ${VAL_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --jar ${EXTRACTOR_JAR} > ${VAL_DATA_FILE} 2>> error_log.txt 49 | echo "Finished extracting paths from validation set" 50 | echo "Extracting paths from test set..." 51 | ${PYTHON} JavaExtractor/extract.py --dir ${TEST_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --jar ${EXTRACTOR_JAR} > ${TEST_DATA_FILE} 2>> error_log.txt 52 | echo "Finished extracting paths from test set" 53 | echo "Extracting paths from training set..." 54 | ${PYTHON} JavaExtractor/extract.py --dir ${TRAIN_DIR} --max_path_length 8 --max_path_width 2 --num_threads ${NUM_THREADS} --jar ${EXTRACTOR_JAR} > ${TRAIN_DATA_FILE} 2>> error_log.txt 55 | echo "Finished extracting paths from training set" 56 | 57 | TARGET_HISTOGRAM_FILE=data/${DATASET_NAME}/${DATASET_NAME}.histo.tgt.c2s 58 | SOURCE_SUBTOKEN_HISTOGRAM=data/${DATASET_NAME}/${DATASET_NAME}.histo.ori.c2s 59 | NODE_HISTOGRAM_FILE=data/${DATASET_NAME}/${DATASET_NAME}.histo.node.c2s 60 | 61 | echo "Creating histograms from the training data" 62 | cat ${TRAIN_DATA_FILE} | cut -d' ' -f1 | tr '|' '\n' | awk '{n[$0]++} END {for (i in n) print i,n[i]}' > ${TARGET_HISTOGRAM_FILE} 63 | cat ${TRAIN_DATA_FILE} | cut -d' ' -f2- | tr ' ' '\n' | cut -d',' -f1,3 | tr ',|' '\n' | awk '{n[$0]++} END {for (i in n) print i,n[i]}' > ${SOURCE_SUBTOKEN_HISTOGRAM} 64 | cat ${TRAIN_DATA_FILE} | cut -d' ' -f2- | tr ' ' '\n' | cut -d',' -f2 | tr '|' '\n' | awk '{n[$0]++} END {for (i in n) print i,n[i]}' > ${NODE_HISTOGRAM_FILE} 65 | 66 | ${PYTHON} preproc/preprocess.py --train_data ${TRAIN_DATA_FILE} --test_data ${TEST_DATA_FILE} --val_data ${VAL_DATA_FILE} \ 67 | --max_contexts ${MAX_CONTEXTS} --max_data_contexts ${MAX_DATA_CONTEXTS} --subtoken_vocab_size ${SUBTOKEN_VOCAB_SIZE} \ 68 | --target_vocab_size ${TARGET_VOCAB_SIZE} --subtoken_histogram ${SOURCE_SUBTOKEN_HISTOGRAM} \ 69 | --node_histogram ${NODE_HISTOGRAM_FILE} --target_histogram ${TARGET_HISTOGRAM_FILE} --output_name data/${DATASET_NAME}/${DATASET_NAME} 70 | 71 | # If all went well, the raw data files can be deleted, because preproc.py creates new files 72 | # with truncated and padded number of paths for each example. 
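# For orientation: since --output_name above is data/${DATASET_NAME}/${DATASET_NAME}, this step leaves
# ${DATASET_NAME}.train.c2s, ${DATASET_NAME}.val.c2s, ${DATASET_NAME}.test.c2s and the pickled
# ${DATASET_NAME}.dict.c2s inside data/${DATASET_NAME}/ -- exactly the files that
# code2seq_master/train.sh expects under its --data prefix.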
73 | rm ${TRAIN_DATA_FILE} ${VAL_DATA_FILE} ${TEST_DATA_FILE} ${TARGET_HISTOGRAM_FILE} ${SOURCE_SUBTOKEN_HISTOGRAM} \ 74 | ${NODE_HISTOGRAM_FILE} 75 | -------------------------------------------------------------------------------- /presentation/AutoComments_Presentation-Group3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/presentation/AutoComments_Presentation-Group3.pdf -------------------------------------------------------------------------------- /presentation/link_to_presentation.txt: -------------------------------------------------------------------------------- 1 | The link to the presentation is: https://docs.google.com/presentation/d/1cNpiHHCmrLX-c2bckLLt2Ko3dJJmXe8CtRjA_8o7R-c/edit?usp=sharing -------------------------------------------------------------------------------- /report/ML4SE_group_3_report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/report/ML4SE_group_3_report.pdf -------------------------------------------------------------------------------- /report/latex_code/BasicEncoderDecoder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/report/latex_code/BasicEncoderDecoder.png -------------------------------------------------------------------------------- /report/latex_code/BiLSTM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/report/latex_code/BiLSTM.png -------------------------------------------------------------------------------- /report/latex_code/Embedding.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/report/latex_code/Embedding.png -------------------------------------------------------------------------------- /report/latex_code/Encoder(1).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/report/latex_code/Encoder(1).png -------------------------------------------------------------------------------- /report/latex_code/ExampleAST.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/report/latex_code/ExampleAST.png -------------------------------------------------------------------------------- /report/latex_code/LSTM.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/report/latex_code/LSTM.png -------------------------------------------------------------------------------- /report/latex_code/blueprints.tex: -------------------------------------------------------------------------------- 1 | Table: 2 | 3 | \begin{table}[H] 4 | \centering 5 | \begin{tabular}{lc} 6 | \noalign{\smallskip} \hline \hline \noalign{\smallskip} 7 | Parameter & Value \\ \hline 8 | Dropout 1 & 0.71 \\ 9 | Dropout 2 & 0.15 \\ 10 | Receptive 
field 1 & 3 \\ 11 | Receptive field 2 & 2 \\ 12 | Stride size 1 & 2 \\ 13 | Stride size 2 & 1 \\ 14 | Dense & 50 \\ \hline 15 | \end{tabular} 16 | \caption{Hyperparameters Convolutional Recurrent model} 17 | \label{Table:hyperconvrec} 18 | \end{table} 19 | 20 | 21 | Image: 22 | 23 | % 24 | \begin{figure}[H] 25 | \centering 26 | \includegraphics[width=\linewidth]{graphs/imbalance.png} 27 | \caption{High class imbalance} 28 | \label{fig:imb} 29 | \end{figure} 30 | % -------------------------------------------------------------------------------- /report/latex_code/distr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/report/latex_code/distr.png -------------------------------------------------------------------------------- /report/latex_code/example.java: -------------------------------------------------------------------------------- 1 | public static int add(int VAR0, int VAR1) { 2 | return VAR0 + VAR1; 3 | } 4 | -------------------------------------------------------------------------------- /report/latex_code/source-code/1.java: -------------------------------------------------------------------------------- 1 | public static byte[] bitmapToByte(Bitmap b){ 2 | ByteArrayOutputStream o = new ByteArrayOutputStream(); 3 | b.compress(Bitmap.CompressFormat.PNG,100,o); 4 | return o.toByteArray(); 5 | } -------------------------------------------------------------------------------- /report/latex_code/source-code/2.java: -------------------------------------------------------------------------------- 1 | private static void addDefaultProfile(App app ,Simple source){ 2 | if(!source.containsProperty("spring.profiles.active") 3 | && !System.getenv().containsKey("ACTIVE")){ 4 | app.setAdditionalProfiles(Constants.DEVELOPMENT); 5 | } 6 | } -------------------------------------------------------------------------------- /report/latex_code/source-code/3.java: -------------------------------------------------------------------------------- 1 | protected void createItemsLayout(){ 2 | if (mItemsLayout == null){ 3 | mItemsLayout=new LinearLayout(getContext()); 4 | mItemsLayout.setOrientation(LinearLayout.VERTICAL); 5 | } 6 | } -------------------------------------------------------------------------------- /report/latex_code/source-code/4.java: -------------------------------------------------------------------------------- 1 | public FactoryConfigurationError(Exception e){ 2 | super(e.toString()); 3 | this.exception=e; 4 | } -------------------------------------------------------------------------------- /report/latex_code/source-code/5.java: -------------------------------------------------------------------------------- 1 | public static void sort(Comparable[] a){ 2 | int n=a.length; 3 | for (int i=1; i < n; i++){ 4 | Comparable v=a[i]; 5 | int lo=0, hi=i; 6 | while (lo < hi){ ... } 7 | ... 
8 | } 9 | assert isSorted(a); 10 | } -------------------------------------------------------------------------------- /report/latex_code/source-code/6.java: -------------------------------------------------------------------------------- 1 | public boolean isEmpty(){ 2 | return root == null; 3 | } -------------------------------------------------------------------------------- /report/latex_code/source-code/7.java: -------------------------------------------------------------------------------- 1 | public boolean contains(int key){ 2 | return rank(key) != -1; 3 | } -------------------------------------------------------------------------------- /report/latex_code/source-code/8.java: -------------------------------------------------------------------------------- 1 | public void tag(String inputFileName,String outputFileName){ 2 | List<String> sentences=jsc.textFile(inputFileName).collect(); 3 | tag(sentences,outputFileName); 4 | } -------------------------------------------------------------------------------- /report/latex_code/source-code/9.java: -------------------------------------------------------------------------------- 1 | public void unlisten(String pattern){ 2 | UtilListener listener=listeners.get(pattern); 3 | if(listener!=null){ 4 | listener.destroy(); 5 | listeners.remove(pattern); 6 | }else{ 7 | client.onError(Topic.RECORD, Event.NOT_LISTENING,pattern); 8 | } 9 | } -------------------------------------------------------------------------------- /report/latex_code/zoomedInLength.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LRNavin/AutoComments/ed9ddadb00845daffe49f8513b81ddd32dfcd3aa/report/latex_code/zoomedInLength.png -------------------------------------------------------------------------------- /scripts/multi-bleu.perl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # This file is part of moses. Its use is licensed under the GNU Lesser General 4 | # Public License version 2.1 or, at your option, any later version. 
5 | 6 | # $Id$ 7 | use warnings; 8 | use strict; 9 | 10 | my $lowercase = 0; 11 | if ($ARGV[0] eq "-lc") { 12 | $lowercase = 1; 13 | shift; 14 | } 15 | 16 | my $stem = $ARGV[0]; 17 | if (!defined $stem) { 18 | print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n"; 19 | print STDERR "Reads the references from reference or reference0, reference1, ...\n"; 20 | exit(1); 21 | } 22 | 23 | $stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0"; 24 | 25 | my @REF; 26 | my $ref=0; 27 | while(-e "$stem$ref") { 28 | &add_to_ref("$stem$ref",\@REF); 29 | $ref++; 30 | } 31 | &add_to_ref($stem,\@REF) if -e $stem; 32 | die("ERROR: could not find reference file $stem") unless scalar @REF; 33 | 34 | # add additional references explicitly specified on the command line 35 | shift; 36 | foreach my $stem (@ARGV) { 37 | &add_to_ref($stem,\@REF) if -e $stem; 38 | } 39 | 40 | 41 | 42 | sub add_to_ref { 43 | my ($file,$REF) = @_; 44 | my $s=0; 45 | if ($file =~ /.gz$/) { 46 | open(REF,"gzip -dc $file|") or die "Can't read $file"; 47 | } else { 48 | open(REF,$file) or die "Can't read $file"; 49 | } 50 | while(<REF>) { 51 | chomp; 52 | push @{$$REF[$s++]}, $_; 53 | } 54 | close(REF); 55 | } 56 | 57 | my(@CORRECT,@TOTAL,$length_translation,$length_reference); 58 | my $s=0; 59 | while(<STDIN>) { 60 | chomp; 61 | $_ = lc if $lowercase; 62 | my @WORD = split; 63 | my %REF_NGRAM = (); 64 | my $length_translation_this_sentence = scalar(@WORD); 65 | my ($closest_diff,$closest_length) = (9999,9999); 66 | foreach my $reference (@{$REF[$s]}) { 67 | # print "$s $_ <=> $reference\n"; 68 | $reference = lc($reference) if $lowercase; 69 | my @WORD = split(' ',$reference); 70 | my $length = scalar(@WORD); 71 | my $diff = abs($length_translation_this_sentence-$length); 72 | if ($diff < $closest_diff) { 73 | $closest_diff = $diff; 74 | $closest_length = $length; 75 | # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n"; 76 | } elsif ($diff == $closest_diff) { 77 | $closest_length = $length if $length < $closest_length; 78 | # from two references with the same closeness to me 79 | # take the *shorter* into account, not the "first" one. 80 | } 81 | for(my $n=1;$n<=4;$n++) { 82 | my %REF_NGRAM_N = (); 83 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 84 | my $ngram = "$n"; 85 | for(my $w=0;$w<$n;$w++) { 86 | $ngram .= " ".$WORD[$start+$w]; 87 | } 88 | $REF_NGRAM_N{$ngram}++; 89 | } 90 | foreach my $ngram (keys %REF_NGRAM_N) { 91 | if (!defined($REF_NGRAM{$ngram}) || 92 | $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) { 93 | $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram}; 94 | # print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}<br>\n"; 95 | } 96 | } 97 | } 98 | } 99 | $length_translation += $length_translation_this_sentence; 100 | $length_reference += $closest_length; 101 | for(my $n=1;$n<=4;$n++) { 102 | my %T_NGRAM = (); 103 | for(my $start=0;$start<=$#WORD-($n-1);$start++) { 104 | my $ngram = "$n"; 105 | for(my $w=0;$w<$n;$w++) { 106 | $ngram .= " ".$WORD[$start+$w]; 107 | } 108 | $T_NGRAM{$ngram}++; 109 | } 110 | foreach my $ngram (keys %T_NGRAM) { 111 | $ngram =~ /^(\d+) /; 112 | my $n = $1; 113 | # my $corr = 0; 114 | # print "$i e $ngram $T_NGRAM{$ngram}<br>\n"; 115 | $TOTAL[$n] += $T_NGRAM{$ngram}; 116 | if (defined($REF_NGRAM{$ngram})) { 117 | if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) { 118 | $CORRECT[$n] += $T_NGRAM{$ngram}; 119 | # $corr = $T_NGRAM{$ngram}; 120 | # print "$i e correct1 $T_NGRAM{$ngram}<br>\n"; 121 | } 122 | else { 123 | $CORRECT[$n] += $REF_NGRAM{$ngram}; 124 | # $corr = $REF_NGRAM{$ngram}; 125 | # print "$i e correct2 $REF_NGRAM{$ngram}<br>\n"; 126 | } 127 | } 128 | # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram}; 129 | # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n" 130 | } 131 | } 132 | $s++; 133 | } 134 | my $brevity_penalty = 1; 135 | my $bleu = 0; 136 | 137 | my @bleu=(); 138 | 139 | for(my $n=1;$n<=4;$n++) { 140 | if (defined ($TOTAL[$n])){ 141 | $bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0; 142 | # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n"; 143 | }else{ 144 | $bleu[$n]=0; 145 | } 146 | } 147 | 148 | if ($length_reference==0){ 149 | printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n"; 150 | exit(1); 151 | } 152 | 153 | if ($length_translation<$length_reference) { 154 | $brevity_penalty = exp(1-$length_reference/$length_translation); 155 | } 156 | $bleu = $brevity_penalty * exp((my_log( $bleu[1] ) + 157 | my_log( $bleu[2] ) + 158 | my_log( $bleu[3] ) + 159 | my_log( $bleu[4] ) ) / 4) ; 160 | printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n", 161 | 100*$bleu, 162 | 100*$bleu[1], 163 | 100*$bleu[2], 164 | 100*$bleu[3], 165 | 100*$bleu[4], 166 | $brevity_penalty, 167 | $length_translation / $length_reference, 168 | $length_translation, 169 | $length_reference; 170 | 171 | 172 | print STDERR "It is not advisable to publish scores from multi-bleu.perl. The scores depend on your tokenizer, which is unlikely to be reproducible from your paper or consistent across research groups. Instead you should detokenize then use mteval-v14.pl, which has a standard tokenization. Scores from multi-bleu.perl can still be used for internal purposes when you have a consistent tokenizer.\n"; 173 | 174 | sub my_log { 175 | return -9999999999 unless $_[0]; 176 | return log($_[0]); 177 | } --------------------------------------------------------------------------------