├── .gitignore ├── readme.md └── src ├── Consts.java ├── JavaClan.java ├── JavaFileParser.java ├── VectorComparator.java └── VectorValueItem.java /.gitignore: -------------------------------------------------------------------------------- 1 | /.idea/* 2 | /JavaClanOnGitHub.iml 3 | /out/* 4 | 5 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # CLAN 2 | 3 | CLAN stands for Closely reLated ApplicatioNs. 4 | 5 | The original paper, by Collin McMillan, can be found here: 6 | http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.380.8742&rep=rep1&type=pdf 7 | 8 | This approach is implemented on GitHub, to be compared with RepoPal. RepoPal is a system that detects similar GitHub repositories based on stars and textual similarity. The source code of RepoPal can be found here: https://github.com/Qualia-Li/RepoPal. 9 | 10 | For more information about the research on Detecting Similar Repositories on GitHub, and the methodology used to compare CLAN and RepoPal, please visit http://www.liquanlai.com/blog/?p=195. 11 | 12 | ## Feature 13 | 14 | CLAN computes similarity between Java applications by comparing the API calls made by the two applications. 15 | 16 | ## Specification 17 | 18 | Change values in `Consts.java` to configure: 19 | 20 | - `ROOT_DIR`: a `.txt` file which contains the list of directories that are going to be checked. Each line should be in the form `id,directory`, in which `id` is a unique id of a specific repository and `directory` is the directory where the repository is located. 21 | 22 | - `JDK_DIR`: a `.txt` file which contains the list of JDK methods that are counted in the evaluation vector. Each line should be in the form `name`, in which `name` is the name of a JDK method. 23 | 24 | - `VEC_DIR`: a `.txt` file of all evaluation vectors calculated. Repository IDs are also included. 
/*
 * readme.md (continued) -- remaining configuration keys of the "Specification" list:
 *
 * - `AIM_NUM`: ID of the aim repository.
 *
 * - `ANS_AMT`: the number of answers to show.
 */

// The sources below were originally five separate files (see the path banners).
// They are presented as one compilation unit, so only JavaClan keeps the `public`
// modifier; every class lives in the default package, so access between them is unchanged.

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// --------------------------------------------------------------------------------
// /src/Consts.java
// --------------------------------------------------------------------------------

/**
 * Configuration values read by {@link JavaClan#main(String[])}.
 *
 * Created by Rex on 15/11/2.
 */
class Consts {

    /** Text file listing the repositories to scan, one {@code id,directory} entry per line. */
    final static String ROOT_DIR = "/Users/Rex/Desktop/test/dir.txt";
    /** Text file listing the JDK method names to count, one name per line. */
    final static String JDK_DIR = "/Users/Rex/Desktop/test/jdk.txt";
    /** Text file the computed vectors are written to and later read back from. */
    final static String VEC_DIR = "/Users/Rex/Desktop/test/vector.txt";
    /** Id of the repository the others are compared against. */
    final static String AIM_NUM = "1";
    /** Maximum number of repository ids to print. */
    final static int ANS_AMT = 100;

    private Consts() {
        // constants holder, not meant to be instantiated
    }
}

// --------------------------------------------------------------------------------
// /src/JavaClan.java
// --------------------------------------------------------------------------------

/**
 * Entry point: builds the vector file for every configured repository, then prints
 * the ids of the repositories ranked closest to {@link Consts#AIM_NUM}.
 *
 * Created by Rex on 15/11/2.
 */
public class JavaClan {

    /**
     * Runs the whole pipeline with the paths configured in {@link Consts}.
     * Any failure is reported by printing its stack trace.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            JavaFileParser parser = new JavaFileParser();
            parser.getJdkMethodInFile(Consts.ROOT_DIR, Consts.JDK_DIR);

            VectorComparator vectorComparator = new VectorComparator();
            ArrayList<String> answerList =
                    vectorComparator.getNearestVectorNumber(Consts.VEC_DIR, Consts.AIM_NUM, Consts.ANS_AMT);

            System.out.println("\n===== ANSWER =====\n");
            for (String item : answerList) {
                System.out.println(item);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

// --------------------------------------------------------------------------------
// /src/JavaFileParser.java
// --------------------------------------------------------------------------------

/**
 * Counts, per repository, how often each listed JDK method name appears in a
 * {@code .name(} call position inside the repository's {@code .java} files, and
 * writes the normalised counts to a vector file.
 */
class JavaFileParser {

    /** A dot, a run of ASCII letters/digits (captured), and an opening parenthesis. */
    private static final Pattern METHOD_CALL = Pattern.compile("\\.([a-zA-Z0-9]*)\\(");

    // Insertion-ordered so rows are written in the order the repositories were listed.
    private final HashMap<String, HashMap<String, Integer>> javaFileJDKList = new LinkedHashMap<>();
    private final HashMap<String, String> javaDirList = new HashMap<>();
    // Insertion-ordered so vector columns follow the order of the JDK method list file.
    private final HashSet<String> jdkSet = new LinkedHashSet<>();

    /** Opens {@code file} for line-wise reading using the platform default charset. */
    private static BufferedReader openReader(File file) throws IOException {
        return new BufferedReader(new InputStreamReader(new FileInputStream(file)));
    }

    /**
     * Recursively walks {@code fileList} and adds the method counts of every
     * {@code .java} file found to {@code jdkMethodCount}.
     *
     * @param fileList       directory to walk
     * @param dirNumber      id of the repository being scanned
     * @param jdkMethodCount accumulator shared by the whole walk of one repository
     * @throws Exception if {@code fileList} cannot be listed (not a directory or an I/O error),
     *                   or a source file cannot be read
     */
    private void getJavaFileList(File fileList, String dirNumber, HashMap<String, Integer> jdkMethodCount)
            throws Exception {

        File[] tempList = fileList.listFiles();
        if (null == tempList) {
            throw new Exception("ERROR: Nothing under the given directory.");
        }
        System.out.println("Number of items under the given directory: " + tempList.length);

        for (File file : tempList) {
            if (file.isDirectory()) {
                getJavaFileList(file, dirNumber, jdkMethodCount);
            } else if (file.isFile() && file.getName().endsWith(".java")) {
                // Match the extension only: a substring test would also pick up names
                // such as "notes-java.txt". Each file is parsed exactly once.
                for (Map.Entry<String, Integer> entry : getJDKList(file).entrySet()) {
                    jdkMethodCount.merge(entry.getKey(), entry.getValue(), Integer::sum);
                }
            }
        }
    }

    /**
     * Counts the occurrences of the known JDK method names in one source file.
     *
     * @param file source file to scan
     * @return method name to number of occurrences; names that never occur are absent
     * @throws Exception if the file cannot be read
     */
    private HashMap<String, Integer> getJDKList(File file) throws Exception {
        HashMap<String, Integer> jdkMethodCount = new HashMap<>();

        try (BufferedReader bufferedReader = openReader(file)) {
            String lineTxt;
            while ((lineTxt = bufferedReader.readLine()) != null) {
                Matcher mat = METHOD_CALL.matcher(lineTxt);
                while (mat.find()) {
                    String name = mat.group(1);
                    if (jdkSet.contains(name)) {
                        jdkMethodCount.merge(name, 1, Integer::sum);
                    }
                }
            }
        }
        return jdkMethodCount;
    }

    /**
     * Loads the JDK method names to count, one per line. Surrounding whitespace is
     * stripped and blank lines are ignored so they do not become vector columns.
     *
     * @param jdkFileDir path of the method list file
     * @throws Exception if the file cannot be read
     */
    private void getJdkSetFromFile(String jdkFileDir) throws Exception {
        try (BufferedReader bufferedReader = openReader(new File(jdkFileDir))) {
            String item;
            while (null != (item = bufferedReader.readLine())) {
                String name = item.trim();
                if (!name.isEmpty()) {
                    jdkSet.add(name);
                }
            }
        }
    }

    /**
     * Writes one line per scanned repository: the repository id followed by, for each
     * known JDK method, its count divided by the repository's total count
     * (a literal {@code 0} when the method never occurred).
     *
     * @param vectorFileDir path of the file to create or overwrite
     * @throws Exception if the file cannot be written
     */
    private void printJdkVectorToFile(String vectorFileDir) throws Exception {
        // FileWriter creates the file when it is missing, so no separate creation step is needed.
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(vectorFileDir))) {
            for (Map.Entry<String, HashMap<String, Integer>> repository : javaFileJDKList.entrySet()) {
                HashMap<String, Integer> counts = repository.getValue();

                long totalTermAmount = 0;
                for (int count : counts.values()) {
                    totalTermAmount += count;
                }

                StringBuilder infoToWrite = new StringBuilder(repository.getKey());
                for (String jdkName : jdkSet) {
                    Integer count = counts.get(jdkName);
                    infoToWrite.append(',');
                    if (null == count) {
                        infoToWrite.append('0');
                    } else {
                        // A stored count is at least 1, so the total is non-zero here.
                        infoToWrite.append((double) count / totalTermAmount);
                    }
                }

                writer.write(infoToWrite.toString());
                writer.newLine();
            }
        }
        System.out.println("Vector file wrote successfully.");
    }

    /**
     * Scans every listed repository and writes the vectors to {@link Consts#VEC_DIR}.
     *
     * @param javaFileDir path of the repository list ({@code id,directory} per line)
     * @param jdkFileDir  path of the JDK method list (one name per line)
     * @throws Exception if any file cannot be read or written, or a repository line is malformed
     */
    public void getJdkMethodInFile(String javaFileDir, String jdkFileDir) throws Exception {
        getJdkMethodInFile(javaFileDir, jdkFileDir, Consts.VEC_DIR);
    }

    /**
     * Scans every listed repository and writes the vectors to {@code vectorFileDir}.
     *
     * @param javaFileDir   path of the repository list ({@code id,directory} per line; blank lines are skipped)
     * @param jdkFileDir    path of the JDK method list (one name per line)
     * @param vectorFileDir path of the vector file to create or overwrite
     * @throws IllegalArgumentException if a non-blank repository line contains no comma
     * @throws Exception                if any file cannot be read or written
     */
    public void getJdkMethodInFile(String javaFileDir, String jdkFileDir, String vectorFileDir) throws Exception {

        getJdkSetFromFile(jdkFileDir);

        try (BufferedReader bufferedReader = openReader(new File(javaFileDir))) {
            String item;
            while (null != (item = bufferedReader.readLine())) {
                if (item.trim().isEmpty()) {
                    continue;
                }
                int commaPosition = item.indexOf(',');
                if (commaPosition < 0) {
                    throw new IllegalArgumentException(
                            "Malformed repository line (expected `id,directory`): " + item);
                }
                String dirNumber = item.substring(0, commaPosition).trim();
                String dirName = item.substring(commaPosition + 1).trim();
                javaDirList.put(dirName, dirNumber);

                HashMap<String, Integer> jdkMethodCount = new HashMap<>();
                getJavaFileList(new File(dirName), dirNumber, jdkMethodCount);
                javaFileJDKList.put(dirNumber, jdkMethodCount);
            }
        }

        printJdkVectorToFile(vectorFileDir);
    }
}

// --------------------------------------------------------------------------------
// /src/VectorComparator.java
// --------------------------------------------------------------------------------

/**
 * Ranks the vectors of a vector file by their dot product with one chosen vector.
 *
 * Created by Rex on 15/11/2.
 */
class VectorComparator {

    private final HashMap<String, ArrayList<Double>> vectorList = new HashMap<>();
    private final ArrayList<VectorValueItem> compareResultList = new ArrayList<>();

    /**
     * Orders items by descending score. Exact comparison keeps the ordering transitive
     * (an epsilon-based "equal" does not, which {@code Collections.sort} may reject);
     * equal scores are ordered by id so the ranking is deterministic.
     */
    Comparator<VectorValueItem> vectorComp = new Comparator<VectorValueItem>() {
        @Override
        public int compare(VectorValueItem vec0, VectorValueItem vec1) {
            int byScore = Double.compare(vec1.vectorValue, vec0.vectorValue);
            if (byScore != 0) {
                return byScore;
            }
            return vec0.vectorNumber.compareTo(vec1.vectorNumber);
        }
    };

    /**
     * Reads {@code id,v1,v2,...} lines into {@code vectorList}. Blank lines are skipped.
     *
     * @param vectorDir path of the vector file
     * @throws Exception if the file cannot be read or a component is not a number
     */
    private void getVectorFromFile(String vectorDir) throws Exception {
        File file = new File(vectorDir);
        try (BufferedReader bufferedReader =
                     new BufferedReader(new InputStreamReader(new FileInputStream(file)))) {
            String item;
            while (null != (item = bufferedReader.readLine())) {
                if (item.trim().isEmpty()) {
                    continue;
                }
                String[] items = item.split(",");
                ArrayList<Double> vectorItem = new ArrayList<>(items.length);
                for (int i = 1; i < items.length; i++) {
                    vectorItem.add(Double.parseDouble(items[i]));
                }
                vectorList.put(items[0].trim(), vectorItem);
            }
        }
    }

    /**
     * Computes the dot product of the aim vector with every other loaded vector and
     * appends the results to {@code compareResultList}.
     *
     * @param aimVectorNumber id of the vector to compare against
     * @throws IllegalArgumentException if no vector with that id was loaded
     */
    private void calculateComparedVectorValue(String aimVectorNumber) {
        ArrayList<Double> aimVector = vectorList.get(aimVectorNumber);
        if (null == aimVector) {
            throw new IllegalArgumentException(
                    "Aim vector \"" + aimVectorNumber + "\" was not found in the vector file.");
        }

        for (Map.Entry<String, ArrayList<Double>> candidate : vectorList.entrySet()) {
            if (candidate.getKey().equals(aimVectorNumber)) {
                continue;
            }
            ArrayList<Double> other = candidate.getValue();
            // Components missing from the shorter vector contribute nothing to the product.
            int length = Math.min(aimVector.size(), other.size());
            double comparedValue = 0;
            for (int i = 0; i < length; i++) {
                comparedValue += aimVector.get(i) * other.get(i);
            }
            compareResultList.add(new VectorValueItem(candidate.getKey(), comparedValue));
        }
    }

    /**
     * Returns the ids of the vectors with the highest dot product against the aim vector,
     * best first. The aim vector itself is never part of the result.
     *
     * @param vectorDir       path of the vector file
     * @param aimVectorNumber id of the vector to compare against
     * @param answerAmount    maximum number of ids to return
     * @return at most {@code answerAmount} ids, highest score first
     * @throws IllegalArgumentException if the aim id is not present in the file
     * @throws Exception                if the file cannot be read or parsed
     */
    public ArrayList<String> getNearestVectorNumber(String vectorDir, String aimVectorNumber, int answerAmount)
            throws Exception {
        // Start from a clean slate so a second call does not report earlier results again.
        vectorList.clear();
        compareResultList.clear();

        getVectorFromFile(vectorDir);
        calculateComparedVectorValue(aimVectorNumber);
        Collections.sort(compareResultList, vectorComp);

        int limit = Math.max(0, Math.min(answerAmount, compareResultList.size()));
        ArrayList<String> topRatedVectorNumbers = new ArrayList<>(limit);
        for (int i = 0; i < limit; i++) {
            topRatedVectorNumbers.add(compareResultList.get(i).vectorNumber);
        }
        return topRatedVectorNumbers;
    }
}

// --------------------------------------------------------------------------------
// /src/VectorValueItem.java
// --------------------------------------------------------------------------------

/**
 * A vector id paired with the score computed for it.
 *
 * Created by Rex on 15/11/2.
 */
class VectorValueItem {
    /** Id of the vector, as written in the first column of the vector file. */
    public String vectorNumber;
    /** Score assigned to the vector. */
    public double vectorValue;

    /**
     * @param number id of the vector
     * @param value  score of the vector
     */
    public VectorValueItem(String number, double value) {
        vectorNumber = number;
        vectorValue = value;
    }
}
// --------------------------------------------------------------------------------