├── .gitignore ├── DeepBugs ├── benchmarks ├── javascript │ ├── astWalkTest.js │ ├── compareWarnings.js │ ├── extractFromJS.js │ ├── extractorOfAssignments.js │ ├── extractorOfAssignments2.js │ ├── extractorOfBinOps.js │ ├── extractorOfCalls.js │ ├── extractorOfCallsMissingArg.js │ ├── extractorOfIdsLitsWithASTFamily.js │ ├── extractorOfIdsLitsWithIds.js │ ├── extractorOfIdsLitsWithTokens.js │ ├── extractorOfTokens.js │ ├── jsExtractionUtil.js │ ├── modifyArgumentOrder.js │ ├── rb-nodeify.sh │ ├── seedBugs.js │ └── tokenize.js └── python │ ├── ASTEmbeddingLearner.py │ ├── ASTEmbeddingLearnerPerLocation.py │ ├── AccuracyMetricTest.py │ ├── BinOpContextToEmbedding.py │ ├── BugDetection.py │ ├── CallContextToEmbedding.py │ ├── CallPerCalleeCounter.py │ ├── CallPerFileCounter.py │ ├── EmbeddingEvaluator.py │ ├── EmbeddingEvaluatorWord2Vec.py │ ├── EmbeddingLearner.py │ ├── EmbeddingLearnerWord2Vec.py │ ├── EmbeddingModelValidator.py │ ├── LearningDataBinOperator.py │ ├── LearningDataIncorrectAssignment.py │ ├── LearningDataIncorrectAssignment_with_parents.py │ ├── LearningDataIncorrectBinaryOperand.py │ ├── LearningDataMissingArg.py │ ├── LearningDataSwappedArgs.py │ ├── LearningDataSwappedBinOperands.py │ ├── LocationBasedEmbeddingEvaluator.py │ ├── RandomEmbeddingLearner.py │ ├── TokenWithASTContextPerLocationToNumbers.py │ ├── TokenWithASTContextToNumbers.py │ ├── TokenWithContextStats.py │ ├── TokenWithContextToNumbers.py │ ├── TokensToTopTokens.py │ ├── Util.py │ ├── __init__.py │ ├── create_and_analyse_dataset_for_DeepBugs_assignments.ipynb │ ├── create_and_analyse_dataset_for_DeepBugs_binOpnd.ipynb │ ├── create_dataset_from_seeded_bugs.py │ ├── extract_from_js_parallel.py │ └── tools │ └── anomalyAnalyzer.py ├── INSTALL.md ├── LICENSE ├── README.md ├── REQUIREMENTS.md ├── bug_seeding ├── bug_seeding_approaches │ ├── SeedBugs.py │ ├── SemSeed │ │ ├── BugSeedingUtils.py │ │ └── SemSeedBugs.py │ └── Syntactic │ │ └── SyntacticSeedBugs.py ├── obtain_bug_seeding_patterns │ ├── extract_bug_seeding_patterns_from_repos │ │ ├── CodeAnalysis.js │ │ ├── aggregateChanges.py │ │ ├── analyses │ │ │ └── ExtractDataGivenNodes.js │ │ ├── callNodeJSExtractData.py │ │ ├── database │ │ │ └── GitHubCommits.py │ │ ├── extractNodeData.js │ │ ├── main.py │ │ ├── python_calls_me_to_extract_patterns.js │ │ └── utils │ │ │ ├── fileoperations.js │ │ │ ├── fileutils.py │ │ │ └── format_a_js_file.js │ └── repo_downloader │ │ ├── downloadTopGithubRepos.js │ │ ├── fileoperations.js │ │ ├── getTopGitHubRepoNames.js │ │ └── main.js ├── run_bug_seeding.py ├── seed_bugs_to_a_file.py └── utils │ ├── argument_utils.py │ ├── bug_seeding_pattern_utils.py │ ├── fileutils.py │ ├── format_bug_seeded_files.py │ ├── prepare_for_seeding_bug.py │ └── static_analysis_utils.py ├── compare_real_bug_finding_ability ├── DeepBugs_prediction_evaluation.ipynb ├── create_dataset_from_real_bugs_assignments.ipynb ├── create_dataset_from_real_bugs_binopnds.ipynb └── syntax_check_mutandis_compare.ipynb ├── database_config.json ├── package.json └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | **/node_modules/** 2 | **/.idea/** 3 | **/semseed_venv/** 4 | __pycache__/ 5 | .ipynb_checkpoints 6 | **/.git/** 7 | -------------------------------------------------------------------------------- /DeepBugs/benchmarks: -------------------------------------------------------------------------------- 1 | IntxLNK../benchmarks/ 
-------------------------------------------------------------------------------- /DeepBugs/javascript/astWalkTest.js: -------------------------------------------------------------------------------- 1 | // Author: Michael Pradel 2 | 3 | (function() { 4 | 5 | const acorn = require("acorn"); 6 | const estraverse = require("estraverse"); 7 | 8 | function getChildren(parent, ignoredChild) { 9 | const children = []; 10 | for (const prop in parent) { 11 | if (parent.hasOwnProperty(prop)) { 12 | const child = parent[prop]; 13 | if (Array.isArray(child)) { 14 | for (let i = 0; i < child.length; i++) { 15 | const actualChild = child[i]; 16 | if (actualChild !== ignoredChild) { 17 | children.push(actualChild); 18 | } 19 | } 20 | } else if (typeof child === "object") { 21 | if (child !== ignoredChild) { 22 | children.push(child); 23 | } 24 | } 25 | } 26 | } 27 | return children; 28 | } 29 | 30 | function getAllChildren(parents, ignoredChild) { 31 | const allChildren = []; 32 | for (let i = 0; i < parents.length; i++) { 33 | const parent = parents[i]; 34 | const newChildren = getChildren(parent); 35 | for (let j = 0; j < newChildren.length; j++) { 36 | const newChild = newChildren[j]; 37 | if (newChild !== ignoredChild) { 38 | allChildren.push(newChild) 39 | } 40 | } 41 | } 42 | return allChildren; 43 | } 44 | 45 | function positionIn(parent, child) { 46 | const position = getChildren(parent).indexOf(child); 47 | if (position === -1) throw "Could not find child in parent: " + JSON.stringify(parent) + " -- "+ JSON.stringify(child); 48 | return position; 49 | } 50 | 51 | function nodeToString(node) { 52 | let result; 53 | if (node.type === "Identifier") { 54 | result = "ID:" + node.name; 55 | } else if (node.type === "Literal") { 56 | result = "LIT:" + node.value; 57 | } else if (Array.isArray(node)) { 58 | result = "Array"; 59 | } else if (typeof node.type === "string") { 60 | result = node.type; 61 | } else { 62 | throw "Unexpected node type: " + JSON.stringify(node); 63 | } 64 | // TODO limit size 65 | return result; 66 | } 67 | 68 | const ast = acorn.parse("elems.push(2, 'aa')"); 69 | console.log(JSON.stringify(ast, 0, 2)); 70 | const ancestors= []; 71 | estraverse.traverse(ast, { 72 | enter:function(node, parent) { 73 | if (node.type === "Literal") { 74 | const positionInParent = positionIn(parent, node); 75 | const grandParent = ancestors[ancestors.length - 2]; 76 | const positionInGrandParent = positionIn(grandParent, parent); 77 | const siblings = getChildren(parent, node); 78 | const uncles = getChildren(grandParent, parent); // getUncles(grandParent, parent); 79 | const cousins = getAllChildren(uncles); 80 | const nephews = getAllChildren(siblings); 81 | console.log("\n"+JSON.stringify(node)); 82 | console.log("Parent : " + nodeToString(parent)); 83 | console.log(" Position : " + positionInParent); 84 | console.log("Grandparent: " + nodeToString((grandParent))); 85 | console.log(" Position : " + positionInGrandParent); 86 | console.log("Siblings : " + siblings.map(nodeToString)); 87 | console.log("Uncles : " + uncles.map(nodeToString)); 88 | console.log("Cousins : " + cousins.map(nodeToString)); 89 | console.log("Nephews : " + nephews.map(nodeToString)); 90 | } 91 | 92 | ancestors.push(node); 93 | }, 94 | leave:function(node, parent) { 95 | ancestors.pop(); 96 | } 97 | }); 98 | 99 | })(); -------------------------------------------------------------------------------- /DeepBugs/javascript/compareWarnings.js: -------------------------------------------------------------------------------- 1 | 
// Author: Michael Pradel 2 | // Compares warnings found with different variants of the approach. 3 | // arg1 = file with inspected warnings 4 | // arg2 = file with other warnings 5 | 6 | (function() { 7 | 8 | const fs = require("fs"); 9 | const process = require("process"); 10 | 11 | function Warning(score, location, extraInfo, isTruePositive) { 12 | this.score = score; 13 | this.location = location; 14 | this.extraInfo = extraInfo; 15 | this.isTruePositive = isTruePositive; 16 | } 17 | 18 | Warning.prototype.equals = function(other) { 19 | return this.location === other.location && this.extraInfo === other.extraInfo; 20 | }; 21 | 22 | function readWarnings(path) { 23 | const result = [] 24 | let allLines = fs.readFileSync(path, {encoding:"utf8"}); 25 | allLines = allLines.split("\n"); 26 | for (let i = 0; i < allLines.length; i++) { 27 | const line = allLines[i]; 28 | const entries = line.split(" | "); 29 | let extraInfo, isTruePositive; 30 | if (entries[entries.length - 2] === "y" || entries[entries.length - 2] === "n") { 31 | // has been manually inspected and classified 32 | extraInfo = entries.slice(2, entries.length - 2).join(" | "); 33 | isTruePositive = entries[entries.length - 2]; 34 | } else { 35 | extraInfo = entries.slice(2).join(" | "); 36 | } 37 | const warning = new Warning(entries[0], entries[1], extraInfo, isTruePositive); 38 | result.push(warning); 39 | } 40 | return result; 41 | } 42 | 43 | const args = process.argv.slice(2); 44 | const inspectedWarnings = readWarnings(args[0]); 45 | const otherWarnings = readWarnings(args[1]); 46 | 47 | for (let i = 0; i < inspectedWarnings.length; i++) { 48 | const inspectedWarning = inspectedWarnings[i]; 49 | const classification = inspectedWarning.isTruePositive === "y" ? "TP" : "FP"; 50 | 51 | let found = false; 52 | for (let j = 0; j < otherWarnings.length; j++) { 53 | const otherWarning = otherWarnings[j]; 54 | if (inspectedWarning.equals(otherWarning)) { 55 | found = true; 56 | console.log(classification, " with score ", otherWarning.score, inspectedWarning.location); 57 | break; 58 | } 59 | } 60 | if (!found) { 61 | console.log(classification, "not found", inspectedWarning.location); 62 | } 63 | 64 | 65 | 66 | } 67 | 68 | 69 | })(); -------------------------------------------------------------------------------- /DeepBugs/javascript/extractorOfAssignments.js: -------------------------------------------------------------------------------- 1 | // Author: Michael Pradel 2 | 3 | (function() { 4 | 5 | const fs = require("fs"); 6 | const estraverse = require("estraverse"); 7 | const util = require("./jsExtractionUtil"); 8 | 9 | function visitCode(ast, locationMap, path, allAssignments, fileID) { 10 | console.log("Reading " + path); 11 | 12 | let totalAssignments = 0; 13 | let totalAssignmentsConsidered = 0; 14 | 15 | const assignments = []; 16 | const code = fs.readFileSync(path); 17 | estraverse.traverse(ast, { 18 | enter:function(node, parent) { 19 | let lhs, rhs; 20 | if (node && node.type === "AssignmentExpression") { 21 | totalAssignments += 1; 22 | if (node.left.type === "Identifier") { 23 | lhs = node.left; 24 | rhs = node.right; 25 | } else if (node && node.type === "VariableDeclarator" && node.init !== null) { 26 | lhs = node.id; 27 | rhs = node.init; 28 | } else return; 29 | 30 | const nameOfLHS = util.getNameOfASTNode(lhs); 31 | const nameOfRHS = util.getNameOfASTNode(rhs); 32 | if (typeof nameOfLHS !== "undefined" && typeof nameOfRHS !== "undefined") { 33 | let locString = path + " : " + node.loc.start.line + " - " + 
node.loc.end.line; 34 | let typeOfRHS = util.getTypeOfASTNode(rhs); 35 | const assignment = { 36 | lhs:nameOfLHS, 37 | rhs:nameOfRHS, 38 | rhsType:typeOfRHS, 39 | src:locString 40 | }; 41 | totalAssignmentsConsidered += 1; 42 | assignments.push(assignment); 43 | } 44 | } 45 | } 46 | }); 47 | allAssignments.push(...assignments); 48 | console.log("Added assignments. Total now: " + allAssignments.length); 49 | console.log("Considered assignments: " + totalAssignmentsConsidered + " out of " + totalAssignments + " (" + Math.round(100 * totalAssignmentsConsidered / totalAssignments) + "%)"); 50 | } 51 | 52 | module.exports.visitCode = visitCode; 53 | 54 | })(); 55 | 56 | -------------------------------------------------------------------------------- /DeepBugs/javascript/extractorOfAssignments2.js: -------------------------------------------------------------------------------- 1 | // Author: Michael Pradel 2 | 3 | (function () { 4 | 5 | const fs = require("fs"); 6 | const estraverse = require("estraverse"); 7 | const util = require("./jsExtractionUtil"); 8 | 9 | const identifierContextWindowSize = 20; // assumption: even number 10 | 11 | function visitCode(ast, locationMap, path, allAssignments, fileID) { 12 | console.log("Reading " + path); 13 | 14 | let totalAssignments = 0; 15 | let totalAssignmentsConsidered = 0; 16 | 17 | const pastIdentifiers = []; 18 | const unfinishedAssignments = []; 19 | const parentStack = []; 20 | const assignments = []; 21 | estraverse.traverse(ast, { 22 | enter: function (node, parent) { 23 | if (parent) parentStack.push(parent); 24 | let extract = false; 25 | let cur_node_line_num = node.loc.start.line + "-" + node.loc.end.line; 26 | // console.log(fileID, cur_node_line_num); 27 | if (fileID === null) { 28 | extract = true; 29 | } else if (cur_node_line_num === fileID) { 30 | extract = true; 31 | } 32 | if (extract && node && node.type === "Identifier") { 33 | pastIdentifiers.push("ID:" + node.name); 34 | 35 | // finalize assignments with now-available postIdentifierContext 36 | let nbFinished = 0; 37 | for (let i = 0; i < unfinishedAssignments.length; i++) { 38 | const unfinishedAssignment = unfinishedAssignments[i]; 39 | if (pastIdentifiers.length >= unfinishedAssignment.identifierIndex + identifierContextWindowSize / 2) { 40 | const postIdentifierContext = pastIdentifiers.slice(unfinishedAssignment.identifierIndex, unfinishedAssignment.identifierIndex + identifierContextWindowSize / 2); 41 | unfinishedAssignment.assignment.context = unfinishedAssignment.assignment.context.concat(postIdentifierContext); 42 | totalAssignmentsConsidered += 1; 43 | assignments.push(unfinishedAssignment.assignment); 44 | nbFinished++; 45 | } else { 46 | break; 47 | } 48 | } 49 | unfinishedAssignments.splice(0, nbFinished); 50 | } 51 | 52 | let lhs, rhs; 53 | // let selectedNodeTypes = ['ExpressionStatement', 'VariableDeclaration', 'VariableDeclarator', 54 | // 'AssignmentExpression', 'AssignmentPattern']; 55 | if (extract && node && node.type === "AssignmentExpression") { 56 | totalAssignments += 1; 57 | if (node.left.type === "Identifier") { 58 | lhs = node.left; 59 | rhs = node.right; 60 | } else if (node && node.type === "VariableDeclarator" && node.init !== null) { 61 | lhs = node.id; 62 | rhs = node.init; 63 | } else return; 64 | // TODO: consider assignments to properties (and use property name as rhs) 65 | 66 | const nameOfLHS = util.getNameOfASTNode(lhs); 67 | const nameOfRHS = util.getNameOfASTNode(rhs); 68 | const parentName = parent.type; 69 | const grandParentName = 
parentStack.length > 1 ? parentStack[parentStack.length - 2].type : ""; 70 | const preIdentifierContext = pastIdentifiers.slice(Math.max(0, pastIdentifiers.length - identifierContextWindowSize / 2), pastIdentifiers.length); 71 | while (preIdentifierContext.length < identifierContextWindowSize / 2) { 72 | preIdentifierContext.unshift(""); 73 | } 74 | if (typeof nameOfLHS !== "undefined" && typeof nameOfRHS !== "undefined") { 75 | let locString = path + " : " + node.loc.start.line + " - " + node.loc.end.line; 76 | let typeOfRHS = util.getTypeOfASTNode(rhs); 77 | const assignment = { 78 | lhs: nameOfLHS, 79 | rhs: nameOfRHS, 80 | rhsType: typeOfRHS, 81 | parent: parentName, 82 | grandParent: grandParentName, 83 | context: preIdentifierContext, // postIdentifierContext will get appended later 84 | src: locString, 85 | range: [node.start, node.end] 86 | }; 87 | unfinishedAssignments.push({assignment: assignment, identifierIndex: pastIdentifiers.length}); 88 | } 89 | } 90 | }, 91 | leave: function (node, parent) { 92 | if (parent) parentStack.pop(); 93 | } 94 | }); 95 | 96 | for (let i = 0; i < unfinishedAssignments.length; i++) { 97 | const unfinishedAssignment = unfinishedAssignments[i]; 98 | const postIdentifierContext = pastIdentifiers.slice(unfinishedAssignment.identifierIndex, unfinishedAssignment.identifierIndex + identifierContextWindowSize / 2); 99 | while (postIdentifierContext.length < identifierContextWindowSize / 2) { 100 | postIdentifierContext.push(""); 101 | } 102 | unfinishedAssignment.assignment.context = unfinishedAssignment.assignment.context.concat(postIdentifierContext); 103 | totalAssignmentsConsidered += 1; 104 | assignments.push(unfinishedAssignment.assignment); 105 | } 106 | 107 | allAssignments.push(...assignments); 108 | console.log("Added assignments. Total now: " + allAssignments.length); 109 | console.log("Considered assignments: " + totalAssignmentsConsidered + " out of " + totalAssignments + " (" + Math.round(100 * totalAssignmentsConsidered / totalAssignments) + "%)"); 110 | } 111 | 112 | module.exports.visitCode = visitCode; 113 | 114 | })(); 115 | -------------------------------------------------------------------------------- /DeepBugs/javascript/extractorOfBinOps.js: -------------------------------------------------------------------------------- 1 | // Author: Michael Pradel 2 | 3 | (function () { 4 | 5 | const fs = require("fs"); 6 | const estraverse = require("estraverse"); 7 | const util = require("./jsExtractionUtil"); 8 | 9 | function visitCode(ast, locationMap, path, allBinOps, fileIDStr) { 10 | console.log("Reading " + path); 11 | 12 | let totalBinOps = 0; 13 | let totalBinOpsConsidered = 0; 14 | 15 | const parentStack = []; 16 | const binOps = []; 17 | let tokenID = 1; 18 | estraverse.traverse(ast, { 19 | enter: function (node, parent) { 20 | if (parent) parentStack.push(parent); 21 | 22 | let extract = false; 23 | 24 | if (fileIDStr === null) { 25 | extract = true; 26 | } else if ((node.loc.start.line + "-" + node.loc.end.line) === fileIDStr) { 27 | extract = true; 28 | } 29 | 30 | if (node.type === "BinaryExpression" && extract) { 31 | totalBinOps += 1; 32 | const leftName = util.getNameOfASTNode(node.left); 33 | const rightName = util.getNameOfASTNode(node.right); 34 | const leftType = util.getTypeOfASTNode(node.left); 35 | const rightType = util.getTypeOfASTNode(node.right); 36 | const parentName = parent.type; 37 | const grandParentName = parentStack.length > 1 ? 
parentStack[parentStack.length - 2].type : ""; 38 | if (typeof leftName !== "undefined" && typeof rightName !== "undefined") { 39 | let locString = path + " : " + node.loc.start.line + " - " + node.loc.end.line; 40 | 41 | const binOp = { 42 | left: leftName, 43 | right: rightName, 44 | op: node.operator, 45 | leftType: leftType, 46 | rightType: rightType, 47 | parent: parentName, 48 | grandParent: grandParentName, 49 | src: locString, 50 | range: [node.start, node.end] 51 | }; 52 | binOps.push(binOp); 53 | totalBinOpsConsidered += 1; 54 | tokenID += 1; 55 | } 56 | } 57 | }, 58 | leave: function (node, parent) { 59 | if (parent) parentStack.pop(); 60 | } 61 | }); 62 | allBinOps.push(...binOps); 63 | console.log("Added binary operations. Total now: " + allBinOps.length); 64 | console.log("Considered binary operations: " + totalBinOpsConsidered + " out of " + totalBinOps + " (" + Math.round(100 * totalBinOpsConsidered / totalBinOps) + "%)"); 65 | } 66 | 67 | module.exports.visitCode = visitCode; 68 | 69 | })(); 70 | -------------------------------------------------------------------------------- /DeepBugs/javascript/extractorOfCalls.js: -------------------------------------------------------------------------------- 1 | // Author: Michael Pradel 2 | 3 | (function() { 4 | 5 | const fs = require("fs"); 6 | const estraverse = require("estraverse"); 7 | const util = require("./jsExtractionUtil"); 8 | 9 | // configuration parameters 10 | const minArgs = 2; 11 | const maxLengthOfCalleeAndArguments = 200; // maximum number of characters 12 | 13 | function visitCode(ast, locationMap, path, allCalls, fileID) { 14 | console.log("Reading " + path); 15 | 16 | // first pass through AST: visit each fct. def. and extract formal parameter names 17 | const functionToParameters = {}; // string to array of strings 18 | let functionCounter = 0; 19 | estraverse.traverse(ast, { 20 | enter:function(node, parent) { 21 | if (node.type === "FunctionDeclaration" || node.type === "FunctionExpression") { 22 | functionCounter++; 23 | if (node.params.length > 1) { 24 | let functionName = util.getNameOfFunction(node, parent); 25 | if (functionName) { 26 | if (!functionToParameters.hasOwnProperty(functionName)) { 27 | const parameterNames = []; 28 | for (let i = 0; i < node.params.length; i++) { 29 | const parameter = node.params[i]; 30 | parameterNames.push("ID:"+parameter.name); 31 | } 32 | functionToParameters[functionName] = parameterNames; 33 | } // heuristically use only the first declaration in this file 34 | } 35 | } 36 | } 37 | } 38 | }); 39 | // console.log("Functions with parameter names: "+Object.keys(functionToParameters).length+" of "+functionCounter); 40 | 41 | // second pass through AST: visit each call site and extract call data 42 | const calls = []; 43 | const parentStack = []; 44 | let callCounter = 0; 45 | let callWithParameterNameCounter = 0; 46 | estraverse.traverse(ast, { 47 | enter:function(node, parent) { 48 | if (parent) parentStack.push(parent); 49 | if (node && node.type === "CallExpression") { 50 | if (node.arguments.length < minArgs) return; 51 | 52 | let calleeString; 53 | let baseString; 54 | let calleeNode; 55 | if (node.callee.type === "MemberExpression") { 56 | if (node.callee.computed === false) { 57 | calleeNode = node.callee.property; 58 | calleeString = util.getNameOfASTNode(calleeNode); 59 | baseString = util.getNameOfASTNode(node.callee.object); 60 | } else { 61 | calleeNode = node.callee.object; 62 | calleeString = util.getNameOfASTNode(calleeNode); 63 | baseString = ""; 64 | } 65 | 
} else { 66 | calleeNode = node.callee; 67 | calleeString = util.getNameOfASTNode(calleeNode); 68 | baseString = ""; 69 | } 70 | 71 | if (typeof calleeString === "undefined" || typeof baseString === "undefined") return; 72 | 73 | const calleeLocation = fileID + util.getLocationOfASTNode(calleeNode, locationMap); 74 | 75 | const argumentStrings = []; 76 | const argumentLocations = []; 77 | const argumentTypes = []; 78 | for (let i = 0; i < node.arguments.length; i++) { 79 | const argument = node.arguments[i]; 80 | const argumentString = util.getNameOfASTNode(argument); 81 | const argumentLocation = fileID + util.getLocationOfASTNode(argument, locationMap); 82 | const argumentType = util.getTypeOfASTNode(argument); 83 | if (typeof argumentString === "undefined") return; 84 | argumentStrings.push(argumentString.slice(0, maxLengthOfCalleeAndArguments)); 85 | argumentLocations.push(argumentLocation); 86 | argumentTypes.push(argumentType); 87 | } 88 | 89 | const parameters = []; 90 | let foundParameter = false; 91 | for (let i = 0; i < argumentStrings.length; i++) { 92 | let parameter = ""; // use empty parameter name if nothing else known 93 | if (functionToParameters.hasOwnProperty(calleeString)) { 94 | if (i < functionToParameters[calleeString].length) { 95 | parameter = functionToParameters[calleeString][i]; 96 | foundParameter = true; 97 | } 98 | } 99 | parameters.push(parameter); 100 | } 101 | callCounter++; 102 | if (foundParameter) callWithParameterNameCounter++; 103 | 104 | calleeString = calleeString.slice(0, maxLengthOfCalleeAndArguments); 105 | baseString = baseString.slice(0, maxLengthOfCalleeAndArguments); 106 | 107 | let locString = path + " : " + node.loc.start.line + " - " + node.loc.end.line; 108 | if (argumentStrings.length >= minArgs) { 109 | calls.push({ 110 | base:baseString, 111 | callee:calleeString, 112 | calleeLocation:calleeLocation, 113 | arguments:argumentStrings, 114 | argumentLocations:argumentLocations, 115 | argumentTypes:argumentTypes, 116 | parameters:parameters, 117 | src:locString, 118 | filename:path 119 | }); 120 | } 121 | } 122 | }, 123 | leave:function(node, parent) { 124 | if (parent) parentStack.pop(); 125 | } 126 | }); 127 | allCalls.push(...calls); 128 | console.log("Added calls. Total now: " + allCalls.length); 129 | 130 | // console.log("Calls with resolved parameter name: " + callWithParameterNameCounter+" of "+callCounter); 131 | } 132 | 133 | module.exports.visitCode = visitCode; 134 | 135 | })(); 136 | -------------------------------------------------------------------------------- /DeepBugs/javascript/extractorOfCallsMissingArg.js: -------------------------------------------------------------------------------- 1 | // Author: Michael Pradel 2 | 3 | (function() { 4 | 5 | const fs = require("fs"); 6 | const estraverse = require("estraverse"); 7 | const util = require("./jsExtractionUtil"); 8 | 9 | // configuration parameters 10 | const maxLengthOfCalleeAndArguments = 200; // maximum number of characters 11 | 12 | function visitCode(ast, locationMap, path, allCalls, fileID) { 13 | console.log("Reading " + path); 14 | 15 | // first pass through AST: visit each fct. def. 
and extract formal parameter names 16 | const functionToParameters = {}; // string to array of strings 17 | let functionCounter = 0; 18 | estraverse.traverse(ast, { 19 | enter:function(node, parent) { 20 | if (node.type === "FunctionDeclaration" || node.type === "FunctionExpression") { 21 | functionCounter++; 22 | if (node.params.length > 1) { 23 | let functionName = util.getNameOfFunction(node, parent); 24 | if (functionName) { 25 | if (!functionToParameters.hasOwnProperty(functionName)) { 26 | const parameterNames = []; 27 | for (let i = 0; i < node.params.length; i++) { 28 | const parameter = node.params[i]; 29 | parameterNames.push("ID:"+parameter.name); 30 | } 31 | functionToParameters[functionName] = parameterNames; 32 | } // heuristically use only the first declaration in this file 33 | } 34 | } 35 | } 36 | } 37 | }); 38 | // console.log("Functions with parameter names: "+Object.keys(functionToParameters).length+" of "+functionCounter); 39 | 40 | // second pass through AST: visit each call site and extract call data 41 | const calls = []; 42 | const parentStack = []; 43 | let callCounter = 0; 44 | let callWithParameterNameCounter = 0; 45 | estraverse.traverse(ast, { 46 | enter:function(node, parent) { 47 | if (parent) parentStack.push(parent); 48 | if (node && node.type === "CallExpression") { 49 | if (node.arguments.length === 0) return; 50 | 51 | let calleeString; 52 | let baseString; 53 | let calleeNode; 54 | if (node.callee.type === "MemberExpression") { 55 | if (node.callee.computed === false) { 56 | calleeNode = node.callee.property; 57 | calleeString = util.getNameOfASTNode(calleeNode); 58 | baseString = util.getNameOfASTNode(node.callee.object); 59 | } else { 60 | calleeNode = node.callee.object; 61 | calleeString = util.getNameOfASTNode(calleeNode); 62 | baseString = ""; 63 | } 64 | } else { 65 | calleeNode = node.callee; 66 | calleeString = util.getNameOfASTNode(calleeNode); 67 | baseString = ""; 68 | } 69 | 70 | if (typeof calleeString === "undefined" || typeof baseString === "undefined") return; 71 | 72 | const calleeLocation = fileID + util.getLocationOfASTNode(calleeNode, locationMap); 73 | 74 | const argumentStrings = []; 75 | const argumentLocations = []; 76 | const argumentTypes = []; 77 | for (let i = 0; i < node.arguments.length; i++) { 78 | const argument = node.arguments[i]; 79 | const argumentString = util.getNameOfASTNode(argument); 80 | const argumentLocation = fileID + util.getLocationOfASTNode(argument, locationMap); 81 | const argumentType = util.getTypeOfASTNode(argument); 82 | if (typeof argumentString === "undefined") return; 83 | argumentStrings.push(argumentString.slice(0, maxLengthOfCalleeAndArguments)); 84 | argumentLocations.push(argumentLocation); 85 | argumentTypes.push(argumentType); 86 | } 87 | 88 | const parameters = []; 89 | let foundParameter = false; 90 | for (let i = 0; i < argumentStrings.length; i++) { 91 | let parameter = ""; // use empty parameter name if nothing else known 92 | if (functionToParameters.hasOwnProperty(calleeString)) { 93 | if (i < functionToParameters[calleeString].length) { 94 | parameter = functionToParameters[calleeString][i]; 95 | foundParameter = true; 96 | } 97 | } 98 | parameters.push(parameter); 99 | } 100 | callCounter++; 101 | if (foundParameter) callWithParameterNameCounter++; 102 | 103 | calleeString = calleeString.slice(0, maxLengthOfCalleeAndArguments); 104 | baseString = baseString.slice(0, maxLengthOfCalleeAndArguments); 105 | 106 | let locString = path + " : " + node.loc.start.line + " - " + 
node.loc.end.line; 107 | if (argumentStrings.length >= 1) { 108 | calls.push({ 109 | base:baseString, 110 | callee:calleeString, 111 | calleeLocation:calleeLocation, 112 | arguments:argumentStrings, 113 | argumentLocations:argumentLocations, 114 | argumentTypes:argumentTypes, 115 | parameters:parameters, 116 | src:locString, 117 | filename:path 118 | }); 119 | } 120 | } 121 | }, 122 | leave:function(node, parent) { 123 | if (parent) parentStack.pop(); 124 | } 125 | }); 126 | allCalls.push(...calls); 127 | console.log("Added calls. Total now: " + allCalls.length); 128 | 129 | // console.log("Calls with resolved parameter name: " + callWithParameterNameCounter+" of "+callCounter); 130 | } 131 | 132 | module.exports.visitCode = visitCode; 133 | 134 | })(); 135 | -------------------------------------------------------------------------------- /DeepBugs/javascript/extractorOfIdsLitsWithASTFamily.js: -------------------------------------------------------------------------------- 1 | // Author: Michael Pradel 2 | 3 | (function() { 4 | 5 | const fs = require("fs"); 6 | const estraverse = require("estraverse"); 7 | const util = require("./jsExtractionUtil"); 8 | 9 | function getChildren(parent, ignoredChild) { 10 | const children = []; 11 | for (const prop in parent) { 12 | if (parent.hasOwnProperty(prop) && prop !== "regex" && prop !== "loc") { 13 | const child = parent[prop]; 14 | if (Array.isArray(child)) { 15 | for (let i = 0; i < child.length; i++) { 16 | const actualChild = child[i]; 17 | if (actualChild !== ignoredChild && !(child instanceof RegExp) && actualChild !== null) { 18 | children.push(actualChild); 19 | } 20 | } 21 | } else if (typeof child === "object" && child !== null) { 22 | if (child !== ignoredChild && !(child instanceof RegExp)) { 23 | children.push(child); 24 | } 25 | } 26 | } 27 | } 28 | return children; 29 | } 30 | 31 | function getAllChildren(parents, ignoredChild) { 32 | const allChildren = []; 33 | for (let i = 0; i < parents.length; i++) { 34 | const parent = parents[i]; 35 | const newChildren = getChildren(parent); 36 | for (let j = 0; j < newChildren.length; j++) { 37 | const newChild = newChildren[j]; 38 | if (newChild !== ignoredChild) { 39 | allChildren.push(newChild) 40 | } 41 | } 42 | } 43 | return allChildren; 44 | } 45 | 46 | function positionIn(parent, child) { 47 | const position = getChildren(parent).indexOf(child); 48 | if (position === -1) throw "Could not find child in parent: " + JSON.stringify(parent) + " -- "+ JSON.stringify(child); 49 | return position; 50 | } 51 | 52 | function visitCode(ast, locationMap, path, allIdsLits, fileID) { 53 | console.log("Reading " + path); 54 | const ancestors= []; 55 | estraverse.traverse(ast, { 56 | enter:function(node, parent) { 57 | if (node.type === "Identifier" || node.type === "Literal") { 58 | const positionInParent = positionIn(parent, node); 59 | const grandParent = ancestors[ancestors.length - 2]; 60 | const positionInGrandParent = positionIn(grandParent, parent); 61 | const siblings = getChildren(parent, node); 62 | const uncles = getChildren(grandParent, parent); // getUncles(grandParent, parent); 63 | const cousins = getAllChildren(uncles); 64 | const nephews = getAllChildren(siblings); 65 | 66 | const idLit = { 67 | token: util.nodeToString(node), 68 | context: { 69 | parent: util.nodeToString(parent), 70 | positionInParent: positionInParent, 71 | grandParent: util.nodeToString(grandParent), 72 | positionInGrandParent: positionInGrandParent, 73 | siblings: Array.from(new 
Set(siblings.map(util.nodeToString))), 74 | uncles: Array.from(new Set(uncles.map(util.nodeToString))), 75 | cousins: Array.from(new Set(cousins.map(util.nodeToString))), 76 | nephews: Array.from(new Set(nephews.map(util.nodeToString))) 77 | }, 78 | location: fileID + util.getLocationOfASTNode(node, locationMap) 79 | }; 80 | 81 | allIdsLits.push(idLit); 82 | } 83 | 84 | ancestors.push(node); 85 | }, 86 | leave:function(node, parent) { 87 | ancestors.pop(); 88 | } 89 | }); 90 | } 91 | 92 | module.exports.visitCode = visitCode; 93 | 94 | })(); 95 | -------------------------------------------------------------------------------- /DeepBugs/javascript/extractorOfIdsLitsWithIds.js: -------------------------------------------------------------------------------- 1 | // Author: Michael Pradel 2 | 3 | (function() { 4 | 5 | const fs = require("fs"); 6 | const util = require("./jsExtractionUtil"); 7 | 8 | const tokenContextLength = 20; // must be an even number 9 | 10 | function getContext(tokens, idx, targetLength) { 11 | let preContext = []; 12 | let currIdx = idx - 1; 13 | while (currIdx >= 0 && preContext.length < targetLength) { 14 | // go backward in token sequence and add identifiers to preContext 15 | let currToken = tokens[currIdx]; 16 | if (util.isId(currToken)) preContext = [currToken].concat(preContext); 17 | currIdx--; 18 | } 19 | 20 | let postContext = []; 21 | currIdx = idx + 1; 22 | while (currIdx < tokens.length && postContext.length < targetLength) { 23 | // go forward in token sequence and add identifiers to postContext 24 | let currToken = tokens[currIdx]; 25 | if (util.isId(currToken)) postContext.push(currToken); 26 | currIdx++; 27 | } 28 | 29 | return [preContext, postContext]; 30 | } 31 | 32 | function visitFile(path, allIdsLits) { 33 | console.log("Reading " + path); 34 | 35 | const code = fs.readFileSync(path); 36 | const tokens = util.getTokens(code); 37 | const k = tokenContextLength / 2; 38 | if (tokens) { 39 | for (let i = 0; i < tokens.length; i++) { 40 | const token = tokens[i]; 41 | if (util.isIdLit(token)) { 42 | let [preContext, postContext] = getContext(tokens, i, k); 43 | preContext = util.tokensToStrings(preContext); 44 | while (preContext.length !== k) preContext = [""].concat(preContext); 45 | postContext = util.tokensToStrings(postContext); 46 | while (postContext.length !== k) postContext.push(""); 47 | const idLit = { 48 | token: util.tokenToString(token), 49 | context: preContext.concat(postContext) 50 | }; 51 | allIdsLits.push(idLit); 52 | } 53 | } 54 | } else { 55 | console.log("Ignoring file with parse errors: " + path); 56 | } 57 | } 58 | 59 | module.exports.visitFile = visitFile; 60 | 61 | })(); -------------------------------------------------------------------------------- /DeepBugs/javascript/extractorOfIdsLitsWithTokens.js: -------------------------------------------------------------------------------- 1 | // Author: Michael Pradel 2 | 3 | (function() { 4 | 5 | const fs = require("fs"); 6 | const util = require("./jsExtractionUtil"); 7 | 8 | // configuration parameters 9 | const tokenContextLength = 20; // must be an even number 10 | 11 | function visitFile(path, allIdsLits) { 12 | console.log("Reading " + path); 13 | 14 | const code = fs.readFileSync(path); 15 | const tokens = util.getTokens(code); 16 | const k = tokenContextLength / 2; 17 | if (tokens) { 18 | for (let i = 0; i < tokens.length; i++) { 19 | const token = tokens[i]; 20 | if (util.isIdLit(token)) { 21 | let preContext = tokens.slice(Math.max(0, i - k), i); 22 | preContext = 
util.tokensToStrings(preContext); 23 | while (preContext.length !== k) preContext = [""].concat(preContext); 24 | let postContext = tokens.slice(i + 1, i + k); 25 | postContext = util.tokensToStrings(postContext); 26 | while (postContext.length !== k) postContext.push(""); 27 | const idLit = { 28 | token: util.tokenToString(token), 29 | context: preContext.concat(postContext) 30 | }; 31 | allIdsLits.push(idLit); 32 | } 33 | } 34 | } else { 35 | console.log("Ignoring file with parse errors: " + path); 36 | } 37 | } 38 | 39 | module.exports.visitFile = visitFile; 40 | 41 | })(); -------------------------------------------------------------------------------- /DeepBugs/javascript/extractorOfTokens.js: -------------------------------------------------------------------------------- 1 | // Author: Michael Pradel 2 | 3 | (function() { 4 | 5 | const fs = require("fs"); 6 | const util = require("./jsExtractionUtil"); 7 | 8 | function visitFile(path, allTokenSequences) { 9 | console.log("Reading " + path); 10 | 11 | const assignments = []; 12 | const code = fs.readFileSync(path); 13 | const tokens = util.getTokens(code); 14 | if (tokens) { 15 | allTokenSequences.push(util.tokensToStrings(tokens)); 16 | } else { 17 | console.log("Ignoring file with parse errors: " + path); 18 | } 19 | } 20 | 21 | module.exports.visitFile = visitFile; 22 | 23 | })(); -------------------------------------------------------------------------------- /DeepBugs/javascript/jsExtractionUtil.js: -------------------------------------------------------------------------------- 1 | // Author: Michael Pradel 2 | 3 | (function() { 4 | 5 | const acorn = require("acorn"); 6 | 7 | const maxLengthOfTokens = 200; 8 | 9 | function getTokens(code) { 10 | try { 11 | const tokenizer = acorn.tokenizer(code, {locations:true}); 12 | const tokens = []; 13 | let nextToken = tokenizer.getToken(); 14 | while (nextToken.type !== acorn.tokTypes.eof) { 15 | tokens.push(nextToken); 16 | nextToken = tokenizer.getToken(); 17 | } 18 | return tokens; 19 | } catch (e) { 20 | } 21 | } 22 | 23 | function getAST(code, noLocations) { 24 | try { 25 | if (noLocations) return acorn.parse(code); 26 | else return acorn.parse(code, {locations:true}); 27 | } catch (e) { 28 | //console.log(e); 29 | } 30 | } 31 | 32 | function getNameOfASTNode(node) { 33 | if (node.type === "Identifier") return "ID:" + node.name; 34 | else if (node.type === "CallExpression") return getNameOfASTNode(node.callee); 35 | else if (node.type === "MemberExpression" && node.computed === true) return getNameOfASTNode(node.object); 36 | else if (node.type === "MemberExpression" && node.computed === false) return getNameOfASTNode(node.property); 37 | else if (node.type === "Literal") return "LIT:" + String(node.value); 38 | else if (node.type === "ThisExpression") return "LIT:this"; 39 | else if (node.type === "UpdateExpression") return getNameOfASTNode(node.argument); 40 | } 41 | 42 | function getKindOfASTNode(node) { 43 | if (node.type === "Identifier") return "ID"; 44 | else if (node.type === "CallExpression") return getKindOfASTNode(node.callee); 45 | else if (node.type === "MemberExpression" && node.computed === true) return getKindOfASTNode(node.object); 46 | else if (node.type === "MemberExpression" && node.computed === false) return getKindOfASTNode(node.property); 47 | else if (node.type === "Literal") return "LIT"; 48 | else if (node.type === "ThisExpression") return "LIT"; 49 | } 50 | 51 | function getTypeOfASTNode(node) { 52 | if (node.type === "Literal") { 53 | if 
(node.hasOwnProperty("regex")) return "regex"; 54 | else if (node.value === null) return "null"; 55 | else return typeof node.value; 56 | } else if (node.type === "ThisExpression") return "object"; 57 | else if (node.type === "Identifier" && node.name === "undefined") return "undefined"; 58 | else return "unknown"; 59 | } 60 | 61 | function nodeToString(node) { 62 | let result; 63 | if (node.type === "Identifier") { 64 | result = "ID:" + node.name; 65 | } else if (node.type === "Literal") { 66 | result = "LIT:" + node.value; 67 | } else if (Array.isArray(node)) { 68 | result = "Array"; 69 | } else if (typeof node.type === "string") { 70 | result = node.type; 71 | } else { 72 | throw "Unexpected node type: " + JSON.stringify(node); 73 | } 74 | return result.slice(0, maxLengthOfTokens); 75 | } 76 | 77 | const identifierTokenType = "name"; 78 | const literalTokenTypes = ["num", "regexp", "string", "null", "true", "false"]; 79 | 80 | function tokenToString(t) { 81 | let result; 82 | if (t.type.label === identifierTokenType) { 83 | result = "ID:"; 84 | } else if (literalTokenTypes.indexOf(t.type.label) != -1) { 85 | result = "LIT:"; 86 | } else { 87 | result = "STD:"; 88 | } 89 | 90 | if (typeof t.value === "undefined") result += t.type.label; 91 | else if (typeof t.value === "string" || typeof t.value === "number") result += String(t.value); 92 | else if (t.type.label === "regexp") result += String(t.value.value); 93 | else { 94 | console.log("Unexpected token:\n" + JSON.stringify(t, 0, 2)); 95 | } 96 | return result.slice(0, maxLengthOfTokens); 97 | } 98 | 99 | function tokensToStrings(tokens) { 100 | return tokens.map(tokenToString); 101 | } 102 | 103 | function isIdLit(token) { 104 | return isId(token) || isLit(token) 105 | } 106 | 107 | function isId(token) { 108 | return token.type.label === "name"; 109 | } 110 | 111 | function isLit(token) { 112 | return token.type.label === "num" || token.type.label === "regexp" || token.type.label === "string" 113 | } 114 | 115 | function computeLocationMap(tokens) { 116 | // maps line-column-based location to character-based location 117 | const lcLocationToCharLocation = {}; 118 | for (let i = 0; i < tokens.length; i++) { 119 | const t = tokens[i]; 120 | const lcStartLocation = t.loc.start.line + ":" + t.loc.start.column; 121 | const lcEndLocation = t.loc.end.line + ":" + t.loc.end.column; 122 | lcLocationToCharLocation[lcStartLocation] = t.start; 123 | lcLocationToCharLocation[lcEndLocation] = t.end; 124 | } 125 | return lcLocationToCharLocation; 126 | } 127 | 128 | function getLocationOfASTNode(node, lcLocationToCharLocation) { 129 | const lcStartLocation = node.loc.start.line + ":" + node.loc.start.column; 130 | const lcEndLocation = node.loc.end.line + ":" + node.loc.end.column; 131 | const start = lcLocationToCharLocation[lcStartLocation]; 132 | const end = lcLocationToCharLocation[lcEndLocation]; 133 | const diff = end-start; 134 | return nbToPaddedStr(start, 6) + nbToPaddedStr(diff, 4); 135 | } 136 | 137 | function nbToPaddedStr(nb, length) { 138 | let str = String(nb); 139 | while (str.length < length) { 140 | str = "0" + str; 141 | } 142 | return str; 143 | } 144 | 145 | function getNameOfFunction(functionNode, parentNode) { 146 | if (functionNode.id && functionNode.id.name) return "ID:"+functionNode.id.name; 147 | if (parentNode.type === "AssignmentExpression") { 148 | if (parentNode.left.type === "Identifier") return "ID:"+parentNode.left.name; 149 | if (parentNode.left.type === "MemberExpression" && 150 | parentNode.left.property.type === 
"Identifier") return "ID:"+parentNode.left.property.name; 151 | } 152 | if (parentNode.type === "VariableDeclarator") { 153 | if (parentNode.id.type === "Identifier") return "ID:"+parentNode.id.name; 154 | } 155 | if (parentNode.type === "Property") { 156 | if (parentNode.key.type === "Identifier") return "ID:"+parentNode.key.name; 157 | } 158 | } 159 | 160 | module.exports.getTokens = getTokens; 161 | module.exports.getAST = getAST; 162 | module.exports.getNameOfASTNode = getNameOfASTNode; 163 | module.exports.getKindOfASTNode = getKindOfASTNode; 164 | module.exports.getTypeOfASTNode = getTypeOfASTNode; 165 | module.exports.nodeToString = nodeToString; 166 | module.exports.tokenToString = tokenToString; 167 | module.exports.tokensToStrings = tokensToStrings; 168 | module.exports.isId = isId; 169 | module.exports.isLit = isLit; 170 | module.exports.isIdLit = isIdLit; 171 | module.exports.nbToPaddedStr = nbToPaddedStr; 172 | module.exports.computeLocationMap = computeLocationMap; 173 | module.exports.getLocationOfASTNode = getLocationOfASTNode; 174 | module.exports.getNameOfFunction = getNameOfFunction; 175 | 176 | })(); 177 | -------------------------------------------------------------------------------- /DeepBugs/javascript/modifyArgumentOrder.js: -------------------------------------------------------------------------------- 1 | var fs = require("fs"); 2 | var esprima = require("esprima"); 3 | var estraverse = require("estraverse"); 4 | var escodegen = require("escodegen"); 5 | 6 | var rawJSFilesDir = "../data/js/programs_50/"; 7 | var formattedJSFilesDir = "../data/js/shuffled_arguments/orig/"; 8 | var modifiedJSFilesDir = "../data/js/shuffled_arguments/shuffled/"; 9 | 10 | function shuffle(a) { 11 | var j, x, i; 12 | for (i = a.length; i; i--) { 13 | j = Math.floor(Math.random() * i); 14 | x = a[i - 1]; 15 | a[i - 1] = a[j]; 16 | a[j] = x; 17 | } 18 | } 19 | 20 | function transformAST(ast) { 21 | estraverse.traverse(ast, { 22 | enter:function(node, parent) { 23 | if (node.type === "CallExpression") { 24 | shuffle(node.arguments); 25 | } 26 | } 27 | }); 28 | } 29 | 30 | var files = fs.readdirSync(rawJSFilesDir); 31 | for (var i = 0; i < files.length; i++) { 32 | var file = files[i]; 33 | if (file.endsWith(".js")) { 34 | var code = fs.readFileSync(rawJSFilesDir + "/" + file, {encoding:"utf8"}); 35 | var ast = esprima.parse(code); 36 | var formattedCode = escodegen.generate(ast); 37 | fs.writeFileSync(formattedJSFilesDir + file, formattedCode); 38 | transformAST(ast); 39 | var modifiedCode = escodegen.generate(ast); 40 | fs.writeFileSync(modifiedJSFilesDir + file, modifiedCode); 41 | } 42 | } -------------------------------------------------------------------------------- /DeepBugs/javascript/rb-nodeify.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Convert js files to something compatible with node 4.* 4 | 5 | rm -rf .*.js 6 | 7 | for file in *.js; do 8 | echo "Converting file ${file}..." 9 | new_file=".${file}" 10 | # Need to use strict for let bindings inside for loop etc. 11 | sed "0,/\(^[^/]\+\)/ s/\(^[^/]\+\)/\"use strict\";\n\n\1/" $file > $new_file 12 | # const {spawn} = req... 
is not allowed 13 | sed -i "s/const {spawn} = require('child_process')/const spawn = require('child_process').spawn/" "$new_file" 14 | # Replace all local requires 15 | sed -i "s/require(\".\//require(\".\/./" $new_file 16 | # Spread operator not allowed 17 | sed -i "s/\.\.\.\(.*\));/Object.assign({}, \1));/" $new_file 18 | 19 | done 20 | -------------------------------------------------------------------------------- /DeepBugs/javascript/seedBugs.js: -------------------------------------------------------------------------------- 1 | // Author: Michael Pradel 2 | 3 | (function() { 4 | 5 | var fs = require("fs"); 6 | var esprima = require("esprima"); 7 | var estraverse = require("estraverse"); 8 | var escodegen = require("escodegen"); 9 | var clone = require("clone"); 10 | 11 | var rawJSFilesDir = "../data/js/programs_50/"; 12 | var modifiedJSFilesDir = "../data/js/buggy_fcts"; 13 | 14 | var maxBugs = 100; 15 | 16 | function randElem(arr) { 17 | if (!arr || arr.length === 0) return undefined; 18 | return arr[Math.floor(Math.random() * (arr.length))]; 19 | } 20 | 21 | function randNb(maxInclusive) { 22 | return Math.floor(Math.random() * (maxInclusive + 1)); 23 | } 24 | 25 | function splitIntoFcts(ast) { 26 | var fcts = []; 27 | estraverse.traverse(ast, { 28 | enter:function(node, parent) { 29 | if (node.type === "FunctionDeclaration") { 30 | fcts.push(clone(node)); 31 | } 32 | } 33 | }); 34 | return fcts; 35 | } 36 | 37 | var expressionTypes = [ 38 | "ThisExpression", 39 | "ArrayExpression", 40 | "ObjectExpression", 41 | "FunctionExpression", 42 | "ArrowExpression", 43 | "SequenceExpression", 44 | "UnaryExpression", 45 | "BinaeyExpression", 46 | "AssignmentExpression", 47 | "UpdateExpression", 48 | "LogicalExpression", 49 | "ConditionalExpression", 50 | "NewExpression", 51 | "CallExpression", 52 | "MemberExpression", 53 | "ComprehensionExpression" 54 | ]; 55 | 56 | function modifyFunctionArgument(origAST) { 57 | // TODO: Use expressions from other programs? Otherwise, it always occurs twice. 
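// Seeds an "incorrect function argument" bug: working on a clone of the given
// function AST, it collects every expression node (per expressionTypes above)
// and every call expression, then picks a random call and overwrites one of its
// arguments (or appends one if the call has no arguments) with a clone of a
// randomly chosen other expression from the same function. Returns the mutated
// AST, or undefined when there is no call or too few expressions to reuse.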
58 | var ast = clone(origAST); 59 | var expressions = []; 60 | var callExpressions = []; 61 | estraverse.traverse(ast, { 62 | enter:function(node, parent) { 63 | if (expressionTypes.indexOf(node.type) != -1) { 64 | expressions.push(node); 65 | } 66 | if (node.type === "CallExpression") { 67 | callExpressions.push(node); 68 | } 69 | } 70 | }); 71 | if (callExpressions.length > 0 && expressions.length > 2) { 72 | var callExpression = randElem(callExpressions); 73 | var replacementExpression = undefined; 74 | while (!replacementExpression) { 75 | replacementExpression = randElem(expressions); 76 | if (replacementExpression === callExpression) replacementExpression = undefined; 77 | } 78 | replacementExpression = clone(replacementExpression); 79 | 80 | var args = callExpression.arguments; 81 | if (args.length === 0) { 82 | args.push(replacementExpression); 83 | } else { 84 | var idxToReplace = randNb(args.length - 1); 85 | args[idxToReplace] = replacementExpression; 86 | } 87 | return ast; 88 | } 89 | } 90 | 91 | var conditionalStmtTypes = [ 92 | "IfStatement", 93 | "WhileStatement", 94 | "DoWhileStatement", 95 | "ForStatement" 96 | ]; 97 | 98 | function modifyConditional(origAST) { 99 | var ast = clone(origAST); 100 | var conditionalStmts = []; 101 | var expressions = []; 102 | estraverse.traverse(ast, { 103 | enter:function(node, parent) { 104 | if (conditionalStmtTypes.indexOf(node.type) != -1) { 105 | conditionalStmts.push(node); 106 | } 107 | if (expressionTypes.indexOf(node.type) != -1) { 108 | expressions.push(node); 109 | } 110 | } 111 | }); 112 | 113 | if (conditionalStmts.length > 0) { 114 | var condStmt = randElem(conditionalStmts); 115 | var expr = condStmt.test; 116 | if (expr.type == "LogicalExpression") { 117 | condStmt.test = expr.left; 118 | return ast; 119 | } else { 120 | if (expressions.length > 0) { 121 | var replacementExpr = randElem(expressions); 122 | condStmt.test = replacementExpr; 123 | return ast; 124 | } 125 | } 126 | } 127 | } 128 | 129 | var transformerFcts = [modifyFunctionArgument, modifyConditional]; 130 | 131 | var files = fs.readdirSync(rawJSFilesDir); 132 | var origFcts = []; 133 | for (var i = 0; i < files.length; i++) { 134 | var file = files[i]; 135 | if (file.endsWith(".js")) { 136 | var content = fs.readFileSync(rawJSFilesDir + "/" + file, {encoding:"utf8"}); 137 | var origAST = esprima.parse(content); 138 | var fcts = splitIntoFcts(origAST); 139 | for (var j = 0; j < fcts.length; j++) { 140 | var f = fcts[j]; 141 | origFcts.push(f); 142 | } 143 | } 144 | } 145 | 146 | console.log("Functions: " + origFcts.length); 147 | 148 | var astPairs = []; 149 | for (var i = 0; i < origFcts.length; i++) { 150 | var origFct = origFcts[i]; 151 | var transformer = randElem(transformerFcts); 152 | var modifiedFct = transformer(origFct); 153 | if (modifiedFct) { 154 | astPairs.push([origFct, modifiedFct]); 155 | } 156 | } 157 | 158 | var fileCtr = 0; 159 | for (var i = 0; i < astPairs.length && fileCtr < maxBugs; i++) { 160 | fileCtr += 1; 161 | var astPair = astPairs[i]; 162 | var origCode = escodegen.generate(astPair[0]); 163 | fs.writeFileSync(modifiedJSFilesDir + "/orig/fct" + fileCtr + ".js", origCode); 164 | var modifiedCode = escodegen.generate(astPair[1]); 165 | fs.writeFileSync(modifiedJSFilesDir + "/buggy/fct" + fileCtr + ".js", modifiedCode); 166 | } 167 | 168 | console.log("Pairs of functions: " + fileCtr); 169 | 170 | 171 | })(); -------------------------------------------------------------------------------- /DeepBugs/javascript/tokenize.js: 
-------------------------------------------------------------------------------- 1 | var esprima = require("esprima"); 2 | var fs = require("fs"); 3 | 4 | var jsFile = process.argv[2] 5 | var tokenFile = process.argv[3] 6 | 7 | var js = fs.readFileSync(jsFile, {encoding: "utf8"}); 8 | var tokens = esprima.tokenize(js); 9 | fs.writeFileSync(tokenFile, JSON.stringify(tokens, 0, 2)); -------------------------------------------------------------------------------- /DeepBugs/python/ASTEmbeddingLearner.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 20, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | import json 8 | import math 9 | from os import getcwd 10 | from os.path import join 11 | import sys 12 | import time 13 | 14 | from keras.layers.core import Dense 15 | from keras.models import Model 16 | from keras.models import Sequential 17 | from keras import backend as K 18 | 19 | import numpy as np 20 | import random 21 | 22 | kept_main_tokens = 10000 23 | kept_context_tokens = 1000 24 | max_context_tokens_per_category = 10 25 | 26 | embedding_size = 200 27 | batch_size = 50 28 | nb_epochs = 2 29 | sampling_rate = 1 30 | 31 | def count_samples(data_paths): 32 | total_examples = 0 33 | for path in data_paths: 34 | encoded_tokens_with_context = np.load(path) 35 | total_examples += len(encoded_tokens_with_context) 36 | return total_examples 37 | 38 | def xy_pair_generator(data_paths, expected_x_length, expected_y_length): 39 | while True: 40 | for path in data_paths: 41 | encoded_tokens_with_context = np.load(path) 42 | for token_with_context in encoded_tokens_with_context: 43 | sample = random.random() < sampling_rate 44 | if sample: 45 | # given encoding: 46 | # - first element = number of main token 47 | # - second element = number of parent token 48 | # - third element = position in parent 49 | # - fourth element = number of grand parent token 50 | # - fifth element = position in grand parent 51 | # - next max_context_tokens_per_category elements = numbers of sibling tokens 52 | # - next max_context_tokens_per_category elements = numbers of uncle tokens 53 | # - next max_context_tokens_per_category elements = numbers of cousin tokens 54 | # - next max_context_tokens_per_category elements = numbers of nephew tokens 55 | # representation to produce: 56 | # - main token: one-hot vector 57 | # - context vector: concatenation of subvectors: 58 | # - parent subvector: one-hot vector 59 | # - position in parent subvector: single number 60 | # - grand parent subvector: one-hot vector 61 | # - position in grand parent subvector: single number 62 | # - four subvectors for siblings, uncles, cousins, and nephews: each is a k-hot vector 63 | x = np.zeros(kept_main_tokens + 1) 64 | x[token_with_context[0]] = 1 65 | assert len(x) == expected_x_length, str(len(x)) + " is not " + str(expected_x_length) 66 | 67 | y_length = 6 * (kept_context_tokens + 1) + 2 68 | y = np.zeros(y_length) 69 | for idx in [1,3]: # do two times the same: for parent and grand parent 70 | hot_element = token_with_context[idx] 71 | position_in_parent = token_with_context[idx + 1] 72 | offset = (kept_context_tokens + 1) + 1 if idx == 3 else 0 73 | y[offset + hot_element] = 1 74 | y[offset + kept_context_tokens + 1] = position_in_parent 75 | for kind_nb in range(0,4): # do four times the same: for siblings, uncles, cousins, and nephews 76 | offset = (2 * (kept_context_tokens + 1)) + 2 77 | for hot_element in token_with_context[5 + (max_context_tokens_per_category * 
kind_nb):5 + (max_context_tokens_per_category * (kind_nb + 1))]: 78 | if hot_element > -1: 79 | y[offset + hot_element] = 1 80 | 81 | assert len(y) == expected_y_length, len(y) 82 | 83 | yield (x, y) 84 | assert False, "Should never reach this line" 85 | 86 | def batch_generator(xy_pair_generator): 87 | xs = [] 88 | ys = [] 89 | for x, y in xy_pair_generator: 90 | xs.append(x) 91 | ys.append(y) 92 | if len(xs) is batch_size: 93 | batch = (np.asarray(xs), np.asarray(ys)) 94 | yield batch 95 | xs = [] 96 | ys = [] 97 | 98 | if __name__ == '__main__': 99 | # arguments: 100 | 101 | token_to_nb_file = sys.argv[1] 102 | data_paths = list(map(lambda f: join(getcwd(), f), sys.argv[2:])) 103 | if len(data_paths) is 0: 104 | print("Must pass token_to_nb files and at least one data file") 105 | sys.exit(1) 106 | 107 | x_length = kept_main_tokens + 1 108 | y_length = 6 * (kept_context_tokens + 1) + 2 109 | total_examples = count_samples(data_paths) 110 | total_samples = total_examples * sampling_rate 111 | 112 | print("Total samples: " + str(total_examples)) 113 | print("Will sample about " + str(total_samples)) 114 | 115 | model = Sequential() 116 | model.add(Dense(embedding_size, input_shape=(x_length,), name="hidden")) 117 | model.add(Dense(y_length, activation="sigmoid")) 118 | 119 | # using sigmoid for last layer + binary crossentropy because commonly used for multi-label, multi-class classification 120 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 121 | 122 | total_samples_per_epoch = total_samples / batch_size 123 | validation_samples_per_epoch = total_samples_per_epoch * 0.2 124 | 125 | generator = batch_generator(xy_pair_generator(data_paths, x_length, y_length)) 126 | model.fit_generator(generator=generator, steps_per_epoch=total_samples_per_epoch, epochs=nb_epochs, validation_steps=validation_samples_per_epoch) 127 | 128 | # store the model 129 | time_stamp = math.floor(time.time() * 1000) 130 | model.save("embedding_model_" + str(time_stamp)) 131 | 132 | # after training the model, write token-to-vector map (= learned embedding) to file 133 | with open(token_to_nb_file, "r") as file: 134 | token_to_nb = json.load(file) 135 | intermediate_layer_model = Model(inputs=model.input, outputs=model.get_layer("hidden").output) 136 | token_to_vector = dict() 137 | for token, nb in token_to_nb.items(): 138 | x = [0] * (kept_main_tokens + 1) 139 | x[nb] = 1 140 | intermediate_output = intermediate_layer_model.predict(np.asarray([x])) 141 | vector = intermediate_output[0].tolist() 142 | token_to_vector[token] = vector 143 | token_to_vector_file_name = "token_to_vector_" + str(time_stamp) + ".json" 144 | with open(token_to_vector_file_name, "w") as file: 145 | json.dump(token_to_vector, file, sort_keys=True, indent=4) 146 | 147 | # show prediction for a few randomly selected examples 148 | # ctr = 0 149 | # for (x,y) in xy_pair_generator(data_paths, x_length, y_length): 150 | # print("X : " + str(x)) 151 | # print("Y : " + str(y)) 152 | # y_predicted = model.predict(x) 153 | # print("Y_predicted: " + str(y_predicted)) 154 | # 155 | # ctr += 1 156 | # if ctr > 10: 157 | # break 158 | 159 | 160 | 161 | 162 | 163 | -------------------------------------------------------------------------------- /DeepBugs/python/AccuracyMetricTest.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 17, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | 8 | from keras import backend as K 9 | import numpy as np 10 | 11 | 
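# Small sanity check for a weighted loss and accuracy over context-token
# targets: the target vector has nb_tokens_in_context * kept_context_tokens
# entries, and entries equal to 1 are weighted by weight_of_ones so the few
# positive labels are not drowned out by the many zeros.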
nb_tokens_in_context = 2 12 | kept_context_tokens = 5 13 | weight_of_ones = kept_context_tokens 14 | 15 | def weighted_loss(y_true, y_pred): 16 | weights = (y_true * (weight_of_ones - 1) + 1) 17 | y_pred = K.variable(y_pred) ## required only for debugging (if arguments come from backend, no need to convert them) 18 | clipped_y_pred = K.clip(y_pred, K.epsilon(), None) 19 | weighted_cross_entropy = -(y_true * K.log(clipped_y_pred) * weights) 20 | result = K.mean(weighted_cross_entropy) 21 | assert not np.isnan(K.eval(result)) 22 | return result 23 | 24 | def weighted_accuracy(y_true, y_pred): 25 | weights = (y_true * (weight_of_ones - 1) + 1) 26 | equal = K.cast(K.equal(y_true, K.round(y_pred)), K.floatx()) 27 | debug = K.eval(equal) 28 | weighted_equal = equal * weights 29 | return K.mean(weighted_equal) 30 | 31 | if __name__ == '__main__': 32 | y_true = np.zeros(nb_tokens_in_context * kept_context_tokens) 33 | y_true[2] = 1 34 | y_true[7] = 1 35 | y_pred = np.ones(nb_tokens_in_context * kept_context_tokens) 36 | # y_pred[2] = 0.1 37 | # y_pred[7] = 0.99 38 | 39 | print("Accuracy: " + str(K.eval(weighted_accuracy(y_true, y_pred)))) 40 | print("Loss: " + str(K.eval(weighted_loss(y_true, y_pred)))) -------------------------------------------------------------------------------- /DeepBugs/python/BinOpContextToEmbedding.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 31, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | import json 8 | import math 9 | import sys 10 | import time 11 | 12 | import random 13 | 14 | import Util 15 | 16 | node_type_embedding_size = 8 # if changing here, then also change in LearningDataBinOperator 17 | 18 | def create_random_embedding(size, used_embeddings): 19 | while True: 20 | embedding = [] 21 | for _ in range(0, size): 22 | random_bit = round(random.random()) 23 | embedding.append(random_bit) 24 | if not (str(embedding) in used_embeddings): 25 | used_embeddings.add(str(embedding)) 26 | return embedding 27 | 28 | if __name__ == '__main__': 29 | # arguments: 30 | 31 | data_paths = sys.argv[1:] 32 | node_type_to_vector = dict() 33 | node_type_embeddings = set() 34 | for bin_op in Util.DataReader(data_paths): 35 | node_types = [bin_op["parent"], bin_op["grandParent"]] 36 | for node_type in node_types: 37 | if not (node_type in node_type_to_vector): 38 | type_embedding = create_random_embedding(node_type_embedding_size, node_type_embeddings) 39 | node_type_to_vector[node_type] = type_embedding 40 | 41 | time_stamp = math.floor(time.time() * 1000) 42 | node_type_to_vector_file = "node_type_to_vector_" + str(time_stamp) + ".json" 43 | with open(node_type_to_vector_file, "w") as file: 44 | json.dump(node_type_to_vector, file, sort_keys=True, indent=4) 45 | 46 | 47 | -------------------------------------------------------------------------------- /DeepBugs/python/CallContextToEmbedding.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 31, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | import json 8 | import math 9 | import sys 10 | import time 11 | 12 | import random 13 | 14 | import Util 15 | 16 | # if changing the following, also change in AnomalyDetector 17 | filename_embedding_size = 50 18 | type_embedding_size = 5 19 | 20 | def create_random_embedding(size, used_embeddings): 21 | while True: 22 | embedding = [] 23 | for _ in range(0, size): 24 | random_bit = round(random.random()) 25 | embedding.append(random_bit) 26 | if not 
(str(embedding) in used_embeddings): 27 | used_embeddings.add(str(embedding)) 28 | return embedding 29 | 30 | if __name__ == '__main__': 31 | # arguments: 32 | 33 | call_data_paths = sys.argv[1:] 34 | filename_to_vector = dict() 35 | type_to_vector = dict() 36 | filename_embeddings = set() 37 | type_embeddings = set() 38 | for call in Util.DataReader(call_data_paths): 39 | filename = call["filename"] 40 | if not (filename in filename_to_vector): 41 | filename_embedding = create_random_embedding(filename_embedding_size, filename_embeddings) 42 | filename_to_vector[filename] = filename_embedding 43 | argument_types = call["argumentTypes"] 44 | for argument_type in argument_types: 45 | if not (argument_type in type_to_vector): 46 | type_embedding = create_random_embedding(type_embedding_size, type_embeddings) 47 | type_to_vector[argument_type] = type_embedding 48 | 49 | time_stamp = math.floor(time.time() * 1000) 50 | filename_to_vector_file = "filename_to_vector_" + str(time_stamp) + ".json" 51 | with open(filename_to_vector_file, "w") as file: 52 | json.dump(filename_to_vector, file, sort_keys=True, indent=4) 53 | type_to_vector_file = "type_to_vector_" + str(time_stamp) + ".json" 54 | with open(type_to_vector_file, "w") as file: 55 | json.dump(type_to_vector, file, sort_keys=True, indent=4) 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /DeepBugs/python/CallPerCalleeCounter.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Nov 7, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | import json 8 | import math 9 | import sys 10 | import time 11 | 12 | import random 13 | 14 | import Util 15 | from collections import Counter 16 | 17 | if __name__ == '__main__': 18 | # arguments: 19 | 20 | call_data_paths = sys.argv[1:] 21 | callee_to_calls = Counter(); 22 | for call in Util.DataReader(call_data_paths): 23 | callee = call["callee"] 24 | callee_to_calls[callee] += 1 25 | 26 | time_stamp = math.floor(time.time() * 1000) 27 | callee_to_calls_file = "callee_to_calls_" + str(time_stamp) + ".json" 28 | with open(callee_to_calls_file, "w") as file: 29 | json.dump(callee_to_calls, file, sort_keys=True, indent=4) 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /DeepBugs/python/CallPerFileCounter.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Nov 3, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | import json 8 | import math 9 | import sys 10 | import time 11 | 12 | import random 13 | 14 | import Util 15 | from collections import Counter 16 | 17 | if __name__ == '__main__': 18 | # arguments: 19 | 20 | call_data_paths = sys.argv[1:] 21 | file_name_to_calls = Counter(); 22 | for call in Util.DataReader(call_data_paths): 23 | file_name = call["filename"] 24 | file_name_to_calls[file_name] += 1 25 | 26 | time_stamp = math.floor(time.time() * 1000) 27 | file_name_to_calls_file = "file_name_to_calls_" + str(time_stamp) + ".json" 28 | with open(file_name_to_calls_file, "w") as file: 29 | json.dump(file_name_to_calls, file, sort_keys=True, indent=4) 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /DeepBugs/python/EmbeddingEvaluator.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 4, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | import sys 8 | import json 9 | from 
os.path import join 10 | from os import getcwd 11 | from sklearn.decomposition.incremental_pca import IncrementalPCA 12 | from matplotlib import pyplot 13 | import re 14 | import random 15 | from scipy.spatial.kdtree import KDTree 16 | import numpy as np 17 | 18 | sampling_rate_for_PCA = 0.01 19 | 20 | if __name__ == '__main__': 21 | # arguments: 22 | name_to_vector_file = join(getcwd(), sys.argv[1]) 23 | with open(name_to_vector_file) as f: 24 | name_to_vector = json.load(f) 25 | 26 | names = [] 27 | vectors = [] 28 | for name, vector in name_to_vector.items(): 29 | names.append(name) 30 | vectors.append(vector) 31 | 32 | # perform q few similarity queries 33 | queries = [ "ID:i", "ID:name", "ID:jQuery", "ID:counter", "ID:element", "LIT:true", "ID:msg", "ID:length"] # for AST-based 34 | kd_tree = KDTree(np.array(vectors)) 35 | for query in queries: 36 | if query in name_to_vector: 37 | print(query + " has similar names:") 38 | query_vector = name_to_vector[query] 39 | _, neighbor_idxs = kd_tree.query(query_vector, k=6) 40 | closest_names = [] 41 | for idx in neighbor_idxs: 42 | close_name = names[idx] 43 | if close_name != query: 44 | print(" " + close_name) 45 | 46 | # show PCA 47 | pca_vectors = [] 48 | pca_labels = [] 49 | for idx, name in enumerate(names): 50 | if random.random() < sampling_rate_for_PCA: 51 | pca_labels.append(name) 52 | pca_vectors.append(vectors[idx]) 53 | 54 | ipca = IncrementalPCA(n_components=2) 55 | reduced_vectors = ipca.fit_transform(pca_vectors) 56 | 57 | fig, ax = pyplot.subplots() 58 | x = reduced_vectors[:, 0] 59 | y = reduced_vectors[:, 1] 60 | ax.scatter(x, y) 61 | for idx, label in enumerate(pca_labels): 62 | escaped_label = re.escape(label) 63 | ax.annotate(escaped_label, (x[idx], y[idx])) 64 | 65 | pyplot.show() 66 | 67 | 68 | -------------------------------------------------------------------------------- /DeepBugs/python/EmbeddingEvaluatorWord2Vec.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Mar 20, 2018 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | import sys 8 | from gensim.models import Word2Vec 9 | from sklearn.decomposition.incremental_pca import IncrementalPCA 10 | from matplotlib import pyplot 11 | import re 12 | 13 | if __name__ == '__main__': 14 | # arguments: embedding_model_file 15 | model = Word2Vec.load(sys.argv[1]) 16 | 17 | queries = [ "ID:i", "ID:name", "ID:jQuery", "ID:counter", "ID:element", "LIT:true", "ID:msg", "ID:length", "ID:nextSibling", "ID:toLowerCase", "ID:wrapper", "ID:width", "ID:getWidth"] 18 | 19 | for query in queries: 20 | results = model.wv.most_similar(positive=[query]) 21 | print("\\begin{tabular}{rl}") 22 | print(" \\toprule") 23 | print(" \\multicolumn{2}{c}{\\emph{\\textbf{"+query+"}}} \\\\") 24 | print(" \\midrule") 25 | print(" Simil. 
& Identifier \\\\") 26 | print(" \\midrule") 27 | for (other_id, simil) in results: 28 | escaped = other_id.replace("_", "\\_") 29 | print(" "+str(round(simil, 2))+" & "+escaped+" \\\\") 30 | print(" \\bottomrule") 31 | print("\end{tabular}") 32 | print() 33 | 34 | 35 | # show PCA 36 | pca_queries = [ "ID:wrapper", "ID:container", "ID:msg", "ID:alert", "ID:list", "ID:seq", "ID:lst", "ID:list", "LIT:error" ] 37 | pca_vectors = [] 38 | pca_labels = [] 39 | for _, name in enumerate(pca_queries): 40 | if name.startswith("LIT:"): 41 | print_name = "\"" + name.replace("LIT:", "") + "\"" # assumes string literals only 42 | else: 43 | print_name = name.replace("ID:", "") 44 | pca_labels.append(print_name) 45 | pca_vectors.append(model.wv[name]) 46 | 47 | ipca = IncrementalPCA(n_components=2) 48 | reduced_vectors = ipca.fit_transform(pca_vectors) 49 | 50 | fig, ax = pyplot.subplots() 51 | x = reduced_vectors[:, 0] 52 | y = reduced_vectors[:, 1] 53 | ax.scatter(x, y) 54 | for idx, label in enumerate(pca_labels): 55 | #escaped_label = re.escape(label) 56 | ax.annotate(label, (x[idx], y[idx])) 57 | 58 | pyplot.show() 59 | 60 | -------------------------------------------------------------------------------- /DeepBugs/python/EmbeddingLearner.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 3, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | import json 8 | import math 9 | from os import getcwd 10 | from os.path import join 11 | import sys 12 | import time 13 | 14 | from keras.layers.core import Dense 15 | from keras.models import Model 16 | from keras.models import Sequential 17 | 18 | import numpy as np 19 | import random 20 | 21 | nb_tokens_in_context = 20 22 | kept_main_tokens = 10000 23 | kept_context_tokens = 1000 24 | 25 | embedding_size = 200 26 | batch_size = 50 27 | nb_epochs = 2 28 | sampling_rate = 1 29 | 30 | def count_samples(data_paths): 31 | total_examples = 0 32 | for path in data_paths: 33 | encoded_tokens_with_context = np.load(path) 34 | total_examples += len(encoded_tokens_with_context) 35 | return total_examples 36 | 37 | def xy_pair_generator(data_paths, expected_x_length, expected_y_length): 38 | while True: 39 | for path in data_paths: 40 | encoded_tokens_with_context = np.load(path) 41 | for token_with_context in encoded_tokens_with_context: 42 | sample = random.random() < sampling_rate 43 | if sample: 44 | # encode token and context as one-hot vectors 45 | # first element of token_with_context = number of main token 46 | x = np.zeros(kept_main_tokens + 1) 47 | x[token_with_context[0]] = 1 48 | assert len(x) == expected_x_length, str(len(x)) + " is not " + str(expected_x_length) 49 | 50 | y = np.zeros(nb_tokens_in_context * (kept_context_tokens + 1)) 51 | for idx, nb_of_context_token in enumerate(token_with_context[1:]): # 2nd, 3rd, etc. 
element of token_with_context = numbers of context tokens 52 | offset = idx * (kept_context_tokens + 1) 53 | y[offset + nb_of_context_token] = 1 54 | assert len(y) == expected_y_length, len(y) 55 | 56 | yield (x, y) 57 | assert False, "Should never reach this line" 58 | 59 | def batch_generator(xy_pair_generator): 60 | xs = [] 61 | ys = [] 62 | for x, y in xy_pair_generator: 63 | xs.append(x) 64 | ys.append(y) 65 | if len(xs) is batch_size: 66 | batch = (np.asarray(xs), np.asarray(ys)) 67 | yield batch 68 | xs = [] 69 | ys = [] 70 | 71 | # custom loss and accuracy to account for unbalanced y vectors: 72 | # 73 | # weight_of_ones = kept_context_tokens 74 | # 75 | # def weighted_loss(y_true, y_pred): 76 | # weights = y_true * weight_of_ones 77 | # clipped_y_pred = K.clip(y_pred, K.epsilon(), None) 78 | # weighted_cross_entropy = -(y_true * K.log(clipped_y_pred) * weights) 79 | # result = K.mean(weighted_cross_entropy) 80 | # return result 81 | # 82 | # def weighted_accuracy(y_true, y_pred): 83 | # weights = y_true * weight_of_ones 84 | # weighted_equal = K.cast(K.equal(y_true, K.round(y_pred)), K.floatx()) * weights 85 | # return K.mean(weighted_equal) 86 | 87 | if __name__ == '__main__': 88 | # arguments: 89 | 90 | token_to_nb_file = sys.argv[1] 91 | data_paths = list(map(lambda f: join(getcwd(), f), sys.argv[2:])) 92 | if len(data_paths) is 0: 93 | print("Must pass token_to_nb files and at least one data file") 94 | sys.exit(1) 95 | x_length = kept_main_tokens + 1 96 | y_length = nb_tokens_in_context * (kept_context_tokens + 1) 97 | total_examples = count_samples(data_paths) 98 | total_samples = total_examples * sampling_rate 99 | 100 | print("Total samples: " + str(total_examples)) 101 | print("Will sample about " + str(total_samples)) 102 | 103 | model = Sequential() 104 | model.add(Dense(200, input_shape=(x_length,), name="hidden")) 105 | model.add(Dense(y_length, activation="sigmoid")) 106 | 107 | # using sigmoid for last layer + binary crossentropy because commonly used for multi-label, multi-class classification 108 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 109 | # model.compile(loss=weighted_loss, optimizer='adam', metrics=[weighted_accuracy]) 110 | 111 | total_samples_per_epoch = total_samples / batch_size 112 | validation_samples_per_epoch = total_samples_per_epoch * 0.2 113 | 114 | generator = batch_generator(xy_pair_generator(data_paths, x_length, y_length)) 115 | model.fit_generator(generator=generator, steps_per_epoch=total_samples_per_epoch, epochs=nb_epochs, validation_steps=validation_samples_per_epoch) 116 | 117 | # store the model 118 | time_stamp = math.floor(time.time() * 1000) 119 | model.save("embedding_model_" + str(time_stamp)) 120 | 121 | # after training the model, write token-to-vector map (= learned embedding) to file 122 | with open(token_to_nb_file, "r") as file: 123 | token_to_nb = json.load(file) 124 | intermediate_layer_model = Model(inputs=model.input, outputs=model.get_layer("hidden").output) 125 | token_to_vector = dict() 126 | for token, nb in token_to_nb.items(): 127 | x = [0] * (kept_main_tokens + 1) 128 | x[nb] = 1 129 | intermediate_output = intermediate_layer_model.predict(np.asarray([x])) 130 | vector = intermediate_output[0].tolist() 131 | token_to_vector[token] = vector 132 | token_to_vector_file_name = "token_to_vector_" + str(time_stamp) + ".json" 133 | with open(token_to_vector_file_name, "w") as file: 134 | json.dump(token_to_vector, file, sort_keys=True, indent=4) 135 | 136 | # show prediction for a 
few randomly selected examples 137 | # ctr = 0 138 | # for (x,y) in xy_pair_generator(data_paths, x_length, y_length): 139 | # print("X : " + str(x)) 140 | # print("Y : " + str(y)) 141 | # y_predicted = model.predict(x) 142 | # print("Y_predicted: " + str(y_predicted)) 143 | # 144 | # ctr += 1 145 | # if ctr > 10: 146 | # break 147 | 148 | 149 | 150 | 151 | -------------------------------------------------------------------------------- /DeepBugs/python/EmbeddingLearnerWord2Vec.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 26, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | import math 8 | from os import getcwd 9 | from os.path import join 10 | import sys 11 | import time 12 | import json 13 | from gensim.models import Word2Vec 14 | 15 | nb_tokens_in_context = 20 16 | kept_tokens = 10000 17 | 18 | embedding_size = 200 19 | 20 | class EncodedSequenceReader(object): 21 | def __init__(self, data_paths): 22 | self.data_paths = data_paths 23 | 24 | def __iter__(self): 25 | for data_path in self.data_paths: 26 | print("Reading file " + data_path) 27 | with open(data_path) as file: 28 | token_sequences = json.load(file) 29 | for seq in token_sequences: 30 | yield seq 31 | 32 | if __name__ == '__main__': 33 | # arguments: 34 | 35 | token_to_nb_file = sys.argv[1] 36 | data_paths = list(map(lambda f: join(getcwd(), f), sys.argv[2:])) 37 | if len(data_paths) is 0: 38 | print("Must pass token_to_nb files and at least one data file") 39 | sys.exit(1) 40 | 41 | token_seqs = EncodedSequenceReader(data_paths) 42 | model = Word2Vec(token_seqs, min_count=1, window=nb_tokens_in_context/2, size=embedding_size, workers=40) 43 | 44 | # store the model 45 | time_stamp = math.floor(time.time() * 1000) 46 | model.save("embedding_model_" + str(time_stamp)) 47 | 48 | # after training the model, write token-to-vector map (= learned embedding) to file 49 | with open(token_to_nb_file, "r") as file: 50 | token_to_nb = json.load(file) 51 | token_to_vector = dict() 52 | for token in model.wv.vocab: 53 | if token.startswith("ID:") or token.startswith("LIT:"): 54 | vector = model[token].tolist() 55 | token_to_vector[token] = vector 56 | token_to_vector_file_name = "token_to_vector_" + str(time_stamp) + ".json" 57 | with open(token_to_vector_file_name, "w") as file: 58 | json.dump(token_to_vector, file, sort_keys=True, indent=4) 59 | 60 | -------------------------------------------------------------------------------- /DeepBugs/python/EmbeddingModelValidator.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 17, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | import sys 8 | from os import getcwd 9 | from os.path import join 10 | import json 11 | from keras.models import load_model 12 | import numpy as np 13 | from keras import backend as K 14 | import random 15 | from numpy import float32 16 | 17 | nb_tokens_in_context = 20 18 | kept_main_tokens = 10000 19 | kept_context_tokens = 1000 20 | 21 | # custom loss and accuracy to account for unbalanced y vectors 22 | weight_of_ones = kept_context_tokens 23 | 24 | def weighted_loss(y_true, y_pred): 25 | weights = y_true * weight_of_ones 26 | clipped_y_pred = K.clip(y_pred, K.epsilon(), None) 27 | weighted_cross_entropy = -(y_true * K.log(clipped_y_pred) * weights) 28 | result = K.mean(weighted_cross_entropy) 29 | return result 30 | 31 | def weighted_accuracy(y_true, y_pred): 32 | weights = y_true * weight_of_ones 33 | weighted_equal = 
K.cast(K.equal(y_true, K.round(y_pred)), K.floatx()) * weights 34 | return K.mean(weighted_equal) 35 | 36 | def get_xy_pair(path): 37 | encoded_tokens_with_context = np.load(path) 38 | for token_with_context in encoded_tokens_with_context: 39 | sample = random.random() < 0.001 40 | if sample: 41 | # encode token and context as one-hot vectors 42 | # first element of token_with_context = number of main token 43 | x = np.zeros(kept_main_tokens + 1) 44 | x[token_with_context[0]] = 1 45 | 46 | y = np.zeros(nb_tokens_in_context * (kept_context_tokens + 1)) 47 | for idx, nb_of_context_token in enumerate(token_with_context[1:]): # 2nd, 3rd, etc. element of token_with_context = numbers of context tokens 48 | offset = idx * (kept_context_tokens + 1) 49 | y[offset + nb_of_context_token] = 1 50 | 51 | yield (x, y) 52 | 53 | if __name__ == '__main__': 54 | # arguments: 55 | if len(sys.argv) < 3: 56 | print("Insufficient arguments") 57 | sys.exit(10) 58 | model_file = sys.argv[1] 59 | token_with_context_file = sys.argv[2] 60 | 61 | model = load_model(model_file, custom_objects={"weighted_loss":weighted_loss, "weighted_accuracy":weighted_accuracy}) 62 | 63 | nb_examples = 0 64 | for (x, y_true) in get_xy_pair(token_with_context_file): 65 | print("x: "+str(x)) 66 | xs = np.asarray([x]) 67 | ys = model.predict(xs) 68 | y_pred = ys[0] 69 | print("y_pred: "+str(y_pred)) 70 | y_rounded = K.eval(K.round(y_pred)) 71 | print("y_rounded: "+str(y_rounded)) 72 | y_true = y_true.astype(float32) 73 | print("y_true : "+str(y_true)) 74 | print("accuracy : "+str(K.eval(weighted_accuracy(y_true, y_pred)))) 75 | 76 | nb_examples += 1 77 | if nb_examples > 0: 78 | break 79 | 80 | 81 | -------------------------------------------------------------------------------- /DeepBugs/python/LearningDataBinOperator.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Nov 9, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | import Util 8 | from collections import Counter 9 | import random 10 | import pandas as pd 11 | 12 | data = pd.read_pickle('benchmarks/binOps_data.pkl', 'gzip') 13 | type_embedding_size = 5 14 | node_type_embedding_size = 8 # if changing here, then also change in LearningDataBinOperator 15 | 16 | 17 | class CodePiece(object): 18 | def __init__(self, left, right, op, src): 19 | self.left = left 20 | self.right = right 21 | self.op = op 22 | self.src = src 23 | 24 | def to_message(self): 25 | return str(self.src) + " | " + str(self.left) + " | " + str(self.op) + " | " + str(self.right) 26 | 27 | 28 | class LearningData(object): 29 | def __init__(self): 30 | self.all_operators = None 31 | self.stats = {} 32 | 33 | def resetStats(self): 34 | self.stats = {} 35 | 36 | def pre_scan(self, training_data_paths, validation_data_paths): 37 | all_operators_set = set() 38 | for bin_op in Util.DataReader(training_data_paths): 39 | if isinstance(bin_op, list): 40 | for bop in bin_op: 41 | all_operators_set.add(bop['op']) 42 | else: 43 | all_operators_set.add(bin_op["op"]) 44 | for bin_op in Util.DataReader(validation_data_paths): 45 | if isinstance(bin_op, list): 46 | for bop in bin_op: 47 | all_operators_set.add(bop['op']) 48 | else: 49 | all_operators_set.add(bin_op["op"]) 50 | all_operators_set.update(set(data['op'])) 51 | self.all_operators = list(all_operators_set) 52 | 53 | def code_to_xy_pairs_given_incorrect_example(self, bin_op, xs, ys, name_to_vector, type_to_vector, 54 | node_type_to_vector, code_pieces): 55 | x_correct, y_correct = None, None 56 | x_incorrect, 
y_incorrect = None, None 57 | cor_incorrect_code_pieces = [] 58 | for op in bin_op: 59 | left = op["left"] 60 | right = op["right"] 61 | operator = op["op"] 62 | left_type = op["leftType"] 63 | right_type = op["rightType"] 64 | parent = op["parent"] 65 | grand_parent = op["grandParent"] 66 | src = op["src"] 67 | if left not in name_to_vector: 68 | continue 69 | if right not in name_to_vector: 70 | continue 71 | left_vector = name_to_vector[left] 72 | right_vector = name_to_vector[right] 73 | operator_vector = [0] * len(self.all_operators) 74 | operator_vector[self.all_operators.index(operator)] = 1 75 | left_type_vector = type_to_vector.get(left_type, [0] * type_embedding_size) 76 | right_type_vector = type_to_vector.get(right_type, [0] * type_embedding_size) 77 | parent_vector = node_type_to_vector[parent] 78 | grand_parent_vector = node_type_to_vector[grand_parent] 79 | vec = left_vector + right_vector + operator_vector + left_type_vector + right_type_vector + parent_vector + grand_parent_vector 80 | if op['probability_that_incorrect'] == 0: 81 | x_correct = vec 82 | y_correct = [0] 83 | elif op['probability_that_incorrect'] == 1: 84 | x_incorrect = vec 85 | y_incorrect = [1] 86 | cor_incorrect_code_pieces.append(CodePiece(left, right, operator, src)) 87 | 88 | if x_correct and y_correct and x_incorrect and y_incorrect: 89 | xs.append(x_correct) 90 | ys.append(y_correct) 91 | xs.append(x_incorrect) 92 | ys.append(y_incorrect) 93 | code_pieces.extend(cor_incorrect_code_pieces) 94 | 95 | def code_to_xy_pairs(self, bin_op, xs, ys, name_to_vector, type_to_vector, node_type_to_vector, code_pieces): 96 | left = bin_op["left"] 97 | right = bin_op["right"] 98 | operator = bin_op["op"] 99 | left_type = bin_op["leftType"] 100 | right_type = bin_op["rightType"] 101 | parent = bin_op["parent"] 102 | grand_parent = bin_op["grandParent"] 103 | src = bin_op["src"] 104 | if not (left in name_to_vector): 105 | return 106 | if not (right in name_to_vector): 107 | return 108 | 109 | left_vector = name_to_vector[left] 110 | right_vector = name_to_vector[right] 111 | operator_vector = [0] * len(self.all_operators) 112 | operator_vector[self.all_operators.index(operator)] = 1 113 | left_type_vector = type_to_vector.get(left_type, [0] * type_embedding_size) 114 | right_type_vector = type_to_vector.get(right_type, [0] * type_embedding_size) 115 | parent_vector = node_type_to_vector[parent] 116 | grand_parent_vector = node_type_to_vector[grand_parent] 117 | 118 | # for all xy-pairs: y value = probability that incorrect 119 | x_correct = left_vector + right_vector + operator_vector + left_type_vector + right_type_vector + parent_vector + grand_parent_vector 120 | y_correct = [0] 121 | xs.append(x_correct) 122 | ys.append(y_correct) 123 | code_pieces.append(CodePiece(left, right, operator, src)) 124 | 125 | # pick some other, likely incorrect operator 126 | other_operator_vector = None 127 | while other_operator_vector == None: 128 | other_operator = random.choice(self.all_operators) 129 | if other_operator != operator: 130 | other_operator_vector = [0] * len(self.all_operators) 131 | other_operator_vector[self.all_operators.index(other_operator)] = 1 132 | 133 | x_incorrect = left_vector + right_vector + other_operator_vector + left_type_vector + right_type_vector + parent_vector + grand_parent_vector 134 | y_incorrect = [1] 135 | xs.append(x_incorrect) 136 | ys.append(y_incorrect) 137 | code_pieces.append(CodePiece(left, right, other_operator, src)) 138 | 139 | def anomaly_score(self, y_prediction_orig, 
y_prediction_changed): 140 | return y_prediction_orig 141 | 142 | def normal_score(self, y_prediction_orig, y_prediction_changed): 143 | return y_prediction_changed 144 | -------------------------------------------------------------------------------- /DeepBugs/python/LearningDataIncorrectAssignment_with_parents.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Nov 14, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | import Util 8 | from collections import namedtuple 9 | import random 10 | from tqdm import tqdm 11 | 12 | type_embedding_size = 5 13 | 14 | class CodePiece(object): 15 | def __init__(self, lhs, rhs, src): 16 | self.lhs = lhs 17 | self.rhs = rhs 18 | self.src = src 19 | 20 | def to_message(self): 21 | return str(self.src) + " | " + str(self.lhs) + " | " + str(self.rhs) 22 | 23 | RHS = namedtuple('Assignment', ['rhs', 'type']) 24 | 25 | class LearningData(object): 26 | def __init__(self): 27 | self.file_to_RHSs = dict() # string to set of RHSs 28 | self.stats = {} 29 | 30 | def resetStats(self): 31 | self.stats = {} 32 | 33 | def pre_scan(self, training_data_paths, validation_data_paths): 34 | all_assignments = list(Util.DataReader(training_data_paths)) 35 | for assignment in tqdm(all_assignments, desc='Preprocessing training data'): 36 | if isinstance(assignment, list): 37 | for assgn in assignment: 38 | file = assgn["src"].split(" : ")[0] 39 | rhsides = self.file_to_RHSs.setdefault(file, set()) 40 | rhsides.add(RHS(assgn["rhs"], assgn["rhsType"])) 41 | else: 42 | file = assignment["src"].split(" : ")[0] 43 | rhsides = self.file_to_RHSs.setdefault(file, set()) 44 | rhsides.add(RHS(assignment["rhs"], assignment["rhsType"])) 45 | all_assignments = Util.DataReader(validation_data_paths) 46 | for assignment in tqdm(all_assignments, desc='Preprocessing validation data'): 47 | if isinstance(assignment, list): 48 | for assgn in assignment: 49 | file = assgn["src"].split(" : ")[0] 50 | rhsides = self.file_to_RHSs.setdefault(file, set()) 51 | rhsides.add(RHS(assgn["rhs"], assgn["rhsType"])) 52 | else: 53 | file = assignment["src"].split(" : ")[0] 54 | rhsides = self.file_to_RHSs.setdefault(file, set()) 55 | rhsides.add(RHS(assignment["rhs"], assignment["rhsType"])) 56 | 57 | def code_to_xy_pairs_given_incorrect_example(self, assignment, xs, ys, name_to_vector, type_to_vector, 58 | node_type_to_vector, code_pieces): 59 | x_correct, y_correct = None, None 60 | x_incorrect, y_incorrect = None, None 61 | cor_incorrect_code_pieces = [] 62 | 63 | for assgn in assignment: 64 | lhs = assgn["lhs"] 65 | rhs = assgn["rhs"] 66 | rhs_type = assgn["rhsType"] 67 | parent = assgn["parent"] 68 | grand_parent = assgn["grandParent"] 69 | # context = assgn["context"] 70 | src = assgn["src"] 71 | if not (lhs in name_to_vector): 72 | return 73 | if not (rhs in name_to_vector): 74 | return 75 | 76 | lhs_vector = name_to_vector[lhs] 77 | rhs_vector = name_to_vector[rhs] 78 | rhs_type_vector = type_to_vector.get(rhs_type, [0] * type_embedding_size) 79 | parent_vector = node_type_to_vector[parent] 80 | grand_parent_vector = node_type_to_vector[grand_parent] 81 | 82 | # transform context into embedding vectors (0 if not available) 83 | # (pre_context, post_context, all_context) = self.select_context_ids(lhs, rhs, context) 84 | # context_vector = self.context_ids_to_embeddings(pre_context, post_context, name_to_vector) 85 | 86 | # for all xy-pairs: y value = probability that incorrect 87 | vec = lhs_vector + rhs_vector + rhs_type_vector + parent_vector 
+ grand_parent_vector 88 | 89 | if int(assgn['probability_that_incorrect']) == 0: 90 | x_correct = vec 91 | y_correct = [0] 92 | elif int(assgn['probability_that_incorrect']) == 1: 93 | x_incorrect = vec 94 | y_incorrect = [1] 95 | cor_incorrect_code_pieces.append(CodePiece(lhs, rhs, src)) 96 | if x_correct and y_correct and x_incorrect and y_incorrect: 97 | xs.append(x_correct) 98 | ys.append(y_correct) 99 | 100 | xs.append(x_incorrect) 101 | ys.append(y_incorrect) 102 | code_pieces.append(cor_incorrect_code_pieces) 103 | 104 | def code_to_xy_pairs(self, assignment, xs, ys, name_to_vector, type_to_vector, node_type_to_vector, code_pieces): 105 | lhs = assignment["lhs"] 106 | rhs = assignment["rhs"] 107 | rhs_type = assignment["rhsType"] 108 | parent = assignment["parent"] 109 | grand_parent = assignment["grandParent"] 110 | src = assignment["src"] 111 | if not (lhs in name_to_vector): 112 | return 113 | if not (rhs in name_to_vector): 114 | return 115 | 116 | lhs_vector = name_to_vector[lhs] 117 | rhs_vector = name_to_vector[rhs] 118 | rhs_type_vector = type_to_vector.get(rhs_type, [0]*type_embedding_size) 119 | parent_vector = node_type_to_vector[parent] 120 | grand_parent_vector = node_type_to_vector[grand_parent] 121 | 122 | # find an alternative rhs in the same file 123 | file = src.split(" : ")[0] 124 | all_RHSs = self.file_to_RHSs[file] 125 | tries_left = 100 126 | found = False 127 | while (not found) and tries_left > 0: 128 | other_rhs = random.choice(list(all_RHSs)) 129 | if other_rhs.rhs in name_to_vector and other_rhs.rhs != rhs: 130 | found = True 131 | tries_left -= 1 132 | 133 | if not found: 134 | return 135 | 136 | # for all xy-pairs: y value = probability that incorrect 137 | x_correct = lhs_vector + rhs_vector + rhs_type_vector + parent_vector + grand_parent_vector 138 | y_correct = [0] 139 | xs.append(x_correct) 140 | ys.append(y_correct) 141 | code_pieces.append(CodePiece(lhs, rhs, src)) 142 | 143 | other_rhs_vector = name_to_vector[other_rhs.rhs] 144 | other_rhs_type_vector = type_to_vector[other_rhs.type] 145 | x_incorrect = lhs_vector + other_rhs_vector + other_rhs_type_vector + parent_vector + grand_parent_vector 146 | y_incorrect = [1] 147 | xs.append(x_incorrect) 148 | ys.append(y_incorrect) 149 | code_pieces.append(CodePiece(lhs, rhs, src)) 150 | 151 | def anomaly_score(self, y_prediction_orig, y_prediction_changed): 152 | return y_prediction_orig 153 | 154 | def normal_score(self, y_prediction_orig, y_prediction_changed): 155 | return y_prediction_changed 156 | -------------------------------------------------------------------------------- /DeepBugs/python/LearningDataMissingArg.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 9, 2018 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | import Util 8 | from collections import Counter 9 | import random 10 | 11 | name_embedding_size = 200 12 | type_embedding_size = 5 13 | max_nb_args = 2 14 | 15 | class CodePiece(object): 16 | def __init__(self, callee, arguments, src): 17 | self.callee = callee 18 | self.arguments = arguments 19 | self.src = src 20 | 21 | def to_message(self): 22 | return str(self.src) + " | " + str(self.callee) + " | " + str(self.arguments) 23 | 24 | class LearningData(object): 25 | def __init__(self): 26 | self.stats = {"calls": 0, "calls_with_too_many_args": 0, "calls_with_too_few_args": 0, "calls_with_known_names": 0, 27 | "calls_with_known_base_object": 0} 28 | 29 | def pre_scan(self, training_data_paths, validation_data_paths): 30 
| print("Stats on training data") 31 | self.gather_stats(training_data_paths) 32 | print("Stats on validation data") 33 | self.gather_stats(validation_data_paths) 34 | 35 | def gather_stats(self, data_paths): 36 | callee_to_freq = Counter() 37 | argument_to_freq = Counter() 38 | total_calls = 0 39 | 40 | for call in Util.DataReader(data_paths): 41 | callee_to_freq[call["callee"]] += 1 42 | for argument in call["arguments"]: 43 | argument_to_freq[argument] += 1 44 | total_calls += 1 45 | 46 | print("Total calls : " + str(total_calls)) 47 | print("Unique callees : " + str(len(callee_to_freq))) 48 | print(" " + "\n ".join(str(x) for x in callee_to_freq.most_common(10))) 49 | Util.analyze_histograms(callee_to_freq) 50 | print("Unique arguments : " + str(len(argument_to_freq))) 51 | print(" " + "\n ".join(str(x) for x in argument_to_freq.most_common(10))) 52 | Util.analyze_histograms(argument_to_freq) 53 | 54 | def code_to_xy_pairs(self, call, xs, ys, name_to_vector, type_to_vector, node_type_to_vector, calls=None): 55 | arguments = call["arguments"] 56 | self.stats["calls"] += 1 57 | if len(arguments) > max_nb_args: 58 | self.stats["calls_with_too_many_args"] += 1 59 | return 60 | if len(arguments) < 1: 61 | self.stats["calls_with_too_few_args"] += 1 62 | return 63 | 64 | # mandatory information: callee and argument names 65 | callee_string = call["callee"] 66 | argument_strings = call["arguments"] 67 | if not (callee_string in name_to_vector): 68 | return 69 | for argument_string in argument_strings: 70 | if not (argument_string in name_to_vector): 71 | return 72 | self.stats["calls_with_known_names"] += 1 73 | callee_vector = name_to_vector[callee_string] 74 | argument_vectors = [] 75 | for argument_string in argument_strings: 76 | argument_vectors.append(name_to_vector[argument_string]) 77 | if len(argument_vectors) >= max_nb_args: 78 | break 79 | 80 | # optional information: base object, argument types, etc. 
81 | base_string = call["base"] 82 | base_vector = name_to_vector.get(base_string, [0]*name_embedding_size) 83 | if base_string in name_to_vector: 84 | self.stats["calls_with_known_base_object"] += 1 85 | 86 | argument_type_strings = call["argumentTypes"] 87 | argument_type_vectors = [] 88 | for argument_type_string in argument_type_strings: 89 | argument_type_vectors.append(type_to_vector.get(argument_type_string, [0]*type_embedding_size)) 90 | if len(argument_type_vectors) >= max_nb_args: 91 | break 92 | 93 | parameter_strings = call["parameters"] 94 | parameter_vectors = [] 95 | for parameter_string in parameter_strings: 96 | parameter_vectors.append(name_to_vector.get(parameter_string, [0]*name_embedding_size)) 97 | if len(parameter_vectors) >= max_nb_args: 98 | break 99 | 100 | # for all xy-pairs: y value = probability that incorrect 101 | x_orig = callee_vector + base_vector 102 | # add argument vectors (and pad if not enough available) 103 | for i in range(max_nb_args): 104 | if len(argument_vectors) > i: 105 | x_orig += argument_vectors[i] 106 | else: 107 | x_orig += [0]*name_embedding_size 108 | # add argument type vectors (and pad if not enough available) 109 | for i in range(max_nb_args): 110 | if len(argument_type_vectors) > i: 111 | x_orig += argument_type_vectors[i] 112 | else: 113 | x_orig += [0]*type_embedding_size 114 | # add parameter vectors (and pad if not enough available) 115 | for i in range(max_nb_args): 116 | if len(parameter_vectors) > i: 117 | x_orig += parameter_vectors[i] 118 | else: 119 | x_orig += [0]*name_embedding_size 120 | y_orig = [0] 121 | xs.append(x_orig) 122 | ys.append(y_orig) 123 | if calls != None: 124 | calls.append(CodePiece(callee_string, argument_strings, call["src"])) 125 | 126 | # for the negative example, remove a randomly picked argument 127 | idx_to_remove = random.randint(0, len(argument_vectors)-1) 128 | del argument_vectors[idx_to_remove] 129 | del argument_type_vectors[idx_to_remove] 130 | del parameter_vectors[idx_to_remove] 131 | x_buggy = callee_vector + base_vector 132 | # add argument vectors (and pad if not enough available) 133 | for i in range(max_nb_args): 134 | if len(argument_vectors) > i: 135 | x_buggy += argument_vectors[i] 136 | else: 137 | x_buggy += [0]*name_embedding_size 138 | # add argument type vectors (and pad if not enough available) 139 | for i in range(max_nb_args): 140 | if len(argument_type_vectors) > i: 141 | x_buggy += argument_type_vectors[i] 142 | else: 143 | x_buggy += [0]*type_embedding_size 144 | # add parameter vectors (and pad if not enough available) 145 | for i in range(max_nb_args): 146 | if len(parameter_vectors) > i: 147 | x_buggy += parameter_vectors[i] 148 | else: 149 | x_buggy += [0]*name_embedding_size 150 | y_buggy = [1] 151 | 152 | xs.append(x_buggy) 153 | ys.append(y_buggy) 154 | if calls != None: 155 | calls.append(CodePiece(callee_string, argument_strings, call["src"])) 156 | 157 | def anomaly_score(self, y_prediction_orig, y_prediction_changed): 158 | return y_prediction_orig # higher means more likely to be anomaly in current code 159 | 160 | def normal_score(self, y_prediction_orig, y_prediction_changed): 161 | return y_prediction_changed # higher means more likely to be correct in current code 162 | -------------------------------------------------------------------------------- /DeepBugs/python/LearningDataSwappedArgs.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Nov 9, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 
| import Util 8 | from collections import Counter 9 | 10 | name_embedding_size = 200 11 | file_name_embedding_size = 50 12 | type_embedding_size = 5 13 | 14 | class CodePiece(object): 15 | def __init__(self, callee, arguments, src): 16 | self.callee = callee 17 | self.arguments = arguments 18 | self.src = src 19 | 20 | def to_message(self): 21 | return str(self.src) + " | " + str(self.callee) + " | " + str(self.arguments) 22 | 23 | class LearningData(object): 24 | def is_known_type(self, t): 25 | return t == "boolean" or t == "number" or t == "object" or t == "regex" or t == "string" 26 | 27 | def resetStats(self): 28 | self.stats = {"calls": 0, "calls_with_two_args": 0, "calls_with_known_names": 0, 29 | "calls_with_known_base_object": 0, "calls_with_known_types": 0, 30 | "calls_with_both_known_types": 0, 31 | "calls_with_known_parameters" :0} 32 | 33 | def pre_scan(self, training_data_paths, validation_data_paths): 34 | print("Stats on training data") 35 | self.gather_stats(training_data_paths) 36 | print("Stats on validation data") 37 | self.gather_stats(validation_data_paths) 38 | 39 | def gather_stats(self, data_paths): 40 | callee_to_freq = Counter() 41 | argument_to_freq = Counter() 42 | 43 | for call in Util.DataReader(data_paths): 44 | callee_to_freq[call["callee"]] += 1 45 | for argument in call["arguments"]: 46 | argument_to_freq[argument] += 1 47 | 48 | print("Unique callees : " + str(len(callee_to_freq))) 49 | print(" " + "\n ".join(str(x) for x in callee_to_freq.most_common(10))) 50 | Util.analyze_histograms(callee_to_freq) 51 | print("Unique arguments : " + str(len(argument_to_freq))) 52 | print(" " + "\n ".join(str(x) for x in argument_to_freq.most_common(10))) 53 | Util.analyze_histograms(argument_to_freq) 54 | 55 | def code_to_xy_pairs(self, call, xs, ys, name_to_vector, type_to_vector, node_type_to_vector, calls=None): 56 | arguments = call["arguments"] 57 | self.stats["calls"] += 1 58 | if len(arguments) != 2: 59 | return 60 | self.stats["calls_with_two_args"] += 1 61 | 62 | # mandatory information: callee and argument names 63 | callee_string = call["callee"] 64 | argument_strings = call["arguments"] 65 | if not (callee_string in name_to_vector): 66 | return 67 | for argument_string in argument_strings: 68 | if not (argument_string in name_to_vector): 69 | return 70 | self.stats["calls_with_known_names"] += 1 71 | callee_vector = name_to_vector[callee_string] 72 | argument0_vector = name_to_vector[argument_strings[0]] 73 | argument1_vector = name_to_vector[argument_strings[1]] 74 | 75 | # optional information: base object, argument types, etc. 
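        # A quick sketch of the two examples generated per call site, using the
        # sizes declared above (name_embedding_size = 200, type_embedding_size = 5):
        #   x_keep = callee (200) + arg0 (200) + arg1 (200) + base (200)
        #            + arg0 type (5) + arg1 type (5) + param0 (200) + param1 (200)
        #   x_swap = the same vector with the two argument (and argument type)
        #            embeddings exchanged; the parameter embeddings keep their order.
        #   y = [0] for the original argument order and y = [1] for the swapped
        #   order, i.e. y is the probability that the argument order is incorrect.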
76 | base_string = call["base"] 77 | base_vector = name_to_vector.get(base_string, [0]*name_embedding_size) 78 | if base_string in name_to_vector: 79 | self.stats["calls_with_known_base_object"] += 1 80 | 81 | argument_type_strings = call["argumentTypes"] 82 | argument0_type_vector = type_to_vector.get(argument_type_strings[0], [0]*type_embedding_size) 83 | argument1_type_vector = type_to_vector.get(argument_type_strings[1], [0]*type_embedding_size) 84 | if (self.is_known_type(argument_type_strings[0]) or self.is_known_type(argument_type_strings[1])): 85 | self.stats["calls_with_known_types"] += 1 86 | if (self.is_known_type(argument_type_strings[0]) and self.is_known_type(argument_type_strings[1])): 87 | self.stats["calls_with_both_known_types"] += 1 88 | 89 | parameter_strings = call["parameters"] 90 | parameter0_vector = name_to_vector.get(parameter_strings[0], [0]*name_embedding_size) 91 | parameter1_vector = name_to_vector.get(parameter_strings[1], [0]*name_embedding_size) 92 | if (parameter_strings[0] in name_to_vector or parameter_strings[1] in name_to_vector): 93 | self.stats["calls_with_known_parameters"] += 1 94 | 95 | # for all xy-pairs: y value = probability that incorrect 96 | x_keep = callee_vector + argument0_vector + argument1_vector 97 | x_keep += base_vector + argument0_type_vector + argument1_type_vector 98 | x_keep += parameter0_vector + parameter1_vector #+ file_name_vector 99 | y_keep = [0] 100 | xs.append(x_keep) 101 | ys.append(y_keep) 102 | if calls != None: 103 | calls.append(CodePiece(callee_string, argument_strings, call["src"])) 104 | 105 | x_swap = callee_vector + argument1_vector + argument0_vector 106 | x_swap += base_vector + argument1_type_vector + argument0_type_vector 107 | x_swap += parameter0_vector + parameter1_vector #+ file_name_vector 108 | y_swap = [1] 109 | xs.append(x_swap) 110 | ys.append(y_swap) 111 | if calls != None: 112 | calls.append(CodePiece(callee_string, argument_strings, call["src"])) 113 | 114 | def anomaly_score(self, y_prediction_orig, y_prediction_changed): 115 | return y_prediction_orig - y_prediction_changed # higher means more likely to be anomaly in current code 116 | 117 | def normal_score(self, y_prediction_orig, y_prediction_changed): 118 | return y_prediction_changed - y_prediction_orig # higher means more likely to be correct in current code 119 | -------------------------------------------------------------------------------- /DeepBugs/python/LearningDataSwappedBinOperands.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Nov 13, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | import Util 8 | from collections import Counter 9 | import random 10 | 11 | type_embedding_size = 5 12 | node_type_embedding_size = 8 # if changing here, then also change in LearningDataBinOperator 13 | 14 | commutative_operators = ["+", "==", "===", "!==", "!=", "*", "|", "&", "^"] 15 | 16 | class CodePiece(object): 17 | def __init__(self, left, right, op, src): 18 | self.left = left 19 | self.right = right 20 | self.op = op 21 | self.src = src 22 | 23 | def to_message(self): 24 | return str(self.src) + " | " + str(self.left) + " | " + str(self.op) + " | " + str(self.right) 25 | 26 | class LearningData(object): 27 | def __init__(self): 28 | self.all_operators = None 29 | self.stats = {} 30 | 31 | def pre_scan(self, training_data_paths, validation_data_paths): 32 | all_operators_set = set() 33 | for bin_op in Util.DataReader(training_data_paths): 34 | all_operators_set.add(bin_op["op"]) 
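        # pre_scan collects the operator vocabulary from both the training and the
        # validation data, so that self.all_operators later yields a consistent
        # one-hot index for every operator seen in code_to_xy_pairs. Operators in
        # commutative_operators are skipped there, because swapping the operands of
        # a commutative operator typically does not change the result and would
        # therefore not produce a useful negative example.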
35 | for bin_op in Util.DataReader(validation_data_paths): 36 | all_operators_set.add(bin_op["op"]) 37 | self.all_operators = list(all_operators_set) 38 | 39 | def code_to_xy_pairs(self, bin_op, xs, ys, name_to_vector, type_to_vector, node_type_to_vector, code_pieces): 40 | left = bin_op["left"] 41 | right = bin_op["right"] 42 | operator = bin_op["op"] 43 | left_type = bin_op["leftType"] 44 | right_type = bin_op["rightType"] 45 | parent = bin_op["parent"] 46 | grand_parent = bin_op["grandParent"] 47 | src = bin_op["src"] 48 | if not (left in name_to_vector): 49 | return 50 | if not (right in name_to_vector): 51 | return 52 | if operator in commutative_operators: 53 | return 54 | 55 | left_vector = name_to_vector[left] 56 | right_vector = name_to_vector[right] 57 | operator_vector = [0] * len(self.all_operators) 58 | operator_vector[self.all_operators.index(operator)] = 1 59 | left_type_vector = type_to_vector.get(left_type, [0]*type_embedding_size) 60 | right_type_vector = type_to_vector.get(right_type, [0]*type_embedding_size) 61 | parent_vector = node_type_to_vector[parent] 62 | grand_parent_vector = node_type_to_vector[grand_parent] 63 | 64 | # for all xy-pairs: y value = probability that incorrect 65 | x_correct = left_vector + right_vector + operator_vector + left_type_vector + right_type_vector + parent_vector + grand_parent_vector 66 | y_correct = [0] 67 | xs.append(x_correct) 68 | ys.append(y_correct) 69 | code_pieces.append(CodePiece(left, right, operator, src)) 70 | 71 | # swap operands 72 | x_incorrect = right_vector + left_vector + operator_vector + right_type_vector + left_type_vector + parent_vector + grand_parent_vector 73 | y_incorrect = [1] 74 | xs.append(x_incorrect) 75 | ys.append(y_incorrect) 76 | code_pieces.append(CodePiece(right, left, operator, src)) 77 | 78 | def anomaly_score(self, y_prediction_orig, y_prediction_changed): 79 | return y_prediction_orig - y_prediction_changed # higher means more likely to be anomaly in current code 80 | 81 | def normal_score(self, y_prediction_orig, y_prediction_changed): 82 | return y_prediction_changed - y_prediction_orig # higher means more likely to be correct in current code -------------------------------------------------------------------------------- /DeepBugs/python/LocationBasedEmbeddingEvaluator.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 24, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | import sys 8 | import json 9 | from os.path import join 10 | from os import getcwd 11 | from sklearn.decomposition.incremental_pca import IncrementalPCA 12 | from matplotlib import pyplot 13 | import re 14 | import random 15 | from scipy.spatial.kdtree import KDTree 16 | from scipy.spatial.distance import cosine 17 | import numpy as np 18 | import resource # @UnresolvedImport 19 | from dca.Util import Util 20 | 21 | sampling_rate_for_PCA = 0.01 22 | 23 | util = Util() 24 | 25 | class RawDataReader(object): 26 | def __init__(self, data_paths): 27 | self.data_paths = data_paths 28 | 29 | def __iter__(self): 30 | for data_path in self.data_paths: 31 | print("Reading file " + data_path) 32 | with open(data_path) as file: 33 | items = json.load(file) 34 | for item in items: 35 | yield item 36 | 37 | if __name__ == '__main__': 38 | # arguments: 39 | 40 | giga_byte = 1024 * 1024 * 1024 41 | max_bytes = 8 * giga_byte 42 | resource.setrlimit(resource.RLIMIT_AS, (max_bytes, max_bytes)) 43 | 44 | location_to_vector_file = join(getcwd(), sys.argv[1]) 45 | with 
open(location_to_vector_file) as f: 46 | location_to_vector = json.load(f) 47 | 48 | data_paths = list(map(lambda f: join(getcwd(), f), sys.argv[2:])) 49 | if len(data_paths) is 0: 50 | print("Must pass token_to_nb files and at least one data file") 51 | sys.exit(1) 52 | 53 | location_to_name = dict() 54 | name_to_locations = dict() 55 | reader = RawDataReader(data_paths) 56 | for token_with_context in reader: 57 | name = token_with_context["token"] 58 | location = token_with_context["location"] 59 | location_to_name[location] = name 60 | if name in name_to_locations: 61 | locations = name_to_locations[name] 62 | else: 63 | locations = [] 64 | name_to_locations[name] = locations 65 | locations.append(location) 66 | 67 | # prepare data structures for efficient similarity queries 68 | names = [] 69 | vectors = [] 70 | for location, vector in location_to_vector.items(): 71 | if location in location_to_name: # some locations have no vectors because their names are infrequent 72 | name = location_to_name[location] 73 | names.append(name) 74 | vectors.append(vector) 75 | print("Name-vector pairs: " + str(len(names))) 76 | 77 | # inspect similarities of locations with same name 78 | remaining_samples = 20 79 | print("\n") 80 | print("In-group simil, Out-group simil, Factor, #Vectors, Token") 81 | for name, locations in name_to_locations.items(): 82 | if len(locations) > 5: 83 | vector_group = list(map(lambda location: location_to_vector[location], locations)) 84 | # compute avg. pairwise similarity in group with same name 85 | in_group_simil = util.in_group_similarity(vector_group) 86 | 87 | # compute avg. similarity to some other vectors 88 | out_group_simil = util.out_group_similarity(vector_group, vectors) 89 | 90 | factor = in_group_simil / out_group_simil 91 | print(str(round(in_group_simil, 4))+", "+str(round(out_group_simil, 4))+", "+str(round(factor, 2))+", "+str(len(vector_group))+", "+name) 92 | remaining_samples -= 1 93 | if remaining_samples is 0: 94 | break 95 | 96 | 97 | # 98 | # 99 | # names = [] 100 | # vectors = [] 101 | # for name, vector in name_to_vector.items(): 102 | # names.append(name) 103 | # vectors.append(vector) 104 | # 105 | # # perform q few similarity queries 106 | # queries = [ "i", "name", "jQuery", "counter", "element", "true", "msg", "length"] # for token-based 107 | # queries = [ "ID:i", "ID:name", "ID:jQuery", "ID:counter", "ID:element", "LIT:true", "ID:msg", "ID:length"] # for AST-based 108 | # kd_tree = KDTree(np.array(vectors)) 109 | # for query in queries: 110 | # if query in name_to_vector: 111 | # print(query + " has similar names:") 112 | # query_vector = name_to_vector[query] 113 | # _, neighbor_idxs = kd_tree.query(query_vector, k=6) 114 | # closest_names = [] 115 | # for idx in neighbor_idxs: 116 | # close_name = names[idx] 117 | # if close_name != query: 118 | # print(" " + close_name) 119 | # 120 | # # show PCA 121 | # pca_vectors = [] 122 | # pca_labels = [] 123 | # for idx, name in enumerate(names): 124 | # if random.random() < sampling_rate_for_PCA: 125 | # pca_labels.append(name) 126 | # pca_vectors.append(vectors[idx]) 127 | # 128 | # ipca = IncrementalPCA(n_components=2) 129 | # reduced_vectors = ipca.fit_transform(pca_vectors) 130 | # 131 | # fig, ax = pyplot.subplots() 132 | # x = reduced_vectors[:, 0] 133 | # y = reduced_vectors[:, 1] 134 | # ax.scatter(x, y) 135 | # for idx, label in enumerate(pca_labels): 136 | # escaped_label = re.escape(label) 137 | # ax.annotate(escaped_label, (x[idx], y[idx])) 138 | # 139 | # pyplot.show() 140 | 141 
| 142 | -------------------------------------------------------------------------------- /DeepBugs/python/RandomEmbeddingLearner.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 3, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | import json 8 | import math 9 | import sys 10 | import time 11 | 12 | import numpy as np 13 | import random 14 | 15 | from numpy.random import normal 16 | 17 | kept_main_tokens = 10000 18 | 19 | embedding_size = 200 20 | 21 | def count_samples(data_paths): 22 | total_examples = 0 23 | for path in data_paths: 24 | encoded_tokens_with_context = np.load(path) 25 | total_examples += len(encoded_tokens_with_context) 26 | return total_examples 27 | 28 | def create_random_embedding(): 29 | embedding = [] 30 | for _ in range(0,embedding_size): 31 | # random_bit = round(random.random()) 32 | random_nb = normal(0.0, 0.7) # Gaussian distribution that looks roughly like the values in learned embeddings 33 | embedding.append(random_nb) 34 | return embedding 35 | 36 | if __name__ == '__main__': 37 | # arguments: OR 38 | 39 | token_to_nb_file = sys.argv[1] 40 | with open(token_to_nb_file, "r") as file: 41 | token_to_nb = json.load(file) 42 | token_to_vector = dict() 43 | used_embeddings = set() 44 | for token, _ in token_to_nb.items(): 45 | done = False 46 | while not done: 47 | embedding = create_random_embedding() 48 | if not (str(embedding) in used_embeddings): 49 | token_to_vector[token] = embedding 50 | used_embeddings.add(str(embedding)) 51 | done = True 52 | 53 | time_stamp = math.floor(time.time() * 1000) 54 | token_to_vector_file_name = "token_to_vector_" + str(time_stamp) + ".json" 55 | with open(token_to_vector_file_name, "w") as file: 56 | json.dump(token_to_vector, file, sort_keys=True, indent=4) 57 | 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /DeepBugs/python/TokenWithContextStats.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 17, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | from os import getcwd 8 | from os.path import join 9 | import sys 10 | 11 | import numpy as np 12 | import json 13 | 14 | nb_tokens_in_context = 20 15 | 16 | if __name__ == '__main__': 17 | # arguments: 18 | print("Total arguments: "+str(len(sys.argv))) 19 | data_paths = list(map(lambda f: join(getcwd(), f), sys.argv[1:])) 20 | print("Total files: "+str(len(data_paths))) 21 | if len(data_paths) is 0: 22 | print("Must pass at least one data file") 23 | sys.exit(1) 24 | 25 | token_to_contexts = dict() # store contexts as set of str(array_of_numbers) 26 | context_to_tokens = dict() 27 | 28 | visited_files = 0 29 | for path in data_paths: 30 | visited_files += 1 31 | print("Visiting file "+str(visited_files)+" files of "+str(len(data_paths))) 32 | encoded_tokens_with_context = np.load(path) 33 | print(" Tokens with context: "+str(len(encoded_tokens_with_context))) 34 | visited_tokens = 0 35 | for token_with_context in encoded_tokens_with_context: 36 | # first element of token_with_context = number of main token 37 | token = str(token_with_context[0]) 38 | context_nbs = [] 39 | for nb_of_context_token in token_with_context[1:]: # 2nd, 3rd, etc. 
element of token_with_context = numbers of context tokens 40 | context_nbs.append(nb_of_context_token) 41 | context = str(context_nbs) 42 | 43 | # track token-to-context mappings 44 | if token in token_to_contexts: 45 | token_to_contexts[token].add(context) 46 | else: 47 | token_to_contexts[token] = set([context]) 48 | 49 | # track context-to-token mappings 50 | if context in context_to_tokens: 51 | context_to_tokens[context].add(token) 52 | else: 53 | context_to_tokens[context] = set([token]) 54 | 55 | visited_tokens += 1 56 | if visited_tokens % 100000 is 0: 57 | print(" Visited tokens: "+str(visited_tokens)) 58 | 59 | # transform sets to lists for serialization & count 1:1 mappings 60 | serializable_token_to_contexts = dict() 61 | serializable_context_to_tokens = dict() 62 | tokens_with_single_context = 0 63 | contexts_with_single_token = 0 64 | for token, contexts in token_to_contexts.items(): 65 | serializable_token_to_contexts[token] = list(contexts) 66 | if len(contexts) is 1: 67 | tokens_with_single_context += 1 68 | for context, tokens in context_to_tokens.items(): 69 | serializable_context_to_tokens[context] = list(tokens) 70 | if len(tokens) is 1: 71 | contexts_with_single_token += 1 72 | 73 | print(str(tokens_with_single_context)+" of "+str(len(token_to_contexts))+" tokens occur in only 1 context") 74 | print(str(contexts_with_single_token)+" of "+str(len(context_to_tokens))+" contexts occur in only 1 context") 75 | 76 | with open("tokens_to_contexts.json", "w") as file: 77 | json.dump(serializable_token_to_contexts, file, indent=4) 78 | with open("context_to_tokens.json", "w") as file: 79 | json.dump(serializable_context_to_tokens, file, indent=4) 80 | 81 | -------------------------------------------------------------------------------- /DeepBugs/python/TokensToTopTokens.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 26, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | import sys 8 | import json 9 | from os.path import join 10 | from os import getcwd 11 | from collections import Counter 12 | import math 13 | import time 14 | from multiprocessing import Pool 15 | 16 | kept_tokens = 10000 17 | 18 | nb_processes = 30 19 | 20 | class RawDataReader(object): 21 | def __init__(self, data_paths): 22 | self.data_paths = data_paths 23 | 24 | def __iter__(self): 25 | for data_path in self.data_paths: 26 | print("Reading file " + data_path) 27 | with open(data_path) as file: 28 | token_sequences = json.load(file) 29 | for seq in token_sequences: 30 | yield seq 31 | 32 | def analyze_histograms(all_tokens): 33 | total = sum(all_tokens.values()) 34 | sorted_pairs = all_tokens.most_common() 35 | percentages_to_cover = list(map(lambda x: x/100.0,range(1,100))) #[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99] 36 | nb_covered = 0 37 | pairs_covered = 0 38 | for pair in sorted_pairs: 39 | nb_covered += pair[1] 40 | pairs_covered += 1 41 | percentage_covered = (nb_covered * 1.0) / total 42 | done = False 43 | while not done and len(percentages_to_cover) > 0: 44 | next_percentage = percentages_to_cover[0] 45 | if percentage_covered >= next_percentage: 46 | print(str(pairs_covered) + " most frequent terms cover " + str(next_percentage) + " of all terms") 47 | percentages_to_cover = percentages_to_cover[1:] 48 | else: 49 | done = True 50 | 51 | covered_by_kept_tokens = 0 52 | for pair in sorted_pairs[:kept_tokens]: 53 | covered_by_kept_tokens += pair[1] 54 | perc_covered_by_kept_tokens = (covered_by_kept_tokens * 1.0) / 
total 55 | print("----") 56 | print(str(covered_by_kept_tokens) + " most frequent terms cover " + str(perc_covered_by_kept_tokens) + " of all terms") 57 | 58 | def save_tokens(encoded_tokens): 59 | time_stamp = math.floor(time.time() * 1000) 60 | file_name = "encoded_tokens_" + str(time_stamp) + ".json" 61 | with open(file_name, "w") as file: 62 | json.dump(encoded_tokens, file, indent=4) 63 | return file_name 64 | 65 | def save_token_numbers(token_to_number): 66 | time_stamp = math.floor(time.time() * 1000) 67 | file_name = "token_to_number_" + str(time_stamp) + ".json" 68 | with open(file_name, 'w') as file: 69 | json.dump(token_to_number, file, sort_keys=True, indent=4) 70 | 71 | unknown = "@@~UNKNOWN~@@" # represented by 0 72 | def frequent_tokens(counter, nb_tokens): 73 | token_to_number = dict() 74 | ctr = 1 # reserve 0 for "unknown" 75 | for pair in counter.most_common(nb_tokens): 76 | token_to_number[pair[0]] = ctr 77 | ctr += 1 78 | return token_to_number 79 | 80 | def encode(frequent_to_number, token): 81 | if token in frequent_to_number: 82 | return token 83 | else: 84 | return "UNK" 85 | 86 | def chunks(li, n): 87 | for i in range(0, len(li), n): 88 | yield li[i:i + n] 89 | 90 | if __name__ == '__main__': 91 | # arguments: 92 | 93 | all_raw_data_paths = list(map(lambda f: join(getcwd(), f), sys.argv[1:])) 94 | print("Total files: "+str(len(all_raw_data_paths))) 95 | 96 | # gather tokens (in parallel) 97 | def count_tokens(data_paths): 98 | print("Worker starting to read "+str(len(data_paths))+" files") 99 | reader = RawDataReader(data_paths) 100 | tokens = Counter() 101 | for token_seq in reader: 102 | for token in token_seq: 103 | tokens[token] += 1 104 | return tokens 105 | 106 | pool = Pool(processes=nb_processes) 107 | chunksize = round(len(all_raw_data_paths) / nb_processes) 108 | if chunksize == 0: 109 | chunksize = len(all_raw_data_paths) 110 | counters = pool.map(count_tokens, chunks(all_raw_data_paths, chunksize)) 111 | 112 | # merge counters that were gathered in parallel 113 | print("Merging counters...") 114 | all_tokens = Counter() 115 | for tokens in counters: 116 | all_tokens.update(tokens) 117 | print("Done with merging counters") 118 | 119 | # analyze histograms 120 | print() 121 | print("Unique tokens: " + str(len(all_tokens))) 122 | print(" " + "\n ".join(str(x) for x in all_tokens.most_common(20))) 123 | analyze_histograms(all_tokens) 124 | print() 125 | 126 | # replace infrequent tokens w/ placeholder and write number-encoded tokens + contexts to files 127 | frequent_tokens = frequent_tokens(all_tokens, kept_tokens) 128 | 129 | save_token_numbers(frequent_tokens) 130 | 131 | # parallelize the encoding 132 | def encode_tokens(data_paths): 133 | print("Data encoding worker called with "+str(len(data_paths))+" files") 134 | reader = RawDataReader(data_paths) 135 | token_ctr = 0 136 | all_encoded_seqs = [] 137 | for token_seq in reader: 138 | # replace infrequent tokens with "unknown" 139 | encoded_token_seq = [] 140 | for t in token_seq: 141 | encoded_token_seq.append(encode(frequent_tokens, t)) 142 | token_ctr += len(token_seq) 143 | all_encoded_seqs.append(encoded_token_seq) 144 | 145 | # occasionally save and forget (to avoid filling up all memory) 146 | if token_ctr > 1000000: 147 | file_name = save_tokens(all_encoded_seqs) 148 | print("Have written data to " + file_name) 149 | token_ctr = 0 150 | all_encoded_seqs = [] 151 | 152 | file_name = save_tokens(all_encoded_seqs) 153 | print("Have written data to " + file_name) 154 | 155 | print("Encoding data and 
written it to files...") 156 | pool = Pool(processes=nb_processes) 157 | pool.map(encode_tokens, chunks(all_raw_data_paths, chunksize)) 158 | 159 | print("Done") 160 | 161 | -------------------------------------------------------------------------------- /DeepBugs/python/Util.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 26, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | from scipy.spatial.distance import cosine 8 | import random 9 | import json 10 | 11 | def in_group_similarity(vector_group): 12 | vector_group = list(vector_group) 13 | in_group_simil = 0.0 14 | in_group_ctr = 0 15 | for i in range(0, len(vector_group)): 16 | vector1 = vector_group[i] 17 | for j in range(i+1, len(vector_group)): 18 | vector2 = vector_group[j] 19 | in_group_simil += (1 - cosine(vector1, vector2)) 20 | in_group_ctr += 1 21 | in_group_simil = in_group_simil / in_group_ctr 22 | return in_group_simil 23 | 24 | def out_group_similarity(vector_group, other_vectors): 25 | other_vectors = list(other_vectors) 26 | out_vectors = [] 27 | for _ in range(20): 28 | out_vectors.append(other_vectors[random.randint(0, len(other_vectors) - 1)]) 29 | out_group_simil = 0.0 30 | out_group_ctr = 0 31 | for vector1 in vector_group: 32 | for vector2 in out_vectors: 33 | out_group_simil += (1 - cosine(vector1, vector2)) 34 | out_group_ctr += 1 35 | out_group_simil = out_group_simil / out_group_ctr 36 | return out_group_simil 37 | 38 | class DataReader(object): 39 | def __init__(self, data_paths): 40 | self.data_paths = data_paths 41 | 42 | def __iter__(self): 43 | for data_path in self.data_paths: 44 | print("Reading file " + data_path) 45 | with open(data_path) as file: 46 | calls = json.load(file) 47 | for call in calls: 48 | yield call 49 | 50 | def analyze_histograms(counter): 51 | total = sum(counter.values()) 52 | sorted_pairs = counter.most_common() 53 | percentages_to_cover = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99] 54 | nb_covered = 0 55 | pairs_covered = 0 56 | for pair in sorted_pairs: 57 | nb_covered += pair[1] 58 | pairs_covered += 1 59 | percentage_covered = (nb_covered * 1.0) / total 60 | done = False 61 | while not done and len(percentages_to_cover) > 0: 62 | next_percentage = percentages_to_cover[0] 63 | if percentage_covered >= next_percentage: 64 | print(str(pairs_covered) + " most frequent terms cover " + str(next_percentage) + " of all terms") 65 | percentages_to_cover = percentages_to_cover[1:] 66 | else: 67 | done = True 68 | 69 | -------------------------------------------------------------------------------- /DeepBugs/python/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sola-st/SemSeed/278bf1ae3bb371bbe98965556d1fbb3a38b8c6f5/DeepBugs/python/__init__.py -------------------------------------------------------------------------------- /DeepBugs/python/create_dataset_from_seeded_bugs.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Created on 22-April-2020 4 | @author Michael Pradel 5 | 6 | Go through JSON files of a directory created by static analysis and map the positive and 7 | negative examples. 
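Each key of the resulting mapping is a code location of the form '<file path> : <line> - <line>', and its value separates the original code from the seeded variants, roughly like this (illustrative values, not real output):
    'benchmarks/data/1.js : 6 - 6' -> {'correct': [<extracted example>], 'incorrect': [<mutated example>, ...]}
Locations that end up without a correct or without an incorrect counterpart are dropped by process_bug_dataset(), which returns [correct_example, incorrect_example] pairs in the same layout as the dataset.json sample under __main__.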
8 | 9 | """ 10 | 11 | from typing import List, Dict, Union, Tuple 12 | from pathlib import Path 13 | import json 14 | import codecs 15 | from tqdm import tqdm 16 | import pandas as pd 17 | 18 | 19 | def read_file_content(file_path: Path) -> Union[List, Dict]: 20 | content = [] 21 | try: 22 | with codecs.open(str(file_path), 'r', encoding='utf-8') as f: 23 | c = f.read() 24 | content = json.loads(c) 25 | except FileNotFoundError: 26 | pass 27 | except ValueError: 28 | pass 29 | return content 30 | 31 | 32 | def read_create_dataset(in_dir: str) -> Dict: 33 | json_files = list(Path(in_dir).rglob(pattern='*.json')) 34 | 35 | bug_examples = {} 36 | # Each incorrect example will have a +ve (correct) and a possibly list of 37 | # -ve (incorrect) examples 38 | for file in tqdm(json_files, desc='Going through files'): 39 | extracted_data = read_file_content(file_path=file) 40 | tqdm.write(f"Current bug examples={len(bug_examples)}") 41 | for content in extracted_data: 42 | analysed_location = content['src'] 43 | if '_SEMSEED_MUTATED_' in analysed_location: 44 | bug_seeding_metadata = read_file_content(analysed_location.split(' :')[0] + 'on') 45 | # analysed_location.split(' :')[0] 46 | # Get the original file name and the location where the bug was seeded. Create an unique key 47 | file_name = bug_seeding_metadata['file_name_where_intended'] 48 | line = bug_seeding_metadata['target_line_range']['line'].split('-') 49 | line = ' - '.join(line) 50 | location_seeded_bug = file_name + ' : ' + line 51 | content['probability_that_incorrect'] = 1 52 | if location_seeded_bug not in bug_examples: 53 | bug_examples[location_seeded_bug] = { 54 | 'correct': [], 55 | 'incorrect': [] 56 | } 57 | bug_examples[location_seeded_bug]['incorrect'].append(content) 58 | else: 59 | if analysed_location not in bug_examples: 60 | bug_examples[analysed_location] = { 61 | 'correct': [], 62 | 'incorrect': [] 63 | } 64 | content['probability_that_incorrect'] = 0 65 | bug_examples[analysed_location]['correct'].append(content) 66 | return bug_examples 67 | 68 | 69 | def process_bug_dataset(bug_dataset: Dict) -> List[List]: 70 | """ 71 | There could be many examples where there is a positive example without a -ve 72 | example. 
Remove them and do other processing and return 73 | 74 | :param bug_dataset: 75 | :return: 76 | """ 77 | filtered_data = [] 78 | for file_path, data in tqdm(bug_dataset.items(), desc='Processing dataset'): 79 | if len(data['correct']) == 0 or len(data['incorrect']) == 0: 80 | continue 81 | # There could be multiple incorrect examples 82 | for ex in data['incorrect']: 83 | filtered_data.append([data['correct'][0], ex]) 84 | return filtered_data 85 | 86 | 87 | def filter_seeded_binOps(seeded_bugs: pd.DataFrame, seeded_bugs_binOps: pd.DataFrame): 88 | """ 89 | Given the binOps from seeded bugs, extract only those locations 90 | where required 91 | """ 92 | new_df = pd.DataFrame() 93 | i = 0 94 | for name, group in seeded_bugs_binOps.groupby('src', axis=0): 95 | i += 1 96 | if len(group) > 1: 97 | for _, row in group.iterrows(): 98 | f = row['src'].split(':')[0].lstrip().rstrip() 99 | bugs_seeded_to_this_file = seeded_bugs.loc[seeded_bugs['file_name_where_intended'] == f] 100 | if len(bugs_seeded_to_this_file): 101 | tok_seqs = bugs_seeded_to_this_file['target_token_sequence-Buggy'] 102 | for tk_seq in tok_seqs: 103 | if len(tk_seq) > 3: 104 | continue 105 | s = [row['left'], row['op'], row['right']] 106 | # print(s, tokens) 107 | # if s == tokens: 108 | # print("Done") 109 | # new_df.append(row) 110 | print(f) 111 | print(len(new_df)) 112 | 113 | 114 | if __name__ == '__main__': 115 | """ 116 | Before running me, first run extractFromJS once on the non seeded JS files and next on the bug-seeded 117 | JS files. This will create two separate JSON files in the 'benchmarks/binOps' directory. 118 | 119 | The current script will go through both JSON files and will map the correct code locations to 120 | buggy code locations and finally write all together to dataset.json 121 | 122 | One may create another script to split dataset.json to training and validation data as two separate JSON 123 | files required for running DeepBugs. 124 | 125 | Eg. 
dataset.json 126 | [ 127 | [ 128 | { 129 | "left": "ID:g", 130 | "right": "LIT:67", 131 | "op": ">", 132 | "leftType": "unknown", 133 | "rightType": "number", 134 | "parent": "IfStatement", 135 | "grandParent": "BlockStatement", 136 | "src": "benchmarks/data/data/1.js : 6 - 6", 137 | "probability_that_incorrect": 0 138 | }, 139 | { 140 | "left": "ID:g", 141 | "right": "LIT:67", 142 | "op": ">=", 143 | "leftType": "unknown", 144 | "rightType": "number", 145 | "parent": "IfStatement", 146 | "grandParent": "BlockStatement", 147 | "src": "benchmarks/js_benchmark_seeded_bugs/1_SEMSEED_MUTATED_1.js : 6 - 6", 148 | "probability_that_incorrect": 1 149 | } 150 | ] 151 | ] 152 | """ 153 | # data_binOps = pd.read_pickle('benchmarks/binOps_data.pkl', 'gzip')[:100] 154 | 155 | seeded_bugs = pd.read_pickle('benchmarks/seeded_bugs_wrong_binary_operand.pkl', 'gzip') 156 | seeded_bugs_binOps = pd.read_pickle('benchmarks/binOps_wrong_operand_withloc.pkl', 'gzip') 157 | 158 | filter_seeded_binOps(seeded_bugs, seeded_bugs_binOps) 159 | # with open('benchmarks/dataset.json', 'w') as d: 160 | # d.write(json.dumps(f)) 161 | -------------------------------------------------------------------------------- /DeepBugs/python/extract_from_js_parallel.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Created on 09-July-2020 4 | @author Michael Pradel 5 | 6 | Call 7 | 8 | 'node extractFromJS --file data/one.js' 9 | """ 10 | import os 11 | import subprocess 12 | from threading import Timer 13 | from tqdm import tqdm 14 | from multiprocessing import Pool, cpu_count 15 | from typing import List 16 | import random 17 | import codecs 18 | import json 19 | from pathlib import Path 20 | from collections import defaultdict 21 | 22 | random.seed(a=42) 23 | 24 | 25 | def extractFromJS(target_js_file_path: str, line_num: int) -> str: 26 | """ 27 | Prepare a JS file for seeding bugs by converting JS file to AST nodes. 28 | The functions creates a Nodejs process to extract the required data. 
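    In effect this spawns roughly the following command, where the module-level 'what' flag selects 'binOps' or 'assignments' (a sketch, not copied from a log):

        node javascript/extractFromJS.js <what> --file <target_js_file_path> <line_num>

    The child process is killed if it does not finish within 240 seconds.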
29 | :param target_js_file_path: The input JS file that will be converted to AST node representations 30 | :param out_json_file_path: 31 | :return: 32 | """ 33 | 34 | def kill_process(p): 35 | return p.kill() 36 | 37 | err_in_execution = False 38 | path_to_process = os.path.join(os.path.normpath( 39 | os.getcwd() + os.sep), 'javascript', 'extractFromJS.js') 40 | time_out_before_killing = 240 # seconds 41 | try: 42 | p = subprocess.Popen([ 43 | 'node', path_to_process, 44 | what, 45 | '--file', target_js_file_path, 46 | line_num 47 | ], 48 | stdout=subprocess.PIPE) 49 | time_out = Timer(time_out_before_killing, kill_process, [p]) 50 | try: 51 | time_out.start() 52 | stdout, stderr = p.communicate() 53 | tqdm.write(stdout.decode("utf-8")) 54 | if stderr: 55 | err_in_execution = stderr.decode("utf-8") 56 | # tqdm.write(err_in_execution) 57 | finally: 58 | time_out.cancel() 59 | except subprocess.TimeoutExpired: 60 | pass 61 | return err_in_execution 62 | 63 | 64 | def remove_duplicates(file_list: List, duplicate_file_groups: List) -> List: 65 | """ 66 | Given a list of files, and known duplicates, keep only one of the duplicates 67 | :param duplicate_file_groups: 68 | :param file_list: 69 | :return: 70 | """ 71 | dup_files = set() 72 | for file_group in duplicate_file_groups: 73 | # Except the first file rest are all duplicates 74 | dup_files.update(file_group[1:]) 75 | 76 | files_without_duplicates = [] 77 | # Now, we remove the known duplicates 78 | root_dir = '/data/' 79 | # dup_files = set([os.path.join(root_dir, fp) for fp in dup_files]) 80 | for fl_path in file_list: 81 | if fl_path.split(root_dir)[1] not in dup_files: 82 | files_without_duplicates.append(fl_path) 83 | return files_without_duplicates 84 | 85 | 86 | def read_json_file(json_file_path): 87 | try: 88 | obj_text = codecs.open(json_file_path, 'r', encoding='utf-8').read() 89 | return json.loads(obj_text) 90 | except FileNotFoundError: 91 | print(f"*** Can't find {json_file_path} provide a correct path") 92 | return {} 93 | except Exception as e: 94 | # Empty JSON file most likely due to abrupt killing of the process while writing 95 | # print (e) 96 | return {} 97 | 98 | 99 | def add_required_line_number(file_path): 100 | """ 101 | Add the required line number where bug was seeded 102 | :return: 103 | """ 104 | file_path = str(file_path) 105 | seeded_bug_info = read_json_file(file_path + 'on') 106 | line = seeded_bug_info["target_line_range"]["line"] 107 | return file_path, line 108 | 109 | 110 | def extractFromJS_multi(arg): 111 | if isinstance(arg, tuple): 112 | file, loc = arg 113 | else: 114 | file = arg 115 | loc = "null" 116 | 117 | extractFromJS(target_js_file_path=file, line_num=loc) 118 | 119 | 120 | def semseed_seeded_extraction(in_dir, what): 121 | print(f"Reading files from {in_dir}") 122 | js_files = list(Path(in_dir).rglob('*.js')) 123 | js_files = [f for f in js_files if Path(f).is_file()] 124 | if in_dir.endswith('/data'): 125 | js_files = [str(f) for f in js_files] 126 | print(" Removing duplicates from {} files in benchmarks".format(len(js_files))) 127 | duplicate_file_groups = read_json_file('benchmarks/js150-duplicates.json') 128 | js_files = remove_duplicates(file_list=js_files, duplicate_file_groups=duplicate_file_groups) 129 | else: 130 | print("Adding line numbers to extract") 131 | line_out_file = f'benchmarks/files_and_line_numbers_wrong_{what}.json' 132 | if Path(line_out_file).is_file(): 133 | print("Reading from pre-computed") 134 | with open(line_out_file, 'r') as f: 135 | js_file_with_lines = 
json.load(f) 136 | js_files = [(f, line[0]) for f, line in 137 | js_file_with_lines.items()] # every file is unique and has only one line 138 | else: 139 | js_file_with_lines = defaultdict(list) 140 | with Pool(cpu_count() // 2) as p: 141 | with tqdm(total=len(js_files)) as pbar: 142 | pbar.set_description_str(desc="Adding line numbers", refresh=False) 143 | for _, files_and_lines in enumerate(p.imap_unordered(add_required_line_number, js_files, 10)): 144 | js_file_with_lines[files_and_lines[0]].append(files_and_lines[1]) 145 | # print(files_and_lines) 146 | pbar.update() 147 | p.close() 148 | p.join() 149 | with open(line_out_file, 'w+') as o: 150 | json.dump(js_file_with_lines, o) 151 | js_files = [(f, line[0]) for f, line in 152 | js_file_with_lines.items()] 153 | # js_files = [(f,l) for f, l in js_files if 'elastic_SEMSEED' in f] 154 | # random.shuffle(js_files) 155 | # js_files = js_files[:10] 156 | with Pool(processes=cpu_count()) as p: 157 | with tqdm(total=len(js_files)) as pbar: 158 | pbar.set_description_str( 159 | desc="Extract from JS", refresh=False) 160 | for i, execution_errors in tqdm( 161 | enumerate(p.imap_unordered(extractFromJS_multi, 162 | js_files, chunksize=10))): 163 | # ex_errors.append(execution_errors) 164 | pbar.update() 165 | p.close() 166 | p.join() 167 | # print(ex_errors) 168 | 169 | 170 | def real_bugs_GitHub_extraction(in_dir): 171 | js_files = list(Path(in_dir).rglob('*.js')) 172 | js_files = [str(f) for f in js_files if Path(f).is_file()] 173 | 174 | js_files_with_lines = [] 175 | for file_path in tqdm(js_files, desc='Adding lines'): 176 | file_name = os.path.basename(file_path) 177 | line = file_name.split('_')[2] 178 | js_files_with_lines.append((file_path, f'{line}-{line}')) 179 | with Pool(processes=cpu_count()) as p: 180 | with tqdm(total=len(js_files)) as pbar: 181 | pbar.set_description_str( 182 | desc="Extract from JS", refresh=False) 183 | for i, execution_errors in tqdm( 184 | enumerate(p.imap_unordered(extractFromJS_multi, 185 | js_files_with_lines, chunksize=10))): 186 | # print(execution_errors) 187 | # ex_errors.append(execution_errors) 188 | pbar.update() 189 | p.close() 190 | p.join() 191 | 192 | 193 | if __name__ == '__main__': 194 | what = ['binOps', 'assignments'][1] 195 | in_dir = \ 196 | ['benchmarks/js_benchmark_seeded_bugs_wrong_assignment', 197 | 'benchmarks/js_benchmark_seeded_bugs_wrong_binop_operand', 198 | 'benchmarks/real_bugs_github', 199 | 'benchmarks/data'][2] 200 | # semseed_seeded_extraction(in_dir, what) 201 | real_bugs_GitHub_extraction(in_dir) 202 | -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | # Install 2 | 3 | Install **Node.js** and the required packages: 4 | 5 | ````shell 6 | # You may install Node.js using nvm : https://github.com/nvm-sh/nvm 7 | wget -qO- https://raw.githubusercontent.com/nvm-sh/nvm/v0.38.0/install.sh | bash 8 | source ~/.bashrc 9 | 10 | # Install Node.js 14 11 | nvm install 14.17.0 12 | # Install the required Node.js packages 13 | npm install 14 | ```` 15 | 16 | Create a virtual environment for **Python** and install the required packages: 17 | 18 | ````shell 19 | sudo apt install -y python3-dev # required for the 'fasttext' package 20 | sudo apt install -y python3-venv 21 | 22 | # Create a virtual environment 23 | python3 -m venv semseed_venv 24 | # Activate the virtual environment 25 | source semseed_venv/bin/activate 26 | # Install the required Python packages 
27 | pip install -r requirements.txt 28 | ```` 29 | 30 | We provide pre-trained token embeddings trained using fastText (https://fasttext.cc). The training has been performed 31 | using JavaScript files obtained from https://www.sri.inf.ethz.ch/js150. 32 | 33 | Install **MongoDB** 34 | 35 | ````shell 36 | # Install MongoDB Community Edition on Ubuntu 20.04 37 | # Documentation -> https://docs.mongodb.com/manual/tutorial/install-mongodb-on-ubuntu 38 | 39 | wget -qO - https://www.mongodb.org/static/pgp/server-4.4.asc | sudo apt-key add - 40 | echo "deb [ arch=amd64,arm64 ] https://repo.mongodb.org/apt/ubuntu focal/mongodb-org/4.4 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-4.4.list 41 | sudo apt-get update 42 | sudo apt-get install -y mongodb-org 43 | 44 | # Once installation has finished start MongoDB 45 | sudo systemctl start mongod 46 | ```` 47 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright 2021 Software Lab at University of Stuttgart 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 10 | 11 | -------------------------------------------------------------------------------- /REQUIREMENTS.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sola-st/SemSeed/278bf1ae3bb371bbe98965556d1fbb3a38b8c6f5/REQUIREMENTS.md -------------------------------------------------------------------------------- /bug_seeding/bug_seeding_approaches/SeedBugs.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Created on 24-March-2020 4 | @author Jibesh Patra 5 | 6 | """ 7 | from abc import ABC, abstractmethod 8 | from typing import List, Tuple 9 | 10 | 11 | class SeedBugs(ABC): 12 | def __init__(self, bug_seeding_pattern: dict, target_location: dict, file_path: str): 13 | # Stuffs about the bug. Eg. Buggy, Correct, Surrounding tokens, Usages Identifiers, Literals etc. 
14 | 15 | self.bug_metadata = { 16 | 'file_name_where_intended': file_path, 17 | "target_token_sequence-Correct": target_location['tokens'], # Abstract token sequence that will be mutated 18 | "target_token_sequence-Buggy": [], # Concrete token sequence generated after mutation 19 | "token_sequence_abstraction-Correct": target_location['abstractedTokens'], 20 | "token_sequence_abstraction-Buggy": [], 21 | "target_line_range": {'line': target_location['line'], 'range': target_location['range']}, 22 | "num_of_available_identifiers_to_choose_from": 0, 23 | "num_of_available_literals_to_choose_from": 0, 24 | "error": False 25 | } 26 | self.bug_seeding_pattern = bug_seeding_pattern 27 | self.target_location = target_location 28 | 29 | @abstractmethod 30 | def is_matching_token_sequence(self) -> bool: 31 | """ 32 | For a 'syntactic' match check, this will return True if the 33 | token sequence in abstracted form match. 34 | 35 | For a 'semantic' matching this will depend on the cosine distance of the 36 | embedding of the tokens along with the threshold. 37 | :return: 38 | """ 39 | raise NotImplementedError 40 | 41 | @abstractmethod 42 | def apply_pattern(self) -> List[List]: 43 | """ 44 | Seed a bug by applying a given pattern 45 | :return: 46 | """ 47 | raise NotImplementedError 48 | 49 | def extract_tokens_of_kinds(self, given_token_seq: List[str]) -> Tuple[List, List, List]: 50 | try: 51 | assert len(given_token_seq) == len(self.target_location['abstractedTokens']) 52 | except AssertionError as e: 53 | print("The lengths of these token sequences should be same") 54 | 55 | tokens = [] 56 | idf_tokens = [] 57 | lit_tokens = [] 58 | 59 | idf_prefix = 'Idf_' 60 | lit_prefix = 'Lit_' 61 | 62 | for i, abs_tok in enumerate(self.target_location['abstractedTokens']): 63 | concrete_token = given_token_seq[i] 64 | if abs_tok.startswith(idf_prefix) or abs_tok.startswith(lit_prefix): 65 | tokens.append(concrete_token) 66 | if abs_tok.startswith(idf_prefix): 67 | idf_tokens.append(concrete_token) 68 | elif abs_tok.startswith(lit_prefix): 69 | lit_tokens.append(concrete_token) 70 | return tokens, idf_tokens, lit_tokens 71 | 72 | def replace_target_with_mutated_token_sequence(self, token_list: List, token_range_list: List, 73 | mutated_token_sequence: List) -> List: 74 | """ 75 | Once the mutated token sequence has been found replace the target token sequence with this new 76 | :param token_list: The complete list of the token in the file 77 | :param token_range_list: The ranges of each token contained in the token list 78 | :param mutated_token_sequence: The token sequence that will be inserted to seed bugs 79 | :return: Token sequence after seeding the bug 80 | """ 81 | 82 | assert len(token_list) == len(token_range_list) 83 | 84 | start_range = self.target_location["range"][0] 85 | end_range = self.target_location["range"][1] 86 | 87 | indices_to_remove = [i for i, rng in enumerate(token_range_list) if int(rng.split( 88 | '-')[0]) >= start_range and int(rng.split('-')[1]) <= end_range] 89 | 90 | part1 = token_list[:indices_to_remove[0]] 91 | part2 = token_list[indices_to_remove[-1] + 1:] 92 | 93 | token_list_after_seeding = part1 + mutated_token_sequence + part2 94 | assert len(token_list_after_seeding) == len(token_list) - len(self.target_location['tokens']) + len( 95 | mutated_token_sequence) 96 | return token_list_after_seeding 97 | 98 | def get_abstract_token_to_concrete_mapping(self) -> dict: 99 | """ 100 | This creates a mapping of the abstract token to its actual value 101 | Eg. 
'Idf_1' -> 'a' 102 | """ 103 | mappings = {} 104 | for i, abstract_tok in enumerate(self.target_location['abstractedTokens']): 105 | if not abstract_tok.startswith('Idf_') and not abstract_tok.startswith('Lit_'): 106 | continue 107 | mappings[abstract_tok] = self.target_location['tokens'][i] 108 | return mappings 109 | 110 | def write_bug_seeded_file(self): 111 | pass 112 | 113 | def __call__(self, *args, **kwargs): 114 | pass 115 | -------------------------------------------------------------------------------- /bug_seeding/bug_seeding_approaches/Syntactic/SyntacticSeedBugs.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Created on 25-March-2020 4 | @author Jibesh Patra 5 | 6 | """ 7 | 8 | from bug_seeding_approaches.SeedBugs import SeedBugs 9 | from typing import List 10 | 11 | 12 | class SyntacticSeedBugs(SeedBugs): 13 | def __init__(self, bug_seeding_pattern: dict, target_location: dict, file_path: str): 14 | super().__init__(bug_seeding_pattern, target_location, file_path) 15 | 16 | def is_matching_token_sequence(self) -> bool: 17 | target = self.target_location 18 | seeding_pattern = self.bug_seeding_pattern 19 | 20 | # We only need to check syntactic matches 21 | if target['abstractedTokens'] != seeding_pattern['fix']: 22 | return False 23 | else: 24 | return True 25 | 26 | def apply_pattern(self) -> List[List]: 27 | 28 | return [] 29 | -------------------------------------------------------------------------------- /bug_seeding/obtain_bug_seeding_patterns/extract_bug_seeding_patterns_from_repos/aggregateChanges.py: -------------------------------------------------------------------------------- 1 | """ 2 | @author Jibesh Patra 3 | 4 | Aggregate the patterns and write to a JSON files. 5 | """ 6 | 7 | from utils import fileutils as fs 8 | 9 | 10 | def write_bug_seeding_patterns_to_file(agg_data_out_file): 11 | from database import GitHubCommits as db 12 | abstract_changes = list(db.Commits.objects.get_abstracted_changes()) 13 | changes_across_all_repos = [] 14 | 15 | for change_summary in abstract_changes: 16 | change_summary['commit_time'] = change_summary['commit_time'].strftime("%d/%m/%Y, %H:%M:%S") 17 | cfx_actual = [str(e) for e in change_summary['fix_actual']] 18 | change_summary['fix_actual'] = cfx_actual 19 | 20 | cfb_actual = [str(e) for e in change_summary['buggy_actual']] 21 | change_summary['buggy_actual'] = cfb_actual 22 | 23 | changes_across_all_repos.append(change_summary) 24 | 25 | print(f'Writing data to {agg_data_out_file}') 26 | fs.writeJSONFile(changes_across_all_repos, agg_data_out_file) 27 | 28 | 29 | if __name__ == "__main__": 30 | write_bug_seeding_patterns_to_file(agg_data_out_file='benchmarks/bug_seeding_patterns.json') 31 | -------------------------------------------------------------------------------- /bug_seeding/obtain_bug_seeding_patterns/extract_bug_seeding_patterns_from_repos/callNodeJSExtractData.py: -------------------------------------------------------------------------------- 1 | """ 2 | @author Jibesh Patra 3 | """ 4 | 5 | import subprocess 6 | from threading import Timer 7 | import os 8 | import json 9 | import multiprocessing 10 | from multiprocessing import Pool 11 | from tqdm import tqdm 12 | 13 | 14 | def callNodeJS(argument): 15 | ''' 16 | Call Node.js for each commit and create patterns. 
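    Per commit id this is roughly equivalent to running (sketch):

        node bug_seeding/obtain_bug_seeding_patterns/extract_bug_seeding_patterns_from_repos/python_calls_me_to_extract_patterns.js -commitId <commit_id>

    The child process is killed if it runs longer than 180 seconds (3 minutes).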
17 | @param argument: Each argument is a commit id 18 | @return: 19 | ''' 20 | path_to_process = os.path.join(os.path.normpath( 21 | os.getcwd() + os.sep), 'bug_seeding', 'obtain_bug_seeding_patterns', 'extract_bug_seeding_patterns_from_repos', 22 | 'python_calls_me_to_extract_patterns.js') 23 | time_out_before_killing = 180 # seconds 180 -> 3 minutes 24 | try: 25 | def kill_process(p): 26 | return p.kill() 27 | 28 | p = subprocess.Popen(['node', path_to_process, '-commitId', argument], 29 | stdout=subprocess.PIPE) 30 | time_out = Timer(time_out_before_killing, kill_process, [p]) 31 | try: 32 | time_out.start() 33 | stdout, stderr = p.communicate() 34 | # print(stdout, stderr) 35 | finally: 36 | time_out.cancel() 37 | except subprocess.TimeoutExpired: 38 | # p.kill() 39 | pass 40 | 41 | 42 | def create_patterns_from_commits(select_num_of_commits=-1): 43 | ''' 44 | Query the MongoDB database and select only those commits (commit_ids) where the number of files 45 | changed is one and the changes are single line changes. 46 | 47 | Next, the CallNodeJS for only those commits and create patterns. 48 | 49 | @param select_num_of_commits: -1 means select all commits. 50 | @return: 51 | ''' 52 | from database import GitHubCommits as db 53 | 54 | # query filters 55 | num_of_files_changed = 1 56 | num_single_line_changes = 1 57 | query_obj = db.Commits.objects( 58 | num_files_changed=num_of_files_changed, num_single_line_changes=num_single_line_changes) 59 | print('Found %d records that has only %d file change and only %d single line change' % 60 | (query_obj.count(), num_of_files_changed, num_single_line_changes)) 61 | pks = json.loads(query_obj.only('pk').to_json()) # get only the primary keys 62 | 63 | # Now put all primary keys in a list. 64 | # The primary keys are nothing but commit hashes concatenated with the repository 65 | commit_ids = [] 66 | 67 | for pk in pks: 68 | commit_ids.append(pk['_id']) 69 | 70 | if select_num_of_commits > 0: 71 | print("Selecting only %d commits of %d available commits" % 72 | (select_num_of_commits, len(commit_ids))) 73 | commit_ids = commit_ids[:select_num_of_commits] 74 | 75 | # Parallel execution 76 | with Pool(processes=multiprocessing.cpu_count()) as p: 77 | with tqdm(total=len(commit_ids)) as pbar: 78 | pbar.set_description_str( 79 | desc="Extracting Patterns ", refresh=False) 80 | for i, _ in tqdm(enumerate(p.imap_unordered(callNodeJS, commit_ids))): 81 | pbar.update() 82 | p.close() 83 | p.join() 84 | -------------------------------------------------------------------------------- /bug_seeding/obtain_bug_seeding_patterns/extract_bug_seeding_patterns_from_repos/database/GitHubCommits.py: -------------------------------------------------------------------------------- 1 | """ 2 | @author Jibesh Patra 3 | """ 4 | 5 | from mongoengine import * 6 | import json 7 | import codecs 8 | 9 | 10 | def read_config(json_file_path): 11 | try: 12 | obj_text = codecs.open(json_file_path, 'r', encoding='utf-8').read() 13 | return json.loads(obj_text) 14 | except FileNotFoundError: 15 | print(f"*** Can't find {json_file_path} provide a correct path") 16 | return [] 17 | except Exception as e: 18 | # Empty JSON file most likely due to abrupt killing of the process while writing 19 | # print (e) 20 | return [] 21 | 22 | 23 | db_config = read_config(json_file_path='database_config.json') 24 | 25 | connect(db_config['database_name'], username=db_config['username'], password=db_config['password'], 26 | authentication_source='admin', host=db_config['host'], 
port=db_config['port']) 27 | 28 | 29 | class QueryChanges(QuerySet): 30 | def get_abstracted_changes(self): 31 | pipeline = [ 32 | { 33 | '$project': { 34 | 'single_line_changes': True, 35 | 'num_files_changed': True, 36 | 'num_single_line_changes': True, 37 | 'commit_hash': True, 38 | 'url': True, 39 | 'commit_time': True, 40 | 'local_repo_path': True 41 | } 42 | }, { 43 | '$match': { 44 | 'num_files_changed': 1, 45 | 'num_single_line_changes': 1 46 | } 47 | }, { 48 | '$unwind': { 49 | 'path': '$single_line_changes', 50 | 'preserveNullAndEmptyArrays': False 51 | } 52 | }, { 53 | '$match': { 54 | 'single_line_changes.analysis_report': 'success' 55 | } 56 | }, { 57 | '$addFields': { 58 | 'fix': '$single_line_changes.change_summary.fix', 59 | 'fix_tokenType': '$single_line_changes.new_file.change_analysis.type', 60 | 'fix_file_path': '$single_line_changes.new_file.path', 61 | 'fix_actual': '$single_line_changes.new_file.change_analysis.tokens', 62 | 'fix_range': '$single_line_changes.new_file.change_analysis.range', 63 | 'fix_line': '$single_line_changes.new_file.change_analysis.line', 64 | 'buggy': '$single_line_changes.change_summary.buggy', 65 | 'buggy_tokenType': '$single_line_changes.old_file.change_analysis.type', 66 | 'buggy_file_path': '$single_line_changes.old_file.path', 67 | 'buggy_actual': '$single_line_changes.old_file.change_analysis.tokens', 68 | 'buggy_range': '$single_line_changes.old_file.change_analysis.range', 69 | 'buggy_line': '$single_line_changes.old_file.change_analysis.line' 70 | } 71 | }, { 72 | '$project': { 73 | 'buggy': True, 74 | 'buggy_actual': True, 75 | 'buggy_file_path': True, 76 | 'buggy_tokenType': True, 77 | 'buggy_range': True, 78 | 'buggy_line': True, 79 | 'fix': True, 80 | 'fix_tokenType': True, 81 | 'fix_actual': True, 82 | 'fix_file_path': True, 83 | 'fix_range': True, 84 | 'fix_line': True, 85 | 'commit_time': True, 86 | 'local_repo_path': True, 87 | 'lessthanX_fix': { 88 | '$lt': [ 89 | { 90 | '$size': '$fix' 91 | }, 20 92 | ] 93 | }, 94 | 'lessthanX_buggy': { 95 | '$lt': [ 96 | { 97 | '$size': '$buggy' 98 | }, 20 99 | ] 100 | }, 101 | 'commit_hash': True, 102 | 'url': True 103 | } 104 | }, { 105 | '$match': { 106 | 'lessthanX_fix': True, 107 | 'lessthanX_buggy': True 108 | } 109 | }, { 110 | '$project': { 111 | 'lessthanX_fix': False, 112 | 'lessthanX_buggy': False 113 | } 114 | }, { 115 | '$sort': { 116 | 'commit_time': 1 117 | } 118 | } 119 | ] 120 | return self().aggregate(*pipeline) 121 | 122 | def get_fix_and_buggy_tokens(self, id_h): 123 | q = list(self(pk=id_h).only( 124 | 'single_line_changes.old_file.change_analysis.tokens', 125 | 'single_line_changes.new_file.change_analysis.tokens')) 126 | fixed_tokens = q[0]['single_line_changes'][0]['new_file']['change_analysis']['tokens'] 127 | buggy_tokens = q[0]['single_line_changes'][0]['old_file']['change_analysis']['tokens'] 128 | return {'actual_buggy_tokens': buggy_tokens, 'actual_fixed_tokens': fixed_tokens} 129 | 130 | 131 | class Commits(Document): 132 | commit_id = StringField(primary_key=True) 133 | commit_hash = StringField(required=True) 134 | commit_message = StringField(required=True) 135 | commit_time = DateTimeField() 136 | 137 | local_repo_path = StringField() 138 | parent_hash = StringField() 139 | url = URLField() 140 | 141 | num_files_changed = IntField() 142 | 143 | single_line_changes = ListField(DictField(DictField())) 144 | num_single_line_changes = IntField() 145 | meta = {'queryset_class': QueryChanges} 146 | 
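Note: the connect(...) call above reads its connection parameters from database_config.json in the repository root. A minimal sketch of that file, with placeholder values rather than the actual configuration shipped with the repository, looks like:

    {
        "database_name": "<database-name>",
        "username": "<mongodb-user>",
        "password": "<mongodb-password>",
        "host": "localhost",
        "port": 27017
    }

(27017 is simply the MongoDB default port; adjust all values to your own setup.)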
-------------------------------------------------------------------------------- /bug_seeding/obtain_bug_seeding_patterns/extract_bug_seeding_patterns_from_repos/extractNodeData.js: -------------------------------------------------------------------------------- 1 | const ExtractSingleLineChangedNodes = require('./analyses/ExtractDataGivenNodes').ExtractSingleLineChangedNodes; 2 | const fs = require('fs'); 3 | const assert = require('assert'); 4 | const format_file = require('./utils/format_a_js_file').formatJSfile; 5 | 6 | function analyseCode(code, nonTrackingNodes, trackingNodes) { 7 | if (code.length === 0) return {}; 8 | assert.ok(Array.isArray(nonTrackingNodes)); // Should be an Array 9 | assert.ok(nonTrackingNodes.length > 0); // It should have atleast one Node to track 10 | 11 | // Number of tokens to extract around each Identifier and also the number of tokens 12 | // to extract around each point of interest Eg. conditional test 13 | let config = { 14 | num_tokens_aroundIdf: 3, 15 | num_tokens_around_point_of_interest: 5 16 | }; 17 | 18 | // ------------------------------ Extract data given some specific nodes ------------ 19 | try { 20 | // Extract data of every node apart from Identifier or Literal 21 | let dataFromSpecificNodes = new ExtractSingleLineChangedNodes(code, nonTrackingNodes, trackingNodes); 22 | 23 | let analysedCode = dataFromSpecificNodes.goThroughASTExtractSpecificNodes(); 24 | for (let dt of analysedCode) { 25 | // For now, we do not need Identifier and the Context for the token sequence 26 | // dataFromSpecificNodes.addIdentifiersAndContext(dt, config); 27 | dataFromSpecificNodes.abstractIdentifiers(dt); 28 | dataFromSpecificNodes.abstractLiterals(dt); 29 | } 30 | 31 | let map_to_obj = (range_to_tok => { 32 | const obj = {}; 33 | range_to_tok.forEach((v, k) => { 34 | obj[k] = v['value'] 35 | }); 36 | return obj; 37 | }); 38 | return { 39 | 'nodes': analysedCode, 40 | 'functions_to_identifiers': dataFromSpecificNodes.scopeToIdentifier, 41 | 'functions_to_literals': dataFromSpecificNodes.scopeToLiteral, 42 | 'tokenList': dataFromSpecificNodes.tokenList.filter(value => value !== null), 43 | 'tokenRangesList': dataFromSpecificNodes.tokenRangesList, 44 | 'range_to_identifier': map_to_obj(dataFromSpecificNodes.rangeToIdentifier), 45 | 'range_to_literal': map_to_obj(dataFromSpecificNodes.rangeToLiteral) 46 | }; 47 | } catch (e) { 48 | return e; 49 | } 50 | 51 | } 52 | 53 | function extractNodeData(inFile, outFile) { 54 | // First format the file 55 | format_file(inFile); 56 | let code = ''; 57 | try { 58 | code = fs.readFileSync(inFile, 'utf8'); 59 | } catch (e) { 60 | return e; 61 | } 62 | 63 | let extractedData = analyseCode(code, ['Identifier', 'Literal'], ['BinaryExpression']); 64 | if (Object.keys(extractedData).length !== 0) { 65 | try { 66 | extractedData['file_path'] = inFile; 67 | // if (fs.existsSync(outFile)) { 68 | // let knownLocationOfInterest = JSON.parse(fs.readFileSync(outFile, 'utf8')); 69 | // if (knownLocationOfInterest && knownLocationOfInterest.hasOwnProperty('line')) { 70 | // extractedData.line = knownLocationOfInterest.line; 71 | // } 72 | // } 73 | if (fs.existsSync(outFile)) { 74 | let random_num = Math.floor(Math.random() * 100000); 75 | outFile = outFile.replace('.js', '_' + random_num + '.js') 76 | } 77 | fs.writeFileSync(outFile, JSON.stringify(extractedData)); 78 | } catch (err) { 79 | return err; 80 | } 81 | 82 | } 83 | 84 | // console.log(outFile); 85 | // console.log(extractedData); 86 | } 87 | 88 | 89 | function 
parse_cli_arguments() { 90 | const ArgumentParser = require('argparse').ArgumentParser; 91 | let parser = new ArgumentParser({ 92 | version: '0.0.1', 93 | addHelp: true, 94 | description: 'Go through the JS file and extract \'only\' certain the nodes' 95 | }); 96 | 97 | // -------------------------- Debug --------------------- 98 | parser.addArgument( 99 | ['-inFile'], 100 | {help: 'Specify the source file from which the data needs to be extracted'}); 101 | parser.addArgument( 102 | ['-outFile'], 103 | {help: 'Specify the file where the extracted data will be written'}); 104 | let args = parser.parseArgs(); 105 | return { 106 | 'inFile': args.inFile, 107 | 'outFile': args.outFile 108 | } 109 | } 110 | 111 | 112 | ( 113 | async function () { 114 | let { 115 | inFile, outFile 116 | } = parse_cli_arguments(); 117 | extractNodeData(inFile, outFile); 118 | } 119 | )(); 120 | 121 | module.exports.analyseCode = analyseCode; 122 | -------------------------------------------------------------------------------- /bug_seeding/obtain_bug_seeding_patterns/extract_bug_seeding_patterns_from_repos/utils/fileoperations.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Contains utility methods for file and folder operations 3 | */ 4 | const fs = require('fs'); 5 | const path = require('path'); 6 | const assert = require('assert'); 7 | 8 | /** 9 | * Check if the p is a directory 10 | * @param {string} path 11 | * @returns {boolean} 12 | */ 13 | function isDir(path) { 14 | try { 15 | fs.accessSync(path, fs.constants.R_OK); 16 | } catch (err) { 17 | console.error(`\n\n##### ----- No access to ${path} ----- ######\n\n`); 18 | return false; 19 | } 20 | return fs.statSync(path).isDirectory(); 21 | } 22 | 23 | /** 24 | * Check if a file exists and is available to be read 25 | * @param {string} filePath 26 | * @returns {boolean} 27 | */ 28 | function available(filePath) { 29 | return fs.existsSync(filePath); 30 | try { 31 | fs.accessSync(path, fs.constants.R_OK); 32 | } catch (err) { 33 | return false; 34 | } 35 | 36 | return true; 37 | } 38 | 39 | /** 40 | * Given a p, returns the extension of the p in lowercase 41 | * @param {string} filePath 42 | * @returns {string} 43 | */ 44 | function getExtension(filePath) { 45 | assert.ok(!isDir(filePath)); 46 | return path.extname(filePath).toLowerCase(); 47 | } 48 | 49 | /** 50 | * Read a file and returns the content of the file. If the filetype 51 | * is JSON then also converts to a JSON object. 
52 | * If there is parsing error or for some reason, the file could not be read 53 | * then returns null 54 | * @param {string} filePath Path of the file that needs to be read 55 | * @returns {(null|string|Object.)} 56 | */ 57 | function getFileContent(filePath) { 58 | let content; 59 | // assert.ok(!isDir(filePath)); 60 | // assert.ok(available(filePath)); 61 | if (isDir(filePath) || !available(filePath)) 62 | return null; 63 | try { 64 | content = fs.readFileSync(filePath, 'utf8'); 65 | if (getExtension(filePath) === '.json') content = JSON.parse(content); 66 | } catch (error) { 67 | content = null; 68 | } 69 | 70 | return content; 71 | } 72 | 73 | /** 74 | * Check if a file is accessible and then returns the size 75 | * of the file in bytes 76 | * @param {string} filePath 77 | * @returns {number} 78 | */ 79 | function getFileSize(filePath) { 80 | assert.ok(available(filePath)); 81 | return fs.statSync(filePath).size; 82 | } 83 | 84 | /** 85 | * Go through a directory and all sub directories and create a list 86 | * links to the files in the particular directory. 87 | * @param {string} dirPath Initial p of a directory 88 | * @param {string} fileExtension Types of file 89 | * @returns {Array.} List of files 90 | */ 91 | function createLinksOfFiles(dirPath, fileExtension) { 92 | /** 93 | * @type{string[]} 94 | */ 95 | let fileList = []; 96 | let folderToTraverse = [dirPath]; 97 | if (!fileExtension) throw 'Need extension of file that will be filtered'; 98 | while (folderToTraverse.length !== 0) { 99 | let currentFolder = folderToTraverse.pop(); 100 | let list_of_files_and_folders = fs.readdirSync(currentFolder); 101 | list_of_files_and_folders.forEach((f_path) => { 102 | let complete_path = path.join(currentFolder, f_path); 103 | if (isDir(complete_path)) 104 | folderToTraverse.push(complete_path); 105 | else if (getExtension(complete_path) === fileExtension) 106 | fileList.push(complete_path); 107 | }); 108 | } 109 | return fileList; 110 | } 111 | 112 | module.exports.available = available; 113 | module.exports.getFileContent = getFileContent; 114 | module.exports.getExtension = getExtension; 115 | module.exports.getFileSize = getFileSize; 116 | module.exports.createLinksOfFiles = createLinksOfFiles; -------------------------------------------------------------------------------- /bug_seeding/obtain_bug_seeding_patterns/extract_bug_seeding_patterns_from_repos/utils/format_a_js_file.js: -------------------------------------------------------------------------------- 1 | /** 2 | 3 | Created on 16-April-2020 4 | @author Jibesh Patra 5 | 6 | Given a JavaScript file, remove comments and format it. The reason we do this is to 7 | seed bugs to a known format so that we can map the locations of the seeded bug later. 8 | 9 | Since we use a token sequence to seed bugs, comments and spaces in code messes up the 10 | location of the seeded bug. Once we re-generate the file with seeded bug, the original 11 | locations where the bug was seeded is hard to map back. 
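As a rough illustration (not taken from the benchmarks): a line such as `var x=1; // counter` comes out roughly as `var x = 1;` — comments are stripped, the code is re-tokenized, and the tokens are re-joined and beautified, so every token ends up at a predictable position that can be mapped back after a bug has been seeded.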
12 | **/ 13 | 14 | 15 | const fs = require('fs'); 16 | const fileops = require('./fileoperations'); 17 | const ArgumentParser = require('argparse').ArgumentParser; 18 | const esprima = require('esprima'); 19 | const escodegen = require('escodegen'); 20 | const beautify = require('js-beautify').js, 21 | strip = require('strip-comments'); 22 | const UglifyJS = require("uglify-js"); 23 | 24 | function parse_cli_arguments() { 25 | let parser = new ArgumentParser({ 26 | version: '0.0.1', 27 | addHelp: true, 28 | description: 'Take a JS file and format it and remove comments' 29 | }); 30 | 31 | parser.addArgument( 32 | ['-inFile'], { 33 | help: 'The JavaScript file that needs to be formatted' 34 | }); 35 | let args = parser.parseArgs(); 36 | return { 37 | 'inFile': args.inFile, 38 | } 39 | } 40 | 41 | /** 42 | * Given a file path, re-format it 43 | * @param{String} inFilePath 44 | */ 45 | function formatJSfile(inFilePath) { 46 | if (!fileops.available(inFilePath)) { 47 | // console.log(`${inFilePath} is not available`) 48 | return; 49 | } 50 | try { 51 | let code = fs.readFileSync(inFilePath, 'utf8'); 52 | code = strip(code); // Remove comments from code 53 | let ast = {}; 54 | try { 55 | ast = esprima.parseScript(code, {tokens: true}); 56 | } catch (e) { 57 | try { 58 | ast = esprima.parseModule(code, {tokens: true}); 59 | } catch (e) { 60 | ast = {} 61 | } 62 | } 63 | let tokens = []; 64 | for (let tok of ast.tokens) { 65 | tokens.push(tok.value); 66 | } 67 | 68 | code = tokens.join(' '); 69 | let formattedCode = beautify(code, { 70 | "indent_empty_lines": false, 71 | "break_chained_methods": false, 72 | "space_after_anon_function": false, 73 | "space_in_paren": false 74 | }); 75 | let options = { 76 | compress: false, 77 | mangle: false, 78 | output: { 79 | beautify: true 80 | } 81 | }; 82 | fs.writeFileSync(inFilePath, formattedCode); 83 | // console.log(code); 84 | // let uglify_format = UglifyJS.minify(formattedCode, options); 85 | // if (uglify_format.hasOwnProperty('error')) { // uglify does not support es6 and above 86 | // fs.writeFileSync(inFilePath, formattedCode); 87 | // } else { 88 | // fs.writeFileSync(inFilePath, uglify_format['code']); 89 | // } 90 | // console.log(`Pretty printing ${inFilePath}`); 91 | } catch (e) { 92 | // console.log(e); 93 | } 94 | } 95 | 96 | function formatFilesInDir(inDir) { 97 | let filePaths = fileops.createLinksOfFiles(inDir, '.js'); 98 | filePaths.forEach(fl => { 99 | formatJSfile(fl); 100 | }); 101 | } 102 | 103 | // Test --- 104 | // ( 105 | // function () { 106 | // let { 107 | // inFile 108 | // } = parse_cli_arguments(); 109 | // // formatFilesInDir(inFile); 110 | // formatJSfile(inFile); 111 | // } 112 | // )(); 113 | 114 | 115 | module.exports.formatJSfile = formatJSfile; 116 | -------------------------------------------------------------------------------- /bug_seeding/obtain_bug_seeding_patterns/repo_downloader/downloadTopGithubRepos.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jibesh Patra 3 | */ 4 | 5 | const fs = require('fs'); 6 | const git = require('simple-git'); 7 | 8 | /** 9 | * 10 | * @param {String} filePath 11 | * @param {Number} noOfRepos 12 | */ 13 | function readRepoNames(filePath, noOfRepos) { 14 | let repos = JSON.parse(fs.readFileSync(filePath)); 15 | console.log(`\nFound ${repos.length} repositories, will download ${noOfRepos}`); 16 | return repos.slice(0, noOfRepos); 17 | } 18 | 19 | /** 20 | * 21 | * @param {Array} repos 22 | * @param {String} directory 23 | */ 
24 | async function cloneSelectedRepos(repos, directory) { 25 | let clone_tasks = []; 26 | const sleep = (milliseconds) => { 27 | return new Promise(resolve => setTimeout(resolve, milliseconds)) 28 | } 29 | 30 | let all = repos.length; 31 | repos.forEach((repo) => { 32 | console.log(`Cloning ${repo.clone_url}`); 33 | clone_tasks.push(git(directory).clone(repo.clone_url).exec(() => { 34 | console.log(`Done cloning ${repo.clone_url} remaining ${--all}`); 35 | })); 36 | clone_tasks.push(sleep(1000)); 37 | }); 38 | try { 39 | let t = await Promise.all(clone_tasks); 40 | } catch (err) { 41 | console.log(err); 42 | 43 | } 44 | 45 | } 46 | 47 | /** 48 | * 49 | * @param {String} linkOfRepos 50 | * @param {Number} numOfRepoToDownload 51 | * @param {String} outDir 52 | */ 53 | function download_repositories(linkOfRepos, numOfRepoToDownload, outDir) { 54 | let repos = readRepoNames(linkOfRepos, numOfRepoToDownload); 55 | console.log("Writing the downloaded repositories to ==> " + outDir); 56 | cloneSelectedRepos(repos, outDir); 57 | } 58 | 59 | module.exports.download_repositories = download_repositories; 60 | -------------------------------------------------------------------------------- /bug_seeding/obtain_bug_seeding_patterns/repo_downloader/fileoperations.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jibesh Patra 3 | * 4 | * Contains utility methods for file and folder operations 5 | * 6 | */ 7 | const fs = require('fs'); 8 | const path = require('path'); 9 | const assert = require('assert'); 10 | 11 | /** 12 | * Check if the p is a directory 13 | * @param {string} path 14 | * @returns {boolean} 15 | */ 16 | function isDir(path) { 17 | try { 18 | fs.accessSync(path, fs.constants.R_OK); 19 | } catch (err) { 20 | console.error(`No access to ${path}`); 21 | return false; 22 | } 23 | return fs.statSync(path).isDirectory(); 24 | } 25 | 26 | /** 27 | * Check if a file exists and is available to be read 28 | * @param {string} filePath 29 | * @returns {boolean} 30 | */ 31 | function available(filePath) { 32 | return fs.existsSync(filePath); 33 | try { 34 | fs.accessSync(path, fs.constants.R_OK); 35 | } catch (err) { 36 | return false; 37 | } 38 | 39 | return true; 40 | } 41 | 42 | /** 43 | * Given a p, returns the extension of the p in lowercase 44 | * @param {string} filePath 45 | * @returns {string} 46 | */ 47 | function getExtension(filePath) { 48 | assert.ok(!isDir(filePath)); 49 | return path.extname(filePath).toLowerCase(); 50 | } 51 | 52 | /** 53 | * Read a file and returns the content of the file. If the filetype 54 | * is JSON then also converts to a JSON object. 55 | * @param {string} filePath Path of the file that needs to be read 56 | * @returns {(string|Object.)} 57 | */ 58 | function getFileContent(filePath) { 59 | let content; 60 | assert.ok(!isDir(filePath)); 61 | assert.ok(available(filePath)); 62 | content = fs.readFileSync(filePath, 'utf8'); 63 | if (getExtension(filePath) === '.json') content = JSON.parse(content); 64 | return content; 65 | } 66 | 67 | /** 68 | * Check if a file is accessible and then returns the size 69 | * of the file in bytes 70 | * @param {string} filePath 71 | * @returns {number} 72 | */ 73 | function getFileSize(filePath) { 74 | assert.ok(available(filePath)); 75 | return fs.statSync(filePath).size; 76 | } 77 | 78 | /** 79 | * Go through a directory and all sub directories and create a list 80 | * links to the files in the particular directory. 
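* Example (illustrative): createLinksOfFiles('benchmarks/top_JS_repos', '.js') returns the paths of all .js files found recursively under that directory.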
81 | * @param {string} dirPath Initial p of a directory 82 | * @param {string} fileExtension Types of file 83 | * @returns {Array.} List of files 84 | */ 85 | function createLinksOfFiles(dirPath, fileExtension) { 86 | /** 87 | * @type{string[]} 88 | */ 89 | let fileList = []; 90 | let folderToTraverse = [dirPath]; 91 | if (!fileExtension) throw 'Need extension of file that will be filtered'; 92 | while (folderToTraverse.length !== 0) { 93 | let currentFolder = folderToTraverse.pop(); 94 | let list_of_files_and_folders = fs.readdirSync(currentFolder); 95 | list_of_files_and_folders.forEach((f_path) => { 96 | let complete_path = path.join(currentFolder, f_path); 97 | if (isDir(complete_path)) 98 | folderToTraverse.push(complete_path); 99 | else if (getExtension(complete_path) === fileExtension) 100 | fileList.push(complete_path); 101 | }); 102 | } 103 | return fileList; 104 | } 105 | 106 | module.exports.available = available; 107 | module.exports.getFileContent = getFileContent; 108 | module.exports.getExtension = getExtension; 109 | module.exports.getFileSize = getFileSize; 110 | module.exports.createLinksOfFiles = createLinksOfFiles; -------------------------------------------------------------------------------- /bug_seeding/obtain_bug_seeding_patterns/repo_downloader/main.js: -------------------------------------------------------------------------------- 1 | /* 2 | 1. This script uses GitHub API to get the top 'N' GitHub repositories and saves it in a file. 3 | 2. It then goes through this list and downloads each repo locally 4 | 5 | @author Jibesh Patra 6 | */ 7 | const Scrapper = require('./getTopGitHubRepoNames').Scrapper; 8 | const download = require('./downloadTopGithubRepos').download_repositories; 9 | const path = require('path'); 10 | const fs = require('fs'); 11 | const fileutils = require('./fileoperations'); 12 | 13 | async function getLinks(link_to_top_repos, numOfGitHubRepos) { 14 | // Uncomment the following lines if top1000GithubRepos.json is not present OR needs to be updated 15 | 16 | // let github_scapper = new Scrapper(link_to_top_repos, numOfGitHubRepos); 17 | // // If more than '100' is required then the pages need to be changed for each request 18 | // github_scapper.getRepositoriesParseCommits({ 19 | // language: 'javascript', 20 | // page: 1, 21 | // q_no: 0 22 | // }); 23 | } 24 | 25 | /** 26 | * 27 | * @param link_to_top_repos 28 | * @param download_dir 29 | * @param numOfGitHubRepos 30 | */ 31 | async function getLinksAndDownload(link_to_top_repos, download_dir, numOfGitHubRepos) { 32 | // 1. Get the links to the top 100 GitHub repositories 33 | getLinks(link_to_top_repos, numOfGitHubRepos).then(() => { 34 | // 2. 
Download some/all of them 35 | download(link_to_top_repos, numOfGitHubRepos, download_dir); 36 | }); 37 | } 38 | 39 | function main() { 40 | 41 | let link_to_top_repos = path.join('benchmarks', 'top1000GithubRepos.json'); // Where the links (GitHub URLs) to top 'N' repos will be saved 42 | let download_dir = path.join('benchmarks', 'top_JS_repos'); // Where the top 'N' repos will be saved 43 | let num_of_github_repos_to_download = 100; 44 | 45 | if (!fileutils.available(download_dir)) 46 | fs.mkdirSync(download_dir); 47 | 48 | getLinksAndDownload(link_to_top_repos, download_dir, num_of_github_repos_to_download).then(() => { 49 | console.log("Download .."); 50 | }); 51 | } 52 | 53 | main(); 54 | -------------------------------------------------------------------------------- /bug_seeding/run_bug_seeding.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Created on 17-March-2020 4 | @author Jibesh Patra 5 | 6 | The main file from where all experiments are run 7 | """ 8 | import argparse 9 | import utils.fileutils as fs 10 | from utils.argument_utils import read_arguments 11 | from utils.prepare_for_seeding_bug import prepare_dir_for_seeding_bugs 12 | from utils.bug_seeding_pattern_utils import find_wrong_operand_in_binary_op_patterns, \ 13 | get_only_idf_lit_containing_patterns 14 | import os 15 | from tqdm import tqdm 16 | from multiprocessing import Pool, cpu_count 17 | from seed_bugs_to_a_file import seed_bugs_to_a_file, seed_bugs_to_a_file_multiprocessing 18 | import numpy as np 19 | 20 | 21 | def select_particular_type_of_seeding_pattern(bug_seeding_patterns): 22 | # Select only 'Wrong Binary Operand' patterns 23 | # bug_seeding_patterns = find_wrong_operand_in_binary_op_patterns(bug_seeding_patterns) 24 | 25 | # Select only 'Wrong Assignments' patterns 26 | # bug_seeding_patterns = fs.read_json_file('benchmarks/bug_seeding_patterns_wrong_assignment.json') 27 | return bug_seeding_patterns 28 | 29 | 30 | if __name__ == '__main__': 31 | 32 | parser = argparse.ArgumentParser( 33 | prog='python run_bug_seeding.py', 34 | description="Provide the proper directories where bugs may be seeded", 35 | epilog="You must provide directories" 36 | ) 37 | in_dir, out_dir, working_dir, stats_dir, bug_seeding_patterns, k_freq_idf, k_freq_lit = read_arguments(parser) 38 | 39 | # print("Sampling files for using as target to seed bugs") 40 | # fs.sample_from_zip(zip_file_path='benchmarks/data.zip', out_dir=in_dir, file_extension_to_sample='.js', 41 | # required_number_of_files=100) 42 | 43 | # Read bug seeding patterns 44 | all_bug_seeding_patterns = fs.read_json_file(bug_seeding_patterns) 45 | all_bug_seeding_patterns = get_only_idf_lit_containing_patterns(all_bug_seeding_patterns) 46 | print(f"Complete bug seeding patterns = {len(all_bug_seeding_patterns)}") 47 | l_len = len(all_bug_seeding_patterns) * 80 // 100 48 | tr_patterns, val_patterns = all_bug_seeding_patterns[:l_len], all_bug_seeding_patterns[l_len:] 49 | print( 50 | f'Training patterns are {len(tr_patterns)} and validation are {len(val_patterns)}. 
We only use training patterns for bug seeding') 51 | bug_seeding_patterns = tr_patterns 52 | 53 | bug_seeding_patterns = select_particular_type_of_seeding_pattern(bug_seeding_patterns=bug_seeding_patterns) 54 | print("There are {} bug seeding patterns".format(len(bug_seeding_patterns))) 55 | 56 | # More intermediate directories needed 57 | static_analysis_out_dir = os.path.join(working_dir, '__TEMP__target_js_file_nodes') 58 | 59 | print("Preparing for bug seeding") 60 | prepare_dir_for_seeding_bugs(target_js_dir=in_dir, 61 | abstracted_out_dir=static_analysis_out_dir, num_of_files=-1) 62 | 63 | # Maximum number of tries to seed bugs per file. We could be always successful and seed 10 bugs or 0 64 | MAX_LOCATIONS_TO_TRY_TO_SEED_BUGS = -1 # If -1 then try to seed everywhere 65 | actual_mutations_in_each_file = [] 66 | 67 | # Now seed bugs 68 | K_most_frequent_identifiers = fs.read_json_file(k_freq_idf) 69 | K_most_frequent_literals = fs.read_json_file(k_freq_lit) 70 | analysed_files = fs.go_through_dir(directory=static_analysis_out_dir, filter_file_extension='.json') 71 | 72 | args_for_files = [ 73 | (file, bug_seeding_patterns, K_most_frequent_identifiers, K_most_frequent_literals, 74 | MAX_LOCATIONS_TO_TRY_TO_SEED_BUGS, out_dir) for 75 | file in analysed_files] 76 | 77 | # Multiprocessing only on machine with many CPUs 78 | if cpu_count() > 4: 79 | with Pool(processes=cpu_count()) as p: 80 | with tqdm(total=len(analysed_files)) as pbar: 81 | pbar.set_description_str( 82 | desc="Seeding bugs to files ", refresh=False) 83 | for i, successful_mutations in tqdm( 84 | enumerate(p.imap_unordered(seed_bugs_to_a_file_multiprocessing, args_for_files, chunksize=1)), 85 | position=0): 86 | actual_mutations_in_each_file.append(successful_mutations) 87 | pbar.update() 88 | p.close() 89 | p.join() 90 | else: 91 | # Non multiprocessing 92 | for file in tqdm(analysed_files, desc='Seeding bugs to files', position=0, postfix={'approach': 'SemSeed'}): 93 | successful_mutations = seed_bugs_to_a_file(file, bug_seeding_patterns, K_most_frequent_identifiers, 94 | K_most_frequent_literals, 95 | MAX_LOCATIONS_TO_TRY_TO_SEED_BUGS, out_dir) 96 | actual_mutations_in_each_file.append(successful_mutations) 97 | 98 | print("\n *** Bugs could be seeded in {}/{} files output directory is '{}' ***".format( 99 | np.count_nonzero(actual_mutations_in_each_file), 100 | len(analysed_files), out_dir)) 101 | -------------------------------------------------------------------------------- /bug_seeding/seed_bugs_to_a_file.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Created on 01-April-2020 4 | @author Jibesh Patra 5 | 6 | """ 7 | from bug_seeding_approaches.SemSeed.SemSeedBugs import SemSeedBugs 8 | import utils.static_analysis_utils as static_analysis_utils 9 | import utils.fileutils as fs 10 | import random 11 | from tqdm import tqdm 12 | from pathlib import Path 13 | from typing import List 14 | import os 15 | import jsbeautifier 16 | 17 | random.seed(a=42) 18 | 19 | 20 | def seed_bugs_to_a_file_multiprocessing(args): 21 | """ 22 | The multiprocessing wrapper of seed_bugs_to_a_file function 23 | :param args: 24 | :return: 25 | """ 26 | file, bug_seeding_patterns, K_most_frequent_identifiers, K_most_frequent_literals, MAX_TRIES_TO_SEED_BUGS, out_dir = args 27 | return seed_bugs_to_a_file(file, bug_seeding_patterns, K_most_frequent_identifiers, K_most_frequent_literals, 28 | MAX_TRIES_TO_SEED_BUGS, out_dir) 29 | 30 | 31 | def seed_bugs_to_a_file(file: str, 32 | 
bug_seeding_patterns: List, 33 | K_most_frequent_identifiers: List, 34 | K_most_frequent_literals: List, 35 | MAX_LOCATIONS_TO_TRY_TO_SEED_BUGS: int, 36 | out_dir: str) -> int: 37 | """ 38 | Given a file, seed bugs to it. The expected file is a JSON file rather than a JS file. It is expected 39 | that the input JS file has been analysed before and a corresponding JSON file has been created. 40 | :param file: the corresponding JSON file of the JS file where bugs need to be seeded 41 | :param bug_seeding_patterns: the bug seeding patterns that will be tried at each location 42 | :param K_most_frequent_identifiers: the K most frequent Identifiers in the training commits 43 | :param K_most_frequent_literals: the K most frequent Literals in the training commits 44 | :param MAX_LOCATIONS_TO_TRY_TO_SEED_BUGS: maximum number of locations to try per file (-1 tries every location) 45 | :param out_dir: A path where the mutated code will be written 46 | :return: The count of locations where bugs could be seeded in the file 47 | """ 48 | num_of_locations_that_could_be_mutated = 0 49 | 50 | target_js_file_analysed = fs.read_json_file(file) 51 | if len(target_js_file_analysed) == 0: # The static analysis could not finish properly 52 | return num_of_locations_that_could_be_mutated 53 | possible_bug_seeding_locations = target_js_file_analysed['nodes'] 54 | 55 | # We do not want to select the first 'n' locations and try to seed bugs. Rather we randomly 56 | # choose 'n' locations 57 | random.shuffle(possible_bug_seeding_locations) 58 | if MAX_LOCATIONS_TO_TRY_TO_SEED_BUGS > 1: 59 | possible_bug_seeding_locations = possible_bug_seeding_locations[:MAX_LOCATIONS_TO_TRY_TO_SEED_BUGS] 60 | 61 | # Get Identifiers and Literals available for selection in different scopes 62 | identifiers_in_different_scopes = static_analysis_utils.get_tokens_from_different_scopes( 63 | analysed_file=target_js_file_analysed, 64 | kind='identifier', 65 | k_most_frequent=K_most_frequent_identifiers) 66 | literals_in_different_scopes = static_analysis_utils.get_tokens_from_different_scopes( 67 | analysed_file=target_js_file_analysed, 68 | kind='literal', 69 | k_most_frequent=K_most_frequent_literals) 70 | 71 | file_name = Path(file).name 72 | # Go through each seeding pattern available from the bug seeding patterns 73 | for seeding_pattern in tqdm(bug_seeding_patterns, position=1, ncols=100, ascii=" #", 74 | desc='Trying to apply pattern', 75 | postfix={'file': file_name}): 76 | # For each location in the file, try to seed a bug 77 | for target_location in possible_bug_seeding_locations: 78 | # ------------------------ SemSeed ----------------------------------------- 79 | bug_seeding = SemSeedBugs(bug_seeding_pattern=seeding_pattern, 80 | target_location=target_location, 81 | file_path=target_js_file_analysed['file_path'], 82 | similarity_threshold=0.3, 83 | K=1, 84 | available_identifiers=identifiers_in_different_scopes, 85 | available_literals=literals_in_different_scopes, 86 | scope_of_selection='top_K') 87 | 88 | # Check if the seeding pattern and the target location match 89 | if bug_seeding.is_matching_token_sequence(): 90 | 91 | # The mutated token sequences cover only the 'mutated' target location token sequence 92 | # We may get multiple sequences based on K. 
If K=2 and there is only one 93 | # unbound token, we get 2 sequences 94 | mutated_token_sequences = bug_seeding.apply_pattern() 95 | if len(mutated_token_sequences) > 0: 96 | num_of_locations_that_could_be_mutated += 1 97 | 98 | for ms, mutated_sequence in enumerate(mutated_token_sequences): 99 | token_sequence_after_seeding_bug = bug_seeding.replace_target_with_mutated_token_sequence( 100 | token_list=target_js_file_analysed['tokenList'], 101 | token_range_list=target_js_file_analysed['tokenRangesList'], 102 | mutated_token_sequence=mutated_sequence) 103 | 104 | bug_seeding.bug_metadata['target_token_sequence-Buggy'] = mutated_sequence 105 | bug_seeding.bug_metadata['token_sequence_abstraction-Buggy'] = seeding_pattern['buggy'] 106 | bug_seeding.bug_metadata['num_of_available_identifiers_to_choose_from'] = len( 107 | bug_seeding.identifiers_available_for_selecting_unbound_token) 108 | bug_seeding.bug_metadata['num_of_available_literals_to_choose_from'] = len( 109 | bug_seeding.literals_available_for_selecting_unbound_token) 110 | bug_seeding.bug_metadata['seeding_pattern_url'] = seeding_pattern['url'] 111 | 112 | # Simply joining the token list with a space 113 | mutated_code = ' '.join(token_sequence_after_seeding_bug) 114 | 115 | # Write the output code & metadata about the bug seed 116 | out_file_name = file_name.replace('.json', 117 | f'_SEMSEED_MUTATED_{num_of_locations_that_could_be_mutated}.js') 118 | out_file_path = os.path.join(out_dir, out_file_name) 119 | if fs.pathExists(out_file_path): 120 | out_file_path = out_file_path.replace('.js', 121 | f'_{str(random.randint(0, 10000))}_{["a", "b", "c", "d"][random.randint(0, 3)]}.js') 122 | try: 123 | # Remember, this does not check for Syntax Errors in the generated JS code. This needs to be 124 | # done separately 125 | mutated_code = jsbeautifier.beautify(mutated_code, { 126 | "indent_empty_lines": False, 127 | "break_chained_methods": False, 128 | "space_after_anon_function": False, 129 | "space_in_paren": False 130 | }) 131 | fs.writeFile(data=mutated_code, file_path=out_file_path) 132 | except Exception as e: 133 | tqdm.write(f'ERROR: Could not seed bugs to {file_name} because {e}') 134 | bug_seeding.bug_metadata['error'] = str(e) 135 | finally: 136 | fs.writeJSONFile(data=bug_seeding.bug_metadata, file_path=out_file_path + 'on') 137 | else: 138 | pass 139 | 140 | return num_of_locations_that_could_be_mutated 141 | -------------------------------------------------------------------------------- /bug_seeding/utils/argument_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Created on 24-March-2020 4 | @author Jibesh Patra 5 | 6 | """ 7 | from argparse import ArgumentParser 8 | from utils.fileutils import create_dir_list_if_not_present 9 | from typing import Tuple 10 | 11 | 12 | def read_arguments(parser: ArgumentParser) -> Tuple: 13 | parser = add_arguments_to_parser(parser) 14 | args = parser.parse_args() 15 | create_dir_list_if_not_present([args.out_dir, args.working_dir, args.stats_dir]) 16 | return args.in_dir, args.out_dir, args.working_dir, args.stats_dir, args.bug_seeding_patterns, args.K_freq_idf, args.K_freq_lit 17 | 18 | 19 | def add_arguments_to_parser(parser: ArgumentParser) -> ArgumentParser: 20 | parser.add_argument( 21 | '--in_dir', 22 | type=str, 23 | default='benchmarks/data', 24 | help='The directory containing JS files where bugs may be seeded' 25 | ) 26 | parser.add_argument( 27 | '--out_dir', 28 | type=str, 29 | 
default='benchmarks/js_benchmark_seeded_bugs', 30 | help='The directory where the bug seeded files will be written' 31 | ) 32 | parser.add_argument( 33 | '--working_dir', 34 | type=str, 35 | default='benchmarks/js_benchmark_working_dir', 36 | help='The directory where intermediate results will be written' 37 | ) 38 | parser.add_argument( 39 | '--stats_dir', 40 | type=str, 41 | default='benchmarks/js_benchmark_stats', 42 | help='The directory where statistics about bug seeding will be written' 43 | ) 44 | parser.add_argument( 45 | '--bug_seeding_patterns', 46 | type=str, 47 | default='benchmarks/bug_seeding_patterns_for_semantic_seeding.json', 48 | help='The path to a file that contains the change patterns' 49 | ) 50 | 51 | parser.add_argument( 52 | '--K_freq_idf', 53 | type=str, 54 | default='benchmarks/topK_identifiers_in_training_commits.json', 55 | help='The K most frequent Identifiers' 56 | ) 57 | 58 | parser.add_argument( 59 | '--K_freq_lit', 60 | type=str, 61 | default='benchmarks/topK_literals_in_training_commits.json', 62 | help='The K most frequent Literals' 63 | ) 64 | 65 | return parser 66 | -------------------------------------------------------------------------------- /bug_seeding/utils/bug_seeding_pattern_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Created on 02-April-2020 4 | @author Jibesh Patra 5 | 6 | Given all change patterns, filter and process them 7 | 8 | """ 9 | from typing import List, Tuple 10 | import utils.fileutils as fs 11 | import re 12 | import pandas as pd 13 | 14 | 15 | def get_only_idf_lit_containing_patterns(all_changes): 16 | """ 17 | It is possible that not every bug-fix pattern can be used to seed bugs. 18 | We filter some of them here. For example: 19 | * we may filter very long change patterns (although we do it once while aggregating data from MongoDB) 20 | * we may select only those change patterns that have at least frequency 'N' 21 | """ 22 | filtered_change_patterns = [] 23 | 24 | # # ----------------------- Filtering number of tokens ------------------------- 25 | # max_number_of_tokens = 10 26 | # for change_pattern in self.all_training_change_patterns: 27 | # print('\n\n \t *** *** Selecting only change patterns having total {} tokens *** ***'.format(max_number_of_tokens*2)) 28 | # if len(change_pattern['fix']) <= max_number_of_tokens and len(change_pattern['buggy']) <= max_number_of_tokens: 29 | # filtered_change_patterns.append(change_pattern) 30 | 31 | # ----------------------- Filtering based on the frequency of the change patterns ----------------- 32 | # min_frequency = 4 33 | # print('\n \t *** *** Filtering only change patterns having minimum frequency {} *** ***\n'.format(min_frequency)) 34 | # mapping_of_change_patterns = SeedBugs._str_mapping_change_pattern_to_change( 35 | # all_changes) 36 | 37 | # for mapped_seq in mapping_of_change_patterns: 38 | # if len(mapping_of_change_patterns[mapped_seq]) >= min_frequency: 39 | # filtered_change_patterns.extend( 40 | # mapping_of_change_patterns[mapped_seq]) 41 | 42 | # print("\tTotal {} change patterns and {} filtered change patterns ".format( 43 | # len(mapping_of_change_patterns), len(filtered_change_patterns))) 44 | 45 | # ------------------- Remove those change patterns that do not contain any Identifiers/Literals ------------ 46 | for t in all_changes: 47 | # If the change pattern contains at least one Identifier/Literal, we use that. 
48 | # Else the change pattern is discarded 49 | if 'Idf_' in ' '.join(t['fix']) or 'Idf_' in ' '.join(t['buggy']) or 'Lit_' in ' '.join( 50 | t['fix']) or 'Lit_' in ' '.join(t['buggy']): 51 | filtered_change_patterns.append(t) 52 | 53 | return filtered_change_patterns 54 | 55 | 56 | def find_wrong_operand_in_binary_op_patterns(bug_seeding_patterns: List) -> List: 57 | filtered_patterns = [] 58 | dup_filter = set() 59 | js_binary_operators = ["==", "!=", "===", "!==", "<", "<=", ">", ">=", "<<", ">>", ">>>", "\+", "-", "\*", "/", "%", 60 | "\|", 61 | "\^", "&", "in", "instanceof"] 62 | regexps = [] 63 | for op in js_binary_operators: 64 | regexps.append(re.compile('(Idf_[\d]|Lit_[\d])\s(' + op + ')\s(Idf_[\d]|Lit_[\d])')) 65 | for pattern in bug_seeding_patterns: 66 | correct_part_of_pattern = ' '.join(pattern['fix']) 67 | buggy_part_of_pattern = ' '.join(pattern['buggy']) 68 | if pattern['fix_tokenType'] == 'BinaryExpression' and pattern['buggy_tokenType'] == 'BinaryExpression': 69 | for regex_op_1 in regexps: 70 | in_correct = regex_op_1.findall(correct_part_of_pattern) 71 | for regex_op_2 in regexps: 72 | in_buggy = regex_op_2.findall(buggy_part_of_pattern) 73 | for correct_match in in_correct: 74 | for buggy_match in in_buggy: 75 | if correct_match[1] == buggy_match[1] and correct_match[0] != buggy_match[0] and \ 76 | correct_match[ 77 | 2] == buggy_match[2]: 78 | pattern_as_str = correct_part_of_pattern + buggy_part_of_pattern 79 | if pattern_as_str not in dup_filter: 80 | dup_filter.add(pattern_as_str) 81 | filtered_patterns.append(pattern) 82 | if correct_match[1] == buggy_match[1] and correct_match[0] == buggy_match[0] and \ 83 | correct_match[ 84 | 2] != buggy_match[2]: 85 | pattern_as_str = correct_part_of_pattern + buggy_part_of_pattern 86 | if pattern_as_str not in dup_filter: 87 | dup_filter.add(pattern_as_str) 88 | filtered_patterns.append(pattern) 89 | return filtered_patterns 90 | -------------------------------------------------------------------------------- /bug_seeding/utils/format_bug_seeded_files.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Created on 02-August-2020 4 | @author Jibesh Patra 5 | 6 | Run this script after bug seeding has finished 7 | 8 | """ 9 | import os 10 | import subprocess 11 | from threading import Timer 12 | from tqdm import tqdm 13 | from multiprocessing import Pool, cpu_count 14 | from pathlib import Path 15 | 16 | 17 | def format_a_js_file(target_js_file_path: str) -> str: 18 | def kill_process(p): 19 | return p.kill() 20 | 21 | err_in_execution = False 22 | path_to_process = os.path.join(os.path.normpath( 23 | os.getcwd() + os.sep), 'static_analysis_js', 'utils', 'format_a_js_file.js') 24 | time_out_before_killing = 5000 # seconds 25 | try: 26 | p = subprocess.Popen([ 27 | 'node', path_to_process, 28 | '-inFile', target_js_file_path, 29 | ], stdout=subprocess.PIPE) 30 | time_out = Timer(time_out_before_killing, kill_process, [p]) 31 | try: 32 | time_out.start() 33 | stdout, stderr = p.communicate() 34 | if stderr: 35 | err_in_execution = stderr.decode("utf-8") 36 | finally: 37 | time_out.cancel() 38 | except subprocess.TimeoutExpired: 39 | pass 40 | return err_in_execution 41 | 42 | 43 | def format_files_in_dir(indir): 44 | js_files = list(Path(indir).rglob('*.js')) 45 | print(f"Will format {len(js_files)} files") 46 | with Pool(processes=cpu_count()) as p: 47 | with tqdm(total=len(js_files)) as pbar: 48 | pbar.set_description_str( 49 | desc="Formatting js files ", refresh=False) 50 | 
for i, execution_errors in tqdm( 51 | enumerate(p.imap_unordered(format_a_js_file, 52 | js_files, chunksize=10))): 53 | # print(execution_errors) 54 | pbar.update() 55 | p.close() 56 | p.join() 57 | 58 | 59 | if __name__ == '__main__': 60 | format_files_in_dir('benchmarks/js_benchmark_seeded_bugs') 61 | -------------------------------------------------------------------------------- /bug_seeding/utils/prepare_for_seeding_bug.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Created on 25-March-2020 4 | @author Jibesh Patra 5 | 6 | Call nodejs to tokenize JS files, convert them to their AST representations, etc. 7 | """ 8 | import os 9 | import subprocess 10 | from threading import Timer 11 | import utils.fileutils as fs 12 | from tqdm import tqdm 13 | from multiprocessing import Pool, cpu_count 14 | from typing import List 15 | import random 16 | from pathlib import Path 17 | 18 | 19 | def prepare_a_js_file_for_seeding_bug(target_js_file_path: str, out_json_file_path: str) -> str: 20 | """ 21 | Prepare a JS file for seeding bugs by converting the JS file to AST nodes. 22 | The function creates a Nodejs process to extract the required data. 23 | :param target_js_file_path: The input JS file that will be converted to AST node representations 24 | :param out_json_file_path: The path of the JSON file where the extracted data will be written 25 | :return: An error message if the Nodejs process wrote to stderr, otherwise False 26 | """ 27 | 28 | def kill_process(p): 29 | return p.kill() 30 | 31 | err_in_execution = False 32 | path_to_process = os.path.join(os.path.normpath( 33 | os.getcwd() + os.sep), 'bug_seeding', 'obtain_bug_seeding_patterns', 'extract_bug_seeding_patterns_from_repos', 34 | 'extractNodeData.js') 35 | time_out_before_killing = 180 # seconds 36 | try: 37 | p = subprocess.Popen([ 38 | 'node', path_to_process, 39 | '-inFile', target_js_file_path, 40 | '-outFile', out_json_file_path, 41 | ], 42 | stdout=subprocess.PIPE) 43 | time_out = Timer(time_out_before_killing, kill_process, [p]) 44 | try: 45 | time_out.start() 46 | stdout, stderr = p.communicate() 47 | if stderr: 48 | err_in_execution = stderr.decode("utf-8") 49 | finally: 50 | time_out.cancel() 51 | except subprocess.TimeoutExpired: 52 | pass 53 | return err_in_execution 54 | 55 | 56 | def remove_duplicates(file_list: List, duplicate_file_groups: List) -> List: 57 | """ 58 | Given a list of files and known duplicate groups, keep only one file of each group 59 | :param duplicate_file_groups: Groups of file paths that are known duplicates of each other 60 | :param file_list: The list of file paths to filter 61 | :return: The file list with known duplicates removed 62 | """ 63 | dup_files = set() 64 | for file_group in duplicate_file_groups: 65 | # Except the first file, the rest are all duplicates 66 | dup_files.update(file_group[1:]) 67 | 68 | files_without_duplicates = [] 69 | # Now, we remove the known duplicates 70 | root_dir = '/data/' 71 | # dup_files = set([os.path.join(root_dir, fp) for fp in dup_files]) 72 | for fl_path in file_list: 73 | if fl_path.split(root_dir)[1] not in dup_files: 74 | files_without_duplicates.append(fl_path) 75 | return files_without_duplicates 76 | 77 | 78 | def prepare_a_js_file_for_seeding_bug_multiprocessing(arg): 79 | target_js_file_path, out_json_file_path = arg 80 | prepare_a_js_file_for_seeding_bug(target_js_file_path, out_json_file_path) 81 | 82 | 83 | def prepare_dir_for_seeding_bugs(target_js_dir: str, abstracted_out_dir: str, num_of_files: int = -1) -> None: 84 | """ 85 | Given a directory of JS files, format the code and run static analysis to extract nodes 86 | from the code. 
87 | :param num_of_files: Select only 'num_of_files' files from 'abstracted_out_dir' once it is ready 88 | :param target_js_dir: 89 | 90 | :param abstracted_out_dir: 91 | :return: 92 | """ 93 | fs.create_dir_list_if_not_present([abstracted_out_dir]) 94 | 95 | print(" Reading files in {}".format(target_js_dir)) 96 | all_target_js_files = sorted(Path(target_js_dir).rglob('*.js')) 97 | all_target_js_files = [str(pth) for pth in all_target_js_files if pth.is_file()] 98 | 99 | # Some datasets might have duplicate files. We want to remove the duplicates 100 | print(" Removing duplicates from {} files in benchmarks".format(len(all_target_js_files))) 101 | duplicate_file_groups = fs.read_json_file('benchmarks/js150-duplicates.json') 102 | all_target_js_files = remove_duplicates(file_list=all_target_js_files, duplicate_file_groups=duplicate_file_groups) 103 | 104 | if num_of_files > 1: 105 | random.seed(100) 106 | random.shuffle(all_target_js_files) 107 | all_target_js_files = all_target_js_files[:num_of_files] 108 | print(" Total number of files in benchmark is {}".format(len(all_target_js_files))) 109 | 110 | def create_out_file_path(target_js_file_path: str) -> str: 111 | return os.path.join(abstracted_out_dir, os.path.basename(target_js_file_path) + 'on') 112 | 113 | target_js_files_and_out_paths = [(target_js_file_path, create_out_file_path(target_js_file_path)) 114 | for target_js_file_path in all_target_js_files] 115 | if cpu_count() > 4: 116 | with Pool(processes=cpu_count()) as p: 117 | with tqdm(total=len(all_target_js_files)) as pbar: 118 | pbar.set_description_str( 119 | desc="Preparing js files ", refresh=False) 120 | for i, execution_errors in tqdm( 121 | enumerate(p.imap_unordered(prepare_a_js_file_for_seeding_bug_multiprocessing, 122 | target_js_files_and_out_paths, chunksize=10))): 123 | # print(execution_errors) 124 | pbar.update() 125 | p.close() 126 | p.join() 127 | else: 128 | for target_file, out_file in tqdm(target_js_files_and_out_paths, 129 | desc='Preparing JS files *** Sequentially ***'): 130 | prepare_a_js_file_for_seeding_bug(target_js_file_path=target_file, out_json_file_path=out_file) 131 | -------------------------------------------------------------------------------- /bug_seeding/utils/static_analysis_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Created on 25-March-2020 4 | @author Jibesh Patra 5 | 6 | This file contains helper functions to parse the static analysis results 7 | extracted using nodejs and esprima 8 | """ 9 | from typing import List, Dict 10 | 11 | 12 | def get_all_tokens_in_file(range_to_token_mapping: Dict) -> List: 13 | tokens = set() 14 | for token in range_to_token_mapping.values(): 15 | tokens.add(token) 16 | return list(tokens) 17 | 18 | 19 | def get_tokens_from_different_scopes(analysed_file: dict, kind: str, k_most_frequent: List) -> Dict: 20 | if kind == 'identifier': 21 | return { 22 | 'all_identifiers_in_same_file': get_all_tokens_in_file( 23 | analysed_file['range_to_identifier']), 24 | # A mapping between the functions in the file and the containing Identifiers 25 | 'functions_to_identifiers': analysed_file['functions_to_identifiers'], 26 | 'K_most_frequent_identifiers': k_most_frequent # 1000 most frequent Identifiers 27 | } 28 | else: 29 | return { 30 | 'all_literals_in_same_file': get_all_tokens_in_file( 31 | analysed_file['range_to_literal']), 32 | # A mapping between the functions in the file and the containing Literals 33 | 'functions_to_literals': 
analysed_file['functions_to_literals'], 34 | 'K_most_frequent_literals': k_most_frequent # 1000 most frequent Literals 35 | } 36 | -------------------------------------------------------------------------------- /database_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "database_name": "SemSeed_github_commits_db", 3 | "host": "127.0.0.1", 4 | "port": 27017, 5 | "username": "semSeedUser", 6 | "password": "semSeedPassWord124", 7 | "collection_name": "commits" 8 | } -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "SemSeed", 3 | "version": "1.0.0", 4 | "devDependencies": { 5 | "@octokit/rest": "^18.5.6", 6 | "async": "^3.2.0", 7 | "simple-git": "^2.39.0", 8 | "acorn": "^7.1.1", 9 | "argparse": "^1.0.10", 10 | "escodegen": "^1.14.1", 11 | "esprima": "^4.0.1", 12 | "estraverse": "^5.1.0", 13 | "js-beautify": "^1.11.0", 14 | "process": "^0.11.10", 15 | "strip-comments": "^2.0.1", 16 | "uglify-js": "^3.9.1", 17 | "walk-sync": "^2.1.0" 18 | }, 19 | "dependencies": { 20 | "lodash": "^4.17.21", 21 | "mongodb": "3.6.9" 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.12.0 2 | argon2-cffi==20.1.0 3 | astunparse==1.6.3 4 | async-generator==1.10 5 | attrs==19.3.0 6 | Automat==0.8.0 7 | backcall==0.2.0 8 | bleach==3.3.0 9 | blinker==1.4 10 | cachetools==4.2.2 11 | certifi==2019.11.28 12 | cffi==1.14.5 13 | chardet==3.0.4 14 | Click==7.0 15 | cloud-init==21.1 16 | colorama==0.4.3 17 | command-not-found==0.3 18 | configobj==5.0.6 19 | constantly==15.1.0 20 | cryptography==2.8 21 | cycler==0.10.0 22 | dbus-python==1.2.16 23 | decorator==5.0.9 24 | defusedxml==0.7.1 25 | distro==1.4.0 26 | distro-info===0.23ubuntu1 27 | EditorConfig==0.12.2 28 | entrypoints==0.3 29 | fasttext==0.9.1 30 | flatbuffers==1.12 31 | future==0.18.2 32 | gast==0.4.0 33 | google-auth==1.30.1 34 | google-auth-oauthlib==0.4.4 35 | google-pasta==0.2.0 36 | grpcio==1.34.1 37 | h5py==3.1.0 38 | httplib2==0.14.0 39 | hyperlink==19.0.0 40 | idna==2.8 41 | importlib-metadata==1.5.0 42 | incremental==16.10.1 43 | install==1.3.4 44 | ipykernel==5.5.5 45 | ipython==7.24.1 46 | ipython-genutils==0.2.0 47 | ipywidgets==7.6.3 48 | jedi==0.18.0 49 | Jinja2==2.10.1 50 | jsbeautifier==1.11.0 51 | jsonpatch==1.22 52 | jsonpointer==2.0 53 | jsonschema==3.2.0 54 | jupyter==1.0.0 55 | jupyter-client==6.1.12 56 | jupyter-console==6.4.0 57 | jupyter-core==4.7.1 58 | jupyterlab-pygments==0.1.2 59 | jupyterlab-widgets==1.0.0 60 | Keras==2.4.3 61 | keras-nightly==2.5.0.dev2021032900 62 | Keras-Preprocessing==1.1.2 63 | keyring==18.0.1 64 | kiwisolver==1.3.1 65 | language-selector==0.1 66 | launchpadlib==1.10.13 67 | lazr.restfulclient==0.14.2 68 | lazr.uri==1.0.3 69 | libcst==0.3.19 70 | Markdown==3.3.4 71 | MarkupSafe==1.1.0 72 | matplotlib==3.4.2 73 | matplotlib-inline==0.1.2 74 | mistune==0.8.4 75 | mongoengine==0.23.1 76 | more-itertools==4.2.0 77 | mypy-extensions==0.4.3 78 | nbclient==0.5.3 79 | nbconvert==6.0.7 80 | nbformat==5.1.3 81 | nest-asyncio==1.5.1 82 | netifaces==0.10.4 83 | notebook==6.4.0 84 | numpy==1.19.5 85 | oauthlib==3.1.0 86 | opt-einsum==3.3.0 87 | packaging==20.9 88 | pandas==1.0.3 89 | pandocfilters==1.4.3 90 | parsepatch==0.1.3 91 | parso==0.8.2 92 | pbr==5.6.0 93 | 
pexpect==4.6.0 94 | pickleshare==0.7.5 95 | Pillow==7.1.1 96 | prometheus-client==0.11.0 97 | prompt-toolkit==3.0.18 98 | protobuf==3.17.2 99 | ptyprocess==0.7.0 100 | pyasn1==0.4.2 101 | pyasn1-modules==0.2.1 102 | pybind11==2.5.0 103 | pycparser==2.20 104 | pygit2==1.6.0 105 | Pygments==2.9.0 106 | PyGObject==3.36.0 107 | PyHamcrest==1.9.0 108 | PyJWT==1.7.1 109 | pymacaroons==0.13.0 110 | pymongo==3.11.4 111 | PyNaCl==1.3.0 112 | pyOpenSSL==19.0.0 113 | pyparsing==2.4.7 114 | pyrsistent==0.15.5 115 | pyserial==3.4 116 | python-apt==2.0.0+ubuntu0.20.4.4 117 | python-dateutil==2.8.1 118 | python-debian===0.1.36ubuntu1 119 | pytz==2019.3 120 | PyYAML==5.3.1 121 | pyzmq==22.1.0 122 | qtconsole==5.1.0 123 | QtPy==1.9.0 124 | requests==2.22.0 125 | requests-oauthlib==1.3.0 126 | requests-unixsocket==0.2.0 127 | rsa==4.7.2 128 | scipy==1.4.1 129 | seaborn==0.11.1 130 | SecretStorage==2.3.1 131 | Send2Trash==1.5.0 132 | service-identity==18.1.0 133 | simplejson==3.16.0 134 | six==1.15.0 135 | sos==4.1 136 | ssh-import-id==5.10 137 | systemd-python==234 138 | tensorboard==2.5.0 139 | tensorboard-data-server==0.6.1 140 | tensorboard-plugin-wit==1.8.0 141 | tensorflow==2.5.0 142 | tensorflow-estimator==2.5.0 143 | termcolor==1.1.0 144 | terminado==0.10.0 145 | testpath==0.5.0 146 | testresources==2.0.1 147 | torch==1.8.1+cpu 148 | torchaudio==0.8.1 149 | torchvision==0.9.1+cpu 150 | tornado==6.1 151 | tqdm==4.45.0 152 | traitlets==5.0.5 153 | Twisted==18.9.0 154 | typing==3.7.4.1 155 | typing-extensions==3.7.4.3 156 | typing-inspect==0.6.0 157 | ubuntu-advantage-tools==27.0 158 | ufw==0.36 159 | unattended-upgrades==0.1 160 | urllib3==1.25.8 161 | wadllib==1.3.3 162 | wcwidth==0.2.5 163 | webencodings==0.5.1 164 | Werkzeug==2.0.1 165 | widgetsnbextension==3.5.1 166 | wrapt==1.12.1 167 | zipp==1.0.0 168 | zope.interface==4.7.1 169 | --------------------------------------------------------------------------------
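
For reference, the seeding step can also be driven programmatically instead of through run_bug_seeding.py. The following is a minimal sketch, not a file of the repository: it assumes the bug_seeding/ directory as the working directory, the default benchmark files listed in utils/argument_utils.py, and an illustrative analysed-JSON path ('analysed/example.json') that must first be produced by utils/prepare_for_seeding_bug.py.

# Minimal sketch (not part of the repository) of seeding bugs into one pre-analysed file.
# Assumes: working directory is bug_seeding/, the default benchmark files from
# utils/argument_utils.py exist, and 'analysed/example.json' is an illustrative path
# produced beforehand by the static analysis in utils/prepare_for_seeding_bug.py.
import utils.fileutils as fs
from utils.bug_seeding_pattern_utils import get_only_idf_lit_containing_patterns
from seed_bugs_to_a_file import seed_bugs_to_a_file

# Load the change patterns and keep only those containing Identifiers/Literals,
# mirroring what run_bug_seeding.py does before seeding
patterns = get_only_idf_lit_containing_patterns(
    fs.read_json_file('benchmarks/bug_seeding_patterns_for_semantic_seeding.json'))
top_identifiers = fs.read_json_file('benchmarks/topK_identifiers_in_training_commits.json')
top_literals = fs.read_json_file('benchmarks/topK_literals_in_training_commits.json')

seeded = seed_bugs_to_a_file(
    file='analysed/example.json',            # JSON produced by the static analysis step (illustrative path)
    bug_seeding_patterns=patterns,
    K_most_frequent_identifiers=top_identifiers,
    K_most_frequent_literals=top_literals,
    MAX_LOCATIONS_TO_TRY_TO_SEED_BUGS=-1,    # -1: try to seed at every location
    out_dir='benchmarks/js_benchmark_seeded_bugs')
print('Bugs seeded at', seeded, 'locations')

After seeding, utils/format_bug_seeded_files.py can be run to re-format the generated files, as its docstring notes.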