├── .gitignore ├── DeepBugs ├── benchmarks ├── javascript │ ├── astWalkTest.js │ ├── compareWarnings.js │ ├── extractFromJS.js │ ├── extractorOfAssignments.js │ ├── extractorOfAssignments2.js │ ├── extractorOfBinOps.js │ ├── extractorOfCalls.js │ ├── extractorOfCallsMissingArg.js │ ├── extractorOfIdsLitsWithASTFamily.js │ ├── extractorOfIdsLitsWithIds.js │ ├── extractorOfIdsLitsWithTokens.js │ ├── extractorOfTokens.js │ ├── jsExtractionUtil.js │ ├── modifyArgumentOrder.js │ ├── rb-nodeify.sh │ ├── seedBugs.js │ └── tokenize.js └── python │ ├── ASTEmbeddingLearner.py │ ├── ASTEmbeddingLearnerPerLocation.py │ ├── AccuracyMetricTest.py │ ├── BinOpContextToEmbedding.py │ ├── BugDetection.py │ ├── CallContextToEmbedding.py │ ├── CallPerCalleeCounter.py │ ├── CallPerFileCounter.py │ ├── EmbeddingEvaluator.py │ ├── EmbeddingEvaluatorWord2Vec.py │ ├── EmbeddingLearner.py │ ├── EmbeddingLearnerWord2Vec.py │ ├── EmbeddingModelValidator.py │ ├── LearningDataBinOperator.py │ ├── LearningDataIncorrectAssignment.py │ ├── LearningDataIncorrectAssignment_with_parents.py │ ├── LearningDataIncorrectBinaryOperand.py │ ├── LearningDataMissingArg.py │ ├── LearningDataSwappedArgs.py │ ├── LearningDataSwappedBinOperands.py │ ├── LocationBasedEmbeddingEvaluator.py │ ├── RandomEmbeddingLearner.py │ ├── TokenWithASTContextPerLocationToNumbers.py │ ├── TokenWithASTContextToNumbers.py │ ├── TokenWithContextStats.py │ ├── TokenWithContextToNumbers.py │ ├── TokensToTopTokens.py │ ├── Util.py │ ├── __init__.py │ ├── create_and_analyse_dataset_for_DeepBugs_assignments.ipynb │ ├── create_and_analyse_dataset_for_DeepBugs_binOpnd.ipynb │ ├── create_dataset_from_seeded_bugs.py │ ├── extract_from_js_parallel.py │ └── tools │ └── anomalyAnalyzer.py ├── INSTALL.md ├── LICENSE ├── README.md ├── REQUIREMENTS.md ├── bug_seeding ├── bug_seeding_approaches │ ├── SeedBugs.py │ ├── SemSeed │ │ ├── BugSeedingUtils.py │ │ └── SemSeedBugs.py │ └── Syntactic │ │ └── SyntacticSeedBugs.py ├── obtain_bug_seeding_patterns │ ├── extract_bug_seeding_patterns_from_repos │ │ ├── CodeAnalysis.js │ │ ├── aggregateChanges.py │ │ ├── analyses │ │ │ └── ExtractDataGivenNodes.js │ │ ├── callNodeJSExtractData.py │ │ ├── database │ │ │ └── GitHubCommits.py │ │ ├── extractNodeData.js │ │ ├── main.py │ │ ├── python_calls_me_to_extract_patterns.js │ │ └── utils │ │ │ ├── fileoperations.js │ │ │ ├── fileutils.py │ │ │ └── format_a_js_file.js │ └── repo_downloader │ │ ├── downloadTopGithubRepos.js │ │ ├── fileoperations.js │ │ ├── getTopGitHubRepoNames.js │ │ └── main.js ├── run_bug_seeding.py ├── seed_bugs_to_a_file.py └── utils │ ├── argument_utils.py │ ├── bug_seeding_pattern_utils.py │ ├── fileutils.py │ ├── format_bug_seeded_files.py │ ├── prepare_for_seeding_bug.py │ └── static_analysis_utils.py ├── compare_real_bug_finding_ability ├── DeepBugs_prediction_evaluation.ipynb ├── create_dataset_from_real_bugs_assignments.ipynb ├── create_dataset_from_real_bugs_binopnds.ipynb └── syntax_check_mutandis_compare.ipynb ├── database_config.json ├── package.json └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | **/node_modules/** 2 | **/.idea/** 3 | **/semseed_venv/** 4 | __pycache__/ 5 | .ipynb_checkpoints 6 | **/.git/** 7 | -------------------------------------------------------------------------------- /DeepBugs/benchmarks: -------------------------------------------------------------------------------- 1 | IntxLNK../benchmarks/ 
-------------------------------------------------------------------------------- /DeepBugs/javascript/astWalkTest.js: -------------------------------------------------------------------------------- 1 | // Author: Michael Pradel 2 | 3 | (function() { 4 | 5 | const acorn = require("acorn"); 6 | const estraverse = require("estraverse"); 7 | 8 | function getChildren(parent, ignoredChild) { 9 | const children = []; 10 | for (const prop in parent) { 11 | if (parent.hasOwnProperty(prop)) { 12 | const child = parent[prop]; 13 | if (Array.isArray(child)) { 14 | for (let i = 0; i < child.length; i++) { 15 | const actualChild = child[i]; 16 | if (actualChild !== ignoredChild) { 17 | children.push(actualChild); 18 | } 19 | } 20 | } else if (typeof child === "object") { 21 | if (child !== ignoredChild) { 22 | children.push(child); 23 | } 24 | } 25 | } 26 | } 27 | return children; 28 | } 29 | 30 | function getAllChildren(parents, ignoredChild) { 31 | const allChildren = []; 32 | for (let i = 0; i < parents.length; i++) { 33 | const parent = parents[i]; 34 | const newChildren = getChildren(parent); 35 | for (let j = 0; j < newChildren.length; j++) { 36 | const newChild = newChildren[j]; 37 | if (newChild !== ignoredChild) { 38 | allChildren.push(newChild) 39 | } 40 | } 41 | } 42 | return allChildren; 43 | } 44 | 45 | function positionIn(parent, child) { 46 | const position = getChildren(parent).indexOf(child); 47 | if (position === -1) throw "Could not find child in parent: " + JSON.stringify(parent) + " -- "+ JSON.stringify(child); 48 | return position; 49 | } 50 | 51 | function nodeToString(node) { 52 | let result; 53 | if (node.type === "Identifier") { 54 | result = "ID:" + node.name; 55 | } else if (node.type === "Literal") { 56 | result = "LIT:" + node.value; 57 | } else if (Array.isArray(node)) { 58 | result = "Array"; 59 | } else if (typeof node.type === "string") { 60 | result = node.type; 61 | } else { 62 | throw "Unexpected node type: " + JSON.stringify(node); 63 | } 64 | // TODO limit size 65 | return result; 66 | } 67 | 68 | const ast = acorn.parse("elems.push(2, 'aa')"); 69 | console.log(JSON.stringify(ast, 0, 2)); 70 | const ancestors= []; 71 | estraverse.traverse(ast, { 72 | enter:function(node, parent) { 73 | if (node.type === "Literal") { 74 | const positionInParent = positionIn(parent, node); 75 | const grandParent = ancestors[ancestors.length - 2]; 76 | const positionInGrandParent = positionIn(grandParent, parent); 77 | const siblings = getChildren(parent, node); 78 | const uncles = getChildren(grandParent, parent); // getUncles(grandParent, parent); 79 | const cousins = getAllChildren(uncles); 80 | const nephews = getAllChildren(siblings); 81 | console.log("\n"+JSON.stringify(node)); 82 | console.log("Parent : " + nodeToString(parent)); 83 | console.log(" Position : " + positionInParent); 84 | console.log("Grandparent: " + nodeToString((grandParent))); 85 | console.log(" Position : " + positionInGrandParent); 86 | console.log("Siblings : " + siblings.map(nodeToString)); 87 | console.log("Uncles : " + uncles.map(nodeToString)); 88 | console.log("Cousins : " + cousins.map(nodeToString)); 89 | console.log("Nephews : " + nephews.map(nodeToString)); 90 | } 91 | 92 | ancestors.push(node); 93 | }, 94 | leave:function(node, parent) { 95 | ancestors.pop(); 96 | } 97 | }); 98 | 99 | })(); -------------------------------------------------------------------------------- /DeepBugs/javascript/compareWarnings.js: -------------------------------------------------------------------------------- 1 | 
// Author: Michael Pradel 2 | // Compares warnings found with different variants of the approach. 3 | // arg1 = file with inspected warnings 4 | // arg2 = file with other warnings 5 | 6 | (function() { 7 | 8 | const fs = require("fs"); 9 | const process = require("process"); 10 | 11 | function Warning(score, location, extraInfo, isTruePositive) { 12 | this.score = score; 13 | this.location = location; 14 | this.extraInfo = extraInfo; 15 | this.isTruePositive = isTruePositive; 16 | } 17 | 18 | Warning.prototype.equals = function(other) { 19 | return this.location === other.location && this.extraInfo === other.extraInfo; 20 | }; 21 | 22 | function readWarnings(path) { 23 | const result = [] 24 | let allLines = fs.readFileSync(path, {encoding:"utf8"}); 25 | allLines = allLines.split("\n"); 26 | for (let i = 0; i < allLines.length; i++) { 27 | const line = allLines[i]; 28 | const entries = line.split(" | "); 29 | let extraInfo, isTruePositive; 30 | if (entries[entries.length - 2] === "y" || entries[entries.length - 2] === "n") { 31 | // has been manually inspected and classified 32 | extraInfo = entries.slice(2, entries.length - 2).join(" | "); 33 | isTruePositive = entries[entries.length - 2]; 34 | } else { 35 | extraInfo = entries.slice(2).join(" | "); 36 | } 37 | const warning = new Warning(entries[0], entries[1], extraInfo, isTruePositive); 38 | result.push(warning); 39 | } 40 | return result; 41 | } 42 | 43 | const args = process.argv.slice(2); 44 | const inspectedWarnings = readWarnings(args[0]); 45 | const otherWarnings = readWarnings(args[1]); 46 | 47 | for (let i = 0; i < inspectedWarnings.length; i++) { 48 | const inspectedWarning = inspectedWarnings[i]; 49 | const classification = inspectedWarning.isTruePositive === "y" ? "TP" : "FP"; 50 | 51 | let found = false; 52 | for (let j = 0; j < otherWarnings.length; j++) { 53 | const otherWarning = otherWarnings[j]; 54 | if (inspectedWarning.equals(otherWarning)) { 55 | found = true; 56 | console.log(classification, " with score ", otherWarning.score, inspectedWarning.location); 57 | break; 58 | } 59 | } 60 | if (!found) { 61 | console.log(classification, "not found", inspectedWarning.location); 62 | } 63 | 64 | 65 | 66 | } 67 | 68 | 69 | })(); -------------------------------------------------------------------------------- /DeepBugs/javascript/extractorOfAssignments.js: -------------------------------------------------------------------------------- 1 | // Author: Michael Pradel 2 | 3 | (function() { 4 | 5 | const fs = require("fs"); 6 | const estraverse = require("estraverse"); 7 | const util = require("./jsExtractionUtil"); 8 | 9 | function visitCode(ast, locationMap, path, allAssignments, fileID) { 10 | console.log("Reading " + path); 11 | 12 | let totalAssignments = 0; 13 | let totalAssignmentsConsidered = 0; 14 | 15 | const assignments = []; 16 | const code = fs.readFileSync(path); 17 | estraverse.traverse(ast, { 18 | enter:function(node, parent) { 19 | let lhs, rhs; 20 | if (node && node.type === "AssignmentExpression") { 21 | totalAssignments += 1; 22 | if (node.left.type === "Identifier") { 23 | lhs = node.left; 24 | rhs = node.right; 25 | } else if (node && node.type === "VariableDeclarator" && node.init !== null) { 26 | lhs = node.id; 27 | rhs = node.init; 28 | } else return; 29 | 30 | const nameOfLHS = util.getNameOfASTNode(lhs); 31 | const nameOfRHS = util.getNameOfASTNode(rhs); 32 | if (typeof nameOfLHS !== "undefined" && typeof nameOfRHS !== "undefined") { 33 | let locString = path + " : " + node.loc.start.line + " - " + 
node.loc.end.line; 34 | let typeOfRHS = util.getTypeOfASTNode(rhs); 35 | const assignment = { 36 | lhs:nameOfLHS, 37 | rhs:nameOfRHS, 38 | rhsType:typeOfRHS, 39 | src:locString 40 | }; 41 | totalAssignmentsConsidered += 1; 42 | assignments.push(assignment); 43 | } 44 | } 45 | } 46 | }); 47 | allAssignments.push(...assignments); 48 | console.log("Added assignments. Total now: " + allAssignments.length); 49 | console.log("Considered assignments: " + totalAssignmentsConsidered + " out of " + totalAssignments + " (" + Math.round(100 * totalAssignmentsConsidered / totalAssignments) + "%)"); 50 | } 51 | 52 | module.exports.visitCode = visitCode; 53 | 54 | })(); 55 | 56 | -------------------------------------------------------------------------------- /DeepBugs/javascript/extractorOfAssignments2.js: -------------------------------------------------------------------------------- 1 | // Author: Michael Pradel 2 | 3 | (function () { 4 | 5 | const fs = require("fs"); 6 | const estraverse = require("estraverse"); 7 | const util = require("./jsExtractionUtil"); 8 | 9 | const identifierContextWindowSize = 20; // assumption: even number 10 | 11 | function visitCode(ast, locationMap, path, allAssignments, fileID) { 12 | console.log("Reading " + path); 13 | 14 | let totalAssignments = 0; 15 | let totalAssignmentsConsidered = 0; 16 | 17 | const pastIdentifiers = []; 18 | const unfinishedAssignments = []; 19 | const parentStack = []; 20 | const assignments = []; 21 | estraverse.traverse(ast, { 22 | enter: function (node, parent) { 23 | if (parent) parentStack.push(parent); 24 | let extract = false; 25 | let cur_node_line_num = node.loc.start.line + "-" + node.loc.end.line; 26 | // console.log(fileID, cur_node_line_num); 27 | if (fileID === null) { 28 | extract = true; 29 | } else if (cur_node_line_num === fileID) { 30 | extract = true; 31 | } 32 | if (extract && node && node.type === "Identifier") { 33 | pastIdentifiers.push("ID:" + node.name); 34 | 35 | // finalize assignments with now-available postIdentifierContext 36 | let nbFinished = 0; 37 | for (let i = 0; i < unfinishedAssignments.length; i++) { 38 | const unfinishedAssignment = unfinishedAssignments[i]; 39 | if (pastIdentifiers.length >= unfinishedAssignment.identifierIndex + identifierContextWindowSize / 2) { 40 | const postIdentifierContext = pastIdentifiers.slice(unfinishedAssignment.identifierIndex, unfinishedAssignment.identifierIndex + identifierContextWindowSize / 2); 41 | unfinishedAssignment.assignment.context = unfinishedAssignment.assignment.context.concat(postIdentifierContext); 42 | totalAssignmentsConsidered += 1; 43 | assignments.push(unfinishedAssignment.assignment); 44 | nbFinished++; 45 | } else { 46 | break; 47 | } 48 | } 49 | unfinishedAssignments.splice(0, nbFinished); 50 | } 51 | 52 | let lhs, rhs; 53 | // let selectedNodeTypes = ['ExpressionStatement', 'VariableDeclaration', 'VariableDeclarator', 54 | // 'AssignmentExpression', 'AssignmentPattern']; 55 | if (extract && node && node.type === "AssignmentExpression") { 56 | totalAssignments += 1; 57 | if (node.left.type === "Identifier") { 58 | lhs = node.left; 59 | rhs = node.right; 60 | } else if (node && node.type === "VariableDeclarator" && node.init !== null) { 61 | lhs = node.id; 62 | rhs = node.init; 63 | } else return; 64 | // TODO: consider assignments to properties (and use property name as rhs) 65 | 66 | const nameOfLHS = util.getNameOfASTNode(lhs); 67 | const nameOfRHS = util.getNameOfASTNode(rhs); 68 | const parentName = parent.type; 69 | const grandParentName = 
parentStack.length > 1 ? parentStack[parentStack.length - 2].type : ""; 70 | const preIdentifierContext = pastIdentifiers.slice(Math.max(0, pastIdentifiers.length - identifierContextWindowSize / 2), pastIdentifiers.length); 71 | while (preIdentifierContext.length < identifierContextWindowSize / 2) { 72 | preIdentifierContext.unshift(""); 73 | } 74 | if (typeof nameOfLHS !== "undefined" && typeof nameOfRHS !== "undefined") { 75 | let locString = path + " : " + node.loc.start.line + " - " + node.loc.end.line; 76 | let typeOfRHS = util.getTypeOfASTNode(rhs); 77 | const assignment = { 78 | lhs: nameOfLHS, 79 | rhs: nameOfRHS, 80 | rhsType: typeOfRHS, 81 | parent: parentName, 82 | grandParent: grandParentName, 83 | context: preIdentifierContext, // postIdentifierContext will get appended later 84 | src: locString, 85 | range: [node.start, node.end] 86 | }; 87 | unfinishedAssignments.push({assignment: assignment, identifierIndex: pastIdentifiers.length}); 88 | } 89 | } 90 | }, 91 | leave: function (node, parent) { 92 | if (parent) parentStack.pop(); 93 | } 94 | }); 95 | 96 | for (let i = 0; i < unfinishedAssignments.length; i++) { 97 | const unfinishedAssignment = unfinishedAssignments[i]; 98 | const postIdentifierContext = pastIdentifiers.slice(unfinishedAssignment.identifierIndex, unfinishedAssignment.identifierIndex + identifierContextWindowSize / 2); 99 | while (postIdentifierContext.length < identifierContextWindowSize / 2) { 100 | postIdentifierContext.push(""); 101 | } 102 | unfinishedAssignment.assignment.context = unfinishedAssignment.assignment.context.concat(postIdentifierContext); 103 | totalAssignmentsConsidered += 1; 104 | assignments.push(unfinishedAssignment.assignment); 105 | } 106 | 107 | allAssignments.push(...assignments); 108 | console.log("Added assignments. Total now: " + allAssignments.length); 109 | console.log("Considered assignments: " + totalAssignmentsConsidered + " out of " + totalAssignments + " (" + Math.round(100 * totalAssignmentsConsidered / totalAssignments) + "%)"); 110 | } 111 | 112 | module.exports.visitCode = visitCode; 113 | 114 | })(); 115 | -------------------------------------------------------------------------------- /DeepBugs/javascript/extractorOfBinOps.js: -------------------------------------------------------------------------------- 1 | // Author: Michael Pradel 2 | 3 | (function () { 4 | 5 | const fs = require("fs"); 6 | const estraverse = require("estraverse"); 7 | const util = require("./jsExtractionUtil"); 8 | 9 | function visitCode(ast, locationMap, path, allBinOps, fileIDStr) { 10 | console.log("Reading " + path); 11 | 12 | let totalBinOps = 0; 13 | let totalBinOpsConsidered = 0; 14 | 15 | const parentStack = []; 16 | const binOps = []; 17 | let tokenID = 1; 18 | estraverse.traverse(ast, { 19 | enter: function (node, parent) { 20 | if (parent) parentStack.push(parent); 21 | 22 | let extract = false; 23 | 24 | if (fileIDStr === null) { 25 | extract = true; 26 | } else if ((node.loc.start.line + "-" + node.loc.end.line) === fileIDStr) { 27 | extract = true; 28 | } 29 | 30 | if (node.type === "BinaryExpression" && extract) { 31 | totalBinOps += 1; 32 | const leftName = util.getNameOfASTNode(node.left); 33 | const rightName = util.getNameOfASTNode(node.right); 34 | const leftType = util.getTypeOfASTNode(node.left); 35 | const rightType = util.getTypeOfASTNode(node.right); 36 | const parentName = parent.type; 37 | const grandParentName = parentStack.length > 1 ? 
parentStack[parentStack.length - 2].type : ""; 38 | if (typeof leftName !== "undefined" && typeof rightName !== "undefined") { 39 | let locString = path + " : " + node.loc.start.line + " - " + node.loc.end.line; 40 | 41 | const binOp = { 42 | left: leftName, 43 | right: rightName, 44 | op: node.operator, 45 | leftType: leftType, 46 | rightType: rightType, 47 | parent: parentName, 48 | grandParent: grandParentName, 49 | src: locString, 50 | range: [node.start, node.end] 51 | }; 52 | binOps.push(binOp); 53 | totalBinOpsConsidered += 1; 54 | tokenID += 1; 55 | } 56 | } 57 | }, 58 | leave: function (node, parent) { 59 | if (parent) parentStack.pop(); 60 | } 61 | }); 62 | allBinOps.push(...binOps); 63 | console.log("Added binary operations. Total now: " + allBinOps.length); 64 | console.log("Considered binary operations: " + totalBinOpsConsidered + " out of " + totalBinOps + " (" + Math.round(100 * totalBinOpsConsidered / totalBinOps) + "%)"); 65 | } 66 | 67 | module.exports.visitCode = visitCode; 68 | 69 | })(); 70 | -------------------------------------------------------------------------------- /DeepBugs/javascript/extractorOfCalls.js: -------------------------------------------------------------------------------- 1 | // Author: Michael Pradel 2 | 3 | (function() { 4 | 5 | const fs = require("fs"); 6 | const estraverse = require("estraverse"); 7 | const util = require("./jsExtractionUtil"); 8 | 9 | // configuration parameters 10 | const minArgs = 2; 11 | const maxLengthOfCalleeAndArguments = 200; // maximum number of characters 12 | 13 | function visitCode(ast, locationMap, path, allCalls, fileID) { 14 | console.log("Reading " + path); 15 | 16 | // first pass through AST: visit each fct. def. and extract formal parameter names 17 | const functionToParameters = {}; // string to array of strings 18 | let functionCounter = 0; 19 | estraverse.traverse(ast, { 20 | enter:function(node, parent) { 21 | if (node.type === "FunctionDeclaration" || node.type === "FunctionExpression") { 22 | functionCounter++; 23 | if (node.params.length > 1) { 24 | let functionName = util.getNameOfFunction(node, parent); 25 | if (functionName) { 26 | if (!functionToParameters.hasOwnProperty(functionName)) { 27 | const parameterNames = []; 28 | for (let i = 0; i < node.params.length; i++) { 29 | const parameter = node.params[i]; 30 | parameterNames.push("ID:"+parameter.name); 31 | } 32 | functionToParameters[functionName] = parameterNames; 33 | } // heuristically use only the first declaration in this file 34 | } 35 | } 36 | } 37 | } 38 | }); 39 | // console.log("Functions with parameter names: "+Object.keys(functionToParameters).length+" of "+functionCounter); 40 | 41 | // second pass through AST: visit each call site and extract call data 42 | const calls = []; 43 | const parentStack = []; 44 | let callCounter = 0; 45 | let callWithParameterNameCounter = 0; 46 | estraverse.traverse(ast, { 47 | enter:function(node, parent) { 48 | if (parent) parentStack.push(parent); 49 | if (node && node.type === "CallExpression") { 50 | if (node.arguments.length < minArgs) return; 51 | 52 | let calleeString; 53 | let baseString; 54 | let calleeNode; 55 | if (node.callee.type === "MemberExpression") { 56 | if (node.callee.computed === false) { 57 | calleeNode = node.callee.property; 58 | calleeString = util.getNameOfASTNode(calleeNode); 59 | baseString = util.getNameOfASTNode(node.callee.object); 60 | } else { 61 | calleeNode = node.callee.object; 62 | calleeString = util.getNameOfASTNode(calleeNode); 63 | baseString = ""; 64 | } 65 | 
} else { 66 | calleeNode = node.callee; 67 | calleeString = util.getNameOfASTNode(calleeNode); 68 | baseString = ""; 69 | } 70 | 71 | if (typeof calleeString === "undefined" || typeof baseString === "undefined") return; 72 | 73 | const calleeLocation = fileID + util.getLocationOfASTNode(calleeNode, locationMap); 74 | 75 | const argumentStrings = []; 76 | const argumentLocations = []; 77 | const argumentTypes = []; 78 | for (let i = 0; i < node.arguments.length; i++) { 79 | const argument = node.arguments[i]; 80 | const argumentString = util.getNameOfASTNode(argument); 81 | const argumentLocation = fileID + util.getLocationOfASTNode(argument, locationMap); 82 | const argumentType = util.getTypeOfASTNode(argument); 83 | if (typeof argumentString === "undefined") return; 84 | argumentStrings.push(argumentString.slice(0, maxLengthOfCalleeAndArguments)); 85 | argumentLocations.push(argumentLocation); 86 | argumentTypes.push(argumentType); 87 | } 88 | 89 | const parameters = []; 90 | let foundParameter = false; 91 | for (let i = 0; i < argumentStrings.length; i++) { 92 | let parameter = ""; // use empty parameter name if nothing else known 93 | if (functionToParameters.hasOwnProperty(calleeString)) { 94 | if (i < functionToParameters[calleeString].length) { 95 | parameter = functionToParameters[calleeString][i]; 96 | foundParameter = true; 97 | } 98 | } 99 | parameters.push(parameter); 100 | } 101 | callCounter++; 102 | if (foundParameter) callWithParameterNameCounter++; 103 | 104 | calleeString = calleeString.slice(0, maxLengthOfCalleeAndArguments); 105 | baseString = baseString.slice(0, maxLengthOfCalleeAndArguments); 106 | 107 | let locString = path + " : " + node.loc.start.line + " - " + node.loc.end.line; 108 | if (argumentStrings.length >= minArgs) { 109 | calls.push({ 110 | base:baseString, 111 | callee:calleeString, 112 | calleeLocation:calleeLocation, 113 | arguments:argumentStrings, 114 | argumentLocations:argumentLocations, 115 | argumentTypes:argumentTypes, 116 | parameters:parameters, 117 | src:locString, 118 | filename:path 119 | }); 120 | } 121 | } 122 | }, 123 | leave:function(node, parent) { 124 | if (parent) parentStack.pop(); 125 | } 126 | }); 127 | allCalls.push(...calls); 128 | console.log("Added calls. Total now: " + allCalls.length); 129 | 130 | // console.log("Calls with resolved parameter name: " + callWithParameterNameCounter+" of "+callCounter); 131 | } 132 | 133 | module.exports.visitCode = visitCode; 134 | 135 | })(); 136 | -------------------------------------------------------------------------------- /DeepBugs/javascript/extractorOfCallsMissingArg.js: -------------------------------------------------------------------------------- 1 | // Author: Michael Pradel 2 | 3 | (function() { 4 | 5 | const fs = require("fs"); 6 | const estraverse = require("estraverse"); 7 | const util = require("./jsExtractionUtil"); 8 | 9 | // configuration parameters 10 | const maxLengthOfCalleeAndArguments = 200; // maximum number of characters 11 | 12 | function visitCode(ast, locationMap, path, allCalls, fileID) { 13 | console.log("Reading " + path); 14 | 15 | // first pass through AST: visit each fct. def. 
and extract formal parameter names 16 | const functionToParameters = {}; // string to array of strings 17 | let functionCounter = 0; 18 | estraverse.traverse(ast, { 19 | enter:function(node, parent) { 20 | if (node.type === "FunctionDeclaration" || node.type === "FunctionExpression") { 21 | functionCounter++; 22 | if (node.params.length > 1) { 23 | let functionName = util.getNameOfFunction(node, parent); 24 | if (functionName) { 25 | if (!functionToParameters.hasOwnProperty(functionName)) { 26 | const parameterNames = []; 27 | for (let i = 0; i < node.params.length; i++) { 28 | const parameter = node.params[i]; 29 | parameterNames.push("ID:"+parameter.name); 30 | } 31 | functionToParameters[functionName] = parameterNames; 32 | } // heuristically use only the first declaration in this file 33 | } 34 | } 35 | } 36 | } 37 | }); 38 | // console.log("Functions with parameter names: "+Object.keys(functionToParameters).length+" of "+functionCounter); 39 | 40 | // second pass through AST: visit each call site and extract call data 41 | const calls = []; 42 | const parentStack = []; 43 | let callCounter = 0; 44 | let callWithParameterNameCounter = 0; 45 | estraverse.traverse(ast, { 46 | enter:function(node, parent) { 47 | if (parent) parentStack.push(parent); 48 | if (node && node.type === "CallExpression") { 49 | if (node.arguments.length === 0) return; 50 | 51 | let calleeString; 52 | let baseString; 53 | let calleeNode; 54 | if (node.callee.type === "MemberExpression") { 55 | if (node.callee.computed === false) { 56 | calleeNode = node.callee.property; 57 | calleeString = util.getNameOfASTNode(calleeNode); 58 | baseString = util.getNameOfASTNode(node.callee.object); 59 | } else { 60 | calleeNode = node.callee.object; 61 | calleeString = util.getNameOfASTNode(calleeNode); 62 | baseString = ""; 63 | } 64 | } else { 65 | calleeNode = node.callee; 66 | calleeString = util.getNameOfASTNode(calleeNode); 67 | baseString = ""; 68 | } 69 | 70 | if (typeof calleeString === "undefined" || typeof baseString === "undefined") return; 71 | 72 | const calleeLocation = fileID + util.getLocationOfASTNode(calleeNode, locationMap); 73 | 74 | const argumentStrings = []; 75 | const argumentLocations = []; 76 | const argumentTypes = []; 77 | for (let i = 0; i < node.arguments.length; i++) { 78 | const argument = node.arguments[i]; 79 | const argumentString = util.getNameOfASTNode(argument); 80 | const argumentLocation = fileID + util.getLocationOfASTNode(argument, locationMap); 81 | const argumentType = util.getTypeOfASTNode(argument); 82 | if (typeof argumentString === "undefined") return; 83 | argumentStrings.push(argumentString.slice(0, maxLengthOfCalleeAndArguments)); 84 | argumentLocations.push(argumentLocation); 85 | argumentTypes.push(argumentType); 86 | } 87 | 88 | const parameters = []; 89 | let foundParameter = false; 90 | for (let i = 0; i < argumentStrings.length; i++) { 91 | let parameter = ""; // use empty parameter name if nothing else known 92 | if (functionToParameters.hasOwnProperty(calleeString)) { 93 | if (i < functionToParameters[calleeString].length) { 94 | parameter = functionToParameters[calleeString][i]; 95 | foundParameter = true; 96 | } 97 | } 98 | parameters.push(parameter); 99 | } 100 | callCounter++; 101 | if (foundParameter) callWithParameterNameCounter++; 102 | 103 | calleeString = calleeString.slice(0, maxLengthOfCalleeAndArguments); 104 | baseString = baseString.slice(0, maxLengthOfCalleeAndArguments); 105 | 106 | let locString = path + " : " + node.loc.start.line + " - " + 
node.loc.end.line; 107 | if (argumentStrings.length >= 1) { 108 | calls.push({ 109 | base:baseString, 110 | callee:calleeString, 111 | calleeLocation:calleeLocation, 112 | arguments:argumentStrings, 113 | argumentLocations:argumentLocations, 114 | argumentTypes:argumentTypes, 115 | parameters:parameters, 116 | src:locString, 117 | filename:path 118 | }); 119 | } 120 | } 121 | }, 122 | leave:function(node, parent) { 123 | if (parent) parentStack.pop(); 124 | } 125 | }); 126 | allCalls.push(...calls); 127 | console.log("Added calls. Total now: " + allCalls.length); 128 | 129 | // console.log("Calls with resolved parameter name: " + callWithParameterNameCounter+" of "+callCounter); 130 | } 131 | 132 | module.exports.visitCode = visitCode; 133 | 134 | })(); 135 | -------------------------------------------------------------------------------- /DeepBugs/javascript/extractorOfIdsLitsWithASTFamily.js: -------------------------------------------------------------------------------- 1 | // Author: Michael Pradel 2 | 3 | (function() { 4 | 5 | const fs = require("fs"); 6 | const estraverse = require("estraverse"); 7 | const util = require("./jsExtractionUtil"); 8 | 9 | function getChildren(parent, ignoredChild) { 10 | const children = []; 11 | for (const prop in parent) { 12 | if (parent.hasOwnProperty(prop) && prop !== "regex" && prop !== "loc") { 13 | const child = parent[prop]; 14 | if (Array.isArray(child)) { 15 | for (let i = 0; i < child.length; i++) { 16 | const actualChild = child[i]; 17 | if (actualChild !== ignoredChild && !(child instanceof RegExp) && actualChild !== null) { 18 | children.push(actualChild); 19 | } 20 | } 21 | } else if (typeof child === "object" && child !== null) { 22 | if (child !== ignoredChild && !(child instanceof RegExp)) { 23 | children.push(child); 24 | } 25 | } 26 | } 27 | } 28 | return children; 29 | } 30 | 31 | function getAllChildren(parents, ignoredChild) { 32 | const allChildren = []; 33 | for (let i = 0; i < parents.length; i++) { 34 | const parent = parents[i]; 35 | const newChildren = getChildren(parent); 36 | for (let j = 0; j < newChildren.length; j++) { 37 | const newChild = newChildren[j]; 38 | if (newChild !== ignoredChild) { 39 | allChildren.push(newChild) 40 | } 41 | } 42 | } 43 | return allChildren; 44 | } 45 | 46 | function positionIn(parent, child) { 47 | const position = getChildren(parent).indexOf(child); 48 | if (position === -1) throw "Could not find child in parent: " + JSON.stringify(parent) + " -- "+ JSON.stringify(child); 49 | return position; 50 | } 51 | 52 | function visitCode(ast, locationMap, path, allIdsLits, fileID) { 53 | console.log("Reading " + path); 54 | const ancestors= []; 55 | estraverse.traverse(ast, { 56 | enter:function(node, parent) { 57 | if (node.type === "Identifier" || node.type === "Literal") { 58 | const positionInParent = positionIn(parent, node); 59 | const grandParent = ancestors[ancestors.length - 2]; 60 | const positionInGrandParent = positionIn(grandParent, parent); 61 | const siblings = getChildren(parent, node); 62 | const uncles = getChildren(grandParent, parent); // getUncles(grandParent, parent); 63 | const cousins = getAllChildren(uncles); 64 | const nephews = getAllChildren(siblings); 65 | 66 | const idLit = { 67 | token: util.nodeToString(node), 68 | context: { 69 | parent: util.nodeToString(parent), 70 | positionInParent: positionInParent, 71 | grandParent: util.nodeToString(grandParent), 72 | positionInGrandParent: positionInGrandParent, 73 | siblings: Array.from(new 
Set(siblings.map(util.nodeToString))), 74 | uncles: Array.from(new Set(uncles.map(util.nodeToString))), 75 | cousins: Array.from(new Set(cousins.map(util.nodeToString))), 76 | nephews: Array.from(new Set(nephews.map(util.nodeToString))) 77 | }, 78 | location: fileID + util.getLocationOfASTNode(node, locationMap) 79 | }; 80 | 81 | allIdsLits.push(idLit); 82 | } 83 | 84 | ancestors.push(node); 85 | }, 86 | leave:function(node, parent) { 87 | ancestors.pop(); 88 | } 89 | }); 90 | } 91 | 92 | module.exports.visitCode = visitCode; 93 | 94 | })(); 95 | -------------------------------------------------------------------------------- /DeepBugs/javascript/extractorOfIdsLitsWithIds.js: -------------------------------------------------------------------------------- 1 | // Author: Michael Pradel 2 | 3 | (function() { 4 | 5 | const fs = require("fs"); 6 | const util = require("./jsExtractionUtil"); 7 | 8 | const tokenContextLength = 20; // must be an even number 9 | 10 | function getContext(tokens, idx, targetLength) { 11 | let preContext = []; 12 | let currIdx = idx - 1; 13 | while (currIdx >= 0 && preContext.length < targetLength) { 14 | // go backward in token sequence and add identifiers to preContext 15 | let currToken = tokens[currIdx]; 16 | if (util.isId(currToken)) preContext = [currToken].concat(preContext); 17 | currIdx--; 18 | } 19 | 20 | let postContext = []; 21 | currIdx = idx + 1; 22 | while (currIdx < tokens.length && postContext.length < targetLength) { 23 | // go forward in token sequence and add identifiers to postContext 24 | let currToken = tokens[currIdx]; 25 | if (util.isId(currToken)) postContext.push(currToken); 26 | currIdx++; 27 | } 28 | 29 | return [preContext, postContext]; 30 | } 31 | 32 | function visitFile(path, allIdsLits) { 33 | console.log("Reading " + path); 34 | 35 | const code = fs.readFileSync(path); 36 | const tokens = util.getTokens(code); 37 | const k = tokenContextLength / 2; 38 | if (tokens) { 39 | for (let i = 0; i < tokens.length; i++) { 40 | const token = tokens[i]; 41 | if (util.isIdLit(token)) { 42 | let [preContext, postContext] = getContext(tokens, i, k); 43 | preContext = util.tokensToStrings(preContext); 44 | while (preContext.length !== k) preContext = [""].concat(preContext); 45 | postContext = util.tokensToStrings(postContext); 46 | while (postContext.length !== k) postContext.push(""); 47 | const idLit = { 48 | token: util.tokenToString(token), 49 | context: preContext.concat(postContext) 50 | }; 51 | allIdsLits.push(idLit); 52 | } 53 | } 54 | } else { 55 | console.log("Ignoring file with parse errors: " + path); 56 | } 57 | } 58 | 59 | module.exports.visitFile = visitFile; 60 | 61 | })(); -------------------------------------------------------------------------------- /DeepBugs/javascript/extractorOfIdsLitsWithTokens.js: -------------------------------------------------------------------------------- 1 | // Author: Michael Pradel 2 | 3 | (function() { 4 | 5 | const fs = require("fs"); 6 | const util = require("./jsExtractionUtil"); 7 | 8 | // configuration parameters 9 | const tokenContextLength = 20; // must be an even number 10 | 11 | function visitFile(path, allIdsLits) { 12 | console.log("Reading " + path); 13 | 14 | const code = fs.readFileSync(path); 15 | const tokens = util.getTokens(code); 16 | const k = tokenContextLength / 2; 17 | if (tokens) { 18 | for (let i = 0; i < tokens.length; i++) { 19 | const token = tokens[i]; 20 | if (util.isIdLit(token)) { 21 | let preContext = tokens.slice(Math.max(0, i - k), i); 22 | preContext = 
util.tokensToStrings(preContext); 23 | while (preContext.length !== k) preContext = [""].concat(preContext); 24 | let postContext = tokens.slice(i + 1, i + k); 25 | postContext = util.tokensToStrings(postContext); 26 | while (postContext.length !== k) postContext.push(""); 27 | const idLit = { 28 | token: util.tokenToString(token), 29 | context: preContext.concat(postContext) 30 | }; 31 | allIdsLits.push(idLit); 32 | } 33 | } 34 | } else { 35 | console.log("Ignoring file with parse errors: " + path); 36 | } 37 | } 38 | 39 | module.exports.visitFile = visitFile; 40 | 41 | })(); -------------------------------------------------------------------------------- /DeepBugs/javascript/extractorOfTokens.js: -------------------------------------------------------------------------------- 1 | // Author: Michael Pradel 2 | 3 | (function() { 4 | 5 | const fs = require("fs"); 6 | const util = require("./jsExtractionUtil"); 7 | 8 | function visitFile(path, allTokenSequences) { 9 | console.log("Reading " + path); 10 | 11 | const assignments = []; 12 | const code = fs.readFileSync(path); 13 | const tokens = util.getTokens(code); 14 | if (tokens) { 15 | allTokenSequences.push(util.tokensToStrings(tokens)); 16 | } else { 17 | console.log("Ignoring file with parse errors: " + path); 18 | } 19 | } 20 | 21 | module.exports.visitFile = visitFile; 22 | 23 | })(); -------------------------------------------------------------------------------- /DeepBugs/javascript/jsExtractionUtil.js: -------------------------------------------------------------------------------- 1 | // Author: Michael Pradel 2 | 3 | (function() { 4 | 5 | const acorn = require("acorn"); 6 | 7 | const maxLengthOfTokens = 200; 8 | 9 | function getTokens(code) { 10 | try { 11 | const tokenizer = acorn.tokenizer(code, {locations:true}); 12 | const tokens = []; 13 | let nextToken = tokenizer.getToken(); 14 | while (nextToken.type !== acorn.tokTypes.eof) { 15 | tokens.push(nextToken); 16 | nextToken = tokenizer.getToken(); 17 | } 18 | return tokens; 19 | } catch (e) { 20 | } 21 | } 22 | 23 | function getAST(code, noLocations) { 24 | try { 25 | if (noLocations) return acorn.parse(code); 26 | else return acorn.parse(code, {locations:true}); 27 | } catch (e) { 28 | //console.log(e); 29 | } 30 | } 31 | 32 | function getNameOfASTNode(node) { 33 | if (node.type === "Identifier") return "ID:" + node.name; 34 | else if (node.type === "CallExpression") return getNameOfASTNode(node.callee); 35 | else if (node.type === "MemberExpression" && node.computed === true) return getNameOfASTNode(node.object); 36 | else if (node.type === "MemberExpression" && node.computed === false) return getNameOfASTNode(node.property); 37 | else if (node.type === "Literal") return "LIT:" + String(node.value); 38 | else if (node.type === "ThisExpression") return "LIT:this"; 39 | else if (node.type === "UpdateExpression") return getNameOfASTNode(node.argument); 40 | } 41 | 42 | function getKindOfASTNode(node) { 43 | if (node.type === "Identifier") return "ID"; 44 | else if (node.type === "CallExpression") return getKindOfASTNode(node.callee); 45 | else if (node.type === "MemberExpression" && node.computed === true) return getKindOfASTNode(node.object); 46 | else if (node.type === "MemberExpression" && node.computed === false) return getKindOfASTNode(node.property); 47 | else if (node.type === "Literal") return "LIT"; 48 | else if (node.type === "ThisExpression") return "LIT"; 49 | } 50 | 51 | function getTypeOfASTNode(node) { 52 | if (node.type === "Literal") { 53 | if 
(node.hasOwnProperty("regex")) return "regex"; 54 | else if (node.value === null) return "null"; 55 | else return typeof node.value; 56 | } else if (node.type === "ThisExpression") return "object"; 57 | else if (node.type === "Identifier" && node.name === "undefined") return "undefined"; 58 | else return "unknown"; 59 | } 60 | 61 | function nodeToString(node) { 62 | let result; 63 | if (node.type === "Identifier") { 64 | result = "ID:" + node.name; 65 | } else if (node.type === "Literal") { 66 | result = "LIT:" + node.value; 67 | } else if (Array.isArray(node)) { 68 | result = "Array"; 69 | } else if (typeof node.type === "string") { 70 | result = node.type; 71 | } else { 72 | throw "Unexpected node type: " + JSON.stringify(node); 73 | } 74 | return result.slice(0, maxLengthOfTokens); 75 | } 76 | 77 | const identifierTokenType = "name"; 78 | const literalTokenTypes = ["num", "regexp", "string", "null", "true", "false"]; 79 | 80 | function tokenToString(t) { 81 | let result; 82 | if (t.type.label === identifierTokenType) { 83 | result = "ID:"; 84 | } else if (literalTokenTypes.indexOf(t.type.label) != -1) { 85 | result = "LIT:"; 86 | } else { 87 | result = "STD:"; 88 | } 89 | 90 | if (typeof t.value === "undefined") result += t.type.label; 91 | else if (typeof t.value === "string" || typeof t.value === "number") result += String(t.value); 92 | else if (t.type.label === "regexp") result += String(t.value.value); 93 | else { 94 | console.log("Unexpected token:\n" + JSON.stringify(t, 0, 2)); 95 | } 96 | return result.slice(0, maxLengthOfTokens); 97 | } 98 | 99 | function tokensToStrings(tokens) { 100 | return tokens.map(tokenToString); 101 | } 102 | 103 | function isIdLit(token) { 104 | return isId(token) || isLit(token) 105 | } 106 | 107 | function isId(token) { 108 | return token.type.label === "name"; 109 | } 110 | 111 | function isLit(token) { 112 | return token.type.label === "num" || token.type.label === "regexp" || token.type.label === "string" 113 | } 114 | 115 | function computeLocationMap(tokens) { 116 | // maps line-column-based location to character-based location 117 | const lcLocationToCharLocation = {}; 118 | for (let i = 0; i < tokens.length; i++) { 119 | const t = tokens[i]; 120 | const lcStartLocation = t.loc.start.line + ":" + t.loc.start.column; 121 | const lcEndLocation = t.loc.end.line + ":" + t.loc.end.column; 122 | lcLocationToCharLocation[lcStartLocation] = t.start; 123 | lcLocationToCharLocation[lcEndLocation] = t.end; 124 | } 125 | return lcLocationToCharLocation; 126 | } 127 | 128 | function getLocationOfASTNode(node, lcLocationToCharLocation) { 129 | const lcStartLocation = node.loc.start.line + ":" + node.loc.start.column; 130 | const lcEndLocation = node.loc.end.line + ":" + node.loc.end.column; 131 | const start = lcLocationToCharLocation[lcStartLocation]; 132 | const end = lcLocationToCharLocation[lcEndLocation]; 133 | const diff = end-start; 134 | return nbToPaddedStr(start, 6) + nbToPaddedStr(diff, 4); 135 | } 136 | 137 | function nbToPaddedStr(nb, length) { 138 | let str = String(nb); 139 | while (str.length < length) { 140 | str = "0" + str; 141 | } 142 | return str; 143 | } 144 | 145 | function getNameOfFunction(functionNode, parentNode) { 146 | if (functionNode.id && functionNode.id.name) return "ID:"+functionNode.id.name; 147 | if (parentNode.type === "AssignmentExpression") { 148 | if (parentNode.left.type === "Identifier") return "ID:"+parentNode.left.name; 149 | if (parentNode.left.type === "MemberExpression" && 150 | parentNode.left.property.type === 
"Identifier") return "ID:"+parentNode.left.property.name; 151 | } 152 | if (parentNode.type === "VariableDeclarator") { 153 | if (parentNode.id.type === "Identifier") return "ID:"+parentNode.id.name; 154 | } 155 | if (parentNode.type === "Property") { 156 | if (parentNode.key.type === "Identifier") return "ID:"+parentNode.key.name; 157 | } 158 | } 159 | 160 | module.exports.getTokens = getTokens; 161 | module.exports.getAST = getAST; 162 | module.exports.getNameOfASTNode = getNameOfASTNode; 163 | module.exports.getKindOfASTNode = getKindOfASTNode; 164 | module.exports.getTypeOfASTNode = getTypeOfASTNode; 165 | module.exports.nodeToString = nodeToString; 166 | module.exports.tokenToString = tokenToString; 167 | module.exports.tokensToStrings = tokensToStrings; 168 | module.exports.isId = isId; 169 | module.exports.isLit = isLit; 170 | module.exports.isIdLit = isIdLit; 171 | module.exports.nbToPaddedStr = nbToPaddedStr; 172 | module.exports.computeLocationMap = computeLocationMap; 173 | module.exports.getLocationOfASTNode = getLocationOfASTNode; 174 | module.exports.getNameOfFunction = getNameOfFunction; 175 | 176 | })(); 177 | -------------------------------------------------------------------------------- /DeepBugs/javascript/modifyArgumentOrder.js: -------------------------------------------------------------------------------- 1 | var fs = require("fs"); 2 | var esprima = require("esprima"); 3 | var estraverse = require("estraverse"); 4 | var escodegen = require("escodegen"); 5 | 6 | var rawJSFilesDir = "../data/js/programs_50/"; 7 | var formattedJSFilesDir = "../data/js/shuffled_arguments/orig/"; 8 | var modifiedJSFilesDir = "../data/js/shuffled_arguments/shuffled/"; 9 | 10 | function shuffle(a) { 11 | var j, x, i; 12 | for (i = a.length; i; i--) { 13 | j = Math.floor(Math.random() * i); 14 | x = a[i - 1]; 15 | a[i - 1] = a[j]; 16 | a[j] = x; 17 | } 18 | } 19 | 20 | function transformAST(ast) { 21 | estraverse.traverse(ast, { 22 | enter:function(node, parent) { 23 | if (node.type === "CallExpression") { 24 | shuffle(node.arguments); 25 | } 26 | } 27 | }); 28 | } 29 | 30 | var files = fs.readdirSync(rawJSFilesDir); 31 | for (var i = 0; i < files.length; i++) { 32 | var file = files[i]; 33 | if (file.endsWith(".js")) { 34 | var code = fs.readFileSync(rawJSFilesDir + "/" + file, {encoding:"utf8"}); 35 | var ast = esprima.parse(code); 36 | var formattedCode = escodegen.generate(ast); 37 | fs.writeFileSync(formattedJSFilesDir + file, formattedCode); 38 | transformAST(ast); 39 | var modifiedCode = escodegen.generate(ast); 40 | fs.writeFileSync(modifiedJSFilesDir + file, modifiedCode); 41 | } 42 | } -------------------------------------------------------------------------------- /DeepBugs/javascript/rb-nodeify.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Convert js files to something compatible with node 4.* 4 | 5 | rm -rf .*.js 6 | 7 | for file in *.js; do 8 | echo "Converting file ${file}..." 9 | new_file=".${file}" 10 | # Need to use strict for let bindings inside for loop etc. 11 | sed "0,/\(^[^/]\+\)/ s/\(^[^/]\+\)/\"use strict\";\n\n\1/" $file > $new_file 12 | # const {spawn} = req... 
is not allowed 13 | sed -i "s/const {spawn} = require('child_process')/const spawn = require('child_process').spawn/" "$new_file" 14 | # Replace all local requires 15 | sed -i "s/require(\".\//require(\".\/./" $new_file 16 | # Spread operator not allowed 17 | sed -i "s/\.\.\.\(.*\));/Object.assign({}, \1));/" $new_file 18 | 19 | done 20 | -------------------------------------------------------------------------------- /DeepBugs/javascript/seedBugs.js: -------------------------------------------------------------------------------- 1 | // Author: Michael Pradel 2 | 3 | (function() { 4 | 5 | var fs = require("fs"); 6 | var esprima = require("esprima"); 7 | var estraverse = require("estraverse"); 8 | var escodegen = require("escodegen"); 9 | var clone = require("clone"); 10 | 11 | var rawJSFilesDir = "../data/js/programs_50/"; 12 | var modifiedJSFilesDir = "../data/js/buggy_fcts"; 13 | 14 | var maxBugs = 100; 15 | 16 | function randElem(arr) { 17 | if (!arr || arr.length === 0) return undefined; 18 | return arr[Math.floor(Math.random() * (arr.length))]; 19 | } 20 | 21 | function randNb(maxInclusive) { 22 | return Math.floor(Math.random() * (maxInclusive + 1)); 23 | } 24 | 25 | function splitIntoFcts(ast) { 26 | var fcts = []; 27 | estraverse.traverse(ast, { 28 | enter:function(node, parent) { 29 | if (node.type === "FunctionDeclaration") { 30 | fcts.push(clone(node)); 31 | } 32 | } 33 | }); 34 | return fcts; 35 | } 36 | 37 | var expressionTypes = [ 38 | "ThisExpression", 39 | "ArrayExpression", 40 | "ObjectExpression", 41 | "FunctionExpression", 42 | "ArrowExpression", 43 | "SequenceExpression", 44 | "UnaryExpression", 45 | "BinaeyExpression", 46 | "AssignmentExpression", 47 | "UpdateExpression", 48 | "LogicalExpression", 49 | "ConditionalExpression", 50 | "NewExpression", 51 | "CallExpression", 52 | "MemberExpression", 53 | "ComprehensionExpression" 54 | ]; 55 | 56 | function modifyFunctionArgument(origAST) { 57 | // TODO: Use expressions from other programs? Otherwise, it always occurs twice. 
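// Seeds an "incorrect function argument" bug: working on a clone of the given
// function AST, it collects every expression node (per expressionTypes above)
// and every call expression, then picks a random call and overwrites one of its
// arguments (or appends one if the call has no arguments) with a clone of a
// randomly chosen other expression from the same function. Returns the mutated
// AST, or undefined when there is no call or too few expressions to reuse.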
58 | var ast = clone(origAST); 59 | var expressions = []; 60 | var callExpressions = []; 61 | estraverse.traverse(ast, { 62 | enter:function(node, parent) { 63 | if (expressionTypes.indexOf(node.type) != -1) { 64 | expressions.push(node); 65 | } 66 | if (node.type === "CallExpression") { 67 | callExpressions.push(node); 68 | } 69 | } 70 | }); 71 | if (callExpressions.length > 0 && expressions.length > 2) { 72 | var callExpression = randElem(callExpressions); 73 | var replacementExpression = undefined; 74 | while (!replacementExpression) { 75 | replacementExpression = randElem(expressions); 76 | if (replacementExpression === callExpression) replacementExpression = undefined; 77 | } 78 | replacementExpression = clone(replacementExpression); 79 | 80 | var args = callExpression.arguments; 81 | if (args.length === 0) { 82 | args.push(replacementExpression); 83 | } else { 84 | var idxToReplace = randNb(args.length - 1); 85 | args[idxToReplace] = replacementExpression; 86 | } 87 | return ast; 88 | } 89 | } 90 | 91 | var conditionalStmtTypes = [ 92 | "IfStatement", 93 | "WhileStatement", 94 | "DoWhileStatement", 95 | "ForStatement" 96 | ]; 97 | 98 | function modifyConditional(origAST) { 99 | var ast = clone(origAST); 100 | var conditionalStmts = []; 101 | var expressions = []; 102 | estraverse.traverse(ast, { 103 | enter:function(node, parent) { 104 | if (conditionalStmtTypes.indexOf(node.type) != -1) { 105 | conditionalStmts.push(node); 106 | } 107 | if (expressionTypes.indexOf(node.type) != -1) { 108 | expressions.push(node); 109 | } 110 | } 111 | }); 112 | 113 | if (conditionalStmts.length > 0) { 114 | var condStmt = randElem(conditionalStmts); 115 | var expr = condStmt.test; 116 | if (expr.type == "LogicalExpression") { 117 | condStmt.test = expr.left; 118 | return ast; 119 | } else { 120 | if (expressions.length > 0) { 121 | var replacementExpr = randElem(expressions); 122 | condStmt.test = replacementExpr; 123 | return ast; 124 | } 125 | } 126 | } 127 | } 128 | 129 | var transformerFcts = [modifyFunctionArgument, modifyConditional]; 130 | 131 | var files = fs.readdirSync(rawJSFilesDir); 132 | var origFcts = []; 133 | for (var i = 0; i < files.length; i++) { 134 | var file = files[i]; 135 | if (file.endsWith(".js")) { 136 | var content = fs.readFileSync(rawJSFilesDir + "/" + file, {encoding:"utf8"}); 137 | var origAST = esprima.parse(content); 138 | var fcts = splitIntoFcts(origAST); 139 | for (var j = 0; j < fcts.length; j++) { 140 | var f = fcts[j]; 141 | origFcts.push(f); 142 | } 143 | } 144 | } 145 | 146 | console.log("Functions: " + origFcts.length); 147 | 148 | var astPairs = []; 149 | for (var i = 0; i < origFcts.length; i++) { 150 | var origFct = origFcts[i]; 151 | var transformer = randElem(transformerFcts); 152 | var modifiedFct = transformer(origFct); 153 | if (modifiedFct) { 154 | astPairs.push([origFct, modifiedFct]); 155 | } 156 | } 157 | 158 | var fileCtr = 0; 159 | for (var i = 0; i < astPairs.length && fileCtr < maxBugs; i++) { 160 | fileCtr += 1; 161 | var astPair = astPairs[i]; 162 | var origCode = escodegen.generate(astPair[0]); 163 | fs.writeFileSync(modifiedJSFilesDir + "/orig/fct" + fileCtr + ".js", origCode); 164 | var modifiedCode = escodegen.generate(astPair[1]); 165 | fs.writeFileSync(modifiedJSFilesDir + "/buggy/fct" + fileCtr + ".js", modifiedCode); 166 | } 167 | 168 | console.log("Pairs of functions: " + fileCtr); 169 | 170 | 171 | })(); -------------------------------------------------------------------------------- /DeepBugs/javascript/tokenize.js: 
-------------------------------------------------------------------------------- 1 | var esprima = require("esprima"); 2 | var fs = require("fs"); 3 | 4 | var jsFile = process.argv[2] 5 | var tokenFile = process.argv[3] 6 | 7 | var js = fs.readFileSync(jsFile, {encoding: "utf8"}); 8 | var tokens = esprima.tokenize(js); 9 | fs.writeFileSync(tokenFile, JSON.stringify(tokens, 0, 2)); -------------------------------------------------------------------------------- /DeepBugs/python/ASTEmbeddingLearner.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 20, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | import json 8 | import math 9 | from os import getcwd 10 | from os.path import join 11 | import sys 12 | import time 13 | 14 | from keras.layers.core import Dense 15 | from keras.models import Model 16 | from keras.models import Sequential 17 | from keras import backend as K 18 | 19 | import numpy as np 20 | import random 21 | 22 | kept_main_tokens = 10000 23 | kept_context_tokens = 1000 24 | max_context_tokens_per_category = 10 25 | 26 | embedding_size = 200 27 | batch_size = 50 28 | nb_epochs = 2 29 | sampling_rate = 1 30 | 31 | def count_samples(data_paths): 32 | total_examples = 0 33 | for path in data_paths: 34 | encoded_tokens_with_context = np.load(path) 35 | total_examples += len(encoded_tokens_with_context) 36 | return total_examples 37 | 38 | def xy_pair_generator(data_paths, expected_x_length, expected_y_length): 39 | while True: 40 | for path in data_paths: 41 | encoded_tokens_with_context = np.load(path) 42 | for token_with_context in encoded_tokens_with_context: 43 | sample = random.random() < sampling_rate 44 | if sample: 45 | # given encoding: 46 | # - first element = number of main token 47 | # - second element = number of parent token 48 | # - third element = position in parent 49 | # - fourth element = number of grand parent token 50 | # - fifth element = position in grand parent 51 | # - next max_context_tokens_per_category elements = numbers of sibling tokens 52 | # - next max_context_tokens_per_category elements = numbers of uncle tokens 53 | # - next max_context_tokens_per_category elements = numbers of cousin tokens 54 | # - next max_context_tokens_per_category elements = numbers of nephew tokens 55 | # representation to produce: 56 | # - main token: one-hot vector 57 | # - context vector: concatenation of subvectors: 58 | # - parent subvector: one-hot vector 59 | # - position in parent subvector: single number 60 | # - grand parent subvector: one-hot vector 61 | # - position in grand parent subvector: single number 62 | # - four subvectors for siblings, uncles, cousins, and nephews: each is a k-hot vector 63 | x = np.zeros(kept_main_tokens + 1) 64 | x[token_with_context[0]] = 1 65 | assert len(x) == expected_x_length, str(len(x)) + " is not " + str(expected_x_length) 66 | 67 | y_length = 6 * (kept_context_tokens + 1) + 2 68 | y = np.zeros(y_length) 69 | for idx in [1,3]: # do two times the same: for parent and grand parent 70 | hot_element = token_with_context[idx] 71 | position_in_parent = token_with_context[idx + 1] 72 | offset = (kept_context_tokens + 1) + 1 if idx == 3 else 0 73 | y[offset + hot_element] = 1 74 | y[offset + kept_context_tokens + 1] = position_in_parent 75 | for kind_nb in range(0,4): # do four times the same: for siblings, uncles, cousins, and nephews 76 | offset = (2 * (kept_context_tokens + 1)) + 2 77 | for hot_element in token_with_context[5 + (max_context_tokens_per_category * 
kind_nb):5 + (max_context_tokens_per_category * (kind_nb + 1))]: 78 | if hot_element > -1: 79 | y[offset + hot_element] = 1 80 | 81 | assert len(y) == expected_y_length, len(y) 82 | 83 | yield (x, y) 84 | assert False, "Should never reach this line" 85 | 86 | def batch_generator(xy_pair_generator): 87 | xs = [] 88 | ys = [] 89 | for x, y in xy_pair_generator: 90 | xs.append(x) 91 | ys.append(y) 92 | if len(xs) is batch_size: 93 | batch = (np.asarray(xs), np.asarray(ys)) 94 | yield batch 95 | xs = [] 96 | ys = [] 97 | 98 | if __name__ == '__main__': 99 | # arguments: 100 | 101 | token_to_nb_file = sys.argv[1] 102 | data_paths = list(map(lambda f: join(getcwd(), f), sys.argv[2:])) 103 | if len(data_paths) is 0: 104 | print("Must pass token_to_nb files and at least one data file") 105 | sys.exit(1) 106 | 107 | x_length = kept_main_tokens + 1 108 | y_length = 6 * (kept_context_tokens + 1) + 2 109 | total_examples = count_samples(data_paths) 110 | total_samples = total_examples * sampling_rate 111 | 112 | print("Total samples: " + str(total_examples)) 113 | print("Will sample about " + str(total_samples)) 114 | 115 | model = Sequential() 116 | model.add(Dense(embedding_size, input_shape=(x_length,), name="hidden")) 117 | model.add(Dense(y_length, activation="sigmoid")) 118 | 119 | # using sigmoid for last layer + binary crossentropy because commonly used for multi-label, multi-class classification 120 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 121 | 122 | total_samples_per_epoch = total_samples / batch_size 123 | validation_samples_per_epoch = total_samples_per_epoch * 0.2 124 | 125 | generator = batch_generator(xy_pair_generator(data_paths, x_length, y_length)) 126 | model.fit_generator(generator=generator, steps_per_epoch=total_samples_per_epoch, epochs=nb_epochs, validation_steps=validation_samples_per_epoch) 127 | 128 | # store the model 129 | time_stamp = math.floor(time.time() * 1000) 130 | model.save("embedding_model_" + str(time_stamp)) 131 | 132 | # after training the model, write token-to-vector map (= learned embedding) to file 133 | with open(token_to_nb_file, "r") as file: 134 | token_to_nb = json.load(file) 135 | intermediate_layer_model = Model(inputs=model.input, outputs=model.get_layer("hidden").output) 136 | token_to_vector = dict() 137 | for token, nb in token_to_nb.items(): 138 | x = [0] * (kept_main_tokens + 1) 139 | x[nb] = 1 140 | intermediate_output = intermediate_layer_model.predict(np.asarray([x])) 141 | vector = intermediate_output[0].tolist() 142 | token_to_vector[token] = vector 143 | token_to_vector_file_name = "token_to_vector_" + str(time_stamp) + ".json" 144 | with open(token_to_vector_file_name, "w") as file: 145 | json.dump(token_to_vector, file, sort_keys=True, indent=4) 146 | 147 | # show prediction for a few randomly selected examples 148 | # ctr = 0 149 | # for (x,y) in xy_pair_generator(data_paths, x_length, y_length): 150 | # print("X : " + str(x)) 151 | # print("Y : " + str(y)) 152 | # y_predicted = model.predict(x) 153 | # print("Y_predicted: " + str(y_predicted)) 154 | # 155 | # ctr += 1 156 | # if ctr > 10: 157 | # break 158 | 159 | 160 | 161 | 162 | 163 | -------------------------------------------------------------------------------- /DeepBugs/python/AccuracyMetricTest.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 17, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | 8 | from keras import backend as K 9 | import numpy as np 10 | 11 | 
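# Small sanity check for a weighted loss and accuracy over context-token
# targets: the target vector has nb_tokens_in_context * kept_context_tokens
# entries, and entries equal to 1 are weighted by weight_of_ones so the few
# positive labels are not drowned out by the many zeros.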
nb_tokens_in_context = 2 12 | kept_context_tokens = 5 13 | weight_of_ones = kept_context_tokens 14 | 15 | def weighted_loss(y_true, y_pred): 16 | weights = (y_true * (weight_of_ones - 1) + 1) 17 | y_pred = K.variable(y_pred) ## required only for debugging (if arguments come from backend, no need to convert them) 18 | clipped_y_pred = K.clip(y_pred, K.epsilon(), None) 19 | weighted_cross_entropy = -(y_true * K.log(clipped_y_pred) * weights) 20 | result = K.mean(weighted_cross_entropy) 21 | assert not np.isnan(K.eval(result)) 22 | return result 23 | 24 | def weighted_accuracy(y_true, y_pred): 25 | weights = (y_true * (weight_of_ones - 1) + 1) 26 | equal = K.cast(K.equal(y_true, K.round(y_pred)), K.floatx()) 27 | debug = K.eval(equal) 28 | weighted_equal = equal * weights 29 | return K.mean(weighted_equal) 30 | 31 | if __name__ == '__main__': 32 | y_true = np.zeros(nb_tokens_in_context * kept_context_tokens) 33 | y_true[2] = 1 34 | y_true[7] = 1 35 | y_pred = np.ones(nb_tokens_in_context * kept_context_tokens) 36 | # y_pred[2] = 0.1 37 | # y_pred[7] = 0.99 38 | 39 | print("Accuracy: " + str(K.eval(weighted_accuracy(y_true, y_pred)))) 40 | print("Loss: " + str(K.eval(weighted_loss(y_true, y_pred)))) -------------------------------------------------------------------------------- /DeepBugs/python/BinOpContextToEmbedding.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 31, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | import json 8 | import math 9 | import sys 10 | import time 11 | 12 | import random 13 | 14 | import Util 15 | 16 | node_type_embedding_size = 8 # if changing here, then also change in LearningDataBinOperator 17 | 18 | def create_random_embedding(size, used_embeddings): 19 | while True: 20 | embedding = [] 21 | for _ in range(0, size): 22 | random_bit = round(random.random()) 23 | embedding.append(random_bit) 24 | if not (str(embedding) in used_embeddings): 25 | used_embeddings.add(str(embedding)) 26 | return embedding 27 | 28 | if __name__ == '__main__': 29 | # arguments: 30 | 31 | data_paths = sys.argv[1:] 32 | node_type_to_vector = dict() 33 | node_type_embeddings = set() 34 | for bin_op in Util.DataReader(data_paths): 35 | node_types = [bin_op["parent"], bin_op["grandParent"]] 36 | for node_type in node_types: 37 | if not (node_type in node_type_to_vector): 38 | type_embedding = create_random_embedding(node_type_embedding_size, node_type_embeddings) 39 | node_type_to_vector[node_type] = type_embedding 40 | 41 | time_stamp = math.floor(time.time() * 1000) 42 | node_type_to_vector_file = "node_type_to_vector_" + str(time_stamp) + ".json" 43 | with open(node_type_to_vector_file, "w") as file: 44 | json.dump(node_type_to_vector, file, sort_keys=True, indent=4) 45 | 46 | 47 | -------------------------------------------------------------------------------- /DeepBugs/python/CallContextToEmbedding.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 31, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | import json 8 | import math 9 | import sys 10 | import time 11 | 12 | import random 13 | 14 | import Util 15 | 16 | # if changing the following, also change in AnomalyDetector 17 | filename_embedding_size = 50 18 | type_embedding_size = 5 19 | 20 | def create_random_embedding(size, used_embeddings): 21 | while True: 22 | embedding = [] 23 | for _ in range(0, size): 24 | random_bit = round(random.random()) 25 | embedding.append(random_bit) 26 | if not 
(str(embedding) in used_embeddings): 27 | used_embeddings.add(str(embedding)) 28 | return embedding 29 | 30 | if __name__ == '__main__': 31 | # arguments: 32 | 33 | call_data_paths = sys.argv[1:] 34 | filename_to_vector = dict() 35 | type_to_vector = dict() 36 | filename_embeddings = set() 37 | type_embeddings = set() 38 | for call in Util.DataReader(call_data_paths): 39 | filename = call["filename"] 40 | if not (filename in filename_to_vector): 41 | filename_embedding = create_random_embedding(filename_embedding_size, filename_embeddings) 42 | filename_to_vector[filename] = filename_embedding 43 | argument_types = call["argumentTypes"] 44 | for argument_type in argument_types: 45 | if not (argument_type in type_to_vector): 46 | type_embedding = create_random_embedding(type_embedding_size, type_embeddings) 47 | type_to_vector[argument_type] = type_embedding 48 | 49 | time_stamp = math.floor(time.time() * 1000) 50 | filename_to_vector_file = "filename_to_vector_" + str(time_stamp) + ".json" 51 | with open(filename_to_vector_file, "w") as file: 52 | json.dump(filename_to_vector, file, sort_keys=True, indent=4) 53 | type_to_vector_file = "type_to_vector_" + str(time_stamp) + ".json" 54 | with open(type_to_vector_file, "w") as file: 55 | json.dump(type_to_vector, file, sort_keys=True, indent=4) 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /DeepBugs/python/CallPerCalleeCounter.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Nov 7, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | import json 8 | import math 9 | import sys 10 | import time 11 | 12 | import random 13 | 14 | import Util 15 | from collections import Counter 16 | 17 | if __name__ == '__main__': 18 | # arguments: 19 | 20 | call_data_paths = sys.argv[1:] 21 | callee_to_calls = Counter(); 22 | for call in Util.DataReader(call_data_paths): 23 | callee = call["callee"] 24 | callee_to_calls[callee] += 1 25 | 26 | time_stamp = math.floor(time.time() * 1000) 27 | callee_to_calls_file = "callee_to_calls_" + str(time_stamp) + ".json" 28 | with open(callee_to_calls_file, "w") as file: 29 | json.dump(callee_to_calls, file, sort_keys=True, indent=4) 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /DeepBugs/python/CallPerFileCounter.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Nov 3, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | import json 8 | import math 9 | import sys 10 | import time 11 | 12 | import random 13 | 14 | import Util 15 | from collections import Counter 16 | 17 | if __name__ == '__main__': 18 | # arguments: 19 | 20 | call_data_paths = sys.argv[1:] 21 | file_name_to_calls = Counter(); 22 | for call in Util.DataReader(call_data_paths): 23 | file_name = call["filename"] 24 | file_name_to_calls[file_name] += 1 25 | 26 | time_stamp = math.floor(time.time() * 1000) 27 | file_name_to_calls_file = "file_name_to_calls_" + str(time_stamp) + ".json" 28 | with open(file_name_to_calls_file, "w") as file: 29 | json.dump(file_name_to_calls, file, sort_keys=True, indent=4) 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /DeepBugs/python/EmbeddingEvaluator.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 4, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | import sys 8 | import json 9 | from 
os.path import join 10 | from os import getcwd 11 | from sklearn.decomposition.incremental_pca import IncrementalPCA 12 | from matplotlib import pyplot 13 | import re 14 | import random 15 | from scipy.spatial.kdtree import KDTree 16 | import numpy as np 17 | 18 | sampling_rate_for_PCA = 0.01 19 | 20 | if __name__ == '__main__': 21 | # arguments: 22 | name_to_vector_file = join(getcwd(), sys.argv[1]) 23 | with open(name_to_vector_file) as f: 24 | name_to_vector = json.load(f) 25 | 26 | names = [] 27 | vectors = [] 28 | for name, vector in name_to_vector.items(): 29 | names.append(name) 30 | vectors.append(vector) 31 | 32 | # perform q few similarity queries 33 | queries = [ "ID:i", "ID:name", "ID:jQuery", "ID:counter", "ID:element", "LIT:true", "ID:msg", "ID:length"] # for AST-based 34 | kd_tree = KDTree(np.array(vectors)) 35 | for query in queries: 36 | if query in name_to_vector: 37 | print(query + " has similar names:") 38 | query_vector = name_to_vector[query] 39 | _, neighbor_idxs = kd_tree.query(query_vector, k=6) 40 | closest_names = [] 41 | for idx in neighbor_idxs: 42 | close_name = names[idx] 43 | if close_name != query: 44 | print(" " + close_name) 45 | 46 | # show PCA 47 | pca_vectors = [] 48 | pca_labels = [] 49 | for idx, name in enumerate(names): 50 | if random.random() < sampling_rate_for_PCA: 51 | pca_labels.append(name) 52 | pca_vectors.append(vectors[idx]) 53 | 54 | ipca = IncrementalPCA(n_components=2) 55 | reduced_vectors = ipca.fit_transform(pca_vectors) 56 | 57 | fig, ax = pyplot.subplots() 58 | x = reduced_vectors[:, 0] 59 | y = reduced_vectors[:, 1] 60 | ax.scatter(x, y) 61 | for idx, label in enumerate(pca_labels): 62 | escaped_label = re.escape(label) 63 | ax.annotate(escaped_label, (x[idx], y[idx])) 64 | 65 | pyplot.show() 66 | 67 | 68 | -------------------------------------------------------------------------------- /DeepBugs/python/EmbeddingEvaluatorWord2Vec.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Mar 20, 2018 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | import sys 8 | from gensim.models import Word2Vec 9 | from sklearn.decomposition.incremental_pca import IncrementalPCA 10 | from matplotlib import pyplot 11 | import re 12 | 13 | if __name__ == '__main__': 14 | # arguments: embedding_model_file 15 | model = Word2Vec.load(sys.argv[1]) 16 | 17 | queries = [ "ID:i", "ID:name", "ID:jQuery", "ID:counter", "ID:element", "LIT:true", "ID:msg", "ID:length", "ID:nextSibling", "ID:toLowerCase", "ID:wrapper", "ID:width", "ID:getWidth"] 18 | 19 | for query in queries: 20 | results = model.wv.most_similar(positive=[query]) 21 | print("\\begin{tabular}{rl}") 22 | print(" \\toprule") 23 | print(" \\multicolumn{2}{c}{\\emph{\\textbf{"+query+"}}} \\\\") 24 | print(" \\midrule") 25 | print(" Simil. 
& Identifier \\\\") 26 | print(" \\midrule") 27 | for (other_id, simil) in results: 28 | escaped = other_id.replace("_", "\\_") 29 | print(" "+str(round(simil, 2))+" & "+escaped+" \\\\") 30 | print(" \\bottomrule") 31 | print("\end{tabular}") 32 | print() 33 | 34 | 35 | # show PCA 36 | pca_queries = [ "ID:wrapper", "ID:container", "ID:msg", "ID:alert", "ID:list", "ID:seq", "ID:lst", "ID:list", "LIT:error" ] 37 | pca_vectors = [] 38 | pca_labels = [] 39 | for _, name in enumerate(pca_queries): 40 | if name.startswith("LIT:"): 41 | print_name = "\"" + name.replace("LIT:", "") + "\"" # assumes string literals only 42 | else: 43 | print_name = name.replace("ID:", "") 44 | pca_labels.append(print_name) 45 | pca_vectors.append(model.wv[name]) 46 | 47 | ipca = IncrementalPCA(n_components=2) 48 | reduced_vectors = ipca.fit_transform(pca_vectors) 49 | 50 | fig, ax = pyplot.subplots() 51 | x = reduced_vectors[:, 0] 52 | y = reduced_vectors[:, 1] 53 | ax.scatter(x, y) 54 | for idx, label in enumerate(pca_labels): 55 | #escaped_label = re.escape(label) 56 | ax.annotate(label, (x[idx], y[idx])) 57 | 58 | pyplot.show() 59 | 60 | -------------------------------------------------------------------------------- /DeepBugs/python/EmbeddingLearner.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 3, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | import json 8 | import math 9 | from os import getcwd 10 | from os.path import join 11 | import sys 12 | import time 13 | 14 | from keras.layers.core import Dense 15 | from keras.models import Model 16 | from keras.models import Sequential 17 | 18 | import numpy as np 19 | import random 20 | 21 | nb_tokens_in_context = 20 22 | kept_main_tokens = 10000 23 | kept_context_tokens = 1000 24 | 25 | embedding_size = 200 26 | batch_size = 50 27 | nb_epochs = 2 28 | sampling_rate = 1 29 | 30 | def count_samples(data_paths): 31 | total_examples = 0 32 | for path in data_paths: 33 | encoded_tokens_with_context = np.load(path) 34 | total_examples += len(encoded_tokens_with_context) 35 | return total_examples 36 | 37 | def xy_pair_generator(data_paths, expected_x_length, expected_y_length): 38 | while True: 39 | for path in data_paths: 40 | encoded_tokens_with_context = np.load(path) 41 | for token_with_context in encoded_tokens_with_context: 42 | sample = random.random() < sampling_rate 43 | if sample: 44 | # encode token and context as one-hot vectors 45 | # first element of token_with_context = number of main token 46 | x = np.zeros(kept_main_tokens + 1) 47 | x[token_with_context[0]] = 1 48 | assert len(x) == expected_x_length, str(len(x)) + " is not " + str(expected_x_length) 49 | 50 | y = np.zeros(nb_tokens_in_context * (kept_context_tokens + 1)) 51 | for idx, nb_of_context_token in enumerate(token_with_context[1:]): # 2nd, 3rd, etc. 
element of token_with_context = numbers of context tokens 52 | offset = idx * (kept_context_tokens + 1) 53 | y[offset + nb_of_context_token] = 1 54 | assert len(y) == expected_y_length, len(y) 55 | 56 | yield (x, y) 57 | assert False, "Should never reach this line" 58 | 59 | def batch_generator(xy_pair_generator): 60 | xs = [] 61 | ys = [] 62 | for x, y in xy_pair_generator: 63 | xs.append(x) 64 | ys.append(y) 65 | if len(xs) is batch_size: 66 | batch = (np.asarray(xs), np.asarray(ys)) 67 | yield batch 68 | xs = [] 69 | ys = [] 70 | 71 | # custom loss and accuracy to account for unbalanced y vectors: 72 | # 73 | # weight_of_ones = kept_context_tokens 74 | # 75 | # def weighted_loss(y_true, y_pred): 76 | # weights = y_true * weight_of_ones 77 | # clipped_y_pred = K.clip(y_pred, K.epsilon(), None) 78 | # weighted_cross_entropy = -(y_true * K.log(clipped_y_pred) * weights) 79 | # result = K.mean(weighted_cross_entropy) 80 | # return result 81 | # 82 | # def weighted_accuracy(y_true, y_pred): 83 | # weights = y_true * weight_of_ones 84 | # weighted_equal = K.cast(K.equal(y_true, K.round(y_pred)), K.floatx()) * weights 85 | # return K.mean(weighted_equal) 86 | 87 | if __name__ == '__main__': 88 | # arguments: 89 | 90 | token_to_nb_file = sys.argv[1] 91 | data_paths = list(map(lambda f: join(getcwd(), f), sys.argv[2:])) 92 | if len(data_paths) is 0: 93 | print("Must pass token_to_nb files and at least one data file") 94 | sys.exit(1) 95 | x_length = kept_main_tokens + 1 96 | y_length = nb_tokens_in_context * (kept_context_tokens + 1) 97 | total_examples = count_samples(data_paths) 98 | total_samples = total_examples * sampling_rate 99 | 100 | print("Total samples: " + str(total_examples)) 101 | print("Will sample about " + str(total_samples)) 102 | 103 | model = Sequential() 104 | model.add(Dense(200, input_shape=(x_length,), name="hidden")) 105 | model.add(Dense(y_length, activation="sigmoid")) 106 | 107 | # using sigmoid for last layer + binary crossentropy because commonly used for multi-label, multi-class classification 108 | model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 109 | # model.compile(loss=weighted_loss, optimizer='adam', metrics=[weighted_accuracy]) 110 | 111 | total_samples_per_epoch = total_samples / batch_size 112 | validation_samples_per_epoch = total_samples_per_epoch * 0.2 113 | 114 | generator = batch_generator(xy_pair_generator(data_paths, x_length, y_length)) 115 | model.fit_generator(generator=generator, steps_per_epoch=total_samples_per_epoch, epochs=nb_epochs, validation_steps=validation_samples_per_epoch) 116 | 117 | # store the model 118 | time_stamp = math.floor(time.time() * 1000) 119 | model.save("embedding_model_" + str(time_stamp)) 120 | 121 | # after training the model, write token-to-vector map (= learned embedding) to file 122 | with open(token_to_nb_file, "r") as file: 123 | token_to_nb = json.load(file) 124 | intermediate_layer_model = Model(inputs=model.input, outputs=model.get_layer("hidden").output) 125 | token_to_vector = dict() 126 | for token, nb in token_to_nb.items(): 127 | x = [0] * (kept_main_tokens + 1) 128 | x[nb] = 1 129 | intermediate_output = intermediate_layer_model.predict(np.asarray([x])) 130 | vector = intermediate_output[0].tolist() 131 | token_to_vector[token] = vector 132 | token_to_vector_file_name = "token_to_vector_" + str(time_stamp) + ".json" 133 | with open(token_to_vector_file_name, "w") as file: 134 | json.dump(token_to_vector, file, sort_keys=True, indent=4) 135 | 136 | # show prediction for a 
few randomly selected examples 137 | # ctr = 0 138 | # for (x,y) in xy_pair_generator(data_paths, x_length, y_length): 139 | # print("X : " + str(x)) 140 | # print("Y : " + str(y)) 141 | # y_predicted = model.predict(x) 142 | # print("Y_predicted: " + str(y_predicted)) 143 | # 144 | # ctr += 1 145 | # if ctr > 10: 146 | # break 147 | 148 | 149 | 150 | 151 | -------------------------------------------------------------------------------- /DeepBugs/python/EmbeddingLearnerWord2Vec.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 26, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | import math 8 | from os import getcwd 9 | from os.path import join 10 | import sys 11 | import time 12 | import json 13 | from gensim.models import Word2Vec 14 | 15 | nb_tokens_in_context = 20 16 | kept_tokens = 10000 17 | 18 | embedding_size = 200 19 | 20 | class EncodedSequenceReader(object): 21 | def __init__(self, data_paths): 22 | self.data_paths = data_paths 23 | 24 | def __iter__(self): 25 | for data_path in self.data_paths: 26 | print("Reading file " + data_path) 27 | with open(data_path) as file: 28 | token_sequences = json.load(file) 29 | for seq in token_sequences: 30 | yield seq 31 | 32 | if __name__ == '__main__': 33 | # arguments: 34 | 35 | token_to_nb_file = sys.argv[1] 36 | data_paths = list(map(lambda f: join(getcwd(), f), sys.argv[2:])) 37 | if len(data_paths) is 0: 38 | print("Must pass token_to_nb files and at least one data file") 39 | sys.exit(1) 40 | 41 | token_seqs = EncodedSequenceReader(data_paths) 42 | model = Word2Vec(token_seqs, min_count=1, window=nb_tokens_in_context/2, size=embedding_size, workers=40) 43 | 44 | # store the model 45 | time_stamp = math.floor(time.time() * 1000) 46 | model.save("embedding_model_" + str(time_stamp)) 47 | 48 | # after training the model, write token-to-vector map (= learned embedding) to file 49 | with open(token_to_nb_file, "r") as file: 50 | token_to_nb = json.load(file) 51 | token_to_vector = dict() 52 | for token in model.wv.vocab: 53 | if token.startswith("ID:") or token.startswith("LIT:"): 54 | vector = model[token].tolist() 55 | token_to_vector[token] = vector 56 | token_to_vector_file_name = "token_to_vector_" + str(time_stamp) + ".json" 57 | with open(token_to_vector_file_name, "w") as file: 58 | json.dump(token_to_vector, file, sort_keys=True, indent=4) 59 | 60 | -------------------------------------------------------------------------------- /DeepBugs/python/EmbeddingModelValidator.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 17, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | import sys 8 | from os import getcwd 9 | from os.path import join 10 | import json 11 | from keras.models import load_model 12 | import numpy as np 13 | from keras import backend as K 14 | import random 15 | from numpy import float32 16 | 17 | nb_tokens_in_context = 20 18 | kept_main_tokens = 10000 19 | kept_context_tokens = 1000 20 | 21 | # custom loss and accuracy to account for unbalanced y vectors 22 | weight_of_ones = kept_context_tokens 23 | 24 | def weighted_loss(y_true, y_pred): 25 | weights = y_true * weight_of_ones 26 | clipped_y_pred = K.clip(y_pred, K.epsilon(), None) 27 | weighted_cross_entropy = -(y_true * K.log(clipped_y_pred) * weights) 28 | result = K.mean(weighted_cross_entropy) 29 | return result 30 | 31 | def weighted_accuracy(y_true, y_pred): 32 | weights = y_true * weight_of_ones 33 | weighted_equal = 
K.cast(K.equal(y_true, K.round(y_pred)), K.floatx()) * weights 34 | return K.mean(weighted_equal) 35 | 36 | def get_xy_pair(path): 37 | encoded_tokens_with_context = np.load(path) 38 | for token_with_context in encoded_tokens_with_context: 39 | sample = random.random() < 0.001 40 | if sample: 41 | # encode token and context as one-hot vectors 42 | # first element of token_with_context = number of main token 43 | x = np.zeros(kept_main_tokens + 1) 44 | x[token_with_context[0]] = 1 45 | 46 | y = np.zeros(nb_tokens_in_context * (kept_context_tokens + 1)) 47 | for idx, nb_of_context_token in enumerate(token_with_context[1:]): # 2nd, 3rd, etc. element of token_with_context = numbers of context tokens 48 | offset = idx * (kept_context_tokens + 1) 49 | y[offset + nb_of_context_token] = 1 50 | 51 | yield (x, y) 52 | 53 | if __name__ == '__main__': 54 | # arguments: 55 | if len(sys.argv) < 3: 56 | print("Insufficient arguments") 57 | sys.exit(10) 58 | model_file = sys.argv[1] 59 | token_with_context_file = sys.argv[2] 60 | 61 | model = load_model(model_file, custom_objects={"weighted_loss":weighted_loss, "weighted_accuracy":weighted_accuracy}) 62 | 63 | nb_examples = 0 64 | for (x, y_true) in get_xy_pair(token_with_context_file): 65 | print("x: "+str(x)) 66 | xs = np.asarray([x]) 67 | ys = model.predict(xs) 68 | y_pred = ys[0] 69 | print("y_pred: "+str(y_pred)) 70 | y_rounded = K.eval(K.round(y_pred)) 71 | print("y_rounded: "+str(y_rounded)) 72 | y_true = y_true.astype(float32) 73 | print("y_true : "+str(y_true)) 74 | print("accuracy : "+str(K.eval(weighted_accuracy(y_true, y_pred)))) 75 | 76 | nb_examples += 1 77 | if nb_examples > 0: 78 | break 79 | 80 | 81 | -------------------------------------------------------------------------------- /DeepBugs/python/LearningDataBinOperator.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Nov 9, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | import Util 8 | from collections import Counter 9 | import random 10 | import pandas as pd 11 | 12 | data = pd.read_pickle('benchmarks/binOps_data.pkl', 'gzip') 13 | type_embedding_size = 5 14 | node_type_embedding_size = 8 # if changing here, then also change in LearningDataBinOperator 15 | 16 | 17 | class CodePiece(object): 18 | def __init__(self, left, right, op, src): 19 | self.left = left 20 | self.right = right 21 | self.op = op 22 | self.src = src 23 | 24 | def to_message(self): 25 | return str(self.src) + " | " + str(self.left) + " | " + str(self.op) + " | " + str(self.right) 26 | 27 | 28 | class LearningData(object): 29 | def __init__(self): 30 | self.all_operators = None 31 | self.stats = {} 32 | 33 | def resetStats(self): 34 | self.stats = {} 35 | 36 | def pre_scan(self, training_data_paths, validation_data_paths): 37 | all_operators_set = set() 38 | for bin_op in Util.DataReader(training_data_paths): 39 | if isinstance(bin_op, list): 40 | for bop in bin_op: 41 | all_operators_set.add(bop['op']) 42 | else: 43 | all_operators_set.add(bin_op["op"]) 44 | for bin_op in Util.DataReader(validation_data_paths): 45 | if isinstance(bin_op, list): 46 | for bop in bin_op: 47 | all_operators_set.add(bop['op']) 48 | else: 49 | all_operators_set.add(bin_op["op"]) 50 | all_operators_set.update(set(data['op'])) 51 | self.all_operators = list(all_operators_set) 52 | 53 | def code_to_xy_pairs_given_incorrect_example(self, bin_op, xs, ys, name_to_vector, type_to_vector, 54 | node_type_to_vector, code_pieces): 55 | x_correct, y_correct = None, None 56 | x_incorrect, 
y_incorrect = None, None 57 | cor_incorrect_code_pieces = [] 58 | for op in bin_op: 59 | left = op["left"] 60 | right = op["right"] 61 | operator = op["op"] 62 | left_type = op["leftType"] 63 | right_type = op["rightType"] 64 | parent = op["parent"] 65 | grand_parent = op["grandParent"] 66 | src = op["src"] 67 | if left not in name_to_vector: 68 | continue 69 | if right not in name_to_vector: 70 | continue 71 | left_vector = name_to_vector[left] 72 | right_vector = name_to_vector[right] 73 | operator_vector = [0] * len(self.all_operators) 74 | operator_vector[self.all_operators.index(operator)] = 1 75 | left_type_vector = type_to_vector.get(left_type, [0] * type_embedding_size) 76 | right_type_vector = type_to_vector.get(right_type, [0] * type_embedding_size) 77 | parent_vector = node_type_to_vector[parent] 78 | grand_parent_vector = node_type_to_vector[grand_parent] 79 | vec = left_vector + right_vector + operator_vector + left_type_vector + right_type_vector + parent_vector + grand_parent_vector 80 | if op['probability_that_incorrect'] == 0: 81 | x_correct = vec 82 | y_correct = [0] 83 | elif op['probability_that_incorrect'] == 1: 84 | x_incorrect = vec 85 | y_incorrect = [1] 86 | cor_incorrect_code_pieces.append(CodePiece(left, right, operator, src)) 87 | 88 | if x_correct and y_correct and x_incorrect and y_incorrect: 89 | xs.append(x_correct) 90 | ys.append(y_correct) 91 | xs.append(x_incorrect) 92 | ys.append(y_incorrect) 93 | code_pieces.extend(cor_incorrect_code_pieces) 94 | 95 | def code_to_xy_pairs(self, bin_op, xs, ys, name_to_vector, type_to_vector, node_type_to_vector, code_pieces): 96 | left = bin_op["left"] 97 | right = bin_op["right"] 98 | operator = bin_op["op"] 99 | left_type = bin_op["leftType"] 100 | right_type = bin_op["rightType"] 101 | parent = bin_op["parent"] 102 | grand_parent = bin_op["grandParent"] 103 | src = bin_op["src"] 104 | if not (left in name_to_vector): 105 | return 106 | if not (right in name_to_vector): 107 | return 108 | 109 | left_vector = name_to_vector[left] 110 | right_vector = name_to_vector[right] 111 | operator_vector = [0] * len(self.all_operators) 112 | operator_vector[self.all_operators.index(operator)] = 1 113 | left_type_vector = type_to_vector.get(left_type, [0] * type_embedding_size) 114 | right_type_vector = type_to_vector.get(right_type, [0] * type_embedding_size) 115 | parent_vector = node_type_to_vector[parent] 116 | grand_parent_vector = node_type_to_vector[grand_parent] 117 | 118 | # for all xy-pairs: y value = probability that incorrect 119 | x_correct = left_vector + right_vector + operator_vector + left_type_vector + right_type_vector + parent_vector + grand_parent_vector 120 | y_correct = [0] 121 | xs.append(x_correct) 122 | ys.append(y_correct) 123 | code_pieces.append(CodePiece(left, right, operator, src)) 124 | 125 | # pick some other, likely incorrect operator 126 | other_operator_vector = None 127 | while other_operator_vector == None: 128 | other_operator = random.choice(self.all_operators) 129 | if other_operator != operator: 130 | other_operator_vector = [0] * len(self.all_operators) 131 | other_operator_vector[self.all_operators.index(other_operator)] = 1 132 | 133 | x_incorrect = left_vector + right_vector + other_operator_vector + left_type_vector + right_type_vector + parent_vector + grand_parent_vector 134 | y_incorrect = [1] 135 | xs.append(x_incorrect) 136 | ys.append(y_incorrect) 137 | code_pieces.append(CodePiece(left, right, other_operator, src)) 138 | 139 | def anomaly_score(self, y_prediction_orig, 
y_prediction_changed): 140 | return y_prediction_orig 141 | 142 | def normal_score(self, y_prediction_orig, y_prediction_changed): 143 | return y_prediction_changed 144 | -------------------------------------------------------------------------------- /DeepBugs/python/LearningDataIncorrectAssignment_with_parents.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Nov 14, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | import Util 8 | from collections import namedtuple 9 | import random 10 | from tqdm import tqdm 11 | 12 | type_embedding_size = 5 13 | 14 | class CodePiece(object): 15 | def __init__(self, lhs, rhs, src): 16 | self.lhs = lhs 17 | self.rhs = rhs 18 | self.src = src 19 | 20 | def to_message(self): 21 | return str(self.src) + " | " + str(self.lhs) + " | " + str(self.rhs) 22 | 23 | RHS = namedtuple('Assignment', ['rhs', 'type']) 24 | 25 | class LearningData(object): 26 | def __init__(self): 27 | self.file_to_RHSs = dict() # string to set of RHSs 28 | self.stats = {} 29 | 30 | def resetStats(self): 31 | self.stats = {} 32 | 33 | def pre_scan(self, training_data_paths, validation_data_paths): 34 | all_assignments = list(Util.DataReader(training_data_paths)) 35 | for assignment in tqdm(all_assignments, desc='Preprocessing training data'): 36 | if isinstance(assignment, list): 37 | for assgn in assignment: 38 | file = assgn["src"].split(" : ")[0] 39 | rhsides = self.file_to_RHSs.setdefault(file, set()) 40 | rhsides.add(RHS(assgn["rhs"], assgn["rhsType"])) 41 | else: 42 | file = assignment["src"].split(" : ")[0] 43 | rhsides = self.file_to_RHSs.setdefault(file, set()) 44 | rhsides.add(RHS(assignment["rhs"], assignment["rhsType"])) 45 | all_assignments = Util.DataReader(validation_data_paths) 46 | for assignment in tqdm(all_assignments, desc='Preprocessing validation data'): 47 | if isinstance(assignment, list): 48 | for assgn in assignment: 49 | file = assgn["src"].split(" : ")[0] 50 | rhsides = self.file_to_RHSs.setdefault(file, set()) 51 | rhsides.add(RHS(assgn["rhs"], assgn["rhsType"])) 52 | else: 53 | file = assignment["src"].split(" : ")[0] 54 | rhsides = self.file_to_RHSs.setdefault(file, set()) 55 | rhsides.add(RHS(assignment["rhs"], assignment["rhsType"])) 56 | 57 | def code_to_xy_pairs_given_incorrect_example(self, assignment, xs, ys, name_to_vector, type_to_vector, 58 | node_type_to_vector, code_pieces): 59 | x_correct, y_correct = None, None 60 | x_incorrect, y_incorrect = None, None 61 | cor_incorrect_code_pieces = [] 62 | 63 | for assgn in assignment: 64 | lhs = assgn["lhs"] 65 | rhs = assgn["rhs"] 66 | rhs_type = assgn["rhsType"] 67 | parent = assgn["parent"] 68 | grand_parent = assgn["grandParent"] 69 | # context = assgn["context"] 70 | src = assgn["src"] 71 | if not (lhs in name_to_vector): 72 | return 73 | if not (rhs in name_to_vector): 74 | return 75 | 76 | lhs_vector = name_to_vector[lhs] 77 | rhs_vector = name_to_vector[rhs] 78 | rhs_type_vector = type_to_vector.get(rhs_type, [0] * type_embedding_size) 79 | parent_vector = node_type_to_vector[parent] 80 | grand_parent_vector = node_type_to_vector[grand_parent] 81 | 82 | # transform context into embedding vectors (0 if not available) 83 | # (pre_context, post_context, all_context) = self.select_context_ids(lhs, rhs, context) 84 | # context_vector = self.context_ids_to_embeddings(pre_context, post_context, name_to_vector) 85 | 86 | # for all xy-pairs: y value = probability that incorrect 87 | vec = lhs_vector + rhs_vector + rhs_type_vector + parent_vector 
+ grand_parent_vector 88 | 89 | if int(assgn['probability_that_incorrect']) == 0: 90 | x_correct = vec 91 | y_correct = [0] 92 | elif int(assgn['probability_that_incorrect']) == 1: 93 | x_incorrect = vec 94 | y_incorrect = [1] 95 | cor_incorrect_code_pieces.append(CodePiece(lhs, rhs, src)) 96 | if x_correct and y_correct and x_incorrect and y_incorrect: 97 | xs.append(x_correct) 98 | ys.append(y_correct) 99 | 100 | xs.append(x_incorrect) 101 | ys.append(y_incorrect) 102 | code_pieces.append(cor_incorrect_code_pieces) 103 | 104 | def code_to_xy_pairs(self, assignment, xs, ys, name_to_vector, type_to_vector, node_type_to_vector, code_pieces): 105 | lhs = assignment["lhs"] 106 | rhs = assignment["rhs"] 107 | rhs_type = assignment["rhsType"] 108 | parent = assignment["parent"] 109 | grand_parent = assignment["grandParent"] 110 | src = assignment["src"] 111 | if not (lhs in name_to_vector): 112 | return 113 | if not (rhs in name_to_vector): 114 | return 115 | 116 | lhs_vector = name_to_vector[lhs] 117 | rhs_vector = name_to_vector[rhs] 118 | rhs_type_vector = type_to_vector.get(rhs_type, [0]*type_embedding_size) 119 | parent_vector = node_type_to_vector[parent] 120 | grand_parent_vector = node_type_to_vector[grand_parent] 121 | 122 | # find an alternative rhs in the same file 123 | file = src.split(" : ")[0] 124 | all_RHSs = self.file_to_RHSs[file] 125 | tries_left = 100 126 | found = False 127 | while (not found) and tries_left > 0: 128 | other_rhs = random.choice(list(all_RHSs)) 129 | if other_rhs.rhs in name_to_vector and other_rhs.rhs != rhs: 130 | found = True 131 | tries_left -= 1 132 | 133 | if not found: 134 | return 135 | 136 | # for all xy-pairs: y value = probability that incorrect 137 | x_correct = lhs_vector + rhs_vector + rhs_type_vector + parent_vector + grand_parent_vector 138 | y_correct = [0] 139 | xs.append(x_correct) 140 | ys.append(y_correct) 141 | code_pieces.append(CodePiece(lhs, rhs, src)) 142 | 143 | other_rhs_vector = name_to_vector[other_rhs.rhs] 144 | other_rhs_type_vector = type_to_vector[other_rhs.type] 145 | x_incorrect = lhs_vector + other_rhs_vector + other_rhs_type_vector + parent_vector + grand_parent_vector 146 | y_incorrect = [1] 147 | xs.append(x_incorrect) 148 | ys.append(y_incorrect) 149 | code_pieces.append(CodePiece(lhs, rhs, src)) 150 | 151 | def anomaly_score(self, y_prediction_orig, y_prediction_changed): 152 | return y_prediction_orig 153 | 154 | def normal_score(self, y_prediction_orig, y_prediction_changed): 155 | return y_prediction_changed 156 | -------------------------------------------------------------------------------- /DeepBugs/python/LearningDataMissingArg.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Apr 9, 2018 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | import Util 8 | from collections import Counter 9 | import random 10 | 11 | name_embedding_size = 200 12 | type_embedding_size = 5 13 | max_nb_args = 2 14 | 15 | class CodePiece(object): 16 | def __init__(self, callee, arguments, src): 17 | self.callee = callee 18 | self.arguments = arguments 19 | self.src = src 20 | 21 | def to_message(self): 22 | return str(self.src) + " | " + str(self.callee) + " | " + str(self.arguments) 23 | 24 | class LearningData(object): 25 | def __init__(self): 26 | self.stats = {"calls": 0, "calls_with_too_many_args": 0, "calls_with_too_few_args": 0, "calls_with_known_names": 0, 27 | "calls_with_known_base_object": 0} 28 | 29 | def pre_scan(self, training_data_paths, validation_data_paths): 30 
| print("Stats on training data") 31 | self.gather_stats(training_data_paths) 32 | print("Stats on validation data") 33 | self.gather_stats(validation_data_paths) 34 | 35 | def gather_stats(self, data_paths): 36 | callee_to_freq = Counter() 37 | argument_to_freq = Counter() 38 | total_calls = 0 39 | 40 | for call in Util.DataReader(data_paths): 41 | callee_to_freq[call["callee"]] += 1 42 | for argument in call["arguments"]: 43 | argument_to_freq[argument] += 1 44 | total_calls += 1 45 | 46 | print("Total calls : " + str(total_calls)) 47 | print("Unique callees : " + str(len(callee_to_freq))) 48 | print(" " + "\n ".join(str(x) for x in callee_to_freq.most_common(10))) 49 | Util.analyze_histograms(callee_to_freq) 50 | print("Unique arguments : " + str(len(argument_to_freq))) 51 | print(" " + "\n ".join(str(x) for x in argument_to_freq.most_common(10))) 52 | Util.analyze_histograms(argument_to_freq) 53 | 54 | def code_to_xy_pairs(self, call, xs, ys, name_to_vector, type_to_vector, node_type_to_vector, calls=None): 55 | arguments = call["arguments"] 56 | self.stats["calls"] += 1 57 | if len(arguments) > max_nb_args: 58 | self.stats["calls_with_too_many_args"] += 1 59 | return 60 | if len(arguments) < 1: 61 | self.stats["calls_with_too_few_args"] += 1 62 | return 63 | 64 | # mandatory information: callee and argument names 65 | callee_string = call["callee"] 66 | argument_strings = call["arguments"] 67 | if not (callee_string in name_to_vector): 68 | return 69 | for argument_string in argument_strings: 70 | if not (argument_string in name_to_vector): 71 | return 72 | self.stats["calls_with_known_names"] += 1 73 | callee_vector = name_to_vector[callee_string] 74 | argument_vectors = [] 75 | for argument_string in argument_strings: 76 | argument_vectors.append(name_to_vector[argument_string]) 77 | if len(argument_vectors) >= max_nb_args: 78 | break 79 | 80 | # optional information: base object, argument types, etc. 
81 | base_string = call["base"] 82 | base_vector = name_to_vector.get(base_string, [0]*name_embedding_size) 83 | if base_string in name_to_vector: 84 | self.stats["calls_with_known_base_object"] += 1 85 | 86 | argument_type_strings = call["argumentTypes"] 87 | argument_type_vectors = [] 88 | for argument_type_string in argument_type_strings: 89 | argument_type_vectors.append(type_to_vector.get(argument_type_string, [0]*type_embedding_size)) 90 | if len(argument_type_vectors) >= max_nb_args: 91 | break 92 | 93 | parameter_strings = call["parameters"] 94 | parameter_vectors = [] 95 | for parameter_string in parameter_strings: 96 | parameter_vectors.append(name_to_vector.get(parameter_string, [0]*name_embedding_size)) 97 | if len(parameter_vectors) >= max_nb_args: 98 | break 99 | 100 | # for all xy-pairs: y value = probability that incorrect 101 | x_orig = callee_vector + base_vector 102 | # add argument vectors (and pad if not enough available) 103 | for i in range(max_nb_args): 104 | if len(argument_vectors) > i: 105 | x_orig += argument_vectors[i] 106 | else: 107 | x_orig += [0]*name_embedding_size 108 | # add argument type vectors (and pad if not enough available) 109 | for i in range(max_nb_args): 110 | if len(argument_type_vectors) > i: 111 | x_orig += argument_type_vectors[i] 112 | else: 113 | x_orig += [0]*type_embedding_size 114 | # add parameter vectors (and pad if not enough available) 115 | for i in range(max_nb_args): 116 | if len(parameter_vectors) > i: 117 | x_orig += parameter_vectors[i] 118 | else: 119 | x_orig += [0]*name_embedding_size 120 | y_orig = [0] 121 | xs.append(x_orig) 122 | ys.append(y_orig) 123 | if calls != None: 124 | calls.append(CodePiece(callee_string, argument_strings, call["src"])) 125 | 126 | # for the negative example, remove a randomly picked argument 127 | idx_to_remove = random.randint(0, len(argument_vectors)-1) 128 | del argument_vectors[idx_to_remove] 129 | del argument_type_vectors[idx_to_remove] 130 | del parameter_vectors[idx_to_remove] 131 | x_buggy = callee_vector + base_vector 132 | # add argument vectors (and pad if not enough available) 133 | for i in range(max_nb_args): 134 | if len(argument_vectors) > i: 135 | x_buggy += argument_vectors[i] 136 | else: 137 | x_buggy += [0]*name_embedding_size 138 | # add argument type vectors (and pad if not enough available) 139 | for i in range(max_nb_args): 140 | if len(argument_type_vectors) > i: 141 | x_buggy += argument_type_vectors[i] 142 | else: 143 | x_buggy += [0]*type_embedding_size 144 | # add parameter vectors (and pad if not enough available) 145 | for i in range(max_nb_args): 146 | if len(parameter_vectors) > i: 147 | x_buggy += parameter_vectors[i] 148 | else: 149 | x_buggy += [0]*name_embedding_size 150 | y_buggy = [1] 151 | 152 | xs.append(x_buggy) 153 | ys.append(y_buggy) 154 | if calls != None: 155 | calls.append(CodePiece(callee_string, argument_strings, call["src"])) 156 | 157 | def anomaly_score(self, y_prediction_orig, y_prediction_changed): 158 | return y_prediction_orig # higher means more likely to be anomaly in current code 159 | 160 | def normal_score(self, y_prediction_orig, y_prediction_changed): 161 | return y_prediction_changed # higher means more likely to be correct in current code 162 | -------------------------------------------------------------------------------- /DeepBugs/python/LearningDataSwappedArgs.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Nov 9, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 
| import Util 8 | from collections import Counter 9 | 10 | name_embedding_size = 200 11 | file_name_embedding_size = 50 12 | type_embedding_size = 5 13 | 14 | class CodePiece(object): 15 | def __init__(self, callee, arguments, src): 16 | self.callee = callee 17 | self.arguments = arguments 18 | self.src = src 19 | 20 | def to_message(self): 21 | return str(self.src) + " | " + str(self.callee) + " | " + str(self.arguments) 22 | 23 | class LearningData(object): 24 | def is_known_type(self, t): 25 | return t == "boolean" or t == "number" or t == "object" or t == "regex" or t == "string" 26 | 27 | def resetStats(self): 28 | self.stats = {"calls": 0, "calls_with_two_args": 0, "calls_with_known_names": 0, 29 | "calls_with_known_base_object": 0, "calls_with_known_types": 0, 30 | "calls_with_both_known_types": 0, 31 | "calls_with_known_parameters" :0} 32 | 33 | def pre_scan(self, training_data_paths, validation_data_paths): 34 | print("Stats on training data") 35 | self.gather_stats(training_data_paths) 36 | print("Stats on validation data") 37 | self.gather_stats(validation_data_paths) 38 | 39 | def gather_stats(self, data_paths): 40 | callee_to_freq = Counter() 41 | argument_to_freq = Counter() 42 | 43 | for call in Util.DataReader(data_paths): 44 | callee_to_freq[call["callee"]] += 1 45 | for argument in call["arguments"]: 46 | argument_to_freq[argument] += 1 47 | 48 | print("Unique callees : " + str(len(callee_to_freq))) 49 | print(" " + "\n ".join(str(x) for x in callee_to_freq.most_common(10))) 50 | Util.analyze_histograms(callee_to_freq) 51 | print("Unique arguments : " + str(len(argument_to_freq))) 52 | print(" " + "\n ".join(str(x) for x in argument_to_freq.most_common(10))) 53 | Util.analyze_histograms(argument_to_freq) 54 | 55 | def code_to_xy_pairs(self, call, xs, ys, name_to_vector, type_to_vector, node_type_to_vector, calls=None): 56 | arguments = call["arguments"] 57 | self.stats["calls"] += 1 58 | if len(arguments) != 2: 59 | return 60 | self.stats["calls_with_two_args"] += 1 61 | 62 | # mandatory information: callee and argument names 63 | callee_string = call["callee"] 64 | argument_strings = call["arguments"] 65 | if not (callee_string in name_to_vector): 66 | return 67 | for argument_string in argument_strings: 68 | if not (argument_string in name_to_vector): 69 | return 70 | self.stats["calls_with_known_names"] += 1 71 | callee_vector = name_to_vector[callee_string] 72 | argument0_vector = name_to_vector[argument_strings[0]] 73 | argument1_vector = name_to_vector[argument_strings[1]] 74 | 75 | # optional information: base object, argument types, etc. 
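        # A quick sketch of the two examples generated per call site, using the
        # sizes declared above (name_embedding_size = 200, type_embedding_size = 5):
        #   x_keep = callee (200) + arg0 (200) + arg1 (200) + base (200)
        #            + arg0 type (5) + arg1 type (5) + param0 (200) + param1 (200)
        #   x_swap = the same vector with the two argument (and argument type)
        #            embeddings exchanged; the parameter embeddings keep their order.
        #   y = [0] for the original argument order and y = [1] for the swapped
        #   order, i.e. y is the probability that the argument order is incorrect.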
76 | base_string = call["base"] 77 | base_vector = name_to_vector.get(base_string, [0]*name_embedding_size) 78 | if base_string in name_to_vector: 79 | self.stats["calls_with_known_base_object"] += 1 80 | 81 | argument_type_strings = call["argumentTypes"] 82 | argument0_type_vector = type_to_vector.get(argument_type_strings[0], [0]*type_embedding_size) 83 | argument1_type_vector = type_to_vector.get(argument_type_strings[1], [0]*type_embedding_size) 84 | if (self.is_known_type(argument_type_strings[0]) or self.is_known_type(argument_type_strings[1])): 85 | self.stats["calls_with_known_types"] += 1 86 | if (self.is_known_type(argument_type_strings[0]) and self.is_known_type(argument_type_strings[1])): 87 | self.stats["calls_with_both_known_types"] += 1 88 | 89 | parameter_strings = call["parameters"] 90 | parameter0_vector = name_to_vector.get(parameter_strings[0], [0]*name_embedding_size) 91 | parameter1_vector = name_to_vector.get(parameter_strings[1], [0]*name_embedding_size) 92 | if (parameter_strings[0] in name_to_vector or parameter_strings[1] in name_to_vector): 93 | self.stats["calls_with_known_parameters"] += 1 94 | 95 | # for all xy-pairs: y value = probability that incorrect 96 | x_keep = callee_vector + argument0_vector + argument1_vector 97 | x_keep += base_vector + argument0_type_vector + argument1_type_vector 98 | x_keep += parameter0_vector + parameter1_vector #+ file_name_vector 99 | y_keep = [0] 100 | xs.append(x_keep) 101 | ys.append(y_keep) 102 | if calls != None: 103 | calls.append(CodePiece(callee_string, argument_strings, call["src"])) 104 | 105 | x_swap = callee_vector + argument1_vector + argument0_vector 106 | x_swap += base_vector + argument1_type_vector + argument0_type_vector 107 | x_swap += parameter0_vector + parameter1_vector #+ file_name_vector 108 | y_swap = [1] 109 | xs.append(x_swap) 110 | ys.append(y_swap) 111 | if calls != None: 112 | calls.append(CodePiece(callee_string, argument_strings, call["src"])) 113 | 114 | def anomaly_score(self, y_prediction_orig, y_prediction_changed): 115 | return y_prediction_orig - y_prediction_changed # higher means more likely to be anomaly in current code 116 | 117 | def normal_score(self, y_prediction_orig, y_prediction_changed): 118 | return y_prediction_changed - y_prediction_orig # higher means more likely to be correct in current code 119 | -------------------------------------------------------------------------------- /DeepBugs/python/LearningDataSwappedBinOperands.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Nov 13, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | import Util 8 | from collections import Counter 9 | import random 10 | 11 | type_embedding_size = 5 12 | node_type_embedding_size = 8 # if changing here, then also change in LearningDataBinOperator 13 | 14 | commutative_operators = ["+", "==", "===", "!==", "!=", "*", "|", "&", "^"] 15 | 16 | class CodePiece(object): 17 | def __init__(self, left, right, op, src): 18 | self.left = left 19 | self.right = right 20 | self.op = op 21 | self.src = src 22 | 23 | def to_message(self): 24 | return str(self.src) + " | " + str(self.left) + " | " + str(self.op) + " | " + str(self.right) 25 | 26 | class LearningData(object): 27 | def __init__(self): 28 | self.all_operators = None 29 | self.stats = {} 30 | 31 | def pre_scan(self, training_data_paths, validation_data_paths): 32 | all_operators_set = set() 33 | for bin_op in Util.DataReader(training_data_paths): 34 | all_operators_set.add(bin_op["op"]) 
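        # pre_scan collects the operator vocabulary from both the training and the
        # validation data, so that self.all_operators later yields a consistent
        # one-hot index for every operator seen in code_to_xy_pairs. Operators in
        # commutative_operators are skipped there, because swapping the operands of
        # a commutative operator typically does not change the result and would
        # therefore not produce a useful negative example.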
35 | for bin_op in Util.DataReader(validation_data_paths): 36 | all_operators_set.add(bin_op["op"]) 37 | self.all_operators = list(all_operators_set) 38 | 39 | def code_to_xy_pairs(self, bin_op, xs, ys, name_to_vector, type_to_vector, node_type_to_vector, code_pieces): 40 | left = bin_op["left"] 41 | right = bin_op["right"] 42 | operator = bin_op["op"] 43 | left_type = bin_op["leftType"] 44 | right_type = bin_op["rightType"] 45 | parent = bin_op["parent"] 46 | grand_parent = bin_op["grandParent"] 47 | src = bin_op["src"] 48 | if not (left in name_to_vector): 49 | return 50 | if not (right in name_to_vector): 51 | return 52 | if operator in commutative_operators: 53 | return 54 | 55 | left_vector = name_to_vector[left] 56 | right_vector = name_to_vector[right] 57 | operator_vector = [0] * len(self.all_operators) 58 | operator_vector[self.all_operators.index(operator)] = 1 59 | left_type_vector = type_to_vector.get(left_type, [0]*type_embedding_size) 60 | right_type_vector = type_to_vector.get(right_type, [0]*type_embedding_size) 61 | parent_vector = node_type_to_vector[parent] 62 | grand_parent_vector = node_type_to_vector[grand_parent] 63 | 64 | # for all xy-pairs: y value = probability that incorrect 65 | x_correct = left_vector + right_vector + operator_vector + left_type_vector + right_type_vector + parent_vector + grand_parent_vector 66 | y_correct = [0] 67 | xs.append(x_correct) 68 | ys.append(y_correct) 69 | code_pieces.append(CodePiece(left, right, operator, src)) 70 | 71 | # swap operands 72 | x_incorrect = right_vector + left_vector + operator_vector + right_type_vector + left_type_vector + parent_vector + grand_parent_vector 73 | y_incorrect = [1] 74 | xs.append(x_incorrect) 75 | ys.append(y_incorrect) 76 | code_pieces.append(CodePiece(right, left, operator, src)) 77 | 78 | def anomaly_score(self, y_prediction_orig, y_prediction_changed): 79 | return y_prediction_orig - y_prediction_changed # higher means more likely to be anomaly in current code 80 | 81 | def normal_score(self, y_prediction_orig, y_prediction_changed): 82 | return y_prediction_changed - y_prediction_orig # higher means more likely to be correct in current code -------------------------------------------------------------------------------- /DeepBugs/python/LocationBasedEmbeddingEvaluator.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 24, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | import sys 8 | import json 9 | from os.path import join 10 | from os import getcwd 11 | from sklearn.decomposition.incremental_pca import IncrementalPCA 12 | from matplotlib import pyplot 13 | import re 14 | import random 15 | from scipy.spatial.kdtree import KDTree 16 | from scipy.spatial.distance import cosine 17 | import numpy as np 18 | import resource # @UnresolvedImport 19 | from dca.Util import Util 20 | 21 | sampling_rate_for_PCA = 0.01 22 | 23 | util = Util() 24 | 25 | class RawDataReader(object): 26 | def __init__(self, data_paths): 27 | self.data_paths = data_paths 28 | 29 | def __iter__(self): 30 | for data_path in self.data_paths: 31 | print("Reading file " + data_path) 32 | with open(data_path) as file: 33 | items = json.load(file) 34 | for item in items: 35 | yield item 36 | 37 | if __name__ == '__main__': 38 | # arguments: 39 | 40 | giga_byte = 1024 * 1024 * 1024 41 | max_bytes = 8 * giga_byte 42 | resource.setrlimit(resource.RLIMIT_AS, (max_bytes, max_bytes)) 43 | 44 | location_to_vector_file = join(getcwd(), sys.argv[1]) 45 | with 
open(location_to_vector_file) as f: 46 | location_to_vector = json.load(f) 47 | 48 | data_paths = list(map(lambda f: join(getcwd(), f), sys.argv[2:])) 49 | if len(data_paths) is 0: 50 | print("Must pass token_to_nb files and at least one data file") 51 | sys.exit(1) 52 | 53 | location_to_name = dict() 54 | name_to_locations = dict() 55 | reader = RawDataReader(data_paths) 56 | for token_with_context in reader: 57 | name = token_with_context["token"] 58 | location = token_with_context["location"] 59 | location_to_name[location] = name 60 | if name in name_to_locations: 61 | locations = name_to_locations[name] 62 | else: 63 | locations = [] 64 | name_to_locations[name] = locations 65 | locations.append(location) 66 | 67 | # prepare data structures for efficient similarity queries 68 | names = [] 69 | vectors = [] 70 | for location, vector in location_to_vector.items(): 71 | if location in location_to_name: # some locations have no vectors because their names are infrequent 72 | name = location_to_name[location] 73 | names.append(name) 74 | vectors.append(vector) 75 | print("Name-vector pairs: " + str(len(names))) 76 | 77 | # inspect similarities of locations with same name 78 | remaining_samples = 20 79 | print("\n") 80 | print("In-group simil, Out-group simil, Factor, #Vectors, Token") 81 | for name, locations in name_to_locations.items(): 82 | if len(locations) > 5: 83 | vector_group = list(map(lambda location: location_to_vector[location], locations)) 84 | # compute avg. pairwise similarity in group with same name 85 | in_group_simil = util.in_group_similarity(vector_group) 86 | 87 | # compute avg. similarity to some other vectors 88 | out_group_simil = util.out_group_similarity(vector_group, vectors) 89 | 90 | factor = in_group_simil / out_group_simil 91 | print(str(round(in_group_simil, 4))+", "+str(round(out_group_simil, 4))+", "+str(round(factor, 2))+", "+str(len(vector_group))+", "+name) 92 | remaining_samples -= 1 93 | if remaining_samples is 0: 94 | break 95 | 96 | 97 | # 98 | # 99 | # names = [] 100 | # vectors = [] 101 | # for name, vector in name_to_vector.items(): 102 | # names.append(name) 103 | # vectors.append(vector) 104 | # 105 | # # perform q few similarity queries 106 | # queries = [ "i", "name", "jQuery", "counter", "element", "true", "msg", "length"] # for token-based 107 | # queries = [ "ID:i", "ID:name", "ID:jQuery", "ID:counter", "ID:element", "LIT:true", "ID:msg", "ID:length"] # for AST-based 108 | # kd_tree = KDTree(np.array(vectors)) 109 | # for query in queries: 110 | # if query in name_to_vector: 111 | # print(query + " has similar names:") 112 | # query_vector = name_to_vector[query] 113 | # _, neighbor_idxs = kd_tree.query(query_vector, k=6) 114 | # closest_names = [] 115 | # for idx in neighbor_idxs: 116 | # close_name = names[idx] 117 | # if close_name != query: 118 | # print(" " + close_name) 119 | # 120 | # # show PCA 121 | # pca_vectors = [] 122 | # pca_labels = [] 123 | # for idx, name in enumerate(names): 124 | # if random.random() < sampling_rate_for_PCA: 125 | # pca_labels.append(name) 126 | # pca_vectors.append(vectors[idx]) 127 | # 128 | # ipca = IncrementalPCA(n_components=2) 129 | # reduced_vectors = ipca.fit_transform(pca_vectors) 130 | # 131 | # fig, ax = pyplot.subplots() 132 | # x = reduced_vectors[:, 0] 133 | # y = reduced_vectors[:, 1] 134 | # ax.scatter(x, y) 135 | # for idx, label in enumerate(pca_labels): 136 | # escaped_label = re.escape(label) 137 | # ax.annotate(escaped_label, (x[idx], y[idx])) 138 | # 139 | # pyplot.show() 140 | 141 
| 142 | -------------------------------------------------------------------------------- /DeepBugs/python/RandomEmbeddingLearner.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 3, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | import json 8 | import math 9 | import sys 10 | import time 11 | 12 | import numpy as np 13 | import random 14 | 15 | from numpy.random import normal 16 | 17 | kept_main_tokens = 10000 18 | 19 | embedding_size = 200 20 | 21 | def count_samples(data_paths): 22 | total_examples = 0 23 | for path in data_paths: 24 | encoded_tokens_with_context = np.load(path) 25 | total_examples += len(encoded_tokens_with_context) 26 | return total_examples 27 | 28 | def create_random_embedding(): 29 | embedding = [] 30 | for _ in range(0,embedding_size): 31 | # random_bit = round(random.random()) 32 | random_nb = normal(0.0, 0.7) # Gaussian distribution that looks roughly like the values in learned embeddings 33 | embedding.append(random_nb) 34 | return embedding 35 | 36 | if __name__ == '__main__': 37 | # arguments: OR 38 | 39 | token_to_nb_file = sys.argv[1] 40 | with open(token_to_nb_file, "r") as file: 41 | token_to_nb = json.load(file) 42 | token_to_vector = dict() 43 | used_embeddings = set() 44 | for token, _ in token_to_nb.items(): 45 | done = False 46 | while not done: 47 | embedding = create_random_embedding() 48 | if not (str(embedding) in used_embeddings): 49 | token_to_vector[token] = embedding 50 | used_embeddings.add(str(embedding)) 51 | done = True 52 | 53 | time_stamp = math.floor(time.time() * 1000) 54 | token_to_vector_file_name = "token_to_vector_" + str(time_stamp) + ".json" 55 | with open(token_to_vector_file_name, "w") as file: 56 | json.dump(token_to_vector, file, sort_keys=True, indent=4) 57 | 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /DeepBugs/python/TokenWithContextStats.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 17, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | from os import getcwd 8 | from os.path import join 9 | import sys 10 | 11 | import numpy as np 12 | import json 13 | 14 | nb_tokens_in_context = 20 15 | 16 | if __name__ == '__main__': 17 | # arguments: 18 | print("Total arguments: "+str(len(sys.argv))) 19 | data_paths = list(map(lambda f: join(getcwd(), f), sys.argv[1:])) 20 | print("Total files: "+str(len(data_paths))) 21 | if len(data_paths) is 0: 22 | print("Must pass at least one data file") 23 | sys.exit(1) 24 | 25 | token_to_contexts = dict() # store contexts as set of str(array_of_numbers) 26 | context_to_tokens = dict() 27 | 28 | visited_files = 0 29 | for path in data_paths: 30 | visited_files += 1 31 | print("Visiting file "+str(visited_files)+" files of "+str(len(data_paths))) 32 | encoded_tokens_with_context = np.load(path) 33 | print(" Tokens with context: "+str(len(encoded_tokens_with_context))) 34 | visited_tokens = 0 35 | for token_with_context in encoded_tokens_with_context: 36 | # first element of token_with_context = number of main token 37 | token = str(token_with_context[0]) 38 | context_nbs = [] 39 | for nb_of_context_token in token_with_context[1:]: # 2nd, 3rd, etc. 
element of token_with_context = numbers of context tokens 40 | context_nbs.append(nb_of_context_token) 41 | context = str(context_nbs) 42 | 43 | # track token-to-context mappings 44 | if token in token_to_contexts: 45 | token_to_contexts[token].add(context) 46 | else: 47 | token_to_contexts[token] = set([context]) 48 | 49 | # track context-to-token mappings 50 | if context in context_to_tokens: 51 | context_to_tokens[context].add(token) 52 | else: 53 | context_to_tokens[context] = set([token]) 54 | 55 | visited_tokens += 1 56 | if visited_tokens % 100000 is 0: 57 | print(" Visited tokens: "+str(visited_tokens)) 58 | 59 | # transform sets to lists for serialization & count 1:1 mappings 60 | serializable_token_to_contexts = dict() 61 | serializable_context_to_tokens = dict() 62 | tokens_with_single_context = 0 63 | contexts_with_single_token = 0 64 | for token, contexts in token_to_contexts.items(): 65 | serializable_token_to_contexts[token] = list(contexts) 66 | if len(contexts) is 1: 67 | tokens_with_single_context += 1 68 | for context, tokens in context_to_tokens.items(): 69 | serializable_context_to_tokens[context] = list(tokens) 70 | if len(tokens) is 1: 71 | contexts_with_single_token += 1 72 | 73 | print(str(tokens_with_single_context)+" of "+str(len(token_to_contexts))+" tokens occur in only 1 context") 74 | print(str(contexts_with_single_token)+" of "+str(len(context_to_tokens))+" contexts occur in only 1 context") 75 | 76 | with open("tokens_to_contexts.json", "w") as file: 77 | json.dump(serializable_token_to_contexts, file, indent=4) 78 | with open("context_to_tokens.json", "w") as file: 79 | json.dump(serializable_context_to_tokens, file, indent=4) 80 | 81 | -------------------------------------------------------------------------------- /DeepBugs/python/TokensToTopTokens.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Jul 26, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | import sys 8 | import json 9 | from os.path import join 10 | from os import getcwd 11 | from collections import Counter 12 | import math 13 | import time 14 | from multiprocessing import Pool 15 | 16 | kept_tokens = 10000 17 | 18 | nb_processes = 30 19 | 20 | class RawDataReader(object): 21 | def __init__(self, data_paths): 22 | self.data_paths = data_paths 23 | 24 | def __iter__(self): 25 | for data_path in self.data_paths: 26 | print("Reading file " + data_path) 27 | with open(data_path) as file: 28 | token_sequences = json.load(file) 29 | for seq in token_sequences: 30 | yield seq 31 | 32 | def analyze_histograms(all_tokens): 33 | total = sum(all_tokens.values()) 34 | sorted_pairs = all_tokens.most_common() 35 | percentages_to_cover = list(map(lambda x: x/100.0,range(1,100))) #[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99] 36 | nb_covered = 0 37 | pairs_covered = 0 38 | for pair in sorted_pairs: 39 | nb_covered += pair[1] 40 | pairs_covered += 1 41 | percentage_covered = (nb_covered * 1.0) / total 42 | done = False 43 | while not done and len(percentages_to_cover) > 0: 44 | next_percentage = percentages_to_cover[0] 45 | if percentage_covered >= next_percentage: 46 | print(str(pairs_covered) + " most frequent terms cover " + str(next_percentage) + " of all terms") 47 | percentages_to_cover = percentages_to_cover[1:] 48 | else: 49 | done = True 50 | 51 | covered_by_kept_tokens = 0 52 | for pair in sorted_pairs[:kept_tokens]: 53 | covered_by_kept_tokens += pair[1] 54 | perc_covered_by_kept_tokens = (covered_by_kept_tokens * 1.0) / 
total 55 | print("----") 56 | print(str(covered_by_kept_tokens) + " most frequent terms cover " + str(perc_covered_by_kept_tokens) + " of all terms") 57 | 58 | def save_tokens(encoded_tokens): 59 | time_stamp = math.floor(time.time() * 1000) 60 | file_name = "encoded_tokens_" + str(time_stamp) + ".json" 61 | with open(file_name, "w") as file: 62 | json.dump(encoded_tokens, file, indent=4) 63 | return file_name 64 | 65 | def save_token_numbers(token_to_number): 66 | time_stamp = math.floor(time.time() * 1000) 67 | file_name = "token_to_number_" + str(time_stamp) + ".json" 68 | with open(file_name, 'w') as file: 69 | json.dump(token_to_number, file, sort_keys=True, indent=4) 70 | 71 | unknown = "@@~UNKNOWN~@@" # represented by 0 72 | def frequent_tokens(counter, nb_tokens): 73 | token_to_number = dict() 74 | ctr = 1 # reserve 0 for "unknown" 75 | for pair in counter.most_common(nb_tokens): 76 | token_to_number[pair[0]] = ctr 77 | ctr += 1 78 | return token_to_number 79 | 80 | def encode(frequent_to_number, token): 81 | if token in frequent_to_number: 82 | return token 83 | else: 84 | return "UNK" 85 | 86 | def chunks(li, n): 87 | for i in range(0, len(li), n): 88 | yield li[i:i + n] 89 | 90 | if __name__ == '__main__': 91 | # arguments: 92 | 93 | all_raw_data_paths = list(map(lambda f: join(getcwd(), f), sys.argv[1:])) 94 | print("Total files: "+str(len(all_raw_data_paths))) 95 | 96 | # gather tokens (in parallel) 97 | def count_tokens(data_paths): 98 | print("Worker starting to read "+str(len(data_paths))+" files") 99 | reader = RawDataReader(data_paths) 100 | tokens = Counter() 101 | for token_seq in reader: 102 | for token in token_seq: 103 | tokens[token] += 1 104 | return tokens 105 | 106 | pool = Pool(processes=nb_processes) 107 | chunksize = round(len(all_raw_data_paths) / nb_processes) 108 | if chunksize == 0: 109 | chunksize = len(all_raw_data_paths) 110 | counters = pool.map(count_tokens, chunks(all_raw_data_paths, chunksize)) 111 | 112 | # merge counters that were gathered in parallel 113 | print("Merging counters...") 114 | all_tokens = Counter() 115 | for tokens in counters: 116 | all_tokens.update(tokens) 117 | print("Done with merging counters") 118 | 119 | # analyze histograms 120 | print() 121 | print("Unique tokens: " + str(len(all_tokens))) 122 | print(" " + "\n ".join(str(x) for x in all_tokens.most_common(20))) 123 | analyze_histograms(all_tokens) 124 | print() 125 | 126 | # replace infrequent tokens w/ placeholder and write number-encoded tokens + contexts to files 127 | frequent_tokens = frequent_tokens(all_tokens, kept_tokens) 128 | 129 | save_token_numbers(frequent_tokens) 130 | 131 | # parallelize the encoding 132 | def encode_tokens(data_paths): 133 | print("Data encoding worker called with "+str(len(data_paths))+" files") 134 | reader = RawDataReader(data_paths) 135 | token_ctr = 0 136 | all_encoded_seqs = [] 137 | for token_seq in reader: 138 | # replace infrequent tokens with "unknown" 139 | encoded_token_seq = [] 140 | for t in token_seq: 141 | encoded_token_seq.append(encode(frequent_tokens, t)) 142 | token_ctr += len(token_seq) 143 | all_encoded_seqs.append(encoded_token_seq) 144 | 145 | # occasionally save and forget (to avoid filling up all memory) 146 | if token_ctr > 1000000: 147 | file_name = save_tokens(all_encoded_seqs) 148 | print("Have written data to " + file_name) 149 | token_ctr = 0 150 | all_encoded_seqs = [] 151 | 152 | file_name = save_tokens(all_encoded_seqs) 153 | print("Have written data to " + file_name) 154 | 155 | print("Encoding data and 
written it to files...") 156 | pool = Pool(processes=nb_processes) 157 | pool.map(encode_tokens, chunks(all_raw_data_paths, chunksize)) 158 | 159 | print("Done") 160 | 161 | -------------------------------------------------------------------------------- /DeepBugs/python/Util.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on Oct 26, 2017 3 | 4 | @author: Michael Pradel 5 | ''' 6 | 7 | from scipy.spatial.distance import cosine 8 | import random 9 | import json 10 | 11 | def in_group_similarity(vector_group): 12 | vector_group = list(vector_group) 13 | in_group_simil = 0.0 14 | in_group_ctr = 0 15 | for i in range(0, len(vector_group)): 16 | vector1 = vector_group[i] 17 | for j in range(i+1, len(vector_group)): 18 | vector2 = vector_group[j] 19 | in_group_simil += (1 - cosine(vector1, vector2)) 20 | in_group_ctr += 1 21 | in_group_simil = in_group_simil / in_group_ctr 22 | return in_group_simil 23 | 24 | def out_group_similarity(vector_group, other_vectors): 25 | other_vectors = list(other_vectors) 26 | out_vectors = [] 27 | for _ in range(20): 28 | out_vectors.append(other_vectors[random.randint(0, len(other_vectors) - 1)]) 29 | out_group_simil = 0.0 30 | out_group_ctr = 0 31 | for vector1 in vector_group: 32 | for vector2 in out_vectors: 33 | out_group_simil += (1 - cosine(vector1, vector2)) 34 | out_group_ctr += 1 35 | out_group_simil = out_group_simil / out_group_ctr 36 | return out_group_simil 37 | 38 | class DataReader(object): 39 | def __init__(self, data_paths): 40 | self.data_paths = data_paths 41 | 42 | def __iter__(self): 43 | for data_path in self.data_paths: 44 | print("Reading file " + data_path) 45 | with open(data_path) as file: 46 | calls = json.load(file) 47 | for call in calls: 48 | yield call 49 | 50 | def analyze_histograms(counter): 51 | total = sum(counter.values()) 52 | sorted_pairs = counter.most_common() 53 | percentages_to_cover = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99] 54 | nb_covered = 0 55 | pairs_covered = 0 56 | for pair in sorted_pairs: 57 | nb_covered += pair[1] 58 | pairs_covered += 1 59 | percentage_covered = (nb_covered * 1.0) / total 60 | done = False 61 | while not done and len(percentages_to_cover) > 0: 62 | next_percentage = percentages_to_cover[0] 63 | if percentage_covered >= next_percentage: 64 | print(str(pairs_covered) + " most frequent terms cover " + str(next_percentage) + " of all terms") 65 | percentages_to_cover = percentages_to_cover[1:] 66 | else: 67 | done = True 68 | 69 | -------------------------------------------------------------------------------- /DeepBugs/python/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sola-st/SemSeed/278bf1ae3bb371bbe98965556d1fbb3a38b8c6f5/DeepBugs/python/__init__.py -------------------------------------------------------------------------------- /DeepBugs/python/create_dataset_from_seeded_bugs.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Created on 22-April-2020 4 | @author Michael Pradel 5 | 6 | Go through JSON files of a directory created by static analysis and map the positive and 7 | negative examples. 
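Each key of the resulting mapping is a code location of the form '<file path> : <line> - <line>', and its value separates the original code from the seeded variants, roughly like this (illustrative values, not real output):
    'benchmarks/data/1.js : 6 - 6' -> {'correct': [<extracted example>], 'incorrect': [<mutated example>, ...]}
Locations that end up without a correct or without an incorrect counterpart are dropped by process_bug_dataset(), which returns [correct_example, incorrect_example] pairs in the same layout as the dataset.json sample under __main__.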
8 | 9 | """ 10 | 11 | from typing import List, Dict, Union, Tuple 12 | from pathlib import Path 13 | import json 14 | import codecs 15 | from tqdm import tqdm 16 | import pandas as pd 17 | 18 | 19 | def read_file_content(file_path: Path) -> Union[List, Dict]: 20 | content = [] 21 | try: 22 | with codecs.open(str(file_path), 'r', encoding='utf-8') as f: 23 | c = f.read() 24 | content = json.loads(c) 25 | except FileNotFoundError: 26 | pass 27 | except ValueError: 28 | pass 29 | return content 30 | 31 | 32 | def read_create_dataset(in_dir: str) -> Dict: 33 | json_files = list(Path(in_dir).rglob(pattern='*.json')) 34 | 35 | bug_examples = {} 36 | # Each incorrect example will have a +ve (correct) and a possibly list of 37 | # -ve (incorrect) examples 38 | for file in tqdm(json_files, desc='Going through files'): 39 | extracted_data = read_file_content(file_path=file) 40 | tqdm.write(f"Current bug examples={len(bug_examples)}") 41 | for content in extracted_data: 42 | analysed_location = content['src'] 43 | if '_SEMSEED_MUTATED_' in analysed_location: 44 | bug_seeding_metadata = read_file_content(analysed_location.split(' :')[0] + 'on') 45 | # analysed_location.split(' :')[0] 46 | # Get the original file name and the location where the bug was seeded. Create an unique key 47 | file_name = bug_seeding_metadata['file_name_where_intended'] 48 | line = bug_seeding_metadata['target_line_range']['line'].split('-') 49 | line = ' - '.join(line) 50 | location_seeded_bug = file_name + ' : ' + line 51 | content['probability_that_incorrect'] = 1 52 | if location_seeded_bug not in bug_examples: 53 | bug_examples[location_seeded_bug] = { 54 | 'correct': [], 55 | 'incorrect': [] 56 | } 57 | bug_examples[location_seeded_bug]['incorrect'].append(content) 58 | else: 59 | if analysed_location not in bug_examples: 60 | bug_examples[analysed_location] = { 61 | 'correct': [], 62 | 'incorrect': [] 63 | } 64 | content['probability_that_incorrect'] = 0 65 | bug_examples[analysed_location]['correct'].append(content) 66 | return bug_examples 67 | 68 | 69 | def process_bug_dataset(bug_dataset: Dict) -> List[List]: 70 | """ 71 | There could be many examples where there is a positive example without a -ve 72 | example. 
Remove them and do other processing and return 73 | 74 | :param bug_dataset: 75 | :return: 76 | """ 77 | filtered_data = [] 78 | for file_path, data in tqdm(bug_dataset.items(), desc='Processing dataset'): 79 | if len(data['correct']) == 0 or len(data['incorrect']) == 0: 80 | continue 81 | # There could be multiple incorrect examples 82 | for ex in data['incorrect']: 83 | filtered_data.append([data['correct'][0], ex]) 84 | return filtered_data 85 | 86 | 87 | def filter_seeded_binOps(seeded_bugs: pd.DataFrame, seeded_bugs_binOps: pd.DataFrame): 88 | """ 89 | Given the binOps from seeded bugs, extract only those locations 90 | where required 91 | """ 92 | new_df = pd.DataFrame() 93 | i = 0 94 | for name, group in seeded_bugs_binOps.groupby('src', axis=0): 95 | i += 1 96 | if len(group) > 1: 97 | for _, row in group.iterrows(): 98 | f = row['src'].split(':')[0].lstrip().rstrip() 99 | bugs_seeded_to_this_file = seeded_bugs.loc[seeded_bugs['file_name_where_intended'] == f] 100 | if len(bugs_seeded_to_this_file): 101 | tok_seqs = bugs_seeded_to_this_file['target_token_sequence-Buggy'] 102 | for tk_seq in tok_seqs: 103 | if len(tk_seq) > 3: 104 | continue 105 | s = [row['left'], row['op'], row['right']] 106 | # print(s, tokens) 107 | # if s == tokens: 108 | # print("Done") 109 | # new_df.append(row) 110 | print(f) 111 | print(len(new_df)) 112 | 113 | 114 | if __name__ == '__main__': 115 | """ 116 | Before running me, first run extractFromJS once on the non seeded JS files and next on the bug-seeded 117 | JS files. This will create two separate JSON files in the 'benchmarks/binOps' directory. 118 | 119 | The current script will go through both JSON files and will map the correct code locations to 120 | buggy code locations and finally write all together to dataset.json 121 | 122 | One may create another script to split dataset.json to training and validation data as two separate JSON 123 | files required for running DeepBugs. 124 | 125 | Eg. 
dataset.json 126 | [ 127 | [ 128 | { 129 | "left": "ID:g", 130 | "right": "LIT:67", 131 | "op": ">", 132 | "leftType": "unknown", 133 | "rightType": "number", 134 | "parent": "IfStatement", 135 | "grandParent": "BlockStatement", 136 | "src": "benchmarks/data/data/1.js : 6 - 6", 137 | "probability_that_incorrect": 0 138 | }, 139 | { 140 | "left": "ID:g", 141 | "right": "LIT:67", 142 | "op": ">=", 143 | "leftType": "unknown", 144 | "rightType": "number", 145 | "parent": "IfStatement", 146 | "grandParent": "BlockStatement", 147 | "src": "benchmarks/js_benchmark_seeded_bugs/1_SEMSEED_MUTATED_1.js : 6 - 6", 148 | "probability_that_incorrect": 1 149 | } 150 | ] 151 | ] 152 | """ 153 | # data_binOps = pd.read_pickle('benchmarks/binOps_data.pkl', 'gzip')[:100] 154 | 155 | seeded_bugs = pd.read_pickle('benchmarks/seeded_bugs_wrong_binary_operand.pkl', 'gzip') 156 | seeded_bugs_binOps = pd.read_pickle('benchmarks/binOps_wrong_operand_withloc.pkl', 'gzip') 157 | 158 | filter_seeded_binOps(seeded_bugs, seeded_bugs_binOps) 159 | # with open('benchmarks/dataset.json', 'w') as d: 160 | # d.write(json.dumps(f)) 161 | -------------------------------------------------------------------------------- /DeepBugs/python/extract_from_js_parallel.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Created on 09-July-2020 4 | @author Michael Pradel 5 | 6 | Call 7 | 8 | 'node extractFromJS --file data/one.js' 9 | """ 10 | import os 11 | import subprocess 12 | from threading import Timer 13 | from tqdm import tqdm 14 | from multiprocessing import Pool, cpu_count 15 | from typing import List 16 | import random 17 | import codecs 18 | import json 19 | from pathlib import Path 20 | from collections import defaultdict 21 | 22 | random.seed(a=42) 23 | 24 | 25 | def extractFromJS(target_js_file_path: str, line_num: int) -> str: 26 | """ 27 | Prepare a JS file for seeding bugs by converting JS file to AST nodes. 28 | The functions creates a Nodejs process to extract the required data. 
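    In effect this spawns roughly the following command, where the module-level 'what' flag selects 'binOps' or 'assignments' (a sketch, not copied from a log):

        node javascript/extractFromJS.js <what> --file <target_js_file_path> <line_num>

    The child process is killed if it does not finish within 240 seconds.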
29 | :param target_js_file_path: The input JS file that will be converted to AST node representations 30 | :param out_json_file_path: 31 | :return: 32 | """ 33 | 34 | def kill_process(p): 35 | return p.kill() 36 | 37 | err_in_execution = False 38 | path_to_process = os.path.join(os.path.normpath( 39 | os.getcwd() + os.sep), 'javascript', 'extractFromJS.js') 40 | time_out_before_killing = 240 # seconds 41 | try: 42 | p = subprocess.Popen([ 43 | 'node', path_to_process, 44 | what, 45 | '--file', target_js_file_path, 46 | line_num 47 | ], 48 | stdout=subprocess.PIPE) 49 | time_out = Timer(time_out_before_killing, kill_process, [p]) 50 | try: 51 | time_out.start() 52 | stdout, stderr = p.communicate() 53 | tqdm.write(stdout.decode("utf-8")) 54 | if stderr: 55 | err_in_execution = stderr.decode("utf-8") 56 | # tqdm.write(err_in_execution) 57 | finally: 58 | time_out.cancel() 59 | except subprocess.TimeoutExpired: 60 | pass 61 | return err_in_execution 62 | 63 | 64 | def remove_duplicates(file_list: List, duplicate_file_groups: List) -> List: 65 | """ 66 | Given a list of files, and known duplicates, keep only one of the duplicates 67 | :param duplicate_file_groups: 68 | :param file_list: 69 | :return: 70 | """ 71 | dup_files = set() 72 | for file_group in duplicate_file_groups: 73 | # Except the first file rest are all duplicates 74 | dup_files.update(file_group[1:]) 75 | 76 | files_without_duplicates = [] 77 | # Now, we remove the known duplicates 78 | root_dir = '/data/' 79 | # dup_files = set([os.path.join(root_dir, fp) for fp in dup_files]) 80 | for fl_path in file_list: 81 | if fl_path.split(root_dir)[1] not in dup_files: 82 | files_without_duplicates.append(fl_path) 83 | return files_without_duplicates 84 | 85 | 86 | def read_json_file(json_file_path): 87 | try: 88 | obj_text = codecs.open(json_file_path, 'r', encoding='utf-8').read() 89 | return json.loads(obj_text) 90 | except FileNotFoundError: 91 | print(f"*** Can't find {json_file_path} provide a correct path") 92 | return {} 93 | except Exception as e: 94 | # Empty JSON file most likely due to abrupt killing of the process while writing 95 | # print (e) 96 | return {} 97 | 98 | 99 | def add_required_line_number(file_path): 100 | """ 101 | Add the required line number where bug was seeded 102 | :return: 103 | """ 104 | file_path = str(file_path) 105 | seeded_bug_info = read_json_file(file_path + 'on') 106 | line = seeded_bug_info["target_line_range"]["line"] 107 | return file_path, line 108 | 109 | 110 | def extractFromJS_multi(arg): 111 | if isinstance(arg, tuple): 112 | file, loc = arg 113 | else: 114 | file = arg 115 | loc = "null" 116 | 117 | extractFromJS(target_js_file_path=file, line_num=loc) 118 | 119 | 120 | def semseed_seeded_extraction(in_dir, what): 121 | print(f"Reading files from {in_dir}") 122 | js_files = list(Path(in_dir).rglob('*.js')) 123 | js_files = [f for f in js_files if Path(f).is_file()] 124 | if in_dir.endswith('/data'): 125 | js_files = [str(f) for f in js_files] 126 | print(" Removing duplicates from {} files in benchmarks".format(len(js_files))) 127 | duplicate_file_groups = read_json_file('benchmarks/js150-duplicates.json') 128 | js_files = remove_duplicates(file_list=js_files, duplicate_file_groups=duplicate_file_groups) 129 | else: 130 | print("Adding line numbers to extract") 131 | line_out_file = f'benchmarks/files_and_line_numbers_wrong_{what}.json' 132 | if Path(line_out_file).is_file(): 133 | print("Reading from pre-computed") 134 | with open(line_out_file, 'r') as f: 135 | js_file_with_lines = 
json.load(f) 136 | js_files = [(f, line[0]) for f, line in 137 | js_file_with_lines.items()] # every file is unique and has only one line 138 | else: 139 | js_file_with_lines = defaultdict(list) 140 | with Pool(cpu_count() // 2) as p: 141 | with tqdm(total=len(js_files)) as pbar: 142 | pbar.set_description_str(desc="Adding line numbers", refresh=False) 143 | for _, files_and_lines in enumerate(p.imap_unordered(add_required_line_number, js_files, 10)): 144 | js_file_with_lines[files_and_lines[0]].append(files_and_lines[1]) 145 | # print(files_and_lines) 146 | pbar.update() 147 | p.close() 148 | p.join() 149 | with open(line_out_file, 'w+') as o: 150 | json.dump(js_file_with_lines, o) 151 | js_files = [(f, line[0]) for f, line in 152 | js_file_with_lines.items()] 153 | # js_files = [(f,l) for f, l in js_files if 'elastic_SEMSEED' in f] 154 | # random.shuffle(js_files) 155 | # js_files = js_files[:10] 156 | with Pool(processes=cpu_count()) as p: 157 | with tqdm(total=len(js_files)) as pbar: 158 | pbar.set_description_str( 159 | desc="Extract from JS", refresh=False) 160 | for i, execution_errors in tqdm( 161 | enumerate(p.imap_unordered(extractFromJS_multi, 162 | js_files, chunksize=10))): 163 | # ex_errors.append(execution_errors) 164 | pbar.update() 165 | p.close() 166 | p.join() 167 | # print(ex_errors) 168 | 169 | 170 | def real_bugs_GitHub_extraction(in_dir): 171 | js_files = list(Path(in_dir).rglob('*.js')) 172 | js_files = [str(f) for f in js_files if Path(f).is_file()] 173 | 174 | js_files_with_lines = [] 175 | for file_path in tqdm(js_files, desc='Adding lines'): 176 | file_name = os.path.basename(file_path) 177 | line = file_name.split('_')[2] 178 | js_files_with_lines.append((file_path, f'{line}-{line}')) 179 | with Pool(processes=cpu_count()) as p: 180 | with tqdm(total=len(js_files)) as pbar: 181 | pbar.set_description_str( 182 | desc="Extract from JS", refresh=False) 183 | for i, execution_errors in tqdm( 184 | enumerate(p.imap_unordered(extractFromJS_multi, 185 | js_files_with_lines, chunksize=10))): 186 | # print(execution_errors) 187 | # ex_errors.append(execution_errors) 188 | pbar.update() 189 | p.close() 190 | p.join() 191 | 192 | 193 | if __name__ == '__main__': 194 | what = ['binOps', 'assignments'][1] 195 | in_dir = \ 196 | ['benchmarks/js_benchmark_seeded_bugs_wrong_assignment', 197 | 'benchmarks/js_benchmark_seeded_bugs_wrong_binop_operand', 198 | 'benchmarks/real_bugs_github', 199 | 'benchmarks/data'][2] 200 | # semseed_seeded_extraction(in_dir, what) 201 | real_bugs_GitHub_extraction(in_dir) 202 | -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | # Install 2 | 3 | Install **Node.js** and the required packages: 4 | 5 | ````shell 6 | # You may install Node.js using nvm : https://github.com/nvm-sh/nvm 7 | wget -qO- https://raw.githubusercontent.com/nvm-sh/nvm/v0.38.0/install.sh | bash 8 | source ~/.bashrc 9 | 10 | # Install Node.js 14 11 | nvm install 14.17.0 12 | # Install the required Node.js packages 13 | npm install 14 | ```` 15 | 16 | Create a virtual environment for **Python** and install the required packages: 17 | 18 | ````shell 19 | sudo apt install -y python3-dev # required for the 'fasttext' package 20 | sudo apt install -y python3-venv 21 | 22 | # Create a virtual environment 23 | python3 -m venv semseed_venv 24 | # Activate the virtual environment 25 | source semseed_venv/bin/activate 26 | # Install the required Python packages 
27 | pip install -r requirements.txt 28 | ```` 29 | 30 | We provide pre-trained token embeddings trained using fastText (https://fasttext.cc). The training has been performed 31 | using JavaScript files obtained from https://www.sri.inf.ethz.ch/js150. 32 | 33 | Install **MongoDB** 34 | 35 | ````shell 36 | # Install MongoDB Community Edition on Ubuntu 20.04 37 | # Documentation -> https://docs.mongodb.com/manual/tutorial/install-mongodb-on-ubuntu 38 | 39 | wget -qO - https://www.mongodb.org/static/pgp/server-4.4.asc | sudo apt-key add - 40 | echo "deb [ arch=amd64,arm64 ] https://repo.mongodb.org/apt/ubuntu focal/mongodb-org/4.4 multiverse" | sudo tee /etc/apt/sources.list.d/mongodb-org-4.4.list 41 | sudo apt-get update 42 | sudo apt-get install -y mongodb-org 43 | 44 | # Once installation has finished start MongoDB 45 | sudo systemctl start mongod 46 | ```` 47 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright 2021 Software Lab at University of Stuttgart 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 10 | 11 | -------------------------------------------------------------------------------- /REQUIREMENTS.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sola-st/SemSeed/278bf1ae3bb371bbe98965556d1fbb3a38b8c6f5/REQUIREMENTS.md -------------------------------------------------------------------------------- /bug_seeding/bug_seeding_approaches/SeedBugs.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Created on 24-March-2020 4 | @author Jibesh Patra 5 | 6 | """ 7 | from abc import ABC, abstractmethod 8 | from typing import List, Tuple 9 | 10 | 11 | class SeedBugs(ABC): 12 | def __init__(self, bug_seeding_pattern: dict, target_location: dict, file_path: str): 13 | # Stuffs about the bug. Eg. Buggy, Correct, Surrounding tokens, Usages Identifiers, Literals etc. 
14 | 15 | self.bug_metadata = { 16 | 'file_name_where_intended': file_path, 17 | "target_token_sequence-Correct": target_location['tokens'], # Abstract token sequence that will be mutated 18 | "target_token_sequence-Buggy": [], # Concrete token sequence generated after mutation 19 | "token_sequence_abstraction-Correct": target_location['abstractedTokens'], 20 | "token_sequence_abstraction-Buggy": [], 21 | "target_line_range": {'line': target_location['line'], 'range': target_location['range']}, 22 | "num_of_available_identifiers_to_choose_from": 0, 23 | "num_of_available_literals_to_choose_from": 0, 24 | "error": False 25 | } 26 | self.bug_seeding_pattern = bug_seeding_pattern 27 | self.target_location = target_location 28 | 29 | @abstractmethod 30 | def is_matching_token_sequence(self) -> bool: 31 | """ 32 | For a 'syntactic' match check, this will return True if the 33 | token sequence in abstracted form match. 34 | 35 | For a 'semantic' matching this will depend on the cosine distance of the 36 | embedding of the tokens along with the threshold. 37 | :return: 38 | """ 39 | raise NotImplementedError 40 | 41 | @abstractmethod 42 | def apply_pattern(self) -> List[List]: 43 | """ 44 | Seed a bug by applying a given pattern 45 | :return: 46 | """ 47 | raise NotImplementedError 48 | 49 | def extract_tokens_of_kinds(self, given_token_seq: List[str]) -> Tuple[List, List, List]: 50 | try: 51 | assert len(given_token_seq) == len(self.target_location['abstractedTokens']) 52 | except AssertionError as e: 53 | print("The lengths of these token sequences should be same") 54 | 55 | tokens = [] 56 | idf_tokens = [] 57 | lit_tokens = [] 58 | 59 | idf_prefix = 'Idf_' 60 | lit_prefix = 'Lit_' 61 | 62 | for i, abs_tok in enumerate(self.target_location['abstractedTokens']): 63 | concrete_token = given_token_seq[i] 64 | if abs_tok.startswith(idf_prefix) or abs_tok.startswith(lit_prefix): 65 | tokens.append(concrete_token) 66 | if abs_tok.startswith(idf_prefix): 67 | idf_tokens.append(concrete_token) 68 | elif abs_tok.startswith(lit_prefix): 69 | lit_tokens.append(concrete_token) 70 | return tokens, idf_tokens, lit_tokens 71 | 72 | def replace_target_with_mutated_token_sequence(self, token_list: List, token_range_list: List, 73 | mutated_token_sequence: List) -> List: 74 | """ 75 | Once the mutated token sequence has been found replace the target token sequence with this new 76 | :param token_list: The complete list of the token in the file 77 | :param token_range_list: The ranges of each token contained in the token list 78 | :param mutated_token_sequence: The token sequence that will be inserted to seed bugs 79 | :return: Token sequence after seeding the bug 80 | """ 81 | 82 | assert len(token_list) == len(token_range_list) 83 | 84 | start_range = self.target_location["range"][0] 85 | end_range = self.target_location["range"][1] 86 | 87 | indices_to_remove = [i for i, rng in enumerate(token_range_list) if int(rng.split( 88 | '-')[0]) >= start_range and int(rng.split('-')[1]) <= end_range] 89 | 90 | part1 = token_list[:indices_to_remove[0]] 91 | part2 = token_list[indices_to_remove[-1] + 1:] 92 | 93 | token_list_after_seeding = part1 + mutated_token_sequence + part2 94 | assert len(token_list_after_seeding) == len(token_list) - len(self.target_location['tokens']) + len( 95 | mutated_token_sequence) 96 | return token_list_after_seeding 97 | 98 | def get_abstract_token_to_concrete_mapping(self) -> dict: 99 | """ 100 | This creates a mapping of the abstract token to its actual value 101 | Eg. 
'Idf_1' -> 'a' 102 | """ 103 | mappings = {} 104 | for i, abstract_tok in enumerate(self.target_location['abstractedTokens']): 105 | if not abstract_tok.startswith('Idf_') and not abstract_tok.startswith('Lit_'): 106 | continue 107 | mappings[abstract_tok] = self.target_location['tokens'][i] 108 | return mappings 109 | 110 | def write_bug_seeded_file(self): 111 | pass 112 | 113 | def __call__(self, *args, **kwargs): 114 | pass 115 | -------------------------------------------------------------------------------- /bug_seeding/bug_seeding_approaches/Syntactic/SyntacticSeedBugs.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Created on 25-March-2020 4 | @author Jibesh Patra 5 | 6 | """ 7 | 8 | from bug_seeding_approaches.SeedBugs import SeedBugs 9 | from typing import List 10 | 11 | 12 | class SyntacticSeedBugs(SeedBugs): 13 | def __init__(self, bug_seeding_pattern: dict, target_location: dict, file_path: str): 14 | super().__init__(bug_seeding_pattern, target_location, file_path) 15 | 16 | def is_matching_token_sequence(self) -> bool: 17 | target = self.target_location 18 | seeding_pattern = self.bug_seeding_pattern 19 | 20 | # We only need to check syntactic matches 21 | if target['abstractedTokens'] != seeding_pattern['fix']: 22 | return False 23 | else: 24 | return True 25 | 26 | def apply_pattern(self) -> List[List]: 27 | 28 | return [] 29 | -------------------------------------------------------------------------------- /bug_seeding/obtain_bug_seeding_patterns/extract_bug_seeding_patterns_from_repos/aggregateChanges.py: -------------------------------------------------------------------------------- 1 | """ 2 | @author Jibesh Patra 3 | 4 | Aggregate the patterns and write to a JSON files. 5 | """ 6 | 7 | from utils import fileutils as fs 8 | 9 | 10 | def write_bug_seeding_patterns_to_file(agg_data_out_file): 11 | from database import GitHubCommits as db 12 | abstract_changes = list(db.Commits.objects.get_abstracted_changes()) 13 | changes_across_all_repos = [] 14 | 15 | for change_summary in abstract_changes: 16 | change_summary['commit_time'] = change_summary['commit_time'].strftime("%d/%m/%Y, %H:%M:%S") 17 | cfx_actual = [str(e) for e in change_summary['fix_actual']] 18 | change_summary['fix_actual'] = cfx_actual 19 | 20 | cfb_actual = [str(e) for e in change_summary['buggy_actual']] 21 | change_summary['buggy_actual'] = cfb_actual 22 | 23 | changes_across_all_repos.append(change_summary) 24 | 25 | print(f'Writing data to {agg_data_out_file}') 26 | fs.writeJSONFile(changes_across_all_repos, agg_data_out_file) 27 | 28 | 29 | if __name__ == "__main__": 30 | write_bug_seeding_patterns_to_file(agg_data_out_file='benchmarks/bug_seeding_patterns.json') 31 | -------------------------------------------------------------------------------- /bug_seeding/obtain_bug_seeding_patterns/extract_bug_seeding_patterns_from_repos/callNodeJSExtractData.py: -------------------------------------------------------------------------------- 1 | """ 2 | @author Jibesh Patra 3 | """ 4 | 5 | import subprocess 6 | from threading import Timer 7 | import os 8 | import json 9 | import multiprocessing 10 | from multiprocessing import Pool 11 | from tqdm import tqdm 12 | 13 | 14 | def callNodeJS(argument): 15 | ''' 16 | Call Node.js for each commit and create patterns. 
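    Per commit id this is roughly equivalent to running (sketch):

        node bug_seeding/obtain_bug_seeding_patterns/extract_bug_seeding_patterns_from_repos/python_calls_me_to_extract_patterns.js -commitId <commit_id>

    The child process is killed if it runs longer than 180 seconds (3 minutes).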
17 | @param argument: Each argument is a commit id 18 | @return: 19 | ''' 20 | path_to_process = os.path.join(os.path.normpath( 21 | os.getcwd() + os.sep), 'bug_seeding', 'obtain_bug_seeding_patterns', 'extract_bug_seeding_patterns_from_repos', 22 | 'python_calls_me_to_extract_patterns.js') 23 | time_out_before_killing = 180 # seconds 180 -> 3 minutes 24 | try: 25 | def kill_process(p): 26 | return p.kill() 27 | 28 | p = subprocess.Popen(['node', path_to_process, '-commitId', argument], 29 | stdout=subprocess.PIPE) 30 | time_out = Timer(time_out_before_killing, kill_process, [p]) 31 | try: 32 | time_out.start() 33 | stdout, stderr = p.communicate() 34 | # print(stdout, stderr) 35 | finally: 36 | time_out.cancel() 37 | except subprocess.TimeoutExpired: 38 | # p.kill() 39 | pass 40 | 41 | 42 | def create_patterns_from_commits(select_num_of_commits=-1): 43 | ''' 44 | Query the MongoDB database and select only those commits (commit_ids) where the number of files 45 | changed is one and the changes are single line changes. 46 | 47 | Next, the CallNodeJS for only those commits and create patterns. 48 | 49 | @param select_num_of_commits: -1 means select all commits. 50 | @return: 51 | ''' 52 | from database import GitHubCommits as db 53 | 54 | # query filters 55 | num_of_files_changed = 1 56 | num_single_line_changes = 1 57 | query_obj = db.Commits.objects( 58 | num_files_changed=num_of_files_changed, num_single_line_changes=num_single_line_changes) 59 | print('Found %d records that has only %d file change and only %d single line change' % 60 | (query_obj.count(), num_of_files_changed, num_single_line_changes)) 61 | pks = json.loads(query_obj.only('pk').to_json()) # get only the primary keys 62 | 63 | # Now put all primary keys in a list. 64 | # The primary keys are nothing but commit hashes concatenated with the repository 65 | commit_ids = [] 66 | 67 | for pk in pks: 68 | commit_ids.append(pk['_id']) 69 | 70 | if select_num_of_commits > 0: 71 | print("Selecting only %d commits of %d available commits" % 72 | (select_num_of_commits, len(commit_ids))) 73 | commit_ids = commit_ids[:select_num_of_commits] 74 | 75 | # Parallel execution 76 | with Pool(processes=multiprocessing.cpu_count()) as p: 77 | with tqdm(total=len(commit_ids)) as pbar: 78 | pbar.set_description_str( 79 | desc="Extracting Patterns ", refresh=False) 80 | for i, _ in tqdm(enumerate(p.imap_unordered(callNodeJS, commit_ids))): 81 | pbar.update() 82 | p.close() 83 | p.join() 84 | -------------------------------------------------------------------------------- /bug_seeding/obtain_bug_seeding_patterns/extract_bug_seeding_patterns_from_repos/database/GitHubCommits.py: -------------------------------------------------------------------------------- 1 | """ 2 | @author Jibesh Patra 3 | """ 4 | 5 | from mongoengine import * 6 | import json 7 | import codecs 8 | 9 | 10 | def read_config(json_file_path): 11 | try: 12 | obj_text = codecs.open(json_file_path, 'r', encoding='utf-8').read() 13 | return json.loads(obj_text) 14 | except FileNotFoundError: 15 | print(f"*** Can't find {json_file_path} provide a correct path") 16 | return [] 17 | except Exception as e: 18 | # Empty JSON file most likely due to abrupt killing of the process while writing 19 | # print (e) 20 | return [] 21 | 22 | 23 | db_config = read_config(json_file_path='database_config.json') 24 | 25 | connect(db_config['database_name'], username=db_config['username'], password=db_config['password'], 26 | authentication_source='admin', host=db_config['host'], 
port=db_config['port']) 27 | 28 | 29 | class QueryChanges(QuerySet): 30 | def get_abstracted_changes(self): 31 | pipeline = [ 32 | { 33 | '$project': { 34 | 'single_line_changes': True, 35 | 'num_files_changed': True, 36 | 'num_single_line_changes': True, 37 | 'commit_hash': True, 38 | 'url': True, 39 | 'commit_time': True, 40 | 'local_repo_path': True 41 | } 42 | }, { 43 | '$match': { 44 | 'num_files_changed': 1, 45 | 'num_single_line_changes': 1 46 | } 47 | }, { 48 | '$unwind': { 49 | 'path': '$single_line_changes', 50 | 'preserveNullAndEmptyArrays': False 51 | } 52 | }, { 53 | '$match': { 54 | 'single_line_changes.analysis_report': 'success' 55 | } 56 | }, { 57 | '$addFields': { 58 | 'fix': '$single_line_changes.change_summary.fix', 59 | 'fix_tokenType': '$single_line_changes.new_file.change_analysis.type', 60 | 'fix_file_path': '$single_line_changes.new_file.path', 61 | 'fix_actual': '$single_line_changes.new_file.change_analysis.tokens', 62 | 'fix_range': '$single_line_changes.new_file.change_analysis.range', 63 | 'fix_line': '$single_line_changes.new_file.change_analysis.line', 64 | 'buggy': '$single_line_changes.change_summary.buggy', 65 | 'buggy_tokenType': '$single_line_changes.old_file.change_analysis.type', 66 | 'buggy_file_path': '$single_line_changes.old_file.path', 67 | 'buggy_actual': '$single_line_changes.old_file.change_analysis.tokens', 68 | 'buggy_range': '$single_line_changes.old_file.change_analysis.range', 69 | 'buggy_line': '$single_line_changes.old_file.change_analysis.line' 70 | } 71 | }, { 72 | '$project': { 73 | 'buggy': True, 74 | 'buggy_actual': True, 75 | 'buggy_file_path': True, 76 | 'buggy_tokenType': True, 77 | 'buggy_range': True, 78 | 'buggy_line': True, 79 | 'fix': True, 80 | 'fix_tokenType': True, 81 | 'fix_actual': True, 82 | 'fix_file_path': True, 83 | 'fix_range': True, 84 | 'fix_line': True, 85 | 'commit_time': True, 86 | 'local_repo_path': True, 87 | 'lessthanX_fix': { 88 | '$lt': [ 89 | { 90 | '$size': '$fix' 91 | }, 20 92 | ] 93 | }, 94 | 'lessthanX_buggy': { 95 | '$lt': [ 96 | { 97 | '$size': '$buggy' 98 | }, 20 99 | ] 100 | }, 101 | 'commit_hash': True, 102 | 'url': True 103 | } 104 | }, { 105 | '$match': { 106 | 'lessthanX_fix': True, 107 | 'lessthanX_buggy': True 108 | } 109 | }, { 110 | '$project': { 111 | 'lessthanX_fix': False, 112 | 'lessthanX_buggy': False 113 | } 114 | }, { 115 | '$sort': { 116 | 'commit_time': 1 117 | } 118 | } 119 | ] 120 | return self().aggregate(*pipeline) 121 | 122 | def get_fix_and_buggy_tokens(self, id_h): 123 | q = list(self(pk=id_h).only( 124 | 'single_line_changes.old_file.change_analysis.tokens', 125 | 'single_line_changes.new_file.change_analysis.tokens')) 126 | fixed_tokens = q[0]['single_line_changes'][0]['new_file']['change_analysis']['tokens'] 127 | buggy_tokens = q[0]['single_line_changes'][0]['old_file']['change_analysis']['tokens'] 128 | return {'actual_buggy_tokens': buggy_tokens, 'actual_fixed_tokens': fixed_tokens} 129 | 130 | 131 | class Commits(Document): 132 | commit_id = StringField(primary_key=True) 133 | commit_hash = StringField(required=True) 134 | commit_message = StringField(required=True) 135 | commit_time = DateTimeField() 136 | 137 | local_repo_path = StringField() 138 | parent_hash = StringField() 139 | url = URLField() 140 | 141 | num_files_changed = IntField() 142 | 143 | single_line_changes = ListField(DictField(DictField())) 144 | num_single_line_changes = IntField() 145 | meta = {'queryset_class': QueryChanges} 146 | 
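Note: the connect(...) call above reads its connection parameters from database_config.json in the repository root. A minimal sketch of that file, with placeholder values rather than the actual configuration shipped with the repository, looks like:

    {
        "database_name": "<database-name>",
        "username": "<mongodb-user>",
        "password": "<mongodb-password>",
        "host": "localhost",
        "port": 27017
    }

(27017 is simply the MongoDB default port; adjust all values to your own setup.)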
-------------------------------------------------------------------------------- /bug_seeding/obtain_bug_seeding_patterns/extract_bug_seeding_patterns_from_repos/extractNodeData.js: -------------------------------------------------------------------------------- 1 | const ExtractSingleLineChangedNodes = require('./analyses/ExtractDataGivenNodes').ExtractSingleLineChangedNodes; 2 | const fs = require('fs'); 3 | const assert = require('assert'); 4 | const format_file = require('./utils/format_a_js_file').formatJSfile; 5 | 6 | function analyseCode(code, nonTrackingNodes, trackingNodes) { 7 | if (code.length === 0) return {}; 8 | assert.ok(Array.isArray(nonTrackingNodes)); // Should be an Array 9 | assert.ok(nonTrackingNodes.length > 0); // It should have atleast one Node to track 10 | 11 | // Number of tokens to extract around each Identifier and also the number of tokens 12 | // to extract around each point of interest Eg. conditional test 13 | let config = { 14 | num_tokens_aroundIdf: 3, 15 | num_tokens_around_point_of_interest: 5 16 | }; 17 | 18 | // ------------------------------ Extract data given some specific nodes ------------ 19 | try { 20 | // Extract data of every node apart from Identifier or Literal 21 | let dataFromSpecificNodes = new ExtractSingleLineChangedNodes(code, nonTrackingNodes, trackingNodes); 22 | 23 | let analysedCode = dataFromSpecificNodes.goThroughASTExtractSpecificNodes(); 24 | for (let dt of analysedCode) { 25 | // For now, we do not need Identifier and the Context for the token sequence 26 | // dataFromSpecificNodes.addIdentifiersAndContext(dt, config); 27 | dataFromSpecificNodes.abstractIdentifiers(dt); 28 | dataFromSpecificNodes.abstractLiterals(dt); 29 | } 30 | 31 | let map_to_obj = (range_to_tok => { 32 | const obj = {}; 33 | range_to_tok.forEach((v, k) => { 34 | obj[k] = v['value'] 35 | }); 36 | return obj; 37 | }); 38 | return { 39 | 'nodes': analysedCode, 40 | 'functions_to_identifiers': dataFromSpecificNodes.scopeToIdentifier, 41 | 'functions_to_literals': dataFromSpecificNodes.scopeToLiteral, 42 | 'tokenList': dataFromSpecificNodes.tokenList.filter(value => value !== null), 43 | 'tokenRangesList': dataFromSpecificNodes.tokenRangesList, 44 | 'range_to_identifier': map_to_obj(dataFromSpecificNodes.rangeToIdentifier), 45 | 'range_to_literal': map_to_obj(dataFromSpecificNodes.rangeToLiteral) 46 | }; 47 | } catch (e) { 48 | return e; 49 | } 50 | 51 | } 52 | 53 | function extractNodeData(inFile, outFile) { 54 | // First format the file 55 | format_file(inFile); 56 | let code = ''; 57 | try { 58 | code = fs.readFileSync(inFile, 'utf8'); 59 | } catch (e) { 60 | return e; 61 | } 62 | 63 | let extractedData = analyseCode(code, ['Identifier', 'Literal'], ['BinaryExpression']); 64 | if (Object.keys(extractedData).length !== 0) { 65 | try { 66 | extractedData['file_path'] = inFile; 67 | // if (fs.existsSync(outFile)) { 68 | // let knownLocationOfInterest = JSON.parse(fs.readFileSync(outFile, 'utf8')); 69 | // if (knownLocationOfInterest && knownLocationOfInterest.hasOwnProperty('line')) { 70 | // extractedData.line = knownLocationOfInterest.line; 71 | // } 72 | // } 73 | if (fs.existsSync(outFile)) { 74 | let random_num = Math.floor(Math.random() * 100000); 75 | outFile = outFile.replace('.js', '_' + random_num + '.js') 76 | } 77 | fs.writeFileSync(outFile, JSON.stringify(extractedData)); 78 | } catch (err) { 79 | return err; 80 | } 81 | 82 | } 83 | 84 | // console.log(outFile); 85 | // console.log(extractedData); 86 | } 87 | 88 | 89 | function 
parse_cli_arguments() { 90 | const ArgumentParser = require('argparse').ArgumentParser; 91 | let parser = new ArgumentParser({ 92 | version: '0.0.1', 93 | addHelp: true, 94 | description: 'Go through the JS file and extract \'only\' certain the nodes' 95 | }); 96 | 97 | // -------------------------- Debug --------------------- 98 | parser.addArgument( 99 | ['-inFile'], 100 | {help: 'Specify the source file from which the data needs to be extracted'}); 101 | parser.addArgument( 102 | ['-outFile'], 103 | {help: 'Specify the file where the extracted data will be written'}); 104 | let args = parser.parseArgs(); 105 | return { 106 | 'inFile': args.inFile, 107 | 'outFile': args.outFile 108 | } 109 | } 110 | 111 | 112 | ( 113 | async function () { 114 | let { 115 | inFile, outFile 116 | } = parse_cli_arguments(); 117 | extractNodeData(inFile, outFile); 118 | } 119 | )(); 120 | 121 | module.exports.analyseCode = analyseCode; 122 | -------------------------------------------------------------------------------- /bug_seeding/obtain_bug_seeding_patterns/extract_bug_seeding_patterns_from_repos/utils/fileoperations.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Contains utility methods for file and folder operations 3 | */ 4 | const fs = require('fs'); 5 | const path = require('path'); 6 | const assert = require('assert'); 7 | 8 | /** 9 | * Check if the p is a directory 10 | * @param {string} path 11 | * @returns {boolean} 12 | */ 13 | function isDir(path) { 14 | try { 15 | fs.accessSync(path, fs.constants.R_OK); 16 | } catch (err) { 17 | console.error(`\n\n##### ----- No access to ${path} ----- ######\n\n`); 18 | return false; 19 | } 20 | return fs.statSync(path).isDirectory(); 21 | } 22 | 23 | /** 24 | * Check if a file exists and is available to be read 25 | * @param {string} filePath 26 | * @returns {boolean} 27 | */ 28 | function available(filePath) { 29 | return fs.existsSync(filePath); 30 | try { 31 | fs.accessSync(path, fs.constants.R_OK); 32 | } catch (err) { 33 | return false; 34 | } 35 | 36 | return true; 37 | } 38 | 39 | /** 40 | * Given a p, returns the extension of the p in lowercase 41 | * @param {string} filePath 42 | * @returns {string} 43 | */ 44 | function getExtension(filePath) { 45 | assert.ok(!isDir(filePath)); 46 | return path.extname(filePath).toLowerCase(); 47 | } 48 | 49 | /** 50 | * Read a file and returns the content of the file. If the filetype 51 | * is JSON then also converts to a JSON object. 
52 | * If there is parsing error or for some reason, the file could not be read 53 | * then returns null 54 | * @param {string} filePath Path of the file that needs to be read 55 | * @returns {(null|string|Object.)} 56 | */ 57 | function getFileContent(filePath) { 58 | let content; 59 | // assert.ok(!isDir(filePath)); 60 | // assert.ok(available(filePath)); 61 | if (isDir(filePath) || !available(filePath)) 62 | return null; 63 | try { 64 | content = fs.readFileSync(filePath, 'utf8'); 65 | if (getExtension(filePath) === '.json') content = JSON.parse(content); 66 | } catch (error) { 67 | content = null; 68 | } 69 | 70 | return content; 71 | } 72 | 73 | /** 74 | * Check if a file is accessible and then returns the size 75 | * of the file in bytes 76 | * @param {string} filePath 77 | * @returns {number} 78 | */ 79 | function getFileSize(filePath) { 80 | assert.ok(available(filePath)); 81 | return fs.statSync(filePath).size; 82 | } 83 | 84 | /** 85 | * Go through a directory and all sub directories and create a list 86 | * links to the files in the particular directory. 87 | * @param {string} dirPath Initial p of a directory 88 | * @param {string} fileExtension Types of file 89 | * @returns {Array.} List of files 90 | */ 91 | function createLinksOfFiles(dirPath, fileExtension) { 92 | /** 93 | * @type{string[]} 94 | */ 95 | let fileList = []; 96 | let folderToTraverse = [dirPath]; 97 | if (!fileExtension) throw 'Need extension of file that will be filtered'; 98 | while (folderToTraverse.length !== 0) { 99 | let currentFolder = folderToTraverse.pop(); 100 | let list_of_files_and_folders = fs.readdirSync(currentFolder); 101 | list_of_files_and_folders.forEach((f_path) => { 102 | let complete_path = path.join(currentFolder, f_path); 103 | if (isDir(complete_path)) 104 | folderToTraverse.push(complete_path); 105 | else if (getExtension(complete_path) === fileExtension) 106 | fileList.push(complete_path); 107 | }); 108 | } 109 | return fileList; 110 | } 111 | 112 | module.exports.available = available; 113 | module.exports.getFileContent = getFileContent; 114 | module.exports.getExtension = getExtension; 115 | module.exports.getFileSize = getFileSize; 116 | module.exports.createLinksOfFiles = createLinksOfFiles; -------------------------------------------------------------------------------- /bug_seeding/obtain_bug_seeding_patterns/extract_bug_seeding_patterns_from_repos/utils/format_a_js_file.js: -------------------------------------------------------------------------------- 1 | /** 2 | 3 | Created on 16-April-2020 4 | @author Jibesh Patra 5 | 6 | Given a JavaScript file, remove comments and format it. The reason we do this is to 7 | seed bugs to a known format so that we can map the locations of the seeded bug later. 8 | 9 | Since we use a token sequence to seed bugs, comments and spaces in code messes up the 10 | location of the seeded bug. Once we re-generate the file with seeded bug, the original 11 | locations where the bug was seeded is hard to map back. 
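As a rough illustration (not taken from the benchmarks): a line such as `var x=1; // counter` comes out roughly as `var x = 1;` — comments are stripped, the code is re-tokenized, and the tokens are re-joined and beautified, so every token ends up at a predictable position that can be mapped back after a bug has been seeded.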
12 | **/ 13 | 14 | 15 | const fs = require('fs'); 16 | const fileops = require('./fileoperations'); 17 | const ArgumentParser = require('argparse').ArgumentParser; 18 | const esprima = require('esprima'); 19 | const escodegen = require('escodegen'); 20 | const beautify = require('js-beautify').js, 21 | strip = require('strip-comments'); 22 | const UglifyJS = require("uglify-js"); 23 | 24 | function parse_cli_arguments() { 25 | let parser = new ArgumentParser({ 26 | version: '0.0.1', 27 | addHelp: true, 28 | description: 'Take a JS file and format it and remove comments' 29 | }); 30 | 31 | parser.addArgument( 32 | ['-inFile'], { 33 | help: 'The JavaScript file that needs to be formatted' 34 | }); 35 | let args = parser.parseArgs(); 36 | return { 37 | 'inFile': args.inFile, 38 | } 39 | } 40 | 41 | /** 42 | * Given a file path, re-format it 43 | * @param{String} inFilePath 44 | */ 45 | function formatJSfile(inFilePath) { 46 | if (!fileops.available(inFilePath)) { 47 | // console.log(`${inFilePath} is not available`) 48 | return; 49 | } 50 | try { 51 | let code = fs.readFileSync(inFilePath, 'utf8'); 52 | code = strip(code); // Remove comments from code 53 | let ast = {}; 54 | try { 55 | ast = esprima.parseScript(code, {tokens: true}); 56 | } catch (e) { 57 | try { 58 | ast = esprima.parseModule(code, {tokens: true}); 59 | } catch (e) { 60 | ast = {} 61 | } 62 | } 63 | let tokens = []; 64 | for (let tok of ast.tokens) { 65 | tokens.push(tok.value); 66 | } 67 | 68 | code = tokens.join(' '); 69 | let formattedCode = beautify(code, { 70 | "indent_empty_lines": false, 71 | "break_chained_methods": false, 72 | "space_after_anon_function": false, 73 | "space_in_paren": false 74 | }); 75 | let options = { 76 | compress: false, 77 | mangle: false, 78 | output: { 79 | beautify: true 80 | } 81 | }; 82 | fs.writeFileSync(inFilePath, formattedCode); 83 | // console.log(code); 84 | // let uglify_format = UglifyJS.minify(formattedCode, options); 85 | // if (uglify_format.hasOwnProperty('error')) { // uglify does not support es6 and above 86 | // fs.writeFileSync(inFilePath, formattedCode); 87 | // } else { 88 | // fs.writeFileSync(inFilePath, uglify_format['code']); 89 | // } 90 | // console.log(`Pretty printing ${inFilePath}`); 91 | } catch (e) { 92 | // console.log(e); 93 | } 94 | } 95 | 96 | function formatFilesInDir(inDir) { 97 | let filePaths = fileops.createLinksOfFiles(inDir, '.js'); 98 | filePaths.forEach(fl => { 99 | formatJSfile(fl); 100 | }); 101 | } 102 | 103 | // Test --- 104 | // ( 105 | // function () { 106 | // let { 107 | // inFile 108 | // } = parse_cli_arguments(); 109 | // // formatFilesInDir(inFile); 110 | // formatJSfile(inFile); 111 | // } 112 | // )(); 113 | 114 | 115 | module.exports.formatJSfile = formatJSfile; 116 | -------------------------------------------------------------------------------- /bug_seeding/obtain_bug_seeding_patterns/repo_downloader/downloadTopGithubRepos.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jibesh Patra 3 | */ 4 | 5 | const fs = require('fs'); 6 | const git = require('simple-git'); 7 | 8 | /** 9 | * 10 | * @param {String} filePath 11 | * @param {Number} noOfRepos 12 | */ 13 | function readRepoNames(filePath, noOfRepos) { 14 | let repos = JSON.parse(fs.readFileSync(filePath)); 15 | console.log(`\nFound ${repos.length} repositories, will download ${noOfRepos}`); 16 | return repos.slice(0, noOfRepos); 17 | } 18 | 19 | /** 20 | * 21 | * @param {Array} repos 22 | * @param {String} directory 23 | */ 
24 | async function cloneSelectedRepos(repos, directory) { 25 | let clone_tasks = []; 26 | const sleep = (milliseconds) => { 27 | return new Promise(resolve => setTimeout(resolve, milliseconds)) 28 | } 29 | 30 | let all = repos.length; 31 | repos.forEach((repo) => { 32 | console.log(`Cloning ${repo.clone_url}`); 33 | clone_tasks.push(git(directory).clone(repo.clone_url).exec(() => { 34 | console.log(`Done cloning ${repo.clone_url} remaining ${--all}`); 35 | })); 36 | clone_tasks.push(sleep(1000)); 37 | }); 38 | try { 39 | let t = await Promise.all(clone_tasks); 40 | } catch (err) { 41 | console.log(err); 42 | 43 | } 44 | 45 | } 46 | 47 | /** 48 | * 49 | * @param {String} linkOfRepos 50 | * @param {Number} numOfRepoToDownload 51 | * @param {String} outDir 52 | */ 53 | function download_repositories(linkOfRepos, numOfRepoToDownload, outDir) { 54 | let repos = readRepoNames(linkOfRepos, numOfRepoToDownload); 55 | console.log("Writing the downloaded repositories to ==> " + outDir); 56 | cloneSelectedRepos(repos, outDir); 57 | } 58 | 59 | module.exports.download_repositories = download_repositories; 60 | -------------------------------------------------------------------------------- /bug_seeding/obtain_bug_seeding_patterns/repo_downloader/fileoperations.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @author Jibesh Patra 3 | * 4 | * Contains utility methods for file and folder operations 5 | * 6 | */ 7 | const fs = require('fs'); 8 | const path = require('path'); 9 | const assert = require('assert'); 10 | 11 | /** 12 | * Check if the p is a directory 13 | * @param {string} path 14 | * @returns {boolean} 15 | */ 16 | function isDir(path) { 17 | try { 18 | fs.accessSync(path, fs.constants.R_OK); 19 | } catch (err) { 20 | console.error(`No access to ${path}`); 21 | return false; 22 | } 23 | return fs.statSync(path).isDirectory(); 24 | } 25 | 26 | /** 27 | * Check if a file exists and is available to be read 28 | * @param {string} filePath 29 | * @returns {boolean} 30 | */ 31 | function available(filePath) { 32 | return fs.existsSync(filePath); 33 | try { 34 | fs.accessSync(path, fs.constants.R_OK); 35 | } catch (err) { 36 | return false; 37 | } 38 | 39 | return true; 40 | } 41 | 42 | /** 43 | * Given a p, returns the extension of the p in lowercase 44 | * @param {string} filePath 45 | * @returns {string} 46 | */ 47 | function getExtension(filePath) { 48 | assert.ok(!isDir(filePath)); 49 | return path.extname(filePath).toLowerCase(); 50 | } 51 | 52 | /** 53 | * Read a file and returns the content of the file. If the filetype 54 | * is JSON then also converts to a JSON object. 55 | * @param {string} filePath Path of the file that needs to be read 56 | * @returns {(string|Object.)} 57 | */ 58 | function getFileContent(filePath) { 59 | let content; 60 | assert.ok(!isDir(filePath)); 61 | assert.ok(available(filePath)); 62 | content = fs.readFileSync(filePath, 'utf8'); 63 | if (getExtension(filePath) === '.json') content = JSON.parse(content); 64 | return content; 65 | } 66 | 67 | /** 68 | * Check if a file is accessible and then returns the size 69 | * of the file in bytes 70 | * @param {string} filePath 71 | * @returns {number} 72 | */ 73 | function getFileSize(filePath) { 74 | assert.ok(available(filePath)); 75 | return fs.statSync(filePath).size; 76 | } 77 | 78 | /** 79 | * Go through a directory and all sub directories and create a list 80 | * links to the files in the particular directory. 
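* Example (illustrative): createLinksOfFiles('benchmarks/top_JS_repos', '.js') returns the paths of all .js files found recursively under that directory.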
81 | * @param {string} dirPath Initial p of a directory 82 | * @param {string} fileExtension Types of file 83 | * @returns {Array.} List of files 84 | */ 85 | function createLinksOfFiles(dirPath, fileExtension) { 86 | /** 87 | * @type{string[]} 88 | */ 89 | let fileList = []; 90 | let folderToTraverse = [dirPath]; 91 | if (!fileExtension) throw 'Need extension of file that will be filtered'; 92 | while (folderToTraverse.length !== 0) { 93 | let currentFolder = folderToTraverse.pop(); 94 | let list_of_files_and_folders = fs.readdirSync(currentFolder); 95 | list_of_files_and_folders.forEach((f_path) => { 96 | let complete_path = path.join(currentFolder, f_path); 97 | if (isDir(complete_path)) 98 | folderToTraverse.push(complete_path); 99 | else if (getExtension(complete_path) === fileExtension) 100 | fileList.push(complete_path); 101 | }); 102 | } 103 | return fileList; 104 | } 105 | 106 | module.exports.available = available; 107 | module.exports.getFileContent = getFileContent; 108 | module.exports.getExtension = getExtension; 109 | module.exports.getFileSize = getFileSize; 110 | module.exports.createLinksOfFiles = createLinksOfFiles; -------------------------------------------------------------------------------- /bug_seeding/obtain_bug_seeding_patterns/repo_downloader/main.js: -------------------------------------------------------------------------------- 1 | /* 2 | 1. This script uses GitHub API to get the top 'N' GitHub repositories and saves it in a file. 3 | 2. It then goes through this list and downloads each repo locally 4 | 5 | @author Jibesh Patra 6 | */ 7 | const Scrapper = require('./getTopGitHubRepoNames').Scrapper; 8 | const download = require('./downloadTopGithubRepos').download_repositories; 9 | const path = require('path'); 10 | const fs = require('fs'); 11 | const fileutils = require('./fileoperations'); 12 | 13 | async function getLinks(link_to_top_repos, numOfGitHubRepos) { 14 | // Uncomment the following lines if top1000GithubRepos.json is not present OR needs to be updated 15 | 16 | // let github_scapper = new Scrapper(link_to_top_repos, numOfGitHubRepos); 17 | // // If more than '100' is required then the pages need to be changed for each request 18 | // github_scapper.getRepositoriesParseCommits({ 19 | // language: 'javascript', 20 | // page: 1, 21 | // q_no: 0 22 | // }); 23 | } 24 | 25 | /** 26 | * 27 | * @param link_to_top_repos 28 | * @param download_dir 29 | * @param numOfGitHubRepos 30 | */ 31 | async function getLinksAndDownload(link_to_top_repos, download_dir, numOfGitHubRepos) { 32 | // 1. Get the links to the top 100 GitHub repositories 33 | getLinks(link_to_top_repos, numOfGitHubRepos).then(() => { 34 | // 2. 
Download some/all of them 35 | download(link_to_top_repos, numOfGitHubRepos, download_dir); 36 | }); 37 | } 38 | 39 | function main() { 40 | 41 | let link_to_top_repos = path.join('benchmarks', 'top1000GithubRepos.json'); // Where the links (GitHub URLs) to top 'N' repos will be saved 42 | let download_dir = path.join('benchmarks', 'top_JS_repos'); // Where the top 'N' repos will be saved 43 | let num_of_github_repos_to_download = 100; 44 | 45 | if (!fileutils.available(download_dir)) 46 | fs.mkdirSync(download_dir); 47 | 48 | getLinksAndDownload(link_to_top_repos, download_dir, num_of_github_repos_to_download).then(() => { 49 | console.log("Download .."); 50 | }); 51 | } 52 | 53 | main(); 54 | -------------------------------------------------------------------------------- /bug_seeding/run_bug_seeding.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Created on 17-March-2020 4 | @author Jibesh Patra 5 | 6 | The main file from where all experiments are run 7 | """ 8 | import argparse 9 | import utils.fileutils as fs 10 | from utils.argument_utils import read_arguments 11 | from utils.prepare_for_seeding_bug import prepare_dir_for_seeding_bugs 12 | from utils.bug_seeding_pattern_utils import find_wrong_operand_in_binary_op_patterns, \ 13 | get_only_idf_lit_containing_patterns 14 | import os 15 | from tqdm import tqdm 16 | from multiprocessing import Pool, cpu_count 17 | from seed_bugs_to_a_file import seed_bugs_to_a_file, seed_bugs_to_a_file_multiprocessing 18 | import numpy as np 19 | 20 | 21 | def select_particular_type_of_seeding_pattern(bug_seeding_patterns): 22 | # Select only 'Wrong Binary Operand' patterns 23 | # bug_seeding_patterns = find_wrong_operand_in_binary_op_patterns(bug_seeding_patterns) 24 | 25 | # Select only 'Wrong Assignments' patterns 26 | # bug_seeding_patterns = fs.read_json_file('benchmarks/bug_seeding_patterns_wrong_assignment.json') 27 | return bug_seeding_patterns 28 | 29 | 30 | if __name__ == '__main__': 31 | 32 | parser = argparse.ArgumentParser( 33 | prog='python run_bug_seeding.py', 34 | description="Provide the proper directories where bugs may be seeded", 35 | epilog="You must provide directories" 36 | ) 37 | in_dir, out_dir, working_dir, stats_dir, bug_seeding_patterns, k_freq_idf, k_freq_lit = read_arguments(parser) 38 | 39 | # print("Sampling files for using as target to seed bugs") 40 | # fs.sample_from_zip(zip_file_path='benchmarks/data.zip', out_dir=in_dir, file_extension_to_sample='.js', 41 | # required_number_of_files=100) 42 | 43 | # Read bug seeding patterns 44 | all_bug_seeding_patterns = fs.read_json_file(bug_seeding_patterns) 45 | all_bug_seeding_patterns = get_only_idf_lit_containing_patterns(all_bug_seeding_patterns) 46 | print(f"Complete bug seeding patterns = {len(all_bug_seeding_patterns)}") 47 | l_len = len(all_bug_seeding_patterns) * 80 // 100 48 | tr_patterns, val_patterns = all_bug_seeding_patterns[:l_len], all_bug_seeding_patterns[l_len:] 49 | print( 50 | f'Training patterns are {len(tr_patterns)} and validation are {len(val_patterns)}. 
We only use training patterns for bug seeding') 51 | bug_seeding_patterns = tr_patterns 52 | 53 | bug_seeding_patterns = select_particular_type_of_seeding_pattern(bug_seeding_patterns=bug_seeding_patterns) 54 | print("There are {} bug seeding patterns".format(len(bug_seeding_patterns))) 55 | 56 | # More intermediate directories needed 57 | static_analysis_out_dir = os.path.join(working_dir, '__TEMP__target_js_file_nodes') 58 | 59 | print("Preparing for bug seeding") 60 | prepare_dir_for_seeding_bugs(target_js_dir=in_dir, 61 | abstracted_out_dir=static_analysis_out_dir, num_of_files=-1) 62 | 63 | # Maximum number of tries to seed bugs per file. We could be always successful and seed 10 bugs or 0 64 | MAX_LOCATIONS_TO_TRY_TO_SEED_BUGS = -1 # If -1 then try to seed everywhere 65 | actual_mutations_in_each_file = [] 66 | 67 | # Now seed bugs 68 | K_most_frequent_identifiers = fs.read_json_file(k_freq_idf) 69 | K_most_frequent_literals = fs.read_json_file(k_freq_lit) 70 | analysed_files = fs.go_through_dir(directory=static_analysis_out_dir, filter_file_extension='.json') 71 | 72 | args_for_files = [ 73 | (file, bug_seeding_patterns, K_most_frequent_identifiers, K_most_frequent_literals, 74 | MAX_LOCATIONS_TO_TRY_TO_SEED_BUGS, out_dir) for 75 | file in analysed_files] 76 | 77 | # Multiprocessing only on machine with many CPUs 78 | if cpu_count() > 4: 79 | with Pool(processes=cpu_count()) as p: 80 | with tqdm(total=len(analysed_files)) as pbar: 81 | pbar.set_description_str( 82 | desc="Seeding bugs to files ", refresh=False) 83 | for i, successful_mutations in tqdm( 84 | enumerate(p.imap_unordered(seed_bugs_to_a_file_multiprocessing, args_for_files, chunksize=1)), 85 | position=0): 86 | actual_mutations_in_each_file.append(successful_mutations) 87 | pbar.update() 88 | p.close() 89 | p.join() 90 | else: 91 | # Non multiprocessing 92 | for file in tqdm(analysed_files, desc='Seeding bugs to files', position=0, postfix={'approach': 'SemSeed'}): 93 | successful_mutations = seed_bugs_to_a_file(file, bug_seeding_patterns, K_most_frequent_identifiers, 94 | K_most_frequent_literals, 95 | MAX_LOCATIONS_TO_TRY_TO_SEED_BUGS, out_dir) 96 | actual_mutations_in_each_file.append(successful_mutations) 97 | 98 | print("\n *** Bugs could be seeded in {}/{} files output directory is '{}' ***".format( 99 | np.count_nonzero(actual_mutations_in_each_file), 100 | len(analysed_files), out_dir)) 101 | -------------------------------------------------------------------------------- /bug_seeding/seed_bugs_to_a_file.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Created on 01-April-2020 4 | @author Jibesh Patra 5 | 6 | """ 7 | from bug_seeding_approaches.SemSeed.SemSeedBugs import SemSeedBugs 8 | import utils.static_analysis_utils as static_analysis_utils 9 | import utils.fileutils as fs 10 | import random 11 | from tqdm import tqdm 12 | from pathlib import Path 13 | from typing import List 14 | import os 15 | import jsbeautifier 16 | 17 | random.seed(a=42) 18 | 19 | 20 | def seed_bugs_to_a_file_multiprocessing(args): 21 | """ 22 | The multiprocessing wrapper of seed_bugs_to_a_file function 23 | :param args: 24 | :return: 25 | """ 26 | file, bug_seeding_patterns, K_most_frequent_identifiers, K_most_frequent_literals, MAX_TRIES_TO_SEED_BUGS, out_dir = args 27 | return seed_bugs_to_a_file(file, bug_seeding_patterns, K_most_frequent_identifiers, K_most_frequent_literals, 28 | MAX_TRIES_TO_SEED_BUGS, out_dir) 29 | 30 | 31 | def seed_bugs_to_a_file(file: str, 32 | 
bug_seeding_patterns: List, 33 | K_most_frequent_identifiers: List, 34 | K_most_frequent_literals: List, 35 | MAX_LOCATIONS_TO_TRY_TO_SEED_BUGS: int, 36 | out_dir: str) -> int: 37 | """ 38 | Given a file, seed bugs to it. The expected file is a JSON file rather than a JS file. It is expected 39 | that the input JS file has been analysed before and a corresponding JSON file has been created. 40 | :param file: the corresponding JSON file of the JS file where bugs need to be seeded 41 | :param bug_seeding_patterns: the bug seeding patterns that will be tried at each location 42 | :param K_most_frequent_identifiers: the K most frequent Identifiers in the training commits 43 | :param K_most_frequent_literals: the K most frequent Literals in the training commits 44 | :param MAX_LOCATIONS_TO_TRY_TO_SEED_BUGS: maximum number of locations to try per file (-1 tries every location) 45 | :param out_dir: A path where the mutated code will be written 46 | :return: The count of locations where bugs could be seeded in the file 47 | """ 48 | num_of_locations_that_could_be_mutated = 0 49 | 50 | target_js_file_analysed = fs.read_json_file(file) 51 | if len(target_js_file_analysed) == 0: # The static analysis could not finish properly 52 | return num_of_locations_that_could_be_mutated 53 | possible_bug_seeding_locations = target_js_file_analysed['nodes'] 54 | 55 | # We do not want to select the first 'n' locations and try to seed bugs. Rather we randomly 56 | # choose 'n' locations 57 | random.shuffle(possible_bug_seeding_locations) 58 | if MAX_LOCATIONS_TO_TRY_TO_SEED_BUGS > 1: 59 | possible_bug_seeding_locations = possible_bug_seeding_locations[:MAX_LOCATIONS_TO_TRY_TO_SEED_BUGS] 60 | 61 | # Get Identifiers and Literals available for selection in different scopes 62 | identifiers_in_different_scopes = static_analysis_utils.get_tokens_from_different_scopes( 63 | analysed_file=target_js_file_analysed, 64 | kind='identifier', 65 | k_most_frequent=K_most_frequent_identifiers) 66 | literals_in_different_scopes = static_analysis_utils.get_tokens_from_different_scopes( 67 | analysed_file=target_js_file_analysed, 68 | kind='literal', 69 | k_most_frequent=K_most_frequent_literals) 70 | 71 | file_name = Path(file).name 72 | # Go through each seeding pattern available from the bug seeding patterns 73 | for seeding_pattern in tqdm(bug_seeding_patterns, position=1, ncols=100, ascii=" #", 74 | desc='Trying to apply pattern', 75 | postfix={'file': file_name}): 76 | # For each location in the file, try to seed a bug 77 | for target_location in possible_bug_seeding_locations: 78 | # ------------------------ SemSeed ----------------------------------------- 79 | bug_seeding = SemSeedBugs(bug_seeding_pattern=seeding_pattern, 80 | target_location=target_location, 81 | file_path=target_js_file_analysed['file_path'], 82 | similarity_threshold=0.3, 83 | K=1, 84 | available_identifiers=identifiers_in_different_scopes, 85 | available_literals=literals_in_different_scopes, 86 | scope_of_selection='top_K') 87 | 88 | # Check if the seeding pattern and the target location match 89 | if bug_seeding.is_matching_token_sequence(): 90 | 91 | # The mutated token sequences cover only the 'mutated' target location token sequence 92 | # We may get multiple sequences based on K. 
If K=2 and there is only one 93 | # unbound token, we get 2 sequences 94 | mutated_token_sequences = bug_seeding.apply_pattern() 95 | if len(mutated_token_sequences) > 0: 96 | num_of_locations_that_could_be_mutated += 1 97 | 98 | for ms, mutated_sequence in enumerate(mutated_token_sequences): 99 | token_sequence_after_seeding_bug = bug_seeding.replace_target_with_mutated_token_sequence( 100 | token_list=target_js_file_analysed['tokenList'], 101 | token_range_list=target_js_file_analysed['tokenRangesList'], 102 | mutated_token_sequence=mutated_sequence) 103 | 104 | bug_seeding.bug_metadata['target_token_sequence-Buggy'] = mutated_sequence 105 | bug_seeding.bug_metadata['token_sequence_abstraction-Buggy'] = seeding_pattern['buggy'] 106 | bug_seeding.bug_metadata['num_of_available_identifiers_to_choose_from'] = len( 107 | bug_seeding.identifiers_available_for_selecting_unbound_token) 108 | bug_seeding.bug_metadata['num_of_available_literals_to_choose_from'] = len( 109 | bug_seeding.literals_available_for_selecting_unbound_token) 110 | bug_seeding.bug_metadata['seeding_pattern_url'] = seeding_pattern['url'] 111 | 112 | # Simply joining the token list with a space 113 | mutated_code = ' '.join(token_sequence_after_seeding_bug) 114 | 115 | # Write the output code & metadata about the bug seed 116 | out_file_name = file_name.replace('.json', 117 | f'_SEMSEED_MUTATED_{num_of_locations_that_could_be_mutated}.js') 118 | out_file_path = os.path.join(out_dir, out_file_name) 119 | if fs.pathExists(out_file_path): 120 | out_file_path = out_file_path.replace('.js', 121 | f'_{str(random.randint(0, 10000))}_{["a", "b", "c", "d"][random.randint(0, 3)]}.js') 122 | try: 123 | # Remember, this does not check for Syntax Errors in the generated JS code. This needs to be 124 | # done separately 125 | mutated_code = jsbeautifier.beautify(mutated_code, { 126 | "indent_empty_lines": False, 127 | "break_chained_methods": False, 128 | "space_after_anon_function": False, 129 | "space_in_paren": False 130 | }) 131 | fs.writeFile(data=mutated_code, file_path=out_file_path) 132 | except Exception as e: 133 | tqdm.write(f'ERROR: Could not seed bugs to {file_name} because {e}') 134 | bug_seeding.bug_metadata['error'] = str(e) 135 | finally: 136 | fs.writeJSONFile(data=bug_seeding.bug_metadata, file_path=out_file_path + 'on') 137 | else: 138 | pass 139 | 140 | return num_of_locations_that_could_be_mutated 141 | -------------------------------------------------------------------------------- /bug_seeding/utils/argument_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Created on 24-March-2020 4 | @author Jibesh Patra 5 | 6 | """ 7 | from argparse import ArgumentParser 8 | from utils.fileutils import create_dir_list_if_not_present 9 | from typing import Tuple 10 | 11 | 12 | def read_arguments(parser: ArgumentParser) -> Tuple: 13 | parser = add_arguments_to_parser(parser) 14 | args = parser.parse_args() 15 | create_dir_list_if_not_present([args.out_dir, args.working_dir, args.stats_dir]) 16 | return args.in_dir, args.out_dir, args.working_dir, args.stats_dir, args.bug_seeding_patterns, args.K_freq_idf, args.K_freq_lit 17 | 18 | 19 | def add_arguments_to_parser(parser: ArgumentParser) -> ArgumentParser: 20 | parser.add_argument( 21 | '--in_dir', 22 | type=str, 23 | default='benchmarks/data', 24 | help='The directory containing JS files where bugs may be seeded' 25 | ) 26 | parser.add_argument( 27 | '--out_dir', 28 | type=str, 29 | 
default='benchmarks/js_benchmark_seeded_bugs', 30 | help='The directory where the bug seeded files will be written' 31 | ) 32 | parser.add_argument( 33 | '--working_dir', 34 | type=str, 35 | default='benchmarks/js_benchmark_working_dir', 36 | help='The directory where intermediate results will be written' 37 | ) 38 | parser.add_argument( 39 | '--stats_dir', 40 | type=str, 41 | default='benchmarks/js_benchmark_stats', 42 | help='The directory where statistics about bug seeding will be written' 43 | ) 44 | parser.add_argument( 45 | '--bug_seeding_patterns', 46 | type=str, 47 | default='benchmarks/bug_seeding_patterns_for_semantic_seeding.json', 48 | help='The path to a file that contains the change patterns' 49 | ) 50 | 51 | parser.add_argument( 52 | '--K_freq_idf', 53 | type=str, 54 | default='benchmarks/topK_identifiers_in_training_commits.json', 55 | help='The K most frequent Identifiers' 56 | ) 57 | 58 | parser.add_argument( 59 | '--K_freq_lit', 60 | type=str, 61 | default='benchmarks/topK_literals_in_training_commits.json', 62 | help='The K most frequent Literals' 63 | ) 64 | 65 | return parser 66 | -------------------------------------------------------------------------------- /bug_seeding/utils/bug_seeding_pattern_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Created on 02-April-2020 4 | @author Jibesh Patra 5 | 6 | Given all change patterns, filter and process them 7 | 8 | """ 9 | from typing import List, Tuple 10 | import utils.fileutils as fs 11 | import re 12 | import pandas as pd 13 | 14 | 15 | def get_only_idf_lit_containing_patterns(all_changes): 16 | """ 17 | It is possible that not every bug-fix pattern can be used to seed bugs. 18 | We filter some of them here. For example: 19 | * we may filter very long change patterns (although we do it once while aggregating data from MongoDB) 20 | * we may select only those change patterns that have at least frequency 'N' 21 | """ 22 | filtered_change_patterns = [] 23 | 24 | # # ----------------------- Filtering number of tokens ------------------------- 25 | # max_number_of_tokens = 10 26 | # for change_pattern in self.all_training_change_patterns: 27 | # print('\n\n \t *** *** Selecting only change patterns having total {} tokens *** ***'.format(max_number_of_tokens*2)) 28 | # if len(change_pattern['fix']) <= max_number_of_tokens and len(change_pattern['buggy']) <= max_number_of_tokens: 29 | # filtered_change_patterns.append(change_pattern) 30 | 31 | # ----------------------- Filtering based on the frequency of the change patterns ----------------- 32 | # min_frequency = 4 33 | # print('\n \t *** *** Filtering only change patterns having minimum frequency {} *** ***\n'.format(min_frequency)) 34 | # mapping_of_change_patterns = SeedBugs._str_mapping_change_pattern_to_change( 35 | # all_changes) 36 | 37 | # for mapped_seq in mapping_of_change_patterns: 38 | # if len(mapping_of_change_patterns[mapped_seq]) >= min_frequency: 39 | # filtered_change_patterns.extend( 40 | # mapping_of_change_patterns[mapped_seq]) 41 | 42 | # print("\tTotal {} change patterns and {} filtered change patterns ".format( 43 | # len(mapping_of_change_patterns), len(filtered_change_patterns))) 44 | 45 | # ------------------- Remove those change patterns that do not contain any Identifiers/Literals ------------ 46 | for t in all_changes: 47 | # If the change pattern contains at least one Identifier/Literal, we use that. 
48 | # Else the change pattern is discarded 49 | if 'Idf_' in ' '.join(t['fix']) or 'Idf_' in ' '.join(t['buggy']) or 'Lit_' in ' '.join( 50 | t['fix']) or 'Lit_' in ' '.join(t['buggy']): 51 | filtered_change_patterns.append(t) 52 | 53 | return filtered_change_patterns 54 | 55 | 56 | def find_wrong_operand_in_binary_op_patterns(bug_seeding_patterns: List) -> List: 57 | filtered_patterns = [] 58 | dup_filter = set() 59 | js_binary_operators = ["==", "!=", "===", "!==", "<", "<=", ">", ">=", "<<", ">>", ">>>", "\+", "-", "\*", "/", "%", 60 | "\|", 61 | "\^", "&", "in", "instanceof"] 62 | regexps = [] 63 | for op in js_binary_operators: 64 | regexps.append(re.compile('(Idf_[\d]|Lit_[\d])\s(' + op + ')\s(Idf_[\d]|Lit_[\d])')) 65 | for pattern in bug_seeding_patterns: 66 | correct_part_of_pattern = ' '.join(pattern['fix']) 67 | buggy_part_of_pattern = ' '.join(pattern['buggy']) 68 | if pattern['fix_tokenType'] == 'BinaryExpression' and pattern['buggy_tokenType'] == 'BinaryExpression': 69 | for regex_op_1 in regexps: 70 | in_correct = regex_op_1.findall(correct_part_of_pattern) 71 | for regex_op_2 in regexps: 72 | in_buggy = regex_op_2.findall(buggy_part_of_pattern) 73 | for correct_match in in_correct: 74 | for buggy_match in in_buggy: 75 | if correct_match[1] == buggy_match[1] and correct_match[0] != buggy_match[0] and \ 76 | correct_match[ 77 | 2] == buggy_match[2]: 78 | pattern_as_str = correct_part_of_pattern + buggy_part_of_pattern 79 | if pattern_as_str not in dup_filter: 80 | dup_filter.add(pattern_as_str) 81 | filtered_patterns.append(pattern) 82 | if correct_match[1] == buggy_match[1] and correct_match[0] == buggy_match[0] and \ 83 | correct_match[ 84 | 2] != buggy_match[2]: 85 | pattern_as_str = correct_part_of_pattern + buggy_part_of_pattern 86 | if pattern_as_str not in dup_filter: 87 | dup_filter.add(pattern_as_str) 88 | filtered_patterns.append(pattern) 89 | return filtered_patterns 90 | -------------------------------------------------------------------------------- /bug_seeding/utils/format_bug_seeded_files.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Created on 02-August-2020 4 | @author Jibesh Patra 5 | 6 | Run this script after bug seeding has finished 7 | 8 | """ 9 | import os 10 | import subprocess 11 | from threading import Timer 12 | from tqdm import tqdm 13 | from multiprocessing import Pool, cpu_count 14 | from pathlib import Path 15 | 16 | 17 | def format_a_js_file(target_js_file_path: str) -> str: 18 | def kill_process(p): 19 | return p.kill() 20 | 21 | err_in_execution = False 22 | path_to_process = os.path.join(os.path.normpath( 23 | os.getcwd() + os.sep), 'static_analysis_js', 'utils', 'format_a_js_file.js') 24 | time_out_before_killing = 5000 # seconds 25 | try: 26 | p = subprocess.Popen([ 27 | 'node', path_to_process, 28 | '-inFile', target_js_file_path, 29 | ], stdout=subprocess.PIPE) 30 | time_out = Timer(time_out_before_killing, kill_process, [p]) 31 | try: 32 | time_out.start() 33 | stdout, stderr = p.communicate() 34 | if stderr: 35 | err_in_execution = stderr.decode("utf-8") 36 | finally: 37 | time_out.cancel() 38 | except subprocess.TimeoutExpired: 39 | pass 40 | return err_in_execution 41 | 42 | 43 | def format_files_in_dir(indir): 44 | js_files = list(Path(indir).rglob('*.js')) 45 | print(f"Will format {len(js_files)} files") 46 | with Pool(processes=cpu_count()) as p: 47 | with tqdm(total=len(js_files)) as pbar: 48 | pbar.set_description_str( 49 | desc="Formatting js files ", refresh=False) 50 | 
for i, execution_errors in tqdm( 51 | enumerate(p.imap_unordered(format_a_js_file, 52 | js_files, chunksize=10))): 53 | # print(execution_errors) 54 | pbar.update() 55 | p.close() 56 | p.join() 57 | 58 | 59 | if __name__ == '__main__': 60 | format_files_in_dir('benchmarks/js_benchmark_seeded_bugs') 61 | -------------------------------------------------------------------------------- /bug_seeding/utils/prepare_for_seeding_bug.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Created on 25-March-2020 4 | @author Jibesh Patra 5 | 6 | Call nodejs to tokenize JS files, convert them to their AST representations, etc. 7 | """ 8 | import os 9 | import subprocess 10 | from threading import Timer 11 | import utils.fileutils as fs 12 | from tqdm import tqdm 13 | from multiprocessing import Pool, cpu_count 14 | from typing import List 15 | import random 16 | from pathlib import Path 17 | 18 | 19 | def prepare_a_js_file_for_seeding_bug(target_js_file_path: str, out_json_file_path: str) -> str: 20 | """ 21 | Prepare a JS file for seeding bugs by converting the JS file to AST nodes. 22 | The function creates a Nodejs process to extract the required data. 23 | :param target_js_file_path: The input JS file that will be converted to AST node representations 24 | :param out_json_file_path: The path of the JSON file where the extracted data will be written 25 | :return: An error message if the Nodejs process wrote to stderr, otherwise False 26 | """ 27 | 28 | def kill_process(p): 29 | return p.kill() 30 | 31 | err_in_execution = False 32 | path_to_process = os.path.join(os.path.normpath( 33 | os.getcwd() + os.sep), 'bug_seeding', 'obtain_bug_seeding_patterns', 'extract_bug_seeding_patterns_from_repos', 34 | 'extractNodeData.js') 35 | time_out_before_killing = 180 # seconds 36 | try: 37 | p = subprocess.Popen([ 38 | 'node', path_to_process, 39 | '-inFile', target_js_file_path, 40 | '-outFile', out_json_file_path, 41 | ], 42 | stdout=subprocess.PIPE) 43 | time_out = Timer(time_out_before_killing, kill_process, [p]) 44 | try: 45 | time_out.start() 46 | stdout, stderr = p.communicate() 47 | if stderr: 48 | err_in_execution = stderr.decode("utf-8") 49 | finally: 50 | time_out.cancel() 51 | except subprocess.TimeoutExpired: 52 | pass 53 | return err_in_execution 54 | 55 | 56 | def remove_duplicates(file_list: List, duplicate_file_groups: List) -> List: 57 | """ 58 | Given a list of files and known duplicate groups, keep only one file of each group 59 | :param duplicate_file_groups: Groups of file paths that are known duplicates of each other 60 | :param file_list: The list of file paths to filter 61 | :return: The file list with known duplicates removed 62 | """ 63 | dup_files = set() 64 | for file_group in duplicate_file_groups: 65 | # Except the first file, the rest are all duplicates 66 | dup_files.update(file_group[1:]) 67 | 68 | files_without_duplicates = [] 69 | # Now, we remove the known duplicates 70 | root_dir = '/data/' 71 | # dup_files = set([os.path.join(root_dir, fp) for fp in dup_files]) 72 | for fl_path in file_list: 73 | if fl_path.split(root_dir)[1] not in dup_files: 74 | files_without_duplicates.append(fl_path) 75 | return files_without_duplicates 76 | 77 | 78 | def prepare_a_js_file_for_seeding_bug_multiprocessing(arg): 79 | target_js_file_path, out_json_file_path = arg 80 | prepare_a_js_file_for_seeding_bug(target_js_file_path, out_json_file_path) 81 | 82 | 83 | def prepare_dir_for_seeding_bugs(target_js_dir: str, abstracted_out_dir: str, num_of_files: int = -1) -> None: 84 | """ 85 | Given a directory of JS files, format the code and run static analysis to extract nodes 86 | from the code. 
87 | :param num_of_files: Select only 'num_of_files' files from 'abstracted_out_dir' once it is ready 88 | :param target_js_dir: 89 | 90 | :param abstracted_out_dir: 91 | :return: 92 | """ 93 | fs.create_dir_list_if_not_present([abstracted_out_dir]) 94 | 95 | print(" Reading files in {}".format(target_js_dir)) 96 | all_target_js_files = sorted(Path(target_js_dir).rglob('*.js')) 97 | all_target_js_files = [str(pth) for pth in all_target_js_files if pth.is_file()] 98 | 99 | # Some datasets might have duplicate files. We want to remove the duplicates 100 | print(" Removing duplicates from {} files in benchmarks".format(len(all_target_js_files))) 101 | duplicate_file_groups = fs.read_json_file('benchmarks/js150-duplicates.json') 102 | all_target_js_files = remove_duplicates(file_list=all_target_js_files, duplicate_file_groups=duplicate_file_groups) 103 | 104 | if num_of_files > 1: 105 | random.seed(100) 106 | random.shuffle(all_target_js_files) 107 | all_target_js_files = all_target_js_files[:num_of_files] 108 | print(" Total number of files in benchmark is {}".format(len(all_target_js_files))) 109 | 110 | def create_out_file_path(target_js_file_path: str) -> str: 111 | return os.path.join(abstracted_out_dir, os.path.basename(target_js_file_path) + 'on') 112 | 113 | target_js_files_and_out_paths = [(target_js_file_path, create_out_file_path(target_js_file_path)) 114 | for target_js_file_path in all_target_js_files] 115 | if cpu_count() > 4: 116 | with Pool(processes=cpu_count()) as p: 117 | with tqdm(total=len(all_target_js_files)) as pbar: 118 | pbar.set_description_str( 119 | desc="Preparing js files ", refresh=False) 120 | for i, execution_errors in tqdm( 121 | enumerate(p.imap_unordered(prepare_a_js_file_for_seeding_bug_multiprocessing, 122 | target_js_files_and_out_paths, chunksize=10))): 123 | # print(execution_errors) 124 | pbar.update() 125 | p.close() 126 | p.join() 127 | else: 128 | for target_file, out_file in tqdm(target_js_files_and_out_paths, 129 | desc='Preparing JS files *** Sequentially ***'): 130 | prepare_a_js_file_for_seeding_bug(target_js_file_path=target_file, out_json_file_path=out_file) 131 | -------------------------------------------------------------------------------- /bug_seeding/utils/static_analysis_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Created on 25-March-2020 4 | @author Jibesh Patra 5 | 6 | This file contains helper functions to parse the static analysis results 7 | extracted using nodejs and esprima 8 | """ 9 | from typing import List, Dict 10 | 11 | 12 | def get_all_tokens_in_file(range_to_token_mapping: Dict) -> List: 13 | tokens = set() 14 | for token in range_to_token_mapping.values(): 15 | tokens.add(token) 16 | return list(tokens) 17 | 18 | 19 | def get_tokens_from_different_scopes(analysed_file: dict, kind: str, k_most_frequent: List) -> Dict: 20 | if kind == 'identifier': 21 | return { 22 | 'all_identifiers_in_same_file': get_all_tokens_in_file( 23 | analysed_file['range_to_identifier']), 24 | # A mapping between the functions in the file and the containing Identifiers 25 | 'functions_to_identifiers': analysed_file['functions_to_identifiers'], 26 | 'K_most_frequent_identifiers': k_most_frequent # 1000 most frequent Identifiers 27 | } 28 | else: 29 | return { 30 | 'all_literals_in_same_file': get_all_tokens_in_file( 31 | analysed_file['range_to_literal']), 32 | # A mapping between the functions in the file and the containing Literals 33 | 'functions_to_literals': 
analysed_file['functions_to_literals'], 34 | 'K_most_frequent_literals': k_most_frequent # 1000 most frequent Literals 35 | } 36 | -------------------------------------------------------------------------------- /database_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "database_name": "SemSeed_github_commits_db", 3 | "host": "127.0.0.1", 4 | "port": 27017, 5 | "username": "semSeedUser", 6 | "password": "semSeedPassWord124", 7 | "collection_name": "commits" 8 | } -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "SemSeed", 3 | "version": "1.0.0", 4 | "devDependencies": { 5 | "@octokit/rest": "^18.5.6", 6 | "async": "^3.2.0", 7 | "simple-git": "^2.39.0", 8 | "acorn": "^7.1.1", 9 | "argparse": "^1.0.10", 10 | "escodegen": "^1.14.1", 11 | "esprima": "^4.0.1", 12 | "estraverse": "^5.1.0", 13 | "js-beautify": "^1.11.0", 14 | "process": "^0.11.10", 15 | "strip-comments": "^2.0.1", 16 | "uglify-js": "^3.9.1", 17 | "walk-sync": "^2.1.0" 18 | }, 19 | "dependencies": { 20 | "lodash": "^4.17.21", 21 | "mongodb": "3.6.9" 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==0.12.0 2 | argon2-cffi==20.1.0 3 | astunparse==1.6.3 4 | async-generator==1.10 5 | attrs==19.3.0 6 | Automat==0.8.0 7 | backcall==0.2.0 8 | bleach==3.3.0 9 | blinker==1.4 10 | cachetools==4.2.2 11 | certifi==2019.11.28 12 | cffi==1.14.5 13 | chardet==3.0.4 14 | Click==7.0 15 | cloud-init==21.1 16 | colorama==0.4.3 17 | command-not-found==0.3 18 | configobj==5.0.6 19 | constantly==15.1.0 20 | cryptography==2.8 21 | cycler==0.10.0 22 | dbus-python==1.2.16 23 | decorator==5.0.9 24 | defusedxml==0.7.1 25 | distro==1.4.0 26 | distro-info===0.23ubuntu1 27 | EditorConfig==0.12.2 28 | entrypoints==0.3 29 | fasttext==0.9.1 30 | flatbuffers==1.12 31 | future==0.18.2 32 | gast==0.4.0 33 | google-auth==1.30.1 34 | google-auth-oauthlib==0.4.4 35 | google-pasta==0.2.0 36 | grpcio==1.34.1 37 | h5py==3.1.0 38 | httplib2==0.14.0 39 | hyperlink==19.0.0 40 | idna==2.8 41 | importlib-metadata==1.5.0 42 | incremental==16.10.1 43 | install==1.3.4 44 | ipykernel==5.5.5 45 | ipython==7.24.1 46 | ipython-genutils==0.2.0 47 | ipywidgets==7.6.3 48 | jedi==0.18.0 49 | Jinja2==2.10.1 50 | jsbeautifier==1.11.0 51 | jsonpatch==1.22 52 | jsonpointer==2.0 53 | jsonschema==3.2.0 54 | jupyter==1.0.0 55 | jupyter-client==6.1.12 56 | jupyter-console==6.4.0 57 | jupyter-core==4.7.1 58 | jupyterlab-pygments==0.1.2 59 | jupyterlab-widgets==1.0.0 60 | Keras==2.4.3 61 | keras-nightly==2.5.0.dev2021032900 62 | Keras-Preprocessing==1.1.2 63 | keyring==18.0.1 64 | kiwisolver==1.3.1 65 | language-selector==0.1 66 | launchpadlib==1.10.13 67 | lazr.restfulclient==0.14.2 68 | lazr.uri==1.0.3 69 | libcst==0.3.19 70 | Markdown==3.3.4 71 | MarkupSafe==1.1.0 72 | matplotlib==3.4.2 73 | matplotlib-inline==0.1.2 74 | mistune==0.8.4 75 | mongoengine==0.23.1 76 | more-itertools==4.2.0 77 | mypy-extensions==0.4.3 78 | nbclient==0.5.3 79 | nbconvert==6.0.7 80 | nbformat==5.1.3 81 | nest-asyncio==1.5.1 82 | netifaces==0.10.4 83 | notebook==6.4.0 84 | numpy==1.19.5 85 | oauthlib==3.1.0 86 | opt-einsum==3.3.0 87 | packaging==20.9 88 | pandas==1.0.3 89 | pandocfilters==1.4.3 90 | parsepatch==0.1.3 91 | parso==0.8.2 92 | pbr==5.6.0 93 | 
pexpect==4.6.0 94 | pickleshare==0.7.5 95 | Pillow==7.1.1 96 | prometheus-client==0.11.0 97 | prompt-toolkit==3.0.18 98 | protobuf==3.17.2 99 | ptyprocess==0.7.0 100 | pyasn1==0.4.2 101 | pyasn1-modules==0.2.1 102 | pybind11==2.5.0 103 | pycparser==2.20 104 | pygit2==1.6.0 105 | Pygments==2.9.0 106 | PyGObject==3.36.0 107 | PyHamcrest==1.9.0 108 | PyJWT==1.7.1 109 | pymacaroons==0.13.0 110 | pymongo==3.11.4 111 | PyNaCl==1.3.0 112 | pyOpenSSL==19.0.0 113 | pyparsing==2.4.7 114 | pyrsistent==0.15.5 115 | pyserial==3.4 116 | python-apt==2.0.0+ubuntu0.20.4.4 117 | python-dateutil==2.8.1 118 | python-debian===0.1.36ubuntu1 119 | pytz==2019.3 120 | PyYAML==5.3.1 121 | pyzmq==22.1.0 122 | qtconsole==5.1.0 123 | QtPy==1.9.0 124 | requests==2.22.0 125 | requests-oauthlib==1.3.0 126 | requests-unixsocket==0.2.0 127 | rsa==4.7.2 128 | scipy==1.4.1 129 | seaborn==0.11.1 130 | SecretStorage==2.3.1 131 | Send2Trash==1.5.0 132 | service-identity==18.1.0 133 | simplejson==3.16.0 134 | six==1.15.0 135 | sos==4.1 136 | ssh-import-id==5.10 137 | systemd-python==234 138 | tensorboard==2.5.0 139 | tensorboard-data-server==0.6.1 140 | tensorboard-plugin-wit==1.8.0 141 | tensorflow==2.5.0 142 | tensorflow-estimator==2.5.0 143 | termcolor==1.1.0 144 | terminado==0.10.0 145 | testpath==0.5.0 146 | testresources==2.0.1 147 | torch==1.8.1+cpu 148 | torchaudio==0.8.1 149 | torchvision==0.9.1+cpu 150 | tornado==6.1 151 | tqdm==4.45.0 152 | traitlets==5.0.5 153 | Twisted==18.9.0 154 | typing==3.7.4.1 155 | typing-extensions==3.7.4.3 156 | typing-inspect==0.6.0 157 | ubuntu-advantage-tools==27.0 158 | ufw==0.36 159 | unattended-upgrades==0.1 160 | urllib3==1.25.8 161 | wadllib==1.3.3 162 | wcwidth==0.2.5 163 | webencodings==0.5.1 164 | Werkzeug==2.0.1 165 | widgetsnbextension==3.5.1 166 | wrapt==1.12.1 167 | zipp==1.0.0 168 | zope.interface==4.7.1 169 | --------------------------------------------------------------------------------
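
For reference, the seeding step can also be driven programmatically instead of through run_bug_seeding.py. The following is a minimal sketch, not a file of the repository: it assumes the bug_seeding/ directory as the working directory, the default benchmark files listed in utils/argument_utils.py, and an illustrative analysed-JSON path ('analysed/example.json') that must first be produced by utils/prepare_for_seeding_bug.py.

# Minimal sketch (not part of the repository) of seeding bugs into one pre-analysed file.
# Assumes: working directory is bug_seeding/, the default benchmark files from
# utils/argument_utils.py exist, and 'analysed/example.json' is an illustrative path
# produced beforehand by the static analysis in utils/prepare_for_seeding_bug.py.
import utils.fileutils as fs
from utils.bug_seeding_pattern_utils import get_only_idf_lit_containing_patterns
from seed_bugs_to_a_file import seed_bugs_to_a_file

# Load the change patterns and keep only those containing Identifiers/Literals,
# mirroring what run_bug_seeding.py does before seeding
patterns = get_only_idf_lit_containing_patterns(
    fs.read_json_file('benchmarks/bug_seeding_patterns_for_semantic_seeding.json'))
top_identifiers = fs.read_json_file('benchmarks/topK_identifiers_in_training_commits.json')
top_literals = fs.read_json_file('benchmarks/topK_literals_in_training_commits.json')

seeded = seed_bugs_to_a_file(
    file='analysed/example.json',            # JSON produced by the static analysis step (illustrative path)
    bug_seeding_patterns=patterns,
    K_most_frequent_identifiers=top_identifiers,
    K_most_frequent_literals=top_literals,
    MAX_LOCATIONS_TO_TRY_TO_SEED_BUGS=-1,    # -1: try to seed at every location
    out_dir='benchmarks/js_benchmark_seeded_bugs')
print('Bugs seeded at', seeded, 'locations')

After seeding, utils/format_bug_seeded_files.py can be run to re-format the generated files, as its docstring notes.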