├── orm
│   ├── __init__.py
│   ├── README.md
│   ├── user.py
│   ├── feedback.py
│   ├── repository.py
│   ├── glmcoefficients.py
│   ├── commit.py
│   └── metrics.py
├── analyzer
│   ├── __init__.py
│   ├── datasets
│   │   ├── monthly
│   │   │   └── .placeholder
│   │   ├── .placeholder
│   │   └── README.md
│   ├── README.md
│   ├── repositorymetrics.py
│   ├── code_file_extentions.txt
│   ├── notifier.py
│   ├── analyzer.py
│   ├── bugfinder.py
│   ├── githubissuetracker.py
│   ├── medianmodel.py
│   ├── metricsgenerator.py
│   ├── git_commit_linker.py
│   └── linear_reg_model.py
├── classifier
│   ├── __init__.py
│   ├── Categories
│   │   ├── perfective.csv
│   │   ├── non_functional.csv
│   │   ├── corrective.csv
│   │   ├── preventative.csv
│   │   ├── feature_addition.csv
│   │   └── README.md
│   ├── category.py
│   └── classifier.py
├── ingester
│   ├── __init__.py
│   ├── CASRepos
│   │   └── .placeholder
│   ├── casr.bat
│   ├── README.md
│   ├── commitFile.py
│   ├── ingester.py
│   ├── localrepository.py
│   └── git.py
├── CONTRIBUTORS.md
├── config.py
├── .gitignore
├── config.example.json
├── script.py
├── caslogging.py
├── db.py
├── test_categorization.py
├── README.md
├── cas_manager.py
└── LICENSE

/orm/__init__.py:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/analyzer/__init__.py:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/classifier/__init__.py:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/ingester/__init__.py:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/analyzer/datasets/monthly/.placeholder:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/classifier/Categories/perfective.csv:
--------------------------------------------------------------------------------
clean,better
--------------------------------------------------------------------------------
/ingester/CASRepos/.placeholder:
--------------------------------------------------------------------------------
Placeholder
--------------------------------------------------------------------------------
/classifier/Categories/non_functional.csv:
--------------------------------------------------------------------------------
doc,merge
--------------------------------------------------------------------------------
/ingester/casr.bat:
--------------------------------------------------------------------------------
@echo off
python scan.py %*
--------------------------------------------------------------------------------
/classifier/Categories/corrective.csv:
--------------------------------------------------------------------------------
fix,bug,wrong,fail,problem
--------------------------------------------------------------------------------
/classifier/Categories/preventative.csv:
--------------------------------------------------------------------------------
test,junit,coverage,assert
--------------------------------------------------------------------------------
/orm/README.md:
--------------------------------------------------------------------------------
Holds all the SQLAlchemy ORM base abstraction classes.
--------------------------------------------------------------------------------
/classifier/Categories/feature_addition.csv:
--------------------------------------------------------------------------------
new,add,requirement,initial,create
--------------------------------------------------------------------------------
/analyzer/datasets/.placeholder:
--------------------------------------------------------------------------------
Simply a placeholder so that Git will track this folder.
--------------------------------------------------------------------------------
/classifier/Categories/README.md:
--------------------------------------------------------------------------------
This folder contains all the words used to categorize commits. The words associated with each category are expected to be comma-separated and lower case.
--------------------------------------------------------------------------------
/CONTRIBUTORS.md:
--------------------------------------------------------------------------------
Christoffer Rosen
Ben Grawi

Worker and ThreadPool implementation inspired by Emilio Monti, shared @ http://code.activestate.com/recipes/577187-python-thread-pool/
--------------------------------------------------------------------------------
/ingester/README.md:
--------------------------------------------------------------------------------
CAS Ingester
==========

Package containing files for parsing git commits and inserting each commit into a PostgreSQL table.

### Dependencies
* Python >= 3.3
* Pip for Python >= 3.3
* Git > 1.7
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
"""
file: config.py
author: Ben Grawi
date: November 2013
description: Reads the config.json info into a variable
"""
import json

with open('./config.json') as config_file:
    config = json.load(config_file)
--------------------------------------------------------------------------------
/analyzer/datasets/README.md:
--------------------------------------------------------------------------------
This folder contains all the CSV datasets used for metrics for each repository.
Kept for research/analysis purposes.

The monthly folder contains CSV dumps of all commit data for each repository.
Datasets are named after their repository ID.
--------------------------------------------------------------------------------
/analyzer/README.md:
--------------------------------------------------------------------------------
CAS Analyzer
==========

Package containing files required for analyzing repositories and generating median values of the buggy versus non-buggy metrics.

### Dependencies
* Python >= 3.3
* Pip for Python >= 3.3
* Git > 1.7
* R
* python-dev
* rpy2
* requests
* dateutil
--------------------------------------------------------------------------------
/ingester/commitFile.py:
--------------------------------------------------------------------------------
"""
Represents a single file in a commit.
@name        name of file
@loc         lines of code in file
@authors     all authors of the file
@lastchanged unix timestamp of when the file was last changed
@nuc         number of unique changes made to the file
"""

class CommitFile:

    def __init__(self, name, loc, authors, lastchanged):
        self.name = name                # File name
        self.loc = loc                  # LOC in file
        self.authors = authors          # Array of authors
        self.lastchanged = lastchanged  # Unix timestamp of when last changed
        self.nuc = 1                    # Number of unique changes to the file
--------------------------------------------------------------------------------
/orm/user.py:
--------------------------------------------------------------------------------
"""
file: user.py
description: Holds the user abstraction class and ORM
"""
import uuid
from db import *
from datetime import datetime

class User(Base):
    """
    User():
    description: The SQLAlchemy ORM for the user table
    """
    __tablename__ = 'users'

    id = Column(String, primary_key=True)
    email = Column(String)
    password = Column(String)

    def __init__(self, userDict):
        """
        __init__(): Dictionary -> NoneType
        """
        self.__dict__.update(userDict)

    def __repr__(self):
        return "<User('%s')>" % (self.id)
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.py[cod]

# C extensions
*.so

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg
lib
lib64
__pycache__

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox
nosetests.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject

# CAS Analyzer
analyzer/CASAnalyzer.log
CASAnalyzer.log

# CAS Reader
ingester/CASReader.log
ingester/CASRepos/git/*

analyzer/datasets/*

CASLog.log
config.json
.DS_Store
!.gitkeep
--------------------------------------------------------------------------------
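The ORM classes in this dump (user.py above, and the feedback, repository, and commit classes later on) all initialize themselves by copying a dictionary onto the instance via `__dict__.update`. A minimal standalone sketch of that pattern — a plain class with hypothetical field names, no SQLAlchemy involved:

```python
class Record:
    """Minimal sketch of the dict-based __init__ pattern used by the ORM classes."""

    def __init__(self, record_dict):
        # Copy every key/value pair onto the instance as attributes,
        # just as User.__init__ does with userDict.
        self.__dict__.update(record_dict)

    def __repr__(self):
        return "<Record('%s')>" % self.id


user = Record({"id": "42", "email": "someone@example.com"})
print(user.email)  # attributes come straight from the dict
```

The upside is that one constructor works for any column set; the downside is that nothing validates the keys, so a misspelled key silently becomes a stray attribute.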
/orm/feedback.py:
--------------------------------------------------------------------------------
"""
file: feedback.py
description: Holds the feedback abstraction class and ORM
"""
import uuid
from db import *
from datetime import datetime

class Feedback(Base):
    """
    Feedback():
    description: The SQLAlchemy ORM for the feedback table
    """
    __tablename__ = 'feedback'

    id = Column(String, primary_key=True)
    commit_hash = Column(String)
    score = Column(Integer)
    comment = Column(String)

    def __init__(self, feedbackDict):
        """
        __init__(): Dictionary -> NoneType
        """
        self.__dict__.update(feedbackDict)

    def __repr__(self):
        return "<Feedback('%s')>" % (self.id)
--------------------------------------------------------------------------------
/config.example.json:
--------------------------------------------------------------------------------
{
    "db": {
        "type": "postgresql",
        "adapter": "pypostgresql",
        "username": "USERNAME",
        "password": "PASSWORD",
        "host": "localhost",
        "port": "5432",
        "database": "cas"
    },
    "logging_system": {
        "filename": "CASLog.log"
    },
    "logging_analyzer": {
        "filename": "Analyzer.log"
    },
    "gmail": {
        "user": "USERNAME@gmail.com",
        "pass": "PASSWORD"
    },
    "repoUpdates": {
        "freqInDays": "FREQUENCY IN DAYS"
    },
    "system": {
        "workers": "NUMBER OF WORKER THREADS"
    },
    "github": {
        "user": "example_user",
        "pass": "PASSWORD"
    },
    "glm_modeling": {
        "months": "3"
    },
    "data_dumps": {
        "location": "THE FULL PATH TO /analyzer/datasets"
    }
}
--------------------------------------------------------------------------------
/script.py:
--------------------------------------------------------------------------------
"""
file: script.py
authors: Christoffer Rosen, Ben Grawi
date: Jan. 2014
description: base script to call.
"""
import sys
from cas_manager import *
from analyzer.analyzer import *
from orm.feedback import *         # so that we create the table - used by web
from orm.user import *             # so that we create the table - used by web
from orm.glmcoefficients import *  # so that we create the table - used by web

if len(sys.argv) > 1:
    arg = sys.argv[1]
else:
    arg = ''

if arg == "initDb":
    # Init the database
    logging.info('Initializing the Database...')
    Base.metadata.create_all(engine)
    logging.info('Done')
else:
    logging.info("Starting CAS Manager")
    cas_manager = CAS_Manager()
    cas_manager.start()
--------------------------------------------------------------------------------
/caslogging.py:
--------------------------------------------------------------------------------
"""
file: caslogging.py
author: Ben Grawi
date: October 2013
description: Sets up the logging information for the CAS Reader
"""
from config import config
import logging as root_logging

# Set up the logger
logger = root_logging.getLogger()
logger.setLevel(root_logging.INFO)

logger_format = root_logging.Formatter('%(asctime)s %(levelname)s: %(message)s', '%Y-%m-%d %H:%M:%S')

logging_file_handler = root_logging.FileHandler(config['logging_system']['filename'])
logging_file_handler.setLevel(root_logging.INFO)
logging_file_handler.setFormatter(logger_format)
logger.addHandler(logging_file_handler)

logging_stream_handler = root_logging.StreamHandler()
logging_stream_handler.setLevel(root_logging.INFO)
logging_stream_handler.setFormatter(logger_format)
logger.addHandler(logging_stream_handler)

logging = root_logging
--------------------------------------------------------------------------------
/db.py:
--------------------------------------------------------------------------------
"""
file: db.py
author: Ben Grawi
date: October 2013
description: Holds the db connection info
"""
from config import *
import sqlalchemy
from sqlalchemy import *
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

Session = sessionmaker()
# Note: pool_size has to be less than the max_connections setting of Postgres.
engine = sqlalchemy.create_engine(config['db']['type'] + '+' +
                                  config['db']['adapter'] + '://' +
                                  config['db']['username'] + ':' +
                                  config['db']['password'] + '@' +
                                  config['db']['host'] + ':' +
                                  config['db']['port'] + '/' +
                                  config['db']['database'], pool_size=100, max_overflow=0)
Session.configure(bind=engine)
Base = declarative_base()
--------------------------------------------------------------------------------
/analyzer/repositorymetrics.py:
--------------------------------------------------------------------------------

class RepositoryMetrics:
    """
    Holds all the metrics values for a repository
    """

    def __init__(self):

        self.ns_buggy = []
        self.ns_nonbuggy = []
        self.nd_buggy = []
        self.nd_nonbuggy = []
        self.nf_buggy = []
        self.nf_nonbuggy = []
        self.entrophy_buggy = []
        self.entrophy_nonbuggy = []
        self.la_buggy = []
        self.la_nonbuggy = []
        self.ld_buggy = []
        self.ld_nonbuggy = []
        self.lt_buggy = []
        self.lt_nonbuggy = []
        self.ndev_buggy = []
        self.ndev_nonbuggy = []
        self.age_buggy = []
        self.age_nonbuggy = []
        self.nuc_buggy = []
        self.nuc_nonbuggy = []
        self.exp_buggy = []
        self.exp_nonbuggy = []
        self.rexp_nonbuggy = []
        self.rexp_buggy = []
        self.sexp_buggy = []
        self.sexp_nonbuggy = []
        self.num_buggy = 0
        self.num_nonbuggy = 0
--------------------------------------------------------------------------------
/orm/repository.py:
--------------------------------------------------------------------------------
"""
file: repository.py
author: Ben Grawi
date: October 2013
description: Holds the repository abstraction class and ORM
"""
import uuid
from db import *
from datetime import datetime

class Repository(Base):
    """
    Repository():
    description: The SQLAlchemy ORM for the repository table
    """
    __tablename__ = 'repositories'

    id = Column(String, primary_key=True)
    name = Column(String)
    url = Column(String)

    creation_date = Column(String)
    ingestion_date = Column(String)
    analysis_date = Column(String)
    status = Column(String)
    email = Column(String)
    listed = Column(Boolean)
    last_data_dump = Column(String)

    def __init__(self, repoDict):
        """
        __init__(): Dictionary -> NoneType
        """
        self.id = str(uuid.uuid1())
        self.creation_date = str(datetime.now().replace(microsecond=0))
        self.__dict__.update(repoDict)

    def __repr__(self):
        return "<Repository('%s', '%s')>" % (self.name, self.id)
--------------------------------------------------------------------------------
/analyzer/code_file_extentions.txt:
--------------------------------------------------------------------------------
ADA
ADB
ADS
ASM
BAS
BB
BMX
C
CLJ
CLS
COB
CBL
CPP
CC
CXX
C
CBP
CS
CSPROJ
D
DBA
DBPro123
E
EFS
EGT
EL
FOR
FTN
F
F77
F90
FRM
GO
H
HPP
HXX
HS
I
INC
JAVA
L
LGT
LISP
M
M4
ML
N
NB
P
PAS
PP
P
PHP
PHP3
PHP4
PHP5
PHPS
Phtml
PHP
PIV
PL
PM
PRG
PRO
PY
R
RB
RESX
RC
RC2
RKT
RKTL
SCI
SCE
SCM
SD7
SKB
SKC
SKD
SKF
SKG
SKI
SKK
SKM
SKO
SKP
SKQ
SKS
SKT
SKZ
SLN
SPIN
STK
SWG
TCL
VAP
VB
VBG
XPL
XQ
XSL
Y
AHK
APPLESCRIPT
AS
AU3
BAT
BAS
CMD
Coffee
EGG
EGT
ERB
HTA
IBI
ICI
IJS
ITCL
JS
JSFL
LUA
MRC
NCF
NUT
PHP
PL
PM
PS1
PS1XML
PSC1
PSD1
PSM1
PY
R
RB
RDP
SCPT
SCPTD
SDL
SH
TCL
VBS
XPL
ebuild
--------------------------------------------------------------------------------
/classifier/category.py:
--------------------------------------------------------------------------------
import csv  # csv module for reading in comma-separated files

class Category():
    """
    Represents a category used to categorize commits.
6 | """ 7 | 8 | associatedWords = [] # all words associated w/ this category 9 | category_name = None # name of category 10 | 11 | def __init__(self, fileLocation, name): 12 | """ 13 | constructor 14 | reads in all associated words w/ this category from specified 15 | file location 16 | """ 17 | self.category_name = name 18 | self.associatedWords = [] # reset the instance so that class name is visible to self reference 19 | self.readInAssociatedWords(fileLocation) 20 | 21 | def readInAssociatedWords(self, fileLocation): 22 | """ 23 | reads in all associated words w/ this category 24 | """ 25 | with open(fileLocation, 'rt') as csvfile: 26 | wordreader = csv.reader(csvfile, delimiter=',', quotechar='|') 27 | for row in wordreader: 28 | for word in row: 29 | self.associatedWords.append(word) 30 | 31 | def belongs(self, commit_msg): 32 | """ 33 | checks if a commit belongs to this category by analyzing 34 | its commit message. 35 | @return boolean 36 | """ 37 | commit_msg = commit_msg.lower().split(" ") # to array 38 | 39 | # need to go beyond list contains i.e. fixed = fix 40 | for word in commit_msg: 41 | for assoc_word in self.associatedWords: 42 | if assoc_word in word: 43 | return True 44 | 45 | # No associated words found! 46 | return False 47 | 48 | def getName(self): 49 | """ 50 | returns the name of the category 51 | """ 52 | return self.category_name 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /classifier/classifier.py: -------------------------------------------------------------------------------- 1 | from classifier.category import * 2 | import os 3 | 4 | class Classifier(): 5 | """ 6 | Classifier classifies commit messages into their appropriate 7 | category. ALso defines the categories to be used. 8 | """ 9 | 10 | categories = [] # array of possible commit categories 11 | 12 | def __init__(self): 13 | """ 14 | constructor 15 | create all categories which commits can be categorized in. 
16 | """ 17 | 18 | # Get directory of the csv files of associated words for categories 19 | current_dir = os.path.dirname(__file__) 20 | dir_of_cats = current_dir + '/Categories' 21 | 22 | # Create the categories - takes in the location of the csv that defines associated words 23 | #for the category & classification name 24 | corrective = Category(dir_of_cats + "/corrective.csv", "Corrective") 25 | feature_addition = Category(dir_of_cats + "/feature_addition.csv", "Feature Addition") 26 | non_functional = Category(dir_of_cats + "/non_functional.csv", "Non Functional") 27 | perfective = Category(dir_of_cats + "/perfective.csv", "Perfective") 28 | perventive = Category(dir_of_cats + "/preventative.csv", "Preventative") 29 | 30 | # add to list of categories 31 | self.categories.extend([corrective,feature_addition,non_functional,perfective,perventive]) 32 | 33 | def categorize(self, commit_msg): 34 | """ 35 | returns the category of a commit_msg 36 | """ 37 | category_found = False 38 | 39 | # See if commmit message belongs to any category 40 | for category in self.categories: 41 | if category.belongs(commit_msg): 42 | return category.getName() 43 | 44 | # doesn't classify to any of the above 45 | return "None" 46 | 47 | -------------------------------------------------------------------------------- /analyzer/notifier.py: -------------------------------------------------------------------------------- 1 | """ 2 | file: notifier.py 3 | author: Christoffer Rosen 4 | date: December, 2013 5 | description: Notification system using gmail to notify subscribers that 6 | a repo's analysis has been completed. 
7 | """ 8 | 9 | import smtplib 10 | from caslogging import logging 11 | 12 | class Notifier: 13 | 14 | 15 | def __init__(self, gmail_user, gmail_pwd, repo): 16 | """ 17 | Constructor 18 | """ 19 | self.gmail_user = gmail_user 20 | self.gmail_pwd = gmail_pwd 21 | self.repo = repo 22 | self.subscribers = [] 23 | 24 | def addSubscribers(self, users): 25 | """ 26 | Subscribes a list of users to be notified next time, overriding 27 | previous subscribers 28 | @param users: an array containing e-mail address of future subscribers 29 | """ 30 | 31 | self.subscribers = users 32 | 33 | def notify(self): 34 | """ 35 | Notify all subscribers that repo has been analyzed and is ready 36 | to be viewed 37 | """ 38 | 39 | FROM = "cas.notifier@gmail.com" 40 | TO = self.subscribers 41 | SUBJECT = "Your repository has been analyzed" 42 | TEXT = "Your analyzed repository is now ready to be viewed at http://kiwi.se.rit.edu/repo/" + self.repo 43 | 44 | # prepare actual message 45 | message = """\From: %s\nTo: %s\nSubject: %s\n\n%s""" % (FROM, ", ".join(TO), SUBJECT, TEXT) 46 | 47 | try: 48 | server = smtplib.SMTP("smtp.gmail.com", 587) 49 | server.ehlo() 50 | server.starttls() 51 | server.login(self.gmail_user, self.gmail_pwd) 52 | server.sendmail(FROM, TO, message) 53 | server.quit() 54 | 55 | logging.info("Notification sent successfully") 56 | 57 | except: 58 | logging.error("Failed to send notification") 59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /orm/glmcoefficients.py: -------------------------------------------------------------------------------- 1 | """ 2 | file: glmcoefficients.py 3 | description: Holds the glm coefficients abstraction class and ORM 4 | """ 5 | import uuid 6 | from db import * 7 | from datetime import datetime 8 | 9 | class GlmCoefficients(Base): 10 | 11 | __tablename__ = 'glm_coefficients' 12 | 13 | repo = Column(String, primary_key=True) 14 | 15 | intercept = Column(Float, unique=False, 
default=0) 16 | intercept_sig = Column(Float) 17 | 18 | ns = Column(Float, unique=False, default=0) 19 | ns_sig = Column(Float) 20 | 21 | nd = Column(Float, unique=False, default=0) 22 | nd_sig = Column(Float) 23 | 24 | nf = Column(Float, unique=False, default=0) 25 | nf_sig = Column(Float) 26 | 27 | entrophy = Column(Float, unique=False, default=0) 28 | entrophy_sig = Column(Float) 29 | 30 | la = Column(Float, unique=False, default=0) 31 | la_sig = Column(Float) 32 | 33 | ld = Column(Float, unique=False, default=0) 34 | ld_sig = Column(Float) 35 | 36 | lt = Column(Float, unique=False, default=0) 37 | lt_sig = Column(Float) 38 | 39 | ndev = Column(Float, unique=False, default=0) 40 | ndev_sig = Column(Float) 41 | 42 | age = Column(Float, unique=False, default=0) 43 | age_sig = Column(Float) 44 | 45 | nuc = Column(Float, unique=False, default=0) 46 | nuc_sig = Column(Float) 47 | 48 | exp = Column(Float, unique=False, default=0) 49 | exp_sig = Column(Float) 50 | 51 | rexp = Column(Float, unique=False, default=0) 52 | rexp_sig = Column(Float) 53 | 54 | sexp = Column(Float, unique=False, default=0) 55 | sexp_sig = Column(Float) 56 | 57 | def __init__(self, glmCoefficientsDict): 58 | """ 59 | __init__(): Dictonary -> NoneType 60 | """ 61 | self.__dict__.update(glmCoefficientsDict) 62 | 63 | def __repr__(self): 64 | return "" % (self.repo) 65 | -------------------------------------------------------------------------------- /ingester/ingester.py: -------------------------------------------------------------------------------- 1 | """ 2 | file: readRepo.py 3 | authors: Ben Grawi , Christoffer Rosen 4 | date: October 2013 5 | description: This module contains the functions for ingesting a repository with 6 | a given id. 
7 | """ 8 | from caslogging import logging 9 | import sys 10 | from datetime import datetime, timedelta 11 | from orm.commit import * 12 | from orm.repository import * 13 | from orm.metrics import * 14 | from ingester.localrepository import * 15 | 16 | def ingestRepo(repository_to_ingest, session): 17 | """ 18 | Ingests a given repository 19 | @param repository_to_ingest The repository to inspect 20 | @param session The SQLAlchemy session 21 | @private 22 | """ 23 | logging.info( 'A worker is starting scan repository: ' + 24 | repository_to_ingest.id ) 25 | 26 | # Update status of repo to show it is ingesting 27 | repository_to_ingest.status = "Ingesting" 28 | session.commit() 29 | 30 | local_repo = LocalRepository(repository_to_ingest) 31 | local_repo.sync() 32 | session.merge(repository_to_ingest) 33 | repository_to_ingest.status = "Waiting to be Analyzed" # update status 34 | session.commit() 35 | 36 | logging.info( 'A worker finished ingesting repo ' + 37 | repository_to_ingest.id ) 38 | 39 | session.close() 40 | 41 | def ingest(repo_id): 42 | """ 43 | Ingest a repository with the given id. Gets the repository information 44 | from the repository table and starts ingesting using ingestRepo method 45 | @param repo_id The repository id to ingest. 
46 | """ 47 | session = Session() 48 | repo_to_analyze = (session.query(Repository) 49 | .filter (Repository.id == repo_id) 50 | .all() 51 | ) 52 | 53 | # Verify that repo exists 54 | if len(repo_to_analyze) == 1: 55 | ingestRepo(repo_to_analyze[0], session) 56 | else: 57 | logging.info('Repo with id ' + repo_id_to_analyze + ' not found!') 58 | 59 | session.close() -------------------------------------------------------------------------------- /orm/commit.py: -------------------------------------------------------------------------------- 1 | """ 2 | file: commit.py 3 | author: Ben Grawi , Christoffer Rosen 4 | date: Novemeber 2013 5 | description: Holds the commit abstraction class and ORM 6 | """ 7 | from db import * 8 | #from sqlalchemy import * 9 | 10 | class Commit(Base): 11 | """ 12 | Commit(): 13 | description: The SQLAlchemy ORM for the commits table 14 | """ 15 | __tablename__ = 'commits' 16 | 17 | commit_hash = Column(String, primary_key=True) 18 | author_name = Column(String) 19 | author_date_unix_timestamp = Column(String) 20 | author_email = Column(String) 21 | author_date = Column(String) 22 | commit_message = Column(String) 23 | 24 | fix = Column(String, unique=False) 25 | classification = Column(String, unique=False) 26 | linked = Column(Boolean, default = False) 27 | 28 | contains_bug = Column(Boolean, unique=False, default=False) 29 | fixes = Column(String, unique=False) 30 | ns = Column(Float, unique=False, default=0) 31 | nd = Column(Float, unique=False, default=0) 32 | nf = Column(Float, unique=False, default=0) 33 | entrophy = Column(Float, unique=False, default=0) 34 | la = Column(Float, unique=False, default=0) 35 | ld = Column(Float, unique=False, default=0) 36 | fileschanged = Column(String, unique=False, default="NULL") 37 | lt = Column(Float, unique=False, default=0) 38 | ndev = Column(Float, unique=False, default=0) 39 | age = Column(Float, unique=False, default=0) 40 | nuc = Column(Float, unique=False, default=0) 41 | exp = 
Column(Float, unique=False, default=0) 42 | rexp = Column(Float, unique=False, default=0) 43 | sexp = Column(Float, unique=False, default=0) 44 | 45 | # The linear regression probability of commit containing bug 46 | glm_probability = Column(Float, unique=False, default=0) 47 | 48 | # Many-to-One Relation to repositories table 49 | repository_id = Column(String) 50 | 51 | def __init__(self, commitDict): 52 | """ 53 | __init__(): Dictonary -> NoneType 54 | """ 55 | self.__dict__.update(commitDict) 56 | 57 | def __repr__(self): 58 | return "" % \ 59 | (self.commit_hash, 60 | self.author_name, 61 | self.author_date, 62 | self.commit_message) 63 | -------------------------------------------------------------------------------- /orm/metrics.py: -------------------------------------------------------------------------------- 1 | """ 2 | file: metrics.py 3 | author: Christoffer Rosen 4 | date: November 2013 5 | description: Holds the metrics abstraction class and ORM 6 | """ 7 | from db import * 8 | from datetime import datetime 9 | 10 | class Metrics(Base): 11 | """ 12 | Metrics(): 13 | description: The SQLAlchemy ORM for the repository table 14 | """ 15 | __tablename__ = 'metrics' 16 | 17 | repo = Column(String, primary_key=True) 18 | 19 | nsbuggy = Column(Float, unique=False, default=0) 20 | nsnonbuggy = Column(Float, unique=False, default=0) 21 | ns_sig = Column(Float) 22 | 23 | ndbuggy = Column(Float, unique=False, default=0) 24 | ndnonbuggy = Column(Float, unique=False, default=0) 25 | nd_sig = Column(Float) 26 | 27 | nfbuggy = Column(Float, unique=False, default=0) 28 | nfnonbuggy = Column(Float, unique=False, default=0) 29 | nf_sig = Column(Float) 30 | 31 | entrophybuggy = Column(Float, unique=False, default=0) 32 | entrophynonbuggy = Column(Float, unique=False, default=0) 33 | entrophy_sig = Column(Float) 34 | 35 | labuggy = Column(Float, unique=False, default=0) 36 | lanonbuggy = Column(Float, unique=False, default=0) 37 | la_sig = Column(Float) 38 | 39 | 
ldbuggy = Column(Float, unique=False, default=0) 40 | ldnonbuggy = Column(Float, unique=False, default=0) 41 | ld_sig = Column(Float) 42 | 43 | ltbuggy = Column(Float, unique=False, default=0) 44 | ltnonbuggy = Column(Float, unique=False, default=0) 45 | lt_sig = Column(Float) 46 | 47 | ndevbuggy = Column(Float, unique=False, default=0) 48 | ndevnonbuggy = Column(Float, unique=False, default=0) 49 | ndev_sig = Column(Float) 50 | 51 | agebuggy = Column(Float, unique=False, default=0) 52 | agenonbuggy = Column(Float, unique=False, default=0) 53 | age_sig = Column(Float) 54 | 55 | nucbuggy = Column(Float, unique=False, default=0) 56 | nucnonbuggy = Column(Float, unique=False, default=0) 57 | nuc_sig = Column(Float) 58 | 59 | expbuggy = Column(Float, unique=False, default=0) 60 | expnonbuggy = Column(Float, unique=False, default=0) 61 | exp_sig = Column(Float) 62 | 63 | rexpnonbuggy = Column(Float, unique=False, default=0) 64 | rexpbuggy = Column(Float, unique=False, default=0) 65 | rexp_sig = Column(Float) 66 | 67 | sexpbuggy = Column(Float, unique=False, default=0) 68 | sexpnonbuggy = Column(Float, unique=False, default=0) 69 | sexp_sig = Column(Float) 70 | 71 | def __init__(self, metricDict): 72 | """ 73 | __init__(): Dictonary -> NoneType 74 | """ 75 | self.__dict__.update(metricDict) 76 | 77 | def __repr__(self): 78 | return "" % (self.repo) 79 | -------------------------------------------------------------------------------- /ingester/localrepository.py: -------------------------------------------------------------------------------- 1 | """ 2 | file: localrepository.py 3 | author: Ben Grawi 4 | date: October 2013 5 | description: Holds the repository abstraction class 6 | """ 7 | from ingester.git import * 8 | from orm.commit import * 9 | from datetime import datetime 10 | import os 11 | import logging 12 | 13 | class LocalRepository(): 14 | """ 15 | Repository(): 16 | description: Abstracts the actions done on a repository 17 | """ 18 | repo = None 19 | adapter 
= None 20 | start_date = None 21 | def __init__(self, repo): 22 | """ 23 | __init__(path): String -> NoneType 24 | description: Abstracts the actions done on a repository 25 | """ 26 | self.repo = repo 27 | 28 | # Temporary until other Repo types are added 29 | self.adapter = Git 30 | 31 | self.commits = {} 32 | 33 | def sync(self): 34 | """ 35 | sync(): 36 | description: Simply wraps the syncing functions together 37 | """ 38 | 39 | # TODO: Error checking. 40 | firstSync = self.syncRepoFiles() 41 | self.syncCommits(firstSync) 42 | 43 | # Set the date AFTER it has been ingested and synced. 44 | self.repo.ingestion_date = self.start_date 45 | 46 | def syncRepoFiles(self): 47 | """ 48 | syncRepoFiles() -> Boolean 49 | description: Downloads the current repo locally, and sets the path and 50 | ingestion date accordingly 51 | returns: Boolean - if this is the first sync 52 | """ 53 | # Cache the start date to set later 54 | self.start_date = str(datetime.now().replace(microsecond=0)) 55 | 56 | path = os.path.dirname(__file__) + self.adapter.REPO_DIRECTORY + self.repo.id 57 | # See if repo has already been downloaded; if it has, pull, if not, clone 58 | if os.path.isdir(path): 59 | self.adapter.pull(self.adapter, self.repo) 60 | firstSync = False 61 | else: 62 | self.adapter.clone(self.adapter, self.repo) 63 | firstSync = True 64 | 65 | return firstSync 66 | 67 | def syncCommits(self, firstSync): 68 | """ 69 | syncCommits(): 70 | description: Makes each commit dictionary into an object and then 71 | inserts them into the database 72 | arguments: firstSync Boolean: whether to sync all commits or only those after the 73 | ingestion date 74 | """ 75 | commits = self.adapter.log(self.adapter, self.repo, firstSync) 76 | commitsSession = Session() 77 | logging.info('Saving commits to the database...') 78 | for commitDict in commits: 79 | commitDict['repository_id'] = self.repo.id 80 | commitsSession.merge(Commit(commitDict)) 81 | commitsSession.commit() 82 | commitsSession.close() 83 | 
logging.info('Done saving commits to the database.') 84 | -------------------------------------------------------------------------------- /test_categorization.py: -------------------------------------------------------------------------------- 1 | from classifier.classifier import * 2 | from caslogging import logging 3 | 4 | logging.info('Test categorization... ') 5 | classifier = Classifier() 6 | 7 | # Test classification of corrective commits 8 | # fix,bug,wrong,fail,problem 9 | 10 | corrective_msg_1 = "fixed something" 11 | corrective_msg_2 = "bam, there goes a bug!" 12 | corrective_msg_3 = "x was wrong, but no more!" 13 | corrective_msg_4 = "Houston, we *had* a problem" 14 | corrective_msg_5 = "My watch is fun" 15 | corrective_msg_6 = "This is definitively NOT a you-know what!" 16 | 17 | assert(classifier.categorize(corrective_msg_1) == "Corrective") 18 | assert(classifier.categorize(corrective_msg_2) == "Corrective") 19 | assert(classifier.categorize(corrective_msg_3) == "Corrective") 20 | assert(classifier.categorize(corrective_msg_4) == "Corrective") 21 | assert(classifier.categorize(corrective_msg_5) != "Corrective") 22 | assert(classifier.categorize(corrective_msg_6) != "Corrective") 23 | 24 | # Test classification of feature additions 25 | # new,add,requirement,initial,create 26 | 27 | feature_msg_1 = "new awesome thing added to that brillinat code" 28 | feature_msg_2 = "adding some color to this mundane gui!" 29 | feature_msg_3 = "Adding requirement.." 30 | feature_msg_4 = "This is an initial commit" 31 | feature_msg_5 = "Creating a new class for x,y, AND z!" 32 | feature_msg_6 = "This is definitively NOT a you-know what!" 
33 | 34 | assert(classifier.categorize(feature_msg_1) == "Feature Addition") 35 | assert(classifier.categorize(feature_msg_2) == "Feature Addition") 36 | assert(classifier.categorize(feature_msg_3) == "Feature Addition") 37 | assert(classifier.categorize(feature_msg_4) == "Feature Addition") 38 | assert(classifier.categorize(feature_msg_5) == "Feature Addition") 39 | assert(classifier.categorize(feature_msg_6) != "Feature Addition") 40 | 41 | # Test classification of preventative commits 42 | # test,junit,coverage,assert 43 | 44 | prev_msg_1 = "testing to make sure of stuff" 45 | prev_msg_2 = "junit rocks, stay heavy!" 46 | prev_msg_3 = "coverage is now much higher" 47 | prev_msg_4 = "asserting that our code doesn't make computers 'splode" 48 | prev_msg_5 = "I am totally awesome" 49 | 50 | assert(classifier.categorize(prev_msg_1) == "Preventative") 51 | assert(classifier.categorize(prev_msg_2) == "Preventative") 52 | assert(classifier.categorize(prev_msg_3) == "Preventative") 53 | assert(classifier.categorize(prev_msg_4) == "Preventative") 54 | assert(classifier.categorize(prev_msg_5) != "Preventative") 55 | 56 | # Test that corrective classification again 57 | # fix,bug,wrong,fail,problem 58 | 59 | corrective_msg_1 = "fixed something" 60 | corrective_msg_2 = "bam, there goes a bug!" 61 | corrective_msg_3 = "x was wrong, but no more!" 62 | corrective_msg_4 = "Houston, we *had* a problem" 63 | corrective_msg_5 = "My watch is fun" 64 | corrective_msg_6 = "This is definitively NOT a you-know what!" 
65 | 66 | assert(classifier.categorize(corrective_msg_1) == "Corrective") 67 | assert(classifier.categorize(corrective_msg_2) == "Corrective") 68 | assert(classifier.categorize(corrective_msg_3) == "Corrective") 69 | assert(classifier.categorize(corrective_msg_4) == "Corrective") 70 | assert(classifier.categorize(corrective_msg_5) != "Corrective") 71 | assert(classifier.categorize(corrective_msg_6) != "Corrective") 72 | 73 | logging.info("Passed tests") -------------------------------------------------------------------------------- /analyzer/analyzer.py: -------------------------------------------------------------------------------- 1 | """ 2 | file: Analyzer.py 3 | author: Christoffer Rosen 4 | date: November 2013 5 | description: This module contains the functions for analyzing a repo with a given id. 6 | Currently only supports the GitHub Issue Tracker. 7 | """ 8 | import sys 9 | from datetime import datetime, timedelta 10 | from orm.repository import * 11 | from orm.commit import * 12 | from analyzer.bugfinder import * 13 | from analyzer.metricsgenerator import * 14 | from analyzer.githubissuetracker import * 15 | from caslogging import logging 16 | from analyzer.notifier import * 17 | from config import config 18 | from analyzer.git_commit_linker import * 19 | from sqlalchemy import Date, cast 20 | 21 | def analyze(repo_id): 22 | """ 23 | Analyze the repository with the given id. Gets the repository from the repository table 24 | and starts ingesting using the analyzeRepo method. 
25 | @param repo_id The repository id to analyze 26 | """ 27 | session = Session() 28 | 29 | repo_to_analyze = (session.query(Repository) 30 | .filter(Repository.id == repo_id) 31 | .all() 32 | ) 33 | 34 | # Verify that repo exists 35 | if len(repo_to_analyze) > 0: 36 | analyzeRepo(repo_to_analyze[0], session) 37 | else: 38 | logging.info('Repo with id ' + repo_id + ' not found!') 39 | 40 | session.close() 41 | 42 | def analyzeRepo(repository_to_analyze, session): 43 | """ 44 | Analyzes the given repository 45 | @param repository_to_analyze The repository to analyze. 46 | @param session SQLAlchemy session 47 | @private 48 | """ 49 | repo_name = repository_to_analyze.name 50 | repo_id = repository_to_analyze.id 51 | last_analysis_date = repository_to_analyze.analysis_date 52 | 53 | # Update status of repo to show it is analyzing 54 | repository_to_analyze.status = "Analyzing" 55 | session.commit() 56 | 57 | logging.info('Worker analyzing repository id ' + repo_id) 58 | 59 | # all commits in descending order 60 | all_commits = (session.query(Commit) 61 | .filter( Commit.repository_id == repo_id) 62 | .order_by( Commit.author_date_unix_timestamp.desc()) 63 | .all() 64 | ) 65 | 66 | # corrective commits in ascending order 67 | # if updating, only get the corrective commits that have not been linked yet. 68 | # No need to re-link corrective commits that have already been linked with the bug-inducing commit.
69 | corrective_commits = (session.query(Commit) 70 | .filter( 71 | ( Commit.fix == "True" ) & 72 | ( Commit.repository_id == repo_id ) & 73 | ( Commit.linked == False ) 74 | ) 75 | .order_by( Commit.author_date_unix_timestamp.asc() ) 76 | .all() 77 | ) 78 | 79 | logging.info("Linking " + str(len(corrective_commits)) + " new corrective commits for repo " + repo_id) 80 | 81 | try: 82 | git_commit_linker = GitCommitLinker(repo_id) 83 | git_commit_linker.linkCorrectiveCommits(corrective_commits, all_commits) 84 | except Exception as e: 85 | logging.exception("Got an exception linking bug fixing changes to bug inducing changes for repo " + repo_id) 86 | repository_to_analyze.status = "Error" 87 | session.commit() # update repo status 88 | raise 89 | 90 | # Signify to CAS Manager that this repo is ready to have its model built 91 | if repository_to_analyze.status != "Error": 92 | repository_to_analyze.status = "In Queue to Build Model" 93 | session.commit() # update repo status 94 | -------------------------------------------------------------------------------- /analyzer/bugfinder.py: -------------------------------------------------------------------------------- 1 | """ 2 | file: bugfinder.py 3 | author: Christoffer Rosen 4 | date: November 2013 5 | description: Links changes that introduce bugs by identifying changes 6 | that fix problems. 7 | """ 8 | 9 | import re 10 | from orm.commit import * 11 | from caslogging import logging 12 | from analyzer.git_commit_linker import * 13 | 14 | class BugFinder: 15 | """ 16 | BugFinder(): 17 | description: Links changes that introduce bugs. 18 | """ 19 | 20 | def __init__(self, allCommits, correctiveCommits, issueTracker): 21 | """ 22 | Constructor 23 | 24 | @param allCommits: All commits in ascending order by date 25 | @param correctiveCommits: All commits/changes which are identified 26 | as fixing problems.
27 | @param issueTracker: Issue tracker (e.g., GitHub Issues) 28 | """ 29 | self.allCommits = allCommits 30 | self.correctiveCommits = correctiveCommits 31 | self.issueTracker = issueTracker 32 | 33 | def findIssueOpened(self, correctiveCommit): 34 | """ 35 | findIssueOpened() 36 | If the corrective change/commit links to an issue in the issue tracker, returns 37 | the date of the oldest open issue found; otherwise returns None 38 | """ 39 | issue_opened = None 40 | 41 | if self.issueTracker is None or not hasattr(self.issueTracker, "getDateOpened"): 42 | return None 43 | 44 | idMatch = re.compile(r'#[\d]+') 45 | issue_ids = idMatch.findall(correctiveCommit.commit_message) 46 | issue_ids = [issue_id.strip('#') for issue_id in issue_ids] # Remove the '#' from ids 47 | 48 | if len(issue_ids) > 0: 49 | issue_opened = self.issueTracker.getDateOpened(issue_ids[0]) 50 | # Use the oldest open bug 51 | for issue_id in issue_ids: 52 | logging.info('Searching for issue id: ' + issue_id) 53 | curr_issue_opened = self.issueTracker.getDateOpened(issue_id) 54 | 55 | # Verify that an issue was found; guard against the first lookup returning None. 56 | if curr_issue_opened is not None: 57 | if issue_opened is None or int(curr_issue_opened) < int(issue_opened): 58 | issue_opened = curr_issue_opened 59 | 60 | return issue_opened 61 | 62 | def searchForBuggyCommit(self, correctiveCommit): 63 | """ 64 | Finds the buggy commit based on the bug fixing commit 65 | Helper method for markBuggyCommits. If the commit links to an 66 | issue tracker, we check files changed prior to that date. 67 | Otherwise, we only check dates prior to the fix.
68 | 69 | @param correctiveCommit: the bug fixing commit 70 | """ 71 | bug_introduced_prior = correctiveCommit.author_date_unix_timestamp 72 | issue_opened = self.findIssueOpened(correctiveCommit) 73 | 74 | if issue_opened is not None: 75 | bug_introduced_prior = issue_opened 76 | 77 | correctiveFiles = correctiveCommit.fileschanged.split(",CAS_DELIMITER,") 78 | 79 | for commit in self.allCommits: 80 | 81 | if int(commit.author_date_unix_timestamp) < int(bug_introduced_prior): 82 | commitFiles = commit.fileschanged.split(",CAS_DELIMITER,") 83 | 84 | for commitFile in commitFiles: 85 | 86 | # This introduced the bug! 87 | if commitFile in correctiveFiles: 88 | return commit 89 | 90 | return -1 # Not found 91 | 92 | def markBuggyCommits(self): 93 | """ 94 | Finds bug inducing commits based on those that are 95 | bug fixing. It checks commits prior to the fix and marks a commit 96 | as bug inducing if it changed a file that the bug fixing 97 | commit also changed 98 | """ 99 | 100 | for correctiveCommit in self.correctiveCommits: 101 | buggyCommit = self.searchForBuggyCommit(correctiveCommit) 102 | if buggyCommit != -1: 103 | buggyCommit.contains_bug = True 104 | #else: 105 | #print("Could not find the bug inducing commit for: " + 106 | # correctiveCommit.commit_message) 107 | -------------------------------------------------------------------------------- /analyzer/githubissuetracker.py: -------------------------------------------------------------------------------- 1 | """ 2 | file: githubissuetracker.py 3 | author: Christoffer Rosen 4 | date: December, 2013 5 | description: Represents a Github Issue tracker object used 6 | for getting the dates issues were opened.
7 | 8 | 12/12/13: Doesn't currently support private repos 9 | """ 10 | 11 | import requests, json, dateutil.parser, time 12 | from caslogging import logging 13 | from config import * 14 | 15 | class GithubIssueTracker: 16 | """ 17 | GithubIssueTracker() 18 | Represents a Github Issue Tracker Object 19 | """ 20 | 21 | owner = None # Owner of the github repo 22 | repo = None # The repo name 23 | request_repos = "https://api.github.com/repos" # Request url to get issue info 24 | request_auth = "https://api.github.com/authorizations" # Request url for auth 25 | 26 | def __init__(self, owner, repo): 27 | """ 28 | Constructor 29 | """ 30 | self.owner = owner 31 | self.repo = repo 32 | self.auth_token = None 33 | self.authenticate() # Authenticate our app 34 | 35 | def authenticate(self): 36 | """ 37 | authenticate() 38 | Authenticates this application to github using 39 | the cas-user git user credentials. This is hopefully temporary! 40 | """ 41 | 42 | s = requests.Session() 43 | username = config["github"]["user"] 44 | password = config["github"]["pass"] 45 | s.auth = (username, password) 46 | payload = {"scopes": ["repo"]} 47 | r = s.get(self.request_auth, params=payload) 48 | 49 | if r.headers.get('x-ratelimit-remaining') == '0': 50 | logging.info("Github quota limit hit -- waiting") 51 | 52 | # Wait up to an hour until we can continue..
53 | while r.headers.get('x-ratelimit-remaining') == '0': 54 | time.sleep(600) # Wait 10 minutes and try again 55 | r = s.get(self.request_auth, params=payload) 56 | 57 | data = r.json() 58 | 59 | if r.status_code >= 400: 60 | # Error responses are a dict with a 'message' field 61 | msg = data.get('message') 62 | logging.error("Failed to authenticate issue tracker: \n" + msg) 63 | return # Exit 64 | else: 65 | # A successful response is a list of authorizations 66 | self.auth_token = data[0].get("token") 67 | requests_left = r.headers.get('x-ratelimit-remaining') 68 | logging.info("Analyzer has " + requests_left + " issue tracker calls left this hour") 69 | 70 | def getDateOpened(self, issueNumber): 71 | """ 72 | getDateOpened() 73 | Gets the date the issue number was opened in unix time 74 | If the issue cannot be found for whatever reason, returns None. 75 | """ 76 | logging.info("searching for issue: " + str(issueNumber)) 77 | logging.info(self.request_repos + "/" + self.owner + "/" + 78 | self.repo + "/issues/" + issueNumber) 79 | 80 | header = {'Authorization': 'token ' + self.auth_token} 81 | r = requests.get(self.request_repos + "/" + self.owner + "/" + 82 | self.repo + "/issues/" + issueNumber, headers=header) 83 | 84 | data = r.json() 85 | 86 | # If forbidden 87 | if r.status_code == 403: 88 | 89 | # Check the api quota 90 | if r.headers.get('x-ratelimit-remaining') == '0': 91 | logging.info("Github quota limit hit -- waiting") 92 | 93 | # Wait up to an hour until we can continue..
94 | while r.headers.get('x-ratelimit-remaining') == '0': 95 | time.sleep(600) # Wait 10 minutes and try again 96 | r = requests.get(self.request_repos + "/" + self.owner + "/" + 97 | self.repo + "/issues/" + issueNumber, headers=header) 98 | data = r.json() 99 | # Quota restored -- parse the recovered response instead of falling out with None 100 | try: 101 | return (dateutil.parser.parse(data.get('created_at'))).timestamp() 102 | except Exception: 103 | return None 104 | 105 | # Check for other error codes 106 | elif r.status_code >= 400: 107 | msg = data.get('message') 108 | logging.info("issue not found: " + str(msg)) 109 | return None 110 | else: 111 | try: 112 | date = (dateutil.parser.parse(data.get('created_at'))).timestamp() 113 | return date 114 | except Exception: 115 | logging.error("ISSUE TRACKER FAILURE: Could not get created_at from github issues API") 116 | return None 117 | -------------------------------------------------------------------------------- /analyzer/medianmodel.py: -------------------------------------------------------------------------------- 1 | import rpy2.robjects as robjects # R integration to python 2 | from analyzer.repositorymetrics import * # metrics abstraction; holds all metric values for commits 3 | from db import * # postgresql db information 4 | from orm.metrics import * # orm metrics table 5 | from caslogging import logging 6 | import json # to parse the metrics dictionary string 7 | class MedianModel: 8 | """ 9 | Builds the median model, which saves to the metrics table the 10 | median values for each buggy and non-buggy metric for a specific 11 | repository 12 | """ 13 | 14 | def __init__(self, metrics, repo_id): 15 | """ 16 | constructor 17 | @metrics : object holding all metric values for the repository the 18 | model is being built for. 19 | """ 20 | self.metrics = metrics 21 | self.repo_id = repo_id 22 | 23 | # A p-value for the wilcox test 24 | self.psig = 0.05 25 | 26 | # R functions to be used 27 | self.medianFn = robjects.r['median'] 28 | self.wilcoxFn = robjects.r['wilcox.test'] 29 | 30 | def buildModel(self): 31 | """ 32 | builds the model 33 | """ 34 | self.calculateMedians() 35 | 36 | def getMedian(self, metric): 37 | """ 38 | Helper function for the method calculateMedians.
39 | Takes in a metric and returns a string property of the results 40 | @private 41 | """ 42 | median_props = "" 43 | 44 | try: 45 | # R functions to be used 46 | medianFn = robjects.r['median'] 47 | wilcoxFn = robjects.r['wilcox.test'] 48 | 49 | metric_buggy = getattr(self.metrics, metric + "_buggy") 50 | metric_nonbuggy = getattr(self.metrics, metric + "_nonbuggy") 51 | 52 | # Check the p-value; the _sig flag records whether the difference is significant 53 | pvalue = self.wilcoxFn(robjects.FloatVector(metric_buggy), robjects.FloatVector(metric_nonbuggy))[2][0] 54 | buggy_median = self.medianFn(robjects.FloatVector(metric_buggy)) 55 | nonbuggy_median = self.medianFn(robjects.FloatVector(metric_nonbuggy)) 56 | median_props += '"' + metric + 'buggy":"' + str(buggy_median[0]) + '", ' 57 | median_props += '"' + metric + 'nonbuggy":"' + str(nonbuggy_median[0]) + '", ' 58 | 59 | if pvalue <= self.psig: 60 | median_props += '"' + metric + '_sig":"1", ' 61 | else: 62 | median_props += '"' + metric + '_sig":"0", ' 63 | 64 | except Exception: 65 | # catch the case where we have no observations for this metric 66 | logging.info("Metric " + metric + " could not be used in the median model for repo " + self.repo_id) 67 | 68 | return median_props 69 | 70 | 71 | def calculateMedians(self): 72 | """ 73 | Using R through the rpy2 module, generate the medians of each metric's 74 | buggy and non-buggy lists, and record in the metric's _sig flag whether the 75 | difference passes the wilcox test (statistically significant).
76 | @private 77 | """ 78 | 79 | # metricObject represents the metrics as a JSON dictionary string 80 | metricObject = '"repo":"' + self.repo_id + '", ' 81 | 82 | metricObject += self.getMedian("ns") 83 | metricObject += self.getMedian("nd") 84 | metricObject += self.getMedian("nf") 85 | metricObject += self.getMedian("entrophy") 86 | metricObject += self.getMedian("la") 87 | metricObject += self.getMedian("ld") 88 | metricObject += self.getMedian("lt") 89 | metricObject += self.getMedian("ndev") 90 | metricObject += self.getMedian("age") 91 | metricObject += self.getMedian("nuc") 92 | metricObject += self.getMedian("exp") 93 | metricObject += self.getMedian("rexp") 94 | metricObject += self.getMedian("sexp") 95 | 96 | # Remove trailing comma 97 | metricObject = metricObject[:-2] 98 | 99 | # Put into the metrics table 100 | metricsSession = Session() 101 | metrics = Metrics(json.loads('{' + metricObject + '}')) 102 | 103 | # Copy state of metrics object to db 104 | metricsSession.merge(metrics) 105 | 106 | # Write the metrics changes to the database 107 | metricsSession.commit() 108 | metricsSession.close() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | CAS_CodeRepoAnalyzer 2 | ==================== 3 | 4 | Ingests and analyzes code repositories 5 | 6 | ##Installation 7 | 1. Clone this repository into an empty directory 8 | 2. Copy the `./config.example.json` to `./config.json` and change 9 | the configurations. All fields are required. 10 | 11 | Db: information relating to your postgresql database setup 12 | logging: information about how to write logging information 13 | gmail: gmail account to be used to send cas notifications 14 | repoUpdates: how often repositories should be updated for new commits 15 | system: how many worker threads the cas system can use to analyze and ingest repos.
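As a sketch only, a filled-in `config.json` might look like the following. The `github`, `repoUpdates`, `system`, `glm_modeling`, and `data_dumps` keys mirror what the code actually reads (`config['github']['user']`, `config['repoUpdates']['freqInDays']`, `config['system']['workers']`, `config['glm_modeling']['months']`, `config['data_dumps']['location']`); the inner fields of `Db`, `logging`, and `gmail` here are illustrative guesses, so treat `config.example.json` as the authoritative template.

```json
{
  "Db": { "host": "localhost", "port": 5432, "database": "cas", "user": "cas", "pass": "secret" },
  "logging": { "filename": "cas.log", "level": "INFO" },
  "gmail": { "user": "cas.notifier@gmail.com", "pass": "secret" },
  "github": { "user": "cas-user", "pass": "secret" },
  "repoUpdates": { "freqInDays": 7 },
  "system": { "workers": 4 },
  "glm_modeling": { "months": 3 },
  "data_dumps": { "location": "" }
}
```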
16 | 17 | ###Dependencies 18 | Additional instructions are available in SETUP.md 19 | * Python >= 3.3 20 | * Pip for Python Version > 3.3 21 | * Git > 1.7 22 | * R 23 | * python-dev 24 | * rpy2 25 | * requests 26 | * dateutil 27 | * sqlalchemy 28 | * py-postgresql 29 | * GNU grep 30 | * MonthDelta 31 | 32 | ###Setting up python3.3 virtual env on Ubuntu 33 | * Assumes you are working on Ubuntu 12.04 34 | 35 | Install python3.3 using the deadsnakes PPA: 36 | 37 | ``` 38 | sudo apt-get install python-software-properties 39 | sudo add-apt-repository ppa:fkrull/deadsnakes 40 | sudo apt-get update 41 | sudo apt-get install python3.3 42 | ``` 43 | 44 | The version of virtualenv (1.7.1.2) that comes with Ubuntu 12.04 is not compatible with python3.3. 45 | Therefore, we must install a new version so that we can set up a working virtual environment. First, 46 | you must uninstall the current python-virtualenv: 47 | 48 | ``` 49 | sudo apt-get remove python-virtualenv 50 | ``` 51 | 52 | Next, install the latest easy_install: 53 | 54 | ``` 55 | wget http://peak.telecommunity.com/dist/ez_setup.py 56 | sudo python ez_setup.py 57 | ``` 58 | 59 | Next, install pip and the virtualenv: 60 | 61 | ``` 62 | sudo easy_install pip 63 | sudo pip install virtualenv 64 | virtualenv --no-site-packages --distribute -p /usr/bin/python3.3 ~/.virtualenvs/pywork3 65 | ``` 66 | 67 | By default, the python3 development headers are not available on Ubuntu after setting up a new 68 | virtual environment, and they are a dependency of rpy2.
Install this with apt-get: 69 | 70 | ``` 71 | sudo apt-get install python3.3-dev 72 | ``` 73 | 74 | Now, we are finally ready to set up our virtual environment: 75 | 76 | ``` 77 | virtualenv -p /usr/bin/python3.3 /path/to/new/virtual/environment 78 | ``` 79 | 80 | To activate the virtual env: 81 | 82 | ``` 83 | source /path/to/new/virtual/environment/bin/activate 84 | ``` 85 | 86 | Type `deactivate` to exit the virtual env 87 | 88 | ###Installing rpy2 89 | * Assumes you are working on Ubuntu 12.04 and python 3.3 90 | 91 | Getting rpy2 to work can be a bit tricky. First, make sure R is installed. To do this, first 92 | get the repository SSL key and import it to apt by doing 93 | 94 | ``` 95 | gpg --keyserver keyserver.ubuntu.com --recv-key E084DAB9 96 | gpg -a --export E084DAB9 | sudo apt-key add - 97 | ``` 98 | 99 | Then, edit the list of sources `gksudo gedit /etc/apt/sources.list` and add the following repo at the bottom: `deb http://cran.ma.imperial.ac.uk/bin/linux/ubuntu precise/` 100 | 101 | Finally, we can install R by running the following commands: 102 | 103 | ``` 104 | sudo apt-get update 105 | sudo apt-get install r-base 106 | ``` 107 | 108 | Now we are ready to install rpy2. Make sure python version 3 or greater is in use (3.2 is not compatible, however), such as by using a virtualenv, and run 109 | 110 | ``` 111 | pip install rpy2 112 | ``` 113 | 114 | ###Additional Pip Packages 115 | Install the following packages by running `pip install` followed by the package 116 | name. Make sure you are using python3, for example via a virtualenv on Ubuntu.
117 | 118 | * SQL Alchemy (sqlalchemy) 119 | * Py-PostgreSQL (py-postgresql) 120 | * requests (requests) 121 | * python-dateutil (python-dateutil) 122 | 123 | To install the MonthDelta package, simply do: `pip install http://pypi.python.org/packages/source/M/MonthDelta/MonthDelta-1.0b.tar.bz2` 124 | 125 | ###First-Time Database Setup 126 | Set up the database for the first time by running `python script.py initDb` 127 | 128 | ##Usage 129 | In a terminal, type `nohup python script.py &` to start the code repo analyzer and run it in the background. 130 | -------------------------------------------------------------------------------- /analyzer/metricsgenerator.py: -------------------------------------------------------------------------------- 1 | """ 2 | file: metricsgenerator.py 3 | author: Christoffer Rosen 4 | date: November, 2013 5 | description: Generates the metrics (medians) for each metric for the 6 | non-buggy and buggy commits and outputs them into the metrics table 7 | """ 8 | 9 | from analyzer.repositorymetrics import * # metrics abstraction; holds all metric values for commits 10 | from analyzer.medianmodel import * # builds the median model 11 | from analyzer.linear_reg_model import * 12 | from orm.commit import * 13 | import json, os, csv # os and csv are used by dumpData 14 | from config import config # dataset dump location 15 | class MetricsGenerator: 16 | """ 17 | MetricsGenerator() 18 | Generate the metrics for buggy & non-buggy commits 19 | """ 20 | 21 | def __init__(self, repo_id, trainingData, testData): 22 | """ 23 | Constructor 24 | @repo_id : repository id 25 | @trainingData : all commits that we are training the models on 26 | @testData : all commits that we are testing the models on (i.e.
glm model) 27 | """ 28 | self.repo_id = repo_id 29 | self.trainingData = trainingData 30 | self.testData = testData 31 | 32 | # metrics 33 | self.metrics = RepositoryMetrics() 34 | 35 | def buildAllModels(self): 36 | """ 37 | builds all models and stores them in the metrics table 38 | """ 39 | self.fetchAllMetrics() # first get all metrics 40 | 41 | # Only use training data b/c if new bugs are introduced in newer commits, 42 | # then we do not know about it and therefore new data is unreliable. 43 | median_model = MedianModel(self.metrics, self.repo_id) 44 | linear_reg_model = LinearRegressionModel(self.metrics, self.repo_id, self.testData) 45 | 46 | median_model.buildModel() # build the median model 47 | linear_reg_model.buildModel() # build the linear regression model & calculate the riskiness of each commit 48 | 49 | def dumpData(self, commits): 50 | """ 51 | dumps all commit data into the monthly dataset folder. 52 | datasets are named after the repository id 53 | """ 54 | # to write dataset file in this directory (git ignored!) 55 | current_dir = os.path.dirname(__file__) 56 | 57 | if config['data_dumps']['location'] not in (None, ""): 58 | dir_of_datasets = config['data_dumps']['location'] 59 | else: 60 | dir_of_datasets = current_dir + "/datasets/monthly/" 61 | 62 | with open(dir_of_datasets + self.repo_id + ".csv", "w") as file: 63 | csv_writer = csv.writer(file, dialect="excel") 64 | columns = Commit.__table__.columns.keys() 65 | 66 | # write the columns 67 | csv_writer.writerow(columns) 68 | 69 | # dump all commit data 70 | for commit in commits: 71 | commit_data = [] 72 | for col in columns: 73 | commit_data.append(getattr(commit, col)) 74 | csv_writer.writerow(commit_data) 75 | 76 | def fetchAllMetrics(self): 77 | """ 78 | fetchAllMetrics() 79 | Iterate through each commit, storing each individual commit's metrics in the metrics object, 80 | which holds all metrics information necessary to build models.
81 | @private 82 | """ 83 | for commit in self.trainingData: 84 | 85 | # Exclude merge commits where no lines of code were changed 86 | if commit.classification == "Merge" and commit.la == 0 and commit.ld == 0: 87 | continue 88 | 89 | else: 90 | 91 | if commit.contains_bug == True: 92 | self.metrics.ns_buggy.append(commit.ns) 93 | self.metrics.nd_buggy.append(commit.nd) 94 | self.metrics.nf_buggy.append(commit.nf) 95 | self.metrics.entrophy_buggy.append(commit.entrophy) 96 | self.metrics.la_buggy.append(commit.la) 97 | self.metrics.ld_buggy.append(commit.ld) 98 | self.metrics.lt_buggy.append(commit.lt) 99 | self.metrics.ndev_buggy.append(commit.ndev) 100 | self.metrics.age_buggy.append(commit.age) 101 | self.metrics.nuc_buggy.append(commit.nuc) 102 | self.metrics.exp_buggy.append(commit.exp) 103 | self.metrics.rexp_buggy.append(commit.rexp) 104 | self.metrics.sexp_buggy.append(commit.sexp) 105 | self.metrics.num_buggy += 1 106 | 107 | else: 108 | self.metrics.ns_nonbuggy.append(commit.ns) 109 | self.metrics.nd_nonbuggy.append(commit.nd) 110 | self.metrics.nf_nonbuggy.append(commit.nf) 111 | self.metrics.entrophy_nonbuggy.append(commit.entrophy) 112 | self.metrics.la_nonbuggy.append(commit.la) 113 | self.metrics.ld_nonbuggy.append(commit.ld) 114 | self.metrics.lt_nonbuggy.append(commit.lt) 115 | self.metrics.ndev_nonbuggy.append(commit.ndev) 116 | self.metrics.age_nonbuggy.append(commit.age) 117 | self.metrics.nuc_nonbuggy.append(commit.nuc) 118 | self.metrics.exp_nonbuggy.append(commit.exp) 119 | self.metrics.rexp_nonbuggy.append(commit.rexp) 120 | self.metrics.sexp_nonbuggy.append(commit.sexp) 121 | self.metrics.num_nonbuggy += 1 122 | -------------------------------------------------------------------------------- /cas_manager.py: -------------------------------------------------------------------------------- 1 | """ 2 | file: cas_manager.py 3 | authors: Christoffer Rosen 4 | date: Jan.
2014 5 | description: This module contains the CAS_Manager class, which is a thread that continuously checks if there 6 | is work that needs to be done. Also contains the supporting classes Worker and ThreadPool used by 7 | the CAS_Manager. 8 | """ 9 | from analyzer.analyzer import * 10 | from ingester.ingester import * 11 | from orm.repository import * 12 | import calendar # to convert datetime to unix time 13 | from caslogging import logging 14 | from queue import * 15 | import threading 16 | import time 17 | from monthdelta import MonthDelta 18 | 19 | class CAS_Manager(threading.Thread): 20 | """ 21 | Thread that continuously checks if there is work to be done and adds it to 22 | the thread pool work queue 23 | """ 24 | 25 | def __init__(self): 26 | """Constructor""" 27 | threading.Thread.__init__(self) 28 | numOfWorkers = int(config['system']['workers']) 29 | self.workQueue = ThreadPool(numOfWorkers) 30 | self.modelQueue = Queue() 31 | 32 | def checkIngestion(self): 33 | """Check if any repo needs to be ingested""" 34 | 35 | session = Session() 36 | repo_update_freq = int(config['repoUpdates']['freqInDays']) 37 | refresh_date = str(datetime.utcnow() - timedelta(days=repo_update_freq)) 38 | 39 | repos_to_get = (session.query(Repository) 40 | .filter( 41 | (Repository.status == "Waiting to be Ingested") | 42 | ((Repository.ingestion_date < refresh_date) & 43 | (Repository.status != "Error") & 44 | (Repository.status != "Analyzing"))) 45 | .all()) 46 | 47 | for repo in repos_to_get: 48 | logging.info("Adding repo " + repo.id + " to work queue for ingesting") 49 | repo.status = "In Queue to be Ingested" 50 | session.commit() # update the status of repo 51 | self.workQueue.add_task(ingest, repo.id) 52 | 53 | session.close() 54 | 55 | def checkAnalyzation(self): 56 | """Checks if any repo needs to be analyzed""" 57 | 58 | session = Session() 59 | repo_update_freq = int(config['repoUpdates']['freqInDays']) 60 | refresh_date = str(datetime.utcnow() -
timedelta(days=repo_update_freq)) 61 | 62 | repos_to_get = (session.query(Repository) 63 | .filter( (Repository.status == "Waiting to be Analyzed") ) 64 | .all() 65 | ) 66 | 67 | for repo in repos_to_get: 68 | logging.info("Adding repo " + repo.id + " to work queue for analyzing.") 69 | repo.status = "In Queue to be Analyzed" 70 | session.commit() # update the status of repo 71 | self.workQueue.add_task(analyze, repo.id) 72 | 73 | session.close() 74 | 75 | def checkModel(self): 76 | """Check if any repo needs metrics to be generated""" 77 | 78 | session = Session() 79 | repos_to_get = (session.query(Repository) 80 | .filter( 81 | (Repository.status == "In Queue to Build Model") ) 82 | .all()) 83 | 84 | for repo in repos_to_get: 85 | logging.info("Adding repo " + repo.id + " to model queue to finish analyzing") 86 | repo.status = "Building Model" 87 | session.commit() # update status of repo 88 | self.modelQueue.put(repo.id) 89 | 90 | session.close() 91 | 92 | def checkBuildModel(self): 93 | """ Checks if any repo is waiting for a model to be built. 94 | We are using a queue because we can't concurrently access R """ 95 | 96 | session = Session() 97 | 98 | if not self.modelQueue.empty(): 99 | repo_id = self.modelQueue.get() 100 | repo = (session.query(Repository).filter(Repository.id == repo_id).first()) 101 | 102 | # train only on data older than X months; we won't have sufficient data to build models 103 | # from recent months, as there may be bugs introduced in those months that haven't been fixed, skewing 104 | # our model.
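checkBuildModel (below) splits a repository's history at `now - X months`: commits older than the cutoff become training data, the rest testing data. A standalone sketch of that split, using a hypothetical `split_commits` helper and approximating months as 30-day blocks (the real code uses the third-party MonthDelta type):

```python
import calendar
from datetime import datetime, timedelta

def split_commits(commits, months, now=None):
    """Partition (commit_hash, unix_ts) pairs into training/testing sets.

    Hypothetical helper mirroring the two queries in checkBuildModel;
    `months` is approximated as 30-day blocks here.
    """
    now = now or datetime.utcnow()
    cutoff = calendar.timegm((now - timedelta(days=30 * months)).utctimetuple())
    training = [c for c in commits if c[1] < cutoff]   # older than cutoff
    testing = [c for c in commits if c[1] >= cutoff]   # cutoff or newer
    return training, testing
```

As in the real queries, each commit falls in exactly one of the two sets, so training and testing never overlap.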
105 | glm_model_time = int(config['glm_modeling']['months']) 106 | data_months_datetime = datetime.utcnow() - MonthDelta(glm_model_time) 107 | data_months_unixtime = calendar.timegm(data_months_datetime.utctimetuple()) 108 | 109 | # all commits for repo prior to current time - glm model time 110 | training_commits = (session.query(Commit) 111 | .filter( 112 | ( Commit.repository_id == repo_id ) & 113 | ( Commit.author_date_unix_timestamp < str(data_months_unixtime)) 114 | ) 115 | .order_by( Commit.author_date_unix_timestamp.desc() ) 116 | .all()) 117 | 118 | # all commits for repo after or on current time - glm model time 119 | testing_commits = (session.query(Commit) 120 | .filter( 121 | ( Commit.repository_id == repo_id ) & 122 | ( Commit.author_date_unix_timestamp >= str(data_months_unixtime))) 123 | .all()) 124 | 125 | try: 126 | metrics_generator = MetricsGenerator(repo_id, training_commits, testing_commits) 127 | metrics_generator.buildAllModels() 128 | 129 | # monthly data dump - or rather, every 30 days.
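The `last_data_dump < dump_refresh_date` check below compares timestamps as strings. That works because zero-padded `YYYY-MM-DD HH:MM:SS` strings sort lexicographically in chronological order. A small demonstration with a fixed clock:

```python
from datetime import datetime, timedelta

# Zero-padded "YYYY-MM-DD HH:MM:SS" strings sort lexicographically in
# chronological order, which is what the comparison against
# last_data_dump relies on.
now = datetime(2014, 2, 15, 12, 0, 0)                # fixed clock for the demo
dump_refresh_date = str(now - timedelta(days=30))    # '2014-01-16 12:00:00'
old_dump = str(datetime(2014, 1, 1, 9, 30, 0))       # dumped over 30 days ago
recent_dump = str(now - timedelta(days=5))           # dumped 5 days ago

assert old_dump < dump_refresh_date                  # triggers a new dump
assert not recent_dump < dump_refresh_date           # no dump needed
```

This holds only while every stored value uses the same zero-padded format; mixing formats would break the ordering silently.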
130 | dump_refresh_date = str(datetime.utcnow() - timedelta(days=30)) 131 | if repo.last_data_dump == None or repo.last_data_dump < dump_refresh_date: 132 | logging.info("Generating a monthly data dump for repository: " + repo_id) 133 | 134 | # Get all commits for the repository 135 | all_commits = (session.query(Commit) 136 | .filter( 137 | ( Commit.repository_id == repo_id ) 138 | ) 139 | .order_by( Commit.author_date_unix_timestamp.desc() ) 140 | .all()) 141 | 142 | metrics_generator.dumpData(all_commits) 143 | repo.last_data_dump = str(datetime.now().replace(microsecond=0)) 144 | 145 | # Notify user if repo has never been analyzed previously 146 | if repo.analysis_date is None: 147 | self.notify(repo) 148 | 149 | logging.info("Repo " + repo_id + " finished analyzing.") 150 | repo.analysis_date = str(datetime.now().replace(microsecond=0)) 151 | repo.status = "Analyzed" 152 | session.commit() # update status of repo 153 | session.close() 154 | 155 | # uh-oh 156 | except Exception as e: 157 | logging.exception("Got an exception building model for repository " + repo_id) 158 | 159 | repo.status = "Error" 160 | session.commit() # update repo status 161 | session.close() 162 | 163 | def notify(self, repo): 164 | """ Send e-mail notifications if applicable to a repo 165 | used by checkBuildModel """ 166 | 167 | notify = False 168 | notifier = None 169 | logging.info("Notifying subscribed users for repository " + repo.id) 170 | 171 | # Create the Notifier 172 | gmail_user = config['gmail']['user'] 173 | gmail_pass = config['gmail']['pass'] 174 | notifier = Notifier(gmail_user, gmail_pass, repo.name) 175 | 176 | # Add subscribers if applicable 177 | if repo.email is not None: 178 | notifier.addSubscribers([repo.email, gmail_user]) 179 | else: 180 | notifier.addSubscribers([gmail_user]) 181 | 182 | notifier.notify() 183 | 184 | def run(self): 185 | 186 | while(True): 187 | ### --- Check repository table if there is any work to be done --- ### 188 | self.checkIngestion() 
189 | self.checkAnalyzation() 190 | self.checkModel() 191 | self.checkBuildModel() 192 | time.sleep(10) 193 | 194 | class Worker(threading.Thread): 195 | """Thread executing tasks from a given tasks queue""" 196 | def __init__(self, tasks): 197 | threading.Thread.__init__(self) 198 | self.tasks = tasks 199 | self.daemon = True 200 | self.start() 201 | 202 | def run(self): 203 | 204 | while True: 205 | 206 | func, args, kargs = self.tasks.get() 207 | try: 208 | func(*args, **kargs) 209 | except Exception as e: 210 | print(e) 211 | 212 | self.tasks.task_done() 213 | 214 | class ThreadPool: 215 | """Pool of threads consuming tasks from a queue""" 216 | def __init__(self, num_threads): 217 | self.tasks = Queue(num_threads) 218 | for _ in range(num_threads): Worker(self.tasks) 219 | 220 | def add_task(self, func, *args, **kargs): 221 | """Add a task to the queue""" 222 | self.tasks.put((func, args, kargs)) 223 | 224 | def wait_completion(self): 225 | """Wait for completion of all the tasks in the queue""" 226 | self.tasks.join() 227 | -------------------------------------------------------------------------------- /analyzer/git_commit_linker.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import subprocess 4 | from orm.commit import * 5 | from caslogging import logging 6 | import json 7 | 8 | 9 | class GitCommitLinker: 10 | """ 11 | links a corrective change/commit from a git repository 12 | to a change that introduced the problem or caused the 13 | corrective commit to be made 14 | 15 | assumes that regions of modified or deleted source code 16 | in a corrective fix are the location of a bug. 17 | 18 | heavily commented as the git diff tool doesn't provide a clean way of seeing 19 | the specific lines of code modified. we are also using the git scm tool to annotate/blame to
we are also using the git scm tool to annotate/blame to 20 | track down the bug-introducing changes 21 | """ 22 | 23 | REPO_DIR = "ingester/CASRepos/git/" # locations where repo directories are stored 24 | 25 | def __init__(self, repoId): 26 | """ 27 | constructor 28 | sets the repository path. 29 | """ 30 | self.repo_path = os.path.join(os.path.dirname(__file__), '..', self.REPO_DIR + repoId) 31 | self.repo_id = repoId 32 | 33 | def linkCorrectiveCommits(self, corrective_commits, all_commits): 34 | """ 35 | links all corrective changes/commits to the change that introduced the problem 36 | note: a bug introducing change may have introduced more than one bug. 37 | """ 38 | 39 | linked_commits = {} # dict of buggy commit hash -> [corrective commits] 40 | 41 | # find all bug introducing commits 42 | for corrective_commit in corrective_commits: 43 | buggy_commits = self._linkCorrectiveCommit(corrective_commit) 44 | 45 | for buggy_commit in buggy_commits: 46 | 47 | if buggy_commit in linked_commits: 48 | linked_commits[buggy_commit].append(corrective_commit.commit_hash) 49 | else: 50 | linked_commits[buggy_commit] = [corrective_commit.commit_hash] 51 | 52 | corrective_commit.linked = True # mark that we have linked this corrective commit. 53 | 54 | for commit in all_commits: 55 | 56 | if commit.commit_hash in linked_commits: 57 | commit.contains_bug = True 58 | commit.fixes = json.dumps(linked_commits[commit.commit_hash]) 59 | 60 | 61 | def _linkCorrectiveCommit(self, commit): 62 | """ 63 | links the corrective change/commit to the change/commit which was the 64 | cause. this is the purpose of this object 65 | 66 | @commit - the corrective change to link w/ the changes that introduces the 67 | problems/issues it fixes. 
68 | """ 69 | region_chunks = self.getModifiedRegions(commit) 70 | 71 | logging.info("Linkage for commit " + commit.commit_hash) 72 | for k,v in region_chunks.items(): 73 | logging.info("-- file: " + k) 74 | logging.info("---- loc modified: " + str(v)) 75 | 76 | bug_introducing_changes = self.gitAnnotate(region_chunks, commit) 77 | return bug_introducing_changes 78 | 79 | def _getModifiedRegionsOnly(self, diff, files_modified): 80 | """ 81 | returns a dict of file -> list of line numbers modified. helper function for getModifiedRegions 82 | git diff doesn't provide a clean way of simply getting the specific lines that were modified, so we are doing so here 83 | manually. A possible refactor in the future may be to use an external diff tool, so that this implementation 84 | wouldn't be scm (git) specific. 85 | 86 | if a file was merely deleted, then there was no chunk or region changed but we do capture the file. 87 | however, we do not assume this is a location of a bug. 88 | 89 | modified means modified or deleted -- not added! We assume all modified lines of code are locations of a bug. 90 | """ 91 | region_diff = {} 92 | 93 | # only link source code files, as READMEs etc. typically have HUGE changes that reduce 94 | # performance to unacceptable levels. it's very hard to blacklist everything; much easier to just whitelist 95 | # source code file endings.
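The whitelist test that follows can be distilled into a small predicate. This is a hypothetical helper, taking the extension as the last dot-separated segment and assuming the whitelist is stored uppercase, as in code_file_extentions.txt:

```python
def is_source_file(file_name, whitelisted_exts):
    """Keep a file only if its extension (last dot-separated segment)
    appears in the uppercase extension whitelist."""
    parts = file_name.split(".")
    if len(parts) < 2:
        return False               # no extension at all
    return parts[-1].upper() in whitelisted_exts

whitelist = {"PY", "C", "JAVA"}    # stand-in for the real extensions file
assert is_source_file("cas_manager.py", whitelist)
assert not is_source_file("README", whitelist)
assert not is_source_file("logo.png", whitelist)
```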
96 | list_ext_dir = os.path.join(os.path.dirname(__file__), "code_file_extentions.txt") 97 | file_exts_to_include = open(list_ext_dir).read().splitlines() 98 | 99 | for file in files_modified: 100 | 101 | # weed out bad files/binary files/etc 102 | if file != "'" and file != "": 103 | file_info = file.split(".") 104 | 105 | # get the extension 106 | if len(file_info) > 1: 107 | file_ext = (file_info[-1]).lower() # last segment, so 'foo.bar.py' yields 'py' 108 | 109 | # keep only whitelisted source code file endings 110 | if file_ext.upper() in file_exts_to_include: 111 | region_diff[file] = [] 112 | 113 | # split all the different regions 114 | regions = diff.split("diff --git")[1:] # remove the clutter 115 | 116 | # Next, we study each region to get file that was modified & the lines modified so we can annotate them later 117 | for region in regions: 118 | 119 | # We begin by splitting on the beginning of double at characters, which gives us an array looking like this: 120 | # [file info, line info {double at characters} modified code] 121 | chunks_initial = region.split(":CAS_DELIMITER_START:@@") 122 | 123 | # if a binary file it doesn't display the lines modified (a.k.a the 'line info {double at characters} modified code' part) 124 | if len(chunks_initial) == 1: 125 | continue 126 | 127 | file_info = chunks_initial[0] # file info is the first 'chunk', followed by the line_info {double at characters} modified code 128 | file_info_split = file_info.split(" ") 129 | file_name = file_info_split[1][2:] # remove the 'a/ character' 130 | 131 | # it is possible there is a binary file being tracked or something we shouldn't care about 132 | if file_name is None or file_name not in region_diff: 133 | continue 134 | 135 | # Next, we must know the lines modified so that we can annotate. To do this, we must further split the chunks_initial. 136 | # Specifically, we must separate the line info from the code info. The second part of the initial chunk looks like 137 | # -101,30, +202,33 {double at characters} code modified info.
We can be pretty certain that the line info doesn't contain 138 | # any at characters, so we can safely split on the first set of double at characters seen to divide this info up. 139 | 140 | # Iterate through - as one file can have multiple modified sections. 141 | for chunk in range(1, len(chunks_initial), 1): 142 | 143 | code_info_chunk = chunks_initial[chunk].split("@@",1) # split only on the first occurrence of the double at characters 144 | 145 | line_info = code_info_chunk[0] # This now contains the -101,30 +102,30 part (info about the lines modified) 146 | code_info = code_info_chunk[1] # This now contains the modified lines of code separated by the delimiter we set 147 | 148 | # As we only care about modified lines of code, we must ignore the +/additions as they do NOT exist in previous versions 149 | # and thus, we cannot even annotate them (they were added in this commit). So, we only care about the start where it was 150 | # modified and we will have to study which lines were modified and keep track of them. 151 | 152 | mod_line_info = line_info.split(" ")[1] # remove clutter -> we only care about the line where the modification started, first index is just empty 153 | mod_code_info = code_info.replace("\\n","").split(":CAS_DELIMITER:")[1:-1] # remove clutter -> first line contains info on the class and last line irrelevant 154 | 155 | # make sure this is legitimate.
expect modified line info to start with '-' 156 | if mod_line_info[0] != '-': 157 | continue 158 | 159 | # remove comma from mod_line_info as we only care about the start of the modification 160 | if mod_line_info.find(",") != -1: 161 | mod_line_info = mod_line_info[0:mod_line_info.find(",")] 162 | 163 | current_line = abs(int(mod_line_info)) # remove the '-' in front of the line number by abs 164 | 165 | # now only use the code line changes that MODIFY (not add) in the diff 166 | for section in mod_code_info: 167 | 168 | # this line modifies or deletes a line of code 169 | if section.startswith(":CAS_DELIMITER_START:-"): 170 | region_diff[file_name].append(str(current_line)) 171 | 172 | # we only increment on modified lines of code because added lines did NOT exist 173 | # in the previous commit! 174 | current_line += 1 175 | 176 | return region_diff 177 | 178 | 179 | def getModifiedRegions(self, commit): 180 | """ 181 | returns the list of regions that were modified/deleted between this commit and its ancestor. 182 | a region is simply the file and the loc in it that were modified. 183 | 184 | @commit - change to get the list of regions 185 | """ 186 | 187 | # diff cmd w/ no lines of context between current vs parent. 188 | # pipe it into bash and echo back with our own delimiter instead of new lines to separate each line 189 | # of the git output to make parsing this a reality! 190 | diff_cmd = "git diff " + commit.commit_hash + "^ "+ commit.commit_hash + " --unified=0 " \ 191 | + ' | while read; do echo ":CAS_DELIMITER_START:$REPLY:CAS_DELIMITER:"; done' 192 | 193 | # It is possible that a commit doesn't have a parent! i.e., merged from a clean branch.
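The hunk-header parsing above boils down to extracting the old-side start line from git's `@@ -a,b +c,d @@` marker. A condensed sketch of just that step, as a hypothetical helper:

```python
def start_line_of_hunk(line_info):
    """Return the first old-side line of a git hunk header, e.g. the 101
    in '@@ -101,30 +102,30 @@'. Expects the text between the two '@@'
    markers, as produced by the chunk splitting above."""
    old_side = line_info.split(" ")[1]       # '-101,30' (index 0 is empty)
    if "," in old_side:
        old_side = old_side[:old_side.find(",")]
    return abs(int(old_side))                # abs() drops the leading '-'

assert start_line_of_hunk(" -101,30 +102,30 ") == 101
assert start_line_of_hunk(" -7 +7,2 ") == 7  # single-line hunks omit the count
```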
194 | try: 195 | diff = str(subprocess.check_output(diff_cmd, shell=True, cwd= self.repo_path, executable="/bin/bash" )) 196 | 197 | # files changed, this is used by the getLineNumbersChanged function 198 | diff_cmd_lines_changed = "git diff " + commit.commit_hash + "^ "+ commit.commit_hash + " --name-only" 199 | 200 | # get the files modified -> use this to validate if we have arrived at a new file 201 | # when grepping for the specific lines changed. 202 | files_modified = str( subprocess.check_output( diff_cmd_lines_changed, shell=True, cwd= self.repo_path )).replace("b'", "").split("\\n") 203 | 204 | # now, let's get the file and the line number changed in the commit 205 | return self._getModifiedRegionsOnly(diff, files_modified) 206 | 207 | except subprocess.CalledProcessError: 208 | # The code change did not have a parent change! 209 | return {} 210 | 211 | def gitAnnotate(self, regions, commit): 212 | """ 213 | tracks down the origin of the deleted/modified loc in the regions dict using 214 | the git annotate (now called git blame) feature of git and a list of commit 215 | hashes of the most recent revision in which the line identified by the regions 216 | was modified. these discovered commits are identified as bug-introducing changes. 217 | 218 | git blame command is set up to start looking back starting from the commit BEFORE the 219 | commit that was passed in. this is because a bug MUST have occurred prior to this commit. 220 | 221 | @regions - a dict of {file} -> {list of line numbers that were modified} 222 | @commit - commit that belongs to the passed in chunks/regions. 223 | """ 224 | bug_introducing_changes = [] 225 | 226 | for file, lines in regions.items(): 227 | for line in lines: 228 | 229 | # assume if region starts at beginning it's a deletion or rename and ignore 230 | if line != 0 and line != "0" : 231 | 232 | # we need to git blame with the --follow option so that it follows renames in the file, and the '-l' 233 | # option gives us the complete commit hash.
additionally, start looking at the commit's ancestor 234 | buggy_change = str( subprocess.check_output( "git blame -L" + line + ",+1 " + commit.commit_hash + "^ -l -- '" \ 235 | + file + "'", shell=True, cwd= self.repo_path )).split(" ")[0][2:] 236 | 237 | if buggy_change not in bug_introducing_changes: 238 | bug_introducing_changes.append(buggy_change) 239 | 240 | return bug_introducing_changes 241 | -------------------------------------------------------------------------------- /analyzer/linear_reg_model.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | import rpy2.robjects as robjects # R integration 4 | from rpy2.robjects.packages import importr # import the importr package from R 5 | from orm.glmcoefficients import * # to store the glm coefficients 6 | from db import * # postgresql db information 7 | import math 8 | from caslogging import logging 9 | 10 | class LinearRegressionModel: 11 | """ 12 | builds the generalized linear regression model (GLM). 13 | all coefficients stored in the database under the glm_coefficients table 14 | probability: intercept + sum([metric_coefficient] * metric) 15 | """ 16 | 17 | def __init__(self, metrics, repo_id, testingCommits): 18 | """ 19 | @metrics - this is the list of metrics from the TRAINING data set. 20 | @repo_id - the repository repo_id 21 | @testingCommits - this is commits from the TESTING data set 22 | """ 23 | self.metrics = metrics 24 | self.repo_id = repo_id 25 | self.stats = importr('stats', robject_translations={'format_perc': '_format_perc'}) 26 | self.base = importr('base') 27 | self.readcsv = robjects.r['read.csv'] 28 | self.sig_threshold = 0.05 29 | self.data = None 30 | self.commits = testingCommits 31 | 32 | def buildModel(self): 33 | """ 34 | Builds the GLM model, stores the coefficients, and calculates the probability based on model that a commit 35 | will introduce a bug. 
36 | """ 37 | self._buildDataSet() 38 | self._buildModelIncrementally() 39 | 40 | def _buildDataSet(self): 41 | """ 42 | builds the data set to be used for getting the linear regression model. 43 | saves datasets in the datasets folder as csv files to easily be imported 44 | or used by R. 45 | """ 46 | 47 | # to write dataset file in this directory (git ignored!) 48 | current_dir = os.path.dirname(__file__) 49 | dir_of_datasets = current_dir + "/datasets/" 50 | num_buggy = getattr(self.metrics, "num_buggy") 51 | num_nonbuggy = getattr(self.metrics, "num_nonbuggy") 52 | 53 | with open(dir_of_datasets + self.repo_id + ".csv", "w") as file: 54 | csv_writer = csv.writer(file, dialect="excel") 55 | 56 | # write the columns 57 | csv_writer.writerow(["ns","nd","nf","entrophy","la","ld","lt","ndev","age","nuc","exp","rexp","sexp","is_buggy"]) 58 | 59 | # write the relevant data - start w/ the buggy data first 60 | for buggy_index in range(0,num_buggy): 61 | ns = self.metrics.ns_buggy[buggy_index] 62 | nd = self.metrics.nd_buggy[buggy_index] 63 | nf = self.metrics.nf_buggy[buggy_index] 64 | entrophy = self.metrics.entrophy_buggy[buggy_index] 65 | la = self.metrics.la_buggy[buggy_index] 66 | ld = self.metrics.ld_buggy[buggy_index] 67 | lt = self.metrics.lt_buggy[buggy_index] 68 | ndev = self.metrics.ndev_buggy[buggy_index] 69 | age = self.metrics.age_buggy[buggy_index] 70 | nuc = self.metrics.nuc_buggy[buggy_index] 71 | exp = self.metrics.exp_buggy[buggy_index] 72 | rexp = self.metrics.rexp_buggy[buggy_index] 73 | sexp = self.metrics.sexp_buggy[buggy_index] 74 | csv_writer.writerow([ns,nd,nf,entrophy,la,ld,lt,ndev,age,nuc,exp,rexp,sexp,True]) 75 | # end buggy data 76 | 77 | # write the non buggy data 78 | for nonbuggy_index in range(0,num_nonbuggy): 79 | ns = self.metrics.ns_nonbuggy[nonbuggy_index] 80 | nd = self.metrics.nd_nonbuggy[nonbuggy_index] 81 | nf = self.metrics.nf_nonbuggy[nonbuggy_index] 82 | entrophy = self.metrics.entrophy_nonbuggy[nonbuggy_index] 83 | la = 
self.metrics.la_nonbuggy[nonbuggy_index] 84 | ld = self.metrics.ld_nonbuggy[nonbuggy_index] 85 | lt = self.metrics.lt_nonbuggy[nonbuggy_index] 86 | ndev = self.metrics.ndev_nonbuggy[nonbuggy_index] 87 | age = self.metrics.age_nonbuggy[nonbuggy_index] 88 | nuc = self.metrics.nuc_nonbuggy[nonbuggy_index] 89 | exp = self.metrics.exp_nonbuggy[nonbuggy_index] 90 | rexp = self.metrics.rexp_nonbuggy[nonbuggy_index] 91 | sexp = self.metrics.sexp_nonbuggy[nonbuggy_index] 92 | csv_writer.writerow([ns,nd,nf,entrophy,la,ld,lt,ndev,age,nuc,exp,rexp,sexp,False]) 93 | # end non buggy data 94 | # end file 95 | 96 | def _isMetricSignificant(self, formula_metrics, metric): 97 | """ 98 | Checks if adding a metric to the already significant metrics in formula_metrics in a GLM model is significant. If significant, 99 | and doesn't cause any previous metric in formula_metrics to become non significant, we return true. Otherwise, false. 100 | 101 | Note: The p-value is always given in the 4th column of the summary matrix! 102 | """ 103 | sig_column = 4 104 | 105 | # Case 1: no existing metrics in the formula 106 | if len(formula_metrics) == 0: 107 | formula = "is_buggy~" + metric 108 | fit = self.stats.glm(formula, data=self.data, family="binomial") 109 | summary = self.base.summary(fit) 110 | # Note - first row is the intercept information so we start at second row! 111 | 112 | try: 113 | metric_sig = summary.rx2('coefficients').rx(2,sig_column)[0] # Second row, 4th column of the summary matrix. 114 | if metric_sig <= self.sig_threshold: 115 | return True 116 | else: 117 | return False 118 | 119 | except: 120 | # If we have two metrics that are perfectly collinear it will not build the model with the metrics 121 | # and we will get an exception when trying to find the significance of *all values*. Indeed, do not add 122 | # this value to the model! 
123 | return False 124 | 125 | # Case 2: existing metrics in the formula 126 | else: 127 | num_metrics = len(formula_metrics)+2 # plus one for the new metric we are adding and one for intercept 128 | formula = "is_buggy~" + "+".join(formula_metrics) + "+" + metric 129 | fit = self.stats.glm(formula, data=self.data, family="binomial") 130 | summary = self.base.summary(fit) 131 | 132 | # If any metric is now not significant, then we should not have added this metric to the formula 133 | # There are (intercept) + num_metrics rows in the matrix to check - starts at second row skipping intercept 134 | try: 135 | for row in range(2,num_metrics+1): 136 | metric_sig = summary.rx2('coefficients').rx(row,sig_column)[0] 137 | if metric_sig > self.sig_threshold: 138 | return False 139 | return True # old metrics added to model ARE significant still as well as the new one being tested 140 | 141 | except: 142 | # If we have two metrics that are perfectly collinear it will not build the model with the metrics 143 | # and we will get an exception when trying to find the significance of *all values*. Indeed, do not add 144 | # this value to the model! 145 | return False 146 | 147 | def _buildModelIncrementally(self): 148 | """ 149 | Builds the linear regression model incrementally. It adds one metric at a time to the formula and keeps it 150 | if it is significant. However, if adding it to the model causes any other metric already added to the formula 151 | to become not significant anymore, we do not add it to the glm formula.
152 | """ 153 | 154 | metrics_list = ["la","ld","lt","ns","nd","nf","ndev","age","nuc","exp","rexp","sexp","entrophy"] 155 | formula_metrics = [] 156 | current_dir = os.path.dirname(__file__) 157 | dir_of_datasets = current_dir + "/datasets/" 158 | self.data = self.readcsv(dir_of_datasets + self.repo_id + ".csv", header=True, sep = ",") 159 | 160 | for metric in metrics_list: 161 | if self._isMetricSignificant(formula_metrics, metric): 162 | formula_metrics.append(metric) 163 | 164 | # Store coefficients of our model w/ formula containing only the sig coefficients 165 | self._storeCoefficients(formula_metrics) 166 | 167 | # Calculate all probability for each commit to introduce a bug 168 | self.calculateCommitRiskyness(self.commits, formula_metrics) 169 | 170 | 171 | def _getCoefficients(self, formula_coefs): 172 | """ 173 | Builds a GLM model with a formula based on the passed in coefficients and retuns a dictionary containing each 174 | coefficient with its value. 175 | """ 176 | coef_dict = {} # a dict containing glm coefficients {name -> value} 177 | formula = "is_buggy~" + "+".join(formula_coefs) 178 | fit = self.stats.glm(formula, data=self.data, family="binomial") 179 | 180 | for coef in formula_coefs: 181 | coef_dict[coef] = fit.rx2('coefficients').rx2(coef)[0] 182 | 183 | return coef_dict 184 | 185 | def _getInterceptValue(self, coefs): 186 | """ 187 | Return the Intercept value of a GLM model and the p-value 188 | Assumes that model can be built! 189 | """ 190 | formula = "is_buggy~" + "+".join(coefs) 191 | fit = self.stats.glm(formula, data=self.data, family="binomial") 192 | summary = self.base.summary(fit) 193 | return summary.rx2('coefficients').rx(1)[0], summary.rx2('coefficients').rx(1,4)[0] 194 | 195 | def _getCoefficientObject(self, coef_name, coef_value): 196 | """ 197 | returns a JSON object representation of coefficient given the name and value. 
for coefficient significance, true or false 198 | is given depending on whether it meets the significance threshold 199 | """ 200 | coef_object = "" 201 | coef_object += '"' + str(coef_name) + '":"' + str(coef_value) 202 | return coef_object + '",' 203 | 204 | def _storeCoefficients(self, coefficient_names): 205 | """ 206 | stores the glm coefficients in the database 207 | """ 208 | # We are making this into JSON to simply store it in the database. 209 | coefs = "" 210 | coefs += '"repo":"' + str(self.repo_id) + '",' 211 | 212 | # 2 Cases: where there are NO significant coefficients and the reverse case. 213 | if len(coefficient_names) == 0: 214 | coefficient_dict = {} 215 | else: 216 | coefficient_dict = self._getCoefficients(coefficient_names) 217 | 218 | # get the constant (aka intercept value) 219 | intercept_value, intercept_pvalue = self._getInterceptValue(coefficient_names) 220 | if intercept_pvalue <= self.sig_threshold: 221 | intercept_sig = 1 222 | else: 223 | intercept_sig = 0 224 | 225 | coefs += self._getCoefficientObject("intercept", intercept_value) 226 | coefs += self._getCoefficientObject("intercept_sig", intercept_sig) 227 | 228 | # Keep track of all and the subset of all that are significant as we need to record everything to the db 229 | sig_coefs = [] 230 | all_coefs = ["ns", "nd", "nf", "entrophy", "la", "ld", "lt", "ndev", "age", "nuc", "exp", "rexp", "sexp"] 231 | 232 | # iterate through all the values in the dict containing coefficients 233 | for coef_name, coef_value in coefficient_dict.items(): 234 | coefs += self._getCoefficientObject(coef_name, coef_value) 235 | coefs += self._getCoefficientObject(coef_name + "_sig", 1) # keep track more easily which are statistically significant in db 236 | sig_coefs.append(coef_name) 237 | 238 | # append the non-significant coefficients as -1 and not significant 239 | for c in all_coefs: 240 | if c not in sig_coefs: 241 | coefs += self._getCoefficientObject(c, -1) 242 | coefs += self._getCoefficientObject(c +
"_sig", 0) 243 | 244 | # remove the trailing comma 245 | coefs = coefs[:-1] 246 | 247 | # Insert into the coefficient table 248 | coefSession = Session() 249 | allCoef = GlmCoefficients(json.loads('{' + coefs + '}')) 250 | 251 | # Copy to db 252 | coefSession.merge(allCoef) 253 | 254 | # Write 255 | coefSession.commit() 256 | coefSession.close() 257 | 258 | def calculateCommitRiskyness(self, commits, coefficient_names): 259 | """ 260 | calculate the probability of commits being buggy or not 261 | using the linear regression model 262 | 263 | estimated probability = 1/[1 + exp(-a - BX)] 264 | """ 265 | # 2 cases: model cannot possibly be built if no significant coefficients are available 266 | # in this case, we just insert -1 for the probability to indicate no glm prediction possible 267 | 268 | if len(coefficient_names) == 0: 269 | coefficient_dict = {} 270 | model_available = False 271 | else: 272 | coefficient_dict = self._getCoefficients(coefficient_names) 273 | model_available = True 274 | intercept_value, intercept_pvalue = self._getInterceptValue(coefficient_names) 275 | 276 | for commit in commits: 277 | 278 | if not model_available: 279 | commit.glm_probability = -1 280 | else: 281 | coefs_sum = 0 282 | for coef_name, coef_value in coefficient_dict.items(): 283 | coefs_sum += (coef_value * getattr(commit, coef_name)) 284 | 285 | try: 286 | riskyness = 1/(1+ math.exp(-intercept_value-coefs_sum)) 287 | except OverflowError: 288 | logging.error("Overflow error for repo " + self.repo_id) 289 | logging.error("Calculating riskyness for " + commit.commit_hash) 290 | logging.error("Sum of coefficients: " + str(coefs_sum)) 291 | logging.error("Coefficients: " + str(coefficient_dict)) 292 | riskyness = 0.01 293 | 294 | commit.glm_probability = riskyness -------------------------------------------------------------------------------- /ingester/git.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess
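calculateCommitRiskyness above evaluates the logistic formula `1/(1 + exp(-a - BX))` per commit. The core computation, isolated as a hypothetical helper with plain dicts standing in for the fitted model and the Commit ORM object:

```python
import math

def glm_probability(intercept, coefficients, commit_metrics):
    """Sketch of the core of calculateCommitRiskyness:
    p = 1 / (1 + exp(-intercept - sum(coef * metric)))."""
    coefs_sum = sum(value * commit_metrics[name]
                    for name, value in coefficients.items())
    try:
        return 1 / (1 + math.exp(-intercept - coefs_sum))
    except OverflowError:
        return 0.01    # same fallback the module logs and uses

assert glm_probability(0.0, {}, {}) == 0.5                       # no signal
assert abs(glm_probability(-1.0, {"la": 0.02}, {"la": 50.0}) - 0.5) < 1e-9
```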
3 | import json 4 | import logging 5 | import math # Required for the math.log function 6 | from ingester.commitFile import * # Represents a file 7 | from classifier.classifier import * # Used for classifying each commit 8 | import time 9 | 10 | """ 11 | file: git.py 12 | authors: Ben Grawi , Christoffer Rosen 13 | date: October 2013 14 | description: Holds the repository git abstraction class 15 | """ 16 | 17 | class Git(): 18 | """ 19 | Git(): 20 | pre-conditions: git is in the current PATH 21 | self.path is set in a parent class 22 | description: a very basic abstraction for using git in python. 23 | """ 24 | # Two backslashes to allow one backslash to be passed in the command. 25 | # This is given as a command line option to git for formatting output. 26 | 27 | # A commit message in git is written such that the first line is treated as the subject, 28 | # and the rest is treated as the message. We combine them under field commit_message 29 | 30 | # We want the log in ascending order, so we call --reverse 31 | # Numstat is used to get statistics for each commit 32 | LOG_FORMAT = '--pretty=format:\" CAS_READER_STARTPRETTY\ 33 | \\"parent_hashes\\"CAS_READER_PROP_DELIMITER: \\"%P\\",CAS_READER_PROP_DELIMITER2\ 34 | \\"commit_hash\\"CAS_READER_PROP_DELIMITER: \\"%H\\",CAS_READER_PROP_DELIMITER2\ 35 | \\"author_name\\"CAS_READER_PROP_DELIMITER: \\"%an\\",CAS_READER_PROP_DELIMITER2\ 36 | \\"author_email\\"CAS_READER_PROP_DELIMITER: \\"%ae\\",CAS_READER_PROP_DELIMITER2\ 37 | \\"author_date\\"CAS_READER_PROP_DELIMITER: \\"%ad\\",CAS_READER_PROP_DELIMITER2\ 38 | \\"author_date_unix_timestamp\\"CAS_READER_PROP_DELIMITER: \\"%at\\",CAS_READER_PROP_DELIMITER2\ 39 | \\"commit_message\\"CAS_READER_PROP_DELIMITER: \\"%s%b\\"\ 40 | CAS_READER_STOPPRETTY \" --numstat --reverse ' 41 | 42 | CLONE_CMD = 'git clone {!s} {!s}' # git clone command w/o downloading src code 43 | PULL_CMD = 'git pull' # git pull command 44 | RESET_CMD = 'git reset --hard FETCH_HEAD' 45 | CLEAN_CMD =
'git clean -df' # f for force clean, d for untracked directories 46 | 47 | REPO_DIRECTORY = "/CASRepos/git/" # directory in which to store repositories 48 | 49 | def getCommitStatsProperties( stats, commitFiles, devExperience, author, unixTimeStamp ): 50 | """ 51 | getCommitStatsProperties 52 | Helper method for log. Calculates statistics for each change/commit and 53 | returns them as a comma separated string. Log will add these to the commit object 54 | properties 55 | 56 | @param stats These are the stats given by --numstat as an array 57 | @param commitFiles These are all tracked commit files 58 | @param devExperience These are all tracked developer experiences 59 | @param author The author of the commit 60 | @param unixTimeStamp Time of the commit 61 | """ 62 | 63 | statProperties = "" 64 | 65 | # Data structures to keep track of info needed for stats 66 | subsystemsSeen = [] # List of system names seen 67 | directoriesSeen = [] # List of directory names seen 68 | locModifiedPerFile = [] # List of modified loc in each file seen 69 | authors = [] # List of all unique authors seen for each file 70 | fileAges = [] # List of the ages for each file in a commit 71 | 72 | # Stats variables 73 | la = 0 # lines added 74 | ld = 0 # lines deleted 75 | nf = 0 # Number of modified files 76 | ns = 0 # Number of modified subsystems 77 | nd = 0 # number of modified directories 78 | entrophy = 0 # entrophy: distribution of modified code across each file 79 | lt = 0 # lines of code in each file (sum) before the commit 80 | ndev = 0 # the number of developers that modified the files in a commit 81 | age = 0 # the average time interval between the last and current change 82 | exp = 0 # number of changes made by author previously 83 | rexp = 0 # experience weighted by age of files ( 1 / (n + 1)) 84 | sexp = 0 # changes made previously by the author in same subsystem 85 | totalLOCModified = 0 # Total modified LOC across all files 86 | nuc = 0 # number of unique changes to the files 87 |
filesSeen = "" # files seen in change/commit 88 | 89 | for stat in stats: 90 | 91 | if( stat == ' ' or stat == '' ): 92 | continue 93 | 94 | fileStat = stat.split("\\t") 95 | 96 | # Check that we are only looking at a file stat line (i.e., skip extra newlines) 97 | if( len(fileStat) < 2): 98 | continue 99 | 100 | # Binary files: git reports "-" for line counts; treat them as 0 101 | try: 102 | fileLa = int(fileStat[0]) 103 | fileLd = int(fileStat[1]) 104 | except ValueError: 105 | fileLa = 0 106 | fileLd = 0 107 | 108 | # Remove oddities in the filename so we can process it 109 | fileName = (fileStat[2].replace("'",'').replace('"','').replace("\\","")) 110 | 111 | totalModified = fileLa + fileLd 112 | 113 | # have we seen this file already? 114 | if(fileName in commitFiles): 115 | prevFileChanged = commitFiles[fileName] 116 | prevLOC = getattr(prevFileChanged, 'loc') 117 | prevAuthors = getattr(prevFileChanged, 'authors') 118 | prevChanged = getattr(prevFileChanged, 'lastchanged') 119 | file_nuc = getattr(prevFileChanged, 'nuc') 120 | nuc += file_nuc 121 | lt += prevLOC 122 | 123 | for prevAuthor in prevAuthors: 124 | if prevAuthor not in authors: 125 | authors.append(prevAuthor) 126 | 127 | # Convert age to days instead of seconds 128 | age += ( (int(unixTimeStamp) - int(prevChanged)) / 86400 ) 129 | fileAges.append(prevChanged) 130 | 131 | # Update the file info 132 | 133 | file_nuc += 1 # file was modified in this commit 134 | setattr(prevFileChanged, 'loc', prevLOC + fileLa - fileLd) 135 | setattr(prevFileChanged, 'authors', authors) 136 | setattr(prevFileChanged, 'lastchanged', unixTimeStamp) 137 | setattr(prevFileChanged, 'nuc', file_nuc) 138 | 139 | else: 140 | 141 | # New file we haven't seen before; add it to the commit files dict 142 | if(author not in authors): 143 | authors.append(author) 144 | 145 | if(unixTimeStamp not in fileAges): 146 | fileAges.append(unixTimeStamp) 147 | 148 | fileObject = CommitFile(fileName, fileLa - fileLd, authors, unixTimeStamp) 149 | commitFiles[fileName] = fileObject 150 | 151
| # end of the file-seen if/else (still inside the stats loop) 152 | 153 | locModifiedPerFile.append(totalModified) # Required for entropy 154 | totalLOCModified += totalModified 155 | fileDirs = fileName.split("/") 156 | 157 | if( len(fileDirs) == 1 ): 158 | subsystem = "root" 159 | directory = "root" 160 | else: 161 | subsystem = fileDirs[0] 162 | directory = "/".join(fileDirs[0:-1]) 163 | 164 | if( subsystem not in subsystemsSeen ): 165 | subsystemsSeen.append( subsystem ) 166 | 167 | if( author in devExperience ): 168 | experiences = devExperience[author] 169 | exp += sum(experiences.values()) 170 | 171 | if( subsystem in experiences ): 172 | sexp = experiences[subsystem] 173 | experiences[subsystem] += 1 174 | else: 175 | experiences[subsystem] = 1 176 | 177 | try: 178 | rexp += (1 / (age + 1)) # weighted by file age, as documented above 179 | except ZeroDivisionError: 180 | rexp += 0 181 | 182 | else: 183 | devExperience[author] = {subsystem: 1} 184 | 185 | if( directory not in directoriesSeen ): 186 | directoriesSeen.append( directory ) 187 | 188 | # Update file-level metrics 189 | la += fileLa 190 | ld += fileLd 191 | nf += 1 192 | filesSeen += fileName + ",CAS_DELIMITER," 193 | 194 | # End stats loop 195 | 196 | if( nf < 1): 197 | return "" 198 | 199 | # Update commit-level metrics 200 | ns = len(subsystemsSeen) 201 | nd = len(directoriesSeen) 202 | ndev = len(authors) 203 | lt = lt / nf 204 | age = age / nf 205 | exp = exp / nf 206 | rexp = rexp / nf 207 | 208 | # Compute entropy over the modified-LOC distribution 209 | for fileLocMod in locModifiedPerFile: 210 | if (fileLocMod != 0 ): 211 | avg = fileLocMod/totalLOCModified 212 | entrophy -= ( avg * math.log( avg,2 ) ) 213 | 214 | # Add stat properties to the commit object 215 | statProperties += ',"la":"' + str( la ) + '\"' 216 | statProperties += ',"ld":"' + str( ld ) + '\"' 217 | statProperties += ',"fileschanged":"' + filesSeen[0:-1] + '\"' 218 | statProperties += ',"nf":"' + str( nf ) + '\"' 219 | statProperties += ',"ns":"' + str( ns ) + '\"' 220 | statProperties += ',"nd":"' + str( nd ) + '\"' 221 | statProperties += 
',"entrophy":"' + str( entrophy ) + '\"' 222 | statProperties += ',"ndev":"' + str( ndev ) + '\"' 223 | statProperties += ',"lt":"' + str( lt ) + '\"' 224 | statProperties += ',"nuc":"' + str( nuc ) + '\"' 225 | statProperties += ',"age":"' + str( age ) + '\"' 226 | statProperties += ',"exp":"' + str( exp ) + '\"' 227 | statProperties += ',"rexp":"' + str( rexp ) + '\"' 228 | statProperties += ',"sexp":"' + str( sexp ) + '\"' 229 | 230 | return statProperties 231 | # End stats 232 | 233 | def log(self, repo, firstSync): 234 | """ 235 | log(): Repository, Boolean -> List 236 | arguments: repo Repository: the repository to log 237 | firstSync Boolean: whether to sync all commits or only those after the 238 | ingestion date 239 | description: runs git log, parses each commit, and returns the commits 240 | as a list of dictionaries. 241 | """ 242 | repo_dir = os.path.dirname(__file__) + self.REPO_DIRECTORY + repo.id # os.chdir returns None; pass the path as cwd instead 243 | logging.info('Getting/parsing git commits: '+ str(repo) ) 244 | 245 | # Spawn a git process and convert the output to a string 246 | if not firstSync and repo.ingestion_date is not None: 247 | cmd = 'git log --after="' + repo.ingestion_date + '" ' 248 | else: 249 | cmd = 'git log ' 250 | log = str( subprocess.check_output(cmd + self.LOG_FORMAT, shell=True, cwd = repo_dir ) ) 251 | log = log[2:-1] # Strip the b'...' wrapper left by the bytes-to-str conversion 252 | 253 | # List of json objects 254 | json_list = [] 255 | 256 | # Make sure there are commits to parse 257 | if len(log) == 0: 258 | return [] 259 | 260 | commitFiles = {} # keep track of ALL file changes 261 | devExperience = {} # keep track of ALL developer experience 262 | classifier = Classifier() # classifier for classifying commits (i.e., corrective, feature addition, etc) 263 | 264 | commitList = log.split("CAS_READER_STARTPRETTY") 265 | 266 | for commit in commitList: 267 | author = "" # author of commit 268 | unixTimeStamp = 0 # timestamp of commit 269 | fix = False # whether or not the change is a defect fix 270 | classification = None # 
classification of the commit (i.e., corrective, feature addition, etc) 271 | isMerge = False # whether or not the change is a merge 272 | 273 | commit = commit.replace('\\x', '\\u00') # Replace invalid JSON escape characters 274 | splitCommitStat = commit.split("CAS_READER_STOPPRETTY") # split the commit info and its stats 275 | 276 | # The first element of the outer split is empty, so it has no stats part 277 | if(len(splitCommitStat) < 2): 278 | continue 279 | 280 | prettyCommit = splitCommitStat[0] 281 | statCommit = splitCommitStat[1] 282 | commitObject = "" 283 | 284 | # Start with the commit info (i.e., commit hash, author, date, subject, etc) 285 | prettyInfo = prettyCommit.split(',CAS_READER_PROP_DELIMITER2 "') 286 | for propValue in prettyInfo: 287 | props = propValue.split('"CAS_READER_PROP_DELIMITER: "') 288 | propStr = '' 289 | for prop in props: 290 | prop = prop.replace('\\','').replace("\\n", '') # avoid escapes & newlines for JSON formatting 291 | propStr = propStr + '"' + prop.replace('"','') + '":' 292 | 293 | values = propStr[0:-1].split(":") 294 | 295 | if(values[0] == '" parent_hashes"'): 296 | # Check to see if this is a merge change. Fix for Issue #26.
297 | # Detects merges by counting the number of parent commits 298 | 299 | parents = values[1].split(' ') 300 | if len(parents) >= 2: # two or more parents indicates a merge (octopus merges have more than two) 301 | isMerge = True 302 | 303 | if(values[0] == '"author_name"'): 304 | author = values[1].replace('"', '') 305 | 306 | if(values[0] == '"author_date_unix_timestamp"'): 307 | unixTimeStamp = values[1].replace('"','') 308 | 309 | # Classify the commit 310 | if(values[0] == '"commit_message"'): 311 | 312 | if (isMerge): 313 | classification = "Merge" 314 | else: 315 | classification = classifier.categorize(values[1].lower()) 316 | 317 | # If it is a corrective commit, we infer that it fixes a bug somewhere in the system 318 | if classification == "Corrective": 319 | fix = True 320 | 321 | 322 | commitObject += "," + propStr[0:-1] 323 | # End property loop 324 | # End pretty info loop 325 | 326 | # Get the stat properties 327 | stats = statCommit.split("\\n") 328 | commitObject += self.getCommitStatsProperties(stats, commitFiles, devExperience, author, unixTimeStamp) 329 | 330 | # Update the classification of the commit 331 | commitObject += ',"classification":"' + str( classification ) + '\"' 332 | 333 | # Update whether commit was a fix or not 334 | commitObject += ',"fix":"' + str( fix ) + '\"' 335 | 336 | # Remove first comma and extra space 337 | commitObject = commitObject[1:].replace(' ','') 338 | 339 | # Add commit object to json_list 340 | json_list.append(json.loads('{' + commitObject + '}')) 341 | # End commit loop 342 | 343 | logging.info('Done getting/parsing git commits.') 344 | return json_list 345 | 346 | def clone(self, repo): 347 | """ 348 | clone(repo): Repository -> String 349 | description: Takes the current repo and clones it into 350 | `clone_directory/the_repo_id` 351 | arguments: repo Repository: the repository to clone 352 | pre-conditions: The repo has not already been cloned 353 | """ 354 | repo_dir = os.path.dirname(__file__) + self.REPO_DIRECTORY # os.chdir returns None; pass the path as cwd instead 355 | 356 | # Run the clone command and return the 
results 357 | 358 | logging.info('Git cloning repo: '+ str(repo) ) 359 | cloneResult = str(subprocess.check_output( 360 | self.CLONE_CMD.format(repo.url, './' + repo.id), 361 | shell= True, 362 | cwd = repo_dir ) ) 363 | logging.info('Done cloning.') 364 | #logging.debug("Git clone result:\n" + cloneResult) 365 | 366 | # Reset path for next repo 367 | 368 | # TODO: only return true on success, else return false 369 | return True 370 | 371 | def pull(self, repo): 372 | """ 373 | pull(repo): Repository -> String 374 | description: Takes the current repo and pulls the latest changes. 375 | arguments: repo Repository: the repository to pull 376 | pre-conditions: The repo has already been cloned 377 | """ 378 | 379 | repo_dir = os.path.dirname(__file__) + self.REPO_DIRECTORY + repo.id 380 | 381 | # Odd scenario where something in the repo gets modified; reset all changes before pulling 382 | subprocess.call(self.RESET_CMD, shell=True, cwd= repo_dir) 383 | subprocess.call(self.CLEAN_CMD, shell=True, cwd= repo_dir) 384 | 385 | # Run the pull command and return the results 386 | logging.info('Pulling latest changes from repo: '+ str(repo) ) 387 | fetchResult = str(subprocess.check_output( 388 | self.RESET_CMD + "\n" + self.PULL_CMD , 389 | shell=True, 390 | cwd= repo_dir ) ) 391 | logging.info('Done pulling.') 392 | #logging.debug("Git pull result:\n" + fetchResult) 393 | 394 | # TODO: only return true on success, else return false 395 | return True 396 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed.
8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. (Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 
42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. 
The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 
102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. 
You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 
165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. 
If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. 
If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 
292 | 293 | {description} 294 | Copyright (C) {year} {fullname} 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 
331 | 332 | {signature of Ty Coon}, 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | --------------------------------------------------------------------------------
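The change-entropy metric accumulated in `getCommitStatsProperties` in ingester/git.py above (the `entrophy` variable) is the Shannon entropy of how a commit's modified lines are distributed across its files. A minimal standalone sketch of that calculation, for reference; the helper name `change_entropy` is mine, not part of the codebase:

```python
import math

def change_entropy(loc_modified_per_file):
    """Shannon entropy (base 2) of the modified-LOC distribution.

    Mirrors the entropy loop in getCommitStatsProperties: each file's
    share of the total modified lines is treated as a probability.
    """
    total = sum(loc_modified_per_file)
    if total == 0:
        return 0.0
    entropy = 0.0
    for loc in loc_modified_per_file:
        if loc != 0:
            p = loc / total
            entropy -= p * math.log(p, 2)
    return entropy
```

A commit spread evenly over two files gives the maximum value for two files (`change_entropy([10, 10])` is 1.0), while a commit confined to a single file gives 0.0; more scattered commits are treated as riskier in the defect-prediction metrics this tool computes.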