0) {
28 | newstring = str_append(newstring, replace);
29 | i = i + str_len (search)-1;
30 | count = count - 1;
31 | }
32 | else {
33 | newstring = str_append_chr(newstring, subject[i]);
34 | }
35 |
36 | }
37 | else {
38 | newstring = str_append_chr(newstring, subject[i]);
39 | }
40 | }
41 | else {
42 | newstring = str_append_chr(newstring, subject[i]);
43 | }
44 | }
45 | return newstring;
46 | }
47 |
48 | /* added in B */
49 | int str_equals(char *equal1, char *eqaul2)
50 | {
51 | while(*equal1==*eqaul2)
52 | {
53 | if ( *equal1 == '\0' || *eqaul2 == '\0' ){break;}
54 | equal1++;
55 | eqaul2++;
56 | }
57 | if(*eqaul1 == '\0' && *eqaul2 == '\0' ){return 0;}
58 | else {return -1};
59 | }
--------------------------------------------------------------------------------
/test/test_feature_branch/I/feature-G.c:
--------------------------------------------------------------------------------
1 | /* added in G */
2 | char* str_replace(char* search, char* replace, char* subject) {
3 | char* newstring = "";
4 | int i = 0;
5 | for(i = 0; i < str_len(subject); i++) {
6 | if (subject[i] == search[0]) {
7 | int e = 0;
8 | char* calc = "";
9 | for(e = 0; e < str_len(search); e++) {
10 | if(subject[i+e] == search[e]) {
11 | calc = str_append_chr(calc, search[e]);
12 | }
13 | }
14 | if (str_equals(search, calc) == 0) {
15 | newstring = str_append(newstring, replace);
16 | i = i + str_len (search)-1;
17 | }
18 | else {
19 | newstring = str_append_chr(newstring, subject[i]);
20 | }
21 | }
22 | else {
23 | newstring = str_append_chr(newstring, subject[i]);
24 | }
25 | }
26 | return newstring;
27 | }
--------------------------------------------------------------------------------
/test/test_feature_branch/I/feature-H.c:
--------------------------------------------------------------------------------
1 | /* added in H */
2 | struct node
3 | {
4 | int data;
5 | struct node *next;
6 | }*head;
7 |
8 | /* added in H, edited in I */
9 | void append(int num)
10 | {
11 | struct node *temp, *prev;
12 | temp=head;
13 | while(temp!=NULL)
14 | {
15 | if(temp->data==num)
16 | {
17 | if(temp==head)
18 | {
19 | head=temp->next;
20 | free(temp);
21 | return 1;
22 | }
23 | else
24 | {
25 | prev->next=temp->next;
26 | free(temp);
27 | return 1;
28 | }
29 | }
30 | else
31 | {
32 | prev=temp;
33 | temp= temp->next;
34 | }
35 | }
36 | return 0;
37 | }
38 |
39 | /* added in H, edited in G */
40 | void add( int num )
41 | {
42 | struct node *temp;
43 | temp=(struct node *)malloc(sizeof(struct node));
44 | temp->data=num;
45 | if (head== NULL)
46 | {
47 | head=temp;
48 | head->next=NULL;
49 | }
50 | }
51 |
52 | /* insert() is deleted in I */
53 |
--------------------------------------------------------------------------------
/test/test_feature_branch/I/main.c:
--------------------------------------------------------------------------------
1 | /* added in A */
2 | int str_len(char *string)
3 | {
4 | char *count = string;
5 | while(*count) {count++;}
6 | return count - string;
7 | }
8 |
9 | /* str_append is deleted in B */
10 |
11 | int str_equals(char *equal1, char *eqaul2); // Forward decl
12 |
13 | /* added in B, edited in C */
14 | char* str_append_chr(char* string, char append) {
15 | char* newstring = "";
16 | int i = 0;
17 | for(i = 0; i < str_len(subject); i++) {
18 | if (subject[i] == search[0]) {
19 | int e = 0;
20 | char* calc = "";
21 | for(e = 0; e < str_len(search); e++) {
22 | if(subject[i+e] == search[e]) {
23 | calc = str_append_chr(calc, search[e]);
24 | }
25 | }
26 | if (str_equals(search, calc) == 0) {
27 | if(count > 0) {
28 | newstring = str_append(newstring, replace);
29 | i = i + str_len (search)-1;
30 | count = count - 1;
31 | }
32 | else {
33 | newstring = str_append_chr(newstring, subject[i]);
34 | }
35 |
36 | }
37 | else {
38 | newstring = str_append_chr(newstring, subject[i]);
39 | }
40 | }
41 | else {
42 | newstring = str_append_chr(newstring, subject[i]);
43 | }
44 | }
45 | return newstring;
46 | }
47 |
48 | /* added in B */
49 | int str_equals(char *equal1, char *eqaul2)
50 | {
51 | while(*equal1==*eqaul2)
52 | {
53 | if ( *equal1 == '\0' || *eqaul2 == '\0' ){break;}
54 | equal1++;
55 | eqaul2++;
56 | }
57 | if(*eqaul1 == '\0' && *eqaul2 == '\0' ){return 0;}
58 | else {return -1};
59 | }
--------------------------------------------------------------------------------
/test/test_feature_branch/J/feature-J.c:
--------------------------------------------------------------------------------
1 | /* added in J */
2 | void display(struct node *r)
3 | {
4 | r=head;
5 | if(r==NULL)
6 | {
7 | return;
8 | }
9 | while(r!=NULL)
10 | {
11 | printf("%d ",r->data);
12 | r=r->next;
13 | }
14 | printf("\n");
15 | }
16 |
17 | /* added in J */
18 | int count()
19 | {
20 | struct node *n;
21 | int c=0;
22 | n=head;
23 | while(n!=NULL)
24 | {
25 | n=n->next;
26 | c++;
27 | }
28 | return c;
29 | }
--------------------------------------------------------------------------------
/test/test_feature_branch/J/main.c:
--------------------------------------------------------------------------------
1 | /* added in A */
2 | int str_len(char *string)
3 | {
4 | char *count = string;
5 | while(*count) {count++;}
6 | return count - string;
7 | }
8 |
9 | /* added in A*/
10 | char* str_append(char* string, char* append) {
11 | char* newstring = NULL;
12 | size_t needed = snprintf(NULL, 0, "%s%s", string, append);
13 | newstring = malloc(needed);
14 | sprintf(newstring, "%s%s", string, append);
15 | return newstring;
16 | }
17 |
18 |
--------------------------------------------------------------------------------
/test/test_feature_branch/K/feature-G.c:
--------------------------------------------------------------------------------
1 | /* added in G */
2 | char* str_replace(char* search, char* replace, char* subject) {
3 | char* newstring = "";
4 | int i = 0;
5 | for(i = 0; i < str_len(subject); i++) {
6 | if (subject[i] == search[0]) {
7 | int e = 0;
8 | char* calc = "";
9 | for(e = 0; e < str_len(search); e++) {
10 | if(subject[i+e] == search[e]) {
11 | calc = str_append_chr(calc, search[e]);
12 | }
13 | }
14 | if (str_equals(search, calc) == 0) {
15 | newstring = str_append(newstring, replace);
16 | i = i + str_len (search)-1;
17 | }
18 | else {
19 | newstring = str_append_chr(newstring, subject[i]);
20 | }
21 | }
22 | else {
23 | newstring = str_append_chr(newstring, subject[i]);
24 | }
25 | }
26 | return newstring;
27 | }
--------------------------------------------------------------------------------
/test/test_feature_branch/K/feature-H.c:
--------------------------------------------------------------------------------
1 | /* added in H */
2 | struct node
3 | {
4 | int data;
5 | struct node *next;
6 | }*head;
7 |
8 | /* added in H, edited in I */
9 | void append(int num)
10 | {
11 | struct node *temp, *prev;
12 | temp=head;
13 | while(temp!=NULL)
14 | {
15 | if(temp->data==num)
16 | {
17 | if(temp==head)
18 | {
19 | head=temp->next;
20 | free(temp);
21 | return 1;
22 | }
23 | else
24 | {
25 | prev->next=temp->next;
26 | free(temp);
27 | return 1;
28 | }
29 | }
30 | else
31 | {
32 | prev=temp;
33 | temp= temp->next;
34 | }
35 | }
36 | return 0;
37 | }
38 |
39 | /* added in H, edited in G */
40 | void add( int num )
41 | {
42 | struct node *temp;
43 | temp=(struct node *)malloc(sizeof(struct node));
44 | temp->data=num;
45 | if (head== NULL)
46 | {
47 | head=temp;
48 | head->next=NULL;
49 | }
50 | }
51 |
52 | /* insert() is deleted in I */
53 |
--------------------------------------------------------------------------------
/test/test_feature_branch/K/feature-K.c:
--------------------------------------------------------------------------------
1 | /* added in J, edited in K */
2 | void display(struct node *r)
3 | {
4 | r=head;
5 | if(r==NULL)
6 | {
7 | return;
8 | }
9 | printf("\n");
10 | }
11 |
12 | /* added in J */
13 | int count()
14 | {
15 | struct node *n;
16 | int c=0;
17 | n=head;
18 | while(n!=NULL)
19 | {
20 | n=n->next;
21 | c++;
22 | }
23 | return c;
24 | }
--------------------------------------------------------------------------------
/test/test_feature_branch/K/main.c:
--------------------------------------------------------------------------------
1 | /* added in A */
2 | int str_len(char *string)
3 | {
4 | char *count = string;
5 | while(*count) {count++;}
6 | return count - string;
7 | }
8 |
9 | /* str_append is deleted in B */
10 |
11 | int str_equals(char *equal1, char *eqaul2); // Forward decl
12 |
13 | /* added in B, edited in C */
14 | char* str_append_chr(char* string, char append) {
15 | char* newstring = "";
16 | int i = 0;
17 | for(i = 0; i < str_len(subject); i++) {
18 | if (subject[i] == search[0]) {
19 | int e = 0;
20 | char* calc = "";
21 | for(e = 0; e < str_len(search); e++) {
22 | if(subject[i+e] == search[e]) {
23 | calc = str_append_chr(calc, search[e]);
24 | }
25 | }
26 | if (str_equals(search, calc) == 0) {
27 | if(count > 0) {
28 | newstring = str_append(newstring, replace);
29 | i = i + str_len (search)-1;
30 | count = count - 1;
31 | }
32 | else {
33 | newstring = str_append_chr(newstring, subject[i]);
34 | }
35 |
36 | }
37 | else {
38 | newstring = str_append_chr(newstring, subject[i]);
39 | }
40 | }
41 | else {
42 | newstring = str_append_chr(newstring, subject[i]);
43 | }
44 | }
45 | return newstring;
46 | }
47 |
48 | /* added in B */
49 | int str_equals(char *equal1, char *eqaul2)
50 | {
51 | while(*equal1==*eqaul2)
52 | {
53 | if ( *equal1 == '\0' || *eqaul2 == '\0' ){break;}
54 | equal1++;
55 | eqaul2++;
56 | }
57 | if(*eqaul1 == '\0' && *eqaul2 == '\0' ){return 0;}
58 | else {return -1};
59 | }
--------------------------------------------------------------------------------
/test/test_feature_branch/cg.dot:
--------------------------------------------------------------------------------
1 | digraph test_feature_branch {
2 | A -> B -> C -> D -> E -> F -> K;
3 | B -> G -> D ;
4 | D -> H -> I -> E ;
5 | A -> J -> F ;
6 | }
--------------------------------------------------------------------------------
/test/test_feature_branch/cg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Persper/code-analytics/3a2eb076153e29bc49b8e67265e04a5321e90af0/test/test_feature_branch/cg.png
--------------------------------------------------------------------------------
/tools/build_history.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import os
5 | import sys
6 | import pickle
7 | import subprocess
8 | from git import Repo
9 | from persper.graphs.analyzer import Analyzer
10 | from persper.graphs.c import CGraph
11 | from persper.util.path import root_path
12 |
13 |
14 | def usage(cmd):
15 | print("Usage: {0} [i]".format(cmd))
16 | print("\tBuild history for data/branch_commits_chunk[i].pickle")
17 |
18 |
19 | def run(i):
20 | repo_path = os.path.join(root_path, 'repos/linux-complete')
21 | pickle_path = os.path.join(
22 | root_path, 'data/branch_commits_chunk' + i + '.pickle')
23 | with open(pickle_path, 'rb') as f:
24 | sha_lst = pickle.load(f)
25 |
26 | az = Analyzer(repo_path, CGraph())
27 | r = Repo(repo_path)
28 | chunk_commits = [r.commit(sha) for sha in sha_lst]
29 | az.build_history(chunk_commits, phase='history-chunk-' + i)
30 |
31 |
32 | def main():
33 | if len(sys.argv) == 2:
34 | i = sys.argv[1]
35 | run(i)
36 | else:
37 | usage(sys.argv[0])
38 |
39 | if __name__ == "__main__":
40 | main()
41 |
--------------------------------------------------------------------------------
/tools/excel_charts/distance.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import math
4 |
5 |
6 | def deviation(map1, map2, index):
7 | n = len(map1)
8 | assert len(map2) == n
9 | var = 0
10 | for func, values in map1.items():
11 | var += (values[index] - map2.get(func, values)[index])**2
12 | return math.sqrt(var / n)
13 |
14 |
15 | def pair_changes(map1, map2, index):
16 | n = len(map1)
17 | assert len(map2) == n
18 | p = 0
19 | keys = list(map1.keys())
20 | for i in range(n - 1):
21 | for j in range(i + 1, n):
22 | d1 = map1[keys[i]][index] - map1[keys[j]][index]
23 | d2 = map2[keys[i]][index] - map2[keys[j]][index]
24 | if d1 == 0 and d2 == 0:
25 | continue
26 | elif d1 == 0 or d2 == 0:
27 | p += 1
28 | elif d1 * d2 < 0:
29 | p += 1
30 | return p
31 |
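A minimal illustration of how these two metrics behave (the maps below are made up for this sketch and mirror the shapes used by `tests/test_distance.py` further down):

```python
import distance

# Each value is a one-element list; index 0 selects that element.
map1 = {'A': [1], 'B': [2], 'C': [3]}
map2 = {'A': [1], 'B': [3], 'C': [2]}   # B and C swap places

# Root-mean-square difference of the selected values: sqrt(2/3) ~= 0.816
print(distance.deviation(map1, map2, 0))

# Number of key pairs whose relative order changes: only (B, C), so 1
print(distance.pair_changes(map1, map2, 0))
```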
--------------------------------------------------------------------------------
/tools/excel_charts/excel.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from openpyxl import Workbook
4 |
5 |
6 | def fillout(worksheet, position, data):
7 | row = position[0]
8 | column = position[1]
9 | for i, array in enumerate(data):
10 | for j, value in enumerate(array):
11 | worksheet.cell(row=row + i, column=column + j, value=value)
12 |
13 |
14 | def fillin(worksheet, position, num_rows, num_columns):
15 | row = position[0]
16 | column = position[1]
17 | data = [[None for _ in range(num_columns)] for _ in range(num_rows)]
18 | for i in range(num_rows):
19 | for j in range(num_columns):
20 | data[i][j] = worksheet.cell(row=row + i, column=column + j).value
21 | return data
22 |
23 |
24 | def cell(worksheet, position, step_row=0, step_column=0, index=0):
25 | row = position[0]
26 | column = position[1]
27 | return worksheet.cell(row=row + index * step_row,
28 | column=column + index * step_column)
29 |
30 |
31 | def sheet(workbook, sheet_name):
32 | try:
33 | return workbook[sheet_name]
34 | except KeyError:
35 | return None
36 |
37 |
38 | def main():
39 | wb = Workbook()
40 | ws = wb.active
41 | data = [[x] for x in range(10)]
42 | fillout(ws, (1, 1), data)
43 |
44 | data = [[x, 2 * x] for x in range(10)]
45 | fillout(ws, (2, 2), data)
46 |
47 | data = [['Sheet1 rank distance', 'Sheet1 value distance'],
48 | [0.0, 0.5], [1.0, 0.5]]
49 | fillout(ws, (1, 2), data)
50 |
51 | print(sheet(wb, 'Sheet'))
52 | print(sheet(wb, 'InvalidSheetName'))
53 |
54 | print(fillin(ws, (1, 1), 10, 1))
55 | print(fillin(ws, (2, 2), 10, 2))
56 |
57 | i = 0
58 | while True:
59 | c = cell(ws, (2, 1), step_column=2, index=i)
60 | if c.value is None:
61 | break
62 | print(c.value)
63 | i += 1
64 |
65 | wb.save('check.xlsx')
66 |
67 |
68 | if __name__ == '__main__':
69 | main()
70 |
--------------------------------------------------------------------------------
/tools/excel_charts/gini/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 |
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 |
60 | # Scrapy stuff:
61 | .scrapy
62 |
63 | # Sphinx documentation
64 | docs/_build/
65 |
66 | # PyBuilder
67 | target/
68 |
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 |
72 | # pyenv
73 | .python-version
74 |
75 | # celery beat schedule file
76 | celerybeat-schedule
77 |
78 | # dotenv
79 | .env
80 |
81 | # virtualenv
82 | venv/
83 | ENV/
84 |
85 | # Spyder project settings
86 | .spyderproject
87 |
88 | # Rope project settings
89 | .ropeproject
90 |
--------------------------------------------------------------------------------
/tools/excel_charts/gini/README.md:
--------------------------------------------------------------------------------
1 | # gini
2 | A Gini coefficient calculator in Python.
3 |
4 | ## Overview
5 | This is a function that calculates the Gini coefficient of a numpy array. Gini coefficients are often used to quantify income inequality; read more [here](http://www.statsdirect.com/help/default.htm#nonparametric_methods/gini.htm).
6 |
7 | The function in ```gini.py``` is based on the third equation from [here](http://www.statsdirect.com/help/default.htm#nonparametric_methods/gini.htm), which defines the Gini coefficient as:
8 |
9 | 
10 |
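Written out explicitly (this matches the implementation in ```gini.py```), with the sample sorted in ascending order so that x_1 <= x_2 <= ... <= x_n:

```
G = \frac{\sum_{i=1}^{n} (2i - n - 1)\, x_i}{n \sum_{i=1}^{n} x_i}
```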
11 |
12 | ## Examples
13 | For a very unequal sample, 999 zeros and a single one,
14 | ```
15 | >>> from gini import *
16 | >>> a = np.zeros((1000))
17 | >>> a[0] = 1.0
18 | ```
19 | the Gini coefficient is very close to 1.0:
20 | ```
21 | >>> gini(a)
22 | 0.99890010998900103
23 | ```
24 |
25 | For uniformly distributed random numbers, it will be low, around 0.33:
26 | ```
27 | >>> s = np.random.uniform(-1,0,1000)
28 | >>> gini(s)
29 | 0.3295183767105907
30 | ```
31 |
32 | For a homogeneous sample, the Gini coefficient is 0.0:
33 | ```
34 | >>> b = np.ones((1000))
35 | >>> gini(b)
36 | 0.0
37 | ```
38 |
39 | ## Input Assumptions
40 | By definition, the Gini calculation requires a 1d vector of non-zero, positive values sorted in ascending order. This is all dealt with inside [```gini()```](https://github.com/oliviaguest/gini/blob/master/gini.py), so these four assumptions may be violated by the caller; the function controls for them:
41 | ``` python
42 | def gini(array):
43 | """Calculate the Gini coefficient of a numpy array."""
44 | # based on bottom eq: http://www.statsdirect.com/help/content/image/stat0206_wmf.gif
45 | # from: http://www.statsdirect.com/help/default.htm#nonparametric_methods/gini.htm
46 | array = array.flatten() #all values are treated equally, arrays must be 1d
47 | if np.amin(array) < 0:
48 | array -= np.amin(array) #values cannot be negative
49 | array += 0.0000001 #values cannot be 0
50 | array = np.sort(array) #values must be sorted
51 | index = np.arange(1,array.shape[0]+1) #index per array element
52 | n = array.shape[0]#number of array elements
53 | return ((np.sum((2 * index - n - 1) * array)) / (n * np.sum(array))) #Gini coefficient
54 | ```
55 |
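A quick sanity check of that claim (a small sketch, assuming ```gini.py``` from this directory is importable): an unsorted array containing a negative value gives the same coefficient as a manually shifted and sorted copy, because the function normalizes its input internally.

```python
import numpy as np
from gini import gini

a = np.array([3.0, -1.0, 2.0])   # unsorted, contains a negative value
b = np.sort(a - a.min())         # shifted to be non-negative and sorted

print(gini(a))                   # both calls print the same coefficient,
print(gini(b))                   # since gini() shifts and sorts internally
```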
56 | ## Notes
57 | * It is significantly faster than (the [current implementation of](https://github.com/pysal/pysal/issues/855)) PySAL's Gini coefficient function (see [pysal.inequality.gini](http://pysal.readthedocs.io/en/latest/_modules/pysal/inequality/gini.html)) and outputs are indistinguishable to approximately 6 decimal places. In other words, the two functions are arithmetically identical.
58 |
59 | * It is slightly faster than the [Gini coefficient function by David on Ellipsix](http://www.ellipsix.net/blog/2012/11/the-gini-coefficient-for-distribution-inequality.html).
60 |
61 | Many other Gini coefficient functions found online do not produce equivalent results, which is why I wrote this one.
62 |
--------------------------------------------------------------------------------
/tools/excel_charts/gini/gini.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Persper/code-analytics/3a2eb076153e29bc49b8e67265e04a5321e90af0/tools/excel_charts/gini/gini.png
--------------------------------------------------------------------------------
/tools/excel_charts/gini/gini.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | def gini(array):
4 | """Calculate the Gini coefficient of a numpy array."""
5 | # based on bottom eq:
6 | # http://www.statsdirect.com/help/generatedimages/equations/equation154.svg
7 | # from:
8 | # http://www.statsdirect.com/help/default.htm#nonparametric_methods/gini.htm
9 | # All values are treated equally, arrays must be 1d:
10 | array = array.flatten()
11 | if np.amin(array) < 0:
12 | # Values cannot be negative:
13 | array -= np.amin(array)
14 | # Values cannot be 0:
15 | array += 0.0000001
16 | # Values must be sorted:
17 | array = np.sort(array)
18 | # Index per array element:
19 | index = np.arange(1,array.shape[0]+1)
20 | # Number of array elements:
21 | n = array.shape[0]
22 | # Gini coefficient:
23 | return ((np.sum((2 * index - n - 1) * array)) / (n * np.sum(array)))
24 |
--------------------------------------------------------------------------------
/tools/excel_charts/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Persper/code-analytics/3a2eb076153e29bc49b8e67265e04a5321e90af0/tools/excel_charts/tests/__init__.py
--------------------------------------------------------------------------------
/tools/excel_charts/tests/draw_charts_test.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Persper/code-analytics/3a2eb076153e29bc49b8e67265e04a5321e90af0/tools/excel_charts/tests/draw_charts_test.xlsx
--------------------------------------------------------------------------------
/tools/excel_charts/tests/test_distance.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import distance
4 | import math
5 | from statistics import mean
6 | from statistics import pstdev
7 | from random import random
8 | import unittest
9 |
10 | class TestDistanceMethods(unittest.TestCase):
11 |
12 | def test_deviation(self):
13 | n = 1000000
14 | r = [random() for x in range(n)]
15 | m = mean(r)
16 | map1 = {}
17 | map2 = {}
18 | for i, v in enumerate(r):
19 | map1[i] = [v]
20 | map2[i] = [m]
21 | d1 = distance.deviation(map1, map2, 0)
22 | d2 = pstdev(r, m)
23 | self.assertTrue(math.isclose(d1, d2))
24 |
25 | def test_pair_changes(self):
26 | map1 = {'A': [1], 'B': [2], 'C': [3], 'D': [4], 'E': [5]}
27 | map2 = {'A': [1], 'B': [3], 'C': [2], 'D': [4], 'E': [5]}
28 | map3 = {'A': [3], 'B': [2], 'C': [1], 'D': [4], 'E': [5]}
29 | map4 = {'A': [5], 'B': [1], 'C': [2], 'D': [3], 'E': [4]}
30 | self.assertEqual(distance.pair_changes(map1, map1, 0), 0)
31 | self.assertEqual(distance.pair_changes(map1, map2, 0), 1)
32 | self.assertEqual(distance.pair_changes(map1, map3, 0), 3)
33 | self.assertEqual(distance.pair_changes(map1, map4, 0), 4)
34 |
35 | if __name__ == '__main__':
36 | unittest.main()
37 |
38 |
--------------------------------------------------------------------------------
/tools/jira_stats/collect_git_urls.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import argparse
4 | import os
5 | import re
6 | import requests
7 | import sys
8 |
9 |
10 | def main():
11 | parser = argparse.ArgumentParser(
12 | description='Collect Apache and GitHub repo URLs of Apache projects')
13 | parser.add_argument('-f', '--file', required=True,
14 | help='the output file')
15 | args = parser.parse_args()
16 |
17 | if os.path.isfile(args.file):
18 | sys.exit('Error: output file already exists!')
19 |
20 | out_file = open(args.file, 'w')
21 |
22 | apache_git = 'https://git.apache.org/'
23 |
24 | resp = requests.get(apache_git)
25 |
26 |     pattern = re.compile(r'<td>(.+?)</td>\s*'
27 |                          r'<td>\s*<a href="(.+?)">.+?</a>\s*</td>\s*'
28 |                          r'<td>\s*<a href="(.+?)">.+?</a>\s*</td>')
29 |
30 | for match in pattern.finditer(resp.text):
31 | name = match.group(1)
32 | apache_repo = match.group(2)
33 | github_repo = match.group(3)
34 | print(name, apache_repo, github_repo, sep=',', file=out_file)
35 |
36 | out_file.close()
37 |
38 |
39 | if __name__ == '__main__':
40 | main()
41 |
--------------------------------------------------------------------------------
/tools/jira_stats/process_stats.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import argparse
4 | import os
5 | import re
6 |
7 |
8 | def find_github(name, urls):
9 | candidates = []
10 | target = set(x.lower() for x in name.split() if len(x) > 1)
11 | for item in urls:
12 | name_set = set(x.lower() for x in item['name'].split() if len(x) > 1)
13 | if target <= name_set:
14 | candidates.append({
15 | 'name': item['name'],
16 | 'github_repo': item['github_repo']
17 | })
18 | return candidates
19 |
20 |
21 | def get_issue_stats(file_path):
22 | issue_stats = []
23 | with open(file_path, 'r') as stats:
24 | for line in stats:
25 | name, key, id, count, \
26 | feature, bug, improvement, maintenance, \
27 | high, mid, low = line.split(',')
28 | if name == 'name' and key == 'key':
29 | continue
30 | issue_stats.append({
31 | 'name': name, 'key': key, 'id': id, 'count': count,
32 | 'feature': feature, 'bug': bug,
33 | 'improvement': improvement, 'maintenance': maintenance,
34 | 'high': high, 'mid': mid, 'low': low
35 | })
36 | return issue_stats
37 |
38 |
39 | def main():
40 | parser = argparse.ArgumentParser(
41 |         description='Select projects to produce the config file for '
42 | 'the JIRA issue crawler')
43 | parser.add_argument('-s', '--stats-file', required=True,
44 | help='the project issue stats file '
45 | 'produced by global_stats')
46 | parser.add_argument('-u', '--url-file', required=True,
47 | help='the git url file produced by collect_git_urls')
48 | parser.add_argument('-d', '--parent-dir', required=True,
49 | help='the dir to contain repos')
50 | parser.add_argument('-o', '--output-file', required=True,
51 | help='output file')
52 | args = parser.parse_args()
53 |
54 | issue_stats = get_issue_stats(args.stats_file)
55 |
56 | project_urls = []
57 | with open(args.url_file, 'r') as urls:
58 | for line in urls:
59 | name, apache_repo, github_repo = line.split(',')
60 | project_urls.append({
61 | 'name': name,
62 | 'apache_repo': apache_repo,
63 | 'github_repo': github_repo
64 | })
65 |
66 | out_file = open(args.output_file, 'w')
67 | empty_file = open(args.output_file + '.empty', 'w')
68 |
69 | re_name = re.compile(r'https://github\.com/apache/(\S+)')
70 | for project in issue_stats:
71 | candidates = find_github(project['name'], project_urls)
72 | if len(candidates) == 0:
73 | print(args.parent_dir, project['key'], 'master',
74 | sep='\t', file=empty_file)
75 | continue
76 | for candidate in candidates:
77 | github = candidate['github_repo'].strip()
78 | dir_name = re_name.search(github).group(1)
79 | path = os.path.join(args.parent_dir, dir_name)
80 | print(path, project['key'], 'master', github + '.git',
81 | sep='\t', file=out_file)
82 |
83 | empty_file.close()
84 | out_file.close()
85 |
86 |
87 | if __name__ == '__main__':
88 | main()
89 |
--------------------------------------------------------------------------------
/tools/repo_crawler/.gitignore:
--------------------------------------------------------------------------------
1 | *-issues
2 | deleted.files
3 |
--------------------------------------------------------------------------------
/tools/repo_crawler/README.md:
--------------------------------------------------------------------------------
1 | ## Data set format
2 |
3 | Each [project]-issues directory contains JIRA issues and GitHub pull request
4 | (PR) comments of the project. Only issues resolved and PRs closed by commits
5 | are included.
6 |
7 | In a project directory, every file starts with the commit hash (first ten
8 | digits) that the issue/PR is associated with. You can browse the commit via
9 | https://github.com/[user]/[project]/commit/[hash]. E.g.,
10 | https://github.com/apache/spark/commit/b8aec6cd23.
11 |
12 | There are two types of files.
13 |
14 | 1. [hash]-[PROJECT]-[#].xml is an XML representation of the JIRA issue. You can
15 | browse the original issue via
16 | https://issues.apache.org/jira/browse/[PROJECT]-[#]. E.g.,
17 | https://issues.apache.org/jira/browse/SPARK-10474.
18 |
19 | 2. [hash]-GitHub-[#].xml is an XML representation of the PR conversation. You
20 | can browse the original PR via https://github.com/[user]/[project]/pull/[#].
21 | E.g., https://github.com/apache/spark/pull/13796.
22 |
23 | In addition, there are shadow files whose names start with `.invalid.`. Users of
24 | this data set can ignore them; they mark issue/PR references in commit messages
25 | that turned out to be invalid.
26 |
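As a small illustration of this layout, the sketch below (hypothetical: the directory name `spark-issues` and this helper script are not part of the data set) groups the files of one [project]-issues directory by their leading commit-hash prefix, skipping the `.invalid.` shadow files:

```python
import os
from collections import defaultdict

# Hypothetical example; substitute any [project]-issues directory.
issues_dir = "spark-issues"

by_commit = defaultdict(list)
for name in os.listdir(issues_dir):
    if name.startswith(".invalid."):
        continue                          # shadow file: wrong info in a commit message
    by_commit[name[:10]].append(name)     # file names start with the leading
                                          # ten characters of the commit hash

for commit_hash, files in sorted(by_commit.items()):
    print(commit_hash, files)
```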
--------------------------------------------------------------------------------
/tools/repo_crawler/github_comments.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import dicttoxml
3 | import github3
4 | import string
5 | import threading
6 | import time
7 | import xml.etree.ElementTree as ET
8 |
9 | class GitHubComments:
10 | def __init__(self, user = None, password = None, limit_per_min=81):
11 | self.gh = github3.login(user, password)
12 | self._limit_per_min = limit_per_min
13 |
14 | self._lock = threading.Lock()
15 | self._last_time = time.time()
16 | self._rest = limit_per_min
17 |
18 | def login(self, user, password):
19 | self.gh = github3.login(user, password)
20 |
21 | def get_lease(self):
22 | with self._lock:
23 | if self._rest > 0:
24 | self._rest -= 1
25 | return True
26 | elif time.time() - self._last_time > 60:
27 | self._rest = self._limit_per_min - 1
28 | self._last_time = time.time()
29 | return True
30 | else:
31 | return False
32 |
33 | def download(self, user, repo, num, file_path):
34 | while not self.get_lease():
35 | time.sleep(5)
36 | pr = self.gh.pull_request(user, repo, num)
37 | comments = ET.Element('comments')
38 | for comment in pr.issue_comments():
39 | snippet = dicttoxml.dicttoxml(comment.as_dict(),
40 | attr_type=False,
41 | custom_root='comment')
42 | snippet = ''.join(x for x in snippet if x in string.printable)
43 | comments.append(ET.fromstring(snippet))
44 | for comment in pr.review_comments():
45 | snippet = dicttoxml.dicttoxml(comment.as_dict(),
46 | attr_type=False,
47 | custom_root='comment')
48 | snippet = ''.join(x for x in snippet if x in string.printable)
49 | comments.append(ET.fromstring(snippet))
50 | return ET.ElementTree(comments).write(file_path, encoding="utf-8")
51 |
52 | def add_args(parser):
53 | parser.add_argument('-u', '--github-user',
54 | help='user name of a GitHub account',
55 | type=str, required=True)
56 | parser.add_argument('-p', '--github-password',
57 | help='password of a GitHub account',
58 | type=str, required=True)
59 |
60 | def main():
61 | parser = argparse.ArgumentParser()
62 | add_args(parser)
63 | args = parser.parse_args()
64 |
65 | ghc = GitHubComments(args.github_user, args.github_password)
66 | ghc.download('apache', 'spark', 8060, '8060.xml')
67 | ghc.download('apache', 'spark', 8069, '8069.xml')
68 |
69 | if __name__ == '__main__':
70 | main()
71 |
--------------------------------------------------------------------------------
/tools/repo_crawler/jira_issue.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import time
4 | import urllib
5 |
6 | _URL_PREFIX_XML = "https://issues.apache.org/jira/si/jira.issueviews:issue-xml/"
7 | _URL_SUFFIX_XML = ".xml"
8 |
9 | class JiraIssue:
10 | def __init__(self,
11 | url_prefix=_URL_PREFIX_XML,
12 | url_suffix=_URL_SUFFIX_XML):
13 | self.url_prefix = url_prefix
14 | self.url_suffix = url_suffix
15 |
16 | def download(self, issue_id, dir_path, file_name):
17 | url = self.url_prefix + issue_id + "/" + issue_id + self.url_suffix
18 | file_path = os.path.join(dir_path, file_name)
19 | invalid_path = os.path.join(dir_path, ".invalid." + file_name)
20 | if os.path.isfile(file_path) or os.path.isfile(invalid_path):
21 | return
22 | for i in range(3):
23 | try:
24 | print urllib.urlretrieve(url, file_path)[0]
25 | with open(file_path, 'r') as downloaded:
26 | if "Oops, you've found a dead link." in \
27 | downloaded.read():
28 | os.rename(file_path, invalid_path)
29 | print "Invalid issue ID:", invalid_path
30 | break
31 | except Exception as e:
32 | if i == 2:
33 | print "[Error] JiraIssue.download: ", type(e), e
34 | else:
35 | time.sleep(10)
36 |
37 | if __name__ == "__main__":
38 | if len(sys.argv) != 3:
39 | print sys.argv[0] + " ISSUE_ID FILE_PATH"
40 | sys.exit(1)
41 | jira_issue = JiraIssue()
42 |     jira_issue.download(sys.argv[1], os.path.dirname(sys.argv[2]), os.path.basename(sys.argv[2]))
43 |
--------------------------------------------------------------------------------
/tools/repo_crawler/repo.config:
--------------------------------------------------------------------------------
1 | ../../repos/hbase HBASE rel/1.3.1 https://github.com/apache/hbase.git
2 | ../../repos/spark SPARK v2.1.1 https://github.com/apache/spark.git
3 | ../../repos/zookeeper ZOOKEEPER release-3.5.3 https://github.com/apache/zookeeper.git
4 | ../../repos/incubator-systemml SYSTEMML v0.14.0-incubating-rc4 https://github.com/apache/incubator-systemml.git
5 | ../../repos/maven MNG maven-3.5.0 https://github.com/apache/maven.git
6 | ../../repos/cassandra CASSANDRA cassandra-3.11.0 https://github.com/apache/cassandra.git
7 | ../../repos/couchdb COUCHDB 2.0.0 https://github.com/apache/couchdb.git
8 | ../../repos/hive HIVE release-2.3.0-rc0 https://github.com/apache/hive.git
9 | ../../repos/activemq AMQ activemq-5.15.0 https://github.com/apache/activemq.git
10 | ../../repos/beam BEAM v2.0.0 https://github.com/apache/beam.git
11 | ../../repos/cloudstack CLOUDSTACK 4.9.2.0 https://github.com/apache/cloudstack.git
12 | ../../repos/ambari AMBARI release-2.5.1 https://github.com/apache/ambari.git
13 | ../../repos/geode GEODE rel/v1.1.1 https://github.com/apache/geode.git
14 | ../../repos/jackrabbit JCR jackrabbit-2.15.4 https://github.com/apache/jackrabbit.git
15 | ../../repos/airavata AIRAVATA airavata-0.16 https://github.com/apache/airavata.git
16 | ../../repos/ant-ivy IVY 2.4.0 https://github.com/apache/ant-ivy.git
17 | ../../repos/archiva MRM archiva-2.2.3 https://github.com/apache/archiva.git
18 | ../../repos/arrow ARROW apache-arrow-0.4.1 https://github.com/apache/arrow.git
19 | ../../repos/avro AVRO release-1.8.2 https://github.com/apache/avro.git
20 | ../../repos/buildr BUILDR 1.4.25 https://github.com/apache/buildr.git
21 | ../../repos/camel CAMEL camel-2.19.1 https://github.com/apache/camel.git
22 |
--------------------------------------------------------------------------------
/tools/repo_crawler/setup.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 |
3 | sudo apt install -y python
4 | sudo apt install -y python-pip
5 | sudo pip install --upgrade pip
6 | sudo pip install sh
7 |
8 | sudo apt install -y libssl-dev
9 | sudo pip install --pre github3.py
10 | sudo pip install dicttoxml
11 |
--------------------------------------------------------------------------------
/tools/repo_creater/README.md:
--------------------------------------------------------------------------------
1 | # Repo Creater Tool
2 |
3 | **Goal**: To be able to quickly create a fake development history for testing purposes
4 |
5 | # Workflow
6 | 1. `cd test` and `mkdir <repo_name>`
7 | 2. For each commit in the fake history, `mkdir <commit_name>`
8 | 3. Add source files for each commit
9 | 4. Write the commit graph to a `cg.dot` file; see `test/test_feature_branch/cg.dot` for an example. You can also plot it for inspection with `dot -Tpng cg.dot -o cg.png`
10 | 5. Run repo_creater tool
11 | ```
12 | cd tools/repo_creater
13 | ./create_repo.py ../../test/<repo_name>
14 | ```
15 | The newly created repo has the same name and will be placed under the `repos/` folder.
16 |
17 | 6. Examine repo history
18 | ```
19 | cd repos/<repo_name>
20 | git log --graph
21 | # alternatively, to see only master
22 | git log --first-parent
23 | ```
24 |
25 | # Assumptions
26 | - Merges only happen on the master branch
27 | - No merge conflicts need to be resolved manually
28 | - All files dwell directly under `<commit_name>/` (not in subfolders)
29 |
30 |
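For a concrete example of this layout, the `test/test_feature_branch` test (whose `cg.dot` is referenced in the workflow above) is organized as follows; the commit directories A-H are elided here and hold their commits' source files just like I, J, and K:

```
test/test_feature_branch/
├── A/ ... H/        # one directory per fake commit (contents omitted here)
├── I/
│   ├── feature-G.c
│   ├── feature-H.c
│   └── main.c
├── J/
│   ├── feature-J.c
│   └── main.c
├── K/
│   ├── feature-G.c
│   ├── feature-H.c
│   ├── feature-K.c
│   └── main.c
└── cg.dot
```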
--------------------------------------------------------------------------------
/tools/repo_stats/setup_ubuntu.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 |
3 | sudo apt install -y python3 python3-pip
4 | sudo -H pip3 install --upgrade pip
5 | sudo -H pip3 install sh
6 |
--------------------------------------------------------------------------------
/tools/repo_stats/stats_author.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import argparse
5 | import os
6 | import subprocess
7 | import sys
8 |
9 |
10 | def stats_commits(repo_path, branch, author_stats=None):
11 | if not author_stats:
12 | author_stats = { }
13 | git_cmd = ['git', '-C', repo_path, 'checkout', branch]
14 | subprocess.check_output(git_cmd)
15 | git_cmd = ['git', '--no-pager', '-C', repo_path, 'shortlog', '-sn']
16 | p = subprocess.Popen(git_cmd, stdout=subprocess.PIPE)
17 | with os.fdopen(os.dup(p.stdout.fileno())) as commits_per_author:
18 | for line in commits_per_author:
19 | num, name = [s.strip() for s in line.split('\t')]
20 | if name not in author_stats:
21 | author_stats[name] = {'n_commits': int(num)}
22 | else:
23 | author_stats[name]['n_commits'] = int(num)
24 | return author_stats
25 |
26 |
27 | def main():
28 | parser = argparse.ArgumentParser(
29 | description='List author stats of git repo(s)')
30 | parser.add_argument('-c', '--count-commits', metavar='DIR',
31 | help='Git repo dir to list authors and their # commits')
32 | parser.add_argument('-b', '--branch', default='master',
33 | help='Branch of the repo to analyze')
34 | parser.add_argument('-a', '--count-authors', metavar='DIR', nargs='+',
35 | help='Multiple git repos to list their # authors')
36 | args = parser.parse_args()
37 | if args.count_commits:
38 | if not os.path.isdir(args.count_commits):
39 |             sys.exit('Error: ' + args.count_commits + ' is not a valid dir!')
40 | author_stats = stats_commits(args.count_commits, args.branch)
41 | for name, stats in sorted(author_stats.items(),
42 | key=lambda x: x[1]['n_commits'],
43 | reverse=True):
44 | print(name, stats['n_commits'], sep=',')
45 | elif args.count_authors:
46 | project_authors = { }
47 | for d in args.count_authors:
48 | if os.path.isfile(d) or d.startswith('.'):
49 | continue
50 | repo_name = os.path.basename(os.path.normpath(d))
51 | print('Parsing ' + repo_name)
52 | project_authors[repo_name] = stats_commits(d, args.branch)
53 | for repo_name, author_stats in sorted(project_authors.items(),
54 | key=lambda x: len(x[1]),
55 | reverse=True):
56 | print(repo_name, len(author_stats), sep=',')
57 | else:
58 | sys.exit('Error: see -h for usage.')
59 |
60 |
61 | if __name__ == '__main__':
62 | main()
63 |
64 |
--------------------------------------------------------------------------------
/tools/repo_stats/stats_pr.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import argparse
5 | import os
6 | import re
7 | import sys
8 |
9 | from sh.contrib import git
10 | from sh import wc
11 |
12 |
13 | def jira_issue(commit_message, key):
14 | if key is None:
15 | return []
16 | matches = re.findall(key + "-\d+(?!\d*.\d+)", commit_message, re.IGNORECASE)
17 | return [m.upper() for m in matches]
18 |
19 |
20 | def parse_pr(commit_message):
21 | matches = re.findall("(?:close[ds]*|"
22 | "pull\s*request|"
23 | "fix(?:e[ds])?|"
24 | "merge[ds]*)"
25 | "\s*#\d+",
26 | commit_message, re.IGNORECASE)
27 | return [m.split('#')[-1] for m in matches]
28 |
29 |
30 | def num_commits(repo_dir):
31 | git_repo = git.bake('-C', os.path.expanduser(repo_dir))
32 | logs = git_repo.log('--oneline', '--first-parent')
33 | n = wc(logs, '-l')
34 | return int(n)
35 |
36 |
37 | def stats_pr(repo_dir, key, begin, end):
38 | """Lists the number of PR/issue-based commits in the range
39 | """
40 | git_repo = git.bake('-C', os.path.expanduser(repo_dir))
41 | num = 0
42 | prs = []
43 | for i in range(begin, end):
44 | message = str(git_repo.log('--first-parent', '-1', 'HEAD~' + str(i)))
45 | pi = []
46 | pi += jira_issue(message, key)
47 | pi += parse_pr(message)
48 | if pi:
49 | num += 1
50 | prs += pi
51 | return num, prs
52 |
53 |
54 | def main():
55 | parser = argparse.ArgumentParser(
56 | description='Stats commits through pull requests/issues')
57 | parser.add_argument('-n', '--num-groups', type=int, required=True,
58 | help='number of groups of commits in stats')
59 | parser.add_argument('-d', '--dir', required=True,
60 | help='dir of the git repo')
61 | parser.add_argument('-k', '--key', help='key of JIRA issue')
62 | parser.add_argument('-t', '--tag', help='tag to check out of the repo')
63 | parser.add_argument('-m', '--max', type=int,
64 | help='max number of commits to process')
65 | args = parser.parse_args()
66 |
67 | if not os.path.isdir(args.dir):
68 | sys.exit('Error: ' + args.dir + ' is not a valid dir!')
69 |
70 | if args.tag:
71 | git_repo = git.bake('-C', os.path.expanduser(args.dir))
72 | git_repo.checkout(args.tag)
73 |
74 | print(os.path.basename(os.path.normpath(args.dir)))
75 | n = num_commits(args.dir)
76 |     if args.max is not None and args.max < n:
77 | n = args.max
78 | n //= args.num_groups
79 | for i in reversed(range(args.num_groups)):
80 | np, prs = stats_pr(args.dir, args.key, i * n, (i + 1) * n)
81 | print(np / n, end=',')
82 | print('"{0}"'.format(','.join(prs)))
83 |
84 |
85 | if __name__ == '__main__':
86 | main()
87 |
--------------------------------------------------------------------------------
/tools/repo_stats/stats_pr.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | N=100
4 | M=1000000
5 |
6 | DIR=results
7 |
8 | mkdir -p $DIR
9 |
10 | ./stats_pr.py -n $N -d repos/hbase -t rel/1.3.0 -k HBASE -m $M > $DIR/hbase.pr.csv &
11 | ./stats_pr.py -n $N -d repos/spark -t v2.1.0 -k SPARK -m $M > $DIR/spark.pr.csv &
12 | ./stats_pr.py -n $N -d repos/zookeeper -t release-3.4.9 -k ZOOKEEPER -m $M > $DIR/zookeeper.pr.csv &
13 | ./stats_pr.py -n $N -d repos/incubator-systemml -t v0.14.0-incubating-rc4 -k SYSTEMML -m $M > $DIR/systemml.pr.csv &
14 | ./stats_pr.py -n $N -d repos/maven -t maven-3.3.9 -k MNG -m $M > $DIR/maven.pr.csv &
15 | ./stats_pr.py -n $N -d repos/cassandra -t cassandra-3.10 -k CASSANDRA -m $M > $DIR/cassandra.pr.csv &
16 | ./stats_pr.py -n $N -d repos/couchdb -t 2.0.0 -k COUCHDB -m $M > $DIR/couchdb.pr.csv &
17 | ./stats_pr.py -n $N -d repos/hive -t rel/release-2.1.1 -k HIVE -m $M > $DIR/hive.pr.csv &
18 | ./stats_pr.py -n $N -d repos/rails -t v5.1.1 -m $M > $DIR/rails.pr.csv &
19 | ./stats_pr.py -n $N -d repos/opencv -t 3.2.0 -m $M > $DIR/opencv.pr.csv &
20 | ./stats_pr.py -n $N -d repos/tensorflow -t v1.1.0 -m $M > $DIR/tensorflow.pr.csv &
21 | ./stats_pr.py -n $N -d repos/vagrant -t v1.9.4 -m $M > $DIR/vagrant.pr.csv &
22 | ./stats_pr.py -n $N -d repos/jekyll -t v3.4.3 -m $M > $DIR/jekyll.pr.csv &
23 | ./stats_pr.py -n $N -d repos/discourse -t v1.7.8 -m $M > $DIR/discourse.pr.csv &
24 |
25 | for pid in $(jobs -p)
26 | do
27 | wait $pid
28 | done
29 |
30 |
--------------------------------------------------------------------------------
/tools/repo_stats/test/couchdb.pr.csv:
--------------------------------------------------------------------------------
1 | couchdb
2 | 0.0,""
3 | 0.2,"COUCHDB-1911,COUCHDB-1853"
4 | 0.1,"COUCHDB-1922"
5 | 0.0,""
6 | 0.0,""
7 | 0.1,"COUCHDB-1923"
8 | 0.3,"COUCHDB-1647,COUCHDB-1921,COUCHDB-1921"
9 | 0.1,"COUCHDB-1911"
10 | 0.1,"COUCHDB-1668"
11 | 0.0,""
12 | 0.2,"COUCHDB-1986,COUCHDB-1795,COUCHDB-1962"
13 | 0.0,""
14 | 0.0,""
15 | 0.0,""
16 | 0.0,""
17 | 0.0,""
18 | 0.0,""
19 | 0.0,""
20 | 0.1,"COUCHDB-2031"
21 | 0.0,""
22 | 0.2,"COUCHDB-2040,COUCHDB-2028"
23 | 0.1,"COUCHDB-2054"
24 | 0.1,"COUCHDB-1474"
25 | 0.1,"COUCHDB-2086"
26 | 0.1,"COUCHDB-2067"
27 | 0.6,"COUCHDB-2189,170,COUCHDB-1076,COUCHDB-2187,COUCHDB-2170,COUCHDB-2123"
28 | 0.3,"COUCHDB-2196,COUCHDB-1180,COUCHDB-1036,COUCHDB-1180,169"
29 | 0.5,"184,183,COUCHDB-2110,COUCHDB-2166,COUCHDB-2201"
30 | 0.3,"COUCHDB-2169,186,185"
31 | 0.2,"COUCHDB-2209,190"
32 | 0.0,""
33 | 0.1,"COUCHDB-2104"
34 | 0.1,"COUCHDB-1986"
35 | 0.0,""
36 | 0.3,"COUCHDB-1697,200,COUCHDB-2206"
37 | 0.1,"211"
38 | 0.2,"COUCHDB-2136,COUCHDB-2220,COUCHDB-1669"
39 | 0.0,""
40 | 0.2,"COUCHDB-2233,COUCHDB-2200"
41 | 0.1,"COUCHDB-2158"
42 | 0.0,""
43 | 0.4,"COUCHDB-2222,COUCHDB-2153,COUCHDB-2248,COUCHDB-2249"
44 | 0.1,"COUCHDB-2238"
45 | 0.1,"250"
46 | 0.0,""
47 | 0.0,""
48 | 0.1,"COUCHDB-2026"
49 | 0.2,"COUCHDB-1133,COUCHDB-1133"
50 | 0.3,"COUCHDB-1986,COUCHDB-2324,268"
51 | 0.1,"274"
52 | 0.5,"COUCHDB-1432,COUCHDB-2430,260,276,COUCHDB-708"
53 | 0.1,"COUCHDB-2362"
54 | 0.3,"293,295,277"
55 | 0.3,"COUCHDB-1145,291,COUCHDB-2557,296"
56 | 0.2,"272,COUCHDB-2619,306"
57 | 0.1,"COUCHDB-2684"
58 | 0.1,"COUCHDB-2237"
59 | 0.0,""
60 | 0.0,""
61 | 0.0,""
62 | 0.0,""
63 | 0.1,"COUCHDB-2762"
64 | 0.0,""
65 | 0.1,"COUCHDB-2797"
66 | 0.1,"COUCHDB-2824"
67 | 0.1,"COUCHDB-2824"
68 | 0.0,""
69 | 0.0,""
70 | 0.2,"359,COUCHDB-2844"
71 | 0.0,""
72 | 0.1,"COUCHDB-2511"
73 | 0.2,"COUCHDB-2775,COUCHDB-2775"
74 | 0.0,""
75 | 0.2,"COUCHDB-2409,COUCHDB-2674,COUCHDB-2850,COUCHDB-2858,COUCHDB-2666"
76 | 0.5,"COUCHDB-2879,COUCHDB-1447,COUCHDB-2835,COUCHDB-2534,COUCHDB-2859"
77 | 0.2,"COUCHDB-2905,COUCHDB-2897,COUCHDB-2898"
78 | 0.0,""
79 | 0.3,"COUCHDB-2874,COUCHDB-2082,COUCHDB-2082"
80 | 0.0,""
81 | 0.2,"COUCHDB-2938,COUCHDB-2866,COUCHDB-2872"
82 | 0.1,"COUCHDB-2938"
83 | 0.6,"COUCHDB-2968,COUCHDB-2965,COUCHDB-2963,COUCHDB-2959,390,386"
84 | 0.6,"COUCHDB-2978,COUCHDB-2978,COUCHDB-2900,397,396,393,391"
85 | 0.2,"408,COUCHDB-2988"
86 | 0.0,""
87 | 0.0,""
88 | 0.4,"COUCHDB-3016,COUCHDB-3016,COUCHDB-3016,COUCHDB-2990"
89 | 0.5,"COUCHDB-3028,420,421,419,416"
90 | 0.4,"COUCHDB-3054,428,426,COUCHDB-3039"
91 | 0.0,""
92 | 0.1,"COUCHDB-3060"
93 | 0.1,"COUCHDB-3066"
94 | 0.1,"COUCHDB-3070"
95 | 0.6,"COUCHDB-3096,COUCHDB-3089,COUCHDB-3092,COUCHDB-3084,COUCHDB-3084,COUCHDB-3082"
96 | 0.4,"COUCHDB-3060,COUCHDB-3104,COUCHDB-3104,COUCHDB-3104,COUCHDB-2779,COUCHDB-3097,COUCHDB-3099"
97 | 0.3,"COUCHDB-3102,COUCHDB-3017,438"
98 | 0.5,"COUCHDB-3135,COUCHDB-3122,COUCHDB-3134,COUCHDB-3136,COUCHDB-3121,COUCHDB-3114,COUCHDB-3118"
99 | 0.1,"COUCHDB-3132"
100 | 0.0,""
101 | 0.1,"COUCHDB-3143"
102 |
--------------------------------------------------------------------------------
/tools/repo_stats/test/discourse.pr.csv:
--------------------------------------------------------------------------------
1 | discourse
2 | 0.1,"2547"
3 | 0.0,""
4 | 0.1,"2549"
5 | 0.3,"2552,2551,2550"
6 | 0.5,"2546,2530,2537,2553,2554"
7 | 0.2,"2532,2555"
8 | 0.1,"2556"
9 | 0.1,"2557"
10 | 0.0,""
11 | 0.0,""
12 | 0.1,"2560"
13 | 0.2,"2563,2561"
14 | 0.3,"2572,2571,2564"
15 | 0.6,"2568,2566,2565,2548,2575,2573"
16 | 0.3,"2578,2569,2567"
17 | 0.1,"2580"
18 | 0.0,""
19 | 0.4,"2584,2583,2581,2582"
20 | 0.2,"2587,2586"
21 | 0.2,"2589,2588"
22 | 0.5,"2592,2595,2590,2596,2597"
23 | 0.0,""
24 | 0.2,"2585,2599"
25 | 0.3,"2600,2602,2601"
26 | 0.1,"2603"
27 | 0.0,""
28 | 0.3,"2591,2604,2606"
29 | 0.2,"2609,2608"
30 | 0.1,"2607"
31 | 0.0,""
32 | 0.1,"2612"
33 | 0.1,"2613"
34 | 0.2,"2614,2615"
35 | 0.0,""
36 | 0.0,""
37 | 0.4,"2617,2620,2618,2619"
38 | 0.2,"2622,2623"
39 | 0.3,"2625,2624,2627"
40 | 0.1,"2628"
41 | 0.0,""
42 | 0.1,"2632"
43 | 0.0,""
44 | 0.0,""
45 | 0.1,"2634"
46 | 0.0,""
47 | 0.1,"2629"
48 | 0.0,""
49 | 0.3,"2638,2643,2642"
50 | 0.1,"2644"
51 | 0.1,"2646"
52 | 0.0,""
53 | 0.3,"2649,2636,2648"
54 | 0.2,"2652,2651"
55 | 0.1,"2654"
56 | 0.2,"2655,2656"
57 | 0.0,""
58 | 0.2,"2662,2661"
59 | 0.2,"2663,2645"
60 | 0.1,"2658"
61 | 0.1,"2665"
62 | 0.5,"2667,2659,2650,2666,2511"
63 | 0.0,""
64 | 0.0,""
65 | 0.0,""
66 | 0.2,"2671,2669"
67 | 0.1,"2672"
68 | 0.0,""
69 | 0.0,""
70 | 0.2,"2676,2681"
71 | 0.5,"2682,2647,2675,2633,2670"
72 | 0.2,"2678,2683"
73 | 0.1,"2684"
74 | 0.2,"2690,2688"
75 | 0.2,"2677,2689"
76 | 0.3,"2693,2691,2692"
77 | 0.3,"2686,2695,2694"
78 | 0.5,"2700,2685,2696,2698,2699"
79 | 0.0,""
80 | 0.0,""
81 | 0.0,""
82 | 0.1,"3192"
83 | 0.0,""
84 | 0.0,""
85 | 0.0,""
86 | 0.0,""
87 | 0.0,""
88 | 0.0,""
89 | 0.0,""
90 | 0.0,""
91 | 0.0,""
92 | 0.0,""
93 | 0.0,""
94 | 0.0,""
95 | 0.0,""
96 | 0.0,""
97 | 0.0,""
98 | 0.0,""
99 | 0.0,""
100 | 0.0,""
101 | 0.0,""
102 |
--------------------------------------------------------------------------------
/tools/repo_stats/test/jekyll.pr.csv:
--------------------------------------------------------------------------------
1 | jekyll
2 | 0.5,"4410,4429,4424,4423,4404"
3 | 0.5,"4452,4428,4437,4436,4434"
4 | 0.7,"4460,4461,4459,4465,4463,4464,4455"
5 | 0.6,"4374,4496,4487,4485,4484,4473"
6 | 0.3,"4505,4502,4492"
7 | 0.6,"4522,4526,4525,4512,4514,4517"
8 | 0.3,"4546,4543,4535"
9 | 0.2,"4553,4547"
10 | 0.8,"4591,4592,4594,4566,4559,4561,4562,4554"
11 | 0.5,"4545,4606,4597,4583,4589"
12 | 0.5,"4557,4602,4611,4599,4381"
13 | 0.5,"4620,4621,4618,4598,4590"
14 | 0.5,"4635,4630,4637,4633,4601"
15 | 0.4,"4639,4636,4558,4641"
16 | 0.5,"4645,4646,4596,4628,4555"
17 | 0.6,"4658,4659,4660,4647,4653,4652"
18 | 0.3,"4685,4682,4670"
19 | 0.5,"4700,4694,4699,4686,4491"
20 | 0.5,"4704,4706,4542,4533,4474"
21 | 0.5,"4703,4712,4640,3849,4624"
22 | 0.6,"4755,4750,4751,4717,4537,4720"
23 | 0.6,"4756,4760,4741,4763,4758,4759"
24 | 0.4,"4769,4771,4775,4781"
25 | 0.4,"4789,4734,4478,4689"
26 | 0.4,"4804,4754,4813,4786"
27 | 0.6,"4808,4595,4819,4792,4793,4799"
28 | 0.4,"4854,4847,4844,4710"
29 | 0.6,"4863,4872,4874,4867,4857,4855"
30 | 0.5,"4849,4887,4886,4839,4859"
31 | 0.5,"4888,4881,4892,4510,4890"
32 | 0.3,"4848,4903,4902"
33 | 0.1,"4916"
34 | 0.4,"4947,4950,4951,4949"
35 | 0.4,"4948,4931,4933,4934"
36 | 0.3,"4974,4958,4971"
37 | 0.5,"4959,4956,4953,4978,4975"
38 | 0.5,"4980,4976,4966,4977,4962"
39 | 0.6,"4989,4973,4940,4987,4985,4979"
40 | 0.4,"5006,4908,5000,5005"
41 | 0.5,"5009,4917,5012,4993,5010"
42 | 0.4,"4922,5018,5017,5014"
43 | 0.4,"5025,5026,5027,5019"
44 | 0.4,"5030,5031,5032,5024"
45 | 0.5,"5056,5058,5054,5043,5015"
46 | 0.5,"5067,5096,5068,5063,5065"
47 | 0.4,"5100,5097,5069,5098"
48 | 0.5,"5060,5101,5053,5042,5011"
49 | 0.5,"5112,5114,5117,5022,5106"
50 | 0.5,"5124,5119,5115,5113,5116"
51 | 0.4,"5109,5118,5122,5123"
52 | 0.5,"5135,4860,5127,5129,5131"
53 | 0.3,"5138,5139,5137"
54 | 0.3,"5150,5140,5141"
55 | 0.5,"5177,5173,5152,5158,5143"
56 | 0.5,"5178,5168,5164,5154,5156"
57 | 0.5,"5194,5180,5185,5187,5188"
58 | 0.4,"5205,5183,5196,5190"
59 | 0.3,"5226,5214,5167"
60 | 0.5,"5244,5254,5239,5221,5222"
61 | 0.5,"5272,5274,5271,5258,5249"
62 | 0.5,"5291,5286,5287,5273,5279"
63 | 0.5,"5240,5293,5281,5294,5262"
64 | 0.5,"5189,5045,5295,5304,5280"
65 | 0.5,"5318,5334,5320,5308,5224"
66 | 0.4,"5337,5316,5335,5235"
67 | 0.5,"5372,5361,5199,5347,5326"
68 | 0.5,"5237,5369,5364,5381,5383"
69 | 0.4,"5389,5375,5376,5380"
70 | 0.5,"5403,5402,5399,5338,5397"
71 | 0.6,"5413,5416,5325,5421,5420,5405"
72 | 0.4,"5428,5157,5210,5408"
73 | 0.5,"5411,5410,5427,5430,5412"
74 | 0.5,"5212,5256,5431,5441,5264"
75 | 0.4,"5456,5433,5452,4873"
76 | 0.4,"5472,5471,5449,5442"
77 | 0.5,"5497,5494,5479,5489,5491"
78 | 0.5,"5504,5502,5495,5496,5492"
79 | 0.5,"5519,5512,5457,5505,5511"
80 | 0.5,"5538,5532,5529,5459,5466"
81 | 0.4,"5536,5539,5540,5533"
82 | 0.4,"5546,5564,5524,5557"
83 | 0.1,"5526"
84 | 0.5,"5572,5571,5464,5570,5559"
85 | 0.4,"5597,5435,5592,5582"
86 | 0.5,"5573,5587,5600,5608,5605"
87 | 0.5,"5614,5611,5530,5609,5384"
88 | 0.5,"5513,5612,5643,5641,5632"
89 | 0.5,"5655,5658,5657,5653,5652"
90 | 0.4,"5671,5670,5668,5666"
91 | 0.5,"5712,5711,5683,5705,5689"
92 | 0.5,"5726,5720,5688,5713,5693"
93 | 0.5,"5740,5738,5696,5692,5544"
94 | 0.5,"5746,5748,5725,5745,5473"
95 | 0.5,"5758,5761,5744,5752,5621"
96 | 0.5,"5768,5769,5765,5764,5750"
97 | 0.5,"5782,5731,5691,5767,5779"
98 | 0.5,"5780,5791,5784,5312,5781"
99 | 0.3,"5640,5542,5753"
100 | 0.0,""
101 | 0.3,"5968,5940,5924"
102 |
--------------------------------------------------------------------------------
/tools/repo_stats/test/stats_pr.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | N=100
4 | M=1000
5 |
6 | ../stats_pr.py -n $N -d ../repos/hbase -t rel/1.3.0 -k HBASE -m $M > hbase.pr.csv &
7 | ../stats_pr.py -n $N -d ../repos/spark -t v2.1.0 -k SPARK -m $M > spark.pr.csv &
8 | ../stats_pr.py -n $N -d ../repos/zookeeper -t release-3.4.9 -k ZOOKEEPER -m $M > zookeeper.pr.csv &
9 | ../stats_pr.py -n $N -d ../repos/incubator-systemml -t v0.14.0-incubating-rc4 -k SYSTEMML -m $M > systemml.pr.csv &
10 | ../stats_pr.py -n $N -d ../repos/maven -t maven-3.3.9 -k MNG -m $M > maven.pr.csv &
11 | ../stats_pr.py -n $N -d ../repos/cassandra -t cassandra-3.10 -k CASSANDRA -m $M > cassandra.pr.csv &
12 | ../stats_pr.py -n $N -d ../repos/couchdb -t 2.0.0 -k COUCHDB -m $M > couchdb.pr.csv &
13 | ../stats_pr.py -n $N -d ../repos/hive -t rel/release-2.1.1 -k HIVE -m $M > hive.pr.csv &
14 | ../stats_pr.py -n $N -d ../repos/rails -t v5.1.1 -m $M > rails.pr.csv &
15 | ../stats_pr.py -n $N -d ../repos/opencv -t 3.2.0 -m $M > opencv.pr.csv &
16 | ../stats_pr.py -n $N -d ../repos/tensorflow -t v1.1.0 -m $M > tensorflow.pr.csv &
17 | ../stats_pr.py -n $N -d ../repos/vagrant -t v1.9.4 -m $M > vagrant.pr.csv &
18 | ../stats_pr.py -n $N -d ../repos/jekyll -t v3.4.3 -m $M > jekyll.pr.csv &
19 | ../stats_pr.py -n $N -d ../repos/discourse -t v1.7.8 -m $M > discourse.pr.csv &
20 |
21 | for pid in $(jobs -p)
22 | do
23 | wait $pid
24 | done
25 |
26 |
--------------------------------------------------------------------------------
/tools/repo_stats/test/vagrant.pr.csv:
--------------------------------------------------------------------------------
1 | vagrant
2 | 0.4,"6502,5986,5981,6534"
3 | 0.3,"5991,5993,4738"
4 | 0.6,"6150,6149,6073,6071,6050,5999"
5 | 0.6,"6195,6185,6160,6172,6157,6156"
6 | 0.6,"6259,6254,6232,6219,6213,6203"
7 | 0.6,"6364,6322,6318,6307,6305,6288"
8 | 0.6,"6386,6383,6373,6444,6404,6367"
9 | 0.7,"6488,6479,6475,6474,6389,6521,6407"
10 | 0.6,"6515,6536,6535,6493,6382,6489"
11 | 0.5,"6049,6538,6539,6537,6406"
12 | 0.2,"6540,6541"
13 | 0.2,"6543,6542"
14 | 0.6,"6553,6551,6550,6548,6520,6545,6544"
15 | 0.5,"6557,6556,6555,6554,6552"
16 | 0.3,"4473,6560,6559"
17 | 0.4,"6565,6563,6562,6561"
18 | 0.2,"6568,6564"
19 | 0.0,""
20 | 0.3,"6581,6567,6575"
21 | 0.4,"6584,6582,6583,6118"
22 | 0.5,"6590,6589,6588,6587,6585"
23 | 0.5,"6601,5086,6599,6591,6597"
24 | 0.1,"6606"
25 | 0.4,"6649,6636,6650,6643,3539"
26 | 0.7,"6603,6612,6659,6664,6661,6288,6652"
27 | 0.6,"6681,6691,6675,6671,6671,6662"
28 | 0.5,"6722,6728,6714,6711,6700"
29 | 0.3,"6731,6706,6718"
30 | 0.3,"6756,6753,6740"
31 | 0.7,"6800,6795,6844,6833,6816,6774,6771"
32 | 0.6,"6932,6806,6926,6891,6763,6874"
33 | 0.7,"6912,6963,6962,6952,6948,6950,6923"
34 | 0.5,"6909,6899,6922,6760,6848"
35 | 0.5,"6602,6749,6897,6867,6893"
36 | 0.5,"6977,6969,6610,6843,6805"
37 | 0.7,"7024,7026,7016,7001,6991,6981,6983"
38 | 0.6,"7086,7085,7081,7078,7056,7041"
39 | 0.4,"7101,7107,6879,7093"
40 | 0.6,"7151,7050,7123,7120,7122,7121,7104"
41 | 0.6,"7203,7191,7159,7162,7154,7153"
42 | 0.6,"7219,7216,7215,7158,7184,7204"
43 | 0.5,"7251,7103,7239,7090,7223"
44 | 0.4,"7327,7110,7299,7283"
45 | 0.7,"7059,7352,7349,7347,7339,7334,7108"
46 | 0.3,"7290,7298,7353"
47 | 0.4,"7308,7355,7354,5670"
48 | 0.5,"7126,7179,7356,7287,7293"
49 | 0.4,"7363,7360,7358,7009"
50 | 0.6,"7359,7370,7369,7366,7365,7364"
51 | 0.5,"7379,7382,7377,7376,7276"
52 | 0.4,"7270,7387,7395,7393"
53 | 0.4,"6838,7207,7396,7372"
54 | 0.4,"7190,7409,7406,7400"
55 | 0.3,"7269,7419,7418"
56 | 0.4,"7460,7453,7432,7428"
57 | 0.4,"7478,7477,7467,7456"
58 | 0.4,"7484,7483,7481,7480"
59 | 0.5,"7505,7499,7487,7492,7491"
60 | 0.5,"7550,7587,7605,7589,7574"
61 | 0.4,"7569,7568,7571,7012,7524"
62 | 0.2,"7630,7611"
63 | 0.6,"6765,7650,7647,7643,7639,7632"
64 | 0.7,"7705,7676,7703,7690,7701,7698,7684"
65 | 0.4,"7725,7720,7675,7623"
66 | 0.6,"7752,7798,7781,7724,7740,7726"
67 | 0.5,"7712,7489,7778,7758,7819,7830"
68 | 0.5,"7813,7751,7831,7848,7802"
69 | 0.5,"7877,7873,7679,7674,7688"
70 | 0.5,"7881,7536,7866,7874,7756"
71 | 0.3,"7889,7887,7818"
72 | 0.2,"7929,7907"
73 | 0.6,"7931,7719,7928,7922,7921,7926"
74 | 0.3,"7947,7944,7943"
75 | 0.5,"7986,7793,7976,7980,7978"
76 | 0.5,"7897,7854,8000,7985,7989"
77 | 0.4,"8011,8008,7918,7879"
78 | 0.8,"8032,8033,8009,7896,7998,8027,8031,8028"
79 | 0.4,"8071,8070,8062,8041"
80 | 0.5,"8051,8052,8066,8068,8079"
81 | 0.5,"8106,8098,7867,8094,8087"
82 | 0.5,"8148,8102,8146,8092,8160"
83 | 0.8,"8198,8100,8246,8205,8233,8109,8119,8143"
84 | 0.4,"8192,8176,8191,8195"
85 | 0.5,"8270,8165,8272,8252,8248"
86 | 0.6,"8237,8219,8283,8167,8296,8273"
87 | 0.5,"8302,8291,8300,8194,8196"
88 | 0.2,"7035,8314"
89 | 0.6,"8073,7967,8089,8326,8334,7956"
90 | 0.4,"8337,8341,8327,8122"
91 | 0.5,"8353,8308,8364,8350,8344"
92 | 0.3,"8390,8366,8325"
93 | 0.3,"8385,8379,8336"
94 | 0.5,"8264,8401,8400,8399,8393"
95 | 0.2,"8416,8410"
96 | 0.5,"8436,8422,8414,8368,8421"
97 | 0.6,"8454,8451,7425,8442,8437,7840"
98 | 0.4,"8428,8457,8456,8329"
99 | 0.5,"8407,8443,8472,8310,8482"
100 | 0.4,"8497,8495,7797,8485"
101 | 0.4,"8504,8507,8503,8498"
102 |
--------------------------------------------------------------------------------
|