├── code
│   ├── hdfs
│   │   ├── commit_url.txt
│   │   ├── commit_selection.py
│   │   ├── get_commit.py
│   │   ├── download_diff.py
│   │   ├── extract_commit.py
│   │   └── diff_file_parser.py
│   ├── hbase
│   │   ├── commit_url.txt
│   │   ├── commit_selection.py
│   │   ├── get_commit.py
│   │   ├── download_diff.py
│   │   ├── extract_commit.py
│   │   └── diff_file_parser.py
│   ├── spark
│   │   ├── commit_url.txt
│   │   ├── commit_selection.py
│   │   ├── get_commit.py
│   │   ├── download_diff.py
│   │   ├── extract_commit.py
│   │   └── diff_file_parser.py
│   ├── cassandra
│   │   ├── commit_url.txt
│   │   ├── commit_selection.py
│   │   ├── get_commit.py
│   │   ├── download_diff.py
│   │   └── extract_commit.py
│   └── hdfs_demo_examples
│       ├── commit_selection.py
│       ├── hdfs_example_commits.txt
│       ├── download_diff.py
│       ├── extract_commit.py
│       └── diff_file_parser.py
├── commit_analysis
│   ├── README.md
│   ├── count_num.py
│   ├── config_parsing.csv
│   ├── rmv_replace.csv
│   ├── rmv_with_code.csv
│   ├── param_rename.csv
│   └── change_param_constraint.csv
├── README.md
└── config_commits
    └── cassandra.csv
/code/hdfs/commit_url.txt:
--------------------------------------------------------------------------------
1 | https://github.com/apache/hadoop/commits/trunk?after=2b4febcf576e2da29ab86e2920302b82b47e435d+34&branch=trunk
--------------------------------------------------------------------------------
/code/hbase/commit_url.txt:
--------------------------------------------------------------------------------
1 | https://github.com/apache/hbase/commits/master?after=85842634e518155db3c964bf15555291d5fbdd45+34&branch=master
--------------------------------------------------------------------------------
/code/spark/commit_url.txt:
--------------------------------------------------------------------------------
1 | https://github.com/apache/spark/commits/master?after=8d09f9649510bf5d812c82b04f7711b9252a7db0+69&branch=master
--------------------------------------------------------------------------------
/code/cassandra/commit_url.txt:
--------------------------------------------------------------------------------
1 | https://github.com/apache/cassandra/commits/trunk?after=401e933b7395892bf0356f88308f64b94be84601+34&branch=trunk
--------------------------------------------------------------------------------
/code/cassandra/commit_selection.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | import time
4 | import random
5 | import threading
6 |
7 | import extract_commit
8 |
9 |
class ConfigParam:
    """Record for one configuration parameter discovered in a commit diff."""

    def __init__(self):
        """Create an empty record; fields are filled in by extract_commit."""
        self.param_name: str = ''   # param name
        self.param_class: str = ''  # class that this param belongs to
        self.param_func: str = ''   # function that assign this param
15 |
def main():
    """Read commit records from commit_info.txt and extract config info.

    Each line of commit_info.txt is one commit record; extract_commit.extract
    appends any configuration parameters it finds to config_variable_list.
    Prints a running count of processed commits as progress feedback.
    """
    config_variable_list = []
    searched_commit_num = 0

    # Context manager guarantees the file is closed (original leaked the handle).
    with open('commit_info.txt', 'r') as commit_info_file:
        for commit_info in commit_info_file:
            commit_info = commit_info.strip('\n')
            extract_commit.extract(commit_info, config_variable_list)
            searched_commit_num += 1
            print(searched_commit_num)


if __name__ == '__main__':
    main()
29 |
30 |
--------------------------------------------------------------------------------
/code/hdfs/commit_selection.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | import time
4 | import random
5 | import threading
6 |
7 | import extract_commit
8 |
9 |
class ConfigParam:
    """Record for one configuration parameter discovered in a commit diff."""

    def __init__(self):
        """Create an empty record; fields are filled in by extract_commit."""
        self.param_name: str = ''   # param name
        self.param_class: str = ''  # class that this param belongs to
        self.param_func: str = ''   # function that assign this param
15 |
def main():
    """Read commit records from commit_info.txt and extract config info.

    Each line of commit_info.txt is one commit record; extract_commit.extract
    appends any configuration parameters it finds to config_variable_list.
    Prints a running count of processed commits as progress feedback.
    """
    config_variable_list = []
    searched_commit_num = 0

    # Context manager guarantees the file is closed (original leaked the handle).
    with open('commit_info.txt', 'r') as commit_info_file:
        for commit_info in commit_info_file:
            commit_info = commit_info.strip('\n')
            extract_commit.extract(commit_info, config_variable_list)
            searched_commit_num += 1
            print(searched_commit_num)


if __name__ == '__main__':
    main()
30 |
31 |
--------------------------------------------------------------------------------
/code/hbase/commit_selection.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | import time
4 | import random
5 | import threading
6 |
7 | import extract_commit
8 |
9 |
class ConfigParam:
    """Record for one configuration parameter discovered in a commit diff."""

    def __init__(self):
        """Create an empty record; fields are filled in by extract_commit."""
        self.param_name: str = ''   # param name
        self.param_class: str = ''  # class that this param belongs to
        self.param_func: str = ''   # function that assign this param
15 |
def main():
    """Read commit records from commit_info.txt and extract config info.

    Each line of commit_info.txt is one commit record; extract_commit.extract
    appends any configuration parameters it finds to config_variable_list.
    Prints a running count of processed commits as progress feedback.
    """
    config_variable_list = []
    searched_commit_num = 0

    # Context manager guarantees the file is closed (original leaked the handle).
    with open('commit_info.txt', 'r') as commit_info_file:
        for commit_info in commit_info_file:
            commit_info = commit_info.strip('\n')
            extract_commit.extract(commit_info, config_variable_list)
            searched_commit_num += 1
            print(searched_commit_num)


if __name__ == '__main__':
    main()
29 |
30 |
31 |
32 |
33 |
--------------------------------------------------------------------------------
/code/spark/commit_selection.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | import time
3 | import random
4 | import threading
5 |
6 | import extract_commit
7 |
8 |
class ConfigParam:
    """Record for one configuration parameter discovered in a commit diff."""

    def __init__(self):
        """Create an empty record; fields are filled in by extract_commit."""
        self.param_name: str = ''   # param name
        self.param_class: str = ''  # class that this param belongs to
        self.param_func: str = ''   # function that assign this param
14 |
def main():
    """Read commit records from commit_info.txt and extract config info.

    Each line of commit_info.txt is one commit record; extract_commit.extract
    appends any configuration parameters it finds to config_variable_list.
    Prints a running count of processed commits as progress feedback.
    """
    config_variable_list = []
    searched_commit_num = 0

    # Context manager guarantees the file is closed (original leaked the handle).
    with open('commit_info.txt', 'r') as commit_info_file:
        for commit_info in commit_info_file:
            commit_info = commit_info.strip('\n')
            extract_commit.extract(commit_info, config_variable_list)
            searched_commit_num += 1
            print(searched_commit_num)


if __name__ == '__main__':
    main()
28 |
29 |
30 |
31 |
32 |
33 |
--------------------------------------------------------------------------------
/code/hdfs_demo_examples/commit_selection.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | import time
4 | import random
5 | import threading
6 |
7 | import extract_commit
8 |
9 |
class ConfigParam:
    """Record for one configuration parameter discovered in a commit diff."""

    def __init__(self):
        """Create an empty record; fields are filled in by extract_commit."""
        self.param_name: str = ''   # param name
        self.param_class: str = ''  # class that this param belongs to
        self.param_func: str = ''   # function that assign this param
15 |
def main():
    """Read the demo commit list and extract config info from each commit.

    Each line of hdfs_example_commits.txt is one commit record;
    extract_commit.extract appends any configuration parameters it finds to
    config_variable_list. Prints a running count as progress feedback.
    """
    config_variable_list = []
    searched_commit_num = 0

    # Context manager guarantees the file is closed (original leaked the handle).
    with open('hdfs_example_commits.txt', 'r') as commit_info_file:
        for commit_info in commit_info_file:
            commit_info = commit_info.strip('\n')
            extract_commit.extract(commit_info, config_variable_list)
            searched_commit_num += 1
            print(searched_commit_num)


if __name__ == '__main__':
    main()
30 |
31 |
--------------------------------------------------------------------------------
/code/hdfs_demo_examples/hdfs_example_commits.txt:
--------------------------------------------------------------------------------
1 | https://github.com/apache/hadoop/commit/c81ac2ff0220b180cd6cbbf18221290c3783bfd5$$$HDFS-13607. [SBN read] Edit Tail Fast Path Part 1: Enhance JournalNod……e with an in-memory cache of recent edit transactions. Contributed by Erik Krogen.$$$2018-05-09T22:40:07Z
2 | https://github.com/apache/hadoop/commit/bfd3f8bd8a9ae2186ec3e4addc71f912ec7b8923$$$HDFS-12291: [SPS]: Provide a mechanism to recursively iterate and sat……isfy storage policy of all the files under the given dir. Contributed by Surendra Singh Lilhore.$$$2017-09-30T13:31:52Z
3 | https://github.com/apache/hadoop/commit/123342cd0759ff88801d4f5ab10987f6e3f344b0$$$HDFS-12412. Change ErasureCodingWorker.stripedReadPool to cached thre……ad pool. (Lei (Eddy) Xu)$$$2017-09-13T01:12:07Z
4 | https://github.com/apache/hadoop/commit/9ae9467f920e95ca989d7d51775b39e1b9fee300$$$HDFS-11998. Enable DFSNetworkTopology as default. Contributed by Chen…… Liang.$$$2017-06-22T05:01:37Z
5 | https://github.com/apache/hadoop/commit/3108d27edde941d153a58f71fb1096cce2995531$$$HDFS-12716. 'dfs.datanode.failed.volumes.tolerated' to support minimu……m number of volumes to be available. Contributed by Ranith Sardar and usharani$$$2018-07-30T10:20:04Z
6 | https://github.com/apache/hadoop/commit/42307e3c3abbfe0b83d9a2581deba327435b910f$$$HDFS-11576. Block recovery will fail indefinitely if recovery time > ……heartbeat interval. Contributed by Lukas Majercak$$$2017-12-02T06:34:30Z
7 | https://github.com/apache/hadoop/commit/035c6ee587e444550af6420676e4cee049e09869$$$HDFS-12603. Enable async edit logging by default. Contributed by Andr……ew Wang.$$$2017-10-16T16:43:39Z
--------------------------------------------------------------------------------
/commit_analysis/README.md:
--------------------------------------------------------------------------------
1 | # Data Layout
2 |
3 | There are 7 data sheets corresponding to the sections in the submitted paper.
4 |
5 | ## Data Sheets
6 |
7 | * Section IV (CONFIGURATION INTERFACE EVOLUTION)
8 |
9 | * IV.A.1) (Parameterization) → `parameterization.csv`
10 |
11 | * IV.A.2) (Removing Parameters) → `param_removal.csv`
12 |
13 | * IV.B (Evolution of Default Values) → `change_default_value.csv`
14 |
15 | * Section V (CONFIGURATION USAGE EVOLUTION)
16 |
17 | * V.A (Evolution of Parameter Checking Code) → `checking_and_handling_code.csv`
18 |
19 | * V.B (Evolution of Error-handling Code) → `checking_and_handling_code.csv`
20 |
21 | * V.C (Evolution of Using Parameter Values) → `change_param_existing_usage.csv` and `param_new_use.csv`
22 |
23 | * Section VI (CONFIGURATION DOCUMENT EVOLUTION) → `change_doucumentation.csv`
24 |
25 | ## Metadata Tags
26 |
27 | The first row describes the metadata. Besides the common metadata tags such as `#Parmameter`, `#Issue ID`, `#Title`, `#Issue URL`, `#Commit URL`, `#Note`, there are also specific tags in each spreadsheet.
28 |
29 | **Note that some tags are only available for a subset of commit/parameters. We list them here to avoid any confusion.**
30 |
31 | * `change_default_value.csv` : "#How to choose new value" is for 32 numeric parameters. Please refer to "Choosing new values" in Section IV.B.
32 |
33 | * `change_doucumentation.csv` : "#Info added" is for 63 changes that enhance inadequate documents. Please refer to "Content added to enhance documentation" in Section VI.
34 |
35 | * `checking_and_handling_code.csv`: "#Checking content" is for configuration check changes (please refer to Section V.A); "#Changed message" is for misconfiguration feedback messages (please refer to Section V.B).
36 |
37 |
38 |
39 |
--------------------------------------------------------------------------------
/code/hbase/get_commit.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | import urllib
4 | from urllib.request import urlopen
5 | from bs4 import BeautifulSoup
6 | import http.client
7 | import time
8 | import random
9 | import threading
10 |
11 | import download_diff
12 |
13 |
def get_commits(url, searched_commit_num):
    """Crawl GitHub commit-list pages starting at `url`, spawning one
    download_diff.extract thread per commit, and follow the 'Older'
    pagination link until no further page exists.

    searched_commit_num: running count of commits handled so far. After each
    page, progress is appended to download_log.txt and the next page URL is
    saved to commit_url.txt so an interrupted crawl can resume.
    """
    headers = [{"User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"},
               {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
               {"User-Agent": "Mozilla/5.0 (Linux; X11)"},
               {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.5)"},
               {'User-Agent': 'node.js-v0.8.8'}]
    # Iterate instead of recursing: the original recursed once per page and
    # would eventually hit RecursionError on long histories.
    while url:
        try:
            req = urllib.request.Request(url=url, headers=headers[random.randint(0, 4)])
            response = urlopen(req)
        except Exception as e:
            # Original fell through here with `response` undefined (NameError
            # on the next line); back off and retry this page instead.
            print(e)
            time.sleep(60)
            continue

        soup = BeautifulSoup(response.read(), features="html.parser")
        commit_list = []
        older_href = ''
        for a in soup.find_all('a'):
            # Commit links carry data-pjax="true"; the pagination link is
            # the anchor whose text is 'Older'.
            if a.get('data-pjax') == 'true':
                href = a.get('href')
                if href not in commit_list:
                    commit_list.append(href)
            if a.text == 'Older':
                older_href = a.get('href')

        for commit in commit_list:
            searched_commit_num += 1
            try:
                # Throttle: wait for the worker pool to drain below 25 threads
                # (sleep instead of the original busy-wait spin).
                while threading.active_count() > 25:
                    time.sleep(0.1)
                t = threading.Thread(target=download_diff.extract, args=(commit,))
                t.start()
            except Exception as e:
                print("multiprocessing error")
                print(e)  # Exception has no .message attribute in Python 3
            time.sleep(random.uniform(1, 2))

        with open('download_log.txt', 'a') as log_file:
            log_file.write(older_href + '\n')
            log_file.write("Already downloaded " + str(searched_commit_num) + " commits." + '\n')
        print("Already downloaded " + str(searched_commit_num) + " commits.")
        with open('commit_url.txt', 'w') as out:
            out.write(older_href)

        url = older_href
66 |
67 |
def main():
    """Resume crawling commit pages from the URL saved in commit_url.txt."""
    # change the file (commit_url.txt) to get other software's commits
    with open('commit_url.txt', 'r') as fin:
        url = fin.readline()

    try:
        get_commits(url, 0)
    except Exception as e:
        print(e)


if __name__ == '__main__':
    # Forces HTTP/1.0 — presumably to avoid chunked/keep-alive issues with
    # some servers; TODO confirm this is still needed.
    http.client.HTTPConnection._http_vsn = 10
    http.client.HTTPConnection._http_vsn_str = 'HTTP/1.0'
    main()
86 |
87 |
--------------------------------------------------------------------------------
/code/hdfs/get_commit.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | import urllib
4 | from urllib.request import urlopen
5 | from bs4 import BeautifulSoup
6 | import http.client
7 | import time
8 | import random
9 | import threading
10 |
11 | import download_diff
12 |
13 |
def get_commits(url, searched_commit_num):
    """Crawl GitHub commit-list pages starting at `url`, spawning one
    download_diff.extract thread per commit, and follow the 'Older'
    pagination link until no further page exists.

    searched_commit_num: running count of commits handled so far. After each
    page, progress is appended to download_log.txt and the next page URL is
    saved to commit_url.txt so an interrupted crawl can resume.
    """
    headers = [{"User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"},
               {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
               {"User-Agent": "Mozilla/5.0 (Linux; X11)"},
               {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.5)"},
               {'User-Agent': 'node.js-v0.8.8'}]
    # Iterate instead of recursing: the original recursed once per page and
    # would eventually hit RecursionError on long histories.
    while url:
        try:
            req = urllib.request.Request(url=url, headers=headers[random.randint(0, 4)])
            response = urlopen(req)
        except Exception as e:
            # Original fell through here with `response` undefined (NameError
            # on the next line); back off and retry this page instead.
            print(e)
            time.sleep(60)
            continue

        soup = BeautifulSoup(response.read(), features="html.parser")
        commit_list = []
        older_href = ''
        for a in soup.find_all('a'):
            # Commit links carry data-pjax="true"; the pagination link is
            # the anchor whose text is 'Older'.
            if a.get('data-pjax') == 'true':
                href = a.get('href')
                if href not in commit_list:
                    commit_list.append(href)
            if a.text == 'Older':
                older_href = a.get('href')

        for commit in commit_list:
            searched_commit_num += 1
            try:
                # Throttle: wait for the worker pool to drain below 25 threads
                # (sleep instead of the original busy-wait spin).
                while threading.active_count() > 25:
                    time.sleep(0.1)
                t = threading.Thread(target=download_diff.extract, args=(commit,))
                t.start()
            except Exception as e:
                print("multiprocessing error")
                print(e)  # Exception has no .message attribute in Python 3
            time.sleep(random.uniform(1, 2))

        with open('download_log.txt', 'a') as log_file:
            log_file.write(older_href + '\n')
            log_file.write("Already downloaded " + str(searched_commit_num) + " commits." + '\n')
        print("Already downloaded " + str(searched_commit_num) + " commits.")
        with open('commit_url.txt', 'w') as out:
            out.write(older_href)

        url = older_href
66 |
67 |
def main():
    """Resume crawling commit pages from the URL saved in commit_url.txt."""
    # change the file (commit_url.txt) to get other software's commits
    with open('commit_url.txt', 'r') as fin:
        url = fin.readline()

    try:
        get_commits(url, 0)
    except Exception as e:
        print(e)


if __name__ == '__main__':
    # Forces HTTP/1.0 — presumably to avoid chunked/keep-alive issues with
    # some servers; TODO confirm this is still needed.
    http.client.HTTPConnection._http_vsn = 10
    http.client.HTTPConnection._http_vsn_str = 'HTTP/1.0'
    main()
86 |
87 |
--------------------------------------------------------------------------------
/code/spark/get_commit.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | import urllib
4 | from urllib.request import urlopen
5 | from bs4 import BeautifulSoup
6 | import http.client
7 | import time
8 | import random
9 | import threading
10 |
11 | import download_diff
12 |
13 |
def get_commits(url, searched_commit_num):
    """Crawl GitHub commit-list pages starting at `url`, spawning one
    download_diff.extract thread per commit, and follow the 'Older'
    pagination link until no further page exists.

    searched_commit_num: running count of commits handled so far. After each
    page, progress is appended to download_log.txt and the next page URL is
    saved to commit_url.txt so an interrupted crawl can resume.
    """
    headers = [{"User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"},
               {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
               {"User-Agent": "Mozilla/5.0 (Linux; X11)"},
               {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.5)"},
               {'User-Agent': 'node.js-v0.8.8'}]
    # Iterate instead of recursing: the original recursed once per page and
    # would eventually hit RecursionError on long histories.
    while url:
        try:
            req = urllib.request.Request(url=url, headers=headers[random.randint(0, 4)])
            response = urlopen(req)
        except Exception as e:
            # Original fell through here with `response` undefined (NameError
            # on the next line); back off and retry this page instead.
            print(e)
            time.sleep(60)
            continue

        soup = BeautifulSoup(response.read(), features="html.parser")
        commit_list = []
        older_href = ''
        for a in soup.find_all('a'):
            # Commit links carry data-pjax="true"; the pagination link is
            # the anchor whose text is 'Older'.
            if a.get('data-pjax') == 'true':
                href = a.get('href')
                if href not in commit_list:
                    commit_list.append(href)
            if a.text == 'Older':
                older_href = a.get('href')

        for commit in commit_list:
            searched_commit_num += 1
            try:
                # Throttle: wait for the worker pool to drain below 25 threads
                # (sleep instead of the original busy-wait spin).
                while threading.active_count() > 25:
                    time.sleep(0.1)
                t = threading.Thread(target=download_diff.extract, args=(commit,))
                t.start()
            except Exception as e:
                print("multiprocessing error")
                print(e)  # Exception has no .message attribute in Python 3
            time.sleep(random.uniform(1, 2))

        with open('download_log.txt', 'a') as log_file:
            log_file.write(older_href + '\n')
            log_file.write("Already downloaded " + str(searched_commit_num) + " commits." + '\n')
        print("Already downloaded " + str(searched_commit_num) + " commits.")
        with open('commit_url.txt', 'w') as out:
            out.write(older_href)

        url = older_href
66 |
67 |
def main():
    """Resume crawling commit pages from the URL saved in commit_url.txt."""
    # change the file (commit_url.txt) to get other software's commits
    with open('commit_url.txt', 'r') as fin:
        url = fin.readline()

    try:
        get_commits(url, 0)
    except Exception as e:
        print(e)


if __name__ == '__main__':
    # Forces HTTP/1.0 — presumably to avoid chunked/keep-alive issues with
    # some servers; TODO confirm this is still needed.
    http.client.HTTPConnection._http_vsn = 10
    http.client.HTTPConnection._http_vsn_str = 'HTTP/1.0'
    main()
86 |
87 |
--------------------------------------------------------------------------------
/code/cassandra/get_commit.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | import urllib
4 | from urllib.request import urlopen
5 | from bs4 import BeautifulSoup
6 | import http.client
7 | import time
8 | import random
9 | import threading
10 |
11 | import download_diff
12 |
13 |
def get_commits(url, searched_commit_num):
    """Crawl GitHub commit-list pages starting at `url`, spawning one
    download_diff.extract thread per commit, and follow the 'Older'
    pagination link until no further page exists.

    searched_commit_num: running count of commits handled so far. After each
    page, progress is appended to download_log.txt and the next page URL is
    saved to commit_url.txt so an interrupted crawl can resume.
    """
    headers = [{"User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"},
               {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
               {"User-Agent": "Mozilla/5.0 (Linux; X11)"},
               {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.5)"},
               {'User-Agent': 'node.js-v0.8.8'}]
    # Iterate instead of recursing: the original recursed once per page and
    # would eventually hit RecursionError on long histories.
    while url:
        try:
            req = urllib.request.Request(url=url, headers=headers[random.randint(0, 4)])
            response = urlopen(req)
        except Exception as e:
            # Original fell through here with `response` undefined (NameError
            # on the next line); back off and retry this page instead.
            print(e)
            time.sleep(60)
            continue

        soup = BeautifulSoup(response.read(), features="html.parser")
        commit_list = []
        older_href = ''
        for a in soup.find_all('a'):
            # Commit links carry data-pjax="true"; the pagination link is
            # the anchor whose text is 'Older'.
            if a.get('data-pjax') == 'true':
                href = a.get('href')
                if href not in commit_list:
                    commit_list.append(href)
            if a.text == 'Older':
                older_href = a.get('href')

        for commit in commit_list:
            searched_commit_num += 1
            try:
                # Throttle: wait for the worker pool to drain below 25 threads
                # (sleep instead of the original busy-wait spin).
                while threading.active_count() > 25:
                    time.sleep(0.1)
                t = threading.Thread(target=download_diff.extract, args=(commit,))
                t.start()
            except Exception as e:
                print("multiprocessing error")
                print(e)  # Exception has no .message attribute in Python 3
            time.sleep(random.uniform(1, 2))

        with open('download_log.txt', 'a') as log_file:
            log_file.write(older_href + '\n')
            log_file.write("Already downloaded " + str(searched_commit_num) + " commits." + '\n')
        print("Already downloaded " + str(searched_commit_num) + " commits.")
        with open('commit_url.txt', 'w') as out:
            out.write(older_href)

        url = older_href
66 |
67 |
def main():
    """Resume crawling commit pages from the URL saved in commit_url.txt."""
    # change the file (commit_url.txt) to get other software's commits
    with open('commit_url.txt', 'r') as fin:
        url = fin.readline()

    try:
        get_commits(url, 0)
    except Exception as e:
        print(e)


if __name__ == '__main__':
    # Forces HTTP/1.0 — presumably to avoid chunked/keep-alive issues with
    # some servers; TODO confirm this is still needed.
    http.client.HTTPConnection._http_vsn = 10
    http.client.HTTPConnection._http_vsn_str = 'HTTP/1.0'
    main()
87 |
88 |
--------------------------------------------------------------------------------
/code/hdfs_demo_examples/download_diff.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | import urllib
4 | from urllib.request import urlopen
5 | from bs4 import BeautifulSoup
6 | import random
7 | import os
8 | from pathlib import Path
9 |
10 | BASE_URL = "https://github.com"
11 | DIFF_FILE_PATH = "."
12 | headers = [{"User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"},\
13 | {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},\
14 | {"User-Agent": "Mozilla/5.0 (Linux; X11)"},\
15 | {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.5)"},\
16 | {'User-Agent':'node.js-v0.8.8'}]
17 |
18 | #Download diff file of the commit
def download(url):
    """Fetch <url>.diff from GitHub and save it as <sha>.diff in DIFF_FILE_PATH.

    The commit SHA is the last path segment of `url`. Network and encoding
    errors propagate to the caller (extract() catches them).
    """
    commit_sha = url.split('/')[-1]
    diff = urlopen(url + '.diff').read().decode('UTF-8')
    # Context manager closes the file even if the write fails
    # (original also computed an unused random int here — removed).
    with open(DIFF_FILE_PATH + '/' + commit_sha + '.diff', 'w') as diff_file:
        diff_file.write(diff)
28 |
29 | #Get all the information of the commit
def extract(url):
    """Scrape a GitHub commit page for title, description, and timestamp;
    append one '$$$'-separated record to commit_info.txt and download the diff.

    `url` is a repo-relative path like /apache/hadoop/commit/<sha>. Commits
    whose page cannot be fetched are skipped silently (best-effort crawl);
    commits whose diff already exists on disk are not re-recorded.
    """
    url = BASE_URL + url

    try:
        req = urllib.request.Request(url=url, headers=headers[random.randint(0, 4)])
        commit_response = urlopen(req)
    except Exception:
        return  # best-effort: skip commits we cannot fetch

    soup = BeautifulSoup(commit_response.read(), features="html.parser")

    # Commit title lives in <p class="commit-title">.
    commit_title = ''
    for p in soup.find_all('p'):
        cls = p.get('class')
        if cls and cls[0] == 'commit-title':
            commit_title = p.text

    # Extended description lives in <div class="commit-desc">.
    commit_description = ''
    for div in soup.find_all('div'):
        cls = div.get('class')
        if cls and cls[0] == 'commit-desc':
            commit_description = div.text

    commit_time_tag = soup.find('relative-time')
    if commit_time_tag:
        commit_time = commit_time_tag.get('datetime')
        print(url)
        print(commit_time)
    else:
        commit_time = 'commit_time_tag not exist'

    commit_sha = url.split('/')[-1]
    if Path(DIFF_FILE_PATH + '/' + commit_sha + '.diff').is_file():
        print("The diff file of " + commit_sha + " is already downloaded")
    else:
        # Context manager closes the file (original leaked on later errors).
        with open('commit_info.txt', 'a') as commit_info_file:
            commit_info_file.write(url + '$$$' + commit_title.replace('\n', '').strip()
                                   + commit_description.replace('\n', '').strip()
                                   + '$$$' + commit_time + '\n')
        try:
            download(url)
        except Exception as e:
            print(e)
            return
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
--------------------------------------------------------------------------------
/code/hbase/download_diff.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | import urllib
4 | from urllib.request import urlopen
5 | from bs4 import BeautifulSoup
6 | import random
7 | import os
8 | from pathlib import Path
9 |
10 | BASE_URL = "https://github.com"
11 | DIFF_FILE_PATH = "/Users/zhangbuzhang/Desktop/diffs/HBase"
12 | headers = [{"User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"},\
13 | {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},\
14 | {"User-Agent": "Mozilla/5.0 (Linux; X11)"},\
15 | {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.5)"},\
16 | {'User-Agent':'node.js-v0.8.8'}]
17 |
18 | #Download diff file of the commit
def download(url):
    """Fetch <url>.diff from GitHub and save it as <sha>.diff in DIFF_FILE_PATH.

    The commit SHA is the last path segment of `url`. Network and encoding
    errors propagate to the caller (extract() catches them).
    """
    commit_sha = url.split('/')[-1]
    diff = urlopen(url + '.diff').read().decode('UTF-8')
    # Context manager closes the file even if the write fails
    # (original also computed an unused random int here — removed).
    with open(DIFF_FILE_PATH + '/' + commit_sha + '.diff', 'w') as diff_file:
        diff_file.write(diff)
28 |
29 | #Get all the information of the commit
def extract(url):
    """Scrape a GitHub commit page for title, description, and timestamp;
    append one '$$$'-separated record to commit_info.txt and download the diff.

    `url` is a repo-relative path like /apache/hbase/commit/<sha>. Commits
    whose page cannot be fetched are skipped silently (best-effort crawl);
    commits whose diff already exists on disk are not re-recorded.
    """
    url = BASE_URL + url

    try:
        req = urllib.request.Request(url=url, headers=headers[random.randint(0, 4)])
        commit_response = urlopen(req)
    except Exception:
        return  # best-effort: skip commits we cannot fetch

    soup = BeautifulSoup(commit_response.read(), features="html.parser")

    # Commit title lives in <p class="commit-title">.
    commit_title = ''
    for p in soup.find_all('p'):
        cls = p.get('class')
        if cls and cls[0] == 'commit-title':
            commit_title = p.text

    # Extended description lives in <div class="commit-desc">.
    commit_description = ''
    for div in soup.find_all('div'):
        cls = div.get('class')
        if cls and cls[0] == 'commit-desc':
            commit_description = div.text

    commit_time_tag = soup.find('relative-time')
    if commit_time_tag:
        commit_time = commit_time_tag.get('datetime')
        print(url)
        print(commit_time)
    else:
        commit_time = 'commit_time_tag not exist'

    commit_sha = url.split('/')[-1]
    if Path(DIFF_FILE_PATH + '/' + commit_sha + '.diff').is_file():
        print("The diff file of " + commit_sha + " is already downloaded")
    else:
        # Context manager closes the file (original leaked on later errors).
        with open('commit_info.txt', 'a') as commit_info_file:
            commit_info_file.write(url + '$$$' + commit_title.replace('\n', '').strip()
                                   + commit_description.replace('\n', '').strip()
                                   + '$$$' + commit_time + '\n')
        try:
            download(url)
        except Exception as e:
            print(e)
            return
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
--------------------------------------------------------------------------------
/code/hdfs/download_diff.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | import urllib
4 | from urllib.request import urlopen
5 | from bs4 import BeautifulSoup
6 | import random
7 | import os
8 | from pathlib import Path
9 |
10 | BASE_URL = "https://github.com"
11 | DIFF_FILE_PATH = "/Users/zhangbuzhang/Desktop/diffs/HDFS"
12 | headers = [{"User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"},\
13 | {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},\
14 | {"User-Agent": "Mozilla/5.0 (Linux; X11)"},\
15 | {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.5)"},\
16 | {'User-Agent':'node.js-v0.8.8'}]
17 |
18 | #Download diff file of the commit
def download(url):
    """Fetch <url>.diff from GitHub and save it as <sha>.diff in DIFF_FILE_PATH.

    The commit SHA is the last path segment of `url`. Network and encoding
    errors propagate to the caller (extract() catches them).
    """
    commit_sha = url.split('/')[-1]
    diff = urlopen(url + '.diff').read().decode('UTF-8')
    # Context manager closes the file even if the write fails
    # (original also computed an unused random int here — removed).
    with open(DIFF_FILE_PATH + '/' + commit_sha + '.diff', 'w') as diff_file:
        diff_file.write(diff)
28 |
29 | #Get all the information of the commit
def extract(url):
    """Scrape a GitHub commit page for title, description, and timestamp;
    append one '$$$'-separated record to commit_info.txt and download the diff.

    `url` is a repo-relative path like /apache/hadoop/commit/<sha>. Commits
    whose page cannot be fetched are skipped silently (best-effort crawl);
    commits whose diff already exists on disk are not re-recorded.
    """
    url = BASE_URL + url

    try:
        req = urllib.request.Request(url=url, headers=headers[random.randint(0, 4)])
        commit_response = urlopen(req)
    except Exception:
        return  # best-effort: skip commits we cannot fetch

    soup = BeautifulSoup(commit_response.read(), features="html.parser")

    # Commit title lives in <p class="commit-title">.
    commit_title = ''
    for p in soup.find_all('p'):
        cls = p.get('class')
        if cls and cls[0] == 'commit-title':
            commit_title = p.text

    # Extended description lives in <div class="commit-desc">.
    commit_description = ''
    for div in soup.find_all('div'):
        cls = div.get('class')
        if cls and cls[0] == 'commit-desc':
            commit_description = div.text

    commit_time_tag = soup.find('relative-time')
    if commit_time_tag:
        commit_time = commit_time_tag.get('datetime')
        print(url)
        print(commit_time)
    else:
        commit_time = 'commit_time_tag not exist'

    commit_sha = url.split('/')[-1]
    if Path(DIFF_FILE_PATH + '/' + commit_sha + '.diff').is_file():
        print("The diff file of " + commit_sha + " is already downloaded")
    else:
        # Context manager closes the file (original leaked on later errors).
        with open('commit_info.txt', 'a') as commit_info_file:
            commit_info_file.write(url + '$$$' + commit_title.replace('\n', '').strip()
                                   + commit_description.replace('\n', '').strip()
                                   + '$$$' + commit_time + '\n')
        try:
            download(url)
        except Exception as e:
            print(e)
            return
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
--------------------------------------------------------------------------------
/code/spark/download_diff.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | import urllib
4 | from urllib.request import urlopen
5 | from bs4 import BeautifulSoup
6 | import random
7 | import os
8 | from pathlib import Path
9 |
BASE_URL = "https://github.com"
# Fixed copy-paste from the HBase script: this file is /code/spark/'s crawler,
# so Spark diffs get their own directory instead of overwriting HBase's.
DIFF_FILE_PATH = "/Users/zhangbuzhang/Desktop/diffs/Spark"
12 | headers = [{"User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"},\
13 | {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},\
14 | {"User-Agent": "Mozilla/5.0 (Linux; X11)"},\
15 | {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.5)"},\
16 | {'User-Agent':'node.js-v0.8.8'}]
17 |
18 | #Download diff file of the commit
def download(url):
    """Fetch <url>.diff from GitHub and save it as <sha>.diff in DIFF_FILE_PATH.

    The commit SHA is the last path segment of `url`. Network and encoding
    errors propagate to the caller (extract() catches them).
    """
    commit_sha = url.split('/')[-1]
    diff = urlopen(url + '.diff').read().decode('UTF-8')
    # Context manager closes the file even if the write fails
    # (original also computed an unused random int here — removed).
    with open(DIFF_FILE_PATH + '/' + commit_sha + '.diff', 'w') as diff_file:
        diff_file.write(diff)
28 |
29 | #Get all the information of the commit
def extract(url):
    """Scrape a GitHub commit page for title, description, and timestamp;
    append one '$$$'-separated record to commit_info.txt and download the diff.

    `url` is a repo-relative path like /apache/spark/commit/<sha>. Commits
    whose page cannot be fetched are skipped silently (best-effort crawl);
    commits whose diff already exists on disk are not re-recorded.
    """
    url = BASE_URL + url

    try:
        req = urllib.request.Request(url=url, headers=headers[random.randint(0, 4)])
        commit_response = urlopen(req)
    except Exception:
        return  # best-effort: skip commits we cannot fetch

    soup = BeautifulSoup(commit_response.read(), features="html.parser")

    # Commit title lives in <p class="commit-title">.
    commit_title = ''
    for p in soup.find_all('p'):
        cls = p.get('class')
        if cls and cls[0] == 'commit-title':
            commit_title = p.text

    # Extended description lives in <div class="commit-desc">.
    commit_description = ''
    for div in soup.find_all('div'):
        cls = div.get('class')
        if cls and cls[0] == 'commit-desc':
            commit_description = div.text

    commit_time_tag = soup.find('relative-time')
    if commit_time_tag:
        commit_time = commit_time_tag.get('datetime')
        print(url)
        print(commit_time)
    else:
        commit_time = 'commit_time_tag not exist'

    commit_sha = url.split('/')[-1]
    if Path(DIFF_FILE_PATH + '/' + commit_sha + '.diff').is_file():
        print("The diff file of " + commit_sha + " is already downloaded")
    else:
        # Context manager closes the file (original leaked on later errors).
        with open('commit_info.txt', 'a') as commit_info_file:
            commit_info_file.write(url + '$$$' + commit_title.replace('\n', '').strip()
                                   + commit_description.replace('\n', '').strip()
                                   + '$$$' + commit_time + '\n')
        try:
            download(url)
        except Exception as e:
            print(e)
            return
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
--------------------------------------------------------------------------------
/code/cassandra/download_diff.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | import urllib
4 | from urllib.request import urlopen
5 | from bs4 import BeautifulSoup
6 | import random
7 | import os
8 | from pathlib import Path
9 |
# Root of all scraped commit URLs.
BASE_URL = "https://github.com"
# Directory where downloaded .diff files are stored.
# NOTE(review): hard-coded absolute user path; not portable across machines.
DIFF_FILE_PATH = "/Users/zhangbuzhang/Desktop/diffs/Cassandra"
# Pool of User-Agent headers; one is picked at random per request,
# presumably to reduce the chance of GitHub throttling repeated scrapes.
headers = [{"User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"},\
           {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},\
           {"User-Agent": "Mozilla/5.0 (Linux; X11)"},\
           {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.5)"},\
           {'User-Agent':'node.js-v0.8.8'}]
17 |
#Download diff file of the commit
def download(url):
    """Fetch <url>.diff from GitHub and save it under DIFF_FILE_PATH.

    url: full commit URL (ending in the commit sha).
    Raises whatever urlopen()/open() raise; the caller handles failures.
    """
    commit_sha = url.split('/')[-1]
    diff_response = urlopen(url + '.diff')
    diff = diff_response.read().decode('UTF-8')
    # Context manager guarantees the file handle is closed even on a write
    # error (original left it open on failure). Also dropped the unused
    # random index 'h' the original computed and never used.
    with open(DIFF_FILE_PATH + '/' + commit_sha + '.diff', 'w') as diff_file:
        diff_file.write(diff)
28 |
#Get all the information of the commit
def extract(url):
    """Scrape commit metadata from a GitHub commit page and download its diff.

    url: commit path relative to BASE_URL (e.g. "/apache/cassandra/commit/<sha>").
    Appends "url$$$title+description$$$time" to commit_info.txt and fetches
    the diff via download(), unless the diff file already exists on disk.
    """
    url = BASE_URL + url

    try:
        c = random.randint(0, 4)  # rotate User-Agent to avoid throttling
        req = urllib.request.Request(url=url, headers=headers[c])
        commit_response = urlopen(req)
    except Exception as e:
        # Log instead of silently swallowing (original returned with no
        # trace at all), but still skip this commit — best effort.
        print(e)
        return

    soup = BeautifulSoup(commit_response.read(), features="html.parser")

    # Title and description are identified by their CSS class on the page.
    commit_title = ''
    for p in soup.find_all('p'):
        cls = p.get('class')
        if cls and cls[0] == 'commit-title':
            commit_title = p.text

    commit_description = ''
    for div in soup.find_all('div'):
        cls = div.get('class')
        if cls and cls[0] == 'commit-desc':
            commit_description = div.text

    commit_time_tag = soup.find('relative-time')
    if commit_time_tag:
        commit_time = commit_time_tag.get('datetime')
        print(url)
        print(commit_time)
    else:
        commit_time = 'commit_time_tag not exist'

    commit_sha = url.split('/')[-1]
    commit_file = Path(DIFF_FILE_PATH + '/' + commit_sha + '.diff')
    if commit_file.is_file():
        print("The diff file of " + commit_sha + " is already downloaded")
    else:
        # 'with' ensures the info file is closed even if the write fails.
        with open('commit_info.txt', 'a') as commit_info_file:
            commit_info_file.write(url + '$$$'
                                   + commit_title.replace('\n', '').strip()
                                   + commit_description.replace('\n', '').strip()
                                   + '$$$' + commit_time + '\n')
        try:
            download(url)
        except Exception as e:
            print(e)
            return
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
--------------------------------------------------------------------------------
/code/cassandra/extract_commit.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | import random
4 | import sys
5 | import diff_file_parser
6 | from download_diff import DIFF_FILE_PATH
7 | import nltk
8 | from nltk.stem.porter import PorterStemmer
9 |
10 |
def _write_section(out, header, touched, items, render):
    """Write one report section: header, each rendered item, or 'Null'."""
    out.write(header)
    if touched:
        for item in items:
            out.write(render(item))
    else:
        out.write('Null\n')


def extract(commit_info, configVariableList):
    """Decide whether a commit is configuration-related and log a report.

    commit_info: a "url$$$title$$$time" line produced by the downloader.
    configVariableList: known configuration variable names, forwarded to
        the diff parser.
    Appends a human-readable report to commit_selected.txt when the title
    mentions a configuration keyword or the diff touches configuration code.
    """
    fields = commit_info.split('$$$')
    commit_url = fields[0]
    # Guard against malformed lines instead of raising IndexError.
    commit_title = fields[1] if len(fields) > 1 else ''
    commit_time = fields[2] if len(fields) > 2 else ''
    commit_sha = commit_url.split('/')[-1]

    desc_contain_keyword = False  # title mentions a configuration keyword
    diff_contain_config = False   # diff touches configuration code

    st = PorterStemmer()  # hoisted out of the loop; rebuilt per word before
    for word in commit_title.split(' '):
        if word.lower() in ('option', 'parameter'):
            commit_title = commit_title.replace(word, "**" + word.upper() + "**")
            desc_contain_keyword = True
        word_stemmed = st.stem(word).lower()
        # BUG FIX: Porter stems configure/configuration to 'configur', so
        # the previous literal 'configure' could never match a stemmed word
        # (the sibling hdfs/hbase scripts already use 'configur').
        if word_stemmed in {'config', 'configur'}:
            commit_title = commit_title.replace(word_stemmed, "**" + word_stemmed.upper() + "**")
            desc_contain_keyword = True

    # Skip merge/housekeeping commits entirely.
    title_words = commit_title.lower().split(' ')
    is_merge_commit = any(w in title_words
                          for w in ('merge', 'merging', 'checkstyle', 'findbugs'))

    code_result = []
    diff_file_path = DIFF_FILE_PATH + '/' + commit_sha + '.diff'
    if not is_merge_commit:
        code_result = diff_file_parser.diff_selection(diff_file_path, configVariableList)

    config_file_touched = False
    config_load_touched = False
    config_set_touched = False
    config_variable_touched = False
    config_message_touched = False
    # Default to empty so the report below can never hit a NameError when
    # the diff parser returned nothing but the title matched a keyword.
    touched_config_file = ()
    touched_config_load_func = ()
    touched_config_set_func = ()
    touched_variable = ()
    touched_message = ()

    if code_result:
        config_file_touched = code_result[0]
        config_load_touched = code_result[1]
        config_set_touched = code_result[2]
        config_variable_touched = code_result[3]
        config_message_touched = code_result[4]
        touched_config_file = code_result[5]       # touched config option definitions
        touched_config_load_func = code_result[6]  # touched config load functions
        touched_config_set_func = code_result[7]   # touched config set functions
        touched_variable = code_result[8]          # touched config variables
        touched_message = code_result[9]           # touched message keywords
        diff_contain_config = any((config_file_touched, config_load_touched,
                                   config_set_touched, config_variable_touched,
                                   config_message_touched))

    if not is_merge_commit and (desc_contain_keyword or diff_contain_config):
        print("find a candidate commit!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        # 'with' guarantees the report file is closed even if a write fails.
        with open('commit_selected.txt', 'a') as out:
            out.write("###############################################################################" + '\n')
            out.write(commit_title + '\n' + commit_url + '\n' + commit_time + '\n')
            out.write('Commit message touches config:' + str(desc_contain_keyword) + '\n')
            out.write('Diff touches config define:' + str(config_file_touched) + '\n')
            out.write('Diff touches config loading:' + str(config_load_touched) + '\n')
            out.write('Diff touches config setting:' + str(config_set_touched) + '\n')
            out.write('Diff touches config variable (data flow):' + str(config_variable_touched) + '\n')
            out.write('Diff touches config message:' + str(config_message_touched) + '\n')

            _write_section(out, '\n_________________touchedConfigDefine_____________________\n\n',
                           config_file_touched, touched_config_file,
                           lambda item: item + '\n')
            _write_section(out, '\n___________________touchedConfigLoad___________________\n\n',
                           config_load_touched, touched_config_load_func,
                           lambda item: item + '\n')
            _write_section(out, '\n___________________touchedConfigSet______________________\n\n',
                           config_set_touched, touched_config_set_func,
                           lambda item: item + '\n')
            _write_section(out, '\n___________________touchedConfigVariable_____________________\n\n',
                           config_variable_touched, touched_variable,
                           lambda item: item + '\n')
            _write_section(out, '\n____________________touchedMessage________________________\n\n',
                           config_message_touched, touched_message,
                           lambda item: '"' + item + '"' + '\n')

            out.write('\n')
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
--------------------------------------------------------------------------------
/code/hdfs/extract_commit.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | import random
4 | import sys
5 | import diff_file_parser
6 | from download_diff import DIFF_FILE_PATH
7 | import nltk
8 | from nltk.stem.porter import PorterStemmer
9 |
10 |
def _write_section(out, header, touched, items, render):
    """Write one report section: header, each rendered item, or 'Null'."""
    out.write(header)
    if touched:
        for item in items:
            out.write(render(item))
    else:
        out.write('Null\n')


def extract(commit_info, configParamList):
    """Decide whether a commit is configuration-related and log a report.

    commit_info: a "url$$$title$$$time" line produced by the downloader.
    configParamList: known configuration parameter names, forwarded to the
        diff parser.
    Appends a human-readable report to commit_selected.txt when the title
    mentions a configuration keyword or the diff touches configuration code.
    """
    fields = commit_info.split('$$$')
    commit_url = fields[0]
    # Guard against malformed lines instead of raising IndexError
    # (the spark variant of this script already guards this).
    commit_title = fields[1] if len(fields) > 1 else ''
    commit_time = fields[2] if len(fields) > 2 else ''
    commit_sha = commit_url.split('/')[-1]

    desc_contain_keyword = False  # title mentions a configuration keyword
    diff_contain_config = False   # diff touches configuration code

    st = PorterStemmer()  # hoisted out of the loop; rebuilt per word before
    for word in commit_title.split(' '):
        if word.lower() in ('option', 'parameter'):
            commit_title = commit_title.replace(word, "**" + word.upper() + "**")
            desc_contain_keyword = True
        word_stemmed = st.stem(word).lower()
        # 'configur' is the Porter stem of configure/configuration.
        if word_stemmed in {'config', 'configur'}:
            commit_title = commit_title.replace(word_stemmed, "**" + word_stemmed.upper() + "**")
            desc_contain_keyword = True

    # Skip merge/housekeeping commits entirely.
    title_words = commit_title.lower().split(' ')
    is_merge_commit = any(w in title_words
                          for w in ('merge', 'merging', 'checkstyle', 'findbugs'))

    codeResult = []
    diff_file_path = DIFF_FILE_PATH + '/' + commit_sha + '.diff'
    if not is_merge_commit:
        codeResult = diff_file_parser.diffSelection(diff_file_path, configParamList)

    configFileTouched = False     # diff touches configuration file
    configLoadTouched = False     # diff touches configuration load function
    configSetTouched = False      # diff touches configuration set function
    configParamTouched = False    # diff touches configuration parameter
    configMessageTouched = False  # diff touches configuration message
    # Default to empty so the report below can never hit a NameError when
    # the diff parser returned nothing but the title matched a keyword.
    touchedFile = ()
    touchedLoadFunc = ()
    touchedSetFunc = ()
    touchedParam = ()
    touchedMessage = ()

    if codeResult:
        configFileTouched = codeResult[0]
        configLoadTouched = codeResult[1]
        configSetTouched = codeResult[2]
        configParamTouched = codeResult[3]
        configMessageTouched = codeResult[4]
        touchedFile = codeResult[5]      # touched config definition files
        touchedLoadFunc = codeResult[6]  # touched config load functions
        touchedSetFunc = codeResult[7]   # touched config set functions
        touchedParam = codeResult[8]     # touched config parameters
        touchedMessage = codeResult[9]   # touched message keywords
        diff_contain_config = any((configFileTouched, configLoadTouched,
                                   configSetTouched, configParamTouched,
                                   configMessageTouched))

    if not is_merge_commit and (desc_contain_keyword or diff_contain_config):
        print("find a candidate commit!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        # 'with' guarantees the report file is closed even if a write fails.
        with open('commit_selected.txt', 'a') as out:
            out.write("###############################################################################" + '\n')
            out.write(commit_title + '\n' + commit_url + '\n' + commit_time + '\n')
            out.write('Commit message touches config:' + str(desc_contain_keyword) + '\n')
            out.write('Diff touches config define:' + str(configFileTouched) + '\n')
            out.write('Diff touches config loading:' + str(configLoadTouched) + '\n')
            out.write('Diff touches config setting:' + str(configSetTouched) + '\n')
            out.write('Diff touches config variable (data flow):' + str(configParamTouched) + '\n')
            out.write('Diff touches config message:' + str(configMessageTouched) + '\n')

            # This section's format differs: names space-separated on one line.
            out.write('\n_________________touchedConfigDefine_____________________\n\n')
            if configFileTouched:
                for fileName in touchedFile:
                    out.write(fileName + ' ')
                out.write('\n')
            else:
                out.write('Null\n')

            _write_section(out, '\n___________________touchedConfigLoad___________________\n\n',
                           configLoadTouched, touchedLoadFunc,
                           lambda item: item + '\n')
            _write_section(out, '\n___________________touchedConfigSet____________________\n\n',
                           configSetTouched, touchedSetFunc,
                           lambda item: item + '\n')
            _write_section(out, '\n___________________touchedConfigVariable_____________________\n\n',
                           configParamTouched, touchedParam,
                           lambda item: item + '\n')
            _write_section(out, '\n___________________touchedMessage_____________________\n\n',
                           configMessageTouched, touchedMessage,
                           lambda item: '"' + item + '"' + '\n')

            out.write('\n')
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
--------------------------------------------------------------------------------
/code/hbase/extract_commit.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | import random
4 | import diff_file_parser
5 | from download_diff import DIFF_FILE_PATH
6 | import nltk
7 | from nltk.stem.porter import PorterStemmer
8 |
def _write_section(out, header, touched, items, render):
    """Write one report section: header, each rendered item, or 'Null'."""
    out.write(header)
    if touched:
        for item in items:
            out.write(render(item))
    else:
        out.write('Null\n')


def extract(commit_info, configParamList):
    """Decide whether a commit is configuration-related and log a report.

    commit_info: a "url$$$title$$$time" line produced by the downloader.
    configParamList: known configuration parameter names, forwarded to the
        diff parser.
    Appends a human-readable report to commit_selected.txt when the title
    mentions a configuration keyword or the diff touches configuration code.
    """
    fields = commit_info.split('$$$')
    commit_url = fields[0]
    # Guard against malformed lines instead of raising IndexError
    # (the spark variant of this script already guards this).
    commit_title = fields[1] if len(fields) > 1 else ''
    commit_time = fields[2] if len(fields) > 2 else ''
    commit_sha = commit_url.split('/')[-1]

    desc_contain_keyword = False  # title mentions a configuration keyword
    diff_contain_config = False   # diff touches configuration code

    st = PorterStemmer()  # hoisted out of the loop; rebuilt per word before
    for word in commit_title.split(' '):
        if word.lower() in ('option', 'parameter'):
            commit_title = commit_title.replace(word, "**" + word.upper() + "**")
            desc_contain_keyword = True
        word_stemmed = st.stem(word).lower()
        # 'configur' is the Porter stem of configure/configuration.
        if word_stemmed in {'config', 'configur'}:
            commit_title = commit_title.replace(word_stemmed, "**" + word_stemmed.upper() + "**")
            desc_contain_keyword = True

    # Skip merge/housekeeping commits entirely.
    title_words = commit_title.lower().split(' ')
    irrelevant_commit = any(w in title_words
                            for w in ('merge', 'merging', 'checkstyle', 'findbugs'))

    codeResult = []
    diff_file_path = DIFF_FILE_PATH + '/' + commit_sha + '.diff'
    if not irrelevant_commit:
        codeResult = diff_file_parser.diffSelection(diff_file_path, configParamList)

    configFileTouched = False     # diff touches configuration file
    configLoadTouched = False     # diff touches configuration load function
    configSetTouched = False      # diff touches configuration set function
    configParamTouched = False    # diff touches configuration parameter
    configMessageTouched = False  # diff touches configuration message
    # Default to empty so the report below can never hit a NameError when
    # the diff parser returned nothing but the title matched a keyword.
    touchedFile = ()
    touchedLoadFunc = ()
    touchedSetFunc = ()
    touchedParam = ()
    touchedMessage = ()

    if codeResult:
        configFileTouched = codeResult[0]
        configLoadTouched = codeResult[1]
        configSetTouched = codeResult[2]
        configParamTouched = codeResult[3]
        configMessageTouched = codeResult[4]
        touchedFile = codeResult[5]      # touched config definition files
        touchedLoadFunc = codeResult[6]  # touched config load functions
        touchedSetFunc = codeResult[7]   # touched config set functions
        touchedParam = codeResult[8]     # touched config parameters
        touchedMessage = codeResult[9]   # touched message keywords
        diff_contain_config = any((configFileTouched, configLoadTouched,
                                   configSetTouched, configParamTouched,
                                   configMessageTouched))

    if not irrelevant_commit and (desc_contain_keyword or diff_contain_config):
        print("find a candidate commit!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        # 'with' guarantees the report file is closed even if a write fails.
        with open('commit_selected.txt', 'a') as out:
            out.write("###############################################################################" + '\n')
            out.write(commit_title + '\n' + commit_url + '\n' + commit_time + '\n')
            out.write('Commit message touches config:' + str(desc_contain_keyword) + '\n')
            out.write('Diff touches config define:' + str(configFileTouched) + '\n')
            out.write('Diff touches config loading:' + str(configLoadTouched) + '\n')
            out.write('Diff touches config setting:' + str(configSetTouched) + '\n')
            out.write('Diff touches config variable (data flow):' + str(configParamTouched) + '\n')
            out.write('Diff touches config message:' + str(configMessageTouched) + '\n')

            # This section's format differs: names space-separated on one line.
            out.write('\n_________________touchedConfigDefine_____________________\n\n')
            if configFileTouched:
                for fileName in touchedFile:
                    out.write(fileName + ' ')
                out.write('\n')
            else:
                out.write('Null\n')

            _write_section(out, '\n___________________touchedConfigLoad___________________\n\n',
                           configLoadTouched, touchedLoadFunc,
                           lambda item: item + '\n')
            _write_section(out, '\n___________________touchedConfigSet____________________\n\n',
                           configSetTouched, touchedSetFunc,
                           lambda item: item + '\n')
            _write_section(out, '\n___________________touchedConfigVariable_____________________\n\n',
                           configParamTouched, touchedParam,
                           lambda item: item + '\n')
            _write_section(out, '\n___________________touchedMessage_____________________\n\n',
                           configMessageTouched, touchedMessage,
                           lambda item: '"' + item + '"' + '\n')

            out.write('\n')
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
--------------------------------------------------------------------------------
/code/hdfs_demo_examples/extract_commit.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 |
3 | import random
4 | import sys
5 | import diff_file_parser
6 | from download_diff import DIFF_FILE_PATH
7 | import nltk
8 | from nltk.stem.porter import PorterStemmer
9 |
10 |
def _write_section(out, header, touched, items, render):
    """Write one report section: header, each rendered item, or 'Null'."""
    out.write(header)
    if touched:
        for item in items:
            out.write(render(item))
    else:
        out.write('Null\n')


def extract(commit_info, configParamList):
    """Decide whether a commit is configuration-related and log a report.

    commit_info: a "url$$$title$$$time" line produced by the downloader.
    configParamList: known configuration parameter names, forwarded to the
        diff parser.
    Appends a human-readable report to commit_selected.txt when the title
    mentions a configuration keyword or the diff touches configuration code.
    """
    fields = commit_info.split('$$$')
    commit_url = fields[0]
    # Guard against malformed lines instead of raising IndexError
    # (the spark variant of this script already guards this).
    commit_title = fields[1] if len(fields) > 1 else ''
    commit_time = fields[2] if len(fields) > 2 else ''
    commit_sha = commit_url.split('/')[-1]

    desc_contain_keyword = False  # title mentions a configuration keyword
    diff_contain_config = False   # diff touches configuration code

    st = PorterStemmer()  # hoisted out of the loop; rebuilt per word before
    for word in commit_title.split(' '):
        if word.lower() in ('option', 'parameter'):
            commit_title = commit_title.replace(word, "**" + word.upper() + "**")
            desc_contain_keyword = True
        word_stemmed = st.stem(word).lower()
        # 'configur' is the Porter stem of configure/configuration.
        if word_stemmed in {'config', 'configur'}:
            commit_title = commit_title.replace(word_stemmed, "**" + word_stemmed.upper() + "**")
            desc_contain_keyword = True

    # Skip merge/housekeeping commits entirely.
    title_words = commit_title.lower().split(' ')
    is_merge_commit = any(w in title_words
                          for w in ('merge', 'merging', 'checkstyle', 'findbugs'))

    codeResult = []
    diff_file_path = DIFF_FILE_PATH + '/' + commit_sha + '.diff'
    if not is_merge_commit:
        codeResult = diff_file_parser.diffSelection(diff_file_path, configParamList)

    configFileTouched = False     # diff touches configuration file
    configLoadTouched = False     # diff touches configuration load function
    configSetTouched = False      # diff touches configuration set function
    configParamTouched = False    # diff touches configuration parameter
    configMessageTouched = False  # diff touches configuration message
    # Default to empty so the report below can never hit a NameError when
    # the diff parser returned nothing but the title matched a keyword.
    touchedFile = ()
    touchedLoadFunc = ()
    touchedSetFunc = ()
    touchedParam = ()
    touchedMessage = ()

    if codeResult:
        configFileTouched = codeResult[0]
        configLoadTouched = codeResult[1]
        configSetTouched = codeResult[2]
        configParamTouched = codeResult[3]
        configMessageTouched = codeResult[4]
        touchedFile = codeResult[5]      # touched config definition files
        touchedLoadFunc = codeResult[6]  # touched config load functions
        touchedSetFunc = codeResult[7]   # touched config set functions
        touchedParam = codeResult[8]     # touched config parameters
        touchedMessage = codeResult[9]   # touched message keywords
        diff_contain_config = any((configFileTouched, configLoadTouched,
                                   configSetTouched, configParamTouched,
                                   configMessageTouched))

    if not is_merge_commit and (desc_contain_keyword or diff_contain_config):
        print("find a candidate commit!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        # 'with' guarantees the report file is closed even if a write fails.
        with open('commit_selected.txt', 'a') as out:
            out.write("###############################################################################" + '\n')
            out.write(commit_title + '\n' + commit_url + '\n' + commit_time + '\n')
            out.write('Commit message touches config:' + str(desc_contain_keyword) + '\n')
            out.write('Diff touches config define:' + str(configFileTouched) + '\n')
            out.write('Diff touches config loading:' + str(configLoadTouched) + '\n')
            out.write('Diff touches config setting:' + str(configSetTouched) + '\n')
            out.write('Diff touches config variable (data flow):' + str(configParamTouched) + '\n')
            out.write('Diff touches config message:' + str(configMessageTouched) + '\n')

            # This section's format differs: names space-separated on one line.
            out.write('\n_________________touchedConfigDefine_____________________\n\n')
            if configFileTouched:
                for fileName in touchedFile:
                    out.write(fileName + ' ')
                out.write('\n')
            else:
                out.write('Null\n')

            _write_section(out, '\n___________________touchedConfigLoad___________________\n\n',
                           configLoadTouched, touchedLoadFunc,
                           lambda item: item + '\n')
            _write_section(out, '\n___________________touchedConfigSet____________________\n\n',
                           configSetTouched, touchedSetFunc,
                           lambda item: item + '\n')
            _write_section(out, '\n___________________touchedConfigVariable_____________________\n\n',
                           configParamTouched, touchedParam,
                           lambda item: item + '\n')
            _write_section(out, '\n___________________touchedMessage_____________________\n\n',
                           configMessageTouched, touchedMessage,
                           lambda item: '"' + item + '"' + '\n')

            out.write('\n')
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
--------------------------------------------------------------------------------
/commit_analysis/count_num.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | import re
3 |
def simple_count(file, commit_link_col):
    """Count distinct commits per system in a CSV file.

    file: path to a CSV whose first column names the system (hdfs/hbase/
        spark/cassandra, case-insensitive substring match).
    commit_link_col: index of the column holding the commit link; each link
        is counted only the first time it is seen.
    Returns a (hdfs, hbase, spark, cassandra) tuple of counts.
    """
    seen_links = set()  # set: O(1) dedup instead of O(n) list scans
    hdfs_commit_num = 0
    hbase_commit_num = 0
    spark_commit_num = 0
    cassandra_commit_num = 0

    # 'with' guarantees the handle is closed; the original's close() sat
    # after the return statement and never executed.
    with open(file) as f:
        for line in f:
            vals = line.split(",")
            link = vals[commit_link_col]
            if link in seen_links:
                continue
            seen_links.add(link)
            system = vals[0].lower()
            if "hdfs" in system:
                hdfs_commit_num += 1
            if "hbase" in system:
                hbase_commit_num += 1
            if "spark" in system:
                spark_commit_num += 1
            if "cassandra" in system:
                cassandra_commit_num += 1
    return hdfs_commit_num, hbase_commit_num, spark_commit_num, cassandra_commit_num
27 |
def simple_count_by_keyword(file, keywords, keyword_col):
    """Count CSV rows whose keyword column contains any of *keywords*.

    file: path to a CSV file.
    keywords: iterable of lowercase keywords.
    keyword_col: index of the column matched (case-insensitively) against
        each keyword as a substring.
    Returns the number of matching rows.
    """
    count = 0
    # 'with' closes the handle; the original never closed this file at all.
    with open(file) as f:
        for line in f:
            cell = line.split(",")[keyword_col].lower()
            if any(keyword in cell for keyword in keywords):
                count += 1
    return count
40 |
41 |
def count_by_keyword(file, commit_link_col, keywords, keyword_col):
    """Count distinct commits per system whose keyword column matches.

    file: path to a CSV whose first column names the system.
    commit_link_col: index of the commit-link column (deduplication key).
    keywords: iterable of lowercase keywords; a row matches when any keyword
        is a substring of its keyword column (case-insensitive).
    keyword_col: index of the column tested against the keywords.
    Returns a (hdfs, hbase, spark, cassandra) tuple of counts.
    """
    seen_links = set()  # set: O(1) dedup instead of O(n) list scans
    hdfs_commit_num = 0
    hbase_commit_num = 0
    spark_commit_num = 0
    cassandra_commit_num = 0

    # 'with' guarantees the handle is closed; the original's close() sat
    # after the return statement and never executed.
    with open(file) as f:
        for line in f:
            vals = line.split(",")
            cell = vals[keyword_col].lower()
            if not any(keyword in cell for keyword in keywords):
                continue
            link = vals[commit_link_col]
            if link in seen_links:
                continue
            seen_links.add(link)
            system = vals[0].lower()
            if "hdfs" in system:
                hdfs_commit_num += 1
            if "hbase" in system:
                hbase_commit_num += 1
            if "spark" in system:
                spark_commit_num += 1
            if "cassandra" in system:
                cassandra_commit_num += 1
    return hdfs_commit_num, hbase_commit_num, spark_commit_num, cassandra_commit_num
69 |
70 |
def print_simple_count(category, file, commit_link_col):
    """Print per-system distinct-commit counts for *file* plus their total."""
    counts = simple_count(file, commit_link_col)
    total = counts[0] + counts[1] + counts[2] + counts[3]
    print(category + ' ' + ' '.join(str(n) for n in counts) + ' ' + str(total))
76 |
def print_count_by_keyword(category, file, commit_link_col, keywords, keyword_col):
    """Print per-system keyword-filtered commit counts for *file* plus their total."""
    counts = count_by_keyword(file, commit_link_col, keywords, keyword_col)
    total = counts[0] + counts[1] + counts[2] + counts[3]
    print(category + ' ' + ' '.join(str(n) for n in counts) + ' ' + str(total))
82 |
#just for table VI
def print_commit_and_param_num(category, file, commit_link_col, keywords, keyword_col):
    """Print the total distinct-commit count and the raw matching-row count."""
    commit_counts = count_by_keyword(file, commit_link_col, keywords, keyword_col)
    total_commits = commit_counts[0] + commit_counts[1] + commit_counts[2] + commit_counts[3]
    param_num = simple_count_by_keyword(file, keywords, keyword_col)
    print(category + " " + str(total_commits) + " " + str(param_num))
88 |
89 | print("##########################################################")
90 | print("Table IV")
91 | print("INTERFACE is caculated by adding up AddParam, RemoveParam and ModifyParam")
92 | print("BEHAVIOR is caculated by adding up Parse, Check, Handle and Use")
93 | print("DOCUMENT is caculated by adding up User Manual and Code Comments")
94 | print("##########################################################")
95 | print("Table V")
96 | print("AddParam, RemoveParam and ModifyParam are calculated by adding their sub-categories")
97 | print_count_by_keyword("AddNewCode", "add_param.csv",1,{"new"},2)
98 | print_count_by_keyword("AddCodeChange", "add_param.csv",1,{"change"},2)
99 | print_simple_count("AddParameterization","parameterization.csv",4)
100 | print_simple_count("RmvModule","rmv_with_code.csv",1)
101 | print_simple_count("RmvReplace","rmv_replace.csv",3)
102 | print_simple_count("ModNaming","param_rename.csv",2)
103 | print_simple_count("ModDefualtValue","change_default_value.csv",4)
104 | print_simple_count("ModConstraint","change_param_constraint.csv",4)
105 | print("##########################################################")
106 | print("Table VI")
107 | print_commit_and_param_num("Performance","parameterization.csv",4,{"performance"},6)
108 | print_commit_and_param_num("Reliability","parameterization.csv",4,{"reliability"},6)
109 | print_commit_and_param_num("Manageability","parameterization.csv",4,{"manageability"},6)
110 | print_commit_and_param_num("Debugging","parameterization.csv",4,{"debug"},6)
111 | print_commit_and_param_num("Environment","parameterization.csv",4,{"env"},6)
112 | print_commit_and_param_num("Compatibility","parameterization.csv",4,{"compatibility"},6)
113 | print_commit_and_param_num("Testability","parameterization.csv",4,{"testability"},6)
114 | print_commit_and_param_num("Security","parameterization.csv",4,{"security"},6)
115 | print("##########################################################")
116 | print("Table VIII")
117 | print("Handle and Use are calculated by adding their sub-categories")
118 | print_simple_count("Parse","config_parsing.csv",1)
119 | print_count_by_keyword("Check","checking_and_handling_code.csv",4,{"check"},5)
120 | print_count_by_keyword("HandleAction ","checking_and_handling_code.csv",4,{"exception"},5)
121 | print_count_by_keyword("HandleMessage","checking_and_handling_code.csv",4,{"message"},5)
122 | print_simple_count("UseChange","change_param_existing_usage.csv",4)
123 | print_simple_count("UseAdd","param_new_use.csv",4)
124 | print("##########################################################")
125 | print("Documentation")
126 | print_count_by_keyword("User Manual","change_documentation.csv",4,{"file","guide","command","description"},6)
127 | print_count_by_keyword("Code Comments","change_documentation.csv",4,{"code comment"},6)
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
--------------------------------------------------------------------------------
/code/spark/extract_commit.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | import random
3 | import sys
4 | import diff_file_parser
5 | from download_diff import DIFF_FILE_PATH
6 | import nltk
7 | from nltk.stem.porter import PorterStemmer
8 |
9 |
def extract(commit_info, configVariableList):
    """Screen one commit for configuration relevance and log candidates.

    commit_info is a '$$$'-separated record: "<url>$$$<title>[$$$<time>]".
    configVariableList is the list of known configuration variable names,
    forwarded to diff_file_parser.diff_selection().

    A commit is a candidate when it is not a merge/checkstyle commit and
    either its title mentions a configuration keyword or its diff touches
    configuration code.  Each candidate is appended as a report section to
    'commit_selected.txt'.  Returns None.
    """
    fields = commit_info.split('$$$')
    commit_url = fields[0]
    commit_title = fields[1]
    # Some records carry no timestamp; default to '' so the report below
    # never raises NameError (bug fix: commit_time used to be left
    # undefined when the record had only two fields).
    commit_time = fields[2] if len(fields) >= 3 else ''
    commit_sha = commit_url.split('/')[-1]

    desc_contain_keyword = False  # commit title mentions a config keyword
    irrelevant_commit = False     # merge/checkstyle commit, skip it
    diff_contain_config = False   # diff touches configuration code

    # Strip the GitHub PR-template boilerplate from the title.
    # Bug fix: str.replace() returns a new string; the old code discarded
    # the result, so the boilerplate was never actually removed.
    commit_title = commit_title.replace(
        '## What changes were proposed in this pull request?', '')

    # Scan the first 21 title words for configuration keywords, marking
    # every hit in the title as **UPPERCASE**.  (An unused PorterStemmer
    # call per word was removed — its result was never read.)
    for count, word in enumerate(commit_title.split(' ')):
        if count > 20:
            break
        lowered = word.lower()
        if lowered in ('option', 'parameter') or 'config' in lowered:
            commit_title = commit_title.replace(word, "**" + word.upper() + "**")
            desc_contain_keyword = True

    # Merge/checkstyle commits are never configuration candidates.
    title_words = commit_title.lower().split(' ')
    for noise in ('merge', 'merging', 'checkstyle'):
        if noise in title_words:
            irrelevant_commit = True
            break

    codeResult = []
    diff_file_path = DIFF_FILE_PATH + '/' + commit_sha + '.diff'
    if not irrelevant_commit:
        codeResult = diff_file_parser.diff_selection(diff_file_path, configVariableList)

    # What the diff touches, one flag per category ...
    configDocTouched = False       # configuration doc
    configBuildTouched = False     # configuration build function
    configLoadTouched = False      # configuration load function
    configSetTouched = False       # configuration set function
    configVariableTouched = False  # configuration variable (data flow)
    configMessageTouched = False   # configuration message keyword
    # ... and the corresponding collections of touched entities (kept
    # empty when the parser returned nothing, so the loops below are safe).
    touchedBuildFunc = set()
    touchedConfigLoadFunc = set()
    touchedConfigSetFunc = set()
    touchedVariable = set()
    touchedMessage = set()

    if codeResult:
        # diff_selection() returns an 11-element result: six flags then
        # five collections, in this order.
        (configDocTouched, configBuildTouched, configLoadTouched,
         configSetTouched, configVariableTouched, configMessageTouched,
         touchedBuildFunc, touchedConfigLoadFunc, touchedConfigSetFunc,
         touchedVariable, touchedMessage) = codeResult

        if any((configDocTouched, configBuildTouched, configLoadTouched,
                configSetTouched, configVariableTouched, configMessageTouched)):
            diff_contain_config = True

    if not irrelevant_commit and (desc_contain_keyword or diff_contain_config):
        print("find a candidate commit!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        # 'with' guarantees the handle is closed even if a write fails
        # (bug fix: the old code closed it manually).
        with open('commit_selected.txt', 'a') as file:

            def write_section(header, touched, items):
                # One report section: header, then the touched items,
                # or 'Null' when nothing in this category was touched.
                file.write(header)
                if touched:
                    for item in items:
                        file.write(item)
                else:
                    file.write('Null\n')

            file.write("###############################################################################" + '\n')
            file.write(commit_title + '\n' + commit_url + '\n' + commit_time + '\n')
            file.write('Commit message touches config:' + str(desc_contain_keyword) + '\n')
            file.write('Diff touches config define(doc):' + str(configDocTouched) + '\n')
            file.write('Diff touches config define(buildFunc):' + str(configBuildTouched) + '\n')
            file.write('Diff touches config loading:' + str(configLoadTouched) + '\n')
            file.write('Diff touches config setting:' + str(configSetTouched) + '\n')
            file.write('Diff touches config variable (data flow):' + str(configVariableTouched) + '\n')
            file.write('Diff touches config message:' + str(configMessageTouched) + '\n')

            write_section('\n_________________touchedConfigDefine(Doc)_____________________\n\n',
                          configDocTouched, ['configuration.md ', '\n'])
            write_section('\n_________________touchedConfigDefine(Build)_____________________\n\n',
                          configBuildTouched,
                          [func + ' ' for func in touchedBuildFunc] + ['\n'])
            write_section('\n___________________touchedConfigLoad___________________\n\n',
                          configLoadTouched,
                          [func + '\n' for func in touchedConfigLoadFunc])
            write_section('\n___________________touchedConfigSet____________________\n\n',
                          configSetTouched,
                          [func + '\n' for func in touchedConfigSetFunc])
            write_section('\n___________________touchedConfigVariable_____________________\n\n',
                          configVariableTouched,
                          [var + '\n' for var in touchedVariable])
            write_section('\n_______________________touchedMessage________________________\n\n',
                          configMessageTouched,
                          ['"' + keyword + '"' + '\n' for keyword in touchedMessage])

            file.write('\n')
161 |
162 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 |
173 |
--------------------------------------------------------------------------------
/commit_analysis/config_parsing.csv:
--------------------------------------------------------------------------------
1 | Issue ID,commit-URL,Type,
2 | CASSANDRA-14800,https://github.com/apache/cassandra/commit/bd0cef9a369ae9245b45040796a6e10f51e522ce#,change load way,
3 | HBASE-19619,https://github.com/apache/hbase/commit/41c2dd04da21bb76208f04af104df2e2f444970d#,change load way,
4 | HBASE-21568,https://github.com/apache/hbase/commit/67d6d5084cf8fc094cda4bd3f091d8a0a9cb1d3e#,change load way,
5 | SPARK-17920,https://github.com/apache/spark/commit/e0d7665cec1e6954d640f422c79ebba4c273be7d#,change load way,
6 | SPARK-21839,https://github.com/apache/spark/commit/d8f45408635d4fccac557cb1e877dfe9267fb326#,change load way,
7 | SPARK-21840,https://github.com/apache/spark/commit/3073344a2551fb198d63f2114a519ab97904cb55#,change load way,
8 | SPARK-22151,https://github.com/apache/spark/commit/1272b2034d4eed4bfe60a49e1065871b3a3f96e0#,change load way,
9 | SPARK-22219,https://github.com/apache/spark/commit/bbdcc3bf61da39704650d4570c6307b5a46f7100,change load way,
10 | SPARK-22372,https://github.com/apache/spark/commit/e1dd03e42c2131b167b1e80c761291e88bfdf03f#,change load way,
11 | SPARK-23207,https://github.com/apache/spark/commit/dad2d826ae9138f06751e5d092531a9e06028c21#,change load way,
12 | SPARK-23514,https://github.com/apache/spark/commit/476a7f026bc45462067ebd39cd269147e84cd641#,change load way,
13 | SPARK-23514,https://github.com/apache/spark/commit/dea381dfaa73e0cfb9a833b79c741b15ae274f64#,change load way,
14 | SPARK-23640,https://github.com/apache/spark/commit/ae9172017c361e5c1039bc2ca94048117021974a#,change load way,
15 | SPARK-24518,https://github.com/apache/spark/commit/33e77fa89b5805ecb1066fc534723527f70d37c7#,change load way,
16 | SPARK-24680,https://github.com/apache/spark/commit/4d693ac904d89b3afeba107eb0480120daf78174#,change load way,
17 | SPARK-26192,https://github.com/apache/spark/commit/5fd4d7499c9f2925268d84b5d74ecafaebe2113d#,change load way,
18 | SPARK-27253,https://github.com/apache/spark/commit/fc9aad0957fa98ce7a1af2ba529a476b33eebd0e#,change load way,
19 | SPARK-28907,https://github.com/apache/spark/commit/ca711778683a16999560cbdd7c61d98ad6bde6d,change load way,
20 | SPARK-26598,https://github.com/apache/spark/commit/962e330955581aea032ff336a12f23374c39e67,change load way,
21 | SPARK-28939,https://github.com/apache/spark/commit/ca6f693ef17ccb27a6ef5bdad9141abb2fe0434,change load way,
22 | SPARK-29326,https://github.com/apache/spark/commit/91747bd91b410e2d3b7556d0d595fb8e42e4c6d,change load way,
23 | SPARK-29530,https://github.com/apache/spark/commit/484f93e25506f84d1548504783be9ce940149bb,change load way,
24 | SPARK Pull Request #25273,https://github.com/apache/spark/commit/fbaa177d2ac19501add708cc7f28e18d30ca15f,change load way,
25 | SPARK-28642,https://github.com/apache/spark/commit/d19a56f9dbef4c995d80d4b46d03bfbfa4843c5,change load way,
26 | SPARK-28675,https://github.com/apache/spark/commit/47af8925b60509d2a2c932e2bcf25394721c6f1,change load way,
27 | HBASE-23212,https://github.com/apache/hbase/commit/10cc64a7d690429174405517b3e7d75e4f0998f,change load way,
28 | SPARK-27555,https://github.com/apache/spark/commit/c66ec439456c5a160e3849e23c2ce3970d4c6ec7#,fall back place,
29 | HBASE-17356,https://github.com/apache/hbase/commit/db66e6cc9e1c6ea027631388aba688cb623b7d0a#,hard-coded name,
30 | HBASE-19672,https://github.com/apache/hbase/commit/0cd6050d090d11240a40c012716b3d747fbcb58f#,hard-coded name,
31 | HBASE-21663,https://github.com/apache/hbase/commit/fbf79373e649d7cf3926b873e426fe0121f078c6#,hard-coded name,
32 | HDFS-11345,https://github.com/apache/hadoop/commit/2c769167dbdb66c52d2ba7b7193a686444085570#,hard-coded name,
33 | SPARK-16944,https://github.com/apache/spark/commit/4329eb2e73181819bb712f57ca9c7feac0d640ea#,hard-coded name,
34 | SPARK-19558,https://github.com/apache/spark/commit/bd4eb9ce57da7bacff69d9ed958c94f349b7e6fb#,hard-coded name,
35 | SPARK-20642,https://github.com/apache/spark/commit/74daf622de4e534d5a5929b424a6e836850eefad#,hard-coded name,
36 | SPARK-21428,https://github.com/apache/spark/commit/581200af717bcefd11c9930ac063fe53c6fd2fde#,hard-coded name,
37 | SPARK-22050,https://github.com/apache/spark/commit/1437e344ec0c29a44a19f4513986f5f184c44695#,hard-coded name,
38 | SPARK-24665,https://github.com/apache/spark/commit/8f91c697e251423b826cd6ac4ddd9e2dac15b96e#,hard-coded name,
39 | SPARK-25300,https://github.com/apache/spark/commit/ca861fea21adc4e6ec95eced7076cb27fc86ea18#,hard-coded name,
40 | SPARK-26443,https://github.com/apache/spark/commit/e6d3e7d0d8c80adaa51b43d76f1cc83bb9a010b9#,hard-coded name,
41 | SPARK-26463,https://github.com/apache/spark/commit/7bf0794651f4d11547325539ebf7131a57ee1ba2#,hard-coded name,
42 | SPARK-26470,https://github.com/apache/spark/commit/b1a9b5eff59f64c370cd7388761effdf2152a108#,hard-coded name,
43 | SPARK-26477,https://github.com/apache/spark/commit/64cc9e572e0213d5dea241b2b48ecdd68a5c6c99#,hard-coded name,
44 | SPARK-26698,https://github.com/apache/spark/commit/aa3d16d68b7ebd9210c330905f01590ef93d875c#,hard-coded name,
45 | SPARK-27141,https://github.com/apache/spark/commit/8204dc1e548b87aabaf36c5800592bafd44e4419#,hard-coded name,
46 | SPARK-27184,https://github.com/apache/spark/commit/68abf77b1ad8da7916a9dc5fa8bb350b64479410#,hard-coded name,
47 | SPARK-27343,https://github.com/apache/spark/commit/5a8aad01c2aaf0ceef8e9a3cfabbd2e88c8d9f0d#,hard-coded name,
48 | SPARK-27649,https://github.com/apache/spark/commit/8329e7debdaf6db9f3a52094bbc5dc4c1e2771ea#,hard-coded name,
49 | SPARK-27844,https://github.com/apache/spark/commit/447bfdec830ba5eaaee791e86caad39f4f6661eb#,hard-coded name,
50 | SPARK-28257,https://github.com/apache/spark/commit/42b80ae128ab1aa8a87c1376fe88e2cde52e6e4,hard-coded name,
51 | SPARK-25694,https://github.com/apache/spark/commit/8469614c0513fbed87977d4e741649db3fdd8ad,hard-coded name,
52 | HBASE-22859,https://github.com/apache/hbase/commit/018396d84cfe1008308f341562154452f4a45ac,hard-coded name,
53 | SPARK-21786,https://github.com/apache/spark/commit/00d169156d4b1c91d2bcfd788b254b03c509dc41#,load overridden,
54 | CASSANDRA-13614,https://github.com/apache/cassandra/commit/613a8b43d2b5a425080653898b28bde6cd7eb9ba#,refine API,
55 | CASSANDRA-13699,https://github.com/apache/cassandra/commit/cf4a0576a6f2b8f2d828a8b14140f212803adb7c#,refine API,
56 | HBASE-19621,https://github.com/apache/hbase/commit/1556939236016bb51e45ffa1e8038c74e0f0db75#,refine API,
57 | HBASE-21492,https://github.com/apache/hbase/commit/7877e09b6023c80e8bacd25fb8e0b9273ed7d258#,refine API,
58 | HDFS-13222,https://github.com/apache/hadoop/commit/88fba00caa8c8e26f70deb9be5b534e7482620a1#,refine API,
59 | SPARK-24003,https://github.com/apache/spark/commit/007ae6878f4b4defe1f08114212fa7289fc9ee4a#,refine API,
60 | SPARK-24250,https://github.com/apache/spark/commit/dd37529a8dada6ed8a49b8ce50875268f6a20cba#,refine API,
61 | SPARK-24782,https://github.com/apache/spark/commit/e008ad175256a3192fdcbd2c4793044d52f46d57#,refine API,
62 | SPARK-26384,https://github.com/apache/spark/commit/3c0bb6bc45e64fd82052d7857f2a06c34f0c1793#,refine API,
63 | HBASE-20856,https://github.com/apache/hbase/commit/1d0fca370bf56a41fc62b72bebe86a7185a2b0c2#,refine API,
64 | HBASE-21203,https://github.com/apache/hbase/commit/0e173d38b05363e1fb5c85955a4964f05958c1fc#,refine API,
65 | HDFS-14051,https://github.com/apache/hadoop/commit/f0ce072934515d39e9bf61744058ecad3393291e#,refine API,
66 | SPARK-28840,https://github.com/apache/spark/commit/7e6142591f3bc865806b86c7a7b90be008a319d,refine API,
67 | SPARK-28957,https://github.com/apache/spark/commit/d8b0914c2e0fdee72a3b9abb2d65283e22b6e8e,refine API,
68 | SPARK-10614,https://github.com/apache/spark/commit/857f109c47b26a38f5d114a94f94c516177db3f,refine API,
69 | SPARK-30195,https://github.com/apache/spark/commit/33f53cb2d51b62f4c294c8640dc069e42f36d68,refine API,
70 | SPARK-29158,https://github.com/apache/spark/commit/bd05339171db00c2f2dd89702f9500ed6e1e321,refine API,
71 | SPARK-28922,https://github.com/apache/spark/commit/d502c80404c398d852dfa5f86a0e87c104a6286,refine API,
72 | HBASE-23379,https://github.com/apache/hbase/commit/c39339c0046560b8f2083af513f384127e3f46d,refine API,
73 | CASSANDRA-15277,https://github.com/apache/cassandra/commit/860de83a02f3b7711e842a58a073802b9920a1a1,refine API,
74 | HBASE-20879,https://github.com/apache/hbase/commit/2997b6d0714d5542784baf830e7c16a9ef6b62d6#,refine API (sensitive),
75 | CASSANDRA-14716,https://github.com/apache/cassandra/commit/cdeac4992bdb1f569c3a04b628ded7e5351364ee#,refine API(case sensitive),
76 | SPARK-25415,https://github.com/apache/spark/commit/d522a563ad5ab157993a19f406a3cc6f443ccb9e#,refine API(case sensitive),
77 | HDFS-14039,https://github.com/apache/hadoop/commit/8d99648c203004045a9339ad27258092969145d6#,refine API(trimmed),
78 | HBASE-21639,https://github.com/apache/hbase/commit/6da0b4ec34727240e433825382cfc30366340097#,refine API(Unit),
79 | HDFS-12085,https://github.com/apache/hadoop/commit/3a7f02b81520ad4d3eebf92e9dbca662beec0302#,refine API(Unit),
80 | SPARK-21033,https://github.com/apache/spark/commit/083cf223569b7896e35ff1d53a73498a4971b28d#,refine API(Unit),
81 | SPARK-24332,https://github.com/apache/spark/commit/53c06ddabbdf689f8823807445849ad63173676f#,refine API(Unit),
82 | SPARK-24452,https://github.com/apache/spark/commit/90da7dc241f8eec2348c0434312c97c116330bc4#,refine API(Unit),
83 | CASSANDRA-14314,https://github.com/apache/cassandra/commit/11496039fb18bb45407246602e31740c56d28157#,wrong API,
84 | CASSANDRA-15019,https://github.com/apache/cassandra/commit/99ce007c5beb7988ce83fb1443a1e0ca259264cc#,wrong API,
85 | SPARK-29015,https://github.com/apache/spark/commit/cc852d4eec696731cef9ddd6fb0c0c2184194f6,wrong API,
86 | SPARK-28331,https://github.com/apache/spark/commit/c88df2ccf670db62aed6565c9dbdb58d5d5cca3,wrong API,
--------------------------------------------------------------------------------
/commit_analysis/rmv_replace.csv:
--------------------------------------------------------------------------------
1 | Issue-ID,Title,Parameter,Issue-URL,Commit-URL,Change type,Param type,Pattern,Note
2 | HBASE-18786,FileNotFoundException should not be silently handled for primary region replicas,hbase.hregion.unassign.for.fnfe,https://issues.apache.org/jira/browse/HBASE-18786,https://github.com/apache/hbase/commit/b27f9b582a858fba66036413936debad27737c3a,hard-coded logic,bool,make feature mandatory,"this is not something that should be parameterized. We either do it or we don't. Otherwise it becomes an obscure setting that could lead to serious conditions if an operator changes it to the non-default value, which we know won't be well tested.For me, FNFE should not happen and if it happens then there must be serious bugs that may cause data loss.That's why I introduce a config, the intention is to disable the feature as we used to always handle it silently..."
3 | HBASE-19999,Remove the SYNC_REPLICATION_ENABLED flag,hbase.replication.sync.enabled,https://issues.apache.org/jira/browse/HBASE-19999,https://github.com/apache/hbase/commit/c7d1085fa27a64621d262aefea825e980e6bc576,hard-coded logic,bool,make feature mandatory,It is a bit strange since we can not guard all the sync replication related code with it. We'd better change its name and only use it within the WAL construction. Now the default case will use SyncReplicationWALProvider.only disable SyncReplicationWALProvider for HMaster or HRegionServer which take system table only.
4 | HBASE-8518,Get rid of hbase.hstore.compaction.complete setting,hbase.hstore.compaction.complete,https://issues.apache.org/jira/browse/HBASE-8518,https://github.com/apache/hbase/commit/a21eb68f9584e69157fed683cc512ee3e8963dfb,hard-coded logic,bool,make feature mandatory,hbase.hstore.compaction.complete is a strange setting that causes the finished compaction to not complete (files are just left in tmp) in HStore. Looks like a flag which allow compacted files to be created but not used. May be someone who wants to see the time /size of compaction without affecting the stores. Does not seem very useful.
5 | SPARK-23366,Improve hot reading path in ReadAheadInputStream,spark.unsafe.sorter.spill.read.ahead.fraction,https://issues.apache.org/jira/browse/SPARK-23366,https://github.com/apache/spark/commit/7539ae59d6c354c95c50528abe9ddff6972e960f,hard-coded logic,bool,make feature mandatory,"Remove `readAheadThresholdInBytes` and instead immediately trigger async read when switching the buffers. It allows to simplify code paths, especially the hot one that then only has to check if there is available data in the active buffer, without worrying if it needs to retrigger async read. It seems to have positive effect on perf."
6 | SPARK-26362,Remove 'spark.driver.allowMultipleContexts' to disallow multiple Spark contexts,spark.driver.allowMultipleContexts,https://issues.apache.org/jira/browse/Spark-26362,https://github.com/apache/spark/commit/9ccae0c9e7d1a0a704e8cd7574ba508419e05e30,hard-coded logic,bool,make feature mandatory,"Multiple SparkContexts are discouraged and it has been warning for last 4 years, see SPARK-4180. It could cause arbitrary and mysterious error cases, see SPARK-2243. Honestly, I didn't even know Spark still allows it, which looks never officially supported, see SPARK-2243."
7 | SPARK-27938,Remove feature flag LEGACY_PASS_PARTITION_BY_AS_OPTIONS,LEGACY_PASS_PARTITION_BY_AS_OP,https://issues.apache.org/jira/browse/Spark-27938,https://github.com/apache/spark/commit/eee3467b1ea674a64a3c70775cfbf2710318993e,hard-coded logic,bool,make feature mandatory,"To make this change less intrusive for a patch release, we added a feature flag `LEGACY_PASS_PARTITION_BY_AS_OPTIONS` with the default to be false. For 3.0, we should just do the correct behavior for DSV1, i.e., always passing partitionBy as options, and remove this legacy feature flag."
8 | SPARK-28699,Cache an indeterminate RDD could lead to incorrect result while stage rerun,SQLConf.get.enableRadixSort,https://issues.apache.org/jira/browse/SPARK-28699,https://github.com/apache/spark/commit/2d9cc42aa83beb5952bb44d3cd0327d4432d385,hard-coded logic,bool,make feature mandatory,"After further investigation, we found that this bug is nothing to do with cache operation. So we focus on the sort + shuffle self and finally found the root cause is about the wrong usage for radix sort."
9 | HBASE-22760,Stop/Resume Snapshot Auto-Cleanup activity with shell command,hbase.master.cleaner.snapshot.disable,https://issues.apache.org/jira/browse/HBASE-22760,https://github.com/apache/hbase/commit/1dcc8ee50cd2120496ec768e09e7f368b6bc26b,hard-coded logic,bool,make feature mandatory,"For any scheduled snapshot backup activity, we would like to disable auto-cleaner for snapshot based on TTL. However, as per HBASE-22648 we have a config to disable snapshot auto-cleaner: hbase.master.cleaner.snapshot.disable, which would take effect only upon HMaster restart just similar to any other hbase-site configs."
10 | CASSANDRA-14108,Improve commit log chain marker updating,commitlog_marker_period_in_ms,https://issues.apache.org/jira/browse/CASSANDRA-14108,https://github.com/apache/cassandra/commit/db788fe860dfd69f06ab97ae35fa67fcf2517b6d,hard-coded value,time,using 100,"Instead of requiring users to configure a deep, dark implementation detail like the commit log chained markers (via commitlog_marker_period_in_ms in the yaml), we decided it is best to eliminate thew configuration and always update the chained markers (when in periodic mode). I've removed the confusing (and confusingly described) yaml property for setting the commitlog_marker_period_in_ms. Instead, I've hardcoded the marker interval to 100ms and it is always applied when a) using periodic mode, and b) not using compression or encryption."
11 | HBASE-19282,Making CellChunkMap the default index,hbase.hregion.compacting.memstore.index,https://issues.apache.org/jira/browse/HBASE-19282,https://github.com/apache/hbase/commit/8d0da1a77f50b730b366c28b5b477141aa83cc55,hard-coded value,index,using original default value,In order to avoid additional user settings. If no MSLAB is requested the index is going to be CellArrayMap
12 | HDFS-12412,Change ErasureCodingWorker.stripedReadPool to cached thread pool.,dfs.datanode.ec.reconstruction.stripedread.threads,https://issues.apache.org/jira/browse/HDFS-12412,https://github.com/apache/hadoop/commit/123342cd0759ff88801d4f5ab10987f6e3f344b0,hard-coded value,thread number,using Integer.MAX_VALUE,"The idea to remove the striped read pool and reuse the same reconstruction pool sounds good to me, since given the later and the most often used erasure codec, we can roughly estimate the striped read threads need. We can also simplify the configuration and codes. Less configuration with reasonable defaults would make the brand feature more easier to use. When needed, we can fine-tune and add more later."
13 | HDFS-12775,[READ] Fix reporting of Provided volumes,dfs.provided.df.class,https://issues.apache.org/jira/browse/HDFS-12775,https://github.com/apache/hadoop/commit/3b1d30301bcd35bbe525a7e122d3e5acfab92c88,hard-coded value,class implementation,using original default value,"The capacity (and dfs used) of a PROVIDED volume on a DN is reported to be equal to the total size of the data (in bytes) mounted from the remote storage. Each volume reports zero available capacity (thus 100% usage). This included changes to ProvidedVolumeImpl, and adding a default ProvidedVolumeDFimplementation and removing the earlier configurable ProvidedVolumeDF interface."
14 | SPARK-25704,Allocate a bit less than Int.MaxValue,spark.storage.memoryMapLimitForTests,https://issues.apache.org/jira/browse/SPARK-25704,https://github.com/apache/spark/commit/43717dee570dc41d71f0b27b8939f6297a029a02,hard-coded value,maxChunkSize,using Integer.MAX_VALUE - 15,"Replicating a block > 2GB currently fails because it tries to allocate a bytebuffer that is just a bit too large, due to a bad default config. MEMORY_MAP_LIMIT_FOR_TESTS defaults to Integer.MAX_VALUE, but unfortunately that is just a tiny bit too big. Workaround: Set to ""spark.storage.memoryMapLimitForTests"" something a bit smaller, eg. 2147483135 (that's Integer.MAX_VALUE - 512, just in case its a bit different on other systems)."
15 | CASSANDRA-13990,Remove obsolete OldNetworkTopologyStrategy,replication_factor_strategies,https://issues.apache.org/jira/browse/CASSANDRA-13990,https://github.com/apache/cassandra/commit/7c5904753f4ede492f1a5a5e68edfe37651a5be6,hard-coded value,class implementation,using original default value,RackAwareStrategy was renamed OldNetworkTopologyStrategy back in 0.7 (CASSANDRA-1392) and it's still around.
16 | HBASE-16894,"Create more than 1 split per region, generalize HBASE-12590",hbase.mapreduce.input.autobalance.maxskewratio,https://issues.apache.org/jira/browse/HBASE-16894,https://github.com/apache/hbase/commit/16d483f9003ddee71404f37ce7694003d1a18ac4,program control,ratio,using better feature,"If we want to fix this properly, we should extend the approach in HBASE-12590, and make it so that the client can specify the desired num of mappers, or desired split size, and the TIF generates the splits based on the current region sizes very similar to the algorithm in HBASE-12590, but a more generic way. This also would eliminate the hand tuning of data skew ratio."
17 | HBASE-19616,Review of LogCleaner Class,hbase.oldwals.cleaner.thread.check.interval.msec,https://issues.apache.org/jira/browse/HBASE-19616,https://github.com/apache/hbase/commit/af923225d0a874ecf3c7deddbc0d7bc82184e1d1,program control,interval,using better feature,Using a CountDownLatch allows one or more threads to wait until a set of operations being performed in other threads completes. It will not blindly sleep between checks and it will return immediately after the condition is met. This removes the HBase configuration that controls the sleep interval.
18 | HBASE-21228,Memory leak since AbstractFSWAL caches Thread object and never clean later,REGION_SERVER_HANDLER_COUNT,https://issues.apache.org/jira/browse/HBASE-21228,https://github.com/apache/hbase/commit/86cb8e48ad8aecf52bca1169a98607c76198c70b,program control,thread number,using better feature,"In one of our customer's cluster, we noticed that even though there is no requests, the heap of the RS is almost full and CMS GC was triggered every second. We dumped the heap and then found out there were more than 30 thousands threads with Terminated state. which are all cached in this map above. Everything referenced in these threads were leaked."
--------------------------------------------------------------------------------
/commit_analysis/rmv_with_code.csv:
--------------------------------------------------------------------------------
1 | ,link,Title,change mode,param_name
2 | HDFS-12414,https://github.com/apache/hadoop/commit/e0b3c644e186d89138d4174efe0cbe77a0200315,Ensure to use CLI command to enable/disable erasure coding policy,remove param with code,dfs.namenode.ec.policies.enabled
3 | HDFS-14401,https://github.com/apache/hadoop/commit/9b0aace1e6c54f201784912c0b623707aa82b761,Refine the implementation for HDFS cache on SCM,remove param with code,dfs.datanode.cache.loader.class dfs.datanode.cache.pmem.capacity
4 | HDFS-14730,https://github.com/apache/hadoop/commit/30ed24a42112b3225ab2486ed24bd6a5011a7a7, Removed unused **CONFIGUR**ation dfs.web.authentication.filter.,Rmv.RmvModule,dfs.web.authentication.filter
5 | HBASE-18369,https://github.com/apache/hbase/commit/bbf23d9627849c32ee6914c1350da02bceba5127,hbase thrift web-ui not available,remove param with code,hbase.regionserver.thrift.port hbase.regionserver.thrift.server.type hbase.regionserver.thrift.compact hbase.regionserver.thrift.framed
6 | HBASE-18721,https://github.com/apache/hbase/commit/8a800c3f196fcbc3ed63f0967025c1779c43d486,Cleanup unused configs and private declaration,remove param with code,…
7 | HBASE-17972,https://github.com/apache/hbase/commit/5ff04c5e7fdf12946a3f0ae15ed7e83209f0e617,Remove mergePool from CompactSplitThread,remove param with code,hbase.regionserver.thread.merge
8 | HBASE-19073,https://github.com/apache/hbase/commit/dd70cc308158c435c6d8ec027e2435a29be4326b,Cleanup CoordinatedStateManager,remove param with code,hbase.coordinated.state.manager.class
9 | HBASE-19128,https://github.com/apache/hbase/commit/4132314f51951af43f4f56d9886233b3ba417903,"Purge Distributed Log Replay from codebase, configurations, text; mark the feature as unsupported, broken.",remove param with code,hbase.master.distributed.log.replay hbase.regionserver.disallow.writes.when.recovering zookeeper.znode.recovering.regions
10 | HBASE-19357,https://github.com/apache/hbase/commit/ba4f9f834948e6f042e771ae5ee016610afe928c,Bucket cache no longer L2 for LRU cache.,remove param with code,hbase.bucketcache.combinedcache.enabled
11 | HBASE-19148,https://github.com/apache/hbase/commit/4d6b928682cc2a17f3dfd0179fb3fd46fd9e0a1f,Reevaluate default values of **CONFIG**urationsRemoved unused: hbase.fs.tmp.dirAdded hbase.master.loadbalance.bytableEdit of description text. Moved stuff around to put **CONFIG**s beside eachother.M hbase-server/src/main/java/org/apache/hadoop/hbase/util/ServerCommandLine.java Emit some hbase **CONFIG**s in log on startup,remove param with code,hbase.fs.tmp.dir
12 | HBASE-19618,https://github.com/apache/hbase/commit/2ce5dc892710666c9a382fdeece412ecbb8559bb,Remove replicationQueuesClient.class/replicationQueues.class **CONFIG** and remove table based ReplicationQueuesClient/ReplicationQueues implementation,remove param with code,hbase.region.replica.replication.replicationQueuesClient.class
13 | HBASE-19617,https://github.com/apache/hbase/commit/f4703c6ed327f361df371312da8e8edb532048a1,"Remove ReplicationQueues, use ReplicationQueueStorage directly",remove param with code,hbase.replication.queues.createtable.retries.number
14 | HBASE-20000,https://github.com/apache/hbase/commit/c18e7a963d9c4dc862c4706f128a4e436111669c,"Remove the quantum logic in FairQueue, always put high priority queue in front",remove param with code,hbase.master.procedure.queue.meta.table.priority hbase.master.procedure.queue.system.table.priority hbase.master.procedure.queue.user.table.priority
15 | HBASE-21420,https://github.com/apache/hbase/commit/c8574ba3c52274ed5a93e46f7af30dd8b46fb878,Use procedure event to wake up the SyncReplicationReplayWALProcedures which wait for worker,remove param with code,zookeeper.znode.sync.replication.replaywal.workers
16 | HBASE-21792,https://github.com/apache/hbase/commit/7dc69b61287d66641f0ae3d251b1d106d2a00ccf,Mark HTableMultiplexer as deprecated and remove it in 3.0.0,remove param with code,…
17 | HBASE-22186,https://github.com/apache/hbase/commit/20f72f5e252233361ee474e58b4a8fef69926b8b,Remove usage of deprecated SnapshotDescriptionUtils fields,remove param with code,SNAPSHOT_TIMEOUT_MILLIS_KEY
18 | HBASE-22933,https://github.com/apache/hbase/commit/090c55f3ff40dea807dc7e67240f19dcafb3865,Do not need to kick reassign for rs group change any more (……#550),rmv RmvModule ,REASSIGN_WAIT_INTERVAL_KEY
19 | HBASE-23334,https://github.com/apache/hbase/commit/dbbba7932c2f3de8d25aa4f37be943bf07bbc46,The table-lock node of zk is not needed since HBASE-16786 (……#873),rmv RmvModule ,zookeeper.znode.tableLock
20 | HBASE-22971,https://github.com/apache/hbase/commit/b10b39ad0365b378bbf7a493c76501c77f73942,Deprecated RSGroupAdminEndpoint and make RSGroup feature ……always enabled (#595),rmv RmvModule ,hbase.rsgroup.grouploadbalancer.class
21 | SPARK-22487,https://github.com/apache/spark/commit/f7534b37ee91be14e511ab29259c3f83c7ad50af,[SQL][FOLLOWUP] still keep spark.sql.hive.version,rmv RmvModule ,spark.sql.hive.version
22 | SPARK-21253,https://github.com/apache/spark/commit/80f7ac3a601709dd9471092244612023363f54cd,Disable spark.reducer.maxReqSizeShuffleToMem,rmv RmvModule ,spark.reducer.maxReqSizeShuffleToMem
23 | SPARK-25876,https://github.com/apache/spark/commit/6be272b75b4ae3149869e19df193675cc4117763,Simplify configuration types in k8s backend,rmv RmvModule ,spark.kubernetes.python.pyFiles
24 | SPARK-13656,https://github.com/apache/spark/commit/e00f1a1da12be4a1fdb7b89eb5e098aa16c5c2c3,[SQL] Delete spark.sql.parquet.cacheMetadata from SQLConf and docs,rmv RmvModule ,spark.sql.parquet.cacheMetadata
25 | SPARK-20646,https://github.com/apache/spark/commit/11eea1a4ce32c9018218d4dfc9f46b744eb82991,[CORE] Port executors page to new UI backend.,rmv RmvModule ,spark.ui.timeline.executors.maximum spark.ui.retainedDeadExecutors
26 | SPARK-20648,https://github.com/apache/spark/commit/4741c07809393ab85be8b4a169d4ed3da93a4781,[CORE] Port JobsTab and StageTab to the new UI backend.,rmv RmvModule ,spark.ui.retainedJobs spark.ui.retainedStages
27 | SPARK-20652,https://github.com/apache/spark/commit/0ffa7c488fa8156e2a1aa282e60b7c36b86d8af8,[SQL] Store SQL UI data in the new app status store.,rmv RmvModule ,spark.sql.ui.retainedExecutions
28 | SPARK-22489,https://github.com/apache/spark/commit/8ff474f6e543203fac5d49af7fbe98a8a98da567,[CORE] Remove JobProgressListener.,rmv RmvModule ,spark.ui.retainedStages spark.ui.retainedJobs spark.ui.retainedTasks
29 | SPARK-22520,https://github.com/apache/spark/commit/087879a77acb37b790c36f8da67355b90719c2dc,[SQL] Support code generation for large CaseWhen,rmv RmvModule ,spark.sql.codegen.maxCaseBranches
30 | SPARK-22839,https://github.com/apache/spark/commit/f15906da153f139b698e192ec6f82f078f896f1e,[K8S] Remove the use of init-container for downloading remote dependencies,rmv RmvModule ,spark.kubernetes.mountDependencies.filesDownloadDir spark.kubernetes.mountDependencies.jarsDownloadDir spark.kubernetes.mountDependencies.timeout …
31 | SPARK-23361,https://github.com/apache/spark/commit/5fa438471110afbf4e2174df449ac79e292501f8,[YARN] Allow AM to restart after initial tokens expire.,rmv RmvModule ,spark.yarn.credentials.file.retention.count spark.yarn.credentials.file.retention.days spark.yarn.credentials.file spark.yarn.credentials.renewalTime spark.yarn.credentials.updateTime
32 | SPARK-23538,https://github.com/apache/spark/commit/508573958dc9b6402e684cd6dd37202deaaa97f6,[CORE] Remove custom **CONFIGUR**ation for SSL client.,rmv RmvModule ,spark.ssl.fs
33 | SPARK-25160,https://github.com/apache/spark/commit/60af2501e1afc00192c779f2736a4e3de12428fa,Avro: remove sql **CONFIGUR**ation spark.sql.avro.outp……utTimestampType,rmv RmvModule ,spark.sql.avro.outputTimestampType
34 | SPARK-25705,https://github.com/apache/spark/commit/703e6da1ecb52ab5b8f42b3b4cac39f27caa51d8,Remove Kafka 0.8 integration,rmv RmvModule ,spark.streaming.kafka.maxRetries
35 | SPARK-25711,https://github.com/apache/spark/commit/26c1b959cf29b8552beb715cc5d39288d5298bdc,Allow history server to show usage and remove deprecated options,rmv RmvModule ,spark.history.fs.logDirectory
36 | SPARK-25815,https://github.com/apache/spark/commit/4b3fe3a9ccc8a4a8eb0d037d19cb07a8a288e37a,"Support kerberos in client mode, keytab-based token renewal.",rmv RmvModule ,spark.kubernetes.executor.krb5ConfigMapName spark.kubernetes.kerberos.spark-user-name
37 | SPARK-26503,https://github.com/apache/spark/commit/51a6ba0181a013f2b62b47184785a8b6f6a78f12,Get rid of spark.sql.legacy.timeParser.enabled,rmv RmvModule ,spark.sql.legacy.timeParser.enabled
38 | SPARK-26539,https://github.com/apache/spark/commit/2f8a938805ce3c182d61bab8f66b9ff6d90dc83b,Remove spark.memory.useLegacyMode and StaticMemoryManager,rmv RmvModule ,spark.memory.useLegacyMode
39 | SPARK-26584,https://github.com/apache/spark/commit/270916f8cd8ba01341f2a38a8376e9e4be08a2e8,Remove `spark.sql.orc.copyBatchToSpark` internal conf,rmv RmvModule ,spark.sql.orc.copyBatchToSpark
40 | SPARK-26788,https://github.com/apache/spark/commit/4808393449ccad2c6bc73c91d0ed8dd8f60c7054,Remove SchedulerExtensionService.,rmv RmvModule ,spark.yarn.services
41 | SPARK-26998,https://github.com/apache/spark/commit/57aff93886ac7d02b88294672ce0d2495b0942b8,Remove SSL **CONFIGUR**ation from executors,rmv RmvModule ,spark.ssl
42 | SPARK-27008,https://github.com/apache/spark/commit/8e5f9995cad409799f3646b3d03761a771ea1664,Support java.time.LocalDate as an external type of…… DateType,rmv RmvModule ,spark.sql.catalyst.timestampType
43 | SPARK-27349,https://github.com/apache/spark/commit/1d95dea30788b9f64c5e304d908b85936aafb238,Dealing with TimeVars removed in Hive 2.x,rmv RmvModule ,ConfVars.HIVE_STATS_JDBC_TIMEOUT ConfVars.HIVE_STATS_RETRIES_WAIT
44 | SPARK-29399,https://github.com/apache/spark/commit/56a0b5421e41f46a65375c0e5ef9993e9502f93,[CORE] Remove old ExecutorPlugin interface,rmv RmvModule ,spark.executor.plugins
45 | SPARK-29930,https://github.com/apache/spark/commit/5eb8973f871fef557fb4ca3f494406ed676a431,[SQL] Remove SQL **CONFIGS** declared to be removed in Spark 3.0,rmv RmvModule ,"spark.sql.fromJsonForceNullableSchema, spark.sql.legacy.compareDateTimestampInTimestamp, spark.sql.legacy.allowCreatingManagedTableUsingNonemptyLocation"
46 | CASSANDRA-13625,https://github.com/apache/cassandra/commit/082af0a9ba6b5dde26055fcb9ddd2085e4240381,Remove unused max_value_size_in_mb config setting,rmv RmvModule ,max_value_size_in_mb
47 | CASSANDRA-13910,https://github.com/apache/cassandra/commit/2fcd29b830e7b201e7047d283de385d5f1c427b5,Eliminate background repair and probablistic read_repair_chance table option,rmv RmvModule ,dclocal_read_repair_chance read_repair_chance
48 | CASSANDRA-14081,https://github.com/apache/cassandra/commit/df51d0cbbaaa99aea9bc2a582f788f9170dbdc03,Remove unused and deprecated methods from AbstractCompactionStrategy,rmv RmvModule ,COMPACTION_ENABLED
49 | CASSANDRA-14173,https://github.com/apache/cassandra/commit/28ee665b3c0c9238b61a871064f024d54cddcc79,Remove dependencies on JVM internals for JMX support,rmv RmvModule ,com.sun.management.jmxremote.ssl
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Artifacts for Configuration Evolution of Cloud Systems
2 |
3 | This repository includes the artifacts of our paper: [An Evolutionary Study of Configuration Design and Implementation in Cloud Systems](https://arxiv.org/pdf/2102.07052.pdf) in 43rd International Conference on Software Engineering (ICSE'21), May. 2021.
4 |
5 | Please cite the paper if you use the code or the datasets.
6 |
7 | The repository includes the following artifacts:
8 |
9 | * `config_commits`: 1178 configuration evolution commits from a recent
10 | 2.5 year (2017.06-2019.12) version control history of four large-scale open-source projects (HDFS, HBase, Spark, and
11 | Cassandra).
12 | * `commit_analysis`: Studied commits with well-labeled categorizations and analysis results, organized based on the structure of the paper.
13 | * `code`: Python scripts for collecting raw commits that touch configuration.
14 | * `commit_study.md`: Documentation of the manual study methodology (analyzing raw commits and issues), including code snippet examples and descriptions for each category in Table II of the submission.
15 |
16 | ## 1. Data Comprehension and Layout
17 |
18 | We provide the data that we studied in this paper. All the data sheets are in the format of CSV, with titles/labels as the first rows. Note that some labels are recorded for specific commits/parameters. (e.g. "How to choose new value" in `change_default_value.csv` is just for numeric parameters and we describe the reason in Section IV.B in the paper)
19 |
20 | All the data sheets except `change_doucumentation.csv` use each row to record an individual parameter change, with the links to the commit/issue page. In `change_doucumentation.csv`, each row records a document change.
21 |
22 | **Note that one commit can contain changes of multiple parameters for multiple reasons.**
23 |
24 | Here is a mapping from the subsection of the paper to the data sheet (in the `commit_analysis` directory).
25 |
26 | * **Section IV (Configuration Interface Evolution)**
27 |
28 | * Section IV.A(1) (Parameterization) → `parameterization.csv`
29 |
30 | * Section IV.A(2) (Removing parameters) → `rmv_replace.csv`
31 |
32 | * Section IV.B (Evolution of default values) → `change_default_value.csv`
33 |
34 | * **Section V (Configuration Usage Evolution)**
35 |
36 | * Section V.A (Evolution of parameter checking code) → `checking_and_handling_code.csv`
37 |
38 | * Section V.B (Evolution of error-handling code) → `checking_and_handling_code.csv`
39 |
40 | * Section V.C (Evolution of using parameter values) → `change_param_existing_usage.csv` and `param_new_use.csv`
41 |
42 | * **Section VI (Configuration Document Evolution)** → `change_doucumentation.csv`
43 |
44 | We also provide sheets for other categories for future study and reuse.
45 | The script in `commit_analysis` is to count the numbers and generate the main tables in the paper:
46 |
47 | ~~~
48 | python3 count_num.py
49 | ~~~
50 |
51 | ## 2. Commit Collection and Analysis
52 |
53 | Besides the data in this paper, for future reuse and study, we also provide the script we use to collect the raw commits and a tutorial to show how we do the manual study of each raw commit.
54 |
55 | ### 2.1 Collect raw commits that touch configuration
56 |
57 | Please use python3 to install the dependencies and run the code (we use Python 3.8.5).
58 |
59 | 1. Install dependencies
60 | ~~~bash
61 | pip3 install pathlib
62 | pip3 install nltk
63 | pip3 install beautifulsoup4
64 | ~~~
65 |
66 | 2. Go to `code/'software'`
67 | Change the following file path in `download_diff.py`
68 | * DIFF_FILE_PATH = "The path that you want to store the commit diff files"
69 | And run:
70 | ```bash
71 | python3 get_commit.py
72 | ```
73 | to download the raw commits for the target software projects. Please add the latest `commit_page_url` of studied software among studied time span in `commit_url.txt` (There is already one in the file).
74 | * Note: You can stop the downloading process by using ctrl+c whenever you think the time span is enough (downloading all the commits and diff files is time-consuming, so you can first try with a short time span). If you want to continue the downloading process, just simply run the `get_commit.py` again. (The url in `commit_url.txt` will be automatically updated, you can check `url_log.txt`). If the program stops (mostly due to network issues or too many requests), also run `get_commit.py` again to continue.
75 |
76 | The output will be `commit_info.txt` that contains basic info of each commit, and corresponding diff files will be downloaded in `DIFF_FILE_PATH`
77 |
78 | 3. Run
79 | ```bash
80 | python3 commit_selection.py
81 | ```
82 | to automatically select commits that touch configuration. The output will be `commit_selected.txt`, which has structured info for each selected commit; the info contains hints on how this diff touches configuration. **By searching for those hints in the diff file, one can quickly locate/briefly understand the configuration change.**
83 |
84 | The detailed methodologies are described in the submitted paper.
85 | One method of selecting configuration-related commits is to use regular expressions to capture configuration-related code patterns. For example, one of the regular expressions used for HDFS is:
86 | ~~~
87 | HDFS_CONFIG_LOAD_FUNC_RE = '[a-zA-Z\.\_\-]*[cC]onf[ig]*[\(\)]*[\s]*[\n]*[\s]*\.[\s]*[\n]*[\s]*get[a-zA-Z]*\([^;<>]+\)'
88 | ~~~
89 |
90 | The regular expression can find commits like [HDFS-13607](https://github.com/apache/hadoop/commit/c81ac2ff0220b180cd6cbbf18221290c3783bfd5) which adds a new parameter `dfs.journalnode.edit-cache-size.bytes` by matching the following code snippet:
91 |
92 | ~~~
93 | + capacity = conf.getInt(DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_KEY,
94 | + DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_DEFAULT);
95 | ~~~
96 |
97 | The complete output info of this commit is shown below.
98 |
99 | ~~~
100 | HDFS-13607. [SBN read] Edit Tail Fast Path Part 1 //commit title
101 | https://github.com/apache/hadoop/commit/c81ac2ff0220b180cd6cbbf18221290c3783bfd5 //commit link
102 | 2018-05-09T22:40:07Z //commit time
103 | Commit message touches config:False //whether commit message touch "config" keyword
104 | Diff touches config define:True //whether diff touches config define
105 | Diff touches config loading:True //whether diff touches config load
106 | Diff touches config setting:False //whether diff touches config set
107 | Diff touches config variable (data flow):True //whether diff touches config variable
108 | Diff touches config message:False //whether diff touches message that have "config" keyword
109 |
110 | _________________touchedConfigDefine_____________________
111 |
112 | +hdfs-default.xml
113 |
114 | ___________________touchedConfigLoad___________________
115 |
116 | +conf.getInt(DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_KEY,DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_DEFAULT)
117 |
118 | ___________________touchedConfigSet____________________
119 |
120 | Null
121 |
122 | ___________________touchedConfigVariable_____________________
123 |
124 | +capacity JournaledEditsCache.java
125 |
126 | ___________________touchedMessage_____________________
127 |
128 | Null
129 | ~~~
130 |
131 | We provide a [demo](https://github.com/xlab-uiuc/open-cevo/tree/main/code/hdfs_demo_examples) (The above case is in that demo) for all HDFS commit examples in [commit_study.md (tutorial)](https://github.com/xlab-uiuc/open-cevo/blob/main/commit_study.md), they are HDFS-13607, HDFS-12291, HDFS-12412, HDFS-11998, HDFS-12716, HDFS-11576 and HDFS-12603. Run `commit_selection.py` in `/code/hdfs_demo_examples` to see `commit_selected.txt` and the structured info for each commit.
132 | ~~~bash
133 | cd code/hdfs_demo_examples
134 | python3 commit_selection.py
135 | ~~~
136 |
137 | We implement software-specific regular expressions which can be found in `diff_file_parser.py` in each software subdirectory. All the regular expressions are carefully crafted based on a pilot study of configuration-related commits of the target software projects.
138 |
139 | ### 2.2 Commit Study
140 |
141 | We validate, analyze and categorize each commit based on the commit log and diff, as well as the corresponding JIRA or GitHub Issues as described in the paper. Our categorization is based on the taxonomy of Figure 1 and Table II of the submission. This step currently is manually without program automation. We provide a [tutorial](https://github.com/xlab-uiuc/open-cevo/blob/main/commit_study.md) that contains concrete code examples for every category.
142 |
143 | **Note that one commit can touch several categories; we study it in each category.**
144 |
145 | We also analyze JIRA issues or GitHub Pull Requests (PRs) that linked with each commit which provides more background and context information of the commit.
146 |
147 | All the commits in our study are linked to JIRA issues or GitHub PRs.
148 |
149 | ## 3. Reusability
150 |
151 | ### 3.1 Extending our study to longer time span
152 |
153 | We provide our script to select commits related to configuration, which can be reused for large-scale studies (e.g., longer time span).
154 |
155 | To do so, please change the `url` in `code/'software'/commit_url.txt` to the corresponding commit you want to start with. Our script will crawl **older** commits based on this. For example, if you want to crawl commits of `HBase` before `Dec.25 2020`, one can do:
156 | ```bash
157 | echo "https://github.com/apache/hbase/commits/master?before=0f868da05d7ffabe4512a0cae110ed097b033ebf+35&branch=master" > code/hbase/commit_url.txt
158 | ```
159 |
160 | ### 3.2 Extending our study to other software projects
161 |
162 | The main idea to select configuration related commits is using text-based regular expression matching. We show the regex we used in `diff_file_parser.py` in each
163 | `code/'software'` folder. One can reuse and tweak the scripts for other software projects. We suggest you to test the regex using [regex101](https://regex101.com).
164 |
165 |
166 | You will need to modify:
167 | - ```bash
168 | cd code
169 | mkdir other_software
170 | cp -r hbase/* other_software/
171 | ```
172 | - change `commit_url.txt`, using the GitHub commits page URL of that software.
173 | - change the **regular expressions** global variables in diff_file_parser.py specific to the target software project.
174 |
175 | ### 3.3 Followup analysis based on the commits in this artifact
176 |
177 | We provide a [tutorial](https://github.com/xlab-uiuc/open-cevo/blob/main/commit_study.md) to explain our taxonomy/categorization to help follow-up studies.
178 |
--------------------------------------------------------------------------------
/commit_analysis/param_rename.csv:
--------------------------------------------------------------------------------
1 | ,Parameter,commit-URL,Why
2 | SPARK-26060,spark.sql.legacy.execution.setCommandRejectsSparkConfs,https://github.com/apache/spark/commit/1ab3d3e474ce2e36d58aea8ad09fb61f0c73e5c5#,/
3 | SPARK-4502,spark.sql.nestedSchemaPruning.enabled,https://github.com/apache/spark/commit/76399d75e23f2c7d6c2a1fb77a4387c5e15c809b#,/
4 | SPARK-29753,spark.sql.defaultCatalog,https://github.com/apache/spark/commit/942753a44beeae5f0142ceefa307e90cbc1234c,/
5 | SPARK-27760,spark.driver.resource.{resourceName}.count,https://github.com/apache/spark/commit/d30284b5a51dd784f663eb4eea37087b35a54d00#,Change to allow future usage of containing both a count and a unit
6 | SPARK-27760,spark.executor.resource.{resourceName}.count,https://github.com/apache/spark/commit/d30284b5a51dd784f663eb4eea37087b35a54d00#,Change to allow future usage of containing both a count and a unit
7 | SPARK-27760,spark.task.resource.{resourceName}.count,https://github.com/apache/spark/commit/d30284b5a51dd784f663eb4eea37087b35a54d00#,Change to allow future usage of containing both a count and a unit
8 | SPARK-27687,spark.kafka.consumer.cache.capacity,https://github.com/apache/spark/commit/efa303581ac61d6f517aacd08883da2d01530bd2#,consistent naming convention
9 | HDFS-12114,hadoop.httpfs.http.port,https://github.com/apache/hadoop/commit/ac0a04a6e165920a6d43c2aa3dab06ca38f3135b#,consistent naming convention
10 | HDFS-12114,hadoop.httpfs.http.host,https://github.com/apache/hadoop/commit/ac0a04a6e165920a6d43c2aa3dab06ca38f3135b#,consistent naming convention
11 | HDFS-12114,hadoop.httpfs.http.administrators,https://github.com/apache/hadoop/commit/ac0a04a6e165920a6d43c2aa3dab06ca38f3135b#,consistent naming convention
12 | HDFS-12114,hadoop.httpfs.ssl.enabled,https://github.com/apache/hadoop/commit/ac0a04a6e165920a6d43c2aa3dab06ca38f3135b#,consistent naming convention
13 | SPARK-24626,spark.sql.statistics.parallelFileListingInStatsComputation.enabled,https://github.com/apache/spark/commit/4193c7623b92765adaee539e723328ddc9048c09#,consistent naming convention
14 | SPARK-19724,spark.sql.allowCreatingManagedTableUsingNonemptyLocation,https://github.com/apache/spark/commit/4a11209539130c6a075119bf87c5ad854d42978e#,consistent naming convention
15 | SPARK-23549,spark.sql.legacy.compareDateTimestampInTimestamp,https://github.com/apache/spark/commit/411ecc365ea62aef7a29d8764e783e6a58dbb1d5#,consistent naming convention
16 | SPARK-24157,spark.sql.streaming.noDataMicroBatchesEnabled,https://github.com/apache/spark/commit/936c920347e196381b48bc3656ca81a06f2ff46d#,consistent naming convention
17 | SPARK-24324,spark.sql.legacy.execution.pandas.groupedMap.assignColumnsByName,https://github.com/apache/spark/commit/8c2edf46d0f89e5ec54968218d89f30a3f8190bc#,consistent naming convention
18 | SPARK-22159,spark.sql.execution.arrow.enabled,https://github.com/apache/spark/commit/d29d1e87995e02cb57ba3026c945c3cd66bb06e2#,consistent naming convention
19 | SPARK-22159,spark.sql.codegen.aggregate.map.twolevel.enabled,https://github.com/apache/spark/commit/af8a34c787dc3d68f5148a7d9975b52650bb7729#,consistent naming convention
20 | HDFS-12214,dfs.storage.policy.satisfier.enabled,https://github.com/apache/hadoop/commit/ac0a04a6e165920a6d43c2aa3dab06ca38f3135b#,consistent naming convention
21 | HDFS-12438,dfs.datanode.ec.reconstruction.threads,https://github.com/apache/hadoop/commit/e12f3e85bde0e7e83142b383a45c4ea945dfd64e#,consistent naming convention
22 | SPARK-28339,spark.sql.runtime.reoptimization.enabled,https://github.com/apache/spark/commit/3f375c850b5a41ae1ca5deb84fdcea667c32a03,consistent naming convention
23 | SPARK-27959,spark.yarn.am.resource.{resource-type}.amount,https://github.com/apache/spark/commit/43d68cd4ff84530c3d597f07352984225ab1db7,consistent naming convention
24 | SPARK-27959,spark.yarn.driver.resource.{resource-type}.amount,https://github.com/apache/spark/commit/43d68cd4ff84530c3d597f07352984225ab1db7,consistent naming convention
25 | SPARK-27959,spark.yarn.executor.resource.{resource-type}.amount,https://github.com/apache/spark/commit/43d68cd4ff84530c3d597f07352984225ab1db7,consistent naming convention
26 | SPARK-9853,spark.sql.adaptive.shuffle.reducePostShufflePartitions.enabled,https://github.com/apache/spark/commit/8616109061efc5b23b24bb9ec4a3c0f2745903c,consistent naming convention
27 | SPARK-9853,spark.sql.adaptive.shuffle.minNumPostShufflePartitions,https://github.com/apache/spark/commit/8616109061efc5b23b24bb9ec4a3c0f2745903c,consistent naming convention
28 | SPARK-9853,spark.sql.adaptive.shuffle.maxNumPostShufflePartitions,https://github.com/apache/spark/commit/8616109061efc5b23b24bb9ec4a3c0f2745903c,consistent naming convention
29 | SPARK-9853,spark.sql.adaptive.shuffle.optimizedLocalShuffleReader.enabled,https://github.com/apache/spark/commit/8616109061efc5b23b24bb9ec4a3c0f2745903c,consistent naming convention
30 | SPARK#26694,spark.sql.analyzer.failAmbiguousSelfJoin.enabled,https://github.com/apache/spark/commit/e271664a01fd7dee63391890514d76262cad1bc,consistent naming convention
31 | SPARK-30060,spark.metrics.appStatusSource.enabled,https://github.com/apache/spark/commit/60f20e5ea2000ab8f4a593b5e4217fd5637c5e2,consistent naming convention
32 | SPARK-30060,spark.metrics.staticSources.enabled,https://github.com/apache/spark/commit/60f20e5ea2000ab8f4a593b5e4217fd5637c5e2,consistent naming convention
33 | SPARK-25855,spark.eventLog.erasureCoding.enabled,https://github.com/apache/spark/commit/35506dced739ef16136e9f3d5d48c638899d3ce,consistent naming convention
34 | SPARK-26389,spark.sql.streaming.forceDeleteTempCheckpointLocation.enabled,https://github.com/apache/spark/commit/6d64fc2407e5b21a2db59c5213df438c74a3163,consistent naming convention
35 | HBASE-18307,hbase.netty.rpc.server.worker.count,https://github.com/apache/hbase/commit/351703455a091171a1abc90f250f52f0a7a0aaab#,feature has changed
36 | HBASE-18307,hbase.rpc.server.nativetransport,https://github.com/apache/hbase/commit/351703455a091171a1abc90f250f52f0a7a0aaab#,feature has changed
37 | HBASE-22598,hbase.server.allocator.max.buffer.count,https://github.com/apache/hbase/commit/686847cb79038d2fe91aee277f3827fbe5341b49#,feature has changed
38 | HBASE-22598,hbase.server.allocator.buffer.size,https://github.com/apache/hbase/commit/686847cb79038d2fe91aee277f3827fbe5341b49#,feature has changed
39 | HBASE-16894,hbase.mapreduce.input.autobalance,https://github.com/apache/hbase/commit/16d483f9003ddee71404f37ce7694003d1a18ac4#,feature has changed
40 | HBASE-19768,hbase.wal.async.create.retries,https://github.com/apache/hbase/commit/c554340a91e24cdc86e25efd87c46430ec1ec673#,feature has changed
41 | HBASE-22301,hbase.regionserver.hlog.roll.on.sync.ms,https://github.com/apache/hbase/commit/47b4ab7b9732b790b2b471c489f670093e64ad2c#,feature has changed
42 | HBASE-22301,hbase.regionserver.hlog.slowsync.ms,https://github.com/apache/hbase/commit/47b4ab7b9732b790b2b471c489f670093e64ad2c#,feature has changed
43 | HBASE-22301,hbase.regionserver.hlog.sync.timeout,https://github.com/apache/hbase/commit/47b4ab7b9732b790b2b471c489f670093e64ad2c#,feature has changed
44 | HBASE-22547,hbase.server.allocator.pool.enabled,https://github.com/apache/hbase/commit/2e414360bd7aee15769eb46a00b2fa108b3bcbb5#,feature has changed
45 | HBASE-22547,hbase.server.allocator.minimal.allocate.size,https://github.com/apache/hbase/commit/2e414360bd7aee15769eb46a00b2fa108b3bcbb5#,feature has changed
46 | CASSANDRA-13530,commitlog_sync_batch_window_in_ms,https://github.com/apache/cassandra/commit/f3f90c1896eab4f3fb5507b0cf348e2f149db5d1#,feature has changed
47 | SPARK-28741,spark.sql.arithmeticOperations.failOnOverFlow,https://github.com/apache/spark/commit/8258660f673f8b57a3cdd79ecd57c79df5554e3,feature has changed
48 | SPARK-29893,spark.sql.adaptive.shuffle.localShuffleReader.enabled,https://github.com/apache/spark/commit/6e581cf164c3a2930966b270ac1406dc1195c94,feature has changed
49 | SPARK-29412,spark.sql.catalog.session,https://github.com/apache/spark/commit/9407fba0375675d6ee6461253f3b8230e8d6750,feature has changed
50 | HDFS-14845,httpfs.authentication.kerberos.keytab,https://github.com/apache/hadoop/commit/3f89084ac756c9296d412821d76ff2bee57d0c2,feature has changed
51 | HDFS-14845,httpfs.authentication.signature.secret.file,https://github.com/apache/hadoop/commit/3f89084ac756c9296d412821d76ff2bee57d0c2,feature has changed
52 | HDFS-14845,httpfs.authentication.kerberos.principal,https://github.com/apache/hadoop/commit/3f89084ac756c9296d412821d76ff2bee57d0c2,feature has changed
53 | HDFS-14845,httpfs.authentication.type,https://github.com/apache/hadoop/commit/3f89084ac756c9296d412821d76ff2bee57d0c2,feature has changed
54 | HBASE-22610,hbase.offheapcache.minblocksize,https://github.com/apache/hbase/commit/06f5c43de340da62e765a753c10caba5465eeae,feature has changed
55 | SPARK-25372,spark.yarn.keytab spark.yarn.principal,https://github.com/apache/spark/commit/51540c2fa677658be954c820bc18ba748e4c8583#,not precise
56 | SPARK-26766,spark.yarn.access.namenodes,https://github.com/apache/spark/commit/d0443a74d185ec72b747fa39994fa9a40ce974cf#,not precise
57 | SPARK-26766,spark.yarn.access.hadoopFileSystems,https://github.com/apache/spark/commit/d0443a74d185ec72b747fa39994fa9a40ce974cf#,not precise
58 | HDFS-14142,IPFAILOVER_CONFIG_PREFIX,https://github.com/apache/hadoop/commit/b8ad6c85a549a6f17cf6675e58ef002d84059d3c#,not precise
59 | SPARK-22233,spark.hadoopRDD.ignoreEmptySplits,https://github.com/apache/spark/commit/0fa10666cf75e3c4929940af49c8a6f6ea874759#,not precise
60 | SPARK-22807,spark.kubernetes.driver.container.image,https://github.com/apache/spark/commit/fb3636b482be3d0940345b1528c1d5090bbc25e6#,not precise
61 | SPARK-22807,spark.kubernetes.executor.container.image,https://github.com/apache/spark/commit/fb3636b482be3d0940345b1528c1d5090bbc25e6#,not precise
62 | SPARK-22807,spark.kubernetes.container.image.pullPolicy,https://github.com/apache/spark/commit/fb3636b482be3d0940345b1528c1d5090bbc25e6#,not precise
63 | SPARK-29807,spark.sql.ansi.enabled,https://github.com/apache/spark/commit/40ea4a11d7f1534023669f0b81faf5d398174e4,not precise
64 | HBASE-22776,hbase.user.scan.snapshot.common.directory.permission,https://github.com/apache/hbase/commit/0e5dc6d7cee92524bf648b6f49d1565e098e5bc,not precise
65 | HBASE-22776,hbase.user.scan.snapshot.enable,https://github.com/apache/hbase/commit/0e5dc6d7cee92524bf648b6f49d1565e098e5bc,not precise
66 | HBASE-22776,hbase.user.scan.snapshot.thread.number,https://github.com/apache/hbase/commit/0e5dc6d7cee92524bf648b6f49d1565e098e5bc,not precise
67 | SPARK-20101,spark.sql.columnVector.offheap.enabled,https://github.com/apache/spark/commit/572af5027e45ca96e0d283a8bf7c84dcf476f9bc#,typo
68 | SPARK-21127,spark.sql.statistics.size.autoUpdate.enabled,https://github.com/apache/spark/commit/d5202259d9aa9ad95d572af253bf4a722b7b437a#,typo
69 | SPARK-26082,spark.mesos.fetcherCache.enable,https://github.com/apache/spark/commit/d5202259d9aa9ad95d572af253bf4a722b7b437a#,typo
70 | SPARK-27215,spark.kryo.unsafe,https://github.com/apache/spark/commit/93c6d2a198d1b3070eea32210042873c68d0d5f7#,typo
71 | SPARK-27215,spark.kryo.pool,https://github.com/apache/spark/commit/93c6d2a198d1b3070eea32210042873c68d0d5f7#,typo
72 | HDFS-12404,dfs.namenode.authorization.provider.bypass.users,https://github.com/apache/hadoop/commit/3b3be355b35d08a78d9dcd647650812a2d28207b#,typo
--------------------------------------------------------------------------------
/config_commits/cassandra.csv:
--------------------------------------------------------------------------------
1 | Issue ID,Commit Link,Title
2 | CASSANDRA-11097,https://github.com/apache/cassandra/commit/0240a4659d761f06f94f8cd97097f2d0ad2d220c,Introduce optional timeouts for idle client sessions
3 | CASSANDRA-13006,https://github.com/apache/cassandra/commit/02aba7343ce300397ab672bbb1788aa8182d8a48,Rely on the JVM to handle OutOfMemoryErrors
4 | CASSANDRA-13987,https://github.com/apache/cassandra/commit/05cb556f90dbd1929a180254809e05620265419b,More frequent commitlog chained markers
5 | CASSANDRA-14798,https://github.com/apache/cassandra/commit/0766f7e54182d04ecf5a15a732f5ec7951d62326,Improve wording around partitioner selection
6 | CASSANDRA-13625,https://github.com/apache/cassandra/commit/082af0a9ba6b5dde26055fcb9ddd2085e4240381,Remove unused max_value_size_in_mb config setting
7 | CASSANDRA-14314,https://github.com/apache/cassandra/commit/11496039fb18bb45407246602e31740c56d28157,Correct and clarify SSLFactory.getSslContext method and call site
8 | CASSANDRA-13656,https://github.com/apache/cassandra/commit/12d4e2f189fb228250edc876963d0c74b5ab0d4f,Change default start_native_transport to true and remove from jvm.options
9 | CASSANDRA-13418,https://github.com/apache/cassandra/commit/14d67d81c57d6387c77bd85c57b342d285880835,Allow to skip overlapings checks
10 | CASSANDRA-14991,https://github.com/apache/cassandra/commit/16ef9ac37c21c4f9091cd1f3658e54abddab8ad8,SSL Cert Hot Reloading should check for sanity of the new keystore/truststore before loading it
11 | CASSANDRA-14580,https://github.com/apache/cassandra/commit/176d4bac22c356c80e275dcb4040bc5cbd0da1c2,Make PeriodicCommitLogService.blockWhenSyncLagsNanos configurable
12 | CASSANDRA-14275,https://github.com/apache/cassandra/commit/19d26bcb80219bce0089fbe8942a34e3a331fd17,Add ability to specify driver name and version
13 | CASSANDRA-14303,https://github.com/apache/cassandra/commit/1f19d5f7a243cc4227da923459f5eb2f66066778,Auto-expand replication_factor for NetworkTopologyStrategy
14 | CASSANDRA-15202,https://github.com/apache/cassandra/commit/2117e2af00603f5fb2181e53dbcba190b2eab861,Make repair coordination less expensive by moving MerkleTrees off heap
15 | CASSANDRA-14173,https://github.com/apache/cassandra/commit/28ee665b3c0c9238b61a871064f024d54cddcc79,Remove dependencies on JVM internals for JMX support
16 | CASSANDRA-13910,https://github.com/apache/cassandra/commit/2fcd29b830e7b201e7047d283de385d5f1c427b5,Eliminate background repair and probablistic read_repair_chance table option
17 | CASSANDRA-14938,https://github.com/apache/cassandra/commit/3ddfbc8f5871c78bde26e96a936e96deeccb4366,Add specialized IndexRegistry for offline tools/clients
18 | CASSANDRA-14372,https://github.com/apache/cassandra/commit/42827e6a6709c4ba031e0a137a3bab257f88b54f,Yaml comments: data_file_directories distributes data evenly by partitioning its token ranges.
19 | CASSANDRA-13518,https://github.com/apache/cassandra/commit/428eaa3e37cab7227c81fdf124d29dfc1db4257c,Add storage port options to sstableloader
20 | CASSANDRA-14566,https://github.com/apache/cassandra/commit/47a12c52a313258307ab88392f75c5866d9a2bb1,Stream entire SSTables when possible
21 | CASSANDRA-15007,https://github.com/apache/cassandra/commit/47d4971b56d97ba8a528f7c17bfd6b11f1ababa3,Fix SimpleStrategy option validation
22 | CASSANDRA-14352,https://github.com/apache/cassandra/commit/4991ca26aa424286ebdee89742d35e813f9e9259,Clean up parsing speculative retry params from string
23 | CASSANDRA-12245,https://github.com/apache/cassandra/commit/4c80eeece37d79f434078224a0504400ae10a20d,Parallelize initial materialized view build
24 | CASSANDRA-14084,https://github.com/apache/cassandra/commit/50e6e721b2a81da7f11f60a2fa405fd46e5415d4,Fix imbalanced disks when replacing node with same address with JBOD
25 | CASSANDRA-14226,https://github.com/apache/cassandra/commit/518ddbf9d21491d341a3d7e2f2a2e65409595e07,Better document in code InetAddressAndPort usage post 7544
26 | CASSANDRA-12526,https://github.com/apache/cassandra/commit/53c0ef171424454c47d64a9326b0ba83cd743a50,Bump SSTable level instead of rewriting SSTable completely during single-sstable compactions
27 | CASSANDRA-13985,https://github.com/apache/cassandra/commit/54de771e643e9cc64d1f5dd28b5de8a9a91a219e,Add network auth
28 | CASSANDRA-7544,https://github.com/apache/cassandra/commit/59b5b6bef0fa76bf5740b688fcd4d9cf525760d0,Allow storage port to be configurable per nodePatch
29 | CASSANDRA-15013,https://github.com/apache/cassandra/commit/5a03898c680ed6ada63901e8a4b278ccc8070717,Prevent client requests from blocking on executor task queue
30 | CASSANDRA-13897,https://github.com/apache/cassandra/commit/5b23054f10f4d6553e8dacbf53bd59e552f2a031,Round buffer size to powers of 2 for the chunk cache
31 | CASSANDRA-14467,https://github.com/apache/cassandra/commit/5d8767765090cd968c39008f76b0cd795d6e5032,Add option to sanity check tombstones on reads/compaction
32 | CASSANDRA-14145,https://github.com/apache/cassandra/commit/5fbb938adaafd91e7bea1672f09a03c7ac5b9b9d,Detect inconsistencies in repaired data on the read path
33 | CASSANDRA-13614,https://github.com/apache/cassandra/commit/613a8b43d2b5a425080653898b28bde6cd7eb9ba,Add 'nodetool getbatchlogreplaythrottle' and 'nodetool setbatchlogreplaythrottle
34 | CASSANDRA-13594,https://github.com/apache/cassandra/commit/62d39f6544e3fbcbc268aecbb3a46950dcba2bf0,Use an ExecutorService for repair commands instead of new Thread(..).start()
35 | CASSANDRA-14373,https://github.com/apache/cassandra/commit/6e00ab956eb0148a74e926666862e4cc78936301,Allow using custom script for chronicle queue BinLog archival
36 | CASSANDRA-14659,https://github.com/apache/cassandra/commit/7b61b0be88ef1fcc29646ae8bdbb05da825bc1b2,Disable old native protocol versions on demand
37 | CASSANDRA-14654,https://github.com/apache/cassandra/commit/7df67eff2d66dba4bed2b4f6aeabf05144d9b057,Reduce heap pressure during compactions
38 | CASSANDRA-15002,https://github.com/apache/cassandra/commit/7f634feb7cf1fdb135133946ffd75efa681b8cb7,Avoid leaking threads when remote nodes fail anticompaction and rate limit anticompactions
39 | CASSANDRA-14297,https://github.com/apache/cassandra/commit/801cb70ee811c956e987718a00695638d5bec1b6,Startup checker should wait for count rather than percentage
40 | CASSANDRA-14225,https://github.com/apache/cassandra/commit/834f2a6ecdb8974839762bf4e9c5fed32163f9c8,Fix comparison of address and port for repair and messages
41 | CASSANDRA-14153,https://github.com/apache/cassandra/commit/8587b0ceb47fa54308dfa9b0bfdc320e6afdc311,Delete temp test files on exit
42 | CASSANDRA-13299,https://github.com/apache/cassandra/commit/8ef71f3f29fb040cce18ba158ff5f289b388c30b,Throttle base partitions during MV repair streaming to prevent OOM
43 | CASSANDRA-13651,https://github.com/apache/cassandra/commit/96ef514917e5a4829dbe864104dbc08a7d0e0cec,Remove Netty timed batching and instead do the batch during next eventLoop invocation after a write has been enqueued.
44 | CASSANDRA-15019,https://github.com/apache/cassandra/commit/99ce007c5beb7988ce83fb1443a1e0ca259264cc,Correctly set repaired data tracking flag on range commands
45 | CASSANDRA-13622,https://github.com/apache/cassandra/commit/a586f6c88dab173663b765261d084ed8410efe81,Improve config validation and documentation on overflow and NPE
46 | CASSANDRA-14525,https://github.com/apache/cassandra/commit/a6196a3a79b67dc6577747e591456328e57c314f,Do not enable native transport if bootstrap is pending
47 | CASSANDRA-14435,https://github.com/apache/cassandra/commit/a79e5903b552e40f77c151e23172f054ffb7f39e,Add JMX query support for diagnostic events
48 | CASSANDRA-13983,https://github.com/apache/cassandra/commit/ae837806bd07dbb8b881960feeeeb90c1a665d93,Support a means of logging all queries as they were invoked.
49 | CASSANDRA-12014,https://github.com/apache/cassandra/commit/ae88fd6c79b066f12ad76c2c1bfc1620d86bdbc5,Avoid assertion error when IndexSummary > 2G
50 | CASSANDRA-14092,https://github.com/apache/cassandra/commit/b2949439ec62077128103540e42570238520f4ee,Protect against overflow of local expiration time
51 | CASSANDRA-13740,https://github.com/apache/cassandra/commit/b2f6ce961f38a3e4cd744e102026bf7a471056c9,Delay hints store excise by write timeout to avoid race with decommission
52 | CASSANDRA-14096,https://github.com/apache/cassandra/commit/b30c8c98a594a5682f6ea1f0b5511463b700b6e8,Improve merkle tree size and time on heap
53 | CASSANDRA-14855,https://github.com/apache/cassandra/commit/b82a42fd9ae99dc115ec04339f4265096bb45044,Disable immediate flusher by default for cassandra-3.0 and cassandra-3.11
54 | CASSANDRA-13993,https://github.com/apache/cassandra/commit/b86801e95a58c5f1a9c779b21fa57136e0225d61,Add optional startup delay to wait until peers are ready
55 | CASSANDRA-13959,https://github.com/apache/cassandra/commit/b8697441d7a051e7ff68def6aa9cf14bd92ace9e,"Add flag to disable materialized views, and warnings on creation"
56 | CASSANDRA-14800,https://github.com/apache/cassandra/commit/bd0cef9a369ae9245b45040796a6e10f51e522ce,Avoid using DatabaseDescriptor in ProtocolVersion
57 | CASSANDRA-14358,https://github.com/apache/cassandra/commit/bfbc5274f2b3a5af2cbbe9679f0e78f1066ef638,Partitioned outbound internode TCP connections can occur when nodes restart
58 | CASSANDRA-13884,https://github.com/apache/cassandra/commit/c22ee2bd451d030e99cfb65be839bbc735a5352f,Add sstableloader OPTION to accept target keyspace name
59 | CASSANDRA-15059,https://github.com/apache/cassandra/commit/c3ce32e239b1ba41faf1d58a942465b9bf45b986,Fix assorted gossip races and add related runtime checks
60 | CASSANDRA-3200,https://github.com/apache/cassandra/commit/cb56d9fc3c773abbefa2044ce41ddbfb7717e0cb,Add option to optimize Merkle tree comparison across replicas
61 | CASSANDRA-14716,https://github.com/apache/cassandra/commit/cdeac4992bdb1f569c3a04b628ded7e5351364ee,Make CONTENT_CHECKSUM protocol OPTION values case insensitive
62 | CASSANDRA-13699,https://github.com/apache/cassandra/commit/cf4a0576a6f2b8f2d828a8b14140f212803adb7c,Allow to set batch_size_warn_threshold_in_kb via JMX
63 | CASSANDRA-14197,https://github.com/apache/cassandra/commit/d14a9266c7ddff0589fdbe7a1836217b8bb8b394,Automatic sstable upgrades
64 | CASSANDRA-9375,https://github.com/apache/cassandra/commit/d2dcd7f884cc997905c820d7cef8c9fc886ff4f7,force minumum timeout value
65 | CASSANDRA-14134,https://github.com/apache/cassandra/commit/d6e508f33c1a7274b5826ad9d5ce814d719bd848,Migrate dtests to use pytest and python3
66 | CASSANDRA-14108,https://github.com/apache/cassandra/commit/db788fe860dfd69f06ab97ae35fa67fcf2517b6d,Improve commit log chain marker updating
67 | CASSANDRA-14482,https://github.com/apache/cassandra/commit/dccf53061a61e7c632669c60cd94626e405518e9,ZSTD Compressor support in Cassandra
68 | CASSANDRA-14081,https://github.com/apache/cassandra/commit/df51d0cbbaaa99aea9bc2a582f788f9170dbdc03,Remove unused and deprecated methods from AbstractCompactionStrategy
69 | CASSANDRA-14726,https://github.com/apache/cassandra/commit/e645b9172c5d50fc2af407de724e46121edfe109,ReplicaCollection follow-up
70 | CASSANDRA-14866,https://github.com/apache/cassandra/commit/e6a61be8c857106d5d99a270b2d17de9f84c4d67,"Add flag to disable SASI indexes, and warning on creation"
71 | CASSANDRA-13669,https://github.com/apache/cassandra/commit/ea62d8862c311e3d9b64d622bea0a68d3825aa7d,Validate supported column type with SASI analyzer
72 | CASSANDRA-13910,https://github.com/apache/cassandra/commit/eaf9bf18b2ec50713170a9ca472c34586b17a5a3,Deprecate background repair and probablistic read_repair_chance table option
73 | CASSANDRA-13975,https://github.com/apache/cassandra/commit/f1e850a492126572efc636a6838cff90333806b9,Add flag to allow dropping oversized read repair mutations
74 | CASSANDRA-14821,https://github.com/apache/cassandra/commit/f22fec927de7ac291266660c2f34de5b8cc1c695,Introduce in-jvm distributed tests
75 | CASSANDRA-13530,https://github.com/apache/cassandra/commit/f3f90c1896eab4f3fb5507b0cf348e2f149db5d1,Add GroupCommitLogService
76 | CASSANDRA-14498,https://github.com/apache/cassandra/commit/f46762eeca9f5d7e32e731573a8c3e521b70fc05,Audit log allows system keyspaces to be audited via configuration options
77 | CASSANDRA-12151,https://github.com/apache/cassandra/commit/f56871b88be1e8965f166769c12cfa43313bac74,Audit logging for database activity
78 | CASSANDRA-14404,https://github.com/apache/cassandra/commit/f7431b432875e334170ccdb19934d05545d2cebd,Transient Replication and Cheap Quorums
79 | CASSANDRA-14619,https://github.com/apache/cassandra/commit/f83bd5ac2bbc6755213a6ad0675e7e5400c79670,Add fqltool comparePatch by marcuse
80 | CASSANDRA-13664,https://github.com/apache/cassandra/commit/ff06424faccc8acedd027c71e955a38fd8ddee6c,Only optimize large ranges when figuring out where to stream from
81 | CASSANDRA-14995,https://github.com/apache/cassandra/commit/ff73c33ab78f70cd0e70280c89e8d8a46f5536d8,Clean up all javadoc related errors
82 | CASSANDRA-14855,https://github.com/apache/cassandra/commit/fff6eec2903ee85f648535dd051c9bc72631f524,Backport ImmediateFlusher to cassandra-3.0 and cassandra-3.11
83 | CASSANDRAd4054e0cf,https://github.com/apache/cassandra/commit/d4054e0cf88bdf85cbde33b6416a6eb20da876e2,"ninja: Fix ""No newline at end of file"" in c*.yaml"
84 | CASSANDRA-15260,https://github.com/apache/cassandra/commit/068d2d37c6fbdb60546821c4d408a84161fd1cb6,Add `allocate_tokens_for_local_rf` yaml option for token allocation that doesn't require keyspace knowledge/existence
85 | CASSANDRA-15193,https://github.com/apache/cassandra/commit/0388d89e29393d0b1f50baa24848bc8cb0a7c9a3,Allow max protocol version to be capped
86 | CASSANDRA-15277,https://github.com/apache/cassandra/commit/860de83a02f3b7711e842a58a073802b9920a1a1,Enable nodetool/JMX resizing of processing stage executor pool
87 | CASSANDRA-13990,https://github.com/apache/cassandra/commit/7c5904753f4ede492f1a5a5e68edfe37651a5be6,Remove obsolete OldNetworkTopologyStrategy
88 | CASSANDRA-15295,https://github.com/apache/cassandra/commit/3a8300e0b86c4acfb7b7702197d36cc39ebe94bc,Avoid deadlock during CommitLog initialization
--------------------------------------------------------------------------------
/commit_analysis/change_param_constraint.csv:
--------------------------------------------------------------------------------
1 | Issue-id,Title,Parameter,Issue-URL,Commit-URL,Type,Note
2 | HBASE-18108,Procedure WALs are archived but not cleaned,hbase.master.logcleaner.plugins,https://issues.apache.org/jira/browse/HBASE-18108,https://github.com/apache/hbase/commit/023d4f1ae8081da3cb9ff54e6b2e545799704ce7#,acceptable value change: new class,The TimeToLiveProcedureWALCleaner is now added to hbase.master.logcleaner.plugins to clean the 2 WALs in one run.
3 | CASSANDRA-14482,ZSTD Compressor support in Cassandra,commitlog_compression,https://issues.apache.org/jira/browse/CASSANDRA-14482,https://github.com/apache/cassandra/commit/dccf53061a61e7c632669c60cd94626e405518e9#,acceptable value change: new class,ZSTD Compressor support in Cassandra
4 | HBASE-19187,Remove option to create on heap bucket cache.,hbase.bucketcache.ioengine,https://issues.apache.org/jira/browse/HBASE-19187,https://github.com/apache/hbase/commit/bff619ef7b100e8b09f7f5eb0f6e289ca51de096#,acceptable value change: new mode,"Removing the on heap Bucket cache feature. The config ""hbase.bucketcache.ioengine"" no longer support the 'heap' value. Its supported values now are 'offheap', 'file:', 'files:' and 'mmap:"
5 | SPARK-24360,Support Hive 3.1 metastore,spark.sql.hive.metastore.version,https://issues.apache.org/jira/browse/SPARK-24360,https://github.com/apache/spark/commit/aeff69bd879661367367f39b5dfecd9a76223c0b#,acceptable value change: new version,Hive 3.1.1 is released. This PR aims to support Hive 3.1.x metastore.
6 | SPARK-27418,[SQL] Migrate Parquet to File Data Source V2,spark.sql.sources.write.useV1SourceList,https://issues.apache.org/jira/browse/SPARK-27418,https://github.com/apache/spark/commit/23ebd389b5cb528a7ba04113a12929bebfaf1e9a#,acceptable value change: value range,Support parquet
7 | SPARK-17788,[SQL] fix the potential OOM in UnsafeExternalSorter and ShuffleExternalSorter,spark.shuffle.spill.numElementsForceSpillThreshold,https://issues.apache.org/jira/browse/SPARK-17788,https://github.com/apache/spark/commit/079a2609d7ad0a7dd2ec3eaa594e6ed8801a8008#,type change: long -> int,"The Double values I'm trying to sort are mostly in the range [0,1] (~70% of the data which roughly equates 1 billion records), other numbers in the dataset are as high as 2000."
8 | HBASE-18511,Default no regions on master,hbase.balancer.tablesOnMaster,https://issues.apache.org/jira/browse/HBASE-18511,https://github.com/apache/hbase/commit/473446719b7b81b56216862bf2a94a576ff90f60#,type change: mode -> bool,Changes the configuration hbase.balancer.tablesOnMaster from list of table names to instead be a boolean; true if master carries tables/regions and false if it does not.
9 | CASSANDRA-13990,Remove obsolete OldNetworkTopologyStrategy,replication_strategies,https://issues.apache.org/jira/browse/CASSANDRA-13990,https://github.com/apache/cassandra/commit/7c5904753f4ede492f1a5a5e68edfe37651a5be6,acceptable value change: value range,"Removed the strategy from cqlsh autocomplete, including an array for replication_factor autocomplete that was only used for SimpleStrategy and OldNetworkTopologyStrategy."
10 | SPARK-30074,The maxNumPostShufflePartitions config should obey reducePostShufflePartitions enabled,spark.sql.adaptive.shuffle.maxNumPostShufflePartitions,https://issues.apache.org/jira/browse/SPARK-30074,https://github.com/apache/spark/commit/d1465a1b0dea690fcfbf75edb73ff9f8a015c0d,dependency,The maxNumPostShufflePartitions config should obey reducePostShufflePartitions enabled
11 | SPARK-21012,[SUBMIT] Add glob support for resources adding to Spark,spark.jars,https://issues.apache.org/jira/browse/SPARK-21012,https://github.com/apache/spark/commit/5800144a54f5c0180ccf67392f32c3e8a51119b1#,acceptable value change: glob path,"Current ""--jars (spark.jars)"", ""--files (spark.files)"", ""--py-files (spark.submit.pyFiles)"" and ""--archives (spark.yarn.dist.archives)"" only support non-glob path. This is OK for most of the cases, but when user requires to add more jars, files into Spark, it is too verbose to list one by one. So here propose to add glob path support for resources."
12 | SPARK-21012,[SUBMIT] Add glob support for resources adding to Spark,spark.files,https://issues.apache.org/jira/browse/SPARK-21012,https://github.com/apache/spark/commit/5800144a54f5c0180ccf67392f32c3e8a51119b1#,acceptable value change: glob path,"Current ""--jars (spark.jars)"", ""--files (spark.files)"", ""--py-files (spark.submit.pyFiles)"" and ""--archives (spark.yarn.dist.archives)"" only support non-glob path. This is OK for most of the cases, but when user requires to add more jars, files into Spark, it is too verbose to list one by one. So here propose to add glob path support for resources."
13 | SPARK-21012,[SUBMIT] Add glob support for resources adding to Spark,spark.submit.pyFiles,https://issues.apache.org/jira/browse/SPARK-21012,https://github.com/apache/spark/commit/5800144a54f5c0180ccf67392f32c3e8a51119b1#,acceptable value change: glob path,"Current ""--jars (spark.jars)"", ""--files (spark.files)"", ""--py-files (spark.submit.pyFiles)"" and ""--archives (spark.yarn.dist.archives)"" only support non-glob path. This is OK for most of the cases, but when user requires to add more jars, files into Spark, it is too verbose to list one by one. So here propose to add glob path support for resources."
14 | SPARK-21012,[SUBMIT] Add glob support for resources adding to Spark,spark.yarn.dist.archives,https://issues.apache.org/jira/browse/SPARK-21012,https://github.com/apache/spark/commit/5800144a54f5c0180ccf67392f32c3e8a51119b1#,acceptable value change: glob path,"Current ""--jars (spark.jars)"", ""--files (spark.files)"", ""--py-files (spark.submit.pyFiles)"" and ""--archives (spark.yarn.dist.archives)"" only support non-glob path. This is OK for most of the cases, but when user requires to add more jars, files into Spark, it is too verbose to list one by one. So here propose to add glob path support for resources."
15 | SPARK-24646,[CORE] Minor change to spark.yarn.dist.forceDownloadSchemes to support wildcard '*',spark.yarn.dist.forceDownloadSchemes,https://issues.apache.org/jira/browse/SPARK-24646,https://github.com/apache/spark/commit/e2c7e09f742a7e522efd74fe8e14c2620afdb522#,acceptable value change: support *,"Minor change to spark.yarn.dist.forceDownloadSchemes to support wildcard '*', For the ease of using this configuration, here propose to add wildcard '*' support to `spark.yarn.dist.forceDownloadSchemes`"
16 | HDFS-9872,HDFS bytes-default configurations should accept multiple size unit,DFS_CLIENT_CACHE_READAHEAD,https://issues.apache.org/jira/browse/HDFS-9872,https://github.com/apache/hadoop/commit/88cce32551e6d52fd1c5a5bfd6c41499bf6ab1ab#,type change,HDFS bytes-default configurations should accept multiple size unit
17 | HDFS-9872,HDFS bytes-default configurations should accept multiple size unit,DFS_DATANODE_MAX_LOCKED_MEMORY_KEY,https://issues.apache.org/jira/browse/HDFS-9872,https://github.com/apache/hadoop/commit/88cce32551e6d52fd1c5a5bfd6c41499bf6ab1ab#,type change,HDFS bytes-default configurations should accept multiple size unit
18 | HDFS-9872,HDFS bytes-default configurations should accept multiple size unit,DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_KEY,https://issues.apache.org/jira/browse/HDFS-9872,https://github.com/apache/hadoop/commit/88cce32551e6d52fd1c5a5bfd6c41499bf6ab1ab#,type change,HDFS bytes-default configurations should accept multiple size unit
19 | HDFS-9872,HDFS bytes-default configurations should accept multiple size unit,DFS_NAMENODE_MAX_XATTR_SIZE_KEY,https://issues.apache.org/jira/browse/HDFS-9872,https://github.com/apache/hadoop/commit/88cce32551e6d52fd1c5a5bfd6c41499bf6ab1ab#,type change,HDFS bytes-default configurations should accept multiple size unit
20 | HDFS-9872,HDFS bytes-default configurations should accept multiple size unit,DFS_NAMENODE_MAX_COMPONENT_LENGTH_KEY,https://issues.apache.org/jira/browse/HDFS-9872,https://github.com/apache/hadoop/commit/88cce32551e6d52fd1c5a5bfd6c41499bf6ab1ab#,type change,HDFS bytes-default configurations should accept multiple size unit
21 | HDFS-9872,HDFS bytes-default configurations should accept multiple size unit,DFS_NAMENODE_MIN_BLOCK_SIZE_KEY,https://issues.apache.org/jira/browse/HDFS-9872,https://github.com/apache/hadoop/commit/88cce32551e6d52fd1c5a5bfd6c41499bf6ab1ab#,type change,HDFS bytes-default configurations should accept multiple size unit
22 | HDFS-9872,HDFS bytes-default configurations should accept multiple size unit,DFS_IMAGE_TRANSFER_RATE_KEY,https://issues.apache.org/jira/browse/HDFS-9872,https://github.com/apache/hadoop/commit/88cce32551e6d52fd1c5a5bfd6c41499bf6ab1ab#,type change,HDFS bytes-default configurations should accept multiple size unit
23 | HDFS-9872,HDFS bytes-default configurations should accept multiple size unit,DFS_IMAGE_TRANSFER_BOOTSTRAP_STANDBY_RATE_KEY,https://issues.apache.org/jira/browse/HDFS-9872,https://github.com/apache/hadoop/commit/88cce32551e6d52fd1c5a5bfd6c41499bf6ab1ab#,type change,HDFS bytes-default configurations should accept multiple size unit
24 | HDFS-9872,HDFS bytes-default configurations should accept multiple size unit,DFS_NAMENODE_DU_RESERVED_KEY,https://issues.apache.org/jira/browse/HDFS-9872,https://github.com/apache/hadoop/commit/88cce32551e6d52fd1c5a5bfd6c41499bf6ab1ab#,type change,HDFS bytes-default configurations should accept multiple size unit
25 | HDFS-9872,HDFS bytes-default configurations should accept multiple size unit,DFS_IMAGE_TRANSFER_CHUNKSIZE_KEY,https://issues.apache.org/jira/browse/HDFS-9872,https://github.com/apache/hadoop/commit/88cce32551e6d52fd1c5a5bfd6c41499bf6ab1ab#,type change,HDFS bytes-default configurations should accept multiple size unit
26 | SPARK-27256,"If the configuration is used to set the number of bytes, we'd better use `bytesConf`",spark.sql.files.maxPartitionBytes,https://issues.apache.org/jira/browse/SPARK-27256,https://github.com/apache/spark/commit/e4b36df2c0ae3bdba4484f9f92461dbb528d8fb9#,type change: long -> byte,"Currently, if we want to configure `spark. sql. files. maxPartitionBytes` to 256 megabytes, we must set `spark. sql. files. maxPartitionBytes=268435456`, which is very unfriendly to users."
27 | SPARK-27256,"If the configuration is used to set the number of bytes, we'd better use `bytesConf`",spark.files.maxPartitionBytes,https://issues.apache.org/jira/browse/SPARK-27256,https://github.com/apache/spark/commit/e4b36df2c0ae3bdba4484f9f92461dbb528d8fb9#,type change: long -> byte,"Currently, if we want to configure `spark. sql. files. maxPartitionBytes` to 256 megabytes, we must set `spark. sql. files. maxPartitionBytes=268435456`, which is very unfriendly to users."
28 | SPARK-27256,"If the configuration is used to set the number of bytes, we'd better use `bytesConf`",spark.files.openCostInBytes,https://issues.apache.org/jira/browse/SPARK-27256,https://github.com/apache/spark/commit/e4b36df2c0ae3bdba4484f9f92461dbb528d8fb9#,type change: long -> byte,"Currently, if we want to configure `spark. sql. files. maxPartitionBytes` to 256 megabytes, we must set `spark. sql. files. maxPartitionBytes=268435456`, which is very unfriendly to users."
29 | SPARK-27256,"If the configuration is used to set the number of bytes, we'd better use `bytesConf`",spark.shuffle.sort.initialBufferSize,https://issues.apache.org/jira/browse/SPARK-27256,https://github.com/apache/spark/commit/e4b36df2c0ae3bdba4484f9f92461dbb528d8fb9#,type change: long -> byte,"Currently, if we want to configure `spark. sql. files. maxPartitionBytes` to 256 megabytes, we must set `spark. sql. files. maxPartitionBytes=268435456`, which is very unfriendly to users."
30 | SPARK-27256,"If the configuration is used to set the number of bytes, we'd better use `bytesConf`",spark.shuffle.spill.initialMemoryThreshold,https://issues.apache.org/jira/browse/SPARK-27256,https://github.com/apache/spark/commit/e4b36df2c0ae3bdba4484f9f92461dbb528d8fb9#,type change: long -> byte,"Currently, if we want to configure `spark. sql. files. maxPartitionBytes` to 256 megabytes, we must set `spark. sql. files. maxPartitionBytes=268435456`, which is very unfriendly to users."
31 | SPARK-22845,Modify spark.kubernetes.allocation.batch.delay to take time instead of int,spark.kubernetes.allocation.batch.delay,https://issues.apache.org/jira/browse/SPARK-22845,https://github.com/apache/spark/commit/0114c89d049724b95f7823b957bf33790216316b#,type change: long -> time,Fixing configuration that was taking an int which should take time. Made the granularity milliseconds as opposed to seconds since there's a use-case for sub-second reactions to scale-up rapidly especially with dynamic allocation.
32 | SPARK-29151,Support fractional resources for task resource scheduling,spark.task.resource.{resourceName}.amount,https://issues.apache.org/jira/browse/SPARK-21287,https://github.com/apache/spark/commit/3cb18d90c441bbaa64c693e276793b670213e59,acceptable value change: support fractional,There is a configuration change where `spark.task.resource.[resource type].amount` can now be fractional.
33 | HDFS-14719,Correct the safemode threshold value in BlockManagerSafeMode.,DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY,https://issues.apache.org/jira/browse/HDFS-14719,https://github.com/apache/hadoop/commit/34681643e92774da6f74826c468ecec4dcbedf5,type change: double -> float,"BlockManagerSafeMode is doing wrong parsing for safemode threshold. It is storing float value in double, which will give different result some time."
34 | HDFS-14158,Checkpointer ignores configured time period > 5 minutes,dfs.namenode.checkpoint.period,https://issues.apache.org/jira/browse/HDFS-14158,https://github.com/apache/hadoop/commit/9aa3dc872ca9a528cb98ef56d9a33ab9d4531aa1#,acceptable value change: value range,"Are you running BackupNode? It has received little to no attention since people have not used it much for a long time. The standard way of checkpointing until the HA feature was to use secondary namenode, which has its own checks. ""periodMsec is always 5 minutes or lower"" might have been intentional and reasonable long time ago when BackupNode was first created."
35 | HDFS-12716,dfs.datanode.failed.volumes.tolerated' to support minimum number of volumes to be available.,dfs.datanode.failed.volumes.tolerated,https://issues.apache.org/jira/browse/HDFS-12716,https://github.com/apache/hadoop/commit/3108d27edde941d153a58f71fb1096cce2995531#,acceptable value change: value range,"Support 'dfs.datanode.failed.volumes.tolerated' to accept special 'negative value 'x' to tolerate failures of upto ""n-x"""
36 | SPARK-21287,Remove requirement of fetch_size>=0 from JDBCOptions,JDBC_BATCH_FETCH_SIZE,https://issues.apache.org/jira/browse/SPARK-21287,https://github.com/apache/spark/commit/92b25295ca0dc5b80aaddb1c8f8d5ef0a250d11,acceptable value change: value range,Remove the requirement of fetch_size>=0 from JDBCOptions to allow negative fetch size.
--------------------------------------------------------------------------------
/code/hdfs/diff_file_parser.py:
--------------------------------------------------------------------------------
1 | import re
2 | import download_diff
3 | from pathlib import Path
4 |
BASE_URL = "https://github.com/apache/hadoop/commit/"

# The patterns below are raw strings so that regex escapes such as '\.'
# are not interpreted as (invalid) Python string escapes, which would
# raise SyntaxWarning on modern Python.  The regex text is unchanged.

#RE for config File for HDFS (e.g. hdfs-default.xml)
HDFS_CONFIG_FILE_RE = r'[a-zA-Z\.\_\-]*-default.xml'

#RE for config Load in HDFS (e.g. conf.getInt(...), config.getBoolean(...))
HDFS_CONFIG_LOAD_FUNC_RE = r'[a-zA-Z\.\_\-]*[cC]onf[ig]*[\(\)]*[\s]*[\n]*[\s]*\.[\s]*[\n]*[\s]*get[a-zA-Z]*\([^;<>]+\)'

#RE for config assign in HDFS: a variable assigned from a config load call
HDFS_CONFIG_ASSIGN_RE = r'[a-zA-Z\.\_\-]+[\s]*=[\s]*[\n]*[\s]*' + HDFS_CONFIG_LOAD_FUNC_RE

#RE for config set in HDFS (e.g. conf.setInt(...))
HDFS_CONFIG_SET_FUNC_RE = r'[a-zA-Z\.\_\-]*[cC]onf[ig]*[\(\)]*[\s]*[\n]*[\s]*\.[\s]*[\n]*[\s]*set[a-zA-Z]*\([^;<>]+\)'

#RE for system parameter load in HDFS (System.getProperty / System.getenv)
HDFS_SYS_PARAM_LOAD_FUNC_RE = r'System\.get(?:Property|env)\([^)^;]+\)'

#RE for system parameter assign in HDFS: a variable assigned from a system property
HDFS_SYS_PARAM_ASSIGN_FUNC_RE = r'[a-zA-Z\.\_\-]+[\s]*=[\s]*[\n]*[\s]*' + HDFS_SYS_PARAM_LOAD_FUNC_RE

#RE for system parameter set in HDFS (System.setProperty)
HDFS_SYS_PARAM_SET_FUNC_RE = r'System\.set(?:Property|env)\([^)^;]+\)'

#Message in source code (quoted string literal)
MESSAGE_RE = r'".+"'
30 |
class DiffElement:
    """A single changed hunk from a diff: what changed, where, and how."""

    def __init__(self):
        # Location of the change: enclosing class (file basename) and method.
        self.diff_class = self.diff_method = ''
        # The changed code text and its direction ('+' added / '-' removed).
        self.diff_snippet = self.diff_change_mode = ''
37 |
class CodeElement:
    """A whole code snippet from a diff together with its enclosing class."""

    def __init__(self):
        self.code_class = ''    # class (file basename) the snippet belongs to
        self.code_snippet = ''  # full text of the snippet
42 |
class ConfigVariable:
    """A program variable whose value comes from a configuration property."""

    def __init__(self):
        self.variable_name = ''   # name of the assigned variable
        self.variable_class = ''  # class the variable belongs to
        self.variable_func = ''   # assignment expression that sets it
48 |
def add_diff_snippet(code_snippet,changed_class,change_mode,changed_method):
    """Package one diff hunk into a DiffElement and return it.

    code_snippet: the changed code text; changed_class/changed_method: where
    the change happened; change_mode: '+' for added lines, '-' for removed.
    """
    element = DiffElement()
    element.diff_class = changed_class
    element.diff_method = changed_method
    element.diff_snippet = code_snippet
    element.diff_change_mode = change_mode
    return element
56 |
def collect_config_variable(assign_obj,code_element,config_variable_list):
    """Collect variables assigned from HDFS configuration/system properties.

    Normalizes the matched assignment expression ``assign_obj``, extracts the
    left-hand variable name, and — unless the same name/class pair is already
    recorded — appends a new ConfigVariable to ``config_variable_list`` and
    logs it to ``config_variable.txt`` (append mode) as
    ``name##class##expression``.
    """
    # Normalize: drop newlines and spaces first, then the 'this.' prefix
    # (order matters: 'this .' collapses to 'this.' before removal).
    assign_obj = assign_obj.replace('\n','').replace(' ','').replace('this.','')

    #extract Variable that assigned
    m_variable = ConfigVariable()
    m_variable.variable_class = code_element.code_class
    m_variable.variable_func = assign_obj
    # The left-hand side of the first '=' is the variable name.
    m_variable.variable_name = assign_obj.split('=')[0]

    #if this Variable is a new Variable, add it into configVariable set
    duplicate = any(
        m_variable.variable_name == variable.variable_name
        and m_variable.variable_class == variable.variable_class
        for variable in config_variable_list
    )
    # Skip duplicates, trivially short names, and entries with no known class.
    if not duplicate and len(m_variable.variable_name) >= 3 and m_variable.variable_class != 'null':
        config_variable_list.append(m_variable)
        # 'with' guarantees the log file is closed even if the write fails.
        with open('config_variable.txt','a') as file:
            file.write(m_variable.variable_name + '##' + m_variable.variable_class
                       + '##' + m_variable.variable_func + '\n')
80 |
def diff_file_parser(url):
    """parse the diff_file, return the whole code and changed code (codeSet, diffSet)

    ``url`` is the path to a local ``<sha>.diff`` file.  If the file is
    missing it is first downloaded from GitHub via ``download_diff`` using
    the sha embedded in the path, then parsed.

    Returns a tuple ``(code_set, diff_set)``:
      * ``code_set`` — list of CodeElement; for every touched file section,
        all content lines (both sides of the diff, '+'/'-' markers stripped)
        concatenated into one snippet.
      * ``diff_set`` — list of DiffElement; only the added ('+') or removed
        ('-') lines, grouped into contiguous snippets with their class
        (file basename, or the literal 'test') and enclosing method.
    Returns None when the file cannot be opened.
    """
    try:
        diff_file = open(url,'r')
    except (Exception) as e:
        # print (e)
        # Recovery path: if the diff file simply does not exist yet,
        # download it from GitHub and retry the open.
        if Path(url).is_file() == False:
            commit_sha = url.replace('.diff','').split('/')
            download_diff.download(BASE_URL + commit_sha[-1])
            diff_file = open(url,'r')
        else:
            # The file exists but could not be opened: give up.
            print (e)
            return

    #get code snippets, correlated class
    # First pass: accumulate EVERY content line per touched file.
    # A '+++'/'---' file header flushes the current snippet and (unless it
    # is /dev/null) records the new file's basename as the "class" name.
    code_set = []
    code_snippet = ''
    code_class = ''
    for line in diff_file:
        if line:
            line = line.strip('\n')
            if len(line) > 1:
                if '+++' in line or '---' in line:
                    if code_snippet:
                        code_element = CodeElement()
                        code_element.code_snippet = code_snippet
                        code_element.code_class = code_class
                        code_set.append(code_element)
                        code_snippet = ''
                    if '/dev/null' not in line:
                        line = line.split('/')
                        code_class = line[-1]
                else:
                    # Strip a single leading diff marker, keep the code text.
                    if line[0] == '+':
                        line = line.replace('+','',1)
                    if line[0] == '-':
                        line = line.replace('-','',1)
                    code_snippet = code_snippet + line
    # Flush the trailing snippet when the file ends without another header.
    if code_snippet:
        code_element = CodeElement()
        code_element.code_snippet = code_snippet
        code_element.code_class = code_class
        code_set.append(code_element)
        code_snippet = ''

    diff_file.close()

    #get diff snippets, correlated changed class and method
    # Second pass over the same file: collect only '+'/'-' lines, grouped
    # into contiguous added/removed snippets.
    try:
        diff_file2 = open(url,'r')
    except (Exception) as e:
        print (e)
        return

    diff_set = []
    changed_class = ''
    changed_method = ''
    add_snippet = ''
    add_flag = 0      # 1 while inside a run of added lines
    minus_snippet = ''
    minus_flag = 0    # 1 while inside a run of removed lines
    for line in diff_file2:
        if line:
            line = line.strip('\n')
            if '@@' in line:
                # Hunk header: the text after the second '@@' is the
                # enclosing method/context, when git provides it.
                line = line.split('@@')
                if len(line) >= 3:
                    changed_method = line[2]
            elif '+++' in line or '---' in line:
                if '/dev/null' not in line:
                    # Any path containing 'test' is tagged so that its
                    # diffs are skipped below.
                    if 'test' in line:
                        changed_class = 'test'
                    else:
                        line = line.split('/')
                        changed_class = line[-1]
            else:
                if line[0] == '+':
                    line = line.replace('+','',1)
                    # Start a fresh snippet at the beginning of a '+' run.
                    if add_flag == 0:
                        add_snippet = ''
                    # Pure import changes are ignored.
                    if 'import' not in line:
                        add_snippet = add_snippet + line + '\n'
                    add_flag = 1
                elif line[0] == '-':
                    line = line.replace('-','',1)
                    if minus_flag == 0:
                        minus_snippet = ''
                    if 'import' not in line:
                        minus_snippet = minus_snippet + line + '\n'
                    minus_flag = 1
                else:
                    # Context line ends any open '+'/'-' run: flush the
                    # accumulated snippet(s), skipping test files.
                    if add_flag == 1:
                        if add_snippet:
                            if changed_class != 'test':
                                add_element = add_diff_snippet(add_snippet,changed_class,'+',changed_method)
                                diff_set.append(add_element)
                        add_flag = 0
                    if minus_flag == 1:
                        if minus_snippet:
                            if changed_class != 'test':
                                minus_element = add_diff_snippet(minus_snippet,changed_class,'-',changed_method)
                                diff_set.append(minus_element)
                        minus_flag = 0
    #if file end with diffline
    if add_flag == 1:
        if add_snippet:
            if changed_class != 'test':
                add_element = add_diff_snippet(add_snippet,changed_class,'+',changed_method)
                diff_set.append(add_element)

    if minus_flag == 1:
        if minus_snippet:
            if changed_class != 'test':
                minus_element = add_diff_snippet(minus_snippet,changed_class,'-',changed_method)
                diff_set.append(minus_element)

    diff_file2.close()

    return code_set,diff_set
200 |
201 | def diffSelection(url,config_variable_list):
202 |
203 | diff = diff_file_parser(url)
204 |
205 | if diff:
206 | codeSet = diff[0]
207 | diffSet = diff[1]
208 | else:
209 | codeSet = 0
210 | diffSet = 0
211 |
212 | #Wheter a diff touches configuration file
213 | configFileTouched = False
214 |
215 | #Wheter a diff touches configuration load function
216 | configLoadTouched = False
217 |
218 | #Wheter a diff touches configuration set function
219 | configSetTouched = False
220 |
221 | #Wheter a diff touches configuration Variableeter
222 | configVariableTouched = False
223 |
224 | #whether a diff touches configuration message(log, error message)
225 | configMessageTouched = False
226 |
227 | #the set of touched file
228 | touchedFile = []
229 |
230 | #the set of touched configuration load function
231 | touchedLoadFunc = []
232 |
233 | #the set of touched configuration set function
234 | touchedSetFunc = []
235 |
236 | #the set of touched configuration Variableeter
237 | touchedVariable = []
238 |
239 | #the set of touched configuration message
240 | touchedMessage = []
241 |
242 | if codeSet and diffSet:
243 |
244 | #collect configuration variables in code snippet(not diff snippet)
245 | for codeElement in codeSet:
246 | configAssignObj = re.findall(HDFS_CONFIG_ASSIGN_RE,codeElement.code_snippet,re.M | re.I)
247 | if configAssignObj:
248 | for assignObj in configAssignObj:
249 | collect_config_variable(assignObj,codeElement,config_variable_list)
250 |
251 | sysParamAssignObj = re.findall(HDFS_SYS_PARAM_ASSIGN_FUNC_RE,codeElement.code_snippet,re.M | re.I)
252 | if sysParamAssignObj:
253 | for assignObj in sysParamAssignObj:
254 | collect_config_variable(assignObj,codeElement,config_variable_list)
255 |
256 | for diffElement in diffSet:
257 |
258 | #check whether diff touches config file
259 | configFileObj = re.findall(HDFS_CONFIG_FILE_RE,diffElement.diff_class,re.M | re.I)
260 | if configFileObj:
261 | configFileTouched = True
262 | for fileObj in configFileObj:
263 | touchedFile.append(diffElement.diff_change_mode + fileObj)
264 |
265 | #check whether diff touches config load function
266 | configLoadObj = re.findall(HDFS_CONFIG_LOAD_FUNC_RE,diffElement.diff_snippet,re.M | re.I)
267 | if configLoadObj:
268 | for loadObj in configLoadObj:
269 | loadObjStr = diffElement.diff_change_mode + loadObj.replace(' ','').replace('\n','')
270 | if diffElement.diff_change_mode == '+':
271 | reverseMode = '-'
272 | else:
273 | reverseMode = '+'
274 | reverseFlag = False
275 | for Func in touchedLoadFunc:
276 | if Func == reverseMode + loadObj.replace(' ','').replace('\n',''):
277 | touchedLoadFunc.remove(Func)
278 | reverseFlag = True
279 | break
280 | if reverseFlag == False:
281 | touchedLoadFunc.append(diffElement.diff_change_mode + loadObj.replace(' ','').replace('\n',''))
282 |
283 | sysParamLoadObj = re.findall(HDFS_SYS_PARAM_LOAD_FUNC_RE,diffElement.diff_snippet,re.M | re.I)
284 | if sysParamLoadObj:
285 | for loadObj in sysParamLoadObj:
286 | loadObjStr = diffElement.diff_change_mode + loadObj.replace(' ','').replace('\n','')
287 | if diffElement.diff_change_mode == '+':
288 | reverseMode = '-'
289 | else:
290 | reverseMode = '+'
291 | reverseFlag = False
292 | for Func in touchedLoadFunc:
293 | if Func == reverseMode + loadObj.replace(' ','').replace('\n',''):
294 | touchedLoadFunc.remove(Func)
295 | reverseFlag = True
296 | break
297 | if reverseFlag == False:
298 | touchedLoadFunc.append(diffElement.diff_change_mode + loadObj.replace(' ','').replace('\n',''))
299 |
300 | #check whether diff touches config set function
301 | configSetObj = re.findall(HDFS_CONFIG_SET_FUNC_RE,diffElement.diff_snippet,re.M | re.I)
302 | if configSetObj:
303 | for setObj in configSetObj:
304 | setObjStr = diffElement.diff_change_mode + setObj.replace(' ','').replace('\n','')
305 | if diffElement.diff_change_mode == '+':
306 | reverseMode = '-'
307 | else:
308 | reverseMode = '+'
309 | reverseFlag = False
310 | for Func in touchedSetFunc:
311 | if Func == reverseMode + setObj.replace(' ','').replace('\n',''):
312 | touchedSetFunc.remove(Func)
313 | reverseFlag = True
314 | break
315 | if reverseFlag == False:
316 | touchedSetFunc.append(diffElement.diff_change_mode + setObj.replace(' ','').replace('\n',''))
317 |
318 | sysParamSetObj = re.findall(HDFS_SYS_PARAM_SET_FUNC_RE,diffElement.diff_snippet,re.M | re.I)
319 | if sysParamSetObj:
320 | for setObj in sysParamSetObj:
321 | setObjStr = diffElement.diff_change_mode + setObj.replace(' ','').replace('\n','')
322 | if diffElement.diff_change_mode == '+':
323 | reverseMode = '-'
324 | else:
325 | reverseMode = '+'
326 | reverseFlag = False
327 | for Func in touchedSetFunc:
328 | if Func == reverseMode + setObj.replace(' ','').replace('\n',''):
329 | touchedSetFunc.remove(Func)
330 | reverseFlag = True
331 | break
332 | if reverseFlag == False:
333 | touchedSetFunc.append(diffElement.diff_change_mode + setObj.replace(' ','').replace('\n',''))
334 |
335 | #check whether diff touches config related Variable
336 | for Variable in config_variable_list:
337 | if Variable.variable_name in diffElement.diff_snippet and Variable.variable_class == diffElement.diff_class:
338 | variableStr = diffElement.diff_change_mode + Variable.variable_name + ' ' + Variable.variable_class
339 | if diffElement.diff_change_mode == '+':
340 | reverseMode = '-'
341 | else:
342 | reverseMode = '+'
343 | reverseFlag = False
344 | for var in touchedVariable:
345 | if var == reverseMode + Variable.variable_name + ' ' + Variable.variable_class:
346 | touchedVariable.remove(var)
347 | reverseFlag = True
348 | break
349 | if reverseFlag == False:
350 | touchedVariable.append(variableStr)
351 |
352 | #check whether diff touches configuration message
353 | messageObj = re.findall(MESSAGE_RE,diffElement.diff_snippet,re.M | re.I)
354 | if messageObj:
355 | for messages in messageObj:
356 | messages = messages.split('"')
357 | for message in messages:
358 | words = message.lower().split(" ")
359 | if len(words) > 3:
360 | if 'option' in words or 'parameter' in words or 'config' in message.lower():
361 | configMessageTouched = True
362 | touchedMessage.append(diffElement.diff_change_mode + message)
363 |
364 | if touchedLoadFunc != []:
365 | configLoadTouched = True
366 |
367 | if touchedSetFunc != []:
368 | configSetTouched = True
369 |
370 | if touchedVariable != []:
371 | configVariableTouched = True
372 |
373 | return configFileTouched,configLoadTouched,configSetTouched,configVariableTouched,configMessageTouched,touchedFile,touchedLoadFunc,touchedSetFunc,touchedVariable,touchedMessage
374 |
375 | else:
376 | return False
377 |
378 |
379 |
380 |
381 |
--------------------------------------------------------------------------------
/code/hdfs_demo_examples/diff_file_parser.py:
--------------------------------------------------------------------------------
1 | import re
2 | import download_diff
3 | from pathlib import Path
4 |
5 | BASE_URL = "https://github.com/apache/hadoop/commit/"
6 |
7 | #RE for config File for HDFS
8 | HDFS_CONFIG_FILE_RE = '[a-zA-Z\.\_\-]*-default.xml'
9 |
10 | #RE for config Load in HDFS
11 | HDFS_CONFIG_LOAD_FUNC_RE = '[a-zA-Z\.\_\-]*[cC]onf[ig]*[\(\)]*[\s]*[\n]*[\s]*\.[\s]*[\n]*[\s]*get[a-zA-Z]*\([^;<>]+\)'
12 |
13 | #RE for config assign in HDFS
14 | HDFS_CONFIG_ASSIGN_RE = '[a-zA-Z\.\_\-]+[\s]*=[\s]*[\n]*[\s]*' + HDFS_CONFIG_LOAD_FUNC_RE
15 |
16 | #RE for config set in HDFS
17 | HDFS_CONFIG_SET_FUNC_RE = '[a-zA-Z\.\_\-]*[cC]onf[ig]*[\(\)]*[\s]*[\n]*[\s]*\.[\s]*[\n]*[\s]*set[a-zA-Z]*\([^;<>]+\)'
18 |
19 | #RE for system parameter load in HDFS
20 | HDFS_SYS_PARAM_LOAD_FUNC_RE = 'System\.get(?:Property|env)\([^)^;]+\)'
21 |
22 | #RE for system parameter assign in HDFS
23 | HDFS_SYS_PARAM_ASSIGN_FUNC_RE = '[a-zA-Z\.\_\-]+[\s]*=[\s]*[\n]*[\s]*' + HDFS_SYS_PARAM_LOAD_FUNC_RE
24 |
25 | #RE for system parameter set in HDFS
26 | HDFS_SYS_PARAM_SET_FUNC_RE = 'System\.set(?:Property|env)\([^)^;]+\)'
27 |
28 | #Message in source code
29 | MESSAGE_RE = '".+"'
30 |
31 | class DiffElement:
32 | def __init__(self):
33 | self.diff_class = '' #class that this diff belongs to
34 | self.diff_method = '' #method that this diff belongs to
35 | self.diff_snippet = '' #changed code in this diff
36 | self.diff_change_mode = '' #'+' or '-'
37 |
38 | class CodeElement:
39 | def __init__(self):
40 | self.code_class = '' #class that this diff belongs to
41 | self.code_snippet = '' #changed code in this diff
42 |
43 | class ConfigVariable:
44 | def __init__(self):
45 | self.variable_name = '' #Variable name
46 | self.variable_class = '' #class that this Variable belongs to
47 | self.variable_func = '' #function that assign this Variable
48 |
49 | def add_diff_snippet(code_snippet,changed_class,change_mode,changed_method):
50 | code_element = DiffElement()
51 | code_element.diff_snippet = code_snippet
52 | code_element.diff_class = changed_class
53 | code_element.diff_change_mode = change_mode
54 | code_element.diff_method = changed_method
55 | return code_element
56 |
57 | def collect_config_variable(assign_obj,code_element,config_variable_list):
58 | """collect variables that assgined by Cassandra configuration/system properties"""
59 | assign_obj = assign_obj.replace('\n','').replace(' ','').replace('this.','')
60 |
61 | #extract Variable that assigned
62 | m_variable = ConfigVariable()
63 | m_variable.variable_class = code_element.code_class
64 | m_variable.variable_func = assign_obj
65 | variable_name = assign_obj.split('=')
66 | variable_name = variable_name[0]
67 | m_variable.variable_name = variable_name
68 |
69 | #if this Variable is a new Variable, add it into configVariable set
70 | duplicate_flag = 0
71 | for variable in config_variable_list:
72 | if m_variable.variable_name == variable.variable_name and m_variable.variable_class == variable.variable_class:
73 | duplicate_flag =1
74 | break
75 | if duplicate_flag == 0 and len(m_variable.variable_name)>=3 and m_variable.variable_class != 'null':
76 | config_variable_list.append(m_variable)
77 | file = open('config_variable.txt','a')
78 | file.write(m_variable.variable_name + '##' + m_variable.variable_class + '##' + m_variable.variable_func + '\n')
79 | file.close()
80 |
81 | def diff_file_parser(url):
82 | """parse the diff_file, return the whole code and changed code (codeSet, diffSet)"""
83 | try:
84 | diff_file = open(url,'r')
85 | except (Exception) as e:
86 | # print (e)
87 | if Path(url).is_file() == False:
88 | commit_sha = url.replace('.diff','').split('/')
89 | download_diff.download(BASE_URL + commit_sha[-1])
90 | diff_file = open(url,'r')
91 | else:
92 | print (e)
93 | return
94 |
95 | #get code snippets, correlated class
96 | code_set = []
97 | code_snippet = ''
98 | code_class = ''
99 | for line in diff_file:
100 | if line:
101 | line = line.strip('\n')
102 | if len(line) > 1:
103 | if '+++' in line or '---' in line:
104 | if code_snippet:
105 | code_element = CodeElement()
106 | code_element.code_snippet = code_snippet
107 | code_element.code_class = code_class
108 | code_set.append(code_element)
109 | code_snippet = ''
110 | if '/dev/null' not in line:
111 | line = line.split('/')
112 | code_class = line[-1]
113 | else:
114 | if line[0] == '+':
115 | line = line.replace('+','',1)
116 | if line[0] == '-':
117 | line = line.replace('-','',1)
118 | code_snippet = code_snippet + line
119 | if code_snippet:
120 | code_element = CodeElement()
121 | code_element.code_snippet = code_snippet
122 | code_element.code_class = code_class
123 | code_set.append(code_element)
124 | code_snippet = ''
125 |
126 | diff_file.close()
127 |
128 | #get diff snippets, correlated changed class and method
129 | try:
130 | diff_file2 = open(url,'r')
131 | except (Exception) as e:
132 | print (e)
133 | return
134 |
135 | diff_set = []
136 | changed_class = ''
137 | changed_method = ''
138 | add_snippet = ''
139 | add_flag = 0
140 | minus_snippet = ''
141 | minus_flag = 0
142 | for line in diff_file2:
143 | if line:
144 | line = line.strip('\n')
145 | if '@@' in line:
146 | line = line.split('@@')
147 | if len(line) >= 3:
148 | changed_method = line[2]
149 | elif '+++' in line or '---' in line:
150 | if '/dev/null' not in line:
151 | if 'test' in line:
152 | changed_class = 'test'
153 | else:
154 | line = line.split('/')
155 | changed_class = line[-1]
156 | else:
157 | if line[0] == '+':
158 | line = line.replace('+','',1)
159 | if add_flag == 0:
160 | add_snippet = ''
161 | if 'import' not in line:
162 | add_snippet = add_snippet + line + '\n'
163 | add_flag = 1
164 | elif line[0] == '-':
165 | line = line.replace('-','',1)
166 | if minus_flag == 0:
167 | minus_snippet = ''
168 | if 'import' not in line:
169 | minus_snippet = minus_snippet + line + '\n'
170 | minus_flag = 1
171 | else:
172 | if add_flag == 1:
173 | if add_snippet:
174 | if changed_class != 'test':
175 | add_element = add_diff_snippet(add_snippet,changed_class,'+',changed_method)
176 | diff_set.append(add_element)
177 | add_flag = 0
178 | if minus_flag == 1:
179 | if minus_snippet:
180 | if changed_class != 'test':
181 | minus_element = add_diff_snippet(minus_snippet,changed_class,'-',changed_method)
182 | diff_set.append(minus_element)
183 | minus_flag = 0
184 | #if file end with diffline
185 | if add_flag == 1:
186 | if add_snippet:
187 | if changed_class != 'test':
188 | add_element = add_diff_snippet(add_snippet,changed_class,'+',changed_method)
189 | diff_set.append(add_element)
190 |
191 | if minus_flag == 1:
192 | if minus_snippet:
193 | if changed_class != 'test':
194 | minus_element = add_diff_snippet(minus_snippet,changed_class,'-',changed_method)
195 | diff_set.append(minus_element)
196 |
197 | diff_file2.close()
198 |
199 | return code_set,diff_set
200 |
201 | def diffSelection(url,config_variable_list):
202 |
203 | diff = diff_file_parser(url)
204 |
205 | if diff:
206 | codeSet = diff[0]
207 | diffSet = diff[1]
208 | else:
209 | codeSet = 0
210 | diffSet = 0
211 |
212 | #Wheter a diff touches configuration file
213 | configFileTouched = False
214 |
215 | #Wheter a diff touches configuration load function
216 | configLoadTouched = False
217 |
218 | #Wheter a diff touches configuration set function
219 | configSetTouched = False
220 |
221 | #Wheter a diff touches configuration Variableeter
222 | configVariableTouched = False
223 |
224 | #whether a diff touches configuration message(log, error message)
225 | configMessageTouched = False
226 |
227 | #the set of touched file
228 | touchedFile = []
229 |
230 | #the set of touched configuration load function
231 | touchedLoadFunc = []
232 |
233 | #the set of touched configuration set function
234 | touchedSetFunc = []
235 |
236 | #the set of touched configuration Variableeter
237 | touchedVariable = []
238 |
239 | #the set of touched configuration message
240 | touchedMessage = []
241 |
242 | if codeSet and diffSet:
243 |
244 | #collect configuration variables in code snippet(not diff snippet)
245 | for codeElement in codeSet:
246 | configAssignObj = re.findall(HDFS_CONFIG_ASSIGN_RE,codeElement.code_snippet,re.M | re.I)
247 | if configAssignObj:
248 | for assignObj in configAssignObj:
249 | collect_config_variable(assignObj,codeElement,config_variable_list)
250 |
251 | sysParamAssignObj = re.findall(HDFS_SYS_PARAM_ASSIGN_FUNC_RE,codeElement.code_snippet,re.M | re.I)
252 | if sysParamAssignObj:
253 | for assignObj in sysParamAssignObj:
254 | collect_config_variable(assignObj,codeElement,config_variable_list)
255 |
256 | for diffElement in diffSet:
257 |
258 | #check whether diff touches config file
259 | configFileObj = re.findall(HDFS_CONFIG_FILE_RE,diffElement.diff_class,re.M | re.I)
260 | if configFileObj:
261 | configFileTouched = True
262 | for fileObj in configFileObj:
263 | touchedFile.append(diffElement.diff_change_mode + fileObj)
264 |
265 | #check whether diff touches config load function
266 | configLoadObj = re.findall(HDFS_CONFIG_LOAD_FUNC_RE,diffElement.diff_snippet,re.M | re.I)
267 | if configLoadObj:
268 | for loadObj in configLoadObj:
269 | loadObjStr = diffElement.diff_change_mode + loadObj.replace(' ','').replace('\n','')
270 | if diffElement.diff_change_mode == '+':
271 | reverseMode = '-'
272 | else:
273 | reverseMode = '+'
274 | reverseFlag = False
275 | for Func in touchedLoadFunc:
276 | if Func == reverseMode + loadObj.replace(' ','').replace('\n',''):
277 | touchedLoadFunc.remove(Func)
278 | reverseFlag = True
279 | break
280 | if reverseFlag == False:
281 | touchedLoadFunc.append(diffElement.diff_change_mode + loadObj.replace(' ','').replace('\n',''))
282 |
283 | sysParamLoadObj = re.findall(HDFS_SYS_PARAM_LOAD_FUNC_RE,diffElement.diff_snippet,re.M | re.I)
284 | if sysParamLoadObj:
285 | for loadObj in sysParamLoadObj:
286 | loadObjStr = diffElement.diff_change_mode + loadObj.replace(' ','').replace('\n','')
287 | if diffElement.diff_change_mode == '+':
288 | reverseMode = '-'
289 | else:
290 | reverseMode = '+'
291 | reverseFlag = False
292 | for Func in touchedLoadFunc:
293 | if Func == reverseMode + loadObj.replace(' ','').replace('\n',''):
294 | touchedLoadFunc.remove(Func)
295 | reverseFlag = True
296 | break
297 | if reverseFlag == False:
298 | touchedLoadFunc.append(diffElement.diff_change_mode + loadObj.replace(' ','').replace('\n',''))
299 |
300 | #check whether diff touches config set function
301 | configSetObj = re.findall(HDFS_CONFIG_SET_FUNC_RE,diffElement.diff_snippet,re.M | re.I)
302 | if configSetObj:
303 | for setObj in configSetObj:
304 | setObjStr = diffElement.diff_change_mode + setObj.replace(' ','').replace('\n','')
305 | if diffElement.diff_change_mode == '+':
306 | reverseMode = '-'
307 | else:
308 | reverseMode = '+'
309 | reverseFlag = False
310 | for Func in touchedSetFunc:
311 | if Func == reverseMode + setObj.replace(' ','').replace('\n',''):
312 | touchedSetFunc.remove(Func)
313 | reverseFlag = True
314 | break
315 | if reverseFlag == False:
316 | touchedSetFunc.append(diffElement.diff_change_mode + setObj.replace(' ','').replace('\n',''))
317 |
318 | sysParamSetObj = re.findall(HDFS_SYS_PARAM_SET_FUNC_RE,diffElement.diff_snippet,re.M | re.I)
319 | if sysParamSetObj:
320 | for setObj in sysParamSetObj:
321 | setObjStr = diffElement.diff_change_mode + setObj.replace(' ','').replace('\n','')
322 | if diffElement.diff_change_mode == '+':
323 | reverseMode = '-'
324 | else:
325 | reverseMode = '+'
326 | reverseFlag = False
327 | for Func in touchedSetFunc:
328 | if Func == reverseMode + setObj.replace(' ','').replace('\n',''):
329 | touchedSetFunc.remove(Func)
330 | reverseFlag = True
331 | break
332 | if reverseFlag == False:
333 | touchedSetFunc.append(diffElement.diff_change_mode + setObj.replace(' ','').replace('\n',''))
334 |
335 | #check whether diff touches config related Variable
336 | for Variable in config_variable_list:
337 | if Variable.variable_name in diffElement.diff_snippet and Variable.variable_class == diffElement.diff_class:
338 | variableStr = diffElement.diff_change_mode + Variable.variable_name + ' ' + Variable.variable_class
339 | if diffElement.diff_change_mode == '+':
340 | reverseMode = '-'
341 | else:
342 | reverseMode = '+'
343 | reverseFlag = False
344 | for var in touchedVariable:
345 | if var == reverseMode + Variable.variable_name + ' ' + Variable.variable_class:
346 | touchedVariable.remove(var)
347 | reverseFlag = True
348 | break
349 | if reverseFlag == False:
350 | touchedVariable.append(variableStr)
351 |
352 | #check whether diff touches configuration message
353 | messageObj = re.findall(MESSAGE_RE,diffElement.diff_snippet,re.M | re.I)
354 | if messageObj:
355 | for messages in messageObj:
356 | messages = messages.split('"')
357 | for message in messages:
358 | words = message.lower().split(" ")
359 | if len(words) > 3:
360 | if 'option' in words or 'parameter' in words or 'config' in message.lower():
361 | configMessageTouched = True
362 | touchedMessage.append(diffElement.diff_change_mode + message)
363 |
364 | if touchedLoadFunc != []:
365 | configLoadTouched = True
366 |
367 | if touchedSetFunc != []:
368 | configSetTouched = True
369 |
370 | if touchedVariable != []:
371 | configVariableTouched = True
372 |
373 | return configFileTouched,configLoadTouched,configSetTouched,configVariableTouched,configMessageTouched,touchedFile,touchedLoadFunc,touchedSetFunc,touchedVariable,touchedMessage
374 |
375 | else:
376 | return False
377 |
378 |
379 |
380 |
381 |
--------------------------------------------------------------------------------
/code/hbase/diff_file_parser.py:
--------------------------------------------------------------------------------
1 | import re
2 | import download_diff
3 | from pathlib import Path
4 |
5 | BASE_URL = "https://github.com/apache/hbase/commit/"
6 |
7 | #configFile name for HBase
8 | HBASE_CONFIG_FILE_RE = '[a-zA-Z\.\_\-]*-default.xml'
9 |
10 | #RE for config Load in HBase
11 | HBASE_CONFIG_LOAD_FUNC_RE = '[a-zA-Z\.\_\-]*[cC]onf[ig]*[\(\)]*[\s]*[\n]*[\s]*\.[\s]*[\n]*[\s]*get[a-zA-Z]*\([^;<>]+\)'
12 |
13 | #RE for config assign in HBase
14 | HBASE_CONFIG_ASSIGN_RE = '[a-zA-Z\.\_\-]+[\s]*=[\s]*[\n]*[\s]*' + HBASE_CONFIG_LOAD_FUNC_RE
15 |
16 | #RE for config set in HBase
17 | HBASE_CONFIG_SET_FUNC_RE = '[a-zA-Z\.\_\-]*[cC]onf[ig]*[\(\)]*[\s]*[\n]*[\s]*\.[\s]*[\n]*[\s]*set[a-zA-Z]*\([^;<>]+\)'
18 |
19 | #RE for system parameter load in HBase
20 | HBASE_SYS_PARAM_LOAD_FUNC_RE = 'System\.get(?:Property|env)\([^)^;]+\)'
21 |
22 | #RE for system parameter assign in HBase
23 | HBASE_SYS_PARAM_ASSIGN_FUNC_RE = '[a-zA-Z\.\_\-]+[\s]*=[\s]*[\n]*[\s]*' + HBASE_SYS_PARAM_LOAD_FUNC_RE
24 |
25 | #RE for system parameter set in HBase
26 | HBASE_SYS_PARAM_SET_FUNC_RE = 'System\.set(?:Property|env)\([^)^;]+\)'
27 |
28 | #RE for message in program
29 | MESSAGE_RE = '".+"'
30 |
31 | class DiffElement:
32 | def __init__(self):
33 | self.diff_class = '' #class that this diff belongs to
34 | self.diff_method = '' #method that this diff belongs to
35 | self.diff_snippet = '' #changed code in this diff
36 | self.diff_change_mode = '' #'+' or '-'
37 |
38 | class CodeElement:
39 | def __init__(self):
40 | self.code_class = '' #class that this diff belongs to
41 | self.code_snippet = '' #changed code in this diff
42 |
43 | class ConfigVariable:
44 | def __init__(self):
45 | self.variable_name = '' #Variable name
46 | self.variable_class = '' #class that this Variable belongs to
47 | self.variable_func = '' #function that assign this Variable
48 |
49 | def add_diff_snippet(code_snippet,changed_class,change_mode,changed_method):
50 | code_element = DiffElement()
51 | code_element.diff_snippet = code_snippet
52 | code_element.diff_class = changed_class
53 | code_element.diff_change_mode = change_mode
54 | code_element.diff_method = changed_method
55 | return code_element
56 |
57 | def collect_config_variable(assign_obj,code_element,config_variable_list):
58 | """collect variables that assgined by Cassandra configuration/system properties"""
59 | assign_obj = assign_obj.replace('\n','').replace(' ','').replace('this.','')
60 |
61 | #extract Variable that assigned
62 | m_variable = ConfigVariable()
63 | m_variable.variable_class = code_element.code_class
64 | m_variable.variable_func = assign_obj
65 | variable_name = assign_obj.split('=')
66 | variable_name = variable_name[0]
67 | m_variable.variable_name = variable_name
68 |
69 | #if this Variable is a new Variable, add it into configVariable set
70 | duplicate_flag = 0
71 | for variable in config_variable_list:
72 | if m_variable.variable_name == variable.variable_name and m_variable.variable_class == variable.variable_class:
73 | duplicate_flag =1
74 | break
75 | if duplicate_flag == 0 and len(m_variable.variable_name)>=3 and m_variable.variable_class != 'null':
76 | config_variable_list.append(m_variable)
77 | file = open('config_variable.txt','a')
78 | file.write(m_variable.variable_name + '##' + m_variable.variable_class + '##' + m_variable.variable_func + '\n')
79 | file.close()
80 |
81 | def diff_file_parser(url):
82 | """parse the diff_file, return the whole code and changed code (codeSet, diffSet)"""
83 | try:
84 | diff_file = open(url,'r')
85 | except (Exception) as e:
86 | # print (e)
87 | if Path(url).is_file() == False:
88 | commit_sha = url.replace('.diff','').split('/')
89 | download_diff.download(BASE_URL + commit_sha[-1])
90 | diff_file = open(url,'r')
91 | else:
92 | print (e)
93 | return
94 |
95 | #get code snippets, correlated class
96 | code_set = []
97 | code_snippet = ''
98 | code_class = ''
99 | for line in diff_file:
100 | if line:
101 | line = line.strip('\n')
102 | if len(line) > 1:
103 | if '+++' in line or '---' in line:
104 | if code_snippet:
105 | code_element = CodeElement()
106 | code_element.code_snippet = code_snippet
107 | code_element.code_class = code_class
108 | code_set.append(code_element)
109 | code_snippet = ''
110 | if '/dev/null' not in line:
111 | line = line.split('/')
112 | code_class = line[-1]
113 | else:
114 | if line[0] == '+':
115 | line = line.replace('+','',1)
116 | if line[0] == '-':
117 | line = line.replace('-','',1)
118 | code_snippet = code_snippet + line
119 | if code_snippet:
120 | code_element = CodeElement()
121 | code_element.code_snippet = code_snippet
122 | code_element.code_class = code_class
123 | code_set.append(code_element)
124 | code_snippet = ''
125 |
126 | diff_file.close()
127 |
128 | #get diff snippets, correlated changed class and method
129 | try:
130 | diff_file2 = open(url,'r')
131 | except (Exception) as e:
132 | print (e)
133 | return
134 |
135 | diff_set = []
136 | changed_class = ''
137 | changed_method = ''
138 | add_snippet = ''
139 | add_flag = 0
140 | minus_snippet = ''
141 | minus_flag = 0
142 | for line in diff_file2:
143 | if line:
144 | line = line.strip('\n')
145 | if '@@' in line:
146 | line = line.split('@@')
147 | if len(line) >= 3:
148 | changed_method = line[2]
149 | elif '+++' in line or '---' in line:
150 | if '/dev/null' not in line:
151 | if 'test' in line:
152 | changed_class = 'test'
153 | else:
154 | line = line.split('/')
155 | changed_class = line[-1]
156 | else:
157 | if line[0] == '+':
158 | line = line.replace('+','',1)
159 | if add_flag == 0:
160 | add_snippet = ''
161 | if 'import' not in line:
162 | add_snippet = add_snippet + line + '\n'
163 | add_flag = 1
164 | elif line[0] == '-':
165 | line = line.replace('-','',1)
166 | if minus_flag == 0:
167 | minus_snippet = ''
168 | if 'import' not in line:
169 | minus_snippet = minus_snippet + line + '\n'
170 | minus_flag = 1
171 | else:
172 | if add_flag == 1:
173 | if add_snippet:
174 | if changed_class != 'test':
175 | add_element = add_diff_snippet(add_snippet,changed_class,'+',changed_method)
176 | diff_set.append(add_element)
177 | add_flag = 0
178 | if minus_flag == 1:
179 | if minus_snippet:
180 | if changed_class != 'test':
181 | minus_element = add_diff_snippet(minus_snippet,changed_class,'-',changed_method)
182 | diff_set.append(minus_element)
183 | minus_flag = 0
184 | #if file end with diffline
185 | if add_flag == 1:
186 | if add_snippet:
187 | if changed_class != 'test':
188 | add_element = add_diff_snippet(add_snippet,changed_class,'+',changed_method)
189 | diff_set.append(add_element)
190 |
191 | if minus_flag == 1:
192 | if minus_snippet:
193 | if changed_class != 'test':
194 | minus_element = add_diff_snippet(minus_snippet,changed_class,'-',changed_method)
195 | diff_set.append(minus_element)
196 |
197 | diff_file2.close()
198 |
199 | return code_set,diff_set
200 |
201 | def diffSelection(url,configVariableList):
202 |
203 | diff = diff_file_parser(url)
204 |
205 | if diff:
206 | codeSet = diff[0]
207 | diffSet = diff[1]
208 | else:
209 | codeSet = 0
210 | diffSet = 0
211 |
212 | #Wheter a diff touches configuration file
213 | configFileTouched = False
214 |
215 | #Wheter a diff touches configuration load function
216 | configLoadTouched = False
217 |
218 | #Wheter a diff touches configuration set function
219 | configSetTouched = False
220 |
221 | #Wheter a diff touches configuration Variableeter
222 | configVariableTouched = False
223 |
224 | #whether a diff touches configuration message(log, error message)
225 | configMessageTouched = False
226 |
227 | #the set of touched file
228 | touchedFile = []
229 |
230 | #the set of touched configuration load function
231 | touchedLoadFunc = []
232 |
233 | #the set of touched configuration set function
234 | touchedSetFunc = []
235 |
236 | #the set of touched configuration Variable
237 | touchedVariable = []
238 |
239 | #the set of touched configuration message
240 | touchedMessage = []
241 |
242 | if codeSet and diffSet:
243 |
244 | #collect configuration variables in code snippet(not diff snippet)
245 | for codeElement in codeSet:
246 | configAssignObj = re.findall(HBASE_CONFIG_ASSIGN_RE,codeElement.code_snippet,re.M | re.I)
247 | if configAssignObj:
248 | for assignObj in configAssignObj:
249 | collect_config_variable(assignObj,codeElement,configVariableList)
250 |
251 | sysParamAssignObj = re.findall(HBASE_SYS_PARAM_ASSIGN_FUNC_RE,codeElement.code_snippet,re.M | re.I)
252 | if sysParamAssignObj:
253 | for assignObj in sysParamAssignObj:
254 | collect_config_variable(assignObj,codeElement,configVariableList)
255 |
256 | for diffElement in diffSet:
257 |
258 | #check whether diff touches config file
259 | configFileObj = re.findall(HBASE_CONFIG_FILE_RE,diffElement.diff_class,re.M | re.I)
260 | if configFileObj:
261 | configFileTouched = True
262 | for fileObj in configFileObj:
263 | touchedFile.append(diffElement.diff_change_mode + fileObj)
264 |
265 | #check whether diff touches config load function
266 | configLoadObj = re.findall(HBASE_CONFIG_LOAD_FUNC_RE,diffElement.diff_snippet,re.M | re.I)
267 | if configLoadObj:
268 | for loadObj in configLoadObj:
269 | loadObjStr = diffElement.diff_change_mode + loadObj.replace(' ','').replace('\n','')
270 | if diffElement.diff_change_mode == '+':
271 | reverseMode = '-'
272 | else:
273 | reverseMode = '+'
274 | reverseFlag = False
275 | for Func in touchedLoadFunc:
276 | if Func == reverseMode + loadObj.replace(' ','').replace('\n',''):
277 | touchedLoadFunc.remove(Func)
278 | reverseFlag = True
279 | break
280 | if reverseFlag == False:
281 | touchedLoadFunc.append(diffElement.diff_change_mode + loadObj.replace(' ','').replace('\n',''))
282 |
283 | sysParamLoadObj = re.findall(HBASE_SYS_PARAM_LOAD_FUNC_RE,diffElement.diff_snippet,re.M | re.I)
284 | if sysParamLoadObj:
285 | for loadObj in sysParamLoadObj:
286 | loadObjStr = diffElement.diff_change_mode + loadObj.replace(' ','').replace('\n','')
287 | if diffElement.diff_change_mode == '+':
288 | reverseMode = '-'
289 | else:
290 | reverseMode = '+'
291 | reverseFlag = False
292 | for Func in touchedLoadFunc:
293 | if Func == reverseMode + loadObj.replace(' ','').replace('\n',''):
294 | touchedLoadFunc.remove(Func)
295 | reverseFlag = True
296 | break
297 | if reverseFlag == False:
298 | touchedLoadFunc.append(diffElement.diff_change_mode + loadObj.replace(' ','').replace('\n',''))
299 |
300 |
301 | #check whether diff touches config set function
302 | configSetObj = re.findall(HBASE_CONFIG_SET_FUNC_RE,diffElement.diff_snippet,re.M | re.I)
303 | if configSetObj:
304 | for setObj in configSetObj:
305 | setObjStr = diffElement.diff_change_mode + setObj.replace(' ','').replace('\n','')
306 | if diffElement.diff_change_mode == '+':
307 | reverseMode = '-'
308 | else:
309 | reverseMode = '+'
310 | reverseFlag = False
311 | for Func in touchedSetFunc:
312 | if Func == reverseMode + setObj.replace(' ','').replace('\n',''):
313 | touchedSetFunc.remove(Func)
314 | reverseFlag = True
315 | break
316 | if reverseFlag == False:
317 | touchedSetFunc.append(diffElement.diff_change_mode + setObj.replace(' ','').replace('\n',''))
318 |
319 | sysParamSetObj = re.findall(HBASE_SYS_PARAM_SET_FUNC_RE,diffElement.diff_snippet,re.M | re.I)
320 | if sysParamSetObj:
321 | for setObj in sysParamSetObj:
322 | setObjStr = diffElement.diff_change_mode + setObj.replace(' ','').replace('\n','')
323 | if diffElement.diff_change_mode == '+':
324 | reverseMode = '-'
325 | else:
326 | reverseMode = '+'
327 | reverseFlag = False
328 | for Func in touchedSetFunc:
329 | if Func == reverseMode + setObj.replace(' ','').replace('\n',''):
330 | touchedSetFunc.remove(Func)
331 | reverseFlag = True
332 | break
333 | if reverseFlag == False:
334 | touchedSetFunc.append(diffElement.diff_change_mode + setObj.replace(' ','').replace('\n',''))
335 |
336 |
337 | #check whether diff touches config related Variable
338 | for Variable in configVariableList:
339 | if Variable.variable_name in diffElement.diff_snippet and Variable.variable_class == diffElement.diff_class:
340 | variableStr = diffElement.diff_change_mode + Variable.variable_name + ' ' + Variable.variable_class
341 | if diffElement.diff_change_mode == '+':
342 | reverseMode = '-'
343 | else:
344 | reverseMode = '+'
345 | reverseFlag = False
346 | for var in touchedVariable:
347 | if var == reverseMode + Variable.variable_name + ' ' + Variable.variable_class:
348 | touchedVariable.remove(var)
349 | reverseFlag = True
350 | break
351 | if reverseFlag == False:
352 | touchedVariable.append(variableStr)
353 |
354 | #check whether diff touches configuration message
355 | messageObj = re.findall(MESSAGE_RE,diffElement.diff_snippet,re.M | re.I)
356 | if messageObj:
357 | for messages in messageObj:
358 | messages = messages.split('"')
359 | for message in messages:
360 | words = message.lower().split(" ")
361 | if len(words) > 3:
362 | if 'option' in words or 'parameter' in words or 'config' in message.lower():
363 | configMessageTouched = True
364 | touchedMessage.append(diffElement.diff_change_mode + message)
365 |
366 |
367 | if touchedLoadFunc != []:
368 | configLoadTouched = True
369 |
370 | if touchedSetFunc != []:
371 | configSetTouched = True
372 |
373 | if touchedVariable != []:
374 | configVariableTouched = True
375 |
376 |
377 | return configFileTouched,configLoadTouched,configSetTouched,configVariableTouched,configMessageTouched,touchedFile,touchedLoadFunc,touchedSetFunc,touchedVariable,touchedMessage
378 |
379 | else:
380 | return False
381 |
382 |
383 |
384 |
385 |
--------------------------------------------------------------------------------
/code/spark/diff_file_parser.py:
--------------------------------------------------------------------------------
1 | import re
2 | import download_diff
3 | from pathlib import Path
4 |
# GitHub commit-page prefix used to (re)download a missing .diff file.
# NOTE: the repository name must be "spark" ("saprk" was a typo that made
# every fallback download point at a nonexistent repo).
BASE_URL = "https://github.com/apache/spark/commit/"

#configDoc name in Spark
SPARK_CONFIG_DOC_RE = 'configuration.md'

#RE for config build in Spark (ConfigBuilder(...)/buildConf(...) calls)
SPARK_CONFIG_BUILDE_RE = r'(?:ConfigBuilder|buildConf)\([^)^;]+\)'

#RE for config load in Spark (conf.get*(...) style calls)
SPARK_CONFIG_LOAD_FUNC_RE = r'[a-zA-Z\.\_\-]*[cC]onf[ig]*[\(\)]*[\s]*[\n]*[\s]*\.[\s]*[\n]*[\s]*get[a-zA-Z]*\([^)^;]+\)'

#RE for config assign in Spark (variable = conf.get*(...))
SPARK_CONFIG_ASSIGN_RE = r'[a-zA-Z\.\_\-]+[\s]*=[\s]*[\n]*[\s]*' + SPARK_CONFIG_LOAD_FUNC_RE

#RE for config set in Spark (conf.set*(...) style calls)
SPARK_CONFIG_SET_FUNC_RE = r'[a-zA-Z\.\_\-]*[cC]onf[ig]*[\(\)]*[\s]*[\n]*[\s]*\.[\s]*[\n]*[\s]*set[a-zA-Z]*\([^)^;]+\)'

#RE for SQL config load in Spark
SQL_CONFIG_LOAD_FUNC_RE = 'SQLConf.get.[a-zA-Z]*'

#RE for SQL config Assign in Spark
SQL_CONFIG_ASSIGN_RE = r'[a-zA-Z\.\_\-]+[\s]*=[\s]*[\n]*[\s]*' + SQL_CONFIG_LOAD_FUNC_RE

#RE for system param load in Spark
SPARK_SYS_PARAM_LOAD_RE = r'System\.get(?:Property|env)\([^)^;]+\)'

#RE for system param assign in Spark
SPARK_SYS_PARAM_ASSIGN_FUNC_RE = r'[a-zA-Z\.\_\-]+[\s]*=[\s]*[\n]*[\s]*' + SPARK_SYS_PARAM_LOAD_RE

#RE for system param set in Spark
SPARK_SYS_PARAM_SET_FUNC_RE = r'System\.set(?:Property|env)\([^)^;]+\)'

#Message in source code (any double-quoted string literal)
MESSAGE_RE = '".+"'
39 |
class DiffElement:
    """One contiguous run of added ('+') or removed ('-') diff lines."""

    def __init__(self):
        # All fields start empty; the diff parser fills them in afterwards.
        self.diff_class = ''        # file ("class") the change belongs to
        self.diff_method = ''       # method signature from the @@ hunk header
        self.diff_snippet = ''      # the changed code text itself
        self.diff_change_mode = ''  # '+' for additions, '-' for deletions
46 |
class CodeElement:
    """All changed code of one file in a diff, with its file name."""

    def __init__(self):
        # Populated by the parser: file ("class") name, then its code text.
        self.code_class = ''
        self.code_snippet = ''
51 |
class ConfigVariable:
    """A variable whose value is assigned from a config/system property."""

    def __init__(self):
        self.variable_name = ''   # name of the assigned variable
        self.variable_class = ''  # file ("class") that contains the assignment
        self.variable_func = ''   # the normalized assignment expression
57 |
def add_diff_snippet(code_snippet,changed_class,change_mode,changed_method):
    """Build and return a DiffElement populated from the given diff fields."""
    element = DiffElement()
    element.diff_class = changed_class
    element.diff_method = changed_method
    element.diff_change_mode = change_mode
    element.diff_snippet = code_snippet
    return element
65 |
def collect_config_variable(assign_obj,code_element,config_variable_list):
    """Collect variables assigned from Spark configuration / system properties.

    Normalizes the matched assignment expression, extracts the variable
    name (everything left of the first '='), and — if the (name, class)
    pair is new, the name is at least 3 chars, and the class is known —
    appends a ConfigVariable to *config_variable_list* and logs it to
    ``config_variable.txt``.
    """
    # Normalize: drop whitespace/newlines and a leading "this." qualifier.
    assign_obj = assign_obj.replace('\n','').replace(' ','').replace('this.','')

    m_variable = ConfigVariable()
    m_variable.variable_class = code_element.code_class
    m_variable.variable_func = assign_obj
    # The variable name is the left-hand side of the assignment.
    m_variable.variable_name = assign_obj.split('=')[0]

    # Skip duplicates, very short (likely spurious) names, and unknown classes.
    duplicate = any(
        m_variable.variable_name == variable.variable_name
        and m_variable.variable_class == variable.variable_class
        for variable in config_variable_list
    )
    if not duplicate and len(m_variable.variable_name) >= 3 and m_variable.variable_class != 'null':
        config_variable_list.append(m_variable)
        # Context manager guarantees the log file is closed even on error.
        with open('config_variable.txt','a') as file:
            file.write(m_variable.variable_name + '##' + m_variable.variable_class + '##' + m_variable.variable_func + '\n')
89 |
def diff_file_parser(url):
    """parse the diff_file, return the whole code and changed code (codeSet, diffSet)

    Makes two passes over the local ``.diff`` file at *url*:

    * pass 1 builds ``code_set`` -- one CodeElement per changed file,
      holding every +/- line (sign stripped) plus the file's base name;
    * pass 2 builds ``diff_set`` -- one DiffElement per contiguous run of
      '+' or '-' lines, tagged with file name, hunk method and change mode.

    If the file is missing it is first downloaded via ``download_diff``;
    on any other open error the function prints it and returns None.
    """
    try:
        diff_file = open(url,'r')
    except (Exception) as e:
        # print (e)
        if Path(url).is_file() == False:
            # Diff not downloaded yet: fetch it from GitHub, then retry.
            commit_sha = url.replace('.diff','').split('/')
            download_diff.download(BASE_URL + commit_sha[-1])
            diff_file = open(url,'r')
        else:
            print (e)
            return

    #get code snippets, correlated class
    code_set = []
    code_snippet = ''
    code_class = ''
    for line in diff_file:
        if line:
            line = line.strip('\n')
            if len(line) > 1:
                if '+++' in line or '---' in line:
                    # File-header line: flush the snippet gathered so far.
                    if code_snippet:
                        code_element = CodeElement()
                        code_element.code_snippet = code_snippet
                        code_element.code_class = code_class
                        code_set.append(code_element)
                        code_snippet = ''
                    if '/dev/null' not in line:
                        # Use the path's last component as the "class" name.
                        line = line.split('/')
                        code_class = line[-1]
                else:
                    # Strip the leading diff sign, keep the code text.
                    if line[0] == '+':
                        line = line.replace('+','',1)
                    if line[0] == '-':
                        line = line.replace('-','',1)
                    code_snippet = code_snippet + line
    # Flush the snippet belonging to the last file in the diff.
    if code_snippet:
        code_element = CodeElement()
        code_element.code_snippet = code_snippet
        code_element.code_class = code_class
        code_set.append(code_element)
        code_snippet = ''

    diff_file.close()

    #get diff snippets, correlated changed class and method
    try:
        diff_file2 = open(url,'r')
    except (Exception) as e:
        print (e)
        return

    diff_set = []
    changed_class = ''
    changed_method = ''
    add_snippet = ''   # current run of '+' lines
    add_flag = 0       # 1 while inside a run of '+' lines
    minus_snippet = '' # current run of '-' lines
    minus_flag = 0     # 1 while inside a run of '-' lines
    for line in diff_file2:
        if line:
            line = line.strip('\n')
            if '@@' in line:
                # Hunk header: text after the second '@@' is the enclosing
                # method signature (when git provides one).
                line = line.split('@@')
                if len(line) >= 3:
                    changed_method = line[2]
            elif '+++' in line or '---' in line:
                if '/dev/null' not in line:
                    if 'test' in line:
                        # Changes under test paths are tagged so they can be
                        # skipped when diff elements are emitted below.
                        changed_class = 'test'
                    else:
                        line = line.split('/')
                        changed_class = line[-1]
            else:
                if line[0] == '+':
                    line = line.replace('+','',1)
                    if add_flag == 0:
                        add_snippet = ''
                    if 'import' not in line:
                        add_snippet = add_snippet + line + '\n'
                    add_flag = 1
                elif line[0] == '-':
                    line = line.replace('-','',1)
                    if minus_flag == 0:
                        minus_snippet = ''
                    if 'import' not in line:
                        minus_snippet = minus_snippet + line + '\n'
                    minus_flag = 1
                else:
                    # A context line ends any pending +/- run: emit the run
                    # (unless it came from a test file).
                    if add_flag == 1:
                        if add_snippet:
                            if changed_class != 'test':
                                add_element = add_diff_snippet(add_snippet,changed_class,'+',changed_method)
                                diff_set.append(add_element)
                        add_flag = 0
                    if minus_flag == 1:
                        if minus_snippet:
                            if changed_class != 'test':
                                minus_element = add_diff_snippet(minus_snippet,changed_class,'-',changed_method)
                                diff_set.append(minus_element)
                        minus_flag = 0
    #if file end with diffline
    if add_flag == 1:
        if add_snippet:
            if changed_class != 'test':
                add_element = add_diff_snippet(add_snippet,changed_class,'+',changed_method)
                diff_set.append(add_element)

    if minus_flag == 1:
        if minus_snippet:
            if changed_class != 'test':
                minus_element = add_diff_snippet(minus_snippet,changed_class,'-',changed_method)
                diff_set.append(minus_element)

    diff_file2.close()

    return code_set,diff_set
209 |
def _cancel_or_record(bucket, change_mode, entry_text):
    """Record ``change_mode + entry_text`` in *bucket*, cancelling reverts.

    If the opposite-mode entry ('-' for '+', '+' otherwise) already exists,
    the +/- pair represents code that merely moved, so the old entry is
    removed and nothing new is added; otherwise the prefixed entry is
    appended.
    """
    reverse_mode = '-' if change_mode == '+' else '+'
    try:
        # list.remove drops the first match, same as the scan-and-break idiom.
        bucket.remove(reverse_mode + entry_text)
    except ValueError:
        bucket.append(change_mode + entry_text)

def diff_selection(url,config_variable_list):
    """Classify which configuration-related parts a commit diff touches.

    Parses the diff at *url* and checks every changed hunk against the
    Spark config doc name, config build/load/set call patterns, variables
    previously collected into *config_variable_list*, and config-related
    message strings.

    Returns the tuple
    (configDocTouched, configBuildTouched, configLoadTouched,
     configSetTouched, configVariableTouched, configMessageTouched,
     touchedBuildFunc, touchedLoadFunc, touchedSetFunc,
     touchedVariable, touchedMessage)
    or False when the diff could not be parsed.
    """
    diff = diff_file_parser(url)

    if diff:
        codeSet = diff[0]
        diffSet = diff[1]
    else:
        codeSet = 0
        diffSet = 0

    #Whether the diff touches: config doc / build / load / set calls,
    #config-derived variables, config-related messages
    configDocTouched = False
    configBuildTouched = False
    configLoadTouched = False
    configSetTouched = False
    configVariableTouched = False
    configMessageTouched = False

    #the sets of touched build/load/set functions, variables and messages
    touchedBuildFunc = []
    touchedLoadFunc = []
    touchedSetFunc = []
    touchedVariable = []
    touchedMessage = []

    if codeSet and diffSet:

        #collect configuration variables from the full code snippets (not
        #the diff snippets): spark conf, system properties, SQL conf
        for codeElement in codeSet:
            for assign_re in (SPARK_CONFIG_ASSIGN_RE,
                              SPARK_SYS_PARAM_ASSIGN_FUNC_RE,
                              SQL_CONFIG_ASSIGN_RE):
                for assignObj in re.findall(assign_re,codeElement.code_snippet,re.M | re.I):
                    collect_config_variable(assignObj,codeElement,config_variable_list)

        #identify whether the diffs touch configuration related parts
        for diffElement in diffSet:
            mode = diffElement.diff_change_mode
            snippet = diffElement.diff_snippet

            #check whether diff touches config doc
            if SPARK_CONFIG_DOC_RE == diffElement.diff_class:
                configDocTouched = True

            #check whether diff touches config build function
            for buildObj in re.findall(SPARK_CONFIG_BUILDE_RE,snippet,re.M | re.I):
                configBuildTouched = True
                touchedBuildFunc.append(mode + buildObj)

            #config load calls: spark conf getters, SQLConf getters,
            #System.getProperty/getenv
            for load_re in (SPARK_CONFIG_LOAD_FUNC_RE,
                            SQL_CONFIG_LOAD_FUNC_RE,
                            SPARK_SYS_PARAM_LOAD_RE):
                for loadObj in re.findall(load_re,snippet,re.M | re.I):
                    _cancel_or_record(touchedLoadFunc, mode,
                                      loadObj.replace(' ','').replace('\n',''))

            #config set calls: spark conf setters and System.setProperty/env
            for set_re in (SPARK_CONFIG_SET_FUNC_RE,
                           SPARK_SYS_PARAM_SET_FUNC_RE):
                for setObj in re.findall(set_re,snippet,re.M | re.I):
                    _cancel_or_record(touchedSetFunc, mode,
                                      setObj.replace(' ','').replace('\n',''))

            #check whether diff touches config related Variable
            for Variable in config_variable_list:
                if Variable.variable_name in snippet and Variable.variable_class == diffElement.diff_class:
                    _cancel_or_record(touchedVariable, mode,
                                      Variable.variable_name + ' ' + Variable.variable_class)

            #check whether diff touches configuration message
            for messages in re.findall(MESSAGE_RE,snippet,re.M | re.I):
                for message in messages.split('"'):
                    words = message.lower().split(" ")
                    if len(words) > 3:
                        if 'option' in words or 'parameter' in words or 'config' in message.lower():
                            #message entries keep a space between sign and text
                            _cancel_or_record(touchedMessage, mode, ' ' + message)

        #a category counts as touched when any un-cancelled entry survived
        configLoadTouched = touchedLoadFunc != []
        configSetTouched = touchedSetFunc != []
        configVariableTouched = touchedVariable != []
        configMessageTouched = touchedMessage != []

        return configDocTouched,configBuildTouched,configLoadTouched,configSetTouched,configVariableTouched,configMessageTouched,touchedBuildFunc,touchedLoadFunc,touchedSetFunc,touchedVariable,touchedMessage

    else:
        return False
437 |
438 |
439 |
440 |
441 |
--------------------------------------------------------------------------------