├── code
│   ├── hdfs
│   │   ├── commit_url.txt
│   │   ├── commit_selection.py
│   │   ├── get_commit.py
│   │   ├── download_diff.py
│   │   ├── extract_commit.py
│   │   └── diff_file_parser.py
│   ├── hbase
│   │   ├── commit_url.txt
│   │   ├── commit_selection.py
│   │   ├── get_commit.py
│   │   ├── download_diff.py
│   │   ├── extract_commit.py
│   │   └── diff_file_parser.py
│   ├── spark
│   │   ├── commit_url.txt
│   │   ├── commit_selection.py
│   │   ├── get_commit.py
│   │   ├── download_diff.py
│   │   ├── extract_commit.py
│   │   └── diff_file_parser.py
│   ├── cassandra
│   │   ├── commit_url.txt
│   │   ├── commit_selection.py
│   │   ├── get_commit.py
│   │   ├── download_diff.py
│   │   └── extract_commit.py
│   └── hdfs_demo_examples
│       ├── commit_selection.py
│       ├── hdfs_example_commits.txt
│       ├── download_diff.py
│       ├── extract_commit.py
│       └── diff_file_parser.py
├── commit_analysis
│   ├── README.md
│   ├── count_num.py
│   ├── config_parsing.csv
│   ├── rmv_replace.csv
│   ├── rmv_with_code.csv
│   ├── param_rename.csv
│   └── change_param_constraint.csv
├── README.md
└── config_commits
    └── cassandra.csv

/code/hdfs/commit_url.txt:
--------------------------------------------------------------------------------
https://github.com/apache/hadoop/commits/trunk?after=2b4febcf576e2da29ab86e2920302b82b47e435d+34&branch=trunk
--------------------------------------------------------------------------------
/code/hbase/commit_url.txt:
--------------------------------------------------------------------------------
https://github.com/apache/hbase/commits/master?after=85842634e518155db3c964bf15555291d5fbdd45+34&branch=master
--------------------------------------------------------------------------------
/code/spark/commit_url.txt:
--------------------------------------------------------------------------------
https://github.com/apache/spark/commits/master?after=8d09f9649510bf5d812c82b04f7711b9252a7db0+69&branch=master
--------------------------------------------------------------------------------
/code/cassandra/commit_url.txt:
--------------------------------------------------------------------------------
https://github.com/apache/cassandra/commits/trunk?after=401e933b7395892bf0356f88308f64b94be84601+34&branch=trunk
--------------------------------------------------------------------------------
/code/cassandra/commit_selection.py:
--------------------------------------------------------------------------------
#!/usr/bin/python

import extract_commit


class ConfigParam:
    def __init__(self):
        self.param_name = ''   # parameter name
        self.param_class = ''  # class that this parameter belongs to
        self.param_func = ''   # function that assigns this parameter


def main():
    config_variable_list = []
    searched_commit_num = 0

    # Each line of commit_info.txt is one record appended by download_diff.py.
    with open('commit_info.txt', 'r') as commit_info_file:
        for commit_info in commit_info_file:
            commit_info = commit_info.strip('\n')
            extract_commit.extract(commit_info, config_variable_list)
            searched_commit_num += 1
            print(searched_commit_num)


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/code/hdfs/commit_selection.py:
--------------------------------------------------------------------------------
Identical to /code/cassandra/commit_selection.py above (apart from blank lines).
--------------------------------------------------------------------------------
/code/hbase/commit_selection.py:
--------------------------------------------------------------------------------
Identical to /code/cassandra/commit_selection.py above (apart from blank lines).
--------------------------------------------------------------------------------
/code/spark/commit_selection.py:
--------------------------------------------------------------------------------
Identical to /code/cassandra/commit_selection.py above (apart from blank lines).
--------------------------------------------------------------------------------
/code/hdfs_demo_examples/commit_selection.py:
--------------------------------------------------------------------------------
Identical to /code/cassandra/commit_selection.py above, except that it reads the
bundled example list: it opens 'hdfs_example_commits.txt' instead of
'commit_info.txt'.
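The five selection drivers above, and the extract_commit.py variants further
down, share one data contract: each line of the commit list is a three-field
record delimited by $$$, as written by download_diff.py. A minimal sketch of
that format (the URL and fields below are placeholders, not a real commit):

    # <commit url>$$$<title and description, newlines stripped>$$$<ISO-8601 timestamp>
    record = ('https://github.com/apache/hadoop/commit/0000000'
              '$$$HDFS-0000. Placeholder title.$$$2018-01-01T00:00:00Z')

    commit_url, commit_text, commit_time = record.split('$$$')
    commit_sha = commit_url.split('/')[-1]  # names the on-disk <sha>.diff file

One caveat: a literal '$$$' inside a commit message would break the three-way
split; none of the scripts guard against that.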
--------------------------------------------------------------------------------
/code/hdfs_demo_examples/hdfs_example_commits.txt:
--------------------------------------------------------------------------------
https://github.com/apache/hadoop/commit/c81ac2ff0220b180cd6cbbf18221290c3783bfd5$$$HDFS-13607. [SBN read] Edit Tail Fast Path Part 1: Enhance JournalNode with an in-memory cache of recent edit transactions. Contributed by Erik Krogen.$$$2018-05-09T22:40:07Z
https://github.com/apache/hadoop/commit/bfd3f8bd8a9ae2186ec3e4addc71f912ec7b8923$$$HDFS-12291: [SPS]: Provide a mechanism to recursively iterate and satisfy storage policy of all the files under the given dir. Contributed by Surendra Singh Lilhore.$$$2017-09-30T13:31:52Z
https://github.com/apache/hadoop/commit/123342cd0759ff88801d4f5ab10987f6e3f344b0$$$HDFS-12412. Change ErasureCodingWorker.stripedReadPool to cached thread pool. (Lei (Eddy) Xu)$$$2017-09-13T01:12:07Z
https://github.com/apache/hadoop/commit/9ae9467f920e95ca989d7d51775b39e1b9fee300$$$HDFS-11998. Enable DFSNetworkTopology as default. Contributed by Chen Liang.$$$2017-06-22T05:01:37Z
https://github.com/apache/hadoop/commit/3108d27edde941d153a58f71fb1096cce2995531$$$HDFS-12716. 'dfs.datanode.failed.volumes.tolerated' to support minimum number of volumes to be available. Contributed by Ranith Sardar and usharani$$$2018-07-30T10:20:04Z
https://github.com/apache/hadoop/commit/42307e3c3abbfe0b83d9a2581deba327435b910f$$$HDFS-11576. Block recovery will fail indefinitely if recovery time > heartbeat interval. Contributed by Lukas Majercak$$$2017-12-02T06:34:30Z
https://github.com/apache/hadoop/commit/035c6ee587e444550af6420676e4cee049e09869$$$HDFS-12603. Enable async edit logging by default. Contributed by Andrew Wang.$$$2017-10-16T16:43:39Z
--------------------------------------------------------------------------------
/commit_analysis/README.md:
--------------------------------------------------------------------------------
# Data Layout

There are 7 data sheets corresponding to the sections in the submitted paper.

## Data Sheets

* Section IV (CONFIGURATION INTERFACE EVOLUTION)

  * IV.A.1) (Parameterization) → `parameterization.csv`

  * IV.A.2) (Removing Parameters) → `param_removal.csv`

  * IV.B (Evolution of Default Values) → `change_default_value.csv`

* Section V (CONFIGURATION USAGE EVOLUTION)

  * V.A (Evolution of Parameter Checking Code) → `checking_and_handling_code.csv`

  * V.B (Evolution of Error-handling Code) → `checking_and_handling_code.csv`

  * V.C (Evolution of Using Parameter Values) → `change_param_existing_usage.csv` and `param_new_use.csv`

* Section VI (CONFIGURATION DOCUMENT EVOLUTION) → `change_documentation.csv`

## Metadata Tags

The first row of each sheet describes the metadata. Besides the common metadata tags such as `#Parameter`, `#Issue ID`, `#Title`, `#Issue URL`, `#Commit URL`, and `#Note`, there are also tags specific to each spreadsheet.

**Note that some tags are only available for a subset of commits/parameters. We list them here to avoid any confusion.**

* `change_default_value.csv`: "#How to choose new value" is for 32 numeric parameters. Please refer to "Choosing new values" in Section IV.B.

* `change_documentation.csv`: "#Info added" is for 63 changes that enhance inadequate documentation. Please refer to "Content added to enhance documentation" in Section VI.

* `checking_and_handling_code.csv`: "#Checking content" is for configuration check changes (please refer to Section V.A); "#Changed message" is for misconfiguration feedback messages (please refer to Section V.B).
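For programmatic processing of these sheets, the #-prefixed tags in the first
row can serve directly as column names. A minimal sketch, assuming the sheets
are plain comma-separated files whose header row holds the tags listed above
(the tag and file names come from this README; unlike the bare line.split(',')
used by count_num.py further down, the csv module also copes with quoted fields
that contain commas):

    import csv

    with open('change_default_value.csv', newline='') as f:
        reader = csv.DictReader(f)  # the first row supplies the '#...' tag names
        for row in reader:
            # '#How to choose new value' is populated only for the 32 numeric parameters
            print(row['#Parameter'], row.get('#How to choose new value', ''))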
--------------------------------------------------------------------------------
/code/hbase/get_commit.py:
--------------------------------------------------------------------------------
#!/usr/bin/python

import urllib
from urllib.request import urlopen
from bs4 import BeautifulSoup
import http.client
import time
import random
import threading

import download_diff


def get_commits(url, searched_commit_num):
    # Rotate through a few User-Agent strings so the crawler looks less like a bot.
    headers = [{"User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"},
               {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
               {"User-Agent": "Mozilla/5.0 (Linux; X11)"},
               {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.5)"},
               {'User-Agent': 'node.js-v0.8.8'}]
    response = None
    while response is None:
        try:
            c = random.randint(0, 4)
            req = urllib.request.Request(url=url, headers=headers[c])
            response = urlopen(req)
        except Exception as e:
            # On rate limiting or network errors, back off for a minute and retry
            # (the original fell through here and crashed on an unbound 'response').
            print(e)
            time.sleep(60)
    soup = BeautifulSoup(response.read(), features="html.parser")
    a_list = soup.find_all('a')
    commit_list = []
    older_href = ''
    for a in a_list:
        # Commit links on GitHub's commit-list page carry data-pjax="true".
        if a.get('data-pjax') == 'true':
            href = a.get('href')
            if href not in commit_list:
                commit_list.append(href)
        if a.text == 'Older':
            older_href = a.get('href')  # pagination link to the next (older) page

    for commit in commit_list:
        searched_commit_num = searched_commit_num + 1
        try:
            # Throttle to at most 25 concurrent download threads.
            while threading.active_count() > 25:
                time.sleep(0.1)
            t = threading.Thread(target=download_diff.extract, args=(commit,))
            t.start()
        except Exception as e:
            print("threading error")
            print(e)
        time.sleep(random.uniform(1, 2))

    log = open('download_log.txt', 'a')
    log.write(older_href + '\n')
    log.write("Already downloaded " + str(searched_commit_num) + " commits." + '\n')
    print("Already downloaded " + str(searched_commit_num) + " commits.")
    log.close()
    # Persist the pagination URL so an interrupted crawl can resume where it stopped.
    out = open('commit_url.txt', 'w')
    out.write(older_href)
    out.close()

    if older_href:  # recurse until there is no 'Older' page left
        get_commits(older_href, searched_commit_num)


def main():
    # Change this file (commit_url.txt) to crawl another project's commits.
    fin = open('commit_url.txt', 'r')
    url = fin.readline()
    fin.close()

    searched_commit_num = 0

    try:
        get_commits(url, searched_commit_num)
    except Exception as e:
        print(e)


if __name__ == '__main__':
    # Force HTTP/1.0 to sidestep keep-alive/chunking issues seen with HTTP/1.1.
    http.client.HTTPConnection._http_vsn = 10
    http.client.HTTPConnection._http_vsn_str = 'HTTP/1.0'
    main()
--------------------------------------------------------------------------------
/code/hdfs/get_commit.py:
--------------------------------------------------------------------------------
Identical to /code/hbase/get_commit.py above.
--------------------------------------------------------------------------------
/code/spark/get_commit.py:
--------------------------------------------------------------------------------
Identical to /code/hbase/get_commit.py above.
--------------------------------------------------------------------------------
/code/cassandra/get_commit.py:
--------------------------------------------------------------------------------
Identical to /code/hbase/get_commit.py above (apart from blank lines).
--------------------------------------------------------------------------------
/code/hdfs_demo_examples/download_diff.py:
--------------------------------------------------------------------------------
#!/usr/bin/python

import urllib
from urllib.request import urlopen
from bs4 import BeautifulSoup
import random
from pathlib import Path

BASE_URL = "https://github.com"
DIFF_FILE_PATH = "."
headers = [{"User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"},
           {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
           {"User-Agent": "Mozilla/5.0 (Linux; X11)"},
           {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.5)"},
           {'User-Agent': 'node.js-v0.8.8'}]


# Download the diff file of the commit; GitHub serves the raw diff when
# '.diff' is appended to a commit URL.
def download(url):
    commit_sha = url.split('/')[-1]
    diff_response = urlopen(url + '.diff')
    diff = diff_response.read().decode('UTF-8')
    diff_file = open(DIFF_FILE_PATH + '/' + commit_sha + '.diff', 'w')
    diff_file.write(diff)
    diff_file.close()


# Get all the information of the commit.
def extract(url):
    url = BASE_URL + url

    try:
        c = random.randint(0, 4)
        req = urllib.request.Request(url=url, headers=headers[c])
        commit_response = urlopen(req)
    except Exception:
        return

    soup = BeautifulSoup(commit_response.read(), features="html.parser")

    commit_title = ''
    for p in soup.find_all('p'):
        name = p.get('class')
        if name and name[0] == 'commit-title':
            commit_title = p.text

    commit_description = ''
    for div in soup.find_all('div'):
        name = div.get('class')
        if name and name[0] == 'commit-desc':
            commit_description = div.text

    commit_time_tag = soup.find('relative-time')
    if commit_time_tag:
        commit_time = commit_time_tag.get('datetime')
        print(url)
        print(commit_time)
    else:
        commit_time = 'commit_time_tag not exist'

    commit_sha = url.split('/')[-1]
    commit_file = Path(DIFF_FILE_PATH + '/' + commit_sha + '.diff')
    if commit_file.is_file():
        print("The diff file of " + commit_sha + " is already downloaded")
    else:
        # Append one '$$$'-delimited record per commit: url, title + description
        # (newlines stripped), and timestamp.
        commit_info_file = open('commit_info.txt', 'a')
        commit_info_file.write(url + '$$$' + commit_title.replace('\n', '').strip()
                               + commit_description.replace('\n', '').strip()
                               + '$$$' + commit_time + '\n')
        commit_info_file.close()
        try:
            download(url)
        except Exception as e:
            print(e)
            return
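A structural note on the four get_commit.py copies above: each page of results
triggers a recursive call on the 'Older' link, so a long history keeps
deepening the Python call stack (CPython's default recursion limit is about
1000 frames). Below is a sketch of the same pagination written as a loop;
fetch_and_parse is a hypothetical stand-in for the request-and-BeautifulSoup
code in get_commit.py, not a function that exists in this repo:

    import download_diff

    def fetch_and_parse(url):
        # Placeholder: the real body would fetch the page and scrape commit
        # links plus the 'Older' href, exactly as get_commits() does above.
        return [], ''

    def crawl(start_url):
        downloaded = 0
        url = start_url
        while url:  # an empty 'Older' link means the oldest page was reached
            commits, url = fetch_and_parse(url)  # advance instead of recursing
            for commit in commits:
                download_diff.extract(commit)
                downloaded += 1
        return downloaded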
--------------------------------------------------------------------------------
/code/hbase/download_diff.py:
--------------------------------------------------------------------------------
Identical to /code/hdfs_demo_examples/download_diff.py above, except:

    DIFF_FILE_PATH = "/Users/zhangbuzhang/Desktop/diffs/HBase"
--------------------------------------------------------------------------------
/code/hdfs/download_diff.py:
--------------------------------------------------------------------------------
Identical to /code/hdfs_demo_examples/download_diff.py above, except:

    DIFF_FILE_PATH = "/Users/zhangbuzhang/Desktop/diffs/HDFS"
--------------------------------------------------------------------------------
/code/spark/download_diff.py:
--------------------------------------------------------------------------------
Identical to /code/hdfs_demo_examples/download_diff.py above, except:

    DIFF_FILE_PATH = "/Users/zhangbuzhang/Desktop/diffs/HBase"

(Note: the path says HBase rather than Spark, which looks like a copy-paste
slip; Spark diffs land in the HBase directory unless the path is edited.)
--------------------------------------------------------------------------------
/code/cassandra/download_diff.py:
--------------------------------------------------------------------------------
Identical to /code/hdfs_demo_examples/download_diff.py above, except:

    DIFF_FILE_PATH = "/Users/zhangbuzhang/Desktop/diffs/Cassandra"
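Every per-system copy above writes into a hard-coded, user-specific
DIFF_FILE_PATH and assumes that directory already exists. If it does not,
open(..., 'w') in download() raises FileNotFoundError, which extract() merely
prints, so the diff is silently skipped even though its commit_info.txt record
was already written. A one-line guard worth running before a crawl (a
suggested addition, not part of the original scripts):

    import os
    from download_diff import DIFF_FILE_PATH

    os.makedirs(DIFF_FILE_PATH, exist_ok=True)  # create the diff directory if missing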
--------------------------------------------------------------------------------
/code/cassandra/extract_commit.py:
--------------------------------------------------------------------------------
#!/usr/bin/python

import diff_file_parser
from download_diff import DIFF_FILE_PATH
from nltk.stem.porter import PorterStemmer


def extract(commit_info, configVariableList):

    commit_info = commit_info.split('$$$')
    commit_url = commit_info[0]
    commit_title = commit_info[1]
    commit_time = commit_info[2]
    commit_sha = commit_url.split('/')[-1]

    desc_contain_keyword = False  # whether the commit description contains a configuration keyword
    is_merge_commit = False
    diff_contain_config = False   # whether the diff touches configuration

    st = PorterStemmer()
    title_words = commit_title.split(' ')
    for word in title_words:
        if word.lower() == 'option' or word.lower() == 'parameter':
            commit_title = commit_title.replace(word, "**" + word.upper() + "**")
            desc_contain_keyword = True
        word_stemmed = st.stem(word).lower()
        # The Porter stem of configure/configured/configuration is 'configur'.
        if word_stemmed in {'config', 'configur'}:
            commit_title = commit_title.replace(word_stemmed, "**" + word_stemmed.upper() + "**")
            desc_contain_keyword = True

    # Skip merge commits and pure style/lint commits.
    commit_titleword = commit_title.lower().split(' ')
    for word in {'merge', 'merging', 'checkstyle', 'findbugs'}:
        if word in commit_titleword:
            is_merge_commit = True
            break

    code_result = []
    diff_file_path = DIFF_FILE_PATH + '/' + commit_sha + '.diff'
    if is_merge_commit == False:
        code_result = diff_file_parser.diff_selection(diff_file_path, configVariableList)

    config_file_touched = False      # whether the diff touches a configuration file
    config_load_touched = False      # whether the diff touches a configuration load function
    config_set_touched = False       # whether the diff touches a configuration set function
    config_variable_touched = False  # whether the diff touches a configuration variable
    config_message_touched = False   # whether the diff touches a configuration message

    if code_result:
        config_file_touched = code_result[0]
        config_load_touched = code_result[1]
        config_set_touched = code_result[2]
        config_variable_touched = code_result[3]
        config_message_touched = code_result[4]
        touched_config_file = code_result[5]       # the set of touched configuration files
        touched_config_load_func = code_result[6]  # the set of touched configuration load functions
        touched_config_set_func = code_result[7]   # the set of touched configuration set functions
        touched_variable = code_result[8]          # the set of touched configuration variables
        touched_message = code_result[9]           # the set of touched message keywords

    if True in (config_file_touched, config_load_touched, config_set_touched,
                config_variable_touched, config_message_touched):
        diff_contain_config = True

    if (is_merge_commit == False) and (desc_contain_keyword == True or diff_contain_config == True):
        print("Found a candidate commit!")
        file = open('commit_selected.txt', 'a')
        file.write("###############################################################################" + '\n')
        file.write(commit_title + '\n' + commit_url + '\n' + commit_time + '\n')
        file.write('Commit message touches config:' + str(desc_contain_keyword) + '\n')
        file.write('Diff touches config define:' + str(config_file_touched) + '\n')
        file.write('Diff touches config loading:' + str(config_load_touched) + '\n')
        file.write('Diff touches config setting:' + str(config_set_touched) + '\n')
        file.write('Diff touches config variable (data flow):' + str(config_variable_touched) + '\n')
        file.write('Diff touches config message:' + str(config_message_touched) + '\n')

        file.write('\n_________________touchedConfigDefine_____________________\n\n')
        if config_file_touched:
            for config_file in touched_config_file:
                file.write(config_file)
                file.write('\n')
        else:
            file.write('Null\n')

        file.write('\n___________________touchedConfigLoad___________________\n\n')
        if config_load_touched:
            for config_load_func in touched_config_load_func:
                file.write(config_load_func + '\n')
        else:
            file.write('Null\n')

        file.write('\n___________________touchedConfigSet______________________\n\n')
        if config_set_touched:
            for config_set_func in touched_config_set_func:
                file.write(config_set_func + '\n')
        else:
            file.write('Null\n')

        file.write('\n___________________touchedConfigVariable_____________________\n\n')
        if config_variable_touched:
            for param in touched_variable:
                file.write(param + '\n')
        else:
            file.write('Null\n')

        file.write('\n____________________touchedMessage________________________\n\n')
        if config_message_touched:
            for keyword in touched_message:
                file.write('"' + keyword + '"' + '\n')
        else:
            file.write('Null\n')

        file.write('\n')
        file.close()
--------------------------------------------------------------------------------
/code/hdfs/extract_commit.py:
--------------------------------------------------------------------------------
#!/usr/bin/python

import diff_file_parser
from download_diff import DIFF_FILE_PATH
from nltk.stem.porter import PorterStemmer


def extract(commit_info, configParamList):

    commit_info = commit_info.split('$$$')
    commit_url = commit_info[0]
    commit_title = commit_info[1]
    commit_time = commit_info[2]
    commit_sha = commit_url.split('/')[-1]

    desc_contain_keyword = False  # whether the commit description contains a configuration keyword
    is_merge_commit = False
    diff_contain_config = False   # whether the diff touches configuration

    st = PorterStemmer()
    title_words = commit_title.split(' ')
    for word in title_words:
        if word.lower() == 'option' or word.lower() == 'parameter':
            commit_title = commit_title.replace(word, "**" + word.upper() + "**")
            desc_contain_keyword = True
        word_stemmed = st.stem(word).lower()
        # The Porter stem of configure/configured/configuration is 'configur'.
        if word_stemmed in {'config', 'configur'}:
            commit_title = commit_title.replace(word_stemmed, "**" + word_stemmed.upper() + "**")
            desc_contain_keyword = True

    # Skip merge commits and pure style/lint commits.
    commit_titleword = commit_title.lower().split(' ')
    for word in {'merge', 'merging', 'checkstyle', 'findbugs'}:
        if word in commit_titleword:
            is_merge_commit = True
            break

    codeResult = []
    diff_file_path = DIFF_FILE_PATH + '/' + commit_sha + '.diff'
    if is_merge_commit == False:
        codeResult = diff_file_parser.diffSelection(diff_file_path, configParamList)

    configFileTouched = False     # whether the diff touches a configuration file
    configLoadTouched = False     # whether the diff touches a configuration load function
    configSetTouched = False      # whether the diff touches a configuration set function
    configParamTouched = False    # whether the diff touches a configuration parameter
    configMessageTouched = False  # whether the diff touches a configuration message

    if codeResult:
        configFileTouched = codeResult[0]
        configLoadTouched = codeResult[1]
        configSetTouched = codeResult[2]
        configParamTouched = codeResult[3]
        configMessageTouched = codeResult[4]
        touchedFile = codeResult[5]      # the set of touched files
        touchedLoadFunc = codeResult[6]  # the set of touched configuration load functions
        touchedSetFunc = codeResult[7]   # the set of touched configuration set functions
        touchedParam = codeResult[8]     # the set of touched configuration parameters
        touchedMessage = codeResult[9]   # the set of touched messages

    if True in (configFileTouched, configLoadTouched, configSetTouched,
                configParamTouched, configMessageTouched):
        diff_contain_config = True

    if (is_merge_commit == False) and (desc_contain_keyword == True or diff_contain_config == True):
        print("Found a candidate commit!")
        file = open('commit_selected.txt', 'a')
        file.write("###############################################################################" + '\n')
        file.write(commit_title + '\n' + commit_url + '\n' + commit_time + '\n')
        file.write('Commit message touches config:' + str(desc_contain_keyword) + '\n')
        file.write('Diff touches config define:' + str(configFileTouched) + '\n')
        file.write('Diff touches config loading:' + str(configLoadTouched) + '\n')
        file.write('Diff touches config setting:' + str(configSetTouched) + '\n')
        file.write('Diff touches config variable (data flow):' + str(configParamTouched) + '\n')
        file.write('Diff touches config message:' + str(configMessageTouched) + '\n')

        file.write('\n_________________touchedConfigDefine_____________________\n\n')
        if configFileTouched:
            for fileName in touchedFile:
                file.write(fileName + ' ')
            file.write('\n')
        else:
            file.write('Null\n')

        file.write('\n___________________touchedConfigLoad___________________\n\n')
        if configLoadTouched:
            for loadFunc in touchedLoadFunc:
                file.write(loadFunc + '\n')
        else:
            file.write('Null\n')

        file.write('\n___________________touchedConfigSet____________________\n\n')
        if configSetTouched:
            for setFunc in touchedSetFunc:
                file.write(setFunc + '\n')
        else:
            file.write('Null\n')

        file.write('\n___________________touchedConfigVariable_____________________\n\n')
        if configParamTouched:
            for param in touchedParam:
                file.write(param + '\n')
        else:
            file.write('Null\n')

        file.write('\n___________________touchedMessage_____________________\n\n')
        if configMessageTouched:
            for keyword in touchedMessage:
                file.write('"' + keyword + '"' + '\n')
        else:
            file.write('Null\n')

        file.write('\n')
        file.close()
--------------------------------------------------------------------------------
/code/hbase/extract_commit.py:
--------------------------------------------------------------------------------
Identical to /code/hdfs/extract_commit.py above, except that the merge/style
filter flag is named irrelevant_commit instead of is_merge_commit.
--------------------------------------------------------------------------------
/code/hdfs_demo_examples/extract_commit.py:
--------------------------------------------------------------------------------
Identical to /code/hdfs/extract_commit.py above.
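The keyword filter in every extract_commit.py variant above compares Porter
stems against the set {'config', 'configur'}. A quick check of what NLTK's
stemmer actually produces (assuming nltk is installed, as the imports above
require) shows why the truncated form 'configur', rather than the whole word
'configure', is the member a stem comparison has to use:

    from nltk.stem.porter import PorterStemmer

    st = PorterStemmer()
    for w in ['config', 'configure', 'configured', 'configuration', 'configurable']:
        print(w, '->', st.stem(w).lower())
    # config        -> config
    # configure     -> configur
    # configured    -> configur
    # configuration -> configur
    # configurable  -> configur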
commit_num = count_by_keyword(file, commit_link_col, keywords, keyword_col) 86 | param_num = simple_count_by_keyword(file, keywords, keyword_col) 87 | print (category + " " + str(commit_num[0] + commit_num[1] + commit_num[2] + commit_num[3]) + " " + str(param_num)) 88 | 89 | print("##########################################################") 90 | print("Table IV") 91 | print("INTERFACE is calculated by adding up AddParam, RemoveParam and ModifyParam") 92 | print("BEHAVIOR is calculated by adding up Parse, Check, Handle and Use") 93 | print("DOCUMENT is calculated by adding up User Manual and Code Comments") 94 | print("##########################################################") 95 | print("Table V") 96 | print("AddParam, RemoveParam and ModifyParam are calculated by adding their sub-categories") 97 | print_count_by_keyword("AddNewCode", "add_param.csv",1,{"new"},2) 98 | print_count_by_keyword("AddCodeChange", "add_param.csv",1,{"change"},2) 99 | print_simple_count("AddParameterization","parameterization.csv",4) 100 | print_simple_count("RmvModule","rmv_with_code.csv",1) 101 | print_simple_count("RmvReplace","rmv_replace.csv",3) 102 | print_simple_count("ModNaming","param_rename.csv",2) 103 | print_simple_count("ModDefaultValue","change_default_value.csv",4) 104 | print_simple_count("ModConstraint","change_param_constraint.csv",4) 105 | print("##########################################################") 106 | print("Table VI") 107 | print_commit_and_param_num("Performance","parameterization.csv",4,{"performance"},6) 108 | print_commit_and_param_num("Reliability","parameterization.csv",4,{"reliability"},6) 109 | print_commit_and_param_num("Manageability","parameterization.csv",4,{"manageability"},6) 110 | print_commit_and_param_num("Debugging","parameterization.csv",4,{"debug"},6) 111 | print_commit_and_param_num("Environment","parameterization.csv",4,{"env"},6) 112 | print_commit_and_param_num("Compatibility","parameterization.csv",4,{"compatibility"},6) 113 | print_commit_and_param_num("Testability","parameterization.csv",4,{"testability"},6) 114 | print_commit_and_param_num("Security","parameterization.csv",4,{"security"},6) 115 | print("##########################################################") 116 | print("Table VIII") 117 | print("Handle and Use are calculated by adding their sub-categories") 118 | print_simple_count("Parse","config_parsing.csv",1) 119 | print_count_by_keyword("Check","checking_and_handling_code.csv",4,{"check"},5) 120 | print_count_by_keyword("HandleAction ","checking_and_handling_code.csv",4,{"exception"},5) 121 | print_count_by_keyword("HandleMessage","checking_and_handling_code.csv",4,{"message"},5) 122 | print_simple_count("UseChange","change_param_existing_usage.csv",4) 123 | print_simple_count("UseAdd","param_new_use.csv",4) 124 | print("##########################################################") 125 | print("Documentation") 126 | print_count_by_keyword("User Manual","change_documentation.csv",4,{"file","guide","command","description"},6) 127 | print_count_by_keyword("Code Comments","change_documentation.csv",4,{"code comment"},6) 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | -------------------------------------------------------------------------------- /code/spark/extract_commit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import random 3 | import sys 4 | import diff_file_parser 5 | from download_diff import DIFF_FILE_PATH 6 | import nltk 7 | from nltk.stem.porter import PorterStemmer 8 | 9
| 10 | def extract(commit_info,configVariableList): 11 | 12 | commit_info = commit_info.split('$$$') 13 | commit_url = commit_info[0] 14 | commit_title = commit_info[1] 15 | if len(commit_info)>=3: 16 | commit_time = commit_info[2] 17 | commit_sha = commit_url.split('/') 18 | commit_sha = commit_sha[-1] 19 | 20 | desc_contain_keyword = False #whether commit description contains the configuration keyword 21 | irrelevant_commit = False 22 | diff_contain_config = False #whether diff touches configuration 23 | 24 | commit_title = commit_title.replace('## What changes were proposed in this pull request?','') 25 | title_words = commit_title.split(' ') 26 | count = 0 27 | for word in title_words: 28 | if count > 20: 29 | break 30 | if word.lower() == 'option' or word.lower() == 'parameter': 31 | commit_title = commit_title.replace(word, "**" + word.upper() + "**") 32 | desc_contain_keyword = True 33 | st = PorterStemmer() 34 | word_stemmed = st.stem(word).lower() 35 | if 'config' in word.lower(): 36 | commit_title = commit_title.replace(word, "**" + word.upper() + "**") 37 | desc_contain_keyword = True 38 | count = count + 1 39 | 40 | 41 | commit_titleword = commit_title.lower().split(' ') 42 | for word in {'merge','merging','checkstyle'}: 43 | if word in commit_titleword: 44 | irrelevant_commit = True 45 | break 46 | 47 | codeResult = [] 48 | diff_file_path = DIFF_FILE_PATH + '/' + commit_sha + '.diff' 49 | if irrelevant_commit == False: 50 | codeResult = diff_file_parser.diff_selection(diff_file_path, configVariableList) 51 | 52 | #Whether a diff touches configuration doc 53 | configDocTouched = False 54 | 55 | #Whether a diff touches configuration build 56 | configBuildTouched = False 57 | 58 | #Whether a diff touches configuration load function 59 | configLoadTouched = False 60 | 61 | #Whether a diff touches configuration set function 62 | configSetTouched = False 63 | 64 | #Whether a diff touches configuration variable 65 | configVariableTouched = False 66 | 67 | #Whether a diff touches configuration message 68 | configMessageTouched = False 69 | 70 | if codeResult: 71 | 72 | configDocTouched = codeResult[0] 73 | 74 | configBuildTouched = codeResult[1] 75 | 76 | configLoadTouched = codeResult[2] 77 | 78 | configSetTouched = codeResult[3] 79 | 80 | configVariableTouched = codeResult[4] 81 | 82 | configMessageTouched = codeResult[5] 83 | 84 | #the set of touched build functions 85 | touchedBuildFunc = codeResult[6] 86 | 87 | #the set of touched configuration load functions 88 | touchedConfigLoadFunc = codeResult[7] 89 | 90 | #the set of touched configuration set functions 91 | touchedConfigSetFunc = codeResult[8] 92 | 93 | #the set of touched configuration variables 94 | touchedVariable = codeResult[9] 95 | 96 | #the set of touched message keywords 97 | touchedMessage = codeResult[10] 98 | 99 | if True in (configDocTouched,configBuildTouched,configLoadTouched,configSetTouched,configVariableTouched,configMessageTouched): 100 | diff_contain_config = True 101 | 102 | if (irrelevant_commit == False) and (desc_contain_keyword == True or diff_contain_config == True): 103 | print("find a candidate commit!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") 104 | file = open('commit_selected.txt', 'a') 105 | file.write("###############################################################################" + '\n') 106 | file.write(commit_title + '\n' + commit_url + '\n' + commit_time + '\n') 107 | file.write('Commit message touches config:' + str(desc_contain_keyword) + '\n') 108 | file.write('Diff touches config define(doc):' +
str(configDocTouched) + '\n') 109 | file.write('Diff touches config define(buildFunc):' + str(configBuildTouched) + '\n') 110 | file.write('Diff touches config loading:' + str(configLoadTouched) + '\n') 111 | file.write('Diff touches config setting:' + str(configSetTouched) + '\n') 112 | file.write('Diff touches config variable (data flow):' + str(configVariableTouched) + '\n') 113 | file.write('Diff touches config message:' + str(configMessageTouched) + '\n') 114 | 115 | file.write('\n_________________touchedConfigDefine(Doc)_____________________\n\n') 116 | if configDocTouched: 117 | file.write('configuration.md' + ' ') 118 | file.write('\n') 119 | else: 120 | file.write('Null\n') 121 | 122 | file.write('\n_________________touchedConfigDefine(Build)_____________________\n\n') 123 | if configBuildTouched: 124 | for buildFunc in touchedBuildFunc: 125 | file.write(buildFunc + ' ') 126 | file.write('\n') 127 | else: 128 | file.write('Null\n') 129 | 130 | file.write('\n___________________touchedConfigLoad___________________\n\n') 131 | if configLoadTouched: 132 | for configLoadFunc in touchedConfigLoadFunc: 133 | file.write(configLoadFunc + '\n') 134 | else: 135 | file.write('Null\n') 136 | 137 | file.write('\n___________________touchedConfigSet____________________\n\n') 138 | if configSetTouched: 139 | for setFunc in touchedConfigSetFunc: 140 | file.write(setFunc + '\n') 141 | else: 142 | file.write('Null\n') 143 | 144 | file.write('\n___________________touchedConfigVariable_____________________\n\n') 145 | if configVariableTouched: 146 | for variable in touchedVariable: 147 | file.write(variable + '\n') 148 | else: 149 | file.write('Null\n') 150 | 151 | file.write('\n_______________________touchedMessage________________________\n\n') 152 | if configMessageTouched: 153 | for keyword in touchedMessage: 154 | file.write('"' + keyword + '"' + '\n') 155 | else: 156 | file.write('Null\n') 157 | 158 | file.write('\n') 159 | 160 | file.close() 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | -------------------------------------------------------------------------------- /commit_analysis/config_parsing.csv: -------------------------------------------------------------------------------- 1 | Issue ID,commit-URL,Type, 2 | CASSANDRA-14800,https://github.com/apache/cassandra/commit/bd0cef9a369ae9245b45040796a6e10f51e522ce#,change load way, 3 | HBASE-19619,https://github.com/apache/hbase/commit/41c2dd04da21bb76208f04af104df2e2f444970d#,change load way, 4 | HBASE-21568,https://github.com/apache/hbase/commit/67d6d5084cf8fc094cda4bd3f091d8a0a9cb1d3e#,change load way, 5 | SPARK-17920,https://github.com/apache/spark/commit/e0d7665cec1e6954d640f422c79ebba4c273be7d#,change load way, 6 | SPARK-21839,https://github.com/apache/spark/commit/d8f45408635d4fccac557cb1e877dfe9267fb326#,change load way, 7 | SPARK-21840,https://github.com/apache/spark/commit/3073344a2551fb198d63f2114a519ab97904cb55#,change load way, 8 | SPARK-22151,https://github.com/apache/spark/commit/1272b2034d4eed4bfe60a49e1065871b3a3f96e0#,change load way, 9 | SPARK-22219,https://github.com/apache/spark/commit/bbdcc3bf61da39704650d4570c6307b5a46f7100,change load way, 10 | SPARK-22372,https://github.com/apache/spark/commit/e1dd03e42c2131b167b1e80c761291e88bfdf03f#,change load way, 11 | SPARK-23207,https://github.com/apache/spark/commit/dad2d826ae9138f06751e5d092531a9e06028c21#,change load way, 12 | SPARK-23514,https://github.com/apache/spark/commit/476a7f026bc45462067ebd39cd269147e84cd641#,change load way, 13 |
SPARK-23514,https://github.com/apache/spark/commit/dea381dfaa73e0cfb9a833b79c741b15ae274f64#,change load way, 14 | SPARK-23640,https://github.com/apache/spark/commit/ae9172017c361e5c1039bc2ca94048117021974a#,change load way, 15 | SPARK-24518,https://github.com/apache/spark/commit/33e77fa89b5805ecb1066fc534723527f70d37c7#,change load way, 16 | SPARK-24680,https://github.com/apache/spark/commit/4d693ac904d89b3afeba107eb0480120daf78174#,change load way, 17 | SPARK-26192,https://github.com/apache/spark/commit/5fd4d7499c9f2925268d84b5d74ecafaebe2113d#,change load way, 18 | SPARK-27253,https://github.com/apache/spark/commit/fc9aad0957fa98ce7a1af2ba529a476b33eebd0e#,change load way, 19 | SPARK-28907,https://github.com/apache/spark/commit/ca711778683a16999560cbdd7c61d98ad6bde6d,change load way, 20 | SPARK-26598,https://github.com/apache/spark/commit/962e330955581aea032ff336a12f23374c39e67,change load way, 21 | SPARK-28939,https://github.com/apache/spark/commit/ca6f693ef17ccb27a6ef5bdad9141abb2fe0434,change load way, 22 | SPARK-29326,https://github.com/apache/spark/commit/91747bd91b410e2d3b7556d0d595fb8e42e4c6d,change load way, 23 | SPARK-29530,https://github.com/apache/spark/commit/484f93e25506f84d1548504783be9ce940149bb,change load way, 24 | SPARK Pull Request #25273,https://github.com/apache/spark/commit/fbaa177d2ac19501add708cc7f28e18d30ca15f,change load way, 25 | SPARK-28642,https://github.com/apache/spark/commit/d19a56f9dbef4c995d80d4b46d03bfbfa4843c5,change load way, 26 | SPARK-28675,https://github.com/apache/spark/commit/47af8925b60509d2a2c932e2bcf25394721c6f1,change load way, 27 | HBASE-23212,https://github.com/apache/hbase/commit/10cc64a7d690429174405517b3e7d75e4f0998f,change load way, 28 | SPARK-27555,https://github.com/apache/spark/commit/c66ec439456c5a160e3849e23c2ce3970d4c6ec7#,fall back place, 29 | HBASE-17356,https://github.com/apache/hbase/commit/db66e6cc9e1c6ea027631388aba688cb623b7d0a#,hard-coded name, 30 | HBASE-19672,https://github.com/apache/hbase/commit/0cd6050d090d11240a40c012716b3d747fbcb58f#,hard-coded name, 31 | HBASE-21663,https://github.com/apache/hbase/commit/fbf79373e649d7cf3926b873e426fe0121f078c6#,hard-coded name, 32 | HDFS-11345,https://github.com/apache/hadoop/commit/2c769167dbdb66c52d2ba7b7193a686444085570#,hard-coded name, 33 | SPARK-16944,https://github.com/apache/spark/commit/4329eb2e73181819bb712f57ca9c7feac0d640ea#,hard-coded name, 34 | SPARK-19558,https://github.com/apache/spark/commit/bd4eb9ce57da7bacff69d9ed958c94f349b7e6fb#,hard-coded name, 35 | SPARK-20642,https://github.com/apache/spark/commit/74daf622de4e534d5a5929b424a6e836850eefad#,hard-coded name, 36 | SPARK-21428,https://github.com/apache/spark/commit/581200af717bcefd11c9930ac063fe53c6fd2fde#,hard-coded name, 37 | SPARK-22050,https://github.com/apache/spark/commit/1437e344ec0c29a44a19f4513986f5f184c44695#,hard-coded name, 38 | SPARK-24665,https://github.com/apache/spark/commit/8f91c697e251423b826cd6ac4ddd9e2dac15b96e#,hard-coded name, 39 | SPARK-25300,https://github.com/apache/spark/commit/ca861fea21adc4e6ec95eced7076cb27fc86ea18#,hard-coded name, 40 | SPARK-26443,https://github.com/apache/spark/commit/e6d3e7d0d8c80adaa51b43d76f1cc83bb9a010b9#,hard-coded name, 41 | SPARK-26463,https://github.com/apache/spark/commit/7bf0794651f4d11547325539ebf7131a57ee1ba2#,hard-coded name, 42 | SPARK-26470,https://github.com/apache/spark/commit/b1a9b5eff59f64c370cd7388761effdf2152a108#,hard-coded name, 43 | SPARK-26477,https://github.com/apache/spark/commit/64cc9e572e0213d5dea241b2b48ecdd68a5c6c99#,hard-coded 
name, 44 | SPARK-26698,https://github.com/apache/spark/commit/aa3d16d68b7ebd9210c330905f01590ef93d875c#,hard-coded name, 45 | SPARK-27141,https://github.com/apache/spark/commit/8204dc1e548b87aabaf36c5800592bafd44e4419#,hard-coded name, 46 | SPARK-27184,https://github.com/apache/spark/commit/68abf77b1ad8da7916a9dc5fa8bb350b64479410#,hard-coded name, 47 | SPARK-27343,https://github.com/apache/spark/commit/5a8aad01c2aaf0ceef8e9a3cfabbd2e88c8d9f0d#,hard-coded name, 48 | SPARK-27649,https://github.com/apache/spark/commit/8329e7debdaf6db9f3a52094bbc5dc4c1e2771ea#,hard-coded name, 49 | SPARK-27844,https://github.com/apache/spark/commit/447bfdec830ba5eaaee791e86caad39f4f6661eb#,hard-coded name, 50 | SPARK-28257,https://github.com/apache/spark/commit/42b80ae128ab1aa8a87c1376fe88e2cde52e6e4,hard-coded name, 51 | SPARK-25694,https://github.com/apache/spark/commit/8469614c0513fbed87977d4e741649db3fdd8ad,hard-coded name, 52 | HBASE-22859,https://github.com/apache/hbase/commit/018396d84cfe1008308f341562154452f4a45ac,hard-coded name, 53 | SPARK-21786,https://github.com/apache/spark/commit/00d169156d4b1c91d2bcfd788b254b03c509dc41#,load overriden, 54 | CASSANDRA-13614,https://github.com/apache/cassandra/commit/613a8b43d2b5a425080653898b28bde6cd7eb9ba#,refine API, 55 | CASSANDRA-13699,https://github.com/apache/cassandra/commit/cf4a0576a6f2b8f2d828a8b14140f212803adb7c#,refine API, 56 | HBASE-19621,https://github.com/apache/hbase/commit/1556939236016bb51e45ffa1e8038c74e0f0db75#,refine API, 57 | HBASE-21492,https://github.com/apache/hbase/commit/7877e09b6023c80e8bacd25fb8e0b9273ed7d258#,refine API, 58 | HDFS-13222,https://github.com/apache/hadoop/commit/88fba00caa8c8e26f70deb9be5b534e7482620a1#,refine API, 59 | SPARK-24003,https://github.com/apache/spark/commit/007ae6878f4b4defe1f08114212fa7289fc9ee4a#,refine API, 60 | SPARK-24250,https://github.com/apache/spark/commit/dd37529a8dada6ed8a49b8ce50875268f6a20cba#,refine API, 61 | SPARK-24782,https://github.com/apache/spark/commit/e008ad175256a3192fdcbd2c4793044d52f46d57#,refine API, 62 | SPARK-26384,https://github.com/apache/spark/commit/3c0bb6bc45e64fd82052d7857f2a06c34f0c1793#,refine API, 63 | HBASE-20856,https://github.com/apache/hbase/commit/1d0fca370bf56a41fc62b72bebe86a7185a2b0c2#,refine API, 64 | HBASE-21203,https://github.com/apache/hbase/commit/0e173d38b05363e1fb5c85955a4964f05958c1fc#,refine API, 65 | HDFS-14051,https://github.com/apache/hadoop/commit/f0ce072934515d39e9bf61744058ecad3393291e#,refine API, 66 | SPARK-28840,https://github.com/apache/spark/commit/7e6142591f3bc865806b86c7a7b90be008a319d,refine API, 67 | SPARK-28957,https://github.com/apache/spark/commit/d8b0914c2e0fdee72a3b9abb2d65283e22b6e8e,refine API, 68 | SPARK-10614,https://github.com/apache/spark/commit/857f109c47b26a38f5d114a94f94c516177db3f,refine API, 69 | SPARK-30195,https://github.com/apache/spark/commit/33f53cb2d51b62f4c294c8640dc069e42f36d68,refine API, 70 | SPARK-29158,https://github.com/apache/spark/commit/bd05339171db00c2f2dd89702f9500ed6e1e321,refine API, 71 | SPARK-28922,https://github.com/apache/spark/commit/d502c80404c398d852dfa5f86a0e87c104a6286,refine API, 72 | HBASE-23379,https://github.com/apache/hbase/commit/c39339c0046560b8f2083af513f384127e3f46d,refine API, 73 | CASSANDRA-15277,https://github.com/apache/cassandra/commit/860de83a02f3b7711e842a58a073802b9920a1a1,refine API, 74 | HBASE-20879,https://github.com/apache/hbase/commit/2997b6d0714d5542784baf830e7c16a9ef6b62d6#,refine API (sensitive), 75 | 
CASSANDRA-14716,https://github.com/apache/cassandra/commit/cdeac4992bdb1f569c3a04b628ded7e5351364ee#,refine API(case sensitive), 76 | SPARK-25415,https://github.com/apache/spark/commit/d522a563ad5ab157993a19f406a3cc6f443ccb9e#,refine API(case sensitive), 77 | HDFS-14039,https://github.com/apache/hadoop/commit/8d99648c203004045a9339ad27258092969145d6#,refine API(trimmed), 78 | HBASE-21639,https://github.com/apache/hbase/commit/6da0b4ec34727240e433825382cfc30366340097#,refine API(Unit), 79 | HDFS-12085,https://github.com/apache/hadoop/commit/3a7f02b81520ad4d3eebf92e9dbca662beec0302#,refine API(Unit), 80 | SPARK-21033,https://github.com/apache/spark/commit/083cf223569b7896e35ff1d53a73498a4971b28d#,refine API(Unit), 81 | SPARK-24332,https://github.com/apache/spark/commit/53c06ddabbdf689f8823807445849ad63173676f#,refine API(Unit), 82 | SPARK-24452,https://github.com/apache/spark/commit/90da7dc241f8eec2348c0434312c97c116330bc4#,refine API(Unit), 83 | CASSANDRA-14314,https://github.com/apache/cassandra/commit/11496039fb18bb45407246602e31740c56d28157#,wrong API, 84 | CASSANDRA-15019,https://github.com/apache/cassandra/commit/99ce007c5beb7988ce83fb1443a1e0ca259264cc#,wrong API, 85 | SPARK-29015,https://github.com/apache/spark/commit/cc852d4eec696731cef9ddd6fb0c0c2184194f6,wrong API, 86 | SPARK-28331,https://github.com/apache/spark/commit/c88df2ccf670db62aed6565c9dbdb58d5d5cca3,wrong API, -------------------------------------------------------------------------------- /commit_analysis/rmv_replace.csv: -------------------------------------------------------------------------------- 1 | Issue-ID,Title,Parameter,Issue-URL,Commit-URL,Change type,Param type,Pattern,Note 2 | HBASE-18786,FileNotFoundException should not be silently handled for primary region replicas,hbase.hregion.unassign.for.fnfe,https://issues.apache.org/jira/browse/HBASE-18786,https://github.com/apache/hbase/commit/b27f9b582a858fba66036413936debad27737c3a,hard-coded logic,bool,make feature mandatory,"this is not something that should be parameterized. We either do it or we don't. Otherwise it becomes an obscure setting that could lead to serious conditions if an operator changes it to the non-default value, which we know won't be well tested.For me, FNFE should not happen and if it happens then there must be serious bugs that may cause data loss.That's why I introduce a config, the intention is to disable the feature as we used to always handle it silently..." 3 | HBASE-19999,Remove the SYNC_REPLICATION_ENABLED flag,hbase.replication.sync.enabled,https://issues.apache.org/jira/browse/HBASE-19999,https://github.com/apache/hbase/commit/c7d1085fa27a64621d262aefea825e980e6bc576,hard-coded logic,bool,make feature mandatory,It is a bit strange since we can not guard all the sync replication related code with it. We'd better change its name and only use it within the WAL construction. Now the default case will use SyncReplicationWALProvider.only disable SyncReplicationWALProvider for HMaster or HRegionServer which take system table only. 4 | HBASE-8518,Get rid of hbase.hstore.compaction.complete setting,hbase.hstore.compaction.complete,https://issues.apache.org/jira/browse/HBASE-8518,https://github.com/apache/hbase/commit/a21eb68f9584e69157fed683cc512ee3e8963dfb,hard-coded logic,bool,make feature mandatory,hbase.hstore.compaction.complete is a strange setting that causes the finished compaction to not complete (files are just left in tmp) in HStore. Looks like a flag which allow compacted files to be created but not used. 
May be someone who wants to see the time /size of compaction without affecting the stores. Does not seem very useful. 5 | SPARK-23366,Improve hot reading path in ReadAheadInputStream,spark.unsafe.sorter.spill.read.ahead.fraction,https://issues.apache.org/jira/browse/SPARK-23366,https://github.com/apache/spark/commit/7539ae59d6c354c95c50528abe9ddff6972e960f,hard-coded logic,bool,make feature mandatory,"Remove `readAheadThresholdInBytes` and instead immediately trigger async read when switching the buffers. It allows to simplify code paths, especially the hot one that then only has to check if there is available data in the active buffer, without worrying if it needs to retrigger async read. It seems to have positive effect on perf." 6 | SPARK-26362,Remove 'spark.driver.allowMultipleContexts' to disallow multiple Spark contexts,spark.driver.allowMultipleContexts,https://issues.apache.org/jira/browse/Spark-26362,https://github.com/apache/spark/commit/9ccae0c9e7d1a0a704e8cd7574ba508419e05e30,hard-coded logic,bool,make feature mandatory,"Multiple SparkContexts are discouraged and it has been warning for last 4 years, see SPARK-4180. It could cause arbitrary and mysterious error cases, see SPARK-2243. Honestly, I didn't even know Spark still allows it, which looks never officially supported, see SPARK-2243." 7 | SPARK-27938,Remove feature flag LEGACY_PASS_PARTITION_BY_AS_OPTIONS,LEGACY_PASS_PARTITION_BY_AS_OP,https://issues.apache.org/jira/browse/Spark-27938,https://github.com/apache/spark/commit/eee3467b1ea674a64a3c70775cfbf2710318993e,hard-coded logic,bool,make feature mandatory,"To make this change less intrusive for a patch release, we added a feature flag `LEGACY_PASS_PARTITION_BY_AS_OPTIONS` with the default to be false. For 3.0, we should just do the correct behavior for DSV1, i.e., always passing partitionBy as options, and remove this legacy feature flag." 8 | SPARK-28699,Cache an indeterminate RDD could lead to incorrect result while stage rerun,SQLConf.get.enableRadixSort,https://issues.apache.org/jira/browse/SPARK-28699,https://github.com/apache/spark/commit/2d9cc42aa83beb5952bb44d3cd0327d4432d385,hard-coded logic,bool,make feature mandatory,"After further investigation, we found that this bug is nothing to do with cache operation. So we focus on the sort + shuffle self and finally found the root cause is about the wrong usage for radix sort." 9 | HBASE-22760,Stop/Resume Snapshot Auto-Cleanup activity with shell command,hbase.master.cleaner.snapshot.disable,https://issues.apache.org/jira/browse/HBASE-22760,https://github.com/apache/hbase/commit/1dcc8ee50cd2120496ec768e09e7f368b6bc26b,hard-coded logic,bool,make feature mandatory,"For any scheduled snapshot backup activity, we would like to disable auto-cleaner for snapshot based on TTL. However, as per HBASE-22648 we have a config to disable snapshot auto-cleaner: hbase.master.cleaner.snapshot.disable, which would take effect only upon HMaster restart just similar to any other hbase-site configs." 10 | CASSANDRA-14108,Improve commit log chain marker updating,commitlog_marker_period_in_ms,https://issues.apache.org/jira/browse/CASSANDRA-14108,https://github.com/apache/cassandra/commit/db788fe860dfd69f06ab97ae35fa67fcf2517b6d,hard-coded value,time,using 100,"Instead of requiring users to configure a deep, dark implementation detail like the commit log chained markers (via commitlog_marker_period_in_ms in the yaml), we decided it is best to eliminate thew configuration and always update the chained markers (when in periodic mode). 
I've removed the confusing (and confusingly described) yaml property for setting the commitlog_marker_period_in_ms. Instead, I've hardcoded the marker interval to 100ms and it is always applied when a) using periodic mode, and b) not using compression or encryption." 11 | HBASE-19282,Making CellChunkMap the default index,hbase.hregion.compacting.memstore.index,https://issues.apache.org/jira/browse/HBASE-19282,https://github.com/apache/hbase/commit/8d0da1a77f50b730b366c28b5b477141aa83cc55,hard-coded value,index,using orignial default value,In order to avoid additional user settings. If no MSLAB is requested the index is going to be CellArrayMap 12 | HDFS-12412,Change ErasureCodingWorker.stripedReadPool to cached thread pool.,dfs.datanode.ec.reconstruction.stripedread.threads,https://issues.apache.org/jira/browse/HDFS-12412,https://github.com/apache/hadoop/commit/123342cd0759ff88801d4f5ab10987f6e3f344b0,hard-coded value,thread number,using Integer.MAX_VALUE,"The idea to remove the striped read pool and reuse the same reconstruction pool sounds good to me, since given the later and the most often used erasure codec, we can roughly estimate the striped read threads need. We can also simplify the configuration and codes. Less configuration with reasonable defaults would make the brand feature more easier to use. When needed, we can fine-tune and add more later." 13 | HDFS-12775,READ] Fix reporting of Provided volumes,dfs.provided.df.class,https://issues.apache.org/jira/browse/HDFS-12775,https://github.com/apache/hadoop/commit/3b1d30301bcd35bbe525a7e122d3e5acfab92c88,hard-coded value,class implementation,using orignial default value,"The capacity (and dfs used) of a PROVIDED volume on a DN is reported to be equal to the total size of the data (in bytes) mounted from the remote storage. Each volume reports zero available capacity (thus 100% usage). This included changes to ProvidedVolumeImpl, and adding a default ProvidedVolumeDFimplementation and removing the earlier configurable ProvidedVolumeDF interface." 14 | SPARK-25704,Allocate a bit less than Int.MaxValue,spark.storage.memoryMapLimitForTests,https://issues.apache.org/jira/browse/SPARK-25704,https://github.com/apache/spark/commit/43717dee570dc41d71f0b27b8939f6297a029a02,hard-coded value,maxChunkSize,using Integer.MAX_VALUE - 15,"Replicating a block > 2GB currently fails because it tries to allocate a bytebuffer that is just a bit too large, due to a bad default config. MEMORY_MAP_LIMIT_FOR_TESTS defaults to Integer.MAX_VALUE, but unfortunately that is just a tiny bit too big. Workaround: Set to ""spark.storage.memoryMapLimitForTests"" something a bit smaller, eg. 2147483135 (that's Integer.MAX_VALUE - 512, just in case its a bit different on other systems)." 15 | CASSANDRA-13990,Remove obsolete OldNetworkTopologyStrategy,replication_factor_strategies,https://issues.apache.org/jira/browse/CASSANDRA-13990,https://github.com/apache/cassandra/commit/7c5904753f4ede492f1a5a5e68edfe37651a5be6,hard-coded value,class implementation,using orignial default value,RackAwareStrategy was renamed OldNetworkTopologyStrategy back in 0.7 (CASSANDRA-1392) and it's still around. 
16 | HBASE-16894,"Create more than 1 split per region, generalize HBASE-12590",hbase.mapreduce.input.autobalance.maxskewratio,https://issues.apache.org/jira/browse/HBASE-16894,https://github.com/apache/hbase/commit/16d483f9003ddee71404f37ce7694003d1a18ac4,program control,ratio,using better feature,"If we want to fix this properly, we should extend the approach in HBASE-12590, and make it so that the client can specify the desired num of mappers, or desired split size, and the TIF generates the splits based on the current region sizes very similar to the algorithm in HBASE-12590, but a more generic way. This also would eliminate the hand tuning of data skew ratio." 17 | HBASE-19616,Review of LogCleaner Class,hbase.oldwals.cleaner.thread.check.interval.msec,https://issues.apache.org/jira/browse/HBASE-19616,https://github.com/apache/hbase/commit/af923225d0a874ecf3c7deddbc0d7bc82184e1d1,program control,interval,using better feature,Using a CountDownLatch allows one or more threads to wait until a set of operations being performed in other threads completes. It will not blindly sleep between checks and it will return immediately after the condition is met. This removes the HBase configuration that controls the sleep interval. 18 | HBASE-21228,Memory leak since AbstractFSWAL caches Thread object and never clean later,REGION_SERVER_HANDLER_COUNT,https://issues.apache.org/jira/browse/HBASE-21228,https://github.com/apache/hbase/commit/86cb8e48ad8aecf52bca1169a98607c76198c70b,program control,thread number,using better feature,"In one of our customer's cluster, we noticed that even though there is no requests, the heap of the RS is almost full and CMS GC was triggered every second. We dumped the heap and then found out there were more than 30 thousands threads with Terminated state. which are all cached in this map above. Everything referenced in these threads were leaked." 
-------------------------------------------------------------------------------- /commit_analysis/rmv_with_code.csv: -------------------------------------------------------------------------------- 1 | ,link,Title,change mode,param_name 2 | HDFS-12414,https://github.com/apache/hadoop/commit/e0b3c644e186d89138d4174efe0cbe77a0200315,Ensure to use CLI command to enable/disable erasure coding policy,remove param with code,dfs.namenode.ec.policies.enabled 3 | HDFS-14401,https://github.com/apache/hadoop/commit/9b0aace1e6c54f201784912c0b623707aa82b761,Refine the implementation for HDFS cache on SCM,remove param with code,dfs.datanode.cache.loader.class dfs.datanode.cache.pmem.capacity 4 | HDFS-14730,https://github.com/apache/hadoop/commit/30ed24a42112b3225ab2486ed24bd6a5011a7a7, Removed unused **CONFIGUR**ation dfs.web.authentication.filter.,Rmv.RmvModule,dfs.web.authentication.filter 5 | HBASE-18369,https://github.com/apache/hbase/commit/bbf23d9627849c32ee6914c1350da02bceba5127,hbase thrift web-ui not available,remove param with code,hbase.regionserver.thrift.port hbase.regionserver.thrift.server.type hbase.regionserver.thrift.compact hbase.regionserver.thrift.framed 6 | HBASE-18721,https://github.com/apache/hbase/commit/8a800c3f196fcbc3ed63f0967025c1779c43d486,Cleanup unused configs and private declaration,remove param with code,… 7 | HBASE-17972,https://github.com/apache/hbase/commit/5ff04c5e7fdf12946a3f0ae15ed7e83209f0e617,Remove mergePool from CompactSplitThread,remove param with code,hbase.regionserver.thread.merge 8 | HBASE-19073,https://github.com/apache/hbase/commit/dd70cc308158c435c6d8ec027e2435a29be4326b,Cleanup CoordinatedStateManager,remove param with code,hbase.coordinated.state.manager.class 9 | HBASE-19128,https://github.com/apache/hbase/commit/4132314f51951af43f4f56d9886233b3ba417903,"Purge Distributed Log Replay from codebase, configurations, text; mark the feature as unsupported, broken.",remove param with code,hbase.master.distributed.log.replay hbase.regionserver.disallow.writes.when.recovering zookeeper.znode.recovering.regions 10 | HBASE-19357,https://github.com/apache/hbase/commit/ba4f9f834948e6f042e771ae5ee016610afe928c,Bucket cache no longer L2 for LRU cache.,remove param with code,hbase.bucketcache.combinedcache.enabled 11 | HBASE-19148,https://github.com/apache/hbase/commit/4d6b928682cc2a17f3dfd0179fb3fd46fd9e0a1f,Reevaluate default values of **CONFIG**urationsRemoved unused: hbase.fs.tmp.dirAdded hbase.master.loadbalance.bytableEdit of description text. 
Moved stuff around to put **CONFIG**s beside eachother.M hbase-server/src/main/java/org/apache/hadoop/hbase/util/ServerCommandLine.java Emit some hbase **CONFIG**s in log on startup,remove param with code,hbase.fs.tmp.dir 12 | HBASE-19618,https://github.com/apache/hbase/commit/2ce5dc892710666c9a382fdeece412ecbb8559bb,Remove replicationQueuesClient.class/replicationQueues.class **CONFIG** and remove table based ReplicationQueuesClient/ReplicationQueues implementation,remove param with code,hbase.region.replica.replication.replicationQueuesClient.class 13 | HBASE-19617,https://github.com/apache/hbase/commit/f4703c6ed327f361df371312da8e8edb532048a1,"Remove ReplicationQueues, use ReplicationQueueStorage directly",remove param with code,hbase.replication.queues.createtable.retries.number 14 | HBASE-20000,https://github.com/apache/hbase/commit/c18e7a963d9c4dc862c4706f128a4e436111669c,"Remove the quantum logic in FairQueue, always put high priority queue in front",remove param with code,hbase.master.procedure.queue.meta.table.priority hbase.master.procedure.queue.system.table.priority hbase.master.procedure.queue.user.table.priority 15 | HBASE-21420,https://github.com/apache/hbase/commit/c8574ba3c52274ed5a93e46f7af30dd8b46fb878,Use procedure event to wake up the SyncReplicationReplayWALProcedures which wait for worker,remove param with code,zookeeper.znode.sync.replication.replaywal.workers 16 | HBASE-21792,https://github.com/apache/hbase/commit/7dc69b61287d66641f0ae3d251b1d106d2a00ccf,Mark HTableMultiplexer as deprecated and remove it in 3.0.0,remove param with code,… 17 | HBASE-22186,https://github.com/apache/hbase/commit/20f72f5e252233361ee474e58b4a8fef69926b8b,Remove usage of deprecated SnapshotDescriptionUtils fields,remove param with code,SNAPSHOT_TIMEOUT_MILLIS_KEY 18 | HBASE-22933,https://github.com/apache/hbase/commit/090c55f3ff40dea807dc7e67240f19dcafb3865,Do not need to kick reassign for rs group change any more (……#550),rmv RmvModule ,REASSIGN_WAIT_INTERVAL_KEY 19 | HBASE-23334,https://github.com/apache/hbase/commit/dbbba7932c2f3de8d25aa4f37be943bf07bbc46,The table-lock node of zk is not needed since HBASE-16786 (……#873),rmv RmvModule ,zookeeper.znode.tableLock 20 | HBASE-22971,https://github.com/apache/hbase/commit/b10b39ad0365b378bbf7a493c76501c77f73942,Deprecated RSGroupAdminEndpoint and make RSGroup feature ……always enabled (#595),rmv RmvModule ,hbase.rsgroup.grouploadbalancer.class 21 | SPARK-22487,https://github.com/apache/spark/commit/f7534b37ee91be14e511ab29259c3f83c7ad50af,[SQL][FOLLOWUP] still keep spark.sql.hive.version,rmv RmvModule ,spark.sql.hive.version 22 | SPARK-21253,https://github.com/apache/spark/commit/80f7ac3a601709dd9471092244612023363f54cd,Disable spark.reducer.maxReqSizeShuffleToMem,rmv RmvModule ,spark.reducer.maxReqSizeShuffleToMem 23 | SPARK-25876,https://github.com/apache/spark/commit/6be272b75b4ae3149869e19df193675cc4117763,Simplify configuration types in k8s backend,rmv RmvModule ,spark.kubernetes.python.pyFiles 24 | SPARK-13656,https://github.com/apache/spark/commit/e00f1a1da12be4a1fdb7b89eb5e098aa16c5c2c3,[SQL] Delete spark.sql.parquet.cacheMetadata from SQLConf and docs,rmv RmvModule ,spark.sql.parquet.cacheMetadata 25 | SPARK-20646,https://github.com/apache/spark/commit/11eea1a4ce32c9018218d4dfc9f46b744eb82991,[CORE] Port executors page to new UI backend.,rmv RmvModule ,spark.ui.timeline.executors.maximum spark.ui.retainedDeadExecutors 26 | SPARK-20648,https://github.com/apache/spark/commit/4741c07809393ab85be8b4a169d4ed3da93a4781,[CORE] Port JobsTab 
and StageTab to the new UI backend.,rmv RmvModule ,spark.ui.retainedJobs spark.ui.retainedStages 27 | SPARK-20652,https://github.com/apache/spark/commit/0ffa7c488fa8156e2a1aa282e60b7c36b86d8af8,[SQL] Store SQL UI data in the new app status store.,rmv RmvModule ,spark.sql.ui.retainedExecutions 28 | SPARK-22489,https://github.com/apache/spark/commit/8ff474f6e543203fac5d49af7fbe98a8a98da567,[CORE] Remove JobProgressListener.,rmv RmvModule ,spark.ui.retainedStages spark.ui.retainedJobs spark.ui.retainedTasks 29 | SPARK-22520,https://github.com/apache/spark/commit/087879a77acb37b790c36f8da67355b90719c2dc,[SQL] Support code generation for large CaseWhen,rmv RmvModule ,spark.sql.codegen.maxCaseBranches 30 | SPARK-22839,https://github.com/apache/spark/commit/f15906da153f139b698e192ec6f82f078f896f1e,[K8S] Remove the use of init-container for downloading remote dependencies,rmv RmvModule ,spark.kubernetes.mountDependencies.filesDownloadDir spark.kubernetes.mountDependencies.jarsDownloadDir spark.kubernetes.mountDependencies.timeout … 31 | SPARK-23361,https://github.com/apache/spark/commit/5fa438471110afbf4e2174df449ac79e292501f8,[YARN] Allow AM to restart after initial tokens expire.,rmv RmvModule ,spark.yarn.credentials.file.retention.count spark.yarn.credentials.file.retention.days spark.yarn.credentials.file spark.yarn.credentials.renewalTime spark.yarn.credentials.updateTime 32 | SPARK-23538,https://github.com/apache/spark/commit/508573958dc9b6402e684cd6dd37202deaaa97f6,[CORE] Remove custom **CONFIGUR**ation for SSL client.,rmv RmvModule ,spark.ssl.fs 33 | SPARK-25160,https://github.com/apache/spark/commit/60af2501e1afc00192c779f2736a4e3de12428fa,Avro: remove sql **CONFIGUR**ation spark.sql.avro.outp……utTimestampType,rmv RmvModule ,spark.sql.avro.outputTimestampType 34 | SPARK-25705,https://github.com/apache/spark/commit/703e6da1ecb52ab5b8f42b3b4cac39f27caa51d8,Remove Kafka 0.8 integration,rmv RmvModule ,spark.streaming.kafka.maxRetries 35 | SPARK-25711,https://github.com/apache/spark/commit/26c1b959cf29b8552beb715cc5d39288d5298bdc,Allow history server to show usage and remove deprecated options,rmv RmvModule ,spark.history.fs.logDirectory 36 | SPARK-25815,https://github.com/apache/spark/commit/4b3fe3a9ccc8a4a8eb0d037d19cb07a8a288e37a,"Support kerberos in client mode, keytab-based token renewal.",rmv RmvModule ,spark.kubernetes.executor.krb5ConfigMapName spark.kubernetes.kerberos.spark-user-name 37 | SPARK-26503,https://github.com/apache/spark/commit/51a6ba0181a013f2b62b47184785a8b6f6a78f12,Get rid of spark.sql.legacy.timeParser.enabled,rmv RmvModule ,spark.sql.legacy.timeParser.enabled 38 | SPARK-26539,https://github.com/apache/spark/commit/2f8a938805ce3c182d61bab8f66b9ff6d90dc83b,Remove spark.memory.useLegacyMode and StaticMemoryManager,rmv RmvModule ,spark.memory.useLegacyMode 39 | SPARK-26584,https://github.com/apache/spark/commit/270916f8cd8ba01341f2a38a8376e9e4be08a2e8,Remove `spark.sql.orc.copyBatchToSpark` internal conf,rmv RmvModule ,spark.sql.orc.copyBatchToSpark 40 | SPARK-26788,https://github.com/apache/spark/commit/4808393449ccad2c6bc73c91d0ed8dd8f60c7054,Remove SchedulerExtensionService.,rmv RmvModule ,spark.yarn.services 41 | SPARK-26998,https://github.com/apache/spark/commit/57aff93886ac7d02b88294672ce0d2495b0942b8,Remove SSL **CONFIGUR**ation from executors,rmv RmvModule ,spark.ssl 42 | SPARK-27008,https://github.com/apache/spark/commit/8e5f9995cad409799f3646b3d03761a771ea1664,Support java.time.LocalDate as an external type of…… DateType,rmv RmvModule 
,spark.sql.catalyst.timestampType 43 | SPARK-27349,https://github.com/apache/spark/commit/1d95dea30788b9f64c5e304d908b85936aafb238,Dealing with TimeVars removed in Hive 2.x,rmv RmvModule ,ConfVars.HIVE_STATS_JDBC_TIMEOUT ConfVars.HIVE_STATS_RETRIES_WAIT 44 | SPARK-29399,https://github.com/apache/spark/commit/56a0b5421e41f46a65375c0e5ef9993e9502f93,[CORE] Remove old ExecutorPlugin interface,rmv RmvModule ,spark.executor.plugins 45 | SPARK-29930,https://github.com/apache/spark/commit/5eb8973f871fef557fb4ca3f494406ed676a431,[SQL] Remove SQL **CONFIGS** declared to be removed in Spark 3.0,rmv RmvModule ,"spark.sql.fromJsonForceNullableSchema, spark.sql.legacy.compareDateTimestampInTimestamp, spark.sql.legacy.allowCreatingManagedTableUsingNonemptyLocation" 46 | CASSANDRA-13625,https://github.com/apache/cassandra/commit/082af0a9ba6b5dde26055fcb9ddd2085e4240381,Remove unused max_value_size_in_mb config setting,rmv RmvModule ,max_value_size_in_mb 47 | CASSANDRA-13910,https://github.com/apache/cassandra/commit/2fcd29b830e7b201e7047d283de385d5f1c427b5,Eliminate background repair and probablistic read_repair_chance table option,rmv RmvModule ,dclocal_read_repair_chance read_repair_chance 48 | CASSANDRA-14081,https://github.com/apache/cassandra/commit/df51d0cbbaaa99aea9bc2a582f788f9170dbdc03,Remove unused and deprecated methods from AbstractCompactionStrategy,rmv RmvModule ,COMPACTION_ENABLED 49 | CASSANDRA-14173,https://github.com/apache/cassandra/commit/28ee665b3c0c9238b61a871064f024d54cddcc79,Remove dependencies on JVM internals for JMX support,rmv RmvModule ,com.sun.management.jmxremote.ssl -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Artifacts for Configuration Evolution of Cloud Systems 2 | 3 | This repository includes the artifacts of our paper: [An Evolutionary Study of Configuration Design and Implementation in Cloud Systems](https://arxiv.org/pdf/2102.07052.pdf), presented at the 43rd International Conference on Software Engineering (ICSE'21), May 2021. 4 | 5 | Please cite the paper if you use the code or the datasets. 6 | 7 | The repository includes the following artifacts: 8 | 9 | * `config_commits`: 1178 configuration evolution commits from a recent 10 | 2.5-year (2017.06-2019.12) version control history of four large-scale open-source projects (HDFS, HBase, Spark, and 11 | Cassandra). 12 | * `commit_analysis`: Studied commits with well-labeled categorizations and analysis results, organized based on the structure of the paper. 13 | * `code`: Python scripts for collecting raw commits that touch configuration. 14 | * `commit_study.md`: Documentation of the manual study methodology (analyzing raw commits and issues), including code snippet examples and descriptions for each category in Table II of the submission. 15 | 16 | ## 1. Data Comprehension and Layout 17 | 18 | We provide the data that we studied in this paper. All the data sheets are in CSV format, with titles/labels in the first row. Note that some labels are recorded only for specific commits/parameters (e.g., "How to choose new value" in `change_default_value.csv` applies only to numeric parameters; we explain the reason in Section IV.B of the paper). 19 | 20 | All the data sheets except `change_documentation.csv` use each row to record an individual parameter change, with the links to the commit/issue page. In `change_documentation.csv`, each row records a document change.
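For quick exploration, the sheets can be loaded with Python's standard `csv` module. The snippet below is a minimal sketch (it is not one of the artifact scripts); it assumes it is run from the `commit_analysis` directory and uses `param_rename.csv`, whose columns are issue ID, parameter, commit URL, and the reason for the rename:

~~~python
#!/usr/bin/python3
import csv

# Minimal sketch (not part of the artifact scripts): load one data sheet
# and print each recorded parameter change.
# Assumes the working directory is commit_analysis.
with open('param_rename.csv') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the title/label row
    for row in reader:
        # row = [issue ID, parameter, commit URL, reason for the rename]
        if len(row) >= 4:
            print(row[1], '->', row[3])
~~~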
21 | 22 | **Note that one commit can contain changes to multiple parameters for multiple reasons.** 23 | 24 | Here is a mapping from the subsections of the paper to the data sheets (in the `commit_analysis` directory). 25 | 26 | * **Section IV (Configuration Interface Evolution)** 27 | 28 | * Section IV.A(1) (Parameterization) → `parameterization.csv` 29 | 30 | * Section IV.A(2) (Removing parameters) → `rmv_replace.csv` 31 | 32 | * Section IV.B (Evolution of default values) → `change_default_value.csv` 33 | 34 | * **Section V (Configuration Usage Evolution)** 35 | 36 | * Section V.A (Evolution of parameter checking code) → `checking_and_handling_code.csv` 37 | 38 | * Section V.B (Evolution of error-handling code) → `checking_and_handling_code.csv` 39 | 40 | * Section V.C (Evolution of using parameter values) → `change_param_existing_usage.csv` and `param_new_use.csv` 41 | 42 | * **Section VI (Configuration Document Evolution)** → `change_documentation.csv` 43 | 44 | We also provide sheets for other categories for future study and reuse. 45 | The script in `commit_analysis` counts the numbers and generates the main tables in the paper: 46 | 47 | ~~~ 48 | python3 count_num.py 49 | ~~~ 50 | 51 | ## 2. Commit Collection and Analysis 52 | 53 | Besides the data in this paper, for future reuse and study, we also provide the script we used to collect the raw commits and a tutorial that shows how we did the manual study of each raw commit. 54 | 55 | ### 2.1 Collect raw commits that touch configuration 56 | 57 | Please use python3 to install the dependencies and run the code (we use Python 3.8.5). 58 | 59 | 1. Install dependencies 60 | ~~~bash 61 | pip3 install pathlib 62 | pip3 install nltk 63 | pip3 install beautifulsoup4 64 | ~~~ 65 | 66 | 2. Go to `code/'software'` 67 | and change the following file path in `download_diff.py`: 68 | * DIFF_FILE_PATH = "the path where you want to store the commit diff files" 69 | Then run: 70 | ```bash 71 | python3 get_commit.py 72 | ``` 73 | to download the raw commits for the target software projects. Please add the latest `commit_page_url` of the studied software within the studied time span to `commit_url.txt` (there is already one in the file). 74 | * Note: You can stop the downloading process with ctrl+c whenever you think the time span is enough (downloading all the commits and diff files is time-consuming; you can first try a short time span). To continue the downloading process, simply run `get_commit.py` again (the URL in `commit_url.txt` is updated automatically; you can check `url_log.txt`). If the program stops (mostly due to network issues or too many requests), also run `get_commit.py` again to continue. 75 | 76 | The output will be `commit_info.txt`, which contains basic info about each commit; the corresponding diff files will be downloaded to `DIFF_FILE_PATH`. 77 | 78 | 3. Run 79 | ```bash 80 | python3 commit_selection.py 81 | ``` 82 | to automatically select commits that touch configuration. The output will be `commit_selected.txt`, which has structured info for each selected commit; the info contains hints about how the diff touches configuration. **By searching those hints in the diff file, one can quickly locate and understand the configuration change.** 83 | 84 | The detailed methodologies are described in the submitted paper. 85 | One method of selecting configuration-related commits is to use regular expressions to capture configuration-related code patterns.
For example, one of the regular expressions used for HDFS is: 86 | ~~~ 87 | HDFS_CONFIG_LOAD_FUNC_RE = '[a-zA-Z\.\_\-]*[cC]onf[ig]*[\(\)]*[\s]*[\n]*[\s]*\.[\s]*[\n]*[\s]*get[a-zA-Z]*\([^;<>]+\)' 88 | ~~~ 89 | 90 | This regular expression can find commits like [HDFS-13607](https://github.com/apache/hadoop/commit/c81ac2ff0220b180cd6cbbf18221290c3783bfd5), which adds a new parameter `dfs.journalnode.edit-cache-size.bytes`, by matching the following code snippet: 91 | 92 | ~~~ 93 | + capacity = conf.getInt(DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_KEY, 94 | + DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_DEFAULT); 95 | ~~~ 96 | 97 | The complete output info of this commit is shown below. 98 | 99 | ~~~ 100 | HDFS-13607. [SBN read] Edit Tail Fast Path Part 1 //commit title 101 | https://github.com/apache/hadoop/commit/c81ac2ff0220b180cd6cbbf18221290c3783bfd5 //commit link 102 | 2018-05-09T22:40:07Z //commit time 103 | Commit message touches config:False //whether the commit message touches the "config" keyword 104 | Diff touches config define:True //whether diff touches config define 105 | Diff touches config loading:True //whether diff touches config load 106 | Diff touches config setting:False //whether diff touches config set 107 | Diff touches config variable (data flow):True //whether diff touches config variable 108 | Diff touches config message:False //whether diff touches messages that have the "config" keyword 109 | 110 | _________________touchedConfigDefine_____________________ 111 | 112 | +hdfs-default.xml 113 | 114 | ___________________touchedConfigLoad___________________ 115 | 116 | +conf.getInt(DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_KEY,DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_DEFAULT) 117 | 118 | ___________________touchedConfigSet____________________ 119 | 120 | Null 121 | 122 | ___________________touchedConfigVariable_____________________ 123 | 124 | +capacity JournaledEditsCache.java 125 | 126 | ___________________touchedMessage_____________________ 127 | 128 | Null 129 | ~~~ 130 | 131 | We provide a [demo](https://github.com/xlab-uiuc/open-cevo/tree/main/code/hdfs_demo_examples) (the above case is included in it) for all the HDFS commit examples in [commit_study.md (tutorial)](https://github.com/xlab-uiuc/open-cevo/blob/main/commit_study.md); they are HDFS-13607, HDFS-12291, HDFS-12412, HDFS-11998, HDFS-12716, HDFS-11576 and HDFS-12603. Run `commit_selection.py` in `/code/hdfs_demo_examples` to see `commit_selected.txt` and the structured info for each commit. 132 | ~~~bash 133 | cd code/hdfs_demo_examples 134 | python3 commit_selection.py 135 | ~~~ 136 | 137 | We implement software-specific regular expressions, which can be found in `diff_file_parser.py` in each software subdirectory. All the regular expressions are carefully crafted based on a pilot study of configuration-related commits of the target software projects. 138 | 139 | ### 2.2 Commit Study 140 | 141 | We validate, analyze and categorize each commit based on the commit log and diff, as well as the corresponding JIRA or GitHub Issues, as described in the paper. Our categorization is based on the taxonomy of Figure 1 and Table II of the submission. This step is currently manual, without program automation. We provide a [tutorial](https://github.com/xlab-uiuc/open-cevo/blob/main/commit_study.md) that contains concrete code examples for every category.
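Because the records in `commit_selected.txt` are delimited by long runs of `#` characters, they can also be split programmatically before the manual pass. The snippet below is a minimal sketch (not part of the artifact scripts); it assumes `commit_selection.py` has already produced `commit_selected.txt` in the current directory:

~~~python
#!/usr/bin/python3
import re

# Minimal sketch (not part of the artifact scripts): split commit_selected.txt
# into per-commit records so each candidate commit can be reviewed in turn.
with open('commit_selected.txt') as f:
    text = f.read()

# Records are separated by long runs of '#' characters.
for record in re.split(r'#{10,}', text):
    record = record.strip()
    if not record:
        continue
    # The first lines of a record are the commit title, URL, and time.
    print(record.split('\n')[0])
    print('-' * 40)
~~~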
142 | 143 | **Note that one commit can touch several categories; we study it in each category.** 144 | 145 | We also analyze the JIRA issues or GitHub Pull Requests (PRs) linked with each commit, which provide more background and context information about the commit. 146 | 147 | All the commits in our study are linked to JIRA issues or GitHub PRs. 148 | 149 | ## 3. Reusability 150 | 151 | ### 3.1 Extending our study to a longer time span 152 | 153 | We provide our script to select commits related to configuration, which can be reused for large-scale studies (e.g., a longer time span). 154 | 155 | To do so, please change the `url` in `code/'software'/commit_url.txt` to the corresponding commit you want to start with. Our script will crawl **older** commits based on this. For example, to crawl commits of `HBase` before `Dec. 25, 2020`, one can do: 156 | ```bash 157 | echo "https://github.com/apache/hbase/commits/master?before=0f868da05d7ffabe4512a0cae110ed097b033ebf+35&branch=master" > code/hbase/commit_url.txt 158 | ``` 159 | 160 | ### 3.2 Extending our study to other software projects 161 | 162 | The main idea for selecting configuration-related commits is text-based regular-expression matching. We show the regexes we used in `diff_file_parser.py` in each 163 | `code/'software'` folder. One can reuse and tweak the scripts for other software projects. We suggest testing the regexes using [regex101](https://regex101.com); a minimal offline check is also sketched at the end of this README. 164 | 165 | 166 | You will need to modify: 167 | - ```bash 168 | cd code 169 | mkdir other_software 170 | cp -r hbase/* other_software/ 171 | ``` 172 | - change `commit_url.txt`, using the GitHub commits page URL of that software. 173 | - change the **regular expression** global variables in `diff_file_parser.py` so that they are specific to the target software project. 174 | 175 | ### 3.3 Follow-up analysis based on the commits in this artifact 176 | 177 | We provide a [tutorial](https://github.com/xlab-uiuc/open-cevo/blob/main/commit_study.md) that explains our taxonomy/categorization to help follow-up studies.
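As a complement to regex101, a regex can also be checked offline before crawling. The snippet below is a minimal sketch (not part of the artifact scripts); it applies the HDFS load-function regex from Section 2.1 to the HDFS-13607 code snippet quoted there:

~~~python
#!/usr/bin/python3
import re

# The HDFS configuration-load regex from Section 2.1 (written as a raw string).
HDFS_CONFIG_LOAD_FUNC_RE = r'[a-zA-Z\.\_\-]*[cC]onf[ig]*[\(\)]*[\s]*[\n]*[\s]*\.[\s]*[\n]*[\s]*get[a-zA-Z]*\([^;<>]+\)'

# The HDFS-13607 diff hunk quoted in Section 2.1.
hunk = '''+ capacity = conf.getInt(DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_KEY,
+ DFSConfigKeys.DFS_JOURNALNODE_EDIT_CACHE_SIZE_DEFAULT);'''

# Print every configuration-load call that the regex captures in the hunk.
for match in re.findall(HDFS_CONFIG_LOAD_FUNC_RE, hunk):
    print(match)
~~~

If the regex works, the single match spans the whole `conf.getInt(...)` call, including the line continuation.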
178 | -------------------------------------------------------------------------------- /commit_analysis/param_rename.csv: -------------------------------------------------------------------------------- 1 | ,Parameter,commit-URL,Why 2 | SPARK-26060,spark.sql.legacy.execution.setCommandRejectsSparkConfs,https://github.com/apache/spark/commit/1ab3d3e474ce2e36d58aea8ad09fb61f0c73e5c5#,/ 3 | SPARK-4502,spark.sql.nestedSchemaPruning.enabled,https://github.com/apache/spark/commit/76399d75e23f2c7d6c2a1fb77a4387c5e15c809b#,/ 4 | SPARK-29753,spark.sql.defaultCatalog,https://github.com/apache/spark/commit/942753a44beeae5f0142ceefa307e90cbc1234c,/ 5 | SPARK-27760,spark.driver.resource.{resourceName}.count,https://github.com/apache/spark/commit/d30284b5a51dd784f663eb4eea37087b35a54d00#,Change to allow future usage of containing both a count and a unit 6 | SPARK-27760,spark.executor.resource.{resourceName}.count,https://github.com/apache/spark/commit/d30284b5a51dd784f663eb4eea37087b35a54d00#,Change to allow future usage of containing both a count and a unit 7 | SPARK-27760,spark.task.resource.{resourceName}.count,https://github.com/apache/spark/commit/d30284b5a51dd784f663eb4eea37087b35a54d00#,Change to allow future usage of containing both a count and a unit 8 | SPARK-27687,spark.kafka.consumer.cache.capacity,https://github.com/apache/spark/commit/efa303581ac61d6f517aacd08883da2d01530bd2#,consistent naming convention 9 | HDFS-12114,hadoop.httpfs.http.port,https://github.com/apache/hadoop/commit/ac0a04a6e165920a6d43c2aa3dab06ca38f3135b#,consistent naming convention 10 | HDFS-12114,hadoop.httpfs.http.host,https://github.com/apache/hadoop/commit/ac0a04a6e165920a6d43c2aa3dab06ca38f3135b#,consistent naming convention 11 | HDFS-12114,hadoop.httpfs.http.administrators,https://github.com/apache/hadoop/commit/ac0a04a6e165920a6d43c2aa3dab06ca38f3135b#,consistent naming convention 12 | HDFS-12114,hadoop.httpfs.ssl.enabled,https://github.com/apache/hadoop/commit/ac0a04a6e165920a6d43c2aa3dab06ca38f3135b#,consistent naming convention 13 | SPARK-24626,spark.sql.statistics.parallelFileListingInStatsComputation.enabled,https://github.com/apache/spark/commit/4193c7623b92765adaee539e723328ddc9048c09#,consistent naming convention 14 | SPARK-19724,spark.sql.allowCreatingManagedTableUsingNonemptyLocation,https://github.com/apache/spark/commit/4a11209539130c6a075119bf87c5ad854d42978e#,consistent naming convention 15 | SPARK-23549,spark.sql.legacy.compareDateTimestampInTimestamp,https://github.com/apache/spark/commit/411ecc365ea62aef7a29d8764e783e6a58dbb1d5#,consistent naming convention 16 | SPARK-24157,spark.sql.streaming.noDataMicroBatchesEnabled,https://github.com/apache/spark/commit/936c920347e196381b48bc3656ca81a06f2ff46d#,consistent naming convention 17 | SPARK-24324,spark.sql.legacy.execution.pandas.groupedMap.assignColumnsByName,https://github.com/apache/spark/commit/8c2edf46d0f89e5ec54968218d89f30a3f8190bc#,consistent naming convention 18 | SPARK-22159,spark.sql.execution.arrow.enabled,https://github.com/apache/spark/commit/d29d1e87995e02cb57ba3026c945c3cd66bb06e2#,consistent naming convention 19 | SPARK-22159,spark.sql.codegen.aggregate.map.twolevel.enabled,https://github.com/apache/spark/commit/af8a34c787dc3d68f5148a7d9975b52650bb7729#,consistent naming convention 20 | HDFS-12214,dfs.storage.policy.satisfier.enabled,https://github.com/apache/hadoop/commit/ac0a04a6e165920a6d43c2aa3dab06ca38f3135b#,consistent naming convention 21 | 
HDFS-12438,dfs.datanode.ec.reconstruction.threads,https://github.com/apache/hadoop/commit/e12f3e85bde0e7e83142b383a45c4ea945dfd64e#,consistent naming convention 22 | SPARK-28339,spark.sql.runtime.reoptimization.enabled,https://github.com/apache/spark/commit/3f375c850b5a41ae1ca5deb84fdcea667c32a03,consistent naming convention 23 | SPARK-27959,spark.yarn.am.resource.{resource-type}.amount,https://github.com/apache/spark/commit/43d68cd4ff84530c3d597f07352984225ab1db7,consistent naming convention 24 | SPARK-27959,spark.yarn.driver.resource.{resource-type}.amount,https://github.com/apache/spark/commit/43d68cd4ff84530c3d597f07352984225ab1db7,consistent naming convention 25 | SPARK-27959,spark.yarn.executor.resource.{resource-type}.amount,https://github.com/apache/spark/commit/43d68cd4ff84530c3d597f07352984225ab1db7,consistent naming convention 26 | SPARK-9853,spark.sql.adaptive.shuffle.reducePostShufflePartitions.enabled,https://github.com/apache/spark/commit/8616109061efc5b23b24bb9ec4a3c0f2745903c,consistent naming convention 27 | SPARK-9853,spark.sql.adaptive.shuffle.minNumPostShufflePartitions,https://github.com/apache/spark/commit/8616109061efc5b23b24bb9ec4a3c0f2745903c,consistent naming convention 28 | SPARK-9853,spark.sql.adaptive.shuffle.maxNumPostShufflePartitions,https://github.com/apache/spark/commit/8616109061efc5b23b24bb9ec4a3c0f2745903c,consistent naming convention 29 | SPARK-9853,spark.sql.adaptive.shuffle.optimizedLocalShuffleReader.enabled,https://github.com/apache/spark/commit/8616109061efc5b23b24bb9ec4a3c0f2745903c,consistent naming convention 30 | SPARK#26694,spark.sql.analyzer.failAmbiguousSelfJoin.enabled,https://github.com/apache/spark/commit/e271664a01fd7dee63391890514d76262cad1bc,consistent naming convention 31 | SPARK-30060,spark.metrics.appStatusSource.enabled,https://github.com/apache/spark/commit/60f20e5ea2000ab8f4a593b5e4217fd5637c5e2,consistent naming convention 32 | SPARK-30060,spark.metrics.staticSources.enabled,https://github.com/apache/spark/commit/60f20e5ea2000ab8f4a593b5e4217fd5637c5e2,consistent naming convention 33 | SPARK-25855,spark.eventLog.erasureCoding.enabled,https://github.com/apache/spark/commit/35506dced739ef16136e9f3d5d48c638899d3ce,consistent naming convention 34 | SPARK-26389,spark.sql.streaming.forceDeleteTempCheckpointLocation.enabled,https://github.com/apache/spark/commit/6d64fc2407e5b21a2db59c5213df438c74a3163,consistent naming convention 35 | HBASE-18307,hbase.netty.rpc.server.worker.count,https://github.com/apache/hbase/commit/351703455a091171a1abc90f250f52f0a7a0aaab#,feature has changed 36 | HBASE-18307,hbase.rpc.server.nativetransport,https://github.com/apache/hbase/commit/351703455a091171a1abc90f250f52f0a7a0aaab#,feature has changed 37 | HBASE-22598,hbase.server.allocator.max.buffer.count,https://github.com/apache/hbase/commit/686847cb79038d2fe91aee277f3827fbe5341b49#,feature has changed 38 | HBASE-22598,hbase.server.allocator.buffer.size,https://github.com/apache/hbase/commit/686847cb79038d2fe91aee277f3827fbe5341b49#,feature has changed 39 | HBASE-16894,hbase.mapreduce.input.autobalance,https://github.com/apache/hbase/commit/16d483f9003ddee71404f37ce7694003d1a18ac4#,feature has changed 40 | HBASE-19768,hbase.wal.async.create.retries,https://github.com/apache/hbase/commit/c554340a91e24cdc86e25efd87c46430ec1ec673#,feature has changed 41 | HBASE-22301,hbase.regionserver.hlog.roll.on.sync.ms,https://github.com/apache/hbase/commit/47b4ab7b9732b790b2b471c489f670093e64ad2c#,feature has changed 42 | 
HBASE-22301,hbase.regionserver.hlog.slowsync.ms,https://github.com/apache/hbase/commit/47b4ab7b9732b790b2b471c489f670093e64ad2c#,feature has changed 43 | HBASE-22301,hbase.regionserver.hlog.sync.timeout,https://github.com/apache/hbase/commit/47b4ab7b9732b790b2b471c489f670093e64ad2c#,feature has changed 44 | HBASE-22547,hbase.server.allocator.pool.enabled,https://github.com/apache/hbase/commit/2e414360bd7aee15769eb46a00b2fa108b3bcbb5#,feature has changed 45 | HBASE-22547,hbase.server.allocator.minimal.allocate.size,https://github.com/apache/hbase/commit/2e414360bd7aee15769eb46a00b2fa108b3bcbb5#,feature has changed 46 | CASSANDRA-13530,commitlog_sync_batch_window_in_ms,https://github.com/apache/cassandra/commit/f3f90c1896eab4f3fb5507b0cf348e2f149db5d1#,feature has changed 47 | SPARK-28741,spark.sql.arithmeticOperations.failOnOverFlow,https://github.com/apache/spark/commit/8258660f673f8b57a3cdd79ecd57c79df5554e3,feature has changed 48 | SPARK-29893,spark.sql.adaptive.shuffle.localShuffleReader.enabled,https://github.com/apache/spark/commit/6e581cf164c3a2930966b270ac1406dc1195c94,feature has changed 49 | SPARK-29412,spark.sql.catalog.session,https://github.com/apache/spark/commit/9407fba0375675d6ee6461253f3b8230e8d6750,feature has changed 50 | HDFS-14845,httpfs.authentication.kerberos.keytab,https://github.com/apache/hadoop/commit/3f89084ac756c9296d412821d76ff2bee57d0c2,feature has changed 51 | HDFS-14845,httpfs.authentication.signature.secret.file,https://github.com/apache/hadoop/commit/3f89084ac756c9296d412821d76ff2bee57d0c2,feature has changed 52 | HDFS-14845,httpfs.authentication.kerberos.principal,https://github.com/apache/hadoop/commit/3f89084ac756c9296d412821d76ff2bee57d0c2,feature has changed 53 | HDFS-14845,httpfs.authentication.type,https://github.com/apache/hadoop/commit/3f89084ac756c9296d412821d76ff2bee57d0c2,feature has changed 54 | HBASE-22610,hbase.offheapcache.minblocksize,https://github.com/apache/hbase/commit/06f5c43de340da62e765a753c10caba5465eeae,feature has changed 55 | SPARK-25372,spark.yarn.keytab spark.yarn.principal,https://github.com/apache/spark/commit/51540c2fa677658be954c820bc18ba748e4c8583#,not precise 56 | SPARK-26766,spark.yarn.access.namenodes,https://github.com/apache/spark/commit/d0443a74d185ec72b747fa39994fa9a40ce974cf#,not precise 57 | SPARK-26766,spark.yarn.access.hadoopFileSystems,https://github.com/apache/spark/commit/d0443a74d185ec72b747fa39994fa9a40ce974cf#,not precise 58 | HDFS-14142,IPFAILOVER_CONFIG_PREFIX,https://github.com/apache/hadoop/commit/b8ad6c85a549a6f17cf6675e58ef002d84059d3c#,not precise 59 | SPARK-22233,spark.hadoopRDD.ignoreEmptySplits,https://github.com/apache/spark/commit/0fa10666cf75e3c4929940af49c8a6f6ea874759#,not precise 60 | SPARK-22807,spark.kubernetes.driver.container.image,https://github.com/apache/spark/commit/fb3636b482be3d0940345b1528c1d5090bbc25e6#,not precise 61 | SPARK-22807,spark.kubernetes.executor.container.image,https://github.com/apache/spark/commit/fb3636b482be3d0940345b1528c1d5090bbc25e6#,not precise 62 | SPARK-22807,spark.kubernetes.container.image.pullPolicy,https://github.com/apache/spark/commit/fb3636b482be3d0940345b1528c1d5090bbc25e6#,not precise 63 | SPARK-29807,spark.sql.ansi.enabled,https://github.com/apache/spark/commit/40ea4a11d7f1534023669f0b81faf5d398174e4,not precise 64 | HBASE-22776,hbase.user.scan.snapshot.common.directory.permission,https://github.com/apache/hbase/commit/0e5dc6d7cee92524bf648b6f49d1565e098e5bc,not precise 65 | 
HBASE-22776,hbase.user.scan.snapshot.enable,https://github.com/apache/hbase/commit/0e5dc6d7cee92524bf648b6f49d1565e098e5bc,not precise 66 | HBASE-22776,hbase.user.scan.snapshot.thread.number,https://github.com/apache/hbase/commit/0e5dc6d7cee92524bf648b6f49d1565e098e5bc,not precise 67 | SPARK-20101,spark.sql.columnVector.offheap.enabled,https://github.com/apache/spark/commit/572af5027e45ca96e0d283a8bf7c84dcf476f9bc#,typo 68 | SPARK-21127,spark.sql.statistics.size.autoUpdate.enabled,https://github.com/apache/spark/commit/d5202259d9aa9ad95d572af253bf4a722b7b437a#,typo 69 | SPARK-26082,spark.mesos.fetcherCache.enable,https://github.com/apache/spark/commit/d5202259d9aa9ad95d572af253bf4a722b7b437a#,typo 70 | SPARK-27215,spark.kryo.unsafe,https://github.com/apache/spark/commit/93c6d2a198d1b3070eea32210042873c68d0d5f7#,typo 71 | SPARK-27215,spark.kryo.pool,https://github.com/apache/spark/commit/93c6d2a198d1b3070eea32210042873c68d0d5f7#,typo 72 | HDFS-12404,dfs.namenode.authorization.provider.bypass.users,https://github.com/apache/hadoop/commit/3b3be355b35d08a78d9dcd647650812a2d28207b#,typo -------------------------------------------------------------------------------- /config_commits/cassandra.csv: -------------------------------------------------------------------------------- 1 | Issue ID,Commit Link,Title 2 | CASSANDRA-11097,https://github.com/apache/cassandra/commit/0240a4659d761f06f94f8cd97097f2d0ad2d220c,Introduce optional timeouts for idle client sessions 3 | CASSANDRA-13006,https://github.com/apache/cassandra/commit/02aba7343ce300397ab672bbb1788aa8182d8a48,Rely on the JVM to handle OutOfMemoryErrors 4 | CASSANDRA-13987,https://github.com/apache/cassandra/commit/05cb556f90dbd1929a180254809e05620265419b,More frequent commitlog chained markers 5 | CASSANDRA-14798,https://github.com/apache/cassandra/commit/0766f7e54182d04ecf5a15a732f5ec7951d62326,Improve wording around partitioner selection 6 | CASSANDRA-13625,https://github.com/apache/cassandra/commit/082af0a9ba6b5dde26055fcb9ddd2085e4240381,Remove unused max_value_size_in_mb config setting 7 | CASSANDRA-14314,https://github.com/apache/cassandra/commit/11496039fb18bb45407246602e31740c56d28157,Correct and clarify SSLFactory.getSslContext method and call site 8 | CASSANDRA-13656,https://github.com/apache/cassandra/commit/12d4e2f189fb228250edc876963d0c74b5ab0d4f,Change default start_native_transport to true and remove from jvm.options 9 | CASSANDRA-13418,https://github.com/apache/cassandra/commit/14d67d81c57d6387c77bd85c57b342d285880835,Allow to skip overlapings checks 10 | CASSANDRA-14991,https://github.com/apache/cassandra/commit/16ef9ac37c21c4f9091cd1f3658e54abddab8ad8,SSL Cert Hot Reloading should check for sanity of the new keystore/truststore before loading it 11 | CASSANDRA-14580,https://github.com/apache/cassandra/commit/176d4bac22c356c80e275dcb4040bc5cbd0da1c2,Make PeriodicCommitLogService.blockWhenSyncLagsNanos configurable 12 | CASSANDRA-14275,https://github.com/apache/cassandra/commit/19d26bcb80219bce0089fbe8942a34e3a331fd17,Add ability to specify driver name and version 13 | CASSANDRA-14303,https://github.com/apache/cassandra/commit/1f19d5f7a243cc4227da923459f5eb2f66066778,Auto-expand replication_factor for NetworkTopologyStrategy 14 | CASSANDRA-15202,https://github.com/apache/cassandra/commit/2117e2af00603f5fb2181e53dbcba190b2eab861,Make repair coordination less expensive by moving MerkleTrees off heap 15 | CASSANDRA-14173,https://github.com/apache/cassandra/commit/28ee665b3c0c9238b61a871064f024d54cddcc79,Remove dependencies on 
JVM internals for JMX support 16 | CASSANDRA-13910,https://github.com/apache/cassandra/commit/2fcd29b830e7b201e7047d283de385d5f1c427b5,Eliminate background repair and probablistic read_repair_chance table option 17 | CASSANDRA-14938,https://github.com/apache/cassandra/commit/3ddfbc8f5871c78bde26e96a936e96deeccb4366,Add specialized IndexRegistry for offline tools/clients 18 | CASSANDRA-14372,https://github.com/apache/cassandra/commit/42827e6a6709c4ba031e0a137a3bab257f88b54f,Yaml comments: data_file_directories distributes data evenly by partitioning its token ranges. 19 | CASSANDRA-13518,https://github.com/apache/cassandra/commit/428eaa3e37cab7227c81fdf124d29dfc1db4257c,Add storage port options to sstableloader 20 | CASSANDRA-14566,https://github.com/apache/cassandra/commit/47a12c52a313258307ab88392f75c5866d9a2bb1,Stream entire SSTables when possible 21 | CASSANDRA-15007,https://github.com/apache/cassandra/commit/47d4971b56d97ba8a528f7c17bfd6b11f1ababa3,Fix SimpleStrategy option validation 22 | CASSANDRA-14352,https://github.com/apache/cassandra/commit/4991ca26aa424286ebdee89742d35e813f9e9259,Clean up parsing speculative retry params from string 23 | CASSANDRA-12245,https://github.com/apache/cassandra/commit/4c80eeece37d79f434078224a0504400ae10a20d,Parallelize initial materialized view build 24 | CASSANDRA-14084,https://github.com/apache/cassandra/commit/50e6e721b2a81da7f11f60a2fa405fd46e5415d4,Fix imbalanced disks when replacing node with same address with JBOD 25 | CASSANDRA-14226,https://github.com/apache/cassandra/commit/518ddbf9d21491d341a3d7e2f2a2e65409595e07,Better document in code InetAddressAndPort usage post 7544 26 | CASSANDRA-12526,https://github.com/apache/cassandra/commit/53c0ef171424454c47d64a9326b0ba83cd743a50,Bump SSTable level instead of rewriting SSTable completely during single-sstable compactions 27 | CASSANDRA-13985,https://github.com/apache/cassandra/commit/54de771e643e9cc64d1f5dd28b5de8a9a91a219e,Add network auth 28 | CASSANDRA-7544,https://github.com/apache/cassandra/commit/59b5b6bef0fa76bf5740b688fcd4d9cf525760d0,Allow storage port to be configurable per node 29 | CASSANDRA-15013,https://github.com/apache/cassandra/commit/5a03898c680ed6ada63901e8a4b278ccc8070717,Prevent client requests from blocking on executor task queue 30 | CASSANDRA-13897,https://github.com/apache/cassandra/commit/5b23054f10f4d6553e8dacbf53bd59e552f2a031,Round buffer size to powers of 2 for the chunk cache 31 | CASSANDRA-14467,https://github.com/apache/cassandra/commit/5d8767765090cd968c39008f76b0cd795d6e5032,Add option to sanity check tombstones on reads/compaction 32 | CASSANDRA-14145,https://github.com/apache/cassandra/commit/5fbb938adaafd91e7bea1672f09a03c7ac5b9b9d,Detect inconsistencies in repaired data on the read path 33 | CASSANDRA-13614,https://github.com/apache/cassandra/commit/613a8b43d2b5a425080653898b28bde6cd7eb9ba,Add 'nodetool getbatchlogreplaythrottle' and 'nodetool setbatchlogreplaythrottle' 34 | CASSANDRA-13594,https://github.com/apache/cassandra/commit/62d39f6544e3fbcbc268aecbb3a46950dcba2bf0,Use an ExecutorService for repair commands instead of new Thread(..).start() 35 | CASSANDRA-14373,https://github.com/apache/cassandra/commit/6e00ab956eb0148a74e926666862e4cc78936301,Allow using custom script for chronicle queue BinLog archival 36 | CASSANDRA-14659,https://github.com/apache/cassandra/commit/7b61b0be88ef1fcc29646ae8bdbb05da825bc1b2,Disable old native protocol versions on demand 37 | 
CASSANDRA-14654,https://github.com/apache/cassandra/commit/7df67eff2d66dba4bed2b4f6aeabf05144d9b057,Reduce heap pressure during compactions 38 | CASSANDRA-15002,https://github.com/apache/cassandra/commit/7f634feb7cf1fdb135133946ffd75efa681b8cb7,Avoid leaking threads when remote nodes fail anticompaction and rate limit anticompactions 39 | CASSANDRA-14297,https://github.com/apache/cassandra/commit/801cb70ee811c956e987718a00695638d5bec1b6,Startup checker should wait for count rather than percentage 40 | CASSANDRA-14225,https://github.com/apache/cassandra/commit/834f2a6ecdb8974839762bf4e9c5fed32163f9c8,Fix comparison of address and port for repair and messages 41 | CASSANDRA-14153,https://github.com/apache/cassandra/commit/8587b0ceb47fa54308dfa9b0bfdc320e6afdc311,Delete temp test files on exit 42 | CASSANDRA-13299,https://github.com/apache/cassandra/commit/8ef71f3f29fb040cce18ba158ff5f289b388c30b,Throttle base partitions during MV repair streaming to prevent OOM 43 | CASSANDRA-13651,https://github.com/apache/cassandra/commit/96ef514917e5a4829dbe864104dbc08a7d0e0cec,Remove Netty timed batching and instead do the batch during next eventLoop invocation after a write has been enqueued. 44 | CASSANDRA-15019,https://github.com/apache/cassandra/commit/99ce007c5beb7988ce83fb1443a1e0ca259264cc,Correctly set repaired data tracking flag on range commands 45 | CASSANDRA-13622,https://github.com/apache/cassandra/commit/a586f6c88dab173663b765261d084ed8410efe81,Improve config validation and documentation on overflow and NPE 46 | CASSANDRA-14525,https://github.com/apache/cassandra/commit/a6196a3a79b67dc6577747e591456328e57c314f,Do not enable native transport if bootstrap is pending 47 | CASSANDRA-14435,https://github.com/apache/cassandra/commit/a79e5903b552e40f77c151e23172f054ffb7f39e,Add JMX query support for diagnostic events 48 | CASSANDRA-13983,https://github.com/apache/cassandra/commit/ae837806bd07dbb8b881960feeeeb90c1a665d93,Support a means of logging all queries as they were invoked. 
49 | CASSANDRA-12014,https://github.com/apache/cassandra/commit/ae88fd6c79b066f12ad76c2c1bfc1620d86bdbc5,Avoid assertion error when IndexSummary > 2G 50 | CASSANDRA-14092,https://github.com/apache/cassandra/commit/b2949439ec62077128103540e42570238520f4ee,Protect against overflow of local expiration time 51 | CASSANDRA-13740,https://github.com/apache/cassandra/commit/b2f6ce961f38a3e4cd744e102026bf7a471056c9,Delay hints store excise by write timeout to avoid race with decommission 52 | CASSANDRA-14096,https://github.com/apache/cassandra/commit/b30c8c98a594a5682f6ea1f0b5511463b700b6e8,Improve merkle tree size and time on heap 53 | CASSANDRA-14855,https://github.com/apache/cassandra/commit/b82a42fd9ae99dc115ec04339f4265096bb45044,Disable immediate flusher by default for cassandra-3.0 and cassandra-3.11 54 | CASSANDRA-13993,https://github.com/apache/cassandra/commit/b86801e95a58c5f1a9c779b21fa57136e0225d61,Add optional startup delay to wait until peers are ready 55 | CASSANDRA-13959,https://github.com/apache/cassandra/commit/b8697441d7a051e7ff68def6aa9cf14bd92ace9e,"Add flag to disable materialized views, and warnings on creation" 56 | CASSANDRA-14800,https://github.com/apache/cassandra/commit/bd0cef9a369ae9245b45040796a6e10f51e522ce,Avoid using DatabaseDescriptor in ProtocolVersion 57 | CASSANDRA-14358,https://github.com/apache/cassandra/commit/bfbc5274f2b3a5af2cbbe9679f0e78f1066ef638,Partitioned outbound internode TCP connections can occur when nodes restart 58 | CASSANDRA-13884,https://github.com/apache/cassandra/commit/c22ee2bd451d030e99cfb65be839bbc735a5352f,Add sstableloader OPTION to accept target keyspace name 59 | CASSANDRA-15059,https://github.com/apache/cassandra/commit/c3ce32e239b1ba41faf1d58a942465b9bf45b986,Fix assorted gossip races and add related runtime checks 60 | CASSANDRA-3200,https://github.com/apache/cassandra/commit/cb56d9fc3c773abbefa2044ce41ddbfb7717e0cb,Add option to optimize Merkle tree comparison across replicas 61 | CASSANDRA-14716,https://github.com/apache/cassandra/commit/cdeac4992bdb1f569c3a04b628ded7e5351364ee,Make CONTENT_CHECKSUM protocol OPTION values case insensitive 62 | CASSANDRA-13699,https://github.com/apache/cassandra/commit/cf4a0576a6f2b8f2d828a8b14140f212803adb7c,Allow to set batch_size_warn_threshold_in_kb via JMX 63 | CASSANDRA-14197,https://github.com/apache/cassandra/commit/d14a9266c7ddff0589fdbe7a1836217b8bb8b394,Automatic sstable upgrades 64 | CASSANDRA-9375,https://github.com/apache/cassandra/commit/d2dcd7f884cc997905c820d7cef8c9fc886ff4f7,force minumum timeout value 65 | CASSANDRA-14134,https://github.com/apache/cassandra/commit/d6e508f33c1a7274b5826ad9d5ce814d719bd848,Migrate dtests to use pytest and python3 66 | CASSANDRA-14108,https://github.com/apache/cassandra/commit/db788fe860dfd69f06ab97ae35fa67fcf2517b6d,Improve commit log chain marker updating 67 | CASSANDRA-14482,https://github.com/apache/cassandra/commit/dccf53061a61e7c632669c60cd94626e405518e9,ZSTD Compressor support in Cassandra 68 | CASSANDRA-14081,https://github.com/apache/cassandra/commit/df51d0cbbaaa99aea9bc2a582f788f9170dbdc03,Remove unused and deprecated methods from AbstractCompactionStrategy 69 | CASSANDRA-14726,https://github.com/apache/cassandra/commit/e645b9172c5d50fc2af407de724e46121edfe109,ReplicaCollection follow-up 70 | CASSANDRA-14866,https://github.com/apache/cassandra/commit/e6a61be8c857106d5d99a270b2d17de9f84c4d67,"Add flag to disable SASI indexes, and warning on creation" 71 | 
CASSANDRA-13669,https://github.com/apache/cassandra/commit/ea62d8862c311e3d9b64d622bea0a68d3825aa7d,Validate supported column type with SASI analyzer 72 | CASSANDRA-13910,https://github.com/apache/cassandra/commit/eaf9bf18b2ec50713170a9ca472c34586b17a5a3,Deprecate background repair and probablistic read_repair_chance table option 73 | CASSANDRA-13975,https://github.com/apache/cassandra/commit/f1e850a492126572efc636a6838cff90333806b9,Add flag to allow dropping oversized read repair mutations 74 | CASSANDRA-14821,https://github.com/apache/cassandra/commit/f22fec927de7ac291266660c2f34de5b8cc1c695,Introduce in-jvm distributed tests 75 | CASSANDRA-13530,https://github.com/apache/cassandra/commit/f3f90c1896eab4f3fb5507b0cf348e2f149db5d1,Add GroupCommitLogService 76 | CASSANDRA-14498,https://github.com/apache/cassandra/commit/f46762eeca9f5d7e32e731573a8c3e521b70fc05,Audit log allows system keyspaces to be audited via configuration options 77 | CASSANDRA-12151,https://github.com/apache/cassandra/commit/f56871b88be1e8965f166769c12cfa43313bac74,Audit logging for database activity 78 | CASSANDRA-14404,https://github.com/apache/cassandra/commit/f7431b432875e334170ccdb19934d05545d2cebd,Transient Replication and Cheap Quorums 79 | CASSANDRA-14619,https://github.com/apache/cassandra/commit/f83bd5ac2bbc6755213a6ad0675e7e5400c79670,Add fqltool compare 80 | CASSANDRA-13664,https://github.com/apache/cassandra/commit/ff06424faccc8acedd027c71e955a38fd8ddee6c,Only optimize large ranges when figuring out where to stream from 81 | CASSANDRA-14995,https://github.com/apache/cassandra/commit/ff73c33ab78f70cd0e70280c89e8d8a46f5536d8,Clean up all javadoc related errors 82 | CASSANDRA-14855,https://github.com/apache/cassandra/commit/fff6eec2903ee85f648535dd051c9bc72631f524,Backport ImmediateFlusher to cassandra-3.0 and cassandra-3.11 83 | CASSANDRAd4054e0cf,https://github.com/apache/cassandra/commit/d4054e0cf88bdf85cbde33b6416a6eb20da876e2,"ninja: Fix ""No newline at end of file"" in c*.yaml" 84 | CASSANDRA-15260,https://github.com/apache/cassandra/commit/068d2d37c6fbdb60546821c4d408a84161fd1cb6,Add `allocate_tokens_for_local_rf` yaml option for token allocation that doesn't require keyspace knowledge/existence 85 | CASSANDRA-15193,https://github.com/apache/cassandra/commit/0388d89e29393d0b1f50baa24848bc8cb0a7c9a3,Allow max protocol version to be capped 86 | CASSANDRA-15277,https://github.com/apache/cassandra/commit/860de83a02f3b7711e842a58a073802b9920a1a1,Enable nodetool/JMX resizing of processing stage executor pool 87 | CASSANDRA-13990,https://github.com/apache/cassandra/commit/7c5904753f4ede492f1a5a5e68edfe37651a5be6,Remove obsolete OldNetworkTopologyStrategy 88 | CASSANDRA-15295,https://github.com/apache/cassandra/commit/3a8300e0b86c4acfb7b7702197d36cc39ebe94bc,Avoid deadlock during CommitLog initialization -------------------------------------------------------------------------------- /commit_analysis/change_param_constraint.csv: -------------------------------------------------------------------------------- 1 | Issue-id,Title,Parameter,Issue-URL,Commit-URL,Type,Note 2 | HBASE-18108,Procedure WALs are archived but not cleaned,hbase.master.logcleaner.plugins,https://issues.apache.org/jira/browse/HBASE-18108,https://github.com/apache/hbase/commit/023d4f1ae8081da3cb9ff54e6b2e545799704ce7#,acceptable value change: new class,The TimeToLiveProcedureWALCleaner is now added to hbase.master.logcleaner.plugins to clean the 2 WALs in one run. 
3 | CASSANDRA-14482,ZSTD Compressor support in Cassandra,commitlog_compression,https://issues.apache.org/jira/browse/CASSANDRA-14482,https://github.com/apache/cassandra/commit/dccf53061a61e7c632669c60cd94626e405518e9#,acceptable value change: new class,ZSTD Compressor support in Cassandra 4 | HBASE-19187,Remove option to create on heap bucket cache.,hbase.bucketcache.ioengine,https://issues.apache.org/jira/browse/HBASE-19187,https://github.com/apache/hbase/commit/bff619ef7b100e8b09f7f5eb0f6e289ca51de096#,acceptable value change: new mode,"Removing the on heap Bucket cache feature. The config ""hbase.bucketcache.ioengine"" no longer support the 'heap' value. Its supported values now are 'offheap', 'file:', 'files:' and 'mmap:" 5 | SPARK-24360,Support Hive 3.1 metastore,spark.sql.hive.metastore.version,https://issues.apache.org/jira/browse/SPARK-24360,https://github.com/apache/spark/commit/aeff69bd879661367367f39b5dfecd9a76223c0b#,acceptable value change: new version,Hive 3.1.1 is released. This PR aims to support Hive 3.1.x metastore. 6 | SPARK-27418,[SQL] Migrate Parquet to File Data Source V2,spark.sql.sources.write.useV1SourceList,https://issues.apache.org/jira/browse/SPARK-27418,https://github.com/apache/spark/commit/23ebd389b5cb528a7ba04113a12929bebfaf1e9a#,acceptable value change: value range,Support parquet 7 | SPARK-17788,[SQL] fix the potential OOM in UnsafeExternalSorter and ShuffleExternalSorter,spark.shuffle.spill.numElementsForceSpillThreshold,https://issues.apache.org/jira/browse/SPARK-17788,https://github.com/apache/spark/commit/079a2609d7ad0a7dd2ec3eaa594e6ed8801a8008#,type change: long -> int,"The Double values I'm trying to sort are mostly in the range [0,1] (~70% of the data which roughly equates 1 billion records), other numbers in the dataset are as high as 2000." 8 | HBASE-18511,Default no regions on master,hbase.balancer.tablesOnMaster,https://issues.apache.org/jira/browse/HBASE-18511,https://github.com/apache/hbase/commit/473446719b7b81b56216862bf2a94a576ff90f60#,type change: mode -> bool,Changes the configuration hbase.balancer.tablesOnMaster from list of table names to instead be a boolean; true if master carries tables/regions and false if it does not. 9 | CASSANDRA-13990,Remove obsolete OldNetworkTopologyStrategy,replication_strategies,https://issues.apache.org/jira/browse/CASSANDRA-13990,https://github.com/apache/cassandra/commit/7c5904753f4ede492f1a5a5e68edfe37651a5be6,acceptable value change: value range,"Removed the strategy from cqlsh autocomplete, including an array for replication_factor autocomplete that was only used for SimpleStrategy and OldNetworkTopologyStrategy." 10 | SPARK-30074,The maxNumPostShufflePartitions config should obey reducePostShufflePartitions enabled,spark.sql.adaptive.shuffle.maxNumPostShufflePartitions,https://issues.apache.org/jira/browse/SPARK-30074,https://github.com/apache/spark/commit/d1465a1b0dea690fcfbf75edb73ff9f8a015c0d,dependency,The maxNumPostShufflePartitions config should obey reducePostShufflePartitions enabled 11 | SPARK-21012,[SUBMIT] Add glob support for resources adding to Spark,spark.jars,https://issues.apache.org/jira/browse/SPARK-21012,https://github.com/apache/spark/commit/5800144a54f5c0180ccf67392f32c3e8a51119b1#,acceptable value change: glob path,"Current ""--jars (spark.jars)"", ""--files (spark.files)"", ""--py-files (spark.submit.pyFiles)"" and ""--archives (spark.yarn.dist.archives)"" only support non-glob path. 
This is OK for most of the cases, but when user requires to add more jars, files into Spark, it is too verbose to list one by one. So here propose to add glob path support for resources." 12 | SPARK-21012,[SUBMIT] Add glob support for resources adding to Spark,spark.files,https://issues.apache.org/jira/browse/SPARK-21012,https://github.com/apache/spark/commit/5800144a54f5c0180ccf67392f32c3e8a51119b1#,acceptable value change: glob path,"Current ""--jars (spark.jars)"", ""--files (spark.files)"", ""--py-files (spark.submit.pyFiles)"" and ""--archives (spark.yarn.dist.archives)"" only support non-glob path. This is OK for most of the cases, but when user requires to add more jars, files into Spark, it is too verbose to list one by one. So here propose to add glob path support for resources." 13 | SPARK-21012,[SUBMIT] Add glob support for resources adding to Spark,spark.submit.pyFiles,https://issues.apache.org/jira/browse/SPARK-21012,https://github.com/apache/spark/commit/5800144a54f5c0180ccf67392f32c3e8a51119b1#,acceptable value change: glob path,"Current ""--jars (spark.jars)"", ""--files (spark.files)"", ""--py-files (spark.submit.pyFiles)"" and ""--archives (spark.yarn.dist.archives)"" only support non-glob path. This is OK for most of the cases, but when user requires to add more jars, files into Spark, it is too verbose to list one by one. So here propose to add glob path support for resources." 14 | SPARK-21012,[SUBMIT] Add glob support for resources adding to Spark,spark.yarn.dist.archives,https://issues.apache.org/jira/browse/SPARK-21012,https://github.com/apache/spark/commit/5800144a54f5c0180ccf67392f32c3e8a51119b1#,acceptable value change: glob path,"Current ""--jars (spark.jars)"", ""--files (spark.files)"", ""--py-files (spark.submit.pyFiles)"" and ""--archives (spark.yarn.dist.archives)"" only support non-glob path. This is OK for most of the cases, but when user requires to add more jars, files into Spark, it is too verbose to list one by one. So here propose to add glob path support for resources." 
15 | SPARK-24646,[CORE] Minor change to spark.yarn.dist.forceDownloadSchemes to support wildcard '*',spark.yarn.dist.forceDownloadSchemes,https://issues.apache.org/jira/browse/SPARK-24646,https://github.com/apache/spark/commit/e2c7e09f742a7e522efd74fe8e14c2620afdb522#,acceptable value change: support *,"Minor change to spark.yarn.dist.forceDownloadSchemes to support wildcard '*', For the ease of using this configuration, here propose to add wildcard '*' support to `spark.yarn.dist.forceDownloadSchemes`" 16 | HDFS-9872,HDFS bytes-default configurations should accept multiple size unit,DFS_CLIENT_CACHE_READAHEAD,https://issues.apache.org/jira/browse/HDFS-9872,https://github.com/apache/hadoop/commit/88cce32551e6d52fd1c5a5bfd6c41499bf6ab1ab#,type change,HDFS bytes-default configurations should accept multiple size unit 17 | HDFS-9872,HDFS bytes-default configurations should accept multiple size unit,DFS_DATANODE_MAX_LOCKED_MEMORY_KEY,https://issues.apache.org/jira/browse/HDFS-9872,https://github.com/apache/hadoop/commit/88cce32551e6d52fd1c5a5bfd6c41499bf6ab1ab#,type change,HDFS bytes-default configurations should accept multiple size unit 18 | HDFS-9872,HDFS bytes-default configurations should accept multiple size unit,DFS_DATANODE_AVAILABLE_SPACE_VOLUME_CHOOSING_POLICY_BALANCED_SPACE_THRESHOLD_KEY,https://issues.apache.org/jira/browse/HDFS-9872,https://github.com/apache/hadoop/commit/88cce32551e6d52fd1c5a5bfd6c41499bf6ab1ab#,type change,HDFS bytes-default configurations should accept multiple size unit 19 | HDFS-9872,HDFS bytes-default configurations should accept multiple size unit,DFS_NAMENODE_MAX_XATTR_SIZE_KEY,https://issues.apache.org/jira/browse/HDFS-9872,https://github.com/apache/hadoop/commit/88cce32551e6d52fd1c5a5bfd6c41499bf6ab1ab#,type change,HDFS bytes-default configurations should accept multiple size unit 20 | HDFS-9872,HDFS bytes-default configurations should accept multiple size unit,DFS_NAMENODE_MAX_COMPONENT_LENGTH_KEY,https://issues.apache.org/jira/browse/HDFS-9872,https://github.com/apache/hadoop/commit/88cce32551e6d52fd1c5a5bfd6c41499bf6ab1ab#,type change,HDFS bytes-default configurations should accept multiple size unit 21 | HDFS-9872,HDFS bytes-default configurations should accept multiple size unit,DFS_NAMENODE_MIN_BLOCK_SIZE_KEY,https://issues.apache.org/jira/browse/HDFS-9872,https://github.com/apache/hadoop/commit/88cce32551e6d52fd1c5a5bfd6c41499bf6ab1ab#,type change,HDFS bytes-default configurations should accept multiple size unit 22 | HDFS-9872,HDFS bytes-default configurations should accept multiple size unit,DFS_IMAGE_TRANSFER_RATE_KEY,https://issues.apache.org/jira/browse/HDFS-9872,https://github.com/apache/hadoop/commit/88cce32551e6d52fd1c5a5bfd6c41499bf6ab1ab#,type change,HDFS bytes-default configurations should accept multiple size unit 23 | HDFS-9872,HDFS bytes-default configurations should accept multiple size unit,DFS_IMAGE_TRANSFER_BOOTSTRAP_STANDBY_RATE_KEY,https://issues.apache.org/jira/browse/HDFS-9872,https://github.com/apache/hadoop/commit/88cce32551e6d52fd1c5a5bfd6c41499bf6ab1ab#,type change,HDFS bytes-default configurations should accept multiple size unit 24 | HDFS-9872,HDFS bytes-default configurations should accept multiple size unit,DFS_NAMENODE_DU_RESERVED_KEY,https://issues.apache.org/jira/browse/HDFS-9872,https://github.com/apache/hadoop/commit/88cce32551e6d52fd1c5a5bfd6c41499bf6ab1ab#,type change,HDFS bytes-default configurations should accept multiple size unit 25 | HDFS-9872,HDFS bytes-default configurations should accept multiple size 
unit,DFS_IMAGE_TRANSFER_CHUNKSIZE_KEY,https://issues.apache.org/jira/browse/HDFS-9872,https://github.com/apache/hadoop/commit/88cce32551e6d52fd1c5a5bfd6c41499bf6ab1ab#,type change,HDFS bytes-default configurations should accept multiple size unit 26 | SPARK-27256,"If the configuration is used to set the number of bytes, we'd better use `bytesConf`",spark.sql.files.maxPartitionBytes,https://issues.apache.org/jira/browse/SPARK-27256,https://github.com/apache/spark/commit/e4b36df2c0ae3bdba4484f9f92461dbb528d8fb9#,type change: long -> byte,"Currently, if we want to configure `spark. sql. files. maxPartitionBytes` to 256 megabytes, we must set `spark. sql. files. maxPartitionBytes=268435456`, which is very unfriendly to users." 27 | SPARK-27256,"If the configuration is used to set the number of bytes, we'd better use `bytesConf`",spark.files.maxPartitionBytes,https://issues.apache.org/jira/browse/SPARK-27256,https://github.com/apache/spark/commit/e4b36df2c0ae3bdba4484f9f92461dbb528d8fb9#,type change: long -> byte,"Currently, if we want to configure `spark. sql. files. maxPartitionBytes` to 256 megabytes, we must set `spark. sql. files. maxPartitionBytes=268435456`, which is very unfriendly to users." 28 | SPARK-27256,"If the configuration is used to set the number of bytes, we'd better use `bytesConf`",spark.files.openCostInBytes,https://issues.apache.org/jira/browse/SPARK-27256,https://github.com/apache/spark/commit/e4b36df2c0ae3bdba4484f9f92461dbb528d8fb9#,type change: long -> byte,"Currently, if we want to configure `spark. sql. files. maxPartitionBytes` to 256 megabytes, we must set `spark. sql. files. maxPartitionBytes=268435456`, which is very unfriendly to users." 29 | SPARK-27256,"If the configuration is used to set the number of bytes, we'd better use `bytesConf`",spark.shuffle.sort.initialBufferSize,https://issues.apache.org/jira/browse/SPARK-27256,https://github.com/apache/spark/commit/e4b36df2c0ae3bdba4484f9f92461dbb528d8fb9#,type change: long -> byte,"Currently, if we want to configure `spark. sql. files. maxPartitionBytes` to 256 megabytes, we must set `spark. sql. files. maxPartitionBytes=268435456`, which is very unfriendly to users." 30 | SPARK-27256,"If the configuration is used to set the number of bytes, we'd better use `bytesConf`",spark.shuffle.spill.initialMemoryThreshold,https://issues.apache.org/jira/browse/SPARK-27256,https://github.com/apache/spark/commit/e4b36df2c0ae3bdba4484f9f92461dbb528d8fb9#,type change: long -> byte,"Currently, if we want to configure `spark. sql. files. maxPartitionBytes` to 256 megabytes, we must set `spark. sql. files. maxPartitionBytes=268435456`, which is very unfriendly to users." 31 | SPARK-22845,Modify spark.kubernetes.allocation.batch.delay to take time instead of int,spark.kubernetes.allocation.batch.delay,https://issues.apache.org/jira/browse/SPARK-22845,https://github.com/apache/spark/commit/0114c89d049724b95f7823b957bf33790216316b#,type change: long -> time,Fixing configuration that was taking an int which should take time. Made the granularity milliseconds as opposed to seconds since there's a use-case for sub-second reactions to scale-up rapidly especially with dynamic allocation. 
32 | SPARK-29151,Support fractional resources for task resource scheduling,spark.task.resource.{resourceName}.amount,https://issues.apache.org/jira/browse/SPARK-29151,https://github.com/apache/spark/commit/3cb18d90c441bbaa64c693e276793b670213e59,acceptable value change: support fractional,There is a configuration change where `spark.task.resource.[resource type].amount` can now be fractional. 33 | HDFS-14719,Correct the safemode threshold value in BlockManagerSafeMode.,DFS_NAMENODE_REPL_QUEUE_THRESHOLD_PCT_KEY,https://issues.apache.org/jira/browse/HDFS-14719,https://github.com/apache/hadoop/commit/34681643e92774da6f74826c468ecec4dcbedf5,type change: double -> float,"BlockManagerSafeMode is doing wrong parsing for safemode threshold. It is storing float value in double, which will give different result some time." 34 | HDFS-14158,Checkpointer ignores configured time period > 5 minutes,dfs.namenode.checkpoint.period,https://issues.apache.org/jira/browse/HDFS-14158,https://github.com/apache/hadoop/commit/9aa3dc872ca9a528cb98ef56d9a33ab9d4531aa1#,acceptable value change: value range,"Are you running BackupNode? It has received little to no attention since people have not used it much for a long time. The standard way of checkpointing until the HA feature was to use secondary namenode, which has its own checks. ""periodMsec is always 5 minutes or lower"" might have been intentional and reasonable long time ago when BackupNode was first created." 35 | HDFS-12716,dfs.datanode.failed.volumes.tolerated' to support minimum number of volumes to be available.,dfs.datanode.failed.volumes.tolerated,https://issues.apache.org/jira/browse/HDFS-12716,https://github.com/apache/hadoop/commit/3108d27edde941d153a58f71fb1096cce2995531#,acceptable value change: value range,"Support 'dfs.datanode.failed.volumes.tolerated' to accept special 'negative value 'x' to tolerate failures of upto ""n-x""" 36 | SPARK-21287,Remove requirement of fetch_size>=0 from JDBCOptions,JDBC_BATCH_FETCH_SIZE,https://issues.apache.org/jira/browse/SPARK-21287,https://github.com/apache/spark/commit/92b25295ca0dc5b80aaddb1c8f8d5ef0a250d11,acceptable value change: value range,Remove the requirement of fetch_size>=0 from JDBCOptions to allow negative fetch size. 
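The Type column above uses a small fixed vocabulary (acceptable value change, type change, dependency), so the distribution of constraint-change categories can be tallied straight from the CSV. A minimal standard-library sketch follows; the path and the Issue-id/Title/Parameter/Issue-URL/Commit-URL/Type/Note header row come from this repo, but the script itself is illustrative and is not the repo's own count_num.py:

```python
# Minimal sketch: tally the "Type" annotations in change_param_constraint.csv.
# Assumes the header row shown above; illustrative only, not the repo's count_num.py.
import csv
from collections import Counter

counts = Counter()
with open('commit_analysis/change_param_constraint.csv', newline='') as f:
    for row in csv.DictReader(f):
        counts[row['Type'].strip()] += 1

# Print categories from most to least frequent.
for change_type, n in counts.most_common():
    print(f'{n:3d}  {change_type}')
```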
-------------------------------------------------------------------------------- /code/hdfs/diff_file_parser.py: -------------------------------------------------------------------------------- 1 | import re 2 | import download_diff 3 | from pathlib import Path 4 | 5 | BASE_URL = "https://github.com/apache/hadoop/commit/" 6 | 7 | #RE for config File for HDFS 8 | HDFS_CONFIG_FILE_RE = '[a-zA-Z\.\_\-]*-default.xml' 9 | 10 | #RE for config Load in HDFS 11 | HDFS_CONFIG_LOAD_FUNC_RE = '[a-zA-Z\.\_\-]*[cC]onf[ig]*[\(\)]*[\s]*[\n]*[\s]*\.[\s]*[\n]*[\s]*get[a-zA-Z]*\([^;<>]+\)' 12 | 13 | #RE for config assign in HDFS 14 | HDFS_CONFIG_ASSIGN_RE = '[a-zA-Z\.\_\-]+[\s]*=[\s]*[\n]*[\s]*' + HDFS_CONFIG_LOAD_FUNC_RE 15 | 16 | #RE for config set in HDFS 17 | HDFS_CONFIG_SET_FUNC_RE = '[a-zA-Z\.\_\-]*[cC]onf[ig]*[\(\)]*[\s]*[\n]*[\s]*\.[\s]*[\n]*[\s]*set[a-zA-Z]*\([^;<>]+\)' 18 | 19 | #RE for system parameter load in HDFS 20 | HDFS_SYS_PARAM_LOAD_FUNC_RE = 'System\.get(?:Property|env)\([^)^;]+\)' 21 | 22 | #RE for system parameter assign in HDFS 23 | HDFS_SYS_PARAM_ASSIGN_FUNC_RE = '[a-zA-Z\.\_\-]+[\s]*=[\s]*[\n]*[\s]*' + HDFS_SYS_PARAM_LOAD_FUNC_RE 24 | 25 | #RE for system parameter set in HDFS 26 | HDFS_SYS_PARAM_SET_FUNC_RE = 'System\.set(?:Property|env)\([^)^;]+\)' 27 | 28 | #Message in source code 29 | MESSAGE_RE = '".+"' 30 | 31 | class DiffElement: 32 | def __init__(self): 33 | self.diff_class = '' #class that this diff belongs to 34 | self.diff_method = '' #method that this diff belongs to 35 | self.diff_snippet = '' #changed code in this diff 36 | self.diff_change_mode = '' #'+' or '-' 37 | 38 | class CodeElement: 39 | def __init__(self): 40 | self.code_class = '' #class that this diff belongs to 41 | self.code_snippet = '' #changed code in this diff 42 | 43 | class ConfigVariable: 44 | def __init__(self): 45 | self.variable_name = '' #Variable name 46 | self.variable_class = '' #class that this Variable belongs to 47 | self.variable_func = '' #function that assigns this Variable 48 | 49 | def add_diff_snippet(code_snippet,changed_class,change_mode,changed_method): 50 | code_element = DiffElement() 51 | code_element.diff_snippet = code_snippet 52 | code_element.diff_class = changed_class 53 | code_element.diff_change_mode = change_mode 54 | code_element.diff_method = changed_method 55 | return code_element 56 | 57 | def collect_config_variable(assign_obj,code_element,config_variable_list): 58 | """collect variables that are assigned from HDFS configuration/system properties""" 59 | assign_obj = assign_obj.replace('\n','').replace(' ','').replace('this.','') 60 | 61 | #extract the Variable that is assigned 62 | m_variable = ConfigVariable() 63 | m_variable.variable_class = code_element.code_class 64 | m_variable.variable_func = assign_obj 65 | variable_name = assign_obj.split('=') 66 | variable_name = variable_name[0] 67 | m_variable.variable_name = variable_name 68 | 69 | #if this Variable is a new Variable, add it into configVariable set 70 | duplicate_flag = 0 71 | for variable in config_variable_list: 72 | if m_variable.variable_name == variable.variable_name and m_variable.variable_class == variable.variable_class: 73 | duplicate_flag = 1 74 | break 75 | if duplicate_flag == 0 and len(m_variable.variable_name)>=3 and m_variable.variable_class != 'null': 76 | config_variable_list.append(m_variable) 77 | file = open('config_variable.txt','a') 78 | file.write(m_variable.variable_name + '##' + m_variable.variable_class + '##' + m_variable.variable_func + '\n') 79 | file.close() 80 | 81 | def 
diff_file_parser(url): 82 | """parse the diff_file, return the whole code and changed code (codeSet, diffSet)""" 83 | try: 84 | diff_file = open(url,'r') 85 | except (Exception) as e: 86 | # print (e) 87 | if Path(url).is_file() == False: 88 | commit_sha = url.replace('.diff','').split('/') 89 | download_diff.download(BASE_URL + commit_sha[-1]) 90 | diff_file = open(url,'r') 91 | else: 92 | print (e) 93 | return 94 | 95 | #get code snippets, correlated class 96 | code_set = [] 97 | code_snippet = '' 98 | code_class = '' 99 | for line in diff_file: 100 | if line: 101 | line = line.strip('\n') 102 | if len(line) > 1: 103 | if '+++' in line or '---' in line: 104 | if code_snippet: 105 | code_element = CodeElement() 106 | code_element.code_snippet = code_snippet 107 | code_element.code_class = code_class 108 | code_set.append(code_element) 109 | code_snippet = '' 110 | if '/dev/null' not in line: 111 | line = line.split('/') 112 | code_class = line[-1] 113 | else: 114 | if line[0] == '+': 115 | line = line.replace('+','',1) 116 | if line[0] == '-': 117 | line = line.replace('-','',1) 118 | code_snippet = code_snippet + line 119 | if code_snippet: 120 | code_element = CodeElement() 121 | code_element.code_snippet = code_snippet 122 | code_element.code_class = code_class 123 | code_set.append(code_element) 124 | code_snippet = '' 125 | 126 | diff_file.close() 127 | 128 | #get diff snippets, correlated changed class and method 129 | try: 130 | diff_file2 = open(url,'r') 131 | except (Exception) as e: 132 | print (e) 133 | return 134 | 135 | diff_set = [] 136 | changed_class = '' 137 | changed_method = '' 138 | add_snippet = '' 139 | add_flag = 0 140 | minus_snippet = '' 141 | minus_flag = 0 142 | for line in diff_file2: 143 | if line: 144 | line = line.strip('\n') 145 | if '@@' in line: 146 | line = line.split('@@') 147 | if len(line) >= 3: 148 | changed_method = line[2] 149 | elif '+++' in line or '---' in line: 150 | if '/dev/null' not in line: 151 | if 'test' in line: 152 | changed_class = 'test' 153 | else: 154 | line = line.split('/') 155 | changed_class = line[-1] 156 | else: 157 | if line[0] == '+': 158 | line = line.replace('+','',1) 159 | if add_flag == 0: 160 | add_snippet = '' 161 | if 'import' not in line: 162 | add_snippet = add_snippet + line + '\n' 163 | add_flag = 1 164 | elif line[0] == '-': 165 | line = line.replace('-','',1) 166 | if minus_flag == 0: 167 | minus_snippet = '' 168 | if 'import' not in line: 169 | minus_snippet = minus_snippet + line + '\n' 170 | minus_flag = 1 171 | else: 172 | if add_flag == 1: 173 | if add_snippet: 174 | if changed_class != 'test': 175 | add_element = add_diff_snippet(add_snippet,changed_class,'+',changed_method) 176 | diff_set.append(add_element) 177 | add_flag = 0 178 | if minus_flag == 1: 179 | if minus_snippet: 180 | if changed_class != 'test': 181 | minus_element = add_diff_snippet(minus_snippet,changed_class,'-',changed_method) 182 | diff_set.append(minus_element) 183 | minus_flag = 0 184 | #if file end with diffline 185 | if add_flag == 1: 186 | if add_snippet: 187 | if changed_class != 'test': 188 | add_element = add_diff_snippet(add_snippet,changed_class,'+',changed_method) 189 | diff_set.append(add_element) 190 | 191 | if minus_flag == 1: 192 | if minus_snippet: 193 | if changed_class != 'test': 194 | minus_element = add_diff_snippet(minus_snippet,changed_class,'-',changed_method) 195 | diff_set.append(minus_element) 196 | 197 | diff_file2.close() 198 | 199 | return code_set,diff_set 200 | 201 | def 
diffSelection(url,config_variable_list): 202 | 203 | diff = diff_file_parser(url) 204 | 205 | if diff: 206 | codeSet = diff[0] 207 | diffSet = diff[1] 208 | else: 209 | codeSet = 0 210 | diffSet = 0 211 | 212 | #Whether a diff touches configuration file 213 | configFileTouched = False 214 | 215 | #Whether a diff touches configuration load function 216 | configLoadTouched = False 217 | 218 | #Whether a diff touches configuration set function 219 | configSetTouched = False 220 | 221 | #Whether a diff touches configuration variable 222 | configVariableTouched = False 223 | 224 | #whether a diff touches configuration message (log, error message) 225 | configMessageTouched = False 226 | 227 | #the set of touched file 228 | touchedFile = [] 229 | 230 | #the set of touched configuration load function 231 | touchedLoadFunc = [] 232 | 233 | #the set of touched configuration set function 234 | touchedSetFunc = [] 235 | 236 | #the set of touched configuration variables 237 | touchedVariable = [] 238 | 239 | #the set of touched configuration message 240 | touchedMessage = [] 241 | 242 | if codeSet and diffSet: 243 | 244 | #collect configuration variables in code snippet (not diff snippet) 245 | for codeElement in codeSet: 246 | configAssignObj = re.findall(HDFS_CONFIG_ASSIGN_RE,codeElement.code_snippet,re.M | re.I) 247 | if configAssignObj: 248 | for assignObj in configAssignObj: 249 | collect_config_variable(assignObj,codeElement,config_variable_list) 250 | 251 | sysParamAssignObj = re.findall(HDFS_SYS_PARAM_ASSIGN_FUNC_RE,codeElement.code_snippet,re.M | re.I) 252 | if sysParamAssignObj: 253 | for assignObj in sysParamAssignObj: 254 | collect_config_variable(assignObj,codeElement,config_variable_list) 255 | 256 | for diffElement in diffSet: 257 | 258 | #check whether diff touches config file 259 | configFileObj = re.findall(HDFS_CONFIG_FILE_RE,diffElement.diff_class,re.M | re.I) 260 | if configFileObj: 261 | configFileTouched = True 262 | for fileObj in configFileObj: 263 | touchedFile.append(diffElement.diff_change_mode + fileObj) 264 | 265 | #check whether diff touches config load function 266 | configLoadObj = re.findall(HDFS_CONFIG_LOAD_FUNC_RE,diffElement.diff_snippet,re.M | re.I) 267 | if configLoadObj: 268 | for loadObj in configLoadObj: 269 | loadObjStr = diffElement.diff_change_mode + loadObj.replace(' ','').replace('\n','') 270 | if diffElement.diff_change_mode == '+': 271 | reverseMode = '-' 272 | else: 273 | reverseMode = '+' 274 | reverseFlag = False 275 | for Func in touchedLoadFunc: 276 | if Func == reverseMode + loadObj.replace(' ','').replace('\n',''): 277 | touchedLoadFunc.remove(Func) 278 | reverseFlag = True 279 | break 280 | if reverseFlag == False: 281 | touchedLoadFunc.append(diffElement.diff_change_mode + loadObj.replace(' ','').replace('\n','')) 282 | 283 | sysParamLoadObj = re.findall(HDFS_SYS_PARAM_LOAD_FUNC_RE,diffElement.diff_snippet,re.M | re.I) 284 | if sysParamLoadObj: 285 | for loadObj in sysParamLoadObj: 286 | loadObjStr = diffElement.diff_change_mode + loadObj.replace(' ','').replace('\n','') 287 | if diffElement.diff_change_mode == '+': 288 | reverseMode = '-' 289 | else: 290 | reverseMode = '+' 291 | reverseFlag = False 292 | for Func in touchedLoadFunc: 293 | if Func == reverseMode + loadObj.replace(' ','').replace('\n',''): 294 | touchedLoadFunc.remove(Func) 295 | reverseFlag = True 296 | break 297 | if reverseFlag == False: 298 | touchedLoadFunc.append(diffElement.diff_change_mode + loadObj.replace(' ','').replace('\n','')) 299 | 300 | #check whether diff 
touches config set function 301 | configSetObj = re.findall(HDFS_CONFIG_SET_FUNC_RE,diffElement.diff_snippet,re.M | re.I) 302 | if configSetObj: 303 | for setObj in configSetObj: 304 | setObjStr = diffElement.diff_change_mode + setObj.replace(' ','').replace('\n','') 305 | if diffElement.diff_change_mode == '+': 306 | reverseMode = '-' 307 | else: 308 | reverseMode = '+' 309 | reverseFlag = False 310 | for Func in touchedSetFunc: 311 | if Func == reverseMode + setObj.replace(' ','').replace('\n',''): 312 | touchedSetFunc.remove(Func) 313 | reverseFlag = True 314 | break 315 | if reverseFlag == False: 316 | touchedSetFunc.append(diffElement.diff_change_mode + setObj.replace(' ','').replace('\n','')) 317 | 318 | sysParamSetObj = re.findall(HDFS_SYS_PARAM_SET_FUNC_RE,diffElement.diff_snippet,re.M | re.I) 319 | if sysParamSetObj: 320 | for setObj in sysParamSetObj: 321 | setObjStr = diffElement.diff_change_mode + setObj.replace(' ','').replace('\n','') 322 | if diffElement.diff_change_mode == '+': 323 | reverseMode = '-' 324 | else: 325 | reverseMode = '+' 326 | reverseFlag = False 327 | for Func in touchedSetFunc: 328 | if Func == reverseMode + setObj.replace(' ','').replace('\n',''): 329 | touchedSetFunc.remove(Func) 330 | reverseFlag = True 331 | break 332 | if reverseFlag == False: 333 | touchedSetFunc.append(diffElement.diff_change_mode + setObj.replace(' ','').replace('\n','')) 334 | 335 | #check whether diff touches config related Variable 336 | for Variable in config_variable_list: 337 | if Variable.variable_name in diffElement.diff_snippet and Variable.variable_class == diffElement.diff_class: 338 | variableStr = diffElement.diff_change_mode + Variable.variable_name + ' ' + Variable.variable_class 339 | if diffElement.diff_change_mode == '+': 340 | reverseMode = '-' 341 | else: 342 | reverseMode = '+' 343 | reverseFlag = False 344 | for var in touchedVariable: 345 | if var == reverseMode + Variable.variable_name + ' ' + Variable.variable_class: 346 | touchedVariable.remove(var) 347 | reverseFlag = True 348 | break 349 | if reverseFlag == False: 350 | touchedVariable.append(variableStr) 351 | 352 | #check whether diff touches configuration message 353 | messageObj = re.findall(MESSAGE_RE,diffElement.diff_snippet,re.M | re.I) 354 | if messageObj: 355 | for messages in messageObj: 356 | messages = messages.split('"') 357 | for message in messages: 358 | words = message.lower().split(" ") 359 | if len(words) > 3: 360 | if 'option' in words or 'parameter' in words or 'config' in message.lower(): 361 | configMessageTouched = True 362 | touchedMessage.append(diffElement.diff_change_mode + message) 363 | 364 | if touchedLoadFunc != []: 365 | configLoadTouched = True 366 | 367 | if touchedSetFunc != []: 368 | configSetTouched = True 369 | 370 | if touchedVariable != []: 371 | configVariableTouched = True 372 | 373 | return configFileTouched,configLoadTouched,configSetTouched,configVariableTouched,configMessageTouched,touchedFile,touchedLoadFunc,touchedSetFunc,touchedVariable,touchedMessage 374 | 375 | else: 376 | return False 377 | 378 | 379 | 380 | 381 | -------------------------------------------------------------------------------- /code/hdfs_demo_examples/diff_file_parser.py: -------------------------------------------------------------------------------- 1 | import re 2 | import download_diff 3 | from pathlib import Path 4 | 5 | BASE_URL = "https://github.com/apache/hadoop/commit/" 6 | 7 | #RE for config File for HDFS 8 | HDFS_CONFIG_FILE_RE = '[a-zA-Z\.\_\-]*-default.xml' 9 | 10 | #RE 
for config Load in HDFS 11 | HDFS_CONFIG_LOAD_FUNC_RE = '[a-zA-Z\.\_\-]*[cC]onf[ig]*[\(\)]*[\s]*[\n]*[\s]*\.[\s]*[\n]*[\s]*get[a-zA-Z]*\([^;<>]+\)' 12 | 13 | #RE for config assign in HDFS 14 | HDFS_CONFIG_ASSIGN_RE = '[a-zA-Z\.\_\-]+[\s]*=[\s]*[\n]*[\s]*' + HDFS_CONFIG_LOAD_FUNC_RE 15 | 16 | #RE for config set in HDFS 17 | HDFS_CONFIG_SET_FUNC_RE = '[a-zA-Z\.\_\-]*[cC]onf[ig]*[\(\)]*[\s]*[\n]*[\s]*\.[\s]*[\n]*[\s]*set[a-zA-Z]*\([^;<>]+\)' 18 | 19 | #RE for system parameter load in HDFS 20 | HDFS_SYS_PARAM_LOAD_FUNC_RE = 'System\.get(?:Property|env)\([^)^;]+\)' 21 | 22 | #RE for system parameter assign in HDFS 23 | HDFS_SYS_PARAM_ASSIGN_FUNC_RE = '[a-zA-Z\.\_\-]+[\s]*=[\s]*[\n]*[\s]*' + HDFS_SYS_PARAM_LOAD_FUNC_RE 24 | 25 | #RE for system parameter set in HDFS 26 | HDFS_SYS_PARAM_SET_FUNC_RE = 'System\.set(?:Property|env)\([^)^;]+\)' 27 | 28 | #Message in source code 29 | MESSAGE_RE = '".+"' 30 | 31 | class DiffElement: 32 | def __init__(self): 33 | self.diff_class = '' #class that this diff belongs to 34 | self.diff_method = '' #method that this diff belongs to 35 | self.diff_snippet = '' #changed code in this diff 36 | self.diff_change_mode = '' #'+' or '-' 37 | 38 | class CodeElement: 39 | def __init__(self): 40 | self.code_class = '' #class that this diff belongs to 41 | self.code_snippet = '' #changed code in this diff 42 | 43 | class ConfigVariable: 44 | def __init__(self): 45 | self.variable_name = '' #Variable name 46 | self.variable_class = '' #class that this Variable belongs to 47 | self.variable_func = '' #function that assigns this Variable 48 | 49 | def add_diff_snippet(code_snippet,changed_class,change_mode,changed_method): 50 | code_element = DiffElement() 51 | code_element.diff_snippet = code_snippet 52 | code_element.diff_class = changed_class 53 | code_element.diff_change_mode = change_mode 54 | code_element.diff_method = changed_method 55 | return code_element 56 | 57 | def collect_config_variable(assign_obj,code_element,config_variable_list): 58 | """collect variables that are assigned from HDFS configuration/system properties""" 59 | assign_obj = assign_obj.replace('\n','').replace(' ','').replace('this.','') 60 | 61 | #extract the Variable that is assigned 62 | m_variable = ConfigVariable() 63 | m_variable.variable_class = code_element.code_class 64 | m_variable.variable_func = assign_obj 65 | variable_name = assign_obj.split('=') 66 | variable_name = variable_name[0] 67 | m_variable.variable_name = variable_name 68 | 69 | #if this Variable is a new Variable, add it into configVariable set 70 | duplicate_flag = 0 71 | for variable in config_variable_list: 72 | if m_variable.variable_name == variable.variable_name and m_variable.variable_class == variable.variable_class: 73 | duplicate_flag = 1 74 | break 75 | if duplicate_flag == 0 and len(m_variable.variable_name)>=3 and m_variable.variable_class != 'null': 76 | config_variable_list.append(m_variable) 77 | file = open('config_variable.txt','a') 78 | file.write(m_variable.variable_name + '##' + m_variable.variable_class + '##' + m_variable.variable_func + '\n') 79 | file.close() 80 | 81 | def diff_file_parser(url): 82 | """parse the diff_file, return the whole code and changed code (codeSet, diffSet)""" 83 | try: 84 | diff_file = open(url,'r') 85 | except (Exception) as e: 86 | # print (e) 87 | if Path(url).is_file() == False: 88 | commit_sha = url.replace('.diff','').split('/') 89 | download_diff.download(BASE_URL + commit_sha[-1]) 90 | diff_file = open(url,'r') 91 | else: 92 | print (e) 93 | return 94 | 95 | #get code 
snippets, correlated class 96 | code_set = [] 97 | code_snippet = '' 98 | code_class = '' 99 | for line in diff_file: 100 | if line: 101 | line = line.strip('\n') 102 | if len(line) > 1: 103 | if '+++' in line or '---' in line: 104 | if code_snippet: 105 | code_element = CodeElement() 106 | code_element.code_snippet = code_snippet 107 | code_element.code_class = code_class 108 | code_set.append(code_element) 109 | code_snippet = '' 110 | if '/dev/null' not in line: 111 | line = line.split('/') 112 | code_class = line[-1] 113 | else: 114 | if line[0] == '+': 115 | line = line.replace('+','',1) 116 | if line[0] == '-': 117 | line = line.replace('-','',1) 118 | code_snippet = code_snippet + line 119 | if code_snippet: 120 | code_element = CodeElement() 121 | code_element.code_snippet = code_snippet 122 | code_element.code_class = code_class 123 | code_set.append(code_element) 124 | code_snippet = '' 125 | 126 | diff_file.close() 127 | 128 | #get diff snippets, correlated changed class and method 129 | try: 130 | diff_file2 = open(url,'r') 131 | except (Exception) as e: 132 | print (e) 133 | return 134 | 135 | diff_set = [] 136 | changed_class = '' 137 | changed_method = '' 138 | add_snippet = '' 139 | add_flag = 0 140 | minus_snippet = '' 141 | minus_flag = 0 142 | for line in diff_file2: 143 | if line: 144 | line = line.strip('\n') 145 | if '@@' in line: 146 | line = line.split('@@') 147 | if len(line) >= 3: 148 | changed_method = line[2] 149 | elif '+++' in line or '---' in line: 150 | if '/dev/null' not in line: 151 | if 'test' in line: 152 | changed_class = 'test' 153 | else: 154 | line = line.split('/') 155 | changed_class = line[-1] 156 | else: 157 | if line[0] == '+': 158 | line = line.replace('+','',1) 159 | if add_flag == 0: 160 | add_snippet = '' 161 | if 'import' not in line: 162 | add_snippet = add_snippet + line + '\n' 163 | add_flag = 1 164 | elif line[0] == '-': 165 | line = line.replace('-','',1) 166 | if minus_flag == 0: 167 | minus_snippet = '' 168 | if 'import' not in line: 169 | minus_snippet = minus_snippet + line + '\n' 170 | minus_flag = 1 171 | else: 172 | if add_flag == 1: 173 | if add_snippet: 174 | if changed_class != 'test': 175 | add_element = add_diff_snippet(add_snippet,changed_class,'+',changed_method) 176 | diff_set.append(add_element) 177 | add_flag = 0 178 | if minus_flag == 1: 179 | if minus_snippet: 180 | if changed_class != 'test': 181 | minus_element = add_diff_snippet(minus_snippet,changed_class,'-',changed_method) 182 | diff_set.append(minus_element) 183 | minus_flag = 0 184 | #if file end with diffline 185 | if add_flag == 1: 186 | if add_snippet: 187 | if changed_class != 'test': 188 | add_element = add_diff_snippet(add_snippet,changed_class,'+',changed_method) 189 | diff_set.append(add_element) 190 | 191 | if minus_flag == 1: 192 | if minus_snippet: 193 | if changed_class != 'test': 194 | minus_element = add_diff_snippet(minus_snippet,changed_class,'-',changed_method) 195 | diff_set.append(minus_element) 196 | 197 | diff_file2.close() 198 | 199 | return code_set,diff_set 200 | 201 | def diffSelection(url,config_variable_list): 202 | 203 | diff = diff_file_parser(url) 204 | 205 | if diff: 206 | codeSet = diff[0] 207 | diffSet = diff[1] 208 | else: 209 | codeSet = 0 210 | diffSet = 0 211 | 212 | #Whether a diff touches configuration file 213 | configFileTouched = False 214 | 215 | #Whether a diff touches configuration load function 216 | configLoadTouched = False 217 | 218 | #Whether a diff touches configuration set function 219 | configSetTouched = 
False 220 | 221 | #Whether a diff touches configuration variable 222 | configVariableTouched = False 223 | 224 | #whether a diff touches configuration message (log, error message) 225 | configMessageTouched = False 226 | 227 | #the set of touched file 228 | touchedFile = [] 229 | 230 | #the set of touched configuration load function 231 | touchedLoadFunc = [] 232 | 233 | #the set of touched configuration set function 234 | touchedSetFunc = [] 235 | 236 | #the set of touched configuration variables 237 | touchedVariable = [] 238 | 239 | #the set of touched configuration message 240 | touchedMessage = [] 241 | 242 | if codeSet and diffSet: 243 | 244 | #collect configuration variables in code snippet (not diff snippet) 245 | for codeElement in codeSet: 246 | configAssignObj = re.findall(HDFS_CONFIG_ASSIGN_RE,codeElement.code_snippet,re.M | re.I) 247 | if configAssignObj: 248 | for assignObj in configAssignObj: 249 | collect_config_variable(assignObj,codeElement,config_variable_list) 250 | 251 | sysParamAssignObj = re.findall(HDFS_SYS_PARAM_ASSIGN_FUNC_RE,codeElement.code_snippet,re.M | re.I) 252 | if sysParamAssignObj: 253 | for assignObj in sysParamAssignObj: 254 | collect_config_variable(assignObj,codeElement,config_variable_list) 255 | 256 | for diffElement in diffSet: 257 | 258 | #check whether diff touches config file 259 | configFileObj = re.findall(HDFS_CONFIG_FILE_RE,diffElement.diff_class,re.M | re.I) 260 | if configFileObj: 261 | configFileTouched = True 262 | for fileObj in configFileObj: 263 | touchedFile.append(diffElement.diff_change_mode + fileObj) 264 | 265 | #check whether diff touches config load function 266 | configLoadObj = re.findall(HDFS_CONFIG_LOAD_FUNC_RE,diffElement.diff_snippet,re.M | re.I) 267 | if configLoadObj: 268 | for loadObj in configLoadObj: 269 | loadObjStr = diffElement.diff_change_mode + loadObj.replace(' ','').replace('\n','') 270 | if diffElement.diff_change_mode == '+': 271 | reverseMode = '-' 272 | else: 273 | reverseMode = '+' 274 | reverseFlag = False 275 | for Func in touchedLoadFunc: 276 | if Func == reverseMode + loadObj.replace(' ','').replace('\n',''): 277 | touchedLoadFunc.remove(Func) 278 | reverseFlag = True 279 | break 280 | if reverseFlag == False: 281 | touchedLoadFunc.append(diffElement.diff_change_mode + loadObj.replace(' ','').replace('\n','')) 282 | 283 | sysParamLoadObj = re.findall(HDFS_SYS_PARAM_LOAD_FUNC_RE,diffElement.diff_snippet,re.M | re.I) 284 | if sysParamLoadObj: 285 | for loadObj in sysParamLoadObj: 286 | loadObjStr = diffElement.diff_change_mode + loadObj.replace(' ','').replace('\n','') 287 | if diffElement.diff_change_mode == '+': 288 | reverseMode = '-' 289 | else: 290 | reverseMode = '+' 291 | reverseFlag = False 292 | for Func in touchedLoadFunc: 293 | if Func == reverseMode + loadObj.replace(' ','').replace('\n',''): 294 | touchedLoadFunc.remove(Func) 295 | reverseFlag = True 296 | break 297 | if reverseFlag == False: 298 | touchedLoadFunc.append(diffElement.diff_change_mode + loadObj.replace(' ','').replace('\n','')) 299 | 300 | #check whether diff touches config set function 301 | configSetObj = re.findall(HDFS_CONFIG_SET_FUNC_RE,diffElement.diff_snippet,re.M | re.I) 302 | if configSetObj: 303 | for setObj in configSetObj: 304 | setObjStr = diffElement.diff_change_mode + setObj.replace(' ','').replace('\n','') 305 | if diffElement.diff_change_mode == '+': 306 | reverseMode = '-' 307 | else: 308 | reverseMode = '+' 309 | reverseFlag = False 310 | for Func in touchedSetFunc: 311 | if Func == reverseMode + 
setObj.replace(' ','').replace('\n',''): 312 | touchedSetFunc.remove(Func) 313 | reverseFlag = True 314 | break 315 | if reverseFlag == False: 316 | touchedSetFunc.append(diffElement.diff_change_mode + setObj.replace(' ','').replace('\n','')) 317 | 318 | sysParamSetObj = re.findall(HDFS_SYS_PARAM_SET_FUNC_RE,diffElement.diff_snippet,re.M | re.I) 319 | if sysParamSetObj: 320 | for setObj in sysParamSetObj: 321 | setObjStr = diffElement.diff_change_mode + setObj.replace(' ','').replace('\n','') 322 | if diffElement.diff_change_mode == '+': 323 | reverseMode = '-' 324 | else: 325 | reverseMode = '+' 326 | reverseFlag = False 327 | for Func in touchedSetFunc: 328 | if Func == reverseMode + setObj.replace(' ','').replace('\n',''): 329 | touchedSetFunc.remove(Func) 330 | reverseFlag = True 331 | break 332 | if reverseFlag == False: 333 | touchedSetFunc.append(diffElement.diff_change_mode + setObj.replace(' ','').replace('\n','')) 334 | 335 | #check whether diff touches config related Variable 336 | for Variable in config_variable_list: 337 | if Variable.variable_name in diffElement.diff_snippet and Variable.variable_class == diffElement.diff_class: 338 | variableStr = diffElement.diff_change_mode + Variable.variable_name + ' ' + Variable.variable_class 339 | if diffElement.diff_change_mode == '+': 340 | reverseMode = '-' 341 | else: 342 | reverseMode = '+' 343 | reverseFlag = False 344 | for var in touchedVariable: 345 | if var == reverseMode + Variable.variable_name + ' ' + Variable.variable_class: 346 | touchedVariable.remove(var) 347 | reverseFlag = True 348 | break 349 | if reverseFlag == False: 350 | touchedVariable.append(variableStr) 351 | 352 | #check whether diff touches configuration message 353 | messageObj = re.findall(MESSAGE_RE,diffElement.diff_snippet,re.M | re.I) 354 | if messageObj: 355 | for messages in messageObj: 356 | messages = messages.split('"') 357 | for message in messages: 358 | words = message.lower().split(" ") 359 | if len(words) > 3: 360 | if 'option' in words or 'parameter' in words or 'config' in message.lower(): 361 | configMessageTouched = True 362 | touchedMessage.append(diffElement.diff_change_mode + message) 363 | 364 | if touchedLoadFunc != []: 365 | configLoadTouched = True 366 | 367 | if touchedSetFunc != []: 368 | configSetTouched = True 369 | 370 | if touchedVariable != []: 371 | configVariableTouched = True 372 | 373 | return configFileTouched,configLoadTouched,configSetTouched,configVariableTouched,configMessageTouched,touchedFile,touchedLoadFunc,touchedSetFunc,touchedVariable,touchedMessage 374 | 375 | else: 376 | return False 377 | 378 | 379 | 380 | 381 | --------------------------------------------------------------------------------
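A note on the selection logic above: diffSelection records each touched load/set call as change_mode + signature and lets a '+' record cancel a matching '-' record, so hunks that merely move or re-indent a call are not counted as real configuration changes. A minimal standalone sketch of that cancellation idea (record_touch and the example signature are ours for illustration, not part of the repository):

def record_touch(touched, change_mode, signature):
    """Append change_mode + signature, cancelling the opposite mode first."""
    reverse_mode = '-' if change_mode == '+' else '+'
    if reverse_mode + signature in touched:
        touched.remove(reverse_mode + signature)  # same call removed and re-added: a pure move
    else:
        touched.append(change_mode + signature)

touched_load_func = []
record_touch(touched_load_func, '-', "conf.getInt(KEY)")  # line removed
record_touch(touched_load_func, '+', "conf.getInt(KEY)")  # same line re-added elsewhere
assert touched_load_func == []  # nets out to no configuration change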
/code/hbase/diff_file_parser.py: -------------------------------------------------------------------------------- 1 | import re 2 | import download_diff 3 | from pathlib import Path 4 | 5 | BASE_URL = "https://github.com/apache/hbase/commit/" 6 | 7 | #config file name for HBase 8 | HBASE_CONFIG_FILE_RE = '[a-zA-Z\.\_\-]*-default.xml' 9 | 10 | #RE for config load in HBase 11 | HBASE_CONFIG_LOAD_FUNC_RE = '[a-zA-Z\.\_\-]*[cC]onf[ig]*[\(\)]*[\s]*[\n]*[\s]*\.[\s]*[\n]*[\s]*get[a-zA-Z]*\([^;<>]+\)' 12 | 13 | #RE for config assign in HBase 14 | HBASE_CONFIG_ASSIGN_RE = '[a-zA-Z\.\_\-]+[\s]*=[\s]*[\n]*[\s]*' + HBASE_CONFIG_LOAD_FUNC_RE 15 | 16 | #RE for config set in HBase 17 | HBASE_CONFIG_SET_FUNC_RE = '[a-zA-Z\.\_\-]*[cC]onf[ig]*[\(\)]*[\s]*[\n]*[\s]*\.[\s]*[\n]*[\s]*set[a-zA-Z]*\([^;<>]+\)' 18 | 19 | #RE for system parameter load in HBase 20 | HBASE_SYS_PARAM_LOAD_FUNC_RE = 'System\.get(?:Property|env)\([^)^;]+\)' 21 | 22 | #RE for system parameter assign in HBase 23 | HBASE_SYS_PARAM_ASSIGN_FUNC_RE = '[a-zA-Z\.\_\-]+[\s]*=[\s]*[\n]*[\s]*' + HBASE_SYS_PARAM_LOAD_FUNC_RE 24 | 25 | #RE for system parameter set in HBase 26 | HBASE_SYS_PARAM_SET_FUNC_RE = 'System\.set(?:Property|env)\([^)^;]+\)' 27 | 28 | #RE for message in program 29 | MESSAGE_RE = '".+"' 30 | 31 | class DiffElement: 32 | def __init__(self): 33 | self.diff_class = '' #class that this diff belongs to 34 | self.diff_method = '' #method that this diff belongs to 35 | self.diff_snippet = '' #changed code in this diff 36 | self.diff_change_mode = '' #'+' or '-' 37 | 38 | class CodeElement: 39 | def __init__(self): 40 | self.code_class = '' #class that this code belongs to 41 | self.code_snippet = '' #code in this diff 42 | 43 | class ConfigVariable: 44 | def __init__(self): 45 | self.variable_name = '' #Variable name 46 | self.variable_class = '' #class that this Variable belongs to 47 | self.variable_func = '' #function that assigns this Variable 48 | 49 | def add_diff_snippet(code_snippet,changed_class,change_mode,changed_method): 50 | code_element = DiffElement() 51 | code_element.diff_snippet = code_snippet 52 | code_element.diff_class = changed_class 53 | code_element.diff_change_mode = change_mode 54 | code_element.diff_method = changed_method 55 | return code_element 56 | 57 | def collect_config_variable(assign_obj,code_element,config_variable_list): 58 | """collect variables that are assigned by HBase configuration/system properties""" 59 | assign_obj = assign_obj.replace('\n','').replace(' ','').replace('this.','') 60 | 61 | #extract the assigned Variable 62 | m_variable = ConfigVariable() 63 | m_variable.variable_class = code_element.code_class 64 | m_variable.variable_func = assign_obj 65 | variable_name = assign_obj.split('=') 66 | variable_name = variable_name[0] 67 | m_variable.variable_name = variable_name 68 | 69 | #if this Variable is a new Variable, add it into configVariable set 70 | duplicate_flag = 0 71 | for variable in config_variable_list: 72 | if m_variable.variable_name == variable.variable_name and m_variable.variable_class == variable.variable_class: 73 | duplicate_flag = 1 74 | break 75 | if duplicate_flag == 0 and len(m_variable.variable_name)>=3 and m_variable.variable_class != 'null': 76 | config_variable_list.append(m_variable) 77 | file = open('config_variable.txt','a') 78 | file.write(m_variable.variable_name + '##' + m_variable.variable_class + '##' + m_variable.variable_func + '\n') 79 | file.close() 80 | 81 | def diff_file_parser(url): 82 | """parse the diff_file, return the whole code and changed code (codeSet, diffSet)""" 83 | try: 84 | diff_file = open(url,'r') 85 | except (Exception) as e: 86 | # print (e) 87 | if Path(url).is_file() == False: 88 | commit_sha = url.replace('.diff','').split('/') 89 | download_diff.download(BASE_URL + commit_sha[-1]) 90 | diff_file = open(url,'r') 91 | else: 92 | print (e) 93 | return 94 | 95 | #get code snippets, correlated class 96 | code_set = [] 97 | code_snippet = '' 98 | code_class = '' 99 | for line in diff_file: 100 | if line: 101 | line = line.strip('\n') 102 | if len(line) > 1: 103 | if '+++' in line or '---' in line: 104 | if code_snippet: 105 | code_element = CodeElement() 106 | code_element.code_snippet = code_snippet 107 | code_element.code_class = code_class 108 | code_set.append(code_element) 109 | code_snippet = '' 110 | if '/dev/null' not in
line: 111 | line = line.split('/') 112 | code_class = line[-1] 113 | else: 114 | if line[0] == '+': 115 | line = line.replace('+','',1) 116 | if line[0] == '-': 117 | line = line.replace('-','',1) 118 | code_snippet = code_snippet + line 119 | if code_snippet: 120 | code_element = CodeElement() 121 | code_element.code_snippet = code_snippet 122 | code_element.code_class = code_class 123 | code_set.append(code_element) 124 | code_snippet = '' 125 | 126 | diff_file.close() 127 | 128 | #get diff snippets, correlated changed class and method 129 | try: 130 | diff_file2 = open(url,'r') 131 | except (Exception) as e: 132 | print (e) 133 | return 134 | 135 | diff_set = [] 136 | changed_class = '' 137 | changed_method = '' 138 | add_snippet = '' 139 | add_flag = 0 140 | minus_snippet = '' 141 | minus_flag = 0 142 | for line in diff_file2: 143 | if line: 144 | line = line.strip('\n') 145 | if '@@' in line: 146 | line = line.split('@@') 147 | if len(line) >= 3: 148 | changed_method = line[2] 149 | elif '+++' in line or '---' in line: 150 | if '/dev/null' not in line: 151 | if 'test' in line: 152 | changed_class = 'test' 153 | else: 154 | line = line.split('/') 155 | changed_class = line[-1] 156 | else: 157 | if line[0] == '+': 158 | line = line.replace('+','',1) 159 | if add_flag == 0: 160 | add_snippet = '' 161 | if 'import' not in line: 162 | add_snippet = add_snippet + line + '\n' 163 | add_flag = 1 164 | elif line[0] == '-': 165 | line = line.replace('-','',1) 166 | if minus_flag == 0: 167 | minus_snippet = '' 168 | if 'import' not in line: 169 | minus_snippet = minus_snippet + line + '\n' 170 | minus_flag = 1 171 | else: 172 | if add_flag == 1: 173 | if add_snippet: 174 | if changed_class != 'test': 175 | add_element = add_diff_snippet(add_snippet,changed_class,'+',changed_method) 176 | diff_set.append(add_element) 177 | add_flag = 0 178 | if minus_flag == 1: 179 | if minus_snippet: 180 | if changed_class != 'test': 181 | minus_element = add_diff_snippet(minus_snippet,changed_class,'-',changed_method) 182 | diff_set.append(minus_element) 183 | minus_flag = 0 184 | #if the file ends with a diff line 185 | if add_flag == 1: 186 | if add_snippet: 187 | if changed_class != 'test': 188 | add_element = add_diff_snippet(add_snippet,changed_class,'+',changed_method) 189 | diff_set.append(add_element) 190 | 191 | if minus_flag == 1: 192 | if minus_snippet: 193 | if changed_class != 'test': 194 | minus_element = add_diff_snippet(minus_snippet,changed_class,'-',changed_method) 195 | diff_set.append(minus_element) 196 | 197 | diff_file2.close() 198 | 199 | return code_set,diff_set 200 | 201 | def diffSelection(url,configVariableList): 202 | 203 | diff = diff_file_parser(url) 204 | 205 | if diff: 206 | codeSet = diff[0] 207 | diffSet = diff[1] 208 | else: 209 | codeSet = 0 210 | diffSet = 0 211 | 212 | #Whether a diff touches configuration file 213 | configFileTouched = False 214 | 215 | #Whether a diff touches configuration load function 216 | configLoadTouched = False 217 | 218 | #Whether a diff touches configuration set function 219 | configSetTouched = False 220 | 221 | #Whether a diff touches configuration variable 222 | configVariableTouched = False 223 | 224 | #Whether a diff touches configuration message (log, error message) 225 | configMessageTouched = False 226 | 227 | #the set of touched files 228 | touchedFile = [] 229 | 230 | #the set of touched configuration load function 231 | touchedLoadFunc = [] 232 | 233 | #the set of touched configuration set function 234 | touchedSetFunc = [] 235 | 236 | #the set
of touched configuration Variable 237 | touchedVariable = [] 238 | 239 | #the set of touched configuration message 240 | touchedMessage = [] 241 | 242 | if codeSet and diffSet: 243 | 244 | #collect configuration variables in code snippet(not diff snippet) 245 | for codeElement in codeSet: 246 | configAssignObj = re.findall(HBASE_CONFIG_ASSIGN_RE,codeElement.code_snippet,re.M | re.I) 247 | if configAssignObj: 248 | for assignObj in configAssignObj: 249 | collect_config_variable(assignObj,codeElement,configVariableList) 250 | 251 | sysParamAssignObj = re.findall(HBASE_SYS_PARAM_ASSIGN_FUNC_RE,codeElement.code_snippet,re.M | re.I) 252 | if sysParamAssignObj: 253 | for assignObj in sysParamAssignObj: 254 | collect_config_variable(assignObj,codeElement,configVariableList) 255 | 256 | for diffElement in diffSet: 257 | 258 | #check whether diff touches config file 259 | configFileObj = re.findall(HBASE_CONFIG_FILE_RE,diffElement.diff_class,re.M | re.I) 260 | if configFileObj: 261 | configFileTouched = True 262 | for fileObj in configFileObj: 263 | touchedFile.append(diffElement.diff_change_mode + fileObj) 264 | 265 | #check whether diff touches config load function 266 | configLoadObj = re.findall(HBASE_CONFIG_LOAD_FUNC_RE,diffElement.diff_snippet,re.M | re.I) 267 | if configLoadObj: 268 | for loadObj in configLoadObj: 269 | loadObjStr = diffElement.diff_change_mode + loadObj.replace(' ','').replace('\n','') 270 | if diffElement.diff_change_mode == '+': 271 | reverseMode = '-' 272 | else: 273 | reverseMode = '+' 274 | reverseFlag = False 275 | for Func in touchedLoadFunc: 276 | if Func == reverseMode + loadObj.replace(' ','').replace('\n',''): 277 | touchedLoadFunc.remove(Func) 278 | reverseFlag = True 279 | break 280 | if reverseFlag == False: 281 | touchedLoadFunc.append(diffElement.diff_change_mode + loadObj.replace(' ','').replace('\n','')) 282 | 283 | sysParamLoadObj = re.findall(HBASE_SYS_PARAM_LOAD_FUNC_RE,diffElement.diff_snippet,re.M | re.I) 284 | if sysParamLoadObj: 285 | for loadObj in sysParamLoadObj: 286 | loadObjStr = diffElement.diff_change_mode + loadObj.replace(' ','').replace('\n','') 287 | if diffElement.diff_change_mode == '+': 288 | reverseMode = '-' 289 | else: 290 | reverseMode = '+' 291 | reverseFlag = False 292 | for Func in touchedLoadFunc: 293 | if Func == reverseMode + loadObj.replace(' ','').replace('\n',''): 294 | touchedLoadFunc.remove(Func) 295 | reverseFlag = True 296 | break 297 | if reverseFlag == False: 298 | touchedLoadFunc.append(diffElement.diff_change_mode + loadObj.replace(' ','').replace('\n','')) 299 | 300 | 301 | #check whether diff touches config set function 302 | configSetObj = re.findall(HBASE_CONFIG_SET_FUNC_RE,diffElement.diff_snippet,re.M | re.I) 303 | if configSetObj: 304 | for setObj in configSetObj: 305 | setObjStr = diffElement.diff_change_mode + setObj.replace(' ','').replace('\n','') 306 | if diffElement.diff_change_mode == '+': 307 | reverseMode = '-' 308 | else: 309 | reverseMode = '+' 310 | reverseFlag = False 311 | for Func in touchedSetFunc: 312 | if Func == reverseMode + setObj.replace(' ','').replace('\n',''): 313 | touchedSetFunc.remove(Func) 314 | reverseFlag = True 315 | break 316 | if reverseFlag == False: 317 | touchedSetFunc.append(diffElement.diff_change_mode + setObj.replace(' ','').replace('\n','')) 318 | 319 | sysParamSetObj = re.findall(HBASE_SYS_PARAM_SET_FUNC_RE,diffElement.diff_snippet,re.M | re.I) 320 | if sysParamSetObj: 321 | for setObj in sysParamSetObj: 322 | setObjStr = diffElement.diff_change_mode + 
setObj.replace(' ','').replace('\n','') 323 | if diffElement.diff_change_mode == '+': 324 | reverseMode = '-' 325 | else: 326 | reverseMode = '+' 327 | reverseFlag = False 328 | for Func in touchedSetFunc: 329 | if Func == reverseMode + setObj.replace(' ','').replace('\n',''): 330 | touchedSetFunc.remove(Func) 331 | reverseFlag = True 332 | break 333 | if reverseFlag == False: 334 | touchedSetFunc.append(diffElement.diff_change_mode + setObj.replace(' ','').replace('\n','')) 335 | 336 | 337 | #check whether diff touches config related Variable 338 | for Variable in configVariableList: 339 | if Variable.variable_name in diffElement.diff_snippet and Variable.variable_class == diffElement.diff_class: 340 | variableStr = diffElement.diff_change_mode + Variable.variable_name + ' ' + Variable.variable_class 341 | if diffElement.diff_change_mode == '+': 342 | reverseMode = '-' 343 | else: 344 | reverseMode = '+' 345 | reverseFlag = False 346 | for var in touchedVariable: 347 | if var == reverseMode + Variable.variable_name + ' ' + Variable.variable_class: 348 | touchedVariable.remove(var) 349 | reverseFlag = True 350 | break 351 | if reverseFlag == False: 352 | touchedVariable.append(variableStr) 353 | 354 | #check whether diff touches configuration message 355 | messageObj = re.findall(MESSAGE_RE,diffElement.diff_snippet,re.M | re.I) 356 | if messageObj: 357 | for messages in messageObj: 358 | messages = messages.split('"') 359 | for message in messages: 360 | words = message.lower().split(" ") 361 | if len(words) > 3: 362 | if 'option' in words or 'parameter' in words or 'config' in message.lower(): 363 | configMessageTouched = True 364 | touchedMessage.append(diffElement.diff_change_mode + message) 365 | 366 | 367 | if touchedLoadFunc != []: 368 | configLoadTouched = True 369 | 370 | if touchedSetFunc != []: 371 | configSetTouched = True 372 | 373 | if touchedVariable != []: 374 | configVariableTouched = True 375 | 376 | 377 | return configFileTouched,configLoadTouched,configSetTouched,configVariableTouched,configMessageTouched,touchedFile,touchedLoadFunc,touchedSetFunc,touchedVariable,touchedMessage 378 | 379 | else: 380 | return False 381 | 382 | 383 | 384 | 385 | --------------------------------------------------------------------------------
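Each parser opens the commit's local .diff file and, on a cache miss, derives the commit SHA from the file name and downloads the diff before retrying the open. A hedged sketch of that open-or-download pattern (open_or_download is a hypothetical helper; download_diff.download and the BASE_URL constant are the repository's own):

from pathlib import Path
import download_diff

def open_or_download(path, base_url):
    """Open a local .diff file, fetching it by commit SHA on a cache miss."""
    try:
        return open(path, 'r')
    except OSError:
        if not Path(path).is_file():
            commit_sha = path.replace('.diff', '').split('/')[-1]
            download_diff.download(base_url + commit_sha)  # e.g. BASE_URL + sha
            return open(path, 'r')
        raise  # file exists but is unreadable: surface the real error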
/code/spark/diff_file_parser.py: -------------------------------------------------------------------------------- 1 | import re 2 | import download_diff 3 | from pathlib import Path 4 | 5 | BASE_URL = "https://github.com/apache/spark/commit/" 6 | 7 | #configDoc name in Spark 8 | SPARK_CONFIG_DOC_RE = 'configuration.md' 9 | 10 | #RE for config build in Spark 11 | SPARK_CONFIG_BUILD_RE = '(?:ConfigBuilder|buildConf)\([^)^;]+\)' 12 | 13 | #RE for config load in Spark 14 | SPARK_CONFIG_LOAD_FUNC_RE = '[a-zA-Z\.\_\-]*[cC]onf[ig]*[\(\)]*[\s]*[\n]*[\s]*\.[\s]*[\n]*[\s]*get[a-zA-Z]*\([^)^;]+\)' 15 | 16 | #RE for config assign in Spark 17 | SPARK_CONFIG_ASSIGN_RE = '[a-zA-Z\.\_\-]+[\s]*=[\s]*[\n]*[\s]*' + SPARK_CONFIG_LOAD_FUNC_RE 18 | 19 | #RE for config set in Spark 20 | SPARK_CONFIG_SET_FUNC_RE = '[a-zA-Z\.\_\-]*[cC]onf[ig]*[\(\)]*[\s]*[\n]*[\s]*\.[\s]*[\n]*[\s]*set[a-zA-Z]*\([^)^;]+\)' 21 | 22 | #RE for SQL config load in Spark 23 | SQL_CONFIG_LOAD_FUNC_RE = 'SQLConf.get.[a-zA-Z]*' 24 | 25 | #RE for SQL config assign in Spark 26 | SQL_CONFIG_ASSIGN_RE = '[a-zA-Z\.\_\-]+[\s]*=[\s]*[\n]*[\s]*' + SQL_CONFIG_LOAD_FUNC_RE 27 | 28 | #RE for system param load in Spark 29 | SPARK_SYS_PARAM_LOAD_RE = 'System\.get(?:Property|env)\([^)^;]+\)' 30 | 31 | #RE for system param assign in Spark 32 | SPARK_SYS_PARAM_ASSIGN_FUNC_RE = '[a-zA-Z\.\_\-]+[\s]*=[\s]*[\n]*[\s]*' + SPARK_SYS_PARAM_LOAD_RE 33 | 34 | #RE for system param set in Spark 35 | SPARK_SYS_PARAM_SET_FUNC_RE = 'System\.set(?:Property|env)\([^)^;]+\)' 36 | 37 | #Message in source code 38 | MESSAGE_RE = '".+"' 39 | 40 | class DiffElement: 41 | def __init__(self): 42 | self.diff_class = '' #class that this diff belongs to 43 | self.diff_method = '' #method that this diff belongs to 44 | self.diff_snippet = '' #changed code in this diff 45 | self.diff_change_mode = '' #'+' or '-' 46 | 47 | class CodeElement: 48 | def __init__(self): 49 | self.code_class = '' #class that this code belongs to 50 | self.code_snippet = '' #code in this diff 51 | 52 | class ConfigVariable: 53 | def __init__(self): 54 | self.variable_name = '' #Variable name 55 | self.variable_class = '' #class that this Variable belongs to 56 | self.variable_func = '' #function that assigns this Variable 57 | 58 | def add_diff_snippet(code_snippet,changed_class,change_mode,changed_method): 59 | code_element = DiffElement() 60 | code_element.diff_snippet = code_snippet 61 | code_element.diff_class = changed_class 62 | code_element.diff_change_mode = change_mode 63 | code_element.diff_method = changed_method 64 | return code_element 65 | 66 | def collect_config_variable(assign_obj,code_element,config_variable_list): 67 | """collect variables that are assigned by Spark configuration/system properties""" 68 | assign_obj = assign_obj.replace('\n','').replace(' ','').replace('this.','') 69 | 70 | #extract the assigned Variable 71 | m_variable = ConfigVariable() 72 | m_variable.variable_class = code_element.code_class 73 | m_variable.variable_func = assign_obj 74 | variable_name = assign_obj.split('=') 75 | variable_name = variable_name[0] 76 | m_variable.variable_name = variable_name 77 | 78 | #if this Variable is a new Variable, add it into configVariable set 79 | duplicate_flag = 0 80 | for variable in config_variable_list: 81 | if m_variable.variable_name == variable.variable_name and m_variable.variable_class == variable.variable_class: 82 | duplicate_flag = 1 83 | break 84 | if duplicate_flag == 0 and len(m_variable.variable_name)>=3 and m_variable.variable_class != 'null': 85 | config_variable_list.append(m_variable) 86 | file = open('config_variable.txt','a') 87 | file.write(m_variable.variable_name + '##' + m_variable.variable_class + '##' + m_variable.variable_func + '\n') 88 | file.close() 89 | 90 | def diff_file_parser(url): 91 | """parse the diff_file, return the whole code and changed code (codeSet, diffSet)""" 92 | try: 93 | diff_file = open(url,'r') 94 | except (Exception) as e: 95 | # print (e) 96 | if Path(url).is_file() == False: 97 | commit_sha = url.replace('.diff','').split('/') 98 | download_diff.download(BASE_URL + commit_sha[-1]) 99 | diff_file = open(url,'r') 100 | else: 101 | print (e) 102 | return 103 | 104 | #get code snippets, correlated class 105 | code_set = [] 106 | code_snippet = '' 107 | code_class = '' 108 | for line in diff_file: 109 | if line: 110 | line = line.strip('\n') 111 | if len(line) > 1: 112 | if '+++' in line or '---' in line: 113 | if code_snippet: 114 | code_element = CodeElement() 115 | code_element.code_snippet = code_snippet 116 | code_element.code_class = code_class 117 | code_set.append(code_element) 118 | code_snippet = '' 119 | if '/dev/null' not in line: 120 | line = line.split('/') 121 | code_class = line[-1] 122 | else: 123 | if line[0] == '+': 124 | line =
line.replace('+','',1) 125 | if line[0] == '-': 126 | line = line.replace('-','',1) 127 | code_snippet = code_snippet + line 128 | if code_snippet: 129 | code_element = CodeElement() 130 | code_element.code_snippet = code_snippet 131 | code_element.code_class = code_class 132 | code_set.append(code_element) 133 | code_snippet = '' 134 | 135 | diff_file.close() 136 | 137 | #get diff snippets, correlated changed class and method 138 | try: 139 | diff_file2 = open(url,'r') 140 | except (Exception) as e: 141 | print (e) 142 | return 143 | 144 | diff_set = [] 145 | changed_class = '' 146 | changed_method = '' 147 | add_snippet = '' 148 | add_flag = 0 149 | minus_snippet = '' 150 | minus_flag = 0 151 | for line in diff_file2: 152 | if line: 153 | line = line.strip('\n') 154 | if '@@' in line: 155 | line = line.split('@@') 156 | if len(line) >= 3: 157 | changed_method = line[2] 158 | elif '+++' in line or '---' in line: 159 | if '/dev/null' not in line: 160 | if 'test' in line: 161 | changed_class = 'test' 162 | else: 163 | line = line.split('/') 164 | changed_class = line[-1] 165 | else: 166 | if line[0] == '+': 167 | line = line.replace('+','',1) 168 | if add_flag == 0: 169 | add_snippet = '' 170 | if 'import' not in line: 171 | add_snippet = add_snippet + line + '\n' 172 | add_flag = 1 173 | elif line[0] == '-': 174 | line = line.replace('-','',1) 175 | if minus_flag == 0: 176 | minus_snippet = '' 177 | if 'import' not in line: 178 | minus_snippet = minus_snippet + line + '\n' 179 | minus_flag = 1 180 | else: 181 | if add_flag == 1: 182 | if add_snippet: 183 | if changed_class != 'test': 184 | add_element = add_diff_snippet(add_snippet,changed_class,'+',changed_method) 185 | diff_set.append(add_element) 186 | add_flag = 0 187 | if minus_flag == 1: 188 | if minus_snippet: 189 | if changed_class != 'test': 190 | minus_element = add_diff_snippet(minus_snippet,changed_class,'-',changed_method) 191 | diff_set.append(minus_element) 192 | minus_flag = 0 193 | #if the file ends with a diff line 194 | if add_flag == 1: 195 | if add_snippet: 196 | if changed_class != 'test': 197 | add_element = add_diff_snippet(add_snippet,changed_class,'+',changed_method) 198 | diff_set.append(add_element) 199 | 200 | if minus_flag == 1: 201 | if minus_snippet: 202 | if changed_class != 'test': 203 | minus_element = add_diff_snippet(minus_snippet,changed_class,'-',changed_method) 204 | diff_set.append(minus_element) 205 | 206 | diff_file2.close() 207 | 208 | return code_set,diff_set 209 | 210 | def diff_selection(url,config_variable_list): 211 | 212 | diff = diff_file_parser(url) 213 | 214 | if diff: 215 | codeSet = diff[0] 216 | diffSet = diff[1] 217 | else: 218 | codeSet = 0 219 | diffSet = 0 220 | 221 | #Whether a diff touches configuration doc 222 | configDocTouched = False 223 | 224 | #Whether a diff touches configuration build 225 | configBuildTouched = False 226 | 227 | #Whether a diff touches configuration load function 228 | configLoadTouched = False 229 | 230 | #Whether a diff touches configuration set function 231 | configSetTouched = False 232 | 233 | #Whether a diff touches configuration variable 234 | configVariableTouched = False 235 | 236 | #Whether a diff touches configuration message (log, error message) 237 | configMessageTouched = False 238 | 239 | #the set of touched configuration build function 240 | touchedBuildFunc = [] 241 | 242 | #the set of touched configuration load function 243 | touchedLoadFunc = [] 244 | 245 | #the set of touched configuration set function 246 | touchedSetFunc = [] 247 | 248 |
#the set of touched configuration variable 249 | touchedVariable = [] 250 | 251 | #the set of touched configuration message 252 | touchedMessage = [] 253 | 254 | if codeSet and diffSet: 255 | 256 | #collect configuration variables in code snippet(not diff snippet) 257 | for codeElement in codeSet: 258 | 259 | #collect variables that are assigned by Spark params 260 | configAssignObj = re.findall(SPARK_CONFIG_ASSIGN_RE,codeElement.code_snippet,re.M | re.I) 261 | if configAssignObj: 262 | for assignObj in configAssignObj: 263 | collect_config_variable(assignObj,codeElement,config_variable_list) 264 | 265 | #collect variables that are assigned by system properties 266 | sysParamAssignObj = re.findall(SPARK_SYS_PARAM_ASSIGN_FUNC_RE,codeElement.code_snippet,re.M | re.I) 267 | if sysParamAssignObj: 268 | for assignObj in sysParamAssignObj: 269 | collect_config_variable(assignObj,codeElement,config_variable_list) 270 | 271 | #collect variables that are assigned by SQL params 272 | SQLparamAssignObj = re.findall(SQL_CONFIG_ASSIGN_RE,codeElement.code_snippet,re.M | re.I) 273 | if SQLparamAssignObj: 274 | for assignObj in SQLparamAssignObj: 275 | collect_config_variable(assignObj,codeElement,config_variable_list) 276 | 277 | #identify whether the diffs touch configuration-related parts 278 | for diffElement in diffSet: 279 | 280 | #check whether diff touches config doc 281 | if SPARK_CONFIG_DOC_RE == diffElement.diff_class: 282 | configDocTouched = True 283 | 284 | #check whether diff touches config build function 285 | configBuildObj = re.findall(SPARK_CONFIG_BUILD_RE,diffElement.diff_snippet,re.M | re.I) 286 | if configBuildObj: 287 | configBuildTouched = True 288 | for buildObj in configBuildObj: 289 | touchedBuildFunc.append(diffElement.diff_change_mode + buildObj) 290 | 291 | #check whether diff touches spark config load function 292 | configLoadObj = re.findall(SPARK_CONFIG_LOAD_FUNC_RE,diffElement.diff_snippet,re.M | re.I) 293 | if configLoadObj: 294 | for loadObj in configLoadObj: 295 | loadObjStr = diffElement.diff_change_mode + loadObj.replace(' ','').replace('\n','') 296 | if diffElement.diff_change_mode == '+': 297 | reverseMode = '-' 298 | else: 299 | reverseMode = '+' 300 | reverseFlag = False 301 | for Func in touchedLoadFunc: 302 | if Func == reverseMode + loadObj.replace(' ','').replace('\n',''): 303 | touchedLoadFunc.remove(Func) 304 | reverseFlag = True 305 | break 306 | if reverseFlag == False: 307 | touchedLoadFunc.append(diffElement.diff_change_mode + loadObj.replace(' ','').replace('\n','')) 308 | 309 | #check whether diff touches SQL config load function 310 | SQLconfigLoadObj = re.findall(SQL_CONFIG_LOAD_FUNC_RE,diffElement.diff_snippet,re.M | re.I) 311 | if SQLconfigLoadObj: 312 | for loadObj in SQLconfigLoadObj: 313 | loadObjStr = diffElement.diff_change_mode + loadObj.replace(' ','').replace('\n','') 314 | if diffElement.diff_change_mode == '+': 315 | reverseMode = '-' 316 | else: 317 | reverseMode = '+' 318 | reverseFlag = False 319 | for Func in touchedLoadFunc: 320 | if Func == reverseMode + loadObj.replace(' ','').replace('\n',''): 321 | touchedLoadFunc.remove(Func) 322 | reverseFlag = True 323 | break 324 | if reverseFlag == False: 325 | touchedLoadFunc.append(diffElement.diff_change_mode + loadObj.replace(' ','').replace('\n','')) 326 | 327 | #check whether diff touches system parameters load function 328 | sysParamLoadObj = re.findall(SPARK_SYS_PARAM_LOAD_RE,diffElement.diff_snippet,re.M | re.I) 329 | if sysParamLoadObj: 330 | for loadObj in sysParamLoadObj: 331 | loadObjStr =
diffElement.diff_change_mode + loadObj.replace(' ','').replace('\n','') 332 | if diffElement.diff_change_mode == '+': 333 | reverseMode = '-' 334 | else: 335 | reverseMode = '+' 336 | reverseFlag = False 337 | for Func in touchedLoadFunc: 338 | if Func == reverseMode + loadObj.replace(' ','').replace('\n',''): 339 | touchedLoadFunc.remove(Func) 340 | reverseFlag = True 341 | break 342 | if reverseFlag == False: 343 | touchedLoadFunc.append(diffElement.diff_change_mode + loadObj.replace(' ','').replace('\n','')) 344 | 345 | #check whether diff touches config set function 346 | configSetObj = re.findall(SPARK_CONFIG_SET_FUNC_RE,diffElement.diff_snippet,re.M | re.I) 347 | if configSetObj: 348 | for setObj in configSetObj: 349 | setObjStr = diffElement.diff_change_mode + setObj.replace(' ','').replace('\n','') 350 | if diffElement.diff_change_mode == '+': 351 | reverseMode = '-' 352 | else: 353 | reverseMode = '+' 354 | reverseFlag = False 355 | for Func in touchedSetFunc: 356 | if Func == reverseMode + setObj.replace(' ','').replace('\n',''): 357 | touchedSetFunc.remove(Func) 358 | reverseFlag = True 359 | break 360 | if reverseFlag == False: 361 | touchedSetFunc.append(diffElement.diff_change_mode + setObj.replace(' ','').replace('\n','')) 362 | 363 | sysParamSetObj = re.findall(SPARK_SYS_PARAM_SET_FUNC_RE,diffElement.diff_snippet,re.M | re.I) 364 | if sysParamSetObj: 365 | for setObj in sysParamSetObj: 366 | setObjStr = diffElement.diff_change_mode + setObj.replace(' ','').replace('\n','') 367 | if diffElement.diff_change_mode == '+': 368 | reverseMode = '-' 369 | else: 370 | reverseMode = '+' 371 | reverseFlag = False 372 | for Func in touchedSetFunc: 373 | if Func == reverseMode + setObj.replace(' ','').replace('\n',''): 374 | touchedSetFunc.remove(Func) 375 | reverseFlag = True 376 | break 377 | if reverseFlag == False: 378 | touchedSetFunc.append(diffElement.diff_change_mode + setObj.replace(' ','').replace('\n','')) 379 | 380 | #check whether diff touches config related Variable 381 | for Variable in config_variable_list: 382 | if Variable.variable_name in diffElement.diff_snippet and Variable.variable_class == diffElement.diff_class: 383 | variableStr = diffElement.diff_change_mode + Variable.variable_name + ' ' + Variable.variable_class 384 | if diffElement.diff_change_mode == '+': 385 | reverseMode = '-' 386 | else: 387 | reverseMode = '+' 388 | reverseFlag = False 389 | for var in touchedVariable: 390 | if var == reverseMode + Variable.variable_name + ' ' + Variable.variable_class: 391 | touchedVariable.remove(var) 392 | reverseFlag = True 393 | break 394 | if reverseFlag == False: 395 | touchedVariable.append(diffElement.diff_change_mode + Variable.variable_name + ' ' + Variable.variable_class) 396 | 397 | #check whether diff touches configuration message 398 | messageObj = re.findall(MESSAGE_RE,diffElement.diff_snippet,re.M | re.I) 399 | if messageObj: 400 | for messages in messageObj: 401 | messages = messages.split('"') 402 | for message in messages: 403 | words = message.lower().split(" ") 404 | if len(words) > 3: 405 | if 'option' in words or 'parameter' in words or 'config' in message.lower(): 406 | messageStr = diffElement.diff_change_mode + ' ' + message 407 | if diffElement.diff_change_mode == '+': 408 | reverseMode = '-' 409 | else: 410 | reverseMode = '+' 411 | reverseFlag = False 412 | for msg in touchedMessage: 413 | if msg == reverseMode + ' ' + message: 414 | touchedMessage.remove(msg) 415 | reverseFlag = True 416 | break 417 | if reverseFlag == False: 418 | 
touchedMessage.append(messageStr) 419 | 420 | if touchedLoadFunc != []: 421 | configLoadTouched = True 422 | 423 | if touchedSetFunc != []: 424 | configSetTouched = True 425 | 426 | if touchedVariable != []: 427 | configVariableTouched = True 428 | 429 | if touchedMessage != []: 430 | configMessageTouched = True 431 | 432 | 433 | return configDocTouched,configBuildTouched,configLoadTouched,configSetTouched,configVariableTouched,configMessageTouched,touchedBuildFunc,touchedLoadFunc,touchedSetFunc,touchedVariable,touchedMessage 434 | 435 | else: 436 | return False 437 | 438 | 439 | 440 | 441 | --------------------------------------------------------------------------------
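Taken together, each variant follows the same two-stage pipeline: diff_file_parser turns a downloaded .diff into (codeSet, diffSet), and the selection routine maps those onto configuration-touch flags plus the touched items themselves. A hedged end-to-end sketch for the Spark variant (the commit_info.txt name follows the repository's commit_selection.py; the one-.diff-per-SHA file layout is our assumption):

import diff_file_parser

config_variable_list = []  # grows as config-assigned variables are discovered
config_commits = []
for sha in open('commit_info.txt'):  # one commit SHA per line
    sha = sha.strip('\n')
    result = diff_file_parser.diff_selection(sha + '.diff', config_variable_list)
    if result and any(result[:6]):  # the first six entries are the *Touched flags
        config_commits.append(sha)
print(len(config_commits), 'configuration-related commits')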