├── results
│   ├── defects4j
│   │   ├── hist_diff-1.png
│   │   ├── hist_file-1.png
│   │   └── hist_loc-1.png
│   ├── found_bugs_all_tools-1.png
│   └── README.md
├── scripts
│   ├── execute_warnings_based_approach.sh
│   ├── prepare_checkers_output.sh
│   ├── execute_diff_based_approach.sh
│   ├── do_study.sh
│   ├── config.sh
│   ├── run_checkers_on_defects4j.sh
│   ├── download_defects4j.sh
│   ├── download_static_checkers.sh
│   └── README.md
├── LICENSE
├── python
│   ├── CompareBugToFixErrorprone.py
│   ├── CompareBugToFixInfer.py
│   ├── CompareBugToFixSpotbugs.py
│   ├── CompareDiffsToErrorprone.py
│   ├── CompareDiffsToSpotbugs.py
│   ├── CompareDiffsToInfer.py
│   ├── ParseAndSerializeInfer.py
│   ├── TryAllCompileD4J.py
│   ├── ComputeStatsOnSpotbugsOutput.py
│   ├── ComputeStatsOnErrorproneOutput.py
│   ├── ComputeStatsOnInferOutput.py
│   ├── RunErrorprone.py
│   ├── CheckoutD4j.py
│   ├── RunSpotbugs.py
│   ├── ParseAndSerializeErrorprone.py
│   ├── ParseAndSerializeSpotbugs.py
│   ├── ExtractAndSerializeDiffs.py
│   ├── RunInfer.py
│   ├── ComputeStatsOnD4J.py
│   ├── Util.py
│   └── ComputeStatsOnD4JToolsResults.py
├── .gitignore
└── README.md

/results/defects4j/hist_diff-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sola-da/StaticBugCheckers/HEAD/results/defects4j/hist_diff-1.png
--------------------------------------------------------------------------------
/results/defects4j/hist_file-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sola-da/StaticBugCheckers/HEAD/results/defects4j/hist_file-1.png
--------------------------------------------------------------------------------
/results/defects4j/hist_loc-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sola-da/StaticBugCheckers/HEAD/results/defects4j/hist_loc-1.png
--------------------------------------------------------------------------------
/results/found_bugs_all_tools-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sola-da/StaticBugCheckers/HEAD/results/found_bugs_all_tools-1.png
--------------------------------------------------------------------------------
/scripts/execute_warnings_based_approach.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Author: Andrew Habib
4 | # Created on: 31 August 2018
5 | 
6 | [[ ! "${#BASH_SOURCE[@]}" -gt "1" ]] && { source ./scripts/config.sh; }
7 | 
8 | echo
9 | echo ">>> Executing the removed-warnings-based methodology: find removed (a.k.a. disappeared) warnings <<<"
10 | 
11 | (cd ${STUDY_ROOT} \
12 |     && python3 ${PY_SCRIPTS_ROOT}/CompareBugToFixErrorprone.py ${OUT_BUGGY}/ep_parsed.json ${OUT_FIXED}/ep_parsed.json \
13 |     && python3 ${PY_SCRIPTS_ROOT}/CompareBugToFixInfer.py ${OUT_BUGGY}/inf_parsed.json ${OUT_FIXED}/inf_parsed.json \
14 |     && python3 ${PY_SCRIPTS_ROOT}/CompareBugToFixSpotbugs.py ${OUT_BUGGY}/sb_parsed.json ${OUT_FIXED}/sb_parsed.json \
15 | )
16 | 
--------------------------------------------------------------------------------
/scripts/prepare_checkers_output.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Author: Andrew Habib
4 | # Created on: 31 August 2018
5 | 
6 | [[ ! "${#BASH_SOURCE[@]}" -gt "1" ]] && { source ./scripts/config.sh; }
7 | 
8 | echo
9 | echo ">>> Parsing and serializing output from the static checkers <<<"
10 | 
11 | (cd $OUT_BUGGY \
12 |     && python3 ${PY_SCRIPTS_ROOT}/ParseAndSerializeErrorprone.py ep_output/ \
13 |     && python3 ${PY_SCRIPTS_ROOT}/ParseAndSerializeInfer.py inf_output_json/ \
14 |     && python3 ${PY_SCRIPTS_ROOT}/ParseAndSerializeSpotbugs.py sb_output/ \
15 | )
16 | 
17 | (cd $OUT_FIXED \
18 |     && python3 ${PY_SCRIPTS_ROOT}/ParseAndSerializeErrorprone.py ep_output/ \
19 |     && python3 ${PY_SCRIPTS_ROOT}/ParseAndSerializeInfer.py inf_output_json/ \
20 |     && python3 ${PY_SCRIPTS_ROOT}/ParseAndSerializeSpotbugs.py sb_output/ \
21 | )
22 | 
--------------------------------------------------------------------------------
/scripts/execute_diff_based_approach.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Author: Andrew Habib
4 | # Created on: 31 August 2018
5 | 
6 | [[ ! "${#BASH_SOURCE[@]}" -gt "1" ]] && { source ./scripts/config.sh; }
7 | 
8 | echo
9 | echo ">>> Computing diffs between buggy and fixed versions in the Defects4J <<<"
10 | 
11 | (cd ${STUDY_ROOT} \
12 |     && python3 ${PY_SCRIPTS_ROOT}/ExtractAndSerializeDiffs.py $D4J_BUGGY $D4J_FIXED > /dev/null 2>&1 \
13 | )
14 | 
15 | echo
16 | echo ">>> Executing the diff-based methodology: intersect diffs with flagged lines <<<"
17 | 
18 | (cd ${STUDY_ROOT} \
19 |     && python3 ${PY_SCRIPTS_ROOT}/CompareDiffsToErrorprone.py $DIFFS_FILE ${OUT_BUGGY}/ep_parsed.json \
20 |     && python3 ${PY_SCRIPTS_ROOT}/CompareDiffsToInfer.py $DIFFS_FILE ${OUT_BUGGY}/inf_parsed.json \
21 |     && python3 ${PY_SCRIPTS_ROOT}/CompareDiffsToSpotbugs.py $DIFFS_FILE ${OUT_BUGGY}/sb_parsed.json \
22 | )
23 | 
--------------------------------------------------------------------------------
/scripts/do_study.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Author: Andrew Habib
4 | # Created on: 31 August 2018
5 | 
6 | source ./scripts/config.sh
7 | 
8 | echo
9 | echo "**********************************************************************************************"
10 | echo "Running the study of static bug checkers and their effectiveness by Habib and Pradel [ASE2018]"
11 | echo "**********************************************************************************************"
12 | 
13 | bash ./scripts/download_static_checkers.sh
14 | 
15 | bash ./scripts/download_defects4j.sh
16 | 
17 | bash ./scripts/run_checkers_on_defects4j.sh
18 | 
19 | bash ./scripts/prepare_checkers_output.sh
20 | 
21 | bash ./scripts/execute_diff_based_approach.sh
22 | 
23 | bash ./scripts/execute_warnings_based_approach.sh
24 | 
25 | echo
26 | echo "*************************"
27 | echo "Done performing the study"
28 | echo "*************************"
29 | echo
30 | echo "Check the results in:"
31 | echo $STUDY_ROOT
32 | echo
33 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2018 Software Lab at TU Darmstadt
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/scripts/config.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Author: Andrew Habib
4 | # Created on: 31 August 2018
5 | 
6 | # Defining various paths used by our scripts
7 | 
8 | export CHECKERS_ROOT="$PWD/static-checkers"
9 | 
10 | export D4J_ROOT="$PWD/defects4j"
11 | export D4J_BUGGY="${D4J_ROOT}/projects/b"
12 | export D4J_FIXED="${D4J_ROOT}/projects/f"
13 | 
14 | export PY_SCRIPTS_ROOT="$PWD/python"
15 | 
16 | export STUDY_ROOT="$PWD/study"
17 | export OUT_BUGGY="${STUDY_ROOT}/output-buggy"
18 | export OUT_FIXED="${STUDY_ROOT}/output-fixed"
19 | export DIFFS_FILE="${STUDY_ROOT}/diffs_parsed.json"
20 | 
21 | # Find out which OS we are running on
22 | unameOut="$(uname -s)"
23 | case "${unameOut}" in
24 |     Linux*)     MACHINE=Linux;;
25 |     Darwin*)    MACHINE=Mac;;
26 |     CYGWIN*)    MACHINE=Cygwin;;
27 |     MINGW*)     MACHINE=MinGw;;
28 |     *)          MACHINE="UNKNOWN:${unameOut}"
29 | esac
30 | export MACHINE
31 | 
32 | # Use 2/3 of the number of available cores
33 | # Ubuntu-like OS:
34 | if [ $MACHINE = "Linux" ]; then
35 |     JOBS=$((`grep -c ^processor /proc/cpuinfo` * 2/3))
36 | elif [ $MACHINE = "Mac" ]; then
37 |     JOBS=$((`sysctl -n hw.ncpu` * 2/3))
38 | else
39 |     JOBS=`python -c 'import multiprocessing as mp; print(mp.cpu_count() * 2 // 3)'`
40 | fi
41 | export JOBS
--------------------------------------------------------------------------------
/python/CompareBugToFixErrorprone.py:
--------------------------------------------------------------------------------
1 | '''
2 | 
3 | Created on Nov. 30, 2017
4 | 
5 | @author Andrew Habib
6 | 
7 | '''
8 | 
9 | import json
10 | import os
11 | import sys
12 | 
13 | from Util import load_parsed_ep, CustomEncoder
14 | 
15 | 
16 | def match_ep_msg_no_lines(msg, msgs):
17 |     for msg2 in msgs:
18 |         if (msg.proj == msg2.proj and msg.cls == msg2.cls and
19 |                 msg.typ == msg2.typ and msg.cat == msg2.cat and
20 |                 msg.msg == msg2.msg and msg.code == msg2.code):
21 | 
22 |             return True
23 | 
24 |     return False
25 | 
26 | 
27 | def get_removed_warnings_ep(ep_b, ep_f):
28 |     removed_warnings = []
29 |     for b_msg in ep_b:
30 |         if not match_ep_msg_no_lines(b_msg, ep_f):
31 |             removed_warnings.append(b_msg)
32 | 
33 |     return removed_warnings
34 | 
35 | 
36 | if __name__ == '__main__':
37 | 
38 |     """Get errors/warnings that disappeared in fixed versions"""
39 | 
40 |     ep_file = os.path.join(os.getcwd(), sys.argv[1])
41 |     ep_res_b = load_parsed_ep(ep_file)
42 | 
43 |     ep_file = os.path.join(os.getcwd(), sys.argv[2])
44 |     ep_res_f = load_parsed_ep(ep_file)
45 | 
46 |     warnings = get_removed_warnings_ep(ep_res_b, ep_res_f)
47 | 
48 |     output_file_name = "ep_removed_warnings.json"
49 |     with open(output_file_name, "w") as file:
50 |         json.dump(warnings, file, cls=CustomEncoder, indent=4)
51 | 
--------------------------------------------------------------------------------
/python/CompareBugToFixInfer.py:
--------------------------------------------------------------------------------
1 | '''
2 | 
3 | Created on Jan. 4, 2018
4 | 
5 | @author Andrew Habib
6 | 
7 | '''
8 | 
9 | import json
10 | import os
11 | import sys
12 | 
13 | from Util import load_parsed_inf, CustomEncoder
14 | 
15 | 
16 | def match_inf_msg_no_lines(msg, msgs):
17 |     for msg2 in msgs:
18 |         if (msg.proj == msg2.proj and msg.cls == msg2.cls and
19 |                 msg.bug_class == msg2.bug_class and msg.kind == msg2.kind and
20 |                 msg.bug_type == msg2.bug_type and msg.severity == msg2.severity and
21 |                 msg.visibility == msg2.visibility and msg.procedure == msg2.procedure):
22 | 
23 |             return True
24 | 
25 |     return False
26 | 
27 | 
28 | def get_removed_warnings_inf(inf_b, inf_f):
29 |     removed_warnings = []
30 |     for b_msg in inf_b:
31 |         if not match_inf_msg_no_lines(b_msg, inf_f):
32 |             removed_warnings.append(b_msg)
33 | 
34 |     return removed_warnings
35 | 
36 | 
37 | if __name__ == '__main__':
38 | 
39 |     """Get errors/warnings that disappeared in fixed versions"""
40 | 
41 |     inf_file = os.path.join(os.getcwd(), sys.argv[1])
42 |     inf_res_b = load_parsed_inf(inf_file)
43 | 
44 |     inf_file = os.path.join(os.getcwd(), sys.argv[2])
45 |     inf_res_f = load_parsed_inf(inf_file)
46 | 
47 |     warnings = get_removed_warnings_inf(inf_res_b, inf_res_f)
48 | 
49 |     output_file_name = "inf_removed_warnings.json"
50 |     with open(output_file_name, "w") as file:
51 |         json.dump(warnings, file, cls=CustomEncoder, indent=4)
52 | 
--------------------------------------------------------------------------------
/python/CompareBugToFixSpotbugs.py:
--------------------------------------------------------------------------------
1 | '''
2 | 
3 | Created on Nov. 30, 2017
4 | 
5 | @author Andrew Habib
6 | 
7 | '''
8 | 
9 | import json
10 | import os
11 | import sys
12 | 
13 | from Util import load_parsed_sb, CustomEncoder
14 | 
15 | 
16 | def match_sb_msg_no_lines(msg, msgs):
17 |     for msg2 in msgs:
18 |         if (msg.proj == msg2.proj and msg.cls == msg2.cls and
19 |                 msg.cat == msg2.cat and msg.abbrev == msg2.abbrev and
20 |                 msg.typ == msg2.typ and msg.prio == msg2.prio and
21 |                 msg.rank == msg2.rank and msg.msg == msg2.msg and
22 |                 msg.mth == msg2.mth and msg.field == msg2.field):
23 | 
24 |             return True
25 | 
26 |     return False
27 | 
28 | 
29 | def get_removed_warnings_sb(sb_b, sb_f):
30 |     removed_warnings = []
31 |     for b_msg in sb_b:
32 |         if not match_sb_msg_no_lines(b_msg, sb_f):
33 |             removed_warnings.append(b_msg)
34 | 
35 |     return removed_warnings
36 | 
37 | 
38 | if __name__ == '__main__':
39 | 
40 |     """Get errors/warnings that disappeared in fixed versions"""
41 | 
42 |     sb_file = os.path.join(os.getcwd(), sys.argv[1])
43 |     sb_res_b = load_parsed_sb(sb_file)
44 | 
45 |     sb_file = os.path.join(os.getcwd(), sys.argv[2])
46 |     sb_res_f = load_parsed_sb(sb_file)
47 | 
48 |     warnings = get_removed_warnings_sb(sb_res_b, sb_res_f)
49 | 
50 |     output_file_name = "sb_removed_warnings.json"
51 |     with open(output_file_name, "w") as file:
52 |         json.dump(warnings, file, cls=CustomEncoder, indent=4)
53 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 | 
6 | # C extensions
7 | *.so
8 | 
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 | 
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 | 
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 | 
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 | 
50 | # Translations
51 | *.mo
52 | *.pot
53 | 
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 | 
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 | 
63 | # Scrapy stuff:
64 | .scrapy
65 | 
66 | # Sphinx documentation
67 | docs/_build/
68 | 
69 | # PyBuilder
70 | target/
71 | 
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 | 
75 | # pyenv
76 | .python-version
77 | 
78 | # celery beat schedule file
79 | celerybeat-schedule
80 | 
81 | # SageMath parsed files
82 | *.sage.py
83 | 
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 | 
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 | 
97 | # Rope project settings
98 | .ropeproject
99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 
--------------------------------------------------------------------------------
/python/CompareDiffsToErrorprone.py:
--------------------------------------------------------------------------------
1 | '''
2 | 
3 | Created on Nov. 30, 2017
4 | 
5 | @author Andrew Habib
6 | 
7 | '''
8 | 
9 | import json
10 | import os
11 | import sys
12 | 
13 | from Util import load_parsed_diffs, load_parsed_ep, find_msg_by_proj_and_cls, \
14 |     LineMatchesToMessages, CustomEncoder
15 | 
16 | 
17 | def match_diff_ep(d, ep_list):
18 |     matches = []
19 |     lines_matches = []
20 |     for inst in ep_list:
21 |         if inst.line in d.lines:
22 |             matches.append(inst)
23 |             lines_matches.append(inst.line)
24 |     return matches, set(lines_matches)
25 | 
26 | 
27 | def get_hits_diffs_ep(diffs, ep_res_set):
28 |     ep_count = 0
29 |     ep_all_matches = []
30 |     diffs_match_ep = []
31 |     for d in diffs:
32 | 
33 |         proj = d.proj
34 |         cls = d.cls
35 | 
36 |         ep_list = find_msg_by_proj_and_cls(proj, cls, ep_res_set)
37 |         diff_ep, lines = match_diff_ep(d, ep_list)
38 |         if diff_ep:
39 | 
40 |             ep_count += len(diff_ep)
41 |             ep_all_matches.append(LineMatchesToMessages(lines, diff_ep))
42 |             diffs_match_ep.extend(diff_ep)
43 |     # print(ep_count)
44 | 
45 |     # return ep_all_matches
46 |     return diffs_match_ep
47 | 
48 | 
49 | if __name__ == '__main__':
50 | 
51 |     """Get lines matches between each tool and bug fixes diffs"""
52 | 
53 |     diffs_file = os.path.join(os.getcwd(), sys.argv[1])
54 |     diffs = load_parsed_diffs(diffs_file)
55 | 
56 |     ep_file = os.path.join(os.getcwd(), sys.argv[2])
57 |     ep_res_set = load_parsed_ep(ep_file)
58 | 
59 |     diffs_ep = get_hits_diffs_ep(diffs, ep_res_set)
60 | 
61 |     output_file_name = "ep_diffs_warnings.json"
62 |     with open(output_file_name, "w") as file:
63 |         json.dump(diffs_ep, file, cls=CustomEncoder, indent=4)
64 | 
--------------------------------------------------------------------------------
/python/CompareDiffsToSpotbugs.py:
--------------------------------------------------------------------------------
1 | '''
2 | 
3 | Created on Nov. 30, 2017
4 | 
5 | @author Andrew Habib
6 | 
7 | '''
8 | 
9 | import json
10 | import os
11 | import sys
12 | 
13 | from Util import load_parsed_diffs, load_parsed_sb, find_msg_by_proj_and_cls, \
14 |     LineMatchesToMessages, CustomEncoder
15 | 
16 | 
17 | def match_diff_sb(d, sb_list):
18 |     matches = []
19 |     lines_matches = []
20 |     for inst in sb_list:
21 |         sb_lines = inst.unrollLines()
22 |         if d.lines.intersection(sb_lines):
23 |             matches.append(inst)
24 |             lines_matches.extend(d.lines.intersection(sb_lines))
25 |     return matches, set(lines_matches)
26 | 
27 | 
28 | def get_hits_diffs_sb(diffs, sb_res):
29 |     sb_count = 0
30 |     sb_all_matches = []
31 |     diffs_match_sb = []
32 |     for d in diffs:
33 | 
34 |         proj = d.proj
35 |         cls = d.cls
36 | 
37 |         sb_list = find_msg_by_proj_and_cls(proj, cls, sb_res)
38 |         diff_sb, lines = match_diff_sb(d, sb_list)
39 |         if diff_sb:
40 |             sb_count += len(diff_sb)
41 |             sb_all_matches.append(LineMatchesToMessages(lines, diff_sb))
42 |             diffs_match_sb.extend(diff_sb)
43 | 
44 |     # print(sb_count)
45 | 
46 |     # return sb_all_matches
47 |     return diffs_match_sb
48 | 
49 | 
50 | if __name__ == '__main__':
51 | 
52 |     """Get lines matches between each tool and bug fixes diffs"""
53 | 
54 |     diffs_file = os.path.join(os.getcwd(), sys.argv[1])
55 |     diffs = load_parsed_diffs(diffs_file)
56 | 
57 |     sb_file = os.path.join(os.getcwd(), sys.argv[2])
58 |     sb_res = load_parsed_sb(sb_file)
59 | 
60 |     diffs_sb = get_hits_diffs_sb(diffs, sb_res)
61 | 
62 |     output_file_name = "sb_diffs_warnings.json"
63 |     with open(output_file_name, "w") as file:
64 |         json.dump(diffs_sb, file, cls=CustomEncoder, indent=4)
65 | 
--------------------------------------------------------------------------------
/python/CompareDiffsToInfer.py:
--------------------------------------------------------------------------------
1 | '''
2 | 
3 | Created on Jan. 4, 2018
4 | 
5 | @author Andrew Habib
6 | 
7 | '''
8 | 
9 | import json
10 | import os
11 | import sys
12 | 
13 | from Util import load_parsed_diffs, load_parsed_inf, find_msg_by_proj_and_cls, \
14 |     LineMatchesToMessages, CustomEncoder
15 | 
16 | 
17 | def match_diff_inf(d, inf_list):
18 |     matches = []
19 |     lines_matches = []
20 |     for inst in inf_list:
21 |         for l in inst.lines:
22 |             if l in d.lines:
23 |                 matches.append(inst)
24 |                 lines_matches.append(l)
25 |                 break
26 |     return matches, set(lines_matches)
27 | 
28 | 
29 | def get_hits_diffs_inf(diffs, inf_res):
30 |     inf_count = 0
31 |     inf_all_matches = []
32 |     diffs_match_inf = []
33 |     for d in diffs:
34 | 
35 |         proj = d.proj
36 |         cls = d.cls
37 | 
38 |         inf_list = find_msg_by_proj_and_cls(proj, cls, inf_res)
39 |         diff_inf, lines = match_diff_inf(d, inf_list)
40 |         if diff_inf:
41 |             inf_count += len(diff_inf)
42 |             inf_all_matches.append(LineMatchesToMessages(lines, diff_inf))
43 |             diffs_match_inf.extend(diff_inf)
44 | 
45 |     # print(inf_count)
46 | 
47 |     # return inf_all_matches
48 |     return diffs_match_inf
49 | 
50 | 
51 | if __name__ == '__main__':
52 | 
53 |     """Get lines matches between each tool and bug fixes diffs"""
54 | 
55 |     diffs_file = os.path.join(os.getcwd(), sys.argv[1])
56 |     diffs = load_parsed_diffs(diffs_file)
57 | 
58 |     inf_file = os.path.join(os.getcwd(), sys.argv[2])
59 |     inf_res = load_parsed_inf(inf_file)
60 | 
61 |     diffs_inf = get_hits_diffs_inf(diffs, inf_res)
62 | 
63 |     output_file_name = "inf_diffs_warnings.json"
64 |     with open(output_file_name, "w") as file:
65 |         json.dump(diffs_inf, file, cls=CustomEncoder, indent=4)
66 | 
--------------------------------------------------------------------------------
/scripts/run_checkers_on_defects4j.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Author: Andrew Habib
4 | # Created on: 31 August 2018
5 | 
6 | [[ ! "${#BASH_SOURCE[@]}" -gt "1" ]] && { source ./scripts/config.sh; }
7 | 
8 | if [ $MACHINE = "Linux" ]; then
9 |     get_abs_path() {
10 |         readlink -f $1
11 |     }
12 |     INF_BIN="$(get_abs_path `find ${CHECKERS_ROOT} -maxdepth 1 -type d -name infer-linux*`)/bin/infer"
13 | elif [ $MACHINE = "Mac" ]; then
14 |     get_abs_path() {
15 |         greadlink -f $1
16 |     }
17 |     INF_BIN="$(get_abs_path `find ${CHECKERS_ROOT} -maxdepth 1 -type d -name infer-osx*`)/bin/infer"
18 | else
19 |     echo "Reporting from script: $(basename $BASH_SOURCE) at line: $LINENO"
20 |     echo "Not sure how to get the absolute path of the static analyzers given your OS: $MACHINE."
21 |     echo "Add your OS equivalent of 'readlink -f' here"
22 |     echo "Will exit..."
23 |     exit 1
24 | fi
25 | EP_BIN=$(get_abs_path `find ${CHECKERS_ROOT} -name error_prone*.jar`)
26 | SB_BIN="$(get_abs_path `find ${CHECKERS_ROOT} -maxdepth 1 -type d -name spotbugs*`)/lib/spotbugs.jar"
27 | 
28 | if [ -d ${STUDY_ROOT} ]; then rm -rf ${STUDY_ROOT}; fi
29 | mkdir -p $OUT_BUGGY && mkdir -p $OUT_FIXED
30 | 
31 | echo
32 | echo ">>> Running the static checkers on the buggy versions from the Defects4J <<<"
33 | (cd $OUT_BUGGY \
34 |     && echo ">>> Running Error Prone" && python3 ${PY_SCRIPTS_ROOT}/RunErrorprone.py ${EP_BIN} ${D4J_BUGGY} $JOBS \
35 |     && echo ">>> Running Infer" && python3 ${PY_SCRIPTS_ROOT}/RunInfer.py ${INF_BIN} ${D4J_BUGGY} $JOBS \
36 |     && echo ">>> Running SpotBugs" && python3 ${PY_SCRIPTS_ROOT}/RunSpotbugs.py ${SB_BIN} ${D4J_BUGGY} $JOBS \
37 | )
38 | 
39 | echo
40 | echo ">>> Running the static checkers on the fixed versions from the Defects4J <<<"
41 | (cd $OUT_FIXED \
42 |     && echo ">>> Running Error Prone" && python3 ${PY_SCRIPTS_ROOT}/RunErrorprone.py ${EP_BIN} ${D4J_FIXED} $JOBS \
43 |     && echo ">>> Running Infer" && python3 ${PY_SCRIPTS_ROOT}/RunInfer.py ${INF_BIN} ${D4J_FIXED} $JOBS \
44 |     && echo ">>> Running SpotBugs" && python3 ${PY_SCRIPTS_ROOT}/RunSpotbugs.py ${SB_BIN} ${D4J_FIXED} $JOBS \
45 | )
46 | 
--------------------------------------------------------------------------------
/python/ParseAndSerializeInfer.py:
--------------------------------------------------------------------------------
1 | '''
2 | 
3 | Created on Dec. 28, 2017
4 | 
5 | @author Andrew Habib
6 | 
7 | '''
8 | 
9 | import json
10 | import os
11 | import sys
12 | 
13 | from Util import JsonDataReader, InferIssue, InferMsg, CustomEncoder, \
14 |     get_cls_name_from_file_path, NO_WARNING
15 | 
16 | 
17 | def parse_infer_json_output(proj, json_issue):
18 |     # Case where report file is NOT empty
19 |     if json_issue:
20 |         issue = InferIssue(*list(json_issue[k] for k in InferIssue.keys))
21 |         cls = get_cls_name_from_file_path(issue.file)
22 |         lines = extract_lines_from_issue(issue)
23 |         infer_msg = InferMsg(proj, cls, issue.bug_class, issue.kind, issue.bug_type, issue.qualifier,
24 |                              issue.severity, issue.visibility, lines, issue.procedure)
25 |     # Case where report file is empty
26 |     else:
27 |         infer_msg = InferMsg(proj, "", "", "", NO_WARNING, "", "", "", "", "")
28 |     return infer_msg
29 | 
30 | 
31 | def extract_lines_from_issue(issue):
32 |     src_file = issue.file
33 |     trace_lines = []
34 |     trace_lines.append(issue.line)
35 |     for t in issue.bug_trace:
36 |         if t.filename == src_file:
37 |             trace_lines.append(t.line)
38 |     return set(trace_lines)
39 | 
40 | 
41 | '''
42 | Takes only one argument: path to infer json files
43 | '''
44 | 
45 | if __name__ == '__main__':
46 | 
47 |     location_to_data = os.path.join(os.getcwd(), sys.argv[1])
48 |     list_of_data = sorted(os.listdir(location_to_data))
49 | 
50 |     data_paths = list(map(lambda f: os.path.join(location_to_data, f), list_of_data))
51 | 
52 |     parsed_reports_per_project = []
53 | 
54 |     for proj, json_issue in JsonDataReader(data_paths):
55 |         msg = parse_infer_json_output(proj, json_issue)
56 |         if msg:
57 |             parsed_reports_per_project.append(msg)
58 | 
59 |     # time_stamp = time.strftime("%Y%m%d-%H%M%S")
60 |     time_stamp = ''
61 |     parsed_output_file_name = "inf_parsed" + time_stamp + ".json"
62 |     with open(parsed_output_file_name, "w") as file:
63 |         json.dump(parsed_reports_per_project, file, cls=CustomEncoder, indent=4)
64 | 
--------------------------------------------------------------------------------
/python/TryAllCompileD4J.py:
--------------------------------------------------------------------------------
1 | '''
2 | 
3 | Created on Dec. 15, 2017
4 | 
5 | @author Andrew Habib
6 | 
7 | '''
8 | 
9 | import os
10 | import subprocess
11 | import sys
12 | 
13 | from joblib import Parallel, delayed
14 | 
15 | 
16 | def try_compile(proj, path, d4j_binary):
17 |     print()
18 |     print("Trying to compile:", proj)
19 |     print()
20 | 
21 |     proj_dir = os.path.join(path, proj)
22 | 
23 |     os.chdir(proj_dir)
24 | 
25 |     # Compiling using the built-in d4j compile command.
26 |     # Moved to CheckoutD4j.py.
27 |     # cmd = [d4j_binary, 'compile']
28 |     # p = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
29 |     # p.communicate()
30 |     # print(proj, p.returncode)
31 | 
32 |     # Compiling by brute force: try the different build systems in turn
33 |     cmd1 = ['ant', 'compile']
34 |     cmd2 = ['mvn', 'compile']
35 |     cmd3 = ['gradle', 'build']
36 | 
37 |     p = subprocess.Popen(cmd1, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
38 |     p.communicate()
39 |     print(proj, p.returncode)
40 | 
41 |     p = subprocess.Popen(cmd2, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
42 |     p.communicate()
43 |     print(proj, p.returncode)
44 | 
45 |     p = subprocess.Popen(cmd3, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
46 |     p.communicate()
47 |     print(proj, p.returncode)
48 | 
49 | 
50 | if __name__ == '__main__':
51 | 
52 |     path_d4j_projects = str(sys.argv[1])
53 |     path_d4j = str(sys.argv[2])
54 |     jobs = int(sys.argv[3])
55 | 
56 |     d4j_binary = os.path.join(path_d4j, 'framework/bin/defects4j')
57 |     projects = sorted(os.listdir(path_d4j_projects))
58 | 
59 |     # Use a cmd-line filter file to debug specific projects
60 |     is_filter = False
61 |     if len(sys.argv) > 4:
62 |         is_filter = True
63 |         with open(sys.argv[4]) as file:
64 |             filter_list = file.read().splitlines()
65 |     if is_filter:
66 |         projects = sorted(list(i for i in projects if i in filter_list))
67 | 
68 |     Parallel(n_jobs=jobs)(delayed(try_compile)
69 |                           (p, path_d4j_projects, d4j_binary)
70 |                           for p in projects)
71 | 
--------------------------------------------------------------------------------
/python/ComputeStatsOnSpotbugsOutput.py:
--------------------------------------------------------------------------------
1 | '''
2 | 
3 | Created on Nov. 30, 2017
4 | 
5 | @author Andrew Habib
6 | 
7 | '''
8 | 
9 | import os
10 | import sys
11 | 
12 | from collections import Counter, OrderedDict
13 | from statistics import mean, median
14 | 
15 | from Util import load_parsed_sb, NO_WARNING
16 | 
17 | if __name__ == '__main__':
18 | 
19 |     sb_file = os.path.join(os.getcwd(), sys.argv[1])
20 |     sb_res = load_parsed_sb(sb_file)
21 | 
22 |     proj_to_msg_count = {}
23 |     categories = []
24 |     for msg in sb_res:
25 |         categories.append(msg.typ)
26 | 
27 |         if msg.typ == NO_WARNING:
28 |             proj_to_msg_count[msg.proj] = 0
29 |         else:
30 |             try:
31 |                 proj_to_msg_count[msg.proj] += 1
32 |             except KeyError:
33 |                 proj_to_msg_count[msg.proj] = 1
34 | 
35 |     print()
36 |     print("-------------------------")
37 |     print("Stats on warnings per bug")
38 |     print("-------------------------")
39 |     print()
40 | 
41 |     msgs_count = proj_to_msg_count.values()
42 |     print('{:>6}: {:>4}'.format("Min", min(msgs_count)))
43 |     print('{:>6}: {:>4}'.format("Max", max(msgs_count)))
44 |     print('{:>6}: {:>7.2f}'.format("Mean", mean(msgs_count)))
45 |     print('{:>6}: {:>7.2f}'.format("Median", median(msgs_count)))
46 |     print('{:>6}: {:>4}'.format("Total", sum(msgs_count)))
47 | 
48 |     categories = Counter(categories)
49 |     no_warning_count = categories[NO_WARNING]
50 |     del categories[NO_WARNING]
51 | 
52 |     topX = OrderedDict(Counter(categories).most_common(5))
53 |     max_length = max(len(i) for i in topX)
54 |     # print('{0:-<{1}}'.format('-', max_length))
55 |     # print('{0:^{1}}'.format("Stats per error category", max_length))
56 |     # print('{0:-<{1}}'.format('-', max_length))
57 |     print()
58 |     print("------------------------")
59 |     print("Stats per error category")
60 |     print("------------------------")
61 |     print()
62 | 
63 |     print("\n".join('{:>{}}: {:>4}'.format(i, max_length, j) for i, j in topX.items()))
64 |     print()
65 | 
66 |     print('{:>{}}: {:>4}'.format("Projects without warnings", max_length, no_warning_count))
67 |     print()
--------------------------------------------------------------------------------
/python/ComputeStatsOnErrorproneOutput.py:
--------------------------------------------------------------------------------
1 | '''
2 | 
3 | Created on Nov. 30, 2017
4 | 
5 | @author Andrew Habib
6 | 
7 | '''
8 | 
9 | import os
10 | import sys
11 | 
12 | from collections import Counter, OrderedDict
13 | from statistics import mean, median
14 | 
15 | from Util import load_parsed_ep, NO_WARNING
16 | 
17 | if __name__ == '__main__':
18 | 
19 |     ep_file = os.path.join(os.getcwd(), sys.argv[1])
20 |     ep_res_set = load_parsed_ep(ep_file)
21 | 
22 |     proj_to_msg_count = {}
23 |     categories = []
24 |     for msg in ep_res_set:
25 |         categories.append(msg.cat)
26 | 
27 |         if msg.cat == NO_WARNING:
28 |             proj_to_msg_count[msg.proj] = 0
29 |         else:
30 |             try:
31 |                 proj_to_msg_count[msg.proj] += 1
32 |             except KeyError:
33 |                 proj_to_msg_count[msg.proj] = 1
34 | 
35 |     print()
36 |     print("-------------------------")
37 |     print("Stats on warnings per bug")
38 |     print("-------------------------")
39 |     print()
40 | 
41 |     msgs_count = proj_to_msg_count.values()
42 |     print('{:>6}: {:>4}'.format("Min", min(msgs_count)))
43 |     print('{:>6}: {:>4}'.format("Max", max(msgs_count)))
44 |     print('{:>6}: {:>7.2f}'.format("Mean", mean(msgs_count)))
45 |     print('{:>6}: {:>7.2f}'.format("Median", median(msgs_count)))
46 |     print('{:>6}: {:>4}'.format("Total", sum(msgs_count)))
47 | 
48 |     categories = Counter(categories)
49 |     no_warning_count = categories[NO_WARNING]
50 |     del categories[NO_WARNING]
51 | 
52 |     topX = OrderedDict(Counter(categories).most_common(5))
53 |     max_length = max(len(i) for i in topX)
54 |     # print('{0:-<{1}}'.format('-', max_length))
55 |     # print('{0:^{1}}'.format("Stats per error category", max_length))
56 |     # print('{0:-<{1}}'.format('-', max_length))
57 |     print()
58 |     print("------------------------")
59 |     print("Stats per error category")
60 |     print("------------------------")
61 |     print()
62 | 
63 |     print("\n".join('{:>{}}: {:>4}'.format(i, max_length, j) for i, j in topX.items()))
64 |     print()
65 | 
66 |     print('{:>{}}: {:>4}'.format("Projects without warnings", max_length, no_warning_count))
67 |     print()
68 | 
--------------------------------------------------------------------------------
/python/ComputeStatsOnInferOutput.py:
--------------------------------------------------------------------------------
1 | '''
2 | 
3 | Created on Jan. 23, 2018
4 | 
5 | @author Andrew Habib
6 | 
7 | '''
8 | 
9 | import os
10 | import sys
11 | 
12 | from collections import Counter, OrderedDict
13 | from statistics import mean, median
14 | 
15 | from Util import load_parsed_inf, NO_WARNING
16 | 
17 | if __name__ == '__main__':
18 | 
19 |     inf_file = os.path.join(os.getcwd(), sys.argv[1])
20 |     inf_res = load_parsed_inf(inf_file)
21 | 
22 |     proj_to_msg_count = {}
23 |     categories = []
24 |     for msg in inf_res:
25 |         categories.append(msg.bug_type)
26 | 
27 |         if msg.bug_type == NO_WARNING:
28 |             proj_to_msg_count[msg.proj] = 0
29 |         else:
30 |             try:
31 |                 proj_to_msg_count[msg.proj] += 1
32 |             except KeyError:
33 |                 proj_to_msg_count[msg.proj] = 1
34 | 
35 |     print()
36 |     print("-------------------------")
37 |     print("Stats on warnings per bug")
38 |     print("-------------------------")
39 |     print()
40 | 
41 |     msgs_count = proj_to_msg_count.values()
42 |     print('{:>6}: {:>4}'.format("Min", min(msgs_count)))
43 |     print('{:>6}: {:>4}'.format("Max", max(msgs_count)))
44 |     print('{:>6}: {:>7.2f}'.format("Mean", mean(msgs_count)))
45 |     print('{:>6}: {:>7.2f}'.format("Median", median(msgs_count)))
46 |     print('{:>6}: {:>4}'.format("Total", sum(msgs_count)))
47 | 
48 |     categories = Counter(categories)
49 |     no_warning_count = categories[NO_WARNING]
50 |     del categories[NO_WARNING]
51 | 
52 |     topX = OrderedDict(Counter(categories).most_common(5))
53 |     max_length = max(len(i) for i in topX) + 1
54 |     # print('{0:-<{1}}'.format('-', max_length))
55 |     # print('{0:^{1}}'.format("Stats per error category", max_length))
56 |     # print('{0:-<{1}}'.format('-', max_length))
57 |     print()
58 |     print("------------------------")
59 |     print("Stats per error category")
60 |     print("------------------------")
61 |     print()
62 | 
63 |     print("\n".join('{:>{}}: {:>4}'.format(i, max_length, j) for i, j in topX.items()))
64 |     print()
65 | 
66 |     print('{:>{}}: {:>4}'.format("Projects without warnings", max_length, no_warning_count))
67 |     print()
--------------------------------------------------------------------------------
/scripts/download_defects4j.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Author: Andrew Habib
4 | # Created on: 31 August 2018
5 | 
6 | [[ ! "${#BASH_SOURCE[@]}" -gt "1" ]] && { source ./scripts/config.sh; }
7 | 
8 | if [ -d ${D4J_ROOT} ]; then rm -rf ${D4J_ROOT}; fi
9 | echo
10 | echo ">>> Downloading the Defects4J framework <<<"
11 | 
12 | git clone -q https://github.com/rjust/defects4j.git
13 | 
14 | # This is the original pull request (pr) #112 used in the study.
15 | # However, many of the additional projects (not in the official d4j) have wrong d4j properties
16 | # and do not compile right away. For the study, we manually fixed all of these errors.
17 | # But adding our ad-hoc fixes to the pr is not easy, and it is not our goal as the pr is unofficial anyway.
18 | # One alternative is to restrict the reproduction to the original d4j;
19 | # another is to use a different, more recent (hopefully cleaner) pull request such as pr #174.
20 | 
21 | # NOTE: depending on which version is used, python/CheckoutD4j.py should be updated accordingly,
22 | # as it currently has the list of Defects4J project identifiers and their numbers of bugs encoded manually.
23 | 
24 | D4J_PR="112" # Comment out this line to use the most recent official release of the Defects4J.
25 |              # Look at the note above. Don't forget to update python/CheckoutD4j.py.
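# For reference, checking out and preparing a single bug by hand reduces to the
# same defects4j commands that python/CheckoutD4j.py issues for every project and
# version. A sketch only -- 'Lang' and bug id '1' are placeholders, not special cases:
#
#   ${D4J_ROOT}/framework/bin/defects4j checkout -p Lang -v 1b -w ${D4J_BUGGY}/Lang-1
#   (cd ${D4J_BUGGY}/Lang-1 && ${D4J_ROOT}/framework/bin/defects4j export -p classes.modified -o prop-buggy-classes)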
26 | 
27 | if [ $D4J_PR ]
28 | then
29 |     echo ">>> Switching from master to pr #${D4J_PR}"
30 |     (cd $D4J_ROOT \
31 |         && git fetch -q origin pull/${D4J_PR}/head:extendedD4J \
32 |         && git checkout -q extendedD4J \
33 |     )
34 | fi
35 | 
36 | echo ">>> Initializing the framework"
37 | echo
38 | (cd $D4J_ROOT \
39 |     && ./init.sh
40 |     # && ./init.sh > /dev/null 2>&1 \
41 | )
42 | echo
43 | 
44 | echo ">>> Checking out and compiling the dataset"
45 | echo
46 | 
47 | echo ">>> Checking out buggy versions to:"
48 | # python3 ${PY_SCRIPTS_ROOT}/CheckoutD4j.py ${D4J_ROOT} b ${JOBS} > /dev/null 2>&1
49 | python3 ${PY_SCRIPTS_ROOT}/CheckoutD4j.py ${D4J_ROOT} b ${JOBS}
50 | echo
51 | echo ">>> Checking out fixed versions to:"
52 | # python3 ${PY_SCRIPTS_ROOT}/CheckoutD4j.py ${D4J_ROOT} f ${JOBS} > /dev/null 2>&1
53 | python3 ${PY_SCRIPTS_ROOT}/CheckoutD4j.py ${D4J_ROOT} f ${JOBS}
54 | 
55 | # Hack to force compiling all d4j projects
56 | #python3 ${PY_SCRIPTS_ROOT}/TryAllCompileD4J.py ${D4J_ROOT}/b $JOBS
57 | #python3 ${PY_SCRIPTS_ROOT}/TryAllCompileD4J.py ${D4J_ROOT}/f $JOBS
58 | 
--------------------------------------------------------------------------------
/python/RunErrorprone.py:
--------------------------------------------------------------------------------
1 | '''
2 | 
3 | Created on Dec. 15, 2017
4 | 
5 | @author Andrew Habib
6 | 
7 | '''
8 | 
9 | import os
10 | import subprocess
11 | import sys
12 | 
13 | from joblib import Parallel, delayed
14 | 
15 | from Util import prepare_tool
16 | 
17 | 
18 | def run_ep_on_proj(proj, path, path_out, path_ep):
19 |     log = open(os.path.join(os.getcwd(), 'ep_log'), 'a')
20 | 
21 |     log.write("Running Errorprone on: " + proj + "\n\n")
22 | 
23 |     _, proj_cp, proj_javac_opts, proj_buggy_files, _ = prepare_tool(path, proj)
24 | 
25 |     f = open(os.path.join(path_out, proj), 'w')
26 |     for buggy_f in proj_buggy_files:
27 |         cmd = ['java', '-Xbootclasspath/p:' + path_ep,
28 |                'com.google.errorprone.ErrorProneCompiler',
29 |                '-implicit:none']
30 |         if proj_javac_opts:
31 |             cmd = cmd + proj_javac_opts.split(' ') + ['-cp', proj_cp, buggy_f]
32 |         else:
33 |             cmd = cmd + ['-cp', proj_cp, buggy_f]
34 | 
35 |         log.write(" ".join(cmd) + "\n\n")
36 | 
37 |         p = subprocess.Popen(cmd, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
38 |         (cmd_out, _) = p.communicate()
39 | 
40 |         f.write(cmd_out)
41 | 
42 |         log.write(cmd_out + "\n")
43 |         log.write("*"*24 + "\n\n")
44 | 
45 |     log.write("#"*212 + "\n\n")
46 |     log.close()
47 | 
48 | 
49 | if __name__ == '__main__':
50 |     path_ep = os.path.join(os.getcwd(), sys.argv[1])
51 |     path_d4j_projects = os.path.join(os.getcwd(), sys.argv[2])
52 |     jobs = int(sys.argv[3])
53 | 
54 |     path_out = os.path.join(os.getcwd(), 'ep_output')
55 |     if not os.path.isdir(path_out):
56 |         os.makedirs(path_out)
57 | 
58 |     projects = sorted(os.listdir(path_d4j_projects))
59 | 
60 |     # Use a cmd-line filter file to debug specific projects
61 |     is_filter = False
62 |     if len(sys.argv) > 4:
63 |         is_filter = True
64 |         with open(sys.argv[4]) as file:
65 |             filter_list = file.read().splitlines()
66 |     if is_filter:
67 |         projects = sorted(list(i for i in projects if i in filter_list))
68 | 
69 |     Parallel(n_jobs=jobs)(delayed(run_ep_on_proj)
70 |                           (p, path_d4j_projects, path_out, path_ep)
71 |                           for p in projects)
72 | 
--------------------------------------------------------------------------------
/python/CheckoutD4j.py:
--------------------------------------------------------------------------------
1 | '''
2 | 
3 | Created on Dec. 15, 2017
4 | 
5 | @author Andrew Habib
6 | 
7 | '''
8 | 
9 | import os
10 | import subprocess
11 | import sys
12 | 
13 | from joblib import Parallel, delayed
14 | 
15 | 
16 | def exec_cmd(cmd):
17 |     subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.STDOUT)
18 | 
19 | 
20 | def check_out_each_project(d4j_binary, dist, proj, ver, ver_type):
21 |     print("Checking out:", proj, ver, ver_type)
22 |     ver = str(ver)
23 |     proj_dist = dist + '/' + proj + '-' + ver
24 | 
25 |     cmd = [d4j_binary, 'checkout', '-p', proj, '-v', ver + ver_type, '-w', proj_dist]
26 |     exec_cmd(cmd)
27 | 
28 |     print("Getting properties:", proj, ver, ver_type)
29 |     os.chdir(proj_dist)
30 | 
31 |     cmd = [d4j_binary, 'export', '-p', 'classes.modified', '-o', 'prop-buggy-classes']
32 |     exec_cmd(cmd)
33 | 
34 |     cmd = [d4j_binary, 'export', '-p', 'dir.src.classes', '-o', 'prop-source-dir']
35 |     exec_cmd(cmd)
36 | 
37 |     cmd = [d4j_binary, 'export', '-p', 'cp.compile', '-o', 'prop-compile-path']
38 |     exec_cmd(cmd)
39 | 
40 |     print("Compiling:", proj, ver, ver_type)
41 |     cmd = [d4j_binary, 'compile']
42 |     exec_cmd(cmd)
43 | 
44 | if __name__ == '__main__':
45 | 
46 |     path_d4j = sys.argv[1] if sys.argv[1].startswith("/") else os.path.join(os.getcwd(), sys.argv[1])
47 |     ver_type = sys.argv[2]
48 |     jobs = int(sys.argv[3])
49 | 
50 |     d4j_binary = os.path.join(path_d4j, 'framework/bin/defects4j')
51 |     dist = os.path.join(path_d4j, 'projects', ver_type)
52 | 
53 |     print(dist)
54 |     if not os.path.isdir(dist):
55 |         os.makedirs(dist)
56 | 
57 |     projects = {
58 |         'Chart': 26,
59 |         'Closure': 133,
60 |         'CommonsCodec': 22,
61 |         'CommonsCLI': 24,
62 |         'CommonsCsv': 12,
63 |         'CommonsJXPath': 14,
64 |         'Guava': 9,
65 |         'JacksonCore': 13,
66 |         'JacksonDatabind': 39,
67 |         'JacksonXml': 5,
68 |         'Jsoup': 64,
69 |         'Lang': 65,
70 |         'Math': 106,
71 |         'Mockito': 38,
72 |         'Time': 27
73 |     }
74 | 
75 |     Parallel(n_jobs=jobs)(delayed(check_out_each_project)
76 |                           (d4j_binary, dist, proj, ver, ver_type)
77 |                           for proj, count in projects.items()
78 |                           for ver in range(1, count + 1))
79 | 
--------------------------------------------------------------------------------
/scripts/download_static_checkers.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Author: Andrew Habib
4 | # Created on: 31 August 2018
5 | 
6 | [[ ! "${#BASH_SOURCE[@]}" -gt "1" ]] && { source ./scripts/config.sh; }
7 | 
8 | # This is the ErrorProne release used in the study.
9 | EP_VER="2.1.1"
10 | EP_URL="https://repo1.maven.org/maven2/com/google/errorprone/error_prone_ant/${EP_VER}/error_prone_ant-${EP_VER}.jar"
11 | # You can instead use any other version by setting EP_VER accordingly.
12 | # NOTE: EP_ROOT = the downloaded Error Prone ant jar.
13 | 
14 | 
15 | # This is the Infer release used in the study.
16 | # INF_VER="0.13.0"
17 | # But it does not come with a ready-made binary; it requires a manual build
18 | # and meeting all of Infer's dependencies. This is cumbersome but doable.
19 | # NOTE for 0.13.0: INF_BIN = INF_ROOT/infer/bin/infer
20 | # So instead, we use a more recent Infer release, which ships a ready-made binary.
21 | # According to its documentation, the newer release did not make any major changes
22 | # to the Java checkers, so we expect more or less the same effectiveness in Java
23 | # bug detection.
24 | INF_VER="0.15.0"
25 | # NOTE for 0.15.0: INF_BIN = INF_ROOT/bin/infer
26 | # Make sure to update $INF_BIN accordingly.
27 | 
28 | 
29 | # This is the SpotBugs release used in the study.
30 | SB_VER="3.1.0"
31 | SB_URL="http://repo.maven.apache.org/maven2/com/github/spotbugs/spotbugs/${SB_VER}/spotbugs-${SB_VER}.tgz"
32 | # NOTE: SB_BIN = SB_ROOT/lib/spotbugs.jar
33 | 
34 | if [ -d ${CHECKERS_ROOT} ]; then rm -rf ${CHECKERS_ROOT}; fi
35 | mkdir $CHECKERS_ROOT
36 | 
37 | echo
38 | echo ">>> Downloading and extracting static checkers <<<"
39 | 
40 | echo ">>> Preparing Google's ErrorProne"
41 | (cd $CHECKERS_ROOT && wget -q $EP_URL)
42 | 
43 | # Infer has different binaries for Linux and MacOS
44 | echo ">>> Preparing Facebook's Infer"
45 | if [ $MACHINE = "Linux" ]; then
46 |     INF_URL="https://github.com/facebook/infer/releases/download/v${INF_VER}/infer-linux64-v${INF_VER}.tar.xz"
47 |     curl -sSL $INF_URL | tar -C $CHECKERS_ROOT -xJ
48 | elif [ $MACHINE = "Mac" ]; then
49 |     INF_URL="https://github.com/facebook/infer/releases/download/v${INF_VER}/infer-osx-v${INF_VER}.tar.xz"
50 |     curl -sSL $INF_URL | tar -C $CHECKERS_ROOT -xJ
51 | else
52 |     echo "Reporting from script: $(basename $BASH_SOURCE) at line: $LINENO"
53 |     echo "Unknown OS: $MACHINE"
54 |     echo "Can't get a compatible Infer version"
55 |     echo "Will exit..."
56 |     exit 1
57 | fi
58 | 
59 | echo ">>> Preparing SpotBugs"
60 | wget -cq $SB_URL -O - | tar -xz -C $CHECKERS_ROOT
61 | 
--------------------------------------------------------------------------------
/python/RunSpotbugs.py:
--------------------------------------------------------------------------------
1 | '''
2 | 
3 | Created on Dec. 15, 2017
4 | 
5 | @author Andrew Habib
6 | 
7 | '''
8 | 
9 | import os
10 | import subprocess
11 | import sys
12 | 
13 | from joblib import Parallel, delayed
14 | 
15 | from Util import prepare_tool
16 | 
17 | 
18 | def run_sb_on_proj(proj, path, path_out, path_sb):
19 |     log = open(os.path.join(os.getcwd(), 'sb_log'), 'a')
20 | 
21 |     log.write("Running SpotBugs on: " + proj + "\n\n")
22 | 
23 |     proj_src, proj_cp, proj_javac_opts, proj_buggy_files, proj_buggy_classes = prepare_tool(path, proj)
24 | 
25 |     for buggy_f in proj_buggy_files:
26 |         if proj_javac_opts:
27 |             cmd = ['javac'] + proj_javac_opts.split(' ') + ['-cp', proj_cp, buggy_f]
28 |         else:
29 |             cmd = ['javac'] + ['-cp', proj_cp, buggy_f]
30 | 
31 |         log.write(" ".join(cmd) + "\n\n")
32 | 
33 |         p = subprocess.Popen(cmd, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
34 |         (out, _) = p.communicate()
35 | 
36 |         log.write(out + "\n")
37 |         log.write("*"*24 + "\n\n")
38 | 
39 |     cmd = ['java', '-jar', path_sb,
40 |            '-textui', '-xml:withMessages',
41 |            '-output', os.path.join(path_out, proj) + '.xml',
42 |            '-auxclasspath', proj_cp,
43 |            '-onlyAnalyze', ','.join(cl for cl in proj_buggy_classes),
44 |            proj_src]
45 | 
46 |     log.write(" ".join(cmd) + "\n\n")
47 | 
48 |     p = subprocess.Popen(cmd, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
49 |     (out, _) = p.communicate()
50 | 
51 |     log.write(out + "\n")
52 | 
53 |     log.write("#"*212 + "\n\n")
54 |     log.close()
55 | 
56 | 
57 | if __name__ == '__main__':
58 |     path_sb = os.path.join(os.getcwd(), sys.argv[1])
59 |     path_d4j_projects = os.path.join(os.getcwd(), sys.argv[2])
60 |     jobs = int(sys.argv[3])
61 | 
62 |     path_out = os.path.join(os.getcwd(), 'sb_output')
63 |     if not os.path.isdir(path_out):
64 |         os.makedirs(path_out)
65 | 
66 |     projects = sorted(os.listdir(path_d4j_projects))
67 | 
68 |     # Use a cmd-line filter file to debug specific projects
69 |     is_filter = False
70 |     if len(sys.argv) > 4:
71 |         is_filter = True
72 |         with open(sys.argv[4]) as file:
73 |             filter_list = file.read().splitlines()
74 |     if is_filter:
75 |         projects = sorted(list(i for i in projects if i in filter_list))
76 | 
77 |     Parallel(n_jobs=jobs)(delayed(run_sb_on_proj)
78 |                           (p, path_d4j_projects, path_out, path_sb)
79 |                           for p in projects)
80 | 
--------------------------------------------------------------------------------
/python/ParseAndSerializeErrorprone.py:
--------------------------------------------------------------------------------
1 | '''
2 | 
3 | Created on Nov. 23, 2017
4 | 
5 | @author Andrew Habib
6 | 
7 | '''
8 | 
9 | import json
10 | import os
11 | import re
12 | import sys
13 | 
14 | from Util import DataReader, ErrorproneMsg, CustomEncoder, \
15 |     get_cls_name_from_file_path, NO_WARNING
16 | 
17 | '''
18 | Currently, the errorprone output files may contain
19 | analysis results of more than one .java file.
20 | This happens in cases where the analyzed bug involves
21 | more than one .java file.
22 | '''
23 | 
24 | 
25 | def parse_errorprone_output(proj, report):
26 |     pattern_raw_message = re.compile(r"^((/[^/ ]*)+/?):([0-9]+): (warning|error): \[([a-zA-Z]+)\] (.*)")
27 | 
28 |     reports = []
29 | 
30 |     # Case where report file is empty
31 |     if len(report) == 0:
32 |         reports.append(ErrorproneMsg(proj, "", "", NO_WARNING, "", "", "", -1))
33 |         return reports
34 | 
35 |     # Case where report file is NOT empty
36 |     i = 0
37 |     while i < len(report):
38 |         line = report[i]
39 |         match = pattern_raw_message.match(line)
40 | 
41 |         if match:
42 |             raw_message = match.groups()[0:6]
43 | 
44 |             cls_path = raw_message[0]
45 |             cls = get_cls_name_from_file_path(cls_path)
46 |             # if '/com/' in cls_path:
47 |             #     cls = 'com.' + cls_path.split('/com/')[1].replace('/', '.').replace('.java', '')
48 |             # elif '/org/' in cls_path:
49 |             #     cls = 'org.' + cls_path.split('/org/')[1].replace('/', '.').replace('.java', '')
50 | 
51 |             line = raw_message[2]
52 |             typ = raw_message[3]
53 |             cat = raw_message[4]
54 |             msg = raw_message[5]
55 |             code = report[i + 1].replace('\n', '')
56 |             mark = report[i + 2].replace('\n', '')
57 | 
58 |             parsed_msg = ErrorproneMsg(proj, cls, typ, cat, msg, code, mark, line)
59 |             reports.append(parsed_msg)
60 | 
61 |             i += 3
62 |         else:
63 |             i += 1
64 | 
65 |     return reports
66 | 
67 | 
68 | '''
69 | Takes only one argument: path to errorprone raw data
70 | '''
71 | 
72 | if __name__ == '__main__':
73 | 
74 |     location_to_data = os.path.join(os.getcwd(), sys.argv[1])
75 |     list_of_data = sorted(os.listdir(location_to_data))
76 | 
77 |     data_paths = list(map(lambda f: os.path.join(location_to_data, f), list_of_data))
78 | 
79 |     parsed_reports_per_project = []
80 | 
81 |     for proj, report in DataReader(data_paths):
82 |         parsed_reports_per_project.extend(parse_errorprone_output(proj, report))
83 | 
84 |     # time_stamp = time.strftime("%Y%m%d-%H%M%S")
85 |     time_stamp = ''
86 |     parsed_output_file_name = "ep_parsed" + time_stamp + ".json"
87 |     with open(parsed_output_file_name, "w") as file:
88 |         json.dump(parsed_reports_per_project, file, cls=CustomEncoder, indent=4)
89 | 
--------------------------------------------------------------------------------
/scripts/README.md:
--------------------------------------------------------------------------------
1 | ## Scripts to perform the static checkers study steps.
2 | 
3 | **Note:**
4 | *Scripts in this directory should be run from the top level of the
5 | repository `./` and not from `./scripts/` itself.*
6 | 
7 | ### Contents:
8 | 
9 | - `config.sh`
10 |   Contains constants defining several directory names. This
11 |   script is `source`'d in all other scripts.
12 | 
13 | - `do_study.sh`
14 |   The entry point to perform the entire study.
15 | 
16 | The following scripts are listed in the order in which they are called from
17 | `do_study.sh`. Moreover, each is self-contained and can be run separately,
18 | provided that its dependencies on previous steps are met (e.g., running `run_checkers_on_defects4j.sh` assumes the checkers and the Defects4J data set
19 | are already available in the relevant directories).
20 | 
21 | - `download_static_checkers.sh`
22 |   downloads and extracts binaries of the static checkers used in our study. The installation is local, not system-wide, and should not affect or interact with
23 |   any other system-wide installations. It is configured to use the exact versions
24 |   used in our study, but it is straightforward to use other versions by changing the respective `<TOOL>_VER` variable in the script. This script creates the top-level
25 |   directory `./static-checkers/`.
26 | 
27 | - `download_defects4j.sh`
28 |   downloads and initializes the Defects4J framework. It is configured to use the
29 |   specific pull request (pr) [#112](https://github.com/rjust/defects4j/pull/112)
30 |   used in our study, but you can change this. If you comment out the D4J_PR
31 |   variable, the study will be done on the most recent official release of the Defects4J. Alternatively, you can try different pull requests which also add more data
32 |   points to the Defects4J, e.g., pr [#112](https://github.com/rjust/defects4j/pull/112)
33 |   or pr [#174](https://github.com/rjust/defects4j/pull/174). This script creates
34 |   the top-level directory `./defects4j/` and the directories
35 |   `./defects4j/projects/{b,f}`.
36 | 
37 | - `run_checkers_on_defects4j.sh`
38 |   runs the three static checkers on the buggy and fixed versions of instances in
39 |   the Defects4J. This script creates the top-level directory `./study/` which
40 |   contains the two directories `./study/output-{buggy,fixed}`. Each of those two directories contains the output and a consolidated log of running each static
41 |   checker on each bug instance (and its fixed version) in the data set.
42 | 
43 | - `prepare_checkers_output.sh`
44 |   parses the static checkers' output and prepares JSON files for the consolidated
45 |   output of each checker. The output of this script is a single JSON file per tool
46 |   in `./study/output-{buggy,fixed}/`.
47 | 
48 | - `execute_diff_based_approach.sh`
49 |   performs the diff-based automated matching approach to match warnings and lines
50 |   of code in the diff between the buggy and fixed versions of a bug.
51 | 
52 | - `execute_warnings_based_approach.sh`
53 |   performs the fixed-warnings-based automated matching approach to match warnings
54 |   and bugs based on warnings which have disappeared (a.k.a. been removed) between
55 |   the buggy and fixed versions of a bug.
56 | 
--------------------------------------------------------------------------------
/python/ParseAndSerializeSpotbugs.py:
--------------------------------------------------------------------------------
1 | '''
2 | 
3 | Created on Nov. 23, 2017
4 | 
5 | @author Andrew Habib
6 | 
7 | '''
8 | 
9 | '''
10 | Currently, the spotbugs output files may contain
11 | analysis results of more than one .java file.
12 | This happens in cases where the analyzed bug involves
13 | more than one .java file.
14 | '''
15 | 
16 | import json
17 | import os
18 | import sys
19 | 
20 | from xml.etree import cElementTree as ET
21 | 
22 | from Util import XmlReader, SpotbugsMsg, CustomEncoder, NO_WARNING
23 | 
24 | 
25 | def parse_spotbugs_xml_output(proj, tree):
26 |     reports = []
27 |     has_bugs = False
28 | 
29 |     # Case where report file is NOT empty
30 |     try:
31 |         for _, elem in tree:
32 |             if elem.tag == "BugInstance":
33 |                 '''
34 |                 Tags guaranteed to exist
35 |                 '''
36 |                 has_bugs = True
37 |                 cls = elem.find('Class').get("classname", "")
38 |                 cat = elem.attrib['category']
39 |                 abbrev = elem.attrib['abbrev']
40 |                 typ = elem.attrib['type']
41 |                 prio = elem.attrib['priority']
42 |                 rank = elem.attrib['rank']
43 |                 msg = elem.find('LongMessage').text
44 |                 '''
45 |                 Optional tags (may not always exist)
46 |                 '''
47 |                 elem_mth = elem.find('Method')
48 |                 if elem_mth is not None:
49 |                     mth = elem_mth.get('name', '')
50 |                 else: mth = ''
51 | 
52 |                 elem_field = elem.find('Field')
53 |                 if elem_field is not None:
54 |                     field = elem_field.get('name', '')
55 |                 else: field = ''
56 | 
57 |                 lines = []
58 |                 elem_src_lines = elem.findall('SourceLine')
59 |                 if elem_src_lines:
60 |                     for src_line in elem_src_lines:
61 |                         if (src_line.tag == 'SourceLine' and
62 |                                 all(attr in src_line.attrib for attr in ['start', 'end']) and
63 |                                 src_line.get('classname') == cls):
64 |                             lines.append((src_line.get('start'), src_line.get('end'), src_line.get('role')))
65 | 
66 |                 parsed_msg = SpotbugsMsg(proj, cls, cat, abbrev, typ, prio, rank, msg, mth, field, lines)
67 |                 reports.append(parsed_msg)
68 | 
69 |     except ET.ParseError as err:
70 |         # print(proj + " failed to parse.")
71 |         # print(err)
72 |         pass
73 | 
74 |     # Case where report file is empty
75 |     if not has_bugs:
76 |         reports.append(SpotbugsMsg(proj, "", "", "", NO_WARNING, "", "", "", "", "", ""))
77 | 
78 |     return reports
79 | 
80 | 
81 | '''
82 | Takes only one argument: path to spotbugs raw data
83 | '''
84 | 
85 | if __name__ == '__main__':
86 | 
87 |     location_to_data = os.path.join(os.getcwd(), sys.argv[1])
88 |     list_of_data = sorted(os.listdir(location_to_data))
89 | 
90 |     data_paths = list(map(lambda f: os.path.join(location_to_data, f), list_of_data))
91 | 
92 |     parsed_reports_per_project = []
93 | 
94 |     for proj, tree in XmlReader(data_paths):
95 |         parsed_reports_per_project.extend(parse_spotbugs_xml_output(proj, tree))
96 | 
97 |     # time_stamp = time.strftime("%Y%m%d-%H%M%S")
98 |     time_stamp = ''
99 |     parsed_output_file_name = "sb_parsed" + time_stamp + ".json"
100 |     with open(parsed_output_file_name, "w") as file:
101 |         json.dump(parsed_reports_per_project, file, cls=CustomEncoder, indent=4)
102 | 
--------------------------------------------------------------------------------
/python/ExtractAndSerializeDiffs.py:
--------------------------------------------------------------------------------
1 | '''
2 | 
3 | Created on Nov. 29, 2017
4 | 
5 | @author Andrew Habib
6 | 
7 | '''
8 | 
9 | import json
10 | import os
11 | import subprocess
12 | import sys
13 | 
14 | from Util import FileDiff, CustomEncoder
15 | 
16 | 
17 | def compute_proj_diff(proj_b, proj_f):
18 |     with open(os.path.join(proj_b, 'prop-source-dir')) as file:
19 |         src_dir = file.readline()
20 | 
21 |     with open(os.path.join(proj_b, 'prop-buggy-classes')) as file:
22 |         changed_classes = file.read().splitlines()
23 |         changed_files = map(lambda f: os.path.join(src_dir, f.replace('.', '/') + '.java'), changed_classes)
24 | 
25 |     file_diffs = []
26 | 
27 |     proj = os.path.split(proj_b)[1]
28 |     for changed_class, changed_file in zip(changed_classes, changed_files):
29 |         buggy_file_name = os.path.join(proj_b, changed_file)
30 |         fixed_file_name = os.path.join(proj_f, changed_file)
31 |         print(buggy_file_name)
32 |         print(fixed_file_name)
33 | 
34 |         '''
35 |         Get modified or deleted lines from the old file
36 |         '''
37 |         command = 'diff --unchanged-line-format="" --old-line-format="%dn\n" --new-line-format="" ' + buggy_file_name + ' ' + fixed_file_name
38 |         out, _ = subprocess.Popen(command, universal_newlines=True, shell=True, stdout=subprocess.PIPE).communicate()
39 |         mod_or_del_lines = set(map(lambda l: int(l), list(out.split('\n')[:-1])))
40 | 
41 |         '''
42 |         Get new lines inserted in the new file; note that this also reports modified lines as new lines
43 |         '''
44 |         command = 'diff --unchanged-line-format="" --old-line-format="" --new-line-format="%dn\n" ' + buggy_file_name + ' ' + fixed_file_name
45 |         out, _ = subprocess.Popen(command, universal_newlines=True, shell=True, stdout=subprocess.PIPE).communicate()
46 |         new_lines = set(map(lambda l: int(l), list(out.split('\n')[:-1])))
47 | 
48 |         '''
49 |         Compute the set difference to approximate true_new_lines, i.e., to exclude modified lines
50 |         '''
51 |         true_new_lines = new_lines.difference(mod_or_del_lines)
52 | 
53 |         '''old changed lines'''
54 |         # changed_lines.extend(mod_or_del_lines).extend(new_lines)
55 | 
56 |         '''
57 |         changed_lines is an approximation of inserted (new) lines
58 |         and deleted or modified lines
59 |         '''
60 |         approx_lines = set([line + i for line in true_new_lines for i in [-1, 0, 1]])
61 | 
62 |         changed_lines = sorted(approx_lines.union(mod_or_del_lines))
63 | 
64 |         file_diff = FileDiff(proj, changed_class, changed_lines)
65 |         file_diffs.append(file_diff)
66 | 
67 |     return file_diffs
68 | 
69 | 
70 | '''
71 | Takes two arguments: paths to the d4j buggy and fixed projects,
72 | in this order: buggy_path fixed_path
73 | '''
74 | 
75 | if __name__ == '__main__':
76 | 
77 |     location_to_d4j_b = os.path.join(os.getcwd(), sys.argv[1])
78 |     location_to_d4j_f = os.path.join(os.getcwd(), sys.argv[2])
79 | 
80 |     list_d4j_b = sorted(os.listdir(location_to_d4j_b))
81 |     list_d4j_f = sorted(os.listdir(location_to_d4j_f))
82 | 
83 |     list_d4j_b = list(map(lambda f: os.path.join(location_to_d4j_b, f), list_d4j_b))
84 |     list_d4j_f = list(map(lambda f: os.path.join(location_to_d4j_f, f), list_d4j_f))
85 | 
86 |     parsed_projects_diffs = []
87 | 
88 |     for proj_b, proj_f in zip(list_d4j_b, list_d4j_f):
89 |         parsed_projects_diffs.extend(compute_proj_diff(proj_b, proj_f))
90 | 
91 |     # time_stamp = time.strftime("%Y%m%d-%H%M%S")
92 |     # time_stamp = ''
93 |     # parsed_output_file_name = "diffs_parsed_" + time_stamp + ".json"
94 |     parsed_output_file_name = "diffs_parsed" + ".json"
95 |     with open(parsed_output_file_name, "w") as file:
96 |         json.dump(parsed_projects_diffs, file, cls=CustomEncoder, indent=4)
97 | 
98 | 
--------------------------------------------------------------------------------
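A note on `ExtractAndSerializeDiffs.py` above: the script shells out to GNU diff twice, once to collect modified-or-deleted lines of the buggy file and once to collect added lines of the fixed file, then widens each truly new line by one line in each direction to approximate the corresponding buggy-file region. The following self-contained sketch (not part of the repository) reproduces the same bookkeeping with Python's standard `difflib`; `difflib` may align lines slightly differently from GNU diff in corner cases, so treat it as illustrative only.

```python
import difflib

def changed_lines(buggy_src, fixed_src):
    """Approximate the 1-based buggy-file lines touched by a fix,
    mirroring the two GNU diff invocations in compute_proj_diff()."""
    old, new = buggy_src.splitlines(), fixed_src.splitlines()
    mod_or_del, added = set(), set()
    for tag, i1, i2, j1, j2 in difflib.SequenceMatcher(
            a=old, b=new, autojunk=False).get_opcodes():
        if tag in ('replace', 'delete'):
            mod_or_del.update(range(i1 + 1, i2 + 1))  # old-file line numbers
        if tag in ('replace', 'insert'):
            added.update(range(j1 + 1, j2 + 1))       # new-file line numbers
    true_new = added - mod_or_del                     # purely inserted lines
    approx = {l + i for l in true_new for i in (-1, 0, 1)}  # widen by +/-1
    return sorted(approx | mod_or_del)

# One modified line (2) and one inserted line (4) in the fixed version:
print(changed_lines("a\nb\nc\n", "a\nB\nc\nd\n"))  # -> [2, 3, 4, 5]
```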
--------------------------------------------------------------------------------
/python/RunInfer.py:
--------------------------------------------------------------------------------
1 | '''
2 | 
3 | Created on Dec. 25, 2017
4 | 
5 | @author Andrew Habib
6 | 
7 | '''
8 | 
9 | import os
10 | import shutil
11 | import subprocess
12 | import sys
13 | import tempfile
14 | 
15 | from joblib import Parallel, delayed
16 | 
17 | from Util import prepare_tool
18 | 
19 | 
20 | def run_infer_on_proj(proj, path, path_out_txt, path_out_json, path_infer):
21 |     log = open(os.path.join(os.getcwd(), 'inf_log'), 'a')
22 | 
23 |     log.write("Running Infer on: " + proj + "\n\n")
24 | 
25 |     _, proj_cp, proj_javac_opts, proj_buggy_files, _ = prepare_tool(path, proj)
26 | 
27 |     infer_txt_results = []
28 |     infer_json_results = []
29 | 
30 |     tmp_out_dir = tempfile.mkdtemp(prefix='infer-out.', dir=os.getcwd())
31 | 
32 |     for buggy_f in proj_buggy_files:
33 |         cmd = [path_infer, 'run', '-o', tmp_out_dir, '--', 'javac']
34 |         if proj_javac_opts:
35 |             cmd = cmd + proj_javac_opts.split(' ') + ['-cp', proj_cp, buggy_f]
36 |         else:
37 |             cmd = cmd + ['-cp', proj_cp, buggy_f]
38 | 
39 |         log.write(" ".join(cmd) + "\n\n")
40 | 
41 |         p = subprocess.Popen(cmd, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
42 |         (out, _) = p.communicate()
43 | 
44 |         log.write(out + "\n")
45 |         log.write("*"*24 + "\n\n")
46 | 
47 |         try:
48 |             with open(os.path.join(os.getcwd(), tmp_out_dir + '/bugs.txt'), 'r') as file:
49 |                 infer_txt_results.append(file.read())
50 |         except IOError:
51 |             pass
52 | 
53 |         try:
54 |             with open(os.path.join(os.getcwd(), tmp_out_dir + '/report.json'), 'r') as file:
55 |                 infer_json_results.append(file.read().strip("\n"))
56 |         except IOError:
57 |             pass
58 | 
59 |     shutil.rmtree(tmp_out_dir)
60 | 
61 |     with open(os.path.join(path_out_txt, proj), 'w') as file:
62 |         file.write("\n".join(res for res in infer_txt_results))
63 | 
64 |     with open(os.path.join(path_out_json, proj), 'w') as file:
65 |         file.write(manual_merge_json(infer_json_results))
66 | 
67 |     log.write("#"*212 + "\n\n")
68 |     log.close()
69 | 
70 | 
71 | def manual_merge_json(json_strings):
72 |     json_strings = [x for x in json_strings if x != "" and x != '[]']
73 |     length = len(json_strings)
74 | 
75 |     if length == 1:
76 |         return json_strings[0]
77 | 
78 |     if length > 1:
79 |         for i in range(1, length):
80 |             json_strings[i] = json_strings[i][1:-1]
81 |         json_strings[0] = json_strings[0][:-1]
82 |         json_strings[length - 1] = json_strings[length - 1] + ']'
83 |         return ",".join(s for s in json_strings)
84 | 
85 |     return ""
86 | 
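# Worked example (hypothetical payloads, for illustration only): each per-file
# report.json is a JSON array, so the merge above strips the brackets of the
# inner arrays and re-joins the pieces into a single array:
#
#   manual_merge_json(['[{"a": 1}]', '[{"b": 2}]'])  ==  '[{"a": 1},{"b": 2}]'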
87 | 
88 | if __name__ == '__main__':
89 |     path_infer = os.path.join(os.getcwd(), sys.argv[1])
90 |     path_d4j_projects = os.path.join(os.getcwd(), sys.argv[2])
91 |     jobs = int(sys.argv[3])
92 | 
93 |     path_out_txt = os.path.join(os.getcwd(), 'inf_output_txt')
94 |     if not os.path.isdir(path_out_txt):
95 |         os.makedirs(path_out_txt)
96 | 
97 |     path_out_json = os.path.join(os.getcwd(), 'inf_output_json')
98 |     if not os.path.isdir(path_out_json):
99 |         os.makedirs(path_out_json)
100 | 
101 |     projects = sorted(os.listdir(path_d4j_projects))
102 | 
103 |     # Optionally pass a filter file as a 4th command-line argument to debug specific projects
104 |     is_filter = False
105 |     if len(sys.argv) > 4:
106 |         is_filter = True
107 |         with open(sys.argv[4]) as file:
108 |             filter_list = file.read().splitlines()
109 |     if is_filter:
110 |         projects = sorted(list(i for i in projects if i in filter_list))
111 | 
112 |     Parallel(n_jobs=jobs)(delayed(run_infer_on_proj)
113 |                           (p, path_d4j_projects, path_out_txt, path_out_json, path_infer)
114 |                           for p in projects)
115 | 
116 | 
--------------------------------------------------------------------------------
/python/ComputeStatsOnD4J.py:
--------------------------------------------------------------------------------
1 | '''
2 | 
3 | Created on Jan. 24, 2018
4 | 
5 | @author Andrew Habib
6 | 
7 | '''
8 | 
9 | import os
10 | import re
11 | import sys
12 | import subprocess
13 | import numpy
14 | 
15 | from joblib.parallel import delayed, Parallel
16 | 
17 | from collections import Counter
18 | 
19 | from Util import load_parsed_ep, load_parsed_inf, load_parsed_sb
20 | 
21 | def how_many_warnings_per_bug(warnings):
22 |     bugs = [w.proj for w in warnings]
23 |     return(Counter(bugs))
24 | 
25 | def get_files_locs_diffs_per_bug(proj_b, proj_f):
26 |     with open(os.path.join(proj_b, 'prop-source-dir')) as file:
27 |         src_dir = file.readline()
28 | 
29 |     with open(os.path.join(proj_b, 'prop-buggy-classes')) as file:
30 |         changed_classes = file.read().splitlines()
31 |     changed_files = map(lambda f: os.path.join(src_dir, f.replace('.', '/') + '.java'), changed_classes)
32 | 
33 |     pattern_loc = re.compile("Java +([0-9]+) +([0-9]+) +([0-9]+) +([0-9]+)")
34 |     pattern_mod = re.compile("modified +([0-9]+) +([0-9]+) +([0-9]+) +([0-9]+)")
35 |     pattern_new = re.compile("added +([0-9]+) +([0-9]+) +([0-9]+) +([0-9]+)")
36 |     pattern_del = re.compile("removed +([0-9]+) +([0-9]+) +([0-9]+) +([0-9]+)")
37 | 
38 |     proj = os.path.split(proj_b)[1]
39 |     loc = []
40 |     diff = []
41 | 
42 |     for _, changed_file in zip(changed_classes, changed_files):
43 |         buggy_file_name = os.path.join(proj_b, changed_file)
44 |         fixed_file_name = os.path.join(proj_f, changed_file)
45 |         # print(buggy_file_name)
46 |         # print(fixed_file_name)
47 | 
48 |         cmd = cloc + ' ' + buggy_file_name
49 |         out, _ = subprocess.Popen(cmd, universal_newlines=True, shell=True, stdout=subprocess.PIPE).communicate()
50 |         loc.append(int(pattern_loc.search(out).groups()[3]))
51 |         # print(out)
52 |         # print(pattern_loc.search(out).groups()[3])
53 | 
54 |         cmd = cloc + ' --diff ' + buggy_file_name + ' ' + fixed_file_name
55 |         out, _ = subprocess.Popen(cmd, universal_newlines=True, shell=True, stdout=subprocess.PIPE).communicate()
56 |         diff.append(int(pattern_mod.search(out).groups()[3]) + int(pattern_new.search(out).groups()[3]) + int(pattern_del.search(out).groups()[3]))
57 |         # print(out)
58 |         # print(pattern_mod.search(out).groups()[3])
59 |         # print(pattern_new.search(out).groups()[3])
60 |         # print(pattern_del.search(out).groups()[3])
61 |         # input("")
62 | 
63 |     return proj, len(changed_classes), sum(loc), sum(diff)
64 | 
65 | 
66 | if __name__ == '__main__':
67 | 
68 |     ''' Aggregate numbers of modified files, LoC, and diff sizes '''
69 | 
70 |     location_to_d4j_b = os.path.join(os.getcwd(), sys.argv[1])
71 |     location_to_d4j_f = os.path.join(os.getcwd(), sys.argv[2])
72 |     cloc = os.path.join(os.getcwd(), sys.argv[3])
73 | 
74 |     list_d4j_b = sorted(os.listdir(location_to_d4j_b))
75 |     list_d4j_f = sorted(os.listdir(location_to_d4j_f))
76 | 
77 |     list_d4j_b = list(map(lambda f: os.path.join(location_to_d4j_b, f), list_d4j_b))
78 |     list_d4j_f = list(map(lambda f: os.path.join(location_to_d4j_f, f), list_d4j_f))
79 | 
80 |     out = Parallel(n_jobs=30)(delayed(get_files_locs_diffs_per_bug)
81 |                               (proj_b, proj_f)
82 |                               for proj_b, proj_f in zip(list_d4j_b, list_d4j_f))
83 |     projects, files, locs, diffs = zip(*out)
84 | 
85 | 
86 |     print("\nBins of # of modified files\n")
87 |     bin_files = Counter(files)
88 |     for (k, v) in bin_files.items():
89 |         print(k, v)
90 | 
91 |     print("\nBins of LoC\n")
92 |     hist, edges = numpy.histogram(locs, [1, 100, 1000, 2000, 3000, 4000, 5000, 10000, 20000])
93 |     for i in range(len(hist)):
94 |         print(edges[i], edges[i+1]-1, hist[i])
95 | 
96 |     print("\nMin and Max of LoC per bug", min(locs), max(locs))
97 | 
98 |     print("\nSum of all LoC of all bugs", sum(locs))
99 | 
100 |     print("\nBins of Diffs\n")
101 |     hist, edges = numpy.histogram(diffs, [1, 5, 10, 15, 20, 25, 50, 75, 100, 200, 2000])
102 |     for i in range(len(hist)):
103 |         print(edges[i], edges[i+1]-1, hist[i])
104 | 
105 |     print("\nMin and Max of diff per bug", min(diffs), max(diffs))
106 | 
107 | 
108 |     ''' D4J stats per bug per tool '''
109 | 
110 |     print("\nStats per bug\n")
111 |     ep_all_b = load_parsed_ep('./b/ep_parsed.json')
112 |     ep_b = how_many_warnings_per_bug(ep_all_b)
113 |     ep_all_f = load_parsed_ep('./f/ep_parsed.json')
114 |     ep_f = how_many_warnings_per_bug(ep_all_f)
115 | 
116 |     inf_all_b = load_parsed_inf('./b/inf_parsed.json')
117 |     inf_b = how_many_warnings_per_bug(inf_all_b)
118 |     inf_all_f = load_parsed_inf('./f/inf_parsed.json')
119 |     inf_f = how_many_warnings_per_bug(inf_all_f)
120 | 
121 |     sb_all_b = load_parsed_sb('./b/sb_parsed.json')
122 |     sb_b = how_many_warnings_per_bug(sb_all_b)
123 |     sb_all_f = load_parsed_sb('./f/sb_parsed.json')
124 |     sb_f = how_many_warnings_per_bug(sb_all_f)
125 | 
126 |     print("Bug Files LoC Diff Ep Inf Sb (from buggy versions)")
127 |     for (p, f, l, d) in out:
128 |         print(p, f, l, d,
129 |               ep_b[p], inf_b[p], sb_b[p])
130 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # An empirical study of static bug checkers and how many bugs they find
2 | 
3 | *Static bug detection tools* such as Google's [Error Prone](https://errorprone.info/),
4 | Facebook's [Infer](http://fbinfer.com), and [SpotBugs](https://spotbugs.github.io/)
5 | are nowadays widely used, not only by academic researchers but also by major
6 | software development companies and, more generally, by developers in various
7 | industries.
8 | 
9 | As such tools become increasingly popular and widely adopted, we
10 | performed an empirical study on the **recall** of these three state-of-the-art
11 | static bug checkers using a representative set of 594 real-world Java bugs from
12 | 15 popular projects. In other words, our study answers the question of *how many
13 | of a set of known bugs these static checkers actually find*.
14 | 
15 | This work is detailed and published in our ASE 2018 paper:
16 | 
17 |     How Many of All Bugs Do We Find? A Study of Static Bug Detectors.
18 |     Andrew Habib and Michael Pradel.
19 |     In Proceedings of the 33rd ACM/IEEE International Conference on Automated Software Engineering (ASE),
20 |     pp. 317-328. ACM, 2018.
21 | 
22 | This repository contains the source code of the analysis pipeline we implemented
23 | to perform our study. It is intended to be used to replicate our study results
24 | and to perform further similar studies.
25 | 
26 | The repository also includes more elaborate results on our findings than those
27 | presented in the ASE 2018 paper.
28 | 
29 | ## Requirements to install and run
30 | 
31 | - ### Our scripts
32 |     - `bash`
33 |     - `curl` (on Linux, use `apt-get install curl`)
34 |     - `wget` (on MacOS, if you have [brew](https://brew.sh/), use `brew install wget`)
35 |     - `python 3` (`joblib` is also required; use `pip3 install joblib`)
36 | 
37 |     - *For MacOS users*, `coreutils` is required too. Use `brew install coreutils`
38 | 
39 | - Based on our testing, `Java 8` is the most compatible with the static analyzers
40 |   and the Defects4J framework.
41 | 
42 | - ### 3rd party tools
43 |     - [Defects4J](https://github.com/rjust/defects4j) requirements have to be met.
44 | 
45 | ## Repository structure and contents
46 | 
47 | Study results:
48 | 
49 | - [`./results/`](results)
50 |   contains tables, figures, and charts detailing the findings of our study. You
51 |   will find more data in this folder than in the ASE 2018 paper, which was
52 |   subject to space limitations.
53 | 
54 | **All ``bash`` and ``python`` scripts have descriptive names.**
55 | 
56 | - [`./scripts/`](scripts)
57 |   contains the driver script `do_study.sh`, which runs the entire study
58 |   pipeline. This directory also contains modular scripts to run many parts of the
59 |   study separately, provided that the specific scripts' requirements are met.
60 |   *All scripts in this directory should be run from the top level of the
61 |   repository ```./``` and not from ```./scripts/``` itself.*
62 | 
63 | - [`./python/`](python)
64 |   contains all python scripts which perform the actual study steps, such as
65 |   checking out the actual bugs from Defects4J, running the static analyzers
66 |   on the bug set, analyzing the checkers' output, etc. These python scripts
67 |   are called by the driver scripts in `./scripts/`. But they can also be
68 |   called directly by providing the appropriate input for each script (see the example below).
69 | 
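For example, the serialized warnings produced by the parsing scripts can be loaded back into Python via the helpers in `Util.py`. A minimal sketch, assuming `sb_parsed.json` was produced by `ParseAndSerializeSpotbugs.py` in the current working directory:

```python
from Util import load_parsed_sb

# Each entry is a SpotbugsMsg; its `proj` field holds the Defects4J bug id, e.g. "Chart-1".
warnings = load_parsed_sb('sb_parsed.json')
print(len(warnings), 'SpotBugs warnings in total')
```

The analogous `load_parsed_ep` and `load_parsed_inf` helpers work the same way for Error Prone and Infer output.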
70 | Running the entire study pipeline by invoking `./scripts/do_study.sh` would
71 | create three more directories in the top level `./`:
72 | 
73 | - `./static-checkers/`
74 |   contains the downloaded binaries of the three static checkers we use
75 |   in the study: [Error Prone](https://errorprone.info/), [Infer](http://fbinfer.com),
76 |   and [SpotBugs](https://spotbugs.github.io/).
77 | 
78 | - `./defects4j/`
79 |   contains the cloned [Defects4J](https://github.com/rjust/defects4j) framework.
80 |   In this directory, our scripts also create the following sub-directories:
81 |     - `./defects4j/projects/{b,f}` which contain the checked-out (b)uggy
82 |       and (f)ixed versions of each benchmark in the Defects4J bug set.
83 | 
84 | - `./study/`
85 |   contains output from all study steps, such as the logs and output of running
86 |   the static analyzers on the bug set, the line diffs between buggy and fixed
87 |   versions, the pairs of warnings and bugs obtained from the different matching
88 |   methodologies we explain in the paper, and so on. It has two sub-directories
89 |   for the output of running the checkers on buggy and fixed versions from the
90 |   benchmark, respectively:
91 |     - `./study/output-buggy/`
92 |     - `./study/output-fixed/`
93 | 
94 | **Important Note:**
95 | In our study, we used an unofficial version of Defects4J, obtained through pull
96 | request (PR) [#112](https://github.com/rjust/defects4j/pull/112), to obtain more
97 | data (596 bugs instead of 395 bugs in the official release).
98 | Unfortunately, this pull request is not clean, and many of the new bug instances
99 | included in it have wrong values for Defects4J properties. We had to identify
100 | and fix those issues manually in an ad hoc manner.
101 | Therefore, if you currently use the exact same pull request, you will not obtain
102 | the same results, because some of the benchmarks will not be analyzed correctly.
103 | We will create our own snapshot of Defects4J, along with the pull request
104 | we used in our study and the corrected issues, so that reproducing our results is
105 | easier to achieve.
106 | 
--------------------------------------------------------------------------------
/results/README.md:
--------------------------------------------------------------------------------
1 | # Original results published in the ASE 2018 paper, and more
2 | 
3 | **Note 1:**
4 | These results are based upon the specific version of Defects4J obtained through
5 | pull request (PR) [#112](https://github.com/rjust/defects4j/pull/112).
6 | Please see [here](../README.md).
7 | 
8 | **Note 2:**
9 | We use figure and table numbers analogous to those in the paper.
10 | 
11 | ## Defects4J properties
12 | In the following, we show some statistics about the Defects4J bugs used in our study.
13 | 
14 | 
15 | (a) Number of buggy files.
16 | 
17 | 
18 | 
19 | (b) Total size of diffs between buggy and fixed files.
20 | 
21 | 
22 | 
23 | (c) Total size of buggy files.
24 | 
25 | 
26 | **Figure 3: Properties of the studied bugs.**
27 | 
28 | ## Detected bugs
29 | In the following, we first show the total number of bugs detected by each tool
30 | and the overlap between the checkers. Then, we present extra tables showing
31 | which bugs are detected by which static checker according to our findings.
32 | 
33 | ### Bugs detected by the three tools together
34 | 
35 | 
36 | | Tool              | Number of bugs |
37 | |-------------------|---------------:|
38 | | SpotBugs          |             18 |
39 | | Error Prone       |              8 |
40 | | Infer             |              5 |
41 | | **Total:**        |         **31** |
42 | | **Total unique:** |         **27** |
43 | 
44 | **Figure 4: Total number of bugs found by all three static checkers and their
45 | overlap.** The unique total is 27 because four bugs (Chart-1, Chart-4, JacksonDatabind-32, and Lang-62) are each detected by two tools, as the per-tool tables below show.
46 | 
47 | 
48 | The following tables are not in the paper due to space limitations.
49 | ### Bugs detected by SpotBugs
50 | | Count | Bug Id             | Bug type                                           | Message as reported by SpotBugs |
51 | |------:|--------------------|----------------------------------------------------|---------------------------------|
52 | | 1     | Chart-1            | NP_ALWAYS_NULL                                     | Null pointer dereference of ? in org.jfree.chart.renderer.category.AbstractCategoryItemRenderer.getLegendItems()
53 | | 2     | Chart-4            | NP_NULL_ON_SOME_PATH                               | Possible null pointer dereference of ? 
in org.jfree.chart.plot.XYPlot.getDataRange(ValueAxis) 54 | | 3 | Chart-17 | CN_IDIOM_NO_SUPER_CALL | org.jfree.data.time.TimeSeries.clone() does not call super.clone() 55 | | 4 | Chart-22 | UC_USELESS_CONDITION | Useless condition: it's known that local$3 >= 0 at this point 56 | | 5 | Chart-24 | DLS_DEAD_LOCAL_STORE | Dead store to $L3 in org.jfree.chart.renderer.GrayPaintScale.getPaint(double) 57 | | 6 | CommonsCodec-22 | SI_INSTANCE_BEFORE_FINALS_ASSIGNED | Static initializer for org.apache.commons.codec.language.RefinedSoundex creates instance before all static final fields assigned 58 | | 7 | JacksonDatabind-13 | NP_BOOLEAN_RETURN_NULL | com.fasterxml.jackson.databind.ser.std.EnumSerializer._isShapeWrittenUsingIndex(Class, JsonFormat$Value, boolean) has Boolean return type and returns explicit null 59 | | 8 | JacksonDatabind-32 | VA_FORMAT_STRING_EXTRA_ARGUMENTS_PASSED | Format-string method String.format(String, Object[]) called with format string "Can not construct Map key of type %s from String "%s": " wants 2 arguments but is given 3 in com.fasterxml.jackson.databind.DeserializationContext.weirdKeyException(Class, String, String) 60 | | 9 | Jsoup-22 | UUF_UNUSED_PUBLIC_OR_PROTECTED_FIELD | Unused public or protected field: org.jsoup.parser.TreeBuilder.currentToken 61 | | 10 | Lang-3 | DLS_DEAD_LOCAL_STORE | Dead store to $L9 in org.apache.commons.lang3.math.NumberUtils.createNumber(String) 62 | | 11 | Lang-23 | EQ_DOESNT_OVERRIDE_EQUALS | org.apache.commons.lang3.text.ExtendedMessageFormat doesn't override java.text.MessageFormat.equals(Object) 63 | | 12 | Lang-56 | SE_BAD_FIELD | Class org.apache.commons.lang.time.FastDateFormat defines non-transient non-serializable instance field mRules 64 | | 13 | Lang-62 | SF_DEAD_STORE_DUE_TO_SWITCH_FALLTHROUGH | Value of ? from previous case is overwritten here due to switch statement fall through 65 | | | Lang-62 | SF_SWITCH_NO_DEFAULT | Switch statement found in org.apache.commons.lang.Entities.unescape(Writer, String) where default case is missing 66 | | 14 | Math-50 | FE_FLOATING_POINT_EQUALITY | Test for floating point equality in org.apache.commons.math.analysis.solvers.BaseSecantSolver.doSolve() 67 | | 15 | Math-91 | CO_COMPARETO_INCORRECT_FLOATING | org.apache.commons.math.fraction.Fraction.compareTo(Fraction) incorrectly handles double value 68 | | 16 | Mockito-1 | DLS_DEAD_LOCAL_STORE | Dead store to $L2 in org.mockito.internal.invocation.InvocationMatcher.captureArgumentsFrom(Invocationi) 69 | | 17 | Mockito-11 | EQ_CHECK_FOR_OPERAND_NOT_COMPATIBLE_WITH_THIS | org.mockito.internal.creation.DelegatingMethod.equals(Object) checks for operand being a reflect.Method 70 | | 18 | Mockito-23 | SE_BAD_FIELD | Class org.mockito.internal.stubbing.defaultanswers.ReturnsDeepStubs defines non-transient non-serializable instance field mockitoCore 71 | 72 | ### Bugs detected by Error Prone 73 | | Count | Bug Id | Bug type | W / E | Message as reported by Error Prone | 74 | |------:|--------------------|-------------------------------------|---------|------------------------------------| 75 | | 1 | Chart-8 | ChainingConstructorIgnoresParameter | Error | The called constructor accepts a parameter with the same name and type as one of its caller's parameters, but its caller doesn't pass that parameter to it. It's likely that it was intended to. 
76 | | 2 | JacksonDatabind-32 | FormatString | Error | extra format arguments: used 2, provided 3 77 | | 3 | Lang-51 | FallThrough | Error | Execution may fall through from the previous case; add a `// fall through` comment before this line if it was deliberate 78 | | 4 | Lang-62 | FallThrough | Error | Execution may fall through from the previous case; add a `// fall through` comment before this line if it was deliberate 79 | | 5 | Math-57 | NarrowingCompoundAssignment | Warning | Compound assignments from double to int hide lossy casts 80 | | 6 | Math-66 | MissingOverride | Warning | optimize overrides method in AbstractUnivariateRealOptimizer; expected @Override 81 | | | | MissingOverride | Warning | optimize overrides method in AbstractUnivariateRealOptimizer; expected @Override 82 | | 7 | Math-77 | MissingOverride | Warning | getLInfNorm overrides method in AbstractRealVector; expected @Override 83 | | 8 | Mockito-19 | MissingOverride | Warning | filterCandidate implements method in MockCandidateFilter; expected @Override 84 | | | | MissingOverride | Warning | filterCandidate implements method in MockCandidateFilter; expected @Override 85 | 86 | ### Bugs detected by Infer 87 | | Count | Bug Id | Bug type | Message as reported by Infer | 88 | |------:|--------------------|------------------|------------------------------| 89 | | 1 | Chart-1 | NULL_DEREFERENCE | object `dataset` last assigned on line 1796 could be null and is dereferenced at line 1800. 90 | | 2 | Chart-4 | NULL_DEREFERENCE | object `r` last assigned on line 4473 could be null and is dereferenced at line 4493. 91 | | 3 | Jsoup-59 | NULL_DEREFERENCE | object returned by `ownerDocument()` could be null and is dereferenced at line 363. 92 | | 4 | JacksonDatabind-12 | RESOURCE_LEAK | resource of type `com.fasterxml.jackson.databind.util.TokenBuffer` acquired by call to `new()` at line 573 is not released after line 603. 93 | | 5 | Math-87 | NULL_DEREFERENCE | object returned by `getBasicRow(this,(getArtificialVariableOffset()+artificialVar))` could be null and is dereferenced at line 249. 94 | -------------------------------------------------------------------------------- /python/Util.py: -------------------------------------------------------------------------------- 1 | ''' 2 | 3 | Created on Nov. 
23, 2017 4 | 5 | @author Andrew Habib 6 | 7 | ''' 8 | 9 | import json 10 | import os 11 | 12 | from collections import OrderedDict, namedtuple 13 | from xml.etree import cElementTree as ET 14 | 15 | 16 | class DataReader(object): 17 | 18 | def __init__(self, data_paths): 19 | self.data_paths = data_paths 20 | 21 | def __iter__(self): 22 | for data_path in self.data_paths: 23 | name = os.path.split(data_path)[1] 24 | with open(data_path, 'r') as file: 25 | content = file.readlines() 26 | yield name, content 27 | 28 | 29 | class XmlReader(object): 30 | 31 | def __init__(self, data_paths): 32 | self.data_paths = data_paths 33 | 34 | def __iter__(self): 35 | for data_path in self.data_paths: 36 | name = os.path.split(data_path)[1] 37 | with open(data_path, 'r') as file: 38 | yield name.replace('.xml', ''), ET.iterparse(file) 39 | 40 | 41 | class JsonReader(object): 42 | 43 | def __init__(self, data_path): 44 | self.data_path = data_path 45 | 46 | def __iter__(self): 47 | with open(self.data_path, 'r') as file: 48 | entries = json.load(file) 49 | for entry in entries: 50 | yield entry 51 | 52 | 53 | class JsonDataReader(object): 54 | 55 | def __init__(self, data_paths): 56 | self.data_paths = data_paths 57 | 58 | def __iter__(self): 59 | for data_path in self.data_paths: 60 | name = os.path.split(data_path)[1] 61 | if os.path.getsize(data_path) < 1: 62 | yield name, None 63 | else: 64 | with open(data_path, 'r') as file: 65 | entries = json.load(file) 66 | for entry in entries: 67 | yield name, entry 68 | 69 | 70 | def load_json_list(json_file): 71 | json_list = [] 72 | for entry in JsonReader(json_file): 73 | json_list.append(entry) 74 | return json_list 75 | 76 | 77 | def get_list_of_uniq_jsons(lst): 78 | uniq = [] 79 | for new in lst: 80 | found = False 81 | for old in uniq: 82 | if new == old: 83 | found = True 84 | break 85 | if not found: 86 | uniq.append(new) 87 | return uniq 88 | 89 | 90 | class PrettyDict(dict): 91 | 92 | def __str__(self): 93 | return "{" + ", ".join("%r: %r\n" % (key, self[key]) for key in sorted(self)) + "}" 94 | 95 | __repr__ = __str__ 96 | 97 | 98 | class ErrorproneMsg(object): 99 | 100 | keys = [' Proj', 101 | 'Class', 102 | ' Type', 103 | ' Cat', 104 | ' Msg', 105 | ' Code', 106 | ' Mark', 107 | ' Line'] 108 | 109 | def __init__(self, proj, cls, typ, cat, msg, code, mark, line): 110 | self.proj = proj 111 | self.cls = cls 112 | self.typ = typ 113 | self.cat = cat 114 | self.msg = msg 115 | self.code = code 116 | self.mark = mark 117 | self.line = int(line) 118 | self.values = [self.proj, self.cls, self.typ, self.cat, 119 | self.msg, self.code, self.mark, self.line] 120 | 121 | def __str__(self): 122 | return("\n" + "\n".join(k + ": " + str(v) for (k, v) in zip(ErrorproneMsg.keys, self.values)) + "\n") 123 | 124 | __repr__ = __str__ 125 | 126 | 127 | class SpotbugsMsg(object): 128 | 129 | keys = [' Proj', 130 | ' Class', 131 | ' Cat', 132 | ' Abbrev', 133 | ' Type', 134 | 'Priority', 135 | ' Rank', 136 | ' Msg', 137 | ' Method', 138 | ' Field', 139 | ' Lines'] 140 | 141 | def __init__(self, proj, cls, cat, abbrev, typ, prio, rank, msg, mth, field, lines): 142 | self.proj = proj 143 | self.cls = cls 144 | self.cat = cat 145 | self.abbrev = abbrev 146 | self.typ = typ 147 | self.prio = prio 148 | self.rank = rank 149 | self.msg = msg 150 | self.mth = mth 151 | self.field = field 152 | # lines could be list of tuples during serialization 153 | # or list of lists during deserialization 154 | # so construct namedtuples here instead of passing it from outside 155 | # 
so that it works during deserialization also. 156 | self.lines = [] 157 | for l in lines: 158 | self.lines.append(SpotbugsSrcline(int(l[0]), int(l[1]), l[2])) 159 | self.values = [self.proj, self.cls, self.cat, self.abbrev, self.typ, self.prio, 160 | self.rank, self.msg, self.mth, self.field, self.lines] 161 | 162 | def __str__(self): 163 | return("\n" + "\n".join(k + ": " + str(v) for (k, v) in zip(SpotbugsMsg.keys, self.values)) + "\n") 164 | 165 | __repr__ = __str__ 166 | 167 | def unrollLines(self): 168 | lines = [] 169 | for l in self.lines: 170 | lines.extend(range(l.start, l.end + 1)) 171 | return list(set(lines)) 172 | 173 | 174 | SpotbugsSrcline = namedtuple('SpotbugsSrcline', ['start', 'end', 'role']) 175 | 176 | ''' 177 | InferIssue and InferBugTrace are slightly modified to cope 178 | with the new json format in Infer 0.15.0 179 | ''' 180 | class InferIssue(object): 181 | # keys = ['bug_class', 'kind', 'bug_type', 'qualifier', 'severity', 'visibility', 'line', 182 | # 'column', 'procedure', 'procedure_id', 'procedure_start_line', 'file', 'bug_trace', 183 | # 'key', 'qualifier_tags', 'hash', 'bug_type_hum'] 184 | keys = ['bug_class', 'kind', 'bug_type', 'qualifier', 'severity', 'visibility', 'line', 185 | 'column', 'procedure', 'procedure_id', 'procedure_start_line', 'file', 'bug_trace', 186 | 'key', 'node_key', 'hash', 'bug_type_hum'] 187 | def __init__(self, bug_class, kind, bug_type, qualifier, severity, visibility, 188 | line, column, procedure, procedure_id, procedure_start_line, 189 | file, bug_trace, key, qualifier_tags, hashh, bug_type_hum): 190 | self.bug_class = bug_class 191 | self.kind = kind 192 | self.bug_type = bug_type 193 | self.qualifier = qualifier 194 | self.severity = severity 195 | self.visibility = visibility 196 | self.line = line 197 | self.column = column 198 | self.procedure = procedure 199 | self.procedure_id = procedure_id 200 | self.procedure_start_line = procedure_start_line 201 | self.file = file 202 | self.bug_trace = [] 203 | for t in bug_trace: 204 | self.bug_trace.append(InferBugTrace(*list(t[k] for k in InferBugTrace.keys))) 205 | self.key = key 206 | # self.qualifier_tags = qualifier_tags 207 | self.hashh = hashh 208 | self.bug_type_hum = bug_type_hum 209 | 210 | self.values = [self.bug_class, self.kind, self.bug_type, self.qualifier, 211 | self.severity, self.visibility, self.line, self.column, 212 | self.procedure, self.procedure_id, self.procedure_start_line, 213 | self.file, self.bug_trace, self.key, 214 | # self.qualifier_tags, 215 | self.hashh, self.bug_type_hum] 216 | 217 | def __str__(self): 218 | return("\n" + "\n".join(k + ": " + str(v) for (k, v) in zip(InferIssue.keys, self.values)) + "\n") 219 | 220 | __repr__ = __str__ 221 | 222 | 223 | class InferBugTrace(object): 224 | # keys = ['level', 'filename', 'line_number', 'column_number', 'description', 'node_tags'] 225 | keys = ['level', 'filename', 'line_number', 'column_number', 'description'] 226 | 227 | # def __init__(self, level, filename, line, column, desc, tags): 228 | def __init__(self, level, filename, line, column, desc): 229 | self.level = level 230 | self.filename = filename 231 | self.line = line 232 | self.column = column 233 | self.desc = desc 234 | # self.tags = tags 235 | 236 | # self.values = [self.level, self.filename, self.line, self.column, self.desc, self.tags] 237 | self.values = [self.level, self.filename, self.line, self.column, self.desc] 238 | 239 | def __str__(self): 240 | return("\n" + "\n".join(k + ": " + str(v) for (k, v) in zip(InferBugTrace.keys, 
self.values)) + "\n")
241 | 
242 |     __repr__ = __str__
243 | 
244 | 
245 | class InferMsg(object):
246 |     keys = [' Proj',
247 |             ' Class',
248 |             ' Bug_Class',
249 |             ' Kind',
250 |             ' Bug_Type',
251 |             ' Msg',
252 |             ' Severity',
253 |             'Visibility',
254 |             ' Lines',
255 |             ' Procedure']
256 | 
257 |     def __init__(self, proj, cls, bug_class, kind, bug_type, msg,
258 |                  severity, visibility, lines, procedure):
259 |         self.proj = proj
260 |         self.cls = cls
261 |         self.bug_class = bug_class
262 |         self.kind = kind
263 |         self.bug_type = bug_type
264 |         self.msg = msg
265 |         self.severity = severity
266 |         self.visibility = visibility
267 |         self.lines = lines
268 |         self.procedure = procedure
269 | 
270 |         self.values = [self.proj, self.cls, self.bug_class, self.kind, self.bug_type, self.msg,
271 |                        self.severity, self.visibility, self.lines, self.procedure]
272 | 
273 |     def __str__(self):
274 |         return("\n" + "\n".join(k + ": " + str(v) for (k, v) in zip(InferMsg.keys, self.values)))
275 | 
276 |     __repr__ = __str__
277 | 
278 | 
279 | class FileDiff(object):
280 | 
281 |     keys = ['Project: ',
282 |             ' Class: ',
283 |             ' Lines: ']
284 | 
285 |     def __init__(self, proj, cls, lines):
286 |         self.proj = proj
287 |         self.cls = cls
288 |         self.lines = set(int(i) for i in lines)
289 |         self.values = [self.proj, self.cls, self.lines]
290 | 
291 |     def __str__(self):
292 |         return("\n" + "\n".join(k + str(v) for (k, v) in zip(FileDiff.keys, self.values)) + "\n")
293 | 
294 |     __repr__ = __str__
295 | 
296 | 
297 | class CustomEncoder(json.JSONEncoder):
298 | 
299 |     def default(self, o):
300 |         if isinstance(o, ErrorproneMsg):
301 |             return OrderedDict(zip(ErrorproneMsg.keys, o.values))
302 |         elif isinstance(o, InferIssue):
303 |             return OrderedDict(zip(InferIssue.keys, o.values))
304 |         elif isinstance(o, InferMsg):
305 |             return OrderedDict(zip(InferMsg.keys, o.values))
306 |         elif isinstance(o, SpotbugsMsg):
307 |             return OrderedDict(zip(SpotbugsMsg.keys, o.values))
308 |         elif isinstance(o, FileDiff):
309 |             return OrderedDict(zip(FileDiff.keys, o.values))
310 |         elif isinstance(o, set):
311 |             return list(o)
312 |         else:
313 |             return json.JSONEncoder.default(self, o)  # raise the standard TypeError for unsupported types
314 | 
315 | 
316 | def load_parsed_diffs(diffs_file):
317 |     diffs_ = []
318 |     for diff in JsonReader(diffs_file):
319 |         inst = FileDiff(*list(diff[k] for k in FileDiff.keys))
320 |         diffs_.append(inst)
321 |     return diffs_
322 | 
323 | 
324 | def load_parsed_ep(ep_file):
325 |     ep_res_ = []
326 |     for msg in JsonReader(ep_file):
327 |         inst = ErrorproneMsg(*list(msg[k] for k in ErrorproneMsg.keys))
328 |         ep_res_.append(inst)
329 |     return ep_res_
330 | 
331 | 
332 | def load_parsed_sb(sb_file):
333 |     sb_res_ = []
334 |     for msg in JsonReader(sb_file):
335 |         inst = SpotbugsMsg(*list(msg[k] for k in SpotbugsMsg.keys))
336 |         sb_res_.append(inst)
337 |     return sb_res_
338 | 
339 | 
340 | def load_parsed_inf(inf_file):
341 |     inf_res_ = []
342 |     for msg in JsonReader(inf_file):
343 |         inst = InferMsg(*list(msg[k] for k in InferMsg.keys))
344 |         inf_res_.append(inst)
345 |     return inf_res_
346 | 
347 | 
348 | def find_msg_by_proj_and_cls(proj, cls, msgs):
349 |     found_messages = []
350 |     for m in msgs:
351 |         if m.proj == proj and m.cls == cls:
352 |             found_messages.append(m)
353 |     return found_messages
354 | 
355 | 
356 | LineMatchesToMessages = namedtuple('LineMatchesToMessages', ['lines', 'messages'])
357 | 
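# Illustrative round-trip sketch (hypothetical values, not part of the pipeline):
# objects serialized with CustomEncoder can be read back with the load_parsed_*
# helpers above, e.g. for Error Prone messages:
#
#   msg = ErrorproneMsg('Chart-1', 'org.example.Foo', 'DeadStore', 'WARNING',
#                       'Dead store to x', 'int x = 1;', '^', 42)
#   with open('ep_parsed.json', 'w') as f:
#       json.dump([msg], f, cls=CustomEncoder, indent=4)
#   assert load_parsed_ep('ep_parsed.json')[0].proj == 'Chart-1'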
358 | 
359 | def get_cls_name_from_file_path(cls_path):
360 |     cls = None
361 |     if '/com/' in cls_path:
362 |         cls = 'com.' + cls_path.split('/com/')[1].replace('/', '.').replace('.java', '')
363 |     elif '/org/' in cls_path:
364 |         cls = 'org.' + cls_path.split('/org/')[1].replace('/', '.').replace('.java', '')
365 |     return cls
366 | 
367 | 
368 | def prepare_tool(path, proj):
369 | 
370 |     proj_dir = os.path.join(path, proj)
371 | 
372 |     with open(os.path.join(proj_dir, 'prop-source-dir')) as file:
373 |         proj_src = file.read()
374 |     proj_src = os.path.join(proj_dir, proj_src)
375 | 
376 |     with open(os.path.join(proj_dir, 'prop-compile-path')) as file:
377 |         proj_cp = file.read()
378 | 
379 |     with open(os.path.join(proj_dir, 'prop-buggy-classes')) as file:
380 |         proj_buggy_classes = file.read().splitlines()
381 | 
382 |     try:
383 |         with open(os.path.join(proj_dir, 'prop-exclude-classes')) as file:
384 |             proj_exclude_classes = file.read().splitlines()
385 |     except IOError:
386 |         proj_exclude_classes = []
387 | 
388 |     proj_buggy_classes = set(proj_buggy_classes) - set(proj_exclude_classes)
389 | 
390 |     proj_buggy_files = map(lambda f: os.path.join(proj_src, f.replace('.', '/') + '.java'), proj_buggy_classes)
391 | 
392 |     try:
393 |         with open(os.path.join(proj_dir, 'prop-javac-options')) as file:
394 |             proj_javac_opts = file.read()
395 |     except IOError:
396 |         proj_javac_opts = ""
397 | 
398 |     return proj_src, proj_cp, proj_javac_opts, proj_buggy_files, proj_buggy_classes
399 | 
400 | NO_WARNING = "NO_WARNING"
--------------------------------------------------------------------------------
/python/ComputeStatsOnD4JToolsResults.py:
--------------------------------------------------------------------------------
1 | '''
2 | 
3 | Created on Jan. 24, 2018
4 | 
5 | @author Andrew Habib
6 | 
7 | '''
8 | 
9 | from statistics import mean
10 | from collections import Counter
11 | import os
12 | 
13 | from Util import load_parsed_ep, load_parsed_inf, load_parsed_sb, load_json_list, get_list_of_uniq_jsons
14 | 
15 | 
16 | def display_min_max_avg_warnings_per_bug_total():
17 |     print("\nMin, Max, Avg (warnings per bug) and Total number of warnings")
18 |     print("\nBuggy versions:\n")
19 |     rel_path = './b/'
20 |     ep_all = load_parsed_ep(rel_path + 'ep_parsed.json')
21 |     inf_all = load_parsed_inf(rel_path + 'inf_parsed.json')
22 |     sb_all = load_parsed_sb(rel_path + 'sb_parsed.json')
23 |     print("Tool Min. Max. Avg. Total")
24 |     print("Errorprone", get_min_max_avg_warnings_per_bug_total(ep_all))
25 |     print("Infer", get_min_max_avg_warnings_per_bug_total(inf_all))
26 |     print("Spotbugs", get_min_max_avg_warnings_per_bug_total(sb_all))
27 |     print("\nTotal number of warnings by all tools:",
28 |           get_min_max_avg_warnings_per_bug_total(ep_all)[3] + get_min_max_avg_warnings_per_bug_total(inf_all)[3] + get_min_max_avg_warnings_per_bug_total(sb_all)[3])
29 |     ''''''
30 |     print("\nFixed versions:\n")
31 |     rel_path = './f/'
32 |     ep_all = load_parsed_ep(rel_path + 'ep_parsed.json')
33 |     inf_all = load_parsed_inf(rel_path + 'inf_parsed.json')
34 |     sb_all = load_parsed_sb(rel_path + 'sb_parsed.json')
35 |     print("Tool Min. Max. Avg. Total")
36 |     print("Errorprone", get_min_max_avg_warnings_per_bug_total(ep_all))
37 |     print("Infer", get_min_max_avg_warnings_per_bug_total(inf_all))
38 |     print("Spotbugs", get_min_max_avg_warnings_per_bug_total(sb_all))
39 |     print("\nTotal number of warnings by all tools:",
40 |           get_min_max_avg_warnings_per_bug_total(ep_all)[3] + get_min_max_avg_warnings_per_bug_total(inf_all)[3] + get_min_max_avg_warnings_per_bug_total(sb_all)[3])
41 | 
42 | 
43 | def get_min_max_avg_warnings_per_bug_total(warnings):
44 |     count = [w.proj for w in warnings]
45 |     counter = Counter(count)
46 |     return min(counter.values()), max(counter.values()), mean(counter.values()), len(count)
47 | 
48 | 
49 | def get_warnings_bugs_from_each_approach():
50 |     print("\nWarnings and bugs from each automatic matching approach")
51 |     print("** warnings for combined approach are not unique (duplicates exist) **\n")
52 |     rel_path = './diffs_warnings/'
53 |     ep_res1 = load_parsed_ep(rel_path + "ep_warnings.json")
54 |     inf_res1 = load_parsed_inf(rel_path + "inf_warnings.json")
55 |     sb_res1 = load_parsed_sb(rel_path + "sb_warnings.json")
56 |     rel_path = './removed_warnings/'
57 |     ep_res2 = load_parsed_ep(rel_path + "ep_warnings.json")
58 |     inf_res2 = load_parsed_inf(rel_path + "inf_warnings.json")
59 |     sb_res2 = load_parsed_sb(rel_path + "sb_warnings.json")
60 |     _all_b = []
61 |     print("Tool Diff-based Fixed-based Combined")
62 |     print(" W B W B W B")
63 |     all_b = []
64 |     b_diff = get_bugs_from_warnings(ep_res1)
65 |     b_fixed = get_bugs_from_warnings(ep_res2)
66 |     all_b.extend(b_diff)
67 |     all_b.extend(b_fixed)
68 |     _all_b.extend(all_b)
69 |     print("Error Prone ", len(ep_res1), len(b_diff), len(ep_res2), len(b_fixed), len(ep_res1) + len(ep_res2), len(b_diff | b_fixed))
70 | 
71 |     all_b = []
72 |     b_diff = get_bugs_from_warnings(inf_res1)
73 |     b_fixed = get_bugs_from_warnings(inf_res2)
74 |     all_b.extend(b_diff)
75 |     all_b.extend(b_fixed)
76 |     _all_b.extend(all_b)
77 |     print("Infer ", len(inf_res1), len(b_diff), len(inf_res2), len(b_fixed), len(inf_res1) + len(inf_res2), len(b_diff | b_fixed))
78 | 
79 |     all_b = []
80 |     b_diff = get_bugs_from_warnings(sb_res1)
81 |     b_fixed = get_bugs_from_warnings(sb_res2)
82 |     all_b.extend(b_diff)
83 |     all_b.extend(b_fixed)
84 |     _all_b.extend(all_b)
85 |     print("SpotBugs ", len(sb_res1), len(b_diff), len(sb_res2), len(b_fixed), len(sb_res1) + len(sb_res2), len(b_diff | b_fixed))
86 | 
87 |     print("\nUnique warnings from each approach and from the combined approach:\n")
88 |     rel_path = './diffs_warnings/'
89 |     ep_res1 = load_json_list(rel_path + "ep_warnings.json")
90 |     inf_res1 = load_json_list(rel_path + "inf_warnings.json")
91 |     sb_res1 = load_json_list(rel_path + "sb_warnings.json")
92 |     rel_path = './removed_warnings/'
93 |     ep_res2 = load_json_list(rel_path + "ep_warnings.json")
94 |     inf_res2 = load_json_list(rel_path + "inf_warnings.json")
95 |     sb_res2 = load_json_list(rel_path + "sb_warnings.json")
96 | 
97 |     print("Ep ", len(ep_res1), len(ep_res2), len(get_list_of_uniq_jsons(ep_res1 + ep_res2)))
98 |     print("Inf", len(inf_res1), len(inf_res2), len(get_list_of_uniq_jsons(inf_res1 + inf_res2)))
99 |     print("Sb ", len(sb_res1), len(sb_res2), len(get_list_of_uniq_jsons(sb_res1 + sb_res2)))
100 | 
101 |     print("\nUnique bugs from combined approach: ", len(set(_all_b)))
102 | 
103 | 
104 | def get_bugs_from_warnings(warnings):
105 |     bugs = set(w.proj for w in warnings)
106 |     return bugs
107 | 
108 | 
109 | def count_bugs_from_warnings(warnings):
110 |     bugs = set(w.proj for w in warnings)
111 |     return(len(bugs))
112 | 
113 | 
114 | def 
get_manually_inspected_warnings_bugs(): 115 | print("\nManual inspection of warnings aggregated on warnings and bugs levels") 116 | print("\nDiffs-based approach:\n") 117 | rel_path = './diffs_warnings/' 118 | ep_res = load_parsed_ep(rel_path + "ep_warnings.json") 119 | ep_succ = load_parsed_ep(rel_path + "ep_succ.json") 120 | ep_part = load_parsed_ep(rel_path + "ep_part.json") 121 | ep_fail = load_parsed_ep(rel_path + "ep_fail.json") 122 | inf_res = load_parsed_inf(rel_path + "inf_warnings.json") 123 | inf_succ = load_parsed_inf(rel_path + "inf_succ.json") 124 | inf_part = load_parsed_inf(rel_path + "inf_part.json") 125 | inf_fail = load_parsed_inf(rel_path + "inf_fail.json") 126 | sb_res = load_parsed_sb(rel_path + "sb_warnings.json") 127 | sb_succ = load_parsed_sb(rel_path + "sb_succ.json") 128 | sb_part = load_parsed_sb(rel_path + "sb_part.json") 129 | sb_fail = load_parsed_sb(rel_path + "sb_fail.json") 130 | print("Warnings:\n") 131 | print('Tool "Full match" "Partial match" Mismatch Total') 132 | print('"Error Prone"', len(ep_succ), len(ep_part), len(ep_fail), len(ep_res)) 133 | print("Infer", len(inf_succ), len(inf_part), len(inf_fail), len(inf_res)) 134 | print("Spotbugs", len(sb_succ), len(sb_part), len(sb_fail), len(sb_res)) 135 | print("\nBugs:\n") 136 | print('Tool "Full match" "Partial match" Mismatch Total') 137 | print('"Error Prone"', count_bugs_from_warnings(ep_succ), count_bugs_from_warnings(ep_part), count_bugs_from_warnings(ep_fail), count_bugs_from_warnings(ep_res)) 138 | print("Infer", count_bugs_from_warnings(inf_succ), count_bugs_from_warnings(inf_part), count_bugs_from_warnings(inf_fail), count_bugs_from_warnings(inf_res)) 139 | print("Spotbugs", count_bugs_from_warnings(sb_succ), count_bugs_from_warnings(sb_part), count_bugs_from_warnings(sb_fail), count_bugs_from_warnings(sb_res)) 140 | 141 | print("\nFixed warnings approach\n") 142 | rel_path = './removed_warnings/' 143 | ep_res = load_parsed_ep(rel_path + "ep_warnings.json") 144 | ep_succ = load_parsed_ep(rel_path + "ep_succ.json") 145 | ep_part = load_parsed_ep(rel_path + "ep_part.json") 146 | ep_fail = load_parsed_ep(rel_path + "ep_fail.json") 147 | inf_res = load_parsed_inf(rel_path + "inf_warnings.json") 148 | inf_succ = load_parsed_inf(rel_path + "inf_succ.json") 149 | inf_part = load_parsed_inf(rel_path + "inf_part.json") 150 | inf_fail = load_parsed_inf(rel_path + "inf_fail.json") 151 | sb_res = load_parsed_sb(rel_path + "sb_warnings.json") 152 | sb_succ = load_parsed_sb(rel_path + "sb_succ.json") 153 | sb_part = load_parsed_sb(rel_path + "sb_part.json") 154 | sb_fail = load_parsed_sb(rel_path + "sb_fail.json") 155 | print("Warnings:\n") 156 | print('Tool "Full match" "Partial match" Mismatch Total') 157 | print('"Error Prone"', len(ep_succ), len(ep_part), len(ep_fail), len(ep_res)) 158 | print("Infer", len(inf_succ), len(inf_part), len(inf_fail), len(inf_res)) 159 | print("Spotbugs", len(sb_succ), len(sb_part), len(sb_fail), len(sb_res)) 160 | print("\nBugs:\n") 161 | print('Tool "Full match" "Partial match" Mismatch Total') 162 | print('"Error Prone"', count_bugs_from_warnings(ep_succ), count_bugs_from_warnings(ep_part), count_bugs_from_warnings(ep_fail), count_bugs_from_warnings(ep_res)) 163 | print("Infer", count_bugs_from_warnings(inf_succ), count_bugs_from_warnings(inf_part), count_bugs_from_warnings(inf_fail), count_bugs_from_warnings(inf_res)) 164 | print("Spotbugs", count_bugs_from_warnings(sb_succ), count_bugs_from_warnings(sb_part), count_bugs_from_warnings(sb_fail), 
count_bugs_from_warnings(sb_res))
165 | 
166 |     get_manually_inspected_warnings_bugs_combined_approach()
167 | 
168 | 
169 | def get_manually_inspected_warnings_bugs_combined_approach():
170 |     print("\nCombined approach\n")
171 |     rel_path = './diffs_warnings/'
172 |     ep_succ1 = load_json_list(rel_path + "ep_succ.json")
173 |     ep_part1 = load_json_list(rel_path + "ep_part.json")
174 |     ep_fail1 = load_json_list(rel_path + "ep_fail.json")
175 | 
176 |     inf_succ1 = load_json_list(rel_path + "inf_succ.json")
177 |     inf_part1 = load_json_list(rel_path + "inf_part.json")
178 |     inf_fail1 = load_json_list(rel_path + "inf_fail.json")
179 | 
180 |     sb_succ1 = load_json_list(rel_path + "sb_succ.json")
181 |     sb_part1 = load_json_list(rel_path + "sb_part.json")
182 |     sb_fail1 = load_json_list(rel_path + "sb_fail.json")
183 | 
184 |     rel_path = './removed_warnings/'
185 |     ep_succ2 = load_json_list(rel_path + "ep_succ.json")
186 |     ep_part2 = load_json_list(rel_path + "ep_part.json")
187 |     ep_fail2 = load_json_list(rel_path + "ep_fail.json")
188 | 
189 |     inf_succ2 = load_json_list(rel_path + "inf_succ.json")
190 |     inf_part2 = load_json_list(rel_path + "inf_part.json")
191 |     inf_fail2 = load_json_list(rel_path + "inf_fail.json")
192 | 
193 |     sb_succ2 = load_json_list(rel_path + "sb_succ.json")
194 |     sb_part2 = load_json_list(rel_path + "sb_part.json")
195 |     sb_fail2 = load_json_list(rel_path + "sb_fail.json")
196 | 
197 |     # combined data #
198 |     ep_succ = get_list_of_uniq_jsons(ep_succ1 + ep_succ2)
199 |     ep_part = get_list_of_uniq_jsons(ep_part1 + ep_part2)
200 |     ep_fail = get_list_of_uniq_jsons(ep_fail1 + ep_fail2)
201 | 
202 |     inf_succ = get_list_of_uniq_jsons(inf_succ1 + inf_succ2)
203 |     inf_part = get_list_of_uniq_jsons(inf_part1 + inf_part2)
204 |     inf_fail = get_list_of_uniq_jsons(inf_fail1 + inf_fail2)
205 | 
206 |     sb_succ = get_list_of_uniq_jsons(sb_succ1 + sb_succ2)
207 |     sb_part = get_list_of_uniq_jsons(sb_part1 + sb_part2)
208 |     sb_fail = get_list_of_uniq_jsons(sb_fail1 + sb_fail2)
209 | 
210 |     print("Warnings:\n")
211 |     print('Tool "Full match" "Partial match" Mismatch Total')
212 |     print('"Error Prone"', len(ep_succ), len(ep_part), len(ep_fail), len(ep_succ) + len(ep_part) + len(ep_fail))
213 |     print('Infer', len(inf_succ), len(inf_part), len(inf_fail), len(inf_succ) + len(inf_part) + len(inf_fail))
214 |     print('SpotBugs', len(sb_succ), len(sb_part), len(sb_fail), len(sb_succ) + len(sb_part) + len(sb_fail))
215 | 
216 |     print("\nBugs:\n")
217 |     print('Tool "Full match" "Partial match" Mismatch Total')
218 |     b_succ, b_part, b_fail = len(Counter(p[' Proj'] for p in ep_succ)), len(Counter(p[' Proj'] for p in ep_part)), len(Counter(p[' Proj'] for p in ep_fail))
219 |     print('"Error Prone"', b_succ, b_part, b_fail, b_succ + b_part + b_fail)
220 | 
221 |     b_succ, b_part, b_fail = len(Counter(p[' Proj'] for p in inf_succ)), len(Counter(p[' Proj'] for p in inf_part)), len(Counter(p[' Proj'] for p in inf_fail))
222 |     print('Infer', b_succ, b_part, b_fail, b_succ + b_part + b_fail)
223 | 
224 |     b_succ, b_part, b_fail = len(Counter(p[' Proj'] for p in sb_succ)), len(Counter(p[' Proj'] for p in sb_part)), len(Counter(p[' Proj'] for p in sb_fail))
225 |     print('SpotBugs', b_succ, b_part, b_fail, b_succ + b_part + b_fail)
226 | 
227 | 
228 | def get_cand_detected_bugs_tools_sets():
229 |     print("\nCandidate and detected bugs by each tool and each approach")
230 |     rel_path = './diffs_warnings/'
231 |     ep_res1 = load_parsed_ep(rel_path + "ep_warnings.json")
232 |     ep_succ1 = load_parsed_ep(rel_path + "ep_succ.json")
233 |     ep_part1 = 
load_parsed_ep(rel_path + "ep_part.json") 234 | 235 | inf_res1 = load_parsed_inf(rel_path + "inf_warnings.json") 236 | inf_succ1 = load_parsed_inf(rel_path + "inf_succ.json") 237 | inf_part1 = load_parsed_inf(rel_path + "inf_part.json") 238 | 239 | sb_res1 = load_parsed_sb(rel_path + "sb_warnings.json") 240 | sb_succ1 = load_parsed_sb(rel_path + "sb_succ.json") 241 | sb_part1 = load_parsed_sb(rel_path + "sb_part.json") 242 | 243 | rel_path = './removed_warnings/' 244 | ep_res2 = load_parsed_ep(rel_path + "ep_warnings.json") 245 | ep_succ2 = load_parsed_ep(rel_path + "ep_succ.json") 246 | ep_part2 = load_parsed_ep(rel_path + "ep_part.json") 247 | 248 | inf_res2 = load_parsed_inf(rel_path + "inf_warnings.json") 249 | inf_succ2 = load_parsed_inf(rel_path + "inf_succ.json") 250 | inf_part2 = load_parsed_inf(rel_path + "inf_part.json") 251 | 252 | sb_res2 = load_parsed_sb(rel_path + "sb_warnings.json") 253 | sb_succ2 = load_parsed_sb(rel_path + "sb_succ.json") 254 | sb_part2 = load_parsed_sb(rel_path + "sb_part.json") 255 | 256 | print("\nCandidate bugs:\n") 257 | print("Tool Diff-based Fixed-based Both") 258 | ep_cand_diff = get_bugs_from_warnings(ep_res1) 259 | ep_cand_fixed = get_bugs_from_warnings(ep_res2) 260 | print('"Error Prone"', len(ep_cand_diff), len(ep_cand_fixed), len(ep_cand_diff & ep_cand_fixed)) 261 | inf_cand_diff = get_bugs_from_warnings(inf_res1) 262 | inf_cand_fixed = get_bugs_from_warnings(inf_res2) 263 | print("Infer", len(inf_cand_diff), len(inf_cand_fixed), len(inf_cand_diff & inf_cand_fixed)) 264 | sb_cand_diff = get_bugs_from_warnings(sb_res1) 265 | sb_cand_fixed = get_bugs_from_warnings(sb_res2) 266 | print("Spotbugs", len(sb_cand_diff), len(sb_cand_fixed), len(sb_cand_diff & sb_cand_fixed)) 267 | 268 | print("\nTrue bugs (fully or partially flagged)\n") 269 | print("Tool Diff-based Fixed-based Both") 270 | ep_succ_diff = get_bugs_from_warnings(ep_succ1) | get_bugs_from_warnings(ep_part1) 271 | ep_succ_fixed = get_bugs_from_warnings(ep_succ2) | get_bugs_from_warnings(ep_part2) 272 | print('"Error Prone"', len(ep_succ_diff), len(ep_succ_fixed), len(ep_succ_diff & ep_succ_fixed)) 273 | inf_succ_diff = get_bugs_from_warnings(inf_succ1) | get_bugs_from_warnings(inf_part1) 274 | inf_succ_fixed = get_bugs_from_warnings(inf_succ2) | get_bugs_from_warnings(inf_part2) 275 | print("Infer", len(inf_succ_diff), len(inf_succ_fixed), len(inf_succ_diff & inf_succ_fixed)) 276 | sb_succ_diff = get_bugs_from_warnings(sb_succ1) | get_bugs_from_warnings(sb_part1) 277 | sb_succ_fixed = get_bugs_from_warnings(sb_succ2) | get_bugs_from_warnings(sb_part2) 278 | print("Spotbugs", len(sb_succ_diff), len(sb_succ_fixed), len(sb_succ_diff & sb_succ_fixed)) 279 | 280 | print("\nTrue bugs found by all tools\n") 281 | ep_succ = get_bugs_from_warnings(ep_succ1) | get_bugs_from_warnings(ep_succ2) | get_bugs_from_warnings(ep_part1) | get_bugs_from_warnings(ep_part2) 282 | print("Ep:", len(ep_succ)) 283 | 284 | inf_succ = get_bugs_from_warnings(inf_succ1) | get_bugs_from_warnings(inf_succ2) | get_bugs_from_warnings(inf_part1) | get_bugs_from_warnings(inf_part2) 285 | print("Inf:", len(inf_succ)) 286 | 287 | sb_succ = get_bugs_from_warnings(sb_succ1) | get_bugs_from_warnings(sb_succ2) | get_bugs_from_warnings(sb_part1) | get_bugs_from_warnings(sb_part2) 288 | print("Sb:", len(sb_succ)) 289 | 290 | print("Ep & Inf:", len(ep_succ & inf_succ)) 291 | print("Ep & Sb:", len(ep_succ & sb_succ)) 292 | print("Inf & Sb:", len(inf_succ & sb_succ)) 293 | print("Ep & Inf & Sb:", len(ep_succ & inf_succ & sb_succ)) 294 
| 295 |
296 | def get_cand_detected_bugs_tools_table():
297 |     print("\nAll candidate and detected bugs by each tool and each approach\n")
298 |     rel_path = './diffs_warnings/'
299 |     ep_res1 = load_parsed_ep(rel_path + "ep_warnings.json")
300 |     ep_succ1 = load_parsed_ep(rel_path + "ep_succ.json")
301 |     ep_part1 = load_parsed_ep(rel_path + "ep_part.json")
302 |     ep_fail1 = load_parsed_ep(rel_path + "ep_fail.json")
303 | 
304 |     inf_res1 = load_parsed_inf(rel_path + "inf_warnings.json")
305 |     inf_succ1 = load_parsed_inf(rel_path + "inf_succ.json")
306 |     inf_part1 = load_parsed_inf(rel_path + "inf_part.json")
307 |     inf_fail1 = load_parsed_inf(rel_path + "inf_fail.json")
308 | 
309 |     sb_res1 = load_parsed_sb(rel_path + "sb_warnings.json")
310 |     sb_succ1 = load_parsed_sb(rel_path + "sb_succ.json")
311 |     sb_part1 = load_parsed_sb(rel_path + "sb_part.json")
312 |     sb_fail1 = load_parsed_sb(rel_path + "sb_fail.json")
313 | 
314 |     rel_path = './removed_warnings/'
315 |     ep_res2 = load_parsed_ep(rel_path + "ep_warnings.json")
316 |     ep_succ2 = load_parsed_ep(rel_path + "ep_succ.json")
317 |     ep_part2 = load_parsed_ep(rel_path + "ep_part.json")
318 |     ep_fail2 = load_parsed_ep(rel_path + "ep_fail.json")
319 | 
320 |     inf_res2 = load_parsed_inf(rel_path + "inf_warnings.json")
321 |     inf_succ2 = load_parsed_inf(rel_path + "inf_succ.json")
322 |     inf_part2 = load_parsed_inf(rel_path + "inf_part.json")
323 |     inf_fail2 = load_parsed_inf(rel_path + "inf_fail.json")
324 | 
325 |     sb_res2 = load_parsed_sb(rel_path + "sb_warnings.json")
326 |     sb_succ2 = load_parsed_sb(rel_path + "sb_succ.json")
327 |     sb_part2 = load_parsed_sb(rel_path + "sb_part.json")
328 |     sb_fail2 = load_parsed_sb(rel_path + "sb_fail.json")
329 | 
330 |     bugs = []
331 | 
332 |     bugs.extend(w.proj for w in ep_res1)
333 |     bugs.extend(w.proj for w in inf_res1)
334 |     bugs.extend(w.proj for w in sb_res1)
335 | 
336 |     bugs.extend(w.proj for w in ep_res2)
337 |     bugs.extend(w.proj for w in inf_res2)
338 |     bugs.extend(w.proj for w in sb_res2)
339 | 
340 |     bugs = sorted(list(set(bugs)))
341 | 
342 |     print("     Diffs-based     Removed Warnings     Combined")  # column order matches the *1 (diffs), *2 (removed), combined blocks below
343 |     print("Tool Ep Inf SB Ep Inf SB Ep Inf SB")
344 |     for b in bugs:
345 |         entry = b + " "
346 |         #####################################
347 |         if b in get_bugs_from_warnings(ep_succ1):
348 |             entry += "& F "
349 |         elif b in get_bugs_from_warnings(ep_part1):
350 |             entry += "& P "
351 |         elif b in get_bugs_from_warnings(ep_fail1):
352 |             entry += "& M "
353 |         else:
354 |             entry += "& - "
355 | 
356 |         if b in get_bugs_from_warnings(inf_succ1):
357 |             entry += "& F "
358 |         elif b in get_bugs_from_warnings(inf_part1):
359 |             entry += "& P "
360 |         elif b in get_bugs_from_warnings(inf_fail1):
361 |             entry += "& M "
362 |         else:
363 |             entry += "& - "
364 | 
365 |         if b in get_bugs_from_warnings(sb_succ1):
366 |             entry += "& F "
367 |         elif b in get_bugs_from_warnings(sb_part1):
368 |             entry += "& P "
369 |         elif b in get_bugs_from_warnings(sb_fail1):
370 |             entry += "& M "
371 |         else:
372 |             entry += "& - "
373 | 
374 |         #####################################
375 |         if b in get_bugs_from_warnings(ep_succ2):
376 |             entry += "& F "
377 |         elif b in get_bugs_from_warnings(ep_part2):
378 |             entry += "& P "
379 |         elif b in get_bugs_from_warnings(ep_fail2):
380 |             entry += "& M "
381 |         else:
382 |             entry += "& - "
383 | 
384 |         if b in get_bugs_from_warnings(inf_succ2):
385 |             entry += "& F "
386 |         elif b in get_bugs_from_warnings(inf_part2):
387 |             entry += "& P "
388 |         elif b in get_bugs_from_warnings(inf_fail2):
389 |             entry += "& M "
390 |         else:
391 |             entry += "& - "
392 
| 393 | if b in get_bugs_from_warnings(sb_succ2): 394 | entry += "& F " 395 | elif b in get_bugs_from_warnings(sb_part2): 396 | entry += "& P " 397 | elif b in get_bugs_from_warnings(sb_fail2): 398 | entry += "& M " 399 | else: 400 | entry += "& - " 401 | 402 | ##################################### 403 | if b in get_bugs_from_warnings(ep_succ1) or b in get_bugs_from_warnings(ep_succ2): 404 | entry += "& F " 405 | elif b in get_bugs_from_warnings(ep_part1) or b in get_bugs_from_warnings(ep_part2): 406 | entry += "& P " 407 | elif b in get_bugs_from_warnings(ep_fail1) or b in get_bugs_from_warnings(ep_fail2): 408 | entry += "& M " 409 | else: 410 | entry += "& - " 411 | 412 | if b in get_bugs_from_warnings(inf_succ1) or b in get_bugs_from_warnings(inf_succ2): 413 | entry += "& F " 414 | elif b in get_bugs_from_warnings(inf_part1) or b in get_bugs_from_warnings(inf_part2): 415 | entry += "& P " 416 | elif b in get_bugs_from_warnings(inf_fail1) or b in get_bugs_from_warnings(inf_fail2): 417 | entry += "& M " 418 | else: 419 | entry += "& - " 420 | 421 | if b in get_bugs_from_warnings(sb_succ1) or b in get_bugs_from_warnings(sb_succ2): 422 | entry += "& F " 423 | elif b in get_bugs_from_warnings(sb_part1) or b in get_bugs_from_warnings(sb_part2): 424 | entry += "& P " 425 | elif b in get_bugs_from_warnings(sb_fail1) or b in get_bugs_from_warnings(sb_fail2): 426 | entry += "& M " 427 | else: 428 | entry += "& - " 429 | 430 | entry += "\\\\" 431 | 432 | print(entry) 433 | 434 | print() 435 | 436 | 437 | def get_true_detected_bugs_by_each_tool(): 438 | rel_path = './diffs_warnings/' 439 | ep_res1 = load_parsed_ep(rel_path + "ep_warnings.json") 440 | ep_succ1 = load_parsed_ep(rel_path + "ep_succ.json") 441 | ep_part1 = load_parsed_ep(rel_path + "ep_part.json") 442 | 443 | inf_res1 = load_parsed_inf(rel_path + "inf_warnings.json") 444 | inf_succ1 = load_parsed_inf(rel_path + "inf_succ.json") 445 | inf_part1 = load_parsed_inf(rel_path + "inf_part.json") 446 | 447 | sb_res1 = load_parsed_sb(rel_path + "sb_warnings.json") 448 | sb_succ1 = load_parsed_sb(rel_path + "sb_succ.json") 449 | sb_part1 = load_parsed_sb(rel_path + "sb_part.json") 450 | 451 | rel_path = './removed_warnings/' 452 | ep_res2 = load_parsed_ep(rel_path + "ep_warnings.json") 453 | ep_succ2 = load_parsed_ep(rel_path + "ep_succ.json") 454 | ep_part2 = load_parsed_ep(rel_path + "ep_part.json") 455 | 456 | inf_res2 = load_parsed_inf(rel_path + "inf_warnings.json") 457 | inf_succ2 = load_parsed_inf(rel_path + "inf_succ.json") 458 | inf_part2 = load_parsed_inf(rel_path + "inf_part.json") 459 | 460 | sb_res2 = load_parsed_sb(rel_path + "sb_warnings.json") 461 | sb_succ2 = load_parsed_sb(rel_path + "sb_succ.json") 462 | sb_part2 = load_parsed_sb(rel_path + "sb_part.json") 463 | 464 | print("\nTrue bugs found by each tool\n") 465 | ep_succ = get_bugs_from_warnings(ep_succ1) | get_bugs_from_warnings(ep_succ2) | get_bugs_from_warnings(ep_part1) | get_bugs_from_warnings(ep_part2) 466 | print("Ep:", len(ep_succ)) 467 | with open(os.path.join(os.getcwd(), "ep_detected"), 'w') as f: 468 | f.write("\n".join(i for i in ep_succ)) 469 | 470 | inf_succ = get_bugs_from_warnings(inf_succ1) | get_bugs_from_warnings(inf_succ2) | get_bugs_from_warnings(inf_part1) | get_bugs_from_warnings(inf_part2) 471 | print("Inf:", len(inf_succ)) 472 | with open(os.path.join(os.getcwd(), "inf_detected"), 'w') as f: 473 | f.write("\n".join(i for i in inf_succ)) 474 | 475 | sb_succ = get_bugs_from_warnings(sb_succ1) | get_bugs_from_warnings(sb_succ2) | 
get_bugs_from_warnings(sb_part1) | get_bugs_from_warnings(sb_part2) 476 | print("Sb:", len(sb_succ)) 477 | with open(os.path.join(os.getcwd(), "sb_detected"), 'w') as f: 478 | f.write("\n".join(i for i in sb_succ)) 479 | 480 | print() 481 | 482 | 483 | 484 | ''' this script has to be run from the results/ directory ''' 485 | 486 | 487 | if __name__ == '__main__': 488 | 489 | # display_min_max_avg_warnings_per_bug_total() 490 | 491 | # get_warnings_bugs_from_each_approach() 492 | 493 | # get_manually_inspected_warnings_bugs() 494 | 495 | # get_cand_detected_bugs_tools_sets() 496 | 497 | # get_cand_detected_bugs_tools_table() 498 | 499 | get_true_detected_bugs_by_each_tool() 500 | --------------------------------------------------------------------------------