├── test
    ├── stata_tests.do
    ├── simple.do
    ├── test_detect.py
    ├── bad.do
    └── bad_corrected.do
├── .gitattributes
├── stata.toc
├── stata_linter.pkg
├── src-py
    └── setup.py
├── LICENSE
├── admin
    ├── ssc-meta-info.md
    └── checklist-submitting-SSC.md
├── .github
    └── workflows
    │   └── python_test.yaml
├── .gitignore
├── run
    └── lint.do
├── src
    ├── stata_linter_utils.py
    ├── lint.sthlp
    ├── lint.ado
    ├── stata_linter_correct.py
    └── stata_linter_detect.py
└── README.md


/test/stata_tests.do:
--------------------------------------------------------------------------------
1 | lint bad.do
2 | 
3 | lint simple.do
4 | 
5 | 


--------------------------------------------------------------------------------
/test/simple.do:
--------------------------------------------------------------------------------
1 | set obs 3
2 | gen x = _n
3 | 
4 | summary x, det
5 | 
6 | exit, clear


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 


--------------------------------------------------------------------------------
/stata.toc:
--------------------------------------------------------------------------------
1 | v 1.02
2 | d DIME Analytics, World Bank Group, Development Economics Research
3 | p stata_linter
4 | 


--------------------------------------------------------------------------------
/stata_linter.pkg:
--------------------------------------------------------------------------------
 1 | v 1.02
 2 | d DIME Analytics, World Bank Group, Development Economics Research
 3 | p stata_linter
 4 | f /src/stata_linter_detect.py
 5 | f /src/stata_linter_correct.py
 6 | f /src/stata_linter_utils.py
 7 | f /src/lint.ado
 8 | f /src/lint.sthlp
 9 | e
10 | 


--------------------------------------------------------------------------------
/src-py/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | setup(
 3 |     name='stata_linter',
 4 |     version='1.0',
 5 |     entry_points={
 6 |         'console_scripts': [
 7 |             'stata_linter_detect=stata_linter_detect:run'
 8 |         ]
 9 |     },
10 |     install_requires=[
11 |           'pandas',
12 |           'openpyxl'
13 |       ]
14 | )
15 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) DIME Analytics, DIME, DEC, The World Bank Group.
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/test/test_detect.py:
--------------------------------------------------------------------------------
 1 | from stata_linter_detect import stata_linter_detect_py
 2 | import subprocess
 3 | 
 4 | class TestCLI:  # runs the `stata_linter_detect` command and checks its exit status
 5 |     def test_cli_bad(self):  # bad.do is expected to give exit status 1
 6 |         assert subprocess.run(["stata_linter_detect", "test/bad.do"]).returncode == 1
 7 |     def test_cli_simple(self):  # simple.do is expected to give exit status 0
 8 |         assert subprocess.run(["stata_linter_detect", "test/simple.do"]).returncode == 0
 9 | 
10 | class TestDetect:
11 |     def test_basic(self):
12 |         assert stata_linter_detect_py(
13 |         input_file="test/bad.do",
14 |         indent=4,
15 |         suppress="0",
16 |         summary="0",
17 |         excel="",
18 |         linemax=80,
19 |         tab_space=4
20 |         ) == 1
21 | 
22 |     def test_excel(self):
23 |         assert stata_linter_detect_py(
24 |         input_file="test/bad.do",
25 |         indent=4,
26 |         suppress="0",
27 |         summary="0",
28 |         excel="linter.xlsx",
29 |         linemax=80,
30 |         tab_space=4
31 |         ) == 1
32 | 
33 |     def test_simple(self):
34 |         assert stata_linter_detect_py(
35 |         input_file="test/simple.do",
36 |         indent=4,
37 |         suppress="0",
38 |         summary="0",
39 |         excel="",
40 |         linemax=80,
41 |         tab_space=4
42 |         ) == 0
43 | 


--------------------------------------------------------------------------------
/admin/ssc-meta-info.md:
--------------------------------------------------------------------------------
 1 | ### PACKAGE NAME:
 2 | STATA_LINTER
 3 | 
 4 | ### TITLE:
 5 | 'STATA_LINTER': tool to detect and correct bad Stata coding practices
 6 | 
 7 | ### DESCRIPTION:
 8 | The stata_linter package provides a linter for Stata code.
 9 | Read about what a linter is here: https://en.wikipedia.org/wiki/Lint_(software).
10 | The package contains a command that detects bad Stata coding practices in a do-file so that users can manually correct them.
11 | The command can also correct some of the issues flagged in a new do-file.
12 | The purpose of the command is to help users improve code clarity, readability, and organization in Stata do-files.
13 | This linter is based on the best practices outlined in The DIME Analytics Coding Guide published as an appendix to the book Development Research in Practice.
14 | See here https://worldbank.github.io/dime-data-handbook/coding.html. For more info about this linter, see https://github.com/worldbank/stata-linter.
15 | 
16 | ### AUTHOR:
17 | "DIME Analytics, DIME, The World Bank Group", dimeanalytics@worldbank.org
18 | 
19 | ### KEYWORDS:
20 | - linter
21 | - style guide
22 | - code best practices
23 | 
24 | ### STATA VERSION REQUIREMENT:
25 | Stata 16
26 | 
27 | ### FILES REQUIRED TO BE IN PACKAGE:
28 | - lint.ado
29 | - lint.sthlp
30 | - stata_linter_correct.py
31 | - stata_linter_detect.py
32 | - stata_linter_utils.py
33 | 


--------------------------------------------------------------------------------
/.github/workflows/python_test.yaml:
--------------------------------------------------------------------------------
 1 | name: Python package
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - main
 7 |   pull_request:
 8 |     branches:
 9 |       - main
10 | 
11 | jobs:
12 |   build:
13 | 
14 |     runs-on: ubuntu-latest
15 |     strategy:
16 |       matrix:
17 |         python-version: [3.6, 3.7, 3.8, 3.9]
18 | 
19 |     steps:
20 |       - uses: actions/checkout@v4
21 |       - name: Set up Python ${{ matrix.python-version }}
22 |         uses: actions/setup-python@v5
23 |         with:
24 |           python-version: ${{ matrix.python-version }}
25 |       - name: Install dependencies
26 |         run: |
27 |           python -m pip install --upgrade pip
28 |           pip install flake8 pytest pandas openpyxl
29 |           if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
30 |       - name: Lint with flake8
31 |         run: |
32 |           # stop the build if there are Python syntax errors or undefined names
33 |           flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
34 |           # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
35 |           flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
36 |       - name: Install Module
37 |         run: python -m pip install -e src
38 |       - name: Test with pytest
39 |         run: |
40 |           pytest --doctest-modules --junitxml=junit/test-results-${{ matrix.python-version }}.xml
41 |       - name: Publish Unit Test Results
42 |         uses: EnricoMi/publish-unit-test-result-action@v2
43 |         if: always()
44 |         with:
45 |           files: junit/test-results-*.xml
46 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | ########################################################################
 2 | #
 3 | # Based on DIME .gitignore template. Follow the instructions in the URL
 4 | # below to set up this template in your own repository
 5 | # https://github.com/worldbank/dime-github-trainings/tree/master/GitHub-resources/DIME-GitHub-Templates
 6 | #
 7 | # Note that if you are using GitKraken, you need to use version 5.x or more
 8 | # recent for this template to work properly
 9 | #
10 | ########################################################################
11 | 
12 | #######################
13 | # Start by ignoring everything, and below we are explicitly saying
14 | # what to not ignore
15 | *
16 | 
17 | #######################
18 | # List of files with GitHub functionality anywhere in the repo
19 | # that we do not want to ignore
20 | 
21 | # These files include GitHub settings
22 | !.gitignore
23 | !.gitattributes
24 | !.github/workflows/*.yaml
25 | 
26 | # Keep markdown files used for documentation on GitHub
27 | !README.md
28 | !CONTRIBUTING.md
29 | !LICENSE*
30 | 
31 | #######################
32 | # For performance reasons, if a folder is already ignored, then
33 | # Git does not check the content of that folder for matches
34 | # with additional rules. The line below includes folders in the
35 | # top folder (but not their content), so that anything matching
36 | # the rules below will still not be ignored.
37 | !*/
38 | 
39 | #######################
40 | # The following file types are code that should always be
41 | # included no matter where in the repository folder they are
42 | # located unless you explicitly ignore that folder
43 | 
44 | # Stata
45 | !/**/*.do
46 | !/**/*.ado
47 | !/**/*.sthlp
48 | !/**/*.pkg
49 | !/**/stata.toc
50 | 
51 | # R
52 | !/**/*.R
53 | !/**/*.Rmd
54 | 
55 | # LaTeX
56 | !/**/*.tex
57 | !/**/*.bib
58 | 
59 | # Python
60 | !/**/*.py
61 | !/**/*.ipynb
62 | # Still ignore .ipynb files in checkpoint folders
63 | .ipynb_checkpoints
64 | 
65 | # Matlab
66 | !/**/*.m
67 | 
68 | # Markdown
69 | !/**/*.md
70 | 
71 | # Julia
72 | !/**/*.jl
73 | 
74 | 
75 | #######################
76 | # Some admin data in txt format
77 | !/**/admin/**/*.txt
78 | 
79 | 
80 | #######################
81 | # Include all the files with passwords or tokens here. With this template,
82 | # all files named password or passwords are ignored no matter which
83 | # format you are using. Additionally, all content in any folder called
84 | # password or passwords are also ignored. NOTE that your project might be
85 | # using different names and then you must edit the lines below accordingly.
86 | password.*
87 | passwords.*
88 | password/
89 | passwords/
90 | .Rproj.user
91 | 


--------------------------------------------------------------------------------
/run/lint.do:
--------------------------------------------------------------------------------
 1 | * Set the global to folder where test files are stored
 2 | 
 3 | 	global project 	  	"/Users/bbdaniels/GitHub/stata-linter"
 4 | 	global test_dir     "${project}/test"
 5 | 	adopath ++ "${project}/src"
 6 | 	
 7 |   // net install stata_linter, from("https://raw.githubusercontent.com/worldbank/stata-linter/develop") replace
 8 |   run "${project}/src/lint.ado"
 9 | 
10 |   // Detect --------------------------------------------------------------------
11 |   lint "${test_dir}/bad.do",
12 |   lint "${test_dir}/bad.do", verbose  
13 |   lint "${test_dir}/bad.do", verbose nosummary
14 |   lint "${test_dir}/bad.do", nosummary
15 |   
16 |   // Lint with results in excel file
17 |   lint "${test_dir}/bad.do", nosummary          ///
18 |     excel("${test_dir}/detect_lint.xlsx")
19 |   
20 |   // Lint a folder
21 |   lint "${test_dir}"
22 |   lint "${test_dir}", verbose 
23 | 
24 |   // Lint a folder and create an excel file
25 |   lint "${test_dir}",                           ///
26 |     excel("${test_dir}/detect_output_all.xlsx")
27 | 
28 |   // Correct -------------------------------------------------------------------
29 |   lint "${test_dir}/bad.do"                     ///
30 |     using "${test_dir}/bad_corrected.do",       ///
31 |     nosummary                               ///
32 |     replace
33 | 	
34 |   lint "${test_dir}/bad.do"                     ///
35 |     using "${test_dir}/bad_corrected.do",       ///
36 |     nosummary                               ///
37 |     replace automatic
38 | 
39 |   
40 |   // detecting + correcting + excel file results
41 |   lint "${test_dir}/bad.do"                     ///
42 |     using "${test_dir}/bad_corrected.do",       ///
43 |     excel("${test_dir}/detect_lint.xlsx")       ///                               
44 |     replace                                     ///
45 |     automatic   
46 | 
47 |   // Check errors --------------------------------------------------------------
48 |   
49 |   // Invalid file paths
50 |   
51 |   cap lint "oi"
52 |   assert _rc == 601
53 |   
54 |   cap lint oi
55 |   assert _rc == 601
56 |   
57 |   cap lint "oi.do"
58 |   assert _rc == 601
59 |   
60 |   cap lint oi.do
61 |   assert _rc == 601
62 |   
63 |   cap lint "C:\Users\wb501238\Documents\GitHub\iefieldkit\run\output/iecorrect-template.xlsx"
64 |   assert _rc == 198
65 | 
66 |   // This should return an error. Input file is not a do file
67 | 	cap lint "${test_dir}"                        ///
68 | 		using "${test_dir}/bad_corrected.do",       ///
69 | 		nosummary                                   ///
70 | 		replace automatic debug
71 | 	
72 | 	assert _rc == 198
73 | 	
74 | // -----------------------------------------------------------------------------
75 | 
76 | 	adopath - "${project}/src"
77 | 


--------------------------------------------------------------------------------
/test/bad.do:
--------------------------------------------------------------------------------
 1 | * Rules =====================
 2 | * Hard tabs should not be used
 3 | * "delimit" should not be used
 4 | * In brackets after "for" or "if", indentation should be used
 5 | * Too long lines should be divided into multiple lines
 6 | * Before an opening curly bracket "{", put a whitespace
 7 | * Remove blank lines before closing brackets
 8 | * Remove duplicated blank lines
 9 | 
10 | * Stata codes to be corrected =================
11 | 
12 | * All hard tabs are replaced with soft tabs (= whitespaces)
13 | 
14 | 	* delimit is corrected and three forward slashes will be used instead
15 | 	#delimit ;
16 | 
17 | 	foreach something in something something something something something something
18 | 		something something{ ; // some comment
19 | 		do something ;
20 | 	} ;
21 | 
22 | 	#delimit cr
23 | 
24 | 	* Add indentation in brackets
25 | 	if something {
26 | 	do something
27 | 	if another == 1 {
28 | 	do that
29 | 	} 
30 | 	}
31 |   
32 | 	foreach ii in potato potato cassava maize potato ///
33 |   cassava maize potato cassava maize potato cassava maize potato cassava maize potato cassava maize potato cassava maize potato cassava maize { 
34 | 	if something ~= 1 & something != . {
35 | 	do something // some very very very very very very very very very very very very very very very very very very very very very very long comment
36 | 	} 
37 | 	}
38 | 
39 | 	* Split a long line into multiple lines
40 | 	* (for now, too long comments are not corrected)
41 | 	foreach ii in potato potato cassava maize potato cassava maize potato cassava maize potato cassava maize potato cassava maize potato cassava maize potato cassava maize potato cassava maize { 
42 | 	if something ~= 1 & something != . {
43 | 	do something // some very very very very very very very very very very very very very very very very very very very very very very long comment
44 | 	} 
45 | 	}
46 | 
47 | 	* Add a whitespace before an opening curly bracket "{"
48 | 	if something ~= 1 & something != .{
49 | 	do something
50 | 	} 
51 | 
52 | 	* Remove blank lines before a closing bracket "}"
53 | 	if something ~= 1 & something != .{
54 | 
55 | 		do something
56 | 
57 | 	} 
58 | 
59 | 	* Remove duplicated blank lines
60 | 	if something ~= 1 & something != .{ /* some comment */
61 | 
62 | 
63 | 		do something
64 | 
65 | 
66 | 	} 
67 | 
68 | 	* Forvalues with quietly option
69 | 	qui forv i = 1/`theN' {
70 | 	ivregress 2sls indiv_theta_mean hh_faultdist ///
71 | 			( m_indiv_edu_binary m_edu_fault  = instrument i_d ) ///
72 | 			`fault_controls' `other_controls' `mother_controls' ///
73 | 			if group != `i' ///
74 | 			, cl(village_code)
75 | 			noi noi di "`i'/`theN' done!"
76 | 
77 | 		mat a = r(table)
78 | 		local lower = a[5,2]
79 | 		local upper = a[6,2]
80 | 
81 | 	replace b_alt = _b[m_edu_fault] if group == `i'
82 | 	replace b_min = `lower' if group == `i'
83 | 	replace b_max = `upper' if group == `i'
84 | 	}  
85 | 


--------------------------------------------------------------------------------
/test/bad_corrected.do:
--------------------------------------------------------------------------------
 1 | * Rules =====================
 2 | * Hard tabs should not be used
 3 | * "delimit" should not be used
 4 | * In brackets after "for" or "if", indentation should be used
 5 | * Too long lines should be divided into multiple lines
 6 | * Before an opening curly bracket " {", put a whitespace
 7 | * Remove blank lines before closing brackets
 8 | * Remove duplicated blank lines
 9 | 
10 | * Stata codes to be corrected =================
11 | 
12 | * All hard tabs are replaced with soft tabs (= whitespaces)
13 | 
14 |     * delimit is corrected and three forward slashes will be used instead
15 | 
16 |     foreach something in something something something something something something /// 
17 |         something something {  // some comment
18 |         do something  
19 |     }  
20 | 
21 |     * Add indentation in brackets
22 |     if something {
23 |         do something
24 |         if another == 1 {
25 |             do that
26 |         } 
27 |     }
28 |   
29 |     foreach ii in potato potato cassava maize potato ///
30 |         cassava maize potato cassava maize potato cassava maize ///
31 |         potato cassava maize potato cassava maize potato cassava maize ///
32 |         potato cassava maize {
33 |         if something ~= 1 & something != . {
34 |             do something // some very very very very very very very very very very very very very very very very very very very very very very long comment
35 |         } 
36 |     }
37 | 
38 |     * Split a long line into multiple lines
39 |     * (for now, too long comments are not corrected)
40 |     foreach ii in potato potato cassava maize potato cassava maize ///
41 |         potato cassava maize potato cassava maize potato cassava maize potato ///
42 |         cassava maize potato cassava maize potato cassava maize {
43 |         if something ~= 1 & something != . {
44 |             do something // some very very very very very very very very very very very very very very very very very very very very very very long comment
45 |         } 
46 |     }
47 | 
48 |     * Add a whitespace before an opening curly bracket " {"
49 |     if something ~= 1 & something != . {
50 |         do something
51 |     } 
52 | 
53 |     * Remove blank lines before a closing bracket "}"
54 |     if something ~= 1 & something != . {
55 | 
56 |         do something
57 |     } 
58 | 
59 |     * Remove duplicated blank lines
60 |     if something ~= 1 & something != . { /* some comment */
61 | 
62 |         do something
63 |     } 
64 | 
65 |     * Forvalues with quietly option
66 |     qui forv i = 1/`theN' {
67 |         ivregress 2sls indiv_theta_mean hh_faultdist ///
68 |         ( m_indiv_edu_binary m_edu_fault  = instrument i_d ) ///
69 |         `fault_controls' `other_controls' `mother_controls' ///
70 |         if group != `i' ///
71 |         , cl(village_code)
72 |         noi noi di "`i'/`theN' done!"
73 | 
74 |         mat a = r(table)
75 |         local lower = a[5,2]
76 |         local upper = a[6,2]
77 | 
78 |         replace b_alt = _b[m_edu_fault] if group == `i'
79 |         replace b_min = `lower' if group == `i'
80 |         replace b_max = `upper' if group == `i'
81 |     }  
82 | 
83 | 


--------------------------------------------------------------------------------
/src/stata_linter_utils.py:
--------------------------------------------------------------------------------
  1 | # version 1.02  06apr2023  DIME Analytics dimeanalytics@worldbank.org
  2 | # Import packages ====================
  3 | import re
  4 | import pandas as pd
  5 | import stata_linter_detect as sld
  6 | 
  7 | # functions
  8 | 
  9 | def read_dofile(file, include_comments=False):
 10 | 
 11 |     '''
 12 |     Returns a list of the lines in the dofile
 13 |     Omits comment lines or commented-out code by default
 14 |     '''
 15 | 
 16 |     with open(file, "r") as f:
 17 |         dofile_lines = f.readlines()
 18 | 
 19 |     if include_comments:
 20 |         return dofile_lines
 21 | 
 22 |     dofile_lines2 = []
 23 |     comment_delimiter = 0
 24 | 
 25 |     for line in dofile_lines:
 26 | 
 27 |         comment_delimiter = sld.update_comment_delimiter(comment_delimiter, line)
 28 | 
 29 |         if comment_delimiter == 0:
 30 |             # Removing end-of-line comments
 31 |             filtered_line = re.sub(r"\s*((\/\/)|(\/\*)).*", r"", line)
 32 |             dofile_lines2.append(filtered_line)
 33 | 
 34 |     return dofile_lines2
 35 | 
 36 | def detect_duplicated_blank_line_in_file(file):
 37 | 
 38 |     dofile_lines = read_dofile(file, include_comments=True)
 39 | 
 40 |     for line_index, line in enumerate(dofile_lines):
 41 | 
 42 |         if sld.detect_duplicated_blank_line(line_index, line, dofile_lines):
 43 |             return True
 44 | 
 45 |     return False
 46 | 
 47 | def detect_blank_line_before_curly_close_in_file(file):
 48 | 
 49 |     dofile_lines = read_dofile(file, include_comments=True)
 50 | 
 51 |     for line_index, line in enumerate(dofile_lines):
 52 | 
 53 |         if sld.detect_blank_line_before_curly_close(line_index, line, dofile_lines):
 54 |             return True
 55 | 
 56 |     return False
 57 | 
 58 | def detect_no_space_before_curly_bracket_in_file(file):
 59 | 
 60 |     dofile_lines = read_dofile(file)
 61 | 
 62 |     for line in dofile_lines:
 63 | 
 64 |         if sld.detect_no_space_before_curly_bracket(line):
 65 |             return True
 66 | 
 67 |     return False
 68 | 
 69 | def detect_line_too_long_in_file(file, linemax):
 70 | 
 71 |     dofile_lines = read_dofile(file)
 72 |     linemax = int(linemax)
 73 | 
 74 |     for line in dofile_lines:
 75 | 
 76 |         if sld.detect_line_too_long(line, linemax):
 77 |             return True
 78 | 
 79 |     return False
 80 | 
 81 | def detect_bad_indent_in_file(file, indent, tab_space):
 82 | 
 83 |     dofile_lines = read_dofile(file)
 84 |     indent = int(indent)
 85 |     tab_space = int(tab_space)
 86 | 
 87 |     for line_index, line in enumerate(dofile_lines):
 88 | 
 89 |         if sld.detect_bad_indent(line_index, line, dofile_lines, indent, tab_space):
 90 |             return True
 91 | 
 92 |     return False
 93 | 
 94 | def detect_hard_tab_in_file(file):
 95 | 
 96 |     dofile_lines = read_dofile(file)
 97 | 
 98 |     for line in dofile_lines:
 99 | 
100 |         if sld.detect_hard_tab(line):
101 |             return True
102 | 
103 |     # No hard tabs detected in any line
104 |     return False
105 | 
106 | def detect_delimit_in_file(file):
107 | 
108 |     dofile_lines = read_dofile(file)
109 | 
110 |     for line in dofile_lines:
111 | 
112 |         if sld.detect_delimit(line):
113 |             # whenever the first delimiter is detected, return True
114 |             # and interrupt script
115 |             return True
116 | 
117 |     # if delimiters were never detected, return False
118 |     return False
119 | 


--------------------------------------------------------------------------------
/admin/checklist-submitting-SSC.md:
--------------------------------------------------------------------------------
 1 | # Checklist for submitting new versions to SSC
 2 | 
 3 | *Copy the list below to an issue when starting the process of publishing a new version of stata_linter*
 4 | 
 5 | - [ ] 1. **Merge to *develop*** - Merge all branches with the changes that should be included in the new version first to the `develop` branch.
 6 | - [ ] 2. **Create version branch** - This branch _MUST_ be created from the `master` branch. Name this branch the same as the version number you are about to release. For example, `v1.1`, `v2.32` etc.
 7 | - [ ] 3. **Merge *develop* to the version branch** - Solve all the conflicts in the version branch and then make sure that step 3.1-3.4 are done in the version branch and nowhere else.
 8 | 	- [ ] 3.1 **Test in different operating systems** - This step is not necessary every time, but testing the commands in Stata on each of the PC, Mac and Linux operating systems should be done from time to time. A particularly good time to do this is after writing or editing code that depends on file paths, the console, special settings etc. If small updates are needed, then do them in the _version_ branch, otherwise do them in branches of the `develop` branch, merge those to `develop` and then re-merge `develop` to the version branch and test again.
 9 | 	- [ ] 3.2 **Update version and date** - In the _version_ branch, update the version number and date in all ado-files and all dates in all help files. See section below for details.
10 | 	- [ ] 3.3 **Update version globals** - Update the _version_ado_ local in the file _lint.ado_ and the  _VERSION_ global in _stata_linter_detect.py_ and _stata_linter_correct.py_.
11 | 	- [ ] 3.4 **Update version in .pkg and .toc** - This has nothing to do with SSC but should be kept up to date too. This is for when people install directly through GitHub using `net install`. If any new command has been added, remember to add the files for that command to the `.pkg` file.
12 | 	- [ ] 3.5 **Create a .zip file** - Create a .zip file with the files listed below (ado-files, Python scripts, and help files). If a version update ever includes a new ado-file or Python script necessary to run the linter, include that new file in the .zip too. These files are not allowed to be in a sub-folder in this .zip file. No other files should be in this folder. Make a copy of this file in the archive folder of this package.
13 | - [ ] 4. **Email Prof. Baum** - Email the .zip file created in step 3.5 to **kit.baum@bc.edu**.
14 | 	- [ ] 4.1 - If any commands are added or deleted, make note of that in the email.
15 | 	- [ ] 4.2 - If any of the meta info (title, description, keywords, version or author/contact) has changed then include those updates in your email.
16 | - [ ] 5. **Draft release note** - Go to the [release notes](https://github.com/worldbank/stata-linter/releases) and draft a new release note for the new version. Follow the format from previous releases with links to [issues](https://github.com/worldbank/stata-linter/issues) solved.
17 | - [ ] 6. **Wait for publication confirmation** - Do not proceed past this step until Prof. Baum has confirmed that the new version is uploaded to the servers.
18 | - [ ] 7. **Merge version branch to *master*** - If steps 2 and 3 were done correctly, then there should not be any merge conflicts in this step. Once merged, delete the `version` branch.
19 | - [ ] 8. **Rebase *develop* to *master*** - This step brings edits done in 3 and 3.1, as well as version updates done in 3.2 and 3.3 into the *develop* branch. The same result can be accomplished - although by creating a slightly messier history - by merging *master* into *develop*. Regardless of whether the branches are merged or rebased, if any branches created off *develop* were not included in this version, make sure to rebase them to *develop* afterwards, otherwise there is a big risk of very messy conflicts in the future.
20 | - [ ] 9. **Publish release note** - Once the new version is up on SSC, publish the release note.
21 | - [ ] 10. **Close issues** - When the new version is up, close all the [issues](https://github.com/worldbank/stata-linter/issues) that were solved in the new version.
22 | - [ ] 11. **Send announce email** - If it is a major release (new commands or significant updates to existing commands), send an email to DIME Team to announce the new version.
23 | 
24 | ### Version number and dates in ado-files, Python files in src, and help files.
25 | 
26 | The version number is on the format `number.number` where the first number is incremented if it is a major release. If the first number is incremented the second number is reset to 0. If it is not a major release, then the first number is left unchanged and the second number is incremented.
27 | 
28 | Version number and date in ado-file. Change both version number and date. Make sure that this line is the very first line in the ado-file.
29 | ```
30 | *! version 1.0  06dec2022  DIME Analytics dimeanalytics@worldbank.org
31 | 
32 | 
33 | 	capture program drop lint
34 | 	program lint
35 | ```
36 | 
37 | Date at the top of the help file. Change only the date, there is no version number in the help file.
38 | ```
39 | {smcl}
40 | {* 06 Dec 2022}{...}
41 | {hline}
42 | help for {hi:lint}
43 | {hline}
44 | ```
45 | 


--------------------------------------------------------------------------------
/src/lint.sthlp:
--------------------------------------------------------------------------------
  1 | {smcl}
  2 | {* 06 Apr 2023}{...}
  3 | {hline}
  4 | help for {hi:lint}
  5 | {hline}
  6 | 
  7 | {title:Title}
  8 | 
  9 | {p 4 4 2}
 10 | 
 11 | {cmdab:lint} {hline 2} detects and corrects bad coding practices in Stata do-files following the {browse "https://worldbank.github.io/dime-data-handbook/coding.html#the-dime-analytics-stata-style-guide":DIME Analytics Stata Style Guide}.
 12 | 
 13 | {p 4 4 2}
 14 | For this command to run, you will need Stata version 16 or greater, Python,
 15 |   and the Python package {browse "https://pandas.pydata.org/":Pandas} installed. {break}
 16 | 	To install Python and integrate it with Stata, refer to {browse "https://blog.stata.com/2020/08/18/stata-python-integration-part-1-setting-up-stata-to-use-python/":this page}. {break}
 17 |   To install Python packages, refer to {browse "https://blog.stata.com/2020/09/01/stata-python-integration-part-3-how-to-install-python-packages/":this page}.
 18 | 
 19 | {title:Basic syntax}
 20 | 
 21 | {p 4 6 6}
 22 | {cmdab:lint} "{it:input_file}" [using "{it:output_file}"] , [{it:options}]
 23 | {p_end}
 24 | {break}
 25 | {p 4 4 2} The lint command can be broken into two functionalities:
 26 |       {break}1. {hi:Detection} identifies bad coding practices in a Stata do-file
 27 |       {break}2. {hi:Correction} corrects bad coding practices in a Stata do-file.
 28 | {p_end}
 29 | {break}
 30 | {p 4 4 6} If an {it:output_file} is specified with {opt using},
 31 |   then the linter will apply the {hi:Correction} functionality and will write
 32 |   a new file with corrections.{break}
 33 | 	If not, the command will only apply the {hi:Detection} functionality, returning
 34 |   a report of suggested corrections	and potential issues of the do-file
 35 |   in Stata's Results window.{break}
 36 |   Users should note that not all the bad practices identified in {hi:Detection}
 37 |   can be amended by {hi:Correction}.{p_end}
 38 | 
 39 | {marker opts}{...}
 40 | {synoptset 25}{...}
 41 | {synopthdr:Option}
 42 | {synoptline}
 43 | 
 44 | {synopt :{cmdab:v:erbose}}Report bad practices and issues found on each line of the do-file.{p_end}
 45 | {synopt :{cmdab:nosum:mary}}Suppress summary table of bad practices and potential issues.{p_end}
 46 | {synopt :{cmdab:i:ndent(}{it:integer}{cmd:)}}Number of whitespaces used when checking indentation coding practices (default: 4).{p_end}
 47 | {synopt :{cmdab:s:pace(}{it:integer}{cmd:)}}Number of whitespaces used instead of hard tabs when checking indentation practices (default: same as {it:indent}).{p_end}
 48 | {synopt :{cmdab:l:inemax(}{it:integer}{cmd:)}}Maximum number of characters in a line when checking line extension practices (default: 80).{p_end}
 49 | {synopt :{cmdab:e:xcel(}{it:{help filename}}{cmd:)}}Save an Excel file of line-by-line results.{p_end}
 50 | {synopt :{cmdab:force}}Allow the output file name to be the same as the name of the input file;
 51 |   overwriting the original do-file. {hi:The use of this option is not recommended} because there is
 52 |   a small chance that the corrected do-file created by the command will break something
 53 |   in your code, so you should always keep a backup of it.{p_end}
 54 | {synopt :{cmdab:auto:matic}}Correct all bad coding practices without asking
 55 |   if you want each bad coding practice to be corrected or not.
 56 | 	By default, the command will ask the user about each correction interactively
 57 | 	after producing the summary report.{p_end}
 58 | {synopt :{cmdab:replace}}Overwrite any existing {it:output} file.{p_end}
 59 | 
 60 | {synoptline}
 61 | 
 62 | 
 63 | {title:{it:Detect} functionality: Bad style practices and potential issues detected}
 64 | 
 65 | {pstd}{hi:Use whitespaces instead of hard tabs}
 66 | {break}
 67 | Use whitespaces (usually 2 or 4) instead of hard tabs.
 68 | 
 69 | {pstd}{hi:Avoid abstract index names}
 70 | {break}
 71 | In for-loop statements, index names should describe what the code is looping over.
 72 | For example, avoid writing code like this:
 73 | 
 74 | {pmore}{input:foreach i of varlist cassava maize wheat {  }}
 75 | 
 76 | {pstd}Instead, looping commands should name the index local descriptively:
 77 | 
 78 | {pmore}{input:foreach crop of varlist cassava maize wheat {  }}
 79 | 
 80 | {pstd}{hi:Use proper indentations}
 81 | {break}
 82 | After declaring for-loop statements or if-else statements, add indentation with
 83 | whitespaces (usually 2 or 4) in the lines inside the loop.
 84 | 
 85 | {pstd}{hi:Use indentations after declaring newline symbols (///)}
 86 | {break}
 87 | After a new line statement (///), add indentation (usually 2 or 4 whitespaces).
 88 | 
 89 | {pstd}{hi:Use the "{cmdab:!missing()}" function for conditions with missing values}
 90 | {break}
 91 | For clarity, use {cmdab:!missing(var)} instead of {cmdab:var < .} or {cmdab:var != .}
 92 | 
 93 | {pstd}{hi:Add whitespaces around math symbols ({cmdab:+, =, <, >})}
 94 | {break}
 95 | For better readability, add whitespaces around math symbols.
 96 | For example, do {cmdab:gen a = b + c if d == e} instead of {cmdab:gen a=b+c if d==e}.
 97 | 
 98 | {pstd}{hi:Specify the condition in an "if" statement}
 99 | {break}
100 | Always explicitly specify the condition in the if statement.
101 | For example, declare {cmdab:if var == 1} instead of just using {cmdab:if var}.
102 | 
103 | {pstd}{hi:Do not use "{cmdab:#delimit}", instead use "///" for line breaks}
104 | {break}
105 | More information about the use of line breaks {browse "https://worldbank.github.io/dime-data-handbook/coding.html#line-breaks":here}.
106 | 
107 | {pstd}{hi:Do not use cd to change current folder}
108 | {break}
109 | Use absolute and dynamic file paths. More about this {browse "https://worldbank.github.io/dime-data-handbook/coding.html#writing-file-paths":here}.
110 | 
111 | {pstd}{hi:Use line breaks in long lines}
112 | {break}
113 | For lines that are too long, use {cmdab:///} to divide them into multiple lines.
114 | It is recommended to restrict the number of characters in a line to 80 or less.
115 | 
116 | {pstd}{hi:Use curly brackets for global macros}
117 | {break}
118 | Always use {cmdab:${ }} for global macros.
119 | For example, use {cmdab:${global_name}} instead of {cmdab:$global_name}.
120 | 
121 | {pstd}{hi:Include missing values in condition expressions}
122 | {break}
123 | Condition expressions like {cmdab:var != 0} or {cmdab:var > 0} are evaluated to true for missing values.
124 | Make sure to explicitly take missing values into account by using {cmdab:missing(var)} in expressions.
125 | 
126 | {pstd}{hi:Check if backslashes are not used in file paths}
127 | {break}
128 | Check if backslashes ({cmdab:\}) are not used in file paths.
129 | If you are using them, then replace them with forward slashes ({cmdab:/}).
130 | Users should note that the linter might not distinguish perfectly which uses of
131 | a backslash are file paths. In general, this flag will come up every time a
132 | backslash is used in the same line as a local, global, or the {it:cd} command.
133 | 
134 | {pstd}{hi:Check if tildes (~) are not used for negations}
135 | {break}
136 | If tildes ({cmdab:~}) are used for negations, replace them with bangs ({cmdab:!}).
137 | 
138 | {title:{it:Correct} functionality: coding practices to be corrected}
139 | 
140 | {p 4 4 2}
141 | Users should note that the {it:Correct} feature does not correct all the bad practices detected.
142 | It only corrects the following:
143 | 
144 | {pstd}- Replaces the use of {cmdab:#delimit} with three forward slashes ({cmdab:///}) in each line affected by {cmdab:#delimit}
145 | 
146 | {pstd}- Replaces hard tabs with soft spaces (4 by default). The amount of spaces can be set with the {cmdab:space()} option
147 | 
148 | {pstd}- Indents lines inside curly brackets with 4 spaces by default. The amount of spaces can be set with the {cmdab:indent()} option
149 | 
150 | {pstd}- Breaks long lines into multiple lines. Long lines are considered to have more than 80 characters by default,
151 | but this setting can be changed with the option {cmdab:linemax()}.
152 | Note that lines can only be split in whitespaces that are not inside
153 | parentheses, curly brackets, or double quotes. If a line does not have any
154 | whitespaces, the linter will not be able to break a long line.
155 | 
156 | {pstd}- Adds a whitespace before opening curly brackets, except for globals
157 | 
158 | {pstd}- Removes redundant blank lines after closing curly brackets
159 | 
160 | {pstd}- Removes duplicated blank lines
161 | 
162 | {p 4 4 2}
163 | If the option {cmdab:automatic} is omitted, Stata will prompt the user to confirm that
164 | they want to correct each of these bad practices only in case they are detected.
165 | If none of these are detected, it will show a message saying that none of the
166 | bad practices it can correct were detected.
167 | 
168 | {marker exa}
169 | {title:Examples}
170 | 
171 | {p 4 4 2}
172 | The following examples illustrate the basic usage of {cmd:lint}.
173 | Additional examples can be found at
174 | {browse "https://github.com/worldbank/stata-linter/"}.
175 | 
176 | {pstd}{hi:1. Detecting bad coding practices}
177 | 
178 | {p 4 4 2} The basic usage is to point to a do-file that requires revision as follows:
179 | 
180 |         {com}. lint "test/bad.do"
181 | 
182 | {p 4 4 2} For the detection feature you can use all the options but {it:automatic}, {it:force}, and {it:replace}, which are part of the correction functionality.
183 | 
184 |         Options:
185 | 
186 |         1. Show bad coding practices line-by-line
187 |         {com}. lint "test/bad.do", verbose
188 | 
189 |         2. Remove the summary of bad practices
190 |         {com}. lint "test/bad.do", nosummary
191 | 
192 |         3. Specify the number of whitespaces used for detecting indentation practices (default: 4):
193 |         {com}. lint "test/bad.do", indent(2)
194 | 
195 |         4. Specify the number of whitespaces used instead of hard tabs for detecting indentation practices (default: same value used in {it:indent}):
196 |         {com}. lint "test/bad.do", space(6)
197 | 
198 |         5. Specify the maximum number of characters in a line allowed when detecting line extension (default: 80):
199 |         {com}. lint "test/bad.do", linemax(100)
200 | 
201 |         6. Export to Excel the results of the line by line analysis
202 |         {com}. lint "test/bad.do", excel("test_dir/detect_output.xlsx")
203 | 
204 |         7. You can also use this command to test all the do-files in a folder:
205 |         {com}. lint "test/"
206 | 
207 | {pstd}{hi:2. Correcting bad coding practices}
208 | 
209 | {p 4 4 2} The basic usage of the correction feature requires specifying the input do-file
210 | and the output do-file that will contain the corrections.
211 | If you do not include any options, the linter will ask you to confirm, for each bad practice detected,
212 | whether you want it to be corrected:
213 | 
214 |         1. Basic correction use (the linter will ask what to correct):
215 |         {com}. lint "test/bad.do" using "test/bad_corrected.do"
216 | 
217 |         2. Automatic use (Stata will correct the file automatically):
218 |         {com}. lint "test/bad.do" using "test/bad_corrected.do", automatic
219 | 
220 |         3. Use the same name for the output file (note that this will overwrite the input file, this is not recommended):
221 |         {com}. lint "test/bad.do" using "test/bad.do", automatic force
222 | 
223 |         4. Replace the output file if it already exists
224 |         {com}. lint "test/bad.do" using "test/bad_corrected.do", automatic replace
225 | 
226 | {title:Acknowledgements}
227 | 
228 | {phang}This work is a product of the initial idea and work of Mizuhiro Suzuki.
229 |   Rony Rodriguez Ramirez, Luiza Cardoso de Andrade and Luis Eduardo San Martin also contributed to this command,
230 |   and Kristoffer Bjärkefur and Benjamin B. Daniels provided comments and code reviews.
231 | 
232 | {title:Authors}
233 | 
234 | {phang}This command was developed by DIME Analytics at DIME, The World Bank's department for Development Impact Evaluations.
235 | 
236 | {phang}Please send bug reports, suggestions, and requests for clarifications
237 |   writing "Stata linter" in the subject line to:{break}
238 |   dimeanalytics@worldbank.org
239 | 
240 | {phang}You can also see the code, make comments to the code, see the version
241 | 		 history of the code, and submit additions or edits to the code through {browse "https://github.com/worldbank/stata-linter":the GitHub repository of this package}.{p_end}
242 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # stata_linter - Stata command for do file linter
  2 | 
  3 | ## Installation
  4 | 
  5 | ### Installing published versions of `stata_linter`
  6 | 
  7 | To install `stata_linter`, type `ssc install stata_linter` and restart Stata.
  8 | 
  9 | This will install the most recent published version of `stata_linter`.
 10 | The main version of the code in this repository (the `master` branch) is what
 11 | is published on SSC as well.
 12 | 
 13 | ### Python stand-alone installation
 14 | 
 15 | To install the linter to run directly with Python and not via Stata, clone this repository and then run the following command on your terminal:
 16 | 
 17 | ```bash
 18 | pip install -e src-py/
 19 | ```
 20 | 
 21 | This will also install `pandas` and `openpyxl` if they are not currently installed.
 22 | 
 23 | ## Requirements
 24 | 
 25 | 1. Stata version 16 or higher.
 26 | 2. Python 3 or higher
 27 | 
 28 | For setting up Stata to use Python, refer to [this web page](https://blog.stata.com/2020/08/18/stata-python-integration-part-1-setting-up-stata-to-use-python/).
 29 | `stata_linter` also requires the Python packages `pandas` and `openpyxl`.
 30 | Refer to [this web page](https://blog.stata.com/2020/09/01/stata-python-integration-part-3-how-to-install-python-packages/) to know more about installing Python packages.
 31 | 
 32 | ## Content
 33 | 
 34 | The `stata_linter` package works through the `lint` command.
 35 | `lint` is an opinionated detector that attempts to improve the readability and organization of Stata do files.
 36 | The command is written based on the good coding practices of the Development Impact Evaluation Unit at The World Bank.
 37 | For these standards, refer to [DIME's Stata Coding practices](https://dimewiki.worldbank.org/wiki/Stata_Coding_Practices) and _Appendix: The DIME Analytics Coding Guide_ of [Development Research in Practice](https://worldbank.github.io/dime-data-handbook/).
 38 | 
 39 | The `lint` command can be broken into two functionalities:
 40 | 
 41 | 1. **detection** identifies bad coding practices in one or multiple Stata do-files
 42 | 2. **correction** corrects a few of the bad coding practices detected in a Stata do-file
 43 | 
 44 | > _Disclaimer_: Please note that this command is not guaranteed to correct code without changing results.
 45 | It is strongly recommended that after using this command you check that the results of the do-file have not changed.
 46 | 
 47 | ## Syntax and basic usage
 48 | 
 49 | ```stata
 50 | lint "input_file" using "output_file", options  
 51 | ```
 52 | 
 53 | ### 1. Detection
 54 | 
 55 | To detect bad practices in a do-file you can run the following:
 56 | 
 57 | ```stata
 58 | lint "test/bad.do"
 59 | ```
 60 | 
 61 | and on your Stata console you will get a summary of bad coding practices that were found in your code:
 62 | 
 63 | ```stata
 64 | -------------------------------------------------------------------------------------
 65 | Bad practice                                                          Occurrences                   
 66 | -------------------------------------------------------------------------------------
 67 | Hard tabs used instead of soft tabs:                                  Yes       
 68 | One-letter local name in for-loop:                                    3
 69 | Non-standard indentation in { } code block:                           7
 70 | No indentation on line following ///:                                 1
 71 | Missing whitespaces around operators:                                 0
 72 | Implicit logic in if-condition:                                       1
 73 | Delimiter changed:                                                    1
 74 | Working directory changed:                                            0
 75 | Lines too long:                                                       5
 76 | Global macro reference without { }:                                   0
 77 | Use of . where missing() is appropriate:                              6
 78 | Backslash detected in potential file path:                            0
 79 | Tilde (~) used instead of bang (!) in expression:                     5
 80 | -------------------------------------------------------------------------------------
 81 | ```
 82 | 
 83 | If you want to get the lines where those bad coding practices appear you can use the option `verbose`. For example:
 84 | 
 85 | ```stata
 86 | lint "test/bad.do", verbose
 87 | ```
 88 | 
 89 | Gives the following information before the regular output of the command.
 90 | 
 91 | ```stata
 92 | (line 14): Use 4 white spaces instead of tabs. (This may apply to other lines as well.)
 93 | (line 15): Avoid to use "delimit". For line breaks, use "///" instead.
 94 | (line 17): This line is too long (82 characters). Use "///" for line breaks so that one line has at m
 95 | > ost 80 characters.
 96 | (line 25): After declaring for loop statement or if-else statement, add indentation (4 whitespaces).
 97 | (line 25): Always explicitly specify the condition in the if statement. (For example, declare "if var
 98 | >  == 1" instead of "if var".)
 99 | ...
100 | ```
101 | 
102 | You can also pass a folder path to detect all the bad practices in all the do-files that are in the same folder.
103 | 
104 | ### 2. Correction
105 | 
106 | If you would like to correct bad practices in a do-file you can run the following:
107 | 
108 | ```stata
109 | lint "test/bad.do" using "test/bad_corrected.do"   
110 | ```
111 | 
112 | In this case, the lint command will create a do-file called `bad_corrected.do`.
113 | Stata will ask you if you would like to perform a set of corrections for each bad practice detected, one by one.
114 | You can add the option `automatic` to perform the corrections automatically and skip the manual confirmations.
115 | It is strongly recommended that the output file has a different name from the input file, as the original do-file should be kept as a backup.
116 | 
117 | As a result of this command, a piece of Stata code as the following:
118 | 
119 | ```stata
120 | #delimit ;
121 | 
122 | foreach something in something something something something something something
123 |   something something{ ; // some comment
124 |   do something ;
125 | } ;
126 | 
127 | #delimit cr
128 | 
129 | ```
130 | 
131 | becomes:
132 | 
133 | ```stata
134 | foreach something in something something something something something something ///
135 |   something something {  // some comment
136 |   do something  
137 | }
138 | ```
139 | 
140 | and
141 | 
142 | ```stata
143 | if something ~= 1 & something != . {
144 | do something
145 | if another == 1 {
146 | do that
147 | }
148 | }
149 | ```
150 | 
151 | becomes
152 | 
153 | ```stata
154 | if something ~= 1 & something != . {
155 |   do something
156 |   if another == 1 {
157 |       do that
158 |   }
159 | }
160 | ```
161 | 
162 | ### Other options
163 | 
164 | You can use the following options with the `lint` command:
165 | 
166 | - Options related to the **detection** feature:
167 |   - `verbose`: show all the lines where bad practices appear.
168 |   - `nosummary`: suppress the summary of bad practices.
169 |   - `excel()`: export detection results to Excel.
170 | 
171 | - Options exclusive to the **correction** feature:
172 |   - `automatic`: correct all bad coding practices without asking if you want each bad coding practice detected to be corrected or not.
173 |   - `replace`: replace the existing output file.
174 |   - `force`: allow the output file name to be the same as the name of the input file (not recommended).
175 | 
176 | - Options for **both** features:
177 |   - `indent()`: specify the number of whitespaces used for indentation (default is 4).
178 |   - `linemax()`: maximum number of characters in a line (default: 80)
179 |   - `space()`: number of whitespaces used instead of hard tabs (default: same value as `indent()`).
180 | 
181 | ## Coding practices to be detected
182 | 
183 | - **Use soft tabs (i.e., whitespaces), not hard tabs:**
184 | Use white spaces (usually 2 or 4 whitespaces are used) instead of hard tabs.
185 | You can change this option in the do-file editor preferences.
186 | 
187 | - **Avoid using abstract index names:**
188 | In *for loops*, index names should describe what the code is looping over.
189 | Hence, for example, avoid coding like this:
190 | 
191 |   ```{stata}
192 |   foreach i of var cassava maize wheat { }
193 |   ```
194 | 
195 |   Instead, looping commands should name the index local descriptively:
196 | 
197 |   ```{stata}
198 |   foreach crop of var cassava maize wheat { }
199 |   ```
200 | 
201 | - **Use proper indentations:**
202 | After declaring a for loop statement or if-else statement, add indentation with whitespaces (usually 2 or 4 whitespaces).
203 | 
204 | - **Use indentations after declaring newline symbols `///`:**
205 | After a new line statement `(///)`, add indentation (usually 2 or 4 whitespaces).
206 | 
207 | - **Use `!missing()` function for conditions of missing values:**
208 | For clarity, use `!missing(var)` instead of `var < .` or `var != .`
209 | 
210 | - **Add whitespaces around math symbols (`+`, `=`, `<`, `>`):**
211 | For better readability, add whitespaces around math symbols.
212 | For example, write `gen a = b + c if d == e` instead of `gen a=b+c if d==e`.
213 | 
214 | - **Specify the condition in the if statement:**
215 | Always explicitly specify the condition in the if statement.
216 | For example, declare `if var == 1` instead of `if var`.
217 | 
218 | - **Do not use `delimit`, instead use `///` for line breaks:**
219 | More information about the use of line breaks [here](https://worldbank.github.io/dime-data-handbook/coding.html#line-breaks).
220 | 
221 | - **Do not use the `cd` command to change the current folder:**
222 | Use absolute and dynamic file paths. More about this [here](https://worldbank.github.io/dime-data-handbook/coding.html#writing-file-paths).
223 | 
224 | - **Use line breaks for too long lines:**
225 | For lines that are too long, use `///` for line breaks and divide them into multiple lines.
226 | It is recommended to restrict the number of characters in a line under 80.
227 | Though sometimes this is difficult since, for example, Stata does not allow line
228 | breaks within double quotes, try to follow this rule when possible.
229 | 
230 | - **Use curly brackets for global macros:**
231 | Always use `${ }` for global macros.
232 | For instance, use `${global}` instead of `$global`.
233 | 
234 | - **Include missing values in condition expressions:**
235 | Condition expressions like `var != 0` or `var > 0` are evaluated to true for missing values.
236 | Make sure to explicitly take missing values into account by using `missing()` in expressions.
237 | 
238 | - **Check if backslashes are not used in file paths:**
239 | Check if backslashes `(\)` are not used in file paths.
240 | If you are using them, then replace them with forward slashes `(/)`.
241 | 
242 | - **Check if tildes `(~)` are not used for negations:**
243 | If you are using tildes `(~)` for negations, replace them with the bang symbol `(!)`.
244 | 
245 | ## Coding practices to be corrected
246 | 
247 | The `correction` feature does not correct all the bad practices detected by `detect`.
248 | It only corrects the following:
249 | 
250 | - Replaces the use of `delimit` with three forward slashes (`///`) in each line affected by `delimit`
251 | - Replaces hard tabs with soft spaces (4 by default). The amount of spaces can be set with the `space()` option
252 | - Indents lines inside curly brackets with 4 spaces by default. The amount of spaces can be set with the `indent()` option
253 | - Breaks long lines into two lines. Long lines are considered to have more than 80 characters by default, but this setting can be changed with the option `linemax()`
254 | - Adds a whitespace before opening curly brackets, except for globals
255 | - Removes redundant blank lines after closing curly brackets
256 | - Removes duplicated blank lines
257 | 
258 | If the option `automatic` is omitted, `lint` will prompt the user to confirm that they want to correct each of these bad practices only in case they are detected. If none of these are detected, it will show the message:
259 | 
260 |   ```{stata}
261 |   Nothing to correct.
262 |   The issues lint is able to correct are not present in your dofile.
263 |   No output files were generated.
264 |   ```
265 | 
266 | ## Recommended use
267 | 
268 | To minimize the risk of crashing a do-file, the `correction` feature works based on fewer rules than the `detection` feature.
269 | That is, we can detect more bad coding practices with `lint "input_file"` in comparison to `lint "input_file" using "output_file"`.
270 | Therefore, after writing a do-file, you can first `detect` bad practices to check how many bad coding practices are contained in the do-file and later decide whether you would like to use the correction feature.
271 | 
272 | If there are not too many bad practices, you can go through the lines flagged by the `detection` feature and manually correct them.
273 | This also avoids potential crashes by the `correction` feature.
274 | 
275 | If there are many bad practices detected, you can use the `correction` feature first to correct some of the flagged lines, and then you can `detect` again and `correct` the remaining bad practices manually.
276 | We strongly recommend not overwriting the original input do-file so it can remain as a backup in case `correct` introduces unintended changes in the code.
277 | Additionally, we recommend checking that the results of the do-file are not changed by the correction feature.
278 | 
279 | ## Bug Reports and Feature Requests
280 | 
281 | If you are familiar with GitHub go to the [**Contributions**](https://github.com/worldbank/stata-linter#contributions) section below for advanced instructions.
282 | 
283 | An easy but still very efficient way to provide any feedback on these commands is to create an *issue* in GitHub. You can read *issues* submitted by other users or create a new *issue* in the top menu below [**worldbank**/**stata-linter**](https://github.com/worldbank/stata-linter). If you have an idea for a new command, or a new feature on an existing command, creating an *issue* is a great tool for suggesting that. Please read already existing *issues* to check whether someone else has made the same suggestion or reported the same error before creating a new *issue*.
284 | 
285 | While we have a slight preference for receiving feedback here on GitHub, you are still very welcome to send a regular email with your feedback to [dimeanalytics@worldbank.org](mailto:dimeanalytics@worldbank.org).
286 | 
287 | ## Contributions
288 | 
289 | If you are not familiar with GitHub see the [**Bug reports and feature requests**](https://github.com/worldbank/stata-linter#bug-reports-and-feature-requests) section above for a less technical but still very helpful way to contribute to **stata-linter**.
290 | 
291 | We appreciate contributions directly to the code and will give credit to anyone providing contributions that we merge to the master branch.
292 | If you have any questions on anything in this section, please do not hesitate to email [dimeanalytics@worldbank.org](mailto:dimeanalytics@worldbank.org).
293 | 
294 | The files on the `master` branch are the files most recently released on the SSC server.
295 | README, LICENSE and similar files are updated directly to `master` in between releases.
296 | All the other files are updated in the `develop` branch before being merged into `master`.
297 | Check out the `develop` branch if you want to see what future updates we are currently working on.
298 | 
299 | Please make pull requests to the `master` branch **only** if you wish to contribute to README, LICENSE or similar meta data files.
300 | If you wish to make a contribution to any other file, then please **do not** use the `master` branch.
301 | Instead, please fork this repository from `develop` and make your pull request to that branch.
302 | The `develop` branch includes all minor edits we have made to already published commands since the last release that we will include in the next version released on the SSC server.
303 | 
304 | ## License
305 | 
306 | **stata_linter** is developed under MIT license. See http://adampritchard.mit-license.org/ or see [the `LICENSE` file](https://github.com/worldbank/stata-linter/blob/master/LICENSE) for details.
307 | 
308 | ## Main Contact
309 | 
310 | Luis Eduardo San Martin ([dimeanalytics@worldbank.org](mailto:dimeanalytics@worldbank.org))
311 | 
312 | ## **Authors**
313 | 
314 | This command is developed by DIME Analytics at DIME, The World Bank's department for Development Impact Evaluations.
315 | 
316 | ## About DIME Analytics
317 | 
318 | [DIME](https://www.worldbank.org/en/research/dime) is the World Bank's impact evaluation department. Part of DIME’s mission is to intensify the production of and access to public goods that improve the quantity and quality of global development research, while lowering the costs of doing IE for the entire research community. This Library is developed and maintained by [DIME Analytics](https://www.worldbank.org/en/research/dime/data-and-analytics). DIME Analytics supports quality research processes across the DIME portfolio, offers public trainings, and develops tools for the global community of development researchers.
319 | 
320 | Other DIME Analytics public goods are:
321 | 
322 | - [Development Research in Practice:](https://worldbank.github.io/dime-data-handbook/) the DIME Analytics Data Handbook
323 | - [DIME Wiki:](https://dimewiki.worldbank.org/wiki/Main_Page) a one-stop-shop for impact evaluation resources
324 | - [ietoolkit:](https://github.com/worldbank/ietoolkit) Stata package for impact evaluations
325 | - [iefieldkit:](https://github.com/worldbank/iefieldkit) Stata package for primary data collection
326 | - [Stata Visual Library](https://github.com/worldbank/stata-visual-library)
327 | - [R Econ Visual Library](https://github.com/worldbank/r-econ-visual-library)
328 | - [DIME Research Standards:](https://github.com/worldbank/dime-standards/blob/master/dime-research-standards/) DIME's commitments to best practices
329 | 


--------------------------------------------------------------------------------
/src/lint.ado:
--------------------------------------------------------------------------------
  1 | *! version 1.02  06apr2023  DIME Analytics dimeanalytics@worldbank.org
  2 | 
  3 | capture program drop lint
  4 | 		program 	 lint
  5 | 
  6 |   version 16
  7 | 
  8 |   syntax anything [using/],        	///
  9 | 									/// Options
 10 |     [                   			///
 11 |       Verbose           			///
 12 |       NOSUMmary         			///
 13 |       Indent(string)    			///
 14 |       Linemax(string)   			///
 15 |       Space(string) 				///
 16 |       Correct(string)   			///
 17 |       Excel(string)     			///
 18 |       AUTOmatic         			///
 19 |       replace           			///
 20 |       force            			///
 21 | 	  debug							///
 22 |     ]
 23 | 
 24 | /*******************************************************************************
 25 | ********************************************************************************
 26 | 
 27 | 	PART 1: Prepare inputs
 28 | 
 29 | ********************************************************************************
 30 | *******************************************************************************/
 31 | 
 32 | /*******************************************************************************
 33 | 	Set defaults
 34 | *******************************************************************************/
 35 | 
 36 |   * set indent size = 4 if missing
 37 |   if missing("`indent'")		local indent "4"
 38 | 
 39 |   * set whitespaces for tab (space) = indent size if space is missing
 40 |   if missing("`space'")   		local space "`indent'"
 41 | 
 42 |   * set linemax = 80 if missing
 43 |   if missing("`linemax'")		local linemax "80"
 44 | 
 45 |   * if !missing("`excel'")   cap erase `excel'
 46 |   if !missing("`excel'")		cap rm `excel'
 47 | 
 48 |   * set excel = "" if excel is missing
 49 |   if missing("`excel'")      	local excel ""
 50 | 
 51 |   * set a constant for the suppress option being used
 52 |   local suppress_flag "1"
 53 |   if !missing("`verbose'")    	local suppress_flag "0"
 54 | 
 55 |   * set a constant for the summary option being used
 56 |   local summary_flag "1"
 57 |   if !missing("`nosummary'")  	local summary_flag "0"
 58 | 
 59 |   * In debug mode, print status
 60 |   if !missing("`debug'") 		di "Inputs prepared"
 61 | 
 62 | 
 63 | /*******************************************************************************
 64 | 	Prepare file paths
 65 | *******************************************************************************/
 66 | 
 67 | // Check format of do-file to be linted ----------------------------------------
 68 | 
 69 | 	* File or Folder to be detected
 70 | 	gettoken anything : anything
 71 | 
 72 | 	* Check if main input is a file or a folder
 73 | 	local input =  `"`anything'"'
 74 | 
 75 | 	_testpath "`input'", ext(`"".do", ".ado""') argument(lint's main argument) exists `debug'
 76 | 	local folder =  "`r(folder)'"
 77 |     local file 	 =  "`r(file)'"
 78 | 
 79 | // Check do-file with corrections ----------------------------------------------
 80 | 
 81 | 	if !missing("`using'") {
 82 | 
 83 | 		* Can only be used when linting a do-file
 84 | 		if missing("`file'") {
 85 | 			di as error "{phang}Option [using] cannot be used when linting a directory. To use this option, specify a do-file as lint's main argument.{p_end}"
 86 | 			error 198
 87 | 		}
 88 | 
 89 | 		_testpath "`using'", ext(`"".do", ".ado""') argument(lint's [using] argument) `debug'
 90 | 		local output = "`r(file)'"
 91 | 
 92 | 		* Unless force is used, the output file should have a different name than the input
 93 | 		if missing("`force'") & ("`input'" == "`output'") {
 94 | 			di as error "{phang}It is recommended to use different file names for lint's main argument and its [using] argument. This is because it is slightly possible that the corrected do-file created by the command will break something in your code, and you may want to keep a backup. If you want still to replace the current do-file with the do-file corrected by lint, use the option [force]. {p_end}"
 95 | 			error 198
 96 | 		}
 97 |     }
 98 | 
 99 | // Check Excel with corrections ------------------------------------------------
100 | 
101 | 	if !missing("`excel'") {
102 | 
103 | 		_checkopenpyxlinstall
104 | 
105 | 		_testpath "`excel'", ext(`"".xls", ".xlsx""') argument(lint's [excel] argument) `debug'
106 | 		local excel = "`r(file)'"
107 | 	}
108 | 
109 | // In debug mode, print file paths ---------------------------------------------
110 | 
111 |   if !missing("`debug'") {
112 |   	di "Folder: `folder'"
113 | 	di "File: `file'"
114 | 	di "Excel: `excel'"
115 | 	di "Input: `input'"
116 | 	di "Output: `output'"
117 |   }
118 | 
119 | // Check if python is installed ------------------------------------------------
120 | 
121 | 	_checkpyinstall
122 | 
123 | 	* Check that the Python function is defined
124 | 	qui: findfile stata_linter_detect.py
125 | 	if c(os) == "Windows" {
126 | 		local ado_path = subinstr(r(fn), "\", "/", .)
127 | 	}
128 | 	else {
129 | 		local ado_path = r(fn)
130 | 	}
131 | 
132 | // Check that versions of all auxiliary files are the same ---------------------
133 | 
134 | _checkversions
135 | 
136 | /*******************************************************************************
137 | ********************************************************************************
138 | 
139 | 	PART 2: Execute linter
140 | 
141 | ********************************************************************************
142 | *******************************************************************************/
143 | 
144 | /*******************************************************************************
145 | 	Detect issues
146 | *******************************************************************************/
147 | 
148 |     * Check a single do-file
149 |     if !missing("`file'") {
150 | 
151 | 		if   missing("`using'") {
152 | 			local header header
153 | 		}
154 | 
155 | 		if (!missing("`verbose'") |	(`summary_flag' == 1) | !missing("`excel'") | !missing("`using'")) {
156 | 				local footer footer
157 | 		}
158 | 
159 | 		_detect, ///
160 | 			file("`file'") excel("`excel'") ado_path("`ado_path'") ///
161 | 			indent("`indent'") linemax("`linemax'") space("`space'") ///
162 | 			suppress_flag("`suppress_flag'") summary_flag("`summary_flag'") ///
163 | 			`header' `footer'
164 |     }
165 | 
166 |     * Check all do-files in a folder
167 |     else if !missing("`folder'") {
168 | 
169 |         local files: dir "`folder'" files "*.do"
170 | 
171 |         foreach file of local files {
172 | 
173 | 			_detect, ///
174 | 				file("`folder'/`file'") excel("`excel'") ado_path("`ado_path'") ///
175 | 				indent("`indent'") linemax("`linemax'") space("`space'") ///
176 | 				suppress_flag("`suppress_flag'") summary_flag("`summary_flag'") ///
177 | 				header footer
178 | 		}
179 | 	}
180 | 
181 | 	* In debug mode, print status
182 | 	if !missing("`debug'") noi di "Exiting detect function"
183 | 
184 | /*******************************************************************************
185 | 	Correct issues
186 | *******************************************************************************/
187 | 
188 | 	if !missing("`using'") {
189 | 
190 | 		_correct, ///
191 | 			input("`input'") output("`output'") ///
192 | 			indent("`indent'") space("`space'") linemax("`linemax'") ///
193 | 			`replace' `force' `automatic' `debug'
194 | 
195 | 	}
196 | 
197 | end
198 | 
199 | /*******************************************************************************
200 | ********************************************************************************
201 | 
202 | 	PART 3: Auxiliary functions
203 | 
204 | ********************************************************************************
205 | *******************************************************************************/
206 | 
207 | // Correct ---------------------------------------------------------------------
208 | 
capture program drop 	_correct
		program			_correct

	* Correct the issues found in the do-file [input] and save the result in
	* the do-file [output].
	*
	* input, output         : paths to the do-file to read and the do-file to write
	* indent, space, linemax: settings forwarded to every Python function
	* automatic             : apply every correction without prompting the user
	* force                 : accepted so the caller can forward it, not read here
	* replace, debug        : accepted so the caller can forward them, not read here
	syntax, ///
		input(string) output(string) ///
		indent(string) space(string) linemax(string) ///
		[replace force automatic debug]

	* Locate the Python script holding the correction functions
	qui: findfile stata_linter_correct.py
	if c(os) == "Windows" {
		local ado_path = subinstr(r(fn), "\", "/", .)
	}
	else {
		local ado_path = r(fn)
	}

	* Display a header so this output can be told apart from the detection output
	display as text 	" "
	display as result 	_dup(60) "-"
	display as result 	"Correcting {bf:do-file}"
	display as result	_dup(60) "-"
	display as text 	" "

	* Import relevant python libraries
	python: import sys, os
	python: from sfi import Macro
	python: sys.path.append(os.path.dirname(r"`ado_path'"))
	python: from stata_linter_correct import *
	python: import stata_linter_detect as sld
	python: import stata_linter_utils as slu

	* Check which issues are present in the do-file so only those are corrected.
	* Each result is stored in a local as the string version of the Python value
	python: Macro.setLocal('_delimiter',  str(slu.detect_delimit_in_file(r"`input'")))
	python: Macro.setLocal('_hard_tab',   str(slu.detect_hard_tab_in_file(r"`input'")))
	python: Macro.setLocal('_bad_indent', str(slu.detect_bad_indent_in_file(r"`input'", "`indent'", "`space'")))
	python: Macro.setLocal('_long_lines', str(slu.detect_line_too_long_in_file(r"`input'", "`linemax'")))
	python: Macro.setLocal('_no_space_before_curly', str(slu.detect_no_space_before_curly_bracket_in_file(r"`input'")))
	python: Macro.setLocal('_blank_before_curly', str(slu.detect_blank_line_before_curly_close_in_file(r"`input'")))
	python: Macro.setLocal('_dup_blank_line', str(slu.detect_duplicated_blank_line_in_file(r"`input'")))

	* If no issue was found, the function ends here.
	* Otherwise _correct continues.
	if ("`_delimiter'" == "False" & ///
		"`_hard_tab'" == "False" & ///
		"`_bad_indent'" == "False" & ///
		"`_long_lines'" == "False" & ///
		"`_no_space_before_curly'" == "False" & ///
		"`_blank_before_curly'" == "False" & ///
		"`_dup_blank_line'" == "False") {
		display as result `"{phang}Nothing to correct.{p_end}"'
		display as result `"{phang}The issues lint is able to correct are not present in your dofile.{p_end}"'
		display as result `"{phang}No output files were generated.{p_end}"'
	}
	else {

		* Counter of number of issues being corrected
		local _n_to_correct 0

		* Correct the output file, looping over the Python correction functions
		foreach fun in 	delimit_to_three_forward_slashes ///
						tab_to_space ///
						indent_in_bracket ///
						too_long_line ///
						space_before_curly ///
						remove_blank_lines_before_curly_close ///
						remove_duplicated_blank_lines {

			* If the issue is not present, we continue with the next one
			if ("`_delimiter'" == "False" & "`fun'" == "delimit_to_three_forward_slashes") {
				continue
			}
			else if ("`_hard_tab'" == "False" & "`fun'" == "tab_to_space") {
				continue
			}
			else if ("`_bad_indent'" == "False" & "`fun'" == "indent_in_bracket") {
				continue
			}
			else if ("`_long_lines'" == "False" & "`fun'" == "too_long_line") {
				continue
			}
			else if ("`_no_space_before_curly'" == "False" & "`fun'" == "space_before_curly") {
				continue
			}
			else if ("`_blank_before_curly'" == "False" & "`fun'" == "remove_blank_lines_before_curly_close") {
				continue
			}
			else if ("`_dup_blank_line'" == "False" & "`fun'" == "remove_duplicated_blank_lines") {
				continue
			}

			* Unless option [automatic] is used, ask the user to confirm each correction
			if missing("`automatic'") {

				noi di ""
				global confirmation "" //Reset global

				* Keep asking until the answer is Y, N or BREAK (case insensitive)
				while (upper("${confirmation}") != "Y" & upper("${confirmation}") != "N" & upper("${confirmation}") != "BREAK") {
					if ("${confirmation}" != "") {
						noi di as txt "{pstd} Invalid input. {p_end}"
						noi di as txt "{pstd} Please type {bf:Y} or {bf:N} and hit enter. Type {bf:BREAK} and hit enter to exit. {p_end}"
						noi di ""
					}
					if ("`fun'" == "delimit_to_three_forward_slashes") {
						di as result "{pstd} Avoid using [delimit], use three forward slashes (///) instead. {p_end}"
					}
					else if ("`fun'" == "tab_to_space") {
						di as result "{pstd} Avoid using hard tabs, use soft tabs (white spaces) instead. {p_end}"
					}
					else if ("`fun'" == "indent_in_bracket") {
						di as result "{pstd} Indent commands inside curly brackets. {p_end}"
					}
					else if ("`fun'" == "space_before_curly") {
						di as result "{pstd} Use white space before opening curly brackets. {p_end}"
					}
					else if ("`fun'" == "too_long_line") {
						di as result "{pstd} Limit line length to `linemax' characters. {p_end}"
					}
					else if ("`fun'" == "remove_blank_lines_before_curly_close") {
						di as result "{pstd} Remove redundant blank lines before closing brackets. {p_end}"
					}
					else if ("`fun'" == "remove_duplicated_blank_lines") {
						di as result "{pstd} Remove duplicated blank lines. {p_end}"
					}
					noi di as txt "{pstd} Do you want to correct this? To confirm type {bf:Y} and hit enter, to abort type {bf:N} and hit enter. Type {bf:BREAK} and hit enter to stop the code. See option {help lint:automatic} to not be prompted before creating files. {p_end}", _request(confirmation)
				}

				// Copy user input to local
				local createfile = upper("${confirmation}")

				// If user wrote "BREAK" then exit the code
				if ("`createfile'" == "BREAK") error 1
			}

			// if automatic is used, always run the corresponding function
			else {
				local createfile "Y"
			}

			* If the user was prompted and answered [N], this issue is left untouched
			if ("`createfile'" == "N") {
				noi di as result ""
			}
			* If the answer was [Y], or if option [automatic] was used, run the function
			else if ("`createfile'" == "Y") {

				local _n_to_correct = `_n_to_correct' + 1

				* If this is the first issue to correct, create the output file as a
				* copy of the input. The copy is skipped only when both paths are the
				* same, in which case the do-file is corrected in place. Testing the
				* paths instead of option [force] guarantees that the output file
				* exists and holds the input code before Python reads it
				if `_n_to_correct' == 1 {

					local input_std  = subinstr(`"`input'"', "\", "/", .)
					local output_std = subinstr(`"`output'"', "\", "/", .)

					if (`"`input_std'"' != `"`output_std'"') {
						qui copy "`input'" "`output'", replace
					}
				}

				* Every function reads the output file and overwrites it, so the
				* corrections accumulate
				python: `fun'(r"`output'", r"`output'", "`indent'", "`space'", "`linemax'")
			}
		}

		* Print link to corrected output file if it was created
		if `_n_to_correct' > 0 {
			display as result `"{phang}Corrected do-file saved to {browse "`output'":`output'}.{p_end}"'
		}
	}


end
376 | 
377 | // Detect ----------------------------------------------------------------------
378 | 
capture program drop	_detect
		program			_detect

		* Run the Python detection function on one do-file and print the
		* optional header and footer around its output.
		*
		* file           : path to the do-file to lint
		* ado_path       : path to stata_linter_detect.py, its folder is added to
		*                  the Python search path
		* indent, linemax, space, suppress_flag, summary_flag, excel:
		*                  forwarded unchanged to the Python function
		* header, footer : print the lines before and after the Python output
		syntax , ///
				file(string) ado_path(string) ///
				indent(string) linemax(string) space(string) ///
				suppress_flag(string) summary_flag(string) ///
				[excel(string) header footer]

		* Import relevant python functions
		python: import sys, os
		python: sys.path.append(os.path.dirname(r"`ado_path'"))
		python: from stata_linter_detect import *

		* Stata result header
		if !missing("`header'") {
			di as result ""
			di as result "Linting file: `file'"
			di as result ""
		}

		* Actually run the Python code. The paths are passed as raw strings, as
		* done in _correct, so a backslash in a path is never read as an escape
		python: r = stata_linter_detect_py(r"`file'", "`indent'", "`suppress_flag'", "`summary_flag'", r"`excel'", "`linemax'", "`space'")

		* Stata result footer
		if !missing("`footer'") {

			display as result 	_dup(85) "-"

			* Link to the Excel report only when one was requested
			if "`excel'" != "" {
				display as result 	`"{phang}File {browse "`excel'":`excel'} created.{p_end}"'
			}

			display as result 	`"{phang}For more information about coding guidelines visit the {browse "https://dimewiki.worldbank.org/Stata_Linter":Stata linter wiki.}{p_end}"'
		}



end
418 | 
419 | // File Paths ------------------------------------------------------------------
420 | 
cap program drop _testpath
	program		 _testpath, rclass

	* Classify a path as a folder or a file, validate it and return it.
	*
	* anything : the path to test
	* argument : description of the lint argument being tested, used in messages
	* ext      : list of accepted file extensions, forwarded to _testfile
	* details  : extra text forwarded to _testdirectory
	* exists   : forwarded to _testfile
	*
	* Returns r(folder) when the path is treated as a folder and r(file) when
	* it is treated as a file. The other one is returned empty.
	syntax anything, argument(string) ext(string) [details(string) debug exists]

	if !missing("`debug'") di "Entering subcommand _testpath"

	* Standardize file path
	local path = subinstr(`"`anything'"', "\", "/", .)

	* A path without any period is treated as a folder: test that it exists
	if !regex(`"`path'"', "\.") {
	    _testdirectory 	`path'	, argument(`argument') details(`details') 	   `debug'
		local folder 	`path'
	}

	* A path with a period is treated as a file: test its extension
	else {
	    _testfile  `path'		, argument(`argument') ext(`"`ext'"') `exists' `debug'
		local file `path'
	}

	return local folder "`folder'"
	if !missing("`debug'") di `"Folder: `folder'"'

	return local file 	"`file'"
	if !missing("`debug'") di `"File: `file'"'

	if !missing("`debug'") di "Exiting subcommand _testpath"

end
452 | 
453 | // Test file format ------------------------------------------------------------
454 | 
cap program drop _testfile
	program		 _testfile, rclass

	* Validate the extension of a file path and, on request, that the file exists.
	*
	* anything : the file path, used below as a string expression
	* ext      : list of accepted extensions, used as the arguments of inlist()
	* argument : description of the lint argument being tested, used in the message
	* exists   : when specified, the file must already exist
	syntax anything, ext(string) argument(string) [debug exists]

	if ("`debug'" != "") di "Entering subcommand _testfile"

	* Stop here if the file is required to exist and it does not
	if ("`exists'" != "") confirm file `anything'

	* Number of characters found before the last period of the path
	local stem_length = strlen(`anything') - strpos(strreverse(`anything'), ".")

	* The extension is everything from the last period to the end of the path
	local extension = substr(`anything', `stem_length' + 1, .)

	* Reject any extension that is not in the accepted list
	if !inlist("`extension'", `ext') {
		di as error `"{phang}File `anything' is not a valid input for `argument'. Only the following file extensions are accepted: `ext'.{p_end}"'
		error 198
	}

end
479 | 
480 | // Check if folder exists ------------------------------------------------------
481 | 
cap program drop _testdirectory
    program      _testdirectory

	* Stop with error 601 when a directory does not exist.
	*
	* anything : the directory path, passed as is to the Mata function direxists()
	* argument : description of the lint argument being tested, used in the message
	* details  : optional extra text appended to the error message
	syntax anything, argument(string) [details(string) debug]

	if !missing("`debug'") di "Entering subcommand _testdirectory"

	* Ask Mata whether the directory exists and store the answer in r(dirExist)
	 mata : st_numscalar("r(dirExist)", direxists(`anything'))
	 if `r(dirExist)' == 0  {
	 	noi di as error `"{phang}Directory `anything', used `argument', does not exist. `details'{p_end}"'
		error 601
	 }

end
497 | 
498 | 
499 | // Error checks ----------------------------------------------------------------
500 | 
capture program drop  	_checkpyinstall
		program 		_checkpyinstall

	* Stop with an error when Python or the Python package pandas cannot be
	* found. A non-zero return code is needed here: a bare exit would return
	* control to the caller with return code 0, and the caller would carry on
	* without its dependencies.

	* Check if python is installed
	cap python search
	if _rc {
		noi di as error `"{phang}For this command, Python installation is required. Refer to {browse "https://blog.stata.com/2020/08/18/stata-python-integration-part-1-setting-up-stata-to-use-python/":this page} for how to integrate Python to Stata. {p_end}"'
		error 198
	}

	* Check if pandas package is installed
	cap python which pandas
	if _rc {
		noi di as error `"{phang}For this command to run, the Python package "pandas" needs to be installed. Refer to {browse "https://blog.stata.com/2020/09/01/stata-python-integration-part-3-how-to-install-python-packages/":this page} for how to install Python packages. {p_end}"'
		error 198
	}

end
519 | 
capture program drop  	_checkopenpyxlinstall
		program 		_checkopenpyxlinstall

	* Stop with an error when the Python package openpyxl cannot be found.
	* A non-zero return code is needed here: a bare exit would return control
	* to the caller with return code 0, and the caller would carry on.

	* Check if openpyxl package is installed
	cap python which openpyxl
	if _rc {
		noi di as error `"{phang}For this command to run, the Python package "openpyxl" needs to be installed. Refer to {browse "https://blog.stata.com/2020/09/01/stata-python-integration-part-3-how-to-install-python-packages/":this page} for how to install Python packages. {p_end}"'
		error 198
	}

end
531 | 
532 | // Check that version of lint.ado and Python scripts are the same
533 | 
capture program drop _checkversions
				program			 _checkversions

	* Stop with an error unless the version of this ado-file is the same as
	* the VERSION constant of both Python scripts.

	* IMPORTANT: Every time we have a package update, update the version number here
	* Otherwise we'd be introducing a major bug!
	local version_ado 1.02

	* Copy the versions of the .py files into locals
	python: from sfi import Macro
	python: import stata_linter_detect as sld
	python: import stata_linter_correct as slc
	python: Macro.setLocal('version_detect', sld.VERSION)
	python: Macro.setLocal('version_correct', slc.VERSION)

	* Checking that versions are the same. The same message applies to both
	* scripts, and an explicit return code is passed to error
	foreach version_py in "`version_detect'" "`version_correct'" {
		if ("`version_ado'" != "`version_py'") {
			noi di as error `"{phang}For this command to run, the versions of all its auxiliary files need to be the same. Please update the command to the newest version with: {bf:ssc install stata_linter, replace} , restart Stata, and try again{p_end}"'
			error 198
		}
	}

end
561 | 
562 | ************************************************************* Have a lovely day!
563 | 


--------------------------------------------------------------------------------
/src/stata_linter_correct.py:
--------------------------------------------------------------------------------
  1 | # version 1.02  06apr2023  DIME Analytics dimeanalytics@worldbank.org
  2 | # Import packages ============
  3 | import os
  4 | import re
  5 | import sys
  6 | import stata_linter_detect as sld
  7 | 
  8 | # Version Global
  9 | ## VERY IMPORTANT: Update the version number here every time there's an update
 10 | ## in the package. Otherwise this will cause a major bug
 11 | VERSION = "1.02"
 12 | 
 13 | # Function to update comment delimiter =============
 14 | # (detection works only when comment delimiter == 0)
 15 | def update_comment_delimiter(comment_delimiter, line):
 16 |     '''
 17 |     This function detects if a line is opening a comment section
 18 |     in a Stata dofile. Comment sections are delimited by the
 19 |     charaters "/*" and "*/"
 20 |     '''
 21 |     # if "/*" and "*/" are in the same line, never mind
 22 |     if re.search(r"\/\*.*\*\/", line):
 23 |         comment_delimiter += 0
 24 |     # if "/*" (opening) detected, add 1
 25 |     elif re.search(r"\/\*", line):
 26 |         comment_delimiter += 1
 27 |     # if "*/" (closing) detected, subtract 1
 28 |     elif (re.search(r"\*\/", line) != None) & (comment_delimiter > 0):
 29 |         comment_delimiter -= 1
 30 |     return(comment_delimiter)
 31 | 
 32 | # Functions for auto-correction ===================
 33 | 
 34 | # Convert delimit to three forward slashes -------------------
 35 | def delimit_to_three_forward_slashes(input_file, output_file, indent, tab_space, linemax):
 36 |     output_list = []
 37 |     with open(input_file, "r") as reader:
 38 |         input_lines = reader.readlines()
 39 |         delimit_on = 0
 40 |         comment_delimiter = 0
 41 |         for line_index, line in enumerate(input_lines):
 42 |             # update comment_delimiter
 43 |             comment_delimiter = update_comment_delimiter(comment_delimiter, line)
 44 |             if comment_delimiter > 0:
 45 |                 output_list.append(line)
 46 |             elif comment_delimiter == 0:
 47 |                 # check if "#delimit (something other than cr)" is included in a line
 48 |                 if re.search(r"^#delimit(?! cr)", line.lstrip()):
 49 |                     delimit_on = 1
 50 |                     # store the character used for line breaks (ignoring comments)
 51 |                     # (if not specified, default is ";")
 52 |                     line_split = re.split(r"//", line)[0].strip().split(" ")
 53 |                     if len(line_split) > 1:
 54 |                       delimit_symbol = line_split[1]
 55 |                     else:
 56 |                       delimit_symbol = ";"
 57 |                 # check if "#delimit cr" appears in a line, which means
 58 |                 # the end of delimit function
 59 |                 elif re.search(r"^#delimit cr", line.lstrip()):
 60 |                     delimit_on = 0
 61 |                 # for other lines, if delimit_on = 0, then just use the line, and
 62 |                 # if delimit_on = 1, then add "///" at the end of line but before
 63 |                 # any comments
 64 |                 else:
 65 |                     if delimit_on == 0:
 66 |                         output_list.append(line)
 67 |                     elif delimit_on == 1:
 68 |                         # get any non-comment part of the line and
 69 |                         # strip any redundant whitespaces at the end
 70 |                         line_split_for_comment = re.split(r"//", line)
 71 |                         line_main = line_split_for_comment[0]
 72 |                         if len(line_split_for_comment) > 1:
 73 |                             line_comment = line_split_for_comment[1]
 74 |                         line_main_rstrip = line_main.rstrip()
 75 |                         # if the line is not blank, add appropriate line break commands (///)
 76 |                         if len(line_main_rstrip) > 0:
 77 |                             # if the line does not end with the delimit symbol (such as ";"),
 78 |                             # then that means the command continues to the next line,
 79 |                             # so add a line break
 80 |                             if line_main_rstrip[-1] != delimit_symbol:
 81 |                                 output_line = line_main_rstrip + " ///"
 82 |                             # if the line does end with the delimit symbol, then
 83 |                             # just remove the last symbol in the line
 84 |                             elif line_main_rstrip[-1] == delimit_symbol:
 85 |                                 output_line = line_main_rstrip[:-1]
 86 | 
 87 |                             # replace all the remaining delimit symbols to "\n"
 88 |                             output_line = re.sub(delimit_symbol, "\n", output_line)
 89 | 
 90 |                             # if there is any comment in the line, then
 91 |                             # just append the comment
 92 |                             if len(line_split_for_comment) > 1:
 93 |                                 output_line = output_line + " //" + line_comment
 94 |                             # if there is no comment in the line, then
 95 |                             # just add a newline command (\n) at the end
 96 |                             elif len(line_split_for_comment) == 1:
 97 |                                 output_line = output_line + " \n"
 98 | 
 99 |                             output_list.append(output_line)
100 | 
101 |                         # if the line is blank, just append the blank line
102 |                         elif len(line_main_rstrip) == 0:
103 |                             output_list.append(line)
104 | 
105 |     with open(output_file, "w") as writer:
106 |         for output_line in output_list:
107 |             writer.write(output_line)
108 | 
109 | 
110 | # Convert hard tabs to soft tabs (= whitespaces) ----------------------
def tab_to_space(input_file, output_file, indent, tab_space, linemax):
    """Replace the hard tabs that start a line with white spaces.

    Only the first run of tabs in the leading whitespace is converted: any
    spaces before it are kept and every tab in the run becomes `tab_space`
    spaces. Tabs found after the first non-blank character are left alone.

    Args:
        input_file: Path of the do-file to read.
        output_file: Path of the do-file to write (may equal `input_file`).
        indent: Unused; kept so all correction functions share one signature.
        tab_space: Number of white spaces that replace one tab (int or
            numeric string).
        linemax: Unused; kept so all correction functions share one signature.
    """
    spaces = " " * int(tab_space)
    # Matching only the leading whitespace also covers a last line made of
    # tabs alone, which has no character after the tabs
    leading_tabs = re.compile(r"( *)(\t+)")

    with open(input_file, "r") as reader:
        input_lines = reader.readlines()

    output_list = []
    for line in input_lines:
        match = leading_tabs.match(line)
        if match:
            line = (
                match.group(1)
                + match.group(2).replace("\t", spaces)
                + line[match.end():]
            )
        output_list.append(line)

    with open(output_file, "w") as writer:
        writer.writelines(output_list)
130 | 
131 | # Use indents in brackets after for and while loops or if/else conditions --------------------
def indent_in_bracket(input_file, output_file, indent, tab_space, linemax):
    """Indent the lines found inside the curly brackets of loops and conditions.

    A block starts at a line that begins with a command that can take curly
    brackets (foreach, while, forvalues, if, else, capture, optionally after
    quietly) and whose statement ends with "{". It ends at the next unmatched
    line ending with "}". Every non-blank line between the two is re-indented
    to the indentation of the starting line plus `indent` spaces; outer
    blocks are processed before the blocks nested in them. Lines inside
    "/* */" comment sections are not searched for brackets.

    Brackets opened by any other command are not tracked, so their closing
    bracket is paired with the innermost tracked block, if there is one.

    Args:
        input_file: Path of the do-file to read.
        output_file: Path of the do-file to write (may equal `input_file`).
        indent: Number of spaces of one indentation level (int or numeric
            string).
        tab_space: Unused; kept so all correction functions share one signature.
        linemax: Unused; kept so all correction functions share one signature.
    """
    with open(input_file, "r") as reader:
        input_lines = reader.readlines()

    block_command = re.compile(r"^(qui[a-z]*\s+)?(foreach |while |forv|if |else |cap)")
    open_blocks = []   # first lines of the blocks that are still open
    bracket_pair = []  # (first line, closing line, nest level) of closed blocks
    max_nest_level = 0
    comment_delimiter = 0

    for line_index, line in enumerate(input_lines):
        comment_delimiter = update_comment_delimiter(comment_delimiter, line)
        if comment_delimiter != 0:
            continue
        # Main command of the line: everything from the first "//" or "/*"
        # on is a comment and is dropped, together with trailing blanks
        code = re.sub(r"(\/\/|\/\*).*", "", line).rstrip()
        if not code:
            continue

        # A line is a continuation when the PREVIOUS line holds "///"; the
        # first line of the file has no previous line
        is_continuation = line_index > 0 and "///" in input_lines[line_index - 1]
        if block_command.search(line.lstrip()) and not is_continuation:
            opens_block = code.endswith("{") or (
                "///" in line and _statement_ends_with_bracket(input_lines, line_index)
            )
            if opens_block:
                open_blocks.append(line_index)
                max_nest_level = max(max_nest_level, len(open_blocks))

        # A closing curly bracket ends the innermost open block, unless the
        # bracket belongs to a global macro. A bracket with no open block
        # (opened by a command that is not tracked) is ignored
        if code.endswith("}") and not re.search(r"\$.?{", line) and open_blocks:
            nest_level = len(open_blocks)
            bracket_pair.append((open_blocks.pop(), line_index, nest_level))

    # Indent from the outermost blocks inwards, so that a nested block is
    # measured against its already corrected parent
    indent_width = int(indent)
    for nest_level in range(1, max_nest_level + 1):
        for first_line, last_line, pair_level in bracket_pair:
            if pair_level != nest_level:
                continue
            header = input_lines[first_line]
            start_indent = len(header) - len(header.lstrip())
            for j in range(first_line + 1, last_line):
                content = input_lines[j].lstrip()
                # blank lines are left untouched
                if content:
                    input_lines[j] = " " * (start_indent + indent_width) + content

    with open(output_file, "w") as writer:
        writer.writelines(input_lines)


def _statement_ends_with_bracket(lines, start):
    """Return True if the statement starting at lines[start] ends with "{".

    The statement is followed through its "///" line breaks only, so a
    bracket that belongs to a later command is never picked up.
    """
    for candidate in lines[start:]:
        code = re.sub(r"(\/\/|\/\*).*", "", candidate).rstrip()
        if code.endswith("{"):
            return True
        if "///" not in candidate:
            # the statement ended without opening a block
            return False
    return False
199 | 
200 | # Split too long line (> linemax characters) to multiple lines
201 | # (but do not break strings in double quotes (""), parentheses, or curly brackets) --------------------
def _find_break_positions(line_main, line_indent, indent, linemax):
    """Return the indices of ``line_main`` at which the command should be split.

    The command text (leading whitespace skipped) is scanned character by
    character while ``width`` tracks the estimated length of the output line
    currently being filled. Spaces and commas located outside double quotes,
    parentheses and curly brackets are recorded as candidate break points.
    Whenever the estimated width reaches ``linemax``, the most recent candidate
    becomes an actual break.

    Parameters:
        line_main: command part of the line (comment already removed).
        line_indent: number of leading whitespace characters of the line.
        indent: number of spaces used for one indentation level (int).
        linemax: maximum number of characters allowed per line (int).

    Returns:
        A list of indices into ``line_main``; may be empty.
    """
    stripped = line_main.lstrip()
    offset = len(line_main) - len(stripped)

    break_line = []
    potential_break_line = []
    double_quote_count = 0
    parenthesis_count = 0
    curly_count = 0
    width = 0

    for j, c in enumerate(stripped):
        position = j + offset

        # keep track of whether we are inside quotes / parentheses / brackets
        if c == '"':
            double_quote_count = 1 - double_quote_count
        elif c == "(":
            parenthesis_count += 1
        elif c == ")":
            parenthesis_count -= 1
        elif c == "{":
            curly_count += 1
        elif c == "}":
            curly_count -= 1

        breakable = (
            c in (",", " ")
            and double_quote_count == 0
            and parenthesis_count == 0
            and curly_count == 0
        )

        # a space is itself a valid break point for the current check
        if breakable and c == " ":
            potential_break_line.append(position)

        # margin added to the running width before comparing with linemax
        # (presumably room for the " ///" continuation marker; one more
        # character is reserved when the current character is a comma)
        margin = 5 if (breakable and c == ",") else 4

        # Only break when a candidate exists: without this guard a long token
        # at the start of the command raised IndexError.
        if line_indent + width + margin >= linemax and potential_break_line:
            break_line.append(potential_break_line[-1])
            width = indent + position - potential_break_line[-1]
        else:
            width += 1

        # the position right after a comma becomes a candidate for later checks
        if breakable and c == ",":
            potential_break_line.append(position + 1)

    return break_line


def too_long_line(input_file, output_file, indent, tab_space, linemax):
    """Split lines longer than ``linemax`` characters using "///" continuations.

    Lines are never broken inside double quotes, parentheses or curly
    brackets. Lines for which ``update_comment_delimiter`` reports a positive
    state, lines starting with "*" or "//", and lines that are short enough are
    copied unchanged.

    Parameters:
        input_file: path of the do-file to read.
        output_file: path of the file to write the corrected lines to.
        indent: number of spaces per indentation level (converted with int()).
        tab_space: accepted for a uniform signature; not used here.
        linemax: maximum number of characters per line (converted with int()).

    Returns:
        None. The result is written to ``output_file``.
    """
    indent = int(indent)
    linemax = int(linemax)

    with open(input_file, "r") as reader:
        input_lines = reader.readlines()

    output_list = []
    newline_flag = 0
    comment_delimiter = 0

    for line in input_lines:
        # update comment_delimiter
        comment_delimiter = update_comment_delimiter(comment_delimiter, line)
        if comment_delimiter > 0:
            output_list.append(line)
        elif comment_delimiter == 0:
            stripped_line = line.lstrip()
            if (
                len(line) <= linemax            # the line is not too long, or
                or stripped_line.startswith("*")    # the line is a comment, or
                or stripped_line.startswith("//")   # the line is a comment
            ):
                output_list.append(line)
            else:
                # Separate the command part from the comment part. Split only
                # on the first "//" so that a comment which itself contains
                # "//" (and the trailing newline) is preserved in full.
                line_split_for_comment = line.split("//", 1)
                has_comment = len(line_split_for_comment) > 1
                line_comment = line_split_for_comment[1] if has_comment else ""

                line_main = line_split_for_comment[0]
                if "\n" in line_main:
                    line_main = line_main.rstrip() + "\n"
                else:
                    line_main = line_main.rstrip()

                line_indent = (
                    len(line_main.rstrip()) -
                    len(line_main.rstrip().expandtabs(indent).lstrip())
                )

                break_line = _find_break_positions(
                    line_main, line_indent, indent, linemax
                )

                # cut the command at the tagged positions
                break_line_index = [0] + break_line + [len(line_main)]
                if len(break_line_index) == 2:
                    # No break needed: drop the trailing newline here because
                    # it is added back explicitly below. (The original test
                    # compared the list itself with 2 and was never true,
                    # which produced a doubled newline.)
                    line_split = [line_main.rstrip()]
                else:
                    line_split = [
                        line_main[start:stop]
                        for start, stop in zip(break_line_index, break_line_index[1:])
                    ]

                if len(line_split) == 1:
                    # single piece: re-emit with its indentation and comment
                    if has_comment:
                        output_list.append(
                            " " * line_indent + line_split[0].lstrip() + " //" + line_comment
                        )
                    else:
                        output_list.append(
                            " " * line_indent + line_split[0].lstrip() + "\n"
                        )
                else:
                    # If the previous line already ended with "///", this line
                    # is assumed to be correctly indented, so continuation
                    # pieces get no extra indentation.
                    if newline_flag == 1:
                        continuation_indent = line_indent
                    else:
                        continuation_indent = line_indent + indent

                    last_piece = len(line_split) - 1
                    for piece_index, piece in enumerate(line_split):
                        if piece_index == 0:
                            new_line = " " * line_indent + piece.lstrip() + " ///\n"
                        elif piece_index < last_piece:
                            new_line = (
                                " " * continuation_indent + piece.lstrip() + " ///\n"
                            )
                        else:
                            new_line = " " * continuation_indent + piece.lstrip()
                            # the original comment goes at the end of the last piece
                            if has_comment:
                                new_line = new_line + " //" + line_comment
                        output_list.append(new_line)

            # remember whether this line ends with a line break ("///");
            # used when processing the next line
            newline_flag = 1 if "///" in line else 0

    with open(output_file, "w") as writer:
        writer.writelines(output_list)
367 | 
368 | # Add a white space before a curly bracket
369 | # (but not if the curly bracket is used for global macro, as in "${}") --------------------
def space_before_curly(input_file, output_file, indent, tab_space, linemax):
    """Insert a space before every "{" that lacks one, except in "${".

    Lines for which ``update_comment_delimiter`` reports a positive state are
    copied unchanged. ``indent``, ``tab_space`` and ``linemax`` are accepted
    but not used by this correction.
    """
    with open(input_file, "r") as reader:
        source_lines = reader.readlines()

    corrected = []
    delimiter_state = 0
    for raw_line in source_lines:
        delimiter_state = update_comment_delimiter(delimiter_state, raw_line)
        if delimiter_state > 0:
            corrected.append(raw_line)
        elif delimiter_state == 0:
            # any character other than a space or "$" directly followed by "{"
            # gets a space inserted in between
            corrected.append(re.sub(r"([^ $]){", r"\1 {", raw_line))

    with open(output_file, "w") as writer:
        writer.writelines(corrected)
388 | 
389 | # Remove blank lines before curly brackets are closed --------------------
def remove_blank_lines_before_curly_close(input_file, output_file, indent, tab_space, linemax):
    """Drop blank lines that directly precede a closing curly bracket.

    A blank line is kept only when the next non-blank line does not end (after
    removing a "//" comment) with "}", or when that line matches the
    global-macro pattern. A blank line with no non-blank line after it is
    dropped. ``indent``, ``tab_space`` and ``linemax`` are not used.
    """
    with open(input_file, "r") as reader:
        source_lines = reader.readlines()

    total = len(source_lines)
    kept = []
    delimiter_state = 0
    for position, current in enumerate(source_lines):
        delimiter_state = update_comment_delimiter(delimiter_state, current)
        if delimiter_state > 0:
            kept.append(current)
            continue
        if delimiter_state != 0:
            continue

        if len(current.strip()) > 0:
            kept.append(current)
            continue

        # blank line: look ahead for the first line that has content
        following = next(
            (
                source_lines[k]
                for k in range(position + 1, total)
                if len(source_lines[k].strip()) > 0
            ),
            None,
        )
        if following is None:
            continue

        # the leading space guarantees that indexing [-1] is always valid
        code_part = " " + re.sub(r"//.*", r"", following).rstrip()
        closes_block = (code_part[-1] == "}") and (not re.search(r"\$.*{", following))
        if not closes_block:
            kept.append(current)

    with open(output_file, "w") as writer:
        writer.writelines(kept)
417 | 
418 | 
419 | # Remove duplicated blank lines --------------------
def remove_duplicated_blank_lines(input_file, output_file, indent, tab_space, linemax):
    """Copy ``input_file`` to ``output_file`` without duplicated blank lines.

    Whether a line counts as a duplicated blank line is decided by
    ``sld.detect_duplicated_blank_line``. Lines for which
    ``update_comment_delimiter`` reports a positive state are always kept.
    ``indent``, ``tab_space`` and ``linemax`` are not used.
    """
    with open(input_file, "r") as reader:
        source_lines = reader.readlines()

    kept = []
    delimiter_state = 0
    for position, current in enumerate(source_lines):
        delimiter_state = update_comment_delimiter(delimiter_state, current)
        if delimiter_state > 0:
            kept.append(current)
        elif delimiter_state == 0 and not sld.detect_duplicated_blank_line(
            position, current, source_lines
        ):
            kept.append(current)

    with open(output_file, "w") as writer:
        writer.writelines(kept)
438 | 


--------------------------------------------------------------------------------
/src/stata_linter_detect.py:
--------------------------------------------------------------------------------
  1 | # version 1.02  06apr2023  DIME Analytics dimeanalytics@worldbank.org
  2 | # Import packages ====================
  3 | import os
  4 | import re
  5 | import sys
  6 | import pandas as pd
  7 | import argparse
  8 | 
  9 | # Version Global
 10 | ## VERY IMPORTANT: Update the version number here every time there's an update
 11 | ## in the package. Otherwise this will cause a major bug
 12 | VERSION = "1.02"
 13 | 
 14 | # simple run entry point
 15 | def run():
 16 |     parser = argparse.ArgumentParser(description='Lint a Stata do-file.')
 17 |     parser.add_argument('filename', metavar='file', type=str, nargs='?',
 18 |                         help='The name of the file to lint.')
 19 |     parser.add_argument('--indent', type=int, nargs='?', default=4,
 20 |                             help="Number of spaces to use for each indentation"
 21 |                             )
 22 |     parser.add_argument('--suppress', action='store_true',
 23 |                             help="Suppress line item printout"
 24 |                             )
 25 |     parser.add_argument('--summary', action='store_true',
 26 |                             help="Print a summary of bad practices detected"
 27 |                             )
 28 |     parser.add_argument('--linemax', type=int, nargs='?', default=80,
 29 |                             help="Maximum number of characters per line"
 30 |                             )
 31 |     parser.add_argument('--excel_output', type=str, nargs='?', default="",
 32 |                             help="If specified, save results to Excel workbook"
 33 |                             )
 34 | 
 35 | 
 36 |     args=parser.parse_args()
 37 |     return stata_linter_detect_py(
 38 |         input_file=args.filename,
 39 |         indent=args.indent,
 40 |         suppress="1" if args.suppress else "0",
 41 |         summary="1" if args.summary else "0",
 42 |         excel=args.excel_output,
 43 |         linemax=args.linemax,
 44 |         tab_space=args.indent
 45 |         )
 46 | 
 47 | # Style ===================
 48 | 
 49 | # Avoid to use abstract index names ----------------
 50 | def abstract_index_name(
 51 |     line_index, line, input_lines, indent,
 52 |     suppress, style_dictionary, excel_output_list,
 53 |     tab_space
 54 |     ):
 55 | 
 56 |     if re.search(r"^(qui[a-z]*\s+)?(foreach|forv)", line.lstrip()):
 57 |         list_of_words = line.split()
 58 |         # get the index used in for loops
 59 |         for word in list_of_words:
 60 |             if re.search(r"^(foreach)", word):
 61 |                 index_in_loop = list_of_words[list_of_words.index(word) + 1]
 62 |                 break
 63 |             elif re.search(r"^(forv)", word):
 64 |                 index_in_loop = list_of_words[list_of_words.index(word) + 1].split("=")[0]
 65 |                 break
 66 |         # warn if the number of characters in the index is just 1
 67 |         if len(set(index_in_loop)) == 1:
 68 |             print_output = (
 69 |                 '''In for loops, index names should describe what the code is looping over. ''' +
 70 |                 '''Do not use an abstract index such as "{:s}".'''.format(index_in_loop)
 71 |                 )
 72 |             if suppress != "1":
 73 |                 print(
 74 |                     '''(line {:d}): '''.format(line_index + 1) +
 75 |                     print_output
 76 |                     )
 77 | 
 78 |             style_dictionary["abstract_index_name"] += 1
 79 |             excel_output_list.append([line_index + 1, "style", print_output])
 80 | 
 81 |     return([style_dictionary, excel_output_list])
 82 | 
 83 | def loop_open(line):
 84 | 
 85 |     '''
 86 |     Detect if a line is opening a loop
 87 |     '''
 88 |     line_rstrip = re.sub(r"((\/\/)|(\/\*)).*", r"", line).rstrip()
 89 |     if len(line_rstrip) > 0:
 90 |         # check if the line includes for-loop, while-loop, or if/else statements
 91 |         if (
 92 |             (re.search(r"^(qui[a-z]*\s+)?(foreach |forv|if |else )", line.lstrip()) != None) &
 93 |             (line_rstrip[-1] == "{")
 94 |             ):
 95 |             return True
 96 |     return False
 97 | 
 98 | 
 99 | def loop_close(line):
100 | 
101 |     '''
102 |     Detects if a line is closing a loop
103 |     '''
104 |     relevant_part = re.split('//', line)[0].rstrip()
105 | 
106 |     if len(relevant_part) > 0:
107 | 
108 |         if relevant_part[-1] =='}':
109 |             return True
110 |         else:
111 |             return False
112 | 
113 |     else:
114 |         return False
115 | 
def bad_indent_in_loop(line, open_loop_line, indent, tab_space):
    """Tell whether ``line`` is indented too little relative to the loop opener.

    Tabs in ``line`` are expanded to ``tab_space`` columns first;
    ``open_loop_line`` is measured as given. A non-blank line is badly
    indented when it has fewer than ``indent`` additional leading whitespace
    characters compared with ``open_loop_line``.
    """
    expanded = line.expandtabs(tab_space)
    opener_margin = len(open_loop_line) - len(open_loop_line.lstrip())
    line_margin = len(expanded) - len(expanded.lstrip())

    too_shallow = (line_margin - opener_margin) < indent
    has_content = len(expanded.strip()) > 0
    return too_shallow and has_content
129 | 
130 | # Use proper indentations in for-loops, while-loops, and if/else statements ----------------
def detect_bad_indent(line_index, line, input_lines, indent, tab_space):
    """Tell whether the block opened by ``line`` contains a badly indented line.

    Only lines belonging directly to this block are examined. Lines inside
    nested blocks are skipped, since they are checked when this function is
    applied to the line opening the nested block. Blank lines and lines
    starting with "*" or "//" are ignored.
    """
    if not loop_open(line):
        return False

    opening_line = line.expandtabs(tab_space)
    nested_depth = 0

    for position in range(line_index + 1, len(input_lines)):
        candidate = input_lines[position]

        # another block starts inside the current one
        if loop_open(candidate):
            nested_depth += 1
            continue

        if loop_close(candidate):
            if nested_depth == 0:
                # this closes the block being examined
                break
            # this closes a nested block
            nested_depth -= 1

        # still inside a nested block: not checked at this level
        if nested_depth > 0:
            continue

        is_blank = len(candidate.strip()) == 0
        is_comment = re.search(r"^(\*|\/\/)", candidate.lstrip()) is not None
        if not is_blank and not is_comment:
            if bad_indent_in_loop(candidate, opening_line, indent, tab_space):
                return True

    # no bad indentation found
    return False
177 | 
def proper_indent(
    line_index, line, input_lines, indent,
    suppress, style_dictionary, excel_output_list,
    tab_space
    ):
    """Record a style warning when ``detect_bad_indent`` flags ``line``.

    On a hit, the warning is printed unless ``suppress`` equals "1", the
    "proper_indent" counter is incremented and a row is appended to
    ``excel_output_list``. Returns ``[style_dictionary, excel_output_list]``.
    """
    if not detect_bad_indent(line_index, line, input_lines, indent, tab_space):
        return [style_dictionary, excel_output_list]

    reported_line = line_index + 1
    message = (
        "After declaring for loop statement or if-else statement, "
        + "add indentation ({:d} whitespaces).".format(indent)
    )

    if suppress != "1":
        print("(line {:d}): ".format(reported_line) + message)

    style_dictionary["proper_indent"] += 1
    excel_output_list.append([reported_line, "style", message])

    return [style_dictionary, excel_output_list]
201 | 
202 | # Use indentations after line breaks (///) ----------------
def indent_after_newline(
    line_index, line, input_lines, indent,
    suppress, style_dictionary, excel_output_list,
    tab_space
    ):
    """Warn when a line continuing a "///" line break is not indented enough.

    When the previous line contains "///", the first line of the chain of
    consecutive "///" lines is located and ``line`` must have at least
    ``indent`` more leading whitespace characters than that first line (tabs
    are expanded to ``tab_space`` columns on both).

    Parameters:
        line_index: zero-based index of ``line`` in ``input_lines``.
        line: the line being checked.
        input_lines: all lines of the do-file.
        indent: required number of additional whitespace characters (int).
        suppress: when equal to "1", the warning is not printed.
        style_dictionary: counter dict; key "indent_after_newline" is incremented.
        excel_output_list: list to which ``[line number, "style", message]`` is appended.
        tab_space: number of columns a tab expands to.

    Returns:
        ``[style_dictionary, excel_output_list]`` (the same objects, mutated).
    """
    # nothing to check on the first line or when the previous line has no "///"
    if line_index == 0 or not re.search(r"\/\/\/", input_lines[line_index - 1]):
        return([style_dictionary, excel_output_list])

    # Walk back to the first line of the "///" chain. The lower bound keeps
    # the index from going negative, which would otherwise wrap around to the
    # end of the file and either pick a wrong line or raise IndexError.
    first_index = line_index - 1
    while first_index > 0 and re.search(r"\/\/\/", input_lines[first_index - 1]):
        first_index -= 1

    first_line = input_lines[first_index].expandtabs(tab_space)
    first_line_indent = len(first_line) - len(first_line.lstrip())

    line_ws = line.expandtabs(tab_space)
    line_left_spaces = len(line_ws) - len(line_ws.lstrip())

    if line_left_spaces - first_line_indent < indent:
        print_output = (
            '''After new line statement ("///"), add indentation ({:d} whitespaces).'''.format(indent)
            )

        if suppress != "1":
            print(
                '''(line {:d}): '''.format(line_index + 1) +
                print_output
                )

        style_dictionary["indent_after_newline"] += 1
        excel_output_list.append([line_index + 1, "style", print_output])

    return([style_dictionary, excel_output_list])
244 | 
245 | # No whitespaces around math symbols ----------------
def no_space_before_symbol(line):
    """Tell whether a math symbol directly follows a name, digit, ")" or "'".

    Only the text before the first "///" is inspected, and text enclosed in
    double quotes is ignored.
    """
    pattern = r"(?:[a-z]|[A-Z]|[0-9]|_|\)|')(?:<|>|=|\+|-|\*|\^)"
    code_part = line.split('///')[0]
    # splitting on '"' puts the unquoted text at the even positions
    unquoted_chunks = code_part.split('"')[::2]
    return any(re.search(pattern, chunk) for chunk in unquoted_chunks)
259 | 
def no_space_after_symbol(line):
    """Tell whether a math symbol is directly followed by an operand.

    A hit is a symbol followed by a letter, "_", "(", "`", "." or the end of
    the inspected text, or any symbol other than "-" followed by a digit.
    Only the text before the first "///" is inspected, and text enclosed in
    double quotes is ignored.
    """
    pattern = r"(?:(?:<|>|=|\+|-|\*|\^)(?:[a-z]|[A-Z]|_|\(|`|\.|$))|(?:(?:<|>|=|\+|\*|\^)(?:[0-9]))"
    code_part = line.split('///')[0]
    # splitting on '"' puts the unquoted text at the even positions
    unquoted_chunks = code_part.split('"')[::2]
    return any(re.search(pattern, chunk) for chunk in unquoted_chunks)
273 | 
def whitespace_symbol(
    line_index, line, input_lines, indent,
    suppress, style_dictionary, excel_output_list,
    tab_space
    ):
    """Record a style warning when a math symbol lacks surrounding whitespace.

    The decision is delegated to ``no_space_before_symbol`` and
    ``no_space_after_symbol``. On a hit, the warning is printed unless
    ``suppress`` equals "1", the "whitespace_symbol" counter is incremented
    and a row is appended to ``excel_output_list``.
    Returns ``[style_dictionary, excel_output_list]``.
    """
    if not (no_space_before_symbol(line) or no_space_after_symbol(line)):
        return [style_dictionary, excel_output_list]

    reported_line = line_index + 1
    message = (
        "Before and after math symbols (>, <, =, +, etc), it is recommended to use whitespaces. "
        + '(For example, do "gen a = b + c" instead of "gen a=b+c".)'
    )

    if suppress != "1":
        print("(line {:d}): ".format(reported_line) + message)

    style_dictionary["whitespace_symbol"] += 1
    excel_output_list.append([reported_line, "style", message])

    return [style_dictionary, excel_output_list]
295 | 
296 | # For missing values "var < ." or "var != ." are used (!missing(var) is recommended) ----------------
def has_condition_missing(line):
    """Tell whether ``line`` compares against missing with "<", "<=", "!=" or "~=".

    The operator may be followed by spaces and must then be followed by a "."
    that is not the start of a decimal number.
    """
    return re.search(r"(<|<=|!=|~=)( )*(\.(?![0-9]))", line) is not None
303 | 
def condition_missing(
    line_index, line, input_lines, indent,
    suppress, style_dictionary, excel_output_list,
    tab_space
    ):
    """Record a style warning when ``has_condition_missing`` flags ``line``.

    On a hit, the warning is printed unless ``suppress`` equals "1", the
    "condition_missing" counter is incremented and a row is appended to
    ``excel_output_list``. Returns ``[style_dictionary, excel_output_list]``.
    """
    if not has_condition_missing(line):
        return [style_dictionary, excel_output_list]

    reported_line = line_index + 1
    message = 'Use "!missing(var)" instead of "var < ." or "var != ." or "var ~= ."'

    if suppress != "1":
        print("(line {:d}): ".format(reported_line) + message)

    style_dictionary["condition_missing"] += 1
    excel_output_list.append([reported_line, "style", message])

    return [style_dictionary, excel_output_list]
324 | 
325 | # Using "#delimit" should be avoided
def detect_delimit(line):
    """Tell whether ``line`` contains "#delimit" not directly followed by " cr"."""
    return re.search(r"#delimit(?! cr)", line) is not None
332 | 
def dont_use_delimit(
    line_index, line, input_lines, indent,
    suppress, style_dictionary, excel_output_list,
    tab_space
    ):
    """Record a style warning when ``detect_delimit`` flags ``line``.

    On a hit, the warning is printed unless ``suppress`` equals "1", the
    "dont_use_delimit" counter is incremented and a row is appended to
    ``excel_output_list``. Returns ``[style_dictionary, excel_output_list]``.
    """
    if not detect_delimit(line):
        return [style_dictionary, excel_output_list]

    reported_line = line_index + 1
    message = 'Avoid to use "delimit". For line breaks, use "///" instead.'

    if suppress != "1":
        print("(line {:d}): ".format(reported_line) + message)

    style_dictionary["dont_use_delimit"] += 1
    excel_output_list.append([reported_line, "style", message])

    return [style_dictionary, excel_output_list]
353 | 
def check_cd(line):
    """Tell whether ``line`` (ignoring leading whitespace) starts with "cd" plus whitespace."""
    return re.search(r"^cd\s", line.lstrip()) is not None
360 | 
361 | # Using "cd" should be avoided
def dont_use_cd(
    line_index, line, input_lines, indent,
    suppress, style_dictionary, excel_output_list,
    tab_space
    ):
    """Record a style warning when ``check_cd`` flags ``line``.

    On a hit, the warning is printed unless ``suppress`` equals "1", the
    "dont_use_cd" counter is incremented and a row is appended to
    ``excel_output_list``. Returns ``[style_dictionary, excel_output_list]``.
    """
    if not check_cd(line):
        return [style_dictionary, excel_output_list]

    reported_line = line_index + 1
    message = 'Do not use "cd" but use absolute and dynamic file paths.'

    if suppress != "1":
        print("(line {:d}): ".format(reported_line) + message)

    style_dictionary["dont_use_cd"] += 1
    excel_output_list.append([reported_line, "style", message])

    return [style_dictionary, excel_output_list]
382 | 
# If a line is too long, it should be broken into multiple lines
def detect_line_too_long(line, linemax):
    """Tell whether ``line`` has more than ``linemax`` characters.

    A single trailing newline character, if present, is not counted.
    """
    visible_text = line[:-1] if line.endswith('\n') else line
    return len(visible_text) > linemax
394 | 
def too_long_line(
    line_index, line, input_lines, indent, linemax,
    suppress, style_dictionary, excel_output_list,
    tab_space
    ):
    """Record a style warning when ``line`` exceeds ``linemax`` characters.

    Detection is delegated to ``detect_line_too_long``. On a hit, the warning
    is printed unless ``suppress`` equals "1", the "too_long_line" counter is
    incremented and a row is appended to ``excel_output_list``.

    Parameters:
        line_index: zero-based index of ``line``; reported as ``line_index + 1``.
        line: the line being checked.
        input_lines, indent, tab_space: accepted for a uniform signature; unused.
        linemax: maximum number of characters per line (int).
        suppress: when equal to "1", the warning is not printed.
        style_dictionary: counter dict; key "too_long_line" is incremented.
        excel_output_list: list to which ``[line number, "style", message]`` is appended.

    Returns:
        ``[style_dictionary, excel_output_list]`` (the same objects, mutated).
    """
    # warn if the line is too long (and line breaks are not used yet)
    if detect_line_too_long(line, linemax):
        # Report the length the same way it is detected, i.e. without the
        # trailing newline; len(line) overstated the count by one.
        visible_length = len(line) - 1 if line.endswith("\n") else len(line)
        print_output = (
            '''This line is too long ({:d} characters). '''.format(visible_length) +
            '''Use "///" for line breaks so that one line has at most {:d} characters.'''.format(linemax)
            )
        if suppress != "1":
            print(
                '''(line {:d}): '''.format(line_index + 1) +
                print_output
                )

        style_dictionary["too_long_line"] += 1
        excel_output_list.append([line_index + 1, "style", print_output])
    return([style_dictionary, excel_output_list])
416 | 
417 | # "if" condition should be explicit
def detect_implicit_if(line):
    """Tell whether ``line`` has an "if"/"else if" whose condition is implicit.

    The condition is the text from the first "if" / "else if" keyword (at the
    start of the line or preceded by whitespace, and followed by whitespace)
    to the end of the line. It counts as explicit when it contains
    "missing(", "inrange(", "inlist(" or one of the characters "=", "<", ">".

    Returns:
        True when an if-condition is present and none of the markers above
        appears in it, False otherwise.
    """
    stripped = line.lstrip()
    search_if = re.search(r"(?:^|\s)(?:if|else if)\s", stripped)
    if search_if is None:
        return False

    # Slice the same string the match was computed on. Slicing the unstripped
    # line shifted the offset by the indentation, so text before the "if"
    # (e.g. the "=" of "gen y = 1 if x") leaked into the condition.
    condition = stripped[search_if.start():]
    return re.search(r"missing\(|inrange\(|inlist\(|=|<|>", condition) is None
434 | 
def explicit_if(
    line_index, line, input_lines, indent,
    suppress, style_dictionary, excel_output_list,
    tab_space
    ):
    """Record a style warning when ``detect_implicit_if`` flags ``line``.

    On a hit, the warning is printed unless ``suppress`` equals "1", the
    "explicit_if" counter is incremented and a row is appended to
    ``excel_output_list``. Returns ``[style_dictionary, excel_output_list]``.
    """
    if not detect_implicit_if(line):
        return [style_dictionary, excel_output_list]

    reported_line = line_index + 1
    message = (
        "Always explicitly specify the condition in the if statement. "
        + '(For example, declare "if var == 1" instead of "if var".) '
    )

    if suppress != "1":
        print("(line {:d}): ".format(reported_line) + message)

    style_dictionary["explicit_if"] += 1
    excel_output_list.append([reported_line, "style", message])

    return [style_dictionary, excel_output_list]
456 | 
457 | # Use parentheses for global macros
def parentheses_for_global_macro(
    line_index, line, input_lines, indent,
    suppress, style_dictionary, excel_output_list,
    tab_space
    ):
    """Record a style warning when ``line`` has "$" directly followed by a letter.

    On a hit, the warning is printed unless ``suppress`` equals "1", the
    "parentheses_for_global_macro" counter is incremented and a row is
    appended to ``excel_output_list``.
    Returns ``[style_dictionary, excel_output_list]``.
    """
    if re.search(r"\$[a-zA-Z]", line) is None:
        return [style_dictionary, excel_output_list]

    reported_line = line_index + 1
    message = 'Always use "${}" for global macros. '

    if suppress != "1":
        print("(line {:d}): ".format(reported_line) + message)

    style_dictionary["parentheses_for_global_macro"] += 1
    excel_output_list.append([reported_line, "style", message])

    return [style_dictionary, excel_output_list]
478 | 
479 | # Check ===================
480 | 
481 | # Ask if missing variables are properly taken into account
def check_missing_expression(line):
    """Return True when *line* compares against ``.`` or calls ``!missing(...)``.

    The comparison part matches ``<``, ``!=`` or ``~=`` followed by optional
    spaces and a dot that is not the start of a decimal number.
    """
    handles_missing = re.search(
        r"(<|!=|~=)( )*(\.(?![0-9]))|!missing\(.+\)", line
    )
    return handles_missing is not None
488 | 
def check_expression(line):
    """Return True when *line* has ``~=``, ``!=``, ``>`` or ``>=`` not aimed at ``.``.

    An operator is ignored when it is followed by optional spaces and a dot
    that is not the start of a decimal number.
    """
    comparison = re.search(r"(~=|!=|>|>=)(?! *\.(?![0-9]))", line)
    return comparison is not None
495 | 
496 | 
def check_missing(
    line_index, line, input_lines, indent,
    suppress, check_dictionary, excel_output_list,
    tab_space
    ):
    """Record a check when *line* has a comparison but no handling of missing values.

    A line is reported when ``check_expression`` is true and
    ``check_missing_expression`` is false. When reported, the "check_missing"
    counter in *check_dictionary* is incremented, a
    ``[line number, "check", message]`` row is appended to
    *excel_output_list*, and the message is printed unless *suppress* is "1".
    Both containers are returned as a two-element list.
    """
    has_comparison = check_expression(line)
    handles_missing = check_missing_expression(line)

    # only comparisons that ignore missing values are worth a question
    if not has_comparison or handles_missing:
        return [check_dictionary, excel_output_list]

    line_number = line_index + 1
    message = (
        'Are you taking missing values into account properly? '
        '(Remember that "a != 0" or "a > 0" include cases where a is missing.)'
    )

    if suppress != "1":
        print("(line {:d}): ".format(line_number) + message)

    check_dictionary["check_missing"] += 1
    excel_output_list.append([line_number, "check", message])
    return [check_dictionary, excel_output_list]
521 | 
522 | # Ask if the user may be using backslashes in file paths
def check_global(line):
    """Return True when *line*, ignoring leading whitespace, starts with ``global`` plus whitespace."""
    unindented = line.lstrip()
    return re.search(r"^global\s", unindented) is not None
529 | 
def check_local(line):
    """Return True when *line*, ignoring leading whitespace, starts with ``local`` plus whitespace."""
    unindented = line.lstrip()
    return re.search(r"^local\s", unindented) is not None
535 | 
def check_backslash(line):
    """Return True when *line* contains at least one backslash character."""
    found = re.search(r"\\", line)
    return found is not None
541 | 
def backslash_in_path(
    line_index, line, input_lines, indent,
    suppress, check_dictionary, excel_output_list,
    tab_space
    ):
    """Record a check when a cd/local/global line contains a backslash.

    Such a backslash suggests it may be used as a file-path separator.
    When reported, the "backslash_in_path" counter in *check_dictionary* is
    incremented, a ``[line number, "check", message]`` row is appended to
    *excel_output_list*, and the message is printed unless *suppress* is "1".
    Both containers are returned as a two-element list.
    """
    # the line is only of interest when it is one of these three commands
    path_context = check_cd(line) | check_local(line) | check_global(line)
    suspicious = path_context & check_backslash(line)

    if not suspicious:
        return [check_dictionary, excel_output_list]

    line_number = line_index + 1
    message = (
        'Are you using backslashes ("\\") for a file path? '
        'If so, use forward slashes ("/") instead.'
    )

    if suppress != "1":
        print("(line {:d}): ".format(line_number) + message)

    check_dictionary["backslash_in_path"] += 1
    excel_output_list.append([line_number, "check", message])
    return [check_dictionary, excel_output_list]
568 | 
def bang_not_tilde(
    line_index, line, input_lines, indent,
    suppress, check_dictionary, excel_output_list,
    tab_space
    ):
    """Record a check when ``~=`` is used against something other than a bare ``.``.

    When reported, the "bang_not_tilde" counter in *check_dictionary* is
    incremented, a ``[line number, "check", message]`` row is appended to
    *excel_output_list*, and the message is printed unless *suppress* is "1".
    Both containers are returned as a two-element list.
    """
    tilde_negation = re.search(r"~=\s*([^\s.]|\.[0-9]+)", line)
    if not tilde_negation:
        return [check_dictionary, excel_output_list]

    line_number = line_index + 1
    message = (
        'Are you using tilde (~) for negation? '
        'If so, for negation, use bang (!) instead of tilde (~).'
    )

    if suppress != "1":
        print("(line {:d}): ".format(line_number) + message)

    check_dictionary["bang_not_tilde"] += 1
    excel_output_list.append([line_number, "check", message])
    return [check_dictionary, excel_output_list]
592 | 
def detect_hard_tab(line):
    """Return True when *line* contains a tab character."""
    tab_found = re.search(r"\t", line)
    return tab_found is not None
599 | 
def detect_no_space_before_curly_bracket(line):
    """Return True when a ``{`` is directly preceded by a character other than a space or ``$``."""
    crowded_bracket = re.search(r"([^ $]){", line)
    return crowded_bracket is not None
606 | 
def detect_blank_line_before_curly_close(line_index, line, dofile_lines):
    """Return True when *line* is blank and the next line ends with a closing ``}``.

    The next line is inspected with any ``//`` comment removed. It does not
    count as a closing bracket when it contains a ``$`` followed later by a
    ``{``. The last line of the do-file always yields False.
    """
    is_blank = not line.strip()
    is_last_line = line_index == len(dofile_lines) - 1
    if not is_blank or is_last_line:
        return False

    following = dofile_lines[line_index + 1]
    # drop a trailing "//" comment before looking at the final character
    following_code = re.sub(r"//.*", r"", following).rstrip()

    ends_with_close = following_code.endswith("}")
    has_global_brace = re.search(r"\$.*{", following) is not None
    return ends_with_close and not has_global_brace
623 | 
def detect_duplicated_blank_line(line_index, line, dofile_lines):
    """Return True when *line* is blank and is followed by another blank line or by nothing.

    A blank line that is the final element of *dofile_lines* is reported as
    well, because a trailing empty line does not show up as an extra element
    in the list of lines.
    """
    if line.strip():
        # non-blank lines are never duplicates
        return False

    next_index = line_index + 1
    if next_index >= len(dofile_lines):
        # blank line with no line after it
        return True

    return not dofile_lines[next_index].strip()
644 | 
645 | # Function to update comment delimiter ======================
646 | # (detection works only when comment delimiter == 0)
def update_comment_delimiter(comment_delimiter, line):
    """Return the updated count of open ``/* ... */`` comments after *line*.

    A line holding both an opening and a later closing delimiter leaves the
    count unchanged. Otherwise an opening delimiter adds one, and a closing
    delimiter subtracts one as long as the count is positive.
    """
    # opened and closed on the same line: nothing changes
    if re.search(r"\/\*.*\*\/", line):
        return comment_delimiter

    # opening delimiter only
    if re.search(r"\/\*", line):
        return comment_delimiter + 1

    # closing delimiter only; never drop below zero
    closes_comment = re.search(r"\*\/", line) is not None
    if closes_comment and comment_delimiter > 0:
        return comment_delimiter - 1

    return comment_delimiter
658 | 
659 | # Run linter program to detect bad coding practices ===================
def _iter_code_lines(input_lines):
    """Yield ``(line_index, line)`` for every line that should be linted.

    Lines starting with ``*`` or ``//`` (after leading whitespace) are
    skipped, as are lines for which the running ``/* ... */`` delimiter
    count is positive.
    """
    comment_delimiter = 0
    for line_index, line in enumerate(input_lines):
        # the delimiter count is updated for every line, skipped or not
        comment_delimiter = update_comment_delimiter(comment_delimiter, line)
        if re.search(r"^(\*|\/\/)", line.lstrip()) is not None:
            continue
        if comment_delimiter > 0:
            continue
        yield line_index, line


def stata_linter_detect_py(
    input_file, indent,
    suppress, summary, excel, linemax,
    tab_space
    ):
    """Run every style rule and check on a do-file and report the findings.

    The file is scanned in three passes so that messages keep their order:
    the hard-tab detection (reported once, on the first offending line),
    then the line-by-line style rules, then the line-by-line checks.

    Parameters
    ----------
    input_file : str
        Path of the do-file to lint.
    indent, linemax, tab_space
        Values convertible with ``int()``; passed on to the individual rules.
    suppress : str
        Per-line messages are printed unless this equals ``"1"``.
    summary : str
        When equal to ``"1"``, a table with the number of occurrences of
        each bad practice is printed.
    excel : str
        Path of an Excel workbook that receives the findings, or ``""`` to
        skip the export. An existing workbook is opened in append mode.

    Returns
    -------
    bool
        True when at least one finding was recorded, False otherwise.

    Raises
    ------
    ValueError
        If *indent*, *linemax* or *tab_space* cannot be converted to int.
    """
    # convert once instead of on every call inside the loops
    indent_width = int(indent)
    line_max = int(linemax)
    tab_width = int(tab_space)

    excel_output_list = []

    # the three passes work on the same content, so read the file only once
    with open(input_file, "r") as f:
        input_lines = f.readlines()

    # style ============
    # Any hard tabs in the do file.
    # Initialised up front so the summary can always be printed, even for an
    # empty file or one whose lines are all inside a /* */ block.
    hard_tab = "No"
    comment_delimiter = 0
    for line_index, line in enumerate(input_lines):
        comment_delimiter = update_comment_delimiter(comment_delimiter, line)
        if comment_delimiter == 0 and detect_hard_tab(line):
            hard_tab = "Yes"
            print_output = (
                '''Use {:d} white spaces instead of tabs. '''.format(indent_width) +
                '''(This may apply to other lines as well.)'''
                )
            excel_output_list.append([line_index + 1, "style", print_output])
            if suppress != "1":
                print(
                    '''(line {:d}): '''.format(line_index + 1) +
                    print_output
                    )
            # only the first hard tab is reported
            break

    # Other line-by-line bad practices
    style_dictionary = {
        "abstract_index_name": 0,
        "proper_indent": 0,
        "indent_after_newline": 0,
        "whitespace_symbol": 0,
        "condition_missing": 0,
        "explicit_if": 0,
        "dont_use_delimit": 0,
        "dont_use_cd": 0,
        "too_long_line": 0,
        "parentheses_for_global_macro": 0
    }

    for line_index, line in _iter_code_lines(input_lines):
        # rules sharing the common signature, in reporting order
        for style_rule in (
            abstract_index_name,
            proper_indent,
            indent_after_newline,
            whitespace_symbol,
            condition_missing,
            explicit_if,
            dont_use_delimit,
            dont_use_cd,
            ):
            style_dictionary, excel_output_list = style_rule(
                line_index, line, input_lines, indent_width,
                suppress, style_dictionary, excel_output_list,
                tab_width
                )
        # too_long_line is the only rule that also takes the line limit
        style_dictionary, excel_output_list = too_long_line(
            line_index, line, input_lines, indent_width, line_max,
            suppress, style_dictionary, excel_output_list,
            tab_width
            )
        style_dictionary, excel_output_list = parentheses_for_global_macro(
            line_index, line, input_lines, indent_width,
            suppress, style_dictionary, excel_output_list,
            tab_width
            )

    # check ============
    check_dictionary = {
        "check_missing": 0,
        "backslash_in_path": 0,
        "bang_not_tilde": 0,
    }

    for line_index, line in _iter_code_lines(input_lines):
        for check_rule in (check_missing, backslash_in_path, bang_not_tilde):
            check_dictionary, excel_output_list = check_rule(
                line_index, line, input_lines, indent_width,
                suppress, check_dictionary, excel_output_list,
                tab_width
                )
    print("")

    if summary == "1":
        rule_line = "-------------------------------------------------------------------------------------"
        print(rule_line)
        print("{:69s} {:30s}".format("Bad practice", "Occurrences"))
        print(rule_line)

        print("{:69s} {:10s}".format("Hard tabs used instead of soft tabs: ", hard_tab))
        summary_rows = (
            ("One-letter local name in for-loop: ", style_dictionary["abstract_index_name"]),
            ("Non-standard indentation in { } code block: ", style_dictionary["proper_indent"]),
            ("No indentation on line following ///: ", style_dictionary["indent_after_newline"]),
            ("Use of . where missing() is appropriate: ", style_dictionary["condition_missing"]),
            ("Missing whitespaces around operators: ", style_dictionary["whitespace_symbol"]),
            ("Implicit logic in if-condition: ", style_dictionary["explicit_if"]),
            ("Delimiter changed: ", style_dictionary["dont_use_delimit"]),
            ("Working directory changed: ", style_dictionary["dont_use_cd"]),
            ("Lines too long: ", style_dictionary["too_long_line"]),
            ("Global macro reference without { }: ", style_dictionary["parentheses_for_global_macro"]),
            ("Potential omission of missing values in expression: ", check_dictionary["check_missing"]),
            ("Backslash detected in potential file path: ", check_dictionary["backslash_in_path"]),
            ("Tilde (~) used instead of bang (!) in expression: ", check_dictionary["bang_not_tilde"]),
        )
        for label, occurrences in summary_rows:
            print("{:60s} {:10d}".format(label, occurrences))

    output_df = pd.DataFrame(excel_output_list)
    if excel != "":
        if output_df.empty:
            # keep the three headers even when there is nothing to report
            output_df = pd.DataFrame(columns = ["Line", "Type", "Problem"])
        output_df.columns = ["Line", "Type", "Problem"]
        sheet_name = os.path.basename(input_file)[:20]
        if os.path.exists(excel):
            # add a sheet to the existing workbook instead of overwriting it
            with pd.ExcelWriter(excel, engine = "openpyxl", mode = "a") as writer:
                output_df.to_excel(writer, index = False, sheet_name = sheet_name)
        else:
            with pd.ExcelWriter(excel) as writer:
                output_df.to_excel(writer, index = False, sheet_name = sheet_name)

    return( not output_df.empty )
839 | 


--------------------------------------------------------------------------------