├── +UnitTest
    └── +tokernizer
    │   ├── FileCheckTests.m
    │   └── TokenizeTests.m
├── README.md
├── Token.m
├── analyze_file.m
├── check.m
├── check_settings.m
├── run_unittests.m
├── testFiles
    ├── MatlabArgumentClass.m
    └── MatlabIndentedClass.m
├── test_MatlabIndentedClass.m
├── test_check.m
└── tokenize_code.m


/+UnitTest/+tokernizer/FileCheckTests.m:
--------------------------------------------------------------------------------
 1 | classdef FileCheckTests < matlab.unittest.TestCase
 2 |     
 3 |     methods(TestClassSetup)
 4 |         % Shared setup for the entire test class: put 'testFiles' on the path
 5 |         function setPathDef(~)
 6 |             addpath('testFiles')
 7 |         end
 8 |     end
 9 |     
10 |     methods(TestClassTeardown)
11 |         % Shared teardown for the entire test class: undo the path change
12 |         function rmPathDef(~)
13 |             rmpath('testFiles')
14 |         end
15 |     end
16 |     
17 |     methods(Test)
18 |         % Test methods: check() must run without warnings on each test file
19 |         
20 |         function testMatlabIndentedClass(testCase)
21 |             % Matlab indentation class test
22 |             H = @() check('MatlabIndentedClass.m');
23 |             testCase.verifyWarningFree(H);
24 |         end
25 | 
26 |         function testMatlabArgumentValidation(testCase)
27 |             % Argument validation test
28 | 
29 |             % Argument validation not supported by versions earlier than 9.7
30 |             % (earlier than R2019b)
31 |             testCase.assumeFalse(verLessThan('matlab', '9.7'))
32 |             H = @() check('MatlabArgumentClass.m');
33 |             testCase.verifyWarningFree(H);
34 |         end
35 | 
36 |     end
37 |     
38 | end


--------------------------------------------------------------------------------
/+UnitTest/+tokernizer/TokenizeTests.m:
--------------------------------------------------------------------------------
  1 | classdef TokenizeTests < matlab.unittest.TestCase
  2 |     %TOKENIZETESTS Tests for tokenize_code
  3 |     
  4 |     methods(Test)
  5 |         function testText(obj)
  6 |             %TESTTEXT Tokenizing a text should not change the content
  7 | 
  8 |             % Read file
  9 |             text = fileread('check.m');
 10 | 
 11 |             % Tokenize code
 12 |             tokens = tokenize_code(text);
 13 | 
 14 |             % Reconstruct text from tokens
 15 |             reconstructed_text = horzcat(tokens.text);
 16 | 
 17 |             % Compare with actual text
 18 |             obj.assertEqual(reconstructed_text, text)
 19 |         end
 20 | 
 21 |         function testDoubleQuote(obj)
 22 |             %TESTDOUBLEQUOTE Tests a double quoted string
 23 |             
 24 |             % Input data for the test
 25 |             input_str = '"test"'; % String: "test"
 26 |             
 27 |             % Construct expected output for comparison
 28 |             expected = Token('string', input_str, 1, 1);
 29 |             
 30 |             % Get actual output
 31 |             actual = tokenize_code(input_str);
 32 |             
 33 |             % Compare actual output with expected output
 34 |             obj.verifyEqual(actual, expected);
 35 |         end
 36 |         
 37 |         function testSoloDoubleQuote(obj)
 38 |             %TESTSOLODOUBLEQUOTE Tests an assignment of a double quoted string
 39 |             
 40 |             % Input data for the test
 41 |             input_str = 'output = "test"'; % String: output = "test"
 42 |             
 43 |             % Construct expected output for comparison
 44 |             expected(1) = Token('identifier', 'output', 1, 1);
 45 |             expected(2) = Token('space', ' ', 1, 7);
 46 |             expected(3) = Token('punctuation', '=', 1, 8);
 47 |             expected(4) = Token('space', ' ', 1, 9);
 48 |             expected(5) = Token('string', '"test"', 1, 10);
 49 |             
 50 |             % Get actual output
 51 |             actual = tokenize_code(input_str);
 52 |             
 53 |             % Compare actual output with expected output
 54 |             obj.verifyEqual(actual, expected);
 55 |         end
 56 |         
 57 |         function testNestedQuote(obj)
 58 |             %TESTNESTEDQUOTE Tests a single quote inside a double quoted string
 59 |             
 60 |             % Input data for the test
 61 |             input_str = '"let''s go"'; % String: "let's go"
 62 |             
 63 |             % Construct expected output for comparison
 64 |             expected = Token('string', input_str, 1, 1);
 65 |             
 66 |             % Get actual output
 67 |             actual = tokenize_code(input_str);
 68 |             
 69 |             % Compare actual output with expected output
 70 |             obj.verifyEqual(actual, expected);
 71 |         end
 72 |         
 73 |         function testNestedQuote2(obj)
 74 |             %TESTNESTEDQUOTE2 Tests double quotes inside a single quoted string
 75 |             
 76 |             % Input data for the test
 77 |             input_str = '''He said, "hi"'''; % String: 'He said, "hi"'
 78 |             
 79 |             % Construct expected output for comparison
 80 |             expected = Token('string', input_str, 1, 1);
 81 |             
 82 |             % Get actual output
 83 |             actual = tokenize_code(input_str);
 84 |             
 85 |             % Compare actual output with expected output
 86 |             obj.verifyEqual(actual, expected);
 87 |         end
 88 | 
 89 |         function testFunctionNames(obj)
 90 |             %TESTFUNCTIONNAMES Function names should be extracted
 91 |             report = analyze_file('', tokenize_code('function foo(); end'));
 92 |             obj.assertEqual(report.name.text, 'foo')
 93 |             
 94 |             report = analyze_file('', tokenize_code('function x = foo(); end'));
 95 |             obj.assertEqual(report.name.text, 'foo')
 96 |             
 97 |             report = analyze_file('', tokenize_code('function [x, y] = foo(); end'));
 98 |             obj.assertEqual(report.name.text, 'foo')
 99 |         end
100 | 
101 |         function testFunctionReturnNames(obj)
102 |             %TESTFUNCTIONRETURNNAMES Function return names should be extracted
103 |             report = analyze_file('', tokenize_code('function foo(); end'));
104 |             obj.assertEmpty(report.returns)
105 |             
106 |             report = analyze_file('', tokenize_code('function x = foo(); end'));
107 |             obj.assertEqual(report.returns(1).text, 'x')
108 |             obj.assertLength(report.returns, 1)
109 |             
110 |             report = analyze_file('', tokenize_code('function [x, y] = foo(); end'));
111 |             obj.assertEqual(report.returns(1).text, 'x')
112 |             obj.assertEqual(report.returns(2).text, 'y')
113 |             obj.assertLength(report.returns, 2)
114 |         end
115 | 
116 |         function testFunctionArguments(obj)
117 |             %TESTFUNCTIONARGUMENTS Function arguments should be extracted
118 |             report = analyze_file('', tokenize_code('function foo(); end'));
119 |             obj.assertEmpty(report.arguments)
120 |             
121 |             report = analyze_file('', tokenize_code('function foo(x); end'));
122 |             obj.assertEqual(report.arguments(1).text, 'x')
123 |             obj.assertLength(report.arguments, 1)
124 |             
125 |             report = analyze_file('', tokenize_code('function foo(x, y); end'));
126 |             obj.assertEqual(report.arguments(1).text, 'x')
127 |             obj.assertEqual(report.arguments(2).text, 'y')
128 |             obj.assertLength(report.arguments, 2)
129 | 
130 |         end
131 | 
132 |         function testOperatorsGeneral(obj)
133 |             %TESTOPERATORSGENERAL Operators should be parsed correctly
134 |             tokens = tokenize_code('a>=-b');
135 |             obj.assertTrue(tokens(2).hasText('>='))
136 |             obj.assertTrue(tokens(3).hasText('-'))
137 |         end
138 | 
139 |         function testOperatorsTranspose(obj)
140 |             %TESTOPERATORSTRANSPOSE Transpose Operators should not be strings
141 |             tokens = tokenize_code('a''');
142 |             obj.assertTrue(tokens(2).isEqual('punctuation', ''''))
143 |             
144 |             tokens = tokenize_code('a.''');
145 |             obj.assertTrue(tokens(2).isEqual('punctuation', '.'''))
146 |             
147 |             tokens = tokenize_code('a''+''a''.''');
148 |             obj.assertTrue(tokens(2).isEqual('punctuation', ''''))
149 |             obj.assertTrue(tokens(4).isEqual('string', '''a'''))
150 |             obj.assertTrue(tokens(5).isEqual('punctuation', '.'''))
151 |         end
152 | 
153 |         function testCommands(obj)
154 |             %TESTCOMMANDS Differentiate commands from expressions
155 |             tokens = tokenize_code('help me please % test');
156 |             obj.assertTrue(tokens(1).isEqual('identifier', 'help'))
157 |             obj.assertTrue(tokens(3).isEqual('string', 'me'))
158 |             obj.assertTrue(tokens(5).isEqual('string', 'please'))
159 |             obj.assertTrue(tokens(7).isEqual('comment', '% test'))
160 |         end
161 | 
162 |         function testEnd(obj)
163 |             %TESTEND Differentiate keyword end from variable end
164 |             tokens = tokenize_code('if a(end); end');
165 |             obj.assertTrue(tokens(5).isEqual('identifier', 'end'))
166 |             obj.assertTrue(tokens(9).isEqual('keyword', 'end'))
167 |         end
168 | 
169 |         function testSimicolon(obj)
170 |             %TESTSEMICOLONS Differentiate semicolons from linebreaks
171 |             tokens = tokenize_code('[1;2];3');
172 |             obj.assertTrue(tokens(3).isEqual('punctuation', ';'))
173 |             obj.assertTrue(tokens(6).isEqual('linebreak', ';'))
174 |         end
175 | 
176 |         function testBlock(obj)
177 |             %TESTBLOCK Identify block comments
178 |             comment = sprintf('%%{ \n foo bar \n %%}');
179 |             tokens = tokenize_code(comment);
180 |             obj.assertLength(tokens, 1)
181 |             obj.assertTrue(tokens.isEqual('comment', comment))
182 |             
183 |             tokens = tokenize_code(sprintf('x\n%s\nx', comment));
184 |             obj.assertLength(tokens, 5)
185 |             obj.assertTrue(tokens(3).isEqual('comment', comment))
186 |         end
187 | 
188 |         function testLinebreak(obj)
189 |             %TESTLINEBREAK Test line breaks
190 | 
191 |             % Line breaks should break lines
192 |             tokens = tokenize_code(',foo bar');
193 |             obj.assertTrue(tokens(1).hasType('linebreak'))
194 |             obj.assertTrue(tokens(4).hasType('string'))
195 |             
196 |             tokens = tokenize_code(';foo bar');
197 |             obj.assertTrue(tokens(1).hasType('linebreak'))
198 |             obj.assertTrue(tokens(4).hasType('string'))
199 | 
200 |             % Line breaks should not break lines within brackets
201 |             tokens = tokenize_code('[a;b];');
202 |             obj.assertTrue(tokens(3).hasType('punctuation'))
203 |             obj.assertTrue(tokens(6).hasType('linebreak'))
204 |             
205 |             tokens = tokenize_code('[a,b],');
206 |             obj.assertTrue(tokens(3).hasType('punctuation'))
207 |             obj.assertTrue(tokens(6).hasType('linebreak'))
208 |         end
209 | 
210 |         function testComment(obj)
211 |             %TESTCOMMENT Test conventional comments in text
212 |             
213 |             % Conventional comments in text
214 |             tokens = tokenize_code('% this is a comment');
215 |             obj.assertLength(tokens, 1)
216 |             obj.assertTrue(tokens(1).hasType('comment'));
217 | 
218 |             tokens = tokenize_code('    % this is a comment');
219 |             obj.assertLength(tokens, 2)
220 |             obj.assertTrue(tokens(1).hasType('space'));
221 |             obj.assertTrue(tokens(2).hasType('comment'));
222 | 
223 |             txt = sprintf('%s\n%s', ...
224 |                 '    % this is a comment', ...
225 |                 '    && ...');
226 |             tokens = tokenize_code(txt);
227 |             obj.assertLength(tokens, 7)
228 |             obj.assertTrue(tokens(1).hasType('space'));
229 |             obj.assertTrue(tokens(2).hasType('comment'));
230 |             obj.assertTrue(tokens(3).hasType('linebreak'));
231 |             obj.assertTrue(tokens(4).hasType('space'));
232 |             obj.assertTrue(tokens(5).hasType('punctuation'));
233 |             obj.assertTrue(tokens(6).hasType('space'));
234 |             obj.assertTrue(tokens(7).hasType('punctuation'));
235 |         end
236 | 
237 |         function testCommentContinuationOperator(obj)
238 |             %TESTCOMMENTCONTINUATIONOPERATOR Test comments that follow continuation operator
239 | 
240 |             % Test comments that follow continuation operator
241 |             tokens = tokenize_code('... % this is a comment');
242 |             obj.assertLength(tokens, 3)
243 |             obj.assertTrue(tokens(1).hasType('punctuation'));
244 |             obj.assertTrue(tokens(2).hasType('space'));
245 |             obj.assertTrue(tokens(3).hasType('comment'));
246 |             
247 |             tokens = tokenize_code('... this is a comment');
248 |             obj.assertLength(tokens, 3)
249 |             obj.assertTrue(tokens(1).hasType('punctuation'));
250 |             obj.assertTrue(tokens(2).hasType('space'));
251 |             obj.assertTrue(tokens(3).hasType('comment'));
252 | 
253 |             tokens = tokenize_code('    ... % this is a comment');
254 |             obj.assertLength(tokens, 4)
255 |             obj.assertTrue(tokens(1).hasType('space'));
256 |             obj.assertTrue(tokens(2).hasType('punctuation'));
257 |             obj.assertTrue(tokens(3).hasType('space'));
258 |             obj.assertTrue(tokens(4).hasType('comment'));
259 |             
260 |             tokens = tokenize_code('....');
261 |             obj.assertLength(tokens, 2)
262 |             obj.assertTrue(tokens(1).hasType('punctuation'));
263 |             obj.assertTrue(tokens(2).hasType('comment'));
264 | 
265 |             tokens = tokenize_code('..., this is a comment');
266 |             obj.assertLength(tokens, 2)
267 |             obj.assertTrue(tokens(1).hasType('punctuation'));
268 |             obj.assertTrue(tokens(2).hasType('comment'));
269 |             
270 |             tokens = tokenize_code('.*...');
271 |             obj.assertLength(tokens, 2)
272 |             obj.assertTrue(tokens(1).hasType('punctuation'));
273 |             obj.assertTrue(tokens(2).hasType('punctuation'));
274 | 
275 |             tokens = tokenize_code('    &&...this is a comment');
276 |             obj.assertLength(tokens, 4)
277 |             obj.assertTrue(tokens(1).hasType('space'));
278 |             obj.assertTrue(tokens(2).hasType('punctuation'));
279 |             obj.assertTrue(tokens(3).hasType('punctuation'));
280 |             obj.assertTrue(tokens(4).hasType('comment'));
281 |             
282 |             tokens = tokenize_code('&... this is a comment');
283 |             obj.assertLength(tokens, 4)
284 |             obj.assertTrue(tokens(1).hasType('punctuation'));
285 |             obj.assertTrue(tokens(2).hasType('punctuation'));
286 |             obj.assertTrue(tokens(3).hasType('space'));
287 |             obj.assertTrue(tokens(4).hasType('comment'));
288 | 
289 |             % Test comments that follow continuation operator with line break
290 |             txt = sprintf('%s\n%s', ...
291 |                 '    |... this is a comment', ...
292 |                 '    ||.... this is a comment');
293 |             tokens = tokenize_code(txt);
294 |             obj.assertLength(tokens, 10)
295 |             obj.assertTrue(tokens(1).hasType('space'));
296 |             obj.assertTrue(tokens(2).hasType('punctuation'));
297 |             obj.assertTrue(tokens(3).hasType('punctuation'));
298 |             obj.assertTrue(tokens(4).hasType('space'));
299 |             obj.assertTrue(tokens(5).hasType('comment'));
300 |             obj.assertTrue(tokens(6).hasType('linebreak'));
301 |             obj.assertTrue(tokens(7).hasType('space'));
302 |             obj.assertTrue(tokens(8).hasType('punctuation'));
303 |             obj.assertTrue(tokens(9).hasType('punctuation'));
304 |             obj.assertTrue(tokens(10).hasType('comment'));
305 | 
306 |             txt = sprintf('%s\n%s\n%s', ...
307 |                 '    % this is a comment', ...
308 |                 '    true||.... this is a comment', ...
309 |                 '    false% this is a comment');
310 |             tokens = tokenize_code(txt);
311 |             obj.assertLength(tokens, 12)
312 |             obj.assertTrue(tokens(1).hasType('space'));
313 |             obj.assertTrue(tokens(2).hasType('comment'));
314 |             obj.assertTrue(tokens(3).hasType('linebreak'));
315 |             obj.assertTrue(tokens(4).hasType('space'));
316 |             obj.assertTrue(tokens(5).hasType('identifier'));
317 |             obj.assertTrue(tokens(6).hasType('punctuation'));
318 |             obj.assertTrue(tokens(7).hasType('punctuation'));
319 |             obj.assertTrue(tokens(8).hasType('comment'));
320 |             obj.assertTrue(tokens(9).hasType('linebreak'));
321 |             obj.assertTrue(tokens(10).hasType('space'));
322 |             obj.assertTrue(tokens(11).hasType('identifier'));
323 |             obj.assertTrue(tokens(12).hasType('comment'));
324 |         end
325 |     end
326 | end


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Matlab Code Analyzer
 2 | ==================
 3 | 
 4 | MATLAB comes with the very important tool MLINT, which can check your code for common defects. Experience shows that these hints can be very helpful for cleaning up MATLAB code, and preventing simple errors. 
 5 | 
 6 | Crucially though, MLINT is not a style checker. That is where this program comes in:
 7 | 
 8 | Say you have some code in `ugly_code.m`. You can analyze this code for problems using one simple command:
 9 | 
10 | ```matlab
11 | check ugly_code.m
12 | ```
13 | 
14 | This might produce a report like this:
15 | 
16 | ```
17 | Code Analysis for ugly_code.m
18 | 
19 |   Required files: ugly_code.m, ugly_toolbox.m
20 |   Required toolboxes: MATLAB, Signal Processing Toolbox
21 | 
22 |   Function ugly_code (Line 1, col 18):
23 | 
24 |     Number of lines: 67 (high)
25 |     Number of function arguments: 2 (good)
26 |     Number of used variables: 5 (good)
27 |     Max level of nesting: 3 (high)
28 |     Code complexity: 6 (good)
29 | 
30 |     Line 1, col 1: too few comments (2 comments for 67 lines of code)
31 |     Line 1, col 10: return argument 'szOut' is very short (used 5 times across 38 lines)
32 |     Line 1, col 18: function argument 'testInput' is not mentioned in the documentation
33 |     Line 15, col 84: very long line
34 |     Line 20, col 22: no spaces after operator ','
35 |     Line 27, col 1: incorrect indentation
36 |     Line 27, col 1: variable 'szOut' is very short (used 5 times across 38 lines)
37 |     Line 27, col 23: variable 'text' shadows a built-in
38 |     Line 27, col 34: Eval should never be used
39 |     Line 39, col 10: no spaces around operator '='
40 | ```
41 | 
42 | A report like this will be printed for every function in the file, for script-files, and for classes. The more serious of these comments will be highlighted in red, whereas less important ones will stay black. Every line number is clickable and opens directly in the editor.
43 | 
44 | Additionally, this comes with a settings file `check_settings.m`, which can change the thresholds on all warnings, and even enable or disable whole categories of warnings entirely.
45 | 
46 | Contributing
47 | ------------
48 | 
49 | While this file works well for our current applications, it is a complex piece of software, and it has not been thoroughly tested yet. If you find a bug, or would like to see a new feature, or would like to contribute a new feature, please feel free to open an issue or pull request.
50 | 
51 | However, this is not my job, and I cannot guarantee an immediate response, or support for every problem. That said, the code is available under the terms of the BSD 3-clause license, so feel free to use it however you like as long as you honor my authorship of it.
52 | 
53 | Also, please bear in mind that all of the warnings generated by this program are just that: Warnings. They are *not* laws. If a slightly longer line improves readability, please *do not* make it shorter just to make the style analyzer happy. Please watch [this video](https://www.youtube.com/watch?v=wf-BqAjZb8M) for some context. 
54 | 
55 | And finally, while this style checker can find many issues, it is by no means perfect. It cannot comment on whether your variable names are good or not, whether your comments are out of date or not, or whether your code makes intuitive sense when reading, or just results in confusion. For more in-depth heuristics on how to improve these aspects of your code, please read the wonderful [MATLAB Style Guidelines 2.0](http://mathworks.com/matlabcentral/fileexchange/46056-matlab-style-guidelines-2-0).
56 | 


--------------------------------------------------------------------------------
/Token.m:
--------------------------------------------------------------------------------
 1 | classdef Token < handle
 2 |     properties
 3 |         type
 4 |         text
 5 |         line
 6 |         col
 7 |     end
 8 | 
 9 |     methods
10 |         function obj = Token(type, text, line, col)
11 |         %TOKEN an atomic piece of source code
12 |         %   Each token references an atomic piece of source code TEXT at a
13 |         %   specific LINE and COL. Each TOKEN is tagged as a certain TYPE.
14 |         %   Returns a new OBJ.
15 | 
16 |             obj.type = type;
17 |             obj.text = text;
18 |             obj.line = line;
19 |             obj.col = col;
20 |         end
21 | 
22 |         function yesNo = hasType(obj, type)
23 |         %HASTYPE checks if OBJ has matching TYPE
24 |         %   YESNO is true if OBJ.type matches TYPE (via strcmp and any).
25 | 
26 |             yesNo = any(strcmp(obj.type, type));
27 |         end
28 | 
29 |         function yesNo = hasText(obj, text)
30 |         %HASTEXT checks if OBJ has matching TEXT
31 |         %   YESNO is true if OBJ.text matches TEXT (via strcmp and any).
32 | 
33 |             yesNo = any(strcmp(obj.text, text));
34 |         end
35 | 
36 |         function yesNo = isEqual(obj, type, text)
37 |         %ISEQUAL checks if OBJ has matching TYPE and TEXT
38 |         %   YESNO is a boolean.
39 | 
40 |             yesNo = obj.hasType(type) && obj.hasText(text);
41 |         end
42 |     end
43 | end
44 | 


--------------------------------------------------------------------------------
/analyze_file.m:
--------------------------------------------------------------------------------
  1 | function blocks = analyze_file(filename, tokenlist)
  2 | %ANALYZE_FILE analyzes TOKENLIST and extracts information about BLOCKS
  3 | %   in FILENAME. TOKENLIST is assumed to be the content of FILENAME.
  4 | %
  5 | %   Returns a struct array with fields:
  6 | %   - name: Token with the function name (FILENAME for scripts/classes)
  7 | %   - body: the tokens that make up the body of the function
  8 | %   - nesting: how deeply is this block nested within other blocks
  9 | %   - children: other blocks nested within this block
 10 | %               (again as a struct array)
 11 | %   - variables: variables defined in this block, or properties if the
 12 | %                block is a class.
 13 | %   - arguments: function arguments of this block (if a function)
 14 | %   - returns: return variable names of this block (if a function)
 15 | %   - type: one of 'Function', 'Nested Function', 'Subfunction',
 16 | %           'Class', or 'Script'.
 17 | %   - filename: the FILENAME.
 18 | 
 19 | % (c) 2016, Bastian Bechtold
 20 | % This code is licensed under the terms of the BSD 3-clause license
 21 | 
 22 |     beginnings = check_settings('beginnings');
 23 | 
 24 |     blocks = struct('name', {}, 'body', {}, 'nesting', {}, ...
 25 |                     'children', {}, 'variables', {}, ...
 26 |                     'arguments', {}, 'returns', {}, ...
 27 |                     'type', {}, 'filename', {});
 28 |     function_stack = struct('start', {}, 'nesting', {}, 'children', {});
 29 |     nesting = 0;
 30 |     is_first_block = true;
 31 |     main_type = '';
 32 |     for current_pos = 1:length(tokenlist)
 33 |         current_token = tokenlist(current_pos);
 34 | 
 35 |         % count the 'end's to figure out function extents:
 36 |         if current_token.isEqual('keyword', beginnings)
 37 |             nesting = nesting + 1;
 38 |         elseif current_token.isEqual('keyword', 'end')
 39 |             nesting = nesting - 1;
 40 |         end
 41 | 
 42 |         % determine file type (Script, Function, or Class):
 43 |         if isempty(main_type) && ...
 44 |            ~current_token.hasType({'linebreak', 'comment'})
 45 |             if current_token.isEqual('keyword', 'function')
 46 |                 main_type = 'Function';
 47 |             elseif current_token.isEqual('keyword', 'classdef')
 48 |                 main_type = 'Class';
 49 |             else
 50 |                 main_type = 'Script';
 51 |             end
 52 |         end
 53 | 
 54 |         % pre-compute intermediate values for better readability:
 55 |         is_end_of_block = current_token.isEqual('keyword', 'end') && ...
 56 |                           ~isempty(function_stack) && ...
 57 |                           nesting == function_stack(end).nesting;
 58 |         is_end_of_function_file = current_pos == length(tokenlist) && ...
 59 |                                   ~isempty(function_stack);
 60 |         is_end_of_other_file = current_pos == length(tokenlist) && ...
 61 |                                any(strcmp(main_type, {'Script' 'Class'}));
 62 | 
 63 |         % build a stack of function definitions:
 64 |         % We don't know where these functions end, yet. As soon as we
 65 |         % know the end, it will get appended to the block list. For
 66 |         % now, only record where the function starts.
 67 |         if current_token.isEqual('keyword', 'function')
 68 |             % include any leading space in the function body, so that
 69 |             % later analysis steps can figure out the initial
 70 |             % indentation of the function:
 71 |             if current_pos > 1 && tokenlist(current_pos-1).hasType('space')
 72 |                 function_start = current_pos - 1;
 73 |             else
 74 |                 function_start = current_pos;
 75 |             end
 76 | 
 77 |             % save the new function on the function stack:
 78 |             stack_frame = struct('start', function_start, ...
 79 |                                  'nesting', nesting-1, ...
 80 |                                  'children', []);
 81 |             function_stack = [function_stack stack_frame]; %#ok
 82 | 
 83 |         elseif is_end_of_block || is_end_of_function_file
 84 |             function_body = ...
 85 |                 tokenlist(function_stack(end).start:current_pos);
 86 | 
 87 |             % determine function type (Top-Level, Nested, or Subfunction):
 88 |             if nesting > 0 && current_pos ~= length(tokenlist)
 89 |                 block_type = 'Nested Function';
 90 |             elseif is_first_block
 91 |                 block_type = main_type;
 92 |                 is_first_block = false;
 93 |             else
 94 |                 block_type = 'Subfunction';
 95 |             end
 96 | 
 97 |             % build block struct:
 98 |             new_block = struct( ...
 99 |                 'name', get_funcname(function_body), ...
100 |                 'body', function_body, ...
101 |                 'nesting', function_stack(end).nesting, ...
102 |                 'children', function_stack(end).children, ...
103 |                 'variables', {get_funcvariables(function_body)}, ...
104 |                 'arguments', {get_funcarguments(function_body)}, ...
105 |                 'returns', {get_funcreturns(function_body)}, ...
106 |                 'type', block_type, 'filename', filename);
107 | 
108 |             % update function stack with new block struct:
109 |             function_stack(end) = [];
110 |             if nesting > 0 && ~isempty(function_stack)
111 |                 if isempty(function_stack(end).children)
112 |                     function_stack(end).children = new_block;
113 |                 else
114 |                     function_stack(end).children = ...
115 |                         [function_stack(end).children new_block];
116 |                 end
117 |             else
118 |                 blocks = [blocks new_block]; %#ok
119 |             end
120 | 
121 |         elseif is_end_of_other_file
122 |             % in classes, variables contains properties:
123 |             if strcmp(main_type, 'Script')
124 |                 variables = {get_variables(tokenlist)};
125 |             else
126 |                 variables = {get_properties(tokenlist)};
127 |             end
128 |             blocks = struct('name', Token('special', filename, 0, 0), ...
129 |                             'body', tokenlist, ...
130 |                             'nesting', 0, ...
131 |                             'children', blocks, ...
132 |                             'variables', variables, ...
133 |                             'arguments', [], ...
134 |                             'returns', [], ...
135 |                             'type', main_type, ...
136 |                             'filename', filename);
137 |         end
138 |     end
139 | end
140 | 
141 | 
142 | function variables = get_properties(tokenlist)
143 | %GET_PROPERTIES collects the property name tokens found in TOKENLIST
144 | %   returns the first identifier after each line break in properties blocks.
145 | 
146 |     variables = Token.empty;
147 |     inside_block = false;  % inside a properties ... end block?
148 |     at_line_start = false; % line break seen, but no identifier yet?
149 |     for idx = 1:length(tokenlist)
150 |         current = tokenlist(idx);
151 | 
152 |         % keep track of where properties blocks begin and end:
153 |         if current.isEqual('keyword', 'properties')
154 |             inside_block = true;
155 |             at_line_start = false;
156 |         elseif inside_block && current.isEqual('keyword', 'end')
157 |             inside_block = false;
158 |         end
159 |         % line breaks arm the capture, the next identifier is recorded:
160 |         if current.hasType('linebreak')
161 |             at_line_start = true;
162 |         elseif current.hasType('identifier') && at_line_start && inside_block
163 |             variables = [variables, current]; %#ok
164 |             at_line_start = false;
165 |         end
166 |     end
167 | end
167 | 
168 | 
169 | function variables = get_funcvariables(tokenlist)
170 | %GET_FUNCVARIABLES finds the VARIABLES assigned after the declaration
171 | %   Only the part of TOKENLIST behind the ')' found by search_token is used.
172 | %   See also: get_variables
173 | 
174 |     % everything up to and including that ')' counts as the declaration:
175 |     body_start = search_token('pair', ')', tokenlist, 1, +1) + 1;
176 |     variables = get_variables(tokenlist(body_start:end));
177 | end
178 | 
179 | 
180 | function variables = get_variables(tokenlist)
181 | %GET_VARIABLES collects the VARIABLES that are assigned in TOKENLIST
182 | %   Identifiers left of an equal sign and outside of braces/parentheses
183 | %   count as variables. Returns Tokens sorted by line, then by column.
184 | 
185 |     seen = containers.Map(); % variable name -> first Token stored for it
186 |     for eq_idx = 1:length(tokenlist)
187 |         if ~tokenlist(eq_idx).isEqual('punctuation', '=')
188 |             continue
189 |         end
190 |         % the left hand side starts where search_token reports a linebreak:
191 |         lhs_start = search_token('linebreak', [], tokenlist, eq_idx, -1);
192 |         depth = 0;
193 |         for candidate = tokenlist(lhs_start:eq_idx)
194 |             if candidate.isEqual('pair', {'{' '('})
195 |                 depth = depth + 1;
196 |             elseif candidate.isEqual('pair', {'}' ')'})
197 |                 depth = depth - 1;
198 |             elseif candidate.hasType('identifier') && ...
199 |                    depth == 0 && ...
200 |                    ~seen.isKey(candidate.text)
201 |                 % identifiers outside of any brackets are variable names:
202 |                 seen(candidate.text) = candidate;
203 |             end
204 |         end
205 |     end
206 |     found = seen.values();
207 |     variables = [found{:}]; % convert to object array
208 |     if isempty(variables)
209 |         return
210 |     end
211 |     % sort by column first, then by line; the second sort keeps the
212 |     % column ordering of variables that share a line:
213 |     [~, by_col] = sort([variables.col]);
214 |     variables = variables(by_col);
215 |     [~, by_line] = sort([variables.line]);
216 |     variables = variables(by_line);
217 | end
218 | 
219 | 
function name = get_funcname(tokenlist)
%GET_FUNCNAME analyzes TOKENLIST to find the function name
%   The name is taken to be the identifier found by searching backwards
%   from the first opening parenthesis.
%
%   NAME is a Token.

    paren_idx = search_token('pair', '(', tokenlist, 1, +1);
    name_idx = search_token('identifier', [], tokenlist, paren_idx, -1);
    name = tokenlist(name_idx);
end
228 | 
229 | 
function arguments = get_funcarguments(tokenlist)
%GET_FUNCARGUMENTS analyzes TOKENLIST to find function arguments
%   The arguments are the identifiers between the first opening
%   parenthesis and the closing parenthesis found after it.
%
%   ARGUMENTS is an object array of Tokens.

    open_idx = search_token('pair', '(', tokenlist, 1, +1);
    close_idx = search_token('pair', ')', tokenlist, open_idx, +1);
    between = tokenlist(open_idx+1:close_idx-1);
    % keep only the identifiers:
    is_identifier = strcmp({between.type}, 'identifier');
    arguments = between(is_identifier);
end
240 | 
241 | 
function returns = get_funcreturns(tokenlist)
%GET_FUNCRETURNS analyzes TOKENLIST to find function return values
%   The return values are the identifiers between the `function` keyword
%   and the function name, where the function name is the identifier
%   found by searching backwards from the first opening parenthesis.
%
%   RETURNS is an object array of Tokens.

    keyword_idx = search_token('keyword', 'function', tokenlist, 1, +1);
    paren_idx = search_token('pair', '(', tokenlist, keyword_idx, +1);
    name_idx = search_token('identifier', [], tokenlist, paren_idx, -1);
    between = tokenlist(keyword_idx+1:name_idx-1);
    % keep only the identifiers:
    is_identifier = strcmp({between.type}, 'identifier');
    returns = between(is_identifier);
end
253 | 
254 | 
function token_idx = search_token(token_type, token_text, tokenlist, token_idx, increment)
%SEARCH_TOKEN search TOKENLIST for token with TOKEN_TYPE and TOKEN_TEXT
%   starting from TOKEN_IDX and stepping with INCREMENT.
%
%   To search for any Token with a given TOKEN_TYPE, leave TOKEN_TEXT empty
%   To search for any Token with a given TOKEN_TEXT, leave TOKEN_TYPE empty
%   Set INCREMENT to 1 for forward searching and -1 for backward searching
%
%   Returns the TOKEN_IDX of the first matching token. If the search
%   reaches either end of TOKENLIST without a match, the index of the
%   last token visited is returned instead. If both TOKEN_TYPE and
%   TOKEN_TEXT are empty, TOKEN_IDX is returned unchanged.

    % pick the match criterion once, then run a single search loop
    if ~isempty(token_type) && ~isempty(token_text)
        is_match = @(token) token.isEqual(token_type, token_text);
    elseif ~isempty(token_text)
        is_match = @(token) token.hasText(token_text);
    elseif ~isempty(token_type)
        is_match = @(token) token.hasType(token_type);
    else
        return
    end

    while ~is_match(tokenlist(token_idx))
        next_idx = token_idx + increment;
        % stop at the boundary instead of stepping outside TOKENLIST
        if next_idx < 1 || next_idx > length(tokenlist)
            break
        end
        token_idx = next_idx;
    end
end
291 | 


--------------------------------------------------------------------------------
/check.m:
--------------------------------------------------------------------------------
   1 | function check(filename)
   2 | %CHECK a source file FILENAME for problems
   3 | %
   4 | %   CHECK does a deep analysis of the code in FILENAME, and reports on
   5 | %   problems with the code.
   6 | %
   7 | %   Each function defined in the file is reported separately, with
   8 | %   separate statistics and warnings. Minor warnings are written in
   9 | %   black, while major warnings are printed red. Even though some
  10 | %   warnings are somewhat subjective, in general, at least all red
  11 | %   issues *should* be fixed.
  12 | %
  13 | %   Every warning is presented as a clickable link that will jump to the
  14 | %   correct line in the editor.
  15 | %
  16 | %   Many warnings have configurable settings in CHECK_SETTINGS. Note
  17 | %   though that *disabling* a warning does not count as *fixing* it.
  18 | %
  19 | %   Warnings include:
  20 | %   - Required files to run the code
  21 | %   - Required toolboxes to run the code
  22 | %   - High number of lines
  23 | %   - High number of function arguments
  24 | %   - High number of used variables
  25 | %   - Too many levels of nesting
  26 | %   - Too much function complexity
  27 | %   - MLINT warnings
  28 | %   - missing documentation, or missing documentation of function arguments
  29 | %   - not enough comments
  30 | %   - incorrect or insufficient indentation
  31 | %   - excessive line length
  32 | %   - too short variable names
  33 | %   - no spaces around some operators
  34 | %   - use of dangerous functions like eval
  35 | 
  36 | % (c) 2016, Bastian Bechtold
  37 | % This code is licensed under the terms of the BSD 3-clause license
  38 | 
  39 |     [requiredFiles, requiredProducts] = ...
  40 |         matlab.codetools.requiredFilesAndProducts(filename);
  41 |     % manually fetch file name, since checkcode won't do it correctly
  42 |     fullfilename = which(filename);
  43 |     mlintInfo = ...
  44 |         checkcode(fullfilename, '-cyc', '-id', '-struct', '-fullpath');
  45 | 
  46 |     source_code = fileread(filename);
  47 |     tokens = tokenize_code(source_code);
  48 |     func_report = analyze_file(fullfilename, tokens);
  49 | 
  50 |     fprintf('Code Analysis for <strong>%s</strong>\n\n', filename);
  51 | 
  52 |     fprintf('  Required files: ');
  53 |     for file_idx = 1:length(requiredFiles)
  54 |         [~, basename, ext] = fileparts(requiredFiles{file_idx});
  55 |         fprintf('%s%s', basename, ext);
  56 |         if file_idx < length(requiredFiles)
  57 |             fprintf(', ');
  58 |         else
  59 |             fprintf('\n');
  60 |         end
  61 |     end
  62 | 
  63 |     fprintf('  Required toolboxes: ');
  64 |     for product_idx = 1:length(requiredProducts)
  65 |         fprintf('%s%s', requiredProducts(product_idx).Name);
  66 |         if product_idx < length(requiredProducts)
  67 |             fprintf(', ');
  68 |         else
  69 |             fprintf('\n\n');
  70 |         end
  71 |     end
  72 | 
  73 |     for func = func_report
  74 |         print_code_report(func, mlintInfo, 2);
  75 |     end
  76 | end
  77 | 
  78 | 
  79 | function print_code_report(func, mlintInfo, indentation)
  80 | %PRINT_CODE_REPORT prints a comprehensive report about a code block FUNC
  81 | %   The printed text is indented at INDENTATION spaces.
  82 | %
  83 | %   FUNC is analyzed for many common defects and stylistic mishaps, and
  84 | %   prints a nicely formatted list of issues, plus some additional
  85 | %   statistics about the code block.
  86 | %
  87 | %   Depending on the type of code block (Function, Subfunction, Nested
  88 | %   Function, Class, Script) different kinds of statistics are reported.
  89 | %
  90 | %   Additionally, many warnings are collected and presented, including
  91 | %   MLINT warnings from MLINTINFO.
  92 | 
  93 |     prefix = repmat(' ', 1, indentation);
  94 |     link = sprintf('<a href="%s">Line %i, col %i</a>', ...
  95 |                    open_file_link(func.filename, func.name.line), ...
  96 |                    func.name.line, func.name.col);
  97 |     fprintf('%s%s <strong>%s</strong> (%s):\n\n', ...
  98 |             prefix, func.type, func.name.text, link);
  99 | 
 100 |     functypes = {'Function', 'Subfunction', 'Nested Function'};
 101 |     if any(strcmp(func.type, functypes))
 102 |         stats = get_function_stats(func, mlintInfo);
 103 |         print_function_stats(stats, indentation+2);
 104 |         fprintf('\n');
 105 |     elseif strcmp(func.type, 'Class')
 106 |         stats = get_class_stats(func);
 107 |         print_class_stats(stats, indentation+2);
 108 |         fprintf('\n');
 109 |     elseif strcmp(func.type, 'Script')
 110 |         stats = get_script_stats(func);
 111 |         print_script_stats(stats, indentation+2);
 112 |         fprintf('\n');
 113 |     end
 114 | 
 115 |     reports = [report_documentation(func) ...
 116 |                report_comments(func.body) ...
 117 |                report_mlint_warnings(mlintInfo, func.body) ...
 118 |                report_indentation(func) ...
 119 |                report_line_length(func.body) ...
 120 |                report_variables(func.variables, func.body, 'variable') ...
 121 |                report_operators(func.body) ...
 122 |                report_eval(func.body)];
 123 | 
 124 |     if any(strcmp(func.type, functypes))
 125 |         reports = [reports ...
 126 |                    report_variables(func.name, func.body, ...
 127 |                                     'function') ...
 128 |                    report_variables(func.arguments, func.body, ...
 129 |                                     'function argument') ...
 130 |                    report_variables(func.returns, func.body, ...
 131 |                                     'return argument')];
 132 |     end
 133 | 
 134 |     if ~isempty(reports)
 135 |         % First, secondary sort by column
 136 |         report_tokens = [reports.token];
 137 |         [~, sort_idx] = sort([report_tokens.col]);
 138 |         reports = reports(sort_idx);
 139 |         % Second, primary sort by line (preserves secondary
 140 |         % sorting order in case of collisions)
 141 |         report_tokens = [reports.token];
 142 |         [~, sort_idx] = sort([report_tokens.line]);
 143 |         reports = reports(sort_idx);
 144 |         print_report(reports, indentation+2, func.filename);
 145 |     end
 146 | 
 147 |     fprintf('\n\n');
 148 | 
 149 |     for subfunc = func.children
 150 |         print_code_report(subfunc, mlintInfo, indentation+4)
 151 |     end
 152 | end
 153 | 
 154 | 
 155 | function class_stats = get_class_stats(class_struct)
 156 | %GET_CLASS_STATS analyzes a script CLASS_STRUCT and
 157 | %   gathers some statistics CLASS_STATS about them.
 158 | %
 159 | %   Statistics gathered (fieldname):
 160 | %   - number of lines (num_lines)
 161 | %   - number of properties (num_properties)
 162 | %   - number of methods (num_methods)
 163 | %
 164 | %   The statistics are returned as struct CLASS_STATS
 165 | 
 166 |     class_stats.num_lines = length(split_lines(class_struct.body));
 167 |     class_stats.num_properties = length(class_struct.variables);
 168 |     class_stats.num_methods = length(class_struct.children);
 169 | end
 170 | 
 171 | 
 172 | function print_class_stats(class_stats, indentation)
 173 | %PRINT_CLASS_STATS prints some general statistics CLSS_STATS about
 174 | %   a class. The printed text is indented at INDENTATION spaces.
 175 | %
 176 | %   This function prints an evaluation of
 177 | %   - the number of lines in the function
 178 | %   - the number of properties
 179 | %   - the number of methods
 180 | %
 181 | %   All of these values are evaluated as `good` if they are below a
 182 | %   certain low threshold; as `high` if they are above this threshold
 183 | %   and as `too high` and in red text if they exceed a high threshold.
 184 | %   The thresholds can be controlled using the settings
 185 | %   - `lo_class_num_lines` and `hi_class_num_lines`
 186 | %   - `lo_class_num_properties` and `hi_class_num_properties`
 187 | %   - `lo_class_num_methods` and `hi_class_num_methods`
 188 | 
 189 |     prefix = repmat(' ', 1, indentation);
 190 | 
 191 |     fprintf('%sNumber of lines: ', prefix);
 192 |     print_evaluation(class_stats.num_lines, ...
 193 |                      check_settings('lo_class_num_lines'), ...
 194 |                      check_settings('hi_class_num_lines'));
 195 | 
 196 |     fprintf('%sNumber of properties: ', prefix);
 197 |     print_evaluation(class_stats.num_properties, ...
 198 |                      check_settings('lo_class_num_properties'), ...
 199 |                      check_settings('hi_class_num_properties'));
 200 | 
 201 |     fprintf('%sNumber of methods: ', prefix);
 202 |     print_evaluation(class_stats.num_methods, ...
 203 |                      check_settings('lo_class_num_methods'), ...
 204 |                      check_settings('hi_class_num_methods'));
 205 | end
 206 | 
 207 | 
 208 | function script_stats = get_script_stats(script_struct)
 209 | %GET_SCRIPT_STATS analyzes a script SCRIPT_STRUCT and
 210 | %   gathers some statistics SCRIPT_STATS about them.
 211 | %
 212 | %   Statistics gathered (fieldname):
 213 | %   - number of lines (num_lines)
 214 | %   - number of variables used in the function (num_variables)
 215 | %   - the maximum level of indentation in the function (max_indentation)
 216 | %
 217 | %   The statistics are returned as struct SCRIPT_STATS
 218 | 
 219 |     script_stats.num_lines = length(split_lines(script_struct.body));
 220 |     script_stats.num_variables = length(script_struct.variables);
 221 | 
 222 |     % max indentation
 223 |     keyword_indices = strcmp({script_struct.body.type}, 'keyword');
 224 |     keywords = script_struct.body(keyword_indices);
 225 |     indentation = 1;
 226 |     max_indentation = 0;
 227 |     for keyword = keywords
 228 |         if keyword.hasText({'if' 'for' 'parfor' 'while' 'switch'})
 229 |             indentation = indentation + 1;
 230 |             max_indentation = max(max_indentation, indentation);
 231 |         elseif keyword.hasText('end')
 232 |             indentation = indentation - 1;
 233 |         end
 234 |     end
 235 |     script_stats.max_indentation = max_indentation;
 236 | end
 237 | 
 238 | 
 239 | function print_script_stats(script_stats, indentation)
 240 | %PRINT_SCRIPT_STATS prints some general statistics SCRIPT_STATS about
 241 | %   a script. The printed text is indented at INDENTATION spaces.
 242 | %
 243 | %   This function prints an evaluation of
 244 | %   - the number of lines in the function
 245 | %   - the number of variables used in the script
 246 | %   - the maximum level of indentation in the script
 247 | %
 248 | %   All of these values are evaluated as `good` if they are below a
 249 | %   certain low threshold; as `high` if they are above this threshold
 250 | %   and as `too high` and in red text if they exceed a high threshold.
 251 | %   The thresholds can be controlled using the settings
 252 | %   - `lo_script_num_lines` and `hi_script_num_lines`
 253 | %   - `lo_script_num_variables` and `hi_script_num_variables`
 254 | %   - `lo_script_max_indentation` and `hi_script_max_indentation`
 255 |     prefix = repmat(' ', 1, indentation);
 256 | 
 257 |     fprintf('%sNumber of lines: ', prefix);
 258 |     print_evaluation(script_stats.num_lines, ...
 259 |                      check_settings('lo_script_num_lines'), ...
 260 |                      check_settings('hi_script_num_lines'));
 261 | 
 262 |     fprintf('%sNumber of variables: ', prefix);
 263 |     print_evaluation(script_stats.num_variables, ...
 264 |                      check_settings('lo_script_num_variables'), ...
 265 |                      check_settings('hi_script_num_variables'));
 266 | 
 267 |     fprintf('%sNumber of variables: ', prefix);
 268 |     print_evaluation(script_stats.max_indentation, ...
 269 |                      check_settings('lo_script_max_indentation'), ...
 270 |                      check_settings('hi_script_max_indentation'));
 271 | end
 272 | 
 273 | 
 274 | function func_stats = get_function_stats(func_struct, mlintInfo)
 275 | %GET_FUNCTION_STATS analyzes a function FUNC_STRUCT and MLINTINFO and
 276 | %   gathers some statistics FUNC_STATS about them.
 277 | %
 278 | %   Statistics gathered (fieldname):
 279 | %   - number of lines (num_lines)
 280 | %   - number of function arguments (num_arguments)
 281 | %   - number of variables used in the function (num_variables)
 282 | %   - the maximum level of indentation in the function (max_indentation)
 283 | %   - the function complexity (complexity)
 284 | %
 285 | %   The statistics are returned as struct FUNC_STATS
 286 | 
 287 |     func_stats.num_lines = length(split_lines(func_struct.body));
 288 |     func_stats.num_arguments = length(func_struct.arguments);
 289 |     func_stats.num_variables = length(func_struct.variables);
 290 | 
 291 |     % max indentation
 292 |     keyword_indices = strcmp({func_struct.body.type}, 'keyword');
 293 |     keywords = func_struct.body(keyword_indices);
 294 |     indentation = 1;
 295 |     max_indentation = 0;
 296 |     for keyword = keywords
 297 |         if keyword.hasText({'if' 'for' 'parfor' 'while' 'switch'})
 298 |             indentation = indentation + 1;
 299 |             max_indentation = max(max_indentation, indentation);
 300 |         elseif keyword.hasText('end')
 301 |             indentation = indentation - 1;
 302 |         end
 303 |     end
 304 |     func_stats.max_indentation = max_indentation;
 305 | 
 306 |     % cyclomatic complexity
 307 |     mlintInfo = mlintInfo(strcmp({mlintInfo.id}, 'CABE'));
 308 |     mlintInfo = mlintInfo([mlintInfo.line] == func_struct.body(1).line);
 309 |     assert(length(mlintInfo) == 1);
 310 |     pattern = '''(?<f>[^'']+)'' is (?<n>[0-9]+)';
 311 |     matches = regexp(mlintInfo.message, pattern, 'names');
 312 |     func_stats.complexity = str2double(matches.n);
 313 | end
 314 | 
 315 | 
 316 | function print_function_stats(func_stats, indentation)
 317 | %PRINT_FUNCTION_STATS prints some general statistics FUNC_STATS about
 318 | %   a function. The printed text is indented at INDENTATION spaces.
 319 | %
 320 | %   This function prints an evaluation of
 321 | %   - the number of lines in the function
 322 | %   - the number of function arguments
 323 | %   - the number of variables used in the function
 324 | %   - the maximum level of indentation in the function
 325 | %   - the function complexity
 326 | %
 327 | %   All of these values are evaluated as `good` if they are below a
 328 | %   certain low threshold; as `high` if they are above this threshold
 329 | %   and as `too high` and in red text if they exceed a high threshold.
 330 | %   The thresholds can be controlled using the settings
 331 | %   - `lo_function_num_lines` and `hi_function_num_lines`
 332 | %   - `lo_function_num_arguments` and `hi_function_num_arguments`
 333 | %   - `lo_function_num_variables` and `hi_function_num_variables`
 334 | %   - `lo_function_max_indentation` and `hi_function_max_indentation`
 335 | %   - `lo_function_complexity` and `hi_function_complexity`
 336 | 
 337 |     prefix = repmat(' ', 1, indentation);
 338 | 
 339 |     fprintf('%sNumber of lines: ', prefix);
 340 |     print_evaluation(func_stats.num_lines, ...
 341 |                      check_settings('lo_function_num_lines'), ...
 342 |                      check_settings('hi_function_num_lines'));
 343 | 
 344 |     fprintf('%sNumber of function arguments: ', prefix);
 345 |     print_evaluation(func_stats.num_arguments, ...
 346 |                      check_settings('lo_function_num_arguments'), ...
 347 |                      check_settings('hi_function_num_arguments'));
 348 | 
 349 |     fprintf('%sNumber of used variables: ', prefix);
 350 |     print_evaluation(func_stats.num_variables, ...
 351 |                      check_settings('lo_function_num_variables'), ...
 352 |                      check_settings('hi_function_num_variables'));
 353 | 
 354 |     fprintf('%sMax level of nesting: ', prefix);
 355 |     print_evaluation(func_stats.max_indentation, ...
 356 |                      check_settings('lo_function_max_indentation'), ...
 357 |                      check_settings('hi_function_max_indentation'));
 358 | 
 359 |     fprintf('%sCode complexity: ', prefix);
 360 |     print_evaluation(func_stats.complexity, ...
 361 |                      check_settings('lo_function_complexity'), ...
 362 |                      check_settings('hi_function_complexity'));
 363 | end
 364 | 
 365 | 
 366 | function print_evaluation(value, low_thr, high_thr)
 367 | %PRINT_EVALUATION prints an evaluation of VALUE.
 368 | %   LOW_THR and HIGH_THR mark thresholds, above which the value is
 369 | %   described as "(good)" -> "(high)" -> "(too high)" in red
 370 | 
 371 |     if value < low_thr
 372 |         fprintf('%i (good)\n', value);
 373 |     elseif value < high_thr
 374 |         fprintf('%i (high)\n', value);
 375 |     else
 376 |         fprintf('%i [\b(too high)]\b\n', value);
 377 |     end
 378 | end
 379 | 
 380 | 
 381 | function print_report(report, indentation, filename)
 382 | %PRINT_REPORT prints the contents of REPORT at INDENTATION. Each REPORT
 383 | %   item is written as a link to the appropriate place in FILENAME.
 384 | 
 385 |     prefix = repmat(' ', 1, indentation);
 386 | 
 387 |     for report_entry = report
 388 |         % print severe report_entrys in red:
 389 |         % red text is created by surrounding it with `[<backspace>` and
 390 |         % `]<backspace>`. The `<backspace>` will delete the preceding
 391 |         % bracket and not show up in the text itself, but it will be
 392 |         % interpreted as a flag to change the text color. This is an
 393 |         % ancient ASCII convention.
 394 |         if report_entry.severity == 2
 395 |             fprintf('%s<a href="%s">Line %i, col %i</a>: [\b%s]\b\n', ...
 396 |                     prefix, ...
 397 |                     open_file_link(filename, report_entry.token.line), ...
 398 |                     report_entry.token.line, ...
 399 |                     report_entry.token.col, ...
 400 |                     report_entry.message);
 401 | 
 402 |         % print regular report_entrys in black:
 403 |         else
 404 |             fprintf('%s<a href="%s">Line %i, col %i</a>: %s\n', ...
 405 |                     prefix, ...
 406 |                     open_file_link(filename, report_entry.token.line), ...
 407 |                     report_entry.token.line, ...
 408 |                     report_entry.token.col, ...
 409 |                     report_entry.message);
 410 |         end
 411 |     end
 412 | end
 413 | 
 414 | 
 415 | function report = report_comments(tokenlist)
 416 | %REPORT_COMMENTS REPORTs on the number of comments in TOKENLIST.
 417 | %
 418 | %   Comments should not describe the code itself, but provide context
 419 | %   for reading the code. In other words, they should describe the
 420 | %   *why*, not the *what.
 421 | %
 422 | %   returns a struct array REPORT with fields `token`, `message`, and
 423 | %   `severity`.
 424 | %
 425 | %   This check can be switched off by setting `do_check_comments` in
 426 | %   CHECK_SETTINGS to FALSE.
 427 | 
 428 |     report = struct('token', {}, 'severity', {}, 'message', {});
 429 |     if ~check_settings('do_check_comments')
 430 |         return
 431 |     end
 432 | 
 433 |     linelist = split_lines(tokenlist);
 434 |     num_lines = length(linelist);
 435 |     num_comments = 0;
 436 |     for line_idx = 1:length(linelist)
 437 |         line_tokens = linelist{line_idx};
 438 |         if any(strcmp({line_tokens.type}, 'comment'))
 439 |             num_comments = num_comments + 1;
 440 |         end
 441 |     end
 442 | 
 443 |     usage = sprintf('(%i comments for %i lines of code)', ...
 444 |                     num_comments, num_lines);
 445 |     if num_comments/num_lines < 0.1
 446 |         report = struct('token', tokenlist(1), ...
 447 |                         'severity', 2, ...
 448 |                         'message', ['too few comments ' usage]);
 449 |     elseif num_comments/num_lines < 0.2
 450 |         report = struct('token', tokenlist(1), ...
 451 |                         'severity', 1, ...
 452 |                         'message', ['very few comments ' usage]);
 453 |     end
 454 | end
 455 | 
 456 | 
 457 | function report = report_documentation(func_struct)
 458 | %REPORT_DOCUMENTATION REPORTs on problems with the documentation of the
 459 | %   function in FUNC_STRUCT.
 460 | %
 461 | %   Documentation is very important for humans. Code is not primarily
 462 | %   written for the machine to execute, but mostly for humans to read.
 463 | %   But many ideas are more efficiently described in prose than in code,
 464 | %   hence we write documentation. Functions in particular should always
 465 | %   be documented.
 466 | %
 467 | %   Problems might be:
 468 | %   - the function name is not mentioned in the documentation
 469 | %   - the function arguments are not mentioned
 470 | %   - the function return values are not mentioned
 471 | %   - there is no documentation
 472 | %
 473 | %   returns a struct array REPORT with fields `token`, `message`, and
 474 | %   `severity`.
 475 | %
 476 | %   This check can be switched off by setting `do_check_documentation` in
 477 | %   CHECK_SETTINGS to FALSE.
 478 | 
 479 |     report = struct('token', {}, 'severity', {}, 'message', {});
 480 |     if ~check_settings('do_check_documentation')
 481 |         return
 482 |     end
 483 | 
 484 |     doc_text = get_function_documentation(func_struct.body);
 485 |     if isempty(doc_text)
 486 |         msg = 'there is no documentation';
 487 |         report = [report struct('token', func_struct.body(1), ...
 488 |                                 'severity', 2, ...
 489 |                                 'message', msg)];
 490 |         return
 491 |     end
 492 |     template = '%s ''%s'' is not mentioned in the documentation';
 493 |     [~, funcname, ~] = fileparts(func_struct.name.text);
 494 |     if isempty(strfind(lower(doc_text), lower(funcname)))
 495 |         msg = sprintf(template, 'function name', func_struct.name.text);
 496 |         report = [report struct('token', func_struct.name, ...
 497 |                                 'severity', 2, ...
 498 |                                 'message', msg)];
 499 |     end
 500 |     for variable = func_struct.arguments
 501 |         if isempty(strfind(lower(doc_text), lower(variable.text))) && ...
 502 |         	~strcmp(doc_text, 'varargin')
 503 |             msg = sprintf(template, 'function argument', variable.text);
 504 |             report = [report struct('token', variable, ...
 505 |                                     'severity', 2, ...
 506 |                                     'message', msg)]; %#ok
 507 |         end
 508 |     end
 509 |     for variable = func_struct.returns
 510 |         if isempty(strfind(lower(doc_text), lower(variable.text))) && ...
 511 |         	~strcmp(doc_text, 'varargout')
 512 |             msg = sprintf(template, 'return argument', variable.text);
 513 |             report = [report struct('token', variable, ...
 514 |                                     'severity', 2, ...
 515 |                                     'message', msg)]; %#ok
 516 |         end
 517 |     end
 518 | end
 519 | 
 520 | 
 521 | function doc_text = get_function_documentation(tokenlist)
 522 | %GET_FUNCTION_DOCUMENTATION extracts function documentation from TOKENLIST
 523 | %
 524 | %   returns DOC_TEXT as a string
 525 | 
 526 |     % skip function declaration
 527 |     token_idx = 1;
 528 |     while token_idx <= length(tokenlist) && ...
 529 |           ~tokenlist(token_idx).isEqual('pair', ')')
 530 |         token_idx = token_idx + 1;
 531 |     end
 532 |     token_idx = token_idx + 2;
 533 | 
 534 |     % find documentation
 535 |     doc_types = {'comment' 'space' 'linebreak'};
 536 |     start = token_idx;
 537 |     while token_idx <= length(tokenlist) && ...
 538 |           tokenlist(token_idx).hasType(doc_types)
 539 |         token_idx = token_idx + 1;
 540 |     end
 541 | 
 542 |     % extract documentation text
 543 |     comment_tokens = tokenlist(start:token_idx-1);
 544 |     comment_tokens = ...
 545 |         comment_tokens(strcmp({comment_tokens.type}, 'comment'));
 546 |     doc_text = [comment_tokens.text];
 547 | end
 548 | 
 549 | 
 550 | function report = report_eval(tokenlist)
 551 | %REPORT_EVAL REPORTs on uses of `eval` in TOKENLIST.
 552 | %
 553 | %   Using `eval` is *never* the right thing to do. There is *always*
 554 | %   a better way. Seriously.
 555 | %
 556 | %   returns a struct array REPORT with fields `token`, `message`, and
 557 | %   `severity`.
 558 | %
 559 | %   This check can be switched off by setting `do_check_eval` in
 560 | %   CHECK_SETTINGS to FALSE.
 561 | 
 562 |     report = struct('token', {}, 'severity', {}, 'message', {});
 563 |     if ~check_settings('do_check_eval')
 564 |         return
 565 |     end
 566 | 
 567 |     eval_tokens = tokenlist(strcmp({tokenlist.text}, 'eval') & ...
 568 |                             strcmp({tokenlist.type}, 'identifier'));
 569 |     for t = eval_tokens
 570 |         msg = 'Eval should never be used';
 571 |         report = [report struct('token', t, ...
 572 |                                 'severity', 2, ...
 573 |                                 'message', msg)]; %#ok
 574 |     end
 575 | end
 576 | 
 577 | 
 578 | function report = report_operators(tokenlist)
 579 | %REPORT_OPERATORS reports on incorrectly used operators in TOKENLIST
 580 | %
 581 | %   To improve readability, operators should be treated like punctuation
 582 | %   in regular English, i.e. be preceded and followed by spaces just like
 583 | %   in English and math. In particular:
 584 | %   - relational operators such as `>`, `<`, `==`, `~=`, `<=`, `>=`, `=`,
 585 | %     `||`, and `&&` should be surrounded by spaces.
 586 | %   - punctuation such as `,` and `;` should be followed by a space.
 587 | %   - unary operators such as `@` and `...` should be preceded by a space.
 588 | %
 589 | %   returns a struct array REPORT with fields `token`, `message`, and
 590 | %   `severity`.
 591 | %
 592 | %   This check can be switched off by setting `do_check_operators` in
 593 | %   CHECK_SETTINGS to FALSE.
 594 | 
 595 |     report = struct('token', {}, 'severity', {}, 'message', {});
 596 |     if ~check_settings('do_check_operators')
 597 |         return
 598 |     end
 599 | 
 600 |     space_around_operators = { '>' '<' '==' '>=' '<=' '~=' ...
 601 |                                '=' '||' '&&'};
 602 |     space_after_operators = { ',' ';' };
 603 |     space_before_operators = { '@' '...' };
 604 | 
 605 |     op_indices = find(strcmp({tokenlist.type}, 'punctuation'));
 606 |     for op_idx = op_indices
 607 |         has_space_before = op_idx > 1 && ...
 608 |                            tokenlist(op_idx-1).hasType('space');
 609 |         has_space_after = op_idx < length(tokenlist) && ...
 610 |                           tokenlist(op_idx+1).hasType('space');
 611 |         has_newline_after = op_idx < length(tokenlist) && ...
 612 |                             tokenlist(op_idx+1).hasText(sprintf('\n'));
 613 |         if tokenlist(op_idx).hasText(space_around_operators) && ...
 614 |            (~has_space_before || ~has_space_after)
 615 |             msg = sprintf('no spaces around operator ''%s''', ...
 616 |                           tokenlist(op_idx).text);
 617 |             report = [report struct('token', tokenlist(op_idx), ...
 618 |                                     'severity', 1, ...
 619 |                                     'message', msg)]; %#ok
 620 |         elseif tokenlist(op_idx).hasText(space_after_operators) && ...
 621 |                ~has_space_after && ~has_newline_after
 622 |             msg = sprintf('no spaces after operator ''%s''', ...
 623 |                           tokenlist(op_idx).text);
 624 |             report = [report struct('token', tokenlist(op_idx), ...
 625 |                                     'severity', 1, ...
 626 |                                     'message', msg)]; %#ok
 627 |         elseif tokenlist(op_idx).hasText(space_before_operators) && ...
 628 |                ~has_space_before
 629 |             msg = sprintf('no spaces before operator ''%s''', ...
 630 |                           tokenlist(op_idx).text);
 631 |             report = [report struct('token', tokenlist(op_idx), ...
 632 |                                     'severity', 1, ...
 633 |                                     'message', msg)]; %#ok
 634 |         end
 635 |     end
 636 | end
 637 | 
 638 | 
 639 | function report = report_variables(varlist, tokenlist, description)
 640 | %REPORT_VARIABLES checks all variables in VARLIST, as used in TOKENLIST,
 641 | %   and REPORTs on problems with these variables. DESCRIPTION is used
 642 | %   to describe the variable in REPORT.
 643 | %
 644 | %   Problems with variables can be:
 645 | %   - The variable shadows a built-in
 646 | %   - The variable has a very short name and is used very often.
 647 | %
 648 | %   In general, variable name lengths should correlate with the amount
 649 | %   of code they are used in. If variables are used over a long piece
 650 | %   of code, the programmer will stumble across the variable often,
 651 | %   and it should have a descriptive name. Short variable names are
 652 | %   only allowed if they are ephemeral, such as loop counters in small
 653 | %   loops. There, they don't need to be remembered for long, thus a short
 654 | %   name is permissible.
 655 | %
 656 | %   returns a struct array REPORT with fields `token`, `message`, and
 657 | %   `severity`.
 658 | %
 659 | %   This check can be switched off by setting `do_check_variables` in
 660 | %   CHECK_SETTINGS to FALSE.
 661 | 
 662 |     report = struct('token', {}, 'severity', {}, 'message', {});
 663 |     if ~check_settings('do_check_variables')
 664 |         return
 665 |     end
 666 | 
 667 |     for variable = varlist
 668 |         if does_shadow(variable.text) && ...
 669 |         	~any(strcmp(variable.text, {'varargin', 'varargout'}))
 670 |             msg = sprintf('%s ''%s'' shadows a built-in', ...
 671 |                           description, variable.text);
 672 |             report = [report struct('token', variable, ...
 673 |                                     'severity', 2, ...
 674 |                                     'message', msg)]; %#ok
 675 |         end
 676 |         [numuses, spread] = get_variable_usage(variable.text, tokenlist);
 677 |         usage_descr = sprintf('(used %i times across %i lines)', ...
 678 |                               numuses, spread);
 679 |         varlen = length(variable.text);
 680 | 
 681 |         short_spread = check_settings('lo_varname_short_spread');
 682 |         short_length = check_settings('lo_varname_short_length');
 683 |         long_spread = check_settings('lo_varname_long_spread');
 684 |         long_length = check_settings('lo_varname_long_length');
 685 |         slightly_too_short = ...
 686 |             (spread > short_spread && varlen <= short_length) || ...
 687 |             (spread > long_spread && varlen <= long_length);
 688 | 
 689 |         short_spread = check_settings('hi_varname_short_spread');
 690 |         short_length = check_settings('hi_varname_short_length');
 691 |         long_spread = check_settings('hi_varname_long_spread');
 692 |         long_length = check_settings('hi_varname_long_length');
 693 |         much_too_short = ...
 694 |             (spread > short_spread && varlen <= short_length) || ...
 695 |             (spread > long_spread && varlen <= long_length);
 696 | 
 697 | 
 698 |         if slightly_too_short
 699 |             msg = sprintf('%s ''%s'' is very short %s', ...
 700 |                           description, variable.text, usage_descr);
 701 |             report = [report struct('token', variable, ...
 702 |                                     'severity', 1, ...
 703 |                                     'message', msg)]; %#ok
 704 |         elseif much_too_short
 705 |             msg = sprintf('%s ''%s'' is too short %s', ...
 706 |                           description, variable.text, usage_descr);
 707 |             report = [report struct('token', variable, ...
 708 |                                     'severity', 2, ...
 709 |                                     'message', msg)]; %#ok
 710 |         end
 711 |     end
 712 | end
 713 | 
 714 | 
 715 | function [numuses, linerange] = get_variable_usage(varname, tokenlist)
 716 | %GET_VARIABLE_USAGE finds all uses of variable VARNAME in TOKENLIST
 717 | %   Returns the number of uses NUMUSES and the range of lines LINERANGE
 718 | %   in which the variable is used.
 719 | 
 720 |     uses = tokenlist(strcmp({tokenlist.text}, varname) & ...
 721 |                      strcmp({tokenlist.type}, 'identifier'));
 722 |     numuses = length(uses);
 723 |     linelist = [uses.line];
 724 |     linerange = max(linelist)-min(linelist);
 725 | end
 726 | 
 727 | 
 728 | function report = report_mlint_warnings(mlint_info, tokenlist)
 729 | %REPORT_MLINT_WARNINGS reads through MLINT_INFO and REPORTs on all messages
 730 | %   that refer to the code in TOKENLIST.
 731 | %
 732 | %   returns a struct array REPORT with fields `token`, `message`, and
 733 | %   `severity`.
 734 | %
 735 | %   This check can be switched off by setting `do_check_mlint_warnings` in
 736 | %   CHECK_SETTINGS to FALSE.
 737 | 
 738 |     report = struct('token', {}, 'severity', {}, 'message', {});
 739 |     if ~check_settings('do_check_mlint_warnings')
 740 |         return
 741 |     end
 742 | 
 743 |     mlint_info = mlint_info([mlint_info.line] >= tokenlist(1).line);
 744 |     mlint_info = mlint_info([mlint_info.line] <= tokenlist(end).line);
 745 |     mlint_info = mlint_info(~strcmp({mlint_info.id}, 'CABE'));
 746 |     if isempty(mlint_info)
 747 |         return
 748 |     end
 749 |     for idx = 1:length(mlint_info)
 750 |         mlint_msg = mlint_info(idx);
 751 |         token = Token('special', 'mlint warning', ...
 752 |                       mlint_msg.line, mlint_msg.column(1));
 753 |         report = [report struct('token', token, ...
 754 |                                 'severity', 2, ...
 755 |                                 'message', mlint_msg.message)]; %#ok
 756 |     end
 757 | end
 758 | 
 759 | 
 760 | function is_builtin = does_shadow(varname)
 761 | %DOES_SHADOW figures out if variable with name VARNAME shadows a built-in
 762 | %   function or variable.
 763 | %
 764 | %   returns a boolean IS_BUILTIN.
 765 | 
 766 |     if any(exist(varname) == [2 3 4 5 6 8]) %#ok
 767 |         % now we know that something with name `varname` exists. But is it
 768 |         % a built-in, or something I wrote?
 769 |         % `which` can tell, in one of three spellings:
 770 |         shadows = which(varname, '-all');
 771 |         builtinfun = 'is a built-in method';
 772 |         builtinstr = 'built-in';
 773 |         for idx = 1:length(shadows)
 774 |             shadow = shadows{idx};
 775 |             if ( length(shadow) >= length(matlabroot) && ...
 776 |                  strcmp(shadow(1:length(matlabroot)), matlabroot) ) || ...
 777 |                ( length(shadow) >= length(builtinstr) && ...
 778 |                  strcmp(shadow(1:length(builtinstr)), builtinstr) ) || ...
 779 |                ( length(shadow) >= length(builtinfun) && ...
 780 |                  strcmp(shadow(end-length(builtinfun)+1:end), builtinfun) )
 781 |                 is_builtin = true;
 782 |                 return
 783 |             end
 784 |         end
 785 |     end
 786 |     is_builtin = false;
 787 | end
 788 | 
 789 | 
 790 | function report = report_line_length(tokenlist)
 791 | %REPORT_LINE_LENGTH walks through TOKENLIST and REPORTs on the length of
 792 | %   all lines.
 793 | %
 794 | %   While line length should not matter with today's high-resolution
 795 | %   displays, it is still useful to limit line lengths in order to be
 796 | %   able to fit several editor panes next to one another, or to be able
 797 | %   print the source code.
 798 | %
 799 | %   - By default, lines longer than 75 characters are flagged
 800 | %     as `very long`, and
 801 | %   - lines longer than 90 characters are flagged as `too long`.
 802 | %
 803 | %   returns a struct array REPORT with fields `token`, `message`, and
 804 | %   `severity`.
 805 | %
 806 | %   This check can be switched off by setting `do_check_line_length` in
 807 | %   CHECK_SETTINGS to FALSE.
 808 | 
 809 |     report = struct('token', {}, 'message', {}, 'severity', {});
 810 |     if ~check_settings('do_check_line_length')
 811 |         return
 812 |     end
 813 |     lo_line_length = check_settings('lo_line_length');
 814 |     hi_line_length = check_settings('hi_line_length');
 815 | 
 816 |     linelist = split_lines(tokenlist);
 817 |     for line_idx = 1:length(linelist)
 818 |         line_tokens = linelist{line_idx};
 819 |         line_text = [line_tokens.text];
 820 |         if length(line_text) > lo_line_length
 821 |             report_token = Token('special', 'line warning', ...
 822 |                                  line_tokens(1).line, ...
 823 |                                  length(line_text));
 824 |             report = [report struct('token', report_token, ...
 825 |                                     'message', 'line very long', ...
 826 |                                     'severity', 1)]; %#ok
 827 |         elseif length(line_text) > hi_line_length
 828 |             report_token = Token('special', 'line warning', ...
 829 |                                  line_tokens(1).line, ...
 830 |                                  length(line_text));
 831 |             report = [report struct('token', report_token, ...
 832 |                                     'message', 'line too long', ...
 833 |                                     'severity', 2)]; %#ok
 834 |         end
 835 |     end
 836 | end
 837 | 
 838 | 
 839 | function report = report_indentation(func_struct)
 840 | %REPORT_INDENTATION parses FUNC_STRUCT and REPORTs about its indentation.
 841 | %
 842 | %   Indentation is one of the primary means of making code easy to read,
 843 | %   by highlighting the structure of the code. If code is not indented
 844 | %   correctly, it can be hard to see where where nested blocks (if, for,
 845 | %   etc.) begin and end.
 846 | %
 847 | %   The first line is assumed to be indented correctly, and subsequent
 848 | %   indentation follows the normal MATLAB indentation rules:
 849 | %
 850 | %   - Indent after `for`, `parfor`, `while`, `if`, `switch`, `classdef`,
 851 | %                  `events`, `properties`, `enumeration`, `methods`,
 852 | %                  `function`.
 853 | %   - Dedent for `end`
 854 | %   - Dedent momentarily for `else`, `elseif`, `case`, `otherwise`.
 855 | %   - Comments are allowed to be indented one level out, and any amount of
 856 | %     deeper indentation than the source code.
 857 | %   - Continuation lines must be indented deeper than the surrounding
 858 | %     source code.
 859 | %
 860 | %   returns a struct array REPORT with fields `token`, `message`, and
 861 | %   `severity`.
 862 | %
 863 | %   This check can be switched off by setting `do_check_indentation` in
 864 | %   CHECK_SETTINGS to FALSE.
 865 | %
 866 | %   The setting `indentation_check_like_matlab` controls whether
 867 | %   indentation should be checked like MATLAB does it (top-level function
 868 | %   bodies are not indented in function files) or how every other language
 869 | %   on this planet does it (function bodies are always indented).
 870 | 
 871 |     report = struct('token', {}, 'message', {}, 'severity', {});
 872 |     if ~check_settings('do_check_indentation')
 873 |         return
 874 |     end
 875 | 
 876 |     linelist = split_lines(func_struct.body);
 877 | 
 878 |     nesting = func_struct.nesting;
 879 |     function_nesting = func_struct.nesting;
 880 | 
 881 |     is_switch_nesting = false;
 882 | 
 883 |     for line_idx = 1:length(linelist)
 884 |         line_tokens = linelist{line_idx};
 885 |         is_continuation = is_continuation_line(line_idx, linelist);
 886 | 
 887 |         if isempty(line_tokens)
 888 |             continue
 889 |         end
 890 | 
 891 |         first_nonspace = get_first_nonspace(line_tokens);
 892 | 
 893 | 
 894 |         if ~is_continuation
 895 |             [nesting, function_nesting, correction] = ...
 896 |                indentation_rule(nesting, function_nesting, first_nonspace);
 897 | 
 898 |             % Special case for switch
 899 |             if first_nonspace.isEqual('keyword', 'switch')
 900 |             	% Increment nesting by 1 for switch statement
 901 |                 nesting = nesting + 1;
 902 |                 correction = correction - 1;
 903 |                 is_switch_nesting = true;
 904 |             end
 905 | 
 906 |             if first_nonspace.isEqual('keyword', 'end') && is_switch_nesting
 907 |             	% Reverse nesting increment for switch statement at 'end'
 908 |                 nesting = nesting - 1;
 909 |                 is_switch_nesting = false;
 910 |             end
 911 |         end
 912 | 
 913 |         increment = check_settings('indentation_step');
 914 |         expected_indent = (nesting+correction) * increment;
 915 |         expected_indent = max(expected_indent, 0);
 916 | 
 917 |         current_indent = get_line_indentation(line_tokens);
 918 | 
 919 |         incorrect_comment = ...
 920 |             first_nonspace.hasType('comment') && ...
 921 |             ~(current_indent >= expected_indent) && ...
 922 |             current_indent ~= expected_indent-increment;
 923 |         incorrect_normal_line = ...
 924 |             ~first_nonspace.hasType('comment') && ...
 925 |             ~is_continuation && ...
 926 |             current_indent ~= expected_indent;
 927 |         incorrect_continuation_line = ...
 928 |             ~first_nonspace.hasType('comment') && ...
 929 |             is_continuation && ...
 930 |             current_indent <= expected_indent;
 931 | 
 932 |         if incorrect_comment || incorrect_normal_line || ...
 933 |            incorrect_continuation_line
 934 |             report_token = Token('special', 'indentation warning', ...
 935 |                              line_tokens(1).line, line_tokens(1).col);
 936 |             report_entry = struct('token', report_token, ...
 937 |                                   'message', 'incorrect indentation', ...
 938 |                                   'severity', 2);
 939 |             report = [report report_entry]; %#ok
 940 |         end
 941 |     end
 942 | end
 943 | 
 944 | 
 945 | function yesNo = is_continuation_line(line_idx, linelist)
 946 | %IS_CONTINUATION_LINE checks if LINELIST{LINE_IDX} is a continuation
 947 | %   of the previous line. YESNO is a boolean.
 948 | 
 949 |     if line_idx > 1
 950 |         previous_line = linelist{line_idx-1};
 951 |         yesNo = any(strcmp({previous_line.text}, '...'));
 952 |     else
 953 |         yesNo = false;
 954 |     end
 955 | end
 956 | 
 957 | 
 958 | function [nesting, function_nesting, correction] = indentation_rule(nesting, function_nesting, first_token)
 959 | %INDENTATION_RULE decides about the indentation of the current line
 960 | %   NESTING and FUNCTION_NESTING will change depending on the
 961 | %   FIRST_TOKEN on the current line.
 962 | %
 963 | %   NESTING holds the current nesting within if/for/function blocks and
 964 | %   FUNCTION_NESTING holds the current nesting within function blocks.
 965 | %   CORRECTION is an offset on NESTING for the current line only.
 966 | %
 967 | %   In case of scripts and class files, FUNCTION_NESTING is
 968 | %   effectively ignored. In case of function files, FUNCTION_NESTING
 969 | %   is used to determine whether the current function is a top-level
 970 | %   function (whose body should not be indented) or a nested function
 971 | %   (whose body should be indented).
 972 | %
 973 | %   All indentations are given and returned as integer levels of
 974 | %   indentation. Depending on your editor setup, one level might correspond
 975 | %   to 2, 3, 4, or 8 spaces.
 976 | %
 977 | %   The correct indentation for the current line is (by default):
 978 | %       (nesting + correction)*4 spaces
 979 | 
 980 |     beginnings = check_settings('beginnings');
 981 |     middles = check_settings('middles');
 982 |     
 983 |     % deactivate function file rules in class files:
 984 |     if first_token.isEqual('keyword', 'classdef')
 985 |         function_nesting = nan;
 986 |     end
 987 | 
 988 |     if ~check_settings('indentation_check_like_matlab')
 989 |         function_nesting = nan;
 990 |     end
 991 | 
 992 |     % beginning of a function:
 993 |     if first_token.isEqual('keyword', 'function')
 994 |         function_nesting = function_nesting + 1;
 995 |         nesting = nesting + 1;
 996 |         correction = -1;
 997 |     % any other beginning:
 998 |     elseif first_token.isEqual('keyword', beginnings)
 999 |         nesting = nesting + 1;
1000 |         correction = -1;
1001 |     % end of a function in:
1002 |     elseif first_token.isEqual('keyword', 'end') && ...
1003 |            nesting == function_nesting
1004 |         function_nesting = function_nesting - 1;
1005 |         nesting = nesting - 1;
1006 |         if function_nesting == 1
1007 |             correction = +1;
1008 |         else
1009 |             correction = 0;
1010 |         end
1011 |     % any other end:
1012 |     elseif first_token.isEqual('keyword', 'end')
1013 |         nesting = nesting - 1;
1014 |         correction = 0;
1015 |     % any middle (else, elseif, case):
1016 |     elseif first_token.isEqual('keyword', middles)
1017 |         correction = -1;
1018 |     % a normal line:
1019 |     else
1020 |         correction = 0;
1021 |     end
1022 | 
1023 |     % if this is in a top-level function:
1024 |     if function_nesting == 1
1025 |         correction = correction - 1;
1026 |     end
1027 | end
1028 | 
1029 | 
function indentation = get_line_indentation(line_tokens)
%GET_LINE_INDENTATION measures the leading whitespace of LINE_TOKENS.
%   INDENTATION is the length of the text of the first token if that
%   token is of type space, and 0 otherwise (including for empty lines).

    indentation = 0;
    if isempty(line_tokens)
        return
    end

    leading_token = line_tokens(1);
    if leading_token.hasType('space')
        indentation = length(leading_token.text);
    end
end
1040 | 
1041 | 
function token = get_first_nonspace(tokenlist)
%GET_FIRST_NONSPACE returns the first TOKEN in TOKENLIST that is not a
%   token of type space.
%   This can be useful to return the first "real" token on a line.
%
%   The last token is never inspected: if all tokens before it are
%   spaces, the last token is returned whatever its type is.

    num_tokens = length(tokenlist);

    % fall back to the last token if no earlier token qualifies
    token_idx = max(num_tokens, 1);
    for candidate = 1:num_tokens-1
        if ~tokenlist(candidate).hasType('space')
            token_idx = candidate;
            break
        end
    end
    token = tokenlist(token_idx);
end
1054 | 
1055 | 
function linelist = split_lines(tokens)
%SPLIT_LINES cuts TOKENS into lines at every `linebreak` token.
%   A linebreak token has the text `\n` or `\r\n`. The linebreak tokens
%   themselves are not part of any line.
%
%   returns a cell array LINELIST of Token-arrays with one entry per
%   line, i.e. one more entry than there are linebreak tokens.

    newline_texts = {sprintf('\n'), sprintf('\r\n')};
    num_tokens = length(tokens);

    is_break = false(1, num_tokens);
    for pos = 1:num_tokens
        is_break(pos) = tokens(pos).isEqual('linebreak', newline_texts);
    end

    % every line ends just before a linebreak, the last one just before
    % the (virtual) position after the final token
    line_ends = [find(is_break) num_tokens+1];

    linelist = cell(1, length(line_ends));
    line_start = 1;
    for line_idx = 1:length(line_ends)
        stop = line_ends(line_idx);
        linelist{line_idx} = tokens(line_start:stop-1);
        line_start = stop + 1;
    end
end
1071 | 
1072 | 
function link = open_file_link(filename, linenum)
%OPEN_FILE_LINK returns a link target for HTML links
%   the LINK is supposed to be used in <a href="LINK">...</a> links inside
%   MATLAB. It will generate a link that opens FILENAME at LINENUM in the
%   MATLAB editor.
%
%   Single quotes in FILENAME are doubled, so that FILENAME remains one
%   quoted argument in the generated command.

    prefix = 'matlab.desktop.editor.openAndGoToLine';
    % a lone quote would end the quoted file name early and break the
    % command that is run when the link is clicked
    quoted_name = strrep(filename, '''', '''''');
    link = sprintf('matlab:%s(''%s'', %i);', prefix, quoted_name, linenum);
end
1082 | 


--------------------------------------------------------------------------------
/check_settings.m:
--------------------------------------------------------------------------------
 1 | function value = check_settings(name)
 2 | %CHECK_SETTINGS returns settings vor CHECK.
 3 | %   CHECK_SETTINGS(NAME) returns the VALUE of the settings called NAME.
 4 | %
 5 | %   Create a local copy of this file and overwrite values if you want
 6 | %   custom behavior in a specific project.
 7 | 
 8 |     % thresholds for the number of lines in classes:
 9 |     settings.lo_class_num_lines = 200;
10 |     settings.hi_class_num_lines = 400;
11 |     % thresholds for the number of properties in classes:
12 |     settings.lo_class_num_properties = 10;
13 |     settings.hi_class_num_properties = 15;
14 |     % thresholds for the number of methods in classes:
15 |     settings.lo_class_num_methods = 10;
16 |     settings.hi_class_num_methods = 20;
17 | 
18 |     % thresholds for the number of lines in scripts:
19 |     settings.lo_script_num_lines = 100;
20 |     settings.hi_script_num_lines = 200;
21 |     % thresholds for the number of variables in scripts:
22 |     settings.lo_script_num_variables = 10;
23 |     settings.hi_script_num_variables = 20;
24 |     % thresholds for the level of indentation in scripts:
25 |     settings.lo_script_max_indentation = 4;
26 |     settings.hi_script_max_indentation = 8;
27 | 
28 |     % thresholds for the number of lines in functions:
29 |     settings.lo_function_num_lines = 50;
30 |     settings.hi_function_num_lines = 100;
31 |     % thresholds for the number of arguments in functions:
32 |     settings.lo_function_num_arguments = 3;
33 |     settings.hi_function_num_arguments = 5;
34 |     % thresholds for the number of variables in functions:
35 |     settings.lo_function_num_variables = 7;
36 |     settings.hi_function_num_variables = 15;
37 |     % thresholds for the level of indentation in functions:
38 |     settings.lo_function_max_indentation = 3;
39 |     settings.hi_function_max_indentation = 6;
40 |     % thresholds for the complexity of functions:
41 |     settings.lo_function_complexity = 10;
42 |     settings.hi_function_complexity = 15;
43 | 
44 |     % thresholds for the line length of files:
45 |     settings.lo_line_length = 75;
46 |     settings.hi_line_length = 90;
47 | 
48 |     % threshold for the variable length and spread (spread is the
49 |     % number of lines in which a variable is used).
50 |     % Read this as "if a variable name is less than 3 characters
51 |     % long, it should be use in no more than 3 lines":
52 |     settings.lo_varname_short_length = 3;
53 |     settings.lo_varname_short_spread = 3;
54 |     settings.lo_varname_long_length = 5;
55 |     settings.lo_varname_long_spread = 10;
56 |     settings.hi_varname_short_length = 3;
57 |     settings.hi_varname_short_spread = 5;
58 |     settings.hi_varname_long_length = 5;
59 |     settings.hi_varname_long_spread = 15;
60 | 
61 |     % switches to switch whole modules on or off:
62 |     settings.do_check_comments = true;
63 |     settings.do_check_documentation = true;
64 |     settings.do_check_eval = true;
65 |     settings.do_check_operators = true;
66 |     settings.do_check_variables = true;
67 |     settings.do_check_mlint_warnings = true;
68 |     settings.do_check_line_length = true;
69 |     settings.do_check_indentation = true;
70 | 
71 |     % indent by this many spaces per level of indentation:
72 |     settings.indentation_step = 4;
73 |     % Matlab does not indent top-level function bodies. Most other
74 |     % languages would think this behavior funny:
75 |     settings.indentation_check_like_matlab = true;
76 | 
77 |     % keywords for tokenize_code
78 |     settings.keywords = {'for' 'try' 'while' 'if' 'else' 'elseif' 'switch' ...
79 |                 'case' 'otherwise' 'function' 'classdef' 'methods' ...
80 |                 'properties' 'events' 'enumeration' 'parfor' ...
81 |                 'return' 'break' 'continue' 'catch', 'arguments'};
82 | 
83 |     % keyword beginnings which are considered for indentation calculation
84 |     settings.beginnings = {'for' 'parfor' 'while' 'if' 'switch' 'classdef' ...
85 |         'events' 'properties' 'enumeration' 'methods' ...
86 |         'function' 'try', 'arguments'};
87 |     % keyword middles which are considered for indentation calculation
88 |     settings.middles = {'else' 'elseif' 'case' 'otherwise' 'catch'};
89 | 
90 |     value = settings.(name);
91 | end
92 | 


--------------------------------------------------------------------------------
/run_unittests.m:
--------------------------------------------------------------------------------
 1 | function run_unittests()
 2 |     %RUN_UNITTESTS Runs all unit tests
 3 |     
 4 |     import matlab.unittest.TestSuite
 5 |     import matlab.unittest.TestRunner
 6 |     
 7 |     try
 8 |         % Create a test suite
 9 |         suite = ...
10 |             TestSuite.fromPackage('UnitTest', ...
11 |             'IncludingSubpackages', true);
12 | 
13 |         % Run all tests
14 |         runner = TestRunner.withTextOutput;
15 |         result = runner.run(suite);
16 | 
17 |         % Display results
18 |         disp(table(result));
19 |         disp(result);
20 | 
21 |         % Throw an error if any test failed
22 |         if sum([result(:).Failed]) + sum([result(:).Incomplete]) > 0
23 |             error('There are failing unittests!')
24 |         end
25 |     catch err
26 |         disp(err.getReport)
27 |     end
28 | end
29 | 


--------------------------------------------------------------------------------
/testFiles/MatlabArgumentClass.m:
--------------------------------------------------------------------------------
 1 | classdef MatlabArgumentClass < matlab.mixin.Heterogeneous
 2 |     %MATLABARGUMENTCLASS This is an example class for testing the
 3 |     %   argument validation
 4 |     %
 5 |     %   Fixture for FileCheckTests which runs CHECK on this file
 6 |     %   Some more comments to make the checker happy
 7 |     %   returns a new OBJ.
 8 |     
 9 |     %
10 |     %
11 |     properties (Access = private)
12 |         property1 (1,1) string = "Hello World"
13 | 
14 |         property2 (1,:) char = 'Hello World'
15 | 
16 |         property3 {mustBeTextScalar}
17 |     end
18 |     
19 |     methods (Access = protected)
20 |         
21 |         function obj = foo_function(input1, input2, options)
22 |             %FOO_FUNCTION This is an example function for testing the
23 |             %   argument validation check
24 |             %   obj = foo_function: input1, input2, options
25 |             %   Some more comments to make the checker happy
26 | 
27 |             arguments
28 |                 input1 (1,1) string
29 |                 input2 {mustBeText}
30 |                 options.?matlab.mixin.Heterogeneous
31 |             end
32 |             
33 |             try
34 |                 input1 = 42;
35 |             catch
36 |                 input2 = 42;
37 |             end
38 |             % Some more comments to make the checker happy
39 |             if input1
40 |                 obj = 1;
41 |             elseif input2
42 |                 obj = 2;
43 |             else
44 |                 obj = 0;
45 |             end
46 | 
47 |             obj.property3 = options;
48 |             
49 |         end
50 |         
51 |         function foobar = second_function(barfoo)
52 |             %SECOND_FUNCTION This is an example function for testing the
53 |             %   indentation check
54 |             %   foobar, barfoo
55 |             foobar = barfoo;
56 |         end
57 |         
58 |         function varargout = variable_length_of_in_and_output(varargin)
59 |             %VARIABLE_LENGTH_OF_IN_AND_OUTPUT is provided with input param
60 |             %    VARARGIN and output parameter VARARGOUT
61 |             varargout = varargin;
62 |         end
63 |         
64 |         function output = test_linebreak_with_continuation_operator(inputarg)
65 |             %TEST_LINEBREAK_WITH_CONTINUATION_OPERATOR is a test to verify
66 |             %    line continuation operator
67 |             %    INPUTARG, OUTPUT
68 |             
69 |             assignment_at_first_line = ...
70 |                 inputarg;
71 |             
72 |             assignment_at_second_line = ... some comment
73 |                 assignment_at_first_line;
74 |             
75 |             output = .... 4 dots give also comment
76 |                 assignment_at_second_line;
77 |         end
78 |         
79 |         function test_switch_case(inputarg)
80 |             %TEST_SWITCH_CASE test indentation of switch case
81 |             %    INPUTARG
82 |             %    Some more comments to make the checker happy
83 | 
84 |             switch inputarg
85 |                 case 1
86 |                     return
87 |                 case 2
88 |                     return
89 |                 otherwise
90 |                     return
91 |             end
92 |         end
93 |     end
94 | end
95 | 


--------------------------------------------------------------------------------
/testFiles/MatlabIndentedClass.m:
--------------------------------------------------------------------------------
 1 | classdef MatlabIndentedClass
 2 |     %MATLABINDENTEDCLASS This is an example class for testing the
 3 |     %   indentation check
 4 |     %
 5 |     %   Fixture for FileCheckTests and test_MatlabIndentedClass
 6 |     %   Some more comments to make the checker happy
 7 |     %   returns a new OBJ.
 8 |     
 9 |     %
10 |     %
11 |     properties(Access = private)
12 |         foobar
13 |     end
14 |     
15 |     methods(Access = protected)
16 |         
17 |         function output1 = foo_function(input1, input2)
18 |             %FOO_FUNCTION This is an example function for testing the
19 |             %   indentation check
20 |             %   output1 = foo_function: input1, input2
21 |             %   Some more comments to make the checker happy
22 |             
23 |             try
24 |                 input1 = 42;
25 |             catch
26 |                 input2 = 42;
27 |             end
28 |             % Some more comments to make the checker happy
29 |             if input1
30 |                 output1 = 1;
31 |             elseif input2
32 |                 output1 = 2;
33 |             else
34 |                 output1 = 0;
35 |             end
36 |             
37 |         end
38 |         
39 |         function foobar = second_function(barfoo)
40 |             %SECOND_FUNCTION This is an example function for testing the
41 |             %   indentation check
42 |             %   foobar, barfoo
43 |             foobar = barfoo;
44 |         end
45 |         
46 |         function varargout = variable_length_of_in_and_output(varargin)
47 |             %VARIABLE_LENGTH_OF_IN_AND_OUTPUT is provided with input param
48 |             %    VARARGIN and output parameter VARARGOUT
49 |             varargout = varargin;
50 |         end
51 |         
52 |         function output = test_linebreak_with_continuation_operator(inputarg)
53 |             %TEST_LINEBREAK_WITH_CONTINUATION_OPERATOR is a test to verify
54 |             %    line continuation operator
55 |             %    INPUTARG, OUTPUT
56 |             
57 |             assignment_at_first_line = ...
58 |                 inputarg;
59 |             
60 |             assignment_at_second_line = ... some comment
61 |                 assignment_at_first_line;
62 |             
63 |             output = .... 4 dots give also comment
64 |                 assignment_at_second_line;
65 |         end
66 |         
67 |         function test_switch_case(inputarg)
68 |             %TEST_SWITCH_CASE test indentation of switch case
69 |             %    INPUTARG
70 |             %    Some more comments to make the checker happy
71 | 
72 |             switch inputarg
73 |                 case 1
74 |                     return
75 |                 case 2
76 |                     return
77 |                 otherwise
78 |                     return
79 |             end
80 |         end
81 |     end
82 | end
83 | 


--------------------------------------------------------------------------------
/test_MatlabIndentedClass.m:
--------------------------------------------------------------------------------
1 | function test_MatlabIndentedClass()
2 | %TEST_MATLABINDENTEDCLASS runs CHECK on testFiles/MatlabIndentedClass.m
3 | %   Requires the 'indentation_check_like_matlab' setting to be enabled.
4 | assert(check_settings('indentation_check_like_matlab') == true)
5 | old_path = addpath('testFiles');  % addpath returns the previous path
6 | restore_path = onCleanup(@() path(old_path)); %#ok<NASGU> undo on exit
7 | check('testFiles/MatlabIndentedClass.m');
8 | end


--------------------------------------------------------------------------------
/test_check.m:
--------------------------------------------------------------------------------
  1 | %% Tokenizing a text should not change the content
  2 | text = fileread('check.m');
  3 | tokens = tokenize_code(text);
  4 | reconstructed_text = horzcat(tokens.text);  % join all token texts
  5 | assert(strcmp(reconstructed_text, text))
  6 | 
  7 | 
  8 | %% Function names should be extracted
  9 | report = analyze_file('', tokenize_code('function foo(); end'));
 10 | assert(strcmp(report.name.text, 'foo'))
 11 | 
 12 | report = analyze_file('', tokenize_code('function x = foo(); end'));
 13 | assert(strcmp(report.name.text, 'foo'))
 14 | 
 15 | report = analyze_file('', tokenize_code('function [x, y] = foo(); end'));
 16 | assert(strcmp(report.name.text, 'foo'))
 17 | 
 18 | 
 19 | %% Function return names should be extracted
 20 | report = analyze_file('', tokenize_code('function foo(); end'));
 21 | assert(isempty(report.returns))
 22 | 
 23 | report = analyze_file('', tokenize_code('function x = foo(); end'));
 24 | assert(strcmp(report.returns(1).text, 'x'))
 25 | assert(length(report.returns) == 1)
 26 | 
 27 | report = analyze_file('', tokenize_code('function [x, y] = foo(); end'));
 28 | assert(strcmp(report.returns(1).text, 'x'))
 29 | assert(strcmp(report.returns(2).text, 'y'))
 30 | assert(length(report.returns) == 2)
 31 | 
 32 | 
 33 | %% Function arguments should be extracted
 34 | report = analyze_file('', tokenize_code('function foo(); end'));
 35 | assert(isempty(report.arguments))
 36 | 
 37 | report = analyze_file('', tokenize_code('function foo(x); end'));
 38 | assert(strcmp(report.arguments(1).text, 'x'))
 39 | assert(length(report.arguments) == 1)
 40 | 
 41 | report = analyze_file('', tokenize_code('function foo(x, y); end'));
 42 | assert(strcmp(report.arguments(1).text, 'x'))
 43 | assert(strcmp(report.arguments(2).text, 'y'))
 44 | assert(length(report.arguments) == 2)
 45 | 
 46 | 
 47 | %% Operators should be parsed correctly
 48 | tokens = tokenize_code('a>=-b');  % binary >= followed by unary -
 49 | assert(tokens(2).hasText('>='))
 50 | assert(tokens(3).hasText('-'))
 51 | 
 52 | 
 53 | %% Transpose Operators should not be strings
 54 | tokens = tokenize_code('a''');  % source text: a'
 55 | assert(tokens(2).isEqual('punctuation', ''''))
 56 | 
 57 | tokens = tokenize_code('a.''');  % source text: a.'
 58 | assert(tokens(2).isEqual('punctuation', '.'''))
 59 | 
 60 | tokens = tokenize_code('a''+''a''.''');  % source text: a'+'a'.'
 61 | assert(tokens(2).isEqual('punctuation', ''''))
 62 | assert(tokens(4).isEqual('string', '''a'''))
 63 | assert(tokens(5).isEqual('punctuation', '.'''))
 64 | 
 65 | 
 66 | %% differentiate commands from expressions
 67 | tokens = tokenize_code('help me please % test');  % command syntax
 68 | assert(tokens(1).isEqual('identifier', 'help'))
 69 | assert(tokens(3).isEqual('string', 'me'))  % even indices are spaces
 70 | assert(tokens(5).isEqual('string', 'please'))
 71 | assert(tokens(7).isEqual('comment', '% test'))
 72 | 
 73 | 
 74 | %% differentiate keyword end from variable end
 75 | tokens = tokenize_code('if a(end); end');  % first end is an index
 76 | assert(tokens(5).isEqual('identifier', 'end'))
 77 | assert(tokens(9).isEqual('keyword', 'end'))
 78 | 
 79 | 
 80 | %% differentiate semicolons from linebreaks
 81 | tokens = tokenize_code('[1;2];3');  % first ; is inside the brackets
 82 | assert(tokens(3).isEqual('punctuation', ';'))
 83 | assert(tokens(6).isEqual('linebreak', ';'))
 84 | 
 85 | 
 86 | %% Identify block comments
 87 | comment = sprintf('%%{ \n foo bar \n %%}');  % %% yields a literal %
 88 | tokens = tokenize_code(comment);
 89 | assert(length(tokens) == 1)
 90 | assert(tokens.isEqual('comment', comment))
 91 | 
 92 | tokens = tokenize_code(sprintf('x\n%s\nx', comment));
 93 | assert(length(tokens) == 5)  % x, linebreak, comment, linebreak, x
 94 | assert(tokens(3).isEqual('comment', comment))
 95 | 
 96 | 
 97 | %% line breaks should break lines
 98 | tokens = tokenize_code(',foo bar');  % after the `,` a new command starts
 99 | assert(tokens(1).hasType('linebreak'))
100 | assert(tokens(4).hasType('string'))  % bar is a command argument
101 | 
102 | tokens = tokenize_code(';foo bar');
103 | assert(tokens(1).hasType('linebreak'))
104 | assert(tokens(4).hasType('string'))
105 | 
106 | 
107 | %% line breaks should not break lines within brackets
108 | tokens = tokenize_code('[a;b];');
109 | assert(tokens(3).hasType('punctuation'))
110 | assert(tokens(6).hasType('linebreak'))
111 | 
112 | tokens = tokenize_code('[a,b],');
113 | assert(tokens(3).hasType('punctuation'))
114 | assert(tokens(6).hasType('linebreak'))
115 | 
116 | %% comments follow continuation operator
117 | tokens = tokenize_code('... % this is a comment');
118 | assert(tokens(1).hasType('punctuation'));
119 | assert(tokens(3).hasType('comment'));
120 | 
121 | tokens = tokenize_code('... this is a comment');
122 | assert(tokens(1).hasType('punctuation'));
123 | assert(tokens(2).hasType('space'));
124 | assert(tokens(3).hasType('comment'));
125 | 
126 | tokens = tokenize_code('....');  % the fourth dot is already comment
127 | assert(tokens(1).hasType('punctuation'));
128 | assert(tokens(2).hasType('comment'));
129 | 
130 | tokens = tokenize_code('.*...');  % `.*` followed by `...`
131 | assert(tokens(1).hasType('punctuation'));
132 | assert(tokens(2).hasType('punctuation'));


--------------------------------------------------------------------------------
/tokenize_code.m:
--------------------------------------------------------------------------------
  1 | function tokenlist = tokenize_code(source_code)
  2 | %TOKENIZE_CODE splits M-code into Tokens
  3 | %   TOKENIZE_CODE(SOURCE_CODE) splits the SOURCE_CODE into interpretable
  4 | %   parts. It returns an object array of Tokens TOKENLIST, where each
  5 | %   token has a 'type', a 'text', a 'line', and a 'col'. Concatenating
  6 | %   all 'text's recreates the original SOURCE_CODE.
  7 | %   'type' can be one of:
  8 | %   - 'keyword'
  9 | %   - 'identifier'
 10 | %   - 'space'
 11 | %   - 'punctuation'
 12 | %   - 'property'
 13 | %   - 'string'
 14 | %   - 'number'
 15 | %   - 'pair'
 16 | %   - 'linebreak'
 17 | %   - 'comment'
 18 | %   - 'escape'
 19 | %
 20 | % See also: Token
 21 | 
 22 | % (c) 2016, Bastian Bechtold
 23 | % This code is licensed under the terms of the BSD 3-clause license
 24 | 
 25 |     punctuation = '=.&|><~+-*^/\:@?';
 26 |     open_pairs = '{[(';
 27 |     close_pairs = '}])';
 28 |     escapes = '!%';
 29 | 
 30 |     keywords = check_settings('keywords');
 31 | 
 32 |     operators = { '+'  '-'  '*'  '/'  '^'  '\' ...
 33 |                  '.+' '.-' '.*' './' '.^' '.\' ...
 34 |                  '>' '<' '~' '==' '>=' '<=' '~=' ...
 35 |                  '@' '=' ',' ';' '||' '&&' '|' '&' '...' ':' '.?'};
 36 | 
 37 |     spaces = sprintf(' \t');
 38 |     breaks = sprintf('\n\r');
 39 |     number_start = '0123456789';
 40 |     number_body = [number_start 'eEij.'];
 41 |     name_start = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ';
 42 |     name_body = [name_start '0123456789_'];
 43 | 
 44 |     tokenlist = Token.empty;
 45 |     pos = 1; % the current character position in the source_code
 46 |     line_num = 1; % the current line number
 47 |     line_start = pos; % where the current line started
 48 |     is_first_symbol = true; % the first symbol can have special meaning
 49 |     source_code = [source_code sprintf('\n')]; % ensure proper file end
 50 |     nesting = 0; % count braces, since some operators have different
 51 |                  % meaning inside and outside braces
 52 |     while pos < length(source_code)
 53 |         letter = source_code(pos);
 54 |         % a variable or a function or a keyword:
 55 |         if any(letter == name_start)
 56 |             symbol = skip(name_body);
 57 |             % keywords such as `if` or `classdef`
 58 |             if any(strcmp(symbol, keywords))
 59 |                 is_first_symbol = false;
 60 |                 add_token('keyword', symbol);
 61 |             % the keyword `end`:
 62 |             elseif strcmp(symbol, 'end') && nesting == 0
 63 |                 add_token('keyword', symbol);
 64 |             % anything else is just a variable or function name:
 65 |             else
 66 |                 add_token('identifier', symbol);
 67 |                 % if this is the start of a command, the rest of the line
 68 |                 % needs to be interpreted as strings.
 69 |                 % Note: this is not the case if the identifier is inside a
 70 |                 % 'properties' or 'arguments' block. In that case, the rest of
 71 |                 % the line needs to be interpreted as validation routine.
 72 |                 last_keyword_idx = find(strcmp({tokenlist.type}, 'keyword'), 1, 'last');
 73 |                 is_argument_validation_command = ~isempty(last_keyword_idx) && ...
 74 |                     any(strcmp(tokenlist(last_keyword_idx).text, {'properties' 'arguments'}), 2);
 75 |                 if is_first_symbol && nesting == 0 && ~is_argument_validation_command
 76 |                     is_first_symbol = false;
 77 |                     saved_pos = pos;
 78 |                     first_space = skip(spaces);
 79 |                     first_word = skip_unless([spaces breaks ';,%']);
 80 |                     pos = saved_pos;
 81 |                     % commands are any single identifier that is not
 82 |                     % followed by space-operator-space:
 83 |                     if ~any(strcmp(first_word, operators)) && ...
 84 |                        ~isempty(first_space)
 85 |                         parse_command()
 86 |                     end
 87 |                 end
 88 |             end
 89 |         % a sequence of one or more spaces or tabs:
 90 |         elseif any(letter == spaces)
 91 |             symbol = skip(spaces);
 92 |             add_token('space', symbol);
 93 |         % any binary or unary operator, such as `+`, `>=`, or `.foo`
 94 |         elseif any(letter == punctuation)
 95 |             is_first_symbol = false;
 96 |             % property access begins with a `.` operator, and includes a
 97 |             % name, such as `.foo`. Classifying this as punctuation makes
 98 |             % it easier to differentiate it from variable/function names.
 99 |             if letter == '.' && pos < length(source_code) && ...
100 |                any(source_code(pos+1) == name_start)
101 |                 pos = pos + 1;
102 |                 symbol = [letter skip(name_body)];
103 |                 add_token('property', symbol);
104 |             % numbers may start with the decimal point, such as `.5`. The
105 |             % loop condition guarantees that POS+1 is a valid index:
106 |             elseif letter == '.' && any(source_code(pos+1) == number_start)
107 |                 symbol = skip(number_body);
108 |                 add_token('number', symbol);
109 |             % any other operator:
110 |             else
111 |                 symbol = skip(punctuation);
112 |                 % Multiple operators can be present in 'symbol', e.g. '&&...' or
113 |                 % '|...'. Find largest operator at start of symbol.
114 |                 largest_start_operator = find_pattern(operators);
115 |                 if ~isempty(largest_start_operator)
116 |                     % Keep the remainder of symbol for the next iteration.
117 |                     % Rewind POS first: add_token derives the column from it.
118 |                     pos = pos - length(symbol) + length(largest_start_operator);
119 |                     add_token('punctuation', largest_start_operator);
120 |                     % All text on the same line after '...' must be interpreted
121 |                     % as a comment.
122 |                     if strcmp(largest_start_operator, '...')
123 |                         symbol = skip(spaces);
124 |                         if ~isempty(symbol)
125 |                             add_token('space', symbol)
126 |                         end
127 |                         symbol = skip_unless(breaks);
128 |                         if ~isempty(symbol)
129 |                             add_token('comment', symbol);
130 |                         end
131 |                     end
132 |                 % element-wise transpose operator:
133 |                 % This has to be parsed here, so as to not confuse the `'`
134 |                 % with the beginning of a string.
135 |                 elseif strcmp(symbol, '.') && source_code(pos) == ''''
136 |                     pos = pos + 1;
137 |                     add_token('punctuation', '.''');
138 |                 % struct access operator such as `.(foo)`:
139 |                 % There is normally no `.` operator, but it makes sense to
140 |                 % classify `.(` as such here.
141 |                 elseif strcmp(symbol, '.') && source_code(pos) == '('
142 |                     add_token('punctuation', '.');
143 |                 % this should never happen:
144 |                 else
145 |                     error(['unknown operator ''' symbol '''']);
146 |                 end
147 |             end
148 |         % strings and transpose begin with `'`. The `.'` operator has
149 |         % already been handled above:
150 |         elseif letter == ''''
151 |             % the first symbol cannot be transpose, so must be string
152 |             if is_first_symbol
153 |                 string = skip_string('''');
154 |                 add_token('string', string);
155 |             else
156 |                 previous = tokenlist(end);
157 |                 % transpose operator:
158 |                 % To differentiate the start of a string from the
159 |                 % transpose operator, we need to check whether the
160 |                 % previous token was a value or an operator. If a value,
161 |                 % `'` means transpose. If an operator, `'` marks the start
162 |                 % of a string.
163 |                 if previous.isEqual('pair', {'}' ']' ')'}) || ...
164 |                    previous.hasType({'identifier' 'number' 'property'})
165 |                     pos = pos + 1;
166 |                     add_token('punctuation', letter);
167 |                 % strings:
168 |                 else
169 |                     string = skip_string('''');
170 |                     add_token('string', string);
171 |                 end
172 |             end
173 |             is_first_symbol = false;
174 |         % string that starts with double quotes (")
175 |         elseif letter == '"'
176 |             is_first_symbol = false;
177 |             string = skip_string('"');
178 |             add_token('string', string);
179 |         % we don't make any distinction between different kinds of parens:
180 |         elseif any(letter == open_pairs)
181 |             is_first_symbol = false;
182 |             pos = pos + 1;
183 |             nesting = nesting + 1;
184 |             add_token('pair', letter);
185 |         elseif any(letter == close_pairs)
186 |             pos = pos + 1;
187 |             nesting = nesting - 1;
188 |             add_token('pair', letter);
189 |         % new lines are line breaks and increment the line:
190 |         elseif any(letter == breaks)
191 |             % split into individual line breaks (`\r\n`, `\n`, or `\r`)
192 |             start = pos;
193 |             line_breaks = regexp(skip(breaks), '(\r\n)|(\n)|(\r)', 'match');
194 |             pos = start;
195 |             for line_break = line_breaks
196 |                 pos = pos + length(line_break{1});
197 |                 add_token('linebreak', line_break{1});
198 |                 % add the token before incrementing the line to avoid
199 |                 % confusing add_token
200 |                 line_num = line_num + 1;
201 |                 line_start = pos;
202 |             end
203 |             is_first_symbol = true;
204 |         % `,` and `;` are line breaks that do not increment the line,
205 |         % or simple operators if they occur within a pair
206 |         elseif any(letter == ';,')
207 |             pos = pos + 1;
208 |             if nesting == 0
209 |                 add_token('linebreak', letter);
210 |                 is_first_symbol = true;
211 |             else
212 |                 add_token('punctuation', letter);
213 |             end
214 |         % numbers are easy, and may contain `.`, `e`, `E`, `i`, and `j`
215 |         elseif any(letter == number_start)
216 |             is_first_symbol = false;
217 |             symbol = skip(number_body);
218 |             add_token('number', symbol);
219 |         % finally, comments and `!` include the rest of the line, unless
220 |         % they are block comments, which might include much more.
221 |         elseif any(letter == escapes)
222 |             comment = skip_line();
223 |             if letter == '%'
224 |                 if ~isempty(regexp(comment, '^\%\{\s*$', 'once')) && ...
225 |                    is_first_symbol
226 |                     comment = [comment skip_block_comment()]; %#ok
227 |                 end
228 |                 add_token('comment', comment);
229 |                 % count the line breaks that a block comment contains:
230 |                 line_ends = regexp(comment, '(\r\n)|(\n)|(\r)', 'end');
231 |                 line_num = line_num + length(line_ends);
232 |                 line_start = max([line_start, pos-length(comment)+line_ends]);
233 |             else
234 |                 add_token('escape', comment);
235 |             end
236 |         else
237 |             error('unknown identifier');
238 |         end
239 |     end
240 | 
241 |     function add_token(token_type, token_text)
242 |     %ADD_TOKEN appends a Token of TOKEN_TYPE with TOKEN_TEXT
243 |     %   The token is annotated with the current line number and with
244 |     %   its column, which is derived from POS and the text length.
245 |     %   this modifies TOKENLIST!
246 | 
247 |         column = pos - line_start - length(token_text) + 1;
248 |         new_token = Token(token_type, token_text, line_num, column);
249 |         tokenlist(end+1) = new_token;
250 |     end
251 | 
252 |     function string = skip(letters)
253 |     %SKIP consumes a run of LETTERS and returns it as STRING
254 |     %   this modifies POS!
255 |         first_pos = pos;
256 |         last_pos = length(source_code);
257 |         while pos < last_pos && any(letters == source_code(pos))
258 |             pos = pos + 1;
259 |         end
260 |         string = source_code(first_pos:pos-1);
261 |     end
262 | 
263 |     function string = skip_unless(letters)
264 |     %SKIP_UNLESS consumes characters until one of LETTERS is reached
265 |     %   The consumed characters are returned as STRING; the character
266 |     %   from LETTERS that ended the run is not consumed.
267 |     %   this modifies POS!
268 |         first_pos = pos;
269 |         while ~any(letters == source_code(pos))
270 |             pos = pos + 1;
271 |         end
272 |         string = source_code(first_pos:pos-1);
273 |     end
274 | 
275 |     function string = skip_line()
276 |     %SKIP_LINE consumes the rest of the current line as STRING
277 |     %   The terminating `\r` or `\n` is not consumed.
278 |     %   this modifies POS!
279 | 
280 |         % a line ends at the first carriage return or line feed, so
281 |         % this is the same scan that SKIP_UNLESS performs:
282 |         line_end = sprintf('\r\n');
283 |         string = skip_unless(line_end);
284 |     end
285 | 
286 |     function string = skip_string(quote_type)
287 |     %SKIP_STRING skips to the end of the string and returns the STRING
288 |     %   the STRING includes both quotation marks. QUOTE_TYPE is the
289 |     %   type of quote character to look for (' or "). A doubled quote
290 |     %   does not end the string. An unterminated string ends at the
291 |     %   end of its line. This modifies POS!
292 |         string_start = pos;
293 |         pos = pos + 1; % the opening quote
294 |         while ~any(source_code(pos) == breaks) % source ends with `\n`
295 |             if source_code(pos) ~= quote_type
296 |                 pos = pos + 1;
297 |             elseif source_code(pos+1) == quote_type
298 |                 pos = pos + 2; % escaped quote inside the string
299 |             else
300 |                 pos = pos + 1; % the closing quote
301 |                 break
302 |             end
303 |         end
304 |         string = source_code(string_start:pos-1);
305 |     end
306 | 
307 |     function string = skip_block_comment()
308 |     %SKIP_BLOCK_COMMENT skips to the end of the block comment and returns
309 |     %   the whole multi-line block comment as STRING. A block that is
310 |     %   never closed ends before the line break appended to SOURCE_CODE.
311 |     %   this modifies POS!
312 |         block_start = pos;
313 |         block_end = '^%\}[ \t]*[\r\n]'; % `%}`, then nothing but spaces
314 |         is_first_statement = false;
315 |         while pos < length(source_code)
316 |             if any(source_code(pos) == sprintf('\n\r'))
317 |                 is_first_statement = true;
318 |             % don't change `is_first_statement` while skipping spaces:
319 |             elseif any(source_code(pos) == sprintf('\t '))
320 |                 % nothing changes
321 |             % block comment ends must be alone on the line:
322 |             elseif source_code(pos) == '%' && is_first_statement && ...
323 |                    ~isempty(regexp(source_code(pos:end), block_end, 'once'))
324 |                 pos = pos + 2;
325 |                 break
326 |             % any other character is just part of the comment:
327 |             else
328 |                 is_first_statement = false;
329 |             end
330 |             pos = pos + 1;
331 |         end
332 |         string = source_code(block_start:pos-1);
333 |     end
334 | 
335 |     function parse_command()
336 |     %PARSE_COMMAND tokenizes the arguments of a command-syntax call
337 |     %   Starting at POS, each argument is appended to the token list as
338 |     %   a 'string' token and the whitespace in between as 'space' tokens.
339 |     %   this modifies POS and TOKENLIST!
340 | 
341 |         terminators = [breaks '%,;'];
342 |         while pos < length(source_code)
343 |             letter = source_code(pos);
344 |             % commands end at a line break, `%`, `,`, or `;`:
345 |             if any(letter == terminators)
346 |                 break
347 |             end
348 |             % quoted arguments are literal strings of that quote type:
349 |             if letter == '''' || letter == '"'
350 |                 string_literal = skip_string(letter);
351 |                 add_token('string', string_literal);
352 |             % runs of spaces or tabs separate the arguments:
353 |             elseif any(letter == spaces)
354 |                 symbol = skip(spaces);
355 |                 add_token('space', symbol);
356 |             % any other non-space sequence is interpreted as a string:
357 |             else
358 |                 str = skip_unless([spaces terminators]);
359 |                 add_token('string', str);
360 |             end
361 |         end
362 |     end
363 | 
364 |     function pat_out = find_pattern(pat)
365 |     %FIND_PATTERN Find pattern with most characters in symbol.
366 |     % pat_out = FIND_PATTERN(pat) returns the pattern with which
367 |     % SYMBOL starts and that has the most characters. The input
368 |     % pat is a cell array of character vectors that represent the
369 |     % patterns that should be tested. If symbol does not start
370 |     % with any pattern defined by pat, pat_out is empty.
371 | 
372 |         pat_out = '';
373 |         % Compare each pattern with the start of symbol only. This
374 |         % yields exactly one logical per pattern, so the mask always
375 |         % lines up with pat, even if a pattern occurs several times
376 |         % in symbol (e.g. '-' in '=--'). A pattern that is longer
377 |         % than symbol never matches.
378 |         is_prefix = cellfun(@(p) strncmp(symbol, p, length(p)), pat);
379 |         if any(is_prefix)
380 |             candidates = pat(is_prefix);
381 |             % the longest matching pattern wins, e.g. '>=' beats '>':
382 |             [~, longest] = max(cellfun(@length, candidates));
383 |             pat_out = candidates{longest};
384 |         end
385 |     end
386 | end
387 | 


--------------------------------------------------------------------------------