├── .github └── workflows │ ├── build_and_release.yml │ └── unittest.yml ├── .gitignore ├── .idea └── .gitignore ├── HISTORY.md ├── LICENSE ├── README.md ├── asset └── img │ ├── codetext_logo.png │ └── codetext_logo_line.png ├── pyproject.toml ├── requirements.txt ├── src └── codetext │ ├── __init__.py │ ├── __main__.py │ ├── clean │ ├── __init__.py │ └── noise_removal.py │ ├── codetext_cli.py │ ├── parser │ ├── README.md │ ├── __init__.py │ ├── c_sharp_parser.py │ ├── cpp_parser.py │ ├── go_parser.py │ ├── java_parser.py │ ├── javascript_parser.py │ ├── language_parser.py │ ├── php_parser.py │ ├── python_parser.py │ ├── ruby_parser.py │ └── rust_parser.py │ └── utils │ ├── __init__.py │ ├── imports.py │ └── utils.py └── tests ├── __init__.py ├── setup.py ├── test_clean ├── __init__.py └── test_clean_utils.py ├── test_parser ├── __init__.py ├── test_c.py ├── test_cpp.py ├── test_csharp.py ├── test_go.py ├── test_java.py ├── test_javascript.py ├── test_php.py ├── test_python.py ├── test_ruby.py ├── test_rust.py └── test_sample │ ├── README.md │ ├── c_sharp_test_sample.cs │ ├── c_test_sample.c │ ├── cpp_test_sample.cpp │ ├── go_test_sample.go │ ├── java_test_sample.java │ ├── javascript_test_sample.js │ ├── php_test_sample.php │ ├── py_test_sample.py │ ├── ruby_test_sample.rb │ └── rust_test_sample.rs └── test_utils ├── __init__.py └── test_utils.py /.github/workflows/build_and_release.yml: -------------------------------------------------------------------------------- 1 | 2 | name: Publish package to PyPI 3 | 4 | on: 5 | release: 6 | types: [created] 7 | 8 | jobs: 9 | release: 10 | # if: github.event_name == 'release' && github.event.action == 'created' 11 | name: PyPi Release 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v2 16 | name: Checkout repo 17 | 18 | - name: Set up Python 3.7 19 | uses: actions/setup-python@v1 20 | with: 21 | python-version: 3.7 22 | 23 | - uses: actions/cache@v1 24 | name: Cache pip dependencies 25 | with: 26 
| path: ~/.cache/pip 27 | key: ${{ runner.os }}-pip-${{ hashFiles('requirements.txt') }} 28 | restore-keys: | 29 | ${{ runner.os }}-pip- 30 | - name: Install pip dependencies 31 | run: | 32 | pip install --upgrade pip 33 | pip install -r requirements.txt 34 | python3 -m pip install --upgrade build twine wheel 35 | - name: Make distribution 36 | run: | 37 | python3 setup.py sdist bdist_wheel 38 | twine check dist/* 39 | - name: Publish a Python distribution to PyPI 40 | uses: pypa/gh-action-pypi-publish@master 41 | with: 42 | user: __token__ 43 | password: ${{ secrets.PYPI_TOKEN }} -------------------------------------------------------------------------------- /.github/workflows/unittest.yml: -------------------------------------------------------------------------------- 1 | name: Unittest 2 | 3 | on: push 4 | 5 | jobs: 6 | unittest: 7 | name: Unittest 8 | runs-on: ubuntu-latest 9 | strategy: 10 | matrix: 11 | pyversion: [ "3.10" ] 12 | 13 | steps: 14 | - name: Check out Git repository 15 | uses: actions/checkout@v2 16 | 17 | - name: Set up Python 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: ${{ matrix.pyversion }} 21 | 22 | - name: Install dependencies 23 | run: | 24 | pip install -r requirements.txt 25 | # git clone https://github.com/nmd-2000/docstring_parser docstring_parser 26 | # pip install -e ./docstring_parser 27 | 28 | - name: Run tests 29 | run: | 30 | python -m unittest 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | cache/* 2 | src/*/*.txt 3 | src/codetext.egg-info/* 4 | */build/* 5 | */dist/* 6 | */tree-sitter-* 7 | *.jsonl 8 | *.json 9 | *.zip 10 | *.gz 11 | *.pyc 12 | *.so 13 | *.whl 14 | .idea 15 | .vscode 16 | *.iml 17 | -------------------------------------------------------------------------------- /.idea/.gitignore: 
-------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | .idea 5 | .vscode 6 | *.iml -------------------------------------------------------------------------------- /HISTORY.md: -------------------------------------------------------------------------------- 1 | ======== 2 | Releases 3 | ======== 4 | 5 | Version 0.0.9 6 | ============= 7 | Release date: Jul 1, 2024 8 | * Skip building language binaries from source 9 | 10 | Version 0.0.8 11 | ============= 12 | Release date: Aug 17, 2023 13 | 14 | * Update format codetext_cli 15 | * Update PythonParser: Handle class definitions with empty argument list class ABC() 16 | * Add Javascript undeclared functions 17 | * Add PHP interface 18 | * Add Ruby actions with block parameters 19 | 20 | Version 0.0.7 21 | ============= 22 | Release date: Jul 5, 2023 23 | 24 | * Update all class extractor format (using dict instead of list) 25 | * Fix missing identifier, parameter in C, C#, Java parser 26 | * Implement CLI 27 | 28 | Version 0.0.6 29 | ============= 30 | Release date: Jan 9, 2023 31 | 32 | * Add tree sitter utils (in codetext.parser) 33 | * Replace all `match_from_span` to `get_node_text` 34 | * Replace all `traverse_type` to `get_node_by_kind` 35 | * Fix `CppParser.get_function_metadata` missing `param_type` and `param_identifier` 36 | * Update return metadata from all parser 37 | 38 | Version 0.0.5 39 | ============= 40 | Release date: Dec 12, 2022 41 | 42 | * Fix package import path 43 | * Adding auto build workflow 44 | * Separate codetext parser from processing source code 45 | * Fix `remove_comment_delimiter` remove leading whitespace 46 | * Update unittest for parser and utilities 47 | 48 | Version 0.0.4 49 | ============= 50 | Release date: Dec 2, 2022 51 | 52 | * Fix main package root path 53 | * Loosen `docstring_parser` dependency 54 | 55 | Version 0.0.3 56 | ============= 57 | Release date: Dec 2, 2022 58 | 59 | * New 
clean docstring function 60 | * check_docstring_contain_question 61 | * check_docstring_underdevelopment 62 | * check_docstring_autogenerated 63 | * check_contain_little_single_char 64 | * check_contain_many_special_char 65 | * check_contain_little_unique_chars 66 | * check_contain_little_unique_words 67 | * check_contain_many_special_case 68 | * check_contain_too_many_variables 69 | * check_contain_many_repeated_word 70 | * check_contain_many_uppercase_word 71 | * check_contain_many_long_word 72 | 73 | Version 0.0.2 74 | ============= 75 | Release date: Nov 25, 2022 76 | 77 | * Language parser for Rust 78 | * get_docstring 79 | * get_class_list, get_function_list 80 | * get_class_metadata, get_function_metadata 81 | * Processing utils: 82 | * extract_docstring 83 | * extract_node 84 | * get_line_definitions 85 | * get_node_definitions 86 | * process_raw_node 87 | * Postprocessing: 88 | * Merge file (from batches) 89 | * Split into train/test/valid (by #sample category) 90 | * Deduplicate sample 91 | 92 | Version 0.0.1 93 | ============= 94 | Release date: Nov 9, 2022 95 | 96 | * Language parser for Java, Python, JavaScript, PHP, Golang, Ruby, C++, C#, C 97 | * get_docstring 98 | * get_class_list, get_function_list 99 | * get_class_metadata, get_function_metadata 100 | * Clean docstring function 101 | * Data preprocessing source code 102 | * Tree-sitter utils: build_language, parse_code 103 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 AI4Code Research Group 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the 
Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 |

4 | logo 5 |

6 | ______________________________________________________________________ 7 | 8 | 9 | 10 | | Branch | Build | Unittest | Release | License | 11 | |-------- |------- |---------- |--------- |--------- | 12 | | main | | [![Unittest](https://github.com/AI4Code-Research/CodeText-parser/actions/workflows/unittest.yml/badge.svg)](https://github.com/AI4Code-Research/CodeText-parser/actions/workflows/unittest.yml) | [![release](https://img.shields.io/pypi/v/codetext)](https://pypi.org/project/codetext/) [![pyversion](https://img.shields.io/pypi/pyversions/codetext)](https://pypi.org/project/codetext/)| [![license](https://img.shields.io/github/license/AI4Code-Research/CodeText-parser)](https://github.com/AI4Code-Research/CodeText-parser/blob/main/LICENSE) | 13 | 14 |
15 | 16 | ______________________________________________________________________ 17 | 18 | **Code-Text parser** is a custom [tree-sitter](https://github.com/tree-sitter) grammar parser that extracts classes and functions from raw source code. We support 10 common programming languages: 19 | - Python 20 | - Java 21 | - JavaScript 22 | - PHP 23 | - Ruby 24 | - Rust 25 | - C 26 | - C++ 27 | - C# 28 | - Go 29 | 30 | # Installation 31 | The **codetext** package requires Python 3.7 or above and tree-sitter. Set up the environment and install the dependencies manually from source: 32 | ```bash 33 | git clone https://github.com/FSoft-AI4Code/CodeText-parser.git; cd CodeText-parser 34 | pip install -r requirements.txt 35 | pip install -e . 36 | ``` 37 | 38 | Or install via `pypi` package: 39 | ```bash 40 | pip install codetext 41 | ``` 42 | 43 | # Getting started 44 | 45 | ## `codetext` CLI Usage 46 | ```bash 47 | codetext [options] [PATH or FILE] ... 48 | ``` 49 | 50 | For example, to extract every Python file in the `src/` folder: 51 | ```bash 52 | codetext src/ --language Python 53 | ``` 54 | 55 | To store the extracted classes and functions, use the `--json` flag together with `--output_file` and a path to the destination file: 56 | ```bash 57 | codetext src/ --language Python --output_file ./python_report.json --json 58 | ``` 59 | 60 | **Options** 61 | 62 | ```bash 63 | positional arguments: 64 | paths list of the filename/paths. 65 | 66 | optional arguments: 67 | -h, --help show this help message and exit 68 | --version show program's version number and exit 69 | -l LANGUAGE, --language LANGUAGE 70 | Target the programming languages you want to analyze. 71 | -o OUTPUT_FILE, --output_file OUTPUT_FILE 72 | Output file (e.g report.json). 
73 | --json Generate json output as a transform of the default 74 | output 75 | --verbose Print progress bar 76 | 77 | ``` 78 | 79 | **Example** 80 | ``` 81 | File circle_linkedlist.py analyzed: 82 | ================================================== 83 | Number of class : 1 84 | Number of function : 2 85 | -------------------------------------------------- 86 | 87 | Class summary: 88 | +-----+---------+-------------+ 89 | | # | Class | Arguments | 90 | +=====+=========+=============+ 91 | | 0 | Node | | 92 | +-----+---------+-------------+ 93 | 94 | Class analyse: Node 95 | +-----+---------------+-------------+--------+---------------+ 96 | | # | Method name | Paramters | Type | Return type | 97 | +=====+===============+=============+========+===============+ 98 | | 0 | __init__ | self | | | 99 | | | | data | | | 100 | +-----+---------------+-------------+--------+---------------+ 101 | 102 | Function analyse: 103 | +-----+-----------------+-------------+--------+---------------+ 104 | | # | Function name | Paramters | Type | Return type | 105 | +=====+=================+=============+========+===============+ 106 | | 0 | push | head_ref | | Node | 107 | | | | data | Any | Node | 108 | | 1 | countNodes | head | Node | | 109 | +-----+-----------------+-------------+--------+---------------+ 110 | ``` 111 | 112 | ## Using `codetext` as Python module 113 | ### Build your language 114 | `codetext` need tree-sitter language file (i.e `.so` file) to work properly. You can manually compile language ([see more](https://github.com/tree-sitter/py-tree-sitter#usage)) or automatically build use our pre-defined function (the `.so` will saved in a folder name `/tree-sitter/`): 115 | ```python 116 | from codetext.utils import build_language 117 | 118 | language = 'rust' 119 | build_language(language) 120 | 121 | # INFO:utils:Not found tree-sitter-rust, attempt clone from github 122 | # Cloning into 'tree-sitter-rust'... 123 | # remote: Enumerating objects: 2835, done. ... 
124 | # INFO:utils:Attempt to build Tree-sitter Language for rust and store in .../tree-sitter/rust.so 125 | ``` 126 | 127 | ### Using Language Parser 128 | Each supported programming language corresponds to a custom `language_parser` (e.g. Python uses [`PythonParser()`](src/codetext/parser/python_parser.py#L11)). A `language_parser` takes raw source code as input and uses breadth-first search to traverse all syntax nodes, collecting every class, method, and stand-alone function: 129 | 130 | ```python 131 | from codetext.utils import parse_code 132 | 133 | raw_code = """ 134 | /** 135 | * Sum of 2 number 136 | * @param a int number 137 | * @param b int number 138 | */ 139 | double sum2num(int a, int b) { 140 | return a + b; 141 | } 142 | """ 143 | 144 | # Auto parse code into tree-sitter.Tree 145 | root = parse_code(raw_code, 'cpp') 146 | root_node = root.root_node 147 | ``` 148 | 149 | Get all function nodes inside a specific node: 150 | ```python 151 | from codetext.parser import CppParser 152 | 153 | function_list = CppParser.get_function_list(root_node) 154 | print(function_list) 155 | 156 | # [] 157 | 158 | ``` 159 | 160 | Get function metadata (e.g. 
function's name, parameters, (optional) return type) 161 | ```python 162 | function = function_list[0] 163 | 164 | metadata = CppParser.get_function_metadata(function, raw_code) 165 | 166 | # {'identifier': 'sum2num', 'parameters': {'a': 'int', 'b': 'int'}, 'type': 'double'} 167 | ``` 168 | Get docstring (documentation) of a function 169 | ```python 170 | docstring = CppParser.get_docstring(function, raw_code) 171 | 172 | # ['Sum of 2 number \n@param a int number \n@param b int number'] 173 | ``` 174 | 175 | We also provide two methods for extracting class objects: 176 | ```python 177 | class_list = CppParser.get_class_list(root_node) 178 | # and 179 | metadata = CppParser.get_metadata_list(root_node) 180 | ``` 181 | 182 | # Limitations 183 | `codetext` heavily depends on tree-sitter syntax: 184 | - Since we use the tree-sitter grammar to extract the desired nodes (functions, classes, function names (identifiers), class argument lists, etc.), `codetext` is vulnerable to tree-sitter updates and future syntax changes. 185 | 186 | - While we try our best to capture all possibilities, there are still plenty out there. We welcome community contributions to this project. 
-------------------------------------------------------------------------------- /asset/img/codetext_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FSoft-AI4Code/CodeText-parser/d2a7365f7f944650e84d9fdb6b6794d6c5ea620b/asset/img/codetext_logo.png -------------------------------------------------------------------------------- /asset/img/codetext_logo_line.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FSoft-AI4Code/CodeText-parser/d2a7365f7f944650e84d9fdb6b6794d6c5ea620b/asset/img/codetext_logo_line.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "codetext" 7 | version = "0.0.9" 8 | authors = [ 9 | { name="Dung Manh Nguyen", email="dungnm.workspace@gmail.com" }, 10 | ] 11 | description = "Multilingual programming language parsers for the extract from raw source code into multiple levels of pair data" 12 | readme = "README.md" 13 | requires-python = ">=3.6" 14 | classifiers = [ 15 | "Programming Language :: Python :: 3", 16 | "License :: OSI Approved :: MIT License", 17 | "Operating System :: OS Independent", 18 | ] 19 | dependencies = [ 20 | "tree-sitter>=0.20", 21 | "Levenshtein>=0.20", 22 | "langdetect>=1.0.0", 23 | "bs4>=0.0.1", 24 | "tabulate>=0.9.0" 25 | ] 26 | 27 | [project.urls] 28 | "Homepage" = "https://github.com/AI4Code-Research/CodeText-data" 29 | "Bug Tracker" = "https://github.com/AI4Code-Research/CodeText-data/issues" 30 | 31 | [project.scripts] 32 | codetext = "codetext.__main__:main" 33 | -------------------------------------------------------------------------------- /requirements.txt: 
-------------------------------------------------------------------------------- 1 | # for preprocessing 2 | tree-sitter==0.20.4 3 | tabulate 4 | Levenshtein 5 | langdetect 6 | bs4 7 | tree_sitter_languages==1.10.2 8 | -------------------------------------------------------------------------------- /src/codetext/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FSoft-AI4Code/CodeText-parser/d2a7365f7f944650e84d9fdb6b6794d6c5ea620b/src/codetext/__init__.py -------------------------------------------------------------------------------- /src/codetext/__main__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import argparse 4 | import pkg_resources 5 | 6 | import json 7 | from .codetext_cli import parse_file, print_result, PL_MATCHING 8 | 9 | 10 | def get_args(): 11 | parser = argparse.ArgumentParser(description=f"codetext parser {20*'='}") 12 | 13 | parser.add_argument('paths', nargs='*', default=['.'], 14 | help='list of the filename/paths.') 15 | parser.add_argument("--version", action="version", 16 | version=pkg_resources.get_distribution("codetext").version) 17 | parser.add_argument("-l", "--language", 18 | help='''Target the programming languages you want to 19 | analyze.''') 20 | parser.add_argument("-o", "--output_file", 21 | help='''Output file (e.g report.json). 
22 | ''', 23 | type=str) 24 | parser.add_argument("--json", 25 | help='''Generate json output as a transform of the 26 | default output''', 27 | action="store_true") 28 | parser.add_argument("--verbose", 29 | help='''Print progress bar''', 30 | action="store_true") 31 | 32 | return parser.parse_args() 33 | 34 | 35 | def main(): 36 | opt = get_args() 37 | 38 | # check args 39 | if opt.json: 40 | if not opt.output_file: 41 | raise ValueError("Missing --output_file") 42 | if opt.language: 43 | if opt.language not in PL_MATCHING.keys(): 44 | raise ValueError( 45 | "{language} not supported. Currently support {sp_language}" 46 | .format(language=opt.language, 47 | sp_language=list(PL_MATCHING.keys()))) 48 | 49 | # check path 50 | for path in opt.paths: 51 | assert os.path.exists(path) == True, "paths is not valid" 52 | 53 | if os.path.isdir(path): 54 | files = [os.path.join(path, f) for f in os.listdir(path) \ 55 | if os.path.isfile(os.path.join(path, f))] 56 | elif os.path.isfile(path): 57 | files = [path] 58 | 59 | if opt.language: 60 | for file in files[:]: 61 | filename, file_extension = os.path.splitext(file) 62 | if file_extension not in PL_MATCHING[opt.language]: 63 | files.remove(file) 64 | 65 | output_metadata = {} 66 | for file in files: 67 | filename, file_extension = os.path.splitext(file) 68 | 69 | if opt.language == None: 70 | for lang, ext_list in PL_MATCHING.items(): 71 | if file_extension in ext_list: 72 | language = lang 73 | break 74 | else: 75 | language = opt.language 76 | 77 | output = parse_file(file, language=language) 78 | print_result( 79 | output, 80 | file_name=str(filename).split(os.sep)[-1]+file_extension 81 | ) 82 | output_metadata[file] = output 83 | 84 | if opt.json: 85 | save_path = opt.output_file 86 | with open(save_path, 'w') as output_file: 87 | json.dump(output_metadata, output_file, sort_keys=True, indent=4) 88 | print(50*'=') 89 | print("Save report to {path}".format(path=save_path)) 90 | 91 | 92 | if __name__ == '__main__': 93 | 
main() 94 | -------------------------------------------------------------------------------- /src/codetext/clean/__init__.py: -------------------------------------------------------------------------------- 1 | """Clean utilities""" 2 | 3 | from .noise_removal import remove_comment_delimiters, remove_special_tag, remove_special_character 4 | 5 | 6 | __all__ = [ 7 | 'remove_comment_delimiters', 'remove_special_tag', 'remove_special_character' 8 | ] -------------------------------------------------------------------------------- /src/codetext/clean/noise_removal.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys 3 | import warnings 4 | from collections import Counter 5 | from itertools import permutations 6 | from typing import Any, Dict, List, Union 7 | 8 | from langdetect import detect, detect_langs 9 | from bs4 import BeautifulSoup 10 | import Levenshtein as lev 11 | 12 | from tree_sitter import Node 13 | from ..parser.language_parser import tokenize_docstring, get_node_by_kind 14 | warnings.filterwarnings("ignore", category=UserWarning, module='bs4') 15 | 16 | 17 | REGEX_TEXT = ("(?<=[a-z0-9])(?=[A-Z])|" 18 | "(?<=[A-Z0-9])(?=[A-Z][a-z])|" 19 | "(?<=[0-9])(?=[a-zA-Z])|" 20 | "(?<=[A-Za-z])(?=[0-9])|" 21 | "(?<=[@$.'\"])(?=[a-zA-Z0-9])|" 22 | "(?<=[a-zA-Z0-9])(?=[@$.'\"])|" 23 | "_|\\s+") 24 | 25 | if sys.version_info >= (3, 7): 26 | import re 27 | SPLIT_REGEX = re.compile(REGEX_TEXT) 28 | else: 29 | import regex 30 | SPLIT_REGEX = regex.compile("(?V1)"+REGEX_TEXT) 31 | 32 | 33 | def split_sentences(docstring): 34 | # sentences = re.split("(? 
List[str]: 43 | """ 44 | Split a single identifier into parts on snake_case and camelCase 45 | """ 46 | identifier_parts = list(s.lower() for s in SPLIT_REGEX.split(identifier) if len(s)>0) 47 | 48 | if len(identifier_parts) == 0: 49 | return [identifier] 50 | return identifier_parts 51 | 52 | 53 | def check_is_node_error(node: Node) -> bool: 54 | """ 55 | Check if node contains "ERROR" node 56 | Args: 57 | node (tree_sitter.Node): node 58 | 59 | Return: 60 | bool 61 | """ 62 | if not isinstance(node, Node): 63 | raise ValueError("Expect type tree_sitter.Node, get %i", type(node)) 64 | 65 | error_node = get_node_by_kind(node, ['ERROR']) 66 | if len(error_node) > 0: 67 | return True 68 | else: 69 | return False 70 | 71 | 72 | def get_node_length(node: Node) -> int: 73 | """ 74 | Get node length 75 | Args: 76 | node (tree_sitter.Node): node 77 | 78 | Return: 79 | int 80 | """ 81 | if not isinstance(node, Node): 82 | raise ValueError("Expect type tree_sitter.Node, get %i", type(node)) 83 | 84 | line_start = node.start_point[0] 85 | line_end = node.end_point[0] 86 | return int(line_end - line_start) 87 | 88 | 89 | def remove_comment_delimiters(docstring: str, remove_whitespace: bool=True) -> str: 90 | """ 91 | Remove comment delimiters. 
92 | Example: //, /*, */, #, etc 93 | 94 | Args: 95 | docstring (str): raw (line or block) comment 96 | remove_whitespace (bool): remove leading whitespace or not 97 | Returns: 98 | str: removed delimiters docstring/comment 99 | 100 | """ 101 | clean_pattern = re.compile(r'([\'\"]{3})$|^([\'\"]{3})') # remove python ''' or """ 102 | clean_pattern1 = re.compile(r'([#]+)$|^([#]+)') # special single-line comment with # 103 | clean_pattern2 = re.compile(r'([\/*=-]+)$|^([\/*!=-]+)') 104 | 105 | docstring = re.sub(clean_pattern, '', docstring) 106 | new_docstring = [] 107 | for line in docstring.split('\n'): 108 | if remove_whitespace: 109 | line = line.strip() 110 | line = re.sub(clean_pattern1, '', line) 111 | line = re.sub(clean_pattern2, '', line) 112 | new_docstring.append(line) 113 | 114 | return '\n'.join(new_docstring) 115 | 116 | 117 | def remove_special_tag(docstring: str) -> str: 118 | """ 119 | Remove all special tag (html tag, e.g.

docstring

) 120 | """ 121 | return BeautifulSoup(docstring, "html.parser").get_text() 122 | 123 | 124 | def remove_special_character(docstring: str) -> str: 125 | return re.sub(r'[^a-zA-Z0-9\\\_\.\,]', ' ', docstring) 126 | 127 | 128 | def remove_function_name_at_the_beginning(docstring): 129 | """ 130 | This function is applied at docstring/paragraph-level. 131 | """ 132 | ending_symbols = [":", "\s-"] 133 | for symbol in ending_symbols: 134 | pattern = "^[a-zA-Z0-9_\(\)]+" + symbol 135 | docstring = re.sub(pattern, "", docstring) 136 | 137 | docstring = docstring.strip() 138 | 139 | return docstring 140 | 141 | 142 | def remove_link_in_brackets(docstring): 143 | """ 144 | Removing patterns, for examples: 145 | - (https://www.a.ai) 146 | - 147 | - 148 | 149 | \param 150 | \brief 151 | 152 | This function is applied to each line of the docstring/paragraph. 153 | """ 154 | pattern = "\%s(?:http|see|e\.g|eg.).*?\%s" 155 | bracket_pairs = [("(", ")"), ("<", ">")] 156 | for pair in bracket_pairs: 157 | docstring = re.sub(pattern % pair, "", docstring.strip()) 158 | 159 | return docstring.strip() 160 | 161 | 162 | def remove_everything_after_a_pattern(docstring): 163 | """ 164 | Only keep the part appears before the patterns. 165 | Ignore everything after the patterns. 166 | 167 | This function is applied at docstring-level 168 | """ 169 | patterns = [ 170 | "E.g", "e.g", "eg.", "Eg.", 171 | "Example usage:", "Created by", "Example:", 172 | "Note:", ". Note", "note::", "note:", ". 
note" 173 | ] 174 | 175 | for pattern in patterns: 176 | docstring = docstring.strip().split(pattern)[0] 177 | 178 | docstring = docstring.strip() 179 | return docstring 180 | 181 | 182 | def remove_everything_after_an_url(docstring): 183 | """ 184 | This function applies at sentence-level 185 | TO-DO: Should apply on docstring-level by regular expression 186 | """ 187 | patterns = ["https:", "http:"] 188 | sentences = split_sentences(docstring) 189 | sentences_ = [] 190 | for sentence in sentences: 191 | has_pattern = False 192 | for pattern in patterns: 193 | if pattern in sentence: 194 | has_pattern = True 195 | break 196 | if has_pattern: 197 | break 198 | sentences_.append(sentence) 199 | docstring = ". ".join(sentences_) 200 | 201 | docstring = docstring.strip() 202 | 203 | return docstring 204 | 205 | 206 | def remove_lines_start_and_end_with_the_same_char(docstring): 207 | """ 208 | Remove noisy lines. 209 | This function applies at line-level 210 | """ 211 | lines = docstring.strip().split("\n") 212 | patterns = ["*", "-", "_", "=", "/", "+"] 213 | lines_ = [] 214 | for line in lines: 215 | line = line.strip() 216 | if line == "": 217 | lines_.append(line) 218 | continue 219 | flag = False 220 | for pattern in patterns: 221 | p = "^\%s.*\%s$" % (pattern, pattern) 222 | if re.search(p, line) is not None: 223 | flag = True 224 | break 225 | if flag: 226 | continue 227 | 228 | lines_.append(line) 229 | docstring = "\n".join(lines_).strip() 230 | 231 | return docstring 232 | 233 | 234 | def remove_lines_contain_only_a_single_char(docstring): 235 | """ 236 | This function applies at line-level 237 | """ 238 | patterns = ["*", "/", "=", "-", "+"] 239 | lines = docstring.strip().split("\n") 240 | for i, line in enumerate(lines): 241 | if line.strip() in patterns: 242 | lines[i] = "" 243 | continue 244 | 245 | docstring = "\n".join(lines).strip() 246 | 247 | return docstring 248 | 249 | 250 | def remove_patterns_at_any_positions(docstring): 251 | """ 252 | This 
function applies at docstring-level 253 | """ 254 | patterns = ["/**", "/*", "", "", "*-*"] 255 | for pattern in patterns: 256 | if pattern in docstring: 257 | docstring = docstring.replace(pattern, "").strip() 258 | 259 | return docstring 260 | 261 | 262 | def remove_patterns_at_the_start_and_end_of_a_line(docstring): 263 | """ 264 | This function applies at line-level 265 | """ 266 | patterns = ["* "] 267 | lines = docstring.strip().split("\n") 268 | for i, line in enumerate(lines): 269 | flag = True 270 | while flag: 271 | flag = False 272 | # at the beginning 273 | for pattern in patterns: 274 | if line.startswith(pattern): 275 | line = line[len(pattern):] 276 | for symbol in [".", "*", "-", "_", "@", "#", "$", "!", "\\", "/", "+"]: 277 | pattern = r"^\%s{2,}" % (symbol) 278 | line_ = re.sub(pattern, "", line) 279 | if line_ != line: 280 | flag = True 281 | line = line_ 282 | 283 | # at the end 284 | for symbol in [".", "*", "-", "_", "@", "#", "$", "!", "\\", "/", "+"]: 285 | pattern = r"\%s{2,}$" % (symbol) 286 | line_ = re.sub(pattern, "", line) 287 | if line_ != line: 288 | flag = True 289 | line = line_ 290 | lines[i] = line 291 | 292 | docstring = "\n".join(lines).strip() 293 | 294 | return docstring 295 | 296 | 297 | def remove_patterns_at_the_end_of_a_docstring(docstring): 298 | """ 299 | Remove ending character(s) 300 | This function applies at docstring-level 301 | """ 302 | patterns = [":", ";", ",", "...", "@@", "@"] 303 | if docstring != "": 304 | if docstring[-1] in patterns: 305 | docstring = docstring[:-1] + '.' 306 | 307 | docstring = docstring.strip() 308 | 309 | return docstring 310 | 311 | 312 | def remove_specific_pattern(docstring: str) -> str: 313 | """ 314 | pattern 1 will match "(e.g something)" 315 | pattern 2 will match "e.g something\n" or "e.g something. 
" 316 | pattern 3 will match "{@tag content}" and change to "content" 317 | pattern 4 will match trailing special chars "==============" or "************" 318 | """ 319 | pattern1 = re.compile(r'(\(((i\.e)|(e\.g)|(\beg)|(\bie))[\s\S]+?)(\))', flags=re.IGNORECASE|re.MULTILINE) 320 | pattern3 = re.compile(r'{@.*?}') 321 | pattern4 = re.compile(r'(-|=|#|\*){5,}') 322 | 323 | docstring = re.sub(pattern1, '', docstring) 324 | # docstring = re.sub(pattern2, '', docstring) 325 | docstring = re.sub(pattern4, '', docstring) 326 | all_matches = re.findall(pattern3, docstring) 327 | for match in all_matches: 328 | new_match = str(match)[1:-1] # remove { } 329 | new_match = re.sub(r'@\w*', '', new_match) 330 | docstring = docstring.replace(match, new_match) 331 | 332 | return docstring 333 | 334 | 335 | def remove_unrelevant(docstring: str) -> str: 336 | flag = True 337 | while flag: 338 | flag = False 339 | docstring_ = docstring 340 | 341 | removing_functions = [ 342 | remove_specific_pattern, 343 | remove_link_in_brackets, 344 | # remove_everything_after_an_url, # Overlap 345 | # remove_everything_after_a_pattern, # Noticeable wrong catch 346 | remove_patterns_at_any_positions, 347 | remove_lines_contain_only_a_single_char, 348 | remove_lines_start_and_end_with_the_same_char, 349 | remove_patterns_at_the_start_and_end_of_a_line, 350 | remove_function_name_at_the_beginning, 351 | ] 352 | for removing_function in removing_functions: 353 | docstring = removing_function(docstring) 354 | # print(removing_function.__name__) 355 | # print(docstring) 356 | # print('\n\n') 357 | 358 | if docstring != docstring_: 359 | flag = True 360 | 361 | docstring = remove_patterns_at_the_end_of_a_docstring(docstring) 362 | return docstring 363 | 364 | 365 | # =================== Check code ====================== 366 | 367 | def check_is_black_node(node_name: str, exclude_list: List = None): 368 | """ 369 | Check if node belongs to black list. 
E.g: 370 | - Built-in function 371 | - Test function, test class 372 | - Constructor 373 | """ 374 | black_keywords = ['test_', 'Test_', '_test', 'toString', 'constructor', 'Constructor'] 375 | black_keywords.extend(exclude_list) 376 | 377 | if not isinstance(node_name, str): 378 | raise ValueError(f'Expect str, get {type(node_name)}') 379 | if node_name.startswith('__') and node_name.endswith('__'): 380 | return True 381 | if node_name.startswith('set') or node_name.startswith('get'): 382 | return True 383 | if any(keyword in node_name for keyword in black_keywords): 384 | return True 385 | 386 | return False 387 | 388 | 389 | def check_is_empty_function(node): 390 | """ 391 | If node width is longer than 3 lines, then it's not an empty function 392 | """ 393 | if get_node_length(node) <= 3: 394 | return True 395 | return False 396 | 397 | 398 | def check_autogenerated_by_code(raw_code: str, identifier: str): 399 | threshold = 0.4 400 | fn_name_splited = split_identifier_into_parts(identifier) 401 | fn_name_splited = ' '.join(fn_name_splited).lower() 402 | 403 | comment = str(re.sub(r'[^a-zA-Z0-9]', ' ', comment)).lower() 404 | 405 | d0 = lev.distance(fn_name_splited, comment) 406 | d1 = max(len(fn_name_splited), len(comment)) 407 | 408 | if d0 <= d1*threshold: 409 | return True 410 | 411 | return False 412 | 413 | # =================== Check docstring ====================== 414 | 415 | def check_docstring_length(docstring: str): 416 | doc_tokens = docstring.strip().split() 417 | if len(doc_tokens) < 3: # or len(doc_tokens) > 256: 418 | # if len(doc_tokens) >= 256: 419 | return True 420 | return False 421 | 422 | 423 | def check_docstring_literal(docstring: str): 424 | """ 425 | Check if docstring is EN 426 | TODO: "Ce n'est pas en anglais" -> Fr 427 | """ 428 | p = re.compile('[a-zA-Z0-9]') 429 | if not docstring.isascii(): 430 | return True 431 | if not p.search(docstring): 432 | return True 433 | # TODO: uncomment this 434 | # try: 435 | # _docstring = 
re.sub(r'[^a-zA-Z0-9]', ' ', docstring) 436 | # _docstring = ' '.join(split_all_sepcial_case(_docstring)) 437 | 438 | # print(_docstring) 439 | # if detect(_docstring) != 'en': 440 | # print(detect_langs(_docstring)) 441 | # return True 442 | # except: 443 | # pass 444 | return False 445 | 446 | 447 | def check_docstring_contain_question(docstring: str): 448 | pattern = re.compile(r'(?i)^(why\b|how\b|what\'?s?\b|where\b|is\b|are\b)') 449 | 450 | if docstring[-1] == '?' or pattern.search(docstring): 451 | return True 452 | else: 453 | return False 454 | 455 | 456 | def check_docstring_underdevelopment(docstring: str): 457 | p1 = re.compile('(?i)^((Description of the Method)|(NOT YET DOCUMENTED)|(Missing[\s\S]+Description)|(not in use)|' 458 | '(Insert the method\'s description here)|(No implementation provided)|(\(non\-Javadoc\)))') 459 | p2 = re.compile('(?i)^(todo|to-do|deprecate|copyright|fixme)', flags=re.IGNORECASE) 460 | # p3 = re.compile('^[A-Za-z]+(\([A-Za-z_]+\))?:') 461 | 462 | if p1.search(docstring) or p2.search(docstring): 463 | return True 464 | else: 465 | return False 466 | 467 | 468 | def check_docstring_autogenerated(docstring: str): 469 | p1 = re.compile(r'(?i)@[a-zA-Z]*generated\b') 470 | p2 = re.compile('(?i)^([aA]uto[-\s]generated)') 471 | p3 = re.compile('(?i)^(This method initializes)') 472 | p4 = re.compile('(?i)^(This method was generated by)') 473 | 474 | if docstring is not None: 475 | if p1.search(docstring): 476 | return True 477 | 478 | if p2.search(docstring) or p3.search(docstring) or p4.search(docstring): 479 | return True 480 | 481 | else: 482 | return False 483 | 484 | 485 | def check_docstring_contain_specific_pattern(docstring: str): 486 | condition1 = re.compile(r'((i\.e)|(e\.g)|(\beg)|(\bie))(\s|\.)', flags=re.IGNORECASE) 487 | condition2 = re.compile(r'(^(Sees*)|(example usage)|(example)|(note:*))', flags=re.IGNORECASE) 488 | condition_follow = re.compile(r'[^a-zA-Z0-9\s\.\,\:\;\'\"]') 489 | 490 | # if pattern 1 and 2 match 
-> check if the line contain any special characters 491 | if condition1.match(docstring) or condition2.match(docstring): 492 | if condition_follow.match(docstring): 493 | return True 494 | 495 | return False 496 | 497 | 498 | # =================== Check characters ====================== 499 | 500 | def does_str_containt_math(str): 501 | math_indicators = ["equation", "\exp(", "\log(", "\sqrt(", "mathbf", "mathrm"] 502 | # TODO: page [number] 503 | containt_math = False 504 | for math_indicator in math_indicators: 505 | if math_indicator in str: 506 | containt_math = True 507 | break 508 | 509 | return containt_math 510 | 511 | 512 | def check_contain_little_alphabet_char(docstring: str): 513 | thresholds = [5, 0.65, 15, 0.4] 514 | docstring = docstring.strip() 515 | contain_math = does_str_containt_math(docstring) 516 | docstring = "".join(docstring.strip().split()) 517 | if len(docstring) < 1: 518 | return True 519 | num_alphabet_chars = len(re.findall("[a-zA-Z]", docstring)) 520 | 521 | return len(docstring) > thresholds[0 + 2*int(contain_math)] and num_alphabet_chars / len(docstring) < thresholds[1 + 2*int(contain_math)] 522 | 523 | 524 | def convert_special_pattern(docstring): 525 | patterns = [ 526 | (["HH", "MM", "SS"], (":", "-")), 527 | (["MM", "DD", "YY"], (":", "-")), 528 | (["MM", "DD", "YYYY"], (":", "-")), 529 | 530 | (["hh", "mm", "ss"], (":", "-")), 531 | (["mm", "dd", "yy"], (":", "-")), 532 | (["mm", "dd", "yyyy"], (":", "-")), 533 | 534 | (["R", "G", "B"], (",", "-")), 535 | 536 | (["r", "g", "b"], (",", "-")) 537 | ] 538 | for pattern, signs in patterns: 539 | for sign in signs: 540 | pms = permutations(pattern) 541 | for pm in pms: 542 | string = sign.join(pm) 543 | if string in docstring: 544 | docstring = docstring.replace(string, "".join(pm).lower()) 545 | return docstring 546 | 547 | 548 | def check_contain_many_special_char(docstring: str): 549 | threshold_dict = [[4, 6, 10, 6], # max #bracket schar, max #normal schar, max #math schar 550 | 
[10, 0.3, 17, 0,5], # acceptable #total schar or acceptable ratio 551 | [15, 20]] #, 0.3] # max #schar 552 | docstring = docstring.strip() 553 | containt_math = does_str_containt_math(docstring) 554 | docstring = convert_special_pattern(docstring) 555 | num_tokens = len(tokenize_docstring(docstring)) 556 | counter = Counter(docstring) 557 | 558 | count = 0 559 | math_symbols = ["+", "-", "*", "/", ":", "^", "=", "<", ">", "|", "(",] 560 | 561 | symbols = ["$", "!", "@", "#", "%", "^", "&", "*", "<", ">", 562 | "~", "|", "\\", "'", '"',"?", "-", "+", "=", "`", 563 | ":", "/", "(", "[", "{"] 564 | 565 | for symb in symbols: 566 | threshold = threshold_dict[0][0] 567 | if symb in ["(", "[", "{"]: 568 | threshold = threshold_dict[0][1] 569 | if containt_math: 570 | threshold = threshold_dict[0][3] 571 | else: 572 | if containt_math: 573 | if symb in math_symbols: 574 | threshold = threshold_dict[0][2] 575 | 576 | if counter[symb] > threshold: 577 | return True 578 | 579 | # brackets 580 | if symb not in ["(", "[", "{"]: 581 | count += counter[symb] 582 | 583 | return count > max(threshold_dict[1][0 + 2*int(containt_math)], threshold_dict[1][1 + 2*int(containt_math)]*num_tokens) \ 584 | and count > threshold_dict[2][int(containt_math)] 585 | 586 | 587 | def check_contain_little_unique_chars(docstring): 588 | """ 589 | This function applies on docstring line 590 | """ 591 | threshold_dict = [5, 3] 592 | docstring = "".join(docstring.strip().split()) 593 | return len(docstring) > threshold_dict[0] and len(set(docstring)) <= threshold_dict[1] 594 | 595 | # =================== Check words ====================== 596 | 597 | def check_contain_little_unique_words(docstring): 598 | threshold_dict = [3, 0.3] 599 | ignored_words = ["the", "of", "a", "an", "it", "for", "or", "in", "but",] 600 | # ".", ",", "(", ")", "{", "}", "<", ">", "[", "]", "-", "|"] 601 | docs = ' '.join(re.findall(r'\b[a-zA-Z0-9]+\b', docstring)) 602 | docstring_tokens = tokenize_docstring(docs) 603 | 
counter = Counter(docstring_tokens) 604 | try: 605 | most_repeated_word = counter.most_common()[0][0] 606 | except IndexError: 607 | return True 608 | max_count = counter.most_common()[0][1] 609 | 610 | index = 1 611 | while most_repeated_word in ignored_words: 612 | try: 613 | most_repeated_word = counter.most_common()[index][0] 614 | max_count = counter.most_common()[index][1] 615 | index += 1 616 | except IndexError: 617 | return False 618 | 619 | return max_count >= threshold_dict[0] and max_count / len(docstring_tokens) > threshold_dict[1] 620 | 621 | 622 | # def check_contain_many_special_case(docstring: str): 623 | # """ 624 | # Check if the string contains too much sneak_case or camelCase 625 | # """ 626 | # threshold = 0.3 627 | # total_words = docstring.strip().split() 628 | # if len(total_words) == 0: 629 | # return True 630 | # sneak_cases = re.findall("\w+_\w+", docstring) 631 | # camelCases = re.findall("[A-Z]([A-Z0-9]*[a-z][a-z0-9]*[A-Z]|[a-z0-9]*[A-Z][A-Z0-9]*[a-z])[A-Za-z0-9]*", docstring) 632 | # return (len(sneak_cases) + len(camelCases))/len(total_words) > threshold 633 | 634 | 635 | # def check_contain_many_repeated_word(docstring: str): 636 | # """ 637 | # Check if the string (longer than 30 words) have too many repeated word 638 | # """ 639 | # threshold_dict = [30, 0.5] # max number, ratio 640 | # docstring = "".join(docstring.strip().split()) 641 | # counter = Counter(docstring) 642 | # return len(docstring) > threshold_dict[0] and counter.most_common()[0][1] / len(docstring) > threshold_dict[1] 643 | 644 | 645 | def check_contain_many_uppercase_word(docstring: str): 646 | threshold_dict = [10, 0.3] 647 | patterns = ["DD", "MM", "YY", "YYYY", "R,G,B", "R-G-B", "SS", "HH", "API"] 648 | for pattern in patterns: 649 | docstring = docstring.replace(pattern, pattern.lower()) 650 | 651 | docstring = docstring.strip() 652 | snake_case_identifiers = re.findall("\w+_\w+", docstring) 653 | 654 | for identifier in snake_case_identifiers: 655 | 
docstring = docstring.replace(identifier, identifier.lower()) 656 | 657 | uppercase_words = re.findall(r"(?<=\s)[A-Z][A-Z0-9_]+", docstring) 658 | docstring_tokens = docstring.strip().split() 659 | return len(docstring_tokens) > threshold_dict[0] and len(uppercase_words) / len(docstring_tokens) > threshold_dict[1] 660 | 661 | 662 | def check_contain_too_many_variables(docstring): 663 | """ 664 | Check if the string contains too much sneak_case or camelCase 665 | """ 666 | threshold_dict = 0.3 667 | total_words = docstring.strip().split() 668 | if not total_words: 669 | return False 670 | 671 | # snake_case variable name 672 | snake_case_identifiers = re.findall("\w+_\w+", docstring) 673 | for identifier in snake_case_identifiers: 674 | docstring = docstring.replace(identifier, "").strip() 675 | # CamelCaes variable name 676 | camel_case_identifiers = re.finditer(r"[A-Z]([A-Z0-9]*[a-z][a-z0-9]*[A-Z]|[a-z0-9]*[A-Z][A-Z0-9]*[a-z])[A-Za-z0-9]*", docstring) 677 | camel_case_identifiers = [x.group() for x in camel_case_identifiers] 678 | # Method call 679 | variable_names = snake_case_identifiers + camel_case_identifiers 680 | 681 | return len(variable_names)/len(total_words) > threshold_dict 682 | 683 | 684 | def check_contain_too_many_method_call(docstring): 685 | threshold_dict = 0.2 686 | total_words = docstring.strip().split() 687 | if not total_words: 688 | return False 689 | 690 | method_call_identifiers = re.finditer(r"[a-zA-Z0-9]+((\.|\()[a-zA-Z0-9]+)+", docstring) 691 | method_call_identifiers = [x.group() for x in method_call_identifiers] 692 | 693 | return len(method_call_identifiers)/len(total_words) > threshold_dict 694 | 695 | 696 | def camel_case_split(identifier): 697 | matches = re.finditer(r'.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', identifier) 698 | return [m.group(0) for m in matches] 699 | 700 | 701 | def snake_case_split(identifier): 702 | return identifier.strip().split("_") 703 | 704 | 705 | def split_all_sepcial_case(docstring: 
str): 706 | docstring_tokens = [] 707 | for token in tokenize_docstring(docstring.strip()): 708 | sub_tokens = snake_case_split(token) 709 | for sub_token in sub_tokens: 710 | sub_sub_tokens = camel_case_split(sub_token) 711 | docstring_tokens.extend(sub_sub_tokens) 712 | 713 | return docstring_tokens 714 | 715 | def check_contain_many_long_word(docstring: str): 716 | threshold = 30 717 | docstring_tokens = split_all_sepcial_case(docstring) 718 | 719 | if len(docstring_tokens) == 0: 720 | return True 721 | 722 | return max([len(docstring_token) for docstring_token in docstring_tokens]) > threshold 723 | 724 | 725 | def check_contain_url(docstring: str): 726 | pattern = re.compile(r'(?:(?:https?|ftp|file):\/\/|www\.|ftp\.)(?:\([-A-Z0-9+&@#\/%=~_|$?!:,.]*\)|[-A-Z0-9+&@#\/%=~_|$?!:,.])*(?:\([-A-Z0-9+&@#\/%=~_|$?!:,.]*\)|[A-Z0-9+&@#\/%=~_|$])', flags=re.I) 727 | 728 | if pattern.search(docstring): 729 | return True 730 | return False 731 | 732 | # =================== End checking ====================== 733 | 734 | def check_function(node, node_metadata: Dict[str, Any], exclude_list: List = None, is_class=False): 735 | """ 736 | Check function if 737 | - is built-in function (python) 738 | - is constructor 739 | - is empty 740 | - is error node 741 | - have length < 3 lines 742 | 743 | Args: 744 | node (tree_sitter.Node): function node 745 | exclude_list (List): exclude name of function 746 | Return: 747 | bool: pass the check or not 748 | """ 749 | node_identifier = node_metadata['identifier'] 750 | 751 | # Check node/code 752 | if check_is_node_error(node): 753 | return False 754 | if check_is_black_node(node_identifier, exclude_list): 755 | return False 756 | if check_is_empty_function(node): 757 | return False 758 | 759 | return True 760 | 761 | 762 | def check_docstring(docstring: str, loosen_filter: bool = False): 763 | """ 764 | Check docstring is valid or not 765 | """ 766 | check_funcs_mapping = [ 767 | # 'check_docstring_literal', 768 | 
'check_docstring_contain_question', 769 | 'check_docstring_underdevelopment', 770 | 'check_docstring_autogenerated', 771 | 'check_docstring_contain_specific_pattern', 772 | 'check_contain_little_alphabet_char', 773 | 'check_contain_many_special_char', 774 | 'check_contain_little_unique_chars', 775 | 'check_contain_little_unique_words', 776 | # 'check_contain_many_special_case', 777 | 'check_contain_too_many_variables', 778 | 'check_contain_too_many_method_call', 779 | # 'check_contain_many_repeated_word', 780 | 'check_contain_many_uppercase_word', 781 | 'check_contain_many_long_word', 782 | 'check_contain_url', 783 | ] 784 | 785 | check_docstring_funcs = [ 786 | # check_docstring_literal, 787 | check_docstring_contain_question, 788 | check_docstring_underdevelopment, 789 | check_docstring_autogenerated, 790 | check_docstring_contain_specific_pattern, 791 | check_contain_little_alphabet_char, 792 | check_contain_many_special_char, 793 | check_contain_little_unique_chars, 794 | check_contain_little_unique_words, 795 | # check_contain_many_special_case, 796 | check_contain_too_many_variables, 797 | check_contain_too_many_method_call, 798 | # check_contain_many_repeated_word, 799 | check_contain_many_uppercase_word, 800 | check_contain_many_long_word, 801 | check_contain_url, 802 | ] 803 | 804 | if loosen_filter: 805 | check_docstring_funcs = [ 806 | check_docstring_contain_question, 807 | check_docstring_underdevelopment, 808 | check_docstring_autogenerated, 809 | check_docstring_contain_specific_pattern, 810 | check_contain_little_alphabet_char, 811 | # check_contain_many_special_char, 812 | check_contain_little_unique_chars, 813 | check_contain_little_unique_words, 814 | # check_contain_many_special_case, 815 | # check_contain_too_many_variables, 816 | # check_contain_too_many_method_call, 817 | # check_contain_many_repeated_word, 818 | check_contain_many_uppercase_word, 819 | check_contain_many_long_word, 820 | check_contain_url, 821 | ] 822 | 823 | # 
docstring_list = docstring.split('.') 824 | # print(f'\nAfter split {docstring_list}') 825 | 826 | applied_res = [] 827 | result = False 828 | for i, check_condition in zip(check_funcs_mapping, check_docstring_funcs): 829 | # for comment in docstring_list: 830 | if docstring == '' or not docstring: 831 | return True #, [] 832 | # if True then docstring have fail 833 | if check_condition(docstring): 834 | return True 835 | # return True 836 | # applied_res.append(f"<{i}> {docstring}") 837 | 838 | return result #, applied_res 839 | 840 | 841 | def clean_docstring(docstring: str, loosen_filter: bool = False): 842 | """ 843 | Clean docstring by removing special tag/url, characters, unrelevant information 844 | """ 845 | cleaned_docstring = [] 846 | if docstring == '' or docstring == None: 847 | return None 848 | _docstring = remove_comment_delimiters(docstring) 849 | if check_docstring_literal(_docstring): # True is not pass 850 | return None #, [f" {docstring}"] 851 | 852 | # _docstring = '\n'.join(remove_comment_delimiters(docstring)) 853 | docstring_paragraph_list = _docstring.strip().split('\n\n') 854 | 855 | for para in docstring_paragraph_list: 856 | docs = remove_unrelevant(para) 857 | docstring_list = re.split(r'(?<=.)[.!\?](?=\s+)', docs, flags=re.M) 858 | clean_line = [] 859 | for line in docstring_list: 860 | try: 861 | line = remove_special_tag(line) 862 | except: 863 | print('Oops') 864 | return None 865 | 866 | # not_pass, res = check_docstring(line, loosen_filter) 867 | not_pass = check_docstring(line, loosen_filter) 868 | if not not_pass: 869 | clean_line.append(line) 870 | else: 871 | break 872 | 873 | if len(clean_line) < len(docstring_list): 874 | clean_line.append('') 875 | cleaned_docstring.append('.'.join(clean_line)) 876 | 877 | 878 | cleaned_docstring = '\n\n'.join(cleaned_docstring) 879 | 880 | 881 | if check_docstring_length(cleaned_docstring): 882 | # if not res: 883 | # return None #, [f" {docstring}"] 884 | # else: 885 | return None #, res 
886 | 887 | return cleaned_docstring #, res 888 | 889 | if __name__ == '__main__': 890 | # test remove comment delimiters 891 | raw = [ 892 | '// C, C++, C#', 893 | '/// C, C++, C#', 894 | 895 | '/*******' 896 | '* Java' 897 | '/*******', 898 | '//** Java */', 899 | 900 | '# Python', 901 | 902 | '//! Rust', 903 | '//!!! Rust', 904 | '/*!! Rust', 905 | '/*! Rust', 906 | 907 | ''' 908 | /* The code below will print the words Hello World to the screen, and it is amazing 909 | 910 | Somethin here too*/ 911 | ''' 912 | ] 913 | 914 | # for item in raw: 915 | # print(remove_comment_delimiters(item)) 916 | 917 | samples = [ 918 | '\n\t\t/* 将JSONArray转换为Bean的List, 默认为ArrayList */', 919 | '// TODO: Why is he using Math.round?', 920 | '/* for now try mappig full type URI */', 921 | '// public String transformTypeID(URI typeuri){', 922 | '// return typeuri.toString();}', 923 | '/* Do we need to show the upgrade wizard prompt? */', 924 | '/* fixme: This function is not in use */', 925 | '// SampleEncryptionBox (senc) and SampleAuxiliaryInformation{Sizes|Offsets}Box', 926 | '/* This method initializes by me. The second line \n\n Abcdef*/', 927 | '/* @func_name_generated', 928 | '/* Auto-generated by IDE', 929 | '/ Auto-generated by IDE', 930 | ''' 931 | /// Abc 932 | /// Abc 933 | /// Abc 934 | ''', 935 | ''' 936 | /* Abc 937 | * def 938 | */ 939 | ''' 940 | ] 941 | 942 | # for item in samples: 943 | # print(clean_docstring(item)) 944 | 945 | samples = [ 946 | ''' 947 | Returns the Surface's pixel buffer if the Surface doesn't require locking. 948 | (e.g. it's a software surface) 949 | ''', 950 | ''' 951 | Taking in a sequence string, return the canonical form of the sequence 952 | (e.g. the lexigraphically lowest of either the original sequence or its 953 | reverse complement) 954 | ''', 955 | ''' 956 | Internal clear timeout. The function checks that the `id` was not removed 957 | (e.g. by `chart.destroy()`). 
For the details see 958 | [issue #7901](https://github.com/highcharts/highcharts/issues/7901). 959 | ''', 960 | ] 961 | 962 | # print('==== Cleaning ====') 963 | # for item in samples: 964 | # print(clean_docstring(item)) 965 | 966 | sample = ''' 967 | Returns the message Id to use as heading text, depending on what types of 968 | usage are present (i.e. just writable files, or also readable directories, 969 | etc). 970 | |need_lifetime_text_at_end| is set to false iff the returned message Id 971 | already includes an explanation for how long a website will have access to 972 | the listed paths. It is set to true iff a separate label is needed at the end 973 | of the dialog to explain lifetime. 974 | ''' 975 | print(sample) 976 | print('==== Cleaning ====') 977 | print(clean_docstring(sample)[0]) 978 | 979 | # print(extract_docstring(sample, [], 'cpp')) 980 | 981 | # res = clean_docstring(sample) 982 | # print(res[0]) 983 | # print(res[1]) 984 | 985 | # sample = '''Convert java.util.regex.Matcher groups to JavaScript groups''' 986 | # print(check_contain_too_many_variables(sample)) -------------------------------------------------------------------------------- /src/codetext/codetext_cli.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import List, Dict 3 | 4 | from tabulate import tabulate 5 | 6 | from .parser import * 7 | from .utils import parse_code 8 | 9 | 10 | def parse_file(file_path: str, language: str = None, verbose: bool = False) -> List: 11 | assert language != None, "Auto detect is not implemented, please specify language" 12 | language = str(language).lower() 13 | # assert (language in SUPPORT_LANGUAGE) == True, f"{language} is not supported" 14 | assert os.path.isfile(file_path) == True, "File not found" 15 | 16 | if verbose: 17 | print(50 * "=") 18 | print("Parse code into tree-sitter node") 19 | 20 | content: str = open(file_path, "r").read() 21 | root_node = 
parse_code(raw_code=content, language=language).root_node 22 | 23 | if language == "python": 24 | parser: LanguageParser = PythonParser 25 | elif language == "java": 26 | parser: LanguageParser = JavaParser 27 | elif language == "javascript": 28 | parser: LanguageParser = JavascriptParser 29 | elif language == "go": 30 | parser: LanguageParser = GoParser 31 | elif language in ["c", "c++"]: 32 | parser: LanguageParser = CppParser 33 | elif language == "c#": 34 | parser: LanguageParser = CsharpParser 35 | elif language == "rust": 36 | parser: LanguageParser = RustParser 37 | elif language == "ruby": 38 | parser: LanguageParser = RubyParser 39 | elif language == "php": 40 | parser: LanguageParser = PhpParser 41 | else: 42 | raise KeyError(f"{language} is not supported") 43 | 44 | if verbose: 45 | print(50 * "=") 46 | print("Get node detail") 47 | 48 | cls_list = parser.get_class_list(root_node) 49 | method_list = [] 50 | cls_metadata = [] 51 | for _cls in cls_list: 52 | cls_info = parser.get_class_metadata(_cls) 53 | cls_info["code"] = get_node_text(_cls) 54 | 55 | cls_method = [] 56 | current_class_methods = parser.get_function_list(_cls) 57 | for method in current_class_methods: 58 | method_info = parser.get_function_metadata(method) 59 | method_info['code'] = get_node_text(method) 60 | cls_method.append(method_info) 61 | 62 | cls_info["method"] = cls_method 63 | cls_metadata.append(cls_info) 64 | method_list.extend(current_class_methods) 65 | 66 | fn_list: List = parser.get_function_list(root_node) 67 | for node in fn_list[:]: 68 | if node in method_list: 69 | fn_list.remove(node) 70 | 71 | fn_metadata = [] 72 | for fn in fn_list: 73 | fn_metadata.append(parser.get_function_metadata(fn)) 74 | 75 | output_metadata = {"class": cls_metadata, "function": fn_metadata} 76 | 77 | return output_metadata 78 | 79 | 80 | def print_result(res: Dict, file_name: str = "no_name_file"): 81 | # ======== Print file name ======== 82 | print("File {name} 
analyzed:".format(name=file_name)) 83 | print(50 * "=") 84 | 85 | # ========= Summary ========= 86 | print("Number of class : {length}".format(length=len(res["class"]))) 87 | print("Number of function : {length}".format(length=len(res["function"]))) 88 | print(50 * "-" + "\n") 89 | 90 | # ========= Print class & method ========= 91 | cls_headers = ["#", "Class", "Arguments"] 92 | cls_method_headers = ["#", "Method name", "Paramters", 93 | "Type", "Return type", "Throws"] 94 | cls_info = [] 95 | method_info = {} 96 | for cls_idx, _cls in enumerate(res["class"]): 97 | cls_max_length = max(1, len(_cls["parameters"].keys())) 98 | for i in range(cls_max_length): 99 | clslist = [""] * len(cls_headers) 100 | clslist[0] = cls_idx if i < 1 else "" 101 | clslist[1] = _cls["identifier"] if i < 1 else "" 102 | if _cls["parameters"].keys(): 103 | clslist[2] = list(_cls["parameters"].keys())[i] 104 | cls_info.append(clslist) 105 | 106 | _method_info = [] 107 | for idx, method in enumerate(_cls["method"]): 108 | max_length = max(1, len(method["parameters"].keys())) 109 | for i in range(max_length): 110 | sublist = [""] * len(cls_method_headers) 111 | sublist[0] = idx if i < 1 else "" 112 | sublist[1] = method["identifier"] if i < 1 else "" 113 | if method["parameters"].keys(): 114 | sublist[2] = list(method["parameters"].keys())[i] 115 | sublist[3] = list(method["parameters"].values())[i] 116 | sublist[4] = ( 117 | method["return_type"] 118 | if i <= 1 and method["return_type"] != "" 119 | else "" 120 | ) 121 | sublist[5] = ( 122 | method["throws"] 123 | if i <= 1 and "throws" in method.keys() 124 | else "" 125 | ) 126 | _method_info.append(sublist) 127 | 128 | method_info[file_name] = [_cls["identifier"], _method_info] 129 | 130 | if cls_info: 131 | print("Class summary:") 132 | print(tabulate(cls_info, headers=cls_headers, tablefmt="outline")) 133 | print("\n") 134 | 135 | for _, info in method_info.items(): 136 | name, info = info 137 | print("Class analyse: 
{name}".format(name=name)) 138 | print(tabulate(info, headers=cls_method_headers, tablefmt="outline")) 139 | print("\n") 140 | 141 | # ========= Print stand alone function ========= 142 | fn_headers = ["#", "Function name", "Paramters", "Type", "Return type"] 143 | function_info = [] 144 | 145 | for idx, fn in enumerate(res["function"]): 146 | max_length = max(1, len(fn["parameters"].keys())) 147 | for i in range(max_length): 148 | sublist = [""] * len(fn_headers) 149 | sublist[0] = idx if i < 1 else "" 150 | sublist[1] = fn["identifier"] if i < 1 else "" 151 | if fn["parameters"].keys(): 152 | sublist[2] = list(fn["parameters"].keys())[i] 153 | sublist[3] = list(fn["parameters"].values())[i] 154 | sublist[4] = ( 155 | fn["return_type"] 156 | if i <= 1 and fn["return_type"] != "" 157 | else "" 158 | ) 159 | function_info.append(sublist) 160 | 161 | if function_info: 162 | print("Function analyse:") 163 | print(tabulate(function_info, headers=fn_headers, tablefmt="outline")) 164 | print("\n") 165 | 166 | elif not method_info: 167 | print("File empty") 168 | print("\n") 169 | 170 | 171 | PL_MATCHING = { 172 | "Java": [".java"], 173 | "JavaScript": [ 174 | ".js", 175 | "._js", 176 | ".bones", 177 | ".es6", 178 | ".jake", 179 | ".jsb", 180 | ".jscad", 181 | ".jsfl", 182 | ".jsm", 183 | ".jss", 184 | ".njs", 185 | ".pac", 186 | ".sjs", 187 | ".ssjs", 188 | ".xsjs", 189 | ".xsjslib", 190 | ], 191 | "Python": [ 192 | ".py", 193 | ".bzl", 194 | ".gyp", 195 | ".lmi", 196 | ".pyde", 197 | ".pyp", 198 | ".pyt", 199 | ".pyw", 200 | ".tac", 201 | ".wsgi", 202 | ".xpy", 203 | ], 204 | "PHP": [".php", ".aw", ".ctp", ".php3", ".php4", ".php5", ".phps", ".phpt"], 205 | "Go": [".go"], 206 | "Rust": [".rs", ".rs.in"], 207 | "Ruby": [ 208 | ".rb", 209 | ".builder", 210 | ".gemspec", 211 | ".god", 212 | ".irbrc", 213 | ".jbuilder", 214 | ".mspec", 215 | ".podspec", 216 | ".rabl", 217 | ".rake", 218 | ".rbuild", 219 | ".rbw", 220 | ".rbx", 221 | ".ru", 222 | ".ruby", 223 | ".thor", 224 
| ".watchr", 225 | ], 226 | "C": [".c", ".cats", ".h", ".idc", ".w"], 227 | "C#": [".cs", ".cake", ".cshtml", ".csx"], 228 | "C++": [ 229 | ".cpp", 230 | ".c++", 231 | ".cc", 232 | ".cp", 233 | ".cxx", 234 | ".h++", 235 | ".hh", 236 | ".hpp", 237 | ".hxx", 238 | ".inl", 239 | ".ipp", 240 | ".tcc", 241 | ".tpp", 242 | ".C", 243 | ".H", 244 | ], 245 | } 246 | -------------------------------------------------------------------------------- /src/codetext/parser/README.md: -------------------------------------------------------------------------------- 1 | # Parser Appendix 2 | 3 | With `codetext` parser, we support to extract serveral function type, however, by using `tree-sitter` grammarly, some function or some language might be not fully supported. 4 | 5 | This is the list of current supported function: 6 | 7 | -------------------------------------------------------------------------------- /src/codetext/parser/__init__.py: -------------------------------------------------------------------------------- 1 | """Codetext parser 2 | Parse code to get docstring node, comment node 3 | """ 4 | from .go_parser import GoParser 5 | from .php_parser import PhpParser 6 | from .ruby_parser import RubyParser 7 | from .java_parser import JavaParser 8 | from .javascript_parser import JavascriptParser 9 | from .python_parser import PythonParser 10 | from .cpp_parser import CppParser 11 | from .c_sharp_parser import CsharpParser 12 | from .rust_parser import RustParser 13 | from .language_parser import LanguageParser, get_node_by_kind, get_node_text, \ 14 | tokenize_code, tokenize_docstring, nodes_are_equal 15 | 16 | SUPPORT_LANGUAGE = [ 17 | "go", "php", "ruby", "java", "javascript", 18 | "python", "cpp", "c", "c_sharp", "rust" 19 | ] 20 | 21 | __all__ = [ 22 | 'GoParser', 'PhpParser', 'RubyParser', 'JavaParser', 'JavascriptParser', 23 | 'PythonParser', 'CppParser', 'CsharpParser', 'RustParser', 'LanguageParser', 24 | 'get_node_by_kind', 'get_node_text', 'tokenize_code', 
'tokenize_docstring', 25 | 'nodes_are_equal' 26 | ] 27 | -------------------------------------------------------------------------------- /src/codetext/parser/c_sharp_parser.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict, Any 2 | import tree_sitter 3 | import logging 4 | 5 | from .language_parser import LanguageParser, get_node_by_kind, get_node_text 6 | 7 | logger = logging.getLogger(name=__name__) 8 | 9 | 10 | class CsharpParser(LanguageParser): 11 | 12 | BLACKLISTED_FUNCTION_NAMES = [] 13 | 14 | @staticmethod 15 | def get_docstring(node, blob=None): 16 | """ 17 | Get docstring description for node 18 | 19 | Args: 20 | node (tree_sitter.Node) 21 | blob (str): original source code which parse the `node` 22 | Returns: 23 | str: docstring 24 | """ 25 | if blob: 26 | logger.info('From version `0.0.6` this function will update argument in the API') 27 | docstring_node = CsharpParser.get_docstring_node(node) 28 | docstring = '\n'.join(get_node_text(s) for s in docstring_node) 29 | return docstring 30 | 31 | @staticmethod 32 | def get_docstring_node(node): 33 | """ 34 | Get docstring node from it parent node. 35 | C# docstring is written line by line and stay outside it own node, see example below. 36 | 37 | Args: 38 | node (tree_sitter.Node): parent node (usually function node) to get its docstring 39 | Return: 40 | List: list of docstring nodes 41 | Example: 42 | str = ''' 43 | // 44 | // Docstring of a method 45 | // 46 | // Argument. 47 | // 48 | // None. 49 | public void honk(string animal_honk) 50 | { 51 | Console.WriteLine(animal_honk); 52 | Console.WriteLine("Tuut, tuut!"); 53 | } 54 | ''' 55 | ... 
56 | print(C_sharp.get_docstring_node(function_node)) 57 | 58 | >>> [, \ 59 | , \ 60 | , \ 61 | , \ 62 | , \ 63 | ] 64 | """ 65 | docstring_node = [] 66 | 67 | prev_node = node.prev_sibling 68 | if prev_node and prev_node.type == 'comment': 69 | docstring_node.append(prev_node) 70 | prev_node = prev_node.prev_sibling 71 | 72 | while prev_node and prev_node.type == 'comment': 73 | # Assume the comment is dense 74 | x_current = prev_node.start_point[0] 75 | x_next = prev_node.next_sibling.start_point[0] 76 | if x_next - x_current > 1: 77 | break 78 | 79 | docstring_node.insert(0, prev_node) 80 | prev_node = prev_node.prev_sibling 81 | 82 | return docstring_node 83 | 84 | @staticmethod 85 | def get_comment_node(node): 86 | """ 87 | Return all comment node inside a parent node 88 | Args: 89 | node (tree_sitter.Node) 90 | Return: 91 | List: list of comment nodes 92 | """ 93 | comment_node = get_node_by_kind(node, kind=['comment']) 94 | return comment_node 95 | 96 | @staticmethod 97 | def get_function_list(node): 98 | res = get_node_by_kind(node, ['local_function_statement', 'method_declaration']) 99 | # We don't use "constructor_declaration" 100 | return res 101 | 102 | @staticmethod 103 | def get_class_list(node): 104 | res = get_node_by_kind(node, ['class_declaration']) 105 | return res 106 | 107 | @staticmethod 108 | def get_function_metadata(function_node, blob: str = None) -> Dict[str, Any]: 109 | """ 110 | Function metadata contains: 111 | - identifier (str): function name 112 | - parameters (Dict[str, str]): parameter's name and their type (e.g: {'param_a': 'int'}) 113 | - type (str): type 114 | """ 115 | metadata = { 116 | 'identifier': '', 117 | 'parameters': {}, 118 | 'return_type': None 119 | } 120 | assert type(function_node) == tree_sitter.Node 121 | 122 | for child in function_node.children: 123 | if child.type in ['predefined_type', 'generic_name']: 124 | metadata['return_type'] = get_node_text(child) 125 | elif child.type == 'identifier': 126 | if 
child.next_named_sibling.type != 'parameter_list': 127 | metadata['return_type'] = get_node_text(child) 128 | else: 129 | metadata['identifier'] = get_node_text(child) 130 | elif child.type == 'parameter_list': 131 | for param_node in child.children: 132 | param_nodes = get_node_by_kind(param_node, ['parameter']) 133 | for param in param_nodes: 134 | if len(param.children) > 1: 135 | param_type = get_node_text(param.children[0]) 136 | param_name = get_node_text(param.children[1]) 137 | metadata['parameters'][param_name] = param_type 138 | 139 | else: 140 | param_name = get_node_text(param.children[0]) 141 | metadata['parameters'][param_name] = None 142 | # for node in param.children: 143 | # if node.type in ['array_type', 'implicit_type', \ 144 | # 'nullable_type', 'pointer_type', 'function_pointer_type', \ 145 | # 'predefined_type', 'tuple_type']: 146 | # param_type = get_node_text(node) 147 | # elif node.type == 'identifier': 148 | # param_identifier = get_node_text(node) 149 | 150 | # param_type = get_node_text(param.child_by_field_name('type')) 151 | # param_identifier = get_node_text(param.child_by_field_name('name')) 152 | return metadata 153 | 154 | @staticmethod 155 | def get_class_metadata(class_node, blob: str=None) -> Dict[str, str]: 156 | """ 157 | Class metadata contains: 158 | - identifier (str): class's name 159 | - parameters (List[str]): inheritance class 160 | """ 161 | if blob: 162 | logger.info('From version `0.0.6` this function will update argument in the API') 163 | metadata = { 164 | 'identifier': '', 165 | 'parameters': {}, 166 | } 167 | assert type(class_node) == tree_sitter.Node 168 | 169 | for child in class_node.children: 170 | if child.type == 'identifier': 171 | metadata['identifier'] = get_node_text(child) 172 | elif child.type == 'base_list': 173 | for arg in child.children: 174 | if arg.type == 'identifier': 175 | metadata['parameters'][get_node_text(arg)] = None 176 | # argument_list.append(get_node_text(arg)) 177 | # 
metadata['parameters'] = argument_list 178 | 179 | return metadata 180 | 181 | -------------------------------------------------------------------------------- /src/codetext/parser/cpp_parser.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict, Any 2 | 3 | import tree_sitter 4 | import logging 5 | 6 | from .language_parser import LanguageParser, get_node_text, get_node_by_kind 7 | 8 | logger = logging.getLogger(name=__name__) 9 | 10 | 11 | class CppParser(LanguageParser): 12 | 13 | BLACKLISTED_FUNCTION_NAMES = ['main', 'constructor'] 14 | 15 | @staticmethod 16 | def get_docstring(node, blob=None): 17 | """ 18 | Get docstring description for node 19 | 20 | Args: 21 | node (tree_sitter.Node) 22 | blob (str): original source code which parse the `node` 23 | Returns: 24 | str: docstring 25 | """ 26 | if blob: 27 | logger.info('From version `0.0.6` this function will update argument in the API') 28 | docstring_node = CppParser.get_docstring_node(node) 29 | docstring = '\n'.join(get_node_text(s) for s in docstring_node) 30 | return docstring 31 | 32 | @staticmethod 33 | def get_docstring_node(node): 34 | """ 35 | Get docstring node from it parent node. 36 | C and C++ share the same syntax. Their docstring usually is 1 single block 37 | Expect length of return list == 1 38 | 39 | Args: 40 | node (tree_sitter.Node): parent node (usually function node) to get its docstring 41 | Return: 42 | List: list of docstring nodes (expect==1) 43 | Example: 44 | str = ''' 45 | /** 46 | * Find 2 sum 47 | * 48 | * @param nums List number. 49 | * @param target Sum target. 50 | * @return postion of 2 number. 51 | */ 52 | vector twoSum(vector& nums, int target) { 53 | ... 54 | } 55 | ''' 56 | ... 
57 | print(CppParser.get_docstring_node(function_node)) 58 | 59 | >>> [] 60 | """ 61 | docstring_node = [] 62 | 63 | prev_node = node.prev_sibling 64 | if prev_node and prev_node.type == 'comment': 65 | docstring_node.append(prev_node) 66 | prev_node = prev_node.prev_sibling 67 | 68 | while prev_node and prev_node.type == 'comment': 69 | # Assume the comment is dense 70 | x_current = prev_node.start_point[0] 71 | x_next = prev_node.next_sibling.start_point[0] 72 | if x_next - x_current > 1: 73 | break 74 | 75 | docstring_node.insert(0, prev_node) 76 | prev_node = prev_node.prev_sibling 77 | 78 | return docstring_node 79 | 80 | @staticmethod 81 | def get_function_list(node): 82 | res = get_node_by_kind(node, ['function_definition']) 83 | return res 84 | 85 | @staticmethod 86 | def get_class_list(node): 87 | res = get_node_by_kind(node, ['class_specifier']) 88 | return res 89 | 90 | @staticmethod 91 | def get_comment_node(node): 92 | """ 93 | Return all comment node inside a parent node 94 | Args: 95 | node (tree_sitter.Node) 96 | Return: 97 | List: list of comment nodes 98 | """ 99 | comment_node = get_node_by_kind(node, kind=['comment']) 100 | return comment_node 101 | 102 | @staticmethod 103 | def get_function_metadata(function_node, blob: str=None) -> Dict[str, Any]: 104 | """ 105 | Function metadata contains: 106 | - identifier (str): function name 107 | - parameters (Dict[str, str]): parameter's name and their type (e.g: {'param_a': 'int'}) 108 | - return_type (str or NoneType): function's return type 109 | """ 110 | if blob: 111 | logger.info('From version `0.0.6` this function will update argument in the API') 112 | metadata = { 113 | 'identifier': '', 114 | 'parameters': {}, 115 | 'return_type': None, 116 | } 117 | assert type(function_node) == tree_sitter.Node 118 | 119 | for child in function_node.children: 120 | if child.type in ['primitive_type', 'type_identifier']: 121 | metadata['return_type'] = get_node_text(child) 122 | # search for 
"function_declarator" 123 | elif child.type == 'pointer_declarator': 124 | for subchild in child.children: 125 | if subchild.type == 'function_declarator': 126 | child = subchild 127 | if child.type == 'function_declarator': 128 | for subchild in child.children: 129 | if subchild.type in ['qualified_identifier', 'identifier', 'field_identifier']: 130 | metadata['identifier'] = get_node_text(subchild) 131 | elif subchild.type == 'parameter_list': 132 | param_nodes = get_node_by_kind(subchild, ['parameter_declaration']) 133 | for param in param_nodes: 134 | param_type = param.child_by_field_name('type') 135 | param_type = get_node_text(param_type) 136 | list_name = get_node_by_kind(param, ['identifier']) 137 | if not list_name: 138 | continue 139 | param_name = get_node_text(list_name[0]) 140 | metadata['parameters'][param_name] = param_type 141 | # for item in param.children: 142 | 143 | # if item.type in ['type_identifier', 'primitive_type']: 144 | # param_type = get_node_text(item) 145 | # elif item.type == 'identifier': 146 | # param_identifier = get_node_text(item) 147 | 148 | return metadata 149 | 150 | @staticmethod 151 | def get_class_metadata(class_node, blob: str=None) -> Dict[str, str]: 152 | """ 153 | Class metadata contains: 154 | - identifier (str): class's name 155 | - parameters (List[str]): inheritance class 156 | """ 157 | if blob: 158 | logger.info('From version `0.0.6` this function will update argument in the API') 159 | metadata = { 160 | 'identifier': '', 161 | 'parameters': {}, 162 | } 163 | assert type(class_node) == tree_sitter.Node 164 | 165 | for child in class_node.children: 166 | if child.type == 'type_identifier': 167 | metadata['identifier'] = get_node_text(child) 168 | elif child.type == 'base_class_clause': 169 | argument_list = [] 170 | for param in child.children: 171 | if param.type == 'type_identifier': 172 | metadata['parameters'][get_node_text(param)] = None 173 | # argument_list.append(get_node_text(param)) 174 | # 
metadata['parameters'] = argument_list 175 | 176 | return metadata 177 | -------------------------------------------------------------------------------- /src/codetext/parser/go_parser.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict, Any 2 | import logging 3 | 4 | from .language_parser import LanguageParser, get_node_by_kind, get_node_text 5 | 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class GoParser(LanguageParser): 11 | 12 | BLACKLISTED_FUNCTION_NAMES = ['test', 'vendor'] 13 | 14 | @staticmethod 15 | def get_comment_node(function_node): 16 | """ 17 | Return all comment node inside a parent node 18 | Args: 19 | node (tree_sitter.Node) 20 | Return: 21 | List: list of comment nodes 22 | """ 23 | comment_node = get_node_by_kind(function_node, kind='comment') 24 | return comment_node 25 | 26 | @staticmethod 27 | def get_docstring_node(node): 28 | """ 29 | Get docstring node from it parent node. 30 | Go's docstring is written line by line 31 | 32 | Args: 33 | node (tree_sitter.Node): parent node (usually function node) to get its docstring 34 | Return: 35 | List: list of docstring nodes 36 | Example: 37 | str = ''' 38 | // The path package should only be used for paths separated by forward 39 | // slashes, such as the paths in URLs. This package does not deal with 40 | // Windows paths with drive letters or backslashes; to manipulate 41 | // operating system paths, use the [path/filepath] package. 42 | func (e TypeError) Error() string { 43 | ... 44 | } 45 | ''' 46 | ... 
47 | print(GoParser.get_docstring_node(function_node)) 48 | 49 | >>> [, \ 50 | , \ 51 | , \ 52 | ] 53 | """ 54 | docstring_node = [] 55 | 56 | prev_node = node.prev_sibling 57 | if prev_node and prev_node.type == 'comment': 58 | docstring_node.append(prev_node) 59 | prev_node = prev_node.prev_sibling 60 | 61 | while prev_node and prev_node.type == 'comment': 62 | # Assume the comment is dense 63 | x_current = prev_node.start_point[0] 64 | x_next = prev_node.next_sibling.start_point[0] 65 | if x_next - x_current > 1: 66 | break 67 | 68 | docstring_node.insert(0, prev_node) 69 | prev_node = prev_node.prev_sibling 70 | 71 | return docstring_node 72 | 73 | @staticmethod 74 | def get_docstring(node, blob:str=None): 75 | """ 76 | Get docstring description for node 77 | 78 | Args: 79 | node (tree_sitter.Node) 80 | blob (str): original source code which parse the `node` 81 | Returns: 82 | str: docstring 83 | """ 84 | if blob: 85 | logger.info('From version `0.0.6` this function will update argument in the API') 86 | docstring_node = GoParser.get_docstring_node(node) 87 | docstring = '\n'.join(get_node_text(s) for s in docstring_node) 88 | return docstring 89 | 90 | @staticmethod 91 | def get_function_list(node): 92 | res = get_node_by_kind(node, ['method_declaration', 'function_declaration']) 93 | return res 94 | 95 | @staticmethod 96 | def get_function_metadata(function_node, blob: str=None) -> Dict[str, str]: 97 | if blob: 98 | logger.info('From version `0.0.6` this function will update argument in the API') 99 | metadata = { 100 | 'identifier': '', 101 | 'parameters': {}, 102 | 'return_type': None, 103 | } 104 | 105 | for child in function_node.children: 106 | if child.type in ['field_identifier', 'identifier']: 107 | metadata['identifier'] = get_node_text(child) 108 | elif child.type == 'type_identifier': 109 | metadata['return_type'] = get_node_text(child) 110 | elif child.type == 'parameter_list': 111 | for subchild in child.children: 112 | if subchild.type in 
['parameter_declaration', 'variadic_parameter_declaration']: 113 | identifier_node = subchild.child_by_field_name('name') 114 | 115 | if not identifier_node: 116 | continue 117 | 118 | param_type = get_node_text(subchild.child_by_field_name('type')) 119 | identifier = get_node_text(identifier_node) 120 | if identifier and param_type: 121 | metadata['parameters'][identifier] = param_type 122 | 123 | return metadata 124 | 125 | @staticmethod 126 | def get_class_list(node): 127 | pass 128 | 129 | @staticmethod 130 | def get_class_metadata(class_node, blob=None) -> Dict[str, str]: 131 | if blob: 132 | logger.info('From version `0.0.6` this function will update argument in the API') 133 | pass 134 | -------------------------------------------------------------------------------- /src/codetext/parser/java_parser.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import List, Dict, Any 3 | import logging 4 | 5 | from .language_parser import LanguageParser, get_node_by_kind, get_node_text 6 | 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class JavaParser(LanguageParser): 12 | 13 | FILTER_PATHS = ('test', 'tests') 14 | 15 | BLACKLISTED_FUNCTION_NAMES = ['toString', 'hashCode', 'equals', 'finalize', 'notify', 'notifyAll', 'clone'] 16 | 17 | @staticmethod 18 | def get_docstring_node(node): 19 | """ 20 | Get docstring node from it parent node. 
Expect return list have length==1 21 | 22 | Args: 23 | node (tree_sitter.Node): parent node (usually function node) to get its docstring 24 | Return: 25 | List: list of docstring nodes 26 | """ 27 | docstring_node = [] 28 | 29 | if node.prev_sibling: 30 | prev_node = node.prev_sibling 31 | if prev_node.type == 'block_comment' or prev_node.type == 'line_comment': 32 | docstring_node.append(prev_node) 33 | 34 | return docstring_node 35 | 36 | @staticmethod 37 | def get_docstring(node, blob=None): 38 | """ 39 | Get docstring description for node 40 | 41 | Args: 42 | node (tree_sitter.Node) 43 | blob (str): original source code which parse the `node` 44 | Returns: 45 | str: docstring 46 | """ 47 | if blob: 48 | logger.info('From version `0.0.6` this function will update argument in the API') 49 | docstring_node = JavaParser.get_docstring_node(node) 50 | 51 | docstring = '' 52 | if docstring_node: 53 | docstring = get_node_text(docstring_node[0]) 54 | return docstring 55 | 56 | @staticmethod 57 | def get_comment_node(function_node): 58 | """ 59 | Return all comment node inside a parent node 60 | Args: 61 | node (tree_sitter.Node) 62 | Return: 63 | List: list of comment nodes 64 | """ 65 | comment_node = get_node_by_kind(function_node, kind=['line_comment']) 66 | return comment_node 67 | 68 | @staticmethod 69 | def get_class_list(node): 70 | res = get_node_by_kind(node, ['class_declaration']) 71 | return res 72 | 73 | @staticmethod 74 | def get_function_list(node): 75 | res = get_node_by_kind(node, ['method_declaration']) 76 | return res 77 | 78 | @staticmethod 79 | def is_method_body_empty(node): 80 | for c in node.children: 81 | if c.type in {'method_body', 'constructor_body'}: 82 | if c.start_point[0] == c.end_point[0]: 83 | return True 84 | 85 | @staticmethod 86 | def get_class_metadata(class_node, blob: str=None) -> Dict[str, str]: 87 | if blob: 88 | logger.info('From version `0.0.6` this function will update argument in the API') 89 | metadata = { 90 | 
'identifier': '', 91 | 'parameters': {}, 92 | } 93 | argument_list = [] 94 | for child in class_node.children: 95 | if child.type == 'identifier': 96 | metadata['identifier'] = get_node_text(child) 97 | elif child.type == 'superclass' or child.type == 'super_interfaces': 98 | for subchild in child.children: 99 | if subchild.type == 'type_list' or subchild.type == 'type_identifier': 100 | metadata['parameters'][get_node_text(subchild)] = None 101 | # argument_list.append(get_node_text(subchild)) 102 | 103 | # metadata['parameters'] = argument_list 104 | return metadata 105 | 106 | @staticmethod 107 | def get_function_metadata(function_node, blob: str = None) -> Dict[str, str]: 108 | metadata = { 109 | 'identifier': '', 110 | 'parameters': {}, 111 | 'return_type': None 112 | } 113 | 114 | return_kinds = ["void_type", 115 | "integral_type", 116 | "floating_point_type", 117 | "boolean_type", 118 | "type_identifier", 119 | "scoped_type_identifier", 120 | "generic_type"] 121 | 122 | 123 | for child in function_node.children: 124 | if child.type == 'identifier': 125 | metadata['identifier'] = get_node_text(child) 126 | elif child.type in return_kinds: 127 | metadata['return_type'] = get_node_text(child) 128 | elif child.type == 'throws': 129 | for subchild in child.children: 130 | if 'identifier' in subchild.type: 131 | metadata['throws'] = get_node_text(subchild) 132 | elif child.type == 'formal_parameters': 133 | param_list = get_node_by_kind(child, ['formal_parameter']) # speed_parameter 134 | for param in param_list: 135 | param_type = get_node_text(param.child_by_field_name('type')) 136 | identifier = get_node_text(param.child_by_field_name('name')) 137 | metadata['parameters'][identifier] = param_type 138 | 139 | 140 | return metadata -------------------------------------------------------------------------------- /src/codetext/parser/javascript_parser.py: -------------------------------------------------------------------------------- 1 | from typing import List, 
Dict, Any 2 | import logging 3 | 4 | from .language_parser import LanguageParser, get_node_text, get_node_by_kind 5 | 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class JavascriptParser(LanguageParser): 11 | 12 | FILTER_PATHS = ('test', 'node_modules') 13 | 14 | BLACKLISTED_FUNCTION_NAMES = ['toString', 'toLocaleString', 'valueOf', 'constructor'] 15 | 16 | @staticmethod 17 | def get_docstring_node(node): 18 | docstring_node = [] 19 | prev_node = node.prev_sibling 20 | parent_node = node.parent 21 | 22 | if prev_node and prev_node.type == 'comment': 23 | docstring_node.append(prev_node) 24 | 25 | elif parent_node: 26 | if parent_node.type != 'class_body': # node not inside a class 27 | prev_node = parent_node.prev_sibling 28 | if prev_node and prev_node.type == 'comment': 29 | docstring_node.append(prev_node) 30 | 31 | return docstring_node 32 | 33 | @staticmethod 34 | def get_docstring(node, blob=None): 35 | if blob: 36 | logger.info('From version `0.0.6` this function will update argument in the API') 37 | docstring_node = JavascriptParser.get_docstring_node(node) 38 | 39 | docstring = '' 40 | if docstring_node: 41 | docstring = get_node_text(docstring_node[0]) 42 | return docstring 43 | 44 | @staticmethod 45 | def get_comment_node(function_node): 46 | comment_node = get_node_by_kind(function_node, kind=['comment']) 47 | return comment_node 48 | 49 | @staticmethod 50 | def get_function_list(node): 51 | function_types = ['function_declaration', 52 | 'function', 53 | 'method_definition', 54 | 'generator_function_declaration', 55 | 'arrow_function', 56 | 'generator_function'] 57 | res = get_node_by_kind(node, function_types) 58 | for node in res[:]: 59 | if not node.children: 60 | res.remove(node) 61 | 62 | return res 63 | 64 | @staticmethod 65 | def get_class_list(node): 66 | res = get_node_by_kind(node, ['class_declaration', 'class']) 67 | for node in res[:]: 68 | if not node.children: 69 | res.remove(node) 70 | 71 | return res 72 | 73 | @staticmethod 
74 | def get_function_metadata(function_node, blob: str=None) -> Dict[str, str]: 75 | if blob: 76 | logger.info('From version `0.0.6` this function will update argument in the API') 77 | metadata = { 78 | 'identifier': '', 79 | 'parameters': {}, 80 | 'return_type': None, 81 | } 82 | param = [] 83 | for child in function_node.children: 84 | if child.type in ['identifier', 'property_identifier']: 85 | metadata['identifier'] = get_node_text(child) 86 | elif child.type == 'formal_parameters': 87 | params = get_node_by_kind(child, ['identifier']) 88 | for param in params: 89 | identifier = get_node_text(param) 90 | metadata['parameters'][identifier] = None # JS not have type define 91 | 92 | return_statement = get_node_by_kind(function_node, ['return_statement']) 93 | if len(return_statement) > 0: 94 | metadata['return_type'] = '' 95 | 96 | if function_node.type in ["function", 97 | "arrow_function", 98 | "generator_function"]: 99 | # function inside object property or variable declarator 100 | identifier = function_node.prev_named_sibling 101 | if identifier: 102 | if identifier.type in ["identifier"]: 103 | metadata["identifier"] = identifier.text.decode() 104 | 105 | return metadata 106 | 107 | @staticmethod 108 | def get_class_metadata(class_node, blob=None): 109 | if blob: 110 | logger.info('From version `0.0.6` this function will update argument in the API') 111 | metadata = { 112 | 'identifier': '', 113 | 'parameters': {}, 114 | } 115 | param = [] 116 | for child in class_node.children: 117 | if child.type == 'identifier': 118 | metadata['identifier'] = get_node_text(child) 119 | elif child.type == 'class_heritage': 120 | for subchild in child.children: 121 | if subchild.type == 'identifier': 122 | metadata['parameters'][get_node_text(subchild)] = None 123 | # param.append(get_node_text(subchild)) 124 | 125 | # metadata['parameters'] = param 126 | return metadata 127 | -------------------------------------------------------------------------------- 
def parent_and_previous_sibling(tree, node):
    """
    Find the parent of `node` and the sibling that precedes it.

    Merges the `node_parent` and `previous_sibling` lookups.

    Args:
        tree (tree_sitter.Tree): tree that contains `node`
        node (tree_sitter.Node): node to locate
    Returns:
        Tuple: `(parent, previous_sibling)`; `previous_sibling` is None when
            `node` is the first child of its parent
    Raises:
        ValueError: if `node` cannot be found in `tree`
    """
    parent = node_parent(tree, node)
    for i, node_at_i in enumerate(parent.children):
        if nodes_are_equal(node, node_at_i):
            if i > 0:
                return parent, parent.children[i - 1]
            return parent, None

    # Raise like `node_parent` does; the exception used to be returned as a value.
    raise ValueError("Could not find node in tree.")
def traverse_type_parent(node, kind: List) -> List:
    """
    Collect every node of the wanted types together with its parent.

    Args:
        node: root node to search; it and its descendants must expose
            `children` and `type`
        kind (List): node types to look for
    Returns:
        List: `[parent, child]` pairs, one per matching descendant of `node`
    """
    results = []
    to_visit = [node]
    while len(to_visit) > 0:
        current = to_visit.pop()
        for child in current.children:
            if child.type in kind:
                results.append([current, child])
        # Queue each child exactly once. Extending inside the per-child branch
        # re-queued all siblings for every non-matching child, which duplicated
        # results and only sometimes descended into matching nodes.
        to_visit.extend(current.children)

    return results
def match_from_spans(nodes, blob: str) -> tuple:
    """
    Get the text covered by several nodes.

    The span runs from the earliest start position to the latest end position
    among `nodes`; everything in between is included.

    Args:
        nodes (List): non-empty list of `tree_sitter.Node`
        blob (str): full source
    Return:
        tuple: `(text, start_node, end_node)`, where `start_node` is the node
            with the earliest start and `end_node` the node with the latest end
    """
    assert len(nodes) != 0, "Empty node list"

    # Compare full (row, column) positions, and choose start and end
    # independently: one node may provide both, and nodes may share a line.
    start_point = min(nodes, key=lambda n: tuple(n.start_point))
    end_point = max(nodes, key=lambda n: tuple(n.end_point))

    line_start = start_point.start_point[0]
    char_start = start_point.start_point[1]
    line_end = end_point.end_point[0]
    char_end = end_point.end_point[1]

    lines = blob.split('\n')
    if line_start != line_end:
        string = '\n'.join(
            [lines[line_start][char_start:]]
            + lines[line_start + 1:line_end]
            + [lines[line_end][:char_end]]
        )
    else:
        string = lines[line_start][char_start:char_end]

    return string, start_point, end_point
class PhpParser(LanguageParser):
    """
    Static helpers that pull functions, classes, comments and their metadata
    out of a PHP tree-sitter tree.
    """

    FILTER_PATHS = ('test', 'tests')

    BLACKLISTED_FUNCTION_NAMES = ['__construct', '__destruct', '__call', '__callStatic',
                                  '__get', '__set', '__isset', '__unset',
                                  '__sleep', '__wakeup', '__toString', '__invoke',
                                  '__set_state', '__clone', '__debugInfo', '__serialize',
                                  '__unserialize']

    @staticmethod
    def get_docstring(node, blob: str=None) -> str:
        """
        Get the text of the comment placed right before `node`.

        Args:
            node (tree_sitter.Node): Function or class node
            blob (str): Unused, only triggers an informational log message

        Return:
            str: comment text, or an empty string when there is none
        """
        if blob:
            logger.info('From version `0.0.6` this function will update argument in the API')
        docstring_node = PhpParser.get_docstring_node(node)

        docstring = ''
        if docstring_node:
            docstring = get_node_text(docstring_node[0])

        return docstring

    @staticmethod
    def get_docstring_node(node):
        """
        Get the comment node that directly precedes `node`.

        Return:
            List[tree_sitter.Node]: list holding the previous sibling when it is
                a `comment`, otherwise an empty list
        """
        docstring_node = []

        prev_node = node.prev_sibling
        if prev_node is not None and prev_node.type == 'comment':
            docstring_node.append(prev_node)

        return docstring_node

    @staticmethod
    def get_comment_node(function_node):
        """
        Get every `comment` node found under `function_node`.
        """
        # Pass a list like every other caller does; a bare string would be
        # treated as a sequence of characters by `get_node_by_kind`'s checks.
        comment_node = get_node_by_kind(function_node, kind=['comment'])
        return comment_node

    @staticmethod
    def get_class_list(node):
        """
        Get all class, trait and interface declarations under `node`.
        """
        res = get_node_by_kind(node, ['class_declaration',
                                      'trait_declaration',
                                      'interface_declaration'])
        return res

    @staticmethod
    def get_function_list(node):
        """
        Get all function definitions and method declarations under `node`.
        """
        res = get_node_by_kind(node, ['function_definition', 'method_declaration'])
        return res

    @staticmethod
    def get_function_metadata(function_node, blob: str=None) -> Dict[str, str]:
        """
        Collect name, parameters and return type of a function node.

        Args:
            function_node (tree_sitter.Node): Function or method node
            blob (str): Unused, only triggers an informational log message

        Return:
            Dict[str, str]: `identifier` (name), `parameters` (name -> type text
                or None) and `return_type` (type text; `''` when no type was
                found but the body has a return statement; None otherwise)
        """
        if blob:
            logger.info('From version `0.0.6` this function will update argument in the API')
        metadata = {
            'identifier': '',
            'parameters': {},
            'return_type': None,
        }
        parameter_kinds = ['simple_parameter', 'variadic_parameter', 'property_promotion_parameter']

        for n in function_node.children:
            if n.type == 'name':
                metadata['identifier'] = get_node_text(n)
            elif n.type in ['union_type', 'intersection_type']:
                metadata['return_type'] = get_node_text(n)
            elif n.type == 'formal_parameters':
                for param_node in n.children:
                    if param_node.type not in parameter_kinds:
                        continue
                    name_node = param_node.child_by_field_name('name')
                    if name_node is None:
                        # No `name` child to read; `get_node_text` rejects None,
                        # so skip this parameter instead of failing.
                        continue
                    identifier = get_node_text(name_node)
                    param_type = param_node.child_by_field_name('type')
                    metadata['parameters'][identifier] = get_node_text(param_type) if param_type else None

        if not metadata['return_type']:
            # No declared type: '' marks "returns something", None marks "no return statement"
            return_statement = get_node_by_kind(function_node, ['return_statement'])
            metadata['return_type'] = '' if len(return_statement) > 0 else None

        return metadata

    @staticmethod
    def get_class_metadata(class_node, blob: str=None):
        """
        Collect the name and the base-clause names of a class node.

        Args:
            class_node (tree_sitter.Node): Class, trait or interface node
            blob (str): Unused, only triggers an informational log message

        Return:
            dict: `identifier` (name) and `parameters` (each `name` child of the
                `base_clause` mapped to None)
        """
        if blob:
            logger.info('From version `0.0.6` this function will update argument in the API')
        metadata = {
            'identifier': '',
            'parameters': {},
        }
        assert type(class_node) == tree_sitter.Node

        for child in class_node.children:
            if child.type == 'name':
                metadata['identifier'] = get_node_text(child)
            elif child.type == 'base_clause':
                for param in child.children:
                    if param.type == 'name':
                        metadata['parameters'][get_node_text(param)] = None

        return metadata
@staticmethod
def get_function_metadata(function_node, blob: str=None) -> Dict[str, str]:
    """
    Collect name, parameters and return type of a Python function node.

    Args:
        function_node (tree_sitter.Node): `function_definition` node
        blob (str): Unused, only triggers an informational log message

    Return:
        Dict[str, str]: `identifier` (name), `parameters` (name -> annotation
            text or None) and `return_type` (annotation text; `''` when there is
            no annotation but a return statement exists; None otherwise)
    """
    if blob:
        logger.info('From version `0.0.6` this function will update argument in the API')

    name = ''
    params = {}
    return_type = None
    annotated_kinds = ['typed_parameter', 'default_parameter', 'typed_default_parameter']

    for part in function_node.children:
        if part.type == 'identifier':
            name = get_node_text(part)
        elif part.type == 'parameters':
            for entry in part.children:
                if entry.type == 'identifier':
                    # Plain positional parameter without annotation or default
                    params[get_node_text(entry)] = None
                elif entry.type in annotated_kinds:
                    type_nodes = get_node_by_kind(entry, ['type'])
                    annotation = get_node_text(type_nodes[0]) if type_nodes else None
                    name_nodes = get_node_by_kind(entry, ['identifier'])
                    assert len(name_nodes) != 0, "Empty identifier"
                    params[get_node_text(name_nodes[0])] = annotation
        elif part.type == 'type':
            return_type = get_node_text(part)

    if not return_type:
        # No annotation found: fall back on whether the body returns anything
        has_return = len(get_node_by_kind(function_node, ['return_statement'])) > 0
        return_type = '' if has_return else None

    return {
        'identifier': name,
        'parameters': params,
        'return_type': return_type,
    }
class RubyParser(LanguageParser):
    """
    Static helpers that pull methods, classes/modules, comments and their
    metadata out of a Ruby tree-sitter tree.
    """

    FILTER_PATHS = ('test', 'vendor')

    BLACKLISTED_FUNCTION_NAMES = ['initialize', 'to_text', 'display', 'dup', 'clone', 'equal?', '==', '<=>',
                                  '===', '<=', '<', '>', '>=', 'between?', 'eql?', 'hash']

    @staticmethod
    def get_function_list(node):
        """
        Get all `method` and `singleton_method` nodes under `node`.
        """
        res = get_node_by_kind(node, ['method',
                                      'singleton_method'])
        return res

    @staticmethod
    def get_class_list(node):
        """
        Get all `class` and `module` nodes under `node` that have children.
        """
        res = get_node_by_kind(node, ['class', 'module'])

        # remove class keywords: nodes of these types without children are dropped
        return [item for item in res if item.children]

    @staticmethod
    def get_docstring_node(node) -> List[tree_sitter.Node]:
        """
        Get the run of comment nodes that precedes `node`.

        The previous sibling of `node` is used; when it is missing or not a
        comment, the previous sibling of the parent of `node` is used instead.

        Return:
            List[tree_sitter.Node]: comment nodes in source order, empty when
                no comment precedes the node
        """
        docstring_node = []

        prev_node = node.prev_sibling
        if not prev_node or prev_node.type != 'comment':
            parent_node = node.parent
            if parent_node:
                prev_node = parent_node.prev_sibling

        if prev_node and prev_node.type == 'comment':
            docstring_node.append(prev_node)
            prev_node = prev_node.prev_sibling

        while prev_node and prev_node.type == 'comment':
            # Assume the comment is dense: stop at the first gap of more than
            # one line between a comment and the sibling that follows it
            x_current = prev_node.start_point[0]
            x_next = prev_node.next_sibling.start_point[0]
            if x_next - x_current > 1:
                break

            docstring_node.insert(0, prev_node)
            prev_node = prev_node.prev_sibling

        return docstring_node

    @staticmethod
    def get_docstring(node, blob=None):
        """
        Get the text of the comments preceding `node`.

        Args:
            node (tree_sitter.Node): Method, class or module node
            blob (str): Unused, only triggers an informational log message

        Return:
            str: comment lines joined by newlines; lines containing `=begin`
                or `=end` are left out
        """
        if blob:
            logger.info('From version `0.0.6` this function will update argument in the API')
        docstring_node = RubyParser.get_docstring_node(node)
        docstring = []
        for item in docstring_node:
            for line in get_node_text(item).split('\n'):
                if '=begin' in line or '=end' in line:
                    continue
                docstring.append(line)

        return '\n'.join(docstring)

    @staticmethod
    def get_function_metadata(function_node, blob=None) -> Dict[str, str]:
        """
        Collect name and parameters of a Ruby method node.

        Args:
            function_node (tree_sitter.Node): `method` or `singleton_method` node
            blob (str): Unused, only triggers an informational log message

        Return:
            Dict[str, str]: `identifier` (name), `parameters` (name -> None) and
                `return_type` (`''` when a `return` node exists under the
                method, None otherwise)
        """
        if blob:
            logger.info('From version `0.0.6` this function will update argument in the API')
        metadata = {
            'identifier': '',
            'parameters': {},
            'return_type': None,
        }

        assert type(function_node) == tree_sitter.Node
        assert function_node.type in ['method', 'singleton_method']

        for child in function_node.children:
            if child.type == 'identifier':
                metadata['identifier'] = get_node_text(child)
            elif child.type in ['method_parameters', 'parameters', 'bare_parameters']:
                for item in get_node_by_kind(child, ['identifier']):
                    metadata['parameters'][get_node_text(item)] = None

        # Nothing above records a type, so only the presence of a `return` node is reported
        return_statement = get_node_by_kind(function_node, ['return'])
        metadata['return_type'] = '' if len(return_statement) > 0 else None

        return metadata

    @staticmethod
    def get_class_metadata(class_node, blob=None):
        """
        Collect the name and superclass constants of a class/module node.

        Args:
            class_node (tree_sitter.Node): `class` or `module` node
            blob (str): Unused, only triggers an informational log message

        Return:
            dict: `identifier` (text of the last direct `constant` child) and
                `parameters` (each `constant` under `superclass` mapped to None)
        """
        if blob:
            logger.info('From version `0.0.6` this function will update argument in the API')
        metadata = {
            'identifier': '',
            'parameters': {},
        }

        assert type(class_node) == tree_sitter.Node

        for child in class_node.children:
            if child.type == 'constant':
                metadata['identifier'] = get_node_text(child)
            elif child.type == 'superclass':
                for subchild in child.children:
                    if subchild.type == 'constant':
                        metadata['parameters'][get_node_text(subchild)] = None

        return metadata

    @staticmethod
    def get_comment_node(function_node):
        """
        Get every `comment` node found under `function_node`.
        """
        # Pass a list like every other caller does; a bare string would be
        # treated as a sequence of characters by `get_node_by_kind`'s checks.
        comment_node = get_node_by_kind(function_node, kind=['comment'])
        return comment_node

    @staticmethod
    def get_action_list(action_node):
        """
        Get the `call` nodes under `action_node` that contain a `do_block`.
        """
        call_nodes = get_node_by_kind(action_node, ['call'])
        return [call_node for call_node in call_nodes
                if get_node_by_kind(call_node, ["do_block"])]

    @staticmethod
    def get_action_metadata(action_node):
        """
        Collect name and block parameters of a call-with-block node.

        Return:
            dict: `identifier` (text of the direct `identifier` child, followed
                by the first `simple_symbol` of the argument list when there is
                one), `parameters` (identifiers of the first `block_parameters`
                node mapped to None) and `return_type` (always None)
        """
        metadata = {
            'identifier': '',
            'parameters': {},
            'return_type': None,
        }

        for child in action_node.children:
            if child.type in ["identifier"]:
                metadata['identifier'] = get_node_text(child)
            if child.type in ["argument_list"]:
                symbol = get_node_by_kind(child, ["simple_symbol"])
                if symbol:
                    metadata['identifier'] += get_node_text(symbol[0])

        parameters = get_node_by_kind(action_node, ["block_parameters"])

        if parameters:
            for param in get_node_by_kind(parameters[0], ["identifier"]):
                metadata['parameters'][get_node_text(param)] = None

        return metadata
@staticmethod
def get_function_metadata(function_node, blob=None) -> Dict[str, str]:
    """
    Collect name, parameters and return type of a Rust function node.

    Args:
        function_node (tree_sitter.Node): `function_item` node
        blob (str): Unused, only triggers an informational log message

    Return:
        Dict[str, str]: `identifier` (name), `parameters` (name -> type text or
            None; a `self_parameter` is stored under its full text) and
            `return_type` (text of a direct `reference_type` child; `''` when
            none was found but a `return_expression` exists; None otherwise)
    """
    if blob:
        logger.info('From version `0.0.6` this function will update argument in the API')
    metadata = {
        'identifier': '',
        'parameters': {},
        'return_type': None,
    }

    assert type(function_node) == tree_sitter.Node
    assert function_node.type == 'function_item'

    for child in function_node.children:
        if child.type == 'identifier':
            metadata['identifier'] = get_node_text(child)
        elif child.type in ['parameters']:
            params = get_node_by_kind(child, ['parameter', 'variadic_parameter', 'self_parameter'])
            for item in params:
                if item.type == 'self_parameter':
                    metadata['parameters'][get_node_text(item)] = None
                    continue

                param_name = ''
                for subchild in item.children:
                    if subchild.type == 'mutable_specifier':
                        # `mut` only modifies the binding that follows it, so keep
                        # looking: `mut x: T` is named `x`, `mut self: T` is `self`.
                        continue
                    if subchild.type in ['identifier', 'self']:
                        param_name = get_node_text(subchild)
                        break

                param_type = item.child_by_field_name('type')
                metadata['parameters'][param_name] = get_node_text(param_type) if param_type else None

        # NOTE(review): only a direct `reference_type` child is recorded as the
        # return type; other return types fall through to the check below - confirm intended.
        if child.type == 'reference_type':
            metadata['return_type'] = get_node_text(child)

    if not metadata['return_type']:
        return_statement = get_node_by_kind(function_node, ['return_expression'])
        metadata['return_type'] = '' if len(return_statement) > 0 else None

    return metadata
def module_available(module_path: str) -> bool:
    """Check if a module path is available in your environment.
    Source: pytorch_lightning/utilities/imports.py

    Each dotted part after the first is resolved as an attribute of the
    previous one; when the attribute is missing, an import of the dotted path
    is attempted, so submodules that were not imported yet are still found.

    .. code-block:: python

        >>> module_available('os')
        True
        >>> module_available('os.bla')
        False
        >>> module_available('bla.bla')
        False
        >>> module_available('xml.etree.ElementTree')
        True

    Args:
        module_path (str): Dotted path such as ``package.module.attribute``

    Return:
        bool: True when every part of the path can be resolved
    """
    module_names = module_path.split(".")
    if find_spec(module_names[0]) is None:
        return False
    try:
        module = importlib.import_module(module_names[0])
    except ImportError:
        # The package is installed but cannot be imported
        return False

    qualified_name = module_names[0]
    for name in module_names[1:]:
        qualified_name = f"{qualified_name}.{name}"
        if hasattr(module, name):
            module = getattr(module, name)
            continue
        # A submodule only becomes an attribute of its package once it has
        # been imported, so try importing it before giving up.
        try:
            module = importlib.import_module(qualified_name)
        except ImportError:
            return False
    return True
def parse_code(raw_code: str, language: str='Auto', tree_sitter_path: str=None) -> "tree_sitter.Tree":
    """
    Auto parse raw code into `tree_sitter.Tree`

    Args:
        raw_code (str): Raw source code need to parse (`str` or `bytes`)
        language (str): Language to load parser; `c#` and `c++` are accepted as
            aliases of `c_sharp` and `cpp`
        tree_sitter_path (str): Directory holding the `tree-sitter/` folder with
            built languages, used only when `tree_sitter_languages` cannot be
            imported (default: directory of the calling script)

    Return:
        tree_sitter.Tree: parsed tree

    Raises:
        NotImplementedError: `language` is `Auto` (detection is not implemented)
        ValueError: `raw_code` is neither `str` nor `bytes`
    """
    # TODO: auto detect language
    language = str(language).lower()
    if language == 'auto':
        # `NotImplemented` is a constant, not an exception class: calling it
        # fails with a TypeError instead of reporting the missing feature.
        raise NotImplementedError("This feature is underdevelopment")
    if language == 'c#':
        language = 'c_sharp'
    elif language == 'c++':
        language = 'cpp'
    assert language in SUPPORTED_LANGUAGE, f"Expect {language} in {SUPPORTED_LANGUAGE}"

    # Reject unsupported input before any language is loaded or built
    if isinstance(raw_code, str):
        raw_code = bytes(raw_code, 'utf8')
    elif not isinstance(raw_code, bytes):
        raise ValueError(f"Expect `str`, got {type(raw_code)}")

    if tree_sitter_path:
        load_path = tree_sitter_path
    else:
        # Must stay in this function: frame 1 is the caller of `parse_code`
        calling_script_path = Path(inspect.getframeinfo(sys._getframe(1)).filename)
        load_path = str(calling_script_path.parent)

    # Get parser from languages
    parser = Parser()
    try:
        from tree_sitter_languages import get_language
        language = get_language(language)
    except ImportError:
        # Work-around when pre-built binaries wheels for tree-sitter-languages are not available
        logger.warning(f"Troubled importing 'tree-sitter-languages', attemp to look for pre-built binaries in the workspace")
        ts_lang_path = os.path.join(load_path, 'tree-sitter', f'{language}.so')
        if not os.path.exists(ts_lang_path):
            logger.warning(f"Not found `{language}.so` in `{load_path}/tree-sitter/`, attemp to build language")
            build_language(language, load_path)
        language = Language(load_path + f"/tree-sitter/{language}.so", language)
    parser.set_language(language)

    tree = parser.parse(raw_code)
    return tree
-------------------------------------------------------------------------------- /tests/test_clean/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FSoft-AI4Code/CodeText-parser/d2a7365f7f944650e84d9fdb6b6794d6c5ea620b/tests/test_clean/__init__.py -------------------------------------------------------------------------------- /tests/test_clean/test_clean_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FSoft-AI4Code/CodeText-parser/d2a7365f7f944650e84d9fdb6b6794d6c5ea620b/tests/test_clean/test_clean_utils.py -------------------------------------------------------------------------------- /tests/test_parser/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FSoft-AI4Code/CodeText-parser/d2a7365f7f944650e84d9fdb6b6794d6c5ea620b/tests/test_parser/__init__.py -------------------------------------------------------------------------------- /tests/test_parser/test_c.py: -------------------------------------------------------------------------------- 1 | '''test for C++ parser''' 2 | import os 3 | import unittest 4 | 5 | from src.codetext.parser import CppParser 6 | from src.codetext.utils import parse_code 7 | 8 | 9 | class Test_CppParser_with_C(unittest.TestCase): 10 | def setUp(self) -> None: 11 | with open('tests/test_parser/test_sample/c_test_sample.c', 'r') as file: 12 | self.code_sample = file.read() 13 | 14 | tree = parse_code(self.code_sample, 'c') 15 | self.root_node = tree.root_node 16 | 17 | return super().setUp() 18 | 19 | def test_get_function_list(self): 20 | root = self.root_node 21 | 22 | function_list = CppParser.get_function_list(root) 23 | 24 | self.assertEqual(len(function_list), 2) 25 | 26 | def test_get_function_metadata(self): 27 | root = self.root_node 28 | 29 | function = CppParser.get_function_list(root)[0] 30 | 
metadata = CppParser.get_function_metadata(function) 31 | 32 | for key in ['identifier', 'parameters', 'return_type']: 33 | self.assertTrue(key in metadata.keys()) 34 | self.assertEqual(metadata['parameters'], {'random_seed': 'int'}) 35 | self.assertEqual(metadata['identifier'], 'reverseSentence') 36 | self.assertEqual(metadata['return_type'], 'void') 37 | 38 | def test_get_class_list(self): 39 | pass 40 | 41 | def test_get_class_metadata(self): 42 | pass 43 | 44 | def test_get_docstring(self): 45 | code_sample = """ 46 | /** 47 | * A brief description. A more elaborate class description 48 | * @param random_seed somearg. 49 | * @see Test() 50 | * @return The test results 51 | */ 52 | void reverseSentence(int random_seed) { 53 | char c; 54 | scanf("%c", &c); 55 | if (c != '\n') { 56 | reverseSentence(); 57 | printf("%c", c); 58 | } 59 | } 60 | """ 61 | tree = parse_code(code_sample, 'c') 62 | root = tree.root_node 63 | 64 | fn= CppParser.get_function_list(root)[0] 65 | 66 | docs = CppParser.get_docstring(fn) 67 | 68 | self.assertEqual(docs, '/**\n * A brief description. 
A more elaborate class description\n * @param random_seed somearg.\n * @see Test()\n * @return The test results\n */') 69 | 70 | 71 | def test_extract_docstring(self): 72 | pass 73 | 74 | 75 | if __name__ == '__main__': 76 | unittest.main() 77 | -------------------------------------------------------------------------------- /tests/test_parser/test_cpp.py: -------------------------------------------------------------------------------- 1 | '''test for C++ parser''' 2 | import os 3 | import unittest 4 | from pathlib import Path 5 | 6 | from src.codetext.parser import CppParser 7 | from src.codetext.utils import parse_code 8 | 9 | 10 | class Test_CppParser(unittest.TestCase): 11 | def setUp(self) -> None: 12 | with open('tests/test_parser/test_sample/cpp_test_sample.cpp', 'r') as file: 13 | self.code_sample = file.read() 14 | 15 | tree = parse_code(self.code_sample, 'c++') 16 | self.root_node = tree.root_node 17 | 18 | return super().setUp() 19 | 20 | def test_get_function_list(self): 21 | root = self.root_node 22 | 23 | function_list = CppParser.get_function_list(root) 24 | 25 | self.assertEqual(len(function_list), 3) 26 | 27 | def test_get_class_list(self): 28 | root = self.root_node 29 | 30 | class_list = CppParser.get_class_list(root) 31 | 32 | self.assertEqual(len(class_list), 2) 33 | 34 | def test_get_function_metadata(self): 35 | root = self.root_node 36 | 37 | function = list(CppParser.get_function_list(root))[0] 38 | metadata = CppParser.get_function_metadata(function) 39 | 40 | for key in ['identifier', 'parameters', 'return_type']: 41 | self.assertTrue(key in metadata.keys(), "Missing {}".format(key)) 42 | self.assertEqual(metadata['parameters'], {'a': 'int', 'b': 'int'}) 43 | self.assertEqual(metadata['identifier'], 'sum2number') 44 | self.assertEqual(metadata['return_type'], 'int') 45 | 46 | def test_get_class_metadata(self): 47 | root = self.root_node 48 | 49 | classes = list(CppParser.get_class_list(root))[0] 50 | metadata = 
CppParser.get_class_metadata(classes) 51 | 52 | self.assertEqual(metadata['parameters'], {'Vehicle': None, 'B': None}) 53 | self.assertEqual(metadata['identifier'], 'Car') 54 | 55 | def test_get_docstring(self): 56 | code_sample = """ 57 | /** 58 | * Find 2 sum 59 | * 60 | * @param nums List number. 61 | * @param target Sum target. 62 | * @return postion of 2 number. 63 | */ 64 | vector twoSum(vector& nums, int target) { 65 | map m; 66 | vector v; 67 | int n= nums.size(); 68 | for(int i=0;isecond); 76 | v.push_back(i); 77 | } 78 | m.insert(make_pair(nums[i],i)); 79 | } 80 | 81 | return v; 82 | } 83 | 84 | // Comment in 85 | // multiple line 86 | // of the function sum 87 | double sum2num(int a, int b) { 88 | return a + b; 89 | } 90 | """ 91 | tree = parse_code(code_sample, 'c++') 92 | root = tree.root_node 93 | 94 | fn1, fn2 = list(CppParser.get_function_list(root)) 95 | 96 | docs1 = CppParser.get_docstring(fn1) 97 | docs2 = CppParser.get_docstring(fn2) 98 | 99 | self.assertEqual(docs1, '/**\n * Find 2 sum\n *\n * @param nums List number.\n * @param target Sum target.\n * @return postion of 2 number.\n */') 100 | self.assertEqual(docs2, '// Comment in\n// multiple line\n// of the function sum') 101 | 102 | 103 | if __name__ == '__main__': 104 | unittest.main() 105 | -------------------------------------------------------------------------------- /tests/test_parser/test_csharp.py: -------------------------------------------------------------------------------- 1 | '''test for C# parser''' 2 | import os 3 | import unittest 4 | from pathlib import Path 5 | 6 | from src.codetext.parser import CsharpParser 7 | from src.codetext.utils import parse_code 8 | 9 | 10 | class Test_CsharpParser(unittest.TestCase): 11 | def setUp(self) -> None: 12 | with open('tests/test_parser/test_sample/c_sharp_test_sample.cs', 'r') as file: 13 | self.code_sample = file.read() 14 | 15 | tree = parse_code(self.code_sample, 'c#') 16 | self.root_node = tree.root_node 17 | 18 | return 
super().setUp() 19 | 20 | def test_get_function_list(self): 21 | root = self.root_node 22 | 23 | function_list = CsharpParser.get_function_list(root) 24 | 25 | self.assertEqual(len(function_list), 3) # exclude constructor 26 | 27 | def test_get_class_list(self): 28 | root = self.root_node 29 | 30 | class_list = CsharpParser.get_class_list(root) 31 | 32 | self.assertEqual(len(class_list), 1) 33 | 34 | def test_get_docstring(self): 35 | code_sample = """ 36 | class Vehicle 37 | { 38 | public string brand = "Ford"; // Vehicle field 39 | 40 | // 41 | // Docstring of a method 42 | // 43 | // Argument. 44 | // 45 | // None. 46 | public void honk(string animal_honk) 47 | { 48 | Console.WriteLine(animal_honk); 49 | Console.WriteLine("Tuut, tuut!"); 50 | } 51 | 52 | /* Another method docstring 53 | in multiple line */ 54 | public void _honk() 55 | { 56 | Console.WriteLine("Tuut, tuut!"); 57 | } 58 | } 59 | """ 60 | tree = parse_code(code_sample, 'c#') 61 | root = tree.root_node 62 | 63 | fn1, fn2 = list(CsharpParser.get_function_list(root)) 64 | 65 | docs1 = CsharpParser.get_docstring(fn1) 66 | docs2 = CsharpParser.get_docstring(fn2) 67 | 68 | self.assertEqual(docs1, '// \n// Docstring of a method\n// \n// Argument.\n// \n// None.') 69 | self.assertEqual(docs2, '/* Another method docstring\n in multiple line */') 70 | 71 | 72 | def test_get_function_metadata(self): 73 | root = self.root_node 74 | 75 | function = list(CsharpParser.get_function_list(root))[0] 76 | metadata = CsharpParser.get_function_metadata(function) 77 | 78 | for key in ['identifier', 'parameters', 'return_type']: 79 | self.assertTrue(key in metadata.keys()) 80 | self.assertEqual(metadata['parameters'], {'path': 'string', 'filename': 'string'}) 81 | self.assertEqual(metadata['identifier'], 'GetText') 82 | self.assertEqual(metadata['return_type'], 'string') 83 | 84 | def test_get_class_metadata(self): 85 | root = self.root_node 86 | 87 | classes = list(CsharpParser.get_class_list(root))[0] 88 | metadata = 
CsharpParser.get_class_metadata(classes) 89 | 90 | self.assertEqual(metadata['parameters'], {'Animal': None}) 91 | self.assertEqual(metadata['identifier'], 'Dog') 92 | 93 | 94 | if __name__ == '__main__': 95 | unittest.main() 96 | -------------------------------------------------------------------------------- /tests/test_parser/test_go.py: -------------------------------------------------------------------------------- 1 | '''test for Go parser''' 2 | import os 3 | import unittest 4 | from pathlib import Path 5 | 6 | from src.codetext.parser import GoParser 7 | from src.codetext.utils import parse_code 8 | 9 | 10 | class Test_GoParser(unittest.TestCase): 11 | def setUp(self) -> None: 12 | with open('tests/test_parser/test_sample/go_test_sample.go', 'r') as file: 13 | self.code_sample = file.read() 14 | 15 | tree = parse_code(self.code_sample, 'go') 16 | self.root_node = tree.root_node 17 | return super().setUp() 18 | 19 | def test_get_function_list(self): 20 | root = self.root_node 21 | 22 | function_list = GoParser.get_function_list(root) 23 | 24 | self.assertEqual(len(function_list), 1) 25 | 26 | def test_get_function_metadata(self): 27 | root = self.root_node 28 | 29 | function = GoParser.get_function_list(root)[0] 30 | metadata = GoParser.get_function_metadata(function) 31 | 32 | for key in ['identifier', 'parameters', 'return_type']: 33 | self.assertTrue(key in metadata.keys()) 34 | self.assertEqual(metadata['parameters'], {'e': 'TypeError'}) 35 | self.assertEqual(metadata['identifier'], 'Error') 36 | self.assertEqual(metadata['return_type'], 'string') 37 | 38 | def test_get_docstring(self): 39 | code_sample = """ 40 | type TypeError struct { 41 | Type1, Type2 reflect.Type 42 | Extra string 43 | } 44 | // Something must not include as docstring 45 | 46 | // The path package should only be used for paths separated by forward 47 | // slashes, such as the paths in URLs.
This package does not deal with 48 | // Windows paths with drive letters or backslashes; to manipulate 49 | // operating system paths, use the [path/filepath] package. 50 | func (e TypeError) Error() string { 51 | msg := e.Type1.String() 52 | if e.Type2 != nil { 53 | msg += " and " + e.Type2.String() 54 | } 55 | msg += " " + e.Extra 56 | return msg 57 | } 58 | """ 59 | tree = parse_code(code_sample, 'go') 60 | root = tree.root_node 61 | 62 | fn = GoParser.get_function_list(root)[0] 63 | 64 | docs = GoParser.get_docstring(fn) 65 | self.assertEqual(docs, '// The path package should only be used for paths separated by forward\n// slashes, such as the paths in URLs. This package does not deal with\n// Windows paths with drive letters or backslashes; to manipulate\n// operating system paths, use the [path/filepath] package.') 66 | 67 | 68 | def test_extract_docstring(self): 69 | pass 70 | 71 | 72 | if __name__ == '__main__': 73 | unittest.main() 74 | -------------------------------------------------------------------------------- /tests/test_parser/test_java.py: -------------------------------------------------------------------------------- 1 | '''test for Java parser''' 2 | import os 3 | import unittest 4 | from pathlib import Path 5 | 6 | from src.codetext.parser import JavaParser 7 | from src.codetext.utils import parse_code 8 | 9 | 10 | class Test_JavaParser(unittest.TestCase): 11 | def setUp(self) -> None: 12 | with open('tests/test_parser/test_sample/java_test_sample.java', 'r') as file: 13 | self.code_sample = file.read() 14 | 15 | tree = parse_code(self.code_sample, 'java') 16 | self.root_node = tree.root_node 17 | 18 | return super().setUp() 19 | 20 | def test_get_function_list(self): 21 | root = self.root_node 22 | 23 | function_list = JavaParser.get_function_list(root) 24 | 25 | self.assertEqual(len(function_list), 2) 26 | 27 | def test_get_class_list(self): 28 | root = self.root_node 29 | 30 | class_list = JavaParser.get_class_list(root) 31 | 32 | 
self.assertEqual(len(class_list), 1) 33 | 34 | def test_get_docstring(self): 35 | code_sample = """ 36 | public class SaveFileController { 37 | /** 38 | * Adds new user and saves to file. 39 | * 40 | * @param context instance of Context 41 | * @param user instance of User 42 | * @see User 43 | */ 44 | public void addNewUser(Context context, User user){ 45 | loadFromFile(context); 46 | this.allUsers.add(user); 47 | saveToFile(context); 48 | } 49 | } 50 | """ 51 | tree = parse_code(code_sample, 'java', './') 52 | root = tree.root_node 53 | 54 | fn = list(JavaParser.get_function_list(root))[0] 55 | 56 | docs = JavaParser.get_docstring(fn) 57 | self.assertEqual(docs, '/**\n * Adds new user and saves to file.\n *\n * @param context instance of Context\n * @param user instance of User\n * @see User\n */') 58 | 59 | 60 | def test_get_function_metadata(self): 61 | root = self.root_node 62 | 63 | function = list(JavaParser.get_function_list(root))[0] 64 | metadata = JavaParser.get_function_metadata(function) 65 | 66 | for key in ['identifier', 'parameters', 'return_type']: 67 | self.assertTrue(key in metadata.keys()) 68 | self.assertEqual(metadata['parameters'], {'context': 'Context', 'userIndex': 'int'}) 69 | self.assertEqual(metadata['identifier'], 'getHabitList') 70 | self.assertEqual(metadata['return_type'], 'HabitList') 71 | 72 | def test_get_class_metadata(self): 73 | root = self.root_node 74 | 75 | classes = list(JavaParser.get_class_list(root))[0] 76 | metadata = JavaParser.get_class_metadata(classes) 77 | 78 | self.assertEqual(metadata['parameters'], {'SudoUser': None, 'FileController': None}) 79 | self.assertEqual(metadata['identifier'], 'SaveFileController') 80 | 81 | def test_extract_docstring(self): 82 | pass 83 | 84 | 85 | if __name__ == '__main__': 86 | unittest.main() 87 | -------------------------------------------------------------------------------- /tests/test_parser/test_javascript.py: 
-------------------------------------------------------------------------------- 1 | '''test for JavaScript parser''' 2 | import os 3 | import unittest 4 | from pathlib import Path 5 | 6 | from src.codetext.parser import JavascriptParser 7 | from src.codetext.utils import parse_code 8 | 9 | 10 | class Test_JavascriptParser(unittest.TestCase): 11 | def setUp(self) -> None: 12 | with open('tests/test_parser/test_sample/javascript_test_sample.js', 'r') as file: 13 | self.code_sample = file.read() 14 | 15 | tree = parse_code(self.code_sample, 'javascript') 16 | self.root_node = tree.root_node 17 | 18 | return super().setUp() 19 | 20 | def test_get_function_list(self): 21 | root = self.root_node 22 | 23 | function_list = JavascriptParser.get_function_list(root) 24 | 25 | self.assertEqual(len(function_list), 7) 26 | 27 | def test_get_class_list(self): 28 | root = self.root_node 29 | 30 | class_list = JavascriptParser.get_class_list(root) 31 | 32 | self.assertEqual(len(class_list), 2) 33 | 34 | def test_get_docstring(self): 35 | code_sample = """ 36 | /** 37 | * Dispatched when the repositories are loaded by the request saga 38 | * 39 | * @param {array} repos The repository data 40 | * @param {string} username The current username 41 | * 42 | * @return {object} An action object with a type of LOAD_REPOS_SUCCESS passing the repos 43 | */ 44 | function songsLoaded(repos, username) { 45 | return { 46 | type: LOAD_SONGS_SUCCESS, 47 | repos, 48 | username, 49 | }; 50 | } 51 | 52 | class Car { 53 | /** 54 | * Present the object Car 55 | * 56 | * @return {None} 57 | */ 58 | present() { 59 | return 'I have a ' + this.carname; 60 | } 61 | } 62 | """ 63 | 64 | tree = parse_code(code_sample, 'javascript') 65 | root = tree.root_node 66 | 67 | fn1, fn2 = JavascriptParser.get_function_list(root) 68 | 69 | 70 | docs1 = JavascriptParser.get_docstring(fn1) 71 | docs2 = JavascriptParser.get_docstring(fn2) 72 | 73 | self.assertEqual(docs1, '/**\n * Dispatched when the repositories are 
loaded by the request saga\n *\n * @param {array} repos The repository data\n * @param {string} username The current username\n *\n * @return {object} An action object with a type of LOAD_REPOS_SUCCESS passing the repos\n */') 74 | self.assertEqual(docs2, '/**\n * Present the object Car\n *\n * @return {None}\n */') 75 | 76 | def test_get_function_metadata(self): 77 | root = self.root_node 78 | 79 | _function = JavascriptParser.get_function_list(root)[1] 80 | metadata = JavascriptParser.get_function_metadata(_function) 81 | 82 | for key in ['identifier', 'parameters', 'return_type']: 83 | self.assertTrue(key in metadata.keys()) 84 | self.assertEqual(metadata['identifier'], 'songsLoaded') 85 | self.assertEqual(metadata['parameters'], {'repos': None, 'username': None}) 86 | 87 | def test_metadata_with_return_statement(self): 88 | code_sample = ''' 89 | function myFunction(p1, p2) { 90 | return p1 * p2; 91 | } 92 | ''' 93 | root = parse_code(code_sample, 'javascript').root_node 94 | fn = JavascriptParser.get_function_list(root)[0] 95 | metadata = JavascriptParser.get_function_metadata(fn) 96 | 97 | return_type = metadata['return_type'] 98 | self.assertEqual(return_type, '') 99 | 100 | def test_get_class_metadata(self): 101 | root = self.root_node 102 | 103 | classes = JavascriptParser.get_class_list(root)[0] 104 | metadata = JavascriptParser.get_class_metadata(classes) 105 | 106 | self.assertEqual(metadata['identifier'], 'Model') 107 | self.assertEqual(metadata['parameters'], {'Car': None}) 108 | 109 | def test_extract_docstring(self): 110 | pass 111 | 112 | 113 | def test_metadata_with_arrow_function(self): 114 | code_sample = ''' 115 | export const parseModel = async (mesh) => 116 | new Promise((resolve) => { 117 | exporter.parse( 118 | mesh, 119 | (gltf) => { 120 | const blob = new Blob([gltf], { type: "application/octet-stream" }); 121 | resolve(blob); 122 | return blob; 123 | }, 124 | (error) => { 125 | console.log(error); 126 | return error; 127 | 128 | } 129 | 
); 130 | }); 131 | ''' 132 | root = parse_code(code_sample, 'javascript').root_node 133 | fn = JavascriptParser.get_function_list(root)[0] 134 | metadata = JavascriptParser.get_function_metadata(fn) 135 | 136 | identifier = metadata['identifier'] 137 | self.assertEqual(identifier, 'parseModel') 138 | 139 | def test_metadata_with_undecleared_functions(self): 140 | code_sample = """ 141 | const asyncFunctionExpression = async function() { 142 | // async function expression definition 143 | return a 144 | }; 145 | 146 | const generatorFunctionExpression = function*() { 147 | // generator function expression definition 148 | return b 149 | }; 150 | """ 151 | root = parse_code(code_sample, 'javascript').root_node 152 | fn1, fn2 = JavascriptParser.get_function_list(root) 153 | 154 | self.assertEqual(fn1.type, 'function') 155 | self.assertEqual(fn2.type, 'generator_function') 156 | 157 | metadata1 = JavascriptParser.get_function_metadata(fn1) 158 | metadata2 = JavascriptParser.get_function_metadata(fn2) 159 | 160 | self.assertEqual(metadata1['identifier'], 'asyncFunctionExpression') 161 | self.assertEqual(metadata2['identifier'], 'generatorFunctionExpression') 162 | 163 | 164 | if __name__ == '__main__': 165 | unittest.main() 166 | -------------------------------------------------------------------------------- /tests/test_parser/test_php.py: -------------------------------------------------------------------------------- 1 | '''test for PHP parser''' 2 | import os 3 | import unittest 4 | from pathlib import Path 5 | 6 | from src.codetext.parser import PhpParser 7 | from src.codetext.utils import parse_code 8 | 9 | 10 | class Test_PhpParser(unittest.TestCase): 11 | def setUp(self) -> None: 12 | with open('tests/test_parser/test_sample/php_test_sample.php', 'r') as file: 13 | self.code_sample = file.read() 14 | 15 | tree = parse_code(self.code_sample, 'php') 16 | self.root_node = tree.root_node 17 | 18 | return super().setUp() 19 | 20 | def test_get_function_list(self): 21 
| root = self.root_node 22 | 23 | function_list = PhpParser.get_function_list(root) 24 | 25 | self.assertEqual(len(function_list), 5) 26 | 27 | def test_get_class_list(self): 28 | root = self.root_node 29 | 30 | class_list = PhpParser.get_class_list(root) 31 | 32 | self.assertEqual(len(class_list), 3) 33 | 34 | def test_get_docstring(self): 35 | code_sample = """ 36 | 51 | """ 52 | 53 | tree = parse_code(code_sample, 'php') 54 | root = tree.root_node 55 | 56 | fn = PhpParser.get_function_list(root)[0] 57 | 58 | docs = PhpParser.get_docstring(fn) 59 | 60 | self.assertEqual(docs, '/**\n * Get all image nodes.\n *\n * @param \\DOMNode $node The \\DOMDocument instance\n * @param boolean $strict If the document has to be valid\n *\n * @return \\DOMNode\n */') 61 | 62 | 63 | def test_get_function_metadata(self): 64 | root = self.root_node 65 | 66 | function = list(PhpParser.get_function_list(root))[1] 67 | metadata = PhpParser.get_function_metadata(function) 68 | 69 | for key in ['identifier', 'parameters', 'return_type']: 70 | self.assertTrue(key in metadata.keys()) 71 | self.assertEqual(metadata['parameters'], {'$params': 'array', '$connectionOptions': 'array'}) 72 | self.assertEqual(metadata['identifier'], 'constructDsn') 73 | self.assertEqual(metadata['return_type'], 'string') 74 | 75 | def test_metadata_with_return_statement(self): 76 | code_sample = ''' 77 | 82 | ''' 83 | root = parse_code(code_sample, 'PHP').root_node 84 | fn = PhpParser.get_function_list(root)[0] 85 | metadata = PhpParser.get_function_metadata(fn) 86 | 87 | return_type = metadata['return_type'] 88 | self.assertEqual(return_type, '') 89 | 90 | def test_metadata_without_return_statement(self): 91 | code_sample = ''' 92 | 96 | ''' 97 | root = parse_code(code_sample, 'PHP').root_node 98 | fn = PhpParser.get_function_list(root)[0] 99 | metadata = PhpParser.get_function_metadata(fn) 100 | 101 | return_type = metadata['return_type'] 102 | self.assertEqual(return_type, None) 103 | 104 | def 
test_get_class_metadata(self): 105 | root = self.root_node 106 | 107 | _class, interface, trait = list(PhpParser.get_class_list(root)) 108 | class_metadata = PhpParser.get_class_metadata(_class) 109 | 110 | self.assertEqual(class_metadata['parameters'], {'AbstractSQLServerDriver': None}) 111 | self.assertEqual(class_metadata['identifier'], 'Driver') 112 | 113 | interface_metadata = PhpParser.get_class_metadata(interface) 114 | self.assertEqual(interface_metadata['identifier'], 'MyInterface') 115 | 116 | trait_metadata = PhpParser.get_class_metadata(trait) 117 | self.assertEqual(trait_metadata['identifier'], 'MyTrait') 118 | 119 | 120 | if __name__ == '__main__': 121 | unittest.main() 122 | -------------------------------------------------------------------------------- /tests/test_parser/test_python.py: -------------------------------------------------------------------------------- 1 | '''test for python parser''' 2 | import os 3 | import unittest 4 | from pathlib import Path 5 | 6 | from src.codetext.parser import PythonParser 7 | from src.codetext.utils import parse_code 8 | 9 | 10 | class Test_PythonParser(unittest.TestCase): 11 | def setUp(self) -> None: 12 | with open('tests/test_parser/test_sample/py_test_sample.py', 'r') as file: 13 | self.code_sample = file.read() 14 | 15 | tree = parse_code(self.code_sample, 'python') 16 | self.root_node = tree.root_node 17 | return super().setUp() 18 | 19 | def test_get_function_list(self): 20 | root = self.root_node 21 | 22 | function_list = PythonParser.get_function_list(root) 23 | 24 | self.assertEqual(len(function_list), 3) 25 | 26 | def test_get_class_list(self): 27 | root = self.root_node 28 | 29 | class_list = PythonParser.get_class_list(root) 30 | self.assertEqual(len(class_list), 1) 31 | 32 | def test_get_docstring(self): 33 | code_sample = ''' 34 | def test_sample(): 35 | """This is a docstring""" 36 | return 37 | ''' 38 | root = parse_code(code_sample, 'python').root_node 39 | 40 | function = 
PythonParser.get_function_list(root)[0] 41 | docstring = PythonParser.get_docstring(function) 42 | self.assertEqual(docstring, "This is a docstring") 43 | 44 | def test_get_function_metadata(self): 45 | code_sample = ''' 46 | def test_sample(arg1: str = "string", arg2 = "another_string"): 47 | return NotImplement() 48 | ''' 49 | root = parse_code(code_sample, 'python').root_node 50 | 51 | function = list(PythonParser.get_function_list(root))[0] 52 | metadata = PythonParser.get_function_metadata(function) 53 | 54 | for key in ['identifier', 'parameters', 'return_type']: 55 | self.assertTrue(key in metadata.keys()) 56 | self.assertEqual(metadata['parameters'], {'arg1': 'str', 'arg2': None}) 57 | self.assertEqual(metadata['identifier'], 'test_sample') 58 | 59 | def test_get_class_metadata(self): 60 | code_sample = ''' 61 | class ABC(): 62 | pass 63 | 64 | class Sample(ABC): 65 | def __init__(self): 66 | pass 67 | 68 | def test_sample(self, arg1: str = "string", arg2 = "another_string"): 69 | return NotImplement() 70 | 71 | class ThisIsalsoAclass(ABC, Sample): 72 | pass 73 | ''' 74 | root = parse_code(code_sample, 'python').root_node 75 | 76 | 77 | classes = list(PythonParser.get_class_list(root)) 78 | self.assertEqual(len(classes), 3) 79 | 80 | metadata = PythonParser.get_class_metadata(classes[0]) 81 | self.assertEqual(metadata['parameters'], {}) 82 | self.assertEqual(metadata['identifier'], 'ABC') 83 | 84 | 85 | metadata = PythonParser.get_class_metadata(classes[1]) 86 | self.assertEqual(metadata['parameters'], {'ABC': None}) 87 | self.assertEqual(metadata['identifier'], 'Sample') 88 | 89 | 90 | metadata = PythonParser.get_class_metadata(classes[2]) 91 | self.assertEqual(metadata['parameters'], {'ABC': None, 'Sample': None}) 92 | self.assertEqual(metadata['identifier'], 'ThisIsalsoAclass') 93 | 94 | 95 | 96 | def test_get_comment_list(self): 97 | root = self.root_node 98 | 99 | comment_list = PythonParser.get_comment_node(root) 100 | comment_list = 
[node.text.decode() for node in comment_list] 101 | 102 | assert comment_list[1] == '# choose the rightmost element as pivot' 103 | assert comment_list[2] == '# pointer for greater element' 104 | assert len(comment_list) == 16 105 | 106 | def test_metadata_without_return_statement(self): 107 | code_sample = ''' 108 | def sum2num(): 109 | pass 110 | ''' 111 | root = parse_code(code_sample, 'python').root_node 112 | fn = PythonParser.get_function_list(root)[0] 113 | metadata = PythonParser.get_function_metadata(fn) 114 | 115 | return_type = metadata['return_type'] 116 | self.assertEqual(return_type, None) 117 | 118 | def test_metadata_with_return_statement(self): 119 | code_sample = ''' 120 | def sum2num(): 121 | return True 122 | ''' 123 | root = parse_code(code_sample, 'python').root_node 124 | fn = PythonParser.get_function_list(root)[0] 125 | metadata = PythonParser.get_function_metadata(fn) 126 | 127 | return_type = metadata['return_type'] 128 | self.assertEqual(return_type, '') 129 | 130 | def test_get_parameter(self): 131 | code_sample = ''' 132 | def sum2num(a: tree_sitter.Node=None, b=None, c:string) -> int: 133 | pass 134 | ''' 135 | 136 | root = parse_code(code_sample, 'python').root_node 137 | fn = PythonParser.get_function_list(root)[0] 138 | 139 | metadata = PythonParser.get_function_metadata(fn) 140 | parameter = metadata['parameters'] 141 | self.assertEqual(len(parameter.keys()), 3) 142 | self.assertTrue('a' in parameter.keys()) 143 | self.assertTrue('b' in parameter.keys()) 144 | self.assertTrue('c' in parameter.keys()) 145 | 146 | return_type = metadata['return_type'] 147 | self.assertEqual(return_type, 'int') 148 | 149 | 150 | if __name__ == '__main__': 151 | unittest.main() 152 | -------------------------------------------------------------------------------- /tests/test_parser/test_ruby.py: -------------------------------------------------------------------------------- 1 | '''test for Ruby parser''' 2 | import os 3 | import unittest 4 | from 
pathlib import Path 5 | 6 | from src.codetext.parser import RubyParser 7 | from src.codetext.utils import parse_code 8 | 9 | 10 | class Test_RubyParser(unittest.TestCase): 11 | def setUp(self) -> None: 12 | with open('tests/test_parser/test_sample/ruby_test_sample.rb', 'r') as file: 13 | self.code_sample = file.read() 14 | 15 | tree = parse_code(self.code_sample, 'ruby') 16 | self.root_node = tree.root_node 17 | 18 | return super().setUp() 19 | 20 | def test_get_function_list(self): 21 | root = self.root_node 22 | 23 | function_list = RubyParser.get_function_list(root) 24 | 25 | self.assertEqual(len(function_list), 2) 26 | 27 | def test_get_class_list(self): 28 | root = self.root_node 29 | 30 | class_list = RubyParser.get_class_list(root) 31 | 32 | self.assertEqual(len(class_list), 3) 33 | 34 | def test_get_docstring(self): 35 | code_sample = """ 36 | module Encryption 37 | 38 | # Search for links. 39 | # 40 | # @param query [String] The search query. 41 | # @option options [String, RedditKit::Subreddit] subreddit The optional subreddit to search. 
42 | def encrypt(string) 43 | Digest::SHA2.hexdigest(string) 44 | end 45 | end 46 | 47 | =begin 48 | comment line 1 49 | comment line 2 50 | =end 51 | class Orange 52 | def initialize 53 | @juice_available = 100 54 | end 55 | def squeeze 56 | @juice_available -= 50 57 | end 58 | end 59 | 60 | orange = Orange.new 61 | orange.squeeze 62 | """ 63 | 64 | tree = parse_code(code_sample, 'ruby') 65 | root = tree.root_node 66 | 67 | fn = RubyParser.get_function_list(root)[0] 68 | clas = RubyParser.get_class_list(root)[1] 69 | 70 | docs1 = RubyParser.get_docstring(fn) 71 | docs2 = RubyParser.get_docstring(clas) 72 | 73 | self.assertEqual(docs1, '# Search for links.\n#\n# @param query [String] The search query.\n# @option options [String, RedditKit::Subreddit] subreddit The optional subreddit to search.') 74 | self.assertEqual(docs2, ' comment line 1\n comment line 2') 75 | 76 | def test_get_function_metadata(self): 77 | root = self.root_node 78 | 79 | _function = RubyParser.get_function_list(root)[0] 80 | metadata = RubyParser.get_function_metadata(_function) 81 | 82 | for key in ['identifier', 'parameters', 'return_type']: 83 | self.assertTrue(key in metadata.keys()) 84 | self.assertEqual(metadata['identifier'], 'search') 85 | self.assertEqual(metadata['parameters'], {'query': None, 'options': None}) 86 | self.assertEqual(metadata['return_type'], None) 87 | 88 | _singleton = RubyParser.get_function_list(root)[1] 89 | metadata = RubyParser.get_function_metadata(_singleton) 90 | for key in ['identifier', 'parameters', 'return_type']: 91 | self.assertTrue(key in metadata.keys()) 92 | self.assertEqual(metadata['identifier'], 'my_method') 93 | self.assertEqual(metadata['parameters'], {'a': None}) 94 | self.assertEqual(metadata['return_type'], '') 95 | 96 | 97 | def test_metadata_without_return_statement(self): 98 | code_sample = ''' 99 | def write_code(number_of_errors) 100 | if number_of_errors > 1 101 | mood = "Ask me later" 102 | else 103 | mood = puts "No Problem" 104 | end 
105 | return mood 106 | end 107 | ''' 108 | root = parse_code(code_sample, 'Ruby').root_node 109 | fn = RubyParser.get_function_list(root)[0] 110 | metadata = RubyParser.get_function_metadata(fn) 111 | 112 | return_type = metadata['return_type'] 113 | self.assertEqual(return_type, '') 114 | 115 | 116 | def test_get_class_metadata(self): 117 | root = self.root_node 118 | 119 | classes = RubyParser.get_class_list(root)[1] 120 | metadata = RubyParser.get_class_metadata(classes) 121 | 122 | self.assertEqual(metadata['identifier'], 'Client') 123 | self.assertEqual(metadata['parameters'], {'API': None}) 124 | 125 | def test_get_action_list(self): 126 | root = self.root_node 127 | actions = RubyParser.get_action_list(root) 128 | 129 | self.assertEqual(len(actions), 5) 130 | 131 | def test_get_action_metadata(self): 132 | root = self.root_node 133 | actions = RubyParser.get_action_list(root) 134 | metadatas = [ RubyParser.get_action_metadata(action) for action in actions] 135 | self.assertEqual(metadatas[0]["identifier"], "load_current_value") 136 | self.assertEqual(metadatas[1]["identifier"], "action:install") 137 | self.assertEqual(metadatas[2]["identifier"], "converge_by") 138 | 139 | self.assertEqual(metadatas[3]["identifier"], "action:reinstall") 140 | self.assertEqual(metadatas[4]["identifier"], "converge_by") 141 | 142 | self.assertEqual(metadatas[0]["parameters"]["new_resource"], None) 143 | self.assertEqual(metadatas[0]["parameters"]["old_resource"], None) 144 | 145 | 146 | if __name__ == '__main__': 147 | unittest.main() 148 | -------------------------------------------------------------------------------- /tests/test_parser/test_rust.py: -------------------------------------------------------------------------------- 1 | '''test for Rust parser''' 2 | import os 3 | import unittest 4 | from pathlib import Path 5 | 6 | from src.codetext.parser import RustParser 7 | from src.codetext.utils import parse_code 8 | 9 | 10 | class Test_RustParser(unittest.TestCase): 11
| def setUp(self) -> None: 12 | with open('tests/test_parser/test_sample/rust_test_sample.rs', 'r') as file: 13 | self.code_sample = file.read() 14 | 15 | tree = parse_code(self.code_sample, 'rust') 16 | self.root_node = tree.root_node 17 | 18 | return super().setUp() 19 | 20 | def test_get_function_list(self): 21 | root = self.root_node 22 | 23 | function_list = RustParser.get_function_list(root) 24 | 25 | self.assertEqual(len(function_list), 4) 26 | 27 | def test_get_class_list(self): 28 | root = self.root_node 29 | 30 | class_list = RustParser.get_class_list(root) 31 | 32 | self.assertEqual(len(class_list), 2) 33 | 34 | def test_get_docstring(self): 35 | code_sample = """ 36 | // Comment something 37 | mod my_mod { 38 | /// Creates a new rendering surface. 39 | /// 40 | /// # Arguments 41 | /// 42 | /// Initialization of surfaces happens through the types provided by 43 | /// [`drm-rs`](drm). 44 | /// 45 | /// - [`crtcs`](drm::control::crtc) represent scanout engines of the device pointing to one framebuffer. \\ 46 | /// Their responsibility is to read the data of the framebuffer and export it into an "Encoder". \\ 47 | /// The number of crtc's represent the number of independent output devices the hardware may handle. 48 | fn private_function() { 49 | println!("called `my_mod::private_function()`"); 50 | } 51 | 52 | /** - Outer block doc (exactly) 2 asterisks */ 53 | pub fn function() { 54 | println!("called `my_mod::function()`"); 55 | } 56 | 57 | // Items can access other items in the same module, 58 | // even when private. 
59 | pub fn indirect_access() { 60 | print!("called `my_mod::indirect_access()`, that\n> "); 61 | private_function(); 62 | } 63 | } 64 | """ 65 | 66 | tree = parse_code(code_sample, 'rust') 67 | root = tree.root_node 68 | 69 | fn1 = RustParser.get_function_list(root)[0] 70 | fn2 = RustParser.get_function_list(root)[1] 71 | clas = RustParser.get_class_list(root)[0] 72 | 73 | docs1 = RustParser.get_docstring(fn1) 74 | docs2 = RustParser.get_docstring(fn2) 75 | docs3 = RustParser.get_docstring(clas) 76 | 77 | self.assertEqual(docs1, '/// Creates a new rendering surface.\n///\n/// # Arguments\n///\n/// Initialization of surfaces happens through the types provided by\n/// [`drm-rs`](drm).\n///\n/// - [`crtcs`](drm::control::crtc) represent scanout engines of the device pointing to one framebuffer. \\\n/// Their responsibility is to read the data of the framebuffer and export it into an "Encoder". \\\n/// The number of crtc\'s represent the number of independent output devices the hardware may handle.') 78 | self.assertEqual(docs2, '/** - Outer block doc (exactly) 2 asterisks */') 79 | self.assertEqual(docs3, '// Comment something') 80 | 81 | def test_get_function_metadata(self): 82 | root = self.root_node 83 | 84 | function = RustParser.get_function_list(root)[0] 85 | metadata = RustParser.get_function_metadata(function) 86 | 87 | for key in ['identifier', 'parameters', 'return_type']: 88 | self.assertTrue(key in metadata.keys()) 89 | self.assertEqual(metadata['identifier'], 'long_string') 90 | self.assertEqual(metadata['parameters'], {'x': '&str'}) 91 | self.assertEqual(metadata['return_type'], '&str') 92 | 93 | def test_metadata_with_return_statement(self): 94 | code_sample = ''' 95 | fn quack(&self) { 96 | println!("quack!"); 97 | return "hello"; 98 | } 99 | ''' 100 | root = parse_code(code_sample, 'Rust').root_node 101 | fn = RustParser.get_function_list(root)[0] 102 | metadata = RustParser.get_function_metadata(fn) 103 | 104 | return_type = metadata['return_type'] 
105 | self.assertEqual(return_type, '') 106 | 107 | def test_get_class_metadata(self): 108 | root = self.root_node 109 | 110 | classes = RustParser.get_class_list(root)[0] 111 | metadata = RustParser.get_class_metadata(classes) 112 | 113 | self.assertEqual(metadata['identifier'], 'Quack') 114 | self.assertEqual(metadata['parameters'], {'Duck': None}) 115 | 116 | 117 | if __name__ == '__main__': 118 | unittest.main() 119 | -------------------------------------------------------------------------------- /tests/test_parser/test_sample/README.md: -------------------------------------------------------------------------------- 1 | # Tree-sitter function/class type 2 | 3 | ## C/C++ 4 | Node type - Sample 5 | 6 | - with C 7 | ```c 8 | // function_definition 9 | void reverseSentence(int random_seed) { 10 | char c; 11 | scanf("%c", &c); 12 | if (c != '\n') { 13 | reverseSentence(); 14 | printf("%c", c); 15 | } 16 | } 17 | ``` 18 | 19 | - with C++ 20 | ```c++ 21 | // function_definition 22 | double plusFuncDouble(double x, double y) { 23 | return x + y; 24 | } 25 | 26 | // function_definition 27 | int main() { 28 | int myNum1 = plusFuncInt(8, 5); 29 | double myNum2 = plusFuncDouble(4.3, 6.26); 30 | cout << "Int: " << myNum1 << "\n"; 31 | cout << "Double: " << myNum2; 32 | return 0; 33 | } 34 | 35 | // class_specifier 36 | class Animal { 37 | public: 38 | // function_definition 39 | void animalSound() { 40 | cout << "The animal makes a sound \n"; 41 | } 42 | }; 43 | 44 | // class_specifier 45 | class Pig : public Animal { 46 | public: 47 | // function_definition 48 | void animalSound() { 49 | cout << "The pig says: wee wee \n"; 50 | } 51 | }; 52 | ``` 53 | 54 | ## C# 55 | 56 | ```c# 57 | // local_function_statement 58 | private static string GetText(string path, string filename) 59 | { 60 | // local_declaration_statement 61 | var reader = File.OpenText($"{AppendPathSeparator(path)}{filename}"); 62 | var text = reader.ReadToEnd(); 63 | return text; 64 | 65 | // 
local_function_statement 66 | string AppendPathSeparator(string filepath) 67 | { 68 | return filepath.EndsWith(@"\") ? filepath : filepath + @"\"; 69 | } 70 | } 71 | 72 | using System; 73 | 74 | // class_declaration 75 | public class Dog : Animal { 76 | 77 | String name; 78 | String breed; 79 | int age; 80 | String color; 81 | 82 | // constructor_declaration 83 | public Dog(String name, String breed, 84 | int age, String color) 85 | { 86 | this.name = name; 87 | this.breed = breed; 88 | this.age = age; 89 | this.color = color; 90 | } 91 | 92 | // method_declaration 93 | static void Main(string[] args) 94 | { 95 | Car myObj = new Car(); 96 | Console.WriteLine(myObj.color); 97 | } 98 | } 99 | ``` 100 | 101 | ## Java 102 | 103 | ```Java 104 | // class_declaration 105 | public class SaveFileController extends SudoUser implements FileController { 106 | // field_declaration 107 | private ArrayList allUsers; 108 | private String saveFile = "test_save_file4.sav"; 109 | 110 | // constructor_declaration 111 | public SaveFileController(){ 112 | this.allUsers = new ArrayList(); 113 | } 114 | 115 | // method_declaration 116 | public HabitList getHabitList(Context context, int userIndex){ 117 | loadFromFile(context); 118 | return this.allUsers.get(userIndex).getHabitList(); 119 | } 120 | } 121 | ``` 122 | 123 | ## Python 124 | ```python 125 | # class_definition 126 | class Person: 127 | # function_definition 128 | def __init__(self, name, age): 129 | self.name = name 130 | self.age = age 131 | 132 | # function_definition 133 | def say_my_name(self): 134 | print(self.name) 135 | 136 | # function_definition 137 | def create_a_person(name, age): 138 | new_person = Person(name, age) 139 | ``` 140 | 141 | ## JavaScript 142 | ```JavaScript 143 | // function_declaration 144 | export function loadSongs() { 145 | return { 146 | type: LOAD_SONGS, 147 | }; 148 | } 149 | 150 | // class_declaration 151 | class Model extends Car { 152 | // method_definition 153 | constructor(brand, mod) { 154 
| super(brand); 155 | this.model = mod; 156 | } 157 | 158 | // method_definition 159 | show() { 160 | return this.present() + ', it is a ' + this.model; 161 | } 162 | } 163 | ``` 164 | 165 | ## PHP 166 | 167 | ```PHP 168 | // function_definition 169 | function familyName($fname) { 170 | echo "$fname Refsnes.
"; 171 | } 172 | 173 | // class_declaration 174 | final class Driver extends AbstractSQLServerDriver 175 | { 176 | // method_declaration 177 | public function connect(array $params) 178 | { 179 | $driverOptions = $dsnOptions = []; 180 | if (isset($params['driverOptions'])) { 181 | foreach ($params['driverOptions'] as $option => $value) { 182 | if (is_int($option)) { 183 | $driverOptions[$option] = $value; 184 | } else { 185 | $dsnOptions[$option] = $value; 186 | } 187 | } 188 | } 189 | } 190 | } 191 | ``` 192 | 193 | ## GO 194 | 195 | ```GO 196 | // function_declaration 197 | func add(x int, y int) int { 198 | return x + y 199 | } 200 | 201 | // function_declaration 202 | func main() { 203 | fmt.Println(add(42, 13)) 204 | } 205 | 206 | // method_declaration 207 | func (e TypeError) Error() string { 208 | msg := e.Type1.String() 209 | if e.Type2 != nil { 210 | msg += " and " + e.Type2.String() 211 | } 212 | msg += " " + e.Extra 213 | return msg 214 | } 215 | 216 | ``` 217 | 218 | ## Ruby 219 | 220 | ```Ruby 221 | # class 222 | class Customer 223 | @@no_of_customers = 0 224 | 225 | # method 226 | def initialize(id, name, addr) 227 | @cust_id = id 228 | @cust_name = name 229 | @cust_addr = addr 230 | end 231 | end 232 | 233 | # method 234 | def test(a1 = "Ruby", a2 = "Perl") 235 | puts "The programming language is #{a1}" 236 | puts "The programming language is #{a2}" 237 | end 238 | 239 | # module 240 | module RedditKit 241 | # class 242 | class Client < API 243 | # method 244 | def search(query, options = {}) 245 | path = "%s/search.json" % ('r/' + options[:subreddit] if options[:subreddit]) 246 | parameters = { :q => query, 247 | :restrict_sr => options[:restrict_to_subreddit], 248 | :limit => options[:limit], 249 | :count => options[:count], 250 | :sort => options[:sort], 251 | :before => options[:before], 252 | :after => options[:after], 253 | :syntax => options[:syntax], 254 | :t => options[:time] 255 | } 256 | 257 | objects_from_response(:get, path, parameters) 
258 | end 259 | end 260 | end 261 | 262 | ``` 263 | 264 | ## Rust 265 | 266 | ```Rust 267 | // trait_item 268 | trait Quack { 269 | // function_signature_item <- This is function declaration 270 | fn quack(&self); 271 | } 272 | 273 | // struct_item 274 | struct Duck (); 275 | 276 | // function_item 277 | fn long_string(x: &str) -> &str { 278 | if x.len() > 10 { 279 | "too long" 280 | } else { 281 | x 282 | } 283 | 284 | } 285 | 286 | // impl_item 287 | impl Quack for Duck { 288 | // function_item 289 | fn quack(&self) { 290 | println!("quack!"); 291 | } 292 | } 293 | 294 | // mod_item 295 | mod my_mod { 296 | // function_item 297 | fn private_function() { 298 | println!("called `my_mod::private_function()`"); 299 | } 300 | } 301 | 302 | // function_item 303 | fn quack_everyone (iter: I) 304 | where I: Iterator> { 305 | for d in iter { 306 | d.quack(); 307 | } 308 | } 309 | ``` -------------------------------------------------------------------------------- /tests/test_parser/test_sample/c_sharp_test_sample.cs: -------------------------------------------------------------------------------- 1 | private static string GetText(string path, string filename) 2 | { 3 | var reader = File.OpenText($"{AppendPathSeparator(path)}{filename}"); 4 | var text = reader.ReadToEnd(); 5 | return text; 6 | 7 | string AppendPathSeparator(string filepath) 8 | { 9 | return filepath.EndsWith(@"\") ? 
filepath : filepath + @"\"; 10 | } 11 | } 12 | 13 | using System; 14 | public class Dog : Animal { 15 | 16 | // Instance Variables 17 | String name; 18 | String breed; 19 | int age; 20 | String color; 21 | 22 | // Constructor Declaration of Class 23 | public Dog(String name, String breed, 24 | int age, String color) 25 | { 26 | this.name = name; 27 | this.breed = breed; 28 | this.age = age; 29 | this.color = color; 30 | } 31 | 32 | // Docstring of this function 33 | static void Main(string[] args) 34 | { 35 | Car myObj = new Car(); 36 | Console.WriteLine(myObj.color); 37 | } 38 | } -------------------------------------------------------------------------------- /tests/test_parser/test_sample/c_test_sample.c: -------------------------------------------------------------------------------- 1 | #include 2 | void reverseSentence(); 3 | 4 | /** 5 | * A brief description. A more elaborate class description 6 | * @param random_seed somearg. 7 | * @see Test() 8 | * @return The test results 9 | */ 10 | void reverseSentence(int random_seed) { 11 | char c; 12 | scanf("%c", &c); 13 | if (c != '\n') { 14 | reverseSentence(); 15 | printf("%c", c); 16 | } 17 | } 18 | 19 | int main() { 20 | printf("Enter a sentence: "); 21 | reverseSentence(); 22 | return 0; 23 | } 24 | -------------------------------------------------------------------------------- /tests/test_parser/test_sample/cpp_test_sample.cpp: -------------------------------------------------------------------------------- 1 | // Derived class 2 | class Car: public Vehicle, private B { 3 | public: 4 | string model = "Mustang"; 5 | }; 6 | 7 | // A static function 8 | int sum2number (int a, int b) { 9 | return a + b; 10 | } 11 | 12 | // Base class 13 | class Vehicle { 14 | public: 15 | string brand = "Ford"; 16 | void honk() { 17 | cout << "Tuut, tuut! 
\n" ; 18 | } 19 | }; 20 | 21 | int main() { 22 | Car myCar; 23 | myCar.honk(); 24 | cout << myCar.brand + " " + myCar.model; 25 | return 0; 26 | } 27 | 28 | -------------------------------------------------------------------------------- /tests/test_parser/test_sample/go_test_sample.go: -------------------------------------------------------------------------------- 1 | // Copyright 2016 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | type TypeError struct { 5 | Type1, Type2 reflect.Type 6 | Extra string 7 | } 8 | 9 | // The path package should only be used for paths separated by forward 10 | // slashes, such as the paths in URLs. This package does not deal with 11 | // Windows paths with drive letters or backslashes; to manipulate 12 | // operating system paths, use the [path/filepath] package. 13 | func (e TypeError) Error() string { 14 | msg := e.Type1.String() 15 | if e.Type2 != nil { 16 | msg += " and " + e.Type2.String() 17 | } 18 | msg += " " + e.Extra 19 | return msg 20 | } 21 | -------------------------------------------------------------------------------- /tests/test_parser/test_sample/java_test_sample.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Implements the file to save data to. 3 | * 4 | * @version 1.0 5 | */ 6 | public class SaveFileController extends SudoUser implements FileController { 7 | private ArrayList allUsers; 8 | //private String username; 9 | private String saveFile = "test_save_file4.sav"; 10 | 11 | public SaveFileController(){ 12 | this.allUsers = new ArrayList(); 13 | } 14 | 15 | /** 16 | * Gets HabitList instance. 
17 | * 18 | * @param context instance of Context 19 | * @param userIndex integer user index 20 | * @return HabitList 21 | * @see HabitList 22 | */ 23 | public HabitList getHabitList(Context context, int userIndex){ 24 | loadFromFile(context); 25 | return this.allUsers.get(userIndex).getHabitList(); 26 | } 27 | 28 | /** 29 | * Removes a habit event from a particular user's habit event list. 30 | * 31 | * @param context instance of Context 32 | * @param userIndex integer user index 33 | * @param habitIndex integer index of habit 34 | * @param habitEventIndex integer index of habit event 35 | */ 36 | public void removeHabitEvent(Context context, int userIndex, int habitIndex, int habitEventIndex){ 37 | loadFromFile(context); 38 | this.allUsers.get(userIndex).getHabitList().getHabit(habitIndex) 39 | .getHabitEventHistory().getHabitEvents().remove(habitEventIndex); 40 | saveToFile(context); 41 | } 42 | } -------------------------------------------------------------------------------- /tests/test_parser/test_sample/javascript_test_sample.js: -------------------------------------------------------------------------------- 1 | /* 2 | * App Actions 3 | * 4 | * Actions change things in your application 5 | * Since this boilerplate uses a uni-directional data flow, specifically redux, 6 | * we have these actions which are the only way your application interacts with 7 | * your application state. This guarantees that your state is up to date and nobody 8 | * messes it up weirdly somewhere. 
9 | * 10 | * To add a new Action: 11 | * 1) Import your constant 12 | * 2) Add a function like this: 13 | * export function yourAction(var) { 14 | * return { type: YOUR_ACTION_CONSTANT, var: var } 15 | * } 16 | */ 17 | 18 | import { 19 | LOAD_SONGS, 20 | LOAD_SONGS_SUCCESS, 21 | LOAD_SONGS_ERROR, 22 | } from './constants'; 23 | 24 | /** 25 | * Load the repositories, this action starts the request saga 26 | * 27 | * @return {object} An action object with a type of LOAD_REPOS 28 | */ 29 | export function loadSongs() { 30 | return { 31 | type: LOAD_SONGS, 32 | }; 33 | } 34 | 35 | /** 36 | * Dispatched when the repositories are loaded by the request saga 37 | * 38 | * @param {array} repos The repository data 39 | * @param {string} username The current username 40 | * 41 | * @return {object} An action object with a type of LOAD_REPOS_SUCCESS passing the repos 42 | */ 43 | export function songsLoaded(repos, username=10) { 44 | return { 45 | type: LOAD_SONGS_SUCCESS, 46 | repos, 47 | username, 48 | }; 49 | } 50 | 51 | /** 52 | * Dispatched when loading the repositories fails 53 | * 54 | * @param {object} error The error 55 | * 56 | * @return {object} An action object with a type of LOAD_REPOS_ERROR passing the error 57 | */ 58 | export function songsLoadingError(error) { 59 | return { 60 | type: LOAD_SONGS_ERROR, 61 | error, 62 | }; 63 | } 64 | 65 | class Model extends Car { 66 | constructor(brand, mod) { 67 | super(brand); 68 | this.model = mod; 69 | } 70 | 71 | /** 72 | * Comment something 73 | */ 74 | show() { 75 | return this.present() + ', it is a ' + this.model; 76 | } 77 | } 78 | 79 | class Car { 80 | constructor(brand) { 81 | this.carname = brand; 82 | } 83 | 84 | /** 85 | * Dispatched when loading the repositories fails 86 | * 87 | * @param {object} error The error 88 | * 89 | * @return {object} An action object with a type of LOAD_REPOS_ERROR passing the error 90 | */ 91 | present() { 92 | return 'I have a ' + this.carname; 93 | } 94 | } 95 | 
-------------------------------------------------------------------------------- /tests/test_parser/test_sample/php_test_sample.php: -------------------------------------------------------------------------------- 1 | $value) { 16 | if (is_int($option)) { 17 | $driverOptions[$option] = $value; 18 | } else { 19 | $dsnOptions[$option] = $value; 20 | } 21 | } 22 | } 23 | 24 | if (! empty($params['persistent'])) { 25 | $driverOptions[PDO::ATTR_PERSISTENT] = true; 26 | } 27 | 28 | try { 29 | $pdo = new PDO( 30 | $this->constructDsn($params, $dsnOptions), 31 | $params['user'] ?? '', 32 | $params['password'] ?? '', 33 | $driverOptions 34 | ); 35 | } catch (\\PDOException $exception) { 36 | throw PDOException::new($exception); 37 | } 38 | 39 | return new Connection(new PDOConnection($pdo)); 40 | } 41 | 42 | /** 43 | * Constructs the Sqlsrv PDO DSN. 44 | * 45 | * @param mixed[] $params 46 | * @param string[] $connectionOptions 47 | * 48 | * @throws Exception 49 | */ 50 | private function constructDsn(array $params=null, array $connectionOptions): string 51 | { 52 | $dsn = 'sqlsrv:server='; 53 | 54 | if (isset($params['host'])) { 55 | $dsn .= $params['host']; 56 | 57 | if (isset($params['port'])) { 58 | $dsn .= ',' . $params['port']; 59 | } 60 | } elseif (isset($params['port'])) { 61 | throw PortWithoutHost::new(); 62 | } 63 | 64 | if (isset($params['dbname'])) { 65 | $connectionOptions['Database'] = $params['dbname']; 66 | } 67 | 68 | if (isset($params['MultipleActiveResultSets'])) { 69 | $connectionOptions['MultipleActiveResultSets'] = $params['MultipleActiveResultSets'] ? 'true' : 'false'; 70 | } 71 | 72 | return $dsn . 
$this->getConnectionOptionsDsn($connectionOptions); 73 | } 74 | 75 | /** 76 | * Converts a connection options array to the DSN 77 | * 78 | * @param string[] $connectionOptions 79 | */ 80 | private function getConnectionOptionsDsn(array $connectionOptions): string 81 | { 82 | $connectionOptionsDsn = ''; 83 | 84 | foreach ($connectionOptions as $paramName => $paramValue) { 85 | $connectionOptionsDsn .= sprintf(';%s=%s', $paramName, $paramValue); 86 | } 87 | 88 | return $connectionOptionsDsn; 89 | } 90 | } 91 | 92 | interface MyInterface { 93 | public function myMethod() { 94 | // Method implementation 95 | } 96 | 97 | } 98 | 99 | trait MyTrait { 100 | 101 | public function setBackgroundImage(Drawing $objDrawing): self 102 | { 103 | if (!array_key_exists($objDrawing->getType(), Drawing::IMAGE_TYPES_CONVERTION_MAP)) { 104 | throw new PhpSpreadsheetException('Unsupported image type in comment background. Supported types: PNG, JPEG, BMP, GIF.'); 105 | } 106 | $this->backgroundImage = $objDrawing; 107 | 108 | return $this; 109 | } 110 | 111 | } 112 | 113 | -------------------------------------------------------------------------------- /tests/test_parser/test_sample/py_test_sample.py: -------------------------------------------------------------------------------- 1 | def partition(array, low, high): 2 | """ 3 | Function to find the partition position 4 | 5 | :param array: the unsorted array 6 | :type array: List 7 | :param low: smaller pivot 8 | :type low: int 9 | :param high: greater pivot 10 | :type high: int 11 | 12 | """ 13 | # choose the rightmost element as pivot 14 | pivot = array[high] 15 | 16 | # pointer for greater element 17 | i = low - 1 18 | 19 | # traverse through all elements 20 | # compare each element with pivot 21 | for j in range(low, high): 22 | if array[j] <= pivot: 23 | 24 | # If element smaller than pivot is found 25 | # swap it with the greater element pointed by i 26 | i = i + 1 27 | 28 | # Swapping element at i with element at j 29 | (array[i], 
array[j]) = (array[j], array[i]) 30 | 31 | # Swap the pivot element with the greater element specified by i 32 | (array[i + 1], array[high]) = (array[high], array[i + 1]) 33 | 34 | # Return the position from where partition is done 35 | return i + 1 36 | 37 | def quickSort(array, low, high): 38 | """ 39 | Function to perform quicksort 40 | """ 41 | if low < high: 42 | 43 | # Find pivot element such that 44 | # element smaller than pivot are on the left 45 | # element greater than pivot are on the right 46 | pi = partition(array, low, high) 47 | 48 | # Recursive call on the left of pivot 49 | quickSort(array, low, pi - 1) 50 | 51 | # Recursive call on the right of pivot 52 | quickSort(array, pi + 1, high) 53 | 54 | 55 | data = [1, 7, 4, 1, 10, 9, -2] 56 | print("Unsorted Array") 57 | print(data) 58 | 59 | size = len(data) 60 | 61 | quickSort(data, 0, size - 1) 62 | 63 | print('Sorted Array in Ascending Order:') 64 | print(data) 65 | 66 | class Person: 67 | def __init__(self, name, age): 68 | self.name = name 69 | self.age = age -------------------------------------------------------------------------------- /tests/test_parser/test_sample/ruby_test_sample.rb: -------------------------------------------------------------------------------- 1 | module RedditKit 2 | class Client < API 3 | 4 | # Methods for searching reddit's links. 5 | module Search 6 | 7 | # Search for links. 8 | # 9 | # @param query [String] The search query. 10 | # @option options [String, RedditKit::Subreddit] subreddit The optional subreddit to search. 11 | # @option options [true, false] restrict_to_subreddit Whether to search only in a specified subreddit. 12 | # @option options [1..100] limit The number of links to return. 13 | # @option options [String] count The number of results to return before or after. This is different from `limit`. 14 | # @option options [relevance, new, hot, top, comments] sort The sorting order for search results. 
15 | # @option options [String] before Only return links before this full name. 16 | # @option options [String] after Only return links after this full name. 17 | # @option options [cloudsearch, lucene, plain] syntax Specify the syntax for the search. Learn more: http://www.reddit.com/r/redditdev/comments/1hpicu/whats_this_syntaxcloudsearch_do/cawm0fe 18 | # @option options [hour, day, week, month, year, all] time Show results with a specific time period. 19 | # @return [RedditKit::PaginatedResponse] 20 | def search(query, options = {}) 21 | path = "%s/search.json" % ('r/' + options[:subreddit] if options[:subreddit]) 22 | parameters = { :q => query, 23 | :restrict_sr => options[:restrict_to_subreddit], 24 | :limit => options[:limit], 25 | :count => options[:count], 26 | :sort => options[:sort], 27 | :before => options[:before], 28 | :after => options[:after], 29 | :syntax => options[:syntax], 30 | :t => options[:time] 31 | } 32 | 33 | objects_from_response(:get, path, parameters) 34 | end 35 | 36 | def self.my_method(a) 37 | # Method implementation 38 | puts(a) 39 | return a 40 | end 41 | 42 | end 43 | end 44 | end 45 | 46 | load_current_value do |new_resource, old_resource| 47 | unless current_installed_version(new_resource).nil? 48 | version(current_installed_version(new_resource)) 49 | Chef::Log.debug("Current version is #{version}") if version 50 | return a 51 | end 52 | end 53 | 54 | action :install do 55 | build_essential 56 | 57 | install_version = new_resource.version unless new_resource.version.nil? || new_resource.version == current_resource.version 58 | versions_match = candidate_version == current_installed_version(new_resource) 59 | 60 | if install_version || new_resource.version.nil? && !versions_match 61 | converge_by("install package #{new_resource.package_name} #{install_version}") do 62 | info_output = "Installing #{new_resource.package_name}" 63 | info_output << " version #{install_version}" if install_version && !install_version.empty? 
64 | Chef::Log.info(info_output) 65 | install_package(new_resource.package_name, install_version) 66 | end 67 | end 68 | end 69 | 70 | action :reinstall do 71 | build_essential 72 | 73 | install_version = new_resource.version unless new_resource.version.nil? 74 | converge_by("reinstall package #{new_resource.package_name} #{install_version}") do 75 | info_output = "Installing #{new_resource.package_name}" 76 | info_output << " version #{install_version}" if install_version && !install_version.empty? 77 | Chef::Log.info(info_output) 78 | install_package(new_resource.package_name, install_version, force: true) 79 | end 80 | end 81 | 82 | a = 1 83 | 84 | reinstall 85 | -------------------------------------------------------------------------------- /tests/test_parser/test_sample/rust_test_sample.rs: -------------------------------------------------------------------------------- 1 | trait Quack { 2 | fn quack(&self); 3 | } 4 | 5 | struct Duck (); 6 | 7 | fn long_string(x: &str) -> &str { 8 | if x.len() > 10 { 9 | "too long" 10 | } else { 11 | x 12 | } 13 | 14 | } 15 | 16 | impl Quack for Duck { 17 | fn quack(&self) { 18 | println!("quack!"); 19 | } 20 | } 21 | 22 | mod my_mod { 23 | // Items in modules default to private visibility. 
24 | fn private_function() { 25 | println!("called `my_mod::private_function()`"); 26 | } 27 | } 28 | 29 | fn quack_everyone (iter: I) 30 | where I: Iterator> { 31 | for d in iter { 32 | d.quack(); 33 | } 34 | } 35 | 36 | let ducks: Vec> = vec![Box::new(duck1),Box::new(duck2),Box::new(parrot),Box::new(int)]; 37 | -------------------------------------------------------------------------------- /tests/test_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FSoft-AI4Code/CodeText-parser/d2a7365f7f944650e84d9fdb6b6794d6c5ea620b/tests/test_utils/__init__.py -------------------------------------------------------------------------------- /tests/test_utils/test_utils.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from src.codetext.utils import build_language, parse_code 3 | 4 | 5 | class Test_Utils(unittest.TestCase): 6 | def test_build_language(self): 7 | langs = ['python', 'rust'] 8 | for l in langs: 9 | # clear it later 10 | build_language(language=l) 11 | 12 | def test_parse_code(self): 13 | sample = """ 14 | def sum_2_num(a, b): 15 | return a + b 16 | """ 17 | parse_code(sample, 'python') 18 | 19 | 20 | if __name__ == '__main__': 21 | unittest.main() --------------------------------------------------------------------------------