├── .github
└── workflows
│ ├── build_and_release.yml
│ └── unittest.yml
├── .gitignore
├── .idea
└── .gitignore
├── HISTORY.md
├── LICENSE
├── README.md
├── asset
└── img
│ ├── codetext_logo.png
│ └── codetext_logo_line.png
├── pyproject.toml
├── requirements.txt
├── src
└── codetext
│ ├── __init__.py
│ ├── __main__.py
│ ├── clean
│ ├── __init__.py
│ └── noise_removal.py
│ ├── codetext_cli.py
│ ├── parser
│ ├── README.md
│ ├── __init__.py
│ ├── c_sharp_parser.py
│ ├── cpp_parser.py
│ ├── go_parser.py
│ ├── java_parser.py
│ ├── javascript_parser.py
│ ├── language_parser.py
│ ├── php_parser.py
│ ├── python_parser.py
│ ├── ruby_parser.py
│ └── rust_parser.py
│ └── utils
│ ├── __init__.py
│ ├── imports.py
│ └── utils.py
└── tests
├── __init__.py
├── setup.py
├── test_clean
├── __init__.py
└── test_clean_utils.py
├── test_parser
├── __init__.py
├── test_c.py
├── test_cpp.py
├── test_csharp.py
├── test_go.py
├── test_java.py
├── test_javascript.py
├── test_php.py
├── test_python.py
├── test_ruby.py
├── test_rust.py
└── test_sample
│ ├── README.md
│ ├── c_sharp_test_sample.cs
│ ├── c_test_sample.c
│ ├── cpp_test_sample.cpp
│ ├── go_test_sample.go
│ ├── java_test_sample.java
│ ├── javascript_test_sample.js
│ ├── php_test_sample.php
│ ├── py_test_sample.py
│ ├── ruby_test_sample.rb
│ └── rust_test_sample.rs
└── test_utils
├── __init__.py
└── test_utils.py
/.github/workflows/build_and_release.yml:
--------------------------------------------------------------------------------
1 |
2 | name: Publish package to PyPI
3 |
4 | on:
5 | release:
6 | types: [created]
7 |
8 | jobs:
9 | release:
10 | # if: github.event_name == 'release' && github.event.action == 'created'
11 | name: PyPi Release
12 | runs-on: ubuntu-latest
13 |
14 | steps:
15 | - uses: actions/checkout@v2
16 | name: Checkout repo
17 |
18 | - name: Set up Python 3.7
19 | uses: actions/setup-python@v1
20 | with:
21 | python-version: 3.7
22 |
23 | - uses: actions/cache@v1
24 | name: Cache pip dependencies
25 | with:
26 | path: ~/.cache/pip
27 | key: ${{ runner.os }}-pip-${{ hashFiles('requirements.txt') }}
28 | restore-keys: |
29 | ${{ runner.os }}-pip-
30 | - name: Install pip dependencies
31 | run: |
32 | pip install --upgrade pip
33 | pip install -r requirements.txt
34 | python3 -m pip install --upgrade build twine wheel
35 | - name: Make distribution
36 | run: |
37 | python3 setup.py sdist bdist_wheel
38 | twine check dist/*
39 | - name: Publish a Python distribution to PyPI
40 | uses: pypa/gh-action-pypi-publish@master
41 | with:
42 | user: __token__
43 | password: ${{ secrets.PYPI_TOKEN }}
--------------------------------------------------------------------------------
/.github/workflows/unittest.yml:
--------------------------------------------------------------------------------
1 | name: Unittest
2 |
3 | on: push
4 |
5 | jobs:
6 | unittest:
7 | name: Unittest
8 | runs-on: ubuntu-latest
9 | strategy:
10 | matrix:
11 | pyversion: [ "3.10" ]
12 |
13 | steps:
14 | - name: Check out Git repository
15 | uses: actions/checkout@v2
16 |
17 | - name: Set up Python
18 | uses: actions/setup-python@v2
19 | with:
20 | python-version: ${{ matrix.pyversion }}
21 |
22 | - name: Install dependencies
23 | run: |
24 | pip install -r requirements.txt
25 | # git clone https://github.com/nmd-2000/docstring_parser docstring_parser
26 | # pip install -e ./docstring_parser
27 |
28 | - name: Run tests
29 | run: |
30 | python -m unittest
31 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | cache/*
2 | src/*/*.txt
3 | src/codetext.egg-info/*
4 | */build/*
5 | */dist/*
6 | */tree-sitter-*
7 | *.jsonl
8 | *.json
9 | *.zip
10 | *.gz
11 | *.pyc
12 | *.so
13 | *.whl
14 | .idea
15 | .vscode
16 | *.iml
17 |
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | .idea
5 | .vscode
6 | *.iml
--------------------------------------------------------------------------------
/HISTORY.md:
--------------------------------------------------------------------------------
1 | ========
2 | Releases
3 | ========
4 |
5 | Version 0.0.9
6 | =============
7 | Release date: Jul 1, 2024
8 | * Skip building language binaries from source
9 |
10 | Version 0.0.8
11 | =============
12 | Release date: Aug 17, 2023
13 |
14 | * Update format codetext_cli
15 | * Update PythonParser: Handle class definitions with empty argument list class ABC()
16 | * Add Javascript undeclared functions
17 | * Add PHP interface
18 | * Add Ruby actions with block parameters
19 |
20 | Version 0.0.7
21 | =============
22 | Release date: Jul 5, 2023
23 |
24 | * Update all class extractor format (using dict instead of list)
25 | * Fix missing identifier, parameter in C, C#, Java parser
26 | * Implement CLI
27 |
28 | Version 0.0.6
29 | =============
30 | Release date: Jan 9, 2023
31 |
32 | * Add tree sitter utils (in codetext.parser)
33 | * Replace all `match_from_span` to `get_node_text`
34 | * Replace all `traverse_type` to `get_node_by_kind`
35 | * Fix `CppParser.get_function_metadata` missing `param_type` and `param_identifier`
36 | * Update return metadata from all parser
37 |
38 | Version 0.0.5
39 | =============
40 | Release date: Dec 12, 2022
41 |
42 | * Fix package import path
43 | * Adding auto build workflow
44 | * Separate codetext parser from source code processing
45 | * Fix `remove_comment_delimiter` remove leading whitespace
46 | * Update unittest for parser and utilities
47 |
48 | Version 0.0.4
49 | =============
50 | Release date: Dec 2, 2022
51 |
52 | * Fix main package root path
53 | * Loosen `docstring_parser` dependency
54 |
55 | Version 0.0.3
56 | =============
57 | Release date: Dec 2, 2022
58 |
59 | * New clean docstring function
60 | * check_docstring_contain_question
61 | * check_docstring_underdevelopment
62 | * check_docstring_autogenerated
63 | * check_contain_little_single_char
64 | * check_contain_many_special_char
65 | * check_contain_little_unique_chars
66 | * check_contain_little_unique_words
67 | * check_contain_many_special_case
68 | * check_contain_too_many_variables
69 | * check_contain_many_repeated_word
70 | * check_contain_many_uppercase_word
71 | * check_contain_many_long_word
72 |
73 | Version 0.0.2
74 | =============
75 | Release date: Nov 25, 2022
76 |
77 | * Language parser for Rust
78 | * get_docstring
79 | * get_class_list, get_function_list
80 | * get_class_metadata, get_function_metadata
81 | * Processing utils:
82 | * extract_docstring
83 | * extract_node
84 | * get_line_definitions
85 | * get_node_definitions
86 | * process_raw_node
87 | * Postprocessing:
88 | * Merge file (from batches)
89 | * Split into train/test/valid (by #sample category)
90 | * Deduplicate sample
91 |
92 | Version 0.0.1
93 | =============
94 | Release date: Nov 9, 2022
95 |
96 | * Language parser for Java, Python, JavaScript, PHP, Golang, Ruby, C++, C#, C
97 | * get_docstring
98 | * get_class_list, get_function_list
99 | * get_class_metadata, get_function_metadata
100 | * Clean docstring function
101 | * Data preprocessing source code
102 | * Tree-sitter utils: build_language, parse_code
103 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 AI4Code Research Group
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | ______________________________________________________________________
7 |
8 |
9 |
10 | | Branch | Build | Unittest | Release | License |
11 | |-------- |------- |---------- |--------- |--------- |
12 | | main | | [](https://github.com/AI4Code-Research/CodeText-parser/actions/workflows/unittest.yml) | [](https://pypi.org/project/codetext/) [](https://pypi.org/project/codetext/)| [](https://github.com/AI4Code-Research/CodeText-parser/blob/main/LICENSES.txt) |
13 |
14 |
15 |
16 | ______________________________________________________________________
17 |
18 | **Code-Text parser** is a custom [tree-sitter](https://github.com/tree-sitter) grammar parser that extracts classes and functions from raw source code. We support 10 common programming languages:
19 | - Python
20 | - Java
21 | - JavaScript
22 | - PHP
23 | - Ruby
24 | - Rust
25 | - C
26 | - C++
27 | - C#
28 | - Go
29 |
30 | # Installation
31 | The **codetext** package requires Python 3.7 or above and tree-sitter. To set up the environment and install the dependencies manually from source:
32 | ```bash
33 | git clone https://github.com/FSoft-AI4Code/CodeText-parser.git; cd CodeText-parser
34 | pip install -r requirements.txt
35 | pip install -e .
36 | ```
37 |
38 | Or install via `pypi` package:
39 | ```bash
40 | pip install codetext
41 | ```
42 |
43 | # Getting started
44 |
45 | ## `codetext` CLI Usage
46 | ```bash
47 | codetext [options] [PATH or FILE] ...
48 | ```
49 |
50 | For example, to extract every Python file in the `src/` folder:
51 | ```bash
52 | codetext src/ --language Python
53 | ```
54 |
55 | If you want to store the extracted classes and functions, use the `--json` flag and give the path of the destination file with `--output_file`:
56 | ```bash
57 | codetext src/ --language Python --output_file ./python_report.json --json
58 | ```
59 |
60 | **Options**
61 |
62 | ```bash
63 | positional arguments:
64 | paths list of the filename/paths.
65 |
66 | optional arguments:
67 | -h, --help show this help message and exit
68 | --version show program's version number and exit
69 | -l LANGUAGE, --language LANGUAGE
70 | Target the programming languages you want to analyze.
71 | -o OUTPUT_FILE, --output_file OUTPUT_FILE
72 | Output file (e.g report.json).
73 | --json Generate json output as a transform of the default
74 | output
75 | --verbose Print progress bar
76 |
77 | ```
78 |
79 | **Example**
80 | ```
81 | File circle_linkedlist.py analyzed:
82 | ==================================================
83 | Number of class : 1
84 | Number of function : 2
85 | --------------------------------------------------
86 |
87 | Class summary:
88 | +-----+---------+-------------+
89 | | # | Class | Arguments |
90 | +=====+=========+=============+
91 | | 0 | Node | |
92 | +-----+---------+-------------+
93 |
94 | Class analyse: Node
95 | +-----+---------------+-------------+--------+---------------+
96 | | # | Method name | Paramters | Type | Return type |
97 | +=====+===============+=============+========+===============+
98 | | 0 | __init__ | self | | |
99 | | | | data | | |
100 | +-----+---------------+-------------+--------+---------------+
101 |
102 | Function analyse:
103 | +-----+-----------------+-------------+--------+---------------+
104 | | # | Function name | Paramters | Type | Return type |
105 | +=====+=================+=============+========+===============+
106 | | 0 | push | head_ref | | Node |
107 | | | | data | Any | Node |
108 | | 1 | countNodes | head | Node | |
109 | +-----+-----------------+-------------+--------+---------------+
110 | ```
111 |
112 | ## Using `codetext` as Python module
113 | ### Build your language
114 | `codetext` needs a tree-sitter language file (i.e. a `.so` file) to work properly. You can compile the language manually ([see more](https://github.com/tree-sitter/py-tree-sitter#usage)) or build it automatically with our pre-defined function (the `.so` file will be saved in a folder named `/tree-sitter/`):
115 | ```python
116 | from codetext.utils import build_language
117 |
118 | language = 'rust'
119 | build_language(language)
120 |
121 | # INFO:utils:Not found tree-sitter-rust, attempt clone from github
122 | # Cloning into 'tree-sitter-rust'...
123 | # remote: Enumerating objects: 2835, done. ...
124 | # INFO:utils:Attempt to build Tree-sitter Language for rust and store in .../tree-sitter/rust.so
125 | ```
126 |
127 | ### Using Language Parser
128 | Each programming language we support corresponds to a custom `language_parser` (e.g. Python is [`PythonParser()`](src/codetext/parser/python_parser.py#L11)). A `language_parser` takes raw source code as input and uses breadth-first search to traverse all syntax nodes. The classes, methods and stand-alone functions are then collected:
129 |
130 | ```python
131 | from codetext.utils import parse_code
132 |
133 | raw_code = """
134 | /**
135 | * Sum of 2 number
136 | * @param a int number
137 | * @param b int number
138 | */
139 | double sum2num(int a, int b) {
140 | return a + b;
141 | }
142 | """
143 |
144 | # Auto parse code into tree-sitter.Tree
145 | root = parse_code(raw_code, 'cpp')
146 | root_node = root.root_node
147 | ```
148 |
149 | Get all function nodes inside a specific node:
150 | ```python
151 | from codetext.parser import CppParser
152 |
153 | function_list = CppParser.get_function_list(root_node)
154 | print(function_list)
155 |
156 | # []
157 |
158 | ```
159 |
160 | Get function metadata (e.g. function's name, parameters, (optional) return type)
161 | ```python
162 | function = function_list[0]
163 |
164 | metadata = CppParser.get_function_metadata(function, raw_code)
165 |
166 | # {'identifier': 'sum2num', 'parameters': {'a': 'int', 'b': 'int'}, 'type': 'double'}
167 | ```
168 | Get docstring (documentation) of a function
169 | ```python
170 | docstring = CppParser.get_docstring(function, raw_code)
171 |
172 | # ['Sum of 2 number \n@param a int number \n@param b int number']
173 | ```
174 |
175 | We also provide 2 commands for extracting class objects:
176 | ```python
177 | class_list = CppParser.get_class_list(root_node)
178 | # and
179 | metadata = CppParser.get_metadata_list(root_node)
180 | ```
181 |
182 | # Limitations
183 | `codetext` heavily depends on tree-sitter syntax:
184 | - Since we use the tree-sitter grammar to extract the desired nodes (functions, classes, a function's name (identifier), a class's argument list, etc.), `codetext` is easily broken by tree-sitter updates or future syntax changes.
185 |
186 | - While we try our best to capture every possibility, there are still plenty of cases out there. We welcome community contributions to this project.
--------------------------------------------------------------------------------
/asset/img/codetext_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FSoft-AI4Code/CodeText-parser/d2a7365f7f944650e84d9fdb6b6794d6c5ea620b/asset/img/codetext_logo.png
--------------------------------------------------------------------------------
/asset/img/codetext_logo_line.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FSoft-AI4Code/CodeText-parser/d2a7365f7f944650e84d9fdb6b6794d6c5ea620b/asset/img/codetext_logo_line.png
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=61.0"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "codetext"
7 | version = "0.0.9"
8 | authors = [
9 | { name="Dung Manh Nguyen", email="dungnm.workspace@gmail.com" },
10 | ]
11 | description = "Multilingual programming language parsers for the extract from raw source code into multiple levels of pair data"
12 | readme = "README.md"
13 | requires-python = ">=3.6"
14 | classifiers = [
15 | "Programming Language :: Python :: 3",
16 | "License :: OSI Approved :: MIT License",
17 | "Operating System :: OS Independent",
18 | ]
19 | dependencies = [
20 | "tree-sitter>=0.20",
21 | "Levenshtein>=0.20",
22 | "langdetect>=1.0.0",
23 | "bs4>=0.0.1",
24 | "tabulate>=0.9.0"
25 | ]
26 |
27 | [project.urls]
28 | "Homepage" = "https://github.com/AI4Code-Research/CodeText-data"
29 | "Bug Tracker" = "https://github.com/AI4Code-Research/CodeText-data/issues"
30 |
31 | [project.scripts]
32 | codetext = "codetext.__main__:main"
33 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # for preprocessing
2 | tree-sitter==0.20.4
3 | tabulate
4 | Levenshtein
5 | langdetect
6 | bs4
7 | tree_sitter_languages==1.10.2
8 |
--------------------------------------------------------------------------------
/src/codetext/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FSoft-AI4Code/CodeText-parser/d2a7365f7f944650e84d9fdb6b6794d6c5ea620b/src/codetext/__init__.py
--------------------------------------------------------------------------------
/src/codetext/__main__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import argparse
4 | import pkg_resources
5 |
6 | import json
7 | from .codetext_cli import parse_file, print_result, PL_MATCHING
8 |
9 |
def get_args():
    """Declare the ``codetext`` command-line interface and parse ``sys.argv``.

    Returns:
        argparse.Namespace: holds ``paths`` (defaults to ``['.']``),
        ``language``, ``output_file``, ``json`` and ``verbose``.
    """
    cli = argparse.ArgumentParser(description=f"codetext parser {20*'='}")

    # Positional: zero or more files/directories to analyse.
    cli.add_argument('paths', nargs='*', default=['.'],
                     help='list of the filename/paths.')

    # --version prints the version of the installed "codetext" distribution.
    installed_version = pkg_resources.get_distribution("codetext").version
    cli.add_argument("--version", action="version", version=installed_version)

    cli.add_argument(
        "-l", "--language",
        help='''Target the programming languages you want to
        analyze.''',
    )
    cli.add_argument(
        "-o", "--output_file",
        type=str,
        help='''Output file (e.g report.json).
        ''',
    )
    cli.add_argument(
        "--json",
        action="store_true",
        help='''Generate json output as a transform of the
        default output''',
    )
    cli.add_argument(
        "--verbose",
        action="store_true",
        help='''Print progress bar''',
    )

    return cli.parse_args()
33 |
34 |
def main():
    """Entry point of the ``codetext`` command-line tool.

    Gathers the files named by every path argument (regular files directly,
    directories one level deep), optionally keeps only the files whose
    extension belongs to ``--language``, parses each file, prints its summary
    and, with ``--json``, writes all collected metadata to ``--output_file``.

    Raises:
        ValueError: ``--json`` was given without ``--output_file``, or
            ``--language`` is not a key of ``PL_MATCHING``.
        AssertionError: one of the given paths does not exist.
    """
    opt = get_args()

    # check args
    if opt.json and not opt.output_file:
        raise ValueError("Missing --output_file")
    if opt.language and opt.language not in PL_MATCHING:
        raise ValueError(
            "{language} not supported. Currently support {sp_language}"
            .format(language=opt.language,
                    sp_language=list(PL_MATCHING.keys())))

    # check path -- accumulate across ALL paths so that every argument ends up
    # in the same report instead of only the last one being kept.
    files = []
    for path in opt.paths:
        assert os.path.exists(path), "paths is not valid"

        if os.path.isdir(path):
            files.extend(
                os.path.join(path, f) for f in os.listdir(path)
                if os.path.isfile(os.path.join(path, f))
            )
        elif os.path.isfile(path):
            files.append(path)

    if opt.language:
        allowed_extensions = PL_MATCHING[opt.language]
        files = [f for f in files
                 if os.path.splitext(f)[1] in allowed_extensions]

    output_metadata = {}
    for file in files:
        filename, file_extension = os.path.splitext(file)

        if opt.language is None:
            # Infer the language from the extension. Files with an unknown
            # extension are skipped; previously `language` was left unbound
            # (NameError) or silently reused from the preceding file.
            language = next(
                (lang for lang, ext_list in PL_MATCHING.items()
                 if file_extension in ext_list),
                None,
            )
            if language is None:
                continue
        else:
            language = opt.language

        output = parse_file(file, language=language)
        print_result(
            output,
            file_name=str(filename).split(os.sep)[-1]+file_extension
        )
        output_metadata[file] = output

    if opt.json:
        save_path = opt.output_file
        with open(save_path, 'w') as output_file:
            json.dump(output_metadata, output_file, sort_keys=True, indent=4)
        print(50*'=')
        print("Save report to {path}".format(path=save_path))


if __name__ == '__main__':
    main()
94 |
--------------------------------------------------------------------------------
/src/codetext/clean/__init__.py:
--------------------------------------------------------------------------------
1 | """Clean utilities"""
2 |
3 | from .noise_removal import remove_comment_delimiters, remove_special_tag, remove_special_character
4 |
5 |
6 | __all__ = [
7 | 'remove_comment_delimiters', 'remove_special_tag', 'remove_special_character'
8 | ]
--------------------------------------------------------------------------------
/src/codetext/clean/noise_removal.py:
--------------------------------------------------------------------------------
1 | import re
2 | import sys
3 | import warnings
4 | from collections import Counter
5 | from itertools import permutations
6 | from typing import Any, Dict, List, Union
7 |
8 | from langdetect import detect, detect_langs
9 | from bs4 import BeautifulSoup
10 | import Levenshtein as lev
11 |
12 | from tree_sitter import Node
13 | from ..parser.language_parser import tokenize_docstring, get_node_by_kind
14 | warnings.filterwarnings("ignore", category=UserWarning, module='bs4')
15 |
16 |
# Alternation of split points used to break an identifier/text into parts.
# Most alternatives are zero-width (lookbehind + lookahead only):
#   - lowercase letter or digit followed by an uppercase letter
#   - uppercase letter or digit followed by an uppercase + lowercase pair
#   - digit followed by a letter, and letter followed by a digit
#   - one of @ $ . ' " next to an alphanumeric character (either side)
# The last line splits on a literal underscore or a run of whitespace.
REGEX_TEXT = ("(?<=[a-z0-9])(?=[A-Z])|"
              "(?<=[A-Z0-9])(?=[A-Z][a-z])|"
              "(?<=[0-9])(?=[a-zA-Z])|"
              "(?<=[A-Za-z])(?=[0-9])|"
              "(?<=[@$.'\"])(?=[a-zA-Z0-9])|"
              "(?<=[a-zA-Z0-9])(?=[@$.'\"])|"
              "_|\\s+")

# NOTE(review): presumably the version gate exists because the stdlib `re`
# only splits on empty matches from Python 3.7; older interpreters fall back
# to the third-party `regex` module with its "(?V1)" flag -- confirm.
if sys.version_info >= (3, 7):
    import re
    SPLIT_REGEX = re.compile(REGEX_TEXT)
else:
    import regex
    SPLIT_REGEX = regex.compile("(?V1)"+REGEX_TEXT)
31 |
32 |
33 | def split_sentences(docstring):
34 | # sentences = re.split("(? List[str]:
43 | """
44 | Split a single identifier into parts on snake_case and camelCase
45 | """
46 | identifier_parts = list(s.lower() for s in SPLIT_REGEX.split(identifier) if len(s)>0)
47 |
48 | if len(identifier_parts) == 0:
49 | return [identifier]
50 | return identifier_parts
51 |
52 |
def check_is_node_error(node: Node) -> bool:
    """
    Check whether a node contains at least one "ERROR" node.

    Args:
        node (tree_sitter.Node): node to inspect

    Return:
        bool: True when ``get_node_by_kind(node, ['ERROR'])`` finds anything

    Raises:
        ValueError: if ``node`` is not a ``tree_sitter.Node``
    """
    if not isinstance(node, Node):
        # The message used to be passed as ("... %i", type(node)) and was
        # therefore never interpolated; format it explicitly.
        raise ValueError(f"Expect type tree_sitter.Node, get {type(node)}")

    return len(get_node_by_kind(node, ['ERROR'])) > 0
70 |
71 |
def get_node_length(node: Node) -> int:
    """
    Get the node length, measured as end row minus start row.

    Args:
        node (tree_sitter.Node): node to measure

    Return:
        int: ``node.end_point[0] - node.start_point[0]``

    Raises:
        ValueError: if ``node`` is not a ``tree_sitter.Node``
    """
    if not isinstance(node, Node):
        # The message used to be passed as ("... %i", type(node)) and was
        # therefore never interpolated; format it explicitly.
        raise ValueError(f"Expect type tree_sitter.Node, get {type(node)}")

    line_start = node.start_point[0]
    line_end = node.end_point[0]
    return int(line_end - line_start)
87 |
88 |
def remove_comment_delimiters(docstring: str, remove_whitespace: bool=True) -> str:
    """
    Strip comment markers (//, /*, */, #, triple quotes, ...) from a comment.

    Triple quotes are removed at the two ends of the whole text; the other
    markers are removed at the two ends of every line.

    Args:
        docstring (str): raw (line or block) comment
        remove_whitespace (bool): strip each line before looking for markers
    Returns:
        str: the comment without its delimiters

    """
    triple_quotes = re.compile(r'([\'\"]{3})$|^([\'\"]{3})')  # python ''' or """
    hash_marks = re.compile(r'([#]+)$|^([#]+)')  # single-line comment with #
    block_marks = re.compile(r'([\/*=-]+)$|^([\/*!=-]+)')

    def _clean(line):
        if remove_whitespace:
            line = line.strip()
        line = hash_marks.sub('', line)
        return block_marks.sub('', line)

    body = triple_quotes.sub('', docstring)
    return '\n'.join(_clean(line) for line in body.split('\n'))
115 |
116 |
def remove_special_tag(docstring: str) -> str:
    """
    Drop markup tags from a docstring: the text is parsed with BeautifulSoup's
    "html.parser" and only the text content is returned.
    """
    soup = BeautifulSoup(docstring, "html.parser")
    return soup.get_text()
122 |
123 |
def remove_special_character(docstring: str) -> str:
    """
    Replace every character that is not an ASCII letter, a digit, a backslash,
    an underscore, a dot or a comma with a single space.
    """
    disallowed = re.compile(r'[^a-zA-Z0-9\\\_\.\,]')
    return disallowed.sub(' ', docstring)
126 |
127 |
def remove_function_name_at_the_beginning(docstring):
    """
    Remove a leading "name:" or "name -" prefix, where the name is a run of
    letters, digits, underscores and parentheses at the very start of the text.

    This function is applied at docstring/paragraph-level.
    """
    # Raw strings: "\s" and "\(" are invalid escape sequences in ordinary
    # string literals and trigger a warning on recent Python versions.
    ending_symbols = [":", r"\s-"]
    for symbol in ending_symbols:
        pattern = r"^[a-zA-Z0-9_\(\)]+" + symbol
        docstring = re.sub(pattern, "", docstring)

    return docstring.strip()
140 |
141 |
def remove_link_in_brackets(docstring):
    r"""
    Remove (...) and <...> groups whose content starts with "http", "see",
    "e.g" or "eg" followed by any character, for example:
        - (https://www.a.ai)
        - <see something>

    NOTE(review): the previous docstring also listed \param and \brief as
    examples, but the pattern below does not touch them.

    This function is applied to each line of the docstring/paragraph.
    """
    # Raw string: "\%" and "\." are invalid escapes in an ordinary literal.
    # After %-formatting the brackets end up backslash-escaped, e.g. "\(".
    pattern = r"\%s(?:http|see|e\.g|eg.).*?\%s"
    bracket_pairs = [("(", ")"), ("<", ">")]
    for pair in bracket_pairs:
        docstring = re.sub(pattern % pair, "", docstring.strip())

    return docstring.strip()
160 |
161 |
def remove_everything_after_a_pattern(docstring):
    """
    Truncate the docstring at the first occurrence of each marker in turn,
    keeping only the text in front of it.

    This function is applied at docstring-level
    """
    markers = (
        "E.g", "e.g", "eg.", "Eg.",
        "Example usage:", "Created by", "Example:",
        "Note:", ". Note", "note::", "note:", ". note",
    )

    remaining = docstring
    for marker in markers:
        # partition()[0] is the text before the first occurrence of the marker
        remaining, _, _ = remaining.strip().partition(marker)

    return remaining.strip()
180 |
181 |
def remove_everything_after_an_url(docstring):
    """
    Keep the sentences that come before the first sentence containing
    "https:" or "http:", then join them back with ". ".

    This function applies at sentence-level
    TO-DO: Should apply on docstring-level by regular expression
    """
    url_markers = ["https:", "http:"]

    kept = []
    for sentence in split_sentences(docstring):
        if any(marker in sentence for marker in url_markers):
            # first sentence holding an URL: drop it and everything after it
            break
        kept.append(sentence)

    return ". ".join(kept).strip()
204 |
205 |
def remove_lines_start_and_end_with_the_same_char(docstring):
    """
    Remove noisy lines: every line of at least two characters that starts and
    ends with the same character out of * - _ = / + is dropped. All remaining
    lines are kept in their stripped form (blank lines included).

    This function applies at line-level
    """
    noise_chars = ("*", "-", "_", "=", "/", "+")

    kept = []
    for line in docstring.strip().split("\n"):
        line = line.strip()
        # Same condition as the former regex "^<c>.*<c>$", without building a
        # pattern (with an invalid "\%" escape) for every line and character.
        is_noise = (
            len(line) >= 2
            and line[0] == line[-1]
            and line[0] in noise_chars
        )
        if not is_noise:
            kept.append(line)

    return "\n".join(kept).strip()
232 |
233 |
def remove_lines_contain_only_a_single_char(docstring):
    """
    Blank out every line that, once stripped, is exactly one of * / = - +.

    This function applies at line-level
    """
    lone_symbols = ["*", "/", "=", "-", "+"]

    cleaned = [
        "" if line.strip() in lone_symbols else line
        for line in docstring.strip().split("\n")
    ]

    return "\n".join(cleaned).strip()
248 |
249 |
def remove_patterns_at_any_positions(docstring):
    """
    Delete every occurrence of a fixed set of noise tokens, wherever they
    appear, stripping the text after each removal.

    This function applies at docstring-level
    """
    # NOTE(review): in the reviewed copy the 3rd and 4th entries were reduced
    # to an empty string and a string broken across two lines (markup appears
    # to have been stripped), which is not valid Python. "<code>" / "</code>"
    # are the presumed originals -- confirm against the upstream file.
    patterns = ["/**", "/*", "<code>", "</code>", "*-*"]
    for pattern in patterns:
        if pattern in docstring:
            docstring = docstring.replace(pattern, "").strip()

    return docstring
260 |
261 |
def remove_patterns_at_the_start_and_end_of_a_line(docstring):
    """
    For every line, drop a leading "* " and any run of two or more identical
    symbols (. * - _ @ # $ ! \\ / +) found at the start or at the end of the
    line, repeating until a pass removes no symbol run.

    This function applies at line-level
    """
    prefixes = ["* "]
    symbols = [".", "*", "-", "_", "@", "#", "$", "!", "\\", "/", "+"]
    # Same order as before: all start-anchored patterns, then all end-anchored.
    leading = [r"^\%s{2,}" % (symbol) for symbol in symbols]
    trailing = [r"\%s{2,}$" % (symbol) for symbol in symbols]

    cleaned = []
    for line in docstring.strip().split("\n"):
        changed = True
        while changed:
            changed = False
            # at the beginning (removing a prefix alone does not force a new pass)
            for prefix in prefixes:
                if line.startswith(prefix):
                    line = line[len(prefix):]
            # symbol runs at the beginning, then at the end
            for regex in leading + trailing:
                shortened = re.sub(regex, "", line)
                if shortened != line:
                    changed = True
                    line = shortened
        cleaned.append(line)

    return "\n".join(cleaned).strip()
295 |
296 |
def remove_patterns_at_the_end_of_a_docstring(docstring):
    """
    Replace a trailing noise token (":", ";", ",", "...", "@@" or "@") with a
    single full stop, then strip the result.

    This function applies at docstring-level
    """
    patterns = [":", ";", ",", "...", "@@", "@"]
    # The former test `docstring[-1] in patterns` compared one character with
    # the list, so the multi-character entries "..." and "@@" could never
    # match. Check suffixes instead, longest first, so "@@" wins over "@".
    for pattern in sorted(patterns, key=len, reverse=True):
        if docstring.endswith(pattern):
            docstring = docstring[:-len(pattern)] + '.'
            break

    return docstring.strip()
310 |
311 |
def remove_specific_pattern(docstring: str) -> str:
    """
    Apply three clean-ups in this order:
      1. delete parenthesised asides that open with i.e / e.g / eg / ie
         (case-insensitive), e.g. "(e.g something)"
      2. delete runs of five or more of the characters - = # *
      3. unwrap "{@tag content}" into " content"
    """
    aside = re.compile(r'(\(((i\.e)|(e\.g)|(\beg)|(\bie))[\s\S]+?)(\))', flags=re.IGNORECASE|re.MULTILINE)
    inline_tag = re.compile(r'{@.*?}')
    separator = re.compile(r'(-|=|#|\*){5,}')

    docstring = aside.sub('', docstring)
    docstring = separator.sub('', docstring)

    for tagged in inline_tag.findall(docstring):
        inner = str(tagged)[1:-1]  # remove { }
        inner = re.sub(r'@\w*', '', inner)
        docstring = docstring.replace(tagged, inner)

    return docstring
333 |
334 |
def remove_unrelevant(docstring: str) -> str:
    """
    Run the removal functions below over the docstring again and again until a
    full pass leaves it unchanged, then normalise its ending with
    `remove_patterns_at_the_end_of_a_docstring`.
    """
    pipeline = (
        remove_specific_pattern,
        remove_link_in_brackets,
        # remove_everything_after_an_url,  # Overlap
        # remove_everything_after_a_pattern,  # Noticeable wrong catch
        remove_patterns_at_any_positions,
        remove_lines_contain_only_a_single_char,
        remove_lines_start_and_end_with_the_same_char,
        remove_patterns_at_the_start_and_end_of_a_line,
        remove_function_name_at_the_beginning,
    )

    while True:
        before = docstring
        for step in pipeline:
            docstring = step(docstring)
        if docstring == before:
            # fixed point reached: nothing left for the pipeline to remove
            break

    return remove_patterns_at_the_end_of_a_docstring(docstring)
363 |
364 |
365 | # =================== Check code ======================
366 |
def check_is_black_node(node_name: str, exclude_list: List = None):
    """
    Check if a node name belongs to the black list, i.e. the name:
        - starts and ends with a double underscore
        - starts with "set" or "get"
        - contains a test/toString/constructor keyword or any keyword from
          `exclude_list`

    Args:
        node_name (str): name of the function/class node
        exclude_list (List): optional extra keywords to black-list

    Returns:
        bool: True when the name is black-listed

    Raises:
        ValueError: if `node_name` is not a string
    """
    # Validate first so a bad name is reported before anything else runs.
    if not isinstance(node_name, str):
        raise ValueError(f'Expect str, get {type(node_name)}')

    black_keywords = ['test_', 'Test_', '_test', 'toString', 'constructor', 'Constructor']
    # The default exclude_list=None used to crash in list.extend(None).
    if exclude_list:
        black_keywords.extend(exclude_list)

    if node_name.startswith('__') and node_name.endswith('__'):
        return True
    if node_name.startswith('set') or node_name.startswith('get'):
        return True
    if any(keyword in node_name for keyword in black_keywords):
        return True

    return False
387 |
388 |
def check_is_empty_function(node):
    """
    A node spanning at most 3 lines (as measured by `get_node_length`) is treated as an
    empty function; anything longer is not.
    """
    return get_node_length(node) <= 3
396 |
397 |
def check_autogenerated_by_code(raw_code: str, identifier: str):
    """
    Check whether a piece of text is just a restatement of the function name.

    The identifier is split into its word parts and compared, by Levenshtein distance,
    with the text reduced to lowercase alphanumerics.

    Args:
        raw_code (str): text to compare against the identifier.
            NOTE(review): the previous body read an undefined local `comment` (and so
            raised UnboundLocalError on every call); this assumes `raw_code` is the text
            that was meant -- confirm against callers.
        identifier (str): function name
    Return:
        bool: True if the edit distance is at most 40% of the longer string's length
    """
    threshold = 0.4
    fn_name_splited = ' '.join(split_identifier_into_parts(identifier)).lower()

    # Every non-alphanumeric character becomes a space so only words are compared.
    comment = re.sub(r'[^a-zA-Z0-9]', ' ', raw_code).lower()

    distance = lev.distance(fn_name_splited, comment)
    longest = max(len(fn_name_splited), len(comment))

    return distance <= longest * threshold
412 |
413 | # =================== Check docstring ======================
414 |
def check_docstring_length(docstring: str):
    """
    Return True (i.e. reject) when the docstring has fewer than 3 whitespace-separated tokens.
    """
    minimum_tokens = 3
    # str.split() without arguments already ignores leading/trailing whitespace.
    return len(docstring.split()) < minimum_tokens
421 |
422 |
def check_docstring_literal(docstring: str):
    """
    Rough "is this English?" filter. Returns True (i.e. reject) when the docstring
    contains a non-ASCII character or has no ASCII letter/digit at all.

    TODO: "Ce n'est pas en anglais" -> Fr. A language-detection step on the
    alphanumeric-only, case-split text was drafted here and is still disabled.
    """
    if not docstring.isascii():
        return True
    return re.search('[a-zA-Z0-9]', docstring) is None
445 |
446 |
def check_docstring_contain_question(docstring: str):
    """
    Return True when the docstring looks like a question: it ends with `?` or starts with
    an interrogative word (why, how, what/what's, where, is, are), case-insensitively.

    An empty string is not a question and yields False.
    """
    pattern = re.compile(r'(?i)^(why\b|how\b|what\'?s?\b|where\b|is\b|are\b)')

    # endswith() is safe on an empty string, unlike indexing with [-1].
    return docstring.endswith('?') or pattern.search(docstring) is not None
454 |
455 |
def check_docstring_underdevelopment(docstring: str):
    """
    Return True when the docstring starts with a placeholder / work-in-progress marker,
    e.g. "NOT YET DOCUMENTED", "(non-Javadoc)", "TODO", "FIXME", "deprecate", "copyright".

    Both patterns are anchored at the start of the string and are case-insensitive.
    """
    # Raw strings: `\s`, `\S` and `\(` are regex escapes, not Python string escapes.
    p1 = re.compile(
        r"(?i)^((Description of the Method)|(NOT YET DOCUMENTED)|(Missing[\s\S]+Description)|(not in use)|"
        r"(Insert the method's description here)|(No implementation provided)|(\(non-Javadoc\)))"
    )
    p2 = re.compile(r'^(todo|to-do|deprecate|copyright|fixme)', flags=re.IGNORECASE)

    return p1.search(docstring) is not None or p2.search(docstring) is not None
466 |
467 |
def check_docstring_autogenerated(docstring: str):
    """
    Return True when the docstring looks machine generated:
        - it contains an `@...generated` tag anywhere (e.g. `@generated`), or
        - it starts with "auto-generated" / "auto generated",
          "This method initializes" or "This method was generated by".

    All patterns are case-insensitive. `None` and non-matching text both yield False.
    """
    if docstring is None:
        return False

    tag_anywhere = re.compile(r'(?i)@[a-zA-Z]*generated\b')
    leading_markers = (
        re.compile(r'(?i)^([aA]uto[-\s]generated)'),
        re.compile(r'(?i)^(This method initializes)'),
        re.compile(r'(?i)^(This method was generated by)'),
    )

    if tag_anywhere.search(docstring):
        return True
    # Always hand back a real bool; the previous version fell through and returned None.
    return any(marker.search(docstring) for marker in leading_markers)
483 |
484 |
def check_docstring_contain_specific_pattern(docstring: str):
    """
    Return True when the line mentions an example/reference marker AND contains a
    special character.

    Markers: "i.e", "e.g", "eg", "ie" (followed by whitespace or a dot), a leading "See",
    "example usage", "example", "note". Special characters are anything other than
    letters, digits, whitespace and `. , : ; ' "`.

    NOTE(review): the checks previously used `.match`, which only inspects the start of
    the string; the special-character test therefore looked at the first character alone,
    which is a letter whenever a marker matched. `.search` is used so the whole line is
    inspected, as the inline comment describes. This rejects more lines than before --
    confirm this is the intended filter.
    """
    condition1 = re.compile(r'((i\.e)|(e\.g)|(\beg)|(\bie))(\s|\.)', flags=re.IGNORECASE)
    condition2 = re.compile(r'(^(Sees*)|(example usage)|(example)|(note:*))', flags=re.IGNORECASE)
    condition_follow = re.compile(r'[^a-zA-Z0-9\s\.\,\:\;\'\"]')

    # if pattern 1 or 2 is found -> check if the line contains any special characters
    has_marker = condition1.search(docstring) or condition2.search(docstring)
    if has_marker and condition_follow.search(docstring):
        return True

    return False
496 |
497 |
498 | # =================== Check characters ======================
499 |
def does_str_containt_math(str):
    """
    Return True when the text contains one of the math indicators
    ("equation", LaTeX-style `\\exp(`, `\\log(`, `\\sqrt(`, "mathbf", "mathrm").

    Args:
        str: text to inspect (the parameter name shadows the builtin; it is kept so
            keyword callers keep working)
    """
    # Raw strings keep the literal backslash without relying on invalid escape sequences.
    math_indicators = ["equation", r"\exp(", r"\log(", r"\sqrt(", "mathbf", "mathrm"]
    # TODO: page [number]
    return any(math_indicator in str for math_indicator in math_indicators)
510 |
511 |
def check_contain_little_alphabet_char(docstring: str):
    """
    Return True (i.e. reject) when too small a share of the non-whitespace characters
    are ASCII letters.

    Regular text: longer than 5 characters with a letter ratio below 0.65.
    Text with math indicators: longer than 15 characters with a letter ratio below 0.4.
    Text that is empty after removing whitespace is rejected as well.
    """
    stripped = docstring.strip()
    if does_str_containt_math(stripped):
        min_length, min_letter_ratio = 15, 0.4
    else:
        min_length, min_letter_ratio = 5, 0.65

    compact = "".join(stripped.split())
    if not compact:
        return True

    letter_count = sum(1 for _ in re.finditer("[a-zA-Z]", compact))
    return len(compact) > min_length and letter_count / len(compact) < min_letter_ratio
522 |
523 |
def convert_special_pattern(docstring):
    """
    Collapse date/time and colour format strings into a single lowercase word so their
    separators are not counted as special characters later on.

    Every ordering of each component group, joined by each of its separators
    (e.g. "HH:MM:SS", "dd-mm-yyyy", "R,G,B"), is replaced by the lowercase
    concatenation of the components ("hhmmss", "ddmmyyyy", "rgb").
    """
    clock_or_date_separators = (":", "-")
    colour_separators = (",", "-")
    component_groups = [
        (["HH", "MM", "SS"], clock_or_date_separators),
        (["MM", "DD", "YY"], clock_or_date_separators),
        (["MM", "DD", "YYYY"], clock_or_date_separators),
        (["hh", "mm", "ss"], clock_or_date_separators),
        (["mm", "dd", "yy"], clock_or_date_separators),
        (["mm", "dd", "yyyy"], clock_or_date_separators),
        (["R", "G", "B"], colour_separators),
        (["r", "g", "b"], colour_separators),
    ]

    for components, separators in component_groups:
        for separator in separators:
            for ordering in permutations(components):
                # replace() leaves the text untouched when the needle is absent.
                docstring = docstring.replace(separator.join(ordering), "".join(ordering).lower())
    return docstring
546 |
547 |
def check_contain_many_special_char(docstring: str):
    """
    Return True (i.e. reject) when the docstring contains too many special characters.

    Two rules are applied, with looser limits when the text contains math indicators:
        1. Any single symbol occurring more often than its cap:
           regular symbol 4, opening bracket 6, and (math text only) math operator 10.
        2. The total count of non-bracket symbols exceeding both
           max(floor, ratio * number_of_tokens) and a hard cap:
           regular text floor 10, ratio 0.3, hard cap 15;
           math text    floor 17, ratio 0.5, hard cap 20.

    The math ratio used to be written `0,5`, which split into two list items and made
    the effective ratio 0.
    """
    max_regular_symbol = 4
    max_bracket = 6
    max_math_symbol = 10
    max_bracket_in_math = 6

    docstring = docstring.strip()
    containt_math = does_str_containt_math(docstring)
    # Date/time/colour formats are collapsed first so their separators are not counted.
    docstring = convert_special_pattern(docstring)
    num_tokens = len(tokenize_docstring(docstring))
    counter = Counter(docstring)

    opening_brackets = ("(", "[", "{")
    math_symbols = ["+", "-", "*", "/", ":", "^", "=", "<", ">", "|", "(",]
    symbols = ["$", "!", "@", "#", "%", "^", "&", "*", "<", ">",
               "~", "|", "\\", "'", '"', "?", "-", "+", "=", "`",
               ":", "/", "(", "[", "{"]

    count = 0
    for symb in symbols:
        if symb in opening_brackets:
            threshold = max_bracket_in_math if containt_math else max_bracket
        elif containt_math and symb in math_symbols:
            threshold = max_math_symbol
        else:
            threshold = max_regular_symbol

        if counter[symb] > threshold:
            return True

        # Brackets have their own cap and are left out of the running total.
        if symb not in opening_brackets:
            count += counter[symb]

    if containt_math:
        total_floor, total_ratio, hard_cap = 17, 0.5, 20
    else:
        total_floor, total_ratio, hard_cap = 10, 0.3, 15

    return count > max(total_floor, total_ratio * num_tokens) and count > hard_cap
585 |
586 |
def check_contain_little_unique_chars(docstring):
    """
    This function applies on a docstring line.

    Return True (i.e. reject) when the line, with all whitespace removed, is longer than
    5 characters yet uses at most 3 distinct characters.
    """
    min_length, max_distinct = 5, 3
    compact = "".join(docstring.split())
    if len(compact) <= min_length:
        return False
    return len(set(compact)) <= max_distinct
594 |
595 | # =================== Check words ======================
596 |
def check_contain_little_unique_words(docstring):
    """
    Return True (i.e. reject) when one word dominates the docstring.

    Only alphanumeric words are considered. The most frequent word that is not a stop
    word must occur at least 3 times and make up more than 30% of all tokens.
    A docstring without any word is rejected; one made only of stop words is accepted.
    """
    min_repeats, max_share = 3, 0.3
    ignored_words = ["the", "of", "a", "an", "it", "for", "or", "in", "but",]

    words_only = ' '.join(re.findall(r'\b[a-zA-Z0-9]+\b', docstring))
    docstring_tokens = tokenize_docstring(words_only)

    ranked = Counter(docstring_tokens).most_common()
    if not ranked:
        return True

    # Walk down the frequency ranking to the first word that is not a stop word.
    for word, occurrences in ranked:
        if word not in ignored_words:
            return occurrences >= min_repeats and occurrences / len(docstring_tokens) > max_share

    return False
620 |
621 |
622 | # def check_contain_many_special_case(docstring: str):
623 | # """
624 | # Check if the string contains too much sneak_case or camelCase
625 | # """
626 | # threshold = 0.3
627 | # total_words = docstring.strip().split()
628 | # if len(total_words) == 0:
629 | # return True
630 | # sneak_cases = re.findall("\w+_\w+", docstring)
631 | # camelCases = re.findall("[A-Z]([A-Z0-9]*[a-z][a-z0-9]*[A-Z]|[a-z0-9]*[A-Z][A-Z0-9]*[a-z])[A-Za-z0-9]*", docstring)
632 | # return (len(sneak_cases) + len(camelCases))/len(total_words) > threshold
633 |
634 |
635 | # def check_contain_many_repeated_word(docstring: str):
636 | # """
637 | # Check if the string (longer than 30 words) have too many repeated word
638 | # """
639 | # threshold_dict = [30, 0.5] # max number, ratio
640 | # docstring = "".join(docstring.strip().split())
641 | # counter = Counter(docstring)
642 | # return len(docstring) > threshold_dict[0] and counter.most_common()[0][1] / len(docstring) > threshold_dict[1]
643 |
644 |
def check_contain_many_uppercase_word(docstring: str):
    """
    Return True (i.e. reject) when the docstring has more than 10 tokens and more than
    30% of them are UPPERCASE words.

    Known format tokens (DD, MM, YY, YYYY, R,G,B, R-G-B, SS, HH, API) and snake_case
    identifiers are lowercased first so they are not counted. An uppercase word is only
    counted when it is preceded by whitespace, so the very first word never counts.
    """
    min_tokens, max_uppercase_share = 10, 0.3
    patterns = ["DD", "MM", "YY", "YYYY", "R,G,B", "R-G-B", "SS", "HH", "API"]
    for pattern in patterns:
        docstring = docstring.replace(pattern, pattern.lower())

    docstring = docstring.strip()
    # Raw string: `\w` is a regex class, not a Python string escape.
    for identifier in re.findall(r"\w+_\w+", docstring):
        docstring = docstring.replace(identifier, identifier.lower())

    uppercase_words = re.findall(r"(?<=\s)[A-Z][A-Z0-9_]+", docstring)
    docstring_tokens = docstring.split()
    return (len(docstring_tokens) > min_tokens
            and len(uppercase_words) / len(docstring_tokens) > max_uppercase_share)
660 |
661 |
def check_contain_too_many_variables(docstring):
    """
    Check if the string contains too many snake_case or camelCase identifiers.

    Return True (i.e. reject) when the number of such identifiers is more than 30% of
    the whitespace-separated words. An empty docstring yields False.
    """
    max_variable_share = 0.3
    total_words = docstring.strip().split()
    if not total_words:
        return False

    # snake_case variable names (raw string: `\w` is a regex class, not a string escape)
    snake_case_identifiers = re.findall(r"\w+_\w+", docstring)
    # Remove them so the camelCase search below cannot count the same text again.
    for identifier in snake_case_identifiers:
        docstring = docstring.replace(identifier, "").strip()

    # CamelCase variable names
    camel_case_identifiers = [
        found.group()
        for found in re.finditer(
            r"[A-Z]([A-Z0-9]*[a-z][a-z0-9]*[A-Z]|[a-z0-9]*[A-Z][A-Z0-9]*[a-z])[A-Za-z0-9]*",
            docstring,
        )
    ]

    variable_names = snake_case_identifiers + camel_case_identifiers
    return len(variable_names) / len(total_words) > max_variable_share
682 |
683 |
def check_contain_too_many_method_call(docstring):
    """
    Return True (i.e. reject) when dotted/called expressions such as `foo.bar` or
    `foo(bar` number more than 20% of the whitespace-separated words.
    An empty docstring yields False.
    """
    max_call_share = 0.2
    words = docstring.strip().split()
    if len(words) == 0:
        return False

    call_count = 0
    for _ in re.finditer(r"[a-zA-Z0-9]+((\.|\()[a-zA-Z0-9]+)+", docstring):
        call_count += 1

    return call_count / len(words) > max_call_share
694 |
695 |
def camel_case_split(identifier):
    """
    Split a camelCase / PascalCase identifier into its parts, keeping acronyms together
    (e.g. "HTTPResponse" -> ["HTTP", "Response"]).
    """
    boundary = re.compile(r'.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)')
    parts = []
    for piece in boundary.finditer(identifier):
        parts.append(piece.group(0))
    return parts
699 |
700 |
def snake_case_split(identifier):
    """
    Split an identifier on underscores after trimming surrounding whitespace.
    """
    trimmed = identifier.strip()
    return trimmed.split("_")
703 |
704 |
def split_all_sepcial_case(docstring: str):
    """
    Tokenize the docstring, then break every token on underscores and on camelCase
    boundaries, returning the flat list of resulting pieces.
    """
    return [
        piece
        for token in tokenize_docstring(docstring.strip())
        for chunk in snake_case_split(token)
        for piece in camel_case_split(chunk)
    ]
714 |
def check_contain_many_long_word(docstring: str):
    """
    Return True (i.e. reject) when any piece of the docstring, after splitting on
    snake_case and camelCase boundaries, is longer than 30 characters.
    A docstring that yields no piece at all is rejected too.
    """
    longest_allowed = 30
    pieces = split_all_sepcial_case(docstring)

    if not pieces:
        return True

    return any(len(piece) > longest_allowed for piece in pieces)
723 |
724 |
def check_contain_url(docstring: str):
    """
    Return True when the docstring contains something that looks like a URL
    (http/https/ftp/file scheme, or a `www.` / `ftp.` prefix), case-insensitively.
    """
    url_pattern = re.compile(r'(?:(?:https?|ftp|file):\/\/|www\.|ftp\.)(?:\([-A-Z0-9+&@#\/%=~_|$?!:,.]*\)|[-A-Z0-9+&@#\/%=~_|$?!:,.])*(?:\([-A-Z0-9+&@#\/%=~_|$?!:,.]*\)|[A-Z0-9+&@#\/%=~_|$])', flags=re.I)
    return url_pattern.search(docstring) is not None
731 |
732 | # =================== End checking ======================
733 |
def check_function(node, node_metadata: Dict[str, Any], exclude_list: List = None, is_class=False):
    """
    Decide whether a function node is worth keeping. A node is rejected when
        - it is an error node,
        - its name is black-listed (built-in, test, getter/setter, constructor, or
          matching `exclude_list`),
        - it is empty (at most 3 lines long).

    Args:
        node (tree_sitter.Node): function node
        node_metadata (Dict[str, Any]): metadata of the node; only `identifier` is read
        exclude_list (List): extra name keywords to reject
        is_class (bool): currently unused
    Return:
        bool: pass the check or not
    """
    identifier = node_metadata['identifier']

    # Evaluated left to right; the first failing check short-circuits the rest.
    rejected = (
        check_is_node_error(node)
        or check_is_black_node(identifier, exclude_list)
        or check_is_empty_function(node)
    )
    return not rejected
760 |
761 |
def check_docstring(docstring: str, loosen_filter: bool = False):
    """
    Check docstring is valid or not.

    Args:
        docstring (str): one docstring line/sentence
        loosen_filter (bool): when True, the special-character, variable-name and
            method-call checks are skipped
    Return:
        bool: True if the docstring FAILS (is empty or trips any check), False if it passes
    """
    # `check_docstring_literal`, `check_contain_many_special_case` and
    # `check_contain_many_repeated_word` are intentionally not part of the pipeline.
    all_checks = (
        check_docstring_contain_question,
        check_docstring_underdevelopment,
        check_docstring_autogenerated,
        check_docstring_contain_specific_pattern,
        check_contain_little_alphabet_char,
        check_contain_many_special_char,
        check_contain_little_unique_chars,
        check_contain_little_unique_words,
        check_contain_too_many_variables,
        check_contain_too_many_method_call,
        check_contain_many_uppercase_word,
        check_contain_many_long_word,
        check_contain_url,
    )
    skipped_when_loosened = (
        check_contain_many_special_char,
        check_contain_too_many_variables,
        check_contain_too_many_method_call,
    )

    active_checks = [
        check for check in all_checks
        if not (loosen_filter and check in skipped_when_loosened)
    ]

    # An empty docstring fails outright.
    if not docstring:
        return True

    # Stop at the first check that flags the docstring.
    return any(check(docstring) for check in active_checks)
839 |
840 |
def clean_docstring(docstring: str, loosen_filter: bool = False):
    """
    Clean docstring by removing special tag/url, characters, unrelevant information.

    The docstring is stripped of comment delimiters, split into paragraphs on blank
    lines and then into sentences. Within a paragraph, sentences are kept up to (not
    including) the first one that fails `check_docstring`.

    Args:
        docstring (str): raw docstring/comment
        loosen_filter (bool): forwarded to `check_docstring`
    Return:
        str or None: the cleaned docstring, or None when the input is empty, fails the
            literal (ASCII/alphanumeric) check, cannot be processed, or ends up shorter
            than 3 tokens
    """
    if docstring is None or docstring == '':
        return None

    _docstring = remove_comment_delimiters(docstring)
    if check_docstring_literal(_docstring):  # True is not pass
        return None

    cleaned_paragraphs = []
    for para in _docstring.strip().split('\n\n'):
        docs = remove_unrelevant(para)
        # Split after sentence punctuation that is followed by whitespace.
        sentences = re.split(r'(?<=.)[.!\?](?=\s+)', docs, flags=re.M)

        kept_sentences = []
        for line in sentences:
            try:
                line = remove_special_tag(line)
            except Exception:
                # Best effort: give up on this docstring, but let KeyboardInterrupt and
                # SystemExit propagate (a bare `except:` would swallow them).
                print('Oops')
                return None

            if check_docstring(line, loosen_filter):
                break
            kept_sentences.append(line)

        # Sentences were dropped: the empty item makes the joined paragraph end with '.'
        if len(kept_sentences) < len(sentences):
            kept_sentences.append('')
        cleaned_paragraphs.append('.'.join(kept_sentences))

    cleaned_docstring = '\n\n'.join(cleaned_paragraphs)

    if check_docstring_length(cleaned_docstring):
        return None

    return cleaned_docstring
888 |
if __name__ == '__main__':
    # Manual smoke test: sample comments for the cleaning helpers.

    # test remove comment delimiters
    # NOTE(review): the three Java fragments below are not separated by commas, so Python
    # concatenates them into one string -- confirm that this is intended.
    raw = [
        '// C, C++, C#',
        '/// C, C++, C#',

        '/*******'
        '* Java'
        '/*******',
        '//** Java */',

        '# Python',

        '//! Rust',
        '//!!! Rust',
        '/*!! Rust',
        '/*! Rust',

        '''
        /* The code below will print the words Hello World to the screen, and it is amazing

        Somethin here too*/
        '''
    ]

    # for item in raw:
    #     print(remove_comment_delimiters(item))

    samples = [
        '\n\t\t/* 将JSONArray转换为Bean的List, 默认为ArrayList */',
        '// TODO: Why is he using Math.round?',
        '/* for now try mappig full type URI */',
        '// public String transformTypeID(URI typeuri){',
        '// return typeuri.toString();}',
        '/* Do we need to show the upgrade wizard prompt? */',
        '/* fixme: This function is not in use */',
        '// SampleEncryptionBox (senc) and SampleAuxiliaryInformation{Sizes|Offsets}Box',
        '/* This method initializes by me. The second line \n\n Abcdef*/',
        '/* @func_name_generated',
        '/* Auto-generated by IDE',
        '/ Auto-generated by IDE',
        '''
        /// Abc
        /// Abc
        /// Abc
        ''',
        '''
        /* Abc
        * def
        */
        '''
    ]

    # for item in samples:
    #     print(clean_docstring(item))

    samples = [
        '''
        Returns the Surface's pixel buffer if the Surface doesn't require locking.
        (e.g. it's a software surface)
        ''',
        '''
        Taking in a sequence string, return the canonical form of the sequence
        (e.g. the lexigraphically lowest of either the original sequence or its
        reverse complement)
        ''',
        '''
        Internal clear timeout. The function checks that the `id` was not removed
        (e.g. by `chart.destroy()`). For the details see
        [issue #7901](https://github.com/highcharts/highcharts/issues/7901).
        ''',
    ]

    # print('==== Cleaning ====')
    # for item in samples:
    #     print(clean_docstring(item))

    sample = '''
    Returns the message Id to use as heading text, depending on what types of
    usage are present (i.e. just writable files, or also readable directories,
    etc).
    |need_lifetime_text_at_end| is set to false iff the returned message Id
    already includes an explanation for how long a website will have access to
    the listed paths. It is set to true iff a separate label is needed at the end
    of the dialog to explain lifetime.
    '''
    print(sample)
    print('==== Cleaning ====')
    # clean_docstring returns a str (or None), not a tuple: print it as is. Indexing it
    # with [0] printed a single character, or raised TypeError on None.
    print(clean_docstring(sample))

    # print(extract_docstring(sample, [], 'cpp'))

    # sample = '''Convert java.util.regex.Matcher groups to JavaScript groups'''
    # print(check_contain_too_many_variables(sample))
--------------------------------------------------------------------------------
/src/codetext/codetext_cli.py:
--------------------------------------------------------------------------------
1 | import os
2 | from typing import List, Dict
3 |
4 | from tabulate import tabulate
5 |
6 | from .parser import *
7 | from .utils import parse_code
8 |
9 |
def parse_file(file_path: str, language: str = None, verbose: bool = False) -> Dict:
    """
    Parse a source file and collect metadata for its classes and stand-alone functions.

    Args:
        file_path (str): path of the source file
        language (str): programming language of the file (case-insensitive); one of
            python, java, javascript, go, c, c++, c#, rust, ruby, php
        verbose (bool): print progress banners
    Return:
        Dict: `{"class": [...], "function": [...]}`. Each class entry is the class
            metadata plus its `code` and a `method` list (method metadata plus `code`);
            each function entry is the metadata of a function that is not a class method.
    Raises:
        AssertionError: if `language` is None or `file_path` is not a file
        KeyError: if `language` is not supported
    """
    assert language is not None, "Auto detect is not implemented, please specify language"
    language = str(language).lower()
    # assert (language in SUPPORT_LANGUAGE) == True, f"{language} is not supported"
    assert os.path.isfile(file_path), "File not found"

    if verbose:
        print(50 * "=")
        print("Parse code into tree-sitter node")

    # Context manager so the file handle is closed even if reading fails.
    with open(file_path, "r") as source_file:
        content: str = source_file.read()
    root_node = parse_code(raw_code=content, language=language).root_node

    parser_by_language = {
        "python": PythonParser,
        "java": JavaParser,
        "javascript": JavascriptParser,
        "go": GoParser,
        "c": CppParser,
        "c++": CppParser,
        "c#": CsharpParser,
        "rust": RustParser,
        "ruby": RubyParser,
        "php": PhpParser,
    }
    if language not in parser_by_language:
        raise KeyError(f"{language} is not supported")
    parser: LanguageParser = parser_by_language[language]

    if verbose:
        print(50 * "=")
        print("Get node detail")

    method_list = []
    cls_metadata = []
    for _cls in parser.get_class_list(root_node):
        cls_info = parser.get_class_metadata(_cls)
        cls_info["code"] = get_node_text(_cls)

        cls_method = []
        current_class_methods = parser.get_function_list(_cls)
        for method in current_class_methods:
            method_info = parser.get_function_metadata(method)
            method_info['code'] = get_node_text(method)
            cls_method.append(method_info)

        cls_info["method"] = cls_method
        cls_metadata.append(cls_info)
        method_list.extend(current_class_methods)

    # Functions found from the root also include class methods; keep the rest only.
    fn_list: List = [
        node for node in parser.get_function_list(root_node)
        if node not in method_list
    ]
    fn_metadata = [parser.get_function_metadata(fn) for fn in fn_list]

    return {"class": cls_metadata, "function": fn_metadata}
78 |
79 |
def print_result(res: Dict, file_name: str = "no_name_file"):
    """
    Pretty-print the result of `parse_file` as tables.

    Prints a summary, one table listing the classes and their arguments, one method
    table per class, and one table for the stand-alone functions.

    Args:
        res (Dict): `{"class": [...], "function": [...]}` as returned by `parse_file`;
            every class/method/function entry is read through its `identifier` and
            `parameters` keys (plus `method`, `return_type` and optional `throws`)
        file_name (str): name shown in the heading
    """
    # ======== Print file name ========
    print("File {name} analyzed:".format(name=file_name))
    print(50 * "=")

    # ========= Summary =========
    print("Number of class : {length}".format(length=len(res["class"])))
    print("Number of function : {length}".format(length=len(res["function"])))
    print(50 * "-" + "\n")

    # ========= Print class & method =========
    cls_headers = ["#", "Class", "Arguments"]
    cls_method_headers = ["#", "Method name", "Paramters",
                          "Type", "Return type", "Throws"]
    cls_info = []
    # One (class name, rows) pair per class. The tables used to be stored in a dict keyed
    # by `file_name`, so every class overwrote the previous one and only the last class
    # was printed.
    method_tables = []
    for cls_idx, _cls in enumerate(res["class"]):
        cls_arguments = list(_cls["parameters"].keys())
        # One row per argument, and at least one row per class.
        for i in range(max(1, len(cls_arguments))):
            clslist = [""] * len(cls_headers)
            if i < 1:
                clslist[0] = cls_idx
                clslist[1] = _cls["identifier"]
            if cls_arguments:
                clslist[2] = cls_arguments[i]
            cls_info.append(clslist)

        _method_info = []
        for idx, method in enumerate(_cls["method"]):
            param_names = list(method["parameters"].keys())
            param_types = list(method["parameters"].values())
            for i in range(max(1, len(param_names))):
                sublist = [""] * len(cls_method_headers)
                if i < 1:
                    sublist[0] = idx
                    sublist[1] = method["identifier"]
                if param_names:
                    sublist[2] = param_names[i]
                    sublist[3] = param_types[i]
                # NOTE(review): `i <= 1` repeats the return type / throws on the second
                # parameter row as well, while the name columns use `i < 1` -- confirm.
                sublist[4] = (
                    method["return_type"]
                    if i <= 1 and method["return_type"] != ""
                    else ""
                )
                sublist[5] = (
                    method["throws"]
                    if i <= 1 and "throws" in method.keys()
                    else ""
                )
                _method_info.append(sublist)

        method_tables.append((_cls["identifier"], _method_info))

    if cls_info:
        print("Class summary:")
        print(tabulate(cls_info, headers=cls_headers, tablefmt="outline"))
        print("\n")

        for name, rows in method_tables:
            print("Class analyse: {name}".format(name=name))
            print(tabulate(rows, headers=cls_method_headers, tablefmt="outline"))
            print("\n")

    # ========= Print stand alone function =========
    fn_headers = ["#", "Function name", "Paramters", "Type", "Return type"]
    function_info = []

    for idx, fn in enumerate(res["function"]):
        param_names = list(fn["parameters"].keys())
        param_types = list(fn["parameters"].values())
        for i in range(max(1, len(param_names))):
            sublist = [""] * len(fn_headers)
            if i < 1:
                sublist[0] = idx
                sublist[1] = fn["identifier"]
            if param_names:
                sublist[2] = param_names[i]
                sublist[3] = param_types[i]
            sublist[4] = (
                fn["return_type"]
                if i <= 1 and fn["return_type"] != ""
                else ""
            )
            function_info.append(sublist)

    if function_info:
        print("Function analyse:")
        print(tabulate(function_info, headers=fn_headers, tablefmt="outline"))
        print("\n")

    elif not method_tables:
        # Neither functions nor classes were found.
        print("File empty")
        print("\n")
169 |
170 |
# File extensions recognised for each programming language, keyed by language name.
PL_MATCHING = {
    "Java": [".java"],
    "JavaScript": [
        ".js", "._js", ".bones", ".es6", ".jake", ".jsb", ".jscad", ".jsfl",
        ".jsm", ".jss", ".njs", ".pac", ".sjs", ".ssjs", ".xsjs", ".xsjslib",
    ],
    "Python": [
        ".py", ".bzl", ".gyp", ".lmi", ".pyde", ".pyp",
        ".pyt", ".pyw", ".tac", ".wsgi", ".xpy",
    ],
    "PHP": [".php", ".aw", ".ctp", ".php3", ".php4", ".php5", ".phps", ".phpt"],
    "Go": [".go"],
    "Rust": [".rs", ".rs.in"],
    "Ruby": [
        ".rb", ".builder", ".gemspec", ".god", ".irbrc", ".jbuilder",
        ".mspec", ".podspec", ".rabl", ".rake", ".rbuild", ".rbw",
        ".rbx", ".ru", ".ruby", ".thor", ".watchr",
    ],
    "C": [".c", ".cats", ".h", ".idc", ".w"],
    "C#": [".cs", ".cake", ".cshtml", ".csx"],
    "C++": [
        ".cpp", ".c++", ".cc", ".cp", ".cxx", ".h++", ".hh", ".hpp",
        ".hxx", ".inl", ".ipp", ".tcc", ".tpp", ".C", ".H",
    ],
}
246 |
--------------------------------------------------------------------------------
/src/codetext/parser/README.md:
--------------------------------------------------------------------------------
1 | # Parser Appendix
2 |
With the `codetext` parser, we support extracting several function types; however, because extraction relies on the `tree-sitter` grammars, some function types or some languages might not be fully supported.
4 |
This is the list of currently supported function types:
6 |
7 |
--------------------------------------------------------------------------------
/src/codetext/parser/__init__.py:
--------------------------------------------------------------------------------
1 | """Codetext parser
2 | Parse code to get docstring node, comment node
3 | """
4 | from .go_parser import GoParser
5 | from .php_parser import PhpParser
6 | from .ruby_parser import RubyParser
7 | from .java_parser import JavaParser
8 | from .javascript_parser import JavascriptParser
9 | from .python_parser import PythonParser
10 | from .cpp_parser import CppParser
11 | from .c_sharp_parser import CsharpParser
12 | from .rust_parser import RustParser
13 | from .language_parser import LanguageParser, get_node_by_kind, get_node_text, \
14 | tokenize_code, tokenize_docstring, nodes_are_equal
15 |
# Language identifiers the parser package declares as supported.
# NOTE(review): `parse_file` in codetext_cli.py dispatches on "c++" and "c#", while this
# list spells them "cpp" and "c_sharp" -- confirm which spelling callers should use.
SUPPORT_LANGUAGE = [
    "go", "php", "ruby", "java", "javascript",
    "python", "cpp", "c", "c_sharp", "rust"
]

# Public names exported by `from codetext.parser import *`: the per-language parsers,
# the base `LanguageParser` and the node/tokenizing helpers.
__all__ = [
    'GoParser', 'PhpParser', 'RubyParser', 'JavaParser', 'JavascriptParser',
    'PythonParser', 'CppParser', 'CsharpParser', 'RustParser', 'LanguageParser',
    'get_node_by_kind', 'get_node_text', 'tokenize_code', 'tokenize_docstring',
    'nodes_are_equal'
]
27 |
--------------------------------------------------------------------------------
/src/codetext/parser/c_sharp_parser.py:
--------------------------------------------------------------------------------
1 | from typing import List, Dict, Any
2 | import tree_sitter
3 | import logging
4 |
5 | from .language_parser import LanguageParser, get_node_by_kind, get_node_text
6 |
7 | logger = logging.getLogger(name=__name__)
8 |
9 |
10 | class CsharpParser(LanguageParser):
11 |
12 | BLACKLISTED_FUNCTION_NAMES = []
13 |
@staticmethod
def get_docstring(node, blob=None):
    """
    Return the docstring text attached to a node.

    The comment nodes found by `get_docstring_node` are converted to text and joined
    with newlines.

    Args:
        node (tree_sitter.Node): node to get the docstring of
        blob (str): original source code which parse the `node`; it is not used to
            build the result and only triggers an informational log message
    Returns:
        str: docstring (empty string when no comment precedes the node)
    """
    if blob:
        logger.info('From version `0.0.6` this function will update argument in the API')

    comment_nodes = CsharpParser.get_docstring_node(node)
    return '\n'.join(map(get_node_text, comment_nodes))
30 |
@staticmethod
def get_docstring_node(node):
    """
    Get the docstring nodes of a node.
    A C# docstring is written line by line and sits outside (right before) the node it
    documents, see the example below.

    The sibling immediately before `node` is taken if it is a comment. Earlier comment
    siblings are then added as long as each starts at most one row above the sibling
    that follows it (i.e. the comment block is dense).

    Args:
        node (tree_sitter.Node): parent node (usually function node) to get its docstring
    Return:
        List: list of docstring (comment) nodes in source order; empty if there is none
    Example:
        str = '''
        //
        // Docstring of a method
        //
        // Argument.
        //
        // None.
        public void honk(string animal_honk)
        {
            Console.WriteLine(animal_honk);
            Console.WriteLine("Tuut, tuut!");
        }
        '''
        ...
        print(C_sharp.get_docstring_node(function_node))

        >>> [<comment node>, <comment node>, ...]  # one node per `//` line
    """
    closest = node.prev_sibling
    if not closest or closest.type != 'comment':
        return []

    collected = [closest]
    candidate = closest.prev_sibling
    while candidate and candidate.type == 'comment':
        # Assume the comment is dense: stop at the first gap of more than one row.
        row_gap = candidate.next_sibling.start_point[0] - candidate.start_point[0]
        if row_gap > 1:
            break

        collected.insert(0, candidate)
        candidate = candidate.prev_sibling

    return collected
83 |
84 | @staticmethod
85 | def get_comment_node(node):
86 | """
87 | Return all comment node inside a parent node
88 | Args:
89 | node (tree_sitter.Node)
90 | Return:
91 | List: list of comment nodes
92 | """
93 | comment_node = get_node_by_kind(node, kind=['comment'])
94 | return comment_node
95 |
96 | @staticmethod
97 | def get_function_list(node):
98 | res = get_node_by_kind(node, ['local_function_statement', 'method_declaration'])
99 | # We don't use "constructor_declaration"
100 | return res
101 |
102 | @staticmethod
103 | def get_class_list(node):
104 | res = get_node_by_kind(node, ['class_declaration'])
105 | return res
106 |
107 | @staticmethod
108 | def get_function_metadata(function_node, blob: str = None) -> Dict[str, Any]:
109 | """
110 | Function metadata contains:
111 | - identifier (str): function name
112 | - parameters (Dict[str, str]): parameter's name and their type (e.g: {'param_a': 'int'})
113 | - type (str): type
114 | """
115 | metadata = {
116 | 'identifier': '',
117 | 'parameters': {},
118 | 'return_type': None
119 | }
120 | assert type(function_node) == tree_sitter.Node
121 |
122 | for child in function_node.children:
123 | if child.type in ['predefined_type', 'generic_name']:
124 | metadata['return_type'] = get_node_text(child)
125 | elif child.type == 'identifier':
126 | if child.next_named_sibling.type != 'parameter_list':
127 | metadata['return_type'] = get_node_text(child)
128 | else:
129 | metadata['identifier'] = get_node_text(child)
130 | elif child.type == 'parameter_list':
131 | for param_node in child.children:
132 | param_nodes = get_node_by_kind(param_node, ['parameter'])
133 | for param in param_nodes:
134 | if len(param.children) > 1:
135 | param_type = get_node_text(param.children[0])
136 | param_name = get_node_text(param.children[1])
137 | metadata['parameters'][param_name] = param_type
138 |
139 | else:
140 | param_name = get_node_text(param.children[0])
141 | metadata['parameters'][param_name] = None
142 | # for node in param.children:
143 | # if node.type in ['array_type', 'implicit_type', \
144 | # 'nullable_type', 'pointer_type', 'function_pointer_type', \
145 | # 'predefined_type', 'tuple_type']:
146 | # param_type = get_node_text(node)
147 | # elif node.type == 'identifier':
148 | # param_identifier = get_node_text(node)
149 |
150 | # param_type = get_node_text(param.child_by_field_name('type'))
151 | # param_identifier = get_node_text(param.child_by_field_name('name'))
152 | return metadata
153 |
154 | @staticmethod
155 | def get_class_metadata(class_node, blob: str=None) -> Dict[str, str]:
156 | """
157 | Class metadata contains:
158 | - identifier (str): class's name
159 | - parameters (List[str]): inheritance class
160 | """
161 | if blob:
162 | logger.info('From version `0.0.6` this function will update argument in the API')
163 | metadata = {
164 | 'identifier': '',
165 | 'parameters': {},
166 | }
167 | assert type(class_node) == tree_sitter.Node
168 |
169 | for child in class_node.children:
170 | if child.type == 'identifier':
171 | metadata['identifier'] = get_node_text(child)
172 | elif child.type == 'base_list':
173 | for arg in child.children:
174 | if arg.type == 'identifier':
175 | metadata['parameters'][get_node_text(arg)] = None
176 | # argument_list.append(get_node_text(arg))
177 | # metadata['parameters'] = argument_list
178 |
179 | return metadata
180 |
181 |
--------------------------------------------------------------------------------
/src/codetext/parser/cpp_parser.py:
--------------------------------------------------------------------------------
1 | from typing import List, Dict, Any
2 |
3 | import tree_sitter
4 | import logging
5 |
6 | from .language_parser import LanguageParser, get_node_text, get_node_by_kind
7 |
8 | logger = logging.getLogger(name=__name__)
9 |
10 |
class CppParser(LanguageParser):
    """C/C++ implementation of `LanguageParser` working on tree-sitter nodes."""

    BLACKLISTED_FUNCTION_NAMES = ['main', 'constructor']

    @staticmethod
    def get_docstring(node, blob=None):
        """
        Get docstring description for node

        Args:
            node (tree_sitter.Node)
            blob (str): not used for extraction; when given, only an
                informational log message is emitted
        Returns:
            str: text of the docstring nodes joined with newlines
                (empty string when no docstring node is found)
        """
        if blob:
            logger.info('From version `0.0.6` this function will update argument in the API')
        docstring_node = CppParser.get_docstring_node(node)
        docstring = '\n'.join(get_node_text(s) for s in docstring_node)
        return docstring

    @staticmethod
    def get_docstring_node(node):
        """
        Get docstring nodes from their parent node.
        C and C++ share the same syntax. Their docstring usually is one single
        block comment, so the returned list is expected to have length 1.

        Args:
            node (tree_sitter.Node): parent node (usually function node) to get its docstring
        Return:
            List: list of docstring nodes (expect==1)
        Example:
            str = '''
            /**
            * Find 2 sum
            *
            * @param nums List number.
            * @param target Sum target.
            * @return position of 2 number.
            */
            vector twoSum(vector& nums, int target) {
                ...
            }
            '''
            ...
            CppParser.get_docstring_node(function_node)
            # -> list holding the single block `comment` node
        """
        docstring_node = []

        prev_node = node.prev_sibling
        if prev_node and prev_node.type == 'comment':
            docstring_node.append(prev_node)
            prev_node = prev_node.prev_sibling

        while prev_node and prev_node.type == 'comment':
            # Assume the comment is dense: stop as soon as two consecutive
            # comment nodes start more than one line apart.
            x_current = prev_node.start_point[0]
            x_next = prev_node.next_sibling.start_point[0]
            if x_next - x_current > 1:
                break

            docstring_node.insert(0, prev_node)
            prev_node = prev_node.prev_sibling

        return docstring_node

    @staticmethod
    def get_function_list(node):
        """Return every `function_definition` node under `node`."""
        res = get_node_by_kind(node, ['function_definition'])
        return res

    @staticmethod
    def get_class_list(node):
        """Return every `class_specifier` node under `node`."""
        res = get_node_by_kind(node, ['class_specifier'])
        return res

    @staticmethod
    def get_comment_node(node):
        """
        Return all comment nodes inside a parent node
        Args:
            node (tree_sitter.Node)
        Return:
            List: list of comment nodes
        """
        comment_node = get_node_by_kind(node, kind=['comment'])
        return comment_node

    @staticmethod
    def _find_function_declarator(node):
        """Return `node` itself, or the `function_declarator` nested inside pointer/reference declarators; None if absent."""
        wrappers = ('pointer_declarator', 'reference_declarator')
        current = node
        while current is not None and current.type in wrappers:
            # descend through `*` / `&` wrappers (e.g. `int **f()`, `int& f()`)
            current = next(
                (c for c in current.children
                 if c.type == 'function_declarator' or c.type in wrappers),
                None,
            )
        if current is not None and current.type == 'function_declarator':
            return current
        return None

    @staticmethod
    def get_function_metadata(function_node, blob: str=None) -> Dict[str, Any]:
        """
        Function metadata contains:
            - identifier (str): function name
            - parameters (Dict[str, str]): parameter's name and their type (e.g: {'param_a': 'int'})
            - return_type (str or NoneType): function's return type

        Args:
            function_node (tree_sitter.Node): function definition node
            blob (str): not used; when given, only an informational log is emitted
        """
        if blob:
            logger.info('From version `0.0.6` this function will update argument in the API')
        metadata = {
            'identifier': '',
            'parameters': {},
            'return_type': None,
        }
        assert type(function_node) == tree_sitter.Node

        for child in function_node.children:
            if child.type in ['primitive_type', 'type_identifier']:
                metadata['return_type'] = get_node_text(child)
                continue

            declarator = CppParser._find_function_declarator(child)
            if declarator is None:
                continue

            for subchild in declarator.children:
                if subchild.type in ['qualified_identifier', 'identifier', 'field_identifier']:
                    metadata['identifier'] = get_node_text(subchild)
                elif subchild.type == 'parameter_list':
                    param_nodes = get_node_by_kind(subchild, ['parameter_declaration'])
                    for param in param_nodes:
                        param_type = get_node_text(param.child_by_field_name('type'))
                        list_name = get_node_by_kind(param, ['identifier'])
                        if not list_name:
                            # unnamed parameter (e.g. `void f(int)`)
                            continue
                        param_name = get_node_text(list_name[0])
                        metadata['parameters'][param_name] = param_type

        return metadata

    @staticmethod
    def get_class_metadata(class_node, blob: str=None) -> Dict[str, str]:
        """
        Class metadata contains:
            - identifier (str): class's name
            - parameters (Dict[str, None]): `type_identifier` names found in the
              `base_class_clause` (inherited classes), each mapped to None

        Args:
            class_node (tree_sitter.Node): class node
            blob (str): not used; when given, only an informational log is emitted
        """
        if blob:
            logger.info('From version `0.0.6` this function will update argument in the API')
        metadata = {
            'identifier': '',
            'parameters': {},
        }
        assert type(class_node) == tree_sitter.Node

        for child in class_node.children:
            if child.type == 'type_identifier':
                metadata['identifier'] = get_node_text(child)
            elif child.type == 'base_class_clause':
                for param in child.children:
                    if param.type == 'type_identifier':
                        metadata['parameters'][get_node_text(param)] = None

        return metadata
177 |
--------------------------------------------------------------------------------
/src/codetext/parser/go_parser.py:
--------------------------------------------------------------------------------
1 | from typing import List, Dict, Any
2 | import logging
3 |
4 | from .language_parser import LanguageParser, get_node_by_kind, get_node_text
5 |
6 |
7 | logger = logging.getLogger(__name__)
8 |
9 |
class GoParser(LanguageParser):
    """Go implementation of `LanguageParser` working on tree-sitter nodes."""

    BLACKLISTED_FUNCTION_NAMES = ['test', 'vendor']

    @staticmethod
    def get_comment_node(function_node):
        """
        Return all comment nodes inside a parent node
        Args:
            function_node (tree_sitter.Node)
        Return:
            List: list of comment nodes
        """
        # Pass a list: a bare string would be matched by substring containment.
        comment_node = get_node_by_kind(function_node, kind=['comment'])
        return comment_node

    @staticmethod
    def get_docstring_node(node):
        """
        Get docstring nodes from their parent node.
        Go's docstring is written line by line

        Args:
            node (tree_sitter.Node): parent node (usually function node) to get its docstring
        Return:
            List: list of docstring (comment) nodes in source order
        Example:
            str = '''
            // The path package should only be used for paths separated by forward
            // slashes, such as the paths in URLs. This package does not deal with
            // Windows paths with drive letters or backslashes; to manipulate
            // operating system paths, use the [path/filepath] package.
            func (e TypeError) Error() string {
                ...
            }
            '''
            ...
            GoParser.get_docstring_node(function_node)
            # -> list of four `comment` nodes, one per `//` line
        """
        docstring_node = []

        prev_node = node.prev_sibling
        if prev_node and prev_node.type == 'comment':
            docstring_node.append(prev_node)
            prev_node = prev_node.prev_sibling

        while prev_node and prev_node.type == 'comment':
            # Assume the comment is dense: stop as soon as two consecutive
            # comment nodes start more than one line apart.
            x_current = prev_node.start_point[0]
            x_next = prev_node.next_sibling.start_point[0]
            if x_next - x_current > 1:
                break

            docstring_node.insert(0, prev_node)
            prev_node = prev_node.prev_sibling

        return docstring_node

    @staticmethod
    def get_docstring(node, blob:str=None):
        """
        Get docstring description for node

        Args:
            node (tree_sitter.Node)
            blob (str): not used for extraction; when given, only an
                informational log message is emitted
        Returns:
            str: text of the docstring nodes joined with newlines
                (empty string when no docstring node is found)
        """
        if blob:
            logger.info('From version `0.0.6` this function will update argument in the API')
        docstring_node = GoParser.get_docstring_node(node)
        docstring = '\n'.join(get_node_text(s) for s in docstring_node)
        return docstring

    @staticmethod
    def get_function_list(node):
        """Return every `method_declaration` / `function_declaration` node under `node`."""
        res = get_node_by_kind(node, ['method_declaration', 'function_declaration'])
        return res

    @staticmethod
    def get_function_metadata(function_node, blob: str=None) -> Dict[str, str]:
        """
        Function metadata contains:
            - identifier (str): function name
            - parameters (Dict[str, str]): parameter's name and their type
            - return_type (str or NoneType): text of a `type_identifier` child, if any

        Every `parameter_list` child of the node is scanned. Parameters that
        share one type (`a, b int`) are all recorded; unnamed parameters are
        skipped.

        Args:
            function_node (tree_sitter.Node): function/method node
            blob (str): not used; when given, only an informational log is emitted
        """
        if blob:
            logger.info('From version `0.0.6` this function will update argument in the API')
        metadata = {
            'identifier': '',
            'parameters': {},
            'return_type': None,
        }
        declaration_kinds = ['parameter_declaration', 'variadic_parameter_declaration']

        for child in function_node.children:
            if child.type in ['field_identifier', 'identifier']:
                metadata['identifier'] = get_node_text(child)
            elif child.type == 'type_identifier':
                metadata['return_type'] = get_node_text(child)
            elif child.type == 'parameter_list':
                for subchild in child.children:
                    if subchild.type not in declaration_kinds:
                        continue

                    # Direct `identifier` children are the declared names;
                    # one declaration may carry several (`a, b int`).
                    name_nodes = [c for c in subchild.children if c.type == 'identifier']
                    if not name_nodes:
                        continue

                    param_type = get_node_text(subchild.child_by_field_name('type'))
                    for name_node in name_nodes:
                        identifier = get_node_text(name_node)
                        if identifier and param_type:
                            metadata['parameters'][identifier] = param_type

        return metadata

    @staticmethod
    def get_class_list(node):
        """Not implemented for Go; always returns None."""
        pass

    @staticmethod
    def get_class_metadata(class_node, blob=None) -> Dict[str, str]:
        """Not implemented for Go; logs when `blob` is given and returns None."""
        if blob:
            logger.info('From version `0.0.6` this function will update argument in the API')
        pass
134 |
--------------------------------------------------------------------------------
/src/codetext/parser/java_parser.py:
--------------------------------------------------------------------------------
1 | import re
2 | from typing import List, Dict, Any
3 | import logging
4 |
5 | from .language_parser import LanguageParser, get_node_by_kind, get_node_text
6 |
7 |
8 | logger = logging.getLogger(__name__)
9 |
10 |
class JavaParser(LanguageParser):
    """Java implementation of `LanguageParser` working on tree-sitter nodes."""

    FILTER_PATHS = ('test', 'tests')

    BLACKLISTED_FUNCTION_NAMES = ['toString', 'hashCode', 'equals', 'finalize', 'notify', 'notifyAll', 'clone']

    @staticmethod
    def get_docstring_node(node):
        """
        Get docstring node from its parent node. Expect return list to have length==1

        Args:
            node (tree_sitter.Node): parent node (usually function node) to get its docstring
        Return:
            List: list holding the previous sibling when it is a
                `block_comment` or `line_comment`, otherwise an empty list
        """
        docstring_node = []

        if node.prev_sibling:
            prev_node = node.prev_sibling
            if prev_node.type == 'block_comment' or prev_node.type == 'line_comment':
                docstring_node.append(prev_node)

        return docstring_node

    @staticmethod
    def get_docstring(node, blob=None):
        """
        Get docstring description for node

        Args:
            node (tree_sitter.Node)
            blob (str): not used for extraction; when given, only an
                informational log message is emitted
        Returns:
            str: text of the docstring node ('' when there is none)
        """
        if blob:
            logger.info('From version `0.0.6` this function will update argument in the API')
        docstring_node = JavaParser.get_docstring_node(node)

        docstring = ''
        if docstring_node:
            docstring = get_node_text(docstring_node[0])
        return docstring

    @staticmethod
    def get_comment_node(function_node):
        """
        Return the `line_comment` nodes inside a parent node
        Args:
            function_node (tree_sitter.Node)
        Return:
            List: list of comment nodes
        """
        comment_node = get_node_by_kind(function_node, kind=['line_comment'])
        return comment_node

    @staticmethod
    def get_class_list(node):
        """Return every `class_declaration` node under `node`."""
        res = get_node_by_kind(node, ['class_declaration'])
        return res

    @staticmethod
    def get_function_list(node):
        """Return every `method_declaration` node under `node`."""
        res = get_node_by_kind(node, ['method_declaration'])
        return res

    @staticmethod
    def is_method_body_empty(node):
        """
        Tell whether a `method_body` / `constructor_body` child of `node`
        starts and ends on the same line.

        Returns:
            bool: True when such a single-line body exists, False otherwise
        """
        for c in node.children:
            if c.type in {'method_body', 'constructor_body'}:
                if c.start_point[0] == c.end_point[0]:
                    return True
        # explicit falsy result instead of an implicit None
        return False

    @staticmethod
    def get_class_metadata(class_node, blob: str=None) -> Dict[str, str]:
        """
        Class metadata contains:
            - identifier (str): class's name
            - parameters (Dict[str, None]): text of the `type_list` /
              `type_identifier` children of `superclass` and
              `super_interfaces`, each mapped to None

        Args:
            class_node (tree_sitter.Node): class node
            blob (str): not used; when given, only an informational log is emitted
        """
        if blob:
            logger.info('From version `0.0.6` this function will update argument in the API')
        metadata = {
            'identifier': '',
            'parameters': {},
        }
        for child in class_node.children:
            if child.type == 'identifier':
                metadata['identifier'] = get_node_text(child)
            elif child.type == 'superclass' or child.type == 'super_interfaces':
                for subchild in child.children:
                    if subchild.type == 'type_list' or subchild.type == 'type_identifier':
                        metadata['parameters'][get_node_text(subchild)] = None

        return metadata

    @staticmethod
    def get_function_metadata(function_node, blob: str = None) -> Dict[str, str]:
        """
        Function metadata contains:
            - identifier (str): method name
            - parameters (Dict[str, str]): parameter's name and their type
            - return_type (str or NoneType): method's return type
            - throws (str): only present when the method has a `throws`
              clause; holds the last identifier-like child of that clause

        Args:
            function_node (tree_sitter.Node): method node
            blob (str): not used; when given, only an informational log is emitted
        """
        if blob:
            logger.info('From version `0.0.6` this function will update argument in the API')
        metadata = {
            'identifier': '',
            'parameters': {},
            'return_type': None
        }

        return_kinds = ["void_type",
                        "integral_type",
                        "floating_point_type",
                        "boolean_type",
                        "type_identifier",
                        "scoped_type_identifier",
                        "generic_type",
                        "array_type"]  # e.g. `int[] values()`

        for child in function_node.children:
            if child.type == 'identifier':
                metadata['identifier'] = get_node_text(child)
            elif child.type in return_kinds:
                metadata['return_type'] = get_node_text(child)
            elif child.type == 'throws':
                for subchild in child.children:
                    if 'identifier' in subchild.type:
                        metadata['throws'] = get_node_text(subchild)
            elif child.type == 'formal_parameters':
                param_list = get_node_by_kind(child, ['formal_parameter'])
                for param in param_list:
                    param_type = get_node_text(param.child_by_field_name('type'))
                    identifier = get_node_text(param.child_by_field_name('name'))
                    metadata['parameters'][identifier] = param_type

        return metadata
--------------------------------------------------------------------------------
/src/codetext/parser/javascript_parser.py:
--------------------------------------------------------------------------------
1 | from typing import List, Dict, Any
2 | import logging
3 |
4 | from .language_parser import LanguageParser, get_node_text, get_node_by_kind
5 |
6 |
7 | logger = logging.getLogger(__name__)
8 |
9 |
class JavascriptParser(LanguageParser):
    """JavaScript implementation of `LanguageParser` working on tree-sitter nodes."""

    FILTER_PATHS = ('test', 'node_modules')

    BLACKLISTED_FUNCTION_NAMES = ['toString', 'toLocaleString', 'valueOf', 'constructor']

    @staticmethod
    def get_docstring_node(node):
        """
        Get the docstring node of `node`.

        The comment right before `node` is used when present. Otherwise, for a
        node whose parent is not a `class_body`, the comment right before the
        parent is used.

        Args:
            node (tree_sitter.Node): function or class node
        Return:
            List: list with at most one `comment` node
        """
        docstring_node = []
        prev_node = node.prev_sibling
        parent_node = node.parent

        if prev_node and prev_node.type == 'comment':
            docstring_node.append(prev_node)

        elif parent_node:
            if parent_node.type != 'class_body':  # node not inside a class
                prev_node = parent_node.prev_sibling
                if prev_node and prev_node.type == 'comment':
                    docstring_node.append(prev_node)

        return docstring_node

    @staticmethod
    def get_docstring(node, blob=None):
        """
        Get docstring description for node

        Args:
            node (tree_sitter.Node)
            blob (str): not used for extraction; when given, only an
                informational log message is emitted
        Returns:
            str: text of the docstring node ('' when there is none)
        """
        if blob:
            logger.info('From version `0.0.6` this function will update argument in the API')
        docstring_node = JavascriptParser.get_docstring_node(node)

        docstring = ''
        if docstring_node:
            docstring = get_node_text(docstring_node[0])
        return docstring

    @staticmethod
    def get_comment_node(function_node):
        """Return every `comment` node under `function_node`."""
        comment_node = get_node_by_kind(function_node, kind=['comment'])
        return comment_node

    @staticmethod
    def get_function_list(node):
        """Return the function-like nodes under `node`, dropping nodes that have no children."""
        function_types = ['function_declaration',
                          'function',
                          'method_definition',
                          'generator_function_declaration',
                          'arrow_function',
                          'generator_function']
        # Childless matches are bare keyword tokens (e.g. `function`), not definitions.
        return [found for found in get_node_by_kind(node, function_types) if found.children]

    @staticmethod
    def get_class_list(node):
        """Return the class nodes under `node`, dropping nodes that have no children."""
        # Childless matches are bare keyword tokens (e.g. `class`), not definitions.
        return [found for found in get_node_by_kind(node, ['class_declaration', 'class'])
                if found.children]

    @staticmethod
    def get_function_metadata(function_node, blob: str=None) -> Dict[str, str]:
        """
        Function metadata contains:
            - identifier (str): function name
            - parameters (Dict[str, None]): parameter names mapped to None
              (JavaScript has no type declarations)
            - return_type (str or NoneType): '' when the function contains a
              `return_statement`, otherwise None

        Args:
            function_node (tree_sitter.Node): function node
            blob (str): not used; when given, only an informational log is emitted
        """
        if blob:
            logger.info('From version `0.0.6` this function will update argument in the API')
        metadata = {
            'identifier': '',
            'parameters': {},
            'return_type': None,
        }
        for child in function_node.children:
            if child.type in ['identifier', 'property_identifier']:
                metadata['identifier'] = get_node_text(child)
            elif child.type == 'formal_parameters':
                for param in get_node_by_kind(child, ['identifier']):
                    metadata['parameters'][get_node_text(param)] = None  # JS not have type define

        return_statement = get_node_by_kind(function_node, ['return_statement'])
        if len(return_statement) > 0:
            metadata['return_type'] = ''

        if function_node.type in ["function",
                                  "arrow_function",
                                  "generator_function"]:
            # function inside object property or variable declarator:
            # the name is the identifier named sibling placed before it
            identifier = function_node.prev_named_sibling
            if identifier:
                if identifier.type in ["identifier"]:
                    metadata["identifier"] = get_node_text(identifier)

        return metadata

    @staticmethod
    def get_class_metadata(class_node, blob=None):
        """
        Class metadata contains:
            - identifier (str): class's name
            - parameters (Dict[str, None]): identifiers found in the
              `class_heritage` clause, each mapped to None

        Args:
            class_node (tree_sitter.Node): class node
            blob (str): not used; when given, only an informational log is emitted
        """
        if blob:
            logger.info('From version `0.0.6` this function will update argument in the API')
        metadata = {
            'identifier': '',
            'parameters': {},
        }
        for child in class_node.children:
            if child.type == 'identifier':
                metadata['identifier'] = get_node_text(child)
            elif child.type == 'class_heritage':
                for subchild in child.children:
                    if subchild.type == 'identifier':
                        metadata['parameters'][get_node_text(subchild)] = None

        return metadata
127 |
--------------------------------------------------------------------------------
/src/codetext/parser/language_parser.py:
--------------------------------------------------------------------------------
1 | import re
2 | from abc import ABC, abstractmethod
3 | from typing import List, Dict, Any, Set, Optional
4 |
5 | import tree_sitter
6 |
7 | import logging
8 |
# Matches a string wrapped in three identical quote characters (''' or """);
# group 2 is the text between the delimiters.
DOCSTRING_REGEX = re.compile(r"(['\"])\1\1(.*?)\1{3}", flags=re.DOTALL)
# Splits docstring text into word-like runs and runs of punctuation.
DOCSTRING_REGEX_TOKENIZER = re.compile(r"[^\s,'\"`.():\[\]=*;>{\}+-/\\]+|\\+|\.+|\(\)|{\}|\[\]|\(+|\)+|:+|\[+|\]+|{+|\}+|=+|\*+|;+|>+|\++|-+|/+|\'|\"|`")
# Module-scoped logger (like the sibling parser modules) rather than the root logger.
logger = logging.getLogger(__name__)
12 |
def remove_words_in_string(words, string):
    """
    Remove every occurrence of each word from a string.

    Args:
        words (Iterable[str]): substrings to strip out, applied in order
        string: input value; converted with `str()` before each replacement
    Returns:
        The cleaned string, or `string` itself untouched when `words` is empty
    """
    cleaned = string
    for unwanted in words:
        as_text = str(cleaned)
        cleaned = as_text.replace(unwanted, '')
    return cleaned
18 |
19 |
def tokenize_docstring(docstring: str) -> List[str]:
    """
    Split a docstring into tokens using `DOCSTRING_REGEX_TOKENIZER`.

    Args:
        docstring (str): text to tokenize (converted with `str()` first)
    Returns:
        List[str]: non-empty tokens in order of appearance
    """
    tokens = []
    for piece in DOCSTRING_REGEX_TOKENIZER.findall(str(docstring)):
        if piece is None or len(piece) == 0:
            continue
        tokens.append(piece)
    return tokens
22 |
23 |
def tokenize_code(node, blob: str, nodes_to_exclude: Optional[Set]=None) -> List:
    """
    Tokenize the code under `node`.

    Args:
        node: tree-sitter node whose tokens are collected with `traverse`
        blob (str): source text the node positions refer to
        nodes_to_exclude (Optional[Set]): nodes to leave out of the result
    Returns:
        List: text of each kept token, read from `blob` with `match_from_span`
    """
    collected = []
    traverse(node, collected)
    if nodes_to_exclude is None:
        kept = collected
    else:
        kept = [leaf for leaf in collected if leaf not in nodes_to_exclude]
    return [match_from_span(leaf, blob) for leaf in kept]
31 |
def nodes_are_equal(n1, n2):
    """Return True when both nodes have the same type, start point and end point."""
    if n1.type != n2.type:
        return False
    if n1.start_point != n2.start_point:
        return False
    return n1.end_point == n2.end_point
34 |
def parent_and_previous_sibling(tree, node):
    """
    Merge of the `node_parent` and `previous_sibling` functions.

    Args:
        tree: tree-sitter tree that contains `node`
        node: node to look up
    Returns:
        tuple: (parent, previous sibling); the sibling is None when `node` is
            the first child of its parent
    Raises:
        ValueError: when `node` cannot be found in `tree`
    """
    parent = node_parent(tree, node)
    for i, node_at_i in enumerate(parent.children):
        if nodes_are_equal(node, node_at_i):
            if i > 0:
                return parent, parent.children[i-1]
            return parent, None

    # Raise (rather than return) the error, consistently with `node_parent`.
    raise ValueError("Could not find node in tree.")
46 |
47 |
def previous_sibling(tree, node):
    """
    Search for the previous sibling of the node.
    TODO: C TreeSitter should support this natively, but not its Python bindings yet. Replace later.

    Args:
        tree: tree-sitter tree that contains `node`
        node: node whose previous sibling is wanted
    Returns:
        The sibling placed right before `node`, or None when `node` is the
        first child of its parent
    Raises:
        ValueError: when `node` cannot be found in `tree`
    """
    to_visit = [tree.root_node]
    while len(to_visit) > 0:
        next_node = to_visit.pop()
        for i, node_at_i in enumerate(next_node.children):
            if nodes_are_equal(node, node_at_i):
                if i > 0:
                    return next_node.children[i-1]
                return None
        # not among these children: keep searching deeper
        to_visit.extend(next_node.children)
    # Raise (rather than return) the error, consistently with `node_parent`.
    raise ValueError("Could not find node in tree.")
64 |
65 |
66 | # if parent_node.type == 'variable_declarator':
67 | # # node
68 | # base_node = node_parent(tree, parent_node) # Get the variable declaration
69 | # # parent
70 | # parent_node = node_parent(tree, base_node)
71 | # elif parent_node.type == 'pair':
72 | # base_node = parent_node # This is a common pattern where a function is assigned as a value to a dictionary.
73 | # parent_node = node_parent(tree, base_node)
74 | # else:
75 | # base_node = node
76 |
def traverse_type_parent(node, kind: List) -> List:
    """
    Collect every node below `node` whose type is in `kind`, together with its parent.

    Args:
        node: root node of the search (never reported itself)
        kind (List): node types to look for
    Returns:
        List: `[parent, child]` pairs, in the order the stack-based walk
            meets them
    """
    results = []
    to_visit = [node]
    while len(to_visit) > 0:
        next_node = to_visit.pop()
        for child in next_node.children:
            if child.type in kind:
                results.append([next_node, child])
        # queue the children once per visited node so the whole subtree is scanned
        to_visit.extend(next_node.children)

    return results
89 |
90 |
def node_parent(tree, node):
    """
    Find the parent of `node` by walking `tree` from its root.

    Args:
        tree: tree-sitter tree that contains `node`
        node: node whose parent is wanted
    Returns:
        The first visited node that has a child equal to `node`
        (according to `nodes_are_equal`)
    Raises:
        ValueError: when no such node exists in `tree`
    """
    pending = [tree.root_node]
    while pending:
        candidate = pending.pop()
        if any(nodes_are_equal(kid, node) for kid in candidate.children):
            return candidate
        pending.extend(candidate.children)
    raise ValueError("Could not find node in tree.")
101 |
102 |
def traverse(node, results: List) -> None:
    """
    Append the tokens found under `node` to `results`, depth first.

    A `string` node is appended whole, without visiting its children; any
    other node is appended only when it has no children.

    Args:
        node: node to walk
        results (List): output list, extended in place
    """
    if node.type != 'string':
        for child in node.children:
            traverse(child, results)
        if node.children:
            return
    results.append(node)
111 |
112 |
def traverse_type(node, results, kind: List) -> None:
    """
    Append `node` and every descendant whose type is in `kind` to `results`.

    Nodes are visited parent first, then children in order.

    Args:
        node: node to walk
        results (List): output list, extended in place
        kind (List): node types to collect
    """
    if node.type in kind:
        results.append(node)
    for child in node.children or ():
        traverse_type(child, results, kind)
121 |
122 |
def get_node_by_kind(root: tree_sitter.Node, kind: List[str]) -> List:
    """
    Get all nodes with specific type

    Args:
        root (tree_sitter.Node): Tree sitter root node
        kind (List[str]): (node's) types to collect; a single type may also
            be given as a plain `str`

    Return:
        List[tree_sitter.Node]: `root` and all of its descendants whose type
            is in `kind`, parents before children
    """
    assert type(root) == tree_sitter.Node, f"Expect `root` to be `tree_sitter.Node`, get {type(root)}"
    assert type(kind) in [list, str], f"Expect `kind` to be `list` of string or `str`, get {type(kind)}"
    if isinstance(kind, str):
        # Wrap a lone type name: `node.type in "comment"` would otherwise be a
        # substring test and match unrelated node types.
        kind = [kind]
    assert all(isinstance(s, str) for s in kind) == True, f"Expect search kind to be `str`"

    node_list = []
    traverse_type(root, node_list, kind=kind)
    return node_list
141 |
142 |
def get_node_text(root: tree_sitter.Node) -> str:
    """
    Return the source text covered by a tree-sitter node.
    Can be used in place of `match_from_span`.

    Args:
        root (tree_sitter.Node): node to read the text from

    Return:
        str: decoded `root.text`
    """
    assert type(root) == tree_sitter.Node, f"Expect `root` to be `tree_sitter.Node`, get {type(root)}"

    return root.text.decode()
157 |
158 |
def match_from_span(node, blob: str) -> str:
    """
    Cut the text covered by `node` out of `blob`.

    Args:
        node: object exposing `start_point` and `end_point` as (line, column)
        blob (str): full source text, lines separated by '\\n'
    Returns:
        str: text between the start and end points
    """
    source_lines = blob.split('\n')
    first_row, first_col = node.start_point[0], node.start_point[1]
    last_row, last_col = node.end_point[0], node.end_point[1]

    if first_row == last_row:
        return source_lines[first_row][first_col:last_col]

    head = source_lines[first_row][first_col:]
    body = source_lines[first_row + 1:last_row]
    tail = source_lines[last_row][:last_col]
    return '\n'.join([head] + body + [tail])
170 |
171 |
def match_from_spans(nodes, blob: str) -> tuple:
    """
    Get the text spanned by multiple nodes

    Args:
        nodes (List): non-empty list of `tree_sitter.Node`
        blob (str): Full source

    Return:
        tuple: `(text, start_node, end_node)` where `text` runs from the
            earliest start point to the latest end point among `nodes`, and
            the two nodes are the ones providing those bounds
    """
    assert len(nodes) != 0, "Empty node list"
    # The bounds are searched independently: a single node may both start
    # first and end last. Points compare as (line, column).
    start_point = min(nodes, key=lambda n: tuple(n.start_point))
    end_point = max(nodes, key=lambda n: tuple(n.end_point))

    line_start = start_point.start_point[0]
    char_start = start_point.start_point[1]
    line_end = end_point.end_point[0]
    char_end = end_point.end_point[1]

    lines = blob.split('\n')
    if line_start != line_end:
        string = '\n'.join([lines[line_start][char_start:]] + lines[line_start+1:line_end] + [lines[line_end][:char_end]])
    else:
        string = lines[line_start][char_start:char_end]

    return string, start_point, end_point
205 |
206 |
class LanguageParser(ABC):
    """Abstract interface every language-specific parser implements with static methods."""

    # Function names a language parser marks as blacklisted.
    BLACKLISTED_FUNCTION_NAMES = []

    @staticmethod
    @abstractmethod
    def get_function_list(node):
        """Return the function nodes found under `node`."""

    @staticmethod
    @abstractmethod
    def get_class_list(node):
        """Return the class nodes found under `node`."""

    @staticmethod
    @abstractmethod
    def get_docstring_node(node) -> List[tree_sitter.Node]:
        """Return the node(s) holding the docstring of `node`."""

    @staticmethod
    @abstractmethod
    def get_comment_node(node) -> List[tree_sitter.Node]:
        """Return the comment nodes found under `node`."""

    @staticmethod
    @abstractmethod
    def get_class_metadata(class_node, blob=None):
        """Return the metadata of a class node."""

    @staticmethod
    @abstractmethod
    def get_function_metadata(function_node, blob=None) -> Dict[str, str]:
        """Return the metadata of a function node."""
239 |
240 |
241 | # @staticmethod
242 | # @abstractmethod
243 | # def get_function_definitions(tree, blob) -> List:
244 | # pass
245 |
246 | # @staticmethod
247 | # @abstractmethod
248 | # def get_class_definitions(tree, blob) -> List:
249 | # pass
250 |
251 | # @staticmethod
252 | # @abstractmethod
253 | # def get_line_definitions(tree, blob) -> List:
254 | # pass
255 |
256 | # @staticmethod
257 | # @abstractmethod
258 | # def get_context(tree, blob):
259 | # raise NotImplementedError
260 |
261 | # @staticmethod
262 | # @abstractmethod
263 | # def get_calls(tree, blob):
264 | # raise NotImplementedError
--------------------------------------------------------------------------------
/src/codetext/parser/php_parser.py:
--------------------------------------------------------------------------------
1 | import re
2 | from typing import List, Dict, Any
3 | import tree_sitter
4 | import logging
5 |
6 | from .language_parser import LanguageParser, get_node_text, get_node_by_kind
7 |
8 |
9 | logger = logging.getLogger(__name__)
10 |
11 |
class PhpParser(LanguageParser):
    """Static helpers that pull functions, classes, doc comments and signature
    metadata out of a tree-sitter syntax tree for PHP source code.
    """

    # Path components associated with test code; not read inside this class.
    FILTER_PATHS = ('test', 'tests')

    # PHP magic-method names; not read inside this class
    # (presumably consulted by callers to skip such methods -- confirm).
    BLACKLISTED_FUNCTION_NAMES = ['__construct', '__destruct', '__call', '__callStatic',
                                  '__get', '__set', '__isset', '__unset',
                                  '__sleep', '__wakeup', '__toString', '__invoke',
                                  '__set_state', '__clone', '__debugInfo', '__serialize',
                                  '__unserialize']

    @staticmethod
    def get_docstring(node, blob: str=None) -> str:
        """Return the text of the comment that directly precedes ``node``.

        Args:
            node: tree-sitter node of a function/class declaration.
            blob: Unused; kept for backward compatibility. A notice is logged
                when it is supplied.

        Returns:
            The comment text, or ``''`` when the previous sibling is not a comment.
        """
        if blob:
            logger.info('From version `0.0.6` this function will update argument in the API')

        docstring_node = PhpParser.get_docstring_node(node)
        if docstring_node:
            return get_node_text(docstring_node[0])
        return ''

    @staticmethod
    def get_docstring_node(node):
        """Return ``[comment_node]`` when the previous sibling of ``node`` is a
        ``comment`` node, otherwise an empty list.
        """
        prev_node = node.prev_sibling
        if prev_node is not None and prev_node.type == 'comment':
            return [prev_node]
        return []

    @staticmethod
    def get_comment_node(function_node):
        """Return every ``comment`` node found under ``function_node``."""
        # Pass a list like every other call in this file, so the lookup does
        # not depend on how the helper treats a bare string.
        return get_node_by_kind(function_node, kind=['comment'])

    @staticmethod
    def get_class_list(node):
        """Return all class, trait and interface declaration nodes under ``node``."""
        return get_node_by_kind(node, ['class_declaration',
                                       'trait_declaration',
                                       'interface_declaration'])

    @staticmethod
    def get_function_list(node):
        """Return all function definition and method declaration nodes under ``node``."""
        return get_node_by_kind(node, ['function_definition', 'method_declaration'])

    @staticmethod
    def get_function_metadata(function_node, blob: str=None) -> Dict[str, str]:
        """Collect name, parameters and return type of a PHP function/method.

        Args:
            function_node: ``function_definition`` or ``method_declaration`` node.
            blob: Unused; kept for backward compatibility.

        Returns:
            Dict with keys:
              * ``identifier``  -- function name (``''`` if none found),
              * ``parameters``  -- mapping of parameter name to declared type
                text, or ``None`` when the parameter is untyped,
              * ``return_type`` -- declared return type text; ``''`` when no
                type is declared but a ``return`` statement exists; ``None``
                otherwise.
        """
        if blob:
            logger.info('From version `0.0.6` this function will update argument in the API')
        metadata = {
            'identifier': '',
            'parameters': {},
            'return_type': None,
        }

        # Prefer the grammar's `return_type` field: it also covers plain
        # primitive/named/optional types, which are not wrapped in a
        # `union_type` node by every grammar version.
        return_type_node = function_node.child_by_field_name('return_type')
        if return_type_node is not None:
            metadata['return_type'] = get_node_text(return_type_node)

        parameter_kinds = ['simple_parameter', 'variadic_parameter', 'property_promotion_parameter']
        for n in function_node.children:
            if n.type == 'name':
                metadata['identifier'] = get_node_text(n)
            elif n.type in ['union_type', 'intersection_type']:
                # Fallback for grammars that do not expose the field.
                if not metadata['return_type']:
                    metadata['return_type'] = get_node_text(n)
            elif n.type == 'formal_parameters':
                for param_node in n.children:
                    if param_node.type not in parameter_kinds:
                        continue
                    name_node = param_node.child_by_field_name('name')
                    if name_node is None:
                        # Malformed parameter (e.g. a parse error): nothing to record.
                        continue
                    identifier = get_node_text(name_node)
                    param_type = param_node.child_by_field_name('type')
                    if param_type:
                        metadata['parameters'][identifier] = get_node_text(param_type)
                    else:
                        metadata['parameters'][identifier] = None

        if not metadata['return_type']:
            # No declared type: '' signals "returns something of unknown type".
            return_statement = get_node_by_kind(function_node, ['return_statement'])
            metadata['return_type'] = '' if len(return_statement) > 0 else None

        return metadata

    @staticmethod
    def get_class_metadata(class_node, blob: str=None):
        """Collect name and parent classes of a PHP class-like declaration.

        Args:
            class_node: tree-sitter node of the declaration.
            blob: Unused; kept for backward compatibility.

        Returns:
            Dict with ``identifier`` (declared name) and ``parameters`` (mapping
            of every name in the ``base_clause`` to ``None``). Only the
            ``base_clause`` child is inspected.
        """
        if blob:
            logger.info('From version `0.0.6` this function will update argument in the API')
        metadata = {
            'identifier': '',
            'parameters': {},
        }
        assert isinstance(class_node, tree_sitter.Node)

        for child in class_node.children:
            if child.type == 'name':
                metadata['identifier'] = get_node_text(child)
            elif child.type == 'base_clause':
                for param in child.children:
                    if param.type == 'name':
                        metadata['parameters'][get_node_text(param)] = None

        return metadata
121 |
--------------------------------------------------------------------------------
/src/codetext/parser/python_parser.py:
--------------------------------------------------------------------------------
1 | import re
2 | from typing import List, Dict, Iterable, Optional, Iterator, Any
3 | import logging
4 |
5 | from .language_parser import LanguageParser, get_node_by_kind, get_node_text
6 |
7 |
8 | logger = logging.getLogger(__name__)
9 |
10 |
class PythonParser(LanguageParser):
    """Static helpers that extract functions, classes, docstrings and signature
    metadata from a tree-sitter syntax tree of Python source code.
    """

    # Names listed here are not read inside this class.
    BLACKLISTED_FUNCTION_NAMES = ['__init__', '__name__', '__main__']

    @staticmethod
    def get_docstring(node, blob:str=None):
        """Return the docstring text of ``node`` with surrounding quote and
        ``#`` characters stripped, or ``''`` when no docstring node is found.

        ``blob`` is unused; a notice is logged when it is supplied.
        """
        if blob:
            logger.info('From version `0.0.6` this function will update argument in the API')

        found = PythonParser.get_docstring_node(node)
        if found is None:
            return ''
        text = get_node_text(found[0])
        return text.strip('"').strip("'").strip("#")

    @staticmethod
    def get_function_list(node):
        """Return every ``function_definition`` node under ``node``."""
        return get_node_by_kind(node, ['function_definition'])

    @staticmethod
    def get_class_list(node):
        """Return every ``class_definition`` node under ``node``."""
        return get_node_by_kind(node, ['class_definition'])

    @staticmethod
    def get_docstring_node(node):
        """Locate the docstring of a definition node.

        Looks at the expression statements that are direct children of the
        definition's ``block`` and picks the first one whose first child is a
        ``string``.

        Returns:
            A one-element list holding that ``string`` node, or ``None`` when
            there is no such statement.
        """
        statements = [
            stmt
            for child in node.children if child.type == 'block'
            for stmt in child.children if stmt.type == 'expression_statement'
        ]
        string_statements = [stmt for stmt in statements if stmt.children[0].type == 'string']

        if not string_statements:
            return None
        # Only the first string statement counts as the docstring.
        return [string_statements[0].children[0]]

    @staticmethod
    def get_comment_node(node):
        """Return all ``comment`` nodes under ``node`` plus every expression
        statement whose first child is a ``string``.
        """
        candidates = get_node_by_kind(node, kind=['comment', 'expression_statement'])
        return [
            item for item in candidates
            if item.type != 'expression_statement' or item.children[0].type == 'string'
        ]

    @staticmethod
    def get_function_metadata(function_node, blob: str=None) -> Dict[str, str]:
        """Collect name, parameters and return annotation of a function node.

        Returns:
            Dict with ``identifier`` (function name), ``parameters`` (name ->
            annotation text or ``None``) and ``return_type`` (annotation text;
            ``''`` when unannotated but a ``return`` statement exists;
            ``None`` otherwise).
        """
        if blob:
            logger.info('From version `0.0.6` this function will update argument in the API')

        metadata = {
            'identifier': '',
            'parameters': {},
            'return_type': None,
        }
        annotated_kinds = ('typed_parameter', 'default_parameter', 'typed_default_parameter')

        for child in function_node.children:
            kind = child.type
            if kind == 'identifier':
                metadata['identifier'] = get_node_text(child)
            elif kind == 'type':
                metadata['return_type'] = get_node_text(child)
            elif kind == 'parameters':
                for param in child.children:
                    if param.type == 'identifier':
                        # Bare parameter: no annotation, no default.
                        metadata['parameters'][get_node_text(param)] = None
                        continue
                    if param.type not in annotated_kinds:
                        continue

                    type_nodes = get_node_by_kind(param, ['type'])
                    param_type = get_node_text(type_nodes[0]) if type_nodes else None

                    name_nodes = get_node_by_kind(param, ['identifier'])
                    assert len(name_nodes) != 0, "Empty identifier"
                    metadata['parameters'][get_node_text(name_nodes[0])] = param_type

        if not metadata['return_type']:
            returns = get_node_by_kind(function_node, ['return_statement'])
            metadata['return_type'] = '' if len(returns) > 0 else None

        return metadata

    @staticmethod
    def get_class_metadata(class_node, blob: str=None) -> Dict[str, str]:
        """Collect the name and the base-class arguments of a class node.

        Returns:
            Dict with ``identifier`` (class name) and ``parameters`` (mapping
            of the first word of each comma-separated argument to ``None``).
        """
        if blob:
            logger.info('From version `0.0.6` this function will update argument in the API')

        metadata = {
            'identifier': '',
            'parameters': {},
        }
        for child in class_node.children:
            if child.type == 'identifier':
                metadata['identifier'] = get_node_text(child)
                continue
            if child.type != 'argument_list':
                continue
            for raw_argument in get_node_text(child).split(','):
                words = re.sub(r'[^a-zA-Z0-9\_]', ' ', raw_argument).split()
                # An empty argument list -- `class ABC()` -- yields no words.
                if len(words) > 0:
                    metadata['parameters'][words[0].strip()] = None

        return metadata
123 |
--------------------------------------------------------------------------------
/src/codetext/parser/ruby_parser.py:
--------------------------------------------------------------------------------
1 | import re
2 | from typing import List, Dict, Any
3 |
4 | import tree_sitter
5 | import logging
6 |
7 | from .language_parser import LanguageParser, get_node_text, get_node_by_kind
8 | # from function_parser.parsers.commentutils import get_docstring_summary
9 |
10 |
11 | logger = logging.getLogger(__name__)
12 |
13 |
class RubyParser(LanguageParser):
    """Static helpers that extract methods, classes/modules, doc comments,
    ``do``-block calls and signature metadata from a tree-sitter syntax tree
    of Ruby source code.
    """

    # Path components; not read inside this class.
    FILTER_PATHS = ('test', 'vendor')

    # Method names; not read inside this class
    # (presumably consulted by callers to skip such methods -- confirm).
    BLACKLISTED_FUNCTION_NAMES = ['initialize', 'to_text', 'display', 'dup', 'clone', 'equal?', '==', '<=>',
                                  '===', '<=', '<', '>', '>=', 'between?', 'eql?', 'hash']

    @staticmethod
    def get_function_list(node):
        """Return every ``method`` and ``singleton_method`` node under ``node``."""
        return get_node_by_kind(node, ['method', 'singleton_method'])

    @staticmethod
    def get_class_list(node):
        """Return every ``class``/``module`` node under ``node`` that has children.

        Childless nodes of these types are the bare keywords and are dropped.
        """
        found = get_node_by_kind(node, ['class', 'module'])
        return [item for item in found if item.children]

    @staticmethod
    def get_docstring_node(node) -> List:
        """Return the run of comment nodes that directly precedes ``node``.

        The previous sibling of ``node`` is examined first; if that is not a
        comment, the previous sibling of ``node``'s parent is tried. Earlier
        comments are prepended as long as each one starts at most one line
        above the start of the sibling that follows it.

        Returns:
            List of comment nodes in source order; empty when none is found.
        """
        docstring_node = []

        prev_node = node.prev_sibling
        if not prev_node or prev_node.type != 'comment':
            parent_node = node.parent
            if parent_node:
                prev_node = parent_node.prev_sibling

        if prev_node and prev_node.type == 'comment':
            docstring_node.append(prev_node)
            prev_node = prev_node.prev_sibling

        while prev_node and prev_node.type == 'comment':
            # Assume the comment is dense: stop at the first gap.
            x_current = prev_node.start_point[0]
            x_next = prev_node.next_sibling.start_point[0]
            if x_next - x_current > 1:
                break

            docstring_node.insert(0, prev_node)
            prev_node = prev_node.prev_sibling

        return docstring_node

    @staticmethod
    def get_docstring(node, blob=None):
        """Return the doc comment of ``node`` as one newline-joined string.

        Lines containing ``=begin`` or ``=end`` are dropped. ``blob`` is
        unused; a notice is logged when it is supplied.
        """
        if blob:
            logger.info('From version `0.0.6` this function will update argument in the API')

        lines = []
        for item in RubyParser.get_docstring_node(node):
            for line in get_node_text(item).split('\n'):
                if '=begin' in line or '=end' in line:
                    continue
                lines.append(line)

        return '\n'.join(lines)

    @staticmethod
    def get_function_metadata(function_node, blob=None) -> Dict[str, str]:
        """Collect name and parameters of a Ruby method.

        Args:
            function_node: ``method`` or ``singleton_method`` node.
            blob: Unused; kept for backward compatibility.

        Returns:
            Dict with ``identifier`` (method name), ``parameters`` (every
            identifier found inside the parameter list, mapped to ``None``) and
            ``return_type`` (``''`` when a ``return`` node exists, else ``None``).
        """
        if blob:
            logger.info('From version `0.0.6` this function will update argument in the API')
        metadata = {
            'identifier': '',
            'parameters': {},
            'return_type': None,
        }

        assert isinstance(function_node, tree_sitter.Node)
        assert function_node.type in ['method', 'singleton_method']

        for child in function_node.children:
            if child.type == 'identifier':
                metadata['identifier'] = get_node_text(child)
            elif child.type in ['method_parameters', 'parameters', 'bare_parameters']:
                for item in get_node_by_kind(child, ['identifier']):
                    metadata['parameters'][get_node_text(item)] = None

        if not metadata['identifier']:
            # Operator, setter and constant-style method names (e.g. `==`)
            # are not `identifier` nodes; read them through the `name` field.
            name_node = function_node.child_by_field_name('name')
            if name_node is not None:
                metadata['identifier'] = get_node_text(name_node)

        if not metadata['return_type']:
            return_statement = get_node_by_kind(function_node, ['return'])
            metadata['return_type'] = '' if len(return_statement) > 0 else None

        return metadata

    @staticmethod
    def get_class_metadata(class_node, blob=None):
        """Collect name and superclass of a Ruby class/module node.

        Returns:
            Dict with ``identifier`` (text of the last direct ``constant``
            child) and ``parameters`` (each ``constant`` inside a
            ``superclass`` child, mapped to ``None``).
        """
        if blob:
            logger.info('From version `0.0.6` this function will update argument in the API')
        metadata = {
            'identifier': '',
            'parameters': {},
        }

        assert isinstance(class_node, tree_sitter.Node)

        for child in class_node.children:
            if child.type == 'constant':
                metadata['identifier'] = get_node_text(child)
            elif child.type == 'superclass':
                for subchild in child.children:
                    if subchild.type == 'constant':
                        metadata['parameters'][get_node_text(subchild)] = None

        return metadata

    @staticmethod
    def get_comment_node(function_node):
        """Return every ``comment`` node found under ``function_node``."""
        # Pass a list like the other calls in this file, so the lookup does
        # not depend on how the helper treats a bare string.
        return get_node_by_kind(function_node, kind=['comment'])

    @staticmethod
    def get_action_list(action_node):
        """Return every ``call`` node under ``action_node`` that contains a ``do_block``."""
        return [
            call_node
            for call_node in get_node_by_kind(action_node, ['call'])
            if get_node_by_kind(call_node, ["do_block"])
        ]

    @staticmethod
    def get_action_metadata(action_node):
        """Collect name and block parameters of a ``call`` node with a block.

        Returns:
            Dict with ``identifier`` (text of the direct ``identifier`` child,
            with the first ``simple_symbol`` of the argument list appended),
            ``parameters`` (identifiers of the first ``block_parameters`` node,
            mapped to ``None``) and ``return_type`` (always ``None``).
        """
        metadata = {
            'identifier': '',
            'parameters': {},
            'return_type': None,
        }

        for child in action_node.children:
            if child.type == "identifier":
                metadata['identifier'] = get_node_text(child)
            elif child.type == "argument_list":
                symbol = get_node_by_kind(child, ["simple_symbol"])
                if symbol:
                    metadata['identifier'] += get_node_text(symbol[0])

        parameters = get_node_by_kind(action_node, ["block_parameters"])
        if parameters:
            for param in get_node_by_kind(parameters[0], ["identifier"]):
                metadata['parameters'][get_node_text(param)] = None

        return metadata
172 |
173 |
--------------------------------------------------------------------------------
/src/codetext/parser/rust_parser.py:
--------------------------------------------------------------------------------
1 | import re
2 | from typing import List, Dict, Any
3 |
4 | import tree_sitter
5 | import logging
6 |
7 | from .language_parser import LanguageParser, get_node_by_kind, get_node_text
8 |
9 |
10 | logger = logging.getLogger(__name__)
11 |
12 |
class RustParser(LanguageParser):
    """Static helpers that extract functions, impl/mod items, doc comments and
    signature metadata from a tree-sitter syntax tree of Rust source code.
    """

    # Path components; not read inside this class.
    FILTER_PATHS = ('test', 'vendor')

    # Function names; not read inside this class
    # (presumably consulted by callers to skip such functions -- confirm).
    BLACKLISTED_FUNCTION_NAMES = ['main']

    @staticmethod
    def get_function_list(node):
        """Return every ``function_item`` node under ``node``."""
        return get_node_by_kind(node, ['function_item'])

    @staticmethod
    def get_class_list(node):
        """Return every ``impl_item`` and ``mod_item`` node under ``node``."""
        return get_node_by_kind(node, ['impl_item', 'mod_item'])  # trait is like an interface

    @staticmethod
    def get_docstring_node(node) -> List:
        """Return the comment node(s) that directly precede ``node``.

        A preceding ``block_comment`` is returned alone. A preceding
        ``line_comment`` is returned together with earlier line comments, as
        long as each one starts at most one line above the sibling after it.

        Returns:
            List of comment nodes in source order; empty when none is found.
        """
        docstring_node = []

        prev_node = node.prev_sibling
        if prev_node:
            if prev_node.type == 'block_comment':
                docstring_node.append(prev_node)

            elif prev_node.type == 'line_comment':
                docstring_node.append(prev_node)
                prev_node = prev_node.prev_sibling

                while prev_node and prev_node.type == 'line_comment':
                    # Assume the comment is dense: stop at the first gap.
                    x_current = prev_node.start_point[0]
                    x_next = prev_node.next_sibling.start_point[0]
                    if x_next - x_current > 1:
                        break

                    docstring_node.insert(0, prev_node)
                    prev_node = prev_node.prev_sibling

        return docstring_node

    @staticmethod
    def get_docstring(node, blob=None):
        """Return the doc comment of ``node`` as one newline-joined string
        (``''`` when there is none). ``blob`` is unused; a notice is logged
        when it is supplied.
        """
        if blob:
            logger.info('From version `0.0.6` this function will update argument in the API')
        docstring_node = RustParser.get_docstring_node(node)
        return '\n'.join(get_node_text(item) for item in docstring_node)

    @staticmethod
    def _get_parameter_name(param_node) -> str:
        """Return the binding name of a ``parameter``/``variadic_parameter`` node.

        A leading ``mut`` specifier is skipped so that ``mut x: T`` yields
        ``x`` and ``mut self: T`` yields ``self``. Falls back to the text of
        the ``pattern`` field, then to ``''``.
        """
        for subchild in param_node.children:
            if subchild.type in ('identifier', 'self'):
                return get_node_text(subchild)
        pattern = param_node.child_by_field_name('pattern')
        if pattern is not None:
            return get_node_text(pattern)
        return ''

    @staticmethod
    def get_function_metadata(function_node, blob=None) -> Dict[str, str]:
        """Collect name, parameters and return type of a Rust function.

        Args:
            function_node: ``function_item`` node.
            blob: Unused; kept for backward compatibility.

        Returns:
            Dict with ``identifier`` (function name), ``parameters`` (name ->
            type text, ``None`` when no type is attached; a ``self_parameter``
            is keyed by its full text such as ``&self``) and ``return_type``
            (declared type text; ``''`` when undeclared but a
            ``return_expression`` exists; ``None`` otherwise).
        """
        if blob:
            logger.info('From version `0.0.6` this function will update argument in the API')
        metadata = {
            'identifier': '',
            'parameters': {},
            'return_type': None,
        }

        assert isinstance(function_node, tree_sitter.Node)
        assert function_node.type == 'function_item'

        # The grammar's `return_type` field covers every kind of return type,
        # not only reference types.
        return_type_node = function_node.child_by_field_name('return_type')
        if return_type_node is not None:
            metadata['return_type'] = get_node_text(return_type_node)

        for child in function_node.children:
            if child.type == 'identifier':
                metadata['identifier'] = get_node_text(child)
            elif child.type == 'parameters':
                params = get_node_by_kind(child, ['parameter', 'variadic_parameter', 'self_parameter'])
                for item in params:
                    if item.type == 'self_parameter':
                        metadata['parameters'][get_node_text(item)] = None
                        continue

                    param_name = RustParser._get_parameter_name(item)
                    param_type = item.child_by_field_name('type')
                    metadata['parameters'][param_name] = get_node_text(param_type) if param_type else None
            elif child.type == 'reference_type' and not metadata['return_type']:
                # Fallback for grammars that do not expose the field.
                metadata['return_type'] = get_node_text(child)

        if not metadata['return_type']:
            return_statement = get_node_by_kind(function_node, ['return_expression'])
            metadata['return_type'] = '' if len(return_statement) > 0 else None

        return metadata

    @staticmethod
    def get_class_metadata(class_node, blob=None):
        """Collect the name (and related type names) of an impl/mod item.

        For a ``mod_item`` the name is its direct ``identifier`` child. For
        any other node, the first ``type_identifier`` found under the node is
        the name and every further one is recorded in ``parameters``.

        Returns:
            Dict with ``identifier`` and ``parameters`` (name -> ``None``).
        """
        if blob:
            logger.info('From version `0.0.6` this function will update argument in the API')
        metadata = {
            'identifier': '',
            'parameters': {},
        }

        assert isinstance(class_node, tree_sitter.Node)

        if class_node.type == 'mod_item':
            for child in class_node.children:
                if child.type == 'identifier':
                    metadata['identifier'] = get_node_text(child)
        else:
            identifier = get_node_by_kind(class_node, ['type_identifier'])
            # An impl over a non-named type may contain no type identifier
            # at all; leave the defaults instead of raising IndexError.
            if identifier:
                metadata['identifier'] = get_node_text(identifier[0])
                for param in identifier[1:]:
                    metadata['parameters'][get_node_text(param)] = None

        return metadata

    @staticmethod
    def get_comment_node(function_node):
        """Return every comment node (any comment kind) under ``function_node``."""
        return get_node_by_kind(function_node, kind=['comment', 'line_comment', 'block_comment'])
145 |
146 |
147 | @staticmethod
148 | def get_comment_node(function_node):
149 | comment_node = get_node_by_kind(function_node, kind=['comment', 'line_comment', 'block_comment'])
150 | return comment_node
151 |
--------------------------------------------------------------------------------
/src/codetext/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .utils import build_language, parse_code, SUPPORTED_LANGUAGE
2 | from .imports import module_available
3 |
# Public names of the package. Every entry must match an imported name,
# otherwise `from codetext.utils import *` raises AttributeError.
__all__ = ["build_language", "parse_code", "module_available", "SUPPORTED_LANGUAGE"]
--------------------------------------------------------------------------------
/src/codetext/utils/imports.py:
--------------------------------------------------------------------------------
1 | """Import utilities."""
2 | import importlib
3 | from importlib.util import find_spec
4 |
5 |
6 | def _package_available(package_name: str) -> bool:
7 | """Check if a package is available in your environment.
8 | .. code-block:: python
9 |
10 | >>> _package_available('os')
11 | True
12 | >>> _package_available('bla')
13 | False
14 | """
15 | return find_spec(package_name) is not None
16 |
17 |
def module_available(module_path: str) -> bool:
    """Check if a module path is available in your environment.
    Source: pytorch_lightning/utilities/imports.py

    The first component must be an importable package. Every further
    component may be an attribute of the object reached so far or a submodule
    that has not been imported yet (an import is attempted in that case).

    .. code-block:: python

        >>> module_available('os')
        True
        >>> module_available('os.bla')
        False
        >>> module_available('bla.bla')
        False

    Args:
        module_path: Dotted path such as ``'package.sub.module'``.

    Returns:
        ``True`` when the whole path can be resolved, ``False`` otherwise.
    """
    module_names = module_path.split(".")
    if not _package_available(module_names[0]):
        return False
    try:
        module = importlib.import_module(module_names[0])
    except ImportError:
        # Found on disk but not importable (e.g. a broken dependency).
        return False

    for depth, name in enumerate(module_names[1:], start=2):
        if not hasattr(module, name):
            # Submodules only become attributes of their parent once they
            # have been imported, so try that before giving up.
            try:
                importlib.import_module(".".join(module_names[:depth]))
            except ImportError:
                return False
            if not hasattr(module, name):
                return False
        module = getattr(module, name)
    return True
--------------------------------------------------------------------------------
/src/codetext/utils/utils.py:
--------------------------------------------------------------------------------
1 | import json
2 | import inspect
3 | import sys
4 | import os
5 | import subprocess
6 | import logging
7 | from pathlib import Path
8 | from typing import List, Dict, Any, Union
9 |
10 | import tree_sitter
11 | from tree_sitter import Language, Parser
12 |
13 |
# Logger named 'utils'; importing this module also configures the root logger
# at INFO level.
logger = logging.getLogger('utils')
logging.basicConfig(level=logging.INFO)


# Language names accepted by `build_language` and `parse_code`
# ('c++' and 'c#' are aliases those functions map to 'cpp' and 'c_sharp').
SUPPORTED_LANGUAGE = [
    'python',
    'java',
    'javascript',
    'ruby',
    'go',
    'c',
    'cpp',
    'c++',
    'c#',
    'c_sharp',
    'php',
    'rust',
]
19 |
20 |
def build_language(language: str, save_path: str=None):
    """
    Build tree-sitter language

    Ensures ``<save_path>/tree-sitter/tree-sitter-<language>`` exists (cloning
    the grammar from GitHub when it is missing) and compiles it into
    ``<save_path>/tree-sitter/<language>.so`` unless that file already exists.

    Args:
        language (str): java, python, cpp, c_sharp, etc. The aliases ``c#``,
            ``c-sharp`` and ``c++`` are accepted.
        save_path (str): save path (default create a `/tree-sitter/` dir next
            to the script that called this function)

    Raises:
        AssertionError: if the language is unsupported, the grammar sources
            cannot be obtained, or the compiled library is not produced.
    """
    language = str(language).lower()
    if language in ('c#', 'c-sharp'):
        language = 'c_sharp'
    elif language == 'c++':
        language = 'cpp'

    assert language in SUPPORTED_LANGUAGE, f"Expect {language} in {SUPPORTED_LANGUAGE}"
    if not save_path:
        # Frame 1 is the direct caller; keep this lookup in this function so
        # the frame depth stays correct.
        calling_script_path = Path(inspect.getframeinfo(sys._getframe(1)).filename)
        save_path = calling_script_path.parent

    # create `tree-sitter` dir
    ts_path = os.path.join(save_path, 'tree-sitter')
    if not os.path.exists(ts_path):
        logger.warning(
            f"Not found `tree-sitter` folder, create new one in {ts_path}"
        )
        os.makedirs(ts_path, exist_ok=True)

    # check `tree-sitter/tree-sitter-`
    repo_name = 'tree-sitter-' + language.replace('_', '-')
    ts_lang_path = os.path.join(ts_path, repo_name)
    if not os.path.exists(ts_lang_path):
        logger.warning(
            f"Not found `{repo_name}`, attempt clone from github to {ts_path}"
        )
        # Argument list + cwd instead of a shell string: works with paths
        # containing spaces and nothing is interpreted by a shell.
        try:
            subprocess.run(
                ['git', 'clone', f"https://github.com/tree-sitter/{repo_name}.git"],
                cwd=ts_path,
                check=False,
            )
        except FileNotFoundError:
            # `git` is not installed; the assertion below reports the failure.
            logger.warning("`git` executable not found, unable to clone grammar")

    assert os.path.exists(ts_lang_path), f"Unable to find {language} tree-sitter in {ts_path}"

    lang_path = os.path.join(save_path, 'tree-sitter', f'{language}.so')
    if not os.path.exists(lang_path):
        logger.info(
            f"Attempt to build Tree-sitter Language for {language} and store in {lang_path}"
        )
        Language.build_library(lang_path, [ts_lang_path])
        assert os.path.exists(lang_path)
    else:
        logger.info("Language already existed!")
69 |
70 |
def parse_code(raw_code: str, language: str='Auto', tree_sitter_path: str=None) -> tree_sitter.Tree:
    """
    Auto parse raw code into `tree_sitter.Tree`

    The grammar is taken from the `tree-sitter-languages` package when it can
    be imported; otherwise a `<language>.so` library is looked up (and built
    if missing) under `<tree_sitter_path>/tree-sitter/`.

    Args:
        raw_code (str): Raw source code need to parse (`bytes` also accepted)
        language (str): Language to load parser. The aliases `c#`, `c-sharp`
            and `c++` are accepted.
        tree_sitter_path (str): Directory containing the `tree-sitter` folder
            (default: the directory of the script that called this function)

    Returns:
        tree_sitter.Tree: the parsed syntax tree

    Raises:
        NotImplementedError: if `language` is 'Auto' (detection not implemented)
        AssertionError: if `language` is not supported
        ValueError: if `raw_code` is neither `str` nor `bytes`
    """
    # TODO: auto detect language
    if language == 'Auto':
        # `NotImplemented` is a constant, not an exception class.
        raise NotImplementedError("This feature is underdevelopment")
    language = str(language).lower()
    if language in ('c#', 'c-sharp'):
        language = 'c_sharp'
    elif language == 'c++':
        language = 'cpp'
    assert language in SUPPORTED_LANGUAGE, f"Expect {language} in {SUPPORTED_LANGUAGE}"

    if tree_sitter_path:
        load_path = tree_sitter_path
    else:
        # Frame 1 is the direct caller; keep this lookup in this function so
        # the frame depth stays correct.
        calling_script_path = Path(inspect.getframeinfo(sys._getframe(1)).filename)
        load_path = str(calling_script_path.parent)

    # Get parser from languages
    parser = Parser()
    try:
        from tree_sitter_languages import get_language
        language = get_language(language)
    except ImportError:
        # Work-around when pre-built binaries wheels for tree-sitter-languages are not available
        logger.warning(f"Troubled importing 'tree-sitter-languages', attemp to look for pre-built binaries in the workspace")
        ts_lang_path = os.path.join(load_path, 'tree-sitter', f'{language}.so')
        if not os.path.exists(ts_lang_path):
            logger.warning(f"Not found `{language}.so` in `{load_path}/tree-sitter/`, attemp to build language")
            build_language(language, load_path)
        language = Language(ts_lang_path, language)
    parser.set_language(language)

    if isinstance(raw_code, str):
        raw_code = bytes(raw_code, 'utf8')
    elif not isinstance(raw_code, bytes):
        raise ValueError(f"Expect `str`, got {type(raw_code)}")
    return parser.parse(raw_code)
118 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
# The working directory at import time is taken as the project root.
PROJECT_PATH = os.getcwd()
# `<project root>/src`
SOURCE_PATH = os.path.join(PROJECT_PATH, "src")

# Make the source directory importable.
sys.path.append(SOURCE_PATH)
--------------------------------------------------------------------------------
/tests/setup.py:
--------------------------------------------------------------------------------
"""Make sure a tree-sitter grammar is available for every tested language.

Run from the project root as ``python -m tests.setup``.
"""
# Absolute import, like the test modules: a relative import that climbs above
# the top-level `tests` package cannot be resolved.
from src.codetext.utils import build_language

if __name__ == '__main__':
    # Names as accepted by `build_language` (note `c_sharp`, not `c-sharp`).
    lang_list = ['python', 'cpp', 'java', 'c_sharp', 'ruby', 'rust', 'javascript', 'php', 'go']

    for lang in lang_list:
        try:
            # Imported lazily so a missing `tree-sitter-languages` wheel
            # falls through to a local build instead of aborting the script.
            from tree_sitter_languages import get_parser
            get_parser(lang)  # takes the language name
        except Exception:
            build_language(lang)
13 |
--------------------------------------------------------------------------------
/tests/test_clean/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FSoft-AI4Code/CodeText-parser/d2a7365f7f944650e84d9fdb6b6794d6c5ea620b/tests/test_clean/__init__.py
--------------------------------------------------------------------------------
/tests/test_clean/test_clean_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FSoft-AI4Code/CodeText-parser/d2a7365f7f944650e84d9fdb6b6794d6c5ea620b/tests/test_clean/test_clean_utils.py
--------------------------------------------------------------------------------
/tests/test_parser/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FSoft-AI4Code/CodeText-parser/d2a7365f7f944650e84d9fdb6b6794d6c5ea620b/tests/test_parser/__init__.py
--------------------------------------------------------------------------------
/tests/test_parser/test_c.py:
--------------------------------------------------------------------------------
1 | '''test for C++ parser on C source code'''
2 | import os
3 | import unittest
4 |
5 | from src.codetext.parser import CppParser
6 | from src.codetext.utils import parse_code
7 |
8 |
# Unit tests that run CppParser on C source parsed with the 'c' grammar.
9 | class Test_CppParser_with_C(unittest.TestCase):
# Load the C sample file and keep the root node of its parsed tree.
10 | def setUp(self) -> None:
11 | with open('tests/test_parser/test_sample/c_test_sample.c', 'r') as file:
12 | self.code_sample = file.read()
13 |
14 | tree = parse_code(self.code_sample, 'c')
15 | self.root_node = tree.root_node
16 |
17 | return super().setUp()
18 |
# The sample file is expected to yield exactly two functions.
19 | def test_get_function_list(self):
20 | root = self.root_node
21 |
22 | function_list = CppParser.get_function_list(root)
23 |
24 | self.assertEqual(len(function_list), 2)
25 |
# Checks name, parameters and return type of the first function in the sample.
26 | def test_get_function_metadata(self):
27 | root = self.root_node
28 |
29 | function = CppParser.get_function_list(root)[0]
30 | metadata = CppParser.get_function_metadata(function)
31 |
32 | for key in ['identifier', 'parameters', 'return_type']:
33 | self.assertTrue(key in metadata.keys())
34 | self.assertEqual(metadata['parameters'], {'random_seed': 'int'})
35 | self.assertEqual(metadata['identifier'], 'reverseSentence')
36 | self.assertEqual(metadata['return_type'], 'void')
37 |
# Placeholder: asserts nothing.
38 | def test_get_class_list(self):
39 | pass
40 |
# Placeholder: asserts nothing.
41 | def test_get_class_metadata(self):
42 | pass
43 |
# The block comment placed directly above a function must be returned verbatim.
44 | def test_get_docstring(self):
45 | code_sample = """
46 | /**
47 | * A brief description. A more elaborate class description
48 | * @param random_seed somearg.
49 | * @see Test()
50 | * @return The test results
51 | */
52 | void reverseSentence(int random_seed) {
53 | char c;
54 | scanf("%c", &c);
55 | if (c != '\n') {
56 | reverseSentence();
57 | printf("%c", c);
58 | }
59 | }
60 | """
61 | tree = parse_code(code_sample, 'c')
62 | root = tree.root_node
63 |
64 | fn= CppParser.get_function_list(root)[0]
65 |
66 | docs = CppParser.get_docstring(fn)
67 |
68 | self.assertEqual(docs, '/**\n * A brief description. A more elaborate class description\n * @param random_seed somearg.\n * @see Test()\n * @return The test results\n */')
69 |
70 |
# Placeholder: asserts nothing.
71 | def test_extract_docstring(self):
72 | pass
73 |
74 |
75 | if __name__ == '__main__':
76 | unittest.main()
77 |
--------------------------------------------------------------------------------
/tests/test_parser/test_cpp.py:
--------------------------------------------------------------------------------
1 | '''test for C++ parser'''
2 | import os
3 | import unittest
4 | from pathlib import Path
5 |
6 | from src.codetext.parser import CppParser
7 | from src.codetext.utils import parse_code
8 |
9 |
# Unit tests for CppParser on C++ source parsed with the 'c++' alias.
10 | class Test_CppParser(unittest.TestCase):
# Load the C++ sample file and keep the root node of its parsed tree.
11 | def setUp(self) -> None:
12 | with open('tests/test_parser/test_sample/cpp_test_sample.cpp', 'r') as file:
13 | self.code_sample = file.read()
14 |
15 | tree = parse_code(self.code_sample, 'c++')
16 | self.root_node = tree.root_node
17 |
18 | return super().setUp()
19 |
# The sample file is expected to yield exactly three functions.
20 | def test_get_function_list(self):
21 | root = self.root_node
22 |
23 | function_list = CppParser.get_function_list(root)
24 |
25 | self.assertEqual(len(function_list), 3)
26 |
# The sample file is expected to yield exactly two classes.
27 | def test_get_class_list(self):
28 | root = self.root_node
29 |
30 | class_list = CppParser.get_class_list(root)
31 |
32 | self.assertEqual(len(class_list), 2)
33 |
# Checks name, parameters and return type of the first function in the sample.
34 | def test_get_function_metadata(self):
35 | root = self.root_node
36 |
37 | function = list(CppParser.get_function_list(root))[0]
38 | metadata = CppParser.get_function_metadata(function)
39 |
40 | for key in ['identifier', 'parameters', 'return_type']:
41 | self.assertTrue(key in metadata.keys(), "Missing {}".format(key))
42 | self.assertEqual(metadata['parameters'], {'a': 'int', 'b': 'int'})
43 | self.assertEqual(metadata['identifier'], 'sum2number')
44 | self.assertEqual(metadata['return_type'], 'int')
45 |
# Checks name and base classes of the first class in the sample.
46 | def test_get_class_metadata(self):
47 | root = self.root_node
48 |
49 | classes = list(CppParser.get_class_list(root))[0]
50 | metadata = CppParser.get_class_metadata(classes)
51 |
52 | self.assertEqual(metadata['parameters'], {'Vehicle': None, 'B': None})
53 | self.assertEqual(metadata['identifier'], 'Car')
54 |
# Covers both a block comment and a run of `//` line comments above a function.
# NOTE(review): the embedded sample looks truncated around the for-loop
# (numbering jumps from 68 to 76) -- confirm against the original file.
55 | def test_get_docstring(self):
56 | code_sample = """
57 | /**
58 | * Find 2 sum
59 | *
60 | * @param nums List number.
61 | * @param target Sum target.
62 | * @return postion of 2 number.
63 | */
64 | vector twoSum(vector& nums, int target) {
65 | map m;
66 | vector v;
67 | int n= nums.size();
68 | for(int i=0;isecond);
76 | v.push_back(i);
77 | }
78 | m.insert(make_pair(nums[i],i));
79 | }
80 |
81 | return v;
82 | }
83 |
84 | // Comment in
85 | // multiple line
86 | // of the function sum
87 | double sum2num(int a, int b) {
88 | return a + b;
89 | }
90 | """
91 | tree = parse_code(code_sample, 'c++')
92 | root = tree.root_node
93 |
94 | fn1, fn2 = list(CppParser.get_function_list(root))
95 |
96 | docs1 = CppParser.get_docstring(fn1)
97 | docs2 = CppParser.get_docstring(fn2)
98 |
99 | self.assertEqual(docs1, '/**\n * Find 2 sum\n *\n * @param nums List number.\n * @param target Sum target.\n * @return postion of 2 number.\n */')
100 | self.assertEqual(docs2, '// Comment in\n// multiple line\n// of the function sum')
102 |
103 | if __name__ == '__main__':
104 | unittest.main()
105 |
--------------------------------------------------------------------------------
/tests/test_parser/test_csharp.py:
--------------------------------------------------------------------------------
1 | '''test for C# parser'''
2 | import os
3 | import unittest
4 | from pathlib import Path
5 |
6 | from src.codetext.parser import CsharpParser
7 | from src.codetext.utils import parse_code
8 |
9 |
# Unit tests for CsharpParser on C# source parsed with the 'c#' alias.
10 | class Test_CsharpParser(unittest.TestCase):
# Load the C# sample file and keep the root node of its parsed tree.
11 | def setUp(self) -> None:
12 | with open('tests/test_parser/test_sample/c_sharp_test_sample.cs', 'r') as file:
13 | self.code_sample = file.read()
14 |
15 | tree = parse_code(self.code_sample, 'c#')
16 | self.root_node = tree.root_node
17 |
18 | return super().setUp()
19 |
# The sample file is expected to yield exactly three functions.
20 | def test_get_function_list(self):
21 | root = self.root_node
22 |
23 | function_list = CsharpParser.get_function_list(root)
24 |
25 | self.assertEqual(len(function_list), 3) # exclude constructor
26 |
# The sample file is expected to yield exactly one class.
27 | def test_get_class_list(self):
28 | root = self.root_node
29 |
30 | class_list = CsharpParser.get_class_list(root)
31 |
32 | self.assertEqual(len(class_list), 1)
33 |
# Covers a run of `//` line comments and a `/* */` block comment above methods.
34 | def test_get_docstring(self):
35 | code_sample = """
36 | class Vehicle
37 | {
38 | public string brand = "Ford"; // Vehicle field
39 |
40 | //
41 | // Docstring of a method
42 | //
43 | // Argument.
44 | //
45 | // None.
46 | public void honk(string animal_honk)
47 | {
48 | Console.WriteLine(animal_honk);
49 | Console.WriteLine("Tuut, tuut!");
50 | }
51 |
52 | /* Another method docstring
53 | in multiple line */
54 | public void _honk()
55 | {
56 | Console.WriteLine("Tuut, tuut!");
57 | }
58 | }
59 | """
60 | tree = parse_code(code_sample, 'c#')
61 | root = tree.root_node
62 |
63 | fn1, fn2 = list(CsharpParser.get_function_list(root))
64 |
65 | docs1 = CsharpParser.get_docstring(fn1)
66 | docs2 = CsharpParser.get_docstring(fn2)
67 |
68 | self.assertEqual(docs1, '// \n// Docstring of a method\n// \n// Argument.\n// \n// None.')
69 | self.assertEqual(docs2, '/* Another method docstring\n in multiple line */')
70 |
71 |
# Checks name, parameters and return type of the first function in the sample.
72 | def test_get_function_metadata(self):
73 | root = self.root_node
74 |
75 | function = list(CsharpParser.get_function_list(root))[0]
76 | metadata = CsharpParser.get_function_metadata(function)
77 |
78 | for key in ['identifier', 'parameters', 'return_type']:
79 | self.assertTrue(key in metadata.keys())
80 | self.assertEqual(metadata['parameters'], {'path': 'string', 'filename': 'string'})
81 | self.assertEqual(metadata['identifier'], 'GetText')
82 | self.assertEqual(metadata['return_type'], 'string')
83 |
# Checks name and base class of the first class in the sample.
84 | def test_get_class_metadata(self):
85 | root = self.root_node
86 |
87 | classes = list(CsharpParser.get_class_list(root))[0]
88 | metadata = CsharpParser.get_class_metadata(classes)
89 |
90 | self.assertEqual(metadata['parameters'], {'Animal': None})
91 | self.assertEqual(metadata['identifier'], 'Dog')
92 |
93 |
94 | if __name__ == '__main__':
95 | unittest.main()
96 |
--------------------------------------------------------------------------------
/tests/test_parser/test_go.py:
--------------------------------------------------------------------------------
1 | '''test for Go parser'''
2 | import os
3 | import unittest
4 | from pathlib import Path
5 |
6 | from src.codetext.parser import GoParser
7 | from src.codetext.utils import parse_code
8 |
9 |
class Test_GoParser(unittest.TestCase):
    """Exercise GoParser on the bundled Go sample file and on an inline snippet."""

    def setUp(self) -> None:
        """Read the Go sample file, parse it, and keep the source text and root node."""
        # Explicit encoding: without it the decoding depends on the platform locale.
        with open('tests/test_parser/test_sample/go_test_sample.go', 'r', encoding='utf-8') as file:
            self.code_sample = file.read()

        tree = parse_code(self.code_sample, 'go')
        self.root_node = tree.root_node
        return super().setUp()

    def test_get_function_list(self):
        """The sample file is expected to yield exactly one function."""
        root = self.root_node

        function_list = GoParser.get_function_list(root)

        self.assertEqual(len(function_list), 1)

    def test_get_function_metadata(self):
        """Metadata of the first function in the sample file: name, parameters, return type."""
        root = self.root_node

        function = GoParser.get_function_list(root)[0]
        metadata = GoParser.get_function_metadata(function)

        for key in ['identifier', 'parameters', 'return_type']:
            # assertIn names the missing key on failure; assertTrue only says "False is not true".
            self.assertIn(key, metadata.keys())
        self.assertEqual(metadata['parameters'], {'e': 'TypeError'})
        self.assertEqual(metadata['identifier'], 'Error')
        self.assertEqual(metadata['return_type'], 'string')

    def test_get_docstring(self):
        """Only the comment run directly above the function counts as its docstring.

        The earlier comment, separated from the function by a blank line, must not
        appear in the result.
        """
        code_sample = """
type TypeError struct {
Type1, Type2 reflect.Type
Extra string
}
// Something must not include as docstring

// The path package should only be used for paths separated by forward
// slashes, such as the paths in URLs. This package does not deal with
// Windows paths with drive letters or backslashes; to manipulate
// operating system paths, use the [path/filepath] package.
func (e TypeError) Error() string {
msg := e.Type1.String()
if e.Type2 != nil {
msg += " and " + e.Type2.String()
}
msg += " " + e.Extra
return msg
}
"""
        tree = parse_code(code_sample, 'go')
        root = tree.root_node

        fn = GoParser.get_function_list(root)[0]

        docs = GoParser.get_docstring(fn)
        self.assertEqual(docs, '// The path package should only be used for paths separated by forward\n// slashes, such as the paths in URLs. This package does not deal with\n// Windows paths with drive letters or backslashes; to manipulate\n// operating system paths, use the [path/filepath] package.')

    def test_extract_docstring(self):
        """Placeholder: nothing is asserted yet."""
        pass


if __name__ == '__main__':
    unittest.main()
74 |
--------------------------------------------------------------------------------
/tests/test_parser/test_java.py:
--------------------------------------------------------------------------------
1 | '''test for Java parser'''
2 | import os
3 | import unittest
4 | from pathlib import Path
5 |
6 | from src.codetext.parser import JavaParser
7 | from src.codetext.utils import parse_code
8 |
9 |
class Test_JavaParser(unittest.TestCase):
    """Exercise JavaParser on the bundled Java sample file and on an inline snippet."""

    def setUp(self) -> None:
        """Read the Java sample file, parse it, and keep the source text and root node."""
        # Explicit encoding: without it the decoding depends on the platform locale.
        with open('tests/test_parser/test_sample/java_test_sample.java', 'r', encoding='utf-8') as file:
            self.code_sample = file.read()

        tree = parse_code(self.code_sample, 'java')
        self.root_node = tree.root_node

        return super().setUp()

    def test_get_function_list(self):
        """The sample file is expected to yield two functions."""
        root = self.root_node

        function_list = JavaParser.get_function_list(root)

        self.assertEqual(len(function_list), 2)

    def test_get_class_list(self):
        """The sample file is expected to yield exactly one class."""
        root = self.root_node

        class_list = JavaParser.get_class_list(root)

        self.assertEqual(len(class_list), 1)

    def test_get_docstring(self):
        """A Javadoc block preceding a method is returned as that method's docstring."""
        code_sample = """
public class SaveFileController {
/**
* Adds new user and saves to file.
*
* @param context instance of Context
* @param user instance of User
* @see User
*/
public void addNewUser(Context context, User user){
loadFromFile(context);
this.allUsers.add(user);
saveToFile(context);
}
}
"""
        tree = parse_code(code_sample, 'java', './')
        root = tree.root_node

        fn = list(JavaParser.get_function_list(root))[0]

        docs = JavaParser.get_docstring(fn)
        self.assertEqual(docs, '/**\n * Adds new user and saves to file.\n *\n * @param context instance of Context\n * @param user instance of User\n * @see User\n */')

    def test_get_function_metadata(self):
        """Metadata of the first function in the sample file: name, typed parameters, return type."""
        root = self.root_node

        function = list(JavaParser.get_function_list(root))[0]
        metadata = JavaParser.get_function_metadata(function)

        for key in ['identifier', 'parameters', 'return_type']:
            # assertIn names the missing key on failure; assertTrue only says "False is not true".
            self.assertIn(key, metadata.keys())
        self.assertEqual(metadata['parameters'], {'context': 'Context', 'userIndex': 'int'})
        self.assertEqual(metadata['identifier'], 'getHabitList')
        self.assertEqual(metadata['return_type'], 'HabitList')

    def test_get_class_metadata(self):
        """Metadata of the first class in the sample file: its name plus the types it extends/implements."""
        root = self.root_node

        classes = list(JavaParser.get_class_list(root))[0]
        metadata = JavaParser.get_class_metadata(classes)

        self.assertEqual(metadata['parameters'], {'SudoUser': None, 'FileController': None})
        self.assertEqual(metadata['identifier'], 'SaveFileController')

    def test_extract_docstring(self):
        """Placeholder: nothing is asserted yet."""
        pass


if __name__ == '__main__':
    unittest.main()
87 |
--------------------------------------------------------------------------------
/tests/test_parser/test_javascript.py:
--------------------------------------------------------------------------------
1 | '''test for JavaScript parser'''
2 | import os
3 | import unittest
4 | from pathlib import Path
5 |
6 | from src.codetext.parser import JavascriptParser
7 | from src.codetext.utils import parse_code
8 |
9 |
class Test_JavascriptParser(unittest.TestCase):
    """Exercise JavascriptParser on the bundled JavaScript sample file and on inline snippets."""

    def setUp(self) -> None:
        """Read the JavaScript sample file, parse it, and keep the source text and root node."""
        # Explicit encoding: without it the decoding depends on the platform locale.
        with open('tests/test_parser/test_sample/javascript_test_sample.js', 'r', encoding='utf-8') as file:
            self.code_sample = file.read()

        tree = parse_code(self.code_sample, 'javascript')
        self.root_node = tree.root_node

        return super().setUp()

    def test_get_function_list(self):
        """The sample file is expected to yield seven functions."""
        root = self.root_node

        function_list = JavascriptParser.get_function_list(root)

        self.assertEqual(len(function_list), 7)

    def test_get_class_list(self):
        """The sample file is expected to yield two classes."""
        root = self.root_node

        class_list = JavascriptParser.get_class_list(root)

        self.assertEqual(len(class_list), 2)

    def test_get_docstring(self):
        """JSDoc blocks are picked up for both a top-level function and a class method."""
        code_sample = """
/**
* Dispatched when the repositories are loaded by the request saga
*
* @param {array} repos The repository data
* @param {string} username The current username
*
* @return {object} An action object with a type of LOAD_REPOS_SUCCESS passing the repos
*/
function songsLoaded(repos, username) {
return {
type: LOAD_SONGS_SUCCESS,
repos,
username,
};
}

class Car {
/**
* Present the object Car
*
* @return {None}
*/
present() {
return 'I have a ' + this.carname;
}
}
"""

        tree = parse_code(code_sample, 'javascript')
        root = tree.root_node

        # Unpacking doubles as a check that exactly two functions were found.
        fn1, fn2 = JavascriptParser.get_function_list(root)

        docs1 = JavascriptParser.get_docstring(fn1)
        docs2 = JavascriptParser.get_docstring(fn2)

        self.assertEqual(docs1, '/**\n * Dispatched when the repositories are loaded by the request saga\n *\n * @param {array} repos The repository data\n * @param {string} username The current username\n *\n * @return {object} An action object with a type of LOAD_REPOS_SUCCESS passing the repos\n */')
        self.assertEqual(docs2, '/**\n * Present the object Car\n *\n * @return {None}\n */')

    def test_get_function_metadata(self):
        """Metadata of the second function in the sample file: name and untyped parameters."""
        root = self.root_node

        _function = JavascriptParser.get_function_list(root)[1]
        metadata = JavascriptParser.get_function_metadata(_function)

        for key in ['identifier', 'parameters', 'return_type']:
            # assertIn names the missing key on failure; assertTrue only says "False is not true".
            self.assertIn(key, metadata.keys())
        self.assertEqual(metadata['identifier'], 'songsLoaded')
        self.assertEqual(metadata['parameters'], {'repos': None, 'username': None})

    def test_metadata_with_return_statement(self):
        """A function containing a return statement is expected to report return_type ''."""
        code_sample = '''
function myFunction(p1, p2) {
return p1 * p2;
}
'''
        root = parse_code(code_sample, 'javascript').root_node
        fn = JavascriptParser.get_function_list(root)[0]
        metadata = JavascriptParser.get_function_metadata(fn)

        return_type = metadata['return_type']
        self.assertEqual(return_type, '')

    def test_get_class_metadata(self):
        """Metadata of the first class in the sample file: its name and its parent class."""
        root = self.root_node

        classes = JavascriptParser.get_class_list(root)[0]
        metadata = JavascriptParser.get_class_metadata(classes)

        self.assertEqual(metadata['identifier'], 'Model')
        self.assertEqual(metadata['parameters'], {'Car': None})

    def test_extract_docstring(self):
        """Placeholder: nothing is asserted yet."""
        pass

    def test_metadata_with_arrow_function(self):
        """An arrow function assigned to a const takes the const's name as its identifier."""
        code_sample = '''
export const parseModel = async (mesh) =>
new Promise((resolve) => {
exporter.parse(
mesh,
(gltf) => {
const blob = new Blob([gltf], { type: "application/octet-stream" });
resolve(blob);
return blob;
},
(error) => {
console.log(error);
return error;

}
);
});
'''
        root = parse_code(code_sample, 'javascript').root_node
        fn = JavascriptParser.get_function_list(root)[0]
        metadata = JavascriptParser.get_function_metadata(fn)

        identifier = metadata['identifier']
        self.assertEqual(identifier, 'parseModel')

    def test_metadata_with_undecleared_functions(self):
        """Anonymous function / generator expressions take the name of the const they are assigned to."""
        code_sample = """
const asyncFunctionExpression = async function() {
// async function expression definition
return a
};

const generatorFunctionExpression = function*() {
// generator function expression definition
return b
};
"""
        root = parse_code(code_sample, 'javascript').root_node
        # Unpacking doubles as a check that exactly two functions were found.
        fn1, fn2 = JavascriptParser.get_function_list(root)

        self.assertEqual(fn1.type, 'function')
        self.assertEqual(fn2.type, 'generator_function')

        metadata1 = JavascriptParser.get_function_metadata(fn1)
        metadata2 = JavascriptParser.get_function_metadata(fn2)

        self.assertEqual(metadata1['identifier'], 'asyncFunctionExpression')
        self.assertEqual(metadata2['identifier'], 'generatorFunctionExpression')


if __name__ == '__main__':
    unittest.main()
166 |
--------------------------------------------------------------------------------
/tests/test_parser/test_php.py:
--------------------------------------------------------------------------------
1 | '''test for PHP parser'''
2 | import os
3 | import unittest
4 | from pathlib import Path
5 |
6 | from src.codetext.parser import PhpParser
7 | from src.codetext.utils import parse_code
8 |
9 |
class Test_PhpParser(unittest.TestCase):
    """Exercise PhpParser on the bundled PHP sample file and on inline snippets."""

    def setUp(self) -> None:
        """Read the PHP sample file, parse it, and keep the source text and root node."""
        # Explicit encoding: without it the decoding depends on the platform locale.
        with open('tests/test_parser/test_sample/php_test_sample.php', 'r', encoding='utf-8') as file:
            self.code_sample = file.read()

        tree = parse_code(self.code_sample, 'php')
        self.root_node = tree.root_node

        return super().setUp()

    def test_get_function_list(self):
        """The sample file is expected to yield five functions."""
        root = self.root_node

        function_list = PhpParser.get_function_list(root)

        self.assertEqual(len(function_list), 5)

    def test_get_class_list(self):
        """The sample file is expected to yield three class-like declarations."""
        root = self.root_node

        class_list = PhpParser.get_class_list(root)

        self.assertEqual(len(class_list), 3)

    def test_get_docstring(self):
        """A PHPDoc block preceding a function is returned as that function's docstring."""
        # NOTE(review): the PHP snippet is empty in this copy of the file (its
        # content appears to have been stripped); restore it before running.
        code_sample = """

"""

        tree = parse_code(code_sample, 'php')
        root = tree.root_node

        fn = PhpParser.get_function_list(root)[0]

        docs = PhpParser.get_docstring(fn)

        self.assertEqual(docs, '/**\n * Get all image nodes.\n *\n * @param \\DOMNode $node The \\DOMDocument instance\n * @param boolean $strict If the document has to be valid\n *\n * @return \\DOMNode\n */')

    def test_get_function_metadata(self):
        """Metadata of the second function in the sample file: name, typed parameters, return type."""
        root = self.root_node

        function = list(PhpParser.get_function_list(root))[1]
        metadata = PhpParser.get_function_metadata(function)

        for key in ['identifier', 'parameters', 'return_type']:
            # assertIn names the missing key on failure; assertTrue only says "False is not true".
            self.assertIn(key, metadata.keys())
        self.assertEqual(metadata['parameters'], {'$params': 'array', '$connectionOptions': 'array'})
        self.assertEqual(metadata['identifier'], 'constructDsn')
        self.assertEqual(metadata['return_type'], 'string')

    def test_metadata_with_return_statement(self):
        """The snippet's function is expected to report return_type ''."""
        # NOTE(review): the PHP snippet is empty in this copy of the file (its
        # content appears to have been stripped); restore it before running.
        code_sample = '''

'''
        root = parse_code(code_sample, 'PHP').root_node
        fn = PhpParser.get_function_list(root)[0]
        metadata = PhpParser.get_function_metadata(fn)

        return_type = metadata['return_type']
        self.assertEqual(return_type, '')

    def test_metadata_without_return_statement(self):
        """The snippet's function is expected to report return_type None."""
        # NOTE(review): the PHP snippet is empty in this copy of the file (its
        # content appears to have been stripped); restore it before running.
        code_sample = '''

'''
        root = parse_code(code_sample, 'PHP').root_node
        fn = PhpParser.get_function_list(root)[0]
        metadata = PhpParser.get_function_metadata(fn)

        return_type = metadata['return_type']
        # assertIsNone checks identity, so a falsy non-None value cannot slip through.
        self.assertIsNone(return_type)

    def test_get_class_metadata(self):
        """Class, interface and trait from the sample file each expose their identifier."""
        root = self.root_node

        # Unpacking doubles as a check that exactly three declarations were found.
        _class, interface, trait = list(PhpParser.get_class_list(root))
        class_metadata = PhpParser.get_class_metadata(_class)

        self.assertEqual(class_metadata['parameters'], {'AbstractSQLServerDriver': None})
        self.assertEqual(class_metadata['identifier'], 'Driver')

        interface_metadata = PhpParser.get_class_metadata(interface)
        self.assertEqual(interface_metadata['identifier'], 'MyInterface')

        trait_metadata = PhpParser.get_class_metadata(trait)
        self.assertEqual(trait_metadata['identifier'], 'MyTrait')


if __name__ == '__main__':
    unittest.main()
122 |
--------------------------------------------------------------------------------
/tests/test_parser/test_python.py:
--------------------------------------------------------------------------------
1 | '''test for python parser'''
2 | import os
3 | import unittest
4 | from pathlib import Path
5 |
6 | from src.codetext.parser import PythonParser
7 | from src.codetext.utils import parse_code
8 |
9 |
class Test_PythonParser(unittest.TestCase):
    """Exercise PythonParser on the bundled Python sample file and on inline snippets."""

    def setUp(self) -> None:
        """Read the Python sample file, parse it, and keep the source text and root node."""
        # Explicit encoding: without it the decoding depends on the platform locale.
        with open('tests/test_parser/test_sample/py_test_sample.py', 'r', encoding='utf-8') as file:
            self.code_sample = file.read()

        tree = parse_code(self.code_sample, 'python')
        self.root_node = tree.root_node
        return super().setUp()

    def test_get_function_list(self):
        """The sample file is expected to yield three functions."""
        root = self.root_node

        function_list = PythonParser.get_function_list(root)

        self.assertEqual(len(function_list), 3)

    def test_get_class_list(self):
        """The sample file is expected to yield exactly one class."""
        root = self.root_node

        class_list = PythonParser.get_class_list(root)
        self.assertEqual(len(class_list), 1)

    def test_get_docstring(self):
        """The docstring is returned without its surrounding triple quotes."""
        # Snippet bodies are indented: Python source without indentation does not parse.
        code_sample = '''
def test_sample():
    """This is a docstring"""
    return
'''
        root = parse_code(code_sample, 'python').root_node

        function = PythonParser.get_function_list(root)[0]
        docstring = PythonParser.get_docstring(function)
        self.assertEqual(docstring, "This is a docstring")

    def test_get_function_metadata(self):
        """Annotated parameters map to their type; unannotated ones map to None."""
        code_sample = '''
def test_sample(arg1: str = "string", arg2 = "another_string"):
    return NotImplement()
'''
        root = parse_code(code_sample, 'python').root_node

        function = list(PythonParser.get_function_list(root))[0]
        metadata = PythonParser.get_function_metadata(function)

        for key in ['identifier', 'parameters', 'return_type']:
            # assertIn names the missing key on failure; assertTrue only says "False is not true".
            self.assertIn(key, metadata.keys())
        self.assertEqual(metadata['parameters'], {'arg1': 'str', 'arg2': None})
        self.assertEqual(metadata['identifier'], 'test_sample')

    def test_get_class_metadata(self):
        """Base classes are reported as the class 'parameters', in declaration order."""
        code_sample = '''
class ABC():
    pass

class Sample(ABC):
    def __init__(self):
        pass

    def test_sample(self, arg1: str = "string", arg2 = "another_string"):
        return NotImplement()

class ThisIsalsoAclass(ABC, Sample):
    pass
'''
        root = parse_code(code_sample, 'python').root_node

        classes = list(PythonParser.get_class_list(root))
        self.assertEqual(len(classes), 3)

        # No base classes.
        metadata = PythonParser.get_class_metadata(classes[0])
        self.assertEqual(metadata['parameters'], {})
        self.assertEqual(metadata['identifier'], 'ABC')

        # Single base class.
        metadata = PythonParser.get_class_metadata(classes[1])
        self.assertEqual(metadata['parameters'], {'ABC': None})
        self.assertEqual(metadata['identifier'], 'Sample')

        # Multiple base classes.
        metadata = PythonParser.get_class_metadata(classes[2])
        self.assertEqual(metadata['parameters'], {'ABC': None, 'Sample': None})
        self.assertEqual(metadata['identifier'], 'ThisIsalsoAclass')

    def test_get_comment_list(self):
        """Comment nodes of the sample file are returned in order, sixteen in total."""
        root = self.root_node

        comment_list = PythonParser.get_comment_node(root)
        comment_list = [node.text.decode() for node in comment_list]

        # unittest assertions instead of bare `assert`: they survive `python -O`
        # and show both values when they differ.
        self.assertEqual(comment_list[1], '# choose the rightmost element as pivot')
        self.assertEqual(comment_list[2], '# pointer for greater element')
        self.assertEqual(len(comment_list), 16)

    def test_metadata_without_return_statement(self):
        """A function with no return statement is expected to report return_type None."""
        code_sample = '''
def sum2num():
    pass
'''
        root = parse_code(code_sample, 'python').root_node
        fn = PythonParser.get_function_list(root)[0]
        metadata = PythonParser.get_function_metadata(fn)

        return_type = metadata['return_type']
        # assertIsNone checks identity, so a falsy non-None value cannot slip through.
        self.assertIsNone(return_type)

    def test_metadata_with_return_statement(self):
        """A function that returns but has no annotation is expected to report return_type ''."""
        code_sample = '''
def sum2num():
    return True
'''
        root = parse_code(code_sample, 'python').root_node
        fn = PythonParser.get_function_list(root)[0]
        metadata = PythonParser.get_function_metadata(fn)

        return_type = metadata['return_type']
        self.assertEqual(return_type, '')

    def test_get_parameter(self):
        """All three parameters are found, and the `-> int` annotation is the return type."""
        code_sample = '''
def sum2num(a: tree_sitter.Node=None, b=None, c:string) -> int:
    pass
'''

        root = parse_code(code_sample, 'python').root_node
        fn = PythonParser.get_function_list(root)[0]

        metadata = PythonParser.get_function_metadata(fn)
        parameter = metadata['parameters']
        self.assertEqual(len(parameter.keys()), 3)
        self.assertIn('a', parameter.keys())
        self.assertIn('b', parameter.keys())
        self.assertIn('c', parameter.keys())

        return_type = metadata['return_type']
        self.assertEqual(return_type, 'int')


if __name__ == '__main__':
    unittest.main()
152 |
--------------------------------------------------------------------------------
/tests/test_parser/test_ruby.py:
--------------------------------------------------------------------------------
1 | '''test for Ruby parser'''
2 | import os
3 | import unittest
4 | from pathlib import Path
5 |
6 | from src.codetext.parser import RubyParser
7 | from src.codetext.utils import parse_code
8 |
9 |
class Test_RubyParser(unittest.TestCase):
    """Exercise RubyParser on the bundled Ruby sample file and on inline snippets."""

    def setUp(self) -> None:
        """Read the Ruby sample file, parse it, and keep the source text and root node."""
        # Explicit encoding: without it the decoding depends on the platform locale.
        with open('tests/test_parser/test_sample/ruby_test_sample.rb', 'r', encoding='utf-8') as file:
            self.code_sample = file.read()

        tree = parse_code(self.code_sample, 'ruby')
        self.root_node = tree.root_node

        return super().setUp()

    def test_get_function_list(self):
        """The sample file is expected to yield two functions."""
        root = self.root_node

        function_list = RubyParser.get_function_list(root)

        self.assertEqual(len(function_list), 2)

    def test_get_class_list(self):
        """The sample file is expected to yield three class-like declarations."""
        root = self.root_node

        class_list = RubyParser.get_class_list(root)

        self.assertEqual(len(class_list), 3)

    def test_get_docstring(self):
        """`#` comments above a method and a `=begin`/`=end` block above a class are both found."""
        code_sample = """
module Encryption

# Search for links.
#
# @param query [String] The search query.
# @option options [String, RedditKit::Subreddit] subreddit The optional subreddit to search.
def encrypt(string)
Digest::SHA2.hexdigest(string)
end
end

=begin
comment line 1
comment line 2
=end
class Orange
def initialize
@juice_available = 100
end
def squeeze
@juice_available -= 50
end
end

orange = Orange.new
orange.squeeze
"""

        tree = parse_code(code_sample, 'ruby')
        root = tree.root_node

        fn = RubyParser.get_function_list(root)[0]
        # Index 1: the second entry of the class list is the one the block comment precedes.
        clas = RubyParser.get_class_list(root)[1]

        docs1 = RubyParser.get_docstring(fn)
        docs2 = RubyParser.get_docstring(clas)

        self.assertEqual(docs1, '# Search for links.\n#\n# @param query [String] The search query.\n# @option options [String, RedditKit::Subreddit] subreddit The optional subreddit to search.')
        self.assertEqual(docs2, ' comment line 1\n comment line 2')

    def test_get_function_metadata(self):
        """Metadata for both functions of the sample file: name, parameters, return type."""
        root = self.root_node

        _function = RubyParser.get_function_list(root)[0]
        metadata = RubyParser.get_function_metadata(_function)

        for key in ['identifier', 'parameters', 'return_type']:
            # assertIn names the missing key on failure; assertTrue only says "False is not true".
            self.assertIn(key, metadata.keys())
        self.assertEqual(metadata['identifier'], 'search')
        self.assertEqual(metadata['parameters'], {'query': None, 'options': None})
        # assertIsNone checks identity, so a falsy non-None value cannot slip through.
        self.assertIsNone(metadata['return_type'])

        _singleton = RubyParser.get_function_list(root)[1]
        metadata = RubyParser.get_function_metadata(_singleton)
        for key in ['identifier', 'parameters', 'return_type']:
            self.assertIn(key, metadata.keys())
        self.assertEqual(metadata['identifier'], 'my_method')
        self.assertEqual(metadata['parameters'], {'a': None})
        self.assertEqual(metadata['return_type'], '')

    def test_metadata_without_return_statement(self):
        """A method ending in `return mood` is expected to report return_type ''.

        NOTE(review): despite the test's name, the snippet does contain a return
        statement; the name is kept so existing test selectors keep working.
        """
        code_sample = '''
def write_code(number_of_errors)
if number_of_errors > 1
mood = "Ask me later"
else
mood = puts "No Problem"
end
return mood
end
'''
        root = parse_code(code_sample, 'Ruby').root_node
        fn = RubyParser.get_function_list(root)[0]
        metadata = RubyParser.get_function_metadata(fn)

        return_type = metadata['return_type']
        self.assertEqual(return_type, '')

    def test_get_class_metadata(self):
        """Metadata of the second class-list entry of the sample file: its name and superclass."""
        root = self.root_node

        classes = RubyParser.get_class_list(root)[1]
        metadata = RubyParser.get_class_metadata(classes)

        self.assertEqual(metadata['identifier'], 'Client')
        self.assertEqual(metadata['parameters'], {'API': None})

    def test_get_action_list(self):
        """The sample file is expected to yield five actions."""
        root = self.root_node
        actions = RubyParser.get_action_list(root)

        self.assertEqual(len(actions), 5)

    def test_get_action_metadata(self):
        """Identifiers of all five actions, plus the parameters of the first one."""
        root = self.root_node
        actions = RubyParser.get_action_list(root)
        metadatas = [RubyParser.get_action_metadata(action) for action in actions]
        self.assertEqual(metadatas[0]["identifier"], "load_current_value")
        self.assertEqual(metadatas[1]["identifier"], "action:install")
        self.assertEqual(metadatas[2]["identifier"], "converge_by")

        self.assertEqual(metadatas[3]["identifier"], "action:reinstall")
        self.assertEqual(metadatas[4]["identifier"], "converge_by")

        # Subscripting also asserts that both parameter names are present.
        self.assertIsNone(metadatas[0]["parameters"]["new_resource"])
        self.assertIsNone(metadatas[0]["parameters"]["old_resource"])


if __name__ == '__main__':
    unittest.main()
148 |
--------------------------------------------------------------------------------
/tests/test_parser/test_rust.py:
--------------------------------------------------------------------------------
1 | '''test for Rust parser'''
2 | import os
3 | import unittest
4 | from pathlib import Path
5 |
6 | from src.codetext.parser import RustParser
7 | from src.codetext.utils import parse_code
8 |
9 |
class Test_RustParser(unittest.TestCase):
    """Exercise RustParser on the bundled Rust sample file and on inline snippets."""

    def setUp(self) -> None:
        """Read the Rust sample file, parse it, and keep the source text and root node."""
        # Explicit encoding: without it the decoding depends on the platform locale.
        with open('tests/test_parser/test_sample/rust_test_sample.rs', 'r', encoding='utf-8') as file:
            self.code_sample = file.read()

        tree = parse_code(self.code_sample, 'rust')
        self.root_node = tree.root_node

        return super().setUp()

    def test_get_function_list(self):
        """The sample file is expected to yield four functions."""
        root = self.root_node

        function_list = RustParser.get_function_list(root)

        self.assertEqual(len(function_list), 4)

    def test_get_class_list(self):
        """The sample file is expected to yield two class-like items."""
        root = self.root_node

        class_list = RustParser.get_class_list(root)

        self.assertEqual(len(class_list), 2)

    def test_get_docstring(self):
        """`///` runs, a `/** */` block and a plain `//` comment are each returned as docstrings."""
        code_sample = """
// Comment something
mod my_mod {
/// Creates a new rendering surface.
///
/// # Arguments
///
/// Initialization of surfaces happens through the types provided by
/// [`drm-rs`](drm).
///
/// - [`crtcs`](drm::control::crtc) represent scanout engines of the device pointing to one framebuffer. \\
/// Their responsibility is to read the data of the framebuffer and export it into an "Encoder". \\
/// The number of crtc's represent the number of independent output devices the hardware may handle.
fn private_function() {
println!("called `my_mod::private_function()`");
}

/** - Outer block doc (exactly) 2 asterisks */
pub fn function() {
println!("called `my_mod::function()`");
}

// Items can access other items in the same module,
// even when private.
pub fn indirect_access() {
print!("called `my_mod::indirect_access()`, that\n> ");
private_function();
}
}
"""

        tree = parse_code(code_sample, 'rust')
        root = tree.root_node

        fn1 = RustParser.get_function_list(root)[0]
        fn2 = RustParser.get_function_list(root)[1]
        clas = RustParser.get_class_list(root)[0]

        docs1 = RustParser.get_docstring(fn1)
        docs2 = RustParser.get_docstring(fn2)
        docs3 = RustParser.get_docstring(clas)

        self.assertEqual(docs1, '/// Creates a new rendering surface.\n///\n/// # Arguments\n///\n/// Initialization of surfaces happens through the types provided by\n/// [`drm-rs`](drm).\n///\n/// - [`crtcs`](drm::control::crtc) represent scanout engines of the device pointing to one framebuffer. \\\n/// Their responsibility is to read the data of the framebuffer and export it into an "Encoder". \\\n/// The number of crtc\'s represent the number of independent output devices the hardware may handle.')
        self.assertEqual(docs2, '/** - Outer block doc (exactly) 2 asterisks */')
        self.assertEqual(docs3, '// Comment something')

    def test_get_function_metadata(self):
        """Metadata of the first function in the sample file: name, typed parameters, return type."""
        root = self.root_node

        function = RustParser.get_function_list(root)[0]
        metadata = RustParser.get_function_metadata(function)

        for key in ['identifier', 'parameters', 'return_type']:
            # assertIn names the missing key on failure; assertTrue only says "False is not true".
            self.assertIn(key, metadata.keys())
        self.assertEqual(metadata['identifier'], 'long_string')
        self.assertEqual(metadata['parameters'], {'x': '&str'})
        self.assertEqual(metadata['return_type'], '&str')

    def test_metadata_with_return_statement(self):
        """A function that returns but declares no return type is expected to report ''."""
        code_sample = '''
fn quack(&self) {
println!("quack!");
return "hello";
}
'''
        root = parse_code(code_sample, 'Rust').root_node
        fn = RustParser.get_function_list(root)[0]
        metadata = RustParser.get_function_metadata(fn)

        return_type = metadata['return_type']
        self.assertEqual(return_type, '')

    def test_get_class_metadata(self):
        """Metadata of the first class-list entry of the sample file: identifier and parameters."""
        root = self.root_node

        classes = RustParser.get_class_list(root)[0]
        metadata = RustParser.get_class_metadata(classes)

        self.assertEqual(metadata['identifier'], 'Quack')
        self.assertEqual(metadata['parameters'], {'Duck': None})


if __name__ == '__main__':
    unittest.main()
119 |
--------------------------------------------------------------------------------
/tests/test_parser/test_sample/README.md:
--------------------------------------------------------------------------------
1 | # Tree-sitter function/class type
2 |
3 | ## C/C++
4 | Each sample below is annotated with comments naming the tree-sitter node type of the construct that follows.
5 |
6 | - with C
7 | ```c
8 | // function_definition
9 | void reverseSentence(int random_seed) {
10 | char c;
11 | scanf("%c", &c);
12 | if (c != '\n') {
13 | reverseSentence();
14 | printf("%c", c);
15 | }
16 | }
17 | ```
18 |
19 | - with C++
20 | ```c++
21 | // function_definition
22 | double plusFuncDouble(double x, double y) {
23 | return x + y;
24 | }
25 |
26 | // function_definition
27 | int main() {
28 | int myNum1 = plusFuncInt(8, 5);
29 | double myNum2 = plusFuncDouble(4.3, 6.26);
30 | cout << "Int: " << myNum1 << "\n";
31 | cout << "Double: " << myNum2;
32 | return 0;
33 | }
34 |
35 | // class_specifier
36 | class Animal {
37 | public:
38 | // function_definition
39 | void animalSound() {
40 | cout << "The animal makes a sound \n";
41 | }
42 | };
43 |
44 | // class_specifier
45 | class Pig : public Animal {
46 | public:
47 | // function_definition
48 | void animalSound() {
49 | cout << "The pig says: wee wee \n";
50 | }
51 | };
52 | ```
53 |
54 | ## C#
55 |
56 | ```c#
57 | // local_function_statement
58 | private static string GetText(string path, string filename)
59 | {
60 | // local_declaration_statement
61 | var reader = File.OpenText($"{AppendPathSeparator(path)}{filename}");
62 | var text = reader.ReadToEnd();
63 | return text;
64 |
65 | // local_function_statement
66 | string AppendPathSeparator(string filepath)
67 | {
68 | return filepath.EndsWith(@"\") ? filepath : filepath + @"\";
69 | }
70 | }
71 |
72 | using System;
73 |
74 | // class_declaration
75 | public class Dog : Animal {
76 |
77 | String name;
78 | String breed;
79 | int age;
80 | String color;
81 |
82 | // constructor_declaration
83 | public Dog(String name, String breed,
84 | int age, String color)
85 | {
86 | this.name = name;
87 | this.breed = breed;
88 | this.age = age;
89 | this.color = color;
90 | }
91 |
92 | // method_declaration
93 | static void Main(string[] args)
94 | {
95 | Car myObj = new Car();
96 | Console.WriteLine(myObj.color);
97 | }
98 | }
99 | ```
100 |
101 | ## Java
102 |
103 | ```Java
104 | // class_declaration
105 | public class SaveFileController extends SudoUser implements FileController {
106 | // field_declaration
107 | private ArrayList allUsers;
108 | private String saveFile = "test_save_file4.sav";
109 |
110 | // constructor_declaration
111 | public SaveFileController(){
112 | this.allUsers = new ArrayList();
113 | }
114 |
115 | // method_declaration
116 | public HabitList getHabitList(Context context, int userIndex){
117 | loadFromFile(context);
118 | return this.allUsers.get(userIndex).getHabitList();
119 | }
120 | }
121 | ```
122 |
123 | ## Python
124 | ```python
125 | # class_definition
126 | class Person:
127 | # function_definition
128 | def __init__(self, name, age):
129 | self.name = name
130 | self.age = age
131 |
132 | # function_definition
133 | def say_my_name(self):
134 | print(self.name)
135 |
136 | # function_definition
137 | def create_a_person(name, age):
138 | new_person = Person(name, age)
139 | ```
140 |
141 | ## JavaScript
142 | ```JavaScript
143 | // function_declaration
144 | export function loadSongs() {
145 | return {
146 | type: LOAD_SONGS,
147 | };
148 | }
149 |
150 | // class_declaration
151 | class Model extends Car {
152 | // method_definition
153 | constructor(brand, mod) {
154 | super(brand);
155 | this.model = mod;
156 | }
157 |
158 | // method_definition
159 | show() {
160 | return this.present() + ', it is a ' + this.model;
161 | }
162 | }
163 | ```
164 |
165 | ## PHP
166 |
167 | ```PHP
168 | // function_definition
169 | function familyName($fname) {
170 | echo "$fname Refsnes.<br>";
171 | }
172 |
173 | // class_declaration
174 | final class Driver extends AbstractSQLServerDriver
175 | {
176 | // method_declaration
177 | public function connect(array $params)
178 | {
179 | $driverOptions = $dsnOptions = [];
180 | if (isset($params['driverOptions'])) {
181 | foreach ($params['driverOptions'] as $option => $value) {
182 | if (is_int($option)) {
183 | $driverOptions[$option] = $value;
184 | } else {
185 | $dsnOptions[$option] = $value;
186 | }
187 | }
188 | }
189 | }
190 | }
191 | ```
192 |
193 | ## GO
194 |
195 | ```GO
196 | // function_declaration
197 | func add(x int, y int) int {
198 | return x + y
199 | }
200 |
201 | // function_declaration
202 | func main() {
203 | fmt.Println(add(42, 13))
204 | }
205 |
206 | // method_declaration
207 | func (e TypeError) Error() string {
208 | msg := e.Type1.String()
209 | if e.Type2 != nil {
210 | msg += " and " + e.Type2.String()
211 | }
212 | msg += " " + e.Extra
213 | return msg
214 | }
215 |
216 | ```
217 |
218 | ## Ruby
219 |
220 | ```Ruby
221 | # class
222 | class Customer
223 | @@no_of_customers = 0
224 |
225 | # method
226 | def initialize(id, name, addr)
227 | @cust_id = id
228 | @cust_name = name
229 | @cust_addr = addr
230 | end
231 | end
232 |
233 | # method
234 | def test(a1 = "Ruby", a2 = "Perl")
235 | puts "The programming language is #{a1}"
236 | puts "The programming language is #{a2}"
237 | end
238 |
239 | # module
240 | module RedditKit
241 | # class
242 | class Client < API
243 | # method
244 | def search(query, options = {})
245 | path = "%s/search.json" % ('r/' + options[:subreddit] if options[:subreddit])
246 | parameters = { :q => query,
247 | :restrict_sr => options[:restrict_to_subreddit],
248 | :limit => options[:limit],
249 | :count => options[:count],
250 | :sort => options[:sort],
251 | :before => options[:before],
252 | :after => options[:after],
253 | :syntax => options[:syntax],
254 | :t => options[:time]
255 | }
256 |
257 | objects_from_response(:get, path, parameters)
258 | end
259 | end
260 | end
261 |
262 | ```
263 |
264 | ## Rust
265 |
266 | ```Rust
267 | // trait_item
268 | trait Quack {
269 | // function_signature_item <- This is function declaration
270 | fn quack(&self);
271 | }
272 |
273 | // struct_item
274 | struct Duck ();
275 |
276 | // function_item
277 | fn long_string(x: &str) -> &str {
278 | if x.len() > 10 {
279 | "too long"
280 | } else {
281 | x
282 | }
283 |
284 | }
285 |
286 | // impl_item
287 | impl Quack for Duck {
288 | // function_item
289 | fn quack(&self) {
290 | println!("quack!");
291 | }
292 | }
293 |
294 | // mod_item
295 | mod my_mod {
296 | // function_item
297 | fn private_function() {
298 | println!("called `my_mod::private_function()`");
299 | }
300 | }
301 |
302 | // function_item
303 | fn quack_everyone <I> (iter: I)
304 | where I: Iterator<Item=Box<Quack>> {
305 | for d in iter {
306 | d.quack();
307 | }
308 | }
309 | ```
--------------------------------------------------------------------------------
/tests/test_parser/test_sample/c_sharp_test_sample.cs:
--------------------------------------------------------------------------------
1 | private static string GetText(string path, string filename)
2 | {
3 | var reader = File.OpenText($"{AppendPathSeparator(path)}{filename}");
4 | var text = reader.ReadToEnd();
5 | return text;
6 |
7 | string AppendPathSeparator(string filepath)
8 | {
9 | return filepath.EndsWith(@"\") ? filepath : filepath + @"\";
10 | }
11 | }
12 |
13 | using System;
14 | public class Dog : Animal {
15 |
16 | // Instance Variables
17 | String name;
18 | String breed;
19 | int age;
20 | String color;
21 |
22 | // Constructor Declaration of Class
23 | public Dog(String name, String breed,
24 | int age, String color)
25 | {
26 | this.name = name;
27 | this.breed = breed;
28 | this.age = age;
29 | this.color = color;
30 | }
31 |
32 | // Docstring of this function
33 | static void Main(string[] args)
34 | {
35 | Car myObj = new Car();
36 | Console.WriteLine(myObj.color);
37 | }
38 | }
--------------------------------------------------------------------------------
/tests/test_parser/test_sample/c_test_sample.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | void reverseSentence();
3 |
4 | /**
5 | * A brief description. A more elaborate class description
6 | * @param random_seed somearg.
7 | * @see Test()
8 | * @return The test results
9 | */
10 | void reverseSentence(int random_seed) {
11 | char c;
12 | scanf("%c", &c);
13 | if (c != '\n') {
14 | reverseSentence();
15 | printf("%c", c);
16 | }
17 | }
18 |
19 | int main() {
20 | printf("Enter a sentence: ");
21 | reverseSentence();
22 | return 0;
23 | }
24 |
--------------------------------------------------------------------------------
/tests/test_parser/test_sample/cpp_test_sample.cpp:
--------------------------------------------------------------------------------
1 | // Derived class
2 | class Car: public Vehicle, private B {
3 | public:
4 | string model = "Mustang";
5 | };
6 |
7 | // A static function
8 | int sum2number (int a, int b) {
9 | return a + b;
10 | }
11 |
12 | // Base class
13 | class Vehicle {
14 | public:
15 | string brand = "Ford";
16 | void honk() {
17 | cout << "Tuut, tuut! \n" ;
18 | }
19 | };
20 |
21 | int main() {
22 | Car myCar;
23 | myCar.honk();
24 | cout << myCar.brand + " " + myCar.model;
25 | return 0;
26 | }
27 |
28 |
--------------------------------------------------------------------------------
/tests/test_parser/test_sample/go_test_sample.go:
--------------------------------------------------------------------------------
1 | // Copyright 2016 The Go Authors. All rights reserved.
2 | // Use of this source code is governed by a BSD-style
3 | // license that can be found in the LICENSE file.
4 | type TypeError struct {
5 | Type1, Type2 reflect.Type
6 | Extra string
7 | }
8 |
9 | // The path package should only be used for paths separated by forward
10 | // slashes, such as the paths in URLs. This package does not deal with
11 | // Windows paths with drive letters or backslashes; to manipulate
12 | // operating system paths, use the [path/filepath] package.
13 | func (e TypeError) Error() string {
14 | msg := e.Type1.String()
15 | if e.Type2 != nil {
16 | msg += " and " + e.Type2.String()
17 | }
18 | msg += " " + e.Extra
19 | return msg
20 | }
21 |
--------------------------------------------------------------------------------
/tests/test_parser/test_sample/java_test_sample.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Implements the file to save data to.
3 | *
4 | * @version 1.0
5 | */
6 | public class SaveFileController extends SudoUser implements FileController {
7 | private ArrayList allUsers;
8 | //private String username;
9 | private String saveFile = "test_save_file4.sav";
10 |
11 | public SaveFileController(){
12 | this.allUsers = new ArrayList();
13 | }
14 |
15 | /**
16 | * Gets HabitList instance.
17 | *
18 | * @param context instance of Context
19 | * @param userIndex integer user index
20 | * @return HabitList
21 | * @see HabitList
22 | */
23 | public HabitList getHabitList(Context context, int userIndex){
24 | loadFromFile(context);
25 | return this.allUsers.get(userIndex).getHabitList();
26 | }
27 |
28 | /**
29 | * Removes a habit event from a particular user's habit event list.
30 | *
31 | * @param context instance of Context
32 | * @param userIndex integer user index
33 | * @param habitIndex integer index of habit
34 | * @param habitEventIndex integer index of habit event
35 | */
36 | public void removeHabitEvent(Context context, int userIndex, int habitIndex, int habitEventIndex){
37 | loadFromFile(context);
38 | this.allUsers.get(userIndex).getHabitList().getHabit(habitIndex)
39 | .getHabitEventHistory().getHabitEvents().remove(habitEventIndex);
40 | saveToFile(context);
41 | }
42 | }
--------------------------------------------------------------------------------
/tests/test_parser/test_sample/javascript_test_sample.js:
--------------------------------------------------------------------------------
1 | /*
2 | * App Actions
3 | *
4 | * Actions change things in your application
5 | * Since this boilerplate uses a uni-directional data flow, specifically redux,
6 | * we have these actions which are the only way your application interacts with
7 | * your application state. This guarantees that your state is up to date and nobody
8 | * messes it up weirdly somewhere.
9 | *
10 | * To add a new Action:
11 | * 1) Import your constant
12 | * 2) Add a function like this:
13 | * export function yourAction(var) {
14 | * return { type: YOUR_ACTION_CONSTANT, var: var }
15 | * }
16 | */
17 |
18 | import {
19 | LOAD_SONGS,
20 | LOAD_SONGS_SUCCESS,
21 | LOAD_SONGS_ERROR,
22 | } from './constants';
23 |
24 | /**
25 | * Load the repositories, this action starts the request saga
26 | *
27 | * @return {object} An action object with a type of LOAD_REPOS
28 | */
29 | export function loadSongs() {
30 | return {
31 | type: LOAD_SONGS,
32 | };
33 | }
34 |
35 | /**
36 | * Dispatched when the repositories are loaded by the request saga
37 | *
38 | * @param {array} repos The repository data
39 | * @param {string} username The current username
40 | *
41 | * @return {object} An action object with a type of LOAD_REPOS_SUCCESS passing the repos
42 | */
43 | export function songsLoaded(repos, username=10) {
44 | return {
45 | type: LOAD_SONGS_SUCCESS,
46 | repos,
47 | username,
48 | };
49 | }
50 |
51 | /**
52 | * Dispatched when loading the repositories fails
53 | *
54 | * @param {object} error The error
55 | *
56 | * @return {object} An action object with a type of LOAD_REPOS_ERROR passing the error
57 | */
58 | export function songsLoadingError(error) {
59 | return {
60 | type: LOAD_SONGS_ERROR,
61 | error,
62 | };
63 | }
64 |
65 | class Model extends Car {
66 | constructor(brand, mod) {
67 | super(brand);
68 | this.model = mod;
69 | }
70 |
71 | /**
72 | * Comment something
73 | */
74 | show() {
75 | return this.present() + ', it is a ' + this.model;
76 | }
77 | }
78 |
79 | class Car {
80 | constructor(brand) {
81 | this.carname = brand;
82 | }
83 |
84 | /**
85 | * Dispatched when loading the repositories fails
86 | *
87 | * @param {object} error The error
88 | *
89 | * @return {object} An action object with a type of LOAD_REPOS_ERROR passing the error
90 | */
91 | present() {
92 | return 'I have a ' + this.carname;
93 | }
94 | }
95 |
--------------------------------------------------------------------------------
/tests/test_parser/test_sample/php_test_sample.php:
--------------------------------------------------------------------------------
1 | $value) {
16 | if (is_int($option)) {
17 | $driverOptions[$option] = $value;
18 | } else {
19 | $dsnOptions[$option] = $value;
20 | }
21 | }
22 | }
23 |
24 | if (! empty($params['persistent'])) {
25 | $driverOptions[PDO::ATTR_PERSISTENT] = true;
26 | }
27 |
28 | try {
29 | $pdo = new PDO(
30 | $this->constructDsn($params, $dsnOptions),
31 | $params['user'] ?? '',
32 | $params['password'] ?? '',
33 | $driverOptions
34 | );
35 | } catch (\\PDOException $exception) {
36 | throw PDOException::new($exception);
37 | }
38 |
39 | return new Connection(new PDOConnection($pdo));
40 | }
41 |
42 | /**
43 | * Constructs the Sqlsrv PDO DSN.
44 | *
45 | * @param mixed[] $params
46 | * @param string[] $connectionOptions
47 | *
48 | * @throws Exception
49 | */
50 | private function constructDsn(array $params=null, array $connectionOptions): string
51 | {
52 | $dsn = 'sqlsrv:server=';
53 |
54 | if (isset($params['host'])) {
55 | $dsn .= $params['host'];
56 |
57 | if (isset($params['port'])) {
58 | $dsn .= ',' . $params['port'];
59 | }
60 | } elseif (isset($params['port'])) {
61 | throw PortWithoutHost::new();
62 | }
63 |
64 | if (isset($params['dbname'])) {
65 | $connectionOptions['Database'] = $params['dbname'];
66 | }
67 |
68 | if (isset($params['MultipleActiveResultSets'])) {
69 | $connectionOptions['MultipleActiveResultSets'] = $params['MultipleActiveResultSets'] ? 'true' : 'false';
70 | }
71 |
72 | return $dsn . $this->getConnectionOptionsDsn($connectionOptions);
73 | }
74 |
75 | /**
76 | * Converts a connection options array to the DSN
77 | *
78 | * @param string[] $connectionOptions
79 | */
80 | private function getConnectionOptionsDsn(array $connectionOptions): string
81 | {
82 | $connectionOptionsDsn = '';
83 |
84 | foreach ($connectionOptions as $paramName => $paramValue) {
85 | $connectionOptionsDsn .= sprintf(';%s=%s', $paramName, $paramValue);
86 | }
87 |
88 | return $connectionOptionsDsn;
89 | }
90 | }
91 |
92 | interface MyInterface {
93 | public function myMethod() {
94 | // Method implementation
95 | }
96 |
97 | }
98 |
99 | trait MyTrait {
100 |
101 | public function setBackgroundImage(Drawing $objDrawing): self
102 | {
103 | if (!array_key_exists($objDrawing->getType(), Drawing::IMAGE_TYPES_CONVERTION_MAP)) {
104 | throw new PhpSpreadsheetException('Unsupported image type in comment background. Supported types: PNG, JPEG, BMP, GIF.');
105 | }
106 | $this->backgroundImage = $objDrawing;
107 |
108 | return $this;
109 | }
110 |
111 | }
112 |
113 |
--------------------------------------------------------------------------------
/tests/test_parser/test_sample/py_test_sample.py:
--------------------------------------------------------------------------------
1 | def partition(array, low, high):
2 | """
3 | Function to find the partition position
4 |
5 | :param array: the unsorted array
6 | :type array: List
7 | :param low: smaller pivot
8 | :type low: int
9 | :param high: greater pivot
10 | :type high: int
11 |
12 | """
13 | # choose the rightmost element as pivot
14 | pivot = array[high]
15 |
16 | # pointer for greater element
17 | i = low - 1
18 |
19 | # traverse through all elements
20 | # compare each element with pivot
21 | for j in range(low, high):
22 | if array[j] <= pivot:
23 |
24 | # If element smaller than pivot is found
25 | # swap it with the greater element pointed by i
26 | i = i + 1
27 |
28 | # Swapping element at i with element at j
29 | (array[i], array[j]) = (array[j], array[i])
30 |
31 | # Swap the pivot element with the greater element specified by i
32 | (array[i + 1], array[high]) = (array[high], array[i + 1])
33 |
34 | # Return the position from where partition is done
35 | return i + 1
36 |
37 | def quickSort(array, low, high):
38 | """
39 | Function to perform quicksort
40 | """
41 | if low < high:
42 |
43 | # Find pivot element such that
44 | # element smaller than pivot are on the left
45 | # element greater than pivot are on the right
46 | pi = partition(array, low, high)
47 |
48 | # Recursive call on the left of pivot
49 | quickSort(array, low, pi - 1)
50 |
51 | # Recursive call on the right of pivot
52 | quickSort(array, pi + 1, high)
53 |
54 |
55 | data = [1, 7, 4, 1, 10, 9, -2]
56 | print("Unsorted Array")
57 | print(data)
58 |
59 | size = len(data)
60 |
61 | quickSort(data, 0, size - 1)
62 |
63 | print('Sorted Array in Ascending Order:')
64 | print(data)
65 |
66 | class Person:
67 | def __init__(self, name, age):
68 | self.name = name
69 | self.age = age
--------------------------------------------------------------------------------
/tests/test_parser/test_sample/ruby_test_sample.rb:
--------------------------------------------------------------------------------
1 | module RedditKit
2 | class Client < API
3 |
4 | # Methods for searching reddit's links.
5 | module Search
6 |
7 | # Search for links.
8 | #
9 | # @param query [String] The search query.
10 | # @option options [String, RedditKit::Subreddit] subreddit The optional subreddit to search.
11 | # @option options [true, false] restrict_to_subreddit Whether to search only in a specified subreddit.
12 | # @option options [1..100] limit The number of links to return.
13 | # @option options [String] count The number of results to return before or after. This is different from `limit`.
14 | # @option options [relevance, new, hot, top, comments] sort The sorting order for search results.
15 | # @option options [String] before Only return links before this full name.
16 | # @option options [String] after Only return links after this full name.
17 | # @option options [cloudsearch, lucene, plain] syntax Specify the syntax for the search. Learn more: http://www.reddit.com/r/redditdev/comments/1hpicu/whats_this_syntaxcloudsearch_do/cawm0fe
18 | # @option options [hour, day, week, month, year, all] time Show results with a specific time period.
19 | # @return [RedditKit::PaginatedResponse]
20 | def search(query, options = {})
21 | path = "%s/search.json" % ('r/' + options[:subreddit] if options[:subreddit])
22 | parameters = { :q => query,
23 | :restrict_sr => options[:restrict_to_subreddit],
24 | :limit => options[:limit],
25 | :count => options[:count],
26 | :sort => options[:sort],
27 | :before => options[:before],
28 | :after => options[:after],
29 | :syntax => options[:syntax],
30 | :t => options[:time]
31 | }
32 |
33 | objects_from_response(:get, path, parameters)
34 | end
35 |
36 | def self.my_method(a)
37 | # Method implementation
38 | puts(a)
39 | return a
40 | end
41 |
42 | end
43 | end
44 | end
45 |
46 | load_current_value do |new_resource, old_resource|
47 | unless current_installed_version(new_resource).nil?
48 | version(current_installed_version(new_resource))
49 | Chef::Log.debug("Current version is #{version}") if version
50 | return a
51 | end
52 | end
53 |
54 | action :install do
55 | build_essential
56 |
57 | install_version = new_resource.version unless new_resource.version.nil? || new_resource.version == current_resource.version
58 | versions_match = candidate_version == current_installed_version(new_resource)
59 |
60 | if install_version || new_resource.version.nil? && !versions_match
61 | converge_by("install package #{new_resource.package_name} #{install_version}") do
62 | info_output = "Installing #{new_resource.package_name}"
63 | info_output << " version #{install_version}" if install_version && !install_version.empty?
64 | Chef::Log.info(info_output)
65 | install_package(new_resource.package_name, install_version)
66 | end
67 | end
68 | end
69 |
70 | action :reinstall do
71 | build_essential
72 |
73 | install_version = new_resource.version unless new_resource.version.nil?
74 | converge_by("reinstall package #{new_resource.package_name} #{install_version}") do
75 | info_output = "Installing #{new_resource.package_name}"
76 | info_output << " version #{install_version}" if install_version && !install_version.empty?
77 | Chef::Log.info(info_output)
78 | install_package(new_resource.package_name, install_version, force: true)
79 | end
80 | end
81 |
82 | a = 1
83 |
84 | reinstall
85 |
--------------------------------------------------------------------------------
/tests/test_parser/test_sample/rust_test_sample.rs:
--------------------------------------------------------------------------------
1 | trait Quack {
2 | fn quack(&self);
3 | }
4 |
5 | struct Duck ();
6 |
7 | fn long_string(x: &str) -> &str {
8 | if x.len() > 10 {
9 | "too long"
10 | } else {
11 | x
12 | }
13 |
14 | }
15 |
16 | impl Quack for Duck {
17 | fn quack(&self) {
18 | println!("quack!");
19 | }
20 | }
21 |
22 | mod my_mod {
23 | // Items in modules default to private visibility.
24 | fn private_function() {
25 | println!("called `my_mod::private_function()`");
26 | }
27 | }
28 |
29 | fn quack_everyone <I> (iter: I)
30 | where I: Iterator<Item=Box<Quack>> {
31 | for d in iter {
32 | d.quack();
33 | }
34 | }
35 |
36 | let ducks: Vec<Box<Quack>> = vec![Box::new(duck1),Box::new(duck2),Box::new(parrot),Box::new(int)];
37 |
--------------------------------------------------------------------------------
/tests/test_utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FSoft-AI4Code/CodeText-parser/d2a7365f7f944650e84d9fdb6b6794d6c5ea620b/tests/test_utils/__init__.py
--------------------------------------------------------------------------------
/tests/test_utils/test_utils.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from src.codetext.utils import build_language, parse_code
3 |
4 |
5 | class Test_Utils(unittest.TestCase):
6 | def test_build_language(self):
7 | langs = ['python', 'rust']
8 | for l in langs:
9 | # clear it later
10 | build_language(language=l)
11 |
12 | def test_parse_code(self):
13 | sample = """
14 | def sum_2_num(a, b):
15 | return a + b
16 | """
17 | parse_code(sample, 'python')
18 |
19 |
20 | if __name__ == '__main__':
21 | unittest.main()
--------------------------------------------------------------------------------