├── .github
    └── workflows
    │   ├── build_and_release.yml
    │   └── unittest.yml
├── .gitignore
├── .idea
    └── .gitignore
├── HISTORY.md
├── LICENSE
├── README.md
├── asset
    └── img
    │   ├── codetext_logo.png
    │   └── codetext_logo_line.png
├── pyproject.toml
├── requirements.txt
├── src
    └── codetext
    │   ├── __init__.py
    │   ├── __main__.py
    │   ├── clean
    │       ├── __init__.py
    │       └── noise_removal.py
    │   ├── codetext_cli.py
    │   ├── parser
    │       ├── README.md
    │       ├── __init__.py
    │       ├── c_sharp_parser.py
    │       ├── cpp_parser.py
    │       ├── go_parser.py
    │       ├── java_parser.py
    │       ├── javascript_parser.py
    │       ├── language_parser.py
    │       ├── php_parser.py
    │       ├── python_parser.py
    │       ├── ruby_parser.py
    │       └── rust_parser.py
    │   └── utils
    │       ├── __init__.py
    │       ├── imports.py
    │       └── utils.py
└── tests
    ├── __init__.py
    ├── setup.py
    ├── test_clean
        ├── __init__.py
        └── test_clean_utils.py
    ├── test_parser
        ├── __init__.py
        ├── test_c.py
        ├── test_cpp.py
        ├── test_csharp.py
        ├── test_go.py
        ├── test_java.py
        ├── test_javascript.py
        ├── test_php.py
        ├── test_python.py
        ├── test_ruby.py
        ├── test_rust.py
        └── test_sample
        │   ├── README.md
        │   ├── c_sharp_test_sample.cs
        │   ├── c_test_sample.c
        │   ├── cpp_test_sample.cpp
        │   ├── go_test_sample.go
        │   ├── java_test_sample.java
        │   ├── javascript_test_sample.js
        │   ├── php_test_sample.php
        │   ├── py_test_sample.py
        │   ├── ruby_test_sample.rb
        │   └── rust_test_sample.rs
    └── test_utils
        ├── __init__.py
        └── test_utils.py


/.github/workflows/build_and_release.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | name: Publish package to PyPI
 3 | 
 4 | on:
 5 |   release:
 6 |     types: [created]
 7 | 
 8 | jobs:
 9 |   release:
10 | #    if: github.event_name == 'release' && github.event.action == 'created'
11 |     name: PyPi Release
12 |     runs-on: ubuntu-latest
13 | 
14 |     steps:
15 |     - uses: actions/checkout@v2
16 |       name: Checkout repo
17 | 
18 |     - name: Set up Python 3.7
19 |       uses: actions/setup-python@v1
20 |       with:
21 |         python-version: 3.7
22 | 
23 |     - uses: actions/cache@v1
24 |       name: Cache pip dependencies
25 |       with:
26 |         path: ~/.cache/pip
27 |         key: ${{ runner.os }}-pip-${{ hashFiles('requirements.txt') }}
28 |         restore-keys: |
29 |           ${{ runner.os }}-pip-
30 |     - name: Install pip dependencies
31 |       run: |
32 |          pip install --upgrade pip
33 |          pip install -r requirements.txt
34 |          python3 -m pip install --upgrade build twine wheel
35 |     - name: Make distribution
36 |       run: |
37 |         python3 setup.py sdist bdist_wheel
38 |         twine check dist/*
39 |     - name: Publish a Python distribution to PyPI
40 |       uses: pypa/gh-action-pypi-publish@master
41 |       with:
42 |         user: __token__
43 |         password: ${{ secrets.PYPI_TOKEN }}


--------------------------------------------------------------------------------
/.github/workflows/unittest.yml:
--------------------------------------------------------------------------------
 1 | name: Unittest
 2 | 
 3 | on: push
 4 | 
 5 | jobs:
 6 |   unittest:
 7 |     name: Unittest
 8 |     runs-on: ubuntu-latest
 9 |     strategy:
10 |       matrix:
11 |         pyversion: [ "3.10" ]
12 | 
13 |     steps:
14 |     - name: Check out Git repository
15 |       uses: actions/checkout@v2
16 | 
17 |     - name: Set up Python
18 |       uses: actions/setup-python@v2
19 |       with:
20 |         python-version: ${{ matrix.pyversion }}
21 | 
22 |     - name: Install dependencies
23 |       run: |
24 |         pip install -r requirements.txt
25 | #         git clone https://github.com/nmd-2000/docstring_parser docstring_parser
26 | #         pip install -e ./docstring_parser
27 | 
28 |     - name: Run tests
29 |       run: |
30 |         python -m unittest
31 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | cache/*
 2 | src/*/*.txt
 3 | src/codetext.egg-info/*
 4 | */build/*
 5 | */dist/*
 6 | */tree-sitter-*
 7 | *.jsonl
 8 | *.json
 9 | *.zip
10 | *.gz
11 | *.pyc
12 | *.so
13 | *.whl
14 | .idea
15 | .vscode
16 | *.iml
17 | 


--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | .idea
5 | .vscode
6 | *.iml


--------------------------------------------------------------------------------
/HISTORY.md:
--------------------------------------------------------------------------------
  1 | ========
  2 | Releases
  3 | ========
  4 | 
  5 | Version 0.0.9
  6 | =============
  7 | Release date: Jul 1, 2024
  8 | * Skip building language binaries from source
  9 | 
 10 | Version 0.0.8
 11 | =============
 12 | Release date: Aug 17, 2023
 13 | 
 14 | * Update format codetext_cli
 15 | * Update PythonParser: Handle class definitions with empty argument list class ABC()
 16 | * Add Javascript undeclared functions
 17 | * Add PHP interface
 18 | * Add Ruby actions with block parameters
 19 | 
 20 | Version 0.0.7
 21 | =============
 22 | Release date: Jul 5, 2023
 23 | 
 24 | * Update all class extractor format (using dict instead of list)
 25 | * Fix missing identifier, parameter in C, C#, Java parser
 26 | * Implement CLI
 27 | 
 28 | Version 0.0.6
 29 | =============
 30 | Release date: Jan 9, 2023
 31 | 
 32 | * Add tree sitter utils (in codetext.parser)
 33 | * Replace all `match_from_span` to `get_node_text`
 34 | * Replace all `traverse_type` to `get_node_by_kind`
 35 | * Fix `CppParser.get_function_metadata` missing `param_type` and `param_identifier`
 36 | * Update return metadata from all parser
 37 | 
 38 | Version 0.0.5
 39 | =============
 40 | Release date: Dec 12, 2022
 41 | 
 42 | * Fix package import path
 43 | * Adding auto build workflow
 44 | * Separate the codetext parser from source code processing
 45 | * Fix `remove_comment_delimiter` remove leading whitespace
 46 | * Update unittest for parser and utilities
 47 | 
 48 | Version 0.0.4
 49 | =============
 50 | Release date: Dec 2, 2022
 51 | 
 52 | *  Fix main package root path
 53 | *  Loosen `docstring_parser` dependency
 54 | 
 55 | Version 0.0.3
 56 | =============
 57 | Release date: Dec 2, 2022
 58 | 
 59 | *  New clean docstring function
 60 |     * check_docstring_contain_question
 61 |     * check_docstring_underdevelopment
 62 |     * check_docstring_autogenerated
 63 |     * check_contain_little_single_char
 64 |     * check_contain_many_special_char
 65 |     * check_contain_little_unique_chars
 66 |     * check_contain_little_unique_words
 67 |     * check_contain_many_special_case
 68 |     * check_contain_too_many_variables
 69 |     * check_contain_many_repeated_word
 70 |     * check_contain_many_uppercase_word
 71 |     * check_contain_many_long_word
 72 | 
 73 | Version 0.0.2
 74 | =============
 75 | Release date: Nov 25, 2022
 76 | 
 77 | *  Language parser for Rust
 78 |     * get_docstring
 79 |     * get_class_list, get_function_list
 80 |     * get_class_metadata, get_function_metadata
 81 | * Processing utils:
 82 |     * extract_docstring
 83 |     * extract_node
 84 |     * get_line_definitions
 85 |     * get_node_definitions
 86 |     * process_raw_node
 87 | * Postprocessing:
 88 |     * Merge file (from batches)
 89 |     * Split into train/test/valid (by #sample category)
 90 |     * Deduplicate sample
 91 | 
 92 | Version 0.0.1
 93 | =============
 94 | Release date: Nov 9, 2022
 95 | 
 96 | *  Language parser for Java, Python, JavaScript, PHP, Golang, Ruby, C++, C#, C
 97 |     * get_docstring
 98 |     * get_class_list, get_function_list
 99 |     * get_class_metadata, get_function_metadata
100 | *  Clean docstring function
101 | *  Data preprocessing source code
102 | *  Tree-sitter utils: build_language, parse_code
103 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 AI4Code Research Group
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <div align="center">
  2 | 
  3 | <p align="center">
  4 |   <img src="./asset/img/codetext_logo.png" width="220px" alt="logo">
  5 | </p>
  6 | ______________________________________________________________________
  7 | 
  8 | 
  9 | <!-- Badge start -->
 10 | | Branch 	| Build 	| Unittest 	| Release 	| License 	|
 11 | |--------	|-------	|----------	|---------	|---------	|
 12 | | main   	|       	| [![Unittest](https://github.com/AI4Code-Research/CodeText-parser/actions/workflows/unittest.yml/badge.svg)](https://github.com/AI4Code-Research/CodeText-parser/actions/workflows/unittest.yml) | [![release](https://img.shields.io/pypi/v/codetext)](https://pypi.org/project/codetext/) [![pyversion](https://img.shields.io/pypi/pyversions/codetext)](https://pypi.org/project/codetext/)| [![license](https://img.shields.io/github/license/AI4Code-Research/CodeText-parser)](https://github.com/AI4Code-Research/CodeText-parser/blob/main/LICENSE) |
 13 | <!-- Badge end -->
 14 | </div>
 15 | 
 16 | ______________________________________________________________________
 17 | 
 18 | **Code-Text parser** is a custom [tree-sitter](https://github.com/tree-sitter) grammar parser that extracts raw source code at the class and function level. We support 10 common programming languages:
 19 | - Python
 20 | - Java
 21 | - JavaScript
 22 | - PHP
 23 | - Ruby
 24 | - Rust
 25 | - C
 26 | - C++
 27 | - C#
 28 | - Go
 29 | 
 30 | # Installation
 31 | The **codetext** package requires Python 3.7 or above and tree-sitter. To set up the environment and install the dependencies manually from source:
 32 | ```bash
 33 | git clone https://github.com/FSoft-AI4Code/CodeText-parser.git; cd CodeText-parser
 34 | pip install -r requirements.txt
 35 | pip install -e .
 36 | ```
 37 | 
 38 | Or install via `pypi` package:
 39 | ```bash
 40 | pip install codetext
 41 | ```
 42 | 
 43 | # Getting started
 44 | 
 45 | ## `codetext` CLI Usage
 46 | ```bash
 47 | codetext [options] [PATH or FILE] ...
 48 | ```
 49 | 
 50 | For example, to extract every Python file in the `src/` folder:
 51 | ```bash
 52 | codetext src/ --language Python
 53 | ```
 54 | 
 55 | If you want to store the extracted classes and functions, use the `--json` flag and give the path to the destination file with `--output_file`:
 56 | ```bash
 57 | codetext src/ --language Python --output_file ./python_report.json --json
 58 | ```
 59 | 
 60 | **Options**
 61 | 
 62 | ```bash
 63 | positional arguments:
 64 |   paths                 list of the filename/paths.
 65 | 
 66 | optional arguments:
 67 |   -h, --help            show this help message and exit
 68 |   --version             show program's version number and exit
 69 |   -l LANGUAGE, --language LANGUAGE
 70 |                         Target the programming languages you want to analyze.
 71 |   -o OUTPUT_FILE, --output_file OUTPUT_FILE
 72 |                         Output file (e.g report.json).
 73 |   --json                Generate json output as a transform of the default
 74 |                         output
 75 |   --verbose             Print progress bar
 76 | 
 77 | ```
 78 | 
 79 | **Example**
 80 | ```
 81 | File circle_linkedlist.py analyzed:
 82 | ==================================================
 83 | Number of class    : 1
 84 | Number of function : 2
 85 | --------------------------------------------------
 86 | 
 87 | Class summary:
 88 | +-----+---------+-------------+
 89 | |   # | Class   | Arguments   |
 90 | +=====+=========+=============+
 91 | |   0 | Node    |             |
 92 | +-----+---------+-------------+
 93 | 
 94 | Class analyse: Node
 95 | +-----+---------------+-------------+--------+---------------+
 96 | | #   | Method name   | Paramters   | Type   | Return type   |
 97 | +=====+===============+=============+========+===============+
 98 | | 0   | __init__      | self        |        |               |
 99 | |     |               | data        |        |               |
100 | +-----+---------------+-------------+--------+---------------+
101 | 
102 | Function analyse:
103 | +-----+-----------------+-------------+--------+---------------+
104 | | #   | Function name   | Paramters   | Type   | Return type   |
105 | +=====+=================+=============+========+===============+
106 | | 0   | push            | head_ref    |        | Node          |
107 | |     |                 | data        | Any    | Node          |
108 | | 1   | countNodes      | head        | Node   |               |
109 | +-----+-----------------+-------------+--------+---------------+
110 | ```
111 | 
112 | ## Using `codetext` as Python module
113 | ### Build your language
114 | `codetext` needs a tree-sitter language file (i.e. a `.so` file) to work properly. You can compile the language manually ([see more](https://github.com/tree-sitter/py-tree-sitter#usage)) or build it automatically with our pre-defined function (the `<language>.so` file will be saved in a folder named `/tree-sitter/`):
115 | ```python
116 | from codetext.utils import build_language
117 | 
118 | language = 'rust'
119 | build_language(language)
120 | 
121 | # INFO:utils:Not found tree-sitter-rust, attempt clone from github
122 | # Cloning into 'tree-sitter-rust'...
123 | # remote: Enumerating objects: 2835, done. ...
124 | # INFO:utils:Attempt to build Tree-sitter Language for rust and store in .../tree-sitter/rust.so
125 | ```
126 | 
127 | ### Using Language Parser
128 | Each programming language we support corresponds to a custom `language_parser` (e.g. Python is [`PythonParser()`](src/codetext/parser/python_parser.py#L11)). A `language_parser` takes raw source code as input and uses breadth-first search to traverse all syntax nodes. The classes, methods and stand-alone functions are then collected:
129 | 
130 | ```python
131 | from codetext.utils import parse_code
132 | 
133 | raw_code = """
134 |     /**
135 |     * Sum of 2 number
136 |     * @param a int number
137 |     * @param b int number
138 |     */
139 |     double sum2num(int a, int b) {
140 |         return a + b;
141 |     } 
142 | """
143 | 
144 | # Auto parse code into tree-sitter.Tree
145 | root = parse_code(raw_code, 'cpp')
146 | root_node = root.root_node
147 | ```
148 | 
149 | Get all function nodes inside a specific node:
150 | ```python
151 | from codetext.parser import CppParser
152 | 
153 | function_list = CppParser.get_function_list(root_node)
154 | print(function_list)
155 | 
156 | # [<Node type=function_definition, start_point=(6, 0), end_point=(8, 1)>]
157 | 
158 | ```
159 | 
160 | Get function metadata (e.g. function's name, parameters, (optional) return type)
161 | ```python
162 | function = function_list[0]
163 | 
164 | metadata = CppParser.get_function_metadata(function, raw_code)
165 | 
166 | # {'identifier': 'sum2num', 'parameters': {'a': 'int', 'b': 'int'}, 'type': 'double'}
167 | ```
168 | Get docstring (documentation) of a function
169 | ```python
170 | docstring = CppParser.get_docstring(function, raw_code)
171 | 
172 | # ['Sum of 2 number \n@param a int number \n@param b int number']
173 | ```
174 | 
175 | We also provide two methods for extracting class objects:
176 | ```python
177 | class_list = CppParser.get_class_list(root_node)
178 | # and
179 | metadata = CppParser.get_metadata_list(root_node)
180 | ```
181 | 
182 | # Limitations
183 | `codetext` heavily depends on tree-sitter syntax:
184 | - Since we use the tree-sitter grammar to extract the desired nodes (functions, classes, a function's name (identifier), a class's argument list, etc.), `codetext` can easily be broken by a tree-sitter update or a future syntax change.
185 | 
186 | - While we try our best to capture every possibility, there are still plenty of cases out there. We welcome community contributions to this project.


--------------------------------------------------------------------------------
/asset/img/codetext_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FSoft-AI4Code/CodeText-parser/d2a7365f7f944650e84d9fdb6b6794d6c5ea620b/asset/img/codetext_logo.png


--------------------------------------------------------------------------------
/asset/img/codetext_logo_line.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FSoft-AI4Code/CodeText-parser/d2a7365f7f944650e84d9fdb6b6794d6c5ea620b/asset/img/codetext_logo_line.png


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["setuptools>=61.0"]
 3 | build-backend = "setuptools.build_meta"
 4 | 
 5 | [project]
 6 | name = "codetext"
 7 | version = "0.0.9"
 8 | authors = [
 9 |   { name="Dung Manh Nguyen", email="dungnm.workspace@gmail.com" },
10 | ]
11 | description = "Multilingual programming language parsers for the extract from raw source code into multiple levels of pair data"
12 | readme = "README.md"
13 | requires-python = ">=3.6"
14 | classifiers = [
15 |     "Programming Language :: Python :: 3",
16 |     "License :: OSI Approved :: MIT License",
17 |     "Operating System :: OS Independent",
18 | ]
19 | dependencies = [
20 |     "tree-sitter>=0.20",
21 |     "Levenshtein>=0.20",
22 |     "langdetect>=1.0.0",
23 |     "bs4>=0.0.1",
24 |     "tabulate>=0.9.0"
25 | ]
26 | 
27 | [project.urls]
28 | "Homepage" = "https://github.com/AI4Code-Research/CodeText-data"
29 | "Bug Tracker" = "https://github.com/AI4Code-Research/CodeText-data/issues"
30 | 
31 | [project.scripts]
32 | codetext = "codetext.__main__:main"
33 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | # for preprocessing
2 | tree-sitter==0.20.4
3 | tabulate
4 | Levenshtein
5 | langdetect
6 | bs4
7 | tree_sitter_languages==1.10.2
8 | 


--------------------------------------------------------------------------------
/src/codetext/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FSoft-AI4Code/CodeText-parser/d2a7365f7f944650e84d9fdb6b6794d6c5ea620b/src/codetext/__init__.py


--------------------------------------------------------------------------------
/src/codetext/__main__.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | import argparse
 4 | import pkg_resources
 5 | 
 6 | import json
 7 | from .codetext_cli import parse_file, print_result, PL_MATCHING
 8 | 
 9 | 
def get_args():
    """Build the ``codetext`` argument parser and parse the command line.

    Returns:
        argparse.Namespace: the parsed options ``paths``, ``language``,
        ``output_file``, ``json`` and ``verbose``.
    """
    # Help texts are kept verbatim (including their embedded line breaks).
    language_help = '''Target the programming languages you want to
                        analyze.'''
    output_help = '''Output file (e.g report.json).
                        '''
    json_help = '''Generate json output as a transform of the
                        default output'''
    verbose_help = '''Print progress bar'''

    cli = argparse.ArgumentParser(description=f"codetext parser {20*'='}")

    # Positional inputs; falls back to the current directory.
    cli.add_argument('paths', nargs='*', default=['.'],
                     help='list of the filename/paths.')
    # The version string is read from the installed "codetext" distribution.
    cli.add_argument(
        "--version",
        action="version",
        version=pkg_resources.get_distribution("codetext").version,
    )
    cli.add_argument("-l", "--language", help=language_help)
    cli.add_argument("-o", "--output_file", help=output_help, type=str)
    cli.add_argument("--json", help=json_help, action="store_true")
    cli.add_argument("--verbose", help=verbose_help, action="store_true")

    return cli.parse_args()
33 | 
34 | 
def main():
    """Run the ``codetext`` command-line interface.

    Validates the parsed options, gathers the files to analyse from every
    path given on the command line, calls ``parse_file`` and ``print_result``
    on each of them and, when ``--json`` is set, dumps the collected results
    to ``--output_file``.

    Raises:
        ValueError: ``--json`` was given without ``--output_file``, or
            ``--language`` is not a key of ``PL_MATCHING``.
        AssertionError: one of the given paths does not exist.
    """
    opt = get_args()

    # check args
    if opt.json and not opt.output_file:
        raise ValueError("Missing --output_file")
    if opt.language and opt.language not in PL_MATCHING:
        raise ValueError(
            "{language} not supported. Currently support {sp_language}"
            .format(language=opt.language,
                    sp_language=list(PL_MATCHING.keys())))

    # Gather files from *every* path. Previously the list was rebuilt on each
    # iteration, so only the last path was analysed.
    files = []
    for path in opt.paths:
        assert os.path.exists(path), "paths is not valid"

        if os.path.isdir(path):
            # Only the directory's direct children are considered.
            children = (os.path.join(path, name) for name in os.listdir(path))
            candidates = [child for child in children if os.path.isfile(child)]
        elif os.path.isfile(path):
            candidates = [path]
        else:
            # Exists but is neither a regular file nor a directory.
            candidates = []

        if opt.language:
            allowed_extensions = PL_MATCHING[opt.language]
            candidates = [
                candidate for candidate in candidates
                if os.path.splitext(candidate)[1] in allowed_extensions
            ]
        files.extend(candidates)

    output_metadata = {}
    for file in files:
        filename, file_extension = os.path.splitext(file)

        if opt.language is None:
            # Infer the language from the extension; first match wins.
            language = next(
                (lang for lang, ext_list in PL_MATCHING.items()
                 if file_extension in ext_list),
                None,
            )
            if language is None:
                # Unknown extension: skip the file instead of failing with an
                # unbound name or reusing the previous file's language.
                continue
        else:
            language = opt.language

        output = parse_file(file, language=language)
        print_result(
            output,
            file_name=str(filename).split(os.sep)[-1]+file_extension
        )
        output_metadata[file] = output

    if opt.json:
        save_path = opt.output_file
        with open(save_path, 'w') as output_file:
            json.dump(output_metadata, output_file, sort_keys=True, indent=4)
            print(50*'=')
            print("Save report to {path}".format(path=save_path))
90 | 
91 | 
92 | if __name__ == '__main__':
93 |     main()
94 | 


--------------------------------------------------------------------------------
/src/codetext/clean/__init__.py:
--------------------------------------------------------------------------------
1 | """Clean utilities"""
2 | 
3 | from .noise_removal import remove_comment_delimiters, remove_special_tag, remove_special_character
4 | 
5 | 
6 | __all__ = [
7 |     'remove_comment_delimiters', 'remove_special_tag', 'remove_special_character'
8 | ]


--------------------------------------------------------------------------------
/src/codetext/clean/noise_removal.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import sys
  3 | import warnings
  4 | from collections import Counter
  5 | from itertools import permutations
  6 | from typing import Any, Dict, List, Union
  7 | 
  8 | from langdetect import detect, detect_langs
  9 | from bs4 import BeautifulSoup
 10 | import Levenshtein as lev
 11 | 
 12 | from tree_sitter import Node
 13 | from ..parser.language_parser import tokenize_docstring, get_node_by_kind
 14 | warnings.filterwarnings("ignore", category=UserWarning, module='bs4')
 15 | 
 16 | 
 17 | REGEX_TEXT = ("(?<=[a-z0-9])(?=[A-Z])|"
 18 |               "(?<=[A-Z0-9])(?=[A-Z][a-z])|"
 19 |               "(?<=[0-9])(?=[a-zA-Z])|"
 20 |               "(?<=[A-Za-z])(?=[0-9])|"
 21 |               "(?<=[@$.'\"])(?=[a-zA-Z0-9])|"
 22 |               "(?<=[a-zA-Z0-9])(?=[@$.'\"])|"
 23 |               "_|\\s+")
 24 | 
 25 | if sys.version_info >= (3, 7):
 26 |     import re
 27 |     SPLIT_REGEX = re.compile(REGEX_TEXT)
 28 | else:
 29 |     import regex
 30 |     SPLIT_REGEX = regex.compile("(?V1)"+REGEX_TEXT)
 31 | 
 32 | 
 33 | def split_sentences(docstring):
 34 |     # sentences = re.split("(?<![\.])\.(?![\.\w])", docstring)
 35 | 
 36 |     sentences = re.split("(?<=.)[\.\!\?](?=\s+)", docstring)
 37 |     sentences = [sentence.strip() for sentence in sentences if sentence.strip() != ""]
 38 | 
 39 |     return sentences
 40 | 
 41 | 
 42 | def split_identifier_into_parts(identifier: str) -> List[str]:
 43 |     """
 44 |     Split a single identifier into parts on snake_case and camelCase
 45 |     """
 46 |     identifier_parts = list(s.lower() for s in SPLIT_REGEX.split(identifier) if len(s)>0)
 47 | 
 48 |     if len(identifier_parts) == 0:
 49 |         return [identifier]
 50 |     return identifier_parts
 51 | 
 52 | 
 53 | def check_is_node_error(node: Node) -> bool:
 54 |     """
 55 |     Check if node contains "ERROR" node
 56 |     Args:
 57 |         node (tree_sitter.Node): node
 58 |     
 59 |     Return:
 60 |         bool
 61 |     """
 62 |     if not isinstance(node, Node):
 63 |         raise ValueError("Expect type tree_sitter.Node, get %i", type(node))
 64 | 
 65 |     error_node = get_node_by_kind(node, ['ERROR'])
 66 |     if len(error_node) > 0:
 67 |         return True
 68 |     else:
 69 |         return False
 70 | 
 71 | 
 72 | def get_node_length(node: Node) -> int:
 73 |     """
 74 |     Get node length
 75 |     Args:
 76 |         node (tree_sitter.Node): node
 77 |         
 78 |     Return:
 79 |         int
 80 |     """
 81 |     if not isinstance(node, Node):
 82 |         raise ValueError("Expect type tree_sitter.Node, get %i", type(node))
 83 | 
 84 |     line_start = node.start_point[0]
 85 |     line_end = node.end_point[0]
 86 |     return int(line_end - line_start)
 87 |     
 88 |     
 89 | def remove_comment_delimiters(docstring: str, remove_whitespace: bool=True) -> str:
 90 |     """
 91 |     Remove comment delimiters.
 92 |     Example: //, /*, */, #, etc
 93 |     
 94 |     Args:
 95 |         docstring (str): raw (line or block) comment
 96 |         remove_whitespace (bool): remove leading whitespace or not
 97 |     Returns:
 98 |         str: removed delimiters docstring/comment
 99 |     
100 |     """
101 |     clean_pattern = re.compile(r'([\'\"]{3})$|^([\'\"]{3})') # remove python ''' or """
102 |     clean_pattern1 = re.compile(r'([#]+)$|^([#]+)')  # special single-line comment with #
103 |     clean_pattern2 = re.compile(r'([\/*=-]+)$|^([\/*!=-]+)')
104 |     
105 |     docstring = re.sub(clean_pattern, '', docstring)
106 |     new_docstring = []
107 |     for line in docstring.split('\n'):
108 |         if remove_whitespace:
109 |             line = line.strip()
110 |         line = re.sub(clean_pattern1, '', line)
111 |         line = re.sub(clean_pattern2, '', line)
112 |         new_docstring.append(line)
113 | 
114 |     return '\n'.join(new_docstring)
115 | 
116 | 
def remove_special_tag(docstring: str) -> str:
    """
    Strip markup tags (html tags, e.g. <p>docstring</p>) and return the
    text that BeautifulSoup's "html.parser" extracts from the docstring.
    """
    soup = BeautifulSoup(docstring, "html.parser")
    return soup.get_text()
122 | 
123 | 
def remove_special_character(docstring: str) -> str:
    """
    Replace every character that is not an ASCII letter, a digit, a
    backslash, an underscore, a dot or a comma with a single space.
    """
    disallowed = re.compile(r'[^a-zA-Z0-9\\\_\.\,]')
    return disallowed.sub(' ', docstring)
126 | 
127 | 
def remove_function_name_at_the_beginning(docstring):
    """
    Remove a leading name-like token (letters, digits, "_", "(" and ")")
    when it is directly followed by ":" or by a whitespace character and "-",
    then strip the result.

    This function is applied at docstring/paragraph-level.
    """
    # Raw strings: "\s", "\(" and "\)" are invalid escape sequences in a
    # plain string literal.
    ending_symbols = [":", r"\s-"]
    for symbol in ending_symbols:
        pattern = r"^[a-zA-Z0-9_\(\)]+" + symbol
        docstring = re.sub(pattern, "", docstring)

    return docstring.strip()
140 | 
141 | 
def remove_link_in_brackets(docstring):
    r"""
    Remove "(...)" and "<...>" groups whose content starts with "http",
    "see", "e.g" or "eg" followed by one more character, for examples:
        - (https://www.a.ai)
        - <see https://www.b.ai>
        - <eg. a b c>

    NOTE(review): the previous docstring also listed "\param" and "\brief"
    without explanation; nothing in this function handles them.

    This function is applied to each line of the docstring/paragraph.
    """
    # Raw string: "\%" is an invalid escape sequence in a plain literal. The
    # leading/trailing "\" escapes the bracket substituted for each "%s".
    pattern = r"\%s(?:http|see|e\.g|eg.).*?\%s"
    bracket_pairs = [("(", ")"), ("<", ">")]
    for pair in bracket_pairs:
        docstring = re.sub(pattern % pair, "", docstring.strip())

    return docstring.strip()
160 | 
161 | 
def remove_everything_after_a_pattern(docstring):
    """
    Truncate the docstring at the first occurrence of each known marker
    ("e.g", "Example:", "Note:", ...), keeping only the text before it.
    The markers are applied one after another, in order.

    This function is applied at docstring-level
    """
    markers = (
        "E.g", "e.g", "eg.", "Eg.",
        "Example usage:", "Created by", "Example:",
        "Note:", ". Note", "note::", "note:", ". note",
    )

    kept = docstring
    for marker in markers:
        # partition()[0] is the text before the first occurrence of the
        # marker, or the whole text when the marker is absent.
        kept = kept.strip().partition(marker)[0]

    return kept.strip()
180 | 
181 | 
def remove_everything_after_an_url(docstring):
    """
    Keep the sentences that come before the first sentence containing
    "https:" or "http:", joined back with ". ".

    This function applies at sentence-level
    TO-DO: Should apply on docstring-level by regular expression
    """
    url_markers = ("https:", "http:")

    kept = []
    for sentence in split_sentences(docstring):
        # Stop at the first sentence that mentions a URL scheme.
        if any(marker in sentence for marker in url_markers):
            break
        kept.append(sentence)

    return ". ".join(kept).strip()
204 | 
205 | 
def remove_lines_start_and_end_with_the_same_char(docstring):
    """
    Remove noisy lines: every line of at least two characters that, once
    stripped, starts and ends with the same character among
    "*", "-", "_", "=", "/" and "+". Blank lines are kept.

    This function applies at line-level

    Returns:
        str: remaining (stripped) lines joined by newlines, stripped
    """
    noise_chars = ("*", "-", "_", "=", "/", "+")

    kept_lines = []
    for line in docstring.strip().split("\n"):
        line = line.strip()
        # A direct first/last character comparison replaces the per-line
        # regex that was built from a string with an invalid "\%" escape.
        is_noise = (
            len(line) >= 2
            and line[0] == line[-1]
            and line[0] in noise_chars
        )
        if not is_noise:
            kept_lines.append(line)

    return "\n".join(kept_lines).strip()
232 | 
233 | 
def remove_lines_contain_only_a_single_char(docstring):
    """
    Blank out every line whose only content is one of the symbols * / = - +.

    This function applies at line-level
    """
    lone_chars = ("*", "/", "=", "-", "+")
    cleaned_lines = [
        "" if line.strip() in lone_chars else line
        for line in docstring.strip().split("\n")
    ]

    return "\n".join(cleaned_lines).strip()
248 | 
249 | 
def remove_patterns_at_any_positions(docstring):
    """
    Delete comment openers and <code> tags wherever they appear.

    The docstring is stripped only after a token was actually removed.
    This function applies at docstring-level
    """
    for token in ("/**", "/*", "<code>", "</code>", "*-*"):
        if docstring.find(token) != -1:
            docstring = "".join(docstring.split(token)).strip()

    return docstring
260 | 
261 | 
def remove_patterns_at_the_start_and_end_of_a_line(docstring):
    """
    Strip a leading "* " marker and runs of two or more identical symbols
    from the beginning and the end of each line.

    A line is re-processed until a full pass removes no symbol run.
    This function applies at line-level
    """
    leading_markers = ["* "]
    repeat_symbols = [".", "*", "-", "_", "@", "#", "$", "!", "\\", "/", "+"]
    # runs at the start of the line are handled before runs at the end
    run_patterns = [re.compile(r"^\%s{2,}" % symbol) for symbol in repeat_symbols]
    run_patterns += [re.compile(r"\%s{2,}$" % symbol) for symbol in repeat_symbols]

    def scrub(text):
        changed = True
        while changed:
            for marker in leading_markers:
                if text.startswith(marker):
                    text = text[len(marker):]
            # only removed symbol runs (not the marker) trigger another pass
            before_runs = text
            for run_pattern in run_patterns:
                text = run_pattern.sub("", text)
            changed = text != before_runs
        return text

    scrubbed = [scrub(line) for line in docstring.strip().split("\n")]

    return "\n".join(scrubbed).strip()
295 | 
296 | 
def remove_patterns_at_the_end_of_a_docstring(docstring):
    """
    Replace a dangling ending pattern (":", ";", ",", "...", "@@", "@")
    with a single full stop.

    Surrounding whitespace is removed first so a trailing space cannot hide
    the pattern. Patterns are tested with ``str.endswith`` so the
    multi-character ones actually match.

    This function applies at docstring-level

    Args:
        docstring (str): text to normalise
    Returns:
        str: the stripped docstring, ending with "." if a pattern was replaced
    """
    # longest patterns first so "@@" is not consumed one "@" at a time
    patterns = ["...", "@@", ":", ";", ",", "@"]
    docstring = docstring.strip()
    for pattern in patterns:
        if docstring.endswith(pattern):
            return (docstring[:-len(pattern)] + '.').strip()

    return docstring
310 | 
311 | 
def remove_specific_pattern(docstring: str) -> str:
    """
    Remove three kinds of noise from a docstring:
        - parenthesised abbreviations such as "(e.g something)" or "(i.e something)"
        - runs of five or more "-", "=", "#" or "*" (e.g. "==============")
        - inline tags "{@tag content}", which are reduced to " content"
    """
    parenthesized_abbrev = re.compile(
        r'(\(((i\.e)|(e\.g)|(\beg)|(\bie))[\s\S]+?)(\))',
        flags=re.IGNORECASE|re.MULTILINE,
    )
    separator_run = re.compile(r'(-|=|#|\*){5,}')
    inline_tag = re.compile(r'{@.*?}')

    docstring = parenthesized_abbrev.sub('', docstring)
    docstring = separator_run.sub('', docstring)

    for tag in inline_tag.findall(docstring):
        # drop the braces, then every "@word" inside the tag
        inner = re.sub(r'@\w*', '', tag[1:-1])
        docstring = docstring.replace(tag, inner)

    return docstring
333 | 
334 | 
def remove_unrelevant(docstring: str) -> str:
    """
    Run the removal passes over the docstring repeatedly until a full round
    leaves it unchanged, then normalise its ending.
    """
    cleaning_passes = (
        remove_specific_pattern,
        remove_link_in_brackets,
        # remove_everything_after_an_url,  # Overlap
        # remove_everything_after_a_pattern,  # Noticeable wrong catch
        remove_patterns_at_any_positions,
        remove_lines_contain_only_a_single_char,
        remove_lines_start_and_end_with_the_same_char,
        remove_patterns_at_the_start_and_end_of_a_line,
        remove_function_name_at_the_beginning,
    )

    while True:
        previous = docstring
        for apply_pass in cleaning_passes:
            docstring = apply_pass(docstring)
        # a pass may expose new noise for an earlier pass, hence the fixed-point loop
        if docstring == previous:
            break

    return remove_patterns_at_the_end_of_a_docstring(docstring)
363 | 
364 | 
365 | # =================== Check code ======================
366 | 
def check_is_black_node(node_name: str, exclude_list: List = None):
    """
    Check if node belongs to black list. E.g:
        - Built-in function (name wrapped in double underscores)
        - Getter/setter (name starts with "get" or "set")
        - Test function, test class
        - Constructor

    Args:
        node_name (str): identifier of the node
        exclude_list (List): extra keywords; a name containing any of them is
            black-listed. ``None`` (the default) means no extra keyword.
    Return:
        bool: True if the node is black-listed
    Raises:
        ValueError: if ``node_name`` is not a string
    """
    if not isinstance(node_name, str):
        raise ValueError(f'Expect str, get {type(node_name)}')

    black_keywords = ['test_', 'Test_', '_test', 'toString', 'constructor', 'Constructor']
    # exclude_list defaults to None, which list.extend() cannot consume
    if exclude_list:
        black_keywords.extend(exclude_list)

    if node_name.startswith('__') and node_name.endswith('__'):
        return True
    if node_name.startswith('set') or node_name.startswith('get'):
        return True
    if any(keyword in node_name for keyword in black_keywords):
        return True

    return False
387 | 
388 | 
def check_is_empty_function(node):
    """
    A node is treated as an empty function when its length, as reported by
    get_node_length, is at most 3.
    """
    return get_node_length(node) <= 3
396 | 
397 | 
def check_autogenerated_by_code(raw_code: str, identifier: str):
    """
    Check whether a text is (almost) a restatement of an identifier.

    The identifier is split into its parts and lower-cased; every
    non-alphanumeric character of ``raw_code`` is replaced by a space and the
    result lower-cased. The two strings are considered equivalent when their
    edit distance is at most 40% of the longer one's length.

    Args:
        raw_code (str): text compared against the identifier
        identifier (str): function/class name
    Return:
        bool: True if the text is close enough to the identifier
    """
    threshold = 0.4
    fn_name_splited = split_identifier_into_parts(identifier)
    fn_name_splited = ' '.join(fn_name_splited).lower()

    # the original read an unassigned local `comment` here (UnboundLocalError);
    # the text to normalise is the `raw_code` argument
    comment = str(re.sub(r'[^a-zA-Z0-9]', ' ', raw_code)).lower()

    d0 = lev.distance(fn_name_splited, comment)
    d1 = max(len(fn_name_splited), len(comment))

    return d0 <= d1*threshold
412 | 
413 | # =================== Check docstring ======================
414 | 
def check_docstring_length(docstring: str):
    """
    Return True (docstring rejected) when it has fewer than 3
    whitespace-separated tokens.
    """
    min_tokens = 3
    return len(docstring.strip().split()) < min_tokens
421 | 
422 | 
def check_docstring_literal(docstring: str):
    """
    Return True (docstring rejected) when it contains a non-ASCII character
    or no ASCII letter/digit at all.

    TODO: real language detection is not enabled, so ASCII text in another
    language ("Ce n'est pas en anglais") still passes.
    """
    if not docstring.isascii():
        return True

    has_alphanumeric = re.search('[a-zA-Z0-9]', docstring) is not None
    return not has_alphanumeric
445 | 
446 | 
def check_docstring_contain_question(docstring: str):
    """
    Return True (docstring rejected) when it reads like a question: it ends
    with "?" or starts with an interrogative word (why, how, what/what's,
    where, is, are), case-insensitively.

    An empty docstring is not a question and returns False.
    """
    pattern = re.compile(r"(?i)^(why\b|how\b|what'?s?\b|where\b|is\b|are\b)")

    # endswith() is safe on an empty string, unlike indexing with [-1]
    return docstring.endswith('?') or pattern.search(docstring) is not None
454 | 
455 | 
def check_docstring_underdevelopment(docstring: str):
    """
    Return True (docstring rejected) when it starts with placeholder wording
    ("Description of the Method", "NOT YET DOCUMENTED", "(non-Javadoc)", ...)
    or with a work-in-progress marker (todo, to-do, deprecate, copyright,
    fixme). Matching is case-insensitive.
    """
    placeholder = re.compile(
        r"(?i)^((Description of the Method)|(NOT YET DOCUMENTED)|(Missing[\s\S]+Description)|(not in use)|"
        r"(Insert the method's description here)|(No implementation provided)|(\(non\-Javadoc\)))"
    )
    work_in_progress = re.compile(r'(?i)^(todo|to-do|deprecate|copyright|fixme)', flags=re.IGNORECASE)

    if placeholder.search(docstring):
        return True
    return bool(work_in_progress.search(docstring))
466 | 
467 | 
468 | def check_docstring_autogenerated(docstring: str):
469 |     p1 = re.compile(r'(?i)@[a-zA-Z]*generated\b')
470 |     p2 = re.compile('(?i)^([aA]uto[-\s]generated)')
471 |     p3 = re.compile('(?i)^(This method initializes)')
472 |     p4 = re.compile('(?i)^(This method was generated by)')
473 | 
474 |     if docstring is not None:
475 |         if p1.search(docstring):
476 |             return True
477 | 
478 |     if p2.search(docstring) or p3.search(docstring) or p4.search(docstring):
479 |         return True
480 |     
481 |     else:
482 |         return False
483 |     
484 | 
def check_docstring_contain_specific_pattern(docstring: str):
    """
    Return True (docstring rejected) when it mentions an example/note marker
    AND contains a special character.

    Markers: "i.e", "e.g", "eg", "ie" followed by a space or a dot; a leading
    "See"; "example usage", "example" or "note" anywhere (case-insensitive).
    Special character: anything other than letters, digits, whitespace and
    . , : ; ' "

    The patterns are applied with ``search``. The original used ``match``,
    which anchors all three patterns at position 0 where the first character
    cannot be both a letter and a special character, so the check could not
    fire on ordinary text.
    """
    condition1 = re.compile(r'((i\.e)|(e\.g)|(\beg)|(\bie))(\s|\.)', flags=re.IGNORECASE)
    condition2 = re.compile(r'(^(Sees*)|(example usage)|(example)|(note:*))', flags=re.IGNORECASE)
    condition_follow = re.compile(r'[^a-zA-Z0-9\s\.\,\:\;\'\"]')

    # if pattern 1 or 2 is present -> check if the line contains any special character
    has_marker = condition1.search(docstring) or condition2.search(docstring)
    if has_marker and condition_follow.search(docstring):
        return True

    return False
496 |     
497 | 
498 | # =================== Check characters ======================
499 | 
def does_str_containt_math(str):
    """
    Tell whether the text contains one of the math indicators:
    "equation", "\\exp(", "\\log(", "\\sqrt(", "mathbf" or "mathrm".
    """
    # TODO: page [number]
    math_indicators = ("equation", "\\exp(", "\\log(", "\\sqrt(", "mathbf", "mathrm")
    return any(indicator in str for indicator in math_indicators)
510 | 
511 | 
def check_contain_little_alphabet_char(docstring: str):
    """
    Return True (docstring rejected) when too small a share of its
    non-whitespace characters are ASCII letters.

    Plain text: longer than 5 characters with a letter ratio below 0.65.
    Text with math indicators: longer than 15 characters with a ratio below 0.4.
    A docstring with no non-whitespace character is rejected.
    """
    has_math = does_str_containt_math(docstring.strip())
    min_length, min_alpha_ratio = (15, 0.4) if has_math else (5, 0.65)

    compact = "".join(docstring.split())
    if not compact:
        return True

    alpha_count = len(re.findall("[a-zA-Z]", compact))
    return len(compact) > min_length and alpha_count / len(compact) < min_alpha_ratio
522 | 
523 | 
def convert_special_pattern(docstring):
    """
    Collapse date/time/colour format placeholders so their separators are not
    counted as special characters, e.g. "HH:MM:SS" -> "hhmmss" and
    "R,G,B" -> "rgb". Every ordering of each placeholder group is handled.
    """
    time_separators = (":", "-")
    colour_separators = (",", "-")
    placeholder_groups = [
        (["HH", "MM", "SS"], time_separators),
        (["MM", "DD", "YY"], time_separators),
        (["MM", "DD", "YYYY"], time_separators),

        (["hh", "mm", "ss"], time_separators),
        (["mm", "dd", "yy"], time_separators),
        (["mm", "dd", "yyyy"], time_separators),

        (["R", "G", "B"], colour_separators),

        (["r", "g", "b"], colour_separators),
    ]

    for parts, separators in placeholder_groups:
        for separator in separators:
            for ordering in permutations(parts):
                joined = separator.join(ordering)
                docstring = docstring.replace(joined, "".join(ordering).lower())

    return docstring
546 | 
547 | 
def check_contain_many_special_char(docstring: str):
    """
    Return True (docstring rejected) when it contains too many special
    characters.

    Two rules apply, with looser limits when the text contains math indicators:
        1. any single symbol occurs more often than its cap
           (4 by default, 6 for an opening bracket, 10 for a math symbol
           inside math text)
        2. the total of non-bracket symbols exceeds both
           max(10, 0.3 * #tokens) and 15 for plain text, or
           max(17, 0.5 * #tokens) and 20 for math text

    Date/time/colour placeholders are collapsed first so their separators
    are not counted.
    """
    threshold_dict = [[4, 6, 10, 6],  # max per: normal schar, bracket, math schar (math text), bracket (math text)
                      # the original had the typo "0,5" here, which made the math ratio 0
                      [10, 0.3, 17, 0.5],   # (acceptable #total schar, acceptable ratio) for plain / math text
                      [15, 20]]  # max #schar for plain / math text
    docstring = docstring.strip()
    containt_math = does_str_containt_math(docstring)
    docstring = convert_special_pattern(docstring)
    num_tokens = len(tokenize_docstring(docstring))
    counter = Counter(docstring)

    brackets = ["(", "[", "{"]
    math_symbols = ["+", "-", "*", "/", ":", "^", "=", "<", ">", "|", "(",]
    symbols = ["$", "!", "@", "#", "%", "^", "&", "*", "<", ">",
               "~", "|", "\\", "'", '"', "?", "-", "+", "=", "`",
               ":", "/", "(", "[", "{"]

    count = 0
    for symb in symbols:
        threshold = threshold_dict[0][0]
        if symb in brackets:
            threshold = threshold_dict[0][3] if containt_math else threshold_dict[0][1]
        elif containt_math and symb in math_symbols:
            threshold = threshold_dict[0][2]

        if counter[symb] > threshold:
            return True

        # brackets are capped individually but left out of the total
        if symb not in brackets:
            count += counter[symb]

    offset = 2 * int(containt_math)
    total_limit = max(threshold_dict[1][offset], threshold_dict[1][offset + 1] * num_tokens)
    return count > total_limit and count > threshold_dict[2][int(containt_math)]
585 | 
586 | 
def check_contain_little_unique_chars(docstring):
    """
    Return True (line rejected) when, ignoring whitespace, it is longer than
    5 characters yet built from at most 3 distinct characters.

    This function applies on docstring line
    """
    min_length, max_distinct = 5, 3
    compact = "".join(docstring.split())
    return len(compact) > min_length and len(set(compact)) <= max_distinct
594 | 
595 | # =================== Check words ======================
596 | 
def check_contain_little_unique_words(docstring):
    """
    Return True (docstring rejected) when one word dominates it: the most
    frequent word outside a small stop-word list occurs at least 3 times and
    makes up more than 30% of all tokens.

    A docstring with no alphanumeric word is rejected; one made only of
    stop words is accepted.
    """
    min_repeats, max_share = 3, 0.3
    ignored_words = ["the", "of", "a", "an", "it", "for", "or", "in", "but",]

    words_only = ' '.join(re.findall(r'\b[a-zA-Z0-9]+\b', docstring))
    docstring_tokens = tokenize_docstring(words_only)
    ranked = Counter(docstring_tokens).most_common()
    if not ranked:
        return True

    # walk down the frequency ranking to the first word that is not ignored
    for word, max_count in ranked:
        if word not in ignored_words:
            break
    else:
        return False

    return max_count >= min_repeats and max_count / len(docstring_tokens) > max_share
620 | 
621 | 
622 | # def check_contain_many_special_case(docstring: str):
623 | #     """
624 | #     Check if the string contains too much sneak_case or camelCase
625 | #     """
626 | #     threshold = 0.3
627 | #     total_words = docstring.strip().split()
628 | #     if len(total_words) == 0:
629 | #         return True
630 | #     sneak_cases = re.findall("\w+_\w+", docstring)
631 | #     camelCases = re.findall("[A-Z]([A-Z0-9]*[a-z][a-z0-9]*[A-Z]|[a-z0-9]*[A-Z][A-Z0-9]*[a-z])[A-Za-z0-9]*", docstring)
632 | #     return (len(sneak_cases) + len(camelCases))/len(total_words) > threshold
633 | 
634 | 
635 | # def check_contain_many_repeated_word(docstring: str):
636 | #     """
637 | #     Check if the string (longer than 30 words) have too many repeated word
638 | #     """
639 | #     threshold_dict = [30, 0.5]  # max number, ratio
640 | #     docstring = "".join(docstring.strip().split())
641 | #     counter = Counter(docstring)
642 | #     return len(docstring) > threshold_dict[0] and counter.most_common()[0][1] / len(docstring) > threshold_dict[1]
643 | 
644 | 
def check_contain_many_uppercase_word(docstring: str):
    """
    Return True (docstring rejected) when it has more than 10 tokens and more
    than 30% of them are upper-case words.

    Well-known placeholders (DD, MM, YYYY, R,G,B, API, ...) and snake_case
    identifiers are lower-cased first so they do not count. Only words
    preceded by whitespace and at least two characters long are counted.
    """
    min_tokens, max_upper_share = 10, 0.3

    for placeholder in ("DD", "MM", "YY", "YYYY", "R,G,B", "R-G-B", "SS", "HH", "API"):
        docstring = docstring.replace(placeholder, placeholder.lower())
    docstring = docstring.strip()

    for identifier in re.findall(r"\w+_\w+", docstring):
        docstring = docstring.replace(identifier, identifier.lower())

    tokens = docstring.split()
    shouted_words = re.findall(r"(?<=\s)[A-Z][A-Z0-9_]+", docstring)
    return len(tokens) > min_tokens and len(shouted_words) / len(tokens) > max_upper_share
660 | 
661 | 
def check_contain_too_many_variables(docstring):
    """
    Return True (docstring rejected) when snake_case and camelCase
    identifiers together exceed 30% of the whitespace-separated words.
    An empty docstring returns False.
    """
    max_share = 0.3
    words = docstring.strip().split()
    if len(words) == 0:
        return False

    # snake_case names are collected first and removed so they are not
    # counted a second time by the camelCase pattern
    snake_names = re.findall(r"\w+_\w+", docstring)
    for name in snake_names:
        docstring = docstring.replace(name, "").strip()

    camel_pattern = r"[A-Z]([A-Z0-9]*[a-z][a-z0-9]*[A-Z]|[a-z0-9]*[A-Z][A-Z0-9]*[a-z])[A-Za-z0-9]*"
    camel_names = [found.group() for found in re.finditer(camel_pattern, docstring)]

    return (len(snake_names) + len(camel_names)) / len(words) > max_share
682 | 
683 | 
def check_contain_too_many_method_call(docstring):
    """
    Return True (docstring rejected) when call-like expressions such as
    "foo.bar" or "foo(bar" exceed 20% of the whitespace-separated words.
    An empty docstring returns False.
    """
    max_share = 0.2
    words = docstring.strip().split()
    if len(words) == 0:
        return False

    call_pattern = re.compile(r"[a-zA-Z0-9]+((\.|\()[a-zA-Z0-9]+)+")
    # finditer, not findall: the pattern has groups and whole matches are counted
    call_count = sum(1 for _ in call_pattern.finditer(docstring))

    return call_count / len(words) > max_share
694 | 
695 | 
def camel_case_split(identifier):
    """
    Split a camelCase / PascalCase identifier at lower-to-upper boundaries,
    keeping acronym runs together ("HTTPServer" -> ["HTTP", "Server"]).
    """
    boundary = re.compile(r'.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)')
    pieces = []
    for found in boundary.finditer(identifier):
        pieces.append(found.group(0))
    return pieces
699 | 
700 | 
def snake_case_split(identifier):
    """Split a stripped identifier on underscores."""
    trimmed = identifier.strip()
    return trimmed.split("_")
703 | 
704 | 
def split_all_sepcial_case(docstring: str):
    """
    Tokenize the docstring, then break every token on underscores and on
    camelCase boundaries, returning the flat list of pieces.
    """
    return [
        piece
        for token in tokenize_docstring(docstring.strip())
        for chunk in snake_case_split(token)
        for piece in camel_case_split(chunk)
    ]
714 | 
def check_contain_many_long_word(docstring: str):
    """
    Return True (docstring rejected) when any word piece, after splitting on
    snake_case and camelCase boundaries, is longer than 30 characters.
    A docstring that yields no piece at all is rejected.
    """
    max_word_length = 30
    pieces = split_all_sepcial_case(docstring)

    if not pieces:
        return True

    return any(len(piece) > max_word_length for piece in pieces)
723 | 
724 | 
def check_contain_url(docstring: str):
    """
    Return True (docstring rejected) when it contains a URL: an
    http/https/ftp/file scheme, or a "www." / "ftp." prefix.
    """
    url_pattern = re.compile(r'(?:(?:https?|ftp|file):\/\/|www\.|ftp\.)(?:\([-A-Z0-9+&@#\/%=~_|$?!:,.]*\)|[-A-Z0-9+&@#\/%=~_|$?!:,.])*(?:\([-A-Z0-9+&@#\/%=~_|$?!:,.]*\)|[A-Z0-9+&@#\/%=~_|$])', flags=re.I)

    return url_pattern.search(docstring) is not None
731 | 
732 | # =================== End checking ======================
733 | 
def check_function(node, node_metadata: Dict[str, Any], exclude_list: List = None, is_class=False):
    """
    Decide whether a function node is worth keeping. The node is rejected if
        - it is an error node
        - its identifier is black-listed (built-in, test, constructor, ...)
        - it is an empty function (see check_is_empty_function)

    Args:
        node (tree_sitter.Node): function node
        node_metadata (Dict[str, Any]): metadata holding the node 'identifier'
        exclude_list (List): exclude name of function
        is_class (bool): not used by this function
    Return:
        bool: pass the check or not
    """
    node_identifier = node_metadata['identifier']

    # the checks run in this order and stop at the first failure
    rejected = (
        check_is_node_error(node)
        or check_is_black_node(node_identifier, exclude_list)
        or check_is_empty_function(node)
    )
    return not rejected
760 | 
761 | 
def check_docstring(docstring: str, loosen_filter: bool = False):
    """
    Run the docstring filters and report whether the docstring fails.

    Args:
        docstring (str): text to check
        loosen_filter (bool): skip the special-character, variable-name and
            method-call filters
    Return:
        bool: True if the docstring is empty or any filter rejects it,
            False if it passes every filter
    """
    if loosen_filter:
        checks = (
            check_docstring_contain_question,
            check_docstring_underdevelopment,
            check_docstring_autogenerated,
            check_docstring_contain_specific_pattern,
            check_contain_little_alphabet_char,
            check_contain_little_unique_chars,
            check_contain_little_unique_words,
            check_contain_many_uppercase_word,
            check_contain_many_long_word,
            check_contain_url,
        )
    else:
        checks = (
            check_docstring_contain_question,
            check_docstring_underdevelopment,
            check_docstring_autogenerated,
            check_docstring_contain_specific_pattern,
            check_contain_little_alphabet_char,
            check_contain_many_special_char,
            check_contain_little_unique_chars,
            check_contain_little_unique_words,
            check_contain_too_many_variables,
            check_contain_too_many_method_call,
            check_contain_many_uppercase_word,
            check_contain_many_long_word,
            check_contain_url,
        )

    # an empty docstring fails without running any filter
    if not docstring:
        return True

    # filters return True on failure; stop at the first one that fires
    return any(check(docstring) for check in checks)
839 | 
840 | 
def clean_docstring(docstring: str, loosen_filter: bool = False):
    """
    Clean docstring by removing special tag/url, characters, unrelevant information

    The comment delimiters are removed, then each paragraph (separated by a
    blank line) is cleaned and split into sentences. Sentences are kept up to
    the first one rejected by check_docstring; the rest of that paragraph is
    dropped.

    Args:
        docstring (str): raw docstring/comment
        loosen_filter (bool): forwarded to check_docstring
    Return:
        str or None: the cleaned docstring, or None when the input is empty,
            fails check_docstring_literal, a special tag cannot be removed,
            or the cleaned result is rejected by check_docstring_length
    """
    if docstring is None or docstring == '':
        return None
    _docstring = remove_comment_delimiters(docstring)
    if check_docstring_literal(_docstring):  # True is not pass
        return None

    cleaned_paragraphs = []
    for para in _docstring.strip().split('\n\n'):
        docs = remove_unrelevant(para)
        sentences = re.split(r'(?<=.)[.!\?](?=\s+)', docs, flags=re.M)
        kept = []
        for sentence in sentences:
            try:
                sentence = remove_special_tag(sentence)
            except Exception:
                # `except Exception` (not a bare except) so that
                # KeyboardInterrupt/SystemExit still propagate
                print('Oops')
                return None

            if check_docstring(sentence, loosen_filter):
                break
            kept.append(sentence)

        # a truncated paragraph gets an empty tail so the join below
        # terminates it with a full stop
        if len(kept) < len(sentences):
            kept.append('')
        cleaned_paragraphs.append('.'.join(kept))

    cleaned_docstring = '\n\n'.join(cleaned_paragraphs)

    if check_docstring_length(cleaned_docstring):
        return None

    return cleaned_docstring
888 | 
if __name__ == '__main__':
    # Manual smoke test: only the last sample is cleaned and printed; the
    # earlier sample lists are kept for the commented-out experiments.

    # test remove comment delimiters
    raw = [
        '// C, C++, C#',
        '/// C, C++, C#',
        
        '/*******'
        '* Java'
        '/*******',
        '//** Java */',
        
        '# Python',
        
        '//! Rust',
        '//!!! Rust',
        '/*!! Rust',
        '/*! Rust',
        
        '''
        /* The code below will print the words Hello World to the screen, and it is amazing 
        
        Somethin here too*/
        '''
    ]

    # for item in raw:
    #     print(remove_comment_delimiters(item))
        
    samples = [
        '\n\t\t/* 将JSONArray转换为Bean的List, 默认为ArrayList */',
        '// TODO: Why is he using Math.round?',
        '/* for now try mappig full type URI */',
        '// public String transformTypeID(URI typeuri){',
        '// return typeuri.toString();}',
        '/* Do we need to show the upgrade wizard prompt? */',
        '/* fixme: This function is not in use */',
        '// SampleEncryptionBox (senc) and SampleAuxiliaryInformation{Sizes|Offsets}Box',
        '/* This method initializes by me. The second line \n\n Abcdef*/',
        '/* @func_name_generated',
        '/* Auto-generated by IDE',
        '/ Auto-generated by IDE',
        '''
        /// Abc
        /// Abc
        /// Abc
        ''',
        '''
        /* Abc
         * def
         */
        '''
    ]
    
    # for item in samples:
    #     print(clean_docstring(item))
        
    samples = [
        '''
        Returns the Surface's pixel buffer if the Surface doesn't require locking.
        (e.g. it's a software surface)
        ''',
        '''
        Taking in a sequence string, return the canonical form of the sequence
        (e.g. the lexigraphically lowest of either the original sequence or its
        reverse complement)
        ''',
        '''
        Internal clear timeout. The function checks that the `id` was not removed
        (e.g. by `chart.destroy()`). For the details see
        [issue #7901](https://github.com/highcharts/highcharts/issues/7901).
        ''',
    ]
    
    # print('==== Cleaning ====')
    # for item in samples:
    #     print(clean_docstring(item))
        
    sample = '''
    Returns the message Id to use as heading text, depending on what types of
    usage are present (i.e. just writable files, or also readable directories,
    etc).
    |need_lifetime_text_at_end| is set to false iff the returned message Id
    already includes an explanation for how long a website will have access to
    the listed paths. It is set to true iff a separate label is needed at the end
    of the dialog to explain lifetime.
    '''
    print(sample)
    print('==== Cleaning ====')
    # clean_docstring returns a str (or None), not a tuple: print it whole
    # instead of indexing it with [0]
    print(clean_docstring(sample))
    
    # print(extract_docstring(sample, [], 'cpp'))
    
    # res = clean_docstring(sample)
    # print(res[0])
    # print(res[1])
    
    # sample = '''Convert java.util.regex.Matcher groups to JavaScript groups'''
    # print(check_contain_too_many_variables(sample))


--------------------------------------------------------------------------------
/src/codetext/codetext_cli.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from typing import List, Dict
  3 | 
  4 | from tabulate import tabulate
  5 | 
  6 | from .parser import *
  7 | from .utils import parse_code
  8 | 
  9 | 
 10 | def parse_file(file_path: str, language: str = None, verbose: bool = False) -> List:
 11 |     assert language != None, "Auto detect is not implemented, please specify language"
 12 |     language = str(language).lower()
 13 |     # assert (language in SUPPORT_LANGUAGE) == True, f"{language} is not supported"
 14 |     assert os.path.isfile(file_path) == True, "File not found"
 15 | 
 16 |     if verbose:
 17 |         print(50 * "=")
 18 |         print("Parse code into tree-sitter node")
 19 | 
 20 |     content: str = open(file_path, "r").read()
 21 |     root_node = parse_code(raw_code=content, language=language).root_node
 22 | 
 23 |     if language == "python":
 24 |         parser: LanguageParser = PythonParser
 25 |     elif language == "java":
 26 |         parser: LanguageParser = JavaParser
 27 |     elif language == "javascript":
 28 |         parser: LanguageParser = JavascriptParser
 29 |     elif language == "go":
 30 |         parser: LanguageParser = GoParser
 31 |     elif language in ["c", "c++"]:
 32 |         parser: LanguageParser = CppParser
 33 |     elif language == "c#":
 34 |         parser: LanguageParser = CsharpParser
 35 |     elif language == "rust":
 36 |         parser: LanguageParser = RustParser
 37 |     elif language == "ruby":
 38 |         parser: LanguageParser = RubyParser
 39 |     elif language == "php":
 40 |         parser: LanguageParser = PhpParser
 41 |     else:
 42 |         raise KeyError(f"{language} is not supported")
 43 | 
 44 |     if verbose:
 45 |         print(50 * "=")
 46 |         print("Get node detail")
 47 | 
 48 |     cls_list = parser.get_class_list(root_node)
 49 |     method_list = []
 50 |     cls_metadata = []
 51 |     for _cls in cls_list:
 52 |         cls_info = parser.get_class_metadata(_cls)
 53 |         cls_info["code"] = get_node_text(_cls)
 54 | 
 55 |         cls_method = []
 56 |         current_class_methods = parser.get_function_list(_cls)
 57 |         for method in current_class_methods:
 58 |             method_info = parser.get_function_metadata(method)
 59 |             method_info['code'] = get_node_text(method)
 60 |             cls_method.append(method_info)
 61 | 
 62 |         cls_info["method"] = cls_method
 63 |         cls_metadata.append(cls_info)
 64 |         method_list.extend(current_class_methods)
 65 | 
 66 |     fn_list: List = parser.get_function_list(root_node)
 67 |     for node in fn_list[:]:
 68 |         if node in method_list:
 69 |             fn_list.remove(node)
 70 | 
 71 |     fn_metadata = []
 72 |     for fn in fn_list:
 73 |         fn_metadata.append(parser.get_function_metadata(fn))
 74 | 
 75 |     output_metadata = {"class": cls_metadata, "function": fn_metadata}
 76 | 
 77 |     return output_metadata
 78 | 
 79 | 
 80 | def print_result(res: Dict, file_name: str = "no_name_file"):
 81 |     # ======== Print file name ========
 82 |     print("File {name} analyzed:".format(name=file_name))
 83 |     print(50 * "=")
 84 | 
 85 |     # ========= Summary =========
 86 |     print("Number of class    : {length}".format(length=len(res["class"])))
 87 |     print("Number of function : {length}".format(length=len(res["function"])))
 88 |     print(50 * "-" + "\n")
 89 | 
 90 |     # ========= Print class & method =========
 91 |     cls_headers = ["#", "Class", "Arguments"]
 92 |     cls_method_headers = ["#", "Method name", "Paramters", 
 93 |                           "Type", "Return type", "Throws"]
 94 |     cls_info = []
 95 |     method_info = {}
 96 |     for cls_idx, _cls in enumerate(res["class"]):
 97 |         cls_max_length = max(1, len(_cls["parameters"].keys()))
 98 |         for i in range(cls_max_length):
 99 |             clslist = [""] * len(cls_headers)
100 |             clslist[0] = cls_idx if i < 1 else ""
101 |             clslist[1] = _cls["identifier"] if i < 1 else ""
102 |             if _cls["parameters"].keys():
103 |                 clslist[2] = list(_cls["parameters"].keys())[i]
104 |             cls_info.append(clslist)
105 | 
106 |         _method_info = []
107 |         for idx, method in enumerate(_cls["method"]):
108 |             max_length = max(1, len(method["parameters"].keys()))
109 |             for i in range(max_length):
110 |                 sublist = [""] * len(cls_method_headers)
111 |                 sublist[0] = idx if i < 1 else ""
112 |                 sublist[1] = method["identifier"] if i < 1 else ""
113 |                 if method["parameters"].keys():
114 |                     sublist[2] = list(method["parameters"].keys())[i]
115 |                     sublist[3] = list(method["parameters"].values())[i]
116 |                 sublist[4] = (
117 |                     method["return_type"]
118 |                     if i <= 1 and method["return_type"] != "<not_specific>"
119 |                     else ""
120 |                 )
121 |                 sublist[5] = (
122 |                     method["throws"]
123 |                     if i <= 1 and "throws" in method.keys()
124 |                     else ""
125 |                 )
126 |                 _method_info.append(sublist)
127 | 
128 |             method_info[file_name] = [_cls["identifier"], _method_info]
129 | 
130 |     if cls_info:
131 |         print("Class summary:")
132 |         print(tabulate(cls_info, headers=cls_headers, tablefmt="outline"))
133 |         print("\n")
134 | 
135 |         for _, info in method_info.items():
136 |             name, info = info
137 |             print("Class analyse: {name}".format(name=name))
138 |             print(tabulate(info, headers=cls_method_headers, tablefmt="outline"))
139 |             print("\n")
140 | 
141 |     # ========= Print stand alone function =========
142 |     fn_headers = ["#", "Function name", "Paramters", "Type", "Return type"]
143 |     function_info = []
144 | 
145 |     for idx, fn in enumerate(res["function"]):
146 |         max_length = max(1, len(fn["parameters"].keys()))
147 |         for i in range(max_length):
148 |             sublist = [""] * len(fn_headers)
149 |             sublist[0] = idx if i < 1 else ""
150 |             sublist[1] = fn["identifier"] if i < 1 else ""
151 |             if fn["parameters"].keys():
152 |                 sublist[2] = list(fn["parameters"].keys())[i]
153 |                 sublist[3] = list(fn["parameters"].values())[i]
154 |             sublist[4] = (
155 |                 fn["return_type"]
156 |                 if i <= 1 and fn["return_type"] != "<not_specific>"
157 |                 else ""
158 |             )
159 |             function_info.append(sublist)
160 | 
161 |     if function_info:
162 |         print("Function analyse:")
163 |         print(tabulate(function_info, headers=fn_headers, tablefmt="outline"))
164 |         print("\n")
165 | 
166 |     elif not method_info:
167 |         print("File empty")
168 |         print("\n")
169 | 
170 | 
171 | PL_MATCHING = {
172 |     "Java": [".java"],
173 |     "JavaScript": [
174 |         ".js",
175 |         "._js",
176 |         ".bones",
177 |         ".es6",
178 |         ".jake",
179 |         ".jsb",
180 |         ".jscad",
181 |         ".jsfl",
182 |         ".jsm",
183 |         ".jss",
184 |         ".njs",
185 |         ".pac",
186 |         ".sjs",
187 |         ".ssjs",
188 |         ".xsjs",
189 |         ".xsjslib",
190 |     ],
191 |     "Python": [
192 |         ".py",
193 |         ".bzl",
194 |         ".gyp",
195 |         ".lmi",
196 |         ".pyde",
197 |         ".pyp",
198 |         ".pyt",
199 |         ".pyw",
200 |         ".tac",
201 |         ".wsgi",
202 |         ".xpy",
203 |     ],
204 |     "PHP": [".php", ".aw", ".ctp", ".php3", ".php4", ".php5", ".phps", ".phpt"],
205 |     "Go": [".go"],
206 |     "Rust": [".rs", ".rs.in"],
207 |     "Ruby": [
208 |         ".rb",
209 |         ".builder",
210 |         ".gemspec",
211 |         ".god",
212 |         ".irbrc",
213 |         ".jbuilder",
214 |         ".mspec",
215 |         ".podspec",
216 |         ".rabl",
217 |         ".rake",
218 |         ".rbuild",
219 |         ".rbw",
220 |         ".rbx",
221 |         ".ru",
222 |         ".ruby",
223 |         ".thor",
224 |         ".watchr",
225 |     ],
226 |     "C": [".c", ".cats", ".h", ".idc", ".w"],
227 |     "C#": [".cs", ".cake", ".cshtml", ".csx"],
228 |     "C++": [
229 |         ".cpp",
230 |         ".c++",
231 |         ".cc",
232 |         ".cp",
233 |         ".cxx",
234 |         ".h++",
235 |         ".hh",
236 |         ".hpp",
237 |         ".hxx",
238 |         ".inl",
239 |         ".ipp",
240 |         ".tcc",
241 |         ".tpp",
242 |         ".C",
243 |         ".H",
244 |     ],
245 | }
246 | 


--------------------------------------------------------------------------------
/src/codetext/parser/README.md:
--------------------------------------------------------------------------------
1 | # Parser Appendix
2 | 
3 | With the `codetext` parser, we support extracting several function types; however, because it relies on `tree-sitter` grammars, some functions or some languages might not be fully supported.
4 | 
5 | This is the list of currently supported functions:
6 | 
7 | 


--------------------------------------------------------------------------------
/src/codetext/parser/__init__.py:
--------------------------------------------------------------------------------
 1 | """Codetext parser
 2 | Parse code to get docstring node, comment node
 3 | """
 4 | from .go_parser import GoParser
 5 | from .php_parser import PhpParser
 6 | from .ruby_parser import RubyParser
 7 | from .java_parser import JavaParser
 8 | from .javascript_parser import JavascriptParser
 9 | from .python_parser import PythonParser
10 | from .cpp_parser import CppParser
11 | from .c_sharp_parser import CsharpParser
12 | from .rust_parser import RustParser
13 | from .language_parser import LanguageParser, get_node_by_kind, get_node_text, \
14 |     tokenize_code, tokenize_docstring, nodes_are_equal
15 |     
16 | SUPPORT_LANGUAGE = [
17 |     "go", "php", "ruby", "java", "javascript", 
18 |     "python", "cpp", "c", "c_sharp", "rust"
19 | ]
20 | 
21 | __all__ = [
22 |     'GoParser', 'PhpParser', 'RubyParser', 'JavaParser', 'JavascriptParser',
23 |     'PythonParser', 'CppParser', 'CsharpParser', 'RustParser', 'LanguageParser',
24 |     'get_node_by_kind', 'get_node_text', 'tokenize_code', 'tokenize_docstring',
25 |     'nodes_are_equal'
26 | ]
27 | 


--------------------------------------------------------------------------------
/src/codetext/parser/c_sharp_parser.py:
--------------------------------------------------------------------------------
  1 | from typing import List, Dict, Any
  2 | import tree_sitter
  3 | import logging
  4 | 
  5 | from .language_parser import LanguageParser, get_node_by_kind, get_node_text
  6 | 
  7 | logger = logging.getLogger(name=__name__)
  8 | 
  9 | 
 10 | class CsharpParser(LanguageParser):
 11 |     
 12 |     BLACKLISTED_FUNCTION_NAMES = []
 13 |     
 14 |     @staticmethod
 15 |     def get_docstring(node, blob=None):
 16 |         """
 17 |         Get docstring description for node
 18 |         
 19 |         Args:
 20 |             node (tree_sitter.Node)
 21 |             blob (str): original source code which parse the `node`
 22 |         Returns:
 23 |             str: docstring
 24 |         """
 25 |         if blob:
 26 |             logger.info('From version `0.0.6` this function will update argument in the API')
 27 |         docstring_node = CsharpParser.get_docstring_node(node)
 28 |         docstring = '\n'.join(get_node_text(s) for s in docstring_node)
 29 |         return docstring
 30 |     
 31 |     @staticmethod
 32 |     def get_docstring_node(node):
 33 |         """
 34 |         Get docstring node from it parent node.
 35 |         C# docstring is written line by line and stay outside it own node, see example below.
 36 |         
 37 |         Args:
 38 |             node (tree_sitter.Node): parent node (usually function node) to get its docstring
 39 |         Return:
 40 |             List: list of docstring nodes
 41 |         Example:
 42 |             str = '''
 43 |                 // <summary>
 44 |                 // Docstring of a method
 45 |                 // </summary>
 46 |                 // <param name="animal_honk">Argument.</param>
 47 |                 // <returns>
 48 |                 // None.
 49 |                 public void honk(string animal_honk)
 50 |                 {                    
 51 |                     Console.WriteLine(animal_honk);
 52 |                     Console.WriteLine("Tuut, tuut!");
 53 |                 }
 54 |             '''
 55 |             ...
 56 |             print(C_sharp.get_docstring_node(function_node))
 57 |             
 58 |             >>> [<Node type=comment, start_point=(5, 12), end_point=(5, 24)>, \
 59 |                 <Node type=comment, start_point=(6, 12), end_point=(6, 36)>, \
 60 |                 <Node type=comment, start_point=(7, 12), end_point=(7, 25)>, \
 61 |                 <Node type=comment, start_point=(8, 12), end_point=(8, 58)>, \
 62 |                 <Node type=comment, start_point=(9, 12), end_point=(9, 24)>, \
 63 |                 <Node type=comment, start_point=(10, 12), end_point=(10, 20)>]
 64 |         """
 65 |         docstring_node = []
 66 |         
 67 |         prev_node = node.prev_sibling
 68 |         if prev_node and prev_node.type == 'comment':
 69 |             docstring_node.append(prev_node)
 70 |             prev_node = prev_node.prev_sibling
 71 | 
 72 |         while prev_node and prev_node.type == 'comment':
 73 |             # Assume the comment is dense
 74 |             x_current = prev_node.start_point[0]
 75 |             x_next = prev_node.next_sibling.start_point[0]
 76 |             if x_next - x_current > 1:
 77 |                 break
 78 |             
 79 |             docstring_node.insert(0, prev_node)    
 80 |             prev_node = prev_node.prev_sibling
 81 |             
 82 |         return docstring_node
 83 |     
 84 |     @staticmethod
 85 |     def get_comment_node(node):
 86 |         """
 87 |         Return all comment node inside a parent node
 88 |         Args:
 89 |             node (tree_sitter.Node)
 90 |         Return:
 91 |             List: list of comment nodes
 92 |         """
 93 |         comment_node = get_node_by_kind(node, kind=['comment'])
 94 |         return comment_node
 95 |     
 96 |     @staticmethod
 97 |     def get_function_list(node):
 98 |         res = get_node_by_kind(node, ['local_function_statement', 'method_declaration'])
 99 |         # We don't use "constructor_declaration"
100 |         return res
101 | 
102 |     @staticmethod
103 |     def get_class_list(node):
104 |         res = get_node_by_kind(node, ['class_declaration'])
105 |         return res
106 | 
107 |     @staticmethod
108 |     def get_function_metadata(function_node, blob: str = None) -> Dict[str, Any]:
109 |         """
110 |         Function metadata contains:
111 |             - identifier (str): function name
112 |             - parameters (Dict[str, str]): parameter's name and their type (e.g: {'param_a': 'int'})
113 |             - type (str): type
114 |         """
115 |         metadata = {
116 |             'identifier': '',
117 |             'parameters': {},
118 |             'return_type': None
119 |         }
120 |         assert type(function_node) == tree_sitter.Node
121 |         
122 |         for child in function_node.children:
123 |             if child.type in ['predefined_type', 'generic_name']:
124 |                 metadata['return_type'] = get_node_text(child)
125 |             elif child.type == 'identifier':
126 |                 if child.next_named_sibling.type != 'parameter_list':
127 |                     metadata['return_type'] = get_node_text(child)
128 |                 else:
129 |                     metadata['identifier'] = get_node_text(child)
130 |             elif child.type == 'parameter_list':
131 |                 for param_node in child.children:
132 |                     param_nodes = get_node_by_kind(param_node, ['parameter'])
133 |                     for param in param_nodes:
134 |                         if len(param.children) > 1:
135 |                             param_type = get_node_text(param.children[0])
136 |                             param_name = get_node_text(param.children[1])
137 |                             metadata['parameters'][param_name] = param_type
138 |                         
139 |                         else:
140 |                             param_name = get_node_text(param.children[0])
141 |                             metadata['parameters'][param_name] = None
142 |                         # for node in param.children:
143 |                         #     if node.type in ['array_type', 'implicit_type', \
144 |                         #         'nullable_type', 'pointer_type', 'function_pointer_type', \
145 |                         #         'predefined_type', 'tuple_type']:
146 |                         #         param_type = get_node_text(node)
147 |                         #     elif node.type == 'identifier':
148 |                         #         param_identifier = get_node_text(node)
149 |                                 
150 |                         # param_type = get_node_text(param.child_by_field_name('type'))
151 |                         # param_identifier = get_node_text(param.child_by_field_name('name'))
152 |         return metadata
153 | 
154 |     @staticmethod
155 |     def get_class_metadata(class_node, blob: str=None) -> Dict[str, str]:
156 |         """
157 |         Class metadata contains:
158 |             - identifier (str): class's name
159 |             - parameters (List[str]): inheritance class
160 |         """
161 |         if blob:
162 |             logger.info('From version `0.0.6` this function will update argument in the API')
163 |         metadata = {
164 |             'identifier': '',
165 |             'parameters': {},
166 |         }
167 |         assert type(class_node) == tree_sitter.Node
168 |         
169 |         for child in class_node.children:
170 |             if child.type == 'identifier':
171 |                 metadata['identifier'] = get_node_text(child)
172 |             elif child.type == 'base_list':
173 |                 for arg in child.children:
174 |                     if arg.type == 'identifier':
175 |                         metadata['parameters'][get_node_text(arg)] = None
176 |                         # argument_list.append(get_node_text(arg))
177 |                 # metadata['parameters'] = argument_list
178 | 
179 |         return metadata
180 |     
181 | 


--------------------------------------------------------------------------------
/src/codetext/parser/cpp_parser.py:
--------------------------------------------------------------------------------
  1 | from typing import List, Dict, Any
  2 | 
  3 | import tree_sitter
  4 | import logging
  5 | 
  6 | from .language_parser import LanguageParser, get_node_text, get_node_by_kind
  7 | 
  8 | logger = logging.getLogger(name=__name__)
  9 | 
 10 | 
 11 | class CppParser(LanguageParser):
 12 |     
 13 |     BLACKLISTED_FUNCTION_NAMES = ['main', 'constructor']
 14 |     
 15 |     @staticmethod
 16 |     def get_docstring(node, blob=None):
 17 |         """
 18 |         Get docstring description for node
 19 |         
 20 |         Args:
 21 |             node (tree_sitter.Node)
 22 |             blob (str): original source code which parse the `node`
 23 |         Returns:
 24 |             str: docstring
 25 |         """
 26 |         if blob:
 27 |             logger.info('From version `0.0.6` this function will update argument in the API')
 28 |         docstring_node = CppParser.get_docstring_node(node)
 29 |         docstring = '\n'.join(get_node_text(s) for s in docstring_node)
 30 |         return docstring
 31 |     
 32 |     @staticmethod
 33 |     def get_docstring_node(node):
 34 |         """
 35 |         Get docstring node from it parent node.
 36 |         C and C++ share the same syntax. Their docstring usually is 1 single block
 37 |         Expect length of return list == 1
 38 |         
 39 |         Args:
 40 |             node (tree_sitter.Node): parent node (usually function node) to get its docstring
 41 |         Return:
 42 |             List: list of docstring nodes (expect==1)
 43 |         Example:
 44 |             str = '''
 45 |                 /**
 46 |                 * Find 2 sum
 47 |                 *
 48 |                 * @param nums List number.
 49 |                 * @param target Sum target.
 50 |                 * @return postion of 2 number.
 51 |                 */
 52 |                 vector<int> twoSum(vector<int>& nums, int target) {
 53 |                     ...
 54 |                 }
 55 |             '''
 56 |             ...
 57 |             print(CppParser.get_docstring_node(function_node))
 58 |             
 59 |             >>> [<Node type=comment, start_point=(x, y), end_point=(x, y)>]
 60 |         """
 61 |         docstring_node = []
 62 |         
 63 |         prev_node = node.prev_sibling
 64 |         if prev_node and prev_node.type == 'comment':
 65 |             docstring_node.append(prev_node)
 66 |             prev_node = prev_node.prev_sibling
 67 | 
 68 |         while prev_node and prev_node.type == 'comment':
 69 |             # Assume the comment is dense
 70 |             x_current = prev_node.start_point[0]
 71 |             x_next = prev_node.next_sibling.start_point[0]
 72 |             if x_next - x_current > 1:
 73 |                 break
 74 |             
 75 |             docstring_node.insert(0, prev_node)    
 76 |             prev_node = prev_node.prev_sibling
 77 |         
 78 |         return docstring_node
 79 |     
 80 |     @staticmethod
 81 |     def get_function_list(node):
 82 |         res = get_node_by_kind(node, ['function_definition'])
 83 |         return res
 84 | 
 85 |     @staticmethod
 86 |     def get_class_list(node):
 87 |         res = get_node_by_kind(node, ['class_specifier'])
 88 |         return res
 89 |         
 90 |     @staticmethod
 91 |     def get_comment_node(node):
 92 |         """
 93 |         Return all comment node inside a parent node
 94 |         Args:
 95 |             node (tree_sitter.Node)
 96 |         Return:
 97 |             List: list of comment nodes
 98 |         """
 99 |         comment_node = get_node_by_kind(node, kind=['comment'])
100 |         return comment_node
101 |     
102 |     @staticmethod
103 |     def get_function_metadata(function_node, blob: str=None) -> Dict[str, Any]:
104 |         """
105 |         Function metadata contains:
106 |             - identifier (str): function name
107 |             - parameters (Dict[str, str]): parameter's name and their type (e.g: {'param_a': 'int'})
108 |             - return_type (str or NoneType): function's return type
109 |         """
110 |         if blob:
111 |             logger.info('From version `0.0.6` this function will update argument in the API')
112 |         metadata = {
113 |             'identifier': '',
114 |             'parameters': {},
115 |             'return_type': None,
116 |         }
117 |         assert type(function_node) == tree_sitter.Node
118 |         
119 |         for child in function_node.children:
120 |             if child.type in ['primitive_type', 'type_identifier']:
121 |                 metadata['return_type'] = get_node_text(child)
122 |                 # search for "function_declarator"
123 |             elif child.type == 'pointer_declarator':
124 |                 for subchild in child.children:
125 |                     if subchild.type == 'function_declarator':
126 |                         child = subchild
127 |             if child.type == 'function_declarator':
128 |                 for subchild in child.children:
129 |                     if subchild.type in ['qualified_identifier', 'identifier', 'field_identifier']:
130 |                         metadata['identifier'] = get_node_text(subchild)
131 |                     elif subchild.type == 'parameter_list':
132 |                         param_nodes = get_node_by_kind(subchild, ['parameter_declaration'])
133 |                         for param in param_nodes:
134 |                             param_type = param.child_by_field_name('type')
135 |                             param_type = get_node_text(param_type)
136 |                             list_name = get_node_by_kind(param, ['identifier'])
137 |                             if not list_name:
138 |                                 continue
139 |                             param_name = get_node_text(list_name[0])
140 |                             metadata['parameters'][param_name] = param_type
141 |                             # for item in param.children:
142 |                                 
143 |                             #     if item.type in ['type_identifier', 'primitive_type']:
144 |                             #         param_type = get_node_text(item)
145 |                             #     elif item.type == 'identifier':
146 |                             #         param_identifier = get_node_text(item)
147 | 
148 |         return metadata
149 | 
150 |     @staticmethod
151 |     def get_class_metadata(class_node, blob: str=None) -> Dict[str, str]:
152 |         """
153 |         Class metadata contains:
154 |             - identifier (str): class's name
155 |             - parameters (List[str]): inheritance class
156 |         """
157 |         if blob:
158 |             logger.info('From version `0.0.6` this function will update argument in the API')
159 |         metadata = {
160 |             'identifier': '',
161 |             'parameters': {},
162 |         }
163 |         assert type(class_node) == tree_sitter.Node
164 |         
165 |         for child in class_node.children:
166 |             if child.type == 'type_identifier':
167 |                 metadata['identifier'] = get_node_text(child)
168 |             elif child.type == 'base_class_clause':
169 |                 argument_list = []
170 |                 for param in child.children:
171 |                     if param.type == 'type_identifier':
172 |                         metadata['parameters'][get_node_text(param)] = None
173 |                         # argument_list.append(get_node_text(param))
174 |                 # metadata['parameters'] = argument_list
175 | 
176 |         return metadata
177 | 


--------------------------------------------------------------------------------
/src/codetext/parser/go_parser.py:
--------------------------------------------------------------------------------
  1 | from typing import List, Dict, Any
  2 | import logging
  3 | 
  4 | from .language_parser import LanguageParser, get_node_by_kind, get_node_text
  5 | 
  6 | 
  7 | logger = logging.getLogger(__name__)
  8 | 
  9 | 
 10 | class GoParser(LanguageParser):
 11 | 
 12 |     BLACKLISTED_FUNCTION_NAMES = ['test', 'vendor']
 13 |     
 14 |     @staticmethod
 15 |     def get_comment_node(function_node):
 16 |         """
 17 |         Return all comment node inside a parent node
 18 |         Args:
 19 |             node (tree_sitter.Node)
 20 |         Return:
 21 |             List: list of comment nodes
 22 |         """
 23 |         comment_node = get_node_by_kind(function_node, kind='comment')
 24 |         return comment_node
 25 |     
 26 |     @staticmethod
 27 |     def get_docstring_node(node):
 28 |         """
 29 |         Get docstring node from it parent node.
 30 |         Go's docstring is written line by line
 31 |         
 32 |         Args:
 33 |             node (tree_sitter.Node): parent node (usually function node) to get its docstring
 34 |         Return:
 35 |             List: list of docstring nodes
 36 |         Example:
 37 |             str = '''
 38 |                 // The path package should only be used for paths separated by forward
 39 |                 // slashes, such as the paths in URLs. This package does not deal with
 40 |                 // Windows paths with drive letters or backslashes; to manipulate
 41 |                 // operating system paths, use the [path/filepath] package.
 42 |                 func (e TypeError) Error() string {
 43 |                     ...
 44 |                 }
 45 |             '''
 46 |             ...
 47 |             print(GoParser.get_docstring_node(function_node))
 48 |             
 49 |             >>> [<Node type=comment, start_point=(x, y), end_point=(x, y)>, \
 50 |                 <Node type=comment, start_point=(x, y), end_point=(x, y)>, \
 51 |                 <Node type=comment, start_point=(x, y), end_point=(x, y)>, \
 52 |                 <Node type=comment, start_point=(x, y), end_point=(x, y)>]
 53 |         """
 54 |         docstring_node = []
 55 |         
 56 |         prev_node = node.prev_sibling
 57 |         if prev_node and prev_node.type == 'comment':
 58 |             docstring_node.append(prev_node)
 59 |             prev_node = prev_node.prev_sibling
 60 | 
 61 |         while prev_node and prev_node.type == 'comment':
 62 |             # Assume the comment is dense
 63 |             x_current = prev_node.start_point[0]
 64 |             x_next = prev_node.next_sibling.start_point[0]
 65 |             if x_next - x_current > 1:
 66 |                 break
 67 |             
 68 |             docstring_node.insert(0, prev_node)    
 69 |             prev_node = prev_node.prev_sibling
 70 |             
 71 |         return docstring_node
 72 |     
 73 |     @staticmethod
 74 |     def get_docstring(node, blob:str=None):
 75 |         """
 76 |         Get docstring description for node
 77 |         
 78 |         Args:
 79 |             node (tree_sitter.Node)
 80 |             blob (str): original source code which parse the `node`
 81 |         Returns:
 82 |             str: docstring
 83 |         """
 84 |         if blob:
 85 |             logger.info('From version `0.0.6` this function will update argument in the API')
 86 |         docstring_node = GoParser.get_docstring_node(node)
 87 |         docstring = '\n'.join(get_node_text(s) for s in docstring_node)
 88 |         return docstring
 89 |     
 90 |     @staticmethod
 91 |     def get_function_list(node):
 92 |         res = get_node_by_kind(node, ['method_declaration', 'function_declaration'])
 93 |         return res
 94 |     
 95 |     @staticmethod
 96 |     def get_function_metadata(function_node, blob: str=None) -> Dict[str, str]:
 97 |         if blob:
 98 |             logger.info('From version `0.0.6` this function will update argument in the API')
 99 |         metadata = {
100 |             'identifier': '',
101 |             'parameters': {},
102 |             'return_type': None,
103 |         }
104 |         
105 |         for child in function_node.children:
106 |             if child.type in ['field_identifier', 'identifier']:
107 |                 metadata['identifier'] = get_node_text(child)
108 |             elif child.type == 'type_identifier':
109 |                 metadata['return_type'] = get_node_text(child)
110 |             elif child.type == 'parameter_list':
111 |                 for subchild in child.children:
112 |                     if subchild.type in ['parameter_declaration', 'variadic_parameter_declaration']:
113 |                         identifier_node = subchild.child_by_field_name('name')
114 |                         
115 |                         if not identifier_node:
116 |                             continue
117 |                         
118 |                         param_type = get_node_text(subchild.child_by_field_name('type'))
119 |                         identifier = get_node_text(identifier_node)
120 |                         if identifier and param_type:
121 |                             metadata['parameters'][identifier] = param_type
122 |         
123 |         return metadata
124 | 
125 |     @staticmethod
126 |     def get_class_list(node):
127 |         pass
128 |     
129 |     @staticmethod
130 |     def get_class_metadata(class_node, blob=None) -> Dict[str, str]:
131 |         if blob:
132 |             logger.info('From version `0.0.6` this function will update argument in the API')
133 |         pass
134 | 


--------------------------------------------------------------------------------
/src/codetext/parser/java_parser.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | from typing import List, Dict, Any
  3 | import logging
  4 | 
  5 | from .language_parser import LanguageParser, get_node_by_kind, get_node_text
  6 | 
  7 | 
  8 | logger = logging.getLogger(__name__)
  9 | 
 10 | 
 11 | class JavaParser(LanguageParser):
 12 | 
 13 |     FILTER_PATHS = ('test', 'tests')
 14 | 
 15 |     BLACKLISTED_FUNCTION_NAMES = ['toString', 'hashCode', 'equals', 'finalize', 'notify', 'notifyAll', 'clone']
 16 | 
 17 |     @staticmethod
 18 |     def get_docstring_node(node):
 19 |         """
 20 |         Get docstring node from it parent node. Expect return list have length==1
 21 |         
 22 |         Args:
 23 |             node (tree_sitter.Node): parent node (usually function node) to get its docstring
 24 |         Return:
 25 |             List: list of docstring nodes
 26 |         """
 27 |         docstring_node = []
 28 |         
 29 |         if node.prev_sibling:
 30 |             prev_node = node.prev_sibling
 31 |             if prev_node.type == 'block_comment' or prev_node.type == 'line_comment':
 32 |                 docstring_node.append(prev_node)
 33 |         
 34 |         return docstring_node
 35 | 
 36 |     @staticmethod
 37 |     def get_docstring(node, blob=None):
 38 |         """
 39 |         Get docstring description for node
 40 |         
 41 |         Args:
 42 |             node (tree_sitter.Node)
 43 |             blob (str): original source code which parse the `node`
 44 |         Returns:
 45 |             str: docstring
 46 |         """
 47 |         if blob:
 48 |             logger.info('From version `0.0.6` this function will update argument in the API')
 49 |         docstring_node = JavaParser.get_docstring_node(node)
 50 | 
 51 |         docstring = ''
 52 |         if docstring_node:
 53 |             docstring = get_node_text(docstring_node[0])
 54 |         return docstring
 55 | 
 56 |     @staticmethod
 57 |     def get_comment_node(function_node):
 58 |         """
 59 |         Return all comment node inside a parent node
 60 |         Args:
 61 |             node (tree_sitter.Node)
 62 |         Return:
 63 |             List: list of comment nodes
 64 |         """
 65 |         comment_node = get_node_by_kind(function_node, kind=['line_comment'])
 66 |         return comment_node
 67 |     
 68 |     @staticmethod
 69 |     def get_class_list(node):
 70 |         res = get_node_by_kind(node, ['class_declaration'])
 71 |         return res
 72 |     
 73 |     @staticmethod
 74 |     def get_function_list(node):
 75 |         res = get_node_by_kind(node, ['method_declaration'])
 76 |         return res
 77 |     
 78 |     @staticmethod
 79 |     def is_method_body_empty(node):
 80 |         for c in node.children:
 81 |             if c.type in {'method_body', 'constructor_body'}:
 82 |                 if c.start_point[0] == c.end_point[0]:
 83 |                     return True
 84 |     
 85 |     @staticmethod
 86 |     def get_class_metadata(class_node, blob: str=None) -> Dict[str, str]:
 87 |         if blob:
 88 |             logger.info('From version `0.0.6` this function will update argument in the API')
 89 |         metadata = {
 90 |             'identifier': '',
 91 |             'parameters': {},
 92 |         }
 93 |         argument_list = []
 94 |         for child in class_node.children:
 95 |             if child.type == 'identifier':
 96 |                 metadata['identifier'] = get_node_text(child)
 97 |             elif child.type == 'superclass' or child.type == 'super_interfaces':
 98 |                 for subchild in child.children:
 99 |                     if subchild.type == 'type_list' or subchild.type == 'type_identifier':
100 |                         metadata['parameters'][get_node_text(subchild)] = None
101 |                         # argument_list.append(get_node_text(subchild))
102 |                     
103 |         # metadata['parameters'] = argument_list
104 |         return metadata
105 | 
106 |     @staticmethod
107 |     def get_function_metadata(function_node, blob: str = None) -> Dict[str, str]:
108 |         metadata = {
109 |             'identifier': '',
110 |             'parameters': {},
111 |             'return_type': None
112 |         }
113 |         
114 |         return_kinds = ["void_type", 
115 |                         "integral_type",
116 |                         "floating_point_type",
117 |                         "boolean_type",
118 |                         "type_identifier",
119 |                         "scoped_type_identifier",
120 |                         "generic_type"]
121 |         
122 | 
123 |         for child in function_node.children:
124 |             if child.type == 'identifier':
125 |                 metadata['identifier'] = get_node_text(child)    
126 |             elif child.type in return_kinds:
127 |                 metadata['return_type'] = get_node_text(child)
128 |             elif child.type == 'throws':
129 |                 for subchild in child.children:
130 |                     if 'identifier' in subchild.type:
131 |                         metadata['throws'] = get_node_text(subchild)
132 |             elif child.type == 'formal_parameters':
133 |                 param_list = get_node_by_kind(child, ['formal_parameter'])  # speed_parameter
134 |                 for param in param_list:
135 |                     param_type = get_node_text(param.child_by_field_name('type'))
136 |                     identifier = get_node_text(param.child_by_field_name('name'))
137 |                     metadata['parameters'][identifier] = param_type
138 |         
139 |         
140 |         return metadata


--------------------------------------------------------------------------------
/src/codetext/parser/javascript_parser.py:
--------------------------------------------------------------------------------
  1 | from typing import List, Dict, Any
  2 | import logging
  3 | 
  4 | from .language_parser import LanguageParser, get_node_text, get_node_by_kind
  5 | 
  6 | 
  7 | logger = logging.getLogger(__name__)
  8 | 
  9 | 
 10 | class JavascriptParser(LanguageParser):
 11 | 
 12 |     FILTER_PATHS = ('test', 'node_modules')
 13 | 
 14 |     BLACKLISTED_FUNCTION_NAMES = ['toString', 'toLocaleString', 'valueOf', 'constructor']
 15 | 
 16 |     @staticmethod
 17 |     def get_docstring_node(node):
 18 |         docstring_node = []
 19 |         prev_node = node.prev_sibling
 20 |         parent_node = node.parent
 21 |                 
 22 |         if prev_node and prev_node.type == 'comment':
 23 |             docstring_node.append(prev_node)
 24 |         
 25 |         elif parent_node:
 26 |             if parent_node.type != 'class_body':  # node not inside a class
 27 |                 prev_node = parent_node.prev_sibling
 28 |                 if prev_node and prev_node.type == 'comment':
 29 |                     docstring_node.append(prev_node)
 30 |             
 31 |         return docstring_node
 32 |     
 33 |     @staticmethod
 34 |     def get_docstring(node, blob=None):
 35 |         if blob:
 36 |             logger.info('From version `0.0.6` this function will update argument in the API')
 37 |         docstring_node = JavascriptParser.get_docstring_node(node)
 38 |         
 39 |         docstring = ''
 40 |         if docstring_node:
 41 |             docstring = get_node_text(docstring_node[0])
 42 |         return docstring
 43 |     
 44 |     @staticmethod
 45 |     def get_comment_node(function_node):
 46 |         comment_node = get_node_by_kind(function_node, kind=['comment'])
 47 |         return comment_node
 48 |     
 49 |     @staticmethod
 50 |     def get_function_list(node):
 51 |         function_types = ['function_declaration',
 52 |                     'function',
 53 |                     'method_definition',
 54 |                     'generator_function_declaration',
 55 |                     'arrow_function',
 56 |                     'generator_function']
 57 |         res = get_node_by_kind(node, function_types)
 58 |         for node in res[:]:
 59 |             if not node.children:
 60 |                 res.remove(node)
 61 | 
 62 |         return res
 63 |     
 64 |     @staticmethod
 65 |     def get_class_list(node):
 66 |         res = get_node_by_kind(node, ['class_declaration', 'class'])
 67 |         for node in res[:]:
 68 |             if not node.children:
 69 |                 res.remove(node)
 70 | 
 71 |         return res
 72 | 
 73 |     @staticmethod
 74 |     def get_function_metadata(function_node, blob: str=None) -> Dict[str, str]:
 75 |         if blob:
 76 |             logger.info('From version `0.0.6` this function will update argument in the API')
 77 |         metadata = {
 78 |             'identifier': '',
 79 |             'parameters': {},
 80 |             'return_type': None,
 81 |         }
 82 |         param = []
 83 |         for child in function_node.children:
 84 |             if child.type in ['identifier', 'property_identifier']:
 85 |                 metadata['identifier'] = get_node_text(child)
 86 |             elif child.type == 'formal_parameters':
 87 |                 params = get_node_by_kind(child, ['identifier'])
 88 |                 for param in params:
 89 |                     identifier = get_node_text(param)
 90 |                     metadata['parameters'][identifier] = None  # JS not have type define
 91 |         
 92 |         return_statement = get_node_by_kind(function_node, ['return_statement'])
 93 |         if len(return_statement) > 0:
 94 |             metadata['return_type'] = '<not_specific>'
 95 |             
 96 |         if function_node.type in ["function",
 97 |                                   "arrow_function",
 98 |                                   "generator_function"]:
 99 |             # function inside object property or variable declarator
100 |             identifier = function_node.prev_named_sibling
101 |             if identifier:
102 |                 if identifier.type in ["identifier"]:
103 |                     metadata["identifier"] = identifier.text.decode()
104 |         
105 |         return metadata
106 | 
107 |     @staticmethod
108 |     def get_class_metadata(class_node, blob=None):
109 |         if blob:
110 |             logger.info('From version `0.0.6` this function will update argument in the API')
111 |         metadata = {
112 |             'identifier': '',
113 |             'parameters': {},
114 |         }
115 |         param = []
116 |         for child in class_node.children:
117 |             if child.type == 'identifier':
118 |                 metadata['identifier'] = get_node_text(child)
119 |             elif child.type == 'class_heritage':
120 |                 for subchild in child.children:
121 |                     if subchild.type == 'identifier':
122 |                         metadata['parameters'][get_node_text(subchild)] = None
123 |                         # param.append(get_node_text(subchild))
124 |                         
125 |         # metadata['parameters'] = param
126 |         return metadata
127 | 


--------------------------------------------------------------------------------
/src/codetext/parser/language_parser.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | from abc import ABC, abstractmethod
  3 | from typing import List, Dict, Any, Set, Optional
  4 | 
  5 | import tree_sitter
  6 | 
  7 | import logging
  8 | 
# Matches a triple-quoted string (''' or """); group 2 captures the text between
# the delimiters. DOTALL lets the content span several lines.
DOCSTRING_REGEX = re.compile(r"(['\"])\1\1(.*?)\1{3}", flags=re.DOTALL)
# Token pattern used by `tokenize_docstring`: a run of non-delimiter characters,
# or a run/pair of punctuation characters.
DOCSTRING_REGEX_TOKENIZER = re.compile(r"[^\s,'\"`.():\[\]=*;>{\}+-/\\]+|\\+|\.+|\(\)|{\}|\[\]|\(+|\)+|:+|\[+|\]+|{+|\}+|=+|\*+|;+|>+|\++|-+|/+|\'|\"|`")
# NOTE(review): no name is passed, so this is the root logger, whereas the
# language-specific parser modules use `logging.getLogger(__name__)`.
logger = logging.getLogger()
 12 | 
 13 | def remove_words_in_string(words, string):
 14 |     new_string = string
 15 |     for word in words:
 16 |         new_string = str(new_string).replace(word, '')
 17 |     return new_string
 18 | 
 19 | 
 20 | def tokenize_docstring(docstring: str) -> List[str]:
 21 |     return [t for t in DOCSTRING_REGEX_TOKENIZER.findall(str(docstring)) if t is not None and len(t) > 0]
 22 | 
 23 | 
 24 | def tokenize_code(node, blob: str, nodes_to_exclude: Optional[Set]=None) -> List:
 25 |     tokens = []
 26 |     traverse(node, tokens)
 27 |     # print(tokens)
 28 |     # for token in tokens:
 29 |     #     print(token.text)
 30 |     return [match_from_span(token, blob) for token in tokens if nodes_to_exclude is None or token not in nodes_to_exclude]
 31 | 
 32 | def nodes_are_equal(n1, n2):
 33 |     return n1.type == n2.type and n1.start_point == n2.start_point and n1.end_point == n2.end_point
 34 | 
 35 | def parent_and_previous_sibling(tree, node):
 36 |     """Merge `node_parent` and `previous_sibling` function
 37 |     """
 38 |     parent = node_parent(tree, node)
 39 |     for i, node_at_i in enumerate(parent.children):
 40 |         if nodes_are_equal(node, node_at_i):
 41 |             if i > 0:
 42 |                 return parent, parent.children[i-1]
 43 |             return parent, None
 44 | 
 45 |     return ValueError("Could not find node in tree.")
 46 | 
 47 | 
 48 | def previous_sibling(tree, node):
 49 |     """
 50 |     Search for the previous sibling of the node.
 51 |     TODO: C TreeSitter should support this natively, but not its Python bindings yet. Replace later.
 52 |     """
 53 |     to_visit = [tree.root_node]
 54 |     while len(to_visit) > 0:
 55 |         next_node = to_visit.pop()
 56 |         for i, node_at_i in enumerate(next_node.children):
 57 |             if nodes_are_equal(node, node_at_i):
 58 |                 if i > 0:
 59 |                     return next_node.children[i-1]
 60 |                 return None
 61 |         else:
 62 |             to_visit.extend(next_node.children)
 63 |     return ValueError("Could not find node in tree.")
 64 | 
 65 | 
 66 | # if parent_node.type == 'variable_declarator':
 67 | #     # node
 68 | #     base_node = node_parent(tree, parent_node)  # Get the variable declaration
 69 | #     # parent
 70 | #     parent_node = node_parent(tree, base_node)
 71 | # elif parent_node.type == 'pair':
 72 | #     base_node = parent_node  # This is a common pattern where a function is assigned as a value to a dictionary.
 73 | #     parent_node = node_parent(tree, base_node)
 74 | # else:
 75 | #     base_node = node
 76 | 
 77 | def traverse_type_parent(node, kind:List) -> None:
 78 |     results = []
 79 |     to_visit = [node]
 80 |     while len(to_visit) > 0:
 81 |         next_node = to_visit.pop()
 82 |         for child in next_node.children:
 83 |             if child.type in kind:
 84 |                 results.append([next_node, child])
 85 |         else:
 86 |             to_visit.extend(next_node.children)
 87 |     
 88 |     return results
 89 | 
 90 | 
 91 | def node_parent(tree, node):
 92 |     to_visit = [tree.root_node]
 93 |     while len(to_visit) > 0:
 94 |         next_node = to_visit.pop()
 95 |         for child in next_node.children:
 96 |             if nodes_are_equal(child, node):
 97 |                 return next_node
 98 |         else:
 99 |             to_visit.extend(next_node.children)
100 |     raise ValueError("Could not find node in tree.")
101 | 
102 | 
def traverse(node, results: List) -> None:
    """
    Append the token nodes below `node` to `results`, left to right.

    A `string` node is kept whole (its children are not visited); any other node
    is appended only when it has no children.
    """
    if node.type == 'string' or not node.children:
        results.append(node)
        return
    for child in node.children:
        traverse(child, results)
111 | 
112 | 
def traverse_type(node, results, kind:List) -> None:
    """
    Append to `results` every node in the subtree of `node` (itself included)
    whose type is in `kind`, in pre-order.

    `get_node_by_kind` is the validated wrapper around this walk.
    """
    # Explicit stack instead of recursion; children are pushed reversed so they
    # are popped (visited) left to right.
    stack = [node]
    while stack:
        current = stack.pop()
        if current.type in kind:
            results.append(current)
        if current.children:
            stack.extend(reversed(current.children))
121 | 
122 | 
def get_node_by_kind(root: tree_sitter.Node, kind: List[str]) -> List:
    """
    Get all nodes with specific type

    Args:
        root (tree_sitter.Node): Tree sitter root node
        kind (List[str]): (node's) types to collect; a single `str` is also accepted
            and treated as a one-element list

    Return:
        List[tree_sitter.Node]: all nodes under `root` (itself included) whose type is in `kind`
    """
    assert isinstance(root, tree_sitter.Node), f"Expect `root` to be `tree_sitter.Node`, get {type(root)}"
    assert isinstance(kind, (list, str)), f"Expect `kind` to be `list` of string or `str`, get {type(kind)}"
    if isinstance(kind, str):
        # A bare string would turn `node.type in kind` into a substring test,
        # matching any node type that is a fragment of `kind`.
        kind = [kind]
    assert all(isinstance(s, str) for s in kind), "Expect search kind to be `str`"

    node_list = []
    traverse_type(root, node_list, kind=kind)
    return node_list
141 | 
142 | 
def get_node_text(root: tree_sitter.Node) -> str:
    """
    Return the source text of a tree-sitter node (replacement for `match_from_span`).

    Args:
        root (tree_sitter.Node): node whose text is wanted

    Return:
        str: decoded text of `root`
    """
    root_type = type(root)
    assert root_type == tree_sitter.Node, f"Expect `root` to be `tree_sitter.Node`, get {root_type}"

    return root.text.decode()
157 | 
158 | 
def match_from_span(node, blob: str) -> str:
    """
    Cut the text covered by `node` out of `blob`.

    Args:
        node: object exposing `start_point` and `end_point` as (line, column) pairs
        blob (str): full source text
    Return:
        str: text between the start and end point, lines joined with a newline
    """
    source_lines = blob.split('\n')
    first_row, first_col = node.start_point[0], node.start_point[1]
    last_row, last_col = node.end_point[0], node.end_point[1]

    if first_row == last_row:
        return source_lines[first_row][first_col:last_col]

    head = source_lines[first_row][first_col:]
    middle = source_lines[first_row + 1:last_row]
    tail = source_lines[last_row][:last_col]
    return '\n'.join([head] + middle + [tail])
170 |     
171 | 
def match_from_spans(nodes, blob: str) -> tuple:
    """
    Get the text spanned by multiple nodes

    Args:
        nodes (List): List of `tree_sitter.Node`
        blob (str): Full source

    Return:
        tuple: `(text, start_node, end_node)` where `text` runs from the earliest
            start point to the latest end point among `nodes`, `start_node` is the
            node that starts first and `end_node` the node that ends last
    """
    assert len(nodes) != 0, "Empty node list"
    start_point = nodes[0]
    end_point = nodes[0]

    for node in nodes:
        # Compare full (line, column) positions so nodes sharing a line are ordered
        # correctly, and test both bounds independently: one node may extend both.
        if tuple(node.start_point) < tuple(start_point.start_point):
            start_point = node
        if tuple(node.end_point) > tuple(end_point.end_point):
            end_point = node

    line_start = start_point.start_point[0]
    char_start = start_point.start_point[1]
    line_end = end_point.end_point[0]
    char_end = end_point.end_point[1]

    lines = blob.split('\n')
    if line_start != line_end:
        string = '\n'.join([lines[line_start][char_start:]] + lines[line_start+1:line_end] + [lines[line_end][:char_end]])
    else:
        string = lines[line_start][char_start:char_end]

    return string, start_point, end_point
205 | 
206 | 
class LanguageParser(ABC):
    """
    Abstract base of the language-specific parsers.

    Every hook is a static method taking a tree-sitter node; concrete parsers
    must provide all of them.
    """
    BLACKLISTED_FUNCTION_NAMES = []

    @staticmethod
    @abstractmethod
    def get_function_list(node):
        """Return the function nodes found under `node`."""

    @staticmethod
    @abstractmethod
    def get_class_list(node):
        """Return the class nodes found under `node`."""

    @staticmethod
    @abstractmethod
    def get_docstring_node(node) -> List[tree_sitter.Node]:
        """Return the node(s) holding the docstring of `node`."""

    @staticmethod
    @abstractmethod
    def get_comment_node(node) -> List[tree_sitter.Node]:
        """Return the comment nodes found inside `node`."""

    @staticmethod
    @abstractmethod
    def get_class_metadata(class_node, blob=None):
        """Return the metadata (identifier, parameters) describing `class_node`."""

    @staticmethod
    @abstractmethod
    def get_function_metadata(function_node, blob=None) -> Dict[str, str]:
        """Return the metadata (identifier, parameters, return type) describing `function_node`."""

    # Disabled abstract hooks, kept here only as a list of names:
    #   get_function_definitions(tree, blob) -> List
    #   get_class_definitions(tree, blob) -> List
    #   get_line_definitions(tree, blob) -> List
    #   get_context(tree, blob)
    #   get_calls(tree, blob)


--------------------------------------------------------------------------------
/src/codetext/parser/php_parser.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | from typing import List, Dict, Any
  3 | import tree_sitter
  4 | import logging
  5 | 
  6 | from .language_parser import LanguageParser, get_node_text, get_node_by_kind
  7 | 
  8 | 
  9 | logger = logging.getLogger(__name__)
 10 | 
 11 | 
 12 | class PhpParser(LanguageParser):
 13 | 
 14 |     FILTER_PATHS = ('test', 'tests')
 15 | 
 16 |     BLACKLISTED_FUNCTION_NAMES = ['__construct', '__destruct', '__call', '__callStatic',
 17 |                                   '__get', '__set', '__isset', '__unset',
 18 |                                   '__sleep', '__wakeup', '__toString', '__invoke',
 19 |                                   '__set_state', '__clone', '__debugInfo', '__serialize',
 20 |                                   '__unserialize']
 21 | 
 22 |     @staticmethod
 23 |     def get_docstring(node, blob: str=None) -> str:
 24 |         if blob:
 25 |             logger.info('From version `0.0.6` this function will update argument in the API')
 26 |         docstring_node = PhpParser.get_docstring_node(node)
 27 |         
 28 |         docstring = ''
 29 |         if docstring_node:
 30 |             docstring = get_node_text(docstring_node[0])
 31 |         
 32 |         return docstring
 33 |     
 34 |     @staticmethod
 35 |     def get_docstring_node(node):
 36 |         docstring_node = []
 37 |         
 38 |         if node.prev_sibling is not None:
 39 |             prev_node = node.prev_sibling
 40 |             if prev_node.type == 'comment':
 41 |                 docstring_node.append(prev_node)
 42 |         
 43 |         return docstring_node
 44 |     
 45 |     @staticmethod
 46 |     def get_comment_node(function_node):
 47 |         comment_node = get_node_by_kind(function_node, kind='comment')
 48 |         return comment_node
 49 |     
 50 |     @staticmethod
 51 |     def get_class_list(node):
 52 |         res = get_node_by_kind(node, ['class_declaration', 
 53 |                                       'trait_declaration',
 54 |                                       'interface_declaration'])
 55 |         return res
 56 |     
 57 |     @staticmethod
 58 |     def get_function_list(node):
 59 |         res = get_node_by_kind(node, ['function_definition', 'method_declaration'])
 60 |         return res
 61 |     
 62 |     @staticmethod
 63 |     def get_function_metadata(function_node, blob: str=None) -> Dict[str, str]:
 64 |         if blob:
 65 |             logger.info('From version `0.0.6` this function will update argument in the API')
 66 |         metadata = {
 67 |             'identifier': '',
 68 |             'parameters': {},
 69 |             'return_type': None,
 70 |         }
 71 | 
 72 |         for n in function_node.children:
 73 |             if n.type == 'name':
 74 |                 metadata['identifier'] = get_node_text(n)
 75 |             if n.type in ['union_type', 'intersection_type']:
 76 |                 metadata['return_type'] = get_node_text(n)
 77 |             elif n.type == 'formal_parameters':
 78 |                 for param_node in n.children:
 79 |                     if param_node.type in ['simple_parameter', 'variadic_parameter', 'property_promotion_parameter']:
 80 |                         identifier = get_node_text(param_node.child_by_field_name('name'))
 81 |                         param_type = param_node.child_by_field_name('type')
 82 |                         if param_type:
 83 |                             param_type = get_node_text(param_type)
 84 |                             metadata['parameters'][identifier] = param_type
 85 |                         else:
 86 |                             metadata['parameters'][identifier] = None
 87 |                         
 88 |         if not metadata['return_type']:
 89 |             return_statement = get_node_by_kind(function_node, ['return_statement'])
 90 |             if len(return_statement) > 0:
 91 |                 metadata['return_type'] = '<not_specific>'
 92 |             else:
 93 |                 metadata['return_type'] = None
 94 | 
 95 |         return metadata
 96 | 
 97 |     
 98 |     @staticmethod
 99 |     def get_class_metadata(class_node, blob: str=None):
100 |         if blob:
101 |             logger.info('From version `0.0.6` this function will update argument in the API')
102 |         metadata = {
103 |             'identifier': '',
104 |             'parameters': {},
105 |         }
106 |         assert type(class_node) == tree_sitter.Node
107 |         
108 |         for child in class_node.children:
109 |             if child.type == 'name':
110 |                 metadata['identifier'] = get_node_text(child)
111 |             elif child.type == 'base_clause':
112 |                 argument_list = []
113 |                 for param in child.children:
114 |                     if param.type == 'name':
115 |                         name = get_node_text(param)
116 |                         metadata['parameters'][name] = None
117 |                         # argument_list.append(get_node_text(param))
118 |                 # metadata['parameters'] = argument_list 
119 |     
120 |         return metadata
121 | 


--------------------------------------------------------------------------------
/src/codetext/parser/python_parser.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | from typing import List, Dict, Iterable, Optional, Iterator, Any
  3 | import logging
  4 | 
  5 | from .language_parser import LanguageParser, get_node_by_kind, get_node_text
  6 | 
  7 | 
  8 | logger = logging.getLogger(__name__)
  9 | 
 10 | 
 11 | class PythonParser(LanguageParser):
 12 |     
 13 |     BLACKLISTED_FUNCTION_NAMES = ['__init__', '__name__', '__main__']
 14 |     
 15 |     @staticmethod
 16 |     def get_docstring(node, blob:str=None):
 17 |         if blob:
 18 |             logger.info('From version `0.0.6` this function will update argument in the API')
 19 |         docstring_node = PythonParser.get_docstring_node(node)
 20 |         
 21 |         docstring = ''
 22 |         if docstring_node is not None:
 23 |             docstring = get_node_text(docstring_node[0])
 24 |             docstring = docstring.strip('"').strip("'").strip("#")
 25 |         return docstring
 26 |     
 27 |     @staticmethod
 28 |     def get_function_list(node):
 29 |         res = get_node_by_kind(node, ['function_definition'])
 30 |         return res
 31 | 
 32 |     @staticmethod
 33 |     def get_class_list(node):
 34 |         res = get_node_by_kind(node, ['class_definition'])
 35 |         return res
 36 |     
 37 |     @staticmethod
 38 |     def get_docstring_node(node):
 39 |         docstring_node = []
 40 |         # traverse_type(node, docstring_node, kind=['expression_statement']) #, 'comment'])
 41 |         for child in node.children:
 42 |             if child.type == 'block':
 43 |                 for sub_child in child.children:
 44 |                     if sub_child.type == 'expression_statement':
 45 |                         docstring_node.append(sub_child)
 46 | 
 47 |         docstring_node = [node for node in docstring_node if
 48 |                           node.type == 'expression_statement' and node.children[0].type == 'string']
 49 |         
 50 |         if len(docstring_node) > 0:
 51 |             return [docstring_node[0].children[0]]  # only take the first block
 52 | 
 53 |         return None
 54 |     
 55 |     @staticmethod
 56 |     def get_comment_node(node):
 57 |         comment_node = get_node_by_kind(node, kind=['comment', 'expression_statement'])
 58 |         for node in comment_node[:]:
 59 |             if node.type == 'expression_statement' and node.children[0].type != 'string':
 60 |                 comment_node.remove(node)
 61 |         return comment_node
 62 |     
 63 |     @staticmethod
 64 |     def get_function_metadata(function_node, blob: str=None) -> Dict[str, str]:
 65 |         if blob:
 66 |             logger.info('From version `0.0.6` this function will update argument in the API')
 67 |         metadata = {
 68 |             'identifier': '',
 69 |             'parameters': {},
 70 |             'return_type': None,
 71 |         }
 72 | 
 73 |         for child in function_node.children:
 74 |             if child.type == 'identifier':
 75 |                 metadata['identifier'] = get_node_text(child)
 76 |             elif child.type == 'parameters':
 77 |                 for subchild in child.children:
 78 |                     if subchild.type == 'identifier':
 79 |                         metadata['parameters'][get_node_text(subchild)] = None
 80 |                     elif subchild.type in ['typed_parameter', 'default_parameter', 'typed_default_parameter']:
 81 |                         param_type = get_node_by_kind(subchild, ['type'])
 82 |                         if param_type:
 83 |                             param_type = get_node_text(param_type[0])
 84 |                         else:
 85 |                             param_type = None
 86 |                         param_identifier = get_node_by_kind(subchild, ['identifier'])
 87 |                         assert len(param_identifier) != 0, "Empty identifier"
 88 |                         param_identifier = get_node_text(param_identifier[0])
 89 |                         metadata['parameters'][param_identifier] = param_type
 90 |             elif child.type == 'type':
 91 |                 metadata['return_type'] = get_node_text(child)
 92 |                 
 93 |         if not metadata['return_type']:
 94 |             return_statement = get_node_by_kind(function_node, ['return_statement'])
 95 |             if len(return_statement) > 0:
 96 |                 metadata['return_type'] = '<not_specific>'
 97 |             else:
 98 |                 metadata['return_type'] = None
 99 |                 
100 |         return metadata
101 | 
102 |     @staticmethod
103 |     def get_class_metadata(class_node, blob: str=None) -> Dict[str, str]:
104 |         if blob:
105 |             logger.info('From version `0.0.6` this function will update argument in the API')
106 |         metadata = {
107 |             'identifier': '',
108 |             'parameters': {},
109 |         }
110 |         for child in class_node.children:
111 |             if child.type == 'identifier':
112 |                 metadata['identifier'] = get_node_text(child)
113 |             elif child.type == 'argument_list':
114 |                 argument_list = get_node_text(child).split(',')
115 |                 for arg in argument_list:
116 |                     item = re.sub(r'[^a-zA-Z0-9\_]', ' ', arg).split()
117 |                     # Handle class definitions with empty argument list class ABC()
118 |                     if len(item) > 0:
119 |                         metadata['parameters'][item[0].strip()] = None
120 | 
121 |         # get __init__ function
122 |         return metadata
123 | 


--------------------------------------------------------------------------------
/src/codetext/parser/ruby_parser.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | from typing import List, Dict, Any
  3 | 
  4 | import tree_sitter
  5 | import logging
  6 | 
  7 | from .language_parser import LanguageParser, get_node_text, get_node_by_kind
  8 | # from function_parser.parsers.commentutils import get_docstring_summary
  9 | 
 10 | 
 11 | logger = logging.getLogger(__name__)
 12 | 
 13 | 
 14 | class RubyParser(LanguageParser):
 15 | 
 16 |     FILTER_PATHS = ('test', 'vendor')
 17 | 
 18 |     BLACKLISTED_FUNCTION_NAMES = ['initialize', 'to_text', 'display', 'dup', 'clone', 'equal?', '==', '<=>',
 19 |                                   '===', '<=', '<', '>', '>=', 'between?', 'eql?', 'hash']
 20 | 
 21 |     @staticmethod
 22 |     def get_function_list(node):
 23 |         res = get_node_by_kind(node, ['method',
 24 |                                       'singleton_method'])
 25 |         return res
 26 |     
 27 |     @staticmethod
 28 |     def get_class_list(node):
 29 |         res = get_node_by_kind(node, ['class', 'module'])
 30 |         
 31 |         # remove class keywords
 32 |         for node in res[:]:
 33 |             if not node.children:
 34 |                 res.remove(node)
 35 | 
 36 |         return res
 37 | 
 38 |     @staticmethod
 39 |     def get_docstring_node(node) -> str:
 40 |         docstring_node = []
 41 |         
 42 |         prev_node = node.prev_sibling        
 43 |         if not prev_node or prev_node.type != 'comment':
 44 |             parent_node = node.parent
 45 |             if parent_node:
 46 |                 prev_node = parent_node.prev_sibling
 47 | 
 48 |         if prev_node and prev_node.type == 'comment':
 49 |             docstring_node.append(prev_node)
 50 |             prev_node = prev_node.prev_sibling
 51 |                 
 52 |         while prev_node and prev_node.type == 'comment':
 53 |             # Assume the comment is dense
 54 |             x_current = prev_node.start_point[0]
 55 |             x_next = prev_node.next_sibling.start_point[0]
 56 |             if x_next - x_current > 1:
 57 |                 break
 58 |                     
 59 |             docstring_node.insert(0, prev_node)    
 60 |             prev_node = prev_node.prev_sibling
 61 |             
 62 |         return docstring_node
 63 |     
 64 |     @staticmethod
 65 |     def get_docstring(node, blob=None):
 66 |         if blob:
 67 |             logger.info('From version `0.0.6` this function will update argument in the API')
 68 |         docstring_node = RubyParser.get_docstring_node(node)
 69 |         docstring = []
 70 |         for item in docstring_node:
 71 |             doc = get_node_text(item)
 72 |             doc_lines = doc.split('\n')
 73 |             for line in doc_lines:
 74 |                 if '=begin' in line or '=end' in line:
 75 |                     continue
 76 |                 docstring.append(line)
 77 |             
 78 |         docstring = '\n'.join(docstring)
 79 |         return docstring
 80 |     
 81 |     @staticmethod
 82 |     def get_function_metadata(function_node, blob=None) -> Dict[str, str]:
 83 |         if blob:
 84 |             logger.info('From version `0.0.6` this function will update argument in the API')
 85 |         metadata = {
 86 |             'identifier': '',
 87 |             'parameters': {},
 88 |             'return_type': None,
 89 |         }
 90 |         
 91 |         assert type(function_node) == tree_sitter.Node
 92 |         assert function_node.type in ['method', 'singleton_method']
 93 |         
 94 |         for child in function_node.children:
 95 |             if child.type == 'identifier':
 96 |                 metadata['identifier'] = get_node_text(child)
 97 |             elif child.type in ['method_parameters', 'parameters', 'bare_parameters']:
 98 |                 params = get_node_by_kind(child, ['identifier'])
 99 |                 for item in params:
100 |                     metadata['parameters'][get_node_text(item)] = None
101 | 
102 |         if not metadata['return_type']:
103 |             return_statement = get_node_by_kind(function_node, ['return'])
104 |             if len(return_statement) > 0:
105 |                 metadata['return_type'] = '<not_specific>'
106 |             else:
107 |                 metadata['return_type'] = None
108 | 
109 |         return metadata
110 |     
111 |     @staticmethod
112 |     def get_class_metadata(class_node, blob=None):
113 |         if blob:
114 |             logger.info('From version `0.0.6` this function will update argument in the API')
115 |         metadata = {
116 |             'identifier': '',
117 |             'parameters': {},
118 |         }
119 |         
120 |         assert type(class_node) == tree_sitter.Node
121 |         
122 |         for child in class_node.children:
123 |             if child.type == 'constant':
124 |                 metadata['identifier'] = get_node_text(child)
125 |             if child.type == 'superclass':
126 |                 for subchild in child.children:
127 |                     if subchild.type == 'constant':
128 |                         metadata['parameters'][get_node_text(subchild)] = None
129 | 
130 |         return metadata
131 |         
132 | 
133 |     @staticmethod
134 |     def get_comment_node(function_node):
135 |         comment_node = get_node_by_kind(function_node, kind='comment')
136 |         return comment_node
137 |     
138 |     @staticmethod
139 |     def get_action_list(action_node):
140 |         call_nodes =  get_node_by_kind(action_node, ['call'])
141 |         res = []
142 |         for call_node in call_nodes:
143 |             if get_node_by_kind(call_node, ["do_block"]):
144 |                 res.append(call_node)
145 |         # print(res)
146 |         return res
147 |     
148 |     @staticmethod
149 |     def get_action_metadata(action_node):
150 |         metadata = {
151 |             'identifier': '',
152 |             'parameters': {},
153 |             'return_type': None,
154 |         }
155 |         
156 |         for child in action_node.children:
157 |             if child.type in ["identifier"]:
158 |                 metadata['identifier'] = get_node_text(child)
159 |             if child.type in ["argument_list"]:
160 |                 symbol =  get_node_by_kind(child, ["simple_symbol"])
161 |                 if symbol:
162 |                     metadata['identifier'] += get_node_text(symbol[0])
163 |         
164 |         parameters =  get_node_by_kind(action_node, ["block_parameters"])
165 |         
166 |         if parameters:
167 |             for param in get_node_by_kind(parameters[0], ["identifier"]):
168 |                 param_name = get_node_text(param)
169 |                 metadata['parameters'].update({param_name : None})
170 |         
171 |         return metadata
172 |     
173 | 


--------------------------------------------------------------------------------
/src/codetext/parser/rust_parser.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | from typing import List, Dict, Any
  3 | 
  4 | import tree_sitter
  5 | import logging
  6 | 
  7 | from .language_parser import LanguageParser, get_node_by_kind, get_node_text
  8 | 
  9 | 
 10 | logger = logging.getLogger(__name__)
 11 | 
 12 | 
 13 | class RustParser(LanguageParser):
 14 | 
 15 |     FILTER_PATHS = ('test', 'vendor')
 16 | 
 17 |     BLACKLISTED_FUNCTION_NAMES = ['main']
 18 | 
 19 |     @staticmethod
 20 |     def get_function_list(node):
 21 |         res = get_node_by_kind(node, ['function_item'])
 22 |         return res
 23 |     
 24 |     @staticmethod
 25 |     def get_class_list(node):
 26 |         res = get_node_by_kind(node, ['impl_item', 'mod_item'])  # trait is like an interface
 27 |         return res
 28 | 
 29 |     @staticmethod
 30 |     def get_docstring_node(node) -> List:
 31 |         docstring_node = []
 32 |         
 33 |         prev_node = node.prev_sibling
 34 |         if prev_node:
 35 |             if prev_node.type == 'block_comment':
 36 |                 docstring_node.append(prev_node)
 37 |                 
 38 |             elif prev_node.type == 'line_comment':
 39 |                 docstring_node.append(prev_node)
 40 |                 prev_node = prev_node.prev_sibling
 41 |                 
 42 |                 while prev_node and prev_node.type == 'line_comment':
 43 |                     # Assume the comment is dense
 44 |                     x_current = prev_node.start_point[0]
 45 |                     x_next = prev_node.next_sibling.start_point[0]
 46 |                     if x_next - x_current > 1:
 47 |                         break
 48 |                             
 49 |                     docstring_node.insert(0, prev_node)    
 50 |                     prev_node = prev_node.prev_sibling
 51 |             
 52 |         return docstring_node
 53 |     
 54 |     @staticmethod
 55 |     def get_docstring(node, blob=None):
 56 |         if blob:
 57 |             logger.info('From version `0.0.6` this function will update argument in the API')
 58 |         docstring_node = RustParser.get_docstring_node(node)
 59 |         docstring = []
 60 |         if docstring_node:
 61 |             for item in docstring_node:
 62 |                 doc = get_node_text(item)
 63 |                 docstring.append(doc)
 64 | 
 65 |         docstring = '\n'.join(docstring)
 66 |         return docstring
 67 |     
 68 |     @staticmethod
 69 |     def get_function_metadata(function_node, blob=None) -> Dict[str, str]:
 70 |         if blob:
 71 |             logger.info('From version `0.0.6` this function will update argument in the API')
 72 |         metadata = {
 73 |             'identifier': '',
 74 |             'parameters': {},
 75 |             'return_type': None,
 76 |         }
 77 |         
 78 |         assert type(function_node) == tree_sitter.Node
 79 |         assert function_node.type == 'function_item'
 80 |         
 81 |         for child in function_node.children:
 82 |             if child.type == 'identifier':
 83 |                 metadata['identifier'] = get_node_text(child)
 84 |             elif child.type in ['parameters']:
 85 |                 params = get_node_by_kind(child, ['parameter', 'variadic_parameter', 'self_parameter'])
 86 |                 for item in params:
 87 |                     if item.type == 'self_parameter':
 88 |                         metadata['parameters'][get_node_text(item)] = None
 89 |                     
 90 |                     else:
 91 |                         param_name = ''
 92 |                         for subchild in item.children:
 93 |                             if subchild.type == 'mutable_specifier':
 94 |                                 param_name = 'self'
 95 |                                 break
 96 |                             elif subchild.type == 'identifier':
 97 |                                 param_name = get_node_text(subchild)
 98 |                                 break
 99 |                         param_type = item.child_by_field_name('type')
100 |                         
101 |                         if param_type:
102 |                             param_type = get_node_text(param_type)
103 |                             metadata['parameters'][param_name] = param_type
104 |                         else:
105 |                             metadata['parameters'][param_name] = None
106 |                             param_type = None
107 | 
108 |             if child.type == 'reference_type':
109 |                 metadata['return_type'] = get_node_text(child)
110 |             
111 |             if not metadata['return_type']:
112 |                 return_statement = get_node_by_kind(function_node, ['return_expression'])
113 |                 if len(return_statement) > 0:
114 |                     metadata['return_type'] = '<not_specific>'
115 |                 else:
116 |                     metadata['return_type'] = None
117 |                 
118 |         return metadata
119 |     
120 |     @staticmethod
121 |     def get_class_metadata(class_node, blob=None):
122 |         if blob:
123 |             logger.info('From version `0.0.6` this function will update argument in the API')
124 |         metadata = {
125 |             'identifier': '',
126 |             'parameters': {},
127 |         }
128 |         
129 |         assert type(class_node) == tree_sitter.Node
130 |         
131 |         if class_node.type == 'mod_item':
132 |             for child in class_node.children:
133 |                 if child.type ==  'identifier':
134 |                     metadata['identifier'] = get_node_text(child)
135 |         
136 |         else:
137 |             identifier = get_node_by_kind(class_node, ['type_identifier'])
138 |             
139 |             metadata['identifier'] = get_node_text(identifier[0])
140 |             if len(identifier) > 1:
141 |                 for param in identifier[1:]:
142 |                     metadata['parameters'][get_node_text(param)] = None
143 | 
144 |         return metadata
145 |         
146 | 
147 |     @staticmethod
148 |     def get_comment_node(function_node):
149 |         comment_node = get_node_by_kind(function_node, kind=['comment', 'line_comment', 'block_comment'])
150 |         return comment_node
151 | 


--------------------------------------------------------------------------------
/src/codetext/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .utils import build_language, parse_code, SUPPORTED_LANGUAGE
2 | from .imports import module_available
3 | 
# Names exported by a star-import of this package; every entry must match a
# name imported above.
__all__ = ["build_language", "parse_code", "module_available"]


--------------------------------------------------------------------------------
/src/codetext/utils/imports.py:
--------------------------------------------------------------------------------
 1 | """Import utilities."""
 2 | import importlib
 3 | from importlib.util import find_spec
 4 | 
 5 | 
 6 | def _package_available(package_name: str) -> bool:
 7 |     """Check if a package is available in your environment.
 8 |     .. code-block:: python
 9 |     
10 |         >>> _package_available('os')
11 |         True
12 |         >>> _package_available('bla')
13 |         False
14 |     """
15 |     return find_spec(package_name) is not None
16 | 
17 | 
18 | def module_available(module_path: str) -> bool:
19 |     """Check if a module path is available in your environment.
20 |     Source: pytorch_lightning/utilities/imports.py
21 |     .. code-block:: python
22 |     
23 |         >>> module_available('os')
24 |         True
25 |         >>> module_available('os.bla')
26 |         False
27 |         >>> module_available('bla.bla')
28 |         False
29 |     """
30 |     module_names = module_path.split(".")
31 |     if not _package_available(module_names[0]):
32 |         return False
33 |     module = importlib.import_module(module_names[0])
34 |     for name in module_names[1:]:
35 |         if not hasattr(module, name):
36 |             return False
37 |         module = getattr(module, name)
38 |     return True


--------------------------------------------------------------------------------
/src/codetext/utils/utils.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import inspect
  3 | import sys
  4 | import os
  5 | import subprocess
  6 | import logging
  7 | from pathlib import Path
  8 | from typing import List, Dict, Any, Union
  9 | 
 10 | import tree_sitter
 11 | from tree_sitter import Language, Parser
 12 | 
 13 | 
 14 | logger = logging.getLogger('utils')
 15 | logging.basicConfig(level = logging.INFO)
 16 | 
 17 | 
 18 | SUPPORTED_LANGUAGE = ['python', 'java', 'javascript', 'ruby', 'go', 'c', 'cpp', 'c++', 'c#', 'c_sharp', 'php', 'rust']
 19 | 
 20 | 
 21 | def build_language(language: str, save_path: str=None):
 22 |     """
 23 |     Build tree-sitter language
 24 |     
 25 |     Args:
 26 |         language (str): java, python, cpp, c_sharp, etc
 27 |         save_path (str): save path (default create a `/tree-sitter/` dir)
 28 |     """
 29 |     language = str(language).lower()
 30 |     if language == 'c#':
 31 |         language = 'c_sharp'
 32 |     elif language == 'c++':
 33 |         language = 'cpp'
 34 | 
 35 |     assert language.lower() in SUPPORTED_LANGUAGE, f"Expect {language} in {SUPPORTED_LANGUAGE}"
 36 |     if not save_path:
 37 |         calling_script_path = Path(inspect.getframeinfo(sys._getframe(1)).filename)
 38 |         save_path = calling_script_path.parent
 39 |         
 40 |     # create `tree-sitter` dir
 41 |     ts_path = os.path.join(save_path, 'tree-sitter')
 42 |     if not os.path.exists(ts_path):
 43 |         logger.warning(
 44 |             f"Not found `tree-sitter` folder, create new one in {ts_path}"
 45 |         )
 46 |         os.mkdir(ts_path)
 47 |     
 48 |     # check `tree-sitter/tree-sitter-<language>`
 49 |     ts_lang_path = os.path.join(ts_path, 'tree-sitter-'+language.replace('_', '-'))
 50 |     if not os.path.exists(ts_lang_path):
 51 |         logger.warning(
 52 |             f"Not found `tree-sitter-{language.replace('_', '-')}`, attempt clone from github to {ts_path}"
 53 |         )
 54 |         command = f"cd {ts_path}; git clone https://github.com/tree-sitter/tree-sitter-{language.replace('_', '-')}.git"
 55 |         subprocess.Popen(command ,shell=True).wait()
 56 |         
 57 |         assert os.path.exists(ts_lang_path)==True, f"Unable to find {language} tree-sitter in {ts_path}"
 58 |     
 59 |     # if language == 'c-sharp': language = 'c_sharp'
 60 |     lang_path = os.path.join(save_path, 'tree-sitter', f'{language}.so')
 61 |     if not os.path.exists(lang_path):
 62 |         logger.info(
 63 |             f"Attempt to build Tree-sitter Language for {language} and store in {lang_path}"
 64 |         )
 65 |         Language.build_library(lang_path, [ts_lang_path])
 66 |         assert os.path.exists(lang_path)==True
 67 |     else:
 68 |         logger.info(f"Language already existed!")
 69 |         
 70 |     
 71 | def parse_code(raw_code: str, language: str='Auto', tree_sitter_path: str=None) -> tree_sitter.Tree:
 72 |     """
 73 |     Auto parse raw code into `tree_sitter.Tree`
 74 |     
 75 |     Args:
 76 |         raw_code (str): Raw source code need to parse
 77 |         language (str): Language to load parser
 78 |     """
 79 |     # TODO: auto detect language
 80 |     if language == 'Auto':
 81 |         raise NotImplemented("This feature is underdevelopment")
 82 |     language = str(language).lower()
 83 |     if language == 'c#':
 84 |         language = 'c_sharp'
 85 |     elif language == 'c++':
 86 |         language = 'cpp'
 87 |     assert language in SUPPORTED_LANGUAGE, f"Expect {language} in {SUPPORTED_LANGUAGE}"
 88 |     
 89 |     if tree_sitter_path:
 90 |         load_path = tree_sitter_path
 91 |     else:
 92 |         calling_script_path = Path(inspect.getframeinfo(sys._getframe(1)).filename)
 93 |         load_path = str(calling_script_path.parent)
 94 | 
 95 |     # Get parser from languages
 96 |     parser = Parser()
 97 |     try:
 98 |         from tree_sitter_languages import get_language, get_parser
 99 |         language = get_language(language)
100 |     except ImportError:
101 |         # Work-around when pre-built binaries wheels for tree-sitter-languages are not available
102 |         logger.warning(f"Troubled importing 'tree-sitter-languages', attemp to look for pre-built binaries in the workspace")    
103 |         ts_lang_path = os.path.join(load_path, 'tree-sitter', f'{language}.so')
104 |         if not os.path.exists(ts_lang_path):
105 |             logger.warning(f"Not found `{language}.so` in `{load_path}/tree-sitter/`, attemp to build language")
106 |             build_language(language, load_path)    
107 |         language = Language(load_path + f"/tree-sitter/{language}.so", language)
108 |     parser.set_language(language)
109 |     
110 |     if isinstance(raw_code, str):
111 |         raw_code = bytes(raw_code, 'utf8')
112 |     elif isinstance(raw_code, bytes):
113 |         pass
114 |     else:
115 |         raise ValueError(f"Expect `str`, got {type(raw_code)}")
116 |     tree = parser.parse(raw_code)
117 |     return tree
118 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
# The project root is taken from the current working directory.
PROJECT_PATH = os.getcwd()
SOURCE_PATH = os.path.join(PROJECT_PATH, "src")

# Make the `src` directory importable.
sys.path.append(SOURCE_PATH)


--------------------------------------------------------------------------------
/tests/setup.py:
--------------------------------------------------------------------------------
 1 | from ..src.codetext.utils import build_language
 2 | from tree_sitter_languages import get_language, get_parser
 3 | 
 4 | if __name__ == '__main__':
 5 |     lang_list = ['python', 'cpp', 'java', 'c-sharp', 'ruby', 'rust', 'javascript', 'php', 'go']
 6 |     
 7 |     for lang in lang_list:
 8 |         # build_language(lang)
 9 |         try:
10 |             get_parser(get_language(lang))
11 |         except:
12 |             build_language(lang)
13 | 


--------------------------------------------------------------------------------
/tests/test_clean/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FSoft-AI4Code/CodeText-parser/d2a7365f7f944650e84d9fdb6b6794d6c5ea620b/tests/test_clean/__init__.py


--------------------------------------------------------------------------------
/tests/test_clean/test_clean_utils.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FSoft-AI4Code/CodeText-parser/d2a7365f7f944650e84d9fdb6b6794d6c5ea620b/tests/test_clean/test_clean_utils.py


--------------------------------------------------------------------------------
/tests/test_parser/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FSoft-AI4Code/CodeText-parser/d2a7365f7f944650e84d9fdb6b6794d6c5ea620b/tests/test_parser/__init__.py


--------------------------------------------------------------------------------
/tests/test_parser/test_c.py:
--------------------------------------------------------------------------------
'''test for the C++ parser applied to C source'''
 2 | import os
 3 | import unittest
 4 | 
 5 | from src.codetext.parser import CppParser
 6 | from src.codetext.utils import parse_code
 7 | 
 8 | 
 9 | class Test_CppParser_with_C(unittest.TestCase):
10 |     def setUp(self) -> None:
11 |         with open('tests/test_parser/test_sample/c_test_sample.c', 'r') as file:
12 |             self.code_sample = file.read()
13 |             
14 |         tree = parse_code(self.code_sample, 'c')
15 |         self.root_node = tree.root_node
16 | 
17 |         return super().setUp()
18 | 
19 |     def test_get_function_list(self):
20 |         root = self.root_node
21 |         
22 |         function_list = CppParser.get_function_list(root)
23 |         
24 |         self.assertEqual(len(function_list), 2)
25 | 
26 |     def test_get_function_metadata(self):
27 |         root = self.root_node
28 |         
29 |         function = CppParser.get_function_list(root)[0]
30 |         metadata = CppParser.get_function_metadata(function)
31 |         
32 |         for key in ['identifier', 'parameters', 'return_type']:
33 |             self.assertTrue(key in metadata.keys())
34 |         self.assertEqual(metadata['parameters'], {'random_seed': 'int'})
35 |         self.assertEqual(metadata['identifier'], 'reverseSentence')
36 |         self.assertEqual(metadata['return_type'], 'void')
37 |         
38 |     def test_get_class_list(self):
39 |         pass
40 |     
41 |     def test_get_class_metadata(self):
42 |         pass
43 | 
44 |     def test_get_docstring(self):
45 |         code_sample = """
46 |         /**
47 |         * A brief description. A more elaborate class description
48 |         * @param random_seed somearg.
49 |         * @see Test()
50 |         * @return The test results
51 |         */
52 |         void reverseSentence(int random_seed) {
53 |             char c;
54 |             scanf("%c", &c);
55 |             if (c != '\n') {
56 |                 reverseSentence();
57 |                 printf("%c", c);
58 |             }
59 |         }
60 |         """
61 |         tree = parse_code(code_sample, 'c')
62 |         root = tree.root_node
63 |         
64 |         fn= CppParser.get_function_list(root)[0]
65 | 
66 |         docs = CppParser.get_docstring(fn)
67 |         
68 |         self.assertEqual(docs, '/**\n        * A brief description. A more elaborate class description\n        * @param random_seed somearg.\n        * @see Test()\n        * @return The test results\n        */')
69 |         
70 | 
71 |     def test_extract_docstring(self):
72 |         pass
73 | 
74 | 
75 | if __name__ == '__main__':
76 |     unittest.main()
77 | 


--------------------------------------------------------------------------------
/tests/test_parser/test_cpp.py:
--------------------------------------------------------------------------------
  1 | '''test for C++ parser'''
  2 | import os
  3 | import unittest
  4 | from pathlib import Path
  5 | 
  6 | from src.codetext.parser import CppParser
  7 | from src.codetext.utils import parse_code
  8 | 
  9 | 
 10 | class Test_CppParser(unittest.TestCase):
 11 |     def setUp(self) -> None:
 12 |         with open('tests/test_parser/test_sample/cpp_test_sample.cpp', 'r') as file:
 13 |             self.code_sample = file.read()
 14 |             
 15 |         tree = parse_code(self.code_sample, 'c++')
 16 |         self.root_node = tree.root_node
 17 | 
 18 |         return super().setUp()
 19 | 
 20 |     def test_get_function_list(self):
 21 |         root = self.root_node
 22 |         
 23 |         function_list = CppParser.get_function_list(root)
 24 |         
 25 |         self.assertEqual(len(function_list), 3)
 26 |         
 27 |     def test_get_class_list(self):
 28 |         root = self.root_node
 29 |         
 30 |         class_list = CppParser.get_class_list(root)
 31 |         
 32 |         self.assertEqual(len(class_list), 2)
 33 | 
 34 |     def test_get_function_metadata(self):
 35 |         root = self.root_node
 36 |         
 37 |         function = list(CppParser.get_function_list(root))[0]
 38 |         metadata = CppParser.get_function_metadata(function)
 39 | 
 40 |         for key in ['identifier', 'parameters', 'return_type']:
 41 |             self.assertTrue(key in metadata.keys(), "Missing {}".format(key))
 42 |         self.assertEqual(metadata['parameters'], {'a': 'int', 'b': 'int'})
 43 |         self.assertEqual(metadata['identifier'], 'sum2number')
 44 |         self.assertEqual(metadata['return_type'], 'int')
 45 |     
 46 |     def test_get_class_metadata(self):
 47 |         root = self.root_node
 48 |         
 49 |         classes = list(CppParser.get_class_list(root))[0]
 50 |         metadata = CppParser.get_class_metadata(classes)
 51 | 
 52 |         self.assertEqual(metadata['parameters'], {'Vehicle': None, 'B': None})
 53 |         self.assertEqual(metadata['identifier'], 'Car')
 54 | 
 55 |     def test_get_docstring(self):
 56 |         code_sample = """
 57 |         /**
 58 |         * Find 2 sum
 59 |         *
 60 |         * @param nums List number.
 61 |         * @param target Sum target.
 62 |         * @return postion of 2 number.
 63 |         */
 64 |         vector<int> twoSum(vector<int>& nums, int target) {
 65 |             map<int,int> m;
 66 |             vector<int> v;
 67 |             int n= nums.size();
 68 |             for(int i=0;i<n;i++)
 69 |             {
 70 |                 
 71 |                     int diff = target - nums[i];
 72 |                     if(m.find(diff) != m.end())
 73 |                     {
 74 |                     auto p = m.find(diff);        
 75 |                     v.push_back(p->second);
 76 |                     v.push_back(i);
 77 |                     }
 78 |                     m.insert(make_pair(nums[i],i));
 79 |             }
 80 | 
 81 |             return v;
 82 |         }
 83 | 
 84 |         // Comment in
 85 |         // multiple line
 86 |         // of the function sum
 87 |         double sum2num(int a, int b) {
 88 |             return a + b;
 89 |         }
 90 |         """
 91 |         tree = parse_code(code_sample, 'c++')
 92 |         root = tree.root_node
 93 |         
 94 |         fn1, fn2 = list(CppParser.get_function_list(root))
 95 | 
 96 |         docs1 = CppParser.get_docstring(fn1)
 97 |         docs2 = CppParser.get_docstring(fn2)
 98 |         
 99 |         self.assertEqual(docs1, '/**\n        * Find 2 sum\n        *\n        * @param nums List number.\n        * @param target Sum target.\n        * @return postion of 2 number.\n        */')
100 |         self.assertEqual(docs2, '// Comment in\n// multiple line\n// of the function sum')
101 | 
102 | 
103 | if __name__ == '__main__':
104 |     unittest.main()
105 | 


--------------------------------------------------------------------------------
/tests/test_parser/test_csharp.py:
--------------------------------------------------------------------------------
 1 | '''test for C# parser'''
 2 | import os
 3 | import unittest
 4 | from pathlib import Path
 5 | 
 6 | from src.codetext.parser import CsharpParser
 7 | from src.codetext.utils import parse_code
 8 | 
 9 | 
class Test_CsharpParser(unittest.TestCase):
    """Run `CsharpParser` against the C# sample file and inline snippets."""

    def setUp(self) -> None:
        # Locate the fixture relative to this file so the test does not
        # depend on the current working directory.
        sample_path = Path(__file__).resolve().parent / 'test_sample' / 'c_sharp_test_sample.cs'
        with open(sample_path, 'r') as file:
            self.code_sample = file.read()

        tree = parse_code(self.code_sample, 'c#')
        self.root_node = tree.root_node

        return super().setUp()

    def test_get_function_list(self):
        root = self.root_node

        function_list = CsharpParser.get_function_list(root)

        self.assertEqual(len(function_list), 3)  # exclude constructor

    def test_get_class_list(self):
        root = self.root_node

        class_list = CsharpParser.get_class_list(root)

        self.assertEqual(len(class_list), 1)

    def test_get_docstring(self):
        # One method documented with consecutive line comments and one
        # documented with a block comment.
        code_sample = """
        class Vehicle
        {
            public string brand = "Ford";  // Vehicle field
            
            // <summary>
            // Docstring of a method
            // </summary>
            // <param name="animal_honk">Argument.</param>
            // <returns>
            // None.
            public void honk(string animal_honk)
            {
                Console.WriteLine(animal_honk);
                Console.WriteLine("Tuut, tuut!");
            }
            
            /* Another method docstring
            in multiple line */
            public void _honk()
            {
                Console.WriteLine("Tuut, tuut!");
            }
        }
        """
        tree = parse_code(code_sample, 'c#')
        root = tree.root_node

        fn1, fn2 = list(CsharpParser.get_function_list(root))

        docs1 = CsharpParser.get_docstring(fn1)
        docs2 = CsharpParser.get_docstring(fn2)

        self.assertEqual(docs1, '// <summary>\n// Docstring of a method\n// </summary>\n// <param name="animal_honk">Argument.</param>\n// <returns>\n// None.')
        self.assertEqual(docs2, '/* Another method docstring\n            in multiple line */')

    def test_get_function_metadata(self):
        root = self.root_node

        function = list(CsharpParser.get_function_list(root))[0]
        metadata = CsharpParser.get_function_metadata(function)

        for key in ['identifier', 'parameters', 'return_type']:
            self.assertIn(key, metadata)
        self.assertEqual(metadata['parameters'], {'path': 'string', 'filename': 'string'})
        self.assertEqual(metadata['identifier'], 'GetText')
        self.assertEqual(metadata['return_type'], 'string')

    def test_get_class_metadata(self):
        root = self.root_node

        classes = list(CsharpParser.get_class_list(root))[0]
        metadata = CsharpParser.get_class_metadata(classes)

        self.assertEqual(metadata['parameters'], {'Animal': None})
        self.assertEqual(metadata['identifier'], 'Dog')


if __name__ == '__main__':
    unittest.main()
96 | 


--------------------------------------------------------------------------------
/tests/test_parser/test_go.py:
--------------------------------------------------------------------------------
 1 | '''test for Go parser'''
 2 | import os
 3 | import unittest
 4 | from pathlib import Path
 5 | 
 6 | from src.codetext.parser import GoParser
 7 | from src.codetext.utils import parse_code
 8 | 
 9 | 
10 | class Test_GoParser(unittest.TestCase):
11 |     def setUp(self) -> None:
12 |         with open('tests/test_parser/test_sample/go_test_sample.go', 'r') as file:
13 |             self.code_sample = file.read()
14 |             
15 |         tree = parse_code(self.code_sample, 'go')
16 |         self.root_node = tree.root_node
17 |         return super().setUp()
18 | 
19 |     def test_get_function_list(self):
20 |         root = self.root_node
21 |         
22 |         function_list = GoParser.get_function_list(root)
23 |         
24 |         self.assertEqual(len(function_list), 1)
25 | 
26 |     def test_get_function_metadata(self):
27 |         root = self.root_node
28 |         
29 |         function = GoParser.get_function_list(root)[0]
30 |         metadata = GoParser.get_function_metadata(function)
31 | 
32 |         for key in ['identifier', 'parameters', 'return_type']:
33 |             self.assertTrue(key in metadata.keys())
34 |         self.assertEqual(metadata['parameters'], {'e': 'TypeError'})
35 |         self.assertEqual(metadata['identifier'], 'Error')
36 |         self.assertEqual(metadata['return_type'], 'string')
37 | 
38 |     def test_get_docstring(self):
39 |         code_sample = """
40 |         type TypeError struct {
41 |             Type1, Type2 reflect.Type
42 |             Extra        string
43 |         }
44 |         // Something must not include as docstring
45 |         
46 |         // The path package should only be used for paths separated by forward
47 |         // slashes, such as the paths in URLs. This package does not deal with
48 |         // Windows paths with drive letters or backslashes; to manipulate
49 |         // operating system paths, use the [path/filepath] package.
50 |         func (e TypeError) Error() string {
51 |                 msg := e.Type1.String()
52 |                 if e.Type2 != nil {
53 |                     msg += " and " + e.Type2.String()
54 |             }
55 |             msg += " " + e.Extra
56 |             return msg
57 |         }
58 |         """
59 |         tree = parse_code(code_sample, 'go')
60 |         root = tree.root_node
61 |         
62 |         fn = GoParser.get_function_list(root)[0]
63 | 
64 |         docs = GoParser.get_docstring(fn)
65 |         self.assertEqual(docs, '// The path package should only be used for paths separated by forward\n// slashes, such as the paths in URLs. This package does not deal with\n// Windows paths with drive letters or backslashes; to manipulate\n// operating system paths, use the [path/filepath] package.')
66 |         
67 | 
68 |     def test_extract_docstring(self):
69 |         pass
70 | 
71 | 
72 | if __name__ == '__main__':
73 |     unittest.main()
74 | 


--------------------------------------------------------------------------------
/tests/test_parser/test_java.py:
--------------------------------------------------------------------------------
 1 | '''test for Java parser'''
 2 | import os
 3 | import unittest
 4 | from pathlib import Path
 5 | 
 6 | from src.codetext.parser import JavaParser
 7 | from src.codetext.utils import parse_code
 8 | 
 9 | 
10 | class Test_JavaParser(unittest.TestCase):
11 |     def setUp(self) -> None:        
12 |         with open('tests/test_parser/test_sample/java_test_sample.java', 'r') as file:
13 |             self.code_sample = file.read()
14 |             
15 |         tree = parse_code(self.code_sample, 'java')
16 |         self.root_node = tree.root_node
17 | 
18 |         return super().setUp()
19 | 
20 |     def test_get_function_list(self):
21 |         root = self.root_node
22 |         
23 |         function_list = JavaParser.get_function_list(root)
24 |         
25 |         self.assertEqual(len(function_list), 2)
26 | 
27 |     def test_get_class_list(self):
28 |         root = self.root_node
29 |         
30 |         class_list = JavaParser.get_class_list(root)
31 |         
32 |         self.assertEqual(len(class_list), 1)
33 | 
34 |     def test_get_docstring(self):
35 |         code_sample = """
36 |         public class SaveFileController {
37 |             /**
38 |             * Adds new user and saves to file.
39 |             *
40 |             * @param context instance of Context
41 |             * @param user instance of User
42 |             * @see User
43 |             */
44 |             public void addNewUser(Context context, User user){
45 |                     loadFromFile(context);
46 |                 this.allUsers.add(user);
47 |                 saveToFile(context);
48 |             }
49 |         }
50 |         """
51 |         tree = parse_code(code_sample, 'java', './')
52 |         root = tree.root_node
53 |         
54 |         fn = list(JavaParser.get_function_list(root))[0]
55 | 
56 |         docs = JavaParser.get_docstring(fn)
57 |         self.assertEqual(docs, '/**\n            * Adds new user and saves to file.\n            *\n            * @param context instance of Context\n            * @param user instance of User\n            * @see User\n            */')
58 |         
59 | 
60 |     def test_get_function_metadata(self):
61 |         root = self.root_node
62 |         
63 |         function = list(JavaParser.get_function_list(root))[0]
64 |         metadata = JavaParser.get_function_metadata(function)
65 | 
66 |         for key in ['identifier', 'parameters', 'return_type']:
67 |             self.assertTrue(key in metadata.keys())
68 |         self.assertEqual(metadata['parameters'], {'context': 'Context', 'userIndex': 'int'})
69 |         self.assertEqual(metadata['identifier'], 'getHabitList')
70 |         self.assertEqual(metadata['return_type'], 'HabitList')
71 | 
72 |     def test_get_class_metadata(self):
73 |         root = self.root_node
74 |         
75 |         classes = list(JavaParser.get_class_list(root))[0]
76 |         metadata = JavaParser.get_class_metadata(classes)
77 | 
78 |         self.assertEqual(metadata['parameters'], {'SudoUser': None, 'FileController': None})
79 |         self.assertEqual(metadata['identifier'], 'SaveFileController')
80 | 
81 |     def test_extract_docstring(self):
82 |         pass
83 |         
84 | 
85 | if __name__ == '__main__':
86 |     unittest.main()
87 | 


--------------------------------------------------------------------------------
/tests/test_parser/test_javascript.py:
--------------------------------------------------------------------------------
  1 | '''test for JavaScript parser'''
  2 | import os
  3 | import unittest
  4 | from pathlib import Path
  5 | 
  6 | from src.codetext.parser import JavascriptParser
  7 | from src.codetext.utils import parse_code
  8 | 
  9 | 
 10 | class Test_JavascriptParser(unittest.TestCase):
 11 |     def setUp(self) -> None:
 12 |         with open('tests/test_parser/test_sample/javascript_test_sample.js', 'r') as file:
 13 |             self.code_sample = file.read()
 14 |             
 15 |         tree = parse_code(self.code_sample, 'javascript')
 16 |         self.root_node = tree.root_node
 17 | 
 18 |         return super().setUp()
 19 | 
 20 |     def test_get_function_list(self):
 21 |         root = self.root_node
 22 |         
 23 |         function_list = JavascriptParser.get_function_list(root)
 24 |         
 25 |         self.assertEqual(len(function_list), 7)
 26 | 
 27 |     def test_get_class_list(self):
 28 |         root = self.root_node
 29 |         
 30 |         class_list = JavascriptParser.get_class_list(root)
 31 |         
 32 |         self.assertEqual(len(class_list), 2)
 33 | 
 34 |     def test_get_docstring(self):
 35 |         code_sample = """
 36 |         /**
 37 |         * Dispatched when the repositories are loaded by the request saga
 38 |         *
 39 |         * @param  {array} repos The repository data
 40 |         * @param  {string} username The current username
 41 |         *
 42 |         * @return {object}      An action object with a type of LOAD_REPOS_SUCCESS passing the repos
 43 |         */
 44 |         function songsLoaded(repos, username) {
 45 |             return {
 46 |                 type: LOAD_SONGS_SUCCESS,
 47 |             repos,
 48 |             username,
 49 |             };
 50 |         }
 51 |         
 52 |         class Car {
 53 |             /**
 54 |             * Present the object Car
 55 |             *
 56 |             * @return {None}
 57 |             */
 58 |             present() {
 59 |                 return 'I have a ' + this.carname;
 60 |             }
 61 |         }
 62 |         """
 63 | 
 64 |         tree = parse_code(code_sample, 'javascript')
 65 |         root = tree.root_node
 66 |         
 67 |         fn1, fn2 = JavascriptParser.get_function_list(root)
 68 |         
 69 | 
 70 |         docs1 = JavascriptParser.get_docstring(fn1)
 71 |         docs2 = JavascriptParser.get_docstring(fn2)
 72 |         
 73 |         self.assertEqual(docs1, '/**\n        * Dispatched when the repositories are loaded by the request saga\n        *\n        * @param  {array} repos The repository data\n        * @param  {string} username The current username\n        *\n        * @return {object}      An action object with a type of LOAD_REPOS_SUCCESS passing the repos\n        */')
 74 |         self.assertEqual(docs2, '/**\n            * Present the object Car\n            *\n            * @return {None}\n            */')
 75 | 
 76 |     def test_get_function_metadata(self):
 77 |         root = self.root_node
 78 |         
 79 |         _function = JavascriptParser.get_function_list(root)[1]
 80 |         metadata = JavascriptParser.get_function_metadata(_function)
 81 | 
 82 |         for key in ['identifier', 'parameters', 'return_type']:
 83 |             self.assertTrue(key in metadata.keys())
 84 |         self.assertEqual(metadata['identifier'], 'songsLoaded')
 85 |         self.assertEqual(metadata['parameters'], {'repos': None, 'username': None})
 86 |         
 87 |     def test_metadata_with_return_statement(self):
 88 |         code_sample = '''
 89 |         function myFunction(p1, p2) {
 90 |             return p1 * p2;
 91 |         }
 92 |         '''
 93 |         root = parse_code(code_sample, 'javascript').root_node
 94 |         fn = JavascriptParser.get_function_list(root)[0]
 95 |         metadata = JavascriptParser.get_function_metadata(fn)
 96 |         
 97 |         return_type = metadata['return_type']
 98 |         self.assertEqual(return_type, '<not_specific>')
 99 | 
100 |     def test_get_class_metadata(self):
101 |         root = self.root_node
102 |         
103 |         classes = JavascriptParser.get_class_list(root)[0]
104 |         metadata = JavascriptParser.get_class_metadata(classes)
105 | 
106 |         self.assertEqual(metadata['identifier'], 'Model')
107 |         self.assertEqual(metadata['parameters'], {'Car': None})
108 | 
109 |     def test_extract_docstring(self):
110 |         pass
111 |         
112 |         
113 |     def test_metadata_with_arrow_function(self):
114 |         code_sample = '''
115 |             export const parseModel = async (mesh) =>
116 |                 new Promise((resolve) => {
117 |                     exporter.parse(
118 |                         mesh,
119 |                         (gltf) => {
120 |                             const blob = new Blob([gltf], { type: "application/octet-stream" });
121 |                             resolve(blob);
122 |                             return blob;
123 |                         },
124 |                         (error) => {
125 |                             console.log(error);
126 |                             return error;
127 | 
128 |                         }
129 |                     );
130 |                 });        
131 |         '''
132 |         root = parse_code(code_sample, 'javascript').root_node
133 |         fn = JavascriptParser.get_function_list(root)[0]
134 |         metadata = JavascriptParser.get_function_metadata(fn)
135 |         
136 |         identifier = metadata['identifier']
137 |         self.assertEqual(identifier, 'parseModel')
138 |         
139 |     def test_metadata_with_undecleared_functions(self):
140 |         code_sample = """
141 |             const asyncFunctionExpression = async function() {
142 |                 // async function expression definition
143 |                 return a 
144 |             };
145 |             
146 |             const generatorFunctionExpression = function*() {
147 |                 // generator function expression definition
148 |                 return b
149 |             };        
150 |         """
151 |         root = parse_code(code_sample, 'javascript').root_node
152 |         fn1, fn2 = JavascriptParser.get_function_list(root)
153 |         
154 |         self.assertEqual(fn1.type, 'function') 
155 |         self.assertEqual(fn2.type, 'generator_function')
156 |         
157 |         metadata1 = JavascriptParser.get_function_metadata(fn1)
158 |         metadata2 = JavascriptParser.get_function_metadata(fn2)
159 | 
160 |         self.assertEqual(metadata1['identifier'], 'asyncFunctionExpression')
161 |         self.assertEqual(metadata2['identifier'], 'generatorFunctionExpression')
162 | 
163 | 
164 | if __name__ == '__main__':
165 |     unittest.main()
166 | 


--------------------------------------------------------------------------------
/tests/test_parser/test_php.py:
--------------------------------------------------------------------------------
  1 | '''test for PHP parser'''
  2 | import os
  3 | import unittest
  4 | from pathlib import Path
  5 | 
  6 | from src.codetext.parser import PhpParser
  7 | from src.codetext.utils import parse_code
  8 | 
  9 | 
 10 | class Test_PhpParser(unittest.TestCase):
 11 |     def setUp(self) -> None:
 12 |         with open('tests/test_parser/test_sample/php_test_sample.php', 'r') as file:
 13 |             self.code_sample = file.read()
 14 |             
 15 |         tree = parse_code(self.code_sample, 'php')
 16 |         self.root_node = tree.root_node
 17 | 
 18 |         return super().setUp()
 19 | 
 20 |     def test_get_function_list(self):
 21 |         root = self.root_node
 22 |         
 23 |         function_list = PhpParser.get_function_list(root)
 24 |         
 25 |         self.assertEqual(len(function_list), 5)
 26 | 
 27 |     def test_get_class_list(self):
 28 |         root = self.root_node
 29 |         
 30 |         class_list = PhpParser.get_class_list(root)
 31 |         
 32 |         self.assertEqual(len(class_list), 3)
 33 | 
 34 |     def test_get_docstring(self):
 35 |         code_sample = """
 36 |         <?php
 37 |         /**
 38 |         * Get all image nodes.
 39 |         *
 40 |         * @param \DOMNode     $node       The \DOMDocument instance
 41 |         * @param boolean      $strict     If the document has to be valid
 42 |         *
 43 |         * @return \DOMNode
 44 |         */
 45 |         function getImageNodes(\DOMNode $node, $strict = true): \DOMNode
 46 |         {
 47 |             // ...
 48 |             return $node;
 49 |         }
 50 |         ?>
 51 |         """
 52 | 
 53 |         tree = parse_code(code_sample, 'php')
 54 |         root = tree.root_node
 55 |         
 56 |         fn = PhpParser.get_function_list(root)[0]
 57 | 
 58 |         docs = PhpParser.get_docstring(fn)
 59 |         
 60 |         self.assertEqual(docs, '/**\n        * Get all image nodes.\n        *\n        * @param \\DOMNode     $node       The \\DOMDocument instance\n        * @param boolean      $strict     If the document has to be valid\n        *\n        * @return \\DOMNode\n        */')
 61 |         
 62 | 
 63 |     def test_get_function_metadata(self):
 64 |         root = self.root_node
 65 |         
 66 |         function = list(PhpParser.get_function_list(root))[1]
 67 |         metadata = PhpParser.get_function_metadata(function)
 68 | 
 69 |         for key in ['identifier', 'parameters', 'return_type']:
 70 |             self.assertTrue(key in metadata.keys())
 71 |         self.assertEqual(metadata['parameters'],  {'$params': 'array', '$connectionOptions': 'array'})
 72 |         self.assertEqual(metadata['identifier'], 'constructDsn')
 73 |         self.assertEqual(metadata['return_type'], 'string')
 74 |         
 75 |     def test_metadata_with_return_statement(self):
 76 |         code_sample = '''
 77 |         <?php
 78 |         function sum($a, $b): {
 79 |             return $a + $b;
 80 |         }
 81 |         ?>
 82 |         '''
 83 |         root = parse_code(code_sample, 'PHP').root_node
 84 |         fn = PhpParser.get_function_list(root)[0]
 85 |         metadata = PhpParser.get_function_metadata(fn)
 86 |         
 87 |         return_type = metadata['return_type']
 88 |         self.assertEqual(return_type, '<not_specific>')
 89 | 
 90 |     def test_metadata_without_return_statement(self):
 91 |         code_sample = '''
 92 |         <?php
 93 |         function sum($a, $b): {
 94 |         }
 95 |         ?>
 96 |         '''
 97 |         root = parse_code(code_sample, 'PHP').root_node
 98 |         fn = PhpParser.get_function_list(root)[0]
 99 |         metadata = PhpParser.get_function_metadata(fn)
100 |         
101 |         return_type = metadata['return_type']
102 |         self.assertEqual(return_type, None)
103 | 
104 |     def test_get_class_metadata(self):
105 |         root = self.root_node
106 |         
107 |         _class, interface, trait = list(PhpParser.get_class_list(root))
108 |         class_metadata = PhpParser.get_class_metadata(_class)
109 | 
110 |         self.assertEqual(class_metadata['parameters'], {'AbstractSQLServerDriver': None})
111 |         self.assertEqual(class_metadata['identifier'], 'Driver')
112 |         
113 |         interface_metadata = PhpParser.get_class_metadata(interface)
114 |         self.assertEqual(interface_metadata['identifier'], 'MyInterface')
115 |         
116 |         trait_metadata = PhpParser.get_class_metadata(trait)
117 |         self.assertEqual(trait_metadata['identifier'], 'MyTrait')
118 |         
119 | 
120 | if __name__ == '__main__':
121 |     unittest.main()
122 | 


--------------------------------------------------------------------------------
/tests/test_parser/test_python.py:
--------------------------------------------------------------------------------
  1 | '''test for python parser'''
  2 | import os
  3 | import unittest
  4 | from pathlib import Path
  5 | 
  6 | from src.codetext.parser import PythonParser
  7 | from src.codetext.utils import parse_code
  8 | 
  9 | 
 10 | class Test_PythonParser(unittest.TestCase):
 11 |     def setUp(self) -> None:        
 12 |         with open('tests/test_parser/test_sample/py_test_sample.py', 'r') as file:
 13 |             self.code_sample = file.read()
 14 |         
 15 |         tree = parse_code(self.code_sample, 'python')
 16 |         self.root_node = tree.root_node
 17 |         return super().setUp()
 18 | 
 19 |     def test_get_function_list(self):
 20 |         root = self.root_node
 21 |         
 22 |         function_list = PythonParser.get_function_list(root)
 23 |         
 24 |         self.assertEqual(len(function_list), 3)
 25 | 
 26 |     def test_get_class_list(self):
 27 |         root = self.root_node
 28 |         
 29 |         class_list = PythonParser.get_class_list(root)
 30 |         self.assertEqual(len(class_list), 1)
 31 | 
 32 |     def test_get_docstring(self):
 33 |         code_sample = '''
 34 |         def test_sample():
 35 |             """This is a docstring"""
 36 |             return
 37 |         '''
 38 |         root = parse_code(code_sample, 'python').root_node
 39 |         
 40 |         function = PythonParser.get_function_list(root)[0]
 41 |         docstring = PythonParser.get_docstring(function)
 42 |         self.assertEqual(docstring, "This is a docstring")
 43 | 
 44 |     def test_get_function_metadata(self):
 45 |         code_sample = '''
 46 |         def test_sample(arg1: str = "string", arg2 = "another_string"):
 47 |             return NotImplement()
 48 |         '''
 49 |         root = parse_code(code_sample, 'python').root_node
 50 |         
 51 |         function = list(PythonParser.get_function_list(root))[0]
 52 |         metadata = PythonParser.get_function_metadata(function)
 53 | 
 54 |         for key in ['identifier', 'parameters', 'return_type']:
 55 |             self.assertTrue(key in metadata.keys())
 56 |         self.assertEqual(metadata['parameters'], {'arg1': 'str', 'arg2': None})
 57 |         self.assertEqual(metadata['identifier'], 'test_sample')
 58 | 
 59 |     def test_get_class_metadata(self):
 60 |         code_sample = '''
 61 |         class ABC():
 62 |             pass
 63 |             
 64 |         class Sample(ABC):
 65 |             def __init__(self):
 66 |                 pass
 67 | 
 68 |             def test_sample(self, arg1: str = "string", arg2 = "another_string"):
 69 |                 return NotImplement()
 70 |         
 71 |         class ThisIsalsoAclass(ABC, Sample):
 72 |             pass
 73 |         '''
 74 |         root = parse_code(code_sample, 'python').root_node
 75 |         
 76 |         
 77 |         classes = list(PythonParser.get_class_list(root))
 78 |         self.assertEqual(len(classes), 3)
 79 |         
 80 |         metadata = PythonParser.get_class_metadata(classes[0])
 81 |         self.assertEqual(metadata['parameters'], {})
 82 |         self.assertEqual(metadata['identifier'], 'ABC')
 83 |         
 84 |         
 85 |         metadata = PythonParser.get_class_metadata(classes[1])
 86 |         self.assertEqual(metadata['parameters'], {'ABC': None})
 87 |         self.assertEqual(metadata['identifier'], 'Sample')
 88 |         
 89 |         
 90 |         metadata = PythonParser.get_class_metadata(classes[2])
 91 |         self.assertEqual(metadata['parameters'], {'ABC': None, 'Sample': None})
 92 |         self.assertEqual(metadata['identifier'], 'ThisIsalsoAclass')
 93 |         
 94 |         
 95 |         
 96 |     def test_get_comment_list(self):
 97 |         root = self.root_node
 98 |         
 99 |         comment_list = PythonParser.get_comment_node(root)
100 |         comment_list = [node.text.decode() for node in comment_list]
101 |         
102 |         assert comment_list[1] == '# choose the rightmost element as pivot'
103 |         assert comment_list[2] == '# pointer for greater element'
104 |         assert len(comment_list) == 16
105 |         
106 |     def test_metadata_without_return_statement(self):
107 |         code_sample = '''
108 |         def sum2num():
109 |             pass
110 |         '''
111 |         root = parse_code(code_sample, 'python').root_node
112 |         fn = PythonParser.get_function_list(root)[0]
113 |         metadata = PythonParser.get_function_metadata(fn)
114 |         
115 |         return_type = metadata['return_type']
116 |         self.assertEqual(return_type, None)
117 |         
118 |     def test_metadata_with_return_statement(self):
119 |         code_sample = '''
120 |         def sum2num():
121 |             return True
122 |         '''
123 |         root = parse_code(code_sample, 'python').root_node
124 |         fn = PythonParser.get_function_list(root)[0]
125 |         metadata = PythonParser.get_function_metadata(fn)
126 |         
127 |         return_type = metadata['return_type']
128 |         self.assertEqual(return_type, '<not_specific>')
129 |         
130 |     def test_get_parameter(self):
131 |         code_sample = '''
132 |         def sum2num(a: tree_sitter.Node=None, b=None, c:string) -> int:
133 |             pass
134 |         '''
135 |         
136 |         root = parse_code(code_sample, 'python').root_node
137 |         fn = PythonParser.get_function_list(root)[0]
138 |         
139 |         metadata = PythonParser.get_function_metadata(fn)
140 |         parameter = metadata['parameters']
141 |         self.assertEqual(len(parameter.keys()), 3)
142 |         self.assertTrue('a' in parameter.keys())
143 |         self.assertTrue('b' in parameter.keys())
144 |         self.assertTrue('c' in parameter.keys())
145 |         
146 |         return_type = metadata['return_type']
147 |         self.assertEqual(return_type, 'int')
148 |     
149 |     
150 | if __name__ == '__main__':
151 |     unittest.main()
152 | 


--------------------------------------------------------------------------------
/tests/test_parser/test_ruby.py:
--------------------------------------------------------------------------------
  1 | '''test for Ruby parser'''
  2 | import os
  3 | import unittest
  4 | from pathlib import Path
  5 | 
  6 | from src.codetext.parser import RubyParser
  7 | from src.codetext.utils import parse_code
  8 | 
  9 | 
 10 | class Test_RubyParser(unittest.TestCase):
 11 |     def setUp(self) -> None:
 12 |         with open('tests/test_parser/test_sample/ruby_test_sample.rb', 'r') as file:
 13 |             self.code_sample = file.read()
 14 |             
 15 |         tree = parse_code(self.code_sample, 'ruby')
 16 |         self.root_node = tree.root_node
 17 | 
 18 |         return super().setUp()
 19 | 
 20 |     def test_get_function_list(self):
 21 |         root = self.root_node
 22 |         
 23 |         function_list = RubyParser.get_function_list(root)
 24 |         
 25 |         self.assertEqual(len(function_list), 2)
 26 | 
 27 |     def test_get_class_list(self):
 28 |         root = self.root_node
 29 |         
 30 |         class_list = RubyParser.get_class_list(root)
 31 |         
 32 |         self.assertEqual(len(class_list), 3)
 33 | 
 34 |     def test_get_docstring(self):
 35 |         code_sample = """
 36 |         module Encryption
 37 | 
 38 |             # Search for links.
 39 |             #
 40 |             # @param query [String] The search query.
 41 |             # @option options [String, RedditKit::Subreddit] subreddit The optional subreddit to search.
 42 |             def encrypt(string)
 43 |                 Digest::SHA2.hexdigest(string)
 44 |             end
 45 |         end
 46 |            
 47 |         =begin 
 48 |         comment line 1
 49 |         comment line 2
 50 |         =end  
 51 |         class Orange
 52 |             def initialize
 53 |                 @juice_available = 100
 54 |             end
 55 |             def squeeze
 56 |                 @juice_available -= 50
 57 |             end
 58 |         end
 59 | 
 60 |         orange = Orange.new
 61 |         orange.squeeze
 62 |         """
 63 | 
 64 |         tree = parse_code(code_sample, 'ruby')
 65 |         root = tree.root_node
 66 |         
 67 |         fn = RubyParser.get_function_list(root)[0]
 68 |         clas = RubyParser.get_class_list(root)[1]
 69 |         
 70 |         docs1 = RubyParser.get_docstring(fn)
 71 |         docs2 = RubyParser.get_docstring(clas)
 72 |         
 73 |         self.assertEqual(docs1, '# Search for links.\n#\n# @param query [String] The search query.\n# @option options [String, RedditKit::Subreddit] subreddit The optional subreddit to search.')
 74 |         self.assertEqual(docs2, '        comment line 1\n        comment line 2')
 75 | 
 76 |     def test_get_function_metadata(self):
 77 |         root = self.root_node
 78 |         
 79 |         _function = RubyParser.get_function_list(root)[0]
 80 |         metadata = RubyParser.get_function_metadata(_function)
 81 | 
 82 |         for key in ['identifier', 'parameters', 'return_type']:
 83 |             self.assertTrue(key in metadata.keys())
 84 |         self.assertEqual(metadata['identifier'], 'search')
 85 |         self.assertEqual(metadata['parameters'], {'query': None, 'options': None})
 86 |         self.assertEqual(metadata['return_type'], None)
 87 |         
 88 |         _singleton = RubyParser.get_function_list(root)[1]
 89 |         metadata = RubyParser.get_function_metadata(_singleton)
 90 |         for key in ['identifier', 'parameters', 'return_type']:
 91 |                     self.assertTrue(key in metadata.keys())
 92 |         self.assertEqual(metadata['identifier'], 'my_method')
 93 |         self.assertEqual(metadata['parameters'], {'a': None})
 94 |         self.assertEqual(metadata['return_type'], '<not_specific>')
 95 |         
 96 |     
 97 |     def test_metadata_without_return_statement(self):  # NOTE(review): name says "without", but the sample ends with `return mood` and '<not_specific>' is expected
 98 |         code_sample = '''
 99 |         def write_code(number_of_errors)
100 |             if number_of_errors > 1
101 |                 mood =  "Ask me later"
102 |             else
103 |                 mood = puts "No Problem"
104 |             end  
105 |             return mood
106 |         end
107 |         '''
108 |         root = parse_code(code_sample, 'Ruby').root_node
109 |         fn = RubyParser.get_function_list(root)[0]
110 |         metadata = RubyParser.get_function_metadata(fn)
111 |         
112 |         return_type = metadata['return_type']
113 |         self.assertEqual(return_type, '<not_specific>')
114 |         
115 | 
116 |     def test_get_class_metadata(self):
117 |         root = self.root_node
118 |         
119 |         classes = RubyParser.get_class_list(root)[1]
120 |         metadata = RubyParser.get_class_metadata(classes)
121 | 
122 |         self.assertEqual(metadata['identifier'], 'Client')
123 |         self.assertEqual(metadata['parameters'], {'API': None})
124 |         
125 |     def test_get_action_list(self):
126 |         root = self.root_node
127 |         actions  = RubyParser.get_action_list(root)
128 |         
129 |         self.assertEqual(len(actions), 5)
130 |     
131 |     def test_get_action_metadata(self):
132 |         root = self.root_node
133 |         actions  = RubyParser.get_action_list(root)
134 |         metadatas = [ RubyParser.get_action_metadata(action) for action in actions]
135 |         self.assertEqual(metadatas[0]["identifier"], "load_current_value")      
136 |         self.assertEqual(metadatas[1]["identifier"], "action:install")
137 |         self.assertEqual(metadatas[2]["identifier"], "converge_by")
138 | 
139 |         self.assertEqual(metadatas[3]["identifier"], "action:reinstall")
140 |         self.assertEqual(metadatas[4]["identifier"], "converge_by")
141 | 
142 |         self.assertEqual(metadatas[0]["parameters"]["new_resource"], None)
143 |         self.assertEqual(metadatas[0]["parameters"]["old_resource"], None)
144 | 
145 | 
146 | if __name__ == '__main__':
147 |     unittest.main()
148 | 


--------------------------------------------------------------------------------
/tests/test_parser/test_rust.py:
--------------------------------------------------------------------------------
  1 | '''test for Rust parser'''
  2 | import os
  3 | import unittest
  4 | from pathlib import Path
  5 | 
  6 | from src.codetext.parser import RustParser
  7 | from src.codetext.utils import parse_code
  8 | 
  9 | 
 10 | class Test_RustParser(unittest.TestCase):
 11 |     def setUp(self) -> None:
 12 |         with open('tests/test_parser/test_sample/rust_test_sample.rs', 'r') as file:
 13 |             self.code_sample = file.read()
 14 |             
 15 |         tree = parse_code(self.code_sample, 'rust')
 16 |         self.root_node = tree.root_node
 17 | 
 18 |         return super().setUp()
 19 | 
 20 |     def test_get_function_list(self):
 21 |         root = self.root_node
 22 |         
 23 |         function_list = RustParser.get_function_list(root)
 24 |         
 25 |         self.assertEqual(len(function_list), 4)
 26 | 
 27 |     def test_get_class_list(self):
 28 |         root = self.root_node
 29 |         
 30 |         class_list = RustParser.get_class_list(root)
 31 |         
 32 |         self.assertEqual(len(class_list), 2)
 33 | 
 34 |     def test_get_docstring(self):
 35 |         code_sample = """
 36 |         // Comment something
 37 |         mod my_mod {
 38 |             /// Creates a new rendering surface.
 39 |             ///
 40 |             /// # Arguments
 41 |             ///
 42 |             /// Initialization of surfaces happens through the types provided by
 43 |             /// [`drm-rs`](drm).
 44 |             ///
 45 |             /// - [`crtcs`](drm::control::crtc) represent scanout engines of the device pointing to one framebuffer. \\
 46 |             ///     Their responsibility is to read the data of the framebuffer and export it into an "Encoder". \\
 47 |             ///     The number of crtc's represent the number of independent output devices the hardware may handle.
 48 |             fn private_function() {
 49 |                 println!("called `my_mod::private_function()`");
 50 |             }
 51 | 
 52 |             /**  - Outer block doc (exactly) 2 asterisks */
 53 |             pub fn function() {
 54 |                 println!("called `my_mod::function()`");
 55 |             }
 56 | 
 57 |             // Items can access other items in the same module,
 58 |             // even when private.
 59 |             pub fn indirect_access() {
 60 |                 print!("called `my_mod::indirect_access()`, that\n> ");
 61 |                 private_function();
 62 |             }
 63 |         }
 64 |         """
 65 | 
 66 |         tree = parse_code(code_sample, 'rust')
 67 |         root = tree.root_node
 68 |         
 69 |         fn1 = RustParser.get_function_list(root)[0]
 70 |         fn2 = RustParser.get_function_list(root)[1]
 71 |         clas = RustParser.get_class_list(root)[0]
 72 |         
 73 |         docs1 = RustParser.get_docstring(fn1)
 74 |         docs2 = RustParser.get_docstring(fn2)
 75 |         docs3 = RustParser.get_docstring(clas)
 76 |         
 77 |         self.assertEqual(docs1, '/// Creates a new rendering surface.\n///\n/// # Arguments\n///\n/// Initialization of surfaces happens through the types provided by\n/// [`drm-rs`](drm).\n///\n/// - [`crtcs`](drm::control::crtc) represent scanout engines of the device pointing to one framebuffer. \\\n///     Their responsibility is to read the data of the framebuffer and export it into an "Encoder". \\\n///     The number of crtc\'s represent the number of independent output devices the hardware may handle.')
 78 |         self.assertEqual(docs2, '/**  - Outer block doc (exactly) 2 asterisks */')
 79 |         self.assertEqual(docs3, '// Comment something')
 80 | 
 81 |     def test_get_function_metadata(self):
 82 |         root = self.root_node
 83 |         
 84 |         function = RustParser.get_function_list(root)[0]
 85 |         metadata = RustParser.get_function_metadata(function)
 86 | 
 87 |         for key in ['identifier', 'parameters', 'return_type']:
 88 |             self.assertTrue(key in metadata.keys())
 89 |         self.assertEqual(metadata['identifier'], 'long_string')
 90 |         self.assertEqual(metadata['parameters'], {'x': '&str'})
 91 |         self.assertEqual(metadata['return_type'], '&str')
 92 |     
 93 |     def test_metadata_with_return_statement(self):
 94 |         code_sample = '''
 95 |         fn quack(&self) {
 96 |             println!("quack!");
 97 |             return "hello";
 98 |         }
 99 |         '''
100 |         root = parse_code(code_sample, 'Rust').root_node
101 |         fn = RustParser.get_function_list(root)[0]
102 |         metadata = RustParser.get_function_metadata(fn)
103 |         
104 |         return_type = metadata['return_type']
105 |         self.assertEqual(return_type, '<not_specific>')
106 | 
107 |     def test_get_class_metadata(self):
108 |         root = self.root_node
109 |         
110 |         classes = RustParser.get_class_list(root)[0]
111 |         metadata = RustParser.get_class_metadata(classes)
112 |         
113 |         self.assertEqual(metadata['identifier'], 'Quack')
114 |         self.assertEqual(metadata['parameters'], {'Duck': None})
115 |         
116 | 
117 | if __name__ == '__main__':
118 |     unittest.main()
119 | 


--------------------------------------------------------------------------------
/tests/test_parser/test_sample/README.md:
--------------------------------------------------------------------------------
  1 | # Tree-sitter function/class type
  2 | 
  3 | ## C/C++
  4 | Each sample below is annotated with comments naming the tree-sitter node type of the definition that follows.
  5 | 
  6 | - with C
  7 | ```c
  8 | // function_definition
  9 | void reverseSentence(int random_seed) {
 10 |     char c;
 11 |     scanf("%c", &c);
 12 |     if (c != '\n') {
 13 |         reverseSentence();
 14 |         printf("%c", c);
 15 |     }
 16 | }
 17 | ```
 18 | 
 19 | - with C++
 20 | ```c++
 21 | // function_definition
 22 | double plusFuncDouble(double x, double y) {
 23 |   return x + y;
 24 | }
 25 | 
 26 | // function_definition
 27 | int main() {
 28 |   int myNum1 = plusFuncInt(8, 5);
 29 |   double myNum2 = plusFuncDouble(4.3, 6.26);
 30 |   cout << "Int: " << myNum1 << "\n";
 31 |   cout << "Double: " << myNum2;
 32 |   return 0;
 33 | }
 34 | 
 35 | // class_specifier
 36 | class Animal {
 37 |   public:
 38 |     // function_definition
 39 |     void animalSound() {
 40 |       cout << "The animal makes a sound \n";
 41 |     }
 42 | };
 43 | 
 44 | // class_specifier
 45 | class Pig : public Animal {
 46 |   public:
 47 |     // function_definition
 48 |     void animalSound() {
 49 |       cout << "The pig says: wee wee \n";
 50 |     }
 51 | };
 52 | ```
 53 | 
 54 | ## C#
 55 | 
 56 | ```c#
 57 | // local_function_statement
 58 | private static string GetText(string path, string filename)
 59 | {
 60 |     // local_declaration_statement
 61 |     var reader = File.OpenText($"{AppendPathSeparator(path)}{filename}");
 62 |     var text = reader.ReadToEnd();
 63 |     return text;
 64 | 
 65 |     // local_function_statement
 66 |     string AppendPathSeparator(string filepath)
 67 |     {
 68 |         return filepath.EndsWith(@"\") ? filepath : filepath + @"\";
 69 |     }
 70 | }
 71 | 
 72 | using System;
 73 | 
 74 | // class_declaration
 75 | public class Dog : Animal {
 76 |  
 77 |     String name;
 78 |     String breed;
 79 |     int age;
 80 |     String color;
 81 |  
 82 |     // constructor_declaration
 83 |     public Dog(String name, String breed,
 84 |                   int age, String color)
 85 |     {
 86 |         this.name = name;
 87 |         this.breed = breed;
 88 |         this.age = age;
 89 |         this.color = color;
 90 |     }
 91 |     
 92 |     // method_declaration
 93 |     static void Main(string[] args)
 94 |     {
 95 |       Car myObj = new Car();
 96 |       Console.WriteLine(myObj.color);
 97 |     }
 98 | }
 99 | ```
100 | 
101 | ## Java
102 | 
103 | ```Java
104 | // class_declaration
105 | public class SaveFileController extends SudoUser implements FileController {
106 |     // field_declaration
107 |     private ArrayList<User> allUsers;
108 |     private String saveFile = "test_save_file4.sav";
109 | 
110 |     // constructor_declaration
111 |     public SaveFileController(){
112 |         this.allUsers = new ArrayList<User>();
113 |     }
114 | 
115 |     // method_declaration
116 |     public HabitList getHabitList(Context context, int userIndex){
117 |         loadFromFile(context);
118 |         return this.allUsers.get(userIndex).getHabitList();
119 |     }
120 | }
121 | ```
122 | 
123 | ## Python
124 | ```python
125 | # class_definition
126 | class Person:
127 |     # function_definition
128 |     def __init__(self, name, age):
129 |         self.name = name
130 |         self.age = age
131 | 
132 |     # function_definition
133 |     def say_my_name(self):
134 |         print(self.name)
135 | 
136 | # function_definition
137 | def create_a_person(name, age):
138 |     new_person = Person(name, age)
139 | ```
140 | 
141 | ## JavaScript
142 | ```JavaScript
143 | // function_declaration
144 | export function loadSongs() {
145 |     return {
146 |         type: LOAD_SONGS,
147 |     };
148 | }
149 | 
150 | // class_declaration
151 | class Model extends Car {
152 |     // method_definition
153 |     constructor(brand, mod) {
154 |         super(brand);
155 |         this.model = mod;
156 |     }
157 | 
158 |     // method_definition
159 |     show() {
160 |         return this.present() + ', it is a ' + this.model;
161 |     }
162 | }
163 | ```
164 | 
165 | ## PHP
166 | 
167 | ```PHP
168 | // function_definition
169 | function familyName($fname) {
170 |   echo "$fname Refsnes.<br>";
171 | }
172 | 
173 | // class_declaration
174 | final class Driver extends AbstractSQLServerDriver
175 | {
176 |     // method_declaration
177 |     public function connect(array $params)
178 |     {
179 |         $driverOptions = $dsnOptions = [];
180 |         if (isset($params['driverOptions'])) {
181 |             foreach ($params['driverOptions'] as $option => $value) {
182 |                 if (is_int($option)) {
183 |                     $driverOptions[$option] = $value;
184 |                 } else {
185 |                         $dsnOptions[$option] = $value;
186 |                 }
187 |             }
188 |         }
189 |     }
190 | }
191 | ```
192 | 
193 | ## GO
194 | 
195 | ```GO
196 | // function_declaration
197 | func add(x int, y int) int {
198 | 	return x + y
199 | }
200 | 
201 | // function_declaration
202 | func main() {
203 | 	fmt.Println(add(42, 13))
204 | }
205 | 
206 | // method_declaration
207 | func (e TypeError) Error() string {
208 | 		msg := e.Type1.String()
209 | 		if e.Type2 != nil {
210 | 			msg += " and " + e.Type2.String()
211 | 	}
212 | 	msg += " " + e.Extra
213 | 	return msg
214 | }
215 | 
216 | ```
217 | 
218 | ## Ruby
219 | 
220 | ```Ruby
221 | # class
222 | class Customer
223 |    @@no_of_customers = 0
224 |    
225 |    # method
226 |    def initialize(id, name, addr)
227 |       @cust_id = id
228 |       @cust_name = name
229 |       @cust_addr = addr
230 |    end
231 | end
232 | 
233 | # method
234 | def test(a1 = "Ruby", a2 = "Perl")
235 |    puts "The programming language is #{a1}"
236 |    puts "The programming language is #{a2}"
237 | end
238 | 
239 | # module
240 | module RedditKit
241 |     # class
242 |     class Client < API
243 |         # method
244 |         def search(query, options = {})
245 |             path = "%s/search.json" % ('r/' + options[:subreddit] if options[:subreddit])
246 |             parameters = { :q => query,
247 |                             :restrict_sr => options[:restrict_to_subreddit],
248 |                             :limit       => options[:limit],
249 |                             :count       => options[:count],
250 |                             :sort        => options[:sort],
251 |                             :before      => options[:before],
252 |                             :after       => options[:after],
253 |                             :syntax      => options[:syntax],
254 |                             :t           => options[:time]
255 |             }
256 | 
257 |             objects_from_response(:get, path, parameters)
258 |         end
259 |     end
260 | end
261 |     
262 | ```
263 | 
264 | ## Rust
265 | 
266 | ```Rust
267 | // trait_item
268 | trait Quack {
269 |     // function_signature_item <- this is a function declaration (signature only, no body)
270 |     fn quack(&self);
271 | }
272 | 
273 | // struct_item
274 | struct Duck ();
275 | 
276 | // function_item
277 | fn long_string(x: &str) -> &str {
278 |     if x.len() > 10 {
279 |         "too long"
280 |     } else {
281 |         x
282 |     }
283 | 
284 | }
285 | 
286 | // impl_item
287 | impl Quack for Duck {
288 |     // function_item
289 |     fn quack(&self) {
290 |         println!("quack!");
291 |     }
292 | }
293 | 
294 | // mod_item
295 | mod my_mod {
296 |     // function_item
297 |     fn private_function() {
298 |         println!("called `my_mod::private_function()`");
299 |     }
300 | }
301 | 
302 | // function_item
303 | fn quack_everyone <I> (iter: I)
304 | where I: Iterator<Item=Box<Quack>> {
305 |     for d in iter {
306 |         d.quack();
307 |     }
308 | }
309 | ```


--------------------------------------------------------------------------------
/tests/test_parser/test_sample/c_sharp_test_sample.cs:
--------------------------------------------------------------------------------
 1 | private static string GetText(string path, string filename)
 2 | {
 3 |     var reader = File.OpenText($"{AppendPathSeparator(path)}{filename}");
 4 |     var text = reader.ReadToEnd();
 5 |     return text;
 6 | 
 7 |     string AppendPathSeparator(string filepath)
 8 |     {
 9 |         return filepath.EndsWith(@"\") ? filepath : filepath + @"\";
10 |     }
11 | }
12 | 
13 | using System;
14 | public class Dog : Animal {
15 |  
16 |     // Instance Variables
17 |     String name;
18 |     String breed;
19 |     int age;
20 |     String color;
21 |  
22 |     // Constructor Declaration of Class
23 |     public Dog(String name, String breed,
24 |                   int age, String color)
25 |     {
26 |         this.name = name;
27 |         this.breed = breed;
28 |         this.age = age;
29 |         this.color = color;
30 |     }
31 |     
32 |     // Docstring of this function
33 |     static void Main(string[] args)
34 |     {
35 |       Car myObj = new Car();
36 |       Console.WriteLine(myObj.color);
37 |     }
38 |   }


--------------------------------------------------------------------------------
/tests/test_parser/test_sample/c_test_sample.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | void reverseSentence();
 3 | 
 4 | /**
 5 |  * A brief description. A more elaborate class description
 6 |  * @param random_seed somearg.
 7 |  * @see Test()
 8 |  * @return The test results
 9 |  */
10 | void reverseSentence(int random_seed) {
11 |     char c;
12 |     scanf("%c", &c);
13 |     if (c != '\n') {
14 |         reverseSentence();
15 |         printf("%c", c);
16 |     }
17 | }
18 | 
19 | int main() {
20 |     printf("Enter a sentence: ");
21 |     reverseSentence();
22 |     return 0;
23 | }
24 | 


--------------------------------------------------------------------------------
/tests/test_parser/test_sample/cpp_test_sample.cpp:
--------------------------------------------------------------------------------
 1 | // Derived class
 2 | class Car: public Vehicle, private B {
 3 |   public:
 4 |     string model = "Mustang";
 5 | };
 6 | 
 7 | // A static function
 8 | int sum2number (int a, int b) {
 9 |   return a + b;
10 | }
11 | 
12 | // Base class
13 | class Vehicle {
14 |   public:
15 |     string brand = "Ford";
16 |     void honk() {
17 |       cout << "Tuut, tuut! \n" ;
18 |     }
19 | };
20 | 
21 | int main() {
22 |   Car myCar;
23 |   myCar.honk();
24 |   cout << myCar.brand + " " + myCar.model;
25 |   return 0;
26 | }
27 | 
28 | 


--------------------------------------------------------------------------------
/tests/test_parser/test_sample/go_test_sample.go:
--------------------------------------------------------------------------------
 1 | // Copyright 2016 The Go Authors. All rights reserved.
 2 | // Use of this source code is governed by a BSD-style
 3 | // license that can be found in the LICENSE file.
 4 | type TypeError struct {
 5 | 		Type1, Type2 reflect.Type
 6 | 		Extra        string
 7 | 	}
 8 | 
 9 | // The path package should only be used for paths separated by forward
10 | // slashes, such as the paths in URLs. This package does not deal with
11 | // Windows paths with drive letters or backslashes; to manipulate
12 | // operating system paths, use the [path/filepath] package.
13 | func (e TypeError) Error() string {
14 | 		msg := e.Type1.String()
15 | 		if e.Type2 != nil {
16 | 			msg += " and " + e.Type2.String()
17 | 	}
18 | 	msg += " " + e.Extra
19 | 	return msg
20 | }
21 | 


--------------------------------------------------------------------------------
/tests/test_parser/test_sample/java_test_sample.java:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Implements the file to save data to.
 3 |  *
 4 |  * @version 1.0
 5 |  */
 6 | public class SaveFileController extends SudoUser implements FileController {
 7 |     private ArrayList<User> allUsers;
 8 |     //private String username;
 9 |     private String saveFile = "test_save_file4.sav";
10 | 
11 |     public SaveFileController(){
12 |         this.allUsers = new ArrayList<User>();
13 |     }
14 | 
15 |     /**
16 |      * Gets HabitList instance.
17 |      *
18 |      * @param context instance of Context
19 |      * @param userIndex integer user index
20 |      * @return HabitList
21 |      * @see HabitList
22 |      */
23 |     public HabitList getHabitList(Context context, int userIndex){
24 |         loadFromFile(context);
25 |         return this.allUsers.get(userIndex).getHabitList();
26 |     }
27 | 
28 |     /**
29 |      * Removes a habit event from a particular user's habit event list.
30 |      *
31 |      * @param context instance of Context
32 |      * @param userIndex integer user index
33 |      * @param habitIndex integer index of habit
34 |      * @param habitEventIndex integer index of habit event
35 |      */
36 |     public void removeHabitEvent(Context context, int userIndex, int habitIndex, int habitEventIndex){
37 |         loadFromFile(context);
38 |         this.allUsers.get(userIndex).getHabitList().getHabit(habitIndex)
39 |                 .getHabitEventHistory().getHabitEvents().remove(habitEventIndex);
40 |         saveToFile(context);
41 |     }
42 | }


--------------------------------------------------------------------------------
/tests/test_parser/test_sample/javascript_test_sample.js:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * App Actions
 3 |  *
 4 |  * Actions change things in your application
 5 |  * Since this boilerplate uses a uni-directional data flow, specifically redux,
 6 |  * we have these actions which are the only way your application interacts with
 7 |  * your application state. This guarantees that your state is up to date and nobody
 8 |  * messes it up weirdly somewhere.
 9 |  *
10 |  * To add a new Action:
11 |  * 1) Import your constant
12 |  * 2) Add a function like this:
13 |  *    export function yourAction(var) {
14 |      *        return { type: YOUR_ACTION_CONSTANT, var: var }
15 |  *    }
16 |  */
17 | 
18 | import {
19 |       LOAD_SONGS,
20 |       LOAD_SONGS_SUCCESS,
21 |       LOAD_SONGS_ERROR,
22 | } from './constants';
23 | 
24 | /**
25 |  * Load the repositories, this action starts the request saga
26 |  *
27 |  * @return {object} An action object with a type of LOAD_REPOS
28 |  */
29 | export function loadSongs() {
30 |       return {
31 |         type: LOAD_SONGS,
32 |   };
33 | }
34 | 
35 | /**
36 |  * Dispatched when the repositories are loaded by the request saga
37 |  *
38 |  * @param  {array} repos The repository data
39 |  * @param  {string} username The current username
40 |  *
41 |  * @return {object}      An action object with a type of LOAD_REPOS_SUCCESS passing the repos
42 |  */
43 | export function songsLoaded(repos, username=10) {
44 |       return {
45 |         type: LOAD_SONGS_SUCCESS,
46 |     repos,
47 |     username,
48 |   };
49 | }
50 | 
51 | /**
52 |  * Dispatched when loading the repositories fails
53 |  *
54 |  * @param  {object} error The error
55 |  *
56 |  * @return {object}       An action object with a type of LOAD_REPOS_ERROR passing the error
57 |  */
58 | export function songsLoadingError(error) {
59 |       return {
60 |         type: LOAD_SONGS_ERROR,
61 |     error,
62 |   };
63 | }
64 | 
65 | class Model extends Car {
66 |   constructor(brand, mod) {
67 |     super(brand);
68 |     this.model = mod;
69 |   }
70 |   
71 |   /**
72 |   * Comment something
73 |   */
74 |   show() {
75 |     return this.present() + ', it is a ' + this.model;
76 |   }
77 | }
78 | 
79 | class Car {
80 |   constructor(brand) {
81 |     this.carname = brand;
82 |   }
83 |   
84 |     /**
85 |     * Dispatched when loading the repositories fails
86 |     *
87 |     * @param  {object} error The error
88 |     *
89 |     * @return {object}       An action object with a type of LOAD_REPOS_ERROR passing the error
90 |     */
91 |   present() {
92 |     return 'I have a ' + this.carname;
93 |   }
94 | }
95 | 


--------------------------------------------------------------------------------
/tests/test_parser/test_sample/php_test_sample.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | 
  3 | final class Driver extends AbstractSQLServerDriver
  4 | {
  5 |     /**
  6 |      * {@inheritdoc}
  7 |      *
  8 |      * @return Connection
  9 |      */
 10 |     public function connect(array $params)
 11 |     {
 12 |             $driverOptions = $dsnOptions = [];
 13 | 
 14 |         if (isset($params['driverOptions'])) {
 15 |                 foreach ($params['driverOptions'] as $option => $value) {
 16 |                     if (is_int($option)) {
 17 |                         $driverOptions[$option] = $value;
 18 |                 } else {
 19 |                         $dsnOptions[$option] = $value;
 20 |                 }
 21 |             }
 22 |         }
 23 | 
 24 |         if (! empty($params['persistent'])) {
 25 |                 $driverOptions[PDO::ATTR_PERSISTENT] = true;
 26 |         }
 27 | 
 28 |         try {
 29 |                 $pdo = new PDO(
 30 |                     $this->constructDsn($params, $dsnOptions),
 31 |                 $params['user'] ?? '',
 32 |                 $params['password'] ?? '',
 33 |                 $driverOptions
 34 |             );
 35 |         } catch (\\PDOException $exception) {
 36 |                 throw PDOException::new($exception);
 37 |         }
 38 | 
 39 |         return new Connection(new PDOConnection($pdo));
 40 |     }
 41 | 
 42 |     /**
 43 |      * Constructs the Sqlsrv PDO DSN.
 44 |      *
 45 |      * @param mixed[]  $params
 46 |      * @param string[] $connectionOptions
 47 |      *
 48 |      * @throws Exception
 49 |      */
 50 |     private function constructDsn(array $params=null, array $connectionOptions): string
 51 |     {
 52 |             $dsn = 'sqlsrv:server=';
 53 | 
 54 |         if (isset($params['host'])) {
 55 |                 $dsn .= $params['host'];
 56 | 
 57 |             if (isset($params['port'])) {
 58 |                     $dsn .= ',' . $params['port'];
 59 |             }
 60 |         } elseif (isset($params['port'])) {
 61 |                 throw PortWithoutHost::new();
 62 |         }
 63 | 
 64 |         if (isset($params['dbname'])) {
 65 |                 $connectionOptions['Database'] = $params['dbname'];
 66 |         }
 67 | 
 68 |         if (isset($params['MultipleActiveResultSets'])) {
 69 |                 $connectionOptions['MultipleActiveResultSets'] = $params['MultipleActiveResultSets'] ? 'true' : 'false';
 70 |         }
 71 | 
 72 |         return $dsn . $this->getConnectionOptionsDsn($connectionOptions);
 73 |     }
 74 | 
 75 |     /**
 76 |      * Converts a connection options array to the DSN
 77 |      *
 78 |      * @param string[] $connectionOptions
 79 |      */
 80 |     private function getConnectionOptionsDsn(array $connectionOptions): string
 81 |     {
 82 |             $connectionOptionsDsn = '';
 83 | 
 84 |         foreach ($connectionOptions as $paramName => $paramValue) {
 85 |                 $connectionOptionsDsn .= sprintf(';%s=%s', $paramName, $paramValue);
 86 |         }
 87 | 
 88 |         return $connectionOptionsDsn;
 89 |     }
 90 | }
 91 | 
 92 | interface MyInterface {
 93 |         public function myMethod() {
 94 |             // Method implementation
 95 |         }
 96 |     
 97 | }
 98 | 
 99 | trait MyTrait {
100 |     
101 |         public function setBackgroundImage(Drawing $objDrawing): self
102 |         {
103 |             if (!array_key_exists($objDrawing->getType(), Drawing::IMAGE_TYPES_CONVERTION_MAP)) {
104 |                 throw new PhpSpreadsheetException('Unsupported image type in comment background. Supported types: PNG, JPEG, BMP, GIF.');
105 |             }
106 |             $this->backgroundImage = $objDrawing;
107 |     
108 |             return $this;
109 |         }
110 |     
111 | }
112 |     
113 | 


--------------------------------------------------------------------------------
/tests/test_parser/test_sample/py_test_sample.py:
--------------------------------------------------------------------------------
 1 | def partition(array, low, high):
 2 |     """
 3 |     Function to find the partition position
 4 |     
 5 |     :param array: the unsorted array
 6 |     :type array: List
 7 |     :param low: smaller pivot
 8 |     :type low: int
 9 |     :param high: greater pivot
10 |     :type high: int
11 |     
12 |     """
13 |     # choose the rightmost element as pivot
14 |     pivot = array[high]
15 |  
16 |     # pointer for greater element
17 |     i = low - 1
18 |  
19 |     # traverse through all elements
20 |     # compare each element with pivot
21 |     for j in range(low, high):
22 |         if array[j] <= pivot:
23 |  
24 |             # If element smaller than pivot is found
25 |             # swap it with the greater element pointed by i
26 |             i = i + 1
27 |  
28 |             # Swapping element at i with element at j
29 |             (array[i], array[j]) = (array[j], array[i])
30 |  
31 |     # Swap the pivot element with the greater element specified by i
32 |     (array[i + 1], array[high]) = (array[high], array[i + 1])
33 |  
34 |     # Return the position from where partition is done
35 |     return i + 1
36 | 
37 | def quickSort(array, low, high):
38 |     """
39 |     Function to perform quicksort
40 |     """
41 |     if low < high:
42 |  
43 |         # Find pivot element such that
44 |         # element smaller than pivot are on the left
45 |         # element greater than pivot are on the right
46 |         pi = partition(array, low, high)
47 |  
48 |         # Recursive call on the left of pivot
49 |         quickSort(array, low, pi - 1)
50 |  
51 |         # Recursive call on the right of pivot
52 |         quickSort(array, pi + 1, high)
53 |  
54 |  
55 | data = [1, 7, 4, 1, 10, 9, -2]
56 | print("Unsorted Array")
57 | print(data)
58 |  
59 | size = len(data)
60 |  
61 | quickSort(data, 0, size - 1)
62 |  
63 | print('Sorted Array in Ascending Order:')
64 | print(data)
65 | 
66 | class Person:
67 |     def __init__(self, name, age):
68 |         self.name = name
69 |         self.age = age


--------------------------------------------------------------------------------
/tests/test_parser/test_sample/ruby_test_sample.rb:
--------------------------------------------------------------------------------
 1 | module RedditKit
 2 |     class Client < API
 3 |     
 4 |         # Methods for searching reddit's links.
 5 |         module Search
 6 |     
 7 |         # Search for links.
 8 |         #
 9 |         # @param query [String] The search query.
10 |         # @option options [String, RedditKit::Subreddit] subreddit The optional subreddit to search.
11 |         # @option options [true, false] restrict_to_subreddit Whether to search only in a specified subreddit.
12 |         # @option options [1..100] limit The number of links to return.
13 |         # @option options [String] count The number of results to return before or after. This is different from `limit`.
14 |         # @option options [relevance, new, hot, top, comments] sort The sorting order for search results.
15 |         # @option options [String] before Only return links before this full name.
16 |         # @option options [String] after Only return links after this full name.
17 |         # @option options [cloudsearch, lucene, plain] syntax Specify the syntax for the search. Learn more: http://www.reddit.com/r/redditdev/comments/1hpicu/whats_this_syntaxcloudsearch_do/cawm0fe
18 |         # @option options [hour, day, week, month, year, all] time Show results with a specific time period.
19 |         # @return [RedditKit::PaginatedResponse]
20 |         def search(query, options = {})
21 |             path = "%s/search.json" % ('r/' + options[:subreddit] if options[:subreddit])
22 |             parameters = { :q => query,
23 |                             :restrict_sr => options[:restrict_to_subreddit],
24 |                             :limit       => options[:limit],
25 |                             :count       => options[:count],
26 |                             :sort        => options[:sort],
27 |                             :before      => options[:before],
28 |                             :after       => options[:after],
29 |                             :syntax      => options[:syntax],
30 |                             :t           => options[:time]
31 |             }
32 | 
33 |             objects_from_response(:get, path, parameters)
34 |         end
35 | 
36 |         def self.my_method(a)
37 |             # Method implementation
38 |             puts(a)
39 |             return a
40 |         end
41 |     
42 |     end
43 |   end
44 | end
45 | 
46 | load_current_value do |new_resource, old_resource|
47 |     unless current_installed_version(new_resource).nil?
48 |       version(current_installed_version(new_resource))
49 |       Chef::Log.debug("Current version is #{version}") if version
50 |       return a
51 |     end
52 |   end
53 |   
54 |   action :install  do
55 |     build_essential
56 |   
57 |     install_version = new_resource.version unless new_resource.version.nil? || new_resource.version == current_resource.version
58 |     versions_match = candidate_version == current_installed_version(new_resource)
59 |   
60 |     if install_version || new_resource.version.nil? && !versions_match
61 |       converge_by("install package #{new_resource.package_name} #{install_version}") do
62 |         info_output = "Installing #{new_resource.package_name}"
63 |         info_output << " version #{install_version}" if install_version && !install_version.empty?
64 |         Chef::Log.info(info_output)
65 |         install_package(new_resource.package_name, install_version)
66 |       end
67 |     end
68 |   end
69 |   
70 |   action :reinstall do
71 |     build_essential
72 |     
73 |     install_version = new_resource.version unless new_resource.version.nil?
74 |     converge_by("reinstall package #{new_resource.package_name} #{install_version}") do
75 |       info_output = "Installing #{new_resource.package_name}"
76 |       info_output << " version #{install_version}" if install_version && !install_version.empty?
77 |       Chef::Log.info(info_output)
78 |       install_package(new_resource.package_name, install_version, force: true)
79 |     end
80 |   end
81 | 
82 | a = 1
83 | 
84 | reinstall
85 |     


--------------------------------------------------------------------------------
/tests/test_parser/test_sample/rust_test_sample.rs:
--------------------------------------------------------------------------------
 1 | trait Quack { // Trait with a single required method, quack.
 2 |     fn quack(&self); // Borrows the receiver immutably and returns nothing.
 3 | }
 4 | 
 5 | struct Duck (); // Tuple struct with no fields; it receives a Quack impl below.
 6 | 
 7 | fn long_string(x: &str) -> &str { // Returns "too long" when x is longer than 10 bytes, otherwise x itself.
 8 |     if x.len() > 10 { // str::len counts bytes, not characters.
 9 |         "too long"
10 |     } else {
11 |         x
12 |     }
13 | 
14 | }
15 | 
16 | impl Quack for Duck { // Quack implementation for Duck.
17 |     fn quack(&self) { // Prints "quack!" and a newline to standard output.
18 |         println!("quack!");
19 |     }
20 | }
21 | 
22 | mod my_mod { // Inline module containing one private function.
23 |     // Items in modules default to private visibility.
24 |     fn private_function() { // Prints a fixed message naming this function.
25 |         println!("called `my_mod::private_function()`");
26 |     }
27 | }
28 | 
29 | fn quack_everyone <I> (iter: I) // Calls quack() on every boxed Quack value yielded by iter, consuming the iterator.
30 | where I: Iterator<Item=Box<Quack>> { // NOTE(review): bare `Box<Quack>` is pre-`dyn` syntax; the 2021 edition requires `Box<dyn Quack>`.
31 |     for d in iter {
32 |         d.quack();
33 |     }
34 | }
35 | 
36 | let ducks: Vec<Box<Quack>> = vec![Box::new(duck1),Box::new(duck2),Box::new(parrot),Box::new(int)]; // NOTE(review): a `let` at file scope is not a valid Rust item, and duck1/duck2/parrot/int are not defined in this file.
37 | 


--------------------------------------------------------------------------------
/tests/test_utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FSoft-AI4Code/CodeText-parser/d2a7365f7f944650e84d9fdb6b6794d6c5ea620b/tests/test_utils/__init__.py


--------------------------------------------------------------------------------
/tests/test_utils/test_utils.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | from src.codetext.utils import build_language, parse_code
 3 | 
 4 | 
 5 | class Test_Utils(unittest.TestCase):
 6 |     def test_build_language(self):
 7 |         langs = ['python', 'rust']
 8 |         for l in langs:
 9 |             # clear it later
10 |             build_language(language=l)
11 |     
12 |     def test_parse_code(self):
13 |         sample = """
14 |         def sum_2_num(a, b):
15 |             return a + b
16 |         """
17 |         parse_code(sample, 'python')
18 |     
19 | 
20 | if __name__ == '__main__':
21 |     unittest.main()


--------------------------------------------------------------------------------