├── .gitignore ├── LICENSE ├── Project_Indexer.py ├── README.md ├── cSharpIndexer ├── cSharpIndexer.py └── indexer_dependencies.csproj ├── memory-bank ├── productContext.md └── systemPatterns.md ├── parser ├── __init__.py ├── csharp_parser.py ├── parser.py ├── python_parser.py └── typescript_parser.py ├── requirements.txt └── test ├── benchmark_parser.py ├── resources ├── test.cs ├── test.ts └── test.tsx └── test_parser.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Project Specific 7 | ProjectIndex.json 8 | 9 | # IDEs and Editors 10 | .vscode/ 11 | .idea/ 12 | 13 | # Virtual Environment 14 | venv/ 15 | 16 | # roo code 17 | memory-bank/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Dolfie-01 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Project_Indexer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | from parser import extract_types_and_members_from_file_for_typescript 6 | from parser.parser import extract_types_and_members_from_file_for_csharp, extract_types_and_members_from_file_for_python 7 | 8 | def index_project_structure(root_dir: str, extract_imports: bool = False): 9 | """ 10 | Walks through the directory tree starting at root_dir. 11 | Extracts type definitions and members from each C# or Python file and creates a structured index. 
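
    Args:
        root_dir: Root directory to walk recursively.
        extract_imports: When True, also record import statements for Python files.

    Returns:
        dict: Maps each file path (relative to root_dir) to its extracted types and
        members; files with nothing extracted are omitted.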
12 | """ 13 | project_index = {} 14 | print(f"Indexing project structure starting at: {root_dir}") 15 | # Walk through the directory tree 16 | for subdir, _, files in os.walk(root_dir): 17 | for file in files: 18 | # Process only C# and Python files 19 | if not file.endswith('.cs') and not file.endswith('.py') and not file.endswith('.tsx') and not file.endswith('.ts'): 20 | print(f"Skipping non-C# or non-Python file or non-Typescript: {file}") 21 | continue 22 | # Construct the full file path and relative path 23 | file_path = os.path.join(subdir, file) 24 | relative_path = os.path.relpath(file_path, root_dir) 25 | if file.endswith('.cs'): 26 | # Extract C# types and members 27 | details = extract_types_and_members_from_file_for_csharp(file_path) 28 | elif file.endswith('.py'): 29 | # Extract Python types and members 30 | details = extract_types_and_members_from_file_for_python(file_path, extract_imports) 31 | elif file.endswith('.tsx') or file.endswith('.ts'): 32 | # Extract TypeScript types and members 33 | details = extract_types_and_members_from_file_for_typescript(file_path) 34 | # Include in the index only if any type or member was found 35 | project_index_details = details.__to_dict__() 36 | if any(project_index_details.values()): 37 | project_index[relative_path] = project_index_details 38 | return project_index 39 | 40 | if __name__ == "__main__": 41 | # Specify pwd as default root directory and argument --path if provided 42 | root_directory = os.getcwd() # Default to current working directory 43 | # Check if a path argument is provided 44 | parser = argparse.ArgumentParser(description='Index project structure for C# and Python files.') 45 | parser.add_argument('--path', type=str, help='Path to the project directory to index') 46 | parser.add_argument('--imports', action='store_true', help='Extract imports from Python files', default=False) 47 | args = parser.parse_args() 48 | if args.path: 49 | root_directory = args.path 50 | # Check if the provided path exists 51 | if not os.path.exists(root_directory): 52 | print(f"Provided path does not exist: {root_directory}") 53 | exit(1) 54 | # Check if the provided path is a directory 55 | if not os.path.isdir(root_directory): 56 | print(f"Provided path is not a directory: {root_directory}") 57 | exit(1) 58 | # Index the project structure starting at the specified root directory 59 | index = index_project_structure(root_directory, args.imports) 60 | # Export file renamed to ProjectIndex.json 61 | export_filename = f"{root_directory}/ProjectIndex.json" 62 | with open(export_filename, 'w', encoding='utf-8') as index_file: 63 | json.dump(index, index_file, indent=4) 64 | print(f"Project structure indexed successfully and exported to {export_filename}.") -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Project Indexer 2 | 3 | ## Overview 4 | 5 | `Project Indexer` is a simple script designed to index the locations of classes, files, and other components within a project. This indexing allows **Large Language Models (LLMs)** to quickly understand the project's structure, enabling **D.R.Y. (Don't Repeat Yourself) coding** by preventing redundant object creation and avoiding full project scans. 6 | 7 | By providing an organized index, `Project Indexer` enhances the LLM's understanding from the outset. 
When combined with a **well-structured design document** containing patterns and examples, it facilitates **efficient onboarding** for new tasks or ongoing development in an active codebase. 8 | 9 | The tool currently supports both C# and Python files, with specialized regex patterns for each language's syntax. The index includes: 10 | 11 | - For C#: classes, structs, interfaces, enums, and methods 12 | - For Python: 13 | - Classes (including decorated classes) 14 | - Functions (including decorated functions) 15 | - Methods (instance, class and static methods) 16 | - Imports (absolute and relative) 17 | - Docstrings (as metadata) 18 | 19 | 20 | ## Why Use ProjectIndexer? 21 | 22 | - **Reduces unnecessary LLM scanning**: Saves token usage compared to constant back-and-forth querying. 23 | - **Prevents hallucinations**: Helps LLMs recognize what exists in the codebase. 24 | - **Speeds up development**: Ideal for large projects with many files and classes. 25 | - **Compatible with multiple LLMs**, including: 26 | - RooCode - the Awesome VSCode AI Developement Extension. 27 | - Gemini (All) 28 | - Sonnet (All) 29 | - o3-mini and o3-mini-high 30 | - Deepseek V3.1 & Deepseek R1 31 | - GPT models 32 | - Grok 33 | 34 | ## Features 35 | 36 | - **Creates** a **.json file** listing class names, methods and locations 37 | - **Multi-language support**: Works with both C# and Python codebases 38 | - **Tree-sitter parsing**: More accurate than regex for complex syntax cases 39 | - **Lightweight**: Only indexes names and locations, not full class properties or implementations 40 | - **Supports integration with RooCode**: In some cases, you can configure `Code/Architect` mode to frequently refer to `ProjectIndex.json`. You may also set it to **run periodically** upon reaching milestones or creating new tasks 41 | 42 | > **Note:** This tool was built for personal use due to managing a **VS Solution with 5 projects and over 600 classes/methods**. 43 | 44 | ## Parser Architecture 45 | 46 | The project uses a modular parser system with Tree-sitter for accurate syntax parsing: 47 | 48 | - `parser/__init__.py`: Main parser interface and Tree-sitter grammar initialization 49 | - `parser/csharp_parser.py`: Handles C# specific parsing using Tree-sitter 50 | - `parser/python_parser.py`: Handles Python specific parsing using Tree-sitter 51 | 52 | Each parser implements: 53 | 54 | - File extension detection 55 | - Tree-sitter based parsing with language grammars 56 | - Common output format (JSON) 57 | 58 | ### Tree-sitter Integration 59 | 60 | The project now uses [Tree-sitter](https://tree-sitter.github.io/tree-sitter/) for more accurate parsing: 61 | 62 | - **Pre-loaded grammars** for Python and C# at startup 63 | - **Faster parsing** by avoiding regex pattern matching 64 | - **More reliable** extraction of code structures 65 | - **Better handling** of complex syntax cases 66 | 67 | Required dependencies (automatically installed via requirements.txt): 68 | 69 | - tree-sitter==0.24.0 70 | - tree-sitter-python==0.23.6 71 | - tree-sitter-c-sharp==0.23.1 72 | 73 | ## Installation & Usage 74 | 75 | 1. Drop `Project_Indexer.py` into the root of your project 76 | 77 | 2. Configure the root directory: 78 | 79 | - Open `Project_Indexer.py` 80 | - On line 54, set `root_directory` to your project's root path 81 | - Example for Windows: `"C:\\MyProject"` 82 | - Example for Mac/Linux: `"/Users/name/MyProject"` 83 | 84 | 3. 
Run the script via command line: 85 | 86 | ```sh 87 | # Using --path argument to specify project directory 88 | python Project_Indexer.py --path /path/to/your/project 89 | # Using --imports to specify extracting imported libraries/methods in each file (only supported for python right now) 90 | python Project_Indexer.py --path /path/to/your/project --imports 91 | # Without arguments (uses hardcoded path in script) 92 | python Project_Indexer.py 93 | ``` 94 | 95 | 3. The script will generate `ProjectIndex.json`. 96 | 97 | 4. Direct your LLM to read `ProjectIndex.json` for efficient project awareness. 98 | 99 | ## C# Project Indexer 100 | 101 | 1. This is a highly specialized indexer for **C# files only**. It also reads and indexes `.razor` files for Blazor projects. 102 | 2. It uses the **PythonNET** library to interface with **Roslyn analyzers** to "walk the tree," resulting in a highly compressed `ProjectIndex.json` file. 103 | 3. Place the script in the **root directory** of your project. 104 | 4. Run the following command to generate the index: 105 | ```python cSharpIndexer.py ``` 106 | 5. If Roslyn dependencies are missing, the script will attempt to install them automatically. Manual installation may be required in some cases. 107 | 6. Pull requests and contributions are welcome — this tool is highly experimental. 108 | 109 | 110 | 111 | I hope this is useful. 112 | 113 | Happy coding! 114 | 115 | ## Contributing 116 | 117 | Feel free to fork the repository and submit pull requests for improvements! 118 | 119 | ## License 120 | 121 | This project is licensed under the [MIT License](LICENSE). 122 | 123 | --- 124 | -------------------------------------------------------------------------------- /cSharpIndexer/cSharpIndexer.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | import time 4 | import argparse 5 | import os 6 | import sys 7 | 8 | # --- Set Environment Variable for Pythonnet Runtime --- 9 | print("Setting PYTHONNET_RUNTIME=coreclr environment variable...") 10 | os.environ['PYTHONNET_RUNTIME'] = 'coreclr' 11 | 12 | # --- Pythonnet Setup --- 13 | try: 14 | import clr 15 | print("pythonnet library imported.") 16 | runtime_env = os.environ.get('PYTHONNET_RUNTIME') 17 | if runtime_env and runtime_env.lower() == 'coreclr': 18 | print("Confirmed PYTHONNET_RUNTIME=coreclr is set.") 19 | else: 20 | print("Warning: PYTHONNET_RUNTIME might not be 'coreclr'. 
Issues may occur.") 21 | 22 | except ImportError: 23 | print("Error: pythonnet library not found.") 24 | print("Please install it using: pip install pythonnet") 25 | sys.exit(1) 26 | except Exception as e: 27 | print(f"Error during pythonnet import or runtime check: {e}") 28 | sys.exit(1) 29 | 30 | 31 | # --- Load Roslyn Assemblies --- 32 | roslyn_loaded = False 33 | try: 34 | roslyn_version = "4.8.0" 35 | target_framework = "netstandard2.0" 36 | nuget_base_path = Path.home() / ".nuget" / "packages" 37 | 38 | # Construct correct paths for the assemblies 39 | analysis_common_pkg_path = nuget_base_path / "microsoft.codeanalysis.common" / roslyn_version / "lib" / target_framework 40 | analysis_dll = analysis_common_pkg_path / "Microsoft.CodeAnalysis.dll" 41 | 42 | csharp_pkg_path = nuget_base_path / "microsoft.codeanalysis.csharp" / roslyn_version / "lib" / target_framework 43 | csharp_dll = csharp_pkg_path / "Microsoft.CodeAnalysis.CSharp.dll" 44 | 45 | # Debug prints for paths 46 | print(f"Attempting to load Roslyn v{roslyn_version} from NuGet cache:") 47 | print(f" - Analysis DLL: {analysis_dll}") 48 | print(f" - CSharp DLL: {csharp_dll}") 49 | 50 | # Verify paths exist 51 | if not analysis_dll.exists(): 52 | raise FileNotFoundError(f"Microsoft.CodeAnalysis.dll not found at expected NuGet path: {analysis_dll}") 53 | if not csharp_dll.exists(): 54 | raise FileNotFoundError(f"Microsoft.CodeAnalysis.CSharp.dll not found at expected NuGet path: {csharp_dll}") 55 | 56 | # Load the assemblies 57 | clr.AddReference(str(analysis_dll)) 58 | clr.AddReference(str(csharp_dll)) 59 | roslyn_loaded = True 60 | print("Roslyn assemblies loaded successfully from NuGet cache.") 61 | 62 | # Import namespaces after loading assemblies 63 | import Microsoft.CodeAnalysis as MSAnalysis 64 | import Microsoft.CodeAnalysis.CSharp as MSCSharp 65 | import Microsoft.CodeAnalysis.CSharp.Syntax as MSSyntax 66 | print("Roslyn namespaces imported successfully.") 67 | 68 | except Exception as e: 69 | print(f"Error: Failed to load or import Roslyn assemblies. {e}") 70 | sys.exit(1) 71 | 72 | 73 | def get_parent_context(node): 74 | """Finds the parent context (namespace, class, etc.) 
for a given node.""" 75 | parent = node.Parent 76 | while parent is not None: 77 | if isinstance(parent, MSSyntax.BaseTypeDeclarationSyntax): # Class, Struct, Interface, Enum, Record 78 | return parent.Identifier.ValueText 79 | elif isinstance(parent, MSSyntax.NamespaceDeclarationSyntax): 80 | return parent.Name.ToString() 81 | parent = parent.Parent 82 | return None 83 | 84 | 85 | def index_csharp_and_razor_files(root_dir, output_file="ProjectIndex.json"): 86 | """Indexes C# and Razor files in the given directory.""" 87 | start_time = time.time() 88 | all_definitions = [] 89 | root_path = Path(root_dir).resolve() 90 | 91 | print(f"Starting indexing in: {root_path}") 92 | 93 | # Find all .cs and .razor files 94 | all_files = list(root_path.rglob("*.cs")) + list(root_path.rglob("*.razor")) 95 | print(f"Found {len(all_files)} .cs and .razor files.") 96 | 97 | excluded_dir_names = {"obj", "bin"} 98 | files_to_process = [f for f in all_files if not any(part.lower() in excluded_dir_names for part in f.parts)] 99 | 100 | for file_path in files_to_process: 101 | relative_path_str = str(file_path.relative_to(root_path)).replace("\\", "/") 102 | file_extension = file_path.suffix.lower() 103 | 104 | try: 105 | content = file_path.read_text(encoding="utf-8") 106 | 107 | if file_extension == ".cs": 108 | parse_options = MSCSharp.CSharpParseOptions(languageVersion=MSCSharp.LanguageVersion.Latest) 109 | syntax_tree = MSCSharp.CSharpSyntaxTree.ParseText(content, options=parse_options, path=str(file_path)) 110 | root = syntax_tree.GetRoot() 111 | 112 | for node in root.DescendantNodes(): 113 | # Only index high-level definitions 114 | if isinstance(node, MSSyntax.ClassDeclarationSyntax): 115 | name = node.Identifier.ValueText 116 | parent = get_parent_context(node) 117 | # Add class definition as [name, path, parent] 118 | all_definitions.append([name, relative_path_str, parent]) 119 | elif isinstance(node, MSSyntax.NamespaceDeclarationSyntax): 120 | name = node.Name.ToString() 121 | # Add namespace definition as [name, path, None] 122 | all_definitions.append([name, relative_path_str, None]) 123 | 124 | elif file_extension == ".razor": 125 | # Add Razor file path only as a string 126 | all_definitions.append(relative_path_str) 127 | 128 | except Exception as e: 129 | print(f"Error processing file {relative_path_str}: {e}") 130 | 131 | # Output results 132 | try: 133 | output_path = root_path / output_file 134 | with open(output_path, "w", encoding="utf-8") as f: 135 | json.dump(all_definitions, f, separators=(',', ':')) # Compact JSON formatting 136 | print(f"Indexing completed successfully. Output written to {output_path}") 137 | except Exception as e: 138 | print(f"Error writing output file: {e}") 139 | 140 | print(f"Indexing completed in {time.time() - start_time:.2f} seconds.") 141 | 142 | 143 | if __name__ == "__main__": 144 | parser = argparse.ArgumentParser(description="Index C# and Razor project definitions.") 145 | parser.add_argument( 146 | "root_dir", 147 | nargs="?", 148 | default=".", 149 | help="Root directory of the C# project/solution (default: current directory)." 150 | ) 151 | parser.add_argument( 152 | "-o", "--output", 153 | default="ProjectIndex.json", 154 | help="Output JSON file name (default: ProjectIndex.json)." 
155 | ) 156 | args = parser.parse_args() 157 | 158 | target_dir = Path(args.root_dir) 159 | if not target_dir.is_dir(): 160 | print(f"Error: Provided root directory '{args.root_dir}' not found or is not a directory.") 161 | sys.exit(1) 162 | 163 | index_csharp_and_razor_files(target_dir, args.output) -------------------------------------------------------------------------------- /cSharpIndexer/indexer_dependencies.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | net8.0 5 | 6 | true 7 | false 8 | false 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /memory-bank/productContext.md: -------------------------------------------------------------------------------- 1 | # Product Context 2 | 3 | ## Project Overview 4 | 5 | - Name: ProjectIndexer 6 | - Purpose: Develop a CLI tool to index and organize project files across multiple programming languages (Python, C#, Java, JavaScript) for enhanced searchability and code understanding. 7 | - Key Features: Multi-language file indexing (Python, C#, Java, JS), robust search, code metadata extraction, CLI interface. 8 | 9 | ## Technical Stack 10 | 11 | - Target Languages: Python, C#, Java, JavaScript (Initial focus: Python) 12 | - Core Language: Python (for CLI and orchestration) 13 | - Dependencies: TBD 14 | 15 | ## Architecture 16 | 17 | - Components: CLI interface, Core orchestrator, Language-specific parsers/indexers (Strategy Pattern likely needed), Unified search engine, Data storage (for index) 18 | - Data Flow: TBD 19 | -------------------------------------------------------------------------------- /memory-bank/systemPatterns.md: -------------------------------------------------------------------------------- 1 | # System Patterns 2 | 3 | ## ProjectIndexer Architecture Patterns 4 | 5 | ### File Processing Pipeline 6 | 7 | - **Pattern**: Sequential processing pipeline 8 | - **Components**: Scanner → Indexer → Search Engine 9 | - **Characteristics**: 10 | - Each component has single responsibility 11 | - Data flows linearly between components 12 | - Easy to add new processing steps 13 | 14 | ### Indexing Strategy 15 | 16 | - **Pattern**: Inverted index 17 | - **Characteristics**: 18 | - Efficient for text search 19 | - Scales well with document count 20 | - Supports partial matches 21 | 22 | ### Error Handling 23 | 24 | - **Pattern**: Fail-fast with recovery 25 | - **Characteristics**: 26 | - Validate inputs early 27 | - Log detailed errors 28 | - Continue processing other files if one fails 29 | 30 | ### Language Handling Strategy 31 | 32 | - **Pattern**: Strategy Pattern (Recommended) 33 | - **Context**: The system needs to parse and index files from multiple programming languages (Python, C#, Java, JavaScript), each requiring specific logic. 34 | - **Solution**: Define a common interface for language processors (parsers/indexers). Implement concrete strategies for each supported language. A central orchestrator or factory selects the appropriate strategy based on file type or language detection. 35 | - **Characteristics**: 36 | - Encapsulates language-specific logic. 37 | - Easily extensible to support new languages without modifying core components. 38 | - Promotes separation of concerns. 
39 | -------------------------------------------------------------------------------- /parser/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tree_sitter import Language, Parser 3 | import tree_sitter_python 4 | import tree_sitter_c_sharp 5 | import tree_sitter_typescript 6 | 7 | # Initialize Tree-sitter languages 8 | PYTHON_LANGUAGE = None 9 | CSHARP_LANGUAGE = None 10 | TYPESCRIPT_LANGUAGE = None 11 | TSX_LANGUAGE = None 12 | 13 | def initialize_grammars(): 14 | """Initialize Tree-sitter language grammars""" 15 | global PYTHON_LANGUAGE, CSHARP_LANGUAGE, TYPESCRIPT_LANGUAGE, TSX_LANGUAGE 16 | 17 | try: 18 | PYTHON_LANGUAGE = Language(tree_sitter_python.language()) 19 | except Exception as e: 20 | raise RuntimeError(f"Failed to load Python grammar: {e}") 21 | 22 | try: 23 | CSHARP_LANGUAGE = Language(tree_sitter_c_sharp.language()) 24 | except Exception as e: 25 | raise RuntimeError(f"Failed to load C# grammar: {e}") 26 | 27 | try: 28 | TYPESCRIPT_LANGUAGE = Language(tree_sitter_typescript.language_typescript()) 29 | except Exception as e: 30 | raise RuntimeError(f"Failed to load TypeScript grammar: {e}") 31 | 32 | try: 33 | TSX_LANGUAGE = Language(tree_sitter_typescript.language_tsx()) 34 | except Exception as e: 35 | raise RuntimeError(f"Failed to load TSX grammar: {e}") 36 | 37 | def grammars_loaded(): 38 | """Check if grammars are loaded""" 39 | return (PYTHON_LANGUAGE is not None and 40 | CSHARP_LANGUAGE is not None and 41 | TYPESCRIPT_LANGUAGE is not None and 42 | TSX_LANGUAGE is not None) 43 | 44 | # Initialize grammars when module is imported 45 | initialize_grammars() 46 | 47 | # Import parser functions 48 | from .parser import * 49 | from .python_parser import * 50 | from .csharp_parser import * 51 | from .typescript_parser import * # Added for future TypeScript parser -------------------------------------------------------------------------------- /parser/csharp_parser.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tree_sitter import Parser, Query 3 | from . import CSHARP_LANGUAGE 4 | 5 | # Query definitions as class-level constants 6 | CLASS_QUERY_STR = """ 7 | (class_declaration 8 | name: (identifier) @class_name 9 | (base_list)? @bases 10 | body: (declaration_list) @class_body) @class_def 11 | """ 12 | 13 | STRUCT_QUERY_STR = """ 14 | (struct_declaration 15 | name: (identifier) @struct_name 16 | (base_list)? @bases 17 | body: (declaration_list) @struct_body) @struct_def 18 | """ 19 | 20 | INTERFACE_QUERY_STR = """ 21 | (interface_declaration 22 | name: (identifier) @interface_name 23 | (base_list)? @bases 24 | body: (declaration_list) @interface_body) @interface_def 25 | """ 26 | 27 | ENUM_QUERY_STR = """ 28 | (enum_declaration 29 | name: (identifier) @enum_name 30 | body: (enum_member_declaration_list) @enum_body) @enum_def 31 | """ 32 | 33 | METHOD_QUERY_STR = """ 34 | (method_declaration 35 | (type)? @return_type 36 | name: (identifier) @method_name 37 | (parameter_list) @params 38 | body: (block)? 
@method_body) @method_def 39 | """ 40 | 41 | class C_Sharp_Result: 42 | def __init__(self): 43 | self.classes = [] 44 | self.structs = [] 45 | self.interfaces = [] 46 | self.enums = [] 47 | 48 | def __to_dict__(self): 49 | result = {} 50 | if self.classes: 51 | result['classes'] = self.classes 52 | if self.structs: 53 | result['structs'] = self.structs 54 | if self.interfaces: 55 | result['interfaces'] = self.interfaces 56 | if self.enums: 57 | result['enums'] = self.enums 58 | return result 59 | 60 | def process_method_node(method_node): 61 | """Process a method node and extract its information. 62 | 63 | Args: 64 | method_node: The tree-sitter node representing a method 65 | 66 | Returns: 67 | dict: Method information including name, parameters, return type and modifiers 68 | """ 69 | if not method_node: 70 | return None 71 | 72 | method_info = {} 73 | method_info['name'] = method_node.child_by_field_name('name').text.decode('utf8') 74 | 75 | raw_parameters = method_node.child_by_field_name('parameters') 76 | parameters = ','.join([p.text.decode('utf8') for p in raw_parameters.children if p.type == 'parameter']) 77 | if len(parameters) > 0: 78 | method_info['parameters'] = parameters 79 | 80 | type = method_node.child_by_field_name('type') 81 | if type: 82 | method_info['return_type'] = type.text.decode('utf8') 83 | 84 | modifiers_node = method_node.child_by_field_name('modifiers') 85 | if modifiers_node: 86 | method_info['modifiers'] = [m.text.decode('utf8') for m in modifiers_node.children] 87 | return method_info 88 | 89 | def _should_skip_file(file_path: str) -> bool: 90 | """Check if the file should be skipped based on its extension. 91 | 92 | Args: 93 | file_path: Path to the file 94 | 95 | Returns: 96 | bool: True if the file should be skipped, False otherwise 97 | """ 98 | file_extension_supported = file_path.endswith(".cs") or file_path.endswith(".h") 99 | return not file_extension_supported or 'node_modules' in file_path or 'dist' in file_path or 'build' in file_path 100 | 101 | def _read_and_validate_file(file_path: str) -> str: 102 | """Read and validate a C# source file. 103 | 104 | Args: 105 | file_path: Path to the C# file 106 | 107 | Returns: 108 | str: The file content if valid, None otherwise 109 | """ 110 | try: 111 | if _should_skip_file(file_path): 112 | return None 113 | 114 | with open(file_path, 'r', encoding='utf-8') as f: 115 | source_code = f.read() 116 | 117 | return source_code if source_code else None 118 | 119 | except Exception as e: 120 | print(f"Error reading file {file_path}: {str(e)}") 121 | return None 122 | 123 | def _initialize_parser(source_code: str) -> tuple: 124 | """Initialize the tree-sitter parser and parse the source code. 125 | 126 | Args: 127 | source_code: The C# source code to parse 128 | 129 | Returns: 130 | tuple: (Parser, Tree) objects 131 | """ 132 | parser = Parser(language=CSHARP_LANGUAGE) 133 | tree = parser.parse(bytes(source_code, 'utf8')) 134 | return parser, tree 135 | 136 | def _process_class(struct_node, method_query, result): 137 | """Process a class node and extract its information. 
138 | 139 | Args: 140 | class_node: The tree-sitter node representing a class 141 | method_query: The method query object 142 | result: The C_Sharp_Result object to populate 143 | 144 | Returns: 145 | dict: Class information including name, bases and methods 146 | """ 147 | class_info = { 148 | 'name': struct_node.child_by_field_name('name').text.decode('utf8') 149 | } 150 | 151 | bases_node = struct_node.child_by_field_name('bases') 152 | if bases_node: 153 | bases = [b.text.decode('utf8') for b in bases_node.children if b.type != ':'] 154 | class_info['bases'] = "".join(bases) 155 | 156 | methods = [] 157 | body_node = struct_node.child_by_field_name('body') 158 | for _, method_nodes_dict in method_query.matches(body_node): 159 | method_node = method_nodes_dict['method_def'][0] 160 | method_info = process_method_node(method_node) 161 | methods.append(method_info) 162 | 163 | if methods: 164 | class_info['methods'] = methods 165 | 166 | return class_info 167 | 168 | def _process_struct(struct_node, method_query, result): 169 | """Process a struct node and extract its information. 170 | 171 | Args: 172 | struct_node: The tree-sitter node representing a struct 173 | method_query: The method query object 174 | result: The C_Sharp_Result object to populate 175 | 176 | Returns: 177 | dict: Struct information including name and methods 178 | """ 179 | struct_info = { 180 | 'name': struct_node.child_by_field_name('name').text.decode('utf8') 181 | } 182 | 183 | methods = [] 184 | body_node = struct_node.child_by_field_name('body') 185 | for _, method_nodes_dict in method_query.matches(body_node): 186 | method_node = method_nodes_dict['method_def'][0] 187 | method_info = process_method_node(method_node) 188 | methods.append(method_info) 189 | 190 | if methods: 191 | struct_info['methods'] = methods 192 | 193 | return struct_info 194 | 195 | def _process_interface(interface_node): 196 | """Process an interface node and extract its name. 197 | 198 | Args: 199 | interface_node: The tree-sitter node representing an interface 200 | 201 | Returns: 202 | dict: Interface information with name 203 | """ 204 | return { 205 | 'name': interface_node.child_by_field_name('name').text.decode('utf8') 206 | } 207 | 208 | def _process_enum(enum_node): 209 | """Process an enum node and extract its name. 210 | 211 | Args: 212 | enum_node: The tree-sitter node representing an enum 213 | 214 | Returns: 215 | dict: Enum information with name 216 | """ 217 | return { 218 | 'name': enum_node.child_by_field_name('name').text.decode('utf8') 219 | } 220 | 221 | def extract_types_and_members_from_file_for_csharp(file_path: str) -> C_Sharp_Result: 222 | """Extract types and members from a C# source file. 
223 | 224 | Args: 225 | file_path: Path to the C# file 226 | 227 | Returns: 228 | C_Sharp_Result: Object containing all extracted types and members 229 | """ 230 | result = C_Sharp_Result() 231 | 232 | # Read and validate file 233 | source_code = _read_and_validate_file(file_path) 234 | if not source_code: 235 | return result 236 | 237 | # Initialize parser and parse source code 238 | parser, tree = _initialize_parser(source_code) 239 | 240 | # Create queries 241 | class_query = CSHARP_LANGUAGE.query(CLASS_QUERY_STR).matches(tree.root_node) 242 | struct_query = Query(CSHARP_LANGUAGE, STRUCT_QUERY_STR).matches(tree.root_node) 243 | interface_query = Query(CSHARP_LANGUAGE, INTERFACE_QUERY_STR).matches(tree.root_node) 244 | enum_query = Query(CSHARP_LANGUAGE, ENUM_QUERY_STR).matches(tree.root_node) 245 | # this one doesn't match the root node directly, because the query is for methods inside classes/structs as well 246 | # as top-level methods 247 | method_query = Query(CSHARP_LANGUAGE, METHOD_QUERY_STR) 248 | 249 | # Process classes 250 | for _, class_nodes_dict in class_query: 251 | class_node = class_nodes_dict['class_def'][0] 252 | class_info = _process_class(class_node, method_query, result) 253 | result.classes.append(class_info) 254 | 255 | # Process structs 256 | for _, struct_node_dict in struct_query: 257 | struct_node = struct_node_dict['struct_def'][0] 258 | struct_info = _process_struct(struct_node, method_query, result) 259 | result.structs.append(struct_info) 260 | 261 | # Process interfaces 262 | for _, interface_node_dict in interface_query: 263 | interface_node = interface_node_dict['interface_def'][0] 264 | interface_info = _process_interface(interface_node) 265 | result.interfaces.append(interface_info) 266 | 267 | # Process enums 268 | for _, enum_node_dict in enum_query: 269 | enum_node = enum_node_dict['enum_def'][0] 270 | enum_info = _process_enum(enum_node) 271 | result.enums.append(enum_info) 272 | 273 | return result 274 | 275 | -------------------------------------------------------------------------------- /parser/parser.py: -------------------------------------------------------------------------------- 1 | from parser.csharp_parser import extract_types_and_members_from_file_for_csharp 2 | from parser.python_parser import extract_types_and_members_from_file_for_python 3 | from .typescript_parser import extract_types_and_members_from_file_for_typescript -------------------------------------------------------------------------------- /parser/python_parser.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tree_sitter import Parser, Query 3 | from . import PYTHON_LANGUAGE 4 | 5 | class Python_Result: 6 | def __init__(self): 7 | self.py_classes = [] 8 | self.py_functions = [] 9 | self.py_imports = [] 10 | 11 | def __to_dict__(self): 12 | result = {} 13 | if self.py_classes: 14 | result['py_classes'] = self.py_classes 15 | if self.py_functions: 16 | result['py_functions'] = self.py_functions 17 | if self.py_imports: 18 | result['py_imports'] = self.py_imports 19 | return result 20 | 21 | # Tree-sitter queries as class-level constants 22 | CLASS_QUERY = Query(PYTHON_LANGUAGE, """ 23 | (class_definition 24 | name: (identifier) @class_name 25 | body: (block) @class_body) @class_def 26 | """) 27 | 28 | FUNCTION_QUERY = Query(PYTHON_LANGUAGE, """ 29 | (function_definition 30 | name: (identifier) @function_name 31 | parameters: (parameters) @params 32 | return_type: (type)? 
@return_type 33 | body: (block) @function_body) @function_def 34 | """) 35 | 36 | IMPORT_QUERY = Query(PYTHON_LANGUAGE, """ 37 | (import_statement) @import 38 | (import_from_statement) @import_from 39 | """) 40 | 41 | def _should_skip_file(file_path: str) -> bool: 42 | """Check if file should be skipped based on path patterns.""" 43 | return (not file_path.endswith('.py') or 44 | 'venv' in file_path or 45 | '__pycache__' in file_path or 46 | '_test.py' in file_path or 47 | 'test_' in file_path or 48 | 'node_modules' in file_path) 49 | 50 | def _read_source_code(file_path: str) -> str: 51 | """Read and return the contents of a Python source file.""" 52 | with open(file_path, 'r', encoding='utf-8') as f: 53 | return f.read() 54 | 55 | def _process_class(class_node, class_nodes_dict) -> dict: 56 | """Process a class node and return class information.""" 57 | class_info = { 58 | 'name': class_node.child_by_field_name('name').text.decode('utf8') 59 | } 60 | 61 | # Get base classes 62 | bases_node = class_nodes_dict.get('bases', [None])[0] 63 | if bases_node: 64 | class_info['bases'] = [b.text.decode('utf8') for b in bases_node.children 65 | if b.type != '(' and b.type != ')'] 66 | 67 | # Get methods 68 | methods = [] 69 | body_node = class_node.child_by_field_name('body') 70 | for method_index, method_nodes_dict in FUNCTION_QUERY.matches(body_node): 71 | method_node = method_nodes_dict['function_def'][0] 72 | methods.append(_process_function(method_node)) 73 | 74 | if methods: 75 | class_info['methods'] = methods 76 | 77 | return class_info 78 | 79 | def _process_function(function_node) -> str: 80 | """Process a function node and return its signature.""" 81 | func_name = function_node.child_by_field_name('name').text.decode('utf8') 82 | params = function_node.child_by_field_name('parameters').text.decode('utf8') 83 | return_type = function_node.child_by_field_name('return_type') 84 | return_type_str = f" -> {return_type.text.decode('utf8')}" if return_type else " -> None" 85 | 86 | # Get decorators 87 | decorators = [] 88 | if function_node.prev_named_sibling and function_node.prev_named_sibling.type == 'decorator': 89 | decorator_node = function_node.prev_named_sibling 90 | while decorator_node and decorator_node.type == 'decorator': 91 | decorators.append(decorator_node.text.decode('utf8')) 92 | decorator_node = decorator_node.prev_named_sibling 93 | 94 | return f"{' '.join(reversed(decorators))} {func_name}{params}{return_type_str}".strip() 95 | 96 | def _process_imports(tree_root_node, result: Python_Result) -> None: 97 | """Process import statements and add them to the result.""" 98 | for index, import_nodes_dict in IMPORT_QUERY.matches(tree_root_node): 99 | import_node = list(import_nodes_dict.values())[0][0] 100 | result.py_imports.append(import_node.text.decode('utf8')) 101 | 102 | def extract_types_and_members_from_file_for_python(file_path: str, extract_imports: bool = False) -> Python_Result: 103 | """ 104 | Extract Python class, function, and import information from a file. 
105 | 106 | Args: 107 | file_path: Path to the Python file to analyze 108 | extract_imports: Whether to extract import statements (default: False) 109 | 110 | Returns: 111 | Python_Result object containing extracted information 112 | """ 113 | result = Python_Result() 114 | 115 | if _should_skip_file(file_path): 116 | return result 117 | 118 | source_code = _read_source_code(file_path) 119 | parser = Parser(language=PYTHON_LANGUAGE) 120 | tree = parser.parse(bytes(source_code, 'utf8')) 121 | 122 | # Process classes 123 | for index, class_nodes_dict in CLASS_QUERY.matches(tree.root_node): 124 | class_node = class_nodes_dict['class_def'][0] 125 | result.py_classes.append(_process_class(class_node, class_nodes_dict)) 126 | 127 | # Process top-level functions 128 | for index, function_nodes_dict in FUNCTION_QUERY.matches(tree.root_node): 129 | function_node = function_nodes_dict['function_def'][0] 130 | if (function_node.parent and 131 | function_node.parent.type == 'class_definition'): 132 | continue 133 | 134 | result.py_functions.append(_process_function(function_node)) 135 | 136 | # Process imports if requested 137 | if extract_imports: 138 | _process_imports(tree.root_node, result) 139 | 140 | return result -------------------------------------------------------------------------------- /parser/typescript_parser.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tree_sitter 3 | from . import TYPESCRIPT_LANGUAGE, TSX_LANGUAGE 4 | from typing import List, Dict, Any, Optional 5 | 6 | class TypeScript_Result: 7 | """Holds extracted data from a TypeScript/TSX file.""" 8 | def __init__(self): 9 | self.classes: List[Dict[str, Any]] = [] 10 | self.interfaces: List[Dict[str, Any]] = [] 11 | self.functions: List[Dict[str, Any]] = [] 12 | self.enums: List[Dict[str, Any]] = [] 13 | self.imports: List[Dict[str, Any]] = [] 14 | 15 | def __to_dict__(self): 16 | """Converts the result object to a dictionary.""" 17 | # remove the start and end lines 18 | result_json = {} 19 | if self.classes: 20 | result_json['classes'] = self.classes 21 | if self.interfaces: 22 | result_json['interfaces'] = [interface.get("name") for interface in self.interfaces] 23 | if self.functions: 24 | result_json['functions'] = [f['function_signature'] for f in self.functions] 25 | if self.enums: 26 | result_json['enums'] = self.enums 27 | if self.imports: 28 | result_json['imports'] = self.imports 29 | return result_json 30 | 31 | # Tree-sitter queries for TypeScript/TSX 32 | QUERIES = { 33 | "imports": """ 34 | (import_statement 35 | (import_clause 36 | (named_imports 37 | (import_specifier 38 | name: (identifier) @import.name))? 39 | (namespace_import (identifier) @import.namespace)? 40 | (identifier)? 
@import.default) ; Matches default import like 'import React from "react"' 41 | source: (string) @import.source) 42 | 43 | (import_statement 44 | source: (string) @import.source) ; Matches side-effect imports like 'import "./styles.css"' 45 | 46 | (import_statement 47 | (import_clause (identifier) @import.default) ; Matches 'import defaultExport from "module-name";' 48 | source: (string) @import.source) 49 | """, 50 | "classes": """ 51 | (class_declaration 52 | name: (type_identifier) @class.name 53 | body: (class_body) @class.body) 54 | 55 | (export_statement 56 | declaration: (class_declaration 57 | name: (type_identifier) @class.name 58 | body: (class_body) @class.body)) 59 | """, 60 | "interfaces": """ 61 | (interface_declaration 62 | name: (type_identifier) @interface.name 63 | body: (interface_body) @interface.body) 64 | 65 | (export_statement 66 | (interface_declaration 67 | name: (type_identifier) @interface.name 68 | body: (interface_body) @interface.body)) 69 | """, 70 | "functions": """ 71 | (function_declaration 72 | name: (identifier) @function.name 73 | parameters: (formal_parameters) @function.parameters 74 | return_type: (_)? @function.return_type 75 | body: (statement_block) @function.body) 76 | 77 | (export_statement 78 | declaration: (function_declaration 79 | name: (identifier) @function.name 80 | parameters: (formal_parameters) @function.parameters 81 | return_type: (_)? @function.return_type 82 | body: (statement_block) @function.body)) 83 | 84 | (method_definition 85 | name: (property_identifier) @function.name ; Treat methods as functions for now 86 | parameters: (formal_parameters) @function.parameters 87 | return_type: (_)? @function.return_type 88 | body: (statement_block) @function.body) 89 | 90 | ; Arrow functions assigned to variables (const myFunction = () => {}) 91 | (lexical_declaration 92 | (variable_declarator 93 | name: (identifier) @function.name 94 | value: (arrow_function 95 | parameters: (formal_parameters)? @function.parameters 96 | return_type: (_)? @function.return_type 97 | body: (_) @function.body))) 98 | 99 | (export_statement 100 | declaration: (lexical_declaration 101 | (variable_declarator 102 | name: (identifier) @function.name 103 | value: (arrow_function 104 | parameters: (formal_parameters)? @function.parameters 105 | return_type: (_)? 
@function.return_type 106 | body: (_) @function.body)))) 107 | """, 108 | "enums": """ 109 | (enum_declaration 110 | name: (identifier) @enum.name 111 | body: (enum_body) @enum.body) 112 | 113 | (export_statement 114 | declaration: (enum_declaration 115 | name: (identifier) @enum.name 116 | body: (enum_body) @enum.body)) 117 | """ 118 | } 119 | 120 | def _should_skip_file(file_path: str) -> bool: 121 | """Check if file should be skipped based on path patterns.""" 122 | return (not file_path.endswith('.ts') and not file_path.endswith('.tsx') or 123 | 'node_modules' in file_path or 124 | '__tests__' in file_path or 125 | 'test_' in file_path or 126 | 'test.tsx' in file_path or 127 | 'test.ts' in file_path) 128 | 129 | def _get_node_text(node: tree_sitter.Node) -> str: 130 | """Safely decode node text.""" 131 | return node.text.decode('utf8').replace("\n","").replace(" ","") if node else "" 132 | 133 | def _process_import(capture: Dict[str, tree_sitter.Node]) -> Dict[str, Any]: 134 | """Processes an import capture.""" 135 | source = _get_node_text(capture.get("import.source")).strip('"\'') 136 | imported_items = [] 137 | if "import.name" in capture: 138 | imported_items.append(_get_node_text(capture["import.name"])) 139 | if "import.namespace" in capture: 140 | imported_items.append(f"* as {_get_node_text(capture['import.namespace'])}") 141 | if "import.default" in capture: 142 | # Check if it's a default import name or part of named imports 143 | default_node = capture["import.default"] 144 | # Simple heuristic: if parent is import_clause and it's the first named child, it's likely the default import 145 | if default_node.parent and default_node.parent.type == 'import_clause' and default_node.prev_sibling is None: 146 | imported_items.append(f"default as {_get_node_text(default_node)}") 147 | elif default_node.type == 'identifier' and 'import.name' not in capture and 'import.namespace' not in capture: 148 | # Handles cases like 'import defaultExport from "module-name";' 149 | imported_items.append(f"default as {_get_node_text(default_node)}") 150 | 151 | 152 | return { 153 | "source": source, 154 | "imported_items": imported_items if imported_items else ["*"], # For side-effect imports or if logic fails 155 | "start_line": capture.get("import.source", list(capture.values())[0]).start_point[0] + 1, 156 | "end_line": capture.get("import.source", list(capture.values())[0]).end_point[0] + 1, 157 | } 158 | 159 | 160 | def _process_class(capture: Dict[str, tree_sitter.Node]) -> Dict[str, Any]: 161 | """Processes a class capture.""" 162 | name_node = capture.get("class.name") 163 | body_node = capture.get("class.body") 164 | return { 165 | "name": _get_node_text(name_node), 166 | "start_line": name_node.start_point[0] + 1 if name_node else 0, 167 | "end_line": body_node.end_point[0] + 1 if body_node else 0, 168 | } 169 | 170 | def _process_interface(capture: Dict[str, tree_sitter.Node]) -> Dict[str, Any]: 171 | """Processes an interface capture.""" 172 | name_node = capture.get("interface.name") 173 | body_node = capture.get("interface.body") 174 | return { 175 | "name": _get_node_text(name_node), 176 | "start_line": name_node.start_point[0] + 1 if name_node else 0, 177 | "end_line": body_node.end_point[0] + 1 if body_node else 0, 178 | } 179 | 180 | def _process_function(capture: Dict[str, tree_sitter.Node]) -> Dict[str, Any] | None: 181 | """Processes a function or method capture.""" 182 | name_node = capture.get("function.name") 183 | if not name_node: 184 | return None 185 | params_node = 
capture.get("function.parameters") 186 | body_node = capture.get("function.body") 187 | return_type_node = capture.get("function.return_type") 188 | 189 | # Determine start and end lines carefully 190 | start_node = name_node if name_node else list(capture.values())[0] # Fallback to first node 191 | end_node = body_node if body_node else start_node # Fallback to start node if no body 192 | 193 | name = _get_node_text(name_node) 194 | parameters = _get_node_text(params_node) 195 | return_type = _get_node_text(return_type_node.child(1)) if return_type_node and return_type_node.child_count > 1 else _get_node_text(return_type_node) # Attempt to get type after ':' 196 | 197 | function_signature = f"{name}{parameters}" + (f": {return_type}" if return_type else "") 198 | return { 199 | "function_signature":function_signature, 200 | "start_line": start_node.start_point[0] + 1, 201 | "end_line": end_node.end_point[0] + 1, 202 | } 203 | 204 | def _process_enum(capture: Dict[str, tree_sitter.Node]) -> Dict[str, Any]: 205 | """Processes an enum capture.""" 206 | name_node = capture.get("enum.name") 207 | body_node = capture.get("enum.body") 208 | return { 209 | "name": _get_node_text(name_node), 210 | "start_line": name_node.start_point[0] + 1 if name_node else 0, 211 | "end_line": body_node.end_point[0] + 1 if body_node else 0, 212 | } 213 | 214 | 215 | def extract_types_and_members_from_file_for_typescript(file_path: str, extract_imports: bool = False) -> TypeScript_Result: 216 | """ 217 | Parses a TypeScript or TSX file and extracts structural information. 218 | 219 | Args: 220 | file_path: The path to the TypeScript/TSX file. 221 | extract_imports: Whether to extract import statements. 222 | 223 | Returns: 224 | A TypeScript_Result object containing the extracted data. 
225 | """ 226 | result = TypeScript_Result() 227 | file_extension = os.path.splitext(file_path)[1].lower() 228 | 229 | print(f"Parsing file: {file_path} with extension {file_extension}") 230 | 231 | if _should_skip_file(file_path): 232 | return result 233 | 234 | language = TSX_LANGUAGE if file_extension == ".tsx" else TYPESCRIPT_LANGUAGE 235 | if not language: 236 | print(f"Tree-sitter language for {file_extension} not available.") 237 | return result # Should not happen if __init__ is correct 238 | 239 | parser = tree_sitter.Parser(language=language) 240 | 241 | with open(file_path, "rb") as file: 242 | source_code = file.read() 243 | 244 | tree = parser.parse(source_code) 245 | root_node = tree.root_node 246 | 247 | processing_map = { 248 | "classes": (_process_class, result.classes), 249 | "interfaces": (_process_interface, result.interfaces), 250 | "functions": (_process_function, result.functions), 251 | "enums": (_process_enum, result.enums), 252 | } 253 | if extract_imports: 254 | processing_map["imports"] = (_process_import, result.imports) 255 | 256 | 257 | for query_name, (process_func, result_list) in processing_map.items(): 258 | query_string = QUERIES.get(query_name) 259 | if not query_string: 260 | continue 261 | 262 | query = language.query(query_string) 263 | captures: dict[str, list[tree_sitter.Node]] = query.captures(root_node) 264 | 265 | # Process captures, grouping by the start line of the primary node 266 | processed_captures = {} 267 | capture_items = captures.items() 268 | for capture_name, node_list in capture_items: 269 | # Use the start line of the node that defines the item (e.g., class name, function name) 270 | # Heuristic: Use the first node in the capture group if specific name isn't found 271 | for node in node_list: 272 | primary_node_key = f"{query_name}.name" if f"{query_name}.name" in QUERIES[query_name] else capture_name.split('.')[0] + '.' + capture_name.split('.')[1] # e.g. import.source 273 | start_line = node.start_point[0] 274 | 275 | # Find the most relevant node for the start line key 276 | relevant_node_for_key = node 277 | temp_captures = [] 278 | for cn, n_list in capture_items: 279 | # Check if the node is on the same line as the primary node 280 | for n in n_list: 281 | if n.start_point[0] == start_line: 282 | temp_captures.append((n, cn)) 283 | 284 | for n_temp, cn_temp in temp_captures: 285 | if cn_temp == primary_node_key: 286 | relevant_node_for_key = n_temp 287 | break 288 | # Fallback for imports where 'source' is key 289 | if query_name == "imports" and cn_temp == "import.source": 290 | relevant_node_for_key = n_temp 291 | # Don't break, maybe find a name later 292 | 293 | key = relevant_node_for_key.start_point[0] # Group by the start line of the defining node 294 | 295 | if key not in processed_captures: 296 | processed_captures[key] = {} 297 | processed_captures[key][capture_name] = node 298 | 299 | # Apply processing function to grouped captures 300 | for key in sorted(processed_captures.keys()): 301 | processed_item = process_func(processed_captures[key]) 302 | if processed_item: # Ensure item was processed correctly 303 | # Avoid duplicates based on name and start line for functions/classes etc. 
304 | is_duplicate = False 305 | if query_name != "imports": # Imports can have same source 306 | for existing_item in result_list: 307 | if existing_item.get("name") == processed_item.get("name") and \ 308 | existing_item.get("start_line") == processed_item.get("start_line"): 309 | is_duplicate = True 310 | break 311 | if not is_duplicate: 312 | processed_item.pop("start_line", None) 313 | processed_item.pop("end_line", None) 314 | result_list.append(processed_item) 315 | return result 316 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tree-sitter==0.24.0 2 | tree-sitter-python==0.23.6 3 | tree-sitter-c-sharp==0.23.1 4 | tree-sitter-javascript==0.23.1 5 | tree-sitter-typescript==0.23.2 6 | -------------------------------------------------------------------------------- /test/benchmark_parser.py: -------------------------------------------------------------------------------- 1 | import time 2 | from parser.csharp_parser import extract_types_and_members_from_file_for_csharp as tree_sitter_parse 3 | from parser.csharp_parser import C_Sharp_Result as TreeSitterResult 4 | 5 | # Backup old implementation for comparison 6 | def regex_parse(file_path: str) -> TreeSitterResult: 7 | import re 8 | patterns_csharp = { 9 | 'classes': re.compile(r'\bclass\s+(\w+)(?=\s*[:{])'), 10 | 'structs': re.compile(r'\bstruct\s+(\w+)(?=\s*[:{])'), 11 | 'interfaces': re.compile(r'\binterface\s+(\w+)(?=\s*[:{])'), 12 | 'enums': re.compile(r'\benum\s+(\w+)(?=\s*[{])'), 13 | 'methods': re.compile(r'\b(?:public|private|protected|internal)\s+(?:static\s+)?(?:[\w\<\>\[\]]+\s+)+(\w+)\s*\([^)]*\)\s*(?=\{|=>)') 14 | } 15 | 16 | results = TreeSitterResult() 17 | with open(file_path, 'r', encoding='utf-8') as file: 18 | content = file.read() 19 | for key, pattern in patterns_csharp.items(): 20 | matches = pattern.findall(content) 21 | if not matches: 22 | continue 23 | for match in matches: 24 | getattr(results, key).append(match) 25 | return results 26 | 27 | def benchmark_parser(parser_func, file_path: str, iterations: int = 100): 28 | start_time = time.time() 29 | 30 | # Warm up 31 | for _ in range(5): 32 | parser_func(file_path) 33 | 34 | # Time individual runs 35 | times = [] 36 | for _ in range(iterations): 37 | start = time.perf_counter() 38 | parser_func(file_path) 39 | end = time.perf_counter() 40 | times.append(end - start) 41 | 42 | total_time = time.time() - start_time 43 | avg_time = sum(times) / iterations 44 | min_time = min(times) 45 | max_time = max(times) 46 | 47 | return { 48 | 'total_time': total_time, 49 | 'avg_time': avg_time, 50 | 'min_time': min_time, 51 | 'max_time': max_time, 52 | 'iterations': iterations 53 | } 54 | 55 | if __name__ == '__main__': 56 | test_file = 'resources/test.cs' 57 | iterations = 100 58 | 59 | print("Benchmarking Tree-sitter parser...") 60 | ts_results = benchmark_parser(tree_sitter_parse, test_file, iterations) 61 | print(f"Tree-sitter results: {ts_results}") 62 | 63 | print("\nBenchmarking Regex parser...") 64 | regex_results = benchmark_parser(regex_parse, test_file, iterations) 65 | print(f"Regex results: {regex_results}") 66 | 67 | print("\nComparison:") 68 | print(f"Tree-sitter is {regex_results['avg_time'] / ts_results['avg_time']:.2f}x faster than Regex") 69 | print(f"Tree-sitter parsed {iterations} files in {ts_results['total_time']:.4f}s") 70 | print(f"Regex parsed {iterations} files in {regex_results['total_time']:.4f}s") 71 | 
72 | # Print actual parsing results for verification 73 | print("\nTree-sitter parse results:") 74 | print(tree_sitter_parse(test_file).__to_dict__()) 75 | 76 | print("\nRegex parse results:") 77 | print(regex_parse(test_file).__to_dict__()) -------------------------------------------------------------------------------- /test/resources/test.cs: -------------------------------------------------------------------------------- 1 | using System; 2 | 3 | namespace TestProject 4 | { 5 | public interface IExample 6 | { 7 | void InterfaceMethod(); 8 | } 9 | 10 | public abstract class BaseClass 11 | { 12 | public abstract void AbstractMethod(); 13 | } 14 | 15 | public class ExampleClass : BaseClass, IExample 16 | { 17 | private int _privateField; 18 | public string PublicProperty { get; set; } 19 | 20 | public const double PI = 3.14; 21 | 22 | public override void AbstractMethod() 23 | { 24 | Console.WriteLine("Implemented abstract method"); 25 | } 26 | 27 | public void InterfaceMethod() 28 | { 29 | Console.WriteLine("Implemented interface method"); 30 | } 31 | 32 | public string GetGreeting(string name) 33 | { 34 | return $"Hello, {name}"; 35 | } 36 | 37 | private void PrivateMethod() 38 | { 39 | // Do something private 40 | } 41 | } 42 | 43 | public struct Point 44 | { 45 | public int X; 46 | public int Y; 47 | } 48 | 49 | public enum Status 50 | { 51 | Active, 52 | Inactive, 53 | Pending 54 | } 55 | } -------------------------------------------------------------------------------- /test/resources/test.ts: -------------------------------------------------------------------------------- 1 | // Sample TypeScript file for testing the parser 2 | 3 | import { AnotherClass } from "./another-module"; 4 | import * as fs from "fs"; 5 | 6 | // Simple interface 7 | interface Person { 8 | name: string; 9 | age: number; 10 | greet?(): void; // Optional method 11 | } 12 | 13 | // Enum definition 14 | export enum Color { 15 | Red, 16 | Green, 17 | Blue = 5, // Explicit value 18 | } 19 | 20 | // Class definition 21 | class SampleClass extends AnotherClass implements Person { 22 | public name: string; 23 | readonly age: number; 24 | private secret: string; 25 | protected status: Color; 26 | 27 | constructor(name: string, age: number) { 28 | super(); 29 | this.name = name; 30 | this.age = age; 31 | this.secret = "shhh"; 32 | this.status = Color.Green; 33 | } 34 | 35 | // Public method 36 | public greet(): void { 37 | console.log( 38 | `Hello, my name is ${this.name} and I am ${this.age} years old.` 39 | ); 40 | } 41 | 42 | // Private method 43 | private revealSecret(): string { 44 | return this.secret; 45 | } 46 | 47 | // Static method 48 | static createDefault(): SampleClass { 49 | return new SampleClass("Default", 0); 50 | } 51 | 52 | // Async method 53 | async loadData(path: string): Promise { 54 | return new Promise((resolve, reject) => { 55 | fs.readFile(path, "utf8", (err, data) => { 56 | if (err) { 57 | reject(err); 58 | } else { 59 | resolve(data); 60 | } 61 | }); 62 | }); 63 | } 64 | } 65 | 66 | // Top-level function 67 | function add(x: number, y: number): number { 68 | return x + y; 69 | } 70 | 71 | // Arrow function 72 | const multiply = (a: number, b: number): number => a * b; 73 | 74 | // Exporting variables 75 | export const PI = 3.14159; 76 | export let version = "1.0.0"; 77 | 78 | // Type alias 79 | type StringOrNumber = string | number; 80 | 81 | let value: StringOrNumber = "hello"; 82 | value = 123; 83 | 84 | // Using the enum 85 | let myColor: Color = Color.Blue; 86 | 87 | // Using the class 
88 | const personInstance = new SampleClass("Alice", 30); 89 | personInstance.greet(); 90 | const defaultPerson = SampleClass.createDefault(); 91 | 92 | console.log(add(5, 3)); 93 | console.log(multiply(4, 6)); 94 | console.log(`Color: ${Color[myColor]}, Value: ${myColor}`); 95 | -------------------------------------------------------------------------------- /test/resources/test.tsx: -------------------------------------------------------------------------------- 1 | // Sample TSX file for testing the parser 2 | 3 | import React, { useState, useEffect } from "react"; 4 | import { SampleClass } from "./test"; // Assuming test.ts exports SampleClass 5 | 6 | // Interface for component props 7 | interface MyComponentProps { 8 | title: string; 9 | initialCount?: number; 10 | } 11 | 12 | // Functional component with hooks and JSX 13 | const MyComponent: React.FC = ({ 14 | title, 15 | initialCount = 0, 16 | }) => { 17 | const [count, setCount] = useState(initialCount); 18 | const [data, setData] = useState(null); 19 | 20 | useEffect(() => { 21 | // Simulate fetching data 22 | const instance = new SampleClass("TSX Component", count); 23 | setData(instance); 24 | console.log("Component mounted or count updated"); 25 | 26 | return () => { 27 | console.log("Component will unmount or count changed"); 28 | }; 29 | }, [count]); // Dependency array 30 | 31 | const increment = () => setCount((prevCount) => prevCount + 1); 32 | const decrement = () => setCount((prevCount) => prevCount - 1); 33 | 34 | return ( 35 |
<div
36 |       className="my-component"
37 |       data-count={count}
38 |     >
39 |       <h1>{title}</h1>
40 |       <p>Current Count: {count}</p>
41 |       {data && <p>Data Name: {data.name}</p>}
42 |       <button onClick={increment}>Increment</button>
43 |       <button onClick={decrement}>Decrement</button>
44 |       {/* Self-closing tag */}
45 |       <hr />
46 |       {/* Fragment */}
47 |       <>
48 |         <span>Fragment content</span>
49 |       </>
50 |       {/* Conditional rendering */}
51 |       {count > 5 && <p>Count is greater than 5!</p>}
52 |     </div>
53 |   );
54 | };
55 |
56 | // Another simple component
57 | function SimpleDiv() {
58 |   return <div>Just a simple div.</div>
; 59 | } 60 | 61 | // Exporting the component 62 | export default MyComponent; 63 | export { SimpleDiv }; 64 | 65 | // Top-level variable using JSX type 66 | let element: JSX.Element = ; 67 | -------------------------------------------------------------------------------- /test/test_parser.py: -------------------------------------------------------------------------------- 1 | import time 2 | from parser.python_parser import extract_types_and_members_from_file_for_python 3 | from parser.csharp_parser import extract_types_and_members_from_file_for_csharp 4 | from parser.typescript_parser import extract_types_and_members_from_file_for_typescript 5 | 6 | def test_python_parser(file_path): 7 | print(f"\nTesting Python parser on: {file_path}") 8 | 9 | start_time = time.time() 10 | result = extract_types_and_members_from_file_for_python(file_path, extract_imports=True) 11 | elapsed_time = time.time() - start_time 12 | 13 | print(f"\nParsing completed in {elapsed_time:.4f} seconds") 14 | 15 | print("\nClasses found:") 16 | for cls in result.py_classes: 17 | print(f"- {cls['name']}") 18 | if 'bases' in cls: 19 | print(f" Inherits from: {', '.join(cls['bases'])}") 20 | if 'methods' in cls: 21 | print(f" Methods: {len(cls['methods'])}") 22 | for method in cls['methods']: 23 | print(f" - {method}") 24 | 25 | 26 | def test_csharp_parser(file_path): 27 | print(f"\nTesting C# parser on: {file_path}") 28 | 29 | start_time = time.time() 30 | result = extract_types_and_members_from_file_for_csharp(file_path) 31 | elapsed_time = time.time() - start_time 32 | 33 | print(f"\nParsing completed in {elapsed_time:.4f} seconds") 34 | 35 | print("\nClasses found:") 36 | for cls in result.classes: 37 | print(f"- {cls['name']}") 38 | if 'bases' in cls: 39 | print(f" Inherits from: {cls['bases']}") 40 | if 'methods' in cls: 41 | print(f" Methods: {len(cls['methods'])}") 42 | for method in cls['methods']: 43 | print(f" - {method['name']}()") 44 | if 'modifiers' in method: 45 | print(f" Modifiers: {', '.join(method['modifiers'])}") 46 | 47 | print("Raw dictionary:") 48 | print(result.__to_dict__()) 49 | 50 | 51 | def test_typescript_parser(file_path): 52 | print(f"\nTesting TypeScript/TSX parser on: {file_path}") 53 | 54 | start_time = time.time() 55 | result = extract_types_and_members_from_file_for_typescript(file_path, extract_imports=True) 56 | elapsed_time = time.time() - start_time 57 | 58 | print(f"\nParsing completed in {elapsed_time:.4f} seconds") 59 | 60 | print("\nImports found:") 61 | for imp in result.imports: 62 | print(f"- {imp}") 63 | 64 | print("\nInterfaces found:") 65 | for iface in result.interfaces: 66 | print(f"- {iface['name']}") 67 | # Add more detail printing if needed 68 | 69 | print("\nEnums found:") 70 | for enm in result.enums: 71 | print(f"- {enm['name']}") 72 | # Add more detail printing if needed 73 | 74 | print("\nClasses found:") 75 | for cls in result.classes: 76 | print(f"- {cls['name']}") 77 | if 'heritage' in cls: 78 | print(f" Heritage: {cls['heritage']}") 79 | if 'methods' in cls: 80 | print(f" Methods: {len(cls['methods'])}") 81 | for method in cls['methods']: 82 | print(f" - {method['name']}") # Basic method name 83 | # Add more detail printing if needed 84 | 85 | print("\nFunctions found:") 86 | for func in result.functions: 87 | print(f"- {func['function_signature']}") 88 | # Add more detail printing if needed 89 | 90 | print("\nRaw dictionary:") 91 | print(result.__to_dict__()) 92 | 93 | 94 | if __name__ == "__main__": 95 | test_python_parser("./Project_Indexer.py") 96 | 
test_python_parser("parser/python_parser.py") 97 | test_csharp_parser("test/resources/test.cs") 98 | test_typescript_parser("test/resources/test.ts") 99 | test_typescript_parser("test/resources/test.tsx") --------------------------------------------------------------------------------