├── crates ├── similarity-php │ ├── src │ │ └── lib.rs │ └── Cargo.toml ├── similarity-py │ ├── src │ │ └── lib.rs │ └── Cargo.toml ├── similarity-rs │ ├── src │ │ └── lib.rs │ ├── Cargo.toml │ └── tests │ │ ├── test_min_tokens.rs │ │ ├── parser_test.rs │ │ ├── test_rename_zero.rs │ │ ├── test_debug_rename_cost.rs │ │ ├── test_full_function_similarity.rs │ │ ├── debug_ast.rs │ │ ├── test_tsed_debugging.rs │ │ ├── test_function_extraction.rs │ │ └── test_ast_comparison.rs ├── similarity-ts │ ├── src │ │ └── lib.rs │ ├── Cargo.toml │ └── tests │ │ └── tsx_test.rs ├── similarity-elixir │ ├── src │ │ └── lib.rs │ ├── tests │ │ └── elixir_test_helper.rs │ ├── Cargo.toml │ └── README.md ├── similarity-generic │ ├── language_configs │ │ ├── ruby.json │ │ ├── c.json │ │ ├── cpp.json │ │ ├── java.json │ │ ├── csharp.json │ │ └── go.json │ ├── examples │ │ ├── sample.go │ │ ├── configs │ │ │ ├── go.json │ │ │ └── custom-language-template.json │ │ └── usage.sh │ ├── Cargo.toml │ └── build.rs ├── core │ ├── tests │ │ ├── fixtures │ │ │ ├── sample2.ts │ │ │ └── sample1.ts │ │ ├── debug_similarity.rs │ │ └── ast_fingerprint_test.rs │ ├── README.md │ ├── src │ │ ├── tree.rs │ │ ├── cli_output.rs │ │ ├── cli_file_utils.rs │ │ └── ast_exchange.rs │ └── Cargo.toml ├── similarity-css │ ├── Cargo.toml │ ├── src │ │ ├── lib.rs │ │ └── bin │ │ │ └── test_parser.rs │ ├── examples │ │ ├── test.css │ │ └── test.scss │ └── README.md └── similarity-md │ ├── src │ └── lib.rs │ ├── examples │ ├── japanese_similarity_test.md │ └── test_levenshtein.rs │ └── Cargo.toml ├── test └── __fixtures__ │ ├── edge_cases │ ├── empty_1.ts │ ├── empty_2.ts │ ├── syntax_error_1.ts │ ├── syntax_error_2.ts │ ├── identical_1.ts │ └── identical_2.ts │ ├── similar │ ├── function_rename_2.ts │ ├── function_rename_1.ts │ ├── interface_extend_1.ts │ ├── interface_extend_2.ts │ ├── async_function_2.ts │ ├── async_function_1.ts │ ├── class_rename_1.ts │ └── class_rename_2.ts │ ├── dissimilar │ ├── 
function_vs_class_1.ts │ ├── sync_vs_async_1.ts │ ├── interface_vs_type_1.ts │ ├── interface_vs_type_2.ts │ ├── function_vs_class_2.ts │ ├── sync_vs_async_2.ts │ ├── imperative_vs_functional_2.ts │ └── imperative_vs_functional_1.ts │ ├── performance │ ├── small │ │ ├── small_3.ts │ │ ├── small_4.ts │ │ ├── small_5.ts │ │ ├── small_1.ts │ │ └── small_2.ts │ └── metadata.json │ ├── duplication │ ├── structural │ │ ├── metadata.json │ │ ├── loop_pattern_1.ts │ │ ├── visitnode_pattern_1.ts │ │ ├── visitnode_pattern_3.ts │ │ ├── visitnode_pattern_2.ts │ │ ├── array_iteration_pattern_2.ts │ │ ├── error_handling_pattern_2.ts │ │ ├── array_iteration_pattern_1.ts │ │ └── error_handling_pattern_1.ts │ ├── semantic │ │ ├── async_operations_1.ts │ │ ├── async_operations_2.ts │ │ └── validation_pattern_1.ts │ ├── exact │ │ ├── service_duplication_1.ts │ │ └── service_duplication_2.ts │ └── copy_paste │ │ └── loop_pattern.ts │ └── refactoring │ └── class_to_function │ ├── calculator_class.ts │ ├── user_service_functions.ts │ ├── calculator_functions.ts │ ├── user_service_class.ts │ ├── metadata.json │ └── repository_class.ts ├── rust-toolchain.toml ├── .rustfmt.toml ├── clippy.toml ├── __deprecated ├── tsdown.config.ts ├── vitest.config.ts ├── package-build.json ├── src │ ├── core │ │ ├── levenshtein.ts │ │ ├── oxc_types.ts │ │ ├── ast_traversal.ts │ │ └── ast.ts │ ├── cli │ │ └── io.ts │ └── parser.ts └── test │ └── basic.test.ts ├── examples ├── identical_enums.rs ├── test_enums.rs ├── overlap-detection │ ├── file2.js │ ├── file1.js │ ├── exact-duplication.js │ ├── false-positives.js │ ├── similar-patterns.js │ └── partial-overlap.js ├── specs │ ├── sample_project │ │ └── src │ │ │ ├── models │ │ │ ├── product.ts │ │ │ ├── order.ts │ │ │ └── user.ts │ │ │ ├── utils │ │ │ ├── errors.ts │ │ │ ├── validator.ts │ │ │ └── logger.ts │ │ │ ├── components │ │ │ ├── user_list.ts │ │ │ └── product_list.ts │ │ │ └── services │ │ │ ├── user_service.ts │ │ │ ├── product_service.ts │ │ │ 
└── order_service.ts │ ├── test_cli.ts │ ├── README.md │ ├── duplicate-types.ts │ ├── test_async.ts │ ├── test_extraction.ts │ ├── debug_arrow.ts │ ├── duplicate-functions.ts │ ├── duplicate-functions2.ts │ ├── basic_usage.ts │ ├── type-similarity │ │ ├── test_types_sample.ts │ │ └── test_type_literal_sample.ts │ └── debug_ast.ts ├── README.md ├── mixed_language_project │ ├── helpers.py │ └── utils.js ├── test_structure_comparison.ts ├── duplicate_python.py ├── test_rust_structures.rs ├── test_different_structures.rs ├── test_different_ts_structures.ts ├── rust_types_example.rs └── test_rust_with_derives.rs ├── .mcp.json ├── .oxlintrc.json ├── .cargo └── audit.toml ├── benchmarks ├── data │ ├── test_simple.ts │ └── test_duplicates.ts └── README.md ├── benchmark_results.txt ├── .claude └── settings.json ├── biome.json ├── docs ├── algorithm │ ├── README.md │ └── tsed-similarity-summary.md ├── implementation │ ├── README.md │ └── performance-baseline.md ├── lib │ └── README.md ├── README.md ├── prompt-ja.md └── prompt.md ├── Cargo.toml ├── KNOWN_ISSUES.md ├── LICENSE ├── RECOMMENDATIONS.md └── RELEASE_NOTES.md /crates/similarity-php/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod php_parser; 2 | -------------------------------------------------------------------------------- /crates/similarity-py/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod python_parser; 2 | -------------------------------------------------------------------------------- /crates/similarity-rs/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod rust_parser; 2 | -------------------------------------------------------------------------------- /test/__fixtures__/edge_cases/empty_1.ts: -------------------------------------------------------------------------------- 1 | // Edge case: Empty file 
-------------------------------------------------------------------------------- /test/__fixtures__/edge_cases/empty_2.ts: -------------------------------------------------------------------------------- 1 | // Edge case: Empty file with different comment -------------------------------------------------------------------------------- /rust-toolchain.toml: -------------------------------------------------------------------------------- 1 | [toolchain] 2 | channel = "stable" 3 | components = ["rustfmt", "clippy"] -------------------------------------------------------------------------------- /crates/similarity-ts/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod parallel; 2 | pub mod sequential; 3 | pub mod typescript_parser; 4 | -------------------------------------------------------------------------------- /.rustfmt.toml: -------------------------------------------------------------------------------- 1 | # Rust formatting configuration 2 | edition = "2021" 3 | max_width = 100 4 | use_small_heuristics = "Max" -------------------------------------------------------------------------------- /clippy.toml: -------------------------------------------------------------------------------- 1 | # Clippy linting configuration 2 | cognitive-complexity-threshold = 30 3 | too-many-arguments-threshold = 7 -------------------------------------------------------------------------------- /crates/similarity-elixir/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod elixir_parser; 2 | pub mod parallel; 3 | 4 | pub use elixir_parser::ElixirParser; 5 | -------------------------------------------------------------------------------- /test/__fixtures__/edge_cases/syntax_error_1.ts: -------------------------------------------------------------------------------- 1 | // Edge case: Syntax error 2 | function broken( { 3 | return "missing closing brace"; 
-------------------------------------------------------------------------------- /test/__fixtures__/edge_cases/syntax_error_2.ts: -------------------------------------------------------------------------------- 1 | // Edge case: Different syntax error 2 | class Incomplete { 3 | constructor() { 4 | this.value = -------------------------------------------------------------------------------- /__deprecated/tsdown.config.ts: -------------------------------------------------------------------------------- 1 | import { defineConfig } from "tsdown"; 2 | 3 | export default defineConfig({ 4 | entry: {}, 5 | external: ["oxc-parser"], 6 | }); 7 | -------------------------------------------------------------------------------- /examples/identical_enums.rs: -------------------------------------------------------------------------------- 1 | enum Color1 { 2 | Red, 3 | Green, 4 | Blue, 5 | } 6 | 7 | enum Color2 { 8 | Red, 9 | Green, 10 | Blue, 11 | } -------------------------------------------------------------------------------- /test/__fixtures__/similar/function_rename_2.ts: -------------------------------------------------------------------------------- 1 | // Similar: Function with renamed identifiers 2 | function addNumbers(x: number, y: number): number { 3 | return x + y; 4 | } -------------------------------------------------------------------------------- /.mcp.json: -------------------------------------------------------------------------------- 1 | { 2 | "mcpServers": { 3 | "rust": { 4 | "command": "npx", 5 | "args": ["-y", "@mizchi/lsmcp", "--bin", "rust-analyzer"] 6 | } 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /test/__fixtures__/similar/function_rename_1.ts: -------------------------------------------------------------------------------- 1 | // Similar: Function with renamed identifiers 2 | function calculateSum(a: number, b: number): number { 3 | return a + b; 4 | } 
-------------------------------------------------------------------------------- /examples/test_enums.rs: -------------------------------------------------------------------------------- 1 | enum Status { 2 | Active, 3 | Inactive, 4 | Pending, 5 | } 6 | 7 | enum State { 8 | Running, 9 | Stopped, 10 | Waiting, 11 | } -------------------------------------------------------------------------------- /test/__fixtures__/dissimilar/function_vs_class_1.ts: -------------------------------------------------------------------------------- 1 | // Dissimilar: Function implementation 2 | function processData(data: string[]): string { 3 | return data.filter(item => item.length > 0).join(', '); 4 | } -------------------------------------------------------------------------------- /.oxlintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "rules": { 3 | "no-console": "off", 4 | "typescript/no-explicit-any": "off" 5 | }, 6 | "ignorePatterns": ["node_modules", "dist", "build", "coverage", "*.min.js", "target", "test/__fixtures__"] 7 | } 8 | -------------------------------------------------------------------------------- /test/__fixtures__/dissimilar/sync_vs_async_1.ts: -------------------------------------------------------------------------------- 1 | // Dissimilar: Synchronous code 2 | function calculateTotal(items: number[]): number { 3 | let total = 0; 4 | for (const item of items) { 5 | total += item; 6 | } 7 | return total; 8 | } -------------------------------------------------------------------------------- /.cargo/audit.toml: -------------------------------------------------------------------------------- 1 | [advisories] 2 | # List of advisory IDs to ignore 3 | ignore = [ 4 | # instant is unmaintained - used by vibrato->rucrf->argmin 5 | "RUSTSEC-2024-0384", 6 | # paste is unmaintained - used by vibrato->rucrf->argmin 7 | "RUSTSEC-2024-0436", 8 | ] 
-------------------------------------------------------------------------------- /test/__fixtures__/similar/interface_extend_1.ts: -------------------------------------------------------------------------------- 1 | // Similar: Interface with minor additions 2 | interface BaseConfig { 3 | apiUrl: string; 4 | timeout: number; 5 | retryCount: number; 6 | } 7 | 8 | interface AppConfig extends BaseConfig { 9 | debug: boolean; 10 | } -------------------------------------------------------------------------------- /benchmarks/data/test_simple.ts: -------------------------------------------------------------------------------- 1 | function add(a: number, b: number): number { 2 | return a + b; 3 | } 4 | 5 | function subtract(a: number, b: number): number { 6 | return a - b; 7 | } 8 | 9 | function multiply(a: number, b: number): number { 10 | return a * b; 11 | } 12 | -------------------------------------------------------------------------------- /__deprecated/vitest.config.ts: -------------------------------------------------------------------------------- 1 | import { defineConfig } from "vitest/config"; 2 | export default defineConfig({ 3 | test: { 4 | include: ["test/**/*.test.ts"], 5 | coverage: { 6 | include: ["src/**/*.ts"], 7 | exclude: ["test/**", "examples/**", "scripts/**"], 8 | }, 9 | }, 10 | }); 11 | -------------------------------------------------------------------------------- /test/__fixtures__/similar/interface_extend_2.ts: -------------------------------------------------------------------------------- 1 | // Similar: Interface with minor additions 2 | interface BaseSettings { 3 | apiUrl: string; 4 | timeout: number; 5 | retryCount: number; 6 | } 7 | 8 | interface ApplicationSettings extends BaseSettings { 9 | debug: boolean; 10 | logLevel: string; 11 | } -------------------------------------------------------------------------------- /examples/overlap-detection/file2.js: -------------------------------------------------------------------------------- 1 
| // File 2: Contains similar patterns 2 | 3 | function transformData(data) { 4 | const output = []; 5 | for (let i = 0; i < data.length; i++) { 6 | if (data[i].active) { 7 | output.push(data[i].value * 2); 8 | } 9 | } 10 | return output; 11 | } -------------------------------------------------------------------------------- /test/__fixtures__/similar/async_function_2.ts: -------------------------------------------------------------------------------- 1 | // Similar: Async function with same logic 2 | async function getUserInfo(id: string): Promise { 3 | const res = await fetch(`/api/users/${id}`); 4 | if (!res.ok) { 5 | throw new Error('User not found'); 6 | } 7 | const userData = await res.json(); 8 | return userData; 9 | } -------------------------------------------------------------------------------- /examples/overlap-detection/file1.js: -------------------------------------------------------------------------------- 1 | // File 1: Contains functions with overlapping patterns 2 | 3 | function processItems(items) { 4 | const results = []; 5 | for (let i = 0; i < items.length; i++) { 6 | if (items[i].active) { 7 | results.push(items[i].value * 2); 8 | } 9 | } 10 | return results; 11 | } -------------------------------------------------------------------------------- /test/__fixtures__/similar/async_function_1.ts: -------------------------------------------------------------------------------- 1 | // Similar: Async function with same logic 2 | async function fetchUserData(userId: string): Promise { 3 | const response = await fetch(`/api/users/${userId}`); 4 | if (!response.ok) { 5 | throw new Error('User not found'); 6 | } 7 | const data = await response.json(); 8 | return data; 9 | } -------------------------------------------------------------------------------- /test/__fixtures__/dissimilar/interface_vs_type_1.ts: -------------------------------------------------------------------------------- 1 | // Dissimilar: Interface declaration 2 | interface 
DatabaseConnection { 3 | host: string; 4 | port: number; 5 | username: string; 6 | password: string; 7 | database: string; 8 | 9 | connect(): Promise; 10 | disconnect(): Promise; 11 | query(sql: string, params?: any[]): Promise; 12 | } -------------------------------------------------------------------------------- /examples/specs/sample_project/src/models/product.ts: -------------------------------------------------------------------------------- 1 | export interface Product { 2 | id: string; 3 | name: string; 4 | description: string; 5 | price: number; 6 | category: string; 7 | stock: number; 8 | createdAt: Date; 9 | updatedAt?: Date; 10 | } 11 | 12 | export interface ProductVariant extends Product { 13 | parentId: string; 14 | sku: string; 15 | attributes: Record; 16 | } 17 | -------------------------------------------------------------------------------- /test/__fixtures__/performance/small/small_3.ts: -------------------------------------------------------------------------------- 1 | // Small test file 3 2 | 3 | import { Component } from '@angular/core'; 4 | import { Observable } from 'rxjs'; 5 | 6 | export function calculate3(a: number, b: number): number { 7 | const result = a + b; 8 | console.log('Result:', result); 9 | return result; 10 | } 11 | 12 | interface Data3 { 13 | id: number; 14 | name: string; 15 | value: number; 16 | } 17 | 18 | -------------------------------------------------------------------------------- /test/__fixtures__/performance/small/small_4.ts: -------------------------------------------------------------------------------- 1 | // Small test file 4 2 | 3 | import { Component } from '@angular/core'; 4 | import { Observable } from 'rxjs'; 5 | 6 | export function calculate4(a: number, b: number): number { 7 | const result = a + b; 8 | console.log('Result:', result); 9 | return result; 10 | } 11 | 12 | interface Data4 { 13 | id: number; 14 | name: string; 15 | value: number; 16 | } 17 | 18 | 
-------------------------------------------------------------------------------- /test/__fixtures__/performance/small/small_5.ts: -------------------------------------------------------------------------------- 1 | // Small test file 5 2 | 3 | import { Component } from '@angular/core'; 4 | import { Observable } from 'rxjs'; 5 | 6 | export function calculate5(a: number, b: number): number { 7 | const result = a + b; 8 | console.log('Result:', result); 9 | return result; 10 | } 11 | 12 | interface Data5 { 13 | id: number; 14 | name: string; 15 | value: number; 16 | } 17 | 18 | -------------------------------------------------------------------------------- /test/__fixtures__/dissimilar/interface_vs_type_2.ts: -------------------------------------------------------------------------------- 1 | // Dissimilar: Type aliases and utility types 2 | type Status = 'pending' | 'active' | 'completed' | 'failed'; 3 | 4 | type UserRole = 'admin' | 'user' | 'guest'; 5 | 6 | type ApiResponse = { 7 | success: boolean; 8 | data?: T; 9 | error?: string; 10 | timestamp: number; 11 | }; 12 | 13 | type DeepPartial = { 14 | [P in keyof T]?: T[P] extends object ? 
DeepPartial : T[P]; 15 | }; -------------------------------------------------------------------------------- /crates/similarity-elixir/tests/elixir_test_helper.rs: -------------------------------------------------------------------------------- 1 | use std::io::Write; 2 | use std::path::PathBuf; 3 | use tempfile::TempDir; 4 | 5 | pub fn create_elixir_file(content: &str) -> (TempDir, PathBuf) { 6 | let dir = TempDir::new().unwrap(); 7 | let file_path = dir.path().join("test.ex"); 8 | let mut file = std::fs::File::create(&file_path).unwrap(); 9 | writeln!(file, "{content}").unwrap(); 10 | file.flush().unwrap(); 11 | (dir, file_path) 12 | } 13 | -------------------------------------------------------------------------------- /test/__fixtures__/dissimilar/function_vs_class_2.ts: -------------------------------------------------------------------------------- 1 | // Dissimilar: Class implementation 2 | class DataProcessor { 3 | private cache: Map = new Map(); 4 | 5 | process(input: string): string { 6 | if (this.cache.has(input)) { 7 | return this.cache.get(input)!; 8 | } 9 | const result = this.transform(input); 10 | this.cache.set(input, result); 11 | return result; 12 | } 13 | 14 | private transform(data: string): string { 15 | return data.toUpperCase(); 16 | } 17 | } -------------------------------------------------------------------------------- /test/__fixtures__/similar/class_rename_1.ts: -------------------------------------------------------------------------------- 1 | // Similar: Class with renamed identifiers 2 | class UserService { 3 | private users: User[] = []; 4 | 5 | addUser(user: User): void { 6 | this.users.push(user); 7 | } 8 | 9 | getUser(id: number): User | undefined { 10 | return this.users.find(u => u.id === id); 11 | } 12 | 13 | removeUser(id: number): boolean { 14 | const index = this.users.findIndex(u => u.id === id); 15 | if (index !== -1) { 16 | this.users.splice(index, 1); 17 | return true; 18 | } 19 | return false; 20 | } 21 | } 
-------------------------------------------------------------------------------- /test/__fixtures__/performance/metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Performance test fixtures of various sizes", 3 | "categories": { 4 | "small": { 5 | "description": "Small files (10-50 lines)", 6 | "averageSize": "~30 lines", 7 | "files": 5 8 | }, 9 | "medium": { 10 | "description": "Medium files (100-500 lines)", 11 | "averageSize": "~300 lines", 12 | "files": 3 13 | }, 14 | "large": { 15 | "description": "Large files (1000+ lines)", 16 | "averageSize": "~1500 lines", 17 | "files": 2 18 | } 19 | } 20 | } -------------------------------------------------------------------------------- /test/__fixtures__/duplication/structural/metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Structural duplication patterns found in the codebase", 3 | "testCases": [ 4 | { 5 | "name": "visitNode pattern variations", 6 | "files": [ 7 | "visitnode_pattern_1.ts", 8 | "visitnode_pattern_2.ts", 9 | "visitnode_pattern_3.ts" 10 | ], 11 | "expectedSimilarity": { 12 | "1_vs_2": 0.75, 13 | "1_vs_3": 0.70, 14 | "2_vs_3": 0.65 15 | }, 16 | "notes": "Similar AST traversal patterns with different purposes" 17 | } 18 | ] 19 | } -------------------------------------------------------------------------------- /benchmark_results.txt: -------------------------------------------------------------------------------- 1 | Finished `bench` profile [optimized] target(s) in 0.04s 2 | Running unittests src/lib.rs (target/release/deps/ts_similarity_core-f9e4be796569fb52) 3 | 4 | running 0 tests 5 | 6 | test result: ok. 
0 passed; 0 failed; 0 ignored; 0 measured; 23 filtered out; finished in 0.00s 7 | 8 | Running benches/function_comparison.rs (target/release/deps/function_comparison-d48bd7340358873b) 9 | Gnuplot not found, using plotters backend 10 | Running benches/tsed_benchmark.rs (target/release/deps/tsed_benchmark-444e570d13af4368) 11 | Gnuplot not found, using plotters backend 12 | -------------------------------------------------------------------------------- /.claude/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "permissions": { 3 | "allow": [ 4 | "Bash(pnpm test)", 5 | "Bash(ls:*)", 6 | "Bash(grep:*)", 7 | "mcp__typescript__find_references", 8 | "mcp__typescript__get_definitions", 9 | "mcp__typescript__get_diagnostics", 10 | "mcp__typescript__get_module_symbols", 11 | "mcp__typescript__get_type_in_module", 12 | "mcp__typescript__move_file", 13 | "mcp__typescript__rename_symbol", 14 | "mcp__typescript__delete_symbol", 15 | "mcp__typescript__get_type_at_symbol" 16 | ], 17 | "deny": [] 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /__deprecated/package-build.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ts-similarity", 3 | "version": "1.0.0", 4 | "description": "TypeScript implementation (deprecated)", 5 | "type": "module", 6 | "main": "index.js", 7 | "bin": { 8 | "ts-similarity": "./dist/cli/cli.mjs" 9 | }, 10 | "scripts": { 11 | "test": "vitest run", 12 | "test:watch": "vitest", 13 | "test:cov": "vitest run --coverage", 14 | "build": "tsdown src/index.ts --outfile=dist/index.mjs --format=esm && tsdown src/cli/cli.ts --outfile=dist/cli/cli.mjs --format=esm --platform=node", 15 | "prepublishOnly": "pnpm run build" 16 | } 17 | } -------------------------------------------------------------------------------- /crates/similarity-generic/language_configs/ruby.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "language": "ruby", 3 | "function_nodes": ["method", "singleton_method"], 4 | "type_nodes": ["class", "module"], 5 | "field_mappings": { 6 | "name_field": "name", 7 | "params_field": "parameters", 8 | "body_field": "body", 9 | "decorator_field": null, 10 | "class_field": null 11 | }, 12 | "value_nodes": ["identifier", "string", "integer", "float", "true", "false", "nil"], 13 | "test_patterns": { 14 | "attribute_patterns": [], 15 | "name_prefixes": ["test_"], 16 | "name_suffixes": ["_test", "_spec"] 17 | } 18 | } -------------------------------------------------------------------------------- /examples/specs/sample_project/src/models/order.ts: -------------------------------------------------------------------------------- 1 | export interface Order { 2 | id: string; 3 | userId: string; 4 | items: OrderItem[]; 5 | status: OrderStatus; 6 | totalAmount: number; 7 | createdAt: Date; 8 | updatedAt: Date; 9 | shippedAt?: Date; 10 | deliveredAt?: Date; 11 | } 12 | 13 | export interface OrderItem { 14 | productId: string; 15 | quantity: number; 16 | price?: number; 17 | } 18 | 19 | export enum OrderStatus { 20 | PENDING = "pending", 21 | PROCESSING = "processing", 22 | SHIPPED = "shipped", 23 | DELIVERED = "delivered", 24 | CANCELLED = "cancelled", 25 | } 26 | -------------------------------------------------------------------------------- /biome.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://biomejs.dev/schemas/1.9.4/schema.json", 3 | "files": { 4 | "ignore": ["node_modules", "dist", "coverage", "*.min.js", "target", "test/__fixtures__", "pnpm-lock.yaml"] 5 | }, 6 | "formatter": { 7 | "enabled": true, 8 | "formatWithErrors": false, 9 | "indentStyle": "space", 10 | "indentWidth": 2, 11 | "lineEnding": "lf", 12 | "lineWidth": 120 13 | }, 14 | "linter": { 15 | "enabled": false 16 | }, 17 | "javascript": { 18 | 
"formatter": { 19 | "quoteStyle": "double", 20 | "semicolons": "always" 21 | } 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /examples/specs/sample_project/src/models/user.ts: -------------------------------------------------------------------------------- 1 | export interface User { 2 | id: string; 3 | email: string; 4 | name: string; 5 | role: UserRole; 6 | createdAt: Date; 7 | updatedAt?: Date; 8 | lastLoginAt?: Date; 9 | } 10 | 11 | export enum UserRole { 12 | ADMIN = "admin", 13 | USER = "user", 14 | GUEST = "guest", 15 | } 16 | 17 | export interface UserProfile extends User { 18 | bio?: string; 19 | avatar?: string; 20 | preferences: UserPreferences; 21 | } 22 | 23 | export interface UserPreferences { 24 | theme: "light" | "dark"; 25 | language: string; 26 | notifications: boolean; 27 | } 28 | -------------------------------------------------------------------------------- /test/__fixtures__/similar/class_rename_2.ts: -------------------------------------------------------------------------------- 1 | // Similar: Class with renamed identifiers 2 | class PersonManager { 3 | private people: Person[] = []; 4 | 5 | addPerson(person: Person): void { 6 | this.people.push(person); 7 | } 8 | 9 | getPerson(identifier: number): Person | undefined { 10 | return this.people.find(p => p.id === identifier); 11 | } 12 | 13 | removePerson(identifier: number): boolean { 14 | const idx = this.people.findIndex(p => p.id === identifier); 15 | if (idx !== -1) { 16 | this.people.splice(idx, 1); 17 | return true; 18 | } 19 | return false; 20 | } 21 | } -------------------------------------------------------------------------------- /crates/similarity-generic/language_configs/c.json: -------------------------------------------------------------------------------- 1 | { 2 | "language": "c", 3 | "function_nodes": ["function_definition"], 4 | "type_nodes": ["struct_specifier", "enum_specifier", "type_definition"], 5 | 
"field_mappings": { 6 | "name_field": "declarator", 7 | "params_field": "declarator", 8 | "body_field": "body", 9 | "decorator_field": null, 10 | "class_field": null 11 | }, 12 | "value_nodes": ["identifier", "string_literal", "number_literal", "true", "false", "null"], 13 | "test_patterns": { 14 | "attribute_patterns": [], 15 | "name_prefixes": ["test_"], 16 | "name_suffixes": ["_test"] 17 | } 18 | } -------------------------------------------------------------------------------- /test/__fixtures__/performance/small/small_1.ts: -------------------------------------------------------------------------------- 1 | // Small test file 1 2 | 3 | import { Component } from '@angular/core'; 4 | import { Observable } from 'rxjs'; 5 | 6 | export function calculate1(a: number, b: number): number { 7 | const result = a + b; 8 | console.log('Result:', result); 9 | return result; 10 | } 11 | 12 | interface Data1 { 13 | id: number; 14 | name: string; 15 | value: number; 16 | } 17 | 18 | class Service1 { 19 | private data: Data1[] = []; 20 | 21 | add(item: Data1): void { 22 | this.data.push(item); 23 | } 24 | 25 | get(id: number): Data1 | undefined { 26 | return this.data.find(d => d.id === id); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /test/__fixtures__/performance/small/small_2.ts: -------------------------------------------------------------------------------- 1 | // Small test file 2 2 | 3 | import { Component } from '@angular/core'; 4 | import { Observable } from 'rxjs'; 5 | 6 | export function calculate2(a: number, b: number): number { 7 | const result = a + b; 8 | console.log('Result:', result); 9 | return result; 10 | } 11 | 12 | interface Data2 { 13 | id: number; 14 | name: string; 15 | value: number; 16 | } 17 | 18 | class Service2 { 19 | private data: Data2[] = []; 20 | 21 | add(item: Data2): void { 22 | this.data.push(item); 23 | } 24 | 25 | get(id: number): Data2 | undefined { 26 | return this.data.find(d => 
d.id === id); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /crates/core/tests/fixtures/sample2.ts: -------------------------------------------------------------------------------- 1 | // Test sample 2: Similar functions that should have high similarity 2 | 3 | export function calculateSum(numbers: number[]): number { 4 | if (numbers.length === 0) return 0; 5 | 6 | let total = 0; 7 | for (const num of numbers) { 8 | total += num; 9 | } 10 | 11 | return total; 12 | } 13 | 14 | // Very similar to calculateAverage from sample1.ts 15 | export function computeMean(values: number[]): number { 16 | if (values.length === 0) return 0; 17 | 18 | let sum = 0; 19 | for (const val of values) { 20 | sum += val; 21 | } 22 | 23 | return sum / values.length; 24 | } -------------------------------------------------------------------------------- /crates/similarity-generic/language_configs/cpp.json: -------------------------------------------------------------------------------- 1 | { 2 | "language": "cpp", 3 | "function_nodes": ["function_definition", "lambda_expression"], 4 | "type_nodes": ["class_specifier", "struct_specifier", "enum_specifier"], 5 | "field_mappings": { 6 | "name_field": "declarator", 7 | "params_field": "declarator", 8 | "body_field": "body", 9 | "decorator_field": null, 10 | "class_field": null 11 | }, 12 | "value_nodes": ["identifier", "string_literal", "number_literal", "true", "false", "nullptr"], 13 | "test_patterns": { 14 | "attribute_patterns": [], 15 | "name_prefixes": ["test_", "Test"], 16 | "name_suffixes": ["_test", "Test"] 17 | } 18 | } -------------------------------------------------------------------------------- /test/__fixtures__/duplication/structural/loop_pattern_1.ts: -------------------------------------------------------------------------------- 1 | // Simple loop patterns that are structurally identical 2 | // These demonstrate pure structural duplication 3 | 4 | export function 
sumNumbers(numbers: number[]): number { 5 | let total = 0; 6 | for (const num of numbers) { 7 | total += num; 8 | } 9 | return total; 10 | } 11 | 12 | export function sumPrices(prices: number[]): number { 13 | let total = 0; 14 | for (const price of prices) { 15 | total += price; 16 | } 17 | return total; 18 | } 19 | 20 | export function sumScores(scores: number[]): number { 21 | let total = 0; 22 | for (const score of scores) { 23 | total += score; 24 | } 25 | return total; 26 | } -------------------------------------------------------------------------------- /crates/similarity-css/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "similarity-css" 3 | version = "0.4.2" 4 | edition = "2021" 5 | authors = ["similarity contributors"] 6 | description = "CSS/SCSS similarity detection tool (experimental)" 7 | license = "MIT" 8 | repository = "https://github.com/mizchi/similarity" 9 | 10 | [dependencies] 11 | similarity-core = { version = "0.4.2", path = "../core" } 12 | clap = { version = "4.0", features = ["derive"] } 13 | tree-sitter = "0.24" 14 | tree-sitter-css = "0.23" 15 | tree-sitter-scss = "1.0" 16 | rayon = "1.7" 17 | serde = { version = "1.0", features = ["derive"] } 18 | serde_json = "1.0" 19 | indexmap = "2.0" 20 | ignore = "0.4" 21 | 22 | [dev-dependencies] 23 | tempfile = "3.5" 24 | insta = "1.29" -------------------------------------------------------------------------------- /crates/similarity-generic/language_configs/java.json: -------------------------------------------------------------------------------- 1 | { 2 | "language": "java", 3 | "function_nodes": ["method_declaration", "constructor_declaration"], 4 | "type_nodes": ["class_declaration", "interface_declaration", "enum_declaration"], 5 | "field_mappings": { 6 | "name_field": "name", 7 | "params_field": "parameters", 8 | "body_field": "body", 9 | "decorator_field": "annotation", 10 | "class_field": null 11 | }, 12 | 
"value_nodes": ["identifier", "string_literal", "integer_literal", "floating_point_literal", "true", "false", "null_literal"], 13 | "test_patterns": { 14 | "attribute_patterns": ["@Test", "@ParameterizedTest"], 15 | "name_prefixes": ["test"], 16 | "name_suffixes": ["Test"] 17 | } 18 | } -------------------------------------------------------------------------------- /crates/similarity-md/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod levenshtein; 2 | pub mod markdown_parser; 3 | pub mod morphological_similarity; 4 | pub mod section_extractor; 5 | pub mod similarity_calculator; 6 | 7 | pub use levenshtein::{ 8 | levenshtein_distance, levenshtein_similarity, word_levenshtein_distance, 9 | word_levenshtein_similarity, 10 | }; 11 | pub use markdown_parser::{MarkdownParser, MarkdownSection}; 12 | pub use morphological_similarity::{ 13 | MorphemeToken, MorphologicalSimilarityCalculator, PosSimilarity, 14 | }; 15 | pub use section_extractor::{ExtractedSection, SectionExtractor, SimilarTitlePair}; 16 | pub use similarity_calculator::{ 17 | SimilarSectionPair, SimilarityCalculator, SimilarityOptions, SimilarityResult, 18 | }; 19 | -------------------------------------------------------------------------------- /test/__fixtures__/duplication/structural/visitnode_pattern_1.ts: -------------------------------------------------------------------------------- 1 | // visitNode pattern from function_body_comparer.ts 2 | function visitNode(node: any, callback: (n: any) => void): void { 3 | if (!node || typeof node !== 'object') { 4 | return; 5 | } 6 | 7 | callback(node); 8 | 9 | // Handle arrays 10 | if (Array.isArray(node)) { 11 | for (const item of node) { 12 | visitNode(item, callback); 13 | } 14 | return; 15 | } 16 | 17 | // Skip certain properties 18 | const skipKeys = new Set(['loc', 'range', 'start', 'end', 'parent']); 19 | 20 | // Visit all properties 21 | for (const key in node) { 22 | if (node.hasOwnProperty(key) 
&& !skipKeys.has(key)) { 23 | visitNode(node[key], callback); 24 | } 25 | } 26 | } -------------------------------------------------------------------------------- /test/__fixtures__/refactoring/class_to_function/calculator_class.ts: -------------------------------------------------------------------------------- 1 | // Stateful Calculator class 2 | class Calculator { 3 | private value: number = 0; 4 | 5 | add(n: number): number { 6 | this.value += n; 7 | return this.value; 8 | } 9 | 10 | subtract(n: number): number { 11 | this.value -= n; 12 | return this.value; 13 | } 14 | 15 | multiply(n: number): number { 16 | this.value *= n; 17 | return this.value; 18 | } 19 | 20 | divide(n: number): number { 21 | if (n === 0) { 22 | throw new Error('Division by zero'); 23 | } 24 | this.value /= n; 25 | return this.value; 26 | } 27 | 28 | reset(): void { 29 | this.value = 0; 30 | } 31 | 32 | getValue(): number { 33 | return this.value; 34 | } 35 | } -------------------------------------------------------------------------------- /crates/similarity-generic/language_configs/csharp.json: -------------------------------------------------------------------------------- 1 | { 2 | "language": "csharp", 3 | "function_nodes": ["method_declaration", "constructor_declaration", "lambda_expression"], 4 | "type_nodes": ["class_declaration", "interface_declaration", "struct_declaration", "enum_declaration"], 5 | "field_mappings": { 6 | "name_field": "name", 7 | "params_field": "parameters", 8 | "body_field": "body", 9 | "decorator_field": "attribute", 10 | "class_field": null 11 | }, 12 | "value_nodes": ["identifier", "string_literal", "integer_literal", "real_literal", "true", "false", "null_literal"], 13 | "test_patterns": { 14 | "attribute_patterns": ["[Test]", "[TestMethod]", "[Fact]"], 15 | "name_prefixes": ["Test"], 16 | "name_suffixes": ["Test", "Tests"] 17 | } 18 | } -------------------------------------------------------------------------------- 
/crates/similarity-generic/examples/sample.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import "fmt" 4 | 5 | // Similar functions that should be detected 6 | func calculateSum(numbers []int) int { 7 | total := 0 8 | for _, num := range numbers { 9 | total += num 10 | } 11 | return total 12 | } 13 | 14 | func computeTotal(values []int) int { 15 | sum := 0 16 | for _, val := range values { 17 | sum += val 18 | } 19 | return sum 20 | } 21 | 22 | // Different function 23 | func printMessage(msg string) { 24 | fmt.Println("Message:", msg) 25 | } 26 | 27 | // Test function (can be excluded with --skip-test) 28 | func TestCalculateSum(t *testing.T) { 29 | result := calculateSum([]int{1, 2, 3}) 30 | if result != 6 { 31 | t.Error("Expected 6") 32 | } 33 | } -------------------------------------------------------------------------------- /crates/similarity-generic/language_configs/go.json: -------------------------------------------------------------------------------- 1 | { 2 | "language": "go", 3 | "function_nodes": [ 4 | "function_declaration", 5 | "method_declaration" 6 | ], 7 | "type_nodes": [ 8 | "type_declaration", 9 | "struct_type" 10 | ], 11 | "field_mappings": { 12 | "name_field": "name", 13 | "params_field": "parameters", 14 | "body_field": "body" 15 | }, 16 | "value_nodes": [ 17 | "identifier", 18 | "interpreted_string_literal", 19 | "raw_string_literal", 20 | "int_literal", 21 | "float_literal", 22 | "true", 23 | "false", 24 | "nil" 25 | ], 26 | "test_patterns": { 27 | "attribute_patterns": [], 28 | "name_prefixes": [ 29 | "Test", 30 | "Benchmark" 31 | ], 32 | "name_suffixes": [ 33 | "_test" 34 | ] 35 | } 36 | } -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarks 2 | 3 | This directory contains performance benchmarking data and complex test cases. 
4 | 5 | ## Structure 6 | 7 | - `data/` - Various TypeScript files for benchmarking and stress testing 8 | - Performance tests 9 | - Complex duplication scenarios 10 | - Real-world code patterns 11 | 12 | ## Running Benchmarks 13 | 14 | ```bash 15 | # Basic performance test 16 | time similarity-ts benchmarks/data/ --threshold 0.8 17 | 18 | # Memory usage test 19 | /usr/bin/time -v similarity-ts benchmarks/data/ --threshold 0.8 20 | 21 | # Stress test with many files 22 | similarity-ts benchmarks/data/ --threshold 0.7 --min-tokens 10 23 | ``` 24 | 25 | ## Notes 26 | 27 | These files are not meant for understanding the tool's basic functionality. 28 | For specification examples, see `examples/specs/`. -------------------------------------------------------------------------------- /docs/algorithm/README.md: -------------------------------------------------------------------------------- 1 | # Algorithm Documentation 2 | 3 | This directory contains documentation about the algorithms used in the similarity detection tools. 4 | 5 | ## Contents 6 | 7 | - [TSED (Tree Similarity of Edit Distance)](tsed-similarity.md) - Complete academic paper on the TSED algorithm 8 | - [TSED Summary](tsed-similarity-summary.md) - Summary of the TSED paper 9 | - [Tree-sitter Integration Analysis](tree-sitter-integration-analysis.md) - Analysis of tree-sitter integration for AST parsing 10 | 11 | ## Overview 12 | 13 | The similarity detection tools use AST-based comparison algorithms to detect code duplication across multiple programming languages. The core algorithm is TSED (Tree Similarity of Edit Distance), which provides accurate structural comparison of code while considering both structure and size differences. 
/**
 * Computes the Levenshtein (edit) distance between two strings.
 *
 * The distance is the minimum number of single-unit insertions, deletions
 * and substitutions needed to turn `str1` into `str2`. Comparison is done
 * per UTF-16 code unit (via `charAt`), so a character encoded as a
 * surrogate pair counts as two units.
 *
 * Only two rows of the dynamic-programming table are kept alive, so memory
 * use is proportional to `str1.length` rather than to the product of both
 * lengths.
 *
 * @param str1 - First string (indexes the columns of the DP table).
 * @param str2 - Second string (indexes the rows of the DP table).
 * @returns The edit distance; `0` when the strings are equal.
 */
export function levenshtein(str1: string, str2: string): number {
  const len1 = str1.length;
  const len2 = str2.length;

  // Turning an empty string into the other one costs one insertion per unit.
  if (len1 === 0) return len2;
  if (len2 === 0) return len1;

  // previousRow[j] = distance between the first j units of str1 and the
  // already-processed prefix of str2. Row 0 is the empty prefix of str2.
  let previousRow: number[] = Array.from({ length: len1 + 1 }, (_, j) => j);
  let currentRow: number[] = new Array<number>(len1 + 1).fill(0);

  for (let i = 1; i <= len2; i++) {
    // Column 0: deleting all i units of the str2 prefix.
    currentRow[0] = i;
    const target = str2.charAt(i - 1);

    for (let j = 1; j <= len1; j++) {
      currentRow[j] =
        target === str1.charAt(j - 1)
          ? previousRow[j - 1] // units match: carry the diagonal value
          : Math.min(previousRow[j - 1], currentRow[j - 1], previousRow[j]) + 1;
    }

    // The finished row becomes the "previous" row for the next iteration;
    // the old previous row is recycled as scratch space.
    [previousRow, currentRow] = [currentRow, previousRow];
  }

  return previousRow[len1];
}
/**
 * Load every file matched by a glob pattern.
 *
 * The pattern is resolved by `glob` with `basePath` as its working
 * directory. Each match is read synchronously as UTF-8 text.
 *
 * @param pattern - Glob pattern passed to `glob`.
 * @param basePath - Directory the pattern is evaluated against; defaults to ".".
 * @returns One `FileInfo` per match: `path` is `basePath` joined with the
 *   match, `id` is that path made relative to `basePath` again, and
 *   `content` is the file text.
 * @throws Whatever `readFileSync` throws for a match that cannot be read.
 */
export async function loadFilesFromPattern(pattern: string, basePath: string = "."): Promise<FileInfo[]> {
  const files = await glob(pattern, { cwd: basePath });
  const results: FileInfo[] = [];

  for (const file of files) {
    const fullPath = join(basePath, file);
    const content = readFileSync(fullPath, "utf-8");
    // Re-derive the id from the joined path so it is normalised the same
    // way as `path`.
    const id = relative(basePath, fullPath);

    results.push({ id, path: fullPath, content });
  }

  return results;
}
understand similarity-ts functionality. 4 | 5 | ## Directory Structure 6 | 7 | - `specs/` - Specification examples demonstrating core features 8 | - `duplicate-functions.ts` - Function similarity detection 9 | - `duplicate-types.ts` - Type similarity detection 10 | - `sample_project/` - Multi-file project example 11 | - Other test files for specific features 12 | 13 | ## Quick Test 14 | 15 | ```bash 16 | # Test function detection 17 | similarity-ts examples/specs/duplicate-functions.ts --threshold 0.8 --min-tokens 20 18 | 19 | # Test type detection 20 | similarity-ts examples/specs/duplicate-types.ts --experimental-types --threshold 0.8 21 | 22 | # Test multi-file project 23 | similarity-ts examples/specs/sample_project/ --threshold 0.85 24 | ``` 25 | 26 | See `specs/README.md` for expected results. -------------------------------------------------------------------------------- /examples/mixed_language_project/helpers.py: -------------------------------------------------------------------------------- 1 | # Python helper functions 2 | 3 | def process_data(data): 4 | """Process data and return result.""" 5 | result = [] 6 | for item in data: 7 | if item > 0: 8 | result.append(item * 2) 9 | return result 10 | 11 | def calculate_sum(numbers): 12 | """Calculate sum of numbers.""" 13 | total = 0 14 | for num in numbers: 15 | total += num 16 | return total 17 | 18 | class DataHelper: 19 | def __init__(self): 20 | self.data = [] 21 | 22 | def process(self, data): 23 | result = [] 24 | for item in data: 25 | if item > 0: 26 | result.append(item * 2) 27 | return result 28 | 29 | def sum(self, numbers): 30 | total = 0 31 | for n in numbers: 32 | total += n 33 | return total -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = [ 3 | "crates/core", 4 | "crates/similarity-ts", 5 | "crates/similarity-py", 6 | 
"crates/similarity-php", 7 | "crates/similarity-rs", 8 | "crates/similarity-generic", 9 | "crates/similarity-elixir", 10 | "crates/similarity-md", 11 | "crates/similarity-css", 12 | ] 13 | resolver = "2" 14 | 15 | [workspace.dependencies] 16 | oxc_allocator = "0.73.0" 17 | oxc_ast = "0.73.0" 18 | oxc_parser = "0.73.0" 19 | oxc_span = "0.73.0" 20 | tree-sitter = "0.24" 21 | tree-sitter-c = "0.23" 22 | tree-sitter-c-sharp = "0.23" 23 | tree-sitter-cpp = "0.23" 24 | tree-sitter-elixir = "0.3" 25 | tree-sitter-go = "0.23" 26 | tree-sitter-java = "0.23" 27 | tree-sitter-javascript = "0.23" 28 | tree-sitter-php = "0.23" 29 | tree-sitter-python = "0.23" 30 | tree-sitter-ruby = "0.23" 31 | tree-sitter-rust = "0.23" 32 | tree-sitter-typescript = "0.23" 33 | -------------------------------------------------------------------------------- /examples/mixed_language_project/utils.js: -------------------------------------------------------------------------------- 1 | // JavaScript utility functions 2 | 3 | function processData(data) { 4 | const result = []; 5 | for (const item of data) { 6 | if (item > 0) { 7 | result.push(item * 2); 8 | } 9 | } 10 | return result; 11 | } 12 | 13 | function transformData(data) { 14 | const output = []; 15 | for (const element of data) { 16 | if (element > 0) { 17 | output.push(element * 2); 18 | } 19 | } 20 | return output; 21 | } 22 | 23 | export class DataProcessor { 24 | constructor() { 25 | this.cache = {}; 26 | } 27 | 28 | process(data) { 29 | const result = []; 30 | for (const item of data) { 31 | if (item > 0) { 32 | result.push(item * 2); 33 | } 34 | } 35 | return result; 36 | } 37 | } -------------------------------------------------------------------------------- /crates/similarity-css/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod css_comparator; 2 | pub mod css_parser; 3 | pub mod css_rule_converter; 4 | pub mod duplicate_analyzer; 5 | pub mod parser; 6 | pub mod 
/**
 * Stateless validation and sanitisation helpers for user-supplied strings.
 *
 * All members are static; the class is never instantiated by its own code.
 */
export class Validator {
  // Patterns are compiled once. None carries the `g`/`y` flag, so `test`
  // is stateless and sharing the instances is safe.
  private static readonly EMAIL_PATTERN = /^[^\s@]+@[^\s@]+\.[^\s@]+$/;
  // The hyphen is placed last so it is an unambiguous literal rather than
  // part of a `\s-(` range, which only parses under legacy regex rules.
  private static readonly PHONE_PATTERN = /^\+?[\d\s()-]+$/;
  private static readonly NON_DIGIT_PATTERN = /\D/g;
  private static readonly UPPERCASE_PATTERN = /[A-Z]/;
  private static readonly LOWERCASE_PATTERN = /[a-z]/;
  private static readonly DIGIT_PATTERN = /[0-9]/;
  private static readonly ANGLE_BRACKET_PATTERN = /[<>]/g;

  /**
   * Checks for the shape `local@domain.tld`: no whitespace, exactly one `@`
   * in the matched text, and at least one dot after it.
   */
  static isEmail(email: string): boolean {
    return Validator.EMAIL_PATTERN.test(email);
  }

  /**
   * A password is strong when it is at least 8 characters long and contains
   * an ASCII uppercase letter, an ASCII lowercase letter and an ASCII digit.
   */
  static isStrongPassword(password: string): boolean {
    return (
      password.length >= 8 &&
      Validator.UPPERCASE_PATTERN.test(password) &&
      Validator.LOWERCASE_PATTERN.test(password) &&
      Validator.DIGIT_PATTERN.test(password)
    );
  }

  /**
   * Accepts an optional leading `+` followed only by digits, whitespace,
   * parentheses and hyphens, and requires at least 10 digits overall.
   */
  static isValidPhoneNumber(phone: string): boolean {
    if (!Validator.PHONE_PATTERN.test(phone)) {
      return false;
    }
    const digitCount = phone.replace(Validator.NON_DIGIT_PATTERN, "").length;
    return digitCount >= 10;
  }

  /**
   * Returns true when the `URL` constructor accepts the string, false when
   * it throws.
   */
  static isValidUrl(url: string): boolean {
    try {
      new URL(url);
      return true;
    } catch {
      return false;
    }
  }

  /**
   * Trims surrounding whitespace and removes every `<` and `>` character.
   */
  static sanitizeInput(input: string): string {
    return input.trim().replace(Validator.ANGLE_BRACKET_PATTERN, "");
  }
}
-------------------------------------------------------------------------------- 1 | # similarity-ts-core 2 | 3 | Core library for TypeScript/JavaScript code similarity detection using AST-based comparison. 4 | 5 | ## Features 6 | 7 | - Extract functions from TypeScript/JavaScript code 8 | - Compare function similarity using TSED (Tree Similarity of Edit Distance) 9 | - Fast similarity detection with bloom filter pre-filtering 10 | - Support for various function types (regular functions, arrow functions, methods) 11 | - Configurable similarity thresholds 12 | 13 | ## Usage 14 | 15 | ```rust 16 | use similarity_core::{extract_functions, compare_functions, TSEDOptions}; 17 | 18 | // Extract functions from code 19 | let functions = extract_functions("example.ts", source_code)?; 20 | 21 | // Compare two functions 22 | let options = TSEDOptions::default(); 23 | let similarity = compare_functions(&func1, &func2, source1, source2, &options)?; 24 | ``` 25 | 26 | ## License 27 | 28 | MIT -------------------------------------------------------------------------------- /test/__fixtures__/dissimilar/sync_vs_async_2.ts: -------------------------------------------------------------------------------- 1 | // Dissimilar: Asynchronous code with different purpose 2 | async function fetchAndProcessData(url: string): Promise { 3 | try { 4 | const response = await fetch(url); 5 | const rawData = await response.json(); 6 | 7 | const processed = await Promise.all( 8 | rawData.items.map(async (item: any) => { 9 | const details = await fetchDetails(item.id); 10 | return { ...item, details }; 11 | }) 12 | ); 13 | 14 | return { 15 | timestamp: Date.now(), 16 | data: processed, 17 | status: 'success' 18 | }; 19 | } catch (error) { 20 | console.error('Processing failed:', error); 21 | throw new Error('Failed to process data'); 22 | } 23 | } 24 | 25 | async function fetchDetails(id: string): Promise { 26 | const response = await fetch(`/api/details/${id}`); 27 | return response.json(); 28 | } 
-------------------------------------------------------------------------------- /crates/similarity-generic/examples/configs/go.json: -------------------------------------------------------------------------------- 1 | { 2 | "language": "go", 3 | "function_nodes": [ 4 | "function_declaration", 5 | "method_declaration" 6 | ], 7 | "type_nodes": [ 8 | "type_declaration", 9 | "struct_type", 10 | "interface_type" 11 | ], 12 | "field_mappings": { 13 | "name_field": "name", 14 | "params_field": "parameters", 15 | "body_field": "body", 16 | "decorator_field": null, 17 | "class_field": null 18 | }, 19 | "value_nodes": [ 20 | "identifier", 21 | "interpreted_string_literal", 22 | "raw_string_literal", 23 | "int_literal", 24 | "float_literal", 25 | "true", 26 | "false", 27 | "nil" 28 | ], 29 | "test_patterns": { 30 | "attribute_patterns": [], 31 | "name_prefixes": [ 32 | "Test", 33 | "Benchmark" 34 | ], 35 | "name_suffixes": [ 36 | "_test" 37 | ] 38 | }, 39 | "custom_mappings": null 40 | } -------------------------------------------------------------------------------- /test/__fixtures__/refactoring/class_to_function/user_service_functions.ts: -------------------------------------------------------------------------------- 1 | // Refactored to functional style 2 | function addUser(users: Map, user: User): void { 3 | if (!user.id) { 4 | throw new Error('User must have an ID'); 5 | } 6 | users.set(user.id, user); 7 | console.log(`Added user: ${user.name}`); 8 | } 9 | 10 | function removeUser(users: Map, userId: string): boolean { 11 | const user = users.get(userId); 12 | if (!user) { 13 | return false; 14 | } 15 | users.delete(userId); 16 | console.log(`Removed user: ${user.name}`); 17 | return true; 18 | } 19 | 20 | function getUser(users: Map, userId: string): User | undefined { 21 | return users.get(userId); 22 | } 23 | 24 | function getAllUsers(users: Map): User[] { 25 | return Array.from(users.values()); 26 | } 27 | 28 | interface User { 29 | id: string; 30 | name: string; 
31 | email: string; 32 | } -------------------------------------------------------------------------------- /test/__fixtures__/refactoring/class_to_function/calculator_functions.ts: -------------------------------------------------------------------------------- 1 | // Functional Calculator with state parameter 2 | interface CalculatorState { 3 | value: number; 4 | } 5 | 6 | const add = (state: CalculatorState, n: number): number => { 7 | state.value += n; 8 | return state.value; 9 | }; 10 | 11 | const subtract = (state: CalculatorState, n: number): number => { 12 | state.value -= n; 13 | return state.value; 14 | }; 15 | 16 | const multiply = (state: CalculatorState, n: number): number => { 17 | state.value *= n; 18 | return state.value; 19 | }; 20 | 21 | const divide = (state: CalculatorState, n: number): number => { 22 | if (n === 0) { 23 | throw new Error('Division by zero'); 24 | } 25 | state.value /= n; 26 | return state.value; 27 | }; 28 | 29 | const reset = (state: CalculatorState): void => { 30 | state.value = 0; 31 | }; 32 | 33 | const getValue = (state: CalculatorState): number => { 34 | return state.value; 35 | }; -------------------------------------------------------------------------------- /examples/specs/README.md: -------------------------------------------------------------------------------- 1 | # Specification Examples 2 | 3 | These examples demonstrate the core functionality of similarity-ts. 
/**
 * In-memory user registry keyed by user id (original class implementation).
 *
 * Users live in a private `Map`; adding a user whose id is already present
 * overwrites the earlier entry.
 */
class UserService {
  private users: Map<string, User> = new Map();

  /**
   * Stores `user` under its id and logs the addition.
   *
   * @throws Error when `user.id` is falsy (e.g. an empty string).
   */
  addUser(user: User): void {
    if (!user.id) {
      throw new Error('User must have an ID');
    }
    this.users.set(user.id, user);
    console.log(`Added user: ${user.name}`);
  }

  /**
   * Removes the user with the given id and logs the removal.
   *
   * @returns `true` when a user was removed, `false` when the id is unknown.
   */
  removeUser(userId: string): boolean {
    const user = this.users.get(userId);
    if (!user) {
      return false;
    }
    this.users.delete(userId);
    console.log(`Removed user: ${user.name}`);
    return true;
  }

  /** Looks up a user by id; `undefined` when the id is unknown. */
  getUser(userId: string): User | undefined {
    return this.users.get(userId);
  }

  /** Returns a new array with all stored users, in the map's iteration order. */
  getAllUsers(): User[] {
    return Array.from(this.users.values());
  }
}
/crates/similarity-md/examples/japanese_similarity_test.md: -------------------------------------------------------------------------------- 1 | # 日本語類似性検出のテスト 2 | 3 | このドキュメントは、Vibrato を使った形態素解析による日本語テキストの類似性検出をテストするためのサンプルです。 4 | 5 | ## 機械学習について 6 | 7 | 機械学習は、コンピュータがデータから自動的にパターンを学習する技術です。この技術は、画像認識、自然言語処理、推薦システムなど、様々な分野で活用されています。機械学習のアルゴリズムには、教師あり学習、教師なし学習、強化学習などがあります。 8 | 9 | ## マシンラーニングの概要 10 | 11 | マシンラーニングとは、計算機がデータから自動的にパターンを習得する手法です。この手法は、画像解析、言語処理、レコメンドシステムなど、多様な領域で利用されています。マシンラーニングの手法には、監督学習、非監督学習、強化学習などが存在します。 12 | 13 | ## 深層学習の基礎 14 | 15 | 深層学習は機械学習の一分野で、多層のニューラルネットワークを使用してデータの複雑なパターンを学習します。深層学習は画像認識、音声認識、自然言語処理などの分野で革新的な成果を上げています。 16 | 17 | ## ディープラーニングの原理 18 | 19 | ディープラーニングは機械学習の一種で、多層のニューラルネットワークを利用してデータの複雑なパターンを習得します。ディープラーニングは画像解析、音声解析、言語処理などの領域で画期的な結果を達成しています。 20 | 21 | ## プログラミング言語の比較 22 | 23 | プログラミング言語には多くの種類があります。Python は機械学習やデータサイエンスの分野で人気があり、JavaScript はウェブ開発で広く使われています。Java は企業システムの開発でよく利用され、C++ は高性能なアプリケーションの開発に適しています。 24 | 25 | ## 全く異なる内容 26 | 27 | 今日の天気は晴れです。公園で散歩をしました。桜の花がとても美しく咲いていました。子供たちが元気に遊んでいる姿を見て、心が温かくなりました。 28 | -------------------------------------------------------------------------------- /test/__fixtures__/refactoring/class_to_function/metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Class to function refactoring patterns", 3 | "testCases": [ 4 | { 5 | "name": "UserService refactoring", 6 | "files": { 7 | "before": "user_service_class.ts", 8 | "after": "user_service_functions.ts" 9 | }, 10 | "expectedSimilarity": { 11 | "addUser": 0.85, 12 | "removeUser": 0.85, 13 | "getUser": 0.90, 14 | "getAllUsers": 0.90 15 | }, 16 | "notes": "Methods converted to functions with state parameter" 17 | }, 18 | { 19 | "name": "Calculator refactoring", 20 | "files": { 21 | "before": "calculator_class.ts", 22 | "after": "calculator_functions.ts" 23 | }, 24 | "expectedSimilarity": { 25 | "add": 0.80, 26 | "subtract": 0.80 27 | }, 28 | "notes": "Stateful class converted to 
/**
 * Console logger that prefixes every message with an ISO timestamp, a
 * severity tag and a fixed context label.
 *
 * Output shape: `[<timestamp>] [<LEVEL>] [<context>] <message>`, followed by
 * the optional payload as a second console argument.
 */
export class Logger {
  private context: string;

  /**
   * @param context - Label included in every line written by this logger.
   */
  constructor(context: string) {
    this.context = context;
  }

  /**
   * Writes an INFO line via `console.log`.
   *
   * @param data - Optional payload; an empty string is passed when it is
   *   `null` or `undefined`. Falsy values such as `0` or `false` are logged
   *   as they are.
   */
  info(message: string, data?: unknown): void {
    console.log(`[${this.getTimestamp()}] [INFO] [${this.context}] ${message}`, data ?? "");
  }

  /**
   * Writes a WARN line via `console.warn`. Payload handling matches `info`.
   */
  warn(message: string, data?: unknown): void {
    console.warn(`[${this.getTimestamp()}] [WARN] [${this.context}] ${message}`, data ?? "");
  }

  /**
   * Writes an ERROR line via `console.error`.
   *
   * @param error - Optional error object; an empty string is passed when it
   *   is omitted.
   */
  error(message: string, error?: Error): void {
    console.error(`[${this.getTimestamp()}] [ERROR] [${this.context}] ${message}`, error ?? "");
  }

  /**
   * Writes a DEBUG line via `console.debug`, but only when the `DEBUG`
   * environment variable is set to a non-empty value.
   */
  debug(message: string, data?: unknown): void {
    if (process.env.DEBUG) {
      console.debug(`[${this.getTimestamp()}] [DEBUG] [${this.context}] ${message}`, data ?? "");
    }
  }

  /** Current time as an ISO-8601 string. */
  private getTimestamp(): string {
    return new Date().toISOString();
  }
}
(Array.isArray(value)) { 22 | newNode[key] = value.map(item => visitNode(item, replacer)); 23 | } else if (value && typeof value === 'object') { 24 | newNode[key] = visitNode(value, replacer); 25 | } 26 | } 27 | } 28 | 29 | return newNode; 30 | } -------------------------------------------------------------------------------- /KNOWN_ISSUES.md: -------------------------------------------------------------------------------- 1 | # Known Issues 2 | 3 | ## Rust Type Similarity Detection 4 | 5 | ### Enum Similarity Detection 6 | - **Issue**: Enum similarity detection shows lower than expected similarity scores even for structurally identical enums 7 | - **Example**: Two enums with identical variants show only ~43% similarity 8 | - **Cause**: The AST structure for enums includes variant names as values, and the current rename_cost parameter doesn't adequately handle this case 9 | - **Workaround**: Use a lower threshold (0.4-0.5) for enum similarity detection 10 | - **Status**: Under investigation 11 | 12 | ### Struct Similarity Detection 13 | - **Status**: Working as expected 14 | - Structs with similar field types but different field names correctly show high similarity (90%+) 15 | - Generic structs are properly compared 16 | 17 | ## TypeScript Type Similarity Detection 18 | - **Status**: Working as expected 19 | - Interfaces, type aliases, and type literals are correctly detected with appropriate similarity scores -------------------------------------------------------------------------------- /crates/core/src/tree.rs: -------------------------------------------------------------------------------- 1 | use std::rc::Rc; 2 | 3 | #[derive(Debug, Clone)] 4 | pub struct TreeNode { 5 | pub label: String, 6 | pub value: String, 7 | pub children: Vec>, 8 | pub id: usize, 9 | pub subtree_size: Option, 10 | } 11 | 12 | impl TreeNode { 13 | #[must_use] 14 | pub fn new(label: String, value: String, id: usize) -> Self { 15 | TreeNode { label, value, children: Vec::new(), id, 
subtree_size: None } 16 | } 17 | 18 | pub fn add_child(&mut self, child: Rc) { 19 | self.children.push(child); 20 | } 21 | 22 | #[must_use] 23 | pub fn get_subtree_size(&self) -> usize { 24 | // Since we can't mutate through Rc, we'll calculate it each time 25 | // In a real implementation, you might want to use RefCell for interior mutability 26 | let mut size = 1; 27 | for child in &self.children { 28 | size += child.get_subtree_size(); 29 | } 30 | size 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /test/__fixtures__/duplication/semantic/async_operations_1.ts: -------------------------------------------------------------------------------- 1 | // Async operation patterns with Promise handling 2 | // Different approaches to the same async pattern 3 | 4 | export class DataFetcher { 5 | async fetchData(url: string): Promise { 6 | try { 7 | const response = await fetch(url); 8 | if (!response.ok) { 9 | throw new Error(`Failed to fetch: ${response.status}`); 10 | } 11 | return await response.json(); 12 | } catch (error) { 13 | console.error('Fetch error:', error); 14 | throw error; 15 | } 16 | } 17 | 18 | async fetchWithRetry(url: string, maxRetries: number = 3): Promise { 19 | let lastError; 20 | 21 | for (let i = 0; i < maxRetries; i++) { 22 | try { 23 | return await this.fetchData(url); 24 | } catch (error) { 25 | lastError = error; 26 | await new Promise(resolve => setTimeout(resolve, 1000 * (i + 1))); 27 | } 28 | } 29 | 30 | throw lastError; 31 | } 32 | } -------------------------------------------------------------------------------- /__deprecated/src/parser.ts: -------------------------------------------------------------------------------- 1 | // TypeScript parser wrapper 2 | import * as oxc from "oxc-parser"; 3 | 4 | /** 5 | * Parse TypeScript code into AST synchronously 6 | * @deprecated Use parseTypeScriptAsync for better performance 7 | */ 8 | export const parseTypeScript = oxc.parseSync; 9 | 10 | /** 
11 | * Parse TypeScript code into AST asynchronously 12 | */ 13 | export const parseTypeScriptAsync = oxc.parseAsync; 14 | 15 | /** 16 | * Parse multiple TypeScript files in parallel 17 | */ 18 | export async function parseMultipleAsync( 19 | files: Array<{ filename: string; code: string }>, 20 | ): Promise> { 21 | const promises = files.map(async ({ filename, code }) => { 22 | try { 23 | const ast = await parseTypeScriptAsync(filename, code); 24 | return { filename, ast }; 25 | } catch (error) { 26 | return { filename, ast: null as any, error: error as Error }; 27 | } 28 | }); 29 | 30 | return Promise.all(promises); 31 | } 32 | -------------------------------------------------------------------------------- /crates/similarity-php/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "similarity-php" 3 | version = "0.4.2" 4 | edition = "2021" 5 | license = "MIT" 6 | description = "CLI tool for detecting code duplication in PHP projects" 7 | authors = ["SuguruOoki"] 8 | repository = "https://github.com/mizchi/similarity" 9 | homepage = "https://github.com/mizchi/similarity" 10 | documentation = "https://docs.rs/similarity-php" 11 | keywords = ["php", "duplicate", "detection", "cli", "similarity"] 12 | categories = ["command-line-utilities", "development-tools"] 13 | 14 | [[bin]] 15 | name = "similarity-php" 16 | path = "src/main.rs" 17 | 18 | [lib] 19 | name = "similarity_php" 20 | 21 | [dependencies] 22 | similarity-core = { version = "0.4.2", path = "../core" } 23 | clap = { version = "4.0", features = ["derive"] } 24 | anyhow = "1.0" 25 | walkdir = "2.5" 26 | ignore = "0.4" 27 | rayon = "1.10" 28 | tree-sitter = { workspace = true } 29 | tree-sitter-php = { workspace = true } 30 | 31 | [dev-dependencies] 32 | assert_cmd = "2.0" 33 | predicates = "3.0" 34 | tempfile = "3.0" -------------------------------------------------------------------------------- /crates/similarity-py/Cargo.toml: 
-------------------------------------------------------------------------------- 1 | [package] 2 | name = "similarity-py" 3 | version = "0.4.2" 4 | edition = "2021" 5 | license = "MIT" 6 | description = "CLI tool for detecting code duplication in Python projects" 7 | authors = ["mizchi"] 8 | repository = "https://github.com/mizchi/similarity" 9 | homepage = "https://github.com/mizchi/similarity" 10 | documentation = "https://docs.rs/similarity-py" 11 | keywords = ["python", "duplicate", "detection", "cli", "similarity"] 12 | categories = ["command-line-utilities", "development-tools"] 13 | 14 | [[bin]] 15 | name = "similarity-py" 16 | path = "src/main.rs" 17 | 18 | [lib] 19 | name = "similarity_py" 20 | 21 | [dependencies] 22 | similarity-core = { version = "0.4.2", path = "../core" } 23 | clap = { version = "4.0", features = ["derive"] } 24 | anyhow = "1.0" 25 | walkdir = "2.5" 26 | ignore = "0.4" 27 | rayon = "1.10" 28 | tree-sitter = { workspace = true } 29 | tree-sitter-python = { workspace = true } 30 | 31 | [dev-dependencies] 32 | assert_cmd = "2.0" 33 | predicates = "3.0" 34 | tempfile = "3.0" -------------------------------------------------------------------------------- /docs/implementation/README.md: -------------------------------------------------------------------------------- 1 | # Implementation Documentation 2 | 3 | This directory contains documentation about implementation details, performance optimization, and benchmarks. 
4 | 5 | ## Contents 6 | 7 | - [Performance Optimization](performance-optimization.md) - Strategies for optimizing performance 8 | - [Performance Baseline](performance-baseline.md) - Baseline performance measurements 9 | - [Hybrid Approach Results](hybrid-approach-results.md) - Results from hybrid detection approach 10 | - [Benchmark Results](benchmark_results.md) - Comprehensive benchmark results 11 | - [Rust vs TypeScript Comparison](rust-ts-compare.md) - Performance comparison between implementations 12 | 13 | ## Performance Overview 14 | 15 | The Rust implementation provides significant performance improvements over the original TypeScript prototype: 16 | - TypeScript/JavaScript parsing: Uses oxc-parser for ~10x faster parsing 17 | - Parallel processing: Leverages Rayon for concurrent file processing 18 | - Memory efficiency: Optimized AST representations and algorithms -------------------------------------------------------------------------------- /crates/similarity-rs/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "similarity-rs" 3 | version = "0.4.2" 4 | edition = "2021" 5 | license = "MIT" 6 | description = "CLI tool for detecting code duplication in Rust projects" 7 | authors = ["mizchi"] 8 | repository = "https://github.com/mizchi/similarity" 9 | homepage = "https://github.com/mizchi/similarity" 10 | documentation = "https://docs.rs/similarity-rs" 11 | keywords = ["rust", "duplicate", "detection", "cli", "similarity"] 12 | categories = ["command-line-utilities", "development-tools"] 13 | 14 | [[bin]] 15 | name = "similarity-rs" 16 | path = "src/main.rs" 17 | 18 | [lib] 19 | name = "similarity_rs" 20 | 21 | [dependencies] 22 | similarity-core = { version = "0.4.2", path = "../core" } 23 | clap = { version = "4.5", features = ["derive"] } 24 | anyhow = "1.0" 25 | rayon = "1.10" 26 | ignore = "0.4" 27 | walkdir = "2.5" 28 | globset = "0.4" 29 | tree-sitter = { workspace = true } 30 | 
tree-sitter-rust = { workspace = true } 31 | 32 | [dev-dependencies] 33 | assert_cmd = "2.0" 34 | predicates = "3.1" 35 | tempfile = "3.10" 36 | -------------------------------------------------------------------------------- /test/__fixtures__/dissimilar/imperative_vs_functional_2.ts: -------------------------------------------------------------------------------- 1 | // Dissimilar: Functional style 2 | type Product = { 3 | id: string; 4 | name: string; 5 | price: number; 6 | }; 7 | 8 | type Order = { 9 | products: Product[]; 10 | discount: number; 11 | }; 12 | 13 | const createOrder = (products: Product[] = [], discount = 0): Order => ({ 14 | products, 15 | discount: Math.min(Math.max(discount, 0), 100) 16 | }); 17 | 18 | const addProduct = (order: Order, product: Product): Order => ({ 19 | ...order, 20 | products: [...order.products, product] 21 | }); 22 | 23 | const removeProduct = (order: Order, productId: string): Order => ({ 24 | ...order, 25 | products: order.products.filter(p => p.id !== productId) 26 | }); 27 | 28 | const calculateOrderTotal = (order: Order): number => { 29 | const subtotal = order.products.reduce((sum, product) => sum + product.price, 0); 30 | return subtotal * (1 - order.discount / 100); 31 | }; 32 | 33 | const pipe = (...fns: Array<(arg: T) => T>) => (value: T): T => 34 | fns.reduce((acc, fn) => fn(acc), value); -------------------------------------------------------------------------------- /crates/similarity-elixir/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "similarity-elixir" 3 | version = "0.4.2" 4 | edition = "2021" 5 | license = "MIT" 6 | description = "CLI tool for detecting code duplication in Elixir projects" 7 | authors = ["mizchi"] 8 | repository = "https://github.com/mizchi/similarity" 9 | homepage = "https://github.com/mizchi/similarity" 10 | documentation = "https://docs.rs/similarity-elixir" 11 | keywords = ["elixir", "duplicate", "detection", 
"cli", "similarity"] 12 | categories = ["command-line-utilities", "development-tools"] 13 | 14 | [[bin]] 15 | name = "similarity-elixir" 16 | path = "src/main.rs" 17 | 18 | [lib] 19 | name = "similarity_elixir" 20 | 21 | [dependencies] 22 | similarity-core = { version = "0.4.2", path = "../core" } 23 | clap = { version = "4.0", features = ["derive"] } 24 | anyhow = "1.0" 25 | walkdir = "2.5" 26 | ignore = "0.4" 27 | rayon = "1.10" 28 | tree-sitter = { workspace = true } 29 | tree-sitter-elixir = { workspace = true } 30 | 31 | [dev-dependencies] 32 | assert_cmd = "2.0" 33 | predicates = "3.0" 34 | tempfile = "3.0" 35 | -------------------------------------------------------------------------------- /examples/specs/duplicate-types.ts: -------------------------------------------------------------------------------- 1 | // Example: Type duplication detection (--experimental-types) 2 | 3 | // Duplicate 1: Identical interfaces with different names 4 | interface User { 5 | id: string; 6 | name: string; 7 | email: string; 8 | createdAt: Date; 9 | updatedAt: Date; 10 | } 11 | 12 | interface Customer { 13 | id: string; 14 | name: string; 15 | email: string; 16 | createdAt: Date; 17 | updatedAt: Date; 18 | } 19 | 20 | // Duplicate 2: Similar type aliases 21 | type UserResponse = { 22 | userId: string; 23 | userName: string; 24 | userEmail: string; 25 | isActive: boolean; 26 | }; 27 | 28 | type CustomerResponse = { 29 | customerId: string; 30 | customerName: string; 31 | customerEmail: string; 32 | isActive: boolean; 33 | }; 34 | 35 | // Duplicate 3: Common API response patterns 36 | interface ApiResponse { 37 | data: T; 38 | status: number; 39 | message: string; 40 | timestamp: Date; 41 | } 42 | 43 | interface ServiceResponse { 44 | data: T; 45 | status: number; 46 | message: string; 47 | timestamp: Date; 48 | } -------------------------------------------------------------------------------- /test/__fixtures__/dissimilar/imperative_vs_functional_1.ts: 
-------------------------------------------------------------------------------- 1 | // Dissimilar: Imperative style 2 | class ShoppingCart { 3 | private items: CartItem[] = []; 4 | private discount: number = 0; 5 | 6 | addItem(item: CartItem): void { 7 | const existing = this.items.find(i => i.id === item.id); 8 | if (existing) { 9 | existing.quantity += item.quantity; 10 | } else { 11 | this.items.push({ ...item }); 12 | } 13 | } 14 | 15 | removeItem(id: string): void { 16 | const index = this.items.findIndex(i => i.id === id); 17 | if (index !== -1) { 18 | this.items.splice(index, 1); 19 | } 20 | } 21 | 22 | setDiscount(percent: number): void { 23 | this.discount = Math.min(Math.max(percent, 0), 100); 24 | } 25 | 26 | calculateTotal(): number { 27 | let total = 0; 28 | for (const item of this.items) { 29 | total += item.price * item.quantity; 30 | } 31 | return total * (1 - this.discount / 100); 32 | } 33 | } 34 | 35 | interface CartItem { 36 | id: string; 37 | name: string; 38 | price: number; 39 | quantity: number; 40 | } -------------------------------------------------------------------------------- /crates/similarity-rs/tests/test_min_tokens.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::uninlined_format_args)] 2 | 3 | use similarity_core::language_parser::LanguageParser; 4 | use similarity_rs::rust_parser::RustParser; 5 | 6 | #[test] 7 | fn test_function_token_counts() { 8 | let mut parser = RustParser::new().unwrap(); 9 | 10 | // Test various function sizes 11 | let test_cases = vec![ 12 | ("fn a() { 1 }", "one liner"), 13 | ("fn add(a: i32, b: i32) -> i32 { a + b }", "simple add"), 14 | ("fn complex() -> i32 {\n let x = 1;\n let y = 2;\n x + y\n}", "multi-statement"), 15 | ( 16 | r#"fn format_message(name: &str, age: u32) -> String { 17 | format!("Hello {}, you are {} years old", name, age) 18 | }"#, 19 | "format_message", 20 | ), 21 | ]; 22 | 23 | for (code, desc) in test_cases { 24 | let tree = 
parser.parse(code, "test.rs").unwrap(); 25 | let size = tree.get_subtree_size(); 26 | println!("{}: {} tokens", desc, size); 27 | println!("Code: {}", code); 28 | println!(); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /crates/similarity-css/src/bin/test_parser.rs: -------------------------------------------------------------------------------- 1 | fn main() { 2 | let scss_content = r#".m-0 { margin: 0; } 3 | .m-1 { margin: 0.25rem; } 4 | .m-2 { margin: 0.5rem; }"#; 5 | 6 | println!("Testing SCSS parser with single-line rules:"); 7 | println!("{scss_content}"); 8 | println!("\n---\n"); 9 | 10 | use similarity_css::scss_simple_flattener::simple_flatten_scss; 11 | 12 | match simple_flatten_scss(scss_content) { 13 | Ok(rules) => { 14 | println!("Found {} rules:", rules.len()); 15 | for rule in &rules { 16 | println!( 17 | " - {} (lines {}-{}, {} declarations)", 18 | rule.selector, 19 | rule.start_line, 20 | rule.end_line, 21 | rule.declarations.len() 22 | ); 23 | for (prop, val) in &rule.declarations { 24 | println!(" {prop}: {val}"); 25 | } 26 | } 27 | } 28 | Err(e) => { 29 | println!("Error: {e}"); 30 | } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /crates/similarity-generic/examples/configs/custom-language-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "language": "your-language-name", 3 | "function_nodes": [ 4 | "function_definition", 5 | "method_definition", 6 | "lambda_expression" 7 | ], 8 | "type_nodes": [ 9 | "class_declaration", 10 | "interface_declaration", 11 | "struct_declaration" 12 | ], 13 | "field_mappings": { 14 | "name_field": "name", 15 | "params_field": "parameters", 16 | "body_field": "body", 17 | "decorator_field": "decorators", 18 | "class_field": "class" 19 | }, 20 | "value_nodes": [ 21 | "identifier", 22 | "string_literal", 23 | "number_literal", 24 | "boolean_literal" 25 | ], 
26 | "test_patterns": { 27 | "attribute_patterns": [ 28 | "@test", 29 | "@Test", 30 | "#[test]" 31 | ], 32 | "name_prefixes": [ 33 | "test_", 34 | "Test" 35 | ], 36 | "name_suffixes": [ 37 | "_test", 38 | "Test", 39 | "_spec" 40 | ] 41 | }, 42 | "custom_mappings": { 43 | "comment": "Optional custom mappings for special cases" 44 | } 45 | } -------------------------------------------------------------------------------- /test/__fixtures__/duplication/semantic/async_operations_2.ts: -------------------------------------------------------------------------------- 1 | // Same async operations using functional approach 2 | // Shows semantic equivalence with different structure 3 | 4 | export const fetchData = (url: string): Promise => 5 | fetch(url) 6 | .then(response => { 7 | if (!response.ok) { 8 | throw new Error(`Failed to fetch: ${response.status}`); 9 | } 10 | return response.json(); 11 | }) 12 | .catch(error => { 13 | console.error('Fetch error:', error); 14 | throw error; 15 | }); 16 | 17 | export const fetchWithRetry = async ( 18 | url: string, 19 | maxRetries: number = 3 20 | ): Promise => { 21 | const attempt = async (retriesLeft: number): Promise => { 22 | try { 23 | return await fetchData(url); 24 | } catch (error) { 25 | if (retriesLeft === 0) throw error; 26 | 27 | await new Promise(resolve => 28 | setTimeout(resolve, 1000 * (maxRetries - retriesLeft + 1)) 29 | ); 30 | 31 | return attempt(retriesLeft - 1); 32 | } 33 | }; 34 | 35 | return attempt(maxRetries - 1); 36 | }; -------------------------------------------------------------------------------- /crates/similarity-generic/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "similarity-generic" 3 | version = "0.4.2" 4 | edition = "2021" 5 | license = "MIT" 6 | description = "Generic language similarity analyzer using tree-sitter" 7 | authors = ["mizchi"] 8 | repository = "https://github.com/mizchi/similarity" 9 | build = "build.rs" 10 | 
11 | [[bin]] 12 | name = "similarity-generic" 13 | path = "src/main.rs" 14 | 15 | [dependencies] 16 | similarity-core = { version = "0.4.2", path = "../core" } 17 | clap = { version = "4.0", features = ["derive"] } 18 | anyhow = "1.0" 19 | tree-sitter = { workspace = true } 20 | tree-sitter-go = { workspace = true } 21 | tree-sitter-java = { workspace = true } 22 | tree-sitter-c = { workspace = true } 23 | tree-sitter-cpp = { workspace = true } 24 | tree-sitter-c-sharp = { workspace = true } 25 | tree-sitter-ruby = { workspace = true } 26 | serde = { version = "1.0", features = ["derive"] } 27 | serde_json = "1.0" 28 | once_cell = "1.21" 29 | 30 | [build-dependencies] 31 | serde_json = "1.0" 32 | once_cell = "1.21" 33 | 34 | [dev-dependencies] 35 | tempfile = "3.0" 36 | assert_cmd = "2.0" 37 | predicates = "3.0" -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 mizchi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /docs/lib/README.md: -------------------------------------------------------------------------------- 1 | # Library Documentation 2 | 3 | This directory contains documentation about the library design, architecture, and features. 4 | 5 | ## Contents 6 | 7 | - [AI Documentation](ai-documentation.md) - Comprehensive technical documentation for AI developers 8 | - [Multi-file Similarity](multi_file_similarity.md) - Implementation details for cross-file similarity detection 9 | - [Type Similarity Design](type-similarity-design.md) - Design document for TypeScript type similarity detection 10 | - [Visitor Implementation Example](visitor-implementation-example.md) - Example of visitor pattern implementation 11 | - [Python Support](python-support.md) - Documentation for Python language support 12 | 13 | ## Architecture Overview 14 | 15 | The similarity detection library is organized as a Rust workspace with: 16 | - **similarity-core**: Language-agnostic core algorithms and utilities 17 | - **similarity-ts**: TypeScript/JavaScript specific implementation 18 | - **similarity-py**: Python specific implementation 19 | - **similarity-rs**: Rust specific implementation 20 | 21 | Each language-specific crate implements the `LanguageParser` trait from the core library. 
-------------------------------------------------------------------------------- /test/__fixtures__/duplication/structural/visitnode_pattern_2.ts: -------------------------------------------------------------------------------- 1 | // visitNode pattern from function_extractor.ts 2 | function visitNode(node: any, ancestors: any[] = []): void { 3 | if (!node || typeof node !== 'object') { 4 | return; 5 | } 6 | 7 | // Process based on node type 8 | if (node.type === 'FunctionDeclaration' || node.type === 'FunctionExpression') { 9 | extractFunction(node, ancestors); 10 | } else if (node.type === 'MethodDefinition') { 11 | extractMethod(node, ancestors); 12 | } else if (node.type === 'ArrowFunctionExpression') { 13 | extractArrowFunction(node, ancestors); 14 | } 15 | 16 | // Skip certain keys 17 | const skipKeys = new Set(['loc', 'range', 'start', 'end', 'parent']); 18 | 19 | // Visit children 20 | for (const key in node) { 21 | if (node.hasOwnProperty(key) && !skipKeys.has(key)) { 22 | const value = node[key]; 23 | 24 | if (Array.isArray(value)) { 25 | for (const item of value) { 26 | visitNode(item, [...ancestors, node]); 27 | } 28 | } else if (value && typeof value === 'object') { 29 | visitNode(value, [...ancestors, node]); 30 | } 31 | } 32 | } 33 | } -------------------------------------------------------------------------------- /crates/core/tests/fixtures/sample1.ts: -------------------------------------------------------------------------------- 1 | // Test sample 1: Different functions that should have low similarity 2 | 3 | export function processUserData(users: User[]): ProcessedData { 4 | const result: ProcessedData = { 5 | total: users.length, 6 | active: 0, 7 | inactive: 0 8 | }; 9 | 10 | for (const user of users) { 11 | if (user.isActive) { 12 | result.active++; 13 | } else { 14 | result.inactive++; 15 | } 16 | } 17 | 18 | return result; 19 | } 20 | 21 | export function calculateAverage(numbers: number[]): number { 22 | if (numbers.length === 0) return 0; 
23 | 24 | let sum = 0; 25 | for (const num of numbers) { 26 | sum += num; 27 | } 28 | 29 | return sum / numbers.length; 30 | } 31 | 32 | // Similar structure but different purpose 33 | export function findMaxValue(values: number[]): number { 34 | if (values.length === 0) return -Infinity; 35 | 36 | let max = values[0]; 37 | for (let i = 1; i < values.length; i++) { 38 | if (values[i] > max) { 39 | max = values[i]; 40 | } 41 | } 42 | 43 | return max; 44 | } -------------------------------------------------------------------------------- /examples/specs/test_async.ts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env tsx 2 | /** 3 | * Test async functionality after refactoring 4 | */ 5 | 6 | import { calculateAPTEDSimilarityAsync, calculateAPTEDSimilarityFromAST, parseAsync } from "../src/index.ts"; 7 | 8 | async function main() { 9 | const code1 = ` 10 | function add(a: number, b: number): number { 11 | return a + b; 12 | } 13 | `; 14 | 15 | const code2 = ` 16 | function sum(x: number, y: number): number { 17 | return x + y; 18 | } 19 | `; 20 | 21 | console.log("Testing async APTED similarity..."); 22 | const similarity = await calculateAPTEDSimilarityAsync(code1, code2); 23 | console.log(`Similarity: ${(similarity * 100).toFixed(1)}%`); 24 | 25 | console.log("\nTesting with pre-parsed AST..."); 26 | const [ast1, ast2] = await Promise.all([parseAsync("test1.ts", code1), parseAsync("test2.ts", code2)]); 27 | 28 | const similarityFromAST = calculateAPTEDSimilarityFromAST(ast1, ast2); 29 | console.log(`Similarity from AST: ${(similarityFromAST * 100).toFixed(1)}%`); 30 | 31 | console.log("\nCore modules remain sync - async parsing is handled at the application level ✓"); 32 | } 33 | 34 | main().catch(console.error); 35 | -------------------------------------------------------------------------------- /crates/similarity-ts/Cargo.toml: -------------------------------------------------------------------------------- 1 | 
[package] 2 | name = "similarity-ts" 3 | version = "0.4.2" 4 | edition = "2021" 5 | license = "MIT" 6 | description = "CLI tool for detecting code duplication in TypeScript/JavaScript projects" 7 | authors = ["mizchi"] 8 | repository = "https://github.com/mizchi/similarity" 9 | homepage = "https://github.com/mizchi/similarity" 10 | documentation = "https://docs.rs/similarity-ts" 11 | keywords = ["typescript", "javascript", "duplicate", "detection", "cli"] 12 | categories = ["command-line-utilities", "development-tools"] 13 | 14 | [[bin]] 15 | name = "similarity-ts" 16 | path = "src/main.rs" 17 | 18 | [dependencies] 19 | similarity-core = { version = "0.4.2", path = "../core" } 20 | clap = { version = "4.0", features = ["derive"] } 21 | anyhow = "1.0" 22 | walkdir = "2.5" 23 | ignore = "0.4" 24 | globset = "0.4" 25 | rayon = "1.10" 26 | oxc_parser = { workspace = true } 27 | oxc_ast = { workspace = true } 28 | oxc_span = { workspace = true } 29 | oxc_allocator = { workspace = true } 30 | 31 | [dev-dependencies] 32 | assert_cmd = "2.0" 33 | predicates = "3.0" 34 | tempfile = "3.0" 35 | criterion = "0.5" 36 | rayon = "1.10" 37 | 38 | [[bench]] 39 | name = "parallel_benchmark" 40 | harness = false -------------------------------------------------------------------------------- /test/__fixtures__/duplication/exact/service_duplication_1.ts: -------------------------------------------------------------------------------- 1 | // Example of exact duplication: UserService 2 | // This is a common pattern where a service is copied and slightly modified 3 | export class UserService { 4 | private users: Map = new Map(); 5 | 6 | addUser(user: User): void { 7 | if (!user.id) { 8 | throw new Error('User must have an ID'); 9 | } 10 | if (this.users.has(user.id)) { 11 | throw new Error('User already exists'); 12 | } 13 | this.users.set(user.id, user); 14 | } 15 | 16 | getUser(id: string): User | undefined { 17 | return this.users.get(id); 18 | } 19 | 20 | updateUser(id: string, 
updates: Partial): User { 21 | const user = this.users.get(id); 22 | if (!user) { 23 | throw new Error('User not found'); 24 | } 25 | const updatedUser = { ...user, ...updates }; 26 | this.users.set(id, updatedUser); 27 | return updatedUser; 28 | } 29 | 30 | deleteUser(id: string): boolean { 31 | return this.users.delete(id); 32 | } 33 | 34 | getAllUsers(): User[] { 35 | return Array.from(this.users.values()); 36 | } 37 | } 38 | 39 | interface User { 40 | id: string; 41 | name: string; 42 | email: string; 43 | createdAt: Date; 44 | } -------------------------------------------------------------------------------- /__deprecated/src/core/oxc_types.ts: -------------------------------------------------------------------------------- 1 | // Re-export oxc-parser types for easier use throughout the codebase 2 | import type { 3 | Program, 4 | Expression, 5 | Statement, 6 | Declaration, 7 | IdentifierReference, 8 | BindingIdentifier, 9 | Function, 10 | Class, 11 | VariableDeclarator, 12 | ModuleDeclaration, 13 | NumericLiteral, 14 | StringLiteral, 15 | BooleanLiteral, 16 | Directive, 17 | } from "@oxc-project/types"; 18 | 19 | // Re-export types 20 | export type { Program, NumericLiteral, StringLiteral, BooleanLiteral }; 21 | 22 | // Type guards 23 | export function isIdentifier(node: any): node is IdentifierReference | BindingIdentifier { 24 | return node?.type === "Identifier"; 25 | } 26 | 27 | export function isFunctionDeclaration(node: any): node is Function { 28 | return node?.type === "FunctionDeclaration"; 29 | } 30 | 31 | export function isClassDeclaration(node: any): node is Class { 32 | return node?.type === "ClassDeclaration"; 33 | } 34 | 35 | export function isVariableDeclarator(node: any): node is VariableDeclarator { 36 | return node?.type === "VariableDeclarator"; 37 | } 38 | 39 | // Union type for all AST nodes 40 | export type ASTNode = Expression | Statement | Declaration | ModuleDeclaration | Directive | Program; 41 | 
-------------------------------------------------------------------------------- /examples/test_structure_comparison.ts: -------------------------------------------------------------------------------- 1 | // Test file for structure comparison framework 2 | 3 | // Interface with common structure 4 | interface User { 5 | id: string; 6 | name: string; 7 | email: string; 8 | age?: number; 9 | } 10 | 11 | // Type alias with similar structure 12 | type Person = { 13 | id: string; 14 | name: string; 15 | email: string; 16 | age?: number; 17 | }; 18 | 19 | // Another interface with same properties (should be detected as similar) 20 | interface Customer { 21 | id: string; 22 | name: string; 23 | email: string; 24 | age?: number; 25 | } 26 | 27 | // Type literal in variable declaration 28 | const employee: { 29 | id: string; 30 | name: string; 31 | email: string; 32 | age?: number; 33 | } = { 34 | id: "emp001", 35 | name: "John Doe", 36 | email: "john@example.com", 37 | age: 30 38 | }; 39 | 40 | // Similar class structure 41 | class Account { 42 | id: string; 43 | name: string; 44 | email: string; 45 | age?: number; 46 | 47 | constructor(id: string, name: string, email: string, age?: number) { 48 | this.id = id; 49 | this.name = name; 50 | this.email = email; 51 | this.age = age; 52 | } 53 | } 54 | 55 | // Slightly different structure (missing email) 56 | interface Admin { 57 | id: string; 58 | name: string; 59 | role: string; 60 | age?: number; 61 | } -------------------------------------------------------------------------------- /crates/similarity-css/examples/test.css: -------------------------------------------------------------------------------- 1 | /* Test CSS file for similarity detection */ 2 | 3 | .button { 4 | background-color: #007bff; 5 | color: white; 6 | padding: 10px 20px; 7 | border: none; 8 | border-radius: 4px; 9 | cursor: pointer; 10 | } 11 | 12 | .btn { 13 | background-color: #007bff; 14 | color: #fff; 15 | padding: 10px 20px; 16 | border: none; 17 | 
// Example of exact duplication: CustomerService
// This is copied from UserService with just name changes - a typical copy-paste scenario

/** In-memory customer store keyed by `Customer.id`. */
export class CustomerService {
  // Type arguments restored: keys are customer ids (strings), values are customers.
  private customers: Map<string, Customer> = new Map();

  /**
   * Registers a new customer.
   * @throws Error if `customer.id` is empty or a customer with that id already exists.
   */
  addCustomer(customer: Customer): void {
    if (!customer.id) {
      throw new Error('Customer must have an ID');
    }
    if (this.customers.has(customer.id)) {
      throw new Error('Customer already exists');
    }
    this.customers.set(customer.id, customer);
  }

  /** Returns the customer stored under `id`, or `undefined` when there is none. */
  getCustomer(id: string): Customer | undefined {
    return this.customers.get(id);
  }

  /**
   * Replaces the stored customer with a shallow copy that has `updates` applied.
   * @returns the new, merged customer object.
   * @throws Error if no customer is stored under `id`.
   */
  updateCustomer(id: string, updates: Partial<Customer>): Customer {
    const customer = this.customers.get(id);
    if (!customer) {
      throw new Error('Customer not found');
    }
    // Later spread wins, so fields present in `updates` override the stored ones.
    const updatedCustomer = { ...customer, ...updates };
    this.customers.set(id, updatedCustomer);
    return updatedCustomer;
  }

  /** Removes the customer stored under `id`; returns whether an entry was removed. */
  deleteCustomer(id: string): boolean {
    return this.customers.delete(id);
  }

  /** Returns a new array holding every stored customer, in insertion order. */
  getAllCustomers(): Customer[] {
    return Array.from(this.customers.values());
  }
}

interface Customer {
  id: string;
  name: string;
  email: string;
  createdAt: Date;
}
/// Parses a complete function item and checks that the resulting tree is
/// large enough to include the signature, not just the body.
#[test]
fn test_parser_parses_complete_function() {
    let source = r#"
fn add(a: i32, b: i32) -> i32 {
    a + b
}
"#;

    let mut rust_parser = RustParser::new().unwrap();
    let parsed = rust_parser.parse(source, "test.rs").unwrap();

    // The tree should contain function signature elements; print its debug
    // rendering so a failure can be diagnosed from the test output.
    println!("Parsed tree: {:?}", parsed);

    // Check tree size is reasonable (not just the body)
    let node_count = parsed.get_subtree_size();
    println!("Tree size: {}", node_count);
    assert!(node_count > 10, "Tree too small, might be parsing only body: {}", node_count);
}
/**
 * Debug helper: walks an AST-like object graph depth-first and logs every
 * `VariableDeclarator` (its name and initializer type) and every
 * `ArrowFunctionExpression` (body presence, body type, `expression` flag),
 * indented by traversal depth.
 *
 * Keys named `parent` and `scope` are never followed. Every object is visited
 * at most once, so graphs with other back-references terminate instead of
 * recursing until the stack overflows; a node reachable by several paths is
 * reported only the first time it is reached.
 *
 * @param node  Value to inspect; non-objects and `null` are ignored.
 * @param depth Current nesting level, used only for log indentation.
 * @param seen  Objects already visited. Callers normally omit this; it is
 *              threaded through the recursive calls.
 */
function findArrowFunction(
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- untyped parser output is probed dynamically
  node: any,
  depth = 0,
  seen: WeakSet<object> = new WeakSet<object>()
): void {
  if (!node || typeof node !== "object") return;
  if (seen.has(node)) return;
  seen.add(node);

  const indent = " ".repeat(depth);

  if (node.type === "VariableDeclarator") {
    console.log(indent + "VariableDeclarator:", node.id?.name);
    console.log(indent + " init type:", node.init?.type);
  }

  if (node.type === "ArrowFunctionExpression") {
    console.log(indent + "Found ArrowFunctionExpression!");
    console.log(indent + " has body:", !!node.body);
    console.log(indent + " body type:", node.body?.type);
    console.log(indent + " expression:", node.expression);
  }

  // Own enumerable keys only; inherited properties are not part of the tree.
  for (const [key, value] of Object.entries(node)) {
    if (key === "parent" || key === "scope") continue;
    if (Array.isArray(value)) {
      for (const item of value) {
        findArrowFunction(item, depth + 1, seen);
      }
    } else if (value && typeof value === "object") {
      findArrowFunction(value, depth + 1, seen);
    }
  }
}
/// Prints the TSED similarity of two snippets that differ only in their
/// identifier names, under three option sets. Diagnostic only: the test
/// asserts nothing and fails only if parsing fails.
#[test]
fn test_rename_cost_zero() {
    let code1 = r#"
        let result = x + 1;
        result * 2
    "#;

    let code2 = r#"
        let temp = y + 1;
        temp * 2
    "#;

    let mut parser = RustParser::new().unwrap();
    let tree1 = parser.parse(code1, "test1.rs").unwrap();
    let tree2 = parser.parse(code2, "test2.rs").unwrap();

    // rename_cost = 0.0, compare_values = true
    let mut options = TSEDOptions::default();
    options.apted_options.rename_cost = 0.0;
    options.apted_options.compare_values = true;

    let similarity = calculate_tsed(&tree1, &tree2, &options);
    println!("With compare_values=true, rename_cost=0.0: {:.2}%", similarity * 100.0);

    // rename_cost = 0.0, compare_values = false (structure-only comparison)
    options.apted_options.compare_values = false;
    let similarity2 = calculate_tsed(&tree1, &tree2, &options);
    println!("With compare_values=false, rename_cost=0.0: {:.2}%", similarity2 * 100.0);

    // Default settings
    let options_default = TSEDOptions::default();
    let similarity3 = calculate_tsed(&tree1, &tree2, &options_default);
    println!("With default settings: {:.2}%", similarity3 * 100.0);
}
-------------------------------------------------------------------------------- 1 | [package] 2 | authors = ["mizchi"] 3 | categories = ["command-line-utilities", "development-tools"] 4 | description = "Experimental CLI tool for detecting content similarity in Markdown documents" 5 | documentation = "https://docs.rs/similarity-md" 6 | edition = "2021" 7 | homepage = "https://github.com/mizchi/similarity" 8 | keywords = ["markdown", "similarity", "duplicate", "detection", "cli"] 9 | license = "MIT" 10 | name = "similarity-md" 11 | publish = true 12 | repository = "https://github.com/mizchi/similarity" 13 | version = "0.4.2" 14 | 15 | [[bin]] 16 | name = "similarity-md" 17 | path = "src/main.rs" 18 | 19 | [dependencies] 20 | anyhow = "1.0" 21 | clap = {version = "4.0", features = ["derive"]} 22 | globset = "0.4" 23 | ignore = "0.4" 24 | pulldown-cmark = "0.10" 25 | rayon = "1.10" 26 | serde = {version = "1.0", features = ["derive"]} 27 | serde_json = "1.0" 28 | vibrato = "0.5" 29 | walkdir = "2.5" 30 | zstd = {version = "0.13", optional = true} 31 | 32 | [features] 33 | default = [] 34 | zstd-support = ["zstd"] 35 | 36 | [dev-dependencies] 37 | assert_cmd = "2.0" 38 | criterion = "0.5" 39 | predicates = "3.0" 40 | tempfile = "3.0" 41 | 42 | [[bench]] 43 | harness = false 44 | name = "markdown_similarity_benchmark" 45 | 46 | [[example]] 47 | name = "morphological_test" 48 | path = "examples/morphological_test.rs" 49 | 50 | [[example]] 51 | name = "debug_similarity" 52 | path = "examples/debug_similarity.rs" 53 | 54 | [[example]] 55 | name = "test_levenshtein" 56 | path = "examples/test_levenshtein.rs" 57 | -------------------------------------------------------------------------------- /crates/similarity-css/examples/test.scss: -------------------------------------------------------------------------------- 1 | // Test SCSS file for similarity detection 2 | 3 | $primary-color: #007bff; 4 | $white: #fff; 5 | $spacing-unit: 20px; 6 | 7 | @mixin button-base { 8 | 
/**
 * Returns the largest element of `numbers` using a linear scan.
 *
 * The scan is seeded with `numbers[0]` and starts comparing at index 1, so an
 * empty array yields `undefined` at runtime despite the declared `number`
 * return type.
 */
export function findMaxValue(numbers: number[]): number {
  let max = numbers[0]; // seed with the first element
  for (let i = 1; i < numbers.length; i++) {
    if (numbers[i] > max) {
      max = numbers[i];
    }
  }
  return max;
}
/**
 * Builds a summary record for every order whose `status` is `'completed'`.
 *
 * Each result carries the order `id`, the customer's first and last name
 * joined by a single space, the sum of `price` over all `items`, and the
 * order's `completedDate` exposed as `completedAt`.
 */
export function processOrderData(orders: Order[]): ProcessedOrder[] {
  return orders
    .filter(order => order.status === 'completed')
    .map(order => ({
      id: order.id,
      customerName: `${order.customer.firstName} ${order.customer.lastName}`,
      // The reduce seed is 0, so an order with no items totals 0.
      total: order.items.reduce((sum, item) => sum + item.price, 0),
      completedAt: order.completedDate
    }));
}
pattern_1 26 | interface User { 27 | id: string; 28 | firstName: string; 29 | lastName: string; 30 | isActive: boolean; 31 | lastLogin: Date; 32 | } 33 | 34 | interface ProcessedUser { 35 | id: string; 36 | displayName: string; 37 | status: string; 38 | lastSeen: Date; 39 | } 40 | 41 | interface Order { 42 | id: string; 43 | status: string; 44 | customer: { firstName: string; lastName: string }; 45 | items: Array<{ price: number }>; 46 | completedDate: Date; 47 | } 48 | 49 | interface ProcessedOrder { 50 | id: string; 51 | customerName: string; 52 | total: number; 53 | completedAt: Date; 54 | } -------------------------------------------------------------------------------- /crates/similarity-generic/examples/usage.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "=== similarity-generic Usage Examples ===" 4 | echo 5 | 6 | echo "1. Basic usage with built-in language support:" 7 | echo " similarity-generic sample.go --language go" 8 | echo 9 | 10 | echo "2. Show all functions in a file:" 11 | echo " similarity-generic sample.go --language go --show-functions" 12 | echo 13 | 14 | echo "3. Use custom threshold:" 15 | echo " similarity-generic sample.go --language go --threshold 0.9" 16 | echo 17 | 18 | echo "4. Show supported languages:" 19 | echo " similarity-generic --supported" 20 | echo 21 | 22 | echo "5. Show language configuration:" 23 | echo " similarity-generic --show-config go" 24 | echo " similarity-generic --show-config go > my-go-config.json" 25 | echo 26 | 27 | echo "6. Use custom configuration file:" 28 | echo " similarity-generic sample.go --config configs/go.json" 29 | echo 30 | 31 | echo "7. 
/** Outcome of an API request: `data` on success, `error` (a message) on failure. */
type ApiResult<T> = { data?: T; error?: string };

/**
 * Fetches `url` and returns its JSON body as `T`, reporting failures in the
 * result instead of throwing.
 *
 * A response whose `ok` flag is false is treated as a failure. Any error
 * raised while fetching or decoding is logged with `console.error` and turned
 * into `{ error }`. The JSON body is not validated against `T`.
 *
 * @param url          Request URL passed to `fetch`.
 * @param options      Optional `fetch` init (method, headers, body, ...).
 * @param errorContext Label used in the log line; falls back to `'API call'`
 *                     when omitted or empty.
 * @returns `{ data }` on success, `{ error }` with the error message otherwise.
 */
async function apiCall<T>(
  url: string,
  options?: RequestInit,
  errorContext?: string
): Promise<ApiResult<T>> {
  try {
    const response = await fetch(url, options);

    if (!response.ok) {
      throw new Error(`HTTP error! status: ${response.status}`);
    }

    const data: T = await response.json();
    return { data };
  } catch (error) {
    const context = errorContext || 'API call';
    console.error(`Error in ${context}:`, error);
    return {
      // Non-Error throwables carry no reliable message, so use a fixed one.
      error: error instanceof Error ? error.message : 'Unknown error occurred'
    };
  }
}
/**
 * Returns the largest element of `values`, iterating with `for...of`.
 *
 * `maximum` is seeded with `values[0]`; the loop then visits every element,
 * including the first one again. An empty array yields `undefined` at
 * runtime despite the declared `number` return type.
 */
function getMaximumValue(values: number[]): number {
  let maximum = values[0];
  for (const value of values) {
    if (value > maximum) {
      maximum = value;
    }
  }
  return maximum;
}
/**
 * Keeps the customers whose `isActive` is truthy and projects each one to
 * `{ id, name, email }`, read from `customerId`, `fullName` and
 * `emailAddress` respectively.
 *
 * The input is `any[]`, so none of these fields is checked at compile time.
 */
function processCustomerData(customers: any[]) {
  return customers
    .filter(customer => customer.isActive)
    .map(customer => ({
      id: customer.customerId,
      name: customer.fullName,
      email: customer.emailAddress
    }));
}
/**
 * Sorts `arr` in place into ascending order (by `<`) and returns the same
 * array.
 *
 * For each position `i`, finds the index of the smallest element in
 * `arr[i..]` and swaps that element into position `i`.
 */
function selectionSort(arr) {
  const n = arr.length;
  for (let i = 0; i < n - 1; i++) {
    let minIdx = i; // index of the smallest element seen so far in arr[i..]
    for (let j = i + 1; j < n; j++) {
      if (arr[j] < arr[minIdx]) {
        minIdx = j;
      }
    }
    // Swap via destructuring; a no-op when minIdx === i.
    [arr[i], arr[minIdx]] = [arr[minIdx], arr[i]];
  }
  return arr;
}
4 | 5 | ## 📚 Documentation Structure 6 | 7 | ### [Algorithm](./algorithm/) 8 | Theoretical foundations and algorithm documentation: 9 | - TSED (Tree Similarity of Edit Distance) academic paper 10 | - Algorithm summaries and analyses 11 | - Tree-sitter integration details 12 | 13 | ### [Library](./lib/) 14 | Library design, architecture, and features: 15 | - AI documentation for developers 16 | - Multi-file similarity detection 17 | - Type similarity design 18 | - Language-specific implementations 19 | 20 | ### [Implementation](./implementation/) 21 | Implementation details and performance: 22 | - Performance optimization strategies 23 | - Benchmark results 24 | - Rust vs TypeScript comparisons 25 | 26 | ## 🚀 Quick Start 27 | 28 | For users: 29 | - [`prompt.md`](./prompt.md) - AI assistant quick guide (English) 30 | - [`prompt-ja.md`](./prompt-ja.md) - AI assistant quick guide (Japanese) 31 | - [Main README](../README.md) - Installation and usage 32 | 33 | For developers: 34 | - [`lib/ai-documentation.md`](./lib/ai-documentation.md) - Technical documentation 35 | - [`algorithm/tsed-similarity-summary.md`](./algorithm/tsed-similarity-summary.md) - Algorithm overview 36 | 37 | ## 📖 Additional Resources 38 | 39 | ### Blog Posts 40 | - [`introduce-ja.md`](./introduce-ja.md) - Project introduction and development story (Japanese) 41 | 42 | ### Project Management 43 | - [`../CLAUDE.md`](../CLAUDE.md) - Project instructions for Claude 44 | - [`../TODO.md`](../TODO.md) - Task list 45 | - [`../CHANGELOG.md`](../CHANGELOG.md) - Version history -------------------------------------------------------------------------------- /examples/overlap-detection/similar-patterns.js: -------------------------------------------------------------------------------- 1 | // Test case 2: Similar algorithmic patterns 2 | 3 | // Pattern 1: Array reduction 4 | function sumValues(numbers) { 5 | let total = 0; 6 | for (let i = 0; i < numbers.length; i++) { 7 | total += numbers[i]; 8 | } 9 | return 
/**
 * Returns every pair `[numbers[i], numbers[j]]` with `i < j` whose sum is
 * strictly equal to `targetSum`.
 *
 * Every index pair is compared, so repeated values in `numbers` produce
 * repeated pairs in the result. Pairs are listed in order of `i`, then `j`.
 */
function findPairs(numbers, targetSum) {
  const pairs = [];
  for (let i = 0; i < numbers.length; i++) {
    // j starts after i so each unordered index pair is checked once.
    for (let j = i + 1; j < numbers.length; j++) {
      if (numbers[i] + numbers[j] === targetSum) {
        pairs.push([numbers[i], numbers[j]]);
      }
    }
  }
  return pairs;
}
| const report1 = similarity.getDetailedReport(code1, code2); 22 | console.log("Detailed report:", report1); 23 | 24 | // Example 2: Identical code 25 | const code3 = ` 26 | class Calculator { 27 | add(a: number, b: number): number { 28 | return a + b; 29 | } 30 | }`; 31 | 32 | console.log("\n=== Example 2: Identical code ==="); 33 | const score2 = similarity.calculateSimilarity(code3, code3); 34 | console.log(`Similarity score: ${score2.toFixed(4)} (should be 1.0)`); 35 | 36 | // Example 3: Very different code 37 | const code4 = ` 38 | interface User { 39 | id: number; 40 | name: string; 41 | }`; 42 | 43 | console.log("\n=== Example 3: Different code structures ==="); 44 | const score3 = similarity.calculateSimilarity(code3, code4); 45 | console.log(`Similarity score: ${score3.toFixed(4)}`); 46 | 47 | // Example 4: Parse AST 48 | console.log("\n=== Example 4: AST Structure ==="); 49 | const ast = similarity.parse(code1); 50 | console.log("AST for code1:"); 51 | console.log(JSON.stringify(ast.program, null, 2).substring(0, 500) + "..."); 52 | } 53 | 54 | // Run the examples 55 | main(); 56 | -------------------------------------------------------------------------------- /crates/similarity-rs/tests/test_debug_rename_cost.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::uninlined_format_args)] 2 | 3 | use similarity_core::{ 4 | language_parser::LanguageParser, 5 | tsed::{calculate_tsed, TSEDOptions}, 6 | }; 7 | use similarity_rs::rust_parser::RustParser; 8 | 9 | #[test] 10 | fn test_rename_cost_effect() { 11 | let code1 = r#" 12 | let result = x + 1; 13 | result * 2 14 | "#; 15 | 16 | let code2 = r#" 17 | let temp = y + 1; 18 | temp * 2 19 | "#; 20 | 21 | let mut parser = RustParser::new().unwrap(); 22 | let tree1 = parser.parse(code1, "test1.rs").unwrap(); 23 | let tree2 = parser.parse(code2, "test2.rs").unwrap(); 24 | 25 | // Print AST structure 26 | fn print_tree(node: &similarity_core::tree::TreeNode, 
depth: usize) { 27 | let indent = " ".repeat(depth); 28 | if node.value.is_empty() { 29 | println!("{}{}", indent, node.label); 30 | } else { 31 | println!("{}{} = '{}'", indent, node.label, node.value); 32 | } 33 | for child in &node.children { 34 | print_tree(child, depth + 1); 35 | } 36 | } 37 | 38 | println!("=== Tree 1 ==="); 39 | print_tree(&tree1, 0); 40 | println!("\n=== Tree 2 ==="); 41 | print_tree(&tree2, 0); 42 | 43 | // Test different rename_cost values 44 | for rename_cost in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0] { 45 | let mut options = TSEDOptions::default(); 46 | options.apted_options.rename_cost = rename_cost; 47 | options.apted_options.compare_values = true; 48 | 49 | let similarity = calculate_tsed(&tree1, &tree2, &options); 50 | println!("rename_cost = {:.1}: similarity = {:.2}%", rename_cost, similarity * 100.0); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /examples/specs/sample_project/src/components/user_list.ts: -------------------------------------------------------------------------------- 1 | import { User } from "../models/user.ts"; 2 | 3 | export class UserList { 4 | private container: HTMLElement; 5 | private users: User[] = []; 6 | 7 | constructor(containerId: string) { 8 | const element = document.getElementById(containerId); 9 | if (!element) { 10 | throw new Error(`Container with id ${containerId} not found`); 11 | } 12 | this.container = element; 13 | } 14 | 15 | setUsers(users: User[]): void { 16 | this.users = users; 17 | this.render(); 18 | } 19 | 20 | addUser(user: User): void { 21 | this.users.push(user); 22 | this.render(); 23 | } 24 | 25 | removeUser(userId: string): void { 26 | this.users = this.users.filter((u) => u.id !== userId); 27 | this.render(); 28 | } 29 | 30 | private render(): void { 31 | this.container.innerHTML = ""; 32 | 33 | if (this.users.length === 0) { 34 | this.container.innerHTML = "

No users found

"; 35 | return; 36 | } 37 | 38 | const ul = document.createElement("ul"); 39 | ul.className = "user-list"; 40 | 41 | this.users.forEach((user) => { 42 | const li = document.createElement("li"); 43 | li.className = "user-item"; 44 | li.innerHTML = ` 45 | ${this.escapeHtml(user.name)} 46 | ${this.escapeHtml(user.email)} 47 | ${user.role} 48 | `; 49 | ul.appendChild(li); 50 | }); 51 | 52 | this.container.appendChild(ul); 53 | } 54 | 55 | private escapeHtml(text: string): string { 56 | const div = document.createElement("div"); 57 | div.textContent = text; 58 | return div.innerHTML; 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /docs/prompt-ja.md: -------------------------------------------------------------------------------- 1 | # similarity-ts: AIアシスタントガイド 2 | 3 | ## 目的 4 | ASTベースの比較でTypeScript/JavaScriptの重複コードを検出し、リファクタリングを支援します。 5 | 6 | ## インストール 7 | ```bash 8 | cargo install similarity-ts 9 | ``` 10 | 11 | ## コマンド形式 12 | ```bash 13 | similarity-ts [パス...] [オプション] 14 | ``` 15 | 16 | ## 主要オプション 17 | - `--threshold <0-1>`: 類似度しきい値(デフォルト: 0.8) 18 | - `--min-tokens `: n個未満のASTノードを持つ関数をスキップ(推奨: 20-30) 19 | - `--print`: 実際のコードスニペットを表示 20 | 21 | ## AIリファクタリングワークフロー 22 | 23 | ### 1. 広範囲スキャン 24 | コードベース全体の重複を発見: 25 | ```bash 26 | similarity-ts src/ --threshold 0.85 --min-tokens 25 27 | ``` 28 | 29 | ### 2. 詳細分析 30 | 特定のファイルペアを調査: 31 | ```bash 32 | similarity-ts file1.ts file2.ts --threshold 0.8 --min-tokens 20 --print 33 | ``` 34 | 35 | ### 3. 
// Sample TypeScript file for testing type similarity detection

// Similar interfaces - should be detected as highly similar
interface User {
  id: string;
  name: string;
  email: string;
  age?: number;
}

interface Person {
  id: string;
  name: string;
  email: string;
  age?: number;
}

// Similar but with different property names - should be detected as moderately similar
interface Customer {
  id: string;
  fullName: string;
  emailAddress: string;
  yearsOld?: number;
}

// Type alias vs interface - should be detected as similar if cross-kind comparison is enabled
type UserType = {
  id: string;
  name: string;
  email: string;
  age?: number;
};

// Union types: two of the three members are shared, the third differs
type Status = "active" | "inactive" | "pending";
type State = "active" | "inactive" | "suspended";

// Different structure - should not be similar
interface Product {
  sku: string;
  price: number;
  category: string;
  inStock: boolean;
}

// Generic interface. `T` is the type of the carried value; it has to be
// declared on the interface, otherwise `value: T` refers to an unknown name.
interface Container<T> {
  value: T;
  metadata: {
    created: Date;
    updated: Date;
  };
}

// Similar generic interface: same shape as Container<T> under a different name
interface Wrapper<T> {
  value: T;
  metadata: {
    created: Date;
    updated: Date;
  };
}

// Interface with extends
interface BaseEntity {
  id: string;
  createdAt: Date;
}

// ExtendedUser and ExtendedPerson add the same members to the same base
interface ExtendedUser extends BaseEntity {
  name: string;
  email: string;
}

interface ExtendedPerson extends BaseEntity {
  name: string;
  email: string;
}
/**
 * In-memory user store keyed by user id.
 *
 * Every method is `async` although the work is synchronous, so all results
 * arrive wrapped in a promise. Creations, updates, deletions and failed
 * lookups are reported through the injected logger.
 */
export class UserService {
  private users: Map<string, User> = new Map();
  private logger: Logger;

  /**
   * @param logger - receives `info` messages for mutations and `warn`
   *   messages for lookups that miss
   */
  constructor(logger: Logger) {
    this.logger = logger;
  }

  /**
   * Stores a new user built from `data` plus a generated `id` and the current
   * time as `createdAt`.
   *
   * @param data - the user's fields other than `id` and `createdAt`
   * @returns the stored user
   */
  async createUser(data: Omit<User, "id" | "createdAt">): Promise<User> {
    const user: User = {
      id: this.generateId(),
      ...data,
      createdAt: new Date(),
    };

    this.users.set(user.id, user);
    this.logger.info(`User created: ${user.id}`);

    return user;
  }

  /**
   * Looks a user up by id.
   *
   * @returns the stored user, or `null` (after logging a warning) when the id
   *   is unknown
   */
  async getUserById(id: string): Promise<User | null> {
    const user = this.users.get(id);

    if (!user) {
      this.logger.warn(`User not found: ${id}`);
      return null;
    }

    return user;
  }

  /**
   * Merges `updates` over the stored user and saves the result.
   *
   * @param updates - fields to overwrite; an `id` in here is ignored
   * @returns the updated user, or `null` when the id is unknown
   */
  async updateUser(id: string, updates: Partial<User>): Promise<User | null> {
    const user = await this.getUserById(id);

    if (!user) {
      return null;
    }

    // `id` is re-applied last so that `updates` can never change the key.
    const updatedUser = { ...user, ...updates, id: user.id };
    this.users.set(id, updatedUser);
    this.logger.info(`User updated: ${id}`);

    return updatedUser;
  }

  /**
   * Removes a user.
   *
   * @returns `true` when a user with that id existed and was removed,
   *   `false` otherwise
   */
  async deleteUser(id: string): Promise<boolean> {
    const exists = this.users.has(id);

    if (exists) {
      this.users.delete(id);
      this.logger.info(`User deleted: ${id}`);
    }

    return exists;
  }

  /** @returns a new array holding every stored user, in insertion order */
  async getAllUsers(): Promise<User[]> {
    return Array.from(this.users.values());
  }

  /**
   * Builds an id of the form `user_<epoch ms>_<up to 9 base-36 characters>`.
   * Not cryptographically random; collisions are unlikely but not impossible.
   */
  private generateId(): string {
    // slice(2, 11) drops the leading "0." and keeps at most nine characters,
    // which is what the deprecated substr(2, 9) call did.
    return `user_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`;
  }
}
variants 57 | pub enum Status { 58 | Pending, 59 | Active, 60 | Inactive, 61 | Deleted, 62 | } 63 | 64 | // Similar status enum with slightly different names 65 | pub enum UserStatus { 66 | Waiting, 67 | Enabled, 68 | Disabled, 69 | Removed, 70 | } 71 | 72 | // Complex enum with different variant types 73 | pub enum Message { 74 | Text(String), 75 | Number(i32), 76 | Struct { x: f64, y: f64 }, 77 | Empty, 78 | } 79 | 80 | // Tuple struct 81 | pub struct Point(f64, f64, f64); 82 | 83 | // Another tuple struct with same structure 84 | pub struct Vector(f64, f64, f64); -------------------------------------------------------------------------------- /examples/specs/sample_project/src/components/product_list.ts: -------------------------------------------------------------------------------- 1 | import { Product } from "../models/product.ts"; 2 | 3 | export class ProductList { 4 | private container: HTMLElement; 5 | private products: Product[] = []; 6 | 7 | constructor(containerId: string) { 8 | const element = document.getElementById(containerId); 9 | if (!element) { 10 | throw new Error(`Container with id ${containerId} not found`); 11 | } 12 | this.container = element; 13 | } 14 | 15 | setProducts(products: Product[]): void { 16 | this.products = products; 17 | this.render(); 18 | } 19 | 20 | addProduct(product: Product): void { 21 | this.products.push(product); 22 | this.render(); 23 | } 24 | 25 | removeProduct(productId: string): void { 26 | this.products = this.products.filter((p) => p.id !== productId); 27 | this.render(); 28 | } 29 | 30 | private render(): void { 31 | this.container.innerHTML = ""; 32 | 33 | if (this.products.length === 0) { 34 | this.container.innerHTML = "

No products found

"; 35 | return; 36 | } 37 | 38 | const div = document.createElement("div"); 39 | div.className = "product-grid"; 40 | 41 | this.products.forEach((product) => { 42 | const card = document.createElement("div"); 43 | card.className = "product-card"; 44 | card.innerHTML = ` 45 |

${this.escapeHtml(product.name)}

46 |

${this.escapeHtml(product.description)}

47 |

$${product.price.toFixed(2)}

48 |

Stock: ${product.stock}

49 | `; 50 | div.appendChild(card); 51 | }); 52 | 53 | this.container.appendChild(div); 54 | } 55 | 56 | private escapeHtml(text: string): string { 57 | const div = document.createElement("div"); 58 | div.textContent = text; 59 | return div.innerHTML; 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /examples/test_different_structures.rs: -------------------------------------------------------------------------------- 1 | // Test file to verify that clearly different structures are not detected as similar 2 | 3 | // Simple enum with few variants 4 | pub enum Color { 5 | Red, 6 | Green, 7 | Blue, 8 | } 9 | 10 | // Complex struct with many fields 11 | pub struct DatabaseConnection { 12 | pub host: String, 13 | pub port: u16, 14 | pub username: String, 15 | pub password: String, 16 | pub database_name: String, 17 | pub max_connections: u32, 18 | pub timeout: std::time::Duration, 19 | pub use_ssl: bool, 20 | pub certificate_path: Option, 21 | } 22 | 23 | // Unit struct (no fields) 24 | pub struct EmptyMarker; 25 | 26 | // Tuple struct with single field 27 | pub struct Id(u64); 28 | 29 | // Enum with complex variants 30 | pub enum Request { 31 | Get { url: String, headers: Vec }, 32 | Post { url: String, body: Vec, headers: Vec }, 33 | Put { url: String, body: Vec }, 34 | Delete { url: String }, 35 | } 36 | 37 | // Simple struct with two fields 38 | pub struct Point2D { 39 | pub x: f64, 40 | pub y: f64, 41 | } 42 | 43 | // Another enum with different structure 44 | pub enum Shape { 45 | Circle(f64), 46 | Rectangle(f64, f64), 47 | Triangle(f64, f64, f64), 48 | Polygon(Vec<(f64, f64)>), 49 | } 50 | 51 | // Struct that might look similar to Point2D but has different types 52 | pub struct Coordinate { 53 | pub lat: f64, 54 | pub lng: f64, 55 | } 56 | 57 | // Large struct that should not match with anything 58 | pub struct Configuration { 59 | pub app_name: String, 60 | pub version: String, 61 | pub environment: String, 62 | 
// Structural duplication: Imperative array processing
// Common pattern of iterating arrays with for loops

/**
 * Builds a display record for every active user.
 *
 * Walks `users` by index, skips entries whose `isActive` is falsy, and
 * appends one `ProcessedUser` per remaining entry, so the output keeps the
 * input order. The input array is only read.
 *
 * @param users - records to scan
 * @returns a new array with one entry per active user
 */
export function processUserData(users: User[]): ProcessedUser[] {
  const result: ProcessedUser[] = [];

  for (let i = 0; i < users.length; i++) {
    const user = users[i];
    if (user.isActive) {
      const processed: ProcessedUser = {
        id: user.id,
        // First and last name joined with a single space
        displayName: user.firstName + ' ' + user.lastName,
        status: 'active',
        lastSeen: user.lastLogin
      };
      result.push(processed);
    }
  }

  return result;
}

/**
 * Builds a summary record for every completed order.
 *
 * Same loop shape as `processUserData`: index loop, filter on one field
 * (`status === 'completed'`), build a record, push it. The input array is
 * only read.
 *
 * @param orders - records to scan
 * @returns a new array with one entry per completed order, in input order
 */
export function processOrderData(orders: Order[]): ProcessedOrder[] {
  const result: ProcessedOrder[] = [];

  for (let i = 0; i < orders.length; i++) {
    const order = orders[i];
    if (order.status === 'completed') {
      const processed: ProcessedOrder = {
        id: order.id,
        customerName: order.customer.firstName + ' ' + order.customer.lastName,
        // Sum of the item prices; 0 for an order without items
        total: order.items.reduce((sum, item) => sum + item.price, 0),
        completedAt: order.completedDate
      };
      result.push(processed);
    }
  }

  return result;
}

// Types

/** Input record for `processUserData`. */
interface User {
  id: string;
  firstName: string;
  lastName: string;
  isActive: boolean;
  lastLogin: Date;
}

/** Output record of `processUserData`. */
interface ProcessedUser {
  id: string;
  displayName: string;
  status: string;
  lastSeen: Date;
}

/** Input record for `processOrderData`. */
interface Order {
  id: string;
  status: string;
  customer: { firstName: string; lastName: string };
  items: Array<{ price: number }>;
  completedDate: Date;
}

/** Output record of `processOrderData`. */
interface ProcessedOrder {
  id: string;
  customerName: string;
  total: number;
  completedAt: Date;
}
print_tree(node: &Rc, depth: usize) { 52 | let indent = " ".repeat(depth); 53 | println!("{}{}", indent, node.label); 54 | for child in &node.children { 55 | print_tree(child, depth + 1); 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /crates/similarity-md/examples/test_levenshtein.rs: -------------------------------------------------------------------------------- 1 | //! Levenshtein距離計算のテスト 2 | 3 | use similarity_md::{levenshtein_similarity, word_levenshtein_similarity}; 4 | 5 | fn main() { 6 | println!("=== Levenshtein距離計算テスト ===\n"); 7 | 8 | // 基本的なテスト 9 | let text1 = "機械学習について"; 10 | let text2 = "マシンラーニングの概要"; 11 | 12 | println!("テキスト1: '{text1}'"); 13 | println!("テキスト2: '{text2}'"); 14 | 15 | let char_sim = levenshtein_similarity(text1, text2); 16 | let word_sim = word_levenshtein_similarity(text1, text2); 17 | 18 | println!("文字レベル類似性: {char_sim:.4}"); 19 | println!("単語レベル類似性: {word_sim:.4}"); 20 | println!(); 21 | 22 | // より類似したテキストのテスト 23 | let text3 = "機械学習は、コンピュータがデータから自動的にパターンを学習する技術です。"; 24 | let text4 = "マシンラーニングとは、計算機がデータから自動的にパターンを習得する手法です。"; 25 | 26 | println!("テキスト3: '{text3}'"); 27 | println!("テキスト4: '{text4}'"); 28 | 29 | let char_sim2 = levenshtein_similarity(text3, text4); 30 | let word_sim2 = word_levenshtein_similarity(text3, text4); 31 | 32 | println!("文字レベル類似性: {char_sim2:.4}"); 33 | println!("単語レベル類似性: {word_sim2:.4}"); 34 | println!(); 35 | 36 | // 英語のテスト 37 | let en1 = "machine learning"; 38 | let en2 = "machine learning"; 39 | 40 | println!("英語テキスト1: '{en1}'"); 41 | println!("英語テキスト2: '{en2}'"); 42 | 43 | let char_sim3 = levenshtein_similarity(en1, en2); 44 | let word_sim3 = word_levenshtein_similarity(en1, en2); 45 | 46 | println!("文字レベル類似性: {char_sim3:.4}"); 47 | println!("単語レベル類似性: {word_sim3:.4}"); 48 | println!(); 49 | 50 | // 完全に異なるテキスト 51 | let diff1 = "今日の天気は晴れです"; 52 | let diff2 = "プログラミング言語"; 53 | 54 | println!("異なるテキスト1: '{diff1}'"); 55 | println!("異なるテキスト2: '{diff2}'"); 56 
| 57 | let char_sim4 = levenshtein_similarity(diff1, diff2); 58 | let word_sim4 = word_levenshtein_similarity(diff1, diff2); 59 | 60 | println!("文字レベル類似性: {char_sim4:.4}"); 61 | println!("単語レベル類似性: {word_sim4:.4}"); 62 | } 63 | -------------------------------------------------------------------------------- /crates/similarity-rs/tests/debug_ast.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::uninlined_format_args)] 2 | 3 | use similarity_core::language_parser::LanguageParser; 4 | use similarity_rs::rust_parser::RustParser; 5 | 6 | #[test] 7 | fn debug_ast_values() { 8 | let code1 = r#" 9 | fn func1(x: i32) -> i32 { 10 | let result = x + 1; 11 | result * 2 12 | } 13 | "#; 14 | 15 | let code2 = r#" 16 | fn func2(y: i32) -> i32 { 17 | let temp = y + 1; 18 | temp * 3 19 | } 20 | "#; 21 | 22 | let mut parser = RustParser::new().unwrap(); 23 | let tree1 = parser.parse(code1, "test.rs").unwrap(); 24 | let tree2 = parser.parse(code2, "test.rs").unwrap(); 25 | 26 | // Print the tree to see if values are captured 27 | fn print_tree(node: &similarity_core::tree::TreeNode, depth: usize) { 28 | let indent = " ".repeat(depth); 29 | if node.value.is_empty() { 30 | println!("{}{}", indent, node.label); 31 | } else { 32 | println!("{}{} = '{}'", indent, node.label, node.value); 33 | } 34 | for child in &node.children { 35 | print_tree(child, depth + 1); 36 | } 37 | } 38 | 39 | println!("=== Tree 1 (func1) ==="); 40 | print_tree(&tree1, 0); 41 | println!("\n=== Tree 2 (func2) ==="); 42 | print_tree(&tree2, 0); 43 | 44 | // Also check similarity 45 | use similarity_core::{calculate_enhanced_similarity, APTEDOptions, EnhancedSimilarityOptions}; 46 | let options = EnhancedSimilarityOptions { 47 | structural_weight: 0.7, 48 | size_weight: 0.2, 49 | type_distribution_weight: 0.1, 50 | min_size_ratio: 0.5, 51 | apted_options: APTEDOptions { 52 | rename_cost: 0.3, 53 | delete_cost: 1.0, 54 | insert_cost: 1.0, 55 | compare_values: 
true, 56 | }, 57 | }; 58 | let similarity = calculate_enhanced_similarity(&tree1, &tree2, &options); 59 | println!("\nSimilarity: {}", similarity); 60 | } 61 | -------------------------------------------------------------------------------- /RECOMMENDATIONS.md: -------------------------------------------------------------------------------- 1 | # Rust コード類似度検出の推奨設定 2 | 3 | ## 実プロジェクトでの検証結果 4 | 5 | ### 検出された主な重複パターン 6 | 7 | 1. **構造的に同一なコード(正当な検出)** 8 | - `extract_struct_definition` と `extract_enum_definition`: 98.44% 9 | - 実際にリファクタリング可能な重複コード 10 | 11 | 2. **偽陽性のパターン** 12 | - テスト関数: 構造が似ているため95-99%の類似度 13 | - 短い関数: サイズペナルティがあっても誤検出されやすい 14 | 15 | ### 推奨パラメータ設定 16 | 17 | #### 1. 一般的な重複検出 18 | ```bash 19 | similarity-rs --threshold 0.8 --min-lines 10 --min-tokens 50 20 | ``` 21 | - 80%以上の類似度 22 | - 10行以上の関数 23 | - 50トークン以上(ASTノード数) 24 | 25 | #### 2. 厳密な重複検出 26 | ```bash 27 | similarity-rs --threshold 0.9 --min-lines 15 --min-tokens 100 28 | ``` 29 | - 90%以上の類似度 30 | - 15行以上の関数 31 | - 100トークン以上 32 | 33 | #### 3. テストコードを除外 34 | ```bash 35 | similarity-rs --threshold 0.8 --min-lines 10 --skip-test 36 | ``` 37 | - `#[test]` 属性の付いた関数を除外 38 | - `test_` で始まる関数を除外 39 | 40 | ### パラメータの影響 41 | 42 | | パラメータ | 効果 | 推奨値 | 43 | |----------|------|--------| 44 | | `threshold` | 類似度の閾値 | 0.8-0.9 | 45 | | `min-lines` | 最小行数 | 10-15 | 46 | | `min-tokens` | 最小トークン数 | 50-100 | 47 | | `size-penalty` | 短い関数へのペナルティ | true(デフォルト) | 48 | | `rename-cost` | 変数名の違いへの寛容度 | 0.3(デフォルト) | 49 | 50 | ### 実際の使用例 51 | 52 | #### CI/CDでの使用 53 | ```yaml 54 | - name: Check code duplication 55 | run: | 56 | cargo install similarity-rs 57 | similarity-rs src \ 58 | --threshold 0.85 \ 59 | --min-lines 12 \ 60 | --min-tokens 60 \ 61 | --skip-test 62 | ``` 63 | 64 | #### リファクタリング候補の検出 65 | ```bash 66 | # 高い類似度の長い関数を検出 67 | similarity-rs src \ 68 | --threshold 0.95 \ 69 | --min-lines 20 \ 70 | --min-tokens 150 71 | ``` 72 | 73 | ### 注意事項 74 | 75 | 1. 
/// Build script: embeds every `language_configs/*.json` file into a generated
/// Rust source file.
///
/// Writes `$OUT_DIR/language_configs.rs`, which defines a lazily initialised
/// `LANGUAGE_CONFIGS` map from the file stem (e.g. `go` for `go.json`) to the
/// raw JSON text. When the `language_configs` directory does not exist the
/// generated map is empty.
///
/// # Panics
///
/// Panics, and thereby fails the build, if `OUT_DIR` is unset, the directory
/// or a config file cannot be read, a config file name is not valid UTF-8, or
/// the output file cannot be written.
fn main() {
    // Get the output directory
    let out_dir = env::var("OUT_DIR").expect("OUT_DIR must be set by Cargo");
    let dest_path = Path::new(&out_dir).join("language_configs.rs");

    // Read all JSON files from language_configs directory
    let configs_dir = Path::new("language_configs");

    let mut output = String::new();
    output.push_str("// Auto-generated by build.rs\n\n");
    output.push_str("use once_cell::sync::Lazy;\n");
    output.push_str("use std::collections::HashMap;\n\n");

    // Generate a static HashMap of language configs.
    // NOTE(review): both keys and values are emitted as string literals below,
    // hence `&'static str` on both sides; confirm against the consumer of
    // LANGUAGE_CONFIGS.
    output.push_str(
        "pub static LANGUAGE_CONFIGS: Lazy<HashMap<&'static str, &'static str>> = Lazy::new(|| {\n",
    );
    output.push_str("    let mut map = HashMap::new();\n");

    if configs_dir.exists() {
        // `read_dir` yields entries in an unspecified order; sorting keeps the
        // generated file identical from one build to the next.
        let mut config_paths: Vec<_> = fs::read_dir(configs_dir)
            .expect("failed to read the language_configs directory")
            .map(|entry| entry.expect("failed to read a language_configs entry").path())
            .filter(|path| path.extension().and_then(|s| s.to_str()) == Some("json"))
            .collect();
        config_paths.sort();

        for path in &config_paths {
            let file_name = path
                .file_stem()
                .and_then(|s| s.to_str())
                .expect("config file name must be valid UTF-8");
            let content = fs::read_to_string(path).expect("failed to read a config file");

            // `{:?}` on a `str` emits a complete, correctly escaped Rust string
            // literal. Unlike escaping only `\`, `"` and `\n` by hand, it also
            // covers `\r` and other control characters, so a config file saved
            // with CRLF line endings no longer yields a literal rustc rejects.
            output.push_str(&format!("    map.insert({file_name:?}, {content:?});\n"));
        }
    }

    output.push_str("    map\n");
    output.push_str("});\n");

    // Write the generated code
    fs::write(&dest_path, output).expect("failed to write language_configs.rs");

    // Tell Cargo to rerun if any config file changes
    println!("cargo:rerun-if-changed=language_configs");
}
// Semantic duplication: Early return validation pattern
// Common validation logic using early returns

/**
 * Validates a user-registration payload.
 *
 * Checks run in a fixed order (email present, email format, password present,
 * password length, username present, username length) and the first failing
 * check decides the result, so at most one error is reported per call.
 *
 * @param data - untyped payload; `email`, `password` and `username` are read
 * @returns `{ valid: true }`, or `{ valid: false, error }` describing the
 *   first failed check
 */
export function validateUserRegistration(data: any): ValidationResult {
  if (!data.email) {
    return { valid: false, error: 'Email is required' };
  }

  if (!isValidEmail(data.email)) {
    return { valid: false, error: 'Invalid email format' };
  }

  if (!data.password) {
    return { valid: false, error: 'Password is required' };
  }

  if (data.password.length < 8) {
    return { valid: false, error: 'Password must be at least 8 characters' };
  }

  if (!data.username) {
    return { valid: false, error: 'Username is required' };
  }

  if (data.username.length < 3) {
    return { valid: false, error: 'Username must be at least 3 characters' };
  }

  return { valid: true };
}

/**
 * Validates a product-creation payload.
 *
 * Same early-return shape as `validateUserRegistration`, applied to `name`,
 * `price` and `category`; the first failing check decides the result.
 *
 * @param data - untyped payload; `name`, `price` and `category` are read
 * @returns `{ valid: true }`, or `{ valid: false, error }` describing the
 *   first failed check
 */
export function validateProductCreation(data: any): ValidationResult {
  if (!data.name) {
    return { valid: false, error: 'Product name is required' };
  }

  if (data.name.length < 3) {
    return { valid: false, error: 'Product name must be at least 3 characters' };
  }

  // Truthiness check: a price of 0 is falsy, so it is reported as missing
  // here and never reaches the positive-number check below.
  if (!data.price) {
    return { valid: false, error: 'Price is required' };
  }

  if (typeof data.price !== 'number' || data.price <= 0) {
    return { valid: false, error: 'Price must be a positive number' };
  }

  if (!data.category) {
    return { valid: false, error: 'Category is required' };
  }

  // Only these four category names are accepted
  if (!['electronics', 'clothing', 'food', 'other'].includes(data.category)) {
    return { valid: false, error: 'Invalid category' };
  }

  return { valid: true };
}

// Helper

/**
 * Loose e-mail shape check: some text, `@`, some text, `.`, some text, with
 * no whitespace and no further `@` anywhere.
 */
function isValidEmail(email: string): boolean {
  return /^[^\s@]+@[^\s@]+\.[^\s@]+$/.test(email);
}

/** Result of a validation call; `error` is set only when `valid` is false. */
interface ValidationResult {
  valid: boolean;
  error?: string;
}
'adult' : 'minor' 35 | }; 36 | return result; 37 | } 38 | 39 | // Larger function (> 30 tokens) 40 | function calculateStatistics(numbers: number[]) { 41 | if (numbers.length === 0) { 42 | return { min: 0, max: 0, avg: 0, sum: 0 }; 43 | } 44 | 45 | let min = numbers[0]; 46 | let max = numbers[0]; 47 | let sum = 0; 48 | 49 | for (const num of numbers) { 50 | if (num < min) min = num; 51 | if (num > max) max = num; 52 | sum += num; 53 | } 54 | 55 | return { 56 | min, 57 | max, 58 | avg: sum / numbers.length, 59 | sum 60 | }; 61 | } 62 | 63 | // Similar larger function 64 | function computeStatistics(values: number[]) { 65 | if (values.length === 0) { 66 | return { min: 0, max: 0, avg: 0, sum: 0 }; 67 | } 68 | 69 | let min = values[0]; 70 | let max = values[0]; 71 | let sum = 0; 72 | 73 | for (const val of values) { 74 | if (val < min) min = val; 75 | if (val > max) max = val; 76 | sum += val; 77 | } 78 | 79 | return { 80 | min, 81 | max, 82 | avg: sum / values.length, 83 | sum 84 | }; 85 | } -------------------------------------------------------------------------------- /crates/core/tests/ast_fingerprint_test.rs: -------------------------------------------------------------------------------- 1 | use similarity_core::AstFingerprint; 2 | 3 | #[test] 4 | fn test_ast_fingerprint_usage() { 5 | let code1 = r#" 6 | function processArray(arr) { 7 | const result = []; 8 | for (let i = 0; i < arr.length; i++) { 9 | if (arr[i] > 0) { 10 | result.push(arr[i] * 2); 11 | } 12 | } 13 | return result; 14 | } 15 | "#; 16 | 17 | let code2 = r#" 18 | function filterAndDouble(items) { 19 | const output = []; 20 | for (let j = 0; j < items.length; j++) { 21 | if (items[j] > 0) { 22 | output.push(items[j] * 2); 23 | } 24 | } 25 | return output; 26 | } 27 | "#; 28 | 29 | let fp1 = AstFingerprint::from_source(code1).unwrap(); 30 | let fp2 = AstFingerprint::from_source(code2).unwrap(); 31 | 32 | // Print node counts for debugging 33 | println!("\nNode counts for function 1:"); 34 | for 
(node_type, count) in fp1.node_counts() { 35 | if *count > 0 { 36 | println!(" {node_type}: {count}"); 37 | } 38 | } 39 | 40 | // Test similarity 41 | let similarity = fp1.similarity(&fp2); 42 | println!("\nSimilarity: {:.2}%", similarity * 100.0); 43 | assert!(similarity > 0.9, "Expected high similarity for structurally identical functions"); 44 | 45 | // Test bloom filter 46 | assert!(fp1.might_be_similar(&fp2, 0.5), "Bloom filter should pass for similar functions"); 47 | 48 | // Test with different structure 49 | let code3 = r#" 50 | function processArray(arr) { 51 | return arr.filter(x => x > 0).map(x => x * 2); 52 | } 53 | "#; 54 | 55 | let fp3 = AstFingerprint::from_source(code3).unwrap(); 56 | let similarity_different = fp1.similarity(&fp3); 57 | println!("Similarity with different implementation: {:.2}%", similarity_different * 100.0); 58 | assert!(similarity_different < 0.8, "Expected lower similarity for different implementations"); 59 | } 60 | -------------------------------------------------------------------------------- /crates/similarity-rs/tests/test_tsed_debugging.rs: -------------------------------------------------------------------------------- 1 | use similarity_core::language_parser::LanguageParser; 2 | use similarity_core::{ 3 | apted::APTEDOptions, 4 | tsed::{calculate_tsed, TSEDOptions}, 5 | }; 6 | use similarity_rs::rust_parser::RustParser; 7 | 8 | #[test] 9 | fn test_short_function_similarity() { 10 | let mut parser = RustParser::new().unwrap(); 11 | 12 | let code1 = "a + b"; 13 | let code2 = "a - b"; 14 | let code3 = "a * b"; 15 | 16 | let tree1 = parser.parse(code1, "test1.rs").unwrap(); 17 | let tree2 = parser.parse(code2, "test2.rs").unwrap(); 18 | let tree3 = parser.parse(code3, "test3.rs").unwrap(); 19 | 20 | let options = TSEDOptions { 21 | apted_options: APTEDOptions { 22 | rename_cost: 0.3, 23 | delete_cost: 1.0, 24 | insert_cost: 1.0, 25 | compare_values: true, 26 | }, 27 | min_lines: 1, 28 | min_tokens: None, 29 | size_penalty: 
true, 30 | skip_test: false, 31 | }; 32 | 33 | let sim12 = calculate_tsed(&tree1, &tree2, &options); 34 | let sim13 = calculate_tsed(&tree1, &tree3, &options); 35 | let sim23 = calculate_tsed(&tree2, &tree3, &options); 36 | 37 | println!("Tree1 size: {}", tree1.get_subtree_size()); 38 | println!("Tree2 size: {}", tree2.get_subtree_size()); 39 | println!("Tree3 size: {}", tree3.get_subtree_size()); 40 | println!("Similarity between 'a + b' and 'a - b': {:.2}%", sim12 * 100.0); 41 | println!("Similarity between 'a + b' and 'a * b': {:.2}%", sim13 * 100.0); 42 | println!("Similarity between 'a - b' and 'a * b': {:.2}%", sim23 * 100.0); 43 | 44 | // These should not be 100% similar due to different operators 45 | assert!(sim12 < 1.0, "Different operators should not be 100% similar"); 46 | assert!(sim13 < 1.0, "Different operators should not be 100% similar"); 47 | assert!(sim23 < 1.0, "Different operators should not be 100% similar"); 48 | 49 | // With size penalty, short functions should have reduced similarity 50 | assert!(sim12 < 0.85, "Short functions with different operators should have low similarity"); 51 | } 52 | -------------------------------------------------------------------------------- /examples/specs/sample_project/src/services/product_service.ts: -------------------------------------------------------------------------------- 1 | import { Product } from "../models/product.ts"; 2 | import { Logger } from "../utils/logger.ts"; 3 | 4 | export class ProductService { 5 | private products: Map = new Map(); 6 | private logger: Logger; 7 | 8 | constructor(logger: Logger) { 9 | this.logger = logger; 10 | } 11 | 12 | async createProduct(data: Omit): Promise { 13 | const product: Product = { 14 | id: this.generateId(), 15 | ...data, 16 | createdAt: new Date(), 17 | }; 18 | 19 | this.products.set(product.id, product); 20 | this.logger.info(`Product created: ${product.id}`); 21 | 22 | return product; 23 | } 24 | 25 | async getProductById(id: string): Promise { 26 | 
const product = this.products.get(id); 27 | 28 | if (!product) { 29 | this.logger.warn(`Product not found: ${id}`); 30 | return null; 31 | } 32 | 33 | return product; 34 | } 35 | 36 | async updateProduct(id: string, updates: Partial): Promise { 37 | const product = await this.getProductById(id); 38 | 39 | if (!product) { 40 | return null; 41 | } 42 | 43 | const updatedProduct = { ...product, ...updates, id: product.id }; 44 | this.products.set(id, updatedProduct); 45 | this.logger.info(`Product updated: ${id}`); 46 | 47 | return updatedProduct; 48 | } 49 | 50 | async deleteProduct(id: string): Promise { 51 | const exists = this.products.has(id); 52 | 53 | if (exists) { 54 | this.products.delete(id); 55 | this.logger.info(`Product deleted: ${id}`); 56 | } 57 | 58 | return exists; 59 | } 60 | 61 | async getAllProducts(): Promise { 62 | return Array.from(this.products.values()); 63 | } 64 | 65 | async getProductsByCategory(category: string): Promise { 66 | return Array.from(this.products.values()).filter((product) => product.category === category); 67 | } 68 | 69 | private generateId(): string { 70 | return `product_${Date.now()}_${Math.random().toString(36).substr(2, 9)}`; 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /crates/core/src/cli_output.rs: -------------------------------------------------------------------------------- 1 | use std::fs; 2 | 3 | /// Format function output in VSCode-compatible format 4 | pub fn format_function_output( 5 | file_path: &str, 6 | function_name: &str, 7 | start_line: u32, 8 | end_line: u32, 9 | ) -> String { 10 | format!("{file_path}:{start_line}-{end_line} {function_name}") 11 | } 12 | 13 | /// Extract lines from file content within the specified range 14 | pub fn extract_lines_from_content(content: &str, start_line: u32, end_line: u32) -> String { 15 | let lines: Vec<&str> = content.lines().collect(); 16 | let start_idx = (start_line.saturating_sub(1)) as usize; 17 | let 
end_idx = std::cmp::min(end_line as usize, lines.len()); 18 | 19 | if start_idx >= lines.len() { 20 | return String::new(); 21 | } 22 | 23 | lines[start_idx..end_idx].join("\n") 24 | } 25 | 26 | /// Display code content for a function 27 | pub fn show_function_code(file_path: &str, function_name: &str, start_line: u32, end_line: u32) { 28 | match fs::read_to_string(file_path) { 29 | Ok(content) => { 30 | let code = extract_lines_from_content(&content, start_line, end_line); 31 | println!( 32 | "\n\x1b[36m--- {}:{} (lines {}-{}) ---\x1b[0m", 33 | file_path, function_name, start_line, end_line 34 | ); 35 | println!("{}", code); 36 | } 37 | Err(e) => { 38 | eprintln!("Error reading file {}: {}", file_path, e); 39 | } 40 | } 41 | } 42 | 43 | /// Generic duplicate result structure 44 | pub struct DuplicateResult { 45 | pub file1: String, 46 | pub file2: String, 47 | pub item1: T, 48 | pub item2: T, 49 | pub similarity: f64, 50 | } 51 | 52 | impl DuplicateResult { 53 | pub fn new(file1: String, file2: String, item1: T, item2: T, similarity: f64) -> Self { 54 | Self { file1, file2, item1, item2, similarity } 55 | } 56 | 57 | /// Calculate priority score for sorting 58 | pub fn priority(&self, get_size: impl Fn(&T) -> f64) -> f64 { 59 | let avg_size = (get_size(&self.item1) + get_size(&self.item2)) / 2.0; 60 | self.similarity * avg_size 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /crates/similarity-elixir/README.md: -------------------------------------------------------------------------------- 1 | # similarity-elixir 2 | 3 | Elixir code similarity analyzer using Tree-sitter parser. 
4 | 5 | ## Installation 6 | 7 | ```bash 8 | cargo install similarity-elixir 9 | ``` 10 | 11 | ## Usage 12 | 13 | ```bash 14 | # Analyze a single Elixir file 15 | similarity-elixir lib/my_module.ex 16 | 17 | # Analyze multiple files 18 | similarity-elixir lib/ 19 | 20 | # Set similarity threshold (default: 0.85) 21 | similarity-elixir lib/ -t 0.9 22 | 23 | # Show all functions 24 | similarity-elixir lib/my_module.ex --show-functions 25 | 26 | # Print similar function pairs with code 27 | similarity-elixir lib/ -p 28 | ``` 29 | 30 | ## Options 31 | 32 | - `-t, --threshold <THRESHOLD>` - Similarity threshold (0.0-1.0, default: 0.85) 33 | - `-p, --print` - Print similar function pairs with source code 34 | - `--show-functions` - Show all functions found 35 | - `--filter-function <NAME>` - Filter results to functions containing NAME 36 | - `--filter-function-body <TEXT>` - Filter by function body content 37 | - `--min-lines <N>` - Minimum function lines (default: 5) 38 | - `--rename-cost <COST>` - Cost for renaming operations (default: 1.0) 39 | 40 | ## Features 41 | 42 | - Detects similar functions across Elixir modules 43 | - Supports pattern matching and guard clauses 44 | - Handles pipe operators and anonymous functions 45 | - Recognizes module, protocol, and implementation definitions 46 | - Fast AST-based comparison using Tree-sitter 47 | 48 | ## Example 49 | 50 | ```elixir 51 | # Input: lib/calculator.ex 52 | defmodule Calculator do 53 | def add(a, b) do 54 | a + b 55 | end 56 | 57 | def sum(x, y) do 58 | x + y 59 | end 60 | end 61 | ``` 62 | 63 | ```bash 64 | $ similarity-elixir lib/calculator.ex 65 | Analyzing Elixir code similarity... 
66 | 67 | Found 2 functions 68 | - add 69 | - sum 70 | 71 | Duplicates in lib/calculator.ex: 72 | ------------------------------------------------------------ 73 | lib/calculator.ex:2-4 add <-> lib/calculator.ex:6-8 sum 74 | Similarity: 100.00% 75 | ``` 76 | 77 | ## Algorithm 78 | 79 | Uses Tree Similarity of Edit Distance (TSED) to compare function ASTs with configurable rename costs and size penalties. 80 | 81 | ## License 82 | 83 | MIT -------------------------------------------------------------------------------- /examples/specs/type-similarity/test_type_literal_sample.ts: -------------------------------------------------------------------------------- 1 | // Sample TypeScript file for testing type literal similarity detection 2 | 3 | // Type declaration 4 | type UserData = { id: number; name: string; email: string }; 5 | 6 | // Function with type literal return type - should match UserData 7 | function getUser(): { id: number; name: string; email: string } { 8 | return { id: 1, name: "John", email: "john@example.com" }; 9 | } 10 | 11 | // Function with type literal parameter - should match UserData 12 | function updateUser(user: { id: number; name: string; email: string }): void { 13 | console.log("Updating user:", user); 14 | } 15 | 16 | // Variable with type literal - should match UserData 17 | const defaultUser: { id: number; name: string; email: string } = { 18 | id: 0, 19 | name: "Default", 20 | email: "default@example.com", 21 | }; 22 | 23 | // Arrow function with type literal return type 24 | const createUser = (): { id: number; name: string; email: string } => { 25 | return { id: Math.random(), name: "New User", email: "new@example.com" }; 26 | }; 27 | 28 | // Different type literal - should not match 29 | function getProduct(): { sku: string; price: number; category: string } { 30 | return { sku: "ABC123", price: 99.99, category: "Electronics" }; 31 | } 32 | 33 | // Similar but slightly different type literal 34 | function getUserInfo(): { id: number; 
fullName: string; email: string } { 35 | return { id: 1, fullName: "John Doe", email: "john@example.com" }; 36 | } 37 | 38 | // Nested type literal 39 | function getOrder(): { 40 | id: number; 41 | user: { id: number; name: string; email: string }; 42 | items: Array<{ sku: string; quantity: number }>; 43 | } { 44 | return { 45 | id: 1, 46 | user: { id: 1, name: "John", email: "john@example.com" }, 47 | items: [{ sku: "ABC123", quantity: 2 }], 48 | }; 49 | } 50 | 51 | // Type literal with optional properties 52 | function getPartialUser(): { id: number; name?: string; email?: string } { 53 | return { id: 1 }; 54 | } 55 | 56 | // Interface for comparison 57 | interface ProductInfo { 58 | sku: string; 59 | price: number; 60 | category: string; 61 | } 62 | 63 | // Type alias for comparison 64 | type OrderData = { 65 | id: number; 66 | user: { id: number; name: string; email: string }; 67 | items: Array<{ sku: string; quantity: number }>; 68 | }; 69 | -------------------------------------------------------------------------------- /docs/algorithm/tsed-similarity-summary.md: -------------------------------------------------------------------------------- 1 | # コード類似性評価論文の要約 2 | 3 | https://arxiv.org/abs/2404.08817 4 | 5 | ## 論文タイトル 6 | 7 | Revisiting Code Similarity Evaluation with Abstract Syntax Tree Edit Distance 8 | 9 | ## 概要 10 | 11 | この論文は、コード類似性評価における抽象構文木(AST)編集距離の適用を探求した研究です。特に、TSED(Tree Similarity of Edit Distance)メトリクスを多様なプログラミング言語に拡張し、従来の評価手法との比較を行っています。 12 | 13 | ## 主要な貢献 14 | 15 | 1. **TSED(Tree Similarity of Edit Distance)メトリクスの拡張** 16 | 17 | - 元々 SQL 用だった TSED を 48 のプログラミング言語に適用可能に拡張 18 | - 新しいツールとして公開 19 | 20 | 2. **評価メトリクスの包括的比較** 21 | 22 | - TSED 23 | - GPT-4 類似性スコア 24 | - BLEU 25 | - Jaccard 類似度 26 | - 実行一致度 27 | 28 | 3. **主な発見** 29 | - TSED は従来の統計的メトリクス(BLEU 等)より実行一致度との相関が高い 30 | - GPT-4 の類似性スコアは効果的だが出力が不安定 31 | - AST パーサーの選択が TSED の性能に大きく影響 32 | 33 | ## 技術的アプローチ 34 | 35 | ### TSED の計算プロセス 36 | 37 | 1. **コード解析**: tree-sitter を使用してコードを AST に変換 38 | 2. 
**木編集距離計算**: APTED アルゴリズムを使用 39 | 3. **正規化**: 0-1 のスコアに正規化 40 | 41 | ### 計算式 42 | 43 | ``` 44 | Δ(G1, G2) = min_ops Σ w(op_i) 45 | TSED = max{1 - Δ(G1, G2)/MaxNodes(G1, G2), 0} 46 | ``` 47 | 48 | ## 実験結果 49 | 50 | ### 対象言語と性能 51 | 52 | - Java、Python、JavaScript、TypeScript、Ruby、Kotlin で評価 53 | - TSED と GPT 類似性は従来メトリクスより高い精度を示す 54 | 55 | ### 主要な数値結果(MBXP データセット) 56 | 57 | | 言語 | TSED | BLEU | Jaccard | GPT-4 | 実行一致 | 58 | | ---------- | ------ | ------ | ------- | ------ | -------- | 59 | | Java | 0.3746 | 0.2041 | 0.2733 | 0.8143 | 0.6550 | 60 | | Python | 0.1888 | 0.0843 | 0.2000 | 0.6751 | 0.6842 | 61 | | JavaScript | 0.2037 | 0.0846 | 0.2037 | 0.6763 | 0.6811 | 62 | 63 | ## 制限事項 64 | 65 | 1. **GPT スコアの不安定性** 66 | 67 | - 同じ入力でも出力が変動 68 | - MSE: 約 0.05-0.06、MAE: 約 0.18-0.20 69 | 70 | 2. **TSED のパラメータ依存性** 71 | 72 | - 操作の重み(削除、挿入、リネーム)が結果に影響 73 | - 言語ごとに最適なパラメータが異なる可能性 74 | 75 | 3. **パーサー依存性** 76 | - tree-sitter と ANTLR で結果が大きく異なる 77 | - パーサーの品質が評価精度に直接影響 78 | 79 | ## 実用的な意義 80 | 81 | 1. **コード生成タスクの評価** 82 | 83 | - LLM が生成したコードの品質評価に有効 84 | - 実行結果だけでなく構造的な類似性も評価可能 85 | 86 | 2. **多言語対応** 87 | 88 | - 48 のプログラミング言語に対応 89 | - 言語固有の構造を考慮した評価が可能 90 | 91 | 3. 
**従来手法の改善** 92 | - BLEU や Jaccard 類似度より実行一致度との相関が高い 93 | - コードの構造的特徴をより正確に捉える 94 | 95 | ## まとめ 96 | 97 | TSED は、コードの構造的類似性を評価する有効な手法として、従来の統計的手法を上回る性能を示しました。特に、実行結果との相関が高く、コード生成タスクの評価において実用的な指標となることが期待されます。ただし、GPT スコアの不安定性やパラメータ調整の必要性など、実用化に向けた課題も明らかになりました。 98 | -------------------------------------------------------------------------------- /examples/overlap-detection/partial-overlap.js: -------------------------------------------------------------------------------- 1 | // Test case 3: Partial overlaps within larger functions 2 | 3 | function complexDataProcessor(data) { 4 | // Validation phase 5 | if (!data || !Array.isArray(data)) { 6 | throw new Error('Invalid input'); 7 | } 8 | 9 | const results = { 10 | processed: [], 11 | errors: [], 12 | stats: { 13 | total: 0, 14 | success: 0, 15 | failed: 0 16 | } 17 | }; 18 | 19 | // Processing phase - similar to other functions 20 | for (let i = 0; i < data.length; i++) { 21 | try { 22 | if (data[i].value > 0) { 23 | results.processed.push({ 24 | id: data[i].id, 25 | value: data[i].value * 2, 26 | timestamp: new Date() 27 | }); 28 | results.stats.success++; 29 | } 30 | } catch (error) { 31 | results.errors.push({ 32 | index: i, 33 | error: error.message 34 | }); 35 | results.stats.failed++; 36 | } 37 | results.stats.total++; 38 | } 39 | 40 | // Summary phase 41 | console.log(`Processed ${results.stats.total} items`); 42 | return results; 43 | } 44 | 45 | function simpleProcessor(items) { 46 | const output = []; 47 | // This loop is similar to part of complexDataProcessor 48 | for (let i = 0; i < items.length; i++) { 49 | if (items[i].value > 0) { 50 | output.push({ 51 | id: items[i].id, 52 | value: items[i].value * 2, 53 | timestamp: new Date() 54 | }); 55 | } 56 | } 57 | return output; 58 | } 59 | 60 | function batchProcessor(batches) { 61 | const allResults = []; 62 | 63 | for (let batch of batches) { 64 | const batchResults = []; 65 | // Inner loop similar to simpleProcessor 66 | for (let i = 0; i < batch.length; 
i++) { 67 | if (batch[i].value > 0) { 68 | batchResults.push({ 69 | id: batch[i].id, 70 | value: batch[i].value * 2, 71 | timestamp: new Date() 72 | }); 73 | } 74 | } 75 | allResults.push(batchResults); 76 | } 77 | 78 | return allResults; 79 | } -------------------------------------------------------------------------------- /crates/core/src/cli_file_utils.rs: -------------------------------------------------------------------------------- 1 | use ignore::WalkBuilder; 2 | use std::collections::HashSet; 3 | use std::path::{Path, PathBuf}; 4 | 5 | /// Collect files from paths with given extensions 6 | pub fn collect_files(paths: &[String], extensions: &[&str]) -> anyhow::Result> { 7 | let mut files = Vec::new(); 8 | let mut visited = HashSet::new(); 9 | 10 | // Process each path 11 | for path_str in paths { 12 | let path = Path::new(path_str); 13 | 14 | if path.is_file() { 15 | // If it's a file, check extension and add it 16 | if let Some(ext) = path.extension() { 17 | if let Some(ext_str) = ext.to_str() { 18 | if extensions.contains(&ext_str) { 19 | if let Ok(canonical) = path.canonicalize() { 20 | if visited.insert(canonical.clone()) { 21 | files.push(path.to_path_buf()); 22 | } 23 | } 24 | } 25 | } 26 | } 27 | } else if path.is_dir() { 28 | // If it's a directory, walk it respecting .gitignore 29 | let walker = WalkBuilder::new(path).follow_links(false).build(); 30 | 31 | for entry in walker { 32 | let entry = entry?; 33 | let entry_path = entry.path(); 34 | 35 | // Skip if not a file 36 | if !entry_path.is_file() { 37 | continue; 38 | } 39 | 40 | // Check extension 41 | if let Some(ext) = entry_path.extension() { 42 | if let Some(ext_str) = ext.to_str() { 43 | if extensions.contains(&ext_str) { 44 | if let Ok(canonical) = entry_path.canonicalize() { 45 | if visited.insert(canonical.clone()) { 46 | files.push(entry_path.to_path_buf()); 47 | } 48 | } 49 | } 50 | } 51 | } 52 | } 53 | } else { 54 | eprintln!("Path does not exist or is not accessible: {}", path_str); 
55 | } 56 | } 57 | 58 | // Sort files for consistent output 59 | files.sort(); 60 | 61 | Ok(files) 62 | } 63 | -------------------------------------------------------------------------------- /examples/test_different_ts_structures.ts: -------------------------------------------------------------------------------- 1 | // Test file to verify that clearly different TypeScript structures are not detected as similar 2 | 3 | // Simple interface with two fields 4 | interface Point2D { 5 | x: number; 6 | y: number; 7 | } 8 | 9 | // Complex interface with many fields 10 | interface DatabaseConfig { 11 | host: string; 12 | port: number; 13 | username: string; 14 | password: string; 15 | database: string; 16 | poolSize: number; 17 | timeout: number; 18 | ssl: boolean; 19 | certificate?: string; 20 | retryAttempts: number; 21 | retryDelay: number; 22 | } 23 | 24 | // Empty interface 25 | interface Marker {} 26 | 27 | // Single field type 28 | type Id = string; 29 | 30 | // Union type 31 | type Status = 'pending' | 'active' | 'inactive' | 'deleted'; 32 | 33 | // Another simple interface that might look similar to Point2D 34 | interface Coordinate { 35 | lat: number; 36 | lng: number; 37 | } 38 | 39 | // Complex type with nested structure 40 | type ApiResponse = { 41 | success: boolean; 42 | data: T; 43 | error?: { 44 | code: number; 45 | message: string; 46 | details?: string[]; 47 | }; 48 | metadata: { 49 | timestamp: number; 50 | version: string; 51 | requestId: string; 52 | }; 53 | }; 54 | 55 | // Class with methods (different from interfaces) 56 | class UserService { 57 | private users: Map; 58 | 59 | constructor() { 60 | this.users = new Map(); 61 | } 62 | 63 | getUser(id: string) { 64 | return this.users.get(id); 65 | } 66 | 67 | addUser(id: string, data: any) { 68 | this.users.set(id, data); 69 | } 70 | } 71 | 72 | // Enum (different structure from interfaces) 73 | enum Color { 74 | Red = '#FF0000', 75 | Green = '#00FF00', 76 | Blue = '#0000FF', 77 | } 78 | 79 | // 
Large configuration object 80 | interface ApplicationConfig { 81 | appName: string; 82 | version: string; 83 | environment: 'dev' | 'staging' | 'production'; 84 | features: { 85 | auth: boolean; 86 | analytics: boolean; 87 | notifications: boolean; 88 | darkMode: boolean; 89 | }; 90 | api: { 91 | baseUrl: string; 92 | timeout: number; 93 | retries: number; 94 | }; 95 | logging: { 96 | level: 'debug' | 'info' | 'warn' | 'error'; 97 | file: string; 98 | console: boolean; 99 | }; 100 | } -------------------------------------------------------------------------------- /examples/rust_types_example.rs: -------------------------------------------------------------------------------- 1 | // Example file with similar Rust types for testing 2 | 3 | // Very similar structs with different names 4 | struct User { 5 | id: u64, 6 | name: String, 7 | email: String, 8 | created_at: std::time::SystemTime, 9 | } 10 | 11 | struct Person { 12 | id: u64, 13 | full_name: String, 14 | email_address: String, 15 | birth_date: std::time::SystemTime, 16 | } 17 | 18 | struct Customer { 19 | customer_id: u64, 20 | customer_name: String, 21 | contact_email: String, 22 | registration_date: std::time::SystemTime, 23 | } 24 | 25 | // Similar enums 26 | enum Status { 27 | Active, 28 | Inactive, 29 | Pending, 30 | Completed, 31 | } 32 | 33 | enum State { 34 | Running, 35 | Stopped, 36 | Waiting, 37 | Finished, 38 | } 39 | 40 | enum TaskStatus { 41 | InProgress, 42 | Paused, 43 | Queued, 44 | Done, 45 | } 46 | 47 | // Generic structs 48 | struct Response { 49 | data: T, 50 | status: u16, 51 | message: String, 52 | } 53 | 54 | struct ApiResult { 55 | result: T, 56 | code: u16, 57 | description: String, 58 | } 59 | 60 | struct ServerResponse { 61 | payload: T, 62 | status_code: u16, 63 | error_message: String, 64 | } 65 | 66 | // Nested structs 67 | struct ComplexUser { 68 | id: u64, 69 | profile: UserProfile, 70 | settings: UserSettings, 71 | } 72 | 73 | struct UserProfile { 74 | name: String, 75 | 
email: String, 76 | phone: String, 77 | } 78 | 79 | struct UserSettings { 80 | theme: String, 81 | notifications: bool, 82 | } 83 | 84 | struct ComplexPerson { 85 | person_id: u64, 86 | person_profile: PersonProfile, 87 | person_settings: PersonSettings, 88 | } 89 | 90 | struct PersonProfile { 91 | full_name: String, 92 | email_address: String, 93 | phone_number: String, 94 | } 95 | 96 | struct PersonSettings { 97 | ui_theme: String, 98 | enable_notifications: bool, 99 | } 100 | 101 | // Different structures 102 | struct Product { 103 | sku: String, 104 | name: String, 105 | price: f64, 106 | in_stock: bool, 107 | } 108 | 109 | struct Order { 110 | order_id: String, 111 | items: Vec, 112 | total: f64, 113 | paid: bool, 114 | } 115 | 116 | // Type aliases 117 | type UserId = u64; 118 | type CustomerId = u64; 119 | type OrderId = String; -------------------------------------------------------------------------------- /docs/prompt.md: -------------------------------------------------------------------------------- 1 | # similarity-ts: AI Assistant Guide 2 | 3 | ## Purpose 4 | 5 | Detects duplicate TypeScript/JavaScript code using AST comparison for refactoring. 
6 | 7 | ## Installation 8 | 9 | ```bash 10 | cargo install similarity-ts 11 | # check options 12 | similarity-ts --help 13 | ``` 14 | 15 | ## Key Options 16 | 17 | - `--threshold <0-1>`: Similarity threshold (default: 0.8) 18 | - `--min-tokens `: Skip functions with { 7 | describe("parseTypeScript", () => { 8 | it("should parse valid TypeScript code", () => { 9 | const code = `const x = 42;`; 10 | const ast = parseTypeScript("test.ts", code); 11 | 12 | expect(ast).toBeDefined(); 13 | expect(ast.program).toBeDefined(); 14 | expect(ast.program.body).toHaveLength(1); 15 | }); 16 | 17 | it("should parse function declarations", () => { 18 | const code = `function test(a: number): number { return a * 2; }`; 19 | const ast = parseTypeScript("test.ts", code); 20 | 21 | expect(ast.program.body).toHaveLength(1); 22 | expect(ast.program.body[0].type).toBe("FunctionDeclaration"); 23 | }); 24 | 25 | it("should parse class declarations", () => { 26 | const code = `class MyClass { method() {} }`; 27 | const ast = parseTypeScript("test.ts", code); 28 | 29 | expect(ast.program.body).toHaveLength(1); 30 | expect(ast.program.body[0].type).toBe("ClassDeclaration"); 31 | }); 32 | }); 33 | 34 | describe("astToString", () => { 35 | it("should convert AST to string representation", () => { 36 | const code = `const x = 1;`; 37 | const ast = parseTypeScript("test.ts", code); 38 | const str = astToString(ast.program); 39 | 40 | expect(str).toContain("VariableDeclaration"); 41 | expect(str).toContain("VariableDeclarator"); 42 | }); 43 | }); 44 | 45 | describe("calculateSimilarity (Levenshtein only)", () => { 46 | it("should work with simple examples", () => { 47 | const code1 = `const x = 1;`; 48 | const code2 = `const y = 2;`; 49 | 50 | const similarity = calculateSimilarity(code1, code2); 51 | expect(similarity).toBeGreaterThan(0.7); 52 | expect(similarity).toBeLessThan(1.0); 53 | }); 54 | 55 | it("should handle empty code", () => { 56 | const similarity = calculateSimilarity("", ""); 57 | 
expect(similarity).toBe(1.0); 58 | }); 59 | 60 | it("should handle completely different code", () => { 61 | const code1 = `function add(a: number, b: number) { return a + b; }`; 62 | const code2 = `import { readFileSync } from "fs";`; 63 | 64 | const similarity = calculateSimilarity(code1, code2); 65 | expect(similarity).toBeLessThan(0.4); 66 | }); 67 | }); 68 | }); 69 | -------------------------------------------------------------------------------- /crates/similarity-rs/tests/test_function_extraction.rs: -------------------------------------------------------------------------------- 1 | use similarity_core::language_parser::LanguageParser; 2 | use similarity_rs::rust_parser::RustParser; 3 | 4 | #[test] 5 | fn test_function_extraction() { 6 | let content = r#"fn f1() -> i32 { 1 } 7 | fn f2() -> i32 { 1 } 8 | 9 | fn longer_func1() -> i32 { 10 | let x = 1; 11 | let y = 2; 12 | let z = 3; 13 | x + y + z 14 | } 15 | 16 | fn longer_func2() -> i32 { 17 | let a = 1; 18 | let b = 2; 19 | let c = 3; 20 | a + b + c 21 | }"#; 22 | 23 | let mut parser = RustParser::new().unwrap(); 24 | let functions = parser.extract_functions(content, "test.rs").unwrap(); 25 | 26 | println!("\n=== Extracted Functions ==="); 27 | for (i, func) in functions.iter().enumerate() { 28 | println!( 29 | "[{}] {}: lines {}-{}, body {}-{}", 30 | i, func.name, func.start_line, func.end_line, func.body_start_line, func.body_end_line 31 | ); 32 | 33 | // Extract body 34 | let lines: Vec<&str> = content.lines().collect(); 35 | let start_idx = if func.body_start_line > 0 { 36 | (func.body_start_line.saturating_sub(1)) as usize 37 | } else { 38 | (func.start_line.saturating_sub(1)) as usize 39 | }; 40 | 41 | let end_idx = if func.body_end_line > 0 { 42 | std::cmp::min(func.body_end_line as usize, lines.len()) 43 | } else { 44 | std::cmp::min(func.end_line as usize, lines.len()) 45 | }; 46 | 47 | let body = lines[start_idx..end_idx].join("\n"); 48 | println!(" Start idx: {}, End idx: {}", start_idx, 
end_idx); 49 | println!(" Lines total: {}", lines.len()); 50 | if start_idx < lines.len() { 51 | println!(" Line at start_idx: {:?}", lines[start_idx]); 52 | } 53 | println!(" Body: {}", body.replace('\n', "\\n")); 54 | } 55 | 56 | assert_eq!(functions.len(), 4); 57 | assert_eq!(functions[2].name, "longer_func1"); 58 | assert_eq!(functions[3].name, "longer_func2"); 59 | 60 | // Check that longer functions have correct line counts 61 | let func1_lines = functions[2].end_line - functions[2].start_line + 1; 62 | let func2_lines = functions[3].end_line - functions[3].start_line + 1; 63 | 64 | println!("\nlonger_func1 has {} lines", func1_lines); 65 | println!("longer_func2 has {} lines", func2_lines); 66 | 67 | assert!(func1_lines >= 5, "longer_func1 should have at least 5 lines"); 68 | assert!(func2_lines >= 5, "longer_func2 should have at least 5 lines"); 69 | } 70 | -------------------------------------------------------------------------------- /crates/core/src/ast_exchange.rs: -------------------------------------------------------------------------------- 1 | use crate::tree::TreeNode; 2 | use serde::{Deserialize, Serialize}; 3 | use std::rc::Rc; 4 | 5 | /// Serializable version of TreeNode for external exchange 6 | #[derive(Debug, Clone, Serialize, Deserialize)] 7 | pub struct SerializableTreeNode { 8 | pub label: String, 9 | pub value: String, 10 | pub children: Vec, 11 | pub id: usize, 12 | } 13 | 14 | impl From<&TreeNode> for SerializableTreeNode { 15 | fn from(node: &TreeNode) -> Self { 16 | SerializableTreeNode { 17 | label: node.label.clone(), 18 | value: node.value.clone(), 19 | children: node.children.iter().map(|child| child.as_ref().into()).collect(), 20 | id: node.id, 21 | } 22 | } 23 | } 24 | 25 | impl From for TreeNode { 26 | fn from(node: SerializableTreeNode) -> Self { 27 | let mut tree_node = TreeNode::new(node.label, node.value, node.id); 28 | for child in node.children { 29 | tree_node.add_child(Rc::new(child.into())); 30 | } 31 | tree_node 32 
| } 33 | } 34 | 35 | /// Function definition for external exchange 36 | #[derive(Debug, Clone, Serialize, Deserialize)] 37 | pub struct ExchangeFunctionDef { 38 | pub name: String, 39 | pub start_line: u32, 40 | pub end_line: u32, 41 | pub body_start_line: u32, 42 | pub body_end_line: u32, 43 | pub ast: SerializableTreeNode, 44 | } 45 | 46 | /// Complete AST exchange format 47 | #[derive(Debug, Clone, Serialize, Deserialize)] 48 | pub struct ASTExchange { 49 | pub language: String, 50 | pub filename: String, 51 | pub functions: Vec, 52 | pub full_ast: Option, 53 | } 54 | 55 | #[cfg(test)] 56 | mod tests { 57 | use super::*; 58 | 59 | #[test] 60 | fn test_tree_node_serialization() { 61 | let mut root = TreeNode::new("function".to_string(), "foo".to_string(), 0); 62 | root.add_child(Rc::new(TreeNode::new("params".to_string(), "".to_string(), 1))); 63 | root.add_child(Rc::new(TreeNode::new("body".to_string(), "".to_string(), 2))); 64 | 65 | let serializable: SerializableTreeNode = (&root).into(); 66 | let json = serde_json::to_string(&serializable).unwrap(); 67 | let deserialized: SerializableTreeNode = serde_json::from_str(&json).unwrap(); 68 | let restored: TreeNode = deserialized.into(); 69 | 70 | assert_eq!(root.label, restored.label); 71 | assert_eq!(root.children.len(), restored.children.len()); 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /crates/similarity-ts/tests/tsx_test.rs: -------------------------------------------------------------------------------- 1 | use assert_cmd::Command; 2 | use predicates::prelude::*; 3 | use std::fs; 4 | use tempfile::tempdir; 5 | 6 | #[test] 7 | fn test_tsx_file_support() { 8 | let dir = tempdir().unwrap(); 9 | let tsx_file = dir.path().join("component.tsx"); 10 | 11 | // Create a .tsx file with React component 12 | fs::write( 13 | &tsx_file, 14 | r#" 15 | import React from 'react'; 16 | 17 | interface ButtonProps { 18 | label: string; 19 | onClick: () => void; 20 | } 21 
/// A directory holding both a `.ts` and a `.tsx` file must be scanned as two
/// files by the `similarity-ts` binary.
#[test]
fn test_mixed_ts_tsx_files() {
    let workspace = tempdir().unwrap();
    let plain_ts = workspace.path().join("utils.ts");
    let react_tsx = workspace.path().join("component.tsx");

    // Plain TypeScript source.
    let ts_source = r#"
export function formatName(first: string, last: string): string {
  return `${first} ${last}`;
}
"#;
    fs::write(&plain_ts, ts_source).unwrap();

    // TSX source containing a function similar to the one above.
    let tsx_source = r#"
import React from 'react';

export function formatFullName(firstName: string, lastName: string): string {
  return `${firstName} ${lastName}`;
}

export function NameDisplay({ name }: { name: string }) {
  return React.createElement('span', null, name);
}
"#;
    fs::write(&react_tsx, tsx_source).unwrap();

    // Same arguments, in the same order, as individual `.arg(..)` calls.
    Command::cargo_bin("similarity-ts")
        .unwrap()
        .arg(workspace.path())
        .args(["--threshold", "0.5", "--min-lines", "1", "--no-size-penalty"])
        .assert()
        .success()
        .stdout(predicate::str::contains("Checking 2 files for duplicates"));
}
/examples/test_rust_with_derives.rs: -------------------------------------------------------------------------------- 1 | // Test file for Rust structures with derive attributes 2 | 3 | // Structs with common derives 4 | #[derive(Debug, Clone, PartialEq)] 5 | pub struct User { 6 | pub id: u64, 7 | pub name: String, 8 | pub email: String, 9 | } 10 | 11 | #[derive(Debug, Clone, PartialEq)] 12 | pub struct Person { 13 | pub id: u64, 14 | pub full_name: String, 15 | pub email_address: String, 16 | } 17 | 18 | // Similar structure but different derives 19 | #[derive(Debug, Serialize, Deserialize)] 20 | pub struct Account { 21 | pub id: u64, 22 | pub username: String, 23 | pub email: String, 24 | } 25 | 26 | // Completely different derives 27 | #[derive(Default)] 28 | pub struct Profile { 29 | pub user_id: u64, 30 | pub display_name: String, 31 | pub contact_email: String, 32 | } 33 | 34 | // Enums with derives 35 | #[derive(Debug, Clone, Copy, PartialEq, Eq)] 36 | pub enum Status { 37 | Active, 38 | Inactive, 39 | Pending, 40 | Suspended, 41 | } 42 | 43 | #[derive(Debug, Clone, PartialEq)] 44 | pub enum UserStatus { 45 | Active, 46 | Inactive, 47 | Pending, 48 | Banned, 49 | } 50 | 51 | // Different enum with same derives 52 | #[derive(Debug, Clone, Copy, PartialEq, Eq)] 53 | pub enum Color { 54 | Red, 55 | Green, 56 | Blue, 57 | Yellow, 58 | } 59 | 60 | // Complex derives with serde 61 | #[derive(Debug, Clone, Serialize, Deserialize)] 62 | #[serde(rename_all = "camelCase")] 63 | pub struct ApiResponse { 64 | pub success: bool, 65 | pub message: String, 66 | pub data: Option, 67 | } 68 | 69 | #[derive(Debug, Clone, Serialize, Deserialize)] 70 | #[serde(rename_all = "snake_case")] 71 | pub struct ApiResult { 72 | pub is_success: bool, 73 | pub error_message: String, 74 | pub payload: Option, 75 | } 76 | 77 | // Structs with custom attributes 78 | #[derive(Debug, Clone)] 79 | #[cfg(feature = "postgres")] 80 | pub struct DatabaseConfig { 81 | pub host: String, 82 | pub 
/**
 * Callback invoked for a visited node.
 *
 * @typeParam T - Type of the state object threaded through the traversal.
 */
interface NodeHandler<T> {
  (node: any, state: T, parent?: any): void;
}

/**
 * Set of callbacks understood by {@link traverseAST}.
 *
 * @typeParam T - Type of the state object threaded through the traversal.
 */
interface NodeHandlers<T> {
  // Lifecycle hooks
  enter?: NodeHandler<T>;
  leave?: NodeHandler<T>;

  // Node type specific handlers
  FunctionDeclaration?: NodeHandler<T>;
  FunctionExpression?: NodeHandler<T>;
  ArrowFunctionExpression?: NodeHandler<T>;
  MethodDefinition?: NodeHandler<T>;
  ClassDeclaration?: NodeHandler<T>;
  ClassExpression?: NodeHandler<T>;
  VariableDeclaration?: NodeHandler<T>;
  VariableDeclarator?: NodeHandler<T>;
  MemberExpression?: NodeHandler<T>;
  CallExpression?: NodeHandler<T>;
  ThisExpression?: NodeHandler<T>;
  Identifier?: NodeHandler<T>;
  BlockStatement?: NodeHandler<T>;

  // Generic handler for any node type
  [nodeType: string]: NodeHandler<T> | undefined;
}

/**
 * Traverse AST with given handlers (depth-first).
 *
 * For every object reached, `enter` runs first, then the handler keyed by the
 * node's string `type` (if one is registered), then all children, and finally
 * `leave`. Non-object values (including `null`) are ignored.
 *
 * @param node - AST node to traverse
 * @param handlers - Object containing node handlers
 * @param state - State object passed to all handlers
 * @param parent - Parent node (optional)
 */
export function traverseAST<T>(node: any, handlers: NodeHandlers<T>, state: T, parent?: any): void {
  if (!node || typeof node !== "object") return;

  // Call enter lifecycle hook
  handlers.enter?.(node, state, parent);

  // Call node type specific handler
  if (node.type && typeof node.type === "string") {
    const handler = handlers[node.type];
    if (handler) {
      handler(node, state, parent);
    }
  }

  // Traverse children
  for (const key in node) {
    // Skip circular references and internal properties
    if (key === "parent" || key === "scope" || key === "_parent") continue;

    const value = node[key];
    if (Array.isArray(value)) {
      // Array elements are visited with the owning node as their parent.
      value.forEach((child) => traverseAST(child, handlers, state, node));
    } else if (value && typeof value === "object") {
      // Traverse object properties
      traverseAST(value, handlers, state, node);
    }
  }

  // Call leave lifecycle hook
  handlers.leave?.(node, state, parent);
}

/**
 * Helper to create a typed visitor.
 *
 * Returns `handlers` unchanged; it exists so the handler object is checked
 * against `NodeHandlers<T>` at the call site.
 */
export function createVisitor<T>(handlers: NodeHandlers<T>): NodeHandlers<T> {
  return handlers;
}
  /**
   * Remove the entity stored under `id`, if there is one.
   *
   * A `'deleted'` log entry is written only when something was removed.
   *
   * @param id - Key of the entity in the internal `items` map.
   * @returns `true` if an entity with that id existed before the call, `false` otherwise.
   */
  delete(id: string): boolean {
    // Captured before the removal so the return value reports the prior state.
    const exists = this.items.has(id);

    if (exists) {
      this.items.delete(id);
      this.log('deleted', id);
    }

    return exists;
  }
/**
 * Fetch a product record from `/api/products/{productId}`.
 *
 * Failures inside the `try` block are not propagated: a non-OK HTTP status is
 * turned into an `Error`, and every error caught is logged with
 * `console.error` and reported through the `error` field of the result.
 *
 * @param productId - Identifier interpolated into the request URL.
 * @returns `{ data }` with the parsed JSON body on success, or `{ error }`
 *   with the error message (`'Unknown error occurred'` for non-`Error` values).
 */
export async function fetchProductData(productId: string): Promise<{ data?: any; error?: string }> {
  try {
    const response = await fetch(`/api/products/${productId}`);

    // Route non-OK responses through the shared catch block below.
    if (!response.ok) {
      throw new Error(`HTTP error! status: ${response.status}`);
    }

    const data = await response.json();
    return { data };
  } catch (error) {
    console.error('Error fetching product data:', error);
    return {
      error: error instanceof Error ? error.message : 'Unknown error occurred'
    };
  }
}
/**
 * Debug printer: writes `node` and everything reachable from it to the console.
 *
 * Objects are printed as `<type> {` ... `}` with one line per property.
 * `parent` and `scope` properties are never followed. Non-empty arrays and
 * child objects that have a `type` are printed recursively, two levels deeper.
 *
 * @param node - Value to print; non-objects are printed as-is.
 * @param depth - Current indentation level (number of pad characters).
 */
function inspect(node: any, depth = 0): void {
  const pad = " ".repeat(depth);

  // Primitives, null and undefined end the recursion.
  const isObject = Boolean(node) && typeof node === "object";
  if (!isObject) {
    console.log(pad + node);
    return;
  }

  const emit = (text: string): void => {
    console.log(pad + text);
  };

  emit(node.type + " {");

  for (const key in node) {
    if (key === "parent" || key === "scope") continue;
    const value = node[key];

    // Source spans are summarised on a single line.
    if (key === "span" && value) {
      emit(" " + key + ": { start: " + value.start + ", end: " + value.end + " }");
      continue;
    }

    // Arrays: empty ones are omitted entirely.
    if (Array.isArray(value)) {
      if (value.length === 0) continue;
      emit(" " + key + ": [");
      value.forEach((item) => inspect(item, depth + 2));
      emit(" ]");
      continue;
    }

    // Child nodes (objects with a `type`) are expanded recursively.
    if (value && typeof value === "object" && value.type) {
      emit(" " + key + ":");
      inspect(value, depth + 2);
      continue;
    }

    // Everything else is shown as JSON, except null/undefined and the noisy
    // `raw` / `regex` properties.
    const hidden = value === undefined || value === null || key === "raw" || key === "regex";
    if (!hidden) {
      emit(" " + key + ": " + JSON.stringify(value));
    }
  }

  emit("}");
}
/**
 * In-memory order store.
 *
 * Orders live in a `Map` keyed by order id; nothing is persisted. Creation
 * and status changes are reported through the injected `Logger`.
 */
export class OrderService {
  private orders: Map<string, Order> = new Map();
  private logger: Logger;

  constructor(logger: Logger) {
    this.logger = logger;
  }

  /**
   * Create and store a new order in `PENDING` status.
   *
   * @param userId - Owner of the order.
   * @param items - Line items; must not be empty.
   * @returns The stored order.
   * @throws ValidationError when `items` is empty.
   */
  async createOrder(userId: string, items: Array<{ productId: string; quantity: number }>): Promise<Order> {
    if (items.length === 0) {
      throw new ValidationError("Order must contain at least one item");
    }

    const order: Order = {
      id: this.generateOrderId(),
      userId,
      items,
      status: OrderStatus.PENDING,
      totalAmount: this.calculateTotal(items),
      createdAt: new Date(),
      updatedAt: new Date(),
    };

    this.orders.set(order.id, order);
    this.logger.info(`Order created: ${order.id} for user: ${userId}`);

    return order;
  }

  /** Look up an order by id; resolves to `null` when no order is found. */
  async getOrderById(id: string): Promise<Order | null> {
    return this.orders.get(id) || null;
  }

  /** Return every stored order whose `userId` equals the given one. */
  async getOrdersByUser(userId: string): Promise<Order[]> {
    return Array.from(this.orders.values()).filter((order) => order.userId === userId);
  }

  /**
   * Replace the stored order with a copy carrying the new status and a fresh
   * `updatedAt`.
   *
   * @returns The updated order, or `null` (after a warning is logged) when the
   *   id is unknown.
   */
  async updateOrderStatus(id: string, status: OrderStatus): Promise<Order | null> {
    const order = this.orders.get(id);

    if (!order) {
      this.logger.warn(`Order not found: ${id}`);
      return null;
    }

    const updatedOrder = {
      ...order,
      status,
      updatedAt: new Date(),
    };

    this.orders.set(id, updatedOrder);
    this.logger.info(`Order ${id} status updated to: ${status}`);

    return updatedOrder;
  }

  /**
   * Cancel an order that is still `PENDING`.
   *
   * @returns `true` when the order was cancelled; `false` when it does not
   *   exist or is in any other status (the latter is logged as a warning).
   */
  async cancelOrder(id: string): Promise<boolean> {
    const order = await this.getOrderById(id);

    if (!order) {
      return false;
    }

    if (order.status !== OrderStatus.PENDING) {
      this.logger.warn(`Cannot cancel order ${id} with status: ${order.status}`);
      return false;
    }

    await this.updateOrderStatus(id, OrderStatus.CANCELLED);
    return true;
  }

  private calculateTotal(items: Array<{ productId: string; quantity: number }>): number {
    // Simplified calculation - in real app would fetch product prices
    return items.reduce((total, item) => total + item.quantity * 10, 0);
  }

  /** Build an id of the form `order_<timestamp>_<random base-36 suffix>`. */
  private generateOrderId(): string {
    return `order_${Date.now()}_${Math.random().toString(36).substr(2, 9)}`;
  }
}
a high-performance TypeScript/JavaScript code similarity detection tool written in Rust. 4 | 5 | ## 🎯 Features 6 | 7 | ### Core Functionality 8 | - **Function Similarity Detection**: Find duplicate or similar functions across your codebase 9 | - **Type Similarity Detection** (experimental): Detect similar interfaces and type definitions 10 | - **AST-based Comparison**: Uses Tree Structured Edit Distance (TSED) algorithm for accurate structural comparison 11 | - **Cross-file Analysis**: Find duplicates across multiple files in your project 12 | 13 | ### Performance 14 | - **Bloom Filter Pre-filtering**: ~90% reduction in comparisons with AST fingerprinting 15 | - **Multi-threaded Processing**: Parallel file parsing and analysis using Rayon 16 | - **Memory Efficient**: Written in Rust for optimal memory usage 17 | - **Fast Mode**: Default mode with intelligent pre-filtering 18 | 19 | ### Developer Experience 20 | - **Zero Configuration**: Works out of the box with sensible defaults 21 | - **VSCode-compatible Output**: Click file paths to jump directly to code 22 | - **Flexible Filtering**: 23 | - `--min-tokens`: Filter by AST node count (recommended: 20-30) 24 | - `--min-lines`: Filter by line count 25 | - `--threshold`: Configurable similarity threshold (0.0-1.0) 26 | - **Multiple Output Options**: Standard output or detailed code printing with `--print` 27 | 28 | ## 📦 Installation 29 | 30 | ```bash 31 | cargo install similarity-ts 32 | ``` 33 | 34 | ## 🚀 Quick Start 35 | 36 | ```bash 37 | # Check current directory for duplicates 38 | similarity-ts 39 | 40 | # Analyze specific directories 41 | similarity-ts src/ lib/ 42 | 43 | # Set custom threshold 44 | similarity-ts --threshold 0.9 45 | 46 | # Filter by complexity 47 | similarity-ts --min-tokens 25 48 | 49 | # Show code snippets 50 | similarity-ts --print 51 | ``` 52 | 53 | ## 📊 Performance Benchmarks 54 | 55 | Tested on real-world TypeScript projects: 56 | - Small files (4 functions): ~8µs 57 | - Medium files (8 
/// A straight-line arithmetic snippet and a filtering loop share little
/// structure, so their TSED similarity must stay below 0.5.
#[test]
fn test_different_functions_should_have_low_similarity() {
    let arithmetic = r#"
        let result = x + 1;
        result * 2
    "#;

    let summing_loop = r#"
        let mut sum = 0;
        for val in values {
            if val > 0 {
                sum += val;
            }
        }
        sum
    "#;

    let options = TSEDOptions::default();
    let mut parser = RustParser::new().unwrap();
    let arithmetic_tree = parser.parse(arithmetic, "test1.rs").unwrap();
    let loop_tree = parser.parse(summing_loop, "test2.rs").unwrap();

    let similarity = calculate_tsed(&arithmetic_tree, &loop_tree, &options);
    println!("Similarity between addition and loop: {:.2}%", similarity * 100.0);

    // These are completely different - similarity should be low
    assert!(similarity < 0.5, "Different functions should have low similarity, got {}", similarity);
}

/// Two snippets that differ only in identifier names must score above 0.8.
#[test]
fn test_similar_functions_should_have_high_similarity() {
    let first = r#"
        let result = x + 1;
        result * 2
    "#;

    let renamed = r#"
        let temp = y + 1;
        temp * 2
    "#;

    let options = TSEDOptions::default();
    let mut parser = RustParser::new().unwrap();
    let first_tree = parser.parse(first, "test1.rs").unwrap();
    let renamed_tree = parser.parse(renamed, "test2.rs").unwrap();

    let similarity = calculate_tsed(&first_tree, &renamed_tree, &options);
    println!("Similarity between similar functions: {:.2}%", similarity * 100.0);

    // These are very similar - similarity should be high
    assert!(similarity > 0.8, "Similar functions should have high similarity, got {}", similarity);
}

/// Dump the parsed tree for a tiny snippet and check it has more than five nodes.
#[test]
fn test_ast_tree_structure() {
    /// Print `node` and its descendants, one per line, indented by depth.
    fn print_tree(node: &similarity_core::tree::TreeNode, depth: usize) {
        let indent = " ".repeat(depth);
        if !node.value.is_empty() {
            println!("{}{} = '{}'", indent, node.label, node.value);
        } else {
            println!("{}{}", indent, node.label);
        }
        node.children.iter().for_each(|child| print_tree(child, depth + 1));
    }

    let snippet = r#"
        let result = x + 1;
        result * 2
    "#;

    let mut parser = RustParser::new().unwrap();
    let tree = parser.parse(snippet, "test.rs").unwrap();

    println!("=== AST Structure ===");
    print_tree(&tree, 0);

    // Check that the tree has reasonable structure
    assert!(tree.get_subtree_size() > 5, "Tree should have multiple nodes");
}
detection. The API and functionality may change significantly. Use at your own risk. 4 | 5 | A CSS/SCSS similarity detection tool that identifies duplicate styles, redundant rules, and BEM component variations. 6 | 7 | ## Features 8 | 9 | - **CSS and SCSS parsing** using tree-sitter 10 | - **Nested SCSS syntax flattening** with BEM notation support (`&__element`, `&--modifier`) 11 | - **Multiple similarity detection types**: 12 | - Exact duplicates 13 | - Style duplicates (same styles, different selectors) 14 | - BEM component variations 15 | - Selector conflicts 16 | - **Shorthand property expansion** for accurate comparison 17 | - **CSS specificity calculation** 18 | - **Multiple output formats**: standard, VSCode, JSON 19 | 20 | ## Installation 21 | 22 | This tool is part of the similarity workspace. Build it with: 23 | 24 | ```bash 25 | cargo build --release -p similarity-css 26 | ``` 27 | 28 | ## Usage 29 | 30 | ```bash 31 | # Analyze CSS files 32 | similarity-css path/to/css/ 33 | 34 | # Analyze SCSS files 35 | similarity-css --scss path/to/scss/ 36 | 37 | # Set custom threshold (0.0-1.0) 38 | similarity-css --threshold 0.7 path/to/css/ 39 | 40 | # Different output formats 41 | similarity-css --output json path/to/css/ 42 | similarity-css --output vscode path/to/css/ 43 | ``` 44 | 45 | ## Examples 46 | 47 | ### Analyzing BEM components 48 | 49 | ```bash 50 | similarity-css --scss examples/scss-bem/ 51 | ``` 52 | 53 | This will detect: 54 | - Duplicate button styles (`.btn` vs `.button`) 55 | - Similar form input styles 56 | - BEM modifier variations 57 | 58 | ### Output Example 59 | 60 | ``` 61 | === CSS Similarity Analysis Results === 62 | 63 | ## Similar Styles Found: 74 64 | 65 | 1. .btn and .button (similarity: 60.00%) 66 | Files: button.scss and button.scss 67 | Lines: 2-14 and 138-149 68 | 69 | ## BEM Component Variations Found: 37 70 | 71 | 1. 
/**
 * Property names that only carry source positions. They are dropped so that
 * two structurally equal trees compare equal regardless of where they appear.
 * Hoisted to module scope so the set is not rebuilt on every recursive call.
 */
const POSITION_KEYS: ReadonlySet<string> = new Set(["range", "loc", "span", "start", "end"]);

/**
 * Extract structure from AST node with proper type handling.
 *
 * Returns a deep copy of `node` without the position keys. Non-object values
 * are returned unchanged. When the node has a truthy `type`, it is emitted as
 * the first key of the result.
 */
function extractStructure(node: ASTNode | any): any {
  if (!node || typeof node !== "object") {
    return node;
  }

  const result: any = {};

  // Written first so `type` leads the key order in the serialized output.
  if (node.type) {
    result.type = node.type;
  }

  for (const [key, value] of Object.entries(node)) {
    if (POSITION_KEYS.has(key)) continue;

    if (Array.isArray(value)) {
      result[key] = value.map((item) => extractStructure(item));
    } else if (typeof value === "object" && value !== null) {
      result[key] = extractStructure(value);
    } else if (key !== "type") {
      // Primitive `type` values were already copied above.
      result[key] = value;
    }
  }

  return result;
}

/**
 * Convert AST to string representation.
 *
 * @param ast - A parse result (its `program` is used) or an AST node.
 * @returns Pretty-printed JSON (2-space indent) of the position-free structure.
 */
export function astToString(ast: ParseResult | Program | ASTNode): string {
  // Handle ParseResult from oxc-parser
  if ("program" in ast && !("type" in ast)) {
    return JSON.stringify(extractStructure(ast.program), null, 2);
  }
  // Handle direct AST nodes
  return JSON.stringify(extractStructure(ast), null, 2);
}

/**
 * Calculate similarity between two code strings.
 *
 * Both strings are parsed and compared with {@link compareStructures}.
 *
 * @returns A score where 1.0 means identical structure. If parsing or
 *   comparison throws, falls back to exact string equality (1.0 or 0.0).
 */
export function calculateSimilarity(code1: string, code2: string): number {
  try {
    const ast1 = parseTypeScript("file1.ts", code1);
    const ast2 = parseTypeScript("file2.ts", code2);

    // Single source of truth for the metric; previously duplicated here.
    return compareStructures(ast1, ast2).similarity;
  } catch {
    // If parsing fails, fall back to simple string comparison
    return code1 === code2 ? 1.0 : 0.0;
  }
}

/**
 * Compare structures and return similarity metrics.
 *
 * @returns The Levenshtein `distance` between the two serialized structures,
 *   the longer string's length (`maxLength`), the derived `similarity`
 *   (`1 - distance / maxLength`, or 1.0 when both strings are empty), and the
 *   two serialized structures themselves.
 */
export function compareStructures(
  ast1: ParseResult,
  ast2: ParseResult,
): {
  similarity: number;
  distance: number;
  maxLength: number;
  structure1: string;
  structure2: string;
} {
  const str1 = astToString(ast1);
  const str2 = astToString(ast2);

  const distance = levenshtein(str1, str2);
  const maxLength = Math.max(str1.length, str2.length);
  const similarity = maxLength === 0 ? 1.0 : 1 - distance / maxLength;

  return {
    similarity,
    distance,
    maxLength,
    structure1: str1,
    structure2: str2,
  };
}