├── crates
    ├── similarity-php
    │   ├── src
    │   │   └── lib.rs
    │   └── Cargo.toml
    ├── similarity-py
    │   ├── src
    │   │   └── lib.rs
    │   └── Cargo.toml
    ├── similarity-rs
    │   ├── src
    │   │   └── lib.rs
    │   ├── Cargo.toml
    │   └── tests
    │   │   ├── test_min_tokens.rs
    │   │   ├── parser_test.rs
    │   │   ├── test_rename_zero.rs
    │   │   ├── test_debug_rename_cost.rs
    │   │   ├── test_full_function_similarity.rs
    │   │   ├── debug_ast.rs
    │   │   ├── test_tsed_debugging.rs
    │   │   ├── test_function_extraction.rs
    │   │   └── test_ast_comparison.rs
    ├── similarity-ts
    │   ├── src
    │   │   └── lib.rs
    │   ├── Cargo.toml
    │   └── tests
    │   │   └── tsx_test.rs
    ├── similarity-elixir
    │   ├── src
    │   │   └── lib.rs
    │   ├── tests
    │   │   └── elixir_test_helper.rs
    │   ├── Cargo.toml
    │   └── README.md
    ├── similarity-generic
    │   ├── language_configs
    │   │   ├── ruby.json
    │   │   ├── c.json
    │   │   ├── cpp.json
    │   │   ├── java.json
    │   │   ├── csharp.json
    │   │   └── go.json
    │   ├── examples
    │   │   ├── sample.go
    │   │   ├── configs
    │   │   │   ├── go.json
    │   │   │   └── custom-language-template.json
    │   │   └── usage.sh
    │   ├── Cargo.toml
    │   └── build.rs
    ├── core
    │   ├── tests
    │   │   ├── fixtures
    │   │   │   ├── sample2.ts
    │   │   │   └── sample1.ts
    │   │   ├── debug_similarity.rs
    │   │   └── ast_fingerprint_test.rs
    │   ├── README.md
    │   ├── src
    │   │   ├── tree.rs
    │   │   ├── cli_output.rs
    │   │   ├── cli_file_utils.rs
    │   │   └── ast_exchange.rs
    │   └── Cargo.toml
    ├── similarity-css
    │   ├── Cargo.toml
    │   ├── src
    │   │   ├── lib.rs
    │   │   └── bin
    │   │   │   └── test_parser.rs
    │   ├── examples
    │   │   ├── test.css
    │   │   └── test.scss
    │   └── README.md
    └── similarity-md
    │   ├── src
    │       └── lib.rs
    │   ├── examples
    │       ├── japanese_similarity_test.md
    │       └── test_levenshtein.rs
    │   └── Cargo.toml
├── test
    └── __fixtures__
    │   ├── edge_cases
    │       ├── empty_1.ts
    │       ├── empty_2.ts
    │       ├── syntax_error_1.ts
    │       ├── syntax_error_2.ts
    │       ├── identical_1.ts
    │       └── identical_2.ts
    │   ├── similar
    │       ├── function_rename_2.ts
    │       ├── function_rename_1.ts
    │       ├── interface_extend_1.ts
    │       ├── interface_extend_2.ts
    │       ├── async_function_2.ts
    │       ├── async_function_1.ts
    │       ├── class_rename_1.ts
    │       └── class_rename_2.ts
    │   ├── dissimilar
    │       ├── function_vs_class_1.ts
    │       ├── sync_vs_async_1.ts
    │       ├── interface_vs_type_1.ts
    │       ├── interface_vs_type_2.ts
    │       ├── function_vs_class_2.ts
    │       ├── sync_vs_async_2.ts
    │       ├── imperative_vs_functional_2.ts
    │       └── imperative_vs_functional_1.ts
    │   ├── performance
    │       ├── small
    │       │   ├── small_3.ts
    │       │   ├── small_4.ts
    │       │   ├── small_5.ts
    │       │   ├── small_1.ts
    │       │   └── small_2.ts
    │       └── metadata.json
    │   ├── duplication
    │       ├── structural
    │       │   ├── metadata.json
    │       │   ├── loop_pattern_1.ts
    │       │   ├── visitnode_pattern_1.ts
    │       │   ├── visitnode_pattern_3.ts
    │       │   ├── visitnode_pattern_2.ts
    │       │   ├── array_iteration_pattern_2.ts
    │       │   ├── error_handling_pattern_2.ts
    │       │   ├── array_iteration_pattern_1.ts
    │       │   └── error_handling_pattern_1.ts
    │       ├── semantic
    │       │   ├── async_operations_1.ts
    │       │   ├── async_operations_2.ts
    │       │   └── validation_pattern_1.ts
    │       ├── exact
    │       │   ├── service_duplication_1.ts
    │       │   └── service_duplication_2.ts
    │       └── copy_paste
    │       │   └── loop_pattern.ts
    │   └── refactoring
    │       └── class_to_function
    │           ├── calculator_class.ts
    │           ├── user_service_functions.ts
    │           ├── calculator_functions.ts
    │           ├── user_service_class.ts
    │           ├── metadata.json
    │           └── repository_class.ts
├── rust-toolchain.toml
├── .rustfmt.toml
├── clippy.toml
├── __deprecated
    ├── tsdown.config.ts
    ├── vitest.config.ts
    ├── package-build.json
    ├── src
    │   ├── core
    │   │   ├── levenshtein.ts
    │   │   ├── oxc_types.ts
    │   │   ├── ast_traversal.ts
    │   │   └── ast.ts
    │   ├── cli
    │   │   └── io.ts
    │   └── parser.ts
    └── test
    │   └── basic.test.ts
├── examples
    ├── identical_enums.rs
    ├── test_enums.rs
    ├── overlap-detection
    │   ├── file2.js
    │   ├── file1.js
    │   ├── exact-duplication.js
    │   ├── false-positives.js
    │   ├── similar-patterns.js
    │   └── partial-overlap.js
    ├── specs
    │   ├── sample_project
    │   │   └── src
    │   │   │   ├── models
    │   │   │       ├── product.ts
    │   │   │       ├── order.ts
    │   │   │       └── user.ts
    │   │   │   ├── utils
    │   │   │       ├── errors.ts
    │   │   │       ├── validator.ts
    │   │   │       └── logger.ts
    │   │   │   ├── components
    │   │   │       ├── user_list.ts
    │   │   │       └── product_list.ts
    │   │   │   └── services
    │   │   │       ├── user_service.ts
    │   │   │       ├── product_service.ts
    │   │   │       └── order_service.ts
    │   ├── test_cli.ts
    │   ├── README.md
    │   ├── duplicate-types.ts
    │   ├── test_async.ts
    │   ├── test_extraction.ts
    │   ├── debug_arrow.ts
    │   ├── duplicate-functions.ts
    │   ├── duplicate-functions2.ts
    │   ├── basic_usage.ts
    │   ├── type-similarity
    │   │   ├── test_types_sample.ts
    │   │   └── test_type_literal_sample.ts
    │   └── debug_ast.ts
    ├── README.md
    ├── mixed_language_project
    │   ├── helpers.py
    │   └── utils.js
    ├── test_structure_comparison.ts
    ├── duplicate_python.py
    ├── test_rust_structures.rs
    ├── test_different_structures.rs
    ├── test_different_ts_structures.ts
    ├── rust_types_example.rs
    └── test_rust_with_derives.rs
├── .mcp.json
├── .oxlintrc.json
├── .cargo
    └── audit.toml
├── benchmarks
    ├── data
    │   ├── test_simple.ts
    │   └── test_duplicates.ts
    └── README.md
├── benchmark_results.txt
├── .claude
    └── settings.json
├── biome.json
├── docs
    ├── algorithm
    │   ├── README.md
    │   └── tsed-similarity-summary.md
    ├── implementation
    │   ├── README.md
    │   └── performance-baseline.md
    ├── lib
    │   └── README.md
    ├── README.md
    ├── prompt-ja.md
    └── prompt.md
├── Cargo.toml
├── KNOWN_ISSUES.md
├── LICENSE
├── RECOMMENDATIONS.md
└── RELEASE_NOTES.md


/crates/similarity-php/src/lib.rs:
--------------------------------------------------------------------------------
1 | pub mod php_parser;
2 | 


--------------------------------------------------------------------------------
/crates/similarity-py/src/lib.rs:
--------------------------------------------------------------------------------
1 | pub mod python_parser;
2 | 


--------------------------------------------------------------------------------
/crates/similarity-rs/src/lib.rs:
--------------------------------------------------------------------------------
1 | pub mod rust_parser;
2 | 


--------------------------------------------------------------------------------
/test/__fixtures__/edge_cases/empty_1.ts:
--------------------------------------------------------------------------------
1 | // Edge case: Empty file


--------------------------------------------------------------------------------
/test/__fixtures__/edge_cases/empty_2.ts:
--------------------------------------------------------------------------------
1 | // Edge case: Empty file with different comment


--------------------------------------------------------------------------------
/rust-toolchain.toml:
--------------------------------------------------------------------------------
1 | [toolchain]
2 | channel = "stable"
3 | components = ["rustfmt", "clippy"]


--------------------------------------------------------------------------------
/crates/similarity-ts/src/lib.rs:
--------------------------------------------------------------------------------
1 | pub mod parallel;
2 | pub mod sequential;
3 | pub mod typescript_parser;
4 | 


--------------------------------------------------------------------------------
/.rustfmt.toml:
--------------------------------------------------------------------------------
1 | # Rust formatting configuration
2 | edition = "2021"
3 | max_width = 100
4 | use_small_heuristics = "Max"


--------------------------------------------------------------------------------
/clippy.toml:
--------------------------------------------------------------------------------
1 | # Clippy linting configuration
2 | cognitive-complexity-threshold = 30
3 | too-many-arguments-threshold = 7


--------------------------------------------------------------------------------
/crates/similarity-elixir/src/lib.rs:
--------------------------------------------------------------------------------
1 | pub mod elixir_parser;
2 | pub mod parallel;
3 | 
4 | pub use elixir_parser::ElixirParser;
5 | 


--------------------------------------------------------------------------------
/test/__fixtures__/edge_cases/syntax_error_1.ts:
--------------------------------------------------------------------------------
1 | // Edge case: Syntax error
2 | function broken( {
3 |   return "missing closing brace";


--------------------------------------------------------------------------------
/test/__fixtures__/edge_cases/syntax_error_2.ts:
--------------------------------------------------------------------------------
1 | // Edge case: Different syntax error
2 | class Incomplete {
3 |   constructor() {
4 |     this.value = 


--------------------------------------------------------------------------------
/__deprecated/tsdown.config.ts:
--------------------------------------------------------------------------------
1 | import { defineConfig } from "tsdown";
2 | 
3 | export default defineConfig({ // tsdown build configuration for the deprecated TypeScript implementation.
4 |   entry: {}, // No entry points are declared in this file.
5 |   external: ["oxc-parser"], // NOTE(review): presumably leaves oxc-parser unbundled — confirm against the tsdown docs.
6 | });
7 | 


--------------------------------------------------------------------------------
/examples/identical_enums.rs:
--------------------------------------------------------------------------------
 1 | enum Color1 { // Example enum with three unit variants.
 2 |     Red,
 3 |     Green,
 4 |     Blue,
 5 | }
 6 | 
 7 | enum Color2 { // Same variants in the same order as Color1; only the enum name differs.
 8 |     Red,
 9 |     Green,
10 |     Blue,
11 | }


--------------------------------------------------------------------------------
/test/__fixtures__/similar/function_rename_2.ts:
--------------------------------------------------------------------------------
1 | // Similar: Function with renamed identifiers
2 | function addNumbers(x: number, y: number): number { // Returns the sum of x and y.
3 |   return x + y;
4 | }


--------------------------------------------------------------------------------
/.mcp.json:
--------------------------------------------------------------------------------
1 | {
2 |   "mcpServers": {
3 |     "rust": {
4 |       "command": "npx",
5 |       "args": ["-y", "@mizchi/lsmcp", "--bin", "rust-analyzer"]
6 |     }
7 |   }
8 | }
9 | 


--------------------------------------------------------------------------------
/test/__fixtures__/similar/function_rename_1.ts:
--------------------------------------------------------------------------------
1 | // Similar: Function with renamed identifiers
2 | function calculateSum(a: number, b: number): number { // Returns the sum of a and b.
3 |   return a + b;
4 | }


--------------------------------------------------------------------------------
/examples/test_enums.rs:
--------------------------------------------------------------------------------
 1 | enum Status { // Example enum with three unit variants.
 2 |     Active,
 3 |     Inactive,
 4 |     Pending,
 5 | }
 6 | 
 7 | enum State { // Same shape as Status (three unit variants) with different variant names.
 8 |     Running,
 9 |     Stopped,
10 |     Waiting,
11 | }


--------------------------------------------------------------------------------
/test/__fixtures__/dissimilar/function_vs_class_1.ts:
--------------------------------------------------------------------------------
1 | // Dissimilar: Function implementation
2 | function processData(data: string[]): string { // Drops empty strings, then joins the remaining ones with ', '.
3 |   return data.filter(item => item.length > 0).join(', ');
4 | }


--------------------------------------------------------------------------------
/.oxlintrc.json:
--------------------------------------------------------------------------------
1 | {
2 |   "rules": {
3 |     "no-console": "off",
4 |     "typescript/no-explicit-any": "off"
5 |   },
6 |   "ignorePatterns": ["node_modules", "dist", "build", "coverage", "*.min.js", "target", "test/__fixtures__"]
7 | }
8 | 


--------------------------------------------------------------------------------
/test/__fixtures__/dissimilar/sync_vs_async_1.ts:
--------------------------------------------------------------------------------
1 | // Dissimilar: Synchronous code
2 | function calculateTotal(items: number[]): number { // Returns the sum of every element of items.
3 |   let total = 0; // Running sum; remains 0 when items is empty.
4 |   for (const item of items) {
5 |     total += item;
6 |   }
7 |   return total;
8 | }


--------------------------------------------------------------------------------
/.cargo/audit.toml:
--------------------------------------------------------------------------------
1 | [advisories]
2 | # List of advisory IDs to ignore
3 | ignore = [
4 |     # `instant` is unmaintained; pulled in transitively via vibrato -> rucrf -> argmin
5 |     "RUSTSEC-2024-0384",
6 |     # `paste` is unmaintained; pulled in transitively via vibrato -> rucrf -> argmin
7 |     "RUSTSEC-2024-0436",
8 | ]


--------------------------------------------------------------------------------
/test/__fixtures__/similar/interface_extend_1.ts:
--------------------------------------------------------------------------------
 1 | // Similar: Interface with minor additions
 2 | interface BaseConfig { // Base shape: one string field (apiUrl) and two numeric fields.
 3 |   apiUrl: string;
 4 |   timeout: number;
 5 |   retryCount: number;
 6 | }
 7 | 
 8 | interface AppConfig extends BaseConfig { // Adds a single boolean field to BaseConfig.
 9 |   debug: boolean;
10 | }


--------------------------------------------------------------------------------
/benchmarks/data/test_simple.ts:
--------------------------------------------------------------------------------
 1 | function add(a: number, b: number): number { // Returns a + b.
 2 |   return a + b;
 3 | }
 4 | 
 5 | function subtract(a: number, b: number): number { // Returns a - b.
 6 |   return a - b;
 7 | }
 8 | 
 9 | function multiply(a: number, b: number): number { // Returns a * b.
10 |   return a * b;
11 | }
12 | 


--------------------------------------------------------------------------------
/__deprecated/vitest.config.ts:
--------------------------------------------------------------------------------
 1 | import { defineConfig } from "vitest/config";
 2 | export default defineConfig({ // Vitest configuration for the deprecated TypeScript implementation.
 3 |   test: {
 4 |     include: ["test/**/*.test.ts"], // Test files: anything under test/ ending in .test.ts.
 5 |     coverage: {
 6 |       include: ["src/**/*.ts"], // Coverage is collected for TypeScript sources under src/.
 7 |       exclude: ["test/**", "examples/**", "scripts/**"], // These trees are left out of coverage.
 8 |     },
 9 |   },
10 | });
11 | 


--------------------------------------------------------------------------------
/test/__fixtures__/similar/interface_extend_2.ts:
--------------------------------------------------------------------------------
 1 | // Similar: Interface with minor additions
 2 | interface BaseSettings { // Base shape: one string field (apiUrl) and two numeric fields.
 3 |   apiUrl: string;
 4 |   timeout: number;
 5 |   retryCount: number;
 6 | }
 7 | 
 8 | interface ApplicationSettings extends BaseSettings { // Adds debug and logLevel to BaseSettings.
 9 |   debug: boolean;
10 |   logLevel: string;
11 | }


--------------------------------------------------------------------------------
/examples/overlap-detection/file2.js:
--------------------------------------------------------------------------------
 1 | // File 2: Contains similar patterns
 2 | 
 3 | function transformData(data) { // Returns value * 2 for every entry of data whose active field is truthy.
 4 |     const output = [];
 5 |     for (let i = 0; i < data.length; i++) {
 6 |         if (data[i].active) { // Entries with a falsy active field are skipped.
 7 |             output.push(data[i].value * 2);
 8 |         }
 9 |     }
10 |     return output;
11 | }


--------------------------------------------------------------------------------
/test/__fixtures__/similar/async_function_2.ts:
--------------------------------------------------------------------------------
1 | // Similar: Async function with same logic
2 | async function getUserInfo(id: string): Promise<User> { // Fetches /api/users/<id> and resolves with the parsed JSON body.
3 |   const res = await fetch(`/api/users/${id}`);
4 |   if (!res.ok) { // Any response whose ok flag is false is reported as 'User not found'.
5 |     throw new Error('User not found');
6 |   }
7 |   const userData = await res.json(); // Returned as-is; the parsed body is not checked against User here.
8 |   return userData;
9 | }


--------------------------------------------------------------------------------
/examples/overlap-detection/file1.js:
--------------------------------------------------------------------------------
 1 | // File 1: Contains functions with overlapping patterns
 2 | 
 3 | function processItems(items) { // Returns value * 2 for every entry of items whose active field is truthy.
 4 |     const results = [];
 5 |     for (let i = 0; i < items.length; i++) {
 6 |         if (items[i].active) { // Entries with a falsy active field are skipped.
 7 |             results.push(items[i].value * 2);
 8 |         }
 9 |     }
10 |     return results;
11 | }


--------------------------------------------------------------------------------
/test/__fixtures__/similar/async_function_1.ts:
--------------------------------------------------------------------------------
1 | // Similar: Async function with same logic
2 | async function fetchUserData(userId: string): Promise<User> { // Fetches /api/users/<userId> and resolves with the parsed JSON body.
3 |   const response = await fetch(`/api/users/${userId}`);
4 |   if (!response.ok) { // Any response whose ok flag is false is reported as 'User not found'.
5 |     throw new Error('User not found');
6 |   }
7 |   const data = await response.json(); // Returned as-is; the parsed body is not checked against User here.
8 |   return data;
9 | }


--------------------------------------------------------------------------------
/test/__fixtures__/dissimilar/interface_vs_type_1.ts:
--------------------------------------------------------------------------------
 1 | // Dissimilar: Interface declaration
 2 | interface DatabaseConnection { // Five connection fields plus promise-returning connect, disconnect and query methods.
 3 |   host: string;
 4 |   port: number;
 5 |   username: string;
 6 |   password: string;
 7 |   database: string;
 8 |   
 9 |   connect(): Promise<void>;
10 |   disconnect(): Promise<void>;
11 |   query<T>(sql: string, params?: any[]): Promise<T[]>; // params is optional; the row type T is chosen by the caller.
12 | }


--------------------------------------------------------------------------------
/examples/specs/sample_project/src/models/product.ts:
--------------------------------------------------------------------------------
 1 | export interface Product { // Product record; every field is required except updatedAt.
 2 |   id: string;
 3 |   name: string;
 4 |   description: string;
 5 |   price: number;
 6 |   category: string;
 7 |   stock: number;
 8 |   createdAt: Date;
 9 |   updatedAt?: Date;
10 | }
11 | 
12 | export interface ProductVariant extends Product { // Product plus parentId, sku and a string-to-string attributes map.
13 |   parentId: string;
14 |   sku: string;
15 |   attributes: Record<string, string>;
16 | }
17 | 


--------------------------------------------------------------------------------
/test/__fixtures__/performance/small/small_3.ts:
--------------------------------------------------------------------------------
 1 | // Small test file 3
 2 | 
 3 | import { Component } from '@angular/core';
 4 | import { Observable } from 'rxjs';
 5 | 
 6 | export function calculate3(a: number, b: number): number { // Adds a and b, logs the sum, and returns it.
 7 |   const result = a + b;
 8 |   console.log('Result:', result); // Side effect: writes to the console on every call.
 9 |   return result;
10 | }
11 | 
12 | interface Data3 { // Record with numeric id and value and a string name.
13 |   id: number;
14 |   name: string;
15 |   value: number;
16 | }
17 | 
18 | 


--------------------------------------------------------------------------------
/test/__fixtures__/performance/small/small_4.ts:
--------------------------------------------------------------------------------
 1 | // Small test file 4
 2 | 
 3 | import { Component } from '@angular/core';
 4 | import { Observable } from 'rxjs';
 5 | 
 6 | export function calculate4(a: number, b: number): number { // Adds a and b, logs the sum, and returns it.
 7 |   const result = a + b;
 8 |   console.log('Result:', result); // Side effect: writes to the console on every call.
 9 |   return result;
10 | }
11 | 
12 | interface Data4 { // Record with numeric id and value and a string name.
13 |   id: number;
14 |   name: string;
15 |   value: number;
16 | }
17 | 
18 | 


--------------------------------------------------------------------------------
/test/__fixtures__/performance/small/small_5.ts:
--------------------------------------------------------------------------------
 1 | // Small test file 5
 2 | 
 3 | import { Component } from '@angular/core';
 4 | import { Observable } from 'rxjs';
 5 | 
 6 | export function calculate5(a: number, b: number): number { // Adds a and b, logs the sum, and returns it.
 7 |   const result = a + b;
 8 |   console.log('Result:', result); // Side effect: writes to the console on every call.
 9 |   return result;
10 | }
11 | 
12 | interface Data5 { // Record with numeric id and value and a string name.
13 |   id: number;
14 |   name: string;
15 |   value: number;
16 | }
17 | 
18 | 


--------------------------------------------------------------------------------
/test/__fixtures__/dissimilar/interface_vs_type_2.ts:
--------------------------------------------------------------------------------
 1 | // Dissimilar: Type aliases and utility types
 2 | type Status = 'pending' | 'active' | 'completed' | 'failed'; // Union of four string literals.
 3 | 
 4 | type UserRole = 'admin' | 'user' | 'guest'; // Union of three string literals.
 5 | 
 6 | type ApiResponse<T> = { // Generic response shape: success and timestamp are required, data and error are optional.
 7 |   success: boolean;
 8 |   data?: T;
 9 |   error?: string;
10 |   timestamp: number;
11 | };
12 | 
13 | type DeepPartial<T> = { // Makes every property of T optional, recursing into properties that extend object.
14 |   [P in keyof T]?: T[P] extends object ? DeepPartial<T[P]> : T[P];
15 | };


--------------------------------------------------------------------------------
/crates/similarity-elixir/tests/elixir_test_helper.rs:
--------------------------------------------------------------------------------
 1 | use std::io::Write;
 2 | use std::path::PathBuf;
 3 | use tempfile::TempDir;
 4 | 
 5 | /// Writes `content` followed by a newline to `test.ex` inside a fresh temporary directory.
 6 | ///
 7 | /// Returns the `TempDir` handle together with the path of the written file.
 8 | /// Panics (via `unwrap`) if the directory or file cannot be created, written, or flushed.
 9 | /// NOTE(review): callers presumably must keep the returned `TempDir` alive while using the
10 | /// file — confirm against the `tempfile` documentation.
11 | pub fn create_elixir_file(content: &str) -> (TempDir, PathBuf) {
12 |     let temp_dir = TempDir::new().unwrap();
13 |     let source_path = temp_dir.path().join("test.ex");
14 |     let mut handle = std::fs::File::create(&source_path).unwrap();
15 |     // Emit the text plus exactly one trailing '\n'.
16 |     handle.write_all(format!("{content}\n").as_bytes()).unwrap();
17 |     handle.flush().unwrap();
18 |     (temp_dir, source_path)
19 | }
20 | 


--------------------------------------------------------------------------------
/test/__fixtures__/dissimilar/function_vs_class_2.ts:
--------------------------------------------------------------------------------
 1 | // Dissimilar: Class implementation
 2 | class DataProcessor { // Caches the result of transform() per input string.
 3 |   private cache: Map<string, string> = new Map(); // Maps an input string to its transformed output.
 4 | 
 5 |   process(input: string): string { // Returns the cached result for input, computing and storing it on a miss.
 6 |     if (this.cache.has(input)) {
 7 |       return this.cache.get(input)!; // The non-null assertion relies on the has() check above.
 8 |     }
 9 |     const result = this.transform(input);
10 |     this.cache.set(input, result);
11 |     return result;
12 |   }
13 | 
14 |   private transform(data: string): string { // Returns data converted to upper case.
15 |     return data.toUpperCase();
16 |   }
17 | }


--------------------------------------------------------------------------------
/test/__fixtures__/similar/class_rename_1.ts:
--------------------------------------------------------------------------------
 1 | // Similar: Class with renamed identifiers
 2 | class UserService { // Keeps User objects in an in-memory array and looks them up by id.
 3 |   private users: User[] = [];
 4 | 
 5 |   addUser(user: User): void { // Appends user; no duplicate-id check is performed.
 6 |     this.users.push(user);
 7 |   }
 8 | 
 9 |   getUser(id: number): User | undefined { // Returns the first user whose id matches, or undefined.
10 |     return this.users.find(u => u.id === id);
11 |   }
12 | 
13 |   removeUser(id: number): boolean { // Removes the first user with this id; returns whether one was removed.
14 |     const index = this.users.findIndex(u => u.id === id);
15 |     if (index !== -1) { // findIndex yields -1 when nothing matches.
16 |       this.users.splice(index, 1);
17 |       return true;
18 |     }
19 |     return false;
20 |   }
21 | }


--------------------------------------------------------------------------------
/test/__fixtures__/performance/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "description": "Performance test fixtures of various sizes",
 3 |   "categories": {
 4 |     "small": {
 5 |       "description": "Small files (10-50 lines)",
 6 |       "averageSize": "~30 lines",
 7 |       "files": 5
 8 |     },
 9 |     "medium": {
10 |       "description": "Medium files (100-500 lines)",
11 |       "averageSize": "~300 lines",
12 |       "files": 3
13 |     },
14 |     "large": {
15 |       "description": "Large files (1000+ lines)",
16 |       "averageSize": "~1500 lines",
17 |       "files": 2
18 |     }
19 |   }
20 | }


--------------------------------------------------------------------------------
/test/__fixtures__/duplication/structural/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "description": "Structural duplication patterns found in the codebase",
 3 |   "testCases": [
 4 |     {
 5 |       "name": "visitNode pattern variations",
 6 |       "files": [
 7 |         "visitnode_pattern_1.ts",
 8 |         "visitnode_pattern_2.ts",
 9 |         "visitnode_pattern_3.ts"
10 |       ],
11 |       "expectedSimilarity": {
12 |         "1_vs_2": 0.75,
13 |         "1_vs_3": 0.70,
14 |         "2_vs_3": 0.65
15 |       },
16 |       "notes": "Similar AST traversal patterns with different purposes"
17 |     }
18 |   ]
19 | }


--------------------------------------------------------------------------------
/benchmark_results.txt:
--------------------------------------------------------------------------------
 1 |     Finished `bench` profile [optimized] target(s) in 0.04s
 2 |      Running unittests src/lib.rs (target/release/deps/ts_similarity_core-f9e4be796569fb52)
 3 | 
 4 | running 0 tests
 5 | 
 6 | test result: ok. 0 passed; 0 failed; 0 ignored; 0 measured; 23 filtered out; finished in 0.00s
 7 | 
 8 |      Running benches/function_comparison.rs (target/release/deps/function_comparison-d48bd7340358873b)
 9 | Gnuplot not found, using plotters backend
10 |      Running benches/tsed_benchmark.rs (target/release/deps/tsed_benchmark-444e570d13af4368)
11 | Gnuplot not found, using plotters backend
12 | 


--------------------------------------------------------------------------------
/.claude/settings.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "permissions": {
 3 |     "allow": [
 4 |       "Bash(pnpm test)",
 5 |       "Bash(ls:*)",
 6 |       "Bash(grep:*)",
 7 |       "mcp__typescript__find_references",
 8 |       "mcp__typescript__get_definitions",
 9 |       "mcp__typescript__get_diagnostics",
10 |       "mcp__typescript__get_module_symbols",
11 |       "mcp__typescript__get_type_in_module",
12 |       "mcp__typescript__move_file",
13 |       "mcp__typescript__rename_symbol",
14 |       "mcp__typescript__delete_symbol",
15 |       "mcp__typescript__get_type_at_symbol"
16 |     ],
17 |     "deny": []
18 |   }
19 | }
20 | 


--------------------------------------------------------------------------------
/__deprecated/package-build.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "ts-similarity",
 3 |   "version": "1.0.0",
 4 |   "description": "TypeScript implementation (deprecated)",
 5 |   "type": "module",
 6 |   "main": "index.js",
 7 |   "bin": {
 8 |     "ts-similarity": "./dist/cli/cli.mjs"
 9 |   },
10 |   "scripts": {
11 |     "test": "vitest run",
12 |     "test:watch": "vitest",
13 |     "test:cov": "vitest run --coverage",
14 |     "build": "tsdown src/index.ts --outfile=dist/index.mjs --format=esm && tsdown src/cli/cli.ts --outfile=dist/cli/cli.mjs --format=esm --platform=node",
15 |     "prepublishOnly": "pnpm run build"
16 |   }
17 | }


--------------------------------------------------------------------------------
/crates/similarity-generic/language_configs/ruby.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "language": "ruby",
 3 |   "function_nodes": ["method", "singleton_method"],
 4 |   "type_nodes": ["class", "module"],
 5 |   "field_mappings": {
 6 |     "name_field": "name",
 7 |     "params_field": "parameters",
 8 |     "body_field": "body",
 9 |     "decorator_field": null,
10 |     "class_field": null
11 |   },
12 |   "value_nodes": ["identifier", "string", "integer", "float", "true", "false", "nil"],
13 |   "test_patterns": {
14 |     "attribute_patterns": [],
15 |     "name_prefixes": ["test_"],
16 |     "name_suffixes": ["_test", "_spec"]
17 |   }
18 | }


--------------------------------------------------------------------------------
/examples/specs/sample_project/src/models/order.ts:
--------------------------------------------------------------------------------
 1 | export interface Order { // Order record; shippedAt and deliveredAt are optional, everything else is required.
 2 |   id: string;
 3 |   userId: string;
 4 |   items: OrderItem[];
 5 |   status: OrderStatus;
 6 |   totalAmount: number;
 7 |   createdAt: Date;
 8 |   updatedAt: Date;
 9 |   shippedAt?: Date;
10 |   deliveredAt?: Date;
11 | }
12 | 
13 | export interface OrderItem { // Element type of Order.items; price is optional.
14 |   productId: string;
15 |   quantity: number;
16 |   price?: number;
17 | }
18 | 
19 | export enum OrderStatus { // String enum; each member's value is its name in lower case.
20 |   PENDING = "pending",
21 |   PROCESSING = "processing",
22 |   SHIPPED = "shipped",
23 |   DELIVERED = "delivered",
24 |   CANCELLED = "cancelled",
25 | }
26 | 


--------------------------------------------------------------------------------
/biome.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "$schema": "https://biomejs.dev/schemas/1.9.4/schema.json",
 3 |   "files": {
 4 |     "ignore": ["node_modules", "dist", "coverage", "*.min.js", "target", "test/__fixtures__", "pnpm-lock.yaml"]
 5 |   },
 6 |   "formatter": {
 7 |     "enabled": true,
 8 |     "formatWithErrors": false,
 9 |     "indentStyle": "space",
10 |     "indentWidth": 2,
11 |     "lineEnding": "lf",
12 |     "lineWidth": 120
13 |   },
14 |   "linter": {
15 |     "enabled": false
16 |   },
17 |   "javascript": {
18 |     "formatter": {
19 |       "quoteStyle": "double",
20 |       "semicolons": "always"
21 |     }
22 |   }
23 | }
24 | 


--------------------------------------------------------------------------------
/examples/specs/sample_project/src/models/user.ts:
--------------------------------------------------------------------------------
 1 | export interface User { // User record; updatedAt and lastLoginAt are optional, everything else is required.
 2 |   id: string;
 3 |   email: string;
 4 |   name: string;
 5 |   role: UserRole;
 6 |   createdAt: Date;
 7 |   updatedAt?: Date;
 8 |   lastLoginAt?: Date;
 9 | }
10 | 
11 | export enum UserRole { // String enum; each member's value is its name in lower case.
12 |   ADMIN = "admin",
13 |   USER = "user",
14 |   GUEST = "guest",
15 | }
16 | 
17 | export interface UserProfile extends User { // User plus optional bio and avatar and a required preferences object.
18 |   bio?: string;
19 |   avatar?: string;
20 |   preferences: UserPreferences;
21 | }
22 | 
23 | export interface UserPreferences { // Three required settings; theme is restricted to "light" or "dark".
24 |   theme: "light" | "dark";
25 |   language: string;
26 |   notifications: boolean;
27 | }
28 | 


--------------------------------------------------------------------------------
/test/__fixtures__/similar/class_rename_2.ts:
--------------------------------------------------------------------------------
 1 | // Similar: Class with renamed identifiers
 2 | class PersonManager { // Keeps Person objects in an in-memory array and looks them up by id.
 3 |   private people: Person[] = [];
 4 | 
 5 |   addPerson(person: Person): void { // Appends person; no duplicate-id check is performed.
 6 |     this.people.push(person);
 7 |   }
 8 | 
 9 |   getPerson(identifier: number): Person | undefined { // Returns the first person whose id matches, or undefined.
10 |     return this.people.find(p => p.id === identifier);
11 |   }
12 | 
13 |   removePerson(identifier: number): boolean { // Removes the first person with this id; returns whether one was removed.
14 |     const idx = this.people.findIndex(p => p.id === identifier);
15 |     if (idx !== -1) { // findIndex yields -1 when nothing matches.
16 |       this.people.splice(idx, 1);
17 |       return true;
18 |     }
19 |     return false;
20 |   }
21 | }


--------------------------------------------------------------------------------
/crates/similarity-generic/language_configs/c.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "language": "c",
 3 |   "function_nodes": ["function_definition"],
 4 |   "type_nodes": ["struct_specifier", "enum_specifier", "type_definition"],
 5 |   "field_mappings": {
 6 |     "name_field": "declarator",
 7 |     "params_field": "declarator",
 8 |     "body_field": "body",
 9 |     "decorator_field": null,
10 |     "class_field": null
11 |   },
12 |   "value_nodes": ["identifier", "string_literal", "number_literal", "true", "false", "null"],
13 |   "test_patterns": {
14 |     "attribute_patterns": [],
15 |     "name_prefixes": ["test_"],
16 |     "name_suffixes": ["_test"]
17 |   }
18 | }


--------------------------------------------------------------------------------
/test/__fixtures__/performance/small/small_1.ts:
--------------------------------------------------------------------------------
 1 | // Small test file 1
 2 | 
 3 | import { Component } from '@angular/core';
 4 | import { Observable } from 'rxjs';
 5 | 
 6 | export function calculate1(a: number, b: number): number {
 7 |   const result = a + b;
 8 |   console.log('Result:', result);
 9 |   return result;
10 | }
11 | 
12 | interface Data1 {
13 |   id: number;
14 |   name: string;
15 |   value: number;
16 | }
17 | 
18 | class Service1 {
19 |   private data: Data1[] = [];
20 | 
21 |   add(item: Data1): void {
22 |     this.data.push(item);
23 |   }
24 | 
25 |   get(id: number): Data1 | undefined {
26 |     return this.data.find(d => d.id === id);
27 |   }
28 | }
29 | 


--------------------------------------------------------------------------------
/test/__fixtures__/performance/small/small_2.ts:
--------------------------------------------------------------------------------
 1 | // Small test file 2
 2 | 
 3 | import { Component } from '@angular/core';
 4 | import { Observable } from 'rxjs';
 5 | 
 6 | export function calculate2(a: number, b: number): number {
 7 |   const result = a + b;
 8 |   console.log('Result:', result);
 9 |   return result;
10 | }
11 | 
12 | interface Data2 {
13 |   id: number;
14 |   name: string;
15 |   value: number;
16 | }
17 | 
18 | class Service2 {
19 |   private data: Data2[] = [];
20 | 
21 |   add(item: Data2): void {
22 |     this.data.push(item);
23 |   }
24 | 
25 |   get(id: number): Data2 | undefined {
26 |     return this.data.find(d => d.id === id);
27 |   }
28 | }
29 | 


--------------------------------------------------------------------------------
/crates/core/tests/fixtures/sample2.ts:
--------------------------------------------------------------------------------
 1 | // Test sample 2: Similar functions that should have high similarity
 2 | 
 3 | export function calculateSum(numbers: number[]): number {
 4 |     if (numbers.length === 0) return 0;
 5 |     
 6 |     let total = 0;
 7 |     for (const num of numbers) {
 8 |         total += num;
 9 |     }
10 |     
11 |     return total;
12 | }
13 | 
14 | // Very similar to calculateAverage from sample1.ts
15 | export function computeMean(values: number[]): number {
16 |     if (values.length === 0) return 0;
17 |     
18 |     let sum = 0;
19 |     for (const val of values) {
20 |         sum += val;
21 |     }
22 |     
23 |     return sum / values.length;
24 | }


--------------------------------------------------------------------------------
/crates/similarity-generic/language_configs/cpp.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "language": "cpp",
 3 |   "function_nodes": ["function_definition", "lambda_expression"],
 4 |   "type_nodes": ["class_specifier", "struct_specifier", "enum_specifier"],
 5 |   "field_mappings": {
 6 |     "name_field": "declarator",
 7 |     "params_field": "declarator",
 8 |     "body_field": "body",
 9 |     "decorator_field": null,
10 |     "class_field": null
11 |   },
12 |   "value_nodes": ["identifier", "string_literal", "number_literal", "true", "false", "nullptr"],
13 |   "test_patterns": {
14 |     "attribute_patterns": [],
15 |     "name_prefixes": ["test_", "Test"],
16 |     "name_suffixes": ["_test", "Test"]
17 |   }
18 | }


--------------------------------------------------------------------------------
/test/__fixtures__/duplication/structural/loop_pattern_1.ts:
--------------------------------------------------------------------------------
 1 | // Simple loop patterns that are structurally identical
 2 | // These demonstrate pure structural duplication
 3 | 
 4 | export function sumNumbers(numbers: number[]): number {
 5 |   let total = 0;
 6 |   for (const num of numbers) {
 7 |     total += num;
 8 |   }
 9 |   return total;
10 | }
11 | 
12 | export function sumPrices(prices: number[]): number {
13 |   let total = 0;
14 |   for (const price of prices) {
15 |     total += price;
16 |   }
17 |   return total;
18 | }
19 | 
20 | export function sumScores(scores: number[]): number {
21 |   let total = 0;
22 |   for (const score of scores) {
23 |     total += score;
24 |   }
25 |   return total;
26 | }


--------------------------------------------------------------------------------
/crates/similarity-css/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "similarity-css"
 3 | version = "0.4.2"
 4 | edition = "2021"
 5 | authors = ["similarity contributors"]
 6 | description = "CSS/SCSS similarity detection tool (experimental)"
 7 | license = "MIT"
 8 | repository = "https://github.com/mizchi/similarity"
 9 | 
10 | [dependencies]
11 | similarity-core = { version = "0.4.2", path = "../core" }
12 | clap = { version = "4.0", features = ["derive"] }
13 | tree-sitter = "0.24"
14 | tree-sitter-css = "0.23"
15 | tree-sitter-scss = "1.0"
16 | rayon = "1.7"
17 | serde = { version = "1.0", features = ["derive"] }
18 | serde_json = "1.0"
19 | indexmap = "2.0"
20 | ignore = "0.4"
21 | 
22 | [dev-dependencies]
23 | tempfile = "3.5"
24 | insta = "1.29"


--------------------------------------------------------------------------------
/crates/similarity-generic/language_configs/java.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "language": "java",
 3 |   "function_nodes": ["method_declaration", "constructor_declaration"],
 4 |   "type_nodes": ["class_declaration", "interface_declaration", "enum_declaration"],
 5 |   "field_mappings": {
 6 |     "name_field": "name",
 7 |     "params_field": "parameters",
 8 |     "body_field": "body",
 9 |     "decorator_field": "annotation",
10 |     "class_field": null
11 |   },
12 |   "value_nodes": ["identifier", "string_literal", "integer_literal", "floating_point_literal", "true", "false", "null_literal"],
13 |   "test_patterns": {
14 |     "attribute_patterns": ["@Test", "@ParameterizedTest"],
15 |     "name_prefixes": ["test"],
16 |     "name_suffixes": ["Test"]
17 |   }
18 | }


--------------------------------------------------------------------------------
/crates/similarity-md/src/lib.rs:
--------------------------------------------------------------------------------
 1 | pub mod levenshtein;
 2 | pub mod markdown_parser;
 3 | pub mod morphological_similarity;
 4 | pub mod section_extractor;
 5 | pub mod similarity_calculator;
 6 | 
 7 | pub use levenshtein::{
 8 |     levenshtein_distance, levenshtein_similarity, word_levenshtein_distance,
 9 |     word_levenshtein_similarity,
10 | };
11 | pub use markdown_parser::{MarkdownParser, MarkdownSection};
12 | pub use morphological_similarity::{
13 |     MorphemeToken, MorphologicalSimilarityCalculator, PosSimilarity,
14 | };
15 | pub use section_extractor::{ExtractedSection, SectionExtractor, SimilarTitlePair};
16 | pub use similarity_calculator::{
17 |     SimilarSectionPair, SimilarityCalculator, SimilarityOptions, SimilarityResult,
18 | };
19 | 


--------------------------------------------------------------------------------
/test/__fixtures__/duplication/structural/visitnode_pattern_1.ts:
--------------------------------------------------------------------------------
 1 | // visitNode pattern from function_body_comparer.ts
 2 | function visitNode(node: any, callback: (n: any) => void): void {
 3 |   if (!node || typeof node !== 'object') {
 4 |     return;
 5 |   }
 6 | 
 7 |   callback(node);
 8 | 
 9 |   // Handle arrays
10 |   if (Array.isArray(node)) {
11 |     for (const item of node) {
12 |       visitNode(item, callback);
13 |     }
14 |     return;
15 |   }
16 | 
17 |   // Skip certain properties
18 |   const skipKeys = new Set(['loc', 'range', 'start', 'end', 'parent']);
19 | 
20 |   // Visit all properties
21 |   for (const key in node) {
22 |     if (node.hasOwnProperty(key) && !skipKeys.has(key)) {
23 |       visitNode(node[key], callback);
24 |     }
25 |   }
26 | }


--------------------------------------------------------------------------------
/test/__fixtures__/refactoring/class_to_function/calculator_class.ts:
--------------------------------------------------------------------------------
 1 | // Stateful Calculator class
 2 | class Calculator {
 3 |   private value: number = 0;
 4 |   
 5 |   add(n: number): number {
 6 |     this.value += n;
 7 |     return this.value;
 8 |   }
 9 |   
10 |   subtract(n: number): number {
11 |     this.value -= n;
12 |     return this.value;
13 |   }
14 |   
15 |   multiply(n: number): number {
16 |     this.value *= n;
17 |     return this.value;
18 |   }
19 |   
20 |   divide(n: number): number {
21 |     if (n === 0) {
22 |       throw new Error('Division by zero');
23 |     }
24 |     this.value /= n;
25 |     return this.value;
26 |   }
27 |   
28 |   reset(): void {
29 |     this.value = 0;
30 |   }
31 |   
32 |   getValue(): number {
33 |     return this.value;
34 |   }
35 | }


--------------------------------------------------------------------------------
/crates/similarity-generic/language_configs/csharp.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "language": "csharp",
 3 |   "function_nodes": ["method_declaration", "constructor_declaration", "lambda_expression"],
 4 |   "type_nodes": ["class_declaration", "interface_declaration", "struct_declaration", "enum_declaration"],
 5 |   "field_mappings": {
 6 |     "name_field": "name",
 7 |     "params_field": "parameters",
 8 |     "body_field": "body",
 9 |     "decorator_field": "attribute",
10 |     "class_field": null
11 |   },
12 |   "value_nodes": ["identifier", "string_literal", "integer_literal", "real_literal", "true", "false", "null_literal"],
13 |   "test_patterns": {
14 |     "attribute_patterns": ["[Test]", "[TestMethod]", "[Fact]"],
15 |     "name_prefixes": ["Test"],
16 |     "name_suffixes": ["Test", "Tests"]
17 |   }
18 | }


--------------------------------------------------------------------------------
/crates/similarity-generic/examples/sample.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import "fmt"
 4 | 
 5 | // Similar functions that should be detected
 6 | func calculateSum(numbers []int) int {
 7 |     total := 0
 8 |     for _, num := range numbers {
 9 |         total += num
10 |     }
11 |     return total
12 | }
13 | 
14 | func computeTotal(values []int) int {
15 |     sum := 0
16 |     for _, val := range values {
17 |         sum += val
18 |     }
19 |     return sum
20 | }
21 | 
22 | // Different function
23 | func printMessage(msg string) {
24 |     fmt.Println("Message:", msg)
25 | }
26 | 
27 | // Test function (can be excluded with --skip-test)
28 | func TestCalculateSum(t *testing.T) {
29 |     result := calculateSum([]int{1, 2, 3})
30 |     if result != 6 {
31 |         t.Error("Expected 6")
32 |     }
33 | }


--------------------------------------------------------------------------------
/crates/similarity-generic/language_configs/go.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "language": "go",
 3 |   "function_nodes": [
 4 |     "function_declaration",
 5 |     "method_declaration"
 6 |   ],
 7 |   "type_nodes": [
 8 |     "type_declaration",
 9 |     "struct_type"
10 |   ],
11 |   "field_mappings": {
12 |     "name_field": "name",
13 |     "params_field": "parameters",
14 |     "body_field": "body"
15 |   },
16 |   "value_nodes": [
17 |     "identifier",
18 |     "interpreted_string_literal",
19 |     "raw_string_literal",
20 |     "int_literal",
21 |     "float_literal",
22 |     "true",
23 |     "false",
24 |     "nil"
25 |   ],
26 |   "test_patterns": {
27 |     "attribute_patterns": [],
28 |     "name_prefixes": [
29 |       "Test",
30 |       "Benchmark"
31 |     ],
32 |     "name_suffixes": [
33 |       "_test"
34 |     ]
35 |   }
36 | }


--------------------------------------------------------------------------------
/benchmarks/README.md:
--------------------------------------------------------------------------------
 1 | # Benchmarks
 2 | 
 3 | This directory contains performance benchmarking data and complex test cases.
 4 | 
 5 | ## Structure
 6 | 
 7 | - `data/` - Various TypeScript files for benchmarking and stress testing
 8 |   - Performance tests
 9 |   - Complex duplication scenarios
10 |   - Real-world code patterns
11 | 
12 | ## Running Benchmarks
13 | 
14 | ```bash
15 | # Basic performance test
16 | time similarity-ts benchmarks/data/ --threshold 0.8
17 | 
18 | # Memory usage test (GNU time on Linux; on macOS use `/usr/bin/time -l` instead)
19 | /usr/bin/time -v similarity-ts benchmarks/data/ --threshold 0.8
20 | 
21 | # Stress test with many files
22 | similarity-ts benchmarks/data/ --threshold 0.7 --min-tokens 10
23 | ```
24 | 
25 | ## Notes
26 | 
27 | These files are not meant for understanding the tool's basic functionality.
28 | For specification examples, see `examples/specs/`.


--------------------------------------------------------------------------------
/docs/algorithm/README.md:
--------------------------------------------------------------------------------
 1 | # Algorithm Documentation
 2 | 
 3 | This directory contains documentation about the algorithms used in the similarity detection tools.
 4 | 
 5 | ## Contents
 6 | 
 7 | - [TSED (Tree Similarity of Edit Distance)](tsed-similarity.md) - Complete academic paper on the TSED algorithm
 8 | - [TSED Summary](tsed-similarity-summary.md) - Summary of the TSED paper
 9 | - [Tree-sitter Integration Analysis](tree-sitter-integration-analysis.md) - Analysis of tree-sitter integration for AST parsing
10 | 
11 | ## Overview
12 | 
13 | The similarity detection tools use AST-based comparison algorithms to detect code duplication across multiple programming languages. The core algorithm is TSED (Tree Similarity of Edit Distance), which provides accurate structural comparison of code while considering both structure and size differences.


--------------------------------------------------------------------------------
/__deprecated/src/core/levenshtein.ts:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Computes the Levenshtein edit distance between two strings: the minimum
 3 |  * number of single-character insertions, deletions and substitutions needed
 4 |  * to turn `str1` into `str2`.
 5 |  *
 6 |  * Characters are compared as UTF-16 code units, so a symbol outside the Basic
 7 |  * Multilingual Plane (e.g. most emoji) counts as two units.
 8 |  *
 9 |  * Only two rows of the dynamic-programming table are kept, sized by the
10 |  * shorter input, so memory is O(min(m, n)) instead of O(m * n).
11 |  *
12 |  * @param str1 - First string.
13 |  * @param str2 - Second string.
14 |  * @returns The edit distance; 0 when the strings are equal.
15 |  */
16 | export function levenshtein(str1: string, str2: string): number {
17 |   if (str1 === str2) return 0;
18 |   if (str1.length === 0) return str2.length;
19 |   if (str2.length === 0) return str1.length;
20 | 
21 |   // The distance is symmetric, so index the rows by the shorter string.
22 |   const [shorter, longer] = str1.length <= str2.length ? [str1, str2] : [str2, str1];
23 |   const width = shorter.length;
24 | 
25 |   // previousRow[j] is the distance between the first i-1 units of `longer`
26 |   // and the first j units of `shorter`; the initial row means "insert j units".
27 |   let previousRow: number[] = Array.from({ length: width + 1 }, (_, j) => j);
28 |   let currentRow: number[] = new Array<number>(width + 1).fill(0);
29 | 
30 |   for (let i = 1; i <= longer.length; i++) {
31 |     const longerUnit = longer.charCodeAt(i - 1);
32 |     currentRow[0] = i; // deleting the first i units of `longer`
33 | 
34 |     for (let j = 1; j <= width; j++) {
35 |       const substitutionCost = shorter.charCodeAt(j - 1) === longerUnit ? 0 : 1;
36 |       currentRow[j] = Math.min(
37 |         previousRow[j - 1] + substitutionCost,
38 |         currentRow[j - 1] + 1,
39 |         previousRow[j] + 1,
40 |       );
41 |     }
42 | 
43 |     // Reuse the two buffers instead of allocating a new row per iteration.
44 |     [previousRow, currentRow] = [currentRow, previousRow];
45 |   }
46 | 
47 |   return previousRow[width];
48 | }
49 | 


--------------------------------------------------------------------------------
/test/__fixtures__/edge_cases/identical_1.ts:
--------------------------------------------------------------------------------
 1 | // Edge case: Identical complex code
 2 | export class ComplexService<T extends BaseEntity> {
 3 |   private readonly repository: Repository<T>;
 4 |   private readonly cache: Map<string, T> = new Map();
 5 | 
 6 |   constructor(repository: Repository<T>) {
 7 |     this.repository = repository;
 8 |   }
 9 | 
10 |   async findById(id: string): Promise<T | null> {
11 |     if (this.cache.has(id)) {
12 |       return this.cache.get(id)!;
13 |     }
14 | 
15 |     const entity = await this.repository.findOne({ where: { id } });
16 |     if (entity) {
17 |       this.cache.set(id, entity);
18 |     }
19 |     return entity;
20 |   }
21 | 
22 |   async save(entity: T): Promise<T> {
23 |     const saved = await this.repository.save(entity);
24 |     this.cache.set(saved.id, saved);
25 |     return saved;
26 |   }
27 | }


--------------------------------------------------------------------------------
/test/__fixtures__/edge_cases/identical_2.ts:
--------------------------------------------------------------------------------
 1 | // Edge case: Identical complex code
 2 | export class ComplexService<T extends BaseEntity> {
 3 |   private readonly repository: Repository<T>;
 4 |   private readonly cache: Map<string, T> = new Map();
 5 | 
 6 |   constructor(repository: Repository<T>) {
 7 |     this.repository = repository;
 8 |   }
 9 | 
10 |   async findById(id: string): Promise<T | null> {
11 |     if (this.cache.has(id)) {
12 |       return this.cache.get(id)!;
13 |     }
14 | 
15 |     const entity = await this.repository.findOne({ where: { id } });
16 |     if (entity) {
17 |       this.cache.set(id, entity);
18 |     }
19 |     return entity;
20 |   }
21 | 
22 |   async save(entity: T): Promise<T> {
23 |     const saved = await this.repository.save(entity);
24 |     this.cache.set(saved.id, saved);
25 |     return saved;
26 |   }
27 | }


--------------------------------------------------------------------------------
/__deprecated/src/cli/io.ts:
--------------------------------------------------------------------------------
 1 | // IO operations for file system access
 2 | import { readFile } from "fs/promises";
 3 | import { join, relative } from "path";
 4 | import { glob } from "glob";
 5 | 
 6 | /** A file loaded from disk together with the identifier it is reported under. */
 7 | interface FileInfo {
 8 |   /** Path of the file relative to the base path it was loaded from. */
 9 |   id: string;
10 |   /** Base path joined with the matched file path. */
11 |   path: string;
12 |   /** File contents decoded as UTF-8. */
13 |   content: string;
14 | }
15 | 
16 | /**
17 |  * Load files from a directory pattern
18 |  *
19 |  * @param pattern - Glob pattern, resolved relative to `basePath`.
20 |  * @param basePath - Directory the pattern is evaluated in; defaults to ".".
21 |  * @returns One entry per matched file, in the order `glob` returned them.
22 |  *   The promise rejects if any matched file cannot be read.
23 |  */
24 | export async function loadFilesFromPattern(pattern: string, basePath: string = "."): Promise<FileInfo[]> {
25 |   const files = await glob(pattern, { cwd: basePath });
26 |   const results: FileInfo[] = [];
27 | 
28 |   // Files are read one at a time with the non-blocking API: this keeps the
29 |   // event loop free without opening every matched file at once.
30 |   for (const file of files) {
31 |     const fullPath = join(basePath, file);
32 |     const content = await readFile(fullPath, "utf-8");
33 | 
34 |     results.push({ id: relative(basePath, fullPath), path: fullPath, content });
35 |   }
36 | 
37 |   return results;
38 | }
39 | 


--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
 1 | # Examples
 2 | 
 3 | Simple, clear examples to understand similarity-ts functionality.
 4 | 
 5 | ## Directory Structure
 6 | 
 7 | - `specs/` - Specification examples demonstrating core features
 8 |   - `duplicate-functions.ts` - Function similarity detection
 9 |   - `duplicate-types.ts` - Type similarity detection
10 |   - `sample_project/` - Multi-file project example
11 |   - Other test files for specific features
12 | 
13 | ## Quick Test
14 | 
15 | ```bash
16 | # Test function detection
17 | similarity-ts examples/specs/duplicate-functions.ts --threshold 0.8 --min-tokens 20
18 | 
19 | # Test type detection
20 | similarity-ts examples/specs/duplicate-types.ts --experimental-types --threshold 0.8
21 | 
22 | # Test multi-file project
23 | similarity-ts examples/specs/sample_project/ --threshold 0.85
24 | ```
25 | 
26 | See `specs/README.md` for expected results.


--------------------------------------------------------------------------------
/examples/mixed_language_project/helpers.py:
--------------------------------------------------------------------------------
 1 | # Python helper functions
 2 | 
 3 | def process_data(data):
 4 |     """Process data and return result."""
 5 |     result = []
 6 |     for item in data:
 7 |         if item > 0:
 8 |             result.append(item * 2)
 9 |     return result
10 | 
11 | def calculate_sum(numbers):
12 |     """Calculate sum of numbers."""
13 |     total = 0
14 |     for num in numbers:
15 |         total += num
16 |     return total
17 | 
18 | class DataHelper:
19 |     def __init__(self):
20 |         self.data = []
21 |     
22 |     def process(self, data):
23 |         result = []
24 |         for item in data:
25 |             if item > 0:
26 |                 result.append(item * 2)
27 |         return result
28 |     
29 |     def sum(self, numbers):
30 |         total = 0
31 |         for n in numbers:
32 |             total += n
33 |         return total


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [workspace]
 2 | members = [
 3 |   "crates/core",
 4 |   "crates/similarity-ts",
 5 |   "crates/similarity-py",
 6 |   "crates/similarity-php",
 7 |   "crates/similarity-rs",
 8 |   "crates/similarity-generic",
 9 |   "crates/similarity-elixir",
10 |   "crates/similarity-md",
11 |   "crates/similarity-css",
12 | ]
13 | resolver = "2"
14 | 
15 | [workspace.dependencies]
16 | oxc_allocator = "0.73.0"
17 | oxc_ast = "0.73.0"
18 | oxc_parser = "0.73.0"
19 | oxc_span = "0.73.0"
20 | tree-sitter = "0.24"
21 | tree-sitter-c = "0.23"
22 | tree-sitter-c-sharp = "0.23"
23 | tree-sitter-cpp = "0.23"
24 | tree-sitter-elixir = "0.3"
25 | tree-sitter-go = "0.23"
26 | tree-sitter-java = "0.23"
27 | tree-sitter-javascript = "0.23"
28 | tree-sitter-php = "0.23"
29 | tree-sitter-python = "0.23"
30 | tree-sitter-ruby = "0.23"
31 | tree-sitter-rust = "0.23"
32 | tree-sitter-typescript = "0.23"
33 | 


--------------------------------------------------------------------------------
/examples/mixed_language_project/utils.js:
--------------------------------------------------------------------------------
 1 | // JavaScript utility functions
 2 | 
 3 | function processData(data) {
 4 |     const result = [];
 5 |     for (const item of data) {
 6 |         if (item > 0) {
 7 |             result.push(item * 2);
 8 |         }
 9 |     }
10 |     return result;
11 | }
12 | 
13 | function transformData(data) {
14 |     const output = [];
15 |     for (const element of data) {
16 |         if (element > 0) {
17 |             output.push(element * 2);
18 |         }
19 |     }
20 |     return output;
21 | }
22 | 
23 | export class DataProcessor {
24 |     constructor() {
25 |         this.cache = {};
26 |     }
27 |     
28 |     process(data) {
29 |         const result = [];
30 |         for (const item of data) {
31 |             if (item > 0) {
32 |                 result.push(item * 2);
33 |             }
34 |         }
35 |         return result;
36 |     }
37 | }


--------------------------------------------------------------------------------
/crates/similarity-css/src/lib.rs:
--------------------------------------------------------------------------------
 1 | pub mod css_comparator;
 2 | pub mod css_parser;
 3 | pub mod css_rule_converter;
 4 | pub mod duplicate_analyzer;
 5 | pub mod parser;
 6 | pub mod scss_flattener;
 7 | pub mod scss_simple_flattener;
 8 | pub mod shorthand_expander;
 9 | pub mod specificity;
10 | 
11 | pub use css_comparator::{
12 |     calculate_rule_similarity, compare_css_rules, CssRule, CssSimilarityResult, SerializableCssRule,
13 | };
14 | pub use css_rule_converter::{convert_to_css_rule, parse_css_to_rules};
15 | pub use duplicate_analyzer::{
16 |     DuplicateAnalysisResult, DuplicateAnalyzer, DuplicateRule, DuplicateType,
17 |     SerializableDuplicateRule,
18 | };
19 | pub use parser::CssParser;
20 | pub use scss_flattener::{flatten_scss_rules, FlatRule};
21 | pub use shorthand_expander::expand_shorthand_properties;
22 | pub use specificity::{calculate_specificity, SelectorAnalysis, Specificity};
23 | 


--------------------------------------------------------------------------------
/examples/specs/sample_project/src/utils/errors.ts:
--------------------------------------------------------------------------------
 1 | export class ApplicationError extends Error {
 2 |   constructor(
 3 |     message: string,
 4 |     public code: string,
 5 |   ) {
 6 |     super(message);
 7 |     this.name = "ApplicationError";
 8 |   }
 9 | }
10 | 
11 | export class ValidationError extends ApplicationError {
12 |   constructor(message: string) {
13 |     super(message, "VALIDATION_ERROR");
14 |     this.name = "ValidationError";
15 |   }
16 | }
17 | 
18 | export class NotFoundError extends ApplicationError {
19 |   constructor(resource: string, id: string) {
20 |     super(`${resource} not found: ${id}`, "NOT_FOUND");
21 |     this.name = "NotFoundError";
22 |   }
23 | }
24 | 
25 | export class UnauthorizedError extends ApplicationError {
26 |   constructor(message: string = "Unauthorized") {
27 |     super(message, "UNAUTHORIZED");
28 |     this.name = "UnauthorizedError";
29 |   }
30 | }
31 | 


--------------------------------------------------------------------------------
/examples/specs/sample_project/src/utils/validator.ts:
--------------------------------------------------------------------------------
 1 | export class Validator {
 2 |   static isEmail(email: string): boolean {
 3 |     const emailRegex = /^[^\s@]+@[^\s@]+\.[^\s@]+$/;
 4 |     return emailRegex.test(email);
 5 |   }
 6 | 
 7 |   static isStrongPassword(password: string): boolean {
 8 |     return password.length >= 8 && /[A-Z]/.test(password) && /[a-z]/.test(password) && /[0-9]/.test(password);
 9 |   }
10 | 
11 |   static isValidPhoneNumber(phone: string): boolean {
12 |     const phoneRegex = /^\+?[\d\s-()]+$/;
13 |     return phoneRegex.test(phone) && phone.replace(/\D/g, "").length >= 10;
14 |   }
15 | 
16 |   static isValidUrl(url: string): boolean {
17 |     try {
18 |       new URL(url);
19 |       return true;
20 |     } catch {
21 |       return false;
22 |     }
23 |   }
24 | 
25 |   static sanitizeInput(input: string): string {
26 |     return input.trim().replace(/[<>]/g, "");
27 |   }
28 | }
29 | 


--------------------------------------------------------------------------------
/examples/specs/test_cli.ts:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Test the CLI functionality
 3 |  *
 4 |  * Spawns the CLI through `npx tsx` against the sample project and mirrors a
 5 |  * CLI failure in this script's own exit code.
 6 |  */
 7 | import { spawn } from "child_process";
 8 | import { join, dirname } from "path";
 9 | import { fileURLToPath } from "url";
10 | 
11 | // fileURLToPath decodes percent-escapes (e.g. spaces) and handles Windows
12 | // drive letters, which reading URL.pathname directly does not.
13 | const __dirname = dirname(fileURLToPath(import.meta.url));
14 | // NOTE(review): this resolves to a `src/cli` directory that is a sibling of
15 | // this script's directory - confirm that is where cli.ts actually lives.
16 | const cliPath = join(__dirname, "../src/cli/cli.ts");
17 | const targetDir = join(__dirname, "sample_project/src");
18 | 
19 | console.log("🔍 Testing TypeScript Function Similarity CLI\n");
20 | console.log(`CLI Path: ${cliPath}`);
21 | console.log(`Target Directory: ${targetDir}\n`);
22 | 
23 | // Run the CLI with tsx
24 | const child = spawn("npx", ["tsx", cliPath, targetDir, "-t", "0.6"], {
25 |   stdio: "inherit",
26 |   shell: true,
27 | });
28 | 
29 | child.on("error", (error) => {
30 |   console.error("Failed to start CLI:", error);
31 |   // Make the failure visible to whatever invoked this script.
32 |   process.exitCode = 1;
33 | });
34 | 
35 | child.on("close", (code, signal) => {
36 |   if (code === 0) {
37 |     return;
38 |   }
39 | 
40 |   if (code === null) {
41 |     // A null exit code means the child was terminated by a signal.
42 |     console.error(`CLI was terminated by signal ${signal}`);
43 |     process.exitCode = 1;
44 |     return;
45 |   }
46 | 
47 |   console.error(`CLI exited with code ${code}`);
48 |   process.exitCode = code;
49 | });
50 | 

--------------------------------------------------------------------------------
/crates/core/README.md:
--------------------------------------------------------------------------------
 1 | # similarity-core
 2 | 
 3 | Core library for TypeScript/JavaScript code similarity detection using AST-based comparison.
 4 | 
 5 | ## Features
 6 | 
 7 | - Extract functions from TypeScript/JavaScript code
 8 | - Compare function similarity using TSED (Tree Similarity of Edit Distance)
 9 | - Fast similarity detection with bloom filter pre-filtering
10 | - Support for various function types (regular functions, arrow functions, methods)
11 | - Configurable similarity thresholds
12 | 
13 | ## Usage
14 | 
15 | ```rust
16 | use similarity_core::{extract_functions, compare_functions, TSEDOptions};
17 | 
18 | // Extract functions from code
19 | let functions = extract_functions("example.ts", source_code)?;
20 | 
21 | // Compare two functions
22 | let options = TSEDOptions::default();
23 | let similarity = compare_functions(&func1, &func2, source1, source2, &options)?;
24 | ```
25 | 
26 | ## License
27 | 
28 | MIT


--------------------------------------------------------------------------------
/test/__fixtures__/dissimilar/sync_vs_async_2.ts:
--------------------------------------------------------------------------------
 1 | // Dissimilar: Asynchronous code with different purpose
 2 | async function fetchAndProcessData(url: string): Promise<ProcessedData> {
 3 |   try {
 4 |     const response = await fetch(url);
 5 |     const rawData = await response.json();
 6 |     
 7 |     const processed = await Promise.all(
 8 |       rawData.items.map(async (item: any) => {
 9 |         const details = await fetchDetails(item.id);
10 |         return { ...item, details };
11 |       })
12 |     );
13 |     
14 |     return {
15 |       timestamp: Date.now(),
16 |       data: processed,
17 |       status: 'success'
18 |     };
19 |   } catch (error) {
20 |     console.error('Processing failed:', error);
21 |     throw new Error('Failed to process data');
22 |   }
23 | }
24 | 
25 | async function fetchDetails(id: string): Promise<any> {
26 |   const response = await fetch(`/api/details/${id}`);
27 |   return response.json();
28 | }


--------------------------------------------------------------------------------
/crates/similarity-generic/examples/configs/go.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "language": "go",
 3 |   "function_nodes": [
 4 |     "function_declaration",
 5 |     "method_declaration"
 6 |   ],
 7 |   "type_nodes": [
 8 |     "type_declaration",
 9 |     "struct_type",
10 |     "interface_type"
11 |   ],
12 |   "field_mappings": {
13 |     "name_field": "name",
14 |     "params_field": "parameters",
15 |     "body_field": "body",
16 |     "decorator_field": null,
17 |     "class_field": null
18 |   },
19 |   "value_nodes": [
20 |     "identifier",
21 |     "interpreted_string_literal",
22 |     "raw_string_literal",
23 |     "int_literal",
24 |     "float_literal",
25 |     "true",
26 |     "false",
27 |     "nil"
28 |   ],
29 |   "test_patterns": {
30 |     "attribute_patterns": [],
31 |     "name_prefixes": [
32 |       "Test",
33 |       "Benchmark"
34 |     ],
35 |     "name_suffixes": [
36 |       "_test"
37 |     ]
38 |   },
39 |   "custom_mappings": null
40 | }


--------------------------------------------------------------------------------
/test/__fixtures__/refactoring/class_to_function/user_service_functions.ts:
--------------------------------------------------------------------------------
 1 | // Refactored to functional style
 2 | function addUser(users: Map<string, User>, user: User): void {
 3 |   if (!user.id) {
 4 |     throw new Error('User must have an ID');
 5 |   }
 6 |   users.set(user.id, user);
 7 |   console.log(`Added user: ${user.name}`);
 8 | }
 9 | 
10 | function removeUser(users: Map<string, User>, userId: string): boolean {
11 |   const user = users.get(userId);
12 |   if (!user) {
13 |     return false;
14 |   }
15 |   users.delete(userId);
16 |   console.log(`Removed user: ${user.name}`);
17 |   return true;
18 | }
19 | 
20 | function getUser(users: Map<string, User>, userId: string): User | undefined {
21 |   return users.get(userId);
22 | }
23 | 
24 | function getAllUsers(users: Map<string, User>): User[] {
25 |   return Array.from(users.values());
26 | }
27 | 
28 | interface User {
29 |   id: string;
30 |   name: string;
31 |   email: string;
32 | }


--------------------------------------------------------------------------------
/test/__fixtures__/refactoring/class_to_function/calculator_functions.ts:
--------------------------------------------------------------------------------
 1 | // Functional Calculator with state parameter
 2 | interface CalculatorState { // State object that every operation below takes as its first argument.
 3 |   value: number; // Current result; the arithmetic operations update it in place.
 4 | }
 5 | 
 6 | const add = (state: CalculatorState, n: number): number => { // Adds `n` to `state.value` and returns the new value.
 7 |   state.value += n; // Mutates the caller's state object rather than returning a copy.
 8 |   return state.value;
 9 | };
10 | 
11 | const subtract = (state: CalculatorState, n: number): number => { // Subtracts `n` from `state.value` and returns the new value.
12 |   state.value -= n; // Mutates the caller's state object rather than returning a copy.
13 |   return state.value;
14 | };
15 | 
16 | const multiply = (state: CalculatorState, n: number): number => { // Multiplies `state.value` by `n` and returns the new value.
17 |   state.value *= n; // Mutates the caller's state object rather than returning a copy.
18 |   return state.value;
19 | };
20 | 
21 | const divide = (state: CalculatorState, n: number): number => { // Divides `state.value` by `n` and returns the new value; throws when `n` is 0.
22 |   if (n === 0) { // Checked before the division so a rejected call leaves the state unchanged.
23 |     throw new Error('Division by zero');
24 |   }
25 |   state.value /= n; // Mutates the caller's state object rather than returning a copy.
26 |   return state.value;
27 | };
28 | 
29 | const reset = (state: CalculatorState): void => { // Sets `state.value` back to 0 in place.
30 |   state.value = 0;
31 | };
32 | 
33 | const getValue = (state: CalculatorState): number => { // Reads the current value without modifying the state.
34 |   return state.value;
35 | };


--------------------------------------------------------------------------------
/examples/specs/README.md:
--------------------------------------------------------------------------------
 1 | # Specification Examples
 2 | 
 3 | These examples demonstrate the core functionality of similarity-ts.
 4 | 
 5 | ## Files
 6 | 
 7 | - `duplicate-functions.ts` - Function similarity detection examples
 8 | - `duplicate-types.ts` - Type similarity detection examples (requires --experimental-types)
 9 | 
10 | ## Expected Results
11 | 
12 | ### Function Detection
13 | ```bash
14 | similarity-ts duplicate-functions.ts --threshold 0.8 --min-tokens 20
15 | ```
16 | Should detect:
17 | - calculateUserAge vs calculateCustomerAge (~95% similarity)
18 | - findMaxValue vs getMaximumValue (~85% similarity)
19 | - processUserData vs processCustomerData (~90% similarity)
20 | 
21 | ### Type Detection
22 | ```bash
23 | similarity-ts duplicate-types.ts --experimental-types --threshold 0.8
24 | ```
25 | Should detect:
26 | - User vs Customer (100% similarity)
27 | - UserResponse vs CustomerResponse (~85% similarity)
28 | - ApiResponse vs ServiceResponse (100% similarity)


--------------------------------------------------------------------------------
/test/__fixtures__/refactoring/class_to_function/user_service_class.ts:
--------------------------------------------------------------------------------
 1 | // Original class implementation
 2 | class UserService { // In-memory user registry keyed by user id.
 3 |   private users: Map<string, User> = new Map(); // Starts empty; only the methods below modify it.
 4 |   
 5 |   addUser(user: User): void { // Stores `user` under `user.id`; throws if that id is falsy.
 6 |     if (!user.id) { // Validate before touching the map so a rejected user leaves it unchanged.
 7 |       throw new Error('User must have an ID');
 8 |     }
 9 |     this.users.set(user.id, user); // Replaces any entry already stored under the same id.
10 |     console.log(`Added user: ${user.name}`);
11 |   }
12 |   
13 |   removeUser(userId: string): boolean { // Deletes the entry for `userId`; returns true if one existed, false otherwise.
14 |     const user = this.users.get(userId); // Fetched before deleting because the log line below still needs the name.
15 |     if (!user) {
16 |       return false; // Nothing stored under this id: the map is left untouched.
17 |     }
18 |     this.users.delete(userId);
19 |     console.log(`Removed user: ${user.name}`);
20 |     return true;
21 |   }
22 |   
23 |   getUser(userId: string): User | undefined { // Looks up `userId`; yields undefined when no such user is stored.
24 |     return this.users.get(userId);
25 |   }
26 |   
27 |   getAllUsers(): User[] { // Returns a fresh array holding every stored user, in insertion order.
28 |     return Array.from(this.users.values());
29 |   }
30 | }
31 | 
32 | interface User { // Record stored by UserService.
33 |   id: string; // Used as the map key; addUser rejects a falsy value.
34 |   name: string; // Printed in the add/remove log lines.
35 |   email: string;
36 | }


--------------------------------------------------------------------------------
/crates/similarity-md/examples/japanese_similarity_test.md:
--------------------------------------------------------------------------------
 1 | # 日本語類似性検出のテスト
 2 | 
 3 | このドキュメントは、Vibrato を使った形態素解析による日本語テキストの類似性検出をテストするためのサンプルです。
 4 | 
 5 | ## 機械学習について
 6 | 
 7 | 機械学習は、コンピュータがデータから自動的にパターンを学習する技術です。この技術は、画像認識、自然言語処理、推薦システムなど、様々な分野で活用されています。機械学習のアルゴリズムには、教師あり学習、教師なし学習、強化学習などがあります。
 8 | 
 9 | ## マシンラーニングの概要
10 | 
11 | マシンラーニングとは、計算機がデータから自動的にパターンを習得する手法です。この手法は、画像解析、言語処理、レコメンドシステムなど、多様な領域で利用されています。マシンラーニングの手法には、監督学習、非監督学習、強化学習などが存在します。
12 | 
13 | ## 深層学習の基礎
14 | 
15 | 深層学習は機械学習の一分野で、多層のニューラルネットワークを使用してデータの複雑なパターンを学習します。深層学習は画像認識、音声認識、自然言語処理などの分野で革新的な成果を上げています。
16 | 
17 | ## ディープラーニングの原理
18 | 
19 | ディープラーニングは機械学習の一種で、多層のニューラルネットワークを利用してデータの複雑なパターンを習得します。ディープラーニングは画像解析、音声解析、言語処理などの領域で画期的な結果を達成しています。
20 | 
21 | ## プログラミング言語の比較
22 | 
23 | プログラミング言語には多くの種類があります。Python は機械学習やデータサイエンスの分野で人気があり、JavaScript はウェブ開発で広く使われています。Java は企業システムの開発でよく利用され、C++ は高性能なアプリケーションの開発に適しています。
24 | 
25 | ## 全く異なる内容
26 | 
27 | 今日の天気は晴れです。公園で散歩をしました。桜の花がとても美しく咲いていました。子供たちが元気に遊んでいる姿を見て、心が温かくなりました。
28 | 


--------------------------------------------------------------------------------
/test/__fixtures__/refactoring/class_to_function/metadata.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "description": "Class to function refactoring patterns",
 3 |   "testCases": [
 4 |     {
 5 |       "name": "UserService refactoring",
 6 |       "files": {
 7 |         "before": "user_service_class.ts",
 8 |         "after": "user_service_functions.ts"
 9 |       },
10 |       "expectedSimilarity": {
11 |         "addUser": 0.85,
12 |         "removeUser": 0.85,
13 |         "getUser": 0.90,
14 |         "getAllUsers": 0.90
15 |       },
16 |       "notes": "Methods converted to functions with state parameter"
17 |     },
18 |     {
19 |       "name": "Calculator refactoring",
20 |       "files": {
21 |         "before": "calculator_class.ts",
22 |         "after": "calculator_functions.ts"
23 |       },
24 |       "expectedSimilarity": {
25 |         "add": 0.80,
26 |         "subtract": 0.80
27 |       },
28 |       "notes": "Stateful class converted to functional with state parameter"
29 |     }
30 |   ]
31 | }


--------------------------------------------------------------------------------
/examples/specs/sample_project/src/utils/logger.ts:
--------------------------------------------------------------------------------
 1 | export class Logger { // Console logger that prefixes each message with a timestamp, a level tag and a fixed context label.
 2 |   private context: string; // Label printed in the third bracket of every line; assigned once in the constructor.
 3 | 
 4 |   constructor(context: string) {
 5 |     this.context = context;
 6 |   }
 7 | 
 8 |   info(message: string, data?: any): void { // Emits an [INFO] line through console.log.
 9 |     console.log(`[${this.getTimestamp()}] [INFO] [${this.context}] ${message}`, data || ""); // `||` turns every falsy `data` (undefined, 0, false, "") into an empty string.
10 |   }
11 | 
12 |   warn(message: string, data?: any): void { // Emits a [WARN] line through console.warn.
13 |     console.warn(`[${this.getTimestamp()}] [WARN] [${this.context}] ${message}`, data || ""); // Same falsy-to-"" substitution as info().
14 |   }
15 | 
16 |   error(message: string, error?: Error): void { // Emits an [ERROR] line through console.error, followed by the Error object when one is given.
17 |     console.error(`[${this.getTimestamp()}] [ERROR] [${this.context}] ${message}`, error || "");
18 |   }
19 | 
20 |   debug(message: string, data?: any): void { // Emits a [DEBUG] line through console.debug, but only when debugging is switched on.
21 |     if (process.env.DEBUG) { // Any non-empty DEBUG value enables output; the variable is re-read on every call.
22 |       console.debug(`[${this.getTimestamp()}] [DEBUG] [${this.context}] ${message}`, data || "");
23 |     }
24 |   }
25 | 
26 |   private getTimestamp(): string { // Current time as an ISO 8601 string (toISOString always reports UTC).
27 |     return new Date().toISOString();
28 |   }
29 | }
30 | 


--------------------------------------------------------------------------------
/test/__fixtures__/duplication/structural/visitnode_pattern_3.ts:
--------------------------------------------------------------------------------
 1 | // visitNode pattern from semantic_normalizer.ts
 2 | function visitNode(node: any, replacer: (n: any) => any): any { // Returns a transformed copy of `node`: `replacer` may substitute any object node, otherwise children are rebuilt recursively.
 3 |   if (!node || typeof node !== 'object') { // Primitives and null/undefined are returned untouched.
 4 |     return node;
 5 |   }
 6 | 
 7 |   // Give the replacer first refusal on this node
 8 |   const replaced = replacer(node);
 9 |   if (replaced !== node) { // A different reference counts as a replacement and is returned as-is, without visiting its children.
10 |     return replaced;
11 |   }
12 | 
13 |   // Shallow-clone so the input tree is never mutated, then rebuild children on the clone
14 |   const newNode: any = { ...node };
15 |   const skipKeys = new Set(['loc', 'range', 'start', 'end', 'parent']); // These keys are copied by the spread above but never descended into.
16 | 
17 |   for (const key in newNode) {
18 |     if (newNode.hasOwnProperty(key) && !skipKeys.has(key)) {
19 |       const value = newNode[key];
20 |       
21 |       if (Array.isArray(value)) {
22 |         newNode[key] = value.map(item => visitNode(item, replacer)); // New array; each element is visited individually.
23 |       } else if (value && typeof value === 'object') {
24 |         newNode[key] = visitNode(value, replacer);
25 |       } // Primitive values keep the copy made by the spread.
26 |     }
27 |   }
28 | 
29 |   return newNode;
30 | }


--------------------------------------------------------------------------------
/KNOWN_ISSUES.md:
--------------------------------------------------------------------------------
 1 | # Known Issues
 2 | 
 3 | ## Rust Type Similarity Detection
 4 | 
 5 | ### Enum Similarity Detection
 6 | - **Issue**: Enum similarity detection reports lower-than-expected similarity scores, even for structurally identical enums
 7 | - **Example**: Two enums with identical variants show only ~43% similarity
 8 | - **Cause**: The AST structure for enums includes variant names as values, and the current `rename_cost` parameter doesn't adequately handle this case
 9 | - **Workaround**: Use a lower threshold (0.4-0.5) for enum similarity detection
10 | - **Status**: Under investigation
11 | 
12 | ### Struct Similarity Detection
13 | - **Status**: Working as expected
14 | - Structs with similar field types but different field names correctly show high similarity (90%+)
15 | - Generic structs are properly compared
16 | 
17 | ## TypeScript Type Similarity Detection
18 | - **Status**: Working as expected
19 | - Interfaces, type aliases, and type literals are correctly detected with appropriate similarity scores


--------------------------------------------------------------------------------
/crates/core/src/tree.rs:
--------------------------------------------------------------------------------
 1 | use std::rc::Rc;
 2 | 
 3 | /// A node of a labelled, ordered tree whose children are reference-counted.
 4 | #[derive(Debug, Clone)]
 5 | pub struct TreeNode {
 6 |     /// Label of this node.
 7 |     pub label: String,
 8 |     /// Value carried by this node.
 9 |     pub value: String,
10 |     /// Child nodes, in the order they were added.
11 |     pub children: Vec<Rc<TreeNode>>,
12 |     /// Identifier supplied by whoever constructs the node.
13 |     pub id: usize,
14 |     /// Slot for a cached subtree size. `new` sets it to `None`, and
15 |     /// `get_subtree_size` neither reads nor writes it.
16 |     pub subtree_size: Option<usize>,
17 | }
18 | 
19 | impl TreeNode {
20 |     /// Creates a leaf node: no children and no cached subtree size.
21 |     #[must_use]
22 |     pub fn new(label: String, value: String, id: usize) -> Self {
23 |         TreeNode { label, value, children: Vec::new(), id, subtree_size: None }
24 |     }
25 | 
26 |     /// Appends `child` as the last child of this node.
27 |     pub fn add_child(&mut self, child: Rc<TreeNode>) {
28 |         self.children.push(child);
29 |     }
30 | 
31 |     /// Returns the number of nodes in the subtree rooted at `self`,
32 |     /// including `self` (so a leaf yields 1).
33 |     ///
34 |     /// The count is recomputed on every call because children sit behind
35 |     /// `Rc` and cannot be mutated to cache it. A node reachable through
36 |     /// several parents is counted once per path.
37 |     ///
38 |     /// The traversal keeps its own work stack on the heap instead of
39 |     /// recursing, so very deep trees cannot exhaust the call stack.
40 |     #[must_use]
41 |     pub fn get_subtree_size(&self) -> usize {
42 |         let mut size = 0;
43 |         let mut pending: Vec<&TreeNode> = vec![self];
44 |         while let Some(node) = pending.pop() {
45 |             size += 1;
46 |             // Visiting order is irrelevant for a count, so children are
47 |             // simply pushed in bulk.
48 |             pending.extend(node.children.iter().map(|child| &**child));
49 |         }
50 |         size
51 |     }
52 | }
53 | 


--------------------------------------------------------------------------------
/test/__fixtures__/duplication/semantic/async_operations_1.ts:
--------------------------------------------------------------------------------
 1 | // Async operation patterns with Promise handling
 2 | // Different approaches to the same async pattern
 3 | 
 4 | export class DataFetcher { // Fetch helpers written with async/await and try/catch.
 5 |   async fetchData(url: string): Promise<any> { // GETs `url` and resolves with the parsed JSON body; rejects on network failure, non-OK status or invalid JSON.
 6 |     try {
 7 |       const response = await fetch(url);
 8 |       if (!response.ok) { // fetch only rejects on network errors, so HTTP error statuses are turned into exceptions here.
 9 |         throw new Error(`Failed to fetch: ${response.status}`);
10 |       }
11 |       return await response.json(); // Awaited inside the try so a JSON parse failure is also logged below.
12 |     } catch (error) {
13 |       console.error('Fetch error:', error); // Logs every failure, including the non-OK Error thrown above...
14 |       throw error; // ...and rethrows it unchanged for the caller.
15 |     }
16 |   }
17 |   
18 |   async fetchWithRetry(url: string, maxRetries: number = 3): Promise<any> { // Calls fetchData up to `maxRetries` times and rethrows the last failure if none succeeds.
19 |     let lastError; // NOTE(review): stays undefined when maxRetries <= 0, so the final throw then throws undefined.
20 |     
21 |     for (let i = 0; i < maxRetries; i++) {
22 |       try {
23 |         return await this.fetchData(url); // First success ends the loop.
24 |       } catch (error) {
25 |         lastError = error;
26 |         await new Promise(resolve => setTimeout(resolve, 1000 * (i + 1))); // Linear backoff: 1s, 2s, 3s, ... also awaited after the final failed attempt.
27 |       }
28 |     }
29 |     
30 |     throw lastError;
31 |   }
32 | }


--------------------------------------------------------------------------------
/__deprecated/src/parser.ts:
--------------------------------------------------------------------------------
 1 | // TypeScript parser wrapper
 2 | import * as oxc from "oxc-parser";
 3 | 
 4 | /**
 5 |  * Parses TypeScript code into an AST synchronously (direct alias of `oxc.parseSync`).
 6 |  * @deprecated Use parseTypeScriptAsync for better performance
 7 |  */
 8 | export const parseTypeScript = oxc.parseSync;
 9 | 
10 | /**
11 |  * Parses TypeScript code into an AST asynchronously (direct alias of `oxc.parseAsync`).
12 |  */
13 | export const parseTypeScriptAsync = oxc.parseAsync;
14 | 
15 | /**
16 |  * Parse multiple TypeScript files in parallel.
17 |  *
18 |  * Each file is parsed independently through `parseTypeScriptAsync`, and a
19 |  * failure in one file never rejects the whole batch. Results are returned in
20 |  * the same order as `files`.
21 |  *
22 |  * @param files - Source files to parse, each given as a name plus its full text.
23 |  * @returns One entry per input file. On success `ast` holds the parse result.
24 |  *   On failure `error` is set and `ast` is `null` at runtime despite its
25 |  *   declared type, so callers must check `error` before using `ast`.
26 |  */
27 | export async function parseMultipleAsync(
28 |   files: Array<{ filename: string; code: string }>,
29 | ): Promise<Array<{ filename: string; ast: oxc.ParseResult; error?: Error }>> {
30 |   const promises = files.map(async ({ filename, code }) => {
31 |     try {
32 |       const ast = await parseTypeScriptAsync(filename, code);
33 |       return { filename, ast };
34 |     } catch (error) {
35 |       // Anything can be thrown in JavaScript; wrap non-Error values so that
36 |       // `error.message` and `error.stack` are always safe to read downstream.
37 |       const normalized = error instanceof Error ? error : new Error(String(error));
38 |       // The public return type declares `ast` as non-null and is kept for
39 |       // backward compatibility, so the failure case passes `null` through `any`.
40 |       // eslint-disable-next-line @typescript-eslint/no-explicit-any
41 |       return { filename, ast: null as any, error: normalized };
42 |     }
43 |   });
44 | 
45 |   return Promise.all(promises);
46 | }
47 | 


--------------------------------------------------------------------------------
/crates/similarity-php/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "similarity-php"
 3 | version = "0.4.2"
 4 | edition = "2021"
 5 | license = "MIT"
 6 | description = "CLI tool for detecting code duplication in PHP projects"
 7 | authors = ["SuguruOoki"]
 8 | repository = "https://github.com/mizchi/similarity"
 9 | homepage = "https://github.com/mizchi/similarity"
10 | documentation = "https://docs.rs/similarity-php"
11 | keywords = ["php", "duplicate", "detection", "cli", "similarity"]
12 | categories = ["command-line-utilities", "development-tools"]
13 | 
14 | [[bin]]
15 | name = "similarity-php"
16 | path = "src/main.rs"
17 | 
18 | [lib]
19 | name = "similarity_php"
20 | 
21 | [dependencies]
22 | similarity-core = { version = "0.4.2", path = "../core" }
23 | clap = { version = "4.0", features = ["derive"] }
24 | anyhow = "1.0"
25 | walkdir = "2.5"
26 | ignore = "0.4"
27 | rayon = "1.10"
28 | tree-sitter = { workspace = true }
29 | tree-sitter-php = { workspace = true }
30 | 
31 | [dev-dependencies]
32 | assert_cmd = "2.0"
33 | predicates = "3.0"
34 | tempfile = "3.0"


--------------------------------------------------------------------------------
/crates/similarity-py/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "similarity-py"
 3 | version = "0.4.2"
 4 | edition = "2021"
 5 | license = "MIT"
 6 | description = "CLI tool for detecting code duplication in Python projects"
 7 | authors = ["mizchi"]
 8 | repository = "https://github.com/mizchi/similarity"
 9 | homepage = "https://github.com/mizchi/similarity"
10 | documentation = "https://docs.rs/similarity-py"
11 | keywords = ["python", "duplicate", "detection", "cli", "similarity"]
12 | categories = ["command-line-utilities", "development-tools"]
13 | 
14 | [[bin]]
15 | name = "similarity-py"
16 | path = "src/main.rs"
17 | 
18 | [lib]
19 | name = "similarity_py"
20 | 
21 | [dependencies]
22 | similarity-core = { version = "0.4.2", path = "../core" }
23 | clap = { version = "4.0", features = ["derive"] }
24 | anyhow = "1.0"
25 | walkdir = "2.5"
26 | ignore = "0.4"
27 | rayon = "1.10"
28 | tree-sitter = { workspace = true }
29 | tree-sitter-python = { workspace = true }
30 | 
31 | [dev-dependencies]
32 | assert_cmd = "2.0"
33 | predicates = "3.0"
34 | tempfile = "3.0"


--------------------------------------------------------------------------------
/docs/implementation/README.md:
--------------------------------------------------------------------------------
 1 | # Implementation Documentation
 2 | 
 3 | This directory contains documentation about implementation details, performance optimization, and benchmarks.
 4 | 
 5 | ## Contents
 6 | 
 7 | - [Performance Optimization](performance-optimization.md) - Strategies for optimizing performance
 8 | - [Performance Baseline](performance-baseline.md) - Baseline performance measurements
 9 | - [Hybrid Approach Results](hybrid-approach-results.md) - Results from hybrid detection approach
10 | - [Benchmark Results](benchmark_results.md) - Comprehensive benchmark results
11 | - [Rust vs TypeScript Comparison](rust-ts-compare.md) - Performance comparison between implementations
12 | 
13 | ## Performance Overview
14 | 
15 | The Rust implementation provides significant performance improvements over the original TypeScript prototype:
16 | - TypeScript/JavaScript parsing: Uses oxc-parser for ~10x faster parsing
17 | - Parallel processing: Leverages Rayon for concurrent file processing
18 | - Memory efficiency: Optimized AST representations and algorithms


--------------------------------------------------------------------------------
/crates/similarity-rs/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "similarity-rs"
 3 | version = "0.4.2"
 4 | edition = "2021"
 5 | license = "MIT"
 6 | description = "CLI tool for detecting code duplication in Rust projects"
 7 | authors = ["mizchi"]
 8 | repository = "https://github.com/mizchi/similarity"
 9 | homepage = "https://github.com/mizchi/similarity"
10 | documentation = "https://docs.rs/similarity-rs"
11 | keywords = ["rust", "duplicate", "detection", "cli", "similarity"]
12 | categories = ["command-line-utilities", "development-tools"]
13 | 
14 | [[bin]]
15 | name = "similarity-rs"
16 | path = "src/main.rs"
17 | 
18 | [lib]
19 | name = "similarity_rs"
20 | 
21 | [dependencies]
22 | similarity-core = { version = "0.4.2", path = "../core" }
23 | clap = { version = "4.5", features = ["derive"] }
24 | anyhow = "1.0"
25 | rayon = "1.10"
26 | ignore = "0.4"
27 | walkdir = "2.5"
28 | globset = "0.4"
29 | tree-sitter = { workspace = true }
30 | tree-sitter-rust = { workspace = true }
31 | 
32 | [dev-dependencies]
33 | assert_cmd = "2.0"
34 | predicates = "3.1"
35 | tempfile = "3.10"
36 | 


--------------------------------------------------------------------------------
/test/__fixtures__/dissimilar/imperative_vs_functional_2.ts:
--------------------------------------------------------------------------------
 1 | // Dissimilar: Functional style
 2 | type Product = { // A purchasable item as held inside an Order.
 3 |   id: string; // Compared by removeProduct to decide which entries to drop.
 4 |   name: string;
 5 |   price: number; // Summed by calculateOrderTotal.
 6 | };
 7 | 
 8 | type Order = { // Order value; the functions below return new Orders instead of modifying one.
 9 |   products: Product[];
10 |   discount: number; // Percentage; createOrder clamps it to the range 0-100.
11 | };
12 | 
13 | const createOrder = (products: Product[] = [], discount = 0): Order => ({ // Builds an Order; defaults to no products and no discount.
14 |   products, // Stored by reference, not copied.
15 |   discount: Math.min(Math.max(discount, 0), 100) // Clamp: below 0 becomes 0, above 100 becomes 100.
16 | });
17 | 
18 | const addProduct = (order: Order, product: Product): Order => ({ // Returns a new Order with `product` appended; `order` itself is not modified.
19 |   ...order,
20 |   products: [...order.products, product] // Fresh array, so the original product list is left as it was.
21 | });
22 | 
23 | const removeProduct = (order: Order, productId: string): Order => ({ // Returns a new Order without the products whose id equals `productId`.
24 |   ...order,
25 |   products: order.products.filter(p => p.id !== productId) // Drops every match, not just the first; no match yields an equal-length copy.
26 | });
27 | 
28 | const calculateOrderTotal = (order: Order): number => { // Sum of product prices with the percentage discount applied.
29 |   const subtotal = order.products.reduce((sum, product) => sum + product.price, 0); // 0 for an order without products.
30 |   return subtotal * (1 - order.discount / 100); // `discount` is a percentage, hence the division by 100.
31 | };
32 | 
33 | const pipe = <T>(...fns: Array<(arg: T) => T>) => (value: T): T => // Left-to-right composition: the returned function feeds `value` through each of `fns` in order.
34 |   fns.reduce((acc, fn) => fn(acc), value); // With no functions the seed is returned unchanged.


--------------------------------------------------------------------------------
/crates/similarity-elixir/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "similarity-elixir"
 3 | version = "0.4.2"
 4 | edition = "2021"
 5 | license = "MIT"
 6 | description = "CLI tool for detecting code duplication in Elixir projects"
 7 | authors = ["mizchi"]
 8 | repository = "https://github.com/mizchi/similarity"
 9 | homepage = "https://github.com/mizchi/similarity"
10 | documentation = "https://docs.rs/similarity-elixir"
11 | keywords = ["elixir", "duplicate", "detection", "cli", "similarity"]
12 | categories = ["command-line-utilities", "development-tools"]
13 | 
14 | [[bin]]
15 | name = "similarity-elixir"
16 | path = "src/main.rs"
17 | 
18 | [lib]
19 | name = "similarity_elixir"
20 | 
21 | [dependencies]
22 | similarity-core = { version = "0.4.2", path = "../core" }
23 | clap = { version = "4.0", features = ["derive"] }
24 | anyhow = "1.0"
25 | walkdir = "2.5"
26 | ignore = "0.4"
27 | rayon = "1.10"
28 | tree-sitter = { workspace = true }
29 | tree-sitter-elixir = { workspace = true }
30 | 
31 | [dev-dependencies]
32 | assert_cmd = "2.0"
33 | predicates = "3.0"
34 | tempfile = "3.0"
35 | 


--------------------------------------------------------------------------------
/examples/specs/duplicate-types.ts:
--------------------------------------------------------------------------------
 1 | // Example: Type duplication detection (--experimental-types)
 2 | 
 3 | // Duplicate 1: Identical interfaces with different names
 4 | interface User { // Declares exactly the same members as Customer below; only the interface name differs.
 5 |   id: string;
 6 |   name: string;
 7 |   email: string;
 8 |   createdAt: Date;
 9 |   updatedAt: Date;
10 | }
11 | 
12 | interface Customer { // Declares exactly the same members as User above; only the interface name differs.
13 |   id: string;
14 |   name: string;
15 |   email: string;
16 |   createdAt: Date;
17 |   updatedAt: Date;
18 | }
19 | 
20 | // Duplicate 2: Similar type aliases
21 | type UserResponse = { // Same member types as CustomerResponse; three property names use a `user` prefix instead of `customer`.
22 |   userId: string;
23 |   userName: string;
24 |   userEmail: string;
25 |   isActive: boolean; // The one member whose name is shared with CustomerResponse.
26 | };
27 | 
28 | type CustomerResponse = { // Same member types as UserResponse; three property names use a `customer` prefix instead of `user`.
29 |   customerId: string;
30 |   customerName: string;
31 |   customerEmail: string;
32 |   isActive: boolean; // The one member whose name is shared with UserResponse.
33 | };
34 | 
35 | // Duplicate 3: Common API response patterns
36 | interface ApiResponse<T> { // Generic envelope; declares exactly the same members as ServiceResponse<T> below.
37 |   data: T; // Payload, typed by the caller through `T`.
38 |   status: number;
39 |   message: string;
40 |   timestamp: Date;
41 | }
42 | 
43 | interface ServiceResponse<T> { // Generic envelope; declares exactly the same members as ApiResponse<T> above.
44 |   data: T; // Payload, typed by the caller through `T`.
45 |   status: number;
46 |   message: string;
47 |   timestamp: Date;
48 | }


--------------------------------------------------------------------------------
/test/__fixtures__/dissimilar/imperative_vs_functional_1.ts:
--------------------------------------------------------------------------------
 1 | // Dissimilar: Imperative style
 2 | class ShoppingCart { // Mutable cart: methods update `items` and `discount` in place.
 3 |   private items: CartItem[] = [];
 4 |   private discount: number = 0; // Percentage; setDiscount keeps it within 0-100.
 5 | 
 6 |   addItem(item: CartItem): void { // Adds `item`, merging its quantity into an existing line that has the same id.
 7 |     const existing = this.items.find(i => i.id === item.id);
 8 |     if (existing) {
 9 |       existing.quantity += item.quantity; // Only the quantity is merged; name and price of the stored line are kept.
10 |     } else {
11 |       this.items.push({ ...item }); // A shallow copy is stored, so the cart does not alias the caller's object.
12 |     }
13 |   }
14 | 
15 |   removeItem(id: string): void { // Removes the first line whose id matches; does nothing when there is none.
16 |     const index = this.items.findIndex(i => i.id === id);
17 |     if (index !== -1) { // findIndex reports "not found" as -1.
18 |       this.items.splice(index, 1);
19 |     }
20 |   }
21 | 
22 |   setDiscount(percent: number): void { // Stores the discount percentage, clamped to the range 0-100.
23 |     this.discount = Math.min(Math.max(percent, 0), 100);
24 |   }
25 | 
26 |   calculateTotal(): number { // Sum of price * quantity over all lines, with the percentage discount applied.
27 |     let total = 0;
28 |     for (const item of this.items) {
29 |       total += item.price * item.quantity;
30 |     }
31 |     return total * (1 - this.discount / 100); // `discount` is a percentage, hence the division by 100.
32 |   }
33 | }
34 | 
35 | interface CartItem { // One line of a ShoppingCart.
36 |   id: string; // Identity used by addItem to merge and by removeItem to delete.
37 |   name: string;
38 |   price: number; // Multiplied by `quantity` in calculateTotal.
39 |   quantity: number;
40 | }


--------------------------------------------------------------------------------
/crates/similarity-rs/tests/test_min_tokens.rs:
--------------------------------------------------------------------------------
 1 | #![allow(clippy::uninlined_format_args)]
 2 | 
 3 | use similarity_core::language_parser::LanguageParser;
 4 | use similarity_rs::rust_parser::RustParser;
 5 | 
 6 | /// Prints, for a handful of functions of increasing size, the value that
 7 | /// `get_subtree_size` reports for the tree `RustParser` produces.
 8 | ///
 9 | /// Purely informational: nothing is asserted, so the test only fails when
10 | /// building the parser or parsing a snippet panics in `unwrap`.
11 | #[test]
12 | fn test_function_token_counts() {
13 |     // (source snippet, human-readable description)
14 |     let samples: [(&str, &str); 4] = [
15 |         ("fn a() { 1 }", "one liner"),
16 |         ("fn add(a: i32, b: i32) -> i32 { a + b }", "simple add"),
17 |         ("fn complex() -> i32 {\n    let x = 1;\n    let y = 2;\n    x + y\n}", "multi-statement"),
18 |         (
19 |             r#"fn format_message(name: &str, age: u32) -> String {
20 |     format!("Hello {}, you are {} years old", name, age)
21 | }"#,
22 |             "format_message",
23 |         ),
24 |     ];
25 | 
26 |     let mut rust_parser = RustParser::new().unwrap();
27 |     for (source, label) in samples {
28 |         let parsed = rust_parser.parse(source, "test.rs").unwrap();
29 |         println!("{}: {} tokens", label, parsed.get_subtree_size());
30 |         println!("Code: {}", source);
31 |         println!();
32 |     }
33 | }
34 | 


--------------------------------------------------------------------------------
/crates/similarity-css/src/bin/test_parser.rs:
--------------------------------------------------------------------------------
 1 | /// Manual smoke check: flattens three single-line rules with
 2 | /// `simple_flatten_scss` and prints either what was found or the error.
 3 | fn main() {
 4 |     use similarity_css::scss_simple_flattener::simple_flatten_scss;
 5 | 
 6 |     // Each rule is written on a single line.
 7 |     let scss_content = r#".m-0 { margin: 0; }
 8 | .m-1 { margin: 0.25rem; }
 9 | .m-2 { margin: 0.5rem; }"#;
10 | 
11 |     println!("Testing SCSS parser with single-line rules:");
12 |     println!("{scss_content}");
13 |     println!("\n---\n");
14 | 
15 |     // Bail out early on failure so the reporting below stays flat.
16 |     let rules = match simple_flatten_scss(scss_content) {
17 |         Ok(found) => found,
18 |         Err(e) => {
19 |             println!("Error: {e}");
20 |             return;
21 |         }
22 |     };
23 | 
24 |     println!("Found {} rules:", rules.len());
25 |     for rule in &rules {
26 |         let declaration_count = rule.declarations.len();
27 |         println!(
28 |             "  - {} (lines {}-{}, {} declarations)",
29 |             rule.selector, rule.start_line, rule.end_line, declaration_count
30 |         );
31 |         for (prop, val) in &rule.declarations {
32 |             println!("    {prop}: {val}");
33 |         }
34 |     }
35 | }
36 | 


--------------------------------------------------------------------------------
/crates/similarity-generic/examples/configs/custom-language-template.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "language": "your-language-name",
 3 |   "function_nodes": [
 4 |     "function_definition",
 5 |     "method_definition",
 6 |     "lambda_expression"
 7 |   ],
 8 |   "type_nodes": [
 9 |     "class_declaration",
10 |     "interface_declaration",
11 |     "struct_declaration"
12 |   ],
13 |   "field_mappings": {
14 |     "name_field": "name",
15 |     "params_field": "parameters",
16 |     "body_field": "body",
17 |     "decorator_field": "decorators",
18 |     "class_field": "class"
19 |   },
20 |   "value_nodes": [
21 |     "identifier",
22 |     "string_literal",
23 |     "number_literal",
24 |     "boolean_literal"
25 |   ],
26 |   "test_patterns": {
27 |     "attribute_patterns": [
28 |       "@test",
29 |       "@Test",
30 |       "#[test]"
31 |     ],
32 |     "name_prefixes": [
33 |       "test_",
34 |       "Test"
35 |     ],
36 |     "name_suffixes": [
37 |       "_test",
38 |       "Test",
39 |       "_spec"
40 |     ]
41 |   },
42 |   "custom_mappings": {
43 |     "comment": "Optional custom mappings for special cases"
44 |   }
45 | }


--------------------------------------------------------------------------------
/test/__fixtures__/duplication/semantic/async_operations_2.ts:
--------------------------------------------------------------------------------
 1 | // Same async operations using functional approach
 2 | // Shows semantic equivalence with different structure
 3 | 
 4 | export const fetchData = (url: string): Promise<any> => // GETs `url` and resolves with the parsed JSON body; rejects on network failure, non-OK status or invalid JSON.
 5 |   fetch(url)
 6 |     .then(response => {
 7 |       if (!response.ok) { // fetch only rejects on network errors, so HTTP error statuses are turned into exceptions here.
 8 |         throw new Error(`Failed to fetch: ${response.status}`);
 9 |       }
10 |       return response.json();
11 |     })
12 |     .catch(error => { // Chained after .then, so it also sees the non-OK Error and JSON parse failures.
13 |       console.error('Fetch error:', error);
14 |       throw error; // Rethrown unchanged so the returned promise still rejects.
15 |     });
16 | 
17 | export const fetchWithRetry = async ( // Calls fetchData, retrying on failure through a recursive helper, and rejects with the last error.
18 |   url: string, 
19 |   maxRetries: number = 3
20 | ): Promise<any> => {
21 |   const attempt = async (retriesLeft: number): Promise<any> => { // One try; on failure waits, then recurses with one retry fewer.
22 |     try {
23 |       return await fetchData(url);
24 |     } catch (error) {
25 |       if (retriesLeft === 0) throw error; // Out of retries: surface the most recent failure.
26 |       
27 |       await new Promise(resolve => 
28 |         setTimeout(resolve, 1000 * (maxRetries - retriesLeft + 1)) // Delay grows by 1s per failure; the first wait is 2s because retriesLeft starts at maxRetries - 1.
29 |       );
30 |       
31 |       return attempt(retriesLeft - 1);
32 |     }
33 |   };
34 |   
35 |   return attempt(maxRetries - 1); // Total tries = maxRetries. NOTE(review): with maxRetries <= 0 the counter starts below 0 and never reaches the `=== 0` stop condition.
36 | };


--------------------------------------------------------------------------------
/crates/similarity-generic/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "similarity-generic"
 3 | version = "0.4.2"
 4 | edition = "2021"
 5 | license = "MIT"
 6 | description = "Generic language similarity analyzer using tree-sitter"
 7 | authors = ["mizchi"]
 8 | repository = "https://github.com/mizchi/similarity"
 9 | build = "build.rs"
10 | 
11 | [[bin]]
12 | name = "similarity-generic"
13 | path = "src/main.rs"
14 | 
15 | [dependencies]
16 | similarity-core = { version = "0.4.2", path = "../core" }
17 | clap = { version = "4.0", features = ["derive"] }
18 | anyhow = "1.0"
19 | tree-sitter = { workspace = true }
20 | tree-sitter-go = { workspace = true }
21 | tree-sitter-java = { workspace = true }
22 | tree-sitter-c = { workspace = true }
23 | tree-sitter-cpp = { workspace = true }
24 | tree-sitter-c-sharp = { workspace = true }
25 | tree-sitter-ruby = { workspace = true }
26 | serde = { version = "1.0", features = ["derive"] }
27 | serde_json = "1.0"
28 | once_cell = "1.21"
29 | 
30 | [build-dependencies]
31 | serde_json = "1.0"
32 | once_cell = "1.21"
33 | 
34 | [dev-dependencies]
35 | tempfile = "3.0"
36 | assert_cmd = "2.0"
37 | predicates = "3.0"


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 mizchi
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/docs/lib/README.md:
--------------------------------------------------------------------------------
 1 | # Library Documentation
 2 | 
 3 | This directory contains documentation about the library design, architecture, and features.
 4 | 
 5 | ## Contents
 6 | 
 7 | - [AI Documentation](ai-documentation.md) - Comprehensive technical documentation for AI developers
 8 | - [Multi-file Similarity](multi_file_similarity.md) - Implementation details for cross-file similarity detection
 9 | - [Type Similarity Design](type-similarity-design.md) - Design document for TypeScript type similarity detection
10 | - [Visitor Implementation Example](visitor-implementation-example.md) - Example of visitor pattern implementation
11 | - [Python Support](python-support.md) - Documentation for Python language support
12 | 
13 | ## Architecture Overview
14 | 
15 | The similarity detection library is organized as a Rust workspace whose crates include:
16 | - **similarity-core**: Language-agnostic core algorithms and utilities
17 | - **similarity-ts**: TypeScript/JavaScript-specific implementation
18 | - **similarity-py**: Python-specific implementation
19 | - **similarity-rs**: Rust-specific implementation
20 | 
21 | Each language-specific crate implements the `LanguageParser` trait from the core library.


--------------------------------------------------------------------------------
/test/__fixtures__/duplication/structural/visitnode_pattern_2.ts:
--------------------------------------------------------------------------------
 1 | // visitNode pattern from function_extractor.ts
 2 | function visitNode(node: any, ancestors: any[] = []): void {
 3 |   if (!node || typeof node !== 'object') {
 4 |     return;
 5 |   }
 6 | 
 7 |   // Hand function-like nodes to the extractor matching their node type
 8 |   if (node.type === 'FunctionDeclaration' || node.type === 'FunctionExpression') {
 9 |     extractFunction(node, ancestors);
10 |   } else if (node.type === 'MethodDefinition') {
11 |     extractMethod(node, ancestors);
12 |   } else if (node.type === 'ArrowFunctionExpression') {
13 |     extractArrowFunction(node, ancestors);
14 |   }
15 | 
16 |   // Property names that are never traversed as children
17 |   const skipKeys = new Set(['loc', 'range', 'start', 'end', 'parent']);
18 | 
19 |   // Recurse into every own object/array property, appending this node to the ancestor chain
20 |   for (const key in node) {
21 |     if (node.hasOwnProperty(key) && !skipKeys.has(key)) {
22 |       const value = node[key];
23 |       
24 |       if (Array.isArray(value)) {
25 |         for (const item of value) {
26 |           visitNode(item, [...ancestors, node]);
27 |         }
28 |       } else if (value && typeof value === 'object') {
29 |         visitNode(value, [...ancestors, node]);
30 |       }
31 |     }
32 |   }
33 | }


--------------------------------------------------------------------------------
/crates/core/tests/fixtures/sample1.ts:
--------------------------------------------------------------------------------
 1 | // Test sample 1: Different functions that should have low similarity
 2 | 
 3 | export function processUserData(users: User[]): ProcessedData {
 4 |     const result: ProcessedData = {
 5 |         total: users.length,
 6 |         active: 0,
 7 |         inactive: 0
 8 |     };
 9 |     
10 |     for (const user of users) {
11 |         if (user.isActive) {
12 |             result.active++;
13 |         } else {
14 |             result.inactive++;
15 |         }
16 |     }
17 |     
18 |     return result;
19 | }
20 | 
21 | export function calculateAverage(numbers: number[]): number {
22 |     if (numbers.length === 0) return 0;
23 |     
24 |     let sum = 0;
25 |     for (const num of numbers) {
26 |         sum += num;
27 |     }
28 |     
29 |     return sum / numbers.length;
30 | }
31 | 
32 | // Similar structure but different purpose
33 | export function findMaxValue(values: number[]): number {
34 |     if (values.length === 0) return -Infinity;
35 |     
36 |     let max = values[0];
37 |     for (let i = 1; i < values.length; i++) {
38 |         if (values[i] > max) {
39 |             max = values[i];
40 |         }
41 |     }
42 |     
43 |     return max;
44 | }


--------------------------------------------------------------------------------
/examples/specs/test_async.ts:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env tsx
 2 | /**
 3 |  * Test async functionality after refactoring
 4 |  */
 5 | 
 6 | import { calculateAPTEDSimilarityAsync, calculateAPTEDSimilarityFromAST, parseAsync } from "../src/index.ts";
 7 | 
 8 | async function main() {
 9 |   const code1 = `
10 |     function add(a: number, b: number): number {
11 |       return a + b;
12 |     }
13 |   `;
14 | 
15 |   const code2 = `
16 |     function sum(x: number, y: number): number {
17 |       return x + y;
18 |     }
19 |   `;
20 | 
21 |   console.log("Testing async APTED similarity...");
22 |   const similarity = await calculateAPTEDSimilarityAsync(code1, code2);
23 |   console.log(`Similarity: ${(similarity * 100).toFixed(1)}%`);
24 | 
25 |   console.log("\nTesting with pre-parsed AST...");
26 |   const [ast1, ast2] = await Promise.all([parseAsync("test1.ts", code1), parseAsync("test2.ts", code2)]);
27 | 
28 |   const similarityFromAST = calculateAPTEDSimilarityFromAST(ast1, ast2);
29 |   console.log(`Similarity from AST: ${(similarityFromAST * 100).toFixed(1)}%`);
30 | 
31 |   console.log("\nCore modules remain sync - async parsing is handled at the application level ✓");
32 | }
33 | 
34 | main().catch(console.error);
35 | 


--------------------------------------------------------------------------------
/crates/similarity-ts/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "similarity-ts"
 3 | version = "0.4.2"
 4 | edition = "2021"
 5 | license = "MIT"
 6 | description = "CLI tool for detecting code duplication in TypeScript/JavaScript projects"
 7 | authors = ["mizchi"]
 8 | repository = "https://github.com/mizchi/similarity"
 9 | homepage = "https://github.com/mizchi/similarity"
10 | documentation = "https://docs.rs/similarity-ts"
11 | keywords = ["typescript", "javascript", "duplicate", "detection", "cli"]
12 | categories = ["command-line-utilities", "development-tools"]
13 | 
14 | [[bin]]
15 | name = "similarity-ts"
16 | path = "src/main.rs"
17 | 
18 | [dependencies]
19 | similarity-core = { version = "0.4.2", path = "../core" }
20 | clap = { version = "4.0", features = ["derive"] }
21 | anyhow = "1.0"
22 | walkdir = "2.5"
23 | ignore = "0.4"
24 | globset = "0.4"
25 | rayon = "1.10"
26 | oxc_parser = { workspace = true }
27 | oxc_ast = { workspace = true }
28 | oxc_span = { workspace = true }
29 | oxc_allocator = { workspace = true }
30 | 
31 | [dev-dependencies]
32 | assert_cmd = "2.0"
33 | predicates = "3.0"
34 | tempfile = "3.0"
35 | criterion = "0.5"
36 | rayon = "1.10"
37 | 
38 | [[bench]]
39 | name = "parallel_benchmark"
40 | harness = false


--------------------------------------------------------------------------------
/test/__fixtures__/duplication/exact/service_duplication_1.ts:
--------------------------------------------------------------------------------
 1 | // Example of exact duplication: UserService
 2 | // This is a common pattern where a service is copied and slightly modified
 3 | export class UserService {
 4 |   private users: Map<string, User> = new Map();
 5 |   
 6 |   addUser(user: User): void {
 7 |     if (!user.id) {
 8 |       throw new Error('User must have an ID');
 9 |     }
10 |     if (this.users.has(user.id)) {
11 |       throw new Error('User already exists');
12 |     }
13 |     this.users.set(user.id, user);
14 |   }
15 |   
16 |   getUser(id: string): User | undefined {
17 |     return this.users.get(id);
18 |   }
19 |   
20 |   updateUser(id: string, updates: Partial<User>): User {
21 |     const user = this.users.get(id);
22 |     if (!user) {
23 |       throw new Error('User not found');
24 |     }
25 |     const updatedUser = { ...user, ...updates };
26 |     this.users.set(id, updatedUser);
27 |     return updatedUser;
28 |   }
29 |   
30 |   deleteUser(id: string): boolean {
31 |     return this.users.delete(id);
32 |   }
33 |   
34 |   getAllUsers(): User[] {
35 |     return Array.from(this.users.values());
36 |   }
37 | }
38 | 
39 | interface User {
40 |   id: string;
41 |   name: string;
42 |   email: string;
43 |   createdAt: Date;
44 | }


--------------------------------------------------------------------------------
/__deprecated/src/core/oxc_types.ts:
--------------------------------------------------------------------------------
 1 | // Re-export oxc-parser types for easier use throughout the codebase
 2 | import type {
 3 |   Program,
 4 |   Expression,
 5 |   Statement,
 6 |   Declaration,
 7 |   IdentifierReference,
 8 |   BindingIdentifier,
 9 |   Function,
10 |   Class,
11 |   VariableDeclarator,
12 |   ModuleDeclaration,
13 |   NumericLiteral,
14 |   StringLiteral,
15 |   BooleanLiteral,
16 |   Directive,
17 | } from "@oxc-project/types";
18 | 
19 | // Re-export types
20 | export type { Program, NumericLiteral, StringLiteral, BooleanLiteral };
21 | 
22 | // Type guards
23 | export function isIdentifier(node: any): node is IdentifierReference | BindingIdentifier {
24 |   return node?.type === "Identifier";
25 | }
26 | 
27 | export function isFunctionDeclaration(node: any): node is Function {
28 |   return node?.type === "FunctionDeclaration";
29 | }
30 | 
31 | export function isClassDeclaration(node: any): node is Class {
32 |   return node?.type === "ClassDeclaration";
33 | }
34 | 
35 | export function isVariableDeclarator(node: any): node is VariableDeclarator {
36 |   return node?.type === "VariableDeclarator";
37 | }
38 | 
39 | // Union type for all AST nodes
40 | export type ASTNode = Expression | Statement | Declaration | ModuleDeclaration | Directive | Program;
41 | 


--------------------------------------------------------------------------------
/examples/test_structure_comparison.ts:
--------------------------------------------------------------------------------
 1 | // Test file for structure comparison framework
 2 | 
 3 | // Interface with common structure
 4 | interface User {
 5 |   id: string;
 6 |   name: string;
 7 |   email: string;
 8 |   age?: number;
 9 | }
10 | 
11 | // Type alias with similar structure
12 | type Person = {
13 |   id: string;
14 |   name: string; 
15 |   email: string;
16 |   age?: number;
17 | };
18 | 
19 | // Another interface with same properties (should be detected as similar)
20 | interface Customer {
21 |   id: string;
22 |   name: string;
23 |   email: string;
24 |   age?: number;
25 | }
26 | 
27 | // Type literal in variable declaration
28 | const employee: {
29 |   id: string;
30 |   name: string;
31 |   email: string;
32 |   age?: number;
33 | } = {
34 |   id: "emp001",
35 |   name: "John Doe",
36 |   email: "john@example.com",
37 |   age: 30
38 | };
39 | 
40 | // Similar class structure
41 | class Account {
42 |   id: string;
43 |   name: string;
44 |   email: string;
45 |   age?: number;
46 | 
47 |   constructor(id: string, name: string, email: string, age?: number) {
48 |     this.id = id;
49 |     this.name = name;
50 |     this.email = email;
51 |     this.age = age;
52 |   }
53 | }
54 | 
55 | // Slightly different structure (has `role` instead of `email`)
56 | interface Admin {
57 |   id: string;
58 |   name: string;
59 |   role: string;
60 |   age?: number;
61 | }


--------------------------------------------------------------------------------
/crates/similarity-css/examples/test.css:
--------------------------------------------------------------------------------
 1 | /* Test CSS file for similarity detection */
 2 | 
 3 | .button {
 4 |   background-color: #007bff;
 5 |   color: white;
 6 |   padding: 10px 20px;
 7 |   border: none;
 8 |   border-radius: 4px;
 9 |   cursor: pointer;
10 | }
11 | 
12 | .btn {
13 |   background-color: #007bff;
14 |   color: #fff;
15 |   padding: 10px 20px;
16 |   border: none;
17 |   border-radius: 4px;
18 |   cursor: pointer;
19 | }
20 | 
21 | .primary-button {
22 |   background: #007bff;
23 |   color: white;
24 |   padding: 0.625rem 1.25rem;
25 |   border: 0;
26 |   border-radius: 4px;
27 |   cursor: pointer;
28 | }
29 | 
30 | .card {
31 |   background: white;
32 |   padding: 20px;
33 |   border-radius: 8px;
34 |   box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
35 | }
36 | 
37 | .panel {
38 |   background-color: #ffffff;
39 |   padding: 20px;
40 |   border-radius: 8px;
41 |   box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
42 | }
43 | 
44 | .header {
45 |   display: flex;
46 |   justify-content: space-between;
47 |   align-items: center;
48 |   padding: 1rem;
49 | }
50 | 
51 | .navbar {
52 |   display: flex;
53 |   justify-content: space-between;
54 |   align-items: center;
55 |   padding: 16px;
56 | }
57 | 
58 | @media (max-width: 768px) {
59 |   .button,
60 |   .btn {
61 |     width: 100%;
62 |     padding: 12px;
63 |   }
64 |   
65 |   .card {
66 |     padding: 15px;
67 |   }
68 | }


--------------------------------------------------------------------------------
/test/__fixtures__/duplication/exact/service_duplication_2.ts:
--------------------------------------------------------------------------------
 1 | // Example of exact duplication: CustomerService
 2 | // This is copied from UserService with just name changes - a typical copy-paste scenario
 3 | export class CustomerService {
 4 |   private customers: Map<string, Customer> = new Map();
 5 |   
 6 |   addCustomer(customer: Customer): void {
 7 |     if (!customer.id) {
 8 |       throw new Error('Customer must have an ID');
 9 |     }
10 |     if (this.customers.has(customer.id)) {
11 |       throw new Error('Customer already exists');
12 |     }
13 |     this.customers.set(customer.id, customer);
14 |   }
15 |   
16 |   getCustomer(id: string): Customer | undefined {
17 |     return this.customers.get(id);
18 |   }
19 |   
20 |   updateCustomer(id: string, updates: Partial<Customer>): Customer {
21 |     const customer = this.customers.get(id);
22 |     if (!customer) {
23 |       throw new Error('Customer not found');
24 |     }
25 |     const updatedCustomer = { ...customer, ...updates };
26 |     this.customers.set(id, updatedCustomer);
27 |     return updatedCustomer;
28 |   }
29 |   
30 |   deleteCustomer(id: string): boolean {
31 |     return this.customers.delete(id);
32 |   }
33 |   
34 |   getAllCustomers(): Customer[] {
35 |     return Array.from(this.customers.values());
36 |   }
37 | }
38 | 
39 | interface Customer {
40 |   id: string;
41 |   name: string;
42 |   email: string;
43 |   createdAt: Date;
44 | }


--------------------------------------------------------------------------------
/crates/core/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "similarity-core"
 3 | version = "0.4.2"
 4 | edition = "2021"
 5 | license = "MIT"
 6 | description = "Core library for code similarity detection using AST-based comparison"
 7 | authors = ["mizchi"]
 8 | repository = "https://github.com/mizchi/similarity"
 9 | homepage = "https://github.com/mizchi/similarity"
10 | documentation = "https://docs.rs/similarity-core"
11 | keywords = ["typescript", "javascript", "similarity", "ast", "refactoring"]
12 | categories = ["development-tools", "parser-implementations"]
13 | 
14 | [dependencies]
15 | oxc_parser = { workspace = true }
16 | oxc_ast = { workspace = true }
17 | oxc_span = { workspace = true }
18 | oxc_allocator = { workspace = true }
19 | serde = { version = "1.0", features = ["derive"] }
20 | serde_json = "1.0"
21 | tree-sitter = { workspace = true }
22 | tree-sitter-go = { workspace = true }
23 | tree-sitter-java = { workspace = true }
24 | tree-sitter-c = { workspace = true }
25 | tree-sitter-cpp = { workspace = true }
26 | tree-sitter-c-sharp = { workspace = true }
27 | tree-sitter-ruby = { workspace = true }
28 | rayon = "1.10"
29 | ignore = "0.4"
30 | anyhow = "1.0"
31 | 
32 | [dev-dependencies]
33 | criterion = "0.5"
34 | 
35 | [[bench]]
36 | name = "tsed_benchmark"
37 | harness = false
38 | 
39 | [[bench]]
40 | name = "function_comparison"
41 | harness = false
42 | 
43 | # Examples removed - language-specific examples moved to respective crates


--------------------------------------------------------------------------------
/crates/similarity-rs/tests/parser_test.rs:
--------------------------------------------------------------------------------
 1 | #![allow(clippy::uninlined_format_args)]
 2 | 
 3 | use similarity_core::language_parser::LanguageParser;
 4 | use similarity_rs::rust_parser::RustParser;
 5 | 
 6 | #[test]
 7 | fn test_parser_parses_complete_function() {
 8 |     let code = r#"
 9 | fn add(a: i32, b: i32) -> i32 {
10 |     a + b
11 | }
12 | "#;
13 | 
14 |     let mut parser = RustParser::new().unwrap();
15 |     let tree = parser.parse(code, "test.rs").unwrap();
16 | 
17 |     // The tree should contain function signature elements
18 |     let tree_str = format!("{:?}", tree);
19 |     println!("Parsed tree: {}", tree_str);
20 | 
21 |     // Check tree size is reasonable (not just the body)
22 |     let size = tree.get_subtree_size();
23 |     println!("Tree size: {}", size);
24 |     assert!(size > 10, "Tree too small, might be parsing only body: {}", size);
25 | }
26 | 
27 | #[test]
28 | fn test_parser_differentiates_function_names() {
29 |     let code1 = "fn foo() {}";
30 |     let code2 = "fn bar() {}";
31 | 
32 |     let mut parser = RustParser::new().unwrap();
33 |     let tree1 = parser.parse(code1, "test.rs").unwrap();
34 |     let tree2 = parser.parse(code2, "test.rs").unwrap();
35 | 
36 |     // Trees should be different even for empty functions with different names
37 |     assert_ne!(
38 |         format!("{:?}", tree1),
39 |         format!("{:?}", tree2),
40 |         "Functions with different names should produce different trees"
41 |     );
42 | }
43 | 


--------------------------------------------------------------------------------
/examples/specs/test_extraction.ts:
--------------------------------------------------------------------------------
 1 | import { extractFunctions } from "../src/core/function_extractor.ts";
 2 | 
 3 | const code = `
 4 | class UserService {
 5 |   addUser(user: User): void {
 6 |     if (!user.id) {
 7 |       throw new Error('User must have an ID');
 8 |     }
 9 |     this.users.set(user.id, user);
10 |     console.log(\`User \${user.name} added\`);
11 |   }
12 | }
13 | 
14 | function addUserToStore(store: Map<string, User>, user: User): void {
15 |   if (!user.id) {
16 |     throw new Error('User must have an ID');
17 |   }
18 |   store.set(user.id, user);
19 |   console.log(\`User \${user.name} added\`);
20 | }
21 | 
22 | const addUserToMap = (userMap: Map<string, User>, newUser: User): void => {
23 |   if (!newUser.id) {
24 |     throw new Error('User must have an ID');
25 |   }
26 |   userMap.set(newUser.id, newUser);
27 |   console.log(\`User \${newUser.name} added\`);
28 | };
29 | `;
30 | 
31 | console.log("Extracting functions...\n");
32 | const functions = extractFunctions(code);
33 | 
34 | console.log(`Found ${functions.length} functions:\n`);
35 | 
36 | functions.forEach((func) => {
37 |   console.log(`Name: ${func.name}`);
38 |   console.log(`Type: ${func.type}`);
39 |   console.log(`Parameters: [${func.parameters.join(", ")}]`);
40 |   console.log(`Body length: ${func.body.length}`);
41 |   console.log(`Body preview: ${func.body.substring(0, 100)}...`);
42 |   if (func.className) {
43 |     console.log(`Class: ${func.className}`);
44 |   }
45 |   console.log("---\n");
46 | });
47 | 


--------------------------------------------------------------------------------
/examples/specs/debug_arrow.ts:
--------------------------------------------------------------------------------
 1 | import { parseTypeScript } from "../src/parser.ts";
 2 | 
 3 | const code = `
 4 | const addUserToMap = (userMap: Map<string, User>, newUser: User): void => {
 5 |   if (!newUser.id) {
 6 |     throw new Error('User must have an ID');
 7 |   }
 8 |   userMap.set(newUser.id, newUser);
 9 |   console.log(\`User \${newUser.name} added\`);
10 | };
11 | `;
12 | 
13 | const ast = parseTypeScript("test.ts", code);
14 | 
15 | function findArrowFunction(node: any, depth = 0): void {
16 |   if (!node || typeof node !== "object") return;
17 | 
18 |   const indent = "  ".repeat(depth);
19 | 
20 |   if (node.type === "VariableDeclarator") {
21 |     console.log(indent + "VariableDeclarator:", node.id?.name);
22 |     console.log(indent + "  init type:", node.init?.type);
23 |   }
24 | 
25 |   if (node.type === "ArrowFunctionExpression") {
26 |     console.log(indent + "Found ArrowFunctionExpression!");
27 |     console.log(indent + "  has body:", !!node.body);
28 |     console.log(indent + "  body type:", node.body?.type);
29 |     console.log(indent + "  expression:", node.expression);
30 |   }
31 | 
32 |   for (const key in node) {
33 |     if (key === "parent" || key === "scope") continue;
34 |     const value = node[key];
35 |     if (Array.isArray(value)) {
36 |       value.forEach((v) => findArrowFunction(v, depth + 1));
37 |     } else if (value && typeof value === "object") {
38 |       findArrowFunction(value, depth + 1);
39 |     }
40 |   }
41 | }
42 | 
43 | findArrowFunction(ast.program);
44 | 


--------------------------------------------------------------------------------
/examples/duplicate_python.py:
--------------------------------------------------------------------------------
 1 | # Example Python file with duplicate functions
 2 | 
 3 | def process_data(data):
 4 |     """Process data and return result."""
 5 |     result = []
 6 |     for item in data:
 7 |         if item > 0:
 8 |             result.append(item * 2)
 9 |     return result
10 | 
11 | def transform_data(data):
12 |     """Transform data and return result."""
13 |     output = []
14 |     for element in data:
15 |         if element > 0:
16 |             output.append(element * 2)
17 |     return output
18 | 
19 | class DataProcessor:
20 |     def __init__(self):
21 |         self.cache = {}
22 |     
23 |     def process(self, data):
24 |         result = []
25 |         for item in data:
26 |             if item > 0:
27 |                 result.append(item * 2)
28 |         return result
29 |     
30 |     def transform(self, data):
31 |         output = []
32 |         for element in data:
33 |             if element > 0:
34 |                 output.append(element * 2)
35 |         return output
36 | 
37 | # Another duplicate with slight variations
38 | def filter_and_double(items):
39 |     """Filter positive numbers and double them."""
40 |     filtered = []
41 |     for i in items:
42 |         if i > 0:
43 |             filtered.append(i * 2)
44 |     return filtered
45 | 
46 | class NumberProcessor:
47 |     def process_numbers(self, numbers):
48 |         processed = []
49 |         for num in numbers:
50 |             if num > 0:
51 |                 processed.append(num * 2)
52 |         return processed


--------------------------------------------------------------------------------
/crates/similarity-rs/tests/test_rename_zero.rs:
--------------------------------------------------------------------------------
 1 | #![allow(clippy::uninlined_format_args)]
 2 | 
 3 | use similarity_core::{
 4 |     language_parser::LanguageParser,
 5 |     tsed::{calculate_tsed, TSEDOptions},
 6 | };
 7 | use similarity_rs::rust_parser::RustParser;
 8 | 
 9 | #[test]
10 | fn test_rename_cost_zero() {
11 |     let code1 = r#"
12 |     let result = x + 1;
13 |     result * 2
14 | "#;
15 | 
16 |     let code2 = r#"
17 |     let temp = y + 1;
18 |     temp * 2
19 | "#;
20 | 
21 |     let mut parser = RustParser::new().unwrap();
22 |     let tree1 = parser.parse(code1, "test1.rs").unwrap();
23 |     let tree2 = parser.parse(code2, "test2.rs").unwrap();
24 | 
25 |     // Case 1: rename_cost = 0.0, compare_values = true
26 |     let mut options = TSEDOptions::default();
27 |     options.apted_options.rename_cost = 0.0;
28 |     options.apted_options.compare_values = true;
29 | 
30 |     let similarity = calculate_tsed(&tree1, &tree2, &options);
31 |     println!("With compare_values=true, rename_cost=0.0: {:.2}%", similarity * 100.0);
32 | 
33 |     // Case 2: rename_cost = 0.0, compare_values = false (compare structure only)
34 |     options.apted_options.compare_values = false;
35 |     let similarity2 = calculate_tsed(&tree1, &tree2, &options);
36 |     println!("With compare_values=false, rename_cost=0.0: {:.2}%", similarity2 * 100.0);
37 | 
38 |     // Case 3: default settings (results are only printed; nothing is asserted)
39 |     let options_default = TSEDOptions::default();
40 |     let similarity3 = calculate_tsed(&tree1, &tree2, &options_default);
41 |     println!("With default settings: {:.2}%", similarity3 * 100.0);
42 | }
43 | 


--------------------------------------------------------------------------------
/crates/similarity-md/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | authors = ["mizchi"]
 3 | categories = ["command-line-utilities", "development-tools"]
 4 | description = "Experimental CLI tool for detecting content similarity in Markdown documents"
 5 | documentation = "https://docs.rs/similarity-md"
 6 | edition = "2021"
 7 | homepage = "https://github.com/mizchi/similarity"
 8 | keywords = ["markdown", "similarity", "duplicate", "detection", "cli"]
 9 | license = "MIT"
10 | name = "similarity-md"
11 | publish = true
12 | repository = "https://github.com/mizchi/similarity"
13 | version = "0.4.2"
14 | 
15 | [[bin]]
16 | name = "similarity-md"
17 | path = "src/main.rs"
18 | 
19 | [dependencies]
20 | anyhow = "1.0"
21 | clap = {version = "4.0", features = ["derive"]}
22 | globset = "0.4"
23 | ignore = "0.4"
24 | pulldown-cmark = "0.10"
25 | rayon = "1.10"
26 | serde = {version = "1.0", features = ["derive"]}
27 | serde_json = "1.0"
28 | vibrato = "0.5"
29 | walkdir = "2.5"
30 | zstd = {version = "0.13", optional = true}
31 | 
32 | [features]
33 | default = []
34 | zstd-support = ["zstd"]
35 | 
36 | [dev-dependencies]
37 | assert_cmd = "2.0"
38 | criterion = "0.5"
39 | predicates = "3.0"
40 | tempfile = "3.0"
41 | 
42 | [[bench]]
43 | harness = false
44 | name = "markdown_similarity_benchmark"
45 | 
46 | [[example]]
47 | name = "morphological_test"
48 | path = "examples/morphological_test.rs"
49 | 
50 | [[example]]
51 | name = "debug_similarity"
52 | path = "examples/debug_similarity.rs"
53 | 
54 | [[example]]
55 | name = "test_levenshtein"
56 | path = "examples/test_levenshtein.rs"
57 | 


--------------------------------------------------------------------------------
/crates/similarity-css/examples/test.scss:
--------------------------------------------------------------------------------
 1 | // Test SCSS file for similarity detection
 2 | 
 3 | $primary-color: #007bff;
 4 | $white: #fff;
 5 | $spacing-unit: 20px;
 6 | 
 7 | @mixin button-base {
 8 |   padding: 10px 20px;
 9 |   border: none;
10 |   border-radius: 4px;
11 |   cursor: pointer;
12 | }
13 | 
14 | .button {
15 |   @include button-base;
16 |   background-color: $primary-color;
17 |   color: white;
18 | }
19 | 
20 | .btn {
21 |   @include button-base;
22 |   background-color: $primary-color;
23 |   color: $white;
24 | }
25 | 
26 | .primary-button {
27 |   background: $primary-color;
28 |   color: white;
29 |   padding: 0.625rem 1.25rem;
30 |   border: 0;
31 |   border-radius: 4px;
32 |   cursor: pointer;
33 | }
34 | 
35 | @mixin card-style {
36 |   background: white;
37 |   padding: $spacing-unit;
38 |   border-radius: 8px;
39 |   box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
40 | }
41 | 
42 | .card {
43 |   @include card-style;
44 | }
45 | 
46 | .panel {
47 |   background-color: #ffffff;
48 |   padding: $spacing-unit;
49 |   border-radius: 8px;
50 |   box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
51 | }
52 | 
53 | @mixin flex-header {
54 |   display: flex;
55 |   justify-content: space-between;
56 |   align-items: center;
57 | }
58 | 
59 | .header {
60 |   @include flex-header;
61 |   padding: 1rem;
62 | }
63 | 
64 | .navbar {
65 |   @include flex-header;
66 |   padding: 16px;
67 | }
68 | 
69 | @media (max-width: 768px) {
70 |   .button,
71 |   .btn {
72 |     width: 100%;
73 |     padding: 12px;
74 |   }
75 |   
76 |   .card {
77 |     padding: 15px;
78 |   }
79 | }


--------------------------------------------------------------------------------
/test/__fixtures__/duplication/copy_paste/loop_pattern.ts:
--------------------------------------------------------------------------------
 1 | // Copy-paste duplication: Loop patterns with similar structure
 2 | // Common pattern where loops are copied and modified slightly
 3 | 
 4 | export function findMaxValue(numbers: number[]): number {
 5 |   let max = numbers[0];
 6 |   for (let i = 1; i < numbers.length; i++) {
 7 |     if (numbers[i] > max) {
 8 |       max = numbers[i];
 9 |     }
10 |   }
11 |   return max;
12 | }
13 | 
14 | export function findMinValue(numbers: number[]): number {
15 |   let min = numbers[0];
16 |   for (let i = 1; i < numbers.length; i++) {
17 |     if (numbers[i] < min) {
18 |       min = numbers[i];
19 |     }
20 |   }
21 |   return min;
22 | }
23 | 
24 | export function calculateSum(numbers: number[]): number {
25 |   let sum = 0;
26 |   for (let i = 0; i < numbers.length; i++) {
27 |     sum += numbers[i];
28 |   }
29 |   return sum;
30 | }
31 | 
32 | export function calculateProduct(numbers: number[]): number {
33 |   let product = 1;
34 |   for (let i = 0; i < numbers.length; i++) {
35 |     product *= numbers[i];
36 |   }
37 |   return product;
38 | }
39 | 
40 | export function countPositive(numbers: number[]): number {
41 |   let count = 0;
42 |   for (let i = 0; i < numbers.length; i++) {
43 |     if (numbers[i] > 0) {
44 |       count++;
45 |     }
46 |   }
47 |   return count;
48 | }
49 | 
50 | export function countNegative(numbers: number[]): number {
51 |   let count = 0;
52 |   for (let i = 0; i < numbers.length; i++) {
53 |     if (numbers[i] < 0) {
54 |       count++;
55 |     }
56 |   }
57 |   return count;
58 | }


--------------------------------------------------------------------------------
/test/__fixtures__/duplication/structural/array_iteration_pattern_2.ts:
--------------------------------------------------------------------------------
 1 | // Structural duplication: Functional array processing
 2 | // Same logic as pattern_1 but using a functional approach
 3 | export function processUserData(users: User[]): ProcessedUser[] {
 4 |   return users
 5 |     .filter(user => user.isActive)
 6 |     .map(user => ({
 7 |       id: user.id,
 8 |       displayName: `${user.firstName} ${user.lastName}`,
 9 |       status: 'active',
10 |       lastSeen: user.lastLogin
11 |     }));
12 | }
13 | 
14 | export function processOrderData(orders: Order[]): ProcessedOrder[] {
15 |   return orders
16 |     .filter(order => order.status === 'completed')
17 |     .map(order => ({
18 |       id: order.id,
19 |       customerName: `${order.customer.firstName} ${order.customer.lastName}`,
20 |       total: order.items.reduce((sum, item) => sum + item.price, 0),
21 |       completedAt: order.completedDate
22 |     }));
23 | }
24 | 
25 | // Same types as pattern_1
26 | interface User {
27 |   id: string;
28 |   firstName: string;
29 |   lastName: string;
30 |   isActive: boolean;
31 |   lastLogin: Date;
32 | }
33 | 
34 | interface ProcessedUser {
35 |   id: string;
36 |   displayName: string;
37 |   status: string;
38 |   lastSeen: Date;
39 | }
40 | 
41 | interface Order {
42 |   id: string;
43 |   status: string;
44 |   customer: { firstName: string; lastName: string };
45 |   items: Array<{ price: number }>;
46 |   completedDate: Date;
47 | }
48 | 
49 | interface ProcessedOrder {
50 |   id: string;
51 |   customerName: string;
52 |   total: number;
53 |   completedAt: Date;
54 | }


--------------------------------------------------------------------------------
/crates/similarity-generic/examples/usage.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | echo "=== similarity-generic Usage Examples ==="
 4 | echo
 5 | 
 6 | echo "1. Basic usage with built-in language support:"
 7 | echo "   similarity-generic sample.go --language go"
 8 | echo
 9 | 
10 | echo "2. Show all functions in a file:"
11 | echo "   similarity-generic sample.go --language go --show-functions"
12 | echo
13 | 
14 | echo "3. Use custom threshold:"
15 | echo "   similarity-generic sample.go --language go --threshold 0.9"
16 | echo
17 | 
18 | echo "4. Show supported languages:"
19 | echo "   similarity-generic --supported"
20 | echo
21 | 
22 | echo "5. Show language configuration:"
23 | echo "   similarity-generic --show-config go"
24 | echo "   similarity-generic --show-config go > my-go-config.json"
25 | echo
26 | 
27 | echo "6. Use custom configuration file:"
28 | echo "   similarity-generic sample.go --config configs/go.json"
29 | echo
30 | 
31 | echo "7. Create and use a modified configuration:"
32 | echo "   # Get base configuration"
33 | echo "   similarity-generic --show-config go > my-config.json"
34 | echo "   # Edit my-config.json to customize"
35 | echo "   # Use the custom configuration"
36 | echo "   similarity-generic sample.go --config my-config.json"
37 | echo
38 | 
39 | echo "8. Analyze multiple files:"
40 | echo "   find . -name '*.go' -exec similarity-generic {} --language go \;"
41 | echo
42 | 
43 | echo "9. Output in VSCode-compatible format (default):"
44 | echo "   similarity-generic sample.go --language go"
45 | echo "   # Click on the file paths in VSCode terminal to jump to location"


--------------------------------------------------------------------------------
/test/__fixtures__/duplication/structural/error_handling_pattern_2.ts:
--------------------------------------------------------------------------------
 1 | // Same error handling pattern refactored to remove duplication
 2 | // This demonstrates how the structural duplication can be eliminated
 3 | 
 4 | type ApiResult<T> = { data?: T; error?: string };
 5 | 
 6 | async function apiCall<T>(
 7 |   url: string,
 8 |   options?: RequestInit,
 9 |   errorContext?: string
10 | ): Promise<ApiResult<T>> {
11 |   try {
12 |     const response = await fetch(url, options);
13 |     
14 |     if (!response.ok) {
15 |       throw new Error(`HTTP error! status: ${response.status}`);
16 |     }
17 |     
18 |     const data = await response.json();
19 |     return { data };
20 |   } catch (error) {
21 |     const context = errorContext || 'API call';
22 |     console.error(`Error in ${context}:`, error);
23 |     return { 
24 |       error: error instanceof Error ? error.message : 'Unknown error occurred' 
25 |     };
26 |   }
27 | }
28 | 
29 | export const fetchUserData = (userId: string) => 
30 |   apiCall(`/api/users/${userId}`, undefined, 'fetching user data');
31 | 
32 | export const fetchProductData = (productId: string) => 
33 |   apiCall(`/api/products/${productId}`, undefined, 'fetching product data');
34 | 
35 | export const fetchOrderData = (orderId: string) => 
36 |   apiCall(`/api/orders/${orderId}`, undefined, 'fetching order data');
37 | 
38 | export const postComment = (postId: string, comment: string) => 
39 |   apiCall(
40 |     `/api/posts/${postId}/comments`,
41 |     {
42 |       method: 'POST',
43 |       headers: { 'Content-Type': 'application/json' },
44 |       body: JSON.stringify({ comment })
45 |     },
46 |     'posting comment'
47 |   );


--------------------------------------------------------------------------------
/examples/overlap-detection/exact-duplication.js:
--------------------------------------------------------------------------------
 1 | // Test case 1: Exact duplication of code blocks
 2 | 
 3 | function processUserData(users) {
 4 |     const validUsers = [];
 5 |     // Exact duplicate block
 6 |     for (let i = 0; i < users.length; i++) {
 7 |         if (users[i].age >= 18 && users[i].isActive) {
 8 |             validUsers.push({
 9 |                 id: users[i].id,
10 |                 name: users[i].name,
11 |                 email: users[i].email
12 |             });
13 |         }
14 |     }
15 |     return validUsers;
16 | }
17 | 
18 | function filterActiveAdults(people) {
19 |     const results = [];
20 |     // Same logic, different variable names
21 |     for (let i = 0; i < people.length; i++) {
22 |         if (people[i].age >= 18 && people[i].isActive) {
23 |             results.push({
24 |                 id: people[i].id,
25 |                 name: people[i].name,
26 |                 email: people[i].email
27 |             });
28 |         }
29 |     }
30 |     return results;
31 | }
32 | 
33 | function validateAndTransform(items) {
34 |     const output = [];
35 |     // Similar pattern but with additional logic
36 |     for (let i = 0; i < items.length; i++) {
37 |         if (items[i].age >= 18 && items[i].isActive) {
38 |             // Additional validation
39 |             if (items[i].email && items[i].email.includes('@')) {
40 |                 output.push({
41 |                     id: items[i].id,
42 |                     name: items[i].name,
43 |                     email: items[i].email,
44 |                     validated: true
45 |                 });
46 |             }
47 |         }
48 |     }
49 |     return output;
50 | }


--------------------------------------------------------------------------------
/examples/specs/duplicate-functions.ts:
--------------------------------------------------------------------------------
 1 | // Example: Function duplication detection
 2 | 
 3 | // Duplicate 1: Identical functions (only the function name differs)
 4 | function calculateUserAge(birthYear: number): number {
 5 |   const currentYear = new Date().getFullYear();
 6 |   const age = currentYear - birthYear;
 7 |   return age;
 8 | }
 9 | 
10 | function calculateCustomerAge(birthYear: number): number {
11 |   const currentYear = new Date().getFullYear();
12 |   const age = currentYear - birthYear;
13 |   return age;
14 | }
15 | 
16 | // Duplicate 2: Same algorithm, different implementation style
17 | function findMaxValue(numbers: number[]): number {
18 |   let max = numbers[0];
19 |   for (let i = 1; i < numbers.length; i++) {
20 |     if (numbers[i] > max) {
21 |       max = numbers[i];
22 |     }
23 |   }
24 |   return max;
25 | }
26 | 
27 | function getMaximumValue(values: number[]): number {
28 |   let maximum = values[0];
29 |   for (const value of values) {
30 |     if (value > maximum) {
31 |       maximum = value;
32 |     }
33 |   }
34 |   return maximum;
35 | }
36 | 
37 | // Duplicate 3: Data processing with different field names
38 | function processUserData(users: any[]) {
39 |   return users
40 |     .filter(user => user.isActive)
41 |     .map(user => ({
42 |       id: user.userId,
43 |       name: user.fullName,
44 |       email: user.emailAddress
45 |     }));
46 | }
47 | 
48 | function processCustomerData(customers: any[]) {
49 |   return customers
50 |     .filter(customer => customer.isActive)
51 |     .map(customer => ({
52 |       id: customer.customerId,
53 |       name: customer.fullName,
54 |       email: customer.emailAddress
55 |     }));
56 | }


--------------------------------------------------------------------------------
/examples/specs/duplicate-functions2.ts:
--------------------------------------------------------------------------------
 1 | // Example: Function duplication detection
 2 | 
 3 | // Duplicate 1: Identical functions (only the function name differs)
 4 | function calculateUserAge(birthYear: number): number {
 5 |   const currentYear = new Date().getFullYear();
 6 |   const age = currentYear - birthYear;
 7 |   return age;
 8 | }
 9 | 
10 | function calculateCustomerAge(birthYear: number): number {
11 |   const currentYear = new Date().getFullYear();
12 |   const age = currentYear - birthYear;
13 |   return age;
14 | }
15 | 
16 | // Duplicate 2: Same algorithm, different implementation style
17 | function findMaxValue(numbers: number[]): number {
18 |   let max = numbers[0];
19 |   for (let i = 1; i < numbers.length; i++) {
20 |     if (numbers[i] > max) {
21 |       max = numbers[i];
22 |     }
23 |   }
24 |   return max;
25 | }
26 | 
27 | function getMaximumValue(values: number[]): number {
28 |   let maximum = values[0];
29 |   for (const value of values) {
30 |     if (value > maximum) {
31 |       maximum = value;
32 |     }
33 |   }
34 |   return maximum;
35 | }
36 | 
37 | // Duplicate 3: Data processing with different field names
38 | function processUserData(users: any[]) {
39 |   return users
40 |     .filter(user => user.isActive)
41 |     .map(user => ({
42 |       id: user.userId,
43 |       name: user.fullName,
44 |       email: user.emailAddress
45 |     }));
46 | }
47 | 
48 | function processCustomerData(customers: any[]) {
49 |   return customers
50 |     .filter(customer => customer.isActive)
51 |     .map(customer => ({
52 |       id: customer.customerId,
53 |       name: customer.fullName,
54 |       email: customer.emailAddress
55 |     }));
56 | }


--------------------------------------------------------------------------------
/examples/overlap-detection/false-positives.js:
--------------------------------------------------------------------------------
 1 | // Test case 4: Patterns that might cause false positives
 2 | 
 3 | // Very short similar patterns
 4 | function isPositive(n) {
 5 |     return n > 0;
 6 | }
 7 | 
 8 | function isNegative(n) {
 9 |     return n < 0;
10 | }
11 | 
12 | function isZero(n) {
13 |     return n === 0;
14 | }
15 | 
16 | // Common boilerplate patterns
17 | function fetchUserData(userId) {
18 |     try {
19 |         const user = database.get(userId);
20 |         return { success: true, data: user };
21 |     } catch (error) {
22 |         return { success: false, error: error.message };
23 |     }
24 | }
25 | 
26 | function fetchProductData(productId) {
27 |     try {
28 |         const product = database.get(productId);
29 |         return { success: true, data: product };
30 |     } catch (error) {
31 |         return { success: false, error: error.message };
32 |     }
33 | }
34 | 
35 | // Different algorithms with similar structure
36 | function bubbleSort(arr) {
37 |     const n = arr.length;
38 |     for (let i = 0; i < n - 1; i++) {
39 |         for (let j = 0; j < n - i - 1; j++) {
40 |             if (arr[j] > arr[j + 1]) {
41 |                 [arr[j], arr[j + 1]] = [arr[j + 1], arr[j]];
42 |             }
43 |         }
44 |     }
45 |     return arr;
46 | }
47 | 
48 | function selectionSort(arr) {
49 |     const n = arr.length;
50 |     for (let i = 0; i < n - 1; i++) {
51 |         let minIdx = i;
52 |         for (let j = i + 1; j < n; j++) {
53 |             if (arr[j] < arr[minIdx]) {
54 |                 minIdx = j;
55 |             }
56 |         }
57 |         [arr[i], arr[minIdx]] = [arr[minIdx], arr[i]];
58 |     }
59 |     return arr;
60 | }


--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
 1 | # similarity Documentation
 2 | 
 3 | This directory contains comprehensive documentation for the similarity detection tools, organized by category.
 4 | 
 5 | ## 📚 Documentation Structure
 6 | 
 7 | ### [Algorithm](./algorithm/)
 8 | Theoretical foundations and algorithm documentation:
 9 | - TSED (Tree Similarity of Edit Distance) academic paper
10 | - Algorithm summaries and analyses
11 | - Tree-sitter integration details
12 | 
13 | ### [Library](./lib/)
14 | Library design, architecture, and features:
15 | - AI documentation for developers
16 | - Multi-file similarity detection
17 | - Type similarity design
18 | - Language-specific implementations
19 | 
20 | ### [Implementation](./implementation/)
21 | Implementation details and performance:
22 | - Performance optimization strategies
23 | - Benchmark results
24 | - Rust vs TypeScript comparisons
25 | 
26 | ## 🚀 Quick Start
27 | 
28 | For users:
29 | - [`prompt.md`](./prompt.md) - AI assistant quick guide (English)
30 | - [`prompt-ja.md`](./prompt-ja.md) - AI assistant quick guide (Japanese)
31 | - [Main README](../README.md) - Installation and usage
32 | 
33 | For developers:
34 | - [`lib/ai-documentation.md`](./lib/ai-documentation.md) - Technical documentation
35 | - [`algorithm/tsed-similarity-summary.md`](./algorithm/tsed-similarity-summary.md) - Algorithm overview
36 | 
37 | ## 📖 Additional Resources
38 | 
39 | ### Blog Posts
40 | - [`introduce-ja.md`](./introduce-ja.md) - Project introduction and development story (Japanese)
41 | 
42 | ### Project Management
43 | - [`../CLAUDE.md`](../CLAUDE.md) - Project instructions for Claude
44 | - [`../TODO.md`](../TODO.md) - Task list
45 | - [`../CHANGELOG.md`](../CHANGELOG.md) - Version history


--------------------------------------------------------------------------------
/examples/overlap-detection/similar-patterns.js:
--------------------------------------------------------------------------------
 1 | // Test case 2: Similar algorithmic patterns
 2 | 
 3 | // Pattern 1: Array reduction
 4 | function sumValues(numbers) {
 5 |     let total = 0;
 6 |     for (let i = 0; i < numbers.length; i++) {
 7 |         total += numbers[i];
 8 |     }
 9 |     return total;
10 | }
11 | 
12 | function calculateProduct(values) {
13 |     let product = 1;
14 |     for (let i = 0; i < values.length; i++) {
15 |         product *= values[i];
16 |     }
17 |     return product;
18 | }
19 | 
20 | // Pattern 2: Find maximum
21 | function findMax(arr) {
22 |     let max = arr[0];
23 |     for (let i = 1; i < arr.length; i++) {
24 |         if (arr[i] > max) {
25 |             max = arr[i];
26 |         }
27 |     }
28 |     return max;
29 | }
30 | 
31 | function findMin(arr) {
32 |     let min = arr[0];
33 |     for (let i = 1; i < arr.length; i++) {
34 |         if (arr[i] < min) {
35 |             min = arr[i];
36 |         }
37 |     }
38 |     return min;
39 | }
40 | 
41 | // Pattern 3: Nested loops
42 | function findDuplicates(items) {
43 |     const duplicates = [];
44 |     for (let i = 0; i < items.length; i++) {
45 |         for (let j = i + 1; j < items.length; j++) {
46 |             if (items[i] === items[j]) {
47 |                 duplicates.push(items[i]);
48 |             }
49 |         }
50 |     }
51 |     return duplicates;
52 | }
53 | 
54 | function findPairs(numbers, targetSum) {
55 |     const pairs = [];
56 |     for (let i = 0; i < numbers.length; i++) {
57 |         for (let j = i + 1; j < numbers.length; j++) {
58 |             if (numbers[i] + numbers[j] === targetSum) {
59 |                 pairs.push([numbers[i], numbers[j]]);
60 |             }
61 |         }
62 |     }
63 |     return pairs;
64 | }


--------------------------------------------------------------------------------
/examples/specs/basic_usage.ts:
--------------------------------------------------------------------------------
 1 | import { CodeSimilarity } from "../src/index.ts";
 2 | // Demo script: runs CodeSimilarity on small inline snippets and logs each result.
 3 | function main() {
 4 |   const similarity = new CodeSimilarity();
 5 | 
 6 |   // Example 1: Same function shape; only the function and parameter names differ
 7 |   const code1 = `
 8 | function add(a: number, b: number): number {
 9 |   return a + b;
10 | }`;
11 | 
12 |   const code2 = `
13 | function sum(x: number, y: number): number {
14 |   return x + y;
15 | }`;
16 | 
17 |   console.log("=== Example 1: Similar functions ===");
18 |   const score1 = similarity.calculateSimilarity(code1, code2);
19 |   console.log(`Similarity score: ${score1.toFixed(4)}`);
20 | 
21 |   const report1 = similarity.getDetailedReport(code1, code2);
22 |   console.log("Detailed report:", report1);
23 | 
24 |   // Example 2: Identical code (the same snippet is compared with itself)
25 |   const code3 = `
26 | class Calculator {
27 |   add(a: number, b: number): number {
28 |     return a + b;
29 |   }
30 | }`;
31 | 
32 |   console.log("\n=== Example 2: Identical code ===");
33 |   const score2 = similarity.calculateSimilarity(code3, code3);
34 |   console.log(`Similarity score: ${score2.toFixed(4)} (should be 1.0)`);
35 | 
36 |   // Example 3: Very different code (a class compared with an interface)
37 |   const code4 = `
38 | interface User {
39 |   id: number;
40 |   name: string;
41 | }`;
42 | 
43 |   console.log("\n=== Example 3: Different code structures ===");
44 |   const score3 = similarity.calculateSimilarity(code3, code4);
45 |   console.log(`Similarity score: ${score3.toFixed(4)}`);
46 | 
47 |   // Example 4: Parse code1 and print the first 500 characters of its AST as JSON
48 |   console.log("\n=== Example 4: AST Structure ===");
49 |   const ast = similarity.parse(code1);
50 |   console.log("AST for code1:");
51 |   console.log(JSON.stringify(ast.program, null, 2).substring(0, 500) + "...");
52 | }
53 | 
54 | // Run the examples
55 | main();
56 | 


--------------------------------------------------------------------------------
/crates/similarity-rs/tests/test_debug_rename_cost.rs:
--------------------------------------------------------------------------------
 1 | #![allow(clippy::uninlined_format_args)]
 2 | 
 3 | use similarity_core::{
 4 |     language_parser::LanguageParser,
 5 |     tsed::{calculate_tsed, TSEDOptions},
 6 | };
 7 | use similarity_rs::rust_parser::RustParser;
 8 | /// Diagnostic test: prints both parsed trees and the TSED similarity for several rename costs; it asserts nothing.
 9 | #[test]
10 | fn test_rename_cost_effect() {
11 |     let code1 = r#"
12 |     let result = x + 1;
13 |     result * 2
14 | "#;
15 | 
16 |     let code2 = r#"
17 |     let temp = y + 1;
18 |     temp * 2
19 | "#;
20 | 
21 |     let mut parser = RustParser::new().unwrap();
22 |     let tree1 = parser.parse(code1, "test1.rs").unwrap();
23 |     let tree2 = parser.parse(code2, "test2.rs").unwrap();
24 | 
25 |     // Recursively print each node's label (plus its value when non-empty), indented two spaces per depth level
26 |     fn print_tree(node: &similarity_core::tree::TreeNode, depth: usize) {
27 |         let indent = "  ".repeat(depth);
28 |         if node.value.is_empty() {
29 |             println!("{}{}", indent, node.label);
30 |         } else {
31 |             println!("{}{} = '{}'", indent, node.label, node.value);
32 |         }
33 |         for child in &node.children {
34 |             print_tree(child, depth + 1);
35 |         }
36 |     }
37 | 
38 |     println!("=== Tree 1 ===");
39 |     print_tree(&tree1, 0);
40 |     println!("\n=== Tree 2 ===");
41 |     print_tree(&tree2, 0);
42 | 
43 |     // Sweep rename_cost with value comparison enabled and print the similarity for each setting
44 |     for rename_cost in [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 1.0] {
45 |         let mut options = TSEDOptions::default();
46 |         options.apted_options.rename_cost = rename_cost;
47 |         options.apted_options.compare_values = true;
48 | 
49 |         let similarity = calculate_tsed(&tree1, &tree2, &options);
50 |         println!("rename_cost = {:.1}: similarity = {:.2}%", rename_cost, similarity * 100.0);
51 |     }
52 | }
53 | 


--------------------------------------------------------------------------------
/examples/specs/sample_project/src/components/user_list.ts:
--------------------------------------------------------------------------------
 1 | import { User } from "../models/user.ts";
 2 | 
 3 | /**
 4 |  * Renders a collection of users into a DOM container as a `<ul class="user-list">`.
 5 |  *
 6 |  * Every mutator (`setUsers`, `addUser`, `removeUser`) re-renders the whole list.
 7 |  */
 8 | export class UserList {
 9 |   private container: HTMLElement;
10 |   private users: User[] = [];
11 | 
12 |   /**
13 |    * @param containerId - `id` of the element that hosts the list.
14 |    * @throws Error when no element with that id exists in the document.
15 |    */
16 |   constructor(containerId: string) {
17 |     const element = document.getElementById(containerId);
18 |     if (!element) {
19 |       throw new Error(`Container with id ${containerId} not found`);
20 |     }
21 |     this.container = element;
22 |   }
23 | 
24 |   /** Replaces the displayed users and re-renders. */
25 |   setUsers(users: User[]): void {
26 |     // Copy so a later addUser() cannot mutate the caller's array.
27 |     this.users = [...users];
28 |     this.render();
29 |   }
30 | 
31 |   /** Appends one user and re-renders. */
32 |   addUser(user: User): void {
33 |     this.users.push(user);
34 |     this.render();
35 |   }
36 | 
37 |   /** Removes every user whose `id` equals `userId` and re-renders. */
38 |   removeUser(userId: string): void {
39 |     this.users = this.users.filter((u) => u.id !== userId);
40 |     this.render();
41 |   }
42 | 
43 |   /** Rebuilds the container's content from the current user list. */
44 |   private render(): void {
45 |     this.container.innerHTML = "";
46 | 
47 |     if (this.users.length === 0) {
48 |       this.container.innerHTML = "<p>No users found</p>";
49 |       return;
50 |     }
51 | 
52 |     const ul = document.createElement("ul");
53 |     ul.className = "user-list";
54 | 
55 |     this.users.forEach((user) => {
56 |       const li = document.createElement("li");
57 |       li.className = "user-item";
58 |       // Escape every interpolated field, `role` included, so user data
59 |       // cannot inject markup through innerHTML.
60 |       li.innerHTML = `
61 |         <span class="user-name">${this.escapeHtml(user.name)}</span>
62 |         <span class="user-email">${this.escapeHtml(user.email)}</span>
63 |         <span class="user-role">${this.escapeHtml(String(user.role))}</span>
64 |       `;
65 |       ul.appendChild(li);
66 |     });
67 | 
68 |     this.container.appendChild(ul);
69 |   }
70 | 
71 |   /** Escapes `text` for use as element content by round-tripping it through a detached node. */
72 |   private escapeHtml(text: string): string {
73 |     const div = document.createElement("div");
74 |     div.textContent = text;
75 |     return div.innerHTML;
76 |   }
77 | }
78 | 


--------------------------------------------------------------------------------
/docs/prompt-ja.md:
--------------------------------------------------------------------------------
 1 | # similarity-ts: AIアシスタントガイド
 2 | 
 3 | ## 目的
 4 | ASTベースの比較でTypeScript/JavaScriptの重複コードを検出し、リファクタリングを支援します。
 5 | 
 6 | ## インストール
 7 | ```bash
 8 | cargo install similarity-ts
 9 | ```
10 | 
11 | ## コマンド形式
12 | ```bash
13 | similarity-ts [パス...] [オプション]
14 | ```
15 | 
16 | ## 主要オプション
17 | - `--threshold <0-1>`: 類似度しきい値（デフォルト: 0.8）
18 | - `--min-tokens <n>`: n個未満のASTノードを持つ関数をスキップ（推奨: 20-30）
19 | - `--print`: 実際のコードスニペットを表示
20 | 
21 | ## AIリファクタリングワークフロー
22 | 
23 | ### 1. 広範囲スキャン
24 | コードベース全体の重複を発見：
25 | ```bash
26 | similarity-ts src/ --threshold 0.85 --min-tokens 25
27 | ```
28 | 
29 | ### 2. 詳細分析
30 | 特定のファイルペアを調査：
31 | ```bash
32 | similarity-ts file1.ts file2.ts --threshold 0.8 --min-tokens 20 --print
33 | ```
34 | 
35 | ### 3. しきい値調整
36 | 結果がない場合は段階的に下げる：
37 | ```bash
38 | similarity-ts file1.ts file2.ts --threshold 0.75 --min-tokens 20
39 | similarity-ts file1.ts file2.ts --threshold 0.7 --min-tokens 20
40 | ```
41 | 
42 | ## 出力形式
43 | ```
44 | Function: functionName (file.ts:開始行-終了行)
45 | Similar to: otherFunction (other.ts:開始行-終了行)  
46 | Similarity: 85%
47 | ```
48 | 
49 | ## 効果的なしきい値
50 | - `0.95+`: ほぼ同一（変数名の違いのみ）
51 | - `0.85-0.95`: 同じアルゴリズム、軽微な違い
52 | - `0.75-0.85`: 類似構造、詳細は異なる
53 | - `0.7-0.75`: 関連ロジック、調査の価値あり
54 | 
55 | ## リファクタリング戦略
56 | 
57 | 1. **高いしきい値から開始**（0.9）で明らかな重複を発見
58 | 2. **特定ペアを比較**して類似性を確認
59 | 3. **--printを使用**して実際のコードの違いを確認
60 | 4. **共通ロジックを抽出**して共有関数/モジュール化
61 | 5. **リファクタリング後に再実行**して新たな重複がないか確認
62 | 
63 | ## リファクタリング対象の一般的なパターン
64 | 
65 | - **データ処理ループ**（異なるフィールド名）
66 | - **APIハンドラー**（類似のリクエスト/レスポンスロジック）
67 | - **バリデーション関数**（異なるルール）
68 | - **状態管理**（繰り返されるパターン）
69 | 
70 | ## ベストプラクティス
71 | 
72 | - 正確な複雑さフィルタリングに`--min-tokens`を使用（20-30トークン）
73 | - 80%以上の類似度のファイルを優先
74 | - 類似関数が同じモジュール内にあるか確認（リファクタリングが容易）
75 | - 関数サイズを考慮 - 大きな重複ほど影響大
76 | - ペアだけでなく複数ファイルにまたがるパターンを探す


--------------------------------------------------------------------------------
/examples/specs/type-similarity/test_types_sample.ts:
--------------------------------------------------------------------------------
 1 | // Sample TypeScript file for testing type similarity detection
 2 | 
 3 | // Similar interfaces - should be detected as highly similar
 4 | interface User {
 5 |   id: string;
 6 |   name: string;
 7 |   email: string;
 8 |   age?: number;
 9 | }
10 | 
11 | interface Person {
12 |   id: string;
13 |   name: string;
14 |   email: string;
15 |   age?: number;
16 | }
17 | 
18 | // Similar but with different property names - should be detected as moderately similar
19 | interface Customer {
20 |   id: string;
21 |   fullName: string;
22 |   emailAddress: string;
23 |   yearsOld?: number;
24 | }
25 | 
26 | // Type alias vs interface - should be detected as similar if cross-kind comparison is enabled
27 | type UserType = {
28 |   id: string;
29 |   name: string;
30 |   email: string;
31 |   age?: number;
32 | };
33 | 
34 | // Union types
35 | type Status = "active" | "inactive" | "pending";
36 | type State = "active" | "inactive" | "suspended";
37 | 
38 | // Different structure - should not be similar
39 | interface Product {
40 |   sku: string;
41 |   price: number;
42 |   category: string;
43 |   inStock: boolean;
44 | }
45 | 
46 | // Generic interface
47 | interface Container<T> {
48 |   value: T;
49 |   metadata: {
50 |     created: Date;
51 |     updated: Date;
52 |   };
53 | }
54 | 
55 | // Similar generic interface
56 | interface Wrapper<T> {
57 |   value: T;
58 |   metadata: {
59 |     created: Date;
60 |     updated: Date;
61 |   };
62 | }
63 | 
64 | // Interface with extends
65 | interface BaseEntity {
66 |   id: string;
67 |   createdAt: Date;
68 | }
69 | 
70 | interface ExtendedUser extends BaseEntity {
71 |   name: string;
72 |   email: string;
73 | }
74 | 
75 | interface ExtendedPerson extends BaseEntity {
76 |   name: string;
77 |   email: string;
78 | }
79 | 


--------------------------------------------------------------------------------
/crates/similarity-rs/tests/test_full_function_similarity.rs:
--------------------------------------------------------------------------------
 1 | use similarity_core::language_parser::LanguageParser;
 2 | use similarity_core::{
 3 |     apted::APTEDOptions,
 4 |     tsed::{calculate_tsed, TSEDOptions},
 5 | };
 6 | use similarity_rs::rust_parser::RustParser;
 7 | /// Checks that one-line functions differing in name and operator score below 100% TSED similarity.
 8 | #[test]
 9 | fn test_full_function_similarity() {
10 |     let mut parser = RustParser::new().unwrap();
11 | 
12 |     let func1 = "fn add(a: i32, b: i32) -> i32 { a + b }";
13 |     let func2 = "fn sub(a: i32, b: i32) -> i32 { a - b }";
14 |     let func3 = "fn mul(a: i32, b: i32) -> i32 { a * b }";
15 | 
16 |     let tree1 = parser.parse(func1, "test1.rs").unwrap();
17 |     let tree2 = parser.parse(func2, "test2.rs").unwrap();
18 |     let tree3 = parser.parse(func3, "test3.rs").unwrap();
19 |     // Explicit options: node values are compared, and a rename (0.3) costs less than an insert or delete (1.0)
20 |     let options = TSEDOptions {
21 |         apted_options: APTEDOptions {
22 |             rename_cost: 0.3,
23 |             delete_cost: 1.0,
24 |             insert_cost: 1.0,
25 |             compare_values: true,
26 |         },
27 |         min_lines: 1,
28 |         min_tokens: None,
29 |         size_penalty: true,
30 |         skip_test: false,
31 |     };
32 | 
33 |     let sim12 = calculate_tsed(&tree1, &tree2, &options);
34 |     let sim13 = calculate_tsed(&tree1, &tree3, &options);
35 | 
36 |     println!("Tree1 size: {}", tree1.get_subtree_size());
37 |     println!("Tree2 size: {}", tree2.get_subtree_size());
38 |     println!("Full function similarity 'add' vs 'sub': {:.2}%", sim12 * 100.0);
39 |     println!("Full function similarity 'add' vs 'mul': {:.2}%", sim13 * 100.0);
40 | 
41 |     // The pairs differ in function name and operator, so neither comparison may reach 1.0
42 |     assert!(sim12 < 1.0, "Different functions should not be 100% similar, got {}%", sim12 * 100.0);
43 |     assert!(sim13 < 1.0, "Different functions should not be 100% similar, got {}%", sim13 * 100.0);
44 | }
45 | 


--------------------------------------------------------------------------------
/examples/specs/sample_project/src/services/user_service.ts:
--------------------------------------------------------------------------------
 1 | import { User } from "../models/user.ts";
 2 | import { Logger } from "../utils/logger.ts";
 3 | 
 4 | /**
 5 |  * In-memory user store keyed by id; each mutation and failed lookup is
 6 |  * reported through the injected logger.
 7 |  */
 8 | export class UserService {
 9 |   private users: Map<string, User> = new Map();
10 |   private logger: Logger;
11 | 
12 |   /** @param logger - Receives the `info`/`warn` messages emitted by this service. */
13 |   constructor(logger: Logger) {
14 |     this.logger = logger;
15 |   }
16 | 
17 |   /**
18 |    * Stores a new user built from `data` plus a generated id and the current
19 |    * time as `createdAt`.
20 |    * @returns The stored user.
21 |    */
22 |   async createUser(data: Omit<User, "id" | "createdAt">): Promise<User> {
23 |     const user: User = {
24 |       id: this.generateId(),
25 |       ...data,
26 |       createdAt: new Date(),
27 |     };
28 | 
29 |     this.users.set(user.id, user);
30 |     this.logger.info(`User created: ${user.id}`);
31 | 
32 |     return user;
33 |   }
34 | 
35 |   /** @returns The user stored under `id`, or `null` (after logging a warning) if there is none. */
36 |   async getUserById(id: string): Promise<User | null> {
37 |     const user = this.users.get(id);
38 | 
39 |     if (!user) {
40 |       this.logger.warn(`User not found: ${id}`);
41 |       return null;
42 |     }
43 | 
44 |     return user;
45 |   }
46 | 
47 |   /**
48 |    * Merges `updates` into the stored user; the original id always wins.
49 |    * @returns The updated user, or `null` if `id` is unknown.
50 |    */
51 |   async updateUser(id: string, updates: Partial<User>): Promise<User | null> {
52 |     const user = await this.getUserById(id);
53 | 
54 |     if (!user) {
55 |       return null;
56 |     }
57 | 
58 |     const updatedUser = { ...user, ...updates, id: user.id };
59 |     this.users.set(id, updatedUser);
60 |     this.logger.info(`User updated: ${id}`);
61 | 
62 |     return updatedUser;
63 |   }
64 | 
65 |   /** @returns `true` if a user with `id` existed and was removed, otherwise `false`. */
66 |   async deleteUser(id: string): Promise<boolean> {
67 |     const exists = this.users.has(id);
68 | 
69 |     if (exists) {
70 |       this.users.delete(id);
71 |       this.logger.info(`User deleted: ${id}`);
72 |     }
73 | 
74 |     return exists;
75 |   }
76 | 
77 |   /** @returns A new array holding every stored user. */
78 |   async getAllUsers(): Promise<User[]> {
79 |     return Array.from(this.users.values());
80 |   }
81 | 
82 |   /** Builds an id of the form `user_<epoch ms>_<random base-36 suffix>`. */
83 |   private generateId(): string {
84 |     // slice(2, 11) selects the same (up to nine) characters that the
85 |     // deprecated substr(2, 9) did.
86 |     return `user_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`;
87 |   }
88 | }
89 | 


--------------------------------------------------------------------------------
/examples/test_rust_structures.rs:
--------------------------------------------------------------------------------
 1 | // Test file for Rust structure comparison
 2 | 
 3 | // User struct with common fields
 4 | #[derive(Debug, Clone)]
 5 | pub struct User {
 6 |     pub id: u64,
 7 |     pub name: String,
 8 |     pub email: String,
 9 |     pub age: Option<u32>,
10 | }
11 | 
12 | // Person struct with same fields (should be detected as similar)
13 | #[derive(Debug, Clone)]
14 | pub struct Person {
15 |     pub id: u64,
16 |     pub name: String,
17 |     pub email: String,
18 |     pub age: Option<u32>,
19 | }
20 | 
21 | // Customer struct with same structure
22 | #[derive(Debug)]
23 | struct Customer {
24 |     id: u64,
25 |     name: String,
26 |     email: String,
27 |     age: Option<u32>,
28 | }
29 | 
30 | // Admin struct with different field (role instead of email)
31 | pub struct Admin {
32 |     pub id: u64,
33 |     pub name: String,
34 |     pub role: String,
35 |     pub age: Option<u32>,
36 | }
37 | 
38 | // Result-like enum
39 | pub enum MyResult<T, E> {
40 |     Ok(T),
41 |     Err(E),
42 | }
43 | 
44 | // Another Result-like enum (should be detected as similar)
45 | pub enum CustomResult<V, F> {
46 |     Success(V),
47 |     Failure(F),
48 | }
49 | 
50 | // Option-like enum
51 | pub enum MyOption<T> {
52 |     Some(T),
53 |     None,
54 | }
55 | 
56 | // Status enum with different variants
57 | pub enum Status {
58 |     Pending,
59 |     Active,
60 |     Inactive,
61 |     Deleted,
62 | }
63 | 
64 | // Similar status enum with slightly different names
65 | pub enum UserStatus {
66 |     Waiting,
67 |     Enabled,
68 |     Disabled,
69 |     Removed,
70 | }
71 | 
72 | // Complex enum with different variant types
73 | pub enum Message {
74 |     Text(String),
75 |     Number(i32),
76 |     Struct { x: f64, y: f64 },
77 |     Empty,
78 | }
79 | 
80 | // Tuple struct
81 | pub struct Point(f64, f64, f64);
82 | 
83 | // Another tuple struct with same structure
84 | pub struct Vector(f64, f64, f64);


--------------------------------------------------------------------------------
/examples/specs/sample_project/src/components/product_list.ts:
--------------------------------------------------------------------------------
 1 | import { Product } from "../models/product.ts";
 2 | 
 3 | /**
 4 |  * Renders a collection of products into a DOM container as a
 5 |  * `<div class="product-grid">` of cards.
 6 |  *
 7 |  * Every mutator (`setProducts`, `addProduct`, `removeProduct`) re-renders the whole grid.
 8 |  */
 9 | export class ProductList {
10 |   private container: HTMLElement;
11 |   private products: Product[] = [];
12 | 
13 |   /**
14 |    * @param containerId - `id` of the element that hosts the grid.
15 |    * @throws Error when no element with that id exists in the document.
16 |    */
17 |   constructor(containerId: string) {
18 |     const element = document.getElementById(containerId);
19 |     if (!element) {
20 |       throw new Error(`Container with id ${containerId} not found`);
21 |     }
22 |     this.container = element;
23 |   }
24 | 
25 |   /** Replaces the displayed products and re-renders. */
26 |   setProducts(products: Product[]): void {
27 |     // Copy so a later addProduct() cannot mutate the caller's array.
28 |     this.products = [...products];
29 |     this.render();
30 |   }
31 | 
32 |   /** Appends one product and re-renders. */
33 |   addProduct(product: Product): void {
34 |     this.products.push(product);
35 |     this.render();
36 |   }
37 | 
38 |   /** Removes every product whose `id` equals `productId` and re-renders. */
39 |   removeProduct(productId: string): void {
40 |     this.products = this.products.filter((p) => p.id !== productId);
41 |     this.render();
42 |   }
43 | 
44 |   /** Rebuilds the container's content from the current product list. */
45 |   private render(): void {
46 |     this.container.innerHTML = "";
47 | 
48 |     if (this.products.length === 0) {
49 |       this.container.innerHTML = "<p>No products found</p>";
50 |       return;
51 |     }
52 | 
53 |     const div = document.createElement("div");
54 |     div.className = "product-grid";
55 | 
56 |     this.products.forEach((product) => {
57 |       const card = document.createElement("div");
58 |       card.className = "product-card";
59 |       // Text fields are escaped; price goes through toFixed and stock is interpolated as-is.
60 |       card.innerHTML = `
61 |         <h3>${this.escapeHtml(product.name)}</h3>
62 |         <p class="description">${this.escapeHtml(product.description)}</p>
63 |         <p class="price">$${product.price.toFixed(2)}</p>
64 |         <p class="stock">Stock: ${product.stock}</p>
65 |       `;
66 |       div.appendChild(card);
67 |     });
68 | 
69 |     this.container.appendChild(div);
70 |   }
71 | 
72 |   /** Escapes `text` for use as element content by round-tripping it through a detached node. */
73 |   private escapeHtml(text: string): string {
74 |     const div = document.createElement("div");
75 |     div.textContent = text;
76 |     return div.innerHTML;
77 |   }
78 | }
79 | 

--------------------------------------------------------------------------------
/examples/test_different_structures.rs:
--------------------------------------------------------------------------------
 1 | // Test file to verify that clearly different structures are not detected as similar
 2 | 
 3 | // Simple enum with few variants
 4 | pub enum Color {
 5 |     Red,
 6 |     Green,
 7 |     Blue,
 8 | }
 9 | 
10 | // Complex struct with many fields
11 | pub struct DatabaseConnection {
12 |     pub host: String,
13 |     pub port: u16,
14 |     pub username: String,
15 |     pub password: String,
16 |     pub database_name: String,
17 |     pub max_connections: u32,
18 |     pub timeout: std::time::Duration,
19 |     pub use_ssl: bool,
20 |     pub certificate_path: Option<String>,
21 | }
22 | 
23 | // Unit struct (no fields)
24 | pub struct EmptyMarker;
25 | 
26 | // Tuple struct with single field
27 | pub struct Id(u64);
28 | 
29 | // Enum with complex variants
30 | pub enum Request {
31 |     Get { url: String, headers: Vec<String> },
32 |     Post { url: String, body: Vec<u8>, headers: Vec<String> },
33 |     Put { url: String, body: Vec<u8> },
34 |     Delete { url: String },
35 | }
36 | 
37 | // Simple struct with two fields
38 | pub struct Point2D {
39 |     pub x: f64,
40 |     pub y: f64,
41 | }
42 | 
43 | // Another enum with different structure
44 | pub enum Shape {
45 |     Circle(f64),
46 |     Rectangle(f64, f64),
47 |     Triangle(f64, f64, f64),
48 |     Polygon(Vec<(f64, f64)>),
49 | }
50 | 
// Struct with the same shape and field types as Point2D (two `f64` fields);
// only the struct name and the field names differ.
pub struct Coordinate {
    pub lat: f64,
    pub lng: f64,
}
56 | 
57 | // Large struct that should not match with anything
58 | pub struct Configuration {
59 |     pub app_name: String,
60 |     pub version: String,
61 |     pub environment: String,
62 |     pub debug_mode: bool,
63 |     pub log_level: String,
64 |     pub api_key: String,
65 |     pub secret_key: String,
66 |     pub endpoints: Vec<String>,
67 |     pub features: std::collections::HashMap<String, bool>,
68 |     pub limits: std::collections::HashMap<String, u32>,
69 | }


--------------------------------------------------------------------------------
/test/__fixtures__/duplication/structural/array_iteration_pattern_1.ts:
--------------------------------------------------------------------------------
 1 | // Structural duplication: Imperative array processing
 2 | // Common pattern of iterating arrays with for loops
/**
 * Builds a `ProcessedUser` for every user whose `isActive` flag is truthy.
 *
 * Kept as an index-based `for` loop on purpose: this fixture pairs with
 * `processOrderData` to exercise structural-duplication detection.
 *
 * @param users - Users to scan, in order.
 * @returns One entry per active user, in input order.
 */
export function processUserData(users: User[]): ProcessedUser[] {
  const result: ProcessedUser[] = [];
  
  for (let i = 0; i < users.length; i++) {
    const user = users[i];
    if (user.isActive) {
      const processed: ProcessedUser = {
        id: user.id,
        displayName: user.firstName + ' ' + user.lastName,
        status: 'active',
        lastSeen: user.lastLogin
      };
      result.push(processed);
    }
  }
  
  return result;
}
21 | 
/**
 * Builds a `ProcessedOrder` for every order whose `status` is `'completed'`.
 *
 * The total is the sum of the `price` of each item in the order.
 *
 * @param orders - Orders to scan, in order.
 * @returns One entry per completed order, in input order.
 */
export function processOrderData(orders: Order[]): ProcessedOrder[] {
  const result: ProcessedOrder[] = [];
  
  for (let i = 0; i < orders.length; i++) {
    const order = orders[i];
    if (order.status === 'completed') {
      const processed: ProcessedOrder = {
        id: order.id,
        customerName: order.customer.firstName + ' ' + order.customer.lastName,
        total: order.items.reduce((sum, item) => sum + item.price, 0),
        completedAt: order.completedDate
      };
      result.push(processed);
    }
  }
  
  return result;
}
40 | 
41 | // Types
42 | interface User {
43 |   id: string;
44 |   firstName: string;
45 |   lastName: string;
46 |   isActive: boolean;
47 |   lastLogin: Date;
48 | }
49 | 
50 | interface ProcessedUser {
51 |   id: string;
52 |   displayName: string;
53 |   status: string;
54 |   lastSeen: Date;
55 | }
56 | 
57 | interface Order {
58 |   id: string;
59 |   status: string;
60 |   customer: { firstName: string; lastName: string };
61 |   items: Array<{ price: number }>;
62 |   completedDate: Date;
63 | }
64 | 
65 | interface ProcessedOrder {
66 |   id: string;
67 |   customerName: string;
68 |   total: number;
69 |   completedAt: Date;
70 | }


--------------------------------------------------------------------------------
/crates/core/tests/debug_similarity.rs:
--------------------------------------------------------------------------------
 1 | use similarity_core::{calculate_tsed, parse_and_convert_to_tree, TSEDOptions};
 2 | use std::rc::Rc;
 3 | 
 4 | #[test]
 5 | fn debug_high_similarity_issue() {
 6 |     // Test the problematic case: extractTokensFromAST vs getNodeLabel
 7 |     let code1 = r#"
 8 | function extractTokensFromAST(ast: any): string[] {
 9 |     const tokens: string[] = [];
10 |     function traverse(node: any) {
11 |         if (!node) return;
12 |         if (node.type) tokens.push(node.type);
13 |     }
14 |     traverse(ast);
15 |     return tokens;
16 | }
17 | "#;
18 | 
19 |     let code2 = r#"
20 | function getNodeLabel(node: TreeNode): string {
21 |     switch (node.type) {
22 |         case 'Identifier': return 'ID';
23 |         case 'StringLiteral': return 'STR';
24 |         default: return node.type || 'UNKNOWN';
25 |     }
26 | }
27 | "#;
28 | 
29 |     // Parse both functions to trees
30 |     let tree1 = parse_and_convert_to_tree("test1.ts", code1).unwrap();
31 |     let tree2 = parse_and_convert_to_tree("test2.ts", code2).unwrap();
32 | 
33 |     println!("Tree1 size: {}", tree1.get_subtree_size());
34 |     println!("Tree2 size: {}", tree2.get_subtree_size());
35 | 
36 |     // Test with different rename costs
37 |     for rename_cost in &[0.1, 0.3, 0.5, 0.7, 1.0] {
38 |         let mut options = TSEDOptions::default();
39 |         options.apted_options.rename_cost = *rename_cost;
40 | 
41 |         let similarity = calculate_tsed(&tree1, &tree2, &options);
42 |         println!("Rename cost {}: similarity = {:.2}%", rename_cost, similarity * 100.0);
43 |     }
44 | 
45 |     // Print tree structure
46 |     print_tree(&tree1, 0);
47 |     println!("\n---\n");
48 |     print_tree(&tree2, 0);
49 | }
50 | 
51 | fn print_tree(node: &Rc<similarity_core::tree::TreeNode>, depth: usize) {
52 |     let indent = "  ".repeat(depth);
53 |     println!("{}{}", indent, node.label);
54 |     for child in &node.children {
55 |         print_tree(child, depth + 1);
56 |     }
57 | }
58 | 


--------------------------------------------------------------------------------
/crates/similarity-md/examples/test_levenshtein.rs:
--------------------------------------------------------------------------------
 1 | //! Levenshtein距離計算のテスト
 2 | 
 3 | use similarity_md::{levenshtein_similarity, word_levenshtein_similarity};
 4 | 
 5 | fn main() {
 6 |     println!("=== Levenshtein距離計算テスト ===\n");
 7 | 
 8 |     // 基本的なテスト
 9 |     let text1 = "機械学習について";
10 |     let text2 = "マシンラーニングの概要";
11 | 
12 |     println!("テキスト1: '{text1}'");
13 |     println!("テキスト2: '{text2}'");
14 | 
15 |     let char_sim = levenshtein_similarity(text1, text2);
16 |     let word_sim = word_levenshtein_similarity(text1, text2);
17 | 
18 |     println!("文字レベル類似性: {char_sim:.4}");
19 |     println!("単語レベル類似性: {word_sim:.4}");
20 |     println!();
21 | 
22 |     // より類似したテキストのテスト
23 |     let text3 = "機械学習は、コンピュータがデータから自動的にパターンを学習する技術です。";
24 |     let text4 = "マシンラーニングとは、計算機がデータから自動的にパターンを習得する手法です。";
25 | 
26 |     println!("テキスト3: '{text3}'");
27 |     println!("テキスト4: '{text4}'");
28 | 
29 |     let char_sim2 = levenshtein_similarity(text3, text4);
30 |     let word_sim2 = word_levenshtein_similarity(text3, text4);
31 | 
32 |     println!("文字レベル類似性: {char_sim2:.4}");
33 |     println!("単語レベル類似性: {word_sim2:.4}");
34 |     println!();
35 | 
36 |     // 英語のテスト
37 |     let en1 = "machine learning";
38 |     let en2 = "machine learning";
39 | 
40 |     println!("英語テキスト1: '{en1}'");
41 |     println!("英語テキスト2: '{en2}'");
42 | 
43 |     let char_sim3 = levenshtein_similarity(en1, en2);
44 |     let word_sim3 = word_levenshtein_similarity(en1, en2);
45 | 
46 |     println!("文字レベル類似性: {char_sim3:.4}");
47 |     println!("単語レベル類似性: {word_sim3:.4}");
48 |     println!();
49 | 
50 |     // 完全に異なるテキスト
51 |     let diff1 = "今日の天気は晴れです";
52 |     let diff2 = "プログラミング言語";
53 | 
54 |     println!("異なるテキスト1: '{diff1}'");
55 |     println!("異なるテキスト2: '{diff2}'");
56 | 
57 |     let char_sim4 = levenshtein_similarity(diff1, diff2);
58 |     let word_sim4 = word_levenshtein_similarity(diff1, diff2);
59 | 
60 |     println!("文字レベル類似性: {char_sim4:.4}");
61 |     println!("単語レベル類似性: {word_sim4:.4}");
62 | }
63 | 


--------------------------------------------------------------------------------
/crates/similarity-rs/tests/debug_ast.rs:
--------------------------------------------------------------------------------
 1 | #![allow(clippy::uninlined_format_args)]
 2 | 
 3 | use similarity_core::language_parser::LanguageParser;
 4 | use similarity_rs::rust_parser::RustParser;
 5 | 
 6 | #[test]
 7 | fn debug_ast_values() {
 8 |     let code1 = r#"
 9 | fn func1(x: i32) -> i32 {
10 |     let result = x + 1;
11 |     result * 2
12 | }
13 | "#;
14 | 
15 |     let code2 = r#"
16 | fn func2(y: i32) -> i32 {
17 |     let temp = y + 1;
18 |     temp * 3
19 | }
20 | "#;
21 | 
22 |     let mut parser = RustParser::new().unwrap();
23 |     let tree1 = parser.parse(code1, "test.rs").unwrap();
24 |     let tree2 = parser.parse(code2, "test.rs").unwrap();
25 | 
26 |     // Print the tree to see if values are captured
27 |     fn print_tree(node: &similarity_core::tree::TreeNode, depth: usize) {
28 |         let indent = "  ".repeat(depth);
29 |         if node.value.is_empty() {
30 |             println!("{}{}", indent, node.label);
31 |         } else {
32 |             println!("{}{} = '{}'", indent, node.label, node.value);
33 |         }
34 |         for child in &node.children {
35 |             print_tree(child, depth + 1);
36 |         }
37 |     }
38 | 
39 |     println!("=== Tree 1 (func1) ===");
40 |     print_tree(&tree1, 0);
41 |     println!("\n=== Tree 2 (func2) ===");
42 |     print_tree(&tree2, 0);
43 | 
44 |     // Also check similarity
45 |     use similarity_core::{calculate_enhanced_similarity, APTEDOptions, EnhancedSimilarityOptions};
46 |     let options = EnhancedSimilarityOptions {
47 |         structural_weight: 0.7,
48 |         size_weight: 0.2,
49 |         type_distribution_weight: 0.1,
50 |         min_size_ratio: 0.5,
51 |         apted_options: APTEDOptions {
52 |             rename_cost: 0.3,
53 |             delete_cost: 1.0,
54 |             insert_cost: 1.0,
55 |             compare_values: true,
56 |         },
57 |     };
58 |     let similarity = calculate_enhanced_similarity(&tree1, &tree2, &options);
59 |     println!("\nSimilarity: {}", similarity);
60 | }
61 | 


--------------------------------------------------------------------------------
/RECOMMENDATIONS.md:
--------------------------------------------------------------------------------
 1 | # Rust コード類似度検出の推奨設定
 2 | 
 3 | ## 実プロジェクトでの検証結果
 4 | 
 5 | ### 検出された主な重複パターン
 6 | 
 7 | 1. **構造的に同一なコード（正当な検出）**
 8 |    - `extract_struct_definition` と `extract_enum_definition`: 98.44%
 9 |    - 実際にリファクタリング可能な重複コード
10 | 
11 | 2. **偽陽性のパターン**
12 |    - テスト関数: 構造が似ているため95-99%の類似度
13 |    - 短い関数: サイズペナルティがあっても誤検出されやすい
14 | 
15 | ### 推奨パラメータ設定
16 | 
17 | #### 1. 一般的な重複検出
18 | ```bash
19 | similarity-rs --threshold 0.8 --min-lines 10 --min-tokens 50
20 | ```
21 | - 80%以上の類似度
22 | - 10行以上の関数
23 | - 50トークン以上（ASTノード数）
24 | 
25 | #### 2. 厳密な重複検出
26 | ```bash
27 | similarity-rs --threshold 0.9 --min-lines 15 --min-tokens 100
28 | ```
29 | - 90%以上の類似度
30 | - 15行以上の関数
31 | - 100トークン以上
32 | 
33 | #### 3. テストコードを除外
34 | ```bash
35 | similarity-rs --threshold 0.8 --min-lines 10 --skip-test
36 | ```
37 | - `#[test]` 属性の付いた関数を除外
38 | - `test_` で始まる関数を除外
39 | 
40 | ### パラメータの影響
41 | 
42 | | パラメータ | 効果 | 推奨値 |
43 | |----------|------|--------|
44 | | `threshold` | 類似度の閾値 | 0.8-0.9 |
45 | | `min-lines` | 最小行数 | 10-15 |
46 | | `min-tokens` | 最小トークン数 | 50-100 |
47 | | `size-penalty` | 短い関数へのペナルティ | true（デフォルト） |
48 | | `rename-cost` | 変数名の違いへの寛容度 | 0.3（デフォルト） |
49 | 
50 | ### 実際の使用例
51 | 
52 | #### CI/CDでの使用
53 | ```yaml
54 | - name: Check code duplication
55 |   run: |
56 |     cargo install similarity-rs
57 |     similarity-rs src \
58 |       --threshold 0.85 \
59 |       --min-lines 12 \
60 |       --min-tokens 60 \
61 |       --skip-test
62 | ```
63 | 
64 | #### リファクタリング候補の検出
65 | ```bash
66 | # 高い類似度の長い関数を検出
67 | similarity-rs src \
68 |   --threshold 0.95 \
69 |   --min-lines 20 \
70 |   --min-tokens 150
71 | ```
72 | 
73 | ### 注意事項
74 | 
75 | 1. **テストコードの扱い**
76 |    - テスト関数は構造が似ているため偽陽性が多い
77 |    - `--skip-test` オプションの使用を推奨
78 | 
79 | 2. **最小トークン数の重要性**
80 |    - `min-tokens` を設定しないと短い関数で偽陽性が増える
81 |    - 50トークン以上を推奨
82 | 
83 | 3. **言語特性の考慮**
84 |    - Rustのマクロ展開後のコードは検出されない
85 |    - ジェネリクスの具体化は別関数として扱われる
86 | 
87 | ### まとめ
88 | 
89 | `compare_values` パラメータの修正により、Rust コードの類似度検出が大幅に改善されました。適切なパラメータ設定により、実用的な重複検出が可能になっています。


--------------------------------------------------------------------------------
/crates/similarity-generic/build.rs:
--------------------------------------------------------------------------------
 1 | use std::env;
 2 | use std::fs;
 3 | use std::path::Path;
 4 | 
 5 | fn main() {
 6 |     // Get the output directory
 7 |     let out_dir = env::var("OUT_DIR").unwrap();
 8 |     let dest_path = Path::new(&out_dir).join("language_configs.rs");
 9 | 
10 |     // Read all JSON files from language_configs directory
11 |     let configs_dir = Path::new("language_configs");
12 | 
13 |     let mut output = String::new();
14 |     output.push_str("// Auto-generated by build.rs\n\n");
15 |     output.push_str("use once_cell::sync::Lazy;\n");
16 |     output.push_str("use std::collections::HashMap;\n\n");
17 | 
18 |     // Generate a static HashMap of language configs
19 |     output.push_str(
20 |         "pub static LANGUAGE_CONFIGS: Lazy<HashMap<&'static str, &'static str>> = Lazy::new(|| {\n",
21 |     );
22 |     output.push_str("    let mut map = HashMap::new();\n");
23 | 
24 |     if configs_dir.exists() {
25 |         for entry in fs::read_dir(configs_dir).unwrap() {
26 |             let entry = entry.unwrap();
27 |             let path = entry.path();
28 | 
29 |             if path.extension().and_then(|s| s.to_str()) == Some("json") {
30 |                 let file_name = path.file_stem().unwrap().to_str().unwrap();
31 |                 let content = fs::read_to_string(&path).unwrap();
32 | 
33 |                 // Escape the JSON content for inclusion in Rust code
34 |                 let escaped_content =
35 |                     content.replace('\\', "\\\\").replace('"', "\\\"").replace('\n', "\\n");
36 | 
37 |                 output.push_str(&format!(
38 |                     "    map.insert(\"{file_name}\", \"{escaped_content}\");\n"
39 |                 ));
40 |             }
41 |         }
42 |     }
43 | 
44 |     output.push_str("    map\n");
45 |     output.push_str("});\n");
46 | 
47 |     // Write the generated code
48 |     fs::write(&dest_path, output).unwrap();
49 | 
50 |     // Tell Cargo to rerun if any config file changes
51 |     println!("cargo:rerun-if-changed=language_configs");
52 | }
53 | 


--------------------------------------------------------------------------------
/docs/implementation/performance-baseline.md:
--------------------------------------------------------------------------------
 1 | # Performance Baseline (oxc_parser)
 2 | 
 3 | This document establishes the performance baseline using oxc_parser before transitioning to tree-sitter.
 4 | 
 5 | ## Current Performance Metrics
 6 | 
 7 | ### Function Comparison Benchmarks
 8 | 
 9 | #### Within File Analysis
10 | - **Small file (4 functions)**: ~8.3µs
11 | - **Medium file (8 functions)**: ~59.6µs  
12 | - **Large file (9 functions)**: ~66.7µs
13 | 
14 | #### Cross-File Analysis
15 | - **2 small files**: ~17.2µs
16 | - **3 mixed files**: ~165.9µs
17 | - **4 mixed files (worst case)**: ~192.2µs
18 | 
19 | #### Fast Mode (Bloom Filter)
20 | - **Small file**: ~8.1µs
21 | - **Medium file**: ~41.0µs
22 | - **Large file**: ~85.1µs
23 | - **3 mixed files cross-file**: ~126.5µs
24 | 
25 | ### TSED (Tree Similarity Edit Distance) Benchmarks
26 | 
27 | #### Full Calculation
28 | - **Small files**: ~15.8µs
29 | - **Medium files**: ~12.7µs
30 | 
31 | #### Parsing Only
32 | - **Small file**: ~2.2µs
33 | - **Medium file**: ~5.9µs
34 | 
35 | #### Tree Edit Distance Computation
36 | - **Small trees**: ~10.8µs
37 | - **Medium trees**: ~194ns
38 | 
39 | #### Large Scale
40 | - **100 small file comparisons**: ~1.69ms
41 | 
42 | ## Performance Targets for tree-sitter
43 | 
44 | To ensure tree-sitter integration is viable, we should aim for:
45 | 
46 | 1. **Parsing overhead**: < 2x slower than oxc_parser
47 | 2. **Overall performance**: Within 50% of current metrics
48 | 3. **Memory usage**: Comparable or better
49 | 4. **Multi-language support**: Justifies any performance trade-offs
50 | 
51 | ## Key Performance Considerations
52 | 
53 | 1. **oxc_parser advantages**:
54 |    - Zero-copy parsing
55 |    - Optimized specifically for JS/TS
56 |    - Minimal allocations
57 |    - Type-safe AST
58 | 
59 | 2. **tree-sitter potential advantages**:
60 |    - Incremental parsing
61 |    - Error recovery
62 |    - Language agnostic
63 |    - Query-based extraction
64 | 
65 | 3. **Critical paths to optimize**:
66 |    - AST to TreeNode conversion
67 |    - Function/type extraction
68 |    - Tree traversal


--------------------------------------------------------------------------------
/test/__fixtures__/duplication/semantic/validation_pattern_1.ts:
--------------------------------------------------------------------------------
 1 | // Semantic duplication: Early return validation pattern
 2 | // Common validation logic using early returns
/**
 * Validates a registration payload, returning at the first failed rule.
 *
 * Rules, in order: email present, email matches `isValidEmail`, password
 * present, password at least 8 characters, username present, username at
 * least 3 characters. Presence is a truthiness check, so an empty string
 * counts as missing.
 *
 * @param data - Untyped payload; `email`, `password` and `username` are read.
 * @returns `{ valid: true }`, or `{ valid: false, error }` for the first
 *   rule that fails.
 */
export function validateUserRegistration(data: any): ValidationResult {
  if (!data.email) {
    return { valid: false, error: 'Email is required' };
  }
  
  if (!isValidEmail(data.email)) {
    return { valid: false, error: 'Invalid email format' };
  }
  
  if (!data.password) {
    return { valid: false, error: 'Password is required' };
  }
  
  if (data.password.length < 8) {
    return { valid: false, error: 'Password must be at least 8 characters' };
  }
  
  if (!data.username) {
    return { valid: false, error: 'Username is required' };
  }
  
  if (data.username.length < 3) {
    return { valid: false, error: 'Username must be at least 3 characters' };
  }
  
  return { valid: true };
}
30 | 
/**
 * Validates a product-creation payload, returning at the first failed rule.
 *
 * Rules, in order: name present, name at least 3 characters, price present,
 * price is a number greater than 0, category present, category is one of
 * `electronics`, `clothing`, `food`, `other`.
 *
 * NOTE(review): presence is a truthiness check, so a price of `0` is reported
 * as "Price is required" rather than as a non-positive price.
 *
 * @param data - Untyped payload; `name`, `price` and `category` are read.
 * @returns `{ valid: true }`, or `{ valid: false, error }` for the first
 *   rule that fails.
 */
export function validateProductCreation(data: any): ValidationResult {
  if (!data.name) {
    return { valid: false, error: 'Product name is required' };
  }
  
  if (data.name.length < 3) {
    return { valid: false, error: 'Product name must be at least 3 characters' };
  }
  
  if (!data.price) {
    return { valid: false, error: 'Price is required' };
  }
  
  if (typeof data.price !== 'number' || data.price <= 0) {
    return { valid: false, error: 'Price must be a positive number' };
  }
  
  if (!data.category) {
    return { valid: false, error: 'Category is required' };
  }
  
  if (!['electronics', 'clothing', 'food', 'other'].includes(data.category)) {
    return { valid: false, error: 'Invalid category' };
  }
  
  return { valid: true };
}
58 | 
59 | // Helper
/**
 * Reports whether `email` has the shape `local@domain.tld`, where no part
 * contains whitespace or `@`.
 */
function isValidEmail(email: string): boolean {
  const emailShape = /^[^\s@]+@[^\s@]+\.[^\s@]+$/;
  const matches = emailShape.test(email);
  return matches;
}
63 | 
64 | interface ValidationResult {
65 |   valid: boolean;
66 |   error?: string;
67 | }


--------------------------------------------------------------------------------
/benchmarks/data/test_duplicates.ts:
--------------------------------------------------------------------------------
 1 | // Test file with various function sizes
 2 | 
 3 | // Small function (likely < 20 tokens)
/** Returns `x + 1`. Benchmark sample: the smallest function size. */
function tiny(x: number) {
  return x + 1;
}
 7 | 
 8 | // Another small function  
/** Returns `y + 1`. Same body as `tiny` apart from the parameter name. */
function small(y: number) {
  return y + 1;
}
12 | 
13 | // Medium function (~25-30 tokens)
/**
 * Maps a user to a summary object.
 *
 * @returns `{ userId, displayName, ageGroup }`, where `displayName` is the
 *   upper-cased name and `ageGroup` is `'adult'` for age 18 or more, else
 *   `'minor'`.
 * @throws Error `'Invalid user'` when `user.id` is falsy.
 */
function processUser(user: { id: string; name: string; age: number }) {
  if (!user.id) {
    throw new Error('Invalid user');
  }
  const result = {
    userId: user.id,
    displayName: user.name.toUpperCase(),
    ageGroup: user.age >= 18 ? 'adult' : 'minor'
  };
  return result;
}
25 | 
26 | // Similar medium function
/**
 * Maps a customer to a summary object.
 *
 * @returns `{ customerId, displayName, ageGroup }`, where `displayName` is
 *   the upper-cased name and `ageGroup` is `'adult'` for age 18 or more, else
 *   `'minor'`.
 * @throws Error `'Invalid customer'` when `customer.id` is falsy.
 */
function processCustomer(customer: { id: string; name: string; age: number }) {
  if (!customer.id) {
    throw new Error('Invalid customer');
  }
  const result = {
    customerId: customer.id,
    displayName: customer.name.toUpperCase(),
    ageGroup: customer.age >= 18 ? 'adult' : 'minor'
  };
  return result;
}
38 | 
39 | // Larger function (> 30 tokens)
/**
 * Computes the minimum, maximum, average and sum of `numbers` in one pass.
 *
 * @returns `{ min, max, avg, sum }`; all four are `0` for an empty array.
 */
function calculateStatistics(numbers: number[]) {
  if (numbers.length === 0) {
    return { min: 0, max: 0, avg: 0, sum: 0 };
  }
  
  let min = numbers[0];
  let max = numbers[0];
  let sum = 0;
  
  for (const num of numbers) {
    if (num < min) min = num;
    if (num > max) max = num;
    sum += num;
  }
  
  return {
    min,
    max,
    avg: sum / numbers.length,
    sum
  };
}
62 | 
63 | // Similar larger function
/**
 * Computes the minimum, maximum, average and sum of `values` in one pass.
 * Same logic as `calculateStatistics`, with different parameter and loop
 * variable names.
 *
 * @returns `{ min, max, avg, sum }`; all four are `0` for an empty array.
 */
function computeStatistics(values: number[]) {
  if (values.length === 0) {
    return { min: 0, max: 0, avg: 0, sum: 0 };
  }
  
  let min = values[0];
  let max = values[0];
  let sum = 0;
  
  for (const val of values) {
    if (val < min) min = val;
    if (val > max) max = val;
    sum += val;
  }
  
  return {
    min,
    max,
    avg: sum / values.length,
    sum
  };
}


--------------------------------------------------------------------------------
/crates/core/tests/ast_fingerprint_test.rs:
--------------------------------------------------------------------------------
 1 | use similarity_core::AstFingerprint;
 2 | 
 3 | #[test]
 4 | fn test_ast_fingerprint_usage() {
 5 |     let code1 = r#"
 6 |         function processArray(arr) {
 7 |             const result = [];
 8 |             for (let i = 0; i < arr.length; i++) {
 9 |                 if (arr[i] > 0) {
10 |                     result.push(arr[i] * 2);
11 |                 }
12 |             }
13 |             return result;
14 |         }
15 |     "#;
16 | 
17 |     let code2 = r#"
18 |         function filterAndDouble(items) {
19 |             const output = [];
20 |             for (let j = 0; j < items.length; j++) {
21 |                 if (items[j] > 0) {
22 |                     output.push(items[j] * 2);
23 |                 }
24 |             }
25 |             return output;
26 |         }
27 |     "#;
28 | 
29 |     let fp1 = AstFingerprint::from_source(code1).unwrap();
30 |     let fp2 = AstFingerprint::from_source(code2).unwrap();
31 | 
32 |     // Print node counts for debugging
33 |     println!("\nNode counts for function 1:");
34 |     for (node_type, count) in fp1.node_counts() {
35 |         if *count > 0 {
36 |             println!("  {node_type}: {count}");
37 |         }
38 |     }
39 | 
40 |     // Test similarity
41 |     let similarity = fp1.similarity(&fp2);
42 |     println!("\nSimilarity: {:.2}%", similarity * 100.0);
43 |     assert!(similarity > 0.9, "Expected high similarity for structurally identical functions");
44 | 
45 |     // Test bloom filter
46 |     assert!(fp1.might_be_similar(&fp2, 0.5), "Bloom filter should pass for similar functions");
47 | 
48 |     // Test with different structure
49 |     let code3 = r#"
50 |         function processArray(arr) {
51 |             return arr.filter(x => x > 0).map(x => x * 2);
52 |         }
53 |     "#;
54 | 
55 |     let fp3 = AstFingerprint::from_source(code3).unwrap();
56 |     let similarity_different = fp1.similarity(&fp3);
57 |     println!("Similarity with different implementation: {:.2}%", similarity_different * 100.0);
58 |     assert!(similarity_different < 0.8, "Expected lower similarity for different implementations");
59 | }
60 | 


--------------------------------------------------------------------------------
/crates/similarity-rs/tests/test_tsed_debugging.rs:
--------------------------------------------------------------------------------
 1 | use similarity_core::language_parser::LanguageParser;
 2 | use similarity_core::{
 3 |     apted::APTEDOptions,
 4 |     tsed::{calculate_tsed, TSEDOptions},
 5 | };
 6 | use similarity_rs::rust_parser::RustParser;
 7 | 
 8 | #[test]
 9 | fn test_short_function_similarity() {
10 |     let mut parser = RustParser::new().unwrap();
11 | 
12 |     let code1 = "a + b";
13 |     let code2 = "a - b";
14 |     let code3 = "a * b";
15 | 
16 |     let tree1 = parser.parse(code1, "test1.rs").unwrap();
17 |     let tree2 = parser.parse(code2, "test2.rs").unwrap();
18 |     let tree3 = parser.parse(code3, "test3.rs").unwrap();
19 | 
20 |     let options = TSEDOptions {
21 |         apted_options: APTEDOptions {
22 |             rename_cost: 0.3,
23 |             delete_cost: 1.0,
24 |             insert_cost: 1.0,
25 |             compare_values: true,
26 |         },
27 |         min_lines: 1,
28 |         min_tokens: None,
29 |         size_penalty: true,
30 |         skip_test: false,
31 |     };
32 | 
33 |     let sim12 = calculate_tsed(&tree1, &tree2, &options);
34 |     let sim13 = calculate_tsed(&tree1, &tree3, &options);
35 |     let sim23 = calculate_tsed(&tree2, &tree3, &options);
36 | 
37 |     println!("Tree1 size: {}", tree1.get_subtree_size());
38 |     println!("Tree2 size: {}", tree2.get_subtree_size());
39 |     println!("Tree3 size: {}", tree3.get_subtree_size());
40 |     println!("Similarity between 'a + b' and 'a - b': {:.2}%", sim12 * 100.0);
41 |     println!("Similarity between 'a + b' and 'a * b': {:.2}%", sim13 * 100.0);
42 |     println!("Similarity between 'a - b' and 'a * b': {:.2}%", sim23 * 100.0);
43 | 
44 |     // These should not be 100% similar due to different operators
45 |     assert!(sim12 < 1.0, "Different operators should not be 100% similar");
46 |     assert!(sim13 < 1.0, "Different operators should not be 100% similar");
47 |     assert!(sim23 < 1.0, "Different operators should not be 100% similar");
48 | 
49 |     // With size penalty, short functions should have reduced similarity
50 |     assert!(sim12 < 0.85, "Short functions with different operators should have low similarity");
51 | }
52 | 


--------------------------------------------------------------------------------
/examples/specs/sample_project/src/services/product_service.ts:
--------------------------------------------------------------------------------
 1 | import { Product } from "../models/product.ts";
 2 | import { Logger } from "../utils/logger.ts";
 3 | 
 4 | export class ProductService {
 5 |   private products: Map<string, Product> = new Map();
 6 |   private logger: Logger;
 7 | 
 8 |   constructor(logger: Logger) {
 9 |     this.logger = logger;
10 |   }
11 | 
12 |   async createProduct(data: Omit<Product, "id" | "createdAt">): Promise<Product> {
13 |     const product: Product = {
14 |       id: this.generateId(),
15 |       ...data,
16 |       createdAt: new Date(),
17 |     };
18 | 
19 |     this.products.set(product.id, product);
20 |     this.logger.info(`Product created: ${product.id}`);
21 | 
22 |     return product;
23 |   }
24 | 
25 |   async getProductById(id: string): Promise<Product | null> {
26 |     const product = this.products.get(id);
27 | 
28 |     if (!product) {
29 |       this.logger.warn(`Product not found: ${id}`);
30 |       return null;
31 |     }
32 | 
33 |     return product;
34 |   }
35 | 
36 |   async updateProduct(id: string, updates: Partial<Product>): Promise<Product | null> {
37 |     const product = await this.getProductById(id);
38 | 
39 |     if (!product) {
40 |       return null;
41 |     }
42 | 
43 |     const updatedProduct = { ...product, ...updates, id: product.id };
44 |     this.products.set(id, updatedProduct);
45 |     this.logger.info(`Product updated: ${id}`);
46 | 
47 |     return updatedProduct;
48 |   }
49 | 
50 |   async deleteProduct(id: string): Promise<boolean> {
51 |     const exists = this.products.has(id);
52 | 
53 |     if (exists) {
54 |       this.products.delete(id);
55 |       this.logger.info(`Product deleted: ${id}`);
56 |     }
57 | 
58 |     return exists;
59 |   }
60 | 
61 |   async getAllProducts(): Promise<Product[]> {
62 |     return Array.from(this.products.values());
63 |   }
64 | 
65 |   async getProductsByCategory(category: string): Promise<Product[]> {
66 |     return Array.from(this.products.values()).filter((product) => product.category === category);
67 |   }
68 | 
69 |   private generateId(): string {
70 |     return `product_${Date.now()}_${Math.random().toString(36).substr(2, 9)}`;
71 |   }
72 | }
73 | 


--------------------------------------------------------------------------------
/crates/core/src/cli_output.rs:
--------------------------------------------------------------------------------
 1 | use std::fs;
 2 | 
 3 | /// Format function output in VSCode-compatible format
 4 | pub fn format_function_output(
 5 |     file_path: &str,
 6 |     function_name: &str,
 7 |     start_line: u32,
 8 |     end_line: u32,
 9 | ) -> String {
10 |     format!("{file_path}:{start_line}-{end_line} {function_name}")
11 | }
12 | 
/// Extract lines from file content within the specified range.
///
/// `start_line` and `end_line` are 1-based and inclusive; a `start_line` of 0
/// is treated like 1. The range is clamped to the available lines, and the
/// selected lines are joined with `\n` (no trailing newline).
///
/// Returns an empty string when the range starts past the last line or when
/// `start_line` is greater than `end_line`; the latter used to panic on an
/// inverted slice range.
pub fn extract_lines_from_content(content: &str, start_line: u32, end_line: u32) -> String {
    let first_idx = start_line.saturating_sub(1) as usize;
    // Number of lines wanted; zero for an inverted range.
    let wanted = (end_line as usize).saturating_sub(first_idx);

    content.lines().skip(first_idx).take(wanted).collect::<Vec<&str>>().join("\n")
}
25 | 
26 | /// Display code content for a function
27 | pub fn show_function_code(file_path: &str, function_name: &str, start_line: u32, end_line: u32) {
28 |     match fs::read_to_string(file_path) {
29 |         Ok(content) => {
30 |             let code = extract_lines_from_content(&content, start_line, end_line);
31 |             println!(
32 |                 "\n\x1b[36m--- {}:{} (lines {}-{}) ---\x1b[0m",
33 |                 file_path, function_name, start_line, end_line
34 |             );
35 |             println!("{}", code);
36 |         }
37 |         Err(e) => {
38 |             eprintln!("Error reading file {}: {}", file_path, e);
39 |         }
40 |     }
41 | }
42 | 
/// A detected duplicate: two items of type `T`, the files they come from, and
/// their similarity score.
pub struct DuplicateResult<T> {
    /// File containing `item1`.
    pub file1: String,
    /// File containing `item2`.
    pub file2: String,
    /// First item of the pair.
    pub item1: T,
    /// Second item of the pair.
    pub item2: T,
    /// Similarity score of the pair.
    pub similarity: f64,
}

impl<T> DuplicateResult<T> {
    /// Bundles the two items, their files and the score into a result.
    pub fn new(file1: String, file2: String, item1: T, item2: T, similarity: f64) -> Self {
        DuplicateResult { file1, file2, item1, item2, similarity }
    }

    /// Sorting priority: the similarity multiplied by the mean of the two
    /// item sizes, as measured by `get_size`.
    pub fn priority(&self, get_size: impl Fn(&T) -> f64) -> f64 {
        let first_size = get_size(&self.item1);
        let second_size = get_size(&self.item2);
        let mean_size = (first_size + second_size) / 2.0;
        self.similarity * mean_size
    }
}
63 | 


--------------------------------------------------------------------------------
/crates/similarity-elixir/README.md:
--------------------------------------------------------------------------------
 1 | # similarity-elixir
 2 | 
 3 | Elixir code similarity analyzer using Tree-sitter parser.
 4 | 
 5 | ## Installation
 6 | 
 7 | ```bash
 8 | cargo install similarity-elixir
 9 | ```
10 | 
11 | ## Usage
12 | 
13 | ```bash
14 | # Analyze a single Elixir file
15 | similarity-elixir lib/my_module.ex
16 | 
 17 | # Analyze every Elixir file under a directory
18 | similarity-elixir lib/
19 | 
20 | # Set similarity threshold (default: 0.85)
21 | similarity-elixir lib/ -t 0.9
22 | 
23 | # Show all functions
24 | similarity-elixir lib/my_module.ex --show-functions
25 | 
26 | # Print similar function pairs with code
27 | similarity-elixir lib/ -p
28 | ```
29 | 
30 | ## Options
31 | 
32 | - `-t, --threshold <THRESHOLD>` - Similarity threshold (0.0-1.0, default: 0.85)
33 | - `-p, --print` - Print similar function pairs with source code
34 | - `--show-functions` - Show all functions found
35 | - `--filter-function <NAME>` - Filter results to functions containing NAME
36 | - `--filter-function-body <PATTERN>` - Filter by function body content
37 | - `--min-lines <N>` - Minimum function lines (default: 5)
38 | - `--rename-cost <COST>` - Cost for renaming operations (default: 1.0)
39 | 
40 | ## Features
41 | 
42 | - Detects similar functions across Elixir modules
43 | - Supports pattern matching and guard clauses
44 | - Handles pipe operators and anonymous functions
45 | - Recognizes module, protocol, and implementation definitions
46 | - Fast AST-based comparison using Tree-sitter
47 | 
48 | ## Example
49 | 
50 | ```elixir
51 | # Input: lib/calculator.ex
52 | defmodule Calculator do
53 |   def add(a, b) do
54 |     a + b
55 |   end
56 | 
57 |   def sum(x, y) do
58 |     x + y
59 |   end
60 | end
61 | ```
62 | 
63 | ```bash
64 | $ similarity-elixir lib/calculator.ex
65 | Analyzing Elixir code similarity...
66 | 
67 | Found 2 functions
68 |   - add
69 |   - sum
70 | 
71 | Duplicates in lib/calculator.ex:
72 | ------------------------------------------------------------
73 |   lib/calculator.ex:2-4 add <-> lib/calculator.ex:6-8 sum
74 |   Similarity: 100.00%
75 | ```
76 | 
77 | ## Algorithm
78 | 
 79 | Uses TSED (Tree Similarity of Edit Distance), a similarity score derived from the edit distance between ASTs, to compare function ASTs with configurable rename costs and size penalties.
80 | 
81 | ## License
82 | 
83 | MIT


--------------------------------------------------------------------------------
/examples/specs/type-similarity/test_type_literal_sample.ts:
--------------------------------------------------------------------------------
 1 | // Sample TypeScript file for testing type literal similarity detection
 2 | 
// Named type declaration: the inline type literals below are written to be compared against this shape
type UserData = { id: number; name: string; email: string };
 5 | 
// Function whose return type is an inline type literal with the same members as UserData - should match UserData
function getUser(): { id: number; name: string; email: string } {
  return { id: 1, name: "John", email: "john@example.com" };
}
10 | 
// Function whose parameter type is an inline type literal with the same members as UserData - should match UserData
function updateUser(user: { id: number; name: string; email: string }): void {
  console.log("Updating user:", user);
}
15 | 
// Variable annotated with an inline type literal with the same members as UserData - should match UserData
const defaultUser: { id: number; name: string; email: string } = {
  id: 0,
  name: "Default",
  email: "default@example.com",
};
22 | 
// Arrow function whose return type is an inline type literal with the same members as UserData
const createUser = (): { id: number; name: string; email: string } => {
  return { id: Math.random(), name: "New User", email: "new@example.com" };
};
27 | 
// Different type literal (sku, price, category instead of id, name, email) - should not match
function getProduct(): { sku: string; price: number; category: string } {
  return { sku: "ABC123", price: 99.99, category: "Electronics" };
}
32 | 
// Similar but slightly different type literal: `fullName` replaces `name`, the other two members are unchanged
function getUserInfo(): { id: number; fullName: string; email: string } {
  return { id: 1, fullName: "John Doe", email: "john@example.com" };
}
37 | 
// Nested type literal: `user` has the same members as UserData, `items` is an array of { sku, quantity } literals
function getOrder(): {
  id: number;
  user: { id: number; name: string; email: string };
  items: Array<{ sku: string; quantity: number }>;
} {
  return {
    id: 1,
    user: { id: 1, name: "John", email: "john@example.com" },
    items: [{ sku: "ABC123", quantity: 2 }],
  };
}
50 | 
// Type literal with optional properties: same member names as UserData, but `name` and `email` are optional
function getPartialUser(): { id: number; name?: string; email?: string } {
  return { id: 1 };
}
55 | 
// Interface for comparison: declares the same members as the return type literal of getProduct
interface ProductInfo {
  sku: string;
  price: number;
  category: string;
}
62 | 
// Type alias for comparison: declares the same members as the return type literal of getOrder
type OrderData = {
  id: number;
  user: { id: number; name: string; email: string };
  items: Array<{ sku: string; quantity: number }>;
};
69 | 


--------------------------------------------------------------------------------
/docs/algorithm/tsed-similarity-summary.md:
--------------------------------------------------------------------------------
 1 | # コード類似性評価論文の要約
 2 | 
 3 | https://arxiv.org/abs/2404.08817
 4 | 
 5 | ## 論文タイトル
 6 | 
 7 | Revisiting Code Similarity Evaluation with Abstract Syntax Tree Edit Distance
 8 | 
 9 | ## 概要
10 | 
11 | この論文は、コード類似性評価における抽象構文木（AST）編集距離の適用を探求した研究です。特に、TSED（Tree Similarity of Edit Distance）メトリクスを多様なプログラミング言語に拡張し、従来の評価手法との比較を行っています。
12 | 
13 | ## 主要な貢献
14 | 
15 | 1. **TSED（Tree Similarity of Edit Distance）メトリクスの拡張**
16 | 
17 |    - 元々 SQL 用だった TSED を 48 のプログラミング言語に適用可能に拡張
18 |    - 新しいツールとして公開
19 | 
20 | 2. **評価メトリクスの包括的比較**
21 | 
22 |    - TSED
23 |    - GPT-4 類似性スコア
24 |    - BLEU
25 |    - Jaccard 類似度
26 |    - 実行一致度
27 | 
28 | 3. **主な発見**
29 |    - TSED は従来の統計的メトリクス（BLEU 等）より実行一致度との相関が高い
30 |    - GPT-4 の類似性スコアは効果的だが出力が不安定
31 |    - AST パーサーの選択が TSED の性能に大きく影響
32 | 
33 | ## 技術的アプローチ
34 | 
35 | ### TSED の計算プロセス
36 | 
37 | 1. **コード解析**: tree-sitter を使用してコードを AST に変換
38 | 2. **木編集距離計算**: APTED アルゴリズムを使用
39 | 3. **正規化**: 0-1 のスコアに正規化
40 | 
41 | ### 計算式
42 | 
43 | ```
 44 | δ(G1, G2) = min_ops Σ w(op_i)
 45 | TSED = max{1 - δ(G1, G2) / MaxNodes(G1, G2), 0}
46 | ```
47 | 
48 | ## 実験結果
49 | 
50 | ### 対象言語と性能
51 | 
52 | - Java、Python、JavaScript、TypeScript、Ruby、Kotlin で評価
53 | - TSED と GPT 類似性は従来メトリクスより高い精度を示す
54 | 
55 | ### 主要な数値結果（MBXP データセット）
56 | 
57 | | 言語       | TSED   | BLEU   | Jaccard | GPT-4  | 実行一致 |
58 | | ---------- | ------ | ------ | ------- | ------ | -------- |
59 | | Java       | 0.3746 | 0.2041 | 0.2733  | 0.8143 | 0.6550   |
60 | | Python     | 0.1888 | 0.0843 | 0.2000  | 0.6751 | 0.6842   |
61 | | JavaScript | 0.2037 | 0.0846 | 0.2037  | 0.6763 | 0.6811   |
62 | 
63 | ## 制限事項
64 | 
65 | 1. **GPT スコアの不安定性**
66 | 
67 |    - 同じ入力でも出力が変動
68 |    - MSE: 約 0.05-0.06、MAE: 約 0.18-0.20
69 | 
70 | 2. **TSED のパラメータ依存性**
71 | 
72 |    - 操作の重み（削除、挿入、リネーム）が結果に影響
73 |    - 言語ごとに最適なパラメータが異なる可能性
74 | 
75 | 3. **パーサー依存性**
76 |    - tree-sitter と ANTLR で結果が大きく異なる
77 |    - パーサーの品質が評価精度に直接影響
78 | 
79 | ## 実用的な意義
80 | 
81 | 1. **コード生成タスクの評価**
82 | 
83 |    - LLM が生成したコードの品質評価に有効
84 |    - 実行結果だけでなく構造的な類似性も評価可能
85 | 
86 | 2. **多言語対応**
87 | 
88 |    - 48 のプログラミング言語に対応
89 |    - 言語固有の構造を考慮した評価が可能
90 | 
91 | 3. **従来手法の改善**
92 |    - BLEU や Jaccard 類似度より実行一致度との相関が高い
93 |    - コードの構造的特徴をより正確に捉える
94 | 
95 | ## まとめ
96 | 
97 | TSED は、コードの構造的類似性を評価する有効な手法として、従来の統計的手法を上回る性能を示しました。特に、実行結果との相関が高く、コード生成タスクの評価において実用的な指標となることが期待されます。ただし、GPT スコアの不安定性やパラメータ調整の必要性など、実用化に向けた課題も明らかになりました。
98 | 


--------------------------------------------------------------------------------
/examples/overlap-detection/partial-overlap.js:
--------------------------------------------------------------------------------
 1 | // Test case 3: Partial overlaps within larger functions
 2 | 
function complexDataProcessor(data) {
    // Validation phase: anything that is falsy or not an array is rejected with an Error
    if (!data || !Array.isArray(data)) {
        throw new Error('Invalid input');
    }
    
    const results = {
        processed: [],
        errors: [],
        stats: {
            total: 0,
            success: 0,
            failed: 0
        }
    };
    
    // Processing phase - this loop's inner `if` block is the part repeated in simpleProcessor and batchProcessor
    for (let i = 0; i < data.length; i++) {
        try {
            if (data[i].value > 0) {
                results.processed.push({
                    id: data[i].id,
                    value: data[i].value * 2,
                    timestamp: new Date()
                });
                results.stats.success++;
            }
        } catch (error) {
            results.errors.push({
                index: i,
                error: error.message
            });
            results.stats.failed++;
        }
        results.stats.total++;
    }
    
    // Summary phase: log how many items were visited, then return the accumulated results object
    console.log(`Processed ${results.stats.total} items`);
    return results;
}
44 | 
function simpleProcessor(items) {
    const output = [];
    // Same filter-and-push loop as the processing phase of complexDataProcessor, without the try/catch and stats
    for (let i = 0; i < items.length; i++) {
        if (items[i].value > 0) {
            output.push({
                id: items[i].id,
                value: items[i].value * 2,
                timestamp: new Date()
            });
        }
    }
    return output;
}
59 | 
function batchProcessor(batches) {
    const allResults = [];
    
    for (let batch of batches) {
        const batchResults = [];
        // Inner loop has the same body as simpleProcessor's loop, applied to one batch at a time
        for (let i = 0; i < batch.length; i++) {
            if (batch[i].value > 0) {
                batchResults.push({
                    id: batch[i].id,
                    value: batch[i].value * 2,
                    timestamp: new Date()
                });
            }
        }
        allResults.push(batchResults);
    }
    
    return allResults;
}


--------------------------------------------------------------------------------
/crates/core/src/cli_file_utils.rs:
--------------------------------------------------------------------------------
 1 | use ignore::WalkBuilder;
 2 | use std::collections::HashSet;
 3 | use std::path::{Path, PathBuf};
 4 | 
 5 | /// Collect files from paths with given extensions
 6 | pub fn collect_files(paths: &[String], extensions: &[&str]) -> anyhow::Result<Vec<PathBuf>> {
 7 |     let mut files = Vec::new();
 8 |     let mut visited = HashSet::new();
 9 | 
10 |     // Process each path
11 |     for path_str in paths {
12 |         let path = Path::new(path_str);
13 | 
14 |         if path.is_file() {
15 |             // If it's a file, check extension and add it
16 |             if let Some(ext) = path.extension() {
17 |                 if let Some(ext_str) = ext.to_str() {
18 |                     if extensions.contains(&ext_str) {
19 |                         if let Ok(canonical) = path.canonicalize() {
20 |                             if visited.insert(canonical.clone()) {
21 |                                 files.push(path.to_path_buf());
22 |                             }
23 |                         }
24 |                     }
25 |                 }
26 |             }
27 |         } else if path.is_dir() {
28 |             // If it's a directory, walk it respecting .gitignore
29 |             let walker = WalkBuilder::new(path).follow_links(false).build();
30 | 
31 |             for entry in walker {
32 |                 let entry = entry?;
33 |                 let entry_path = entry.path();
34 | 
35 |                 // Skip if not a file
36 |                 if !entry_path.is_file() {
37 |                     continue;
38 |                 }
39 | 
40 |                 // Check extension
41 |                 if let Some(ext) = entry_path.extension() {
42 |                     if let Some(ext_str) = ext.to_str() {
43 |                         if extensions.contains(&ext_str) {
44 |                             if let Ok(canonical) = entry_path.canonicalize() {
45 |                                 if visited.insert(canonical.clone()) {
46 |                                     files.push(entry_path.to_path_buf());
47 |                                 }
48 |                             }
49 |                         }
50 |                     }
51 |                 }
52 |             }
53 |         } else {
54 |             eprintln!("Path does not exist or is not accessible: {}", path_str);
55 |         }
56 |     }
57 | 
58 |     // Sort files for consistent output
59 |     files.sort();
60 | 
61 |     Ok(files)
62 | }
63 | 


--------------------------------------------------------------------------------
/examples/test_different_ts_structures.ts:
--------------------------------------------------------------------------------
  1 | // Test file to verify that clearly different TypeScript structures are not detected as similar
  2 | 
// Simple interface with two number fields (x, y)
interface Point2D {
  x: number;
  y: number;
}
  8 | 
// Complex interface with many fields (eleven in total, `certificate` being the only optional one)
interface DatabaseConfig {
  host: string;
  port: number;
  username: string;
  password: string;
  database: string;
  poolSize: number;
  timeout: number;
  ssl: boolean;
  certificate?: string;
  retryAttempts: number;
  retryDelay: number;
}
 23 | 
// Empty interface (declares no members)
interface Marker {}
 26 | 
// Plain alias of the primitive `string` type (it has no fields)
type Id = string;
 29 | 
// Union type of four string literals
type Status = 'pending' | 'active' | 'inactive' | 'deleted';
 32 | 
// Another simple interface that might look similar to Point2D (two number fields, different names)
interface Coordinate {
  lat: number;
  lng: number;
}
 38 | 
// Complex generic type with nested structure (optional `error` object, required `metadata` object)
type ApiResponse<T> = {
  success: boolean;
  data: T;
  error?: {
    code: number;
    message: string;
    details?: string[];
  };
  metadata: {
    timestamp: number;
    version: string;
    requestId: string;
  };
};
 54 | 
// Class with methods (different from interfaces): wraps a private Map and exposes get/add by string id
class UserService {
  private users: Map<string, any>;
  
  constructor() {
    this.users = new Map();
  }
  
  getUser(id: string) {
    return this.users.get(id);
  }
  
  addUser(id: string, data: any) {
    this.users.set(id, data);
  }
}
 71 | 
// Enum (different structure from interfaces): three members, each assigned a hex colour string
enum Color {
  Red = '#FF0000',
  Green = '#00FF00',
  Blue = '#0000FF',
}
 78 | 
// Large configuration object: three top-level scalar fields plus nested `features`, `api` and `logging` objects
interface ApplicationConfig {
  appName: string;
  version: string;
  environment: 'dev' | 'staging' | 'production';
  features: {
    auth: boolean;
    analytics: boolean;
    notifications: boolean;
    darkMode: boolean;
  };
  api: {
    baseUrl: string;
    timeout: number;
    retries: number;
  };
  logging: {
    level: 'debug' | 'info' | 'warn' | 'error';
    file: string;
    console: boolean;
  };
}


--------------------------------------------------------------------------------
/examples/rust_types_example.rs:
--------------------------------------------------------------------------------
  1 | // Example file with similar Rust types for testing
  2 | 
// Very similar structs with different names (each has a u64, two Strings and a SystemTime, in that order)
struct User {
    id: u64,
    name: String,
    email: String,
    created_at: std::time::SystemTime,
}

struct Person {
    id: u64,
    full_name: String,
    email_address: String,
    birth_date: std::time::SystemTime,
}

struct Customer {
    customer_id: u64,
    customer_name: String,
    contact_email: String,
    registration_date: std::time::SystemTime,
}
 24 | 
// Similar enums (each has four unit variants, with different names)
enum Status {
    Active,
    Inactive,
    Pending,
    Completed,
}

enum State {
    Running,
    Stopped,
    Waiting,
    Finished,
}

enum TaskStatus {
    InProgress,
    Paused,
    Queued,
    Done,
}
 46 | 
// Generic structs (each has a `T` field, a u16 and a String, in that order)
struct Response<T> {
    data: T,
    status: u16,
    message: String,
}

struct ApiResult<T> {
    result: T,
    code: u16,
    description: String,
}

struct ServerResponse<T> {
    payload: T,
    status_code: u16,
    error_message: String,
}
 65 | 
// Nested structs (ComplexUser and ComplexPerson each hold a u64 plus a profile struct and a settings struct)
struct ComplexUser {
    id: u64,
    profile: UserProfile,
    settings: UserSettings,
}

struct UserProfile {
    name: String,
    email: String,
    phone: String,
}

struct UserSettings {
    theme: String,
    notifications: bool,
}

struct ComplexPerson {
    person_id: u64,
    person_profile: PersonProfile,
    person_settings: PersonSettings,
}

struct PersonProfile {
    full_name: String,
    email_address: String,
    phone_number: String,
}

struct PersonSettings {
    ui_theme: String,
    enable_notifications: bool,
}
100 | 
// Different structures (field types differ from the user-like structs at the top of the file)
struct Product {
    sku: String,
    name: String,
    price: f64,
    in_stock: bool,
}

struct Order {
    order_id: String,
    items: Vec<String>,
    total: f64,
    paid: bool,
}
115 | 
// Type aliases (two aliases of u64 and one alias of String)
type UserId = u64;
type CustomerId = u64;
type OrderId = String;


--------------------------------------------------------------------------------
/docs/prompt.md:
--------------------------------------------------------------------------------
 1 | # similarity-ts: AI Assistant Guide
 2 | 
 3 | ## Purpose
 4 | 
 5 | Detects duplicate TypeScript/JavaScript code using AST comparison for refactoring.
 6 | 
 7 | ## Installation
 8 | 
 9 | ```bash
10 | cargo install similarity-ts
11 | # check options
12 | similarity-ts --help
13 | ```
14 | 
15 | ## Key Options
16 | 
17 | - `--threshold <0-1>`: Similarity threshold (default: 0.8)
 18 | - `--min-tokens <n>`: Skip functions with fewer than `n` AST nodes (recommended: 20-30)
19 | - `--print`: Show actual code snippets
20 | 
21 | ## AI Refactoring Workflow
22 | 
23 | ### 1. Broad Scan
24 | 
25 | Find all duplicates in codebase:
26 | 
27 | ```bash
28 | similarity-ts src/ --threshold 0.85 --min-tokens 25
29 | ```
30 | 
31 | ### 2. Focused Analysis
32 | 
33 | Examine specific file pairs:
34 | 
35 | ```bash
36 | similarity-ts file1.ts file2.ts --threshold 0.8 --min-tokens 20 --print
37 | ```
38 | 
39 | ### 3. Threshold Tuning
40 | 
41 | If no results, progressively lower:
42 | 
43 | ```bash
44 | similarity-ts file1.ts file2.ts --threshold 0.75 --min-tokens 20
45 | similarity-ts file1.ts file2.ts --threshold 0.7 --min-tokens 20
46 | ```
47 | 
48 | ## Output Format
49 | 
50 | ```
51 | Function: functionName (file.ts:startLine-endLine)
52 | Similar to: otherFunction (other.ts:startLine-endLine)
53 | Similarity: 85%
54 | ```
55 | 
56 | ## Effective Thresholds
57 | 
58 | - `0.95+`: Nearly identical (variable renames only)
59 | - `0.85-0.95`: Same algorithm, minor differences
60 | - `0.75-0.85`: Similar structure, different details
61 | - `0.7-0.75`: Related logic, worth investigating
62 | 
63 | ## Refactoring Strategy
64 | 
65 | 1. **Start with high threshold** (0.9) to find obvious duplicates
66 | 2. **Compare specific pairs** when similarity found
67 | 3. **Use --print** to see actual code differences
68 | 4. **Extract common logic** into shared functions/modules
 69 | 5. **Re-run after refactoring** to confirm the duplicates are gone and no new ones were introduced
70 | 
71 | ## Common Patterns to Refactor
72 | 
73 | - **Data processing loops** with different field names
74 | - **API handlers** with similar request/response logic
75 | - **Validation functions** with different rules
76 | - **State management** with repeated patterns
77 | 
78 | ## Best Practices
79 | 
80 | - Use `--min-tokens` for accurate complexity filtering (20-30 tokens)
81 | - Focus on files with 80%+ similarity first
82 | - Check if similar functions are in same module (easier to refactor)
83 | - Consider function size - larger duplicates have more impact
84 | - Look for patterns across multiple files, not just pairs
85 | 


--------------------------------------------------------------------------------
/__deprecated/test/basic.test.ts:
--------------------------------------------------------------------------------
 1 | import { describe, it, expect } from "vitest";
 2 | import { calculateSimilarity } from "../src/index.ts";
 3 | import { parseTypeScript } from "../src/parser.ts";
 4 | import { astToString } from "../src/core/ast.ts";
 5 | 
 6 | describe("Basic Functionality", () => {
 7 |   describe("parseTypeScript", () => {
 8 |     it("should parse valid TypeScript code", () => {
 9 |       const code = `const x = 42;`;
10 |       const ast = parseTypeScript("test.ts", code);
11 | 
12 |       expect(ast).toBeDefined();
13 |       expect(ast.program).toBeDefined();
14 |       expect(ast.program.body).toHaveLength(1);
15 |     });
16 | 
17 |     it("should parse function declarations", () => {
18 |       const code = `function test(a: number): number { return a * 2; }`;
19 |       const ast = parseTypeScript("test.ts", code);
20 | 
21 |       expect(ast.program.body).toHaveLength(1);
22 |       expect(ast.program.body[0].type).toBe("FunctionDeclaration");
23 |     });
24 | 
25 |     it("should parse class declarations", () => {
26 |       const code = `class MyClass { method() {} }`;
27 |       const ast = parseTypeScript("test.ts", code);
28 | 
29 |       expect(ast.program.body).toHaveLength(1);
30 |       expect(ast.program.body[0].type).toBe("ClassDeclaration");
31 |     });
32 |   });
33 | 
34 |   describe("astToString", () => {
35 |     it("should convert AST to string representation", () => {
36 |       const code = `const x = 1;`;
37 |       const ast = parseTypeScript("test.ts", code);
38 |       const str = astToString(ast.program);
39 | 
40 |       expect(str).toContain("VariableDeclaration");
41 |       expect(str).toContain("VariableDeclarator");
42 |     });
43 |   });
44 | 
45 |   describe("calculateSimilarity (Levenshtein only)", () => {
46 |     it("should work with simple examples", () => {
47 |       const code1 = `const x = 1;`;
48 |       const code2 = `const y = 2;`;
49 | 
50 |       const similarity = calculateSimilarity(code1, code2);
51 |       expect(similarity).toBeGreaterThan(0.7);
52 |       expect(similarity).toBeLessThan(1.0);
53 |     });
54 | 
55 |     it("should handle empty code", () => {
56 |       const similarity = calculateSimilarity("", "");
57 |       expect(similarity).toBe(1.0);
58 |     });
59 | 
60 |     it("should handle completely different code", () => {
61 |       const code1 = `function add(a: number, b: number) { return a + b; }`;
62 |       const code2 = `import { readFileSync } from "fs";`;
63 | 
64 |       const similarity = calculateSimilarity(code1, code2);
65 |       expect(similarity).toBeLessThan(0.4);
66 |     });
67 |   });
68 | });
69 | 


--------------------------------------------------------------------------------
/crates/similarity-rs/tests/test_function_extraction.rs:
--------------------------------------------------------------------------------
 1 | use similarity_core::language_parser::LanguageParser;
 2 | use similarity_rs::rust_parser::RustParser;
 3 | 
 4 | #[test]
 5 | fn test_function_extraction() {
 6 |     let content = r#"fn f1() -> i32 { 1 }
 7 | fn f2() -> i32 { 1 }
 8 | 
 9 | fn longer_func1() -> i32 {
10 |     let x = 1;
11 |     let y = 2;
12 |     let z = 3;
13 |     x + y + z
14 | }
15 | 
16 | fn longer_func2() -> i32 {
17 |     let a = 1;
18 |     let b = 2; 
19 |     let c = 3;
20 |     a + b + c
21 | }"#;
22 | 
23 |     let mut parser = RustParser::new().unwrap();
24 |     let functions = parser.extract_functions(content, "test.rs").unwrap();
25 | 
26 |     println!("\n=== Extracted Functions ===");
27 |     for (i, func) in functions.iter().enumerate() {
28 |         println!(
29 |             "[{}] {}: lines {}-{}, body {}-{}",
30 |             i, func.name, func.start_line, func.end_line, func.body_start_line, func.body_end_line
31 |         );
32 | 
33 |         // Extract body
34 |         let lines: Vec<&str> = content.lines().collect();
35 |         let start_idx = if func.body_start_line > 0 {
36 |             (func.body_start_line.saturating_sub(1)) as usize
37 |         } else {
38 |             (func.start_line.saturating_sub(1)) as usize
39 |         };
40 | 
41 |         let end_idx = if func.body_end_line > 0 {
42 |             std::cmp::min(func.body_end_line as usize, lines.len())
43 |         } else {
44 |             std::cmp::min(func.end_line as usize, lines.len())
45 |         };
46 | 
47 |         let body = lines[start_idx..end_idx].join("\n");
48 |         println!("  Start idx: {}, End idx: {}", start_idx, end_idx);
49 |         println!("  Lines total: {}", lines.len());
50 |         if start_idx < lines.len() {
51 |             println!("  Line at start_idx: {:?}", lines[start_idx]);
52 |         }
53 |         println!("  Body: {}", body.replace('\n', "\\n"));
54 |     }
55 | 
56 |     assert_eq!(functions.len(), 4);
57 |     assert_eq!(functions[2].name, "longer_func1");
58 |     assert_eq!(functions[3].name, "longer_func2");
59 | 
60 |     // Check that longer functions have correct line counts
61 |     let func1_lines = functions[2].end_line - functions[2].start_line + 1;
62 |     let func2_lines = functions[3].end_line - functions[3].start_line + 1;
63 | 
64 |     println!("\nlonger_func1 has {} lines", func1_lines);
65 |     println!("longer_func2 has {} lines", func2_lines);
66 | 
67 |     assert!(func1_lines >= 5, "longer_func1 should have at least 5 lines");
68 |     assert!(func2_lines >= 5, "longer_func2 should have at least 5 lines");
69 | }
70 | 


--------------------------------------------------------------------------------
/crates/core/src/ast_exchange.rs:
--------------------------------------------------------------------------------
 1 | use crate::tree::TreeNode;
 2 | use serde::{Deserialize, Serialize};
 3 | use std::rc::Rc;
 4 | 
/// Serializable version of TreeNode for external exchange.
///
/// Carries the same `label`, `value`, `children` and `id` as a `TreeNode`, but
/// owns its children directly so the whole tree can derive
/// `Serialize`/`Deserialize`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SerializableTreeNode {
    /// Copy of the source node's `label`.
    pub label: String,
    /// Copy of the source node's `value`.
    pub value: String,
    /// Child nodes, in the same order as in the source node.
    pub children: Vec<SerializableTreeNode>,
    /// Copy of the source node's `id`.
    pub id: usize,
}
13 | 
14 | impl From<&TreeNode> for SerializableTreeNode {
15 |     fn from(node: &TreeNode) -> Self {
16 |         SerializableTreeNode {
17 |             label: node.label.clone(),
18 |             value: node.value.clone(),
19 |             children: node.children.iter().map(|child| child.as_ref().into()).collect(),
20 |             id: node.id,
21 |         }
22 |     }
23 | }
24 | 
25 | impl From<SerializableTreeNode> for TreeNode {
26 |     fn from(node: SerializableTreeNode) -> Self {
27 |         let mut tree_node = TreeNode::new(node.label, node.value, node.id);
28 |         for child in node.children {
29 |             tree_node.add_child(Rc::new(child.into()));
30 |         }
31 |         tree_node
32 |     }
33 | }
34 | 
/// Function definition for external exchange.
///
/// Describes one function: its name, where it sits in the source file, and its
/// AST in serializable form.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExchangeFunctionDef {
    /// Name of the function.
    pub name: String,
    /// First line of the function (presumably 1-based; confirm against the producer).
    pub start_line: u32,
    /// Last line of the function.
    pub end_line: u32,
    /// First line of the function body.
    pub body_start_line: u32,
    /// Last line of the function body.
    pub body_end_line: u32,
    /// AST of the function.
    pub ast: SerializableTreeNode,
}
45 | 
/// Complete AST exchange format.
///
/// Top-level document for one source file: the language it is written in, its
/// name, the functions found in it and, optionally, the AST of the whole file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ASTExchange {
    /// Language of the source file.
    pub language: String,
    /// Name of the source file.
    pub filename: String,
    /// Functions contained in the file.
    pub functions: Vec<ExchangeFunctionDef>,
    /// AST of the entire file, when one is provided.
    pub full_ast: Option<SerializableTreeNode>,
}
54 | 
#[cfg(test)]
mod tests {
    use super::*;

    /// Round-trips a three-node tree through JSON and checks that the root
    /// label and the number of children survive the conversion.
    #[test]
    fn test_tree_node_serialization() {
        let mut original = TreeNode::new("function".to_string(), "foo".to_string(), 0);
        for (offset, label) in ["params", "body"].iter().enumerate() {
            let child = TreeNode::new(label.to_string(), String::new(), offset + 1);
            original.add_child(Rc::new(child));
        }

        let encoded = serde_json::to_string(&SerializableTreeNode::from(&original)).unwrap();
        let decoded: SerializableTreeNode = serde_json::from_str(&encoded).unwrap();
        let restored = TreeNode::from(decoded);

        assert_eq!(original.label, restored.label);
        assert_eq!(original.children.len(), restored.children.len());
    }
}
74 | 


--------------------------------------------------------------------------------
/crates/similarity-ts/tests/tsx_test.rs:
--------------------------------------------------------------------------------
 1 | use assert_cmd::Command;
 2 | use predicates::prelude::*;
 3 | use std::fs;
 4 | use tempfile::tempdir;
 5 | 
 6 | #[test]
 7 | fn test_tsx_file_support() {
 8 |     let dir = tempdir().unwrap();
 9 |     let tsx_file = dir.path().join("component.tsx");
10 | 
11 |     // Create a .tsx file with React component
12 |     fs::write(
13 |         &tsx_file,
14 |         r#"
15 | import React from 'react';
16 | 
17 | interface ButtonProps {
18 |     label: string;
19 |     onClick: () => void;
20 | }
21 | 
22 | export function Button({ label, onClick }: ButtonProps) {
23 |     return React.createElement('button', { onClick }, label);
24 | }
25 | 
26 | export function PrimaryButton({ label, onClick }: ButtonProps) {
27 |     return React.createElement('button', { onClick, className: 'primary' }, label);
28 | }
29 | "#,
30 |     )
31 |     .unwrap();
32 | 
33 |     // Run similarity-ts on .tsx file
34 |     let mut cmd = Command::cargo_bin("similarity-ts").unwrap();
35 |     cmd.arg(dir.path())
36 |         .arg("--min-lines")
37 |         .arg("1")
38 |         .arg("--threshold")
39 |         .arg("0.5")
40 |         .arg("--no-size-penalty")
41 |         .assert()
42 |         .success()
43 |         .stdout(predicate::str::contains("Checking 1 files for duplicates"));
44 | }
45 | 
/// A directory mixing one `.ts` and one `.tsx` file must have both picked up:
/// the binary has to exit successfully and report that it is checking two files.
#[test]
fn test_mixed_ts_tsx_files() {
    let workspace = tempdir().unwrap();
    let utils_path = workspace.path().join("utils.ts");
    let component_path = workspace.path().join("component.tsx");

    // Create a .ts file
    let utils_source = r#"
export function formatName(first: string, last: string): string {
    return `${first} ${last}`;
}
"#;
    fs::write(&utils_path, utils_source).unwrap();

    // Create a .tsx file with similar function
    let component_source = r#"
import React from 'react';

export function formatFullName(firstName: string, lastName: string): string {
    return `${firstName} ${lastName}`;
}

export function NameDisplay({ name }: { name: string }) {
    return React.createElement('span', null, name);
}
"#;
    fs::write(&component_path, component_source).unwrap();

    // Run similarity-ts on both files
    let mut similarity_ts = Command::cargo_bin("similarity-ts").unwrap();
    similarity_ts
        .arg(workspace.path())
        .args(&["--threshold", "0.5", "--min-lines", "1", "--no-size-penalty"])
        .assert()
        .success()
        .stdout(predicate::str::contains("Checking 2 files for duplicates"));
}
92 | 


--------------------------------------------------------------------------------
/examples/test_rust_with_derives.rs:
--------------------------------------------------------------------------------
  1 | // Test file for Rust structures with derive attributes
  2 | 
  3 | // Structs with common derives: User and Person share both derives and field types
  4 | #[derive(Debug, Clone, PartialEq)]
  5 | pub struct User {
  6 |     pub id: u64,
  7 |     pub name: String,
  8 |     pub email: String,
  9 | }
 10 | 
 11 | #[derive(Debug, Clone, PartialEq)]
 12 | pub struct Person { // same derives and field types as User; only the names differ
 13 |     pub id: u64,
 14 |     pub full_name: String,
 15 |     pub email_address: String,
 16 | }
 17 | 
 18 | // Similar structure but different derives: same field types as User, other derive list
 19 | #[derive(Debug, Serialize, Deserialize)] // NOTE(review): no `use` for Serialize/Deserialize is visible in this file
 20 | pub struct Account {
 21 |     pub id: u64,
 22 |     pub username: String,
 23 |     pub email: String,
 24 | }
 25 | 
 26 | // Completely different derives: only Default, while the fields are again a u64 and two Strings
 27 | #[derive(Default)]
 28 | pub struct Profile {
 29 |     pub user_id: u64,
 30 |     pub display_name: String,
 31 |     pub contact_email: String,
 32 | }
 33 | 
 34 | // Enums with derives: four unit variants, with Copy and Eq on top of Debug/Clone/PartialEq
 35 | #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 36 | pub enum Status {
 37 |     Active,
 38 |     Inactive,
 39 |     Pending,
 40 |     Suspended,
 41 | }
 42 | 
 43 | #[derive(Debug, Clone, PartialEq)]
 44 | pub enum UserStatus { // shares Active/Inactive/Pending with Status; no Copy/Eq, Banned instead of Suspended
 45 |     Active,
 46 |     Inactive,
 47 |     Pending,
 48 |     Banned,
 49 | }
 50 | 
 51 | // Different enum with same derives as Status: four unit variants with unrelated names
 52 | #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 53 | pub enum Color {
 54 |     Red,
 55 |     Green,
 56 |     Blue,
 57 |     Yellow,
 58 | }
 59 | 
 60 | // Complex derives with serde: a container attribute renames the serialized fields to camelCase
 61 | #[derive(Debug, Clone, Serialize, Deserialize)]
 62 | #[serde(rename_all = "camelCase")]
 63 | pub struct ApiResponse {
 64 |     pub success: bool,
 65 |     pub message: String,
 66 |     pub data: Option<String>,
 67 | }
 68 | 
 69 | #[derive(Debug, Clone, Serialize, Deserialize)]
 70 | #[serde(rename_all = "snake_case")] // same derives as ApiResponse, but a different rename rule
 71 | pub struct ApiResult {
 72 |     pub is_success: bool,
 73 |     pub error_message: String,
 74 |     pub payload: Option<String>,
 75 | }
 76 | 
 77 | // Structs with custom attributes: each struct is gated behind its own `cfg(feature = ...)`
 78 | #[derive(Debug, Clone)]
 79 | #[cfg(feature = "postgres")]
 80 | pub struct DatabaseConfig {
 81 |     pub host: String,
 82 |     pub port: u16,
 83 |     pub database: String,
 84 | }
 85 | 
 86 | #[derive(Debug, Clone)]
 87 | #[cfg(feature = "mysql")] // same derives and field types as DatabaseConfig, gated on another feature
 88 | pub struct DbConfig {
 89 |     pub hostname: String,
 90 |     pub port_number: u16,
 91 |     pub db_name: String,
 92 | }
 93 | 
 94 | // Generic structs with derives: two type parameters, each wrapped in an Option field
 95 | #[derive(Debug, Clone, PartialEq)]
 96 | pub struct Result<T, E> { // NOTE(review): this name shadows the prelude's `Result` within this module
 97 |     value: Option<T>,
 98 |     error: Option<E>,
 99 | }
100 | 
101 | #[derive(Debug, Clone, PartialEq)]
102 | pub struct Response<T, E> { // same shape as Result<T, E>; only the field names differ
103 |     data: Option<T>,
104 |     err: Option<E>,
105 | }


--------------------------------------------------------------------------------
/__deprecated/src/core/ast_traversal.ts:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Shared AST traversal utility to eliminate code duplication
 3 |  * This module provides a common traversal pattern used across the codebase
 4 |  */
 5 | 
 6 | // Callback invoked for a visited node, together with the shared state and,
 7 | // when there is one, the object the node was reached from.
 8 | type NodeHandler<T> = (node: any, state: T, parent?: any) => void;
 9 | 
10 | interface NodeHandlers<T> {
11 |   // Catch-all: a handler may be registered under any node type name
12 |   [nodeType: string]: NodeHandler<T> | undefined;
13 | 
14 |   // Hooks fired around every visited node, whatever its type
15 |   enter?: NodeHandler<T>;
16 |   leave?: NodeHandler<T>;
17 | 
18 |   // Explicitly listed node types
19 |   FunctionDeclaration?: NodeHandler<T>;
20 |   FunctionExpression?: NodeHandler<T>;
21 |   ArrowFunctionExpression?: NodeHandler<T>;
22 |   MethodDefinition?: NodeHandler<T>;
23 |   ClassDeclaration?: NodeHandler<T>;
24 |   ClassExpression?: NodeHandler<T>;
25 |   VariableDeclaration?: NodeHandler<T>;
26 |   VariableDeclarator?: NodeHandler<T>;
27 |   MemberExpression?: NodeHandler<T>;
28 |   CallExpression?: NodeHandler<T>;
29 |   ThisExpression?: NodeHandler<T>;
30 |   Identifier?: NodeHandler<T>;
31 |   BlockStatement?: NodeHandler<T>;
32 | }
33 | 
34 | /**
35 |  * Depth-first walk over an AST-like object graph.
36 |  * For every object reached: `enter`, then the handler registered under the
37 |  * node's string `type` (if any), then the children, then `leave`.
38 |  * @param node - Value to visit; anything other than a non-null object is ignored
39 |  * @param handlers - Lifecycle hooks and per-type callbacks
40 |  * @param state - Caller-owned value passed unchanged to every callback
41 |  * @param parent - Object from which `node` was reached (optional)
42 |  */
43 | export function traverseAST<T>(node: any, handlers: NodeHandlers<T>, state: T, parent?: any): void {
44 |   if (node === null || typeof node !== "object") return;
45 | 
46 |   handlers.enter?.(node, state, parent);
47 | 
48 |   // Dispatch on the node kind; only a non-empty string type can select a handler.
49 |   const kind = node.type;
50 |   const onKind = kind && typeof kind === "string" ? handlers[kind] : undefined;
51 |   if (onKind) {
52 |     onKind(node, state, parent);
53 |   }
54 | 
55 |   for (const key in node) {
56 |     // These keys are never descended into (circular references / internal properties).
57 |     const skipped = key === "parent" || key === "scope" || key === "_parent";
58 |     if (skipped) continue;
59 | 
60 |     const value = node[key];
61 |     if (Array.isArray(value)) {
62 |       // Elements are visited in index order, each with `node` as its parent.
63 |       for (let i = 0, count = value.length; i < count; i++) {
64 |         traverseAST(value[i], handlers, state, node);
65 |       }
66 |     } else if (value && typeof value === "object") {
67 |       traverseAST(value, handlers, state, node);
68 |     }
69 |   }
70 | 
71 |   handlers.leave?.(node, state, parent);
72 | }
73 | 
74 | /** Identity helper: hands back `handlers` unchanged, typed as `NodeHandlers<T>`. */
75 | export function createVisitor<T>(handlers: NodeHandlers<T>): NodeHandlers<T> {
76 |   // Nothing is copied or wrapped; the caller receives the very same object.
77 |   const visitor = handlers;
78 |   return visitor;
79 | }
80 | 


--------------------------------------------------------------------------------
/test/__fixtures__/refactoring/class_to_function/repository_class.ts:
--------------------------------------------------------------------------------
  1 | // Repository pattern as a class with internal state
  2 | // This is a real-world pattern from the codebase
  3 | 
  4 | export interface Entity {
  5 |   id: string; // key under which Repository stores the entity; generated in create()
  6 |   createdAt: Date; // assigned in Repository.create() and carried over unchanged by update()
  7 |   updatedAt: Date; // assigned in create() and replaced with a new Date by update()
  8 | }
  9 | /** In-memory store of entities keyed by id; mutations and failed lookups are reported via console.log. */
 10 | export class Repository<T extends Entity> {
 11 |   private items: Map<string, T> = new Map();
 12 |   private entityName: string;
 13 |   /** @param entityName - label used as the prefix of generated ids and of log lines */
 14 |   constructor(entityName: string) {
 15 |     this.entityName = entityName;
 16 |   }
 17 |   /** Stores and returns a new entity built from `data` plus a generated id and identical createdAt/updatedAt. */
 18 |   create(data: Omit<T, 'id' | 'createdAt' | 'updatedAt'>): T {
 19 |     const id = this.generateId();
 20 |     const now = new Date();
 21 |     // The literal below is asserted to be a T; the compiler does not verify that claim.
 22 |     const entity = {
 23 |       ...data,
 24 |       id,
 25 |       createdAt: now,
 26 |       updatedAt: now
 27 |     } as T;
 28 |     
 29 |     this.items.set(id, entity);
 30 |     this.log('created', id);
 31 |     
 32 |     return entity;
 33 |   }
 34 |   /** Returns the entity stored under `id`, or null (after logging 'not found') when there is none. */
 35 |   findById(id: string): T | null {
 36 |     const item = this.items.get(id);
 37 |     
 38 |     if (!item) {
 39 |       this.log('not found', id);
 40 |       return null;
 41 |     }
 42 |     
 43 |     return item;
 44 |   }
 45 |   /** Shallow-merges `updates` over the stored entity and refreshes updatedAt; returns null for an unknown id. */
 46 |   update(id: string, updates: Partial<Omit<T, 'id' | 'createdAt'>>): T | null {
 47 |     const existing = this.findById(id);
 48 |     
 49 |     if (!existing) {
 50 |       return null;
 51 |     }
 52 |     // id and createdAt are written after the spreads, so values in `updates` cannot replace them.
 53 |     const updated = {
 54 |       ...existing,
 55 |       ...updates,
 56 |       id: existing.id,
 57 |       createdAt: existing.createdAt,
 58 |       updatedAt: new Date()
 59 |     } as T;
 60 |     
 61 |     this.items.set(id, updated);
 62 |     this.log('updated', id);
 63 |     
 64 |     return updated;
 65 |   }
 66 |   /** Removes the entity stored under `id` if present; returns whether it existed. */
 67 |   delete(id: string): boolean {
 68 |     const exists = this.items.has(id);
 69 |     
 70 |     if (exists) {
 71 |       this.items.delete(id);
 72 |       this.log('deleted', id);
 73 |     }
 74 |     
 75 |     return exists;
 76 |   }
 77 |   /** Returns all stored entities as a new array. */
 78 |   findAll(): T[] {
 79 |     return Array.from(this.items.values());
 80 |   }
 81 |   /** Returns the stored entities for which `predicate` returns true. */
 82 |   findByPredicate(predicate: (item: T) => boolean): T[] {
 83 |     return this.findAll().filter(predicate);
 84 |   }
 85 |   /** Returns the number of stored entities. */
 86 |   count(): number {
 87 |     return this.items.size;
 88 |   }
 89 |   /** Removes every stored entity and logs the action. */
 90 |   clear(): void {
 91 |     this.items.clear();
 92 |     this.log('cleared', 'all');
 93 |   }
 94 |   /** Builds an id from the entity name, the current timestamp and a random base-36 suffix. */
 95 |   private generateId(): string {
 96 |     return `${this.entityName}_${Date.now()}_${Math.random().toString(36).substr(2, 9)}`;
 97 |   }
 98 |   /** Writes "[entityName] action: id" to the console. */
 99 |   private log(action: string, id: string): void {
100 |     console.log(`[${this.entityName}] ${action}: ${id}`);
101 |   }
102 | }


--------------------------------------------------------------------------------
/test/__fixtures__/duplication/structural/error_handling_pattern_1.ts:
--------------------------------------------------------------------------------
 1 | // Structural duplication: Error handling with try-catch pattern
 2 | // This shows repeated error handling structure across different operations
 3 | /** Fetches /api/users/{userId}; resolves to { data } on success, or to { error } when anything in the try block fails. */
 4 | export async function fetchUserData(userId: string): Promise<{ data?: any; error?: string }> {
 5 |   try {
 6 |     const response = await fetch(`/api/users/${userId}`);
 7 |     // A non-ok response is turned into a thrown Error so it shares the catch path below.
 8 |     if (!response.ok) {
 9 |       throw new Error(`HTTP error! status: ${response.status}`);
10 |     }
11 |     
12 |     const data = await response.json();
13 |     return { data };
14 |   } catch (error) {
15 |     console.error('Error fetching user data:', error);
16 |     return { 
17 |       error: error instanceof Error ? error.message : 'Unknown error occurred' 
18 |     };
19 |   }
20 | }
21 | /** Fetches /api/products/{productId}; resolves to { data } on success, or to { error } when anything in the try block fails. */
22 | export async function fetchProductData(productId: string): Promise<{ data?: any; error?: string }> {
23 |   try {
24 |     const response = await fetch(`/api/products/${productId}`);
25 |     // A non-ok response is turned into a thrown Error so it shares the catch path below.
26 |     if (!response.ok) {
27 |       throw new Error(`HTTP error! status: ${response.status}`);
28 |     }
29 |     
30 |     const data = await response.json();
31 |     return { data };
32 |   } catch (error) {
33 |     console.error('Error fetching product data:', error);
34 |     return { 
35 |       error: error instanceof Error ? error.message : 'Unknown error occurred' 
36 |     };
37 |   }
38 | }
39 | /** Fetches /api/orders/{orderId}; resolves to { data } on success, or to { error } when anything in the try block fails. */
40 | export async function fetchOrderData(orderId: string): Promise<{ data?: any; error?: string }> {
41 |   try {
42 |     const response = await fetch(`/api/orders/${orderId}`);
43 |     // A non-ok response is turned into a thrown Error so it shares the catch path below.
44 |     if (!response.ok) {
45 |       throw new Error(`HTTP error! status: ${response.status}`);
46 |     }
47 |     
48 |     const data = await response.json();
49 |     return { data };
50 |   } catch (error) {
51 |     console.error('Error fetching order data:', error);
52 |     return { 
53 |       error: error instanceof Error ? error.message : 'Unknown error occurred' 
54 |     };
55 |   }
56 | }
57 | /** POSTs { comment } as JSON to /api/posts/{postId}/comments; resolves to { data } on success, or to { error } on failure. */
58 | export async function postComment(postId: string, comment: string): Promise<{ data?: any; error?: string }> {
59 |   try {
60 |     const response = await fetch(`/api/posts/${postId}/comments`, {
61 |       method: 'POST',
62 |       headers: { 'Content-Type': 'application/json' },
63 |       body: JSON.stringify({ comment })
64 |     });
65 |     // A non-ok response is turned into a thrown Error so it shares the catch path below.
66 |     if (!response.ok) {
67 |       throw new Error(`HTTP error! status: ${response.status}`);
68 |     }
69 |     
70 |     const data = await response.json();
71 |     return { data };
72 |   } catch (error) {
73 |     console.error('Error posting comment:', error);
74 |     return { 
75 |       error: error instanceof Error ? error.message : 'Unknown error occurred' 
76 |     };
77 |   }
78 | }


--------------------------------------------------------------------------------
/examples/specs/debug_ast.ts:
--------------------------------------------------------------------------------
 1 | import { parseTypeScript } from "../src/parser.ts";
 2 | // Sample source to parse: a single function declaration named addUser.
 3 | const code = `
 4 | function addUser(user: User): void {
 5 |   console.log('hello');
 6 | }
 7 | `;
 8 | // Parse the sample under the file name "test.ts".
 9 | const ast = parseTypeScript("test.ts", code);
10 | /** Debug helper: prints an AST subtree to the console, nesting by indentation. */
11 | function inspect(node: any, depth = 0): void {
12 |   const pad = "  ".repeat(depth);
13 |   const inner = pad + "  ";
14 |   const isObject = Boolean(node) && typeof node === "object";
15 |   if (!isObject) {
16 |     // Anything that is not an object is printed as-is on its own line.
17 |     console.log(pad + node);
18 |     return;
19 |   }
20 | 
21 |   console.log(pad + node.type + " {");
22 |   for (const key in node) {
23 |     if (key === "parent" || key === "scope") continue;
24 | 
25 |     const value = node[key];
26 |     if (key === "span" && value) {
27 |       console.log(inner + key + ": { start: " + value.start + ", end: " + value.end + " }");
28 |     } else if (Array.isArray(value)) {
29 |       if (value.length === 0) continue; // empty arrays are not printed at all
30 |       console.log(inner + key + ": [");
31 |       value.forEach((element) => inspect(element, depth + 2));
32 |       console.log(inner + "]");
33 |     } else if (value && typeof value === "object" && value.type) {
34 |       console.log(inner + key + ":");
35 |       inspect(value, depth + 2);
36 |     } else if (value != null && key !== "raw" && key !== "regex") {
37 |       console.log(inner + key + ": " + JSON.stringify(value));
38 |     }
39 |   }
40 |   console.log(pad + "}");
41 | }
42 | 
43 | /**
44 |  * Depth-first search for the first node whose `type` is "FunctionDeclaration".
45 |  * Returns that node, or null when the subtree contains none.
46 |  */
47 | function findFunction(node: any): any {
48 |   if (!node) return null;
49 |   if (node.type === "FunctionDeclaration") return node;
50 | 
51 |   for (const key in node) {
52 |     if (key === "parent" || key === "scope") continue;
53 |     // Treat a single child object like a one-element list so both shapes
54 |     // share the same search loop; primitives contribute nothing.
55 |     const value = node[key];
56 |     let children: any[] = [];
57 |     if (Array.isArray(value)) children = value;
58 |     else if (value && typeof value === "object") children = [value];
59 | 
60 |     for (const child of children) {
61 |       const match = findFunction(child);
62 |       if (match) return match;
63 |     }
64 |   }
65 |   return null;
66 | }
67 | 
68 | const func = findFunction(ast.program);
69 | if (func) {
70 |   // Dump the first function declaration found in the parsed program.
71 |   console.log("Found function:");
72 |   inspect(func, 0);
73 |   console.log("\n\nCode snippet:");
74 | 
75 |   const body = func.body;
76 |   if (body) {
77 |     console.log("func.body exists:", !!body);
78 |     console.log("func.body.span:", body.span);
79 |     console.log("func.body.start:", body.start);
80 |     console.log("func.body.end:", body.end);
81 |     // Slice the source text only when both offsets are present.
82 |     const hasOffsets = body.start !== undefined && body.end !== undefined;
83 |     if (hasOffsets) console.log("Body code (using start/end):", code.substring(body.start, body.end));
84 |   }
85 | }
86 | 


--------------------------------------------------------------------------------
/examples/specs/sample_project/src/services/order_service.ts:
--------------------------------------------------------------------------------
 1 | import { Order, OrderStatus } from "../models/order.ts";
 2 | import { Logger } from "../utils/logger.ts";
 3 | import { ValidationError } from "../utils/errors.ts";
 4 | /** In-memory order store keyed by order id; lifecycle events are reported through the injected Logger. */
 5 | export class OrderService {
 6 |   private orders: Map<string, Order> = new Map();
 7 |   private logger: Logger;
 8 | 
 9 |   constructor(logger: Logger) {
10 |     this.logger = logger;
11 |   }
12 | 
13 |   /** Creates and stores a PENDING order for `userId`; throws ValidationError when `items` is empty. */
14 |   async createOrder(userId: string, items: Array<{ productId: string; quantity: number }>): Promise<Order> {
15 |     if (items.length === 0) {
16 |       throw new ValidationError("Order must contain at least one item");
17 |     }
18 | 
19 |     const order: Order = {
20 |       id: this.generateOrderId(),
21 |       userId,
22 |       items,
23 |       status: OrderStatus.PENDING,
24 |       totalAmount: this.calculateTotal(items),
25 |       createdAt: new Date(),
26 |       updatedAt: new Date(),
27 |     };
28 |     this.orders.set(order.id, order);
29 |     this.logger.info(`Order created: ${order.id} for user: ${userId}`);
30 |     return order;
31 |   }
32 | 
33 |   /** Looks up an order by id; resolves to null when the id is unknown. */
34 |   async getOrderById(id: string): Promise<Order | null> {
35 |     // `??` replaces only a missing entry (undefined); it never masks a stored value.
36 |     return this.orders.get(id) ?? null;
37 |   }
38 | 
39 |   /** Returns every stored order whose userId equals `userId`. */
40 |   async getOrdersByUser(userId: string): Promise<Order[]> {
41 |     return Array.from(this.orders.values()).filter((order) => order.userId === userId);
42 |   }
43 | 
44 |   /** Stores a copy of the order with `status` and a fresh updatedAt; resolves to null (after a warning) for unknown ids. */
45 |   async updateOrderStatus(id: string, status: OrderStatus): Promise<Order | null> {
46 |     const order = this.orders.get(id);
47 |     if (!order) {
48 |       this.logger.warn(`Order not found: ${id}`);
49 |       return null;
50 |     }
51 | 
52 |     const updatedOrder = {
53 |       ...order,
54 |       status,
55 |       updatedAt: new Date(),
56 |     };
57 |     this.orders.set(id, updatedOrder);
58 |     this.logger.info(`Order ${id} status updated to: ${status}`);
59 |     return updatedOrder;
60 |   }
61 | 
62 |   /** Cancels an order that is still PENDING; resolves to false for unknown ids or any other status. */
63 |   async cancelOrder(id: string): Promise<boolean> {
64 |     const order = await this.getOrderById(id);
65 |     if (!order) {
66 |       return false;
67 |     }
68 |     if (order.status !== OrderStatus.PENDING) {
69 |       this.logger.warn(`Cannot cancel order ${id} with status: ${order.status}`);
70 |       return false;
71 |     }
72 | 
73 |     await this.updateOrderStatus(id, OrderStatus.CANCELLED);
74 |     return true;
75 |   }
76 | 
77 |   private calculateTotal(items: Array<{ productId: string; quantity: number }>): number {
78 |     // Simplified calculation - in real app would fetch product prices
79 |     return items.reduce((total, item) => total + item.quantity * 10, 0);
80 |   }
81 | 
82 |   private generateOrderId(): string {
83 |     // slice(2, 11) yields the same nine characters as the legacy substr(2, 9) call.
84 |     return `order_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`;
85 |   }
86 | }
87 | 


--------------------------------------------------------------------------------
/RELEASE_NOTES.md:
--------------------------------------------------------------------------------
 1 | # similarity-ts v0.1.0
 2 | 
 3 | First release of similarity-ts (formerly ts-similarity) - a high-performance TypeScript/JavaScript code similarity detection tool written in Rust.
 4 | 
 5 | ## 🎯 Features
 6 | 
 7 | ### Core Functionality
 8 | - **Function Similarity Detection**: Find duplicate or similar functions across your codebase
 9 | - **Type Similarity Detection** (experimental): Detect similar interfaces and type definitions
10 | - **AST-based Comparison**: Uses the Tree Structured Edit Distance (TSED) algorithm for accurate structural comparison
11 | - **Cross-file Analysis**: Find duplicates across multiple files in your project
12 | 
13 | ### Performance
14 | - **Bloom Filter Pre-filtering**: ~90% reduction in comparisons with AST fingerprinting
15 | - **Multi-threaded Processing**: Parallel file parsing and analysis using Rayon
16 | - **Memory Efficient**: Written in Rust for optimal memory usage
17 | - **Fast Mode**: Default mode with intelligent pre-filtering
18 | 
19 | ### Developer Experience
20 | - **Zero Configuration**: Works out of the box with sensible defaults
21 | - **VSCode-compatible Output**: Click file paths to jump directly to code
22 | - **Flexible Filtering**: 
23 |   - `--min-tokens`: Filter by AST node count (recommended: 20-30)
24 |   - `--min-lines`: Filter by line count
25 |   - `--threshold`: Configurable similarity threshold (0.0-1.0)
26 | - **Multiple Output Options**: Standard output or detailed code printing with `--print`
27 | 
28 | ## 📦 Installation
29 | 
30 | ```bash
31 | cargo install similarity-ts
32 | ```
33 | 
34 | ## 🚀 Quick Start
35 | 
36 | ```bash
37 | # Check current directory for duplicates
38 | similarity-ts
39 | 
40 | # Analyze specific directories
41 | similarity-ts src/ lib/
42 | 
43 | # Set custom threshold
44 | similarity-ts --threshold 0.9
45 | 
46 | # Filter by complexity
47 | similarity-ts --min-tokens 25
48 | 
49 | # Show code snippets
50 | similarity-ts --print
51 | ```
52 | 
53 | ## 📊 Performance Benchmarks
54 | 
55 | Tested on real-world TypeScript projects:
56 | - Small files (4 functions): ~8µs
57 | - Medium files (8 functions): ~62µs
58 | - Large files (9+ functions): ~71µs
59 | - Parallel processing of 100 files: ~3ms (4x faster than sequential)
60 | 
61 | ## 🔧 Technical Details
62 | 
63 | - Built with [oxc-parser](https://github.com/oxc-project/oxc) for fast TypeScript/JavaScript parsing
64 | - Implements the TSED algorithm from academic research
65 | - Uses SIMD-accelerated bloom filters for pre-filtering
66 | - Supports `.ts`, `.tsx`, `.js`, `.jsx`, `.mjs`, `.cjs`, `.mts`, `.cts` files
67 | 
68 | ## 🙏 Acknowledgments
69 | 
70 | This project was developed with significant assistance from Claude (Anthropic) in implementing the Rust version, optimizing performance, and creating documentation.
71 | 
72 | ## 📝 License
73 | 
74 | MIT
75 | 
76 | ---
77 | 
78 | For more information, visit the [GitHub repository](https://github.com/mizchi/similarity-ts).


--------------------------------------------------------------------------------
/crates/similarity-rs/tests/test_ast_comparison.rs:
--------------------------------------------------------------------------------
 1 | #![allow(clippy::uninlined_format_args)]
 2 | 
 3 | use similarity_core::{
 4 |     language_parser::LanguageParser,
 5 |     tsed::{calculate_tsed, TSEDOptions},
 6 | };
 7 | use similarity_rs::rust_parser::RustParser;
 8 | 
 9 | #[test]
10 | fn test_different_functions_should_have_low_similarity() {
11 |     // Straight-line arithmetic on a single binding.
12 |     let arithmetic_src = r#"
13 |     let result = x + 1;
14 |     result * 2
15 | "#;
16 |     // Accumulating loop with a conditional inside.
17 |     let loop_src = r#"
18 |     let mut sum = 0;
19 |     for val in values {
20 |         if val > 0 {
21 |             sum += val;
22 |         }
23 |     }
24 |     sum
25 | "#;
26 | 
27 |     // Parse both snippets with one parser instance and score them with default options.
28 |     let mut rust_parser = RustParser::new().unwrap();
29 |     let arithmetic_tree = rust_parser.parse(arithmetic_src, "test1.rs").unwrap();
30 |     let loop_tree = rust_parser.parse(loop_src, "test2.rs").unwrap();
31 |     let similarity = calculate_tsed(&arithmetic_tree, &loop_tree, &TSEDOptions::default());
32 |     let percent = similarity * 100.0;
33 |     println!("Similarity between addition and loop: {percent:.2}%");
34 | 
35 |     // Structurally unrelated code must score below one half.
36 |     assert!(similarity < 0.5, "Different functions should have low similarity, got {similarity}");
37 | }
38 | 
39 | #[test]
40 | fn test_similar_functions_should_have_high_similarity() {
41 |     // Reference snippet, using the names `result` and `x`.
42 |     let original_src = r#"
43 |     let result = x + 1;
44 |     result * 2
45 | "#;
46 |     // Identical shape; only the identifiers (`temp`, `y`) differ.
47 |     let renamed_src = r#"
48 |     let temp = y + 1;
49 |     temp * 2
50 | "#;
51 | 
52 |     let mut rust_parser = RustParser::new().unwrap();
53 |     let original_tree = rust_parser.parse(original_src, "test1.rs").unwrap();
54 |     let renamed_tree = rust_parser.parse(renamed_src, "test2.rs").unwrap();
55 |     let similarity = calculate_tsed(&original_tree, &renamed_tree, &TSEDOptions::default());
56 | 
57 |     let percent = similarity * 100.0;
58 |     println!("Similarity between similar functions: {percent:.2}%");
59 | 
60 |     // A pure rename must keep the score above 0.8.
61 |     assert!(similarity > 0.8, "Similar functions should have high similarity, got {similarity}");
62 | }
63 | 
64 | #[test]
65 | fn test_ast_tree_structure() {
66 |     // Prints a node and, recursively, its descendants, two spaces per tree level.
67 |     fn print_tree(node: &similarity_core::tree::TreeNode, depth: usize) {
68 |         let indent = "  ".repeat(depth);
69 |         if !node.value.is_empty() {
70 |             println!("{indent}{} = '{}'", node.label, node.value);
71 |         } else {
72 |             println!("{indent}{}", node.label);
73 |         }
74 |         for child in node.children.iter() {
75 |             print_tree(child, depth + 1);
76 |         }
77 |     }
78 | 
79 |     let source = r#"
80 |     let result = x + 1;
81 |     result * 2
82 | "#;
83 |     let mut rust_parser = RustParser::new().unwrap();
84 |     let tree = rust_parser.parse(source, "test.rs").unwrap();
85 |     println!("=== AST Structure ===");
86 |     print_tree(&tree, 0);
87 | 
88 |     // Even this two-statement snippet should expand to more than five nodes.
89 |     let node_count = tree.get_subtree_size();
90 |     assert!(node_count > 5, "Tree should have multiple nodes");
91 | }
92 | 


--------------------------------------------------------------------------------
/crates/similarity-css/README.md:
--------------------------------------------------------------------------------
  1 | # similarity-css
  2 | 
  3 | > ⚠️ **EXPERIMENTAL**: This is a prototype implementation for CSS/SCSS similarity detection. The API and functionality may change significantly. Use at your own risk.
  4 | 
  5 | A CSS/SCSS similarity detection tool that identifies duplicate styles, redundant rules, and BEM component variations.
  6 | 
  7 | ## Features
  8 | 
  9 | - **CSS and SCSS parsing** using tree-sitter
 10 | - **Nested SCSS syntax flattening** with BEM notation support (`&__element`, `&--modifier`)
 11 | - **Multiple similarity detection types**:
 12 |   - Exact duplicates
 13 |   - Style duplicates (same styles, different selectors)
 14 |   - BEM component variations
 15 |   - Selector conflicts
 16 | - **Shorthand property expansion** for accurate comparison
 17 | - **CSS specificity calculation**
 18 | - **Multiple output formats**: standard, VSCode, JSON
 19 | 
 20 | ## Installation
 21 | 
 22 | This tool is part of the similarity workspace. Build it with:
 23 | 
 24 | ```bash
 25 | cargo build --release -p similarity-css
 26 | ```
 27 | 
 28 | ## Usage
 29 | 
 30 | ```bash
 31 | # Analyze CSS files
 32 | similarity-css path/to/css/
 33 | 
 34 | # Analyze SCSS files
 35 | similarity-css --scss path/to/scss/
 36 | 
 37 | # Set custom threshold (0.0-1.0)
 38 | similarity-css --threshold 0.7 path/to/css/
 39 | 
 40 | # Different output formats
 41 | similarity-css --output json path/to/css/
 42 | similarity-css --output vscode path/to/css/
 43 | ```
 44 | 
 45 | ## Examples
 46 | 
 47 | ### Analyzing BEM components
 48 | 
 49 | ```bash
 50 | similarity-css --scss examples/scss-bem/
 51 | ```
 52 | 
 53 | This will detect:
 54 | - Duplicate button styles (`.btn` vs `.button`)
 55 | - Similar form input styles
 56 | - BEM modifier variations
 57 | 
 58 | ### Output Example
 59 | 
 60 | ```
 61 | === CSS Similarity Analysis Results ===
 62 | 
 63 | ## Similar Styles Found: 74
 64 | 
 65 | 1. .btn and .button (similarity: 60.00%)
 66 |    Files: button.scss and button.scss
 67 |    Lines: 2-14 and 138-149
 68 | 
 69 | ## BEM Component Variations Found: 37
 70 | 
 71 | 1. BEM variation: .btn--primary
 72 |    Similar to: .btn--secondary
 73 |    Similarity: 51.00%
 74 | ```
 75 | 
 76 | ## Implementation Notes
 77 | 
 78 | - Uses the TSED algorithm for AST comparison (currently weighted at 0%)
 79 | - Simple text-based SCSS flattener for handling complex nested rules
 80 | - Handles multiple selectors and media queries
 81 | - Supports single-line CSS rules
 82 | 
 83 | ## Limitations
 84 | 
 85 | - SCSS variable resolution is not implemented
 86 | - Mixin expansion is not supported
 87 | - Import statements are not followed
 88 | - Cross-file BEM component detection is limited
 89 | 
 90 | ## Future Improvements
 91 | 
 92 | - [ ] SCSS variable and mixin support
 93 | - [ ] Import resolution
 94 | - [ ] CSS-in-JS support
 95 | - [ ] Performance optimizations for large codebases
 96 | - [ ] Integration with build tools
 97 | 
 98 | ## License
 99 | 
100 | See the main repository's LICENSE file.


--------------------------------------------------------------------------------
/__deprecated/src/core/ast.ts:
--------------------------------------------------------------------------------
 1 | // AST-related pure functions with proper oxc-parser types
 2 | import { parseTypeScript } from "../parser.ts";
 3 | import { levenshtein } from "./levenshtein.ts";
 4 | import type { ASTNode, Program } from "./oxc_types.ts";
 5 | import type { ParseResult } from "oxc-parser";
 6 | 
 7 | // Position-only keys, hoisted so the set is built once instead of per visited node.
 8 | const POSITION_KEYS: ReadonlySet<string> = new Set(["range", "loc", "span", "start", "end"]);
 9 | 
10 | /**
11 |  * Extract structure from an AST node: a deep copy without position metadata.
12 |  * Non-objects are returned unchanged; arrays are mapped element by element,
13 |  * including when an array is passed as the top-level argument.
14 |  */
15 | function extractStructure(node: ASTNode | any): any {
16 |   if (!node || typeof node !== "object") {
17 |     return node;
18 |   }
19 |   // A bare array must stay an array rather than become an index-keyed object.
20 |   if (Array.isArray(node)) {
21 |     return node.map((item) => extractStructure(item));
22 |   }
23 | 
24 |   const result: any = {};
25 |   // Copy a truthy `type` first so it always leads the key order of the output.
26 |   if (node.type) result.type = node.type;
27 | 
28 |   for (const [key, value] of Object.entries(node)) {
29 |     if (POSITION_KEYS.has(key)) continue;
30 |     // A primitive `type` was handled above; a falsy one is omitted from the output.
31 |     if (key === "type" && (value === null || typeof value !== "object")) continue;
32 |     result[key] = extractStructure(value);
33 |   }
34 |   return result;
35 | }
36 | 
37 | /**
38 |  * Convert an AST to pretty-printed JSON with position data stripped.
39 |  */
40 | export function astToString(ast: ParseResult | Program | ASTNode): string {
41 |   // Direct AST nodes are serialised as-is; a ParseResult wrapper (it has
42 |   // `program` but no `type`) is unwrapped to its program first.
43 |   let root: unknown = ast;
44 |   if ("program" in ast && !("type" in ast)) {
45 |     root = ast.program;
46 |   }
47 |   return JSON.stringify(extractStructure(root), null, 2);
48 | }
48 | 
49 | /**
50 |  * Calculate similarity between two code strings from their serialised ASTs.
51 |  * The score is 1 - (edit distance / length of the longer serialisation).
52 |  */
53 | export function calculateSimilarity(code1: string, code2: string): number {
54 |   try {
55 |     const firstTree = parseTypeScript("file1.ts", code1);
56 |     const secondTree = parseTypeScript("file2.ts", code2);
57 |     const left = astToString(firstTree);
58 |     const right = astToString(secondTree);
59 |     const distance = levenshtein(left, right);
60 |     const longest = Math.max(left.length, right.length);
61 |     // Two empty serialisations are treated as identical.
62 |     if (longest === 0) return 1.0;
63 |     return 1 - distance / longest;
64 |   } catch {
65 |     // If parsing fails, fall back to simple string comparison
66 |     return code1 === code2 ? 1.0 : 0.0;
67 |   }
68 | }
69 | 
70 | /**
71 |  * Compare two parse results via their serialised structures.
72 |  * Reports the raw edit distance, the length of the longer serialisation,
73 |  * the derived similarity ratio, and both serialisations themselves.
74 |  */
75 | export function compareStructures(
76 |   ast1: ParseResult,
77 |   ast2: ParseResult,
78 | ): {
79 |   similarity: number;
80 |   distance: number;
81 |   maxLength: number;
82 |   structure1: string;
83 |   structure2: string;
84 | } {
85 |   const structure1 = astToString(ast1);
86 |   const structure2 = astToString(ast2);
87 |   const distance = levenshtein(structure1, structure2);
88 |   const maxLength = Math.max(structure1.length, structure2.length);
89 | 
90 |   // Two empty serialisations count as identical; this also avoids 0 / 0.
91 |   let similarity = 1.0;
92 |   if (maxLength !== 0) {
93 |     similarity = 1 - distance / maxLength;
94 |   }
95 | 
96 |   return { similarity, distance, maxLength, structure1, structure2 };
97 | }
98 | 


--------------------------------------------------------------------------------