├── .gitignore ├── CHANGELOG.md ├── CONTRIBUTING.md ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── README.md ├── docs ├── data-analysis.md ├── getting-started.md ├── query-language.md ├── string-operations.md └── text-processing.md ├── examples ├── README.md ├── scripts │ ├── README.md │ ├── cleanup.sh │ ├── download_datasets.sh │ └── generate_large.sh └── small │ ├── application.log │ ├── customers.json │ ├── ec2_instances.json │ ├── employees.json │ ├── error_messages.txt │ ├── nginx.conf │ ├── nginx_access.log │ ├── orders.csv │ ├── products.yaml │ ├── urls.txt │ └── user_behavior.json └── src ├── arg.rs ├── error.rs ├── executor.rs ├── filter.rs ├── lib.rs ├── main.rs ├── output.rs ├── parser.rs ├── setup.rs ├── stats_opts.rs ├── string_ops.rs └── utils.rs /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | dist 3 | *.rs.bk 4 | 5 | # Generated large datasets (user can create these with scripts) 6 | examples/large/ 7 | examples/external/ 8 | examples/generated/ 9 | 10 | # Temporary files 11 | *.tmp 12 | *.temp 13 | *.swp 14 | *.bak 15 | 16 | # OS generated files 17 | .DS_Store 18 | .DS_Store? 19 | ._* 20 | .Spotlight-V100 21 | .Trashes 22 | ehthumbs.db 23 | Thumbs.db 24 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 7 | 8 | ## [0.2.2] - 2025-07-18 9 | 10 | ### 🎉 New Features 11 | 12 | #### Logical Operations 13 | 14 | - **NOT operator support**: Added `not` operator for negating conditions in `select()` statements 15 | 16 | - Syntax: `select(not (.condition))` - parentheses are required for clarity 17 | - Works with all comparison operators and string operations 18 | - Examples: `select(not (.age > 65))`, `select(not (.email | contains("spam")))` 19 | 20 | - **OR operator support**: Added pipe-delimited pattern matching for OR conditions 21 | - Syntax: `select(.field | contains("pattern1|pattern2|pattern3"))` 22 | - Multi-pattern matching in a single operation 23 | - Examples: `select(.level | contains("ERROR|FATAL"))`, `select(.role | contains("admin|manager|supervisor"))` 24 | 25 | #### Array Slicing Operations 26 | 27 | - **Python-style slicing**: Complete slice notation support for arrays and string split results 28 | 29 | - Basic slicing: `.[start:end]`, `.[start:]`, `.[:end]`, `.[:]` 30 | - Negative index support: `.[-5:]`, `.[:-3]`, `.[-10:-5]` 31 | - Field-specific slicing: `.users[0:10]`, `.logs[-100:]` 32 | 33 | - **String split slicing**: Advanced slicing support for string split operations 34 | 35 | - Direct slicing after split: `split(",")[1:3]`, `split("/")[-1]` 36 | - Complex path processing: `split("/")[1:-1]` for middle path components 37 | - CSV column extraction: `split(",")[2:5]` for specific column ranges 38 | 39 | - **Negative indexing**: Support for negative array indices 40 | - Last element access: `.array[-1]`, `.users[-1]` 41 | - Reverse indexing: `.array[-3]` for third from last 42 | - Compatible with all array operations 43 | 44 | #### Enhanced Filtering Capabilities 45 | 46 | - **Complex logical combinations**: Combine NOT and OR operations for sophisticated filtering 47 | 48 | - Example: `select(not 
(.status | contains("deleted|suspended|inactive")))` 49 | - Multi-condition filtering: `select(not (.type | contains("debug|trace|verbose")))` 50 | 51 | - **Pattern-based exclusion**: Exclude multiple patterns efficiently 52 | - File filtering: `select(not (.filename | contains(".tmp|.bak|.swp")))` 53 | - Content filtering: `select(not (. | contains("DEBUG|INFO|TRACE")))` 54 | 55 | ### 🔧 Improvements 56 | 57 | #### Enhanced String Operations 58 | 59 | - **Improved split operations**: Better integration with slicing for complex text processing 60 | - **Optimized pattern matching**: More efficient OR pattern processing with pipe-delimited syntax 61 | - **Better error handling**: Clearer error messages for logical operation syntax errors 62 | 63 | #### Performance Optimizations 64 | 65 | - **Slice operation efficiency**: Optimized memory usage for large array slicing operations 66 | - **Pattern matching performance**: Improved performance for multi-pattern OR operations 67 | - **Logical operation caching**: Better performance for complex NOT/OR combinations 68 | 69 | #### Documentation and Examples 70 | 71 | - **Comprehensive logical operation examples**: Real-world use cases for NOT and OR operations 72 | - **Slicing operation guide**: Complete reference for all slicing capabilities 73 | - **Advanced filtering patterns**: Examples combining multiple logical operations 74 | 75 | ### 📊 New Use Cases Enabled 76 | 77 | #### Advanced Log Analysis 78 | 79 | ```bash 80 | # Exclude multiple log levels efficiently 81 | hawk '. | select(not (.level | contains("DEBUG|INFO|TRACE")))' app.log 82 | 83 | # Get recent critical errors only 84 | hawk '.logs[-1000:] | select(.level | contains("ERROR|FATAL|CRITICAL"))' system.log 85 | 86 | # Extract specific time ranges with slicing 87 | hawk '.logs[] | .timestamp | split("T")[0] | split("-")[1:3]' logs.json 88 | ``` 89 | 90 | #### Sophisticated Data Filtering 91 | 92 | ```bash 93 | # Filter active users excluding test accounts 94 | hawk '.users[] | select(not (.email | contains("test|demo|temp")))' users.json 95 | 96 | # Find high-priority items excluding archived 97 | hawk '.items[] | select(.priority | contains("high|urgent")) | select(not (.status | contains("archived|deleted")))' items.json 98 | 99 | # Process middle sections of arrays 100 | hawk '.data[100:200] | select(not (.type | contains("noise|test")))' dataset.json 101 | ``` 102 | 103 | #### Complex Text Processing 104 | 105 | ```bash 106 | # Extract domain names efficiently 107 | hawk '.urls[] | split("://")[1] | split("/")[0]' urls.txt 108 | 109 | # Get file paths without extension 110 | hawk '.files[] | .path | split(".")[:-1] | join(".")' filelist.json 111 | 112 | # Process CSV columns with exclusions 113 | hawk -t '. 
| split(",")[2:8] | select(not (.[0] | contains("null|empty|N/A")))' data.csv 114 | ``` 115 | 116 | #### Advanced Data Analysis Workflows 117 | 118 | ```bash 119 | # Multi-step filtering with slicing 120 | hawk '.events[0:5000] | select(not (.type | contains("debug|trace"))) | group_by(.service) | count' events.json 121 | 122 | # Recent data analysis with pattern exclusion 123 | hawk '.metrics[-500:] | select(not (.source | contains("test|staging"))) | avg(.value)' metrics.json 124 | 125 | # Complex field extraction with logical operations 126 | hawk '.users[] | select(.role | contains("admin|manager")) | select(not (.status | contains("inactive|suspended"))) | count' users.json 127 | ``` 128 | 129 | ### 🛠️ Technical Improvements 130 | 131 | #### Filter Module Enhancements 132 | 133 | - **New function `parse_not_condition_with_parentheses`**: Robust NOT operator parsing with mandatory parentheses 134 | - **Enhanced `apply_filter_with_string_operations`**: Support for NOT operator in string operation pipelines 135 | - **Improved error handling**: Better error messages for missing parentheses and invalid syntax 136 | 137 | #### Slicing Infrastructure 138 | 139 | - **Universal slicing support**: `apply_universal_slice_operation` handles all data types 140 | - **Negative index processing**: `parse_index_with_negative` for Python-style negative indexing 141 | - **Data structure detection**: `detect_data_structure` for intelligent slicing behavior 142 | 143 | #### Pattern Matching Optimization 144 | 145 | - **OR pattern preprocessing**: Efficient pipe-delimited pattern parsing 146 | - **Multi-pattern contains operations**: Optimized string matching for multiple patterns 147 | - **Regex-free implementation**: Fast pattern matching without regex overhead 148 | 149 | ### 🔄 Breaking Changes 150 | 151 | None. This release is fully backward compatible with v0.2.x. 152 | 153 | ### 🐛 Bug Fixes 154 | 155 | - Fixed slice boundary checking for out-of-range indices 156 | - Improved pattern matching edge cases with empty patterns 157 | - Enhanced error reporting for malformed logical operations 158 | - Fixed memory usage issues with large slice operations 159 | 160 | ### 📖 Documentation Updates 161 | 162 | - **Complete logical operations reference**: Documentation for NOT and OR operators 163 | - **Comprehensive slicing guide**: All slicing capabilities with examples 164 | - **Advanced filtering patterns**: Real-world use case examples 165 | - **Performance best practices**: Guidelines for efficient query construction 166 | 167 | ### 🚀 Migration Guide 168 | 169 | #### For users upgrading from v0.2.1: 170 | 171 | All existing queries continue to work without changes. 
New features are additive:
172 | 
173 | **New NOT operator usage:**
174 | 
175 | ```bash
176 | # Old approach (still works)
177 | hawk '.users[] | select(.age <= 65)' users.json
178 | 
179 | # New approach with NOT operator
180 | hawk '.users[] | select(not (.age > 65))' users.json
181 | ```
182 | 
183 | **New OR operator usage:**
184 | 
185 | ```bash
186 | # Old approach with multiple queries
187 | hawk '.logs[] | select(.level == "ERROR")' logs.json
188 | hawk '.logs[] | select(.level == "FATAL")' logs.json
189 | 
190 | # New approach with OR operator
191 | hawk '.logs[] | select(.level | contains("ERROR|FATAL"))' logs.json
192 | ```
193 | 
194 | **New slicing capabilities:**
195 | 
196 | ```bash
197 | # Get last 10 users (new)
198 | hawk '.users[-10:]' users.json
199 | 
200 | # Get middle section of data (new)
201 | hawk '.data[100:200]' data.json
202 | 
203 | # Extract filename from path (new)
204 | hawk '.files[] | .path | split("/")[-1]' files.json
205 | ```
206 | 
207 | ## [0.2.1] - 2025-07-16
208 | 
209 | ### 🐛 Bug Fixes
210 | 
211 | - Fixed single object field access (e.g., `.Parameters` in CloudFormation templates)
212 | - Corrected info display for single objects ("Single Object" vs "Object Array")
213 | - Enhanced support for YAML/JSON single object files
214 | 
215 | ### 🔧 Improvements
216 | 
217 | - Better error messages for field access
218 | - Improved CloudFormation, Docker Compose, Kubernetes manifest support
219 | 
220 | ## [0.2.0] - 2025-07-16
221 | 
222 | ### 🎉 Major Features Added
223 | 
224 | #### Plain Text Support
225 | 
226 | - **Universal file format support**: Now processes plain text files, log files, configuration files, and any text-based data
227 | - **Automatic format detection**: Intelligently detects JSON, YAML, CSV, and plain text files
228 | - **Unified query syntax**: Same query language works across all supported formats
229 | - **Text-as-array processing**: Each line becomes a string element in an array for consistent processing
230 | 
231 | #### String Operations
232 | 
233 | - **Complete string manipulation suite**: `upper`, `lower`, `trim`, `trim_start`, `trim_end`
234 | - **String analysis functions**: `length`, `reverse`
235 | - **Pattern matching**: `contains(pattern)`, `starts_with(pattern)`, `ends_with(pattern)`
236 | - **Text transformation**: `replace(old, new)`, `substring(start, length)`
237 | - **String parsing**: `split(delimiter)` to convert strings to arrays
238 | - **Array joining**: `join(delimiter)` to convert arrays back to strings
239 | 
240 | #### Enhanced map() Function
241 | 
242 | - **Data transformation pipeline**: Transform data elements with chained string operations
243 | - **Type-safe operations**: Proper error handling for incompatible data types
244 | - **Complex pipelines**: Support for multi-step transformations like `map(. 
| trim | upper | replace("old", "new"))` 245 | 246 | #### Statistical Functions 247 | 248 | - **Descriptive statistics**: `median`, `stddev` (standard deviation) 249 | - **Data manipulation**: `unique` (remove duplicates), `sort` (sort values) 250 | - **Array operations**: `length` for counting elements 251 | - **Field-specific operations**: All statistical functions support field specification (e.g., `median(.price)`) 252 | 253 | #### Colored Output 254 | 255 | - **Automatic TTY detection**: Colors in terminal, plain text when piped or redirected 256 | - **Beautiful syntax highlighting**: 257 | - Table headers in blue with bold formatting 258 | - Numbers in green 259 | - Boolean values in yellow 260 | - Null values in gray 261 | - JSON syntax highlighting with colored keys and values 262 | - **Environment variable support**: Respects `NO_COLOR` environment variable 263 | - **Multiple output formats**: Enhanced table, JSON, and list outputs with appropriate coloring 264 | 265 | ### 🔧 Improvements 266 | 267 | #### Enhanced Error Handling 268 | 269 | - **Detailed error messages**: Context-aware error reporting with specific field and operation information 270 | - **Type-safe operations**: Better validation of operations on different data types 271 | - **Pipeline debugging**: Improved error location reporting in complex query pipelines 272 | 273 | #### Better File Format Detection 274 | 275 | - **Robust detection algorithms**: Improved heuristics for distinguishing between formats 276 | - **Edge case handling**: Better support for malformed or ambiguous files 277 | - **Fallback mechanisms**: Graceful degradation to text processing when format detection fails 278 | 279 | #### Performance Optimizations 280 | 281 | - **Memory-efficient processing**: Optimized data structures for large datasets 282 | - **Faster pipeline execution**: Improved query parsing and execution engine 283 | - **Reduced startup time**: Optimized initialization and dependency loading 284 | 285 | #### Pipeline Processing Improvements 286 | 287 | - **Parentheses-aware parsing**: Proper handling of nested operations like `map(. | contains("text") | not)` 288 | - **Complex query support**: Better support for multi-level operations and transformations 289 | - **Operation chaining**: Improved reliability of long pipeline chains 290 | 291 | ### 📊 New Use Cases Enabled 292 | 293 | #### Log File Analysis 294 | 295 | ```bash 296 | # Extract error logs with timestamps 297 | hawk '. | select(. | contains("ERROR")) | map(. | substring(0, 19))' app.log 298 | 299 | # Count log levels 300 | hawk '. | map(. | split(" ")[2]) | group_by(.) | count' application.log 301 | 302 | # Find unique IP addresses 303 | hawk '. | map(. | split(" ")[0]) | unique | sort' access.log 304 | ``` 305 | 306 | #### Text Data Processing 307 | 308 | ```bash 309 | # Clean and normalize text 310 | hawk '. | map(. | trim | lower)' names.txt 311 | 312 | # Extract file extensions 313 | hawk '. | map(. | split(".") | last)' filelist.txt 314 | 315 | # Statistical text analysis 316 | hawk '. | map(. 
| split(" ") | length) | median' documents.txt 317 | ``` 318 | 319 | #### Data Cleaning and Normalization 320 | 321 | ```bash 322 | # Email normalization 323 | hawk '.users[] | map(.email | lower | trim)' users.csv 324 | 325 | # Complex string transformations 326 | hawk '.products[] | map(.name | replace("_", " ") | upper)' inventory.json 327 | 328 | # Data validation and cleaning 329 | hawk '.records[] | select(.id | length == 8) | map(.status | upper)' data.csv 330 | ``` 331 | 332 | #### Advanced Analytics 333 | 334 | ```bash 335 | # Statistical analysis 336 | hawk '.measurements[] | group_by(.sensor) | stddev(.temperature)' sensor_data.json 337 | 338 | # Median calculations 339 | hawk '.sales[] | group_by(.region) | median(.amount)' sales_data.csv 340 | 341 | # Unique value analysis 342 | hawk '.users[] | unique(.department) | sort' employee_data.json 343 | ``` 344 | 345 | ### 🛠️ Technical Improvements 346 | 347 | #### New Dependencies 348 | 349 | - `termcolor ^1.4`: For colored output support 350 | - `is-terminal ^0.4`: For TTY detection 351 | 352 | #### Code Architecture 353 | 354 | - **New module `string_ops`**: Centralized string operation handling 355 | - **New module `stats_ops`**: Statistical function implementations 356 | - **Enhanced `filter.rs`**: Improved pipeline operation handling 357 | - **Updated `output.rs`**: Comprehensive colored output support 358 | - **Improved `setup.rs`**: Better file format detection and text processing 359 | 360 | #### Testing 361 | 362 | - **Comprehensive test suite**: Added tests for all new string operations 363 | - **Statistical function testing**: Validation of median, stddev, and other statistical operations 364 | - **Integration testing**: End-to-end testing of complex pipeline operations 365 | - **Edge case coverage**: Testing of malformed inputs and error conditions 366 | 367 | ### 🔄 Breaking Changes 368 | 369 | None. This release is fully backward compatible with v0.1.x. 370 | 371 | ### 📦 Migration Guide 372 | 373 | No migration required. All existing queries and workflows continue to work unchanged. 
374 | 
375 | ### 🐛 Bug Fixes
376 | 
377 | - Fixed pipeline parsing issues with complex nested operations
378 | - Improved CSV type inference accuracy
379 | - Enhanced error reporting for malformed queries
380 | - Fixed memory usage issues with large datasets
381 | 
382 | ### 📖 Documentation Updates
383 | 
384 | - **Comprehensive README update**: Added extensive documentation for new features
385 | - **String operations guide**: Complete reference for all string manipulation functions
386 | - **Statistical functions documentation**: Usage examples and parameter descriptions
387 | - **Text processing examples**: Real-world use cases for log analysis and text processing
388 | - **Enhanced query syntax reference**: Updated with all new operations and examples
389 | 
390 | ### 🙏 Acknowledgments
391 | 
392 | - Community feedback on string processing needs
393 | - Performance suggestions from early adopters
394 | - Documentation improvements from user contributions
395 | 
396 | ## [0.1.0] - 2025-07-12
397 | 
398 | ### 🎉 Initial Release
399 | 
400 | #### Core Features
401 | 
402 | - **Multi-format support**: JSON, YAML, CSV parsing and processing
403 | - **Pandas-like query language**: Intuitive syntax for data analysis
404 | - **Field access and navigation**: Deep nested field access with array expansion
405 | - **Filtering operations**: `select()` with comparison operators
406 | - **Aggregation functions**: `count`, `sum`, `avg`, `min`, `max`
407 | - **Grouping operations**: `group_by()` with aggregation support
408 | - **Multiple output formats**: Table, JSON, list with automatic format detection
409 | 
410 | #### Technical Foundation
411 | 
412 | - **Rust implementation**: Fast, memory-safe data processing
413 | - **serde_json integration**: Robust JSON parsing and manipulation
414 | - **Type-aware processing**: Intelligent handling of numbers, strings, booleans
415 | - **Error handling**: Comprehensive error reporting with thiserror
416 | - **CLI interface**: User-friendly command-line interface with clap
417 | 
418 | #### Supported Operations
419 | 
420 | - Field access: `.field`, `.array[0]`, `.array[]`, `.nested.field`
421 | - Filtering: `select(.field > value)`, `select(.field == "value")`
422 | - Aggregation: `sum(.field)`, `avg(.field)`, `min(.field)`, `max(.field)`, `count`
423 | - Grouping: `group_by(.field)` with aggregation support
424 | - Info: `. | info` for data structure exploration
425 | 
426 | #### Output Formats
427 | 
428 | - **Table format**: Structured table output for object arrays
429 | - **JSON format**: Pretty-printed JSON output
430 | - **List format**: Simple list output for array data
431 | - **Auto format**: Intelligent format selection based on data structure
432 | 
433 | ---
434 | 
435 | For more details about any release, please see the [GitHub releases page](https://github.com/kyotalab/hawk/releases).
436 | 
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to hawk 🦅
2 | 
3 | Thank you for your interest in contributing to hawk! We welcome contributions from everyone, whether you're fixing a bug, adding a feature, improving documentation, or suggesting enhancements.
4 | 
5 | ## 🤝 Ways to Contribute
6 | 
7 | - 🐛 **Bug Reports**: Found an issue? Let us know!
8 | - ✨ **Feature Requests**: Have an idea for improvement? 
9 | - 🔧 **Code Contributions**: Bug fixes, new features, optimizations
10 | - 📚 **Documentation**: Improve README, add examples, write tutorials
11 | - 🧪 **Testing**: Add test cases, improve test coverage
12 | - 💡 **Examples**: Real-world use cases and sample datasets
13 | 
14 | ## 🚀 Getting Started
15 | 
16 | ### Development Setup
17 | 
18 | 1. **Fork the repository**
19 | ```bash
20 | git clone https://github.com/kyotalab/hawk.git
21 | cd hawk
22 | ```
23 | 
24 | 2. **Install Rust** (if not already installed)
25 | ```bash
26 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
27 | source ~/.cargo/env
28 | ```
29 | 
30 | 3. **Build the project**
31 | ```bash
32 | cargo build
33 | ```
34 | 
35 | 4. **Run tests**
36 | ```bash
37 | cargo test
38 | ```
39 | 
40 | 5. **Try it out**
41 | ```bash
42 | cargo run -- '. | info' examples/small/employees.json
43 | ```
44 | 
45 | ### Development Workflow
46 | 
47 | 1. **Create a branch** for your changes
48 | ```bash
49 | git checkout -b feature/amazing-feature
50 | ```
51 | 
52 | 2. **Make your changes** with clear, focused commits
53 | 
54 | 3. **Test thoroughly**
55 | ```bash
56 | cargo test
57 | cargo clippy # Lint check
58 | cargo fmt # Format code
59 | ```
60 | 
61 | 4. **Submit a Pull Request** with a clear description
62 | 
63 | ## 🐛 Reporting Bugs
64 | 
65 | When reporting bugs, please include:
66 | 
67 | ### Bug Report Template
68 | ```
69 | **Describe the bug**
70 | A clear description of what the bug is.
71 | 
72 | **To Reproduce**
73 | Steps to reproduce the behavior:
74 | 1. Run command '...'
75 | 2. With data file '...'
76 | 3. See error
77 | 
78 | **Expected behavior**
79 | What you expected to happen.
80 | 
81 | **Actual behavior**
82 | What actually happened.
83 | 
84 | **Environment**
85 | - OS: [e.g., Ubuntu 22.04, macOS 13.0, Windows 11]
86 | - Rust version: [e.g., 1.70.0]
87 | - hawk version: [e.g., 0.1.0]
88 | 
89 | **Sample data (if applicable)**
90 | Minimal example that reproduces the issue.
91 | 
92 | **Additional context**
93 | Any other relevant information.
94 | ```
95 | 
96 | ## ✨ Feature Requests
97 | 
98 | We love new ideas! When suggesting features:
99 | 
100 | ### Feature Request Template
101 | ```
102 | **Feature Summary**
103 | Brief description of the feature.
104 | 
105 | **Motivation**
106 | Why would this feature be useful? What problem does it solve?
107 | 
108 | **Proposed Solution**
109 | How should this feature work?
110 | 
111 | **Example Usage**
112 | Show how users would interact with this feature:
113 | hawk '.data | new_feature(.field)' data.json
114 | 
115 | **Alternatives Considered**
116 | Are there other ways to solve this problem?
117 | 
118 | **Additional Context**
119 | Any other relevant information, mockups, or examples. 
120 | ```
121 | 
122 | ## 💻 Code Contributions
123 | 
124 | ### Coding Standards
125 | 
126 | - **Follow Rust conventions**: Use `cargo fmt` and `cargo clippy`
127 | - **Write tests**: All new features should include tests
128 | - **Document public APIs**: Add doc comments for public functions
129 | - **Keep it simple**: Prefer readable code over clever code
130 | - **Follow existing patterns**: Match the existing codebase style
131 | 
132 | ### Code Organization
133 | 
134 | ```
135 | src/
136 | ├── main.rs # Entry point
137 | ├── lib.rs # Library root
138 | ├── arg.rs # Command line interface
139 | ├── error.rs # Error types
140 | ├── setup.rs # File reading & format detection
141 | ├── parser.rs # Query parsing
142 | ├── executor.rs # Query execution
143 | ├── filter.rs # Filtering & aggregation
144 | ├── output.rs # Output formatting
145 | └── utils.rs # Utility functions
146 | ```
147 | 
148 | ### Adding New Features
149 | 
150 | 1. **Start with tests**: Write tests for your feature first
151 | 2. **Implement incrementally**: Break large features into smaller chunks
152 | 3. **Update documentation**: Add examples and update README if needed
153 | 4. **Consider backwards compatibility**: Don't break existing queries
154 | 
155 | ### Example: Adding a New Aggregation Function
156 | 
157 | ```rust
158 | // 1. Add to apply_pipeline_operation in filter.rs
159 | } else if operation.starts_with("median(") && operation.ends_with(")") {
160 |     let field = &operation[7..operation.len()-1];
161 |     let field_name = field.trim_start_matches('.');
162 | 
163 |     if is_grouped_data(&data) {
164 |         apply_aggregation_to_groups(data, "median", field_name)
165 |     } else {
166 |         calculate_median_simple(data, field_name)
167 |     }
168 | }
169 | // 2. Implement the calculation function
170 | fn calculate_median_simple(data: Vec<Value>, field_name: &str) -> Result<Vec<Value>, Error> {
171 |     // Implementation here
172 | }
173 | 
174 | // 3. Add group support in apply_aggregation_to_groups
175 | "median" => calculate_median(items, field_name)?,
176 | 
177 | // 4. Write tests
178 | #[test]
179 | fn test_median_calculation() {
180 |     // Test cases here
181 | }
182 | ```
183 | 
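Once wired in, the new aggregation works from the CLI like any built-in (the `median` calls below mirror examples already used elsewhere in these docs):

```bash
# Simple median over a numeric field
hawk '.[] | median(.price)' products.json

# Median per group
hawk '.sales[] | group_by(.region) | median(.amount)' sales_data.csv
```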
184 | ## 🧪 Testing Guidelines
185 | 
186 | ### Running Tests
187 | ```bash
188 | cargo test # All tests
189 | cargo test test_name # Specific test
190 | cargo test --test integration # Integration tests only
191 | ```
192 | 
193 | ### Test Categories
194 | 
195 | 1. **Unit Tests**: Test individual functions
196 | ```rust
197 | #[cfg(test)]
198 | mod tests {
199 |     use super::*;
200 | 
201 |     #[test]
202 |     fn test_parse_simple_query() {
203 |         // Test implementation
204 |     }
205 | }
206 | ```
207 | 
208 | 2. **Integration Tests**: Test complete workflows
209 | ```rust
210 | // tests/integration_test.rs
211 | #[test]
212 | fn test_csv_groupby_workflow() {
213 |     // End-to-end test
214 | }
215 | ```
216 | 
217 | 3. **Example Tests**: Verify README examples work
218 | ```rust
219 | #[test]
220 | fn test_readme_examples() {
221 |     // Test examples from documentation
222 | }
223 | ```
224 | 
225 | ### Adding Test Data
226 | 
227 | Place test files in `tests/data/`:
228 | ```
229 | tests/
230 | ├── data/
231 | │   ├── users.json
232 | │   ├── sales.csv
233 | │   └── config.yaml
234 | └── integration_test.rs
235 | ```
236 | 
237 | ## 📚 Documentation Guidelines
238 | 
239 | ### Code Documentation
240 | ```rust
241 | /// Calculates the median value for a numeric field
242 | ///
243 | /// # Arguments
244 | /// * `data` - Vector of JSON values to process
245 | /// * `field_name` - Name of the field to calculate median for
246 | ///
247 | /// # Examples
248 | /// ```
249 | /// let result = calculate_median(data, "price")?;
250 | /// ```
251 | pub fn calculate_median(data: Vec<Value>, field_name: &str) -> Result<f64, Error> {
252 |     // Implementation
253 | }
254 | ```
255 | 
256 | ### README Updates
257 | - Add new features to the feature list
258 | - Include usage examples
259 | - Update the comparison table if needed
260 | - Add real-world use cases
261 | 
262 | ## 🎯 Priority Areas
263 | 
264 | We're especially interested in contributions in these areas:
265 | 
266 | ### High Priority
267 | - 🐛 **Bug fixes**: Any correctness issues
268 | - 🚀 **Performance improvements**: Memory usage, speed optimizations
269 | - 📊 **New aggregation functions**: `percentile` and beyond (`median` and `stddev` shipped in v0.2.0)
270 | - 🔧 **CSV improvements**: Better type detection, delimiter handling
271 | 
272 | ### Medium Priority
273 | - 🌐 **Output formats**: XML, TSV support
274 | - 🔍 **Query enhancements**: Regular expressions, additional string functions
275 | - 📈 **Visualization**: ASCII charts, histograms
276 | - 🔄 **Streaming**: Large file support
277 | 
278 | ### Lower Priority
279 | - 🎨 **UI improvements**: Colors, better formatting
280 | - 📦 **Packaging**: Homebrew, APT packages
281 | - 🔌 **Plugins**: Extensibility system
282 | 
283 | ## 📋 Pull Request Guidelines
284 | 
285 | ### Before Submitting
286 | - [ ] All tests pass (`cargo test`)
287 | - [ ] Code is formatted (`cargo fmt`)
288 | - [ ] No clippy warnings (`cargo clippy`)
289 | - [ ] Documentation updated if needed
290 | - [ ] Examples work as expected
291 | 
292 | ### PR Description Template
293 | ```
294 | ## Summary
295 | Brief description of changes
296 | 
297 | ## Motivation
298 | Why is this change needed?
299 | 
300 | ## Changes
301 | - [ ] Feature A added
302 | - [ ] Bug B fixed
303 | - [ ] Tests updated
304 | 
305 | ## Testing
306 | How was this tested?
307 | 
308 | ## Breaking Changes
309 | Any backwards incompatible changes?
310 | 
311 | ## Related Issues
312 | Fixes #123
313 | ```
314 | 
315 | ## 🌟 Recognition
316 | 
317 | Contributors will be recognized in:
318 | - README acknowledgments
319 | - Release notes
320 | - GitHub contributors page
321 | 
322 | ## 📞 Getting Help
323 | 
324 | - 💬 **Discussions**: Use GitHub Discussions for questions
325 | - 🐛 **Issues**: Use GitHub Issues for bugs and feature requests
326 | - 📧 **Email**: Contact maintainers for sensitive issues
327 | 
328 | ## 📜 Code of Conduct
329 | 
330 | We follow the [Rust Code of Conduct](https://www.rust-lang.org/policies/code-of-conduct). Please be respectful and inclusive in all interactions.
331 | 
332 | ## 🙏 Thank You!
333 | 
334 | Every contribution helps make hawk better for everyone. Whether it's a typo fix or a major feature, we appreciate your effort!
335 | 
336 | ---
337 | 
338 | Happy contributing! 
🦅 -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 4 4 | 5 | [[package]] 6 | name = "aho-corasick" 7 | version = "1.1.3" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" 10 | dependencies = [ 11 | "memchr", 12 | ] 13 | 14 | [[package]] 15 | name = "anstream" 16 | version = "0.6.19" 17 | source = "registry+https://github.com/rust-lang/crates.io-index" 18 | checksum = "301af1932e46185686725e0fad2f8f2aa7da69dd70bf6ecc44d6b703844a3933" 19 | dependencies = [ 20 | "anstyle", 21 | "anstyle-parse", 22 | "anstyle-query", 23 | "anstyle-wincon", 24 | "colorchoice", 25 | "is_terminal_polyfill", 26 | "utf8parse", 27 | ] 28 | 29 | [[package]] 30 | name = "anstyle" 31 | version = "1.0.11" 32 | source = "registry+https://github.com/rust-lang/crates.io-index" 33 | checksum = "862ed96ca487e809f1c8e5a8447f6ee2cf102f846893800b20cebdf541fc6bbd" 34 | 35 | [[package]] 36 | name = "anstyle-parse" 37 | version = "0.2.7" 38 | source = "registry+https://github.com/rust-lang/crates.io-index" 39 | checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" 40 | dependencies = [ 41 | "utf8parse", 42 | ] 43 | 44 | [[package]] 45 | name = "anstyle-query" 46 | version = "1.1.3" 47 | source = "registry+https://github.com/rust-lang/crates.io-index" 48 | checksum = "6c8bdeb6047d8983be085bab0ba1472e6dc604e7041dbf6fcd5e71523014fae9" 49 | dependencies = [ 50 | "windows-sys", 51 | ] 52 | 53 | [[package]] 54 | name = "anstyle-wincon" 55 | version = "3.0.9" 56 | source = "registry+https://github.com/rust-lang/crates.io-index" 57 | checksum = "403f75924867bb1033c59fbf0797484329750cfbe3c4325cd33127941fabc882" 58 | dependencies = [ 59 | "anstyle", 60 | "once_cell_polyfill", 61 | "windows-sys", 62 | ] 63 | 64 | [[package]] 65 | name = "anyhow" 66 | version = "1.0.98" 67 | source = "registry+https://github.com/rust-lang/crates.io-index" 68 | checksum = "e16d2d3311acee920a9eb8d33b8cbc1787ce4a264e85f964c2404b969bdcd487" 69 | 70 | [[package]] 71 | name = "clap" 72 | version = "4.5.40" 73 | source = "registry+https://github.com/rust-lang/crates.io-index" 74 | checksum = "40b6887a1d8685cebccf115538db5c0efe625ccac9696ad45c409d96566e910f" 75 | dependencies = [ 76 | "clap_builder", 77 | "clap_derive", 78 | ] 79 | 80 | [[package]] 81 | name = "clap_builder" 82 | version = "4.5.40" 83 | source = "registry+https://github.com/rust-lang/crates.io-index" 84 | checksum = "e0c66c08ce9f0c698cbce5c0279d0bb6ac936d8674174fe48f736533b964f59e" 85 | dependencies = [ 86 | "anstream", 87 | "anstyle", 88 | "clap_lex", 89 | "strsim", 90 | ] 91 | 92 | [[package]] 93 | name = "clap_derive" 94 | version = "4.5.40" 95 | source = "registry+https://github.com/rust-lang/crates.io-index" 96 | checksum = "d2c7947ae4cc3d851207c1adb5b5e260ff0cca11446b1d6d1423788e442257ce" 97 | dependencies = [ 98 | "heck", 99 | "proc-macro2", 100 | "quote", 101 | "syn", 102 | ] 103 | 104 | [[package]] 105 | name = "clap_lex" 106 | version = "0.7.5" 107 | source = "registry+https://github.com/rust-lang/crates.io-index" 108 | checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" 109 | 110 | [[package]] 111 | name = "colorchoice" 112 | version = "1.0.4" 113 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 114 | checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" 115 | 116 | [[package]] 117 | name = "csv" 118 | version = "1.3.1" 119 | source = "registry+https://github.com/rust-lang/crates.io-index" 120 | checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf" 121 | dependencies = [ 122 | "csv-core", 123 | "itoa", 124 | "ryu", 125 | "serde", 126 | ] 127 | 128 | [[package]] 129 | name = "csv-core" 130 | version = "0.1.12" 131 | source = "registry+https://github.com/rust-lang/crates.io-index" 132 | checksum = "7d02f3b0da4c6504f86e9cd789d8dbafab48c2321be74e9987593de5a894d93d" 133 | dependencies = [ 134 | "memchr", 135 | ] 136 | 137 | [[package]] 138 | name = "equivalent" 139 | version = "1.0.2" 140 | source = "registry+https://github.com/rust-lang/crates.io-index" 141 | checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" 142 | 143 | [[package]] 144 | name = "hashbrown" 145 | version = "0.15.4" 146 | source = "registry+https://github.com/rust-lang/crates.io-index" 147 | checksum = "5971ac85611da7067dbfcabef3c70ebb5606018acd9e2a3903a0da507521e0d5" 148 | 149 | [[package]] 150 | name = "hawk-data" 151 | version = "0.2.3" 152 | dependencies = [ 153 | "anyhow", 154 | "clap", 155 | "csv", 156 | "indexmap", 157 | "is-terminal", 158 | "regex", 159 | "serde", 160 | "serde_json", 161 | "serde_yaml", 162 | "termcolor", 163 | "thiserror", 164 | ] 165 | 166 | [[package]] 167 | name = "heck" 168 | version = "0.5.0" 169 | source = "registry+https://github.com/rust-lang/crates.io-index" 170 | checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" 171 | 172 | [[package]] 173 | name = "hermit-abi" 174 | version = "0.5.2" 175 | source = "registry+https://github.com/rust-lang/crates.io-index" 176 | checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" 177 | 178 | [[package]] 179 | name = "indexmap" 180 | version = "2.10.0" 181 | source = "registry+https://github.com/rust-lang/crates.io-index" 182 | checksum = "fe4cd85333e22411419a0bcae1297d25e58c9443848b11dc6a86fefe8c78a661" 183 | dependencies = [ 184 | "equivalent", 185 | "hashbrown", 186 | "serde", 187 | ] 188 | 189 | [[package]] 190 | name = "is-terminal" 191 | version = "0.4.16" 192 | source = "registry+https://github.com/rust-lang/crates.io-index" 193 | checksum = "e04d7f318608d35d4b61ddd75cbdaee86b023ebe2bd5a66ee0915f0bf93095a9" 194 | dependencies = [ 195 | "hermit-abi", 196 | "libc", 197 | "windows-sys", 198 | ] 199 | 200 | [[package]] 201 | name = "is_terminal_polyfill" 202 | version = "1.70.1" 203 | source = "registry+https://github.com/rust-lang/crates.io-index" 204 | checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" 205 | 206 | [[package]] 207 | name = "itoa" 208 | version = "1.0.15" 209 | source = "registry+https://github.com/rust-lang/crates.io-index" 210 | checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" 211 | 212 | [[package]] 213 | name = "libc" 214 | version = "0.2.174" 215 | source = "registry+https://github.com/rust-lang/crates.io-index" 216 | checksum = "1171693293099992e19cddea4e8b849964e9846f4acee11b3948bcc337be8776" 217 | 218 | [[package]] 219 | name = "memchr" 220 | version = "2.7.5" 221 | source = "registry+https://github.com/rust-lang/crates.io-index" 222 | checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" 223 | 224 | [[package]] 225 | name = "once_cell_polyfill" 226 | version = 
"1.70.1" 227 | source = "registry+https://github.com/rust-lang/crates.io-index" 228 | checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" 229 | 230 | [[package]] 231 | name = "proc-macro2" 232 | version = "1.0.95" 233 | source = "registry+https://github.com/rust-lang/crates.io-index" 234 | checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" 235 | dependencies = [ 236 | "unicode-ident", 237 | ] 238 | 239 | [[package]] 240 | name = "quote" 241 | version = "1.0.40" 242 | source = "registry+https://github.com/rust-lang/crates.io-index" 243 | checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" 244 | dependencies = [ 245 | "proc-macro2", 246 | ] 247 | 248 | [[package]] 249 | name = "regex" 250 | version = "1.11.1" 251 | source = "registry+https://github.com/rust-lang/crates.io-index" 252 | checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" 253 | dependencies = [ 254 | "aho-corasick", 255 | "memchr", 256 | "regex-automata", 257 | "regex-syntax", 258 | ] 259 | 260 | [[package]] 261 | name = "regex-automata" 262 | version = "0.4.9" 263 | source = "registry+https://github.com/rust-lang/crates.io-index" 264 | checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" 265 | dependencies = [ 266 | "aho-corasick", 267 | "memchr", 268 | "regex-syntax", 269 | ] 270 | 271 | [[package]] 272 | name = "regex-syntax" 273 | version = "0.8.5" 274 | source = "registry+https://github.com/rust-lang/crates.io-index" 275 | checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" 276 | 277 | [[package]] 278 | name = "ryu" 279 | version = "1.0.20" 280 | source = "registry+https://github.com/rust-lang/crates.io-index" 281 | checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" 282 | 283 | [[package]] 284 | name = "serde" 285 | version = "1.0.219" 286 | source = "registry+https://github.com/rust-lang/crates.io-index" 287 | checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" 288 | dependencies = [ 289 | "serde_derive", 290 | ] 291 | 292 | [[package]] 293 | name = "serde_derive" 294 | version = "1.0.219" 295 | source = "registry+https://github.com/rust-lang/crates.io-index" 296 | checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" 297 | dependencies = [ 298 | "proc-macro2", 299 | "quote", 300 | "syn", 301 | ] 302 | 303 | [[package]] 304 | name = "serde_json" 305 | version = "1.0.140" 306 | source = "registry+https://github.com/rust-lang/crates.io-index" 307 | checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" 308 | dependencies = [ 309 | "indexmap", 310 | "itoa", 311 | "memchr", 312 | "ryu", 313 | "serde", 314 | ] 315 | 316 | [[package]] 317 | name = "serde_yaml" 318 | version = "0.9.34+deprecated" 319 | source = "registry+https://github.com/rust-lang/crates.io-index" 320 | checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" 321 | dependencies = [ 322 | "indexmap", 323 | "itoa", 324 | "ryu", 325 | "serde", 326 | "unsafe-libyaml", 327 | ] 328 | 329 | [[package]] 330 | name = "strsim" 331 | version = "0.11.1" 332 | source = "registry+https://github.com/rust-lang/crates.io-index" 333 | checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" 334 | 335 | [[package]] 336 | name = "syn" 337 | version = "2.0.104" 338 | source = "registry+https://github.com/rust-lang/crates.io-index" 339 | checksum = 
"17b6f705963418cdb9927482fa304bc562ece2fdd4f616084c50b7023b435a40" 340 | dependencies = [ 341 | "proc-macro2", 342 | "quote", 343 | "unicode-ident", 344 | ] 345 | 346 | [[package]] 347 | name = "termcolor" 348 | version = "1.4.1" 349 | source = "registry+https://github.com/rust-lang/crates.io-index" 350 | checksum = "06794f8f6c5c898b3275aebefa6b8a1cb24cd2c6c79397ab15774837a0bc5755" 351 | dependencies = [ 352 | "winapi-util", 353 | ] 354 | 355 | [[package]] 356 | name = "thiserror" 357 | version = "2.0.12" 358 | source = "registry+https://github.com/rust-lang/crates.io-index" 359 | checksum = "567b8a2dae586314f7be2a752ec7474332959c6460e02bde30d702a66d488708" 360 | dependencies = [ 361 | "thiserror-impl", 362 | ] 363 | 364 | [[package]] 365 | name = "thiserror-impl" 366 | version = "2.0.12" 367 | source = "registry+https://github.com/rust-lang/crates.io-index" 368 | checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d" 369 | dependencies = [ 370 | "proc-macro2", 371 | "quote", 372 | "syn", 373 | ] 374 | 375 | [[package]] 376 | name = "unicode-ident" 377 | version = "1.0.18" 378 | source = "registry+https://github.com/rust-lang/crates.io-index" 379 | checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" 380 | 381 | [[package]] 382 | name = "unsafe-libyaml" 383 | version = "0.2.11" 384 | source = "registry+https://github.com/rust-lang/crates.io-index" 385 | checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861" 386 | 387 | [[package]] 388 | name = "utf8parse" 389 | version = "0.2.2" 390 | source = "registry+https://github.com/rust-lang/crates.io-index" 391 | checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" 392 | 393 | [[package]] 394 | name = "winapi-util" 395 | version = "0.1.9" 396 | source = "registry+https://github.com/rust-lang/crates.io-index" 397 | checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" 398 | dependencies = [ 399 | "windows-sys", 400 | ] 401 | 402 | [[package]] 403 | name = "windows-sys" 404 | version = "0.59.0" 405 | source = "registry+https://github.com/rust-lang/crates.io-index" 406 | checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" 407 | dependencies = [ 408 | "windows-targets", 409 | ] 410 | 411 | [[package]] 412 | name = "windows-targets" 413 | version = "0.52.6" 414 | source = "registry+https://github.com/rust-lang/crates.io-index" 415 | checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" 416 | dependencies = [ 417 | "windows_aarch64_gnullvm", 418 | "windows_aarch64_msvc", 419 | "windows_i686_gnu", 420 | "windows_i686_gnullvm", 421 | "windows_i686_msvc", 422 | "windows_x86_64_gnu", 423 | "windows_x86_64_gnullvm", 424 | "windows_x86_64_msvc", 425 | ] 426 | 427 | [[package]] 428 | name = "windows_aarch64_gnullvm" 429 | version = "0.52.6" 430 | source = "registry+https://github.com/rust-lang/crates.io-index" 431 | checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" 432 | 433 | [[package]] 434 | name = "windows_aarch64_msvc" 435 | version = "0.52.6" 436 | source = "registry+https://github.com/rust-lang/crates.io-index" 437 | checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" 438 | 439 | [[package]] 440 | name = "windows_i686_gnu" 441 | version = "0.52.6" 442 | source = "registry+https://github.com/rust-lang/crates.io-index" 443 | checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" 444 | 445 | [[package]] 446 | name 
= "windows_i686_gnullvm" 447 | version = "0.52.6" 448 | source = "registry+https://github.com/rust-lang/crates.io-index" 449 | checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" 450 | 451 | [[package]] 452 | name = "windows_i686_msvc" 453 | version = "0.52.6" 454 | source = "registry+https://github.com/rust-lang/crates.io-index" 455 | checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" 456 | 457 | [[package]] 458 | name = "windows_x86_64_gnu" 459 | version = "0.52.6" 460 | source = "registry+https://github.com/rust-lang/crates.io-index" 461 | checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" 462 | 463 | [[package]] 464 | name = "windows_x86_64_gnullvm" 465 | version = "0.52.6" 466 | source = "registry+https://github.com/rust-lang/crates.io-index" 467 | checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" 468 | 469 | [[package]] 470 | name = "windows_x86_64_msvc" 471 | version = "0.52.6" 472 | source = "registry+https://github.com/rust-lang/crates.io-index" 473 | checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" 474 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hawk-data" 3 | version = "0.2.3" 4 | edition = "2024" 5 | authors = ["Kyota "] 6 | license = "MIT" 7 | description = "Modern data analysis tool for structured data (JSON, YAML, CSV)" 8 | readme = "README.md" 9 | homepage = "https://github.com/kyotalab/hawk" 10 | repository = "https://github.com/kyotalab/hawk" 11 | keywords = ["awk", "cli", "jq", "analysis"] 12 | categories = ["command-line-utilities"] 13 | 14 | [[bin]] 15 | name = "hawk" 16 | path = "src/main.rs" 17 | 18 | [dependencies] 19 | anyhow = "1.0.98" 20 | clap = { version = "4.5.40", features = ["derive"] } 21 | csv = "1.3.1" 22 | indexmap = { version = "2.10.0", features = ["serde"] } 23 | is-terminal = "0.4.16" 24 | regex = "1.11.1" 25 | serde = { version = "1.0.219", features = ["derive"] } 26 | serde_json = { version = "1.0.140", features = ["preserve_order"] } 27 | serde_yaml = "0.9.34" 28 | termcolor = "1.4.1" 29 | thiserror = "2.0.12" 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 kyotalab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # hawk 🦅
2 | 
3 | **Modern data analysis tool for JSON, YAML, CSV, and text files**
4 | 
5 | [![Rust](https://img.shields.io/badge/rust-1.85%2B-orange.svg)](https://www.rust-lang.org/)
6 | [![License](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE)
7 | [![Crates.io](https://img.shields.io/crates/v/hawk-data.svg)](https://crates.io/crates/hawk-data)
8 | [![Crates.io](https://img.shields.io/crates/d/hawk-data.svg)](https://crates.io/crates/hawk-data)
9 | [![GitHub Stars](https://img.shields.io/github/stars/kyotalab/hawk.svg)](https://github.com/kyotalab/hawk/stargazers)
10 | [![GitHub Release](https://img.shields.io/github/release/kyotalab/hawk.svg)](https://github.com/kyotalab/hawk/releases)
11 | 
12 | hawk combines the simplicity of `awk` with the power of `pandas`, bringing unified data processing to your command line. Process any data format with the same intuitive syntax.
13 | 
14 | ## ⚡ Quick Start
15 | 
16 | ### Installation
17 | 
18 | ```bash
19 | # Homebrew (macOS/Linux)
20 | brew install kyotalab/tools/hawk
21 | 
22 | # Cargo (Rust)
23 | cargo install hawk-data
24 | 
25 | # Verify installation
26 | hawk --version
27 | ```
28 | 
29 | ### 30-Second Demo
30 | 
31 | ```bash
32 | # JSON/CSV analysis - same syntax!
33 | hawk '.users[] | select(.age > 30) | count' users.json
34 | hawk '.[] | group_by(.department) | avg(.salary)' employees.csv
35 | 
36 | # Text/log processing with slicing (NEW!)
37 | hawk -t '. | select(. | contains("ERROR|WARN")) | .[-100:]' app.log
38 | hawk -t '. | map(. | split(" ")[0:3]) | unique' access.log
39 | 
40 | # Advanced string operations with multiple fields
41 | hawk '.posts[] | map(.title, .content | trim | lower)' blog.json
42 | hawk '.[] | group_by(.category) | .[0:10] | avg(.price)' products.json
43 | ```
44 | 
45 | ## 🚀 Why hawk?
46 | 
47 | | Feature | hawk | jq | awk | pandas |
48 | | ------------------------ | -------------------------- | ---------------- | ------------- | ------------------ |
49 | | **Multi-format** | ✅ JSON, YAML, CSV, Text | ❌ JSON only | ❌ Text only | ❌ Python required |
50 | | **Unified syntax** | ✅ Same queries everywhere | ❌ JSON-specific | ❌ Line-based | ❌ Complex setup |
51 | | **String operations** | ✅ 14 built-in + slicing | ⚠️ Limited | ⚠️ Basic | ✅ Extensive |
52 | | **Statistical analysis** | ✅ Built-in median, stddev | ❌ None | ❌ None | ✅ Full suite |
53 | | **Learning curve** | 🟢 Familiar pandas-like | 🟡 Steep | 🟢 Simple | 🔴 High |
54 | 
55 | ## 🎯 Key Features
56 | 
57 | ### **Universal Data Processing**
58 | 
59 | Process any format with identical syntax:
60 | 
61 | ```bash
62 | hawk '.items[] | select(.price > 100)' data.json # JSON
63 | hawk '.items[] | select(.price > 100)' data.csv # CSV
64 | hawk '.items[] | select(.price > 100)' data.yaml # YAML
65 | hawk -t '. | select(. | contains("$"))' data.txt # Text
66 | ```
67 | 
68 | ### **Advanced Text Processing (NEW in v0.2.3!)**
69 | 
70 | ```bash
71 | # Split with slicing - extract exactly what you need
72 | echo "2024-01-15 10:30:45 INFO message" | hawk -t '. | map(. 
| split(" ")[0:2])' 73 | # → ["2024-01-15", "10:30:45"] 74 | 75 | # OR conditions for flexible filtering 76 | hawk -t '. | select(. | contains("GET|POST|PUT"))' access.log 77 | 78 | # Powerful slicing for any operation result 79 | hawk '.[] | sort(.revenue) | .[-10:]' companies.json # Top 10 80 | hawk '.[] | group_by(.category) | .[0:5]' products.json # 5 from each group 81 | ``` 82 | 83 | ### **Statistical Analysis Made Simple** 84 | 85 | ```bash 86 | # Instant insights from your data 87 | hawk '.sales[] | group_by(.region) | median(.amount)' sales.json 88 | hawk '.users[] | select(.active) | stddev(.session_time)' analytics.json 89 | hawk '.metrics[] | unique(.user_id) | count' engagement.json 90 | ``` 91 | 92 | ## 📚 Documentation 93 | 94 | ### **Get Started in 5 Minutes** 95 | 96 | - 🚀 [**Quick Start Guide**](docs/getting-started.md) - Essential basics 97 | - 📖 [**Query Language Reference**](docs/query-language.md) - Complete syntax 98 | - 🧵 [**String Operations**](docs/string-operations.md) - Text processing guide 99 | 100 | ### **Master Advanced Features** 101 | 102 | - 📊 [**Data Analysis**](docs/data-analysis.md) - Statistical workflows 103 | - 📄 [**Text Processing**](docs/text-processing.md) - Log analysis and text manipulation 104 | - 💼 [**Real-world Examples**](docs/examples/) - Industry-specific use cases 105 | 106 | ### **Use Case Guides(In progress)** 107 | 108 | - 🔍 [**Log Analysis**](docs/examples/log-analysis.md) - Docker, nginx, application logs 109 | - ⚙️ [**DevOps Workflows**](docs/examples/devops-workflows.md) - Kubernetes, CI/CD, monitoring 110 | - 📈 [**Data Science**](docs/examples/data-science.md) - CSV analysis, statistics, ML prep 111 | 112 | ## 🌟 Popular Workflows 113 | 114 | ### **Log Analysis** 115 | 116 | ```bash 117 | # Find error patterns in application logs 118 | hawk -t '. | select(. | contains("ERROR")) | map(. | split(" ")[0:2]) | unique' app.log 119 | 120 | # Analyze Docker container performance 121 | hawk -t '. | group_by(. | split(" ")[1]) | count' docker.log 122 | ``` 123 | 124 | ### **Data Exploration** 125 | 126 | ```bash 127 | # Quick dataset overview 128 | hawk '. | info' unknown-data.json 129 | 130 | # Statistical analysis 131 | hawk '.users[] | group_by(.department) | median(.salary)' employees.csv 132 | ``` 133 | 134 | ### **DevOps Automation** 135 | 136 | ```bash 137 | # Kubernetes resource analysis 138 | hawk '.items[] | select(.status.phase == "Running") | count' pods.json 139 | 140 | # Performance monitoring 141 | hawk '.metrics[] | group_by(.service) | avg(.response_time)' monitoring.json 142 | ``` 143 | 144 | ## ⭐ What's New in v0.2.3 145 | 146 | - **🎯 Advanced Slicing**: `.[0:10]`, `.[-5:]`, `group_by(.field) | .[0:3]` 147 | - **✂️ Split with Slicing**: `split(" ")[0:3]`, `split(",")[-2:]` 148 | - **🔍 OR Conditions**: `contains("GET|POST")`, `starts_with("ERROR|WARN")` 149 | - **📊 Stratified Sampling**: Sample from each group for unbiased analysis 150 | - **⚡ Performance**: Optimized for large datasets with efficient memory usage 151 | 152 | ## 🤝 Contributing 153 | 154 | We welcome contributions! See our [Contributing Guide](CONTRIBUTING.md). 155 | 156 | ```bash 157 | git clone https://github.com/kyotalab/hawk.git 158 | cd hawk 159 | cargo build --release 160 | cargo test 161 | ``` 162 | 163 | ## 📄 License 164 | 165 | MIT License - see [LICENSE](LICENSE) for details. 
166 | 167 | --- 168 | 169 | **Ready to transform your data workflows?** Start with our [5-minute tutorial](docs/getting-started.md) 🚀 170 | -------------------------------------------------------------------------------- /docs/data-analysis.md: -------------------------------------------------------------------------------- 1 | # Data Analysis Guide 2 | 3 | Comprehensive guide to data analysis workflows with hawk. 4 | 5 | ## 📖 Table of Contents 6 | 7 | - [Data Analysis Fundamentals](#data-analysis-fundamentals) 8 | - [Exploratory Data Analysis](#exploratory-data-analysis) 9 | - [Statistical Operations](#statistical-operations) 10 | - [Data Filtering and Selection](#data-filtering-and-selection) 11 | - [Grouping and Aggregation](#grouping-and-aggregation) 12 | - [Data Transformation](#data-transformation) 13 | - [Time Series Analysis](#time-series-analysis) 14 | - [Performance Analytics](#performance-analytics) 15 | - [Business Intelligence](#business-intelligence) 16 | - [Advanced Analytics Patterns](#advanced-analytics-patterns) 17 | 18 | ## Data Analysis Fundamentals 19 | 20 | ### The hawk Analytics Workflow 21 | 22 | ```bash 23 | 1. Data Exploration → hawk '. | info' data.json 24 | 2. Data Cleaning → hawk '.[] | select(.field) | map(.field | trim)' 25 | 3. Data Filtering → hawk '.[] | select(.condition)' 26 | 4. Data Aggregation → hawk '.[] | group_by(.field) | agg_function' 27 | 5. Results Export → hawk '.results[]' --format csv > output.csv 28 | ``` 29 | 30 | ### Understanding Your Data Structure 31 | 32 | Before analysis, always understand your data: 33 | 34 | ```bash 35 | # Get basic information 36 | hawk '. | info' dataset.json 37 | 38 | # Count total records 39 | hawk '. | count' data.csv 40 | 41 | # Sample the first few records 42 | hawk '.[0:5]' data.json 43 | 44 | # Check for missing values 45 | hawk '.[] | select(.field == null) | count' data.json 46 | ``` 47 | 48 | ## Exploratory Data Analysis 49 | 50 | ### Data Overview and Profiling 51 | 52 | ```bash 53 | # Dataset summary 54 | hawk '. 
| info' sales_data.json
55 | 
56 | # Record count by category
57 | hawk '.[] | group_by(.category) | count' products.csv
58 | 
59 | # Unique values in a field
60 | hawk '.[] | .department | unique' employees.json
61 | 
62 | # Data quality check
63 | hawk '.[] | select(.email | contains("@")) | count' users.csv
64 | ```
65 | 
66 | ### Sample Data Analysis Workflow
67 | 
68 | Let's work with a sample sales dataset:
69 | 
70 | ```json
71 | {
72 |   "sales": [
73 |     {
74 |       "date": "2024-01-15",
75 |       "product": "Laptop",
76 |       "category": "Electronics",
77 |       "amount": 1200,
78 |       "quantity": 1,
79 |       "region": "North",
80 |       "salesperson": "Alice"
81 |     },
82 |     {
83 |       "date": "2024-01-15",
84 |       "product": "Mouse",
85 |       "category": "Electronics",
86 |       "amount": 25,
87 |       "quantity": 3,
88 |       "region": "South",
89 |       "salesperson": "Bob"
90 |     },
91 |     {
92 |       "date": "2024-01-16",
93 |       "product": "Desk",
94 |       "category": "Furniture",
95 |       "amount": 300,
96 |       "quantity": 2,
97 |       "region": "North",
98 |       "salesperson": "Alice"
99 |     },
100 |     {
101 |       "date": "2024-01-16",
102 |       "product": "Chair",
103 |       "category": "Furniture",
104 |       "amount": 150,
105 |       "quantity": 4,
106 |       "region": "South",
107 |       "salesperson": "Carol"
108 |     }
109 |   ]
110 | }
111 | ```
112 | 
113 | **Basic Analysis:**
114 | 
115 | ```bash
116 | # Total sales count
117 | hawk '.sales[] | count' sales_data.json
118 | 
119 | # Total revenue
120 | hawk '.sales[] | sum(.amount)' sales_data.json
121 | 
122 | # Average sale amount
123 | hawk '.sales[] | avg(.amount)' sales_data.json
124 | 
125 | # Sales by category
126 | hawk '.sales[] | group_by(.category) | sum(.amount)' sales_data.json
127 | 
128 | # Top performing regions
129 | hawk '.sales[] | group_by(.region) | sum(.amount)' sales_data.json
130 | ```
131 | 
132 | ## Statistical Operations
133 | 
134 | ### Descriptive Statistics
135 | 
136 | ```bash
137 | # Central tendency
138 | hawk '.[] | avg(.field)' data.json # Mean
139 | hawk '.[] | median(.field)' data.json # Median
140 | 
141 | # Variability
142 | hawk '.[] | min(.field)' data.json # Minimum
143 | hawk '.[] | max(.field)' data.json # Maximum
144 | hawk '.[] | stddev(.field)' data.json # Standard deviation
145 | 
146 | # Distribution
147 | hawk '.[] | .field | unique | sort' data.json # Unique values
148 | hawk '.[] | .field | sort' data.json # All values sorted
149 | ```
150 | 
151 | ### Advanced Statistical Analysis
152 | 
153 | ```bash
154 | # Quartile analysis (using slicing): worked example below
155 | hawk '.[] | sort(.price) | length' products.json # Get total count
156 | 
157 | # Range analysis
158 | hawk '.[] | select(.price >= 100) | select(.price <= 500) | count' products.json
159 | 
160 | # Frequency analysis
161 | hawk '.[] | group_by(.grade) | count' grades.json
162 | ```
163 | 
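Putting the slicing pieces together, an approximate quartile workflow looks like this (the 250-record cutoffs assume the count step reported 1,000 records; adjust them to your data):

```bash
# Step 1: count the records (suppose this prints 1000)
hawk '.[] | sort(.price) | length' products.json

# Step 2: the max of the first quarter of sorted records approximates Q1
hawk '.[] | sort(.price) | .[0:250] | max(.price)' products.json

# Step 3: the min of the last quarter approximates Q3
hawk '.[] | sort(.price) | .[-250:] | min(.price)' products.json
```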
products.json # Price range
187 | 
188 | # String conditions
189 | hawk '.[] | select(.status == "active")' accounts.json # Active accounts
190 | hawk '.[] | select(.email | ends_with(".com"))' users.json # .com emails
191 | 
192 | # Date filtering (string-based)
193 | hawk '.[] | select(.date >= "2024-01-01")' transactions.json
194 | hawk '.[] | select(.date | starts_with("2024-01"))' logs.json
195 | ```
196 | 
197 | ### Complex Multi-condition Filtering
198 | 
199 | ```bash
200 | # Multiple AND conditions
201 | hawk '.[] | select(.age >= 18) | select(.status == "active") | select(.region == "North")' users.json
202 | 
203 | # Range filtering
204 | hawk '.[] | select(.score >= 80) | select(.score <= 100)' grades.json
205 | 
206 | # Category filtering
207 | hawk '.[] | select(.category == "Electronics") | select(.price < 1000)' products.json
208 | 
209 | # Data quality filtering
210 | hawk '.[] | select(.email | contains("@")) | select(.phone | length == 10)' contacts.json
211 | ```
212 | 
213 | ### Sampling and Data Selection
214 | 
215 | ```bash
216 | # Sampling (using slicing; slices are deterministic, not random)
217 | hawk '.[0:100]' large_dataset.json # First 100 records
218 | hawk '.[1000:1100]' large_dataset.json # Records 1000-1100
219 | 
220 | # Stratified sampling
221 | hawk '.[] | group_by(.category) | .[0:10]' products.json # 10 from each category
222 | 
223 | # Top/Bottom N
224 | hawk '.[] | sort(.revenue) | .[-10:]' companies.json # Top 10 by revenue
225 | hawk '.[] | sort(.score) | .[0:5]' results.json # Bottom 5 by score
226 | ```
227 | 
228 | ## Grouping and Aggregation
229 | 
230 | ### Basic Grouping Operations
231 | 
232 | ```bash
233 | # Group by single field
234 | hawk '.[] | group_by(.department) | count' employees.json
235 | hawk '.[] | group_by(.region) | sum(.sales)' sales.json
236 | hawk '.[] | group_by(.category) | avg(.price)' products.json
237 | 
238 | # Group by multiple criteria (sequential)
239 | hawk '.[] | group_by(.region) | group_by(.category) | sum(.amount)' sales.json
240 | ```
241 | 
242 | ### Advanced Aggregation Patterns
243 | 
244 | ```bash
245 | # Multiple aggregations per group
246 | hawk '.[] | group_by(.department)' employees.json # Then analyze each group
247 | hawk '.[] | group_by(.department) | count' employees.json # Count per group
248 | hawk '.[] | group_by(.department) | avg(.salary)' employees.json # Average per group
249 | hawk '.[] | group_by(.department) | sum(.salary)' employees.json # Total per group
250 | 
251 | # Performance analytics
252 | hawk '.[] | group_by(.server) | avg(.response_time)' performance.json
253 | hawk '.[] | group_by(.server) | max(.memory_usage)' performance.json
254 | hawk '.[] | group_by(.server) | min(.cpu_usage)' performance.json
255 | ```
256 | 
257 | ### Business Intelligence Aggregations
258 | 
259 | ```bash
260 | # Sales analysis
261 | hawk '.[] | group_by(.salesperson) | sum(.amount)' sales.json
262 | hawk '.[] | group_by(.product) | avg(.rating)' reviews.json
263 | hawk '.[] | group_by(.region) | count' customers.json
264 | 
265 | # Financial analysis
266 | hawk '.[] | group_by(.quarter) | sum(.revenue)' financial.json
267 | hawk '.[] | group_by(.cost_center) | sum(.expenses)' budget.json
268 | 
269 | # User behavior analysis
270 | hawk '.[] | group_by(.user_type) | avg(.session_duration)' analytics.json
271 | hawk '.[] | group_by(.device_type) | count' user_sessions.json
272 | ```
273 | 
274 | ## Data Transformation
275 | 
276 | ### Data Cleaning and Normalization
277 | 
278 | ```bash
279 | # Clean text data
280 | hawk '.[] | map(.name | 
trim | upper)' contacts.json
281 | hawk '.[] | map(.email | lower)' users.json
282 | 
283 | # Normalize numeric data
284 | hawk '.[] | map(.amount | * 1.0)' transactions.json # Ensure float
285 | 
286 | # Handle missing data
287 | hawk '.[] | select(.field)' data.json # Remove nulls
288 | hawk '.[] | map(.field // "default_value")' data.json # Replace nulls
289 | ```
290 | 
291 | ### Feature Engineering
292 | 
293 | ```bash
294 | # Extract date components from a "YYYY-MM-DD" field
295 | hawk '.[] | map(.date | split("-")[0])' events.json # Year
296 | hawk '.[] | map(.date | split("-")[1])' events.json # Month
297 | 
298 | # Categorize numeric data
299 | hawk '.[] | select(.age >= 18) | select(.age < 65) | map(.age_group = "adult")' users.json
300 | ```
301 | 
302 | ### Data Reshaping
303 | 
304 | ```bash
305 | # Extract specific fields
306 | hawk '.[] | select_fields(id,name,email)' users.json
307 | ```
308 | 
309 | ## Time Series Analysis
310 | 
311 | ### Date-based Analysis
312 | 
313 | ```bash
314 | # Group by time periods
315 | hawk '.[] | group_by(.date | split("-")[0])' time_series.json # By year
316 | hawk '.[] | group_by(.date | split("-")[1])' time_series.json # By month
317 | hawk '.[] | group_by(.date | substring(0, 7))' time_series.json # By year-month
318 | 
319 | # Trend analysis
320 | hawk '.[] | sort(.date) | .[0:10]' events.json # First 10 chronologically
321 | hawk '.[] | sort(.date) | .[-10:]' events.json # Last 10 chronologically
322 | ```
323 | 
324 | ### Sales Trend Analysis
325 | 
326 | ```bash
327 | # Monthly sales trends
328 | hawk '.[] | group_by(.date | substring(0, 7)) | sum(.amount)' sales.json
329 | 
330 | # Daily transaction counts
331 | hawk '.[] | group_by(.date) | count' transactions.json
332 | 
333 | # Seasonal analysis
334 | hawk '.[] | group_by(.date | split("-")[1]) | avg(.temperature)' weather.json
335 | 
336 | # Growth analysis
337 | hawk '.[] | sort(.date) | .[0:100]' historical_data.json # Historical baseline
338 | hawk '.[] | sort(.date) | .[-100:]' historical_data.json # Recent data
339 | ```
340 | 
341 | ### Performance Over Time
342 | 
343 | ```bash
344 | # System performance trends
345 | hawk '.[] | group_by(.hour) | avg(.response_time)' performance_logs.json
346 | 
347 | # User engagement trends
348 | hawk '.[] | group_by(.week) | sum(.active_users)' analytics.json
349 | 
350 | # Error rate analysis
351 | hawk '.[] | group_by(.date) | select(.level == "ERROR") | count' error_logs.json
352 | ```
353 | 
354 | ## Performance Analytics
355 | 
356 | ### Application Performance Analysis
357 | 
358 | ```bash
359 | # Response time analysis
360 | hawk '.[] | group_by(.endpoint) | avg(.response_time)' api_logs.json
361 | hawk '.[] | group_by(.endpoint) | max(.response_time)' api_logs.json
362 | hawk '.[] | group_by(.endpoint) | min(.response_time)' api_logs.json
363 | 
364 | # Error rate calculation
365 | hawk '.[] | group_by(.service) | select(.status >= 400) | count' api_logs.json
366 | 
367 | # Throughput analysis
368 | hawk '.[] | group_by(.hour) | count' requests.json
369 | ```
370 | 
371 | ### System Resource Analysis
372 | 
373 | ```bash
374 | # Memory usage analysis
375 | hawk '.[] | group_by(.server) | avg(.memory_usage)' system_metrics.json
376 | hawk '.[] | group_by(.server) | max(.memory_usage)' system_metrics.json
377 | 
378 | # CPU utilization
379 | hawk '.[] | group_by(.process) | avg(.cpu_percent)' process_metrics.json
380 | 
381 | # Disk usage trends
382 | hawk '.[] | group_by(.mount_point) | max(.disk_usage)' disk_metrics.json
383 | ```
384 | 
385 | ### User Performance Analysis
386 | 
387 | ```bash
388 
| # Page load times 389 | hawk '.[] | group_by(.page) | avg(.load_time)' user_metrics.json 390 | 391 | # User session analysis 392 | hawk '.[] | group_by(.user_id) | avg(.session_duration)' sessions.json 393 | 394 | # Conversion rate analysis 395 | hawk '.[] | group_by(.campaign) | select(.converted == true) | count' marketing.json 396 | ``` 397 | 398 | ## Business Intelligence 399 | 400 | ### Sales Analytics 401 | 402 | ```bash 403 | # Revenue analysis 404 | hawk '.[] | group_by(.quarter) | sum(.revenue)' quarterly_sales.json 405 | hawk '.[] | group_by(.product_line) | sum(.revenue)' product_sales.json 406 | hawk '.[] | group_by(.region) | sum(.revenue)' regional_sales.json 407 | 408 | # Profitability analysis 409 | hawk '.[] | group_by(.product) | sum(.profit)' product_profitability.json 410 | hawk '.[] | group_by(.customer_segment) | avg(.margin)' customer_analysis.json 411 | 412 | # Sales performance 413 | hawk '.[] | group_by(.salesperson) | sum(.deals_closed)' sales_performance.json 414 | hawk '.[] | group_by(.salesperson) | avg(.deal_size)' sales_performance.json 415 | ``` 416 | 417 | ### Customer Analytics 418 | 419 | ```bash 420 | # Customer segmentation 421 | hawk '.[] | group_by(.customer_type) | avg(.lifetime_value)' customers.json 422 | hawk '.[] | group_by(.acquisition_channel) | count' customers.json 423 | 424 | # Customer behavior 425 | hawk '.[] | group_by(.customer_id) | sum(.total_spent)' transactions.json 426 | hawk '.[] | group_by(.customer_id) | count' purchases.json 427 | 428 | # Retention analysis 429 | hawk '.[] | group_by(.cohort) | avg(.retention_rate)' retention.json 430 | ``` 431 | 432 | ### Marketing Analytics 433 | 434 | ```bash 435 | # Campaign performance 436 | hawk '.[] | group_by(.campaign) | sum(.impressions)' marketing.json 437 | hawk '.[] | group_by(.campaign) | avg(.click_through_rate)' marketing.json 438 | 439 | # Channel effectiveness 440 | hawk '.[] | group_by(.channel) | sum(.conversions)' marketing.json 441 | hawk '.[] | group_by(.channel) | avg(.cost_per_acquisition)' marketing.json 442 | 443 | # ROI analysis 444 | hawk '.[] | group_by(.campaign) | sum(.revenue - .spend)' marketing.json 445 | ``` 446 | 447 | ## Advanced Analytics Patterns 448 | 449 | ### Cohort Analysis 450 | 451 | ```bash 452 | # User cohorts by signup month 453 | hawk '.[] | group_by(.signup_month) | count' users.json 454 | hawk '.[] | group_by(.signup_month) | avg(.lifetime_value)' users.json 455 | 456 | # Retention by cohort 457 | hawk '.[] | group_by(.cohort) | select(.active == true) | count' user_activity.json 458 | ``` 459 | 460 | ### Funnel Analysis 461 | 462 | ```bash 463 | # Conversion funnel 464 | hawk '.[] | select(.stage == "awareness") | count' funnel.json 465 | hawk '.[] | select(.stage == "consideration") | count' funnel.json 466 | hawk '.[] | select(.stage == "purchase") | count' funnel.json 467 | 468 | # Drop-off analysis 469 | hawk '.[] | group_by(.exit_page) | count' user_sessions.json 470 | ``` 471 | 472 | ### A/B Testing Analysis 473 | 474 | ```bash 475 | # Test group comparison 476 | hawk '.[] | group_by(.test_group) | avg(.conversion_rate)' ab_test.json 477 | hawk '.[] | group_by(.test_group) | count' ab_test.json 478 | 479 | # Statistical significance (basic) 480 | hawk '.[] | group_by(.variant) | stddev(.metric)' ab_test.json 481 | ``` 482 | 483 | ### Anomaly Detection (Basic) 484 | 485 | ```bash 486 | # Outlier detection using statistical methods 487 | hawk '.[] | sort(.value) | .[0:5]' data.json # Bottom 5 (potential outliers) 488 | hawk '.[] | 
sort(.value) | .[-5:]' data.json # Top 5 (potential outliers) 489 | 490 | # Threshold-based anomalies 491 | hawk '.[] | avg(.response_time)' baseline.json # Calculate baseline 492 | hawk '.[] | select(.response_time > baseline * 2)' current.json # 2x baseline 493 | ``` 494 | 495 | ## Export and Reporting 496 | 497 | ### Data Export Formats 498 | 499 | ```bash 500 | # Export to JSON 501 | hawk '.summary' --format json > summary_report.json 502 | 503 | # Export specific fields 504 | hawk '.[] | select_fields(id,name,value)' --format table > report.txt 505 | ``` 506 | 507 | ### Report Generation 508 | 509 | ```bash 510 | # Summary statistics report 511 | echo "=== Sales Summary ===" > report.txt 512 | hawk '.[] | sum(.amount)' sales.json >> report.txt 513 | hawk '.[] | avg(.amount)' sales.json >> report.txt 514 | hawk '.[] | count' sales.json >> report.txt 515 | ``` 516 | 517 | ## Best Practices 518 | 519 | ### Data Analysis Workflow 520 | 521 | 1. **Start with exploration**: Always use `hawk '. | info'` first 522 | 2. **Sample your data**: Use slicing `.[0:100]` for large datasets 523 | 3. **Check data quality**: Filter out invalid records early 524 | 4. **Build incrementally**: Add complexity step by step 525 | 5. **Validate results**: Cross-check with known values 526 | 527 | ### Performance Optimization 528 | 529 | ```bash 530 | # ✅ Filter early in pipeline 531 | hawk '.[] | select(.active == true) | group_by(.region) | count' 532 | 533 | # ❌ Filter late in pipeline 534 | hawk '.[] | group_by(.region) | select(.active == true) | count' 535 | 536 | # ✅ Use appropriate data types 537 | hawk '.[] | select(.amount > 100.0)' numeric_data.json 538 | 539 | # ✅ Sample large datasets 540 | hawk '.[0:1000] | group_by(.category) | avg(.price)' large_data.json 541 | ``` 542 | 543 | ### Common Pitfalls 544 | 545 | ```bash 546 | # ❌ Ignoring missing data 547 | hawk '.[] | avg(.field)' # May include nulls 548 | 549 | # ✅ Handle missing data 550 | hawk '.[] | select(.field) | avg(.field)' 551 | 552 | # ❌ Not validating data types 553 | hawk '.[] | sum(.text_field)' # Error if not numeric 554 | 555 | # ✅ Validate data types 556 | hawk '.[] | select(.numeric_field > 0) | sum(.numeric_field)' 557 | ``` 558 | 559 | --- 560 | 561 | **Related Documentation:** 562 | 563 | - [Getting Started](getting-started.md) - Basic introduction 564 | - [Query Language Reference](query-language.md) - Complete syntax 565 | - [String Operations](string-operations.md) - Text processing 566 | - [Examples](examples/) - Real-world use cases 567 | -------------------------------------------------------------------------------- /docs/getting-started.md: -------------------------------------------------------------------------------- 1 | # Getting Started with hawk 🦅 2 | 3 | **5-minute introduction to hawk's data processing capabilities** 4 | 5 | hawk is a command-line tool that lets you explore and analyze data using a simple, unified query language. Whether you're working with JSON APIs, CSV files, YAML configs, or log files, hawk uses the same intuitive syntax. 6 | 7 | ## 📦 Installation 8 | 9 | Choose your preferred installation method: 10 | 11 | ### Homebrew (Recommended) 12 | 13 | ```bash 14 | brew install kyotalab/tools/hawk 15 | ``` 16 | 17 | ### Cargo (Rust) 18 | 19 | ```bash 20 | cargo install hawk-data 21 | ``` 22 | 23 | ### Verify Installation 24 | 25 | ```bash 26 | hawk --version 27 | # Output: hawk 0.2.2 28 | ``` 29 | 30 | ## 🎯 Your First hawk Command 31 | 32 | Let's start with a simple example. 
Create a test file: 33 | 34 | ```bash 35 | cat << 'EOF' > users.json 36 | { 37 | "users": [ 38 | {"name": "Alice", "age": 30}, 39 | {"name": "Bob", "age": 25} 40 | ] 41 | } 42 | EOF 43 | ``` 44 | 45 | Now run your first hawk command: 46 | 47 | ```bash 48 | hawk '.users[0].name' users.json 49 | ``` 50 | 51 | **Output:** `Alice` 52 | 53 | **What happened?** 54 | 55 | - `.users` → access the "users" field 56 | - `[0]` → get the first element of the array 57 | - `.name` → get the "name" field from that element 58 | 59 | ## 🏗️ Basic Building Blocks 60 | 61 | ### 1. Field Access 62 | 63 | ```bash 64 | # Access a field 65 | hawk '.name' data.json 66 | 67 | # Access nested fields 68 | hawk '.user.profile.email' data.json 69 | 70 | # Access array elements 71 | hawk '.items[0]' data.json 72 | ``` 73 | 74 | ### 2. Array Operations 75 | 76 | ```bash 77 | # Get all array elements 78 | hawk '.users[]' users.json 79 | 80 | # Access specific fields from all elements 81 | hawk '.users[].name' users.json 82 | ``` 83 | 84 | ### 3. Filtering with select() 85 | 86 | ```bash 87 | # Find users older than 25 88 | hawk '.users[] | select(.age > 25)' users.json 89 | 90 | # Find users named "Alice" 91 | hawk '.users[] | select(.name == "Alice")' users.json 92 | ``` 93 | 94 | ### 4. Counting and Aggregation 95 | 96 | ```bash 97 | # Count total users 98 | hawk '.users | count' users.json 99 | 100 | # Average age 101 | hawk '.users[] | avg(.age)' users.json 102 | ``` 103 | 104 | ## 🧪 Hands-on Examples 105 | 106 | Let's work through progressively complex examples with sample data. 107 | 108 | ### Example 1: JSON Data Analysis 109 | 110 | Create a sample dataset: 111 | 112 | ```bash 113 | cat > sales.json << 'EOF' 114 | { 115 | "sales": [ 116 | {"product": "Laptop", "price": 1200, "quantity": 3, "region": "North"}, 117 | {"product": "Mouse", "price": 25, "quantity": 50, "region": "South"}, 118 | {"product": "Keyboard", "price": 80, "quantity": 20, "region": "North"}, 119 | {"product": "Monitor", "price": 300, "quantity": 10, "region": "South"} 120 | ] 121 | } 122 | EOF 123 | ``` 124 | 125 | **Basic Operations:** 126 | 127 | ```bash 128 | # See all products 129 | hawk '.sales[].product' sales.json 130 | 131 | # Find expensive items (>$100) 132 | hawk '.sales[] | select(.price > 100)' sales.json 133 | 134 | # Count items by region 135 | hawk '.sales[] | group_by(.region) | count' sales.json 136 | 137 | # Average price by region 138 | hawk '.sales[] | group_by(.region) | avg(.price)' sales.json 139 | ``` 140 | 141 | ### Example 2: CSV Data Processing 142 | 143 | Create a CSV file: 144 | 145 | ```bash 146 | cat > employees.csv << 'EOF' 147 | name,age,department,salary 148 | Alice,30,Engineering,95000 149 | Bob,25,Marketing,75000 150 | Carol,35,Engineering,105000 151 | David,28,Sales,80000 152 | EOF 153 | ``` 154 | 155 | **CSV Operations:** 156 | 157 | ```bash 158 | # See all names 159 | hawk '.[].name' employees.csv 160 | 161 | # Find engineers 162 | hawk '.[] | select(.department == "Engineering")' employees.csv 163 | 164 | # Average salary by department 165 | hawk '.[] | group_by(.department) | avg(.salary)' employees.csv 166 | 167 | # Count employees by department 168 | hawk '.[] | group_by(.department) | count' employees.csv 169 | ``` 170 | 171 | ### Example 3: Text/Log Processing 172 | 173 | Create a sample log file: 174 | 175 | ```bash 176 | cat > app.log << 'EOF' 177 | 2024-01-15 09:00:01 INFO Application started 178 | 2024-01-15 09:00:15 ERROR Database connection failed 179 | 2024-01-15 09:00:16 INFO Retrying 
connection 180 | 2024-01-15 09:01:20 WARN High memory usage: 85% 181 | 2024-01-15 09:01:45 ERROR Timeout occurred 182 | EOF 183 | ``` 184 | 185 | **Text Processing Operations:** 186 | 187 | ```bash 188 | # Process as text (use -t flag for logs) 189 | # Find all ERROR lines 190 | hawk -t '. | select(. | contains("ERROR"))' app.log 191 | 192 | # Extract timestamps 193 | hawk -t '. | map(. | split(" ")[0])' app.log 194 | 195 | # Extract log levels 196 | hawk -t '. | map(. | split(" ")[2])' app.log 197 | ``` 198 | 199 | ## 🔧 String Operations 200 | 201 | hawk includes powerful string manipulation: 202 | 203 | ```bash 204 | # Text transformation 205 | echo '" Hello World "' | hawk '. | map(. | trim | upper)' 206 | 207 | # String splitting with index access (NEW!) 208 | echo '"apple banana cherry"' | hawk '. | map(. | split(" ")[1])' 209 | 210 | # Multiple field processing 211 | cat << 'EOF' | hawk '. | map(.first, .last | upper)' 212 | { 213 | "first": "john", 214 | "last": "doe" 215 | } 216 | EOF 217 | 218 | ``` 219 | 220 | ## 📊 Understanding Output Formats 221 | 222 | hawk automatically chooses the best output format: 223 | 224 | ```bash 225 | # Single value → simple output 226 | hawk '.users[0].name' users.json 227 | # Output: Alice 228 | 229 | # Array of objects → table format 230 | hawk '.users[]' users.json 231 | # Output: Formatted table with columns 232 | ``` 233 | 234 | You can force specific formats: 235 | 236 | ```bash 237 | hawk '.users[]' --format json users.json # Force JSON 238 | hawk '.users[]' --format table users.json # Force table 239 | hawk '.users[].name' --format list users.json # Force list 240 | ``` 241 | 242 | ## 🎯 Common Patterns 243 | 244 | ### Data Exploration 245 | 246 | ```bash 247 | # Understand data structure 248 | hawk '. | info' unknown-data.json 249 | 250 | # Count total records 251 | hawk '. | count' data.json 252 | 253 | # See unique values 254 | hawk '.field_name[] | unique' data.json 255 | ``` 256 | 257 | ### Filtering and Aggregation 258 | 259 | ```bash 260 | # Filter → count pattern 261 | hawk '.items[] | select(.price > 100) | count' data.json 262 | 263 | # Group → aggregate pattern 264 | hawk '.sales[] | group_by(.category) | sum(.amount)' data.json 265 | 266 | # Filter → group → aggregate pattern 267 | hawk '.orders[] | select(.status == "completed") | group_by(.region) | avg(.total)' data.json 268 | ``` 269 | 270 | ### Text Processing 271 | 272 | ```bash 273 | # Extract → unique pattern 274 | hawk -t '. | map(. | split(" ")[0]) | unique' logs.txt 275 | 276 | # Filter → extract pattern 277 | hawk -t '. | select(. | contains("ERROR")) | map(. | split(" ")[1])' logs.txt 278 | 279 | # Clean → transform pattern 280 | hawk '.users[] | map(.email | lower | trim)' users.json 281 | ``` 282 | 283 | ## 🚨 When to Use --text Flag 284 | 285 | Use the `--text` flag when processing files that might be misdetected: 286 | 287 | ```bash 288 | # For log files that look like YAML 289 | hawk --text '. | select(. | contains("ERROR"))' structured.log 290 | 291 | # For any text file you want to process line-by-line 292 | hawk -t '. | map(. | length) | avg' text-file.txt 293 | ``` 294 | 295 | ## 🎓 Next Steps 296 | 297 | Now that you know the basics, explore these guides: 298 | 299 | ### **Immediate Next Steps** 300 | 301 | 1. **[String Operations Guide](string-operations.md)** - Master text processing 302 | 2. **[Query Language Reference](query-language.md)** - Complete syntax guide 303 | 3. 
**[Log Analysis Examples](examples/log-analysis.md)** - Real-world log processing
304 | 
305 | ### **By Use Case**
306 | 
307 | - **Data Analysis**: [Data Analysis Guide](data-analysis.md)
308 | - **DevOps**: [DevOps Workflows](examples/devops-workflows.md)
309 | - **API Work**: [API Exploration](examples/api-exploration.md)
310 | 
311 | ### **Advanced Topics**
312 | 
313 | - **Performance**: [Optimization Tips](advanced/performance.md)
314 | - **Complex Workflows**: [Custom Workflows](advanced/custom-workflows.md)
315 | 
316 | ## 🔗 Quick Reference Card
317 | 
318 | ### Essential Commands
319 | 
320 | ```bash
321 | # Field access
322 | hawk '.field' data.json
323 | hawk '.array[0]' data.json
324 | hawk '.array[]' data.json
325 | 
326 | # Filtering
327 | hawk '.array[] | select(.field > value)' data.json
328 | 
329 | # Aggregation
330 | hawk '.array[] | count/sum/avg/min/max(.field)' data.json
331 | 
332 | # Grouping
333 | hawk '.array[] | group_by(.field) | count' data.json
334 | 
335 | # Text processing
336 | hawk -t '. | select(. | contains("pattern"))' file.txt
337 | hawk -t '. | map(. | split(" ")[0])' file.txt
338 | 
339 | # String operations
340 | hawk '.field | upper/lower/trim/length' data.json
341 | hawk '.field | split(",")[0]' data.json
342 | ```
343 | 
344 | ### Data Types
345 | 
346 | - **JSON**: `data.json` → auto-detected
347 | - **YAML**: `config.yaml` → auto-detected
348 | - **CSV**: `data.csv` → auto-detected
349 | - **Text**: `file.txt` → use `-t` flag for line processing
350 | 
351 | ## 💡 Pro Tips
352 | 
353 | 1. **Start Simple**: Begin with basic field access, then add complexity
354 | 2. **Use `info`**: Always start data exploration with `hawk '. | info' file`
355 | 3. **Test in Steps**: Build complex queries incrementally
356 | 4. **Use `--text`**: When in doubt with text files, use the `-t` flag
357 | 5. **Read Error Messages**: hawk provides helpful error context
358 | 
359 | ## 🎉 You're Ready!
360 | 
361 | You now know enough hawk to be productive! The key is to start with simple operations and gradually build more complex queries as you become comfortable with the syntax.
362 | 
363 | **Remember**: hawk uses the same syntax across all data formats, so skills learned with JSON work with CSV, YAML, and text files.
364 | 
365 | Happy data exploring! 🦅
366 | 
367 | ---
368 | 
369 | **Quick Links:**
370 | 
371 | - [String Operations](string-operations.md) - Text processing guide
372 | - [Examples](../examples/README.md) - Real-world use cases
373 | 
--------------------------------------------------------------------------------
/docs/string-operations.md:
--------------------------------------------------------------------------------
1 | # String Operations Guide
2 | 
3 | Comprehensive guide to hawk's text processing capabilities.
4 | 
5 | ## 📖 Table of Contents
6 | 
7 | - [Basic Operations](#basic-operations)
8 | - [Advanced Operations](#advanced-operations)
9 | - [Array Operations](#array-operations)
10 | - [Multi-field Operations](#multi-field-operations)
11 | - [Practical Examples](#practical-examples)
12 | - [Performance Tips](#performance-tips)
13 | 
14 | ## Basic Operations
15 | 
16 | ### Case Conversion
17 | 
18 | ```bash
19 | # Convert to uppercase
20 | hawk '. | map(. | upper)' names.txt
21 | 
22 | # Convert to lowercase
23 | hawk '.users[] | map(.email | lower)' users.json
24 | 
25 | # Example
26 | "Hello World" | upper → "HELLO WORLD"
27 | "Hello World" | lower → "hello world"
28 | ```
29 | 
30 | ### Whitespace Management
31 | 
32 | ```bash
33 | # Remove leading and trailing whitespace
34 | hawk '. 
| map(. | trim)' messy-data.txt 35 | 36 | # Remove leading whitespace 37 | hawk '. | map(. | trim_start)' indented.txt 38 | 39 | # Remove trailing whitespace 40 | hawk '. | map(. | trim_end)' data.txt 41 | 42 | # Examples 43 | " hello " | trim → "hello" 44 | " hello " | trim_start → "hello " 45 | " hello " | trim_end → " hello" 46 | ``` 47 | 48 | ### String Analysis 49 | 50 | ```bash 51 | # Get string length 52 | hawk '. | map(. | length)' text.txt 53 | 54 | # Reverse strings 55 | hawk '. | map(. | reverse)' data.txt 56 | 57 | # Examples 58 | "hello" | length → 5 59 | "hello" | reverse → "olleh" 60 | ``` 61 | 62 | ## Advanced Operations 63 | 64 | ### Pattern Matching 65 | 66 | ```bash 67 | # Check if string contains pattern 68 | hawk '. | select(. | contains("ERROR"))' logs.txt 69 | 70 | # Check string start/end 71 | hawk '. | select(. | starts_with("2024"))' timestamps.txt 72 | hawk '. | select(. | ends_with(".log"))' filenames.txt 73 | 74 | # Examples 75 | "Hello World" | contains("World") → true 76 | "Hello World" | starts_with("Hello") → true 77 | "Hello World" | ends_with("World") → true 78 | ``` 79 | 80 | ### Text Transformation 81 | 82 | ```bash 83 | # Replace text 84 | hawk '. | map(. | replace("old", "new"))' text.txt 85 | 86 | # Extract substrings 87 | hawk '. | map(. | substring(0, 10))' long-text.txt 88 | hawk '. | map(. | substring(5))' text.txt # from index 5 to end 89 | 90 | # Examples 91 | "Hello World" | replace("World", "Rust") → "Hello Rust" 92 | "Hello World" | substring(0, 5) → "Hello" 93 | "Hello World" | substring(6) → "World" 94 | ``` 95 | 96 | ## Array Operations 97 | 98 | ### String Splitting 99 | 100 | ```bash 101 | # Split into array 102 | hawk '. | map(. | split(","))' csv-lines.txt 103 | hawk '. | map(. | split(" "))' sentences.txt 104 | 105 | # Split with index access (NEW in v0.2.2!) 106 | hawk '. | map(. | split(" ")[0])' space-separated.txt 107 | hawk '. | map(. 
| split(",")[2])' csv-data.txt 108 | 109 | # Examples 110 | "apple,banana,cherry" | split(",") → ["apple", "banana", "cherry"] 111 | "apple,banana,cherry" | split(",")[0] → "apple" 112 | "apple,banana,cherry" | split(",")[1] → "banana" 113 | ``` 114 | 115 | ### Array Joining 116 | 117 | ```bash 118 | # Join array elements 119 | hawk '.tags[] | join(",")' data.json 120 | hawk '.words[] | join(" ")' word-lists.json 121 | 122 | # Examples 123 | ["apple", "banana"] | join(",") → "apple,banana" 124 | ["hello", "world"] | join(" ") → "hello world" 125 | ``` 126 | 127 | ## Multi-field Operations 128 | 129 | Process multiple fields with the same operation (NEW in v0.2.2!): 130 | 131 | ```bash 132 | # Apply join to multiple array fields 133 | hawk '.users[] | map(.skills, .projects | join(","))' users.json 134 | 135 | # Convert multiple fields to uppercase 136 | hawk '.users[] | map(.first_name, .last_name | upper)' users.json 137 | 138 | # Get length of multiple string fields 139 | hawk '.posts[] | map(.title, .content | length)' posts.json 140 | ``` 141 | 142 | ### Example: User Data Processing 143 | 144 | ```json 145 | { 146 | "users": [ 147 | { 148 | "name": "alice", 149 | "skills": ["python", "rust"], 150 | "projects": ["web-app", "cli-tool"], 151 | "department": "engineering" 152 | } 153 | ] 154 | } 155 | ``` 156 | 157 | ```bash 158 | # Process multiple fields simultaneously 159 | hawk --format json '.users[] | map(.name, .department | upper)' users.json 160 | 161 | # Result 162 | { 163 | "users": [ 164 | { 165 | "name": "ALICE", // ← converted 166 | "skills": ["python", "rust"], 167 | "projects": ["web-app", "cli-tool"], 168 | "department": "ENGINEERING" // ← converted 169 | } 170 | ] 171 | } 172 | ``` 173 | 174 | ## Practical Examples 175 | 176 | ### Log File Processing 177 | 178 | ```bash 179 | # Extract timestamps from logs 180 | hawk -t '. | map(. | split(" ")[0])' app.log 181 | 182 | # Find unique IP addresses 183 | hawk -t '. | map(. | split(" ")[0]) | unique' access.log 184 | 185 | # Extract error messages 186 | hawk -t '. | select(. | contains("ERROR")) | map(. | split(": ")[1])' error.log 187 | ``` 188 | 189 | ### Data Cleaning 190 | 191 | ```bash 192 | # Normalize email addresses 193 | hawk '.users[] | map(.email | lower | trim)' users.csv 194 | 195 | # Clean phone numbers 196 | hawk '.contacts[] | map(.phone | replace("-", "") | replace("(", "") | replace(")", ""))' contacts.json 197 | 198 | # Standardize names 199 | hawk '.people[] | map(.name | trim | upper)' people.csv 200 | ``` 201 | 202 | ### CSV Processing 203 | 204 | ```bash 205 | # Extract specific columns from CSV-like text 206 | hawk -t '. | map(. | split(",")[1])' data.txt 207 | 208 | # Process headers and data separately 209 | hawk -t '.[0] | split(",")' data.txt # headers 210 | hawk -t '.[1:] | map(. | split(",")[2])' data.txt # data column 211 | ``` 212 | 213 | ### Docker/Container Logs 214 | 215 | ```bash 216 | # Extract container names 217 | hawk -t '. | map(. | split(" ")[1]) | unique' docker.log 218 | 219 | # Get timestamps and services 220 | hawk -t '. | map(. replace("T", " ")) | map(. | split(" ")[0:2] | map(. | join("-"))' docker.log 221 | 222 | # Filter by service and extract messages 223 | hawk -t '. | select(. | contains("web_server")) | map(. | split(" ")[3:] | join(" "))' docker.log 224 | ``` 225 | 226 | ## Performance Tips 227 | 228 | ### Efficient Patterns 229 | 230 | ```bash 231 | # ✅ Good: Filter first, then transform 232 | hawk '. | select(. | contains("ERROR")) | map(. 
| upper)' logs.txt 233 | 234 | # ❌ Avoid: Transform everything, then filter 235 | hawk '. | map(. | upper) | select(. | contains("ERROR"))' logs.txt 236 | ``` 237 | 238 | ### Memory Considerations 239 | 240 | ```bash 241 | # ✅ Process in chunks for large files 242 | hawk '. | select(. | length > 100) | map(. | substring(0, 50))' large.txt 243 | 244 | # ✅ Use specific operations instead of general ones 245 | hawk '. | map(. | split(" ")[0])' data.txt # Better than complex regex 246 | ``` 247 | 248 | ### Text Format Detection 249 | 250 | ```bash 251 | # ✅ Use --text flag for ambiguous files 252 | hawk -t '. | map(. | split(" ")[0])' structured.log 253 | 254 | # ✅ Especially important for logs that might be detected as YAML 255 | hawk --text '. | select(. | contains("GC"))' gc.log 256 | ``` 257 | 258 | ## Error Handling 259 | 260 | ### Common Issues 261 | 262 | ```bash 263 | # Array index out of bounds → returns empty string 264 | "a,b" | split(",")[5] → "" 265 | 266 | # Missing fields → error (use select to filter first) 267 | hawk '.users[] | select(.email) | map(.email | lower)' users.json 268 | ``` 269 | 270 | ### Debugging Tips 271 | 272 | ```bash 273 | # Check data structure first 274 | hawk '. | info' unknown-data.json 275 | 276 | # Test operations step by step 277 | hawk '. | map(. | split(" "))' data.txt # Step 1: split 278 | hawk '. | map(. | split(" ")[0])' data.txt # Step 2: index access 279 | ``` 280 | 281 | ## Chaining Operations 282 | 283 | ### Pipeline Examples 284 | 285 | ```bash 286 | # Complex text processing pipeline 287 | hawk -t '. | select(. | length > 10) | map(. | trim | upper | substring(0, 20))' text.txt 288 | 289 | # Multi-step data cleaning 290 | hawk '.users[] | map(.email | lower | trim) | select(. | ends_with(".com"))' users.json 291 | 292 | # Log analysis workflow 293 | hawk -t '. | select(. | contains("ERROR")) | map(. | split("][")[1] | split(" ")[0]) | unique | sort' app.log 294 | ``` 295 | 296 | --- 297 | 298 | **Next Steps:** 299 | 300 | - [Data Analysis Guide](data-analysis.md) - Statistical operations and aggregation 301 | - [Log Analysis Examples](examples/log-analysis.md) - Real-world log processing 302 | - [Query Language Reference](query-language.md) - Complete syntax guide 303 | -------------------------------------------------------------------------------- /docs/text-processing.md: -------------------------------------------------------------------------------- 1 | # Text Processing Guide 2 | 3 | Comprehensive guide to text and log processing with hawk. 4 | 5 | ## 📖 Table of Contents 6 | 7 | - [Text Processing Fundamentals](#text-processing-fundamentals) 8 | - [Log File Analysis](#log-file-analysis) 9 | - [String Operations](#string-operations) 10 | - [Pattern Matching and Filtering](#pattern-matching-and-filtering) 11 | - [Text Transformation](#text-transformation) 12 | - [Data Extraction](#data-extraction) 13 | - [Advanced Text Patterns](#advanced-text-patterns) 14 | - [Performance Optimization](#performance-optimization) 15 | - [Real-world Examples](#real-world-examples) 16 | 17 | ## Text Processing Fundamentals 18 | 19 | ### Understanding Text Mode 20 | 21 | hawk processes text files line-by-line when using the `--text` flag, treating each line as a string element in an array. 22 | 23 | ```bash 24 | # Force text processing mode 25 | hawk --text 'query' file.txt 26 | hawk -t 'query' file.txt 27 | 28 | # When to use --text flag 29 | hawk -t '. | select(. 
| contains("ERROR"))' app.log 30 | ``` 31 | 32 | ### Text vs Structured Data 33 | 34 | | Mode | Use Case | Example | 35 | | ------------------ | --------------------- | ------------------------------------------ | 36 | | **Auto-detect** | JSON, YAML, CSV files | `hawk '.field' data.json` | 37 | | **Text mode (-t)** | Log files, plain text | `hawk -t '. \| contains("ERROR")' app.log` | 38 | | **Force text** | Ambiguous files | `hawk -t 'query' structured.log` | 39 | 40 | ### Basic Text Processing Workflow 41 | 42 | ```bash 43 | 1. Read text file → hawk -t '. | length' file.txt 44 | 2. Filter lines → hawk -t '. | select(condition)' file.txt 45 | 3. Transform text → hawk -t '. | map(operation)' file.txt 46 | 4. Extract data → hawk -t '. | map(. | split(" ")[0])' file.txt 47 | 5. Analyze results → hawk -t '. | unique | count' file.txt 48 | ``` 49 | 50 | ## Log File Analysis 51 | 52 | ### Common Log Formats 53 | 54 | #### Application Logs 55 | 56 | ``` 57 | 2024-01-15 09:00:01 INFO Application started successfully 58 | 2024-01-15 09:00:02 DEBUG Loading configuration from /etc/app/config.json 59 | 2024-01-15 09:01:23 ERROR Failed to process user request: connection timeout 60 | 2024-01-15 09:01:24 INFO Retrying connection... 61 | 2024-01-15 09:02:45 WARN High memory usage detected: 85% 62 | ``` 63 | 64 | **Analysis Examples:** 65 | 66 | ```bash 67 | # Find all error messages 68 | hawk -t '. | select(. | contains("ERROR"))' app.log 69 | 70 | # Extract timestamps 71 | hawk -t '. | map(. | split(" ")[0])' app.log 72 | 73 | # Count log levels 74 | hawk -t '. | map(. | split(" ")[2]) | unique | count' app.log 75 | 76 | # Get unique dates 77 | hawk -t '. | map(. | substring(0, 10)) | unique | sort' app.log 78 | ``` 79 | 80 | #### Docker Container Logs 81 | 82 | ``` 83 | 2024-01-15T10:30:45Z web_server GET /api/users 200 0.045s 84 | 2024-01-15T10:30:46Z database_service Connected to MySQL 85 | 2024-01-15T10:30:47Z web_server POST /api/auth 401 0.012s 86 | 2024-01-15T10:30:48Z cache_service Redis cache miss for key:user:123 87 | ``` 88 | 89 | **Analysis Examples:** 90 | 91 | ```bash 92 | # Extract service names 93 | hawk -t '. | map(. | split(" ")[1]) | unique' docker.log 94 | 95 | # HTTP status code analysis 96 | hawk -t '. | select(. | contains("GET|POST")) | map(. | split(" ")[4]) | group_by(.) | count' docker.log 97 | 98 | # Service activity timeline 99 | hawk -t '. | map(. replace("T", " ")) | map(. | split(" ")[0:2] | map(. | join("-"))' docker.log 100 | ``` 101 | 102 | #### Nginx/Apache Access Logs 103 | 104 | ``` 105 | 192.168.1.100 - - [15/Jan/2024:10:30:45 +0000] "GET /api/users HTTP/1.1" 200 1234 "https://example.com" "Mozilla/5.0" 106 | 192.168.1.101 - - [15/Jan/2024:10:30:46 +0000] "POST /api/auth HTTP/1.1" 401 567 "-" "curl/7.68.0" 107 | 192.168.1.102 - - [15/Jan/2024:10:30:47 +0000] "GET /favicon.ico HTTP/1.1" 404 0 "https://example.com" "Mozilla/5.0" 108 | ``` 109 | 110 | **Analysis Examples:** 111 | 112 | ```bash 113 | # Extract IP addresses 114 | hawk -t '. | map(. | split(" ")[0]) | unique | sort' access.log 115 | 116 | # Status code distribution 117 | hawk -t '. | map(. | split("\"")[2] | split(" ")[1]) | group_by(.) | count' access.log 118 | 119 | # Find 4xx and 5xx errors 120 | hawk -t '. | select(. | contains("\" 4") | . | contains("\" 5"))' access.log 121 | 122 | # Top user agents 123 | hawk -t '. | map(. | split("\"")[5]) | group_by(.) | count | sort' access.log 124 | 125 | # Requests per hour 126 | hawk -t '. | map(. | split("[")[1] | split(":")[1]) | group_by(.) 
| count' access.log
127 | ```
128 | 
129 | #### System Logs (syslog format)
130 | 
131 | ```
132 | Jan 15 10:30:45 server01 kernel: [12345.678] TCP: Peer 192.168.1.100:443 unexpectedly shrunk window
133 | Jan 15 10:30:46 server01 sshd[1234]: Accepted password for user from 192.168.1.200 port 22 ssh2
134 | Jan 15 10:30:47 server01 systemd[1]: Started User Manager for UID 1000.
135 | ```
136 | 
137 | **Analysis Examples:**
138 | 
139 | ```bash
140 | # Extract service names (token 5: "Jan 15 10:30:45 host service[pid]:")
141 | hawk -t '. | map(. | split(" ")[4] | split("[")[0]) | unique' syslog
142 | 
143 | # SSH connection analysis
144 | hawk -t '. | select(. | contains("sshd")) | map(. | split(" from ")[1] | split(" ")[0]) | unique' syslog
145 | 
146 | # System service events
147 | hawk -t '. | select(. | contains("systemd")) | map(. | split(": ")[1])' syslog
148 | 
149 | # Error pattern analysis
150 | hawk -t '. | select(. | contains("error|Error|ERROR")) | map(. | split(" ")[4])' syslog
151 | ```
152 | 
153 | ## String Operations
154 | 
155 | ### Basic String Transformations
156 | 
157 | ```bash
158 | # Case conversion
159 | hawk -t '. | map(. | upper)' text.txt # Convert to uppercase
160 | hawk -t '. | map(. | lower)' text.txt # Convert to lowercase
161 | 
162 | # Whitespace management
163 | hawk -t '. | map(. | trim)' text.txt # Remove leading/trailing spaces
164 | hawk -t '. | map(. | trim_start)' text.txt # Remove leading spaces only
165 | hawk -t '. | map(. | trim_end)' text.txt # Remove trailing spaces only
166 | 
167 | # String analysis
168 | hawk -t '. | map(. | length)' text.txt # Get line lengths
169 | hawk -t '. | map(. | reverse)' text.txt # Reverse each line
170 | ```
171 | 
172 | ### Advanced String Operations
173 | 
174 | ```bash
175 | # Text replacement
176 | hawk -t '. | map(. | replace("old", "new"))' text.txt
177 | 
178 | # Substring extraction
179 | hawk -t '. | map(. | substring(0, 10))' text.txt # First 10 characters
180 | hawk -t '. | map(. | substring(5))' text.txt # From 5th character to end
181 | 
182 | # String splitting with array access (NEW!)
183 | hawk -t '. | map(. | split(" ")[0])' text.txt # First word
184 | hawk -t '. | map(. | split(",")[2])' csv_like.txt # Third CSV column
185 | hawk -t '. | map(. | split(":")[1] | trim)' key_value.txt # Extract values
186 | ```
187 | 
188 | ### Multiple Field String Operations (NEW!)
189 | 
190 | ```bash
191 | # Apply same operation to multiple fields in structured data
192 | hawk '.users[] | map(.first_name, .last_name | upper)' users.json
193 | hawk '.posts[] | map(.title, .content | trim)' posts.json
194 | hawk '.logs[] | map(.message, .details | lower)' structured_logs.json
195 | ```
196 | 
197 | ## Pattern Matching and Filtering
198 | 
199 | ### Basic Pattern Matching
200 | 
201 | ```bash
202 | # Contains pattern
203 | hawk -t '. | select(. | contains("ERROR"))' logs.txt
204 | 
205 | # Case-insensitive search
206 | hawk -t '. | select(. | upper | contains("ERROR"))' logs.txt
207 | 
208 | # Multiple patterns (OR logic, pipe-delimited)
209 | hawk -t '. | select(. | contains("ERROR|WARN"))' logs.txt
210 | 
211 | # Exclude patterns
212 | hawk -t '. | select(not (. | contains("INFO")))' logs.txt
213 | ```
214 | 
215 | ### Advanced Pattern Matching
216 | 
217 | ```bash
218 | # String starts/ends with pattern
219 | hawk -t '. | select(. | starts_with("[INFO]"))' logs.txt
220 | hawk -t '. | select(. | ends_with(".log"))' filenames.txt
221 | 
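# Drop empty lines before further processing — a small sketch using the same
# length comparisons shown below
hawk -t '. | select(. | length > 0)' logs.txt

222 | # Length-based filtering
223 | hawk -t '. | select(. | length > 100)' long_lines.txt
224 | hawk -t '. | select(. 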
| length < 20)' short_lines.txt
225 | 
226 | # Complex conditions
227 | hawk -t '. | select(. | contains("HTTP") && . | contains("200"))' access.log
228 | hawk -t '. | select(. | starts_with("2024") && . | contains("ERROR"))' timestamped.log
229 | ```
230 | 
231 | ### Log Level Filtering
232 | 
233 | ```bash
234 | # Standard log levels
235 | hawk -t '. | select(. | contains("DEBUG"))' app.log
236 | hawk -t '. | select(. | contains("INFO"))' app.log
237 | hawk -t '. | select(. | contains("WARN"))' app.log
238 | hawk -t '. | select(. | contains("ERROR"))' app.log
239 | hawk -t '. | select(. | contains("FATAL"))' app.log
240 | 
241 | # Severity filtering (ERROR and above)
242 | hawk -t '. | select(. | contains("ERROR|FATAL"))' app.log
243 | 
244 | # Time-based filtering
245 | hawk -t '. | select(. | starts_with("2024-01-15"))' dated_logs.txt
246 | hawk -t '. | select(. | substring(11, 13) == "09")' hourly_filter.log # 9 AM only
247 | ```
248 | 
249 | ## Text Transformation
250 | 
251 | ### Data Extraction
252 | 
253 | ```bash
254 | # Extract timestamps from logs
255 | hawk -t '. | map(. | split(" ")[0])' timestamped.log
256 | 
257 | # Extract IP addresses from access logs
258 | hawk -t '. | map(. | split(" ")[0])' access.log
259 | 
260 | # Extract HTTP methods
261 | hawk -t '. | map(. | split("\"")[1] | split(" ")[0])' access.log
262 | 
263 | # Extract file paths
264 | hawk -t '. | map(. | split("/")[-1])' file_paths.txt
265 | 
266 | # Extract domains from URLs
267 | hawk -t '. | map(. | split("://")[1] | split("/")[0])' urls.txt
268 | ```
269 | 
270 | ### CSV-like Text Processing
271 | 
272 | ```bash
273 | # Process comma-separated values
274 | hawk -t '. | map(. | split(",")[0])' csv_data.txt # First column
275 | hawk -t '. | map(. | split(",")[1] | trim)' csv_data.txt # Second column, trimmed
276 | 
277 | # Process tab-separated values
278 | hawk -t '. | map(. | split("\t")[2])' tsv_data.txt
279 | 
280 | # Process pipe-separated values
281 | hawk -t '. | map(. | split("|")[1])' pipe_data.txt
282 | 
283 | # Join processed data back
284 | hawk -t '. | map(. | split(",") | join(" | "))' csv_data.txt
285 | ```
286 | 
287 | ### Key-Value Extraction
288 | 
289 | ```bash
290 | # Extract values from key=value format
291 | hawk -t '. | select(. | contains("=")) | map(. | split("=")[1])' config.txt
292 | 
293 | # Extract specific keys
294 | hawk -t '. | select(. | starts_with("user=")) | map(. | split("=")[1])' key_value.txt
295 | 
296 | # Process JSON-like logs
297 | hawk -t '. | select(. | contains("\"level\"")) | map(. | split("\"level\":\"")[1] | split("\"")[0])' json_logs.txt
298 | ```
299 | 
300 | ## Data Extraction
301 | 
302 | ### Email and URL Extraction
303 | 
304 | ```bash
305 | # Extract email addresses (basic pattern)
306 | hawk -t '. | select(. | contains("@")) | map(. | split(" ") | select(. | contains("@")))' text.txt
307 | 
308 | # Extract domains from emails
309 | hawk -t '. | select(. | contains("@")) | map(. | split("@")[1])' emails.txt
310 | 
311 | # Extract URLs (basic pattern)
312 | hawk -t '. | select(. | contains("http")) | map(. | split(" ") | select(. | starts_with("http")))' text.txt
313 | ```
314 | 
315 | ### Numeric Data Extraction
316 | 
317 | ```bash
318 | # Extract numbers from text
319 | hawk -t '. | map(. | split(" ") | select(. | length > 0) | select(. | replace("[^0-9.]", "") | length > 0))' mixed.txt
320 | 
321 | # Extract percentages
322 | hawk -t '. | select(. | contains("%")) | map(. 
| split("%")[0] | split(" ") | last)' percentages.txt 323 | 324 | # Extract timestamps (ISO format) 325 | hawk -t '. | map(. | substring(0, 19))' iso_timestamps.txt 326 | 327 | # Extract version numbers 328 | hawk -t '. | select(. | contains("v")) | map(. | split("v")[1] | split(" ")[0])' versions.txt 329 | ``` 330 | 331 | ### Error Code and Status Extraction 332 | 333 | ```bash 334 | # HTTP status codes 335 | hawk -t '. | map(. | split(" ")[8])' access.log # Standard access log format 336 | hawk -t '. | select(. | split(" ")[8] >= "400")' access.log # 4xx and 5xx errors 337 | 338 | # Exit codes from logs 339 | hawk -t '. | select(. | contains("exit code")) | map(. | split("exit code ")[1] | split(" ")[0])' process.log 340 | 341 | # Error numbers 342 | hawk -t '. | select(. | contains("errno")) | map(. | split("errno=")[1] | split(" ")[0])' system.log 343 | ``` 344 | 345 | ## Advanced Text Patterns 346 | 347 | ### Multi-line Log Processing 348 | 349 | ```bash 350 | # Process stack traces (keep related lines together) 351 | hawk -t '. | select(. | contains("Exception") | . | starts_with("\t"))' java.log 352 | 353 | # Group by session ID 354 | hawk -t '. | select(. | contains("session=")) | map(. | split("session=")[1] | split(" ")[0])' session.log 355 | 356 | # Process multiline JSON logs (single line JSON per log entry) 357 | hawk -t '. | select(. | starts_with("{") && . | ends_with("}"))' json.log 358 | ``` 359 | 360 | ### Performance Log Analysis 361 | 362 | ```bash 363 | # Response time analysis 364 | hawk -t '. | select(. | contains("ms")) | map(. | split(" ") | select(. | ends_with("ms")) | replace("ms", ""))' perf.log 365 | 366 | # Memory usage tracking 367 | hawk -t '. | select(. | contains("memory")) | map(. | split("memory: ")[1] | split(" ")[0])' memory.log 368 | 369 | # CPU usage extraction 370 | hawk -t '. | select(. | contains("cpu")) | map(. | split("cpu: ")[1] | split("%")[0])' cpu.log 371 | ``` 372 | 373 | ### Security Log Analysis 374 | 375 | ```bash 376 | # Failed login attempts 377 | hawk -t '. | select(. | contains("failed login")) | map(. | split("from ")[1] | split(" ")[0])' auth.log 378 | 379 | # Suspicious activity patterns 380 | hawk -t '. | select(. | contains("SUSPICIOUS") | . | contains("ANOMALY"))' security.log 381 | 382 | # IP-based analysis 383 | hawk -t '. | map(. | split(" ")[0]) | group_by(.) | count | sort' network.log 384 | ``` 385 | 386 | ## Performance Optimization 387 | 388 | ### Efficient Text Processing 389 | 390 | ```bash 391 | # ✅ Filter early in pipeline 392 | hawk -t '. | select(. | contains("ERROR")) | map(. | split(" ")[0])' large.log 393 | 394 | # ❌ Process everything then filter 395 | hawk -t '. | map(. | split(" ")[0]) | select(. | contains("ERROR"))' large.log 396 | 397 | # ✅ Use specific operations 398 | hawk -t '. | map(. | split(" ")[0])' log.txt 399 | 400 | # ❌ Use complex operations when simple ones suffice 401 | hawk -t '. | map(. | replace(...) | substring(...) | split(...))' log.txt 402 | ``` 403 | 404 | ### Memory Management 405 | 406 | ```bash 407 | # ✅ Process in chunks for large files 408 | hawk -t '.[0:10000] | select(. | contains("ERROR"))' huge.log 409 | 410 | # ✅ Sample large datasets 411 | hawk -t '.[::100] | map(. | split(" ")[0])' massive.log # Every 100th line 412 | ``` 413 | 414 | ### Slicing for Performance (NEW!) 415 | 416 | ```bash 417 | # ✅ Process recent logs only 418 | hawk -t '.[-1000:] | select(. 
| contains("ERROR"))' app.log # Last 1000 lines 419 | 420 | # ✅ Sample from different time periods 421 | hawk -t '.[0:100] | .[500:600] | .[1000:1100]' distributed_sample.log 422 | 423 | # ✅ Top/bottom analysis 424 | hawk -t '. | sort | .[0:10]' values.txt # Bottom 10 425 | hawk -t '. | sort | .[-10:]' values.txt # Top 10 426 | ``` 427 | 428 | ## Real-world Examples 429 | 430 | ### Complete Log Analysis Workflows 431 | 432 | #### Web Server Log Analysis 433 | 434 | ```bash 435 | # 1. Overview of traffic 436 | hawk -t '. | count' access.log # Total requests 437 | hawk -t '. | map(. | split(" ")[0]) | unique | count' access.log # Unique IPs 438 | 439 | # 2. Error analysis 440 | hawk -t '. | select(. | contains("\" 4") | . | contains("\" 5")) | count' access.log 441 | 442 | # 3. Top pages 443 | hawk -t '. | map(. | split("\"")[1] | split(" ")[1]) | group_by(.) | count | sort' access.log 444 | 445 | # 4. Traffic patterns by hour 446 | hawk -t '. | map(. | split("[")[1] | split(":")[1]) | group_by(.) | count' access.log 447 | 448 | # 5. User agent analysis 449 | hawk -t '. | map(. | split("\"")[5]) | group_by(.) | count | .[-10:]' access.log 450 | ``` 451 | 452 | #### Application Error Investigation 453 | 454 | ```bash 455 | # 1. Error trend analysis 456 | hawk -t '. | select(. | contains("ERROR")) | map(. | substring(0, 13)) | group_by(.) | count' app.log 457 | 458 | # 2. Error types 459 | hawk -t '. | select(. | contains("ERROR")) | map(. | split("ERROR ")[1] | split(":")[0]) | count' app.log 460 | 461 | # 3. Related warnings 462 | hawk -t '. | select(. | contains("WARN")) | select(. | contains("connection\|timeout\|retry"))' app.log 463 | ``` 464 | 465 | #### System Performance Monitoring 466 | 467 | ```bash 468 | # 1. Memory usage trends 469 | hawk -t '. | select(. | contains("memory")) | map(. | split("memory: ")[1] | split(" ")[0])' system.log 470 | 471 | # 2. Disk space monitoring 472 | hawk -t '. | select(. | contains("disk")) | map(. | split("usage: ")[1] | split("%")[0])' disk.log 473 | 474 | # 3. Network activity 475 | hawk -t '. | select(. | contains("bytes")) | map(. | split("bytes: ")[1] | split(" ")[0])' network.log 476 | 477 | # 4. Process analysis 478 | hawk -t '. | select(. | contains("process")) | map(. | split(" ")[3]) | group_by(.) | count' process.log 479 | ``` 480 | 481 | #### Security Log Analysis 482 | 483 | ```bash 484 | # 1. Authentication failures 485 | hawk -t '. | select(. | contains("authentication failed")) | map(. | split("from ")[1] | split(" ")[0]) | group_by(.) | count' security.log 486 | 487 | # 2. Unusual access patterns 488 | hawk -t '. | select(. | contains("GET") && . | contains("admin")) | map(. | split(" ")[0])' access.log 489 | 490 | # 3. Brute force detection 491 | hawk -t '. | select(. | contains("failed password")) | map(. | split(" ")[0]) | group_by(.) | count | select(. > 10)' auth.log 492 | 493 | # 4. Geographic analysis (if GeoIP data available) 494 | hawk -t '. | map(. | split(" ")[0]) | unique' access.log # Extract IPs for GeoIP lookup 495 | ``` 496 | 497 | #### DevOps Pipeline Logs 498 | 499 | ```bash 500 | # 1. Build success/failure rates 501 | hawk -t '. | select(. | contains("BUILD")) | map(. | split("BUILD ")[1] | split(" ")[0]) | group_by(.) | count' ci.log 502 | 503 | # 2. Deployment timing 504 | hawk -t '. | select(. | contains("DEPLOY")) | map(. | split(" ")[0])' deploy.log 505 | 506 | # 3. Test results analysis 507 | hawk -t '. | select(. | contains("TEST")) | map(. | split("TEST ")[1]) | group_by(.) 
| count' test.log
508 | 
509 | # 4. Resource usage during builds
510 | hawk -t '. | select(. | contains("CPU|MEMORY")) | map(. | split(": ")[1])' resource.log
511 | ```
512 | 
513 | ## Best Practices
514 | 
515 | ### Text Processing Guidelines
516 | 
517 | 1. **Always use --text flag for log files**: Prevents YAML/JSON misdetection
518 | 2. **Filter early**: Apply `select()` before expensive operations
519 | 3. **Use specific extractors**: Prefer `split()[index]` over complex regex alternatives
520 | 4. **Handle edge cases**: Check for empty results and missing fields
521 | 5. **Sample large files**: Use slicing for performance with huge datasets
522 | 
523 | ### Common Patterns
524 | 
525 | ```bash
526 | # ✅ Good: Extract then analyze
527 | hawk -t '. | map(. | split(" ")[0]) | unique | count' log.txt
528 | 
529 | # ✅ Good: Filter then transform
530 | hawk -t '. | select(. | contains("ERROR")) | map(. | split(" ")[1])' log.txt
531 | 
532 | # ✅ Good: Use appropriate data types
533 | hawk -t '. | select(. | length > 0) | map(. | trim)' text.txt
534 | 
535 | # ✅ Good: Handle missing data
536 | hawk -t '. | select(. | contains(" ")) | map(. | split(" ")[1])' structured.txt
537 | ```
538 | 
539 | ### Debugging Text Processing
540 | 
541 | ```bash
542 | # Check data structure
543 | hawk -t '. | .[0:5]' file.txt # Sample first 5 lines
544 | 
545 | # Validate operations step by step
546 | hawk -t '. | map(. | split(" "))' file.txt # Step 1: split
547 | hawk -t '. | map(. | split(" ")[0])' file.txt # Step 2: index access
548 | 
549 | # Check for empty or problematic lines
550 | hawk -t '. | select(. | length == 0)' file.txt # Find empty lines
551 | hawk -t '. | select(. | contains("\t"))' file.txt # Find tab characters
552 | ```
553 | 
554 | ---
555 | 
556 | **Related Documentation:**
557 | 
558 | - [Getting Started](getting-started.md) - Basic hawk introduction
559 | - [String Operations](string-operations.md) - Detailed string processing reference
560 | - [Query Language](query-language.md) - Complete syntax guide
561 | - [Log Analysis Examples](examples/log-analysis.md) - Real-world log processing cases
562 | 
--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
1 | # Hawk Examples
2 | 
3 | This directory contains sample data and query examples to learn and explore all hawk features.
4 | 
5 | ## 🚀 Quick Start
6 | 
7 | Lightweight sample data (~200KB total) ready to use immediately after git clone:
8 | 
9 | ```bash
10 | cd examples/small
11 | 
12 | # Explore data structure
13 | hawk '. | info' customers.json
14 | 
15 | # Basic filtering
16 | hawk '.[] | select(.status == "active")' customers.json
17 | 
18 | # New feature: NOT operator
19 | hawk '.[] | select(not (.segment == "enterprise"))' customers.json
20 | 
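# New feature: negative indexing (v0.2.2) — a quick sketch: grab the last record
hawk '.[-1]' customers.json

21 | # New feature: OR operator
22 | hawk -t '. | select(. 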
| contains("ERROR|WARN"))' application.log 23 | 24 | # New feature: Array slicing 25 | hawk '.[]' customers.json | hawk '.[0:3]' 26 | ``` 27 | 28 | ## 📁 Dataset Overview 29 | 30 | ### small/ - Lightweight Learning Data 31 | 32 | | File | Size | Records | Format | Use Case | 33 | | ---------------------- | ---- | -------- | ------ | ---------------------------------------- | 34 | | `customers.json` | ~2KB | 10 | JSON | Customer management, basic queries | 35 | | `orders.csv` | ~1KB | 25 | CSV | Sales analysis, JOIN operations | 36 | | `products.yaml` | ~1KB | 8 | YAML | Product catalog, price analysis | 37 | | `employees.json` | ~2KB | 15 | JSON | HR data, grouping operations | 38 | | `ec2_instances.json` | ~2KB | 5 | JSON | AWS resources, infrastructure monitoring | 39 | | `user_behavior.json` | ~3KB | 20 | JSON | Analytics, statistical processing | 40 | | `survey_responses.csv` | ~2KB | 30 | CSV | Survey analysis, aggregation | 41 | | `application.log` | ~3KB | 50 lines | TEXT | Log analysis, error extraction | 42 | | `nginx_access.log` | ~2KB | 30 lines | TEXT | Web server logs, IP analysis | 43 | | `urls.txt` | ~1KB | 20 lines | TEXT | URL processing, domain extraction | 44 | | `error_messages.txt` | ~1KB | 15 lines | TEXT | Error categorization, pattern extraction | 45 | | `nginx.conf` | ~2KB | - | TEXT | Configuration file analysis | 46 | 47 | ## 🎯 Learning Path 48 | 49 | ### Level 1: Basic Operations 50 | 51 | ```bash 52 | # Understanding data structure 53 | hawk '. | info' customers.json 54 | hawk '.[] | count' customers.json 55 | 56 | # Simple filtering 57 | hawk '.[] | select(.country == "USA")' customers.json 58 | hawk '.products[] | select(.price > 100)' products.yaml 59 | ``` 60 | 61 | ### Level 2: Aggregation and Grouping 62 | 63 | ```bash 64 | # Aggregation functions 65 | hawk '.[] | sum(.lifetime_value)' customers.json 66 | hawk '.products[] | avg(.price)' products.yaml 67 | 68 | # Grouping 69 | hawk '.[] | group_by(.country) | count' customers.json 70 | hawk '.[] | group_by(.department) | avg(.salary)' employees.json 71 | ``` 72 | 73 | ### Level 3: New Features (Logical Operations & Slicing) 74 | 75 | ```bash 76 | # NOT operator 77 | hawk '.[] | select(not (.status == "inactive"))' customers.json 78 | hawk -t '. | select(not (. | contains("DEBUG")))' application.log 79 | 80 | # OR operator 81 | hawk '.[] | select(.segment | contains("enterprise|business"))' customers.json 82 | hawk -t '. | select(. | contains("ERROR|FATAL|CRITICAL"))' application.log 83 | 84 | # Array slicing 85 | hawk '.[0:5]' customers.json # First 5 records 86 | ``` 87 | 88 | ### Level 4: Complex Text Processing 89 | 90 | ```bash 91 | # Log analysis 92 | hawk -t '. | map(. | split(" ")[0:3] | join(" "))' application.log 93 | 94 | # URL processing 95 | hawk -t '. | map(. | split("://")[1] | split("/")[0])' urls.txt 96 | hawk -t '. | select(not (. | starts_with("https://")))' urls.txt 97 | 98 | # Configuration file analysis 99 | hawk -t '. | select(not (. | starts_with("#"))) | select(. | contains("="))' nginx.conf 100 | ``` 101 | 102 | ### Level 5: Advanced Queries 103 | 104 | ```bash 105 | # Multiple condition combinations 106 | hawk '.[] | select(.status == "active") | select(not (.segment == "test")) | group_by(.country) | count' customers.json 107 | 108 | # String operations with complex logic 109 | hawk -t '. | select(. | contains("ERROR|WARN")) | map(. 
| split(" ")[0:2] | join(" ")) | unique' application.log 110 | 111 | # Slicing with aggregation 112 | hawk '.[0:10] | avg(.duration_seconds)' user_behavior.json 113 | ``` 114 | 115 | ## 🛠️ Larger Datasets 116 | 117 | After mastering the basics with small sample data, practice with larger datasets: 118 | 119 | ```bash 120 | # Generate large sample datasets (1000-10000 records) 121 | ./scripts/generate_large.sh 122 | 123 | # Download real-world open datasets 124 | ./scripts/download_datasets.sh 125 | 126 | # Practice with generated data 127 | hawk '.[] | group_by(.country) | count' large/customers_large.json 128 | ``` 129 | 130 | ## 📊 Practical Use Cases 131 | 132 | ### Business Analytics 133 | 134 | ```bash 135 | # Sales trends (by month) 136 | hawk '.[] | map(.order_date | split("-")[0:2] | join("-")) | group_by(.) | sum(.price)' orders.csv 137 | ``` 138 | 139 | ### Infrastructure Monitoring 140 | 141 | ```bash 142 | # Identify high-load instances 143 | hawk '.[] | select(.cpu_utilization > 80)' ec2_instances.json 144 | 145 | # Time-series error log analysis 146 | hawk -t '. | select(. | contains("ERROR")) | map(. | split(" ")[0:2] | join(" ")) | group_by(.) | count' application.log 147 | ``` 148 | 149 | ### Data Cleaning 150 | 151 | ```bash 152 | # Filter out invalid data 153 | hawk '.[] | select(not (.email | contains("test|demo|temp"))) | select(.lifetime_value > 0)' customers.json 154 | ``` 155 | 156 | ## 🔧 Scripts 157 | 158 | ### scripts/generate_large.sh 159 | 160 | Generate larger sample datasets: 161 | 162 | - `--size N`: Specify number of records to generate 163 | - `--type TYPE`: Specify data type to generate 164 | - `--format FORMAT`: Specify output format 165 | 166 | ### scripts/download_datasets.sh 167 | 168 | Download real-world open datasets: 169 | 170 | - GitHub API responses 171 | - Public API sample data 172 | - Real log file examples 173 | 174 | See [scripts/README.md](scripts/README.md) for detailed documentation. 175 | 176 | ## 🎓 Next Steps 177 | 178 | 1. **Master the basics**: Try all features with small/ data 179 | 2. **Practice with real data**: Use scripts/ to generate larger datasets 180 | 3. **Apply to your projects**: Use hawk with your own data files 181 | 182 | ## 💡 Tips 183 | 184 | - **Performance**: Apply filters early for large datasets 185 | - **Debugging**: Use `| info` to inspect data structure 186 | - **Incremental building**: Build complex queries step by step 187 | - **Formatting**: Use `--format table` for readable output 188 | 189 | ## 🤝 Contributing 190 | 191 | New sample data and query examples are welcome! 192 | 193 | --- 194 | 195 | **Related Documentation:** 196 | 197 | - [Query Language Reference](../docs/query-language.md) 198 | - [Getting Started Guide](../docs/getting-started.md) 199 | - [String Operations](../docs/string-operations.md) 200 | -------------------------------------------------------------------------------- /examples/scripts/README.md: -------------------------------------------------------------------------------- 1 | # Sample Data Generation Scripts 2 | 3 | This directory contains scripts to generate larger sample datasets for hawk learning and testing. 4 | 5 | ## 📋 Scripts Overview 6 | 7 | ### generate_large.sh 8 | 9 | Generate larger sample datasets (1,000-10,000 records) for performance testing and advanced learning. 
188 | 
189 | ## 🤝 Contributing
190 | 
191 | New sample data and query examples are welcome!
192 | 
193 | ---
194 | 
195 | **Related Documentation:**
196 | 
197 | - [Query Language Reference](../docs/query-language.md)
198 | - [Getting Started Guide](../docs/getting-started.md)
199 | - [String Operations](../docs/string-operations.md)
200 | 
--------------------------------------------------------------------------------
/examples/scripts/README.md:
--------------------------------------------------------------------------------
1 | # Sample Data Generation Scripts
2 | 
3 | This directory contains scripts to generate larger sample datasets for hawk learning and testing.
4 | 
5 | ## 📋 Scripts Overview
6 | 
7 | ### generate_large.sh
8 | 
9 | Generate larger sample datasets (1,000-10,000 records) for performance testing and advanced learning.
10 | 
11 | **Usage:**
12 | 
13 | ```bash
14 | # Generate all datasets with default settings
15 | ./scripts/generate_large.sh
16 | 
17 | # Specify custom size
18 | ./scripts/generate_large.sh --size 5000
19 | 
20 | # Generate specific data types only
21 | ./scripts/generate_large.sh --type customers
22 | ./scripts/generate_large.sh --type logs
23 | ./scripts/generate_large.sh --type metrics
24 | 
25 | # Specify output directory
26 | ./scripts/generate_large.sh --output examples/large
27 | ```
28 | 
29 | **Options:**
30 | 
31 | - `--size N`: Number of records to generate (default: 1000)
32 | - `--type TYPE`: Data type to generate (customers, orders, employees, logs, metrics, all)
33 | - `--output DIR`: Output directory (default: examples/large)
34 | - `--format FORMAT`: Output format (json, csv, yaml)
35 | - `--parallel [N]`: Enable parallel generation for faster processing (optionally with N workers, as in `--parallel 4`)
36 | - `--help`: Show help message
37 | 
38 | ### download_datasets.sh
39 | 
40 | Download real-world open datasets for practicing with actual data patterns.
41 | 
42 | **Usage:**
43 | 
44 | ```bash
45 | # Download all available datasets
46 | ./scripts/download_datasets.sh
47 | 
48 | # Download specific datasets only
49 | ./scripts/download_datasets.sh --dataset github
50 | ./scripts/download_datasets.sh --dataset logs
51 | ```
52 | 
53 | **Available Datasets:**
54 | 
55 | - `github`: GitHub API responses (repositories, users, issues)
56 | - `apis`: Public API samples (REST, GraphQL responses)
57 | - `logs`: Real application logs from open source projects
58 | - `configs`: Configuration file samples (nginx, docker, k8s)
59 | 
60 | ### cleanup.sh
61 | 
62 | Clean up generated data files and temporary files.
63 | 
64 | ```bash
65 | # Clean all generated files with confirmation
66 | ./scripts/cleanup.sh
67 | 
68 | # Clean specific targets
69 | ./scripts/cleanup.sh --target large
70 | ./scripts/cleanup.sh --target external
71 | 
72 | # Preview what would be deleted (dry run)
73 | ./scripts/cleanup.sh --dry-run
74 | ```
75 | 
76 | ## 🎯 Generated Data
77 | 
78 | ### Large Dataset (examples/large/)
79 | 
80 | | File                       | Records              | Size (approx) | Use Case                              |
81 | | -------------------------- | -------------------- | ------------- | ------------------------------------- |
82 | | `customers_large.json`     | 1,000-10,000         | 200KB-2MB     | Customer analysis, segmentation       |
83 | | `orders_large.csv`         | 5,000-50,000         | 500KB-5MB     | Sales analysis, trend analysis        |
84 | | `employees_large.json`     | 500-5,000            | 100KB-1MB     | HR analysis, organizational analysis  |
85 | | `logs_large.log`           | 10,000-100,000 lines | 1MB-10MB      | Log analysis, error analysis          |
86 | | `metrics_large.csv`        | 1,440-14,400         | 100KB-1MB     | Time series analysis, monitoring data |
87 | | `user_behavior_large.json` | 10,000-100,000       | 2MB-20MB      | Behavior analysis, A/B testing        |
88 | 
89 | ### External Dataset (examples/external/)
90 | 
91 | | File                    | Source                | Size   | Use Case                            |
92 | | ----------------------- | --------------------- | ------ | ----------------------------------- |
93 | | `github_repos.json`     | GitHub API            | ~50KB  | Real API response processing        |
94 | | `public_apis.json`      | Public APIs Directory | ~100KB | API data analysis                   |
95 | | `real_logs.log`         | Open source projects  | ~500KB | Real log analysis                   |
96 | | `config_samples.tar.gz` | Configuration samples | ~200KB | Config analysis, pattern extraction |
97 | 
98 | ## 🚀 Usage Examples
99 | 
100 | ### Performance Testing with Large Data
101 | 
102 | ```bash
103 | # Generate large customer dataset for aggregation testing
104 | ./scripts/generate_large.sh --type customers --size 10000
105 | hawk '.[] | group_by(.country) | count' large/customers_large.json
106 | 
107 | # Test filtering performance with large logs
108 | ./scripts/generate_large.sh --type logs --size 100000
109 | hawk -t '. | select(. | contains("ERROR|CRITICAL")) | count' large/logs_large.log
110 | 
111 | # Time series analysis with metrics data
112 | ./scripts/generate_large.sh --type metrics --size 14400
113 | hawk '.[] | group_by(.hour) | avg(.cpu_usage)' large/metrics_large.csv
114 | ```
115 | 
116 | ### Practicing with Real Data
117 | 
118 | ```bash
119 | # Practice with GitHub data
120 | ./scripts/download_datasets.sh --dataset github
121 | hawk '.items[] | select(.language == "Rust") | group_by(.owner.login) | count' external/github_repos.json
122 | 
123 | # Real log error analysis
124 | ./scripts/download_datasets.sh --dataset logs
125 | hawk -t '. | select(. | contains("ERROR|FATAL")) | map(. | split(" ")[0:3] | join(" ")) | unique' external/real_logs.log
126 | ```
127 | 
128 | ## ⚙️ Script Details
129 | 
130 | ### Data Generation Algorithms
131 | 
132 | **customers_large.json:**
133 | 
134 | - Random but realistic names, emails, countries
135 | - Realistic company names and segment distribution
136 | - Regional purchasing power reflected in lifetime_value
137 | - Distribution adjusted by country population
138 | 
139 | **logs_large.log:**
140 | 
141 | - Log lines generated in natural chronological order
142 | - Realistic ERROR/WARN/INFO ratios (1:5:20; see the check below)
143 | - Correlated IP addresses, URLs, response codes
144 | - Mimics real application patterns
145 | 
146 | **metrics_large.csv:**
147 | 
148 | - 24 hours × days of time series data
149 | - Correlated CPU, memory, and network metrics
150 | - Load variation patterns by time of day
151 | - Weekend/weekday differences reflected
152 | 
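The generated level ratios can be sanity-checked directly with hawk (a quick check, assuming the generated lines share the `DATE TIME LEVEL [module] message` layout of the bundled small `application.log`):

```bash
# Tally lines per log level; field index 2 is the level in this layout
hawk -t '. | map(. | split(" ")[2]) | group_by(.) | count' large/logs_large.log
```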
153 | ### Performance Considerations
154 | 
155 | - **Parallel Generation**: Simultaneous generation of multiple files for speed
156 | - **Memory Efficiency**: Streaming generation keeps memory usage flat even for very large outputs
157 | - **Progress Display**: Real-time progress indication
158 | - **Error Handling**: Proper handling of generation failures
159 | 
160 | ## 🛠️ Customization
161 | 
162 | ### Custom Data Generation
163 | 
164 | Create your own data patterns based on the scripts:
165 | 
166 | ```bash
167 | # Copy template
168 | cp scripts/generate_large.sh scripts/generate_custom.sh
169 | 
170 | # Implement custom data patterns
171 | # Add generate_custom_dataset() function
172 | ```
173 | 
174 | ### Configuration File
175 | 
176 | Customize generation parameters with `scripts/config.yaml`:
177 | 
178 | ```yaml
179 | generation:
180 |   default_size: 1000
181 |   output_dir: "examples/large"
182 | 
183 | datasets:
184 |   customers:
185 |     countries: ["USA", "Canada", "UK", "Germany", "Japan"]
186 |     segments: ["enterprise", "business", "small"]
187 | 
188 |   logs:
189 |     log_levels: ["ERROR", "WARN", "INFO", "DEBUG"]
190 |     level_ratios: [1, 5, 20, 50]
191 | ```
192 | 
193 | ## 🧹 Cleanup
194 | 
195 | ### Safe Deletion
196 | 
197 | ```bash
198 | # Deletion with confirmation
199 | ./scripts/cleanup.sh --interactive
200 | 
201 | # Specific files only
202 | ./scripts/cleanup.sh --pattern "*.log"
203 | 
204 | # Size-limited deletion
205 | ./scripts/cleanup.sh --size-limit 10MB
206 | ```
207 | 
208 | ### Automated Cleanup
209 | 
210 | ```bash
211 | # Periodic cleanup of old files (cron example)
212 | 0 2 * * * /path/to/scripts/cleanup.sh --older-than 7days
213 | ```
214 | 
215 | ## 💡 Tips
216 | 
217 | 1. **Progressive Learning**: Learn in order: small → large → external
218 | 2. **Memory Monitoring**: Watch memory usage when generating large datasets
219 | 3. **Disk Space**: 10,000 records require approximately 10-50MB
220 | 4. **Parallel Processing**: Test performance with multiple concurrent queries (see the sketch below)
221 | 
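Tip 4 can be exercised with plain shell job control (a minimal sketch; the input files are those produced by the generation commands above, and the output paths are illustrative):

```bash
# Run independent hawk queries concurrently, then wait for all of them
hawk '.[] | group_by(.country) | count' large/customers_large.json > /tmp/by_country.txt &
hawk -t '. | select(. | contains("ERROR|FATAL")) | count' large/logs_large.log > /tmp/error_count.txt &
hawk '.[] | group_by(.hour) | avg(.cpu_usage)' large/metrics_large.csv > /tmp/cpu_by_hour.txt &
wait
```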
222 | ## 🐛 Troubleshooting
223 | 
224 | ### Common Issues
225 | 
226 | **Slow generation:**
227 | 
228 | ```bash
229 | # Enable parallel generation
230 | ./scripts/generate_large.sh --parallel 4
231 | 
232 | # Adjust size
233 | ./scripts/generate_large.sh --size 1000
234 | ```
235 | 
236 | **Out of memory errors:**
237 | 
238 | ```bash
239 | # Use streaming mode
240 | ./scripts/generate_large.sh --streaming
241 | 
242 | # Adjust batch size
243 | ./scripts/generate_large.sh --batch-size 100
244 | ```
245 | 
246 | **Download failures:**
247 | 
248 | ```bash
249 | # Enable retry
250 | ./scripts/download_datasets.sh --retry 3
251 | 
252 | # Set proxy
253 | export https_proxy=http://proxy.company.com:8080
254 | ./scripts/download_datasets.sh
255 | ```
256 | 
257 | ---
258 | 
259 | **Related Documentation:**
260 | 
261 | - [Main Examples README](../README.md)
262 | - [Query Language Reference](../../docs/query-language.md)
263 | 
--------------------------------------------------------------------------------
/examples/scripts/cleanup.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # cleanup.sh - Clean up generated datasets and temporary files
4 | # Usage: ./cleanup.sh [options]
5 | 
6 | set -euo pipefail
7 | 
8 | # Default configuration
9 | DEFAULT_TARGET="all"
10 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
11 | BASE_DIR="$(dirname "$SCRIPT_DIR")"
12 | 
13 | # Colors for output
14 | RED='\033[0;31m'
15 | GREEN='\033[0;32m'
16 | YELLOW='\033[1;33m'
17 | BLUE='\033[0;34m'
18 | NC='\033[0m' # No Color
19 | 
20 | # Configuration variables
21 | TARGET="$DEFAULT_TARGET"
22 | INTERACTIVE=false
23 | DRY_RUN=false
24 | FORCE=false
25 | 
26 | # Helper functions
27 | log_info() {
28 |     echo -e "${BLUE}[INFO]${NC} $1"
29 | }
30 | 
31 | log_success() {
32 |     echo -e "${GREEN}[SUCCESS]${NC} $1"
33 | }
34 | 
35 | log_warning() {
36 |     echo -e "${YELLOW}[WARNING]${NC} $1"
37 | }
38 | 
39 | log_error() {
40 |     echo -e "${RED}[ERROR]${NC} $1"
41 | }
42 | 
43 | show_help() {
44 |     cat </dev/null || true)
140 |             ;;
141 |         "all")
142 |             if [[ -d "$BASE_DIR/large" ]]; then
143 |                 targets+=("$BASE_DIR/large")
144 |             fi
145 |             if [[ -d "$BASE_DIR/external" ]]; then
146 |                 targets+=("$BASE_DIR/external")
147 |             fi
148 |             # Add temp files
149 |             while IFS= read -r -d '' file; do
150 |                 targets+=("$file")
151 |             done < <(find "$BASE_DIR" -type f \( -name "*.tmp" -o -name "*.temp" -o -name "*.swp" -o -name "*.bak" \) -print0 2>/dev/null || true)
152 |             ;;
153 |         *)
154 |             log_error "Unknown target: $target"
155 |             exit 1
156 |             ;;
157 |     esac
158 | 
159 |     if [[ ${#targets[@]} -gt 0 ]]; then printf '%s\n' "${targets[@]}"; fi # guard: expanding an empty array trips set -u on older bash, and a blank line would give mapfile a phantom entry
160 | }
161 | 
162 | # Calculate total size of targets
163 | calculate_total_size() {
164 |     local targets=("$@")
165 |     local total_size=0
166 | 
167 |     for target in "${targets[@]}"; do
168 |         if [[ -e "$target" ]]; then
169 |             if [[ -d "$target" ]]; then
170 |                 local size=$(du -sb "$target" 2>/dev/null | cut -f1 || echo 0)
171 |             else
172 |                 local size=$(stat -c%s "$target" 2>/dev/null || echo 0)
173 |             fi
174 |             total_size=$((total_size + size))
175 |         fi
176 |     done
177 | 
178 |     # Convert bytes to human readable
179 |     if [[ $total_size -eq 0 ]]; then
180 |         echo "0B"
181 |     elif [[ $total_size -lt 1024 ]]; then
182 |         echo "${total_size}B"
183 |     elif [[ $total_size -lt 1048576 ]]; then
184 |         echo "$((total_size / 1024))KB"
185 |     elif [[ $total_size -lt 1073741824 ]]; then
186 |         echo "$((total_size / 1048576))MB"
187 |     else
188 |         echo 
"$((total_size / 1073741824))GB" 189 | fi 190 | } 191 | 192 | # Show what will be deleted 193 | show_cleanup_preview() { 194 | local targets=("$@") 195 | 196 | if [[ ${#targets[@]} -eq 0 ]]; then 197 | log_info "No files to clean up" 198 | return 0 199 | fi 200 | 201 | log_info "Files and directories to be deleted:" 202 | 203 | for target in "${targets[@]}"; do 204 | if [[ -e "$target" ]]; then 205 | local relative_path="${target#$BASE_DIR/}" 206 | if [[ -d "$target" ]]; then 207 | local size=$(du -sh "$target" 2>/dev/null | cut -f1 || echo "unknown") 208 | local count=$(find "$target" -type f | wc -l) 209 | echo " 📁 $relative_path/ ($size, $count files)" 210 | else 211 | local size=$(du -h "$target" 2>/dev/null | cut -f1 || echo "unknown") 212 | echo " 📄 $relative_path ($size)" 213 | fi 214 | fi 215 | done 216 | 217 | local total_size=$(calculate_total_size "${targets[@]}") 218 | echo 219 | log_info "Total size to be freed: $total_size" 220 | } 221 | 222 | # Confirm deletion 223 | confirm_deletion() { 224 | if [[ "$FORCE" == "true" ]]; then 225 | return 0 226 | fi 227 | 228 | echo 229 | if [[ "$DRY_RUN" == "true" ]]; then 230 | log_info "This is a dry run - no files will actually be deleted" 231 | return 0 232 | fi 233 | 234 | read -p "$(echo -e "${YELLOW}Do you want to proceed with deletion? (y/N): ${NC}")" -n 1 -r 235 | echo 236 | if [[ $REPLY =~ ^[Yy]$ ]]; then 237 | return 0 238 | else 239 | log_info "Cleanup cancelled by user" 240 | return 1 241 | fi 242 | } 243 | 244 | # Interactive confirmation for each item 245 | confirm_item() { 246 | local item="$1" 247 | local relative_path="${item#$BASE_DIR/}" 248 | 249 | read -p "$(echo -e "${YELLOW}Delete $relative_path? (y/N/q): ${NC}")" -n 1 -r 250 | echo 251 | case $REPLY in 252 | [Yy]) 253 | return 0 254 | ;; 255 | [Qq]) 256 | log_info "Cleanup cancelled by user" 257 | exit 0 258 | ;; 259 | *) 260 | return 1 261 | ;; 262 | esac 263 | } 264 | 265 | # Perform cleanup 266 | perform_cleanup() { 267 | local targets=("$@") 268 | local deleted_count=0 269 | local total_freed=0 270 | 271 | for target in "${targets[@]}"; do 272 | if [[ ! -e "$target" ]]; then 273 | continue 274 | fi 275 | 276 | if [[ "$INTERACTIVE" == "true" && "$DRY_RUN" != "true" ]]; then 277 | if ! 
confirm_item "$target"; then 278 | continue 279 | fi 280 | fi 281 | 282 | local relative_path="${target#$BASE_DIR/}" 283 | 284 | if [[ "$DRY_RUN" == "true" ]]; then 285 | log_info "[DRY RUN] Would delete: $relative_path" 286 | ((deleted_count++)) 287 | else 288 | # Calculate size before deletion 289 | local size=0 290 | if [[ -d "$target" ]]; then 291 | size=$(du -sb "$target" 2>/dev/null | cut -f1 || echo 0) 292 | else 293 | size=$(stat -c%s "$target" 2>/dev/null || echo 0) 294 | fi 295 | 296 | # Perform deletion 297 | if rm -rf "$target" 2>/dev/null; then 298 | log_success "Deleted: $relative_path" 299 | ((deleted_count++)) 300 | total_freed=$((total_freed + size)) 301 | else 302 | log_error "Failed to delete: $relative_path" 303 | fi 304 | fi 305 | done 306 | 307 | if [[ "$DRY_RUN" == "true" ]]; then 308 | log_info "Dry run completed: $deleted_count items would be deleted" 309 | else 310 | local freed_readable=$(echo $total_freed | awk '{ 311 | if ($1 >= 1073741824) printf "%.1fGB", $1/1073741824 312 | else if ($1 >= 1048576) printf "%.1fMB", $1/1048576 313 | else if ($1 >= 1024) printf "%.1fKB", $1/1024 314 | else printf "%dB", $1 315 | }') 316 | log_success "Cleanup completed: $deleted_count items deleted, $freed_readable freed" 317 | fi 318 | } 319 | 320 | # Validate target 321 | validate_target() { 322 | case "$TARGET" in 323 | "large" | "external" | "generated" | "temp" | "all") ;; 324 | *) 325 | log_error "Invalid target: $TARGET" 326 | log_error "Valid targets: large, external, generated, temp, all" 327 | exit 1 328 | ;; 329 | esac 330 | } 331 | 332 | # Safety check to prevent accidental deletion of important files 333 | safety_check() { 334 | local targets=("$@") 335 | 336 | for target in "${targets[@]}"; do 337 | # Ensure we never delete the small samples directory 338 | if [[ "$target" == *"/small"* ]] || [[ "$target" == *"/scripts"* ]]; then 339 | log_error "Safety check failed: attempting to delete protected directory: $target" 340 | log_error "This script will never delete small sample data or scripts" 341 | exit 1 342 | fi 343 | 344 | # Ensure we're only deleting within the examples directory 345 | if [[ "$target" != "$BASE_DIR"* ]]; then 346 | log_error "Safety check failed: attempting to delete outside examples directory: $target" 347 | exit 1 348 | fi 349 | done 350 | } 351 | 352 | # Main function 353 | main() { 354 | log_info "Starting cleanup process..." 
355 | log_info "Target: $TARGET" 356 | 357 | # Get targets to clean 358 | mapfile -t targets < <(get_cleanup_targets "$TARGET") 359 | 360 | if [[ ${#targets[@]} -eq 0 ]]; then 361 | log_info "Nothing to clean up for target: $TARGET" 362 | return 0 363 | fi 364 | 365 | # Safety checks 366 | safety_check "${targets[@]}" 367 | 368 | # Show preview 369 | show_cleanup_preview "${targets[@]}" 370 | 371 | # Confirm and perform cleanup 372 | if confirm_deletion; then 373 | perform_cleanup "${targets[@]}" 374 | fi 375 | } 376 | 377 | # Display banner 378 | show_banner() { 379 | echo "🧹 Hawk Examples Cleanup Tool" 380 | echo "==============================" 381 | echo 382 | } 383 | 384 | # Parse arguments and run 385 | parse_args "$@" 386 | validate_target 387 | 388 | # Show banner unless in quiet mode 389 | show_banner 390 | 391 | # Conflict checking 392 | if [[ "$INTERACTIVE" == "true" && "$FORCE" == "true" ]]; then 393 | log_error "Cannot use --interactive and --force together" 394 | exit 1 395 | fi 396 | 397 | if [[ "$DRY_RUN" == "true" && "$FORCE" == "true" ]]; then 398 | log_warning "--force has no effect in dry-run mode" 399 | fi 400 | 401 | # Run main function 402 | main 403 | 404 | log_info "Cleanup process completed" 405 | -------------------------------------------------------------------------------- /examples/small/application.log: -------------------------------------------------------------------------------- 1 | 2024-07-18 09:15:23 INFO [main] Application started successfully on port 8080 2 | 2024-07-18 09:15:24 DEBUG [worker-1] Loading configuration from /etc/app/config.yaml 3 | 2024-07-18 09:15:25 INFO [database] Connected to PostgreSQL database: app_production 4 | 2024-07-18 09:15:26 DEBUG [cache] Redis connection established: localhost:6379 5 | 2024-07-18 09:15:27 INFO [auth] JWT authentication module initialized 6 | 2024-07-18 09:15:28 DEBUG [worker-2] Processing request GET /api/v1/health 7 | 2024-07-18 09:15:29 INFO [metrics] Prometheus metrics endpoint available at /metrics 8 | 2024-07-18 09:15:30 DEBUG [worker-1] Request completed: GET /api/v1/health - 200 OK (15ms) 9 | 2024-07-18 09:16:15 INFO [worker-3] User login successful: user_id=12345, email=alice@company.com 10 | 2024-07-18 09:16:45 DEBUG [worker-2] Processing request POST /api/v1/orders 11 | 2024-07-18 09:16:46 INFO [validation] Order validation passed: order_id=ORD001 12 | 2024-07-18 09:16:47 DEBUG [database] Executing SQL: INSERT INTO orders (id, customer_id, total) VALUES ($1, $2, $3) 13 | 2024-07-18 09:16:48 INFO [worker-2] Order created successfully: order_id=ORD001, total=$299.99 14 | 2024-07-18 09:17:12 WARN [worker-1] Slow query detected: SELECT * FROM products WHERE category='electronics' (2.5s) 15 | 2024-07-18 09:17:30 ERROR [worker-4] Database connection failed: connection timeout after 30s 16 | 2024-07-18 09:17:31 ERROR [retry] Retrying database connection (attempt 1/3) 17 | 2024-07-18 09:17:32 INFO [worker-4] Database connection restored successfully 18 | 2024-07-18 09:18:05 DEBUG [cache] Cache hit for key: user_profile_12345 19 | 2024-07-18 09:18:22 WARN [queue] Queue size approaching limit: 950/1000 messages 20 | 2024-07-18 09:18:45 INFO [worker-5] User logout: user_id=12345, session_duration=2m30s 21 | 2024-07-18 09:19:10 DEBUG [worker-1] Processing request GET /api/v1/products?category=electronics 22 | 2024-07-18 09:19:11 INFO [search] Search query executed: 'electronics', results=25, duration=45ms 23 | 2024-07-18 09:19:33 ERROR [payment] Payment processing failed: card_declined, order_id=ORD002 24 
| 2024-07-18 09:19:34 WARN [notification] Failed to send payment failure email to customer@example.com 25 | 2024-07-18 09:20:01 INFO [scheduler] Starting scheduled job: daily_report_generation 26 | 2024-07-18 09:20:15 DEBUG [worker-3] Processing request PUT /api/v1/users/12345/profile 27 | 2024-07-18 09:20:16 INFO [validation] Profile update validation passed for user_id=12345 28 | 2024-07-18 09:20:17 DEBUG [database] Executing SQL: UPDATE users SET profile_data=$1 WHERE id=$2 29 | 2024-07-18 09:20:45 CRITICAL [security] Multiple failed login attempts detected: IP=192.168.1.100, attempts=5 30 | 2024-07-18 09:20:46 INFO [security] IP address blocked for 1 hour: 192.168.1.100 31 | 2024-07-18 09:21:12 ERROR [external_api] Third-party API call failed: timeout to payments.example.com 32 | 2024-07-18 09:21:13 WARN [circuit_breaker] Circuit breaker opened for payments service 33 | 2024-07-18 09:21:30 DEBUG [worker-2] Processing request DELETE /api/v1/cart/items/123 34 | 2024-07-18 09:21:31 INFO [cart] Item removed from cart: user_id=67890, item_id=123 35 | 2024-07-18 09:22:05 INFO [health_check] All services healthy: database=OK, cache=OK, queue=OK 36 | 2024-07-18 09:22:30 DEBUG [worker-4] Processing request GET /api/v1/analytics/dashboard 37 | 2024-07-18 09:22:31 WARN [performance] High memory usage detected: 85% of available memory 38 | 2024-07-18 09:22:55 ERROR [file_system] Failed to write log file: disk space low (5% remaining) 39 | 2024-07-18 09:23:10 FATAL [storage] Critical disk space shortage: less than 1GB remaining 40 | 2024-07-18 09:23:11 ERROR [alert] Failed to send critical alert: notification service unavailable 41 | 2024-07-18 09:23:30 INFO [maintenance] Starting automated cleanup: removing old log files 42 | 2024-07-18 09:23:45 DEBUG [cleanup] Removed 150 old log files, freed 2.5GB disk space 43 | 2024-07-18 09:24:01 INFO [recovery] System recovered from disk space issue 44 | 2024-07-18 09:24:15 DEBUG [worker-1] Processing request GET /api/v1/reports/sales 45 | 2024-07-18 09:24:16 INFO [reports] Sales report generated: period=2024-07, total_orders=1250 46 | 2024-07-18 09:24:45 WARN [rate_limit] Rate limit exceeded for API key: api_key_abc123 (100 req/min) 47 | 2024-07-18 09:25:10 DEBUG [worker-3] Processing request POST /api/v1/feedback 48 | 2024-07-18 09:25:11 INFO [feedback] Customer feedback submitted: rating=5, order_id=ORD003 49 | 2024-07-18 09:25:30 ERROR [email] SMTP server connection failed: authentication error 50 | 2024-07-18 09:25:45 INFO [backup] Database backup completed successfully: backup_20240718_092545.sql 51 | 2024-07-18 09:26:00 DEBUG [worker-2] Processing request GET /api/v1/inventory/status 52 | 2024-07-18 09:26:15 WARN [inventory] Low stock alert: product_id=PROD001, quantity=5 remaining 53 | 2024-07-18 09:26:30 INFO [scheduler] Scheduled job completed: daily_report_generation (duration=6m30s) 54 | -------------------------------------------------------------------------------- /examples/small/customers.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "id": "CUST001", 4 | "name": "Alice Johnson", 5 | "email": "alice.johnson@techcorp.com", 6 | "company": "TechCorp Inc", 7 | "country": "USA", 8 | "status": "active", 9 | "lifetime_value": 15240.5, 10 | "segment": "enterprise", 11 | "registration_date": "2023-01-15" 12 | }, 13 | { 14 | "id": "CUST002", 15 | "name": "Bob Smith", 16 | "email": "bob.smith@startup.io", 17 | "company": "StartupXYZ", 18 | "country": "Canada", 19 | "status": "active", 20 | 
"lifetime_value": 8500.25, 21 | "segment": "business", 22 | "registration_date": "2023-03-22" 23 | }, 24 | { 25 | "id": "CUST003", 26 | "name": "Carol Wang", 27 | "email": "carol.wang@example.com", 28 | "company": "Global Solutions", 29 | "country": "USA", 30 | "status": "inactive", 31 | "lifetime_value": 12750.75, 32 | "segment": "enterprise", 33 | "registration_date": "2022-11-08" 34 | }, 35 | { 36 | "id": "CUST004", 37 | "name": "David Brown", 38 | "email": "david.brown@smallbiz.com", 39 | "company": "Small Business Co", 40 | "country": "UK", 41 | "status": "active", 42 | "lifetime_value": 3200.0, 43 | "segment": "small", 44 | "registration_date": "2024-01-10" 45 | }, 46 | { 47 | "id": "CUST005", 48 | "name": "Elena Rodriguez", 49 | "email": "elena.rodriguez@innovate.es", 50 | "company": "Innovate España", 51 | "country": "Spain", 52 | "status": "active", 53 | "lifetime_value": 9875.5, 54 | "segment": "business", 55 | "registration_date": "2023-06-14" 56 | }, 57 | { 58 | "id": "CUST006", 59 | "name": "Frank Chen", 60 | "email": "frank.chen@testcompany.com", 61 | "company": "Test Company", 62 | "country": "USA", 63 | "status": "suspended", 64 | "lifetime_value": 0.0, 65 | "segment": "test", 66 | "registration_date": "2024-02-01" 67 | }, 68 | { 69 | "id": "CUST007", 70 | "name": "Grace Kim", 71 | "email": "grace.kim@fintech.kr", 72 | "company": "FinTech Korea", 73 | "country": "South Korea", 74 | "status": "active", 75 | "lifetime_value": 18920.25, 76 | "segment": "enterprise", 77 | "registration_date": "2022-09-30" 78 | }, 79 | { 80 | "id": "CUST008", 81 | "name": "Henry Taylor", 82 | "email": "henry.taylor@demo.org", 83 | "company": "Demo Organization", 84 | "country": "Australia", 85 | "status": "inactive", 86 | "lifetime_value": 1500.0, 87 | "segment": "demo", 88 | "registration_date": "2024-03-15" 89 | }, 90 | { 91 | "id": "CUST009", 92 | "name": "Irene Foster", 93 | "email": "irene.foster@enterprise.de", 94 | "company": "Enterprise Germany", 95 | "country": "Germany", 96 | "status": "active", 97 | "lifetime_value": 22450.75, 98 | "segment": "enterprise", 99 | "registration_date": "2023-04-20" 100 | }, 101 | { 102 | "id": "CUST010", 103 | "name": "Jack Wilson", 104 | "email": "jack.wilson@temp.example", 105 | "company": "Temporary Inc", 106 | "country": "Canada", 107 | "status": "deleted", 108 | "lifetime_value": 500.0, 109 | "segment": "temp", 110 | "registration_date": "2024-01-01" 111 | } 112 | ] 113 | -------------------------------------------------------------------------------- /examples/small/ec2_instances.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "instance_id": "i-0123456789abcdef0", 4 | "instance_type": "t3.medium", 5 | "state": "running", 6 | "availability_zone": "us-west-2a", 7 | "private_ip": "10.0.1.15", 8 | "public_ip": "54.123.45.67", 9 | "security_groups": ["sg-web", "sg-ssh"], 10 | "tags": { 11 | "Name": "web-server-01", 12 | "Environment": "production", 13 | "Team": "backend", 14 | "Application": "web-app" 15 | }, 16 | "launch_time": "2024-01-10T09:30:00Z", 17 | "cpu_utilization": 45.2, 18 | "memory_utilization": 62.8, 19 | "network_in_mb": 125.4, 20 | "network_out_mb": 89.2 21 | }, 22 | { 23 | "instance_id": "i-0fedcba987654321a", 24 | "instance_type": "t3.large", 25 | "state": "running", 26 | "availability_zone": "us-west-2b", 27 | "private_ip": "10.0.2.22", 28 | "public_ip": "54.234.56.78", 29 | "security_groups": ["sg-database", "sg-internal"], 30 | "tags": { 31 | "Name": "database-server-01", 32 | 
"Environment": "production", 33 | "Team": "database", 34 | "Application": "postgres" 35 | }, 36 | "launch_time": "2024-01-05T14:15:00Z", 37 | "cpu_utilization": 78.5, 38 | "memory_utilization": 85.3, 39 | "network_in_mb": 45.7, 40 | "network_out_mb": 67.1 41 | }, 42 | { 43 | "instance_id": "i-0abcdef123456789b", 44 | "instance_type": "t3.small", 45 | "state": "stopped", 46 | "availability_zone": "us-west-2a", 47 | "private_ip": "10.0.1.33", 48 | "public_ip": null, 49 | "security_groups": ["sg-staging"], 50 | "tags": { 51 | "Name": "staging-server-01", 52 | "Environment": "staging", 53 | "Team": "qa", 54 | "Application": "test-app" 55 | }, 56 | "launch_time": "2024-01-15T11:20:00Z", 57 | "cpu_utilization": 0.0, 58 | "memory_utilization": 0.0, 59 | "network_in_mb": 0.0, 60 | "network_out_mb": 0.0 61 | }, 62 | { 63 | "instance_id": "i-0987654321fedcbac", 64 | "instance_type": "t3.xlarge", 65 | "state": "running", 66 | "availability_zone": "us-west-2c", 67 | "private_ip": "10.0.3.44", 68 | "public_ip": "54.345.67.89", 69 | "security_groups": ["sg-processing", "sg-ssh"], 70 | "tags": { 71 | "Name": "batch-processor-01", 72 | "Environment": "production", 73 | "Team": "data", 74 | "Application": "etl" 75 | }, 76 | "launch_time": "2024-01-08T16:45:00Z", 77 | "cpu_utilization": 92.7, 78 | "memory_utilization": 76.4, 79 | "network_in_mb": 234.8, 80 | "network_out_mb": 456.2 81 | }, 82 | { 83 | "instance_id": "i-0456789abcdef012d", 84 | "instance_type": "t3.micro", 85 | "state": "running", 86 | "availability_zone": "us-west-2a", 87 | "private_ip": "10.0.1.55", 88 | "public_ip": "54.456.78.90", 89 | "security_groups": ["sg-monitoring", "sg-ssh"], 90 | "tags": { 91 | "Name": "monitoring-server-01", 92 | "Environment": "production", 93 | "Team": "devops", 94 | "Application": "prometheus" 95 | }, 96 | "launch_time": "2024-01-12T08:00:00Z", 97 | "cpu_utilization": 25.3, 98 | "memory_utilization": 48.9, 99 | "network_in_mb": 15.6, 100 | "network_out_mb": 22.1 101 | } 102 | ] 103 | -------------------------------------------------------------------------------- /examples/small/employees.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "id": "EMP001", 4 | "name": "Sarah Connor", 5 | "email": "sarah.connor@company.com", 6 | "department": "Engineering", 7 | "role": "Senior Developer", 8 | "salary": 95000, 9 | "hire_date": "2022-03-15", 10 | "status": "active", 11 | "location": "San Francisco", 12 | "skills": ["Python", "JavaScript", "AWS", "Docker"], 13 | "manager_id": "EMP003" 14 | }, 15 | { 16 | "id": "EMP002", 17 | "name": "John Doe", 18 | "email": "john.doe@company.com", 19 | "department": "Marketing", 20 | "role": "Marketing Manager", 21 | "salary": 75000, 22 | "hire_date": "2021-08-20", 23 | "status": "active", 24 | "location": "New York", 25 | "skills": ["SEO", "Analytics", "Content Marketing", "Social Media"], 26 | "manager_id": "EMP015" 27 | }, 28 | { 29 | "id": "EMP003", 30 | "name": "Alice Kim", 31 | "email": "alice.kim@company.com", 32 | "department": "Engineering", 33 | "role": "Engineering Manager", 34 | "salary": 120000, 35 | "hire_date": "2020-01-10", 36 | "status": "active", 37 | "location": "San Francisco", 38 | "skills": ["Leadership", "Architecture", "Python", "Kubernetes"], 39 | "manager_id": null 40 | }, 41 | { 42 | "id": "EMP004", 43 | "name": "Bob Johnson", 44 | "email": "bob.johnson@company.com", 45 | "department": "Sales", 46 | "role": "Sales Representative", 47 | "salary": 65000, 48 | "hire_date": "2023-05-12", 49 | "status": 
"active", 50 | "location": "Chicago", 51 | "skills": ["CRM", "Negotiation", "Presentations", "Lead Generation"], 52 | "manager_id": "EMP008" 53 | }, 54 | { 55 | "id": "EMP005", 56 | "name": "Emily Davis", 57 | "email": "emily.davis@company.com", 58 | "department": "Engineering", 59 | "role": "Junior Developer", 60 | "salary": 70000, 61 | "hire_date": "2023-09-01", 62 | "status": "active", 63 | "location": "Remote", 64 | "skills": ["JavaScript", "React", "Node.js", "Git"], 65 | "manager_id": "EMP003" 66 | }, 67 | { 68 | "id": "EMP006", 69 | "name": "Michael Chen", 70 | "email": "michael.chen@company.com", 71 | "department": "DevOps", 72 | "role": "DevOps Engineer", 73 | "salary": 105000, 74 | "hire_date": "2021-11-30", 75 | "status": "active", 76 | "location": "Seattle", 77 | "skills": ["AWS", "Terraform", "Jenkins", "Monitoring"], 78 | "manager_id": "EMP003" 79 | }, 80 | { 81 | "id": "EMP007", 82 | "name": "Lisa Wang", 83 | "email": "lisa.wang@company.com", 84 | "department": "Design", 85 | "role": "UX Designer", 86 | "salary": 80000, 87 | "hire_date": "2022-07-18", 88 | "status": "active", 89 | "location": "Los Angeles", 90 | "skills": ["Figma", "User Research", "Prototyping", "Usability Testing"], 91 | "manager_id": "EMP012" 92 | }, 93 | { 94 | "id": "EMP008", 95 | "name": "David Miller", 96 | "email": "david.miller@company.com", 97 | "department": "Sales", 98 | "role": "Sales Manager", 99 | "salary": 90000, 100 | "hire_date": "2020-06-05", 101 | "status": "active", 102 | "location": "Chicago", 103 | "skills": ["Team Leadership", "Strategy", "CRM", "Analytics"], 104 | "manager_id": null 105 | }, 106 | { 107 | "id": "EMP009", 108 | "name": "Jennifer Garcia", 109 | "email": "jennifer.garcia@company.com", 110 | "department": "HR", 111 | "role": "HR Specialist", 112 | "salary": 60000, 113 | "hire_date": "2023-02-14", 114 | "status": "active", 115 | "location": "Austin", 116 | "skills": ["Recruiting", "Employee Relations", "Benefits", "Compliance"], 117 | "manager_id": "EMP013" 118 | }, 119 | { 120 | "id": "EMP010", 121 | "name": "Robert Taylor", 122 | "email": "robert.taylor@company.com", 123 | "department": "Engineering", 124 | "role": "Data Engineer", 125 | "salary": 98000, 126 | "hire_date": "2022-04-25", 127 | "status": "on_leave", 128 | "location": "Boston", 129 | "skills": ["SQL", "Spark", "Airflow", "Data Modeling"], 130 | "manager_id": "EMP003" 131 | }, 132 | { 133 | "id": "EMP011", 134 | "name": "Amanda Wilson", 135 | "email": "amanda.wilson@company.com", 136 | "department": "Finance", 137 | "role": "Financial Analyst", 138 | "salary": 72000, 139 | "hire_date": "2021-12-10", 140 | "status": "active", 141 | "location": "New York", 142 | "skills": ["Excel", "Financial Modeling", "Reporting", "Analysis"], 143 | "manager_id": "EMP014" 144 | }, 145 | { 146 | "id": "EMP012", 147 | "name": "Kevin Brown", 148 | "email": "kevin.brown@company.com", 149 | "department": "Design", 150 | "role": "Design Manager", 151 | "salary": 95000, 152 | "hire_date": "2020-09-15", 153 | "status": "active", 154 | "location": "Los Angeles", 155 | "skills": ["Design Leadership", "Brand Strategy", "Creative Direction"], 156 | "manager_id": null 157 | }, 158 | { 159 | "id": "EMP013", 160 | "name": "Nancy Rodriguez", 161 | "email": "nancy.rodriguez@company.com", 162 | "department": "HR", 163 | "role": "HR Manager", 164 | "salary": 85000, 165 | "hire_date": "2019-11-20", 166 | "status": "active", 167 | "location": "Austin", 168 | "skills": ["HR Strategy", "Leadership", "Policy Development", "Training"], 169 | 
"manager_id": null 170 | }, 171 | { 172 | "id": "EMP014", 173 | "name": "Steve Anderson", 174 | "email": "steve.anderson@company.com", 175 | "department": "Finance", 176 | "role": "Finance Manager", 177 | "salary": 100000, 178 | "hire_date": "2020-03-08", 179 | "status": "active", 180 | "location": "New York", 181 | "skills": [ 182 | "Financial Planning", 183 | "Budgeting", 184 | "Team Leadership", 185 | "Strategy" 186 | ], 187 | "manager_id": null 188 | }, 189 | { 190 | "id": "EMP015", 191 | "name": "Michelle Lee", 192 | "email": "michelle.lee@company.com", 193 | "department": "Marketing", 194 | "role": "Marketing Director", 195 | "salary": 110000, 196 | "hire_date": "2019-05-30", 197 | "status": "terminated", 198 | "location": "New York", 199 | "skills": [ 200 | "Marketing Strategy", 201 | "Brand Management", 202 | "Digital Marketing", 203 | "Leadership" 204 | ], 205 | "manager_id": null 206 | } 207 | ] 208 | -------------------------------------------------------------------------------- /examples/small/error_messages.txt: -------------------------------------------------------------------------------- 1 | ERROR: Database connection timeout after 30 seconds 2 | WARNING: Memory usage exceeded 80% threshold (current: 85%) 3 | CRITICAL: Disk space below 5% on /var/log partition 4 | INFO: User session expired, redirecting to login page 5 | DEBUG: Cache miss for key user_profile_12345 6 | FATAL: Unable to start application server on port 8080 7 | ERROR: Payment processing failed - card declined 8 | WARNING: Rate limit exceeded for API key abc123 9 | CRITICAL: Security breach detected - multiple failed login attempts 10 | INFO: Scheduled maintenance window starting in 10 minutes 11 | ERROR: Failed to send notification email to user@example.com 12 | WARNING: SSL certificate expires in 7 days 13 | DEBUG: SQL query execution time: 2.5 seconds (slow query) 14 | FATAL: Out of memory - cannot allocate additional heap space 15 | ERROR: External API call failed - timeout to payments.example.com 16 | -------------------------------------------------------------------------------- /examples/small/nginx.conf: -------------------------------------------------------------------------------- 1 | # Main nginx configuration file 2 | user nginx; 3 | worker_processes auto; 4 | error_log /var/log/nginx/error.log warn; 5 | pid /var/run/nginx.pid; 6 | 7 | events { 8 | worker_connections 1024; 9 | use epoll; 10 | multi_accept on; 11 | } 12 | 13 | http { 14 | # Basic settings 15 | include /etc/nginx/mime.types; 16 | default_type application/octet-stream; 17 | 18 | # Logging format 19 | log_format main '$remote_addr - $remote_user [$time_local] "$request" ' 20 | '$status $body_bytes_sent "$http_referer" ' 21 | '"$http_user_agent" "$http_x_forwarded_for"'; 22 | 23 | access_log /var/log/nginx/access.log main; 24 | 25 | # Performance settings 26 | sendfile on; 27 | tcp_nopush on; 28 | tcp_nodelay on; 29 | keepalive_timeout 65; 30 | types_hash_max_size 2048; 31 | 32 | # Gzip compression 33 | gzip on; 34 | gzip_vary on; 35 | gzip_min_length 1024; 36 | gzip_types text/plain text/css application/json application/javascript text/xml; 37 | 38 | # Rate limiting 39 | limit_req_zone $binary_remote_addr zone=api:10m rate=100r/m; 40 | limit_req_zone $binary_remote_addr zone=login:10m rate=5r/m; 41 | 42 | # SSL settings 43 | ssl_protocols TLSv1.2 TLSv1.3; 44 | ssl_prefer_server_ciphers off; 45 | ssl_ciphers ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256; 46 | 47 | # Upstream backend servers 48 | upstream app_backend { 
49 | server 127.0.0.1:8080 max_fails=3 fail_timeout=30s; 50 | server 127.0.0.1:8081 max_fails=3 fail_timeout=30s backup; 51 | } 52 | 53 | upstream api_backend { 54 | least_conn; 55 | server 10.0.1.10:3000 weight=3; 56 | server 10.0.1.11:3000 weight=2; 57 | server 10.0.1.12:3000 weight=1; 58 | } 59 | 60 | # Main server block 61 | server { 62 | listen 80; 63 | listen [::]:80; 64 | server_name example.com www.example.com; 65 | 66 | # Redirect HTTP to HTTPS 67 | return 301 https://$server_name$request_uri; 68 | } 69 | 70 | # HTTPS server block 71 | server { 72 | listen 443 ssl http2; 73 | listen [::]:443 ssl http2; 74 | server_name example.com www.example.com; 75 | 76 | # SSL configuration 77 | ssl_certificate /etc/ssl/certs/example.com.crt; 78 | ssl_certificate_key /etc/ssl/private/example.com.key; 79 | ssl_session_timeout 1d; 80 | ssl_session_cache shared:MozTLS:10m; 81 | ssl_session_tickets off; 82 | 83 | # Security headers 84 | add_header Strict-Transport-Security "max-age=63072000" always; 85 | add_header X-Content-Type-Options "nosniff" always; 86 | add_header X-Frame-Options "DENY" always; 87 | add_header X-XSS-Protection "1; mode=block" always; 88 | 89 | # API endpoints 90 | location /api/ { 91 | limit_req zone=api burst=20 nodelay; 92 | proxy_pass http://api_backend; 93 | proxy_set_header Host $host; 94 | proxy_set_header X-Real-IP $remote_addr; 95 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 96 | proxy_set_header X-Forwarded-Proto $scheme; 97 | proxy_connect_timeout 30s; 98 | proxy_send_timeout 30s; 99 | proxy_read_timeout 30s; 100 | } 101 | 102 | # Authentication endpoints (stricter rate limiting) 103 | location /api/auth/ { 104 | limit_req zone=login burst=5 nodelay; 105 | proxy_pass http://api_backend; 106 | proxy_set_header Host $host; 107 | proxy_set_header X-Real-IP $remote_addr; 108 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 109 | proxy_set_header X-Forwarded-Proto $scheme; 110 | } 111 | 112 | # Static files 113 | location /static/ { 114 | alias /var/www/static/; 115 | expires 1M; 116 | add_header Cache-Control "public, immutable"; 117 | } 118 | 119 | # Images with optimization 120 | location /images/ { 121 | alias /var/www/images/; 122 | expires 7d; 123 | add_header Cache-Control "public"; 124 | } 125 | 126 | # Health check endpoint 127 | location /health { 128 | access_log off; 129 | return 200 "healthy\n"; 130 | add_header Content-Type text/plain; 131 | } 132 | 133 | # Admin interface (restricted access) 134 | location /admin/ { 135 | allow 192.168.1.0/24; 136 | allow 10.0.0.0/8; 137 | deny all; 138 | 139 | proxy_pass http://app_backend; 140 | proxy_set_header Host $host; 141 | proxy_set_header X-Real-IP $remote_addr; 142 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 143 | } 144 | 145 | # Default location 146 | location / { 147 | proxy_pass http://app_backend; 148 | proxy_set_header Host $host; 149 | proxy_set_header X-Real-IP $remote_addr; 150 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 151 | proxy_set_header X-Forwarded-Proto $scheme; 152 | } 153 | 154 | # Custom error pages 155 | error_page 404 /404.html; 156 | error_page 500 502 503 504 /50x.html; 157 | 158 | location = /404.html { 159 | internal; 160 | root /var/www/error; 161 | } 162 | 163 | location = /50x.html { 164 | internal; 165 | root /var/www/error; 166 | } 167 | } 168 | } 169 | -------------------------------------------------------------------------------- /examples/small/nginx_access.log: 
-------------------------------------------------------------------------------- 1 | 192.168.1.100 - - [18/Jul/2024:09:15:30 +0000] "GET /api/v1/health HTTP/1.1" 200 15 "-" "curl/7.68.0" 2 | 203.0.113.45 - - [18/Jul/2024:09:16:45 +0000] "POST /api/v1/orders HTTP/1.1" 201 342 "https://shop.example.com/cart" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" 3 | 198.51.100.23 - - [18/Jul/2024:09:17:12 +0000] "GET /api/v1/products?category=electronics HTTP/1.1" 200 2500 "https://shop.example.com/search" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36" 4 | 192.168.1.150 - - [18/Jul/2024:09:17:30 +0000] "GET /api/v1/health HTTP/1.1" 503 0 "-" "HealthCheck/1.0" 5 | 203.0.113.67 - - [18/Jul/2024:09:18:05 +0000] "GET /api/v1/users/12345/profile HTTP/1.1" 200 890 "https://shop.example.com/account" "Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15" 6 | 198.51.100.89 - - [18/Jul/2024:09:18:45 +0000] "PUT /api/v1/users/12345/profile HTTP/1.1" 200 156 "https://shop.example.com/account/edit" "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0" 7 | 192.168.1.200 - - [18/Jul/2024:09:19:10 +0000] "GET /api/v1/products?page=2&limit=20 HTTP/1.1" 200 1800 "https://shop.example.com/products" "Mozilla/5.0 (Linux; Android 11; SM-G991B) AppleWebKit/537.36" 8 | 203.0.113.12 - - [18/Jul/2024:09:19:33 +0000] "POST /api/v1/payments HTTP/1.1" 400 89 "https://shop.example.com/checkout" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" 9 | 198.51.100.156 - - [18/Jul/2024:09:20:01 +0000] "GET /api/v1/analytics/dashboard HTTP/1.1" 200 5600 "https://admin.example.com/dashboard" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36" 10 | 192.168.1.100 - - [18/Jul/2024:09:20:15 +0000] "DELETE /api/v1/cart/items/123 HTTP/1.1" 204 0 "https://shop.example.com/cart" "Mozilla/5.0 (iPad; CPU OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15" 11 | 203.0.113.45 - - [18/Jul/2024:09:20:45 +0000] "POST /api/v1/auth/login HTTP/1.1" 401 67 "https://shop.example.com/login" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" 12 | 203.0.113.45 - - [18/Jul/2024:09:20:46 +0000] "POST /api/v1/auth/login HTTP/1.1" 401 67 "https://shop.example.com/login" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" 13 | 203.0.113.45 - - [18/Jul/2024:09:20:47 +0000] "POST /api/v1/auth/login HTTP/1.1" 401 67 "https://shop.example.com/login" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" 14 | 198.51.100.78 - - [18/Jul/2024:09:21:12 +0000] "GET /api/v1/orders/recent HTTP/1.1" 200 1200 "https://shop.example.com/account/orders" "Mozilla/5.0 (Linux; Android 10; SM-A505F) AppleWebKit/537.36" 15 | 192.168.1.175 - - [18/Jul/2024:09:21:30 +0000] "GET /metrics HTTP/1.1" 200 3400 "-" "Prometheus/2.30.0" 16 | 203.0.113.89 - - [18/Jul/2024:09:22:05 +0000] "GET /api/v1/products/PROD001 HTTP/1.1" 200 678 "https://shop.example.com/products/electronics" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_16_0) AppleWebKit/537.36" 17 | 198.51.100.234 - - [18/Jul/2024:09:22:30 +0000] "POST /api/v1/reviews HTTP/1.1" 201 245 "https://shop.example.com/products/PROD001" "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0" 18 | 192.168.1.100 - - [18/Jul/2024:09:22:55 +0000] "GET /api/v1/search?q=wireless+headphones HTTP/1.1" 200 2100 "https://shop.example.com/search" "Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/605.1.15" 19 | 203.0.113.123 - - [18/Jul/2024:09:23:10 +0000] "GET 
/api/v1/categories HTTP/1.1" 200 890 "https://shop.example.com/" "Mozilla/5.0 (Linux; Android 12; Pixel 6) AppleWebKit/537.36" 20 | 198.51.100.67 - - [18/Jul/2024:09:23:30 +0000] "PUT /api/v1/cart/items/456 HTTP/1.1" 200 123 "https://shop.example.com/cart" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" 21 | 192.168.1.200 - - [18/Jul/2024:09:24:01 +0000] "GET /api/v1/wishlist HTTP/1.1" 200 567 "https://shop.example.com/account/wishlist" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15" 22 | 203.0.113.198 - - [18/Jul/2024:09:24:15 +0000] "POST /api/v1/feedback HTTP/1.1" 201 89 "https://shop.example.com/support/contact" "Mozilla/5.0 (iPad; CPU OS 15_0 like Mac OS X) AppleWebKit/605.1.15" 23 | 198.51.100.111 - - [18/Jul/2024:09:24:45 +0000] "GET /api/v1/products?category=office&sort=price HTTP/1.1" 200 1600 "https://shop.example.com/categories/office" "Mozilla/5.0 (Linux; Android 11; OnePlus 9) AppleWebKit/537.36" 24 | 192.168.1.150 - - [18/Jul/2024:09:25:10 +0000] "GET /api/v1/health HTTP/1.1" 200 15 "-" "HealthCheck/1.0" 25 | 203.0.113.45 - - [18/Jul/2024:09:25:30 +0000] "GET /api/v1/orders/ORD001/status HTTP/1.1" 200 234 "https://shop.example.com/orders/track" "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" 26 | 198.51.100.89 - - [18/Jul/2024:09:26:00 +0000] "POST /api/v1/auth/logout HTTP/1.1" 200 45 "https://shop.example.com/account" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36" 27 | 192.168.1.175 - - [18/Jul/2024:09:26:15 +0000] "GET /metrics HTTP/1.1" 200 3500 "-" "Prometheus/2.30.0" 28 | 203.0.113.67 - - [18/Jul/2024:09:26:30 +0000] "GET /api/v1/notifications HTTP/1.1" 200 450 "https://shop.example.com/account/notifications" "Mozilla/5.0 (iPhone; CPU iPhone OS 14_8 like Mac OS X) AppleWebKit/605.1.15" 29 | 198.51.100.234 - - [18/Jul/2024:09:26:45 +0000] "GET /api/v1/inventory/status HTTP/1.1" 200 1100 "https://admin.example.com/inventory" "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:92.0) Gecko/20100101 Firefox/92.0" 30 | 192.168.1.100 - - [18/Jul/2024:09:27:00 +0000] "GET /api/v1/reports/sales?period=monthly HTTP/1.1" 200 2800 "https://admin.example.com/reports" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36" 31 | -------------------------------------------------------------------------------- /examples/small/orders.csv: -------------------------------------------------------------------------------- 1 | order_id,customer_id,product_id,quantity,price,order_date,status,payment_method 2 | ORD001,CUST001,PROD001,2,299.99,2024-01-15,completed,credit_card 3 | ORD002,CUST002,PROD003,1,599.99,2024-01-15,processing,paypal 4 | ORD003,CUST001,PROD002,3,149.97,2024-01-16,completed,credit_card 5 | ORD004,CUST003,PROD001,1,299.99,2024-01-17,cancelled,bank_transfer 6 | ORD005,CUST004,PROD004,5,249.95,2024-01-18,completed,credit_card 7 | ORD006,CUST005,PROD002,2,99.98,2024-01-19,shipped,paypal 8 | ORD007,CUST001,PROD005,1,899.99,2024-01-20,processing,credit_card 9 | ORD008,CUST007,PROD001,4,1199.96,2024-01-21,completed,bank_transfer 10 | ORD009,CUST002,PROD004,2,99.98,2024-01-22,shipped,paypal 11 | ORD010,CUST009,PROD003,1,599.99,2024-01-23,completed,credit_card 12 | ORD011,CUST004,PROD002,1,49.99,2024-01-24,processing,credit_card 13 | ORD012,CUST005,PROD005,1,899.99,2024-01-25,cancelled,paypal 14 | ORD013,CUST007,PROD004,3,149.97,2024-01-26,completed,bank_transfer 15 | ORD014,CUST001,PROD001,1,299.99,2024-01-27,shipped,credit_card 16 | ORD015,CUST003,PROD002,2,99.98,2024-01-28,processing,credit_card 17 | 
ORD016,CUST009,PROD001,2,599.98,2024-01-29,completed,credit_card 18 | ORD017,CUST002,PROD005,1,899.99,2024-01-30,shipped,paypal 19 | ORD018,CUST004,PROD003,1,599.99,2024-02-01,completed,credit_card 20 | ORD019,CUST007,PROD002,4,199.96,2024-02-02,processing,bank_transfer 21 | ORD020,CUST005,PROD001,1,299.99,2024-02-03,cancelled,paypal 22 | ORD021,CUST001,PROD004,2,99.98,2024-02-04,completed,credit_card 23 | ORD022,CUST009,PROD005,1,899.99,2024-02-05,shipped,credit_card 24 | ORD023,CUST002,PROD002,3,149.97,2024-02-06,processing,paypal 25 | ORD024,CUST003,PROD004,1,49.99,2024-02-07,completed,credit_card 26 | ORD025,CUST007,PROD003,1,599.99,2024-02-08,shipped,bank_transfer 27 | -------------------------------------------------------------------------------- /examples/small/products.yaml: -------------------------------------------------------------------------------- 1 | products: 2 | - id: PROD001 3 | name: "Wireless Headphones" 4 | category: "Electronics" 5 | price: 299.99 6 | in_stock: true 7 | supplier: "AudioTech" 8 | specifications: 9 | color: ["black", "white", "blue"] 10 | warranty_months: 24 11 | weight_grams: 250 12 | tags: ["bluetooth", "noise-canceling", "premium"] 13 | 14 | - id: PROD002 15 | name: "USB-C Cable" 16 | category: "Accessories" 17 | price: 49.99 18 | in_stock: true 19 | supplier: "CableCorp" 20 | specifications: 21 | length_meters: 2 22 | data_speed: "USB 3.0" 23 | warranty_months: 12 24 | tags: ["usb-c", "fast-charging", "durable"] 25 | 26 | - id: PROD003 27 | name: "Laptop Stand" 28 | category: "Office" 29 | price: 599.99 30 | in_stock: false 31 | supplier: "OfficeGear" 32 | specifications: 33 | material: "aluminum" 34 | adjustable: true 35 | max_weight_kg: 10 36 | warranty_months: 36 37 | tags: ["ergonomic", "adjustable", "premium"] 38 | 39 | - id: PROD004 40 | name: "Mouse Pad" 41 | category: "Accessories" 42 | price: 49.99 43 | in_stock: true 44 | supplier: "DeskMate" 45 | specifications: 46 | size: "large" 47 | material: "fabric" 48 | non_slip: true 49 | warranty_months: 6 50 | tags: ["gaming", "large", "non-slip"] 51 | 52 | - id: PROD005 53 | name: "4K Monitor" 54 | category: "Electronics" 55 | price: 899.99 56 | in_stock: true 57 | supplier: "DisplayTech" 58 | specifications: 59 | size_inches: 27 60 | resolution: "3840x2160" 61 | refresh_rate_hz: 60 62 | warranty_months: 24 63 | tags: ["4k", "monitor", "professional"] 64 | 65 | - id: PROD006 66 | name: "Mechanical Keyboard" 67 | category: "Electronics" 68 | price: 199.99 69 | in_stock: false 70 | supplier: "KeyCraft" 71 | specifications: 72 | switch_type: "blue" 73 | backlit: true 74 | wireless: false 75 | warranty_months: 12 76 | tags: ["mechanical", "tactile", "gaming"] 77 | 78 | - id: PROD007 79 | name: "Webcam HD" 80 | category: "Electronics" 81 | price: 149.99 82 | in_stock: true 83 | supplier: "VideoTech" 84 | specifications: 85 | resolution: "1080p" 86 | frame_rate: 30 87 | auto_focus: true 88 | warranty_months: 18 89 | tags: ["hd", "streaming", "auto-focus"] 90 | 91 | - id: PROD008 92 | name: "Desk Organizer" 93 | category: "Office" 94 | price: 79.99 95 | in_stock: true 96 | supplier: "OfficeGear" 97 | specifications: 98 | material: "bamboo" 99 | compartments: 6 100 | eco_friendly: true 101 | warranty_months: 12 102 | tags: ["bamboo", "eco-friendly", "organizer"] 103 | -------------------------------------------------------------------------------- /examples/small/urls.txt: -------------------------------------------------------------------------------- 1 | https://api.example.com/v1/users 2 | 
https://cdn.example.com/images/logo.png 3 | http://legacy.example.com/old-api 4 | https://docs.example.com/guide/getting-started 5 | ftp://files.example.com/downloads/ 6 | https://shop.example.com/products/electronics 7 | http://internal.example.com/health-check 8 | https://payments.stripe.com/api/v1/charges 9 | https://storage.googleapis.com/bucket/file.pdf 10 | http://monitoring.example.com/metrics 11 | https://auth.example.com/oauth/token 12 | https://api.github.com/repos/user/project 13 | http://database.internal/phpmyadmin 14 | https://mail.google.com/mail/ 15 | https://aws.amazon.com/s3/ 16 | http://192.168.1.100:8080/admin 17 | https://kubernetes.io/docs/ 18 | https://registry.hub.docker.com/ 19 | http://jenkins.company.com:8080/job/deploy 20 | https://grafana.monitoring.local/dashboard 21 | -------------------------------------------------------------------------------- /examples/small/user_behavior.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "session_id": "sess_123456789", 4 | "user_id": "user_001", 5 | "timestamp": "2024-07-18T14:30:15Z", 6 | "page": "/products/electronics", 7 | "action": "page_view", 8 | "duration_seconds": 45, 9 | "device": "desktop", 10 | "browser": "Chrome", 11 | "location": { 12 | "country": "USA", 13 | "state": "California", 14 | "city": "San Francisco" 15 | }, 16 | "referrer": "https://google.com/search?q=wireless+headphones" 17 | }, 18 | { 19 | "session_id": "sess_123456789", 20 | "user_id": "user_001", 21 | "timestamp": "2024-07-18T14:31:00Z", 22 | "page": "/products/headphones/wireless", 23 | "action": "click", 24 | "duration_seconds": 120, 25 | "device": "desktop", 26 | "browser": "Chrome", 27 | "location": { 28 | "country": "USA", 29 | "state": "California", 30 | "city": "San Francisco" 31 | }, 32 | "referrer": "/products/electronics" 33 | }, 34 | { 35 | "session_id": "sess_987654321", 36 | "user_id": "user_002", 37 | "timestamp": "2024-07-18T14:32:30Z", 38 | "page": "/cart", 39 | "action": "add_to_cart", 40 | "duration_seconds": 15, 41 | "device": "mobile", 42 | "browser": "Safari", 43 | "location": { 44 | "country": "Canada", 45 | "state": "Ontario", 46 | "city": "Toronto" 47 | }, 48 | "referrer": "/products/accessories/cables" 49 | }, 50 | { 51 | "session_id": "sess_456789123", 52 | "user_id": "user_003", 53 | "timestamp": "2024-07-18T14:33:45Z", 54 | "page": "/checkout", 55 | "action": "purchase", 56 | "duration_seconds": 180, 57 | "device": "desktop", 58 | "browser": "Firefox", 59 | "location": { 60 | "country": "UK", 61 | "state": "England", 62 | "city": "London" 63 | }, 64 | "referrer": "/cart" 65 | }, 66 | { 67 | "session_id": "sess_789123456", 68 | "user_id": "user_004", 69 | "timestamp": "2024-07-18T14:35:00Z", 70 | "page": "/support/contact", 71 | "action": "page_view", 72 | "duration_seconds": 90, 73 | "device": "tablet", 74 | "browser": "Chrome", 75 | "location": { 76 | "country": "Germany", 77 | "state": "Bavaria", 78 | "city": "Munich" 79 | }, 80 | "referrer": "/products/monitors" 81 | }, 82 | { 83 | "session_id": "sess_321654987", 84 | "user_id": "user_005", 85 | "timestamp": "2024-07-18T14:36:15Z", 86 | "page": "/login", 87 | "action": "login_attempt", 88 | "duration_seconds": 30, 89 | "device": "mobile", 90 | "browser": "Safari", 91 | "location": { 92 | "country": "Australia", 93 | "state": "New South Wales", 94 | "city": "Sydney" 95 | }, 96 | "referrer": "/account/dashboard" 97 | }, 98 | { 99 | "session_id": "sess_654987321", 100 | "user_id": "user_006", 101 | "timestamp": 
"2024-07-18T14:37:30Z", 102 | "page": "/search", 103 | "action": "search", 104 | "duration_seconds": 25, 105 | "device": "desktop", 106 | "browser": "Edge", 107 | "location": { 108 | "country": "USA", 109 | "state": "New York", 110 | "city": "New York" 111 | }, 112 | "referrer": "https://bing.com/search?q=laptop+stand" 113 | }, 114 | { 115 | "session_id": "sess_147258369", 116 | "user_id": "user_007", 117 | "timestamp": "2024-07-18T14:38:45Z", 118 | "page": "/products/office/stands", 119 | "action": "page_view", 120 | "duration_seconds": 75, 121 | "device": "desktop", 122 | "browser": "Chrome", 123 | "location": { 124 | "country": "Japan", 125 | "state": "Tokyo", 126 | "city": "Tokyo" 127 | }, 128 | "referrer": "/search" 129 | }, 130 | { 131 | "session_id": "sess_369258147", 132 | "user_id": "user_008", 133 | "timestamp": "2024-07-18T14:40:00Z", 134 | "page": "/wishlist", 135 | "action": "add_to_wishlist", 136 | "duration_seconds": 10, 137 | "device": "mobile", 138 | "browser": "Chrome", 139 | "location": { 140 | "country": "France", 141 | "state": "Île-de-France", 142 | "city": "Paris" 143 | }, 144 | "referrer": "/products/keyboards/mechanical" 145 | }, 146 | { 147 | "session_id": "sess_258147369", 148 | "user_id": "user_009", 149 | "timestamp": "2024-07-18T14:41:15Z", 150 | "page": "/products/webcams", 151 | "action": "page_view", 152 | "duration_seconds": 60, 153 | "device": "desktop", 154 | "browser": "Safari", 155 | "location": { 156 | "country": "Brazil", 157 | "state": "São Paulo", 158 | "city": "São Paulo" 159 | }, 160 | "referrer": "/categories/electronics" 161 | }, 162 | { 163 | "session_id": "sess_741852963", 164 | "user_id": "user_010", 165 | "timestamp": "2024-07-18T14:42:30Z", 166 | "page": "/reviews", 167 | "action": "write_review", 168 | "duration_seconds": 300, 169 | "device": "desktop", 170 | "browser": "Firefox", 171 | "location": { 172 | "country": "India", 173 | "state": "Maharashtra", 174 | "city": "Mumbai" 175 | }, 176 | "referrer": "/products/monitors/4k" 177 | }, 178 | { 179 | "session_id": "sess_852963741", 180 | "user_id": "user_011", 181 | "timestamp": "2024-07-18T14:45:00Z", 182 | "page": "/account/orders", 183 | "action": "page_view", 184 | "duration_seconds": 40, 185 | "device": "mobile", 186 | "browser": "Safari", 187 | "location": { 188 | "country": "Mexico", 189 | "state": "Mexico City", 190 | "city": "Mexico City" 191 | }, 192 | "referrer": "/login" 193 | }, 194 | { 195 | "session_id": "sess_963741852", 196 | "user_id": "user_012", 197 | "timestamp": "2024-07-18T14:46:15Z", 198 | "page": "/help/faq", 199 | "action": "page_view", 200 | "duration_seconds": 150, 201 | "device": "tablet", 202 | "browser": "Chrome", 203 | "location": { 204 | "country": "South Korea", 205 | "state": "Seoul", 206 | "city": "Seoul" 207 | }, 208 | "referrer": "/support/contact" 209 | }, 210 | { 211 | "session_id": "sess_159357426", 212 | "user_id": "user_013", 213 | "timestamp": "2024-07-18T14:48:30Z", 214 | "page": "/logout", 215 | "action": "logout", 216 | "duration_seconds": 5, 217 | "device": "desktop", 218 | "browser": "Chrome", 219 | "location": { 220 | "country": "Netherlands", 221 | "state": "North Holland", 222 | "city": "Amsterdam" 223 | }, 224 | "referrer": "/account/settings" 225 | }, 226 | { 227 | "session_id": "sess_357426159", 228 | "user_id": "user_014", 229 | "timestamp": "2024-07-18T14:50:00Z", 230 | "page": "/newsletter/signup", 231 | "action": "newsletter_signup", 232 | "duration_seconds": 45, 233 | "device": "mobile", 234 | "browser": "Firefox", 235 | 
"location": { 236 | "country": "Italy", 237 | "state": "Lazio", 238 | "city": "Rome" 239 | }, 240 | "referrer": "/home" 241 | }, 242 | { 243 | "session_id": "sess_426159357", 244 | "user_id": "user_015", 245 | "timestamp": "2024-07-18T14:51:30Z", 246 | "page": "/categories/office", 247 | "action": "page_view", 248 | "duration_seconds": 35, 249 | "device": "desktop", 250 | "browser": "Edge", 251 | "location": { 252 | "country": "Russia", 253 | "state": "Moscow", 254 | "city": "Moscow" 255 | }, 256 | "referrer": "/home" 257 | }, 258 | { 259 | "session_id": "sess_591837264", 260 | "user_id": "user_016", 261 | "timestamp": "2024-07-18T14:53:00Z", 262 | "page": "/products/organizers", 263 | "action": "page_view", 264 | "duration_seconds": 80, 265 | "device": "desktop", 266 | "browser": "Chrome", 267 | "location": { 268 | "country": "Spain", 269 | "state": "Madrid", 270 | "city": "Madrid" 271 | }, 272 | "referrer": "/categories/office" 273 | }, 274 | { 275 | "session_id": "sess_837264591", 276 | "user_id": "user_017", 277 | "timestamp": "2024-07-18T14:54:45Z", 278 | "page": "/compare", 279 | "action": "product_compare", 280 | "duration_seconds": 120, 281 | "device": "tablet", 282 | "browser": "Safari", 283 | "location": { 284 | "country": "Argentina", 285 | "state": "Buenos Aires", 286 | "city": "Buenos Aires" 287 | }, 288 | "referrer": "/products/headphones" 289 | }, 290 | { 291 | "session_id": "sess_264591837", 292 | "user_id": "user_018", 293 | "timestamp": "2024-07-18T14:56:30Z", 294 | "page": "/blog/tech-news", 295 | "action": "page_view", 296 | "duration_seconds": 200, 297 | "device": "desktop", 298 | "browser": "Firefox", 299 | "location": { 300 | "country": "Sweden", 301 | "state": "Stockholm", 302 | "city": "Stockholm" 303 | }, 304 | "referrer": "https://twitter.com/company" 305 | }, 306 | { 307 | "session_id": "sess_975318642", 308 | "user_id": "user_019", 309 | "timestamp": "2024-07-18T14:58:15Z", 310 | "page": "/sitemap", 311 | "action": "page_view", 312 | "duration_seconds": 20, 313 | "device": "mobile", 314 | "browser": "Chrome", 315 | "location": { 316 | "country": "Turkey", 317 | "state": "Istanbul", 318 | "city": "Istanbul" 319 | }, 320 | "referrer": "/help/faq" 321 | }, 322 | { 323 | "session_id": "sess_318642975", 324 | "user_id": "user_020", 325 | "timestamp": "2024-07-18T15:00:00Z", 326 | "page": "/contact/sales", 327 | "action": "contact_form", 328 | "duration_seconds": 240, 329 | "device": "desktop", 330 | "browser": "Safari", 331 | "location": { 332 | "country": "South Africa", 333 | "state": "Western Cape", 334 | "city": "Cape Town" 335 | }, 336 | "referrer": "/products/enterprise" 337 | } 338 | ] 339 | -------------------------------------------------------------------------------- /src/arg.rs: -------------------------------------------------------------------------------- 1 | use std::path::PathBuf; 2 | 3 | use clap::Parser; 4 | 5 | use crate::Error; 6 | 7 | /// hawk - Modern data analysis tool for structured data (JSON, YAML, CSV) 8 | /// 9 | /// hawk combines the simplicity of awk with the power of pandas for data exploration. 10 | /// Perfect for analyzing JSON APIs, YAML configs, and CSV datasets. 11 | #[derive(Debug, Parser)] 12 | #[command(name = "hawk")] 13 | #[command(version = "0.2.2")] 14 | #[command(about = "Modern data analysis tool for structured data and text files")] 15 | #[command(long_about = " 16 | hawk is a command-line data analysis tool that brings pandas-like functionality 17 | to your terminal. 
It supports JSON, YAML, CSV, and plain text formats with automatic 18 | detection, powerful filtering, grouping, aggregation, and string manipulation capabilities. 19 | 20 | EXAMPLES: 21 | # Basic field access 22 | hawk '.users[0].name' data.json 23 | hawk '.users.name' data.csv 24 | 25 | 26 | # Text processing (NEW in v0.2.0!) 27 | hawk '. | select(. | contains(\"ERROR\"))' app.log 28 | hawk '. | map(. | trim | upper)' data.txt 29 | hawk '. | map(. | substring(0, 19))' access.log 30 | 31 | # String operations 32 | hawk '. | map(. | replace(\"old\", \"new\"))' text.txt 33 | hawk '. | map(. | split(\",\") | join(\" | \"))' csv_lines.txt 34 | 35 | # Filtering and aggregation 36 | hawk '.users[] | select(.age > 30)' data.yaml 37 | hawk '.sales | group_by(.region) | avg(.amount)' sales.csv 38 | 39 | # Statistical analysis (NEW!) 40 | hawk '. | unique | sort' numbers.txt 41 | hawk '.scores[] | median(.value)' scores.json 42 | hawk '.data[] | stddev(.measurement)' sensor_data.csv 43 | 44 | # Complex pipelines 45 | hawk '. | select(. | contains(\"WARN\")) | map(. | substring(11, 8)) | unique' app.log 46 | hawk '.users[] | map(.email | lower | trim) | select(. | ends_with(\".com\"))' users.csv 47 | 48 | # Data exploration 49 | hawk '. | info' data.json 50 | hawk '.users | count' data.csv 51 | hawk '. | length' any_file.txt 52 | 53 | 54 | SUPPORTED FORMATS: 55 | JSON, YAML, CSV, Plain Text (automatically detected) 56 | 57 | QUERY SYNTAX: 58 | # Field Access 59 | .field - Access field 60 | .array[0] - Access array element 61 | .array[] - Access all array elements 62 | 63 | 64 | # Text Processing (NEW!) 65 | . | map(. | upper) - Convert to uppercase 66 | . | map(. | lower) - Convert to lowercase 67 | . | map(. | trim) - Remove whitespace 68 | . | map(. | length) - Get string length 69 | . | map(. | reverse) - Reverse string 70 | 71 | # String Manipulation 72 | . | map(. | replace(\"a\", \"b\")) - Replace text 73 | . | map(. | substring(0, 5)) - Extract substring 74 | . | map(. | split(\",\")) - Split by delimiter 75 | .array[] | join(\", \") - Join array elements 76 | 77 | # String Filtering 78 | . | select(. | contains(\"text\")) - Contains pattern 79 | . | select(. | starts_with(\"pre\")) - Starts with pattern 80 | . | select(. | ends_with(\"suf\")) - Ends with pattern 81 | 82 | # Statistical Functions (NEW!) 83 | . | unique - Remove duplicates 84 | . | sort - Sort values 85 | . | median - Calculate median 86 | . | stddev - Calculate standard deviation 87 | . | length - Get array/text length 88 | 89 | # Filtering & Aggregation 90 | . | select(.field > 10) - Filter data 91 | . | group_by(.category) - Group data 92 | . | count/sum/avg/min/max - Aggregate functions 93 | 94 | # Data Transformation 95 | . | map(.field | operation) - Transform data with string operations 96 | 97 | 98 | OUTPUT FORMATS: 99 | --format table - Colored table output (default for structured data) 100 | --format json - JSON output with syntax highlighting 101 | --format list - Simple list output 102 | --format auto - Smart format detection (default) 103 | 104 | COLORED OUTPUT: 105 | Automatic color detection (TTY), respects NO_COLOR environment variable 106 | ")] 107 | 108 | pub struct Args { 109 | /// JSONPath-style query to execute 110 | /// 111 | /// Examples: 112 | /// 113 | /// .users[0].name - Get first user's name 114 | /// 115 | /// .users | select(.age > 30) - Filter users by age 116 | /// 117 | /// . 
| group_by(.department) - Group by department 118 | pub query: String, 119 | 120 | /// Input file path (JSON, YAML, CSV, or plain text) 121 | /// 122 | /// If not provided, reads from stdin. 123 | /// File format is automatically detected. 124 | pub path: Option<PathBuf>, 125 | 126 | /// Output format 127 | /// 128 | /// auto: Smart detection (table for arrays, list for values, json for complex) 129 | /// 130 | /// table: Force tabular output 131 | /// 132 | /// json: Force JSON output 133 | /// 134 | /// list: Force list output 135 | /// 136 | /// csv: Force CSV output 137 | #[arg(long, default_value = "auto")] 138 | #[arg(value_parser = ["auto", "table", "json", "list", "csv"])] 139 | pub format: String, 140 | 141 | #[arg(long, short)] 142 | #[arg(help = "Force text format (skip auto-detection)")] 143 | pub text: bool, 144 | } 145 | 146 | #[derive(Debug, Clone)] 147 | pub enum OutputFormat { 148 | Auto, 149 | Json, 150 | Table, 151 | List, 152 | Csv, 153 | } 154 | 155 | impl std::str::FromStr for OutputFormat { 156 | type Err = Error; 157 | 158 | fn from_str(s: &str) -> Result<Self, Self::Err> { 159 | match s.to_lowercase().as_str() { 160 | "auto" => Ok(OutputFormat::Auto), 161 | "json" => Ok(OutputFormat::Json), 162 | "table" => Ok(OutputFormat::Table), 163 | "list" => Ok(OutputFormat::List), 164 | "csv" => Ok(OutputFormat::Csv), 165 | _ => Err(Error::InvalidFormat(format!( 166 | "Invalid format: {}. Valid options: auto, json, table, list, csv", 167 | s 168 | ))), 169 | } 170 | } 171 | }
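 // Illustrative sketch (not in the original file; the module and test names are assumptions): the FromStr impl above lowercases its input, so parsing is case-insensitive. #[cfg(test)] mod format_tests { use super::OutputFormat; #[test] fn parses_formats_case_insensitively() { assert!(matches!("TABLE".parse::<OutputFormat>(), Ok(OutputFormat::Table))); assert!(matches!("csv".parse::<OutputFormat>(), Ok(OutputFormat::Csv))); assert!("xml".parse::<OutputFormat>().is_err()); } }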
172 | -------------------------------------------------------------------------------- /src/error.rs: -------------------------------------------------------------------------------- 1 | use thiserror::Error; 2 | 3 | #[derive(Debug, Error)] 4 | pub enum Error { 5 | #[error("Invalid output format: {0}")] 6 | InvalidFormat(String), 7 | 8 | #[error("File not found: {0}")] 9 | FileNotFound(#[from] std::io::Error), 10 | 11 | #[error("JSON deserialization error: {0}")] 12 | Json(#[from] serde_json::Error), 13 | 14 | #[error("YAML deserialization error: {0}")] 15 | Yaml(#[from] serde_yaml::Error), 16 | 17 | #[error("CSV parsing error: {0}")] 18 | Csv(#[from] csv::Error), 19 | 20 | #[error("str parse int error: {0}")] 21 | StrToInt(#[from] std::num::ParseIntError), 22 | 23 | #[error("Invalid query format: {0}")] 24 | InvalidQuery(String), 25 | 26 | #[error("Array index out of bounds: {0}")] 27 | IndexOutOfBounds(usize), 28 | 29 | #[error("Text processing error: {0}")] 30 | TextProcessing(String), 31 | 32 | #[error("String operation error: {0}")] 33 | StringOperation(String), 34 | } 35 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod arg; 2 | pub mod error; 3 | pub mod executor; 4 | pub mod filter; 5 | pub mod output; 6 | pub mod parser; 7 | pub mod setup; 8 | pub mod stats_opts; 9 | pub mod string_ops; 10 | pub mod utils; 11 | 12 | pub use arg::*; 13 | pub use error::*; 14 | pub use executor::*; 15 | pub use filter::*; 16 | pub use output::*; 17 | pub use parser::*; 18 | use serde_json::Value; 19 | pub use setup::*; 20 | pub use stats_opts::*; 21 | pub use string_ops::*; 22 | pub use utils::*; 23 | 24 | pub fn debug_json_order(json: &Value) { 25 | println!("=== Original JSON field order ==="); 26 | 27 | // Root level 28 | if let Value::Object(obj) = json { 29 | println!("Root fields:"); 30 | for key in obj.keys() { 31 | println!(" {}", key); 32 | } 33 | 34 | // Field order of the first element in the users array 35 | if let Some(Value::Array(users)) = obj.get("users") { 36 | if let Some(Value::Object(first_user)) = users.first() { 37 | println!("First user fields:"); 38 | for key in first_user.keys() { 39 | println!(" {}", key); 40 | } 41 | } 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::Result; 2 | use hawk_data::{Error, execute_query, setup}; 3 | 4 | fn main() -> Result<(), Error> { 5 | let result = run(); 6 | 7 | if let Err(ref e) = result { 8 | eprintln!("Error: {}", e); 9 | 10 | if let Error::InvalidQuery(_) = e { 11 | eprintln!("\nTry 'hawk --help' for usage examples."); 12 | } 13 | std::process::exit(1); 14 | } 15 | 16 | result 17 | } 18 | 19 | fn run() -> Result<(), Error> { 20 | let (json, query, format) = setup()?; 21 | execute_query(&json, &query, format)?; 22 | Ok(()) 23 | } 24 | -------------------------------------------------------------------------------- /src/parser.rs: -------------------------------------------------------------------------------- 1 | use crate::Error; 2 | 3 | pub fn parse_query_segments(query: &str) -> Result<(&str, Vec<&str>), Error> { 4 | // println!("=== parse_query_segments Debug ==="); 5 | // println!("Input query: '{}'", query); 6 | 7 | if query == "." { 8 | return Ok(("", vec![])); 9 | } 10 | 11 | // If the query contains a pipeline, parse only the base query part 12 | let base_query = if query.contains('|') { 13 | query.split('|').next().unwrap().trim() 14 | } else { 15 | query 16 | }; 17 | 18 | // println!("Base query: '{}'", base_query); 19 | 20 | // Special handling for queries like ".[0]" (root array access) 21 | if base_query.starts_with(".[") { 22 | let remaining = &base_query[1..]; 23 | let mut segments = remaining.split('.'); 24 | let first_segment = segments.next().unwrap(); 25 | let rest: Vec<&str> = segments.collect(); 26 | let result = Ok(("", [vec![first_segment], rest].concat())); 27 | // println!("Root array access result: {:?}", result); 28 | return result; 29 | } 30 | 31 | let mut segments = base_query.split('.').skip(1); 32 | let segment = segments 33 | .next() 34 | .ok_or(Error::InvalidQuery("Missing field segment in query".into()))?; 35 | let fields: Vec<&str> = segments.collect(); 36 | 37 | // println!("Normal parse result: {:?}", result); 38 | Ok((segment, fields)) 39 | } 40 | 41 | pub fn parse_array_segment(segment: &str) -> Result<(usize, usize), Error> { 42 | let idx = segment 43 | .find('[') 44 | .ok_or(Error::InvalidQuery("Missing '[' in segment".into()))?; 45 | let ridx = segment 46 | .find(']') 47 | .ok_or(Error::InvalidQuery("Missing ']' in segment".into()))?; 48 | 49 | if idx >= ridx { 50 | return Err(Error::InvalidQuery("Invalid bracket order".into())); 51 | } 52 | 53 | Ok((idx, ridx)) 54 | } 55 | 56 | #[cfg(test)] 57 | mod tests { 58 | use super::*; 59 | use crate::Error; 60 | 61 | #[test] 62 | fn test_parse_query_segments_normal_case() { 63 | // Normal case: basic query 64 | let result = parse_query_segments(".users.name"); 65 | assert!(result.is_ok()); 66 | let (segment, field) = result.unwrap(); 67 | assert_eq!(segment, "users"); 68 | assert_eq!(field, vec!["name"]); 69 | } 70 | 71 | #[test] 72 | fn test_parse_query_segments_with_array_index() { 73 | // Normal case: with array index 74 | let result = parse_query_segments(".users[0].name"); 75 | assert!(result.is_ok()); 76 | let (segment, field) = result.unwrap(); 77 | assert_eq!(segment, "users[0]"); 78 | assert_eq!(field, vec!["name"]); 79 | } 80 | 
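 // Illustrative sketch (not in the original suite; the test name is an assumption): queries are truncated at the first '|' before segment parsing, so pipeline stages never reach the segment splitter. #[test] fn test_parse_query_segments_ignores_pipeline() { let result = parse_query_segments(".users[] | select(.age > 30)"); assert!(result.is_ok()); let (segment, fields) = result.unwrap(); assert_eq!(segment, "users[]"); assert!(fields.is_empty()); }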
81 | #[test] 82 | fn test_parse_query_segments_different_fields() { 83 | // Normal case: different field names 84 | let result = parse_query_segments(".products.price"); 85 | assert!(result.is_ok()); 86 | let (segment, field) = result.unwrap(); 87 | assert_eq!(segment, "products"); 88 | assert_eq!(field, vec!["price"]); 89 | } 90 | 91 | #[test] 92 | fn test_parse_query_segments_complex_index() { 93 | // Normal case: large index 94 | let result = parse_query_segments(".items[123].description"); 95 | assert!(result.is_ok()); 96 | let (segment, field) = result.unwrap(); 97 | assert_eq!(segment, "items[123]"); 98 | assert_eq!(field, vec!["description"]); 99 | } 100 | 101 | #[test] 102 | fn test_parse_query_segments_truly_missing_field() { 103 | // Error case: field segment genuinely missing 104 | let result = parse_query_segments(""); 105 | assert!(result.is_err()); 106 | match result.unwrap_err() { 107 | Error::InvalidQuery(msg) => { 108 | assert!(msg.contains("Missing field segment")); 109 | } 110 | _ => panic!("Expected InvalidQuery error"), 111 | } 112 | } 113 | 114 | #[test] 115 | fn test_parse_query_segments_empty_query() { 116 | // Error case: empty query 117 | let result = parse_query_segments(""); 118 | assert!(result.is_err()); 119 | match result.unwrap_err() { 120 | Error::InvalidQuery(msg) => { 121 | assert!(msg.contains("Missing field segment")); 122 | } 123 | _ => panic!("Expected InvalidQuery error"), 124 | } 125 | } 126 | 127 | #[test] 128 | fn test_parse_array_segment_normal_case() { 129 | // Normal case: basic array index 130 | let result = parse_array_segment("users[0]"); 131 | assert!(result.is_ok()); 132 | let (idx, ridx) = result.unwrap(); 133 | assert_eq!(idx, 5); // position of '[' 134 | assert_eq!(ridx, 7); // position of ']' 135 | } 136 | 137 | #[test] 138 | fn test_parse_array_segment_large_index() { 139 | // Normal case: large index 140 | let result = parse_array_segment("items[123]"); 141 | assert!(result.is_ok()); 142 | let (idx, ridx) = result.unwrap(); 143 | assert_eq!(idx, 5); // position of '[' 144 | assert_eq!(ridx, 9); // position of ']' 145 | } 146 | 147 | #[test] 148 | fn test_parse_array_segment_short_name() { 149 | // Normal case: short field name 150 | let result = parse_array_segment("a[5]"); 151 | assert!(result.is_ok()); 152 | let (idx, ridx) = result.unwrap(); 153 | assert_eq!(idx, 1); // position of '[' 154 | assert_eq!(ridx, 3); // position of ']' 155 | } 156 | 157 | #[test] 158 | fn test_parse_array_segment_missing_open_bracket() { 159 | // Error case: missing '[' 160 | let result = parse_array_segment("users0]"); 161 | assert!(result.is_err()); 162 | match result.unwrap_err() { 163 | Error::InvalidQuery(msg) => { 164 | assert!(msg.contains("Missing '[' in segment")); 165 | } 166 | _ => panic!("Expected InvalidQuery error"), 167 | } 168 | } 169 | 170 | #[test] 171 | fn test_parse_array_segment_missing_close_bracket() { 172 | // Error case: missing ']' 173 | let result = parse_array_segment("users[0"); 174 | assert!(result.is_err()); 175 | match result.unwrap_err() { 176 | Error::InvalidQuery(msg) => { 177 | assert!(msg.contains("Missing ']' in segment")); 178 | } 179 | _ => panic!("Expected InvalidQuery error"), 180 | } 181 | } 182 | 183 | #[test] 184 | fn test_parse_array_segment_invalid_bracket_order() { 185 | // Error case: brackets in reverse order 186 | let result = parse_array_segment("users]0["); 187 | assert!(result.is_err()); 188 | match result.unwrap_err() { 189 | Error::InvalidQuery(msg) => { 190 | assert!(msg.contains("Invalid bracket order")); 191 | } 192 | _ => panic!("Expected InvalidQuery error"), 193 | } 194 | } 195 | 196 | #[test] 197 | fn test_parse_array_segment_empty_brackets() { 198 | // Edge case:
empty brackets 199 | let result = parse_array_segment("users[]"); 200 | assert!(result.is_ok()); // the parse itself succeeds 201 | let (idx, ridx) = result.unwrap(); 202 | assert_eq!(idx, 5); // position of '[' 203 | assert_eq!(ridx, 6); // position of ']' 204 | } 205 | 206 | #[test] 207 | fn test_parse_array_segment_no_brackets() { 208 | // Error case: no brackets at all 209 | let result = parse_array_segment("users"); 210 | assert!(result.is_err()); 211 | match result.unwrap_err() { 212 | Error::InvalidQuery(msg) => { 213 | assert!(msg.contains("Missing '[' in segment")); 214 | } 215 | _ => panic!("Expected InvalidQuery error"), 216 | } 217 | } 218 | } 219 | -------------------------------------------------------------------------------- /src/setup.rs: -------------------------------------------------------------------------------- 1 | use std::io::{self, Read}; 2 | 3 | use clap::Parser; 4 | use serde_json::Value; 5 | 6 | use crate::{Args, Error, OutputFormat}; 7 | 8 | pub fn setup() -> Result<(Value, String, OutputFormat), Error> { 9 | let args = Args::parse(); 10 | 11 | let content = if let Some(path) = args.path { 12 | std::fs::read_to_string(path)? 13 | } else { 14 | let mut buffer = String::new(); 15 | io::stdin().read_to_string(&mut buffer)?; 16 | buffer 17 | }; 18 | 19 | let input_format = if args.text { 20 | InputFormat::Text 21 | } else { 22 | detect_input_format(&content) 23 | }; 24 | 25 | let data = parse_content(&content, input_format)?; 26 | let query = args.query; 27 | 28 | let format = args 29 | .format 30 | .parse::<OutputFormat>() 31 | .map_err(|e| Error::InvalidFormat(e.to_string()))?; 32 | 33 | // debug 34 | // debug_json_order(&json); 35 | Ok((data, query, format)) 36 | } 37 | 38 | #[derive(Debug)] 39 | enum InputFormat { 40 | Json, 41 | Yaml, 42 | Csv, 43 | Text, 44 | } 45 | 46 | fn detect_input_format(content: &str) -> InputFormat { 47 | let trimmed = content.trim(); 48 | 49 | // Check for CSV first (the simplest format) 50 | if is_likely_csv(trimmed) { 51 | return InputFormat::Csv; 52 | } 53 | 54 | // JSON detection (strict check) - decided before YAML 55 | if (trimmed.starts_with('{') && trimmed.ends_with('}')) 56 | || (trimmed.starts_with('[') && trimmed.ends_with(']')) 57 | { 58 | // Additionally, verify the whole content parses as valid JSON 59 | if serde_json::from_str::<Value>(trimmed).is_ok() { 60 | return InputFormat::Json; 61 | } 62 | } 63 | 64 | // YAML detection - using stricter criteria 65 | if is_structured_yaml(trimmed) { 66 | return InputFormat::Yaml; 67 | } 68 | 69 | // Fall back to Text when none of the above match 70 | InputFormat::Text 71 | } 72 | 73 | // Strictly determine whether the content is structured YAML 74 | fn is_structured_yaml(content: &str) -> bool { 75 | let lines: Vec<&str> = content.lines().collect(); 76 | 77 | if lines.is_empty() { 78 | return false; 79 | } 80 | 81 | // Unambiguous YAML markers such as Kubernetes/Docker Compose files 82 | if content.contains("apiVersion:") 83 | || content.contains("kind:") 84 | || content.contains("version:") 85 | || content.contains("services:") 86 | { 87 | return true; 88 | } 89 | 90 | let mut yaml_indicators = 0; 91 | let mut total_meaningful_lines = 0; 92 | 93 | for line in lines { 94 | let trimmed = line.trim(); 95 | 96 | // Skip blank lines and comments 97 | if trimmed.is_empty() || trimmed.starts_with('#') { 98 | continue; 99 | } 100 | 101 | total_meaningful_lines += 1; 102 | 103 | // Detect YAML structural features 104 | if is_valid_yaml_line(trimmed) { 105 | yaml_indicators += 1; 106 | } 107 | } 108 | 109 | // Too few meaningful lines cannot be YAML 110 | if total_meaningful_lines < 3 { 111 | return false; 112 | } 113 | 114 | // Classify as YAML when more than 80% of lines have YAML structure 115 | (yaml_indicators as f64 / total_meaningful_lines as f64) > 0.8 116 | } 117 | 
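 // Worked example (illustrative): for "key: value\nother: data\nnested:\n  sub: item", all four meaningful lines satisfy is_valid_yaml_line, so the ratio 4/4 exceeds 0.8 and the content is classified as YAML; a single log line such as "2024-01-01 10:00:00 INFO Starting" is rejected earlier because it has fewer than three meaningful lines.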
118 | // Determine whether a single line is valid YAML 119 | fn is_valid_yaml_line(line: &str) -> bool { 120 | // List form (- item) 121 | if line.starts_with("- ") { 122 | return true; 123 | } 124 | 125 | // key: value form 126 | if let Some(colon_pos) = line.find(':') { 127 | let key_part = line[..colon_pos].trim(); 128 | let value_part = line[colon_pos + 1..].trim(); 129 | 130 | // Validate the key part 131 | if key_part.is_empty() { 132 | return false; 133 | } 134 | 135 | // Keys must not contain invalid characters (unquoted spaces) 136 | if key_part.contains(' ') && !key_part.starts_with('"') && !key_part.starts_with('\'') { 137 | return false; 138 | } 139 | 140 | // Indented nested structure 141 | if line.starts_with(" ") || line.starts_with("\t") { 142 | return true; 143 | } 144 | 145 | // Value looks clearly YAML-like 146 | if value_part.is_empty() 147 | || value_part.starts_with('[') 148 | || value_part.starts_with('{') 149 | || value_part == "true" 150 | || value_part == "false" 151 | || value_part.parse::<f64>().is_ok() 152 | { 153 | return true; 154 | } 155 | 156 | // Values containing paths, URLs, timestamps, etc. are likely not YAML 157 | if value_part.contains('/') && value_part.len() > 10 { 158 | return false; 159 | } 160 | 161 | return true; 162 | } 163 | 164 | false 165 | } 166 | 167 | fn parse_content(content: &str, format: InputFormat) -> Result<Value, Error> { 168 | match format { 169 | InputFormat::Json => serde_json::from_str(content).map_err(Error::Json), 170 | InputFormat::Yaml => { 171 | // Handle multi-document YAML 172 | if content.contains("---") { 173 | parse_multi_document_yaml(content) 174 | } else { 175 | serde_yaml::from_str(content).map_err(Error::Yaml) 176 | } 177 | } 178 | InputFormat::Csv => parse_csv_to_json(content), 179 | InputFormat::Text => parse_text_to_json(content), 180 | } 181 | } 182 | 183 | fn parse_text_to_json(content: &str) -> Result<Value, Error> { 184 | // Split the text into lines and treat it as an array 185 | let lines: Vec<Value> = content 186 | .lines() 187 | .map(|line| Value::String(line.to_string())) 188 | .collect(); 189 | 190 | // Return an array even for an empty file 191 | Ok(Value::Array(lines)) 192 | } 193 | 194 | fn parse_multi_document_yaml(content: &str) -> Result<Value, Error> { 195 | let documents: Vec<&str> = content 196 | .split("---") 197 | .map(|doc| doc.trim()) 198 | .filter(|doc| !doc.is_empty()) 199 | .collect(); 200 | 201 | let mut parsed_docs = Vec::new(); 202 | 203 | for doc in documents { 204 | let parsed: Value = serde_yaml::from_str(doc).map_err(Error::Yaml)?; 205 | parsed_docs.push(parsed); 206 | } 207 | 208 | // Return the multiple documents as an array 209 | Ok(Value::Array(parsed_docs)) 210 | } 211 | 212 | fn is_likely_csv(content: &str) -> bool { 213 | let lines: Vec<&str> = content.lines().take(5).collect(); 214 | 215 | if lines.is_empty() { 216 | return false; 217 | } 218 | 219 | // Assume the first line is the header 220 | let first_line = lines[0]; 221 | let comma_count = first_line.matches(',').count(); 222 | 223 | // At least one comma, and the other lines share a similar structure 224 | if comma_count > 0 { 225 | // Check that the other lines have a similar comma count 226 | lines.iter().skip(1).all(|line| { 227 | let line_comma_count = line.matches(',').count(); 228 | (line_comma_count as i32 - comma_count as i32).abs() <= 1 229 | }) 230 | } else { 231 | false 232 | } 233 | } 234 | 
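 // Worked example (illustrative): given "a,b,c\n1,2,3\n4,5", the header has two commas, "1,2,3" matches exactly, and "4,5" is within the +/-1 tolerance, so the content is treated as CSV; a line with no commas at all (a difference of 2) would reject the candidate.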
235 | fn parse_csv_to_json(content: &str) -> Result<Value, Error> { 236 | let mut reader = csv::Reader::from_reader(content.as_bytes()); 237 | 238 | // Read the header row 239 | let headers: Vec<String> = reader 240 | .headers() 241 | .map_err(Error::Csv)? 242 | .iter() 243 | .map(|h| h.trim().to_string()) 244 | .collect(); 245 | 246 | let mut records = Vec::new(); 247 | 248 | for result in reader.records() { 249 | let record = result.map_err(Error::Csv)?; 250 | let mut object = serde_json::Map::new(); 251 | 252 | for (i, field) in record.iter().enumerate() { 253 | if let Some(header) = headers.get(i) { 254 | let value = infer_value_type(field.trim()); 255 | object.insert(header.clone(), value); 256 | } 257 | } 258 | 259 | records.push(Value::Object(object)); 260 | } 261 | 262 | // Return the array directly (do not double-nest it) 263 | Ok(Value::Array(records)) 264 | } 265 | 266 | fn infer_value_type(field: &str) -> Value { 267 | // Empty-string check 268 | if field.is_empty() { 269 | return Value::Null; 270 | } 271 | 272 | // Boolean detection 273 | match field.to_lowercase().as_str() { 274 | "true" => return Value::Bool(true), 275 | "false" => return Value::Bool(false), 276 | _ => {} 277 | } 278 | 279 | // Integer detection 280 | if let Ok(int_val) = field.parse::<i64>() { 281 | return Value::Number(serde_json::Number::from(int_val)); 282 | } 283 | 284 | // Floating-point detection 285 | if let Ok(float_val) = field.parse::<f64>() { 286 | if let Some(num) = serde_json::Number::from_f64(float_val) { 287 | return Value::Number(num); 288 | } 289 | } 290 | 291 | // Default to a string 292 | Value::String(field.to_string()) 293 | } 294 | 295 | // Helper function for text processing 296 | pub fn text_to_json_values(content: &str) -> Result<Vec<Value>, Error> { 297 | let lines: Vec<Value> = content 298 | .lines() 299 | .map(|line| Value::String(line.to_string())) 300 | .collect(); 301 | Ok(lines) 302 | } 303 | 304 | #[cfg(test)] 305 | mod tests { 306 | use super::*; 307 | 308 | #[test] 309 | fn test_text_parsing() { 310 | let content = "line1\nline2\nERROR: something happened"; 311 | let result = parse_text_to_json(content).unwrap(); 312 | 313 | if let Value::Array(lines) = result { 314 | assert_eq!(lines.len(), 3); 315 | assert_eq!(lines[0], Value::String("line1".to_string())); 316 | assert_eq!(lines[1], Value::String("line2".to_string())); 317 | assert_eq!( 318 | lines[2], 319 | Value::String("ERROR: something happened".to_string()) 320 | ); 321 | } else { 322 | panic!("Expected array result"); 323 | } 324 | } 325 | 326 | #[test] 327 | fn test_yaml_detection() { 328 | use super::{is_structured_yaml, is_valid_yaml_line}; 329 | 330 | // Should clearly be recognized as YAML 331 | assert!(is_structured_yaml("apiVersion: v1\nkind: Pod")); 332 | assert!(is_structured_yaml( 333 | "key: value\nother: data\nnested:\n sub: item" 334 | )); 335 | 336 | // Should not be recognized as YAML 337 | assert!(!is_structured_yaml("2024-01-01 10:00:00 INFO Starting")); 338 | assert!(!is_structured_yaml("plain text\nwith some: colons")); 339 | assert!(!is_structured_yaml( 340 | "ServerName: localhost\nServerPort: 8080" 341 | )); // config-file-like, but not YAML 342 | 343 | // Tests for individual lines 344 | assert!(is_valid_yaml_line("key: value")); 345 | assert!(is_valid_yaml_line(" nested: item")); 346 | assert!(is_valid_yaml_line("- list_item")); 347 | assert!(!is_valid_yaml_line("2024-01-01 10:00:00 INFO message")); 348 | assert!(!is_valid_yaml_line("random text line")); 349 | }
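 // Illustrative sketch (not in the original suite; the test name is an assumption): format detection is ordered CSV -> JSON -> YAML -> Text, as the comments in detect_input_format describe. #[test] fn test_detect_input_format_order() { assert!(matches!(detect_input_format("a,b\n1,2"), InputFormat::Csv)); assert!(matches!(detect_input_format("{\"a\": 1}"), InputFormat::Json)); assert!(matches!(detect_input_format("apiVersion: v1\nkind: Pod"), InputFormat::Yaml)); assert!(matches!(detect_input_format("plain text line"), InputFormat::Text)); }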
"median" => apply_median(data, field), 14 | "stddev" => apply_stddev(data, field), 15 | "length" => Ok(Value::Number(serde_json::Number::from(data.len()))), 16 | _ => Err(Error::StringOperation(format!( 17 | "Unknown stats operation: {}", 18 | operation 19 | ))), 20 | } 21 | } 22 | 23 | /// ユニーク値を取得 24 | fn apply_unique(data: &[Value], field: Option<&str>) -> Result { 25 | use std::collections::HashSet; 26 | 27 | let mut unique_values = HashSet::new(); 28 | let mut result = Vec::new(); 29 | 30 | for item in data { 31 | let value_to_check = if let Some(field_name) = field { 32 | // フィールド指定がある場合 33 | item.get(field_name).unwrap_or(&Value::Null).clone() 34 | } else { 35 | // フィールド指定がない場合は値そのもの 36 | item.clone() 37 | }; 38 | 39 | // JSON値をハッシュ可能な文字列に変換 40 | let key = serde_json::to_string(&value_to_check).unwrap_or_default(); 41 | 42 | if unique_values.insert(key) { 43 | result.push(value_to_check); 44 | } 45 | } 46 | 47 | Ok(Value::Array(result)) 48 | } 49 | 50 | /// ソート 51 | fn apply_sort(data: &[Value], field: Option<&str>) -> Result { 52 | let mut sorted_data = data.to_vec(); 53 | 54 | sorted_data.sort_by(|a, b| { 55 | let val_a = if let Some(field_name) = field { 56 | a.get(field_name).unwrap_or(&Value::Null) 57 | } else { 58 | a 59 | }; 60 | 61 | let val_b = if let Some(field_name) = field { 62 | b.get(field_name).unwrap_or(&Value::Null) 63 | } else { 64 | b 65 | }; 66 | 67 | compare_json_values(val_a, val_b) 68 | }); 69 | 70 | Ok(Value::Array(sorted_data)) 71 | } 72 | 73 | /// 中央値を計算 74 | fn apply_median(data: &[Value], field: Option<&str>) -> Result { 75 | let mut numbers = extract_numbers(data, field)?; 76 | 77 | if numbers.is_empty() { 78 | return Ok(Value::Null); 79 | } 80 | 81 | numbers.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)); 82 | 83 | let len = numbers.len(); 84 | let median = if len % 2 == 0 { 85 | // 偶数個の場合は中央2つの平均 86 | (numbers[len / 2 - 1] + numbers[len / 2]) / 2.0 87 | } else { 88 | // 奇数個の場合は中央値 89 | numbers[len / 2] 90 | }; 91 | 92 | Ok(Value::Number(serde_json::Number::from_f64(median).unwrap())) 93 | } 94 | 95 | /// 標準偏差を計算 96 | fn apply_stddev(data: &[Value], field: Option<&str>) -> Result { 97 | let numbers = extract_numbers(data, field)?; 98 | 99 | if numbers.len() < 2 { 100 | return Ok(Value::Null); 101 | } 102 | 103 | let mean = numbers.iter().sum::() / numbers.len() as f64; 104 | let variance = 105 | numbers.iter().map(|x| (x - mean).powi(2)).sum::() / (numbers.len() - 1) as f64; // 標本標準偏差 106 | 107 | let stddev = variance.sqrt(); 108 | 109 | Ok(Value::Number(serde_json::Number::from_f64(stddev).unwrap())) 110 | } 111 | 112 | /// 数値を抽出 113 | fn extract_numbers(data: &[Value], field: Option<&str>) -> Result, Error> { 114 | let mut numbers = Vec::new(); 115 | 116 | for item in data { 117 | let value = if let Some(field_name) = field { 118 | item.get(field_name).unwrap_or(&Value::Null) 119 | } else { 120 | item 121 | }; 122 | 123 | if let Some(num) = value.as_f64() { 124 | numbers.push(num); 125 | } 126 | } 127 | 128 | Ok(numbers) 129 | } 130 | 131 | /// JSON値の比較 132 | fn compare_json_values(a: &Value, b: &Value) -> std::cmp::Ordering { 133 | use std::cmp::Ordering; 134 | 135 | match (a, b) { 136 | (Value::Number(n1), Value::Number(n2)) => n1 137 | .as_f64() 138 | .unwrap_or(0.0) 139 | .partial_cmp(&n2.as_f64().unwrap_or(0.0)) 140 | .unwrap_or(Ordering::Equal), 141 | (Value::String(s1), Value::String(s2)) => s1.cmp(s2), 142 | (Value::Bool(b1), Value::Bool(b2)) => b1.cmp(b2), 143 | (Value::Null, Value::Null) => Ordering::Equal, 144 | 
(Value::Null, _) => Ordering::Less, 145 | (_, Value::Null) => Ordering::Greater, 146 | // Different types: compare by type priority 147 | _ => get_type_priority(a).cmp(&get_type_priority(b)), 148 | } 149 | } 150 | 151 | /// Priority ordering of types 152 | fn get_type_priority(value: &Value) -> u8 { 153 | match value { 154 | Value::Null => 0, 155 | Value::Bool(_) => 1, 156 | Value::Number(_) => 2, 157 | Value::String(_) => 3, 158 | Value::Array(_) => 4, 159 | Value::Object(_) => 5, 160 | } 161 | } 162 | 163 | #[cfg(test)] 164 | mod tests { 165 | use super::*; 166 | 167 | #[test] 168 | fn test_unique_operation() { 169 | let data = vec![ 170 | Value::String("apple".to_string()), 171 | Value::String("banana".to_string()), 172 | Value::String("apple".to_string()), 173 | Value::String("cherry".to_string()), 174 | ]; 175 | 176 | let result = apply_unique(&data, None).unwrap(); 177 | if let Value::Array(arr) = result { 178 | assert_eq!(arr.len(), 3); // apple, banana, cherry 179 | } else { 180 | panic!("Expected array result"); 181 | } 182 | } 183 | 184 | #[test] 185 | fn test_sort_numbers() { 186 | let data = vec![ 187 | Value::Number(3.into()), 188 | Value::Number(1.into()), 189 | Value::Number(4.into()), 190 | Value::Number(2.into()), 191 | ]; 192 | 193 | let result = apply_sort(&data, None).unwrap(); 194 | if let Value::Array(arr) = result { 195 | assert_eq!(arr[0], Value::Number(1.into())); 196 | assert_eq!(arr[1], Value::Number(2.into())); 197 | assert_eq!(arr[2], Value::Number(3.into())); 198 | assert_eq!(arr[3], Value::Number(4.into())); 199 | } else { 200 | panic!("Expected array result"); 201 | } 202 | } 203 | 204 | #[test] 205 | fn test_median_even() { 206 | let data = vec![ 207 | Value::Number(1.into()), 208 | Value::Number(2.into()), 209 | Value::Number(4.into()), 210 | Value::Number(5.into()), 211 | ]; 212 | 213 | let result = apply_median(&data, None).unwrap(); 214 | assert_eq!( 215 | result, 216 | Value::Number(serde_json::Number::from_f64(3.0).unwrap()) 217 | ); 218 | } 219 | 220 | #[test] 221 | fn test_stddev() { 222 | let data = vec![ 223 | Value::Number(1.into()), 224 | Value::Number(2.into()), 225 | Value::Number(3.into()), 226 | Value::Number(4.into()), 227 | Value::Number(5.into()), 228 | ]; 229 | 230 | let result = apply_stddev(&data, None).unwrap(); 231 | // sample standard deviation ≈ 1.58 232 | if let Value::Number(n) = result { 233 | let stddev = n.as_f64().unwrap(); 234 | assert!((stddev - 1.58).abs() < 0.1); 235 | } else { 236 | panic!("Expected number result"); 237 | } 238 | } 239 | 240 | #[test] 241 | fn test_unique_with_field() { 242 | let data = vec![ 243 | serde_json::json!({"name": "Alice", "age": 30}), 244 | serde_json::json!({"name": "Bob", "age": 25}), 245 | serde_json::json!({"name": "Alice", "age": 35}), 246 | ]; 247 | 248 | let result = apply_unique(&data, Some("name")).unwrap(); 249 | if let Value::Array(arr) = result { 250 | assert_eq!(arr.len(), 2); // Alice, Bob 251 | } else { 252 | panic!("Expected array result"); 253 | } 254 | } 255 | } 256 | -------------------------------------------------------------------------------- /src/utils.rs: -------------------------------------------------------------------------------- 1 | use serde_json::Value; 2 | 3 | pub fn value_to_string(value: &Value) -> String { 4 | match value { 5 | Value::String(s) => s.clone(), 6 | _ => value.to_string().trim_matches('"').to_string(), 7 | } 8 | } 9 | 10 | #[cfg(test)] 11 | mod tests { 12 | use super::*; 13 | 14 | #[test] 15 | fn test_value_to_string_with_string() { 16 | let value = Value::String("Alice".to_string()); 17 | 
assert_eq!(value_to_string(&value), "Alice"); 18 | } 19 | 20 | #[test] 21 | fn test_value_to_string_with_number() { 22 | let value = Value::Number(serde_json::Number::from(42)); 23 | assert_eq!(value_to_string(&value), "42"); 24 | } 25 | 26 | #[test] 27 | fn test_value_to_string_with_boolean() { 28 | let value = Value::Bool(true); 29 | assert_eq!(value_to_string(&value), "true"); 30 | 31 | let value = Value::Bool(false); 32 | assert_eq!(value_to_string(&value), "false"); 33 | } 34 | 35 | #[test] 36 | fn test_value_to_string_with_null() { 37 | let value = Value::Null; 38 | assert_eq!(value_to_string(&value), "null"); 39 | } 40 | } 41 | --------------------------------------------------------------------------------